PyPI - sigdetect - Versions diffs - 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl - Mend

sigdetect 0.5.0 (py3-none-any.whl) → 0.5.2 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

sigdetect/api.py +10 -4
sigdetect/cli.py +20 -7
sigdetect/config.py +12 -0
sigdetect/cropping.py +261 -19
sigdetect/detector/signature_model.py +4 -0
sigdetect/wet_detection.py +48 -14
{sigdetect-0.5.0.dist-info → sigdetect-0.5.2.dist-info}/METADATA +11 -7
{sigdetect-0.5.0.dist-info → sigdetect-0.5.2.dist-info}/RECORD +11 -11
{sigdetect-0.5.0.dist-info → sigdetect-0.5.2.dist-info}/WHEEL +0 -0
{sigdetect-0.5.0.dist-info → sigdetect-0.5.2.dist-info}/entry_points.txt +0 -0
{sigdetect-0.5.0.dist-info → sigdetect-0.5.2.dist-info}/top_level.txt +0 -0

sigdetect/api.py CHANGED Viewed

@@ -229,6 +229,7 @@ def CropSignatureImages(
     dpi: int = 200,
     returnBytes: Literal[False] = False,
     saveToDisk: bool = True,
+    docx: bool = False,
 ) -> list[Path]: ...
@@ -241,6 +242,7 @@ def CropSignatureImages(
     dpi: int,
     returnBytes: Literal[True],
     saveToDisk: bool,
+    docx: bool = False,
 ) -> list[SignatureCrop]: ...
@@ -252,16 +254,17 @@ def CropSignatureImages(
     dpi: int = 200,
     returnBytes: bool = False,
     saveToDisk: bool = True,
+    docx: bool = False,
 ) -> list[Path] | list[SignatureCrop]:
-    """Create DOCX files containing cropped signature images.
+    """Create PNG files containing cropped signature images (or DOCX when enabled).
     Accepts either a :class:`FileResult` instance or the ``dict`` returned by
     :func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.
     Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop. Set
     ``saveToDisk=False`` to skip writing PNG files while still returning in-memory data.
-    When ``saveToDisk`` is enabled, a one-image DOCX file is also written per crop. When
-    ``returnBytes`` is True and ``python-docx`` is available, the returned
-    :class:`SignatureCrop` objects include ``docx_bytes``.
+    When ``docx`` is True, DOCX files are written instead of PNG files. When ``returnBytes`` is
+    True and ``docx`` is enabled, the returned :class:`SignatureCrop` objects include
+    ``docx_bytes``.
     """
     from sigdetect.cropping import crop_signatures
@@ -274,6 +277,7 @@ def CropSignatureImages(
         dpi=dpi,
         return_bytes=returnBytes,
         save_files=saveToDisk,
+        docx=docx,
     )
     if original_dict is not None:
         original_dict.clear()
@@ -305,6 +309,8 @@ def _CoerceFileResult(
                 BoundingBox=tuple(bbox) if bbox else None,
                 CropPath=entry.get("crop_path"),
                 CropBytes=entry.get("crop_bytes"),
+                CropDocxPath=entry.get("crop_docx_path"),
+                CropDocxBytes=entry.get("crop_docx_bytes"),
             )
         )

sigdetect/cli.py CHANGED Viewed

@@ -64,13 +64,19 @@ def Detect(
     cropSignatures: bool | None = typer.Option(
         None,
         "--crop-signatures/--no-crop-signatures",
-        help="Write DOCX files containing cropped signature images (requires PyMuPDF + python-docx)",
+        help="Write PNG crops for signature widgets (requires PyMuPDF)",
+        show_default=False,
+    ),
+    cropDocx: bool | None = typer.Option(
+        None,
+        "--crop-docx/--no-crop-docx",
+        help="Write DOCX crops instead of PNG files (requires PyMuPDF + python-docx)",
         show_default=False,
     ),
     cropDirectory: Path | None = typer.Option(
         None,
         "--crop-dir",
-        help="Directory for signature DOCX crops (defaults to out_dir/signature_crops)",
+        help="Directory for signature crops (defaults to out_dir/signature_crops)",
     ),
     cropDpi: int | None = typer.Option(
         None,
@@ -83,7 +89,7 @@ def Detect(
     cropBytes: bool = typer.Option(
         False,
         "--crop-bytes/--no-crop-bytes",
-        help="Embed base64 PNG bytes for signature crops in results JSON",
+        help="Embed base64 PNG bytes (and DOCX bytes when --crop-docx) in results JSON",
         show_default=False,
     ),
     detectWetSignatures: bool | None = typer.Option(
@@ -128,6 +134,8 @@ def Detect(
         overrides["WriteResults"] = writeResults
     if cropSignatures is not None:
         overrides["CropSignatures"] = cropSignatures
+    if cropDocx is not None:
+        overrides["CropDocx"] = cropDocx
     if cropDirectory is not None:
         overrides["CropOutputDirectory"] = cropDirectory
     if cropDpi is not None:
@@ -181,6 +189,7 @@ def Detect(
         base_dir = configuration.OutputDirectory or configuration.PdfRoot
         crop_dir = base_dir / "signature_crops"
     cropping_enabled = configuration.CropSignatures
+    docx_enabled = configuration.CropDocx
     cropping_available = True
     cropping_attempted = False
@@ -199,6 +208,7 @@ def Detect(
                     logger=Logger,
                     return_bytes=crop_bytes_enabled,
                     save_files=cropping_enabled,
+                    docx=docx_enabled,
                 )
                 cropping_attempted = True
                 if crop_bytes_enabled:
@@ -206,15 +216,18 @@ def Detect(
                         crop.signature.CropBytes = base64.b64encode(crop.image_bytes).decode(
                             "ascii"
                         )
+                        if crop.docx_bytes:
+                            crop.signature.CropDocxBytes = base64.b64encode(
+                                crop.docx_bytes
+                            ).decode("ascii")
             except SignatureCroppingUnavailable as exc:
                 cropping_available = False
                 Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
                 typer.echo(str(exc), err=True)
             except Exception as exc:  # pragma: no cover - defensive
-                Logger.warning(
-                    "Unexpected error while cropping signatures",
-                    extra={"error": str(exc)},
-                )
+                cropping_available = False
+                Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
+                typer.echo(str(exc), err=True)
         total_bboxes += sum(1 for sig in file_result.Signatures if sig.BoundingBox)

sigdetect/config.py CHANGED Viewed

@@ -31,6 +31,7 @@ class DetectConfiguration(BaseModel):
     PseudoSignatures: bool = Field(default=True, alias="pseudo_signatures")
     RecurseXObjects: bool = Field(default=True, alias="recurse_xobjects")
     CropSignatures: bool = Field(default=True, alias="crop_signatures")
+    CropDocx: bool = Field(default=False, alias="crop_docx")
     CropOutputDirectory: Path | None = Field(default=None, alias="crop_output_dir")
     CropImageDpi: int = Field(default=200, alias="crop_image_dpi", ge=72, le=600)
     DetectWetSignatures: bool = Field(default=True, alias="detect_wet_signatures")
@@ -88,6 +89,10 @@ class DetectConfiguration(BaseModel):
     def crop_signatures(self) -> bool:  # pragma: no cover - simple passthrough
         return self.CropSignatures
+    @property
+    def crop_docx(self) -> bool:  # pragma: no cover - simple passthrough
+        return self.CropDocx
     @property
     def crop_output_dir(self) -> Path | None:  # pragma: no cover - simple passthrough
         return self.CropOutputDirectory
@@ -133,6 +138,7 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
     env_out_dir = os.getenv("SIGDETECT_OUT_DIR")
     env_profile = os.getenv("SIGDETECT_PROFILE")
     env_crop = os.getenv("SIGDETECT_CROP_SIGNATURES")
+    env_crop_docx = os.getenv("SIGDETECT_CROP_DOCX")
     env_crop_dir = os.getenv("SIGDETECT_CROP_DIR")
     env_crop_dpi = os.getenv("SIGDETECT_CROP_DPI")
     env_detect_wet = os.getenv("SIGDETECT_DETECT_WET")
@@ -159,6 +165,12 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
             raw_data["crop_signatures"] = True
         elif lowered in {"0", "false", "no", "off"}:
             raw_data["crop_signatures"] = False
+    if env_crop_docx is not None:
+        lowered = env_crop_docx.lower()
+        if lowered in {"1", "true", "yes", "on"}:
+            raw_data["crop_docx"] = True
+        elif lowered in {"0", "false", "no", "off"}:
+            raw_data["crop_docx"] = False
     if env_crop_dir:
         raw_data["crop_output_dir"] = env_crop_dir
     if env_crop_dpi:

sigdetect/cropping.py CHANGED Viewed

@@ -1,4 +1,4 @@
-"""Helpers for converting signature bounding boxes into DOCX crops."""
+"""Helpers for converting signature bounding boxes into PNG or DOCX crops."""
 from __future__ import annotations
@@ -9,6 +9,8 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Literal, overload
+from PIL import Image
 from .detector.file_result_model import FileResult
 from .detector.signature_model import Signature
@@ -27,7 +29,7 @@ class SignatureCroppingUnavailable(RuntimeError):
     """Raised when PNG cropping cannot be performed (e.g., PyMuPDF missing)."""
-class SignatureDocxUnavailable(RuntimeError):
+class SignatureDocxUnavailable(SignatureCroppingUnavailable):
     """Raised when DOCX creation cannot be performed (e.g., python-docx missing)."""
@@ -52,6 +54,8 @@ def crop_signatures(
     logger: logging.Logger | None = None,
     return_bytes: Literal[False] = False,
     save_files: bool = True,
+    docx: bool = False,
+    trim: bool = True,
 ) -> list[Path]: ...
@@ -65,6 +69,8 @@ def crop_signatures(
     logger: logging.Logger | None = None,
     return_bytes: Literal[True],
     save_files: bool = True,
+    docx: bool = False,
+    trim: bool = True,
 ) -> list[SignatureCrop]: ...
@@ -77,14 +83,16 @@ def crop_signatures(
     logger: logging.Logger | None = None,
     return_bytes: bool = False,
     save_files: bool = True,
+    docx: bool = False,
+    trim: bool = True,
 ) -> list[Path] | list[SignatureCrop]:
-    """Render each signature bounding box to a PNG image and wrap it in a DOCX file.
+    """Render each signature bounding box to a PNG image and optionally wrap it in DOCX.
     Set ``return_bytes=True`` to collect in-memory PNG bytes for each crop while also writing
     the files to ``output_dir``. Set ``save_files=False`` to skip writing PNGs to disk.
-    When ``save_files`` is enabled, a one-image DOCX file is also written per signature crop.
-    When ``return_bytes`` is True and ``python-docx`` is available, ``SignatureCrop.docx_bytes``
-    will contain the DOCX payload.
+    When ``docx=True``, DOCX files are written instead of PNGs. When ``return_bytes`` is True
+    and ``docx=True``, ``SignatureCrop.docx_bytes`` will contain the DOCX payload.
+    When ``trim`` is enabled, the crop is tightened around the detected ink where possible.
     """
     if fitz is None:  # pragma: no cover - exercised when dependency absent
@@ -101,14 +109,11 @@ def crop_signatures(
     generated_paths: list[Path] = []
     generated_crops: list[SignatureCrop] = []
-    docx_to_disk = save_files
-    docx_in_memory = return_bytes
-    docx_enabled = docx_to_disk or docx_in_memory
+    docx_enabled = docx
     docx_available = Document is not None
-    if docx_enabled and not docx_available and logger:
-        logger.warning(
-            "Signature DOCX output unavailable",
-            extra={"error": "python-docx is required to generate DOCX outputs"},
+    if docx_enabled and not docx_available:
+        raise SignatureDocxUnavailable(
+            "python-docx is required to generate DOCX outputs for signature crops."
         )
     with fitz.open(pdf_path) as document:  # type: ignore[attr-defined]
@@ -146,8 +151,12 @@ def crop_signatures(
             try:
                 image_bytes: bytes | None = None
                 pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
+                raw_bytes = pixmap.tobytes("png")
+                final_bytes = _trim_signature_image_bytes(raw_bytes) if trim else raw_bytes
+                if save_files and not docx_enabled:
+                    png_destination.write_bytes(final_bytes)
                 if return_bytes or docx_enabled:
-                    image_bytes = pixmap.tobytes("png")
+                    image_bytes = final_bytes
             except Exception as exc:  # pragma: no cover - defensive
                 if logger:
                     logger.warning(
@@ -162,12 +171,12 @@ def crop_signatures(
                 continue
             docx_bytes: bytes | None = None
-            if docx_enabled and docx_available:
+            if docx_enabled:
                 if image_bytes is None:  # pragma: no cover - defensive
                     continue
                 try:
                     docx_bytes = _build_docx_bytes(image_bytes)
-                    if docx_to_disk:
+                    if save_files:
                         docx_destination.write_bytes(docx_bytes)
                 except SignatureDocxUnavailable as exc:
                     if logger:
@@ -184,14 +193,20 @@ def crop_signatures(
                         )
             if save_files:
-                signature.CropPath = str(docx_destination)
-                generated_paths.append(docx_destination)
+                if docx_enabled:
+                    signature.CropPath = None
+                    signature.CropDocxPath = str(docx_destination)
+                    generated_paths.append(docx_destination)
+                else:
+                    signature.CropDocxPath = None
+                    signature.CropPath = str(png_destination)
+                    generated_paths.append(png_destination)
             if return_bytes:
                 if image_bytes is None:  # pragma: no cover - defensive
                     continue
                 generated_crops.append(
                     SignatureCrop(
-                        path=docx_destination,
+                        path=docx_destination if docx_enabled else png_destination,
                         image_bytes=image_bytes,
                         signature=signature,
                         docx_bytes=docx_bytes,
@@ -214,6 +229,233 @@ def _build_docx_bytes(image_bytes: bytes) -> bytes:
     return buffer.getvalue()
+def _trim_signature_image_bytes(
+    image_bytes: bytes,
+    *,
+    pad_px: int = 4,
+    gap_px: int = 4,
+    min_density_ratio: float = 0.004,
+) -> bytes:
+    image = Image.open(io.BytesIO(image_bytes))
+    gray = image.convert("L")
+    width, height = gray.size
+    histogram = gray.histogram()
+    total_pixels = width * height
+    cutoff = int(total_pixels * 0.995)
+    cumulative = 0
+    white_level = 255
+    for idx, count in enumerate(histogram):
+        cumulative += count
+        if cumulative >= cutoff:
+            white_level = idx
+            break
+    if white_level < 200:
+        return image_bytes
+    thresholds = [min(254, max(200, white_level - delta)) for delta in (6, 4, 2, 1, 0)]
+    min_density = max(2, int(width * min_density_ratio))
+    pixels = gray.load()
+    row_densities: dict[int, list[int]] = {}
+    for threshold in thresholds:
+        row_density = []
+        for y in range(height):
+            dark = sum(1 for x in range(width) if pixels[x, y] < threshold)
+            row_density.append(dark)
+        row_densities[threshold] = row_density
+    line_bounds = _detect_horizontal_rule_cutoff(row_densities[thresholds[-1]], width)
+    scan_limit = None
+    descender_limit = height - 1
+    if line_bounds is not None:
+        line_start, line_end = line_bounds
+        scan_limit = max(0, line_start - 1)
+        descender_limit = min(height - 1, line_end + max(2, int(height * 0.02)))
+    min_band_height = max(4, int(height * 0.02))
+    best = None
+    best_small = None
+    best_small_threshold = None
+    best_threshold = None
+    line_threshold = int(width * 0.6)
+    for threshold in thresholds:
+        row_density = row_densities[threshold]
+        segments: list[tuple[int, int]] = []
+        start: int | None = None
+        for y, dark in enumerate(row_density):
+            if scan_limit is not None and y > scan_limit:
+                if start is not None:
+                    segments.append((start, y - 1))
+                    start = None
+                break
+            if dark >= min_density:
+                if start is None:
+                    start = y
+            else:
+                if start is not None:
+                    segments.append((start, y - 1))
+                    start = None
+        if start is not None:
+            segments.append((start, height - 1))
+        if not segments:
+            continue
+        merged: list[list[int]] = []
+        for seg in segments:
+            if not merged:
+                merged.append([seg[0], seg[1]])
+                continue
+            if seg[0] - merged[-1][1] <= gap_px:
+                merged[-1][1] = seg[1]
+            else:
+                merged.append([seg[0], seg[1]])
+        candidates = []
+        for y0, y1 in merged:
+            min_x, max_x = width, -1
+            total_dark = 0
+            for y in range(y0, y1 + 1):
+                for x in range(width):
+                    if pixels[x, y] < threshold:
+                        total_dark += 1
+                        if x < min_x:
+                            min_x = x
+                        if x > max_x:
+                            max_x = x
+            if max_x < 0:
+                continue
+            band_height = y1 - y0 + 1
+            band_width = max_x - min_x + 1
+            score = total_dark * (band_height**1.3)
+            if line_bounds is not None:
+                distance = max(0, line_bounds[0] - y1)
+                proximity = 1.0 / (1.0 + (distance / 20.0))
+                score *= 1.0 + 0.5 * proximity
+            candidates.append(
+                {
+                    "y0": y0,
+                    "y1": y1,
+                    "min_x": min_x,
+                    "max_x": max_x,
+                    "total": total_dark,
+                    "height": band_height,
+                    "width": band_width,
+                    "score": score,
+                }
+            )
+        if not candidates:
+            continue
+        candidates.sort(key=lambda item: item["score"], reverse=True)
+        top_candidate = candidates[0]
+        if top_candidate["height"] >= min_band_height:
+            if best is None or top_candidate["score"] > best["score"]:
+                best = top_candidate
+                best_threshold = threshold
+        else:
+            if best_small is None or top_candidate["score"] > best_small["score"]:
+                best_small = top_candidate
+                best_small_threshold = threshold
+    if best is None:
+        best = best_small
+        best_threshold = best_small_threshold
+    if best is None:
+        return image_bytes
+    expansion_density = row_densities.get(best_threshold, row_densities[thresholds[-1]])
+    expand_threshold = max(1, int(min_density * 0.4))
+    y0 = best["y0"]
+    y1 = best["y1"]
+    while y0 > 0 and expansion_density[y0 - 1] >= expand_threshold:
+        y0 -= 1
+    while y1 < descender_limit and expansion_density[y1 + 1] >= expand_threshold:
+        y1 += 1
+    min_x, max_x = width, -1
+    for y in range(y0, y1 + 1):
+        if expansion_density[y] >= line_threshold:
+            continue
+        for x in range(width):
+            if pixels[x, y] < thresholds[-1]:
+                if x < min_x:
+                    min_x = x
+                if x > max_x:
+                    max_x = x
+    if max_x >= 0:
+        best = {
+            "y0": y0,
+            "y1": y1,
+            "min_x": min_x,
+            "max_x": max_x,
+        }
+    x0 = max(0, best["min_x"] - pad_px)
+    x1 = min(width - 1, best["max_x"] + pad_px)
+    y0 = max(0, best["y0"] - pad_px)
+    y1 = min(height - 1, best["y1"] + pad_px)
+    if x1 <= x0 or y1 <= y0:
+        return image_bytes
+    if (x1 - x0) < max(10, int(width * 0.2)) or (y1 - y0) < max(6, int(height * 0.08)):
+        return image_bytes
+    cropped = image.crop((x0, y0, x1 + 1, y1 + 1))
+    buffer = io.BytesIO()
+    cropped.save(buffer, format="PNG")
+    return buffer.getvalue()
+def _detect_horizontal_rule_cutoff(
+    row_density: list[int],
+    width: int,
+) -> tuple[int, int] | None:
+    if not row_density:
+        return None
+    line_threshold = int(width * 0.6)
+    max_thickness = 4
+    segments: list[tuple[int, int]] = []
+    start = None
+    for y, density in enumerate(row_density):
+        if density >= line_threshold:
+            if start is None:
+                start = y
+        else:
+            if start is not None:
+                segments.append((start, y - 1))
+                start = None
+    if start is not None:
+        segments.append((start, len(row_density) - 1))
+    if not segments:
+        return None
+    total_dark = sum(row_density)
+    if total_dark <= 0:
+        return None
+    for y0, y1 in segments:
+        thickness = y1 - y0 + 1
+        if thickness > max_thickness:
+            continue
+        above_dark = sum(row_density[:y0])
+        below_dark = sum(row_density[y1 + 1 :])
+        if above_dark < 40:
+            continue
+        midpoint_ratio = ((y0 + y1) / 2.0) / max(1, len(row_density))
+        if midpoint_ratio >= 0.35:
+            return (y0, y1)
+        if above_dark >= max(40, int(below_dark * 0.3)):
+            return (y0, y1)
+    return None
 def _to_clip_rect(page, bbox: tuple[float, float, float, float]):
     width = float(page.rect.width)
     height = float(page.rect.height)

sigdetect/detector/signature_model.py CHANGED Viewed

@@ -21,6 +21,8 @@ class Signature:
     BoundingBox: tuple[float, float, float, float] | None = None
     CropPath: str | None = None
     CropBytes: str | None = None
+    CropDocxPath: str | None = None
+    CropDocxBytes: str | None = None
     def to_dict(self) -> dict[str, Any]:
         """Return the legacy snake_case representation used in JSON payloads."""
@@ -37,4 +39,6 @@ class Signature:
             "bounding_box": list(self.BoundingBox) if self.BoundingBox else None,
             "crop_path": self.CropPath,
             "crop_bytes": self.CropBytes,
+            "crop_docx_path": self.CropDocxPath,
+            "crop_docx_bytes": self.CropDocxBytes,
         }

sigdetect/wet_detection.py CHANGED Viewed

@@ -94,6 +94,17 @@ def apply_wet_detection(
     original_mixed = file_result.MixedContent
     try:
         added = _detect(pdf_path, configuration, file_result, logger=logger)
+        if added and configuration.Profile == "hipaa":
+            updated = False
+            for signature in file_result.Signatures:
+                if signature.RenderType == "wet" and (signature.Role or "unknown") == "unknown":
+                    signature.Role = "patient"
+                    signature.Scores = {"patient": int(signature.Score or 0)}
+                    signature.Evidence = list(signature.Evidence or [])
+                    signature.Evidence.append("role_default:patient")
+                    updated = True
+            if updated:
+                _refresh_metadata(file_result)
         if not added:
             _mark_manual_review(file_result, "NoHighConfidenceWetSignature")
         return added
@@ -136,6 +147,18 @@ def _detect(
                     scale=configuration.WetOcrDpi / 72.0,
                 )
             )
+            if not candidates:
+                candidates = list(
+                    _build_candidates(
+                        ocr_lines,
+                        image=image,
+                        page_rect=page.rect,
+                        pix_width=pixmap.width,
+                        pix_height=pixmap.height,
+                        scale=configuration.WetOcrDpi / 72.0,
+                        min_y_ratio=0.2,
+                    )
+                )
             candidates.extend(_image_candidates(page))
             candidates = _filter_candidates_for_page(candidates)
             accepted = [
@@ -247,6 +270,7 @@ def _build_candidates(
     pix_width: int,
     pix_height: int,
     scale: float,
+    min_y_ratio: float = 0.4,
 ) -> Iterable[WetCandidate]:
     for line in lines:
         normalized = line.text.lower()
@@ -255,7 +279,7 @@ def _build_candidates(
         if len(normalized) > 80:
             # Ignore long paragraph-like OCR lines
             continue
-        if (line.bottom / pix_height) < 0.4:
+        if (line.bottom / pix_height) < min_y_ratio:
             # Ignore lines in the upper section of the page
             continue
         role = _infer_role(normalized)
@@ -338,28 +362,33 @@ def _expand_bbox(
 ) -> tuple[float, float, float, float]:
     x0 = line.left / scale
     x1 = line.right / scale
-    y1 = (pix_height - line.top) / scale
+    y_top = (pix_height - line.top) / scale
+    y_bottom = (pix_height - line.bottom) / scale
     pad_x = max(14.0, (x1 - x0) * 0.25)
     left = max(page_rect.x0, x0 - pad_x)
     right = min(page_rect.x1, x1 + pad_x)
     gap = 14.0
-    signature_height = 70.0
-    top = min(page_rect.y1, y1 + gap)
-    bottom = min(page_rect.y1, top + signature_height)
-    if bottom <= top:
-        bottom = min(page_rect.y1, top + signature_height)
+    line_height = max(1.0, (line.bottom - line.top) / scale)
+    signature_height = max(70.0, line_height * 6.0)
+    upper = min(page_rect.y1, y_bottom - gap)
+    upper = max(page_rect.y0, upper)
+    lower = max(page_rect.y0, upper - signature_height)
     if stroke_y is not None:
-        # Anchor to the detected stroke under the OCR label when available.
+        # Anchor to the detected stroke (signature line) beneath the label.
         sy = (pix_height - stroke_y) / scale
-        if sy < top:
-            top = sy
-        bottom = max(bottom, sy + signature_height)
+        field_lower = min(page_rect.y1, max(page_rect.y0, sy + 2.0))
+        field_upper = min(page_rect.y1, y_bottom - gap)
+        if field_upper > field_lower + 6.0:
+            lower = field_lower
+            upper = field_upper
+        else:
+            upper = min(page_rect.y1, field_lower + signature_height)
+            lower = max(page_rect.y0, upper - signature_height)
-    return (float(left), float(top), float(right), float(bottom))
+    return (float(left), float(lower), float(right), float(upper))
 def _stroke_under_line(image: Image.Image, line: OcrLine) -> tuple[bool, float | None]:
@@ -513,14 +542,19 @@ def _signature_rank(signature: Signature) -> tuple[int, int, int]:
 def _dedupe_wet_signatures(signatures: Sequence[Signature]) -> list[Signature]:
     best_by_role: dict[str, Signature] = {}
+    best_unknown: Signature | None = None
     for signature in signatures:
         role = (signature.Role or "unknown").strip().lower()
         if role == "unknown":
+            if best_unknown is None or _signature_rank(signature) > _signature_rank(best_unknown):
+                best_unknown = signature
             continue
         existing = best_by_role.get(role)
         if existing is None or _signature_rank(signature) > _signature_rank(existing):
             best_by_role[role] = signature
-    return sorted(best_by_role.values(), key=lambda sig: (int(sig.Page or 0), sig.Role or ""))
+    if best_by_role:
+        return sorted(best_by_role.values(), key=lambda sig: (int(sig.Page or 0), sig.Role or ""))
+    return [best_unknown] if best_unknown is not None else []
 def _mark_manual_review(file_result: FileResult, reason: str) -> None:

{sigdetect-0.5.0.dist-info → sigdetect-0.5.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sigdetect
-Version: 0.5.0
+Version: 0.5.2
 Summary: Signature detection and role attribution for PDFs
 Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
 License: MIT
@@ -105,7 +105,7 @@ sigdetect detect \
   - `retainer` → client / firm (prefers detecting two signatures)
 - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
 - Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
-- Cropping (`--crop-signatures`) writes a one-image `.docx` per signature in the crop output directory (no PNG files on disk); `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` for in-memory use. PyMuPDF is required for crops, and `python-docx` is required for `.docx` output.
+- Cropping (`--crop-signatures`) writes PNG crops to disk by default; enable `--crop-docx` to write DOCX files instead of PNGs. `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` and, when `--crop-docx` is enabled, embeds DOCX bytes in `signatures[].crop_docx_bytes`. PyMuPDF is required for crops, and `python-docx` is required for DOCX output.
 - Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
 - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
@@ -142,7 +142,7 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
 print(result.to_dict())
 ~~~
-`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When cropping is enabled, `crop_path` points at the generated `.docx`. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
+`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image; when DOCX cropping is enabled, `crop_docx_path` points at the generated doc. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
 ---
@@ -194,7 +194,7 @@ for res in ScanDirectory(
     # store in DB, print, etc.
     pass
-# 3) Create DOCX crops for FileResult objects (requires PyMuPDF + python-docx)
+# 3) Crop signature snippets for FileResult objects (requires PyMuPDF; DOCX needs python-docx)
 detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
 file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
 CropSignatureImages(
@@ -233,7 +233,8 @@ High-level summary (per file):
       "hint": "AcroSig:sig_patient",
       "render_type": "typed",
       "bounding_box": [10.0, 10.0, 150.0, 40.0],
-      "crop_path": "signature_crops/example/sig_01_patient.docx"
+      "crop_path": "signature_crops/example/sig_01_patient.png",
+      "crop_docx_path": null
     },
     {
       "page": null,
@@ -259,8 +260,10 @@ High-level summary (per file):
 - **`roles`** summarizes unique non-`unknown` roles across signatures.
 - In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
 - **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
-- **`signatures[].crop_path`** is populated when DOCX crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
+- **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
+- **`signatures[].crop_docx_path`** is populated when DOCX crops are generated (`--crop-docx` or `docx=True`).
 - **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
+- **`signatures[].crop_docx_bytes`** contains base64 DOCX data when `--crop-docx` and `--crop-bytes` are enabled together.
 ---
@@ -287,7 +290,8 @@ write_results: false
 pseudo_signatures: true
 recurse_xobjects: true
 profile: retainer    # or: hipaa
-crop_signatures: false   # enable to write DOCX crops (requires pymupdf + python-docx)
+crop_signatures: false   # enable to write PNG crops (requires pymupdf)
+crop_docx: false         # enable to write DOCX crops instead of PNGs (requires python-docx)
 # crop_output_dir: ./signature_crops
 crop_image_dpi: 200
 detect_wet_signatures: false   # kept for compatibility; non-e-sign PDFs still trigger OCR

{sigdetect-0.5.0.dist-info → sigdetect-0.5.2.dist-info}/RECORD RENAMED Viewed

@@ -1,12 +1,12 @@
 sigdetect/__init__.py,sha256=YvnTwlC1jfq83EhQS_1JjiiHK7_wJCCU1JvHv5E1qWY,573
-sigdetect/api.py,sha256=uaU7JbSGpyViiXrrHu-iuifIi8xIes3PGeBZkoLNlPg,10800
-sigdetect/cli.py,sha256=d5AznKwQPvYKVzC8RCBDgC9SlB4Goz1_pB2_EFzrsTg,10349
-sigdetect/config.py,sha256=rJdlu9pM4aqeoY7Ha5qocPmZ7_UeVOOFepBlqOne2b8,7873
-sigdetect/cropping.py,sha256=UeKL6dBY18V1E2DoLSbGjTzdGnjhz2WKPi3l3Q0Brh8,8516
+sigdetect/api.py,sha256=hDfa6z4SoHth1Dw9HDfSPiytMQrqu_oyBZlXBwSh9g4,11010
+sigdetect/cli.py,sha256=X5GqZ-PK67vz4OHN5r7h-V0hO886ZblUiUdKDuFowtU,10930
+sigdetect/config.py,sha256=3SP1rkcWBGXloCDFomBJRMRKZOvXuHQbhIBqpVrzYmY,8365
+sigdetect/cropping.py,sha256=IyiBfIEHBLvOv8t_d-O51BfpljTFpE-dG_RxDxJAzAo,16339
 sigdetect/eda.py,sha256=S92G1Gjmepri__D0n_V6foq0lQgH-RXI9anW8A58jfw,4681
 sigdetect/logging_setup.py,sha256=LMF8ao_a-JwH0S522T6aYTFX3e8Ajjv_5ODS2YiBcHA,6404
 sigdetect/utils.py,sha256=T9rubLf5T9JmjOHYMOba1j34fhOJaWocAXccnGTxRUE,5198
-sigdetect/wet_detection.py,sha256=zvi11XUmm_xLZ4BLvxInwMQg8YLcyQzEYAM9QSdJOIs,18259
+sigdetect/wet_detection.py,sha256=ofKijykm4fKrvFaVkEkPPKL9iKeRNvlAiKkD2vHxD8k,20025
 sigdetect/data/role_rules.retainer.yml,sha256=IFdwKnDBXR2cTkdfrsZ6ku6CXD8S_dg5A3vKRKLW5h8,2532
 sigdetect/data/role_rules.yml,sha256=HuLKsZR_A6sD9XvY4NHiY_VG3dS5ERNCBF9-Mxawomw,2751
 sigdetect/data/vendor_patterns.yml,sha256=NRbZNQxcx_GuL6n1jAphBn6MM6ChCpeWGCsjbRx-PEo,384
@@ -16,9 +16,9 @@ sigdetect/detector/base_detector.py,sha256=GmAgUWO_fQgIfnihZSoyhR3wpnwZ-X3hS0Kuy
 sigdetect/detector/file_result_model.py,sha256=j2gTc9Sw3fJOHlexYsR_m5DiwHA8DzIzAMToESfvo4A,1767
 sigdetect/detector/pymupdf_engine.py,sha256=N6oxvUa-48VvvhjbMk0R0kfScsggNKS7u5FLSeBRfWw,17358
 sigdetect/detector/pypdf2_engine.py,sha256=kB8cIp_gMvCla0LIBi9sd19g0361Oc9TjCW_ZViUBJQ,47410
-sigdetect/detector/signature_model.py,sha256=0SEUc34wvOvrzy_fDzzD42A9LsSzIOeZ4rERPDHimsA,1149
-sigdetect-0.5.0.dist-info/METADATA,sha256=-Jgo6JZwWA18uqhjBv2mqZc43y9KHLfpMoPec7ObGow,13628
-sigdetect-0.5.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-sigdetect-0.5.0.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
-sigdetect-0.5.0.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
-sigdetect-0.5.0.dist-info/RECORD,,
+sigdetect/detector/signature_model.py,sha256=T2Hmfkfz_hZsDzwOhepxfNmkedxQp3_XHdrP8yGKoCk,1322
+sigdetect-0.5.2.dist-info/METADATA,sha256=jLin7USVPqeA5tS7KCuPRRt1PLwdt-oJWhWuKSQa6hE,14131
+sigdetect-0.5.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+sigdetect-0.5.2.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
+sigdetect-0.5.2.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
+sigdetect-0.5.2.dist-info/RECORD,,

{sigdetect-0.5.0.dist-info → sigdetect-0.5.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{sigdetect-0.5.0.dist-info → sigdetect-0.5.2.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{sigdetect-0.5.0.dist-info → sigdetect-0.5.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

sigdetect 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

sigdetect 0.5.0-py3-none-any.whl → 0.5.2-py3-none-any.whl