PyPI - sigdetect - Versions diffs - 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl - Mend

sigdetect 0.4.0 (py3-none-any.whl) → 0.5.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13):

sigdetect/api.py +48 -12
sigdetect/cli.py +70 -28
sigdetect/config.py +17 -0
sigdetect/cropping.py +78 -15
sigdetect/detector/__init__.py +10 -8
sigdetect/detector/pymupdf_engine.py +2 -2
sigdetect/detector/signature_model.py +6 -0
sigdetect/wet_detection.py +63 -13
{sigdetect-0.4.0.dist-info → sigdetect-0.5.1.dist-info}/METADATA +25 -12
{sigdetect-0.4.0.dist-info → sigdetect-0.5.1.dist-info}/RECORD +13 -13
{sigdetect-0.4.0.dist-info → sigdetect-0.5.1.dist-info}/WHEEL +1 -1
{sigdetect-0.4.0.dist-info → sigdetect-0.5.1.dist-info}/entry_points.txt +0 -0
{sigdetect-0.4.0.dist-info → sigdetect-0.5.1.dist-info}/top_level.txt +0 -0

sigdetect/api.py CHANGED Viewed

@@ -9,6 +9,7 @@ from typing import Any, Generator, Iterable, Iterator, Literal, overload
 from sigdetect.config import DetectConfiguration
 from sigdetect.cropping import SignatureCrop
 from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
+from sigdetect.wet_detection import apply_wet_detection
 EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
 ProfileName = Literal["hipaa", "retainer"]
@@ -21,9 +22,13 @@ def DetectPdf(
     engineName: EngineName = "auto",
     includePseudoSignatures: bool = True,
     recurseXObjects: bool = True,
+    runWetDetection: bool = True,
     detector: Detector | None = None,
 ) -> dict[str, Any]:
-    """Detect signature evidence and assign roles for a single PDF."""
+    """Detect signature evidence and assign roles for a single PDF.
+    Wet detection runs by default for non-e-sign PDFs; pass ``runWetDetection=False`` to skip OCR.
+    """
     resolvedPath = Path(pdfPath)
     activeDetector = detector or get_detector(
@@ -36,6 +41,10 @@ def DetectPdf(
     )
     result = activeDetector.Detect(resolvedPath)
+    if runWetDetection:
+        configuration = _ResolveConfiguration(activeDetector)
+        if configuration is not None:
+            apply_wet_detection(resolvedPath, configuration, result)
     return _ToPlainDictionary(result)
@@ -48,7 +57,10 @@ def get_detector(
     recurseXObjects: bool = True,
     outputDirectory: str | Path | None = None,
 ) -> Detector:
-    """Return a reusable detector instance configured with the supplied options."""
+    """Return a reusable detector instance configured with the supplied options.
+    Engine selection is forced to ``auto`` (prefers PyMuPDF when available).
+    """
     configuration = DetectConfiguration(
         PdfRoot=Path(pdfRoot) if pdfRoot is not None else Path.cwd(),
@@ -108,6 +120,7 @@ def _ToPlainValue(value: Any) -> Any:
 def DetectMany(
     pdfPaths: Iterable[str | Path],
     *,
+    runWetDetection: bool = True,
     detector: Detector | None = None,
     **kwargs: Any,
 ) -> Iterator[dict[str, Any]]:
@@ -115,17 +128,18 @@ def DetectMany(
     if detector is not None:
         for pdfPath in pdfPaths:
-            yield _DetectWithDetector(detector, pdfPath)
+            yield _DetectWithDetector(detector, pdfPath, runWetDetection=runWetDetection)
         return
     for pdfPath in pdfPaths:
-        yield DetectPdf(pdfPath, **kwargs)
+        yield DetectPdf(pdfPath, runWetDetection=runWetDetection, **kwargs)
 def ScanDirectory(
     pdfRoot: str | Path,
     *,
     globPattern: str = "**/*.pdf",
+    runWetDetection: bool = True,
     detector: Detector | None = None,
     **kwargs: Any,
 ) -> Iterator[dict[str, Any]]:
@@ -143,7 +157,7 @@ def ScanDirectory(
     for pdfPath in iterator:
         if pdfPath.is_file() and pdfPath.suffix.lower() == ".pdf":
-            yield DetectPdf(pdfPath, detector=detector, **kwargs)
+            yield DetectPdf(pdfPath, detector=detector, runWetDetection=runWetDetection, **kwargs)
 def ToCsvRow(result: dict[str, Any]) -> dict[str, Any]:
@@ -174,11 +188,25 @@ def Version() -> str:
         return "0.0.0-dev"
-def _DetectWithDetector(detector: Detector, pdfPath: str | Path) -> dict[str, Any]:
+def _DetectWithDetector(
+    detector: Detector, pdfPath: str | Path, *, runWetDetection: bool
+) -> dict[str, Any]:
     """Helper that runs ``detector`` and returns the plain dictionary result."""
     resolvedPath = Path(pdfPath)
-    return _ToPlainDictionary(detector.Detect(resolvedPath))
+    result = detector.Detect(resolvedPath)
+    if runWetDetection:
+        configuration = _ResolveConfiguration(detector)
+        if configuration is not None:
+            apply_wet_detection(resolvedPath, configuration, result)
+    return _ToPlainDictionary(result)
+def _ResolveConfiguration(detector: Detector) -> DetectConfiguration | None:
+    configuration = getattr(detector, "Configuration", None)
+    if isinstance(configuration, DetectConfiguration):
+        return configuration
+    return None
 @contextmanager
@@ -201,8 +229,8 @@ def CropSignatureImages(
     dpi: int = 200,
     returnBytes: Literal[False] = False,
     saveToDisk: bool = True,
-) -> list[Path]:
-    ...
+    docx: bool = False,
+) -> list[Path]: ...
 @overload
@@ -214,8 +242,8 @@ def CropSignatureImages(
     dpi: int,
     returnBytes: Literal[True],
     saveToDisk: bool,
-) -> list[SignatureCrop]:
-    ...
+    docx: bool = False,
+) -> list[SignatureCrop]: ...
 def CropSignatureImages(
@@ -226,13 +254,17 @@ def CropSignatureImages(
     dpi: int = 200,
     returnBytes: bool = False,
     saveToDisk: bool = True,
+    docx: bool = False,
 ) -> list[Path] | list[SignatureCrop]:
-    """Crop detected signature regions to PNG files.
+    """Create PNG files containing cropped signature images (or DOCX when enabled).
     Accepts either a :class:`FileResult` instance or the ``dict`` returned by
     :func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.
     Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop. Set
     ``saveToDisk=False`` to skip writing PNG files while still returning in-memory data.
+    When ``docx`` is True, DOCX files are written instead of PNG files. When ``returnBytes`` is
+    True and ``docx`` is enabled, the returned :class:`SignatureCrop` objects include
+    ``docx_bytes``.
     """
     from sigdetect.cropping import crop_signatures
@@ -245,6 +277,7 @@ def CropSignatureImages(
         dpi=dpi,
         return_bytes=returnBytes,
         save_files=saveToDisk,
+        docx=docx,
     )
     if original_dict is not None:
         original_dict.clear()
@@ -275,6 +308,9 @@ def _CoerceFileResult(
                 RenderType=str(entry.get("render_type") or "unknown"),
                 BoundingBox=tuple(bbox) if bbox else None,
                 CropPath=entry.get("crop_path"),
+                CropBytes=entry.get("crop_bytes"),
+                CropDocxPath=entry.get("crop_docx_path"),
+                CropDocxBytes=entry.get("crop_docx_bytes"),
             )
         )

sigdetect/cli.py CHANGED Viewed

@@ -2,6 +2,7 @@
 from __future__ import annotations
+import base64
 import json
 from collections.abc import Iterator
 from dataclasses import asdict, is_dataclass
@@ -48,6 +49,12 @@ def Detect(
     configurationPath: Path | None = typer.Option(
         None, "--config", "-c", help="Path to YAML config"
     ),
+    writeResults: bool | None = typer.Option(
+        None,
+        "--write-results/--no-write-results",
+        help="Write results.json (or JSON to stdout when out_dir is none)",
+        show_default=False,
+    ),
     profileOverride: str | None = typer.Option(None, "--profile", "-p", help="hipaa or retainer"),
     recursive: bool = typer.Option(
         True,
@@ -57,13 +64,19 @@ def Detect(
     cropSignatures: bool | None = typer.Option(
         None,
         "--crop-signatures/--no-crop-signatures",
-        help="Crop detected signature regions to PNG files (requires PyMuPDF)",
+        help="Write PNG crops for signature widgets (requires PyMuPDF)",
+        show_default=False,
+    ),
+    cropDocx: bool | None = typer.Option(
+        None,
+        "--crop-docx/--no-crop-docx",
+        help="Write DOCX crops instead of PNG files (requires PyMuPDF + python-docx)",
         show_default=False,
     ),
     cropDirectory: Path | None = typer.Option(
         None,
         "--crop-dir",
-        help="Directory for signature PNG crops (defaults to out_dir/signature_crops)",
+        help="Directory for signature crops (defaults to out_dir/signature_crops)",
     ),
     cropDpi: int | None = typer.Option(
         None,
@@ -73,10 +86,16 @@ def Detect(
         help="Rendering DPI for signature crops",
         show_default=False,
     ),
+    cropBytes: bool = typer.Option(
+        False,
+        "--crop-bytes/--no-crop-bytes",
+        help="Embed base64 PNG bytes (and DOCX bytes when --crop-docx) in results JSON",
+        show_default=False,
+    ),
     detectWetSignatures: bool | None = typer.Option(
         None,
         "--detect-wet/--no-detect-wet",
-        help="Run OCR-backed wet signature detection (requires PyMuPDF + Tesseract)",
+        help="Compatibility flag; non-e-sign PDFs always run OCR when deps are available",
         show_default=False,
     ),
     wetOcrDpi: int | None = typer.Option(
@@ -111,8 +130,12 @@ def Detect(
         configuration = configuration.model_copy(update={"Profile": normalized_profile})
     overrides: dict[str, object] = {}
+    if writeResults is not None:
+        overrides["WriteResults"] = writeResults
     if cropSignatures is not None:
         overrides["CropSignatures"] = cropSignatures
+    if cropDocx is not None:
+        overrides["CropDocx"] = cropDocx
     if cropDirectory is not None:
         overrides["CropOutputDirectory"] = cropDirectory
     if cropDpi is not None:
@@ -145,53 +168,66 @@ def Detect(
     except StopIteration:
         raise SystemExit(f"No PDFs found in {configuration.PdfRoot}") from None
-    results_buffer: list[FileResult] | None = [] if configuration.OutputDirectory is None else None
+    write_results = configuration.WriteResults
+    results_buffer: list[FileResult] | None = (
+        [] if write_results and configuration.OutputDirectory is None else None
+    )
     json_handle = None
     json_path: Path | None = None
     wrote_first = False
-    if configuration.OutputDirectory is not None:
+    if write_results and configuration.OutputDirectory is not None:
         outputDirectory = configuration.OutputDirectory
         outputDirectory.mkdir(parents=True, exist_ok=True)
         json_path = outputDirectory / "results.json"
         json_handle = open(json_path, "w", encoding="utf-8")
         json_handle.write("[")
+    crop_bytes_enabled = bool(cropBytes)
     crop_dir = configuration.CropOutputDirectory
+    if crop_dir is None:
+        base_dir = configuration.OutputDirectory or configuration.PdfRoot
+        crop_dir = base_dir / "signature_crops"
     cropping_enabled = configuration.CropSignatures
+    docx_enabled = configuration.CropDocx
     cropping_available = True
     cropping_attempted = False
-    if configuration.CropSignatures and crop_dir is None:
-        Logger.warning(
-            "CropSignatures enabled without an output directory",
-            extra={"pdf_root": str(configuration.PdfRoot)},
-        )
-        cropping_enabled = False
     total_bboxes = 0
     def _append_result(file_result: FileResult, source_pdf: Path) -> None:
         nonlocal wrote_first, json_handle, total_bboxes, cropping_available, cropping_attempted
-        if cropping_enabled and cropping_available and crop_dir is not None:
+        if cropping_available and (cropping_enabled or crop_bytes_enabled) and crop_dir is not None:
             try:
-                crop_signatures(
+                crops = crop_signatures(
                     pdf_path=source_pdf,
                     file_result=file_result,
                     output_dir=crop_dir,
                     dpi=configuration.CropImageDpi,
                     logger=Logger,
+                    return_bytes=crop_bytes_enabled,
+                    save_files=cropping_enabled,
+                    docx=docx_enabled,
                 )
                 cropping_attempted = True
+                if crop_bytes_enabled:
+                    for crop in crops:
+                        crop.signature.CropBytes = base64.b64encode(crop.image_bytes).decode(
+                            "ascii"
+                        )
+                        if crop.docx_bytes:
+                            crop.signature.CropDocxBytes = base64.b64encode(
+                                crop.docx_bytes
+                            ).decode("ascii")
             except SignatureCroppingUnavailable as exc:
                 cropping_available = False
                 Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
                 typer.echo(str(exc), err=True)
             except Exception as exc:  # pragma: no cover - defensive
-                Logger.warning(
-                    "Unexpected error while cropping signatures",
-                    extra={"error": str(exc)},
-                )
+                cropping_available = False
+                Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
+                typer.echo(str(exc), err=True)
         total_bboxes += sum(1 for sig in file_result.Signatures if sig.BoundingBox)
@@ -231,18 +267,24 @@ def Detect(
             json_handle.write(closing)
             json_handle.close()
-    if json_handle is not None:
-        typer.echo(f"Wrote {json_path}")
-    else:
-        payload = json.dumps(
-            results_buffer or [], indent=2, ensure_ascii=False, default=_JsonSerializer
-        )
-        typer.echo(payload)
-        typer.echo("Detection completed with output disabled (out_dir=none)")
-    if cropping_enabled and cropping_available and cropping_attempted and total_bboxes == 0:
+    if write_results:
+        if json_handle is not None:
+            typer.echo(f"Wrote {json_path}")
+        else:
+            payload = json.dumps(
+                results_buffer or [], indent=2, ensure_ascii=False, default=_JsonSerializer
+            )
+            typer.echo(payload)
+            typer.echo("Detection completed with output disabled (out_dir=none)")
+    if (
+        (cropping_enabled or crop_bytes_enabled)
+        and cropping_available
+        and cropping_attempted
+        and total_bboxes == 0
+    ):
         Logger.warning(
-            "No signature bounding boxes detected; try --engine pymupdf for crop-ready output",
+            "No signature bounding boxes detected; install PyMuPDF for crop-ready output",
             extra={"engine": configuration.Engine},
         )

sigdetect/config.py CHANGED Viewed

@@ -25,11 +25,13 @@ class DetectConfiguration(BaseModel):
     PdfRoot: Path = Field(default=Path("hipaa_results"), alias="pdf_root")
     OutputDirectory: Path | None = Field(default=Path("out"), alias="out_dir")
+    WriteResults: bool = Field(default=False, alias="write_results")
     Engine: EngineName = Field(default="auto", alias="engine")
     Profile: ProfileName = Field(default="hipaa", alias="profile")
     PseudoSignatures: bool = Field(default=True, alias="pseudo_signatures")
     RecurseXObjects: bool = Field(default=True, alias="recurse_xobjects")
     CropSignatures: bool = Field(default=True, alias="crop_signatures")
+    CropDocx: bool = Field(default=False, alias="crop_docx")
     CropOutputDirectory: Path | None = Field(default=None, alias="crop_output_dir")
     CropImageDpi: int = Field(default=200, alias="crop_image_dpi", ge=72, le=600)
     DetectWetSignatures: bool = Field(default=True, alias="detect_wet_signatures")
@@ -63,6 +65,10 @@ class DetectConfiguration(BaseModel):
     def out_dir(self) -> Path | None:  # pragma: no cover - simple passthrough
         return self.OutputDirectory
+    @property
+    def write_results(self) -> bool:  # pragma: no cover - simple passthrough
+        return self.WriteResults
     @property
     def engine(self) -> EngineName:  # pragma: no cover - simple passthrough
         return self.Engine
@@ -83,6 +89,10 @@ class DetectConfiguration(BaseModel):
     def crop_signatures(self) -> bool:  # pragma: no cover - simple passthrough
         return self.CropSignatures
+    @property
+    def crop_docx(self) -> bool:  # pragma: no cover - simple passthrough
+        return self.CropDocx
     @property
     def crop_output_dir(self) -> Path | None:  # pragma: no cover - simple passthrough
         return self.CropOutputDirectory
@@ -128,6 +138,7 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
     env_out_dir = os.getenv("SIGDETECT_OUT_DIR")
     env_profile = os.getenv("SIGDETECT_PROFILE")
     env_crop = os.getenv("SIGDETECT_CROP_SIGNATURES")
+    env_crop_docx = os.getenv("SIGDETECT_CROP_DOCX")
     env_crop_dir = os.getenv("SIGDETECT_CROP_DIR")
     env_crop_dpi = os.getenv("SIGDETECT_CROP_DPI")
     env_detect_wet = os.getenv("SIGDETECT_DETECT_WET")
@@ -154,6 +165,12 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
             raw_data["crop_signatures"] = True
         elif lowered in {"0", "false", "no", "off"}:
             raw_data["crop_signatures"] = False
+    if env_crop_docx is not None:
+        lowered = env_crop_docx.lower()
+        if lowered in {"1", "true", "yes", "on"}:
+            raw_data["crop_docx"] = True
+        elif lowered in {"0", "false", "no", "off"}:
+            raw_data["crop_docx"] = False
     if env_crop_dir:
         raw_data["crop_output_dir"] = env_crop_dir
     if env_crop_dpi:

sigdetect/cropping.py CHANGED Viewed

@@ -1,7 +1,8 @@
-"""Helpers for converting signature bounding boxes into PNG crops."""
+"""Helpers for converting signature bounding boxes into PNG or DOCX crops."""
 from __future__ import annotations
+import io
 import logging
 import re
 from dataclasses import dataclass
@@ -16,18 +17,28 @@ try:  # pragma: no cover - optional dependency
 except Exception:  # pragma: no cover - optional dependency
     fitz = None  # type: ignore[misc]
+try:  # pragma: no cover - optional dependency
+    from docx import Document  # type: ignore
+except Exception:  # pragma: no cover - optional dependency
+    Document = None  # type: ignore[assignment]
 class SignatureCroppingUnavailable(RuntimeError):
     """Raised when PNG cropping cannot be performed (e.g., PyMuPDF missing)."""
+class SignatureDocxUnavailable(SignatureCroppingUnavailable):
+    """Raised when DOCX creation cannot be performed (e.g., python-docx missing)."""
 @dataclass(slots=True)
 class SignatureCrop:
-    """PNG crop metadata and in-memory content."""
+    """Crop metadata and in-memory content."""
     path: Path
     image_bytes: bytes
     signature: Signature
+    docx_bytes: bytes | None = None
     saved_to_disk: bool = True
@@ -41,8 +52,8 @@ def crop_signatures(
     logger: logging.Logger | None = None,
     return_bytes: Literal[False] = False,
     save_files: bool = True,
-) -> list[Path]:
-    ...
+    docx: bool = False,
+) -> list[Path]: ...
 @overload
@@ -55,8 +66,8 @@ def crop_signatures(
     logger: logging.Logger | None = None,
     return_bytes: Literal[True],
     save_files: bool = True,
-) -> list[SignatureCrop]:
-    ...
+    docx: bool = False,
+) -> list[SignatureCrop]: ...
 def crop_signatures(
@@ -68,16 +79,19 @@ def crop_signatures(
     logger: logging.Logger | None = None,
     return_bytes: bool = False,
     save_files: bool = True,
+    docx: bool = False,
 ) -> list[Path] | list[SignatureCrop]:
-    """Render each signature bounding box to a PNG image using PyMuPDF.
+    """Render each signature bounding box to a PNG image and optionally wrap it in DOCX.
     Set ``return_bytes=True`` to collect in-memory PNG bytes for each crop while also writing
     the files to ``output_dir``. Set ``save_files=False`` to skip writing PNGs to disk.
+    When ``docx=True``, DOCX files are written instead of PNGs. When ``return_bytes`` is True
+    and ``docx=True``, ``SignatureCrop.docx_bytes`` will contain the DOCX payload.
     """
     if fitz is None:  # pragma: no cover - exercised when dependency absent
         raise SignatureCroppingUnavailable(
-            "PyMuPDF is required for PNG crops. Install 'pymupdf' or 'sigdetect[pymupdf]'."
+            "PyMuPDF is required for PNG crops. Install 'pymupdf' or add it to your environment."
         )
     if not save_files and not return_bytes:
         raise ValueError("At least one of save_files or return_bytes must be True")
@@ -89,6 +103,13 @@ def crop_signatures(
     generated_paths: list[Path] = []
     generated_crops: list[SignatureCrop] = []
+    docx_enabled = docx
+    docx_available = Document is not None
+    if docx_enabled and not docx_available:
+        raise SignatureDocxUnavailable(
+            "python-docx is required to generate DOCX outputs for signature crops."
+        )
     with fitz.open(pdf_path) as document:  # type: ignore[attr-defined]
         per_document_dir = output_dir / pdf_path.stem
         if save_files:
@@ -118,14 +139,15 @@ def crop_signatures(
                 continue
             filename = _build_filename(index, signature)
-            destination = per_document_dir / filename
+            png_destination = per_document_dir / filename
+            docx_destination = png_destination.with_suffix(".docx")
             try:
                 image_bytes: bytes | None = None
                 pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
-                if save_files:
-                    pixmap.save(destination)
-                if return_bytes:
+                if save_files and not docx_enabled:
+                    pixmap.save(png_destination)
+                if return_bytes or docx_enabled:
                     image_bytes = pixmap.tobytes("png")
             except Exception as exc:  # pragma: no cover - defensive
                 if logger:
@@ -140,17 +162,46 @@ def crop_signatures(
                     )
                 continue
+            docx_bytes: bytes | None = None
+            if docx_enabled:
+                if image_bytes is None:  # pragma: no cover - defensive
+                    continue
+                try:
+                    docx_bytes = _build_docx_bytes(image_bytes)
+                    if save_files:
+                        docx_destination.write_bytes(docx_bytes)
+                except SignatureDocxUnavailable as exc:
+                    if logger:
+                        logger.warning(
+                            "Signature DOCX output unavailable",
+                            extra={"error": str(exc)},
+                        )
+                    docx_available = False
+                except Exception as exc:  # pragma: no cover - defensive
+                    if logger:
+                        logger.warning(
+                            "Failed to write signature DOCX",
+                            extra={"file": pdf_path.name, "error": str(exc)},
+                        )
             if save_files:
-                signature.CropPath = str(destination)
-                generated_paths.append(destination)
+                if docx_enabled:
+                    signature.CropPath = None
+                    signature.CropDocxPath = str(docx_destination)
+                    generated_paths.append(docx_destination)
+                else:
+                    signature.CropDocxPath = None
+                    signature.CropPath = str(png_destination)
+                    generated_paths.append(png_destination)
             if return_bytes:
                 if image_bytes is None:  # pragma: no cover - defensive
                     continue
                 generated_crops.append(
                     SignatureCrop(
-                        path=destination,
+                        path=docx_destination if docx_enabled else png_destination,
                         image_bytes=image_bytes,
                         signature=signature,
+                        docx_bytes=docx_bytes,
                         saved_to_disk=save_files,
                     )
                 )
@@ -158,6 +209,18 @@ def crop_signatures(
     return generated_crops if return_bytes else generated_paths
+def _build_docx_bytes(image_bytes: bytes) -> bytes:
+    if Document is None:
+        raise SignatureDocxUnavailable(
+            "python-docx is required to generate DOCX outputs for signature crops."
+        )
+    document = Document()
+    document.add_picture(io.BytesIO(image_bytes))
+    buffer = io.BytesIO()
+    document.save(buffer)
+    return buffer.getvalue()
 def _to_clip_rect(page, bbox: tuple[float, float, float, float]):
     width = float(page.rect.width)
     height = float(page.rect.height)

sigdetect/detector/__init__.py CHANGED Viewed

@@ -22,10 +22,13 @@ ENGINE_REGISTRY: dict[str, Type[Detector]] = {
 ENGINE_REGISTRY.setdefault("pypdf", PyPDF2Detector)
 try:  # pragma: no cover - optional dependency
-    from .pymupdf_engine import PyMuPDFDetector  # type: ignore
+    from .pymupdf_engine import PyMuPDFDetector
+    from .pymupdf_engine import fitz as pymupdf_fitz  # type: ignore
-    if getattr(PyMuPDFDetector, "Name", None):
+    if pymupdf_fitz is not None and getattr(PyMuPDFDetector, "Name", None):
         ENGINE_REGISTRY[PyMuPDFDetector.Name] = PyMuPDFDetector
+    else:
+        PyMuPDFDetector = None  # type: ignore
 except Exception:
     PyMuPDFDetector = None  # type: ignore
@@ -33,17 +36,16 @@ except Exception:
 def BuildDetector(configuration: DetectConfiguration) -> Detector:
     """Instantiate the configured engine or raise a clear error."""
-    engine_name = (
-        getattr(configuration, "Engine", None)
-        or getattr(configuration, "engine", None)
-        or PyPDF2Detector.Name
-    )
+    # Force geometry-capable engine selection (auto prefers PyMuPDF when available).
+    engine_name = "auto"
     normalized = str(engine_name).lower()
     if normalized == "auto":
         detector_cls: Type[Detector] | None = None
         if PyMuPDFDetector is not None:
-            detector_cls = ENGINE_REGISTRY.get(getattr(PyMuPDFDetector, "Name", "")) or PyMuPDFDetector
+            detector_cls = (
+                ENGINE_REGISTRY.get(getattr(PyMuPDFDetector, "Name", "")) or PyMuPDFDetector
+            )
         if detector_cls is None:
             detector_cls = ENGINE_REGISTRY.get(PyPDF2Detector.Name) or ENGINE_REGISTRY.get("pypdf")
             warnings.warn(

sigdetect/detector/pymupdf_engine.py CHANGED Viewed

@@ -30,8 +30,8 @@ class PyMuPDFDetector(PyPDF2Detector):
     def __init__(self, configuration):
         if fitz is None:  # pragma: no cover - optional dependency
             raise ValueError(
-                "PyMuPDF engine requires the optional 'pymupdf' dependency. Install via 'pip install "
-                "sigdetect[pymupdf]' or add pymupdf to your environment."
+                "PyMuPDF engine requires the optional 'pymupdf' dependency. Install 'pymupdf' or add "
+                "it to your environment."
             )
         super().__init__(configuration)

sigdetect/detector/signature_model.py CHANGED Viewed

@@ -20,6 +20,9 @@ class Signature:
     RenderType: str = "typed"
     BoundingBox: tuple[float, float, float, float] | None = None
     CropPath: str | None = None
+    CropBytes: str | None = None
+    CropDocxPath: str | None = None
+    CropDocxBytes: str | None = None
     def to_dict(self) -> dict[str, Any]:
         """Return the legacy snake_case representation used in JSON payloads."""
@@ -35,4 +38,7 @@ class Signature:
             "render_type": self.RenderType,
             "bounding_box": list(self.BoundingBox) if self.BoundingBox else None,
             "crop_path": self.CropPath,
+            "crop_bytes": self.CropBytes,
+            "crop_docx_path": self.CropDocxPath,
+            "crop_docx_bytes": self.CropDocxBytes,
         }

sigdetect/wet_detection.py CHANGED Viewed

@@ -67,11 +67,7 @@ class OcrLine:
 def should_run_wet_pipeline(file_result: FileResult) -> bool:
     """Return ``True`` when the OCR pipeline should run for ``file_result``."""
-    return (
-        (not file_result.ElectronicSignatureFound or file_result.SignatureCount == 0)
-        or (bool(file_result.ScannedPdf) and not file_result.ElectronicSignatureFound)
-        or bool(file_result.MixedContent)
-    )
+    return not bool(file_result.ElectronicSignatureFound)
 def apply_wet_detection(
@@ -83,8 +79,6 @@ def apply_wet_detection(
 ) -> bool:
     """Augment ``file_result`` with OCR-detected wet signatures when possible."""
-    if not configuration.DetectWetSignatures:
-        return False
     if not should_run_wet_pipeline(file_result):
         return False
@@ -96,6 +90,8 @@ def apply_wet_detection(
             logger.warning("Wet detection unavailable", extra={"error": str(exc)})
         return False
+    original_esign = file_result.ElectronicSignatureFound
+    original_mixed = file_result.MixedContent
     try:
         added = _detect(pdf_path, configuration, file_result, logger=logger)
         if not added:
@@ -106,6 +102,9 @@ def apply_wet_detection(
         if logger:
             logger.warning("Wet detection failed", extra={"error": str(exc)})
         return False
+    finally:
+        file_result.ElectronicSignatureFound = original_esign
+        file_result.MixedContent = original_mixed
 def _detect(
@@ -138,6 +137,7 @@ def _detect(
                 )
             )
             candidates.extend(_image_candidates(page))
+            candidates = _filter_candidates_for_page(candidates)
             accepted = [
                 candidate
                 for candidate in candidates
@@ -157,7 +157,11 @@ def _detect(
         if not new_signatures:
             return False
-        file_result.Signatures.extend(new_signatures)
+        filtered_signatures = _dedupe_wet_signatures(new_signatures)
+        if not filtered_signatures:
+            return False
+        file_result.Signatures.extend(filtered_signatures)
         _refresh_metadata(file_result)
         return True
     finally:
@@ -277,6 +281,31 @@ def _build_candidates(
         )
+def _has_evidence(candidate: WetCandidate, token: str) -> bool:
+    return token in candidate.Evidence
+def _is_image_candidate(candidate: WetCandidate) -> bool:
+    return _has_evidence(candidate, "image_signature:true")
+def _has_stroke(candidate: WetCandidate) -> bool:
+    return _has_evidence(candidate, "stroke:yes")
+def _filter_candidates_for_page(candidates: Sequence[WetCandidate]) -> list[WetCandidate]:
+    if not candidates:
+        return []
+    has_image = any(_is_image_candidate(candidate) for candidate in candidates)
+    if not has_image:
+        return list(candidates)
+    return [
+        candidate
+        for candidate in candidates
+        if _is_image_candidate(candidate) or _has_stroke(candidate)
+    ]
 def _infer_role(normalized_text: str) -> str:
     for role, keywords in ROLE_KEYWORDS.items():
         if any(keyword in normalized_text for keyword in keywords):
@@ -379,7 +408,7 @@ def _image_candidates(page) -> list[WetCandidate]:
             continue
         if hasattr(rect, "x0"):
             x0, y0, x1, y1 = float(rect.x0), float(rect.y0), float(rect.x1), float(rect.y1)
-        elif isinstance(rect, (tuple, list)) and len(rect) == 4:
+        elif isinstance(rect, tuple | list) and len(rect) == 4:
             x0, y0, x1, y1 = map(float, rect)
         else:
             continue
@@ -422,7 +451,7 @@ def _infer_role_nearby(rect, words) -> str:
     proximity_x = 140.0
     if hasattr(rect, "x0"):
         rx0, ry0, rx1, ry1 = float(rect.x0), float(rect.y0), float(rect.x1), float(rect.y1)
-    elif isinstance(rect, (tuple, list)) and len(rect) == 4:
+    elif isinstance(rect, tuple | list) and len(rect) == 4:
         rx0, ry0, rx1, ry1 = map(float, rect)
     else:
         return "unknown"
@@ -471,6 +500,29 @@ def _to_signatures(
     return signatures
+def _signature_rank(signature: Signature) -> tuple[int, int, int]:
+    evidence = set(signature.Evidence or [])
+    if "image_signature:true" in evidence:
+        source_rank = 3
+    elif "stroke:yes" in evidence:
+        source_rank = 2
+    else:
+        source_rank = 1
+    return (source_rank, int(signature.Score or 0), int(signature.Page or 0))
+def _dedupe_wet_signatures(signatures: Sequence[Signature]) -> list[Signature]:
+    best_by_role: dict[str, Signature] = {}
+    for signature in signatures:
+        role = (signature.Role or "unknown").strip().lower()
+        if role == "unknown":
+            continue
+        existing = best_by_role.get(role)
+        if existing is None or _signature_rank(signature) > _signature_rank(existing):
+            best_by_role[role] = signature
+    return sorted(best_by_role.values(), key=lambda sig: (int(sig.Page or 0), sig.Role or ""))
 def _mark_manual_review(file_result: FileResult, reason: str) -> None:
     hints = _split_hints(file_result.Hints)
     hints.add(f"ManualReview:{reason}")
@@ -485,9 +537,7 @@ def _refresh_metadata(file_result: FileResult) -> None:
     if roles:
         file_result.Roles = ";".join(roles)
     file_result.ElectronicSignatureFound = file_result.SignatureCount > 0
-    file_result.MixedContent = (
-        file_result.ElectronicSignatureFound and bool(file_result.ScannedPdf)
-    )
+    file_result.MixedContent = file_result.ElectronicSignatureFound and bool(file_result.ScannedPdf)
     hints = _split_hints(file_result.Hints)
     hints |= {sig.Hint for sig in file_result.Signatures if sig.Hint}
     file_result.Hints = ";".join(sorted(hints))

{sigdetect-0.4.0.dist-info → sigdetect-0.5.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sigdetect
-Version: 0.4.0
+Version: 0.5.1
 Summary: Signature detection and role attribution for PDFs
 Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
 License: MIT
@@ -10,9 +10,11 @@ Requires-Dist: pypdf>=4.0.0
 Requires-Dist: rich>=13.0
 Requires-Dist: typer>=0.12
 Requires-Dist: pydantic>=2.5
+Requires-Dist: pillow>=10.0
+Requires-Dist: python-docx>=1.1.0
+Requires-Dist: pytesseract>=0.3.10
+Requires-Dist: pymupdf>=1.23
 Requires-Dist: pyyaml>=6.0
-Provides-Extra: pymupdf
-Requires-Dist: pymupdf>=1.23; extra == "pymupdf"
 # CaseWorks.Automation.CaseDocumentIntake
@@ -95,14 +97,16 @@ sigdetect detect \
 ### Notes
 - The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
-- `--engine` accepts **auto** (default; prefers PyMuPDF when installed, falls back to PyPDF2), **pypdf2**, or **pymupdf**.
+- Engine selection is forced to **auto** (prefers PyMuPDF for geometry, falls back to PyPDF2); any configured `engine` value is overridden.
 - `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
 - `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
 - `--profile` selects tuned role logic:
   - `hipaa` → patient / representative / attorney
   - `retainer` → client / firm (prefers detecting two signatures)
 - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
-- Cropping (`--crop-signatures`) and wet detection (`--detect-wet`) are enabled by default for single-pass runs; disable them if you want a light, e-sign-only pass. PyMuPDF is required for crops; PyMuPDF + Tesseract are required for wet detection.
+- Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
+- Cropping (`--crop-signatures`) writes PNG crops to disk by default; enable `--crop-docx` to write DOCX files instead of PNGs. `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` and, when `--crop-docx` is enabled, embeds DOCX bytes in `signatures[].crop_docx_bytes`. PyMuPDF is required for crops, and `python-docx` is required for DOCX output.
+- Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
 - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
 ### EDA (quick aggregate stats)
@@ -113,6 +117,8 @@ sigdetect eda \
 ~~~
+`sigdetect eda` expects `results.json`; enable `write_results: true` when running detect.
 ---
 ## Library usage
@@ -136,13 +142,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
 print(result.to_dict())
 ~~~
-`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
+`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image; when DOCX cropping is enabled, `crop_docx_path` points at the generated DOCX file. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
 ---
 ## Library API (embed in another script)
-Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping:
+Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping. Engine selection is forced to `auto` (PyMuPDF preferred) to ensure geometry. Wet detection runs automatically for non-e-sign PDFs; pass `runWetDetection=False` to skip OCR.
 ~~~python
 from pathlib import Path
@@ -165,6 +171,7 @@ result = DetectPdf(
     profileName="retainer",
     includePseudoSignatures=True,
     recurseXObjects=True,
+    # runWetDetection=False,  # disable OCR-backed wet detection if desired
 )
 print(
     result["file"],
@@ -187,7 +194,7 @@ for res in ScanDirectory(
     # store in DB, print, etc.
     pass
-# 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
+# 3) Crop signature snippets for FileResult objects (requires PyMuPDF; DOCX needs python-docx)
 detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
 file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
 CropSignatureImages(
@@ -226,7 +233,8 @@ High-level summary (per file):
       "hint": "AcroSig:sig_patient",
       "render_type": "typed",
       "bounding_box": [10.0, 10.0, 150.0, 40.0],
-      "crop_path": "signature_crops/example/sig_01_patient.png"
+      "crop_path": "signature_crops/example/sig_01_patient.png",
+      "crop_docx_path": null
     },
     {
       "page": null,
@@ -253,6 +261,9 @@ High-level summary (per file):
 - In the retainer profile, the emitter prefers two signatures (client + firm), often on the same page.
 - **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
 - **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
+- **`signatures[].crop_docx_path`** is populated when DOCX crops are generated (`--crop-docx` or `docx=True`).
+- **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
+- **`signatures[].crop_docx_bytes`** contains base64 DOCX data when `--crop-docx` and `--crop-bytes` are enabled together.
 ---
@@ -274,14 +285,16 @@ You can keep one config YAML per dataset, e.g.:
 # ./sample_data/config.yml (example)
 pdf_root: ./pdfs
 out_dir: ./sigdetect_out
-engine: pypdf2
+engine: auto
+write_results: false
 pseudo_signatures: true
 recurse_xobjects: true
 profile: retainer    # or: hipaa
 crop_signatures: false   # enable to write PNG crops (requires pymupdf)
+crop_docx: false         # enable to write DOCX crops instead of PNGs (requires python-docx)
 # crop_output_dir: ./signature_crops
 crop_image_dpi: 200
-detect_wet_signatures: false   # opt-in OCR wet detection (PyMuPDF + Tesseract)
+detect_wet_signatures: false   # kept for compatibility; non-e-sign PDFs still trigger OCR
 wet_ocr_dpi: 200
 wet_ocr_languages: eng
 wet_precision_threshold: 0.82
@@ -299,7 +312,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
   - Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
   - Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
   - When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
-- **Wet detection (opt-in):** With `detect_wet_signatures: true`, the CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
+- **Wet detection (non-e-sign):** The CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection whenever no e-sign evidence is found. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. When an image-based signature is present on a page, label-only OCR candidates are suppressed unless a stroke is detected. Results are deduplicated to the top-ranked signature per role (image-based signatures outrank stroke-backed ones, which outrank label-only ones; ties are broken by score, then page), and signatures whose role is `unknown` are dropped. Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
 ---

{sigdetect-0.4.0.dist-info → sigdetect-0.5.1.dist-info}/RECORD RENAMED Viewed

@@ -1,24 +1,24 @@
 sigdetect/__init__.py,sha256=YvnTwlC1jfq83EhQS_1JjiiHK7_wJCCU1JvHv5E1qWY,573
-sigdetect/api.py,sha256=qLCpbODLvw5AQMEAvpIP6kBNoc03h01ekjilg9tDxuw,9408
-sigdetect/cli.py,sha256=Zco3-r4MAlVEmyEatvPUOZLLamh5ELFrquAK6ovJVlw,9290
-sigdetect/config.py,sha256=-6GCUusdi0Ba-Rt6pwffB5MIz1ApPlBaXVKxpIppbKk,7678
-sigdetect/cropping.py,sha256=zwOXzkG8tt1ZPUaDhJMHfonFEZtVNZZmZOzYQ_4nUAI,6074
+sigdetect/api.py,sha256=hDfa6z4SoHth1Dw9HDfSPiytMQrqu_oyBZlXBwSh9g4,11010
+sigdetect/cli.py,sha256=X5GqZ-PK67vz4OHN5r7h-V0hO886ZblUiUdKDuFowtU,10930
+sigdetect/config.py,sha256=3SP1rkcWBGXloCDFomBJRMRKZOvXuHQbhIBqpVrzYmY,8365
+sigdetect/cropping.py,sha256=HfOJrV2Xv9Eo0lCIl3mukz49agKB6h2TML99B0qQJNc,8837
 sigdetect/eda.py,sha256=S92G1Gjmepri__D0n_V6foq0lQgH-RXI9anW8A58jfw,4681
 sigdetect/logging_setup.py,sha256=LMF8ao_a-JwH0S522T6aYTFX3e8Ajjv_5ODS2YiBcHA,6404
 sigdetect/utils.py,sha256=T9rubLf5T9JmjOHYMOba1j34fhOJaWocAXccnGTxRUE,5198
-sigdetect/wet_detection.py,sha256=6ciFxMQS3f1nF502w4KLTksoYmjdudzTekh7McfWiIg,16464
+sigdetect/wet_detection.py,sha256=zvi11XUmm_xLZ4BLvxInwMQg8YLcyQzEYAM9QSdJOIs,18259
 sigdetect/data/role_rules.retainer.yml,sha256=IFdwKnDBXR2cTkdfrsZ6ku6CXD8S_dg5A3vKRKLW5h8,2532
 sigdetect/data/role_rules.yml,sha256=HuLKsZR_A6sD9XvY4NHiY_VG3dS5ERNCBF9-Mxawomw,2751
 sigdetect/data/vendor_patterns.yml,sha256=NRbZNQxcx_GuL6n1jAphBn6MM6ChCpeWGCsjbRx-PEo,384
-sigdetect/detector/__init__.py,sha256=pUVFLwqj65cVO1qjsZy6NJ9BVY5xrJ6sQe-8LAb9O_A,2421
+sigdetect/detector/__init__.py,sha256=nT52mCI9s03Rso_RS86mm223rJfl5GlGDFsXwMJ3z3E,2548
 sigdetect/detector/base.py,sha256=L-iXWXqsTetDc4jRZo_wOdbNpKqOY20mX9FefrugdT0,263
 sigdetect/detector/base_detector.py,sha256=GmAgUWO_fQgIfnihZSoyhR3wpnwZ-X3hS0Kuyz4G6Ys,608
 sigdetect/detector/file_result_model.py,sha256=j2gTc9Sw3fJOHlexYsR_m5DiwHA8DzIzAMToESfvo4A,1767
-sigdetect/detector/pymupdf_engine.py,sha256=SGtJOStKFdfsdBrscoe5zg9u2KGJ_JTRYZ25adL_7Lw,17390
+sigdetect/detector/pymupdf_engine.py,sha256=N6oxvUa-48VvvhjbMk0R0kfScsggNKS7u5FLSeBRfWw,17358
 sigdetect/detector/pypdf2_engine.py,sha256=kB8cIp_gMvCla0LIBi9sd19g0361Oc9TjCW_ZViUBJQ,47410
-sigdetect/detector/signature_model.py,sha256=sdfQiOJzxnrg0WkGJxZCebA0wHqgzZnLI0gOv6ipSZA,1074
-sigdetect-0.4.0.dist-info/METADATA,sha256=WA7OjyLtM3AH7OtdFRmliqBw0ucNlywoD2bykytlnPA,12475
-sigdetect-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-sigdetect-0.4.0.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
-sigdetect-0.4.0.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
-sigdetect-0.4.0.dist-info/RECORD,,
+sigdetect/detector/signature_model.py,sha256=T2Hmfkfz_hZsDzwOhepxfNmkedxQp3_XHdrP8yGKoCk,1322
+sigdetect-0.5.1.dist-info/METADATA,sha256=_Jnyl9_A1yZUrKwWxUxVB-9rcMG3MdUqiN5WX_zlpqQ,14131
+sigdetect-0.5.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+sigdetect-0.5.1.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
+sigdetect-0.5.1.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
+sigdetect-0.5.1.dist-info/RECORD,,

{sigdetect-0.4.0.dist-info → sigdetect-0.5.1.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: setuptools (80.10.2)
 Root-Is-Purelib: true
 Tag: py3-none-any

{sigdetect-0.4.0.dist-info → sigdetect-0.5.1.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{sigdetect-0.4.0.dist-info → sigdetect-0.5.1.dist-info}/top_level.txt RENAMED Viewed

File without changes

sigdetect 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

sigdetect 0.4.0py3-none-any.whl → 0.5.1py3-none-any.whl