PyPI - sigdetect - Versions diffs - 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

sigdetect 0.1.0 py3-none-any.whl → 0.2.0 py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

sigdetect/api.py +136 -14
sigdetect/cli.py +154 -20
sigdetect/config.py +49 -9
sigdetect/cropping.py +123 -0
sigdetect/detector/pymupdf_engine.py +420 -0
sigdetect/detector/pypdf2_engine.py +46 -8
sigdetect/detector/signature_model.py +4 -0
{sigdetect-0.1.0.dist-info → sigdetect-0.2.0.dist-info}/METADATA +37 -6
{sigdetect-0.1.0.dist-info → sigdetect-0.2.0.dist-info}/RECORD +12 -11
{sigdetect-0.1.0.dist-info → sigdetect-0.2.0.dist-info}/WHEEL +0 -0
{sigdetect-0.1.0.dist-info → sigdetect-0.2.0.dist-info}/entry_points.txt +0 -0
{sigdetect-0.1.0.dist-info → sigdetect-0.2.0.dist-info}/top_level.txt +0 -0

sigdetect/api.py CHANGED Viewed

@@ -2,11 +2,12 @@
 from __future__ import annotations
+from contextlib import contextmanager
 from pathlib import Path
-from typing import Any, Iterable, Iterator, Literal
+from typing import Any, Generator, Iterable, Iterator, Literal
 from sigdetect.config import DetectConfiguration
-from sigdetect.detector import BuildDetector
+from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
 EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
 ProfileName = Literal["hipaa", "retainer"]
@@ -19,23 +20,44 @@ def DetectPdf(
     engineName: EngineName = "pypdf2",
     includePseudoSignatures: bool = True,
     recurseXObjects: bool = True,
+    detector: Detector | None = None,
 ) -> dict[str, Any]:
     """Detect signature evidence and assign roles for a single PDF."""
     resolvedPath = Path(pdfPath)
+    activeDetector = detector or get_detector(
+        pdfRoot=resolvedPath.parent,
+        profileName=profileName,
+        engineName=engineName,
+        includePseudoSignatures=includePseudoSignatures,
+        recurseXObjects=recurseXObjects,
+        outputDirectory=None,
+    )
+    result = activeDetector.Detect(resolvedPath)
+    return _ToPlainDictionary(result)
+def get_detector(
+    *,
+    pdfRoot: str | Path | None = None,
+    profileName: ProfileName = "hipaa",
+    engineName: EngineName = "pypdf2",
+    includePseudoSignatures: bool = True,
+    recurseXObjects: bool = True,
+    outputDirectory: str | Path | None = None,
+) -> Detector:
+    """Return a reusable detector instance configured with the supplied options."""
     configuration = DetectConfiguration(
-        PdfRoot=resolvedPath.parent,
-        OutputDirectory=None,
+        PdfRoot=Path(pdfRoot) if pdfRoot is not None else Path.cwd(),
+        OutputDirectory=Path(outputDirectory) if outputDirectory is not None else None,
         Engine=engineName,
         PseudoSignatures=includePseudoSignatures,
         RecurseXObjects=recurseXObjects,
         Profile=profileName,
     )
-    detector = BuildDetector(configuration)
-    result = detector.Detect(resolvedPath)
-    return _ToPlainDictionary(result)
+    return BuildDetector(configuration)
 def _ToPlainDictionary(candidate: Any) -> dict[str, Any]:
@@ -84,10 +106,17 @@ def _ToPlainValue(value: Any) -> Any:
 def DetectMany(
     pdfPaths: Iterable[str | Path],
+    *,
+    detector: Detector | None = None,
     **kwargs: Any,
 ) -> Iterator[dict[str, Any]]:
     """Yield :func:`DetectPdf` results for each path in ``pdfPaths``."""
+    if detector is not None:
+        for pdfPath in pdfPaths:
+            yield _DetectWithDetector(detector, pdfPath)
+        return
     for pdfPath in pdfPaths:
         yield DetectPdf(pdfPath, **kwargs)
@@ -96,19 +125,24 @@ def ScanDirectory(
     pdfRoot: str | Path,
     *,
     globPattern: str = "**/*.pdf",
+    detector: Detector | None = None,
     **kwargs: Any,
 ) -> Iterator[dict[str, Any]]:
     """Walk ``pdfRoot`` and yield detection output for every matching PDF."""
     rootDirectory = Path(pdfRoot)
-    iterator = (
-        rootDirectory.rglob(globPattern.replace("**/", "", 1))
-        if globPattern.startswith("**/")
-        else rootDirectory.glob(globPattern)
-    )
+    if globPattern == "**/*.pdf":
+        iterator = (path for path in rootDirectory.rglob("*") if path.is_file())
+    else:
+        iterator = (
+            rootDirectory.rglob(globPattern.replace("**/", "", 1))
+            if globPattern.startswith("**/")
+            else rootDirectory.glob(globPattern)
+        )
     for pdfPath in iterator:
         if pdfPath.is_file() and pdfPath.suffix.lower() == ".pdf":
-            yield DetectPdf(pdfPath, **kwargs)
+            yield DetectPdf(pdfPath, detector=detector, **kwargs)
 def ToCsvRow(result: dict[str, Any]) -> dict[str, Any]:
@@ -137,3 +171,91 @@ def Version() -> str:
         return resolveVersion("sigdetect")
     except Exception:
         return "0.0.0-dev"
+def _DetectWithDetector(detector: Detector, pdfPath: str | Path) -> dict[str, Any]:
+    """Helper that runs ``detector`` and returns the plain dictionary result."""
+    resolvedPath = Path(pdfPath)
+    return _ToPlainDictionary(detector.Detect(resolvedPath))
+@contextmanager
+def detector_context(**kwargs: Any) -> Generator[Detector, None, None]:
+    """Context manager wrapper around :func:`get_detector`."""
+    detector = get_detector(**kwargs)
+    try:
+        yield detector
+    finally:
+        pass
+def CropSignatureImages(
+    pdfPath: str | Path,
+    fileResult: FileResult | dict[str, Any],
+    *,
+    outputDirectory: str | Path,
+    dpi: int = 200,
+) -> list[Path]:
+    """Crop detected signature regions to PNG files.
+    Accepts either a :class:`FileResult` instance or the ``dict`` returned by
+    :func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.
+    """
+    from sigdetect.cropping import crop_signatures
+    file_result_obj, original_dict = _CoerceFileResult(fileResult)
+    paths = crop_signatures(
+        pdf_path=Path(pdfPath),
+        file_result=file_result_obj,
+        output_dir=Path(outputDirectory),
+        dpi=dpi,
+    )
+    if original_dict is not None:
+        original_dict.clear()
+        original_dict.update(file_result_obj.to_dict())
+    return paths
+def _CoerceFileResult(
+    candidate: FileResult | dict[str, Any]
+) -> tuple[FileResult, dict[str, Any] | None]:
+    if isinstance(candidate, FileResult):
+        return candidate, None
+    if not isinstance(candidate, dict):
+        raise TypeError("fileResult must be FileResult or dict")
+    signatures: list[Signature] = []
+    for entry in candidate.get("signatures") or []:
+        bbox = entry.get("bounding_box")
+        signatures.append(
+            Signature(
+                Page=entry.get("page"),
+                FieldName=str(entry.get("field_name") or ""),
+                Role=str(entry.get("role") or "unknown"),
+                Score=int(entry.get("score") or 0),
+                Scores=dict(entry.get("scores") or {}),
+                Evidence=list(entry.get("evidence") or []),
+                Hint=str(entry.get("hint") or ""),
+                RenderType=str(entry.get("render_type") or "unknown"),
+                BoundingBox=tuple(bbox) if bbox else None,
+                CropPath=entry.get("crop_path"),
+            )
+        )
+    file_result = FileResult(
+        File=str(candidate.get("file") or ""),
+        SizeKilobytes=candidate.get("size_kb"),
+        PageCount=int(candidate.get("pages") or 0),
+        ElectronicSignatureFound=bool(candidate.get("esign_found")),
+        ScannedPdf=candidate.get("scanned_pdf"),
+        MixedContent=candidate.get("mixed"),
+        SignatureCount=int(candidate.get("sig_count") or len(signatures)),
+        SignaturePages=str(candidate.get("sig_pages") or ""),
+        Roles=str(candidate.get("roles") or "unknown"),
+        Hints=str(candidate.get("hints") or ""),
+        Signatures=signatures,
+    )
+    return file_result, candidate

sigdetect/cli.py CHANGED Viewed

@@ -3,14 +3,16 @@
 from __future__ import annotations
 import json
+from collections.abc import Iterator
 from dataclasses import asdict, is_dataclass
 from pathlib import Path
 import typer
 from . import __version__
-from .config import LoadConfiguration
-from .detector import BuildDetector
+from .config import FinalizeConfiguration, LoadConfiguration
+from .cropping import SignatureCroppingUnavailable, crop_signatures
+from .detector import BuildDetector, FileResult
 from .eda import RunExploratoryAnalysis
 from .logging_setup import ConfigureLogging
@@ -31,18 +33,65 @@ def _JsonSerializer(candidate):
     return str(candidate)
+def _EnumeratePdfs(pdfRoot: Path, recursive: bool) -> Iterator[Path]:
+    """Yield PDF files under ``pdfRoot`` honoring the recursion flag."""
+    iterator = pdfRoot.rglob("*") if recursive else pdfRoot.glob("*")
+    for path in iterator:
+        if path.is_file() and path.suffix.lower() == ".pdf":
+            yield path
 @CliApplication.command(name="detect")
 def Detect(
     configurationPath: Path | None = typer.Option(
         None, "--config", "-c", help="Path to YAML config"
     ),
     profileOverride: str | None = typer.Option(None, "--profile", "-p", help="hipaa or retainer"),
+    recursive: bool = typer.Option(
+        True,
+        "--recursive/--no-recursive",
+        help="Recurse into subdirectories when gathering PDFs",
+    ),
+    cropSignatures: bool | None = typer.Option(
+        None,
+        "--crop-signatures/--no-crop-signatures",
+        help="Crop detected signature regions to PNG files (requires PyMuPDF)",
+        show_default=False,
+    ),
+    cropDirectory: Path | None = typer.Option(
+        None,
+        "--crop-dir",
+        help="Directory for signature PNG crops (defaults to out_dir/signature_crops)",
+    ),
+    cropDpi: int | None = typer.Option(
+        None,
+        "--crop-dpi",
+        min=72,
+        max=600,
+        help="Rendering DPI for signature crops",
+        show_default=False,
+    ),
 ) -> None:
     """Run detection for the configured directory and emit ``results.json``."""
     configuration = LoadConfiguration(configurationPath)
-    if profileOverride in {"hipaa", "retainer"}:
-        configuration = configuration.model_copy(update={"Profile": profileOverride})
+    if profileOverride is not None:
+        normalized_profile = profileOverride.lower()
+        if normalized_profile not in {"hipaa", "retainer"}:
+            raise typer.BadParameter("Profile must be 'hipaa' or 'retainer'.")
+        configuration = configuration.model_copy(update={"Profile": normalized_profile})
+    overrides: dict[str, object] = {}
+    if cropSignatures is not None:
+        overrides["CropSignatures"] = cropSignatures
+    if cropDirectory is not None:
+        overrides["CropOutputDirectory"] = cropDirectory
+    if cropDpi is not None:
+        overrides["CropImageDpi"] = cropDpi
+    if overrides:
+        configuration = configuration.model_copy(update=overrides)
+        configuration = FinalizeConfiguration(configuration)
     try:
         detector = BuildDetector(configuration)
@@ -54,26 +103,111 @@ def Detect(
         typer.echo(str(exc), err=True)
         raise typer.Exit(code=2) from exc
-    pdfFiles = list(configuration.PdfRoot.glob("*.pdf"))
-    if not pdfFiles:
-        raise SystemExit(f"No PDFs found in {configuration.PdfRoot}")
-    results = [detector.Detect(pdfPath) for pdfPath in pdfFiles]
+    pdfIterator = _EnumeratePdfs(configuration.PdfRoot, recursive)
+    try:
+        firstPdf = next(pdfIterator)
+    except StopIteration:
+        raise SystemExit(f"No PDFs found in {configuration.PdfRoot}") from None
+    results_buffer: list[FileResult] | None = [] if configuration.OutputDirectory is None else None
+    json_handle = None
+    json_path: Path | None = None
+    wrote_first = False
+    if configuration.OutputDirectory is not None:
+        outputDirectory = configuration.OutputDirectory
+        outputDirectory.mkdir(parents=True, exist_ok=True)
+        json_path = outputDirectory / "results.json"
+        json_handle = open(json_path, "w", encoding="utf-8")
+        json_handle.write("[")
+    crop_dir = configuration.CropOutputDirectory
+    cropping_enabled = configuration.CropSignatures
+    cropping_available = True
+    cropping_attempted = False
+    if configuration.CropSignatures and crop_dir is None:
+        Logger.warning(
+            "CropSignatures enabled without an output directory",
+            extra={"pdf_root": str(configuration.PdfRoot)},
+        )
+        cropping_enabled = False
+    total_bboxes = 0
+    def _append_result(file_result: FileResult, source_pdf: Path) -> None:
+        nonlocal wrote_first, json_handle, total_bboxes, cropping_available, cropping_attempted
+        if cropping_enabled and cropping_available and crop_dir is not None:
+            try:
+                crop_signatures(
+                    pdf_path=source_pdf,
+                    file_result=file_result,
+                    output_dir=crop_dir,
+                    dpi=configuration.CropImageDpi,
+                    logger=Logger,
+                )
+                cropping_attempted = True
+            except SignatureCroppingUnavailable as exc:
+                cropping_available = False
+                Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
+                typer.echo(str(exc), err=True)
+            except Exception as exc:  # pragma: no cover - defensive
+                Logger.warning(
+                    "Unexpected error while cropping signatures",
+                    extra={"error": str(exc)},
+                )
+        total_bboxes += sum(1 for sig in file_result.Signatures if sig.BoundingBox)
+        if results_buffer is not None:
+            results_buffer.append(file_result)
+            return
+        if json_handle is None:
+            return
+        serialized = json.dumps(
+            file_result,
+            indent=2,
+            ensure_ascii=False,
+            default=_JsonSerializer,
+        )
+        indented = "\n".join(f"  {line}" for line in serialized.splitlines())
+        if wrote_first:
+            json_handle.write(",\n")
+        else:
+            json_handle.write("\n")
+        json_handle.write(indented)
+        wrote_first = True
+    def _process(pdf_path: Path) -> None:
+        file_result = detector.Detect(pdf_path)
+        _append_result(file_result, pdf_path)
-    # Allow configuration to suppress file output entirely (out_dir: none / SIGDETECT_OUT_DIR=none)
-    if configuration.OutputDirectory is None:
-        payload = json.dumps(results, indent=2, ensure_ascii=False, default=_JsonSerializer)
+    try:
+        _process(firstPdf)
+        for pdf_path in pdfIterator:
+            _process(pdf_path)
+    finally:
+        if json_handle is not None:
+            closing = "\n]\n" if wrote_first else "]\n"
+            json_handle.write(closing)
+            json_handle.close()
+    if json_handle is not None:
+        typer.echo(f"Wrote {json_path}")
+    else:
+        payload = json.dumps(
+            results_buffer or [], indent=2, ensure_ascii=False, default=_JsonSerializer
+        )
         typer.echo(payload)
         typer.echo("Detection completed with output disabled (out_dir=none)")
-        return
-    outputDirectory = configuration.OutputDirectory
-    outputDirectory.mkdir(parents=True, exist_ok=True)
-    with open(outputDirectory / "results.json", "w", encoding="utf-8") as handle:
-        json.dump(results, handle, indent=2, ensure_ascii=False, default=_JsonSerializer)
-    typer.echo(f"Wrote {outputDirectory / 'results.json'}")
+    if cropping_enabled and cropping_available and cropping_attempted and total_bboxes == 0:
+        Logger.warning(
+            "No signature bounding boxes detected; try --engine pymupdf for crop-ready output",
+            extra={"engine": configuration.Engine},
+        )
 @CliApplication.command(name="eda")

sigdetect/config.py CHANGED Viewed

@@ -3,6 +3,7 @@
 from __future__ import annotations
 import os
+from contextlib import suppress
 from pathlib import Path
 from typing import Literal
@@ -26,11 +27,13 @@ class DetectConfiguration(BaseModel):
     OutputDirectory: Path | None = Field(default=Path("out"), alias="out_dir")
     Engine: EngineName = Field(default="pypdf2", alias="engine")
     Profile: ProfileName = Field(default="hipaa", alias="profile")
-    MaxWorkers: int = Field(default=8, alias="max_workers", ge=1, le=64)
     PseudoSignatures: bool = Field(default=True, alias="pseudo_signatures")
     RecurseXObjects: bool = Field(default=True, alias="recurse_xobjects")
+    CropSignatures: bool = Field(default=False, alias="crop_signatures")
+    CropOutputDirectory: Path | None = Field(default=None, alias="crop_output_dir")
+    CropImageDpi: int = Field(default=200, alias="crop_image_dpi", ge=72, le=600)
-    @field_validator("PdfRoot", "OutputDirectory", mode="before")
+    @field_validator("PdfRoot", "OutputDirectory", "CropOutputDirectory", mode="before")
     @classmethod
     def _CoercePath(cls, value: str | Path | None) -> Path | None:
         """Allow configuration values to be provided as ``str`` or ``Path``.
@@ -42,8 +45,8 @@ class DetectConfiguration(BaseModel):
         if value is None:
             return None
         if isinstance(value, Path):
-            return value
-        return Path(value)
+            return value.expanduser()
+        return Path(value).expanduser()
     # Expose legacy snake_case property names for gradual migration
     @property
@@ -62,10 +65,6 @@ class DetectConfiguration(BaseModel):
     def profile(self) -> ProfileName:  # pragma: no cover - simple passthrough
         return self.Profile
-    @property
-    def max_workers(self) -> int:  # pragma: no cover - simple passthrough
-        return self.MaxWorkers
     @property
     def pseudo_signatures(self) -> bool:  # pragma: no cover - simple passthrough
         return self.PseudoSignatures
@@ -74,6 +73,18 @@ class DetectConfiguration(BaseModel):
     def recurse_xobjects(self) -> bool:  # pragma: no cover - simple passthrough
         return self.RecurseXObjects
+    @property
+    def crop_signatures(self) -> bool:  # pragma: no cover - simple passthrough
+        return self.CropSignatures
+    @property
+    def crop_output_dir(self) -> Path | None:  # pragma: no cover - simple passthrough
+        return self.CropOutputDirectory
+    @property
+    def crop_image_dpi(self) -> int:  # pragma: no cover - simple passthrough
+        return self.CropImageDpi
 def LoadConfiguration(path: Path | None) -> DetectConfiguration:
     """Load configuration from ``path`` while applying environment overrides.
@@ -94,6 +105,9 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
     env_pdf_root = os.getenv("SIGDETECT_PDF_ROOT")
     env_out_dir = os.getenv("SIGDETECT_OUT_DIR")
     env_profile = os.getenv("SIGDETECT_PROFILE")
+    env_crop = os.getenv("SIGDETECT_CROP_SIGNATURES")
+    env_crop_dir = os.getenv("SIGDETECT_CROP_DIR")
+    env_crop_dpi = os.getenv("SIGDETECT_CROP_DPI")
     raw_data: dict[str, object] = {}
     if path and Path(path).exists():
@@ -108,10 +122,36 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
         raw_data["out_dir"] = None if env_out_dir.lower() == "none" else env_out_dir
     if env_profile in {"hipaa", "retainer"}:
         raw_data["profile"] = env_profile
+    if env_crop is not None:
+        lowered = env_crop.lower()
+        if lowered in {"1", "true", "yes", "on"}:
+            raw_data["crop_signatures"] = True
+        elif lowered in {"0", "false", "no", "off"}:
+            raw_data["crop_signatures"] = False
+    if env_crop_dir:
+        raw_data["crop_output_dir"] = env_crop_dir
+    if env_crop_dpi:
+        with suppress(ValueError):
+            raw_data["crop_image_dpi"] = int(env_crop_dpi)
     configuration = DetectConfiguration(**raw_data)
+    return FinalizeConfiguration(configuration)
+def FinalizeConfiguration(configuration: DetectConfiguration) -> DetectConfiguration:
+    """Ensure derived directories exist and defaults are populated."""
+    updates: dict[str, object] = {}
     if configuration.OutputDirectory is not None:
         configuration.OutputDirectory.mkdir(parents=True, exist_ok=True)
-    return configuration
+    if configuration.CropSignatures:
+        crop_dir = configuration.CropOutputDirectory
+        if crop_dir is None:
+            base_dir = configuration.OutputDirectory or configuration.PdfRoot
+            crop_dir = base_dir / "signature_crops"
+        crop_dir.mkdir(parents=True, exist_ok=True)
+        updates["CropOutputDirectory"] = crop_dir
+    return configuration if not updates else configuration.model_copy(update=updates)

sigdetect/cropping.py ADDED Viewed

@@ -0,0 +1,123 @@
+"""Helpers for converting signature bounding boxes into PNG crops."""
+from __future__ import annotations
+import logging
+import re
+from pathlib import Path
+from .detector.file_result_model import FileResult
+from .detector.signature_model import Signature
+try:  # pragma: no cover - optional dependency
+    import fitz  # type: ignore
+except Exception:  # pragma: no cover - optional dependency
+    fitz = None  # type: ignore[misc]
+class SignatureCroppingUnavailable(RuntimeError):
+    """Raised when PNG cropping cannot be performed (e.g., PyMuPDF missing)."""
+def crop_signatures(
+    pdf_path: Path,
+    file_result: FileResult,
+    *,
+    output_dir: Path,
+    dpi: int = 200,
+    logger: logging.Logger | None = None,
+) -> list[Path]:
+    """Render each signature bounding box to a PNG image using PyMuPDF."""
+    if fitz is None:  # pragma: no cover - exercised when dependency absent
+        raise SignatureCroppingUnavailable(
+            "PyMuPDF is required for PNG crops. Install 'pymupdf' or 'sigdetect[pymupdf]'."
+        )
+    pdf_path = Path(pdf_path)
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    generated: list[Path] = []
+    with fitz.open(pdf_path) as document:  # type: ignore[attr-defined]
+        per_document_dir = output_dir / pdf_path.stem
+        per_document_dir.mkdir(parents=True, exist_ok=True)
+        scale = dpi / 72.0
+        matrix = fitz.Matrix(scale, scale)
+        for index, signature in enumerate(file_result.Signatures, start=1):
+            if not signature.BoundingBox or not signature.Page:
+                continue
+            try:
+                page = document.load_page(signature.Page - 1)
+            except Exception as exc:  # pragma: no cover - defensive
+                if logger:
+                    logger.warning(
+                        "Failed to load page for signature crop",
+                        extra={
+                            "file": pdf_path.name,
+                            "page": signature.Page,
+                            "error": str(exc),
+                        },
+                    )
+                continue
+            clip = _to_clip_rect(page, signature.BoundingBox)
+            if clip is None:
+                continue
+            filename = _build_filename(index, signature)
+            destination = per_document_dir / filename
+            try:
+                pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
+                pixmap.save(destination)
+            except Exception as exc:  # pragma: no cover - defensive
+                if logger:
+                    logger.warning(
+                        "Failed to render signature crop",
+                        extra={
+                            "file": pdf_path.name,
+                            "page": signature.Page,
+                            "field": signature.FieldName,
+                            "error": str(exc),
+                        },
+                    )
+                continue
+            signature.CropPath = str(destination)
+            generated.append(destination)
+    return generated
+def _to_clip_rect(page, bbox: tuple[float, float, float, float]):
+    width = float(page.rect.width)
+    height = float(page.rect.height)
+    x0, y0, x1, y1 = bbox
+    left = _clamp(min(x0, x1), 0.0, width)
+    right = _clamp(max(x0, x1), 0.0, width)
+    top = _clamp(height - max(y0, y1), 0.0, height)
+    bottom = _clamp(height - min(y0, y1), 0.0, height)
+    if right - left <= 0 or bottom - top <= 0:
+        return None
+    return fitz.Rect(left, top, right, bottom)
+def _clamp(value: float, lower: float, upper: float) -> float:
+    return max(lower, min(value, upper))
+def _build_filename(index: int, signature: Signature) -> str:
+    base = signature.Role or signature.FieldName or "signature"
+    slug = _slugify(base)
+    return f"sig_{index:02d}_{slug}.png"
+def _slugify(value: str) -> str:
+    cleaned = re.sub(r"[^A-Za-z0-9_-]+", "_", value.strip().lower())
+    cleaned = cleaned.strip("_")
+    return cleaned or "signature"

sigdetect/detector/pymupdf_engine.py ADDED Viewed

@@ -0,0 +1,420 @@
+"""PyMuPDF-backed detector that augments PyPDF2 heuristics with geometry."""
+from __future__ import annotations
+from pathlib import Path
+from typing import Iterable, cast
+from .pypdf2_engine import PyPDF2Detector
+from .signature_model import Signature
+try:  # pragma: no cover - optional dependency
+    import fitz  # type: ignore
+except Exception:  # pragma: no cover - optional dependency
+    fitz = None  # type: ignore[misc]
+class PyMuPDFDetector(PyPDF2Detector):
+    """Detector that reuses PyPDF2 heuristics and annotates results via PyMuPDF."""
+    # Presumably the engine name used to select this detector — confirm against the factory.
+    Name = "pymupdf"
+    # NOTE(review): no method of this class, as shown here, reads SIGNATURE_PADDING — confirm
+    # whether it is still used elsewhere before relying on it.
+    SIGNATURE_PADDING = 64.0
+    # Lowercase substrings, keyed by signer role, that _FindRoleLineRect looks for in the
+    # lowercased text of each page line.
+    ROLE_KEYWORDS: dict[str, tuple[str, ...]] = {
+        "client": ("client", "consumer", "claimant"),
+        "firm": ("firm", "attorney", "attorneys", "counsel", "company", "llp", "llc", "law", "by:"),
+        "patient": ("patient", "self", "plaintiff"),
+        "representative": ("representative", "guardian", "parent"),
+        "attorney": ("attorney", "counsel", "lawyer"),
+    }
+    def __init__(self, configuration):
+        """Build the detector, refusing to start when PyMuPDF could not be imported.
+
+        Raises:
+            ValueError: if the module-level ``fitz`` import failed.
+        """
+        pymupdf_missing = fitz is None
+        if pymupdf_missing:  # pragma: no cover - optional dependency
+            message = (
+                "PyMuPDF engine requires the optional 'pymupdf' dependency. Install via 'pip install "
+                "sigdetect[pymupdf]' or add pymupdf to your environment."
+            )
+            raise ValueError(message)
+        super().__init__(configuration)
+    def Detect(self, pdf_path: Path):  # type: ignore[override]
+        """Run the inherited detection, then attach bounding boxes via PyMuPDF.
+
+        When PyMuPDF cannot open the file, the inherited result is returned
+        unchanged.
+        """
+        file_result = super().Detect(pdf_path)
+        try:
+            pdf_document = fitz.open(str(pdf_path))
+        except Exception:  # pragma: no cover - defensive
+            return file_result
+        with pdf_document:
+            widget_rects = self._CollectWidgetRects(pdf_document)
+            signatures = file_result.Signatures
+            # Widget rectangles first; text-layout inference fills remaining pseudo-signatures.
+            self._ApplyWidgetRects(signatures, widget_rects)
+            self._InferPseudoRects(signatures, pdf_document)
+        return file_result
+    # ───────────────────────────────── widget helpers ─────────────────────────────────
+    def _CollectWidgetRects(
+        self, document
+    ) -> dict[tuple[int, str], tuple[float, float, float, float]]:
+        """Index signature-widget rectangles by ``(1-based page number, field name)``.
+
+        Only widgets whose ``field_type`` equals PyMuPDF's signature widget type
+        and whose stripped field name is non-empty are recorded.  Rectangles are
+        converted with ``_RectToPdfTuple`` using the page height; a later widget
+        with the same key replaces an earlier one.
+        """
+        rects_by_key: dict[tuple[int, str], tuple[float, float, float, float]] = {}
+        for page_number in range(1, document.page_count + 1):
+            page = document.load_page(page_number - 1)
+            page_widgets = page.widgets() if hasattr(page, "widgets") else None
+            if not page_widgets:
+                continue
+            for widget in page_widgets:
+                field_name = (widget.field_name or "").strip()
+                if not field_name:
+                    continue
+                # Every non-signature widget is ignored; 6 is the fallback used when the
+                # fitz module does not expose the constant.
+                signature_types = {getattr(fitz, "PDF_WIDGET_TYPE_SIGNATURE", 6)}
+                if getattr(widget, "field_type", None) not in signature_types:
+                    continue
+                rects_by_key[(page_number, field_name)] = self._RectToPdfTuple(
+                    widget.rect, page.rect.height
+                )
+        return rects_by_key
+    def _ApplyWidgetRects(
+        self,
+        signatures: Iterable[Signature],
+        widget_map: dict[tuple[int, str], tuple[float, float, float, float]],
+    ) -> None:
+        """Copy widget rectangles onto signatures that still lack a bounding box.
+
+        A signature is looked up by ``(Page, stripped FieldName)``.  Entries that
+        already carry a box, or have no field name or page, are left untouched.
+        """
+        for entry in signatures:
+            needs_box = not entry.BoundingBox and entry.FieldName and entry.Page
+            if not needs_box:
+                continue
+            found = widget_map.get((entry.Page, entry.FieldName.strip()))
+            if found:
+                entry.BoundingBox = found
+    # ───────────────────────────── pseudo bbox inference ─────────────────────────────
+    def _InferPseudoRects(self, signatures: Iterable[Signature], document) -> None:
+        """Estimate bounding boxes for pseudo-signatures from the page text layout.
+
+        Only signatures without a box whose field name is
+        ``"vendor_or_acro_detected"`` are processed.  A signature with a page
+        number is searched on that page only; otherwise pages are tried from last
+        to first.  The first page yielding a rectangle (role-specific search, then
+        the generic fallback) supplies the box and, when ``Page`` was ``None``, the
+        page number.
+        """
+        total_pages = document.page_count
+        for signature in signatures:
+            if signature.BoundingBox:
+                continue
+            if signature.FieldName != "vendor_or_acro_detected":
+                continue
+            known_page = signature.Page
+            if known_page and known_page - 1 >= total_pages:
+                continue
+            if known_page:
+                page_indexes = [known_page - 1]
+            else:
+                page_indexes = list(reversed(range(total_pages)))
+            for page_index in page_indexes:
+                if not 0 <= page_index < total_pages:
+                    continue
+                page = document.load_page(page_index)
+                lines = self._ExtractLines(page)
+                located = self._FindRoleLineRect(page, signature.Role, lines)
+                if located is None:
+                    located = self._FallbackSignatureRect(page, signature.Role, lines)
+                if located is None:
+                    continue
+                rect, exclusion, mode = located
+                padded = self._PadRect(rect, page.rect, signature.Role, exclusion, mode)
+                signature.BoundingBox = self._RectToPdfTuple(padded, page.rect.height)
+                if signature.Page is None:
+                    signature.Page = page_index + 1
+                break
+    def _FindRoleLineRect(
+        self,
+        page,
+        role: str,
+        lines: list[dict[str, float | str]] | None = None,
+    ) -> tuple[fitz.Rect, float | None, str] | None:
+        """Find the text line labelling ``role``'s signature and derive a rectangle from it.
+
+        Returns ``(rect, exclusion_y, mode)`` or ``None`` when no line matches.  ``mode`` is
+        ``"stroke"`` when ``_LocateStrokeLine`` found a line near the label, ``"image"`` when
+        ``_LocateSignatureImage`` found an image, and ``"label"`` when the label's own
+        rectangle is returned.  ``lines`` defaults to ``_ExtractLines(page)``.
+        """
+        if lines is None:
+            lines = self._ExtractLines(page)
+        page_height = float(page.rect.height)
+        # Roles without an entry get no keyword filter (empty tuple).
+        keywords = self.ROLE_KEYWORDS.get(role, ())
+        lower_roles = {"client", "firm", "representative", "attorney"}
+        # min_y: lines whose y0 is smaller than this are ignored.  The retainer profile uses
+        # its own fractions for client/firm; otherwise only ``lower_roles`` get a threshold.
+        if self.Profile == "retainer" and role in {"client", "firm"}:
+            min_factor = 0.15 if role == "client" else 0.4
+            min_y = page_height * min_factor
+        else:
+            min_y = page_height * (0.58 if role == "firm" else 0.5) if role in lower_roles else 0.0
+        def match_lines(require_signature: bool) -> list[tuple[int, dict[str, float | str]]]:
+            # Collect (index, line) pairs passing the position, "sign" and keyword filters.
+            selected: list[tuple[int, dict[str, float | str]]] = []
+            for idx, line in enumerate(lines):
+                lower = line["lower_text"]
+                if lower.strip() == "":
+                    continue
+                if line["y0"] < min_y:
+                    continue
+                if require_signature and "sign" not in lower:
+                    continue
+                # In the relaxed pass, lines without "sign" are still rejected when they
+                # mention "name" or "print".
+                if not require_signature and "sign" not in lower:
+                    if "name" in lower or "print" in lower:
+                        continue
+                if keywords and not any(keyword in lower for keyword in keywords):
+                    continue
+                selected.append((idx, line))
+            return selected
+        # Strict pass first: the line must contain "sign".
+        matches = match_lines(require_signature=True)
+        # Discard the strict result when its last match has y0 below 60% of the page height.
+        if matches and matches[-1][1]["y0"] < page_height * 0.6:
+            matches = []
+        if not matches:
+            matches = match_lines(require_signature=False)
+        if matches:
+            # The last match in list order is used as the label.
+            idx, target = matches[-1]
+            label_rect = fitz.Rect(target["x0"], target["y0"], target["x1"], target["y1"])
+            # Preference order: stroke line, then nearby image, then the label itself.
+            stroke = self._LocateStrokeLine(lines, idx, label_rect)
+            if stroke is not None:
+                rect, exclusion = stroke
+                return rect, exclusion, "stroke"
+            image = self._LocateSignatureImage(page, label_rect)
+            if image is not None:
+                exclusion = self._NextExclusionY(lines, idx + 1, image.y1)
+                return image, exclusion, "image"
+            exclusion = self._NextExclusionY(lines, idx + 1, label_rect.y1)
+            return label_rect, exclusion, "label"
+        return None
+    def _FallbackSignatureRect(
+        self,
+        page,
+        role: str | None = None,
+        lines: list[dict[str, float | str]] | None = None,
+    ) -> tuple[fitz.Rect, float | None, str] | None:
+        """Pick a generic label rectangle when the role-specific search found nothing.
+
+        Lines are scanned from the end of the list for text containing "sign";
+        failing that, the last line is used with no exclusion boundary.  ``None``
+        is returned only when there are no lines.  ``role`` is accepted but not
+        consulted.
+        """
+        if lines is None:
+            lines = self._ExtractLines(page)
+        if not lines:
+            return None
+        for position, entry in reversed(list(enumerate(lines))):
+            text = entry["lower_text"]
+            if "signature" in text or "sign" in text:
+                label = fitz.Rect(entry["x0"], entry["y0"], entry["x1"], entry["y1"])
+                boundary = self._NextExclusionY(lines, position + 1, label.y1)
+                return label, boundary, "label"
+        last = lines[-1]
+        return fitz.Rect(last["x0"], last["y0"], last["x1"], last["y1"]), None, "label"
+    def _ExtractLines(self, page) -> list[dict[str, float | str]]:
+        """Group the page's words into text lines with bounding extents.
+
+        Words from ``page.get_text("words")`` are bucketed by their
+        ``(block, line)`` numbers; whitespace-only words are ignored.  Each result
+        holds the space-joined ``text``, its ``lower_text`` and the union of the
+        word boxes as ``x0``/``y0``/``x1``/``y1``.  Results are ordered by
+        ``(y0, x0)``.
+        """
+        words = page.get_text("words") or []
+        grouped: dict[tuple[int, int], dict[str, object]] = {}
+        for wx0, wy0, wx1, wy1, word, block_no, line_no, *_ in words:
+            if not word.strip():
+                continue
+            key = (int(block_no), int(line_no))
+            left, top, right, bottom = float(wx0), float(wy0), float(wx1), float(wy1)
+            entry = grouped.get(key)
+            if entry is None:
+                # The first word of a line seeds that line's extents.
+                grouped[key] = {
+                    "tokens": [word],
+                    "x0": left,
+                    "y0": top,
+                    "x1": right,
+                    "y1": bottom,
+                }
+                continue
+            entry["tokens"].append(word)
+            entry["x0"] = min(float(entry["x0"]), left)
+            entry["y0"] = min(float(entry["y0"]), top)
+            entry["x1"] = max(float(entry["x1"]), right)
+            entry["y1"] = max(float(entry["y1"]), bottom)
+        collected: list[dict[str, float | str]] = []
+        for entry in grouped.values():
+            joined = " ".join(entry["tokens"]).strip()
+            if joined:
+                collected.append(
+                    {
+                        "text": joined,
+                        "lower_text": joined.lower(),
+                        "x0": float(entry["x0"]),
+                        "y0": float(entry["y0"]),
+                        "x1": float(entry["x1"]),
+                        "y1": float(entry["y1"]),
+                    }
+                )
+        return sorted(collected, key=lambda item: (item["y0"], item["x0"]))
+    def _LocateStrokeLine(
+        self,
+        lines: list[dict[str, float | str]],
+        label_index: int,
+        label_rect: fitz.Rect,
+    ) -> tuple[fitz.Rect, float | None] | None:
+        """Search the lines just before a label for a drawn signature rule.
+
+        Up to three entries preceding ``label_index`` are examined, nearest first.
+        A line qualifies when its text contains ``_`` or starts with ``x`` and it
+        overlaps the label horizontally.  Returns that line's rectangle paired
+        with the label's ``y0``, or ``None`` when nothing qualifies.
+        """
+        nearest = label_index - 1
+        stop_before = max(label_index - 4, -1)
+        for position in range(nearest, stop_before, -1):
+            entry = lines[position]
+            text = entry["lower_text"]
+            looks_like_rule = "_" in text or text.strip().startswith("x")
+            if not looks_like_rule:
+                continue
+            rect = fitz.Rect(entry["x0"], entry["y0"], entry["x1"], entry["y1"])
+            shared_width = min(rect.x1, label_rect.x1) - max(rect.x0, label_rect.x0)
+            if shared_width <= 0:
+                continue
+            # The label's top edge is handed back as the exclusion boundary; _PadRect uses
+            # that value to cap the bottom edge of the crop.
+            return rect, label_rect.y0
+        return None
+    def _LocateSignatureImage(self, page, label_rect: fitz.Rect) -> fitz.Rect | None:
+        """Return the bounding box of the page image that best matches a label.
+
+        Images are kept only when they are 40-380 wide and 12-220 high, overlap
+        the horizontal window from 40 left of the label to 220 right of it, and
+        have a vertical centre within 220 of the label's ``y0``.  Among those, the
+        one with the smallest sum of vertical gap and horizontal centre distance
+        wins (the earliest on ties).  ``None`` means no image qualified.
+        """
+        label_center_x = (label_rect.x0 + label_rect.x1) / 2.0
+        window_left = label_rect.x0 - 40.0
+        window_right = label_rect.x1 + 220.0
+        best_score = None
+        best_box = None
+        for image in page.get_images(full=True):
+            box = page.get_image_bbox(image)
+            if box is None:
+                continue
+            box_width = float(box.width)
+            box_height = float(box.height)
+            too_small = box_width < 40.0 or box_height < 12.0
+            too_large = box_width > 380.0 or box_height > 220.0
+            if too_small or too_large:
+                continue
+            # The image must sit near the label both horizontally and vertically.
+            if min(box.x1, window_right) - max(box.x0, window_left) <= 0:
+                continue
+            vertical_gap = abs(((box.y0 + box.y1) / 2.0) - label_rect.y0)
+            if vertical_gap > 220.0:
+                continue
+            score = vertical_gap + abs(((box.x0 + box.x1) / 2.0) - label_center_x)
+            if best_score is None or score < best_score:
+                best_score, best_box = score, box
+        return best_box
+    def _NextExclusionY(
+        self,
+        lines: list[dict[str, float | str]],
+        start_index: int,
+        minimum_y: float | None = None,
+    ) -> float | None:
+        """Return the ``y0`` of the next name/print/date/"by:" line after a label.
+
+        Args:
+            lines: text lines carrying ``y0`` and ``lower_text`` entries.
+            start_index: index of the first line to consider.
+            minimum_y: lines whose ``y0`` is not more than 1.0 beyond this value
+                are skipped; ``None`` disables the filter.
+
+        Returns:
+            The ``y0`` of the first qualifying line, or ``None`` if there is none.
+        """
+        # Test against None explicitly: ``minimum_y or ...`` would treat a legitimate
+        # 0.0 as "no minimum" and let lines at the reference position through.
+        threshold = -float("inf") if minimum_y is None else minimum_y + 1.0
+        markers = ("name", "print", "date", "by:")
+        for line in lines[start_index:]:
+            top = float(line["y0"])
+            if top <= threshold:
+                continue
+            text = line["lower_text"]
+            if any(marker in text for marker in markers):
+                return top
+        return None
+    def _RectToPdfTuple(self, rect, page_height: float) -> tuple[float, float, float, float]:
+        """Flip ``rect`` vertically against ``page_height`` and return ``(x0, y0, x1, y1)``.
+
+        Each output axis is ordered so that the first value is not greater than
+        the second.
+        """
+        left, right = float(rect.x0), float(rect.x1)
+        # After the flip the rect's y1 yields the smaller y value and y0 the larger one.
+        lower, upper = page_height - float(rect.y1), page_height - float(rect.y0)
+        horizontal = (right, left) if right < left else (left, right)
+        vertical = (upper, lower) if upper < lower else (lower, upper)
+        return (horizontal[0], vertical[0], horizontal[1], vertical[1])
+    def _PadRect(
+        self,
+        rect,
+        page_rect,
+        role: str | None = None,
+        exclusion_y0: float | None = None,
+        mode: str = "label",
+    ):
+        """Return a region focused on the expected signature line beneath ``rect``.
+
+        ``mode`` selects how ``rect`` is interpreted: ``"stroke"`` and ``"image"`` build the
+        region around ``rect`` itself, while any other value (``"label"``) places the region
+        past ``rect.y1``.  ``role`` adjusts the minimum height and, for the retainer profile,
+        the horizontal padding.  ``exclusion_y0``, when given, caps the region's bottom edge
+        4 units before that coordinate.  The result is a ``fitz.Rect`` at most 198 wide and
+        72 high.
+        """
+        max_width = 198.0  # 2.75 inches
+        max_height = 72.0  # 1 inch
+        # ── horizontal extent ──
+        # pad_x is only used by the label branch below.
+        pad_x = max(12.0, float(rect.width) * 0.08)
+        if mode == "stroke":
+            left = max(page_rect.x0, rect.x0 - 8.0)
+            right = min(page_rect.x1, rect.x1 + 8.0)
+        elif mode == "image":
+            left = max(page_rect.x0, rect.x0 - 10.0)
+            right = min(page_rect.x1, rect.x1 + 10.0)
+        else:
+            left = max(page_rect.x0, rect.x0 - pad_x)
+            right = min(page_rect.x1, rect.x1 + pad_x)
+        # Retainer client/firm crops override the padding chosen above (image/label only).
+        if self.Profile == "retainer" and role == "client" and mode in {"image", "label"}:
+            left = max(page_rect.x0, rect.x0 - 12.0)
+            right = min(page_rect.x1, rect.x1 + 16.0)
+        elif self.Profile == "retainer" and role == "firm" and mode in {"image", "label"}:
+            left = max(page_rect.x0, rect.x0 - 14.0)
+            right = min(page_rect.x1, rect.x1 + 18.0)
+        # Enforce max_width: stroke crops keep their left edge; other modes are re-centred
+        # and then shifted back inside the page.
+        if right - left > max_width:
+            if mode == "stroke":
+                right = min(page_rect.x1, left + max_width)
+            else:
+                center = (left + right) / 2.0
+                half = max_width / 2.0
+                left = center - half
+                right = center + half
+                if left < page_rect.x0:
+                    right += page_rect.x0 - left
+                    left = page_rect.x0
+                if right > page_rect.x1:
+                    left -= right - page_rect.x1
+                    right = page_rect.x1
+                left = max(page_rect.x0, left)
+                right = min(page_rect.x1, right)
+        # ── vertical extent ──
+        # A zero-height rect falls back to 12.0 via ``or``; the result is never below 8.0.
+        line_height = max(8.0, float(rect.height) or 12.0)
+        signature_height = max(40.0, line_height * 2.2)
+        # Role-specific minimum heights, all capped at max_height afterwards.
+        if role == "client":
+            signature_height = max(signature_height, 65.0)
+        elif role == "firm":
+            signature_height = max(signature_height, 60.0)
+        elif role in {"representative", "patient", "attorney"}:
+            signature_height = max(signature_height, 55.0)
+        signature_height = min(signature_height, max_height)
+        baseline = float(rect.y1)
+        if mode == "stroke":
+            # Span the stroke line itself plus a margin on each side.
+            margin_above = max(6.0, line_height)
+            margin_below = max(18.0, line_height * 1.5)
+            top = float(rect.y0) - margin_above
+            bottom = float(rect.y1) + margin_below
+            signature_height = min(bottom - top, max_height)
+        elif mode == "image":
+            # Fit the image plus some slack; the top edge ends up no more than 2.0 before
+            # rect.y0, so ``bottom`` is recomputed after ``top`` is adjusted.
+            image_height = float(rect.height) or 12.0
+            signature_height = min(max_height, max(image_height + 18.0, 40.0))
+            extra = max(0.0, signature_height - image_height)
+            top = float(rect.y0) - min(extra * 0.25, 12.0)
+            bottom = top + signature_height
+            top = max(float(rect.y0) - 2.0, top)
+            bottom = top + signature_height
+        else:
+            # Label mode: the region starts a gap past the label's y1.
+            gap_above = max(10.0, min(24.0, line_height * 0.9))
+            top = baseline + gap_above
+            bottom = top + signature_height
+        original_top = top
+        # Stop 4 units before the exclusion boundary; ``top`` never moves before original_top.
+        if exclusion_y0 is not None:
+            limited = exclusion_y0 - 4.0
+            if bottom > limited:
+                bottom = limited
+                top = max(original_top, bottom - signature_height)
+        # Image crops may extend at most 24 units past the image's y1.
+        if mode == "image":
+            limit_below = float(rect.y1) + 24.0
+            if bottom > limit_below:
+                bottom = limit_below
+                top = max(float(rect.y0) - 4.0, bottom - signature_height)
+        if bottom - top > max_height:
+            bottom = top + max_height
+            signature_height = min(signature_height, max_height)
+        # Keep the region on the page.
+        if bottom > page_rect.y1:
+            bottom = page_rect.y1
+            top = max(original_top, bottom - signature_height)
+        if bottom - top > max_height:
+            bottom = top + max_height
+        # The clamps above can collapse the region; fall back to a band that starts one
+        # line height before the baseline.
+        if top >= bottom:
+            top = max(page_rect.y0, baseline - line_height)
+            bottom = min(page_rect.y1, top + min(signature_height, max_height))
+        return fitz.Rect(left, top, right, bottom)

sigdetect/detector/pypdf2_engine.py CHANGED Viewed

@@ -212,7 +212,9 @@ class PyPDF2Detector(Detector):
                 hits.add(f"VendorText:{rx.pattern}")
         return hits
-    def _ScanPageVendors(self, page) -> set[str]:
+    def _ScanPageVendors(self, page) -> tuple[set[str], str]:
+        """Return vendor hits along with the extracted page text."""
         found: set[str] = set()
         with _QuietIo():
@@ -234,7 +236,7 @@ class PyPDF2Detector(Detector):
             if rx.search(txt):
                 found.add(f"VendorText:{rx.pattern}")
-        return found
+        return found, txt
     def _IterateFormXObjects(self, page) -> Iterator[generic.DictionaryObject]:
         """Yield Form XObject dictionaries recursively from page resources."""
@@ -438,6 +440,40 @@ class PyPDF2Detector(Detector):
         nm = GetFieldNameFromAncestry(wdict)
         return "" if nm is None else str(nm)
+    def _WidgetBoundingBox(
+        self, wdict: generic.DictionaryObject
+    ) -> tuple[float, float, float, float] | None:
+        """Look up a widget's ``/Rect``, falling back to the ``/Rect`` of its ``/Parent``.
+
+        Returns the normalized ``(x0, y0, x1, y1)`` tuple produced by
+        ``_RectToTuple``, or ``None`` when neither dictionary yields one.
+        """
+        own_rect = self._RectToTuple(wdict.get("/Rect"))
+        if own_rect:
+            return own_rect
+        parent = AsDictionary(wdict.get("/Parent"))
+        if not isinstance(parent, generic.DictionaryObject):
+            return None
+        return self._RectToTuple(parent.get("/Rect"))
+    def _RectToTuple(self, candidate) -> tuple[float, float, float, float] | None:
+        """Convert a four-element PDF array into an ordered ``(x0, y0, x1, y1)`` tuple.
+
+        Indirect references are resolved first (a failed resolution is ignored).
+        ``None`` is returned for a missing value, anything that is not a
+        four-element array, or an element that cannot be converted to ``float``.
+        """
+        if candidate is None:
+            return None
+        if isinstance(candidate, generic.IndirectObject):
+            try:
+                candidate = candidate.get_object()
+            except Exception:
+                pass
+        is_rect_array = isinstance(candidate, generic.ArrayObject) and len(candidate) == 4
+        if not is_rect_array:
+            return None
+        values: list[float] = []
+        for element in candidate:
+            try:
+                number = float(element)
+            except Exception:
+                return None
+            values.append(number)
+        x_a, y_a, x_b, y_b = values
+        # Order each axis so the first coordinate is the smaller one.
+        x_pair = (x_b, x_a) if x_b < x_a else (x_a, x_b)
+        y_pair = (y_b, y_a) if y_b < y_a else (y_a, y_b)
+        return x_pair[0], y_pair[0], x_pair[1], y_pair[1]
     @staticmethod
     def _PickNameAny(d: generic.DictionaryObject) -> str | None:
         for key in ("/T", "/TU", "/TM"):
@@ -685,7 +721,7 @@ class PyPDF2Detector(Detector):
             for page in reader.pages:
                 # per-page vendor
-                pv = self._ScanPageVendors(page)
+                pv, page_text = self._ScanPageVendors(page)
                 x_hits: set[str] = set()
                 x_text = ""
                 if self.RecurseXObjects:
@@ -693,12 +729,10 @@ class PyPDF2Detector(Detector):
                 vendor_hints |= pv | x_hits
                 vendor_hits_per_page.append(len(pv) + len(x_hits))
-                with _QuietIo():
-                    txt = page.extract_text() or ""
                 if x_text:
-                    txt = f"{txt} {x_text}".strip() if txt else x_text.strip()
-                page_texts.append(txt)
-                any_text = any_text or bool(txt)
+                    page_text = f"{page_text} {x_text}".strip() if page_text else x_text.strip()
+                page_texts.append(page_text)
+                any_text = any_text or bool(page_text)
                 # image counting
                 img_count = 0
@@ -760,6 +794,7 @@ class PyPDF2Detector(Detector):
                     field_name = self._FieldNameForWidget(wdict)
                     page_obj = reader.pages[idx - 1] if 0 <= (idx - 1) < len(reader.pages) else None
                     render_type = self._ClassifyAppearance(wdict, page_obj)
+                    bounding_box = self._WidgetBoundingBox(wdict)
                     # de-dup by object ref (if present) and (page, name)
                     if isinstance(ref, generic.IndirectObject):
@@ -801,6 +836,7 @@ class PyPDF2Detector(Detector):
                             Evidence=evidence,
                             Hint=(f"AcroSig:{field_name}" if field_name else "AcroSig"),
                             RenderType=render_type,
+                            BoundingBox=bounding_box,
                         )
                     )
@@ -969,6 +1005,7 @@ class PyPDF2Detector(Detector):
                 field_name = self._FieldNameForWidget(wdict)
                 page_obj = reader.pages[idx - 1] if 0 <= (idx - 1) < len(reader.pages) else None
                 render_type = self._ClassifyAppearance(wdict, page_obj)
+                bounding_box = self._WidgetBoundingBox(wdict)
                 # de-dup by object ref (if present) and (page, name)
                 if isinstance(ref, generic.IndirectObject):
@@ -995,6 +1032,7 @@ class PyPDF2Detector(Detector):
                         Evidence=evidence,
                         Hint=(f"AcroSig:{field_name}" if field_name else "AcroSig"),
                         RenderType=render_type,
+                        BoundingBox=bounding_box,
                     )
                 )

sigdetect/detector/signature_model.py CHANGED Viewed

@@ -18,6 +18,8 @@ class Signature:
     Evidence: list[str]
     Hint: str
     RenderType: str = "unknown"
+    BoundingBox: tuple[float, float, float, float] | None = None
+    CropPath: str | None = None
     def to_dict(self) -> dict[str, Any]:
         """Return the legacy snake_case representation used in JSON payloads."""
@@ -31,4 +33,6 @@ class Signature:
             "evidence": list(self.Evidence),
             "hint": self.Hint,
             "render_type": self.RenderType,
+            "bounding_box": list(self.BoundingBox) if self.BoundingBox else None,
+            "crop_path": self.CropPath,
         }

{sigdetect-0.1.0.dist-info → sigdetect-0.2.0.dist-info}/METADATA RENAMED Viewed

@@ -1,13 +1,12 @@
 Metadata-Version: 2.4
 Name: sigdetect
-Version: 0.1.0
+Version: 0.2.0
 Summary: Signature detection and role attribution for PDFs
 Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
 License: MIT
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 Requires-Dist: pypdf>=4.0.0
-Requires-Dist: pandas>=2.0
 Requires-Dist: rich>=13.0
 Requires-Dist: typer>=0.12
 Requires-Dist: pydantic>=2.5
@@ -102,6 +101,8 @@ sigdetect detect \
 - `--profile` selects tuned role logic:
   - `hipaa` → patient / representative / attorney
   - `retainer` → client / firm (prefers detecting two signatures)
+- `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
+- `--crop-signatures` enables PNG crops for each detected widget (requires installing the optional `pymupdf` dependency). Use `--crop-dir` to override the destination and `--crop-dpi` to choose rendering quality.
 - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
 ### EDA (quick aggregate stats)
@@ -135,7 +136,7 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
 print(result.to_dict())
 ~~~
-`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)).
+`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image.
 ---
@@ -146,7 +147,17 @@ Import from `sigdetect.api` and get plain dicts out (JSON-ready),
 with no I/O side effects by default:
 ~~~python
-from sigdetect.api import DetectPdf, DetectMany, ScanDirectory, ToCsvRow, Version
+from pathlib import Path
+from sigdetect.api import (
+    CropSignatureImages,
+    DetectMany,
+    DetectPdf,
+    ScanDirectory,
+    ToCsvRow,
+    Version,
+    get_detector,
+)
 print("sigdetect", Version())
@@ -178,6 +189,15 @@ for res in ScanDirectory(
     # store in DB, print, etc.
     pass
+# 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
+detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
+file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
+CropSignatureImages(
+    "/path/to/pdfs/example.pdf",
+    file_result,
+    outputDirectory="./signature_crops",
+    dpi=200,
+)
 ~~~
@@ -205,7 +225,10 @@ High-level summary (per file):
       "score": 5,
       "scores": { "field": 3, "page_label": 2 },
       "evidence": ["field:patient", "page_label:patient"],
-      "hint": "AcroSig:sig_patient"
+      "hint": "AcroSig:sig_patient",
+      "render_type": "typed",
+      "bounding_box": [10.0, 10.0, 150.0, 40.0],
+      "crop_path": "signature_crops/example/sig_01_patient.png"
     },
     {
       "page": null,
@@ -214,7 +237,10 @@ High-level summary (per file):
       "score": 6,
       "scores": { "page_label": 4, "general": 2 },
       "evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
-      "hint": "VendorOrAcroOnly"
+      "hint": "VendorOrAcroOnly",
+      "render_type": "unknown",
+      "bounding_box": null,
+      "crop_path": null
     }
   ]
 }
@@ -227,6 +253,8 @@ High-level summary (per file):
 - **`mixed`** means both `esign_found` and `scanned_pdf` are `true`.
 - **`roles`** summarizes unique non-`unknown` roles across signatures.
 - In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
+- **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
+- **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
 ---
@@ -252,6 +280,9 @@ engine: pypdf2
 pseudo_signatures: true
 recurse_xobjects: true
 profile: retainer    # or: hipaa
+crop_signatures: false   # enable to write PNG crops (requires pymupdf)
+# crop_output_dir: ./signature_crops
+crop_image_dpi: 200
 ~~~
 YAML files can be customized or loaded at runtime (see CLI `--config`, if available, or import and pass patterns into the engine).

{sigdetect-0.1.0.dist-info → sigdetect-0.2.0.dist-info}/RECORD RENAMED Viewed

@@ -1,7 +1,8 @@
 sigdetect/__init__.py,sha256=LhY78mDZ1ClYVNTxW_qtE-vqJoN9N7N5ZcNRDUI_3ss,575
-sigdetect/api.py,sha256=Un4SaZHNAmRLPh1aF9bzOfT6ibilT_y9C0xVmNlqHtI,4248
-sigdetect/cli.py,sha256=jm7aStuv64MCcZZkzv8ncNVGGg8FYIFKjkTPNfXWUgs,3136
-sigdetect/config.py,sha256=d3_AlAEFUHBoXyTbUAHQLTARVqM8q4I8q4xfwakPE0M,4165
+sigdetect/api.py,sha256=F7bM0ctYmtczjqSbsl7MkUZQ28wkRnLAYt1WxfCtzk4,8518
+sigdetect/cli.py,sha256=NctAnaB-TQrUAT9m-v8kj2_KTNs88kbFOCiX32tHZm8,7920
+sigdetect/config.py,sha256=S0NVKuJYiHJCocL-VNFdGJpasFcjTecavC4EthyS1DQ,5951
+sigdetect/cropping.py,sha256=89xPwXhWkJC5E0oW2e3_fDyERH5YGqyt4q4B-HSld4o,4084
 sigdetect/eda.py,sha256=S92G1Gjmepri__D0n_V6foq0lQgH-RXI9anW8A58jfw,4681
 sigdetect/logging_setup.py,sha256=LMF8ao_a-JwH0S522T6aYTFX3e8Ajjv_5ODS2YiBcHA,6404
 sigdetect/utils.py,sha256=T9rubLf5T9JmjOHYMOba1j34fhOJaWocAXccnGTxRUE,5198
@@ -12,11 +13,11 @@ sigdetect/detector/__init__.py,sha256=up2FCmD09f2bRHcS4WbY-clx3GQbWuk1PM2JlxgusH
 sigdetect/detector/base.py,sha256=L-iXWXqsTetDc4jRZo_wOdbNpKqOY20mX9FefrugdT0,263
 sigdetect/detector/base_detector.py,sha256=GmAgUWO_fQgIfnihZSoyhR3wpnwZ-X3hS0Kuyz4G6Ys,608
 sigdetect/detector/file_result_model.py,sha256=j2gTc9Sw3fJOHlexYsR_m5DiwHA8DzIzAMToESfvo4A,1767
-sigdetect/detector/pymupdf_engine.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sigdetect/detector/pypdf2_engine.py,sha256=e3JasLxI8K10IkpMcijES2EjA7RluNpKq6027oNROPU,45770
-sigdetect/detector/signature_model.py,sha256=nApd53aDRMZhOLdUlmoEPjHO1hs8leM6NysG10v-jVc,857
-sigdetect-0.1.0.dist-info/METADATA,sha256=7au6ZW0VN_y3JyZQJux6zEUO8BMBEp6qVn0HO86aXlU,10363
-sigdetect-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-sigdetect-0.1.0.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
-sigdetect-0.1.0.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
-sigdetect-0.1.0.dist-info/RECORD,,
+sigdetect/detector/pymupdf_engine.py,sha256=iyp7JuPlUnydwohH5zbNg4MwH44mBmxbBWOS3ZmArBo,17339
+sigdetect/detector/pypdf2_engine.py,sha256=INWQH06kMLvto2VS-EdLC-EtMC6AG7JmdVYmNgx6_RU,47313
+sigdetect/detector/signature_model.py,sha256=mztb9V5wgv2oohQ5Cxzcv8_Bo6TyWAVIXteaeQ2rywQ,1076
+sigdetect-0.2.0.dist-info/METADATA,sha256=HzF-CmGBs48_Cqv9Dv9AdXo_UoztA-tLPxVMN1fXOH0,11866
+sigdetect-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+sigdetect-0.2.0.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
+sigdetect-0.2.0.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
+sigdetect-0.2.0.dist-info/RECORD,,

{sigdetect-0.1.0.dist-info → sigdetect-0.2.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{sigdetect-0.1.0.dist-info → sigdetect-0.2.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{sigdetect-0.1.0.dist-info → sigdetect-0.2.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

sigdetect 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

sigdetect 0.1.0py3-none-any.whl → 0.2.0py3-none-any.whl