PyPI - sigdetect - Versions diffs - 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

sigdetect 0.3.1 (py3-none-any.whl) → 0.4.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

sigdetect/__init__.py +1 -1
sigdetect/api.py +7 -5
sigdetect/cli.py +37 -0
sigdetect/config.py +43 -3
sigdetect/cropping.py +7 -3
sigdetect/detector/__init__.py +18 -1
sigdetect/detector/pymupdf_engine.py +1 -0
sigdetect/detector/pypdf2_engine.py +7 -5
sigdetect/detector/signature_model.py +1 -1
sigdetect/wet_detection.py +499 -0
{sigdetect-0.3.1.dist-info → sigdetect-0.4.0.dist-info}/METADATA +12 -18
sigdetect-0.4.0.dist-info/RECORD +24 -0
sigdetect-0.3.1.dist-info/RECORD +0 -23
{sigdetect-0.3.1.dist-info → sigdetect-0.4.0.dist-info}/WHEEL +0 -0
{sigdetect-0.3.1.dist-info → sigdetect-0.4.0.dist-info}/entry_points.txt +0 -0
{sigdetect-0.3.1.dist-info → sigdetect-0.4.0.dist-info}/top_level.txt +0 -0

sigdetect/__init__.py CHANGED Viewed

@@ -21,4 +21,4 @@ try:
 except PackageNotFoundError:  # pragma: no cover
     __version__ = "0.0.0"
-DEFAULT_ENGINE = "pypdf2"
+DEFAULT_ENGINE = "auto"

sigdetect/api.py CHANGED Viewed

@@ -10,7 +10,7 @@ from sigdetect.config import DetectConfiguration
 from sigdetect.cropping import SignatureCrop
 from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
-EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
+EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
 ProfileName = Literal["hipaa", "retainer"]
@@ -18,7 +18,7 @@ def DetectPdf(
     pdfPath: str | Path,
     *,
     profileName: ProfileName = "hipaa",
-    engineName: EngineName = "pypdf2",
+    engineName: EngineName = "auto",
     includePseudoSignatures: bool = True,
     recurseXObjects: bool = True,
     detector: Detector | None = None,
@@ -43,7 +43,7 @@ def get_detector(
     *,
     pdfRoot: str | Path | None = None,
     profileName: ProfileName = "hipaa",
-    engineName: EngineName = "pypdf2",
+    engineName: EngineName = "auto",
     includePseudoSignatures: bool = True,
     recurseXObjects: bool = True,
     outputDirectory: str | Path | None = None,
@@ -201,7 +201,8 @@ def CropSignatureImages(
     dpi: int = 200,
     returnBytes: Literal[False] = False,
     saveToDisk: bool = True,
-) -> list[Path]: ...
+) -> list[Path]:
+    ...
 @overload
@@ -213,7 +214,8 @@ def CropSignatureImages(
     dpi: int,
     returnBytes: Literal[True],
     saveToDisk: bool,
-) -> list[SignatureCrop]: ...
+) -> list[SignatureCrop]:
+    ...
 def CropSignatureImages(

sigdetect/cli.py CHANGED Viewed

@@ -15,6 +15,7 @@ from .cropping import SignatureCroppingUnavailable, crop_signatures
 from .detector import BuildDetector, FileResult
 from .eda import RunExploratoryAnalysis
 from .logging_setup import ConfigureLogging
+from .wet_detection import apply_wet_detection
 Logger = ConfigureLogging()
@@ -72,6 +73,33 @@ def Detect(
         help="Rendering DPI for signature crops",
         show_default=False,
     ),
+    detectWetSignatures: bool | None = typer.Option(
+        None,
+        "--detect-wet/--no-detect-wet",
+        help="Run OCR-backed wet signature detection (requires PyMuPDF + Tesseract)",
+        show_default=False,
+    ),
+    wetOcrDpi: int | None = typer.Option(
+        None,
+        "--wet-ocr-dpi",
+        min=72,
+        max=600,
+        help="Rendering DPI for OCR pages (wet detection)",
+        show_default=False,
+    ),
+    wetOcrLanguages: str | None = typer.Option(
+        None,
+        "--wet-ocr-languages",
+        help="Tesseract language packs for OCR (e.g., 'eng' or 'eng+spa')",
+    ),
+    wetPrecisionThreshold: float | None = typer.Option(
+        None,
+        "--wet-precision-threshold",
+        min=0.0,
+        max=1.0,
+        help="Minimum wet-signature confidence (0-1) to accept a candidate",
+        show_default=False,
+    ),
 ) -> None:
     """Run detection for the configured directory and emit ``results.json``."""
@@ -89,6 +117,14 @@ def Detect(
         overrides["CropOutputDirectory"] = cropDirectory
     if cropDpi is not None:
         overrides["CropImageDpi"] = cropDpi
+    if detectWetSignatures is not None:
+        overrides["DetectWetSignatures"] = detectWetSignatures
+    if wetOcrDpi is not None:
+        overrides["WetOcrDpi"] = wetOcrDpi
+    if wetOcrLanguages is not None:
+        overrides["WetOcrLanguages"] = wetOcrLanguages
+    if wetPrecisionThreshold is not None:
+        overrides["WetPrecisionThreshold"] = wetPrecisionThreshold
     if overrides:
         configuration = configuration.model_copy(update=overrides)
         configuration = FinalizeConfiguration(configuration)
@@ -182,6 +218,7 @@ def Detect(
     def _process(pdf_path: Path) -> None:
         file_result = detector.Detect(pdf_path)
+        apply_wet_detection(pdf_path, configuration, file_result, logger=Logger)
         _append_result(file_result, pdf_path)
     try:

sigdetect/config.py CHANGED Viewed

@@ -10,7 +10,7 @@ from typing import Literal
 import yaml
 from pydantic import BaseModel, ConfigDict, Field, field_validator
-EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
+EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
 ProfileName = Literal["hipaa", "retainer"]
@@ -25,13 +25,19 @@ class DetectConfiguration(BaseModel):
     PdfRoot: Path = Field(default=Path("hipaa_results"), alias="pdf_root")
     OutputDirectory: Path | None = Field(default=Path("out"), alias="out_dir")
-    Engine: EngineName = Field(default="pypdf2", alias="engine")
+    Engine: EngineName = Field(default="auto", alias="engine")
     Profile: ProfileName = Field(default="hipaa", alias="profile")
     PseudoSignatures: bool = Field(default=True, alias="pseudo_signatures")
     RecurseXObjects: bool = Field(default=True, alias="recurse_xobjects")
-    CropSignatures: bool = Field(default=False, alias="crop_signatures")
+    CropSignatures: bool = Field(default=True, alias="crop_signatures")
     CropOutputDirectory: Path | None = Field(default=None, alias="crop_output_dir")
     CropImageDpi: int = Field(default=200, alias="crop_image_dpi", ge=72, le=600)
+    DetectWetSignatures: bool = Field(default=True, alias="detect_wet_signatures")
+    WetOcrDpi: int = Field(default=200, alias="wet_ocr_dpi", ge=72, le=600)
+    WetOcrLanguages: str = Field(default="eng", alias="wet_ocr_languages")
+    WetPrecisionThreshold: float = Field(
+        default=0.82, alias="wet_precision_threshold", ge=0.0, le=1.0
+    )
     @field_validator("PdfRoot", "OutputDirectory", "CropOutputDirectory", mode="before")
     @classmethod
@@ -85,6 +91,22 @@ class DetectConfiguration(BaseModel):
     def crop_image_dpi(self) -> int:  # pragma: no cover - simple passthrough
         return self.CropImageDpi
+    @property
+    def detect_wet_signatures(self) -> bool:  # pragma: no cover - simple passthrough
+        """Snake-case accessor returning ``DetectWetSignatures``."""
+        return self.DetectWetSignatures
+    @property
+    def wet_ocr_dpi(self) -> int:  # pragma: no cover - simple passthrough
+        """Snake-case accessor returning ``WetOcrDpi``."""
+        return self.WetOcrDpi
+    @property
+    def wet_ocr_languages(self) -> str:  # pragma: no cover - simple passthrough
+        """Snake-case accessor returning ``WetOcrLanguages``."""
+        return self.WetOcrLanguages
+    @property
+    def wet_precision_threshold(self) -> float:  # pragma: no cover - simple passthrough
+        """Snake-case accessor returning ``WetPrecisionThreshold``."""
+        return self.WetPrecisionThreshold
 def LoadConfiguration(path: Path | None) -> DetectConfiguration:
     """Load configuration from ``path`` while applying environment overrides.
@@ -108,6 +130,10 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
     env_crop = os.getenv("SIGDETECT_CROP_SIGNATURES")
     env_crop_dir = os.getenv("SIGDETECT_CROP_DIR")
     env_crop_dpi = os.getenv("SIGDETECT_CROP_DPI")
+    env_detect_wet = os.getenv("SIGDETECT_DETECT_WET")
+    env_wet_dpi = os.getenv("SIGDETECT_WET_OCR_DPI")
+    env_wet_lang = os.getenv("SIGDETECT_WET_LANGUAGES")
+    env_wet_precision = os.getenv("SIGDETECT_WET_PRECISION")
     raw_data: dict[str, object] = {}
     if path and Path(path).exists():
@@ -133,6 +159,20 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
     if env_crop_dpi:
         with suppress(ValueError):
             raw_data["crop_image_dpi"] = int(env_crop_dpi)
+    if env_detect_wet is not None:
+        lowered = env_detect_wet.lower()
+        if lowered in {"1", "true", "yes", "on"}:
+            raw_data["detect_wet_signatures"] = True
+        elif lowered in {"0", "false", "no", "off"}:
+            raw_data["detect_wet_signatures"] = False
+    if env_wet_dpi:
+        with suppress(ValueError):
+            raw_data["wet_ocr_dpi"] = int(env_wet_dpi)
+    if env_wet_lang:
+        raw_data["wet_ocr_languages"] = env_wet_lang
+    if env_wet_precision:
+        with suppress(ValueError):
+            raw_data["wet_precision_threshold"] = float(env_wet_precision)
     configuration = DetectConfiguration(**raw_data)
     return FinalizeConfiguration(configuration)

sigdetect/cropping.py CHANGED Viewed

@@ -40,7 +40,9 @@ def crop_signatures(
     dpi: int = 200,
     logger: logging.Logger | None = None,
     return_bytes: Literal[False] = False,
-) -> list[Path]: ...
+    save_files: bool = True,
+) -> list[Path]:
+    ...
 @overload
@@ -51,8 +53,10 @@ def crop_signatures(
     output_dir: Path,
     dpi: int = 200,
     logger: logging.Logger | None = None,
-    return_bytes: Literal[True] = True,
-) -> list[SignatureCrop]: ...
+    return_bytes: Literal[True],
+    save_files: bool = True,
+) -> list[SignatureCrop]:
+    ...
 def crop_signatures(

sigdetect/detector/__init__.py CHANGED Viewed

@@ -2,6 +2,7 @@
 from __future__ import annotations
+import warnings
 from typing import TYPE_CHECKING, Type
 from .base_detector import Detector
@@ -37,7 +38,23 @@ def BuildDetector(configuration: DetectConfiguration) -> Detector:
         or getattr(configuration, "engine", None)
         or PyPDF2Detector.Name
     )
-    normalized = engine_name.lower()
+    normalized = str(engine_name).lower()
+    if normalized == "auto":
+        detector_cls: Type[Detector] | None = None
+        if PyMuPDFDetector is not None:
+            detector_cls = ENGINE_REGISTRY.get(getattr(PyMuPDFDetector, "Name", "")) or PyMuPDFDetector
+        if detector_cls is None:
+            detector_cls = ENGINE_REGISTRY.get(PyPDF2Detector.Name) or ENGINE_REGISTRY.get("pypdf")
+            warnings.warn(
+                "Engine 'auto' falling back to 'pypdf2' because PyMuPDF is unavailable",
+                RuntimeWarning,
+                stacklevel=2,
+            )
+        if detector_cls is None:
+            available = ", ".join(sorted(ENGINE_REGISTRY)) or "<none>"
+            raise ValueError(f"No available detector engines. Available engines: {available}")
+        return detector_cls(configuration)
     detector_cls = ENGINE_REGISTRY.get(normalized)
     if detector_cls is None:

sigdetect/detector/pymupdf_engine.py CHANGED Viewed

@@ -111,6 +111,7 @@ class PyMuPDFDetector(PyPDF2Detector):
                     rect, exclusion, mode = rect_info
                     padded = self._PadRect(rect, page.rect, signature.Role, exclusion, mode)
                     signature.BoundingBox = self._RectToPdfTuple(padded, page.rect.height)
+                    signature.RenderType = "drawn"
                     if signature.Page is None:
                         signature.Page = page_index + 1
                     break

sigdetect/detector/pypdf2_engine.py CHANGED Viewed

@@ -348,7 +348,7 @@ class PyPDF2Detector(Detector):
         return normalized.lower().startswith("im")
     def _ClassifyAppearance(self, widget: generic.DictionaryObject, page) -> str:
-        """Classify the widget's appearance as drawn/typed/hybrid/unknown."""
+        """Classify the widget's appearance as drawn or typed."""
         ap_dict = AsDictionary(widget.get("/AP"))
         if not isinstance(ap_dict, generic.DictionaryObject):
@@ -356,7 +356,7 @@ class PyPDF2Detector(Detector):
         normal = ap_dict.get("/N")
         streams = self._ExtractAppearanceStreams(normal)
         if not streams:
-            return "unknown"
+            return "typed"
         has_text = False
         has_vector = False
@@ -384,13 +384,11 @@ class PyPDF2Detector(Detector):
                         has_image = True
                         break
-        if has_image and (has_text or has_vector):
-            return "hybrid"
         if has_image:
             return "drawn"
         if has_text or has_vector:
             return "typed"
-        return "unknown"
+        return "typed"
     # ---- file-wide stream scan (compressed or not)
     def _ScanFileStreamsForVendors(self, file_bytes: bytes) -> tuple[set[str], str]:
@@ -863,6 +861,7 @@ class PyPDF2Detector(Detector):
                                 Scores={r: sc},
                                 Evidence=ev + ["pseudo:true"],
                                 Hint="VendorOrAcroOnly",
+                                RenderType="typed",
                             )
                         )
@@ -903,6 +902,7 @@ class PyPDF2Detector(Detector):
                                 Scores={role: score} if score > 0 else {},
                                 Evidence=ev + ["pseudo:true"],
                                 Hint="VendorOrAcroOnly",
+                                RenderType="typed",
                             )
                         )
@@ -1055,6 +1055,7 @@ class PyPDF2Detector(Detector):
                         Scores=scores,
                         Evidence=evidence,
                         Hint=f"AcroSig:{fname}" if fname else "AcroSig",
+                        RenderType="typed",
                     )
                 )
@@ -1120,6 +1121,7 @@ class PyPDF2Detector(Detector):
                         Scores=dict(scores),
                         Evidence=evidence + ["pseudo:true"],
                         Hint="VendorOrAcroOnly",
+                        RenderType="typed",
                     )
                 )

sigdetect/detector/signature_model.py CHANGED Viewed

@@ -17,7 +17,7 @@ class Signature:
     Scores: dict[str, int]
     Evidence: list[str]
     Hint: str
-    RenderType: str = "unknown"
+    RenderType: str = "typed"
     BoundingBox: tuple[float, float, float, float] | None = None
     CropPath: str | None = None

sigdetect/wet_detection.py ADDED Viewed

@@ -0,0 +1,499 @@
+"""Wet signature detection via OCR-backed heuristics."""
+from __future__ import annotations
+import logging
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterable, Sequence
+from PIL import Image
+from sigdetect.config import DetectConfiguration
+from sigdetect.detector.file_result_model import FileResult
+from sigdetect.detector.signature_model import Signature
+try:  # pragma: no cover - optional dependency
+    import fitz  # type: ignore
+except Exception:  # pragma: no cover - optional dependency
+    fitz = None  # type: ignore[misc]
+try:  # pragma: no cover - optional dependency
+    import pytesseract  # type: ignore
+    from pytesseract import Output as TesseractOutput
+except Exception:  # pragma: no cover - optional dependency
+    pytesseract = None  # type: ignore[assignment]
+    TesseractOutput = None  # type: ignore[assignment]
+LOGGER = logging.getLogger("sigdetect.wet")
+# Patterns are searched in lower-cased OCR line text; a line counts as a
+# signature label as soon as any one of them matches.
+SIGNATURE_PATTERNS: tuple[re.Pattern[str], ...] = (
+    re.compile(r"\bsignature\b"),
+    re.compile(r"\bsigned\b"),
+    re.compile(r"\bsign\b"),
+    re.compile(r"\bsignature\s+of\b"),
+    re.compile(r"\bsignature\s*:"),
+    re.compile(r"\bsignature\s*-"),
+    # No trailing ``\b`` here: a word boundary after ":" only exists when a word
+    # character follows immediately, so "by: name" and a bare "by:" never matched.
+    re.compile(r"\bby:"),
+)
+# Role -> substrings searched in lower-cased text. _infer_role returns the first
+# role (in insertion order) that has any matching keyword, so entry order
+# matters: "attorney" and "counsel" are listed under both "firm" and "attorney",
+# and "firm" wins for those two because it comes first.
+ROLE_KEYWORDS: dict[str, tuple[str, ...]] = {
+    "client": ("client", "consumer", "claimant"),
+    "firm": ("firm", "attorney", "counsel", "by:", "esq", "law"),
+    "patient": ("patient", "self", "plaintiff"),
+    "representative": ("guardian", "representative", "parent", "poa"),
+    "attorney": ("attorney", "counsel", "lawyer"),
+}
+class WetDetectionUnavailable(RuntimeError):
+    """Raised when OCR-backed detection cannot run.
+
+    Raised in this module when PyMuPDF or pytesseract could not be imported,
+    or when the pytesseract OCR call itself fails.
+    """
+@dataclass
+class OcrLine:
+    """Structured OCR line extracted from pytesseract."""
+    # Space-joined text of the OCR words grouped into this line.
+    text: str
+    # Confidence on a 0-1 scale (pytesseract "conf" divided by 100).
+    confidence: float
+    # Bounding box edges in pixels of the image that was passed to OCR;
+    # right/bottom are computed as left + width and top + height.
+    left: int
+    top: int
+    right: int
+    bottom: int
+def should_run_wet_pipeline(file_result: FileResult) -> bool:
+    """Return ``True`` when the OCR pipeline should run for ``file_result``.
+
+    The pipeline runs when no electronic signature was found, when the
+    signature count is zero, or when the file is flagged as mixed content.
+    A separate "scanned and no electronic signature" test is unnecessary: it
+    can only be true when the first condition already is.
+    """
+    if not file_result.ElectronicSignatureFound:
+        return True
+    if file_result.SignatureCount == 0:
+        return True
+    return bool(file_result.MixedContent)
+def apply_wet_detection(
+    pdf_path: Path,
+    configuration: DetectConfiguration,
+    file_result: FileResult,
+    *,
+    logger: logging.Logger | None = None,
+) -> bool:
+    """Run the wet-signature OCR pass for one PDF and fold hits into ``file_result``.
+
+    Returns ``True`` only when at least one wet signature was added. Nothing
+    happens when wet detection is disabled in ``configuration`` or when
+    ``should_run_wet_pipeline`` declines the file. Missing dependencies, an
+    empty result and unexpected errors each add a ``ManualReview:`` hint to
+    ``file_result`` and yield ``False``; no exception is propagated from the
+    detection step.
+    """
+    wanted = configuration.DetectWetSignatures and should_run_wet_pipeline(file_result)
+    if not wanted:
+        return False
+    try:
+        _ensure_dependencies()
+    except WetDetectionUnavailable as missing:
+        reason = str(missing)
+        _mark_manual_review(file_result, reason)
+        if logger:
+            logger.warning("Wet detection unavailable", extra={"error": reason})
+        return False
+    try:
+        found = _detect(pdf_path, configuration, file_result, logger=logger)
+        if not found:
+            _mark_manual_review(file_result, "NoHighConfidenceWetSignature")
+    except Exception as failure:  # pragma: no cover - defensive
+        _mark_manual_review(file_result, "WetDetectionError")
+        if logger:
+            logger.warning("Wet detection failed", extra={"error": str(failure)})
+        return False
+    return found
+def _detect(
+    pdf_path: Path,
+    configuration: DetectConfiguration,
+    file_result: FileResult,
+    *,
+    logger: logging.Logger | None = None,
+) -> bool:
+    """Render every page, OCR it and append accepted wet signatures to ``file_result``.
+
+    Each page is rasterised at ``configuration.WetOcrDpi``. Candidates come from
+    OCR label lines and from embedded images; those scoring at least
+    ``configuration.WetPrecisionThreshold`` are kept. Returns ``True`` when
+    any signature was appended (metadata is refreshed in that case) and
+    ``False`` otherwise. The document is always closed.
+
+    Raises:
+        WetDetectionUnavailable: if PyMuPDF or pytesseract is not importable.
+    """
+    if fitz is None or pytesseract is None:
+        raise WetDetectionUnavailable("PyMuPDF or pytesseract not available")
+    document = fitz.open(pdf_path)  # type: ignore[attr-defined]
+    try:
+        # PDF user space is 72 units per inch, so dpi / 72 is the zoom factor.
+        zoom = configuration.WetOcrDpi / 72.0
+        render_matrix = fitz.Matrix(zoom, zoom)
+        detected: list[Signature] = []
+        for page_number in range(1, document.page_count + 1):
+            page = document.load_page(page_number - 1)
+            pixmap = page.get_pixmap(matrix=render_matrix, alpha=False)
+            image = _pixmap_to_image(pixmap)
+            ocr_lines = _extract_ocr_lines(image, configuration.WetOcrLanguages)
+            candidates = list(
+                _build_candidates(
+                    ocr_lines,
+                    image=image,
+                    page_rect=page.rect,
+                    pix_width=pixmap.width,
+                    pix_height=pixmap.height,
+                    scale=configuration.WetOcrDpi / 72.0,
+                )
+            )
+            candidates.extend(_image_candidates(page))
+            accepted = []
+            for candidate in candidates:
+                if candidate.Score >= configuration.WetPrecisionThreshold:
+                    accepted.append(candidate)
+            if logger:
+                summary = {
+                    "pdf": pdf_path.name,
+                    "page": page_number,
+                    "candidates": len(candidates),
+                    "accepted": len(accepted),
+                }
+                logger.debug("Wet detection page summary", extra=summary)
+            detected.extend(_to_signatures(accepted, page_number))
+        if detected:
+            file_result.Signatures.extend(detected)
+            _refresh_metadata(file_result)
+        return bool(detected)
+    finally:
+        document.close()
+def _ensure_dependencies() -> None:
+    """Raise ``WetDetectionUnavailable`` unless PyMuPDF and pytesseract were imported.
+
+    PyMuPDF is checked first, so its message wins when both are missing.
+    """
+    ocr_missing = pytesseract is None or TesseractOutput is None
+    if fitz is None:
+        raise WetDetectionUnavailable("PyMuPDF is required for wet detection (install 'pymupdf').")
+    if ocr_missing:
+        message = (
+            "pytesseract is required for wet detection and depends on the Tesseract OCR binary."
+        )
+        raise WetDetectionUnavailable(message)
+def _pixmap_to_image(pixmap) -> Image.Image:
+    """Build an RGB PIL image from the raw samples of a rendered pixmap."""
+    size = (pixmap.width, pixmap.height)
+    if not pixmap.alpha:
+        return Image.frombytes("RGB", size, pixmap.samples)
+    # With an alpha channel the samples are decoded as RGBA, then alpha is dropped.
+    return Image.frombytes("RGBA", size, pixmap.samples).convert("RGB")
+def _extract_ocr_lines(image: Image.Image, languages: str) -> list[OcrLine]:
+    """OCR ``image`` and merge the word-level results into one ``OcrLine`` per line.
+
+    Words are grouped by their (block, paragraph, line) numbers. Empty words
+    and words with a confidence of zero or less are dropped. A line's box is
+    the union of its word boxes, and its confidence is updated pairwise with
+    each new word (mean of the running value and the new word's value).
+
+    Raises:
+        WetDetectionUnavailable: if pytesseract is missing or the OCR call fails.
+    """
+    if pytesseract is None or TesseractOutput is None:
+        raise WetDetectionUnavailable("pytesseract unavailable")
+    try:
+        data = pytesseract.image_to_data(image, lang=languages, output_type=TesseractOutput.DICT)
+    except Exception as exc:  # pragma: no cover - passthrough to manual review
+        raise WetDetectionUnavailable(f"OCR failed: {exc}") from exc
+    merged: dict[tuple[int, int, int], OcrLine] = {}
+    for idx, raw_text in enumerate(data.get("text", [])):
+        word = (raw_text or "").strip()
+        if not word:
+            continue
+        word_conf = float(data["conf"][idx])
+        if word_conf <= 0:
+            continue
+        line_key = (data["block_num"][idx], data["par_num"][idx], data["line_num"][idx])
+        x_left = int(data["left"][idx])
+        y_top = int(data["top"][idx])
+        x_right = x_left + int(data["width"][idx])
+        y_bottom = y_top + int(data["height"][idx])
+        current = merged.get(line_key)
+        if current is not None:
+            # Extend the existing line with this word.
+            current.text = f"{current.text} {word}"
+            current.confidence = min(1.0, (current.confidence + word_conf / 100.0) / 2.0)
+            current.left = min(current.left, x_left)
+            current.top = min(current.top, y_top)
+            current.right = max(current.right, x_right)
+            current.bottom = max(current.bottom, y_bottom)
+            continue
+        merged[line_key] = OcrLine(
+            text=word,
+            confidence=word_conf / 100.0,
+            left=x_left,
+            top=y_top,
+            right=x_right,
+            bottom=y_bottom,
+        )
+    return list(merged.values())
+@dataclass
+class WetCandidate:
+    """A possible wet-signature location on one page, before score thresholding."""
+    # (x0, y0, x1, y1) box; producers in this module divide pixel values by the
+    # render scale and flip y against the page/pixmap height.
+    bbox: tuple[float, float, float, float]
+    # A ROLE_KEYWORDS key, or "unknown" when no keyword matched.
+    Role: str
+    # Confidence capped at 1.0; compared against WetPrecisionThreshold.
+    Score: float
+    # Free-form "key:value" tags describing why the candidate was produced.
+    Evidence: list[str]
+def _build_candidates(
+    lines: Iterable[OcrLine],
+    *,
+    image: Image.Image,
+    page_rect,
+    pix_width: int,
+    pix_height: int,
+    scale: float,
+) -> Iterable[WetCandidate]:
+    """Yield a scored ``WetCandidate`` for each OCR line that looks like a signature label.
+
+    A line is considered only when it contains a signature keyword, is at most
+    80 characters long and its bottom edge lies at or below 40% of the page
+    height. The score is the OCR confidence plus keyword, stroke and position
+    bonuses, capped at 1.0. ``pix_width`` is accepted but not used.
+    """
+    for ocr_line in lines:
+        lowered = ocr_line.text.lower()
+        # Long matches are treated as paragraph text rather than labels.
+        if not _has_signature_keyword(lowered) or len(lowered) > 80:
+            continue
+        vertical_fraction = ocr_line.bottom / pix_height
+        if vertical_fraction < 0.4:
+            # Upper section of the page: skipped.
+            continue
+        role = _infer_role(lowered)
+        stroke_found, stroke_y = _stroke_under_line(image, ocr_line)
+        boost = _keyword_bonus(lowered)
+        if stroke_found:
+            boost += 0.12
+        if vertical_fraction > 0.7:
+            # Positional prior favouring lines low on the page.
+            boost += 0.05
+        score = min(1.0, ocr_line.confidence + boost)
+        stroke_tag = "stroke:yes" if stroke_found else "stroke:no"
+        box = _expand_bbox(ocr_line, page_rect, pix_height, scale, stroke_y=stroke_y)
+        yield WetCandidate(
+            bbox=box,
+            Role=role,
+            Score=score,
+            Evidence=[
+                f"ocr_line:{ocr_line.text.strip()}",
+                f"ocr_conf:{score:.2f}",
+                "wet:true",
+                stroke_tag,
+            ],
+        )
+def _infer_role(normalized_text: str) -> str:
+    """Return the first ``ROLE_KEYWORDS`` role with a keyword found in the text.
+
+    Matching is plain substring containment; ``"unknown"`` is returned when no
+    role matches.
+    """
+    matches = (
+        role
+        for role, keywords in ROLE_KEYWORDS.items()
+        if any(keyword in normalized_text for keyword in keywords)
+    )
+    return next(matches, "unknown")
+def _keyword_bonus(normalized_text: str) -> float:
+    """Return the score adjustment earned by keywords present in the text.
+
+    "signature" adds 0.05, "date" subtracts 0.02 and "by:" adds 0.03; the
+    adjustments accumulate in that order.
+    """
+    adjustments = (
+        ("signature", 0.05),
+        ("date", -0.02),
+        ("by:", 0.03),
+    )
+    bonus = 0.0
+    for keyword, delta in adjustments:
+        if keyword in normalized_text:
+            bonus += delta
+    return bonus
+def _has_signature_keyword(normalized_text: str) -> bool:
+    """Return ``True`` when any pattern in ``SIGNATURE_PATTERNS`` matches the text."""
+    for pattern in SIGNATURE_PATTERNS:
+        if pattern.search(normalized_text):
+            return True
+    return False
+def _expand_bbox(
+    line: OcrLine,
+    page_rect,
+    pix_height: int,
+    scale: float,
+    *,
+    stroke_y: float | None = None,
+) -> tuple[float, float, float, float]:
+    """Derive the signature box that goes with an OCR label line.
+
+    Pixel coordinates are divided by ``scale`` and the y axis is flipped
+    against ``pix_height``. The box is the label's horizontal extent padded by
+    25% (at least 14 units) and clamped to the page, starting 14 units beyond
+    the label's top edge and spanning 70 units. When ``stroke_y`` (an image
+    row) is given, the box is stretched so that it also covers the stroke.
+
+    Returns:
+        ``(left, top, right, bottom)`` where ``top`` is the smaller and
+        ``bottom`` the larger y value.
+    """
+    x0 = line.left / scale
+    x1 = line.right / scale
+    # Flip the image row (origin at the top) into a y measured from the bottom.
+    label_top = (pix_height - line.top) / scale
+    pad_x = max(14.0, (x1 - x0) * 0.25)
+    left = max(page_rect.x0, x0 - pad_x)
+    right = min(page_rect.x1, x1 + pad_x)
+    gap = 14.0
+    signature_height = 70.0
+    top = min(page_rect.y1, label_top + gap)
+    bottom = min(page_rect.y1, top + signature_height)
+    if bottom <= top:
+        # Clamping collapsed the box against the page edge. Pull the near edge
+        # back so the box keeps its height instead of degenerating to a line
+        # (previously this branch just recomputed the same ``bottom``).
+        top = max(page_rect.y0, bottom - signature_height)
+    if stroke_y is not None:
+        # Anchor to the detected stroke under the OCR label when available.
+        sy = (pix_height - stroke_y) / scale
+        if sy < top:
+            top = sy
+        bottom = max(bottom, sy + signature_height)
+    return (float(left), float(top), float(right), float(bottom))
+def _stroke_under_line(image: Image.Image, line: OcrLine) -> tuple[bool, float | None]:
+    """Heuristic: look for a dark horizontal stroke beneath the OCR line.
+
+    A strip up to 28 pixels tall, starting 2 pixels below the line and padded
+    10 pixels on each side, is scanned row by row. A row's density is the
+    share of its pixels darker than 160 (grayscale).
+
+    Returns:
+        ``(True, y)`` with the image row of the densest strip row (the first
+        one on ties) when that density is at least 0.32, else ``(False, None)``.
+    """
+    pad_x = 10
+    strip_height = 28
+    x0 = max(0, line.left - pad_x)
+    x1 = min(image.width, line.right + pad_x)
+    y0 = min(image.height, line.bottom + 2)
+    y1 = min(image.height, y0 + strip_height)
+    if x1 <= x0 or y1 <= y0:
+        return False, None
+    # Convert only the strip: grayscale conversion is per pixel, so cropping
+    # first yields the same values without converting the whole page per line.
+    strip = image.crop((x0, y0, x1, y1)).convert("L")
+    width = strip.width
+    # Mode "L" serialises to one byte per pixel, rows back to back.
+    pixels = strip.tobytes()
+    max_density = 0.0
+    best_row = None
+    # Simple density scan: percentage of dark pixels per row.
+    threshold = 160
+    for row in range(strip.height):
+        row_pixels = pixels[row * width : (row + 1) * width]
+        dark = sum(1 for px in row_pixels if px < threshold)
+        density = dark / width
+        if density > max_density:
+            max_density = density
+            best_row = row
+    if max_density < 0.32 or best_row is None:
+        return False, None
+    return True, float(y0 + best_row)
+def _image_candidates(page) -> list[WetCandidate]:
+    """Heuristic: treat small, wide images near signature areas as wet signatures.
+
+    An image qualifies when it is wider than 40 and taller than 15 units, has a
+    width/height ratio of at least 1.6 and covers at most 10% of the page.
+    The score is 0.9 when nearby text suggests a role and 0.84 otherwise. The
+    returned boxes have their y axis flipped against the page height.
+    """
+    page_width = float(page.rect.width)
+    page_height = float(page.rect.height)
+    page_area = page_width * page_height
+    words = page.get_text("words") or []
+    found: list[WetCandidate] = []
+    for info in page.get_image_info(xrefs=True) or []:
+        rect = info.get("bbox") or info.get("rect")
+        if rect is None:
+            continue
+        if hasattr(rect, "x0"):
+            coords = (float(rect.x0), float(rect.y0), float(rect.x1), float(rect.y1))
+        elif isinstance(rect, (tuple, list)) and len(rect) == 4:
+            coords = tuple(map(float, rect))
+        else:
+            continue
+        x0, y0, x1, y1 = coords
+        width = float(x1 - x0)
+        height = float(y1 - y0)
+        # Tiny marks/logos and images that are not wide enough are skipped.
+        too_small = width <= 40 or height <= 15
+        if too_small or (width / height if height else 0.0) < 1.6:
+            continue
+        if (width * height) / page_area > 0.1:
+            # Large illustrations/backgrounds are ignored.
+            continue
+        role = _infer_role_nearby(rect, words)
+        role_known = role != "unknown"
+        evidence = ["image_signature:true"]
+        if role_known:
+            evidence.append(f"role_hint:{role}")
+        found.append(
+            WetCandidate(
+                bbox=(x0, float(page_height - y1), x1, float(page_height - y0)),
+                Role=role,
+                Score=min(1.0, 0.9 if role_known else 0.84),
+                Evidence=evidence,
+            )
+        )
+    return found
+def _infer_role_nearby(rect, words) -> str:
+    """Best-effort role inference using text near the image rectangle.
+
+    ``rect`` may expose ``x0``/``y0``/``x1``/``y1`` attributes or be a
+    4-item tuple/list. Words within 48 units vertically and 140 units
+    horizontally of it are joined, lower-cased and passed to ``_infer_role``.
+    ``"unknown"`` is returned for an unusable ``rect`` or when no word is near.
+    """
+    if hasattr(rect, "x0"):
+        bounds = (float(rect.x0), float(rect.y0), float(rect.x1), float(rect.y1))
+    elif isinstance(rect, (tuple, list)) and len(rect) == 4:
+        bounds = tuple(map(float, rect))
+    else:
+        return "unknown"
+    rx0, ry0, rx1, ry1 = bounds
+    reach_y = 48.0
+    reach_x = 140.0
+    tokens: list[str] = []
+    for word in words:
+        if len(word) < 5:
+            # Entries too short to carry a box plus text are skipped.
+            continue
+        wx0, wy0, wx1, wy1, token, *_ = word
+        outside_rows = wy1 < ry0 - reach_y or wy0 > ry1 + reach_y
+        if outside_rows:
+            continue
+        outside_cols = wx1 < rx0 - reach_x or wx0 > rx1 + reach_x
+        if outside_cols:
+            continue
+        tokens.append(str(token))
+    if not tokens:
+        return "unknown"
+    return _infer_role(" ".join(tokens).lower())
+def _needs_wet_enhancement(file_result: FileResult) -> bool:
+    """Placeholder for deciding whether wet OCR should refine pseudo/unknown signatures.
+
+    Currently always returns ``False``; ``file_result`` is not inspected.
+    """
+    return False
+def _to_signatures(
+    candidates: Sequence[WetCandidate],
+    page_number: int,
+) -> list[Signature]:
+    """Convert accepted candidates on one page into ``Signature`` records.
+
+    Scores are rescaled from 0-1 to rounded integer percentages. Every record
+    gets the field name ``wet_signature_detected``, the hint
+    ``WetSignatureOCR`` and the render type ``wet``.
+    """
+    def _convert(candidate: WetCandidate) -> Signature:
+        percent = int(round(candidate.Score * 100))
+        return Signature(
+            Page=page_number,
+            FieldName="wet_signature_detected",
+            Role=candidate.Role,
+            Score=percent,
+            Scores={candidate.Role: percent},
+            Evidence=candidate.Evidence,
+            Hint="WetSignatureOCR",
+            RenderType="wet",
+            BoundingBox=candidate.bbox,
+        )
+    return [_convert(candidate) for candidate in candidates]
+def _mark_manual_review(file_result: FileResult, reason: str) -> None:
+    """Add a ``ManualReview:<reason>`` entry to the semicolon-separated ``Hints``.
+
+    Existing hints are kept, duplicates collapse, and the result is sorted.
+    """
+    tags = _split_hints(file_result.Hints)
+    tags.add(f"ManualReview:{reason}")
+    # ``tags`` holds at least the entry just added, so it is never empty here.
+    file_result.Hints = ";".join(sorted(tags))
+def _refresh_metadata(file_result: FileResult) -> None:
+    """Recompute the summary fields of ``file_result`` from its ``Signatures`` list.
+
+    Updates the count, the sorted page list, the roles (only when at least one
+    role other than "unknown" exists), the ``ElectronicSignatureFound`` and
+    ``MixedContent`` flags, and merges every signature hint into ``Hints``.
+    """
+    signatures = file_result.Signatures
+    total = len(signatures)
+    file_result.SignatureCount = total
+    pages = {sig.Page for sig in signatures if sig.Page}
+    file_result.SignaturePages = ",".join(str(page) for page in sorted(pages))
+    known_roles = {sig.Role for sig in signatures if sig.Role != "unknown"}
+    if known_roles:
+        file_result.Roles = ";".join(sorted(known_roles))
+    has_signatures = total > 0
+    file_result.ElectronicSignatureFound = has_signatures
+    file_result.MixedContent = has_signatures and bool(file_result.ScannedPdf)
+    merged = _split_hints(file_result.Hints)
+    merged.update(sig.Hint for sig in signatures if sig.Hint)
+    file_result.Hints = ";".join(sorted(merged))
+def _split_hints(hints: str | None) -> set[str]:
+    """Split a semicolon-separated hint string into a set of non-empty hints.
+
+    ``None`` and the empty string both yield an empty set.
+    """
+    return set(filter(None, hints.split(";"))) if hints else set()

{sigdetect-0.3.1.dist-info → sigdetect-0.4.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sigdetect
-Version: 0.3.1
+Version: 0.4.0
 Summary: Signature detection and role attribution for PDFs
 Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
 License: MIT
@@ -95,14 +95,14 @@ sigdetect detect \
 ### Notes
 - The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
-- `--engine` supports **pypdf2** (default); a **pymupdf** engine placeholder exists and may be included in a future build.
+- `--engine` accepts **auto** (default; prefers PyMuPDF when installed, falls back to PyPDF2), **pypdf2**, or **pymupdf**.
 - `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
 - `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
 - `--profile` selects tuned role logic:
   - `hipaa` → patient / representative / attorney
   - `retainer` → client / firm (prefers detecting two signatures)
 - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
-- `--crop-signatures` enables PNG crops for each detected widget (requires installing the optional `pymupdf` dependency). Use `--crop-dir` to override the destination and `--crop-dpi` to choose rendering quality.
+- Cropping (`--crop-signatures`) and wet detection (`--detect-wet`) are enabled by default for single-pass runs; disable them if you want a light, e-sign-only pass. PyMuPDF is required for crops; PyMuPDF + Tesseract are required for wet detection.
 - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
 ### EDA (quick aggregate stats)
@@ -136,15 +136,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
 print(result.to_dict())
 ~~~
-`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image.
+`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
 ---
 ## Library API (embed in another script)
-Minimal, plug-and-play API
-Import from `sigdetect.api` and get plain dicts out (JSON-ready),
-with no I/O side effects by default:
+Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping:
 ~~~python
 from pathlib import Path
@@ -192,23 +190,14 @@ for res in ScanDirectory(
 # 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
 detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
 file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
-crops = CropSignatureImages(
+CropSignatureImages(
     "/path/to/pdfs/example.pdf",
     file_result,
     outputDirectory="./signature_crops",
     dpi=200,
-    returnBytes=True,  # also returns in-memory PNG bytes for each crop
-    # saveToDisk=False,  # optional: skip writing PNGs to disk
 )
-first_crop = crops[0]
-print(first_crop.path, len(first_crop.image_bytes))
 ~~~
-When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
-PNG bytes, and the originating signature metadata.
-Pass ``saveToDisk=False`` if you only want in-memory PNG bytes (no files on disk or ``crop_path`` updates).
 ## Result schema
@@ -247,7 +236,7 @@ High-level summary (per file):
       "scores": { "page_label": 4, "general": 2 },
       "evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
       "hint": "VendorOrAcroOnly",
-      "render_type": "unknown",
+      "render_type": "typed",
       "bounding_box": null,
       "crop_path": null
     }
@@ -292,6 +281,10 @@ profile: retainer    # or: hipaa
 crop_signatures: false   # enable to write PNG crops (requires pymupdf)
 # crop_output_dir: ./signature_crops
 crop_image_dpi: 200
+detect_wet_signatures: false   # opt-in OCR wet detection (PyMuPDF + Tesseract)
+wet_ocr_dpi: 200
+wet_ocr_languages: eng
+wet_precision_threshold: 0.82
 ~~~
 YAML files can be customized or loaded at runtime (see CLI `--config`, if available, or import and pass patterns into the engine).
@@ -306,6 +299,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
   - Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
   - Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
   - When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
+- **Wet detection (opt-in):** With `detect_wet_signatures: true`, the CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
 ---

sigdetect-0.4.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,24 @@
+sigdetect/__init__.py,sha256=YvnTwlC1jfq83EhQS_1JjiiHK7_wJCCU1JvHv5E1qWY,573
+sigdetect/api.py,sha256=qLCpbODLvw5AQMEAvpIP6kBNoc03h01ekjilg9tDxuw,9408
+sigdetect/cli.py,sha256=Zco3-r4MAlVEmyEatvPUOZLLamh5ELFrquAK6ovJVlw,9290
+sigdetect/config.py,sha256=-6GCUusdi0Ba-Rt6pwffB5MIz1ApPlBaXVKxpIppbKk,7678
+sigdetect/cropping.py,sha256=zwOXzkG8tt1ZPUaDhJMHfonFEZtVNZZmZOzYQ_4nUAI,6074
+sigdetect/eda.py,sha256=S92G1Gjmepri__D0n_V6foq0lQgH-RXI9anW8A58jfw,4681
+sigdetect/logging_setup.py,sha256=LMF8ao_a-JwH0S522T6aYTFX3e8Ajjv_5ODS2YiBcHA,6404
+sigdetect/utils.py,sha256=T9rubLf5T9JmjOHYMOba1j34fhOJaWocAXccnGTxRUE,5198
+sigdetect/wet_detection.py,sha256=6ciFxMQS3f1nF502w4KLTksoYmjdudzTekh7McfWiIg,16464
+sigdetect/data/role_rules.retainer.yml,sha256=IFdwKnDBXR2cTkdfrsZ6ku6CXD8S_dg5A3vKRKLW5h8,2532
+sigdetect/data/role_rules.yml,sha256=HuLKsZR_A6sD9XvY4NHiY_VG3dS5ERNCBF9-Mxawomw,2751
+sigdetect/data/vendor_patterns.yml,sha256=NRbZNQxcx_GuL6n1jAphBn6MM6ChCpeWGCsjbRx-PEo,384
+sigdetect/detector/__init__.py,sha256=pUVFLwqj65cVO1qjsZy6NJ9BVY5xrJ6sQe-8LAb9O_A,2421
+sigdetect/detector/base.py,sha256=L-iXWXqsTetDc4jRZo_wOdbNpKqOY20mX9FefrugdT0,263
+sigdetect/detector/base_detector.py,sha256=GmAgUWO_fQgIfnihZSoyhR3wpnwZ-X3hS0Kuyz4G6Ys,608
+sigdetect/detector/file_result_model.py,sha256=j2gTc9Sw3fJOHlexYsR_m5DiwHA8DzIzAMToESfvo4A,1767
+sigdetect/detector/pymupdf_engine.py,sha256=SGtJOStKFdfsdBrscoe5zg9u2KGJ_JTRYZ25adL_7Lw,17390
+sigdetect/detector/pypdf2_engine.py,sha256=kB8cIp_gMvCla0LIBi9sd19g0361Oc9TjCW_ZViUBJQ,47410
+sigdetect/detector/signature_model.py,sha256=sdfQiOJzxnrg0WkGJxZCebA0wHqgzZnLI0gOv6ipSZA,1074
+sigdetect-0.4.0.dist-info/METADATA,sha256=WA7OjyLtM3AH7OtdFRmliqBw0ucNlywoD2bykytlnPA,12475
+sigdetect-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+sigdetect-0.4.0.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
+sigdetect-0.4.0.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
+sigdetect-0.4.0.dist-info/RECORD,,

sigdetect-0.3.1.dist-info/RECORD DELETED Viewed

@@ -1,23 +0,0 @@
-sigdetect/__init__.py,sha256=LhY78mDZ1ClYVNTxW_qtE-vqJoN9N7N5ZcNRDUI_3ss,575
-sigdetect/api.py,sha256=6_CMSxcag9coHHzrpuRSVimHWSNtqQiWY9hdlqQ2IKY,9396
-sigdetect/cli.py,sha256=NctAnaB-TQrUAT9m-v8kj2_KTNs88kbFOCiX32tHZm8,7920
-sigdetect/config.py,sha256=S0NVKuJYiHJCocL-VNFdGJpasFcjTecavC4EthyS1DQ,5951
-sigdetect/cropping.py,sha256=dmJF4Q1tkmkfm0NaiwHddNOP8Sj9S4Lj_d5EBjodEkk,6015
-sigdetect/eda.py,sha256=S92G1Gjmepri__D0n_V6foq0lQgH-RXI9anW8A58jfw,4681
-sigdetect/logging_setup.py,sha256=LMF8ao_a-JwH0S522T6aYTFX3e8Ajjv_5ODS2YiBcHA,6404
-sigdetect/utils.py,sha256=T9rubLf5T9JmjOHYMOba1j34fhOJaWocAXccnGTxRUE,5198
-sigdetect/data/role_rules.retainer.yml,sha256=IFdwKnDBXR2cTkdfrsZ6ku6CXD8S_dg5A3vKRKLW5h8,2532
-sigdetect/data/role_rules.yml,sha256=HuLKsZR_A6sD9XvY4NHiY_VG3dS5ERNCBF9-Mxawomw,2751
-sigdetect/data/vendor_patterns.yml,sha256=NRbZNQxcx_GuL6n1jAphBn6MM6ChCpeWGCsjbRx-PEo,384
-sigdetect/detector/__init__.py,sha256=up2FCmD09f2bRHcS4WbY-clx3GQbWuk1PM2JlxgusHg,1608
-sigdetect/detector/base.py,sha256=L-iXWXqsTetDc4jRZo_wOdbNpKqOY20mX9FefrugdT0,263
-sigdetect/detector/base_detector.py,sha256=GmAgUWO_fQgIfnihZSoyhR3wpnwZ-X3hS0Kuyz4G6Ys,608
-sigdetect/detector/file_result_model.py,sha256=j2gTc9Sw3fJOHlexYsR_m5DiwHA8DzIzAMToESfvo4A,1767
-sigdetect/detector/pymupdf_engine.py,sha256=iyp7JuPlUnydwohH5zbNg4MwH44mBmxbBWOS3ZmArBo,17339
-sigdetect/detector/pypdf2_engine.py,sha256=INWQH06kMLvto2VS-EdLC-EtMC6AG7JmdVYmNgx6_RU,47313
-sigdetect/detector/signature_model.py,sha256=mztb9V5wgv2oohQ5Cxzcv8_Bo6TyWAVIXteaeQ2rywQ,1076
-sigdetect-0.3.1.dist-info/METADATA,sha256=whXGE4-9spAjlMcZz_owdsIiB4EobXL9_UOuAJeDVfA,12342
-sigdetect-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-sigdetect-0.3.1.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
-sigdetect-0.3.1.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
-sigdetect-0.3.1.dist-info/RECORD,,

{sigdetect-0.3.1.dist-info → sigdetect-0.4.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{sigdetect-0.3.1.dist-info → sigdetect-0.4.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{sigdetect-0.3.1.dist-info → sigdetect-0.4.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

sigdetect 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

sigdetect 0.3.1py3-none-any.whl → 0.4.0py3-none-any.whl