PyPI - sigdetect - Versions diffs - 0.1.0__py3-none-any.whl - Mend

sigdetect 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

sigdetect/__init__.py +24 -0
sigdetect/api.py +139 -0
sigdetect/cli.py +98 -0
sigdetect/config.py +117 -0
sigdetect/data/role_rules.retainer.yml +61 -0
sigdetect/data/role_rules.yml +71 -0
sigdetect/data/vendor_patterns.yml +16 -0
sigdetect/detector/__init__.py +55 -0
sigdetect/detector/base.py +9 -0
sigdetect/detector/base_detector.py +22 -0
sigdetect/detector/file_result_model.py +59 -0
sigdetect/detector/pymupdf_engine.py +0 -0
sigdetect/detector/pypdf2_engine.py +1114 -0
sigdetect/detector/signature_model.py +34 -0
sigdetect/eda.py +137 -0
sigdetect/logging_setup.py +218 -0
sigdetect/utils.py +152 -0
sigdetect-0.1.0.dist-info/METADATA +394 -0
sigdetect-0.1.0.dist-info/RECORD +22 -0
sigdetect-0.1.0.dist-info/WHEEL +5 -0
sigdetect-0.1.0.dist-info/entry_points.txt +2 -0
sigdetect-0.1.0.dist-info/top_level.txt +1 -0

sigdetect/detector/signature_model.py ADDED Viewed

@@ -0,0 +1,34 @@
+"""Signature model returned by detection engines."""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any
+@dataclass(slots=True)
+class Signature:
+    """Metadata describing a detected signature field.
+    Attributes use PascalCase on the model; :meth:`to_dict` exposes the same
+    data under the legacy snake_case keys used in JSON payloads.
+    """
+    # Page associated with the signature; the annotation allows ``None``
+    # (presumably when the page could not be determined - confirm with the engines).
+    Page: int | None
+    # Name of the form field the signature was found on.
+    FieldName: str
+    # Role label assigned to this signature.
+    Role: str
+    # Single integer score for the signature (presumably the score of ``Role`` - confirm).
+    Score: int
+    # Integer scores keyed by string (presumably one entry per candidate role - confirm).
+    Scores: dict[str, int]
+    # Free-form strings recorded as evidence for the detection.
+    Evidence: list[str]
+    Hint: str
+    RenderType: str = "unknown"
+    def to_dict(self) -> dict[str, Any]:
+        """Return the legacy snake_case representation used in JSON payloads.
+        The mutable members (``Scores`` and ``Evidence``) are shallow-copied so
+        that callers editing the returned payload cannot alter this instance.
+        """
+        return {
+            "page": self.Page,
+            "field_name": self.FieldName,
+            "role": self.Role,
+            "score": self.Score,
+            # Copy, matching the treatment of ``Evidence`` below.
+            "scores": dict(self.Scores),
+            "evidence": list(self.Evidence),
+            "hint": self.Hint,
+            "render_type": self.RenderType,
+        }

sigdetect/eda.py ADDED Viewed

@@ -0,0 +1,137 @@
+"""Exploratory data analysis helpers for signature detection output."""
+from __future__ import annotations
+import json
+import statistics
+from collections import Counter
+from pathlib import Path
+from typing import Any
+from rich.console import Console
+from rich.table import Table
+from .config import DetectConfiguration
+# Module-level Rich console; the helpers in this module print through it.
+ConsoleInstance = Console()
+def _SafeNumber(value: Any, defaultValue: float | None = None) -> float | None:
+    """Convert ``value`` with ``float()``; hand back ``defaultValue`` if that raises."""
+    try:
+        coerced = float(value)
+    except Exception:
+        # Any conversion failure (wrong type, unparsable text, ...) is tolerated.
+        coerced = defaultValue
+    return coerced
+def _FormatSizeStatistics(sizeValues: list[float]) -> str:
+    """Return a ``min / median / max`` summary for ``sizeValues``.
+    Each figure is rounded to the nearest integer. ``None`` entries and
+    non-finite floats (``nan``, ``inf``) are ignored; when nothing usable
+    remains the placeholder ``"—"`` is returned instead.
+    """
+    import math  # local import keeps this helper self-contained
+    if not sizeValues:
+        return "—"
+    # ``json.loads`` accepts NaN/Infinity literals and ``float()`` passes them
+    # through, but they make ``int(round(...))`` raise and NaN also breaks the
+    # ordering that ``sorted`` relies on - so drop them up front.
+    usableValues = sorted(
+        value for value in sizeValues if value is not None and math.isfinite(value)
+    )
+    if not usableValues:
+        return "—"
+    minimum = int(round(usableValues[0]))
+    median = int(round(statistics.median(usableValues)))
+    maximum = int(round(usableValues[-1]))
+    return f"{minimum} / {median} / {maximum}"
+def _LoadResults(resultsPath: Path) -> list[dict[str, Any]]:
+    """Load ``results.json`` from disk and guard against malformed content.
+    Returns the decoded top-level list. If the file is missing, cannot be
+    read or parsed, or does not contain a list, a message is printed to the
+    console and an empty list is returned instead of raising.
+    """
+    if not resultsPath.exists():
+        ConsoleInstance.print(f"[yellow]No results.json found at {resultsPath}[/yellow]")
+        return []
+    try:
+        # Decode explicitly as UTF-8 rather than the platform's locale encoding,
+        # which would mis-decode or reject non-ASCII content on some systems.
+        data = json.loads(resultsPath.read_text(encoding="utf-8"))
+    except Exception as exc:
+        # Best-effort loader: I/O, decoding and JSON errors are all reported, not raised.
+        ConsoleInstance.print(f"[red]Failed to read {resultsPath}: {exc}[/red]")
+        return []
+    if not isinstance(data, list):
+        ConsoleInstance.print(f"[red]results.json is not a list: {type(data)}[/red]")
+        return []
+    return data
+def _FlattenSignatures(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Gather every dict found under each row's ``"signatures"`` key into one list.
+    Rows whose ``"signatures"`` entry is missing or falsy contribute nothing,
+    and non-dict items inside a signatures list are skipped.
+    """
+    return [
+        entry
+        for row in rows
+        for entry in (row.get("signatures") or [])
+        if isinstance(entry, dict)
+    ]
+def RunExploratoryAnalysis(configuration: DetectConfiguration) -> None:
+    """Print a summary table and a per-role signature tally for a detection run.
+    ``results.json`` is read from ``configuration.OutputDirectory``, falling
+    back to ``configuration.PdfRoot`` when the former is falsy. Everything is
+    written to the module console; nothing is returned.
+    """
+    baseDirectory = configuration.OutputDirectory or configuration.PdfRoot
+    rows = _LoadResults(baseDirectory / "results.json")
+    if not rows:
+        ConsoleInstance.print("[yellow]No results to summarize.[/yellow]")
+        return
+    def _CountFlag(flagName: str) -> int:
+        # Number of rows whose ``flagName`` entry is truthy.
+        return sum(1 for row in rows if bool(row.get(flagName)))
+    totalCount = len(rows)
+    electronicCount = _CountFlag("esign_found")
+    # Every row not flagged as e-signed is counted as "wet".
+    wetCount = totalCount - electronicCount
+    scannedCount = _CountFlag("scanned_pdf")
+    mixedCount = _CountFlag("mixed")
+    sizeValues = [
+        size for row in rows if (size := _SafeNumber(row.get("size_kb"))) is not None
+    ]
+    summary = Table(show_header=True, header_style="bold")
+    for heading in ("Total", "E-sign", "Wet", "Scans", "Mixed"):
+        summary.add_column(heading, justify="right")
+    summary.add_column("Size KB (min/med/max)", justify="left")
+    summary.add_row(
+        *(str(count) for count in (totalCount, electronicCount, wetCount, scannedCount, mixedCount)),
+        _FormatSizeStatistics(sizeValues),
+    )
+    ConsoleInstance.print(summary)
+    signatures = _FlattenSignatures(rows)
+    roleCounts = Counter((signature.get("role") or "unknown") for signature in signatures)
+    if not signatures:
+        ConsoleInstance.print("\n[dim]No signatures found to break down by role.[/dim]\n")
+        return
+    ConsoleInstance.print("\nSignature roles (per-signature) — including unknown:")
+    preferredOrder = ("patient", "representative", "client", "firm", "attorney", "unknown")
+    # Known roles come first in their fixed order; any others follow alphabetically.
+    orderedRoles = [role for role in preferredOrder if role in roleCounts]
+    alreadyListed = set(orderedRoles)
+    orderedRoles.extend(role for role in sorted(roleCounts) if role not in alreadyListed)
+    bulletLines = [f" • {role:<13} — {roleCounts[role]}" for role in orderedRoles]
+    ConsoleInstance.print("\n".join(bulletLines))
+    ConsoleInstance.print(f"(total signatures tallied: {sum(roleCounts.values())})\n")

sigdetect/logging_setup.py ADDED Viewed

@@ -0,0 +1,218 @@
+"""Logging helpers configured for the CaseWorks standards."""
+from __future__ import annotations
+import json
+import logging
+import os
+from logging.handlers import RotatingFileHandler
+from pathlib import Path
+from typing import Any
+from rich.logging import RichHandler
+# Case-insensitive level names accepted by ``_CoerceLevel`` (looked up upper-cased).
+_LEVEL_MAP = {
+    "CRITICAL": logging.CRITICAL,
+    "ERROR": logging.ERROR,
+    "WARNING": logging.WARNING,
+    "INFO": logging.INFO,
+    "DEBUG": logging.DEBUG,
+    "NOTSET": logging.NOTSET,
+}
+def _CoerceLevel(levelValue: str | int | None) -> int:
+    """Translate the provided logging level into a numeric value.
+    Integers are returned unchanged. Strings are stripped of surrounding
+    whitespace and may be either a level name in any case (``"debug"``) or
+    a decimal number (``"10"``). Anything else - ``None``, an empty string,
+    an unknown name - yields ``logging.INFO``.
+    """
+    if isinstance(levelValue, int):
+        return levelValue
+    if isinstance(levelValue, str):
+        cleaned = levelValue.strip()
+        # Values read from environment variables are always text, so a numeric
+        # level arrives as e.g. "10"; honour it instead of falling back to INFO.
+        if cleaned.isdecimal():
+            return int(cleaned)
+        return _LEVEL_MAP.get(cleaned.upper(), logging.INFO)
+    return logging.INFO
+class JsonFormatter(logging.Formatter):
+    """Minimal JSON formatter with deterministic keys.
+    Each record becomes one JSON object holding a fixed set of base keys
+    (``time``, ``level``, ``name``, ``message``, ``module``, ``func``,
+    ``line``, ``process``, ``thread``) followed by any additional attributes
+    found on the record, such as those supplied through ``extra=``. Extra
+    attributes never overwrite a base key.
+    """
+    default_time_format = "%Y-%m-%dT%H:%M:%S"
+    default_msec_format = "%s.%03d"
+    # Standard ``LogRecord`` attributes that are not copied verbatim into the
+    # payload; several of them are already emitted under one of the base keys.
+    _RESERVED_KEYS = frozenset(
+        {
+            "name",
+            "msg",
+            "args",
+            "levelname",
+            "levelno",
+            "pathname",
+            "filename",
+            "module",
+            "exc_info",
+            "exc_text",
+            "stack_info",
+            "lineno",
+            "funcName",
+            "created",
+            "msecs",
+            "relativeCreated",
+            "thread",
+            "threadName",
+            "process",
+            "processName",
+            "message",
+        }
+    )
+    def format(self, record: logging.LogRecord) -> str:  # noqa: D401 (formatter contract)
+        """Serialise ``record`` to a single-line JSON string."""
+        payload: dict[str, Any] = {
+            "time": self.formatTime(record, self.default_time_format),
+            "level": record.levelname,
+            "name": record.name,
+            "message": record.getMessage(),
+            "module": record.module,
+            "func": record.funcName,
+            "line": record.lineno,
+            "process": record.process,
+            "thread": record.threadName,
+        }
+        for key, value in record.__dict__.items():
+            if key.startswith("_") or key in self._RESERVED_KEYS:
+                continue
+            # ``setdefault`` keeps the base keys authoritative over extras.
+            payload.setdefault(key, value)
+        if record.exc_info:
+            payload["exc_info"] = self.formatException(record.exc_info)
+        # ``default=str`` is deliberate: an ``extra`` value that is not JSON
+        # serialisable (Path, datetime, ...) is stringified rather than making
+        # the handler raise and the log record disappear.
+        return json.dumps(payload, ensure_ascii=False, default=str)
+def _CreateRichHandler(levelValue: int) -> RichHandler:
+    """Build the Rich console handler used for interactive output at ``levelValue``."""
+    handlerOptions: dict[str, Any] = {
+        "level": levelValue,
+        "markup": True,
+        "rich_tracebacks": True,
+        "show_time": True,
+        "show_path": False,
+        "log_time_format": "[%Y-%m-%d %H:%M:%S]",
+    }
+    return RichHandler(**handlerOptions)
+def _CreateFileHandler(
+    logfile: Path, levelValue: int, jsonFormat: bool, maxBytes: int, backupCount: int
+) -> RotatingFileHandler:
+    """Return a UTF-8 rotating file handler for ``logfile``.
+    Missing parent directories are created first. Records are rendered by
+    ``JsonFormatter`` when ``jsonFormat`` is true, otherwise by a plain-text
+    ``logging.Formatter``.
+    """
+    logfile.parent.mkdir(parents=True, exist_ok=True)
+    rotatingHandler = RotatingFileHandler(
+        str(logfile), maxBytes=maxBytes, backupCount=backupCount, encoding="utf-8"
+    )
+    rotatingHandler.setLevel(levelValue)
+    formatter: logging.Formatter
+    if not jsonFormat:
+        formatter = logging.Formatter(
+            fmt="%(asctime)s %(levelname)s %(name)s:%(lineno)d | %(message)s",
+            datefmt="%Y-%m-%d %H:%M:%S",
+        )
+    else:
+        formatter = JsonFormatter()
+    rotatingHandler.setFormatter(formatter)
+    return rotatingHandler
+def _ReadIntegerEnvironment(variableName: str, defaultValue: int, problems: list[str]) -> int:
+    """Read an integer environment variable, falling back to ``defaultValue``.
+    When the variable is set but not a valid integer, a description is
+    appended to ``problems`` so the caller can report it once logging works.
+    """
+    rawValue = os.getenv(variableName)
+    if rawValue is None:
+        return defaultValue
+    try:
+        return int(rawValue)
+    except ValueError:
+        problems.append(f"Ignoring non-integer {variableName}; using default {defaultValue}")
+        return defaultValue
+def ConfigureLogging(
+    levelValue: str | int | None = None,
+    *,
+    logfile: str | Path | None = None,
+    jsonLogs: bool | None = None,
+    useRich: bool | None = None,
+    maxBytes: int | None = None,
+    backupCount: int | None = None,
+    loggerName: str = "sigdetect",
+) -> logging.Logger:
+    """Initialise logging with precedence ``arguments > env vars > defaults``.
+    Environment variables consulted when the matching argument is not given:
+    ``SIGDETECT_LOG_LEVEL``, ``SIGDETECT_LOG_FILE``, ``SIGDETECT_LOG_JSON``,
+    ``SIGDETECT_LOG_RICH``, ``SIGDETECT_LOG_MAX_BYTES`` and
+    ``SIGDETECT_LOG_BACKUPS``.
+    The logger named ``loggerName`` is configured only once: if it has
+    already been set up by this function it is returned untouched and all
+    arguments are ignored. Returns the configured logger.
+    """
+    logger = logging.getLogger(loggerName)
+    # Check first so a repeat call does no work and cannot fail on env parsing.
+    if getattr(logger, "_configured", False):
+        return logger
+    problems: list[str] = []
+    # ``is None``-style test (not truthiness) so an explicit ``logging.NOTSET``
+    # (0) is honoured instead of being overridden by the environment.
+    levelSource = levelValue if levelValue not in (None, "") else os.getenv("SIGDETECT_LOG_LEVEL")
+    resolvedLevel = _CoerceLevel(levelSource)
+    logfileSource = logfile if logfile is not None else os.getenv("SIGDETECT_LOG_FILE")
+    resolvedLogfile = Path(logfileSource) if logfileSource else None
+    resolvedJson = (
+        jsonLogs
+        if jsonLogs is not None
+        else os.getenv("SIGDETECT_LOG_JSON", "false").lower() in {"1", "true", "yes"}
+    )
+    resolvedRich = (
+        useRich
+        if useRich is not None
+        else os.getenv("SIGDETECT_LOG_RICH", "true").lower() in {"1", "true", "yes"}
+    )
+    resolvedMaxBytes = (
+        maxBytes
+        if maxBytes is not None
+        else _ReadIntegerEnvironment("SIGDETECT_LOG_MAX_BYTES", 1048576, problems)
+    )
+    resolvedBackups = (
+        backupCount
+        if backupCount is not None
+        else _ReadIntegerEnvironment("SIGDETECT_LOG_BACKUPS", 5, problems)
+    )
+    logger.setLevel(resolvedLevel)
+    # Handlers are attached directly, so do not also bubble up to the root logger.
+    logger.propagate = False
+    handlers: list[logging.Handler] = []
+    if resolvedRich:
+        handlers.append(_CreateRichHandler(resolvedLevel))
+    else:
+        streamHandler = logging.StreamHandler()
+        streamHandler.setLevel(resolvedLevel)
+        streamHandler.setFormatter(
+            logging.Formatter(
+                fmt="%(asctime)s %(levelname)s %(name)s:%(lineno)d | %(message)s",
+                datefmt="%H:%M:%S",
+            )
+        )
+        handlers.append(streamHandler)
+    if resolvedLogfile:
+        handlers.append(
+            _CreateFileHandler(
+                resolvedLogfile, resolvedLevel, resolvedJson, resolvedMaxBytes, resolvedBackups
+            )
+        )
+    for handler in handlers:
+        logger.addHandler(handler)
+    # Marker read by the guard at the top of this function.
+    logger._configured = True  # type: ignore[attr-defined]
+    # Report rejected environment values now that handlers exist to carry them.
+    for problem in problems:
+        logger.warning(problem)
+    logger.debug(
+        "Logging initialized",
+        extra={
+            "level": resolvedLevel,
+            "logfile": str(resolvedLogfile) if resolvedLogfile else None,
+            "json_logs": resolvedJson,
+            "use_rich": resolvedRich,
+        },
+    )
+    return logger
+def SetVerbosity(logger: logging.Logger, *, verbose: bool = False, quiet: bool = False) -> None:
+    """Set ``logger`` and every handler attached to it to one level.
+    ``quiet`` selects WARNING and takes priority over ``verbose`` (DEBUG);
+    with neither flag set the level is INFO.
+    """
+    chosenLevel = logging.WARNING if quiet else (logging.DEBUG if verbose else logging.INFO)
+    logger.setLevel(chosenLevel)
+    for attachedHandler in logger.handlers:
+        attachedHandler.setLevel(chosenLevel)

sigdetect/utils.py ADDED Viewed

@@ -0,0 +1,152 @@
+"""Utility helpers shared across detectors."""
+from __future__ import annotations
+import re
+from collections.abc import Iterator
+from contextlib import suppress
+from importlib import resources
+from typing import Any, Pattern
+import yaml
+from pypdf import generic
+# Package that ships the YAML rule files, and the vendor pattern file inside it.
+_PACKAGE_NAME = "sigdetect.data"
+_VENDOR_FILE = "vendor_patterns.yml"
+def LoadPatterns(profileName: str | None = None) -> dict[str, Any]:
+    """Load role rules for ``profileName`` and fill in the vendor patterns.
+    ``role_rules.<profileName>.yml`` is tried first when a profile is given,
+    then ``role_rules.yml``; the first file that exists is used. The vendor
+    file's ``bytes`` and ``text`` entries are added only where the role rules
+    do not already define those keys.
+    """
+    dataRoot = resources.files(_PACKAGE_NAME)
+    candidateFiles = ["role_rules.yml"]
+    if profileName:
+        candidateFiles.insert(0, f"role_rules.{profileName}.yml")
+    merged: dict[str, Any] = {}
+    for fileName in candidateFiles:
+        try:
+            with dataRoot.joinpath(fileName).open("rb") as stream:
+                loaded = yaml.safe_load(stream)
+        except FileNotFoundError:
+            # Profile-specific file is optional; move on to the next candidate.
+            continue
+        merged = loaded or {}
+        break
+    with dataRoot.joinpath(_VENDOR_FILE).open("rb") as stream:
+        vendorPatterns = yaml.safe_load(stream) or {}
+    for sectionName in ("bytes", "text"):
+        merged.setdefault(sectionName, vendorPatterns.get(sectionName))
+    return merged
+def NormalizeText(value: str) -> str:
+    """Collapse each run of whitespace to one space and trim both ends.
+    A falsy ``value`` (``None`` or ``""``) is treated as the empty string.
+    """
+    rawText = value if value else ""
+    collapsed = re.sub(r"\s+", " ", rawText)
+    return collapsed.strip()
+def AsDictionary(candidate: Any) -> Any:
+    """Dereference a pypdf ``IndirectObject``; pass anything else through.
+    If dereferencing raises, the original ``candidate`` is returned unchanged.
+    """
+    if not isinstance(candidate, generic.IndirectObject):
+        return candidate
+    try:
+        return candidate.get_object()
+    except Exception:
+        # Resolution is best-effort: hand back the unresolved reference.
+        return candidate
+def IterateWidgets(candidate: Any) -> Iterator[Any]:
+    """Yield every pypdf dictionary reachable through references and arrays.
+    Indirect objects are dereferenced, arrays are walked recursively in order,
+    dictionaries are yielded as-is (not descended into), and ``None`` or any
+    other value produces nothing.
+    """
+    if candidate is None:
+        return
+    if isinstance(candidate, generic.IndirectObject):
+        yield from IterateWidgets(candidate.get_object())
+        return
+    if isinstance(candidate, generic.ArrayObject):
+        for element in candidate:
+            yield from IterateWidgets(element)
+        return
+    if isinstance(candidate, generic.DictionaryObject):
+        yield candidate
+def HasSignatureFieldInAncestry(candidate: Any, maxHops: int = 12) -> bool:
+    """Report whether ``candidate`` or one of its ``/Parent`` ancestors has ``/FT == /Sig``.
+    The walk inspects the starting dictionary plus at most ``maxHops`` parents
+    and stops early as soon as a node is not a pypdf dictionary.
+    """
+    node = AsDictionary(candidate)
+    hopsRemaining = maxHops
+    while hopsRemaining >= 0 and isinstance(node, generic.DictionaryObject):
+        if node.get("/FT") == "/Sig":
+            return True
+        node = AsDictionary(node.get("/Parent"))
+        hopsRemaining -= 1
+    return False
+def HasSignatureValue(candidate: Any) -> bool:
+    """Report whether the widget or its immediate ``/Parent`` holds a signature value.
+    A ``/V`` entry counts when it is a dictionary whose ``/Type`` is ``/Sig``
+    or which has a truthy ``/SubFilter`` or ``/Filter``.
+    """
+    def _ValueLooksSigned(owner: Any) -> bool:
+        # Inspect ``owner``'s /V entry for the markers described above.
+        value = AsDictionary(owner.get("/V"))
+        if not isinstance(value, generic.DictionaryObject):
+            return False
+        return bool(
+            value.get("/Type") == "/Sig" or value.get("/SubFilter") or value.get("/Filter")
+        )
+    widget = AsDictionary(candidate)
+    if not isinstance(widget, generic.DictionaryObject):
+        return False
+    if _ValueLooksSigned(widget):
+        return True
+    # Only one level up is consulted; the parent is resolved lazily.
+    parent = AsDictionary(widget.get("/Parent"))
+    return isinstance(parent, generic.DictionaryObject) and _ValueLooksSigned(parent)
+def GetFieldNameFromAncestry(candidate: Any, maxHops: int = 12) -> str | None:
+    """Return the first truthy ``/T`` found on ``candidate`` or its ``/Parent`` chain.
+    The starting dictionary plus at most ``maxHops`` parents are examined.
+    ``None`` is returned when no name is found or when the name cannot be
+    converted with ``str()``.
+    """
+    node = AsDictionary(candidate)
+    hopsRemaining = maxHops
+    while hopsRemaining >= 0 and isinstance(node, generic.DictionaryObject):
+        label = node.get("/T")
+        if not label:
+            node = AsDictionary(node.get("/Parent"))
+            hopsRemaining -= 1
+            continue
+        try:
+            return str(label)
+        except Exception:
+            return None
+    return None
+def RolesFromLabels(text: str, labelPatterns: dict[str, Pattern[str]]) -> set[str]:
+    """Return the roles whose label pattern matches the whitespace-normalized ``text``."""
+    haystack = NormalizeText(text)
+    matchedRoles: set[str] = set()
+    for roleName, labelPattern in labelPatterns.items():
+        if labelPattern.search(haystack):
+            matchedRoles.add(roleName)
+    return matchedRoles
+def RolesFromGeneral(text: str, generalPatterns: dict[str, Pattern[str]]) -> set[str]:
+    """Return the roles whose general pattern matches the whitespace-normalized ``text``."""
+    haystack = NormalizeText(text)
+    matchedRoles: set[str] = set()
+    for roleName, generalPattern in generalPatterns.items():
+        if generalPattern.search(haystack):
+            matchedRoles.add(roleName)
+    return matchedRoles
+def ChooseRole(scores: dict[str, int]) -> str:
+    """Pick the single highest-scoring role, or ``"unknown"`` when there is none.
+    ``"unknown"`` is returned for an empty mapping, for a tie at the top, and
+    when the best score is not greater than zero.
+    """
+    if not scores:
+        return "unknown"
+    bestScore = max(scores.values())
+    if not bestScore > 0:
+        return "unknown"
+    leaders = [role for role, value in scores.items() if value == bestScore]
+    if len(leaders) != 1:
+        return "unknown"
+    return leaders[0]