sigdetect 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sigdetect/__init__.py ADDED
@@ -0,0 +1,24 @@
1
+ """sigdetect – PDF e-sign detection & role attribution."""
2
+
3
+ from importlib.metadata import PackageNotFoundError, version
4
+
5
# Best-effort suppression of one specific pypdf warning; any failure here
# (including pypdf being unavailable) must not prevent the package from importing.
try:
    import warnings

    from pypdf.errors import PdfReadWarning

    warnings.filterwarnings(
        action="ignore",
        message=r"Multiple definitions in dictionary.*key /Subtype",
        category=PdfReadWarning,
    )
except Exception:
    # Warning configuration is optional, so every error is deliberately ignored.
    pass

# Read the version from the installed distribution metadata and fall back to a
# placeholder when no "sigdetect" distribution is registered.
try:
    __version__ = version("sigdetect")
except PackageNotFoundError:  # pragma: no cover
    __version__ = "0.0.0"

# Engine name used when callers do not choose one.
DEFAULT_ENGINE = "pypdf2"
sigdetect/api.py ADDED
@@ -0,0 +1,139 @@
1
+ """Public helpers for programmatic use of the signature detection engine."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Any, Iterable, Iterator, Literal
7
+
8
+ from sigdetect.config import DetectConfiguration
9
+ from sigdetect.detector import BuildDetector
10
+
11
# Engine identifiers accepted by the public helpers in this module.
EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
# Profile identifiers accepted by the public helpers in this module.
ProfileName = Literal["hipaa", "retainer"]
13
+
14
+
15
def DetectPdf(
    pdfPath: str | Path,
    *,
    profileName: ProfileName = "hipaa",
    engineName: EngineName = "pypdf2",
    includePseudoSignatures: bool = True,
    recurseXObjects: bool = True,
) -> dict[str, Any]:
    """Run detection on one PDF and return the outcome as a plain dictionary.

    :param pdfPath: Location of the PDF to analyse.
    :param profileName: Forwarded to the configuration as ``Profile``.
    :param engineName: Forwarded to the configuration as ``Engine``.
    :param includePseudoSignatures: Forwarded as ``PseudoSignatures``.
    :param recurseXObjects: Forwarded as ``RecurseXObjects``.
    :returns: The detector result converted by :func:`_ToPlainDictionary`.
    """

    targetFile = Path(pdfPath)

    # The configuration is rooted at the file's own directory and has file
    # output switched off (``OutputDirectory=None``).
    settings = {
        "PdfRoot": targetFile.parent,
        "OutputDirectory": None,
        "Engine": engineName,
        "PseudoSignatures": includePseudoSignatures,
        "RecurseXObjects": recurseXObjects,
        "Profile": profileName,
    }
    engine = BuildDetector(DetectConfiguration(**settings))
    return _ToPlainDictionary(engine.Detect(targetFile))
39
+
40
+
41
+ def _ToPlainDictionary(candidate: Any) -> dict[str, Any]:
42
+ """Convert pydantic/dataclass instances to plain dictionaries."""
43
+
44
+ if hasattr(candidate, "to_dict"):
45
+ return candidate.to_dict()
46
+ if hasattr(candidate, "model_dump"):
47
+ return candidate.model_dump() # type: ignore[attr-defined]
48
+ if hasattr(candidate, "dict"):
49
+ return candidate.dict() # type: ignore[attr-defined]
50
+ try:
51
+ from dataclasses import asdict, is_dataclass
52
+
53
+ if is_dataclass(candidate):
54
+ return asdict(candidate)
55
+ except Exception:
56
+ pass
57
+ if isinstance(candidate, dict):
58
+ return {key: _ToPlainValue(candidate[key]) for key in candidate}
59
+ raise TypeError(f"Unsupported result type: {type(candidate)!r}")
60
+
61
+
62
+ def _ToPlainValue(value: Any) -> Any:
63
+ """Best effort conversion for nested structures."""
64
+
65
+ if hasattr(value, "to_dict"):
66
+ return value.to_dict()
67
+ if hasattr(value, "model_dump") or hasattr(value, "dict"):
68
+ return _ToPlainDictionary(value)
69
+ try:
70
+ from dataclasses import asdict, is_dataclass
71
+
72
+ if is_dataclass(value):
73
+ return asdict(value)
74
+ except Exception:
75
+ pass
76
+ if isinstance(value, list):
77
+ return [_ToPlainValue(item) for item in value]
78
+ if isinstance(value, tuple):
79
+ return tuple(_ToPlainValue(item) for item in value)
80
+ if isinstance(value, dict):
81
+ return {key: _ToPlainValue(result) for key, result in value.items()}
82
+ return value
83
+
84
+
85
def DetectMany(
    pdfPaths: Iterable[str | Path],
    **kwargs: Any,
) -> Iterator[dict[str, Any]]:
    """Lazily produce one :func:`DetectPdf` result per entry of ``pdfPaths``.

    Every keyword argument is passed through to :func:`DetectPdf` unchanged.
    """

    yield from (DetectPdf(candidatePath, **kwargs) for candidatePath in pdfPaths)
93
+
94
+
95
def ScanDirectory(
    pdfRoot: str | Path,
    *,
    globPattern: str = "**/*.pdf",
    **kwargs: Any,
) -> Iterator[dict[str, Any]]:
    """Yield detection output for each PDF under ``pdfRoot`` matching ``globPattern``.

    A pattern starting with ``**/`` has that prefix removed and is searched
    recursively; any other pattern is globbed as given. Matches that are not
    regular files, or whose suffix is not ``.pdf`` (compared case-insensitively),
    are skipped. Remaining keyword arguments go to :func:`DetectPdf`.
    """

    recursivePrefix = "**/"
    baseDirectory = Path(pdfRoot)

    if globPattern.startswith(recursivePrefix):
        matches = baseDirectory.rglob(globPattern[len(recursivePrefix):])
    else:
        matches = baseDirectory.glob(globPattern)

    for candidate in matches:
        if not candidate.is_file():
            continue
        if candidate.suffix.lower() != ".pdf":
            continue
        yield DetectPdf(candidate, **kwargs)
112
+
113
+
114
def ToCsvRow(result: dict[str, Any]) -> dict[str, Any]:
    """Project ``result`` onto the fixed set of columns used for CSV export.

    Keys missing from ``result`` map to ``None``; keys outside the column set
    are dropped.
    """

    exportedColumns = (
        "file",
        "size_kb",
        "pages",
        "esign_found",
        "scanned_pdf",
        "mixed",
        "sig_count",
        "sig_pages",
        "roles",
        "hints",
    )
    return {column: result.get(column) for column in exportedColumns}
129
+
130
+
131
def Version() -> str:
    """Return the installed ``sigdetect`` version, or ``"0.0.0-dev"`` on any failure.

    The metadata module is imported inside the function, so the lookup does not
    depend on the CLI stack.
    """

    try:
        import importlib.metadata as packageMetadata

        return packageMetadata.version("sigdetect")
    except Exception:
        return "0.0.0-dev"
sigdetect/cli.py ADDED
@@ -0,0 +1,98 @@
1
+ """Command line interface for the signature detection tool."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from dataclasses import asdict, is_dataclass
7
+ from pathlib import Path
8
+
9
+ import typer
10
+
11
+ from . import __version__
12
+ from .config import LoadConfiguration
13
+ from .detector import BuildDetector
14
+ from .eda import RunExploratoryAnalysis
15
+ from .logging_setup import ConfigureLogging
16
+
17
# Logging is configured once at import time; the returned object is used for error reporting below.
Logger = ConfigureLogging()

# Typer application on which the commands in this module are registered.
CliApplication = typer.Typer(help="Signature detection & role attribution for PDFs")
20
+
21
+
22
+ def _JsonSerializer(candidate):
23
+ """Ensure dataclasses and paths remain JSON serialisable."""
24
+
25
+ if hasattr(candidate, "to_dict"):
26
+ return candidate.to_dict()
27
+ if is_dataclass(candidate):
28
+ return asdict(candidate)
29
+ if isinstance(candidate, Path):
30
+ return str(candidate)
31
+ return str(candidate)
32
+
33
+
34
@CliApplication.command(name="detect")
def Detect(
    configurationPath: Path | None = typer.Option(
        None, "--config", "-c", help="Path to YAML config"
    ),
    profileOverride: str | None = typer.Option(None, "--profile", "-p", help="hipaa or retainer"),
) -> None:
    """Run detection for the configured directory and emit ``results.json``.

    :param configurationPath: Optional YAML configuration file (``--config``).
    :param profileOverride: Optional profile (``--profile``); only ``hipaa`` and
        ``retainer`` are applied, any other value is reported and ignored.
    :raises typer.Exit: With code 2 when the detector cannot be built.
    :raises SystemExit: When the configured directory holds no ``*.pdf`` files.
    """

    configuration = LoadConfiguration(configurationPath)
    if profileOverride in {"hipaa", "retainer"}:
        configuration = configuration.model_copy(update={"Profile": profileOverride})
    elif profileOverride is not None:
        # Tell the user the override was dropped rather than silently running
        # with the configured profile.
        typer.echo(
            f"Ignoring unsupported profile '{profileOverride}'; "
            f"using '{configuration.Profile}'",
            err=True,
        )

    try:
        detector = BuildDetector(configuration)
    except ValueError as exc:
        Logger.error(
            "Detector initialisation failed",
            extra={"engine": configuration.Engine, "error": str(exc)},
        )
        typer.echo(str(exc), err=True)
        raise typer.Exit(code=2) from exc

    # Glob order depends on the filesystem; sorting keeps the output reproducible.
    pdfFiles = sorted(configuration.PdfRoot.glob("*.pdf"))
    if not pdfFiles:
        raise SystemExit(f"No PDFs found in {configuration.PdfRoot}")

    results = [detector.Detect(pdfPath) for pdfPath in pdfFiles]

    # Allow configuration to suppress file output entirely (out_dir: none / SIGDETECT_OUT_DIR=none)
    if configuration.OutputDirectory is None:
        payload = json.dumps(results, indent=2, ensure_ascii=False, default=_JsonSerializer)
        typer.echo(payload)
        typer.echo("Detection completed with output disabled (out_dir=none)")
        return

    outputDirectory = configuration.OutputDirectory
    outputDirectory.mkdir(parents=True, exist_ok=True)

    resultsPath = outputDirectory / "results.json"
    with open(resultsPath, "w", encoding="utf-8") as handle:
        json.dump(results, handle, indent=2, ensure_ascii=False, default=_JsonSerializer)

    typer.echo(f"Wrote {resultsPath}")
77
+
78
+
79
@CliApplication.command(name="eda")
def ExploratoryAnalysis(
    configurationPath: Path | None = typer.Option(
        None, "--config", "-c", help="Path to YAML config"
    ),
) -> None:
    """Load the configuration and hand it to the exploratory analysis routine.

    :param configurationPath: Optional YAML configuration file (``--config``).
    """

    RunExploratoryAnalysis(LoadConfiguration(configurationPath))
89
+
90
+
91
@CliApplication.command(name="version")
def Version() -> None:
    """Echo the package version string."""

    installedVersion = __version__
    typer.echo(installedVersion)
96
+
97
+
98
# Second name for the same Typer application (presumably what the console entry point targets; confirm in packaging metadata).
app = CliApplication
sigdetect/config.py ADDED
@@ -0,0 +1,117 @@
1
+ """Configuration loading utilities for the signature detection service."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from pathlib import Path
7
+ from typing import Literal
8
+
9
+ import yaml
10
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
11
+
12
# Allowed values for the ``Engine`` configuration field.
EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
# Allowed values for the ``Profile`` configuration field.
ProfileName = Literal["hipaa", "retainer"]
14
+
15
+
16
class DetectConfiguration(BaseModel):
    """Immutable runtime settings for signature detection.

    Field names are PascalCase (per the CaseWorks standards), while each alias
    keeps the snake_case key used by the existing YAML configuration and
    environment variables. Values may be supplied under either name.
    """

    model_config = ConfigDict(populate_by_name=True, frozen=True)

    PdfRoot: Path = Field(alias="pdf_root", default=Path("hipaa_results"))
    OutputDirectory: Path | None = Field(alias="out_dir", default=Path("out"))
    Engine: EngineName = Field(alias="engine", default="pypdf2")
    Profile: ProfileName = Field(alias="profile", default="hipaa")
    MaxWorkers: int = Field(alias="max_workers", default=8, ge=1, le=64)
    PseudoSignatures: bool = Field(alias="pseudo_signatures", default=True)
    RecurseXObjects: bool = Field(alias="recurse_xobjects", default=True)

    @field_validator("PdfRoot", "OutputDirectory", mode="before")
    @classmethod
    def _CoercePath(cls, value: str | Path | None) -> Path | None:
        """Normalise path-like input before field validation runs.

        :param value: Raw value for ``PdfRoot`` or ``OutputDirectory``.
        :returns: ``None`` and ``Path`` objects unchanged; anything else wrapped
            in ``Path``.
        """

        if value is None or isinstance(value, Path):
            return value
        return Path(value)

    # Read-only snake_case views of the PascalCase fields, kept for gradual migration.
    @property
    def pdf_root(self) -> Path:  # pragma: no cover - simple passthrough
        """Legacy name for ``PdfRoot``."""
        return self.PdfRoot

    @property
    def out_dir(self) -> Path | None:  # pragma: no cover - simple passthrough
        """Legacy name for ``OutputDirectory``."""
        return self.OutputDirectory

    @property
    def engine(self) -> EngineName:  # pragma: no cover - simple passthrough
        """Legacy name for ``Engine``."""
        return self.Engine

    @property
    def profile(self) -> ProfileName:  # pragma: no cover - simple passthrough
        """Legacy name for ``Profile``."""
        return self.Profile

    @property
    def max_workers(self) -> int:  # pragma: no cover - simple passthrough
        """Legacy name for ``MaxWorkers``."""
        return self.MaxWorkers

    @property
    def pseudo_signatures(self) -> bool:  # pragma: no cover - simple passthrough
        """Legacy name for ``PseudoSignatures``."""
        return self.PseudoSignatures

    @property
    def recurse_xobjects(self) -> bool:  # pragma: no cover - simple passthrough
        """Legacy name for ``RecurseXObjects``."""
        return self.RecurseXObjects
76
+
77
+
78
def LoadConfiguration(path: Path | None) -> DetectConfiguration:
    """Load configuration from ``path`` while applying environment overrides.

    Environment variables provide the final say and follow the existing naming:

    ``SIGDETECT_ENGINE``
        Override the PDF parsing engine.
    ``SIGDETECT_PDF_ROOT``
        Directory that will be scanned for PDF files.
    ``SIGDETECT_OUT_DIR``
        Output directory for generated artefacts. Use ``"none"`` to disable writes.
    ``SIGDETECT_PROFILE``
        Runtime profile that controls which heuristics are applied; values other
        than ``hipaa`` and ``retainer`` are ignored.

    In the YAML file, ``out_dir: none`` (any letter case) also disables writes.

    :param path: Optional YAML file; a missing file or ``None`` yields defaults.
    :returns: The validated configuration. When an output directory is set it is
        created as a side effect.
    :raises TypeError: When the YAML document is not a mapping.
    """

    env_engine = os.getenv("SIGDETECT_ENGINE")
    env_pdf_root = os.getenv("SIGDETECT_PDF_ROOT")
    env_out_dir = os.getenv("SIGDETECT_OUT_DIR")
    env_profile = os.getenv("SIGDETECT_PROFILE")

    raw_data: dict[str, object] = {}
    if path and Path(path).exists():
        with open(path, encoding="utf-8") as handle:
            loaded = yaml.safe_load(handle) or {}
        if not isinstance(loaded, dict):
            # Same exception type the ``**`` expansion below would raise, but
            # with a message that names the offending file.
            raise TypeError(
                f"Configuration file {path} must contain a mapping, "
                f"got {type(loaded).__name__}"
            )
        raw_data = dict(loaded)

    # YAML reads a bare ``none`` as the string "none", not as null. Without this
    # step ``out_dir: none`` would create a directory literally named "none".
    for out_dir_key in ("out_dir", "OutputDirectory"):
        yaml_out_dir = raw_data.get(out_dir_key)
        if isinstance(yaml_out_dir, str) and yaml_out_dir.strip().lower() == "none":
            raw_data[out_dir_key] = None

    if env_engine:
        raw_data["engine"] = env_engine
    if env_pdf_root:
        raw_data["pdf_root"] = env_pdf_root
    if env_out_dir:
        raw_data["out_dir"] = None if env_out_dir.lower() == "none" else env_out_dir
    if env_profile in {"hipaa", "retainer"}:
        raw_data["profile"] = env_profile

    configuration = DetectConfiguration(**raw_data)

    if configuration.OutputDirectory is not None:
        configuration.OutputDirectory.mkdir(parents=True, exist_ok=True)

    return configuration
@@ -0,0 +1,61 @@
1
+ # --- vendor markers
2
+ bytes:
3
+ - '/DocuSign'
4
+ - '/DSS'
5
+ - '/DocTimeStamp'
6
+ - '/Adobe\.PPKLite'
7
+
8
+ text:
9
+ - 'DocuSign\s+Envelope\s+ID'
10
+ - 'Signature\s+Certificate'
11
+ - 'PandaDoc'
12
+ - 'Signed\s+with\s+PandaDoc'
13
+ - 'Reference\s+number'
14
+
15
+ # --- labels near signature lines/boxes ---
16
+ labels:
17
+ client: '(client\s*signature|name\s+of\s+client|client\s+print\s+name|client\s+signature:)'
18
+ firm: '(signature\s+of\s+(the\s+)?attorney|attorney''?s?\s+signature|counsel\s+signature|by:\s+.*(llp|llc|law|attorney|esq))'
19
+ patient: '(client\s*signature|name\s+of\s+client|client\s+print\s+name|client\s+signature:)'
20
+ representative: '(parent\s*/\s*guardian|guardian\s+signature|parent\s+signature)'
21
+ attorney: '(signature\s+of\s+(the\s+)?attorney|attorney''?s?\s+signature|counsel\s+signature|by:\s+.*(llp|llc|law|attorney|esq))'
22
+
23
+ # --- general tokens allowed across a page ---
24
+ general:
25
+ client: '\bclient\b|\bclient''?s\b'
26
+ firm: '\besq\.?\b|\battorney\b|\blaw\s*f(?:irm)?\b|\bllp\b|\bllc\b|\bpartner\b'
27
+ patient: '\bclient\b|\bclient''?s\b'
28
+ representative: '\brepresentative\b|\bguardian\b|\bparent\b'
29
+ attorney: '\besq\.?\b|\battorney\b|\blaw\s*f(?:irm)?\b|\bllp\b|\bllc\b|\bpartner\b'
30
+
31
+ # --- field-name hints (/T, /TU, /TM) ---
32
+ field_hints:
33
+ client: ['client', 'clientname', 'clientsignature', 'name_of_client', 'client_signature']
34
+ firm: ['attorney', 'lawyer', 'counsel', 'firm', 'partner']
35
+ patient: ['client', 'clientname', 'clientsignature', 'name_of_client', 'client_signature']
36
+ representative: ['parent', 'guardian', 'representative']
37
+ attorney: ['attorney', 'lawyer', 'counsel', 'firm', 'partner']
38
+
39
+ # --- doc-level rules ---
40
+ doc_hard:
41
+ rel_label: 'relationship\s+to\s+(patient|client)'
42
+ kin: '\b(mother|father|mom|dad|child|son|daughter|spouse|husband|wife|guardian|parent|grandmother|grandfather|grandparent|step[-\s]?parent|stepmother|stepfather|next\s+of\s+kin|conservator|custodian|caregiver)\b'
43
+ minor: '\bminor\b|patient\s+was\s+unable\s+to\s+sign|unable\s+to\s+sign'
44
+ first_person: 'I\s*,?\s*[A-Za-z.\-''\s]{1,80}\s*,?\s+authorize\b'
45
+
46
+ # --- weights ---
47
+ weights:
48
+ field: 3
49
+ page_label: 2
50
+ general: 1
51
+ doc_hint_strong: 3
52
+ doc_hint_weak: 2
53
+
54
+ # --- retainer-only helpers: used to pick the most likely signature pages ---
55
+ retain:
56
+ signature_zone: '(signature\b|signed\s*:|by\s*:|date\b)'
57
+ exclude_page: '(signature\s+certificate|pandadoc)'
58
+
59
markers:
  # Single-quoted YAML scalars keep backslashes literally, so regex escapes take
  # ONE backslash here ('\s'); '\\s' would match a backslash followed by "s".
  client: 'client\s*(printed\s*)?signature|signature\s*:\s*client'
  # '[ao]' matches "ventura" or "venturo"; '[a|o]' would also accept a literal "|".
  firm: 'law\s*firm|attorney\s*signature|quinn\s+emanuel|ventur[ao]\s+law'
@@ -0,0 +1,71 @@
1
+ bytes:
2
+ - '/DocuSign'
3
+ - '/Adobe\.PPKLite'
4
+ - '/DocTimeStamp'
5
+ - '/DSS'
6
+ - '/AcrobatSign'
7
+ - '/HelloSign'
8
+ - '/Vinesign'
9
+
10
+ text:
11
+ - 'DocuSign\s+Envelope\s+ID'
12
+ - 'Signature\s+Certificate'
13
+ - 'Electronic\s+Record\s+and\s+Signature\s+Disclosure'
14
+ - 'Adobe\s+Acrobat\s+Sign|Acrobat\s+Sign'
15
+ - 'HelloSign|Dropbox\s+Sign'
16
+ - 'Vinesign'
17
+
18
+ # LABELS: only “Signature of …” style phrases (keep narrow)
19
+ labels:
20
+ patient: '(signature\s+of\s+(the\s+)?patient|patient''?s?\s+signature|patient\s+signature)'
21
+ attorney: '(signature\s+of\s+(the\s+)?attorney|attorney''?s?\s+signature|attorney\s+signature|counsel\s+signature)'
22
+ representative: '(signature\s+of\s+(the\s+)?(authorized\s+)?representative|representative''?s?\s+signature|parent\s*(/|\s+and\s+)?\s*guardian\s+signature|guardian\s+signature|executor\s+signature|custodian\s+signature|conservator\s+signature)'
23
+
24
+ # GENERAL tokens (broader, weaker than labels)
25
+ general:
26
+ attorney: '\battorney''?s?\b'
27
+ representative: '\brepresentative\b|\bparent\b|\bguardian\b'
28
+ patient: '\b(patient|self)\b|I\s*,?\s*[A-Za-z.\-''\s]{1,60}\s*,?\s+authorize\b'
29
+
30
+ # Field-name hints used for /T, /TU, /TM
31
+ field_hints:
32
+ patient: ['patient', 'plaintiff', 'self', 'claimant']
33
+ attorney: ['attorney', 'lawyer', 'counsel']
34
+ representative: ['representative', 'rep', 'guardian', 'parent', 'executor', 'custodian', 'conservator', 'poa', 'powerofattorney']
35
+
36
+ # Hard rules used in vendor/Acro-only pseudo signatures
37
+ doc_hard:
38
+ rel_label: 'relationship\s+to\s+patient(\s*\(.*?\))?'
39
+ kin: '\b(mother|father|mom|dad|child|son|daughter|spouse|husband|wife|guardian|parent|grandmother|grandfather|grandparent|step[-\s]?parent|stepmother|stepfather|next\s+of\s+kin|conservator|custodian|caregiver)\b'
40
+ minor: '\bminor\b|patient\s+was\s+unable\s+to\s+sign|unable\s+to\s+sign'
41
+ first_person: 'I\s*,?\s*[A-Za-z.\-''\s]{1,80}\s*,?\s+authorize\b'
42
+
43
+ # Document-level hints for vendor-only pseudo (multiplicative scoring)
44
+ doc_hints_strong:
45
+ representative:
46
+ - '\bparent\s*/\s*guardian\b'
47
+ - '\brelationship\s+to\s+patient\b'
48
+ - '\bnext\s+of\s+kin\b'
49
+ - '\bminor\b'
50
+ - 'reason\s+patient\s+was\s+unable\s+to\s+sign'
51
+ - '\bauthorized\s+representative\b'
52
+ - '\b(power\s+of\s+attorney|P\.?O\.?A)\b'
53
+ - '\blegal\s+representative\b'
54
+ patient:
55
+ - '\bsigned?\s*\(\s*patient\b'
56
+ - '\bpatient''?s?\s*signature\b'
57
+ attorney:
58
+ - '(signature\s+of\s+(the\s+)?attorney|attorney''?s?\s+signature|counsel\s+signature)'
59
+ - '\besq\.?\b|\bbar\s+no\.?\b'
60
+
61
+ doc_hints_weak:
62
+ representative: ['\bguardian\b', '\brepresentative\b', '\bparent\b']
63
+ patient: ['\bpatient\b', '\bself\b']
64
+ # no weak attorney on purpose
65
+
66
+ weights:
67
+ field: 3
68
+ page_label: 2
69
+ general: 1
70
+ doc_hint_strong: 3
71
+ doc_hint_weak: 2
@@ -0,0 +1,16 @@
1
+ bytes:
2
+ - "/DocuSign"
3
+ - "/Adobe.PPKLite"
4
+ - "/DocTimeStamp"
5
+ - "/DSS"
6
+ - "/AcrobatSign"
7
+ - "/HelloSign"
8
+ - "/Vinesign"
9
+ text:
10
+ - "DocuSign\\s+Envelope\\s+ID"
11
+ - "Digitally\\s+signed\\s+by"
12
+ - "Adobe\\s+Acrobat\\s+Sign|Acrobat\\s+Sign"
13
+ - "HelloSign|Dropbox\\s+Sign"
14
+ - "Vinesign"
15
+ - "Electronic\\s+Record\\s+and\\s+Signature\\s+Disclosure"
16
+ - "Signature\\s+Certificate"
@@ -0,0 +1,55 @@
1
+ """Detector exports and factory helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Type
6
+
7
+ from .base_detector import Detector
8
+ from .file_result_model import FileResult
9
+ from .pypdf2_engine import PyPDF2Detector
10
+ from .signature_model import Signature
11
+
12
+ if TYPE_CHECKING: # pragma: no cover - typing only
13
+ from sigdetect.config import DetectConfiguration
14
+
15
+
16
# Maps engine name to detector class; starts with the PyPDF2-based engine.
ENGINE_REGISTRY: dict[str, Type[Detector]] = {PyPDF2Detector.Name: PyPDF2Detector}

# "pypdf" is accepted as an alias unless that name is already registered.
if "pypdf" not in ENGINE_REGISTRY:
    ENGINE_REGISTRY["pypdf"] = PyPDF2Detector

# The PyMuPDF engine is optional: any failure while importing or registering it
# leaves the registry as it is and sets the class name to ``None``.
try:  # pragma: no cover - optional dependency
    from .pymupdf_engine import PyMuPDFDetector  # type: ignore

    if getattr(PyMuPDFDetector, "Name", None):
        ENGINE_REGISTRY[PyMuPDFDetector.Name] = PyMuPDFDetector
except Exception:
    PyMuPDFDetector = None  # type: ignore
30
+
31
+
32
def BuildDetector(configuration: DetectConfiguration) -> Detector:
    """Create the detector selected by ``configuration``.

    The engine name is read from ``configuration.Engine``, then
    ``configuration.engine``, and defaults to the PyPDF2 engine's name. The
    registry lookup uses the lower-cased name.

    :raises ValueError: When no engine is registered under the requested name;
        the message lists the registered names.
    """

    for attributeName in ("Engine", "engine"):
        requested = getattr(configuration, attributeName, None)
        if requested:
            break
    else:
        # Neither attribute carried a usable value.
        requested = PyPDF2Detector.Name

    engineClass = ENGINE_REGISTRY.get(requested.lower())
    if engineClass is not None:
        return engineClass(configuration)

    known = ", ".join(sorted(ENGINE_REGISTRY)) or "<none>"
    raise ValueError(f"Unsupported engine '{requested}'. Available engines: {known}")
47
+
48
+
49
+ __all__ = [
50
+ "BuildDetector",
51
+ "Detector",
52
+ "ENGINE_REGISTRY",
53
+ "FileResult",
54
+ "Signature",
55
+ ]
@@ -0,0 +1,9 @@
1
+ """Compatibility module exporting detector primitives."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .base_detector import Detector
6
+ from .file_result_model import FileResult
7
+ from .signature_model import Signature
8
+
9
+ __all__ = ["Detector", "FileResult", "Signature"]
@@ -0,0 +1,22 @@
1
+ """Abstract base class for signature detection engines."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from .file_result_model import FileResult
8
+
9
+
10
class Detector:
    """Base type defining the interface that concrete engines implement."""

    # Engine name advertised by the class; the base class uses "base".
    Name: str = "base"

    def Detect(self, pdfPath: Path) -> FileResult:  # pragma: no cover
        """Analyse ``pdfPath``; the base implementation always raises.

        :raises NotImplementedError: Always, in this base class.
        """

        raise NotImplementedError()

    def detect(self, pdfPath: Path) -> FileResult:  # pragma: no cover
        """snake_case alias that forwards to :meth:`Detect` for older callers."""

        outcome = self.Detect(pdfPath)
        return outcome
@@ -0,0 +1,59 @@
1
+ """Result container returned from detection engines."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any
7
+
8
+ from .signature_model import Signature
9
+
10
+
11
+ @dataclass(slots=True)
12
+ class FileResult:
13
+ """Aggregated detection outcome for a single PDF file."""
14
+
15
+ File: str
16
+ SizeKilobytes: float | None
17
+ PageCount: int
18
+ ElectronicSignatureFound: bool
19
+ ScannedPdf: bool | None
20
+ MixedContent: bool | None
21
+ SignatureCount: int
22
+ SignaturePages: str
23
+ Roles: str
24
+ Hints: str
25
+ Signatures: list[Signature] = field(default_factory=list)
26
+
27
+ # Backwards-compatible attribute aliases
28
+ @property
29
+ def Pages(self) -> int: # pragma: no cover - simple passthrough
30
+ return self.PageCount
31
+
32
+ @property
33
+ def pages(self) -> int: # pragma: no cover - simple passthrough
34
+ return self.PageCount
35
+
36
+ @property
37
+ def sig_pages(self) -> str: # pragma: no cover - simple passthrough
38
+ return self.SignaturePages
39
+
40
+ @property
41
+ def sig_count(self) -> int: # pragma: no cover - simple passthrough
42
+ return self.SignatureCount
43
+
44
+ def to_dict(self) -> dict[str, Any]:
45
+ """Return the legacy snake_case representation used by existing clients."""
46
+
47
+ return {
48
+ "file": self.File,
49
+ "size_kb": self.SizeKilobytes,
50
+ "pages": self.PageCount,
51
+ "esign_found": self.ElectronicSignatureFound,
52
+ "scanned_pdf": self.ScannedPdf,
53
+ "mixed": self.MixedContent,
54
+ "sig_count": self.SignatureCount,
55
+ "sig_pages": self.SignaturePages,
56
+ "roles": self.Roles,
57
+ "hints": self.Hints,
58
+ "signatures": [signature.to_dict() for signature in self.Signatures],
59
+ }
File without changes