sigdetect 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sigdetect/__init__.py +24 -0
- sigdetect/api.py +139 -0
- sigdetect/cli.py +98 -0
- sigdetect/config.py +117 -0
- sigdetect/data/role_rules.retainer.yml +61 -0
- sigdetect/data/role_rules.yml +71 -0
- sigdetect/data/vendor_patterns.yml +16 -0
- sigdetect/detector/__init__.py +55 -0
- sigdetect/detector/base.py +9 -0
- sigdetect/detector/base_detector.py +22 -0
- sigdetect/detector/file_result_model.py +59 -0
- sigdetect/detector/pymupdf_engine.py +0 -0
- sigdetect/detector/pypdf2_engine.py +1114 -0
- sigdetect/detector/signature_model.py +34 -0
- sigdetect/eda.py +137 -0
- sigdetect/logging_setup.py +218 -0
- sigdetect/utils.py +152 -0
- sigdetect-0.1.0.dist-info/METADATA +394 -0
- sigdetect-0.1.0.dist-info/RECORD +22 -0
- sigdetect-0.1.0.dist-info/WHEEL +5 -0
- sigdetect-0.1.0.dist-info/entry_points.txt +2 -0
- sigdetect-0.1.0.dist-info/top_level.txt +1 -0
sigdetect/__init__.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""sigdetect – PDF e-sign detection & role attribution."""
|
|
2
|
+
|
|
3
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
4
|
+
|
|
5
|
+
# Engine name used when the caller does not select one.
DEFAULT_ENGINE = "pypdf2"

# Resolve the version from the installed distribution metadata; fall back to a
# placeholder when the "sigdetect" distribution cannot be found.
try:
    __version__ = version("sigdetect")
except PackageNotFoundError:  # pragma: no cover
    __version__ = "0.0.0"

# Best-effort suppression of pypdf's "Multiple definitions in dictionary ... key /Subtype"
# warning. Any failure here (including pypdf being unavailable) is ignored on purpose so
# that importing the package never fails because of warnings setup.
try:
    import warnings

    from pypdf.errors import PdfReadWarning

    warnings.filterwarnings(
        action="ignore",
        message=r"Multiple definitions in dictionary.*key /Subtype",
        category=PdfReadWarning,
    )
except Exception:
    pass
|
sigdetect/api.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""Public helpers for programmatic use of the signature detection engine."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, Iterable, Iterator, Literal
|
|
7
|
+
|
|
8
|
+
from sigdetect.config import DetectConfiguration
|
|
9
|
+
from sigdetect.detector import BuildDetector
|
|
10
|
+
|
|
11
|
+
EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
|
|
12
|
+
ProfileName = Literal["hipaa", "retainer"]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def DetectPdf(
    pdfPath: str | Path,
    *,
    profileName: ProfileName = "hipaa",
    engineName: EngineName = "pypdf2",
    includePseudoSignatures: bool = True,
    recurseXObjects: bool = True,
) -> dict[str, Any]:
    """Run signature detection on one PDF and return the result as a plain dictionary.

    :param pdfPath: Location of the PDF to analyse.
    :param profileName: Value passed to the configuration as ``Profile``.
    :param engineName: Value passed to the configuration as ``Engine``.
    :param includePseudoSignatures: Value passed as ``PseudoSignatures``.
    :param recurseXObjects: Value passed as ``RecurseXObjects``.
    :returns: The detector result converted by ``_ToPlainDictionary``.
    """

    targetPdf = Path(pdfPath)

    # The configuration is built in memory only: the PDF's parent directory is used as
    # the root and no output directory is configured.
    settings = {
        "PdfRoot": targetPdf.parent,
        "OutputDirectory": None,
        "Engine": engineName,
        "PseudoSignatures": includePseudoSignatures,
        "RecurseXObjects": recurseXObjects,
        "Profile": profileName,
    }
    engine = BuildDetector(DetectConfiguration(**settings))
    return _ToPlainDictionary(engine.Detect(targetPdf))
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _ToPlainDictionary(candidate: Any) -> dict[str, Any]:
    """Turn a result object into a plain ``dict``.

    Conversion hooks are tried in this order: ``to_dict()``, ``model_dump()``,
    ``dict()``, then :func:`dataclasses.asdict`; finally a real ``dict`` has each of
    its values passed through ``_ToPlainValue``.

    :param candidate: Object to convert.
    :returns: The dictionary produced by the first matching strategy.
    :raises TypeError: If no strategy applies to ``candidate``.
    """

    # First matching method wins; the call result is returned untouched.
    for hookName in ("to_dict", "model_dump", "dict"):
        if hasattr(candidate, hookName):
            return getattr(candidate, hookName)()  # type: ignore[attr-defined]

    # Dataclass handling is best effort: any failure falls through to the checks below.
    try:
        from dataclasses import asdict, is_dataclass

        if is_dataclass(candidate):
            return asdict(candidate)
    except Exception:
        pass

    if not isinstance(candidate, dict):
        raise TypeError(f"Unsupported result type: {type(candidate)!r}")
    return {key: _ToPlainValue(candidate[key]) for key in candidate}
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _ToPlainValue(value: Any) -> Any:
    """Convert a nested value into plain Python structures where a strategy is known.

    Objects with ``to_dict`` use it directly; objects with ``model_dump`` or ``dict``
    go through ``_ToPlainDictionary``; dataclasses use :func:`dataclasses.asdict`.
    Lists, tuples and dicts are rebuilt with their items converted recursively.
    Anything else is returned unchanged.

    :param value: Arbitrary value found inside a result structure.
    :returns: The converted value, or ``value`` itself when nothing applies.
    """

    if hasattr(value, "to_dict"):
        return value.to_dict()
    if any(hasattr(value, hookName) for hookName in ("model_dump", "dict")):
        return _ToPlainDictionary(value)

    # Dataclass handling is best effort: failures fall through to the container checks.
    try:
        from dataclasses import asdict, is_dataclass

        if is_dataclass(value):
            return asdict(value)
    except Exception:
        pass

    if isinstance(value, (list, tuple)):
        converted = [_ToPlainValue(item) for item in value]
        # Lists stay lists; tuples are rebuilt as plain tuples.
        return converted if isinstance(value, list) else tuple(converted)
    if isinstance(value, dict):
        return {key: _ToPlainValue(result) for key, result in value.items()}
    return value
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def DetectMany(
    pdfPaths: Iterable[str | Path],
    **kwargs: Any,
) -> Iterator[dict[str, Any]]:
    """Lazily run :func:`DetectPdf` over ``pdfPaths``, one result per path.

    :param pdfPaths: Paths to analyse, consumed in iteration order.
    :param kwargs: Keyword arguments forwarded unchanged to :func:`DetectPdf`.
    :returns: An iterator of :func:`DetectPdf` results.
    """

    yield from (DetectPdf(currentPath, **kwargs) for currentPath in pdfPaths)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def ScanDirectory(
    pdfRoot: str | Path,
    *,
    globPattern: str = "**/*.pdf",
    **kwargs: Any,
) -> Iterator[dict[str, Any]]:
    """Yield :func:`DetectPdf` output for each PDF file under ``pdfRoot`` matching the pattern.

    :param pdfRoot: Directory to search.
    :param globPattern: Glob relative to ``pdfRoot``; a leading ``**/`` selects a
        recursive search.
    :param kwargs: Keyword arguments forwarded unchanged to :func:`DetectPdf`.
    :returns: An iterator of detection results.
    """

    searchRoot = Path(pdfRoot)
    recursivePrefix = "**/"
    if globPattern.startswith(recursivePrefix):
        # Strip only the first prefix occurrence and let rglob supply the recursion.
        matches = searchRoot.rglob(globPattern.replace(recursivePrefix, "", 1))
    else:
        matches = searchRoot.glob(globPattern)

    for match in matches:
        # Only regular files whose suffix is ".pdf" (any case) are analysed.
        if not match.is_file():
            continue
        if match.suffix.lower() != ".pdf":
            continue
        yield DetectPdf(match, **kwargs)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def ToCsvRow(result: dict[str, Any]) -> dict[str, Any]:
    """Project a detection result onto the fixed set of columns used for CSV export.

    :param result: Result dictionary to read from.
    :returns: A new dictionary with exactly the export columns, in a fixed order;
        keys absent from ``result`` map to ``None``.
    """

    exportedColumns = (
        "file",
        "size_kb",
        "pages",
        "esign_found",
        "scanned_pdf",
        "mixed",
        "sig_count",
        "sig_pages",
        "roles",
        "hints",
    )
    return {column: result.get(column) for column in exportedColumns}
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def Version() -> str:
    """Return the version of the installed ``sigdetect`` distribution.

    The lookup is done with :mod:`importlib.metadata` inside the function.

    :returns: The resolved version, or ``"0.0.0-dev"`` if the lookup fails for any reason.
    """

    try:
        from importlib.metadata import version as resolveVersion

        installedVersion = resolveVersion("sigdetect")
    except Exception:
        installedVersion = "0.0.0-dev"
    return installedVersion
|
sigdetect/cli.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""Command line interface for the signature detection tool."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from dataclasses import asdict, is_dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
import typer
|
|
10
|
+
|
|
11
|
+
from . import __version__
|
|
12
|
+
from .config import LoadConfiguration
|
|
13
|
+
from .detector import BuildDetector
|
|
14
|
+
from .eda import RunExploratoryAnalysis
|
|
15
|
+
from .logging_setup import ConfigureLogging
|
|
16
|
+
|
|
17
|
+
Logger = ConfigureLogging()
|
|
18
|
+
|
|
19
|
+
CliApplication = typer.Typer(help="Signature detection & role attribution for PDFs")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _JsonSerializer(candidate):
    """Fallback for ``json.dump(s)`` (``default=``) covering values JSON cannot encode.

    Instances exposing ``to_dict`` use it, dataclass instances are expanded with
    :func:`dataclasses.asdict`, and everything else (including ``Path``) is rendered
    with ``str``.

    :param candidate: The object the JSON encoder could not serialise.
    :returns: A JSON-friendly replacement for ``candidate``.
    """

    # Classes themselves are excluded from the hooks: ``is_dataclass`` is also true for
    # dataclass *types* (which ``asdict`` rejects with TypeError), and calling an
    # unbound ``to_dict`` on a class fails the same way. They fall back to ``str``.
    if not isinstance(candidate, type):
        if hasattr(candidate, "to_dict"):
            return candidate.to_dict()
        if is_dataclass(candidate):
            return asdict(candidate)
    return str(candidate)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@CliApplication.command(name="detect")
def Detect(
    configurationPath: Path | None = typer.Option(
        None, "--config", "-c", help="Path to YAML config"
    ),
    profileOverride: str | None = typer.Option(None, "--profile", "-p", help="hipaa or retainer"),
) -> None:
    """Run detection for the configured directory and emit ``results.json``."""

    # The docstring above is left untouched because it doubles as the command's help text.
    configuration = LoadConfiguration(configurationPath)
    if profileOverride in {"hipaa", "retainer"}:
        configuration = configuration.model_copy(update={"Profile": profileOverride})
    elif profileOverride is not None:
        # An unrecognised profile is still ignored, but no longer silently.
        typer.echo(
            f"Ignoring unknown profile '{profileOverride}'; using '{configuration.Profile}'",
            err=True,
        )

    try:
        detector = BuildDetector(configuration)
    except ValueError as exc:
        Logger.error(
            "Detector initialisation failed",
            extra={"engine": configuration.Engine, "error": str(exc)},
        )
        typer.echo(str(exc), err=True)
        raise typer.Exit(code=2) from exc

    # Only the top level of PdfRoot is scanned. Sorting makes the order of the results
    # reproducible instead of depending on filesystem enumeration order.
    pdfFiles = sorted(configuration.PdfRoot.glob("*.pdf"))
    if not pdfFiles:
        raise SystemExit(f"No PDFs found in {configuration.PdfRoot}")

    results = [detector.Detect(pdfPath) for pdfPath in pdfFiles]

    # Allow configuration to suppress file output entirely (out_dir: none / SIGDETECT_OUT_DIR=none)
    if configuration.OutputDirectory is None:
        payload = json.dumps(results, indent=2, ensure_ascii=False, default=_JsonSerializer)
        typer.echo(payload)
        # The status line goes to stderr so that stdout carries nothing but the JSON payload.
        typer.echo("Detection completed with output disabled (out_dir=none)", err=True)
        return

    outputDirectory = configuration.OutputDirectory
    outputDirectory.mkdir(parents=True, exist_ok=True)

    resultsPath = outputDirectory / "results.json"
    with open(resultsPath, "w", encoding="utf-8") as handle:
        json.dump(results, handle, indent=2, ensure_ascii=False, default=_JsonSerializer)

    typer.echo(f"Wrote {resultsPath}")
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@CliApplication.command(name="eda")
def ExploratoryAnalysis(
    configurationPath: Path | None = typer.Option(
        None, "--config", "-c", help="Path to YAML config"
    ),
) -> None:
    """Generate a compact exploratory summary for the dataset."""

    # Load the configuration from the optional YAML path and pass it straight to the
    # analysis routine.
    RunExploratoryAnalysis(LoadConfiguration(configurationPath))
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@CliApplication.command(name="version")
def Version() -> None:
    """Print the installed package version."""

    # ``__version__`` is the value imported from the package root.
    typer.echo(message=__version__)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
app = CliApplication
|
sigdetect/config.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""Configuration loading utilities for the signature detection service."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Literal
|
|
8
|
+
|
|
9
|
+
import yaml
|
|
10
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
|
11
|
+
|
|
12
|
+
EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
|
|
13
|
+
ProfileName = Literal["hipaa", "retainer"]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class DetectConfiguration(BaseModel):
    """Runtime settings governing signature detection.

    The fields use PascalCase to comply with the CaseWorks standards while aliases keep
    compatibility with the existing YAML configuration keys and environment variables.
    """

    # Instances are immutable and accept either the field name or its alias as input.
    model_config = ConfigDict(populate_by_name=True, frozen=True)

    # Directory holding the PDFs to analyse.
    PdfRoot: Path = Field(default=Path("hipaa_results"), alias="pdf_root")
    # Directory for generated output; ``None`` means no output directory is configured.
    OutputDirectory: Path | None = Field(default=Path("out"), alias="out_dir")
    # Name of the detection engine.
    Engine: EngineName = Field(default="pypdf2", alias="engine")
    # Name of the rule profile.
    Profile: ProfileName = Field(default="hipaa", alias="profile")
    # Worker count, validated to lie between 1 and 64 inclusive.
    MaxWorkers: int = Field(default=8, alias="max_workers", ge=1, le=64)
    PseudoSignatures: bool = Field(default=True, alias="pseudo_signatures")
    RecurseXObjects: bool = Field(default=True, alias="recurse_xobjects")

    @field_validator("PdfRoot", "OutputDirectory", mode="before")
    @classmethod
    def _CoercePath(cls, value: str | Path | None) -> Path | None:
        """Normalise directory settings before pydantic's own validation runs.

        :param value: Raw value as supplied by the caller.
        :returns: ``None`` and ``Path`` instances unchanged; anything else wrapped in ``Path``.
        """

        if value is None or isinstance(value, Path):
            return value
        return Path(value)

    # Read-only snake_case aliases kept for callers that predate the PascalCase fields.
    @property
    def recurse_xobjects(self) -> bool:  # pragma: no cover - simple passthrough
        return self.RecurseXObjects

    @property
    def pseudo_signatures(self) -> bool:  # pragma: no cover - simple passthrough
        return self.PseudoSignatures

    @property
    def max_workers(self) -> int:  # pragma: no cover - simple passthrough
        return self.MaxWorkers

    @property
    def profile(self) -> ProfileName:  # pragma: no cover - simple passthrough
        return self.Profile

    @property
    def engine(self) -> EngineName:  # pragma: no cover - simple passthrough
        return self.Engine

    @property
    def out_dir(self) -> Path | None:  # pragma: no cover - simple passthrough
        return self.OutputDirectory

    @property
    def pdf_root(self) -> Path:  # pragma: no cover - simple passthrough
        return self.PdfRoot
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def LoadConfiguration(path: Path | None) -> DetectConfiguration:
    """Load configuration from ``path`` while applying environment overrides.

    Environment variables provide the final say and follow the existing naming:

    ``SIGDETECT_ENGINE``
        Override the PDF parsing engine.
    ``SIGDETECT_PDF_ROOT``
        Directory that will be scanned for PDF files.
    ``SIGDETECT_OUT_DIR``
        Output directory for generated artefacts. Use ``"none"`` to disable writes.
    ``SIGDETECT_PROFILE``
        Runtime profile that controls which heuristics are applied.

    The word ``none`` (any letter case) disables the output directory whether it comes
    from ``SIGDETECT_OUT_DIR`` or from the YAML file. YAML reads a bare ``none`` as an
    ordinary string, so without this step ``out_dir: none`` would name a directory.

    :param path: Optional YAML file; ignored when ``None`` or when it does not exist.
    :returns: The validated configuration. When an output directory is configured it is
        created before returning.
    :raises ValueError: If the YAML document is not a mapping.
    """

    raw_data: dict[str, object] = {}
    if path and Path(path).exists():
        with open(path, encoding="utf-8") as handle:
            loaded = yaml.safe_load(handle) or {}
        if not isinstance(loaded, dict):
            raise ValueError(
                f"Configuration file {path} must contain a YAML mapping, "
                f"not {type(loaded).__name__}"
            )
        raw_data = dict(loaded)

    env_engine = os.getenv("SIGDETECT_ENGINE")
    env_pdf_root = os.getenv("SIGDETECT_PDF_ROOT")
    env_out_dir = os.getenv("SIGDETECT_OUT_DIR")
    env_profile = os.getenv("SIGDETECT_PROFILE")

    # Unset or empty variables leave the file values in place.
    if env_engine:
        raw_data["engine"] = env_engine
    if env_pdf_root:
        raw_data["pdf_root"] = env_pdf_root
    if env_out_dir:
        raw_data["out_dir"] = env_out_dir
    # Only the two known profile names are accepted from the environment.
    if env_profile in {"hipaa", "retainer"}:
        raw_data["profile"] = env_profile

    # Map the "none" sentinel to a real None under both the alias and the field name.
    for key in ("out_dir", "OutputDirectory"):
        candidate = raw_data.get(key)
        if isinstance(candidate, str) and candidate.strip().lower() == "none":
            raw_data[key] = None

    configuration = DetectConfiguration(**raw_data)

    if configuration.OutputDirectory is not None:
        configuration.OutputDirectory.mkdir(parents=True, exist_ok=True)

    return configuration
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# --- vendor markers ---
|
|
2
|
+
bytes:
|
|
3
|
+
- '/DocuSign'
|
|
4
|
+
- '/DSS'
|
|
5
|
+
- '/DocTimeStamp'
|
|
6
|
+
- '/Adobe\.PPKLite'
|
|
7
|
+
|
|
8
|
+
text:
|
|
9
|
+
- 'DocuSign\s+Envelope\s+ID'
|
|
10
|
+
- 'Signature\s+Certificate'
|
|
11
|
+
- 'PandaDoc'
|
|
12
|
+
- 'Signed\s+with\s+PandaDoc'
|
|
13
|
+
- 'Reference\s+number'
|
|
14
|
+
|
|
15
|
+
# --- labels near signature lines/boxes ---
|
|
16
|
+
labels:
|
|
17
|
+
client: '(client\s*signature|name\s+of\s+client|client\s+print\s+name|client\s+signature:)'
|
|
18
|
+
firm: '(signature\s+of\s+(the\s+)?attorney|attorney''?s?\s+signature|counsel\s+signature|by:\s+.*(llp|llc|law|attorney|esq))'
|
|
19
|
+
patient: '(client\s*signature|name\s+of\s+client|client\s+print\s+name|client\s+signature:)'
|
|
20
|
+
representative: '(parent\s*/\s*guardian|guardian\s+signature|parent\s+signature)'
|
|
21
|
+
attorney: '(signature\s+of\s+(the\s+)?attorney|attorney''?s?\s+signature|counsel\s+signature|by:\s+.*(llp|llc|law|attorney|esq))'
|
|
22
|
+
|
|
23
|
+
# --- general tokens allowed across a page ---
|
|
24
|
+
general:
|
|
25
|
+
client: '\bclient\b|\bclient''?s\b'
|
|
26
|
+
firm: '\besq\.?\b|\battorney\b|\blaw\s*f(?:irm)?\b|\bllp\b|\bllc\b|\bpartner\b'
|
|
27
|
+
patient: '\bclient\b|\bclient''?s\b'
|
|
28
|
+
representative: '\brepresentative\b|\bguardian\b|\bparent\b'
|
|
29
|
+
attorney: '\besq\.?\b|\battorney\b|\blaw\s*f(?:irm)?\b|\bllp\b|\bllc\b|\bpartner\b'
|
|
30
|
+
|
|
31
|
+
# --- field-name hints (/T, /TU, /TM) ---
|
|
32
|
+
field_hints:
|
|
33
|
+
client: ['client', 'clientname', 'clientsignature', 'name_of_client', 'client_signature']
|
|
34
|
+
firm: ['attorney', 'lawyer', 'counsel', 'firm', 'partner']
|
|
35
|
+
patient: ['client', 'clientname', 'clientsignature', 'name_of_client', 'client_signature']
|
|
36
|
+
representative: ['parent', 'guardian', 'representative']
|
|
37
|
+
attorney: ['attorney', 'lawyer', 'counsel', 'firm', 'partner']
|
|
38
|
+
|
|
39
|
+
# --- doc-level rules ---
|
|
40
|
+
doc_hard:
|
|
41
|
+
rel_label: 'relationship\s+to\s+(patient|client)'
|
|
42
|
+
kin: '\b(mother|father|mom|dad|child|son|daughter|spouse|husband|wife|guardian|parent|grandmother|grandfather|grandparent|step[-\s]?parent|stepmother|stepfather|next\s+of\s+kin|conservator|custodian|caregiver)\b'
|
|
43
|
+
minor: '\bminor\b|patient\s+was\s+unable\s+to\s+sign|unable\s+to\s+sign'
|
|
44
|
+
first_person: 'I\s*,?\s*[A-Za-z.\-''\s]{1,80}\s*,?\s+authorize\b'
|
|
45
|
+
|
|
46
|
+
# --- weights ---
|
|
47
|
+
weights:
|
|
48
|
+
field: 3
|
|
49
|
+
page_label: 2
|
|
50
|
+
general: 1
|
|
51
|
+
doc_hint_strong: 3
|
|
52
|
+
doc_hint_weak: 2
|
|
53
|
+
|
|
54
|
+
# --- retainer-only helpers: used to pick the most likely signature pages ---
|
|
55
|
+
retain:
|
|
56
|
+
signature_zone: '(signature\b|signed\s*:|by\s*:|date\b)'
|
|
57
|
+
exclude_page: '(signature\s+certificate|pandadoc)'
|
|
58
|
+
|
|
59
|
+
markers:
  # Single-quoted YAML scalars keep backslashes literally, so regex escapes take ONE
  # backslash here; the doubled form would match a literal backslash, not whitespace.
  # NOTE(review): assumes these values are compiled as regular expressions like the
  # other sections of this file - confirm against the engine that reads them.
  client: 'client\s*(printed\s*)?signature|signature\s*:\s*client'
  firm: 'law\s*firm|attorney\s*signature|quinn\s+emanuel|ventur[ao]\s+law'
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
bytes:
|
|
2
|
+
- '/DocuSign'
|
|
3
|
+
- '/Adobe\.PPKLite'
|
|
4
|
+
- '/DocTimeStamp'
|
|
5
|
+
- '/DSS'
|
|
6
|
+
- '/AcrobatSign'
|
|
7
|
+
- '/HelloSign'
|
|
8
|
+
- '/Vinesign'
|
|
9
|
+
|
|
10
|
+
text:
|
|
11
|
+
- 'DocuSign\s+Envelope\s+ID'
|
|
12
|
+
- 'Signature\s+Certificate'
|
|
13
|
+
- 'Electronic\s+Record\s+and\s+Signature\s+Disclosure'
|
|
14
|
+
- 'Adobe\s+Acrobat\s+Sign|Acrobat\s+Sign'
|
|
15
|
+
- 'HelloSign|Dropbox\s+Sign'
|
|
16
|
+
- 'Vinesign'
|
|
17
|
+
|
|
18
|
+
# LABELS: only “Signature of …” style phrases (keep narrow)
|
|
19
|
+
labels:
|
|
20
|
+
patient: '(signature\s+of\s+(the\s+)?patient|patient''?s?\s+signature|patient\s+signature)'
|
|
21
|
+
attorney: '(signature\s+of\s+(the\s+)?attorney|attorney''?s?\s+signature|attorney\s+signature|counsel\s+signature)'
|
|
22
|
+
representative: '(signature\s+of\s+(the\s+)?(authorized\s+)?representative|representative''?s?\s+signature|parent\s*(/|\s+and\s+)?\s*guardian\s+signature|guardian\s+signature|executor\s+signature|custodian\s+signature|conservator\s+signature)'
|
|
23
|
+
|
|
24
|
+
# GENERAL tokens (broader, weaker than labels)
|
|
25
|
+
general:
|
|
26
|
+
attorney: '\battorney''?s?\b'
|
|
27
|
+
representative: '\brepresentative\b|\bparent\b|\bguardian\b'
|
|
28
|
+
patient: '\b(patient|self)\b|I\s*,?\s*[A-Za-z.\-''\s]{1,60}\s*,?\s+authorize\b'
|
|
29
|
+
|
|
30
|
+
# Field-name hints used for /T, /TU, /TM
|
|
31
|
+
field_hints:
|
|
32
|
+
patient: ['patient', 'plaintiff', 'self', 'claimant']
|
|
33
|
+
attorney: ['attorney', 'lawyer', 'counsel']
|
|
34
|
+
representative: ['representative', 'rep', 'guardian', 'parent', 'executor', 'custodian', 'conservator', 'poa', 'powerofattorney']
|
|
35
|
+
|
|
36
|
+
# Hard rules used in vendor/Acro-only pseudo signatures
|
|
37
|
+
doc_hard:
|
|
38
|
+
rel_label: 'relationship\s+to\s+patient(\s*\(.*?\))?'
|
|
39
|
+
kin: '\b(mother|father|mom|dad|child|son|daughter|spouse|husband|wife|guardian|parent|grandmother|grandfather|grandparent|step[-\s]?parent|stepmother|stepfather|next\s+of\s+kin|conservator|custodian|caregiver)\b'
|
|
40
|
+
minor: '\bminor\b|patient\s+was\s+unable\s+to\s+sign|unable\s+to\s+sign'
|
|
41
|
+
first_person: 'I\s*,?\s*[A-Za-z.\-''\s]{1,80}\s*,?\s+authorize\b'
|
|
42
|
+
|
|
43
|
+
# Document-level hints for vendor-only pseudo (multiplicative scoring)
|
|
44
|
+
doc_hints_strong:
|
|
45
|
+
representative:
|
|
46
|
+
- '\bparent\s*/\s*guardian\b'
|
|
47
|
+
- '\brelationship\s+to\s+patient\b'
|
|
48
|
+
- '\bnext\s+of\s+kin\b'
|
|
49
|
+
- '\bminor\b'
|
|
50
|
+
- 'reason\s+patient\s+was\s+unable\s+to\s+sign'
|
|
51
|
+
- '\bauthorized\s+representative\b'
|
|
52
|
+
- '\b(power\s+of\s+attorney|P\.?O\.?A)\b'
|
|
53
|
+
- '\blegal\s+representative\b'
|
|
54
|
+
patient:
|
|
55
|
+
- '\bsigned?\s*\(\s*patient\b'
|
|
56
|
+
- '\bpatient''?s?\s*signature\b'
|
|
57
|
+
attorney:
|
|
58
|
+
- '(signature\s+of\s+(the\s+)?attorney|attorney''?s?\s+signature|counsel\s+signature)'
|
|
59
|
+
- '\besq\.?\b|\bbar\s+no\.?\b'
|
|
60
|
+
|
|
61
|
+
doc_hints_weak:
|
|
62
|
+
representative: ['\bguardian\b', '\brepresentative\b', '\bparent\b']
|
|
63
|
+
patient: ['\bpatient\b', '\bself\b']
|
|
64
|
+
# no weak attorney on purpose
|
|
65
|
+
|
|
66
|
+
weights:
|
|
67
|
+
field: 3
|
|
68
|
+
page_label: 2
|
|
69
|
+
general: 1
|
|
70
|
+
doc_hint_strong: 3
|
|
71
|
+
doc_hint_weak: 2
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
bytes:
|
|
2
|
+
- "/DocuSign"
|
|
3
|
+
- "/Adobe.PPKLite"
|
|
4
|
+
- "/DocTimeStamp"
|
|
5
|
+
- "/DSS"
|
|
6
|
+
- "/AcrobatSign"
|
|
7
|
+
- "/HelloSign"
|
|
8
|
+
- "/Vinesign"
|
|
9
|
+
text:
|
|
10
|
+
- "DocuSign\\s+Envelope\\s+ID"
|
|
11
|
+
- "Digitally\\s+signed\\s+by"
|
|
12
|
+
- "Adobe\\s+Acrobat\\s+Sign|Acrobat\\s+Sign"
|
|
13
|
+
- "HelloSign|Dropbox\\s+Sign"
|
|
14
|
+
- "Vinesign"
|
|
15
|
+
- "Electronic\\s+Record\\s+and\\s+Signature\\s+Disclosure"
|
|
16
|
+
- "Signature\\s+Certificate"
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""Detector exports and factory helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING, Type
|
|
6
|
+
|
|
7
|
+
from .base_detector import Detector
|
|
8
|
+
from .file_result_model import FileResult
|
|
9
|
+
from .pypdf2_engine import PyPDF2Detector
|
|
10
|
+
from .signature_model import Signature
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING: # pragma: no cover - typing only
|
|
13
|
+
from sigdetect.config import DetectConfiguration
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Maps engine names to detector classes; BuildDetector looks engines up here.
ENGINE_REGISTRY: dict[str, Type[Detector]] = {}
ENGINE_REGISTRY[PyPDF2Detector.Name] = PyPDF2Detector

# "pypdf" is accepted as another name for the same detector, unless that key is
# already taken by the detector's own Name.
if "pypdf" not in ENGINE_REGISTRY:
    ENGINE_REGISTRY["pypdf"] = PyPDF2Detector

# The PyMuPDF engine is optional: it is registered only when the import succeeds and the
# class declares a non-empty Name. On any failure the symbol is bound to None.
try:  # pragma: no cover - optional dependency
    from .pymupdf_engine import PyMuPDFDetector  # type: ignore

    if getattr(PyMuPDFDetector, "Name", None):
        ENGINE_REGISTRY[PyMuPDFDetector.Name] = PyMuPDFDetector
except Exception:
    PyMuPDFDetector = None  # type: ignore
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def BuildDetector(configuration: DetectConfiguration) -> Detector:
    """Create the detector selected by ``configuration``.

    The engine name is read from ``configuration.Engine``, then ``configuration.engine``,
    and defaults to ``PyPDF2Detector.Name``. It is lower-cased before the registry lookup.

    :param configuration: Settings object; also passed to the detector constructor.
    :returns: A new instance of the registered detector class.
    :raises ValueError: If no engine is registered under the requested name.
    """

    requested = None
    for attributeName in ("Engine", "engine"):
        requested = getattr(configuration, attributeName, None)
        if requested:
            break
    engine_name = requested or PyPDF2Detector.Name

    detector_cls = ENGINE_REGISTRY.get(engine_name.lower())
    if detector_cls is not None:
        return detector_cls(configuration)

    # The error lists the registered engines to make a bad name easy to spot.
    available = ", ".join(sorted(ENGINE_REGISTRY)) or "<none>"
    raise ValueError(f"Unsupported engine '{engine_name}'. Available engines: {available}")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
__all__ = [
|
|
50
|
+
"BuildDetector",
|
|
51
|
+
"Detector",
|
|
52
|
+
"ENGINE_REGISTRY",
|
|
53
|
+
"FileResult",
|
|
54
|
+
"Signature",
|
|
55
|
+
]
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""Compatibility module exporting detector primitives."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .base_detector import Detector
|
|
6
|
+
from .file_result_model import FileResult
|
|
7
|
+
from .signature_model import Signature
|
|
8
|
+
|
|
9
|
+
__all__ = ["Detector", "FileResult", "Signature"]
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Abstract base class for signature detection engines."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from .file_result_model import FileResult
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Detector:
    """Base class for detection engines; subclasses override :meth:`Detect`."""

    # Engine identifier; the base class uses the placeholder "base".
    Name: str = "base"

    def Detect(self, pdfPath: Path) -> FileResult:  # pragma: no cover
        """Analyse ``pdfPath``; the base implementation always raises.

        :param pdfPath: PDF file to analyse.
        :raises NotImplementedError: Always, in this base class.
        """

        raise NotImplementedError()

    def detect(self, pdfPath: Path) -> FileResult:  # pragma: no cover
        """Snake_case alias that forwards to :meth:`Detect`."""

        outcome = self.Detect(pdfPath)
        return outcome
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Result container returned from detection engines."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from .signature_model import Signature
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(slots=True)
|
|
12
|
+
class FileResult:
|
|
13
|
+
"""Aggregated detection outcome for a single PDF file."""
|
|
14
|
+
|
|
15
|
+
File: str
|
|
16
|
+
SizeKilobytes: float | None
|
|
17
|
+
PageCount: int
|
|
18
|
+
ElectronicSignatureFound: bool
|
|
19
|
+
ScannedPdf: bool | None
|
|
20
|
+
MixedContent: bool | None
|
|
21
|
+
SignatureCount: int
|
|
22
|
+
SignaturePages: str
|
|
23
|
+
Roles: str
|
|
24
|
+
Hints: str
|
|
25
|
+
Signatures: list[Signature] = field(default_factory=list)
|
|
26
|
+
|
|
27
|
+
# Backwards-compatible attribute aliases
|
|
28
|
+
@property
|
|
29
|
+
def Pages(self) -> int: # pragma: no cover - simple passthrough
|
|
30
|
+
return self.PageCount
|
|
31
|
+
|
|
32
|
+
@property
|
|
33
|
+
def pages(self) -> int: # pragma: no cover - simple passthrough
|
|
34
|
+
return self.PageCount
|
|
35
|
+
|
|
36
|
+
@property
|
|
37
|
+
def sig_pages(self) -> str: # pragma: no cover - simple passthrough
|
|
38
|
+
return self.SignaturePages
|
|
39
|
+
|
|
40
|
+
@property
|
|
41
|
+
def sig_count(self) -> int: # pragma: no cover - simple passthrough
|
|
42
|
+
return self.SignatureCount
|
|
43
|
+
|
|
44
|
+
def to_dict(self) -> dict[str, Any]:
|
|
45
|
+
"""Return the legacy snake_case representation used by existing clients."""
|
|
46
|
+
|
|
47
|
+
return {
|
|
48
|
+
"file": self.File,
|
|
49
|
+
"size_kb": self.SizeKilobytes,
|
|
50
|
+
"pages": self.PageCount,
|
|
51
|
+
"esign_found": self.ElectronicSignatureFound,
|
|
52
|
+
"scanned_pdf": self.ScannedPdf,
|
|
53
|
+
"mixed": self.MixedContent,
|
|
54
|
+
"sig_count": self.SignatureCount,
|
|
55
|
+
"sig_pages": self.SignaturePages,
|
|
56
|
+
"roles": self.Roles,
|
|
57
|
+
"hints": self.Hints,
|
|
58
|
+
"signatures": [signature.to_dict() for signature in self.Signatures],
|
|
59
|
+
}
|
|
File without changes
|