sigdetect 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sigdetect/__init__.py +1 -1
- sigdetect/api.py +7 -5
- sigdetect/cli.py +37 -0
- sigdetect/config.py +43 -3
- sigdetect/cropping.py +7 -3
- sigdetect/detector/__init__.py +18 -1
- sigdetect/detector/pymupdf_engine.py +1 -0
- sigdetect/detector/pypdf2_engine.py +7 -5
- sigdetect/detector/signature_model.py +1 -1
- sigdetect/wet_detection.py +499 -0
- {sigdetect-0.3.1.dist-info → sigdetect-0.4.0.dist-info}/METADATA +12 -18
- sigdetect-0.4.0.dist-info/RECORD +24 -0
- sigdetect-0.3.1.dist-info/RECORD +0 -23
- {sigdetect-0.3.1.dist-info → sigdetect-0.4.0.dist-info}/WHEEL +0 -0
- {sigdetect-0.3.1.dist-info → sigdetect-0.4.0.dist-info}/entry_points.txt +0 -0
- {sigdetect-0.3.1.dist-info → sigdetect-0.4.0.dist-info}/top_level.txt +0 -0
sigdetect/__init__.py
CHANGED
sigdetect/api.py
CHANGED
|
@@ -10,7 +10,7 @@ from sigdetect.config import DetectConfiguration
|
|
|
10
10
|
from sigdetect.cropping import SignatureCrop
|
|
11
11
|
from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
|
|
12
12
|
|
|
13
|
-
EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
|
|
13
|
+
EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
|
|
14
14
|
ProfileName = Literal["hipaa", "retainer"]
|
|
15
15
|
|
|
16
16
|
|
|
@@ -18,7 +18,7 @@ def DetectPdf(
|
|
|
18
18
|
pdfPath: str | Path,
|
|
19
19
|
*,
|
|
20
20
|
profileName: ProfileName = "hipaa",
|
|
21
|
-
engineName: EngineName = "
|
|
21
|
+
engineName: EngineName = "auto",
|
|
22
22
|
includePseudoSignatures: bool = True,
|
|
23
23
|
recurseXObjects: bool = True,
|
|
24
24
|
detector: Detector | None = None,
|
|
@@ -43,7 +43,7 @@ def get_detector(
|
|
|
43
43
|
*,
|
|
44
44
|
pdfRoot: str | Path | None = None,
|
|
45
45
|
profileName: ProfileName = "hipaa",
|
|
46
|
-
engineName: EngineName = "
|
|
46
|
+
engineName: EngineName = "auto",
|
|
47
47
|
includePseudoSignatures: bool = True,
|
|
48
48
|
recurseXObjects: bool = True,
|
|
49
49
|
outputDirectory: str | Path | None = None,
|
|
@@ -201,7 +201,8 @@ def CropSignatureImages(
|
|
|
201
201
|
dpi: int = 200,
|
|
202
202
|
returnBytes: Literal[False] = False,
|
|
203
203
|
saveToDisk: bool = True,
|
|
204
|
-
) -> list[Path]:
|
|
204
|
+
) -> list[Path]:
|
|
205
|
+
...
|
|
205
206
|
|
|
206
207
|
|
|
207
208
|
@overload
|
|
@@ -213,7 +214,8 @@ def CropSignatureImages(
|
|
|
213
214
|
dpi: int,
|
|
214
215
|
returnBytes: Literal[True],
|
|
215
216
|
saveToDisk: bool,
|
|
216
|
-
) -> list[SignatureCrop]:
|
|
217
|
+
) -> list[SignatureCrop]:
|
|
218
|
+
...
|
|
217
219
|
|
|
218
220
|
|
|
219
221
|
def CropSignatureImages(
|
sigdetect/cli.py
CHANGED
|
@@ -15,6 +15,7 @@ from .cropping import SignatureCroppingUnavailable, crop_signatures
|
|
|
15
15
|
from .detector import BuildDetector, FileResult
|
|
16
16
|
from .eda import RunExploratoryAnalysis
|
|
17
17
|
from .logging_setup import ConfigureLogging
|
|
18
|
+
from .wet_detection import apply_wet_detection
|
|
18
19
|
|
|
19
20
|
Logger = ConfigureLogging()
|
|
20
21
|
|
|
@@ -72,6 +73,33 @@ def Detect(
|
|
|
72
73
|
help="Rendering DPI for signature crops",
|
|
73
74
|
show_default=False,
|
|
74
75
|
),
|
|
76
|
+
detectWetSignatures: bool | None = typer.Option(
|
|
77
|
+
None,
|
|
78
|
+
"--detect-wet/--no-detect-wet",
|
|
79
|
+
help="Run OCR-backed wet signature detection (requires PyMuPDF + Tesseract)",
|
|
80
|
+
show_default=False,
|
|
81
|
+
),
|
|
82
|
+
wetOcrDpi: int | None = typer.Option(
|
|
83
|
+
None,
|
|
84
|
+
"--wet-ocr-dpi",
|
|
85
|
+
min=72,
|
|
86
|
+
max=600,
|
|
87
|
+
help="Rendering DPI for OCR pages (wet detection)",
|
|
88
|
+
show_default=False,
|
|
89
|
+
),
|
|
90
|
+
wetOcrLanguages: str | None = typer.Option(
|
|
91
|
+
None,
|
|
92
|
+
"--wet-ocr-languages",
|
|
93
|
+
help="Tesseract language packs for OCR (e.g., 'eng' or 'eng+spa')",
|
|
94
|
+
),
|
|
95
|
+
wetPrecisionThreshold: float | None = typer.Option(
|
|
96
|
+
None,
|
|
97
|
+
"--wet-precision-threshold",
|
|
98
|
+
min=0.0,
|
|
99
|
+
max=1.0,
|
|
100
|
+
help="Minimum wet-signature confidence (0-1) to accept a candidate",
|
|
101
|
+
show_default=False,
|
|
102
|
+
),
|
|
75
103
|
) -> None:
|
|
76
104
|
"""Run detection for the configured directory and emit ``results.json``."""
|
|
77
105
|
|
|
@@ -89,6 +117,14 @@ def Detect(
|
|
|
89
117
|
overrides["CropOutputDirectory"] = cropDirectory
|
|
90
118
|
if cropDpi is not None:
|
|
91
119
|
overrides["CropImageDpi"] = cropDpi
|
|
120
|
+
if detectWetSignatures is not None:
|
|
121
|
+
overrides["DetectWetSignatures"] = detectWetSignatures
|
|
122
|
+
if wetOcrDpi is not None:
|
|
123
|
+
overrides["WetOcrDpi"] = wetOcrDpi
|
|
124
|
+
if wetOcrLanguages is not None:
|
|
125
|
+
overrides["WetOcrLanguages"] = wetOcrLanguages
|
|
126
|
+
if wetPrecisionThreshold is not None:
|
|
127
|
+
overrides["WetPrecisionThreshold"] = wetPrecisionThreshold
|
|
92
128
|
if overrides:
|
|
93
129
|
configuration = configuration.model_copy(update=overrides)
|
|
94
130
|
configuration = FinalizeConfiguration(configuration)
|
|
@@ -182,6 +218,7 @@ def Detect(
|
|
|
182
218
|
|
|
183
219
|
def _process(pdf_path: Path) -> None:
|
|
184
220
|
file_result = detector.Detect(pdf_path)
|
|
221
|
+
apply_wet_detection(pdf_path, configuration, file_result, logger=Logger)
|
|
185
222
|
_append_result(file_result, pdf_path)
|
|
186
223
|
|
|
187
224
|
try:
|
sigdetect/config.py
CHANGED
|
@@ -10,7 +10,7 @@ from typing import Literal
|
|
|
10
10
|
import yaml
|
|
11
11
|
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
|
12
12
|
|
|
13
|
-
EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
|
|
13
|
+
EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
|
|
14
14
|
ProfileName = Literal["hipaa", "retainer"]
|
|
15
15
|
|
|
16
16
|
|
|
@@ -25,13 +25,19 @@ class DetectConfiguration(BaseModel):
|
|
|
25
25
|
|
|
26
26
|
PdfRoot: Path = Field(default=Path("hipaa_results"), alias="pdf_root")
|
|
27
27
|
OutputDirectory: Path | None = Field(default=Path("out"), alias="out_dir")
|
|
28
|
-
Engine: EngineName = Field(default="
|
|
28
|
+
Engine: EngineName = Field(default="auto", alias="engine")
|
|
29
29
|
Profile: ProfileName = Field(default="hipaa", alias="profile")
|
|
30
30
|
PseudoSignatures: bool = Field(default=True, alias="pseudo_signatures")
|
|
31
31
|
RecurseXObjects: bool = Field(default=True, alias="recurse_xobjects")
|
|
32
|
-
CropSignatures: bool = Field(default=
|
|
32
|
+
CropSignatures: bool = Field(default=True, alias="crop_signatures")
|
|
33
33
|
CropOutputDirectory: Path | None = Field(default=None, alias="crop_output_dir")
|
|
34
34
|
CropImageDpi: int = Field(default=200, alias="crop_image_dpi", ge=72, le=600)
|
|
35
|
+
DetectWetSignatures: bool = Field(default=True, alias="detect_wet_signatures")
|
|
36
|
+
WetOcrDpi: int = Field(default=200, alias="wet_ocr_dpi", ge=72, le=600)
|
|
37
|
+
WetOcrLanguages: str = Field(default="eng", alias="wet_ocr_languages")
|
|
38
|
+
WetPrecisionThreshold: float = Field(
|
|
39
|
+
default=0.82, alias="wet_precision_threshold", ge=0.0, le=1.0
|
|
40
|
+
)
|
|
35
41
|
|
|
36
42
|
@field_validator("PdfRoot", "OutputDirectory", "CropOutputDirectory", mode="before")
|
|
37
43
|
@classmethod
|
|
@@ -85,6 +91,22 @@ class DetectConfiguration(BaseModel):
|
|
|
85
91
|
def crop_image_dpi(self) -> int: # pragma: no cover - simple passthrough
|
|
86
92
|
return self.CropImageDpi
|
|
87
93
|
|
|
94
|
+
@property
|
|
95
|
+
def detect_wet_signatures(self) -> bool: # pragma: no cover - simple passthrough
|
|
96
|
+
return self.DetectWetSignatures
|
|
97
|
+
|
|
98
|
+
@property
|
|
99
|
+
def wet_ocr_dpi(self) -> int: # pragma: no cover - simple passthrough
|
|
100
|
+
return self.WetOcrDpi
|
|
101
|
+
|
|
102
|
+
@property
|
|
103
|
+
def wet_ocr_languages(self) -> str: # pragma: no cover - simple passthrough
|
|
104
|
+
return self.WetOcrLanguages
|
|
105
|
+
|
|
106
|
+
@property
|
|
107
|
+
def wet_precision_threshold(self) -> float: # pragma: no cover - simple passthrough
|
|
108
|
+
return self.WetPrecisionThreshold
|
|
109
|
+
|
|
88
110
|
|
|
89
111
|
def LoadConfiguration(path: Path | None) -> DetectConfiguration:
|
|
90
112
|
"""Load configuration from ``path`` while applying environment overrides.
|
|
@@ -108,6 +130,10 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
|
|
|
108
130
|
env_crop = os.getenv("SIGDETECT_CROP_SIGNATURES")
|
|
109
131
|
env_crop_dir = os.getenv("SIGDETECT_CROP_DIR")
|
|
110
132
|
env_crop_dpi = os.getenv("SIGDETECT_CROP_DPI")
|
|
133
|
+
env_detect_wet = os.getenv("SIGDETECT_DETECT_WET")
|
|
134
|
+
env_wet_dpi = os.getenv("SIGDETECT_WET_OCR_DPI")
|
|
135
|
+
env_wet_lang = os.getenv("SIGDETECT_WET_LANGUAGES")
|
|
136
|
+
env_wet_precision = os.getenv("SIGDETECT_WET_PRECISION")
|
|
111
137
|
|
|
112
138
|
raw_data: dict[str, object] = {}
|
|
113
139
|
if path and Path(path).exists():
|
|
@@ -133,6 +159,20 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
|
|
|
133
159
|
if env_crop_dpi:
|
|
134
160
|
with suppress(ValueError):
|
|
135
161
|
raw_data["crop_image_dpi"] = int(env_crop_dpi)
|
|
162
|
+
if env_detect_wet is not None:
|
|
163
|
+
lowered = env_detect_wet.lower()
|
|
164
|
+
if lowered in {"1", "true", "yes", "on"}:
|
|
165
|
+
raw_data["detect_wet_signatures"] = True
|
|
166
|
+
elif lowered in {"0", "false", "no", "off"}:
|
|
167
|
+
raw_data["detect_wet_signatures"] = False
|
|
168
|
+
if env_wet_dpi:
|
|
169
|
+
with suppress(ValueError):
|
|
170
|
+
raw_data["wet_ocr_dpi"] = int(env_wet_dpi)
|
|
171
|
+
if env_wet_lang:
|
|
172
|
+
raw_data["wet_ocr_languages"] = env_wet_lang
|
|
173
|
+
if env_wet_precision:
|
|
174
|
+
with suppress(ValueError):
|
|
175
|
+
raw_data["wet_precision_threshold"] = float(env_wet_precision)
|
|
136
176
|
|
|
137
177
|
configuration = DetectConfiguration(**raw_data)
|
|
138
178
|
return FinalizeConfiguration(configuration)
|
sigdetect/cropping.py
CHANGED
|
@@ -40,7 +40,9 @@ def crop_signatures(
|
|
|
40
40
|
dpi: int = 200,
|
|
41
41
|
logger: logging.Logger | None = None,
|
|
42
42
|
return_bytes: Literal[False] = False,
|
|
43
|
-
|
|
43
|
+
save_files: bool = True,
|
|
44
|
+
) -> list[Path]:
|
|
45
|
+
...
|
|
44
46
|
|
|
45
47
|
|
|
46
48
|
@overload
|
|
@@ -51,8 +53,10 @@ def crop_signatures(
|
|
|
51
53
|
output_dir: Path,
|
|
52
54
|
dpi: int = 200,
|
|
53
55
|
logger: logging.Logger | None = None,
|
|
54
|
-
return_bytes: Literal[True]
|
|
55
|
-
|
|
56
|
+
return_bytes: Literal[True],
|
|
57
|
+
save_files: bool = True,
|
|
58
|
+
) -> list[SignatureCrop]:
|
|
59
|
+
...
|
|
56
60
|
|
|
57
61
|
|
|
58
62
|
def crop_signatures(
|
sigdetect/detector/__init__.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import warnings
|
|
5
6
|
from typing import TYPE_CHECKING, Type
|
|
6
7
|
|
|
7
8
|
from .base_detector import Detector
|
|
@@ -37,7 +38,23 @@ def BuildDetector(configuration: DetectConfiguration) -> Detector:
|
|
|
37
38
|
or getattr(configuration, "engine", None)
|
|
38
39
|
or PyPDF2Detector.Name
|
|
39
40
|
)
|
|
40
|
-
normalized = engine_name.lower()
|
|
41
|
+
normalized = str(engine_name).lower()
|
|
42
|
+
|
|
43
|
+
if normalized == "auto":
|
|
44
|
+
detector_cls: Type[Detector] | None = None
|
|
45
|
+
if PyMuPDFDetector is not None:
|
|
46
|
+
detector_cls = ENGINE_REGISTRY.get(getattr(PyMuPDFDetector, "Name", "")) or PyMuPDFDetector
|
|
47
|
+
if detector_cls is None:
|
|
48
|
+
detector_cls = ENGINE_REGISTRY.get(PyPDF2Detector.Name) or ENGINE_REGISTRY.get("pypdf")
|
|
49
|
+
warnings.warn(
|
|
50
|
+
"Engine 'auto' falling back to 'pypdf2' because PyMuPDF is unavailable",
|
|
51
|
+
RuntimeWarning,
|
|
52
|
+
stacklevel=2,
|
|
53
|
+
)
|
|
54
|
+
if detector_cls is None:
|
|
55
|
+
available = ", ".join(sorted(ENGINE_REGISTRY)) or "<none>"
|
|
56
|
+
raise ValueError(f"No available detector engines. Available engines: {available}")
|
|
57
|
+
return detector_cls(configuration)
|
|
41
58
|
|
|
42
59
|
detector_cls = ENGINE_REGISTRY.get(normalized)
|
|
43
60
|
if detector_cls is None:
|
|
@@ -111,6 +111,7 @@ class PyMuPDFDetector(PyPDF2Detector):
|
|
|
111
111
|
rect, exclusion, mode = rect_info
|
|
112
112
|
padded = self._PadRect(rect, page.rect, signature.Role, exclusion, mode)
|
|
113
113
|
signature.BoundingBox = self._RectToPdfTuple(padded, page.rect.height)
|
|
114
|
+
signature.RenderType = "drawn"
|
|
114
115
|
if signature.Page is None:
|
|
115
116
|
signature.Page = page_index + 1
|
|
116
117
|
break
|
|
@@ -348,7 +348,7 @@ class PyPDF2Detector(Detector):
|
|
|
348
348
|
return normalized.lower().startswith("im")
|
|
349
349
|
|
|
350
350
|
def _ClassifyAppearance(self, widget: generic.DictionaryObject, page) -> str:
|
|
351
|
-
"""Classify the widget's appearance as drawn
|
|
351
|
+
"""Classify the widget's appearance as drawn or typed."""
|
|
352
352
|
|
|
353
353
|
ap_dict = AsDictionary(widget.get("/AP"))
|
|
354
354
|
if not isinstance(ap_dict, generic.DictionaryObject):
|
|
@@ -356,7 +356,7 @@ class PyPDF2Detector(Detector):
|
|
|
356
356
|
normal = ap_dict.get("/N")
|
|
357
357
|
streams = self._ExtractAppearanceStreams(normal)
|
|
358
358
|
if not streams:
|
|
359
|
-
return "
|
|
359
|
+
return "typed"
|
|
360
360
|
|
|
361
361
|
has_text = False
|
|
362
362
|
has_vector = False
|
|
@@ -384,13 +384,11 @@ class PyPDF2Detector(Detector):
|
|
|
384
384
|
has_image = True
|
|
385
385
|
break
|
|
386
386
|
|
|
387
|
-
if has_image and (has_text or has_vector):
|
|
388
|
-
return "hybrid"
|
|
389
387
|
if has_image:
|
|
390
388
|
return "drawn"
|
|
391
389
|
if has_text or has_vector:
|
|
392
390
|
return "typed"
|
|
393
|
-
return "
|
|
391
|
+
return "typed"
|
|
394
392
|
|
|
395
393
|
# ---- file-wide stream scan (compressed or not)
|
|
396
394
|
def _ScanFileStreamsForVendors(self, file_bytes: bytes) -> tuple[set[str], str]:
|
|
@@ -863,6 +861,7 @@ class PyPDF2Detector(Detector):
|
|
|
863
861
|
Scores={r: sc},
|
|
864
862
|
Evidence=ev + ["pseudo:true"],
|
|
865
863
|
Hint="VendorOrAcroOnly",
|
|
864
|
+
RenderType="typed",
|
|
866
865
|
)
|
|
867
866
|
)
|
|
868
867
|
|
|
@@ -903,6 +902,7 @@ class PyPDF2Detector(Detector):
|
|
|
903
902
|
Scores={role: score} if score > 0 else {},
|
|
904
903
|
Evidence=ev + ["pseudo:true"],
|
|
905
904
|
Hint="VendorOrAcroOnly",
|
|
905
|
+
RenderType="typed",
|
|
906
906
|
)
|
|
907
907
|
)
|
|
908
908
|
|
|
@@ -1055,6 +1055,7 @@ class PyPDF2Detector(Detector):
|
|
|
1055
1055
|
Scores=scores,
|
|
1056
1056
|
Evidence=evidence,
|
|
1057
1057
|
Hint=f"AcroSig:{fname}" if fname else "AcroSig",
|
|
1058
|
+
RenderType="typed",
|
|
1058
1059
|
)
|
|
1059
1060
|
)
|
|
1060
1061
|
|
|
@@ -1120,6 +1121,7 @@ class PyPDF2Detector(Detector):
|
|
|
1120
1121
|
Scores=dict(scores),
|
|
1121
1122
|
Evidence=evidence + ["pseudo:true"],
|
|
1122
1123
|
Hint="VendorOrAcroOnly",
|
|
1124
|
+
RenderType="typed",
|
|
1123
1125
|
)
|
|
1124
1126
|
)
|
|
1125
1127
|
|
|
@@ -0,0 +1,499 @@
|
|
|
1
|
+
"""Wet signature detection via OCR-backed heuristics."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import re
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Iterable, Sequence
|
|
10
|
+
|
|
11
|
+
from PIL import Image
|
|
12
|
+
|
|
13
|
+
from sigdetect.config import DetectConfiguration
|
|
14
|
+
from sigdetect.detector.file_result_model import FileResult
|
|
15
|
+
from sigdetect.detector.signature_model import Signature
|
|
16
|
+
|
|
17
|
+
try: # pragma: no cover - optional dependency
|
|
18
|
+
import fitz # type: ignore
|
|
19
|
+
except Exception: # pragma: no cover - optional dependency
|
|
20
|
+
fitz = None # type: ignore[misc]
|
|
21
|
+
|
|
22
|
+
try: # pragma: no cover - optional dependency
|
|
23
|
+
import pytesseract # type: ignore
|
|
24
|
+
from pytesseract import Output as TesseractOutput
|
|
25
|
+
except Exception: # pragma: no cover - optional dependency
|
|
26
|
+
pytesseract = None # type: ignore[assignment]
|
|
27
|
+
TesseractOutput = None # type: ignore[assignment]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
LOGGER = logging.getLogger("sigdetect.wet")
|
|
31
|
+
|
|
32
|
+
# Compiled label patterns that mark an OCR line as a signature prompt.
# ``_build_candidates`` lower-cases the line text before it is searched, so the
# patterns are written in lower case.
SIGNATURE_PATTERNS: tuple[re.Pattern[str], ...] = (
    re.compile(r"\bsignature\b"),
    re.compile(r"\bsigned\b"),
    re.compile(r"\bsign\b"),
    re.compile(r"\bsignature\s+of\b"),
    re.compile(r"\bsignature\s*:"),
    re.compile(r"\bsignature\s*-"),
    # No trailing ``\b`` here: a word boundary after ":" exists only when a
    # word character follows immediately, so "by: Jane" or a bare "by:" at the
    # end of a line could never match the earlier ``\bby:\b`` form.
    re.compile(r"\bby:"),
)
|
|
41
|
+
|
|
42
|
+
# Substring keywords used to guess a signer role from lower-cased text.
# Insertion order is significant: ``_infer_role`` walks the mapping in order
# and returns the first role that has a matching keyword.
# NOTE(review): every "attorney" keyword also matches an earlier "firm" keyword
# ("lawyer" contains "law"), so a first-match lookup never yields "attorney" —
# confirm whether that is intended.
ROLE_KEYWORDS: dict[str, tuple[str, ...]] = dict(
    client=("client", "consumer", "claimant"),
    firm=("firm", "attorney", "counsel", "by:", "esq", "law"),
    patient=("patient", "self", "plaintiff"),
    representative=("guardian", "representative", "parent", "poa"),
    attorney=("attorney", "counsel", "lawyer"),
)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class WetDetectionUnavailable(RuntimeError):
    """Raised when OCR-backed detection cannot run.

    Within this module it is raised when PyMuPDF or pytesseract could not be
    imported and when the pytesseract OCR call itself fails.
    """
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class OcrLine:
    """Structured OCR line extracted from pytesseract.

    Instances are created by ``_extract_ocr_lines`` and then widened in place
    as further words belonging to the same line are merged in.
    """

    # Words of the line joined with single spaces.
    text: str
    # Confidence on a 0-1 scale (pytesseract's ``conf`` value divided by 100).
    confidence: float
    # Bounding-box edges derived from pytesseract's left/top/width/height
    # values (presumably pixels of the rendered page image — TODO confirm).
    left: int
    top: int
    right: int
    bottom: int
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def should_run_wet_pipeline(file_result: FileResult) -> bool:
    """Return ``True`` when the OCR pipeline should run for ``file_result``.

    The pipeline runs when no electronic signature was found, when the
    signature count is zero, or when the file is flagged as mixed content.
    """

    if not file_result.ElectronicSignatureFound:
        return True
    if file_result.SignatureCount == 0:
        return True
    # The former "scanned PDF without an electronic signature" clause was
    # unreachable: it needed ElectronicSignatureFound to be falsy, and that
    # case has already returned above.
    return bool(file_result.MixedContent)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def apply_wet_detection(
    pdf_path: Path,
    configuration: DetectConfiguration,
    file_result: FileResult,
    *,
    logger: logging.Logger | None = None,
) -> bool:
    """Augment ``file_result`` with OCR-detected wet signatures when possible.

    Nothing happens when wet detection is disabled in ``configuration`` or when
    ``should_run_wet_pipeline`` declines the file. Otherwise the OCR pipeline
    runs; missing dependencies, an empty result, or an unexpected error each
    add a ``ManualReview`` hint to ``file_result`` instead of raising.

    Returns:
        ``True`` only when at least one wet signature was added.
    """

    wet_enabled = configuration.DetectWetSignatures
    if not wet_enabled or not should_run_wet_pipeline(file_result):
        return False

    # Dependency problems are reported with their own message as the reason.
    try:
        _ensure_dependencies()
    except WetDetectionUnavailable as unavailable:
        _mark_manual_review(file_result, str(unavailable))
        if logger:
            logger.warning("Wet detection unavailable", extra={"error": str(unavailable)})
        return False

    try:
        detected = _detect(pdf_path, configuration, file_result, logger=logger)
        if detected:
            return detected
        _mark_manual_review(file_result, "NoHighConfidenceWetSignature")
        return detected
    except Exception as failure:  # pragma: no cover - defensive
        # Best effort: any failure downgrades the file to manual review.
        _mark_manual_review(file_result, "WetDetectionError")
        if logger:
            logger.warning("Wet detection failed", extra={"error": str(failure)})
        return False
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _detect(
    pdf_path: Path,
    configuration: DetectConfiguration,
    file_result: FileResult,
    *,
    logger: logging.Logger | None = None,
) -> bool:
    """Run the OCR and image heuristics on every page of ``pdf_path``.

    Each page is rendered at ``configuration.WetOcrDpi``, OCR'd, and turned
    into candidates; candidates scoring at least
    ``configuration.WetPrecisionThreshold`` become signatures.

    Returns:
        ``True`` after extending ``file_result.Signatures`` and refreshing its
        metadata, ``False`` when no candidate was accepted on any page.

    Raises:
        WetDetectionUnavailable: If PyMuPDF or pytesseract is not importable.
    """
    if pytesseract is None or fitz is None:
        raise WetDetectionUnavailable("PyMuPDF or pytesseract not available")

    document = fitz.open(pdf_path)  # type: ignore[attr-defined]
    try:
        collected: list[Signature] = []
        # 72 is the divisor that turns the configured DPI into a render zoom.
        zoom = configuration.WetOcrDpi / 72.0
        render_matrix = fitz.Matrix(zoom, zoom)
        for page_number in range(1, document.page_count + 1):
            page = document.load_page(page_number - 1)
            pixmap = page.get_pixmap(matrix=render_matrix, alpha=False)
            image = _pixmap_to_image(pixmap)
            ocr_lines = _extract_ocr_lines(image, configuration.WetOcrLanguages)
            page_candidates = list(
                _build_candidates(
                    ocr_lines,
                    image=image,
                    page_rect=page.rect,
                    pix_width=pixmap.width,
                    pix_height=pixmap.height,
                    scale=zoom,
                )
            )
            page_candidates.extend(_image_candidates(page))
            kept = [
                item
                for item in page_candidates
                if item.Score >= configuration.WetPrecisionThreshold
            ]
            if logger:
                logger.debug(
                    "Wet detection page summary",
                    extra={
                        "pdf": pdf_path.name,
                        "page": page_number,
                        "candidates": len(page_candidates),
                        "accepted": len(kept),
                    },
                )
            collected.extend(_to_signatures(kept, page_number))

        if collected:
            file_result.Signatures.extend(collected)
            _refresh_metadata(file_result)
            return True
        return False
    finally:
        document.close()
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _ensure_dependencies() -> None:
    """Verify that the optional OCR stack was imported.

    Raises:
        WetDetectionUnavailable: If PyMuPDF (``fitz``) is missing, or if
            pytesseract or its ``Output`` helper is missing. PyMuPDF is
            checked first.
    """
    problem = None
    if fitz is None:
        problem = "PyMuPDF is required for wet detection (install 'pymupdf')."
    elif pytesseract is None or TesseractOutput is None:
        problem = (
            "pytesseract is required for wet detection and depends on the "
            "Tesseract OCR binary."
        )
    if problem is not None:
        raise WetDetectionUnavailable(problem)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _pixmap_to_image(pixmap) -> Image.Image:
    """Build an RGB PIL image from a pixmap's raw samples.

    The samples are read as RGBA when ``pixmap.alpha`` is truthy and as RGB
    otherwise; an RGBA image is converted to RGB before it is returned.
    """
    has_alpha = bool(pixmap.alpha)
    mode = "RGBA" if has_alpha else "RGB"
    picture = Image.frombytes(mode, [pixmap.width, pixmap.height], pixmap.samples)
    return picture.convert("RGB") if has_alpha else picture
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _extract_ocr_lines(image: Image.Image, languages: str) -> list[OcrLine]:
    """OCR ``image`` and merge the recognised words into per-line records.

    Words are grouped by pytesseract's (block, paragraph, line) numbers. Blank
    words and words whose confidence is not positive are ignored.

    Raises:
        WetDetectionUnavailable: If pytesseract is not importable or the OCR
            call raises.
    """
    if TesseractOutput is None or pytesseract is None:
        raise WetDetectionUnavailable("pytesseract unavailable")

    try:
        data = pytesseract.image_to_data(image, lang=languages, output_type=TesseractOutput.DICT)
    except Exception as exc:  # pragma: no cover - passthrough to manual review
        raise WetDetectionUnavailable(f"OCR failed: {exc}") from exc

    merged: dict[tuple[int, int, int], OcrLine] = {}
    for idx, raw_word in enumerate(data.get("text", [])):
        word = (raw_word or "").strip()
        if not word:
            continue
        word_conf = float(data["conf"][idx])
        if word_conf <= 0:
            continue

        line_key = (data["block_num"][idx], data["par_num"][idx], data["line_num"][idx])
        word_left = int(data["left"][idx])
        word_top = int(data["top"][idx])
        word_right = word_left + int(data["width"][idx])
        word_bottom = word_top + int(data["height"][idx])
        share = word_conf / 100.0

        current = merged.get(line_key)
        if current is None:
            merged[line_key] = OcrLine(
                text=word,
                confidence=share,
                left=word_left,
                top=word_top,
                right=word_right,
                bottom=word_bottom,
            )
            continue

        current.text = f"{current.text} {word}"
        # NOTE(review): this is a running pairwise average, so later words
        # weigh more than earlier ones; it is not the mean of all words.
        current.confidence = min(1.0, (current.confidence + share) / 2.0)
        # Grow the line box to enclose the newly merged word.
        current.left = min(current.left, word_left)
        current.top = min(current.top, word_top)
        current.right = max(current.right, word_right)
        current.bottom = max(current.bottom, word_bottom)
    return list(merged.values())
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
@dataclass
class WetCandidate:
    """Candidate wet-signature region produced by the OCR or image heuristics."""

    # Four-number rectangle built by ``_expand_bbox`` or ``_image_candidates``
    # (presumably PDF points with a bottom-left origin — TODO confirm).
    bbox: tuple[float, float, float, float]
    # Inferred signer role, or "unknown" when no role keyword matched.
    Role: str
    # Confidence on a 0-1 scale; compared against WetPrecisionThreshold.
    Score: float
    # Human-readable evidence strings such as "wet:true" or "stroke:yes".
    Evidence: list[str]
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def _build_candidates(
    lines: Iterable[OcrLine],
    *,
    image: Image.Image,
    page_rect,
    pix_width: int,
    pix_height: int,
    scale: float,
) -> Iterable[WetCandidate]:
    """Yield a scored candidate for each OCR line that looks like a signature label.

    A line is skipped when it has no signature keyword, is longer than 80
    characters, or ends in the upper 40% of the rendered page. The score is
    the OCR confidence plus keyword, stroke, and position bonuses, capped at
    1.0. ``pix_width`` is accepted but not used.
    """
    for ocr_line in lines:
        lowered = ocr_line.text.lower()
        if not _has_signature_keyword(lowered):
            continue
        if len(lowered) > 80:
            # Long lines are treated as paragraph text, not labels.
            continue
        vertical_ratio = ocr_line.bottom / pix_height
        if vertical_ratio < 0.4:
            # Labels in the upper part of the page are ignored.
            continue

        role = _infer_role(lowered)
        stroke_found, stroke_y = _stroke_under_line(image, ocr_line)
        bonus = _keyword_bonus(lowered)
        if stroke_found:
            bonus += 0.12
        # Slight positional prior: lines ending in the lowest 30% of the page
        # are more likely to be signature labels.
        if vertical_ratio > 0.7:
            bonus += 0.05
        confidence = min(1.0, ocr_line.confidence + bonus)

        evidence = [
            f"ocr_line:{ocr_line.text.strip()}",
            f"ocr_conf:{confidence:.2f}",
            "wet:true",
            "stroke:yes" if stroke_found else "stroke:no",
        ]
        yield WetCandidate(
            bbox=_expand_bbox(ocr_line, page_rect, pix_height, scale, stroke_y=stroke_y),
            Role=role,
            Score=confidence,
            Evidence=evidence,
        )
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def _infer_role(normalized_text: str) -> str:
    """Return the first role whose keyword occurs in ``normalized_text``.

    Roles are tried in ``ROLE_KEYWORDS`` order and keywords are matched as
    plain substrings. Returns "unknown" when nothing matches.
    """
    matching_roles = (
        role
        for role, keywords in ROLE_KEYWORDS.items()
        if any(keyword in normalized_text for keyword in keywords)
    )
    return next(matching_roles, "unknown")
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def _keyword_bonus(normalized_text: str) -> float:
    """Return the score adjustment earned by keywords in ``normalized_text``.

    "signature" adds 0.05, "date" subtracts 0.02 and "by:" adds 0.03; the
    adjustments are applied in that order and accumulate.
    """
    adjustments = (
        ("signature", 0.05),
        ("date", -0.02),
        ("by:", 0.03),
    )
    total = 0.0
    for keyword, delta in adjustments:
        if keyword in normalized_text:
            total += delta
    return total
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def _has_signature_keyword(normalized_text: str) -> bool:
    """Return ``True`` if any pattern in ``SIGNATURE_PATTERNS`` is found in the text."""
    for pattern in SIGNATURE_PATTERNS:
        if pattern.search(normalized_text):
            return True
    return False
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def _expand_bbox(
    line: OcrLine,
    page_rect,
    pix_height: int,
    scale: float,
    *,
    stroke_y: float | None = None,
) -> tuple[float, float, float, float]:
    """Derive a signature rectangle from an OCR label line.

    The line's pixel coordinates are divided by ``scale`` and its vertical
    position is flipped against ``pix_height``. The box is padded horizontally
    by the larger of 14 units or 25% of the label width, starts 14 units
    beyond the label's top edge, and spans 70 units, all clamped to
    ``page_rect``. When ``stroke_y`` is given and maps below the box start,
    the box is re-anchored on the stroke.

    Returns:
        ``(left, top, right, bottom)`` as floats, where ``top`` is the
        smaller vertical value (presumably PDF points — TODO confirm).
    """
    x0 = line.left / scale
    x1 = line.right / scale
    y1 = (pix_height - line.top) / scale

    pad_x = max(14.0, (x1 - x0) * 0.25)
    left = max(page_rect.x0, x0 - pad_x)
    right = min(page_rect.x1, x1 + pad_x)

    gap = 14.0
    signature_height = 70.0
    top = min(page_rect.y1, y1 + gap)
    bottom = min(page_rect.y1, top + signature_height)

    if bottom <= top:
        # Clamping against the page edge collapsed the box. The previous
        # fallback recomputed the same expression and left it empty; instead
        # pull the start edge back so the region keeps its height.
        top = max(page_rect.y0, bottom - signature_height)

    if stroke_y is not None:
        # Anchor to the detected stroke under the OCR label when available.
        sy = (pix_height - stroke_y) / scale
        if sy < top:
            top = sy
            bottom = max(bottom, sy + signature_height)

    return (float(left), float(top), float(right), float(bottom))
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def _stroke_under_line(image: Image.Image, line: OcrLine) -> tuple[bool, float | None]:
    """Heuristic: look for a dark horizontal stroke beneath the OCR line.

    A strip of up to 28 pixel rows, starting 2 pixels below the line and
    widened by 10 pixels on each side, is scanned row by row. The row with the
    highest share of pixels darker than 160 (grayscale) wins.

    Returns:
        ``(True, y)`` with ``y`` the image row of the densest strip row when
        at least 32% of that row is dark, otherwise ``(False, None)``.
    """

    pad_x = 10
    strip_height = 28
    x0 = max(0, line.left - pad_x)
    x1 = min(image.width, line.right + pad_x)
    y0 = min(image.height, line.bottom + 2)
    y1 = min(image.height, y0 + strip_height)
    if x1 <= x0 or y1 <= y0:
        return False, None

    # Crop first so only the strip is converted to grayscale; converting the
    # whole page for every OCR line repeated the same work needlessly.
    strip = image.crop((x0, y0, x1, y1)).convert("L")
    width = strip.width
    # Mode "L" stores one byte per pixel in row-major order, so each row is a
    # contiguous slice; this avoids a ``getpixel`` call per pixel.
    pixels = strip.tobytes()
    threshold = 160
    max_density = 0.0
    best_row = None
    for row in range(strip.height):
        row_bytes = pixels[row * width:(row + 1) * width]
        dark = sum(1 for value in row_bytes if value < threshold)
        density = dark / width
        if density > max_density:
            max_density = density
            best_row = row
    if max_density < 0.32 or best_row is None:
        return False, None
    return True, float(y0 + best_row)
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def _image_candidates(page) -> list[WetCandidate]:
    """Heuristic: treat small, wide images on the page as wet signatures.

    An image qualifies when it is wider than 40 and taller than 15 units, at
    least 1.6 times as wide as it is tall, and covers no more than 10% of the
    page area. It scores 0.9 when nearby text suggests a role and 0.84
    otherwise.
    """

    page_width = float(page.rect.width)
    page_height = float(page.rect.height)
    page_area = page_width * page_height
    words = page.get_text("words") or []

    found: list[WetCandidate] = []
    for info in page.get_image_info(xrefs=True) or []:
        rect = info.get("bbox") or info.get("rect")
        if rect is None:
            continue
        # Accept either a rect-like object or a plain 4-item sequence.
        if hasattr(rect, "x0"):
            corners = (rect.x0, rect.y0, rect.x1, rect.y1)
        elif isinstance(rect, (tuple, list)) and len(rect) == 4:
            corners = rect
        else:
            continue
        x0, y0, x1, y1 = (float(value) for value in corners)

        width = float(x1 - x0)
        height = float(y1 - y0)
        if width <= 40 or height <= 15:
            # Skip tiny marks/logos
            continue
        if width / height < 1.6:
            continue
        if (width * height) / page_area > 0.1:
            # Ignore large illustrations/backgrounds
            continue

        role = _infer_role_nearby(rect, words)
        has_role = role != "unknown"
        evidence = ["image_signature:true"]
        if has_role:
            evidence.append(f"role_hint:{role}")

        found.append(
            WetCandidate(
                # Vertical values are flipped against the page height.
                bbox=(x0, float(page_height - y1), x1, float(page_height - y0)),
                Role=role,
                Score=0.9 if has_role else 0.84,
                Evidence=evidence,
            )
        )
    return found
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
def _infer_role_nearby(rect, words) -> str:
    """Best-effort role inference using text near the image rectangle.

    ``rect`` may be an object with ``x0``/``y0``/``x1``/``y1`` attributes or a
    4-item tuple/list; anything else yields "unknown". Word entries with fewer
    than five items are ignored. Words within 48 units vertically and 140
    units horizontally of the rectangle are joined, lower-cased and passed to
    ``_infer_role``.
    """

    if hasattr(rect, "x0"):
        edges = (rect.x0, rect.y0, rect.x1, rect.y1)
    elif isinstance(rect, (tuple, list)) and len(rect) == 4:
        edges = rect
    else:
        return "unknown"
    rx0, ry0, rx1, ry1 = (float(edge) for edge in edges)

    reach_x = 140.0
    reach_y = 48.0
    collected: list[str] = []
    for word in words:
        if len(word) < 5:
            continue
        wx0, wy0, wx1, wy1, token, *_ = word
        # Reject words outside the vertical band first, then the horizontal one.
        if (wy1 < ry0 - reach_y or wy0 > ry1 + reach_y) or (
            wx1 < rx0 - reach_x or wx0 > rx1 + reach_x
        ):
            continue
        collected.append(str(token))

    if not collected:
        return "unknown"
    return _infer_role(" ".join(collected).lower())
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
def _needs_wet_enhancement(file_result: FileResult) -> bool:
    """Report whether wet OCR should run to refine pseudo/unknown signatures.

    The answer is currently always ``False``; ``file_result`` is accepted but
    not inspected.
    """
    should_enhance = False
    return should_enhance
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
def _to_signatures(
    candidates: Sequence[WetCandidate],
    page_number: int,
) -> list[Signature]:
    """Convert the wet candidates of one page into ``Signature`` records.

    Each candidate's ``Score`` is multiplied by 100 and rounded to an integer;
    that value is stored as ``Score`` and, keyed by the candidate's role, in
    ``Scores``.  Every record carries the field name
    ``"wet_signature_detected"``, the hint ``"WetSignatureOCR"`` and the render
    type ``"wet"``, together with the candidate's role, evidence and bounding
    box.  The output preserves the order of ``candidates``.
    """

    def _build(candidate):
        # Integer percentage shared by the Score and Scores fields.
        percent = int(round(candidate.Score * 100))
        return Signature(
            Page=page_number,
            FieldName="wet_signature_detected",
            Role=candidate.Role,
            Score=percent,
            Scores={candidate.Role: percent},
            Evidence=candidate.Evidence,
            Hint="WetSignatureOCR",
            RenderType="wet",
            BoundingBox=candidate.bbox,
        )

    return [_build(candidate) for candidate in candidates]
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
def _mark_manual_review(file_result: FileResult, reason: str) -> None:
    """Add a ``ManualReview:<reason>`` entry to ``file_result.Hints``.

    The existing ``;``-separated hint string is parsed with ``_split_hints``,
    the new entry is merged in, and the sorted, ``;``-joined result is written
    back to ``file_result.Hints``.
    """
    existing = _split_hints(file_result.Hints)
    merged = existing | {f"ManualReview:{reason}"}
    # ``merged`` always holds at least the new entry, so it is never empty.
    file_result.Hints = ";".join(sorted(merged))
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
def _refresh_metadata(file_result: FileResult) -> None:
    """Recompute the summary fields of ``file_result`` from its signatures.

    Updates, in place: ``SignatureCount``; ``SignaturePages`` (sorted distinct
    truthy page values, comma-joined); ``Roles`` (sorted distinct roles other
    than ``"unknown"``, ``;``-joined, left untouched when there are none);
    ``ElectronicSignatureFound`` (true when the count is positive);
    ``MixedContent`` (signatures found and ``ScannedPdf`` truthy); and
    ``Hints`` (existing hints merged with every signature's truthy ``Hint``,
    sorted and ``;``-joined).
    """
    file_result.SignatureCount = len(file_result.Signatures)

    distinct_pages = {entry.Page for entry in file_result.Signatures if entry.Page}
    file_result.SignaturePages = ",".join(str(page) for page in sorted(distinct_pages))

    known_roles = {entry.Role for entry in file_result.Signatures if entry.Role != "unknown"}
    if known_roles:
        # With no known role the previous Roles value is kept as-is.
        file_result.Roles = ";".join(sorted(known_roles))

    file_result.ElectronicSignatureFound = file_result.SignatureCount > 0
    is_scanned = file_result.ElectronicSignatureFound and bool(file_result.ScannedPdf)
    file_result.MixedContent = is_scanned

    merged_hints = _split_hints(file_result.Hints)
    merged_hints.update(entry.Hint for entry in file_result.Signatures if entry.Hint)
    file_result.Hints = ";".join(sorted(merged_hints))
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
def _split_hints(hints: str | None) -> set[str]:
|
|
497
|
+
if not hints:
|
|
498
|
+
return set()
|
|
499
|
+
return {hint for hint in hints.split(";") if hint}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sigdetect
|
|
3
|
-
Version: 0.3.1
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Signature detection and role attribution for PDFs
|
|
5
5
|
Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
|
|
6
6
|
License: MIT
|
|
@@ -95,14 +95,14 @@ sigdetect detect \
|
|
|
95
95
|
### Notes
|
|
96
96
|
|
|
97
97
|
- The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
|
|
98
|
-
- `--engine`
|
|
98
|
+
- `--engine` accepts **auto** (default; prefers PyMuPDF when installed, falls back to PyPDF2), **pypdf2**, or **pymupdf**.
|
|
99
99
|
- `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
|
|
100
100
|
- `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
|
|
101
101
|
- `--profile` selects tuned role logic:
|
|
102
102
|
- `hipaa` → patient / representative / attorney
|
|
103
103
|
- `retainer` → client / firm (prefers detecting two signatures)
|
|
104
104
|
- `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
|
|
105
|
-
- `--crop-signatures`
|
|
105
|
+
- Cropping (`--crop-signatures`) and wet detection (`--detect-wet`) are enabled by default for single-pass runs; disable them if you want a light, e-sign-only pass. PyMuPDF is required for crops; PyMuPDF + Tesseract are required for wet detection.
|
|
106
106
|
- If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
|
|
107
107
|
|
|
108
108
|
### EDA (quick aggregate stats)
|
|
@@ -136,15 +136,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
|
136
136
|
print(result.to_dict())
|
|
137
137
|
~~~
|
|
138
138
|
|
|
139
|
-
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image.
|
|
139
|
+
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
|
|
140
140
|
|
|
141
141
|
---
|
|
142
142
|
|
|
143
143
|
## Library API (embed in another script)
|
|
144
144
|
|
|
145
|
-
Minimal, plug-and-play API
|
|
146
|
-
Import from `sigdetect.api` and get plain dicts out (JSON-ready),
|
|
147
|
-
with no I/O side effects by default:
|
|
145
|
+
Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping:
|
|
148
146
|
|
|
149
147
|
~~~python
|
|
150
148
|
from pathlib import Path
|
|
@@ -192,23 +190,14 @@ for res in ScanDirectory(
|
|
|
192
190
|
# 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
|
|
193
191
|
detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
|
|
194
192
|
file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
195
|
-
|
|
193
|
+
CropSignatureImages(
|
|
196
194
|
"/path/to/pdfs/example.pdf",
|
|
197
195
|
file_result,
|
|
198
196
|
outputDirectory="./signature_crops",
|
|
199
197
|
dpi=200,
|
|
200
|
-
returnBytes=True, # also returns in-memory PNG bytes for each crop
|
|
201
|
-
# saveToDisk=False, # optional: skip writing PNGs to disk
|
|
202
198
|
)
|
|
203
|
-
|
|
204
|
-
first_crop = crops[0]
|
|
205
|
-
print(first_crop.path, len(first_crop.image_bytes))
|
|
206
199
|
~~~
|
|
207
200
|
|
|
208
|
-
When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
|
|
209
|
-
PNG bytes, and the originating signature metadata.
|
|
210
|
-
Pass ``saveToDisk=False`` if you only want in-memory PNG bytes (no files on disk or ``crop_path`` updates).
|
|
211
|
-
|
|
212
201
|
|
|
213
202
|
## Result schema
|
|
214
203
|
|
|
@@ -247,7 +236,7 @@ High-level summary (per file):
|
|
|
247
236
|
"scores": { "page_label": 4, "general": 2 },
|
|
248
237
|
"evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
|
|
249
238
|
"hint": "VendorOrAcroOnly",
|
|
250
|
-
"render_type": "
|
|
239
|
+
"render_type": "typed",
|
|
251
240
|
"bounding_box": null,
|
|
252
241
|
"crop_path": null
|
|
253
242
|
}
|
|
@@ -292,6 +281,10 @@ profile: retainer # or: hipaa
|
|
|
292
281
|
crop_signatures: false # enable to write PNG crops (requires pymupdf)
|
|
293
282
|
# crop_output_dir: ./signature_crops
|
|
294
283
|
crop_image_dpi: 200
|
|
284
|
+
detect_wet_signatures: false # opt-in OCR wet detection (PyMuPDF + Tesseract)
|
|
285
|
+
wet_ocr_dpi: 200
|
|
286
|
+
wet_ocr_languages: eng
|
|
287
|
+
wet_precision_threshold: 0.82
|
|
295
288
|
~~~
|
|
296
289
|
|
|
297
290
|
YAML files can be customized or loaded at runtime (see CLI `--config`, if available, or import and pass patterns into the engine).
|
|
@@ -306,6 +299,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
|
|
|
306
299
|
- Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
|
|
307
300
|
- Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
|
|
308
301
|
- When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
|
|
302
|
+
- **Wet detection (opt-in):** With `detect_wet_signatures: true`, the CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
|
|
309
303
|
|
|
310
304
|
---
|
|
311
305
|
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
sigdetect/__init__.py,sha256=YvnTwlC1jfq83EhQS_1JjiiHK7_wJCCU1JvHv5E1qWY,573
|
|
2
|
+
sigdetect/api.py,sha256=qLCpbODLvw5AQMEAvpIP6kBNoc03h01ekjilg9tDxuw,9408
|
|
3
|
+
sigdetect/cli.py,sha256=Zco3-r4MAlVEmyEatvPUOZLLamh5ELFrquAK6ovJVlw,9290
|
|
4
|
+
sigdetect/config.py,sha256=-6GCUusdi0Ba-Rt6pwffB5MIz1ApPlBaXVKxpIppbKk,7678
|
|
5
|
+
sigdetect/cropping.py,sha256=zwOXzkG8tt1ZPUaDhJMHfonFEZtVNZZmZOzYQ_4nUAI,6074
|
|
6
|
+
sigdetect/eda.py,sha256=S92G1Gjmepri__D0n_V6foq0lQgH-RXI9anW8A58jfw,4681
|
|
7
|
+
sigdetect/logging_setup.py,sha256=LMF8ao_a-JwH0S522T6aYTFX3e8Ajjv_5ODS2YiBcHA,6404
|
|
8
|
+
sigdetect/utils.py,sha256=T9rubLf5T9JmjOHYMOba1j34fhOJaWocAXccnGTxRUE,5198
|
|
9
|
+
sigdetect/wet_detection.py,sha256=6ciFxMQS3f1nF502w4KLTksoYmjdudzTekh7McfWiIg,16464
|
|
10
|
+
sigdetect/data/role_rules.retainer.yml,sha256=IFdwKnDBXR2cTkdfrsZ6ku6CXD8S_dg5A3vKRKLW5h8,2532
|
|
11
|
+
sigdetect/data/role_rules.yml,sha256=HuLKsZR_A6sD9XvY4NHiY_VG3dS5ERNCBF9-Mxawomw,2751
|
|
12
|
+
sigdetect/data/vendor_patterns.yml,sha256=NRbZNQxcx_GuL6n1jAphBn6MM6ChCpeWGCsjbRx-PEo,384
|
|
13
|
+
sigdetect/detector/__init__.py,sha256=pUVFLwqj65cVO1qjsZy6NJ9BVY5xrJ6sQe-8LAb9O_A,2421
|
|
14
|
+
sigdetect/detector/base.py,sha256=L-iXWXqsTetDc4jRZo_wOdbNpKqOY20mX9FefrugdT0,263
|
|
15
|
+
sigdetect/detector/base_detector.py,sha256=GmAgUWO_fQgIfnihZSoyhR3wpnwZ-X3hS0Kuyz4G6Ys,608
|
|
16
|
+
sigdetect/detector/file_result_model.py,sha256=j2gTc9Sw3fJOHlexYsR_m5DiwHA8DzIzAMToESfvo4A,1767
|
|
17
|
+
sigdetect/detector/pymupdf_engine.py,sha256=SGtJOStKFdfsdBrscoe5zg9u2KGJ_JTRYZ25adL_7Lw,17390
|
|
18
|
+
sigdetect/detector/pypdf2_engine.py,sha256=kB8cIp_gMvCla0LIBi9sd19g0361Oc9TjCW_ZViUBJQ,47410
|
|
19
|
+
sigdetect/detector/signature_model.py,sha256=sdfQiOJzxnrg0WkGJxZCebA0wHqgzZnLI0gOv6ipSZA,1074
|
|
20
|
+
sigdetect-0.4.0.dist-info/METADATA,sha256=WA7OjyLtM3AH7OtdFRmliqBw0ucNlywoD2bykytlnPA,12475
|
|
21
|
+
sigdetect-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
22
|
+
sigdetect-0.4.0.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
|
|
23
|
+
sigdetect-0.4.0.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
|
|
24
|
+
sigdetect-0.4.0.dist-info/RECORD,,
|
sigdetect-0.3.1.dist-info/RECORD
DELETED
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
sigdetect/__init__.py,sha256=LhY78mDZ1ClYVNTxW_qtE-vqJoN9N7N5ZcNRDUI_3ss,575
|
|
2
|
-
sigdetect/api.py,sha256=6_CMSxcag9coHHzrpuRSVimHWSNtqQiWY9hdlqQ2IKY,9396
|
|
3
|
-
sigdetect/cli.py,sha256=NctAnaB-TQrUAT9m-v8kj2_KTNs88kbFOCiX32tHZm8,7920
|
|
4
|
-
sigdetect/config.py,sha256=S0NVKuJYiHJCocL-VNFdGJpasFcjTecavC4EthyS1DQ,5951
|
|
5
|
-
sigdetect/cropping.py,sha256=dmJF4Q1tkmkfm0NaiwHddNOP8Sj9S4Lj_d5EBjodEkk,6015
|
|
6
|
-
sigdetect/eda.py,sha256=S92G1Gjmepri__D0n_V6foq0lQgH-RXI9anW8A58jfw,4681
|
|
7
|
-
sigdetect/logging_setup.py,sha256=LMF8ao_a-JwH0S522T6aYTFX3e8Ajjv_5ODS2YiBcHA,6404
|
|
8
|
-
sigdetect/utils.py,sha256=T9rubLf5T9JmjOHYMOba1j34fhOJaWocAXccnGTxRUE,5198
|
|
9
|
-
sigdetect/data/role_rules.retainer.yml,sha256=IFdwKnDBXR2cTkdfrsZ6ku6CXD8S_dg5A3vKRKLW5h8,2532
|
|
10
|
-
sigdetect/data/role_rules.yml,sha256=HuLKsZR_A6sD9XvY4NHiY_VG3dS5ERNCBF9-Mxawomw,2751
|
|
11
|
-
sigdetect/data/vendor_patterns.yml,sha256=NRbZNQxcx_GuL6n1jAphBn6MM6ChCpeWGCsjbRx-PEo,384
|
|
12
|
-
sigdetect/detector/__init__.py,sha256=up2FCmD09f2bRHcS4WbY-clx3GQbWuk1PM2JlxgusHg,1608
|
|
13
|
-
sigdetect/detector/base.py,sha256=L-iXWXqsTetDc4jRZo_wOdbNpKqOY20mX9FefrugdT0,263
|
|
14
|
-
sigdetect/detector/base_detector.py,sha256=GmAgUWO_fQgIfnihZSoyhR3wpnwZ-X3hS0Kuyz4G6Ys,608
|
|
15
|
-
sigdetect/detector/file_result_model.py,sha256=j2gTc9Sw3fJOHlexYsR_m5DiwHA8DzIzAMToESfvo4A,1767
|
|
16
|
-
sigdetect/detector/pymupdf_engine.py,sha256=iyp7JuPlUnydwohH5zbNg4MwH44mBmxbBWOS3ZmArBo,17339
|
|
17
|
-
sigdetect/detector/pypdf2_engine.py,sha256=INWQH06kMLvto2VS-EdLC-EtMC6AG7JmdVYmNgx6_RU,47313
|
|
18
|
-
sigdetect/detector/signature_model.py,sha256=mztb9V5wgv2oohQ5Cxzcv8_Bo6TyWAVIXteaeQ2rywQ,1076
|
|
19
|
-
sigdetect-0.3.1.dist-info/METADATA,sha256=whXGE4-9spAjlMcZz_owdsIiB4EobXL9_UOuAJeDVfA,12342
|
|
20
|
-
sigdetect-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
21
|
-
sigdetect-0.3.1.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
|
|
22
|
-
sigdetect-0.3.1.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
|
|
23
|
-
sigdetect-0.3.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|