sigdetect 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sigdetect/api.py +136 -14
- sigdetect/cli.py +154 -20
- sigdetect/config.py +49 -9
- sigdetect/cropping.py +123 -0
- sigdetect/detector/pymupdf_engine.py +420 -0
- sigdetect/detector/pypdf2_engine.py +46 -8
- sigdetect/detector/signature_model.py +4 -0
- {sigdetect-0.1.0.dist-info → sigdetect-0.2.0.dist-info}/METADATA +37 -6
- {sigdetect-0.1.0.dist-info → sigdetect-0.2.0.dist-info}/RECORD +12 -11
- {sigdetect-0.1.0.dist-info → sigdetect-0.2.0.dist-info}/WHEEL +0 -0
- {sigdetect-0.1.0.dist-info → sigdetect-0.2.0.dist-info}/entry_points.txt +0 -0
- {sigdetect-0.1.0.dist-info → sigdetect-0.2.0.dist-info}/top_level.txt +0 -0
sigdetect/api.py
CHANGED
|
@@ -2,11 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
from contextlib import contextmanager
|
|
5
6
|
from pathlib import Path
|
|
6
|
-
from typing import Any, Iterable, Iterator, Literal
|
|
7
|
+
from typing import Any, Generator, Iterable, Iterator, Literal
|
|
7
8
|
|
|
8
9
|
from sigdetect.config import DetectConfiguration
|
|
9
|
-
from sigdetect.detector import BuildDetector
|
|
10
|
+
from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
|
|
10
11
|
|
|
11
12
|
EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
|
|
12
13
|
ProfileName = Literal["hipaa", "retainer"]
|
|
@@ -19,23 +20,44 @@ def DetectPdf(
|
|
|
19
20
|
engineName: EngineName = "pypdf2",
|
|
20
21
|
includePseudoSignatures: bool = True,
|
|
21
22
|
recurseXObjects: bool = True,
|
|
23
|
+
detector: Detector | None = None,
|
|
22
24
|
) -> dict[str, Any]:
|
|
23
25
|
"""Detect signature evidence and assign roles for a single PDF."""
|
|
24
26
|
|
|
25
27
|
resolvedPath = Path(pdfPath)
|
|
28
|
+
activeDetector = detector or get_detector(
|
|
29
|
+
pdfRoot=resolvedPath.parent,
|
|
30
|
+
profileName=profileName,
|
|
31
|
+
engineName=engineName,
|
|
32
|
+
includePseudoSignatures=includePseudoSignatures,
|
|
33
|
+
recurseXObjects=recurseXObjects,
|
|
34
|
+
outputDirectory=None,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
result = activeDetector.Detect(resolvedPath)
|
|
38
|
+
return _ToPlainDictionary(result)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def get_detector(
    *,
    pdfRoot: str | Path | None = None,
    profileName: ProfileName = "hipaa",
    engineName: EngineName = "pypdf2",
    includePseudoSignatures: bool = True,
    recurseXObjects: bool = True,
    outputDirectory: str | Path | None = None,
) -> Detector:
    """Build a detector that can be reused for several PDFs.

    All arguments are keyword-only and are forwarded into a
    :class:`DetectConfiguration`. ``pdfRoot`` falls back to the current
    working directory when omitted; ``outputDirectory`` stays ``None`` when
    omitted. The configured detector is produced by :func:`BuildDetector`.
    """

    rootPath = Path.cwd() if pdfRoot is None else Path(pdfRoot)
    outputPath = None if outputDirectory is None else Path(outputDirectory)

    return BuildDetector(
        DetectConfiguration(
            PdfRoot=rootPath,
            OutputDirectory=outputPath,
            Engine=engineName,
            PseudoSignatures=includePseudoSignatures,
            RecurseXObjects=recurseXObjects,
            Profile=profileName,
        )
    )
|
|
39
61
|
|
|
40
62
|
|
|
41
63
|
def _ToPlainDictionary(candidate: Any) -> dict[str, Any]:
|
|
@@ -84,10 +106,17 @@ def _ToPlainValue(value: Any) -> Any:
|
|
|
84
106
|
|
|
85
107
|
def DetectMany(
    pdfPaths: Iterable[str | Path],
    *,
    detector: Detector | None = None,
    **kwargs: Any,
) -> Iterator[dict[str, Any]]:
    """Lazily yield one detection dictionary per entry of ``pdfPaths``.

    Without ``detector`` every path is handed to :func:`DetectPdf` together
    with ``kwargs``. When ``detector`` is supplied it is reused for every
    path and ``kwargs`` are not consulted.
    """

    if detector is None:
        for candidatePath in pdfPaths:
            yield DetectPdf(candidatePath, **kwargs)
    else:
        for candidatePath in pdfPaths:
            yield _DetectWithDetector(detector, candidatePath)
|
|
93
122
|
|
|
@@ -96,19 +125,24 @@ def ScanDirectory(
|
|
|
96
125
|
pdfRoot: str | Path,
|
|
97
126
|
*,
|
|
98
127
|
globPattern: str = "**/*.pdf",
|
|
128
|
+
detector: Detector | None = None,
|
|
99
129
|
**kwargs: Any,
|
|
100
130
|
) -> Iterator[dict[str, Any]]:
|
|
101
131
|
"""Walk ``pdfRoot`` and yield detection output for every matching PDF."""
|
|
102
132
|
|
|
103
133
|
rootDirectory = Path(pdfRoot)
|
|
104
|
-
|
|
105
|
-
rootDirectory.rglob(
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
134
|
+
if globPattern == "**/*.pdf":
|
|
135
|
+
iterator = (path for path in rootDirectory.rglob("*") if path.is_file())
|
|
136
|
+
else:
|
|
137
|
+
iterator = (
|
|
138
|
+
rootDirectory.rglob(globPattern.replace("**/", "", 1))
|
|
139
|
+
if globPattern.startswith("**/")
|
|
140
|
+
else rootDirectory.glob(globPattern)
|
|
141
|
+
)
|
|
142
|
+
|
|
109
143
|
for pdfPath in iterator:
|
|
110
144
|
if pdfPath.is_file() and pdfPath.suffix.lower() == ".pdf":
|
|
111
|
-
yield DetectPdf(pdfPath, **kwargs)
|
|
145
|
+
yield DetectPdf(pdfPath, detector=detector, **kwargs)
|
|
112
146
|
|
|
113
147
|
|
|
114
148
|
def ToCsvRow(result: dict[str, Any]) -> dict[str, Any]:
|
|
@@ -137,3 +171,91 @@ def Version() -> str:
|
|
|
137
171
|
return resolveVersion("sigdetect")
|
|
138
172
|
except Exception:
|
|
139
173
|
return "0.0.0-dev"
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _DetectWithDetector(detector: Detector, pdfPath: str | Path) -> dict[str, Any]:
    """Run ``detector`` against ``pdfPath`` and return the result as a plain dictionary."""

    detection = detector.Detect(Path(pdfPath))
    return _ToPlainDictionary(detection)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
@contextmanager
def detector_context(**kwargs: Any) -> Generator[Detector, None, None]:
    """Provide a detector built by :func:`get_detector` for use in a ``with`` block.

    ``kwargs`` are forwarded unchanged to :func:`get_detector`.
    """

    # No teardown happens on exit; the detector is simply handed to the ``with`` body.
    yield get_detector(**kwargs)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def CropSignatureImages(
    pdfPath: str | Path,
    fileResult: FileResult | dict[str, Any],
    *,
    outputDirectory: str | Path,
    dpi: int = 200,
) -> list[Path]:
    """Write PNG crops for the signatures recorded in ``fileResult``.

    ``fileResult`` may be a :class:`FileResult` or the ``dict`` produced by
    :func:`DetectPdf`. A ``dict`` argument is rewritten in place after
    cropping with the contents of the model's ``to_dict()``. The paths
    returned by ``sigdetect.cropping.crop_signatures`` are passed through.
    Cropping relies on the optional ``pymupdf`` dependency.
    """

    # Imported lazily so that the cropping module is only loaded when crops are requested.
    from sigdetect.cropping import crop_signatures

    resultModel, sourceDictionary = _CoerceFileResult(fileResult)
    cropPaths = crop_signatures(
        pdf_path=Path(pdfPath),
        file_result=resultModel,
        output_dir=Path(outputDirectory),
        dpi=dpi,
    )

    if sourceDictionary is None:
        return cropPaths

    # Mirror the (possibly updated) model back into the caller's dictionary.
    sourceDictionary.clear()
    sourceDictionary.update(resultModel.to_dict())
    return cropPaths
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _CoerceFileResult(
    candidate: FileResult | dict[str, Any]
) -> tuple[FileResult, dict[str, Any] | None]:
    """Normalise ``candidate`` into a :class:`FileResult`.

    Returns ``(candidate, None)`` when a :class:`FileResult` is passed. For a
    ``dict`` a new :class:`FileResult` (with its :class:`Signature` entries)
    is built from the dictionary keys and returned together with the
    original dictionary.

    Raises:
        TypeError: if ``candidate`` is neither a ``FileResult`` nor a ``dict``.
    """

    if isinstance(candidate, FileResult):
        return candidate, None
    if not isinstance(candidate, dict):
        raise TypeError("fileResult must be FileResult or dict")

    def _BuildSignature(record: dict[str, Any]) -> Signature:
        # Missing or falsy values fall back to the defaults spelled out below.
        box = record.get("bounding_box")
        return Signature(
            Page=record.get("page"),
            FieldName=str(record.get("field_name") or ""),
            Role=str(record.get("role") or "unknown"),
            Score=int(record.get("score") or 0),
            Scores=dict(record.get("scores") or {}),
            Evidence=list(record.get("evidence") or []),
            Hint=str(record.get("hint") or ""),
            RenderType=str(record.get("render_type") or "unknown"),
            BoundingBox=tuple(box) if box else None,
            CropPath=record.get("crop_path"),
        )

    rebuilt: list[Signature] = [
        _BuildSignature(record) for record in candidate.get("signatures") or []
    ]

    coerced = FileResult(
        File=str(candidate.get("file") or ""),
        SizeKilobytes=candidate.get("size_kb"),
        PageCount=int(candidate.get("pages") or 0),
        ElectronicSignatureFound=bool(candidate.get("esign_found")),
        ScannedPdf=candidate.get("scanned_pdf"),
        MixedContent=candidate.get("mixed"),
        # A missing or zero count falls back to the number of rebuilt signatures.
        SignatureCount=int(candidate.get("sig_count") or len(rebuilt)),
        SignaturePages=str(candidate.get("sig_pages") or ""),
        Roles=str(candidate.get("roles") or "unknown"),
        Hints=str(candidate.get("hints") or ""),
        Signatures=rebuilt,
    )
    return coerced, candidate
|
sigdetect/cli.py
CHANGED
|
@@ -3,14 +3,16 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import json
|
|
6
|
+
from collections.abc import Iterator
|
|
6
7
|
from dataclasses import asdict, is_dataclass
|
|
7
8
|
from pathlib import Path
|
|
8
9
|
|
|
9
10
|
import typer
|
|
10
11
|
|
|
11
12
|
from . import __version__
|
|
12
|
-
from .config import LoadConfiguration
|
|
13
|
-
from .
|
|
13
|
+
from .config import FinalizeConfiguration, LoadConfiguration
|
|
14
|
+
from .cropping import SignatureCroppingUnavailable, crop_signatures
|
|
15
|
+
from .detector import BuildDetector, FileResult
|
|
14
16
|
from .eda import RunExploratoryAnalysis
|
|
15
17
|
from .logging_setup import ConfigureLogging
|
|
16
18
|
|
|
@@ -31,18 +33,65 @@ def _JsonSerializer(candidate):
|
|
|
31
33
|
return str(candidate)
|
|
32
34
|
|
|
33
35
|
|
|
36
|
+
def _EnumeratePdfs(pdfRoot: Path, recursive: bool) -> Iterator[Path]:
|
|
37
|
+
"""Yield PDF files under ``pdfRoot`` honoring the recursion flag."""
|
|
38
|
+
|
|
39
|
+
iterator = pdfRoot.rglob("*") if recursive else pdfRoot.glob("*")
|
|
40
|
+
for path in iterator:
|
|
41
|
+
if path.is_file() and path.suffix.lower() == ".pdf":
|
|
42
|
+
yield path
|
|
43
|
+
|
|
44
|
+
|
|
34
45
|
@CliApplication.command(name="detect")
|
|
35
46
|
def Detect(
|
|
36
47
|
configurationPath: Path | None = typer.Option(
|
|
37
48
|
None, "--config", "-c", help="Path to YAML config"
|
|
38
49
|
),
|
|
39
50
|
profileOverride: str | None = typer.Option(None, "--profile", "-p", help="hipaa or retainer"),
|
|
51
|
+
recursive: bool = typer.Option(
|
|
52
|
+
True,
|
|
53
|
+
"--recursive/--no-recursive",
|
|
54
|
+
help="Recurse into subdirectories when gathering PDFs",
|
|
55
|
+
),
|
|
56
|
+
cropSignatures: bool | None = typer.Option(
|
|
57
|
+
None,
|
|
58
|
+
"--crop-signatures/--no-crop-signatures",
|
|
59
|
+
help="Crop detected signature regions to PNG files (requires PyMuPDF)",
|
|
60
|
+
show_default=False,
|
|
61
|
+
),
|
|
62
|
+
cropDirectory: Path | None = typer.Option(
|
|
63
|
+
None,
|
|
64
|
+
"--crop-dir",
|
|
65
|
+
help="Directory for signature PNG crops (defaults to out_dir/signature_crops)",
|
|
66
|
+
),
|
|
67
|
+
cropDpi: int | None = typer.Option(
|
|
68
|
+
None,
|
|
69
|
+
"--crop-dpi",
|
|
70
|
+
min=72,
|
|
71
|
+
max=600,
|
|
72
|
+
help="Rendering DPI for signature crops",
|
|
73
|
+
show_default=False,
|
|
74
|
+
),
|
|
40
75
|
) -> None:
|
|
41
76
|
"""Run detection for the configured directory and emit ``results.json``."""
|
|
42
77
|
|
|
43
78
|
configuration = LoadConfiguration(configurationPath)
|
|
44
|
-
if profileOverride
|
|
45
|
-
|
|
79
|
+
if profileOverride is not None:
|
|
80
|
+
normalized_profile = profileOverride.lower()
|
|
81
|
+
if normalized_profile not in {"hipaa", "retainer"}:
|
|
82
|
+
raise typer.BadParameter("Profile must be 'hipaa' or 'retainer'.")
|
|
83
|
+
configuration = configuration.model_copy(update={"Profile": normalized_profile})
|
|
84
|
+
|
|
85
|
+
overrides: dict[str, object] = {}
|
|
86
|
+
if cropSignatures is not None:
|
|
87
|
+
overrides["CropSignatures"] = cropSignatures
|
|
88
|
+
if cropDirectory is not None:
|
|
89
|
+
overrides["CropOutputDirectory"] = cropDirectory
|
|
90
|
+
if cropDpi is not None:
|
|
91
|
+
overrides["CropImageDpi"] = cropDpi
|
|
92
|
+
if overrides:
|
|
93
|
+
configuration = configuration.model_copy(update=overrides)
|
|
94
|
+
configuration = FinalizeConfiguration(configuration)
|
|
46
95
|
|
|
47
96
|
try:
|
|
48
97
|
detector = BuildDetector(configuration)
|
|
@@ -54,26 +103,111 @@ def Detect(
|
|
|
54
103
|
typer.echo(str(exc), err=True)
|
|
55
104
|
raise typer.Exit(code=2) from exc
|
|
56
105
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
106
|
+
pdfIterator = _EnumeratePdfs(configuration.PdfRoot, recursive)
|
|
107
|
+
try:
|
|
108
|
+
firstPdf = next(pdfIterator)
|
|
109
|
+
except StopIteration:
|
|
110
|
+
raise SystemExit(f"No PDFs found in {configuration.PdfRoot}") from None
|
|
111
|
+
|
|
112
|
+
results_buffer: list[FileResult] | None = [] if configuration.OutputDirectory is None else None
|
|
113
|
+
json_handle = None
|
|
114
|
+
json_path: Path | None = None
|
|
115
|
+
wrote_first = False
|
|
116
|
+
|
|
117
|
+
if configuration.OutputDirectory is not None:
|
|
118
|
+
outputDirectory = configuration.OutputDirectory
|
|
119
|
+
outputDirectory.mkdir(parents=True, exist_ok=True)
|
|
120
|
+
json_path = outputDirectory / "results.json"
|
|
121
|
+
json_handle = open(json_path, "w", encoding="utf-8")
|
|
122
|
+
json_handle.write("[")
|
|
123
|
+
|
|
124
|
+
crop_dir = configuration.CropOutputDirectory
|
|
125
|
+
cropping_enabled = configuration.CropSignatures
|
|
126
|
+
cropping_available = True
|
|
127
|
+
cropping_attempted = False
|
|
128
|
+
if configuration.CropSignatures and crop_dir is None:
|
|
129
|
+
Logger.warning(
|
|
130
|
+
"CropSignatures enabled without an output directory",
|
|
131
|
+
extra={"pdf_root": str(configuration.PdfRoot)},
|
|
132
|
+
)
|
|
133
|
+
cropping_enabled = False
|
|
134
|
+
|
|
135
|
+
total_bboxes = 0
|
|
136
|
+
|
|
137
|
+
def _append_result(file_result: FileResult, source_pdf: Path) -> None:
|
|
138
|
+
nonlocal wrote_first, json_handle, total_bboxes, cropping_available, cropping_attempted
|
|
139
|
+
|
|
140
|
+
if cropping_enabled and cropping_available and crop_dir is not None:
|
|
141
|
+
try:
|
|
142
|
+
crop_signatures(
|
|
143
|
+
pdf_path=source_pdf,
|
|
144
|
+
file_result=file_result,
|
|
145
|
+
output_dir=crop_dir,
|
|
146
|
+
dpi=configuration.CropImageDpi,
|
|
147
|
+
logger=Logger,
|
|
148
|
+
)
|
|
149
|
+
cropping_attempted = True
|
|
150
|
+
except SignatureCroppingUnavailable as exc:
|
|
151
|
+
cropping_available = False
|
|
152
|
+
Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
|
|
153
|
+
typer.echo(str(exc), err=True)
|
|
154
|
+
except Exception as exc: # pragma: no cover - defensive
|
|
155
|
+
Logger.warning(
|
|
156
|
+
"Unexpected error while cropping signatures",
|
|
157
|
+
extra={"error": str(exc)},
|
|
158
|
+
)
|
|
159
|
+
|
|
160
|
+
total_bboxes += sum(1 for sig in file_result.Signatures if sig.BoundingBox)
|
|
161
|
+
|
|
162
|
+
if results_buffer is not None:
|
|
163
|
+
results_buffer.append(file_result)
|
|
164
|
+
return
|
|
165
|
+
|
|
166
|
+
if json_handle is None:
|
|
167
|
+
return
|
|
168
|
+
|
|
169
|
+
serialized = json.dumps(
|
|
170
|
+
file_result,
|
|
171
|
+
indent=2,
|
|
172
|
+
ensure_ascii=False,
|
|
173
|
+
default=_JsonSerializer,
|
|
174
|
+
)
|
|
175
|
+
indented = "\n".join(f" {line}" for line in serialized.splitlines())
|
|
176
|
+
if wrote_first:
|
|
177
|
+
json_handle.write(",\n")
|
|
178
|
+
else:
|
|
179
|
+
json_handle.write("\n")
|
|
180
|
+
json_handle.write(indented)
|
|
181
|
+
wrote_first = True
|
|
182
|
+
|
|
183
|
+
def _process(pdf_path: Path) -> None:
|
|
184
|
+
file_result = detector.Detect(pdf_path)
|
|
185
|
+
_append_result(file_result, pdf_path)
|
|
62
186
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
187
|
+
try:
|
|
188
|
+
_process(firstPdf)
|
|
189
|
+
for pdf_path in pdfIterator:
|
|
190
|
+
_process(pdf_path)
|
|
191
|
+
finally:
|
|
192
|
+
if json_handle is not None:
|
|
193
|
+
closing = "\n]\n" if wrote_first else "]\n"
|
|
194
|
+
json_handle.write(closing)
|
|
195
|
+
json_handle.close()
|
|
196
|
+
|
|
197
|
+
if json_handle is not None:
|
|
198
|
+
typer.echo(f"Wrote {json_path}")
|
|
199
|
+
else:
|
|
200
|
+
payload = json.dumps(
|
|
201
|
+
results_buffer or [], indent=2, ensure_ascii=False, default=_JsonSerializer
|
|
202
|
+
)
|
|
66
203
|
typer.echo(payload)
|
|
67
204
|
typer.echo("Detection completed with output disabled (out_dir=none)")
|
|
68
|
-
return
|
|
69
|
-
|
|
70
|
-
outputDirectory = configuration.OutputDirectory
|
|
71
|
-
outputDirectory.mkdir(parents=True, exist_ok=True)
|
|
72
205
|
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
206
|
+
if cropping_enabled and cropping_available and cropping_attempted and total_bboxes == 0:
|
|
207
|
+
Logger.warning(
|
|
208
|
+
"No signature bounding boxes detected; try --engine pymupdf for crop-ready output",
|
|
209
|
+
extra={"engine": configuration.Engine},
|
|
210
|
+
)
|
|
77
211
|
|
|
78
212
|
|
|
79
213
|
@CliApplication.command(name="eda")
|
sigdetect/config.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import os
|
|
6
|
+
from contextlib import suppress
|
|
6
7
|
from pathlib import Path
|
|
7
8
|
from typing import Literal
|
|
8
9
|
|
|
@@ -26,11 +27,13 @@ class DetectConfiguration(BaseModel):
|
|
|
26
27
|
OutputDirectory: Path | None = Field(default=Path("out"), alias="out_dir")
|
|
27
28
|
Engine: EngineName = Field(default="pypdf2", alias="engine")
|
|
28
29
|
Profile: ProfileName = Field(default="hipaa", alias="profile")
|
|
29
|
-
MaxWorkers: int = Field(default=8, alias="max_workers", ge=1, le=64)
|
|
30
30
|
PseudoSignatures: bool = Field(default=True, alias="pseudo_signatures")
|
|
31
31
|
RecurseXObjects: bool = Field(default=True, alias="recurse_xobjects")
|
|
32
|
+
CropSignatures: bool = Field(default=False, alias="crop_signatures")
|
|
33
|
+
CropOutputDirectory: Path | None = Field(default=None, alias="crop_output_dir")
|
|
34
|
+
CropImageDpi: int = Field(default=200, alias="crop_image_dpi", ge=72, le=600)
|
|
32
35
|
|
|
33
|
-
@field_validator("PdfRoot", "OutputDirectory", mode="before")
|
|
36
|
+
@field_validator("PdfRoot", "OutputDirectory", "CropOutputDirectory", mode="before")
|
|
34
37
|
@classmethod
|
|
35
38
|
def _CoercePath(cls, value: str | Path | None) -> Path | None:
|
|
36
39
|
"""Allow configuration values to be provided as ``str`` or ``Path``.
|
|
@@ -42,8 +45,8 @@ class DetectConfiguration(BaseModel):
|
|
|
42
45
|
if value is None:
|
|
43
46
|
return None
|
|
44
47
|
if isinstance(value, Path):
|
|
45
|
-
return value
|
|
46
|
-
return Path(value)
|
|
48
|
+
return value.expanduser()
|
|
49
|
+
return Path(value).expanduser()
|
|
47
50
|
|
|
48
51
|
# Expose legacy snake_case property names for gradual migration
|
|
49
52
|
@property
|
|
@@ -62,10 +65,6 @@ class DetectConfiguration(BaseModel):
|
|
|
62
65
|
def profile(self) -> ProfileName: # pragma: no cover - simple passthrough
|
|
63
66
|
return self.Profile
|
|
64
67
|
|
|
65
|
-
@property
|
|
66
|
-
def max_workers(self) -> int: # pragma: no cover - simple passthrough
|
|
67
|
-
return self.MaxWorkers
|
|
68
|
-
|
|
69
68
|
@property
|
|
70
69
|
def pseudo_signatures(self) -> bool: # pragma: no cover - simple passthrough
|
|
71
70
|
return self.PseudoSignatures
|
|
@@ -74,6 +73,18 @@ class DetectConfiguration(BaseModel):
|
|
|
74
73
|
def recurse_xobjects(self) -> bool: # pragma: no cover - simple passthrough
|
|
75
74
|
return self.RecurseXObjects
|
|
76
75
|
|
|
76
|
+
@property
|
|
77
|
+
def crop_signatures(self) -> bool: # pragma: no cover - simple passthrough
|
|
78
|
+
return self.CropSignatures
|
|
79
|
+
|
|
80
|
+
@property
|
|
81
|
+
def crop_output_dir(self) -> Path | None: # pragma: no cover - simple passthrough
|
|
82
|
+
return self.CropOutputDirectory
|
|
83
|
+
|
|
84
|
+
@property
|
|
85
|
+
def crop_image_dpi(self) -> int: # pragma: no cover - simple passthrough
|
|
86
|
+
return self.CropImageDpi
|
|
87
|
+
|
|
77
88
|
|
|
78
89
|
def LoadConfiguration(path: Path | None) -> DetectConfiguration:
|
|
79
90
|
"""Load configuration from ``path`` while applying environment overrides.
|
|
@@ -94,6 +105,9 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
|
|
|
94
105
|
env_pdf_root = os.getenv("SIGDETECT_PDF_ROOT")
|
|
95
106
|
env_out_dir = os.getenv("SIGDETECT_OUT_DIR")
|
|
96
107
|
env_profile = os.getenv("SIGDETECT_PROFILE")
|
|
108
|
+
env_crop = os.getenv("SIGDETECT_CROP_SIGNATURES")
|
|
109
|
+
env_crop_dir = os.getenv("SIGDETECT_CROP_DIR")
|
|
110
|
+
env_crop_dpi = os.getenv("SIGDETECT_CROP_DPI")
|
|
97
111
|
|
|
98
112
|
raw_data: dict[str, object] = {}
|
|
99
113
|
if path and Path(path).exists():
|
|
@@ -108,10 +122,36 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
|
|
|
108
122
|
raw_data["out_dir"] = None if env_out_dir.lower() == "none" else env_out_dir
|
|
109
123
|
if env_profile in {"hipaa", "retainer"}:
|
|
110
124
|
raw_data["profile"] = env_profile
|
|
125
|
+
if env_crop is not None:
|
|
126
|
+
lowered = env_crop.lower()
|
|
127
|
+
if lowered in {"1", "true", "yes", "on"}:
|
|
128
|
+
raw_data["crop_signatures"] = True
|
|
129
|
+
elif lowered in {"0", "false", "no", "off"}:
|
|
130
|
+
raw_data["crop_signatures"] = False
|
|
131
|
+
if env_crop_dir:
|
|
132
|
+
raw_data["crop_output_dir"] = env_crop_dir
|
|
133
|
+
if env_crop_dpi:
|
|
134
|
+
with suppress(ValueError):
|
|
135
|
+
raw_data["crop_image_dpi"] = int(env_crop_dpi)
|
|
111
136
|
|
|
112
137
|
configuration = DetectConfiguration(**raw_data)
|
|
138
|
+
return FinalizeConfiguration(configuration)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def FinalizeConfiguration(configuration: DetectConfiguration) -> DetectConfiguration:
    """Create the directories the configuration refers to and fill in the crop directory.

    The output directory is created when one is configured. When
    ``CropSignatures`` is enabled the crop directory is resolved (defaulting
    to ``signature_crops`` under the output directory, or under ``PdfRoot``
    when output is disabled), created, and a copy of the configuration
    carrying that directory is returned. Otherwise the configuration object
    is returned as-is.
    """

    if configuration.OutputDirectory is not None:
        configuration.OutputDirectory.mkdir(parents=True, exist_ok=True)

    if not configuration.CropSignatures:
        return configuration

    cropDirectory = configuration.CropOutputDirectory
    if cropDirectory is None:
        parentDirectory = configuration.OutputDirectory or configuration.PdfRoot
        cropDirectory = parentDirectory / "signature_crops"
    cropDirectory.mkdir(parents=True, exist_ok=True)

    return configuration.model_copy(update={"CropOutputDirectory": cropDirectory})
|
sigdetect/cropping.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""Helpers for converting signature bounding boxes into PNG crops."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import re
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from .detector.file_result_model import FileResult
|
|
10
|
+
from .detector.signature_model import Signature
|
|
11
|
+
|
|
12
|
+
try: # pragma: no cover - optional dependency
|
|
13
|
+
import fitz # type: ignore
|
|
14
|
+
except Exception: # pragma: no cover - optional dependency
|
|
15
|
+
fitz = None # type: ignore[misc]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class SignatureCroppingUnavailable(RuntimeError):
    """Signal that PNG crops cannot be produced, for example because PyMuPDF is not installed."""
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def crop_signatures(
    pdf_path: Path,
    file_result: FileResult,
    *,
    output_dir: Path,
    dpi: int = 200,
    logger: logging.Logger | None = None,
) -> list[Path]:
    """Render the bounding box of each signature in ``file_result`` to a PNG via PyMuPDF.

    Crops are written to ``output_dir/<pdf stem>/sig_NN_<slug>.png``, where
    ``NN`` is the 1-based position of the signature in
    ``file_result.Signatures``. Signatures without a bounding box or page are
    skipped, as are pages that fail to load, empty clip rectangles and
    rendering failures (the latter are reported through ``logger`` when one
    is given). Every signature that was written gets its ``CropPath`` set.

    Returns:
        The paths of the PNG files that were written, in signature order.

    Raises:
        SignatureCroppingUnavailable: if PyMuPDF (``fitz``) could not be imported.
    """

    if fitz is None:  # pragma: no cover - exercised when dependency absent
        raise SignatureCroppingUnavailable(
            "PyMuPDF is required for PNG crops. Install 'pymupdf' or 'sigdetect[pymupdf]'."
        )

    def _warn(message: str, **fields: object) -> None:
        # Logging is optional; failures are silently skipped without a logger.
        if logger:
            logger.warning(message, extra=fields)

    pdf_path = Path(pdf_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    written: list[Path] = []

    with fitz.open(pdf_path) as document:  # type: ignore[attr-defined]
        target_dir = output_dir / pdf_path.stem
        target_dir.mkdir(parents=True, exist_ok=True)
        # 72 is the divisor that turns the requested DPI into the render scale.
        zoom = dpi / 72.0
        matrix = fitz.Matrix(zoom, zoom)

        for index, signature in enumerate(file_result.Signatures, start=1):
            if not (signature.BoundingBox and signature.Page):
                continue

            try:
                page = document.load_page(signature.Page - 1)
            except Exception as exc:  # pragma: no cover - defensive
                _warn(
                    "Failed to load page for signature crop",
                    file=pdf_path.name,
                    page=signature.Page,
                    error=str(exc),
                )
                continue

            clip = _to_clip_rect(page, signature.BoundingBox)
            if clip is None:
                continue

            destination = target_dir / _build_filename(index, signature)

            try:
                page.get_pixmap(matrix=matrix, clip=clip, alpha=False).save(destination)
            except Exception as exc:  # pragma: no cover - defensive
                _warn(
                    "Failed to render signature crop",
                    file=pdf_path.name,
                    page=signature.Page,
                    field=signature.FieldName,
                    error=str(exc),
                )
                continue

            signature.CropPath = str(destination)
            written.append(destination)

    return written
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _to_clip_rect(page, bbox: tuple[float, float, float, float]):
    """Convert ``bbox`` into a ``fitz.Rect`` clipped to ``page``, or ``None`` if it is empty.

    The corner order of ``bbox`` does not matter. The vertical coordinates
    are mirrored against the page height before clamping.
    NOTE(review): this presumably maps bottom-left-origin PDF coordinates to
    PyMuPDF's top-left origin - confirm against the code that produces the bounding boxes.
    """

    page_width = float(page.rect.width)
    page_height = float(page.rect.height)
    x_a, y_a, x_b, y_b = bbox

    left = _clamp(min(x_a, x_b), 0.0, page_width)
    right = _clamp(max(x_a, x_b), 0.0, page_width)

    # The larger y value becomes the top edge once the axis is mirrored.
    top = _clamp(page_height - max(y_a, y_b), 0.0, page_height)
    bottom = _clamp(page_height - min(y_a, y_b), 0.0, page_height)

    is_empty = right - left <= 0 or bottom - top <= 0
    return None if is_empty else fitz.Rect(left, top, right, bottom)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _clamp(value: float, lower: float, upper: float) -> float:
|
|
111
|
+
return max(lower, min(value, upper))
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _build_filename(index: int, signature: Signature) -> str:
    """Return the PNG file name for the ``index``-th signature.

    The name is derived from the signature's role, falling back to its field
    name and finally to the literal ``"signature"``.
    """

    base = signature.Role
    if not base:
        base = signature.FieldName
    if not base:
        base = "signature"

    slug = _slugify(base)
    return f"sig_{index:02d}_{slug}.png"
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _slugify(value: str) -> str:
|
|
121
|
+
cleaned = re.sub(r"[^A-Za-z0-9_-]+", "_", value.strip().lower())
|
|
122
|
+
cleaned = cleaned.strip("_")
|
|
123
|
+
return cleaned or "signature"
|
|
@@ -0,0 +1,420 @@
|
|
|
1
|
+
"""PyMuPDF-backed detector that augments PyPDF2 heuristics with geometry."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Iterable, cast
|
|
7
|
+
|
|
8
|
+
from .pypdf2_engine import PyPDF2Detector
|
|
9
|
+
from .signature_model import Signature
|
|
10
|
+
|
|
11
|
+
try: # pragma: no cover - optional dependency
|
|
12
|
+
import fitz # type: ignore
|
|
13
|
+
except Exception: # pragma: no cover - optional dependency
|
|
14
|
+
fitz = None # type: ignore[misc]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PyMuPDFDetector(PyPDF2Detector):
|
|
18
|
+
"""Detector that reuses PyPDF2 heuristics and annotates results via PyMuPDF."""
|
|
19
|
+
|
|
20
|
+
Name = "pymupdf"
|
|
21
|
+
SIGNATURE_PADDING = 64.0
|
|
22
|
+
ROLE_KEYWORDS: dict[str, tuple[str, ...]] = {
|
|
23
|
+
"client": ("client", "consumer", "claimant"),
|
|
24
|
+
"firm": ("firm", "attorney", "attorneys", "counsel", "company", "llp", "llc", "law", "by:"),
|
|
25
|
+
"patient": ("patient", "self", "plaintiff"),
|
|
26
|
+
"representative": ("representative", "guardian", "parent"),
|
|
27
|
+
"attorney": ("attorney", "counsel", "lawyer"),
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
def __init__(self, configuration):
    """Initialise the detector, refusing to start when PyMuPDF is missing.

    Raises:
        ValueError: if the ``fitz`` module could not be imported.
    """
    if fitz is not None:
        super().__init__(configuration)
        return

    raise ValueError(  # pragma: no cover - optional dependency
        "PyMuPDF engine requires the optional 'pymupdf' dependency. Install via 'pip install "
        "sigdetect[pymupdf]' or add pymupdf to your environment."
    )
|
|
37
|
+
|
|
38
|
+
def Detect(self, pdf_path: Path):  # type: ignore[override]
    """Run the inherited detection, then attach bounding boxes using PyMuPDF.

    When PyMuPDF cannot open the file, the inherited result is returned
    without any geometry added.
    """
    detection = super().Detect(pdf_path)

    try:
        pdf_document = fitz.open(str(pdf_path))
    except Exception:  # pragma: no cover - defensive
        return detection
    else:
        with pdf_document:
            # Widget rectangles are applied first; pseudo rectangles only fill what is left.
            widget_map = self._CollectWidgetRects(pdf_document)
            self._ApplyWidgetRects(detection.Signatures, widget_map)
            self._InferPseudoRects(detection.Signatures, pdf_document)

    return detection
|
|
51
|
+
|
|
52
|
+
# ───────────────────────────────── widget helpers ─────────────────────────────────
|
|
53
|
+
def _CollectWidgetRects(
    self, document
) -> dict[tuple[int, str], tuple[float, float, float, float]]:
    """Map ``(1-based page number, field name)`` to the rectangle of each signature widget.

    Widgets without a field name, and widgets whose ``field_type`` is not the
    signature widget type, are ignored. Rectangles are converted with
    ``self._RectToPdfTuple`` using the page height.
    """
    # Falls back to 6 when the fitz module does not expose the constant.
    signature_types = {getattr(fitz, "PDF_WIDGET_TYPE_SIGNATURE", 6)}
    collected: dict[tuple[int, str], tuple[float, float, float, float]] = {}

    for page_number in range(1, document.page_count + 1):
        page = document.load_page(page_number - 1)
        page_widgets = page.widgets() if hasattr(page, "widgets") else None
        if not page_widgets:
            continue

        for widget in page_widgets:
            field_name = (widget.field_name or "").strip()
            if not field_name:
                continue
            # Only widgets reported as signature fields are kept.
            if getattr(widget, "field_type", None) not in signature_types:
                continue
            collected[(page_number, field_name)] = self._RectToPdfTuple(
                widget.rect, page.rect.height
            )

    return collected
|
|
74
|
+
|
|
75
|
+
def _ApplyWidgetRects(
|
|
76
|
+
self,
|
|
77
|
+
signatures: Iterable[Signature],
|
|
78
|
+
widget_map: dict[tuple[int, str], tuple[float, float, float, float]],
|
|
79
|
+
) -> None:
|
|
80
|
+
for signature in signatures:
|
|
81
|
+
if signature.BoundingBox or not signature.FieldName or not signature.Page:
|
|
82
|
+
continue
|
|
83
|
+
key = (signature.Page, signature.FieldName.strip())
|
|
84
|
+
rect = widget_map.get(key)
|
|
85
|
+
if rect:
|
|
86
|
+
signature.BoundingBox = rect
|
|
87
|
+
|
|
88
|
+
# ───────────────────────────── pseudo bbox inference ─────────────────────────────
|
|
89
|
+
def _InferPseudoRects(self, signatures: Iterable[Signature], document) -> None:
|
|
90
|
+
for signature in signatures:
|
|
91
|
+
if signature.BoundingBox or signature.FieldName != "vendor_or_acro_detected":
|
|
92
|
+
continue
|
|
93
|
+
|
|
94
|
+
if signature.Page and signature.Page - 1 >= document.page_count:
|
|
95
|
+
continue
|
|
96
|
+
|
|
97
|
+
if signature.Page:
|
|
98
|
+
candidate_pages = [signature.Page - 1]
|
|
99
|
+
else:
|
|
100
|
+
candidate_pages = list(range(document.page_count - 1, -1, -1))
|
|
101
|
+
|
|
102
|
+
for page_index in candidate_pages:
|
|
103
|
+
if page_index < 0 or page_index >= document.page_count:
|
|
104
|
+
continue
|
|
105
|
+
page = document.load_page(page_index)
|
|
106
|
+
lines = self._ExtractLines(page)
|
|
107
|
+
rect_info = self._FindRoleLineRect(page, signature.Role, lines)
|
|
108
|
+
if rect_info is None:
|
|
109
|
+
rect_info = self._FallbackSignatureRect(page, signature.Role, lines)
|
|
110
|
+
if rect_info is not None:
|
|
111
|
+
rect, exclusion, mode = rect_info
|
|
112
|
+
padded = self._PadRect(rect, page.rect, signature.Role, exclusion, mode)
|
|
113
|
+
signature.BoundingBox = self._RectToPdfTuple(padded, page.rect.height)
|
|
114
|
+
if signature.Page is None:
|
|
115
|
+
signature.Page = page_index + 1
|
|
116
|
+
break
|
|
117
|
+
|
|
118
|
+
def _FindRoleLineRect(
|
|
119
|
+
self,
|
|
120
|
+
page,
|
|
121
|
+
role: str,
|
|
122
|
+
lines: list[dict[str, float | str]] | None = None,
|
|
123
|
+
) -> tuple[fitz.Rect, float | None, str] | None:
|
|
124
|
+
if lines is None:
|
|
125
|
+
lines = self._ExtractLines(page)
|
|
126
|
+
page_height = float(page.rect.height)
|
|
127
|
+
keywords = self.ROLE_KEYWORDS.get(role, ())
|
|
128
|
+
lower_roles = {"client", "firm", "representative", "attorney"}
|
|
129
|
+
if self.Profile == "retainer" and role in {"client", "firm"}:
|
|
130
|
+
min_factor = 0.15 if role == "client" else 0.4
|
|
131
|
+
min_y = page_height * min_factor
|
|
132
|
+
else:
|
|
133
|
+
min_y = page_height * (0.58 if role == "firm" else 0.5) if role in lower_roles else 0.0
|
|
134
|
+
|
|
135
|
+
def match_lines(require_signature: bool) -> list[tuple[int, dict[str, float | str]]]:
|
|
136
|
+
selected: list[tuple[int, dict[str, float | str]]] = []
|
|
137
|
+
for idx, line in enumerate(lines):
|
|
138
|
+
lower = line["lower_text"]
|
|
139
|
+
if lower.strip() == "":
|
|
140
|
+
continue
|
|
141
|
+
if line["y0"] < min_y:
|
|
142
|
+
continue
|
|
143
|
+
if require_signature and "sign" not in lower:
|
|
144
|
+
continue
|
|
145
|
+
if not require_signature and "sign" not in lower:
|
|
146
|
+
if "name" in lower or "print" in lower:
|
|
147
|
+
continue
|
|
148
|
+
if keywords and not any(keyword in lower for keyword in keywords):
|
|
149
|
+
continue
|
|
150
|
+
selected.append((idx, line))
|
|
151
|
+
return selected
|
|
152
|
+
|
|
153
|
+
matches = match_lines(require_signature=True)
|
|
154
|
+
if matches and matches[-1][1]["y0"] < page_height * 0.6:
|
|
155
|
+
matches = []
|
|
156
|
+
if not matches:
|
|
157
|
+
matches = match_lines(require_signature=False)
|
|
158
|
+
|
|
159
|
+
if matches:
|
|
160
|
+
idx, target = matches[-1]
|
|
161
|
+
label_rect = fitz.Rect(target["x0"], target["y0"], target["x1"], target["y1"])
|
|
162
|
+
stroke = self._LocateStrokeLine(lines, idx, label_rect)
|
|
163
|
+
if stroke is not None:
|
|
164
|
+
rect, exclusion = stroke
|
|
165
|
+
return rect, exclusion, "stroke"
|
|
166
|
+
image = self._LocateSignatureImage(page, label_rect)
|
|
167
|
+
if image is not None:
|
|
168
|
+
exclusion = self._NextExclusionY(lines, idx + 1, image.y1)
|
|
169
|
+
return image, exclusion, "image"
|
|
170
|
+
exclusion = self._NextExclusionY(lines, idx + 1, label_rect.y1)
|
|
171
|
+
return label_rect, exclusion, "label"
|
|
172
|
+
return None
|
|
173
|
+
|
|
174
|
+
def _FallbackSignatureRect(
|
|
175
|
+
self,
|
|
176
|
+
page,
|
|
177
|
+
role: str | None = None,
|
|
178
|
+
lines: list[dict[str, float | str]] | None = None,
|
|
179
|
+
) -> tuple[fitz.Rect, float | None, str] | None:
|
|
180
|
+
if lines is None:
|
|
181
|
+
lines = self._ExtractLines(page)
|
|
182
|
+
for idx in range(len(lines) - 1, -1, -1):
|
|
183
|
+
line = lines[idx]
|
|
184
|
+
lower = line["lower_text"]
|
|
185
|
+
if "signature" in lower or "sign" in lower:
|
|
186
|
+
rect = fitz.Rect(line["x0"], line["y0"], line["x1"], line["y1"])
|
|
187
|
+
exclusion = self._NextExclusionY(lines, idx + 1, rect.y1)
|
|
188
|
+
return rect, exclusion, "label"
|
|
189
|
+
if lines:
|
|
190
|
+
line = lines[-1]
|
|
191
|
+
rect = fitz.Rect(line["x0"], line["y0"], line["x1"], line["y1"])
|
|
192
|
+
exclusion = None
|
|
193
|
+
return rect, exclusion, "label"
|
|
194
|
+
return None
|
|
195
|
+
|
|
196
|
+
def _ExtractLines(self, page) -> list[dict[str, float | str]]:
|
|
197
|
+
words = page.get_text("words") or []
|
|
198
|
+
buckets: dict[tuple[int, int], dict[str, object]] = {}
|
|
199
|
+
for x0, y0, x1, y1, text, block, line, *_ in words:
|
|
200
|
+
if not text.strip():
|
|
201
|
+
continue
|
|
202
|
+
key = (int(block), int(line))
|
|
203
|
+
bucket = buckets.setdefault(
|
|
204
|
+
key,
|
|
205
|
+
{
|
|
206
|
+
"tokens": [],
|
|
207
|
+
"x0": float(x0),
|
|
208
|
+
"y0": float(y0),
|
|
209
|
+
"x1": float(x1),
|
|
210
|
+
"y1": float(y1),
|
|
211
|
+
},
|
|
212
|
+
)
|
|
213
|
+
tokens = cast(list[str], bucket["tokens"])
|
|
214
|
+
tokens.append(text)
|
|
215
|
+
bucket["x0"] = min(float(bucket["x0"]), float(x0))
|
|
216
|
+
bucket["y0"] = min(float(bucket["y0"]), float(y0))
|
|
217
|
+
bucket["x1"] = max(float(bucket["x1"]), float(x1))
|
|
218
|
+
bucket["y1"] = max(float(bucket["y1"]), float(y1))
|
|
219
|
+
lines: list[dict[str, float | str]] = []
|
|
220
|
+
for bucket in buckets.values():
|
|
221
|
+
text = " ".join(bucket["tokens"]).strip() # type: ignore[arg-type]
|
|
222
|
+
if not text:
|
|
223
|
+
continue
|
|
224
|
+
lines.append(
|
|
225
|
+
{
|
|
226
|
+
"text": text,
|
|
227
|
+
"lower_text": text.lower(),
|
|
228
|
+
"x0": float(bucket["x0"]),
|
|
229
|
+
"y0": float(bucket["y0"]),
|
|
230
|
+
"x1": float(bucket["x1"]),
|
|
231
|
+
"y1": float(bucket["y1"]),
|
|
232
|
+
}
|
|
233
|
+
)
|
|
234
|
+
lines.sort(key=lambda entry: (entry["y0"], entry["x0"]))
|
|
235
|
+
return lines
|
|
236
|
+
|
|
237
|
+
def _LocateStrokeLine(
|
|
238
|
+
self,
|
|
239
|
+
lines: list[dict[str, float | str]],
|
|
240
|
+
label_index: int,
|
|
241
|
+
label_rect: fitz.Rect,
|
|
242
|
+
) -> tuple[fitz.Rect, float | None] | None:
|
|
243
|
+
for idx in range(label_index - 1, max(label_index - 4, -1), -1):
|
|
244
|
+
lower = lines[idx]["lower_text"]
|
|
245
|
+
if "_" in lower or lower.strip().startswith("x"):
|
|
246
|
+
rect = fitz.Rect(
|
|
247
|
+
lines[idx]["x0"],
|
|
248
|
+
lines[idx]["y0"],
|
|
249
|
+
lines[idx]["x1"],
|
|
250
|
+
lines[idx]["y1"],
|
|
251
|
+
)
|
|
252
|
+
overlap = min(rect.x1, label_rect.x1) - max(rect.x0, label_rect.x0)
|
|
253
|
+
if overlap <= 0:
|
|
254
|
+
continue
|
|
255
|
+
# Keep crops below the label text.
|
|
256
|
+
return rect, label_rect.y0
|
|
257
|
+
return None
|
|
258
|
+
|
|
259
|
+
def _LocateSignatureImage(self, page, label_rect: fitz.Rect) -> fitz.Rect | None:
|
|
260
|
+
candidates: list[tuple[float, fitz.Rect]] = []
|
|
261
|
+
label_mid_x = (label_rect.x0 + label_rect.x1) / 2.0
|
|
262
|
+
for image in page.get_images(full=True):
|
|
263
|
+
bbox = page.get_image_bbox(image)
|
|
264
|
+
if bbox is None:
|
|
265
|
+
continue
|
|
266
|
+
width = float(bbox.width)
|
|
267
|
+
height = float(bbox.height)
|
|
268
|
+
if width < 40.0 or height < 12.0:
|
|
269
|
+
continue
|
|
270
|
+
if width > 380.0 or height > 220.0:
|
|
271
|
+
continue
|
|
272
|
+
# Require the image to sit near the label horizontally and vertically.
|
|
273
|
+
horiz_overlap = min(bbox.x1, label_rect.x1 + 220.0) - max(bbox.x0, label_rect.x0 - 40.0)
|
|
274
|
+
if horiz_overlap <= 0:
|
|
275
|
+
continue
|
|
276
|
+
vertical_gap = abs(((bbox.y0 + bbox.y1) / 2.0) - label_rect.y0)
|
|
277
|
+
if vertical_gap > 220.0:
|
|
278
|
+
continue
|
|
279
|
+
candidates.append((vertical_gap + abs(((bbox.x0 + bbox.x1) / 2.0) - label_mid_x), bbox))
|
|
280
|
+
|
|
281
|
+
if not candidates:
|
|
282
|
+
return None
|
|
283
|
+
candidates.sort(key=lambda item: item[0])
|
|
284
|
+
return candidates[0][1]
|
|
285
|
+
|
|
286
|
+
def _NextExclusionY(
|
|
287
|
+
self,
|
|
288
|
+
lines: list[dict[str, float | str]],
|
|
289
|
+
start_index: int,
|
|
290
|
+
minimum_y: float | None = None,
|
|
291
|
+
) -> float | None:
|
|
292
|
+
threshold = (minimum_y or -float("inf")) + 1.0
|
|
293
|
+
for line in lines[start_index:]:
|
|
294
|
+
y0 = float(line["y0"])
|
|
295
|
+
if y0 <= threshold:
|
|
296
|
+
continue
|
|
297
|
+
lower = line["lower_text"]
|
|
298
|
+
if any(token in lower for token in ("name", "print", "date", "by:")):
|
|
299
|
+
return y0
|
|
300
|
+
return None
|
|
301
|
+
|
|
302
|
+
def _RectToPdfTuple(self, rect, page_height: float) -> tuple[float, float, float, float]:
|
|
303
|
+
x0 = float(rect.x0)
|
|
304
|
+
x1 = float(rect.x1)
|
|
305
|
+
y0 = page_height - float(rect.y1)
|
|
306
|
+
y1 = page_height - float(rect.y0)
|
|
307
|
+
if x1 < x0:
|
|
308
|
+
x0, x1 = x1, x0
|
|
309
|
+
if y1 < y0:
|
|
310
|
+
y0, y1 = y1, y0
|
|
311
|
+
return (x0, y0, x1, y1)
|
|
312
|
+
|
|
313
|
+
def _PadRect(
    self,
    rect,
    page_rect,
    role: str | None = None,
    exclusion_y0: float | None = None,
    mode: str = "label",
):
    """Return a region focused on the expected signature line beneath ``rect``.

    Args:
        rect: Anchor rectangle; its ``x0``/``y0``/``x1``/``y1``/``width``/``height``
            are read.
        page_rect: Page rectangle used to clamp the result.
        role: Signer role; raises the minimum crop height for some roles and,
            under the ``retainer`` profile, overrides the horizontal padding
            for ``client`` and ``firm``.
        exclusion_y0: Optional y-coordinate the crop's bottom must stay 4 units
            short of.
        mode: ``"stroke"``, ``"image"``, or anything else (treated as a label).

    Returns:
        A ``fitz.Rect`` at most 198 units wide.  The arithmetic adds heights to
        ``top`` to get ``bottom``, so y presumably grows downward -- confirm
        against the caller.

    NOTE(review): indentation was reconstructed from a flattened diff view;
    confirm the nesting of the clamping branches against the packaged source.
    """

    max_width = 198.0  # 2.75 inches
    max_height = 72.0  # 1 inch

    # --- horizontal extent: mode-dependent padding, clamped to the page ---
    pad_x = max(12.0, float(rect.width) * 0.08)
    if mode == "stroke":
        left = max(page_rect.x0, rect.x0 - 8.0)
        right = min(page_rect.x1, rect.x1 + 8.0)
    elif mode == "image":
        left = max(page_rect.x0, rect.x0 - 10.0)
        right = min(page_rect.x1, rect.x1 + 10.0)
    else:
        left = max(page_rect.x0, rect.x0 - pad_x)
        right = min(page_rect.x1, rect.x1 + pad_x)

    # Retainer profile overrides the padding for client/firm image and label crops.
    if self.Profile == "retainer" and role == "client" and mode in {"image", "label"}:
        left = max(page_rect.x0, rect.x0 - 12.0)
        right = min(page_rect.x1, rect.x1 + 16.0)
    elif self.Profile == "retainer" and role == "firm" and mode in {"image", "label"}:
        left = max(page_rect.x0, rect.x0 - 14.0)
        right = min(page_rect.x1, rect.x1 + 18.0)

    # Too wide: strokes keep their left edge, other modes shrink around the centre.
    if right - left > max_width:
        if mode == "stroke":
            right = min(page_rect.x1, left + max_width)
        else:
            center = (left + right) / 2.0
            half = max_width / 2.0
            left = center - half
            right = center + half
            # Slide the window back inside the page if centring pushed it out.
            if left < page_rect.x0:
                right += page_rect.x0 - left
                left = page_rect.x0
            if right > page_rect.x1:
                left -= right - page_rect.x1
                right = page_rect.x1
        left = max(page_rect.x0, left)
        right = min(page_rect.x1, right)

    # --- vertical extent ---
    # A zero-height rect falls back to 12.0; the line height is never below 8.0.
    line_height = max(8.0, float(rect.height) or 12.0)
    signature_height = max(40.0, line_height * 2.2)
    # Role-specific minimum heights, capped at max_height just below.
    if role == "client":
        signature_height = max(signature_height, 65.0)
    elif role == "firm":
        signature_height = max(signature_height, 60.0)
    elif role in {"representative", "patient", "attorney"}:
        signature_height = max(signature_height, 55.0)
    signature_height = min(signature_height, max_height)

    baseline = float(rect.y1)

    if mode == "stroke":
        # Wrap the stroke rect with a margin on both sides.
        margin_above = max(6.0, line_height)
        margin_below = max(18.0, line_height * 1.5)
        top = float(rect.y0) - margin_above
        bottom = float(rect.y1) + margin_below
        signature_height = min(bottom - top, max_height)
    elif mode == "image":
        # Height follows the image (plus 18.0), bounded to [40.0, max_height].
        image_height = float(rect.height) or 12.0
        signature_height = min(max_height, max(image_height + 18.0, 40.0))
        extra = max(0.0, signature_height - image_height)
        top = float(rect.y0) - min(extra * 0.25, 12.0)
        bottom = top + signature_height
        # The top may sit at most 2.0 before rect.y0; bottom is recomputed to match.
        top = max(float(rect.y0) - 2.0, top)
        bottom = top + signature_height
    else:
        # Label mode: the region starts a gap past the label's y1.
        gap_above = max(10.0, min(24.0, line_height * 0.9))
        top = baseline + gap_above
        bottom = top + signature_height

    original_top = top

    # Keep the bottom 4.0 short of the exclusion line; top never moves before original_top.
    if exclusion_y0 is not None:
        limited = exclusion_y0 - 4.0
        if bottom > limited:
            bottom = limited
            top = max(original_top, bottom - signature_height)
    # Image crops may extend at most 24.0 past the image's y1.
    if mode == "image":
        limit_below = float(rect.y1) + 24.0
        if bottom > limit_below:
            bottom = limit_below
            top = max(float(rect.y0) - 4.0, bottom - signature_height)

    if bottom - top > max_height:
        bottom = top + max_height
    signature_height = min(signature_height, max_height)

    # Clamp to the page's bottom edge.
    if bottom > page_rect.y1:
        bottom = page_rect.y1
        top = max(original_top, bottom - signature_height)

    if bottom - top > max_height:
        bottom = top + max_height

    # Degenerate result (clamping collapsed the region): rebuild around the baseline.
    if top >= bottom:
        top = max(page_rect.y0, baseline - line_height)
        bottom = min(page_rect.y1, top + min(signature_height, max_height))

    return fitz.Rect(left, top, right, bottom)
|
|
@@ -212,7 +212,9 @@ class PyPDF2Detector(Detector):
|
|
|
212
212
|
hits.add(f"VendorText:{rx.pattern}")
|
|
213
213
|
return hits
|
|
214
214
|
|
|
215
|
-
def _ScanPageVendors(self, page) -> set[str]:
|
|
215
|
+
def _ScanPageVendors(self, page) -> tuple[set[str], str]:
|
|
216
|
+
"""Return vendor hits along with the extracted page text."""
|
|
217
|
+
|
|
216
218
|
found: set[str] = set()
|
|
217
219
|
|
|
218
220
|
with _QuietIo():
|
|
@@ -234,7 +236,7 @@ class PyPDF2Detector(Detector):
|
|
|
234
236
|
if rx.search(txt):
|
|
235
237
|
found.add(f"VendorText:{rx.pattern}")
|
|
236
238
|
|
|
237
|
-
return found
|
|
239
|
+
return found, txt
|
|
238
240
|
|
|
239
241
|
def _IterateFormXObjects(self, page) -> Iterator[generic.DictionaryObject]:
|
|
240
242
|
"""Yield Form XObject dictionaries recursively from page resources."""
|
|
@@ -438,6 +440,40 @@ class PyPDF2Detector(Detector):
|
|
|
438
440
|
nm = GetFieldNameFromAncestry(wdict)
|
|
439
441
|
return "" if nm is None else str(nm)
|
|
440
442
|
|
|
443
|
+
def _WidgetBoundingBox(
|
|
444
|
+
self, wdict: generic.DictionaryObject
|
|
445
|
+
) -> tuple[float, float, float, float] | None:
|
|
446
|
+
"""Return the widget's ``/Rect`` coordinates normalized as (x0, y0, x1, y1)."""
|
|
447
|
+
|
|
448
|
+
rect = self._RectToTuple(wdict.get("/Rect"))
|
|
449
|
+
if rect:
|
|
450
|
+
return rect
|
|
451
|
+
parent = AsDictionary(wdict.get("/Parent"))
|
|
452
|
+
if isinstance(parent, generic.DictionaryObject):
|
|
453
|
+
return self._RectToTuple(parent.get("/Rect"))
|
|
454
|
+
return None
|
|
455
|
+
|
|
456
|
+
def _RectToTuple(self, candidate) -> tuple[float, float, float, float] | None:
|
|
457
|
+
if candidate is None:
|
|
458
|
+
return None
|
|
459
|
+
if isinstance(candidate, generic.IndirectObject):
|
|
460
|
+
with suppress(Exception):
|
|
461
|
+
candidate = candidate.get_object()
|
|
462
|
+
if isinstance(candidate, generic.ArrayObject) and len(candidate) == 4:
|
|
463
|
+
coords: list[float] = []
|
|
464
|
+
for item in candidate:
|
|
465
|
+
try:
|
|
466
|
+
coords.append(float(item))
|
|
467
|
+
except Exception:
|
|
468
|
+
return None
|
|
469
|
+
x0, y0, x1, y1 = coords
|
|
470
|
+
if x1 < x0:
|
|
471
|
+
x0, x1 = x1, x0
|
|
472
|
+
if y1 < y0:
|
|
473
|
+
y0, y1 = y1, y0
|
|
474
|
+
return x0, y0, x1, y1
|
|
475
|
+
return None
|
|
476
|
+
|
|
441
477
|
@staticmethod
|
|
442
478
|
def _PickNameAny(d: generic.DictionaryObject) -> str | None:
|
|
443
479
|
for key in ("/T", "/TU", "/TM"):
|
|
@@ -685,7 +721,7 @@ class PyPDF2Detector(Detector):
|
|
|
685
721
|
|
|
686
722
|
for page in reader.pages:
|
|
687
723
|
# per-page vendor
|
|
688
|
-
pv = self._ScanPageVendors(page)
|
|
724
|
+
pv, page_text = self._ScanPageVendors(page)
|
|
689
725
|
x_hits: set[str] = set()
|
|
690
726
|
x_text = ""
|
|
691
727
|
if self.RecurseXObjects:
|
|
@@ -693,12 +729,10 @@ class PyPDF2Detector(Detector):
|
|
|
693
729
|
vendor_hints |= pv | x_hits
|
|
694
730
|
vendor_hits_per_page.append(len(pv) + len(x_hits))
|
|
695
731
|
|
|
696
|
-
with _QuietIo():
|
|
697
|
-
txt = page.extract_text() or ""
|
|
698
732
|
if x_text:
|
|
699
|
-
|
|
700
|
-
page_texts.append(
|
|
701
|
-
any_text = any_text or bool(
|
|
733
|
+
page_text = f"{page_text} {x_text}".strip() if page_text else x_text.strip()
|
|
734
|
+
page_texts.append(page_text)
|
|
735
|
+
any_text = any_text or bool(page_text)
|
|
702
736
|
|
|
703
737
|
# image counting
|
|
704
738
|
img_count = 0
|
|
@@ -760,6 +794,7 @@ class PyPDF2Detector(Detector):
|
|
|
760
794
|
field_name = self._FieldNameForWidget(wdict)
|
|
761
795
|
page_obj = reader.pages[idx - 1] if 0 <= (idx - 1) < len(reader.pages) else None
|
|
762
796
|
render_type = self._ClassifyAppearance(wdict, page_obj)
|
|
797
|
+
bounding_box = self._WidgetBoundingBox(wdict)
|
|
763
798
|
|
|
764
799
|
# de-dup by object ref (if present) and (page, name)
|
|
765
800
|
if isinstance(ref, generic.IndirectObject):
|
|
@@ -801,6 +836,7 @@ class PyPDF2Detector(Detector):
|
|
|
801
836
|
Evidence=evidence,
|
|
802
837
|
Hint=(f"AcroSig:{field_name}" if field_name else "AcroSig"),
|
|
803
838
|
RenderType=render_type,
|
|
839
|
+
BoundingBox=bounding_box,
|
|
804
840
|
)
|
|
805
841
|
)
|
|
806
842
|
|
|
@@ -969,6 +1005,7 @@ class PyPDF2Detector(Detector):
|
|
|
969
1005
|
field_name = self._FieldNameForWidget(wdict)
|
|
970
1006
|
page_obj = reader.pages[idx - 1] if 0 <= (idx - 1) < len(reader.pages) else None
|
|
971
1007
|
render_type = self._ClassifyAppearance(wdict, page_obj)
|
|
1008
|
+
bounding_box = self._WidgetBoundingBox(wdict)
|
|
972
1009
|
|
|
973
1010
|
# de-dup by object ref (if present) and (page, name)
|
|
974
1011
|
if isinstance(ref, generic.IndirectObject):
|
|
@@ -995,6 +1032,7 @@ class PyPDF2Detector(Detector):
|
|
|
995
1032
|
Evidence=evidence,
|
|
996
1033
|
Hint=(f"AcroSig:{field_name}" if field_name else "AcroSig"),
|
|
997
1034
|
RenderType=render_type,
|
|
1035
|
+
BoundingBox=bounding_box,
|
|
998
1036
|
)
|
|
999
1037
|
)
|
|
1000
1038
|
|
|
@@ -18,6 +18,8 @@ class Signature:
|
|
|
18
18
|
Evidence: list[str]
|
|
19
19
|
Hint: str
|
|
20
20
|
RenderType: str = "unknown"
|
|
21
|
+
BoundingBox: tuple[float, float, float, float] | None = None
|
|
22
|
+
CropPath: str | None = None
|
|
21
23
|
|
|
22
24
|
def to_dict(self) -> dict[str, Any]:
|
|
23
25
|
"""Return the legacy snake_case representation used in JSON payloads."""
|
|
@@ -31,4 +33,6 @@ class Signature:
|
|
|
31
33
|
"evidence": list(self.Evidence),
|
|
32
34
|
"hint": self.Hint,
|
|
33
35
|
"render_type": self.RenderType,
|
|
36
|
+
"bounding_box": list(self.BoundingBox) if self.BoundingBox else None,
|
|
37
|
+
"crop_path": self.CropPath,
|
|
34
38
|
}
|
|
@@ -1,13 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sigdetect
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Signature detection and role attribution for PDFs
|
|
5
5
|
Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
|
|
6
6
|
License: MIT
|
|
7
7
|
Requires-Python: >=3.9
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
9
|
Requires-Dist: pypdf>=4.0.0
|
|
10
|
-
Requires-Dist: pandas>=2.0
|
|
11
10
|
Requires-Dist: rich>=13.0
|
|
12
11
|
Requires-Dist: typer>=0.12
|
|
13
12
|
Requires-Dist: pydantic>=2.5
|
|
@@ -102,6 +101,8 @@ sigdetect detect \
|
|
|
102
101
|
- `--profile` selects tuned role logic:
|
|
103
102
|
- `hipaa` → patient / representative / attorney
|
|
104
103
|
- `retainer` → client / firm (prefers detecting two signatures)
|
|
104
|
+
- `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
|
|
105
|
+
- `--crop-signatures` enables PNG crops for each detected widget (requires installing the optional `pymupdf` dependency). Use `--crop-dir` to override the destination and `--crop-dpi` to choose rendering quality.
|
|
105
106
|
- If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
|
|
106
107
|
|
|
107
108
|
### EDA (quick aggregate stats)
|
|
@@ -135,7 +136,7 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
|
135
136
|
print(result.to_dict())
|
|
136
137
|
~~~
|
|
137
138
|
|
|
138
|
-
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)).
|
|
139
|
+
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image.
|
|
139
140
|
|
|
140
141
|
---
|
|
141
142
|
|
|
@@ -146,7 +147,17 @@ Import from `sigdetect.api` and get plain dicts out (JSON-ready),
|
|
|
146
147
|
with no I/O side effects by default:
|
|
147
148
|
|
|
148
149
|
~~~python
|
|
149
|
-
from
|
|
150
|
+
from pathlib import Path
|
|
151
|
+
|
|
152
|
+
from sigdetect.api import (
|
|
153
|
+
CropSignatureImages,
|
|
154
|
+
DetectMany,
|
|
155
|
+
DetectPdf,
|
|
156
|
+
ScanDirectory,
|
|
157
|
+
ToCsvRow,
|
|
158
|
+
Version,
|
|
159
|
+
get_detector,
|
|
160
|
+
)
|
|
150
161
|
|
|
151
162
|
print("sigdetect", Version())
|
|
152
163
|
|
|
@@ -178,6 +189,15 @@ for res in ScanDirectory(
|
|
|
178
189
|
# store in DB, print, etc.
|
|
179
190
|
pass
|
|
180
191
|
|
|
192
|
+
# 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
|
|
193
|
+
detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
|
|
194
|
+
file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
195
|
+
CropSignatureImages(
|
|
196
|
+
"/path/to/pdfs/example.pdf",
|
|
197
|
+
file_result,
|
|
198
|
+
outputDirectory="./signature_crops",
|
|
199
|
+
dpi=200,
|
|
200
|
+
)
|
|
181
201
|
~~~
|
|
182
202
|
|
|
183
203
|
|
|
@@ -205,7 +225,10 @@ High-level summary (per file):
|
|
|
205
225
|
"score": 5,
|
|
206
226
|
"scores": { "field": 3, "page_label": 2 },
|
|
207
227
|
"evidence": ["field:patient", "page_label:patient"],
|
|
208
|
-
"hint": "AcroSig:sig_patient"
|
|
228
|
+
"hint": "AcroSig:sig_patient",
|
|
229
|
+
"render_type": "typed",
|
|
230
|
+
"bounding_box": [10.0, 10.0, 150.0, 40.0],
|
|
231
|
+
"crop_path": "signature_crops/example/sig_01_patient.png"
|
|
209
232
|
},
|
|
210
233
|
{
|
|
211
234
|
"page": null,
|
|
@@ -214,7 +237,10 @@ High-level summary (per file):
|
|
|
214
237
|
"score": 6,
|
|
215
238
|
"scores": { "page_label": 4, "general": 2 },
|
|
216
239
|
"evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
|
|
217
|
-
"hint": "VendorOrAcroOnly"
|
|
240
|
+
"hint": "VendorOrAcroOnly",
|
|
241
|
+
"render_type": "unknown",
|
|
242
|
+
"bounding_box": null,
|
|
243
|
+
"crop_path": null
|
|
218
244
|
}
|
|
219
245
|
]
|
|
220
246
|
}
|
|
@@ -227,6 +253,8 @@ High-level summary (per file):
|
|
|
227
253
|
- **`mixed`** means both `esign_found` and `scanned_pdf` are `true`.
|
|
228
254
|
- **`roles`** summarizes unique non-`unknown` roles across signatures.
|
|
229
255
|
- In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
|
|
256
|
+
- **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
|
|
257
|
+
- **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
|
|
230
258
|
|
|
231
259
|
---
|
|
232
260
|
|
|
@@ -252,6 +280,9 @@ engine: pypdf2
|
|
|
252
280
|
pseudo_signatures: true
|
|
253
281
|
recurse_xobjects: true
|
|
254
282
|
profile: retainer # or: hipaa
|
|
283
|
+
crop_signatures: false # enable to write PNG crops (requires pymupdf)
|
|
284
|
+
# crop_output_dir: ./signature_crops
|
|
285
|
+
crop_image_dpi: 200
|
|
255
286
|
~~~
|
|
256
287
|
|
|
257
288
|
YAML files can be customized or loaded at runtime (see CLI `--config`, if available, or import and pass patterns into the engine).
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
sigdetect/__init__.py,sha256=LhY78mDZ1ClYVNTxW_qtE-vqJoN9N7N5ZcNRDUI_3ss,575
|
|
2
|
-
sigdetect/api.py,sha256=
|
|
3
|
-
sigdetect/cli.py,sha256=
|
|
4
|
-
sigdetect/config.py,sha256=
|
|
2
|
+
sigdetect/api.py,sha256=F7bM0ctYmtczjqSbsl7MkUZQ28wkRnLAYt1WxfCtzk4,8518
|
|
3
|
+
sigdetect/cli.py,sha256=NctAnaB-TQrUAT9m-v8kj2_KTNs88kbFOCiX32tHZm8,7920
|
|
4
|
+
sigdetect/config.py,sha256=S0NVKuJYiHJCocL-VNFdGJpasFcjTecavC4EthyS1DQ,5951
|
|
5
|
+
sigdetect/cropping.py,sha256=89xPwXhWkJC5E0oW2e3_fDyERH5YGqyt4q4B-HSld4o,4084
|
|
5
6
|
sigdetect/eda.py,sha256=S92G1Gjmepri__D0n_V6foq0lQgH-RXI9anW8A58jfw,4681
|
|
6
7
|
sigdetect/logging_setup.py,sha256=LMF8ao_a-JwH0S522T6aYTFX3e8Ajjv_5ODS2YiBcHA,6404
|
|
7
8
|
sigdetect/utils.py,sha256=T9rubLf5T9JmjOHYMOba1j34fhOJaWocAXccnGTxRUE,5198
|
|
@@ -12,11 +13,11 @@ sigdetect/detector/__init__.py,sha256=up2FCmD09f2bRHcS4WbY-clx3GQbWuk1PM2JlxgusH
|
|
|
12
13
|
sigdetect/detector/base.py,sha256=L-iXWXqsTetDc4jRZo_wOdbNpKqOY20mX9FefrugdT0,263
|
|
13
14
|
sigdetect/detector/base_detector.py,sha256=GmAgUWO_fQgIfnihZSoyhR3wpnwZ-X3hS0Kuyz4G6Ys,608
|
|
14
15
|
sigdetect/detector/file_result_model.py,sha256=j2gTc9Sw3fJOHlexYsR_m5DiwHA8DzIzAMToESfvo4A,1767
|
|
15
|
-
sigdetect/detector/pymupdf_engine.py,sha256=
|
|
16
|
-
sigdetect/detector/pypdf2_engine.py,sha256=
|
|
17
|
-
sigdetect/detector/signature_model.py,sha256=
|
|
18
|
-
sigdetect-0.
|
|
19
|
-
sigdetect-0.
|
|
20
|
-
sigdetect-0.
|
|
21
|
-
sigdetect-0.
|
|
22
|
-
sigdetect-0.
|
|
16
|
+
sigdetect/detector/pymupdf_engine.py,sha256=iyp7JuPlUnydwohH5zbNg4MwH44mBmxbBWOS3ZmArBo,17339
|
|
17
|
+
sigdetect/detector/pypdf2_engine.py,sha256=INWQH06kMLvto2VS-EdLC-EtMC6AG7JmdVYmNgx6_RU,47313
|
|
18
|
+
sigdetect/detector/signature_model.py,sha256=mztb9V5wgv2oohQ5Cxzcv8_Bo6TyWAVIXteaeQ2rywQ,1076
|
|
19
|
+
sigdetect-0.2.0.dist-info/METADATA,sha256=HzF-CmGBs48_Cqv9Dv9AdXo_UoztA-tLPxVMN1fXOH0,11866
|
|
20
|
+
sigdetect-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
21
|
+
sigdetect-0.2.0.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
|
|
22
|
+
sigdetect-0.2.0.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
|
|
23
|
+
sigdetect-0.2.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|