sigdetect 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sigdetect/api.py +162 -14
- sigdetect/cli.py +154 -20
- sigdetect/config.py +49 -9
- sigdetect/cropping.py +177 -0
- sigdetect/detector/pymupdf_engine.py +420 -0
- sigdetect/detector/pypdf2_engine.py +46 -8
- sigdetect/detector/signature_model.py +4 -0
- {sigdetect-0.1.1.dist-info → sigdetect-0.3.0.dist-info}/METADATA +44 -6
- {sigdetect-0.1.1.dist-info → sigdetect-0.3.0.dist-info}/RECORD +12 -11
- {sigdetect-0.1.1.dist-info → sigdetect-0.3.0.dist-info}/WHEEL +0 -0
- {sigdetect-0.1.1.dist-info → sigdetect-0.3.0.dist-info}/entry_points.txt +0 -0
- {sigdetect-0.1.1.dist-info → sigdetect-0.3.0.dist-info}/top_level.txt +0 -0
sigdetect/api.py
CHANGED
|
@@ -2,11 +2,13 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
from contextlib import contextmanager
|
|
5
6
|
from pathlib import Path
|
|
6
|
-
from typing import Any, Iterable, Iterator, Literal
|
|
7
|
+
from typing import Any, Generator, Iterable, Iterator, Literal, overload
|
|
7
8
|
|
|
8
9
|
from sigdetect.config import DetectConfiguration
|
|
9
|
-
from sigdetect.
|
|
10
|
+
from sigdetect.cropping import SignatureCrop
|
|
11
|
+
from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
|
|
10
12
|
|
|
11
13
|
EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
|
|
12
14
|
ProfileName = Literal["hipaa", "retainer"]
|
|
@@ -19,23 +21,44 @@ def DetectPdf(
|
|
|
19
21
|
engineName: EngineName = "pypdf2",
|
|
20
22
|
includePseudoSignatures: bool = True,
|
|
21
23
|
recurseXObjects: bool = True,
|
|
24
|
+
detector: Detector | None = None,
|
|
22
25
|
) -> dict[str, Any]:
|
|
23
26
|
"""Detect signature evidence and assign roles for a single PDF."""
|
|
24
27
|
|
|
25
28
|
resolvedPath = Path(pdfPath)
|
|
29
|
+
activeDetector = detector or get_detector(
|
|
30
|
+
pdfRoot=resolvedPath.parent,
|
|
31
|
+
profileName=profileName,
|
|
32
|
+
engineName=engineName,
|
|
33
|
+
includePseudoSignatures=includePseudoSignatures,
|
|
34
|
+
recurseXObjects=recurseXObjects,
|
|
35
|
+
outputDirectory=None,
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
result = activeDetector.Detect(resolvedPath)
|
|
39
|
+
return _ToPlainDictionary(result)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def get_detector(
|
|
43
|
+
*,
|
|
44
|
+
pdfRoot: str | Path | None = None,
|
|
45
|
+
profileName: ProfileName = "hipaa",
|
|
46
|
+
engineName: EngineName = "pypdf2",
|
|
47
|
+
includePseudoSignatures: bool = True,
|
|
48
|
+
recurseXObjects: bool = True,
|
|
49
|
+
outputDirectory: str | Path | None = None,
|
|
50
|
+
) -> Detector:
|
|
51
|
+
"""Return a reusable detector instance configured with the supplied options."""
|
|
26
52
|
|
|
27
53
|
configuration = DetectConfiguration(
|
|
28
|
-
PdfRoot=
|
|
29
|
-
OutputDirectory=None,
|
|
54
|
+
PdfRoot=Path(pdfRoot) if pdfRoot is not None else Path.cwd(),
|
|
55
|
+
OutputDirectory=Path(outputDirectory) if outputDirectory is not None else None,
|
|
30
56
|
Engine=engineName,
|
|
31
57
|
PseudoSignatures=includePseudoSignatures,
|
|
32
58
|
RecurseXObjects=recurseXObjects,
|
|
33
59
|
Profile=profileName,
|
|
34
60
|
)
|
|
35
|
-
|
|
36
|
-
detector = BuildDetector(configuration)
|
|
37
|
-
result = detector.Detect(resolvedPath)
|
|
38
|
-
return _ToPlainDictionary(result)
|
|
61
|
+
return BuildDetector(configuration)
|
|
39
62
|
|
|
40
63
|
|
|
41
64
|
def _ToPlainDictionary(candidate: Any) -> dict[str, Any]:
|
|
@@ -84,10 +107,17 @@ def _ToPlainValue(value: Any) -> Any:
|
|
|
84
107
|
|
|
85
108
|
def DetectMany(
    pdfPaths: Iterable[str | Path],
    *,
    detector: Detector | None = None,
    **kwargs: Any,
) -> Iterator[dict[str, Any]]:
    """Lazily yield one detection dictionary per entry of ``pdfPaths``.

    With ``detector`` supplied, that instance is reused for every path and
    ``**kwargs`` is not consulted. Otherwise each path is delegated to
    :func:`DetectPdf` together with ``**kwargs``.
    """

    if detector is None:
        # No shared detector: DetectPdf resolves one per call from kwargs.
        yield from (DetectPdf(candidatePath, **kwargs) for candidatePath in pdfPaths)
    else:
        yield from (
            _DetectWithDetector(detector, candidatePath) for candidatePath in pdfPaths
        )
|
|
93
123
|
|
|
@@ -96,19 +126,24 @@ def ScanDirectory(
|
|
|
96
126
|
pdfRoot: str | Path,
|
|
97
127
|
*,
|
|
98
128
|
globPattern: str = "**/*.pdf",
|
|
129
|
+
detector: Detector | None = None,
|
|
99
130
|
**kwargs: Any,
|
|
100
131
|
) -> Iterator[dict[str, Any]]:
|
|
101
132
|
"""Walk ``pdfRoot`` and yield detection output for every matching PDF."""
|
|
102
133
|
|
|
103
134
|
rootDirectory = Path(pdfRoot)
|
|
104
|
-
|
|
105
|
-
rootDirectory.rglob(
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
135
|
+
if globPattern == "**/*.pdf":
|
|
136
|
+
iterator = (path for path in rootDirectory.rglob("*") if path.is_file())
|
|
137
|
+
else:
|
|
138
|
+
iterator = (
|
|
139
|
+
rootDirectory.rglob(globPattern.replace("**/", "", 1))
|
|
140
|
+
if globPattern.startswith("**/")
|
|
141
|
+
else rootDirectory.glob(globPattern)
|
|
142
|
+
)
|
|
143
|
+
|
|
109
144
|
for pdfPath in iterator:
|
|
110
145
|
if pdfPath.is_file() and pdfPath.suffix.lower() == ".pdf":
|
|
111
|
-
yield DetectPdf(pdfPath, **kwargs)
|
|
146
|
+
yield DetectPdf(pdfPath, detector=detector, **kwargs)
|
|
112
147
|
|
|
113
148
|
|
|
114
149
|
def ToCsvRow(result: dict[str, Any]) -> dict[str, Any]:
|
|
@@ -137,3 +172,116 @@ def Version() -> str:
|
|
|
137
172
|
return resolveVersion("sigdetect")
|
|
138
173
|
except Exception:
|
|
139
174
|
return "0.0.0-dev"
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _DetectWithDetector(detector: Detector, pdfPath: str | Path) -> dict[str, Any]:
    """Run ``detector`` against ``pdfPath`` and return the result as a plain dictionary."""

    # Accept ``str`` as well as ``Path``; the detector is always handed a ``Path``.
    detectionResult = detector.Detect(Path(pdfPath))
    return _ToPlainDictionary(detectionResult)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
@contextmanager
def detector_context(**kwargs: Any) -> Generator[Detector, None, None]:
    """Yield a detector built by :func:`get_detector` from ``**kwargs``.

    No teardown runs when the ``with`` block exits; the context manager form
    only scopes the detector's use.
    """

    yield get_detector(**kwargs)
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
@overload
def CropSignatureImages(
    pdfPath: str | Path,
    fileResult: FileResult | dict[str, Any],
    *,
    outputDirectory: str | Path,
    dpi: int = 200,
    returnBytes: Literal[False] = False,
) -> list[Path]: ...


@overload
def CropSignatureImages(
    pdfPath: str | Path,
    fileResult: FileResult | dict[str, Any],
    *,
    outputDirectory: str | Path,
    # Keyword-only, so a default is legal ahead of the required ``returnBytes``;
    # it mirrors the implementation's default so ``dpi`` may be omitted here too.
    dpi: int = 200,
    returnBytes: Literal[True],
) -> list[SignatureCrop]: ...


def CropSignatureImages(
    pdfPath: str | Path,
    fileResult: FileResult | dict[str, Any],
    *,
    outputDirectory: str | Path,
    dpi: int = 200,
    returnBytes: bool = False,
) -> list[Path] | list[SignatureCrop]:
    """Crop detected signature regions to PNG files.

    Accepts either a :class:`FileResult` instance or the ``dict`` returned by
    :func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.
    Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop.

    When ``fileResult`` is a ``dict`` it is modified in place: after cropping,
    its contents are cleared and replaced with ``to_dict()`` of the
    :class:`FileResult` that was rebuilt from it and handed to the cropper.
    If cropping raises, the dictionary is left untouched.

    Parameters:
        pdfPath: PDF to render crops from.
        fileResult: Detection result describing the signatures to crop.
        outputDirectory: Directory passed to the cropper as its output location.
        dpi: Rendering resolution forwarded to the cropper.
        returnBytes: Forwarded to the cropper; selects the element type of the
            returned list (``Path`` when false, :class:`SignatureCrop` when true).

    Raises:
        TypeError: If ``fileResult`` is neither a :class:`FileResult` nor a ``dict``.
    """

    # Imported lazily so that importing this module does not pull in the
    # cropping module (and its optional dependency) until cropping is requested.
    from sigdetect.cropping import crop_signatures

    file_result_obj, original_dict = _CoerceFileResult(fileResult)
    crops = crop_signatures(
        pdf_path=Path(pdfPath),
        file_result=file_result_obj,
        output_dir=Path(outputDirectory),
        dpi=dpi,
        return_bytes=returnBytes,
    )
    if original_dict is not None:
        # Mirror the post-crop state of the rebuilt FileResult back into the
        # caller's dictionary so both representations stay in sync.
        original_dict.clear()
        original_dict.update(file_result_obj.to_dict())
    return crops
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def _CoerceFileResult(
    candidate: FileResult | dict[str, Any]
) -> tuple[FileResult, dict[str, Any] | None]:
    """Normalise ``candidate`` to a :class:`FileResult`.

    Returns a pair ``(file_result, source_dict)``. ``source_dict`` is ``None``
    when ``candidate`` already was a :class:`FileResult`; otherwise it is the
    very dictionary that was passed in, so callers can write changes back.
    Missing or falsy dictionary values fall back to the defaults spelled out
    below. Raises :class:`TypeError` for any other input type.
    """

    if isinstance(candidate, FileResult):
        return candidate, None
    if not isinstance(candidate, dict):
        raise TypeError("fileResult must be FileResult or dict")

    def _SignatureFromEntry(entry: dict[str, Any]) -> Signature:
        """Rebuild one :class:`Signature` from its dictionary form."""
        boundingBox = entry.get("bounding_box")
        return Signature(
            Page=entry.get("page"),
            FieldName=str(entry.get("field_name") or ""),
            Role=str(entry.get("role") or "unknown"),
            Score=int(entry.get("score") or 0),
            Scores=dict(entry.get("scores") or {}),
            Evidence=list(entry.get("evidence") or []),
            Hint=str(entry.get("hint") or ""),
            RenderType=str(entry.get("render_type") or "unknown"),
            # An empty or missing box is stored as ``None`` rather than ``()``.
            BoundingBox=tuple(boundingBox) if boundingBox else None,
            CropPath=entry.get("crop_path"),
        )

    rebuiltSignatures: list[Signature] = [
        _SignatureFromEntry(entry) for entry in candidate.get("signatures") or []
    ]

    rebuiltResult = FileResult(
        File=str(candidate.get("file") or ""),
        SizeKilobytes=candidate.get("size_kb"),
        PageCount=int(candidate.get("pages") or 0),
        ElectronicSignatureFound=bool(candidate.get("esign_found")),
        ScannedPdf=candidate.get("scanned_pdf"),
        MixedContent=candidate.get("mixed"),
        # A missing or zero ``sig_count`` falls back to the number of rebuilt signatures.
        SignatureCount=int(candidate.get("sig_count") or len(rebuiltSignatures)),
        SignaturePages=str(candidate.get("sig_pages") or ""),
        Roles=str(candidate.get("roles") or "unknown"),
        Hints=str(candidate.get("hints") or ""),
        Signatures=rebuiltSignatures,
    )
    return rebuiltResult, candidate
|
sigdetect/cli.py
CHANGED
|
@@ -3,14 +3,16 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import json
|
|
6
|
+
from collections.abc import Iterator
|
|
6
7
|
from dataclasses import asdict, is_dataclass
|
|
7
8
|
from pathlib import Path
|
|
8
9
|
|
|
9
10
|
import typer
|
|
10
11
|
|
|
11
12
|
from . import __version__
|
|
12
|
-
from .config import LoadConfiguration
|
|
13
|
-
from .
|
|
13
|
+
from .config import FinalizeConfiguration, LoadConfiguration
|
|
14
|
+
from .cropping import SignatureCroppingUnavailable, crop_signatures
|
|
15
|
+
from .detector import BuildDetector, FileResult
|
|
14
16
|
from .eda import RunExploratoryAnalysis
|
|
15
17
|
from .logging_setup import ConfigureLogging
|
|
16
18
|
|
|
@@ -31,18 +33,65 @@ def _JsonSerializer(candidate):
|
|
|
31
33
|
return str(candidate)
|
|
32
34
|
|
|
33
35
|
|
|
36
|
+
def _EnumeratePdfs(pdfRoot: Path, recursive: bool) -> Iterator[Path]:
|
|
37
|
+
"""Yield PDF files under ``pdfRoot`` honoring the recursion flag."""
|
|
38
|
+
|
|
39
|
+
iterator = pdfRoot.rglob("*") if recursive else pdfRoot.glob("*")
|
|
40
|
+
for path in iterator:
|
|
41
|
+
if path.is_file() and path.suffix.lower() == ".pdf":
|
|
42
|
+
yield path
|
|
43
|
+
|
|
44
|
+
|
|
34
45
|
@CliApplication.command(name="detect")
|
|
35
46
|
def Detect(
|
|
36
47
|
configurationPath: Path | None = typer.Option(
|
|
37
48
|
None, "--config", "-c", help="Path to YAML config"
|
|
38
49
|
),
|
|
39
50
|
profileOverride: str | None = typer.Option(None, "--profile", "-p", help="hipaa or retainer"),
|
|
51
|
+
recursive: bool = typer.Option(
|
|
52
|
+
True,
|
|
53
|
+
"--recursive/--no-recursive",
|
|
54
|
+
help="Recurse into subdirectories when gathering PDFs",
|
|
55
|
+
),
|
|
56
|
+
cropSignatures: bool | None = typer.Option(
|
|
57
|
+
None,
|
|
58
|
+
"--crop-signatures/--no-crop-signatures",
|
|
59
|
+
help="Crop detected signature regions to PNG files (requires PyMuPDF)",
|
|
60
|
+
show_default=False,
|
|
61
|
+
),
|
|
62
|
+
cropDirectory: Path | None = typer.Option(
|
|
63
|
+
None,
|
|
64
|
+
"--crop-dir",
|
|
65
|
+
help="Directory for signature PNG crops (defaults to out_dir/signature_crops)",
|
|
66
|
+
),
|
|
67
|
+
cropDpi: int | None = typer.Option(
|
|
68
|
+
None,
|
|
69
|
+
"--crop-dpi",
|
|
70
|
+
min=72,
|
|
71
|
+
max=600,
|
|
72
|
+
help="Rendering DPI for signature crops",
|
|
73
|
+
show_default=False,
|
|
74
|
+
),
|
|
40
75
|
) -> None:
|
|
41
76
|
"""Run detection for the configured directory and emit ``results.json``."""
|
|
42
77
|
|
|
43
78
|
configuration = LoadConfiguration(configurationPath)
|
|
44
|
-
if profileOverride
|
|
45
|
-
|
|
79
|
+
if profileOverride is not None:
|
|
80
|
+
normalized_profile = profileOverride.lower()
|
|
81
|
+
if normalized_profile not in {"hipaa", "retainer"}:
|
|
82
|
+
raise typer.BadParameter("Profile must be 'hipaa' or 'retainer'.")
|
|
83
|
+
configuration = configuration.model_copy(update={"Profile": normalized_profile})
|
|
84
|
+
|
|
85
|
+
overrides: dict[str, object] = {}
|
|
86
|
+
if cropSignatures is not None:
|
|
87
|
+
overrides["CropSignatures"] = cropSignatures
|
|
88
|
+
if cropDirectory is not None:
|
|
89
|
+
overrides["CropOutputDirectory"] = cropDirectory
|
|
90
|
+
if cropDpi is not None:
|
|
91
|
+
overrides["CropImageDpi"] = cropDpi
|
|
92
|
+
if overrides:
|
|
93
|
+
configuration = configuration.model_copy(update=overrides)
|
|
94
|
+
configuration = FinalizeConfiguration(configuration)
|
|
46
95
|
|
|
47
96
|
try:
|
|
48
97
|
detector = BuildDetector(configuration)
|
|
@@ -54,26 +103,111 @@ def Detect(
|
|
|
54
103
|
typer.echo(str(exc), err=True)
|
|
55
104
|
raise typer.Exit(code=2) from exc
|
|
56
105
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
106
|
+
pdfIterator = _EnumeratePdfs(configuration.PdfRoot, recursive)
|
|
107
|
+
try:
|
|
108
|
+
firstPdf = next(pdfIterator)
|
|
109
|
+
except StopIteration:
|
|
110
|
+
raise SystemExit(f"No PDFs found in {configuration.PdfRoot}") from None
|
|
111
|
+
|
|
112
|
+
results_buffer: list[FileResult] | None = [] if configuration.OutputDirectory is None else None
|
|
113
|
+
json_handle = None
|
|
114
|
+
json_path: Path | None = None
|
|
115
|
+
wrote_first = False
|
|
116
|
+
|
|
117
|
+
if configuration.OutputDirectory is not None:
|
|
118
|
+
outputDirectory = configuration.OutputDirectory
|
|
119
|
+
outputDirectory.mkdir(parents=True, exist_ok=True)
|
|
120
|
+
json_path = outputDirectory / "results.json"
|
|
121
|
+
json_handle = open(json_path, "w", encoding="utf-8")
|
|
122
|
+
json_handle.write("[")
|
|
123
|
+
|
|
124
|
+
crop_dir = configuration.CropOutputDirectory
|
|
125
|
+
cropping_enabled = configuration.CropSignatures
|
|
126
|
+
cropping_available = True
|
|
127
|
+
cropping_attempted = False
|
|
128
|
+
if configuration.CropSignatures and crop_dir is None:
|
|
129
|
+
Logger.warning(
|
|
130
|
+
"CropSignatures enabled without an output directory",
|
|
131
|
+
extra={"pdf_root": str(configuration.PdfRoot)},
|
|
132
|
+
)
|
|
133
|
+
cropping_enabled = False
|
|
134
|
+
|
|
135
|
+
total_bboxes = 0
|
|
136
|
+
|
|
137
|
+
def _append_result(file_result: FileResult, source_pdf: Path) -> None:
|
|
138
|
+
nonlocal wrote_first, json_handle, total_bboxes, cropping_available, cropping_attempted
|
|
139
|
+
|
|
140
|
+
if cropping_enabled and cropping_available and crop_dir is not None:
|
|
141
|
+
try:
|
|
142
|
+
crop_signatures(
|
|
143
|
+
pdf_path=source_pdf,
|
|
144
|
+
file_result=file_result,
|
|
145
|
+
output_dir=crop_dir,
|
|
146
|
+
dpi=configuration.CropImageDpi,
|
|
147
|
+
logger=Logger,
|
|
148
|
+
)
|
|
149
|
+
cropping_attempted = True
|
|
150
|
+
except SignatureCroppingUnavailable as exc:
|
|
151
|
+
cropping_available = False
|
|
152
|
+
Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
|
|
153
|
+
typer.echo(str(exc), err=True)
|
|
154
|
+
except Exception as exc: # pragma: no cover - defensive
|
|
155
|
+
Logger.warning(
|
|
156
|
+
"Unexpected error while cropping signatures",
|
|
157
|
+
extra={"error": str(exc)},
|
|
158
|
+
)
|
|
159
|
+
|
|
160
|
+
total_bboxes += sum(1 for sig in file_result.Signatures if sig.BoundingBox)
|
|
161
|
+
|
|
162
|
+
if results_buffer is not None:
|
|
163
|
+
results_buffer.append(file_result)
|
|
164
|
+
return
|
|
165
|
+
|
|
166
|
+
if json_handle is None:
|
|
167
|
+
return
|
|
168
|
+
|
|
169
|
+
serialized = json.dumps(
|
|
170
|
+
file_result,
|
|
171
|
+
indent=2,
|
|
172
|
+
ensure_ascii=False,
|
|
173
|
+
default=_JsonSerializer,
|
|
174
|
+
)
|
|
175
|
+
indented = "\n".join(f" {line}" for line in serialized.splitlines())
|
|
176
|
+
if wrote_first:
|
|
177
|
+
json_handle.write(",\n")
|
|
178
|
+
else:
|
|
179
|
+
json_handle.write("\n")
|
|
180
|
+
json_handle.write(indented)
|
|
181
|
+
wrote_first = True
|
|
182
|
+
|
|
183
|
+
def _process(pdf_path: Path) -> None:
|
|
184
|
+
file_result = detector.Detect(pdf_path)
|
|
185
|
+
_append_result(file_result, pdf_path)
|
|
62
186
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
187
|
+
try:
|
|
188
|
+
_process(firstPdf)
|
|
189
|
+
for pdf_path in pdfIterator:
|
|
190
|
+
_process(pdf_path)
|
|
191
|
+
finally:
|
|
192
|
+
if json_handle is not None:
|
|
193
|
+
closing = "\n]\n" if wrote_first else "]\n"
|
|
194
|
+
json_handle.write(closing)
|
|
195
|
+
json_handle.close()
|
|
196
|
+
|
|
197
|
+
if json_handle is not None:
|
|
198
|
+
typer.echo(f"Wrote {json_path}")
|
|
199
|
+
else:
|
|
200
|
+
payload = json.dumps(
|
|
201
|
+
results_buffer or [], indent=2, ensure_ascii=False, default=_JsonSerializer
|
|
202
|
+
)
|
|
66
203
|
typer.echo(payload)
|
|
67
204
|
typer.echo("Detection completed with output disabled (out_dir=none)")
|
|
68
|
-
return
|
|
69
|
-
|
|
70
|
-
outputDirectory = configuration.OutputDirectory
|
|
71
|
-
outputDirectory.mkdir(parents=True, exist_ok=True)
|
|
72
205
|
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
206
|
+
if cropping_enabled and cropping_available and cropping_attempted and total_bboxes == 0:
|
|
207
|
+
Logger.warning(
|
|
208
|
+
"No signature bounding boxes detected; try --engine pymupdf for crop-ready output",
|
|
209
|
+
extra={"engine": configuration.Engine},
|
|
210
|
+
)
|
|
77
211
|
|
|
78
212
|
|
|
79
213
|
@CliApplication.command(name="eda")
|
sigdetect/config.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import os
|
|
6
|
+
from contextlib import suppress
|
|
6
7
|
from pathlib import Path
|
|
7
8
|
from typing import Literal
|
|
8
9
|
|
|
@@ -26,11 +27,13 @@ class DetectConfiguration(BaseModel):
|
|
|
26
27
|
OutputDirectory: Path | None = Field(default=Path("out"), alias="out_dir")
|
|
27
28
|
Engine: EngineName = Field(default="pypdf2", alias="engine")
|
|
28
29
|
Profile: ProfileName = Field(default="hipaa", alias="profile")
|
|
29
|
-
MaxWorkers: int = Field(default=8, alias="max_workers", ge=1, le=64)
|
|
30
30
|
PseudoSignatures: bool = Field(default=True, alias="pseudo_signatures")
|
|
31
31
|
RecurseXObjects: bool = Field(default=True, alias="recurse_xobjects")
|
|
32
|
+
CropSignatures: bool = Field(default=False, alias="crop_signatures")
|
|
33
|
+
CropOutputDirectory: Path | None = Field(default=None, alias="crop_output_dir")
|
|
34
|
+
CropImageDpi: int = Field(default=200, alias="crop_image_dpi", ge=72, le=600)
|
|
32
35
|
|
|
33
|
-
@field_validator("PdfRoot", "OutputDirectory", mode="before")
|
|
36
|
+
@field_validator("PdfRoot", "OutputDirectory", "CropOutputDirectory", mode="before")
|
|
34
37
|
@classmethod
|
|
35
38
|
def _CoercePath(cls, value: str | Path | None) -> Path | None:
|
|
36
39
|
"""Allow configuration values to be provided as ``str`` or ``Path``.
|
|
@@ -42,8 +45,8 @@ class DetectConfiguration(BaseModel):
|
|
|
42
45
|
if value is None:
|
|
43
46
|
return None
|
|
44
47
|
if isinstance(value, Path):
|
|
45
|
-
return value
|
|
46
|
-
return Path(value)
|
|
48
|
+
return value.expanduser()
|
|
49
|
+
return Path(value).expanduser()
|
|
47
50
|
|
|
48
51
|
# Expose legacy snake_case property names for gradual migration
|
|
49
52
|
@property
|
|
@@ -62,10 +65,6 @@ class DetectConfiguration(BaseModel):
|
|
|
62
65
|
def profile(self) -> ProfileName: # pragma: no cover - simple passthrough
|
|
63
66
|
return self.Profile
|
|
64
67
|
|
|
65
|
-
@property
|
|
66
|
-
def max_workers(self) -> int: # pragma: no cover - simple passthrough
|
|
67
|
-
return self.MaxWorkers
|
|
68
|
-
|
|
69
68
|
@property
|
|
70
69
|
def pseudo_signatures(self) -> bool: # pragma: no cover - simple passthrough
|
|
71
70
|
return self.PseudoSignatures
|
|
@@ -74,6 +73,18 @@ class DetectConfiguration(BaseModel):
|
|
|
74
73
|
def recurse_xobjects(self) -> bool: # pragma: no cover - simple passthrough
|
|
75
74
|
return self.RecurseXObjects
|
|
76
75
|
|
|
76
|
+
@property
|
|
77
|
+
def crop_signatures(self) -> bool: # pragma: no cover - simple passthrough
|
|
78
|
+
return self.CropSignatures
|
|
79
|
+
|
|
80
|
+
@property
|
|
81
|
+
def crop_output_dir(self) -> Path | None: # pragma: no cover - simple passthrough
|
|
82
|
+
return self.CropOutputDirectory
|
|
83
|
+
|
|
84
|
+
@property
|
|
85
|
+
def crop_image_dpi(self) -> int: # pragma: no cover - simple passthrough
|
|
86
|
+
return self.CropImageDpi
|
|
87
|
+
|
|
77
88
|
|
|
78
89
|
def LoadConfiguration(path: Path | None) -> DetectConfiguration:
|
|
79
90
|
"""Load configuration from ``path`` while applying environment overrides.
|
|
@@ -94,6 +105,9 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
|
|
|
94
105
|
env_pdf_root = os.getenv("SIGDETECT_PDF_ROOT")
|
|
95
106
|
env_out_dir = os.getenv("SIGDETECT_OUT_DIR")
|
|
96
107
|
env_profile = os.getenv("SIGDETECT_PROFILE")
|
|
108
|
+
env_crop = os.getenv("SIGDETECT_CROP_SIGNATURES")
|
|
109
|
+
env_crop_dir = os.getenv("SIGDETECT_CROP_DIR")
|
|
110
|
+
env_crop_dpi = os.getenv("SIGDETECT_CROP_DPI")
|
|
97
111
|
|
|
98
112
|
raw_data: dict[str, object] = {}
|
|
99
113
|
if path and Path(path).exists():
|
|
@@ -108,10 +122,36 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
|
|
|
108
122
|
raw_data["out_dir"] = None if env_out_dir.lower() == "none" else env_out_dir
|
|
109
123
|
if env_profile in {"hipaa", "retainer"}:
|
|
110
124
|
raw_data["profile"] = env_profile
|
|
125
|
+
if env_crop is not None:
|
|
126
|
+
lowered = env_crop.lower()
|
|
127
|
+
if lowered in {"1", "true", "yes", "on"}:
|
|
128
|
+
raw_data["crop_signatures"] = True
|
|
129
|
+
elif lowered in {"0", "false", "no", "off"}:
|
|
130
|
+
raw_data["crop_signatures"] = False
|
|
131
|
+
if env_crop_dir:
|
|
132
|
+
raw_data["crop_output_dir"] = env_crop_dir
|
|
133
|
+
if env_crop_dpi:
|
|
134
|
+
with suppress(ValueError):
|
|
135
|
+
raw_data["crop_image_dpi"] = int(env_crop_dpi)
|
|
111
136
|
|
|
112
137
|
configuration = DetectConfiguration(**raw_data)
|
|
138
|
+
return FinalizeConfiguration(configuration)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def FinalizeConfiguration(configuration: DetectConfiguration) -> DetectConfiguration:
    """Create the configured directories and fill in the crop directory default.

    The output directory, when set, is created. When signature cropping is
    enabled, the crop directory defaults to ``signature_crops`` under the
    output directory (or under the PDF root when output is disabled), is
    created, and is recorded on a copy of the configuration. With cropping
    disabled the configuration object is returned unchanged.
    """

    outputDirectory = configuration.OutputDirectory
    if outputDirectory is not None:
        outputDirectory.mkdir(parents=True, exist_ok=True)

    if not configuration.CropSignatures:
        return configuration

    cropDirectory = configuration.CropOutputDirectory
    if cropDirectory is None:
        # No explicit crop directory: derive one next to the other outputs.
        cropDirectory = (outputDirectory or configuration.PdfRoot) / "signature_crops"
    cropDirectory.mkdir(parents=True, exist_ok=True)
    return configuration.model_copy(update={"CropOutputDirectory": cropDirectory})
|