sigdetect 0.3.1__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sigdetect/__init__.py +1 -1
- sigdetect/api.py +43 -11
- sigdetect/cli.py +89 -23
- sigdetect/config.py +48 -3
- sigdetect/cropping.py +72 -12
- sigdetect/detector/__init__.py +27 -8
- sigdetect/detector/pymupdf_engine.py +3 -2
- sigdetect/detector/pypdf2_engine.py +7 -5
- sigdetect/detector/signature_model.py +3 -1
- sigdetect/wet_detection.py +549 -0
- {sigdetect-0.3.1.dist-info → sigdetect-0.5.0.dist-info}/METADATA +28 -25
- sigdetect-0.5.0.dist-info/RECORD +24 -0
- {sigdetect-0.3.1.dist-info → sigdetect-0.5.0.dist-info}/WHEEL +1 -1
- sigdetect-0.3.1.dist-info/RECORD +0 -23
- {sigdetect-0.3.1.dist-info → sigdetect-0.5.0.dist-info}/entry_points.txt +0 -0
- {sigdetect-0.3.1.dist-info → sigdetect-0.5.0.dist-info}/top_level.txt +0 -0
sigdetect/__init__.py
CHANGED
sigdetect/api.py
CHANGED
|
@@ -9,8 +9,9 @@ from typing import Any, Generator, Iterable, Iterator, Literal, overload
|
|
|
9
9
|
from sigdetect.config import DetectConfiguration
|
|
10
10
|
from sigdetect.cropping import SignatureCrop
|
|
11
11
|
from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
|
|
12
|
+
from sigdetect.wet_detection import apply_wet_detection
|
|
12
13
|
|
|
13
|
-
EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
|
|
14
|
+
EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
|
|
14
15
|
ProfileName = Literal["hipaa", "retainer"]
|
|
15
16
|
|
|
16
17
|
|
|
@@ -18,12 +19,16 @@ def DetectPdf(
|
|
|
18
19
|
pdfPath: str | Path,
|
|
19
20
|
*,
|
|
20
21
|
profileName: ProfileName = "hipaa",
|
|
21
|
-
engineName: EngineName = "
|
|
22
|
+
engineName: EngineName = "auto",
|
|
22
23
|
includePseudoSignatures: bool = True,
|
|
23
24
|
recurseXObjects: bool = True,
|
|
25
|
+
runWetDetection: bool = True,
|
|
24
26
|
detector: Detector | None = None,
|
|
25
27
|
) -> dict[str, Any]:
|
|
26
|
-
"""Detect signature evidence and assign roles for a single PDF.
|
|
28
|
+
"""Detect signature evidence and assign roles for a single PDF.
|
|
29
|
+
|
|
30
|
+
Wet detection runs by default for non-e-sign PDFs; pass ``runWetDetection=False`` to skip OCR.
|
|
31
|
+
"""
|
|
27
32
|
|
|
28
33
|
resolvedPath = Path(pdfPath)
|
|
29
34
|
activeDetector = detector or get_detector(
|
|
@@ -36,6 +41,10 @@ def DetectPdf(
|
|
|
36
41
|
)
|
|
37
42
|
|
|
38
43
|
result = activeDetector.Detect(resolvedPath)
|
|
44
|
+
if runWetDetection:
|
|
45
|
+
configuration = _ResolveConfiguration(activeDetector)
|
|
46
|
+
if configuration is not None:
|
|
47
|
+
apply_wet_detection(resolvedPath, configuration, result)
|
|
39
48
|
return _ToPlainDictionary(result)
|
|
40
49
|
|
|
41
50
|
|
|
@@ -43,12 +52,15 @@ def get_detector(
|
|
|
43
52
|
*,
|
|
44
53
|
pdfRoot: str | Path | None = None,
|
|
45
54
|
profileName: ProfileName = "hipaa",
|
|
46
|
-
engineName: EngineName = "
|
|
55
|
+
engineName: EngineName = "auto",
|
|
47
56
|
includePseudoSignatures: bool = True,
|
|
48
57
|
recurseXObjects: bool = True,
|
|
49
58
|
outputDirectory: str | Path | None = None,
|
|
50
59
|
) -> Detector:
|
|
51
|
-
"""Return a reusable detector instance configured with the supplied options.
|
|
60
|
+
"""Return a reusable detector instance configured with the supplied options.
|
|
61
|
+
|
|
62
|
+
Engine selection is forced to ``auto`` (prefers PyMuPDF when available).
|
|
63
|
+
"""
|
|
52
64
|
|
|
53
65
|
configuration = DetectConfiguration(
|
|
54
66
|
PdfRoot=Path(pdfRoot) if pdfRoot is not None else Path.cwd(),
|
|
@@ -108,6 +120,7 @@ def _ToPlainValue(value: Any) -> Any:
|
|
|
108
120
|
def DetectMany(
|
|
109
121
|
pdfPaths: Iterable[str | Path],
|
|
110
122
|
*,
|
|
123
|
+
runWetDetection: bool = True,
|
|
111
124
|
detector: Detector | None = None,
|
|
112
125
|
**kwargs: Any,
|
|
113
126
|
) -> Iterator[dict[str, Any]]:
|
|
@@ -115,17 +128,18 @@ def DetectMany(
|
|
|
115
128
|
|
|
116
129
|
if detector is not None:
|
|
117
130
|
for pdfPath in pdfPaths:
|
|
118
|
-
yield _DetectWithDetector(detector, pdfPath)
|
|
131
|
+
yield _DetectWithDetector(detector, pdfPath, runWetDetection=runWetDetection)
|
|
119
132
|
return
|
|
120
133
|
|
|
121
134
|
for pdfPath in pdfPaths:
|
|
122
|
-
yield DetectPdf(pdfPath, **kwargs)
|
|
135
|
+
yield DetectPdf(pdfPath, runWetDetection=runWetDetection, **kwargs)
|
|
123
136
|
|
|
124
137
|
|
|
125
138
|
def ScanDirectory(
|
|
126
139
|
pdfRoot: str | Path,
|
|
127
140
|
*,
|
|
128
141
|
globPattern: str = "**/*.pdf",
|
|
142
|
+
runWetDetection: bool = True,
|
|
129
143
|
detector: Detector | None = None,
|
|
130
144
|
**kwargs: Any,
|
|
131
145
|
) -> Iterator[dict[str, Any]]:
|
|
@@ -143,7 +157,7 @@ def ScanDirectory(
|
|
|
143
157
|
|
|
144
158
|
for pdfPath in iterator:
|
|
145
159
|
if pdfPath.is_file() and pdfPath.suffix.lower() == ".pdf":
|
|
146
|
-
yield DetectPdf(pdfPath, detector=detector, **kwargs)
|
|
160
|
+
yield DetectPdf(pdfPath, detector=detector, runWetDetection=runWetDetection, **kwargs)
|
|
147
161
|
|
|
148
162
|
|
|
149
163
|
def ToCsvRow(result: dict[str, Any]) -> dict[str, Any]:
|
|
@@ -174,11 +188,25 @@ def Version() -> str:
|
|
|
174
188
|
return "0.0.0-dev"
|
|
175
189
|
|
|
176
190
|
|
|
177
|
-
def _DetectWithDetector(
|
|
191
|
+
def _DetectWithDetector(
|
|
192
|
+
detector: Detector, pdfPath: str | Path, *, runWetDetection: bool
|
|
193
|
+
) -> dict[str, Any]:
|
|
178
194
|
"""Helper that runs ``detector`` and returns the plain dictionary result."""
|
|
179
195
|
|
|
180
196
|
resolvedPath = Path(pdfPath)
|
|
181
|
-
|
|
197
|
+
result = detector.Detect(resolvedPath)
|
|
198
|
+
if runWetDetection:
|
|
199
|
+
configuration = _ResolveConfiguration(detector)
|
|
200
|
+
if configuration is not None:
|
|
201
|
+
apply_wet_detection(resolvedPath, configuration, result)
|
|
202
|
+
return _ToPlainDictionary(result)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _ResolveConfiguration(detector: Detector) -> DetectConfiguration | None:
|
|
206
|
+
configuration = getattr(detector, "Configuration", None)
|
|
207
|
+
if isinstance(configuration, DetectConfiguration):
|
|
208
|
+
return configuration
|
|
209
|
+
return None
|
|
182
210
|
|
|
183
211
|
|
|
184
212
|
@contextmanager
|
|
@@ -225,12 +253,15 @@ def CropSignatureImages(
|
|
|
225
253
|
returnBytes: bool = False,
|
|
226
254
|
saveToDisk: bool = True,
|
|
227
255
|
) -> list[Path] | list[SignatureCrop]:
|
|
228
|
-
"""
|
|
256
|
+
"""Create DOCX files containing cropped signature images.
|
|
229
257
|
|
|
230
258
|
Accepts either a :class:`FileResult` instance or the ``dict`` returned by
|
|
231
259
|
:func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.
|
|
232
260
|
Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop. Set
|
|
233
261
|
``saveToDisk=False`` to skip writing PNG files while still returning in-memory data.
|
|
262
|
+
When ``saveToDisk`` is enabled, a one-image DOCX file is also written per crop. When
|
|
263
|
+
``returnBytes`` is True and ``python-docx`` is available, the returned
|
|
264
|
+
:class:`SignatureCrop` objects include ``docx_bytes``.
|
|
234
265
|
"""
|
|
235
266
|
|
|
236
267
|
from sigdetect.cropping import crop_signatures
|
|
@@ -273,6 +304,7 @@ def _CoerceFileResult(
|
|
|
273
304
|
RenderType=str(entry.get("render_type") or "unknown"),
|
|
274
305
|
BoundingBox=tuple(bbox) if bbox else None,
|
|
275
306
|
CropPath=entry.get("crop_path"),
|
|
307
|
+
CropBytes=entry.get("crop_bytes"),
|
|
276
308
|
)
|
|
277
309
|
)
|
|
278
310
|
|
sigdetect/cli.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import base64
|
|
5
6
|
import json
|
|
6
7
|
from collections.abc import Iterator
|
|
7
8
|
from dataclasses import asdict, is_dataclass
|
|
@@ -15,6 +16,7 @@ from .cropping import SignatureCroppingUnavailable, crop_signatures
|
|
|
15
16
|
from .detector import BuildDetector, FileResult
|
|
16
17
|
from .eda import RunExploratoryAnalysis
|
|
17
18
|
from .logging_setup import ConfigureLogging
|
|
19
|
+
from .wet_detection import apply_wet_detection
|
|
18
20
|
|
|
19
21
|
Logger = ConfigureLogging()
|
|
20
22
|
|
|
@@ -47,6 +49,12 @@ def Detect(
|
|
|
47
49
|
configurationPath: Path | None = typer.Option(
|
|
48
50
|
None, "--config", "-c", help="Path to YAML config"
|
|
49
51
|
),
|
|
52
|
+
writeResults: bool | None = typer.Option(
|
|
53
|
+
None,
|
|
54
|
+
"--write-results/--no-write-results",
|
|
55
|
+
help="Write results.json (or JSON to stdout when out_dir is none)",
|
|
56
|
+
show_default=False,
|
|
57
|
+
),
|
|
50
58
|
profileOverride: str | None = typer.Option(None, "--profile", "-p", help="hipaa or retainer"),
|
|
51
59
|
recursive: bool = typer.Option(
|
|
52
60
|
True,
|
|
@@ -56,13 +64,13 @@ def Detect(
|
|
|
56
64
|
cropSignatures: bool | None = typer.Option(
|
|
57
65
|
None,
|
|
58
66
|
"--crop-signatures/--no-crop-signatures",
|
|
59
|
-
help="
|
|
67
|
+
help="Write DOCX files containing cropped signature images (requires PyMuPDF + python-docx)",
|
|
60
68
|
show_default=False,
|
|
61
69
|
),
|
|
62
70
|
cropDirectory: Path | None = typer.Option(
|
|
63
71
|
None,
|
|
64
72
|
"--crop-dir",
|
|
65
|
-
help="Directory for signature
|
|
73
|
+
help="Directory for signature DOCX crops (defaults to out_dir/signature_crops)",
|
|
66
74
|
),
|
|
67
75
|
cropDpi: int | None = typer.Option(
|
|
68
76
|
None,
|
|
@@ -72,6 +80,39 @@ def Detect(
|
|
|
72
80
|
help="Rendering DPI for signature crops",
|
|
73
81
|
show_default=False,
|
|
74
82
|
),
|
|
83
|
+
cropBytes: bool = typer.Option(
|
|
84
|
+
False,
|
|
85
|
+
"--crop-bytes/--no-crop-bytes",
|
|
86
|
+
help="Embed base64 PNG bytes for signature crops in results JSON",
|
|
87
|
+
show_default=False,
|
|
88
|
+
),
|
|
89
|
+
detectWetSignatures: bool | None = typer.Option(
|
|
90
|
+
None,
|
|
91
|
+
"--detect-wet/--no-detect-wet",
|
|
92
|
+
help="Compatibility flag; non-e-sign PDFs always run OCR when deps are available",
|
|
93
|
+
show_default=False,
|
|
94
|
+
),
|
|
95
|
+
wetOcrDpi: int | None = typer.Option(
|
|
96
|
+
None,
|
|
97
|
+
"--wet-ocr-dpi",
|
|
98
|
+
min=72,
|
|
99
|
+
max=600,
|
|
100
|
+
help="Rendering DPI for OCR pages (wet detection)",
|
|
101
|
+
show_default=False,
|
|
102
|
+
),
|
|
103
|
+
wetOcrLanguages: str | None = typer.Option(
|
|
104
|
+
None,
|
|
105
|
+
"--wet-ocr-languages",
|
|
106
|
+
help="Tesseract language packs for OCR (e.g., 'eng' or 'eng+spa')",
|
|
107
|
+
),
|
|
108
|
+
wetPrecisionThreshold: float | None = typer.Option(
|
|
109
|
+
None,
|
|
110
|
+
"--wet-precision-threshold",
|
|
111
|
+
min=0.0,
|
|
112
|
+
max=1.0,
|
|
113
|
+
help="Minimum wet-signature confidence (0-1) to accept a candidate",
|
|
114
|
+
show_default=False,
|
|
115
|
+
),
|
|
75
116
|
) -> None:
|
|
76
117
|
"""Run detection for the configured directory and emit ``results.json``."""
|
|
77
118
|
|
|
@@ -83,12 +124,22 @@ def Detect(
|
|
|
83
124
|
configuration = configuration.model_copy(update={"Profile": normalized_profile})
|
|
84
125
|
|
|
85
126
|
overrides: dict[str, object] = {}
|
|
127
|
+
if writeResults is not None:
|
|
128
|
+
overrides["WriteResults"] = writeResults
|
|
86
129
|
if cropSignatures is not None:
|
|
87
130
|
overrides["CropSignatures"] = cropSignatures
|
|
88
131
|
if cropDirectory is not None:
|
|
89
132
|
overrides["CropOutputDirectory"] = cropDirectory
|
|
90
133
|
if cropDpi is not None:
|
|
91
134
|
overrides["CropImageDpi"] = cropDpi
|
|
135
|
+
if detectWetSignatures is not None:
|
|
136
|
+
overrides["DetectWetSignatures"] = detectWetSignatures
|
|
137
|
+
if wetOcrDpi is not None:
|
|
138
|
+
overrides["WetOcrDpi"] = wetOcrDpi
|
|
139
|
+
if wetOcrLanguages is not None:
|
|
140
|
+
overrides["WetOcrLanguages"] = wetOcrLanguages
|
|
141
|
+
if wetPrecisionThreshold is not None:
|
|
142
|
+
overrides["WetPrecisionThreshold"] = wetPrecisionThreshold
|
|
92
143
|
if overrides:
|
|
93
144
|
configuration = configuration.model_copy(update=overrides)
|
|
94
145
|
configuration = FinalizeConfiguration(configuration)
|
|
@@ -109,44 +160,52 @@ def Detect(
|
|
|
109
160
|
except StopIteration:
|
|
110
161
|
raise SystemExit(f"No PDFs found in {configuration.PdfRoot}") from None
|
|
111
162
|
|
|
112
|
-
|
|
163
|
+
write_results = configuration.WriteResults
|
|
164
|
+
results_buffer: list[FileResult] | None = (
|
|
165
|
+
[] if write_results and configuration.OutputDirectory is None else None
|
|
166
|
+
)
|
|
113
167
|
json_handle = None
|
|
114
168
|
json_path: Path | None = None
|
|
115
169
|
wrote_first = False
|
|
116
170
|
|
|
117
|
-
if configuration.OutputDirectory is not None:
|
|
171
|
+
if write_results and configuration.OutputDirectory is not None:
|
|
118
172
|
outputDirectory = configuration.OutputDirectory
|
|
119
173
|
outputDirectory.mkdir(parents=True, exist_ok=True)
|
|
120
174
|
json_path = outputDirectory / "results.json"
|
|
121
175
|
json_handle = open(json_path, "w", encoding="utf-8")
|
|
122
176
|
json_handle.write("[")
|
|
123
177
|
|
|
178
|
+
crop_bytes_enabled = bool(cropBytes)
|
|
124
179
|
crop_dir = configuration.CropOutputDirectory
|
|
180
|
+
if crop_dir is None:
|
|
181
|
+
base_dir = configuration.OutputDirectory or configuration.PdfRoot
|
|
182
|
+
crop_dir = base_dir / "signature_crops"
|
|
125
183
|
cropping_enabled = configuration.CropSignatures
|
|
126
184
|
cropping_available = True
|
|
127
185
|
cropping_attempted = False
|
|
128
|
-
if configuration.CropSignatures and crop_dir is None:
|
|
129
|
-
Logger.warning(
|
|
130
|
-
"CropSignatures enabled without an output directory",
|
|
131
|
-
extra={"pdf_root": str(configuration.PdfRoot)},
|
|
132
|
-
)
|
|
133
|
-
cropping_enabled = False
|
|
134
186
|
|
|
135
187
|
total_bboxes = 0
|
|
136
188
|
|
|
137
189
|
def _append_result(file_result: FileResult, source_pdf: Path) -> None:
|
|
138
190
|
nonlocal wrote_first, json_handle, total_bboxes, cropping_available, cropping_attempted
|
|
139
191
|
|
|
140
|
-
if
|
|
192
|
+
if cropping_available and (cropping_enabled or crop_bytes_enabled) and crop_dir is not None:
|
|
141
193
|
try:
|
|
142
|
-
crop_signatures(
|
|
194
|
+
crops = crop_signatures(
|
|
143
195
|
pdf_path=source_pdf,
|
|
144
196
|
file_result=file_result,
|
|
145
197
|
output_dir=crop_dir,
|
|
146
198
|
dpi=configuration.CropImageDpi,
|
|
147
199
|
logger=Logger,
|
|
200
|
+
return_bytes=crop_bytes_enabled,
|
|
201
|
+
save_files=cropping_enabled,
|
|
148
202
|
)
|
|
149
203
|
cropping_attempted = True
|
|
204
|
+
if crop_bytes_enabled:
|
|
205
|
+
for crop in crops:
|
|
206
|
+
crop.signature.CropBytes = base64.b64encode(crop.image_bytes).decode(
|
|
207
|
+
"ascii"
|
|
208
|
+
)
|
|
150
209
|
except SignatureCroppingUnavailable as exc:
|
|
151
210
|
cropping_available = False
|
|
152
211
|
Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
|
|
@@ -182,6 +241,7 @@ def Detect(
|
|
|
182
241
|
|
|
183
242
|
def _process(pdf_path: Path) -> None:
|
|
184
243
|
file_result = detector.Detect(pdf_path)
|
|
244
|
+
apply_wet_detection(pdf_path, configuration, file_result, logger=Logger)
|
|
185
245
|
_append_result(file_result, pdf_path)
|
|
186
246
|
|
|
187
247
|
try:
|
|
@@ -194,18 +254,24 @@ def Detect(
|
|
|
194
254
|
json_handle.write(closing)
|
|
195
255
|
json_handle.close()
|
|
196
256
|
|
|
197
|
-
if
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
257
|
+
if write_results:
|
|
258
|
+
if json_handle is not None:
|
|
259
|
+
typer.echo(f"Wrote {json_path}")
|
|
260
|
+
else:
|
|
261
|
+
payload = json.dumps(
|
|
262
|
+
results_buffer or [], indent=2, ensure_ascii=False, default=_JsonSerializer
|
|
263
|
+
)
|
|
264
|
+
typer.echo(payload)
|
|
265
|
+
typer.echo("Detection completed with output disabled (out_dir=none)")
|
|
266
|
+
|
|
267
|
+
if (
|
|
268
|
+
(cropping_enabled or crop_bytes_enabled)
|
|
269
|
+
and cropping_available
|
|
270
|
+
and cropping_attempted
|
|
271
|
+
and total_bboxes == 0
|
|
272
|
+
):
|
|
207
273
|
Logger.warning(
|
|
208
|
-
"No signature bounding boxes detected;
|
|
274
|
+
"No signature bounding boxes detected; install PyMuPDF for crop-ready output",
|
|
209
275
|
extra={"engine": configuration.Engine},
|
|
210
276
|
)
|
|
211
277
|
|
sigdetect/config.py
CHANGED
|
@@ -10,7 +10,7 @@ from typing import Literal
|
|
|
10
10
|
import yaml
|
|
11
11
|
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
|
12
12
|
|
|
13
|
-
EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
|
|
13
|
+
EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
|
|
14
14
|
ProfileName = Literal["hipaa", "retainer"]
|
|
15
15
|
|
|
16
16
|
|
|
@@ -25,13 +25,20 @@ class DetectConfiguration(BaseModel):
|
|
|
25
25
|
|
|
26
26
|
PdfRoot: Path = Field(default=Path("hipaa_results"), alias="pdf_root")
|
|
27
27
|
OutputDirectory: Path | None = Field(default=Path("out"), alias="out_dir")
|
|
28
|
-
|
|
28
|
+
WriteResults: bool = Field(default=False, alias="write_results")
|
|
29
|
+
Engine: EngineName = Field(default="auto", alias="engine")
|
|
29
30
|
Profile: ProfileName = Field(default="hipaa", alias="profile")
|
|
30
31
|
PseudoSignatures: bool = Field(default=True, alias="pseudo_signatures")
|
|
31
32
|
RecurseXObjects: bool = Field(default=True, alias="recurse_xobjects")
|
|
32
|
-
CropSignatures: bool = Field(default=
|
|
33
|
+
CropSignatures: bool = Field(default=True, alias="crop_signatures")
|
|
33
34
|
CropOutputDirectory: Path | None = Field(default=None, alias="crop_output_dir")
|
|
34
35
|
CropImageDpi: int = Field(default=200, alias="crop_image_dpi", ge=72, le=600)
|
|
36
|
+
DetectWetSignatures: bool = Field(default=True, alias="detect_wet_signatures")
|
|
37
|
+
WetOcrDpi: int = Field(default=200, alias="wet_ocr_dpi", ge=72, le=600)
|
|
38
|
+
WetOcrLanguages: str = Field(default="eng", alias="wet_ocr_languages")
|
|
39
|
+
WetPrecisionThreshold: float = Field(
|
|
40
|
+
default=0.82, alias="wet_precision_threshold", ge=0.0, le=1.0
|
|
41
|
+
)
|
|
35
42
|
|
|
36
43
|
@field_validator("PdfRoot", "OutputDirectory", "CropOutputDirectory", mode="before")
|
|
37
44
|
@classmethod
|
|
@@ -57,6 +64,10 @@ class DetectConfiguration(BaseModel):
|
|
|
57
64
|
def out_dir(self) -> Path | None: # pragma: no cover - simple passthrough
|
|
58
65
|
return self.OutputDirectory
|
|
59
66
|
|
|
67
|
+
@property
|
|
68
|
+
def write_results(self) -> bool: # pragma: no cover - simple passthrough
|
|
69
|
+
return self.WriteResults
|
|
70
|
+
|
|
60
71
|
@property
|
|
61
72
|
def engine(self) -> EngineName: # pragma: no cover - simple passthrough
|
|
62
73
|
return self.Engine
|
|
@@ -85,6 +96,22 @@ class DetectConfiguration(BaseModel):
|
|
|
85
96
|
def crop_image_dpi(self) -> int: # pragma: no cover - simple passthrough
|
|
86
97
|
return self.CropImageDpi
|
|
87
98
|
|
|
99
|
+
@property
|
|
100
|
+
def detect_wet_signatures(self) -> bool: # pragma: no cover - simple passthrough
|
|
101
|
+
return self.DetectWetSignatures
|
|
102
|
+
|
|
103
|
+
@property
|
|
104
|
+
def wet_ocr_dpi(self) -> int: # pragma: no cover - simple passthrough
|
|
105
|
+
return self.WetOcrDpi
|
|
106
|
+
|
|
107
|
+
@property
|
|
108
|
+
def wet_ocr_languages(self) -> str: # pragma: no cover - simple passthrough
|
|
109
|
+
return self.WetOcrLanguages
|
|
110
|
+
|
|
111
|
+
@property
|
|
112
|
+
def wet_precision_threshold(self) -> float: # pragma: no cover - simple passthrough
|
|
113
|
+
return self.WetPrecisionThreshold
|
|
114
|
+
|
|
88
115
|
|
|
89
116
|
def LoadConfiguration(path: Path | None) -> DetectConfiguration:
|
|
90
117
|
"""Load configuration from ``path`` while applying environment overrides.
|
|
@@ -108,6 +135,10 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
|
|
|
108
135
|
env_crop = os.getenv("SIGDETECT_CROP_SIGNATURES")
|
|
109
136
|
env_crop_dir = os.getenv("SIGDETECT_CROP_DIR")
|
|
110
137
|
env_crop_dpi = os.getenv("SIGDETECT_CROP_DPI")
|
|
138
|
+
env_detect_wet = os.getenv("SIGDETECT_DETECT_WET")
|
|
139
|
+
env_wet_dpi = os.getenv("SIGDETECT_WET_OCR_DPI")
|
|
140
|
+
env_wet_lang = os.getenv("SIGDETECT_WET_LANGUAGES")
|
|
141
|
+
env_wet_precision = os.getenv("SIGDETECT_WET_PRECISION")
|
|
111
142
|
|
|
112
143
|
raw_data: dict[str, object] = {}
|
|
113
144
|
if path and Path(path).exists():
|
|
@@ -133,6 +164,20 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
|
|
|
133
164
|
if env_crop_dpi:
|
|
134
165
|
with suppress(ValueError):
|
|
135
166
|
raw_data["crop_image_dpi"] = int(env_crop_dpi)
|
|
167
|
+
if env_detect_wet is not None:
|
|
168
|
+
lowered = env_detect_wet.lower()
|
|
169
|
+
if lowered in {"1", "true", "yes", "on"}:
|
|
170
|
+
raw_data["detect_wet_signatures"] = True
|
|
171
|
+
elif lowered in {"0", "false", "no", "off"}:
|
|
172
|
+
raw_data["detect_wet_signatures"] = False
|
|
173
|
+
if env_wet_dpi:
|
|
174
|
+
with suppress(ValueError):
|
|
175
|
+
raw_data["wet_ocr_dpi"] = int(env_wet_dpi)
|
|
176
|
+
if env_wet_lang:
|
|
177
|
+
raw_data["wet_ocr_languages"] = env_wet_lang
|
|
178
|
+
if env_wet_precision:
|
|
179
|
+
with suppress(ValueError):
|
|
180
|
+
raw_data["wet_precision_threshold"] = float(env_wet_precision)
|
|
136
181
|
|
|
137
182
|
configuration = DetectConfiguration(**raw_data)
|
|
138
183
|
return FinalizeConfiguration(configuration)
|
sigdetect/cropping.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
|
-
"""Helpers for converting signature bounding boxes into
|
|
1
|
+
"""Helpers for converting signature bounding boxes into DOCX crops."""
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import io
|
|
5
6
|
import logging
|
|
6
7
|
import re
|
|
7
8
|
from dataclasses import dataclass
|
|
@@ -16,18 +17,28 @@ try: # pragma: no cover - optional dependency
|
|
|
16
17
|
except Exception: # pragma: no cover - optional dependency
|
|
17
18
|
fitz = None # type: ignore[misc]
|
|
18
19
|
|
|
20
|
+
try: # pragma: no cover - optional dependency
|
|
21
|
+
from docx import Document # type: ignore
|
|
22
|
+
except Exception: # pragma: no cover - optional dependency
|
|
23
|
+
Document = None # type: ignore[assignment]
|
|
24
|
+
|
|
19
25
|
|
|
20
26
|
class SignatureCroppingUnavailable(RuntimeError):
|
|
21
27
|
"""Raised when PNG cropping cannot be performed (e.g., PyMuPDF missing)."""
|
|
22
28
|
|
|
23
29
|
|
|
30
|
+
class SignatureDocxUnavailable(RuntimeError):
|
|
31
|
+
"""Raised when DOCX creation cannot be performed (e.g., python-docx missing)."""
|
|
32
|
+
|
|
33
|
+
|
|
24
34
|
@dataclass(slots=True)
|
|
25
35
|
class SignatureCrop:
|
|
26
|
-
"""
|
|
36
|
+
"""Crop metadata and in-memory content."""
|
|
27
37
|
|
|
28
38
|
path: Path
|
|
29
39
|
image_bytes: bytes
|
|
30
40
|
signature: Signature
|
|
41
|
+
docx_bytes: bytes | None = None
|
|
31
42
|
saved_to_disk: bool = True
|
|
32
43
|
|
|
33
44
|
|
|
@@ -40,6 +51,7 @@ def crop_signatures(
|
|
|
40
51
|
dpi: int = 200,
|
|
41
52
|
logger: logging.Logger | None = None,
|
|
42
53
|
return_bytes: Literal[False] = False,
|
|
54
|
+
save_files: bool = True,
|
|
43
55
|
) -> list[Path]: ...
|
|
44
56
|
|
|
45
57
|
|
|
@@ -51,7 +63,8 @@ def crop_signatures(
|
|
|
51
63
|
output_dir: Path,
|
|
52
64
|
dpi: int = 200,
|
|
53
65
|
logger: logging.Logger | None = None,
|
|
54
|
-
return_bytes: Literal[True]
|
|
66
|
+
return_bytes: Literal[True],
|
|
67
|
+
save_files: bool = True,
|
|
55
68
|
) -> list[SignatureCrop]: ...
|
|
56
69
|
|
|
57
70
|
|
|
@@ -65,15 +78,18 @@ def crop_signatures(
|
|
|
65
78
|
return_bytes: bool = False,
|
|
66
79
|
save_files: bool = True,
|
|
67
80
|
) -> list[Path] | list[SignatureCrop]:
|
|
68
|
-
"""Render each signature bounding box to a PNG image
|
|
81
|
+
"""Render each signature bounding box to a PNG image and wrap it in a DOCX file.
|
|
69
82
|
|
|
70
83
|
Set ``return_bytes=True`` to collect in-memory PNG bytes for each crop while also writing
|
|
71
84
|
the files to ``output_dir``. Set ``save_files=False`` to skip writing PNGs to disk.
|
|
85
|
+
When ``save_files`` is enabled, a one-image DOCX file is also written per signature crop.
|
|
86
|
+
When ``return_bytes`` is True and ``python-docx`` is available, ``SignatureCrop.docx_bytes``
|
|
87
|
+
will contain the DOCX payload.
|
|
72
88
|
"""
|
|
73
89
|
|
|
74
90
|
if fitz is None: # pragma: no cover - exercised when dependency absent
|
|
75
91
|
raise SignatureCroppingUnavailable(
|
|
76
|
-
"PyMuPDF is required for PNG crops. Install 'pymupdf' or
|
|
92
|
+
"PyMuPDF is required for PNG crops. Install 'pymupdf' or add it to your environment."
|
|
77
93
|
)
|
|
78
94
|
if not save_files and not return_bytes:
|
|
79
95
|
raise ValueError("At least one of save_files or return_bytes must be True")
|
|
@@ -85,6 +101,16 @@ def crop_signatures(
|
|
|
85
101
|
generated_paths: list[Path] = []
|
|
86
102
|
generated_crops: list[SignatureCrop] = []
|
|
87
103
|
|
|
104
|
+
docx_to_disk = save_files
|
|
105
|
+
docx_in_memory = return_bytes
|
|
106
|
+
docx_enabled = docx_to_disk or docx_in_memory
|
|
107
|
+
docx_available = Document is not None
|
|
108
|
+
if docx_enabled and not docx_available and logger:
|
|
109
|
+
logger.warning(
|
|
110
|
+
"Signature DOCX output unavailable",
|
|
111
|
+
extra={"error": "python-docx is required to generate DOCX outputs"},
|
|
112
|
+
)
|
|
113
|
+
|
|
88
114
|
with fitz.open(pdf_path) as document: # type: ignore[attr-defined]
|
|
89
115
|
per_document_dir = output_dir / pdf_path.stem
|
|
90
116
|
if save_files:
|
|
@@ -114,14 +140,13 @@ def crop_signatures(
|
|
|
114
140
|
continue
|
|
115
141
|
|
|
116
142
|
filename = _build_filename(index, signature)
|
|
117
|
-
|
|
143
|
+
png_destination = per_document_dir / filename
|
|
144
|
+
docx_destination = png_destination.with_suffix(".docx")
|
|
118
145
|
|
|
119
146
|
try:
|
|
120
147
|
image_bytes: bytes | None = None
|
|
121
148
|
pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
|
|
122
|
-
if
|
|
123
|
-
pixmap.save(destination)
|
|
124
|
-
if return_bytes:
|
|
149
|
+
if return_bytes or docx_enabled:
|
|
125
150
|
image_bytes = pixmap.tobytes("png")
|
|
126
151
|
except Exception as exc: # pragma: no cover - defensive
|
|
127
152
|
if logger:
|
|
@@ -136,17 +161,40 @@ def crop_signatures(
|
|
|
136
161
|
)
|
|
137
162
|
continue
|
|
138
163
|
|
|
164
|
+
docx_bytes: bytes | None = None
|
|
165
|
+
if docx_enabled and docx_available:
|
|
166
|
+
if image_bytes is None: # pragma: no cover - defensive
|
|
167
|
+
continue
|
|
168
|
+
try:
|
|
169
|
+
docx_bytes = _build_docx_bytes(image_bytes)
|
|
170
|
+
if docx_to_disk:
|
|
171
|
+
docx_destination.write_bytes(docx_bytes)
|
|
172
|
+
except SignatureDocxUnavailable as exc:
|
|
173
|
+
if logger:
|
|
174
|
+
logger.warning(
|
|
175
|
+
"Signature DOCX output unavailable",
|
|
176
|
+
extra={"error": str(exc)},
|
|
177
|
+
)
|
|
178
|
+
docx_available = False
|
|
179
|
+
except Exception as exc: # pragma: no cover - defensive
|
|
180
|
+
if logger:
|
|
181
|
+
logger.warning(
|
|
182
|
+
"Failed to write signature DOCX",
|
|
183
|
+
extra={"file": pdf_path.name, "error": str(exc)},
|
|
184
|
+
)
|
|
185
|
+
|
|
139
186
|
if save_files:
|
|
140
|
-
signature.CropPath = str(
|
|
141
|
-
generated_paths.append(
|
|
187
|
+
signature.CropPath = str(docx_destination)
|
|
188
|
+
generated_paths.append(docx_destination)
|
|
142
189
|
if return_bytes:
|
|
143
190
|
if image_bytes is None: # pragma: no cover - defensive
|
|
144
191
|
continue
|
|
145
192
|
generated_crops.append(
|
|
146
193
|
SignatureCrop(
|
|
147
|
-
path=
|
|
194
|
+
path=docx_destination,
|
|
148
195
|
image_bytes=image_bytes,
|
|
149
196
|
signature=signature,
|
|
197
|
+
docx_bytes=docx_bytes,
|
|
150
198
|
saved_to_disk=save_files,
|
|
151
199
|
)
|
|
152
200
|
)
|
|
@@ -154,6 +202,18 @@ def crop_signatures(
|
|
|
154
202
|
return generated_crops if return_bytes else generated_paths
|
|
155
203
|
|
|
156
204
|
|
|
205
|
+
def _build_docx_bytes(image_bytes: bytes) -> bytes:
|
|
206
|
+
if Document is None:
|
|
207
|
+
raise SignatureDocxUnavailable(
|
|
208
|
+
"python-docx is required to generate DOCX outputs for signature crops."
|
|
209
|
+
)
|
|
210
|
+
document = Document()
|
|
211
|
+
document.add_picture(io.BytesIO(image_bytes))
|
|
212
|
+
buffer = io.BytesIO()
|
|
213
|
+
document.save(buffer)
|
|
214
|
+
return buffer.getvalue()
|
|
215
|
+
|
|
216
|
+
|
|
157
217
|
def _to_clip_rect(page, bbox: tuple[float, float, float, float]):
|
|
158
218
|
width = float(page.rect.width)
|
|
159
219
|
height = float(page.rect.height)
|