sigdetect 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sigdetect/api.py +48 -12
- sigdetect/cli.py +70 -28
- sigdetect/config.py +17 -0
- sigdetect/cropping.py +78 -15
- sigdetect/detector/__init__.py +10 -8
- sigdetect/detector/pymupdf_engine.py +2 -2
- sigdetect/detector/signature_model.py +6 -0
- sigdetect/wet_detection.py +63 -13
- {sigdetect-0.4.0.dist-info → sigdetect-0.5.1.dist-info}/METADATA +25 -12
- {sigdetect-0.4.0.dist-info → sigdetect-0.5.1.dist-info}/RECORD +13 -13
- {sigdetect-0.4.0.dist-info → sigdetect-0.5.1.dist-info}/WHEEL +1 -1
- {sigdetect-0.4.0.dist-info → sigdetect-0.5.1.dist-info}/entry_points.txt +0 -0
- {sigdetect-0.4.0.dist-info → sigdetect-0.5.1.dist-info}/top_level.txt +0 -0
sigdetect/api.py
CHANGED
|
@@ -9,6 +9,7 @@ from typing import Any, Generator, Iterable, Iterator, Literal, overload
|
|
|
9
9
|
from sigdetect.config import DetectConfiguration
|
|
10
10
|
from sigdetect.cropping import SignatureCrop
|
|
11
11
|
from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
|
|
12
|
+
from sigdetect.wet_detection import apply_wet_detection
|
|
12
13
|
|
|
13
14
|
EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
|
|
14
15
|
ProfileName = Literal["hipaa", "retainer"]
|
|
@@ -21,9 +22,13 @@ def DetectPdf(
|
|
|
21
22
|
engineName: EngineName = "auto",
|
|
22
23
|
includePseudoSignatures: bool = True,
|
|
23
24
|
recurseXObjects: bool = True,
|
|
25
|
+
runWetDetection: bool = True,
|
|
24
26
|
detector: Detector | None = None,
|
|
25
27
|
) -> dict[str, Any]:
|
|
26
|
-
"""Detect signature evidence and assign roles for a single PDF.
|
|
28
|
+
"""Detect signature evidence and assign roles for a single PDF.
|
|
29
|
+
|
|
30
|
+
Wet detection runs by default for non-e-sign PDFs; pass ``runWetDetection=False`` to skip OCR.
|
|
31
|
+
"""
|
|
27
32
|
|
|
28
33
|
resolvedPath = Path(pdfPath)
|
|
29
34
|
activeDetector = detector or get_detector(
|
|
@@ -36,6 +41,10 @@ def DetectPdf(
|
|
|
36
41
|
)
|
|
37
42
|
|
|
38
43
|
result = activeDetector.Detect(resolvedPath)
|
|
44
|
+
if runWetDetection:
|
|
45
|
+
configuration = _ResolveConfiguration(activeDetector)
|
|
46
|
+
if configuration is not None:
|
|
47
|
+
apply_wet_detection(resolvedPath, configuration, result)
|
|
39
48
|
return _ToPlainDictionary(result)
|
|
40
49
|
|
|
41
50
|
|
|
@@ -48,7 +57,10 @@ def get_detector(
|
|
|
48
57
|
recurseXObjects: bool = True,
|
|
49
58
|
outputDirectory: str | Path | None = None,
|
|
50
59
|
) -> Detector:
|
|
51
|
-
"""Return a reusable detector instance configured with the supplied options.
|
|
60
|
+
"""Return a reusable detector instance configured with the supplied options.
|
|
61
|
+
|
|
62
|
+
Engine selection is forced to ``auto`` (prefers PyMuPDF when available).
|
|
63
|
+
"""
|
|
52
64
|
|
|
53
65
|
configuration = DetectConfiguration(
|
|
54
66
|
PdfRoot=Path(pdfRoot) if pdfRoot is not None else Path.cwd(),
|
|
@@ -108,6 +120,7 @@ def _ToPlainValue(value: Any) -> Any:
|
|
|
108
120
|
def DetectMany(
|
|
109
121
|
pdfPaths: Iterable[str | Path],
|
|
110
122
|
*,
|
|
123
|
+
runWetDetection: bool = True,
|
|
111
124
|
detector: Detector | None = None,
|
|
112
125
|
**kwargs: Any,
|
|
113
126
|
) -> Iterator[dict[str, Any]]:
|
|
@@ -115,17 +128,18 @@ def DetectMany(
|
|
|
115
128
|
|
|
116
129
|
if detector is not None:
|
|
117
130
|
for pdfPath in pdfPaths:
|
|
118
|
-
yield _DetectWithDetector(detector, pdfPath)
|
|
131
|
+
yield _DetectWithDetector(detector, pdfPath, runWetDetection=runWetDetection)
|
|
119
132
|
return
|
|
120
133
|
|
|
121
134
|
for pdfPath in pdfPaths:
|
|
122
|
-
yield DetectPdf(pdfPath, **kwargs)
|
|
135
|
+
yield DetectPdf(pdfPath, runWetDetection=runWetDetection, **kwargs)
|
|
123
136
|
|
|
124
137
|
|
|
125
138
|
def ScanDirectory(
|
|
126
139
|
pdfRoot: str | Path,
|
|
127
140
|
*,
|
|
128
141
|
globPattern: str = "**/*.pdf",
|
|
142
|
+
runWetDetection: bool = True,
|
|
129
143
|
detector: Detector | None = None,
|
|
130
144
|
**kwargs: Any,
|
|
131
145
|
) -> Iterator[dict[str, Any]]:
|
|
@@ -143,7 +157,7 @@ def ScanDirectory(
|
|
|
143
157
|
|
|
144
158
|
for pdfPath in iterator:
|
|
145
159
|
if pdfPath.is_file() and pdfPath.suffix.lower() == ".pdf":
|
|
146
|
-
yield DetectPdf(pdfPath, detector=detector, **kwargs)
|
|
160
|
+
yield DetectPdf(pdfPath, detector=detector, runWetDetection=runWetDetection, **kwargs)
|
|
147
161
|
|
|
148
162
|
|
|
149
163
|
def ToCsvRow(result: dict[str, Any]) -> dict[str, Any]:
|
|
@@ -174,11 +188,25 @@ def Version() -> str:
|
|
|
174
188
|
return "0.0.0-dev"
|
|
175
189
|
|
|
176
190
|
|
|
177
|
-
def _DetectWithDetector(
|
|
191
|
+
def _DetectWithDetector(
|
|
192
|
+
detector: Detector, pdfPath: str | Path, *, runWetDetection: bool
|
|
193
|
+
) -> dict[str, Any]:
|
|
178
194
|
"""Helper that runs ``detector`` and returns the plain dictionary result."""
|
|
179
195
|
|
|
180
196
|
resolvedPath = Path(pdfPath)
|
|
181
|
-
|
|
197
|
+
result = detector.Detect(resolvedPath)
|
|
198
|
+
if runWetDetection:
|
|
199
|
+
configuration = _ResolveConfiguration(detector)
|
|
200
|
+
if configuration is not None:
|
|
201
|
+
apply_wet_detection(resolvedPath, configuration, result)
|
|
202
|
+
return _ToPlainDictionary(result)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _ResolveConfiguration(detector: Detector) -> DetectConfiguration | None:
|
|
206
|
+
configuration = getattr(detector, "Configuration", None)
|
|
207
|
+
if isinstance(configuration, DetectConfiguration):
|
|
208
|
+
return configuration
|
|
209
|
+
return None
|
|
182
210
|
|
|
183
211
|
|
|
184
212
|
@contextmanager
|
|
@@ -201,8 +229,8 @@ def CropSignatureImages(
|
|
|
201
229
|
dpi: int = 200,
|
|
202
230
|
returnBytes: Literal[False] = False,
|
|
203
231
|
saveToDisk: bool = True,
|
|
204
|
-
|
|
205
|
-
|
|
232
|
+
docx: bool = False,
|
|
233
|
+
) -> list[Path]: ...
|
|
206
234
|
|
|
207
235
|
|
|
208
236
|
@overload
|
|
@@ -214,8 +242,8 @@ def CropSignatureImages(
|
|
|
214
242
|
dpi: int,
|
|
215
243
|
returnBytes: Literal[True],
|
|
216
244
|
saveToDisk: bool,
|
|
217
|
-
|
|
218
|
-
|
|
245
|
+
docx: bool = False,
|
|
246
|
+
) -> list[SignatureCrop]: ...
|
|
219
247
|
|
|
220
248
|
|
|
221
249
|
def CropSignatureImages(
|
|
@@ -226,13 +254,17 @@ def CropSignatureImages(
|
|
|
226
254
|
dpi: int = 200,
|
|
227
255
|
returnBytes: bool = False,
|
|
228
256
|
saveToDisk: bool = True,
|
|
257
|
+
docx: bool = False,
|
|
229
258
|
) -> list[Path] | list[SignatureCrop]:
|
|
230
|
-
"""
|
|
259
|
+
"""Create PNG files containing cropped signature images (or DOCX when enabled).
|
|
231
260
|
|
|
232
261
|
Accepts either a :class:`FileResult` instance or the ``dict`` returned by
|
|
233
262
|
:func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.
|
|
234
263
|
Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop. Set
|
|
235
264
|
``saveToDisk=False`` to skip writing PNG files while still returning in-memory data.
|
|
265
|
+
When ``docx`` is True, DOCX files are written instead of PNG files. When ``returnBytes`` is
|
|
266
|
+
True and ``docx`` is enabled, the returned :class:`SignatureCrop` objects include
|
|
267
|
+
``docx_bytes``.
|
|
236
268
|
"""
|
|
237
269
|
|
|
238
270
|
from sigdetect.cropping import crop_signatures
|
|
@@ -245,6 +277,7 @@ def CropSignatureImages(
|
|
|
245
277
|
dpi=dpi,
|
|
246
278
|
return_bytes=returnBytes,
|
|
247
279
|
save_files=saveToDisk,
|
|
280
|
+
docx=docx,
|
|
248
281
|
)
|
|
249
282
|
if original_dict is not None:
|
|
250
283
|
original_dict.clear()
|
|
@@ -275,6 +308,9 @@ def _CoerceFileResult(
|
|
|
275
308
|
RenderType=str(entry.get("render_type") or "unknown"),
|
|
276
309
|
BoundingBox=tuple(bbox) if bbox else None,
|
|
277
310
|
CropPath=entry.get("crop_path"),
|
|
311
|
+
CropBytes=entry.get("crop_bytes"),
|
|
312
|
+
CropDocxPath=entry.get("crop_docx_path"),
|
|
313
|
+
CropDocxBytes=entry.get("crop_docx_bytes"),
|
|
278
314
|
)
|
|
279
315
|
)
|
|
280
316
|
|
sigdetect/cli.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import base64
|
|
5
6
|
import json
|
|
6
7
|
from collections.abc import Iterator
|
|
7
8
|
from dataclasses import asdict, is_dataclass
|
|
@@ -48,6 +49,12 @@ def Detect(
|
|
|
48
49
|
configurationPath: Path | None = typer.Option(
|
|
49
50
|
None, "--config", "-c", help="Path to YAML config"
|
|
50
51
|
),
|
|
52
|
+
writeResults: bool | None = typer.Option(
|
|
53
|
+
None,
|
|
54
|
+
"--write-results/--no-write-results",
|
|
55
|
+
help="Write results.json (or JSON to stdout when out_dir is none)",
|
|
56
|
+
show_default=False,
|
|
57
|
+
),
|
|
51
58
|
profileOverride: str | None = typer.Option(None, "--profile", "-p", help="hipaa or retainer"),
|
|
52
59
|
recursive: bool = typer.Option(
|
|
53
60
|
True,
|
|
@@ -57,13 +64,19 @@ def Detect(
|
|
|
57
64
|
cropSignatures: bool | None = typer.Option(
|
|
58
65
|
None,
|
|
59
66
|
"--crop-signatures/--no-crop-signatures",
|
|
60
|
-
help="
|
|
67
|
+
help="Write PNG crops for signature widgets (requires PyMuPDF)",
|
|
68
|
+
show_default=False,
|
|
69
|
+
),
|
|
70
|
+
cropDocx: bool | None = typer.Option(
|
|
71
|
+
None,
|
|
72
|
+
"--crop-docx/--no-crop-docx",
|
|
73
|
+
help="Write DOCX crops instead of PNG files (requires PyMuPDF + python-docx)",
|
|
61
74
|
show_default=False,
|
|
62
75
|
),
|
|
63
76
|
cropDirectory: Path | None = typer.Option(
|
|
64
77
|
None,
|
|
65
78
|
"--crop-dir",
|
|
66
|
-
help="Directory for signature
|
|
79
|
+
help="Directory for signature crops (defaults to out_dir/signature_crops)",
|
|
67
80
|
),
|
|
68
81
|
cropDpi: int | None = typer.Option(
|
|
69
82
|
None,
|
|
@@ -73,10 +86,16 @@ def Detect(
|
|
|
73
86
|
help="Rendering DPI for signature crops",
|
|
74
87
|
show_default=False,
|
|
75
88
|
),
|
|
89
|
+
cropBytes: bool = typer.Option(
|
|
90
|
+
False,
|
|
91
|
+
"--crop-bytes/--no-crop-bytes",
|
|
92
|
+
help="Embed base64 PNG bytes (and DOCX bytes when --crop-docx) in results JSON",
|
|
93
|
+
show_default=False,
|
|
94
|
+
),
|
|
76
95
|
detectWetSignatures: bool | None = typer.Option(
|
|
77
96
|
None,
|
|
78
97
|
"--detect-wet/--no-detect-wet",
|
|
79
|
-
help="
|
|
98
|
+
help="Compatibility flag; non-e-sign PDFs always run OCR when deps are available",
|
|
80
99
|
show_default=False,
|
|
81
100
|
),
|
|
82
101
|
wetOcrDpi: int | None = typer.Option(
|
|
@@ -111,8 +130,12 @@ def Detect(
|
|
|
111
130
|
configuration = configuration.model_copy(update={"Profile": normalized_profile})
|
|
112
131
|
|
|
113
132
|
overrides: dict[str, object] = {}
|
|
133
|
+
if writeResults is not None:
|
|
134
|
+
overrides["WriteResults"] = writeResults
|
|
114
135
|
if cropSignatures is not None:
|
|
115
136
|
overrides["CropSignatures"] = cropSignatures
|
|
137
|
+
if cropDocx is not None:
|
|
138
|
+
overrides["CropDocx"] = cropDocx
|
|
116
139
|
if cropDirectory is not None:
|
|
117
140
|
overrides["CropOutputDirectory"] = cropDirectory
|
|
118
141
|
if cropDpi is not None:
|
|
@@ -145,53 +168,66 @@ def Detect(
|
|
|
145
168
|
except StopIteration:
|
|
146
169
|
raise SystemExit(f"No PDFs found in {configuration.PdfRoot}") from None
|
|
147
170
|
|
|
148
|
-
|
|
171
|
+
write_results = configuration.WriteResults
|
|
172
|
+
results_buffer: list[FileResult] | None = (
|
|
173
|
+
[] if write_results and configuration.OutputDirectory is None else None
|
|
174
|
+
)
|
|
149
175
|
json_handle = None
|
|
150
176
|
json_path: Path | None = None
|
|
151
177
|
wrote_first = False
|
|
152
178
|
|
|
153
|
-
if configuration.OutputDirectory is not None:
|
|
179
|
+
if write_results and configuration.OutputDirectory is not None:
|
|
154
180
|
outputDirectory = configuration.OutputDirectory
|
|
155
181
|
outputDirectory.mkdir(parents=True, exist_ok=True)
|
|
156
182
|
json_path = outputDirectory / "results.json"
|
|
157
183
|
json_handle = open(json_path, "w", encoding="utf-8")
|
|
158
184
|
json_handle.write("[")
|
|
159
185
|
|
|
186
|
+
crop_bytes_enabled = bool(cropBytes)
|
|
160
187
|
crop_dir = configuration.CropOutputDirectory
|
|
188
|
+
if crop_dir is None:
|
|
189
|
+
base_dir = configuration.OutputDirectory or configuration.PdfRoot
|
|
190
|
+
crop_dir = base_dir / "signature_crops"
|
|
161
191
|
cropping_enabled = configuration.CropSignatures
|
|
192
|
+
docx_enabled = configuration.CropDocx
|
|
162
193
|
cropping_available = True
|
|
163
194
|
cropping_attempted = False
|
|
164
|
-
if configuration.CropSignatures and crop_dir is None:
|
|
165
|
-
Logger.warning(
|
|
166
|
-
"CropSignatures enabled without an output directory",
|
|
167
|
-
extra={"pdf_root": str(configuration.PdfRoot)},
|
|
168
|
-
)
|
|
169
|
-
cropping_enabled = False
|
|
170
195
|
|
|
171
196
|
total_bboxes = 0
|
|
172
197
|
|
|
173
198
|
def _append_result(file_result: FileResult, source_pdf: Path) -> None:
|
|
174
199
|
nonlocal wrote_first, json_handle, total_bboxes, cropping_available, cropping_attempted
|
|
175
200
|
|
|
176
|
-
if
|
|
201
|
+
if cropping_available and (cropping_enabled or crop_bytes_enabled) and crop_dir is not None:
|
|
177
202
|
try:
|
|
178
|
-
crop_signatures(
|
|
203
|
+
crops = crop_signatures(
|
|
179
204
|
pdf_path=source_pdf,
|
|
180
205
|
file_result=file_result,
|
|
181
206
|
output_dir=crop_dir,
|
|
182
207
|
dpi=configuration.CropImageDpi,
|
|
183
208
|
logger=Logger,
|
|
209
|
+
return_bytes=crop_bytes_enabled,
|
|
210
|
+
save_files=cropping_enabled,
|
|
211
|
+
docx=docx_enabled,
|
|
184
212
|
)
|
|
185
213
|
cropping_attempted = True
|
|
214
|
+
if crop_bytes_enabled:
|
|
215
|
+
for crop in crops:
|
|
216
|
+
crop.signature.CropBytes = base64.b64encode(crop.image_bytes).decode(
|
|
217
|
+
"ascii"
|
|
218
|
+
)
|
|
219
|
+
if crop.docx_bytes:
|
|
220
|
+
crop.signature.CropDocxBytes = base64.b64encode(
|
|
221
|
+
crop.docx_bytes
|
|
222
|
+
).decode("ascii")
|
|
186
223
|
except SignatureCroppingUnavailable as exc:
|
|
187
224
|
cropping_available = False
|
|
188
225
|
Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
|
|
189
226
|
typer.echo(str(exc), err=True)
|
|
190
227
|
except Exception as exc: # pragma: no cover - defensive
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
)
|
|
228
|
+
cropping_available = False
|
|
229
|
+
Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
|
|
230
|
+
typer.echo(str(exc), err=True)
|
|
195
231
|
|
|
196
232
|
total_bboxes += sum(1 for sig in file_result.Signatures if sig.BoundingBox)
|
|
197
233
|
|
|
@@ -231,18 +267,24 @@ def Detect(
|
|
|
231
267
|
json_handle.write(closing)
|
|
232
268
|
json_handle.close()
|
|
233
269
|
|
|
234
|
-
if
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
270
|
+
if write_results:
|
|
271
|
+
if json_handle is not None:
|
|
272
|
+
typer.echo(f"Wrote {json_path}")
|
|
273
|
+
else:
|
|
274
|
+
payload = json.dumps(
|
|
275
|
+
results_buffer or [], indent=2, ensure_ascii=False, default=_JsonSerializer
|
|
276
|
+
)
|
|
277
|
+
typer.echo(payload)
|
|
278
|
+
typer.echo("Detection completed with output disabled (out_dir=none)")
|
|
279
|
+
|
|
280
|
+
if (
|
|
281
|
+
(cropping_enabled or crop_bytes_enabled)
|
|
282
|
+
and cropping_available
|
|
283
|
+
and cropping_attempted
|
|
284
|
+
and total_bboxes == 0
|
|
285
|
+
):
|
|
244
286
|
Logger.warning(
|
|
245
|
-
"No signature bounding boxes detected;
|
|
287
|
+
"No signature bounding boxes detected; install PyMuPDF for crop-ready output",
|
|
246
288
|
extra={"engine": configuration.Engine},
|
|
247
289
|
)
|
|
248
290
|
|
sigdetect/config.py
CHANGED
|
@@ -25,11 +25,13 @@ class DetectConfiguration(BaseModel):
|
|
|
25
25
|
|
|
26
26
|
PdfRoot: Path = Field(default=Path("hipaa_results"), alias="pdf_root")
|
|
27
27
|
OutputDirectory: Path | None = Field(default=Path("out"), alias="out_dir")
|
|
28
|
+
WriteResults: bool = Field(default=False, alias="write_results")
|
|
28
29
|
Engine: EngineName = Field(default="auto", alias="engine")
|
|
29
30
|
Profile: ProfileName = Field(default="hipaa", alias="profile")
|
|
30
31
|
PseudoSignatures: bool = Field(default=True, alias="pseudo_signatures")
|
|
31
32
|
RecurseXObjects: bool = Field(default=True, alias="recurse_xobjects")
|
|
32
33
|
CropSignatures: bool = Field(default=True, alias="crop_signatures")
|
|
34
|
+
CropDocx: bool = Field(default=False, alias="crop_docx")
|
|
33
35
|
CropOutputDirectory: Path | None = Field(default=None, alias="crop_output_dir")
|
|
34
36
|
CropImageDpi: int = Field(default=200, alias="crop_image_dpi", ge=72, le=600)
|
|
35
37
|
DetectWetSignatures: bool = Field(default=True, alias="detect_wet_signatures")
|
|
@@ -63,6 +65,10 @@ class DetectConfiguration(BaseModel):
|
|
|
63
65
|
def out_dir(self) -> Path | None: # pragma: no cover - simple passthrough
|
|
64
66
|
return self.OutputDirectory
|
|
65
67
|
|
|
68
|
+
@property
|
|
69
|
+
def write_results(self) -> bool: # pragma: no cover - simple passthrough
|
|
70
|
+
return self.WriteResults
|
|
71
|
+
|
|
66
72
|
@property
|
|
67
73
|
def engine(self) -> EngineName: # pragma: no cover - simple passthrough
|
|
68
74
|
return self.Engine
|
|
@@ -83,6 +89,10 @@ class DetectConfiguration(BaseModel):
|
|
|
83
89
|
def crop_signatures(self) -> bool: # pragma: no cover - simple passthrough
|
|
84
90
|
return self.CropSignatures
|
|
85
91
|
|
|
92
|
+
@property
|
|
93
|
+
def crop_docx(self) -> bool: # pragma: no cover - simple passthrough
|
|
94
|
+
return self.CropDocx
|
|
95
|
+
|
|
86
96
|
@property
|
|
87
97
|
def crop_output_dir(self) -> Path | None: # pragma: no cover - simple passthrough
|
|
88
98
|
return self.CropOutputDirectory
|
|
@@ -128,6 +138,7 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
|
|
|
128
138
|
env_out_dir = os.getenv("SIGDETECT_OUT_DIR")
|
|
129
139
|
env_profile = os.getenv("SIGDETECT_PROFILE")
|
|
130
140
|
env_crop = os.getenv("SIGDETECT_CROP_SIGNATURES")
|
|
141
|
+
env_crop_docx = os.getenv("SIGDETECT_CROP_DOCX")
|
|
131
142
|
env_crop_dir = os.getenv("SIGDETECT_CROP_DIR")
|
|
132
143
|
env_crop_dpi = os.getenv("SIGDETECT_CROP_DPI")
|
|
133
144
|
env_detect_wet = os.getenv("SIGDETECT_DETECT_WET")
|
|
@@ -154,6 +165,12 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
|
|
|
154
165
|
raw_data["crop_signatures"] = True
|
|
155
166
|
elif lowered in {"0", "false", "no", "off"}:
|
|
156
167
|
raw_data["crop_signatures"] = False
|
|
168
|
+
if env_crop_docx is not None:
|
|
169
|
+
lowered = env_crop_docx.lower()
|
|
170
|
+
if lowered in {"1", "true", "yes", "on"}:
|
|
171
|
+
raw_data["crop_docx"] = True
|
|
172
|
+
elif lowered in {"0", "false", "no", "off"}:
|
|
173
|
+
raw_data["crop_docx"] = False
|
|
157
174
|
if env_crop_dir:
|
|
158
175
|
raw_data["crop_output_dir"] = env_crop_dir
|
|
159
176
|
if env_crop_dpi:
|
sigdetect/cropping.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
|
-
"""Helpers for converting signature bounding boxes into PNG crops."""
|
|
1
|
+
"""Helpers for converting signature bounding boxes into PNG or DOCX crops."""
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import io
|
|
5
6
|
import logging
|
|
6
7
|
import re
|
|
7
8
|
from dataclasses import dataclass
|
|
@@ -16,18 +17,28 @@ try: # pragma: no cover - optional dependency
|
|
|
16
17
|
except Exception: # pragma: no cover - optional dependency
|
|
17
18
|
fitz = None # type: ignore[misc]
|
|
18
19
|
|
|
20
|
+
try: # pragma: no cover - optional dependency
|
|
21
|
+
from docx import Document # type: ignore
|
|
22
|
+
except Exception: # pragma: no cover - optional dependency
|
|
23
|
+
Document = None # type: ignore[assignment]
|
|
24
|
+
|
|
19
25
|
|
|
20
26
|
class SignatureCroppingUnavailable(RuntimeError):
|
|
21
27
|
"""Raised when PNG cropping cannot be performed (e.g., PyMuPDF missing)."""
|
|
22
28
|
|
|
23
29
|
|
|
30
|
+
class SignatureDocxUnavailable(SignatureCroppingUnavailable):
|
|
31
|
+
"""Raised when DOCX creation cannot be performed (e.g., python-docx missing)."""
|
|
32
|
+
|
|
33
|
+
|
|
24
34
|
@dataclass(slots=True)
|
|
25
35
|
class SignatureCrop:
|
|
26
|
-
"""
|
|
36
|
+
"""Crop metadata and in-memory content."""
|
|
27
37
|
|
|
28
38
|
path: Path
|
|
29
39
|
image_bytes: bytes
|
|
30
40
|
signature: Signature
|
|
41
|
+
docx_bytes: bytes | None = None
|
|
31
42
|
saved_to_disk: bool = True
|
|
32
43
|
|
|
33
44
|
|
|
@@ -41,8 +52,8 @@ def crop_signatures(
|
|
|
41
52
|
logger: logging.Logger | None = None,
|
|
42
53
|
return_bytes: Literal[False] = False,
|
|
43
54
|
save_files: bool = True,
|
|
44
|
-
|
|
45
|
-
|
|
55
|
+
docx: bool = False,
|
|
56
|
+
) -> list[Path]: ...
|
|
46
57
|
|
|
47
58
|
|
|
48
59
|
@overload
|
|
@@ -55,8 +66,8 @@ def crop_signatures(
|
|
|
55
66
|
logger: logging.Logger | None = None,
|
|
56
67
|
return_bytes: Literal[True],
|
|
57
68
|
save_files: bool = True,
|
|
58
|
-
|
|
59
|
-
|
|
69
|
+
docx: bool = False,
|
|
70
|
+
) -> list[SignatureCrop]: ...
|
|
60
71
|
|
|
61
72
|
|
|
62
73
|
def crop_signatures(
|
|
@@ -68,16 +79,19 @@ def crop_signatures(
|
|
|
68
79
|
logger: logging.Logger | None = None,
|
|
69
80
|
return_bytes: bool = False,
|
|
70
81
|
save_files: bool = True,
|
|
82
|
+
docx: bool = False,
|
|
71
83
|
) -> list[Path] | list[SignatureCrop]:
|
|
72
|
-
"""Render each signature bounding box to a PNG image
|
|
84
|
+
"""Render each signature bounding box to a PNG image and optionally wrap it in DOCX.
|
|
73
85
|
|
|
74
86
|
Set ``return_bytes=True`` to collect in-memory PNG bytes for each crop while also writing
|
|
75
87
|
the files to ``output_dir``. Set ``save_files=False`` to skip writing PNGs to disk.
|
|
88
|
+
When ``docx=True``, DOCX files are written instead of PNGs. When ``return_bytes`` is True
|
|
89
|
+
and ``docx=True``, ``SignatureCrop.docx_bytes`` will contain the DOCX payload.
|
|
76
90
|
"""
|
|
77
91
|
|
|
78
92
|
if fitz is None: # pragma: no cover - exercised when dependency absent
|
|
79
93
|
raise SignatureCroppingUnavailable(
|
|
80
|
-
"PyMuPDF is required for PNG crops. Install 'pymupdf' or
|
|
94
|
+
"PyMuPDF is required for PNG crops. Install 'pymupdf' or add it to your environment."
|
|
81
95
|
)
|
|
82
96
|
if not save_files and not return_bytes:
|
|
83
97
|
raise ValueError("At least one of save_files or return_bytes must be True")
|
|
@@ -89,6 +103,13 @@ def crop_signatures(
|
|
|
89
103
|
generated_paths: list[Path] = []
|
|
90
104
|
generated_crops: list[SignatureCrop] = []
|
|
91
105
|
|
|
106
|
+
docx_enabled = docx
|
|
107
|
+
docx_available = Document is not None
|
|
108
|
+
if docx_enabled and not docx_available:
|
|
109
|
+
raise SignatureDocxUnavailable(
|
|
110
|
+
"python-docx is required to generate DOCX outputs for signature crops."
|
|
111
|
+
)
|
|
112
|
+
|
|
92
113
|
with fitz.open(pdf_path) as document: # type: ignore[attr-defined]
|
|
93
114
|
per_document_dir = output_dir / pdf_path.stem
|
|
94
115
|
if save_files:
|
|
@@ -118,14 +139,15 @@ def crop_signatures(
|
|
|
118
139
|
continue
|
|
119
140
|
|
|
120
141
|
filename = _build_filename(index, signature)
|
|
121
|
-
|
|
142
|
+
png_destination = per_document_dir / filename
|
|
143
|
+
docx_destination = png_destination.with_suffix(".docx")
|
|
122
144
|
|
|
123
145
|
try:
|
|
124
146
|
image_bytes: bytes | None = None
|
|
125
147
|
pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
|
|
126
|
-
if save_files:
|
|
127
|
-
pixmap.save(
|
|
128
|
-
if return_bytes:
|
|
148
|
+
if save_files and not docx_enabled:
|
|
149
|
+
pixmap.save(png_destination)
|
|
150
|
+
if return_bytes or docx_enabled:
|
|
129
151
|
image_bytes = pixmap.tobytes("png")
|
|
130
152
|
except Exception as exc: # pragma: no cover - defensive
|
|
131
153
|
if logger:
|
|
@@ -140,17 +162,46 @@ def crop_signatures(
|
|
|
140
162
|
)
|
|
141
163
|
continue
|
|
142
164
|
|
|
165
|
+
docx_bytes: bytes | None = None
|
|
166
|
+
if docx_enabled:
|
|
167
|
+
if image_bytes is None: # pragma: no cover - defensive
|
|
168
|
+
continue
|
|
169
|
+
try:
|
|
170
|
+
docx_bytes = _build_docx_bytes(image_bytes)
|
|
171
|
+
if save_files:
|
|
172
|
+
docx_destination.write_bytes(docx_bytes)
|
|
173
|
+
except SignatureDocxUnavailable as exc:
|
|
174
|
+
if logger:
|
|
175
|
+
logger.warning(
|
|
176
|
+
"Signature DOCX output unavailable",
|
|
177
|
+
extra={"error": str(exc)},
|
|
178
|
+
)
|
|
179
|
+
docx_available = False
|
|
180
|
+
except Exception as exc: # pragma: no cover - defensive
|
|
181
|
+
if logger:
|
|
182
|
+
logger.warning(
|
|
183
|
+
"Failed to write signature DOCX",
|
|
184
|
+
extra={"file": pdf_path.name, "error": str(exc)},
|
|
185
|
+
)
|
|
186
|
+
|
|
143
187
|
if save_files:
|
|
144
|
-
|
|
145
|
-
|
|
188
|
+
if docx_enabled:
|
|
189
|
+
signature.CropPath = None
|
|
190
|
+
signature.CropDocxPath = str(docx_destination)
|
|
191
|
+
generated_paths.append(docx_destination)
|
|
192
|
+
else:
|
|
193
|
+
signature.CropDocxPath = None
|
|
194
|
+
signature.CropPath = str(png_destination)
|
|
195
|
+
generated_paths.append(png_destination)
|
|
146
196
|
if return_bytes:
|
|
147
197
|
if image_bytes is None: # pragma: no cover - defensive
|
|
148
198
|
continue
|
|
149
199
|
generated_crops.append(
|
|
150
200
|
SignatureCrop(
|
|
151
|
-
path=
|
|
201
|
+
path=docx_destination if docx_enabled else png_destination,
|
|
152
202
|
image_bytes=image_bytes,
|
|
153
203
|
signature=signature,
|
|
204
|
+
docx_bytes=docx_bytes,
|
|
154
205
|
saved_to_disk=save_files,
|
|
155
206
|
)
|
|
156
207
|
)
|
|
@@ -158,6 +209,18 @@ def crop_signatures(
|
|
|
158
209
|
return generated_crops if return_bytes else generated_paths
|
|
159
210
|
|
|
160
211
|
|
|
212
|
+
def _build_docx_bytes(image_bytes: bytes) -> bytes:
|
|
213
|
+
if Document is None:
|
|
214
|
+
raise SignatureDocxUnavailable(
|
|
215
|
+
"python-docx is required to generate DOCX outputs for signature crops."
|
|
216
|
+
)
|
|
217
|
+
document = Document()
|
|
218
|
+
document.add_picture(io.BytesIO(image_bytes))
|
|
219
|
+
buffer = io.BytesIO()
|
|
220
|
+
document.save(buffer)
|
|
221
|
+
return buffer.getvalue()
|
|
222
|
+
|
|
223
|
+
|
|
161
224
|
def _to_clip_rect(page, bbox: tuple[float, float, float, float]):
|
|
162
225
|
width = float(page.rect.width)
|
|
163
226
|
height = float(page.rect.height)
|
sigdetect/detector/__init__.py
CHANGED
|
@@ -22,10 +22,13 @@ ENGINE_REGISTRY: dict[str, Type[Detector]] = {
|
|
|
22
22
|
ENGINE_REGISTRY.setdefault("pypdf", PyPDF2Detector)
|
|
23
23
|
|
|
24
24
|
try: # pragma: no cover - optional dependency
|
|
25
|
-
from .pymupdf_engine import PyMuPDFDetector
|
|
25
|
+
from .pymupdf_engine import PyMuPDFDetector
|
|
26
|
+
from .pymupdf_engine import fitz as pymupdf_fitz # type: ignore
|
|
26
27
|
|
|
27
|
-
if getattr(PyMuPDFDetector, "Name", None):
|
|
28
|
+
if pymupdf_fitz is not None and getattr(PyMuPDFDetector, "Name", None):
|
|
28
29
|
ENGINE_REGISTRY[PyMuPDFDetector.Name] = PyMuPDFDetector
|
|
30
|
+
else:
|
|
31
|
+
PyMuPDFDetector = None # type: ignore
|
|
29
32
|
except Exception:
|
|
30
33
|
PyMuPDFDetector = None # type: ignore
|
|
31
34
|
|
|
@@ -33,17 +36,16 @@ except Exception:
|
|
|
33
36
|
def BuildDetector(configuration: DetectConfiguration) -> Detector:
|
|
34
37
|
"""Instantiate the configured engine or raise a clear error."""
|
|
35
38
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
or getattr(configuration, "engine", None)
|
|
39
|
-
or PyPDF2Detector.Name
|
|
40
|
-
)
|
|
39
|
+
# Force geometry-capable engine selection (auto prefers PyMuPDF when available).
|
|
40
|
+
engine_name = "auto"
|
|
41
41
|
normalized = str(engine_name).lower()
|
|
42
42
|
|
|
43
43
|
if normalized == "auto":
|
|
44
44
|
detector_cls: Type[Detector] | None = None
|
|
45
45
|
if PyMuPDFDetector is not None:
|
|
46
|
-
detector_cls =
|
|
46
|
+
detector_cls = (
|
|
47
|
+
ENGINE_REGISTRY.get(getattr(PyMuPDFDetector, "Name", "")) or PyMuPDFDetector
|
|
48
|
+
)
|
|
47
49
|
if detector_cls is None:
|
|
48
50
|
detector_cls = ENGINE_REGISTRY.get(PyPDF2Detector.Name) or ENGINE_REGISTRY.get("pypdf")
|
|
49
51
|
warnings.warn(
|
|
@@ -30,8 +30,8 @@ class PyMuPDFDetector(PyPDF2Detector):
|
|
|
30
30
|
def __init__(self, configuration):
|
|
31
31
|
if fitz is None: # pragma: no cover - optional dependency
|
|
32
32
|
raise ValueError(
|
|
33
|
-
"PyMuPDF engine requires the optional 'pymupdf' dependency. Install
|
|
34
|
-
"
|
|
33
|
+
"PyMuPDF engine requires the optional 'pymupdf' dependency. Install 'pymupdf' or add "
|
|
34
|
+
"it to your environment."
|
|
35
35
|
)
|
|
36
36
|
super().__init__(configuration)
|
|
37
37
|
|
|
@@ -20,6 +20,9 @@ class Signature:
|
|
|
20
20
|
RenderType: str = "typed"
|
|
21
21
|
BoundingBox: tuple[float, float, float, float] | None = None
|
|
22
22
|
CropPath: str | None = None
|
|
23
|
+
CropBytes: str | None = None
|
|
24
|
+
CropDocxPath: str | None = None
|
|
25
|
+
CropDocxBytes: str | None = None
|
|
23
26
|
|
|
24
27
|
def to_dict(self) -> dict[str, Any]:
|
|
25
28
|
"""Return the legacy snake_case representation used in JSON payloads."""
|
|
@@ -35,4 +38,7 @@ class Signature:
|
|
|
35
38
|
"render_type": self.RenderType,
|
|
36
39
|
"bounding_box": list(self.BoundingBox) if self.BoundingBox else None,
|
|
37
40
|
"crop_path": self.CropPath,
|
|
41
|
+
"crop_bytes": self.CropBytes,
|
|
42
|
+
"crop_docx_path": self.CropDocxPath,
|
|
43
|
+
"crop_docx_bytes": self.CropDocxBytes,
|
|
38
44
|
}
|
sigdetect/wet_detection.py
CHANGED
|
@@ -67,11 +67,7 @@ class OcrLine:
|
|
|
67
67
|
def should_run_wet_pipeline(file_result: FileResult) -> bool:
|
|
68
68
|
"""Return ``True`` when the OCR pipeline should run for ``file_result``."""
|
|
69
69
|
|
|
70
|
-
return (
|
|
71
|
-
(not file_result.ElectronicSignatureFound or file_result.SignatureCount == 0)
|
|
72
|
-
or (bool(file_result.ScannedPdf) and not file_result.ElectronicSignatureFound)
|
|
73
|
-
or bool(file_result.MixedContent)
|
|
74
|
-
)
|
|
70
|
+
return not bool(file_result.ElectronicSignatureFound)
|
|
75
71
|
|
|
76
72
|
|
|
77
73
|
def apply_wet_detection(
|
|
@@ -83,8 +79,6 @@ def apply_wet_detection(
|
|
|
83
79
|
) -> bool:
|
|
84
80
|
"""Augment ``file_result`` with OCR-detected wet signatures when possible."""
|
|
85
81
|
|
|
86
|
-
if not configuration.DetectWetSignatures:
|
|
87
|
-
return False
|
|
88
82
|
if not should_run_wet_pipeline(file_result):
|
|
89
83
|
return False
|
|
90
84
|
|
|
@@ -96,6 +90,8 @@ def apply_wet_detection(
|
|
|
96
90
|
logger.warning("Wet detection unavailable", extra={"error": str(exc)})
|
|
97
91
|
return False
|
|
98
92
|
|
|
93
|
+
original_esign = file_result.ElectronicSignatureFound
|
|
94
|
+
original_mixed = file_result.MixedContent
|
|
99
95
|
try:
|
|
100
96
|
added = _detect(pdf_path, configuration, file_result, logger=logger)
|
|
101
97
|
if not added:
|
|
@@ -106,6 +102,9 @@ def apply_wet_detection(
|
|
|
106
102
|
if logger:
|
|
107
103
|
logger.warning("Wet detection failed", extra={"error": str(exc)})
|
|
108
104
|
return False
|
|
105
|
+
finally:
|
|
106
|
+
file_result.ElectronicSignatureFound = original_esign
|
|
107
|
+
file_result.MixedContent = original_mixed
|
|
109
108
|
|
|
110
109
|
|
|
111
110
|
def _detect(
|
|
@@ -138,6 +137,7 @@ def _detect(
|
|
|
138
137
|
)
|
|
139
138
|
)
|
|
140
139
|
candidates.extend(_image_candidates(page))
|
|
140
|
+
candidates = _filter_candidates_for_page(candidates)
|
|
141
141
|
accepted = [
|
|
142
142
|
candidate
|
|
143
143
|
for candidate in candidates
|
|
@@ -157,7 +157,11 @@ def _detect(
|
|
|
157
157
|
if not new_signatures:
|
|
158
158
|
return False
|
|
159
159
|
|
|
160
|
-
|
|
160
|
+
filtered_signatures = _dedupe_wet_signatures(new_signatures)
|
|
161
|
+
if not filtered_signatures:
|
|
162
|
+
return False
|
|
163
|
+
|
|
164
|
+
file_result.Signatures.extend(filtered_signatures)
|
|
161
165
|
_refresh_metadata(file_result)
|
|
162
166
|
return True
|
|
163
167
|
finally:
|
|
@@ -277,6 +281,31 @@ def _build_candidates(
|
|
|
277
281
|
)
|
|
278
282
|
|
|
279
283
|
|
|
284
|
+
def _has_evidence(candidate: WetCandidate, token: str) -> bool:
    """Return ``True`` when ``token`` is contained in ``candidate.Evidence``."""
    recorded = candidate.Evidence
    return token in recorded
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def _is_image_candidate(candidate: WetCandidate) -> bool:
    """Return ``True`` when the candidate's evidence carries ``image_signature:true``."""
    image_token = "image_signature:true"
    return _has_evidence(candidate, image_token)
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def _has_stroke(candidate: WetCandidate) -> bool:
    """Return ``True`` when the candidate's evidence carries ``stroke:yes``."""
    stroke_token = "stroke:yes"
    return _has_evidence(candidate, stroke_token)
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _filter_candidates_for_page(candidates: Sequence[WetCandidate]) -> list[WetCandidate]:
    """Suppress label-only candidates on a page that also has an image signature.

    Returns a new list in the original order:

    * empty input yields an empty list;
    * if no candidate is image-backed, every candidate is kept;
    * otherwise only image-backed candidates and candidates with stroke
      evidence are kept.
    """
    if not candidates:
        return []

    # Evaluate the image test once per candidate so it can drive both the
    # page-level decision and the per-candidate filter below.
    image_flags = [_is_image_candidate(entry) for entry in candidates]
    if not any(image_flags):
        return list(candidates)

    return [
        entry
        for entry, is_image in zip(candidates, image_flags)
        if is_image or _has_stroke(entry)
    ]
|
|
307
|
+
|
|
308
|
+
|
|
280
309
|
def _infer_role(normalized_text: str) -> str:
|
|
281
310
|
for role, keywords in ROLE_KEYWORDS.items():
|
|
282
311
|
if any(keyword in normalized_text for keyword in keywords):
|
|
@@ -379,7 +408,7 @@ def _image_candidates(page) -> list[WetCandidate]:
|
|
|
379
408
|
continue
|
|
380
409
|
if hasattr(rect, "x0"):
|
|
381
410
|
x0, y0, x1, y1 = float(rect.x0), float(rect.y0), float(rect.x1), float(rect.y1)
|
|
382
|
-
elif isinstance(rect,
|
|
411
|
+
elif isinstance(rect, tuple | list) and len(rect) == 4:
|
|
383
412
|
x0, y0, x1, y1 = map(float, rect)
|
|
384
413
|
else:
|
|
385
414
|
continue
|
|
@@ -422,7 +451,7 @@ def _infer_role_nearby(rect, words) -> str:
|
|
|
422
451
|
proximity_x = 140.0
|
|
423
452
|
if hasattr(rect, "x0"):
|
|
424
453
|
rx0, ry0, rx1, ry1 = float(rect.x0), float(rect.y0), float(rect.x1), float(rect.y1)
|
|
425
|
-
elif isinstance(rect,
|
|
454
|
+
elif isinstance(rect, tuple | list) and len(rect) == 4:
|
|
426
455
|
rx0, ry0, rx1, ry1 = map(float, rect)
|
|
427
456
|
else:
|
|
428
457
|
return "unknown"
|
|
@@ -471,6 +500,29 @@ def _to_signatures(
|
|
|
471
500
|
return signatures
|
|
472
501
|
|
|
473
502
|
|
|
503
|
+
def _signature_rank(signature: Signature) -> tuple[int, int, int]:
    """Build a sortable ``(source_rank, score, page)`` key for ``signature``.

    ``source_rank`` is 3 when the evidence contains ``image_signature:true``,
    2 when it contains ``stroke:yes`` (and no image token), and 1 otherwise.
    ``Score`` and ``Page`` are coerced with ``int``; missing/falsy values
    count as 0.
    """
    tokens = frozenset(signature.Evidence or ())
    # Ordered strongest-first: the first token present decides the rank.
    ranked_tokens = (("image_signature:true", 3), ("stroke:yes", 2))
    source_rank = next((rank for token, rank in ranked_tokens if token in tokens), 1)
    # NOTE(review): int() truncates fractional scores; presumably Score is
    # integral here. Confirm against the Signature model.
    score = int(signature.Score or 0)
    page = int(signature.Page or 0)
    return (source_rank, score, page)
|
|
512
|
+
|
|
513
|
+
|
|
514
|
+
def _dedupe_wet_signatures(signatures: Sequence[Signature]) -> list[Signature]:
    """Keep only the best-ranked signature for each role.

    Roles are compared after stripping and lower-casing. Signatures whose
    role is missing or ``"unknown"`` are dropped. Within a role the signature
    with the greatest ``_signature_rank`` wins; on a tie the earlier one is
    kept. The survivors are returned sorted by page, then by ``Role``.
    """
    winners: dict[str, Signature] = {}
    for entry in signatures:
        role_key = (entry.Role or "unknown").strip().lower()
        if role_key == "unknown":
            continue
        incumbent = winners.get(role_key)
        # Only a strictly higher rank displaces the current holder.
        if incumbent is not None and _signature_rank(entry) <= _signature_rank(incumbent):
            continue
        winners[role_key] = entry

    ordered = list(winners.values())
    ordered.sort(key=lambda sig: (int(sig.Page or 0), sig.Role or ""))
    return ordered
|
|
524
|
+
|
|
525
|
+
|
|
474
526
|
def _mark_manual_review(file_result: FileResult, reason: str) -> None:
|
|
475
527
|
hints = _split_hints(file_result.Hints)
|
|
476
528
|
hints.add(f"ManualReview:{reason}")
|
|
@@ -485,9 +537,7 @@ def _refresh_metadata(file_result: FileResult) -> None:
|
|
|
485
537
|
if roles:
|
|
486
538
|
file_result.Roles = ";".join(roles)
|
|
487
539
|
file_result.ElectronicSignatureFound = file_result.SignatureCount > 0
|
|
488
|
-
file_result.MixedContent = (
|
|
489
|
-
file_result.ElectronicSignatureFound and bool(file_result.ScannedPdf)
|
|
490
|
-
)
|
|
540
|
+
file_result.MixedContent = file_result.ElectronicSignatureFound and bool(file_result.ScannedPdf)
|
|
491
541
|
hints = _split_hints(file_result.Hints)
|
|
492
542
|
hints |= {sig.Hint for sig in file_result.Signatures if sig.Hint}
|
|
493
543
|
file_result.Hints = ";".join(sorted(hints))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sigdetect
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.5.1
|
|
4
4
|
Summary: Signature detection and role attribution for PDFs
|
|
5
5
|
Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
|
|
6
6
|
License: MIT
|
|
@@ -10,9 +10,11 @@ Requires-Dist: pypdf>=4.0.0
|
|
|
10
10
|
Requires-Dist: rich>=13.0
|
|
11
11
|
Requires-Dist: typer>=0.12
|
|
12
12
|
Requires-Dist: pydantic>=2.5
|
|
13
|
+
Requires-Dist: pillow>=10.0
|
|
14
|
+
Requires-Dist: python-docx>=1.1.0
|
|
15
|
+
Requires-Dist: pytesseract>=0.3.10
|
|
16
|
+
Requires-Dist: pymupdf>=1.23
|
|
13
17
|
Requires-Dist: pyyaml>=6.0
|
|
14
|
-
Provides-Extra: pymupdf
|
|
15
|
-
Requires-Dist: pymupdf>=1.23; extra == "pymupdf"
|
|
16
18
|
|
|
17
19
|
# CaseWorks.Automation.CaseDocumentIntake
|
|
18
20
|
|
|
@@ -95,14 +97,16 @@ sigdetect detect \
|
|
|
95
97
|
### Notes
|
|
96
98
|
|
|
97
99
|
- The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
|
|
98
|
-
-
|
|
100
|
+
- Engine selection is forced to **auto** (prefers PyMuPDF for geometry, falls back to PyPDF2); any configured `engine` value is overridden.
|
|
99
101
|
- `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
|
|
100
102
|
- `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
|
|
101
103
|
- `--profile` selects tuned role logic:
|
|
102
104
|
- `hipaa` → patient / representative / attorney
|
|
103
105
|
- `retainer` → client / firm (prefers detecting two signatures)
|
|
104
106
|
- `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
|
|
105
|
-
-
|
|
107
|
+
- Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
|
|
108
|
+
- Cropping (`--crop-signatures`) writes PNG crops to disk by default; enable `--crop-docx` to write DOCX files instead of PNGs. `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` and, when `--crop-docx` is enabled, embeds DOCX bytes in `signatures[].crop_docx_bytes`. PyMuPDF is required for crops, and `python-docx` is required for DOCX output.
|
|
109
|
+
- Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
|
|
106
110
|
- If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
|
|
107
111
|
|
|
108
112
|
### EDA (quick aggregate stats)
|
|
@@ -113,6 +117,8 @@ sigdetect eda \
|
|
|
113
117
|
|
|
114
118
|
~~~
|
|
115
119
|
|
|
120
|
+
`sigdetect eda` expects `results.json`; enable `write_results: true` when running detect.
|
|
121
|
+
|
|
116
122
|
---
|
|
117
123
|
|
|
118
124
|
## Library usage
|
|
@@ -136,13 +142,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
|
136
142
|
print(result.to_dict())
|
|
137
143
|
~~~
|
|
138
144
|
|
|
139
|
-
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
|
|
145
|
+
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image; when DOCX cropping is enabled, `crop_docx_path` points at the generated doc. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
|
|
140
146
|
|
|
141
147
|
---
|
|
142
148
|
|
|
143
149
|
## Library API (embed in another script)
|
|
144
150
|
|
|
145
|
-
Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping
|
|
151
|
+
Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping. Engine selection is forced to `auto` (PyMuPDF preferred) to ensure geometry. Wet detection runs automatically for non-e-sign PDFs; pass `runWetDetection=False` to skip OCR.
|
|
146
152
|
|
|
147
153
|
~~~python
|
|
148
154
|
from pathlib import Path
|
|
@@ -165,6 +171,7 @@ result = DetectPdf(
|
|
|
165
171
|
profileName="retainer",
|
|
166
172
|
includePseudoSignatures=True,
|
|
167
173
|
recurseXObjects=True,
|
|
174
|
+
# runWetDetection=False, # disable OCR-backed wet detection if desired
|
|
168
175
|
)
|
|
169
176
|
print(
|
|
170
177
|
result["file"],
|
|
@@ -187,7 +194,7 @@ for res in ScanDirectory(
|
|
|
187
194
|
# store in DB, print, etc.
|
|
188
195
|
pass
|
|
189
196
|
|
|
190
|
-
# 3) Crop
|
|
197
|
+
# 3) Crop signature snippets for FileResult objects (requires PyMuPDF; DOCX needs python-docx)
|
|
191
198
|
detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
|
|
192
199
|
file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
193
200
|
CropSignatureImages(
|
|
@@ -226,7 +233,8 @@ High-level summary (per file):
|
|
|
226
233
|
"hint": "AcroSig:sig_patient",
|
|
227
234
|
"render_type": "typed",
|
|
228
235
|
"bounding_box": [10.0, 10.0, 150.0, 40.0],
|
|
229
|
-
"crop_path": "signature_crops/example/sig_01_patient.png"
|
|
236
|
+
"crop_path": "signature_crops/example/sig_01_patient.png",
|
|
237
|
+
"crop_docx_path": null
|
|
230
238
|
},
|
|
231
239
|
{
|
|
232
240
|
"page": null,
|
|
@@ -253,6 +261,9 @@ High-level summary (per file):
|
|
|
253
261
|
- In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
|
|
254
262
|
- **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
|
|
255
263
|
- **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
|
|
264
|
+
- **`signatures[].crop_docx_path`** is populated when DOCX crops are generated (`--crop-docx` or `docx=True`).
|
|
265
|
+
- **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
|
|
266
|
+
- **`signatures[].crop_docx_bytes`** contains base64 DOCX data when `--crop-docx` and `--crop-bytes` are enabled together.
|
|
256
267
|
|
|
257
268
|
---
|
|
258
269
|
|
|
@@ -274,14 +285,16 @@ You can keep one config YAML per dataset, e.g.:
|
|
|
274
285
|
# ./sample_data/config.yml (example)
|
|
275
286
|
pdf_root: ./pdfs
|
|
276
287
|
out_dir: ./sigdetect_out
|
|
277
|
-
engine:
|
|
288
|
+
engine: auto
|
|
289
|
+
write_results: false
|
|
278
290
|
pseudo_signatures: true
|
|
279
291
|
recurse_xobjects: true
|
|
280
292
|
profile: retainer # or: hipaa
|
|
281
293
|
crop_signatures: false # enable to write PNG crops (requires pymupdf)
|
|
294
|
+
crop_docx: false # enable to write DOCX crops instead of PNGs (requires python-docx)
|
|
282
295
|
# crop_output_dir: ./signature_crops
|
|
283
296
|
crop_image_dpi: 200
|
|
284
|
-
detect_wet_signatures: false #
|
|
297
|
+
detect_wet_signatures: false # kept for compatibility; non-e-sign PDFs still trigger OCR
|
|
285
298
|
wet_ocr_dpi: 200
|
|
286
299
|
wet_ocr_languages: eng
|
|
287
300
|
wet_precision_threshold: 0.82
|
|
@@ -299,7 +312,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
|
|
|
299
312
|
- Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
|
|
300
313
|
- Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
|
|
301
314
|
- When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
|
|
302
|
-
- **Wet detection (
|
|
315
|
+
- **Wet detection (non-e-sign):** The CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection whenever no e-sign evidence is found. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. When an image-based signature is present on a page, label-only OCR candidates are suppressed unless a stroke is detected. Results are deduped to the top signature per role (dropping `unknown`). Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
|
|
303
316
|
|
|
304
317
|
---
|
|
305
318
|
|
|
@@ -1,24 +1,24 @@
|
|
|
1
1
|
sigdetect/__init__.py,sha256=YvnTwlC1jfq83EhQS_1JjiiHK7_wJCCU1JvHv5E1qWY,573
|
|
2
|
-
sigdetect/api.py,sha256=
|
|
3
|
-
sigdetect/cli.py,sha256=
|
|
4
|
-
sigdetect/config.py,sha256
|
|
5
|
-
sigdetect/cropping.py,sha256=
|
|
2
|
+
sigdetect/api.py,sha256=hDfa6z4SoHth1Dw9HDfSPiytMQrqu_oyBZlXBwSh9g4,11010
|
|
3
|
+
sigdetect/cli.py,sha256=X5GqZ-PK67vz4OHN5r7h-V0hO886ZblUiUdKDuFowtU,10930
|
|
4
|
+
sigdetect/config.py,sha256=3SP1rkcWBGXloCDFomBJRMRKZOvXuHQbhIBqpVrzYmY,8365
|
|
5
|
+
sigdetect/cropping.py,sha256=HfOJrV2Xv9Eo0lCIl3mukz49agKB6h2TML99B0qQJNc,8837
|
|
6
6
|
sigdetect/eda.py,sha256=S92G1Gjmepri__D0n_V6foq0lQgH-RXI9anW8A58jfw,4681
|
|
7
7
|
sigdetect/logging_setup.py,sha256=LMF8ao_a-JwH0S522T6aYTFX3e8Ajjv_5ODS2YiBcHA,6404
|
|
8
8
|
sigdetect/utils.py,sha256=T9rubLf5T9JmjOHYMOba1j34fhOJaWocAXccnGTxRUE,5198
|
|
9
|
-
sigdetect/wet_detection.py,sha256=
|
|
9
|
+
sigdetect/wet_detection.py,sha256=zvi11XUmm_xLZ4BLvxInwMQg8YLcyQzEYAM9QSdJOIs,18259
|
|
10
10
|
sigdetect/data/role_rules.retainer.yml,sha256=IFdwKnDBXR2cTkdfrsZ6ku6CXD8S_dg5A3vKRKLW5h8,2532
|
|
11
11
|
sigdetect/data/role_rules.yml,sha256=HuLKsZR_A6sD9XvY4NHiY_VG3dS5ERNCBF9-Mxawomw,2751
|
|
12
12
|
sigdetect/data/vendor_patterns.yml,sha256=NRbZNQxcx_GuL6n1jAphBn6MM6ChCpeWGCsjbRx-PEo,384
|
|
13
|
-
sigdetect/detector/__init__.py,sha256=
|
|
13
|
+
sigdetect/detector/__init__.py,sha256=nT52mCI9s03Rso_RS86mm223rJfl5GlGDFsXwMJ3z3E,2548
|
|
14
14
|
sigdetect/detector/base.py,sha256=L-iXWXqsTetDc4jRZo_wOdbNpKqOY20mX9FefrugdT0,263
|
|
15
15
|
sigdetect/detector/base_detector.py,sha256=GmAgUWO_fQgIfnihZSoyhR3wpnwZ-X3hS0Kuyz4G6Ys,608
|
|
16
16
|
sigdetect/detector/file_result_model.py,sha256=j2gTc9Sw3fJOHlexYsR_m5DiwHA8DzIzAMToESfvo4A,1767
|
|
17
|
-
sigdetect/detector/pymupdf_engine.py,sha256=
|
|
17
|
+
sigdetect/detector/pymupdf_engine.py,sha256=N6oxvUa-48VvvhjbMk0R0kfScsggNKS7u5FLSeBRfWw,17358
|
|
18
18
|
sigdetect/detector/pypdf2_engine.py,sha256=kB8cIp_gMvCla0LIBi9sd19g0361Oc9TjCW_ZViUBJQ,47410
|
|
19
|
-
sigdetect/detector/signature_model.py,sha256=
|
|
20
|
-
sigdetect-0.
|
|
21
|
-
sigdetect-0.
|
|
22
|
-
sigdetect-0.
|
|
23
|
-
sigdetect-0.
|
|
24
|
-
sigdetect-0.
|
|
19
|
+
sigdetect/detector/signature_model.py,sha256=T2Hmfkfz_hZsDzwOhepxfNmkedxQp3_XHdrP8yGKoCk,1322
|
|
20
|
+
sigdetect-0.5.1.dist-info/METADATA,sha256=_Jnyl9_A1yZUrKwWxUxVB-9rcMG3MdUqiN5WX_zlpqQ,14131
|
|
21
|
+
sigdetect-0.5.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
22
|
+
sigdetect-0.5.1.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
|
|
23
|
+
sigdetect-0.5.1.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
|
|
24
|
+
sigdetect-0.5.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|