sigdetect 0.4.0__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sigdetect-0.4.0 → sigdetect-0.5.0}/PKG-INFO +23 -14
- {sigdetect-0.4.0 → sigdetect-0.5.0}/README.md +18 -11
- {sigdetect-0.4.0 → sigdetect-0.5.0}/pyproject.toml +5 -4
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/api.py +42 -12
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/cli.py +53 -24
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/config.py +5 -0
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/cropping.py +71 -15
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/detector/__init__.py +10 -8
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/detector/pymupdf_engine.py +2 -2
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/detector/signature_model.py +2 -0
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/wet_detection.py +63 -13
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect.egg-info/PKG-INFO +23 -14
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect.egg-info/requires.txt +4 -3
- {sigdetect-0.4.0 → sigdetect-0.5.0}/tests/test_api.py +36 -1
- {sigdetect-0.4.0 → sigdetect-0.5.0}/tests/test_cli.py +129 -2
- {sigdetect-0.4.0 → sigdetect-0.5.0}/tests/test_cropping.py +12 -1
- {sigdetect-0.4.0 → sigdetect-0.5.0}/tests/test_detector_options.py +4 -4
- sigdetect-0.5.0/tests/test_wet_detection.py +215 -0
- sigdetect-0.4.0/tests/test_wet_detection.py +0 -111
- {sigdetect-0.4.0 → sigdetect-0.5.0}/setup.cfg +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/__init__.py +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/data/role_rules.retainer.yml +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/data/role_rules.yml +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/data/vendor_patterns.yml +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/detector/base.py +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/detector/base_detector.py +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/detector/file_result_model.py +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/detector/pypdf2_engine.py +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/eda.py +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/logging_setup.py +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/utils.py +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect.egg-info/SOURCES.txt +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect.egg-info/dependency_links.txt +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect.egg-info/entry_points.txt +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect.egg-info/top_level.txt +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.0}/tests/test_pymupdf_engine.py +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.0}/tests/test_widget_role_patient_smoke.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sigdetect
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: Signature detection and role attribution for PDFs
|
|
5
5
|
Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
|
|
6
6
|
License: MIT
|
|
@@ -10,9 +10,11 @@ Requires-Dist: pypdf>=4.0.0
|
|
|
10
10
|
Requires-Dist: rich>=13.0
|
|
11
11
|
Requires-Dist: typer>=0.12
|
|
12
12
|
Requires-Dist: pydantic>=2.5
|
|
13
|
+
Requires-Dist: pillow>=10.0
|
|
14
|
+
Requires-Dist: python-docx>=1.1.0
|
|
15
|
+
Requires-Dist: pytesseract>=0.3.10
|
|
16
|
+
Requires-Dist: pymupdf>=1.23
|
|
13
17
|
Requires-Dist: pyyaml>=6.0
|
|
14
|
-
Provides-Extra: pymupdf
|
|
15
|
-
Requires-Dist: pymupdf>=1.23; extra == "pymupdf"
|
|
16
18
|
|
|
17
19
|
# CaseWorks.Automation.CaseDocumentIntake
|
|
18
20
|
|
|
@@ -95,14 +97,16 @@ sigdetect detect \
|
|
|
95
97
|
### Notes
|
|
96
98
|
|
|
97
99
|
- The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
|
|
98
|
-
-
|
|
100
|
+
- Engine selection is forced to **auto** (prefers PyMuPDF for geometry, falls back to PyPDF2); any configured `engine` value is overridden.
|
|
99
101
|
- `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
|
|
100
102
|
- `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
|
|
101
103
|
- `--profile` selects tuned role logic:
|
|
102
104
|
- `hipaa` → patient / representative / attorney
|
|
103
105
|
- `retainer` → client / firm (prefers detecting two signatures)
|
|
104
106
|
- `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
|
|
105
|
-
-
|
|
107
|
+
- Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
|
|
108
|
+
- Cropping (`--crop-signatures`) writes a one-image `.docx` per signature in the crop output directory (no PNG files on disk); `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` for in-memory use. PyMuPDF is required for crops, and `python-docx` is required for `.docx` output.
|
|
109
|
+
- Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
|
|
106
110
|
- If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
|
|
107
111
|
|
|
108
112
|
### EDA (quick aggregate stats)
|
|
@@ -113,6 +117,8 @@ sigdetect eda \
|
|
|
113
117
|
|
|
114
118
|
~~~
|
|
115
119
|
|
|
120
|
+
`sigdetect eda` expects `results.json`; enable `write_results: true` when running detect.
|
|
121
|
+
|
|
116
122
|
---
|
|
117
123
|
|
|
118
124
|
## Library usage
|
|
@@ -136,13 +142,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
|
136
142
|
print(result.to_dict())
|
|
137
143
|
~~~
|
|
138
144
|
|
|
139
|
-
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When
|
|
145
|
+
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When cropping is enabled, `crop_path` points at the generated `.docx`. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
|
|
140
146
|
|
|
141
147
|
---
|
|
142
148
|
|
|
143
149
|
## Library API (embed in another script)
|
|
144
150
|
|
|
145
|
-
Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping
|
|
151
|
+
Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping. Engine selection is forced to `auto` (PyMuPDF preferred) to ensure geometry. Wet detection runs automatically for non-e-sign PDFs; pass `runWetDetection=False` to skip OCR.
|
|
146
152
|
|
|
147
153
|
~~~python
|
|
148
154
|
from pathlib import Path
|
|
@@ -165,6 +171,7 @@ result = DetectPdf(
|
|
|
165
171
|
profileName="retainer",
|
|
166
172
|
includePseudoSignatures=True,
|
|
167
173
|
recurseXObjects=True,
|
|
174
|
+
# runWetDetection=False, # disable OCR-backed wet detection if desired
|
|
168
175
|
)
|
|
169
176
|
print(
|
|
170
177
|
result["file"],
|
|
@@ -187,7 +194,7 @@ for res in ScanDirectory(
|
|
|
187
194
|
# store in DB, print, etc.
|
|
188
195
|
pass
|
|
189
196
|
|
|
190
|
-
# 3)
|
|
197
|
+
# 3) Create DOCX crops for FileResult objects (requires PyMuPDF + python-docx)
|
|
191
198
|
detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
|
|
192
199
|
file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
193
200
|
CropSignatureImages(
|
|
@@ -226,7 +233,7 @@ High-level summary (per file):
|
|
|
226
233
|
"hint": "AcroSig:sig_patient",
|
|
227
234
|
"render_type": "typed",
|
|
228
235
|
"bounding_box": [10.0, 10.0, 150.0, 40.0],
|
|
229
|
-
"crop_path": "signature_crops/example/sig_01_patient.
|
|
236
|
+
"crop_path": "signature_crops/example/sig_01_patient.docx"
|
|
230
237
|
},
|
|
231
238
|
{
|
|
232
239
|
"page": null,
|
|
@@ -252,7 +259,8 @@ High-level summary (per file):
|
|
|
252
259
|
- **`roles`** summarizes unique non-`unknown` roles across signatures.
|
|
253
260
|
- In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
|
|
254
261
|
- **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
|
|
255
|
-
- **`signatures[].crop_path`** is populated when
|
|
262
|
+
- **`signatures[].crop_path`** is populated when DOCX crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
|
|
263
|
+
- **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
|
|
256
264
|
|
|
257
265
|
---
|
|
258
266
|
|
|
@@ -274,14 +282,15 @@ You can keep one config YAML per dataset, e.g.:
|
|
|
274
282
|
# ./sample_data/config.yml (example)
|
|
275
283
|
pdf_root: ./pdfs
|
|
276
284
|
out_dir: ./sigdetect_out
|
|
277
|
-
engine:
|
|
285
|
+
engine: auto
|
|
286
|
+
write_results: false
|
|
278
287
|
pseudo_signatures: true
|
|
279
288
|
recurse_xobjects: true
|
|
280
289
|
profile: retainer # or: hipaa
|
|
281
|
-
crop_signatures: false # enable to write
|
|
290
|
+
crop_signatures: false # enable to write DOCX crops (requires pymupdf + python-docx)
|
|
282
291
|
# crop_output_dir: ./signature_crops
|
|
283
292
|
crop_image_dpi: 200
|
|
284
|
-
detect_wet_signatures: false #
|
|
293
|
+
detect_wet_signatures: false # kept for compatibility; non-e-sign PDFs still trigger OCR
|
|
285
294
|
wet_ocr_dpi: 200
|
|
286
295
|
wet_ocr_languages: eng
|
|
287
296
|
wet_precision_threshold: 0.82
|
|
@@ -299,7 +308,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
|
|
|
299
308
|
- Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
|
|
300
309
|
- Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
|
|
301
310
|
- When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
|
|
302
|
-
- **Wet detection (
|
|
311
|
+
- **Wet detection (non-e-sign):** The CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection whenever no e-sign evidence is found. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. When an image-based signature is present on a page, label-only OCR candidates are suppressed unless a stroke is detected. Results are deduped to the top signature per role (dropping `unknown`). Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
|
|
303
312
|
|
|
304
313
|
---
|
|
305
314
|
|
|
@@ -79,14 +79,16 @@ sigdetect detect \
|
|
|
79
79
|
### Notes
|
|
80
80
|
|
|
81
81
|
- The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
|
|
82
|
-
-
|
|
82
|
+
- Engine selection is forced to **auto** (prefers PyMuPDF for geometry, falls back to PyPDF2); any configured `engine` value is overridden.
|
|
83
83
|
- `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
|
|
84
84
|
- `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
|
|
85
85
|
- `--profile` selects tuned role logic:
|
|
86
86
|
- `hipaa` → patient / representative / attorney
|
|
87
87
|
- `retainer` → client / firm (prefers detecting two signatures)
|
|
88
88
|
- `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
|
|
89
|
-
-
|
|
89
|
+
- Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
|
|
90
|
+
- Cropping (`--crop-signatures`) writes a one-image `.docx` per signature in the crop output directory (no PNG files on disk); `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` for in-memory use. PyMuPDF is required for crops, and `python-docx` is required for `.docx` output.
|
|
91
|
+
- Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
|
|
90
92
|
- If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
|
|
91
93
|
|
|
92
94
|
### EDA (quick aggregate stats)
|
|
@@ -97,6 +99,8 @@ sigdetect eda \
|
|
|
97
99
|
|
|
98
100
|
~~~
|
|
99
101
|
|
|
102
|
+
`sigdetect eda` expects `results.json`; enable `write_results: true` when running detect.
|
|
103
|
+
|
|
100
104
|
---
|
|
101
105
|
|
|
102
106
|
## Library usage
|
|
@@ -120,13 +124,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
|
120
124
|
print(result.to_dict())
|
|
121
125
|
~~~
|
|
122
126
|
|
|
123
|
-
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When
|
|
127
|
+
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When cropping is enabled, `crop_path` points at the generated `.docx`. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
|
|
124
128
|
|
|
125
129
|
---
|
|
126
130
|
|
|
127
131
|
## Library API (embed in another script)
|
|
128
132
|
|
|
129
|
-
Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping
|
|
133
|
+
Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping. Engine selection is forced to `auto` (PyMuPDF preferred) to ensure geometry. Wet detection runs automatically for non-e-sign PDFs; pass `runWetDetection=False` to skip OCR.
|
|
130
134
|
|
|
131
135
|
~~~python
|
|
132
136
|
from pathlib import Path
|
|
@@ -149,6 +153,7 @@ result = DetectPdf(
|
|
|
149
153
|
profileName="retainer",
|
|
150
154
|
includePseudoSignatures=True,
|
|
151
155
|
recurseXObjects=True,
|
|
156
|
+
# runWetDetection=False, # disable OCR-backed wet detection if desired
|
|
152
157
|
)
|
|
153
158
|
print(
|
|
154
159
|
result["file"],
|
|
@@ -171,7 +176,7 @@ for res in ScanDirectory(
|
|
|
171
176
|
# store in DB, print, etc.
|
|
172
177
|
pass
|
|
173
178
|
|
|
174
|
-
# 3)
|
|
179
|
+
# 3) Create DOCX crops for FileResult objects (requires PyMuPDF + python-docx)
|
|
175
180
|
detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
|
|
176
181
|
file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
177
182
|
CropSignatureImages(
|
|
@@ -210,7 +215,7 @@ High-level summary (per file):
|
|
|
210
215
|
"hint": "AcroSig:sig_patient",
|
|
211
216
|
"render_type": "typed",
|
|
212
217
|
"bounding_box": [10.0, 10.0, 150.0, 40.0],
|
|
213
|
-
"crop_path": "signature_crops/example/sig_01_patient.
|
|
218
|
+
"crop_path": "signature_crops/example/sig_01_patient.docx"
|
|
214
219
|
},
|
|
215
220
|
{
|
|
216
221
|
"page": null,
|
|
@@ -236,7 +241,8 @@ High-level summary (per file):
|
|
|
236
241
|
- **`roles`** summarizes unique non-`unknown` roles across signatures.
|
|
237
242
|
- In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
|
|
238
243
|
- **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
|
|
239
|
-
- **`signatures[].crop_path`** is populated when
|
|
244
|
+
- **`signatures[].crop_path`** is populated when DOCX crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
|
|
245
|
+
- **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
|
|
240
246
|
|
|
241
247
|
---
|
|
242
248
|
|
|
@@ -258,14 +264,15 @@ You can keep one config YAML per dataset, e.g.:
|
|
|
258
264
|
# ./sample_data/config.yml (example)
|
|
259
265
|
pdf_root: ./pdfs
|
|
260
266
|
out_dir: ./sigdetect_out
|
|
261
|
-
engine:
|
|
267
|
+
engine: auto
|
|
268
|
+
write_results: false
|
|
262
269
|
pseudo_signatures: true
|
|
263
270
|
recurse_xobjects: true
|
|
264
271
|
profile: retainer # or: hipaa
|
|
265
|
-
crop_signatures: false # enable to write
|
|
272
|
+
crop_signatures: false # enable to write DOCX crops (requires pymupdf + python-docx)
|
|
266
273
|
# crop_output_dir: ./signature_crops
|
|
267
274
|
crop_image_dpi: 200
|
|
268
|
-
detect_wet_signatures: false #
|
|
275
|
+
detect_wet_signatures: false # kept for compatibility; non-e-sign PDFs still trigger OCR
|
|
269
276
|
wet_ocr_dpi: 200
|
|
270
277
|
wet_ocr_languages: eng
|
|
271
278
|
wet_precision_threshold: 0.82
|
|
@@ -283,7 +290,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
|
|
|
283
290
|
- Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
|
|
284
291
|
- Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
|
|
285
292
|
- When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
|
|
286
|
-
- **Wet detection (
|
|
293
|
+
- **Wet detection (non-e-sign):** The CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection whenever no e-sign evidence is found. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. When an image-based signature is present on a page, label-only OCR candidates are suppressed unless a stroke is detected. Results are deduped to the top signature per role (dropping `unknown`). Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
|
|
287
294
|
|
|
288
295
|
---
|
|
289
296
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "sigdetect"
|
|
7
|
-
version = "0.4.0"
|
|
7
|
+
version = "0.5.0"
|
|
8
8
|
description = "Signature detection and role attribution for PDFs"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
authors = [{ name = "BT Asmamaw", email = "basmamaw@angeiongroup.com" }]
|
|
@@ -15,12 +15,13 @@ dependencies = [
|
|
|
15
15
|
"rich>=13.0",
|
|
16
16
|
"typer>=0.12",
|
|
17
17
|
"pydantic>=2.5",
|
|
18
|
+
"pillow>=10.0",
|
|
19
|
+
"python-docx>=1.1.0",
|
|
20
|
+
"pytesseract>=0.3.10",
|
|
21
|
+
"pymupdf>=1.23",
|
|
18
22
|
"pyyaml>=6.0",
|
|
19
23
|
]
|
|
20
24
|
|
|
21
|
-
[project.optional-dependencies]
|
|
22
|
-
pymupdf = ["pymupdf>=1.23"]
|
|
23
|
-
|
|
24
25
|
[project.scripts]
|
|
25
26
|
sigdetect = "sigdetect.cli:app"
|
|
26
27
|
|
|
@@ -9,6 +9,7 @@ from typing import Any, Generator, Iterable, Iterator, Literal, overload
|
|
|
9
9
|
from sigdetect.config import DetectConfiguration
|
|
10
10
|
from sigdetect.cropping import SignatureCrop
|
|
11
11
|
from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
|
|
12
|
+
from sigdetect.wet_detection import apply_wet_detection
|
|
12
13
|
|
|
13
14
|
EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
|
|
14
15
|
ProfileName = Literal["hipaa", "retainer"]
|
|
@@ -21,9 +22,13 @@ def DetectPdf(
|
|
|
21
22
|
engineName: EngineName = "auto",
|
|
22
23
|
includePseudoSignatures: bool = True,
|
|
23
24
|
recurseXObjects: bool = True,
|
|
25
|
+
runWetDetection: bool = True,
|
|
24
26
|
detector: Detector | None = None,
|
|
25
27
|
) -> dict[str, Any]:
|
|
26
|
-
"""Detect signature evidence and assign roles for a single PDF.
|
|
28
|
+
"""Detect signature evidence and assign roles for a single PDF.
|
|
29
|
+
|
|
30
|
+
Wet detection runs by default for non-e-sign PDFs; pass ``runWetDetection=False`` to skip OCR.
|
|
31
|
+
"""
|
|
27
32
|
|
|
28
33
|
resolvedPath = Path(pdfPath)
|
|
29
34
|
activeDetector = detector or get_detector(
|
|
@@ -36,6 +41,10 @@ def DetectPdf(
|
|
|
36
41
|
)
|
|
37
42
|
|
|
38
43
|
result = activeDetector.Detect(resolvedPath)
|
|
44
|
+
if runWetDetection:
|
|
45
|
+
configuration = _ResolveConfiguration(activeDetector)
|
|
46
|
+
if configuration is not None:
|
|
47
|
+
apply_wet_detection(resolvedPath, configuration, result)
|
|
39
48
|
return _ToPlainDictionary(result)
|
|
40
49
|
|
|
41
50
|
|
|
@@ -48,7 +57,10 @@ def get_detector(
|
|
|
48
57
|
recurseXObjects: bool = True,
|
|
49
58
|
outputDirectory: str | Path | None = None,
|
|
50
59
|
) -> Detector:
|
|
51
|
-
"""Return a reusable detector instance configured with the supplied options.
|
|
60
|
+
"""Return a reusable detector instance configured with the supplied options.
|
|
61
|
+
|
|
62
|
+
Engine selection is forced to ``auto`` (prefers PyMuPDF when available).
|
|
63
|
+
"""
|
|
52
64
|
|
|
53
65
|
configuration = DetectConfiguration(
|
|
54
66
|
PdfRoot=Path(pdfRoot) if pdfRoot is not None else Path.cwd(),
|
|
@@ -108,6 +120,7 @@ def _ToPlainValue(value: Any) -> Any:
|
|
|
108
120
|
def DetectMany(
|
|
109
121
|
pdfPaths: Iterable[str | Path],
|
|
110
122
|
*,
|
|
123
|
+
runWetDetection: bool = True,
|
|
111
124
|
detector: Detector | None = None,
|
|
112
125
|
**kwargs: Any,
|
|
113
126
|
) -> Iterator[dict[str, Any]]:
|
|
@@ -115,17 +128,18 @@ def DetectMany(
|
|
|
115
128
|
|
|
116
129
|
if detector is not None:
|
|
117
130
|
for pdfPath in pdfPaths:
|
|
118
|
-
yield _DetectWithDetector(detector, pdfPath)
|
|
131
|
+
yield _DetectWithDetector(detector, pdfPath, runWetDetection=runWetDetection)
|
|
119
132
|
return
|
|
120
133
|
|
|
121
134
|
for pdfPath in pdfPaths:
|
|
122
|
-
yield DetectPdf(pdfPath, **kwargs)
|
|
135
|
+
yield DetectPdf(pdfPath, runWetDetection=runWetDetection, **kwargs)
|
|
123
136
|
|
|
124
137
|
|
|
125
138
|
def ScanDirectory(
|
|
126
139
|
pdfRoot: str | Path,
|
|
127
140
|
*,
|
|
128
141
|
globPattern: str = "**/*.pdf",
|
|
142
|
+
runWetDetection: bool = True,
|
|
129
143
|
detector: Detector | None = None,
|
|
130
144
|
**kwargs: Any,
|
|
131
145
|
) -> Iterator[dict[str, Any]]:
|
|
@@ -143,7 +157,7 @@ def ScanDirectory(
|
|
|
143
157
|
|
|
144
158
|
for pdfPath in iterator:
|
|
145
159
|
if pdfPath.is_file() and pdfPath.suffix.lower() == ".pdf":
|
|
146
|
-
yield DetectPdf(pdfPath, detector=detector, **kwargs)
|
|
160
|
+
yield DetectPdf(pdfPath, detector=detector, runWetDetection=runWetDetection, **kwargs)
|
|
147
161
|
|
|
148
162
|
|
|
149
163
|
def ToCsvRow(result: dict[str, Any]) -> dict[str, Any]:
|
|
@@ -174,11 +188,25 @@ def Version() -> str:
|
|
|
174
188
|
return "0.0.0-dev"
|
|
175
189
|
|
|
176
190
|
|
|
177
|
-
def _DetectWithDetector(
|
|
191
|
+
def _DetectWithDetector(
|
|
192
|
+
detector: Detector, pdfPath: str | Path, *, runWetDetection: bool
|
|
193
|
+
) -> dict[str, Any]:
|
|
178
194
|
"""Helper that runs ``detector`` and returns the plain dictionary result."""
|
|
179
195
|
|
|
180
196
|
resolvedPath = Path(pdfPath)
|
|
181
|
-
|
|
197
|
+
result = detector.Detect(resolvedPath)
|
|
198
|
+
if runWetDetection:
|
|
199
|
+
configuration = _ResolveConfiguration(detector)
|
|
200
|
+
if configuration is not None:
|
|
201
|
+
apply_wet_detection(resolvedPath, configuration, result)
|
|
202
|
+
return _ToPlainDictionary(result)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _ResolveConfiguration(detector: Detector) -> DetectConfiguration | None:
|
|
206
|
+
configuration = getattr(detector, "Configuration", None)
|
|
207
|
+
if isinstance(configuration, DetectConfiguration):
|
|
208
|
+
return configuration
|
|
209
|
+
return None
|
|
182
210
|
|
|
183
211
|
|
|
184
212
|
@contextmanager
|
|
@@ -201,8 +229,7 @@ def CropSignatureImages(
|
|
|
201
229
|
dpi: int = 200,
|
|
202
230
|
returnBytes: Literal[False] = False,
|
|
203
231
|
saveToDisk: bool = True,
|
|
204
|
-
) -> list[Path]:
|
|
205
|
-
...
|
|
232
|
+
) -> list[Path]: ...
|
|
206
233
|
|
|
207
234
|
|
|
208
235
|
@overload
|
|
@@ -214,8 +241,7 @@ def CropSignatureImages(
|
|
|
214
241
|
dpi: int,
|
|
215
242
|
returnBytes: Literal[True],
|
|
216
243
|
saveToDisk: bool,
|
|
217
|
-
) -> list[SignatureCrop]:
|
|
218
|
-
...
|
|
244
|
+
) -> list[SignatureCrop]: ...
|
|
219
245
|
|
|
220
246
|
|
|
221
247
|
def CropSignatureImages(
|
|
@@ -227,12 +253,15 @@ def CropSignatureImages(
|
|
|
227
253
|
returnBytes: bool = False,
|
|
228
254
|
saveToDisk: bool = True,
|
|
229
255
|
) -> list[Path] | list[SignatureCrop]:
|
|
230
|
-
"""
|
|
256
|
+
"""Create DOCX files containing cropped signature images.
|
|
231
257
|
|
|
232
258
|
Accepts either a :class:`FileResult` instance or the ``dict`` returned by
|
|
233
259
|
:func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.
|
|
234
260
|
Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop. Set
|
|
235
261
|
``saveToDisk=False`` to skip writing PNG files while still returning in-memory data.
|
|
262
|
+
When ``saveToDisk`` is enabled, a one-image DOCX file is also written per crop. When
|
|
263
|
+
``returnBytes`` is True and ``python-docx`` is available, the returned
|
|
264
|
+
:class:`SignatureCrop` objects include ``docx_bytes``.
|
|
236
265
|
"""
|
|
237
266
|
|
|
238
267
|
from sigdetect.cropping import crop_signatures
|
|
@@ -275,6 +304,7 @@ def _CoerceFileResult(
|
|
|
275
304
|
RenderType=str(entry.get("render_type") or "unknown"),
|
|
276
305
|
BoundingBox=tuple(bbox) if bbox else None,
|
|
277
306
|
CropPath=entry.get("crop_path"),
|
|
307
|
+
CropBytes=entry.get("crop_bytes"),
|
|
278
308
|
)
|
|
279
309
|
)
|
|
280
310
|
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import base64
|
|
5
6
|
import json
|
|
6
7
|
from collections.abc import Iterator
|
|
7
8
|
from dataclasses import asdict, is_dataclass
|
|
@@ -48,6 +49,12 @@ def Detect(
|
|
|
48
49
|
configurationPath: Path | None = typer.Option(
|
|
49
50
|
None, "--config", "-c", help="Path to YAML config"
|
|
50
51
|
),
|
|
52
|
+
writeResults: bool | None = typer.Option(
|
|
53
|
+
None,
|
|
54
|
+
"--write-results/--no-write-results",
|
|
55
|
+
help="Write results.json (or JSON to stdout when out_dir is none)",
|
|
56
|
+
show_default=False,
|
|
57
|
+
),
|
|
51
58
|
profileOverride: str | None = typer.Option(None, "--profile", "-p", help="hipaa or retainer"),
|
|
52
59
|
recursive: bool = typer.Option(
|
|
53
60
|
True,
|
|
@@ -57,13 +64,13 @@ def Detect(
|
|
|
57
64
|
cropSignatures: bool | None = typer.Option(
|
|
58
65
|
None,
|
|
59
66
|
"--crop-signatures/--no-crop-signatures",
|
|
60
|
-
help="
|
|
67
|
+
help="Write DOCX files containing cropped signature images (requires PyMuPDF + python-docx)",
|
|
61
68
|
show_default=False,
|
|
62
69
|
),
|
|
63
70
|
cropDirectory: Path | None = typer.Option(
|
|
64
71
|
None,
|
|
65
72
|
"--crop-dir",
|
|
66
|
-
help="Directory for signature
|
|
73
|
+
help="Directory for signature DOCX crops (defaults to out_dir/signature_crops)",
|
|
67
74
|
),
|
|
68
75
|
cropDpi: int | None = typer.Option(
|
|
69
76
|
None,
|
|
@@ -73,10 +80,16 @@ def Detect(
|
|
|
73
80
|
help="Rendering DPI for signature crops",
|
|
74
81
|
show_default=False,
|
|
75
82
|
),
|
|
83
|
+
cropBytes: bool = typer.Option(
|
|
84
|
+
False,
|
|
85
|
+
"--crop-bytes/--no-crop-bytes",
|
|
86
|
+
help="Embed base64 PNG bytes for signature crops in results JSON",
|
|
87
|
+
show_default=False,
|
|
88
|
+
),
|
|
76
89
|
detectWetSignatures: bool | None = typer.Option(
|
|
77
90
|
None,
|
|
78
91
|
"--detect-wet/--no-detect-wet",
|
|
79
|
-
help="
|
|
92
|
+
help="Compatibility flag; non-e-sign PDFs always run OCR when deps are available",
|
|
80
93
|
show_default=False,
|
|
81
94
|
),
|
|
82
95
|
wetOcrDpi: int | None = typer.Option(
|
|
@@ -111,6 +124,8 @@ def Detect(
|
|
|
111
124
|
configuration = configuration.model_copy(update={"Profile": normalized_profile})
|
|
112
125
|
|
|
113
126
|
overrides: dict[str, object] = {}
|
|
127
|
+
if writeResults is not None:
|
|
128
|
+
overrides["WriteResults"] = writeResults
|
|
114
129
|
if cropSignatures is not None:
|
|
115
130
|
overrides["CropSignatures"] = cropSignatures
|
|
116
131
|
if cropDirectory is not None:
|
|
@@ -145,44 +160,52 @@ def Detect(
|
|
|
145
160
|
except StopIteration:
|
|
146
161
|
raise SystemExit(f"No PDFs found in {configuration.PdfRoot}") from None
|
|
147
162
|
|
|
148
|
-
|
|
163
|
+
write_results = configuration.WriteResults
|
|
164
|
+
results_buffer: list[FileResult] | None = (
|
|
165
|
+
[] if write_results and configuration.OutputDirectory is None else None
|
|
166
|
+
)
|
|
149
167
|
json_handle = None
|
|
150
168
|
json_path: Path | None = None
|
|
151
169
|
wrote_first = False
|
|
152
170
|
|
|
153
|
-
if configuration.OutputDirectory is not None:
|
|
171
|
+
if write_results and configuration.OutputDirectory is not None:
|
|
154
172
|
outputDirectory = configuration.OutputDirectory
|
|
155
173
|
outputDirectory.mkdir(parents=True, exist_ok=True)
|
|
156
174
|
json_path = outputDirectory / "results.json"
|
|
157
175
|
json_handle = open(json_path, "w", encoding="utf-8")
|
|
158
176
|
json_handle.write("[")
|
|
159
177
|
|
|
178
|
+
crop_bytes_enabled = bool(cropBytes)
|
|
160
179
|
crop_dir = configuration.CropOutputDirectory
|
|
180
|
+
if crop_dir is None:
|
|
181
|
+
base_dir = configuration.OutputDirectory or configuration.PdfRoot
|
|
182
|
+
crop_dir = base_dir / "signature_crops"
|
|
161
183
|
cropping_enabled = configuration.CropSignatures
|
|
162
184
|
cropping_available = True
|
|
163
185
|
cropping_attempted = False
|
|
164
|
-
if configuration.CropSignatures and crop_dir is None:
|
|
165
|
-
Logger.warning(
|
|
166
|
-
"CropSignatures enabled without an output directory",
|
|
167
|
-
extra={"pdf_root": str(configuration.PdfRoot)},
|
|
168
|
-
)
|
|
169
|
-
cropping_enabled = False
|
|
170
186
|
|
|
171
187
|
total_bboxes = 0
|
|
172
188
|
|
|
173
189
|
def _append_result(file_result: FileResult, source_pdf: Path) -> None:
|
|
174
190
|
nonlocal wrote_first, json_handle, total_bboxes, cropping_available, cropping_attempted
|
|
175
191
|
|
|
176
|
-
if
|
|
192
|
+
if cropping_available and (cropping_enabled or crop_bytes_enabled) and crop_dir is not None:
|
|
177
193
|
try:
|
|
178
|
-
crop_signatures(
|
|
194
|
+
crops = crop_signatures(
|
|
179
195
|
pdf_path=source_pdf,
|
|
180
196
|
file_result=file_result,
|
|
181
197
|
output_dir=crop_dir,
|
|
182
198
|
dpi=configuration.CropImageDpi,
|
|
183
199
|
logger=Logger,
|
|
200
|
+
return_bytes=crop_bytes_enabled,
|
|
201
|
+
save_files=cropping_enabled,
|
|
184
202
|
)
|
|
185
203
|
cropping_attempted = True
|
|
204
|
+
if crop_bytes_enabled:
|
|
205
|
+
for crop in crops:
|
|
206
|
+
crop.signature.CropBytes = base64.b64encode(crop.image_bytes).decode(
|
|
207
|
+
"ascii"
|
|
208
|
+
)
|
|
186
209
|
except SignatureCroppingUnavailable as exc:
|
|
187
210
|
cropping_available = False
|
|
188
211
|
Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
|
|
@@ -231,18 +254,24 @@ def Detect(
|
|
|
231
254
|
json_handle.write(closing)
|
|
232
255
|
json_handle.close()
|
|
233
256
|
|
|
234
|
-
if
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
257
|
+
if write_results:
|
|
258
|
+
if json_handle is not None:
|
|
259
|
+
typer.echo(f"Wrote {json_path}")
|
|
260
|
+
else:
|
|
261
|
+
payload = json.dumps(
|
|
262
|
+
results_buffer or [], indent=2, ensure_ascii=False, default=_JsonSerializer
|
|
263
|
+
)
|
|
264
|
+
typer.echo(payload)
|
|
265
|
+
typer.echo("Detection completed with output disabled (out_dir=none)")
|
|
266
|
+
|
|
267
|
+
if (
|
|
268
|
+
(cropping_enabled or crop_bytes_enabled)
|
|
269
|
+
and cropping_available
|
|
270
|
+
and cropping_attempted
|
|
271
|
+
and total_bboxes == 0
|
|
272
|
+
):
|
|
244
273
|
Logger.warning(
|
|
245
|
-
"No signature bounding boxes detected;
|
|
274
|
+
"No signature bounding boxes detected; install PyMuPDF for crop-ready output",
|
|
246
275
|
extra={"engine": configuration.Engine},
|
|
247
276
|
)
|
|
248
277
|
|
|
@@ -25,6 +25,7 @@ class DetectConfiguration(BaseModel):
|
|
|
25
25
|
|
|
26
26
|
PdfRoot: Path = Field(default=Path("hipaa_results"), alias="pdf_root")
|
|
27
27
|
OutputDirectory: Path | None = Field(default=Path("out"), alias="out_dir")
|
|
28
|
+
WriteResults: bool = Field(default=False, alias="write_results")
|
|
28
29
|
Engine: EngineName = Field(default="auto", alias="engine")
|
|
29
30
|
Profile: ProfileName = Field(default="hipaa", alias="profile")
|
|
30
31
|
PseudoSignatures: bool = Field(default=True, alias="pseudo_signatures")
|
|
@@ -63,6 +64,10 @@ class DetectConfiguration(BaseModel):
|
|
|
63
64
|
def out_dir(self) -> Path | None: # pragma: no cover - simple passthrough
|
|
64
65
|
return self.OutputDirectory
|
|
65
66
|
|
|
67
|
+
@property
|
|
68
|
+
def write_results(self) -> bool: # pragma: no cover - simple passthrough
|
|
69
|
+
return self.WriteResults
|
|
70
|
+
|
|
66
71
|
@property
|
|
67
72
|
def engine(self) -> EngineName: # pragma: no cover - simple passthrough
|
|
68
73
|
return self.Engine
|