sigdetect 0.3.1__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sigdetect-0.3.1 → sigdetect-0.5.0}/PKG-INFO +28 -25
- {sigdetect-0.3.1 → sigdetect-0.5.0}/README.md +23 -22
- {sigdetect-0.3.1 → sigdetect-0.5.0}/pyproject.toml +5 -4
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/__init__.py +1 -1
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/api.py +43 -11
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/cli.py +89 -23
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/config.py +48 -3
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/cropping.py +72 -12
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/detector/__init__.py +27 -8
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/detector/pymupdf_engine.py +3 -2
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/detector/pypdf2_engine.py +7 -5
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/detector/signature_model.py +3 -1
- sigdetect-0.5.0/src/sigdetect/wet_detection.py +549 -0
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect.egg-info/PKG-INFO +28 -25
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect.egg-info/SOURCES.txt +4 -0
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect.egg-info/requires.txt +4 -3
- {sigdetect-0.3.1 → sigdetect-0.5.0}/tests/test_api.py +36 -1
- sigdetect-0.5.0/tests/test_cli.py +275 -0
- {sigdetect-0.3.1 → sigdetect-0.5.0}/tests/test_cropping.py +12 -1
- sigdetect-0.5.0/tests/test_detector_options.py +82 -0
- sigdetect-0.5.0/tests/test_wet_detection.py +215 -0
- {sigdetect-0.3.1 → sigdetect-0.5.0}/setup.cfg +0 -0
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/data/role_rules.retainer.yml +0 -0
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/data/role_rules.yml +0 -0
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/data/vendor_patterns.yml +0 -0
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/detector/base.py +0 -0
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/detector/base_detector.py +0 -0
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/detector/file_result_model.py +0 -0
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/eda.py +0 -0
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/logging_setup.py +0 -0
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/utils.py +0 -0
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect.egg-info/dependency_links.txt +0 -0
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect.egg-info/entry_points.txt +0 -0
- {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect.egg-info/top_level.txt +0 -0
- {sigdetect-0.3.1 → sigdetect-0.5.0}/tests/test_pymupdf_engine.py +0 -0
- {sigdetect-0.3.1 → sigdetect-0.5.0}/tests/test_widget_role_patient_smoke.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sigdetect
|
|
3
|
-
Version: 0.3.1
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: Signature detection and role attribution for PDFs
|
|
5
5
|
Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
|
|
6
6
|
License: MIT
|
|
@@ -10,9 +10,11 @@ Requires-Dist: pypdf>=4.0.0
|
|
|
10
10
|
Requires-Dist: rich>=13.0
|
|
11
11
|
Requires-Dist: typer>=0.12
|
|
12
12
|
Requires-Dist: pydantic>=2.5
|
|
13
|
+
Requires-Dist: pillow>=10.0
|
|
14
|
+
Requires-Dist: python-docx>=1.1.0
|
|
15
|
+
Requires-Dist: pytesseract>=0.3.10
|
|
16
|
+
Requires-Dist: pymupdf>=1.23
|
|
13
17
|
Requires-Dist: pyyaml>=6.0
|
|
14
|
-
Provides-Extra: pymupdf
|
|
15
|
-
Requires-Dist: pymupdf>=1.23; extra == "pymupdf"
|
|
16
18
|
|
|
17
19
|
# CaseWorks.Automation.CaseDocumentIntake
|
|
18
20
|
|
|
@@ -95,14 +97,16 @@ sigdetect detect \
|
|
|
95
97
|
### Notes
|
|
96
98
|
|
|
97
99
|
- The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
|
|
98
|
-
-
|
|
100
|
+
- Engine selection is forced to **auto** (prefers PyMuPDF for geometry, falls back to PyPDF2); any configured `engine` value is overridden.
|
|
99
101
|
- `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
|
|
100
102
|
- `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
|
|
101
103
|
- `--profile` selects tuned role logic:
|
|
102
104
|
- `hipaa` → patient / representative / attorney
|
|
103
105
|
- `retainer` → client / firm (prefers detecting two signatures)
|
|
104
106
|
- `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
|
|
105
|
-
-
|
|
107
|
+
- Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
|
|
108
|
+
- Cropping (`--crop-signatures`) writes a one-image `.docx` per signature in the crop output directory (no PNG files on disk); `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` for in-memory use. PyMuPDF is required for crops, and `python-docx` is required for `.docx` output.
|
|
109
|
+
- Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
|
|
106
110
|
- If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
|
|
107
111
|
|
|
108
112
|
### EDA (quick aggregate stats)
|
|
@@ -113,6 +117,8 @@ sigdetect eda \
|
|
|
113
117
|
|
|
114
118
|
~~~
|
|
115
119
|
|
|
120
|
+
`sigdetect eda` expects `results.json`; enable `write_results: true` when running detect.
|
|
121
|
+
|
|
116
122
|
---
|
|
117
123
|
|
|
118
124
|
## Library usage
|
|
@@ -136,15 +142,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
|
136
142
|
print(result.to_dict())
|
|
137
143
|
~~~
|
|
138
144
|
|
|
139
|
-
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When
|
|
145
|
+
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When cropping is enabled, `crop_path` points at the generated `.docx`. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
|
|
140
146
|
|
|
141
147
|
---
|
|
142
148
|
|
|
143
149
|
## Library API (embed in another script)
|
|
144
150
|
|
|
145
|
-
Minimal, plug-and-play API
|
|
146
|
-
Import from `sigdetect.api` and get plain dicts out (JSON-ready),
|
|
147
|
-
with no I/O side effects by default:
|
|
151
|
+
Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping. Engine selection is forced to `auto` (PyMuPDF preferred) to ensure geometry. Wet detection runs automatically for non-e-sign PDFs; pass `runWetDetection=False` to skip OCR.
|
|
148
152
|
|
|
149
153
|
~~~python
|
|
150
154
|
from pathlib import Path
|
|
@@ -167,6 +171,7 @@ result = DetectPdf(
|
|
|
167
171
|
profileName="retainer",
|
|
168
172
|
includePseudoSignatures=True,
|
|
169
173
|
recurseXObjects=True,
|
|
174
|
+
# runWetDetection=False, # disable OCR-backed wet detection if desired
|
|
170
175
|
)
|
|
171
176
|
print(
|
|
172
177
|
result["file"],
|
|
@@ -189,26 +194,17 @@ for res in ScanDirectory(
|
|
|
189
194
|
# store in DB, print, etc.
|
|
190
195
|
pass
|
|
191
196
|
|
|
192
|
-
# 3)
|
|
197
|
+
# 3) Create DOCX crops for FileResult objects (requires PyMuPDF + python-docx)
|
|
193
198
|
detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
|
|
194
199
|
file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
195
|
-
|
|
200
|
+
CropSignatureImages(
|
|
196
201
|
"/path/to/pdfs/example.pdf",
|
|
197
202
|
file_result,
|
|
198
203
|
outputDirectory="./signature_crops",
|
|
199
204
|
dpi=200,
|
|
200
|
-
returnBytes=True, # also returns in-memory PNG bytes for each crop
|
|
201
|
-
# saveToDisk=False, # optional: skip writing PNGs to disk
|
|
202
205
|
)
|
|
203
|
-
|
|
204
|
-
first_crop = crops[0]
|
|
205
|
-
print(first_crop.path, len(first_crop.image_bytes))
|
|
206
206
|
~~~
|
|
207
207
|
|
|
208
|
-
When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
|
|
209
|
-
PNG bytes, and the originating signature metadata.
|
|
210
|
-
Pass ``saveToDisk=False`` if you only want in-memory PNG bytes (no files on disk or ``crop_path`` updates).
|
|
211
|
-
|
|
212
208
|
|
|
213
209
|
## Result schema
|
|
214
210
|
|
|
@@ -237,7 +233,7 @@ High-level summary (per file):
|
|
|
237
233
|
"hint": "AcroSig:sig_patient",
|
|
238
234
|
"render_type": "typed",
|
|
239
235
|
"bounding_box": [10.0, 10.0, 150.0, 40.0],
|
|
240
|
-
"crop_path": "signature_crops/example/sig_01_patient.
|
|
236
|
+
"crop_path": "signature_crops/example/sig_01_patient.docx"
|
|
241
237
|
},
|
|
242
238
|
{
|
|
243
239
|
"page": null,
|
|
@@ -247,7 +243,7 @@ High-level summary (per file):
|
|
|
247
243
|
"scores": { "page_label": 4, "general": 2 },
|
|
248
244
|
"evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
|
|
249
245
|
"hint": "VendorOrAcroOnly",
|
|
250
|
-
"render_type": "
|
|
246
|
+
"render_type": "typed",
|
|
251
247
|
"bounding_box": null,
|
|
252
248
|
"crop_path": null
|
|
253
249
|
}
|
|
@@ -263,7 +259,8 @@ High-level summary (per file):
|
|
|
263
259
|
- **`roles`** summarizes unique non-`unknown` roles across signatures.
|
|
264
260
|
- In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
|
|
265
261
|
- **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
|
|
266
|
-
- **`signatures[].crop_path`** is populated when
|
|
262
|
+
- **`signatures[].crop_path`** is populated when DOCX crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
|
|
263
|
+
- **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
|
|
267
264
|
|
|
268
265
|
---
|
|
269
266
|
|
|
@@ -285,13 +282,18 @@ You can keep one config YAML per dataset, e.g.:
|
|
|
285
282
|
# ./sample_data/config.yml (example)
|
|
286
283
|
pdf_root: ./pdfs
|
|
287
284
|
out_dir: ./sigdetect_out
|
|
288
|
-
engine:
|
|
285
|
+
engine: auto
|
|
286
|
+
write_results: false
|
|
289
287
|
pseudo_signatures: true
|
|
290
288
|
recurse_xobjects: true
|
|
291
289
|
profile: retainer # or: hipaa
|
|
292
|
-
crop_signatures: false # enable to write
|
|
290
|
+
crop_signatures: false # enable to write DOCX crops (requires pymupdf + python-docx)
|
|
293
291
|
# crop_output_dir: ./signature_crops
|
|
294
292
|
crop_image_dpi: 200
|
|
293
|
+
detect_wet_signatures: false # kept for compatibility; non-e-sign PDFs still trigger OCR
|
|
294
|
+
wet_ocr_dpi: 200
|
|
295
|
+
wet_ocr_languages: eng
|
|
296
|
+
wet_precision_threshold: 0.82
|
|
295
297
|
~~~
|
|
296
298
|
|
|
297
299
|
YAML files can be customized or loaded at runtime (see CLI `--config`, if available, or import and pass patterns into the engine).
|
|
@@ -306,6 +308,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
|
|
|
306
308
|
- Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
|
|
307
309
|
- Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
|
|
308
310
|
- When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
|
|
311
|
+
- **Wet detection (non-e-sign):** The CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection whenever no e-sign evidence is found. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. When an image-based signature is present on a page, label-only OCR candidates are suppressed unless a stroke is detected. Results are deduped to the top signature per role (dropping `unknown`). Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
|
|
309
312
|
|
|
310
313
|
---
|
|
311
314
|
|
|
@@ -79,14 +79,16 @@ sigdetect detect \
|
|
|
79
79
|
### Notes
|
|
80
80
|
|
|
81
81
|
- The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
|
|
82
|
-
-
|
|
82
|
+
- Engine selection is forced to **auto** (prefers PyMuPDF for geometry, falls back to PyPDF2); any configured `engine` value is overridden.
|
|
83
83
|
- `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
|
|
84
84
|
- `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
|
|
85
85
|
- `--profile` selects tuned role logic:
|
|
86
86
|
- `hipaa` → patient / representative / attorney
|
|
87
87
|
- `retainer` → client / firm (prefers detecting two signatures)
|
|
88
88
|
- `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
|
|
89
|
-
-
|
|
89
|
+
- Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
|
|
90
|
+
- Cropping (`--crop-signatures`) writes a one-image `.docx` per signature in the crop output directory (no PNG files on disk); `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` for in-memory use. PyMuPDF is required for crops, and `python-docx` is required for `.docx` output.
|
|
91
|
+
- Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
|
|
90
92
|
- If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
|
|
91
93
|
|
|
92
94
|
### EDA (quick aggregate stats)
|
|
@@ -97,6 +99,8 @@ sigdetect eda \
|
|
|
97
99
|
|
|
98
100
|
~~~
|
|
99
101
|
|
|
102
|
+
`sigdetect eda` expects `results.json`; enable `write_results: true` when running detect.
|
|
103
|
+
|
|
100
104
|
---
|
|
101
105
|
|
|
102
106
|
## Library usage
|
|
@@ -120,15 +124,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
|
120
124
|
print(result.to_dict())
|
|
121
125
|
~~~
|
|
122
126
|
|
|
123
|
-
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When
|
|
127
|
+
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When cropping is enabled, `crop_path` points at the generated `.docx`. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
|
|
124
128
|
|
|
125
129
|
---
|
|
126
130
|
|
|
127
131
|
## Library API (embed in another script)
|
|
128
132
|
|
|
129
|
-
Minimal, plug-and-play API
|
|
130
|
-
Import from `sigdetect.api` and get plain dicts out (JSON-ready),
|
|
131
|
-
with no I/O side effects by default:
|
|
133
|
+
Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping. Engine selection is forced to `auto` (PyMuPDF preferred) to ensure geometry. Wet detection runs automatically for non-e-sign PDFs; pass `runWetDetection=False` to skip OCR.
|
|
132
134
|
|
|
133
135
|
~~~python
|
|
134
136
|
from pathlib import Path
|
|
@@ -151,6 +153,7 @@ result = DetectPdf(
|
|
|
151
153
|
profileName="retainer",
|
|
152
154
|
includePseudoSignatures=True,
|
|
153
155
|
recurseXObjects=True,
|
|
156
|
+
# runWetDetection=False, # disable OCR-backed wet detection if desired
|
|
154
157
|
)
|
|
155
158
|
print(
|
|
156
159
|
result["file"],
|
|
@@ -173,26 +176,17 @@ for res in ScanDirectory(
|
|
|
173
176
|
# store in DB, print, etc.
|
|
174
177
|
pass
|
|
175
178
|
|
|
176
|
-
# 3)
|
|
179
|
+
# 3) Create DOCX crops for FileResult objects (requires PyMuPDF + python-docx)
|
|
177
180
|
detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
|
|
178
181
|
file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
179
|
-
|
|
182
|
+
CropSignatureImages(
|
|
180
183
|
"/path/to/pdfs/example.pdf",
|
|
181
184
|
file_result,
|
|
182
185
|
outputDirectory="./signature_crops",
|
|
183
186
|
dpi=200,
|
|
184
|
-
returnBytes=True, # also returns in-memory PNG bytes for each crop
|
|
185
|
-
# saveToDisk=False, # optional: skip writing PNGs to disk
|
|
186
187
|
)
|
|
187
|
-
|
|
188
|
-
first_crop = crops[0]
|
|
189
|
-
print(first_crop.path, len(first_crop.image_bytes))
|
|
190
188
|
~~~
|
|
191
189
|
|
|
192
|
-
When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
|
|
193
|
-
PNG bytes, and the originating signature metadata.
|
|
194
|
-
Pass ``saveToDisk=False`` if you only want in-memory PNG bytes (no files on disk or ``crop_path`` updates).
|
|
195
|
-
|
|
196
190
|
|
|
197
191
|
## Result schema
|
|
198
192
|
|
|
@@ -221,7 +215,7 @@ High-level summary (per file):
|
|
|
221
215
|
"hint": "AcroSig:sig_patient",
|
|
222
216
|
"render_type": "typed",
|
|
223
217
|
"bounding_box": [10.0, 10.0, 150.0, 40.0],
|
|
224
|
-
"crop_path": "signature_crops/example/sig_01_patient.
|
|
218
|
+
"crop_path": "signature_crops/example/sig_01_patient.docx"
|
|
225
219
|
},
|
|
226
220
|
{
|
|
227
221
|
"page": null,
|
|
@@ -231,7 +225,7 @@ High-level summary (per file):
|
|
|
231
225
|
"scores": { "page_label": 4, "general": 2 },
|
|
232
226
|
"evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
|
|
233
227
|
"hint": "VendorOrAcroOnly",
|
|
234
|
-
"render_type": "
|
|
228
|
+
"render_type": "typed",
|
|
235
229
|
"bounding_box": null,
|
|
236
230
|
"crop_path": null
|
|
237
231
|
}
|
|
@@ -247,7 +241,8 @@ High-level summary (per file):
|
|
|
247
241
|
- **`roles`** summarizes unique non-`unknown` roles across signatures.
|
|
248
242
|
- In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
|
|
249
243
|
- **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
|
|
250
|
-
- **`signatures[].crop_path`** is populated when
|
|
244
|
+
- **`signatures[].crop_path`** is populated when DOCX crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
|
|
245
|
+
- **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
|
|
251
246
|
|
|
252
247
|
---
|
|
253
248
|
|
|
@@ -269,13 +264,18 @@ You can keep one config YAML per dataset, e.g.:
|
|
|
269
264
|
# ./sample_data/config.yml (example)
|
|
270
265
|
pdf_root: ./pdfs
|
|
271
266
|
out_dir: ./sigdetect_out
|
|
272
|
-
engine:
|
|
267
|
+
engine: auto
|
|
268
|
+
write_results: false
|
|
273
269
|
pseudo_signatures: true
|
|
274
270
|
recurse_xobjects: true
|
|
275
271
|
profile: retainer # or: hipaa
|
|
276
|
-
crop_signatures: false # enable to write
|
|
272
|
+
crop_signatures: false # enable to write DOCX crops (requires pymupdf + python-docx)
|
|
277
273
|
# crop_output_dir: ./signature_crops
|
|
278
274
|
crop_image_dpi: 200
|
|
275
|
+
detect_wet_signatures: false # kept for compatibility; non-e-sign PDFs still trigger OCR
|
|
276
|
+
wet_ocr_dpi: 200
|
|
277
|
+
wet_ocr_languages: eng
|
|
278
|
+
wet_precision_threshold: 0.82
|
|
279
279
|
~~~
|
|
280
280
|
|
|
281
281
|
YAML files can be customized or loaded at runtime (see CLI `--config`, if available, or import and pass patterns into the engine).
|
|
@@ -290,6 +290,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
|
|
|
290
290
|
- Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
|
|
291
291
|
- Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
|
|
292
292
|
- When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
|
|
293
|
+
- **Wet detection (non-e-sign):** The CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection whenever no e-sign evidence is found. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. When an image-based signature is present on a page, label-only OCR candidates are suppressed unless a stroke is detected. Results are deduped to the top signature per role (dropping `unknown`). Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
|
|
293
294
|
|
|
294
295
|
---
|
|
295
296
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "sigdetect"
|
|
7
|
-
version = "0.3.1"
|
|
7
|
+
version = "0.5.0"
|
|
8
8
|
description = "Signature detection and role attribution for PDFs"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
authors = [{ name = "BT Asmamaw", email = "basmamaw@angeiongroup.com" }]
|
|
@@ -15,12 +15,13 @@ dependencies = [
|
|
|
15
15
|
"rich>=13.0",
|
|
16
16
|
"typer>=0.12",
|
|
17
17
|
"pydantic>=2.5",
|
|
18
|
+
"pillow>=10.0",
|
|
19
|
+
"python-docx>=1.1.0",
|
|
20
|
+
"pytesseract>=0.3.10",
|
|
21
|
+
"pymupdf>=1.23",
|
|
18
22
|
"pyyaml>=6.0",
|
|
19
23
|
]
|
|
20
24
|
|
|
21
|
-
[project.optional-dependencies]
|
|
22
|
-
pymupdf = ["pymupdf>=1.23"]
|
|
23
|
-
|
|
24
25
|
[project.scripts]
|
|
25
26
|
sigdetect = "sigdetect.cli:app"
|
|
26
27
|
|
|
@@ -9,8 +9,9 @@ from typing import Any, Generator, Iterable, Iterator, Literal, overload
|
|
|
9
9
|
from sigdetect.config import DetectConfiguration
|
|
10
10
|
from sigdetect.cropping import SignatureCrop
|
|
11
11
|
from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
|
|
12
|
+
from sigdetect.wet_detection import apply_wet_detection
|
|
12
13
|
|
|
13
|
-
EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
|
|
14
|
+
EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
|
|
14
15
|
ProfileName = Literal["hipaa", "retainer"]
|
|
15
16
|
|
|
16
17
|
|
|
@@ -18,12 +19,16 @@ def DetectPdf(
|
|
|
18
19
|
pdfPath: str | Path,
|
|
19
20
|
*,
|
|
20
21
|
profileName: ProfileName = "hipaa",
|
|
21
|
-
engineName: EngineName = "
|
|
22
|
+
engineName: EngineName = "auto",
|
|
22
23
|
includePseudoSignatures: bool = True,
|
|
23
24
|
recurseXObjects: bool = True,
|
|
25
|
+
runWetDetection: bool = True,
|
|
24
26
|
detector: Detector | None = None,
|
|
25
27
|
) -> dict[str, Any]:
|
|
26
|
-
"""Detect signature evidence and assign roles for a single PDF.
|
|
28
|
+
"""Detect signature evidence and assign roles for a single PDF.
|
|
29
|
+
|
|
30
|
+
Wet detection runs by default for non-e-sign PDFs; pass ``runWetDetection=False`` to skip OCR.
|
|
31
|
+
"""
|
|
27
32
|
|
|
28
33
|
resolvedPath = Path(pdfPath)
|
|
29
34
|
activeDetector = detector or get_detector(
|
|
@@ -36,6 +41,10 @@ def DetectPdf(
|
|
|
36
41
|
)
|
|
37
42
|
|
|
38
43
|
result = activeDetector.Detect(resolvedPath)
|
|
44
|
+
if runWetDetection:
|
|
45
|
+
configuration = _ResolveConfiguration(activeDetector)
|
|
46
|
+
if configuration is not None:
|
|
47
|
+
apply_wet_detection(resolvedPath, configuration, result)
|
|
39
48
|
return _ToPlainDictionary(result)
|
|
40
49
|
|
|
41
50
|
|
|
@@ -43,12 +52,15 @@ def get_detector(
|
|
|
43
52
|
*,
|
|
44
53
|
pdfRoot: str | Path | None = None,
|
|
45
54
|
profileName: ProfileName = "hipaa",
|
|
46
|
-
engineName: EngineName = "
|
|
55
|
+
engineName: EngineName = "auto",
|
|
47
56
|
includePseudoSignatures: bool = True,
|
|
48
57
|
recurseXObjects: bool = True,
|
|
49
58
|
outputDirectory: str | Path | None = None,
|
|
50
59
|
) -> Detector:
|
|
51
|
-
"""Return a reusable detector instance configured with the supplied options.
|
|
60
|
+
"""Return a reusable detector instance configured with the supplied options.
|
|
61
|
+
|
|
62
|
+
Engine selection is forced to ``auto`` (prefers PyMuPDF when available).
|
|
63
|
+
"""
|
|
52
64
|
|
|
53
65
|
configuration = DetectConfiguration(
|
|
54
66
|
PdfRoot=Path(pdfRoot) if pdfRoot is not None else Path.cwd(),
|
|
@@ -108,6 +120,7 @@ def _ToPlainValue(value: Any) -> Any:
|
|
|
108
120
|
def DetectMany(
|
|
109
121
|
pdfPaths: Iterable[str | Path],
|
|
110
122
|
*,
|
|
123
|
+
runWetDetection: bool = True,
|
|
111
124
|
detector: Detector | None = None,
|
|
112
125
|
**kwargs: Any,
|
|
113
126
|
) -> Iterator[dict[str, Any]]:
|
|
@@ -115,17 +128,18 @@ def DetectMany(
|
|
|
115
128
|
|
|
116
129
|
if detector is not None:
|
|
117
130
|
for pdfPath in pdfPaths:
|
|
118
|
-
yield _DetectWithDetector(detector, pdfPath)
|
|
131
|
+
yield _DetectWithDetector(detector, pdfPath, runWetDetection=runWetDetection)
|
|
119
132
|
return
|
|
120
133
|
|
|
121
134
|
for pdfPath in pdfPaths:
|
|
122
|
-
yield DetectPdf(pdfPath, **kwargs)
|
|
135
|
+
yield DetectPdf(pdfPath, runWetDetection=runWetDetection, **kwargs)
|
|
123
136
|
|
|
124
137
|
|
|
125
138
|
def ScanDirectory(
|
|
126
139
|
pdfRoot: str | Path,
|
|
127
140
|
*,
|
|
128
141
|
globPattern: str = "**/*.pdf",
|
|
142
|
+
runWetDetection: bool = True,
|
|
129
143
|
detector: Detector | None = None,
|
|
130
144
|
**kwargs: Any,
|
|
131
145
|
) -> Iterator[dict[str, Any]]:
|
|
@@ -143,7 +157,7 @@ def ScanDirectory(
|
|
|
143
157
|
|
|
144
158
|
for pdfPath in iterator:
|
|
145
159
|
if pdfPath.is_file() and pdfPath.suffix.lower() == ".pdf":
|
|
146
|
-
yield DetectPdf(pdfPath, detector=detector, **kwargs)
|
|
160
|
+
yield DetectPdf(pdfPath, detector=detector, runWetDetection=runWetDetection, **kwargs)
|
|
147
161
|
|
|
148
162
|
|
|
149
163
|
def ToCsvRow(result: dict[str, Any]) -> dict[str, Any]:
|
|
@@ -174,11 +188,25 @@ def Version() -> str:
|
|
|
174
188
|
return "0.0.0-dev"
|
|
175
189
|
|
|
176
190
|
|
|
177
|
-
def _DetectWithDetector(
|
|
191
|
+
def _DetectWithDetector(
|
|
192
|
+
detector: Detector, pdfPath: str | Path, *, runWetDetection: bool
|
|
193
|
+
) -> dict[str, Any]:
|
|
178
194
|
"""Helper that runs ``detector`` and returns the plain dictionary result."""
|
|
179
195
|
|
|
180
196
|
resolvedPath = Path(pdfPath)
|
|
181
|
-
|
|
197
|
+
result = detector.Detect(resolvedPath)
|
|
198
|
+
if runWetDetection:
|
|
199
|
+
configuration = _ResolveConfiguration(detector)
|
|
200
|
+
if configuration is not None:
|
|
201
|
+
apply_wet_detection(resolvedPath, configuration, result)
|
|
202
|
+
return _ToPlainDictionary(result)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _ResolveConfiguration(detector: Detector) -> DetectConfiguration | None:
|
|
206
|
+
configuration = getattr(detector, "Configuration", None)
|
|
207
|
+
if isinstance(configuration, DetectConfiguration):
|
|
208
|
+
return configuration
|
|
209
|
+
return None
|
|
182
210
|
|
|
183
211
|
|
|
184
212
|
@contextmanager
|
|
@@ -225,12 +253,15 @@ def CropSignatureImages(
|
|
|
225
253
|
returnBytes: bool = False,
|
|
226
254
|
saveToDisk: bool = True,
|
|
227
255
|
) -> list[Path] | list[SignatureCrop]:
|
|
228
|
-
"""
|
|
256
|
+
"""Create DOCX files containing cropped signature images.
|
|
229
257
|
|
|
230
258
|
Accepts either a :class:`FileResult` instance or the ``dict`` returned by
|
|
231
259
|
:func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.
|
|
232
260
|
Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop. Set
|
|
233
261
|
``saveToDisk=False`` to skip writing PNG files while still returning in-memory data.
|
|
262
|
+
When ``saveToDisk`` is enabled, a one-image DOCX file is also written per crop. When
|
|
263
|
+
``returnBytes`` is True and ``python-docx`` is available, the returned
|
|
264
|
+
:class:`SignatureCrop` objects include ``docx_bytes``.
|
|
234
265
|
"""
|
|
235
266
|
|
|
236
267
|
from sigdetect.cropping import crop_signatures
|
|
@@ -273,6 +304,7 @@ def _CoerceFileResult(
|
|
|
273
304
|
RenderType=str(entry.get("render_type") or "unknown"),
|
|
274
305
|
BoundingBox=tuple(bbox) if bbox else None,
|
|
275
306
|
CropPath=entry.get("crop_path"),
|
|
307
|
+
CropBytes=entry.get("crop_bytes"),
|
|
276
308
|
)
|
|
277
309
|
)
|
|
278
310
|
|