sigdetect 0.4.0__tar.gz → 0.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sigdetect-0.4.0/src/sigdetect.egg-info → sigdetect-0.5.1}/PKG-INFO +25 -12
- sigdetect-0.4.0/PKG-INFO → sigdetect-0.5.1/README.md +20 -25
- {sigdetect-0.4.0 → sigdetect-0.5.1}/pyproject.toml +5 -4
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/api.py +48 -12
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/cli.py +70 -28
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/config.py +17 -0
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/cropping.py +78 -15
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/detector/__init__.py +10 -8
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/detector/pymupdf_engine.py +2 -2
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/detector/signature_model.py +6 -0
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/wet_detection.py +63 -13
- sigdetect-0.4.0/README.md → sigdetect-0.5.1/src/sigdetect.egg-info/PKG-INFO +38 -9
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect.egg-info/requires.txt +4 -3
- {sigdetect-0.4.0 → sigdetect-0.5.1}/tests/test_api.py +36 -1
- {sigdetect-0.4.0 → sigdetect-0.5.1}/tests/test_cli.py +131 -2
- {sigdetect-0.4.0 → sigdetect-0.5.1}/tests/test_cropping.py +88 -1
- {sigdetect-0.4.0 → sigdetect-0.5.1}/tests/test_detector_options.py +4 -4
- sigdetect-0.5.1/tests/test_wet_detection.py +215 -0
- sigdetect-0.4.0/tests/test_wet_detection.py +0 -111
- {sigdetect-0.4.0 → sigdetect-0.5.1}/setup.cfg +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/__init__.py +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/data/role_rules.retainer.yml +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/data/role_rules.yml +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/data/vendor_patterns.yml +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/detector/base.py +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/detector/base_detector.py +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/detector/file_result_model.py +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/detector/pypdf2_engine.py +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/eda.py +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/logging_setup.py +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/utils.py +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect.egg-info/SOURCES.txt +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect.egg-info/dependency_links.txt +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect.egg-info/entry_points.txt +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect.egg-info/top_level.txt +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.1}/tests/test_pymupdf_engine.py +0 -0
- {sigdetect-0.4.0 → sigdetect-0.5.1}/tests/test_widget_role_patient_smoke.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sigdetect
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.5.1
|
|
4
4
|
Summary: Signature detection and role attribution for PDFs
|
|
5
5
|
Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
|
|
6
6
|
License: MIT
|
|
@@ -10,9 +10,11 @@ Requires-Dist: pypdf>=4.0.0
|
|
|
10
10
|
Requires-Dist: rich>=13.0
|
|
11
11
|
Requires-Dist: typer>=0.12
|
|
12
12
|
Requires-Dist: pydantic>=2.5
|
|
13
|
+
Requires-Dist: pillow>=10.0
|
|
14
|
+
Requires-Dist: python-docx>=1.1.0
|
|
15
|
+
Requires-Dist: pytesseract>=0.3.10
|
|
16
|
+
Requires-Dist: pymupdf>=1.23
|
|
13
17
|
Requires-Dist: pyyaml>=6.0
|
|
14
|
-
Provides-Extra: pymupdf
|
|
15
|
-
Requires-Dist: pymupdf>=1.23; extra == "pymupdf"
|
|
16
18
|
|
|
17
19
|
# CaseWorks.Automation.CaseDocumentIntake
|
|
18
20
|
|
|
@@ -95,14 +97,16 @@ sigdetect detect \
|
|
|
95
97
|
### Notes
|
|
96
98
|
|
|
97
99
|
- The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
|
|
98
|
-
-
|
|
100
|
+
- Engine selection is forced to **auto** (prefers PyMuPDF for geometry, falls back to PyPDF2); any configured `engine` value is overridden.
|
|
99
101
|
- `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
|
|
100
102
|
- `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
|
|
101
103
|
- `--profile` selects tuned role logic:
|
|
102
104
|
- `hipaa` → patient / representative / attorney
|
|
103
105
|
- `retainer` → client / firm (prefers detecting two signatures)
|
|
104
106
|
- `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
|
|
105
|
-
-
|
|
107
|
+
- Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
|
|
108
|
+
- Cropping (`--crop-signatures`) writes PNG crops to disk by default; enable `--crop-docx` to write DOCX files instead of PNGs. `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` and, when `--crop-docx` is enabled, embeds DOCX bytes in `signatures[].crop_docx_bytes`. PyMuPDF is required for crops, and `python-docx` is required for DOCX output.
|
|
109
|
+
- Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
|
|
106
110
|
- If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
|
|
107
111
|
|
|
108
112
|
### EDA (quick aggregate stats)
|
|
@@ -113,6 +117,8 @@ sigdetect eda \
|
|
|
113
117
|
|
|
114
118
|
~~~
|
|
115
119
|
|
|
120
|
+
`sigdetect eda` expects `results.json`; enable `write_results: true` when running detect.
|
|
121
|
+
|
|
116
122
|
---
|
|
117
123
|
|
|
118
124
|
## Library usage
|
|
@@ -136,13 +142,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
|
136
142
|
print(result.to_dict())
|
|
137
143
|
~~~
|
|
138
144
|
|
|
139
|
-
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
|
|
145
|
+
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image; when DOCX cropping is enabled, `crop_docx_path` points at the generated doc. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
|
|
140
146
|
|
|
141
147
|
---
|
|
142
148
|
|
|
143
149
|
## Library API (embed in another script)
|
|
144
150
|
|
|
145
|
-
Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping
|
|
151
|
+
Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping. Engine selection is forced to `auto` (PyMuPDF preferred) to ensure geometry. Wet detection runs automatically for non-e-sign PDFs; pass `runWetDetection=False` to skip OCR.
|
|
146
152
|
|
|
147
153
|
~~~python
|
|
148
154
|
from pathlib import Path
|
|
@@ -165,6 +171,7 @@ result = DetectPdf(
|
|
|
165
171
|
profileName="retainer",
|
|
166
172
|
includePseudoSignatures=True,
|
|
167
173
|
recurseXObjects=True,
|
|
174
|
+
# runWetDetection=False, # disable OCR-backed wet detection if desired
|
|
168
175
|
)
|
|
169
176
|
print(
|
|
170
177
|
result["file"],
|
|
@@ -187,7 +194,7 @@ for res in ScanDirectory(
|
|
|
187
194
|
# store in DB, print, etc.
|
|
188
195
|
pass
|
|
189
196
|
|
|
190
|
-
# 3) Crop
|
|
197
|
+
# 3) Crop signature snippets for FileResult objects (requires PyMuPDF; DOCX needs python-docx)
|
|
191
198
|
detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
|
|
192
199
|
file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
193
200
|
CropSignatureImages(
|
|
@@ -226,7 +233,8 @@ High-level summary (per file):
|
|
|
226
233
|
"hint": "AcroSig:sig_patient",
|
|
227
234
|
"render_type": "typed",
|
|
228
235
|
"bounding_box": [10.0, 10.0, 150.0, 40.0],
|
|
229
|
-
"crop_path": "signature_crops/example/sig_01_patient.png"
|
|
236
|
+
"crop_path": "signature_crops/example/sig_01_patient.png",
|
|
237
|
+
"crop_docx_path": null
|
|
230
238
|
},
|
|
231
239
|
{
|
|
232
240
|
"page": null,
|
|
@@ -253,6 +261,9 @@ High-level summary (per file):
|
|
|
253
261
|
- In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
|
|
254
262
|
- **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
|
|
255
263
|
- **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
|
|
264
|
+
- **`signatures[].crop_docx_path`** is populated when DOCX crops are generated (`--crop-docx` or `docx=True`).
|
|
265
|
+
- **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
|
|
266
|
+
- **`signatures[].crop_docx_bytes`** contains base64 DOCX data when `--crop-docx` and `--crop-bytes` are enabled together.
|
|
256
267
|
|
|
257
268
|
---
|
|
258
269
|
|
|
@@ -274,14 +285,16 @@ You can keep one config YAML per dataset, e.g.:
|
|
|
274
285
|
# ./sample_data/config.yml (example)
|
|
275
286
|
pdf_root: ./pdfs
|
|
276
287
|
out_dir: ./sigdetect_out
|
|
277
|
-
engine:
|
|
288
|
+
engine: auto
|
|
289
|
+
write_results: false
|
|
278
290
|
pseudo_signatures: true
|
|
279
291
|
recurse_xobjects: true
|
|
280
292
|
profile: retainer # or: hipaa
|
|
281
293
|
crop_signatures: false # enable to write PNG crops (requires pymupdf)
|
|
294
|
+
crop_docx: false # enable to write DOCX crops instead of PNGs (requires python-docx)
|
|
282
295
|
# crop_output_dir: ./signature_crops
|
|
283
296
|
crop_image_dpi: 200
|
|
284
|
-
detect_wet_signatures: false #
|
|
297
|
+
detect_wet_signatures: false # kept for compatibility; non-e-sign PDFs still trigger OCR
|
|
285
298
|
wet_ocr_dpi: 200
|
|
286
299
|
wet_ocr_languages: eng
|
|
287
300
|
wet_precision_threshold: 0.82
|
|
@@ -299,7 +312,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
|
|
|
299
312
|
- Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
|
|
300
313
|
- Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
|
|
301
314
|
- When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
|
|
302
|
-
- **Wet detection (
|
|
315
|
+
- **Wet detection (non-e-sign):** The CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection whenever no e-sign evidence is found. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. When an image-based signature is present on a page, label-only OCR candidates are suppressed unless a stroke is detected. Results are deduped to the top signature per role (dropping `unknown`). Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
|
|
303
316
|
|
|
304
317
|
---
|
|
305
318
|
|
|
@@ -1,19 +1,3 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: sigdetect
|
|
3
|
-
Version: 0.4.0
|
|
4
|
-
Summary: Signature detection and role attribution for PDFs
|
|
5
|
-
Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
|
|
6
|
-
License: MIT
|
|
7
|
-
Requires-Python: >=3.9
|
|
8
|
-
Description-Content-Type: text/markdown
|
|
9
|
-
Requires-Dist: pypdf>=4.0.0
|
|
10
|
-
Requires-Dist: rich>=13.0
|
|
11
|
-
Requires-Dist: typer>=0.12
|
|
12
|
-
Requires-Dist: pydantic>=2.5
|
|
13
|
-
Requires-Dist: pyyaml>=6.0
|
|
14
|
-
Provides-Extra: pymupdf
|
|
15
|
-
Requires-Dist: pymupdf>=1.23; extra == "pymupdf"
|
|
16
|
-
|
|
17
1
|
# CaseWorks.Automation.CaseDocumentIntake
|
|
18
2
|
|
|
19
3
|
## sigdetect
|
|
@@ -95,14 +79,16 @@ sigdetect detect \
|
|
|
95
79
|
### Notes
|
|
96
80
|
|
|
97
81
|
- The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
|
|
98
|
-
-
|
|
82
|
+
- Engine selection is forced to **auto** (prefers PyMuPDF for geometry, falls back to PyPDF2); any configured `engine` value is overridden.
|
|
99
83
|
- `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
|
|
100
84
|
- `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
|
|
101
85
|
- `--profile` selects tuned role logic:
|
|
102
86
|
- `hipaa` → patient / representative / attorney
|
|
103
87
|
- `retainer` → client / firm (prefers detecting two signatures)
|
|
104
88
|
- `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
|
|
105
|
-
-
|
|
89
|
+
- Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
|
|
90
|
+
- Cropping (`--crop-signatures`) writes PNG crops to disk by default; enable `--crop-docx` to write DOCX files instead of PNGs. `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` and, when `--crop-docx` is enabled, embeds DOCX bytes in `signatures[].crop_docx_bytes`. PyMuPDF is required for crops, and `python-docx` is required for DOCX output.
|
|
91
|
+
- Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
|
|
106
92
|
- If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
|
|
107
93
|
|
|
108
94
|
### EDA (quick aggregate stats)
|
|
@@ -113,6 +99,8 @@ sigdetect eda \
|
|
|
113
99
|
|
|
114
100
|
~~~
|
|
115
101
|
|
|
102
|
+
`sigdetect eda` expects `results.json`; enable `write_results: true` when running detect.
|
|
103
|
+
|
|
116
104
|
---
|
|
117
105
|
|
|
118
106
|
## Library usage
|
|
@@ -136,13 +124,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
|
136
124
|
print(result.to_dict())
|
|
137
125
|
~~~
|
|
138
126
|
|
|
139
|
-
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
|
|
127
|
+
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image; when DOCX cropping is enabled, `crop_docx_path` points at the generated doc. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
|
|
140
128
|
|
|
141
129
|
---
|
|
142
130
|
|
|
143
131
|
## Library API (embed in another script)
|
|
144
132
|
|
|
145
|
-
Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping
|
|
133
|
+
Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping. Engine selection is forced to `auto` (PyMuPDF preferred) to ensure geometry. Wet detection runs automatically for non-e-sign PDFs; pass `runWetDetection=False` to skip OCR.
|
|
146
134
|
|
|
147
135
|
~~~python
|
|
148
136
|
from pathlib import Path
|
|
@@ -165,6 +153,7 @@ result = DetectPdf(
|
|
|
165
153
|
profileName="retainer",
|
|
166
154
|
includePseudoSignatures=True,
|
|
167
155
|
recurseXObjects=True,
|
|
156
|
+
# runWetDetection=False, # disable OCR-backed wet detection if desired
|
|
168
157
|
)
|
|
169
158
|
print(
|
|
170
159
|
result["file"],
|
|
@@ -187,7 +176,7 @@ for res in ScanDirectory(
|
|
|
187
176
|
# store in DB, print, etc.
|
|
188
177
|
pass
|
|
189
178
|
|
|
190
|
-
# 3) Crop
|
|
179
|
+
# 3) Crop signature snippets for FileResult objects (requires PyMuPDF; DOCX needs python-docx)
|
|
191
180
|
detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
|
|
192
181
|
file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
193
182
|
CropSignatureImages(
|
|
@@ -226,7 +215,8 @@ High-level summary (per file):
|
|
|
226
215
|
"hint": "AcroSig:sig_patient",
|
|
227
216
|
"render_type": "typed",
|
|
228
217
|
"bounding_box": [10.0, 10.0, 150.0, 40.0],
|
|
229
|
-
"crop_path": "signature_crops/example/sig_01_patient.png"
|
|
218
|
+
"crop_path": "signature_crops/example/sig_01_patient.png",
|
|
219
|
+
"crop_docx_path": null
|
|
230
220
|
},
|
|
231
221
|
{
|
|
232
222
|
"page": null,
|
|
@@ -253,6 +243,9 @@ High-level summary (per file):
|
|
|
253
243
|
- In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
|
|
254
244
|
- **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
|
|
255
245
|
- **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
|
|
246
|
+
- **`signatures[].crop_docx_path`** is populated when DOCX crops are generated (`--crop-docx` or `docx=True`).
|
|
247
|
+
- **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
|
|
248
|
+
- **`signatures[].crop_docx_bytes`** contains base64 DOCX data when `--crop-docx` and `--crop-bytes` are enabled together.
|
|
256
249
|
|
|
257
250
|
---
|
|
258
251
|
|
|
@@ -274,14 +267,16 @@ You can keep one config YAML per dataset, e.g.:
|
|
|
274
267
|
# ./sample_data/config.yml (example)
|
|
275
268
|
pdf_root: ./pdfs
|
|
276
269
|
out_dir: ./sigdetect_out
|
|
277
|
-
engine:
|
|
270
|
+
engine: auto
|
|
271
|
+
write_results: false
|
|
278
272
|
pseudo_signatures: true
|
|
279
273
|
recurse_xobjects: true
|
|
280
274
|
profile: retainer # or: hipaa
|
|
281
275
|
crop_signatures: false # enable to write PNG crops (requires pymupdf)
|
|
276
|
+
crop_docx: false # enable to write DOCX crops instead of PNGs (requires python-docx)
|
|
282
277
|
# crop_output_dir: ./signature_crops
|
|
283
278
|
crop_image_dpi: 200
|
|
284
|
-
detect_wet_signatures: false #
|
|
279
|
+
detect_wet_signatures: false # kept for compatibility; non-e-sign PDFs still trigger OCR
|
|
285
280
|
wet_ocr_dpi: 200
|
|
286
281
|
wet_ocr_languages: eng
|
|
287
282
|
wet_precision_threshold: 0.82
|
|
@@ -299,7 +294,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
|
|
|
299
294
|
- Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
|
|
300
295
|
- Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
|
|
301
296
|
- When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
|
|
302
|
-
- **Wet detection (
|
|
297
|
+
- **Wet detection (non-e-sign):** The CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection whenever no e-sign evidence is found. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. When an image-based signature is present on a page, label-only OCR candidates are suppressed unless a stroke is detected. Results are deduped to the top signature per role (dropping `unknown`). Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
|
|
303
298
|
|
|
304
299
|
---
|
|
305
300
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "sigdetect"
|
|
7
|
-
version = "0.4.0"
|
|
7
|
+
version = "0.5.1"
|
|
8
8
|
description = "Signature detection and role attribution for PDFs"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
authors = [{ name = "BT Asmamaw", email = "basmamaw@angeiongroup.com" }]
|
|
@@ -15,12 +15,13 @@ dependencies = [
|
|
|
15
15
|
"rich>=13.0",
|
|
16
16
|
"typer>=0.12",
|
|
17
17
|
"pydantic>=2.5",
|
|
18
|
+
"pillow>=10.0",
|
|
19
|
+
"python-docx>=1.1.0",
|
|
20
|
+
"pytesseract>=0.3.10",
|
|
21
|
+
"pymupdf>=1.23",
|
|
18
22
|
"pyyaml>=6.0",
|
|
19
23
|
]
|
|
20
24
|
|
|
21
|
-
[project.optional-dependencies]
|
|
22
|
-
pymupdf = ["pymupdf>=1.23"]
|
|
23
|
-
|
|
24
25
|
[project.scripts]
|
|
25
26
|
sigdetect = "sigdetect.cli:app"
|
|
26
27
|
|
|
@@ -9,6 +9,7 @@ from typing import Any, Generator, Iterable, Iterator, Literal, overload
|
|
|
9
9
|
from sigdetect.config import DetectConfiguration
|
|
10
10
|
from sigdetect.cropping import SignatureCrop
|
|
11
11
|
from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
|
|
12
|
+
from sigdetect.wet_detection import apply_wet_detection
|
|
12
13
|
|
|
13
14
|
EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
|
|
14
15
|
ProfileName = Literal["hipaa", "retainer"]
|
|
@@ -21,9 +22,13 @@ def DetectPdf(
|
|
|
21
22
|
engineName: EngineName = "auto",
|
|
22
23
|
includePseudoSignatures: bool = True,
|
|
23
24
|
recurseXObjects: bool = True,
|
|
25
|
+
runWetDetection: bool = True,
|
|
24
26
|
detector: Detector | None = None,
|
|
25
27
|
) -> dict[str, Any]:
|
|
26
|
-
"""Detect signature evidence and assign roles for a single PDF.
|
|
28
|
+
"""Detect signature evidence and assign roles for a single PDF.
|
|
29
|
+
|
|
30
|
+
Wet detection runs by default for non-e-sign PDFs; pass ``runWetDetection=False`` to skip OCR.
|
|
31
|
+
"""
|
|
27
32
|
|
|
28
33
|
resolvedPath = Path(pdfPath)
|
|
29
34
|
activeDetector = detector or get_detector(
|
|
@@ -36,6 +41,10 @@ def DetectPdf(
|
|
|
36
41
|
)
|
|
37
42
|
|
|
38
43
|
result = activeDetector.Detect(resolvedPath)
|
|
44
|
+
if runWetDetection:
|
|
45
|
+
configuration = _ResolveConfiguration(activeDetector)
|
|
46
|
+
if configuration is not None:
|
|
47
|
+
apply_wet_detection(resolvedPath, configuration, result)
|
|
39
48
|
return _ToPlainDictionary(result)
|
|
40
49
|
|
|
41
50
|
|
|
@@ -48,7 +57,10 @@ def get_detector(
|
|
|
48
57
|
recurseXObjects: bool = True,
|
|
49
58
|
outputDirectory: str | Path | None = None,
|
|
50
59
|
) -> Detector:
|
|
51
|
-
"""Return a reusable detector instance configured with the supplied options.
|
|
60
|
+
"""Return a reusable detector instance configured with the supplied options.
|
|
61
|
+
|
|
62
|
+
Engine selection is forced to ``auto`` (prefers PyMuPDF when available).
|
|
63
|
+
"""
|
|
52
64
|
|
|
53
65
|
configuration = DetectConfiguration(
|
|
54
66
|
PdfRoot=Path(pdfRoot) if pdfRoot is not None else Path.cwd(),
|
|
@@ -108,6 +120,7 @@ def _ToPlainValue(value: Any) -> Any:
|
|
|
108
120
|
def DetectMany(
|
|
109
121
|
pdfPaths: Iterable[str | Path],
|
|
110
122
|
*,
|
|
123
|
+
runWetDetection: bool = True,
|
|
111
124
|
detector: Detector | None = None,
|
|
112
125
|
**kwargs: Any,
|
|
113
126
|
) -> Iterator[dict[str, Any]]:
|
|
@@ -115,17 +128,18 @@ def DetectMany(
|
|
|
115
128
|
|
|
116
129
|
if detector is not None:
|
|
117
130
|
for pdfPath in pdfPaths:
|
|
118
|
-
yield _DetectWithDetector(detector, pdfPath)
|
|
131
|
+
yield _DetectWithDetector(detector, pdfPath, runWetDetection=runWetDetection)
|
|
119
132
|
return
|
|
120
133
|
|
|
121
134
|
for pdfPath in pdfPaths:
|
|
122
|
-
yield DetectPdf(pdfPath, **kwargs)
|
|
135
|
+
yield DetectPdf(pdfPath, runWetDetection=runWetDetection, **kwargs)
|
|
123
136
|
|
|
124
137
|
|
|
125
138
|
def ScanDirectory(
|
|
126
139
|
pdfRoot: str | Path,
|
|
127
140
|
*,
|
|
128
141
|
globPattern: str = "**/*.pdf",
|
|
142
|
+
runWetDetection: bool = True,
|
|
129
143
|
detector: Detector | None = None,
|
|
130
144
|
**kwargs: Any,
|
|
131
145
|
) -> Iterator[dict[str, Any]]:
|
|
@@ -143,7 +157,7 @@ def ScanDirectory(
|
|
|
143
157
|
|
|
144
158
|
for pdfPath in iterator:
|
|
145
159
|
if pdfPath.is_file() and pdfPath.suffix.lower() == ".pdf":
|
|
146
|
-
yield DetectPdf(pdfPath, detector=detector, **kwargs)
|
|
160
|
+
yield DetectPdf(pdfPath, detector=detector, runWetDetection=runWetDetection, **kwargs)
|
|
147
161
|
|
|
148
162
|
|
|
149
163
|
def ToCsvRow(result: dict[str, Any]) -> dict[str, Any]:
|
|
@@ -174,11 +188,25 @@ def Version() -> str:
|
|
|
174
188
|
return "0.0.0-dev"
|
|
175
189
|
|
|
176
190
|
|
|
177
|
-
def _DetectWithDetector(
|
|
191
|
+
def _DetectWithDetector(
|
|
192
|
+
detector: Detector, pdfPath: str | Path, *, runWetDetection: bool
|
|
193
|
+
) -> dict[str, Any]:
|
|
178
194
|
"""Helper that runs ``detector`` and returns the plain dictionary result."""
|
|
179
195
|
|
|
180
196
|
resolvedPath = Path(pdfPath)
|
|
181
|
-
|
|
197
|
+
result = detector.Detect(resolvedPath)
|
|
198
|
+
if runWetDetection:
|
|
199
|
+
configuration = _ResolveConfiguration(detector)
|
|
200
|
+
if configuration is not None:
|
|
201
|
+
apply_wet_detection(resolvedPath, configuration, result)
|
|
202
|
+
return _ToPlainDictionary(result)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _ResolveConfiguration(detector: Detector) -> DetectConfiguration | None:
|
|
206
|
+
configuration = getattr(detector, "Configuration", None)
|
|
207
|
+
if isinstance(configuration, DetectConfiguration):
|
|
208
|
+
return configuration
|
|
209
|
+
return None
|
|
182
210
|
|
|
183
211
|
|
|
184
212
|
@contextmanager
|
|
@@ -201,8 +229,8 @@ def CropSignatureImages(
|
|
|
201
229
|
dpi: int = 200,
|
|
202
230
|
returnBytes: Literal[False] = False,
|
|
203
231
|
saveToDisk: bool = True,
|
|
204
|
-
|
|
205
|
-
|
|
232
|
+
docx: bool = False,
|
|
233
|
+
) -> list[Path]: ...
|
|
206
234
|
|
|
207
235
|
|
|
208
236
|
@overload
|
|
@@ -214,8 +242,8 @@ def CropSignatureImages(
|
|
|
214
242
|
dpi: int,
|
|
215
243
|
returnBytes: Literal[True],
|
|
216
244
|
saveToDisk: bool,
|
|
217
|
-
|
|
218
|
-
|
|
245
|
+
docx: bool = False,
|
|
246
|
+
) -> list[SignatureCrop]: ...
|
|
219
247
|
|
|
220
248
|
|
|
221
249
|
def CropSignatureImages(
|
|
@@ -226,13 +254,17 @@ def CropSignatureImages(
|
|
|
226
254
|
dpi: int = 200,
|
|
227
255
|
returnBytes: bool = False,
|
|
228
256
|
saveToDisk: bool = True,
|
|
257
|
+
docx: bool = False,
|
|
229
258
|
) -> list[Path] | list[SignatureCrop]:
|
|
230
|
-
"""
|
|
259
|
+
"""Create PNG files containing cropped signature images (or DOCX when enabled).
|
|
231
260
|
|
|
232
261
|
Accepts either a :class:`FileResult` instance or the ``dict`` returned by
|
|
233
262
|
:func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.
|
|
234
263
|
Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop. Set
|
|
235
264
|
``saveToDisk=False`` to skip writing PNG files while still returning in-memory data.
|
|
265
|
+
When ``docx`` is True, DOCX files are written instead of PNG files. When ``returnBytes`` is
|
|
266
|
+
True and ``docx`` is enabled, the returned :class:`SignatureCrop` objects include
|
|
267
|
+
``docx_bytes``.
|
|
236
268
|
"""
|
|
237
269
|
|
|
238
270
|
from sigdetect.cropping import crop_signatures
|
|
@@ -245,6 +277,7 @@ def CropSignatureImages(
|
|
|
245
277
|
dpi=dpi,
|
|
246
278
|
return_bytes=returnBytes,
|
|
247
279
|
save_files=saveToDisk,
|
|
280
|
+
docx=docx,
|
|
248
281
|
)
|
|
249
282
|
if original_dict is not None:
|
|
250
283
|
original_dict.clear()
|
|
@@ -275,6 +308,9 @@ def _CoerceFileResult(
|
|
|
275
308
|
RenderType=str(entry.get("render_type") or "unknown"),
|
|
276
309
|
BoundingBox=tuple(bbox) if bbox else None,
|
|
277
310
|
CropPath=entry.get("crop_path"),
|
|
311
|
+
CropBytes=entry.get("crop_bytes"),
|
|
312
|
+
CropDocxPath=entry.get("crop_docx_path"),
|
|
313
|
+
CropDocxBytes=entry.get("crop_docx_bytes"),
|
|
278
314
|
)
|
|
279
315
|
)
|
|
280
316
|
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import base64
|
|
5
6
|
import json
|
|
6
7
|
from collections.abc import Iterator
|
|
7
8
|
from dataclasses import asdict, is_dataclass
|
|
@@ -48,6 +49,12 @@ def Detect(
|
|
|
48
49
|
configurationPath: Path | None = typer.Option(
|
|
49
50
|
None, "--config", "-c", help="Path to YAML config"
|
|
50
51
|
),
|
|
52
|
+
writeResults: bool | None = typer.Option(
|
|
53
|
+
None,
|
|
54
|
+
"--write-results/--no-write-results",
|
|
55
|
+
help="Write results.json (or JSON to stdout when out_dir is none)",
|
|
56
|
+
show_default=False,
|
|
57
|
+
),
|
|
51
58
|
profileOverride: str | None = typer.Option(None, "--profile", "-p", help="hipaa or retainer"),
|
|
52
59
|
recursive: bool = typer.Option(
|
|
53
60
|
True,
|
|
@@ -57,13 +64,19 @@ def Detect(
|
|
|
57
64
|
cropSignatures: bool | None = typer.Option(
|
|
58
65
|
None,
|
|
59
66
|
"--crop-signatures/--no-crop-signatures",
|
|
60
|
-
help="
|
|
67
|
+
help="Write PNG crops for signature widgets (requires PyMuPDF)",
|
|
68
|
+
show_default=False,
|
|
69
|
+
),
|
|
70
|
+
cropDocx: bool | None = typer.Option(
|
|
71
|
+
None,
|
|
72
|
+
"--crop-docx/--no-crop-docx",
|
|
73
|
+
help="Write DOCX crops instead of PNG files (requires PyMuPDF + python-docx)",
|
|
61
74
|
show_default=False,
|
|
62
75
|
),
|
|
63
76
|
cropDirectory: Path | None = typer.Option(
|
|
64
77
|
None,
|
|
65
78
|
"--crop-dir",
|
|
66
|
-
help="Directory for signature
|
|
79
|
+
help="Directory for signature crops (defaults to out_dir/signature_crops)",
|
|
67
80
|
),
|
|
68
81
|
cropDpi: int | None = typer.Option(
|
|
69
82
|
None,
|
|
@@ -73,10 +86,16 @@ def Detect(
|
|
|
73
86
|
help="Rendering DPI for signature crops",
|
|
74
87
|
show_default=False,
|
|
75
88
|
),
|
|
89
|
+
cropBytes: bool = typer.Option(
|
|
90
|
+
False,
|
|
91
|
+
"--crop-bytes/--no-crop-bytes",
|
|
92
|
+
help="Embed base64 PNG bytes (and DOCX bytes when --crop-docx) in results JSON",
|
|
93
|
+
show_default=False,
|
|
94
|
+
),
|
|
76
95
|
detectWetSignatures: bool | None = typer.Option(
|
|
77
96
|
None,
|
|
78
97
|
"--detect-wet/--no-detect-wet",
|
|
79
|
-
help="
|
|
98
|
+
help="Compatibility flag; non-e-sign PDFs always run OCR when deps are available",
|
|
80
99
|
show_default=False,
|
|
81
100
|
),
|
|
82
101
|
wetOcrDpi: int | None = typer.Option(
|
|
@@ -111,8 +130,12 @@ def Detect(
|
|
|
111
130
|
configuration = configuration.model_copy(update={"Profile": normalized_profile})
|
|
112
131
|
|
|
113
132
|
overrides: dict[str, object] = {}
|
|
133
|
+
if writeResults is not None:
|
|
134
|
+
overrides["WriteResults"] = writeResults
|
|
114
135
|
if cropSignatures is not None:
|
|
115
136
|
overrides["CropSignatures"] = cropSignatures
|
|
137
|
+
if cropDocx is not None:
|
|
138
|
+
overrides["CropDocx"] = cropDocx
|
|
116
139
|
if cropDirectory is not None:
|
|
117
140
|
overrides["CropOutputDirectory"] = cropDirectory
|
|
118
141
|
if cropDpi is not None:
|
|
@@ -145,53 +168,66 @@ def Detect(
|
|
|
145
168
|
except StopIteration:
|
|
146
169
|
raise SystemExit(f"No PDFs found in {configuration.PdfRoot}") from None
|
|
147
170
|
|
|
148
|
-
|
|
171
|
+
write_results = configuration.WriteResults
|
|
172
|
+
results_buffer: list[FileResult] | None = (
|
|
173
|
+
[] if write_results and configuration.OutputDirectory is None else None
|
|
174
|
+
)
|
|
149
175
|
json_handle = None
|
|
150
176
|
json_path: Path | None = None
|
|
151
177
|
wrote_first = False
|
|
152
178
|
|
|
153
|
-
if configuration.OutputDirectory is not None:
|
|
179
|
+
if write_results and configuration.OutputDirectory is not None:
|
|
154
180
|
outputDirectory = configuration.OutputDirectory
|
|
155
181
|
outputDirectory.mkdir(parents=True, exist_ok=True)
|
|
156
182
|
json_path = outputDirectory / "results.json"
|
|
157
183
|
json_handle = open(json_path, "w", encoding="utf-8")
|
|
158
184
|
json_handle.write("[")
|
|
159
185
|
|
|
186
|
+
crop_bytes_enabled = bool(cropBytes)
|
|
160
187
|
crop_dir = configuration.CropOutputDirectory
|
|
188
|
+
if crop_dir is None:
|
|
189
|
+
base_dir = configuration.OutputDirectory or configuration.PdfRoot
|
|
190
|
+
crop_dir = base_dir / "signature_crops"
|
|
161
191
|
cropping_enabled = configuration.CropSignatures
|
|
192
|
+
docx_enabled = configuration.CropDocx
|
|
162
193
|
cropping_available = True
|
|
163
194
|
cropping_attempted = False
|
|
164
|
-
if configuration.CropSignatures and crop_dir is None:
|
|
165
|
-
Logger.warning(
|
|
166
|
-
"CropSignatures enabled without an output directory",
|
|
167
|
-
extra={"pdf_root": str(configuration.PdfRoot)},
|
|
168
|
-
)
|
|
169
|
-
cropping_enabled = False
|
|
170
195
|
|
|
171
196
|
total_bboxes = 0
|
|
172
197
|
|
|
173
198
|
def _append_result(file_result: FileResult, source_pdf: Path) -> None:
|
|
174
199
|
nonlocal wrote_first, json_handle, total_bboxes, cropping_available, cropping_attempted
|
|
175
200
|
|
|
176
|
-
if
|
|
201
|
+
if cropping_available and (cropping_enabled or crop_bytes_enabled) and crop_dir is not None:
|
|
177
202
|
try:
|
|
178
|
-
crop_signatures(
|
|
203
|
+
crops = crop_signatures(
|
|
179
204
|
pdf_path=source_pdf,
|
|
180
205
|
file_result=file_result,
|
|
181
206
|
output_dir=crop_dir,
|
|
182
207
|
dpi=configuration.CropImageDpi,
|
|
183
208
|
logger=Logger,
|
|
209
|
+
return_bytes=crop_bytes_enabled,
|
|
210
|
+
save_files=cropping_enabled,
|
|
211
|
+
docx=docx_enabled,
|
|
184
212
|
)
|
|
185
213
|
cropping_attempted = True
|
|
214
|
+
if crop_bytes_enabled:
|
|
215
|
+
for crop in crops:
|
|
216
|
+
crop.signature.CropBytes = base64.b64encode(crop.image_bytes).decode(
|
|
217
|
+
"ascii"
|
|
218
|
+
)
|
|
219
|
+
if crop.docx_bytes:
|
|
220
|
+
crop.signature.CropDocxBytes = base64.b64encode(
|
|
221
|
+
crop.docx_bytes
|
|
222
|
+
).decode("ascii")
|
|
186
223
|
except SignatureCroppingUnavailable as exc:
|
|
187
224
|
cropping_available = False
|
|
188
225
|
Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
|
|
189
226
|
typer.echo(str(exc), err=True)
|
|
190
227
|
except Exception as exc: # pragma: no cover - defensive
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
)
|
|
228
|
+
cropping_available = False
|
|
229
|
+
Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
|
|
230
|
+
typer.echo(str(exc), err=True)
|
|
195
231
|
|
|
196
232
|
total_bboxes += sum(1 for sig in file_result.Signatures if sig.BoundingBox)
|
|
197
233
|
|
|
@@ -231,18 +267,24 @@ def Detect(
|
|
|
231
267
|
json_handle.write(closing)
|
|
232
268
|
json_handle.close()
|
|
233
269
|
|
|
234
|
-
if
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
270
|
+
if write_results:
|
|
271
|
+
if json_handle is not None:
|
|
272
|
+
typer.echo(f"Wrote {json_path}")
|
|
273
|
+
else:
|
|
274
|
+
payload = json.dumps(
|
|
275
|
+
results_buffer or [], indent=2, ensure_ascii=False, default=_JsonSerializer
|
|
276
|
+
)
|
|
277
|
+
typer.echo(payload)
|
|
278
|
+
typer.echo("Detection completed with output disabled (out_dir=none)")
|
|
279
|
+
|
|
280
|
+
if (
|
|
281
|
+
(cropping_enabled or crop_bytes_enabled)
|
|
282
|
+
and cropping_available
|
|
283
|
+
and cropping_attempted
|
|
284
|
+
and total_bboxes == 0
|
|
285
|
+
):
|
|
244
286
|
Logger.warning(
|
|
245
|
-
"No signature bounding boxes detected;
|
|
287
|
+
"No signature bounding boxes detected; install PyMuPDF for crop-ready output",
|
|
246
288
|
extra={"engine": configuration.Engine},
|
|
247
289
|
)
|
|
248
290
|
|