sigdetect 0.3.1__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sigdetect/__init__.py +1 -1
- sigdetect/api.py +43 -11
- sigdetect/cli.py +89 -23
- sigdetect/config.py +48 -3
- sigdetect/cropping.py +72 -12
- sigdetect/detector/__init__.py +27 -8
- sigdetect/detector/pymupdf_engine.py +3 -2
- sigdetect/detector/pypdf2_engine.py +7 -5
- sigdetect/detector/signature_model.py +3 -1
- sigdetect/wet_detection.py +549 -0
- {sigdetect-0.3.1.dist-info → sigdetect-0.5.0.dist-info}/METADATA +28 -25
- sigdetect-0.5.0.dist-info/RECORD +24 -0
- {sigdetect-0.3.1.dist-info → sigdetect-0.5.0.dist-info}/WHEEL +1 -1
- sigdetect-0.3.1.dist-info/RECORD +0 -23
- {sigdetect-0.3.1.dist-info → sigdetect-0.5.0.dist-info}/entry_points.txt +0 -0
- {sigdetect-0.3.1.dist-info → sigdetect-0.5.0.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sigdetect
|
|
3
|
-
Version: 0.3.1
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: Signature detection and role attribution for PDFs
|
|
5
5
|
Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
|
|
6
6
|
License: MIT
|
|
@@ -10,9 +10,11 @@ Requires-Dist: pypdf>=4.0.0
|
|
|
10
10
|
Requires-Dist: rich>=13.0
|
|
11
11
|
Requires-Dist: typer>=0.12
|
|
12
12
|
Requires-Dist: pydantic>=2.5
|
|
13
|
+
Requires-Dist: pillow>=10.0
|
|
14
|
+
Requires-Dist: python-docx>=1.1.0
|
|
15
|
+
Requires-Dist: pytesseract>=0.3.10
|
|
16
|
+
Requires-Dist: pymupdf>=1.23
|
|
13
17
|
Requires-Dist: pyyaml>=6.0
|
|
14
|
-
Provides-Extra: pymupdf
|
|
15
|
-
Requires-Dist: pymupdf>=1.23; extra == "pymupdf"
|
|
16
18
|
|
|
17
19
|
# CaseWorks.Automation.CaseDocumentIntake
|
|
18
20
|
|
|
@@ -95,14 +97,16 @@ sigdetect detect \
|
|
|
95
97
|
### Notes
|
|
96
98
|
|
|
97
99
|
- The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
|
|
98
|
-
-
|
|
100
|
+
- Engine selection is forced to **auto** (prefers PyMuPDF for geometry, falls back to PyPDF2); any configured `engine` value is overridden.
|
|
99
101
|
- `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
|
|
100
102
|
- `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
|
|
101
103
|
- `--profile` selects tuned role logic:
|
|
102
104
|
- `hipaa` → patient / representative / attorney
|
|
103
105
|
- `retainer` → client / firm (prefers detecting two signatures)
|
|
104
106
|
- `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
|
|
105
|
-
-
|
|
107
|
+
- Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
|
|
108
|
+
- Cropping (`--crop-signatures`) writes a one-image `.docx` per signature in the crop output directory (no PNG files on disk); `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` for in-memory use. PyMuPDF is required for crops, and `python-docx` is required for `.docx` output.
|
|
109
|
+
- Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
|
|
106
110
|
- If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
|
|
107
111
|
|
|
108
112
|
### EDA (quick aggregate stats)
|
|
@@ -113,6 +117,8 @@ sigdetect eda \
|
|
|
113
117
|
|
|
114
118
|
~~~
|
|
115
119
|
|
|
120
|
+
`sigdetect eda` expects `results.json`; enable `write_results: true` when running detect.
|
|
121
|
+
|
|
116
122
|
---
|
|
117
123
|
|
|
118
124
|
## Library usage
|
|
@@ -136,15 +142,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
|
136
142
|
print(result.to_dict())
|
|
137
143
|
~~~
|
|
138
144
|
|
|
139
|
-
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When
|
|
145
|
+
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When cropping is enabled, `crop_path` points at the generated `.docx`. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
|
|
140
146
|
|
|
141
147
|
---
|
|
142
148
|
|
|
143
149
|
## Library API (embed in another script)
|
|
144
150
|
|
|
145
|
-
Minimal, plug-and-play API
|
|
146
|
-
Import from `sigdetect.api` and get plain dicts out (JSON-ready),
|
|
147
|
-
with no I/O side effects by default:
|
|
151
|
+
Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping. Engine selection is forced to `auto` (PyMuPDF preferred) to ensure geometry. Wet detection runs automatically for non-e-sign PDFs; pass `runWetDetection=False` to skip OCR.
|
|
148
152
|
|
|
149
153
|
~~~python
|
|
150
154
|
from pathlib import Path
|
|
@@ -167,6 +171,7 @@ result = DetectPdf(
|
|
|
167
171
|
profileName="retainer",
|
|
168
172
|
includePseudoSignatures=True,
|
|
169
173
|
recurseXObjects=True,
|
|
174
|
+
# runWetDetection=False, # disable OCR-backed wet detection if desired
|
|
170
175
|
)
|
|
171
176
|
print(
|
|
172
177
|
result["file"],
|
|
@@ -189,26 +194,17 @@ for res in ScanDirectory(
|
|
|
189
194
|
# store in DB, print, etc.
|
|
190
195
|
pass
|
|
191
196
|
|
|
192
|
-
# 3)
|
|
197
|
+
# 3) Create DOCX crops for FileResult objects (requires PyMuPDF + python-docx)
|
|
193
198
|
detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
|
|
194
199
|
file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
195
|
-
|
|
200
|
+
CropSignatureImages(
|
|
196
201
|
"/path/to/pdfs/example.pdf",
|
|
197
202
|
file_result,
|
|
198
203
|
outputDirectory="./signature_crops",
|
|
199
204
|
dpi=200,
|
|
200
|
-
returnBytes=True, # also returns in-memory PNG bytes for each crop
|
|
201
|
-
# saveToDisk=False, # optional: skip writing PNGs to disk
|
|
202
205
|
)
|
|
203
|
-
|
|
204
|
-
first_crop = crops[0]
|
|
205
|
-
print(first_crop.path, len(first_crop.image_bytes))
|
|
206
206
|
~~~
|
|
207
207
|
|
|
208
|
-
When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
|
|
209
|
-
PNG bytes, and the originating signature metadata.
|
|
210
|
-
Pass ``saveToDisk=False`` if you only want in-memory PNG bytes (no files on disk or ``crop_path`` updates).
|
|
211
|
-
|
|
212
208
|
|
|
213
209
|
## Result schema
|
|
214
210
|
|
|
@@ -237,7 +233,7 @@ High-level summary (per file):
|
|
|
237
233
|
"hint": "AcroSig:sig_patient",
|
|
238
234
|
"render_type": "typed",
|
|
239
235
|
"bounding_box": [10.0, 10.0, 150.0, 40.0],
|
|
240
|
-
"crop_path": "signature_crops/example/sig_01_patient.png"
|
|
236
|
+
"crop_path": "signature_crops/example/sig_01_patient.docx"
|
|
241
237
|
},
|
|
242
238
|
{
|
|
243
239
|
"page": null,
|
|
@@ -247,7 +243,7 @@ High-level summary (per file):
|
|
|
247
243
|
"scores": { "page_label": 4, "general": 2 },
|
|
248
244
|
"evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
|
|
249
245
|
"hint": "VendorOrAcroOnly",
|
|
250
|
-
"render_type": "
|
|
246
|
+
"render_type": "typed",
|
|
251
247
|
"bounding_box": null,
|
|
252
248
|
"crop_path": null
|
|
253
249
|
}
|
|
@@ -263,7 +259,8 @@ High-level summary (per file):
|
|
|
263
259
|
- **`roles`** summarizes unique non-`unknown` roles across signatures.
|
|
264
260
|
- In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
|
|
265
261
|
- **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
|
|
266
|
-
- **`signatures[].crop_path`** is populated when
|
|
262
|
+
- **`signatures[].crop_path`** is populated when DOCX crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
|
|
263
|
+
- **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
|
|
267
264
|
|
|
268
265
|
---
|
|
269
266
|
|
|
@@ -285,13 +282,18 @@ You can keep one config YAML per dataset, e.g.:
|
|
|
285
282
|
# ./sample_data/config.yml (example)
|
|
286
283
|
pdf_root: ./pdfs
|
|
287
284
|
out_dir: ./sigdetect_out
|
|
288
|
-
engine:
|
|
285
|
+
engine: auto
|
|
286
|
+
write_results: false
|
|
289
287
|
pseudo_signatures: true
|
|
290
288
|
recurse_xobjects: true
|
|
291
289
|
profile: retainer # or: hipaa
|
|
292
|
-
crop_signatures: false # enable to write
|
|
290
|
+
crop_signatures: false # enable to write DOCX crops (requires pymupdf + python-docx)
|
|
293
291
|
# crop_output_dir: ./signature_crops
|
|
294
292
|
crop_image_dpi: 200
|
|
293
|
+
detect_wet_signatures: false # kept for compatibility; non-e-sign PDFs still trigger OCR
|
|
294
|
+
wet_ocr_dpi: 200
|
|
295
|
+
wet_ocr_languages: eng
|
|
296
|
+
wet_precision_threshold: 0.82
|
|
295
297
|
~~~
|
|
296
298
|
|
|
297
299
|
YAML files can be customized or loaded at runtime (see CLI `--config`, if available, or import and pass patterns into the engine).
|
|
@@ -306,6 +308,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
|
|
|
306
308
|
- Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
|
|
307
309
|
- Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
|
|
308
310
|
- When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
|
|
311
|
+
- **Wet detection (non-e-sign):** The CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection whenever no e-sign evidence is found. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. When an image-based signature is present on a page, label-only OCR candidates are suppressed unless a stroke is detected. Results are deduped to the top signature per role (dropping `unknown`). Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
|
|
309
312
|
|
|
310
313
|
---
|
|
311
314
|
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
sigdetect/__init__.py,sha256=YvnTwlC1jfq83EhQS_1JjiiHK7_wJCCU1JvHv5E1qWY,573
|
|
2
|
+
sigdetect/api.py,sha256=uaU7JbSGpyViiXrrHu-iuifIi8xIes3PGeBZkoLNlPg,10800
|
|
3
|
+
sigdetect/cli.py,sha256=d5AznKwQPvYKVzC8RCBDgC9SlB4Goz1_pB2_EFzrsTg,10349
|
|
4
|
+
sigdetect/config.py,sha256=rJdlu9pM4aqeoY7Ha5qocPmZ7_UeVOOFepBlqOne2b8,7873
|
|
5
|
+
sigdetect/cropping.py,sha256=UeKL6dBY18V1E2DoLSbGjTzdGnjhz2WKPi3l3Q0Brh8,8516
|
|
6
|
+
sigdetect/eda.py,sha256=S92G1Gjmepri__D0n_V6foq0lQgH-RXI9anW8A58jfw,4681
|
|
7
|
+
sigdetect/logging_setup.py,sha256=LMF8ao_a-JwH0S522T6aYTFX3e8Ajjv_5ODS2YiBcHA,6404
|
|
8
|
+
sigdetect/utils.py,sha256=T9rubLf5T9JmjOHYMOba1j34fhOJaWocAXccnGTxRUE,5198
|
|
9
|
+
sigdetect/wet_detection.py,sha256=zvi11XUmm_xLZ4BLvxInwMQg8YLcyQzEYAM9QSdJOIs,18259
|
|
10
|
+
sigdetect/data/role_rules.retainer.yml,sha256=IFdwKnDBXR2cTkdfrsZ6ku6CXD8S_dg5A3vKRKLW5h8,2532
|
|
11
|
+
sigdetect/data/role_rules.yml,sha256=HuLKsZR_A6sD9XvY4NHiY_VG3dS5ERNCBF9-Mxawomw,2751
|
|
12
|
+
sigdetect/data/vendor_patterns.yml,sha256=NRbZNQxcx_GuL6n1jAphBn6MM6ChCpeWGCsjbRx-PEo,384
|
|
13
|
+
sigdetect/detector/__init__.py,sha256=nT52mCI9s03Rso_RS86mm223rJfl5GlGDFsXwMJ3z3E,2548
|
|
14
|
+
sigdetect/detector/base.py,sha256=L-iXWXqsTetDc4jRZo_wOdbNpKqOY20mX9FefrugdT0,263
|
|
15
|
+
sigdetect/detector/base_detector.py,sha256=GmAgUWO_fQgIfnihZSoyhR3wpnwZ-X3hS0Kuyz4G6Ys,608
|
|
16
|
+
sigdetect/detector/file_result_model.py,sha256=j2gTc9Sw3fJOHlexYsR_m5DiwHA8DzIzAMToESfvo4A,1767
|
|
17
|
+
sigdetect/detector/pymupdf_engine.py,sha256=N6oxvUa-48VvvhjbMk0R0kfScsggNKS7u5FLSeBRfWw,17358
|
|
18
|
+
sigdetect/detector/pypdf2_engine.py,sha256=kB8cIp_gMvCla0LIBi9sd19g0361Oc9TjCW_ZViUBJQ,47410
|
|
19
|
+
sigdetect/detector/signature_model.py,sha256=0SEUc34wvOvrzy_fDzzD42A9LsSzIOeZ4rERPDHimsA,1149
|
|
20
|
+
sigdetect-0.5.0.dist-info/METADATA,sha256=-Jgo6JZwWA18uqhjBv2mqZc43y9KHLfpMoPec7ObGow,13628
|
|
21
|
+
sigdetect-0.5.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
22
|
+
sigdetect-0.5.0.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
|
|
23
|
+
sigdetect-0.5.0.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
|
|
24
|
+
sigdetect-0.5.0.dist-info/RECORD,,
|
sigdetect-0.3.1.dist-info/RECORD
DELETED
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
sigdetect/__init__.py,sha256=LhY78mDZ1ClYVNTxW_qtE-vqJoN9N7N5ZcNRDUI_3ss,575
|
|
2
|
-
sigdetect/api.py,sha256=6_CMSxcag9coHHzrpuRSVimHWSNtqQiWY9hdlqQ2IKY,9396
|
|
3
|
-
sigdetect/cli.py,sha256=NctAnaB-TQrUAT9m-v8kj2_KTNs88kbFOCiX32tHZm8,7920
|
|
4
|
-
sigdetect/config.py,sha256=S0NVKuJYiHJCocL-VNFdGJpasFcjTecavC4EthyS1DQ,5951
|
|
5
|
-
sigdetect/cropping.py,sha256=dmJF4Q1tkmkfm0NaiwHddNOP8Sj9S4Lj_d5EBjodEkk,6015
|
|
6
|
-
sigdetect/eda.py,sha256=S92G1Gjmepri__D0n_V6foq0lQgH-RXI9anW8A58jfw,4681
|
|
7
|
-
sigdetect/logging_setup.py,sha256=LMF8ao_a-JwH0S522T6aYTFX3e8Ajjv_5ODS2YiBcHA,6404
|
|
8
|
-
sigdetect/utils.py,sha256=T9rubLf5T9JmjOHYMOba1j34fhOJaWocAXccnGTxRUE,5198
|
|
9
|
-
sigdetect/data/role_rules.retainer.yml,sha256=IFdwKnDBXR2cTkdfrsZ6ku6CXD8S_dg5A3vKRKLW5h8,2532
|
|
10
|
-
sigdetect/data/role_rules.yml,sha256=HuLKsZR_A6sD9XvY4NHiY_VG3dS5ERNCBF9-Mxawomw,2751
|
|
11
|
-
sigdetect/data/vendor_patterns.yml,sha256=NRbZNQxcx_GuL6n1jAphBn6MM6ChCpeWGCsjbRx-PEo,384
|
|
12
|
-
sigdetect/detector/__init__.py,sha256=up2FCmD09f2bRHcS4WbY-clx3GQbWuk1PM2JlxgusHg,1608
|
|
13
|
-
sigdetect/detector/base.py,sha256=L-iXWXqsTetDc4jRZo_wOdbNpKqOY20mX9FefrugdT0,263
|
|
14
|
-
sigdetect/detector/base_detector.py,sha256=GmAgUWO_fQgIfnihZSoyhR3wpnwZ-X3hS0Kuyz4G6Ys,608
|
|
15
|
-
sigdetect/detector/file_result_model.py,sha256=j2gTc9Sw3fJOHlexYsR_m5DiwHA8DzIzAMToESfvo4A,1767
|
|
16
|
-
sigdetect/detector/pymupdf_engine.py,sha256=iyp7JuPlUnydwohH5zbNg4MwH44mBmxbBWOS3ZmArBo,17339
|
|
17
|
-
sigdetect/detector/pypdf2_engine.py,sha256=INWQH06kMLvto2VS-EdLC-EtMC6AG7JmdVYmNgx6_RU,47313
|
|
18
|
-
sigdetect/detector/signature_model.py,sha256=mztb9V5wgv2oohQ5Cxzcv8_Bo6TyWAVIXteaeQ2rywQ,1076
|
|
19
|
-
sigdetect-0.3.1.dist-info/METADATA,sha256=whXGE4-9spAjlMcZz_owdsIiB4EobXL9_UOuAJeDVfA,12342
|
|
20
|
-
sigdetect-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
21
|
-
sigdetect-0.3.1.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
|
|
22
|
-
sigdetect-0.3.1.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
|
|
23
|
-
sigdetect-0.3.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|