sigdetect 0.3.1__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {sigdetect-0.3.1 → sigdetect-0.5.0}/PKG-INFO +28 -25
  2. {sigdetect-0.3.1 → sigdetect-0.5.0}/README.md +23 -22
  3. {sigdetect-0.3.1 → sigdetect-0.5.0}/pyproject.toml +5 -4
  4. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/__init__.py +1 -1
  5. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/api.py +43 -11
  6. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/cli.py +89 -23
  7. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/config.py +48 -3
  8. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/cropping.py +72 -12
  9. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/detector/__init__.py +27 -8
  10. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/detector/pymupdf_engine.py +3 -2
  11. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/detector/pypdf2_engine.py +7 -5
  12. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/detector/signature_model.py +3 -1
  13. sigdetect-0.5.0/src/sigdetect/wet_detection.py +549 -0
  14. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect.egg-info/PKG-INFO +28 -25
  15. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect.egg-info/SOURCES.txt +4 -0
  16. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect.egg-info/requires.txt +4 -3
  17. {sigdetect-0.3.1 → sigdetect-0.5.0}/tests/test_api.py +36 -1
  18. sigdetect-0.5.0/tests/test_cli.py +275 -0
  19. {sigdetect-0.3.1 → sigdetect-0.5.0}/tests/test_cropping.py +12 -1
  20. sigdetect-0.5.0/tests/test_detector_options.py +82 -0
  21. sigdetect-0.5.0/tests/test_wet_detection.py +215 -0
  22. {sigdetect-0.3.1 → sigdetect-0.5.0}/setup.cfg +0 -0
  23. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/data/role_rules.retainer.yml +0 -0
  24. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/data/role_rules.yml +0 -0
  25. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/data/vendor_patterns.yml +0 -0
  26. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/detector/base.py +0 -0
  27. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/detector/base_detector.py +0 -0
  28. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/detector/file_result_model.py +0 -0
  29. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/eda.py +0 -0
  30. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/logging_setup.py +0 -0
  31. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect/utils.py +0 -0
  32. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect.egg-info/dependency_links.txt +0 -0
  33. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect.egg-info/entry_points.txt +0 -0
  34. {sigdetect-0.3.1 → sigdetect-0.5.0}/src/sigdetect.egg-info/top_level.txt +0 -0
  35. {sigdetect-0.3.1 → sigdetect-0.5.0}/tests/test_pymupdf_engine.py +0 -0
  36. {sigdetect-0.3.1 → sigdetect-0.5.0}/tests/test_widget_role_patient_smoke.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sigdetect
3
- Version: 0.3.1
3
+ Version: 0.5.0
4
4
  Summary: Signature detection and role attribution for PDFs
5
5
  Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
6
6
  License: MIT
@@ -10,9 +10,11 @@ Requires-Dist: pypdf>=4.0.0
10
10
  Requires-Dist: rich>=13.0
11
11
  Requires-Dist: typer>=0.12
12
12
  Requires-Dist: pydantic>=2.5
13
+ Requires-Dist: pillow>=10.0
14
+ Requires-Dist: python-docx>=1.1.0
15
+ Requires-Dist: pytesseract>=0.3.10
16
+ Requires-Dist: pymupdf>=1.23
13
17
  Requires-Dist: pyyaml>=6.0
14
- Provides-Extra: pymupdf
15
- Requires-Dist: pymupdf>=1.23; extra == "pymupdf"
16
18
 
17
19
  # CaseWorks.Automation.CaseDocumentIntake
18
20
 
@@ -95,14 +97,16 @@ sigdetect detect \
95
97
  ### Notes
96
98
 
97
99
  - The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
98
- - `--engine` supports **pypdf2** (default); a **pymupdf** engine placeholder exists and may be included in a future build.
100
+ - Engine selection is forced to **auto** (prefers PyMuPDF for geometry, falls back to PyPDF2); any configured `engine` value is overridden.
99
101
  - `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
100
102
  - `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
101
103
  - `--profile` selects tuned role logic:
102
104
  - `hipaa` → patient / representative / attorney
103
105
  - `retainer` → client / firm (prefers detecting two signatures)
104
106
  - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
105
- - `--crop-signatures` enables PNG crops for each detected widget (requires installing the optional `pymupdf` dependency). Use `--crop-dir` to override the destination and `--crop-dpi` to choose rendering quality.
107
+ - Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
108
+ - Cropping (`--crop-signatures`) writes a one-image `.docx` per signature in the crop output directory (no PNG files on disk); `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` for in-memory use. PyMuPDF is required for crops, and `python-docx` is required for `.docx` output.
109
+ - Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
106
110
  - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
107
111
 
108
112
  ### EDA (quick aggregate stats)
@@ -113,6 +117,8 @@ sigdetect eda \
113
117
 
114
118
  ~~~
115
119
 
120
+ `sigdetect eda` expects `results.json`; enable `write_results: true` when running detect.
121
+
116
122
  ---
117
123
 
118
124
  ## Library usage
@@ -136,15 +142,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
136
142
  print(result.to_dict())
137
143
  ~~~
138
144
 
139
- `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image.
145
+ `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When cropping is enabled, `crop_path` points at the generated `.docx`. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
140
146
 
141
147
  ---
142
148
 
143
149
  ## Library API (embed in another script)
144
150
 
145
- Minimal, plug-and-play API
146
- Import from `sigdetect.api` and get plain dicts out (JSON-ready),
147
- with no I/O side effects by default:
151
+ Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping. Engine selection is forced to `auto` (PyMuPDF preferred) to ensure geometry. Wet detection runs automatically for non-e-sign PDFs; pass `runWetDetection=False` to skip OCR.
148
152
 
149
153
  ~~~python
150
154
  from pathlib import Path
@@ -167,6 +171,7 @@ result = DetectPdf(
167
171
  profileName="retainer",
168
172
  includePseudoSignatures=True,
169
173
  recurseXObjects=True,
174
+ # runWetDetection=False, # disable OCR-backed wet detection if desired
170
175
  )
171
176
  print(
172
177
  result["file"],
@@ -189,26 +194,17 @@ for res in ScanDirectory(
189
194
  # store in DB, print, etc.
190
195
  pass
191
196
 
192
- # 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
197
+ # 3) Create DOCX crops for FileResult objects (requires PyMuPDF + python-docx)
193
198
  detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
194
199
  file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
195
- crops = CropSignatureImages(
200
+ CropSignatureImages(
196
201
  "/path/to/pdfs/example.pdf",
197
202
  file_result,
198
203
  outputDirectory="./signature_crops",
199
204
  dpi=200,
200
- returnBytes=True, # also returns in-memory PNG bytes for each crop
201
- # saveToDisk=False, # optional: skip writing PNGs to disk
202
205
  )
203
-
204
- first_crop = crops[0]
205
- print(first_crop.path, len(first_crop.image_bytes))
206
206
  ~~~
207
207
 
208
- When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
209
- PNG bytes, and the originating signature metadata.
210
- Pass ``saveToDisk=False`` if you only want in-memory PNG bytes (no files on disk or ``crop_path`` updates).
211
-
212
208
 
213
209
  ## Result schema
214
210
 
@@ -237,7 +233,7 @@ High-level summary (per file):
237
233
  "hint": "AcroSig:sig_patient",
238
234
  "render_type": "typed",
239
235
  "bounding_box": [10.0, 10.0, 150.0, 40.0],
240
- "crop_path": "signature_crops/example/sig_01_patient.png"
236
+ "crop_path": "signature_crops/example/sig_01_patient.docx"
241
237
  },
242
238
  {
243
239
  "page": null,
@@ -247,7 +243,7 @@ High-level summary (per file):
247
243
  "scores": { "page_label": 4, "general": 2 },
248
244
  "evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
249
245
  "hint": "VendorOrAcroOnly",
250
- "render_type": "unknown",
246
+ "render_type": "typed",
251
247
  "bounding_box": null,
252
248
  "crop_path": null
253
249
  }
@@ -263,7 +259,8 @@ High-level summary (per file):
263
259
  - **`roles`** summarizes unique non-`unknown` roles across signatures.
264
260
  - In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
265
261
  - **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
266
- - **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
262
+ - **`signatures[].crop_path`** is populated when DOCX crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
263
+ - **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
267
264
 
268
265
  ---
269
266
 
@@ -285,13 +282,18 @@ You can keep one config YAML per dataset, e.g.:
285
282
  # ./sample_data/config.yml (example)
286
283
  pdf_root: ./pdfs
287
284
  out_dir: ./sigdetect_out
288
- engine: pypdf2
285
+ engine: auto
286
+ write_results: false
289
287
  pseudo_signatures: true
290
288
  recurse_xobjects: true
291
289
  profile: retainer # or: hipaa
292
- crop_signatures: false # enable to write PNG crops (requires pymupdf)
290
+ crop_signatures: false # enable to write DOCX crops (requires pymupdf + python-docx)
293
291
  # crop_output_dir: ./signature_crops
294
292
  crop_image_dpi: 200
293
+ detect_wet_signatures: false # kept for compatibility; non-e-sign PDFs still trigger OCR
294
+ wet_ocr_dpi: 200
295
+ wet_ocr_languages: eng
296
+ wet_precision_threshold: 0.82
295
297
  ~~~
296
298
 
297
299
  YAML files can be customized or loaded at runtime (see CLI `--config`, if available, or import and pass patterns into the engine).
@@ -306,6 +308,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
306
308
  - Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
307
309
  - Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
308
310
  - When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
311
+ - **Wet detection (non-e-sign):** The CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection whenever no e-sign evidence is found. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. When an image-based signature is present on a page, label-only OCR candidates are suppressed unless a stroke is detected. Results are deduped to the top signature per role (dropping `unknown`). Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
309
312
 
310
313
  ---
311
314
 
@@ -79,14 +79,16 @@ sigdetect detect \
79
79
  ### Notes
80
80
 
81
81
  - The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
82
- - `--engine` supports **pypdf2** (default); a **pymupdf** engine placeholder exists and may be included in a future build.
82
+ - Engine selection is forced to **auto** (prefers PyMuPDF for geometry, falls back to PyPDF2); any configured `engine` value is overridden.
83
83
  - `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
84
84
  - `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
85
85
  - `--profile` selects tuned role logic:
86
86
  - `hipaa` → patient / representative / attorney
87
87
  - `retainer` → client / firm (prefers detecting two signatures)
88
88
  - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
89
- - `--crop-signatures` enables PNG crops for each detected widget (requires installing the optional `pymupdf` dependency). Use `--crop-dir` to override the destination and `--crop-dpi` to choose rendering quality.
89
+ - Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
90
+ - Cropping (`--crop-signatures`) writes a one-image `.docx` per signature in the crop output directory (no PNG files on disk); `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` for in-memory use. PyMuPDF is required for crops, and `python-docx` is required for `.docx` output.
91
+ - Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
90
92
  - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
91
93
 
92
94
  ### EDA (quick aggregate stats)
@@ -97,6 +99,8 @@ sigdetect eda \
97
99
 
98
100
  ~~~
99
101
 
102
+ `sigdetect eda` expects `results.json`; enable `write_results: true` when running detect.
103
+
100
104
  ---
101
105
 
102
106
  ## Library usage
@@ -120,15 +124,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
120
124
  print(result.to_dict())
121
125
  ~~~
122
126
 
123
- `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image.
127
+ `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When cropping is enabled, `crop_path` points at the generated `.docx`. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
124
128
 
125
129
  ---
126
130
 
127
131
  ## Library API (embed in another script)
128
132
 
129
- Minimal, plug-and-play API
130
- Import from `sigdetect.api` and get plain dicts out (JSON-ready),
131
- with no I/O side effects by default:
133
+ Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping. Engine selection is forced to `auto` (PyMuPDF preferred) to ensure geometry. Wet detection runs automatically for non-e-sign PDFs; pass `runWetDetection=False` to skip OCR.
132
134
 
133
135
  ~~~python
134
136
  from pathlib import Path
@@ -151,6 +153,7 @@ result = DetectPdf(
151
153
  profileName="retainer",
152
154
  includePseudoSignatures=True,
153
155
  recurseXObjects=True,
156
+ # runWetDetection=False, # disable OCR-backed wet detection if desired
154
157
  )
155
158
  print(
156
159
  result["file"],
@@ -173,26 +176,17 @@ for res in ScanDirectory(
173
176
  # store in DB, print, etc.
174
177
  pass
175
178
 
176
- # 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
179
+ # 3) Create DOCX crops for FileResult objects (requires PyMuPDF + python-docx)
177
180
  detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
178
181
  file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
179
- crops = CropSignatureImages(
182
+ CropSignatureImages(
180
183
  "/path/to/pdfs/example.pdf",
181
184
  file_result,
182
185
  outputDirectory="./signature_crops",
183
186
  dpi=200,
184
- returnBytes=True, # also returns in-memory PNG bytes for each crop
185
- # saveToDisk=False, # optional: skip writing PNGs to disk
186
187
  )
187
-
188
- first_crop = crops[0]
189
- print(first_crop.path, len(first_crop.image_bytes))
190
188
  ~~~
191
189
 
192
- When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
193
- PNG bytes, and the originating signature metadata.
194
- Pass ``saveToDisk=False`` if you only want in-memory PNG bytes (no files on disk or ``crop_path`` updates).
195
-
196
190
 
197
191
  ## Result schema
198
192
 
@@ -221,7 +215,7 @@ High-level summary (per file):
221
215
  "hint": "AcroSig:sig_patient",
222
216
  "render_type": "typed",
223
217
  "bounding_box": [10.0, 10.0, 150.0, 40.0],
224
- "crop_path": "signature_crops/example/sig_01_patient.png"
218
+ "crop_path": "signature_crops/example/sig_01_patient.docx"
225
219
  },
226
220
  {
227
221
  "page": null,
@@ -231,7 +225,7 @@ High-level summary (per file):
231
225
  "scores": { "page_label": 4, "general": 2 },
232
226
  "evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
233
227
  "hint": "VendorOrAcroOnly",
234
- "render_type": "unknown",
228
+ "render_type": "typed",
235
229
  "bounding_box": null,
236
230
  "crop_path": null
237
231
  }
@@ -247,7 +241,8 @@ High-level summary (per file):
247
241
  - **`roles`** summarizes unique non-`unknown` roles across signatures.
248
242
  - In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
249
243
  - **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
250
- - **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
244
+ - **`signatures[].crop_path`** is populated when DOCX crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
245
+ - **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
251
246
 
252
247
  ---
253
248
 
@@ -269,13 +264,18 @@ You can keep one config YAML per dataset, e.g.:
269
264
  # ./sample_data/config.yml (example)
270
265
  pdf_root: ./pdfs
271
266
  out_dir: ./sigdetect_out
272
- engine: pypdf2
267
+ engine: auto
268
+ write_results: false
273
269
  pseudo_signatures: true
274
270
  recurse_xobjects: true
275
271
  profile: retainer # or: hipaa
276
- crop_signatures: false # enable to write PNG crops (requires pymupdf)
272
+ crop_signatures: false # enable to write DOCX crops (requires pymupdf + python-docx)
277
273
  # crop_output_dir: ./signature_crops
278
274
  crop_image_dpi: 200
275
+ detect_wet_signatures: false # kept for compatibility; non-e-sign PDFs still trigger OCR
276
+ wet_ocr_dpi: 200
277
+ wet_ocr_languages: eng
278
+ wet_precision_threshold: 0.82
279
279
  ~~~
280
280
 
281
281
  YAML files can be customized or loaded at runtime (see CLI `--config`, if available, or import and pass patterns into the engine).
@@ -290,6 +290,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
290
290
  - Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
291
291
  - Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
292
292
  - When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
293
+ - **Wet detection (non-e-sign):** The CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection whenever no e-sign evidence is found. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. When an image-based signature is present on a page, label-only OCR candidates are suppressed unless a stroke is detected. Results are deduped to the top signature per role (dropping `unknown`). Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
293
294
 
294
295
  ---
295
296
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sigdetect"
7
- version = "0.3.1"
7
+ version = "0.5.0"
8
8
  description = "Signature detection and role attribution for PDFs"
9
9
  readme = "README.md"
10
10
  authors = [{ name = "BT Asmamaw", email = "basmamaw@angeiongroup.com" }]
@@ -15,12 +15,13 @@ dependencies = [
15
15
  "rich>=13.0",
16
16
  "typer>=0.12",
17
17
  "pydantic>=2.5",
18
+ "pillow>=10.0",
19
+ "python-docx>=1.1.0",
20
+ "pytesseract>=0.3.10",
21
+ "pymupdf>=1.23",
18
22
  "pyyaml>=6.0",
19
23
  ]
20
24
 
21
- [project.optional-dependencies]
22
- pymupdf = ["pymupdf>=1.23"]
23
-
24
25
  [project.scripts]
25
26
  sigdetect = "sigdetect.cli:app"
26
27
 
@@ -21,4 +21,4 @@ try:
21
21
  except PackageNotFoundError: # pragma: no cover
22
22
  __version__ = "0.0.0"
23
23
 
24
- DEFAULT_ENGINE = "pypdf2"
24
+ DEFAULT_ENGINE = "auto"
@@ -9,8 +9,9 @@ from typing import Any, Generator, Iterable, Iterator, Literal, overload
9
9
  from sigdetect.config import DetectConfiguration
10
10
  from sigdetect.cropping import SignatureCrop
11
11
  from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
12
+ from sigdetect.wet_detection import apply_wet_detection
12
13
 
13
- EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
14
+ EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
14
15
  ProfileName = Literal["hipaa", "retainer"]
15
16
 
16
17
 
@@ -18,12 +19,16 @@ def DetectPdf(
18
19
  pdfPath: str | Path,
19
20
  *,
20
21
  profileName: ProfileName = "hipaa",
21
- engineName: EngineName = "pypdf2",
22
+ engineName: EngineName = "auto",
22
23
  includePseudoSignatures: bool = True,
23
24
  recurseXObjects: bool = True,
25
+ runWetDetection: bool = True,
24
26
  detector: Detector | None = None,
25
27
  ) -> dict[str, Any]:
26
- """Detect signature evidence and assign roles for a single PDF."""
28
+ """Detect signature evidence and assign roles for a single PDF.
29
+
30
+ Wet detection runs by default for non-e-sign PDFs; pass ``runWetDetection=False`` to skip OCR.
31
+ """
27
32
 
28
33
  resolvedPath = Path(pdfPath)
29
34
  activeDetector = detector or get_detector(
@@ -36,6 +41,10 @@ def DetectPdf(
36
41
  )
37
42
 
38
43
  result = activeDetector.Detect(resolvedPath)
44
+ if runWetDetection:
45
+ configuration = _ResolveConfiguration(activeDetector)
46
+ if configuration is not None:
47
+ apply_wet_detection(resolvedPath, configuration, result)
39
48
  return _ToPlainDictionary(result)
40
49
 
41
50
 
@@ -43,12 +52,15 @@ def get_detector(
43
52
  *,
44
53
  pdfRoot: str | Path | None = None,
45
54
  profileName: ProfileName = "hipaa",
46
- engineName: EngineName = "pypdf2",
55
+ engineName: EngineName = "auto",
47
56
  includePseudoSignatures: bool = True,
48
57
  recurseXObjects: bool = True,
49
58
  outputDirectory: str | Path | None = None,
50
59
  ) -> Detector:
51
- """Return a reusable detector instance configured with the supplied options."""
60
+ """Return a reusable detector instance configured with the supplied options.
61
+
62
+ Engine selection is forced to ``auto`` (prefers PyMuPDF when available).
63
+ """
52
64
 
53
65
  configuration = DetectConfiguration(
54
66
  PdfRoot=Path(pdfRoot) if pdfRoot is not None else Path.cwd(),
@@ -108,6 +120,7 @@ def _ToPlainValue(value: Any) -> Any:
108
120
  def DetectMany(
109
121
  pdfPaths: Iterable[str | Path],
110
122
  *,
123
+ runWetDetection: bool = True,
111
124
  detector: Detector | None = None,
112
125
  **kwargs: Any,
113
126
  ) -> Iterator[dict[str, Any]]:
@@ -115,17 +128,18 @@ def DetectMany(
115
128
 
116
129
  if detector is not None:
117
130
  for pdfPath in pdfPaths:
118
- yield _DetectWithDetector(detector, pdfPath)
131
+ yield _DetectWithDetector(detector, pdfPath, runWetDetection=runWetDetection)
119
132
  return
120
133
 
121
134
  for pdfPath in pdfPaths:
122
- yield DetectPdf(pdfPath, **kwargs)
135
+ yield DetectPdf(pdfPath, runWetDetection=runWetDetection, **kwargs)
123
136
 
124
137
 
125
138
  def ScanDirectory(
126
139
  pdfRoot: str | Path,
127
140
  *,
128
141
  globPattern: str = "**/*.pdf",
142
+ runWetDetection: bool = True,
129
143
  detector: Detector | None = None,
130
144
  **kwargs: Any,
131
145
  ) -> Iterator[dict[str, Any]]:
@@ -143,7 +157,7 @@ def ScanDirectory(
143
157
 
144
158
  for pdfPath in iterator:
145
159
  if pdfPath.is_file() and pdfPath.suffix.lower() == ".pdf":
146
- yield DetectPdf(pdfPath, detector=detector, **kwargs)
160
+ yield DetectPdf(pdfPath, detector=detector, runWetDetection=runWetDetection, **kwargs)
147
161
 
148
162
 
149
163
  def ToCsvRow(result: dict[str, Any]) -> dict[str, Any]:
@@ -174,11 +188,25 @@ def Version() -> str:
174
188
  return "0.0.0-dev"
175
189
 
176
190
 
177
- def _DetectWithDetector(detector: Detector, pdfPath: str | Path) -> dict[str, Any]:
191
+ def _DetectWithDetector(
192
+ detector: Detector, pdfPath: str | Path, *, runWetDetection: bool
193
+ ) -> dict[str, Any]:
178
194
  """Helper that runs ``detector`` and returns the plain dictionary result."""
179
195
 
180
196
  resolvedPath = Path(pdfPath)
181
- return _ToPlainDictionary(detector.Detect(resolvedPath))
197
+ result = detector.Detect(resolvedPath)
198
+ if runWetDetection:
199
+ configuration = _ResolveConfiguration(detector)
200
+ if configuration is not None:
201
+ apply_wet_detection(resolvedPath, configuration, result)
202
+ return _ToPlainDictionary(result)
203
+
204
+
205
+ def _ResolveConfiguration(detector: Detector) -> DetectConfiguration | None:
206
+ configuration = getattr(detector, "Configuration", None)
207
+ if isinstance(configuration, DetectConfiguration):
208
+ return configuration
209
+ return None
182
210
 
183
211
 
184
212
  @contextmanager
@@ -225,12 +253,15 @@ def CropSignatureImages(
225
253
  returnBytes: bool = False,
226
254
  saveToDisk: bool = True,
227
255
  ) -> list[Path] | list[SignatureCrop]:
228
- """Crop detected signature regions to PNG files.
256
+ """Create DOCX files containing cropped signature images.
229
257
 
230
258
  Accepts either a :class:`FileResult` instance or the ``dict`` returned by
231
259
  :func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.
232
260
  Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop. Set
233
261
  ``saveToDisk=False`` to skip writing PNG files while still returning in-memory data.
262
+ When ``saveToDisk`` is enabled, a one-image DOCX file is also written per crop. When
263
+ ``returnBytes`` is True and ``python-docx`` is available, the returned
264
+ :class:`SignatureCrop` objects include ``docx_bytes``.
234
265
  """
235
266
 
236
267
  from sigdetect.cropping import crop_signatures
@@ -273,6 +304,7 @@ def _CoerceFileResult(
273
304
  RenderType=str(entry.get("render_type") or "unknown"),
274
305
  BoundingBox=tuple(bbox) if bbox else None,
275
306
  CropPath=entry.get("crop_path"),
307
+ CropBytes=entry.get("crop_bytes"),
276
308
  )
277
309
  )
278
310