sigdetect 0.4.0__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. {sigdetect-0.4.0/src/sigdetect.egg-info → sigdetect-0.5.1}/PKG-INFO +25 -12
  2. sigdetect-0.4.0/PKG-INFO → sigdetect-0.5.1/README.md +20 -25
  3. {sigdetect-0.4.0 → sigdetect-0.5.1}/pyproject.toml +5 -4
  4. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/api.py +48 -12
  5. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/cli.py +70 -28
  6. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/config.py +17 -0
  7. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/cropping.py +78 -15
  8. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/detector/__init__.py +10 -8
  9. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/detector/pymupdf_engine.py +2 -2
  10. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/detector/signature_model.py +6 -0
  11. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/wet_detection.py +63 -13
  12. sigdetect-0.4.0/README.md → sigdetect-0.5.1/src/sigdetect.egg-info/PKG-INFO +38 -9
  13. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect.egg-info/requires.txt +4 -3
  14. {sigdetect-0.4.0 → sigdetect-0.5.1}/tests/test_api.py +36 -1
  15. {sigdetect-0.4.0 → sigdetect-0.5.1}/tests/test_cli.py +131 -2
  16. {sigdetect-0.4.0 → sigdetect-0.5.1}/tests/test_cropping.py +88 -1
  17. {sigdetect-0.4.0 → sigdetect-0.5.1}/tests/test_detector_options.py +4 -4
  18. sigdetect-0.5.1/tests/test_wet_detection.py +215 -0
  19. sigdetect-0.4.0/tests/test_wet_detection.py +0 -111
  20. {sigdetect-0.4.0 → sigdetect-0.5.1}/setup.cfg +0 -0
  21. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/__init__.py +0 -0
  22. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/data/role_rules.retainer.yml +0 -0
  23. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/data/role_rules.yml +0 -0
  24. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/data/vendor_patterns.yml +0 -0
  25. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/detector/base.py +0 -0
  26. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/detector/base_detector.py +0 -0
  27. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/detector/file_result_model.py +0 -0
  28. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/detector/pypdf2_engine.py +0 -0
  29. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/eda.py +0 -0
  30. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/logging_setup.py +0 -0
  31. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect/utils.py +0 -0
  32. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect.egg-info/SOURCES.txt +0 -0
  33. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect.egg-info/dependency_links.txt +0 -0
  34. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect.egg-info/entry_points.txt +0 -0
  35. {sigdetect-0.4.0 → sigdetect-0.5.1}/src/sigdetect.egg-info/top_level.txt +0 -0
  36. {sigdetect-0.4.0 → sigdetect-0.5.1}/tests/test_pymupdf_engine.py +0 -0
  37. {sigdetect-0.4.0 → sigdetect-0.5.1}/tests/test_widget_role_patient_smoke.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sigdetect
3
- Version: 0.4.0
3
+ Version: 0.5.1
4
4
  Summary: Signature detection and role attribution for PDFs
5
5
  Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
6
6
  License: MIT
@@ -10,9 +10,11 @@ Requires-Dist: pypdf>=4.0.0
10
10
  Requires-Dist: rich>=13.0
11
11
  Requires-Dist: typer>=0.12
12
12
  Requires-Dist: pydantic>=2.5
13
+ Requires-Dist: pillow>=10.0
14
+ Requires-Dist: python-docx>=1.1.0
15
+ Requires-Dist: pytesseract>=0.3.10
16
+ Requires-Dist: pymupdf>=1.23
13
17
  Requires-Dist: pyyaml>=6.0
14
- Provides-Extra: pymupdf
15
- Requires-Dist: pymupdf>=1.23; extra == "pymupdf"
16
18
 
17
19
  # CaseWorks.Automation.CaseDocumentIntake
18
20
 
@@ -95,14 +97,16 @@ sigdetect detect \
95
97
  ### Notes
96
98
 
97
99
  - The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
98
- - `--engine` accepts **auto** (default; prefers PyMuPDF when installed, falls back to PyPDF2), **pypdf2**, or **pymupdf**.
100
+ - Engine selection is forced to **auto** (prefers PyMuPDF for geometry, falls back to PyPDF2); any configured `engine` value is overridden.
99
101
  - `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
100
102
  - `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
101
103
  - `--profile` selects tuned role logic:
102
104
  - `hipaa` → patient / representative / attorney
103
105
  - `retainer` → client / firm (prefers detecting two signatures)
104
106
  - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
105
- - Cropping (`--crop-signatures`) and wet detection (`--detect-wet`) are enabled by default for single-pass runs; disable them if you want a light, e-sign-only pass. PyMuPDF is required for crops; PyMuPDF + Tesseract are required for wet detection.
107
+ - Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
108
+ - Cropping (`--crop-signatures`) writes PNG crops to disk by default; enable `--crop-docx` to write DOCX files instead of PNGs. `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` and, when `--crop-docx` is enabled, embeds DOCX bytes in `signatures[].crop_docx_bytes`. PyMuPDF is required for crops, and `python-docx` is required for DOCX output.
109
+ - Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
106
110
  - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
107
111
 
108
112
  ### EDA (quick aggregate stats)
@@ -113,6 +117,8 @@ sigdetect eda \
113
117
 
114
118
  ~~~
115
119
 
120
+ `sigdetect eda` expects `results.json`; enable `write_results: true` when running detect.
121
+
116
122
  ---
117
123
 
118
124
  ## Library usage
@@ -136,13 +142,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
136
142
  print(result.to_dict())
137
143
  ~~~
138
144
 
139
- `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
145
+ `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image; when DOCX cropping is enabled, `crop_docx_path` points at the generated doc. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
140
146
 
141
147
  ---
142
148
 
143
149
  ## Library API (embed in another script)
144
150
 
145
- Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping:
151
+ Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping. Engine selection is forced to `auto` (PyMuPDF preferred) to ensure geometry. Wet detection runs automatically for non-e-sign PDFs; pass `runWetDetection=False` to skip OCR.
146
152
 
147
153
  ~~~python
148
154
  from pathlib import Path
@@ -165,6 +171,7 @@ result = DetectPdf(
165
171
  profileName="retainer",
166
172
  includePseudoSignatures=True,
167
173
  recurseXObjects=True,
174
+ # runWetDetection=False, # disable OCR-backed wet detection if desired
168
175
  )
169
176
  print(
170
177
  result["file"],
@@ -187,7 +194,7 @@ for res in ScanDirectory(
187
194
  # store in DB, print, etc.
188
195
  pass
189
196
 
190
- # 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
197
+ # 3) Crop signature snippets for FileResult objects (requires PyMuPDF; DOCX needs python-docx)
191
198
  detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
192
199
  file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
193
200
  CropSignatureImages(
@@ -226,7 +233,8 @@ High-level summary (per file):
226
233
  "hint": "AcroSig:sig_patient",
227
234
  "render_type": "typed",
228
235
  "bounding_box": [10.0, 10.0, 150.0, 40.0],
229
- "crop_path": "signature_crops/example/sig_01_patient.png"
236
+ "crop_path": "signature_crops/example/sig_01_patient.png",
237
+ "crop_docx_path": null
230
238
  },
231
239
  {
232
240
  "page": null,
@@ -253,6 +261,9 @@ High-level summary (per file):
253
261
  - In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
254
262
  - **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
255
263
  - **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
264
+ - **`signatures[].crop_docx_path`** is populated when DOCX crops are generated (`--crop-docx` or `docx=True`).
265
+ - **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
266
+ - **`signatures[].crop_docx_bytes`** contains base64 DOCX data when `--crop-docx` and `--crop-bytes` are enabled together.
256
267
 
257
268
  ---
258
269
 
@@ -274,14 +285,16 @@ You can keep one config YAML per dataset, e.g.:
274
285
  # ./sample_data/config.yml (example)
275
286
  pdf_root: ./pdfs
276
287
  out_dir: ./sigdetect_out
277
- engine: pypdf2
288
+ engine: auto
289
+ write_results: false
278
290
  pseudo_signatures: true
279
291
  recurse_xobjects: true
280
292
  profile: retainer # or: hipaa
281
293
  crop_signatures: false # enable to write PNG crops (requires pymupdf)
294
+ crop_docx: false # enable to write DOCX crops instead of PNGs (requires python-docx)
282
295
  # crop_output_dir: ./signature_crops
283
296
  crop_image_dpi: 200
284
- detect_wet_signatures: false # opt-in OCR wet detection (PyMuPDF + Tesseract)
297
+ detect_wet_signatures: false # kept for compatibility; non-e-sign PDFs still trigger OCR
285
298
  wet_ocr_dpi: 200
286
299
  wet_ocr_languages: eng
287
300
  wet_precision_threshold: 0.82
@@ -299,7 +312,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
299
312
  - Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
300
313
  - Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
301
314
  - When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
302
- - **Wet detection (opt-in):** With `detect_wet_signatures: true`, the CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
315
+ - **Wet detection (non-e-sign):** The CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection whenever no e-sign evidence is found. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. When an image-based signature is present on a page, label-only OCR candidates are suppressed unless a stroke is detected. Results are deduped to the top signature per role (dropping `unknown`). Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
303
316
 
304
317
  ---
305
318
 
@@ -1,19 +1,3 @@
1
- Metadata-Version: 2.4
2
- Name: sigdetect
3
- Version: 0.4.0
4
- Summary: Signature detection and role attribution for PDFs
5
- Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
6
- License: MIT
7
- Requires-Python: >=3.9
8
- Description-Content-Type: text/markdown
9
- Requires-Dist: pypdf>=4.0.0
10
- Requires-Dist: rich>=13.0
11
- Requires-Dist: typer>=0.12
12
- Requires-Dist: pydantic>=2.5
13
- Requires-Dist: pyyaml>=6.0
14
- Provides-Extra: pymupdf
15
- Requires-Dist: pymupdf>=1.23; extra == "pymupdf"
16
-
17
1
  # CaseWorks.Automation.CaseDocumentIntake
18
2
 
19
3
  ## sigdetect
@@ -95,14 +79,16 @@ sigdetect detect \
95
79
  ### Notes
96
80
 
97
81
  - The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
98
- - `--engine` accepts **auto** (default; prefers PyMuPDF when installed, falls back to PyPDF2), **pypdf2**, or **pymupdf**.
82
+ - Engine selection is forced to **auto** (prefers PyMuPDF for geometry, falls back to PyPDF2); any configured `engine` value is overridden.
99
83
  - `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
100
84
  - `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
101
85
  - `--profile` selects tuned role logic:
102
86
  - `hipaa` → patient / representative / attorney
103
87
  - `retainer` → client / firm (prefers detecting two signatures)
104
88
  - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
105
- - Cropping (`--crop-signatures`) and wet detection (`--detect-wet`) are enabled by default for single-pass runs; disable them if you want a light, e-sign-only pass. PyMuPDF is required for crops; PyMuPDF + Tesseract are required for wet detection.
89
+ - Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
90
+ - Cropping (`--crop-signatures`) writes PNG crops to disk by default; enable `--crop-docx` to write DOCX files instead of PNGs. `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` and, when `--crop-docx` is enabled, embeds DOCX bytes in `signatures[].crop_docx_bytes`. PyMuPDF is required for crops, and `python-docx` is required for DOCX output.
91
+ - Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
106
92
  - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
107
93
 
108
94
  ### EDA (quick aggregate stats)
@@ -113,6 +99,8 @@ sigdetect eda \
113
99
 
114
100
  ~~~
115
101
 
102
+ `sigdetect eda` expects `results.json`; enable `write_results: true` when running detect.
103
+
116
104
  ---
117
105
 
118
106
  ## Library usage
@@ -136,13 +124,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
136
124
  print(result.to_dict())
137
125
  ~~~
138
126
 
139
- `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
127
+ `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image; when DOCX cropping is enabled, `crop_docx_path` points at the generated doc. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
140
128
 
141
129
  ---
142
130
 
143
131
  ## Library API (embed in another script)
144
132
 
145
- Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping:
133
+ Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping. Engine selection is forced to `auto` (PyMuPDF preferred) to ensure geometry. Wet detection runs automatically for non-e-sign PDFs; pass `runWetDetection=False` to skip OCR.
146
134
 
147
135
  ~~~python
148
136
  from pathlib import Path
@@ -165,6 +153,7 @@ result = DetectPdf(
165
153
  profileName="retainer",
166
154
  includePseudoSignatures=True,
167
155
  recurseXObjects=True,
156
+ # runWetDetection=False, # disable OCR-backed wet detection if desired
168
157
  )
169
158
  print(
170
159
  result["file"],
@@ -187,7 +176,7 @@ for res in ScanDirectory(
187
176
  # store in DB, print, etc.
188
177
  pass
189
178
 
190
- # 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
179
+ # 3) Crop signature snippets for FileResult objects (requires PyMuPDF; DOCX needs python-docx)
191
180
  detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
192
181
  file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
193
182
  CropSignatureImages(
@@ -226,7 +215,8 @@ High-level summary (per file):
226
215
  "hint": "AcroSig:sig_patient",
227
216
  "render_type": "typed",
228
217
  "bounding_box": [10.0, 10.0, 150.0, 40.0],
229
- "crop_path": "signature_crops/example/sig_01_patient.png"
218
+ "crop_path": "signature_crops/example/sig_01_patient.png",
219
+ "crop_docx_path": null
230
220
  },
231
221
  {
232
222
  "page": null,
@@ -253,6 +243,9 @@ High-level summary (per file):
253
243
  - In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
254
244
  - **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
255
245
  - **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
246
+ - **`signatures[].crop_docx_path`** is populated when DOCX crops are generated (`--crop-docx` or `docx=True`).
247
+ - **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
248
+ - **`signatures[].crop_docx_bytes`** contains base64 DOCX data when `--crop-docx` and `--crop-bytes` are enabled together.
256
249
 
257
250
  ---
258
251
 
@@ -274,14 +267,16 @@ You can keep one config YAML per dataset, e.g.:
274
267
  # ./sample_data/config.yml (example)
275
268
  pdf_root: ./pdfs
276
269
  out_dir: ./sigdetect_out
277
- engine: pypdf2
270
+ engine: auto
271
+ write_results: false
278
272
  pseudo_signatures: true
279
273
  recurse_xobjects: true
280
274
  profile: retainer # or: hipaa
281
275
  crop_signatures: false # enable to write PNG crops (requires pymupdf)
276
+ crop_docx: false # enable to write DOCX crops instead of PNGs (requires python-docx)
282
277
  # crop_output_dir: ./signature_crops
283
278
  crop_image_dpi: 200
284
- detect_wet_signatures: false # opt-in OCR wet detection (PyMuPDF + Tesseract)
279
+ detect_wet_signatures: false # kept for compatibility; non-e-sign PDFs still trigger OCR
285
280
  wet_ocr_dpi: 200
286
281
  wet_ocr_languages: eng
287
282
  wet_precision_threshold: 0.82
@@ -299,7 +294,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
299
294
  - Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
300
295
  - Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
301
296
  - When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
302
- - **Wet detection (opt-in):** With `detect_wet_signatures: true`, the CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
297
+ - **Wet detection (non-e-sign):** The CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection whenever no e-sign evidence is found. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. When an image-based signature is present on a page, label-only OCR candidates are suppressed unless a stroke is detected. Results are deduped to the top signature per role (dropping `unknown`). Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
303
298
 
304
299
  ---
305
300
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sigdetect"
7
- version = "0.4.0"
7
+ version = "0.5.1"
8
8
  description = "Signature detection and role attribution for PDFs"
9
9
  readme = "README.md"
10
10
  authors = [{ name = "BT Asmamaw", email = "basmamaw@angeiongroup.com" }]
@@ -15,12 +15,13 @@ dependencies = [
15
15
  "rich>=13.0",
16
16
  "typer>=0.12",
17
17
  "pydantic>=2.5",
18
+ "pillow>=10.0",
19
+ "python-docx>=1.1.0",
20
+ "pytesseract>=0.3.10",
21
+ "pymupdf>=1.23",
18
22
  "pyyaml>=6.0",
19
23
  ]
20
24
 
21
- [project.optional-dependencies]
22
- pymupdf = ["pymupdf>=1.23"]
23
-
24
25
  [project.scripts]
25
26
  sigdetect = "sigdetect.cli:app"
26
27
 
@@ -9,6 +9,7 @@ from typing import Any, Generator, Iterable, Iterator, Literal, overload
9
9
  from sigdetect.config import DetectConfiguration
10
10
  from sigdetect.cropping import SignatureCrop
11
11
  from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
12
+ from sigdetect.wet_detection import apply_wet_detection
12
13
 
13
14
  EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
14
15
  ProfileName = Literal["hipaa", "retainer"]
@@ -21,9 +22,13 @@ def DetectPdf(
21
22
  engineName: EngineName = "auto",
22
23
  includePseudoSignatures: bool = True,
23
24
  recurseXObjects: bool = True,
25
+ runWetDetection: bool = True,
24
26
  detector: Detector | None = None,
25
27
  ) -> dict[str, Any]:
26
- """Detect signature evidence and assign roles for a single PDF."""
28
+ """Detect signature evidence and assign roles for a single PDF.
29
+
30
+ Wet detection runs by default for non-e-sign PDFs; pass ``runWetDetection=False`` to skip OCR.
31
+ """
27
32
 
28
33
  resolvedPath = Path(pdfPath)
29
34
  activeDetector = detector or get_detector(
@@ -36,6 +41,10 @@ def DetectPdf(
36
41
  )
37
42
 
38
43
  result = activeDetector.Detect(resolvedPath)
44
+ if runWetDetection:
45
+ configuration = _ResolveConfiguration(activeDetector)
46
+ if configuration is not None:
47
+ apply_wet_detection(resolvedPath, configuration, result)
39
48
  return _ToPlainDictionary(result)
40
49
 
41
50
 
@@ -48,7 +57,10 @@ def get_detector(
48
57
  recurseXObjects: bool = True,
49
58
  outputDirectory: str | Path | None = None,
50
59
  ) -> Detector:
51
- """Return a reusable detector instance configured with the supplied options."""
60
+ """Return a reusable detector instance configured with the supplied options.
61
+
62
+ Engine selection is forced to ``auto`` (prefers PyMuPDF when available).
63
+ """
52
64
 
53
65
  configuration = DetectConfiguration(
54
66
  PdfRoot=Path(pdfRoot) if pdfRoot is not None else Path.cwd(),
@@ -108,6 +120,7 @@ def _ToPlainValue(value: Any) -> Any:
108
120
  def DetectMany(
109
121
  pdfPaths: Iterable[str | Path],
110
122
  *,
123
+ runWetDetection: bool = True,
111
124
  detector: Detector | None = None,
112
125
  **kwargs: Any,
113
126
  ) -> Iterator[dict[str, Any]]:
@@ -115,17 +128,18 @@ def DetectMany(
115
128
 
116
129
  if detector is not None:
117
130
  for pdfPath in pdfPaths:
118
- yield _DetectWithDetector(detector, pdfPath)
131
+ yield _DetectWithDetector(detector, pdfPath, runWetDetection=runWetDetection)
119
132
  return
120
133
 
121
134
  for pdfPath in pdfPaths:
122
- yield DetectPdf(pdfPath, **kwargs)
135
+ yield DetectPdf(pdfPath, runWetDetection=runWetDetection, **kwargs)
123
136
 
124
137
 
125
138
  def ScanDirectory(
126
139
  pdfRoot: str | Path,
127
140
  *,
128
141
  globPattern: str = "**/*.pdf",
142
+ runWetDetection: bool = True,
129
143
  detector: Detector | None = None,
130
144
  **kwargs: Any,
131
145
  ) -> Iterator[dict[str, Any]]:
@@ -143,7 +157,7 @@ def ScanDirectory(
143
157
 
144
158
  for pdfPath in iterator:
145
159
  if pdfPath.is_file() and pdfPath.suffix.lower() == ".pdf":
146
- yield DetectPdf(pdfPath, detector=detector, **kwargs)
160
+ yield DetectPdf(pdfPath, detector=detector, runWetDetection=runWetDetection, **kwargs)
147
161
 
148
162
 
149
163
  def ToCsvRow(result: dict[str, Any]) -> dict[str, Any]:
@@ -174,11 +188,25 @@ def Version() -> str:
174
188
  return "0.0.0-dev"
175
189
 
176
190
 
177
- def _DetectWithDetector(detector: Detector, pdfPath: str | Path) -> dict[str, Any]:
191
+ def _DetectWithDetector(
192
+ detector: Detector, pdfPath: str | Path, *, runWetDetection: bool
193
+ ) -> dict[str, Any]:
178
194
  """Helper that runs ``detector`` and returns the plain dictionary result."""
179
195
 
180
196
  resolvedPath = Path(pdfPath)
181
- return _ToPlainDictionary(detector.Detect(resolvedPath))
197
+ result = detector.Detect(resolvedPath)
198
+ if runWetDetection:
199
+ configuration = _ResolveConfiguration(detector)
200
+ if configuration is not None:
201
+ apply_wet_detection(resolvedPath, configuration, result)
202
+ return _ToPlainDictionary(result)
203
+
204
+
205
+ def _ResolveConfiguration(detector: Detector) -> DetectConfiguration | None:
206
+ configuration = getattr(detector, "Configuration", None)
207
+ if isinstance(configuration, DetectConfiguration):
208
+ return configuration
209
+ return None
182
210
 
183
211
 
184
212
  @contextmanager
@@ -201,8 +229,8 @@ def CropSignatureImages(
201
229
  dpi: int = 200,
202
230
  returnBytes: Literal[False] = False,
203
231
  saveToDisk: bool = True,
204
- ) -> list[Path]:
205
- ...
232
+ docx: bool = False,
233
+ ) -> list[Path]: ...
206
234
 
207
235
 
208
236
  @overload
@@ -214,8 +242,8 @@ def CropSignatureImages(
214
242
  dpi: int,
215
243
  returnBytes: Literal[True],
216
244
  saveToDisk: bool,
217
- ) -> list[SignatureCrop]:
218
- ...
245
+ docx: bool = False,
246
+ ) -> list[SignatureCrop]: ...
219
247
 
220
248
 
221
249
  def CropSignatureImages(
@@ -226,13 +254,17 @@ def CropSignatureImages(
226
254
  dpi: int = 200,
227
255
  returnBytes: bool = False,
228
256
  saveToDisk: bool = True,
257
+ docx: bool = False,
229
258
  ) -> list[Path] | list[SignatureCrop]:
230
- """Crop detected signature regions to PNG files.
259
+ """Create PNG files containing cropped signature images (or DOCX when enabled).
231
260
 
232
261
  Accepts either a :class:`FileResult` instance or the ``dict`` returned by
233
262
  :func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.
234
263
  Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop. Set
235
264
  ``saveToDisk=False`` to skip writing PNG files while still returning in-memory data.
265
+ When ``docx`` is True, DOCX files are written instead of PNG files. When ``returnBytes`` is
266
+ True and ``docx`` is enabled, the returned :class:`SignatureCrop` objects include
267
+ ``docx_bytes``.
236
268
  """
237
269
 
238
270
  from sigdetect.cropping import crop_signatures
@@ -245,6 +277,7 @@ def CropSignatureImages(
245
277
  dpi=dpi,
246
278
  return_bytes=returnBytes,
247
279
  save_files=saveToDisk,
280
+ docx=docx,
248
281
  )
249
282
  if original_dict is not None:
250
283
  original_dict.clear()
@@ -275,6 +308,9 @@ def _CoerceFileResult(
275
308
  RenderType=str(entry.get("render_type") or "unknown"),
276
309
  BoundingBox=tuple(bbox) if bbox else None,
277
310
  CropPath=entry.get("crop_path"),
311
+ CropBytes=entry.get("crop_bytes"),
312
+ CropDocxPath=entry.get("crop_docx_path"),
313
+ CropDocxBytes=entry.get("crop_docx_bytes"),
278
314
  )
279
315
  )
280
316
 
@@ -2,6 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import base64
5
6
  import json
6
7
  from collections.abc import Iterator
7
8
  from dataclasses import asdict, is_dataclass
@@ -48,6 +49,12 @@ def Detect(
48
49
  configurationPath: Path | None = typer.Option(
49
50
  None, "--config", "-c", help="Path to YAML config"
50
51
  ),
52
+ writeResults: bool | None = typer.Option(
53
+ None,
54
+ "--write-results/--no-write-results",
55
+ help="Write results.json (or JSON to stdout when out_dir is none)",
56
+ show_default=False,
57
+ ),
51
58
  profileOverride: str | None = typer.Option(None, "--profile", "-p", help="hipaa or retainer"),
52
59
  recursive: bool = typer.Option(
53
60
  True,
@@ -57,13 +64,19 @@ def Detect(
57
64
  cropSignatures: bool | None = typer.Option(
58
65
  None,
59
66
  "--crop-signatures/--no-crop-signatures",
60
- help="Crop detected signature regions to PNG files (requires PyMuPDF)",
67
+ help="Write PNG crops for signature widgets (requires PyMuPDF)",
68
+ show_default=False,
69
+ ),
70
+ cropDocx: bool | None = typer.Option(
71
+ None,
72
+ "--crop-docx/--no-crop-docx",
73
+ help="Write DOCX crops instead of PNG files (requires PyMuPDF + python-docx)",
61
74
  show_default=False,
62
75
  ),
63
76
  cropDirectory: Path | None = typer.Option(
64
77
  None,
65
78
  "--crop-dir",
66
- help="Directory for signature PNG crops (defaults to out_dir/signature_crops)",
79
+ help="Directory for signature crops (defaults to out_dir/signature_crops)",
67
80
  ),
68
81
  cropDpi: int | None = typer.Option(
69
82
  None,
@@ -73,10 +86,16 @@ def Detect(
73
86
  help="Rendering DPI for signature crops",
74
87
  show_default=False,
75
88
  ),
89
+ cropBytes: bool = typer.Option(
90
+ False,
91
+ "--crop-bytes/--no-crop-bytes",
92
+ help="Embed base64 PNG bytes (and DOCX bytes when --crop-docx) in results JSON",
93
+ show_default=False,
94
+ ),
76
95
  detectWetSignatures: bool | None = typer.Option(
77
96
  None,
78
97
  "--detect-wet/--no-detect-wet",
79
- help="Run OCR-backed wet signature detection (requires PyMuPDF + Tesseract)",
98
+ help="Compatibility flag; non-e-sign PDFs always run OCR when deps are available",
80
99
  show_default=False,
81
100
  ),
82
101
  wetOcrDpi: int | None = typer.Option(
@@ -111,8 +130,12 @@ def Detect(
111
130
  configuration = configuration.model_copy(update={"Profile": normalized_profile})
112
131
 
113
132
  overrides: dict[str, object] = {}
133
+ if writeResults is not None:
134
+ overrides["WriteResults"] = writeResults
114
135
  if cropSignatures is not None:
115
136
  overrides["CropSignatures"] = cropSignatures
137
+ if cropDocx is not None:
138
+ overrides["CropDocx"] = cropDocx
116
139
  if cropDirectory is not None:
117
140
  overrides["CropOutputDirectory"] = cropDirectory
118
141
  if cropDpi is not None:
@@ -145,53 +168,66 @@ def Detect(
145
168
  except StopIteration:
146
169
  raise SystemExit(f"No PDFs found in {configuration.PdfRoot}") from None
147
170
 
148
- results_buffer: list[FileResult] | None = [] if configuration.OutputDirectory is None else None
171
+ write_results = configuration.WriteResults
172
+ results_buffer: list[FileResult] | None = (
173
+ [] if write_results and configuration.OutputDirectory is None else None
174
+ )
149
175
  json_handle = None
150
176
  json_path: Path | None = None
151
177
  wrote_first = False
152
178
 
153
- if configuration.OutputDirectory is not None:
179
+ if write_results and configuration.OutputDirectory is not None:
154
180
  outputDirectory = configuration.OutputDirectory
155
181
  outputDirectory.mkdir(parents=True, exist_ok=True)
156
182
  json_path = outputDirectory / "results.json"
157
183
  json_handle = open(json_path, "w", encoding="utf-8")
158
184
  json_handle.write("[")
159
185
 
186
+ crop_bytes_enabled = bool(cropBytes)
160
187
  crop_dir = configuration.CropOutputDirectory
188
+ if crop_dir is None:
189
+ base_dir = configuration.OutputDirectory or configuration.PdfRoot
190
+ crop_dir = base_dir / "signature_crops"
161
191
  cropping_enabled = configuration.CropSignatures
192
+ docx_enabled = configuration.CropDocx
162
193
  cropping_available = True
163
194
  cropping_attempted = False
164
- if configuration.CropSignatures and crop_dir is None:
165
- Logger.warning(
166
- "CropSignatures enabled without an output directory",
167
- extra={"pdf_root": str(configuration.PdfRoot)},
168
- )
169
- cropping_enabled = False
170
195
 
171
196
  total_bboxes = 0
172
197
 
173
198
  def _append_result(file_result: FileResult, source_pdf: Path) -> None:
174
199
  nonlocal wrote_first, json_handle, total_bboxes, cropping_available, cropping_attempted
175
200
 
176
- if cropping_enabled and cropping_available and crop_dir is not None:
201
+ if cropping_available and (cropping_enabled or crop_bytes_enabled) and crop_dir is not None:
177
202
  try:
178
- crop_signatures(
203
+ crops = crop_signatures(
179
204
  pdf_path=source_pdf,
180
205
  file_result=file_result,
181
206
  output_dir=crop_dir,
182
207
  dpi=configuration.CropImageDpi,
183
208
  logger=Logger,
209
+ return_bytes=crop_bytes_enabled,
210
+ save_files=cropping_enabled,
211
+ docx=docx_enabled,
184
212
  )
185
213
  cropping_attempted = True
214
+ if crop_bytes_enabled:
215
+ for crop in crops:
216
+ crop.signature.CropBytes = base64.b64encode(crop.image_bytes).decode(
217
+ "ascii"
218
+ )
219
+ if crop.docx_bytes:
220
+ crop.signature.CropDocxBytes = base64.b64encode(
221
+ crop.docx_bytes
222
+ ).decode("ascii")
186
223
  except SignatureCroppingUnavailable as exc:
187
224
  cropping_available = False
188
225
  Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
189
226
  typer.echo(str(exc), err=True)
190
227
  except Exception as exc: # pragma: no cover - defensive
191
- Logger.warning(
192
- "Unexpected error while cropping signatures",
193
- extra={"error": str(exc)},
194
- )
228
+ cropping_available = False
229
+ Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
230
+ typer.echo(str(exc), err=True)
195
231
 
196
232
  total_bboxes += sum(1 for sig in file_result.Signatures if sig.BoundingBox)
197
233
 
@@ -231,18 +267,24 @@ def Detect(
231
267
  json_handle.write(closing)
232
268
  json_handle.close()
233
269
 
234
- if json_handle is not None:
235
- typer.echo(f"Wrote {json_path}")
236
- else:
237
- payload = json.dumps(
238
- results_buffer or [], indent=2, ensure_ascii=False, default=_JsonSerializer
239
- )
240
- typer.echo(payload)
241
- typer.echo("Detection completed with output disabled (out_dir=none)")
242
-
243
- if cropping_enabled and cropping_available and cropping_attempted and total_bboxes == 0:
270
+ if write_results:
271
+ if json_handle is not None:
272
+ typer.echo(f"Wrote {json_path}")
273
+ else:
274
+ payload = json.dumps(
275
+ results_buffer or [], indent=2, ensure_ascii=False, default=_JsonSerializer
276
+ )
277
+ typer.echo(payload)
278
+ typer.echo("Detection completed with output disabled (out_dir=none)")
279
+
280
+ if (
281
+ (cropping_enabled or crop_bytes_enabled)
282
+ and cropping_available
283
+ and cropping_attempted
284
+ and total_bboxes == 0
285
+ ):
244
286
  Logger.warning(
245
- "No signature bounding boxes detected; try --engine pymupdf for crop-ready output",
287
+ "No signature bounding boxes detected; install PyMuPDF for crop-ready output",
246
288
  extra={"engine": configuration.Engine},
247
289
  )
248
290