sigdetect 0.4.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. {sigdetect-0.4.0 → sigdetect-0.5.0}/PKG-INFO +23 -14
  2. {sigdetect-0.4.0 → sigdetect-0.5.0}/README.md +18 -11
  3. {sigdetect-0.4.0 → sigdetect-0.5.0}/pyproject.toml +5 -4
  4. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/api.py +42 -12
  5. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/cli.py +53 -24
  6. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/config.py +5 -0
  7. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/cropping.py +71 -15
  8. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/detector/__init__.py +10 -8
  9. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/detector/pymupdf_engine.py +2 -2
  10. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/detector/signature_model.py +2 -0
  11. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/wet_detection.py +63 -13
  12. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect.egg-info/PKG-INFO +23 -14
  13. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect.egg-info/requires.txt +4 -3
  14. {sigdetect-0.4.0 → sigdetect-0.5.0}/tests/test_api.py +36 -1
  15. {sigdetect-0.4.0 → sigdetect-0.5.0}/tests/test_cli.py +129 -2
  16. {sigdetect-0.4.0 → sigdetect-0.5.0}/tests/test_cropping.py +12 -1
  17. {sigdetect-0.4.0 → sigdetect-0.5.0}/tests/test_detector_options.py +4 -4
  18. sigdetect-0.5.0/tests/test_wet_detection.py +215 -0
  19. sigdetect-0.4.0/tests/test_wet_detection.py +0 -111
  20. {sigdetect-0.4.0 → sigdetect-0.5.0}/setup.cfg +0 -0
  21. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/__init__.py +0 -0
  22. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/data/role_rules.retainer.yml +0 -0
  23. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/data/role_rules.yml +0 -0
  24. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/data/vendor_patterns.yml +0 -0
  25. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/detector/base.py +0 -0
  26. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/detector/base_detector.py +0 -0
  27. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/detector/file_result_model.py +0 -0
  28. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/detector/pypdf2_engine.py +0 -0
  29. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/eda.py +0 -0
  30. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/logging_setup.py +0 -0
  31. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect/utils.py +0 -0
  32. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect.egg-info/SOURCES.txt +0 -0
  33. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect.egg-info/dependency_links.txt +0 -0
  34. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect.egg-info/entry_points.txt +0 -0
  35. {sigdetect-0.4.0 → sigdetect-0.5.0}/src/sigdetect.egg-info/top_level.txt +0 -0
  36. {sigdetect-0.4.0 → sigdetect-0.5.0}/tests/test_pymupdf_engine.py +0 -0
  37. {sigdetect-0.4.0 → sigdetect-0.5.0}/tests/test_widget_role_patient_smoke.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sigdetect
3
- Version: 0.4.0
3
+ Version: 0.5.0
4
4
  Summary: Signature detection and role attribution for PDFs
5
5
  Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
6
6
  License: MIT
@@ -10,9 +10,11 @@ Requires-Dist: pypdf>=4.0.0
10
10
  Requires-Dist: rich>=13.0
11
11
  Requires-Dist: typer>=0.12
12
12
  Requires-Dist: pydantic>=2.5
13
+ Requires-Dist: pillow>=10.0
14
+ Requires-Dist: python-docx>=1.1.0
15
+ Requires-Dist: pytesseract>=0.3.10
16
+ Requires-Dist: pymupdf>=1.23
13
17
  Requires-Dist: pyyaml>=6.0
14
- Provides-Extra: pymupdf
15
- Requires-Dist: pymupdf>=1.23; extra == "pymupdf"
16
18
 
17
19
  # CaseWorks.Automation.CaseDocumentIntake
18
20
 
@@ -95,14 +97,16 @@ sigdetect detect \
95
97
  ### Notes
96
98
 
97
99
  - The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
98
- - `--engine` accepts **auto** (default; prefers PyMuPDF when installed, falls back to PyPDF2), **pypdf2**, or **pymupdf**.
100
+ - Engine selection is forced to **auto** (prefers PyMuPDF for geometry, falls back to PyPDF2); any configured `engine` value is overridden.
99
101
  - `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
100
102
  - `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
101
103
  - `--profile` selects tuned role logic:
102
104
  - `hipaa` → patient / representative / attorney
103
105
  - `retainer` → client / firm (prefers detecting two signatures)
104
106
  - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
105
- - Cropping (`--crop-signatures`) and wet detection (`--detect-wet`) are enabled by default for single-pass runs; disable them if you want a light, e-sign-only pass. PyMuPDF is required for crops; PyMuPDF + Tesseract are required for wet detection.
107
+ - Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
108
+ - Cropping (`--crop-signatures`) writes a one-image `.docx` per signature in the crop output directory (no PNG files on disk); `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` for in-memory use. PyMuPDF is required for crops, and `python-docx` is required for `.docx` output.
109
+ - Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
106
110
  - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
107
111
 
108
112
  ### EDA (quick aggregate stats)
@@ -113,6 +117,8 @@ sigdetect eda \
113
117
 
114
118
  ~~~
115
119
 
120
+ `sigdetect eda` expects `results.json`; set `write_results: true` (or pass `--write-results`) when running `sigdetect detect`.
121
+
116
122
  ---
117
123
 
118
124
  ## Library usage
@@ -136,13 +142,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
136
142
  print(result.to_dict())
137
143
  ~~~
138
144
 
139
- `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
145
+ `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When cropping is enabled, `crop_path` points at the generated `.docx`. Engine selection is forced to `auto`, which prefers PyMuPDF (for geometry) and falls back to PyPDF2.
140
146
 
141
147
  ---
142
148
 
143
149
  ## Library API (embed in another script)
144
150
 
145
- Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping:
151
+ Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping. Engine selection is forced to `auto` (PyMuPDF preferred) to ensure geometry. Wet detection runs automatically for non-e-sign PDFs; pass `runWetDetection=False` to skip OCR.
146
152
 
147
153
  ~~~python
148
154
  from pathlib import Path
@@ -165,6 +171,7 @@ result = DetectPdf(
165
171
  profileName="retainer",
166
172
  includePseudoSignatures=True,
167
173
  recurseXObjects=True,
174
+ # runWetDetection=False, # disable OCR-backed wet detection if desired
168
175
  )
169
176
  print(
170
177
  result["file"],
@@ -187,7 +194,7 @@ for res in ScanDirectory(
187
194
  # store in DB, print, etc.
188
195
  pass
189
196
 
190
- # 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
197
+ # 3) Create DOCX crops for FileResult objects (requires PyMuPDF + python-docx)
191
198
  detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
192
199
  file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
193
200
  CropSignatureImages(
@@ -226,7 +233,7 @@ High-level summary (per file):
226
233
  "hint": "AcroSig:sig_patient",
227
234
  "render_type": "typed",
228
235
  "bounding_box": [10.0, 10.0, 150.0, 40.0],
229
- "crop_path": "signature_crops/example/sig_01_patient.png"
236
+ "crop_path": "signature_crops/example/sig_01_patient.docx"
230
237
  },
231
238
  {
232
239
  "page": null,
@@ -252,7 +259,8 @@ High-level summary (per file):
252
259
  - **`roles`** summarizes unique non-`unknown` roles across signatures.
253
260
  - In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
254
261
  - **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
255
- - **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
262
+ - **`signatures[].crop_path`** is populated when DOCX crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
263
+ - **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
256
264
 
257
265
  ---
258
266
 
@@ -274,14 +282,15 @@ You can keep one config YAML per dataset, e.g.:
274
282
  # ./sample_data/config.yml (example)
275
283
  pdf_root: ./pdfs
276
284
  out_dir: ./sigdetect_out
277
- engine: pypdf2
285
+ engine: auto
286
+ write_results: false
278
287
  pseudo_signatures: true
279
288
  recurse_xobjects: true
280
289
  profile: retainer # or: hipaa
281
- crop_signatures: false # enable to write PNG crops (requires pymupdf)
290
+ crop_signatures: false # enable to write DOCX crops (requires pymupdf + python-docx)
282
291
  # crop_output_dir: ./signature_crops
283
292
  crop_image_dpi: 200
284
- detect_wet_signatures: false # opt-in OCR wet detection (PyMuPDF + Tesseract)
293
+ detect_wet_signatures: false # kept for compatibility; non-e-sign PDFs still trigger OCR
285
294
  wet_ocr_dpi: 200
286
295
  wet_ocr_languages: eng
287
296
  wet_precision_threshold: 0.82
@@ -299,7 +308,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
299
308
  - Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
300
309
  - Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
301
310
  - When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
302
- - **Wet detection (opt-in):** With `detect_wet_signatures: true`, the CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
311
+ - **Wet detection (non-e-sign):** The CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection whenever no e-sign evidence is found. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. When an image-based signature is present on a page, label-only OCR candidates are suppressed unless a stroke is detected. Results are deduped to the top signature per role (dropping `unknown`). Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
303
312
 
304
313
  ---
305
314
 
@@ -79,14 +79,16 @@ sigdetect detect \
79
79
  ### Notes
80
80
 
81
81
  - The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
82
- - `--engine` accepts **auto** (default; prefers PyMuPDF when installed, falls back to PyPDF2), **pypdf2**, or **pymupdf**.
82
+ - Engine selection is forced to **auto** (prefers PyMuPDF for geometry, falls back to PyPDF2); any configured `engine` value is overridden.
83
83
  - `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
84
84
  - `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
85
85
  - `--profile` selects tuned role logic:
86
86
  - `hipaa` → patient / representative / attorney
87
87
  - `retainer` → client / firm (prefers detecting two signatures)
88
88
  - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
89
- - Cropping (`--crop-signatures`) and wet detection (`--detect-wet`) are enabled by default for single-pass runs; disable them if you want a light, e-sign-only pass. PyMuPDF is required for crops; PyMuPDF + Tesseract are required for wet detection.
89
+ - Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
90
+ - Cropping (`--crop-signatures`) writes a one-image `.docx` per signature in the crop output directory (no PNG files on disk); `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` for in-memory use. PyMuPDF is required for crops, and `python-docx` is required for `.docx` output.
91
+ - Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
90
92
  - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
91
93
 
92
94
  ### EDA (quick aggregate stats)
@@ -97,6 +99,8 @@ sigdetect eda \
97
99
 
98
100
  ~~~
99
101
 
102
+ `sigdetect eda` expects `results.json`; set `write_results: true` (or pass `--write-results`) when running `sigdetect detect`.
103
+
100
104
  ---
101
105
 
102
106
  ## Library usage
@@ -120,13 +124,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
120
124
  print(result.to_dict())
121
125
  ~~~
122
126
 
123
- `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
127
+ `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When cropping is enabled, `crop_path` points at the generated `.docx`. Engine selection is forced to `auto`, which prefers PyMuPDF (for geometry) and falls back to PyPDF2.
124
128
 
125
129
  ---
126
130
 
127
131
  ## Library API (embed in another script)
128
132
 
129
- Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping:
133
+ Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping. Engine selection is forced to `auto` (PyMuPDF preferred) to ensure geometry. Wet detection runs automatically for non-e-sign PDFs; pass `runWetDetection=False` to skip OCR.
130
134
 
131
135
  ~~~python
132
136
  from pathlib import Path
@@ -149,6 +153,7 @@ result = DetectPdf(
149
153
  profileName="retainer",
150
154
  includePseudoSignatures=True,
151
155
  recurseXObjects=True,
156
+ # runWetDetection=False, # disable OCR-backed wet detection if desired
152
157
  )
153
158
  print(
154
159
  result["file"],
@@ -171,7 +176,7 @@ for res in ScanDirectory(
171
176
  # store in DB, print, etc.
172
177
  pass
173
178
 
174
- # 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
179
+ # 3) Create DOCX crops for FileResult objects (requires PyMuPDF + python-docx)
175
180
  detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
176
181
  file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
177
182
  CropSignatureImages(
@@ -210,7 +215,7 @@ High-level summary (per file):
210
215
  "hint": "AcroSig:sig_patient",
211
216
  "render_type": "typed",
212
217
  "bounding_box": [10.0, 10.0, 150.0, 40.0],
213
- "crop_path": "signature_crops/example/sig_01_patient.png"
218
+ "crop_path": "signature_crops/example/sig_01_patient.docx"
214
219
  },
215
220
  {
216
221
  "page": null,
@@ -236,7 +241,8 @@ High-level summary (per file):
236
241
  - **`roles`** summarizes unique non-`unknown` roles across signatures.
237
242
  - In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
238
243
  - **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
239
- - **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
244
+ - **`signatures[].crop_path`** is populated when DOCX crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
245
+ - **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
240
246
 
241
247
  ---
242
248
 
@@ -258,14 +264,15 @@ You can keep one config YAML per dataset, e.g.:
258
264
  # ./sample_data/config.yml (example)
259
265
  pdf_root: ./pdfs
260
266
  out_dir: ./sigdetect_out
261
- engine: pypdf2
267
+ engine: auto
268
+ write_results: false
262
269
  pseudo_signatures: true
263
270
  recurse_xobjects: true
264
271
  profile: retainer # or: hipaa
265
- crop_signatures: false # enable to write PNG crops (requires pymupdf)
272
+ crop_signatures: false # enable to write DOCX crops (requires pymupdf + python-docx)
266
273
  # crop_output_dir: ./signature_crops
267
274
  crop_image_dpi: 200
268
- detect_wet_signatures: false # opt-in OCR wet detection (PyMuPDF + Tesseract)
275
+ detect_wet_signatures: false # kept for compatibility; non-e-sign PDFs still trigger OCR
269
276
  wet_ocr_dpi: 200
270
277
  wet_ocr_languages: eng
271
278
  wet_precision_threshold: 0.82
@@ -283,7 +290,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
283
290
  - Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
284
291
  - Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
285
292
  - When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
286
- - **Wet detection (opt-in):** With `detect_wet_signatures: true`, the CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
293
+ - **Wet detection (non-e-sign):** The CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection whenever no e-sign evidence is found. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. When an image-based signature is present on a page, label-only OCR candidates are suppressed unless a stroke is detected. Results are deduped to the top signature per role (dropping `unknown`). Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
287
294
 
288
295
  ---
289
296
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sigdetect"
7
- version = "0.4.0"
7
+ version = "0.5.0"
8
8
  description = "Signature detection and role attribution for PDFs"
9
9
  readme = "README.md"
10
10
  authors = [{ name = "BT Asmamaw", email = "basmamaw@angeiongroup.com" }]
@@ -15,12 +15,13 @@ dependencies = [
15
15
  "rich>=13.0",
16
16
  "typer>=0.12",
17
17
  "pydantic>=2.5",
18
+ "pillow>=10.0",
19
+ "python-docx>=1.1.0",
20
+ "pytesseract>=0.3.10",
21
+ "pymupdf>=1.23",
18
22
  "pyyaml>=6.0",
19
23
  ]
20
24
 
21
- [project.optional-dependencies]
22
- pymupdf = ["pymupdf>=1.23"]
23
-
24
25
  [project.scripts]
25
26
  sigdetect = "sigdetect.cli:app"
26
27
 
@@ -9,6 +9,7 @@ from typing import Any, Generator, Iterable, Iterator, Literal, overload
9
9
  from sigdetect.config import DetectConfiguration
10
10
  from sigdetect.cropping import SignatureCrop
11
11
  from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
12
+ from sigdetect.wet_detection import apply_wet_detection
12
13
 
13
14
  EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
14
15
  ProfileName = Literal["hipaa", "retainer"]
@@ -21,9 +22,13 @@ def DetectPdf(
21
22
  engineName: EngineName = "auto",
22
23
  includePseudoSignatures: bool = True,
23
24
  recurseXObjects: bool = True,
25
+ runWetDetection: bool = True,
24
26
  detector: Detector | None = None,
25
27
  ) -> dict[str, Any]:
26
- """Detect signature evidence and assign roles for a single PDF."""
28
+ """Detect signature evidence and assign roles for a single PDF.
29
+
30
+ Wet detection runs by default for non-e-sign PDFs; pass ``runWetDetection=False`` to skip OCR.
31
+ """
27
32
 
28
33
  resolvedPath = Path(pdfPath)
29
34
  activeDetector = detector or get_detector(
@@ -36,6 +41,10 @@ def DetectPdf(
36
41
  )
37
42
 
38
43
  result = activeDetector.Detect(resolvedPath)
44
+ if runWetDetection:
45
+ configuration = _ResolveConfiguration(activeDetector)
46
+ if configuration is not None:
47
+ apply_wet_detection(resolvedPath, configuration, result)
39
48
  return _ToPlainDictionary(result)
40
49
 
41
50
 
@@ -48,7 +57,10 @@ def get_detector(
48
57
  recurseXObjects: bool = True,
49
58
  outputDirectory: str | Path | None = None,
50
59
  ) -> Detector:
51
- """Return a reusable detector instance configured with the supplied options."""
60
+ """Return a reusable detector instance configured with the supplied options.
61
+
62
+ Engine selection is forced to ``auto`` (prefers PyMuPDF when available).
63
+ """
52
64
 
53
65
  configuration = DetectConfiguration(
54
66
  PdfRoot=Path(pdfRoot) if pdfRoot is not None else Path.cwd(),
@@ -108,6 +120,7 @@ def _ToPlainValue(value: Any) -> Any:
108
120
  def DetectMany(
109
121
  pdfPaths: Iterable[str | Path],
110
122
  *,
123
+ runWetDetection: bool = True,
111
124
  detector: Detector | None = None,
112
125
  **kwargs: Any,
113
126
  ) -> Iterator[dict[str, Any]]:
@@ -115,17 +128,18 @@ def DetectMany(
115
128
 
116
129
  if detector is not None:
117
130
  for pdfPath in pdfPaths:
118
- yield _DetectWithDetector(detector, pdfPath)
131
+ yield _DetectWithDetector(detector, pdfPath, runWetDetection=runWetDetection)
119
132
  return
120
133
 
121
134
  for pdfPath in pdfPaths:
122
- yield DetectPdf(pdfPath, **kwargs)
135
+ yield DetectPdf(pdfPath, runWetDetection=runWetDetection, **kwargs)
123
136
 
124
137
 
125
138
  def ScanDirectory(
126
139
  pdfRoot: str | Path,
127
140
  *,
128
141
  globPattern: str = "**/*.pdf",
142
+ runWetDetection: bool = True,
129
143
  detector: Detector | None = None,
130
144
  **kwargs: Any,
131
145
  ) -> Iterator[dict[str, Any]]:
@@ -143,7 +157,7 @@ def ScanDirectory(
143
157
 
144
158
  for pdfPath in iterator:
145
159
  if pdfPath.is_file() and pdfPath.suffix.lower() == ".pdf":
146
- yield DetectPdf(pdfPath, detector=detector, **kwargs)
160
+ yield DetectPdf(pdfPath, detector=detector, runWetDetection=runWetDetection, **kwargs)
147
161
 
148
162
 
149
163
  def ToCsvRow(result: dict[str, Any]) -> dict[str, Any]:
@@ -174,11 +188,25 @@ def Version() -> str:
174
188
  return "0.0.0-dev"
175
189
 
176
190
 
177
- def _DetectWithDetector(detector: Detector, pdfPath: str | Path) -> dict[str, Any]:
191
+ def _DetectWithDetector(
192
+ detector: Detector, pdfPath: str | Path, *, runWetDetection: bool
193
+ ) -> dict[str, Any]:
178
194
  """Helper that runs ``detector`` and returns the plain dictionary result."""
179
195
 
180
196
  resolvedPath = Path(pdfPath)
181
- return _ToPlainDictionary(detector.Detect(resolvedPath))
197
+ result = detector.Detect(resolvedPath)
198
+ if runWetDetection:
199
+ configuration = _ResolveConfiguration(detector)
200
+ if configuration is not None:
201
+ apply_wet_detection(resolvedPath, configuration, result)
202
+ return _ToPlainDictionary(result)
203
+
204
+
205
+ def _ResolveConfiguration(detector: Detector) -> DetectConfiguration | None:
206
+ configuration = getattr(detector, "Configuration", None)
207
+ if isinstance(configuration, DetectConfiguration):
208
+ return configuration
209
+ return None
182
210
 
183
211
 
184
212
  @contextmanager
@@ -201,8 +229,7 @@ def CropSignatureImages(
201
229
  dpi: int = 200,
202
230
  returnBytes: Literal[False] = False,
203
231
  saveToDisk: bool = True,
204
- ) -> list[Path]:
205
- ...
232
+ ) -> list[Path]: ...
206
233
 
207
234
 
208
235
  @overload
@@ -214,8 +241,7 @@ def CropSignatureImages(
214
241
  dpi: int,
215
242
  returnBytes: Literal[True],
216
243
  saveToDisk: bool,
217
- ) -> list[SignatureCrop]:
218
- ...
244
+ ) -> list[SignatureCrop]: ...
219
245
 
220
246
 
221
247
  def CropSignatureImages(
@@ -227,12 +253,15 @@ def CropSignatureImages(
227
253
  returnBytes: bool = False,
228
254
  saveToDisk: bool = True,
229
255
  ) -> list[Path] | list[SignatureCrop]:
230
- """Crop detected signature regions to PNG files.
256
+ """Create DOCX files containing cropped signature images.
231
257
 
232
258
  Accepts either a :class:`FileResult` instance or the ``dict`` returned by
233
259
  :func:`DetectPdf`. Requires the ``pymupdf`` dependency (PyMuPDF).
234
260
  Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop. Set
235
261
  ``saveToDisk=False`` to skip writing PNG files while still returning in-memory data.
262
+ When ``saveToDisk`` is enabled, a one-image DOCX file is also written per crop. When
263
+ ``returnBytes`` is True and ``python-docx`` is available, the returned
264
+ :class:`SignatureCrop` objects include ``docx_bytes``.
236
265
  """
237
266
 
238
267
  from sigdetect.cropping import crop_signatures
@@ -275,6 +304,7 @@ def _CoerceFileResult(
275
304
  RenderType=str(entry.get("render_type") or "unknown"),
276
305
  BoundingBox=tuple(bbox) if bbox else None,
277
306
  CropPath=entry.get("crop_path"),
307
+ CropBytes=entry.get("crop_bytes"),
278
308
  )
279
309
  )
280
310
 
@@ -2,6 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import base64
5
6
  import json
6
7
  from collections.abc import Iterator
7
8
  from dataclasses import asdict, is_dataclass
@@ -48,6 +49,12 @@ def Detect(
48
49
  configurationPath: Path | None = typer.Option(
49
50
  None, "--config", "-c", help="Path to YAML config"
50
51
  ),
52
+ writeResults: bool | None = typer.Option(
53
+ None,
54
+ "--write-results/--no-write-results",
55
+ help="Write results.json (or JSON to stdout when out_dir is none)",
56
+ show_default=False,
57
+ ),
51
58
  profileOverride: str | None = typer.Option(None, "--profile", "-p", help="hipaa or retainer"),
52
59
  recursive: bool = typer.Option(
53
60
  True,
@@ -57,13 +64,13 @@ def Detect(
57
64
  cropSignatures: bool | None = typer.Option(
58
65
  None,
59
66
  "--crop-signatures/--no-crop-signatures",
60
- help="Crop detected signature regions to PNG files (requires PyMuPDF)",
67
+ help="Write DOCX files containing cropped signature images (requires PyMuPDF + python-docx)",
61
68
  show_default=False,
62
69
  ),
63
70
  cropDirectory: Path | None = typer.Option(
64
71
  None,
65
72
  "--crop-dir",
66
- help="Directory for signature PNG crops (defaults to out_dir/signature_crops)",
73
+ help="Directory for signature DOCX crops (defaults to out_dir/signature_crops)",
67
74
  ),
68
75
  cropDpi: int | None = typer.Option(
69
76
  None,
@@ -73,10 +80,16 @@ def Detect(
73
80
  help="Rendering DPI for signature crops",
74
81
  show_default=False,
75
82
  ),
83
+ cropBytes: bool = typer.Option(
84
+ False,
85
+ "--crop-bytes/--no-crop-bytes",
86
+ help="Embed base64 PNG bytes for signature crops in results JSON",
87
+ show_default=False,
88
+ ),
76
89
  detectWetSignatures: bool | None = typer.Option(
77
90
  None,
78
91
  "--detect-wet/--no-detect-wet",
79
- help="Run OCR-backed wet signature detection (requires PyMuPDF + Tesseract)",
92
+ help="Compatibility flag; non-e-sign PDFs always run OCR when deps are available",
80
93
  show_default=False,
81
94
  ),
82
95
  wetOcrDpi: int | None = typer.Option(
@@ -111,6 +124,8 @@ def Detect(
111
124
  configuration = configuration.model_copy(update={"Profile": normalized_profile})
112
125
 
113
126
  overrides: dict[str, object] = {}
127
+ if writeResults is not None:
128
+ overrides["WriteResults"] = writeResults
114
129
  if cropSignatures is not None:
115
130
  overrides["CropSignatures"] = cropSignatures
116
131
  if cropDirectory is not None:
@@ -145,44 +160,52 @@ def Detect(
145
160
  except StopIteration:
146
161
  raise SystemExit(f"No PDFs found in {configuration.PdfRoot}") from None
147
162
 
148
- results_buffer: list[FileResult] | None = [] if configuration.OutputDirectory is None else None
163
+ write_results = configuration.WriteResults
164
+ results_buffer: list[FileResult] | None = (
165
+ [] if write_results and configuration.OutputDirectory is None else None
166
+ )
149
167
  json_handle = None
150
168
  json_path: Path | None = None
151
169
  wrote_first = False
152
170
 
153
- if configuration.OutputDirectory is not None:
171
+ if write_results and configuration.OutputDirectory is not None:
154
172
  outputDirectory = configuration.OutputDirectory
155
173
  outputDirectory.mkdir(parents=True, exist_ok=True)
156
174
  json_path = outputDirectory / "results.json"
157
175
  json_handle = open(json_path, "w", encoding="utf-8")
158
176
  json_handle.write("[")
159
177
 
178
+ crop_bytes_enabled = bool(cropBytes)
160
179
  crop_dir = configuration.CropOutputDirectory
180
+ if crop_dir is None:
181
+ base_dir = configuration.OutputDirectory or configuration.PdfRoot
182
+ crop_dir = base_dir / "signature_crops"
161
183
  cropping_enabled = configuration.CropSignatures
162
184
  cropping_available = True
163
185
  cropping_attempted = False
164
- if configuration.CropSignatures and crop_dir is None:
165
- Logger.warning(
166
- "CropSignatures enabled without an output directory",
167
- extra={"pdf_root": str(configuration.PdfRoot)},
168
- )
169
- cropping_enabled = False
170
186
 
171
187
  total_bboxes = 0
172
188
 
173
189
  def _append_result(file_result: FileResult, source_pdf: Path) -> None:
174
190
  nonlocal wrote_first, json_handle, total_bboxes, cropping_available, cropping_attempted
175
191
 
176
- if cropping_enabled and cropping_available and crop_dir is not None:
192
+ if cropping_available and (cropping_enabled or crop_bytes_enabled) and crop_dir is not None:
177
193
  try:
178
- crop_signatures(
194
+ crops = crop_signatures(
179
195
  pdf_path=source_pdf,
180
196
  file_result=file_result,
181
197
  output_dir=crop_dir,
182
198
  dpi=configuration.CropImageDpi,
183
199
  logger=Logger,
200
+ return_bytes=crop_bytes_enabled,
201
+ save_files=cropping_enabled,
184
202
  )
185
203
  cropping_attempted = True
204
+ if crop_bytes_enabled:
205
+ for crop in crops:
206
+ crop.signature.CropBytes = base64.b64encode(crop.image_bytes).decode(
207
+ "ascii"
208
+ )
186
209
  except SignatureCroppingUnavailable as exc:
187
210
  cropping_available = False
188
211
  Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
@@ -231,18 +254,24 @@ def Detect(
231
254
  json_handle.write(closing)
232
255
  json_handle.close()
233
256
 
234
- if json_handle is not None:
235
- typer.echo(f"Wrote {json_path}")
236
- else:
237
- payload = json.dumps(
238
- results_buffer or [], indent=2, ensure_ascii=False, default=_JsonSerializer
239
- )
240
- typer.echo(payload)
241
- typer.echo("Detection completed with output disabled (out_dir=none)")
242
-
243
- if cropping_enabled and cropping_available and cropping_attempted and total_bboxes == 0:
257
+ if write_results:
258
+ if json_handle is not None:
259
+ typer.echo(f"Wrote {json_path}")
260
+ else:
261
+ payload = json.dumps(
262
+ results_buffer or [], indent=2, ensure_ascii=False, default=_JsonSerializer
263
+ )
264
+ typer.echo(payload)
265
+ typer.echo("Detection completed with output disabled (out_dir=none)")
266
+
267
+ if (
268
+ (cropping_enabled or crop_bytes_enabled)
269
+ and cropping_available
270
+ and cropping_attempted
271
+ and total_bboxes == 0
272
+ ):
244
273
  Logger.warning(
245
- "No signature bounding boxes detected; try --engine pymupdf for crop-ready output",
274
+ "No signature bounding boxes detected; install PyMuPDF for crop-ready output",
246
275
  extra={"engine": configuration.Engine},
247
276
  )
248
277
 
@@ -25,6 +25,7 @@ class DetectConfiguration(BaseModel):
25
25
 
26
26
  PdfRoot: Path = Field(default=Path("hipaa_results"), alias="pdf_root")
27
27
  OutputDirectory: Path | None = Field(default=Path("out"), alias="out_dir")
28
+ WriteResults: bool = Field(default=False, alias="write_results")
28
29
  Engine: EngineName = Field(default="auto", alias="engine")
29
30
  Profile: ProfileName = Field(default="hipaa", alias="profile")
30
31
  PseudoSignatures: bool = Field(default=True, alias="pseudo_signatures")
@@ -63,6 +64,10 @@ class DetectConfiguration(BaseModel):
63
64
  def out_dir(self) -> Path | None: # pragma: no cover - simple passthrough
64
65
  return self.OutputDirectory
65
66
 
67
+ @property
68
+ def write_results(self) -> bool: # pragma: no cover - simple passthrough
69
+ return self.WriteResults
70
+
66
71
  @property
67
72
  def engine(self) -> EngineName: # pragma: no cover - simple passthrough
68
73
  return self.Engine