sigdetect 0.5.0__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {sigdetect-0.5.0 → sigdetect-0.5.1}/PKG-INFO +11 -7
  2. {sigdetect-0.5.0 → sigdetect-0.5.1}/README.md +10 -6
  3. {sigdetect-0.5.0 → sigdetect-0.5.1}/pyproject.toml +1 -1
  4. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/api.py +10 -4
  5. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/cli.py +20 -7
  6. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/config.py +12 -0
  7. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/cropping.py +25 -18
  8. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/detector/signature_model.py +4 -0
  9. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect.egg-info/PKG-INFO +11 -7
  10. {sigdetect-0.5.0 → sigdetect-0.5.1}/tests/test_cli.py +2 -0
  11. {sigdetect-0.5.0 → sigdetect-0.5.1}/tests/test_cropping.py +82 -6
  12. {sigdetect-0.5.0 → sigdetect-0.5.1}/setup.cfg +0 -0
  13. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/__init__.py +0 -0
  14. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/data/role_rules.retainer.yml +0 -0
  15. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/data/role_rules.yml +0 -0
  16. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/data/vendor_patterns.yml +0 -0
  17. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/detector/__init__.py +0 -0
  18. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/detector/base.py +0 -0
  19. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/detector/base_detector.py +0 -0
  20. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/detector/file_result_model.py +0 -0
  21. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/detector/pymupdf_engine.py +0 -0
  22. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/detector/pypdf2_engine.py +0 -0
  23. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/eda.py +0 -0
  24. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/logging_setup.py +0 -0
  25. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/utils.py +0 -0
  26. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/wet_detection.py +0 -0
  27. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect.egg-info/SOURCES.txt +0 -0
  28. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect.egg-info/dependency_links.txt +0 -0
  29. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect.egg-info/entry_points.txt +0 -0
  30. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect.egg-info/requires.txt +0 -0
  31. {sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect.egg-info/top_level.txt +0 -0
  32. {sigdetect-0.5.0 → sigdetect-0.5.1}/tests/test_api.py +0 -0
  33. {sigdetect-0.5.0 → sigdetect-0.5.1}/tests/test_detector_options.py +0 -0
  34. {sigdetect-0.5.0 → sigdetect-0.5.1}/tests/test_pymupdf_engine.py +0 -0
  35. {sigdetect-0.5.0 → sigdetect-0.5.1}/tests/test_wet_detection.py +0 -0
  36. {sigdetect-0.5.0 → sigdetect-0.5.1}/tests/test_widget_role_patient_smoke.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sigdetect
3
- Version: 0.5.0
3
+ Version: 0.5.1
4
4
  Summary: Signature detection and role attribution for PDFs
5
5
  Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
6
6
  License: MIT
@@ -105,7 +105,7 @@ sigdetect detect \
105
105
  - `retainer` → client / firm (prefers detecting two signatures)
106
106
  - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
107
107
  - Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
108
- - Cropping (`--crop-signatures`) writes a one-image `.docx` per signature in the crop output directory (no PNG files on disk); `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` for in-memory use. PyMuPDF is required for crops, and `python-docx` is required for `.docx` output.
108
+ - Cropping (`--crop-signatures`) writes PNG crops to disk by default; enable `--crop-docx` to write DOCX files instead of PNGs. `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` and, when `--crop-docx` is enabled, embeds DOCX bytes in `signatures[].crop_docx_bytes`. PyMuPDF is required for crops, and `python-docx` is required for DOCX output.
109
109
  - Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
110
110
  - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
111
111
 
@@ -142,7 +142,7 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
142
142
  print(result.to_dict())
143
143
  ~~~
144
144
 
145
- `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When cropping is enabled, `crop_path` points at the generated `.docx`. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
145
+ `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image; when DOCX cropping is enabled, `crop_docx_path` points at the generated doc. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
146
146
 
147
147
  ---
148
148
 
@@ -194,7 +194,7 @@ for res in ScanDirectory(
194
194
  # store in DB, print, etc.
195
195
  pass
196
196
 
197
- # 3) Create DOCX crops for FileResult objects (requires PyMuPDF + python-docx)
197
+ # 3) Crop signature snippets for FileResult objects (requires PyMuPDF; DOCX needs python-docx)
198
198
  detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
199
199
  file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
200
200
  CropSignatureImages(
@@ -233,7 +233,8 @@ High-level summary (per file):
233
233
  "hint": "AcroSig:sig_patient",
234
234
  "render_type": "typed",
235
235
  "bounding_box": [10.0, 10.0, 150.0, 40.0],
236
- "crop_path": "signature_crops/example/sig_01_patient.docx"
236
+ "crop_path": "signature_crops/example/sig_01_patient.png",
237
+ "crop_docx_path": null
237
238
  },
238
239
  {
239
240
  "page": null,
@@ -259,8 +260,10 @@ High-level summary (per file):
259
260
  - **`roles`** summarizes unique non-`unknown` roles across signatures.
260
261
  - In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
261
262
  - **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
262
- - **`signatures[].crop_path`** is populated when DOCX crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
263
+ - **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
264
+ - **`signatures[].crop_docx_path`** is populated when DOCX crops are generated (`--crop-docx` or `docx=True`).
263
265
  - **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
266
+ - **`signatures[].crop_docx_bytes`** contains base64 DOCX data when `--crop-docx` and `--crop-bytes` are enabled together.
264
267
 
265
268
  ---
266
269
 
@@ -287,7 +290,8 @@ write_results: false
287
290
  pseudo_signatures: true
288
291
  recurse_xobjects: true
289
292
  profile: retainer # or: hipaa
290
- crop_signatures: false # enable to write DOCX crops (requires pymupdf + python-docx)
293
+ crop_signatures: false # enable to write PNG crops (requires pymupdf)
294
+ crop_docx: false # enable to write DOCX crops instead of PNGs (requires python-docx)
291
295
  # crop_output_dir: ./signature_crops
292
296
  crop_image_dpi: 200
293
297
  detect_wet_signatures: false # kept for compatibility; non-e-sign PDFs still trigger OCR
@@ -87,7 +87,7 @@ sigdetect detect \
87
87
  - `retainer` → client / firm (prefers detecting two signatures)
88
88
  - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
89
89
  - Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
90
- - Cropping (`--crop-signatures`) writes a one-image `.docx` per signature in the crop output directory (no PNG files on disk); `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` for in-memory use. PyMuPDF is required for crops, and `python-docx` is required for `.docx` output.
90
+ - Cropping (`--crop-signatures`) writes PNG crops to disk by default; enable `--crop-docx` to write DOCX files instead of PNGs. `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` and, when `--crop-docx` is enabled, embeds DOCX bytes in `signatures[].crop_docx_bytes`. PyMuPDF is required for crops, and `python-docx` is required for DOCX output.
91
91
  - Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
92
92
  - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
93
93
 
@@ -124,7 +124,7 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
124
124
  print(result.to_dict())
125
125
  ~~~
126
126
 
127
- `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When cropping is enabled, `crop_path` points at the generated `.docx`. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
127
+ `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image; when DOCX cropping is enabled, `crop_docx_path` points at the generated doc. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
128
128
 
129
129
  ---
130
130
 
@@ -176,7 +176,7 @@ for res in ScanDirectory(
176
176
  # store in DB, print, etc.
177
177
  pass
178
178
 
179
- # 3) Create DOCX crops for FileResult objects (requires PyMuPDF + python-docx)
179
+ # 3) Crop signature snippets for FileResult objects (requires PyMuPDF; DOCX needs python-docx)
180
180
  detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
181
181
  file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
182
182
  CropSignatureImages(
@@ -215,7 +215,8 @@ High-level summary (per file):
215
215
  "hint": "AcroSig:sig_patient",
216
216
  "render_type": "typed",
217
217
  "bounding_box": [10.0, 10.0, 150.0, 40.0],
218
- "crop_path": "signature_crops/example/sig_01_patient.docx"
218
+ "crop_path": "signature_crops/example/sig_01_patient.png",
219
+ "crop_docx_path": null
219
220
  },
220
221
  {
221
222
  "page": null,
@@ -241,8 +242,10 @@ High-level summary (per file):
241
242
  - **`roles`** summarizes unique non-`unknown` roles across signatures.
242
243
  - In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
243
244
  - **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
244
- - **`signatures[].crop_path`** is populated when DOCX crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
245
+ - **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
246
+ - **`signatures[].crop_docx_path`** is populated when DOCX crops are generated (`--crop-docx` or `docx=True`).
245
247
  - **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
248
+ - **`signatures[].crop_docx_bytes`** contains base64 DOCX data when `--crop-docx` and `--crop-bytes` are enabled together.
246
249
 
247
250
  ---
248
251
 
@@ -269,7 +272,8 @@ write_results: false
269
272
  pseudo_signatures: true
270
273
  recurse_xobjects: true
271
274
  profile: retainer # or: hipaa
272
- crop_signatures: false # enable to write DOCX crops (requires pymupdf + python-docx)
275
+ crop_signatures: false # enable to write PNG crops (requires pymupdf)
276
+ crop_docx: false # enable to write DOCX crops instead of PNGs (requires python-docx)
273
277
  # crop_output_dir: ./signature_crops
274
278
  crop_image_dpi: 200
275
279
  detect_wet_signatures: false # kept for compatibility; non-e-sign PDFs still trigger OCR
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sigdetect"
7
- version = "0.5.0"
7
+ version = "0.5.1"
8
8
  description = "Signature detection and role attribution for PDFs"
9
9
  readme = "README.md"
10
10
  authors = [{ name = "BT Asmamaw", email = "basmamaw@angeiongroup.com" }]
@@ -229,6 +229,7 @@ def CropSignatureImages(
229
229
  dpi: int = 200,
230
230
  returnBytes: Literal[False] = False,
231
231
  saveToDisk: bool = True,
232
+ docx: bool = False,
232
233
  ) -> list[Path]: ...
233
234
 
234
235
 
@@ -241,6 +242,7 @@ def CropSignatureImages(
241
242
  dpi: int,
242
243
  returnBytes: Literal[True],
243
244
  saveToDisk: bool,
245
+ docx: bool = False,
244
246
  ) -> list[SignatureCrop]: ...
245
247
 
246
248
 
@@ -252,16 +254,17 @@ def CropSignatureImages(
252
254
  dpi: int = 200,
253
255
  returnBytes: bool = False,
254
256
  saveToDisk: bool = True,
257
+ docx: bool = False,
255
258
  ) -> list[Path] | list[SignatureCrop]:
256
- """Create DOCX files containing cropped signature images.
259
+ """Create PNG files containing cropped signature images (or DOCX when enabled).
257
260
 
258
261
  Accepts either a :class:`FileResult` instance or the ``dict`` returned by
259
262
  :func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.
260
263
  Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop. Set
261
264
  ``saveToDisk=False`` to skip writing PNG files while still returning in-memory data.
262
- When ``saveToDisk`` is enabled, a one-image DOCX file is also written per crop. When
263
- ``returnBytes`` is True and ``python-docx`` is available, the returned
264
- :class:`SignatureCrop` objects include ``docx_bytes``.
265
+ When ``docx`` is True, DOCX files are written instead of PNG files. When ``returnBytes`` is
266
+ True and ``docx`` is enabled, the returned :class:`SignatureCrop` objects include
267
+ ``docx_bytes``.
265
268
  """
266
269
 
267
270
  from sigdetect.cropping import crop_signatures
@@ -274,6 +277,7 @@ def CropSignatureImages(
274
277
  dpi=dpi,
275
278
  return_bytes=returnBytes,
276
279
  save_files=saveToDisk,
280
+ docx=docx,
277
281
  )
278
282
  if original_dict is not None:
279
283
  original_dict.clear()
@@ -305,6 +309,8 @@ def _CoerceFileResult(
305
309
  BoundingBox=tuple(bbox) if bbox else None,
306
310
  CropPath=entry.get("crop_path"),
307
311
  CropBytes=entry.get("crop_bytes"),
312
+ CropDocxPath=entry.get("crop_docx_path"),
313
+ CropDocxBytes=entry.get("crop_docx_bytes"),
308
314
  )
309
315
  )
310
316
 
@@ -64,13 +64,19 @@ def Detect(
64
64
  cropSignatures: bool | None = typer.Option(
65
65
  None,
66
66
  "--crop-signatures/--no-crop-signatures",
67
- help="Write DOCX files containing cropped signature images (requires PyMuPDF + python-docx)",
67
+ help="Write PNG crops for signature widgets (requires PyMuPDF)",
68
+ show_default=False,
69
+ ),
70
+ cropDocx: bool | None = typer.Option(
71
+ None,
72
+ "--crop-docx/--no-crop-docx",
73
+ help="Write DOCX crops instead of PNG files (requires PyMuPDF + python-docx)",
68
74
  show_default=False,
69
75
  ),
70
76
  cropDirectory: Path | None = typer.Option(
71
77
  None,
72
78
  "--crop-dir",
73
- help="Directory for signature DOCX crops (defaults to out_dir/signature_crops)",
79
+ help="Directory for signature crops (defaults to out_dir/signature_crops)",
74
80
  ),
75
81
  cropDpi: int | None = typer.Option(
76
82
  None,
@@ -83,7 +89,7 @@ def Detect(
83
89
  cropBytes: bool = typer.Option(
84
90
  False,
85
91
  "--crop-bytes/--no-crop-bytes",
86
- help="Embed base64 PNG bytes for signature crops in results JSON",
92
+ help="Embed base64 PNG bytes (and DOCX bytes when --crop-docx) in results JSON",
87
93
  show_default=False,
88
94
  ),
89
95
  detectWetSignatures: bool | None = typer.Option(
@@ -128,6 +134,8 @@ def Detect(
128
134
  overrides["WriteResults"] = writeResults
129
135
  if cropSignatures is not None:
130
136
  overrides["CropSignatures"] = cropSignatures
137
+ if cropDocx is not None:
138
+ overrides["CropDocx"] = cropDocx
131
139
  if cropDirectory is not None:
132
140
  overrides["CropOutputDirectory"] = cropDirectory
133
141
  if cropDpi is not None:
@@ -181,6 +189,7 @@ def Detect(
181
189
  base_dir = configuration.OutputDirectory or configuration.PdfRoot
182
190
  crop_dir = base_dir / "signature_crops"
183
191
  cropping_enabled = configuration.CropSignatures
192
+ docx_enabled = configuration.CropDocx
184
193
  cropping_available = True
185
194
  cropping_attempted = False
186
195
 
@@ -199,6 +208,7 @@ def Detect(
199
208
  logger=Logger,
200
209
  return_bytes=crop_bytes_enabled,
201
210
  save_files=cropping_enabled,
211
+ docx=docx_enabled,
202
212
  )
203
213
  cropping_attempted = True
204
214
  if crop_bytes_enabled:
@@ -206,15 +216,18 @@ def Detect(
206
216
  crop.signature.CropBytes = base64.b64encode(crop.image_bytes).decode(
207
217
  "ascii"
208
218
  )
219
+ if crop.docx_bytes:
220
+ crop.signature.CropDocxBytes = base64.b64encode(
221
+ crop.docx_bytes
222
+ ).decode("ascii")
209
223
  except SignatureCroppingUnavailable as exc:
210
224
  cropping_available = False
211
225
  Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
212
226
  typer.echo(str(exc), err=True)
213
227
  except Exception as exc: # pragma: no cover - defensive
214
- Logger.warning(
215
- "Unexpected error while cropping signatures",
216
- extra={"error": str(exc)},
217
- )
228
+ cropping_available = False
229
+ Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
230
+ typer.echo(str(exc), err=True)
218
231
 
219
232
  total_bboxes += sum(1 for sig in file_result.Signatures if sig.BoundingBox)
220
233
 
@@ -31,6 +31,7 @@ class DetectConfiguration(BaseModel):
31
31
  PseudoSignatures: bool = Field(default=True, alias="pseudo_signatures")
32
32
  RecurseXObjects: bool = Field(default=True, alias="recurse_xobjects")
33
33
  CropSignatures: bool = Field(default=True, alias="crop_signatures")
34
+ CropDocx: bool = Field(default=False, alias="crop_docx")
34
35
  CropOutputDirectory: Path | None = Field(default=None, alias="crop_output_dir")
35
36
  CropImageDpi: int = Field(default=200, alias="crop_image_dpi", ge=72, le=600)
36
37
  DetectWetSignatures: bool = Field(default=True, alias="detect_wet_signatures")
@@ -88,6 +89,10 @@ class DetectConfiguration(BaseModel):
88
89
  def crop_signatures(self) -> bool: # pragma: no cover - simple passthrough
89
90
  return self.CropSignatures
90
91
 
92
+ @property
93
+ def crop_docx(self) -> bool: # pragma: no cover - simple passthrough
94
+ return self.CropDocx
95
+
91
96
  @property
92
97
  def crop_output_dir(self) -> Path | None: # pragma: no cover - simple passthrough
93
98
  return self.CropOutputDirectory
@@ -133,6 +138,7 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
133
138
  env_out_dir = os.getenv("SIGDETECT_OUT_DIR")
134
139
  env_profile = os.getenv("SIGDETECT_PROFILE")
135
140
  env_crop = os.getenv("SIGDETECT_CROP_SIGNATURES")
141
+ env_crop_docx = os.getenv("SIGDETECT_CROP_DOCX")
136
142
  env_crop_dir = os.getenv("SIGDETECT_CROP_DIR")
137
143
  env_crop_dpi = os.getenv("SIGDETECT_CROP_DPI")
138
144
  env_detect_wet = os.getenv("SIGDETECT_DETECT_WET")
@@ -159,6 +165,12 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
159
165
  raw_data["crop_signatures"] = True
160
166
  elif lowered in {"0", "false", "no", "off"}:
161
167
  raw_data["crop_signatures"] = False
168
+ if env_crop_docx is not None:
169
+ lowered = env_crop_docx.lower()
170
+ if lowered in {"1", "true", "yes", "on"}:
171
+ raw_data["crop_docx"] = True
172
+ elif lowered in {"0", "false", "no", "off"}:
173
+ raw_data["crop_docx"] = False
162
174
  if env_crop_dir:
163
175
  raw_data["crop_output_dir"] = env_crop_dir
164
176
  if env_crop_dpi:
@@ -1,4 +1,4 @@
1
- """Helpers for converting signature bounding boxes into DOCX crops."""
1
+ """Helpers for converting signature bounding boxes into PNG or DOCX crops."""
2
2
 
3
3
  from __future__ import annotations
4
4
 
@@ -27,7 +27,7 @@ class SignatureCroppingUnavailable(RuntimeError):
27
27
  """Raised when PNG cropping cannot be performed (e.g., PyMuPDF missing)."""
28
28
 
29
29
 
30
- class SignatureDocxUnavailable(RuntimeError):
30
+ class SignatureDocxUnavailable(SignatureCroppingUnavailable):
31
31
  """Raised when DOCX creation cannot be performed (e.g., python-docx missing)."""
32
32
 
33
33
 
@@ -52,6 +52,7 @@ def crop_signatures(
52
52
  logger: logging.Logger | None = None,
53
53
  return_bytes: Literal[False] = False,
54
54
  save_files: bool = True,
55
+ docx: bool = False,
55
56
  ) -> list[Path]: ...
56
57
 
57
58
 
@@ -65,6 +66,7 @@ def crop_signatures(
65
66
  logger: logging.Logger | None = None,
66
67
  return_bytes: Literal[True],
67
68
  save_files: bool = True,
69
+ docx: bool = False,
68
70
  ) -> list[SignatureCrop]: ...
69
71
 
70
72
 
@@ -77,14 +79,14 @@ def crop_signatures(
77
79
  logger: logging.Logger | None = None,
78
80
  return_bytes: bool = False,
79
81
  save_files: bool = True,
82
+ docx: bool = False,
80
83
  ) -> list[Path] | list[SignatureCrop]:
81
- """Render each signature bounding box to a PNG image and wrap it in a DOCX file.
84
+ """Render each signature bounding box to a PNG image and optionally wrap it in DOCX.
82
85
 
83
86
  Set ``return_bytes=True`` to collect in-memory PNG bytes for each crop while also writing
84
87
  the files to ``output_dir``. Set ``save_files=False`` to skip writing PNGs to disk.
85
- When ``save_files`` is enabled, a one-image DOCX file is also written per signature crop.
86
- When ``return_bytes`` is True and ``python-docx`` is available, ``SignatureCrop.docx_bytes``
87
- will contain the DOCX payload.
88
+ When ``docx=True``, DOCX files are written instead of PNGs. When ``return_bytes`` is True
89
+ and ``docx=True``, ``SignatureCrop.docx_bytes`` will contain the DOCX payload.
88
90
  """
89
91
 
90
92
  if fitz is None: # pragma: no cover - exercised when dependency absent
@@ -101,14 +103,11 @@ def crop_signatures(
101
103
  generated_paths: list[Path] = []
102
104
  generated_crops: list[SignatureCrop] = []
103
105
 
104
- docx_to_disk = save_files
105
- docx_in_memory = return_bytes
106
- docx_enabled = docx_to_disk or docx_in_memory
106
+ docx_enabled = docx
107
107
  docx_available = Document is not None
108
- if docx_enabled and not docx_available and logger:
109
- logger.warning(
110
- "Signature DOCX output unavailable",
111
- extra={"error": "python-docx is required to generate DOCX outputs"},
108
+ if docx_enabled and not docx_available:
109
+ raise SignatureDocxUnavailable(
110
+ "python-docx is required to generate DOCX outputs for signature crops."
112
111
  )
113
112
 
114
113
  with fitz.open(pdf_path) as document: # type: ignore[attr-defined]
@@ -146,6 +145,8 @@ def crop_signatures(
146
145
  try:
147
146
  image_bytes: bytes | None = None
148
147
  pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
148
+ if save_files and not docx_enabled:
149
+ pixmap.save(png_destination)
149
150
  if return_bytes or docx_enabled:
150
151
  image_bytes = pixmap.tobytes("png")
151
152
  except Exception as exc: # pragma: no cover - defensive
@@ -162,12 +163,12 @@ def crop_signatures(
162
163
  continue
163
164
 
164
165
  docx_bytes: bytes | None = None
165
- if docx_enabled and docx_available:
166
+ if docx_enabled:
166
167
  if image_bytes is None: # pragma: no cover - defensive
167
168
  continue
168
169
  try:
169
170
  docx_bytes = _build_docx_bytes(image_bytes)
170
- if docx_to_disk:
171
+ if save_files:
171
172
  docx_destination.write_bytes(docx_bytes)
172
173
  except SignatureDocxUnavailable as exc:
173
174
  if logger:
@@ -184,14 +185,20 @@ def crop_signatures(
184
185
  )
185
186
 
186
187
  if save_files:
187
- signature.CropPath = str(docx_destination)
188
- generated_paths.append(docx_destination)
188
+ if docx_enabled:
189
+ signature.CropPath = None
190
+ signature.CropDocxPath = str(docx_destination)
191
+ generated_paths.append(docx_destination)
192
+ else:
193
+ signature.CropDocxPath = None
194
+ signature.CropPath = str(png_destination)
195
+ generated_paths.append(png_destination)
189
196
  if return_bytes:
190
197
  if image_bytes is None: # pragma: no cover - defensive
191
198
  continue
192
199
  generated_crops.append(
193
200
  SignatureCrop(
194
- path=docx_destination,
201
+ path=docx_destination if docx_enabled else png_destination,
195
202
  image_bytes=image_bytes,
196
203
  signature=signature,
197
204
  docx_bytes=docx_bytes,
@@ -21,6 +21,8 @@ class Signature:
21
21
  BoundingBox: tuple[float, float, float, float] | None = None
22
22
  CropPath: str | None = None
23
23
  CropBytes: str | None = None
24
+ CropDocxPath: str | None = None
25
+ CropDocxBytes: str | None = None
24
26
 
25
27
  def to_dict(self) -> dict[str, Any]:
26
28
  """Return the legacy snake_case representation used in JSON payloads."""
@@ -37,4 +39,6 @@ class Signature:
37
39
  "bounding_box": list(self.BoundingBox) if self.BoundingBox else None,
38
40
  "crop_path": self.CropPath,
39
41
  "crop_bytes": self.CropBytes,
42
+ "crop_docx_path": self.CropDocxPath,
43
+ "crop_docx_bytes": self.CropDocxBytes,
40
44
  }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sigdetect
3
- Version: 0.5.0
3
+ Version: 0.5.1
4
4
  Summary: Signature detection and role attribution for PDFs
5
5
  Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
6
6
  License: MIT
@@ -105,7 +105,7 @@ sigdetect detect \
105
105
  - `retainer` → client / firm (prefers detecting two signatures)
106
106
  - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
107
107
  - Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
108
- - Cropping (`--crop-signatures`) writes a one-image `.docx` per signature in the crop output directory (no PNG files on disk); `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` for in-memory use. PyMuPDF is required for crops, and `python-docx` is required for `.docx` output.
108
+ - Cropping (`--crop-signatures`) writes PNG crops to disk by default; enable `--crop-docx` to write DOCX files instead of PNGs. `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` and, when `--crop-docx` is enabled, embeds DOCX bytes in `signatures[].crop_docx_bytes`. PyMuPDF is required for crops, and `python-docx` is required for DOCX output.
109
109
  - Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
110
110
  - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
111
111
 
@@ -142,7 +142,7 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
142
142
  print(result.to_dict())
143
143
  ~~~
144
144
 
145
- `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When cropping is enabled, `crop_path` points at the generated `.docx`. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
145
+ `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image; when DOCX cropping is enabled, `crop_docx_path` points at the generated doc. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
146
146
 
147
147
  ---
148
148
 
@@ -194,7 +194,7 @@ for res in ScanDirectory(
194
194
  # store in DB, print, etc.
195
195
  pass
196
196
 
197
- # 3) Create DOCX crops for FileResult objects (requires PyMuPDF + python-docx)
197
+ # 3) Crop signature snippets for FileResult objects (requires PyMuPDF; DOCX needs python-docx)
198
198
  detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
199
199
  file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
200
200
  CropSignatureImages(
@@ -233,7 +233,8 @@ High-level summary (per file):
233
233
  "hint": "AcroSig:sig_patient",
234
234
  "render_type": "typed",
235
235
  "bounding_box": [10.0, 10.0, 150.0, 40.0],
236
- "crop_path": "signature_crops/example/sig_01_patient.docx"
236
+ "crop_path": "signature_crops/example/sig_01_patient.png",
237
+ "crop_docx_path": null
237
238
  },
238
239
  {
239
240
  "page": null,
@@ -259,8 +260,10 @@ High-level summary (per file):
259
260
  - **`roles`** summarizes unique non-`unknown` roles across signatures.
260
261
  - In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
261
262
  - **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
262
- - **`signatures[].crop_path`** is populated when DOCX crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
263
+ - **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
264
+ - **`signatures[].crop_docx_path`** is populated when DOCX crops are generated (`--crop-docx` or `docx=True`).
263
265
  - **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
266
+ - **`signatures[].crop_docx_bytes`** contains base64 DOCX data when `--crop-docx` and `--crop-bytes` are enabled together.
264
267
 
265
268
  ---
266
269
 
@@ -287,7 +290,8 @@ write_results: false
287
290
  pseudo_signatures: true
288
291
  recurse_xobjects: true
289
292
  profile: retainer # or: hipaa
290
- crop_signatures: false # enable to write DOCX crops (requires pymupdf + python-docx)
293
+ crop_signatures: false # enable to write PNG crops (requires pymupdf)
294
+ crop_docx: false # enable to write DOCX crops instead of PNGs (requires python-docx)
291
295
  # crop_output_dir: ./signature_crops
292
296
  crop_image_dpi: 200
293
297
  detect_wet_signatures: false # kept for compatibility; non-e-sign PDFs still trigger OCR
@@ -249,9 +249,11 @@ def test_detect_crop_bytes_embeds_base64(tmp_path: Path, monkeypatch) -> None:
249
249
  logger=None,
250
250
  return_bytes=False,
251
251
  save_files=True,
252
+ docx=False,
252
253
  ):
253
254
  assert return_bytes is True
254
255
  assert save_files is False
256
+ assert docx is False
255
257
  return [
256
258
  SignatureCrop(
257
259
  path=Path(output_dir) / "sig_01.png",
@@ -10,7 +10,6 @@ from sigdetect.cropping import SignatureCrop, crop_signatures
10
10
  from sigdetect.detector.pypdf2_engine import PyPDF2Detector
11
11
 
12
12
  pytest.importorskip("fitz")
13
- pytest.importorskip("docx")
14
13
 
15
14
 
16
15
  def _pdf_with_signature(path: Path) -> None:
@@ -59,9 +58,31 @@ def test_crop_signatures(tmp_path: Path):
59
58
  if sig.BoundingBox:
60
59
  assert sig.CropPath is not None
61
60
  crop_path = Path(sig.CropPath)
61
+ assert crop_path.suffix == ".png"
62
+ assert crop_path.exists()
63
+ assert not crop_path.with_suffix(".docx").exists()
64
+ assert sig.CropDocxPath is None
65
+
66
+
67
+ def test_crop_signatures_docx_toggle(tmp_path: Path) -> None:
68
+ pdf_path = tmp_path / "doc.pdf"
69
+ _pdf_with_signature(pdf_path)
70
+
71
+ cfg = DetectConfiguration(pdf_root=tmp_path, out_dir=tmp_path, engine="pypdf2")
72
+ result = PyPDF2Detector(cfg).Detect(pdf_path)
73
+
74
+ out_dir = tmp_path / "crops_docx"
75
+ generated = crop_signatures(pdf_path, result, output_dir=out_dir, dpi=120, docx=True)
76
+
77
+ assert generated, "Expected at least one cropped docx"
78
+ for sig in result.Signatures:
79
+ if sig.BoundingBox:
80
+ assert sig.CropDocxPath is not None
81
+ crop_path = Path(sig.CropDocxPath)
62
82
  assert crop_path.suffix == ".docx"
63
83
  assert crop_path.exists()
64
84
  assert not crop_path.with_suffix(".png").exists()
85
+ assert sig.CropPath is None
65
86
 
66
87
 
67
88
  def test_crop_signature_images_accepts_dict(tmp_path: Path) -> None:
@@ -74,6 +95,8 @@ def test_crop_signature_images_accepts_dict(tmp_path: Path) -> None:
74
95
 
75
96
  assert paths
76
97
  assert result_dict["signatures"][0]["crop_path"] is not None
98
+ assert result_dict["signatures"][0]["crop_path"].endswith(".png")
99
+ assert result_dict["signatures"][0]["crop_docx_path"] is None
77
100
 
78
101
 
79
102
  def test_crop_signature_images_returns_bytes(tmp_path: Path) -> None:
@@ -92,9 +115,33 @@ def test_crop_signature_images_returns_bytes(tmp_path: Path) -> None:
92
115
  assert crops
93
116
  assert isinstance(crops[0], SignatureCrop)
94
117
  assert crops[0].image_bytes
95
- assert crops[0].docx_bytes
118
+ assert crops[0].docx_bytes is None
96
119
  assert result_dict["signatures"][0]["crop_path"] is not None
97
- assert result_dict["signatures"][0]["crop_path"].endswith(".docx")
120
+ assert result_dict["signatures"][0]["crop_path"].endswith(".png")
121
+ assert result_dict["signatures"][0]["crop_docx_path"] is None
122
+
123
+
124
+ def test_crop_signature_images_returns_bytes_docx(tmp_path: Path) -> None:
125
+ pdf_path = tmp_path / "doc.pdf"
126
+ _pdf_with_signature(pdf_path)
127
+
128
+ result_dict = DetectPdf(pdf_path, engineName="pymupdf")
129
+ out_dir = tmp_path / "dict_docx_crops"
130
+ crops = CropSignatureImages(
131
+ pdf_path,
132
+ result_dict,
133
+ outputDirectory=out_dir,
134
+ returnBytes=True,
135
+ docx=True,
136
+ )
137
+
138
+ assert crops
139
+ assert isinstance(crops[0], SignatureCrop)
140
+ assert crops[0].image_bytes
141
+ assert crops[0].docx_bytes
142
+ assert result_dict["signatures"][0]["crop_docx_path"] is not None
143
+ assert result_dict["signatures"][0]["crop_docx_path"].endswith(".docx")
144
+ assert result_dict["signatures"][0]["crop_path"] is None
98
145
 
99
146
 
100
147
  def test_crop_signature_images_can_skip_disk(tmp_path: Path) -> None:
@@ -111,6 +158,33 @@ def test_crop_signature_images_can_skip_disk(tmp_path: Path) -> None:
111
158
  saveToDisk=False,
112
159
  )
113
160
 
161
+ assert crops
162
+ first_crop = crops[0]
163
+ assert isinstance(first_crop, SignatureCrop)
164
+ assert first_crop.image_bytes
165
+ assert first_crop.docx_bytes is None
166
+ assert first_crop.saved_to_disk is False
167
+ assert not first_crop.path.exists()
168
+ assert not first_crop.path.with_suffix(".docx").exists()
169
+ assert result_dict["signatures"][0]["crop_path"] is None
170
+ assert result_dict["signatures"][0]["crop_docx_path"] is None
171
+
172
+
173
+ def test_crop_signature_images_can_skip_disk_docx(tmp_path: Path) -> None:
174
+ pdf_path = tmp_path / "doc.pdf"
175
+ _pdf_with_signature(pdf_path)
176
+
177
+ result_dict = DetectPdf(pdf_path, engineName="pymupdf")
178
+ out_dir = tmp_path / "dict_docx_crops_no_disk"
179
+ crops = CropSignatureImages(
180
+ pdf_path,
181
+ result_dict,
182
+ outputDirectory=out_dir,
183
+ returnBytes=True,
184
+ saveToDisk=False,
185
+ docx=True,
186
+ )
187
+
114
188
  assert crops
115
189
  first_crop = crops[0]
116
190
  assert isinstance(first_crop, SignatureCrop)
@@ -118,8 +192,10 @@ def test_crop_signature_images_can_skip_disk(tmp_path: Path) -> None:
118
192
  assert first_crop.docx_bytes
119
193
  assert first_crop.saved_to_disk is False
120
194
  assert not first_crop.path.exists()
195
+ assert first_crop.path.suffix == ".docx"
121
196
  assert not first_crop.path.with_suffix(".png").exists()
122
197
  assert result_dict["signatures"][0]["crop_path"] is None
198
+ assert result_dict["signatures"][0]["crop_docx_path"] is None
123
199
 
124
200
 
125
201
  def test_crop_signatures_returns_bytes(tmp_path: Path) -> None:
@@ -141,10 +217,10 @@ def test_crop_signatures_returns_bytes(tmp_path: Path) -> None:
141
217
  assert crops
142
218
  assert isinstance(crops[0], SignatureCrop)
143
219
  assert crops[0].path.exists()
144
- assert crops[0].path.suffix == ".docx"
145
- assert not crops[0].path.with_suffix(".png").exists()
220
+ assert crops[0].path.suffix == ".png"
221
+ assert not crops[0].path.with_suffix(".docx").exists()
146
222
  assert crops[0].image_bytes
147
- assert crops[0].docx_bytes
223
+ assert crops[0].docx_bytes is None
148
224
 
149
225
 
150
226
  def test_crop_signatures_requires_save_or_bytes(tmp_path: Path) -> None:
File without changes
File without changes