sigdetect 0.3.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sigdetect/__init__.py CHANGED
@@ -21,4 +21,4 @@ try:
21
21
  except PackageNotFoundError: # pragma: no cover
22
22
  __version__ = "0.0.0"
23
23
 
24
- DEFAULT_ENGINE = "pypdf2"
24
+ DEFAULT_ENGINE = "auto"
sigdetect/api.py CHANGED
@@ -9,8 +9,9 @@ from typing import Any, Generator, Iterable, Iterator, Literal, overload
9
9
  from sigdetect.config import DetectConfiguration
10
10
  from sigdetect.cropping import SignatureCrop
11
11
  from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
12
+ from sigdetect.wet_detection import apply_wet_detection
12
13
 
13
- EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
14
+ EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
14
15
  ProfileName = Literal["hipaa", "retainer"]
15
16
 
16
17
 
@@ -18,12 +19,16 @@ def DetectPdf(
18
19
  pdfPath: str | Path,
19
20
  *,
20
21
  profileName: ProfileName = "hipaa",
21
- engineName: EngineName = "pypdf2",
22
+ engineName: EngineName = "auto",
22
23
  includePseudoSignatures: bool = True,
23
24
  recurseXObjects: bool = True,
25
+ runWetDetection: bool = True,
24
26
  detector: Detector | None = None,
25
27
  ) -> dict[str, Any]:
26
- """Detect signature evidence and assign roles for a single PDF."""
28
+ """Detect signature evidence and assign roles for a single PDF.
29
+
30
+ Wet detection runs by default for non-e-sign PDFs; pass ``runWetDetection=False`` to skip OCR.
31
+ """
27
32
 
28
33
  resolvedPath = Path(pdfPath)
29
34
  activeDetector = detector or get_detector(
@@ -36,6 +41,10 @@ def DetectPdf(
36
41
  )
37
42
 
38
43
  result = activeDetector.Detect(resolvedPath)
44
+ if runWetDetection:
45
+ configuration = _ResolveConfiguration(activeDetector)
46
+ if configuration is not None:
47
+ apply_wet_detection(resolvedPath, configuration, result)
39
48
  return _ToPlainDictionary(result)
40
49
 
41
50
 
@@ -43,12 +52,15 @@ def get_detector(
43
52
  *,
44
53
  pdfRoot: str | Path | None = None,
45
54
  profileName: ProfileName = "hipaa",
46
- engineName: EngineName = "pypdf2",
55
+ engineName: EngineName = "auto",
47
56
  includePseudoSignatures: bool = True,
48
57
  recurseXObjects: bool = True,
49
58
  outputDirectory: str | Path | None = None,
50
59
  ) -> Detector:
51
- """Return a reusable detector instance configured with the supplied options."""
60
+ """Return a reusable detector instance configured with the supplied options.
61
+
62
+ Engine selection is forced to ``auto`` (prefers PyMuPDF when available).
63
+ """
52
64
 
53
65
  configuration = DetectConfiguration(
54
66
  PdfRoot=Path(pdfRoot) if pdfRoot is not None else Path.cwd(),
@@ -108,6 +120,7 @@ def _ToPlainValue(value: Any) -> Any:
108
120
  def DetectMany(
109
121
  pdfPaths: Iterable[str | Path],
110
122
  *,
123
+ runWetDetection: bool = True,
111
124
  detector: Detector | None = None,
112
125
  **kwargs: Any,
113
126
  ) -> Iterator[dict[str, Any]]:
@@ -115,17 +128,18 @@ def DetectMany(
115
128
 
116
129
  if detector is not None:
117
130
  for pdfPath in pdfPaths:
118
- yield _DetectWithDetector(detector, pdfPath)
131
+ yield _DetectWithDetector(detector, pdfPath, runWetDetection=runWetDetection)
119
132
  return
120
133
 
121
134
  for pdfPath in pdfPaths:
122
- yield DetectPdf(pdfPath, **kwargs)
135
+ yield DetectPdf(pdfPath, runWetDetection=runWetDetection, **kwargs)
123
136
 
124
137
 
125
138
  def ScanDirectory(
126
139
  pdfRoot: str | Path,
127
140
  *,
128
141
  globPattern: str = "**/*.pdf",
142
+ runWetDetection: bool = True,
129
143
  detector: Detector | None = None,
130
144
  **kwargs: Any,
131
145
  ) -> Iterator[dict[str, Any]]:
@@ -143,7 +157,7 @@ def ScanDirectory(
143
157
 
144
158
  for pdfPath in iterator:
145
159
  if pdfPath.is_file() and pdfPath.suffix.lower() == ".pdf":
146
- yield DetectPdf(pdfPath, detector=detector, **kwargs)
160
+ yield DetectPdf(pdfPath, detector=detector, runWetDetection=runWetDetection, **kwargs)
147
161
 
148
162
 
149
163
  def ToCsvRow(result: dict[str, Any]) -> dict[str, Any]:
@@ -174,11 +188,25 @@ def Version() -> str:
174
188
  return "0.0.0-dev"
175
189
 
176
190
 
177
- def _DetectWithDetector(detector: Detector, pdfPath: str | Path) -> dict[str, Any]:
191
+ def _DetectWithDetector(
192
+ detector: Detector, pdfPath: str | Path, *, runWetDetection: bool
193
+ ) -> dict[str, Any]:
178
194
  """Helper that runs ``detector`` and returns the plain dictionary result."""
179
195
 
180
196
  resolvedPath = Path(pdfPath)
181
- return _ToPlainDictionary(detector.Detect(resolvedPath))
197
+ result = detector.Detect(resolvedPath)
198
+ if runWetDetection:
199
+ configuration = _ResolveConfiguration(detector)
200
+ if configuration is not None:
201
+ apply_wet_detection(resolvedPath, configuration, result)
202
+ return _ToPlainDictionary(result)
203
+
204
+
205
+ def _ResolveConfiguration(detector: Detector) -> DetectConfiguration | None:
206
+ configuration = getattr(detector, "Configuration", None)
207
+ if isinstance(configuration, DetectConfiguration):
208
+ return configuration
209
+ return None
182
210
 
183
211
 
184
212
  @contextmanager
@@ -225,12 +253,15 @@ def CropSignatureImages(
225
253
  returnBytes: bool = False,
226
254
  saveToDisk: bool = True,
227
255
  ) -> list[Path] | list[SignatureCrop]:
228
- """Crop detected signature regions to PNG files.
256
+ """Create DOCX files containing cropped signature images.
229
257
 
230
258
  Accepts either a :class:`FileResult` instance or the ``dict`` returned by
231
259
  :func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.
232
260
  Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop. Set
233
261
  ``saveToDisk=False`` to skip writing PNG files while still returning in-memory data.
262
+ When ``saveToDisk`` is enabled, a one-image DOCX file is also written per crop. When
263
+ ``returnBytes`` is True and ``python-docx`` is available, the returned
264
+ :class:`SignatureCrop` objects include ``docx_bytes``.
234
265
  """
235
266
 
236
267
  from sigdetect.cropping import crop_signatures
@@ -273,6 +304,7 @@ def _CoerceFileResult(
273
304
  RenderType=str(entry.get("render_type") or "unknown"),
274
305
  BoundingBox=tuple(bbox) if bbox else None,
275
306
  CropPath=entry.get("crop_path"),
307
+ CropBytes=entry.get("crop_bytes"),
276
308
  )
277
309
  )
278
310
 
sigdetect/cli.py CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import base64
5
6
  import json
6
7
  from collections.abc import Iterator
7
8
  from dataclasses import asdict, is_dataclass
@@ -15,6 +16,7 @@ from .cropping import SignatureCroppingUnavailable, crop_signatures
15
16
  from .detector import BuildDetector, FileResult
16
17
  from .eda import RunExploratoryAnalysis
17
18
  from .logging_setup import ConfigureLogging
19
+ from .wet_detection import apply_wet_detection
18
20
 
19
21
  Logger = ConfigureLogging()
20
22
 
@@ -47,6 +49,12 @@ def Detect(
47
49
  configurationPath: Path | None = typer.Option(
48
50
  None, "--config", "-c", help="Path to YAML config"
49
51
  ),
52
+ writeResults: bool | None = typer.Option(
53
+ None,
54
+ "--write-results/--no-write-results",
55
+ help="Write results.json (or JSON to stdout when out_dir is none)",
56
+ show_default=False,
57
+ ),
50
58
  profileOverride: str | None = typer.Option(None, "--profile", "-p", help="hipaa or retainer"),
51
59
  recursive: bool = typer.Option(
52
60
  True,
@@ -56,13 +64,13 @@ def Detect(
56
64
  cropSignatures: bool | None = typer.Option(
57
65
  None,
58
66
  "--crop-signatures/--no-crop-signatures",
59
- help="Crop detected signature regions to PNG files (requires PyMuPDF)",
67
+ help="Write DOCX files containing cropped signature images (requires PyMuPDF + python-docx)",
60
68
  show_default=False,
61
69
  ),
62
70
  cropDirectory: Path | None = typer.Option(
63
71
  None,
64
72
  "--crop-dir",
65
- help="Directory for signature PNG crops (defaults to out_dir/signature_crops)",
73
+ help="Directory for signature DOCX crops (defaults to out_dir/signature_crops)",
66
74
  ),
67
75
  cropDpi: int | None = typer.Option(
68
76
  None,
@@ -72,6 +80,39 @@ def Detect(
72
80
  help="Rendering DPI for signature crops",
73
81
  show_default=False,
74
82
  ),
83
+ cropBytes: bool = typer.Option(
84
+ False,
85
+ "--crop-bytes/--no-crop-bytes",
86
+ help="Embed base64 PNG bytes for signature crops in results JSON",
87
+ show_default=False,
88
+ ),
89
+ detectWetSignatures: bool | None = typer.Option(
90
+ None,
91
+ "--detect-wet/--no-detect-wet",
92
+ help="Compatibility flag; non-e-sign PDFs always run OCR when deps are available",
93
+ show_default=False,
94
+ ),
95
+ wetOcrDpi: int | None = typer.Option(
96
+ None,
97
+ "--wet-ocr-dpi",
98
+ min=72,
99
+ max=600,
100
+ help="Rendering DPI for OCR pages (wet detection)",
101
+ show_default=False,
102
+ ),
103
+ wetOcrLanguages: str | None = typer.Option(
104
+ None,
105
+ "--wet-ocr-languages",
106
+ help="Tesseract language packs for OCR (e.g., 'eng' or 'eng+spa')",
107
+ ),
108
+ wetPrecisionThreshold: float | None = typer.Option(
109
+ None,
110
+ "--wet-precision-threshold",
111
+ min=0.0,
112
+ max=1.0,
113
+ help="Minimum wet-signature confidence (0-1) to accept a candidate",
114
+ show_default=False,
115
+ ),
75
116
  ) -> None:
76
117
  """Run detection for the configured directory and emit ``results.json``."""
77
118
 
@@ -83,12 +124,22 @@ def Detect(
83
124
  configuration = configuration.model_copy(update={"Profile": normalized_profile})
84
125
 
85
126
  overrides: dict[str, object] = {}
127
+ if writeResults is not None:
128
+ overrides["WriteResults"] = writeResults
86
129
  if cropSignatures is not None:
87
130
  overrides["CropSignatures"] = cropSignatures
88
131
  if cropDirectory is not None:
89
132
  overrides["CropOutputDirectory"] = cropDirectory
90
133
  if cropDpi is not None:
91
134
  overrides["CropImageDpi"] = cropDpi
135
+ if detectWetSignatures is not None:
136
+ overrides["DetectWetSignatures"] = detectWetSignatures
137
+ if wetOcrDpi is not None:
138
+ overrides["WetOcrDpi"] = wetOcrDpi
139
+ if wetOcrLanguages is not None:
140
+ overrides["WetOcrLanguages"] = wetOcrLanguages
141
+ if wetPrecisionThreshold is not None:
142
+ overrides["WetPrecisionThreshold"] = wetPrecisionThreshold
92
143
  if overrides:
93
144
  configuration = configuration.model_copy(update=overrides)
94
145
  configuration = FinalizeConfiguration(configuration)
@@ -109,44 +160,52 @@ def Detect(
109
160
  except StopIteration:
110
161
  raise SystemExit(f"No PDFs found in {configuration.PdfRoot}") from None
111
162
 
112
- results_buffer: list[FileResult] | None = [] if configuration.OutputDirectory is None else None
163
+ write_results = configuration.WriteResults
164
+ results_buffer: list[FileResult] | None = (
165
+ [] if write_results and configuration.OutputDirectory is None else None
166
+ )
113
167
  json_handle = None
114
168
  json_path: Path | None = None
115
169
  wrote_first = False
116
170
 
117
- if configuration.OutputDirectory is not None:
171
+ if write_results and configuration.OutputDirectory is not None:
118
172
  outputDirectory = configuration.OutputDirectory
119
173
  outputDirectory.mkdir(parents=True, exist_ok=True)
120
174
  json_path = outputDirectory / "results.json"
121
175
  json_handle = open(json_path, "w", encoding="utf-8")
122
176
  json_handle.write("[")
123
177
 
178
+ crop_bytes_enabled = bool(cropBytes)
124
179
  crop_dir = configuration.CropOutputDirectory
180
+ if crop_dir is None:
181
+ base_dir = configuration.OutputDirectory or configuration.PdfRoot
182
+ crop_dir = base_dir / "signature_crops"
125
183
  cropping_enabled = configuration.CropSignatures
126
184
  cropping_available = True
127
185
  cropping_attempted = False
128
- if configuration.CropSignatures and crop_dir is None:
129
- Logger.warning(
130
- "CropSignatures enabled without an output directory",
131
- extra={"pdf_root": str(configuration.PdfRoot)},
132
- )
133
- cropping_enabled = False
134
186
 
135
187
  total_bboxes = 0
136
188
 
137
189
  def _append_result(file_result: FileResult, source_pdf: Path) -> None:
138
190
  nonlocal wrote_first, json_handle, total_bboxes, cropping_available, cropping_attempted
139
191
 
140
- if cropping_enabled and cropping_available and crop_dir is not None:
192
+ if cropping_available and (cropping_enabled or crop_bytes_enabled) and crop_dir is not None:
141
193
  try:
142
- crop_signatures(
194
+ crops = crop_signatures(
143
195
  pdf_path=source_pdf,
144
196
  file_result=file_result,
145
197
  output_dir=crop_dir,
146
198
  dpi=configuration.CropImageDpi,
147
199
  logger=Logger,
200
+ return_bytes=crop_bytes_enabled,
201
+ save_files=cropping_enabled,
148
202
  )
149
203
  cropping_attempted = True
204
+ if crop_bytes_enabled:
205
+ for crop in crops:
206
+ crop.signature.CropBytes = base64.b64encode(crop.image_bytes).decode(
207
+ "ascii"
208
+ )
150
209
  except SignatureCroppingUnavailable as exc:
151
210
  cropping_available = False
152
211
  Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
@@ -182,6 +241,7 @@ def Detect(
182
241
 
183
242
  def _process(pdf_path: Path) -> None:
184
243
  file_result = detector.Detect(pdf_path)
244
+ apply_wet_detection(pdf_path, configuration, file_result, logger=Logger)
185
245
  _append_result(file_result, pdf_path)
186
246
 
187
247
  try:
@@ -194,18 +254,24 @@ def Detect(
194
254
  json_handle.write(closing)
195
255
  json_handle.close()
196
256
 
197
- if json_handle is not None:
198
- typer.echo(f"Wrote {json_path}")
199
- else:
200
- payload = json.dumps(
201
- results_buffer or [], indent=2, ensure_ascii=False, default=_JsonSerializer
202
- )
203
- typer.echo(payload)
204
- typer.echo("Detection completed with output disabled (out_dir=none)")
205
-
206
- if cropping_enabled and cropping_available and cropping_attempted and total_bboxes == 0:
257
+ if write_results:
258
+ if json_handle is not None:
259
+ typer.echo(f"Wrote {json_path}")
260
+ else:
261
+ payload = json.dumps(
262
+ results_buffer or [], indent=2, ensure_ascii=False, default=_JsonSerializer
263
+ )
264
+ typer.echo(payload)
265
+ typer.echo("Detection completed with output disabled (out_dir=none)")
266
+
267
+ if (
268
+ (cropping_enabled or crop_bytes_enabled)
269
+ and cropping_available
270
+ and cropping_attempted
271
+ and total_bboxes == 0
272
+ ):
207
273
  Logger.warning(
208
- "No signature bounding boxes detected; try --engine pymupdf for crop-ready output",
274
+ "No signature bounding boxes detected; install PyMuPDF for crop-ready output",
209
275
  extra={"engine": configuration.Engine},
210
276
  )
211
277
 
sigdetect/config.py CHANGED
@@ -10,7 +10,7 @@ from typing import Literal
10
10
  import yaml
11
11
  from pydantic import BaseModel, ConfigDict, Field, field_validator
12
12
 
13
- EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
13
+ EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
14
14
  ProfileName = Literal["hipaa", "retainer"]
15
15
 
16
16
 
@@ -25,13 +25,20 @@ class DetectConfiguration(BaseModel):
25
25
 
26
26
  PdfRoot: Path = Field(default=Path("hipaa_results"), alias="pdf_root")
27
27
  OutputDirectory: Path | None = Field(default=Path("out"), alias="out_dir")
28
- Engine: EngineName = Field(default="pypdf2", alias="engine")
28
+ WriteResults: bool = Field(default=False, alias="write_results")
29
+ Engine: EngineName = Field(default="auto", alias="engine")
29
30
  Profile: ProfileName = Field(default="hipaa", alias="profile")
30
31
  PseudoSignatures: bool = Field(default=True, alias="pseudo_signatures")
31
32
  RecurseXObjects: bool = Field(default=True, alias="recurse_xobjects")
32
- CropSignatures: bool = Field(default=False, alias="crop_signatures")
33
+ CropSignatures: bool = Field(default=True, alias="crop_signatures")
33
34
  CropOutputDirectory: Path | None = Field(default=None, alias="crop_output_dir")
34
35
  CropImageDpi: int = Field(default=200, alias="crop_image_dpi", ge=72, le=600)
36
+ DetectWetSignatures: bool = Field(default=True, alias="detect_wet_signatures")
37
+ WetOcrDpi: int = Field(default=200, alias="wet_ocr_dpi", ge=72, le=600)
38
+ WetOcrLanguages: str = Field(default="eng", alias="wet_ocr_languages")
39
+ WetPrecisionThreshold: float = Field(
40
+ default=0.82, alias="wet_precision_threshold", ge=0.0, le=1.0
41
+ )
35
42
 
36
43
  @field_validator("PdfRoot", "OutputDirectory", "CropOutputDirectory", mode="before")
37
44
  @classmethod
@@ -57,6 +64,10 @@ class DetectConfiguration(BaseModel):
57
64
  def out_dir(self) -> Path | None: # pragma: no cover - simple passthrough
58
65
  return self.OutputDirectory
59
66
 
67
+ @property
68
+ def write_results(self) -> bool: # pragma: no cover - simple passthrough
69
+ return self.WriteResults
70
+
60
71
  @property
61
72
  def engine(self) -> EngineName: # pragma: no cover - simple passthrough
62
73
  return self.Engine
@@ -85,6 +96,22 @@ class DetectConfiguration(BaseModel):
85
96
  def crop_image_dpi(self) -> int: # pragma: no cover - simple passthrough
86
97
  return self.CropImageDpi
87
98
 
99
+ @property
100
+ def detect_wet_signatures(self) -> bool: # pragma: no cover - simple passthrough
101
+ return self.DetectWetSignatures
102
+
103
+ @property
104
+ def wet_ocr_dpi(self) -> int: # pragma: no cover - simple passthrough
105
+ return self.WetOcrDpi
106
+
107
+ @property
108
+ def wet_ocr_languages(self) -> str: # pragma: no cover - simple passthrough
109
+ return self.WetOcrLanguages
110
+
111
+ @property
112
+ def wet_precision_threshold(self) -> float: # pragma: no cover - simple passthrough
113
+ return self.WetPrecisionThreshold
114
+
88
115
 
89
116
  def LoadConfiguration(path: Path | None) -> DetectConfiguration:
90
117
  """Load configuration from ``path`` while applying environment overrides.
@@ -108,6 +135,10 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
108
135
  env_crop = os.getenv("SIGDETECT_CROP_SIGNATURES")
109
136
  env_crop_dir = os.getenv("SIGDETECT_CROP_DIR")
110
137
  env_crop_dpi = os.getenv("SIGDETECT_CROP_DPI")
138
+ env_detect_wet = os.getenv("SIGDETECT_DETECT_WET")
139
+ env_wet_dpi = os.getenv("SIGDETECT_WET_OCR_DPI")
140
+ env_wet_lang = os.getenv("SIGDETECT_WET_LANGUAGES")
141
+ env_wet_precision = os.getenv("SIGDETECT_WET_PRECISION")
111
142
 
112
143
  raw_data: dict[str, object] = {}
113
144
  if path and Path(path).exists():
@@ -133,6 +164,20 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
133
164
  if env_crop_dpi:
134
165
  with suppress(ValueError):
135
166
  raw_data["crop_image_dpi"] = int(env_crop_dpi)
167
+ if env_detect_wet is not None:
168
+ lowered = env_detect_wet.lower()
169
+ if lowered in {"1", "true", "yes", "on"}:
170
+ raw_data["detect_wet_signatures"] = True
171
+ elif lowered in {"0", "false", "no", "off"}:
172
+ raw_data["detect_wet_signatures"] = False
173
+ if env_wet_dpi:
174
+ with suppress(ValueError):
175
+ raw_data["wet_ocr_dpi"] = int(env_wet_dpi)
176
+ if env_wet_lang:
177
+ raw_data["wet_ocr_languages"] = env_wet_lang
178
+ if env_wet_precision:
179
+ with suppress(ValueError):
180
+ raw_data["wet_precision_threshold"] = float(env_wet_precision)
136
181
 
137
182
  configuration = DetectConfiguration(**raw_data)
138
183
  return FinalizeConfiguration(configuration)
sigdetect/cropping.py CHANGED
@@ -1,7 +1,8 @@
1
- """Helpers for converting signature bounding boxes into PNG crops."""
1
+ """Helpers for converting signature bounding boxes into DOCX crops."""
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import io
5
6
  import logging
6
7
  import re
7
8
  from dataclasses import dataclass
@@ -16,18 +17,28 @@ try: # pragma: no cover - optional dependency
16
17
  except Exception: # pragma: no cover - optional dependency
17
18
  fitz = None # type: ignore[misc]
18
19
 
20
+ try: # pragma: no cover - optional dependency
21
+ from docx import Document # type: ignore
22
+ except Exception: # pragma: no cover - optional dependency
23
+ Document = None # type: ignore[assignment]
24
+
19
25
 
20
26
  class SignatureCroppingUnavailable(RuntimeError):
21
27
  """Raised when PNG cropping cannot be performed (e.g., PyMuPDF missing)."""
22
28
 
23
29
 
30
+ class SignatureDocxUnavailable(RuntimeError):
31
+ """Raised when DOCX creation cannot be performed (e.g., python-docx missing)."""
32
+
33
+
24
34
  @dataclass(slots=True)
25
35
  class SignatureCrop:
26
- """PNG crop metadata and in-memory content."""
36
+ """Crop metadata and in-memory content."""
27
37
 
28
38
  path: Path
29
39
  image_bytes: bytes
30
40
  signature: Signature
41
+ docx_bytes: bytes | None = None
31
42
  saved_to_disk: bool = True
32
43
 
33
44
 
@@ -40,6 +51,7 @@ def crop_signatures(
40
51
  dpi: int = 200,
41
52
  logger: logging.Logger | None = None,
42
53
  return_bytes: Literal[False] = False,
54
+ save_files: bool = True,
43
55
  ) -> list[Path]: ...
44
56
 
45
57
 
@@ -51,7 +63,8 @@ def crop_signatures(
51
63
  output_dir: Path,
52
64
  dpi: int = 200,
53
65
  logger: logging.Logger | None = None,
54
- return_bytes: Literal[True] = True,
66
+ return_bytes: Literal[True],
67
+ save_files: bool = True,
55
68
  ) -> list[SignatureCrop]: ...
56
69
 
57
70
 
@@ -65,15 +78,18 @@ def crop_signatures(
65
78
  return_bytes: bool = False,
66
79
  save_files: bool = True,
67
80
  ) -> list[Path] | list[SignatureCrop]:
68
- """Render each signature bounding box to a PNG image using PyMuPDF.
81
+ """Render each signature bounding box to a PNG image and wrap it in a DOCX file.
69
82
 
70
83
  Set ``return_bytes=True`` to collect in-memory PNG bytes for each crop while also writing
71
84
  the files to ``output_dir``. Set ``save_files=False`` to skip writing PNGs to disk.
85
+ When ``save_files`` is enabled, a one-image DOCX file is also written per signature crop.
86
+ When ``return_bytes`` is True and ``python-docx`` is available, ``SignatureCrop.docx_bytes``
87
+ will contain the DOCX payload.
72
88
  """
73
89
 
74
90
  if fitz is None: # pragma: no cover - exercised when dependency absent
75
91
  raise SignatureCroppingUnavailable(
76
- "PyMuPDF is required for PNG crops. Install 'pymupdf' or 'sigdetect[pymupdf]'."
92
+ "PyMuPDF is required for PNG crops. Install 'pymupdf' or add it to your environment."
77
93
  )
78
94
  if not save_files and not return_bytes:
79
95
  raise ValueError("At least one of save_files or return_bytes must be True")
@@ -85,6 +101,16 @@ def crop_signatures(
85
101
  generated_paths: list[Path] = []
86
102
  generated_crops: list[SignatureCrop] = []
87
103
 
104
+ docx_to_disk = save_files
105
+ docx_in_memory = return_bytes
106
+ docx_enabled = docx_to_disk or docx_in_memory
107
+ docx_available = Document is not None
108
+ if docx_enabled and not docx_available and logger:
109
+ logger.warning(
110
+ "Signature DOCX output unavailable",
111
+ extra={"error": "python-docx is required to generate DOCX outputs"},
112
+ )
113
+
88
114
  with fitz.open(pdf_path) as document: # type: ignore[attr-defined]
89
115
  per_document_dir = output_dir / pdf_path.stem
90
116
  if save_files:
@@ -114,14 +140,13 @@ def crop_signatures(
114
140
  continue
115
141
 
116
142
  filename = _build_filename(index, signature)
117
- destination = per_document_dir / filename
143
+ png_destination = per_document_dir / filename
144
+ docx_destination = png_destination.with_suffix(".docx")
118
145
 
119
146
  try:
120
147
  image_bytes: bytes | None = None
121
148
  pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
122
- if save_files:
123
- pixmap.save(destination)
124
- if return_bytes:
149
+ if return_bytes or docx_enabled:
125
150
  image_bytes = pixmap.tobytes("png")
126
151
  except Exception as exc: # pragma: no cover - defensive
127
152
  if logger:
@@ -136,17 +161,40 @@ def crop_signatures(
136
161
  )
137
162
  continue
138
163
 
164
+ docx_bytes: bytes | None = None
165
+ if docx_enabled and docx_available:
166
+ if image_bytes is None: # pragma: no cover - defensive
167
+ continue
168
+ try:
169
+ docx_bytes = _build_docx_bytes(image_bytes)
170
+ if docx_to_disk:
171
+ docx_destination.write_bytes(docx_bytes)
172
+ except SignatureDocxUnavailable as exc:
173
+ if logger:
174
+ logger.warning(
175
+ "Signature DOCX output unavailable",
176
+ extra={"error": str(exc)},
177
+ )
178
+ docx_available = False
179
+ except Exception as exc: # pragma: no cover - defensive
180
+ if logger:
181
+ logger.warning(
182
+ "Failed to write signature DOCX",
183
+ extra={"file": pdf_path.name, "error": str(exc)},
184
+ )
185
+
139
186
  if save_files:
140
- signature.CropPath = str(destination)
141
- generated_paths.append(destination)
187
+ signature.CropPath = str(docx_destination)
188
+ generated_paths.append(docx_destination)
142
189
  if return_bytes:
143
190
  if image_bytes is None: # pragma: no cover - defensive
144
191
  continue
145
192
  generated_crops.append(
146
193
  SignatureCrop(
147
- path=destination,
194
+ path=docx_destination,
148
195
  image_bytes=image_bytes,
149
196
  signature=signature,
197
+ docx_bytes=docx_bytes,
150
198
  saved_to_disk=save_files,
151
199
  )
152
200
  )
@@ -154,6 +202,18 @@ def crop_signatures(
154
202
  return generated_crops if return_bytes else generated_paths
155
203
 
156
204
 
205
+ def _build_docx_bytes(image_bytes: bytes) -> bytes:
206
+ if Document is None:
207
+ raise SignatureDocxUnavailable(
208
+ "python-docx is required to generate DOCX outputs for signature crops."
209
+ )
210
+ document = Document()
211
+ document.add_picture(io.BytesIO(image_bytes))
212
+ buffer = io.BytesIO()
213
+ document.save(buffer)
214
+ return buffer.getvalue()
215
+
216
+
157
217
  def _to_clip_rect(page, bbox: tuple[float, float, float, float]):
158
218
  width = float(page.rect.width)
159
219
  height = float(page.rect.height)