PyPI - sigdetect - Versions diffs - 0.5.0__tar.gz → 0.5.1__tar.gz - Mend

sigdetect 0.5.0.tar.gz → 0.5.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

{sigdetect-0.5.0 → sigdetect-0.5.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sigdetect
-Version: 0.5.0
+Version: 0.5.1
 Summary: Signature detection and role attribution for PDFs
 Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
 License: MIT
@@ -105,7 +105,7 @@ sigdetect detect \
   - `retainer` → client / firm (prefers detecting two signatures)
 - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
 - Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
-- Cropping (`--crop-signatures`) writes a one-image `.docx` per signature in the crop output directory (no PNG files on disk); `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` for in-memory use. PyMuPDF is required for crops, and `python-docx` is required for `.docx` output.
+- Cropping (`--crop-signatures`) writes PNG crops to disk by default; enable `--crop-docx` to write DOCX files instead of PNGs. `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` and, when `--crop-docx` is enabled, embeds DOCX bytes in `signatures[].crop_docx_bytes`. PyMuPDF is required for crops, and `python-docx` is required for DOCX output.
 - Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
 - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
@@ -142,7 +142,7 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
 print(result.to_dict())
 ~~~
-`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When cropping is enabled, `crop_path` points at the generated `.docx`. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
+`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image; when DOCX cropping is enabled, `crop_docx_path` points at the generated doc. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
 ---
@@ -194,7 +194,7 @@ for res in ScanDirectory(
     # store in DB, print, etc.
     pass
-# 3) Create DOCX crops for FileResult objects (requires PyMuPDF + python-docx)
+# 3) Crop signature snippets for FileResult objects (requires PyMuPDF; DOCX needs python-docx)
 detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
 file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
 CropSignatureImages(
@@ -233,7 +233,8 @@ High-level summary (per file):
       "hint": "AcroSig:sig_patient",
       "render_type": "typed",
       "bounding_box": [10.0, 10.0, 150.0, 40.0],
-      "crop_path": "signature_crops/example/sig_01_patient.docx"
+      "crop_path": "signature_crops/example/sig_01_patient.png",
+      "crop_docx_path": null
     },
     {
       "page": null,
@@ -259,8 +260,10 @@ High-level summary (per file):
 - **`roles`** summarizes unique non-`unknown` roles across signatures.
 - In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
 - **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
-- **`signatures[].crop_path`** is populated when DOCX crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
+- **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
+- **`signatures[].crop_docx_path`** is populated when DOCX crops are generated (`--crop-docx` or `docx=True`).
 - **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
+- **`signatures[].crop_docx_bytes`** contains base64 DOCX data when `--crop-docx` and `--crop-bytes` are enabled together.
 ---
@@ -287,7 +290,8 @@ write_results: false
 pseudo_signatures: true
 recurse_xobjects: true
 profile: retainer    # or: hipaa
-crop_signatures: false   # enable to write DOCX crops (requires pymupdf + python-docx)
+crop_signatures: false   # enable to write PNG crops (requires pymupdf)
+crop_docx: false         # enable to write DOCX crops instead of PNGs (requires python-docx)
 # crop_output_dir: ./signature_crops
 crop_image_dpi: 200
 detect_wet_signatures: false   # kept for compatibility; non-e-sign PDFs still trigger OCR

{sigdetect-0.5.0 → sigdetect-0.5.1}/README.md RENAMED Viewed

@@ -87,7 +87,7 @@ sigdetect detect \
   - `retainer` → client / firm (prefers detecting two signatures)
 - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
 - Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
-- Cropping (`--crop-signatures`) writes a one-image `.docx` per signature in the crop output directory (no PNG files on disk); `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` for in-memory use. PyMuPDF is required for crops, and `python-docx` is required for `.docx` output.
+- Cropping (`--crop-signatures`) writes PNG crops to disk by default; enable `--crop-docx` to write DOCX files instead of PNGs. `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` and, when `--crop-docx` is enabled, embeds DOCX bytes in `signatures[].crop_docx_bytes`. PyMuPDF is required for crops, and `python-docx` is required for DOCX output.
 - Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
 - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
@@ -124,7 +124,7 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
 print(result.to_dict())
 ~~~
-`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When cropping is enabled, `crop_path` points at the generated `.docx`. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
+`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image; when DOCX cropping is enabled, `crop_docx_path` points at the generated doc. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
 ---
@@ -176,7 +176,7 @@ for res in ScanDirectory(
     # store in DB, print, etc.
     pass
-# 3) Create DOCX crops for FileResult objects (requires PyMuPDF + python-docx)
+# 3) Crop signature snippets for FileResult objects (requires PyMuPDF; DOCX needs python-docx)
 detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
 file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
 CropSignatureImages(
@@ -215,7 +215,8 @@ High-level summary (per file):
       "hint": "AcroSig:sig_patient",
       "render_type": "typed",
       "bounding_box": [10.0, 10.0, 150.0, 40.0],
-      "crop_path": "signature_crops/example/sig_01_patient.docx"
+      "crop_path": "signature_crops/example/sig_01_patient.png",
+      "crop_docx_path": null
     },
     {
       "page": null,
@@ -241,8 +242,10 @@ High-level summary (per file):
 - **`roles`** summarizes unique non-`unknown` roles across signatures.
 - In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
 - **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
-- **`signatures[].crop_path`** is populated when DOCX crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
+- **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
+- **`signatures[].crop_docx_path`** is populated when DOCX crops are generated (`--crop-docx` or `docx=True`).
 - **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
+- **`signatures[].crop_docx_bytes`** contains base64 DOCX data when `--crop-docx` and `--crop-bytes` are enabled together.
 ---
@@ -269,7 +272,8 @@ write_results: false
 pseudo_signatures: true
 recurse_xobjects: true
 profile: retainer    # or: hipaa
-crop_signatures: false   # enable to write DOCX crops (requires pymupdf + python-docx)
+crop_signatures: false   # enable to write PNG crops (requires pymupdf)
+crop_docx: false         # enable to write DOCX crops instead of PNGs (requires python-docx)
 # crop_output_dir: ./signature_crops
 crop_image_dpi: 200
 detect_wet_signatures: false   # kept for compatibility; non-e-sign PDFs still trigger OCR

{sigdetect-0.5.0 → sigdetect-0.5.1}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "sigdetect"
-version = "0.5.0"
+version = "0.5.1"
 description = "Signature detection and role attribution for PDFs"
 readme = "README.md"
 authors = [{ name = "BT Asmamaw", email = "basmamaw@angeiongroup.com" }]

{sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/api.py RENAMED Viewed

@@ -229,6 +229,7 @@ def CropSignatureImages(
     dpi: int = 200,
     returnBytes: Literal[False] = False,
     saveToDisk: bool = True,
+    docx: bool = False,
 ) -> list[Path]: ...
@@ -241,6 +242,7 @@ def CropSignatureImages(
     dpi: int,
     returnBytes: Literal[True],
     saveToDisk: bool,
+    docx: bool = False,
 ) -> list[SignatureCrop]: ...
@@ -252,16 +254,17 @@ def CropSignatureImages(
     dpi: int = 200,
     returnBytes: bool = False,
     saveToDisk: bool = True,
+    docx: bool = False,
 ) -> list[Path] | list[SignatureCrop]:
-    """Create DOCX files containing cropped signature images.
+    """Create PNG files containing cropped signature images (or DOCX when enabled).
     Accepts either a :class:`FileResult` instance or the ``dict`` returned by
     :func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.
     Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop. Set
     ``saveToDisk=False`` to skip writing PNG files while still returning in-memory data.
-    When ``saveToDisk`` is enabled, a one-image DOCX file is also written per crop. When
-    ``returnBytes`` is True and ``python-docx`` is available, the returned
-    :class:`SignatureCrop` objects include ``docx_bytes``.
+    When ``docx`` is True, DOCX files are written instead of PNG files. When ``returnBytes`` is
+    True and ``docx`` is enabled, the returned :class:`SignatureCrop` objects include
+    ``docx_bytes``.
     """
     from sigdetect.cropping import crop_signatures
@@ -274,6 +277,7 @@ def CropSignatureImages(
         dpi=dpi,
         return_bytes=returnBytes,
         save_files=saveToDisk,
+        docx=docx,
     )
     if original_dict is not None:
         original_dict.clear()
@@ -305,6 +309,8 @@ def _CoerceFileResult(
                 BoundingBox=tuple(bbox) if bbox else None,
                 CropPath=entry.get("crop_path"),
                 CropBytes=entry.get("crop_bytes"),
+                CropDocxPath=entry.get("crop_docx_path"),
+                CropDocxBytes=entry.get("crop_docx_bytes"),
             )
         )

{sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/cli.py RENAMED Viewed

@@ -64,13 +64,19 @@ def Detect(
     cropSignatures: bool | None = typer.Option(
         None,
         "--crop-signatures/--no-crop-signatures",
-        help="Write DOCX files containing cropped signature images (requires PyMuPDF + python-docx)",
+        help="Write PNG crops for signature widgets (requires PyMuPDF)",
+        show_default=False,
+    ),
+    cropDocx: bool | None = typer.Option(
+        None,
+        "--crop-docx/--no-crop-docx",
+        help="Write DOCX crops instead of PNG files (requires PyMuPDF + python-docx)",
         show_default=False,
     ),
     cropDirectory: Path | None = typer.Option(
         None,
         "--crop-dir",
-        help="Directory for signature DOCX crops (defaults to out_dir/signature_crops)",
+        help="Directory for signature crops (defaults to out_dir/signature_crops)",
     ),
     cropDpi: int | None = typer.Option(
         None,
@@ -83,7 +89,7 @@ def Detect(
     cropBytes: bool = typer.Option(
         False,
         "--crop-bytes/--no-crop-bytes",
-        help="Embed base64 PNG bytes for signature crops in results JSON",
+        help="Embed base64 PNG bytes (and DOCX bytes when --crop-docx) in results JSON",
         show_default=False,
     ),
     detectWetSignatures: bool | None = typer.Option(
@@ -128,6 +134,8 @@ def Detect(
         overrides["WriteResults"] = writeResults
     if cropSignatures is not None:
         overrides["CropSignatures"] = cropSignatures
+    if cropDocx is not None:
+        overrides["CropDocx"] = cropDocx
     if cropDirectory is not None:
         overrides["CropOutputDirectory"] = cropDirectory
     if cropDpi is not None:
@@ -181,6 +189,7 @@ def Detect(
         base_dir = configuration.OutputDirectory or configuration.PdfRoot
         crop_dir = base_dir / "signature_crops"
     cropping_enabled = configuration.CropSignatures
+    docx_enabled = configuration.CropDocx
     cropping_available = True
     cropping_attempted = False
@@ -199,6 +208,7 @@ def Detect(
                     logger=Logger,
                     return_bytes=crop_bytes_enabled,
                     save_files=cropping_enabled,
+                    docx=docx_enabled,
                 )
                 cropping_attempted = True
                 if crop_bytes_enabled:
@@ -206,15 +216,18 @@ def Detect(
                         crop.signature.CropBytes = base64.b64encode(crop.image_bytes).decode(
                             "ascii"
                         )
+                        if crop.docx_bytes:
+                            crop.signature.CropDocxBytes = base64.b64encode(
+                                crop.docx_bytes
+                            ).decode("ascii")
             except SignatureCroppingUnavailable as exc:
                 cropping_available = False
                 Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
                 typer.echo(str(exc), err=True)
             except Exception as exc:  # pragma: no cover - defensive
-                Logger.warning(
-                    "Unexpected error while cropping signatures",
-                    extra={"error": str(exc)},
-                )
+                cropping_available = False
+                Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
+                typer.echo(str(exc), err=True)
         total_bboxes += sum(1 for sig in file_result.Signatures if sig.BoundingBox)

{sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/config.py RENAMED Viewed

@@ -31,6 +31,7 @@ class DetectConfiguration(BaseModel):
     PseudoSignatures: bool = Field(default=True, alias="pseudo_signatures")
     RecurseXObjects: bool = Field(default=True, alias="recurse_xobjects")
     CropSignatures: bool = Field(default=True, alias="crop_signatures")
+    CropDocx: bool = Field(default=False, alias="crop_docx")
     CropOutputDirectory: Path | None = Field(default=None, alias="crop_output_dir")
     CropImageDpi: int = Field(default=200, alias="crop_image_dpi", ge=72, le=600)
     DetectWetSignatures: bool = Field(default=True, alias="detect_wet_signatures")
@@ -88,6 +89,10 @@ class DetectConfiguration(BaseModel):
     def crop_signatures(self) -> bool:  # pragma: no cover - simple passthrough
         return self.CropSignatures
+    @property
+    def crop_docx(self) -> bool:  # pragma: no cover - simple passthrough
+        return self.CropDocx
     @property
     def crop_output_dir(self) -> Path | None:  # pragma: no cover - simple passthrough
         return self.CropOutputDirectory
@@ -133,6 +138,7 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
     env_out_dir = os.getenv("SIGDETECT_OUT_DIR")
     env_profile = os.getenv("SIGDETECT_PROFILE")
     env_crop = os.getenv("SIGDETECT_CROP_SIGNATURES")
+    env_crop_docx = os.getenv("SIGDETECT_CROP_DOCX")
     env_crop_dir = os.getenv("SIGDETECT_CROP_DIR")
     env_crop_dpi = os.getenv("SIGDETECT_CROP_DPI")
     env_detect_wet = os.getenv("SIGDETECT_DETECT_WET")
@@ -159,6 +165,12 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
             raw_data["crop_signatures"] = True
         elif lowered in {"0", "false", "no", "off"}:
             raw_data["crop_signatures"] = False
+    if env_crop_docx is not None:
+        lowered = env_crop_docx.lower()
+        if lowered in {"1", "true", "yes", "on"}:
+            raw_data["crop_docx"] = True
+        elif lowered in {"0", "false", "no", "off"}:
+            raw_data["crop_docx"] = False
     if env_crop_dir:
         raw_data["crop_output_dir"] = env_crop_dir
     if env_crop_dpi:

{sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/cropping.py RENAMED Viewed

@@ -1,4 +1,4 @@
-"""Helpers for converting signature bounding boxes into DOCX crops."""
+"""Helpers for converting signature bounding boxes into PNG or DOCX crops."""
 from __future__ import annotations
@@ -27,7 +27,7 @@ class SignatureCroppingUnavailable(RuntimeError):
     """Raised when PNG cropping cannot be performed (e.g., PyMuPDF missing)."""
-class SignatureDocxUnavailable(RuntimeError):
+class SignatureDocxUnavailable(SignatureCroppingUnavailable):
     """Raised when DOCX creation cannot be performed (e.g., python-docx missing)."""
@@ -52,6 +52,7 @@ def crop_signatures(
     logger: logging.Logger | None = None,
     return_bytes: Literal[False] = False,
     save_files: bool = True,
+    docx: bool = False,
 ) -> list[Path]: ...
@@ -65,6 +66,7 @@ def crop_signatures(
     logger: logging.Logger | None = None,
     return_bytes: Literal[True],
     save_files: bool = True,
+    docx: bool = False,
 ) -> list[SignatureCrop]: ...
@@ -77,14 +79,14 @@ def crop_signatures(
     logger: logging.Logger | None = None,
     return_bytes: bool = False,
     save_files: bool = True,
+    docx: bool = False,
 ) -> list[Path] | list[SignatureCrop]:
-    """Render each signature bounding box to a PNG image and wrap it in a DOCX file.
+    """Render each signature bounding box to a PNG image and optionally wrap it in DOCX.
     Set ``return_bytes=True`` to collect in-memory PNG bytes for each crop while also writing
     the files to ``output_dir``. Set ``save_files=False`` to skip writing PNGs to disk.
-    When ``save_files`` is enabled, a one-image DOCX file is also written per signature crop.
-    When ``return_bytes`` is True and ``python-docx`` is available, ``SignatureCrop.docx_bytes``
-    will contain the DOCX payload.
+    When ``docx=True``, DOCX files are written instead of PNGs. When ``return_bytes`` is True
+    and ``docx=True``, ``SignatureCrop.docx_bytes`` will contain the DOCX payload.
     """
     if fitz is None:  # pragma: no cover - exercised when dependency absent
@@ -101,14 +103,11 @@ def crop_signatures(
     generated_paths: list[Path] = []
     generated_crops: list[SignatureCrop] = []
-    docx_to_disk = save_files
-    docx_in_memory = return_bytes
-    docx_enabled = docx_to_disk or docx_in_memory
+    docx_enabled = docx
     docx_available = Document is not None
-    if docx_enabled and not docx_available and logger:
-        logger.warning(
-            "Signature DOCX output unavailable",
-            extra={"error": "python-docx is required to generate DOCX outputs"},
+    if docx_enabled and not docx_available:
+        raise SignatureDocxUnavailable(
+            "python-docx is required to generate DOCX outputs for signature crops."
         )
     with fitz.open(pdf_path) as document:  # type: ignore[attr-defined]
@@ -146,6 +145,8 @@ def crop_signatures(
             try:
                 image_bytes: bytes | None = None
                 pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
+                if save_files and not docx_enabled:
+                    pixmap.save(png_destination)
                 if return_bytes or docx_enabled:
                     image_bytes = pixmap.tobytes("png")
             except Exception as exc:  # pragma: no cover - defensive
@@ -162,12 +163,12 @@ def crop_signatures(
                 continue
             docx_bytes: bytes | None = None
-            if docx_enabled and docx_available:
+            if docx_enabled:
                 if image_bytes is None:  # pragma: no cover - defensive
                     continue
                 try:
                     docx_bytes = _build_docx_bytes(image_bytes)
-                    if docx_to_disk:
+                    if save_files:
                         docx_destination.write_bytes(docx_bytes)
                 except SignatureDocxUnavailable as exc:
                     if logger:
@@ -184,14 +185,20 @@ def crop_signatures(
                         )
             if save_files:
-                signature.CropPath = str(docx_destination)
-                generated_paths.append(docx_destination)
+                if docx_enabled:
+                    signature.CropPath = None
+                    signature.CropDocxPath = str(docx_destination)
+                    generated_paths.append(docx_destination)
+                else:
+                    signature.CropDocxPath = None
+                    signature.CropPath = str(png_destination)
+                    generated_paths.append(png_destination)
             if return_bytes:
                 if image_bytes is None:  # pragma: no cover - defensive
                     continue
                 generated_crops.append(
                     SignatureCrop(
-                        path=docx_destination,
+                        path=docx_destination if docx_enabled else png_destination,
                         image_bytes=image_bytes,
                         signature=signature,
                         docx_bytes=docx_bytes,

{sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect/detector/signature_model.py RENAMED Viewed

@@ -21,6 +21,8 @@ class Signature:
     BoundingBox: tuple[float, float, float, float] | None = None
     CropPath: str | None = None
     CropBytes: str | None = None
+    CropDocxPath: str | None = None
+    CropDocxBytes: str | None = None
     def to_dict(self) -> dict[str, Any]:
         """Return the legacy snake_case representation used in JSON payloads."""
@@ -37,4 +39,6 @@ class Signature:
             "bounding_box": list(self.BoundingBox) if self.BoundingBox else None,
             "crop_path": self.CropPath,
             "crop_bytes": self.CropBytes,
+            "crop_docx_path": self.CropDocxPath,
+            "crop_docx_bytes": self.CropDocxBytes,
         }

{sigdetect-0.5.0 → sigdetect-0.5.1}/src/sigdetect.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sigdetect
-Version: 0.5.0
+Version: 0.5.1
 Summary: Signature detection and role attribution for PDFs
 Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
 License: MIT
@@ -105,7 +105,7 @@ sigdetect detect \
   - `retainer` → client / firm (prefers detecting two signatures)
 - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
 - Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
-- Cropping (`--crop-signatures`) writes a one-image `.docx` per signature in the crop output directory (no PNG files on disk); `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` for in-memory use. PyMuPDF is required for crops, and `python-docx` is required for `.docx` output.
+- Cropping (`--crop-signatures`) writes PNG crops to disk by default; enable `--crop-docx` to write DOCX files instead of PNGs. `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` and, when `--crop-docx` is enabled, embeds DOCX bytes in `signatures[].crop_docx_bytes`. PyMuPDF is required for crops, and `python-docx` is required for DOCX output.
 - Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
 - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
@@ -142,7 +142,7 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
 print(result.to_dict())
 ~~~
-`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When cropping is enabled, `crop_path` points at the generated `.docx`. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
+`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image; when DOCX cropping is enabled, `crop_docx_path` points at the generated doc. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
 ---
@@ -194,7 +194,7 @@ for res in ScanDirectory(
     # store in DB, print, etc.
     pass
-# 3) Create DOCX crops for FileResult objects (requires PyMuPDF + python-docx)
+# 3) Crop signature snippets for FileResult objects (requires PyMuPDF; DOCX needs python-docx)
 detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
 file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
 CropSignatureImages(
@@ -233,7 +233,8 @@ High-level summary (per file):
       "hint": "AcroSig:sig_patient",
       "render_type": "typed",
       "bounding_box": [10.0, 10.0, 150.0, 40.0],
-      "crop_path": "signature_crops/example/sig_01_patient.docx"
+      "crop_path": "signature_crops/example/sig_01_patient.png",
+      "crop_docx_path": null
     },
     {
       "page": null,
@@ -259,8 +260,10 @@ High-level summary (per file):
 - **`roles`** summarizes unique non-`unknown` roles across signatures.
 - In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
 - **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
-- **`signatures[].crop_path`** is populated when DOCX crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
+- **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
+- **`signatures[].crop_docx_path`** is populated when DOCX crops are generated (`--crop-docx` or `docx=True`).
 - **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
+- **`signatures[].crop_docx_bytes`** contains base64 DOCX data when `--crop-docx` and `--crop-bytes` are enabled together.
 ---
@@ -287,7 +290,8 @@ write_results: false
 pseudo_signatures: true
 recurse_xobjects: true
 profile: retainer    # or: hipaa
-crop_signatures: false   # enable to write DOCX crops (requires pymupdf + python-docx)
+crop_signatures: false   # enable to write PNG crops (requires pymupdf)
+crop_docx: false         # enable to write DOCX crops instead of PNGs (requires python-docx)
 # crop_output_dir: ./signature_crops
 crop_image_dpi: 200
 detect_wet_signatures: false   # kept for compatibility; non-e-sign PDFs still trigger OCR

{sigdetect-0.5.0 → sigdetect-0.5.1}/tests/test_cli.py RENAMED Viewed

@@ -249,9 +249,11 @@ def test_detect_crop_bytes_embeds_base64(tmp_path: Path, monkeypatch) -> None:
         logger=None,
         return_bytes=False,
         save_files=True,
+        docx=False,
     ):
         assert return_bytes is True
         assert save_files is False
+        assert docx is False
         return [
             SignatureCrop(
                 path=Path(output_dir) / "sig_01.png",

{sigdetect-0.5.0 → sigdetect-0.5.1}/tests/test_cropping.py RENAMED Viewed

@@ -10,7 +10,6 @@ from sigdetect.cropping import SignatureCrop, crop_signatures
 from sigdetect.detector.pypdf2_engine import PyPDF2Detector
 pytest.importorskip("fitz")
-pytest.importorskip("docx")
 def _pdf_with_signature(path: Path) -> None:
@@ -59,9 +58,31 @@ def test_crop_signatures(tmp_path: Path):
         if sig.BoundingBox:
             assert sig.CropPath is not None
             crop_path = Path(sig.CropPath)
+            assert crop_path.suffix == ".png"
+            assert crop_path.exists()
+            assert not crop_path.with_suffix(".docx").exists()
+            assert sig.CropDocxPath is None
+def test_crop_signatures_docx_toggle(tmp_path: Path) -> None:
+    pdf_path = tmp_path / "doc.pdf"
+    _pdf_with_signature(pdf_path)
+    cfg = DetectConfiguration(pdf_root=tmp_path, out_dir=tmp_path, engine="pypdf2")
+    result = PyPDF2Detector(cfg).Detect(pdf_path)
+    out_dir = tmp_path / "crops_docx"
+    generated = crop_signatures(pdf_path, result, output_dir=out_dir, dpi=120, docx=True)
+    assert generated, "Expected at least one cropped docx"
+    for sig in result.Signatures:
+        if sig.BoundingBox:
+            assert sig.CropDocxPath is not None
+            crop_path = Path(sig.CropDocxPath)
             assert crop_path.suffix == ".docx"
             assert crop_path.exists()
             assert not crop_path.with_suffix(".png").exists()
+            assert sig.CropPath is None
 def test_crop_signature_images_accepts_dict(tmp_path: Path) -> None:
@@ -74,6 +95,8 @@ def test_crop_signature_images_accepts_dict(tmp_path: Path) -> None:
     assert paths
     assert result_dict["signatures"][0]["crop_path"] is not None
+    assert result_dict["signatures"][0]["crop_path"].endswith(".png")
+    assert result_dict["signatures"][0]["crop_docx_path"] is None
 def test_crop_signature_images_returns_bytes(tmp_path: Path) -> None:
@@ -92,9 +115,33 @@ def test_crop_signature_images_returns_bytes(tmp_path: Path) -> None:
     assert crops
     assert isinstance(crops[0], SignatureCrop)
     assert crops[0].image_bytes
-    assert crops[0].docx_bytes
+    assert crops[0].docx_bytes is None
     assert result_dict["signatures"][0]["crop_path"] is not None
-    assert result_dict["signatures"][0]["crop_path"].endswith(".docx")
+    assert result_dict["signatures"][0]["crop_path"].endswith(".png")
+    assert result_dict["signatures"][0]["crop_docx_path"] is None
+def test_crop_signature_images_returns_bytes_docx(tmp_path: Path) -> None:
+    pdf_path = tmp_path / "doc.pdf"
+    _pdf_with_signature(pdf_path)
+    result_dict = DetectPdf(pdf_path, engineName="pymupdf")
+    out_dir = tmp_path / "dict_docx_crops"
+    crops = CropSignatureImages(
+        pdf_path,
+        result_dict,
+        outputDirectory=out_dir,
+        returnBytes=True,
+        docx=True,
+    )
+    assert crops
+    assert isinstance(crops[0], SignatureCrop)
+    assert crops[0].image_bytes
+    assert crops[0].docx_bytes
+    assert result_dict["signatures"][0]["crop_docx_path"] is not None
+    assert result_dict["signatures"][0]["crop_docx_path"].endswith(".docx")
+    assert result_dict["signatures"][0]["crop_path"] is None
 def test_crop_signature_images_can_skip_disk(tmp_path: Path) -> None:
@@ -111,6 +158,33 @@ def test_crop_signature_images_can_skip_disk(tmp_path: Path) -> None:
         saveToDisk=False,
     )
+    assert crops
+    first_crop = crops[0]
+    assert isinstance(first_crop, SignatureCrop)
+    assert first_crop.image_bytes
+    assert first_crop.docx_bytes is None
+    assert first_crop.saved_to_disk is False
+    assert not first_crop.path.exists()
+    assert not first_crop.path.with_suffix(".docx").exists()
+    assert result_dict["signatures"][0]["crop_path"] is None
+    assert result_dict["signatures"][0]["crop_docx_path"] is None
+def test_crop_signature_images_can_skip_disk_docx(tmp_path: Path) -> None:
+    pdf_path = tmp_path / "doc.pdf"
+    _pdf_with_signature(pdf_path)
+    result_dict = DetectPdf(pdf_path, engineName="pymupdf")
+    out_dir = tmp_path / "dict_docx_crops_no_disk"
+    crops = CropSignatureImages(
+        pdf_path,
+        result_dict,
+        outputDirectory=out_dir,
+        returnBytes=True,
+        saveToDisk=False,
+        docx=True,
+    )
     assert crops
     first_crop = crops[0]
     assert isinstance(first_crop, SignatureCrop)
@@ -118,8 +192,10 @@ def test_crop_signature_images_can_skip_disk(tmp_path: Path) -> None:
     assert first_crop.docx_bytes
     assert first_crop.saved_to_disk is False
     assert not first_crop.path.exists()
+    assert first_crop.path.suffix == ".docx"
     assert not first_crop.path.with_suffix(".png").exists()
     assert result_dict["signatures"][0]["crop_path"] is None
+    assert result_dict["signatures"][0]["crop_docx_path"] is None
 def test_crop_signatures_returns_bytes(tmp_path: Path) -> None:
@@ -141,10 +217,10 @@ def test_crop_signatures_returns_bytes(tmp_path: Path) -> None:
     assert crops
     assert isinstance(crops[0], SignatureCrop)
     assert crops[0].path.exists()
-    assert crops[0].path.suffix == ".docx"
-    assert not crops[0].path.with_suffix(".png").exists()
+    assert crops[0].path.suffix == ".png"
+    assert not crops[0].path.with_suffix(".docx").exists()
     assert crops[0].image_bytes
-    assert crops[0].docx_bytes
+    assert crops[0].docx_bytes is None
 def test_crop_signatures_requires_save_or_bytes(tmp_path: Path) -> None: