PyPI - sigdetect - Versions diffs - 0.5.0__tar.gz → 0.5.2__tar.gz - Mend

sigdetect 0.5.0.tar.gz → 0.5.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

{sigdetect-0.5.0 → sigdetect-0.5.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sigdetect
-Version: 0.5.0
+Version: 0.5.2
 Summary: Signature detection and role attribution for PDFs
 Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
 License: MIT
@@ -105,7 +105,7 @@ sigdetect detect \
   - `retainer` → client / firm (prefers detecting two signatures)
 - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
 - Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
-- Cropping (`--crop-signatures`) writes a one-image `.docx` per signature in the crop output directory (no PNG files on disk); `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` for in-memory use. PyMuPDF is required for crops, and `python-docx` is required for `.docx` output.
+- Cropping (`--crop-signatures`) writes PNG crops to disk by default; enable `--crop-docx` to write DOCX files instead of PNGs. `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` and, when `--crop-docx` is enabled, embeds DOCX bytes in `signatures[].crop_docx_bytes`. PyMuPDF is required for crops, and `python-docx` is required for DOCX output.
 - Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
 - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
@@ -142,7 +142,7 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
 print(result.to_dict())
 ~~~
-`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When cropping is enabled, `crop_path` points at the generated `.docx`. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
+`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image; when DOCX cropping is enabled, `crop_docx_path` points at the generated doc. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
 ---
@@ -194,7 +194,7 @@ for res in ScanDirectory(
     # store in DB, print, etc.
     pass
-# 3) Create DOCX crops for FileResult objects (requires PyMuPDF + python-docx)
+# 3) Crop signature snippets for FileResult objects (requires PyMuPDF; DOCX needs python-docx)
 detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
 file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
 CropSignatureImages(
@@ -233,7 +233,8 @@ High-level summary (per file):
       "hint": "AcroSig:sig_patient",
       "render_type": "typed",
       "bounding_box": [10.0, 10.0, 150.0, 40.0],
-      "crop_path": "signature_crops/example/sig_01_patient.docx"
+      "crop_path": "signature_crops/example/sig_01_patient.png",
+      "crop_docx_path": null
     },
     {
       "page": null,
@@ -259,8 +260,10 @@ High-level summary (per file):
 - **`roles`** summarizes unique non-`unknown` roles across signatures.
 - In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
 - **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
-- **`signatures[].crop_path`** is populated when DOCX crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
+- **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
+- **`signatures[].crop_docx_path`** is populated when DOCX crops are generated (`--crop-docx` or `docx=True`).
 - **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
+- **`signatures[].crop_docx_bytes`** contains base64 DOCX data when `--crop-docx` and `--crop-bytes` are enabled together.
 ---
@@ -287,7 +290,8 @@ write_results: false
 pseudo_signatures: true
 recurse_xobjects: true
 profile: retainer    # or: hipaa
-crop_signatures: false   # enable to write DOCX crops (requires pymupdf + python-docx)
+crop_signatures: false   # enable to write PNG crops (requires pymupdf)
+crop_docx: false         # enable to write DOCX crops instead of PNGs (requires python-docx)
 # crop_output_dir: ./signature_crops
 crop_image_dpi: 200
 detect_wet_signatures: false   # kept for compatibility; non-e-sign PDFs still trigger OCR

{sigdetect-0.5.0 → sigdetect-0.5.2}/README.md RENAMED Viewed

@@ -87,7 +87,7 @@ sigdetect detect \
   - `retainer` → client / firm (prefers detecting two signatures)
 - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
 - Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
-- Cropping (`--crop-signatures`) writes a one-image `.docx` per signature in the crop output directory (no PNG files on disk); `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` for in-memory use. PyMuPDF is required for crops, and `python-docx` is required for `.docx` output.
+- Cropping (`--crop-signatures`) writes PNG crops to disk by default; enable `--crop-docx` to write DOCX files instead of PNGs. `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` and, when `--crop-docx` is enabled, embeds DOCX bytes in `signatures[].crop_docx_bytes`. PyMuPDF is required for crops, and `python-docx` is required for DOCX output.
 - Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
 - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
@@ -124,7 +124,7 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
 print(result.to_dict())
 ~~~
-`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When cropping is enabled, `crop_path` points at the generated `.docx`. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
+`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image; when DOCX cropping is enabled, `crop_docx_path` points at the generated doc. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
 ---
@@ -176,7 +176,7 @@ for res in ScanDirectory(
     # store in DB, print, etc.
     pass
-# 3) Create DOCX crops for FileResult objects (requires PyMuPDF + python-docx)
+# 3) Crop signature snippets for FileResult objects (requires PyMuPDF; DOCX needs python-docx)
 detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
 file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
 CropSignatureImages(
@@ -215,7 +215,8 @@ High-level summary (per file):
       "hint": "AcroSig:sig_patient",
       "render_type": "typed",
       "bounding_box": [10.0, 10.0, 150.0, 40.0],
-      "crop_path": "signature_crops/example/sig_01_patient.docx"
+      "crop_path": "signature_crops/example/sig_01_patient.png",
+      "crop_docx_path": null
     },
     {
       "page": null,
@@ -241,8 +242,10 @@ High-level summary (per file):
 - **`roles`** summarizes unique non-`unknown` roles across signatures.
 - In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
 - **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
-- **`signatures[].crop_path`** is populated when DOCX crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
+- **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
+- **`signatures[].crop_docx_path`** is populated when DOCX crops are generated (`--crop-docx` or `docx=True`).
 - **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
+- **`signatures[].crop_docx_bytes`** contains base64 DOCX data when `--crop-docx` and `--crop-bytes` are enabled together.
 ---
@@ -269,7 +272,8 @@ write_results: false
 pseudo_signatures: true
 recurse_xobjects: true
 profile: retainer    # or: hipaa
-crop_signatures: false   # enable to write DOCX crops (requires pymupdf + python-docx)
+crop_signatures: false   # enable to write PNG crops (requires pymupdf)
+crop_docx: false         # enable to write DOCX crops instead of PNGs (requires python-docx)
 # crop_output_dir: ./signature_crops
 crop_image_dpi: 200
 detect_wet_signatures: false   # kept for compatibility; non-e-sign PDFs still trigger OCR

{sigdetect-0.5.0 → sigdetect-0.5.2}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "sigdetect"
-version = "0.5.0"
+version = "0.5.2"
 description = "Signature detection and role attribution for PDFs"
 readme = "README.md"
 authors = [{ name = "BT Asmamaw", email = "basmamaw@angeiongroup.com" }]

{sigdetect-0.5.0 → sigdetect-0.5.2}/src/sigdetect/api.py RENAMED Viewed

@@ -229,6 +229,7 @@ def CropSignatureImages(
     dpi: int = 200,
     returnBytes: Literal[False] = False,
     saveToDisk: bool = True,
+    docx: bool = False,
 ) -> list[Path]: ...
@@ -241,6 +242,7 @@ def CropSignatureImages(
     dpi: int,
     returnBytes: Literal[True],
     saveToDisk: bool,
+    docx: bool = False,
 ) -> list[SignatureCrop]: ...
@@ -252,16 +254,17 @@ def CropSignatureImages(
     dpi: int = 200,
     returnBytes: bool = False,
     saveToDisk: bool = True,
+    docx: bool = False,
 ) -> list[Path] | list[SignatureCrop]:
-    """Create DOCX files containing cropped signature images.
+    """Create PNG files containing cropped signature images (or DOCX when enabled).
     Accepts either a :class:`FileResult` instance or the ``dict`` returned by
     :func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.
     Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop. Set
     ``saveToDisk=False`` to skip writing PNG files while still returning in-memory data.
-    When ``saveToDisk`` is enabled, a one-image DOCX file is also written per crop. When
-    ``returnBytes`` is True and ``python-docx`` is available, the returned
-    :class:`SignatureCrop` objects include ``docx_bytes``.
+    When ``docx`` is True, DOCX files are written instead of PNG files. When ``returnBytes`` is
+    True and ``docx`` is enabled, the returned :class:`SignatureCrop` objects include
+    ``docx_bytes``.
     """
     from sigdetect.cropping import crop_signatures
@@ -274,6 +277,7 @@ def CropSignatureImages(
         dpi=dpi,
         return_bytes=returnBytes,
         save_files=saveToDisk,
+        docx=docx,
     )
     if original_dict is not None:
         original_dict.clear()
@@ -305,6 +309,8 @@ def _CoerceFileResult(
                 BoundingBox=tuple(bbox) if bbox else None,
                 CropPath=entry.get("crop_path"),
                 CropBytes=entry.get("crop_bytes"),
+                CropDocxPath=entry.get("crop_docx_path"),
+                CropDocxBytes=entry.get("crop_docx_bytes"),
             )
         )

{sigdetect-0.5.0 → sigdetect-0.5.2}/src/sigdetect/cli.py RENAMED Viewed

@@ -64,13 +64,19 @@ def Detect(
     cropSignatures: bool | None = typer.Option(
         None,
         "--crop-signatures/--no-crop-signatures",
-        help="Write DOCX files containing cropped signature images (requires PyMuPDF + python-docx)",
+        help="Write PNG crops for signature widgets (requires PyMuPDF)",
+        show_default=False,
+    ),
+    cropDocx: bool | None = typer.Option(
+        None,
+        "--crop-docx/--no-crop-docx",
+        help="Write DOCX crops instead of PNG files (requires PyMuPDF + python-docx)",
         show_default=False,
     ),
     cropDirectory: Path | None = typer.Option(
         None,
         "--crop-dir",
-        help="Directory for signature DOCX crops (defaults to out_dir/signature_crops)",
+        help="Directory for signature crops (defaults to out_dir/signature_crops)",
     ),
     cropDpi: int | None = typer.Option(
         None,
@@ -83,7 +89,7 @@ def Detect(
     cropBytes: bool = typer.Option(
         False,
         "--crop-bytes/--no-crop-bytes",
-        help="Embed base64 PNG bytes for signature crops in results JSON",
+        help="Embed base64 PNG bytes (and DOCX bytes when --crop-docx) in results JSON",
         show_default=False,
     ),
     detectWetSignatures: bool | None = typer.Option(
@@ -128,6 +134,8 @@ def Detect(
         overrides["WriteResults"] = writeResults
     if cropSignatures is not None:
         overrides["CropSignatures"] = cropSignatures
+    if cropDocx is not None:
+        overrides["CropDocx"] = cropDocx
     if cropDirectory is not None:
         overrides["CropOutputDirectory"] = cropDirectory
     if cropDpi is not None:
@@ -181,6 +189,7 @@ def Detect(
         base_dir = configuration.OutputDirectory or configuration.PdfRoot
         crop_dir = base_dir / "signature_crops"
     cropping_enabled = configuration.CropSignatures
+    docx_enabled = configuration.CropDocx
     cropping_available = True
     cropping_attempted = False
@@ -199,6 +208,7 @@ def Detect(
                     logger=Logger,
                     return_bytes=crop_bytes_enabled,
                     save_files=cropping_enabled,
+                    docx=docx_enabled,
                 )
                 cropping_attempted = True
                 if crop_bytes_enabled:
@@ -206,15 +216,18 @@ def Detect(
                         crop.signature.CropBytes = base64.b64encode(crop.image_bytes).decode(
                             "ascii"
                         )
+                        if crop.docx_bytes:
+                            crop.signature.CropDocxBytes = base64.b64encode(
+                                crop.docx_bytes
+                            ).decode("ascii")
             except SignatureCroppingUnavailable as exc:
                 cropping_available = False
                 Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
                 typer.echo(str(exc), err=True)
             except Exception as exc:  # pragma: no cover - defensive
-                Logger.warning(
-                    "Unexpected error while cropping signatures",
-                    extra={"error": str(exc)},
-                )
+                cropping_available = False
+                Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
+                typer.echo(str(exc), err=True)
         total_bboxes += sum(1 for sig in file_result.Signatures if sig.BoundingBox)

{sigdetect-0.5.0 → sigdetect-0.5.2}/src/sigdetect/config.py RENAMED Viewed

@@ -31,6 +31,7 @@ class DetectConfiguration(BaseModel):
     PseudoSignatures: bool = Field(default=True, alias="pseudo_signatures")
     RecurseXObjects: bool = Field(default=True, alias="recurse_xobjects")
     CropSignatures: bool = Field(default=True, alias="crop_signatures")
+    CropDocx: bool = Field(default=False, alias="crop_docx")
     CropOutputDirectory: Path | None = Field(default=None, alias="crop_output_dir")
     CropImageDpi: int = Field(default=200, alias="crop_image_dpi", ge=72, le=600)
     DetectWetSignatures: bool = Field(default=True, alias="detect_wet_signatures")
@@ -88,6 +89,10 @@ class DetectConfiguration(BaseModel):
     def crop_signatures(self) -> bool:  # pragma: no cover - simple passthrough
         return self.CropSignatures
+    @property
+    def crop_docx(self) -> bool:  # pragma: no cover - simple passthrough
+        return self.CropDocx
     @property
     def crop_output_dir(self) -> Path | None:  # pragma: no cover - simple passthrough
         return self.CropOutputDirectory
@@ -133,6 +138,7 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
     env_out_dir = os.getenv("SIGDETECT_OUT_DIR")
     env_profile = os.getenv("SIGDETECT_PROFILE")
     env_crop = os.getenv("SIGDETECT_CROP_SIGNATURES")
+    env_crop_docx = os.getenv("SIGDETECT_CROP_DOCX")
     env_crop_dir = os.getenv("SIGDETECT_CROP_DIR")
     env_crop_dpi = os.getenv("SIGDETECT_CROP_DPI")
     env_detect_wet = os.getenv("SIGDETECT_DETECT_WET")
@@ -159,6 +165,12 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
             raw_data["crop_signatures"] = True
         elif lowered in {"0", "false", "no", "off"}:
             raw_data["crop_signatures"] = False
+    if env_crop_docx is not None:
+        lowered = env_crop_docx.lower()
+        if lowered in {"1", "true", "yes", "on"}:
+            raw_data["crop_docx"] = True
+        elif lowered in {"0", "false", "no", "off"}:
+            raw_data["crop_docx"] = False
     if env_crop_dir:
         raw_data["crop_output_dir"] = env_crop_dir
     if env_crop_dpi:

sigdetect 0.5.0__tar.gz → 0.5.2__tar.gz

sigdetect 0.5.0.tar.gz → 0.5.2.tar.gz