sigdetect 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sigdetect/api.py CHANGED
@@ -2,11 +2,12 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ from contextlib import contextmanager
5
6
  from pathlib import Path
6
- from typing import Any, Iterable, Iterator, Literal
7
+ from typing import Any, Generator, Iterable, Iterator, Literal
7
8
 
8
9
  from sigdetect.config import DetectConfiguration
9
- from sigdetect.detector import BuildDetector
10
+ from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
10
11
 
11
12
  EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
12
13
  ProfileName = Literal["hipaa", "retainer"]
@@ -19,23 +20,44 @@ def DetectPdf(
19
20
  engineName: EngineName = "pypdf2",
20
21
  includePseudoSignatures: bool = True,
21
22
  recurseXObjects: bool = True,
23
+ detector: Detector | None = None,
22
24
  ) -> dict[str, Any]:
23
25
  """Detect signature evidence and assign roles for a single PDF."""
24
26
 
25
27
  resolvedPath = Path(pdfPath)
28
+ activeDetector = detector or get_detector(
29
+ pdfRoot=resolvedPath.parent,
30
+ profileName=profileName,
31
+ engineName=engineName,
32
+ includePseudoSignatures=includePseudoSignatures,
33
+ recurseXObjects=recurseXObjects,
34
+ outputDirectory=None,
35
+ )
36
+
37
+ result = activeDetector.Detect(resolvedPath)
38
+ return _ToPlainDictionary(result)
39
+
40
+
41
def get_detector(
    *,
    pdfRoot: str | Path | None = None,
    profileName: ProfileName = "hipaa",
    engineName: EngineName = "pypdf2",
    includePseudoSignatures: bool = True,
    recurseXObjects: bool = True,
    outputDirectory: str | Path | None = None,
) -> Detector:
    """Build a detector that can be reused across several PDFs.

    ``pdfRoot`` falls back to the current working directory when omitted and
    ``outputDirectory`` stays ``None`` when omitted. The remaining options are
    passed straight into :class:`DetectConfiguration`, and the resulting
    configuration is handed to :func:`BuildDetector`.
    """

    rootDirectory = Path.cwd() if pdfRoot is None else Path(pdfRoot)
    targetDirectory = None if outputDirectory is None else Path(outputDirectory)

    settings = DetectConfiguration(
        PdfRoot=rootDirectory,
        OutputDirectory=targetDirectory,
        Engine=engineName,
        PseudoSignatures=includePseudoSignatures,
        RecurseXObjects=recurseXObjects,
        Profile=profileName,
    )
    return BuildDetector(settings)
39
61
 
40
62
 
41
63
  def _ToPlainDictionary(candidate: Any) -> dict[str, Any]:
@@ -84,10 +106,17 @@ def _ToPlainValue(value: Any) -> Any:
84
106
 
85
107
def DetectMany(
    pdfPaths: Iterable[str | Path],
    *,
    detector: Detector | None = None,
    **kwargs: Any,
) -> Iterator[dict[str, Any]]:
    """Lazily yield one detection result per entry of ``pdfPaths``.

    When ``detector`` is supplied, every path is run through that instance and
    ``kwargs`` are not consulted. Otherwise each path is handed to
    :func:`DetectPdf` together with ``kwargs``.
    """

    if detector is None:
        for candidatePath in pdfPaths:
            yield DetectPdf(candidatePath, **kwargs)
    else:
        for candidatePath in pdfPaths:
            yield _DetectWithDetector(detector, candidatePath)
93
122
 
@@ -96,19 +125,24 @@ def ScanDirectory(
96
125
  pdfRoot: str | Path,
97
126
  *,
98
127
  globPattern: str = "**/*.pdf",
128
+ detector: Detector | None = None,
99
129
  **kwargs: Any,
100
130
  ) -> Iterator[dict[str, Any]]:
101
131
  """Walk ``pdfRoot`` and yield detection output for every matching PDF."""
102
132
 
103
133
  rootDirectory = Path(pdfRoot)
104
- iterator = (
105
- rootDirectory.rglob(globPattern.replace("**/", "", 1))
106
- if globPattern.startswith("**/")
107
- else rootDirectory.glob(globPattern)
108
- )
134
+ if globPattern == "**/*.pdf":
135
+ iterator = (path for path in rootDirectory.rglob("*") if path.is_file())
136
+ else:
137
+ iterator = (
138
+ rootDirectory.rglob(globPattern.replace("**/", "", 1))
139
+ if globPattern.startswith("**/")
140
+ else rootDirectory.glob(globPattern)
141
+ )
142
+
109
143
  for pdfPath in iterator:
110
144
  if pdfPath.is_file() and pdfPath.suffix.lower() == ".pdf":
111
- yield DetectPdf(pdfPath, **kwargs)
145
+ yield DetectPdf(pdfPath, detector=detector, **kwargs)
112
146
 
113
147
 
114
148
  def ToCsvRow(result: dict[str, Any]) -> dict[str, Any]:
@@ -137,3 +171,91 @@ def Version() -> str:
137
171
  return resolveVersion("sigdetect")
138
172
  except Exception:
139
173
  return "0.0.0-dev"
174
+
175
+
176
def _DetectWithDetector(detector: Detector, pdfPath: str | Path) -> dict[str, Any]:
    """Run ``detector`` on ``pdfPath`` and convert the outcome to a plain dictionary."""

    detectionResult = detector.Detect(Path(pdfPath))
    return _ToPlainDictionary(detectionResult)
181
+
182
+
183
@contextmanager
def detector_context(**kwargs: Any) -> Generator[Detector, None, None]:
    """Yield a detector built by :func:`get_detector` from ``kwargs``.

    No teardown is performed when the ``with`` block exits; exceptions raised
    inside the block propagate unchanged.
    """

    yield get_detector(**kwargs)
192
+
193
+
194
def CropSignatureImages(
    pdfPath: str | Path,
    fileResult: FileResult | dict[str, Any],
    *,
    outputDirectory: str | Path,
    dpi: int = 200,
) -> list[Path]:
    """Crop detected signature regions to PNG files.

    Accepts either a :class:`FileResult` instance or the ``dict`` returned by
    :func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.

    When a ``dict`` is passed, it is rewritten in place after cropping with
    the ``to_dict()`` form of the coerced result. Returns whatever
    ``crop_signatures`` returns (the list of written crop paths).
    """

    # Imported at call time rather than at module import time.
    from sigdetect.cropping import crop_signatures

    file_result_obj, original_dict = _CoerceFileResult(fileResult)
    paths = crop_signatures(
        pdf_path=Path(pdfPath),
        file_result=file_result_obj,
        output_dir=Path(outputDirectory),
        dpi=dpi,
    )
    if original_dict is not None:
        # Serialize before touching the caller's dict: if to_dict() raises,
        # the dict is left intact instead of being emptied.
        refreshed = file_result_obj.to_dict()
        original_dict.clear()
        original_dict.update(refreshed)
    return paths
220
+
221
+
222
def _CoerceFileResult(
    candidate: FileResult | dict[str, Any]
) -> tuple[FileResult, dict[str, Any] | None]:
    """Normalize ``candidate`` into a :class:`FileResult`.

    Returns ``(result, source)``. ``source`` is ``None`` when ``candidate``
    was already a :class:`FileResult`; otherwise it is the very ``dict`` that
    was passed in. Raises ``TypeError`` for any other input type. Missing or
    falsy dictionary values are replaced by the defaults spelled out below.
    """

    if isinstance(candidate, FileResult):
        return candidate, None
    if not isinstance(candidate, dict):
        raise TypeError("fileResult must be FileResult or dict")

    def _BuildSignature(entry: dict[str, Any]) -> Signature:
        # One serialized signature entry back into its model object.
        box = entry.get("bounding_box")
        return Signature(
            Page=entry.get("page"),
            FieldName=str(entry.get("field_name") or ""),
            Role=str(entry.get("role") or "unknown"),
            Score=int(entry.get("score") or 0),
            Scores=dict(entry.get("scores") or {}),
            Evidence=list(entry.get("evidence") or []),
            Hint=str(entry.get("hint") or ""),
            RenderType=str(entry.get("render_type") or "unknown"),
            BoundingBox=tuple(box) if box else None,
            CropPath=entry.get("crop_path"),
        )

    rebuilt = [_BuildSignature(item) for item in candidate.get("signatures") or []]

    coerced = FileResult(
        File=str(candidate.get("file") or ""),
        SizeKilobytes=candidate.get("size_kb"),
        PageCount=int(candidate.get("pages") or 0),
        ElectronicSignatureFound=bool(candidate.get("esign_found")),
        ScannedPdf=candidate.get("scanned_pdf"),
        MixedContent=candidate.get("mixed"),
        # Falls back to the number of rebuilt signatures when the count is absent or zero.
        SignatureCount=int(candidate.get("sig_count") or len(rebuilt)),
        SignaturePages=str(candidate.get("sig_pages") or ""),
        Roles=str(candidate.get("roles") or "unknown"),
        Hints=str(candidate.get("hints") or ""),
        Signatures=rebuilt,
    )
    return coerced, candidate
sigdetect/cli.py CHANGED
@@ -3,14 +3,16 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import json
6
+ from collections.abc import Iterator
6
7
  from dataclasses import asdict, is_dataclass
7
8
  from pathlib import Path
8
9
 
9
10
  import typer
10
11
 
11
12
  from . import __version__
12
- from .config import LoadConfiguration
13
- from .detector import BuildDetector
13
+ from .config import FinalizeConfiguration, LoadConfiguration
14
+ from .cropping import SignatureCroppingUnavailable, crop_signatures
15
+ from .detector import BuildDetector, FileResult
14
16
  from .eda import RunExploratoryAnalysis
15
17
  from .logging_setup import ConfigureLogging
16
18
 
@@ -31,18 +33,65 @@ def _JsonSerializer(candidate):
31
33
  return str(candidate)
32
34
 
33
35
 
36
+ def _EnumeratePdfs(pdfRoot: Path, recursive: bool) -> Iterator[Path]:
37
+ """Yield PDF files under ``pdfRoot`` honoring the recursion flag."""
38
+
39
+ iterator = pdfRoot.rglob("*") if recursive else pdfRoot.glob("*")
40
+ for path in iterator:
41
+ if path.is_file() and path.suffix.lower() == ".pdf":
42
+ yield path
43
+
44
+
34
45
  @CliApplication.command(name="detect")
35
46
  def Detect(
36
47
  configurationPath: Path | None = typer.Option(
37
48
  None, "--config", "-c", help="Path to YAML config"
38
49
  ),
39
50
  profileOverride: str | None = typer.Option(None, "--profile", "-p", help="hipaa or retainer"),
51
+ recursive: bool = typer.Option(
52
+ True,
53
+ "--recursive/--no-recursive",
54
+ help="Recurse into subdirectories when gathering PDFs",
55
+ ),
56
+ cropSignatures: bool | None = typer.Option(
57
+ None,
58
+ "--crop-signatures/--no-crop-signatures",
59
+ help="Crop detected signature regions to PNG files (requires PyMuPDF)",
60
+ show_default=False,
61
+ ),
62
+ cropDirectory: Path | None = typer.Option(
63
+ None,
64
+ "--crop-dir",
65
+ help="Directory for signature PNG crops (defaults to out_dir/signature_crops)",
66
+ ),
67
+ cropDpi: int | None = typer.Option(
68
+ None,
69
+ "--crop-dpi",
70
+ min=72,
71
+ max=600,
72
+ help="Rendering DPI for signature crops",
73
+ show_default=False,
74
+ ),
40
75
  ) -> None:
41
76
  """Run detection for the configured directory and emit ``results.json``."""
42
77
 
43
78
  configuration = LoadConfiguration(configurationPath)
44
- if profileOverride in {"hipaa", "retainer"}:
45
- configuration = configuration.model_copy(update={"Profile": profileOverride})
79
+ if profileOverride is not None:
80
+ normalized_profile = profileOverride.lower()
81
+ if normalized_profile not in {"hipaa", "retainer"}:
82
+ raise typer.BadParameter("Profile must be 'hipaa' or 'retainer'.")
83
+ configuration = configuration.model_copy(update={"Profile": normalized_profile})
84
+
85
+ overrides: dict[str, object] = {}
86
+ if cropSignatures is not None:
87
+ overrides["CropSignatures"] = cropSignatures
88
+ if cropDirectory is not None:
89
+ overrides["CropOutputDirectory"] = cropDirectory
90
+ if cropDpi is not None:
91
+ overrides["CropImageDpi"] = cropDpi
92
+ if overrides:
93
+ configuration = configuration.model_copy(update=overrides)
94
+ configuration = FinalizeConfiguration(configuration)
46
95
 
47
96
  try:
48
97
  detector = BuildDetector(configuration)
@@ -54,26 +103,111 @@ def Detect(
54
103
  typer.echo(str(exc), err=True)
55
104
  raise typer.Exit(code=2) from exc
56
105
 
57
- pdfFiles = list(configuration.PdfRoot.glob("*.pdf"))
58
- if not pdfFiles:
59
- raise SystemExit(f"No PDFs found in {configuration.PdfRoot}")
60
-
61
- results = [detector.Detect(pdfPath) for pdfPath in pdfFiles]
106
+ pdfIterator = _EnumeratePdfs(configuration.PdfRoot, recursive)
107
+ try:
108
+ firstPdf = next(pdfIterator)
109
+ except StopIteration:
110
+ raise SystemExit(f"No PDFs found in {configuration.PdfRoot}") from None
111
+
112
+ results_buffer: list[FileResult] | None = [] if configuration.OutputDirectory is None else None
113
+ json_handle = None
114
+ json_path: Path | None = None
115
+ wrote_first = False
116
+
117
+ if configuration.OutputDirectory is not None:
118
+ outputDirectory = configuration.OutputDirectory
119
+ outputDirectory.mkdir(parents=True, exist_ok=True)
120
+ json_path = outputDirectory / "results.json"
121
+ json_handle = open(json_path, "w", encoding="utf-8")
122
+ json_handle.write("[")
123
+
124
+ crop_dir = configuration.CropOutputDirectory
125
+ cropping_enabled = configuration.CropSignatures
126
+ cropping_available = True
127
+ cropping_attempted = False
128
+ if configuration.CropSignatures and crop_dir is None:
129
+ Logger.warning(
130
+ "CropSignatures enabled without an output directory",
131
+ extra={"pdf_root": str(configuration.PdfRoot)},
132
+ )
133
+ cropping_enabled = False
134
+
135
+ total_bboxes = 0
136
+
137
+ def _append_result(file_result: FileResult, source_pdf: Path) -> None:
138
+ nonlocal wrote_first, json_handle, total_bboxes, cropping_available, cropping_attempted
139
+
140
+ if cropping_enabled and cropping_available and crop_dir is not None:
141
+ try:
142
+ crop_signatures(
143
+ pdf_path=source_pdf,
144
+ file_result=file_result,
145
+ output_dir=crop_dir,
146
+ dpi=configuration.CropImageDpi,
147
+ logger=Logger,
148
+ )
149
+ cropping_attempted = True
150
+ except SignatureCroppingUnavailable as exc:
151
+ cropping_available = False
152
+ Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
153
+ typer.echo(str(exc), err=True)
154
+ except Exception as exc: # pragma: no cover - defensive
155
+ Logger.warning(
156
+ "Unexpected error while cropping signatures",
157
+ extra={"error": str(exc)},
158
+ )
159
+
160
+ total_bboxes += sum(1 for sig in file_result.Signatures if sig.BoundingBox)
161
+
162
+ if results_buffer is not None:
163
+ results_buffer.append(file_result)
164
+ return
165
+
166
+ if json_handle is None:
167
+ return
168
+
169
+ serialized = json.dumps(
170
+ file_result,
171
+ indent=2,
172
+ ensure_ascii=False,
173
+ default=_JsonSerializer,
174
+ )
175
+ indented = "\n".join(f" {line}" for line in serialized.splitlines())
176
+ if wrote_first:
177
+ json_handle.write(",\n")
178
+ else:
179
+ json_handle.write("\n")
180
+ json_handle.write(indented)
181
+ wrote_first = True
182
+
183
+ def _process(pdf_path: Path) -> None:
184
+ file_result = detector.Detect(pdf_path)
185
+ _append_result(file_result, pdf_path)
62
186
 
63
- # Allow configuration to suppress file output entirely (out_dir: none / SIGDETECT_OUT_DIR=none)
64
- if configuration.OutputDirectory is None:
65
- payload = json.dumps(results, indent=2, ensure_ascii=False, default=_JsonSerializer)
187
+ try:
188
+ _process(firstPdf)
189
+ for pdf_path in pdfIterator:
190
+ _process(pdf_path)
191
+ finally:
192
+ if json_handle is not None:
193
+ closing = "\n]\n" if wrote_first else "]\n"
194
+ json_handle.write(closing)
195
+ json_handle.close()
196
+
197
+ if json_handle is not None:
198
+ typer.echo(f"Wrote {json_path}")
199
+ else:
200
+ payload = json.dumps(
201
+ results_buffer or [], indent=2, ensure_ascii=False, default=_JsonSerializer
202
+ )
66
203
  typer.echo(payload)
67
204
  typer.echo("Detection completed with output disabled (out_dir=none)")
68
- return
69
-
70
- outputDirectory = configuration.OutputDirectory
71
- outputDirectory.mkdir(parents=True, exist_ok=True)
72
205
 
73
- with open(outputDirectory / "results.json", "w", encoding="utf-8") as handle:
74
- json.dump(results, handle, indent=2, ensure_ascii=False, default=_JsonSerializer)
75
-
76
- typer.echo(f"Wrote {outputDirectory / 'results.json'}")
206
+ if cropping_enabled and cropping_available and cropping_attempted and total_bboxes == 0:
207
+ Logger.warning(
208
+ "No signature bounding boxes detected; try --engine pymupdf for crop-ready output",
209
+ extra={"engine": configuration.Engine},
210
+ )
77
211
 
78
212
 
79
213
  @CliApplication.command(name="eda")
sigdetect/config.py CHANGED
@@ -3,6 +3,7 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import os
6
+ from contextlib import suppress
6
7
  from pathlib import Path
7
8
  from typing import Literal
8
9
 
@@ -26,11 +27,13 @@ class DetectConfiguration(BaseModel):
26
27
  OutputDirectory: Path | None = Field(default=Path("out"), alias="out_dir")
27
28
  Engine: EngineName = Field(default="pypdf2", alias="engine")
28
29
  Profile: ProfileName = Field(default="hipaa", alias="profile")
29
- MaxWorkers: int = Field(default=8, alias="max_workers", ge=1, le=64)
30
30
  PseudoSignatures: bool = Field(default=True, alias="pseudo_signatures")
31
31
  RecurseXObjects: bool = Field(default=True, alias="recurse_xobjects")
32
+ CropSignatures: bool = Field(default=False, alias="crop_signatures")
33
+ CropOutputDirectory: Path | None = Field(default=None, alias="crop_output_dir")
34
+ CropImageDpi: int = Field(default=200, alias="crop_image_dpi", ge=72, le=600)
32
35
 
33
- @field_validator("PdfRoot", "OutputDirectory", mode="before")
36
+ @field_validator("PdfRoot", "OutputDirectory", "CropOutputDirectory", mode="before")
34
37
  @classmethod
35
38
  def _CoercePath(cls, value: str | Path | None) -> Path | None:
36
39
  """Allow configuration values to be provided as ``str`` or ``Path``.
@@ -42,8 +45,8 @@ class DetectConfiguration(BaseModel):
42
45
  if value is None:
43
46
  return None
44
47
  if isinstance(value, Path):
45
- return value
46
- return Path(value)
48
+ return value.expanduser()
49
+ return Path(value).expanduser()
47
50
 
48
51
  # Expose legacy snake_case property names for gradual migration
49
52
  @property
@@ -62,10 +65,6 @@ class DetectConfiguration(BaseModel):
62
65
  def profile(self) -> ProfileName: # pragma: no cover - simple passthrough
63
66
  return self.Profile
64
67
 
65
- @property
66
- def max_workers(self) -> int: # pragma: no cover - simple passthrough
67
- return self.MaxWorkers
68
-
69
68
  @property
70
69
  def pseudo_signatures(self) -> bool: # pragma: no cover - simple passthrough
71
70
  return self.PseudoSignatures
@@ -74,6 +73,18 @@ class DetectConfiguration(BaseModel):
74
73
  def recurse_xobjects(self) -> bool: # pragma: no cover - simple passthrough
75
74
  return self.RecurseXObjects
76
75
 
76
+ @property
77
+ def crop_signatures(self) -> bool: # pragma: no cover - simple passthrough
78
+ return self.CropSignatures
79
+
80
+ @property
81
+ def crop_output_dir(self) -> Path | None: # pragma: no cover - simple passthrough
82
+ return self.CropOutputDirectory
83
+
84
+ @property
85
+ def crop_image_dpi(self) -> int: # pragma: no cover - simple passthrough
86
+ return self.CropImageDpi
87
+
77
88
 
78
89
  def LoadConfiguration(path: Path | None) -> DetectConfiguration:
79
90
  """Load configuration from ``path`` while applying environment overrides.
@@ -94,6 +105,9 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
94
105
  env_pdf_root = os.getenv("SIGDETECT_PDF_ROOT")
95
106
  env_out_dir = os.getenv("SIGDETECT_OUT_DIR")
96
107
  env_profile = os.getenv("SIGDETECT_PROFILE")
108
+ env_crop = os.getenv("SIGDETECT_CROP_SIGNATURES")
109
+ env_crop_dir = os.getenv("SIGDETECT_CROP_DIR")
110
+ env_crop_dpi = os.getenv("SIGDETECT_CROP_DPI")
97
111
 
98
112
  raw_data: dict[str, object] = {}
99
113
  if path and Path(path).exists():
@@ -108,10 +122,36 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
108
122
  raw_data["out_dir"] = None if env_out_dir.lower() == "none" else env_out_dir
109
123
  if env_profile in {"hipaa", "retainer"}:
110
124
  raw_data["profile"] = env_profile
125
+ if env_crop is not None:
126
+ lowered = env_crop.lower()
127
+ if lowered in {"1", "true", "yes", "on"}:
128
+ raw_data["crop_signatures"] = True
129
+ elif lowered in {"0", "false", "no", "off"}:
130
+ raw_data["crop_signatures"] = False
131
+ if env_crop_dir:
132
+ raw_data["crop_output_dir"] = env_crop_dir
133
+ if env_crop_dpi:
134
+ with suppress(ValueError):
135
+ raw_data["crop_image_dpi"] = int(env_crop_dpi)
111
136
 
112
137
  configuration = DetectConfiguration(**raw_data)
138
+ return FinalizeConfiguration(configuration)
139
+
140
+
141
def FinalizeConfiguration(configuration: DetectConfiguration) -> DetectConfiguration:
    """Create the configured directories and fill in the default crop directory.

    The output directory is created when one is set. When ``CropSignatures``
    is enabled, the crop directory defaults to ``signature_crops`` under the
    output directory (or under ``PdfRoot`` if there is no output directory),
    is created, and is stored on a copy of the configuration. Without
    cropping the configuration is returned as-is.
    """

    out_dir = configuration.OutputDirectory
    if out_dir is not None:
        out_dir.mkdir(parents=True, exist_ok=True)

    if not configuration.CropSignatures:
        return configuration

    crop_target = configuration.CropOutputDirectory
    if crop_target is None:
        crop_target = (out_dir or configuration.PdfRoot) / "signature_crops"
    crop_target.mkdir(parents=True, exist_ok=True)
    return configuration.model_copy(update={"CropOutputDirectory": crop_target})
sigdetect/cropping.py ADDED
@@ -0,0 +1,123 @@
1
+ """Helpers for converting signature bounding boxes into PNG crops."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import re
7
+ from pathlib import Path
8
+
9
+ from .detector.file_result_model import FileResult
10
+ from .detector.signature_model import Signature
11
+
12
+ try: # pragma: no cover - optional dependency
13
+ import fitz # type: ignore
14
+ except Exception: # pragma: no cover - optional dependency
15
+ fitz = None # type: ignore[misc]
16
+
17
+
18
class SignatureCroppingUnavailable(RuntimeError):
    """Signal that PNG crops cannot be produced, for example because PyMuPDF is not installed."""
20
+
21
+
22
def crop_signatures(
    pdf_path: Path,
    file_result: FileResult,
    *,
    output_dir: Path,
    dpi: int = 200,
    logger: logging.Logger | None = None,
) -> list[Path]:
    """Render each signature bounding box of ``file_result`` to a PNG via PyMuPDF.

    Crops are written to ``output_dir/<pdf stem>/``. Signatures without a
    bounding box or page, with an empty clip rectangle, or whose page load or
    rendering fails are skipped; failures are reported through ``logger``
    when one is given. Each successful crop also sets ``CropPath`` on its
    signature. Returns the written paths in signature order.

    Raises ``SignatureCroppingUnavailable`` when PyMuPDF could not be imported.
    """

    if fitz is None:  # pragma: no cover - exercised when dependency absent
        raise SignatureCroppingUnavailable(
            "PyMuPDF is required for PNG crops. Install 'pymupdf' or 'sigdetect[pymupdf]'."
        )

    def _warn(message: str, **fields: object) -> None:
        # Logging is optional; stay silent when no logger was supplied.
        if logger:
            logger.warning(message, extra=fields)

    source = Path(pdf_path)
    crop_root = Path(output_dir)
    crop_root.mkdir(parents=True, exist_ok=True)
    written: list[Path] = []

    with fitz.open(source) as document:  # type: ignore[attr-defined]
        document_dir = crop_root / source.stem
        document_dir.mkdir(parents=True, exist_ok=True)
        # Scale factor from 72 units per inch to the requested DPI.
        zoom = dpi / 72.0
        transform = fitz.Matrix(zoom, zoom)

        for position, signature in enumerate(file_result.Signatures, start=1):
            if not (signature.BoundingBox and signature.Page):
                continue

            try:
                # Signature pages are 1-based; load_page expects a 0-based index.
                page = document.load_page(signature.Page - 1)
            except Exception as exc:  # pragma: no cover - defensive
                _warn(
                    "Failed to load page for signature crop",
                    file=source.name,
                    page=signature.Page,
                    error=str(exc),
                )
                continue

            region = _to_clip_rect(page, signature.BoundingBox)
            if region is None:
                continue

            target = document_dir / _build_filename(position, signature)
            try:
                page.get_pixmap(matrix=transform, clip=region, alpha=False).save(target)
            except Exception as exc:  # pragma: no cover - defensive
                _warn(
                    "Failed to render signature crop",
                    file=source.name,
                    page=signature.Page,
                    field=signature.FieldName,
                    error=str(exc),
                )
                continue

            signature.CropPath = str(target)
            written.append(target)

    return written
92
+
93
+
94
def _to_clip_rect(page, bbox: tuple[float, float, float, float]):
    """Turn ``bbox`` into a ``fitz.Rect`` confined to ``page``, or ``None`` if it is empty.

    The corner order of ``bbox`` does not matter. Vertical coordinates are
    mirrored against the page height (``height - y``) and every edge is
    clamped to ``[0, width]`` / ``[0, height]``. A rectangle with no positive
    width or height after clamping yields ``None``.
    """

    page_width = float(page.rect.width)
    page_height = float(page.rect.height)

    first_x, first_y, second_x, second_y = bbox
    low_x, high_x = min(first_x, second_x), max(first_x, second_x)
    low_y, high_y = min(first_y, second_y), max(first_y, second_y)

    left = _clamp(low_x, 0.0, page_width)
    right = _clamp(high_x, 0.0, page_width)
    # The larger input y becomes the top edge once mirrored.
    top = _clamp(page_height - high_y, 0.0, page_height)
    bottom = _clamp(page_height - low_y, 0.0, page_height)

    clip_width = right - left
    clip_height = bottom - top
    if clip_width <= 0 or clip_height <= 0:
        return None
    return fitz.Rect(left, top, right, bottom)
108
+
109
+
110
+ def _clamp(value: float, lower: float, upper: float) -> float:
111
+ return max(lower, min(value, upper))
112
+
113
+
114
def _build_filename(index: int, signature: Signature) -> str:
    """Compose the PNG file name ``sig_<NN>_<slug>.png`` for one signature.

    The slug is derived from the role, else the field name, else the literal
    ``"signature"``; ``index`` is zero-padded to at least two digits.
    """

    if signature.Role:
        label = signature.Role
    elif signature.FieldName:
        label = signature.FieldName
    else:
        label = "signature"
    slug = _slugify(label)
    return f"sig_{index:02d}_{slug}.png"
118
+
119
+
120
+ def _slugify(value: str) -> str:
121
+ cleaned = re.sub(r"[^A-Za-z0-9_-]+", "_", value.strip().lower())
122
+ cleaned = cleaned.strip("_")
123
+ return cleaned or "signature"
@@ -0,0 +1,420 @@
1
+ """PyMuPDF-backed detector that augments PyPDF2 heuristics with geometry."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Iterable, cast
7
+
8
+ from .pypdf2_engine import PyPDF2Detector
9
+ from .signature_model import Signature
10
+
11
+ try: # pragma: no cover - optional dependency
12
+ import fitz # type: ignore
13
+ except Exception: # pragma: no cover - optional dependency
14
+ fitz = None # type: ignore[misc]
15
+
16
+
17
+ class PyMuPDFDetector(PyPDF2Detector):
18
+ """Detector that reuses PyPDF2 heuristics and annotates results via PyMuPDF."""
19
+
20
+ Name = "pymupdf"
21
+ SIGNATURE_PADDING = 64.0
22
+ ROLE_KEYWORDS: dict[str, tuple[str, ...]] = {
23
+ "client": ("client", "consumer", "claimant"),
24
+ "firm": ("firm", "attorney", "attorneys", "counsel", "company", "llp", "llc", "law", "by:"),
25
+ "patient": ("patient", "self", "plaintiff"),
26
+ "representative": ("representative", "guardian", "parent"),
27
+ "attorney": ("attorney", "counsel", "lawyer"),
28
+ }
29
+
30
+ def __init__(self, configuration):
31
+ if fitz is None: # pragma: no cover - optional dependency
32
+ raise ValueError(
33
+ "PyMuPDF engine requires the optional 'pymupdf' dependency. Install via 'pip install "
34
+ "sigdetect[pymupdf]' or add pymupdf to your environment."
35
+ )
36
+ super().__init__(configuration)
37
+
38
+ def Detect(self, pdf_path: Path): # type: ignore[override]
39
+ result = super().Detect(pdf_path)
40
+
41
+ try:
42
+ document = fitz.open(str(pdf_path))
43
+ except Exception: # pragma: no cover - defensive
44
+ return result
45
+
46
+ with document:
47
+ widget_map = self._CollectWidgetRects(document)
48
+ self._ApplyWidgetRects(result.Signatures, widget_map)
49
+ self._InferPseudoRects(result.Signatures, document)
50
+ return result
51
+
52
+ # ───────────────────────────────── widget helpers ─────────────────────────────────
53
+ def _CollectWidgetRects(
54
+ self, document
55
+ ) -> dict[tuple[int, str], tuple[float, float, float, float]]:
56
+ mapping: dict[tuple[int, str], tuple[float, float, float, float]] = {}
57
+ for page_index in range(document.page_count):
58
+ page = document.load_page(page_index)
59
+ widgets = page.widgets() if hasattr(page, "widgets") else None
60
+ if not widgets:
61
+ continue
62
+ for widget in widgets:
63
+ name = (widget.field_name or "").strip()
64
+ if not name:
65
+ continue
66
+ # Prefer true signature widgets but fall back to any widget with /Sig appearance
67
+ if getattr(widget, "field_type", None) not in {
68
+ getattr(fitz, "PDF_WIDGET_TYPE_SIGNATURE", 6)
69
+ }:
70
+ continue
71
+ rect = self._RectToPdfTuple(widget.rect, page.rect.height)
72
+ mapping[(page_index + 1, name)] = rect
73
+ return mapping
74
+
75
+ def _ApplyWidgetRects(
76
+ self,
77
+ signatures: Iterable[Signature],
78
+ widget_map: dict[tuple[int, str], tuple[float, float, float, float]],
79
+ ) -> None:
80
+ for signature in signatures:
81
+ if signature.BoundingBox or not signature.FieldName or not signature.Page:
82
+ continue
83
+ key = (signature.Page, signature.FieldName.strip())
84
+ rect = widget_map.get(key)
85
+ if rect:
86
+ signature.BoundingBox = rect
87
+
88
+ # ───────────────────────────── pseudo bbox inference ─────────────────────────────
89
+ def _InferPseudoRects(self, signatures: Iterable[Signature], document) -> None:
90
+ for signature in signatures:
91
+ if signature.BoundingBox or signature.FieldName != "vendor_or_acro_detected":
92
+ continue
93
+
94
+ if signature.Page and signature.Page - 1 >= document.page_count:
95
+ continue
96
+
97
+ if signature.Page:
98
+ candidate_pages = [signature.Page - 1]
99
+ else:
100
+ candidate_pages = list(range(document.page_count - 1, -1, -1))
101
+
102
+ for page_index in candidate_pages:
103
+ if page_index < 0 or page_index >= document.page_count:
104
+ continue
105
+ page = document.load_page(page_index)
106
+ lines = self._ExtractLines(page)
107
+ rect_info = self._FindRoleLineRect(page, signature.Role, lines)
108
+ if rect_info is None:
109
+ rect_info = self._FallbackSignatureRect(page, signature.Role, lines)
110
+ if rect_info is not None:
111
+ rect, exclusion, mode = rect_info
112
+ padded = self._PadRect(rect, page.rect, signature.Role, exclusion, mode)
113
+ signature.BoundingBox = self._RectToPdfTuple(padded, page.rect.height)
114
+ if signature.Page is None:
115
+ signature.Page = page_index + 1
116
+ break
117
+
118
+ def _FindRoleLineRect(
119
+ self,
120
+ page,
121
+ role: str,
122
+ lines: list[dict[str, float | str]] | None = None,
123
+ ) -> tuple[fitz.Rect, float | None, str] | None:
124
+ if lines is None:
125
+ lines = self._ExtractLines(page)
126
+ page_height = float(page.rect.height)
127
+ keywords = self.ROLE_KEYWORDS.get(role, ())
128
+ lower_roles = {"client", "firm", "representative", "attorney"}
129
+ if self.Profile == "retainer" and role in {"client", "firm"}:
130
+ min_factor = 0.15 if role == "client" else 0.4
131
+ min_y = page_height * min_factor
132
+ else:
133
+ min_y = page_height * (0.58 if role == "firm" else 0.5) if role in lower_roles else 0.0
134
+
135
+ def match_lines(require_signature: bool) -> list[tuple[int, dict[str, float | str]]]:
136
+ selected: list[tuple[int, dict[str, float | str]]] = []
137
+ for idx, line in enumerate(lines):
138
+ lower = line["lower_text"]
139
+ if lower.strip() == "":
140
+ continue
141
+ if line["y0"] < min_y:
142
+ continue
143
+ if require_signature and "sign" not in lower:
144
+ continue
145
+ if not require_signature and "sign" not in lower:
146
+ if "name" in lower or "print" in lower:
147
+ continue
148
+ if keywords and not any(keyword in lower for keyword in keywords):
149
+ continue
150
+ selected.append((idx, line))
151
+ return selected
152
+
153
+ matches = match_lines(require_signature=True)
154
+ if matches and matches[-1][1]["y0"] < page_height * 0.6:
155
+ matches = []
156
+ if not matches:
157
+ matches = match_lines(require_signature=False)
158
+
159
+ if matches:
160
+ idx, target = matches[-1]
161
+ label_rect = fitz.Rect(target["x0"], target["y0"], target["x1"], target["y1"])
162
+ stroke = self._LocateStrokeLine(lines, idx, label_rect)
163
+ if stroke is not None:
164
+ rect, exclusion = stroke
165
+ return rect, exclusion, "stroke"
166
+ image = self._LocateSignatureImage(page, label_rect)
167
+ if image is not None:
168
+ exclusion = self._NextExclusionY(lines, idx + 1, image.y1)
169
+ return image, exclusion, "image"
170
+ exclusion = self._NextExclusionY(lines, idx + 1, label_rect.y1)
171
+ return label_rect, exclusion, "label"
172
+ return None
173
+
174
+ def _FallbackSignatureRect(
175
+ self,
176
+ page,
177
+ role: str | None = None,
178
+ lines: list[dict[str, float | str]] | None = None,
179
+ ) -> tuple[fitz.Rect, float | None, str] | None:
180
+ if lines is None:
181
+ lines = self._ExtractLines(page)
182
+ for idx in range(len(lines) - 1, -1, -1):
183
+ line = lines[idx]
184
+ lower = line["lower_text"]
185
+ if "signature" in lower or "sign" in lower:
186
+ rect = fitz.Rect(line["x0"], line["y0"], line["x1"], line["y1"])
187
+ exclusion = self._NextExclusionY(lines, idx + 1, rect.y1)
188
+ return rect, exclusion, "label"
189
+ if lines:
190
+ line = lines[-1]
191
+ rect = fitz.Rect(line["x0"], line["y0"], line["x1"], line["y1"])
192
+ exclusion = None
193
+ return rect, exclusion, "label"
194
+ return None
195
+
196
+ def _ExtractLines(self, page) -> list[dict[str, float | str]]:
197
+ words = page.get_text("words") or []
198
+ buckets: dict[tuple[int, int], dict[str, object]] = {}
199
+ for x0, y0, x1, y1, text, block, line, *_ in words:
200
+ if not text.strip():
201
+ continue
202
+ key = (int(block), int(line))
203
+ bucket = buckets.setdefault(
204
+ key,
205
+ {
206
+ "tokens": [],
207
+ "x0": float(x0),
208
+ "y0": float(y0),
209
+ "x1": float(x1),
210
+ "y1": float(y1),
211
+ },
212
+ )
213
+ tokens = cast(list[str], bucket["tokens"])
214
+ tokens.append(text)
215
+ bucket["x0"] = min(float(bucket["x0"]), float(x0))
216
+ bucket["y0"] = min(float(bucket["y0"]), float(y0))
217
+ bucket["x1"] = max(float(bucket["x1"]), float(x1))
218
+ bucket["y1"] = max(float(bucket["y1"]), float(y1))
219
+ lines: list[dict[str, float | str]] = []
220
+ for bucket in buckets.values():
221
+ text = " ".join(bucket["tokens"]).strip() # type: ignore[arg-type]
222
+ if not text:
223
+ continue
224
+ lines.append(
225
+ {
226
+ "text": text,
227
+ "lower_text": text.lower(),
228
+ "x0": float(bucket["x0"]),
229
+ "y0": float(bucket["y0"]),
230
+ "x1": float(bucket["x1"]),
231
+ "y1": float(bucket["y1"]),
232
+ }
233
+ )
234
+ lines.sort(key=lambda entry: (entry["y0"], entry["x0"]))
235
+ return lines
236
+
237
+ def _LocateStrokeLine(
238
+ self,
239
+ lines: list[dict[str, float | str]],
240
+ label_index: int,
241
+ label_rect: fitz.Rect,
242
+ ) -> tuple[fitz.Rect, float | None] | None:
243
+ for idx in range(label_index - 1, max(label_index - 4, -1), -1):
244
+ lower = lines[idx]["lower_text"]
245
+ if "_" in lower or lower.strip().startswith("x"):
246
+ rect = fitz.Rect(
247
+ lines[idx]["x0"],
248
+ lines[idx]["y0"],
249
+ lines[idx]["x1"],
250
+ lines[idx]["y1"],
251
+ )
252
+ overlap = min(rect.x1, label_rect.x1) - max(rect.x0, label_rect.x0)
253
+ if overlap <= 0:
254
+ continue
255
+ # Keep crops below the label text.
256
+ return rect, label_rect.y0
257
+ return None
258
+
259
+ def _LocateSignatureImage(self, page, label_rect: fitz.Rect) -> fitz.Rect | None:
260
+ candidates: list[tuple[float, fitz.Rect]] = []
261
+ label_mid_x = (label_rect.x0 + label_rect.x1) / 2.0
262
+ for image in page.get_images(full=True):
263
+ bbox = page.get_image_bbox(image)
264
+ if bbox is None:
265
+ continue
266
+ width = float(bbox.width)
267
+ height = float(bbox.height)
268
+ if width < 40.0 or height < 12.0:
269
+ continue
270
+ if width > 380.0 or height > 220.0:
271
+ continue
272
+ # Require the image to sit near the label horizontally and vertically.
273
+ horiz_overlap = min(bbox.x1, label_rect.x1 + 220.0) - max(bbox.x0, label_rect.x0 - 40.0)
274
+ if horiz_overlap <= 0:
275
+ continue
276
+ vertical_gap = abs(((bbox.y0 + bbox.y1) / 2.0) - label_rect.y0)
277
+ if vertical_gap > 220.0:
278
+ continue
279
+ candidates.append((vertical_gap + abs(((bbox.x0 + bbox.x1) / 2.0) - label_mid_x), bbox))
280
+
281
+ if not candidates:
282
+ return None
283
+ candidates.sort(key=lambda item: item[0])
284
+ return candidates[0][1]
285
+
286
+ def _NextExclusionY(
287
+ self,
288
+ lines: list[dict[str, float | str]],
289
+ start_index: int,
290
+ minimum_y: float | None = None,
291
+ ) -> float | None:
292
+ threshold = (minimum_y or -float("inf")) + 1.0
293
+ for line in lines[start_index:]:
294
+ y0 = float(line["y0"])
295
+ if y0 <= threshold:
296
+ continue
297
+ lower = line["lower_text"]
298
+ if any(token in lower for token in ("name", "print", "date", "by:")):
299
+ return y0
300
+ return None
301
+
302
+ def _RectToPdfTuple(self, rect, page_height: float) -> tuple[float, float, float, float]:
303
+ x0 = float(rect.x0)
304
+ x1 = float(rect.x1)
305
+ y0 = page_height - float(rect.y1)
306
+ y1 = page_height - float(rect.y0)
307
+ if x1 < x0:
308
+ x0, x1 = x1, x0
309
+ if y1 < y0:
310
+ y0, y1 = y1, y0
311
+ return (x0, y0, x1, y1)
312
+
313
def _PadRect(
    self,
    rect,
    page_rect,
    role: str | None = None,
    exclusion_y0: float | None = None,
    mode: str = "label",
):
    """Return a region focused on the expected signature line beneath ``rect``.

    The horizontal and vertical extents are derived from ``rect`` according to
    ``mode`` and then limited by the page, by ``exclusion_y0`` and by fixed
    maximum dimensions.

    Args:
        rect: Anchor rectangle; its ``x0``/``y0``/``x1``/``y1``/``width``/
            ``height`` attributes are read.
        page_rect: Page bounds used to clamp the result.
        role: Raises the minimum region height for ``"client"``, ``"firm"``,
            ``"representative"``, ``"patient"`` and ``"attorney"``; with the
            ``"retainer"`` profile it also changes the horizontal padding for
            ``"client"`` and ``"firm"``.
        exclusion_y0: When given, the region's bottom edge is kept at least
            4.0 before this y value.
        mode: ``"stroke"`` and ``"image"`` build the region around ``rect``
            itself; any other value (default ``"label"``) places the region
            past ``rect.y1``.

    Returns:
        ``fitz.Rect(left, top, right, bottom)``.

    NOTE(review): this body was reconstructed from a flattened diff listing
    that carried no indentation.  The nesting at the two spots marked below
    is the most plausible reading, not a verified one -- confirm against the
    upstream source before relying on it.
    """

    max_width = 198.0  # 2.75 inches
    max_height = 72.0  # 1 inch

    # Horizontal extent: fixed padding for stroke/image anchors, proportional
    # padding (at least 12.0) for label anchors; always clamped to the page.
    pad_x = max(12.0, float(rect.width) * 0.08)
    if mode == "stroke":
        left = max(page_rect.x0, rect.x0 - 8.0)
        right = min(page_rect.x1, rect.x1 + 8.0)
    elif mode == "image":
        left = max(page_rect.x0, rect.x0 - 10.0)
        right = min(page_rect.x1, rect.x1 + 10.0)
    else:
        left = max(page_rect.x0, rect.x0 - pad_x)
        right = min(page_rect.x1, rect.x1 + pad_x)

    # Retainer-profile overrides replace the padding chosen above for
    # image/label anchors of the client and firm roles.
    if self.Profile == "retainer" and role == "client" and mode in {"image", "label"}:
        left = max(page_rect.x0, rect.x0 - 12.0)
        right = min(page_rect.x1, rect.x1 + 16.0)
    elif self.Profile == "retainer" and role == "firm" and mode in {"image", "label"}:
        left = max(page_rect.x0, rect.x0 - 14.0)
        right = min(page_rect.x1, rect.x1 + 18.0)

    # Enforce the maximum width: stroke regions keep their left edge, other
    # modes shrink symmetrically around their centre.
    if right - left > max_width:
        if mode == "stroke":
            right = min(page_rect.x1, left + max_width)
        else:
            center = (left + right) / 2.0
            half = max_width / 2.0
            left = center - half
            right = center + half
            # NOTE(review): nesting of the page clamps below is reconstructed.
            # left/right are already inside the page on every other path, so
            # placing them one level out would give the same result.
            if left < page_rect.x0:
                right += page_rect.x0 - left
                left = page_rect.x0
            if right > page_rect.x1:
                left -= right - page_rect.x1
                right = page_rect.x1
            left = max(page_rect.x0, left)
            right = min(page_rect.x1, right)

    # Target height: at least 40.0, raised per role, never above max_height.
    # A zero rect height falls back to 12.0 before the 8.0 floor applies.
    line_height = max(8.0, float(rect.height) or 12.0)
    signature_height = max(40.0, line_height * 2.2)
    if role == "client":
        signature_height = max(signature_height, 65.0)
    elif role == "firm":
        signature_height = max(signature_height, 60.0)
    elif role in {"representative", "patient", "attorney"}:
        signature_height = max(signature_height, 55.0)
    signature_height = min(signature_height, max_height)

    baseline = float(rect.y1)

    if mode == "stroke":
        # Surround the stroke rectangle with margins on both sides.
        margin_above = max(6.0, line_height)
        margin_below = max(18.0, line_height * 1.5)
        top = float(rect.y0) - margin_above
        bottom = float(rect.y1) + margin_below
        signature_height = min(bottom - top, max_height)
    elif mode == "image":
        # Size the region from the image height plus 18.0 of slack.
        image_height = float(rect.height) or 12.0
        signature_height = min(max_height, max(image_height + 18.0, 40.0))
        extra = max(0.0, signature_height - image_height)
        top = float(rect.y0) - min(extra * 0.25, 12.0)
        # This value of bottom is overwritten two lines below, after top has
        # been limited to at most 2.0 before rect.y0.
        bottom = top + signature_height
        top = max(float(rect.y0) - 2.0, top)
        bottom = top + signature_height
    else:
        # Label anchors: start the region a gap of 10.0-24.0 past rect.y1.
        gap_above = max(10.0, min(24.0, line_height * 0.9))
        top = baseline + gap_above
        bottom = top + signature_height

    # Later adjustments never move top before this starting value (except the
    # image-specific limit, which uses its own floor).
    original_top = top

    # NOTE(review): the nesting of this group is reconstructed and does
    # affect results -- verify whether the image limit belongs inside the
    # exclusion branch in the upstream source.
    if exclusion_y0 is not None:
        limited = exclusion_y0 - 4.0
        if bottom > limited:
            bottom = limited
            top = max(original_top, bottom - signature_height)
    if mode == "image":
        # Image regions may extend at most 24.0 past the image's own y1.
        limit_below = float(rect.y1) + 24.0
        if bottom > limit_below:
            bottom = limit_below
            top = max(float(rect.y0) - 4.0, bottom - signature_height)

    if bottom - top > max_height:
        bottom = top + max_height
        # signature_height is already <= max_height on every path above.
        signature_height = min(signature_height, max_height)

    # Keep the region on the page, then re-apply the height cap.
    if bottom > page_rect.y1:
        bottom = page_rect.y1
        top = max(original_top, bottom - signature_height)

    if bottom - top > max_height:
        bottom = top + max_height

    # The limits above can collapse the region; fall back to a band that
    # starts one line height before rect.y1.
    if top >= bottom:
        top = max(page_rect.y0, baseline - line_height)
        bottom = min(page_rect.y1, top + min(signature_height, max_height))

    return fitz.Rect(left, top, right, bottom)
@@ -212,7 +212,9 @@ class PyPDF2Detector(Detector):
212
212
  hits.add(f"VendorText:{rx.pattern}")
213
213
  return hits
214
214
 
215
- def _ScanPageVendors(self, page) -> set[str]:
215
+ def _ScanPageVendors(self, page) -> tuple[set[str], str]:
216
+ """Return vendor hits along with the extracted page text."""
217
+
216
218
  found: set[str] = set()
217
219
 
218
220
  with _QuietIo():
@@ -234,7 +236,7 @@ class PyPDF2Detector(Detector):
234
236
  if rx.search(txt):
235
237
  found.add(f"VendorText:{rx.pattern}")
236
238
 
237
- return found
239
+ return found, txt
238
240
 
239
241
  def _IterateFormXObjects(self, page) -> Iterator[generic.DictionaryObject]:
240
242
  """Yield Form XObject dictionaries recursively from page resources."""
@@ -438,6 +440,40 @@ class PyPDF2Detector(Detector):
438
440
  nm = GetFieldNameFromAncestry(wdict)
439
441
  return "" if nm is None else str(nm)
440
442
 
443
+ def _WidgetBoundingBox(
444
+ self, wdict: generic.DictionaryObject
445
+ ) -> tuple[float, float, float, float] | None:
446
+ """Return the widget's ``/Rect`` coordinates normalized as (x0, y0, x1, y1)."""
447
+
448
+ rect = self._RectToTuple(wdict.get("/Rect"))
449
+ if rect:
450
+ return rect
451
+ parent = AsDictionary(wdict.get("/Parent"))
452
+ if isinstance(parent, generic.DictionaryObject):
453
+ return self._RectToTuple(parent.get("/Rect"))
454
+ return None
455
+
456
+ def _RectToTuple(self, candidate) -> tuple[float, float, float, float] | None:
457
+ if candidate is None:
458
+ return None
459
+ if isinstance(candidate, generic.IndirectObject):
460
+ with suppress(Exception):
461
+ candidate = candidate.get_object()
462
+ if isinstance(candidate, generic.ArrayObject) and len(candidate) == 4:
463
+ coords: list[float] = []
464
+ for item in candidate:
465
+ try:
466
+ coords.append(float(item))
467
+ except Exception:
468
+ return None
469
+ x0, y0, x1, y1 = coords
470
+ if x1 < x0:
471
+ x0, x1 = x1, x0
472
+ if y1 < y0:
473
+ y0, y1 = y1, y0
474
+ return x0, y0, x1, y1
475
+ return None
476
+
441
477
  @staticmethod
442
478
  def _PickNameAny(d: generic.DictionaryObject) -> str | None:
443
479
  for key in ("/T", "/TU", "/TM"):
@@ -685,7 +721,7 @@ class PyPDF2Detector(Detector):
685
721
 
686
722
  for page in reader.pages:
687
723
  # per-page vendor
688
- pv = self._ScanPageVendors(page)
724
+ pv, page_text = self._ScanPageVendors(page)
689
725
  x_hits: set[str] = set()
690
726
  x_text = ""
691
727
  if self.RecurseXObjects:
@@ -693,12 +729,10 @@ class PyPDF2Detector(Detector):
693
729
  vendor_hints |= pv | x_hits
694
730
  vendor_hits_per_page.append(len(pv) + len(x_hits))
695
731
 
696
- with _QuietIo():
697
- txt = page.extract_text() or ""
698
732
  if x_text:
699
- txt = f"{txt} {x_text}".strip() if txt else x_text.strip()
700
- page_texts.append(txt)
701
- any_text = any_text or bool(txt)
733
+ page_text = f"{page_text} {x_text}".strip() if page_text else x_text.strip()
734
+ page_texts.append(page_text)
735
+ any_text = any_text or bool(page_text)
702
736
 
703
737
  # image counting
704
738
  img_count = 0
@@ -760,6 +794,7 @@ class PyPDF2Detector(Detector):
760
794
  field_name = self._FieldNameForWidget(wdict)
761
795
  page_obj = reader.pages[idx - 1] if 0 <= (idx - 1) < len(reader.pages) else None
762
796
  render_type = self._ClassifyAppearance(wdict, page_obj)
797
+ bounding_box = self._WidgetBoundingBox(wdict)
763
798
 
764
799
  # de-dup by object ref (if present) and (page, name)
765
800
  if isinstance(ref, generic.IndirectObject):
@@ -801,6 +836,7 @@ class PyPDF2Detector(Detector):
801
836
  Evidence=evidence,
802
837
  Hint=(f"AcroSig:{field_name}" if field_name else "AcroSig"),
803
838
  RenderType=render_type,
839
+ BoundingBox=bounding_box,
804
840
  )
805
841
  )
806
842
 
@@ -969,6 +1005,7 @@ class PyPDF2Detector(Detector):
969
1005
  field_name = self._FieldNameForWidget(wdict)
970
1006
  page_obj = reader.pages[idx - 1] if 0 <= (idx - 1) < len(reader.pages) else None
971
1007
  render_type = self._ClassifyAppearance(wdict, page_obj)
1008
+ bounding_box = self._WidgetBoundingBox(wdict)
972
1009
 
973
1010
  # de-dup by object ref (if present) and (page, name)
974
1011
  if isinstance(ref, generic.IndirectObject):
@@ -995,6 +1032,7 @@ class PyPDF2Detector(Detector):
995
1032
  Evidence=evidence,
996
1033
  Hint=(f"AcroSig:{field_name}" if field_name else "AcroSig"),
997
1034
  RenderType=render_type,
1035
+ BoundingBox=bounding_box,
998
1036
  )
999
1037
  )
1000
1038
 
@@ -18,6 +18,8 @@ class Signature:
18
18
  Evidence: list[str]
19
19
  Hint: str
20
20
  RenderType: str = "unknown"
21
+ BoundingBox: tuple[float, float, float, float] | None = None
22
+ CropPath: str | None = None
21
23
 
22
24
  def to_dict(self) -> dict[str, Any]:
23
25
  """Return the legacy snake_case representation used in JSON payloads."""
@@ -31,4 +33,6 @@ class Signature:
31
33
  "evidence": list(self.Evidence),
32
34
  "hint": self.Hint,
33
35
  "render_type": self.RenderType,
36
+ "bounding_box": list(self.BoundingBox) if self.BoundingBox else None,
37
+ "crop_path": self.CropPath,
34
38
  }
@@ -1,13 +1,12 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sigdetect
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: Signature detection and role attribution for PDFs
5
5
  Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
6
6
  License: MIT
7
7
  Requires-Python: >=3.9
8
8
  Description-Content-Type: text/markdown
9
9
  Requires-Dist: pypdf>=4.0.0
10
- Requires-Dist: pandas>=2.0
11
10
  Requires-Dist: rich>=13.0
12
11
  Requires-Dist: typer>=0.12
13
12
  Requires-Dist: pydantic>=2.5
@@ -102,6 +101,8 @@ sigdetect detect \
102
101
  - `--profile` selects tuned role logic:
103
102
  - `hipaa` → patient / representative / attorney
104
103
  - `retainer` → client / firm (prefers detecting two signatures)
104
+ - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
105
+ - `--crop-signatures` enables PNG crops for each detected widget (requires installing the optional `pymupdf` dependency). Use `--crop-dir` to override the destination and `--crop-dpi` to choose rendering quality.
105
106
  - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
106
107
 
107
108
  ### EDA (quick aggregate stats)
@@ -135,7 +136,7 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
135
136
  print(result.to_dict())
136
137
  ~~~
137
138
 
138
- `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)).
139
+ `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image.
139
140
 
140
141
  ---
141
142
 
@@ -146,7 +147,17 @@ Import from `sigdetect.api` and get plain dicts out (JSON-ready),
146
147
  with no I/O side effects by default:
147
148
 
148
149
  ~~~python
149
- from sigdetect.api import DetectPdf, DetectMany, ScanDirectory, ToCsvRow, Version
150
+ from pathlib import Path
151
+
152
+ from sigdetect.api import (
153
+ CropSignatureImages,
154
+ DetectMany,
155
+ DetectPdf,
156
+ ScanDirectory,
157
+ ToCsvRow,
158
+ Version,
159
+ get_detector,
160
+ )
150
161
 
151
162
  print("sigdetect", Version())
152
163
 
@@ -178,6 +189,15 @@ for res in ScanDirectory(
178
189
  # store in DB, print, etc.
179
190
  pass
180
191
 
192
+ # 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
193
+ detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
194
+ file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
195
+ CropSignatureImages(
196
+ "/path/to/pdfs/example.pdf",
197
+ file_result,
198
+ outputDirectory="./signature_crops",
199
+ dpi=200,
200
+ )
181
201
  ~~~
182
202
 
183
203
 
@@ -205,7 +225,10 @@ High-level summary (per file):
205
225
  "score": 5,
206
226
  "scores": { "field": 3, "page_label": 2 },
207
227
  "evidence": ["field:patient", "page_label:patient"],
208
- "hint": "AcroSig:sig_patient"
228
+ "hint": "AcroSig:sig_patient",
229
+ "render_type": "typed",
230
+ "bounding_box": [10.0, 10.0, 150.0, 40.0],
231
+ "crop_path": "signature_crops/example/sig_01_patient.png"
209
232
  },
210
233
  {
211
234
  "page": null,
@@ -214,7 +237,10 @@ High-level summary (per file):
214
237
  "score": 6,
215
238
  "scores": { "page_label": 4, "general": 2 },
216
239
  "evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
217
- "hint": "VendorOrAcroOnly"
240
+ "hint": "VendorOrAcroOnly",
241
+ "render_type": "unknown",
242
+ "bounding_box": null,
243
+ "crop_path": null
218
244
  }
219
245
  ]
220
246
  }
@@ -227,6 +253,8 @@ High-level summary (per file):
227
253
  - **`mixed`** means both `esign_found` and `scanned_pdf` are `true`.
228
254
  - **`roles`** summarizes unique non-`unknown` roles across signatures.
229
255
  - In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
256
+ - **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
257
+ - **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
230
258
 
231
259
  ---
232
260
 
@@ -252,6 +280,9 @@ engine: pypdf2
252
280
  pseudo_signatures: true
253
281
  recurse_xobjects: true
254
282
  profile: retainer # or: hipaa
283
+ crop_signatures: false # enable to write PNG crops (requires pymupdf)
284
+ # crop_output_dir: ./signature_crops
285
+ crop_image_dpi: 200
255
286
  ~~~
256
287
 
257
288
 YAML files can be customized or loaded at runtime (see CLI `--config`, if available, or import the patterns and pass them to the engine).
@@ -1,7 +1,8 @@
1
1
  sigdetect/__init__.py,sha256=LhY78mDZ1ClYVNTxW_qtE-vqJoN9N7N5ZcNRDUI_3ss,575
2
- sigdetect/api.py,sha256=Un4SaZHNAmRLPh1aF9bzOfT6ibilT_y9C0xVmNlqHtI,4248
3
- sigdetect/cli.py,sha256=jm7aStuv64MCcZZkzv8ncNVGGg8FYIFKjkTPNfXWUgs,3136
4
- sigdetect/config.py,sha256=d3_AlAEFUHBoXyTbUAHQLTARVqM8q4I8q4xfwakPE0M,4165
2
+ sigdetect/api.py,sha256=F7bM0ctYmtczjqSbsl7MkUZQ28wkRnLAYt1WxfCtzk4,8518
3
+ sigdetect/cli.py,sha256=NctAnaB-TQrUAT9m-v8kj2_KTNs88kbFOCiX32tHZm8,7920
4
+ sigdetect/config.py,sha256=S0NVKuJYiHJCocL-VNFdGJpasFcjTecavC4EthyS1DQ,5951
5
+ sigdetect/cropping.py,sha256=89xPwXhWkJC5E0oW2e3_fDyERH5YGqyt4q4B-HSld4o,4084
5
6
  sigdetect/eda.py,sha256=S92G1Gjmepri__D0n_V6foq0lQgH-RXI9anW8A58jfw,4681
6
7
  sigdetect/logging_setup.py,sha256=LMF8ao_a-JwH0S522T6aYTFX3e8Ajjv_5ODS2YiBcHA,6404
7
8
  sigdetect/utils.py,sha256=T9rubLf5T9JmjOHYMOba1j34fhOJaWocAXccnGTxRUE,5198
@@ -12,11 +13,11 @@ sigdetect/detector/__init__.py,sha256=up2FCmD09f2bRHcS4WbY-clx3GQbWuk1PM2JlxgusH
12
13
  sigdetect/detector/base.py,sha256=L-iXWXqsTetDc4jRZo_wOdbNpKqOY20mX9FefrugdT0,263
13
14
  sigdetect/detector/base_detector.py,sha256=GmAgUWO_fQgIfnihZSoyhR3wpnwZ-X3hS0Kuyz4G6Ys,608
14
15
  sigdetect/detector/file_result_model.py,sha256=j2gTc9Sw3fJOHlexYsR_m5DiwHA8DzIzAMToESfvo4A,1767
15
- sigdetect/detector/pymupdf_engine.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
- sigdetect/detector/pypdf2_engine.py,sha256=e3JasLxI8K10IkpMcijES2EjA7RluNpKq6027oNROPU,45770
17
- sigdetect/detector/signature_model.py,sha256=nApd53aDRMZhOLdUlmoEPjHO1hs8leM6NysG10v-jVc,857
18
- sigdetect-0.1.0.dist-info/METADATA,sha256=7au6ZW0VN_y3JyZQJux6zEUO8BMBEp6qVn0HO86aXlU,10363
19
- sigdetect-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
20
- sigdetect-0.1.0.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
21
- sigdetect-0.1.0.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
22
- sigdetect-0.1.0.dist-info/RECORD,,
16
+ sigdetect/detector/pymupdf_engine.py,sha256=iyp7JuPlUnydwohH5zbNg4MwH44mBmxbBWOS3ZmArBo,17339
17
+ sigdetect/detector/pypdf2_engine.py,sha256=INWQH06kMLvto2VS-EdLC-EtMC6AG7JmdVYmNgx6_RU,47313
18
+ sigdetect/detector/signature_model.py,sha256=mztb9V5wgv2oohQ5Cxzcv8_Bo6TyWAVIXteaeQ2rywQ,1076
19
+ sigdetect-0.2.0.dist-info/METADATA,sha256=HzF-CmGBs48_Cqv9Dv9AdXo_UoztA-tLPxVMN1fXOH0,11866
20
+ sigdetect-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
21
+ sigdetect-0.2.0.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
22
+ sigdetect-0.2.0.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
23
+ sigdetect-0.2.0.dist-info/RECORD,,