sigdetect 0.1.1__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. {sigdetect-0.1.1 → sigdetect-0.3.0}/PKG-INFO +44 -6
  2. {sigdetect-0.1.1 → sigdetect-0.3.0}/README.md +43 -4
  3. {sigdetect-0.1.1 → sigdetect-0.3.0}/pyproject.toml +1 -2
  4. sigdetect-0.3.0/src/sigdetect/api.py +287 -0
  5. sigdetect-0.3.0/src/sigdetect/cli.py +232 -0
  6. {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/config.py +49 -9
  7. sigdetect-0.3.0/src/sigdetect/cropping.py +177 -0
  8. sigdetect-0.3.0/src/sigdetect/detector/pymupdf_engine.py +420 -0
  9. {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/detector/pypdf2_engine.py +46 -8
  10. {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/detector/signature_model.py +4 -0
  11. {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect.egg-info/PKG-INFO +44 -6
  12. {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect.egg-info/SOURCES.txt +6 -1
  13. {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect.egg-info/requires.txt +0 -1
  14. sigdetect-0.3.0/tests/test_api.py +60 -0
  15. sigdetect-0.3.0/tests/test_cropping.py +113 -0
  16. sigdetect-0.3.0/tests/test_pymupdf_engine.py +87 -0
  17. sigdetect-0.3.0/tests/test_widget_role_patient_smoke.py +66 -0
  18. sigdetect-0.1.1/src/sigdetect/api.py +0 -139
  19. sigdetect-0.1.1/src/sigdetect/cli.py +0 -98
  20. sigdetect-0.1.1/src/sigdetect/detector/pymupdf_engine.py +0 -0
  21. {sigdetect-0.1.1 → sigdetect-0.3.0}/setup.cfg +0 -0
  22. {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/__init__.py +0 -0
  23. {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/data/role_rules.retainer.yml +0 -0
  24. {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/data/role_rules.yml +0 -0
  25. {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/data/vendor_patterns.yml +0 -0
  26. {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/detector/__init__.py +0 -0
  27. {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/detector/base.py +0 -0
  28. {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/detector/base_detector.py +0 -0
  29. {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/detector/file_result_model.py +0 -0
  30. {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/eda.py +0 -0
  31. {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/logging_setup.py +0 -0
  32. {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/utils.py +0 -0
  33. {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect.egg-info/dependency_links.txt +0 -0
  34. {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect.egg-info/entry_points.txt +0 -0
  35. {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect.egg-info/top_level.txt +0 -0
@@ -1,13 +1,12 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sigdetect
3
- Version: 0.1.1
3
+ Version: 0.3.0
4
4
  Summary: Signature detection and role attribution for PDFs
5
5
  Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
6
6
  License: MIT
7
7
  Requires-Python: >=3.9
8
8
  Description-Content-Type: text/markdown
9
9
  Requires-Dist: pypdf>=4.0.0
10
- Requires-Dist: pandas>=2.0
11
10
  Requires-Dist: rich>=13.0
12
11
  Requires-Dist: typer>=0.12
13
12
  Requires-Dist: pydantic>=2.5
@@ -102,6 +101,8 @@ sigdetect detect \
102
101
  - `--profile` selects tuned role logic:
103
102
  - `hipaa` → patient / representative / attorney
104
103
  - `retainer` → client / firm (prefers detecting two signatures)
104
+ - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
105
+ - `--crop-signatures` enables PNG crops for each detected widget (requires installing the optional `pymupdf` dependency). Use `--crop-dir` to override the destination and `--crop-dpi` to choose rendering quality.
105
106
  - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
106
107
 
107
108
  ### EDA (quick aggregate stats)
@@ -135,7 +136,7 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
135
136
  print(result.to_dict())
136
137
  ~~~
137
138
 
138
- `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)).
139
+ `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image.
139
140
 
140
141
  ---
141
142
 
@@ -146,7 +147,17 @@ Import from `sigdetect.api` and get plain dicts out (JSON-ready),
146
147
  with no I/O side effects by default:
147
148
 
148
149
  ~~~python
149
- from sigdetect.api import DetectPdf, DetectMany, ScanDirectory, ToCsvRow, Version
150
+ from pathlib import Path
151
+
152
+ from sigdetect.api import (
153
+ CropSignatureImages,
154
+ DetectMany,
155
+ DetectPdf,
156
+ ScanDirectory,
157
+ ToCsvRow,
158
+ Version,
159
+ get_detector,
160
+ )
150
161
 
151
162
  print("sigdetect", Version())
152
163
 
@@ -178,8 +189,24 @@ for res in ScanDirectory(
178
189
  # store in DB, print, etc.
179
190
  pass
180
191
 
192
+ # 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
193
+ detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
194
+ file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
195
+ crops = CropSignatureImages(
196
+ "/path/to/pdfs/example.pdf",
197
+ file_result,
198
+ outputDirectory="./signature_crops",
199
+ dpi=200,
200
+ returnBytes=True, # also returns in-memory PNG bytes for each crop
201
+ )
202
+
203
+ first_crop = crops[0]
204
+ print(first_crop.path, len(first_crop.image_bytes))
181
205
  ~~~
182
206
 
207
+ When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
208
+ PNG bytes, and the originating signature metadata.
209
+
183
210
 
184
211
  ## Result schema
185
212
 
@@ -205,7 +232,10 @@ High-level summary (per file):
205
232
  "score": 5,
206
233
  "scores": { "field": 3, "page_label": 2 },
207
234
  "evidence": ["field:patient", "page_label:patient"],
208
- "hint": "AcroSig:sig_patient"
235
+ "hint": "AcroSig:sig_patient",
236
+ "render_type": "typed",
237
+ "bounding_box": [10.0, 10.0, 150.0, 40.0],
238
+ "crop_path": "signature_crops/example/sig_01_patient.png"
209
239
  },
210
240
  {
211
241
  "page": null,
@@ -214,7 +244,10 @@ High-level summary (per file):
214
244
  "score": 6,
215
245
  "scores": { "page_label": 4, "general": 2 },
216
246
  "evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
217
- "hint": "VendorOrAcroOnly"
247
+ "hint": "VendorOrAcroOnly",
248
+ "render_type": "unknown",
249
+ "bounding_box": null,
250
+ "crop_path": null
218
251
  }
219
252
  ]
220
253
  }
@@ -227,6 +260,8 @@ High-level summary (per file):
227
260
  - **`mixed`** means both `esign_found` and `scanned_pdf` are `true`.
228
261
  - **`roles`** summarizes unique non-`unknown` roles across signatures.
229
262
  - In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
263
+ - **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
264
+ - **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
230
265
 
231
266
  ---
232
267
 
@@ -252,6 +287,9 @@ engine: pypdf2
252
287
  pseudo_signatures: true
253
288
  recurse_xobjects: true
254
289
  profile: retainer # or: hipaa
290
+ crop_signatures: false # enable to write PNG crops (requires pymupdf)
291
+ # crop_output_dir: ./signature_crops
292
+ crop_image_dpi: 200
255
293
  ~~~
256
294
 
257
295
  YAML files can be customized or loaded at runtime (see CLI `--config`, if available, or import and pass patterns into the engine).
@@ -85,6 +85,8 @@ sigdetect detect \
85
85
  - `--profile` selects tuned role logic:
86
86
  - `hipaa` → patient / representative / attorney
87
87
  - `retainer` → client / firm (prefers detecting two signatures)
88
+ - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
89
+ - `--crop-signatures` enables PNG crops for each detected widget (requires installing the optional `pymupdf` dependency). Use `--crop-dir` to override the destination and `--crop-dpi` to choose rendering quality.
88
90
  - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
89
91
 
90
92
  ### EDA (quick aggregate stats)
@@ -118,7 +120,7 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
118
120
  print(result.to_dict())
119
121
  ~~~
120
122
 
121
- `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)).
123
+ `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image.
122
124
 
123
125
  ---
124
126
 
@@ -129,7 +131,17 @@ Import from `sigdetect.api` and get plain dicts out (JSON-ready),
129
131
  with no I/O side effects by default:
130
132
 
131
133
  ~~~python
132
- from sigdetect.api import DetectPdf, DetectMany, ScanDirectory, ToCsvRow, Version
134
+ from pathlib import Path
135
+
136
+ from sigdetect.api import (
137
+ CropSignatureImages,
138
+ DetectMany,
139
+ DetectPdf,
140
+ ScanDirectory,
141
+ ToCsvRow,
142
+ Version,
143
+ get_detector,
144
+ )
133
145
 
134
146
  print("sigdetect", Version())
135
147
 
@@ -161,8 +173,24 @@ for res in ScanDirectory(
161
173
  # store in DB, print, etc.
162
174
  pass
163
175
 
176
+ # 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
177
+ detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
178
+ file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
179
+ crops = CropSignatureImages(
180
+ "/path/to/pdfs/example.pdf",
181
+ file_result,
182
+ outputDirectory="./signature_crops",
183
+ dpi=200,
184
+ returnBytes=True, # also returns in-memory PNG bytes for each crop
185
+ )
186
+
187
+ first_crop = crops[0]
188
+ print(first_crop.path, len(first_crop.image_bytes))
164
189
  ~~~
165
190
 
191
+ When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
192
+ PNG bytes, and the originating signature metadata.
193
+
166
194
 
167
195
  ## Result schema
168
196
 
@@ -188,7 +216,10 @@ High-level summary (per file):
188
216
  "score": 5,
189
217
  "scores": { "field": 3, "page_label": 2 },
190
218
  "evidence": ["field:patient", "page_label:patient"],
191
- "hint": "AcroSig:sig_patient"
219
+ "hint": "AcroSig:sig_patient",
220
+ "render_type": "typed",
221
+ "bounding_box": [10.0, 10.0, 150.0, 40.0],
222
+ "crop_path": "signature_crops/example/sig_01_patient.png"
192
223
  },
193
224
  {
194
225
  "page": null,
@@ -197,7 +228,10 @@ High-level summary (per file):
197
228
  "score": 6,
198
229
  "scores": { "page_label": 4, "general": 2 },
199
230
  "evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
200
- "hint": "VendorOrAcroOnly"
231
+ "hint": "VendorOrAcroOnly",
232
+ "render_type": "unknown",
233
+ "bounding_box": null,
234
+ "crop_path": null
201
235
  }
202
236
  ]
203
237
  }
@@ -210,6 +244,8 @@ High-level summary (per file):
210
244
  - **`mixed`** means both `esign_found` and `scanned_pdf` are `true`.
211
245
  - **`roles`** summarizes unique non-`unknown` roles across signatures.
212
246
  - In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
247
+ - **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
248
+ - **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
213
249
 
214
250
  ---
215
251
 
@@ -235,6 +271,9 @@ engine: pypdf2
235
271
  pseudo_signatures: true
236
272
  recurse_xobjects: true
237
273
  profile: retainer # or: hipaa
274
+ crop_signatures: false # enable to write PNG crops (requires pymupdf)
275
+ # crop_output_dir: ./signature_crops
276
+ crop_image_dpi: 200
238
277
  ~~~
239
278
 
240
279
  YAML files can be customized or loaded at runtime (see CLI `--config`, if available, or import and pass patterns into the engine).
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sigdetect"
7
- version = "0.1.1"
7
+ version = "0.3.0"
8
8
  description = "Signature detection and role attribution for PDFs"
9
9
  readme = "README.md"
10
10
  authors = [{ name = "BT Asmamaw", email = "basmamaw@angeiongroup.com" }]
@@ -12,7 +12,6 @@ license = { text = "MIT" }
12
12
  requires-python = ">=3.9"
13
13
  dependencies = [
14
14
  "pypdf>=4.0.0",
15
- "pandas>=2.0",
16
15
  "rich>=13.0",
17
16
  "typer>=0.12",
18
17
  "pydantic>=2.5",
@@ -0,0 +1,287 @@
1
+ """Public helpers for programmatic use of the signature detection engine."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from contextlib import contextmanager
6
+ from pathlib import Path
7
+ from typing import Any, Generator, Iterable, Iterator, Literal, overload
8
+
9
+ from sigdetect.config import DetectConfiguration
10
+ from sigdetect.cropping import SignatureCrop
11
+ from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
12
+
13
+ EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
14
+ ProfileName = Literal["hipaa", "retainer"]
15
+
16
+
17
def DetectPdf(
    pdfPath: str | Path,
    *,
    profileName: ProfileName = "hipaa",
    engineName: EngineName = "pypdf2",
    includePseudoSignatures: bool = True,
    recurseXObjects: bool = True,
    detector: Detector | None = None,
) -> dict[str, Any]:
    """Detect signature evidence and assign roles for a single PDF.

    Args:
        pdfPath: Location of the PDF to inspect.
        profileName: Role profile forwarded to :func:`get_detector`.
        engineName: Engine name forwarded to :func:`get_detector`.
        includePseudoSignatures: Forwarded to :func:`get_detector`.
        recurseXObjects: Forwarded to :func:`get_detector`.
        detector: Pre-built detector to reuse. When supplied, the four
            configuration options above are not consulted.

    Returns:
        The detection result converted to a plain dictionary.
    """

    resolvedPath = Path(pdfPath)

    # Compare against ``None`` explicitly (as DetectMany does) so that a
    # caller-supplied detector is never discarded just because it is falsy.
    if detector is not None:
        activeDetector = detector
    else:
        activeDetector = get_detector(
            pdfRoot=resolvedPath.parent,
            profileName=profileName,
            engineName=engineName,
            includePseudoSignatures=includePseudoSignatures,
            recurseXObjects=recurseXObjects,
            outputDirectory=None,
        )

    return _ToPlainDictionary(activeDetector.Detect(resolvedPath))
40
+
41
+
42
def get_detector(
    *,
    pdfRoot: str | Path | None = None,
    profileName: ProfileName = "hipaa",
    engineName: EngineName = "pypdf2",
    includePseudoSignatures: bool = True,
    recurseXObjects: bool = True,
    outputDirectory: str | Path | None = None,
) -> Detector:
    """Build a reusable detector from the supplied options.

    ``pdfRoot`` falls back to the current working directory when omitted;
    ``outputDirectory`` is passed through as ``None`` when omitted.
    """

    rootPath = Path.cwd() if pdfRoot is None else Path(pdfRoot)
    outputPath = None if outputDirectory is None else Path(outputDirectory)
    return BuildDetector(
        DetectConfiguration(
            PdfRoot=rootPath,
            OutputDirectory=outputPath,
            Engine=engineName,
            PseudoSignatures=includePseudoSignatures,
            RecurseXObjects=recurseXObjects,
            Profile=profileName,
        )
    )
62
+
63
+
64
def _ToPlainDictionary(candidate: Any) -> dict[str, Any]:
    """Return ``candidate`` as a plain dictionary.

    Strategies are tried in this order: a ``to_dict()`` method, a
    ``model_dump()`` method, a ``dict()`` method, dataclass conversion via
    ``dataclasses.asdict``, and finally a ``dict`` whose values are passed
    through :func:`_ToPlainValue`.

    Raises:
        TypeError: If none of the strategies applies.
    """

    # First matching exporter method wins; the order is significant.
    for exporterName in ("to_dict", "model_dump", "dict"):
        if hasattr(candidate, exporterName):
            return getattr(candidate, exporterName)()

    try:
        import dataclasses

        if dataclasses.is_dataclass(candidate):
            return dataclasses.asdict(candidate)
    except Exception:
        # Best effort: a failed dataclass conversion falls through to the
        # remaining strategies instead of propagating.
        pass

    if isinstance(candidate, dict):
        return {key: _ToPlainValue(candidate[key]) for key in candidate}

    raise TypeError(f"Unsupported result type: {type(candidate)!r}")
83
+
84
+
85
def _ToPlainValue(value: Any) -> Any:
    """Convert ``value`` to plain data on a best-effort basis.

    Objects exposing ``to_dict()`` are exported directly; objects exposing
    ``model_dump()`` or ``dict()`` are delegated to
    :func:`_ToPlainDictionary`; dataclasses go through ``asdict``. Lists,
    tuples and dicts are converted element by element. Anything else is
    returned unchanged.
    """

    if hasattr(value, "to_dict"):
        return value.to_dict()
    if any(hasattr(value, name) for name in ("model_dump", "dict")):
        return _ToPlainDictionary(value)

    try:
        import dataclasses

        if dataclasses.is_dataclass(value):
            return dataclasses.asdict(value)
    except Exception:
        # Best effort: fall through to the container checks below.
        pass

    if isinstance(value, (list, tuple)):
        convertedItems = [_ToPlainValue(item) for item in value]
        return convertedItems if isinstance(value, list) else tuple(convertedItems)
    if isinstance(value, dict):
        return {key: _ToPlainValue(result) for key, result in value.items()}
    return value
106
+
107
+
108
def DetectMany(
    pdfPaths: Iterable[str | Path],
    *,
    detector: Detector | None = None,
    **kwargs: Any,
) -> Iterator[dict[str, Any]]:
    """Lazily yield one detection dictionary per entry of ``pdfPaths``.

    When ``detector`` is supplied it is reused for every path and ``kwargs``
    are not consulted; otherwise each path is handed to :func:`DetectPdf`
    together with ``kwargs``.
    """

    if detector is None:
        for candidatePath in pdfPaths:
            yield DetectPdf(candidatePath, **kwargs)
    else:
        for candidatePath in pdfPaths:
            yield _DetectWithDetector(detector, candidatePath)
123
+
124
+
125
def ScanDirectory(
    pdfRoot: str | Path,
    *,
    globPattern: str = "**/*.pdf",
    detector: Detector | None = None,
    **kwargs: Any,
) -> Iterator[dict[str, Any]]:
    """Yield detection output for every PDF found beneath ``pdfRoot``.

    The default pattern walks every file recursively. Other patterns that
    start with ``**/`` are searched recursively with that prefix stripped,
    and anything else is matched with a plain ``glob``. Whatever the
    pattern, only regular files whose suffix is ``.pdf`` (compared
    case-insensitively) are passed to :func:`DetectPdf`.
    """

    searchRoot = Path(pdfRoot)
    if globPattern == "**/*.pdf":
        candidates = (entry for entry in searchRoot.rglob("*") if entry.is_file())
    elif globPattern.startswith("**/"):
        candidates = searchRoot.rglob(globPattern.replace("**/", "", 1))
    else:
        candidates = searchRoot.glob(globPattern)

    for candidate in candidates:
        if not candidate.is_file():
            continue
        if candidate.suffix.lower() != ".pdf":
            continue
        yield DetectPdf(candidate, detector=detector, **kwargs)
147
+
148
+
149
def ToCsvRow(result: dict[str, Any]) -> dict[str, Any]:
    """Project ``result`` onto the fixed set of columns used for CSV export.

    Keys absent from ``result`` map to ``None``; keys outside the column set
    are dropped. Column order is fixed.
    """

    csvColumns = (
        "file",
        "size_kb",
        "pages",
        "esign_found",
        "scanned_pdf",
        "mixed",
        "sig_count",
        "sig_pages",
        "roles",
        "hints",
    )
    return {column: result.get(column) for column in csvColumns}
164
+
165
+
166
def Version() -> str:
    """Return the installed ``sigdetect`` version without importing the CLI stack.

    Falls back to ``"0.0.0-dev"`` when the version lookup fails for any reason.
    """

    try:
        from importlib.metadata import version as lookupVersion

        installedVersion = lookupVersion("sigdetect")
    except Exception:
        return "0.0.0-dev"
    return installedVersion
175
+
176
+
177
def _DetectWithDetector(detector: Detector, pdfPath: str | Path) -> dict[str, Any]:
    """Run ``detector`` against ``pdfPath`` and return the outcome as a plain dictionary."""

    detection = detector.Detect(Path(pdfPath))
    return _ToPlainDictionary(detection)
182
+
183
+
184
@contextmanager
def detector_context(**kwargs: Any) -> Generator[Detector, None, None]:
    """Yield a detector built by :func:`get_detector` for use in a ``with`` block.

    All keyword arguments are forwarded to :func:`get_detector`. No teardown
    is performed when the block exits.
    """

    builtDetector = get_detector(**kwargs)
    yield builtDetector
193
+
194
+
195
@overload
def CropSignatureImages(
    pdfPath: str | Path,
    fileResult: FileResult | dict[str, Any],
    *,
    outputDirectory: str | Path,
    dpi: int = 200,
    returnBytes: Literal[False] = False,
) -> list[Path]: ...


# ``dpi`` carries the same default as the implementation so that type
# checkers accept ``CropSignatureImages(..., returnBytes=True)`` without an
# explicit ``dpi`` argument.
@overload
def CropSignatureImages(
    pdfPath: str | Path,
    fileResult: FileResult | dict[str, Any],
    *,
    outputDirectory: str | Path,
    dpi: int = 200,
    returnBytes: Literal[True],
) -> list[SignatureCrop]: ...


def CropSignatureImages(
    pdfPath: str | Path,
    fileResult: FileResult | dict[str, Any],
    *,
    outputDirectory: str | Path,
    dpi: int = 200,
    returnBytes: bool = False,
) -> list[Path] | list[SignatureCrop]:
    """Crop detected signature regions to PNG files.

    Accepts either a :class:`FileResult` instance or the ``dict`` returned by
    :func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.
    Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop.

    Args:
        pdfPath: PDF to crop from.
        fileResult: Detection result describing the signatures.
        outputDirectory: Directory handed to the cropper as its output location.
        dpi: Rendering resolution handed to the cropper.
        returnBytes: Selects the return flavour (see the overloads).

    Returns:
        Whatever ``sigdetect.cropping.crop_signatures`` returns: a list of
        paths, or a list of ``SignatureCrop`` objects when ``returnBytes`` is
        true.

    Note:
        When ``fileResult`` is a ``dict`` it is updated in place: its contents
        are cleared and replaced with ``to_dict()`` of the coerced result
        after cropping, so keys not produced by ``to_dict()`` are dropped.
    """

    # Imported lazily so the cropping helper is only loaded when cropping is requested.
    from sigdetect.cropping import crop_signatures

    file_result_obj, original_dict = _CoerceFileResult(fileResult)
    paths = crop_signatures(
        pdf_path=Path(pdfPath),
        file_result=file_result_obj,
        output_dir=Path(outputDirectory),
        dpi=dpi,
        return_bytes=returnBytes,
    )
    if original_dict is not None:
        # Mirror the post-crop state back into the caller's dictionary.
        original_dict.clear()
        original_dict.update(file_result_obj.to_dict())
    return paths
246
+
247
+
248
def _CoerceFileResult(
    candidate: FileResult | dict[str, Any]
) -> tuple[FileResult, dict[str, Any] | None]:
    """Normalise ``candidate`` into a :class:`FileResult`.

    Returns:
        A pair ``(file_result, source_dict)``. ``source_dict`` is ``None``
        when ``candidate`` already was a ``FileResult``; otherwise it is the
        very dictionary that was passed in.

    Raises:
        TypeError: If ``candidate`` is neither a ``FileResult`` nor a ``dict``.
    """

    if isinstance(candidate, FileResult):
        return candidate, None
    if not isinstance(candidate, dict):
        raise TypeError("fileResult must be FileResult or dict")

    def _BuildSignature(entry: dict[str, Any]) -> Signature:
        # Missing or falsy fields fall back to neutral defaults.
        box = entry.get("bounding_box")
        return Signature(
            Page=entry.get("page"),
            FieldName=str(entry.get("field_name") or ""),
            Role=str(entry.get("role") or "unknown"),
            Score=int(entry.get("score") or 0),
            Scores=dict(entry.get("scores") or {}),
            Evidence=list(entry.get("evidence") or []),
            Hint=str(entry.get("hint") or ""),
            RenderType=str(entry.get("render_type") or "unknown"),
            BoundingBox=tuple(box) if box else None,
            CropPath=entry.get("crop_path"),
        )

    rebuiltSignatures: list[Signature] = [
        _BuildSignature(entry) for entry in candidate.get("signatures") or []
    ]

    rebuiltResult = FileResult(
        File=str(candidate.get("file") or ""),
        SizeKilobytes=candidate.get("size_kb"),
        PageCount=int(candidate.get("pages") or 0),
        ElectronicSignatureFound=bool(candidate.get("esign_found")),
        ScannedPdf=candidate.get("scanned_pdf"),
        MixedContent=candidate.get("mixed"),
        # A missing or zero ``sig_count`` falls back to the number of rebuilt signatures.
        SignatureCount=int(candidate.get("sig_count") or len(rebuiltSignatures)),
        SignaturePages=str(candidate.get("sig_pages") or ""),
        Roles=str(candidate.get("roles") or "unknown"),
        Hints=str(candidate.get("hints") or ""),
        Signatures=rebuiltSignatures,
    )
    return rebuiltResult, candidate