sigdetect 0.1.1__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sigdetect-0.1.1 → sigdetect-0.3.0}/PKG-INFO +44 -6
- {sigdetect-0.1.1 → sigdetect-0.3.0}/README.md +43 -4
- {sigdetect-0.1.1 → sigdetect-0.3.0}/pyproject.toml +1 -2
- sigdetect-0.3.0/src/sigdetect/api.py +287 -0
- sigdetect-0.3.0/src/sigdetect/cli.py +232 -0
- {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/config.py +49 -9
- sigdetect-0.3.0/src/sigdetect/cropping.py +177 -0
- sigdetect-0.3.0/src/sigdetect/detector/pymupdf_engine.py +420 -0
- {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/detector/pypdf2_engine.py +46 -8
- {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/detector/signature_model.py +4 -0
- {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect.egg-info/PKG-INFO +44 -6
- {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect.egg-info/SOURCES.txt +6 -1
- {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect.egg-info/requires.txt +0 -1
- sigdetect-0.3.0/tests/test_api.py +60 -0
- sigdetect-0.3.0/tests/test_cropping.py +113 -0
- sigdetect-0.3.0/tests/test_pymupdf_engine.py +87 -0
- sigdetect-0.3.0/tests/test_widget_role_patient_smoke.py +66 -0
- sigdetect-0.1.1/src/sigdetect/api.py +0 -139
- sigdetect-0.1.1/src/sigdetect/cli.py +0 -98
- sigdetect-0.1.1/src/sigdetect/detector/pymupdf_engine.py +0 -0
- {sigdetect-0.1.1 → sigdetect-0.3.0}/setup.cfg +0 -0
- {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/__init__.py +0 -0
- {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/data/role_rules.retainer.yml +0 -0
- {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/data/role_rules.yml +0 -0
- {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/data/vendor_patterns.yml +0 -0
- {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/detector/__init__.py +0 -0
- {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/detector/base.py +0 -0
- {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/detector/base_detector.py +0 -0
- {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/detector/file_result_model.py +0 -0
- {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/eda.py +0 -0
- {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/logging_setup.py +0 -0
- {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect/utils.py +0 -0
- {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect.egg-info/dependency_links.txt +0 -0
- {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect.egg-info/entry_points.txt +0 -0
- {sigdetect-0.1.1 → sigdetect-0.3.0}/src/sigdetect.egg-info/top_level.txt +0 -0
|
@@ -1,13 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sigdetect
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Signature detection and role attribution for PDFs
|
|
5
5
|
Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
|
|
6
6
|
License: MIT
|
|
7
7
|
Requires-Python: >=3.9
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
9
|
Requires-Dist: pypdf>=4.0.0
|
|
10
|
-
Requires-Dist: pandas>=2.0
|
|
11
10
|
Requires-Dist: rich>=13.0
|
|
12
11
|
Requires-Dist: typer>=0.12
|
|
13
12
|
Requires-Dist: pydantic>=2.5
|
|
@@ -102,6 +101,8 @@ sigdetect detect \
|
|
|
102
101
|
- `--profile` selects tuned role logic:
|
|
103
102
|
- `hipaa` → patient / representative / attorney
|
|
104
103
|
- `retainer` → client / firm (prefers detecting two signatures)
|
|
104
|
+
- `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
|
|
105
|
+
- `--crop-signatures` enables PNG crops for each detected widget (requires installing the optional `pymupdf` dependency). Use `--crop-dir` to override the destination and `--crop-dpi` to choose rendering quality.
|
|
105
106
|
- If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
|
|
106
107
|
|
|
107
108
|
### EDA (quick aggregate stats)
|
|
@@ -135,7 +136,7 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
|
135
136
|
print(result.to_dict())
|
|
136
137
|
~~~
|
|
137
138
|
|
|
138
|
-
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)).
|
|
139
|
+
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image.
|
|
139
140
|
|
|
140
141
|
---
|
|
141
142
|
|
|
@@ -146,7 +147,17 @@ Import from `sigdetect.api` and get plain dicts out (JSON-ready),
|
|
|
146
147
|
with no I/O side effects by default:
|
|
147
148
|
|
|
148
149
|
~~~python
|
|
149
|
-
from
|
|
150
|
+
from pathlib import Path
|
|
151
|
+
|
|
152
|
+
from sigdetect.api import (
|
|
153
|
+
CropSignatureImages,
|
|
154
|
+
DetectMany,
|
|
155
|
+
DetectPdf,
|
|
156
|
+
ScanDirectory,
|
|
157
|
+
ToCsvRow,
|
|
158
|
+
Version,
|
|
159
|
+
get_detector,
|
|
160
|
+
)
|
|
150
161
|
|
|
151
162
|
print("sigdetect", Version())
|
|
152
163
|
|
|
@@ -178,8 +189,24 @@ for res in ScanDirectory(
|
|
|
178
189
|
# store in DB, print, etc.
|
|
179
190
|
pass
|
|
180
191
|
|
|
192
|
+
# 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
|
|
193
|
+
detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
|
|
194
|
+
file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
195
|
+
crops = CropSignatureImages(
|
|
196
|
+
"/path/to/pdfs/example.pdf",
|
|
197
|
+
file_result,
|
|
198
|
+
outputDirectory="./signature_crops",
|
|
199
|
+
dpi=200,
|
|
200
|
+
returnBytes=True, # also returns in-memory PNG bytes for each crop
|
|
201
|
+
)
|
|
202
|
+
|
|
203
|
+
first_crop = crops[0]
|
|
204
|
+
print(first_crop.path, len(first_crop.image_bytes))
|
|
181
205
|
~~~
|
|
182
206
|
|
|
207
|
+
When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
|
|
208
|
+
PNG bytes, and the originating signature metadata.
|
|
209
|
+
|
|
183
210
|
|
|
184
211
|
## Result schema
|
|
185
212
|
|
|
@@ -205,7 +232,10 @@ High-level summary (per file):
|
|
|
205
232
|
"score": 5,
|
|
206
233
|
"scores": { "field": 3, "page_label": 2 },
|
|
207
234
|
"evidence": ["field:patient", "page_label:patient"],
|
|
208
|
-
"hint": "AcroSig:sig_patient"
|
|
235
|
+
"hint": "AcroSig:sig_patient",
|
|
236
|
+
"render_type": "typed",
|
|
237
|
+
"bounding_box": [10.0, 10.0, 150.0, 40.0],
|
|
238
|
+
"crop_path": "signature_crops/example/sig_01_patient.png"
|
|
209
239
|
},
|
|
210
240
|
{
|
|
211
241
|
"page": null,
|
|
@@ -214,7 +244,10 @@ High-level summary (per file):
|
|
|
214
244
|
"score": 6,
|
|
215
245
|
"scores": { "page_label": 4, "general": 2 },
|
|
216
246
|
"evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
|
|
217
|
-
"hint": "VendorOrAcroOnly"
|
|
247
|
+
"hint": "VendorOrAcroOnly",
|
|
248
|
+
"render_type": "unknown",
|
|
249
|
+
"bounding_box": null,
|
|
250
|
+
"crop_path": null
|
|
218
251
|
}
|
|
219
252
|
]
|
|
220
253
|
}
|
|
@@ -227,6 +260,8 @@ High-level summary (per file):
|
|
|
227
260
|
- **`mixed`** means both `esign_found` and `scanned_pdf` are `true`.
|
|
228
261
|
- **`roles`** summarizes unique non-`unknown` roles across signatures.
|
|
229
262
|
- In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
|
|
263
|
+
- **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
|
|
264
|
+
- **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
|
|
230
265
|
|
|
231
266
|
---
|
|
232
267
|
|
|
@@ -252,6 +287,9 @@ engine: pypdf2
|
|
|
252
287
|
pseudo_signatures: true
|
|
253
288
|
recurse_xobjects: true
|
|
254
289
|
profile: retainer # or: hipaa
|
|
290
|
+
crop_signatures: false # enable to write PNG crops (requires pymupdf)
|
|
291
|
+
# crop_output_dir: ./signature_crops
|
|
292
|
+
crop_image_dpi: 200
|
|
255
293
|
~~~
|
|
256
294
|
|
|
257
295
|
YAML files can be customized or loaded at runtime (see CLI `--config`, if available, or import and pass patterns into the engine).
|
|
@@ -85,6 +85,8 @@ sigdetect detect \
|
|
|
85
85
|
- `--profile` selects tuned role logic:
|
|
86
86
|
- `hipaa` → patient / representative / attorney
|
|
87
87
|
- `retainer` → client / firm (prefers detecting two signatures)
|
|
88
|
+
- `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
|
|
89
|
+
- `--crop-signatures` enables PNG crops for each detected widget (requires installing the optional `pymupdf` dependency). Use `--crop-dir` to override the destination and `--crop-dpi` to choose rendering quality.
|
|
88
90
|
- If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
|
|
89
91
|
|
|
90
92
|
### EDA (quick aggregate stats)
|
|
@@ -118,7 +120,7 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
|
118
120
|
print(result.to_dict())
|
|
119
121
|
~~~
|
|
120
122
|
|
|
121
|
-
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)).
|
|
123
|
+
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image.
|
|
122
124
|
|
|
123
125
|
---
|
|
124
126
|
|
|
@@ -129,7 +131,17 @@ Import from `sigdetect.api` and get plain dicts out (JSON-ready),
|
|
|
129
131
|
with no I/O side effects by default:
|
|
130
132
|
|
|
131
133
|
~~~python
|
|
132
|
-
from
|
|
134
|
+
from pathlib import Path
|
|
135
|
+
|
|
136
|
+
from sigdetect.api import (
|
|
137
|
+
CropSignatureImages,
|
|
138
|
+
DetectMany,
|
|
139
|
+
DetectPdf,
|
|
140
|
+
ScanDirectory,
|
|
141
|
+
ToCsvRow,
|
|
142
|
+
Version,
|
|
143
|
+
get_detector,
|
|
144
|
+
)
|
|
133
145
|
|
|
134
146
|
print("sigdetect", Version())
|
|
135
147
|
|
|
@@ -161,8 +173,24 @@ for res in ScanDirectory(
|
|
|
161
173
|
# store in DB, print, etc.
|
|
162
174
|
pass
|
|
163
175
|
|
|
176
|
+
# 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
|
|
177
|
+
detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
|
|
178
|
+
file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
179
|
+
crops = CropSignatureImages(
|
|
180
|
+
"/path/to/pdfs/example.pdf",
|
|
181
|
+
file_result,
|
|
182
|
+
outputDirectory="./signature_crops",
|
|
183
|
+
dpi=200,
|
|
184
|
+
returnBytes=True, # also returns in-memory PNG bytes for each crop
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
first_crop = crops[0]
|
|
188
|
+
print(first_crop.path, len(first_crop.image_bytes))
|
|
164
189
|
~~~
|
|
165
190
|
|
|
191
|
+
When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
|
|
192
|
+
PNG bytes, and the originating signature metadata.
|
|
193
|
+
|
|
166
194
|
|
|
167
195
|
## Result schema
|
|
168
196
|
|
|
@@ -188,7 +216,10 @@ High-level summary (per file):
|
|
|
188
216
|
"score": 5,
|
|
189
217
|
"scores": { "field": 3, "page_label": 2 },
|
|
190
218
|
"evidence": ["field:patient", "page_label:patient"],
|
|
191
|
-
"hint": "AcroSig:sig_patient"
|
|
219
|
+
"hint": "AcroSig:sig_patient",
|
|
220
|
+
"render_type": "typed",
|
|
221
|
+
"bounding_box": [10.0, 10.0, 150.0, 40.0],
|
|
222
|
+
"crop_path": "signature_crops/example/sig_01_patient.png"
|
|
192
223
|
},
|
|
193
224
|
{
|
|
194
225
|
"page": null,
|
|
@@ -197,7 +228,10 @@ High-level summary (per file):
|
|
|
197
228
|
"score": 6,
|
|
198
229
|
"scores": { "page_label": 4, "general": 2 },
|
|
199
230
|
"evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
|
|
200
|
-
"hint": "VendorOrAcroOnly"
|
|
231
|
+
"hint": "VendorOrAcroOnly",
|
|
232
|
+
"render_type": "unknown",
|
|
233
|
+
"bounding_box": null,
|
|
234
|
+
"crop_path": null
|
|
201
235
|
}
|
|
202
236
|
]
|
|
203
237
|
}
|
|
@@ -210,6 +244,8 @@ High-level summary (per file):
|
|
|
210
244
|
- **`mixed`** means both `esign_found` and `scanned_pdf` are `true`.
|
|
211
245
|
- **`roles`** summarizes unique non-`unknown` roles across signatures.
|
|
212
246
|
- In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
|
|
247
|
+
- **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
|
|
248
|
+
- **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
|
|
213
249
|
|
|
214
250
|
---
|
|
215
251
|
|
|
@@ -235,6 +271,9 @@ engine: pypdf2
|
|
|
235
271
|
pseudo_signatures: true
|
|
236
272
|
recurse_xobjects: true
|
|
237
273
|
profile: retainer # or: hipaa
|
|
274
|
+
crop_signatures: false # enable to write PNG crops (requires pymupdf)
|
|
275
|
+
# crop_output_dir: ./signature_crops
|
|
276
|
+
crop_image_dpi: 200
|
|
238
277
|
~~~
|
|
239
278
|
|
|
240
279
|
YAML files can be customized or loaded at runtime (see CLI `--config`, if available, or import and pass patterns into the engine).
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "sigdetect"
|
|
7
|
-
version = "0.1.1"
|
|
7
|
+
version = "0.3.0"
|
|
8
8
|
description = "Signature detection and role attribution for PDFs"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
authors = [{ name = "BT Asmamaw", email = "basmamaw@angeiongroup.com" }]
|
|
@@ -12,7 +12,6 @@ license = { text = "MIT" }
|
|
|
12
12
|
requires-python = ">=3.9"
|
|
13
13
|
dependencies = [
|
|
14
14
|
"pypdf>=4.0.0",
|
|
15
|
-
"pandas>=2.0",
|
|
16
15
|
"rich>=13.0",
|
|
17
16
|
"typer>=0.12",
|
|
18
17
|
"pydantic>=2.5",
|
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
"""Public helpers for programmatic use of the signature detection engine."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from contextlib import contextmanager
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Generator, Iterable, Iterator, Literal, overload
|
|
8
|
+
|
|
9
|
+
from sigdetect.config import DetectConfiguration
|
|
10
|
+
from sigdetect.cropping import SignatureCrop
|
|
11
|
+
from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
|
|
12
|
+
|
|
13
|
+
EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
|
|
14
|
+
ProfileName = Literal["hipaa", "retainer"]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def DetectPdf(
    pdfPath: str | Path,
    *,
    profileName: ProfileName = "hipaa",
    engineName: EngineName = "pypdf2",
    includePseudoSignatures: bool = True,
    recurseXObjects: bool = True,
    detector: Detector | None = None,
) -> dict[str, Any]:
    """Detect signature evidence and assign roles for a single PDF.

    Args:
        pdfPath: Location of the PDF to analyse.
        profileName: Role profile forwarded to :func:`get_detector`.
        engineName: Detection engine forwarded to :func:`get_detector`.
        includePseudoSignatures: Forwarded to :func:`get_detector`.
        recurseXObjects: Forwarded to :func:`get_detector`.
        detector: Pre-built detector to reuse. When supplied, the four
            configuration options above are not consulted.

    Returns:
        The detection result converted to a plain dictionary.
    """

    resolvedPath = Path(pdfPath)

    # Compare against None explicitly (as DetectMany does) so that a
    # caller-supplied detector is never discarded just because it happens
    # to evaluate as falsy.
    if detector is not None:
        activeDetector = detector
    else:
        activeDetector = get_detector(
            pdfRoot=resolvedPath.parent,
            profileName=profileName,
            engineName=engineName,
            includePseudoSignatures=includePseudoSignatures,
            recurseXObjects=recurseXObjects,
            outputDirectory=None,
        )

    result = activeDetector.Detect(resolvedPath)
    return _ToPlainDictionary(result)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def get_detector(
    *,
    pdfRoot: str | Path | None = None,
    profileName: ProfileName = "hipaa",
    engineName: EngineName = "pypdf2",
    includePseudoSignatures: bool = True,
    recurseXObjects: bool = True,
    outputDirectory: str | Path | None = None,
) -> Detector:
    """Build a detector from the given options so it can be reused.

    Args:
        pdfRoot: Root directory for PDFs; the current working directory is
            used when omitted.
        profileName: Value stored as the configuration's ``Profile``.
        engineName: Value stored as the configuration's ``Engine``.
        includePseudoSignatures: Value stored as ``PseudoSignatures``.
        recurseXObjects: Value stored as ``RecurseXObjects``.
        outputDirectory: Optional output directory; left as ``None`` when
            not provided.

    Returns:
        The detector produced by ``BuildDetector`` for this configuration.
    """

    if pdfRoot is None:
        rootPath = Path.cwd()
    else:
        rootPath = Path(pdfRoot)

    outputPath = None if outputDirectory is None else Path(outputDirectory)

    settings = DetectConfiguration(
        PdfRoot=rootPath,
        OutputDirectory=outputPath,
        Engine=engineName,
        PseudoSignatures=includePseudoSignatures,
        RecurseXObjects=recurseXObjects,
        Profile=profileName,
    )
    return BuildDetector(settings)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _ToPlainDictionary(candidate: Any) -> dict[str, Any]:
    """Turn a result object into a plain dictionary.

    Conversion strategies are tried in order: a ``to_dict``, ``model_dump``
    or ``dict`` method on the object, then ``dataclasses.asdict``, and
    finally a mapping whose values are converted with ``_ToPlainValue``.

    Raises:
        TypeError: If none of the strategies applies to ``candidate``.
    """

    # The first exporter method present on the object wins.
    for exporterName in ("to_dict", "model_dump", "dict"):
        if hasattr(candidate, exporterName):
            return getattr(candidate, exporterName)()

    try:
        from dataclasses import asdict, is_dataclass

        if is_dataclass(candidate):
            return asdict(candidate)
    except Exception:
        # Best effort: a failed dataclass conversion falls through to the
        # remaining checks instead of propagating.
        pass

    if not isinstance(candidate, dict):
        raise TypeError(f"Unsupported result type: {type(candidate)!r}")
    return {key: _ToPlainValue(candidate[key]) for key in candidate}
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _ToPlainValue(value: Any) -> Any:
    """Convert a possibly nested value to plain Python data, best effort.

    Objects exposing ``to_dict`` are exported directly; objects exposing
    ``model_dump`` or ``dict`` are delegated to ``_ToPlainDictionary``;
    dataclasses go through ``dataclasses.asdict``. Lists, tuples and dicts
    are rebuilt with their items converted recursively. Anything else is
    returned unchanged.
    """

    if hasattr(value, "to_dict"):
        return value.to_dict()
    if any(hasattr(value, exporterName) for exporterName in ("model_dump", "dict")):
        return _ToPlainDictionary(value)

    try:
        from dataclasses import asdict, is_dataclass

        if is_dataclass(value):
            return asdict(value)
    except Exception:
        # Best effort: fall through to the container checks below.
        pass

    if isinstance(value, dict):
        return {key: _ToPlainValue(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        converted = [_ToPlainValue(item) for item in value]
        # Lists stay lists; every tuple (sub)type comes back as a plain tuple.
        return converted if isinstance(value, list) else tuple(converted)
    return value
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def DetectMany(
    pdfPaths: Iterable[str | Path],
    *,
    detector: Detector | None = None,
    **kwargs: Any,
) -> Iterator[dict[str, Any]]:
    """Lazily yield one detection dictionary per entry of ``pdfPaths``.

    When ``detector`` is supplied it is used for every path and ``kwargs``
    are not consulted; otherwise each path is handed to :func:`DetectPdf`
    together with ``kwargs``.
    """

    for candidatePath in pdfPaths:
        if detector is None:
            yield DetectPdf(candidatePath, **kwargs)
        else:
            yield _DetectWithDetector(detector, candidatePath)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def ScanDirectory(
    pdfRoot: str | Path,
    *,
    globPattern: str = "**/*.pdf",
    detector: Detector | None = None,
    **kwargs: Any,
) -> Iterator[dict[str, Any]]:
    """Walk ``pdfRoot`` and yield detection output for every matching PDF.

    Args:
        pdfRoot: Directory to search.
        globPattern: Pattern selecting candidate paths. A leading ``**/``
            makes the search recursive; any other pattern is applied with a
            non-recursive ``glob``.
        detector: Optional detector forwarded to :func:`DetectPdf`.
        **kwargs: Extra keyword arguments forwarded to :func:`DetectPdf`.

    Yields:
        The plain-dictionary result of :func:`DetectPdf` for each regular
        file whose suffix is ``.pdf`` (compared case-insensitively).
    """

    rootDirectory = Path(pdfRoot)
    if globPattern == "**/*.pdf":
        # Walk everything and rely on the case-insensitive suffix filter
        # below, so extensions such as ".PDF" are matched as well.
        candidates = rootDirectory.rglob("*")
    elif globPattern.startswith("**/"):
        candidates = rootDirectory.rglob(globPattern.replace("**/", "", 1))
    else:
        candidates = rootDirectory.glob(globPattern)

    for pdfPath in candidates:
        # Cheap string comparison first; only paths that look like PDFs are
        # stat'ed, and each of them exactly once.
        if pdfPath.suffix.lower() == ".pdf" and pdfPath.is_file():
            yield DetectPdf(pdfPath, detector=detector, **kwargs)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def ToCsvRow(result: dict[str, Any]) -> dict[str, Any]:
    """Pick the summary keys of ``result`` that are meant for CSV export.

    Keys missing from ``result`` are present in the row with value ``None``.
    """

    exportedKeys = (
        "file",
        "size_kb",
        "pages",
        "esign_found",
        "scanned_pdf",
        "mixed",
        "sig_count",
        "sig_pages",
        "roles",
        "hints",
    )
    return {key: result.get(key) for key in exportedKeys}
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def Version() -> str:
    """Report the installed ``sigdetect`` version without loading the CLI stack.

    Returns ``"0.0.0-dev"`` when the version lookup fails for any reason.
    """

    try:
        from importlib.metadata import version as lookupVersion

        installedVersion = lookupVersion("sigdetect")
    except Exception:
        return "0.0.0-dev"
    return installedVersion
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _DetectWithDetector(detector: Detector, pdfPath: str | Path) -> dict[str, Any]:
    """Run ``detector`` on ``pdfPath`` and return the result as a plain dict."""

    detection = detector.Detect(Path(pdfPath))
    return _ToPlainDictionary(detection)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
@contextmanager
def detector_context(**kwargs: Any) -> Generator[Detector, None, None]:
    """Provide a detector built by :func:`get_detector` as a context manager.

    All keyword arguments are forwarded to :func:`get_detector`. No teardown
    step is performed when the context exits.
    """

    yield get_detector(**kwargs)
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
@overload
def CropSignatureImages(
    pdfPath: str | Path,
    fileResult: FileResult | dict[str, Any],
    *,
    outputDirectory: str | Path,
    dpi: int = 200,
    returnBytes: Literal[False] = False,
) -> list[Path]: ...


# ``dpi`` keeps its default here too, so ``returnBytes=True`` with ``dpi``
# omitted (valid at runtime) also resolves to an overload for type checkers.
@overload
def CropSignatureImages(
    pdfPath: str | Path,
    fileResult: FileResult | dict[str, Any],
    *,
    outputDirectory: str | Path,
    dpi: int = 200,
    returnBytes: Literal[True],
) -> list[SignatureCrop]: ...


def CropSignatureImages(
    pdfPath: str | Path,
    fileResult: FileResult | dict[str, Any],
    *,
    outputDirectory: str | Path,
    dpi: int = 200,
    returnBytes: bool = False,
) -> list[Path] | list[SignatureCrop]:
    """Crop detected signature regions to PNG files.

    Accepts either a :class:`FileResult` instance or the ``dict`` returned by
    :func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.
    Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop.

    Args:
        pdfPath: PDF the signatures were detected in.
        fileResult: Detection result describing the signatures to crop.
        outputDirectory: Directory handed to the cropping helper as its
            output location.
        dpi: Rendering resolution handed to the cropping helper.
        returnBytes: Selects the return flavour (see Returns).

    Returns:
        A list of paths when ``returnBytes`` is false, otherwise a list of
        ``SignatureCrop`` objects.

    Note:
        When ``fileResult`` is a ``dict`` it is updated in place: its
        contents are cleared and replaced with the ``to_dict()`` output of
        the result object after cropping.
    """

    # Imported lazily, at call time rather than at module import.
    from sigdetect.cropping import crop_signatures

    file_result_obj, original_dict = _CoerceFileResult(fileResult)
    crops = crop_signatures(
        pdf_path=Path(pdfPath),
        file_result=file_result_obj,
        output_dir=Path(outputDirectory),
        dpi=dpi,
        return_bytes=returnBytes,
    )
    if original_dict is not None:
        # Mirror the post-crop state back into the caller's dictionary.
        original_dict.clear()
        original_dict.update(file_result_obj.to_dict())
    return crops
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def _CoerceFileResult(
    candidate: FileResult | dict[str, Any]
) -> tuple[FileResult, dict[str, Any] | None]:
    """Normalise ``candidate`` to a ``FileResult``.

    Returns:
        ``(candidate, None)`` when a ``FileResult`` is passed in; otherwise
        a ``FileResult`` rebuilt from the dictionary's keys together with
        that same dictionary.

    Raises:
        TypeError: If ``candidate`` is neither a ``FileResult`` nor a dict.
    """

    if isinstance(candidate, FileResult):
        return candidate, None
    if not isinstance(candidate, dict):
        raise TypeError("fileResult must be FileResult or dict")

    def buildSignature(entry: dict[str, Any]) -> Signature:
        # Missing or falsy fields fall back to neutral defaults.
        box = entry.get("bounding_box")
        return Signature(
            Page=entry.get("page"),
            FieldName=str(entry.get("field_name") or ""),
            Role=str(entry.get("role") or "unknown"),
            Score=int(entry.get("score") or 0),
            Scores=dict(entry.get("scores") or {}),
            Evidence=list(entry.get("evidence") or []),
            Hint=str(entry.get("hint") or ""),
            RenderType=str(entry.get("render_type") or "unknown"),
            BoundingBox=tuple(box) if box else None,
            CropPath=entry.get("crop_path"),
        )

    rebuiltSignatures: list[Signature] = [
        buildSignature(entry) for entry in candidate.get("signatures") or []
    ]

    # A missing or zero ``sig_count`` falls back to the rebuilt list's length.
    signatureTotal = int(candidate.get("sig_count") or len(rebuiltSignatures))

    coerced = FileResult(
        File=str(candidate.get("file") or ""),
        SizeKilobytes=candidate.get("size_kb"),
        PageCount=int(candidate.get("pages") or 0),
        ElectronicSignatureFound=bool(candidate.get("esign_found")),
        ScannedPdf=candidate.get("scanned_pdf"),
        MixedContent=candidate.get("mixed"),
        SignatureCount=signatureTotal,
        SignaturePages=str(candidate.get("sig_pages") or ""),
        Roles=str(candidate.get("roles") or "unknown"),
        Hints=str(candidate.get("hints") or ""),
        Signatures=rebuiltSignatures,
    )
    return coerced, candidate
|