sigdetect 0.1.1__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sigdetect-0.1.1 → sigdetect-0.2.0}/PKG-INFO +37 -6
- {sigdetect-0.1.1 → sigdetect-0.2.0}/README.md +36 -4
- {sigdetect-0.1.1 → sigdetect-0.2.0}/pyproject.toml +1 -2
- sigdetect-0.2.0/src/sigdetect/api.py +261 -0
- sigdetect-0.2.0/src/sigdetect/cli.py +232 -0
- {sigdetect-0.1.1 → sigdetect-0.2.0}/src/sigdetect/config.py +49 -9
- sigdetect-0.2.0/src/sigdetect/cropping.py +123 -0
- sigdetect-0.2.0/src/sigdetect/detector/pymupdf_engine.py +420 -0
- {sigdetect-0.1.1 → sigdetect-0.2.0}/src/sigdetect/detector/pypdf2_engine.py +46 -8
- {sigdetect-0.1.1 → sigdetect-0.2.0}/src/sigdetect/detector/signature_model.py +4 -0
- {sigdetect-0.1.1 → sigdetect-0.2.0}/src/sigdetect.egg-info/PKG-INFO +37 -6
- {sigdetect-0.1.1 → sigdetect-0.2.0}/src/sigdetect.egg-info/SOURCES.txt +6 -1
- {sigdetect-0.1.1 → sigdetect-0.2.0}/src/sigdetect.egg-info/requires.txt +0 -1
- sigdetect-0.2.0/tests/test_api.py +60 -0
- sigdetect-0.2.0/tests/test_cropping.py +72 -0
- sigdetect-0.2.0/tests/test_pymupdf_engine.py +87 -0
- sigdetect-0.2.0/tests/test_widget_role_patient_smoke.py +66 -0
- sigdetect-0.1.1/src/sigdetect/api.py +0 -139
- sigdetect-0.1.1/src/sigdetect/cli.py +0 -98
- sigdetect-0.1.1/src/sigdetect/detector/pymupdf_engine.py +0 -0
- {sigdetect-0.1.1 → sigdetect-0.2.0}/setup.cfg +0 -0
- {sigdetect-0.1.1 → sigdetect-0.2.0}/src/sigdetect/__init__.py +0 -0
- {sigdetect-0.1.1 → sigdetect-0.2.0}/src/sigdetect/data/role_rules.retainer.yml +0 -0
- {sigdetect-0.1.1 → sigdetect-0.2.0}/src/sigdetect/data/role_rules.yml +0 -0
- {sigdetect-0.1.1 → sigdetect-0.2.0}/src/sigdetect/data/vendor_patterns.yml +0 -0
- {sigdetect-0.1.1 → sigdetect-0.2.0}/src/sigdetect/detector/__init__.py +0 -0
- {sigdetect-0.1.1 → sigdetect-0.2.0}/src/sigdetect/detector/base.py +0 -0
- {sigdetect-0.1.1 → sigdetect-0.2.0}/src/sigdetect/detector/base_detector.py +0 -0
- {sigdetect-0.1.1 → sigdetect-0.2.0}/src/sigdetect/detector/file_result_model.py +0 -0
- {sigdetect-0.1.1 → sigdetect-0.2.0}/src/sigdetect/eda.py +0 -0
- {sigdetect-0.1.1 → sigdetect-0.2.0}/src/sigdetect/logging_setup.py +0 -0
- {sigdetect-0.1.1 → sigdetect-0.2.0}/src/sigdetect/utils.py +0 -0
- {sigdetect-0.1.1 → sigdetect-0.2.0}/src/sigdetect.egg-info/dependency_links.txt +0 -0
- {sigdetect-0.1.1 → sigdetect-0.2.0}/src/sigdetect.egg-info/entry_points.txt +0 -0
- {sigdetect-0.1.1 → sigdetect-0.2.0}/src/sigdetect.egg-info/top_level.txt +0 -0
|
@@ -1,13 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sigdetect
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Signature detection and role attribution for PDFs
|
|
5
5
|
Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
|
|
6
6
|
License: MIT
|
|
7
7
|
Requires-Python: >=3.9
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
9
|
Requires-Dist: pypdf>=4.0.0
|
|
10
|
-
Requires-Dist: pandas>=2.0
|
|
11
10
|
Requires-Dist: rich>=13.0
|
|
12
11
|
Requires-Dist: typer>=0.12
|
|
13
12
|
Requires-Dist: pydantic>=2.5
|
|
@@ -102,6 +101,8 @@ sigdetect detect \
|
|
|
102
101
|
- `--profile` selects tuned role logic:
|
|
103
102
|
- `hipaa` → patient / representative / attorney
|
|
104
103
|
- `retainer` → client / firm (prefers detecting two signatures)
|
|
104
|
+
- `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
|
|
105
|
+
- `--crop-signatures` enables PNG crops for each detected widget (requires installing the optional `pymupdf` dependency). Use `--crop-dir` to override the destination and `--crop-dpi` to choose rendering quality.
|
|
105
106
|
- If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
|
|
106
107
|
|
|
107
108
|
### EDA (quick aggregate stats)
|
|
@@ -135,7 +136,7 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
|
135
136
|
print(result.to_dict())
|
|
136
137
|
~~~
|
|
137
138
|
|
|
138
|
-
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)).
|
|
139
|
+
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image.
|
|
139
140
|
|
|
140
141
|
---
|
|
141
142
|
|
|
@@ -146,7 +147,17 @@ Import from `sigdetect.api` and get plain dicts out (JSON-ready),
|
|
|
146
147
|
with no I/O side effects by default:
|
|
147
148
|
|
|
148
149
|
~~~python
|
|
149
|
-
from
|
|
150
|
+
from pathlib import Path
|
|
151
|
+
|
|
152
|
+
from sigdetect.api import (
|
|
153
|
+
CropSignatureImages,
|
|
154
|
+
DetectMany,
|
|
155
|
+
DetectPdf,
|
|
156
|
+
ScanDirectory,
|
|
157
|
+
ToCsvRow,
|
|
158
|
+
Version,
|
|
159
|
+
get_detector,
|
|
160
|
+
)
|
|
150
161
|
|
|
151
162
|
print("sigdetect", Version())
|
|
152
163
|
|
|
@@ -178,6 +189,15 @@ for res in ScanDirectory(
|
|
|
178
189
|
# store in DB, print, etc.
|
|
179
190
|
pass
|
|
180
191
|
|
|
192
|
+
# 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
|
|
193
|
+
detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
|
|
194
|
+
file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
195
|
+
CropSignatureImages(
|
|
196
|
+
"/path/to/pdfs/example.pdf",
|
|
197
|
+
file_result,
|
|
198
|
+
outputDirectory="./signature_crops",
|
|
199
|
+
dpi=200,
|
|
200
|
+
)
|
|
181
201
|
~~~
|
|
182
202
|
|
|
183
203
|
|
|
@@ -205,7 +225,10 @@ High-level summary (per file):
|
|
|
205
225
|
"score": 5,
|
|
206
226
|
"scores": { "field": 3, "page_label": 2 },
|
|
207
227
|
"evidence": ["field:patient", "page_label:patient"],
|
|
208
|
-
"hint": "AcroSig:sig_patient"
|
|
228
|
+
"hint": "AcroSig:sig_patient",
|
|
229
|
+
"render_type": "typed",
|
|
230
|
+
"bounding_box": [10.0, 10.0, 150.0, 40.0],
|
|
231
|
+
"crop_path": "signature_crops/example/sig_01_patient.png"
|
|
209
232
|
},
|
|
210
233
|
{
|
|
211
234
|
"page": null,
|
|
@@ -214,7 +237,10 @@ High-level summary (per file):
|
|
|
214
237
|
"score": 6,
|
|
215
238
|
"scores": { "page_label": 4, "general": 2 },
|
|
216
239
|
"evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
|
|
217
|
-
"hint": "VendorOrAcroOnly"
|
|
240
|
+
"hint": "VendorOrAcroOnly",
|
|
241
|
+
"render_type": "unknown",
|
|
242
|
+
"bounding_box": null,
|
|
243
|
+
"crop_path": null
|
|
218
244
|
}
|
|
219
245
|
]
|
|
220
246
|
}
|
|
@@ -227,6 +253,8 @@ High-level summary (per file):
|
|
|
227
253
|
- **`mixed`** means both `esign_found` and `scanned_pdf` are `true`.
|
|
228
254
|
- **`roles`** summarizes unique non-`unknown` roles across signatures.
|
|
229
255
|
- In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
|
|
256
|
+
- **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
|
|
257
|
+
- **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
|
|
230
258
|
|
|
231
259
|
---
|
|
232
260
|
|
|
@@ -252,6 +280,9 @@ engine: pypdf2
|
|
|
252
280
|
pseudo_signatures: true
|
|
253
281
|
recurse_xobjects: true
|
|
254
282
|
profile: retainer # or: hipaa
|
|
283
|
+
crop_signatures: false # enable to write PNG crops (requires pymupdf)
|
|
284
|
+
# crop_output_dir: ./signature_crops
|
|
285
|
+
crop_image_dpi: 200
|
|
255
286
|
~~~
|
|
256
287
|
|
|
257
288
|
YAML files can be customized or loaded at runtime (see CLI `--config`, if available, or import and pass patterns into engine).
|
|
@@ -85,6 +85,8 @@ sigdetect detect \
|
|
|
85
85
|
- `--profile` selects tuned role logic:
|
|
86
86
|
- `hipaa` → patient / representative / attorney
|
|
87
87
|
- `retainer` → client / firm (prefers detecting two signatures)
|
|
88
|
+
- `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
|
|
89
|
+
- `--crop-signatures` enables PNG crops for each detected widget (requires installing the optional `pymupdf` dependency). Use `--crop-dir` to override the destination and `--crop-dpi` to choose rendering quality.
|
|
88
90
|
- If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
|
|
89
91
|
|
|
90
92
|
### EDA (quick aggregate stats)
|
|
@@ -118,7 +120,7 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
|
118
120
|
print(result.to_dict())
|
|
119
121
|
~~~
|
|
120
122
|
|
|
121
|
-
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)).
|
|
123
|
+
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image.
|
|
122
124
|
|
|
123
125
|
---
|
|
124
126
|
|
|
@@ -129,7 +131,17 @@ Import from `sigdetect.api` and get plain dicts out (JSON-ready),
|
|
|
129
131
|
with no I/O side effects by default:
|
|
130
132
|
|
|
131
133
|
~~~python
|
|
132
|
-
from
|
|
134
|
+
from pathlib import Path
|
|
135
|
+
|
|
136
|
+
from sigdetect.api import (
|
|
137
|
+
CropSignatureImages,
|
|
138
|
+
DetectMany,
|
|
139
|
+
DetectPdf,
|
|
140
|
+
ScanDirectory,
|
|
141
|
+
ToCsvRow,
|
|
142
|
+
Version,
|
|
143
|
+
get_detector,
|
|
144
|
+
)
|
|
133
145
|
|
|
134
146
|
print("sigdetect", Version())
|
|
135
147
|
|
|
@@ -161,6 +173,15 @@ for res in ScanDirectory(
|
|
|
161
173
|
# store in DB, print, etc.
|
|
162
174
|
pass
|
|
163
175
|
|
|
176
|
+
# 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
|
|
177
|
+
detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
|
|
178
|
+
file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
179
|
+
CropSignatureImages(
|
|
180
|
+
"/path/to/pdfs/example.pdf",
|
|
181
|
+
file_result,
|
|
182
|
+
outputDirectory="./signature_crops",
|
|
183
|
+
dpi=200,
|
|
184
|
+
)
|
|
164
185
|
~~~
|
|
165
186
|
|
|
166
187
|
|
|
@@ -188,7 +209,10 @@ High-level summary (per file):
|
|
|
188
209
|
"score": 5,
|
|
189
210
|
"scores": { "field": 3, "page_label": 2 },
|
|
190
211
|
"evidence": ["field:patient", "page_label:patient"],
|
|
191
|
-
"hint": "AcroSig:sig_patient"
|
|
212
|
+
"hint": "AcroSig:sig_patient",
|
|
213
|
+
"render_type": "typed",
|
|
214
|
+
"bounding_box": [10.0, 10.0, 150.0, 40.0],
|
|
215
|
+
"crop_path": "signature_crops/example/sig_01_patient.png"
|
|
192
216
|
},
|
|
193
217
|
{
|
|
194
218
|
"page": null,
|
|
@@ -197,7 +221,10 @@ High-level summary (per file):
|
|
|
197
221
|
"score": 6,
|
|
198
222
|
"scores": { "page_label": 4, "general": 2 },
|
|
199
223
|
"evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
|
|
200
|
-
"hint": "VendorOrAcroOnly"
|
|
224
|
+
"hint": "VendorOrAcroOnly",
|
|
225
|
+
"render_type": "unknown",
|
|
226
|
+
"bounding_box": null,
|
|
227
|
+
"crop_path": null
|
|
201
228
|
}
|
|
202
229
|
]
|
|
203
230
|
}
|
|
@@ -210,6 +237,8 @@ High-level summary (per file):
|
|
|
210
237
|
- **`mixed`** means both `esign_found` and `scanned_pdf` are `true`.
|
|
211
238
|
- **`roles`** summarizes unique non-`unknown` roles across signatures.
|
|
212
239
|
- In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
|
|
240
|
+
- **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
|
|
241
|
+
- **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
|
|
213
242
|
|
|
214
243
|
---
|
|
215
244
|
|
|
@@ -235,6 +264,9 @@ engine: pypdf2
|
|
|
235
264
|
pseudo_signatures: true
|
|
236
265
|
recurse_xobjects: true
|
|
237
266
|
profile: retainer # or: hipaa
|
|
267
|
+
crop_signatures: false # enable to write PNG crops (requires pymupdf)
|
|
268
|
+
# crop_output_dir: ./signature_crops
|
|
269
|
+
crop_image_dpi: 200
|
|
238
270
|
~~~
|
|
239
271
|
|
|
240
272
|
YAML files can be customized or loaded at runtime (see CLI `--config`, if available, or import and pass patterns into engine).
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "sigdetect"
|
|
7
|
-
version = "0.1.1"
|
|
7
|
+
version = "0.2.0"
|
|
8
8
|
description = "Signature detection and role attribution for PDFs"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
authors = [{ name = "BT Asmamaw", email = "basmamaw@angeiongroup.com" }]
|
|
@@ -12,7 +12,6 @@ license = { text = "MIT" }
|
|
|
12
12
|
requires-python = ">=3.9"
|
|
13
13
|
dependencies = [
|
|
14
14
|
"pypdf>=4.0.0",
|
|
15
|
-
"pandas>=2.0",
|
|
16
15
|
"rich>=13.0",
|
|
17
16
|
"typer>=0.12",
|
|
18
17
|
"pydantic>=2.5",
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
"""Public helpers for programmatic use of the signature detection engine."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from contextlib import contextmanager
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Generator, Iterable, Iterator, Literal
|
|
8
|
+
|
|
9
|
+
from sigdetect.config import DetectConfiguration
|
|
10
|
+
from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
|
|
11
|
+
|
|
12
|
+
EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
|
|
13
|
+
ProfileName = Literal["hipaa", "retainer"]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def DetectPdf(
    pdfPath: str | Path,
    *,
    profileName: ProfileName = "hipaa",
    engineName: EngineName = "pypdf2",
    includePseudoSignatures: bool = True,
    recurseXObjects: bool = True,
    detector: Detector | None = None,
) -> dict[str, Any]:
    """Detect signature evidence and assign roles for a single PDF.

    Args:
        pdfPath: Location of the PDF to inspect.
        profileName: Role-attribution profile used when a detector is built.
        engineName: Detection engine used when a detector is built.
        includePseudoSignatures: Forwarded to :func:`get_detector`.
        recurseXObjects: Forwarded to :func:`get_detector`.
        detector: Pre-built detector to reuse. When given, the profile/engine
            options above are not consulted.

    Returns:
        The detection result converted to a plain dictionary.
    """

    resolvedPath = Path(pdfPath)

    # Compare against None explicitly (as DetectMany does) so that a supplied
    # detector is never discarded just because it happens to evaluate falsy.
    if detector is not None:
        activeDetector = detector
    else:
        activeDetector = get_detector(
            pdfRoot=resolvedPath.parent,
            profileName=profileName,
            engineName=engineName,
            includePseudoSignatures=includePseudoSignatures,
            recurseXObjects=recurseXObjects,
            outputDirectory=None,
        )

    result = activeDetector.Detect(resolvedPath)
    return _ToPlainDictionary(result)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def get_detector(
    *,
    pdfRoot: str | Path | None = None,
    profileName: ProfileName = "hipaa",
    engineName: EngineName = "pypdf2",
    includePseudoSignatures: bool = True,
    recurseXObjects: bool = True,
    outputDirectory: str | Path | None = None,
) -> Detector:
    """Build a reusable detector from the supplied options.

    ``pdfRoot`` falls back to the current working directory when omitted;
    ``outputDirectory`` stays ``None`` unless a value is provided.
    """

    rootPath = Path.cwd() if pdfRoot is None else Path(pdfRoot)
    outputPath = None if outputDirectory is None else Path(outputDirectory)

    settings = DetectConfiguration(
        PdfRoot=rootPath,
        OutputDirectory=outputPath,
        Engine=engineName,
        PseudoSignatures=includePseudoSignatures,
        RecurseXObjects=recurseXObjects,
        Profile=profileName,
    )
    return BuildDetector(settings)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _ToPlainDictionary(candidate: Any) -> dict[str, Any]:
    """Turn a result object into a plain dictionary.

    Strategies are tried in order: a ``to_dict()`` method, a ``model_dump()``
    method, a ``dict()`` method, ``dataclasses.asdict``, and finally a mapping
    whose values are passed through :func:`_ToPlainValue`.

    Raises:
        TypeError: If none of the strategies applies to ``candidate``.
    """

    # The first converter method present on the object wins.
    for methodName in ("to_dict", "model_dump", "dict"):
        if hasattr(candidate, methodName):
            return getattr(candidate, methodName)()

    try:
        import dataclasses

        if dataclasses.is_dataclass(candidate):
            return dataclasses.asdict(candidate)
    except Exception:
        # Best effort: a failed dataclass conversion falls through to the
        # remaining checks instead of propagating.
        pass

    if not isinstance(candidate, dict):
        raise TypeError(f"Unsupported result type: {type(candidate)!r}")
    return {key: _ToPlainValue(candidate[key]) for key in candidate}
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _ToPlainValue(value: Any) -> Any:
    """Recursively convert a nested value into plain Python containers.

    Objects exposing ``to_dict`` are converted through it; objects exposing
    ``model_dump`` or ``dict`` are delegated to :func:`_ToPlainDictionary`;
    dataclasses go through ``dataclasses.asdict``. Lists, tuples and dicts are
    rebuilt with their items converted. Anything else is returned unchanged.
    """

    if hasattr(value, "to_dict"):
        return value.to_dict()
    if any(hasattr(value, name) for name in ("model_dump", "dict")):
        return _ToPlainDictionary(value)

    try:
        import dataclasses

        if dataclasses.is_dataclass(value):
            return dataclasses.asdict(value)
    except Exception:
        # Best effort: fall through to the container checks below.
        pass

    if isinstance(value, (list, tuple)):
        converted = [_ToPlainValue(item) for item in value]
        # Keep the original sequence flavour: lists stay lists, tuples tuples.
        return converted if isinstance(value, list) else tuple(converted)
    if isinstance(value, dict):
        return {key: _ToPlainValue(inner) for key, inner in value.items()}
    return value
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def DetectMany(
    pdfPaths: Iterable[str | Path],
    *,
    detector: Detector | None = None,
    **kwargs: Any,
) -> Iterator[dict[str, Any]]:
    """Lazily yield one detection result per entry of ``pdfPaths``.

    With a ``detector`` supplied, every path is run through that instance and
    ``kwargs`` are not used; otherwise each path is handed to
    :func:`DetectPdf` together with ``kwargs``.
    """

    useSharedDetector = detector is not None
    for candidatePath in pdfPaths:
        if useSharedDetector:
            yield _DetectWithDetector(detector, candidatePath)
        else:
            yield DetectPdf(candidatePath, **kwargs)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def ScanDirectory(
    pdfRoot: str | Path,
    *,
    globPattern: str = "**/*.pdf",
    detector: Detector | None = None,
    **kwargs: Any,
) -> Iterator[dict[str, Any]]:
    """Yield detection output for each PDF found under ``pdfRoot``.

    The default pattern visits every file below the root; a pattern starting
    with ``**/`` is searched recursively, and any other pattern is globbed
    relative to the root. In all cases only regular files whose suffix is
    ``.pdf`` (compared case-insensitively) are passed to :func:`DetectPdf`.
    """

    base = Path(pdfRoot)

    if globPattern == "**/*.pdf":
        candidates = (entry for entry in base.rglob("*") if entry.is_file())
    elif globPattern.startswith("**/"):
        # rglob already implies the leading "**/", so strip one occurrence.
        candidates = base.rglob(globPattern.replace("**/", "", 1))
    else:
        candidates = base.glob(globPattern)

    for entry in candidates:
        if not entry.is_file() or entry.suffix.lower() != ".pdf":
            continue
        yield DetectPdf(entry, detector=detector, **kwargs)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def ToCsvRow(result: dict[str, Any]) -> dict[str, Any]:
    """Project a detection result onto the columns used for CSV export.

    Keys missing from ``result`` appear in the row with a value of ``None``;
    keys outside the curated set are dropped.
    """

    exportedKeys = (
        "file",
        "size_kb",
        "pages",
        "esign_found",
        "scanned_pdf",
        "mixed",
        "sig_count",
        "sig_pages",
        "roles",
        "hints",
    )
    return {key: result.get(key) for key in exportedKeys}
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def Version() -> str:
    """Return the installed ``sigdetect`` version string.

    Falls back to ``"0.0.0-dev"`` when the version cannot be resolved for any
    reason (for example, when the package metadata is unavailable).
    """

    try:
        from importlib import metadata

        installedVersion = metadata.version("sigdetect")
    except Exception:
        return "0.0.0-dev"
    return installedVersion
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _DetectWithDetector(detector: Detector, pdfPath: str | Path) -> dict[str, Any]:
    """Run ``detector`` on ``pdfPath`` and return the result as a plain dict."""

    rawResult = detector.Detect(Path(pdfPath))
    return _ToPlainDictionary(rawResult)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
@contextmanager
def detector_context(**kwargs: Any) -> Generator[Detector, None, None]:
    """Provide a detector built by :func:`get_detector` for a ``with`` block.

    All keyword arguments are forwarded to :func:`get_detector`. No cleanup is
    performed when the block exits.
    """

    # Nothing needs releasing on exit, so a bare yield is sufficient.
    yield get_detector(**kwargs)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def CropSignatureImages(
    pdfPath: str | Path,
    fileResult: FileResult | dict[str, Any],
    *,
    outputDirectory: str | Path,
    dpi: int = 200,
) -> list[Path]:
    """Write PNG crops of the detected signature regions.

    ``fileResult`` may be a :class:`FileResult` or the ``dict`` produced by
    :func:`DetectPdf`. When a dict is passed, it is emptied and refilled in
    place from the coerced result after cropping. Requires the optional
    ``pymupdf`` dependency.

    Returns:
        The value returned by ``sigdetect.cropping.crop_signatures``.
    """

    # Imported lazily so the cropping module is only loaded when needed.
    from sigdetect.cropping import crop_signatures

    resultObject, sourceDict = _CoerceFileResult(fileResult)
    cropPaths = crop_signatures(
        pdf_path=Path(pdfPath),
        file_result=resultObject,
        output_dir=Path(outputDirectory),
        dpi=dpi,
    )

    if sourceDict is None:
        return cropPaths

    # Refresh the caller's dict from the coerced FileResult.
    sourceDict.clear()
    sourceDict.update(resultObject.to_dict())
    return cropPaths
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _CoerceFileResult(
    candidate: FileResult | dict[str, Any]
) -> tuple[FileResult, dict[str, Any] | None]:
    """Normalise ``candidate`` into a :class:`FileResult`.

    Returns a ``(file_result, original_dict)`` pair. ``original_dict`` is the
    input dictionary when one was supplied and ``None`` when ``candidate`` was
    already a :class:`FileResult`.

    Raises:
        TypeError: If ``candidate`` is neither a ``FileResult`` nor a ``dict``.
    """

    if isinstance(candidate, FileResult):
        return candidate, None
    if not isinstance(candidate, dict):
        raise TypeError("fileResult must be FileResult or dict")

    def buildSignature(entry: dict[str, Any]) -> Signature:
        """Rebuild one Signature from its dictionary form, filling defaults."""
        box = entry.get("bounding_box")
        return Signature(
            Page=entry.get("page"),
            FieldName=str(entry.get("field_name") or ""),
            Role=str(entry.get("role") or "unknown"),
            Score=int(entry.get("score") or 0),
            Scores=dict(entry.get("scores") or {}),
            Evidence=list(entry.get("evidence") or []),
            Hint=str(entry.get("hint") or ""),
            RenderType=str(entry.get("render_type") or "unknown"),
            BoundingBox=tuple(box) if box else None,
            CropPath=entry.get("crop_path"),
        )

    rebuiltSignatures = [
        buildSignature(entry) for entry in candidate.get("signatures") or []
    ]

    rebuiltResult = FileResult(
        File=str(candidate.get("file") or ""),
        SizeKilobytes=candidate.get("size_kb"),
        PageCount=int(candidate.get("pages") or 0),
        ElectronicSignatureFound=bool(candidate.get("esign_found")),
        ScannedPdf=candidate.get("scanned_pdf"),
        MixedContent=candidate.get("mixed"),
        # A missing or zero count falls back to the number of rebuilt entries.
        SignatureCount=int(candidate.get("sig_count") or len(rebuiltSignatures)),
        SignaturePages=str(candidate.get("sig_pages") or ""),
        Roles=str(candidate.get("roles") or "unknown"),
        Hints=str(candidate.get("hints") or ""),
        Signatures=rebuiltSignatures,
    )
    return rebuiltResult, candidate
|