sigdetect 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sigdetect/api.py +33 -2
- sigdetect/cropping.py +72 -9
- {sigdetect-0.2.0.dist-info → sigdetect-0.3.1.dist-info}/METADATA +11 -2
- {sigdetect-0.2.0.dist-info → sigdetect-0.3.1.dist-info}/RECORD +7 -7
- {sigdetect-0.2.0.dist-info → sigdetect-0.3.1.dist-info}/WHEEL +0 -0
- {sigdetect-0.2.0.dist-info → sigdetect-0.3.1.dist-info}/entry_points.txt +0 -0
- {sigdetect-0.2.0.dist-info → sigdetect-0.3.1.dist-info}/top_level.txt +0 -0
sigdetect/api.py
CHANGED
|
@@ -4,9 +4,10 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from contextlib import contextmanager
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
from typing import Any, Generator, Iterable, Iterator, Literal
|
|
7
|
+
from typing import Any, Generator, Iterable, Iterator, Literal, overload
|
|
8
8
|
|
|
9
9
|
from sigdetect.config import DetectConfiguration
|
|
10
|
+
from sigdetect.cropping import SignatureCrop
|
|
10
11
|
from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
|
|
11
12
|
|
|
12
13
|
EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
|
|
@@ -191,17 +192,45 @@ def detector_context(**kwargs: Any) -> Generator[Detector, None, None]:
|
|
|
191
192
|
pass
|
|
192
193
|
|
|
193
194
|
|
|
195
|
+
@overload
|
|
194
196
|
def CropSignatureImages(
|
|
195
197
|
pdfPath: str | Path,
|
|
196
198
|
fileResult: FileResult | dict[str, Any],
|
|
197
199
|
*,
|
|
198
200
|
outputDirectory: str | Path,
|
|
199
201
|
dpi: int = 200,
|
|
200
|
-
|
|
202
|
+
returnBytes: Literal[False] = False,
|
|
203
|
+
saveToDisk: bool = True,
|
|
204
|
+
) -> list[Path]: ...
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
@overload
|
|
208
|
+
def CropSignatureImages(
|
|
209
|
+
pdfPath: str | Path,
|
|
210
|
+
fileResult: FileResult | dict[str, Any],
|
|
211
|
+
*,
|
|
212
|
+
outputDirectory: str | Path,
|
|
213
|
+
dpi: int,
|
|
214
|
+
returnBytes: Literal[True],
|
|
215
|
+
saveToDisk: bool,
|
|
216
|
+
) -> list[SignatureCrop]: ...
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def CropSignatureImages(
|
|
220
|
+
pdfPath: str | Path,
|
|
221
|
+
fileResult: FileResult | dict[str, Any],
|
|
222
|
+
*,
|
|
223
|
+
outputDirectory: str | Path,
|
|
224
|
+
dpi: int = 200,
|
|
225
|
+
returnBytes: bool = False,
|
|
226
|
+
saveToDisk: bool = True,
|
|
227
|
+
) -> list[Path] | list[SignatureCrop]:
|
|
201
228
|
"""Crop detected signature regions to PNG files.
|
|
202
229
|
|
|
203
230
|
Accepts either a :class:`FileResult` instance or the ``dict`` returned by
|
|
204
231
|
:func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.
|
|
232
|
+
Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop. Set
|
|
233
|
+
``saveToDisk=False`` to skip writing PNG files while still returning in-memory data.
|
|
205
234
|
"""
|
|
206
235
|
|
|
207
236
|
from sigdetect.cropping import crop_signatures
|
|
@@ -212,6 +241,8 @@ def CropSignatureImages(
|
|
|
212
241
|
file_result=file_result_obj,
|
|
213
242
|
output_dir=Path(outputDirectory),
|
|
214
243
|
dpi=dpi,
|
|
244
|
+
return_bytes=returnBytes,
|
|
245
|
+
save_files=saveToDisk,
|
|
215
246
|
)
|
|
216
247
|
if original_dict is not None:
|
|
217
248
|
original_dict.clear()
|
sigdetect/cropping.py
CHANGED
|
@@ -4,7 +4,9 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
6
|
import re
|
|
7
|
+
from dataclasses import dataclass
|
|
7
8
|
from pathlib import Path
|
|
9
|
+
from typing import Literal, overload
|
|
8
10
|
|
|
9
11
|
from .detector.file_result_model import FileResult
|
|
10
12
|
from .detector.signature_model import Signature
|
|
@@ -19,6 +21,29 @@ class SignatureCroppingUnavailable(RuntimeError):
|
|
|
19
21
|
"""Raised when PNG cropping cannot be performed (e.g., PyMuPDF missing)."""
|
|
20
22
|
|
|
21
23
|
|
|
24
|
+
@dataclass(slots=True)
|
|
25
|
+
class SignatureCrop:
|
|
26
|
+
"""PNG crop metadata and in-memory content."""
|
|
27
|
+
|
|
28
|
+
path: Path
|
|
29
|
+
image_bytes: bytes
|
|
30
|
+
signature: Signature
|
|
31
|
+
saved_to_disk: bool = True
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@overload
|
|
35
|
+
def crop_signatures(
|
|
36
|
+
pdf_path: Path,
|
|
37
|
+
file_result: FileResult,
|
|
38
|
+
*,
|
|
39
|
+
output_dir: Path,
|
|
40
|
+
dpi: int = 200,
|
|
41
|
+
logger: logging.Logger | None = None,
|
|
42
|
+
return_bytes: Literal[False] = False,
|
|
43
|
+
) -> list[Path]: ...
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@overload
|
|
22
47
|
def crop_signatures(
|
|
23
48
|
pdf_path: Path,
|
|
24
49
|
file_result: FileResult,
|
|
@@ -26,22 +51,44 @@ def crop_signatures(
|
|
|
26
51
|
output_dir: Path,
|
|
27
52
|
dpi: int = 200,
|
|
28
53
|
logger: logging.Logger | None = None,
|
|
29
|
-
|
|
30
|
-
|
|
54
|
+
return_bytes: Literal[True] = True,
|
|
55
|
+
) -> list[SignatureCrop]: ...
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def crop_signatures(
|
|
59
|
+
pdf_path: Path,
|
|
60
|
+
file_result: FileResult,
|
|
61
|
+
*,
|
|
62
|
+
output_dir: Path,
|
|
63
|
+
dpi: int = 200,
|
|
64
|
+
logger: logging.Logger | None = None,
|
|
65
|
+
return_bytes: bool = False,
|
|
66
|
+
save_files: bool = True,
|
|
67
|
+
) -> list[Path] | list[SignatureCrop]:
|
|
68
|
+
"""Render each signature bounding box to a PNG image using PyMuPDF.
|
|
69
|
+
|
|
70
|
+
Set ``return_bytes=True`` to collect in-memory PNG bytes for each crop while also writing
|
|
71
|
+
the files to ``output_dir``. Set ``save_files=False`` to skip writing PNGs to disk.
|
|
72
|
+
"""
|
|
31
73
|
|
|
32
74
|
if fitz is None: # pragma: no cover - exercised when dependency absent
|
|
33
75
|
raise SignatureCroppingUnavailable(
|
|
34
76
|
"PyMuPDF is required for PNG crops. Install 'pymupdf' or 'sigdetect[pymupdf]'."
|
|
35
77
|
)
|
|
78
|
+
if not save_files and not return_bytes:
|
|
79
|
+
raise ValueError("At least one of save_files or return_bytes must be True")
|
|
36
80
|
|
|
37
81
|
pdf_path = Path(pdf_path)
|
|
38
82
|
output_dir = Path(output_dir)
|
|
39
|
-
|
|
40
|
-
|
|
83
|
+
if save_files:
|
|
84
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
85
|
+
generated_paths: list[Path] = []
|
|
86
|
+
generated_crops: list[SignatureCrop] = []
|
|
41
87
|
|
|
42
88
|
with fitz.open(pdf_path) as document: # type: ignore[attr-defined]
|
|
43
89
|
per_document_dir = output_dir / pdf_path.stem
|
|
44
|
-
|
|
90
|
+
if save_files:
|
|
91
|
+
per_document_dir.mkdir(parents=True, exist_ok=True)
|
|
45
92
|
scale = dpi / 72.0
|
|
46
93
|
matrix = fitz.Matrix(scale, scale)
|
|
47
94
|
|
|
@@ -70,8 +117,12 @@ def crop_signatures(
|
|
|
70
117
|
destination = per_document_dir / filename
|
|
71
118
|
|
|
72
119
|
try:
|
|
120
|
+
image_bytes: bytes | None = None
|
|
73
121
|
pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
|
|
74
|
-
|
|
122
|
+
if save_files:
|
|
123
|
+
pixmap.save(destination)
|
|
124
|
+
if return_bytes:
|
|
125
|
+
image_bytes = pixmap.tobytes("png")
|
|
75
126
|
except Exception as exc: # pragma: no cover - defensive
|
|
76
127
|
if logger:
|
|
77
128
|
logger.warning(
|
|
@@ -85,10 +136,22 @@ def crop_signatures(
|
|
|
85
136
|
)
|
|
86
137
|
continue
|
|
87
138
|
|
|
88
|
-
|
|
89
|
-
|
|
139
|
+
if save_files:
|
|
140
|
+
signature.CropPath = str(destination)
|
|
141
|
+
generated_paths.append(destination)
|
|
142
|
+
if return_bytes:
|
|
143
|
+
if image_bytes is None: # pragma: no cover - defensive
|
|
144
|
+
continue
|
|
145
|
+
generated_crops.append(
|
|
146
|
+
SignatureCrop(
|
|
147
|
+
path=destination,
|
|
148
|
+
image_bytes=image_bytes,
|
|
149
|
+
signature=signature,
|
|
150
|
+
saved_to_disk=save_files,
|
|
151
|
+
)
|
|
152
|
+
)
|
|
90
153
|
|
|
91
|
-
return
|
|
154
|
+
return generated_crops if return_bytes else generated_paths
|
|
92
155
|
|
|
93
156
|
|
|
94
157
|
def _to_clip_rect(page, bbox: tuple[float, float, float, float]):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sigdetect
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: Signature detection and role attribution for PDFs
|
|
5
5
|
Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
|
|
6
6
|
License: MIT
|
|
@@ -192,14 +192,23 @@ for res in ScanDirectory(
|
|
|
192
192
|
# 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
|
|
193
193
|
detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
|
|
194
194
|
file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
195
|
-
CropSignatureImages(
|
|
195
|
+
crops = CropSignatureImages(
|
|
196
196
|
"/path/to/pdfs/example.pdf",
|
|
197
197
|
file_result,
|
|
198
198
|
outputDirectory="./signature_crops",
|
|
199
199
|
dpi=200,
|
|
200
|
+
returnBytes=True, # also returns in-memory PNG bytes for each crop
|
|
201
|
+
# saveToDisk=False, # optional: skip writing PNGs to disk
|
|
200
202
|
)
|
|
203
|
+
|
|
204
|
+
first_crop = crops[0]
|
|
205
|
+
print(first_crop.path, len(first_crop.image_bytes))
|
|
201
206
|
~~~
|
|
202
207
|
|
|
208
|
+
When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
|
|
209
|
+
PNG bytes, and the originating signature metadata.
|
|
210
|
+
Pass ``saveToDisk=False`` if you only want in-memory PNG bytes (no files on disk or ``crop_path`` updates).
|
|
211
|
+
|
|
203
212
|
|
|
204
213
|
## Result schema
|
|
205
214
|
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
sigdetect/__init__.py,sha256=LhY78mDZ1ClYVNTxW_qtE-vqJoN9N7N5ZcNRDUI_3ss,575
|
|
2
|
-
sigdetect/api.py,sha256=
|
|
2
|
+
sigdetect/api.py,sha256=6_CMSxcag9coHHzrpuRSVimHWSNtqQiWY9hdlqQ2IKY,9396
|
|
3
3
|
sigdetect/cli.py,sha256=NctAnaB-TQrUAT9m-v8kj2_KTNs88kbFOCiX32tHZm8,7920
|
|
4
4
|
sigdetect/config.py,sha256=S0NVKuJYiHJCocL-VNFdGJpasFcjTecavC4EthyS1DQ,5951
|
|
5
|
-
sigdetect/cropping.py,sha256=
|
|
5
|
+
sigdetect/cropping.py,sha256=dmJF4Q1tkmkfm0NaiwHddNOP8Sj9S4Lj_d5EBjodEkk,6015
|
|
6
6
|
sigdetect/eda.py,sha256=S92G1Gjmepri__D0n_V6foq0lQgH-RXI9anW8A58jfw,4681
|
|
7
7
|
sigdetect/logging_setup.py,sha256=LMF8ao_a-JwH0S522T6aYTFX3e8Ajjv_5ODS2YiBcHA,6404
|
|
8
8
|
sigdetect/utils.py,sha256=T9rubLf5T9JmjOHYMOba1j34fhOJaWocAXccnGTxRUE,5198
|
|
@@ -16,8 +16,8 @@ sigdetect/detector/file_result_model.py,sha256=j2gTc9Sw3fJOHlexYsR_m5DiwHA8DzIzA
|
|
|
16
16
|
sigdetect/detector/pymupdf_engine.py,sha256=iyp7JuPlUnydwohH5zbNg4MwH44mBmxbBWOS3ZmArBo,17339
|
|
17
17
|
sigdetect/detector/pypdf2_engine.py,sha256=INWQH06kMLvto2VS-EdLC-EtMC6AG7JmdVYmNgx6_RU,47313
|
|
18
18
|
sigdetect/detector/signature_model.py,sha256=mztb9V5wgv2oohQ5Cxzcv8_Bo6TyWAVIXteaeQ2rywQ,1076
|
|
19
|
-
sigdetect-0.
|
|
20
|
-
sigdetect-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
21
|
-
sigdetect-0.2.0.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
|
|
22
|
-
sigdetect-0.2.0.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
|
|
23
|
-
sigdetect-0.2.0.dist-info/RECORD,,
|
|
19
|
+
sigdetect-0.3.1.dist-info/METADATA,sha256=whXGE4-9spAjlMcZz_owdsIiB4EobXL9_UOuAJeDVfA,12342
|
|
20
|
+
sigdetect-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
21
|
+
sigdetect-0.3.1.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
|
|
22
|
+
sigdetect-0.3.1.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
|
|
23
|
+
sigdetect-0.3.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|