sigdetect 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sigdetect-0.2.0 → sigdetect-0.3.0}/PKG-INFO +9 -2
- {sigdetect-0.2.0 → sigdetect-0.3.0}/README.md +8 -1
- {sigdetect-0.2.0 → sigdetect-0.3.0}/pyproject.toml +1 -1
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect/api.py +28 -2
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect/cropping.py +59 -5
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect.egg-info/PKG-INFO +9 -2
- {sigdetect-0.2.0 → sigdetect-0.3.0}/tests/test_cropping.py +42 -1
- {sigdetect-0.2.0 → sigdetect-0.3.0}/setup.cfg +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect/__init__.py +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect/cli.py +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect/config.py +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect/data/role_rules.retainer.yml +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect/data/role_rules.yml +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect/data/vendor_patterns.yml +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect/detector/__init__.py +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect/detector/base.py +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect/detector/base_detector.py +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect/detector/file_result_model.py +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect/detector/pymupdf_engine.py +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect/detector/pypdf2_engine.py +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect/detector/signature_model.py +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect/eda.py +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect/logging_setup.py +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect/utils.py +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect.egg-info/SOURCES.txt +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect.egg-info/dependency_links.txt +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect.egg-info/entry_points.txt +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect.egg-info/requires.txt +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/src/sigdetect.egg-info/top_level.txt +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/tests/test_api.py +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/tests/test_pymupdf_engine.py +0 -0
- {sigdetect-0.2.0 → sigdetect-0.3.0}/tests/test_widget_role_patient_smoke.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sigdetect
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Signature detection and role attribution for PDFs
|
|
5
5
|
Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
|
|
6
6
|
License: MIT
|
|
@@ -192,14 +192,21 @@ for res in ScanDirectory(
|
|
|
192
192
|
# 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
|
|
193
193
|
detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
|
|
194
194
|
file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
195
|
-
CropSignatureImages(
|
|
195
|
+
crops = CropSignatureImages(
|
|
196
196
|
"/path/to/pdfs/example.pdf",
|
|
197
197
|
file_result,
|
|
198
198
|
outputDirectory="./signature_crops",
|
|
199
199
|
dpi=200,
|
|
200
|
+
returnBytes=True, # also returns in-memory PNG bytes for each crop
|
|
200
201
|
)
|
|
202
|
+
|
|
203
|
+
first_crop = crops[0]
|
|
204
|
+
print(first_crop.path, len(first_crop.image_bytes))
|
|
201
205
|
~~~
|
|
202
206
|
|
|
207
|
+
When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
|
|
208
|
+
PNG bytes, and the originating signature metadata.
|
|
209
|
+
|
|
203
210
|
|
|
204
211
|
## Result schema
|
|
205
212
|
|
|
@@ -176,14 +176,21 @@ for res in ScanDirectory(
|
|
|
176
176
|
# 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
|
|
177
177
|
detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
|
|
178
178
|
file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
179
|
-
CropSignatureImages(
|
|
179
|
+
crops = CropSignatureImages(
|
|
180
180
|
"/path/to/pdfs/example.pdf",
|
|
181
181
|
file_result,
|
|
182
182
|
outputDirectory="./signature_crops",
|
|
183
183
|
dpi=200,
|
|
184
|
+
returnBytes=True, # also returns in-memory PNG bytes for each crop
|
|
184
185
|
)
|
|
186
|
+
|
|
187
|
+
first_crop = crops[0]
|
|
188
|
+
print(first_crop.path, len(first_crop.image_bytes))
|
|
185
189
|
~~~
|
|
186
190
|
|
|
191
|
+
When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
|
|
192
|
+
PNG bytes, and the originating signature metadata.
|
|
193
|
+
|
|
187
194
|
|
|
188
195
|
## Result schema
|
|
189
196
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "sigdetect"
|
|
7
|
-
version = "0.2.0"
|
|
7
|
+
version = "0.3.0"
|
|
8
8
|
description = "Signature detection and role attribution for PDFs"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
authors = [{ name = "BT Asmamaw", email = "basmamaw@angeiongroup.com" }]
|
|
@@ -4,9 +4,10 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from contextlib import contextmanager
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
from typing import Any, Generator, Iterable, Iterator, Literal
|
|
7
|
+
from typing import Any, Generator, Iterable, Iterator, Literal, overload
|
|
8
8
|
|
|
9
9
|
from sigdetect.config import DetectConfiguration
|
|
10
|
+
from sigdetect.cropping import SignatureCrop
|
|
10
11
|
from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
|
|
11
12
|
|
|
12
13
|
EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
|
|
@@ -191,17 +192,41 @@ def detector_context(**kwargs: Any) -> Generator[Detector, None, None]:
|
|
|
191
192
|
pass
|
|
192
193
|
|
|
193
194
|
|
|
195
|
+
@overload
|
|
194
196
|
def CropSignatureImages(
|
|
195
197
|
pdfPath: str | Path,
|
|
196
198
|
fileResult: FileResult | dict[str, Any],
|
|
197
199
|
*,
|
|
198
200
|
outputDirectory: str | Path,
|
|
199
201
|
dpi: int = 200,
|
|
200
|
-
|
|
202
|
+
returnBytes: Literal[False] = False,
|
|
203
|
+
) -> list[Path]: ...
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
@overload
|
|
207
|
+
def CropSignatureImages(
|
|
208
|
+
pdfPath: str | Path,
|
|
209
|
+
fileResult: FileResult | dict[str, Any],
|
|
210
|
+
*,
|
|
211
|
+
outputDirectory: str | Path,
|
|
212
|
+
dpi: int,
|
|
213
|
+
returnBytes: Literal[True],
|
|
214
|
+
) -> list[SignatureCrop]: ...
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def CropSignatureImages(
|
|
218
|
+
pdfPath: str | Path,
|
|
219
|
+
fileResult: FileResult | dict[str, Any],
|
|
220
|
+
*,
|
|
221
|
+
outputDirectory: str | Path,
|
|
222
|
+
dpi: int = 200,
|
|
223
|
+
returnBytes: bool = False,
|
|
224
|
+
) -> list[Path] | list[SignatureCrop]:
|
|
201
225
|
"""Crop detected signature regions to PNG files.
|
|
202
226
|
|
|
203
227
|
Accepts either a :class:`FileResult` instance or the ``dict`` returned by
|
|
204
228
|
:func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.
|
|
229
|
+
Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop.
|
|
205
230
|
"""
|
|
206
231
|
|
|
207
232
|
from sigdetect.cropping import crop_signatures
|
|
@@ -212,6 +237,7 @@ def CropSignatureImages(
|
|
|
212
237
|
file_result=file_result_obj,
|
|
213
238
|
output_dir=Path(outputDirectory),
|
|
214
239
|
dpi=dpi,
|
|
240
|
+
return_bytes=returnBytes,
|
|
215
241
|
)
|
|
216
242
|
if original_dict is not None:
|
|
217
243
|
original_dict.clear()
|
|
@@ -4,7 +4,9 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
6
|
import re
|
|
7
|
+
from dataclasses import dataclass
|
|
7
8
|
from pathlib import Path
|
|
9
|
+
from typing import Literal, overload
|
|
8
10
|
|
|
9
11
|
from .detector.file_result_model import FileResult
|
|
10
12
|
from .detector.signature_model import Signature
|
|
@@ -19,6 +21,28 @@ class SignatureCroppingUnavailable(RuntimeError):
|
|
|
19
21
|
"""Raised when PNG cropping cannot be performed (e.g., PyMuPDF missing)."""
|
|
20
22
|
|
|
21
23
|
|
|
24
|
+
@dataclass(slots=True)
|
|
25
|
+
class SignatureCrop:
|
|
26
|
+
"""PNG crop metadata and in-memory content."""
|
|
27
|
+
|
|
28
|
+
path: Path
|
|
29
|
+
image_bytes: bytes
|
|
30
|
+
signature: Signature
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@overload
|
|
34
|
+
def crop_signatures(
|
|
35
|
+
pdf_path: Path,
|
|
36
|
+
file_result: FileResult,
|
|
37
|
+
*,
|
|
38
|
+
output_dir: Path,
|
|
39
|
+
dpi: int = 200,
|
|
40
|
+
logger: logging.Logger | None = None,
|
|
41
|
+
return_bytes: Literal[False] = False,
|
|
42
|
+
) -> list[Path]: ...
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@overload
|
|
22
46
|
def crop_signatures(
|
|
23
47
|
pdf_path: Path,
|
|
24
48
|
file_result: FileResult,
|
|
@@ -26,8 +50,24 @@ def crop_signatures(
|
|
|
26
50
|
output_dir: Path,
|
|
27
51
|
dpi: int = 200,
|
|
28
52
|
logger: logging.Logger | None = None,
|
|
29
|
-
|
|
30
|
-
|
|
53
|
+
return_bytes: Literal[True] = True,
|
|
54
|
+
) -> list[SignatureCrop]: ...
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def crop_signatures(
|
|
58
|
+
pdf_path: Path,
|
|
59
|
+
file_result: FileResult,
|
|
60
|
+
*,
|
|
61
|
+
output_dir: Path,
|
|
62
|
+
dpi: int = 200,
|
|
63
|
+
logger: logging.Logger | None = None,
|
|
64
|
+
return_bytes: bool = False,
|
|
65
|
+
) -> list[Path] | list[SignatureCrop]:
|
|
66
|
+
"""Render each signature bounding box to a PNG image using PyMuPDF.
|
|
67
|
+
|
|
68
|
+
Set ``return_bytes=True`` to collect in-memory PNG bytes for each crop while also writing
|
|
69
|
+
the files to ``output_dir``.
|
|
70
|
+
"""
|
|
31
71
|
|
|
32
72
|
if fitz is None: # pragma: no cover - exercised when dependency absent
|
|
33
73
|
raise SignatureCroppingUnavailable(
|
|
@@ -37,7 +77,8 @@ def crop_signatures(
|
|
|
37
77
|
pdf_path = Path(pdf_path)
|
|
38
78
|
output_dir = Path(output_dir)
|
|
39
79
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
40
|
-
|
|
80
|
+
generated_paths: list[Path] = []
|
|
81
|
+
generated_crops: list[SignatureCrop] = []
|
|
41
82
|
|
|
42
83
|
with fitz.open(pdf_path) as document: # type: ignore[attr-defined]
|
|
43
84
|
per_document_dir = output_dir / pdf_path.stem
|
|
@@ -70,8 +111,11 @@ def crop_signatures(
|
|
|
70
111
|
destination = per_document_dir / filename
|
|
71
112
|
|
|
72
113
|
try:
|
|
114
|
+
image_bytes: bytes | None = None
|
|
73
115
|
pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
|
|
74
116
|
pixmap.save(destination)
|
|
117
|
+
if return_bytes:
|
|
118
|
+
image_bytes = pixmap.tobytes("png")
|
|
75
119
|
except Exception as exc: # pragma: no cover - defensive
|
|
76
120
|
if logger:
|
|
77
121
|
logger.warning(
|
|
@@ -86,9 +130,19 @@ def crop_signatures(
|
|
|
86
130
|
continue
|
|
87
131
|
|
|
88
132
|
signature.CropPath = str(destination)
|
|
89
|
-
|
|
133
|
+
generated_paths.append(destination)
|
|
134
|
+
if return_bytes:
|
|
135
|
+
if image_bytes is None: # pragma: no cover - defensive
|
|
136
|
+
continue
|
|
137
|
+
generated_crops.append(
|
|
138
|
+
SignatureCrop(
|
|
139
|
+
path=destination,
|
|
140
|
+
image_bytes=image_bytes,
|
|
141
|
+
signature=signature,
|
|
142
|
+
)
|
|
143
|
+
)
|
|
90
144
|
|
|
91
|
-
return
|
|
145
|
+
return generated_crops if return_bytes else generated_paths
|
|
92
146
|
|
|
93
147
|
|
|
94
148
|
def _to_clip_rect(page, bbox: tuple[float, float, float, float]):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sigdetect
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Signature detection and role attribution for PDFs
|
|
5
5
|
Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
|
|
6
6
|
License: MIT
|
|
@@ -192,14 +192,21 @@ for res in ScanDirectory(
|
|
|
192
192
|
# 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
|
|
193
193
|
detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
|
|
194
194
|
file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
195
|
-
CropSignatureImages(
|
|
195
|
+
crops = CropSignatureImages(
|
|
196
196
|
"/path/to/pdfs/example.pdf",
|
|
197
197
|
file_result,
|
|
198
198
|
outputDirectory="./signature_crops",
|
|
199
199
|
dpi=200,
|
|
200
|
+
returnBytes=True, # also returns in-memory PNG bytes for each crop
|
|
200
201
|
)
|
|
202
|
+
|
|
203
|
+
first_crop = crops[0]
|
|
204
|
+
print(first_crop.path, len(first_crop.image_bytes))
|
|
201
205
|
~~~
|
|
202
206
|
|
|
207
|
+
When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
|
|
208
|
+
PNG bytes, and the originating signature metadata.
|
|
209
|
+
|
|
203
210
|
|
|
204
211
|
## Result schema
|
|
205
212
|
|
|
@@ -6,7 +6,7 @@ from pypdf.generic import ArrayObject, DictionaryObject, NameObject, NumberObjec
|
|
|
6
6
|
|
|
7
7
|
from sigdetect.api import CropSignatureImages, DetectPdf
|
|
8
8
|
from sigdetect.config import DetectConfiguration
|
|
9
|
-
from sigdetect.cropping import crop_signatures
|
|
9
|
+
from sigdetect.cropping import SignatureCrop, crop_signatures
|
|
10
10
|
from sigdetect.detector.pypdf2_engine import PyPDF2Detector
|
|
11
11
|
|
|
12
12
|
pytest.importorskip("fitz")
|
|
@@ -70,3 +70,44 @@ def test_crop_signature_images_accepts_dict(tmp_path: Path) -> None:
|
|
|
70
70
|
|
|
71
71
|
assert paths
|
|
72
72
|
assert result_dict["signatures"][0]["crop_path"] is not None
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def test_crop_signature_images_returns_bytes(tmp_path: Path) -> None:
|
|
76
|
+
pdf_path = tmp_path / "doc.pdf"
|
|
77
|
+
_pdf_with_signature(pdf_path)
|
|
78
|
+
|
|
79
|
+
result_dict = DetectPdf(pdf_path, engineName="pymupdf")
|
|
80
|
+
out_dir = tmp_path / "dict_byte_crops"
|
|
81
|
+
crops = CropSignatureImages(
|
|
82
|
+
pdf_path,
|
|
83
|
+
result_dict,
|
|
84
|
+
outputDirectory=out_dir,
|
|
85
|
+
returnBytes=True,
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
assert crops
|
|
89
|
+
assert isinstance(crops[0], SignatureCrop)
|
|
90
|
+
assert crops[0].image_bytes
|
|
91
|
+
assert result_dict["signatures"][0]["crop_path"] is not None
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def test_crop_signatures_returns_bytes(tmp_path: Path) -> None:
|
|
95
|
+
pdf_path = tmp_path / "doc.pdf"
|
|
96
|
+
_pdf_with_signature(pdf_path)
|
|
97
|
+
|
|
98
|
+
cfg = DetectConfiguration(pdf_root=tmp_path, out_dir=tmp_path, engine="pypdf2")
|
|
99
|
+
result = PyPDF2Detector(cfg).Detect(pdf_path)
|
|
100
|
+
|
|
101
|
+
out_dir = tmp_path / "byte_crops"
|
|
102
|
+
crops = crop_signatures(
|
|
103
|
+
pdf_path,
|
|
104
|
+
result,
|
|
105
|
+
output_dir=out_dir,
|
|
106
|
+
dpi=120,
|
|
107
|
+
return_bytes=True,
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
assert crops
|
|
111
|
+
assert isinstance(crops[0], SignatureCrop)
|
|
112
|
+
assert crops[0].path.exists()
|
|
113
|
+
assert crops[0].image_bytes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|