sigdetect 0.2.0__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {sigdetect-0.2.0 → sigdetect-0.3.1}/PKG-INFO +11 -2
  2. {sigdetect-0.2.0 → sigdetect-0.3.1}/README.md +10 -1
  3. {sigdetect-0.2.0 → sigdetect-0.3.1}/pyproject.toml +1 -1
  4. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect/api.py +33 -2
  5. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect/cropping.py +72 -9
  6. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect.egg-info/PKG-INFO +11 -2
  7. {sigdetect-0.2.0 → sigdetect-0.3.1}/tests/test_cropping.py +83 -1
  8. {sigdetect-0.2.0 → sigdetect-0.3.1}/setup.cfg +0 -0
  9. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect/__init__.py +0 -0
  10. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect/cli.py +0 -0
  11. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect/config.py +0 -0
  12. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect/data/role_rules.retainer.yml +0 -0
  13. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect/data/role_rules.yml +0 -0
  14. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect/data/vendor_patterns.yml +0 -0
  15. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect/detector/__init__.py +0 -0
  16. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect/detector/base.py +0 -0
  17. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect/detector/base_detector.py +0 -0
  18. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect/detector/file_result_model.py +0 -0
  19. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect/detector/pymupdf_engine.py +0 -0
  20. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect/detector/pypdf2_engine.py +0 -0
  21. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect/detector/signature_model.py +0 -0
  22. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect/eda.py +0 -0
  23. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect/logging_setup.py +0 -0
  24. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect/utils.py +0 -0
  25. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect.egg-info/SOURCES.txt +0 -0
  26. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect.egg-info/dependency_links.txt +0 -0
  27. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect.egg-info/entry_points.txt +0 -0
  28. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect.egg-info/requires.txt +0 -0
  29. {sigdetect-0.2.0 → sigdetect-0.3.1}/src/sigdetect.egg-info/top_level.txt +0 -0
  30. {sigdetect-0.2.0 → sigdetect-0.3.1}/tests/test_api.py +0 -0
  31. {sigdetect-0.2.0 → sigdetect-0.3.1}/tests/test_pymupdf_engine.py +0 -0
  32. {sigdetect-0.2.0 → sigdetect-0.3.1}/tests/test_widget_role_patient_smoke.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sigdetect
3
- Version: 0.2.0
3
+ Version: 0.3.1
4
4
  Summary: Signature detection and role attribution for PDFs
5
5
  Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
6
6
  License: MIT
@@ -192,14 +192,23 @@ for res in ScanDirectory(
192
192
  # 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
193
193
  detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
194
194
  file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
195
- CropSignatureImages(
195
+ crops = CropSignatureImages(
196
196
  "/path/to/pdfs/example.pdf",
197
197
  file_result,
198
198
  outputDirectory="./signature_crops",
199
199
  dpi=200,
200
+ returnBytes=True, # also returns in-memory PNG bytes for each crop
201
+ # saveToDisk=False, # optional: skip writing PNGs to disk
200
202
  )
203
+
204
+ first_crop = crops[0]
205
+ print(first_crop.path, len(first_crop.image_bytes))
201
206
  ~~~
202
207
 
208
+ When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
209
+ PNG bytes, and the originating signature metadata.
210
+ Pass ``saveToDisk=False`` if you only want in-memory PNG bytes (no files on disk or ``crop_path`` updates).
211
+
203
212
 
204
213
  ## Result schema
205
214
 
@@ -176,14 +176,23 @@ for res in ScanDirectory(
176
176
  # 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
177
177
  detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
178
178
  file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
179
- CropSignatureImages(
179
+ crops = CropSignatureImages(
180
180
  "/path/to/pdfs/example.pdf",
181
181
  file_result,
182
182
  outputDirectory="./signature_crops",
183
183
  dpi=200,
184
+ returnBytes=True, # also returns in-memory PNG bytes for each crop
185
+ # saveToDisk=False, # optional: skip writing PNGs to disk
184
186
  )
187
+
188
+ first_crop = crops[0]
189
+ print(first_crop.path, len(first_crop.image_bytes))
185
190
  ~~~
186
191
 
192
+ When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
193
+ PNG bytes, and the originating signature metadata.
194
+ Pass ``saveToDisk=False`` if you only want in-memory PNG bytes (no files on disk or ``crop_path`` updates).
195
+
187
196
 
188
197
  ## Result schema
189
198
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sigdetect"
7
- version = "0.2.0"
7
+ version = "0.3.1"
8
8
  description = "Signature detection and role attribution for PDFs"
9
9
  readme = "README.md"
10
10
  authors = [{ name = "BT Asmamaw", email = "basmamaw@angeiongroup.com" }]
@@ -4,9 +4,10 @@ from __future__ import annotations
4
4
 
5
5
  from contextlib import contextmanager
6
6
  from pathlib import Path
7
- from typing import Any, Generator, Iterable, Iterator, Literal
7
+ from typing import Any, Generator, Iterable, Iterator, Literal, overload
8
8
 
9
9
  from sigdetect.config import DetectConfiguration
10
+ from sigdetect.cropping import SignatureCrop
10
11
  from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
11
12
 
12
13
  EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
@@ -191,17 +192,45 @@ def detector_context(**kwargs: Any) -> Generator[Detector, None, None]:
191
192
  pass
192
193
 
193
194
 
195
+ @overload
194
196
  def CropSignatureImages(
195
197
  pdfPath: str | Path,
196
198
  fileResult: FileResult | dict[str, Any],
197
199
  *,
198
200
  outputDirectory: str | Path,
199
201
  dpi: int = 200,
200
- ) -> list[Path]:
202
+ returnBytes: Literal[False] = False,
203
+ saveToDisk: bool = True,
204
+ ) -> list[Path]: ...
205
+
206
+
207
+ @overload
208
+ def CropSignatureImages(
209
+ pdfPath: str | Path,
210
+ fileResult: FileResult | dict[str, Any],
211
+ *,
212
+ outputDirectory: str | Path,
213
+ dpi: int,
214
+ returnBytes: Literal[True],
215
+ saveToDisk: bool,
216
+ ) -> list[SignatureCrop]: ...
217
+
218
+
219
+ def CropSignatureImages(
220
+ pdfPath: str | Path,
221
+ fileResult: FileResult | dict[str, Any],
222
+ *,
223
+ outputDirectory: str | Path,
224
+ dpi: int = 200,
225
+ returnBytes: bool = False,
226
+ saveToDisk: bool = True,
227
+ ) -> list[Path] | list[SignatureCrop]:
201
228
  """Crop detected signature regions to PNG files.
202
229
 
203
230
  Accepts either a :class:`FileResult` instance or the ``dict`` returned by
204
231
  :func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.
232
+ Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop. Set
233
+ ``saveToDisk=False`` to skip writing PNG files while still returning in-memory data.
205
234
  """
206
235
 
207
236
  from sigdetect.cropping import crop_signatures
@@ -212,6 +241,8 @@ def CropSignatureImages(
212
241
  file_result=file_result_obj,
213
242
  output_dir=Path(outputDirectory),
214
243
  dpi=dpi,
244
+ return_bytes=returnBytes,
245
+ save_files=saveToDisk,
215
246
  )
216
247
  if original_dict is not None:
217
248
  original_dict.clear()
@@ -4,7 +4,9 @@ from __future__ import annotations
4
4
 
5
5
  import logging
6
6
  import re
7
+ from dataclasses import dataclass
7
8
  from pathlib import Path
9
+ from typing import Literal, overload
8
10
 
9
11
  from .detector.file_result_model import FileResult
10
12
  from .detector.signature_model import Signature
@@ -19,6 +21,29 @@ class SignatureCroppingUnavailable(RuntimeError):
19
21
  """Raised when PNG cropping cannot be performed (e.g., PyMuPDF missing)."""
20
22
 
21
23
 
24
+ @dataclass(slots=True)
25
+ class SignatureCrop:
26
+ """PNG crop metadata and in-memory content."""
27
+
28
+ path: Path
29
+ image_bytes: bytes
30
+ signature: Signature
31
+ saved_to_disk: bool = True
32
+
33
+
34
+ @overload
35
+ def crop_signatures(
36
+ pdf_path: Path,
37
+ file_result: FileResult,
38
+ *,
39
+ output_dir: Path,
40
+ dpi: int = 200,
41
+ logger: logging.Logger | None = None,
42
+ return_bytes: Literal[False] = False,
43
+ ) -> list[Path]: ...
44
+
45
+
46
+ @overload
22
47
  def crop_signatures(
23
48
  pdf_path: Path,
24
49
  file_result: FileResult,
@@ -26,22 +51,44 @@ def crop_signatures(
26
51
  output_dir: Path,
27
52
  dpi: int = 200,
28
53
  logger: logging.Logger | None = None,
29
- ) -> list[Path]:
30
- """Render each signature bounding box to a PNG image using PyMuPDF."""
54
+ return_bytes: Literal[True] = True,
55
+ ) -> list[SignatureCrop]: ...
56
+
57
+
58
+ def crop_signatures(
59
+ pdf_path: Path,
60
+ file_result: FileResult,
61
+ *,
62
+ output_dir: Path,
63
+ dpi: int = 200,
64
+ logger: logging.Logger | None = None,
65
+ return_bytes: bool = False,
66
+ save_files: bool = True,
67
+ ) -> list[Path] | list[SignatureCrop]:
68
+ """Render each signature bounding box to a PNG image using PyMuPDF.
69
+
70
+ Set ``return_bytes=True`` to collect in-memory PNG bytes for each crop while also writing
71
+ the files to ``output_dir``. Set ``save_files=False`` to skip writing PNGs to disk.
72
+ """
31
73
 
32
74
  if fitz is None: # pragma: no cover - exercised when dependency absent
33
75
  raise SignatureCroppingUnavailable(
34
76
  "PyMuPDF is required for PNG crops. Install 'pymupdf' or 'sigdetect[pymupdf]'."
35
77
  )
78
+ if not save_files and not return_bytes:
79
+ raise ValueError("At least one of save_files or return_bytes must be True")
36
80
 
37
81
  pdf_path = Path(pdf_path)
38
82
  output_dir = Path(output_dir)
39
- output_dir.mkdir(parents=True, exist_ok=True)
40
- generated: list[Path] = []
83
+ if save_files:
84
+ output_dir.mkdir(parents=True, exist_ok=True)
85
+ generated_paths: list[Path] = []
86
+ generated_crops: list[SignatureCrop] = []
41
87
 
42
88
  with fitz.open(pdf_path) as document: # type: ignore[attr-defined]
43
89
  per_document_dir = output_dir / pdf_path.stem
44
- per_document_dir.mkdir(parents=True, exist_ok=True)
90
+ if save_files:
91
+ per_document_dir.mkdir(parents=True, exist_ok=True)
45
92
  scale = dpi / 72.0
46
93
  matrix = fitz.Matrix(scale, scale)
47
94
 
@@ -70,8 +117,12 @@ def crop_signatures(
70
117
  destination = per_document_dir / filename
71
118
 
72
119
  try:
120
+ image_bytes: bytes | None = None
73
121
  pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
74
- pixmap.save(destination)
122
+ if save_files:
123
+ pixmap.save(destination)
124
+ if return_bytes:
125
+ image_bytes = pixmap.tobytes("png")
75
126
  except Exception as exc: # pragma: no cover - defensive
76
127
  if logger:
77
128
  logger.warning(
@@ -85,10 +136,22 @@ def crop_signatures(
85
136
  )
86
137
  continue
87
138
 
88
- signature.CropPath = str(destination)
89
- generated.append(destination)
139
+ if save_files:
140
+ signature.CropPath = str(destination)
141
+ generated_paths.append(destination)
142
+ if return_bytes:
143
+ if image_bytes is None: # pragma: no cover - defensive
144
+ continue
145
+ generated_crops.append(
146
+ SignatureCrop(
147
+ path=destination,
148
+ image_bytes=image_bytes,
149
+ signature=signature,
150
+ saved_to_disk=save_files,
151
+ )
152
+ )
90
153
 
91
- return generated
154
+ return generated_crops if return_bytes else generated_paths
92
155
 
93
156
 
94
157
  def _to_clip_rect(page, bbox: tuple[float, float, float, float]):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sigdetect
3
- Version: 0.2.0
3
+ Version: 0.3.1
4
4
  Summary: Signature detection and role attribution for PDFs
5
5
  Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
6
6
  License: MIT
@@ -192,14 +192,23 @@ for res in ScanDirectory(
192
192
  # 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
193
193
  detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
194
194
  file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
195
- CropSignatureImages(
195
+ crops = CropSignatureImages(
196
196
  "/path/to/pdfs/example.pdf",
197
197
  file_result,
198
198
  outputDirectory="./signature_crops",
199
199
  dpi=200,
200
+ returnBytes=True, # also returns in-memory PNG bytes for each crop
201
+ # saveToDisk=False, # optional: skip writing PNGs to disk
200
202
  )
203
+
204
+ first_crop = crops[0]
205
+ print(first_crop.path, len(first_crop.image_bytes))
201
206
  ~~~
202
207
 
208
+ When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
209
+ PNG bytes, and the originating signature metadata.
210
+ Pass ``saveToDisk=False`` if you only want in-memory PNG bytes (no files on disk or ``crop_path`` updates).
211
+
203
212
 
204
213
  ## Result schema
205
214
 
@@ -6,7 +6,7 @@ from pypdf.generic import ArrayObject, DictionaryObject, NameObject, NumberObjec
6
6
 
7
7
  from sigdetect.api import CropSignatureImages, DetectPdf
8
8
  from sigdetect.config import DetectConfiguration
9
- from sigdetect.cropping import crop_signatures
9
+ from sigdetect.cropping import SignatureCrop, crop_signatures
10
10
  from sigdetect.detector.pypdf2_engine import PyPDF2Detector
11
11
 
12
12
  pytest.importorskip("fitz")
@@ -70,3 +70,85 @@ def test_crop_signature_images_accepts_dict(tmp_path: Path) -> None:
70
70
 
71
71
  assert paths
72
72
  assert result_dict["signatures"][0]["crop_path"] is not None
73
+
74
+
75
+ def test_crop_signature_images_returns_bytes(tmp_path: Path) -> None:
76
+ pdf_path = tmp_path / "doc.pdf"
77
+ _pdf_with_signature(pdf_path)
78
+
79
+ result_dict = DetectPdf(pdf_path, engineName="pymupdf")
80
+ out_dir = tmp_path / "dict_byte_crops"
81
+ crops = CropSignatureImages(
82
+ pdf_path,
83
+ result_dict,
84
+ outputDirectory=out_dir,
85
+ returnBytes=True,
86
+ )
87
+
88
+ assert crops
89
+ assert isinstance(crops[0], SignatureCrop)
90
+ assert crops[0].image_bytes
91
+ assert result_dict["signatures"][0]["crop_path"] is not None
92
+
93
+
94
+ def test_crop_signature_images_can_skip_disk(tmp_path: Path) -> None:
95
+ pdf_path = tmp_path / "doc.pdf"
96
+ _pdf_with_signature(pdf_path)
97
+
98
+ result_dict = DetectPdf(pdf_path, engineName="pymupdf")
99
+ out_dir = tmp_path / "dict_byte_crops_no_disk"
100
+ crops = CropSignatureImages(
101
+ pdf_path,
102
+ result_dict,
103
+ outputDirectory=out_dir,
104
+ returnBytes=True,
105
+ saveToDisk=False,
106
+ )
107
+
108
+ assert crops
109
+ first_crop = crops[0]
110
+ assert isinstance(first_crop, SignatureCrop)
111
+ assert first_crop.image_bytes
112
+ assert first_crop.saved_to_disk is False
113
+ assert not first_crop.path.exists()
114
+ assert result_dict["signatures"][0]["crop_path"] is None
115
+
116
+
117
+ def test_crop_signatures_returns_bytes(tmp_path: Path) -> None:
118
+ pdf_path = tmp_path / "doc.pdf"
119
+ _pdf_with_signature(pdf_path)
120
+
121
+ cfg = DetectConfiguration(pdf_root=tmp_path, out_dir=tmp_path, engine="pypdf2")
122
+ result = PyPDF2Detector(cfg).Detect(pdf_path)
123
+
124
+ out_dir = tmp_path / "byte_crops"
125
+ crops = crop_signatures(
126
+ pdf_path,
127
+ result,
128
+ output_dir=out_dir,
129
+ dpi=120,
130
+ return_bytes=True,
131
+ )
132
+
133
+ assert crops
134
+ assert isinstance(crops[0], SignatureCrop)
135
+ assert crops[0].path.exists()
136
+ assert crops[0].image_bytes
137
+
138
+
139
+ def test_crop_signatures_requires_save_or_bytes(tmp_path: Path) -> None:
140
+ pdf_path = tmp_path / "doc.pdf"
141
+ _pdf_with_signature(pdf_path)
142
+
143
+ cfg = DetectConfiguration(pdf_root=tmp_path, out_dir=tmp_path, engine="pypdf2")
144
+ result = PyPDF2Detector(cfg).Detect(pdf_path)
145
+
146
+ with pytest.raises(ValueError):
147
+ crop_signatures(
148
+ pdf_path,
149
+ result,
150
+ output_dir=tmp_path / "unused",
151
+ dpi=120,
152
+ save_files=False,
153
+ return_bytes=False,
154
+ )
File without changes
File without changes