sigdetect 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sigdetect/cropping.py ADDED
@@ -0,0 +1,177 @@
1
+ """Helpers for converting signature bounding boxes into PNG crops."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import re
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+ from typing import Literal, overload
10
+
11
+ from .detector.file_result_model import FileResult
12
+ from .detector.signature_model import Signature
13
+
14
+ try: # pragma: no cover - optional dependency
15
+ import fitz # type: ignore
16
+ except Exception: # pragma: no cover - optional dependency
17
+ fitz = None # type: ignore[misc]
18
+
19
+
20
class SignatureCroppingUnavailable(RuntimeError):
    """Signal that signature PNG crops cannot be produced.

    ``crop_signatures`` raises this when the optional PyMuPDF (``fitz``) import failed.
    """
22
+
23
+
24
+ @dataclass(slots=True)
25
+ class SignatureCrop:
26
+ """PNG crop metadata and in-memory content."""
27
+
28
+ path: Path
29
+ image_bytes: bytes
30
+ signature: Signature
31
+
32
+
33
@overload
def crop_signatures(
    pdf_path: Path,
    file_result: FileResult,
    *,
    output_dir: Path,
    dpi: int = 200,
    logger: logging.Logger | None = None,
    return_bytes: Literal[False] = False,
) -> list[Path]: ...


# No default here: a defaulted ``Literal[True]`` would make a call that omits
# ``return_bytes`` match both overloads with incompatible return types.
@overload
def crop_signatures(
    pdf_path: Path,
    file_result: FileResult,
    *,
    output_dir: Path,
    dpi: int = 200,
    logger: logging.Logger | None = None,
    return_bytes: Literal[True],
) -> list[SignatureCrop]: ...


# Fallback for callers that pass a ``bool`` only known at runtime.
@overload
def crop_signatures(
    pdf_path: Path,
    file_result: FileResult,
    *,
    output_dir: Path,
    dpi: int = 200,
    logger: logging.Logger | None = None,
    return_bytes: bool,
) -> list[Path] | list[SignatureCrop]: ...


def crop_signatures(
    pdf_path: Path,
    file_result: FileResult,
    *,
    output_dir: Path,
    dpi: int = 200,
    logger: logging.Logger | None = None,
    return_bytes: bool = False,
) -> list[Path] | list[SignatureCrop]:
    """Render each signature bounding box to a PNG image using PyMuPDF.

    Files are written to ``output_dir / <pdf stem> / sig_<NN>_<slug>.png`` and every
    successfully rendered signature gets its ``CropPath`` attribute set to that file.
    Signatures without a bounding box or page number are skipped, as are signatures
    whose page cannot be loaded, whose clip rectangle is empty, or whose rendering fails
    (the latter two failures are reported through ``logger`` when one is given).

    Args:
        pdf_path: PDF file to render from.
        file_result: Detection result whose ``Signatures`` are cropped.
        output_dir: Root directory for the crops; created when missing.
        dpi: Render resolution; the page is scaled by ``dpi / 72``.
        logger: Optional logger that receives warnings for skipped signatures.
        return_bytes: When true, also collect the PNG bytes of each crop in memory.

    Returns:
        The written file paths, or ``SignatureCrop`` records when ``return_bytes`` is true.

    Raises:
        SignatureCroppingUnavailable: If PyMuPDF could not be imported.
    """

    if fitz is None:  # pragma: no cover - exercised when dependency absent
        raise SignatureCroppingUnavailable(
            "PyMuPDF is required for PNG crops. Install 'pymupdf' or 'sigdetect[pymupdf]'."
        )

    pdf_path = Path(pdf_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    generated_paths: list[Path] = []
    generated_crops: list[SignatureCrop] = []

    # Paths are handed to PyMuPDF as plain strings, which every PyMuPDF release accepts.
    with fitz.open(str(pdf_path)) as document:  # type: ignore[attr-defined]
        per_document_dir = output_dir / pdf_path.stem
        per_document_dir.mkdir(parents=True, exist_ok=True)
        scale = dpi / 72.0
        matrix = fitz.Matrix(scale, scale)

        # ``index`` counts every signature (including skipped ones) so file names stay
        # aligned with the position of the signature in ``file_result.Signatures``.
        for index, signature in enumerate(file_result.Signatures, start=1):
            if not signature.BoundingBox or not signature.Page:
                continue
            try:
                # Signature pages are 1-based; PyMuPDF pages are 0-based.
                page = document.load_page(signature.Page - 1)
            except Exception as exc:  # pragma: no cover - defensive
                if logger:
                    logger.warning(
                        "Failed to load page for signature crop",
                        extra={
                            "file": pdf_path.name,
                            "page": signature.Page,
                            "error": str(exc),
                        },
                    )
                continue

            clip = _to_clip_rect(page, signature.BoundingBox)
            if clip is None:
                continue

            filename = _build_filename(index, signature)
            destination = per_document_dir / filename

            image_bytes: bytes | None = None
            try:
                pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
                pixmap.save(str(destination))
                if return_bytes:
                    image_bytes = pixmap.tobytes("png")
            except Exception as exc:  # pragma: no cover - defensive
                if logger:
                    logger.warning(
                        "Failed to render signature crop",
                        extra={
                            "file": pdf_path.name,
                            "page": signature.Page,
                            "field": signature.FieldName,
                            "error": str(exc),
                        },
                    )
                continue

            signature.CropPath = str(destination)
            generated_paths.append(destination)
            if return_bytes:
                if image_bytes is None:  # pragma: no cover - defensive
                    continue
                generated_crops.append(
                    SignatureCrop(
                        path=destination,
                        image_bytes=image_bytes,
                        signature=signature,
                    )
                )

    return generated_crops if return_bytes else generated_paths
146
+
147
+
148
def _to_clip_rect(page, bbox: tuple[float, float, float, float]):
    """Turn a signature bounding box into a page-clamped ``fitz.Rect``, or ``None`` if empty.

    The corners may come in any order. The vertical axis is flipped against the page
    height (the box presumably uses a bottom-up PDF y axis -- confirm with the producer)
    and every edge is clamped to the page rectangle's width/height.
    """
    page_w = float(page.rect.width)
    page_h = float(page.rect.height)

    xa, ya, xb, yb = bbox
    low_x, high_x = min(xa, xb), max(xa, xb)
    low_y, high_y = min(ya, yb), max(ya, yb)

    left = _clamp(low_x, 0.0, page_w)
    right = _clamp(high_x, 0.0, page_w)
    # The larger input y becomes the upper edge after the flip.
    top = _clamp(page_h - high_y, 0.0, page_h)
    bottom = _clamp(page_h - low_y, 0.0, page_h)

    degenerate = right - left <= 0 or bottom - top <= 0
    return None if degenerate else fitz.Rect(left, top, right, bottom)
162
+
163
+
164
def _clamp(value: float, lower: float, upper: float) -> float:
    """Limit ``value`` to ``[lower, upper]``; ``lower`` wins if the bounds are crossed."""
    capped = upper if upper < value else value
    return capped if capped > lower else lower
166
+
167
+
168
def _build_filename(index: int, signature: Signature) -> str:
    """Compose the PNG file name for a crop from its index and a slug of its role/field name."""
    label = signature.Role
    if not label:
        # Fall back to the field name, then to a generic label.
        label = signature.FieldName or "signature"
    slug = _slugify(label)
    return f"sig_{index:02d}_{slug}.png"
172
+
173
+
174
def _slugify(value: str) -> str:
    """Reduce ``value`` to a lowercase token of letters, digits, ``_`` and ``-``.

    Runs of any other characters collapse to a single underscore, leading and trailing
    underscores are dropped, and an empty result becomes ``"signature"``.
    """
    normalized = value.strip().lower()
    collapsed = re.sub(r"[^A-Za-z0-9_-]+", "_", normalized)
    trimmed = collapsed.strip("_")
    if trimmed:
        return trimmed
    return "signature"
@@ -0,0 +1,420 @@
1
+ """PyMuPDF-backed detector that augments PyPDF2 heuristics with geometry."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Iterable, cast
7
+
8
+ from .pypdf2_engine import PyPDF2Detector
9
+ from .signature_model import Signature
10
+
11
+ try: # pragma: no cover - optional dependency
12
+ import fitz # type: ignore
13
+ except Exception: # pragma: no cover - optional dependency
14
+ fitz = None # type: ignore[misc]
15
+
16
+
17
class PyMuPDFDetector(PyPDF2Detector):
    """Detector that reuses PyPDF2 heuristics and annotates results via PyMuPDF.

    Detection is delegated to ``PyPDF2Detector.Detect``. The same file is then opened
    with PyMuPDF to attach bounding boxes to the detected signatures, either from real
    signature widgets or, for ``vendor_or_acro_detected`` entries, from text/image
    heuristics on the page. Bounding boxes are stored as ``(x0, y0, x1, y1)`` tuples
    with the y axis flipped against the page height (see ``_RectToPdfTuple``).
    """

    Name = "pymupdf"
    SIGNATURE_PADDING = 64.0
    # Substrings looked for (in lower-cased line text) when locating a role's label line.
    ROLE_KEYWORDS: dict[str, tuple[str, ...]] = {
        "client": ("client", "consumer", "claimant"),
        "firm": ("firm", "attorney", "attorneys", "counsel", "company", "llp", "llc", "law", "by:"),
        "patient": ("patient", "self", "plaintiff"),
        "representative": ("representative", "guardian", "parent"),
        "attorney": ("attorney", "counsel", "lawyer"),
    }

    def __init__(self, configuration):
        """Initialise the detector.

        Raises:
            ValueError: If the optional PyMuPDF dependency could not be imported.
        """
        if fitz is None:  # pragma: no cover - optional dependency
            raise ValueError(
                "PyMuPDF engine requires the optional 'pymupdf' dependency. Install via 'pip install "
                "sigdetect[pymupdf]' or add pymupdf to your environment."
            )
        super().__init__(configuration)

    def Detect(self, pdf_path: Path):  # type: ignore[override]
        """Run the PyPDF2 detection, then add bounding boxes using PyMuPDF.

        Annotation is best-effort: if PyMuPDF cannot open the file, the PyPDF2 result is
        returned unchanged.
        """
        result = super().Detect(pdf_path)

        try:
            document = fitz.open(str(pdf_path))
        except Exception:  # pragma: no cover - defensive
            return result

        with document:
            widget_map = self._CollectWidgetRects(document)
            self._ApplyWidgetRects(result.Signatures, widget_map)
            self._InferPseudoRects(result.Signatures, document)
        return result

    # ───────────────────────────────── widget helpers ─────────────────────────────────
    def _CollectWidgetRects(
        self, document
    ) -> dict[tuple[int, str], tuple[float, float, float, float]]:
        """Map ``(1-based page number, field name)`` to the rectangle of each signature widget."""
        mapping: dict[tuple[int, str], tuple[float, float, float, float]] = {}
        # Falls back to 6 when this PyMuPDF build does not expose the constant.
        signature_type = getattr(fitz, "PDF_WIDGET_TYPE_SIGNATURE", 6)
        for page_index in range(document.page_count):
            page = document.load_page(page_index)
            widgets = page.widgets() if hasattr(page, "widgets") else None
            if not widgets:
                continue
            for widget in widgets:
                name = (widget.field_name or "").strip()
                if not name:
                    continue
                # Only true signature widgets are recorded; every other field type is skipped.
                if getattr(widget, "field_type", None) != signature_type:
                    continue
                rect = self._RectToPdfTuple(widget.rect, page.rect.height)
                mapping[(page_index + 1, name)] = rect
        return mapping

    def _ApplyWidgetRects(
        self,
        signatures: Iterable[Signature],
        widget_map: dict[tuple[int, str], tuple[float, float, float, float]],
    ) -> None:
        """Assign widget rectangles to signatures that have a page and field name but no box."""
        for signature in signatures:
            if signature.BoundingBox or not signature.FieldName or not signature.Page:
                continue
            key = (signature.Page, signature.FieldName.strip())
            rect = widget_map.get(key)
            if rect:
                signature.BoundingBox = rect

    # ───────────────────────────── pseudo bbox inference ─────────────────────────────
    def _InferPseudoRects(self, signatures: Iterable[Signature], document) -> None:
        """Infer a bounding box for ``vendor_or_acro_detected`` signatures that lack one.

        When the signature has a page, only that page is examined; otherwise pages are
        tried from last to first and the first page yielding a rectangle is used (and
        recorded as the signature's page).
        """
        for signature in signatures:
            if signature.BoundingBox or signature.FieldName != "vendor_or_acro_detected":
                continue

            if signature.Page and signature.Page - 1 >= document.page_count:
                continue

            if signature.Page:
                candidate_pages = [signature.Page - 1]
            else:
                candidate_pages = list(range(document.page_count - 1, -1, -1))

            for page_index in candidate_pages:
                if page_index < 0 or page_index >= document.page_count:
                    continue
                page = document.load_page(page_index)
                lines = self._ExtractLines(page)
                rect_info = self._FindRoleLineRect(page, signature.Role, lines)
                if rect_info is None:
                    rect_info = self._FallbackSignatureRect(page, signature.Role, lines)
                if rect_info is not None:
                    rect, exclusion, mode = rect_info
                    padded = self._PadRect(rect, page.rect, signature.Role, exclusion, mode)
                    signature.BoundingBox = self._RectToPdfTuple(padded, page.rect.height)
                    # Same "page unknown" test as the candidate selection above, so a
                    # falsy page (None or 0) is always replaced by the page that matched.
                    if not signature.Page:
                        signature.Page = page_index + 1
                    break

    def _FindRoleLineRect(
        self,
        page,
        role: str,
        lines: list[dict[str, float | str]] | None = None,
    ) -> tuple[fitz.Rect, float | None, str] | None:
        """Locate the label line for ``role`` and the best rectangle associated with it.

        Lines are first required to contain ``"sign"``; if none qualify, or the last
        qualifying line starts above 60% of the page height, a second pass accepts
        lines without ``"sign"`` unless they mention ``"name"`` or ``"print"``. Both
        passes skip lines above a role/profile dependent minimum y and, when the role
        has keywords, lines containing none of them. The last match is the label.

        Returns:
            ``(rect, exclusion_y, mode)`` where ``mode`` is ``"stroke"`` (a nearby
            underscore/"x" line), ``"image"`` (a nearby image) or ``"label"`` (the
            label line itself), or ``None`` when no label line matched.
        """
        if lines is None:
            lines = self._ExtractLines(page)
        page_height = float(page.rect.height)
        keywords = self.ROLE_KEYWORDS.get(role, ())
        lower_roles = {"client", "firm", "representative", "attorney"}
        if self.Profile == "retainer" and role in {"client", "firm"}:
            min_factor = 0.15 if role == "client" else 0.4
            min_y = page_height * min_factor
        else:
            min_y = page_height * (0.58 if role == "firm" else 0.5) if role in lower_roles else 0.0

        def match_lines(require_signature: bool) -> list[tuple[int, dict[str, float | str]]]:
            selected: list[tuple[int, dict[str, float | str]]] = []
            for idx, line in enumerate(lines):
                lower = line["lower_text"]
                if lower.strip() == "":
                    continue
                if line["y0"] < min_y:
                    continue
                if require_signature and "sign" not in lower:
                    continue
                if not require_signature and "sign" not in lower:
                    if "name" in lower or "print" in lower:
                        continue
                if keywords and not any(keyword in lower for keyword in keywords):
                    continue
                selected.append((idx, line))
            return selected

        matches = match_lines(require_signature=True)
        if matches and matches[-1][1]["y0"] < page_height * 0.6:
            matches = []
        if not matches:
            matches = match_lines(require_signature=False)

        if matches:
            idx, target = matches[-1]
            label_rect = fitz.Rect(target["x0"], target["y0"], target["x1"], target["y1"])
            stroke = self._LocateStrokeLine(lines, idx, label_rect)
            if stroke is not None:
                rect, exclusion = stroke
                return rect, exclusion, "stroke"
            image = self._LocateSignatureImage(page, label_rect)
            if image is not None:
                exclusion = self._NextExclusionY(lines, idx + 1, image.y1)
                return image, exclusion, "image"
            exclusion = self._NextExclusionY(lines, idx + 1, label_rect.y1)
            return label_rect, exclusion, "label"
        return None

    def _FallbackSignatureRect(
        self,
        page,
        role: str | None = None,
        lines: list[dict[str, float | str]] | None = None,
    ) -> tuple[fitz.Rect, float | None, str] | None:
        """Pick a label rectangle without role matching.

        Uses the last line containing ``"sign"``; failing that, the last line on the
        page. Returns ``None`` only when the page has no text lines. ``role`` is
        accepted for signature parity with ``_FindRoleLineRect`` and is not used.
        """
        if lines is None:
            lines = self._ExtractLines(page)
        for idx in range(len(lines) - 1, -1, -1):
            line = lines[idx]
            lower = line["lower_text"]
            # "sign" also covers "signature", "signed", "signing", ...
            if "sign" in lower:
                rect = fitz.Rect(line["x0"], line["y0"], line["x1"], line["y1"])
                exclusion = self._NextExclusionY(lines, idx + 1, rect.y1)
                return rect, exclusion, "label"
        if lines:
            line = lines[-1]
            rect = fitz.Rect(line["x0"], line["y0"], line["x1"], line["y1"])
            exclusion = None
            return rect, exclusion, "label"
        return None

    def _ExtractLines(self, page) -> list[dict[str, float | str]]:
        """Group the page's words into text lines with their bounding boxes.

        Words are bucketed by their ``(block, line)`` numbers. Each returned dict has
        ``text``, ``lower_text``, ``x0``, ``y0``, ``x1`` and ``y1``; the list is sorted
        by ``(y0, x0)``.
        """
        words = page.get_text("words") or []
        buckets: dict[tuple[int, int], dict[str, object]] = {}
        for x0, y0, x1, y1, text, block, line, *_ in words:
            if not text.strip():
                continue
            key = (int(block), int(line))
            bucket = buckets.setdefault(
                key,
                {
                    "tokens": [],
                    "x0": float(x0),
                    "y0": float(y0),
                    "x1": float(x1),
                    "y1": float(y1),
                },
            )
            tokens = cast(list[str], bucket["tokens"])
            tokens.append(text)
            # Grow the line's box to enclose this word.
            bucket["x0"] = min(float(bucket["x0"]), float(x0))
            bucket["y0"] = min(float(bucket["y0"]), float(y0))
            bucket["x1"] = max(float(bucket["x1"]), float(x1))
            bucket["y1"] = max(float(bucket["y1"]), float(y1))
        lines: list[dict[str, float | str]] = []
        for bucket in buckets.values():
            text = " ".join(bucket["tokens"]).strip()  # type: ignore[arg-type]
            if not text:
                continue
            lines.append(
                {
                    "text": text,
                    "lower_text": text.lower(),
                    "x0": float(bucket["x0"]),
                    "y0": float(bucket["y0"]),
                    "x1": float(bucket["x1"]),
                    "y1": float(bucket["y1"]),
                }
            )
        lines.sort(key=lambda entry: (entry["y0"], entry["x0"]))
        return lines

    def _LocateStrokeLine(
        self,
        lines: list[dict[str, float | str]],
        label_index: int,
        label_rect: fitz.Rect,
    ) -> tuple[fitz.Rect, float | None] | None:
        """Look for a signature stroke among the three lines preceding the label.

        A stroke line contains ``"_"`` or starts with ``"x"`` and must overlap the
        label horizontally. Returns the stroke rectangle together with the label's
        ``y0`` as the exclusion limit, or ``None`` if no such line exists.
        """
        for idx in range(label_index - 1, max(label_index - 4, -1), -1):
            lower = lines[idx]["lower_text"]
            if "_" in lower or lower.strip().startswith("x"):
                rect = fitz.Rect(
                    lines[idx]["x0"],
                    lines[idx]["y0"],
                    lines[idx]["x1"],
                    lines[idx]["y1"],
                )
                overlap = min(rect.x1, label_rect.x1) - max(rect.x0, label_rect.x0)
                if overlap <= 0:
                    continue
                # The label's top edge caps the crop's bottom in _PadRect, so the crop
                # ends before the label text starts.
                return rect, label_rect.y0
        return None

    def _LocateSignatureImage(self, page, label_rect: fitz.Rect) -> fitz.Rect | None:
        """Return the bounding box of the page image closest to the label, if any.

        Candidates must be between 40x12 and 380x220 units in size, overlap the
        horizontal band from 40 left of the label to 220 right of it, and have their
        vertical centre within 220 of the label's top. The candidate with the smallest
        combined vertical and horizontal-centre distance wins.
        """
        candidates: list[tuple[float, fitz.Rect]] = []
        label_mid_x = (label_rect.x0 + label_rect.x1) / 2.0
        for image in page.get_images(full=True):
            bbox = page.get_image_bbox(image)
            if bbox is None:
                continue
            width = float(bbox.width)
            height = float(bbox.height)
            if width < 40.0 or height < 12.0:
                continue
            if width > 380.0 or height > 220.0:
                continue
            # Require the image to sit near the label horizontally and vertically.
            horiz_overlap = min(bbox.x1, label_rect.x1 + 220.0) - max(bbox.x0, label_rect.x0 - 40.0)
            if horiz_overlap <= 0:
                continue
            vertical_gap = abs(((bbox.y0 + bbox.y1) / 2.0) - label_rect.y0)
            if vertical_gap > 220.0:
                continue
            candidates.append((vertical_gap + abs(((bbox.x0 + bbox.x1) / 2.0) - label_mid_x), bbox))

        if not candidates:
            return None
        candidates.sort(key=lambda item: item[0])
        return candidates[0][1]

    def _NextExclusionY(
        self,
        lines: list[dict[str, float | str]],
        start_index: int,
        minimum_y: float | None = None,
    ) -> float | None:
        """Find where the next name/print/date/"by:" line starts after the signature area.

        Scans ``lines[start_index:]`` and returns the ``y0`` of the first line whose
        ``y0`` exceeds ``minimum_y + 1.0`` and whose text contains one of those tokens.
        With ``minimum_y=None`` no vertical threshold applies. Returns ``None`` when no
        such line exists.
        """
        # Explicit None check: 0.0 is a legitimate minimum and must not disable the threshold.
        floor = minimum_y if minimum_y is not None else -float("inf")
        threshold = floor + 1.0
        for line in lines[start_index:]:
            y0 = float(line["y0"])
            if y0 <= threshold:
                continue
            lower = line["lower_text"]
            if any(token in lower for token in ("name", "print", "date", "by:")):
                return y0
        return None

    def _RectToPdfTuple(self, rect, page_height: float) -> tuple[float, float, float, float]:
        """Convert ``rect`` to an ``(x0, y0, x1, y1)`` tuple with y flipped against ``page_height``.

        The result is normalised so that ``x0 <= x1`` and ``y0 <= y1``.
        """
        x0 = float(rect.x0)
        x1 = float(rect.x1)
        y0 = page_height - float(rect.y1)
        y1 = page_height - float(rect.y0)
        if x1 < x0:
            x0, x1 = x1, x0
        if y1 < y0:
            y0, y1 = y1, y0
        return (x0, y0, x1, y1)

    def _PadRect(
        self,
        rect,
        page_rect,
        role: str | None = None,
        exclusion_y0: float | None = None,
        mode: str = "label",
    ):
        """Return a region focused on the expected signature line beneath ``rect``.

        Args:
            rect: Anchor rectangle (stroke line, image, or label line, per ``mode``).
            page_rect: Page rectangle used to keep the result on the page.
            role: Signature role; influences minimum height and retainer-profile padding.
            exclusion_y0: Optional y coordinate the region's bottom must stay 4 units above.
            mode: ``"stroke"``, ``"image"`` or ``"label"``.

        Returns:
            A ``fitz.Rect`` at most 198 wide and 72 high.
        """

        max_width = 198.0  # 2.75 inches
        max_height = 72.0  # 1 inch

        # Horizontal extent: fixed padding for strokes/images, proportional for labels.
        if mode == "stroke":
            left = max(page_rect.x0, rect.x0 - 8.0)
            right = min(page_rect.x1, rect.x1 + 8.0)
        elif mode == "image":
            left = max(page_rect.x0, rect.x0 - 10.0)
            right = min(page_rect.x1, rect.x1 + 10.0)
        else:
            pad_x = max(12.0, float(rect.width) * 0.08)
            left = max(page_rect.x0, rect.x0 - pad_x)
            right = min(page_rect.x1, rect.x1 + pad_x)

        if self.Profile == "retainer" and role == "client" and mode in {"image", "label"}:
            left = max(page_rect.x0, rect.x0 - 12.0)
            right = min(page_rect.x1, rect.x1 + 16.0)
        elif self.Profile == "retainer" and role == "firm" and mode in {"image", "label"}:
            left = max(page_rect.x0, rect.x0 - 14.0)
            right = min(page_rect.x1, rect.x1 + 18.0)

        if right - left > max_width:
            if mode == "stroke":
                # Strokes keep their left edge and are trimmed on the right.
                right = min(page_rect.x1, left + max_width)
            else:
                # Otherwise shrink around the centre, then shift back onto the page.
                center = (left + right) / 2.0
                half = max_width / 2.0
                left = center - half
                right = center + half
                if left < page_rect.x0:
                    right += page_rect.x0 - left
                    left = page_rect.x0
                if right > page_rect.x1:
                    left -= right - page_rect.x1
                    right = page_rect.x1
                left = max(page_rect.x0, left)
                right = min(page_rect.x1, right)

        line_height = max(8.0, float(rect.height) or 12.0)
        signature_height = max(40.0, line_height * 2.2)
        if role == "client":
            signature_height = max(signature_height, 65.0)
        elif role == "firm":
            signature_height = max(signature_height, 60.0)
        elif role in {"representative", "patient", "attorney"}:
            signature_height = max(signature_height, 55.0)
        signature_height = min(signature_height, max_height)

        baseline = float(rect.y1)

        if mode == "stroke":
            # Surround the stroke line with a margin on both sides.
            margin_above = max(6.0, line_height)
            margin_below = max(18.0, line_height * 1.5)
            top = float(rect.y0) - margin_above
            bottom = float(rect.y1) + margin_below
            signature_height = min(bottom - top, max_height)
        elif mode == "image":
            image_height = float(rect.height) or 12.0
            signature_height = min(max_height, max(image_height + 18.0, 40.0))
            extra = max(0.0, signature_height - image_height)
            top = float(rect.y0) - min(extra * 0.25, 12.0)
            # Never start more than 2 units above the image.
            top = max(float(rect.y0) - 2.0, top)
            bottom = top + signature_height
        else:
            # Label mode: the region starts a small gap past the label's bottom edge.
            gap_above = max(10.0, min(24.0, line_height * 0.9))
            top = baseline + gap_above
            bottom = top + signature_height

        original_top = top

        if exclusion_y0 is not None:
            limited = exclusion_y0 - 4.0
            if bottom > limited:
                bottom = limited
                top = max(original_top, bottom - signature_height)
        if mode == "image":
            limit_below = float(rect.y1) + 24.0
            if bottom > limit_below:
                bottom = limit_below
                top = max(float(rect.y0) - 4.0, bottom - signature_height)

        if bottom - top > max_height:
            bottom = top + max_height
        signature_height = min(signature_height, max_height)

        if bottom > page_rect.y1:
            bottom = page_rect.y1
            top = max(original_top, bottom - signature_height)

        if bottom - top > max_height:
            bottom = top + max_height

        if top >= bottom:
            # The limits above collapsed the region; fall back to a box around the baseline.
            top = max(page_rect.y0, baseline - line_height)
            bottom = min(page_rect.y1, top + min(signature_height, max_height))

        return fitz.Rect(left, top, right, bottom)