sigdetect 0.3.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,549 @@
1
+ """Wet signature detection via OCR-backed heuristics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import re
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+ from typing import Iterable, Sequence
10
+
11
+ from PIL import Image
12
+
13
+ from sigdetect.config import DetectConfiguration
14
+ from sigdetect.detector.file_result_model import FileResult
15
+ from sigdetect.detector.signature_model import Signature
16
+
17
+ try: # pragma: no cover - optional dependency
18
+ import fitz # type: ignore
19
+ except Exception: # pragma: no cover - optional dependency
20
+ fitz = None # type: ignore[misc]
21
+
22
+ try: # pragma: no cover - optional dependency
23
+ import pytesseract # type: ignore
24
+ from pytesseract import Output as TesseractOutput
25
+ except Exception: # pragma: no cover - optional dependency
26
+ pytesseract = None # type: ignore[assignment]
27
+ TesseractOutput = None # type: ignore[assignment]
28
+
29
+
30
LOGGER = logging.getLogger("sigdetect.wet")

# Label patterns that mark an OCR line as a signature prompt. They are searched
# against lower-cased text (see ``_has_signature_keyword``), so no IGNORECASE
# flag is required.
SIGNATURE_PATTERNS: tuple[re.Pattern[str], ...] = (
    re.compile(r"\bsignature\b"),
    re.compile(r"\bsigned\b"),
    re.compile(r"\bsign\b"),
    re.compile(r"\bsignature\s+of\b"),
    re.compile(r"\bsignature\s*:"),
    re.compile(r"\bsignature\s*-"),
    # No trailing ``\b`` here: ":" is a non-word character, so ``by:\b`` only
    # matched when a word character followed the colon immediately ("by:x").
    # The usual labels "By: Jane Doe" and a bare "By:" were silently missed.
    re.compile(r"\bby:"),
)

# Role -> substrings whose presence in lower-cased text selects that role.
# ``_infer_role`` walks this mapping in insertion order and returns the first
# role with any matching substring, so the order of the entries is significant.
# NOTE(review): "attorney" and "counsel" are also listed under "firm", and
# "law" is a substring of "lawyer", so the "attorney" entry looks unreachable
# as written - confirm whether that precedence is intended.
ROLE_KEYWORDS: dict[str, tuple[str, ...]] = {
    "client": ("client", "consumer", "claimant"),
    "firm": ("firm", "attorney", "counsel", "by:", "esq", "law"),
    "patient": ("patient", "self", "plaintiff"),
    "representative": ("guardian", "representative", "parent", "poa"),
    "attorney": ("attorney", "counsel", "lawyer"),
}
49
+
50
+
51
class WetDetectionUnavailable(RuntimeError):
    """Signal that the OCR-backed wet-signature pass could not be executed.

    Within this module it is raised when PyMuPDF or pytesseract is missing,
    or when the OCR call itself fails.
    """
53
+
54
+
55
@dataclass
class OcrLine:
    """A single OCR text line and its bounding box in image pixels.

    Instances are deliberately mutable: ``_extract_ocr_lines`` merges every
    word that shares a (block, paragraph, line) key into one instance,
    growing the text and the box as it goes.
    """

    # Recognised words of the line, joined by single spaces.
    text: str
    # Tesseract confidence rescaled from 0-100 to 0-1.
    confidence: float
    # Pixel coordinates of the box edges (origin and axes as reported by
    # pytesseract's ``image_to_data``).
    left: int
    top: int
    right: int
    bottom: int
65
+
66
+
67
def should_run_wet_pipeline(file_result: FileResult) -> bool:
    """Decide whether ``file_result`` still needs the OCR wet-signature pass.

    The pass is skipped only when ``ElectronicSignatureFound`` is truthy.
    """

    if file_result.ElectronicSignatureFound:
        return False
    return True
71
+
72
+
73
def apply_wet_detection(
    pdf_path: Path,
    configuration: DetectConfiguration,
    file_result: FileResult,
    *,
    logger: logging.Logger | None = None,
) -> bool:
    """Run the OCR wet-signature pass and fold its findings into ``file_result``.

    Returns ``True`` only when ``_detect`` reports that signatures were added.
    In every other case that gets past the initial gate (dependencies missing,
    nothing accepted, or an exception during detection) a ``ManualReview:*``
    hint is recorded on ``file_result`` and ``False`` is returned.

    ``ElectronicSignatureFound`` and ``MixedContent`` are restored to their
    pre-detection values on the way out, because ``_refresh_metadata``
    overwrites both while recomputing the signature summary.
    """

    if not should_run_wet_pipeline(file_result):
        return False

    try:
        _ensure_dependencies()
    except WetDetectionUnavailable as unavailable:
        reason = str(unavailable)
        _mark_manual_review(file_result, reason)
        if logger:
            logger.warning("Wet detection unavailable", extra={"error": reason})
        return False

    saved_flags = (file_result.ElectronicSignatureFound, file_result.MixedContent)
    try:
        found = _detect(pdf_path, configuration, file_result, logger=logger)
        if not found:
            _mark_manual_review(file_result, "NoHighConfidenceWetSignature")
        return found
    except Exception as failure:  # pragma: no cover - defensive
        _mark_manual_review(file_result, "WetDetectionError")
        if logger:
            logger.warning("Wet detection failed", extra={"error": str(failure)})
        return False
    finally:
        file_result.ElectronicSignatureFound, file_result.MixedContent = saved_flags
108
+
109
+
110
def _detect(
    pdf_path: Path,
    configuration: DetectConfiguration,
    file_result: FileResult,
    *,
    logger: logging.Logger | None = None,
) -> bool:
    """Scan every page of ``pdf_path`` and append accepted wet signatures.

    Each page is rendered at ``configuration.WetOcrDpi``, OCR'd, and turned
    into candidates (OCR label lines plus embedded images). Candidates scoring
    at least ``configuration.WetPrecisionThreshold`` become signatures; after
    de-duplication the survivors are appended to ``file_result.Signatures``
    and the summary fields are refreshed.

    Returns ``True`` when at least one signature was appended.

    Raises:
        WetDetectionUnavailable: if PyMuPDF or pytesseract is not importable.
    """

    if fitz is None or pytesseract is None:
        raise WetDetectionUnavailable("PyMuPDF or pytesseract not available")

    document = fitz.open(pdf_path)  # type: ignore[attr-defined]
    try:
        # PDF user space is 72 units per inch, so this maps points to pixels.
        scale = configuration.WetOcrDpi / 72.0
        matrix = fitz.Matrix(scale, scale)
        collected: list[Signature] = []

        for page_number in range(1, document.page_count + 1):
            page = document.load_page(page_number - 1)
            pixmap = page.get_pixmap(matrix=matrix, alpha=False)
            image = _pixmap_to_image(pixmap)
            ocr_lines = _extract_ocr_lines(image, configuration.WetOcrLanguages)

            page_candidates = list(
                _build_candidates(
                    ocr_lines,
                    image=image,
                    page_rect=page.rect,
                    pix_width=pixmap.width,
                    pix_height=pixmap.height,
                    scale=scale,
                )
            )
            page_candidates.extend(_image_candidates(page))
            page_candidates = _filter_candidates_for_page(page_candidates)

            accepted = []
            for candidate in page_candidates:
                if candidate.Score >= configuration.WetPrecisionThreshold:
                    accepted.append(candidate)

            if logger:
                summary = {
                    "pdf": pdf_path.name,
                    "page": page_number,
                    "candidates": len(page_candidates),
                    "accepted": len(accepted),
                }
                logger.debug("Wet detection page summary", extra=summary)

            collected.extend(_to_signatures(accepted, page_number))

        unique = _dedupe_wet_signatures(collected) if collected else []
        if not unique:
            return False

        file_result.Signatures.extend(unique)
        _refresh_metadata(file_result)
        return True
    finally:
        document.close()
169
+
170
+
171
def _ensure_dependencies() -> None:
    """Raise ``WetDetectionUnavailable`` unless both optional backends imported.

    PyMuPDF (``fitz``) is checked first, then pytesseract together with its
    ``Output`` helper.
    """

    if fitz is None:
        raise WetDetectionUnavailable("PyMuPDF is required for wet detection (install 'pymupdf').")

    ocr_ready = pytesseract is not None and TesseractOutput is not None
    if not ocr_ready:
        raise WetDetectionUnavailable(
            "pytesseract is required for wet detection and depends on the Tesseract OCR binary."
        )
178
+
179
+
180
def _pixmap_to_image(pixmap) -> Image.Image:
    """Wrap the raw samples of ``pixmap`` in an RGB Pillow image.

    A pixmap with an alpha channel is decoded as RGBA and then converted to
    RGB; otherwise the samples are decoded as RGB directly.
    NOTE(review): assumes three colour channels - confirm callers never pass
    a grayscale or CMYK pixmap.
    """

    has_alpha = bool(pixmap.alpha)
    size = [pixmap.width, pixmap.height]
    if not has_alpha:
        return Image.frombytes("RGB", size, pixmap.samples)
    return Image.frombytes("RGBA", size, pixmap.samples).convert("RGB")
188
+
189
+
190
def _extract_ocr_lines(image: Image.Image, languages: str) -> list[OcrLine]:
    """Run Tesseract on ``image`` and merge the recognised words into lines.

    Words are grouped by their ``(block_num, par_num, line_num)`` key. Empty
    words and words with a confidence of zero or below are ignored. For each
    line the texts are joined with single spaces, the bounding box is the
    union of the word boxes, and the confidence is the arithmetic mean of the
    word confidences rescaled to 0-1.

    Args:
        image: Rendered page image handed to ``pytesseract.image_to_data``.
        languages: Tesseract language string passed through as ``lang``.

    Returns:
        One ``OcrLine`` per line key, in first-seen order.

    Raises:
        WetDetectionUnavailable: if pytesseract is missing or the OCR call fails.
    """

    if pytesseract is None or TesseractOutput is None:
        raise WetDetectionUnavailable("pytesseract unavailable")

    try:
        data = pytesseract.image_to_data(image, lang=languages, output_type=TesseractOutput.DICT)
    except Exception as exc:  # pragma: no cover - passthrough to manual review
        raise WetDetectionUnavailable(f"OCR failed: {exc}") from exc

    lines: dict[tuple[int, int, int], OcrLine] = {}
    word_counts: dict[tuple[int, int, int], int] = {}
    for idx, raw_text in enumerate(data.get("text", [])):
        text = (raw_text or "").strip()
        if not text:
            continue
        conf_raw = float(data["conf"][idx])
        if conf_raw <= 0:
            continue
        confidence = conf_raw / 100.0

        key = (data["block_num"][idx], data["par_num"][idx], data["line_num"][idx])
        left = int(data["left"][idx])
        top = int(data["top"][idx])
        right = left + int(data["width"][idx])
        bottom = top + int(data["height"][idx])

        existing = lines.get(key)
        if existing is None:
            lines[key] = OcrLine(
                text=text,
                confidence=confidence,
                left=left,
                top=top,
                right=right,
                bottom=bottom,
            )
            word_counts[key] = 1
            continue

        word_counts[key] += 1
        existing.text = f"{existing.text} {text}"
        # Incremental arithmetic mean. The previous ``(old + new) / 2`` fold
        # halved the weight of every earlier word at each step, so the line
        # confidence was dominated by its last few words.
        existing.confidence = min(
            1.0,
            existing.confidence + (confidence - existing.confidence) / word_counts[key],
        )
        existing.left = min(existing.left, left)
        existing.top = min(existing.top, top)
        existing.right = max(existing.right, right)
        existing.bottom = max(existing.bottom, bottom)
    return list(lines.values())
232
+
233
+
234
@dataclass
class WetCandidate:
    """A possible wet signature found on one page, before thresholding."""

    # Box as produced by ``_expand_bbox`` or ``_image_candidates``.
    bbox: tuple[float, float, float, float]
    # Role label from ``_infer_role`` / ``_infer_role_nearby`` ("unknown" when none matched).
    Role: str
    # Confidence in the 0-1 range; compared against the configured precision threshold.
    Score: float
    # Free-form evidence tags such as "stroke:yes" or "image_signature:true".
    Evidence: list[str]
240
+
241
+
242
def _build_candidates(
    lines: Iterable[OcrLine],
    *,
    image: Image.Image,
    page_rect,
    pix_width: int,
    pix_height: int,
    scale: float,
) -> Iterable[WetCandidate]:
    """Yield a scored candidate for each OCR line that looks like a signature label.

    A line is skipped when it has no signature keyword, is longer than 80
    characters, or ends in the top 40% of the rendered page. The score is the
    OCR confidence plus keyword, stroke and position bonuses, capped at 1.0.
    ``pix_width`` is accepted for interface compatibility but not used.
    """

    for ocr_line in lines:
        lowered = ocr_line.text.lower()
        # Order matters: the keyword test runs first, then the paragraph-length
        # test, and only then the vertical-position test.
        if (
            not _has_signature_keyword(lowered)
            or len(lowered) > 80
            or (ocr_line.bottom / pix_height) < 0.4
        ):
            continue

        role = _infer_role(lowered)
        stroke_found, stroke_y = _stroke_under_line(image, ocr_line)

        bonus = _keyword_bonus(lowered)
        if stroke_found:
            bonus += 0.12
        # Mild positional prior: labels ending in the bottom 30% of the page
        # are more likely to belong to a signature block.
        if (ocr_line.bottom / pix_height) > 0.7:
            bonus += 0.05
        score = min(1.0, ocr_line.confidence + bonus)

        stroke_tag = "stroke:yes" if stroke_found else "stroke:no"
        evidence = [
            f"ocr_line:{ocr_line.text.strip()}",
            f"ocr_conf:{score:.2f}",
            "wet:true",
            stroke_tag,
        ]
        yield WetCandidate(
            bbox=_expand_bbox(ocr_line, page_rect, pix_height, scale, stroke_y=stroke_y),
            Role=role,
            Score=score,
            Evidence=evidence,
        )
282
+
283
+
284
+ def _has_evidence(candidate: WetCandidate, token: str) -> bool:
285
+ return token in candidate.Evidence
286
+
287
+
288
+ def _is_image_candidate(candidate: WetCandidate) -> bool:
289
+ return _has_evidence(candidate, "image_signature:true")
290
+
291
+
292
+ def _has_stroke(candidate: WetCandidate) -> bool:
293
+ return _has_evidence(candidate, "stroke:yes")
294
+
295
+
296
+ def _filter_candidates_for_page(candidates: Sequence[WetCandidate]) -> list[WetCandidate]:
297
+ if not candidates:
298
+ return []
299
+ has_image = any(_is_image_candidate(candidate) for candidate in candidates)
300
+ if not has_image:
301
+ return list(candidates)
302
+ return [
303
+ candidate
304
+ for candidate in candidates
305
+ if _is_image_candidate(candidate) or _has_stroke(candidate)
306
+ ]
307
+
308
+
309
def _infer_role(normalized_text: str) -> str:
    """Map lower-cased text to the first role in ``ROLE_KEYWORDS`` that matches.

    Matching is plain substring containment and roles are tried in the
    mapping's insertion order; ``"unknown"`` is returned when none match.
    """

    matching_roles = (
        role
        for role, keywords in ROLE_KEYWORDS.items()
        if any(keyword in normalized_text for keyword in keywords)
    )
    return next(matching_roles, "unknown")
314
+
315
+
316
+ def _keyword_bonus(normalized_text: str) -> float:
317
+ bonus = 0.0
318
+ if "signature" in normalized_text:
319
+ bonus += 0.05
320
+ if "date" in normalized_text:
321
+ bonus -= 0.02
322
+ if "by:" in normalized_text:
323
+ bonus += 0.03
324
+ return bonus
325
+
326
+
327
def _has_signature_keyword(normalized_text: str) -> bool:
    """Return True when any pattern in ``SIGNATURE_PATTERNS`` is found in the text."""

    for pattern in SIGNATURE_PATTERNS:
        if pattern.search(normalized_text):
            return True
    return False
329
+
330
+
331
+ def _expand_bbox(
332
+ line: OcrLine,
333
+ page_rect,
334
+ pix_height: int,
335
+ scale: float,
336
+ *,
337
+ stroke_y: float | None = None,
338
+ ) -> tuple[float, float, float, float]:
339
+ x0 = line.left / scale
340
+ x1 = line.right / scale
341
+ y1 = (pix_height - line.top) / scale
342
+
343
+ pad_x = max(14.0, (x1 - x0) * 0.25)
344
+ left = max(page_rect.x0, x0 - pad_x)
345
+ right = min(page_rect.x1, x1 + pad_x)
346
+
347
+ gap = 14.0
348
+ signature_height = 70.0
349
+ top = min(page_rect.y1, y1 + gap)
350
+ bottom = min(page_rect.y1, top + signature_height)
351
+
352
+ if bottom <= top:
353
+ bottom = min(page_rect.y1, top + signature_height)
354
+
355
+ if stroke_y is not None:
356
+ # Anchor to the detected stroke under the OCR label when available.
357
+ sy = (pix_height - stroke_y) / scale
358
+ if sy < top:
359
+ top = sy
360
+ bottom = max(bottom, sy + signature_height)
361
+
362
+ return (float(left), float(top), float(right), float(bottom))
363
+
364
+
365
def _stroke_under_line(image: Image.Image, line: OcrLine) -> tuple[bool, float | None]:
    """Heuristic: look for a dark horizontal stroke beneath the OCR line.

    A strip up to 28 pixels tall, starting 2 pixels below the line and padded
    10 pixels on either side, is scanned row by row. A pixel counts as dark
    when its grayscale value is below 160. The first row with the highest
    share of dark pixels wins, provided that share is at least 0.32.

    Returns:
        ``(True, y)`` with ``y`` the winning row in image pixel coordinates,
        or ``(False, None)`` when the strip is empty or no row is dense enough.
    """

    pad_x = 10
    strip_height = 28
    x0 = max(0, line.left - pad_x)
    x1 = min(image.width, line.right + pad_x)
    y0 = min(image.height, line.bottom + 2)
    y1 = min(image.height, y0 + strip_height)
    if x1 <= x0 or y1 <= y0:
        return False, None

    # Crop before converting so only the strip is turned to grayscale, not the
    # whole page for every OCR line; the conversion is per-pixel, so the
    # resulting values are the same either way.
    strip = image.crop((x0, y0, x1, y1)).convert("L")
    width, height = strip.size
    # Mode "L" stores one byte per pixel in row-major order, which lets rows be
    # sliced straight out of the buffer instead of calling getpixel per pixel.
    pixels = strip.tobytes()

    threshold = 160
    max_density = 0.0
    best_row = None
    for row in range(height):
        row_pixels = pixels[row * width : (row + 1) * width]
        dark = sum(1 for px in row_pixels if px < threshold)
        density = dark / width
        if density > max_density:
            max_density = density
            best_row = row
    if max_density < 0.32 or best_row is None:
        return False, None
    return True, float(y0 + best_row)
394
+
395
+
396
+ def _image_candidates(page) -> list[WetCandidate]:
397
+ """Heuristic: treat small, wide images near signature areas as wet signatures."""
398
+
399
+ candidates: list[WetCandidate] = []
400
+ page_width = float(page.rect.width)
401
+ page_height = float(page.rect.height)
402
+ page_area = page_width * page_height
403
+ words = page.get_text("words") or []
404
+
405
+ for info in page.get_image_info(xrefs=True) or []:
406
+ rect = info.get("bbox") or info.get("rect")
407
+ if rect is None:
408
+ continue
409
+ if hasattr(rect, "x0"):
410
+ x0, y0, x1, y1 = float(rect.x0), float(rect.y0), float(rect.x1), float(rect.y1)
411
+ elif isinstance(rect, tuple | list) and len(rect) == 4:
412
+ x0, y0, x1, y1 = map(float, rect)
413
+ else:
414
+ continue
415
+ width = float(x1 - x0)
416
+ height = float(y1 - y0)
417
+ if width <= 40 or height <= 15:
418
+ # Skip tiny marks/logos
419
+ continue
420
+ aspect = width / height if height else 0.0
421
+ if aspect < 1.6:
422
+ continue
423
+ if (width * height) / page_area > 0.1:
424
+ # Ignore large illustrations/backgrounds
425
+ continue
426
+
427
+ role = _infer_role_nearby(rect, words)
428
+ score = 0.9 if role != "unknown" else 0.84
429
+
430
+ bbox = (x0, float(page_height - y1), x1, float(page_height - y0))
431
+
432
+ evidence = ["image_signature:true"]
433
+ if role != "unknown":
434
+ evidence.append(f"role_hint:{role}")
435
+
436
+ candidates.append(
437
+ WetCandidate(
438
+ bbox=bbox,
439
+ Role=role,
440
+ Score=min(1.0, score),
441
+ Evidence=evidence,
442
+ )
443
+ )
444
+ return candidates
445
+
446
+
447
+ def _infer_role_nearby(rect, words) -> str:
448
+ """Best-effort role inference using text near the image rectangle."""
449
+
450
+ proximity_y = 48.0
451
+ proximity_x = 140.0
452
+ if hasattr(rect, "x0"):
453
+ rx0, ry0, rx1, ry1 = float(rect.x0), float(rect.y0), float(rect.x1), float(rect.y1)
454
+ elif isinstance(rect, tuple | list) and len(rect) == 4:
455
+ rx0, ry0, rx1, ry1 = map(float, rect)
456
+ else:
457
+ return "unknown"
458
+
459
+ nearby_tokens: list[str] = []
460
+ for word in words:
461
+ if len(word) < 5:
462
+ continue
463
+ x0, y0, x1, y1, token, *_ = word
464
+ if y1 < ry0 - proximity_y or y0 > ry1 + proximity_y:
465
+ continue
466
+ if x1 < rx0 - proximity_x or x0 > rx1 + proximity_x:
467
+ continue
468
+ nearby_tokens.append(str(token))
469
+ if not nearby_tokens:
470
+ return "unknown"
471
+ normalized = " ".join(nearby_tokens).lower()
472
+ return _infer_role(normalized)
473
+
474
+
475
+ def _needs_wet_enhancement(file_result: FileResult) -> bool:
476
+ """Return True when we should run wet OCR to refine pseudo/unknown signatures."""
477
+
478
+ return False
479
+
480
+
481
+ def _to_signatures(
482
+ candidates: Sequence[WetCandidate],
483
+ page_number: int,
484
+ ) -> list[Signature]:
485
+ signatures: list[Signature] = []
486
+ for candidate in candidates:
487
+ signatures.append(
488
+ Signature(
489
+ Page=page_number,
490
+ FieldName="wet_signature_detected",
491
+ Role=candidate.Role,
492
+ Score=int(round(candidate.Score * 100)),
493
+ Scores={candidate.Role: int(round(candidate.Score * 100))},
494
+ Evidence=candidate.Evidence,
495
+ Hint="WetSignatureOCR",
496
+ RenderType="wet",
497
+ BoundingBox=candidate.bbox,
498
+ )
499
+ )
500
+ return signatures
501
+
502
+
503
+ def _signature_rank(signature: Signature) -> tuple[int, int, int]:
504
+ evidence = set(signature.Evidence or [])
505
+ if "image_signature:true" in evidence:
506
+ source_rank = 3
507
+ elif "stroke:yes" in evidence:
508
+ source_rank = 2
509
+ else:
510
+ source_rank = 1
511
+ return (source_rank, int(signature.Score or 0), int(signature.Page or 0))
512
+
513
+
514
+ def _dedupe_wet_signatures(signatures: Sequence[Signature]) -> list[Signature]:
515
+ best_by_role: dict[str, Signature] = {}
516
+ for signature in signatures:
517
+ role = (signature.Role or "unknown").strip().lower()
518
+ if role == "unknown":
519
+ continue
520
+ existing = best_by_role.get(role)
521
+ if existing is None or _signature_rank(signature) > _signature_rank(existing):
522
+ best_by_role[role] = signature
523
+ return sorted(best_by_role.values(), key=lambda sig: (int(sig.Page or 0), sig.Role or ""))
524
+
525
+
526
def _mark_manual_review(file_result: FileResult, reason: str) -> None:
    """Add a ``ManualReview:<reason>`` entry to ``file_result.Hints``.

    The existing hints are split, the new entry is merged in, and the result
    is written back sorted and joined with ";".
    """

    # The merged set always holds at least the new entry, so it is never empty.
    tagged = _split_hints(file_result.Hints) | {f"ManualReview:{reason}"}
    file_result.Hints = ";".join(sorted(tagged))
530
+
531
+
532
def _refresh_metadata(file_result: FileResult) -> None:
    """Recompute the summary fields of ``file_result`` from its signatures.

    Updates ``SignatureCount``, ``SignaturePages``, ``Roles`` (only when at
    least one known role exists), ``ElectronicSignatureFound``,
    ``MixedContent`` and ``Hints`` in place.
    """

    signatures = file_result.Signatures
    file_result.SignatureCount = len(signatures)

    signature_pages = sorted({sig.Page for sig in signatures if sig.Page})
    file_result.SignaturePages = ",".join(map(str, signature_pages))

    # Skip missing/empty roles as well as "unknown": a ``None`` role would
    # otherwise break ``sorted`` or ``";".join``. ``_dedupe_wet_signatures``
    # likewise treats a missing role as unknown.
    roles = sorted({sig.Role for sig in signatures if sig.Role and sig.Role != "unknown"})
    if roles:
        file_result.Roles = ";".join(roles)

    file_result.ElectronicSignatureFound = file_result.SignatureCount > 0
    file_result.MixedContent = file_result.ElectronicSignatureFound and bool(file_result.ScannedPdf)

    hints = _split_hints(file_result.Hints)
    hints |= {sig.Hint for sig in signatures if sig.Hint}
    file_result.Hints = ";".join(sorted(hints))
544
+
545
+
546
+ def _split_hints(hints: str | None) -> set[str]:
547
+ if not hints:
548
+ return set()
549
+ return {hint for hint in hints.split(";") if hint}