sigdetect 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sigdetect/cropping.py CHANGED
@@ -7,7 +7,9 @@ import logging
7
7
  import re
8
8
  from dataclasses import dataclass
9
9
  from pathlib import Path
10
- from typing import Literal, overload
10
+ from typing import Callable, Literal, overload
11
+
12
+ from PIL import Image, ImageDraw
11
13
 
12
14
  from .detector.file_result_model import FileResult
13
15
  from .detector.signature_model import Signature
@@ -22,6 +24,13 @@ try: # pragma: no cover - optional dependency
22
24
  except Exception: # pragma: no cover - optional dependency
23
25
  Document = None # type: ignore[assignment]
24
26
 
27
+ try: # pragma: no cover - optional dependency
28
+ import pytesseract # type: ignore
29
+ from pytesseract import Output as TesseractOutput
30
+ except Exception: # pragma: no cover - optional dependency
31
+ pytesseract = None # type: ignore[assignment]
32
+ TesseractOutput = None # type: ignore[assignment]
33
+
25
34
 
26
35
  class SignatureCroppingUnavailable(RuntimeError):
27
36
  """Raised when PNG cropping cannot be performed (e.g., PyMuPDF missing)."""
@@ -53,6 +62,7 @@ def crop_signatures(
53
62
  return_bytes: Literal[False] = False,
54
63
  save_files: bool = True,
55
64
  docx: bool = False,
65
+ trim: bool = True,
56
66
  ) -> list[Path]: ...
57
67
 
58
68
 
@@ -67,6 +77,7 @@ def crop_signatures(
67
77
  return_bytes: Literal[True],
68
78
  save_files: bool = True,
69
79
  docx: bool = False,
80
+ trim: bool = True,
70
81
  ) -> list[SignatureCrop]: ...
71
82
 
72
83
 
@@ -80,6 +91,7 @@ def crop_signatures(
80
91
  return_bytes: bool = False,
81
92
  save_files: bool = True,
82
93
  docx: bool = False,
94
+ trim: bool = True,
83
95
  ) -> list[Path] | list[SignatureCrop]:
84
96
  """Render each signature bounding box to a PNG image and optionally wrap it in DOCX.
85
97
 
@@ -87,6 +99,7 @@ def crop_signatures(
87
99
  the files to ``output_dir``. Set ``save_files=False`` to skip writing PNGs to disk.
88
100
  When ``docx=True``, DOCX files are written instead of PNGs. When ``return_bytes`` is True
89
101
  and ``docx=True``, ``SignatureCrop.docx_bytes`` will contain the DOCX payload.
102
+ When ``trim`` is enabled, the crop is tightened around the detected ink where possible.
90
103
  """
91
104
 
92
105
  if fitz is None: # pragma: no cover - exercised when dependency absent
@@ -110,6 +123,15 @@ def crop_signatures(
110
123
  "python-docx is required to generate DOCX outputs for signature crops."
111
124
  )
112
125
 
126
+ page_signature_counts: dict[int, int] = {}
127
+ for signature in file_result.Signatures:
128
+ if signature.Page:
129
+ page_signature_counts[signature.Page] = page_signature_counts.get(signature.Page, 0) + 1
130
+
131
+ page_signature_index: dict[int, int] = {}
132
+ page_cache: dict[int, dict[str, object]] = {}
133
+ document_cache: dict[str, object] = {}
134
+
113
135
  with fitz.open(pdf_path) as document: # type: ignore[attr-defined]
114
136
  per_document_dir = output_dir / pdf_path.stem
115
137
  if save_files:
@@ -117,38 +139,201 @@ def crop_signatures(
117
139
  scale = dpi / 72.0
118
140
  matrix = fitz.Matrix(scale, scale)
119
141
 
142
+ page_changed = False
120
143
  for index, signature in enumerate(file_result.Signatures, start=1):
121
- if not signature.BoundingBox or not signature.Page:
122
- continue
123
- try:
124
- page = document.load_page(signature.Page - 1)
125
- except Exception as exc: # pragma: no cover - defensive
126
- if logger:
127
- logger.warning(
128
- "Failed to load page for signature crop",
129
- extra={
130
- "file": pdf_path.name,
131
- "page": signature.Page,
132
- "error": str(exc),
133
- },
144
+ page_number = signature.Page
145
+ if page_number:
146
+ page_signature_index[page_number] = page_signature_index.get(page_number, 0) + 1
147
+ signature_index = page_signature_index.get(page_number, 1)
148
+ else:
149
+ signature_index = 1
150
+ force_fallback = _is_pseudo_signature(signature)
151
+ force_existing = _has_image_evidence(signature)
152
+ candidates: list[tuple[str, int, tuple[float, float, float, float]]] = []
153
+ seen: set[tuple[int, float, float, float, float]] = set()
154
+
155
+ def add_candidate(
156
+ source: str,
157
+ candidate_page: int | None,
158
+ candidate_bbox: tuple[float, float, float, float] | None,
159
+ ) -> None:
160
+ if candidate_page is None or not _bbox_has_area(candidate_bbox):
161
+ return
162
+ key = (
163
+ candidate_page,
164
+ round(candidate_bbox[0], 2),
165
+ round(candidate_bbox[1], 2),
166
+ round(candidate_bbox[2], 2),
167
+ round(candidate_bbox[3], 2),
168
+ )
169
+ if key in seen:
170
+ return
171
+ seen.add(key)
172
+ candidates.append((source, candidate_page, candidate_bbox))
173
+
174
+ delayed_existing: tuple[int, tuple[float, float, float, float]] | None = None
175
+ if page_number and _bbox_has_area(signature.BoundingBox):
176
+ if force_fallback:
177
+ delayed_existing = (page_number, signature.BoundingBox)
178
+ else:
179
+ add_candidate("existing", page_number, signature.BoundingBox)
180
+
181
+ page = None
182
+ if page_number:
183
+ try:
184
+ page = document.load_page(page_number - 1)
185
+ except Exception as exc: # pragma: no cover - defensive
186
+ if logger:
187
+ logger.warning(
188
+ "Failed to load page for signature crop",
189
+ extra={
190
+ "file": pdf_path.name,
191
+ "page": page_number,
192
+ "error": str(exc),
193
+ },
194
+ )
195
+ page = None
196
+
197
+ if not force_existing:
198
+ if page is not None and page_number is not None:
199
+ resolved = _resolve_signature_bbox(
200
+ page,
201
+ signature,
202
+ page_cache=page_cache,
203
+ signature_count=page_signature_counts.get(page_number, 1),
204
+ signature_index=signature_index,
205
+ page_number=page_number,
134
206
  )
207
+ add_candidate("resolved", page_number, resolved)
208
+
209
+ fallback_page, resolved = _resolve_bbox_across_document(
210
+ document,
211
+ signature,
212
+ page_cache=page_cache,
213
+ document_cache=document_cache,
214
+ signature_index=signature_index,
215
+ signature_count=page_signature_counts.get(page_number or 1, 1),
216
+ skip_page=page_number,
217
+ )
218
+ add_candidate("fallback", fallback_page, resolved)
219
+
220
+ if delayed_existing is not None:
221
+ add_candidate("existing", delayed_existing[0], delayed_existing[1])
222
+
223
+ if not candidates:
135
224
  continue
136
225
 
137
- clip = _to_clip_rect(page, signature.BoundingBox)
138
- if clip is None:
226
+ best_bytes: bytes | None = None
227
+ best_bbox: tuple[float, float, float, float] | None = None
228
+ best_page: int | None = None
229
+ best_score: int | None = None
230
+
231
+ for source, candidate_page, candidate_bbox in candidates:
232
+ try:
233
+ page = document.load_page(candidate_page - 1)
234
+ except Exception as exc: # pragma: no cover - defensive
235
+ if logger:
236
+ logger.warning(
237
+ "Failed to load page for signature crop",
238
+ extra={
239
+ "file": pdf_path.name,
240
+ "page": candidate_page,
241
+ "error": str(exc),
242
+ },
243
+ )
244
+ continue
245
+
246
+ image_rects = page_cache.get(candidate_page, {}).get("image_rects")
247
+ if image_rects is None:
248
+ image_rects = _collect_image_rects(page)
249
+ page_cache.setdefault(candidate_page, {})["image_rects"] = image_rects
250
+
251
+ min_overlap = 0.6
252
+ min_center_overlap = 0.2
253
+ if signature.RenderType in {"drawn", "typed"} or _is_pseudo_signature(signature):
254
+ min_overlap = 0.3
255
+ min_center_overlap = 0.1
256
+ refined_bbox = _refine_bbox_with_image_rects(
257
+ page,
258
+ candidate_bbox,
259
+ image_rects=image_rects,
260
+ min_overlap=min_overlap,
261
+ min_center_overlap=min_center_overlap,
262
+ )
263
+ skip_trim = signature.RenderType in {"drawn", "typed"} or _is_pseudo_signature(signature)
264
+ candidate_allow_trim = True
265
+ if skip_trim and refined_bbox is not None:
266
+ candidate_allow_trim = False
267
+
268
+ render_bboxes: list[tuple[tuple[float, float, float, float], bool]] = [
269
+ (candidate_bbox, candidate_allow_trim)
270
+ ]
271
+ if refined_bbox and refined_bbox != candidate_bbox:
272
+ skip_trim = signature.RenderType in {"drawn", "typed"} or _is_pseudo_signature(signature)
273
+ render_bboxes.append((refined_bbox, not skip_trim))
274
+ if (signature.RenderType or "").lower() == "wet":
275
+ expanded_bbox = _expand_wet_bbox(page, candidate_bbox)
276
+ if expanded_bbox and expanded_bbox not in {candidate_bbox, refined_bbox}:
277
+ render_bboxes.append((expanded_bbox, True))
278
+
279
+ for render_bbox, allow_trim in render_bboxes:
280
+ clip = _to_clip_rect(page, render_bbox)
281
+ if clip is None:
282
+ continue
283
+ try:
284
+ pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
285
+ raw_bytes = pixmap.tobytes("png")
286
+ final_bytes = (
287
+ _trim_signature_image_bytes(
288
+ raw_bytes,
289
+ render_type=signature.RenderType,
290
+ )
291
+ if trim and allow_trim
292
+ else raw_bytes
293
+ )
294
+ except Exception as exc: # pragma: no cover - defensive
295
+ if logger:
296
+ logger.warning(
297
+ "Failed to render signature crop",
298
+ extra={
299
+ "file": pdf_path.name,
300
+ "page": candidate_page,
301
+ "field": signature.FieldName,
302
+ "error": str(exc),
303
+ },
304
+ )
305
+ continue
306
+
307
+ if _is_blank_crop(final_bytes):
308
+ continue
309
+
310
+ dark, _ = _ink_metrics(final_bytes)
311
+ if best_score is None or dark > best_score:
312
+ best_score = dark
313
+ best_bytes = final_bytes
314
+ best_bbox = render_bbox
315
+ best_page = candidate_page
316
+
317
+ if best_bytes is None or best_bbox is None or best_page is None:
139
318
  continue
140
319
 
320
+ if signature.Page != best_page:
321
+ signature.Page = best_page
322
+ page_changed = True
323
+ signature.BoundingBox = best_bbox
324
+
325
+ final_bytes = best_bytes
326
+
141
327
  filename = _build_filename(index, signature)
142
328
  png_destination = per_document_dir / filename
143
329
  docx_destination = png_destination.with_suffix(".docx")
144
330
 
145
331
  try:
146
332
  image_bytes: bytes | None = None
147
- pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
148
333
  if save_files and not docx_enabled:
149
- pixmap.save(png_destination)
334
+ png_destination.write_bytes(final_bytes)
150
335
  if return_bytes or docx_enabled:
151
- image_bytes = pixmap.tobytes("png")
336
+ image_bytes = final_bytes
152
337
  except Exception as exc: # pragma: no cover - defensive
153
338
  if logger:
154
339
  logger.warning(
@@ -206,6 +391,9 @@ def crop_signatures(
206
391
  )
207
392
  )
208
393
 
394
+ if page_changed:
395
+ _update_signature_pages(file_result)
396
+
209
397
  return generated_crops if return_bytes else generated_paths
210
398
 
211
399
 
@@ -221,6 +409,1370 @@ def _build_docx_bytes(image_bytes: bytes) -> bytes:
221
409
  return buffer.getvalue()
222
410
 
223
411
 
412
@dataclass(frozen=True)
class _OcrBox:
    """Immutable record of one OCR-recognised text fragment and its rectangle.

    ``_extract_ocr_boxes`` builds these with ``right = left + width`` and
    ``bottom = top + height`` from the OCR engine's per-item output.
    """

    text: str  # recognised text, whitespace-stripped by the producer
    confidence: float  # confidence value reported by the OCR engine for this item
    left: int  # x coordinate of the left edge
    top: int  # y coordinate of the top edge
    right: int  # left + width
    bottom: int  # top + height
420
+
421
+
422
# Vocabulary that marks an OCR fragment as form labelling (captions, roles,
# e-signature vendor boilerplate). All patterns are case-insensitive.
_OCR_LABEL_PATTERNS: tuple[re.Pattern[str], ...] = tuple(
    re.compile(expression, re.IGNORECASE)
    for expression in (
        r"\b(signature|signed|sign)\b",
        r"\b(date\s+signed|date)\b",
        r"\b(print(?:ed)?\s+name)\b",
        r"\b(client|patient|attorney|firm|law|counsel|representative|guardian|witness)\b",
        r"\b(docusign|docu\s*sign|envelope|adobe)\b",
    )
)


def _is_label_text(text: str) -> bool:
    """Return True when ``text`` contains any of the known label keywords.

    Blank or whitespace-only input is never a label.
    """
    normalized = text.strip().lower()
    if normalized:
        for pattern in _OCR_LABEL_PATTERNS:
            if pattern.search(normalized):
                return True
    return False
439
+
440
+
441
def _extract_ocr_boxes(
    image: Image.Image,
    *,
    languages: str = "eng",
    min_confidence: float = 50.0,
) -> list[_OcrBox]:
    """Run OCR over ``image`` and return the confidently recognised text boxes.

    This helper is best-effort: when pytesseract is not importable, or the OCR
    call raises for any reason, an empty list is returned instead of
    propagating the failure.

    Args:
        image: Image handed to ``pytesseract.image_to_data``.
        languages: Value passed as the ``lang`` argument of the OCR call.
        min_confidence: Items whose reported confidence is below this value
            are dropped.

    Returns:
        One ``_OcrBox`` per non-blank item that has a parseable confidence of
        at least ``min_confidence`` and a strictly positive width and height.
    """
    if pytesseract is None or TesseractOutput is None:
        return []
    try:
        data = pytesseract.image_to_data(
            image,
            lang=languages,
            config="--psm 11",
            output_type=TesseractOutput.DICT,
        )
    except Exception:
        # Deliberate catch-all: OCR is optional, so any engine failure simply
        # means "no text boxes available".
        return []

    boxes: list[_OcrBox] = []
    for idx, raw in enumerate(data.get("text", [])):
        text = str(raw).strip()
        if not text:
            continue
        try:
            confidence = float(data["conf"][idx])
            left = int(data["left"][idx])
            top = int(data["top"][idx])
            width = int(data["width"][idx])
            height = int(data["height"][idx])
        except (ValueError, KeyError, TypeError, IndexError):
            # IndexError guards against ragged output where a column is shorter
            # than the "text" column; such an item is skipped like any other
            # unparseable one instead of aborting the whole extraction.
            continue
        if confidence < min_confidence or width <= 0 or height <= 0:
            continue
        boxes.append(
            _OcrBox(
                text=text,
                confidence=confidence,
                left=left,
                top=top,
                right=left + width,
                bottom=top + height,
            )
        )
    return boxes
491
+
492
+
493
def _estimate_white_level(gray: Image.Image) -> int:
    """Return the first histogram bin at which 99.5% of the pixels are covered.

    The bins of ``gray.histogram()`` are accumulated from index 0 upwards and
    the index where the running total reaches ``int(width * height * 0.995)``
    is returned. If the total is never reached, 255 is returned.
    """
    target = int(gray.width * gray.height * 0.995)
    running = 0
    for level, population in enumerate(gray.histogram()):
        running += population
        if running >= target:
            return level
    return 255
505
+
506
+
507
def _find_horizontal_rule_rows(
    gray: Image.Image,
    *,
    threshold: int = 240,
    density_ratio: float = 0.25,
    max_thickness: int = 8,
) -> tuple[list[tuple[int, int]], list[int]]:
    """Find thin horizontal runs of densely inked rows.

    A row is dense when its count of pixels darker than ``threshold`` is at
    least ``int(width * density_ratio)``. Consecutive dense rows form a run;
    runs taller than ``max_thickness`` rows are discarded.

    Returns:
        ``(segments, row_density)`` where ``segments`` holds inclusive
        ``(first_row, last_row)`` pairs and ``row_density`` is the dark-pixel
        count of every row. Both are empty for a zero-sized image.
    """
    width, height = gray.size
    if not width or not height:
        return [], []
    px = gray.load()
    row_density = [
        sum(1 for x in range(width) if px[x, y] < threshold) for y in range(height)
    ]
    min_dark = int(width * density_ratio)

    rules: list[tuple[int, int]] = []
    run_start: int | None = None
    for y, dark in enumerate(row_density):
        if dark >= min_dark:
            if run_start is None:
                run_start = y
            continue
        if run_start is None:
            continue
        # The run covers rows run_start .. y - 1, i.e. y - run_start rows.
        if y - run_start <= max_thickness:
            rules.append((run_start, y - 1))
        run_start = None
    # A run still open here touches the bottom edge of the image.
    if run_start is not None and height - run_start <= max_thickness:
        rules.append((run_start, height - 1))
    return rules, row_density
537
+
538
+
539
def _find_vertical_rule_cols(
    gray: Image.Image,
    *,
    threshold: int = 240,
    density_ratio: float = 0.6,
    max_thickness: int = 6,
) -> tuple[list[tuple[int, int]], list[int]]:
    """Find thin vertical runs of densely inked columns.

    A column is dense when its count of pixels darker than ``threshold`` is at
    least ``int(height * density_ratio)``. Consecutive dense columns form a
    run; runs wider than ``max_thickness`` columns are discarded.

    Returns:
        ``(segments, col_density)`` where ``segments`` holds inclusive
        ``(first_col, last_col)`` pairs and ``col_density`` is the dark-pixel
        count of every column. Both are empty for a zero-sized image.
    """
    width, height = gray.size
    if not width or not height:
        return [], []
    px = gray.load()
    col_density = [
        sum(1 for y in range(height) if px[x, y] < threshold) for x in range(width)
    ]
    min_dark = int(height * density_ratio)

    rules: list[tuple[int, int]] = []
    run_start: int | None = None
    for x, dark in enumerate(col_density):
        if dark >= min_dark:
            if run_start is None:
                run_start = x
            continue
        if run_start is None:
            continue
        # The run covers columns run_start .. x - 1, i.e. x - run_start columns.
        if x - run_start <= max_thickness:
            rules.append((run_start, x - 1))
        run_start = None
    # A run still open here touches the right edge of the image.
    if run_start is not None and width - run_start <= max_thickness:
        rules.append((run_start, width - 1))
    return rules, col_density
569
+
570
+
571
def _find_ink_band(
    gray: Image.Image,
    *,
    threshold: int = 240,
    min_density_ratio: float = 0.004,
    gap_px: int = 3,
) -> tuple[int, int] | None:
    """Return the inclusive row range of the band holding the most dark pixels.

    Rows with at least ``max(2, int(width * min_density_ratio))`` pixels darker
    than ``threshold`` are grouped into runs. A run is joined to the previous
    one when its first row is at most ``gap_px`` past the previous run's last
    row. The joined band with the largest dark-pixel total wins; the earliest
    band wins ties. ``None`` is returned for a zero-sized image or when no row
    qualifies.
    """
    width, height = gray.size
    if not width or not height:
        return None
    px = gray.load()
    floor = max(2, int(width * min_density_ratio))
    row_density = [
        sum(1 for x in range(width) if px[x, y] < threshold) for y in range(height)
    ]

    # Collect maximal runs of consecutive qualifying rows.
    runs: list[tuple[int, int]] = []
    opened: int | None = None
    for y, dark in enumerate(row_density):
        if dark >= floor:
            if opened is None:
                opened = y
        elif opened is not None:
            runs.append((opened, y - 1))
            opened = None
    if opened is not None:
        runs.append((opened, height - 1))
    if not runs:
        return None

    # Join runs that are separated by only a small vertical gap.
    bands: list[list[int]] = []
    for first, last in runs:
        if bands and first - bands[-1][1] <= gap_px:
            bands[-1][1] = last
        else:
            bands.append([first, last])

    # max() keeps the first band among equal scores, matching a strict ">" scan.
    top, bottom = max(bands, key=lambda band: sum(row_density[band[0] : band[1] + 1]))
    return top, bottom
622
+
623
+
624
def _pick_line_below_band(
    segments: list[tuple[int, int]],
    *,
    band_end: int,
) -> tuple[int, int] | None:
    """Return the segment starting closest at or after ``band_end``.

    Only segments whose first coordinate is ``>= band_end`` are considered;
    the earliest-listed one wins ties. ``None`` is returned when no segment
    qualifies (including an empty ``segments`` list).
    """
    eligible = (seg for seg in segments if seg[0] >= band_end)
    return min(eligible, key=lambda seg: seg[0] - band_end, default=None)
635
+
636
+
637
def _bbox_from_boxes(boxes: list[_OcrBox]) -> tuple[int, int, int, int] | None:
    """Return the ``(left, top, right, bottom)`` rectangle enclosing all ``boxes``.

    ``None`` is returned for an empty list or when the enclosing rectangle has
    no positive width or height.
    """
    if not boxes:
        return None
    lefts, tops, rights, bottoms = zip(
        *((box.left, box.top, box.right, box.bottom) for box in boxes)
    )
    left, top, right, bottom = min(lefts), min(tops), max(rights), max(bottoms)
    if right <= left or bottom <= top:
        return None
    return left, top, right, bottom
647
+
648
+
649
def _trim_bbox_by_ocr_boxes(
    bbox: tuple[int, int, int, int],
    boxes: list[_OcrBox],
    *,
    min_gap: int = 6,
) -> tuple[int, int, int, int]:
    """Pull the bottom edge of ``bbox`` up to just above the first OCR box below it.

    Only boxes whose top lies at least ``min_gap`` below the bbox top and that
    overlap the bbox horizontally are considered. The highest such box limits
    the bottom edge to ``box.top - 1``, provided that stays below the bbox top.
    The other three edges are never changed.
    """
    if not boxes:
        return bbox
    x0, y0, x1, y1 = bbox
    tops = [
        box.top
        for box in boxes
        if box.top >= y0 + min_gap and box.left <= x1 and box.right >= x0
    ]
    if tops:
        cut = min(tops) - 1
        if cut > y0:
            y1 = min(y1, cut)
    return x0, y0, x1, y1
669
+
670
+
671
def _trim_border_lines(
    image: Image.Image,
    *,
    threshold: int = 240,
    density_ratio: float = 0.85,
    edge_ratio: float = 0.2,
) -> Image.Image:
    """Crop away near-solid dark lines located close to the image edges.

    Within a band of ``edge_ratio`` of the width/height along each edge, a
    column (row) counts as a border line when at least ``density_ratio`` of
    its pixels are darker than ``threshold``. The crop starts just inside the
    innermost such line on the left/top and stops at the innermost such line
    on the right/bottom. The original image is returned when it is zero-sized
    or when the remaining area would be 2 pixels or less in either dimension.
    """
    gray = image.convert("L")
    width, height = gray.size
    if not width or not height:
        return image
    px = gray.load()
    row_density = [sum(1 for x in range(width) if px[x, y] < threshold) for y in range(height)]
    col_density = [sum(1 for y in range(height) if px[x, y] < threshold) for x in range(width)]

    band_x = max(1, int(width * edge_ratio))
    band_y = max(1, int(height * edge_ratio))
    row_threshold = int(width * density_ratio)
    col_threshold = int(height * density_ratio)

    # Left/top: the LAST dense line inside the band, i.e. the innermost one.
    left_cut = max(
        (x for x in range(band_x) if col_density[x] >= col_threshold), default=-1
    )
    top_cut = max(
        (y for y in range(band_y) if row_density[y] >= row_threshold), default=-1
    )
    # Right/bottom: the FIRST dense line inside the band, again the innermost one.
    right_cut = next(
        (x for x in range(width - band_x, width) if col_density[x] >= col_threshold), width
    )
    bottom_cut = next(
        (y for y in range(height - band_y, height) if row_density[y] >= row_threshold), height
    )

    x0, x1 = max(0, left_cut + 1), min(width, right_cut)
    y0, y1 = max(0, top_cut + 1), min(height, bottom_cut)
    if x1 - x0 <= 2 or y1 - y0 <= 2:
        return image
    return image.crop((x0, y0, x1, y1))
718
+
719
+
720
def _select_signature_line(
    line_segments: list[tuple[int, int]],
    row_density: list[int],
) -> tuple[int, int] | None:
    """Return the segment whose rows carry the largest dark-pixel total.

    Each segment is an inclusive ``(first_row, last_row)`` pair scored by the
    sum of ``row_density`` over its rows. The earliest segment wins ties and
    ``None`` is returned for an empty list.
    """
    if not line_segments:
        return None
    # max() returns the first of several equal maxima, like a strict ">" scan.
    first, last = max(
        line_segments, key=lambda seg: sum(row_density[seg[0] : seg[1] + 1])
    )
    return (first, last)
734
+
735
+
736
def _component_bboxes(
    gray: Image.Image,
    *,
    threshold: int,
    min_pixels: int = 40,
    line_ratio: float = 12.0,
    edge_margin: int = 1,
) -> list[dict[str, int]]:
    """Describe the 8-connected components of pixels darker than ``threshold``.

    Components with fewer than ``min_pixels`` pixels are dropped, as are
    components whose bounding box is more than ``line_ratio`` times longer in
    one dimension than in the other.

    Returns:
        One dict per surviving component with the inclusive bounding box
        (``min_x``, ``min_y``, ``max_x``, ``max_y``), the pixel ``count``,
        ``edge_ratio`` (the fraction of its pixels lying within
        ``edge_margin`` of the bounding-box border) and ``edge_sum``
        (``edge_ratio * count``). Note that the last two values are floats.
    """
    width, height = gray.size
    if not width or not height:
        return []
    px = gray.load()
    seen = [False] * (width * height)
    found: list[dict[str, int]] = []

    for y in range(height):
        for x in range(width):
            flat = y * width + x
            if seen[flat]:
                continue
            seen[flat] = True
            if px[x, y] >= threshold:
                continue

            # Depth-first flood fill over the 3x3 neighbourhood of each pixel.
            pending = [(x, y)]
            members: list[tuple[int, int]] = []
            while pending:
                cx, cy = pending.pop()
                members.append((cx, cy))
                for nx in range(max(0, cx - 1), min(width, cx + 2)):
                    for ny in range(max(0, cy - 1), min(height, cy + 2)):
                        nflat = ny * width + nx
                        if seen[nflat]:
                            continue
                        # Light neighbours are marked too so they are never re-tested.
                        seen[nflat] = True
                        if px[nx, ny] < threshold:
                            pending.append((nx, ny))

            count = len(members)
            if count < min_pixels:
                continue
            min_x = min(mx for mx, _ in members)
            max_x = max(mx for mx, _ in members)
            min_y = min(my for _, my in members)
            max_y = max(my for _, my in members)
            box_w = max_x - min_x + 1
            box_h = max_y - min_y + 1
            if box_w > box_h * line_ratio or box_h > box_w * line_ratio:
                continue

            edge_count = sum(
                1
                for mx, my in members
                if mx <= min_x + edge_margin
                or mx >= max_x - edge_margin
                or my <= min_y + edge_margin
                or my >= max_y - edge_margin
            )
            edge_ratio = edge_count / max(1, count)
            found.append(
                {
                    "min_x": min_x,
                    "min_y": min_y,
                    "max_x": max_x,
                    "max_y": max_y,
                    "count": count,
                    "edge_ratio": edge_ratio,
                    "edge_sum": edge_ratio * count,
                }
            )
    return found
826
+
827
+
828
def _merge_component_bboxes(
    components: list[dict[str, int]],
    *,
    gap: int = 6,
) -> list[dict[str, int]]:
    """Greedily merge component boxes that lie within ``gap`` pixels of each other.

    Components are visited from largest to smallest ``count``. Each one is
    folded into the first existing cluster whose box is at most ``gap`` away
    on both axes; otherwise a copy of it starts a new cluster. Merging grows
    the cluster box, adds up ``count`` and ``edge_sum`` and recomputes
    ``edge_ratio``. The input dicts are not modified.
    """
    clusters: list[dict[str, int]] = []
    for comp in sorted(components, key=lambda item: item["count"], reverse=True):
        for cluster in clusters:
            # Axis distance between the two boxes; 0 when they overlap on that axis.
            dx = max(0, cluster["min_x"] - comp["max_x"], comp["min_x"] - cluster["max_x"])
            dy = max(0, cluster["min_y"] - comp["max_y"], comp["min_y"] - cluster["max_y"])
            if dx > gap or dy > gap:
                continue
            cluster["min_x"] = min(cluster["min_x"], comp["min_x"])
            cluster["min_y"] = min(cluster["min_y"], comp["min_y"])
            cluster["max_x"] = max(cluster["max_x"], comp["max_x"])
            cluster["max_y"] = max(cluster["max_y"], comp["max_y"])
            cluster["count"] += comp["count"]
            cluster["edge_sum"] = cluster.get("edge_sum", 0.0) + comp.get("edge_sum", 0.0)
            cluster["edge_ratio"] = cluster["edge_sum"] / max(1, cluster["count"])
            break
        else:
            # No cluster was close enough: seed a new one from a copy.
            seed = comp.copy()
            if "edge_sum" not in seed:
                seed["edge_sum"] = seed.get("edge_ratio", 0.0) * seed["count"]
            clusters.append(seed)
    return clusters
855
+
856
+
857
def _components_bbox(
    gray: Image.Image,
    *,
    threshold: int,
    min_pixels: int = 40,
    gap: int = 6,
    max_edge_ratio: float = 0.7,
) -> tuple[int, int, int, int] | None:
    """Return the inclusive bounding box of the dominant ink cluster, if any.

    Components found by ``_component_bboxes`` are grouped with
    ``_merge_component_bboxes``. The cluster with the highest pixel count is
    chosen among those whose ``edge_ratio`` is below ``max_edge_ratio``; when
    every cluster exceeds that limit, the largest cluster overall is used.
    ``None`` is returned when no component is found.
    """
    components = _component_bboxes(gray, threshold=threshold, min_pixels=min_pixels)
    if not components:
        return None
    clusters = _merge_component_bboxes(components, gap=gap)
    if not clusters:
        return None
    preferred = [
        cluster for cluster in clusters if cluster.get("edge_ratio", 0.0) < max_edge_ratio
    ]
    winner = max(preferred or clusters, key=lambda item: item["count"])
    return winner["min_x"], winner["min_y"], winner["max_x"], winner["max_y"]
877
+
878
+
879
def _select_line_cutoff(
    segments: list[tuple[int, int]],
    row_density: list[int],
    *,
    min_above_dark: int = 40,
    ratio_threshold: float = 0.5,
) -> int | None:
    """Return the row just below the lowest acceptable line segment.

    A segment is acceptable when the dark-pixel total above it is at least
    ``min_above_dark`` and, if there is any dark content below it, at least
    ``ratio_threshold`` times that lower total. The result is the last row of
    the lowest acceptable segment plus a pad of ``max(2, int(height * 0.01))``
    rows, clamped to the final row index. ``None`` is returned when no segment
    is acceptable.
    """
    if not segments:
        return None
    height = len(row_density)

    def acceptable(first: int, last: int) -> bool:
        above = sum(row_density[:first])
        if above < min_above_dark:
            return False
        below = sum(row_density[last + 1 :])
        return not (below > 0 and above < below * ratio_threshold)

    eligible = [(first, last) for first, last in segments if acceptable(first, last)]
    if not eligible:
        return None
    _, lowest_end = max(eligible, key=lambda seg: seg[1])
    pad = max(2, int(height * 0.01))
    return min(height - 1, lowest_end + pad)
903
+
904
+
905
def _mask_regions(
    gray: Image.Image,
    *,
    boxes: list[_OcrBox],
    line_segments: list[tuple[int, int]],
    vertical_segments: list[tuple[int, int]] | None = None,
    box_filter: Callable[[_OcrBox], bool] | None = None,
) -> Image.Image:
    """Return a copy of ``gray`` with rules and OCR boxes painted over with 255.

    Horizontal segments are blanked across the full width, vertical segments
    across the full height, and each OCR box over its own rectangle. When
    ``box_filter`` is given, only the boxes for which it returns True are
    blanked. The input image is left untouched.
    """
    masked = gray.copy()
    painter = ImageDraw.Draw(masked)
    width, height = gray.size

    # Gather the rectangles in the order they are painted: rows, columns, boxes.
    regions = [(0, top, width - 1, bottom) for top, bottom in line_segments]
    regions += [(left, 0, right, height - 1) for left, right in vertical_segments or ()]
    regions += [
        (box.left, box.top, box.right, box.bottom)
        for box in boxes
        if box_filter is None or box_filter(box)
    ]
    for region in regions:
        painter.rectangle(region, fill=255)
    return masked
926
+
927
+
928
def _whiteout_regions_rgb(
    image: Image.Image,
    *,
    boxes: list[_OcrBox],
    line_segments: list[tuple[int, int]],
    vertical_segments: list[tuple[int, int]] | None = None,
) -> Image.Image:
    """Return an RGB conversion of ``image`` with rules and OCR boxes painted white.

    Horizontal segments are blanked across the full width, vertical segments
    across the full height, and every OCR box over its own rectangle. A
    zero-sized image is returned as-is, without conversion.
    """
    width, height = image.size
    if not width or not height:
        return image
    white = (255, 255, 255)
    rgb = image.convert("RGB")
    painter = ImageDraw.Draw(rgb)

    # Gather the rectangles in the order they are painted: rows, columns, boxes.
    regions = [(0, top, width - 1, bottom) for top, bottom in line_segments]
    regions += [(left, 0, right, height - 1) for left, right in vertical_segments or ()]
    regions += [(box.left, box.top, box.right, box.bottom) for box in boxes]
    for region in regions:
        painter.rectangle(region, fill=white)
    return rgb
948
+
949
+
950
def _is_blue_pixel(r: int, g: int, b: int) -> bool:
    """Return True when blue exceeds 100 and beats both red and green by more than 25."""
    if b <= 100:
        return False
    return b > max(r, g) + 25
952
+
953
+
954
def _build_ink_mask(
    image: Image.Image,
    *,
    threshold: int,
    remove_blue: bool = True,
) -> Image.Image:
    """Build an "L" mode mask where ink pixels are 0 and everything else is 255.

    A pixel counts as ink when its luma (``0.299 R + 0.587 G + 0.114 B``,
    truncated to int) is below ``threshold``. With ``remove_blue`` enabled,
    pixels classified by ``_is_blue_pixel`` are never marked as ink.
    """
    rgb = image.convert("RGB")
    width, height = rgb.size
    mask = Image.new("L", (width, height), 255)
    source = rgb.load()
    target = mask.load()
    for y in range(height):
        for x in range(width):
            r, g, b = source[x, y]
            if remove_blue and _is_blue_pixel(r, g, b):
                continue
            if int(0.299 * r + 0.587 * g + 0.114 * b) < threshold:
                target[x, y] = 0
    return mask
972
+
973
+
974
def _tighten_to_ink_components(
    image: Image.Image,
    *,
    remove_blue: bool,
    pad_px: int = 2,
) -> Image.Image:
    """Crop ``image`` to its dominant ink cluster plus ``pad_px`` of padding.

    The ink threshold is the estimated white level minus 10, clamped to the
    range 200..245. With ``remove_blue`` the cluster is searched in a mask
    built by ``_build_ink_mask`` (at a fixed cut-off of 200); otherwise it is
    searched in the grayscale image directly. The original image is returned
    when no cluster is found or the padded box is degenerate.
    """
    gray = image.convert("L")
    threshold = min(245, max(200, _estimate_white_level(gray) - 10))
    if remove_blue:
        source = _build_ink_mask(image, threshold=threshold, remove_blue=True)
        cutoff = 200
    else:
        source = gray
        cutoff = threshold
    bbox = _components_bbox(source, threshold=cutoff, gap=10, max_edge_ratio=0.98)
    if bbox is None:
        return image

    left, top, right, bottom = bbox
    width, height = image.size
    left = max(0, left - pad_px)
    top = max(0, top - pad_px)
    right = min(width - 1, right + pad_px)
    bottom = min(height - 1, bottom + pad_px)
    if right <= left or bottom <= top:
        return image
    # The box is inclusive while crop() takes an exclusive lower-right corner.
    return image.crop((left, top, right + 1, bottom + 1))
999
+
1000
+
1001
def _rows_belong_together(first: dict[str, int], second: dict[str, int]) -> bool:
    """Decide whether two component boxes sit on the same text line (rows only)."""
    first_height = first["max_y"] - first["min_y"] + 1
    second_height = second["max_y"] - second["min_y"] + 1
    shorter = min(first_height, second_height)
    # Number of rows shared by both boxes; <= 0 means they share none.
    shared_rows = min(first["max_y"], second["max_y"]) - max(first["min_y"], second["min_y"]) + 1
    if shared_rows > 0:
        return (shared_rows / max(1, shorter)) >= 0.3
    # No shared row (this includes directly adjacent boxes, where shared_rows
    # is exactly 0): accept when the vertical distance is small enough.
    distance = max(first["min_y"] - second["max_y"], second["min_y"] - first["max_y"])
    return distance <= max(4, int(shorter * 0.6))


def _components_bbox_on_line(
    gray: Image.Image,
    *,
    threshold: int,
    min_pixels: int = 40,
    max_edge_ratio: float = 0.98,
) -> tuple[int, int, int, int] | None:
    """Return the inclusive bounding box of the largest row-aligned ink cluster.

    Components from ``_component_bboxes`` are first restricted to those with
    ``edge_ratio`` below ``max_edge_ratio`` (all are kept if none qualifies).
    Each component then joins the first cluster it is row-aligned with: either
    they share at least 30% of the shorter box's rows, or they share no rows
    and are no further apart than ``max(4, 60% of the shorter height)``.
    Horizontal distance is not considered. The cluster with the highest pixel
    count wins; ``None`` is returned when there are no components.

    Boxes that share zero rows (for example one ending on the row directly
    above the other) are handled by the distance rule. Previously that case
    fell into the overlap rule with a ratio of 0 and could never merge, even
    though boxes several blank rows apart did.
    """
    components = _component_bboxes(gray, threshold=threshold, min_pixels=min_pixels)
    if not components:
        return None
    filtered = [item for item in components if item.get("edge_ratio", 0.0) < max_edge_ratio]
    if filtered:
        components = filtered

    clusters: list[dict[str, int]] = []
    for comp in components:
        for cluster in clusters:
            if not _rows_belong_together(cluster, comp):
                continue
            cluster["min_x"] = min(cluster["min_x"], comp["min_x"])
            cluster["min_y"] = min(cluster["min_y"], comp["min_y"])
            cluster["max_x"] = max(cluster["max_x"], comp["max_x"])
            cluster["max_y"] = max(cluster["max_y"], comp["max_y"])
            cluster["count"] += comp["count"]
            break
        else:
            # Copy so the dicts returned by _component_bboxes stay untouched.
            clusters.append(comp.copy())

    # components is non-empty here, so at least one cluster exists.
    best = max(clusters, key=lambda item: item["count"])
    return best["min_x"], best["min_y"], best["max_x"], best["max_y"]
1045
+
1046
+
1047
def _tighten_to_ink_components_on_line(
    image: Image.Image,
    *,
    remove_blue: bool,
    pad_px: int = 2,
) -> Image.Image:
    """Crop ``image`` to its largest row-aligned ink cluster plus ``pad_px`` padding.

    The ink threshold is the estimated white level minus 10, clamped to the
    range 200..245. With ``remove_blue`` the cluster is searched in a mask
    built by ``_build_ink_mask`` (at a fixed cut-off of 200); otherwise it is
    searched in the grayscale image directly. Clusters come from
    ``_components_bbox_on_line``. The original image is returned when no
    cluster is found or the padded box is degenerate.
    """
    gray = image.convert("L")
    threshold = min(245, max(200, _estimate_white_level(gray) - 10))
    if remove_blue:
        source = _build_ink_mask(image, threshold=threshold, remove_blue=True)
        cutoff = 200
    else:
        source = gray
        cutoff = threshold
    bbox = _components_bbox_on_line(source, threshold=cutoff, max_edge_ratio=0.98)
    if bbox is None:
        return image

    left, top, right, bottom = bbox
    width, height = image.size
    left = max(0, left - pad_px)
    top = max(0, top - pad_px)
    right = min(width - 1, right + pad_px)
    bottom = min(height - 1, bottom + pad_px)
    if right <= left or bottom <= top:
        return image
    # The box is inclusive while crop() takes an exclusive lower-right corner.
    return image.crop((left, top, right + 1, bottom + 1))
1072
+
1073
+
1074
def _ink_bbox(
    gray: Image.Image,
    *,
    threshold: int,
) -> tuple[int, int, int, int] | None:
    """Return the inclusive ``(min_x, min_y, max_x, max_y)`` box of all dark pixels.

    A pixel is dark when its value is below ``threshold``. ``None`` is
    returned when the image contains no dark pixel.
    """
    width, height = gray.size
    px = gray.load()
    inked_rows: list[int] = []
    column_extremes: set[int] = set()
    for y in range(height):
        hits = [x for x in range(width) if px[x, y] < threshold]
        if hits:
            inked_rows.append(y)
            # Only the outermost hits of a row can affect the horizontal extent.
            column_extremes.update((hits[0], hits[-1]))
    if not inked_rows:
        return None
    return min(column_extremes), inked_rows[0], max(column_extremes), inked_rows[-1]
1097
+
1098
+
1099
def _ocr_trim_signature_image_bytes(
    image_bytes: bytes,
    *,
    render_type: str | None,
    pad_px: int = 3,
) -> bytes | None:
    """Trim a signature crop using OCR boxes and detected rules as guides.

    Returns PNG bytes of the trimmed image, or ``None`` when pytesseract is
    unavailable, OCR yields no boxes, no usable region can be found, or the
    region is too small (under 8% of either dimension, with floors of 8 px
    wide and 6 px tall).

    For ``render_type`` "drawn"/"wet" (case-insensitive) the OCR boxes and
    rules are whited out of the colour image and ink components are taken
    from an ink mask. For any other render type they are masked out of the
    grayscale image instead. If that first pass finds nothing, a second pass
    masks only the boxes selected by ``typed_filter``, and the last resort is
    the union of the remaining OCR boxes.
    """
    if pytesseract is None or TesseractOutput is None:
        return None

    image = Image.open(io.BytesIO(image_bytes))
    gray = image.convert("L")
    boxes = _extract_ocr_boxes(gray)
    if not boxes:
        return None

    line_segments, row_density = _find_horizontal_rule_rows(gray)
    vertical_segments, _ = _find_vertical_rule_cols(gray)

    def mask_all(_: _OcrBox) -> bool:
        return True

    render = (render_type or "").lower()
    is_hand_drawn = render in {"drawn", "wet"}

    if is_hand_drawn:
        cleaned = _whiteout_regions_rgb(
            image,
            boxes=boxes,
            line_segments=line_segments,
            vertical_segments=vertical_segments,
        )
        ink_threshold = min(245, max(200, _estimate_white_level(cleaned.convert("L")) - 10))
        ink_mask = _build_ink_mask(cleaned, threshold=ink_threshold, remove_blue=True)
        bbox = _components_bbox(ink_mask, threshold=200, gap=12, max_edge_ratio=0.95)
    else:
        masked_strict = _mask_regions(
            gray,
            boxes=boxes,
            line_segments=line_segments,
            vertical_segments=vertical_segments,
            box_filter=mask_all,
        )
        ink_threshold = min(245, max(200, _estimate_white_level(masked_strict) - 10))
        bbox = _components_bbox(masked_strict, threshold=ink_threshold)

    if bbox is None:
        # Second pass: mask label text, and (when a signature line is known)
        # every box whose bottom is not within the band just above that line.
        line_segment = _select_signature_line(line_segments, row_density)
        max_above = max(40, int(gray.height * 0.25))

        def typed_filter(box: _OcrBox) -> bool:
            if _is_label_text(box.text):
                return True
            if line_segment is None:
                return False
            line_start = line_segment[0]
            return not ((line_start - max_above) <= box.bottom <= (line_start + 2))

        masked_typed = _mask_regions(
            gray,
            boxes=boxes,
            line_segments=line_segments,
            vertical_segments=vertical_segments,
            box_filter=typed_filter,
        )
        ink_threshold = min(245, max(200, _estimate_white_level(masked_typed) - 10))
        bbox = _components_bbox(masked_typed, threshold=ink_threshold)

        if bbox is None:
            # Last resort: bound the OCR boxes that survived the same filter.
            filtered = [box for box in boxes if not _is_label_text(box.text)]
            if line_segment is not None:
                line_start = line_segment[0]
                filtered = [
                    box
                    for box in filtered
                    if (line_start - max_above) <= box.bottom <= (line_start + 2)
                ]
            bbox = _bbox_from_boxes(filtered)
            if bbox is None:
                return None

    x0, y0, x1, y1 = bbox

    if is_hand_drawn:
        x0, y0, x1, y1 = _trim_bbox_by_ocr_boxes((x0, y0, x1, y1), boxes)
    width, height = gray.size
    x0 = max(0, x0 - pad_px)
    y0 = max(0, y0 - pad_px)
    x1 = min(width - 1, x1 + pad_px)
    y1 = min(height - 1, y1 + pad_px)

    if is_hand_drawn:
        # Trim again after padding, this time with an explicit min_gap of 2.
        x0, y0, x1, y1 = _trim_bbox_by_ocr_boxes((x0, y0, x1, y1), boxes, min_gap=2)

    if x1 <= x0 or y1 <= y0:
        return None
    if (x1 - x0) < max(8, int(width * 0.08)) or (y1 - y0) < max(6, int(height * 0.08)):
        return None

    cropped = image.crop((x0, y0, x1 + 1, y1 + 1))
    if is_hand_drawn:
        cropped = _trim_border_lines(cropped, density_ratio=0.55, edge_ratio=0.25)
        cropped = _tighten_to_ink_components_on_line(cropped, remove_blue=True, pad_px=1)
    else:
        cropped = _trim_border_lines(cropped, density_ratio=0.65, edge_ratio=0.2)
        cropped = _tighten_to_ink_components(cropped, remove_blue=False, pad_px=1)

    buffer = io.BytesIO()
    cropped.save(buffer, format="PNG")
    return buffer.getvalue()
1212
+
1213
+
1214
def _wet_ink_trim_signature_image_bytes(
    image_bytes: bytes,
    *,
    pad_px: int = 4,
) -> bytes | None:
    """Trim a wet-ink signature crop to its ink components.

    OCR boxes and detected horizontal/vertical rules are whited out of the
    colour image, an ink mask is built from the result, and the box reported
    by ``_components_bbox_on_line`` is padded by ``pad_px`` and cropped.

    Returns PNG bytes, or ``None`` when pytesseract is unavailable, no
    component box is found, or the padded box is degenerate or too small
    (under 8% of either dimension, with floors of 10 px wide and 6 px tall).
    """
    if pytesseract is None or TesseractOutput is None:
        return None

    source = Image.open(io.BytesIO(image_bytes))
    grayscale = source.convert("L")

    ocr_boxes = _extract_ocr_boxes(grayscale)
    horizontal_rules, _ = _find_horizontal_rule_rows(
        grayscale, density_ratio=0.18, max_thickness=6
    )
    vertical_rules, _ = _find_vertical_rule_cols(grayscale)
    cleaned = _whiteout_regions_rgb(
        source,
        boxes=ocr_boxes,
        line_segments=horizontal_rules,
        vertical_segments=vertical_rules,
    )
    ink_threshold = min(245, max(200, _estimate_white_level(cleaned.convert("L")) - 10))
    ink_mask = _build_ink_mask(cleaned, threshold=ink_threshold, remove_blue=True)
    bounds = _components_bbox_on_line(ink_mask, threshold=200, max_edge_ratio=0.98)
    if bounds is None:
        return None

    left, top, right, bottom = bounds
    img_w, img_h = grayscale.size
    left = max(0, left - pad_px)
    top = max(0, top - pad_px)
    right = min(img_w - 1, right + pad_px)
    bottom = min(img_h - 1, bottom + pad_px)
    if right <= left or bottom <= top:
        return None

    too_narrow = (right - left) < max(10, int(img_w * 0.08))
    too_short = (bottom - top) < max(6, int(img_h * 0.08))
    if too_narrow or too_short:
        return None

    trimmed = source.crop((left, top, right + 1, bottom + 1))
    trimmed = _trim_border_lines(trimmed, density_ratio=0.55, edge_ratio=0.25)
    trimmed = _tighten_to_ink_components_on_line(trimmed, remove_blue=True, pad_px=1)

    out = io.BytesIO()
    trimmed.save(out, format="PNG")
    return out.getvalue()
1257
+
1258
+
1259
def _select_best_trim(
    original_bytes: bytes,
    candidates: list[bytes],
) -> bytes:
    """Pick the candidate crop with the highest ink ratio.

    Candidates that ``_is_blank_crop`` rejects, or whose pixel area is below
    ``max(200, 1% of the original area)``, are ignored. A later candidate
    replaces the current winner when its ink ratio is strictly higher, or
    when the ratios are within 0.01 and it is smaller (ties on area go to the
    one with more dark pixels). Falls back to ``candidates[0]`` when nothing
    qualifies.
    """
    reference = Image.open(io.BytesIO(original_bytes))
    area_floor = max(200, int(reference.width * reference.height * 0.01))

    winner: bytes | None = None
    winner_ratio = -1.0
    winner_area = None
    winner_dark = -1
    for blob in candidates:
        if _is_blank_crop(blob):
            continue
        decoded = Image.open(io.BytesIO(blob))
        area = decoded.width * decoded.height
        if area < area_floor:
            continue

        dark, ratio = _ink_metrics(blob)
        if ratio > winner_ratio:
            replace = True
        elif winner_area is None:
            replace = False
        else:
            replace = abs(ratio - winner_ratio) <= 0.01 and (
                area < winner_area or (area == winner_area and dark > winner_dark)
            )
        if replace:
            winner, winner_ratio, winner_area, winner_dark = blob, ratio, area, dark

    return winner or candidates[0]
1294
+
1295
+
1296
def _select_best_trim_wet(
    original_bytes: bytes,
    candidates: list[bytes],
) -> bytes:
    """Pick the best crop for a wet-ink signature and pad it by 3 px.

    Candidates are skipped when blank, when smaller than
    ``max(200, 1% of the original area)``, or when their ink ratio is below
    0.0006. The highest ink ratio wins; a candidate within 0.02 of the
    winning ratio takes over if it is larger (the recorded winning ratio is
    left as it was in that case). When no candidate qualifies the decision is
    delegated to ``_select_best_trim``, whose result is returned unpadded.
    """
    reference = Image.open(io.BytesIO(original_bytes))
    area_floor = max(200, int(reference.width * reference.height * 0.01))
    min_ratio = 0.0006

    winner: bytes | None = None
    winner_area = None
    winner_ratio = None
    for blob in candidates:
        if _is_blank_crop(blob):
            continue
        decoded = Image.open(io.BytesIO(blob))
        area = decoded.width * decoded.height
        if area < area_floor:
            continue
        _, ratio = _ink_metrics(blob)
        if ratio < min_ratio:
            continue

        if winner_ratio is None or ratio > winner_ratio:
            winner, winner_ratio, winner_area = blob, ratio, area
        elif (
            winner_area is not None
            and abs(ratio - winner_ratio) <= 0.02
            and area > winner_area
        ):
            winner, winner_area = blob, area

    if winner is not None:
        return _pad_signature_image_bytes(winner, pad_px=3)
    return _select_best_trim(original_bytes, candidates)
1331
+
1332
+
1333
def _pad_signature_image_bytes(image_bytes: bytes, *, pad_px: int) -> bytes:
    """Surround the image with a white margin of ``pad_px`` pixels.

    The image is converted to RGB, pasted onto a white canvas and re-encoded
    as PNG. A non-positive ``pad_px`` returns ``image_bytes`` untouched,
    without decoding it.
    """
    if pad_px <= 0:
        return image_bytes

    rgb = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    inner_w, inner_h = rgb.size
    canvas_size = (inner_w + pad_px * 2, inner_h + pad_px * 2)
    canvas = Image.new("RGB", canvas_size, (255, 255, 255))
    canvas.paste(rgb, (pad_px, pad_px))

    out = io.BytesIO()
    canvas.save(out, format="PNG")
    return out.getvalue()
1343
+
1344
+
1345
def _trim_signature_image_bytes(
    image_bytes: bytes,
    *,
    render_type: str | None = None,
    pad_px: int = 4,
    gap_px: int = 4,
    min_density_ratio: float = 0.004,
) -> bytes:
    """Produce several trimmed versions of a crop and return the best one.

    The candidate list always starts with the untrimmed bytes and ends with
    the heuristic trim. In between come the wet-ink trim (only for
    ``render_type`` "wet", case-insensitive) and the OCR-guided trim,
    whenever those return a result. Wet signatures are ranked by
    ``_select_best_trim_wet``; everything else by ``_select_best_trim``.
    """
    is_wet = (render_type or "").lower() == "wet"

    candidates: list[bytes] = [image_bytes]
    if is_wet:
        wet_trimmed = _wet_ink_trim_signature_image_bytes(image_bytes)
        if wet_trimmed is not None:
            candidates.append(wet_trimmed)

    ocr_trimmed = _ocr_trim_signature_image_bytes(image_bytes, render_type=render_type)
    if ocr_trimmed is not None:
        candidates.append(ocr_trimmed)

    candidates.append(
        _heuristic_trim_signature_image_bytes(
            image_bytes,
            pad_px=pad_px,
            gap_px=gap_px,
            min_density_ratio=min_density_ratio,
        )
    )

    chooser = _select_best_trim_wet if is_wet else _select_best_trim
    return chooser(image_bytes, candidates)
1373
+
1374
+
1375
def _heuristic_trim_signature_image_bytes(
    image_bytes: bytes,
    *,
    pad_px: int = 4,
    gap_px: int = 4,
    min_density_ratio: float = 0.004,
) -> bytes:
    """Trim a crop to its densest horizontal band of ink, without OCR.

    Steps:

    1. Estimate the white level as the 99.5th percentile of the grayscale
       histogram; give up (return the input) when it is below 200.
    2. For a handful of thresholds just under the white level, count dark
       pixels per row and split the rows into bands, merging bands separated
       by at most ``gap_px`` rows. When a horizontal rule is detected, only
       rows above it are scanned.
    3. Score every band by ``dark_pixels * height ** 1.3`` (boosted by up to
       50% for bands close to the rule) and keep the best band across all
       thresholds. Bands shorter than ``max(4, 2% of height)`` are only used
       when no taller band exists.
    4. Grow the band vertically over faint ink and a limited run of white
       rows above it, recompute its horizontal extent, pad by ``pad_px`` and
       crop. The crop is then passed through ``_trim_below_horizontal_rule``
       and ``_tighten_to_ink_band``.

    Args:
        image_bytes: Encoded image readable by PIL.
        pad_px: Margin added around the chosen band before cropping.
        gap_px: Maximum number of rows between two bands for them to merge.
        min_density_ratio: Fraction of the width a row's dark-pixel count
            must reach (minimum 2 pixels) to count as an ink row.

    Returns:
        PNG bytes of the trimmed image, or ``image_bytes`` unchanged when no
        band is found or the result would be degenerate or too small.
    """
    image = Image.open(io.BytesIO(image_bytes))
    gray = image.convert("L")
    width, height = gray.size

    # White level: the gray value below which 99.5% of the pixels fall.
    histogram = gray.histogram()
    total_pixels = width * height
    cutoff = int(total_pixels * 0.995)
    cumulative = 0
    white_level = 255
    for idx, count in enumerate(histogram):
        cumulative += count
        if cumulative >= cutoff:
            white_level = idx
            break

    if white_level < 200:
        return image_bytes

    # Clamping makes several deltas collapse onto the same threshold (e.g. a
    # white level of 255 yields 254 twice). Identical thresholds produce
    # identical bands and scores, so scanning them again cannot change the
    # outcome; de-duplicate while keeping the order, so thresholds[-1] is
    # still the loosest threshold.
    thresholds = list(
        dict.fromkeys(min(254, max(200, white_level - delta)) for delta in (6, 4, 2, 1, 0))
    )
    min_density = max(2, int(width * min_density_ratio))
    pixels = gray.load()

    row_densities: dict[int, list[int]] = {}
    for threshold in thresholds:
        row_densities[threshold] = [
            sum(1 for x in range(width) if pixels[x, y] < threshold)
            for y in range(height)
        ]

    line_bounds = _detect_horizontal_rule_cutoff(row_densities[thresholds[-1]], width)
    scan_limit = None
    descender_limit = height - 1
    if line_bounds is not None:
        line_start, line_end = line_bounds
        # Bands are searched strictly above the rule; the later downward
        # growth may pass it by a small allowance.
        scan_limit = max(0, line_start - 1)
        descender_limit = min(height - 1, line_end + max(2, int(height * 0.02)))

    min_band_height = max(4, int(height * 0.02))
    best = None
    best_small = None
    best_small_threshold = None
    best_threshold = None
    line_threshold = int(width * 0.6)
    for threshold in thresholds:
        row_density = row_densities[threshold]

        # Runs of consecutive ink rows.
        segments: list[tuple[int, int]] = []
        start: int | None = None
        for y, dark in enumerate(row_density):
            if scan_limit is not None and y > scan_limit:
                if start is not None:
                    segments.append((start, y - 1))
                    start = None
                break
            if dark >= min_density:
                if start is None:
                    start = y
            elif start is not None:
                segments.append((start, y - 1))
                start = None
        if start is not None:
            segments.append((start, height - 1))

        if not segments:
            continue

        # Merge runs separated by at most gap_px rows.
        merged: list[list[int]] = []
        for seg in segments:
            if merged and seg[0] - merged[-1][1] <= gap_px:
                merged[-1][1] = seg[1]
            else:
                merged.append([seg[0], seg[1]])

        candidates = []
        for y0, y1 in merged:
            min_x, max_x = width, -1
            total_dark = 0
            for y in range(y0, y1 + 1):
                for x in range(width):
                    if pixels[x, y] < threshold:
                        total_dark += 1
                        if x < min_x:
                            min_x = x
                        if x > max_x:
                            max_x = x
            if max_x < 0:
                continue
            band_height = y1 - y0 + 1
            band_width = max_x - min_x + 1
            score = total_dark * (band_height**1.3)
            if line_bounds is not None:
                distance = max(0, line_bounds[0] - y1)
                proximity = 1.0 / (1.0 + (distance / 20.0))
                score *= 1.0 + 0.5 * proximity
            candidates.append(
                {
                    "y0": y0,
                    "y1": y1,
                    "min_x": min_x,
                    "max_x": max_x,
                    "total": total_dark,
                    "height": band_height,
                    "width": band_width,
                    "score": score,
                }
            )

        if not candidates:
            continue

        # max() returns the first of several equal maxima, which is the same
        # element a stable descending sort would place first.
        top_candidate = max(candidates, key=lambda item: item["score"])
        if top_candidate["height"] >= min_band_height:
            if best is None or top_candidate["score"] > best["score"]:
                best = top_candidate
                best_threshold = threshold
        elif best_small is None or top_candidate["score"] > best_small["score"]:
            best_small = top_candidate
            best_small_threshold = threshold

    if best is None:
        best = best_small
        best_threshold = best_small_threshold

    if best is None:
        return image_bytes

    expansion_density = row_densities.get(best_threshold, row_densities[thresholds[-1]])
    expand_threshold = max(1, int(min_density * 0.4))
    y0 = best["y0"]
    y1 = best["y1"]

    # Grow over rows that carry faint ink (below min_density but not empty).
    while y0 > 0 and expansion_density[y0 - 1] >= expand_threshold:
        y0 -= 1
    while y1 < descender_limit and expansion_density[y1 + 1] >= expand_threshold:
        y1 += 1

    # Then take a bounded run of white rows above, stopping at the next ink.
    max_white_pad = max(8, int(height * 0.04))
    while y0 > 0 and (best["y0"] - y0) < max_white_pad:
        if expansion_density[y0 - 1] >= expand_threshold:
            break
        y0 -= 1

    # Horizontal extent at the loosest threshold, ignoring rule rows so the
    # rule itself does not stretch the crop to full width.
    min_x, max_x = width, -1
    skip_line_rows = line_bounds is not None
    for y in range(y0, y1 + 1):
        if skip_line_rows and expansion_density[y] >= line_threshold:
            continue
        for x in range(width):
            if pixels[x, y] < thresholds[-1]:
                if x < min_x:
                    min_x = x
                if x > max_x:
                    max_x = x
    if max_x >= 0:
        best = {
            "y0": y0,
            "y1": y1,
            "min_x": min_x,
            "max_x": max_x,
        }

    x0 = max(0, best["min_x"] - pad_px)
    x1 = min(width - 1, best["max_x"] + pad_px)
    y0 = max(0, best["y0"] - pad_px)
    y1 = min(height - 1, best["y1"] + pad_px)

    if x1 <= x0 or y1 <= y0:
        return image_bytes
    if (x1 - x0) < max(10, int(width * 0.2)) or (y1 - y0) < max(6, int(height * 0.08)):
        return image_bytes

    cropped = image.crop((x0, y0, x1 + 1, y1 + 1))
    cropped = _trim_below_horizontal_rule(cropped)
    cropped = _tighten_to_ink_band(cropped)
    buffer = io.BytesIO()
    cropped.save(buffer, format="PNG")
    return buffer.getvalue()
1565
+
1566
+
1567
def _trim_below_horizontal_rule(
    image: Image.Image,
    *,
    threshold: int = 240,
) -> Image.Image:
    """Cut away content that lies below a detected horizontal rule.

    Rows are summarised by their count of pixels darker than ``threshold``.
    The crop happens only when a rule is detected, there are at least 40 dark
    pixels above it, and the dark pixels below it exceed
    ``max(40, 20% of those above)``. A strip of ``max(2, 1% of height)`` rows
    under the rule is kept. In every other case ``image`` is returned as is.
    """
    gray = image.convert("L")
    width, height = gray.size
    if width == 0 or height == 0:
        return image

    pixels = gray.load()
    dark_per_row = [
        sum(1 for x in range(width) if pixels[x, y] < threshold)
        for y in range(height)
    ]

    rule = _detect_horizontal_rule_cutoff(dark_per_row, width)
    if rule is None:
        return image

    rule_top, rule_bottom = rule
    ink_above = sum(dark_per_row[:rule_top])
    ink_below = sum(dark_per_row[rule_bottom + 1 :])
    if ink_above < 40 or ink_below <= max(40, int(ink_above * 0.2)):
        return image

    new_bottom = min(height - 1, rule_bottom + max(2, int(height * 0.01)))
    if new_bottom <= 0 or new_bottom >= height - 1:
        return image
    return image.crop((0, 0, width, new_bottom + 1))
1599
+
1600
+
1601
def _tighten_to_ink_band(
    image: Image.Image,
    *,
    threshold: int = 240,
    pad_px: int = 2,
    min_density_ratio: float = 0.004,
) -> Image.Image:
    """Crop ``image`` to the most relevant horizontal band of ink.

    Rows whose dark-pixel count reaches 60% of the width are treated as rule
    rows and never belong to a band. The remaining rows form bands wherever
    the count is at least ``max(2, width * min_density_ratio)``.

    When rules exist, bands of at least two rows lying above a rule (within
    ``min(35% of height, 140)`` rows) are preferred, weighted by ink amount
    and closeness to the rule; bands below a rule are considered only if none
    above qualifies. Otherwise the band with the most ink wins. The chosen
    band is padded by ``pad_px`` and cropped. ``image`` is returned unchanged
    whenever nothing usable is found.
    """
    gray = image.convert("L")
    width, height = gray.size
    if width == 0 or height == 0:
        return image

    pixels = gray.load()
    dark_per_row = [
        sum(1 for x in range(width) if pixels[x, y] < threshold)
        for y in range(height)
    ]

    rule_cutoff = int(width * 0.6)
    rule_rows = {y for y, count in enumerate(dark_per_row) if count >= rule_cutoff}
    if not rule_rows and max(dark_per_row, default=0) == 0:
        return image

    # Split the non-rule rows into bands of consecutive ink rows.
    ink_cutoff = max(2, int(width * min_density_ratio))
    bands: list[tuple[int, int]] = []
    band_start: int | None = None
    for y, count in enumerate(dark_per_row):
        if y not in rule_rows and count >= ink_cutoff:
            if band_start is None:
                band_start = y
        elif band_start is not None:
            bands.append((band_start, y - 1))
            band_start = None
    if band_start is not None:
        bands.append((band_start, height - 1))
    if not bands:
        return image

    # Collapse adjacent rule rows into (first_row, last_row) groups.
    rule_groups: list[tuple[int, int]] = []
    for row in sorted(rule_rows):
        if rule_groups and row - rule_groups[-1][1] <= 1:
            rule_groups[-1] = (rule_groups[-1][0], row)
        else:
            rule_groups.append((row, row))

    max_gap = min(int(height * 0.35), 140)

    def band_ink(top: int, bottom: int) -> int:
        return sum(dark_per_row[top : bottom + 1])

    def best_band_beside_rules(above: bool) -> tuple[int, int] | None:
        winner: tuple[int, int] | None = None
        winner_score = None
        for rule_top, rule_bottom in rule_groups:
            for top, bottom in bands:
                if above:
                    gap = rule_top - bottom
                    if bottom >= rule_top or gap > max_gap:
                        continue
                else:
                    gap = top - rule_bottom
                    if top <= rule_bottom or gap > max_gap:
                        continue
                if bottom - top + 1 < 2:
                    continue
                proximity = 1.0 / (1.0 + (gap / 25.0))
                weighted = band_ink(top, bottom) * (1.0 + 0.5 * proximity)
                if winner_score is None or weighted > winner_score:
                    winner_score = weighted
                    winner = (top, bottom)
        return winner

    chosen: tuple[int, int] | None = None
    if rule_groups:
        chosen = best_band_beside_rules(True)
        if chosen is None:
            chosen = best_band_beside_rules(False)
    if chosen is None:
        # No rule-relative pick: take the band with the most ink (first wins
        # on ties).
        chosen = max(bands, key=lambda band: band_ink(*band))

    top, bottom = chosen
    ink_columns = [
        x
        for y in range(top, bottom + 1)
        if y not in rule_rows
        for x in range(width)
        if pixels[x, y] < threshold
    ]
    if not ink_columns:
        return image

    x0 = max(0, min(ink_columns) - pad_px)
    x1 = min(width - 1, max(ink_columns) + pad_px)
    y0 = max(0, top - pad_px)
    y1 = min(height - 1, bottom + pad_px)
    if x1 <= x0 or y1 <= y0:
        return image
    return image.crop((x0, y0, x1 + 1, y1 + 1))
1727
+
1728
+
1729
def _detect_horizontal_rule_cutoff(
    row_density: list[int],
    width: int,
) -> tuple[int, int] | None:
    """Locate a thin horizontal rule from per-row dark-pixel counts.

    A rule is a run of at most 4 consecutive rows whose count reaches 60% of
    ``width``. Runs are examined top to bottom and the first acceptable one
    is returned as inclusive ``(first_row, last_row)``. A run is acceptable
    when at least 40 dark pixels lie above it and any of these holds:

    * the ink above is at least ``max(40, 2% of all ink)`` and the run's
      midpoint is at or below 20% of the height;
    * the midpoint is at or below 35% of the height;
    * the ink above is at least ``max(40, 30% of the ink below)``.

    Returns ``None`` when there is no such run.
    """
    if not row_density:
        return None

    rule_cutoff = int(width * 0.6)
    row_count = len(row_density)

    # Collect runs of consecutive rows dense enough to be part of a rule.
    runs: list[tuple[int, int]] = []
    row = 0
    while row < row_count:
        if row_density[row] < rule_cutoff:
            row += 1
            continue
        run_top = row
        while row < row_count and row_density[row] >= rule_cutoff:
            row += 1
        runs.append((run_top, row - 1))
    if not runs:
        return None

    ink_total = sum(row_density)
    if ink_total <= 0:
        return None

    ink_floor = max(40, int(ink_total * 0.02))
    for top, bottom in runs:
        if bottom - top + 1 > 4:
            # Too thick to be a rule.
            continue
        ink_above = sum(row_density[:top])
        if ink_above < 40:
            continue
        ink_below = sum(row_density[bottom + 1 :])
        relative_mid = ((top + bottom) / 2.0) / max(1, row_count)
        if (
            (ink_above >= ink_floor and relative_mid >= 0.2)
            or relative_mid >= 0.35
            or ink_above >= max(40, int(ink_below * 0.3))
        ):
            return (top, bottom)
    return None
1774
+
1775
+
224
1776
  def _to_clip_rect(page, bbox: tuple[float, float, float, float]):
225
1777
  width = float(page.rect.width)
226
1778
  height = float(page.rect.height)
@@ -237,6 +1789,31 @@ def _to_clip_rect(page, bbox: tuple[float, float, float, float]):
237
1789
  return fitz.Rect(left, top, right, bottom)
238
1790
 
239
1791
 
1792
def _expand_wet_bbox(page, bbox: tuple[float, float, float, float]) -> tuple[float, float, float, float] | None:
    """Enlarge a wet-signature box before cropping.

    The box is converted to a clip rectangle, widened to the left by
    ``max(6, 15% of its width)``, extended to the right page edge, and grown
    vertically by ``max(12, 80% of its height)``, all clamped to the page.
    The result is converted back with ``_rect_to_pdf_tuple``.

    Returns ``None`` when the clip rectangle is missing or empty, when the
    box already spans 45% or more of the page width, or when the expanded
    rectangle is degenerate.
    """
    clip = _to_clip_rect(page, bbox)
    if clip is None:
        return None

    box = fitz.Rect(clip)
    box_w = box.width
    box_h = box.height
    if box_w <= 0 or box_h <= 0:
        return None

    bounds = page.rect
    if box_w >= bounds.width * 0.45:
        return None

    left = max(bounds.x0, box.x0 - max(6.0, box_w * 0.15))
    # NOTE(review): after the 0.45 guard above, box_w is always below 70% of
    # the page width, so the else arm appears unreachable.
    if box_w < bounds.width * 0.7:
        right = bounds.x1
    else:
        right = box.x1

    vertical_pad = max(12.0, box_h * 0.8)
    top = max(bounds.y0, box.y0 - vertical_pad)
    bottom = min(bounds.y1, box.y1 + vertical_pad)
    if right <= left or bottom <= top:
        return None
    return _rect_to_pdf_tuple(fitz.Rect(left, top, right, bottom), bounds.height)
1815
+
1816
+
240
1817
def _clamp(value: float, lower: float, upper: float) -> float:
    """Limit ``value`` to the range ``lower``..``upper``.

    The upper bound is applied first and the lower bound second, so ``lower``
    wins if the bounds are crossed.
    """
    capped = upper if upper < value else value
    return capped if capped > lower else lower
242
1819
 
@@ -251,3 +1828,787 @@ def _slugify(value: str) -> str:
251
1828
  cleaned = re.sub(r"[^A-Za-z0-9_-]+", "_", value.strip().lower())
252
1829
  cleaned = cleaned.strip("_")
253
1830
  return cleaned or "signature"
1831
+
1832
+
1833
def _update_signature_pages(file_result: FileResult) -> None:
    """Rebuild ``file_result.SignaturePages`` from its signatures.

    The field becomes a comma-separated list of the distinct, truthy ``Page``
    values in ascending order; it is an empty string when there are none.
    """
    distinct_pages = set()
    for sig in file_result.Signatures:
        if sig.Page:
            distinct_pages.add(sig.Page)
    file_result.SignaturePages = ",".join(map(str, sorted(distinct_pages)))
1836
+
1837
+
1838
def _bbox_has_area(bbox: tuple[float, float, float, float] | None) -> bool:
    """Tell whether ``bbox`` is a four-element box with positive width and height.

    Missing, empty, wrongly sized and all-zero boxes are rejected.
    """
    if not bbox or len(bbox) != 4:
        return False
    x0, y0, x1, y1 = bbox
    if x0 == 0 and y0 == 0 and x1 == 0 and y1 == 0:
        return False
    return (x1 - x0) > 0 and (y1 - y0) > 0
1843
+
1844
+
1845
+ def _is_pseudo_signature(signature: Signature) -> bool:
1846
+ if _has_image_evidence(signature):
1847
+ return False
1848
+ if signature.FieldName == "vendor_or_acro_detected":
1849
+ return True
1850
+ hint = (signature.Hint or "").lower()
1851
+ if "vendororacronly" in hint:
1852
+ return True
1853
+ return any("pseudo:true" == token for token in signature.Evidence)
1854
+
1855
+
1856
+ def _has_image_evidence(signature: Signature) -> bool:
1857
+ return bool(signature.Evidence and any("image:retainer" == token for token in signature.Evidence))
1858
+
1859
+
1860
def _collect_image_rects(page) -> list[object]:
    """Gather the rectangles of every image placed on ``page``.

    Best effort: if listing the images fails an empty list is returned, and
    an image whose rectangles cannot be read is skipped.
    """
    collected: list[object] = []
    try:
        image_entries = page.get_images(full=True)
    except Exception:
        return collected

    for entry in image_entries:
        # The first item of each entry is passed on as the image's xref.
        xref = entry[0]
        try:
            collected.extend(page.get_image_rects(xref))
        except Exception:
            continue
    return collected
1873
+
1874
+
1875
def _filter_image_rects(rects: list[object], page_rect) -> list[object]:
    """Keep image rectangles that have positive area and are not page-sized.

    Every survivor is returned as a ``fitz.Rect``. Rectangles covering more
    than 35% of the page area are dropped.
    """
    if not rects:
        return []

    area_ceiling = page_rect.width * page_rect.height * 0.35
    kept: list[object] = []
    for raw in rects:
        candidate = fitz.Rect(raw)
        area = candidate.get_area()
        if 0 < area <= area_ceiling:
            kept.append(candidate)
    return kept
1889
+
1890
+
1891
def _select_image_bbox(
    image_rects: list[object],
    signature: Signature,
    role_rects: dict[str, list[object]],
    label_rects: list[object],
    page_rect,
    *,
    signature_index: int,
    signature_count: int,
) -> tuple[float, float, float, float] | None:
    """Choose the page image that most plausibly is ``signature``.

    Anchor rectangles depend on the signature's role: "firm"/"attorney" use
    the firm role labels, "client"/"patient"/"representative" use the client
    role labels plus the generic labels, and anything else uses the generic
    labels alone. With anchors, the image nearest to them is taken, provided
    its score does not exceed ``min(50% of page height, 260)``. Without
    anchors the image is picked by signature order.

    Returns the box produced by ``_rect_to_pdf_tuple``, or ``None`` when no
    image qualifies or the box has no area.
    """
    if not image_rects:
        return None
    usable_rects = _filter_image_rects(image_rects, page_rect)
    if not usable_rects:
        return None

    role = (signature.Role or "").lower()
    if role in {"firm", "attorney"}:
        anchors = role_rects.get("firm", [])
    elif role in {"client", "patient", "representative"}:
        anchors = role_rects.get("client", []) + label_rects
    else:
        anchors = label_rects

    if anchors:
        chosen, score = _select_rect_near_labels_with_score(usable_rects, anchors)
        distance_cap = min(page_rect.height * 0.5, 260.0)
        if chosen is None or score is None or score > distance_cap:
            return None
    else:
        chosen = _select_rect_by_order(
            usable_rects, signature, signature_index, signature_count
        )
        if chosen is None:
            return None

    bbox = _rect_to_pdf_tuple(chosen, page_rect.height)
    return bbox if _bbox_has_area(bbox) else None
1930
+
1931
+
1932
def _refine_bbox_with_image_rects(
    page,
    bbox: tuple[float, float, float, float],
    *,
    image_rects: list[object],
    min_overlap: float = 0.6,
    min_center_overlap: float = 0.2,
    min_area: float = 20.0,
) -> tuple[float, float, float, float] | None:
    """Snap ``bbox`` to the image rectangle that best matches it.

    An image rectangle qualifies when its area exceeds ``min_area`` and
    either at least ``min_overlap`` of it lies inside the clip rectangle, or
    its centre lies inside the clip and at least ``min_center_overlap`` of it
    does. The rectangle with the largest overlap fraction wins; one that is
    not better but within 0.05 of the best replaces it if its centre is
    closer to the clip centre.

    Returns the winner converted with ``_rect_to_pdf_tuple``, or ``None``
    when the clip rectangle is missing or nothing qualifies.
    """
    clip = _to_clip_rect(page, bbox)
    if clip is None:
        return None

    clip_cx = (clip.x0 + clip.x1) / 2.0
    clip_cy = (clip.y0 + clip.y1) / 2.0

    winner = None
    winner_overlap = None
    winner_distance = None
    for rect in image_rects:
        try:
            shared = rect & clip
        except Exception:
            continue
        if shared is None or shared.get_area() <= 0:
            continue
        rect_area = rect.get_area()
        if rect_area <= min_area:
            continue

        overlap = shared.get_area() / max(1.0, rect_area)
        cx = (rect.x0 + rect.x1) / 2.0
        cy = (rect.y0 + rect.y1) / 2.0
        center_inside = (clip.x0 <= cx <= clip.x1) and (clip.y0 <= cy <= clip.y1)
        if overlap < min_overlap and not (center_inside and overlap >= min_center_overlap):
            continue

        distance = (cx - clip_cx) ** 2 + (cy - clip_cy) ** 2
        if winner_overlap is None or overlap > winner_overlap:
            winner, winner_overlap, winner_distance = rect, overlap, distance
        elif (
            abs(overlap - winner_overlap) <= 0.05
            and winner_distance is not None
            and distance < winner_distance
        ):
            # Near-tie on overlap: prefer the closer centre, but keep the
            # recorded best overlap as the comparison baseline.
            winner, winner_distance = rect, distance

    if winner is None:
        return None
    return _rect_to_pdf_tuple(winner, page.rect.height)
1982
+
1983
+
1984
def _ink_metrics(image_bytes: bytes, *, threshold: int = 240) -> tuple[int, float]:
    """Measure how much of an image is darker than ``threshold``.

    Returns ``(dark_pixel_count, dark_fraction)`` computed from the grayscale
    histogram; the fraction is 0.0 for an image with no pixels.
    """
    levels = Image.open(io.BytesIO(image_bytes)).convert("L").histogram()
    pixel_count = sum(levels)
    dark = sum(levels[:threshold])
    if pixel_count:
        return dark, dark / pixel_count
    return dark, 0.0
1992
+
1993
+
1994
def _is_blank_crop(
    image_bytes: bytes,
    *,
    min_pixels: int = 40,
    min_ratio: float = 0.0005,
) -> bool:
    """Tell whether a crop holds essentially no ink.

    A crop is blank only when both its dark-pixel count is below
    ``min_pixels`` and its dark fraction is below ``min_ratio``.
    """
    dark, ratio = _ink_metrics(image_bytes)
    has_enough_ink = dark >= min_pixels or ratio >= min_ratio
    return not has_enough_ink
2002
+
2003
+
2004
def _resolve_signature_bbox(
    page,
    signature: Signature,
    *,
    page_cache: dict[int, dict[str, object]],
    signature_count: int,
    signature_index: int,
    page_number: int,
) -> tuple[float, float, float, float] | None:
    """Find a bounding box for ``signature`` on ``page``.

    Sources are tried in a fixed order and the first one producing a box with
    area wins:

    1. a placed image (only for render types "typed" and "drawn");
    2. a signature line;
    3. widgets: by field name, then signature widgets, then signature-like
       widgets, then a lone widget of any kind;
    4. annotations: by field name, then signature-like annotations (or a lone
       annotation);
    5. a region expanded from a signature label;
    6. a region expanded around the field name's text on the page.

    Widget, annotation and field-name lookups are stored in
    ``page_cache[page_number]`` so they are computed once per page.

    NOTE(review): the listing this was reconstructed from had lost its
    indentation; the signature-line step is assumed to apply to every render
    type because its inputs are fetched unconditionally. Confirm.
    """
    cache = page_cache.setdefault(page_number, {})
    name = (signature.FieldName or "").strip()

    label_rects = _get_label_rects(page_cache, page_number, page)
    role_rects = _get_role_label_rects(page_cache, page_number, page)
    line_rects = _get_line_rects(page_cache, page_number, page)

    def usable_bbox(rect):
        """Convert ``rect`` to a PDF tuple; ``None`` if it has no area."""
        bbox = _rect_to_pdf_tuple(rect, page.rect.height)
        return bbox if _bbox_has_area(bbox) else None

    def pick(candidates):
        return _select_rect_candidate(
            candidates,
            signature,
            signature_index,
            signature_count,
            label_rects,
        )

    # 1. Placed images.
    if signature.RenderType in {"typed", "drawn"}:
        image_rects = _get_image_rects(page_cache, page_number, page)
        image_bbox = _select_image_bbox(
            image_rects,
            signature,
            role_rects,
            label_rects,
            page.rect,
            signature_index=signature_index,
            signature_count=signature_count,
        )
        if image_bbox is not None:
            return image_bbox

    # 2. Signature lines.
    line_bbox = _select_line_bbox(
        line_rects,
        signature,
        role_rects,
        label_rects,
        page.rect,
    )
    if line_bbox is not None:
        return line_bbox

    # 3. Widgets.
    widget_cache = cache.get("widget_cache")
    if widget_cache is None:
        by_name, sig_rects, all_rects = _collect_widget_rects(page)
        widget_cache = {
            "map": by_name,
            "sig_rects": sig_rects,
            "all_rects": all_rects,
        }
        cache["widget_cache"] = widget_cache
    widget_map = widget_cache["map"]
    widget_sig_rects = widget_cache["sig_rects"]
    widget_rects = widget_cache["all_rects"]

    if name:
        rect = widget_map.get(name)
        if rect is not None:
            bbox = usable_bbox(rect)
            if bbox is not None:
                return bbox

    rect = pick(widget_sig_rects)
    if rect is not None:
        bbox = usable_bbox(rect)
        if bbox is not None:
            return bbox

    widget_candidates = _filter_signature_like_rects(widget_rects, page.rect)
    rect = pick(widget_candidates)
    if rect is not None:
        bbox = usable_bbox(rect)
        if bbox is not None:
            return bbox
    if not widget_candidates and len(widget_rects) == 1:
        bbox = usable_bbox(widget_rects[0])
        if bbox is not None:
            return bbox

    # 4. Annotations.
    annot_cache = cache.get("annot_cache")
    if annot_cache is None:
        annots_by_name, annots_all = _collect_annot_rects(page)
        annot_cache = {"map": annots_by_name, "rects": annots_all}
        cache["annot_cache"] = annot_cache
    annot_map = annot_cache["map"]
    annot_rects = annot_cache["rects"]

    if name:
        rect = annot_map.get(name)
        if rect is not None:
            bbox = usable_bbox(rect)
            if bbox is not None:
                return bbox

    annot_candidates = _filter_signature_like_rects(annot_rects, page.rect)
    if not annot_candidates and len(annot_rects) == 1:
        annot_candidates = [fitz.Rect(annot_rects[0])]
    rect = pick(annot_candidates)
    if rect is not None:
        bbox = usable_bbox(rect)
        if bbox is not None:
            return bbox

    # 5. Region expanded from a signature label.
    if label_rects:
        target = _select_rect_by_order(label_rects, signature, signature_index, signature_count)
        if target is not None:
            expanded = _expand_rect_from_label(target, page.rect)
            if expanded is not None:
                bbox = usable_bbox(expanded)
                if bbox is not None:
                    return bbox

    # 6. Region expanded around the field name's text.
    if name:
        fieldname_cache = cache.get("fieldname_cache")
        if fieldname_cache is None:
            fieldname_cache = {}
            cache["fieldname_cache"] = fieldname_cache
        rects = fieldname_cache.get(name)
        if rects is None:
            rects = _find_fieldname_text_rects(page, name)
            fieldname_cache[name] = rects
        if rects:
            target = _select_rect_by_order(rects, signature, signature_index, signature_count)
            if target is not None:
                expanded = _expand_rect_around_text(target, page.rect)
                if expanded is not None:
                    bbox = usable_bbox(expanded)
                    if bbox is not None:
                        return bbox

    return None
2150
+
2151
+
2152
def _resolve_bbox_across_document(
    document,
    signature: Signature,
    *,
    page_cache: dict[int, dict[str, object]],
    document_cache: dict[str, object],
    signature_index: int,
    signature_count: int,
    skip_page: int | None,
) -> tuple[int | None, tuple[float, float, float, float] | None]:
    """Try every page of ``document`` until one yields a bbox for ``signature``.

    Pages are addressed by 1-based page number. Pages for which
    ``_get_label_rects`` returned at least one rect are tried first; the
    remaining pages follow in ascending order. The list of label pages is
    computed once and stored under ``document_cache["label_pages"]``.

    Args:
        document: Object exposing ``page_count`` and ``load_page(index)``.
        signature: Signature forwarded to ``_resolve_signature_bbox``.
        page_cache: Per-page cache shared with the per-page resolvers.
        document_cache: Document-level cache; updated with ``"label_pages"``.
        signature_index: Forwarded to ``_resolve_signature_bbox``.
        signature_count: Forwarded to ``_resolve_signature_bbox``.
        skip_page: 1-based page number to leave out of the search, or ``None``.

    Returns:
        ``(page_number, bbox)`` for the first page that resolves, otherwise
        ``(None, None)``.
    """
    label_pages = document_cache.get("label_pages")
    if label_pages is None:
        label_pages = []
        for page_index in range(document.page_count):
            page_number = page_index + 1
            page = document.load_page(page_index)
            if _get_label_rects(page_cache, page_number, page):
                label_pages.append(page_number)
        document_cache["label_pages"] = label_pages

    # Label pages go first; a set keeps the "already queued" check O(1) per
    # page instead of rescanning the growing list.
    already_queued = set(label_pages)
    page_order = list(label_pages)
    page_order.extend(
        page_number
        for page_number in range(1, document.page_count + 1)
        if page_number not in already_queued
    )

    for page_number in page_order:
        if skip_page is not None and page_number == skip_page:
            continue
        page = document.load_page(page_number - 1)
        resolved = _resolve_signature_bbox(
            page,
            signature,
            page_cache=page_cache,
            signature_count=signature_count,
            signature_index=signature_index,
            page_number=page_number,
        )
        if resolved is not None:
            return page_number, resolved
    return None, None
2193
+
2194
+
2195
# Matches names containing a word that starts with "sign" (case-insensitive).
_SIGNATURE_NAME_PATTERN = re.compile(r"\bsign", re.IGNORECASE)

# Case-insensitive patterns that mark a piece of text as a signature label.
_SIGNATURE_LABEL_PATTERNS = tuple(
    re.compile(expression, re.IGNORECASE)
    for expression in (
        r"\bsignature\b",
        r"\bsign here\b",
        r"\bsigned by\b",
        r"/s/",
    )
)

# Case-insensitive keyword patterns grouped by signer role.
_ROLE_LABEL_PATTERNS = {
    role: tuple(re.compile(expression, re.IGNORECASE) for expression in expressions)
    for role, expressions in (
        (
            "client",
            (
                r"\bclient\b",
                r"\bpatient\b",
                r"\bplaintiff\b",
            ),
        ),
        (
            "firm",
            (
                r"\bfirm\b",
                r"\battorney\b",
                r"\bcounsel\b",
                r"\blaw\b",
                r"\blegal\b",
                r"\bllc\b",
                r"\bllp\b",
                r"\bgroup\b",
            ),
        ),
    )
}
2220
+
2221
+
2222
def _get_label_rects(
    page_cache: dict[int, dict[str, object]],
    page_number: int,
    page,
) -> list[object]:
    """Return the signature-label rects for ``page``, memoised in ``page_cache``.

    The result of ``_collect_signature_labels(page)`` is stored under
    ``page_cache[page_number]["label_rects"]`` and reused on later calls.
    """
    per_page = page_cache.setdefault(page_number, {})
    cached = per_page.get("label_rects")
    if cached is not None:
        return cached
    computed = _collect_signature_labels(page)
    per_page["label_rects"] = computed
    return computed
2233
+
2234
+
2235
def _get_role_label_rects(
    page_cache: dict[int, dict[str, object]],
    page_number: int,
    page,
) -> dict[str, list[object]]:
    """Return the per-role text rects for ``page``, memoised in ``page_cache``.

    The result of ``_collect_role_text_rects(page)`` is stored under
    ``page_cache[page_number]["role_rects"]`` and reused on later calls.
    """
    per_page = page_cache.setdefault(page_number, {})
    cached = per_page.get("role_rects")
    if cached is not None:
        return cached
    computed = _collect_role_text_rects(page)
    per_page["role_rects"] = computed
    return computed
2246
+
2247
+
2248
def _get_line_rects(
    page_cache: dict[int, dict[str, object]],
    page_number: int,
    page,
) -> list[object]:
    """Return the underscore-line rects for ``page``, memoised in ``page_cache``.

    The result of ``_collect_underscore_rects(page)`` is stored under
    ``page_cache[page_number]["line_rects"]`` and reused on later calls.
    """
    per_page = page_cache.setdefault(page_number, {})
    cached = per_page.get("line_rects")
    if cached is not None:
        return cached
    computed = _collect_underscore_rects(page)
    per_page["line_rects"] = computed
    return computed
2259
+
2260
+
2261
def _get_image_rects(
    page_cache: dict[int, dict[str, object]],
    page_number: int,
    page,
) -> list[object]:
    """Return the image rects for ``page``, memoised in ``page_cache``.

    The result of ``_collect_image_rects(page)`` is stored under
    ``page_cache[page_number]["image_rects"]`` and reused on later calls.
    """
    per_page = page_cache.setdefault(page_number, {})
    cached = per_page.get("image_rects")
    if cached is not None:
        return cached
    computed = _collect_image_rects(page)
    per_page["image_rects"] = computed
    return computed
2272
+
2273
+
2274
def _collect_widget_rects(page) -> tuple[dict[str, object], list[object], list[object]]:
    """Gather the rects of the form widgets on ``page``.

    Returns:
        A 3-tuple ``(by_name, signature_rects, all_rects)``: a mapping from
        stripped, non-empty field name to rect (a later widget with the same
        name replaces an earlier one), the rects of widgets accepted by
        ``_is_signature_widget``, and the rects of every widget. All three
        are empty when the page has no ``widgets`` attribute or it yields
        nothing truthy.
    """
    by_name: dict[str, object] = {}
    signature_like: list[object] = []
    every_rect: list[object] = []

    widgets = page.widgets() if hasattr(page, "widgets") else None
    if widgets:
        for widget in widgets:
            rect = widget.rect
            name = (widget.field_name or "").strip()
            if name:
                by_name[name] = rect
            every_rect.append(rect)
            if _is_signature_widget(widget, name):
                signature_like.append(rect)
    return by_name, signature_like, every_rect
2290
+
2291
+
2292
def _is_signature_widget(widget, name: str) -> bool:
    """Tell whether ``widget`` looks like a signature field.

    True when the widget's ``field_type`` equals
    ``fitz.PDF_WIDGET_TYPE_SIGNATURE`` (6 is used when the constant is
    missing), or when ``name`` matches ``_SIGNATURE_NAME_PATTERN``.
    """
    field_type = getattr(widget, "field_type", None)
    signature_type = getattr(fitz, "PDF_WIDGET_TYPE_SIGNATURE", 6)
    if field_type == signature_type:
        return True
    return bool(name and _SIGNATURE_NAME_PATTERN.search(name))
2298
+
2299
+
2300
def _collect_annot_rects(page) -> tuple[dict[str, object], list[object]]:
    """Walk the annotation chain of ``page`` and gather annotation rects.

    Returns:
        A 2-tuple ``(by_name, candidates)``: a mapping from annotation name
        (``field_name``, else ``title``, stripped) to rect where the first
        annotation with a given name wins, and the rects of annotations
        accepted by ``_is_signature_annotation``.
    """
    by_name: dict[str, object] = {}
    signature_like: list[object] = []

    current = page.first_annot
    while current:
        raw_name = getattr(current, "field_name", None) or getattr(current, "title", None) or ""
        name = raw_name.strip()
        if name and name not in by_name:
            by_name[name] = current.rect
        if _is_signature_annotation(current):
            signature_like.append(current.rect)
        current = current.next
    return by_name, signature_like
2312
+
2313
+
2314
def _is_signature_annotation(annot) -> bool:
    """Tell whether ``annot`` has an annotation type that may hold a signature.

    The type name is taken from the second item of ``annot.type`` when that
    attribute is a tuple with more than one item. It qualifies when its
    lower-cased string form is one of ``stamp``, ``ink``, ``freetext`` or
    ``text``. Any error while reading the type yields ``False``.
    """
    try:
        raw_type = annot.type
        if isinstance(raw_type, tuple) and len(raw_type) > 1:
            kind = raw_type[1]
        else:
            kind = None
    except Exception:  # pragma: no cover - defensive
        kind = None
    if not kind:
        return False
    return str(kind).lower() in {"stamp", "ink", "freetext", "text"}
2325
+
2326
+
2327
def _collect_signature_labels(page) -> list[object]:
    """Return a rect for every text block on ``page`` that reads as a signature label.

    Blocks come from ``page.get_text("blocks")``; items 0-3 of a block are
    used as the rect coordinates and item 4 as its text. Blocks with fewer
    than five items or empty text are ignored. An empty list is returned
    when text extraction raises.
    """
    try:
        blocks = page.get_text("blocks")
    except Exception:  # pragma: no cover - defensive
        return []

    found: list[object] = []
    for block in blocks or []:
        if not block or len(block) < 5:
            continue
        text = str(block[4] or "")
        if text and _is_signature_label_text(text):
            found.append(fitz.Rect(block[0], block[1], block[2], block[3]))
    return found
2342
+
2343
+
2344
def _is_signature_label_text(text: str) -> bool:
    """Tell whether ``text`` matches any pattern in ``_SIGNATURE_LABEL_PATTERNS``.

    Runs of whitespace are collapsed to single spaces before matching.
    """
    collapsed = " ".join(text.split())
    for pattern in _SIGNATURE_LABEL_PATTERNS:
        if pattern.search(collapsed):
            return True
    return False
2347
+
2348
+
2349
def _collect_text_lines(page) -> list[dict[str, object]]:
    """Group the words of ``page`` into lines and return one entry per line.

    Words come from ``page.get_text("words")`` and are grouped by their
    ``(block_no, line_no)`` pair; words with fewer than eight items are
    ignored. Each returned entry holds the union extent of its words
    (``x0``, ``y0``, ``x1``, ``y1``), the space-joined ``text``, a stripped
    lower-cased copy under ``lower`` and a ``fitz.Rect`` under ``rect``.
    Lines whose text is blank are dropped. An empty list is returned when
    text extraction raises.
    """
    try:
        words = page.get_text("words") or []
    except Exception:  # pragma: no cover - defensive
        return []

    grouped: dict[tuple[int, int], dict[str, object]] = {}
    for word in words:
        if len(word) < 8:
            continue
        x0, y0, x1, y1, token, block_no, line_no, *_ = word
        key = (int(block_no), int(line_no))
        current = grouped.get(key)
        if current is None:
            # First word of the line seeds the extent and the text.
            grouped[key] = {
                "x0": float(x0),
                "y0": float(y0),
                "x1": float(x1),
                "y1": float(y1),
                "text": str(token),
            }
            continue
        # Later words widen the extent and extend the text.
        current["x0"] = min(current["x0"], float(x0))
        current["y0"] = min(current["y0"], float(y0))
        current["x1"] = max(current["x1"], float(x1))
        current["y1"] = max(current["y1"], float(y1))
        current["text"] = f"{current['text']} {token}"

    collected: list[dict[str, object]] = []
    for current in grouped.values():
        stripped = str(current["text"]).strip()
        if not stripped:
            continue
        current["lower"] = stripped.lower()
        current["rect"] = fitz.Rect(current["x0"], current["y0"], current["x1"], current["y1"])
        collected.append(current)
    return collected
2385
+
2386
+
2387
def _collect_role_text_rects(page) -> dict[str, list[object]]:
    """Return the rects of short text lines on ``page`` that mention a signer role.

    Lines come from ``_collect_text_lines``; lines whose text is longer than
    60 characters are ignored. A line is filed under every role in
    ``_ROLE_LABEL_PATTERNS`` with at least one pattern matching its
    lower-cased text. The result always has the keys ``"client"`` and
    ``"firm"``.
    """
    by_role: dict[str, list[object]] = {"client": [], "firm": []}
    for line in _collect_text_lines(page):
        text = str(line["text"])
        lower = str(line["lower"])
        if len(text) > 60:
            continue
        for role, patterns in _ROLE_LABEL_PATTERNS.items():
            if any(pattern.search(lower) for pattern in patterns):
                by_role[role].append(line["rect"])
    return by_role
2399
+
2400
+
2401
def _collect_underscore_rects(page) -> list[object]:
    """Return a rect for every word on ``page`` made only of underscores.

    Words come from ``page.get_text("words")``; a word qualifies when it has
    at least four characters, all of them ``_``. Words with fewer than five
    items are ignored. An empty list is returned when text extraction raises.
    """
    try:
        words = page.get_text("words") or []
    except Exception:  # pragma: no cover - defensive
        return []

    hits: list[object] = []
    for word in words:
        if len(word) < 5:
            continue
        x0, y0, x1, y1, raw, *_ = word
        token = str(raw)
        if len(token) >= 4 and not token.strip("_"):
            hits.append(fitz.Rect(float(x0), float(y0), float(x1), float(y1)))
    return hits
2415
+
2416
+
2417
def _select_line_bbox(
    line_rects: list[object],
    signature: Signature,
    role_rects: dict[str, list[object]],
    label_rects: list[object],
    page_rect,
) -> tuple[float, float, float, float] | None:
    """Pick an underscore line for ``signature`` and turn it into a bbox.

    Anchor rects depend on ``signature.Role``: firm/attorney roles use the
    firm role rects, client/patient/representative roles use the client role
    rects plus ``label_rects``, and anything else uses ``label_rects`` only.
    With anchors, the line scoring closest to any anchor is chosen unless its
    score exceeds ``min(page_rect.height * 0.35, 220.0)``. Without anchors,
    the choice falls to ``_select_rect_by_order``.

    Returns:
        The expanded line region converted by ``_rect_to_pdf_tuple``, or
        ``None`` when no line qualifies or the region has no area.
    """
    if not line_rects:
        return None

    role_hint = (signature.Role or "").lower()
    anchors: list[object]
    if role_hint in {"firm", "attorney"}:
        anchors = role_rects.get("firm", [])
    elif role_hint in {"client", "patient", "representative"}:
        anchors = role_rects.get("client", []) + label_rects
    else:
        anchors = label_rects

    if anchors:
        nearest, distance = _select_rect_near_labels_with_score(line_rects, anchors)
        limit = min(page_rect.height * 0.35, 220.0)
        too_far = nearest is None or distance is None or distance > limit
        candidate = None if too_far else nearest
    else:
        candidate = _select_rect_by_order(line_rects, signature, 1, 1)
    if candidate is None:
        return None

    expanded = _expand_rect_from_line(candidate, page_rect)
    if expanded is None:
        return None
    bbox = _rect_to_pdf_tuple(expanded, page_rect.height)
    return bbox if _bbox_has_area(bbox) else None
2452
+
2453
+
2454
def _select_rect_near_labels_with_score(
    rects: list[object],
    label_rects: list[object],
) -> tuple[object | None, float | None]:
    """Find the rect closest to any label and report its distance score.

    The score of a rect/label pair is twice the vertical gap plus the
    horizontal gap between them, where a gap is zero when the two overlap on
    that axis. The lowest score wins; on ties the earliest pair is kept.

    Returns:
        ``(rect, score)`` with the winning rect as a ``fitz.Rect``, or
        ``(None, None)`` when either input is empty.
    """
    if not rects or not label_rects:
        return None, None

    winner = None
    winner_score: float | None = None
    for raw_rect in rects:
        candidate = fitz.Rect(raw_rect)
        for raw_label in label_rects:
            anchor = fitz.Rect(raw_label)
            gap_y = max(0.0, anchor.y0 - candidate.y1, candidate.y0 - anchor.y1)
            gap_x = max(0.0, anchor.x0 - candidate.x1, candidate.x0 - anchor.x1)
            score = gap_y * 2.0 + gap_x
            if winner_score is None or score < winner_score:
                winner, winner_score = candidate, score
    return winner, winner_score
2473
+
2474
+
2475
def _expand_rect_from_line(line_rect, page_rect):
    """Build a rect over the band at smaller y values than a signature line.

    The band is padded horizontally by 5% of the line width (at least 8
    units), ends a small gap before ``line_rect.y0`` and spans at most
    ``min(140, max(60, 12 * line height))`` units, all clamped to
    ``page_rect``.

    Returns:
        A ``fitz.Rect``, or ``None`` when the line has no positive width or
        height or the clamped band is empty.
    """
    line = fitz.Rect(line_rect)
    line_width = line.width
    line_height = line.height
    if line_width <= 0 or line_height <= 0:
        return None

    margin = max(8.0, line_width * 0.05)
    x_min = max(page_rect.x0, line.x0 - margin)
    x_max = min(page_rect.x1, line.x1 + margin)

    band_height = min(140.0, max(60.0, line_height * 12.0))
    clearance = max(2.0, line_height * 1.5)
    # y_far is the band edge next to the line; y_near is the edge away from it.
    y_far = max(page_rect.y0, line.y0 - clearance)
    y_near = max(page_rect.y0, y_far - band_height)
    if y_far <= y_near:
        return None
    return fitz.Rect(x_min, y_near, x_max, y_far)
2492
+
2493
+
2494
def _select_rect_by_order(
    rects: list[object],
    signature: Signature,
    signature_index: int,
    signature_count: int,
) -> object | None:
    """Pick a rect by its position when the rects are sorted by ascending ``y0``.

    With several signatures and a positive ``signature_index`` (treated as
    1-based), the rect at that position is returned, clamped to the last
    one. Otherwise the role decides: attorney/firm signatures take the
    second rect when there is one, everything else takes the first.

    Returns:
        The chosen rect, or ``None`` when ``rects`` is empty.
    """
    if not rects:
        return None

    ordered = sorted(rects, key=lambda rect: rect.y0)
    if signature_count > 1 and signature_index > 0:
        position = min(signature_index - 1, len(ordered) - 1)
        return ordered[position]

    role_hint = (signature.Role or "").lower()
    wants_second = role_hint in {"attorney", "firm"} and len(ordered) > 1
    return ordered[1] if wants_second else ordered[0]
2511
+
2512
+
2513
def _select_rect_candidate(
    rects: list[object],
    signature: Signature,
    signature_index: int,
    signature_count: int,
    label_rects: list[object] | None,
) -> object | None:
    """Choose one rect for ``signature`` out of ``rects``.

    When both ``rects`` and ``label_rects`` are non-empty, the rect nearest
    to a label (per ``_select_rect_near_labels``) is preferred. Otherwise,
    or when that yields nothing, ``_select_rect_by_order`` decides.
    """
    nearest = _select_rect_near_labels(rects, label_rects) if rects and label_rects else None
    if nearest is not None:
        return nearest
    return _select_rect_by_order(rects, signature, signature_index, signature_count)
2525
+
2526
+
2527
def _select_rect_near_labels(rects: list[object], label_rects: list[object]) -> object | None:
    """Return the rect closest to any label, or ``None`` if either input is empty.

    Closeness is the score computed by
    ``_select_rect_near_labels_with_score``. This function delegates to it
    and discards the score, so both share a single distance metric rather
    than two copies of the same loop.
    """
    best, _score = _select_rect_near_labels_with_score(rects, label_rects)
    return best
2543
+
2544
+
2545
def _filter_signature_like_rects(rects: list[object], page_rect) -> list[object]:
    """Keep the rects whose size and proportions could fit a signature.

    A rect is dropped when it is narrower than 40 or shorter than 8 units,
    taller than 40% of the page height, or less than 1.5 times as wide as it
    is tall (heights below 1 count as 1 for the ratio). Survivors are
    returned as ``fitz.Rect`` objects in their original order.
    """
    kept: list[object] = []
    for raw in rects:
        box = fitz.Rect(raw)
        box_width = box.width
        box_height = box.height
        too_small = box_width < 40 or box_height < 8
        if (
            too_small
            or box_height > page_rect.height * 0.4
            or box_width / max(box_height, 1.0) < 1.5
        ):
            continue
        kept.append(box)
    return kept
2559
+
2560
+
2561
def _expand_rect_from_label(label_rect, page_rect):
    """Build a candidate signature region next to a label rect.

    The region is first placed at larger y values than the label
    (``label.y1`` plus a gap), up to ``min(120, 5.5 * height)`` units tall.
    If clamping to the page leaves fewer than ``max(40, 2.5 * height)``
    units, it is placed on the other side of the label instead
    (``label.y0`` minus the gap). Horizontally it extends the label by 5% of
    its width at ``x0`` and 35% at ``x1``. The label height and width used
    in these formulas are floored at 16 and 80.

    Returns:
        A ``fitz.Rect`` clamped to ``page_rect``, or ``None`` when the region
        is empty.
    """
    label = fitz.Rect(label_rect)
    base_height = max(16.0, label.height)
    base_width = max(80.0, label.width)

    x_min = max(page_rect.x0, label.x0 - base_width * 0.05)
    x_max = min(page_rect.x1, label.x1 + base_width * 0.35)

    spacing = max(4.0, base_height * 0.3)
    tallest = min(120.0, base_height * 5.5)
    shortest = max(40.0, base_height * 2.5)

    y_start = min(page_rect.y1, label.y1 + spacing)
    y_end = min(page_rect.y1, y_start + tallest)
    if y_end - y_start < shortest:
        # Not enough room on that side of the label; use the opposite side.
        y_end = max(page_rect.y0, label.y0 - spacing)
        y_start = max(page_rect.y0, y_end - tallest)
    if y_end <= y_start:
        return None
    return fitz.Rect(x_min, y_start, x_max, y_end)
2579
+
2580
+
2581
def _find_fieldname_text_rects(page, field_name: str) -> list[object]:
    """Locate occurrences of a field name in the text of ``page``.

    Args:
        page: Object exposing ``search_for``.
        field_name: Name to search for; surrounding whitespace is ignored.

    Returns:
        One ``fitz.Rect`` per hit, or an empty list when the name is blank,
        the search raises, or nothing is found.
    """
    name = field_name.strip()
    if not name:
        return []

    # Reading ``fitz.TEXT_IGNORECASE`` directly raises AttributeError on builds
    # that lack the constant, and the handler below would then turn every
    # search into "no hits". Only pass the flag when it actually exists.
    # NOTE(review): PyMuPDF documents ``search_for`` as case-insensitive by
    # default - confirm against the pinned version.
    ignore_case = getattr(fitz, "TEXT_IGNORECASE", None)
    try:
        if ignore_case is None:
            hits = page.search_for(name)
        else:
            hits = page.search_for(name, flags=ignore_case)
    except Exception:  # pragma: no cover - defensive
        return []
    return [fitz.Rect(hit) for hit in hits or []]
2590
+
2591
+
2592
def _expand_rect_around_text(text_rect, page_rect):
    """Pad a text rect on all sides and clamp it to the page.

    Horizontal padding is 20% of the rect width (at least 6 units); vertical
    padding is 80% of its height (at least 4 units).

    Returns:
        The padded ``fitz.Rect``, or ``None`` when the clamped result is empty.
    """
    box = fitz.Rect(text_rect)
    margin_x = max(6.0, box.width * 0.2)
    margin_y = max(4.0, box.height * 0.8)

    x_min = max(page_rect.x0, box.x0 - margin_x)
    x_max = min(page_rect.x1, box.x1 + margin_x)
    y_min = max(page_rect.y0, box.y0 - margin_y)
    y_max = min(page_rect.y1, box.y1 + margin_y)

    if x_max <= x_min or y_max <= y_min:
        return None
    return fitz.Rect(x_min, y_min, x_max, y_max)
2603
+
2604
+
2605
def _rect_to_pdf_tuple(rect, page_height: float) -> tuple[float, float, float, float]:
    """Convert ``rect`` into an ``(x0, y0, x1, y1)`` tuple with the y axis flipped.

    Each y value becomes ``page_height - y``, so ``rect.y1`` supplies the new
    ``y0`` and ``rect.y0`` the new ``y1``. Each coordinate pair is swapped
    when needed so the smaller value comes first.
    """
    raw_x0 = float(rect.x0)
    raw_x1 = float(rect.x1)
    flipped_y0 = page_height - float(rect.y1)
    flipped_y1 = page_height - float(rect.y0)

    left, right = (raw_x1, raw_x0) if raw_x1 < raw_x0 else (raw_x0, raw_x1)
    low, high = (flipped_y1, flipped_y0) if flipped_y1 < flipped_y0 else (flipped_y0, flipped_y1)
    return (left, low, right, high)