sigdetect 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sigdetect/cropping.py +2380 -19
- sigdetect/detector/pymupdf_engine.py +246 -0
- sigdetect/wet_detection.py +48 -14
- {sigdetect-0.5.1.dist-info → sigdetect-0.5.3.dist-info}/METADATA +1 -1
- {sigdetect-0.5.1.dist-info → sigdetect-0.5.3.dist-info}/RECORD +8 -8
- {sigdetect-0.5.1.dist-info → sigdetect-0.5.3.dist-info}/WHEEL +0 -0
- {sigdetect-0.5.1.dist-info → sigdetect-0.5.3.dist-info}/entry_points.txt +0 -0
- {sigdetect-0.5.1.dist-info → sigdetect-0.5.3.dist-info}/top_level.txt +0 -0
sigdetect/cropping.py
CHANGED
|
@@ -7,7 +7,9 @@ import logging
|
|
|
7
7
|
import re
|
|
8
8
|
from dataclasses import dataclass
|
|
9
9
|
from pathlib import Path
|
|
10
|
-
from typing import Literal, overload
|
|
10
|
+
from typing import Callable, Literal, overload
|
|
11
|
+
|
|
12
|
+
from PIL import Image, ImageDraw
|
|
11
13
|
|
|
12
14
|
from .detector.file_result_model import FileResult
|
|
13
15
|
from .detector.signature_model import Signature
|
|
@@ -22,6 +24,13 @@ try: # pragma: no cover - optional dependency
|
|
|
22
24
|
except Exception: # pragma: no cover - optional dependency
|
|
23
25
|
Document = None # type: ignore[assignment]
|
|
24
26
|
|
|
27
|
+
try: # pragma: no cover - optional dependency
|
|
28
|
+
import pytesseract # type: ignore
|
|
29
|
+
from pytesseract import Output as TesseractOutput
|
|
30
|
+
except Exception: # pragma: no cover - optional dependency
|
|
31
|
+
pytesseract = None # type: ignore[assignment]
|
|
32
|
+
TesseractOutput = None # type: ignore[assignment]
|
|
33
|
+
|
|
25
34
|
|
|
26
35
|
class SignatureCroppingUnavailable(RuntimeError):
|
|
27
36
|
"""Raised when PNG cropping cannot be performed (e.g., PyMuPDF missing)."""
|
|
@@ -53,6 +62,7 @@ def crop_signatures(
|
|
|
53
62
|
return_bytes: Literal[False] = False,
|
|
54
63
|
save_files: bool = True,
|
|
55
64
|
docx: bool = False,
|
|
65
|
+
trim: bool = True,
|
|
56
66
|
) -> list[Path]: ...
|
|
57
67
|
|
|
58
68
|
|
|
@@ -67,6 +77,7 @@ def crop_signatures(
|
|
|
67
77
|
return_bytes: Literal[True],
|
|
68
78
|
save_files: bool = True,
|
|
69
79
|
docx: bool = False,
|
|
80
|
+
trim: bool = True,
|
|
70
81
|
) -> list[SignatureCrop]: ...
|
|
71
82
|
|
|
72
83
|
|
|
@@ -80,6 +91,7 @@ def crop_signatures(
|
|
|
80
91
|
return_bytes: bool = False,
|
|
81
92
|
save_files: bool = True,
|
|
82
93
|
docx: bool = False,
|
|
94
|
+
trim: bool = True,
|
|
83
95
|
) -> list[Path] | list[SignatureCrop]:
|
|
84
96
|
"""Render each signature bounding box to a PNG image and optionally wrap it in DOCX.
|
|
85
97
|
|
|
@@ -87,6 +99,7 @@ def crop_signatures(
|
|
|
87
99
|
the files to ``output_dir``. Set ``save_files=False`` to skip writing PNGs to disk.
|
|
88
100
|
When ``docx=True``, DOCX files are written instead of PNGs. When ``return_bytes`` is True
|
|
89
101
|
and ``docx=True``, ``SignatureCrop.docx_bytes`` will contain the DOCX payload.
|
|
102
|
+
When ``trim`` is enabled, the crop is tightened around the detected ink where possible.
|
|
90
103
|
"""
|
|
91
104
|
|
|
92
105
|
if fitz is None: # pragma: no cover - exercised when dependency absent
|
|
@@ -110,6 +123,15 @@ def crop_signatures(
|
|
|
110
123
|
"python-docx is required to generate DOCX outputs for signature crops."
|
|
111
124
|
)
|
|
112
125
|
|
|
126
|
+
page_signature_counts: dict[int, int] = {}
|
|
127
|
+
for signature in file_result.Signatures:
|
|
128
|
+
if signature.Page:
|
|
129
|
+
page_signature_counts[signature.Page] = page_signature_counts.get(signature.Page, 0) + 1
|
|
130
|
+
|
|
131
|
+
page_signature_index: dict[int, int] = {}
|
|
132
|
+
page_cache: dict[int, dict[str, object]] = {}
|
|
133
|
+
document_cache: dict[str, object] = {}
|
|
134
|
+
|
|
113
135
|
with fitz.open(pdf_path) as document: # type: ignore[attr-defined]
|
|
114
136
|
per_document_dir = output_dir / pdf_path.stem
|
|
115
137
|
if save_files:
|
|
@@ -117,38 +139,201 @@ def crop_signatures(
|
|
|
117
139
|
scale = dpi / 72.0
|
|
118
140
|
matrix = fitz.Matrix(scale, scale)
|
|
119
141
|
|
|
142
|
+
page_changed = False
|
|
120
143
|
for index, signature in enumerate(file_result.Signatures, start=1):
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
144
|
+
page_number = signature.Page
|
|
145
|
+
if page_number:
|
|
146
|
+
page_signature_index[page_number] = page_signature_index.get(page_number, 0) + 1
|
|
147
|
+
signature_index = page_signature_index.get(page_number, 1)
|
|
148
|
+
else:
|
|
149
|
+
signature_index = 1
|
|
150
|
+
force_fallback = _is_pseudo_signature(signature)
|
|
151
|
+
force_existing = _has_image_evidence(signature)
|
|
152
|
+
candidates: list[tuple[str, int, tuple[float, float, float, float]]] = []
|
|
153
|
+
seen: set[tuple[int, float, float, float, float]] = set()
|
|
154
|
+
|
|
155
|
+
def add_candidate(
|
|
156
|
+
source: str,
|
|
157
|
+
candidate_page: int | None,
|
|
158
|
+
candidate_bbox: tuple[float, float, float, float] | None,
|
|
159
|
+
) -> None:
|
|
160
|
+
if candidate_page is None or not _bbox_has_area(candidate_bbox):
|
|
161
|
+
return
|
|
162
|
+
key = (
|
|
163
|
+
candidate_page,
|
|
164
|
+
round(candidate_bbox[0], 2),
|
|
165
|
+
round(candidate_bbox[1], 2),
|
|
166
|
+
round(candidate_bbox[2], 2),
|
|
167
|
+
round(candidate_bbox[3], 2),
|
|
168
|
+
)
|
|
169
|
+
if key in seen:
|
|
170
|
+
return
|
|
171
|
+
seen.add(key)
|
|
172
|
+
candidates.append((source, candidate_page, candidate_bbox))
|
|
173
|
+
|
|
174
|
+
delayed_existing: tuple[int, tuple[float, float, float, float]] | None = None
|
|
175
|
+
if page_number and _bbox_has_area(signature.BoundingBox):
|
|
176
|
+
if force_fallback:
|
|
177
|
+
delayed_existing = (page_number, signature.BoundingBox)
|
|
178
|
+
else:
|
|
179
|
+
add_candidate("existing", page_number, signature.BoundingBox)
|
|
180
|
+
|
|
181
|
+
page = None
|
|
182
|
+
if page_number:
|
|
183
|
+
try:
|
|
184
|
+
page = document.load_page(page_number - 1)
|
|
185
|
+
except Exception as exc: # pragma: no cover - defensive
|
|
186
|
+
if logger:
|
|
187
|
+
logger.warning(
|
|
188
|
+
"Failed to load page for signature crop",
|
|
189
|
+
extra={
|
|
190
|
+
"file": pdf_path.name,
|
|
191
|
+
"page": page_number,
|
|
192
|
+
"error": str(exc),
|
|
193
|
+
},
|
|
194
|
+
)
|
|
195
|
+
page = None
|
|
196
|
+
|
|
197
|
+
if not force_existing:
|
|
198
|
+
if page is not None and page_number is not None:
|
|
199
|
+
resolved = _resolve_signature_bbox(
|
|
200
|
+
page,
|
|
201
|
+
signature,
|
|
202
|
+
page_cache=page_cache,
|
|
203
|
+
signature_count=page_signature_counts.get(page_number, 1),
|
|
204
|
+
signature_index=signature_index,
|
|
205
|
+
page_number=page_number,
|
|
134
206
|
)
|
|
207
|
+
add_candidate("resolved", page_number, resolved)
|
|
208
|
+
|
|
209
|
+
fallback_page, resolved = _resolve_bbox_across_document(
|
|
210
|
+
document,
|
|
211
|
+
signature,
|
|
212
|
+
page_cache=page_cache,
|
|
213
|
+
document_cache=document_cache,
|
|
214
|
+
signature_index=signature_index,
|
|
215
|
+
signature_count=page_signature_counts.get(page_number or 1, 1),
|
|
216
|
+
skip_page=page_number,
|
|
217
|
+
)
|
|
218
|
+
add_candidate("fallback", fallback_page, resolved)
|
|
219
|
+
|
|
220
|
+
if delayed_existing is not None:
|
|
221
|
+
add_candidate("existing", delayed_existing[0], delayed_existing[1])
|
|
222
|
+
|
|
223
|
+
if not candidates:
|
|
135
224
|
continue
|
|
136
225
|
|
|
137
|
-
|
|
138
|
-
|
|
226
|
+
best_bytes: bytes | None = None
|
|
227
|
+
best_bbox: tuple[float, float, float, float] | None = None
|
|
228
|
+
best_page: int | None = None
|
|
229
|
+
best_score: int | None = None
|
|
230
|
+
|
|
231
|
+
for source, candidate_page, candidate_bbox in candidates:
|
|
232
|
+
try:
|
|
233
|
+
page = document.load_page(candidate_page - 1)
|
|
234
|
+
except Exception as exc: # pragma: no cover - defensive
|
|
235
|
+
if logger:
|
|
236
|
+
logger.warning(
|
|
237
|
+
"Failed to load page for signature crop",
|
|
238
|
+
extra={
|
|
239
|
+
"file": pdf_path.name,
|
|
240
|
+
"page": candidate_page,
|
|
241
|
+
"error": str(exc),
|
|
242
|
+
},
|
|
243
|
+
)
|
|
244
|
+
continue
|
|
245
|
+
|
|
246
|
+
image_rects = page_cache.get(candidate_page, {}).get("image_rects")
|
|
247
|
+
if image_rects is None:
|
|
248
|
+
image_rects = _collect_image_rects(page)
|
|
249
|
+
page_cache.setdefault(candidate_page, {})["image_rects"] = image_rects
|
|
250
|
+
|
|
251
|
+
min_overlap = 0.6
|
|
252
|
+
min_center_overlap = 0.2
|
|
253
|
+
if signature.RenderType in {"drawn", "typed"} or _is_pseudo_signature(signature):
|
|
254
|
+
min_overlap = 0.3
|
|
255
|
+
min_center_overlap = 0.1
|
|
256
|
+
refined_bbox = _refine_bbox_with_image_rects(
|
|
257
|
+
page,
|
|
258
|
+
candidate_bbox,
|
|
259
|
+
image_rects=image_rects,
|
|
260
|
+
min_overlap=min_overlap,
|
|
261
|
+
min_center_overlap=min_center_overlap,
|
|
262
|
+
)
|
|
263
|
+
skip_trim = signature.RenderType in {"drawn", "typed"} or _is_pseudo_signature(signature)
|
|
264
|
+
candidate_allow_trim = True
|
|
265
|
+
if skip_trim and refined_bbox is not None:
|
|
266
|
+
candidate_allow_trim = False
|
|
267
|
+
|
|
268
|
+
render_bboxes: list[tuple[tuple[float, float, float, float], bool]] = [
|
|
269
|
+
(candidate_bbox, candidate_allow_trim)
|
|
270
|
+
]
|
|
271
|
+
if refined_bbox and refined_bbox != candidate_bbox:
|
|
272
|
+
skip_trim = signature.RenderType in {"drawn", "typed"} or _is_pseudo_signature(signature)
|
|
273
|
+
render_bboxes.append((refined_bbox, not skip_trim))
|
|
274
|
+
if (signature.RenderType or "").lower() == "wet":
|
|
275
|
+
expanded_bbox = _expand_wet_bbox(page, candidate_bbox)
|
|
276
|
+
if expanded_bbox and expanded_bbox not in {candidate_bbox, refined_bbox}:
|
|
277
|
+
render_bboxes.append((expanded_bbox, True))
|
|
278
|
+
|
|
279
|
+
for render_bbox, allow_trim in render_bboxes:
|
|
280
|
+
clip = _to_clip_rect(page, render_bbox)
|
|
281
|
+
if clip is None:
|
|
282
|
+
continue
|
|
283
|
+
try:
|
|
284
|
+
pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
|
|
285
|
+
raw_bytes = pixmap.tobytes("png")
|
|
286
|
+
final_bytes = (
|
|
287
|
+
_trim_signature_image_bytes(
|
|
288
|
+
raw_bytes,
|
|
289
|
+
render_type=signature.RenderType,
|
|
290
|
+
)
|
|
291
|
+
if trim and allow_trim
|
|
292
|
+
else raw_bytes
|
|
293
|
+
)
|
|
294
|
+
except Exception as exc: # pragma: no cover - defensive
|
|
295
|
+
if logger:
|
|
296
|
+
logger.warning(
|
|
297
|
+
"Failed to render signature crop",
|
|
298
|
+
extra={
|
|
299
|
+
"file": pdf_path.name,
|
|
300
|
+
"page": candidate_page,
|
|
301
|
+
"field": signature.FieldName,
|
|
302
|
+
"error": str(exc),
|
|
303
|
+
},
|
|
304
|
+
)
|
|
305
|
+
continue
|
|
306
|
+
|
|
307
|
+
if _is_blank_crop(final_bytes):
|
|
308
|
+
continue
|
|
309
|
+
|
|
310
|
+
dark, _ = _ink_metrics(final_bytes)
|
|
311
|
+
if best_score is None or dark > best_score:
|
|
312
|
+
best_score = dark
|
|
313
|
+
best_bytes = final_bytes
|
|
314
|
+
best_bbox = render_bbox
|
|
315
|
+
best_page = candidate_page
|
|
316
|
+
|
|
317
|
+
if best_bytes is None or best_bbox is None or best_page is None:
|
|
139
318
|
continue
|
|
140
319
|
|
|
320
|
+
if signature.Page != best_page:
|
|
321
|
+
signature.Page = best_page
|
|
322
|
+
page_changed = True
|
|
323
|
+
signature.BoundingBox = best_bbox
|
|
324
|
+
|
|
325
|
+
final_bytes = best_bytes
|
|
326
|
+
|
|
141
327
|
filename = _build_filename(index, signature)
|
|
142
328
|
png_destination = per_document_dir / filename
|
|
143
329
|
docx_destination = png_destination.with_suffix(".docx")
|
|
144
330
|
|
|
145
331
|
try:
|
|
146
332
|
image_bytes: bytes | None = None
|
|
147
|
-
pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
|
|
148
333
|
if save_files and not docx_enabled:
|
|
149
|
-
|
|
334
|
+
png_destination.write_bytes(final_bytes)
|
|
150
335
|
if return_bytes or docx_enabled:
|
|
151
|
-
image_bytes =
|
|
336
|
+
image_bytes = final_bytes
|
|
152
337
|
except Exception as exc: # pragma: no cover - defensive
|
|
153
338
|
if logger:
|
|
154
339
|
logger.warning(
|
|
@@ -206,6 +391,9 @@ def crop_signatures(
|
|
|
206
391
|
)
|
|
207
392
|
)
|
|
208
393
|
|
|
394
|
+
if page_changed:
|
|
395
|
+
_update_signature_pages(file_result)
|
|
396
|
+
|
|
209
397
|
return generated_crops if return_bytes else generated_paths
|
|
210
398
|
|
|
211
399
|
|
|
@@ -221,6 +409,1370 @@ def _build_docx_bytes(image_bytes: bytes) -> bytes:
|
|
|
221
409
|
return buffer.getvalue()
|
|
222
410
|
|
|
223
411
|
|
|
412
|
+
@dataclass(frozen=True)
class _OcrBox:
    """Immutable record of one OCR text fragment and its pixel rectangle."""

    # Recognised text of the fragment.
    text: str
    # Confidence value attached to the fragment by the OCR step.
    confidence: float
    # Rectangle edges, in pixels of the image that was OCR'd.
    left: int
    top: int
    right: int
    bottom: int
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
# Case-insensitive word patterns used by ``_is_label_text`` to recognise
# caption-like OCR text (signature/date/printed-name words, party roles,
# e-signature vendor names).
_OCR_LABEL_PATTERNS: tuple[re.Pattern[str], ...] = tuple(
    re.compile(expression, re.IGNORECASE)
    for expression in (
        r"\b(signature|signed|sign)\b",
        r"\b(date\s+signed|date)\b",
        r"\b(print(?:ed)?\s+name)\b",
        r"\b(client|patient|attorney|firm|law|counsel|representative|guardian|witness)\b",
        r"\b(docusign|docu\s*sign|envelope|adobe)\b",
    )
)
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
def _is_label_text(text: str) -> bool:
    """Report whether ``text`` matches any entry of ``_OCR_LABEL_PATTERNS``.

    The text is stripped and lower-cased before matching; input that is empty
    after stripping is never treated as a label.
    """
    normalized = text.strip().lower()
    if normalized:
        for label_pattern in _OCR_LABEL_PATTERNS:
            if label_pattern.search(normalized):
                return True
    return False
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
def _extract_ocr_boxes(
    image: Image.Image,
    *,
    languages: str = "eng",
    min_confidence: float = 50.0,
) -> list[_OcrBox]:
    """Run Tesseract over ``image`` and return one ``_OcrBox`` per confident entry.

    Args:
        image: Image handed to ``pytesseract.image_to_data``.
        languages: Language string forwarded as the ``lang`` argument.
        min_confidence: Entries whose confidence is below this value are dropped.

    Returns:
        Boxes for every entry that has non-blank text, a parseable confidence
        of at least ``min_confidence`` and a positive width and height. The
        list is empty when pytesseract is unavailable or the OCR call raises.
    """
    if pytesseract is None or TesseractOutput is None:
        return []
    try:
        data = pytesseract.image_to_data(
            image,
            lang=languages,
            config="--psm 11",
            output_type=TesseractOutput.DICT,
        )
    except Exception:
        # OCR is best-effort here: any engine failure is treated as "no text".
        return []

    boxes: list[_OcrBox] = []
    for idx, raw in enumerate(data.get("text", [])):
        text = str(raw).strip()
        if not text:
            continue
        # IndexError is caught alongside the parse errors so that a result
        # whose columns have different lengths skips the entry instead of
        # aborting the whole extraction.
        try:
            confidence = float(data["conf"][idx])
        except (ValueError, KeyError, TypeError, IndexError):
            continue
        if confidence < min_confidence:
            continue
        try:
            left = int(data["left"][idx])
            top = int(data["top"][idx])
            width = int(data["width"][idx])
            height = int(data["height"][idx])
        except (ValueError, KeyError, TypeError, IndexError):
            continue
        if width <= 0 or height <= 0:
            continue
        boxes.append(
            _OcrBox(
                text=text,
                confidence=confidence,
                left=left,
                top=top,
                right=left + width,
                bottom=top + height,
            )
        )
    return boxes
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
def _estimate_white_level(gray: Image.Image) -> int:
    """Return the grey level at which the cumulative histogram covers 99.5% of pixels.

    Histogram bins are accumulated from index 0 upwards and the first index
    whose running total reaches ``int(width * height * 0.995)`` is returned.
    If no bin reaches that total, 255 is returned.
    """
    cutoff = int(gray.width * gray.height * 0.995)
    running_total = 0
    for level, bin_count in enumerate(gray.histogram()):
        running_total += bin_count
        if running_total >= cutoff:
            return level
    return 255
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
def _find_horizontal_rule_rows(
    gray: Image.Image,
    *,
    threshold: int = 240,
    density_ratio: float = 0.25,
    max_thickness: int = 8,
) -> tuple[list[tuple[int, int]], list[int]]:
    """Locate thin horizontal rules in a greyscale image.

    A row belongs to a rule when the number of its pixels darker than
    ``threshold`` reaches ``density_ratio`` of the image width (and is at
    least one). Consecutive rule rows form a segment; segments spanning more
    than ``max_thickness`` rows are discarded.

    Args:
        gray: Image exposing ``size`` and ``load()`` pixel access.
        threshold: Pixel values strictly below this count as dark.
        density_ratio: Fraction of the width that must be dark in a rule row.
        max_thickness: Largest accepted segment height, in rows.

    Returns:
        ``(segments, row_density)``: inclusive ``(first_row, last_row)`` pairs
        and the dark-pixel count of every row. Both are empty for an image
        with zero width or height.
    """
    width, height = gray.size
    if width == 0 or height == 0:
        return [], []
    pixels = gray.load()
    row_density = [
        sum(1 for x in range(width) if pixels[x, y] < threshold) for y in range(height)
    ]
    # Floor at one dark pixel: on narrow images int(width * ratio) is 0, which
    # would make every row, including completely blank ones, count as a rule.
    line_threshold = max(1, int(width * density_ratio))
    segments: list[tuple[int, int]] = []
    start: int | None = None
    for y, dark in enumerate(row_density):
        if dark >= line_threshold:
            if start is None:
                start = y
            continue
        if start is not None:
            if y - start <= max_thickness:
                segments.append((start, y - 1))
            start = None
    # Close a run that extends to the bottom edge of the image.
    if start is not None and height - start <= max_thickness:
        segments.append((start, height - 1))
    return segments, row_density
|
|
537
|
+
|
|
538
|
+
|
|
539
|
+
def _find_vertical_rule_cols(
    gray: Image.Image,
    *,
    threshold: int = 240,
    density_ratio: float = 0.6,
    max_thickness: int = 6,
) -> tuple[list[tuple[int, int]], list[int]]:
    """Locate thin vertical rules in a greyscale image.

    A column belongs to a rule when the number of its pixels darker than
    ``threshold`` reaches ``density_ratio`` of the image height (and is at
    least one). Consecutive rule columns form a segment; segments spanning
    more than ``max_thickness`` columns are discarded.

    Args:
        gray: Image exposing ``size`` and ``load()`` pixel access.
        threshold: Pixel values strictly below this count as dark.
        density_ratio: Fraction of the height that must be dark in a rule column.
        max_thickness: Largest accepted segment width, in columns.

    Returns:
        ``(segments, col_density)``: inclusive ``(first_col, last_col)`` pairs
        and the dark-pixel count of every column. Both are empty for an image
        with zero width or height.
    """
    width, height = gray.size
    if width == 0 or height == 0:
        return [], []
    pixels = gray.load()
    col_density = [
        sum(1 for y in range(height) if pixels[x, y] < threshold) for x in range(width)
    ]
    # Floor at one dark pixel: on short images int(height * ratio) is 0, which
    # would make every column, including completely blank ones, count as a rule.
    line_threshold = max(1, int(height * density_ratio))
    segments: list[tuple[int, int]] = []
    start: int | None = None
    for x, dark in enumerate(col_density):
        if dark >= line_threshold:
            if start is None:
                start = x
            continue
        if start is not None:
            if x - start <= max_thickness:
                segments.append((start, x - 1))
            start = None
    # Close a run that extends to the right edge of the image.
    if start is not None and width - start <= max_thickness:
        segments.append((start, width - 1))
    return segments, col_density
|
|
569
|
+
|
|
570
|
+
|
|
571
|
+
def _find_ink_band(
    gray: Image.Image,
    *,
    threshold: int = 240,
    min_density_ratio: float = 0.004,
    gap_px: int = 3,
) -> tuple[int, int] | None:
    """Return the vertical band of rows that carries the most dark pixels.

    Rows with at least ``max(2, int(width * min_density_ratio))`` pixels below
    ``threshold`` are grouped into runs. A run is folded into the preceding
    band when its first row lies no more than ``gap_px`` after that band's
    last row. The band with the largest total dark-pixel count wins, with the
    earliest band winning ties.

    Returns:
        Inclusive ``(first_row, last_row)`` of the winning band, or ``None``
        for a zero-sized image or when no row is dense enough.
    """
    width, height = gray.size
    if width == 0 or height == 0:
        return None
    pixels = gray.load()
    min_density = max(2, int(width * min_density_ratio))
    row_density = [
        sum(1 for x in range(width) if pixels[x, y] < threshold) for y in range(height)
    ]

    bands: list[list[int]] = []
    run_start: int | None = None
    # Iterating one step past the last row flushes a run that touches the bottom edge.
    for y in range(height + 1):
        if y < height and row_density[y] >= min_density:
            if run_start is None:
                run_start = y
            continue
        if run_start is None:
            continue
        if bands and run_start - bands[-1][1] <= gap_px:
            bands[-1][1] = y - 1
        else:
            bands.append([run_start, y - 1])
        run_start = None

    if not bands:
        return None
    first_row, last_row = max(
        bands, key=lambda band: sum(row_density[band[0] : band[1] + 1])
    )
    return first_row, last_row
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
def _pick_line_below_band(
    segments: list[tuple[int, int]],
    *,
    band_end: int,
) -> tuple[int, int] | None:
    """Return the segment that starts closest to, and not above, ``band_end``.

    Only segments whose first row is ``>= band_end`` are considered; the
    earliest such segment in list order wins ties. ``None`` is returned when
    ``segments`` is empty or none qualifies.
    """
    if not segments:
        return None
    return min(
        (seg for seg in segments if seg[0] >= band_end),
        key=lambda seg: seg[0] - band_end,
        default=None,
    )
|
|
635
|
+
|
|
636
|
+
|
|
637
|
+
def _bbox_from_boxes(boxes: list[_OcrBox]) -> tuple[int, int, int, int] | None:
    """Return the ``(left, top, right, bottom)`` rectangle enclosing all ``boxes``.

    ``None`` is returned for an empty list or when the enclosing rectangle has
    no positive width and height.
    """
    if not boxes:
        return None
    lefts, tops, rights, bottoms = zip(
        *((box.left, box.top, box.right, box.bottom) for box in boxes)
    )
    left, top, right, bottom = min(lefts), min(tops), max(rights), max(bottoms)
    if right > left and bottom > top:
        return left, top, right, bottom
    return None
|
|
647
|
+
|
|
648
|
+
|
|
649
|
+
def _trim_bbox_by_ocr_boxes(
    bbox: tuple[int, int, int, int],
    boxes: list[_OcrBox],
    *,
    min_gap: int = 6,
) -> tuple[int, int, int, int]:
    """Pull the bottom edge of ``bbox`` up to just above the highest OCR box below its top.

    Only boxes whose top is at least ``min_gap`` rows below the top of
    ``bbox`` and whose horizontal extent touches ``[x0, x1]`` are considered.
    The bottom edge becomes one row above the smallest such top, and is never
    moved downwards or to the top edge or above it.
    """
    x0, y0, x1, y1 = bbox
    if not boxes:
        return bbox
    blocking_tops = [
        box.top
        for box in boxes
        if box.top >= y0 + min_gap and box.left <= x1 and box.right >= x0
    ]
    if blocking_tops:
        ceiling = min(blocking_tops) - 1
        if ceiling > y0:
            y1 = min(y1, ceiling)
    return x0, y0, x1, y1
|
|
669
|
+
|
|
670
|
+
|
|
671
|
+
def _trim_border_lines(
    image: Image.Image,
    *,
    threshold: int = 240,
    density_ratio: float = 0.85,
    edge_ratio: float = 0.2,
) -> Image.Image:
    """Crop away near-solid dark rows and columns that sit close to the image edges.

    Within a margin of ``edge_ratio`` of each dimension (at least one pixel),
    a row or column is a border line when its count of pixels below
    ``threshold`` reaches ``density_ratio`` of its length. The left and top
    cuts are taken just past the innermost such line of their margin; the
    right and bottom cuts stop at the first such line of their margin.

    Returns:
        The cropped image, or ``image`` itself when it is zero-sized or the
        remaining area would be at most two pixels wide or tall.
    """
    gray = image.convert("L")
    width, height = gray.size
    if width == 0 or height == 0:
        return image
    pixels = gray.load()
    row_density = [
        sum(1 for x in range(width) if pixels[x, y] < threshold) for y in range(height)
    ]
    col_density = [
        sum(1 for y in range(height) if pixels[x, y] < threshold) for x in range(width)
    ]

    def innermost_leading(density: list[int], span: int, cutoff: int) -> int:
        # Index of the last dense line within the leading margin, or -1.
        found = -1
        for position in range(span):
            if density[position] >= cutoff:
                found = position
        return found

    def first_trailing(density: list[int], span: int, cutoff: int) -> int:
        # Index of the first dense line within the trailing margin, or len(density).
        size = len(density)
        return next(
            (pos for pos in range(size - span, size) if density[pos] >= cutoff),
            size,
        )

    margin_x = max(1, int(width * edge_ratio))
    margin_y = max(1, int(height * edge_ratio))
    dense_row = int(width * density_ratio)
    dense_col = int(height * density_ratio)

    x0 = max(0, innermost_leading(col_density, margin_x, dense_col) + 1)
    x1 = min(width, first_trailing(col_density, margin_x, dense_col))
    y0 = max(0, innermost_leading(row_density, margin_y, dense_row) + 1)
    y1 = min(height, first_trailing(row_density, margin_y, dense_row))
    if x1 - x0 <= 2 or y1 - y0 <= 2:
        return image
    return image.crop((x0, y0, x1, y1))
|
|
718
|
+
|
|
719
|
+
|
|
720
|
+
def _select_signature_line(
    line_segments: list[tuple[int, int]],
    row_density: list[int],
) -> tuple[int, int] | None:
    """Return the segment whose rows hold the largest total of ``row_density``.

    The earliest segment wins ties; ``None`` is returned when there are no
    segments.
    """
    if not line_segments:
        return None
    first_row, last_row = max(
        line_segments,
        key=lambda seg: sum(row_density[seg[0] : seg[1] + 1]),
    )
    return first_row, last_row
|
|
734
|
+
|
|
735
|
+
|
|
736
|
+
def _component_bboxes(
    gray: Image.Image,
    *,
    threshold: int,
    min_pixels: int = 40,
    line_ratio: float = 12.0,
    edge_margin: int = 1,
) -> list[dict[str, int]]:
    """Find 8-connected groups of dark pixels and describe each one.

    Pixels below ``threshold`` are dark. Groups with fewer than ``min_pixels``
    pixels are dropped, as are groups whose bounding box is more than
    ``line_ratio`` times longer in one dimension than in the other.

    Returns:
        One dict per kept group, in scan order of the group's first pixel,
        with inclusive bounds ``min_x``/``min_y``/``max_x``/``max_y``, the
        pixel ``count``, ``edge_ratio`` (share of pixels lying within
        ``edge_margin`` of the group's own bounding-box border) and
        ``edge_sum`` (``edge_ratio * count``). The last two are floats.
    """
    width, height = gray.size
    if width == 0 or height == 0:
        return []
    pixels = gray.load()
    seen = bytearray(width * height)
    described: list[dict[str, int]] = []
    # Offsets of the 3x3 neighbourhood; the centre is harmless because it is already seen.
    steps = tuple((dx, dy) for dx in (-1, 0, 1) for dy in (-1, 0, 1))

    for seed_y in range(height):
        for seed_x in range(width):
            seed_pos = seed_y * width + seed_x
            if seen[seed_pos]:
                continue
            seen[seed_pos] = 1
            if pixels[seed_x, seed_y] >= threshold:
                continue

            # Depth-first flood fill collecting every pixel of this group.
            pending = [(seed_x, seed_y)]
            members: list[tuple[int, int]] = []
            while pending:
                cx, cy = pending.pop()
                members.append((cx, cy))
                for dx, dy in steps:
                    nx, ny = cx + dx, cy + dy
                    if not (0 <= nx < width and 0 <= ny < height):
                        continue
                    pos = ny * width + nx
                    if seen[pos]:
                        continue
                    seen[pos] = 1
                    if pixels[nx, ny] < threshold:
                        pending.append((nx, ny))

            count = len(members)
            if count < min_pixels:
                continue
            min_x = min(px for px, _ in members)
            max_x = max(px for px, _ in members)
            min_y = min(py for _, py in members)
            max_y = max(py for _, py in members)
            box_w = max_x - min_x + 1
            box_h = max_y - min_y + 1
            if box_w > box_h * line_ratio or box_h > box_w * line_ratio:
                continue
            edge_count = sum(
                1
                for px, py in members
                if px <= min_x + edge_margin
                or px >= max_x - edge_margin
                or py <= min_y + edge_margin
                or py >= max_y - edge_margin
            )
            edge_ratio = edge_count / max(1, count)
            described.append(
                {
                    "min_x": min_x,
                    "min_y": min_y,
                    "max_x": max_x,
                    "max_y": max_y,
                    "count": count,
                    "edge_ratio": edge_ratio,
                    "edge_sum": edge_ratio * count,
                }
            )
    return described
|
|
826
|
+
|
|
827
|
+
|
|
828
|
+
def _merge_component_bboxes(
    components: list[dict[str, int]],
    *,
    gap: int = 6,
) -> list[dict[str, int]]:
    """Greedily group component boxes that lie within ``gap`` pixels of each other.

    Components are visited from largest to smallest ``count``. Each one is
    absorbed by the first existing cluster whose box is at most ``gap`` away
    on both axes, growing that cluster's bounds and adding to its ``count``
    and ``edge_sum`` (``edge_ratio`` is recomputed from them). Otherwise a
    copy of the component starts a new cluster. The input dicts are not modified.
    """

    def separation(lo_a: int, hi_a: int, lo_b: int, hi_b: int) -> int:
        # Distance between two inclusive intervals; 0 when they overlap or touch.
        return max(0, lo_a - hi_b, lo_b - hi_a)

    clusters: list[dict[str, int]] = []
    for comp in sorted(components, key=lambda item: item["count"], reverse=True):
        host = None
        for cluster in clusters:
            dx = separation(cluster["min_x"], cluster["max_x"], comp["min_x"], comp["max_x"])
            dy = separation(cluster["min_y"], cluster["max_y"], comp["min_y"], comp["max_y"])
            if dx <= gap and dy <= gap:
                host = cluster
                break

        if host is None:
            fresh = comp.copy()
            if "edge_sum" not in fresh:
                fresh["edge_sum"] = fresh.get("edge_ratio", 0.0) * fresh["count"]
            clusters.append(fresh)
            continue

        for low_key in ("min_x", "min_y"):
            host[low_key] = min(host[low_key], comp[low_key])
        for high_key in ("max_x", "max_y"):
            host[high_key] = max(host[high_key], comp[high_key])
        host["count"] += comp["count"]
        host["edge_sum"] = host.get("edge_sum", 0.0) + comp.get("edge_sum", 0.0)
        host["edge_ratio"] = host["edge_sum"] / max(1, host["count"])
    return clusters
|
|
855
|
+
|
|
856
|
+
|
|
857
|
+
def _components_bbox(
    gray: Image.Image,
    *,
    threshold: int,
    min_pixels: int = 40,
    gap: int = 6,
    max_edge_ratio: float = 0.7,
) -> tuple[int, int, int, int] | None:
    """Return the bounds of the largest cluster of dark components in ``gray``.

    Components come from ``_component_bboxes`` and are clustered by
    ``_merge_component_bboxes``. Clusters whose ``edge_ratio`` is below
    ``max_edge_ratio`` are preferred; when none qualifies, all clusters
    compete. The winner is the cluster with the highest pixel ``count``.

    Returns:
        Inclusive ``(min_x, min_y, max_x, max_y)``, or ``None`` when no
        component is found.
    """
    components = _component_bboxes(gray, threshold=threshold, min_pixels=min_pixels)
    clusters = _merge_component_bboxes(components, gap=gap) if components else []
    if not clusters:
        return None
    preferred = [
        cluster for cluster in clusters if cluster.get("edge_ratio", 0.0) < max_edge_ratio
    ]
    winner = max(preferred or clusters, key=lambda cluster: cluster["count"])
    return winner["min_x"], winner["min_y"], winner["max_x"], winner["max_y"]
|
|
877
|
+
|
|
878
|
+
|
|
879
|
+
def _select_line_cutoff(
    segments: list[tuple[int, int]],
    row_density: list[int],
    *,
    min_above_dark: int = 40,
    ratio_threshold: float = 0.5,
) -> int | None:
    """Choose the row at which to cut, just below the lowest acceptable line segment.

    A segment is acceptable when the dark-pixel total of the rows above it is
    at least ``min_above_dark`` and, if anything dark lies below it, at least
    ``ratio_threshold`` times the dark total below. Among acceptable segments
    the one ending lowest is used.

    Returns:
        That segment's last row plus a pad of ``max(2, int(height * 0.01))``
        rows, clamped to the last row index, or ``None`` when no segment is
        acceptable.
    """
    if not segments:
        return None
    height = len(row_density)

    def acceptable(first_row: int, last_row: int) -> bool:
        above_dark = sum(row_density[:first_row])
        below_dark = sum(row_density[last_row + 1 :])
        if above_dark < min_above_dark:
            return False
        return not (below_dark > 0 and above_dark < below_dark * ratio_threshold)

    eligible = [(y0, y1) for y0, y1 in segments if acceptable(y0, y1)]
    if not eligible:
        return None
    lowest_end = max(eligible, key=lambda seg: seg[1])[1]
    pad = max(2, int(height * 0.01))
    return min(height - 1, lowest_end + pad)
|
|
903
|
+
|
|
904
|
+
|
|
905
|
+
def _mask_regions(
    gray: Image.Image,
    *,
    boxes: list[_OcrBox],
    line_segments: list[tuple[int, int]],
    vertical_segments: list[tuple[int, int]] | None = None,
    box_filter: Callable[[_OcrBox], bool] | None = None,
) -> Image.Image:
    """Return a copy of ``gray`` with rules and OCR boxes painted white (255).

    Each ``line_segments`` entry is filled across the full width, each
    ``vertical_segments`` entry across the full height, and each box over its
    own rectangle. When ``box_filter`` is given, only boxes for which it
    returns a truthy value are painted. ``gray`` itself is left untouched.
    """
    masked = gray.copy()
    canvas = ImageDraw.Draw(masked)
    width, height = gray.size
    last_col, last_row = width - 1, height - 1

    for band_top, band_bottom in line_segments:
        canvas.rectangle((0, band_top, last_col, band_bottom), fill=255)
    for band_left, band_right in vertical_segments or ():
        canvas.rectangle((band_left, 0, band_right, last_row), fill=255)
    for box in boxes:
        if box_filter is None or box_filter(box):
            canvas.rectangle((box.left, box.top, box.right, box.bottom), fill=255)
    return masked
|
|
926
|
+
|
|
927
|
+
|
|
928
|
+
def _whiteout_regions_rgb(
    image: Image.Image,
    *,
    boxes: list[_OcrBox],
    line_segments: list[tuple[int, int]],
    vertical_segments: list[tuple[int, int]] | None = None,
) -> Image.Image:
    """Return an RGB conversion of ``image`` with rules and OCR boxes painted white.

    Each ``line_segments`` entry is filled across the full width, each
    ``vertical_segments`` entry across the full height, and every box over its
    own rectangle. A zero-sized ``image`` is returned as-is without conversion.
    """
    width, height = image.size
    if width == 0 or height == 0:
        return image
    white = (255, 255, 255)
    rgb = image.convert("RGB")
    canvas = ImageDraw.Draw(rgb)
    last_col, last_row = width - 1, height - 1

    for band_top, band_bottom in line_segments:
        canvas.rectangle((0, band_top, last_col, band_bottom), fill=white)
    for band_left, band_right in vertical_segments or ():
        canvas.rectangle((band_left, 0, band_right, last_row), fill=white)
    for box in boxes:
        canvas.rectangle((box.left, box.top, box.right, box.bottom), fill=white)
    return rgb
|
|
948
|
+
|
|
949
|
+
|
|
950
|
+
def _is_blue_pixel(r: int, g: int, b: int) -> bool:
    """Return True when blue exceeds 100 and beats both red and green by more than 25."""
    if b <= 100:
        return False
    return b > max(r, g) + 25
|
|
952
|
+
|
|
953
|
+
|
|
954
|
+
def _build_ink_mask(
    image: Image.Image,
    *,
    threshold: int,
    remove_blue: bool = True,
) -> Image.Image:
    """Build an "L" mask that is 0 where ``image`` has ink and 255 elsewhere.

    A pixel counts as ink when its weighted luminance
    (``0.299 R + 0.587 G + 0.114 B``, truncated to int) is below ``threshold``.
    With ``remove_blue`` set, pixels that ``_is_blue_pixel`` accepts are left
    at 255 even if they are dark.

    Args:
        image: Source image; it is converted to RGB before inspection.
        threshold: Luminance cut-off below which a pixel is ink.
        remove_blue: Whether blue pixels are excluded from the ink set.

    Returns:
        A new "L" image of the same size as ``image``.
    """
    rgb = image.convert("RGB")
    width, height = rgb.size
    mask = Image.new("L", (width, height), 255)
    source = rgb.load()
    target = mask.load()
    for row in range(height):
        for col in range(width):
            red, green, blue = source[col, row]
            luminance = int(0.299 * red + 0.587 * green + 0.114 * blue)
            if luminance >= threshold:
                continue
            if remove_blue and _is_blue_pixel(red, green, blue):
                continue
            target[col, row] = 0
    return mask
|
|
972
|
+
|
|
973
|
+
|
|
974
|
+
def _tighten_to_ink_components(
    image: Image.Image,
    *,
    remove_blue: bool,
    pad_px: int = 2,
) -> Image.Image:
    """Crop ``image`` to the box reported by ``_components_bbox``, plus padding.

    The ink threshold is the estimated white level minus 10, clamped to
    200..245. With ``remove_blue`` the components are searched in the mask from
    ``_build_ink_mask`` (at a fixed threshold of 200); otherwise they are
    searched directly in the grayscale image.

    Args:
        image: Image to tighten.
        remove_blue: Whether blue pixels are excluded before component search.
        pad_px: Margin added on every side, clipped to the image bounds.

    Returns:
        The cropped image, or ``image`` unchanged when no box is found or the
        padded box is degenerate.
    """
    gray = image.convert("L")
    threshold = min(245, max(200, _estimate_white_level(gray) - 10))
    if remove_blue:
        search_image = _build_ink_mask(image, threshold=threshold, remove_blue=True)
        search_threshold = 200
    else:
        search_image = gray
        search_threshold = threshold
    bbox = _components_bbox(search_image, threshold=search_threshold, gap=10, max_edge_ratio=0.98)
    if bbox is None:
        return image

    left, top, right, bottom = bbox
    width, height = image.size
    left = max(0, left - pad_px)
    top = max(0, top - pad_px)
    right = min(width - 1, right + pad_px)
    bottom = min(height - 1, bottom + pad_px)
    if right <= left or bottom <= top:
        return image
    # The box is inclusive; crop() wants an exclusive lower-right corner.
    return image.crop((left, top, right + 1, bottom + 1))
|
|
999
|
+
|
|
1000
|
+
|
|
1001
|
+
def _components_bbox_on_line(
    gray: Image.Image,
    *,
    threshold: int,
    min_pixels: int = 40,
    max_edge_ratio: float = 0.98,
) -> tuple[int, int, int, int] | None:
    """Return the bounding box of the heaviest row-aligned cluster of components.

    Components come from ``_component_bboxes``; each is used here as a dict
    with ``min_x``, ``min_y``, ``max_x``, ``max_y`` and ``count`` keys and an
    optional ``edge_ratio``. Components are greedily merged, in the order
    returned, into the first cluster they are vertically compatible with:

    * if the row ranges overlap, the overlap must cover at least 30% of the
      shorter of the two heights;
    * otherwise the vertical gap must not exceed ``max(4, 60% of the shorter
      height)``.

    Args:
        gray: Image handed to ``_component_bboxes``.
        threshold: Darkness threshold forwarded to ``_component_bboxes``.
        min_pixels: Minimum component size forwarded to ``_component_bboxes``.
        max_edge_ratio: Components whose ``edge_ratio`` reaches this value are
            dropped, unless that would drop every component.

    Returns:
        ``(min_x, min_y, max_x, max_y)`` of the cluster with the largest summed
        ``count``, or ``None`` when there are no components.
    """
    components = _component_bboxes(gray, threshold=threshold, min_pixels=min_pixels)
    if not components:
        return None
    interior = [item for item in components if item.get("edge_ratio", 0.0) < max_edge_ratio]
    if interior:
        components = interior

    clusters: list[dict[str, int]] = []
    for comp in components:
        comp_height = comp["max_y"] - comp["min_y"] + 1
        for cluster in clusters:
            cluster_height = cluster["max_y"] - cluster["min_y"] + 1
            shorter = min(cluster_height, comp_height)
            # Inclusive count of shared rows: > 0 means real overlap, 0 means
            # the ranges touch, < 0 means they are separated.
            overlap = min(cluster["max_y"], comp["max_y"]) - max(cluster["min_y"], comp["min_y"]) + 1
            # Fix: this used to be ``overlap >= 0``, which sent touching ranges
            # (overlap == 0, gap == 1) through the ratio test where they always
            # failed, even though ranges 2-4 rows apart were merged below.
            if overlap > 0:
                joins = (overlap / max(1, shorter)) >= 0.3
            else:
                gap = max(cluster["min_y"] - comp["max_y"], comp["min_y"] - cluster["max_y"])
                joins = gap <= max(4, int(shorter * 0.6))
            if joins:
                cluster["min_x"] = min(cluster["min_x"], comp["min_x"])
                cluster["min_y"] = min(cluster["min_y"], comp["min_y"])
                cluster["max_x"] = max(cluster["max_x"], comp["max_x"])
                cluster["max_y"] = max(cluster["max_y"], comp["max_y"])
                cluster["count"] += comp["count"]
                break
        else:
            # No existing cluster accepted the component: it starts its own.
            clusters.append(comp.copy())

    if not clusters:
        return None
    best = max(clusters, key=lambda item: item["count"])
    return best["min_x"], best["min_y"], best["max_x"], best["max_y"]
|
|
1045
|
+
|
|
1046
|
+
|
|
1047
|
+
def _tighten_to_ink_components_on_line(
    image: Image.Image,
    *,
    remove_blue: bool,
    pad_px: int = 2,
) -> Image.Image:
    """Crop ``image`` to the box reported by ``_components_bbox_on_line``, plus padding.

    The ink threshold is the estimated white level minus 10, clamped to
    200..245. With ``remove_blue`` the search runs on the mask from
    ``_build_ink_mask`` (at a fixed threshold of 200); otherwise it runs on the
    grayscale image directly.

    Args:
        image: Image to tighten.
        remove_blue: Whether blue pixels are excluded before component search.
        pad_px: Margin added on every side, clipped to the image bounds.

    Returns:
        The cropped image, or ``image`` unchanged when no box is found or the
        padded box is degenerate.
    """
    gray = image.convert("L")
    threshold = min(245, max(200, _estimate_white_level(gray) - 10))
    if remove_blue:
        search_image = _build_ink_mask(image, threshold=threshold, remove_blue=True)
        search_threshold = 200
    else:
        search_image = gray
        search_threshold = threshold
    bbox = _components_bbox_on_line(search_image, threshold=search_threshold, max_edge_ratio=0.98)
    if bbox is None:
        return image

    left, top, right, bottom = bbox
    width, height = image.size
    left = max(0, left - pad_px)
    top = max(0, top - pad_px)
    right = min(width - 1, right + pad_px)
    bottom = min(height - 1, bottom + pad_px)
    if right <= left or bottom <= top:
        return image
    # The box is inclusive; crop() wants an exclusive lower-right corner.
    return image.crop((left, top, right + 1, bottom + 1))
|
|
1072
|
+
|
|
1073
|
+
|
|
1074
|
+
def _ink_bbox(
    gray: Image.Image,
    *,
    threshold: int,
) -> tuple[int, int, int, int] | None:
    """Return the inclusive bounding box of all pixels darker than ``threshold``.

    Args:
        gray: Image whose ``load()`` accessor yields one comparable value per pixel.
        threshold: Pixels strictly below this value count as ink.

    Returns:
        ``(min_x, min_y, max_x, max_y)``, or ``None`` when no pixel qualifies.
    """
    width, height = gray.size
    pixels = gray.load()
    left, top = width, height
    right, bottom = -1, -1
    for row in range(height):
        dark_cols = [col for col in range(width) if pixels[col, row] < threshold]
        if not dark_cols:
            continue
        # Rows are visited top to bottom, so only the first hit can lower ``top``.
        top = min(top, row)
        bottom = row
        left = min(left, dark_cols[0])
        right = max(right, dark_cols[-1])
    if right < 0:
        return None
    return left, top, right, bottom
|
|
1097
|
+
|
|
1098
|
+
|
|
1099
|
+
def _ocr_trim_signature_image_bytes(
    image_bytes: bytes,
    *,
    render_type: str | None,
    pad_px: int = 3,
) -> bytes | None:
    """Crop an encoded image after masking OCR text boxes and ruled lines.

    Args:
        image_bytes: Encoded image data readable by ``Image.open``.
        render_type: ``"drawn"`` or ``"wet"`` (case-insensitive) selects the
            colour-aware path; any other value, including ``None``, selects
            the grayscale path.
        pad_px: Margin in pixels added around the detected box.

    Returns:
        PNG bytes of the crop, or ``None`` when pytesseract is unavailable, OCR
        yields no boxes, no bounding box can be derived, or the final box is
        degenerate or smaller than the minimum size.
    """
    if pytesseract is None or TesseractOutput is None:
        return None

    image = Image.open(io.BytesIO(image_bytes))
    gray = image.convert("L")
    boxes = _extract_ocr_boxes(gray)
    if not boxes:
        return None

    line_segments, row_density = _find_horizontal_rule_rows(gray)
    vertical_segments, _ = _find_vertical_rule_cols(gray)

    # Filter that selects every OCR box for masking.
    def mask_all(_: _OcrBox) -> bool:
        return True

    render = (render_type or "").lower()
    if render in {"drawn", "wet"}:
        # Colour path: white out rules and all OCR boxes, then look for ink
        # components in a mask that ignores blue pixels.
        cleaned = _whiteout_regions_rgb(
            image,
            boxes=boxes,
            line_segments=line_segments,
            vertical_segments=vertical_segments,
        )
        white_level = _estimate_white_level(cleaned.convert("L"))
        threshold = min(245, max(200, white_level - 10))
        mask = _build_ink_mask(cleaned, threshold=threshold, remove_blue=True)
        bbox = _components_bbox(mask, threshold=200, gap=12, max_edge_ratio=0.95)
    else:
        # Grayscale path: mask rules and every OCR box, then look for components.
        masked_strict = _mask_regions(
            gray,
            boxes=boxes,
            line_segments=line_segments,
            vertical_segments=vertical_segments,
            box_filter=mask_all,
        )

        white_level = _estimate_white_level(masked_strict)
        threshold = min(245, max(200, white_level - 10))
        bbox = _components_bbox(masked_strict, threshold=threshold)

    if bbox is None:
        # First fallback (both render paths): mask fewer boxes, on the
        # grayscale image.
        line_segment = _select_signature_line(line_segments, row_density)
        max_above = max(40, int(gray.height * 0.25))

        def typed_filter(box: _OcrBox) -> bool:
            # True means "mask this box". Label text is always masked. When a
            # line was selected, boxes whose bottom lies within ``max_above``
            # pixels above the line start (or up to 2 px below it) are kept
            # visible and all others are masked. Without a line, only label
            # text is masked.
            if _is_label_text(box.text):
                return True
            if line_segment is not None:
                line_start = line_segment[0]
                keep = (line_start - max_above) <= box.bottom <= (line_start + 2)
                return not keep
            return False

        masked_typed = _mask_regions(
            gray,
            boxes=boxes,
            line_segments=line_segments,
            vertical_segments=vertical_segments,
            box_filter=typed_filter,
        )
        white_level = _estimate_white_level(masked_typed)
        threshold = min(245, max(200, white_level - 10))
        bbox = _components_bbox(masked_typed, threshold=threshold)
        if bbox is None:
            # Second fallback: take the box around the non-label OCR boxes,
            # restricted to the same band near the line when one was selected.
            filtered = [box for box in boxes if not _is_label_text(box.text)]
            if line_segment is not None:
                line_start = line_segment[0]
                filtered = [
                    box
                    for box in filtered
                    if (line_start - max_above) <= box.bottom <= (line_start + 2)
                ]
            fallback_bbox = _bbox_from_boxes(filtered)
            if fallback_bbox is None:
                return None
            x0, y0, x1, y1 = fallback_bbox
        else:
            x0, y0, x1, y1 = bbox
    else:
        x0, y0, x1, y1 = bbox

    # For drawn/wet renders the box is passed through _trim_bbox_by_ocr_boxes
    # both before and after padding (the second time with min_gap=2).
    if render in {"drawn", "wet"}:
        x0, y0, x1, y1 = _trim_bbox_by_ocr_boxes((x0, y0, x1, y1), boxes)
    width, height = gray.size
    x0 = max(0, x0 - pad_px)
    y0 = max(0, y0 - pad_px)
    x1 = min(width - 1, x1 + pad_px)
    y1 = min(height - 1, y1 + pad_px)

    if render in {"drawn", "wet"}:
        x0, y0, x1, y1 = _trim_bbox_by_ocr_boxes((x0, y0, x1, y1), boxes, min_gap=2)

    # Reject degenerate boxes and boxes narrower than max(8 px, 8% of width)
    # or shorter than max(6 px, 8% of height).
    if x1 <= x0 or y1 <= y0:
        return None
    if (x1 - x0) < max(8, int(width * 0.08)) or (y1 - y0) < max(6, int(height * 0.08)):
        return None

    # The box is inclusive; crop() takes an exclusive lower-right corner.
    cropped = image.crop((x0, y0, x1 + 1, y1 + 1))
    if render in {"drawn", "wet"}:
        cropped = _trim_border_lines(cropped, density_ratio=0.55, edge_ratio=0.25)
        cropped = _tighten_to_ink_components_on_line(cropped, remove_blue=True, pad_px=1)
    else:
        cropped = _trim_border_lines(cropped, density_ratio=0.65, edge_ratio=0.2)
        cropped = _tighten_to_ink_components(cropped, remove_blue=False, pad_px=1)
    buffer = io.BytesIO()
    cropped.save(buffer, format="PNG")
    return buffer.getvalue()
|
|
1212
|
+
|
|
1213
|
+
|
|
1214
|
+
def _wet_ink_trim_signature_image_bytes(
    image_bytes: bytes,
    *,
    pad_px: int = 4,
) -> bytes | None:
    """Crop an encoded image to its main line of non-blue ink.

    OCR boxes and detected horizontal/vertical rules are painted white, an ink
    mask that ignores blue pixels is built, and the box returned by
    ``_components_bbox_on_line`` is padded, cropped and tightened.

    Args:
        image_bytes: Encoded image data readable by ``Image.open``.
        pad_px: Margin in pixels added around the detected box.

    Returns:
        PNG bytes of the crop, or ``None`` when pytesseract is unavailable, no
        box is found, or the padded box is degenerate, narrower than
        ``max(10, 8% of width)`` or shorter than ``max(6, 8% of height)``.
    """
    if pytesseract is None or TesseractOutput is None:
        return None

    image = Image.open(io.BytesIO(image_bytes))
    gray = image.convert("L")

    ocr_boxes = _extract_ocr_boxes(gray)
    horizontal_rules, _ = _find_horizontal_rule_rows(gray, density_ratio=0.18, max_thickness=6)
    vertical_rules, _ = _find_vertical_rule_cols(gray)
    cleaned = _whiteout_regions_rgb(
        image,
        boxes=ocr_boxes,
        line_segments=horizontal_rules,
        vertical_segments=vertical_rules,
    )
    ink_cutoff = min(245, max(200, _estimate_white_level(cleaned.convert("L")) - 10))
    ink_mask = _build_ink_mask(cleaned, threshold=ink_cutoff, remove_blue=True)
    bbox = _components_bbox_on_line(ink_mask, threshold=200, max_edge_ratio=0.98)
    if bbox is None:
        return None

    left, top, right, bottom = bbox
    width, height = gray.size
    left = max(0, left - pad_px)
    top = max(0, top - pad_px)
    right = min(width - 1, right + pad_px)
    bottom = min(height - 1, bottom + pad_px)
    if right <= left or bottom <= top:
        return None
    too_narrow = (right - left) < max(10, int(width * 0.08))
    too_short = (bottom - top) < max(6, int(height * 0.08))
    if too_narrow or too_short:
        return None

    # The box is inclusive; crop() takes an exclusive lower-right corner.
    cropped = image.crop((left, top, right + 1, bottom + 1))
    cropped = _trim_border_lines(cropped, density_ratio=0.55, edge_ratio=0.25)
    cropped = _tighten_to_ink_components_on_line(cropped, remove_blue=True, pad_px=1)
    out = io.BytesIO()
    cropped.save(out, format="PNG")
    return out.getvalue()
|
|
1257
|
+
|
|
1258
|
+
|
|
1259
|
+
def _select_best_trim(
    original_bytes: bytes,
    candidates: list[bytes],
) -> bytes:
    """Pick the candidate crop with the highest ink ratio.

    Candidates that ``_is_blank_crop`` flags, or whose pixel area is below
    ``max(200, 1% of the original area)``, are ignored. A candidate replaces
    the current leader when its ratio is strictly higher, or when its ratio is
    within 0.01 of the leader's and it is smaller in area (equal area is
    broken by the larger dark count).

    Args:
        original_bytes: Encoded original image; only its dimensions are used.
        candidates: Encoded candidate crops, in priority order.

    Returns:
        The winning candidate, or ``candidates[0]`` when none qualifies.
    """
    original = Image.open(io.BytesIO(original_bytes))
    area_floor = max(200, int(original.width * original.height * 0.01))

    lead_bytes: bytes | None = None
    lead_ratio = -1.0
    lead_area = None
    lead_dark = -1
    for blob in candidates:
        if _is_blank_crop(blob):
            continue
        picture = Image.open(io.BytesIO(blob))
        area = picture.width * picture.height
        if area < area_floor:
            continue
        dark, ratio = _ink_metrics(blob)

        wins = ratio > lead_ratio
        if not wins and lead_area is not None and abs(ratio - lead_ratio) <= 0.01:
            # Near tie on ratio: prefer the tighter crop, then the inkier one.
            wins = area < lead_area or (area == lead_area and dark > lead_dark)
        if wins:
            lead_ratio, lead_area, lead_dark, lead_bytes = ratio, area, dark, blob
    return lead_bytes or candidates[0]
|
|
1294
|
+
|
|
1295
|
+
|
|
1296
|
+
def _select_best_trim_wet(
    original_bytes: bytes,
    candidates: list[bytes],
) -> bytes:
    """Pick a candidate crop for wet signatures, preferring larger near-ties.

    Candidates that ``_is_blank_crop`` flags, whose area is below
    ``max(200, 1% of the original area)``, or whose ink ratio is below 0.0006
    are ignored. A candidate with a strictly higher ratio becomes the leader;
    one within 0.02 of the leader's recorded ratio replaces it only when its
    area is larger (the recorded ratio is then left unchanged).

    Args:
        original_bytes: Encoded original image.
        candidates: Encoded candidate crops, in priority order.

    Returns:
        The winner padded by 3 px via ``_pad_signature_image_bytes``, or the
        result of ``_select_best_trim`` when no candidate qualifies.
    """
    original = Image.open(io.BytesIO(original_bytes))
    area_floor = max(200, int(original.width * original.height * 0.01))
    ratio_floor = 0.0006

    lead_bytes: bytes | None = None
    lead_area = None
    lead_ratio = None
    for blob in candidates:
        if _is_blank_crop(blob):
            continue
        picture = Image.open(io.BytesIO(blob))
        area = picture.width * picture.height
        if area < area_floor:
            continue
        dark, ratio = _ink_metrics(blob)
        if ratio < ratio_floor:
            continue
        if lead_ratio is None or ratio > lead_ratio:
            lead_ratio, lead_area, lead_bytes = ratio, area, blob
        elif lead_area is not None and abs(ratio - lead_ratio) <= 0.02 and area > lead_area:
            lead_area, lead_bytes = area, blob

    if lead_bytes is None:
        return _select_best_trim(original_bytes, candidates)
    return _pad_signature_image_bytes(lead_bytes, pad_px=3)
|
|
1331
|
+
|
|
1332
|
+
|
|
1333
|
+
def _pad_signature_image_bytes(image_bytes: bytes, *, pad_px: int) -> bytes:
    """Surround an encoded image with a white border and re-encode it as PNG.

    Args:
        image_bytes: Encoded image data readable by ``Image.open``.
        pad_px: Border width in pixels; values <= 0 return the input untouched.

    Returns:
        PNG bytes of the RGB-converted image centred on a white canvas that is
        ``2 * pad_px`` larger in each dimension, or ``image_bytes`` itself when
        no padding is requested.
    """
    if pad_px <= 0:
        return image_bytes
    source = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    src_width, src_height = source.size
    canvas_size = (src_width + pad_px * 2, src_height + pad_px * 2)
    canvas = Image.new("RGB", canvas_size, (255, 255, 255))
    canvas.paste(source, (pad_px, pad_px))
    out = io.BytesIO()
    canvas.save(out, format="PNG")
    return out.getvalue()
|
|
1343
|
+
|
|
1344
|
+
|
|
1345
|
+
def _trim_signature_image_bytes(
    image_bytes: bytes,
    *,
    render_type: str | None = None,
    pad_px: int = 4,
    gap_px: int = 4,
    min_density_ratio: float = 0.004,
) -> bytes:
    """Trim an encoded signature image by trying several strategies and picking one.

    The candidate list always starts with the untouched input and ends with the
    heuristic trim. In between come the wet-ink trim (only when ``render_type``
    is ``"wet"``, case-insensitive) and the OCR trim, each included only when
    it returns a result.

    Args:
        image_bytes: Encoded image data.
        render_type: Render kind, forwarded unchanged to the OCR trim.
        pad_px: Padding forwarded to the heuristic trim.
        gap_px: Gap tolerance forwarded to the heuristic trim.
        min_density_ratio: Density ratio forwarded to the heuristic trim.

    Returns:
        The bytes chosen by ``_select_best_trim_wet`` for wet renders, or by
        ``_select_best_trim`` otherwise.
    """
    is_wet = (render_type or "").lower() == "wet"

    optional_trims = []
    if is_wet:
        optional_trims.append(_wet_ink_trim_signature_image_bytes(image_bytes))
    optional_trims.append(_ocr_trim_signature_image_bytes(image_bytes, render_type=render_type))

    candidates: list[bytes] = [image_bytes]
    candidates.extend(trimmed for trimmed in optional_trims if trimmed is not None)
    candidates.append(
        _heuristic_trim_signature_image_bytes(
            image_bytes,
            pad_px=pad_px,
            gap_px=gap_px,
            min_density_ratio=min_density_ratio,
        )
    )

    chooser = _select_best_trim_wet if is_wet else _select_best_trim
    return chooser(image_bytes, candidates)
|
|
1373
|
+
|
|
1374
|
+
|
|
1375
|
+
def _heuristic_trim_signature_image_bytes(
    image_bytes: bytes,
    *,
    pad_px: int = 4,
    gap_px: int = 4,
    min_density_ratio: float = 0.004,
) -> bytes:
    """Crop an encoded image to its dominant horizontal ink band.

    Rows are scored by how many pixels fall below each of several thresholds
    derived from the estimated white level. Consecutive dense rows form
    segments, nearby segments are merged into bands, and the best-scoring band
    (taller and inkier bands score higher, with a bonus for bands just above a
    detected horizontal rule) is expanded, padded and cropped.

    Args:
        image_bytes: Encoded image data readable by ``Image.open``.
        pad_px: Margin in pixels added around the selected band.
        gap_px: Maximum row distance at which neighbouring segments are merged.
        min_density_ratio: Fraction of the width a row's dark-pixel count must
            reach to count as dense (with a floor of 2 pixels).

    Returns:
        PNG bytes of the crop. ``image_bytes`` is returned unchanged when the
        estimated white level is below 200, no band is found, or the final box
        is degenerate, narrower than ``max(10, 20% of width)`` or shorter than
        ``max(6, 8% of height)``.
    """
    image = Image.open(io.BytesIO(image_bytes))
    gray = image.convert("L")
    width, height = gray.size

    # White level: the smallest gray value at which the cumulative histogram
    # covers 99.5% of all pixels (255 if that point is never reached).
    histogram = gray.histogram()
    total_pixels = width * height
    cutoff = int(total_pixels * 0.995)
    cumulative = 0
    white_level = 255
    for idx, count in enumerate(histogram):
        cumulative += count
        if cumulative >= cutoff:
            white_level = idx
            break

    if white_level < 200:
        return image_bytes

    # Thresholds run from strictest (white_level - 6) to loosest (white_level),
    # each clamped to 200..254; clamping can produce duplicate values.
    thresholds = [min(254, max(200, white_level - delta)) for delta in (6, 4, 2, 1, 0)]
    min_density = max(2, int(width * min_density_ratio))
    pixels = gray.load()

    # Per-threshold count of dark pixels in every row.
    row_densities: dict[int, list[int]] = {}
    for threshold in thresholds:
        row_density = []
        for y in range(height):
            dark = sum(1 for x in range(width) if pixels[x, y] < threshold)
            row_density.append(dark)
        row_densities[threshold] = row_density

    # Rule detection uses the loosest threshold. When a rule is found, band
    # scanning stops on the row just above it, and downward expansion is capped
    # slightly below the rule's last row.
    line_bounds = _detect_horizontal_rule_cutoff(row_densities[thresholds[-1]], width)
    scan_limit = None
    descender_limit = height - 1
    if line_bounds is not None:
        line_start, line_end = line_bounds
        scan_limit = max(0, line_start - 1)
        descender_limit = min(height - 1, line_end + max(2, int(height * 0.02)))

    min_band_height = max(4, int(height * 0.02))
    best = None
    best_small = None
    best_small_threshold = None
    best_threshold = None
    line_threshold = int(width * 0.6)
    for threshold in thresholds:
        row_density = row_densities[threshold]
        # Collect inclusive (start, end) runs of rows whose density reaches
        # min_density, stopping once past scan_limit.
        segments: list[tuple[int, int]] = []
        start: int | None = None
        for y, dark in enumerate(row_density):
            if scan_limit is not None and y > scan_limit:
                if start is not None:
                    segments.append((start, y - 1))
                    start = None
                break
            if dark >= min_density:
                if start is None:
                    start = y
            else:
                if start is not None:
                    segments.append((start, y - 1))
                    start = None
        if start is not None:
            segments.append((start, height - 1))

        if not segments:
            continue

        # Merge segments whose start is within gap_px rows of the previous end.
        merged: list[list[int]] = []
        for seg in segments:
            if not merged:
                merged.append([seg[0], seg[1]])
                continue
            if seg[0] - merged[-1][1] <= gap_px:
                merged[-1][1] = seg[1]
            else:
                merged.append([seg[0], seg[1]])

        # Measure each merged band: horizontal extent, dark-pixel total, score.
        candidates = []
        for y0, y1 in merged:
            min_x, max_x = width, -1
            total_dark = 0
            for y in range(y0, y1 + 1):
                for x in range(width):
                    if pixels[x, y] < threshold:
                        total_dark += 1
                        if x < min_x:
                            min_x = x
                        if x > max_x:
                            max_x = x
            if max_x < 0:
                continue
            band_height = y1 - y0 + 1
            band_width = max_x - min_x + 1
            score = total_dark * (band_height**1.3)
            if line_bounds is not None:
                # Up to +50% for bands ending right above the rule; the bonus
                # decays with the distance between band bottom and rule start.
                distance = max(0, line_bounds[0] - y1)
                proximity = 1.0 / (1.0 + (distance / 20.0))
                score *= 1.0 + 0.5 * proximity
            candidates.append(
                {
                    "y0": y0,
                    "y1": y1,
                    "min_x": min_x,
                    "max_x": max_x,
                    "total": total_dark,
                    "height": band_height,
                    "width": band_width,
                    "score": score,
                }
            )

        if not candidates:
            continue

        # Only the top-scoring band per threshold competes. Bands shorter than
        # min_band_height are tracked separately and serve as a fallback.
        candidates.sort(key=lambda item: item["score"], reverse=True)
        top_candidate = candidates[0]
        if top_candidate["height"] >= min_band_height:
            if best is None or top_candidate["score"] > best["score"]:
                best = top_candidate
                best_threshold = threshold
        else:
            if best_small is None or top_candidate["score"] > best_small["score"]:
                best_small = top_candidate
                best_small_threshold = threshold

    if best is None:
        best = best_small
        best_threshold = best_small_threshold

    if best is None:
        return image_bytes

    expansion_density = row_densities.get(best_threshold, row_densities[thresholds[-1]])
    expand_threshold = max(1, int(min_density * 0.4))
    y0 = best["y0"]
    y1 = best["y1"]

    # Grow the band while adjacent rows still hold ink at the relaxed threshold.
    while y0 > 0 and expansion_density[y0 - 1] >= expand_threshold:
        y0 -= 1
    while y1 < descender_limit and expansion_density[y1 + 1] >= expand_threshold:
        y1 += 1

    # Then extend the top over blank rows, stopping at the next inked row or
    # once y0 lies max_white_pad rows above the band's original top.
    max_white_pad = max(8, int(height * 0.04))
    while y0 > 0 and (best["y0"] - y0) < max_white_pad:
        if expansion_density[y0 - 1] >= expand_threshold:
            break
        y0 -= 1

    # Recompute the horizontal extent over the expanded rows at the loosest
    # threshold. When a rule was detected, rows dense enough to be a rule
    # (>= 60% of the width) are skipped.
    min_x, max_x = width, -1
    skip_line_rows = line_bounds is not None
    for y in range(y0, y1 + 1):
        if skip_line_rows and expansion_density[y] >= line_threshold:
            continue
        for x in range(width):
            if pixels[x, y] < thresholds[-1]:
                if x < min_x:
                    min_x = x
                if x > max_x:
                    max_x = x
    # If no dark pixel was found, ``best`` keeps its pre-expansion geometry.
    if max_x >= 0:
        best = {
            "y0": y0,
            "y1": y1,
            "min_x": min_x,
            "max_x": max_x,
        }

    x0 = max(0, best["min_x"] - pad_px)
    x1 = min(width - 1, best["max_x"] + pad_px)
    y0 = max(0, best["y0"] - pad_px)
    y1 = min(height - 1, best["y1"] + pad_px)

    if x1 <= x0 or y1 <= y0:
        return image_bytes
    if (x1 - x0) < max(10, int(width * 0.2)) or (y1 - y0) < max(6, int(height * 0.08)):
        return image_bytes

    # The box is inclusive; crop() takes an exclusive lower-right corner.
    cropped = image.crop((x0, y0, x1 + 1, y1 + 1))
    cropped = _trim_below_horizontal_rule(cropped)
    cropped = _tighten_to_ink_band(cropped)
    buffer = io.BytesIO()
    cropped.save(buffer, format="PNG")
    return buffer.getvalue()
|
|
1565
|
+
|
|
1566
|
+
|
|
1567
|
+
def _trim_below_horizontal_rule(
    image: Image.Image,
    *,
    threshold: int = 240,
) -> Image.Image:
    """Cut away content below a detected horizontal rule when it is substantial.

    Args:
        image: Image to inspect.
        threshold: Gray value below which a pixel counts as dark.

    Returns:
        ``image`` cropped to end shortly after the rule, keeping
        ``max(2, 1% of height)`` rows beneath it. ``image`` is returned
        unchanged when it has no area, no rule is detected, fewer than 40 dark
        pixels lie above the rule, the dark pixels below it do not exceed
        ``max(40, 20% of those above)``, or the cut would fall on the first or
        last row.
    """
    gray = image.convert("L")
    width, height = gray.size
    if not (width and height):
        return image

    pixels = gray.load()
    # Number of dark pixels in each row.
    dark_per_row = [
        sum(1 for col in range(width) if pixels[col, row] < threshold)
        for row in range(height)
    ]

    rule = _detect_horizontal_rule_cutoff(dark_per_row, width)
    if rule is None:
        return image
    rule_top, rule_bottom = rule

    ink_above = sum(dark_per_row[:rule_top])
    ink_below = sum(dark_per_row[rule_bottom + 1 :])
    if ink_above < 40 or ink_below <= max(40, int(ink_above * 0.2)):
        return image

    cut_row = min(height - 1, rule_bottom + max(2, int(height * 0.01)))
    if not 0 < cut_row < height - 1:
        return image
    return image.crop((0, 0, width, cut_row + 1))
|
|
1599
|
+
|
|
1600
|
+
|
|
1601
|
+
def _tighten_to_ink_band(
    image: Image.Image,
    *,
    threshold: int = 240,
    pad_px: int = 2,
    min_density_ratio: float = 0.004,
) -> Image.Image:
    """Crop ``image`` to a single band of inked rows, preferring one near a rule.

    Rows whose dark-pixel count reaches 60% of the width are treated as rule
    rows and never belong to a band. The remaining dense rows form segments.
    With rule rows present, segments above a rule (within ``max_gap`` rows)
    are tried first, then segments below; if neither yields a match, or there
    are no rule rows, the segment with the most dark pixels wins.

    Args:
        image: Image to tighten.
        threshold: Gray value below which a pixel counts as dark.
        pad_px: Margin in pixels added around the selected band.
        min_density_ratio: Fraction of the width a row's dark-pixel count must
            reach to count as dense (with a floor of 2 pixels).

    Returns:
        The cropped image, or ``image`` unchanged when it has no area, holds no
        dark pixels, yields no segment, or produces a degenerate box.
    """
    gray = image.convert("L")
    width, height = gray.size
    if width == 0 or height == 0:
        return image

    pixels = gray.load()
    # Number of dark pixels in each row.
    row_density = []
    for y in range(height):
        dark = sum(1 for x in range(width) if pixels[x, y] < threshold)
        row_density.append(dark)

    line_threshold = int(width * 0.6)
    line_rows = {i for i, d in enumerate(row_density) if d >= line_threshold}
    if not line_rows and max(row_density, default=0) == 0:
        return image

    # Inclusive (start, end) runs of dense rows; rule rows always end a run.
    min_density = max(2, int(width * min_density_ratio))
    segments: list[tuple[int, int]] = []
    start: int | None = None
    for y, dark in enumerate(row_density):
        if y in line_rows:
            if start is not None:
                segments.append((start, y - 1))
                start = None
            continue
        if dark >= min_density:
            if start is None:
                start = y
        else:
            if start is not None:
                segments.append((start, y - 1))
                start = None
    if start is not None:
        segments.append((start, height - 1))

    if not segments:
        return image

    # Score of a segment: total dark pixels over its rows.
    def segment_score(y0: int, y1: int) -> int:
        return sum(row_density[y0 : y1 + 1])

    # Group consecutive rule rows into inclusive (first, last) ranges.
    line_groups: list[tuple[int, int]] = []
    if line_rows:
        sorted_rows = sorted(line_rows)
        group_start = sorted_rows[0]
        prev = sorted_rows[0]
        for row in sorted_rows[1:]:
            if row - prev > 1:
                line_groups.append((group_start, prev))
                group_start = row
            prev = row
        line_groups.append((group_start, prev))

    max_gap = min(int(height * 0.35), 140)
    best_segment: tuple[int, int] | None = None
    best_score = None

    def consider_segments(above: bool) -> bool:
        # Evaluate every (rule, segment) pair on one side of the rule and keep
        # the best weighted score in the enclosing best_segment/best_score.
        # Returns True when this call improved the best segment.
        nonlocal best_segment, best_score
        found = False
        for line_start, line_end in line_groups:
            for y0, y1 in segments:
                if above:
                    gap = line_start - y1
                    if y1 >= line_start or gap > max_gap:
                        continue
                else:
                    gap = y0 - line_end
                    if y0 <= line_end or gap > max_gap:
                        continue

                # Single-row segments are ignored near a rule.
                height_span = y1 - y0 + 1
                if height_span < 2:
                    continue
                score = segment_score(y0, y1)
                # Up to +50% for segments adjacent to the rule, decaying with gap.
                proximity = 1.0 / (1.0 + (gap / 25.0))
                weighted = score * (1.0 + 0.5 * proximity)
                if best_score is None or weighted > best_score:
                    best_score = weighted
                    best_segment = (y0, y1)
                    found = True
        return found

    # Segments above a rule take priority; below is tried only if none matched.
    if line_groups:
        if not consider_segments(True):
            consider_segments(False)

    # Fallback: the segment with the highest raw dark-pixel total.
    if best_segment is None:
        for y0, y1 in segments:
            score = segment_score(y0, y1)
            if best_score is None or score > best_score:
                best_score = score
                best_segment = (y0, y1)

    if best_segment is None:
        return image

    # Horizontal extent of dark pixels within the chosen rows.
    y0, y1 = best_segment
    min_x, max_x = width, -1
    for y in range(y0, y1 + 1):
        if y in line_rows:
            continue
        for x in range(width):
            if pixels[x, y] < threshold:
                if x < min_x:
                    min_x = x
                if x > max_x:
                    max_x = x

    if max_x < 0:
        return image

    x0 = max(0, min_x - pad_px)
    x1 = min(width - 1, max_x + pad_px)
    y0 = max(0, y0 - pad_px)
    y1 = min(height - 1, y1 + pad_px)
    if x1 <= x0 or y1 <= y0:
        return image
    # The box is inclusive; crop() takes an exclusive lower-right corner.
    return image.crop((x0, y0, x1 + 1, y1 + 1))
|
|
1727
|
+
|
|
1728
|
+
|
|
1729
|
+
def _detect_horizontal_rule_cutoff(
|
|
1730
|
+
row_density: list[int],
|
|
1731
|
+
width: int,
|
|
1732
|
+
) -> tuple[int, int] | None:
|
|
1733
|
+
if not row_density:
|
|
1734
|
+
return None
|
|
1735
|
+
line_threshold = int(width * 0.6)
|
|
1736
|
+
max_thickness = 4
|
|
1737
|
+
segments: list[tuple[int, int]] = []
|
|
1738
|
+
start = None
|
|
1739
|
+
for y, density in enumerate(row_density):
|
|
1740
|
+
if density >= line_threshold:
|
|
1741
|
+
if start is None:
|
|
1742
|
+
start = y
|
|
1743
|
+
else:
|
|
1744
|
+
if start is not None:
|
|
1745
|
+
segments.append((start, y - 1))
|
|
1746
|
+
start = None
|
|
1747
|
+
if start is not None:
|
|
1748
|
+
segments.append((start, len(row_density) - 1))
|
|
1749
|
+
|
|
1750
|
+
if not segments:
|
|
1751
|
+
return None
|
|
1752
|
+
|
|
1753
|
+
total_dark = sum(row_density)
|
|
1754
|
+
if total_dark <= 0:
|
|
1755
|
+
return None
|
|
1756
|
+
|
|
1757
|
+
min_above_dark = max(40, int(total_dark * 0.02))
|
|
1758
|
+
for y0, y1 in segments:
|
|
1759
|
+
thickness = y1 - y0 + 1
|
|
1760
|
+
if thickness > max_thickness:
|
|
1761
|
+
continue
|
|
1762
|
+
above_dark = sum(row_density[:y0])
|
|
1763
|
+
below_dark = sum(row_density[y1 + 1 :])
|
|
1764
|
+
if above_dark < 40:
|
|
1765
|
+
continue
|
|
1766
|
+
midpoint_ratio = ((y0 + y1) / 2.0) / max(1, len(row_density))
|
|
1767
|
+
if above_dark >= min_above_dark and midpoint_ratio >= 0.2:
|
|
1768
|
+
return (y0, y1)
|
|
1769
|
+
if midpoint_ratio >= 0.35:
|
|
1770
|
+
return (y0, y1)
|
|
1771
|
+
if above_dark >= max(40, int(below_dark * 0.3)):
|
|
1772
|
+
return (y0, y1)
|
|
1773
|
+
return None
|
|
1774
|
+
|
|
1775
|
+
|
|
224
1776
|
def _to_clip_rect(page, bbox: tuple[float, float, float, float]):
|
|
225
1777
|
width = float(page.rect.width)
|
|
226
1778
|
height = float(page.rect.height)
|
|
@@ -237,6 +1789,31 @@ def _to_clip_rect(page, bbox: tuple[float, float, float, float]):
|
|
|
237
1789
|
return fitz.Rect(left, top, right, bottom)
|
|
238
1790
|
|
|
239
1791
|
|
|
1792
|
+
def _expand_wet_bbox(page, bbox: tuple[float, float, float, float]) -> tuple[float, float, float, float] | None:
    """Return a padded version of ``bbox``, or ``None`` when it should stay as is.

    ``bbox`` is first turned into a clip rectangle with ``_to_clip_rect``.
    Boxes that are degenerate, or already at least 45% of the page width, are
    rejected.  Otherwise the box gains a left margin, is stretched to the
    right page edge, gets vertical padding on both sides (clamped to the
    page), and is converted back with ``_rect_to_pdf_tuple``.
    """
    clip = _to_clip_rect(page, bbox)
    if clip is None:
        return None

    base = fitz.Rect(clip)
    base_width = base.width
    base_height = base.height
    if base_width <= 0 or base_height <= 0:
        return None

    bounds = page.rect
    if base_width >= bounds.width * 0.45:
        return None

    margin_x = max(6.0, base_width * 0.15)
    margin_y = max(12.0, base_height * 0.8)

    left = max(bounds.x0, base.x0 - margin_x)
    # Given the 45% guard above, the first branch is the one taken in practice.
    if base_width < bounds.width * 0.7:
        right = bounds.x1
    else:
        right = base.x1
    top = max(bounds.y0, base.y0 - margin_y)
    bottom = min(bounds.y1, base.y1 + margin_y)

    if right <= left or bottom <= top:
        return None
    return _rect_to_pdf_tuple(fitz.Rect(left, top, right, bottom), bounds.height)
|
|
1815
|
+
|
|
1816
|
+
|
|
240
1817
|
def _clamp(value: float, lower: float, upper: float) -> float:
    """Limit ``value`` to ``upper`` first and then raise it to ``lower``."""
    capped = min(value, upper)
    return max(lower, capped)
|
|
242
1819
|
|
|
@@ -251,3 +1828,787 @@ def _slugify(value: str) -> str:
|
|
|
251
1828
|
cleaned = re.sub(r"[^A-Za-z0-9_-]+", "_", value.strip().lower())
|
|
252
1829
|
cleaned = cleaned.strip("_")
|
|
253
1830
|
return cleaned or "signature"
|
|
1831
|
+
|
|
1832
|
+
|
|
1833
|
+
def _update_signature_pages(file_result: FileResult) -> None:
    """Rewrite ``file_result.SignaturePages`` from the pages of its signatures.

    Falsy page values are ignored; the remaining distinct pages are sorted and
    joined with commas.
    """
    distinct_pages = set()
    for signature in file_result.Signatures:
        if signature.Page:
            distinct_pages.add(signature.Page)
    file_result.SignaturePages = ",".join(map(str, sorted(distinct_pages)))
|
|
1836
|
+
|
|
1837
|
+
|
|
1838
|
+
def _bbox_has_area(bbox: tuple[float, float, float, float] | None) -> bool:
    """Tell whether ``bbox`` is a four-value box with positive width and height.

    ``None``, empty or wrongly sized inputs and the all-zero placeholder box
    are reported as having no area.
    """
    if not bbox or len(bbox) != 4:
        return False
    left, low, right, high = bbox
    if left == 0 and low == 0 and right == 0 and high == 0:
        return False
    return (right - left) > 0 and (high - low) > 0
|
|
1843
|
+
|
|
1844
|
+
|
|
1845
|
+
def _is_pseudo_signature(signature: Signature) -> bool:
|
|
1846
|
+
if _has_image_evidence(signature):
|
|
1847
|
+
return False
|
|
1848
|
+
if signature.FieldName == "vendor_or_acro_detected":
|
|
1849
|
+
return True
|
|
1850
|
+
hint = (signature.Hint or "").lower()
|
|
1851
|
+
if "vendororacronly" in hint:
|
|
1852
|
+
return True
|
|
1853
|
+
return any("pseudo:true" == token for token in signature.Evidence)
|
|
1854
|
+
|
|
1855
|
+
|
|
1856
|
+
def _has_image_evidence(signature: Signature) -> bool:
|
|
1857
|
+
return bool(signature.Evidence and any("image:retainer" == token for token in signature.Evidence))
|
|
1858
|
+
|
|
1859
|
+
|
|
1860
|
+
def _collect_image_rects(page) -> list[object]:
    """Gather the placement rectangles of every image listed on ``page``.

    Best effort: if listing the images fails an empty list is returned, and an
    image whose rectangles cannot be fetched is skipped.
    """
    found: list[object] = []
    try:
        image_entries = page.get_images(full=True)
    except Exception:
        return found
    for entry in image_entries:
        # The first item of each entry is handed to get_image_rects.
        xref = entry[0]
        try:
            found.extend(page.get_image_rects(xref))
        except Exception:
            continue
    return found
|
|
1873
|
+
|
|
1874
|
+
|
|
1875
|
+
def _filter_image_rects(rects: list[object], page_rect) -> list[object]:
    """Keep image rectangles with positive area up to 35% of the page area.

    Every surviving entry is returned as a fresh ``fitz.Rect``.
    """
    if not rects:
        return []
    area_cap = page_rect.width * page_rect.height * 0.35
    kept: list[object] = []
    for raw in rects:
        normalized = fitz.Rect(raw)
        area = normalized.get_area()
        if area <= 0 or area > area_cap:
            continue
        kept.append(normalized)
    return kept
|
|
1889
|
+
|
|
1890
|
+
|
|
1891
|
+
def _select_image_bbox(
    image_rects: list[object],
    signature: Signature,
    role_rects: dict[str, list[object]],
    label_rects: list[object],
    page_rect,
    *,
    signature_index: int,
    signature_count: int,
) -> tuple[float, float, float, float] | None:
    """Choose the image rectangle that best matches ``signature``.

    Candidates are first narrowed by ``_filter_image_rects``.  The anchors
    depend on the signature role: firm/attorney roles use the "firm" role
    rectangles, client/patient/representative roles use the "client" role
    rectangles plus the generic labels, and anything else uses the generic
    labels alone.  With anchors the nearest candidate wins, provided its score
    stays within ``min(page height * 0.5, 260)``; without anchors the choice
    falls back to vertical ordering.  The result is converted with
    ``_rect_to_pdf_tuple``, or ``None`` when nothing usable is found.
    """
    if not image_rects:
        return None
    usable = _filter_image_rects(image_rects, page_rect)
    if not usable:
        return None

    role = (signature.Role or "").lower()
    if role in {"firm", "attorney"}:
        anchors: list[object] = role_rects.get("firm", [])
    elif role in {"client", "patient", "representative"}:
        anchors = role_rects.get("client", []) + label_rects
    else:
        anchors = label_rects

    if anchors:
        chosen, score = _select_rect_near_labels_with_score(usable, anchors)
        distance_cap = min(page_rect.height * 0.5, 260.0)
        if chosen is None or score is None or score > distance_cap:
            return None
    else:
        chosen = _select_rect_by_order(usable, signature, signature_index, signature_count)
        if chosen is None:
            return None

    bbox = _rect_to_pdf_tuple(chosen, page_rect.height)
    return bbox if _bbox_has_area(bbox) else None
|
|
1930
|
+
|
|
1931
|
+
|
|
1932
|
+
def _refine_bbox_with_image_rects(
    page,
    bbox: tuple[float, float, float, float],
    *,
    image_rects: list[object],
    min_overlap: float = 0.6,
    min_center_overlap: float = 0.2,
    min_area: float = 20.0,
) -> tuple[float, float, float, float] | None:
    """Replace ``bbox`` with the image rectangle that best coincides with it.

    An image rectangle qualifies when its area exceeds ``min_area`` and either
    at least ``min_overlap`` of it lies inside the clip made from ``bbox``, or
    its centre lies inside the clip and at least ``min_center_overlap`` of it
    overlaps.  The highest overlap fraction wins; a later candidate whose
    overlap is within 0.05 of the recorded best takes over when its centre is
    closer to the clip centre.  Returns ``None`` when the clip cannot be built
    or nothing qualifies.
    """
    clip = _to_clip_rect(page, bbox)
    if clip is None:
        return None

    clip_cx = (clip.x0 + clip.x1) / 2.0
    clip_cy = (clip.y0 + clip.y1) / 2.0

    winner = None
    winner_overlap = None
    winner_distance = None

    for candidate in image_rects:
        try:
            shared = candidate & clip
        except Exception:
            continue
        if shared is None or shared.get_area() <= 0:
            continue
        candidate_area = candidate.get_area()
        if candidate_area <= min_area:
            continue

        overlap = shared.get_area() / max(1.0, candidate_area)
        cx = (candidate.x0 + candidate.x1) / 2.0
        cy = (candidate.y0 + candidate.y1) / 2.0
        centered = (clip.x0 <= cx <= clip.x1) and (clip.y0 <= cy <= clip.y1)
        if overlap < min_overlap and not (centered and overlap >= min_center_overlap):
            continue

        # Squared distance is enough for comparing candidates.
        distance = (cx - clip_cx) ** 2 + (cy - clip_cy) ** 2
        if winner_overlap is None or overlap > winner_overlap:
            winner = candidate
            winner_overlap = overlap
            winner_distance = distance
        elif (
            abs(overlap - winner_overlap) <= 0.05
            and winner_distance is not None
            and distance < winner_distance
        ):
            # Near-tie: prefer the closer centre; the recorded overlap stays.
            winner = candidate
            winner_distance = distance

    if winner is None:
        return None
    return _rect_to_pdf_tuple(winner, page.rect.height)
|
|
1982
|
+
|
|
1983
|
+
|
|
1984
|
+
def _ink_metrics(image_bytes: bytes, *, threshold: int = 240) -> tuple[int, float]:
    """Measure how much of an encoded image is darker than ``threshold``.

    The image is decoded, converted to 8-bit grayscale and summarised by its
    histogram.

    Returns:
        ``(dark, ratio)`` where ``dark`` is the number of pixels whose gray
        level is below ``threshold`` and ``ratio`` is ``dark`` divided by the
        total pixel count (``0.0`` for an image without pixels).
    """
    # Close the decoder handle as soon as the grayscale copy exists instead of
    # leaving it to garbage collection.
    with Image.open(io.BytesIO(image_bytes)) as image:
        gray = image.convert("L")
    histogram = gray.histogram()
    total = sum(histogram)
    dark = sum(histogram[:threshold])
    ratio = (dark / total) if total else 0.0
    return dark, ratio
|
|
1992
|
+
|
|
1993
|
+
|
|
1994
|
+
def _is_blank_crop(
    image_bytes: bytes,
    *,
    min_pixels: int = 40,
    min_ratio: float = 0.0005,
) -> bool:
    """Report whether a crop holds next to no ink.

    The crop is blank only when both the dark-pixel count and the dark-pixel
    ratio from ``_ink_metrics`` fall below their respective minimums.
    """
    dark_pixels, dark_ratio = _ink_metrics(image_bytes)
    if dark_pixels >= min_pixels:
        return False
    return dark_ratio < min_ratio
|
|
2002
|
+
|
|
2003
|
+
|
|
2004
|
+
def _resolve_signature_bbox(
    page,
    signature: Signature,
    *,
    page_cache: dict[int, dict[str, object]],
    signature_count: int,
    signature_index: int,
    page_number: int,
) -> tuple[float, float, float, float] | None:
    """Work out a bounding box for ``signature`` on ``page``.

    Strategies are tried in a fixed order and the first one that produces a
    box with area wins:

    1. an image rectangle (only for "typed" / "drawn" render types),
    2. the area derived from an underscore line,
    3. form widgets: by field name, then signature widgets, then
       signature-shaped widgets, then a lone widget,
    4. annotations: by name, then signature-shaped candidates,
    5. the area derived from a signature label,
    6. the area around page text matching the field name.

    Per-page lookups are memoised in ``page_cache[page_number]``.  Returns
    ``None`` when every strategy fails.
    """
    per_page = page_cache.setdefault(page_number, {})
    field_name = (signature.FieldName or "").strip()

    def usable_bbox(rect):
        """Convert ``rect`` to a bbox tuple; ``None`` if it has no area."""
        converted = _rect_to_pdf_tuple(rect, page.rect.height)
        return converted if _bbox_has_area(converted) else None

    label_rects = _get_label_rects(page_cache, page_number, page)
    role_rects = _get_role_label_rects(page_cache, page_number, page)
    line_rects = _get_line_rects(page_cache, page_number, page)

    # 1. Image rectangles, only for render types that may be backed by one.
    if signature.RenderType in {"typed", "drawn"}:
        image_bbox = _select_image_bbox(
            _get_image_rects(page_cache, page_number, page),
            signature,
            role_rects,
            label_rects,
            page.rect,
            signature_index=signature_index,
            signature_count=signature_count,
        )
        if image_bbox is not None:
            return image_bbox

    # 2. Underscore lines.
    line_bbox = _select_line_bbox(line_rects, signature, role_rects, label_rects, page.rect)
    if line_bbox is not None:
        return line_bbox

    # 3. Form widgets (collected once per page).
    widget_cache = per_page.get("widget_cache")
    if widget_cache is None:
        collected_map, collected_sig, collected_all = _collect_widget_rects(page)
        widget_cache = {
            "map": collected_map,
            "sig_rects": collected_sig,
            "all_rects": collected_all,
        }
        per_page["widget_cache"] = widget_cache
    widget_map = widget_cache["map"]
    widget_sig_rects = widget_cache["sig_rects"]
    widget_rects = widget_cache["all_rects"]

    if field_name:
        named_widget = widget_map.get(field_name)
        if named_widget is not None:
            bbox = usable_bbox(named_widget)
            if bbox is not None:
                return bbox

    picked = _select_rect_candidate(
        widget_sig_rects,
        signature,
        signature_index,
        signature_count,
        label_rects,
    )
    if picked is not None:
        bbox = usable_bbox(picked)
        if bbox is not None:
            return bbox

    widget_candidates = _filter_signature_like_rects(widget_rects, page.rect)
    picked = _select_rect_candidate(
        widget_candidates,
        signature,
        signature_index,
        signature_count,
        label_rects,
    )
    if picked is not None:
        bbox = usable_bbox(picked)
        if bbox is not None:
            return bbox
    if not widget_candidates and len(widget_rects) == 1:
        # A single widget on the page is accepted even if its shape is unusual.
        bbox = usable_bbox(widget_rects[0])
        if bbox is not None:
            return bbox

    # 4. Annotations (collected once per page).
    annot_cache = per_page.get("annot_cache")
    if annot_cache is None:
        collected_map, collected_rects = _collect_annot_rects(page)
        annot_cache = {"map": collected_map, "rects": collected_rects}
        per_page["annot_cache"] = annot_cache
    annot_map = annot_cache["map"]
    annot_rects = annot_cache["rects"]

    if field_name:
        named_annot = annot_map.get(field_name)
        if named_annot is not None:
            bbox = usable_bbox(named_annot)
            if bbox is not None:
                return bbox

    annot_candidates = _filter_signature_like_rects(annot_rects, page.rect)
    if not annot_candidates and len(annot_rects) == 1:
        annot_candidates = [fitz.Rect(annot_rects[0])]
    picked = _select_rect_candidate(
        annot_candidates,
        signature,
        signature_index,
        signature_count,
        label_rects,
    )
    if picked is not None:
        bbox = usable_bbox(picked)
        if bbox is not None:
            return bbox

    # 5. Area derived from a signature label.
    if label_rects:
        label_target = _select_rect_by_order(label_rects, signature, signature_index, signature_count)
        if label_target is not None:
            grown = _expand_rect_from_label(label_target, page.rect)
            if grown is not None:
                bbox = usable_bbox(grown)
                if bbox is not None:
                    return bbox

    # 6. Page text that matches the field name.
    if field_name:
        fieldname_cache = per_page.get("fieldname_cache")
        if fieldname_cache is None:
            fieldname_cache = {}
            per_page["fieldname_cache"] = fieldname_cache
        text_hits = fieldname_cache.get(field_name)
        if text_hits is None:
            text_hits = _find_fieldname_text_rects(page, field_name)
            fieldname_cache[field_name] = text_hits
        if text_hits:
            text_target = _select_rect_by_order(text_hits, signature, signature_index, signature_count)
            if text_target is not None:
                grown = _expand_rect_around_text(text_target, page.rect)
                if grown is not None:
                    bbox = usable_bbox(grown)
                    if bbox is not None:
                        return bbox

    return None
|
|
2150
|
+
|
|
2151
|
+
|
|
2152
|
+
def _resolve_bbox_across_document(
    document,
    signature: Signature,
    *,
    page_cache: dict[int, dict[str, object]],
    document_cache: dict[str, object],
    signature_index: int,
    signature_count: int,
    skip_page: int | None,
) -> tuple[int | None, tuple[float, float, float, float] | None]:
    """Search every page of ``document`` for a bounding box for ``signature``.

    Pages with signature labels are visited first, followed by the remaining
    pages in ascending order.  The list of label pages is computed once and
    stored in ``document_cache["label_pages"]``.  ``skip_page`` (1-based), when
    given, is not searched.

    Returns:
        ``(page_number, bbox)`` for the first page where
        ``_resolve_signature_bbox`` succeeds, otherwise ``(None, None)``.
    """
    label_pages = document_cache.get("label_pages")
    if label_pages is None:
        label_pages = []
        for page_index in range(document.page_count):
            page_number = page_index + 1
            page = document.load_page(page_index)
            if _get_label_rects(page_cache, page_number, page):
                label_pages.append(page_number)
        document_cache["label_pages"] = label_pages

    # Label pages first, then the rest.  A set keeps the membership test
    # constant-time instead of rescanning the growing list for every page.
    page_order = list(label_pages)
    already_listed = set(page_order)
    page_order.extend(
        page_number
        for page_number in range(1, document.page_count + 1)
        if page_number not in already_listed
    )

    for page_number in page_order:
        if skip_page is not None and page_number == skip_page:
            continue
        page = document.load_page(page_number - 1)
        resolved = _resolve_signature_bbox(
            page,
            signature,
            page_cache=page_cache,
            signature_count=signature_count,
            signature_index=signature_index,
            page_number=page_number,
        )
        if resolved is not None:
            return page_number, resolved
    return None, None
|
|
2193
|
+
|
|
2194
|
+
|
|
2195
|
+
# Field names containing a word that starts with "sign" (any case); used by
# _is_signature_widget.
_SIGNATURE_NAME_PATTERN = re.compile(r"\bsign", re.IGNORECASE)

# Phrases that mark a piece of page text as a signature label.
_SIGNATURE_LABEL_PATTERNS = tuple(
    re.compile(expression, re.IGNORECASE)
    for expression in (
        r"\bsignature\b",
        r"\bsign here\b",
        r"\bsigned by\b",
        r"/s/",
    )
)

# Whole-word cues that tie a short text line to a signer role.
_ROLE_LABEL_PATTERNS = {
    role: tuple(re.compile(expression, re.IGNORECASE) for expression in expressions)
    for role, expressions in (
        (
            "client",
            (
                r"\bclient\b",
                r"\bpatient\b",
                r"\bplaintiff\b",
            ),
        ),
        (
            "firm",
            (
                r"\bfirm\b",
                r"\battorney\b",
                r"\bcounsel\b",
                r"\blaw\b",
                r"\blegal\b",
                r"\bllc\b",
                r"\bllp\b",
                r"\bgroup\b",
            ),
        ),
    )
}
|
|
2220
|
+
|
|
2221
|
+
|
|
2222
|
+
def _get_label_rects(
    page_cache: dict[int, dict[str, object]],
    page_number: int,
    page,
) -> list[object]:
    """Return the signature-label rectangles of a page, memoised per page.

    The result of ``_collect_signature_labels`` is stored under
    ``"label_rects"`` in ``page_cache[page_number]`` and reused afterwards.
    """
    entry = page_cache.setdefault(page_number, {})
    cached = entry.get("label_rects")
    if cached is not None:
        return cached
    fresh = _collect_signature_labels(page)
    entry["label_rects"] = fresh
    return fresh
|
|
2233
|
+
|
|
2234
|
+
|
|
2235
|
+
def _get_role_label_rects(
    page_cache: dict[int, dict[str, object]],
    page_number: int,
    page,
) -> dict[str, list[object]]:
    """Return the per-role text rectangles of a page, memoised per page.

    The result of ``_collect_role_text_rects`` is stored under
    ``"role_rects"`` in ``page_cache[page_number]`` and reused afterwards.
    """
    entry = page_cache.setdefault(page_number, {})
    cached = entry.get("role_rects")
    if cached is not None:
        return cached
    fresh = _collect_role_text_rects(page)
    entry["role_rects"] = fresh
    return fresh
|
|
2246
|
+
|
|
2247
|
+
|
|
2248
|
+
def _get_line_rects(
    page_cache: dict[int, dict[str, object]],
    page_number: int,
    page,
) -> list[object]:
    """Return the underscore-line rectangles of a page, memoised per page.

    The result of ``_collect_underscore_rects`` is stored under
    ``"line_rects"`` in ``page_cache[page_number]`` and reused afterwards.
    """
    entry = page_cache.setdefault(page_number, {})
    cached = entry.get("line_rects")
    if cached is not None:
        return cached
    fresh = _collect_underscore_rects(page)
    entry["line_rects"] = fresh
    return fresh
|
|
2259
|
+
|
|
2260
|
+
|
|
2261
|
+
def _get_image_rects(
    page_cache: dict[int, dict[str, object]],
    page_number: int,
    page,
) -> list[object]:
    """Return the image rectangles of a page, memoised per page.

    The result of ``_collect_image_rects`` is stored under ``"image_rects"``
    in ``page_cache[page_number]`` and reused afterwards.
    """
    entry = page_cache.setdefault(page_number, {})
    cached = entry.get("image_rects")
    if cached is not None:
        return cached
    fresh = _collect_image_rects(page)
    entry["image_rects"] = fresh
    return fresh
|
|
2272
|
+
|
|
2273
|
+
|
|
2274
|
+
def _collect_widget_rects(page) -> tuple[dict[str, object], list[object], list[object]]:
    """Index the form widgets of ``page``.

    Returns:
        A triple ``(by_name, signature_rects, all_rects)``: widget rectangles
        keyed by stripped field name (a later widget with the same name
        replaces an earlier one), the rectangles of widgets accepted by
        ``_is_signature_widget``, and the rectangles of all widgets.  All three
        are empty when the page exposes no widgets.
    """
    by_name: dict[str, object] = {}
    signature_like: list[object] = []
    everything: list[object] = []

    widgets = page.widgets() if hasattr(page, "widgets") else None
    if widgets:
        for widget in widgets:
            area = widget.rect
            field_name = (widget.field_name or "").strip()
            if field_name:
                by_name[field_name] = area
            everything.append(area)
            if _is_signature_widget(widget, field_name):
                signature_like.append(area)
    return by_name, signature_like, everything
|
|
2290
|
+
|
|
2291
|
+
|
|
2292
|
+
def _is_signature_widget(widget, name: str) -> bool:
    """Tell whether a form widget looks like a signature field.

    True when the widget's ``field_type`` equals fitz's signature widget type
    (falling back to ``6`` if fitz lacks the constant), or when ``name``
    matches ``_SIGNATURE_NAME_PATTERN``.
    """
    signature_types = {getattr(fitz, "PDF_WIDGET_TYPE_SIGNATURE", 6)}
    if getattr(widget, "field_type", None) in signature_types:
        return True
    return bool(name and _SIGNATURE_NAME_PATTERN.search(name))
|
|
2298
|
+
|
|
2299
|
+
|
|
2300
|
+
def _collect_annot_rects(page) -> tuple[dict[str, object], list[object]]:
    """Walk the annotation chain of ``page`` and index its rectangles.

    Returns:
        ``(by_name, candidates)``: rectangles keyed by the annotation's
        stripped ``field_name`` (or ``title`` when that is missing), keeping
        the first annotation seen per name, and the rectangles of annotations
        accepted by ``_is_signature_annotation``.
    """
    by_name: dict[str, object] = {}
    signature_like: list[object] = []

    current = page.first_annot
    while current:
        raw_name = getattr(current, "field_name", None) or getattr(current, "title", None) or ""
        label = raw_name.strip()
        if label and label not in by_name:
            by_name[label] = current.rect
        if _is_signature_annotation(current):
            signature_like.append(current.rect)
        current = current.next
    return by_name, signature_like
|
|
2312
|
+
|
|
2313
|
+
|
|
2314
|
+
def _is_signature_annotation(annot) -> bool:
    """Tell whether an annotation is of a kind that may hold a signature.

    The second element of ``annot.type`` (when that is a tuple with more than
    one element) is lower-cased and accepted if it is one of "stamp", "ink",
    "freetext" or "text".  Anything else, including a failure to read the
    type, yields ``False``.
    """
    try:
        if isinstance(annot.type, tuple) and len(annot.type) > 1:
            type_name = annot.type[1]
        else:
            type_name = None
    except Exception:  # pragma: no cover - defensive
        type_name = None
    if not type_name:
        return False
    return str(type_name).lower() in {"stamp", "ink", "freetext", "text"}
|
|
2325
|
+
|
|
2326
|
+
|
|
2327
|
+
def _collect_signature_labels(page) -> list[object]:
    """Return rectangles of text blocks whose text reads like a signature label.

    Blocks come from ``page.get_text("blocks")``; the first four items of a
    block form its rectangle and the fifth is its text.  Entries that are
    empty, shorter than five items or without text are ignored.  A failure to
    extract text yields an empty list.
    """
    found: list[object] = []
    try:
        entries = page.get_text("blocks")
    except Exception:  # pragma: no cover - defensive
        return found
    for entry in entries or []:
        if not entry or len(entry) < 5:
            continue
        text = str(entry[4] or "")
        if text and _is_signature_label_text(text):
            found.append(fitz.Rect(entry[0], entry[1], entry[2], entry[3]))
    return found
|
|
2342
|
+
|
|
2343
|
+
|
|
2344
|
+
def _is_signature_label_text(text: str) -> bool:
    """Check ``text`` against the signature-label patterns.

    Runs of whitespace are collapsed to single spaces before matching, so
    phrases split across lines still match.
    """
    collapsed = " ".join(text.split())
    for pattern in _SIGNATURE_LABEL_PATTERNS:
        if pattern.search(collapsed):
            return True
    return False
|
|
2347
|
+
|
|
2348
|
+
|
|
2349
|
+
def _collect_text_lines(page) -> list[dict[str, object]]:
    """Merge the words of ``page`` into line records.

    Words come from ``page.get_text("words")`` and are grouped by their
    ``(block_no, line_no)`` pair.  Each record carries the union of the word
    boxes (``x0``/``y0``/``x1``/``y1``), the space-joined ``text``, its
    lower-cased stripped form as ``lower`` and a ``fitz.Rect`` as ``rect``.
    Words with fewer than eight items and lines whose text is blank are
    dropped.  A failure to extract words yields an empty list.
    """
    grouped: dict[tuple[int, int], dict[str, object]] = {}
    try:
        words = page.get_text("words") or []
    except Exception:  # pragma: no cover - defensive
        return []

    for word in words:
        if len(word) < 8:
            continue
        x0, y0, x1, y1, token, block_no, line_no, *_ = word
        key = (int(block_no), int(line_no))
        line = grouped.get(key)
        if line is None:
            grouped[key] = {
                "x0": float(x0),
                "y0": float(y0),
                "x1": float(x1),
                "y1": float(y1),
                "text": str(token),
            }
            continue
        # Widen the existing line box and append the word to its text.
        line["x0"] = min(line["x0"], float(x0))
        line["y0"] = min(line["y0"], float(y0))
        line["x1"] = max(line["x1"], float(x1))
        line["y1"] = max(line["y1"], float(y1))
        line["text"] = f"{line['text']} {token}"

    collected: list[dict[str, object]] = []
    for line in grouped.values():
        stripped = str(line["text"]).strip()
        if not stripped:
            continue
        line["lower"] = stripped.lower()
        line["rect"] = fitz.Rect(line["x0"], line["y0"], line["x1"], line["y1"])
        collected.append(line)
    return collected
|
|
2385
|
+
|
|
2386
|
+
|
|
2387
|
+
def _collect_role_text_rects(page) -> dict[str, list[object]]:
    """Group the rectangles of short text lines by the signer role they mention.

    Lines longer than 60 characters are ignored.  A line that matches
    patterns of several roles is recorded under each of them.
    """
    by_role: dict[str, list[object]] = {"client": [], "firm": []}
    for line in _collect_text_lines(page):
        raw_text = str(line["text"])
        lowered = str(line["lower"])
        if len(raw_text) > 60:
            continue
        for role, patterns in _ROLE_LABEL_PATTERNS.items():
            matched = False
            for pattern in patterns:
                if pattern.search(lowered):
                    matched = True
                    break
            if matched:
                by_role[role].append(line["rect"])
    return by_role
|
|
2399
|
+
|
|
2400
|
+
|
|
2401
|
+
def _collect_underscore_rects(page) -> list[object]:
    """Return rectangles of words made only of underscores (four or more).

    Words come from ``page.get_text("words")``; entries with fewer than five
    items are ignored and a failure to extract words yields an empty list.
    """
    found: list[object] = []
    try:
        words = page.get_text("words") or []
    except Exception:  # pragma: no cover - defensive
        return found
    for word in words:
        if len(word) < 5:
            continue
        x0, y0, x1, y1, token, *_ = word
        token = str(token)
        if len(token) >= 4 and set(token) == {"_"}:
            found.append(fitz.Rect(float(x0), float(y0), float(x1), float(y1)))
    return found
|
|
2415
|
+
|
|
2416
|
+
|
|
2417
|
+
def _select_line_bbox(
    line_rects: list[object],
    signature: Signature,
    role_rects: dict[str, list[object]],
    label_rects: list[object],
    page_rect,
) -> tuple[float, float, float, float] | None:
    """Derive a signature box from the underscore line that suits ``signature``.

    Anchors are chosen by role exactly as for image selection: "firm" role
    rectangles for firm/attorney, "client" role rectangles plus labels for
    client/patient/representative, labels otherwise.  With anchors the nearest
    line wins if its score does not exceed ``min(page height * 0.35, 220)``;
    without anchors the choice falls back to vertical ordering.  The line is
    then grown with ``_expand_rect_from_line`` and converted with
    ``_rect_to_pdf_tuple``.  Returns ``None`` when no usable box results.
    """
    if not line_rects:
        return None

    role = (signature.Role or "").lower()
    if role in {"firm", "attorney"}:
        anchors: list[object] = role_rects.get("firm", [])
    elif role in {"client", "patient", "representative"}:
        anchors = role_rects.get("client", []) + label_rects
    else:
        anchors = label_rects

    if anchors:
        chosen, score = _select_rect_near_labels_with_score(line_rects, anchors)
        distance_cap = min(page_rect.height * 0.35, 220.0)
        if chosen is None or score is None or score > distance_cap:
            return None
    else:
        chosen = _select_rect_by_order(line_rects, signature, 1, 1)
        if chosen is None:
            return None

    grown = _expand_rect_from_line(chosen, page_rect)
    if grown is None:
        return None
    bbox = _rect_to_pdf_tuple(grown, page_rect.height)
    return bbox if _bbox_has_area(bbox) else None
|
|
2452
|
+
|
|
2453
|
+
|
|
2454
|
+
def _select_rect_near_labels_with_score(
    rects: list[object],
    label_rects: list[object],
) -> tuple[object | None, float | None]:
    """Find the rectangle closest to any label and report its score.

    The score of a rectangle/label pair is twice the vertical gap plus the
    horizontal gap between them (each gap is zero when the pair overlaps on
    that axis).  The pair with the lowest score wins, earlier pairs winning
    ties.

    Returns:
        ``(rect, score)`` with ``rect`` as a ``fitz.Rect``, or
        ``(None, None)`` when either input is empty.
    """
    if not rects or not label_rects:
        return None, None

    winner = None
    lowest: float | None = None
    for raw_rect in rects:
        candidate = fitz.Rect(raw_rect)
        for raw_label in label_rects:
            anchor = fitz.Rect(raw_label)
            gap_y = max(0.0, max(anchor.y0 - candidate.y1, candidate.y0 - anchor.y1))
            gap_x = max(0.0, max(anchor.x0 - candidate.x1, candidate.x0 - anchor.x1))
            # Vertical separation counts double.
            score = gap_y * 2.0 + gap_x
            if lowest is None or score < lowest:
                lowest = score
                winner = candidate
    return winner, lowest
|
|
2473
|
+
|
|
2474
|
+
|
|
2475
|
+
def _expand_rect_from_line(line_rect, page_rect):
    """Build the region on the smaller-y side of a line rectangle.

    The region is as wide as the line plus a small horizontal margin (clamped
    to the page).  It ends a small gap before the line's ``y0`` and extends up
    to ``min(140, max(60, 12 * line height))`` towards smaller y, without
    crossing the page's ``y0``.  Returns ``None`` for a degenerate line or when
    no room is left.
    """
    line = fitz.Rect(line_rect)
    line_width = line.width
    line_height = line.height
    if line_width <= 0 or line_height <= 0:
        return None

    margin_x = max(8.0, line_width * 0.05)
    left = max(page_rect.x0, line.x0 - margin_x)
    right = min(page_rect.x1, line.x1 + margin_x)

    tallest = min(140.0, max(60.0, line_height * 12.0))
    clearance = max(2.0, line_height * 1.5)
    # near_edge is the side closest to the line; far_edge has the smaller y.
    near_edge = max(page_rect.y0, line.y0 - clearance)
    far_edge = max(page_rect.y0, near_edge - tallest)
    if near_edge <= far_edge:
        return None
    return fitz.Rect(left, far_edge, right, near_edge)
|
|
2492
|
+
|
|
2493
|
+
|
|
2494
|
+
def _select_rect_by_order(
    rects: list[object],
    signature: Signature,
    signature_index: int,
    signature_count: int,
) -> object | None:
    """Pick a rectangle by its position in ascending ``y0`` order.

    When several signatures are expected and ``signature_index`` is positive,
    the rectangle at that 1-based position is returned (clamped to the last
    one).  Otherwise attorney/firm roles get the second rectangle when there
    is one, and every other role gets the first.  Returns ``None`` for an
    empty input.
    """
    if not rects:
        return None
    by_y0 = sorted(rects, key=lambda rect: rect.y0)
    if signature_count > 1 and signature_index > 0:
        position = min(signature_index - 1, len(by_y0) - 1)
        return by_y0[position]
    role = (signature.Role or "").lower()
    if role in {"attorney", "firm"} and len(by_y0) > 1:
        return by_y0[1]
    return by_y0[0]
|
|
2511
|
+
|
|
2512
|
+
|
|
2513
|
+
def _select_rect_candidate(
    rects: list[object],
    signature: Signature,
    signature_index: int,
    signature_count: int,
    label_rects: list[object] | None,
) -> object | None:
    """Choose a rectangle, preferring proximity to labels over ordering.

    When both rectangles and labels are available the one nearest a label is
    returned; otherwise (or if that yields nothing) the choice is made by
    ``_select_rect_by_order``.
    """
    nearest = _select_rect_near_labels(rects, label_rects) if rects and label_rects else None
    if nearest is not None:
        return nearest
    return _select_rect_by_order(rects, signature, signature_index, signature_count)
|
|
2525
|
+
|
|
2526
|
+
|
|
2527
|
+
def _select_rect_near_labels(rects: list[object], label_rects: list[object]) -> object | None:
    """Return the rectangle closest to any of ``label_rects``.

    Scoring is delegated to ``_select_rect_near_labels_with_score`` so both
    helpers always rank candidates the same way; only the winning rectangle
    (a ``fitz.Rect``) is returned.  ``None`` is returned when either input is
    empty.
    """
    if not rects or not label_rects:
        return None
    nearest, _score = _select_rect_near_labels_with_score(rects, label_rects)
    return nearest
|
|
2543
|
+
|
|
2544
|
+
|
|
2545
|
+
def _filter_signature_like_rects(rects: list[object], page_rect) -> list[object]:
    """Keep rectangles whose shape is plausible for a signature field.

    A rectangle is kept when it is at least 40 wide and 8 high, no taller than
    40% of the page height, and at least 1.5 times as wide as it is high.
    Survivors are returned as fresh ``fitz.Rect`` objects.
    """
    kept: list[object] = []
    for raw in rects:
        shape = fitz.Rect(raw)
        shape_width = shape.width
        shape_height = shape.height
        if (
            shape_width < 40
            or shape_height < 8
            or shape_height > page_rect.height * 0.4
            or shape_width / max(shape_height, 1.0) < 1.5
        ):
            continue
        kept.append(shape)
    return kept
|
|
2559
|
+
|
|
2560
|
+
|
|
2561
|
+
def _expand_rect_from_label(label_rect, page_rect):
    """Build a signing region next to a label rectangle.

    The region spans the label horizontally with a small margin on the
    ``x0`` side and a larger one on the ``x1`` side.  It is first placed on
    the larger-y side of the label; if the page leaves too little room there
    it is placed on the smaller-y side instead.  Returns ``None`` when the
    resulting region has no height.
    """
    label = fitz.Rect(label_rect)
    nominal_height = max(16.0, label.height)
    nominal_width = max(80.0, label.width)

    left = max(page_rect.x0, label.x0 - nominal_width * 0.05)
    right = min(page_rect.x1, label.x1 + nominal_width * 0.35)

    clearance = max(4.0, nominal_height * 0.3)
    tallest = min(120.0, nominal_height * 5.5)
    shortest = max(40.0, nominal_height * 2.5)

    # First attempt: the larger-y side of the label.
    top = min(page_rect.y1, label.y1 + clearance)
    bottom = min(page_rect.y1, top + tallest)
    if bottom - top < shortest:
        # Not enough room there; use the smaller-y side.
        bottom = max(page_rect.y0, label.y0 - clearance)
        top = max(page_rect.y0, bottom - tallest)
    if bottom <= top:
        return None
    return fitz.Rect(left, top, right, bottom)
|
|
2579
|
+
|
|
2580
|
+
|
|
2581
|
+
def _find_fieldname_text_rects(page, field_name: str) -> list[object]:
    """Locate occurrences of ``field_name`` in the text of ``page``.

    Returns one ``fitz.Rect`` per hit of ``page.search_for``; a blank name or
    a failing search yields an empty list.
    """
    needle = field_name.strip()
    if not needle:
        return []
    try:
        # NOTE(review): confirm fitz exposes TEXT_IGNORECASE; if the attribute
        # is missing, the handler below turns every lookup into an empty list.
        matches = page.search_for(needle, flags=fitz.TEXT_IGNORECASE)
    except Exception:  # pragma: no cover - defensive
        return []
    located: list[object] = []
    for match in matches:
        located.append(fitz.Rect(match))
    return located
|
|
2590
|
+
|
|
2591
|
+
|
|
2592
|
+
def _expand_rect_around_text(text_rect, page_rect):
    """Pad a text rectangle on all sides, clamped to the page.

    Horizontal padding is ``max(6, 20% of width)`` and vertical padding is
    ``max(4, 80% of height)``.  Returns ``None`` when the clamped result is
    empty.
    """
    text_box = fitz.Rect(text_rect)
    margin_x = max(6.0, text_box.width * 0.2)
    margin_y = max(4.0, text_box.height * 0.8)

    left = max(page_rect.x0, text_box.x0 - margin_x)
    top = max(page_rect.y0, text_box.y0 - margin_y)
    right = min(page_rect.x1, text_box.x1 + margin_x)
    bottom = min(page_rect.y1, text_box.y1 + margin_y)

    if right <= left or bottom <= top:
        return None
    return fitz.Rect(left, top, right, bottom)
|
|
2603
|
+
|
|
2604
|
+
|
|
2605
|
+
def _rect_to_pdf_tuple(rect, page_height: float) -> tuple[float, float, float, float]:
    """Convert a rectangle to an ``(x0, y0, x1, y1)`` tuple with the y axis flipped.

    Each y value is mirrored as ``page_height - y`` and both axes are ordered
    so that the first value of each pair is the smaller one.
    """
    x_low = float(rect.x0)
    x_high = float(rect.x1)
    y_low = page_height - float(rect.y1)
    y_high = page_height - float(rect.y0)
    if x_high < x_low:
        x_low, x_high = x_high, x_low
    if y_high < y_low:
        y_low, y_high = y_high, y_low
    return (x_low, y_low, x_high, y_high)
|