sigdetect 0.5.1__tar.gz → 0.5.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sigdetect-0.5.1 → sigdetect-0.5.2}/PKG-INFO +1 -1
- {sigdetect-0.5.1 → sigdetect-0.5.2}/pyproject.toml +1 -1
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/cropping.py +237 -2
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/wet_detection.py +48 -14
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect.egg-info/PKG-INFO +1 -1
- {sigdetect-0.5.1 → sigdetect-0.5.2}/tests/test_cropping.py +50 -1
- {sigdetect-0.5.1 → sigdetect-0.5.2}/tests/test_wet_detection.py +98 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/README.md +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/setup.cfg +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/__init__.py +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/api.py +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/cli.py +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/config.py +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/data/role_rules.retainer.yml +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/data/role_rules.yml +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/data/vendor_patterns.yml +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/detector/__init__.py +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/detector/base.py +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/detector/base_detector.py +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/detector/file_result_model.py +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/detector/pymupdf_engine.py +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/detector/pypdf2_engine.py +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/detector/signature_model.py +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/eda.py +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/logging_setup.py +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/utils.py +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect.egg-info/SOURCES.txt +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect.egg-info/dependency_links.txt +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect.egg-info/entry_points.txt +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect.egg-info/requires.txt +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect.egg-info/top_level.txt +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/tests/test_api.py +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/tests/test_cli.py +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/tests/test_detector_options.py +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/tests/test_pymupdf_engine.py +0 -0
- {sigdetect-0.5.1 → sigdetect-0.5.2}/tests/test_widget_role_patient_smoke.py +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "sigdetect"
|
|
7
|
-
version = "0.5.
|
|
7
|
+
version = "0.5.2"
|
|
8
8
|
description = "Signature detection and role attribution for PDFs"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
authors = [{ name = "BT Asmamaw", email = "basmamaw@angeiongroup.com" }]
|
|
@@ -9,6 +9,8 @@ from dataclasses import dataclass
|
|
|
9
9
|
from pathlib import Path
|
|
10
10
|
from typing import Literal, overload
|
|
11
11
|
|
|
12
|
+
from PIL import Image
|
|
13
|
+
|
|
12
14
|
from .detector.file_result_model import FileResult
|
|
13
15
|
from .detector.signature_model import Signature
|
|
14
16
|
|
|
@@ -53,6 +55,7 @@ def crop_signatures(
|
|
|
53
55
|
return_bytes: Literal[False] = False,
|
|
54
56
|
save_files: bool = True,
|
|
55
57
|
docx: bool = False,
|
|
58
|
+
trim: bool = True,
|
|
56
59
|
) -> list[Path]: ...
|
|
57
60
|
|
|
58
61
|
|
|
@@ -67,6 +70,7 @@ def crop_signatures(
|
|
|
67
70
|
return_bytes: Literal[True],
|
|
68
71
|
save_files: bool = True,
|
|
69
72
|
docx: bool = False,
|
|
73
|
+
trim: bool = True,
|
|
70
74
|
) -> list[SignatureCrop]: ...
|
|
71
75
|
|
|
72
76
|
|
|
@@ -80,6 +84,7 @@ def crop_signatures(
|
|
|
80
84
|
return_bytes: bool = False,
|
|
81
85
|
save_files: bool = True,
|
|
82
86
|
docx: bool = False,
|
|
87
|
+
trim: bool = True,
|
|
83
88
|
) -> list[Path] | list[SignatureCrop]:
|
|
84
89
|
"""Render each signature bounding box to a PNG image and optionally wrap it in DOCX.
|
|
85
90
|
|
|
@@ -87,6 +92,7 @@ def crop_signatures(
|
|
|
87
92
|
the files to ``output_dir``. Set ``save_files=False`` to skip writing PNGs to disk.
|
|
88
93
|
When ``docx=True``, DOCX files are written instead of PNGs. When ``return_bytes`` is True
|
|
89
94
|
and ``docx=True``, ``SignatureCrop.docx_bytes`` will contain the DOCX payload.
|
|
95
|
+
When ``trim`` is enabled, the crop is tightened around the detected ink where possible.
|
|
90
96
|
"""
|
|
91
97
|
|
|
92
98
|
if fitz is None: # pragma: no cover - exercised when dependency absent
|
|
@@ -145,10 +151,12 @@ def crop_signatures(
|
|
|
145
151
|
try:
|
|
146
152
|
image_bytes: bytes | None = None
|
|
147
153
|
pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
|
|
154
|
+
raw_bytes = pixmap.tobytes("png")
|
|
155
|
+
final_bytes = _trim_signature_image_bytes(raw_bytes) if trim else raw_bytes
|
|
148
156
|
if save_files and not docx_enabled:
|
|
149
|
-
|
|
157
|
+
png_destination.write_bytes(final_bytes)
|
|
150
158
|
if return_bytes or docx_enabled:
|
|
151
|
-
image_bytes =
|
|
159
|
+
image_bytes = final_bytes
|
|
152
160
|
except Exception as exc: # pragma: no cover - defensive
|
|
153
161
|
if logger:
|
|
154
162
|
logger.warning(
|
|
@@ -221,6 +229,233 @@ def _build_docx_bytes(image_bytes: bytes) -> bytes:
|
|
|
221
229
|
return buffer.getvalue()
|
|
222
230
|
|
|
223
231
|
|
|
232
|
+
def _trim_signature_image_bytes(
|
|
233
|
+
image_bytes: bytes,
|
|
234
|
+
*,
|
|
235
|
+
pad_px: int = 4,
|
|
236
|
+
gap_px: int = 4,
|
|
237
|
+
min_density_ratio: float = 0.004,
|
|
238
|
+
) -> bytes:
|
|
239
|
+
image = Image.open(io.BytesIO(image_bytes))
|
|
240
|
+
gray = image.convert("L")
|
|
241
|
+
width, height = gray.size
|
|
242
|
+
|
|
243
|
+
histogram = gray.histogram()
|
|
244
|
+
total_pixels = width * height
|
|
245
|
+
cutoff = int(total_pixels * 0.995)
|
|
246
|
+
cumulative = 0
|
|
247
|
+
white_level = 255
|
|
248
|
+
for idx, count in enumerate(histogram):
|
|
249
|
+
cumulative += count
|
|
250
|
+
if cumulative >= cutoff:
|
|
251
|
+
white_level = idx
|
|
252
|
+
break
|
|
253
|
+
|
|
254
|
+
if white_level < 200:
|
|
255
|
+
return image_bytes
|
|
256
|
+
|
|
257
|
+
thresholds = [min(254, max(200, white_level - delta)) for delta in (6, 4, 2, 1, 0)]
|
|
258
|
+
min_density = max(2, int(width * min_density_ratio))
|
|
259
|
+
pixels = gray.load()
|
|
260
|
+
|
|
261
|
+
row_densities: dict[int, list[int]] = {}
|
|
262
|
+
for threshold in thresholds:
|
|
263
|
+
row_density = []
|
|
264
|
+
for y in range(height):
|
|
265
|
+
dark = sum(1 for x in range(width) if pixels[x, y] < threshold)
|
|
266
|
+
row_density.append(dark)
|
|
267
|
+
row_densities[threshold] = row_density
|
|
268
|
+
|
|
269
|
+
line_bounds = _detect_horizontal_rule_cutoff(row_densities[thresholds[-1]], width)
|
|
270
|
+
scan_limit = None
|
|
271
|
+
descender_limit = height - 1
|
|
272
|
+
if line_bounds is not None:
|
|
273
|
+
line_start, line_end = line_bounds
|
|
274
|
+
scan_limit = max(0, line_start - 1)
|
|
275
|
+
descender_limit = min(height - 1, line_end + max(2, int(height * 0.02)))
|
|
276
|
+
|
|
277
|
+
min_band_height = max(4, int(height * 0.02))
|
|
278
|
+
best = None
|
|
279
|
+
best_small = None
|
|
280
|
+
best_small_threshold = None
|
|
281
|
+
best_threshold = None
|
|
282
|
+
line_threshold = int(width * 0.6)
|
|
283
|
+
for threshold in thresholds:
|
|
284
|
+
row_density = row_densities[threshold]
|
|
285
|
+
segments: list[tuple[int, int]] = []
|
|
286
|
+
start: int | None = None
|
|
287
|
+
for y, dark in enumerate(row_density):
|
|
288
|
+
if scan_limit is not None and y > scan_limit:
|
|
289
|
+
if start is not None:
|
|
290
|
+
segments.append((start, y - 1))
|
|
291
|
+
start = None
|
|
292
|
+
break
|
|
293
|
+
if dark >= min_density:
|
|
294
|
+
if start is None:
|
|
295
|
+
start = y
|
|
296
|
+
else:
|
|
297
|
+
if start is not None:
|
|
298
|
+
segments.append((start, y - 1))
|
|
299
|
+
start = None
|
|
300
|
+
if start is not None:
|
|
301
|
+
segments.append((start, height - 1))
|
|
302
|
+
|
|
303
|
+
if not segments:
|
|
304
|
+
continue
|
|
305
|
+
|
|
306
|
+
merged: list[list[int]] = []
|
|
307
|
+
for seg in segments:
|
|
308
|
+
if not merged:
|
|
309
|
+
merged.append([seg[0], seg[1]])
|
|
310
|
+
continue
|
|
311
|
+
if seg[0] - merged[-1][1] <= gap_px:
|
|
312
|
+
merged[-1][1] = seg[1]
|
|
313
|
+
else:
|
|
314
|
+
merged.append([seg[0], seg[1]])
|
|
315
|
+
|
|
316
|
+
candidates = []
|
|
317
|
+
for y0, y1 in merged:
|
|
318
|
+
min_x, max_x = width, -1
|
|
319
|
+
total_dark = 0
|
|
320
|
+
for y in range(y0, y1 + 1):
|
|
321
|
+
for x in range(width):
|
|
322
|
+
if pixels[x, y] < threshold:
|
|
323
|
+
total_dark += 1
|
|
324
|
+
if x < min_x:
|
|
325
|
+
min_x = x
|
|
326
|
+
if x > max_x:
|
|
327
|
+
max_x = x
|
|
328
|
+
if max_x < 0:
|
|
329
|
+
continue
|
|
330
|
+
band_height = y1 - y0 + 1
|
|
331
|
+
band_width = max_x - min_x + 1
|
|
332
|
+
score = total_dark * (band_height**1.3)
|
|
333
|
+
if line_bounds is not None:
|
|
334
|
+
distance = max(0, line_bounds[0] - y1)
|
|
335
|
+
proximity = 1.0 / (1.0 + (distance / 20.0))
|
|
336
|
+
score *= 1.0 + 0.5 * proximity
|
|
337
|
+
candidates.append(
|
|
338
|
+
{
|
|
339
|
+
"y0": y0,
|
|
340
|
+
"y1": y1,
|
|
341
|
+
"min_x": min_x,
|
|
342
|
+
"max_x": max_x,
|
|
343
|
+
"total": total_dark,
|
|
344
|
+
"height": band_height,
|
|
345
|
+
"width": band_width,
|
|
346
|
+
"score": score,
|
|
347
|
+
}
|
|
348
|
+
)
|
|
349
|
+
|
|
350
|
+
if not candidates:
|
|
351
|
+
continue
|
|
352
|
+
|
|
353
|
+
candidates.sort(key=lambda item: item["score"], reverse=True)
|
|
354
|
+
top_candidate = candidates[0]
|
|
355
|
+
if top_candidate["height"] >= min_band_height:
|
|
356
|
+
if best is None or top_candidate["score"] > best["score"]:
|
|
357
|
+
best = top_candidate
|
|
358
|
+
best_threshold = threshold
|
|
359
|
+
else:
|
|
360
|
+
if best_small is None or top_candidate["score"] > best_small["score"]:
|
|
361
|
+
best_small = top_candidate
|
|
362
|
+
best_small_threshold = threshold
|
|
363
|
+
|
|
364
|
+
if best is None:
|
|
365
|
+
best = best_small
|
|
366
|
+
best_threshold = best_small_threshold
|
|
367
|
+
|
|
368
|
+
if best is None:
|
|
369
|
+
return image_bytes
|
|
370
|
+
|
|
371
|
+
expansion_density = row_densities.get(best_threshold, row_densities[thresholds[-1]])
|
|
372
|
+
expand_threshold = max(1, int(min_density * 0.4))
|
|
373
|
+
y0 = best["y0"]
|
|
374
|
+
y1 = best["y1"]
|
|
375
|
+
|
|
376
|
+
while y0 > 0 and expansion_density[y0 - 1] >= expand_threshold:
|
|
377
|
+
y0 -= 1
|
|
378
|
+
while y1 < descender_limit and expansion_density[y1 + 1] >= expand_threshold:
|
|
379
|
+
y1 += 1
|
|
380
|
+
|
|
381
|
+
min_x, max_x = width, -1
|
|
382
|
+
for y in range(y0, y1 + 1):
|
|
383
|
+
if expansion_density[y] >= line_threshold:
|
|
384
|
+
continue
|
|
385
|
+
for x in range(width):
|
|
386
|
+
if pixels[x, y] < thresholds[-1]:
|
|
387
|
+
if x < min_x:
|
|
388
|
+
min_x = x
|
|
389
|
+
if x > max_x:
|
|
390
|
+
max_x = x
|
|
391
|
+
if max_x >= 0:
|
|
392
|
+
best = {
|
|
393
|
+
"y0": y0,
|
|
394
|
+
"y1": y1,
|
|
395
|
+
"min_x": min_x,
|
|
396
|
+
"max_x": max_x,
|
|
397
|
+
}
|
|
398
|
+
|
|
399
|
+
x0 = max(0, best["min_x"] - pad_px)
|
|
400
|
+
x1 = min(width - 1, best["max_x"] + pad_px)
|
|
401
|
+
y0 = max(0, best["y0"] - pad_px)
|
|
402
|
+
y1 = min(height - 1, best["y1"] + pad_px)
|
|
403
|
+
|
|
404
|
+
if x1 <= x0 or y1 <= y0:
|
|
405
|
+
return image_bytes
|
|
406
|
+
if (x1 - x0) < max(10, int(width * 0.2)) or (y1 - y0) < max(6, int(height * 0.08)):
|
|
407
|
+
return image_bytes
|
|
408
|
+
|
|
409
|
+
cropped = image.crop((x0, y0, x1 + 1, y1 + 1))
|
|
410
|
+
buffer = io.BytesIO()
|
|
411
|
+
cropped.save(buffer, format="PNG")
|
|
412
|
+
return buffer.getvalue()
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
def _detect_horizontal_rule_cutoff(
|
|
416
|
+
row_density: list[int],
|
|
417
|
+
width: int,
|
|
418
|
+
) -> tuple[int, int] | None:
|
|
419
|
+
if not row_density:
|
|
420
|
+
return None
|
|
421
|
+
line_threshold = int(width * 0.6)
|
|
422
|
+
max_thickness = 4
|
|
423
|
+
segments: list[tuple[int, int]] = []
|
|
424
|
+
start = None
|
|
425
|
+
for y, density in enumerate(row_density):
|
|
426
|
+
if density >= line_threshold:
|
|
427
|
+
if start is None:
|
|
428
|
+
start = y
|
|
429
|
+
else:
|
|
430
|
+
if start is not None:
|
|
431
|
+
segments.append((start, y - 1))
|
|
432
|
+
start = None
|
|
433
|
+
if start is not None:
|
|
434
|
+
segments.append((start, len(row_density) - 1))
|
|
435
|
+
|
|
436
|
+
if not segments:
|
|
437
|
+
return None
|
|
438
|
+
|
|
439
|
+
total_dark = sum(row_density)
|
|
440
|
+
if total_dark <= 0:
|
|
441
|
+
return None
|
|
442
|
+
|
|
443
|
+
for y0, y1 in segments:
|
|
444
|
+
thickness = y1 - y0 + 1
|
|
445
|
+
if thickness > max_thickness:
|
|
446
|
+
continue
|
|
447
|
+
above_dark = sum(row_density[:y0])
|
|
448
|
+
below_dark = sum(row_density[y1 + 1 :])
|
|
449
|
+
if above_dark < 40:
|
|
450
|
+
continue
|
|
451
|
+
midpoint_ratio = ((y0 + y1) / 2.0) / max(1, len(row_density))
|
|
452
|
+
if midpoint_ratio >= 0.35:
|
|
453
|
+
return (y0, y1)
|
|
454
|
+
if above_dark >= max(40, int(below_dark * 0.3)):
|
|
455
|
+
return (y0, y1)
|
|
456
|
+
return None
|
|
457
|
+
|
|
458
|
+
|
|
224
459
|
def _to_clip_rect(page, bbox: tuple[float, float, float, float]):
|
|
225
460
|
width = float(page.rect.width)
|
|
226
461
|
height = float(page.rect.height)
|
|
@@ -94,6 +94,17 @@ def apply_wet_detection(
|
|
|
94
94
|
original_mixed = file_result.MixedContent
|
|
95
95
|
try:
|
|
96
96
|
added = _detect(pdf_path, configuration, file_result, logger=logger)
|
|
97
|
+
if added and configuration.Profile == "hipaa":
|
|
98
|
+
updated = False
|
|
99
|
+
for signature in file_result.Signatures:
|
|
100
|
+
if signature.RenderType == "wet" and (signature.Role or "unknown") == "unknown":
|
|
101
|
+
signature.Role = "patient"
|
|
102
|
+
signature.Scores = {"patient": int(signature.Score or 0)}
|
|
103
|
+
signature.Evidence = list(signature.Evidence or [])
|
|
104
|
+
signature.Evidence.append("role_default:patient")
|
|
105
|
+
updated = True
|
|
106
|
+
if updated:
|
|
107
|
+
_refresh_metadata(file_result)
|
|
97
108
|
if not added:
|
|
98
109
|
_mark_manual_review(file_result, "NoHighConfidenceWetSignature")
|
|
99
110
|
return added
|
|
@@ -136,6 +147,18 @@ def _detect(
|
|
|
136
147
|
scale=configuration.WetOcrDpi / 72.0,
|
|
137
148
|
)
|
|
138
149
|
)
|
|
150
|
+
if not candidates:
|
|
151
|
+
candidates = list(
|
|
152
|
+
_build_candidates(
|
|
153
|
+
ocr_lines,
|
|
154
|
+
image=image,
|
|
155
|
+
page_rect=page.rect,
|
|
156
|
+
pix_width=pixmap.width,
|
|
157
|
+
pix_height=pixmap.height,
|
|
158
|
+
scale=configuration.WetOcrDpi / 72.0,
|
|
159
|
+
min_y_ratio=0.2,
|
|
160
|
+
)
|
|
161
|
+
)
|
|
139
162
|
candidates.extend(_image_candidates(page))
|
|
140
163
|
candidates = _filter_candidates_for_page(candidates)
|
|
141
164
|
accepted = [
|
|
@@ -247,6 +270,7 @@ def _build_candidates(
|
|
|
247
270
|
pix_width: int,
|
|
248
271
|
pix_height: int,
|
|
249
272
|
scale: float,
|
|
273
|
+
min_y_ratio: float = 0.4,
|
|
250
274
|
) -> Iterable[WetCandidate]:
|
|
251
275
|
for line in lines:
|
|
252
276
|
normalized = line.text.lower()
|
|
@@ -255,7 +279,7 @@ def _build_candidates(
|
|
|
255
279
|
if len(normalized) > 80:
|
|
256
280
|
# Ignore long paragraph-like OCR lines
|
|
257
281
|
continue
|
|
258
|
-
if (line.bottom / pix_height) <
|
|
282
|
+
if (line.bottom / pix_height) < min_y_ratio:
|
|
259
283
|
# Ignore lines in the upper section of the page
|
|
260
284
|
continue
|
|
261
285
|
role = _infer_role(normalized)
|
|
@@ -338,28 +362,33 @@ def _expand_bbox(
|
|
|
338
362
|
) -> tuple[float, float, float, float]:
|
|
339
363
|
x0 = line.left / scale
|
|
340
364
|
x1 = line.right / scale
|
|
341
|
-
|
|
365
|
+
y_top = (pix_height - line.top) / scale
|
|
366
|
+
y_bottom = (pix_height - line.bottom) / scale
|
|
342
367
|
|
|
343
368
|
pad_x = max(14.0, (x1 - x0) * 0.25)
|
|
344
369
|
left = max(page_rect.x0, x0 - pad_x)
|
|
345
370
|
right = min(page_rect.x1, x1 + pad_x)
|
|
346
371
|
|
|
347
372
|
gap = 14.0
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
bottom = min(page_rect.y1, top + signature_height)
|
|
373
|
+
line_height = max(1.0, (line.bottom - line.top) / scale)
|
|
374
|
+
signature_height = max(70.0, line_height * 6.0)
|
|
375
|
+
upper = min(page_rect.y1, y_bottom - gap)
|
|
376
|
+
upper = max(page_rect.y0, upper)
|
|
377
|
+
lower = max(page_rect.y0, upper - signature_height)
|
|
354
378
|
|
|
355
379
|
if stroke_y is not None:
|
|
356
|
-
# Anchor to the detected stroke
|
|
380
|
+
# Anchor to the detected stroke (signature line) beneath the label.
|
|
357
381
|
sy = (pix_height - stroke_y) / scale
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
382
|
+
field_lower = min(page_rect.y1, max(page_rect.y0, sy + 2.0))
|
|
383
|
+
field_upper = min(page_rect.y1, y_bottom - gap)
|
|
384
|
+
if field_upper > field_lower + 6.0:
|
|
385
|
+
lower = field_lower
|
|
386
|
+
upper = field_upper
|
|
387
|
+
else:
|
|
388
|
+
upper = min(page_rect.y1, field_lower + signature_height)
|
|
389
|
+
lower = max(page_rect.y0, upper - signature_height)
|
|
361
390
|
|
|
362
|
-
return (float(left), float(
|
|
391
|
+
return (float(left), float(lower), float(right), float(upper))
|
|
363
392
|
|
|
364
393
|
|
|
365
394
|
def _stroke_under_line(image: Image.Image, line: OcrLine) -> tuple[bool, float | None]:
|
|
@@ -513,14 +542,19 @@ def _signature_rank(signature: Signature) -> tuple[int, int, int]:
|
|
|
513
542
|
|
|
514
543
|
def _dedupe_wet_signatures(signatures: Sequence[Signature]) -> list[Signature]:
|
|
515
544
|
best_by_role: dict[str, Signature] = {}
|
|
545
|
+
best_unknown: Signature | None = None
|
|
516
546
|
for signature in signatures:
|
|
517
547
|
role = (signature.Role or "unknown").strip().lower()
|
|
518
548
|
if role == "unknown":
|
|
549
|
+
if best_unknown is None or _signature_rank(signature) > _signature_rank(best_unknown):
|
|
550
|
+
best_unknown = signature
|
|
519
551
|
continue
|
|
520
552
|
existing = best_by_role.get(role)
|
|
521
553
|
if existing is None or _signature_rank(signature) > _signature_rank(existing):
|
|
522
554
|
best_by_role[role] = signature
|
|
523
|
-
|
|
555
|
+
if best_by_role:
|
|
556
|
+
return sorted(best_by_role.values(), key=lambda sig: (int(sig.Page or 0), sig.Role or ""))
|
|
557
|
+
return [best_unknown] if best_unknown is not None else []
|
|
524
558
|
|
|
525
559
|
|
|
526
560
|
def _mark_manual_review(file_result: FileResult, reason: str) -> None:
|
|
@@ -1,12 +1,14 @@
|
|
|
1
|
+
import io
|
|
1
2
|
from pathlib import Path
|
|
2
3
|
|
|
3
4
|
import pytest
|
|
4
5
|
from pypdf import PdfWriter
|
|
6
|
+
from PIL import Image, ImageDraw
|
|
5
7
|
from pypdf.generic import ArrayObject, DictionaryObject, NameObject, NumberObject, TextStringObject
|
|
6
8
|
|
|
7
9
|
from sigdetect.api import CropSignatureImages, DetectPdf
|
|
8
10
|
from sigdetect.config import DetectConfiguration
|
|
9
|
-
from sigdetect.cropping import SignatureCrop, crop_signatures
|
|
11
|
+
from sigdetect.cropping import SignatureCrop, _trim_signature_image_bytes, crop_signatures
|
|
10
12
|
from sigdetect.detector.pypdf2_engine import PyPDF2Detector
|
|
11
13
|
|
|
12
14
|
pytest.importorskip("fitz")
|
|
@@ -43,6 +45,53 @@ def _pdf_with_signature(path: Path) -> None:
|
|
|
43
45
|
writer.write(handle)
|
|
44
46
|
|
|
45
47
|
|
|
48
|
+
def _build_test_crop_bytes() -> bytes:
|
|
49
|
+
image = Image.new("RGB", (200, 100), "white")
|
|
50
|
+
draw = ImageDraw.Draw(image)
|
|
51
|
+
draw.rectangle([20, 10, 80, 20], fill="black")
|
|
52
|
+
draw.rectangle([10, 60, 190, 80], fill="black")
|
|
53
|
+
buffer = io.BytesIO()
|
|
54
|
+
image.save(buffer, format="PNG")
|
|
55
|
+
return buffer.getvalue()
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def test_trim_signature_image_bytes_prefers_lower_band() -> None:
|
|
59
|
+
original = _build_test_crop_bytes()
|
|
60
|
+
trimmed = _trim_signature_image_bytes(original, pad_px=2)
|
|
61
|
+
|
|
62
|
+
original_image = Image.open(io.BytesIO(original))
|
|
63
|
+
trimmed_image = Image.open(io.BytesIO(trimmed))
|
|
64
|
+
|
|
65
|
+
assert trimmed_image.height < original_image.height
|
|
66
|
+
|
|
67
|
+
gray = trimmed_image.convert("L")
|
|
68
|
+
pixels = gray.load()
|
|
69
|
+
width, height = gray.size
|
|
70
|
+
top_dark = sum(1 for x in range(width) for y in range(min(8, height)) if pixels[x, y] < 240)
|
|
71
|
+
assert top_dark == 0
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def test_trim_signature_image_bytes_respects_horizontal_rule() -> None:
|
|
75
|
+
image = Image.new("RGB", (200, 120), "white")
|
|
76
|
+
draw = ImageDraw.Draw(image)
|
|
77
|
+
# Signature scribble above the line.
|
|
78
|
+
draw.line([20, 20, 180, 30], fill="black", width=3)
|
|
79
|
+
draw.line([25, 28, 140, 18], fill="black", width=2)
|
|
80
|
+
# Horizontal rule separating signature from print name.
|
|
81
|
+
draw.line([10, 50, 190, 50], fill="black", width=2)
|
|
82
|
+
# Text-ish block below the line.
|
|
83
|
+
draw.rectangle([20, 70, 120, 85], fill="black")
|
|
84
|
+
|
|
85
|
+
buffer = io.BytesIO()
|
|
86
|
+
image.save(buffer, format="PNG")
|
|
87
|
+
trimmed = _trim_signature_image_bytes(buffer.getvalue(), pad_px=2)
|
|
88
|
+
|
|
89
|
+
trimmed_image = Image.open(io.BytesIO(trimmed)).convert("L")
|
|
90
|
+
width, height = trimmed_image.size
|
|
91
|
+
# Ensure we trimmed off the lower text block (should be well above original height).
|
|
92
|
+
assert height < 90
|
|
93
|
+
|
|
94
|
+
|
|
46
95
|
def test_crop_signatures(tmp_path: Path):
|
|
47
96
|
pdf_path = tmp_path / "doc.pdf"
|
|
48
97
|
_pdf_with_signature(pdf_path)
|
|
@@ -1,15 +1,19 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
2
|
|
|
3
|
+
from PIL import Image
|
|
4
|
+
|
|
3
5
|
from pypdf import PdfWriter
|
|
4
6
|
|
|
5
7
|
from sigdetect.config import DetectConfiguration
|
|
6
8
|
from sigdetect.detector.file_result_model import FileResult
|
|
7
9
|
from sigdetect.detector.signature_model import Signature
|
|
8
10
|
from sigdetect.wet_detection import (
|
|
11
|
+
OcrLine,
|
|
9
12
|
WetCandidate,
|
|
10
13
|
_dedupe_wet_signatures,
|
|
11
14
|
_filter_candidates_for_page,
|
|
12
15
|
_image_candidates,
|
|
16
|
+
_build_candidates,
|
|
13
17
|
_refresh_metadata,
|
|
14
18
|
apply_wet_detection,
|
|
15
19
|
should_run_wet_pipeline,
|
|
@@ -130,6 +134,46 @@ def test_apply_wet_detection_preserves_esign_flags(monkeypatch, tmp_path: Path)
|
|
|
130
134
|
assert file_result.MixedContent is False
|
|
131
135
|
|
|
132
136
|
|
|
137
|
+
def test_apply_wet_detection_defaults_unknown_to_patient(monkeypatch, tmp_path: Path) -> None:
|
|
138
|
+
pdf_path = tmp_path / "doc.pdf"
|
|
139
|
+
_blank_pdf(pdf_path)
|
|
140
|
+
configuration = DetectConfiguration(
|
|
141
|
+
pdf_root=tmp_path,
|
|
142
|
+
out_dir=tmp_path,
|
|
143
|
+
engine="pypdf2",
|
|
144
|
+
profile="hipaa",
|
|
145
|
+
)
|
|
146
|
+
file_result = _empty_file_result("doc.pdf")
|
|
147
|
+
|
|
148
|
+
monkeypatch.setattr("sigdetect.wet_detection._ensure_dependencies", lambda: None)
|
|
149
|
+
|
|
150
|
+
def fake_detect(pdf_path, configuration, file_result, logger=None):
|
|
151
|
+
file_result.Signatures.append(
|
|
152
|
+
Signature(
|
|
153
|
+
Page=1,
|
|
154
|
+
FieldName="wet_signature_detected",
|
|
155
|
+
Role="unknown",
|
|
156
|
+
Score=88,
|
|
157
|
+
Scores={"unknown": 88},
|
|
158
|
+
Evidence=["wet:true"],
|
|
159
|
+
Hint="WetSignatureOCR",
|
|
160
|
+
RenderType="wet",
|
|
161
|
+
BoundingBox=(10.0, 10.0, 100.0, 40.0),
|
|
162
|
+
)
|
|
163
|
+
)
|
|
164
|
+
_refresh_metadata(file_result)
|
|
165
|
+
return True
|
|
166
|
+
|
|
167
|
+
monkeypatch.setattr("sigdetect.wet_detection._detect", fake_detect)
|
|
168
|
+
|
|
169
|
+
applied = apply_wet_detection(pdf_path, configuration, file_result)
|
|
170
|
+
|
|
171
|
+
assert applied is True
|
|
172
|
+
assert file_result.Signatures
|
|
173
|
+
assert file_result.Signatures[0].Role == "patient"
|
|
174
|
+
assert "role_default:patient" in (file_result.Signatures[0].Evidence or [])
|
|
175
|
+
|
|
176
|
+
|
|
133
177
|
def test_image_candidate_detection_infers_role_from_nearby_text() -> None:
|
|
134
178
|
class Rect:
|
|
135
179
|
def __init__(self, x0, y0, x1, y1):
|
|
@@ -213,3 +257,57 @@ def test_dedupe_wet_signatures_keeps_best_per_role() -> None:
|
|
|
213
257
|
assert filtered[0].Role == "patient"
|
|
214
258
|
assert filtered[0].Page == 2
|
|
215
259
|
assert "image_signature:true" in filtered[0].Evidence
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def test_dedupe_wet_signatures_keeps_unknown_when_only() -> None:
|
|
263
|
+
def make_signature(page: int, role: str, score: int, evidence: list[str]) -> Signature:
|
|
264
|
+
return Signature(
|
|
265
|
+
Page=page,
|
|
266
|
+
FieldName="wet_signature_detected",
|
|
267
|
+
Role=role,
|
|
268
|
+
Score=score,
|
|
269
|
+
Scores={role: score},
|
|
270
|
+
Evidence=evidence,
|
|
271
|
+
Hint="WetSignatureOCR",
|
|
272
|
+
RenderType="wet",
|
|
273
|
+
BoundingBox=(0.0, 0.0, 10.0, 10.0),
|
|
274
|
+
)
|
|
275
|
+
|
|
276
|
+
unknown = make_signature(1, "unknown", 90, ["ocr_line:signature", "stroke:no"])
|
|
277
|
+
filtered = _dedupe_wet_signatures([unknown])
|
|
278
|
+
|
|
279
|
+
assert len(filtered) == 1
|
|
280
|
+
assert filtered[0].Role == "unknown"
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def test_build_candidates_respects_min_y_ratio() -> None:
|
|
284
|
+
class DummyPageRect:
|
|
285
|
+
x0, y0, x1, y1 = 0.0, 0.0, 600.0, 800.0
|
|
286
|
+
|
|
287
|
+
image = Image.new("RGB", (100, 100), "white")
|
|
288
|
+
line = OcrLine(text="Signature", confidence=0.9, left=10, top=10, right=90, bottom=30)
|
|
289
|
+
|
|
290
|
+
candidates_default = list(
|
|
291
|
+
_build_candidates(
|
|
292
|
+
[line],
|
|
293
|
+
image=image,
|
|
294
|
+
page_rect=DummyPageRect(),
|
|
295
|
+
pix_width=100,
|
|
296
|
+
pix_height=100,
|
|
297
|
+
scale=1.0,
|
|
298
|
+
)
|
|
299
|
+
)
|
|
300
|
+
candidates_relaxed = list(
|
|
301
|
+
_build_candidates(
|
|
302
|
+
[line],
|
|
303
|
+
image=image,
|
|
304
|
+
page_rect=DummyPageRect(),
|
|
305
|
+
pix_width=100,
|
|
306
|
+
pix_height=100,
|
|
307
|
+
scale=1.0,
|
|
308
|
+
min_y_ratio=0.2,
|
|
309
|
+
)
|
|
310
|
+
)
|
|
311
|
+
|
|
312
|
+
assert not candidates_default
|
|
313
|
+
assert candidates_relaxed
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|