sigdetect 0.5.1__tar.gz → 0.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {sigdetect-0.5.1 → sigdetect-0.5.2}/PKG-INFO +1 -1
  2. {sigdetect-0.5.1 → sigdetect-0.5.2}/pyproject.toml +1 -1
  3. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/cropping.py +237 -2
  4. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/wet_detection.py +48 -14
  5. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect.egg-info/PKG-INFO +1 -1
  6. {sigdetect-0.5.1 → sigdetect-0.5.2}/tests/test_cropping.py +50 -1
  7. {sigdetect-0.5.1 → sigdetect-0.5.2}/tests/test_wet_detection.py +98 -0
  8. {sigdetect-0.5.1 → sigdetect-0.5.2}/README.md +0 -0
  9. {sigdetect-0.5.1 → sigdetect-0.5.2}/setup.cfg +0 -0
  10. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/__init__.py +0 -0
  11. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/api.py +0 -0
  12. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/cli.py +0 -0
  13. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/config.py +0 -0
  14. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/data/role_rules.retainer.yml +0 -0
  15. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/data/role_rules.yml +0 -0
  16. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/data/vendor_patterns.yml +0 -0
  17. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/detector/__init__.py +0 -0
  18. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/detector/base.py +0 -0
  19. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/detector/base_detector.py +0 -0
  20. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/detector/file_result_model.py +0 -0
  21. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/detector/pymupdf_engine.py +0 -0
  22. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/detector/pypdf2_engine.py +0 -0
  23. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/detector/signature_model.py +0 -0
  24. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/eda.py +0 -0
  25. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/logging_setup.py +0 -0
  26. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/utils.py +0 -0
  27. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect.egg-info/SOURCES.txt +0 -0
  28. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect.egg-info/dependency_links.txt +0 -0
  29. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect.egg-info/entry_points.txt +0 -0
  30. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect.egg-info/requires.txt +0 -0
  31. {sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect.egg-info/top_level.txt +0 -0
  32. {sigdetect-0.5.1 → sigdetect-0.5.2}/tests/test_api.py +0 -0
  33. {sigdetect-0.5.1 → sigdetect-0.5.2}/tests/test_cli.py +0 -0
  34. {sigdetect-0.5.1 → sigdetect-0.5.2}/tests/test_detector_options.py +0 -0
  35. {sigdetect-0.5.1 → sigdetect-0.5.2}/tests/test_pymupdf_engine.py +0 -0
  36. {sigdetect-0.5.1 → sigdetect-0.5.2}/tests/test_widget_role_patient_smoke.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sigdetect
3
- Version: 0.5.1
3
+ Version: 0.5.2
4
4
  Summary: Signature detection and role attribution for PDFs
5
5
  Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sigdetect"
7
- version = "0.5.1"
7
+ version = "0.5.2"
8
8
  description = "Signature detection and role attribution for PDFs"
9
9
  readme = "README.md"
10
10
  authors = [{ name = "BT Asmamaw", email = "basmamaw@angeiongroup.com" }]
@@ -9,6 +9,8 @@ from dataclasses import dataclass
9
9
  from pathlib import Path
10
10
  from typing import Literal, overload
11
11
 
12
+ from PIL import Image
13
+
12
14
  from .detector.file_result_model import FileResult
13
15
  from .detector.signature_model import Signature
14
16
 
@@ -53,6 +55,7 @@ def crop_signatures(
53
55
  return_bytes: Literal[False] = False,
54
56
  save_files: bool = True,
55
57
  docx: bool = False,
58
+ trim: bool = True,
56
59
  ) -> list[Path]: ...
57
60
 
58
61
 
@@ -67,6 +70,7 @@ def crop_signatures(
67
70
  return_bytes: Literal[True],
68
71
  save_files: bool = True,
69
72
  docx: bool = False,
73
+ trim: bool = True,
70
74
  ) -> list[SignatureCrop]: ...
71
75
 
72
76
 
@@ -80,6 +84,7 @@ def crop_signatures(
80
84
  return_bytes: bool = False,
81
85
  save_files: bool = True,
82
86
  docx: bool = False,
87
+ trim: bool = True,
83
88
  ) -> list[Path] | list[SignatureCrop]:
84
89
  """Render each signature bounding box to a PNG image and optionally wrap it in DOCX.
85
90
 
@@ -87,6 +92,7 @@ def crop_signatures(
87
92
  the files to ``output_dir``. Set ``save_files=False`` to skip writing PNGs to disk.
88
93
  When ``docx=True``, DOCX files are written instead of PNGs. When ``return_bytes`` is True
89
94
  and ``docx=True``, ``SignatureCrop.docx_bytes`` will contain the DOCX payload.
95
+ When ``trim`` is enabled, the crop is tightened around the detected ink where possible.
90
96
  """
91
97
 
92
98
  if fitz is None: # pragma: no cover - exercised when dependency absent
@@ -145,10 +151,12 @@ def crop_signatures(
145
151
  try:
146
152
  image_bytes: bytes | None = None
147
153
  pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
154
+ raw_bytes = pixmap.tobytes("png")
155
+ final_bytes = _trim_signature_image_bytes(raw_bytes) if trim else raw_bytes
148
156
  if save_files and not docx_enabled:
149
- pixmap.save(png_destination)
157
+ png_destination.write_bytes(final_bytes)
150
158
  if return_bytes or docx_enabled:
151
- image_bytes = pixmap.tobytes("png")
159
+ image_bytes = final_bytes
152
160
  except Exception as exc: # pragma: no cover - defensive
153
161
  if logger:
154
162
  logger.warning(
@@ -221,6 +229,233 @@ def _build_docx_bytes(image_bytes: bytes) -> bytes:
221
229
  return buffer.getvalue()
222
230
 
223
231
 
232
+ def _trim_signature_image_bytes(
233
+ image_bytes: bytes,
234
+ *,
235
+ pad_px: int = 4,
236
+ gap_px: int = 4,
237
+ min_density_ratio: float = 0.004,
238
+ ) -> bytes:
239
+ image = Image.open(io.BytesIO(image_bytes))
240
+ gray = image.convert("L")
241
+ width, height = gray.size
242
+
243
+ histogram = gray.histogram()
244
+ total_pixels = width * height
245
+ cutoff = int(total_pixels * 0.995)
246
+ cumulative = 0
247
+ white_level = 255
248
+ for idx, count in enumerate(histogram):
249
+ cumulative += count
250
+ if cumulative >= cutoff:
251
+ white_level = idx
252
+ break
253
+
254
+ if white_level < 200:
255
+ return image_bytes
256
+
257
+ thresholds = [min(254, max(200, white_level - delta)) for delta in (6, 4, 2, 1, 0)]
258
+ min_density = max(2, int(width * min_density_ratio))
259
+ pixels = gray.load()
260
+
261
+ row_densities: dict[int, list[int]] = {}
262
+ for threshold in thresholds:
263
+ row_density = []
264
+ for y in range(height):
265
+ dark = sum(1 for x in range(width) if pixels[x, y] < threshold)
266
+ row_density.append(dark)
267
+ row_densities[threshold] = row_density
268
+
269
+ line_bounds = _detect_horizontal_rule_cutoff(row_densities[thresholds[-1]], width)
270
+ scan_limit = None
271
+ descender_limit = height - 1
272
+ if line_bounds is not None:
273
+ line_start, line_end = line_bounds
274
+ scan_limit = max(0, line_start - 1)
275
+ descender_limit = min(height - 1, line_end + max(2, int(height * 0.02)))
276
+
277
+ min_band_height = max(4, int(height * 0.02))
278
+ best = None
279
+ best_small = None
280
+ best_small_threshold = None
281
+ best_threshold = None
282
+ line_threshold = int(width * 0.6)
283
+ for threshold in thresholds:
284
+ row_density = row_densities[threshold]
285
+ segments: list[tuple[int, int]] = []
286
+ start: int | None = None
287
+ for y, dark in enumerate(row_density):
288
+ if scan_limit is not None and y > scan_limit:
289
+ if start is not None:
290
+ segments.append((start, y - 1))
291
+ start = None
292
+ break
293
+ if dark >= min_density:
294
+ if start is None:
295
+ start = y
296
+ else:
297
+ if start is not None:
298
+ segments.append((start, y - 1))
299
+ start = None
300
+ if start is not None:
301
+ segments.append((start, height - 1))
302
+
303
+ if not segments:
304
+ continue
305
+
306
+ merged: list[list[int]] = []
307
+ for seg in segments:
308
+ if not merged:
309
+ merged.append([seg[0], seg[1]])
310
+ continue
311
+ if seg[0] - merged[-1][1] <= gap_px:
312
+ merged[-1][1] = seg[1]
313
+ else:
314
+ merged.append([seg[0], seg[1]])
315
+
316
+ candidates = []
317
+ for y0, y1 in merged:
318
+ min_x, max_x = width, -1
319
+ total_dark = 0
320
+ for y in range(y0, y1 + 1):
321
+ for x in range(width):
322
+ if pixels[x, y] < threshold:
323
+ total_dark += 1
324
+ if x < min_x:
325
+ min_x = x
326
+ if x > max_x:
327
+ max_x = x
328
+ if max_x < 0:
329
+ continue
330
+ band_height = y1 - y0 + 1
331
+ band_width = max_x - min_x + 1
332
+ score = total_dark * (band_height**1.3)
333
+ if line_bounds is not None:
334
+ distance = max(0, line_bounds[0] - y1)
335
+ proximity = 1.0 / (1.0 + (distance / 20.0))
336
+ score *= 1.0 + 0.5 * proximity
337
+ candidates.append(
338
+ {
339
+ "y0": y0,
340
+ "y1": y1,
341
+ "min_x": min_x,
342
+ "max_x": max_x,
343
+ "total": total_dark,
344
+ "height": band_height,
345
+ "width": band_width,
346
+ "score": score,
347
+ }
348
+ )
349
+
350
+ if not candidates:
351
+ continue
352
+
353
+ candidates.sort(key=lambda item: item["score"], reverse=True)
354
+ top_candidate = candidates[0]
355
+ if top_candidate["height"] >= min_band_height:
356
+ if best is None or top_candidate["score"] > best["score"]:
357
+ best = top_candidate
358
+ best_threshold = threshold
359
+ else:
360
+ if best_small is None or top_candidate["score"] > best_small["score"]:
361
+ best_small = top_candidate
362
+ best_small_threshold = threshold
363
+
364
+ if best is None:
365
+ best = best_small
366
+ best_threshold = best_small_threshold
367
+
368
+ if best is None:
369
+ return image_bytes
370
+
371
+ expansion_density = row_densities.get(best_threshold, row_densities[thresholds[-1]])
372
+ expand_threshold = max(1, int(min_density * 0.4))
373
+ y0 = best["y0"]
374
+ y1 = best["y1"]
375
+
376
+ while y0 > 0 and expansion_density[y0 - 1] >= expand_threshold:
377
+ y0 -= 1
378
+ while y1 < descender_limit and expansion_density[y1 + 1] >= expand_threshold:
379
+ y1 += 1
380
+
381
+ min_x, max_x = width, -1
382
+ for y in range(y0, y1 + 1):
383
+ if expansion_density[y] >= line_threshold:
384
+ continue
385
+ for x in range(width):
386
+ if pixels[x, y] < thresholds[-1]:
387
+ if x < min_x:
388
+ min_x = x
389
+ if x > max_x:
390
+ max_x = x
391
+ if max_x >= 0:
392
+ best = {
393
+ "y0": y0,
394
+ "y1": y1,
395
+ "min_x": min_x,
396
+ "max_x": max_x,
397
+ }
398
+
399
+ x0 = max(0, best["min_x"] - pad_px)
400
+ x1 = min(width - 1, best["max_x"] + pad_px)
401
+ y0 = max(0, best["y0"] - pad_px)
402
+ y1 = min(height - 1, best["y1"] + pad_px)
403
+
404
+ if x1 <= x0 or y1 <= y0:
405
+ return image_bytes
406
+ if (x1 - x0) < max(10, int(width * 0.2)) or (y1 - y0) < max(6, int(height * 0.08)):
407
+ return image_bytes
408
+
409
+ cropped = image.crop((x0, y0, x1 + 1, y1 + 1))
410
+ buffer = io.BytesIO()
411
+ cropped.save(buffer, format="PNG")
412
+ return buffer.getvalue()
413
+
414
+
415
+ def _detect_horizontal_rule_cutoff(
416
+ row_density: list[int],
417
+ width: int,
418
+ ) -> tuple[int, int] | None:
419
+ if not row_density:
420
+ return None
421
+ line_threshold = int(width * 0.6)
422
+ max_thickness = 4
423
+ segments: list[tuple[int, int]] = []
424
+ start = None
425
+ for y, density in enumerate(row_density):
426
+ if density >= line_threshold:
427
+ if start is None:
428
+ start = y
429
+ else:
430
+ if start is not None:
431
+ segments.append((start, y - 1))
432
+ start = None
433
+ if start is not None:
434
+ segments.append((start, len(row_density) - 1))
435
+
436
+ if not segments:
437
+ return None
438
+
439
+ total_dark = sum(row_density)
440
+ if total_dark <= 0:
441
+ return None
442
+
443
+ for y0, y1 in segments:
444
+ thickness = y1 - y0 + 1
445
+ if thickness > max_thickness:
446
+ continue
447
+ above_dark = sum(row_density[:y0])
448
+ below_dark = sum(row_density[y1 + 1 :])
449
+ if above_dark < 40:
450
+ continue
451
+ midpoint_ratio = ((y0 + y1) / 2.0) / max(1, len(row_density))
452
+ if midpoint_ratio >= 0.35:
453
+ return (y0, y1)
454
+ if above_dark >= max(40, int(below_dark * 0.3)):
455
+ return (y0, y1)
456
+ return None
457
+
458
+
224
459
  def _to_clip_rect(page, bbox: tuple[float, float, float, float]):
225
460
  width = float(page.rect.width)
226
461
  height = float(page.rect.height)
@@ -94,6 +94,17 @@ def apply_wet_detection(
94
94
  original_mixed = file_result.MixedContent
95
95
  try:
96
96
  added = _detect(pdf_path, configuration, file_result, logger=logger)
97
+ if added and configuration.Profile == "hipaa":
98
+ updated = False
99
+ for signature in file_result.Signatures:
100
+ if signature.RenderType == "wet" and (signature.Role or "unknown") == "unknown":
101
+ signature.Role = "patient"
102
+ signature.Scores = {"patient": int(signature.Score or 0)}
103
+ signature.Evidence = list(signature.Evidence or [])
104
+ signature.Evidence.append("role_default:patient")
105
+ updated = True
106
+ if updated:
107
+ _refresh_metadata(file_result)
97
108
  if not added:
98
109
  _mark_manual_review(file_result, "NoHighConfidenceWetSignature")
99
110
  return added
@@ -136,6 +147,18 @@ def _detect(
136
147
  scale=configuration.WetOcrDpi / 72.0,
137
148
  )
138
149
  )
150
+ if not candidates:
151
+ candidates = list(
152
+ _build_candidates(
153
+ ocr_lines,
154
+ image=image,
155
+ page_rect=page.rect,
156
+ pix_width=pixmap.width,
157
+ pix_height=pixmap.height,
158
+ scale=configuration.WetOcrDpi / 72.0,
159
+ min_y_ratio=0.2,
160
+ )
161
+ )
139
162
  candidates.extend(_image_candidates(page))
140
163
  candidates = _filter_candidates_for_page(candidates)
141
164
  accepted = [
@@ -247,6 +270,7 @@ def _build_candidates(
247
270
  pix_width: int,
248
271
  pix_height: int,
249
272
  scale: float,
273
+ min_y_ratio: float = 0.4,
250
274
  ) -> Iterable[WetCandidate]:
251
275
  for line in lines:
252
276
  normalized = line.text.lower()
@@ -255,7 +279,7 @@ def _build_candidates(
255
279
  if len(normalized) > 80:
256
280
  # Ignore long paragraph-like OCR lines
257
281
  continue
258
- if (line.bottom / pix_height) < 0.4:
282
+ if (line.bottom / pix_height) < min_y_ratio:
259
283
  # Ignore lines in the upper section of the page
260
284
  continue
261
285
  role = _infer_role(normalized)
@@ -338,28 +362,33 @@ def _expand_bbox(
338
362
  ) -> tuple[float, float, float, float]:
339
363
  x0 = line.left / scale
340
364
  x1 = line.right / scale
341
- y1 = (pix_height - line.top) / scale
365
+ y_top = (pix_height - line.top) / scale
366
+ y_bottom = (pix_height - line.bottom) / scale
342
367
 
343
368
  pad_x = max(14.0, (x1 - x0) * 0.25)
344
369
  left = max(page_rect.x0, x0 - pad_x)
345
370
  right = min(page_rect.x1, x1 + pad_x)
346
371
 
347
372
  gap = 14.0
348
- signature_height = 70.0
349
- top = min(page_rect.y1, y1 + gap)
350
- bottom = min(page_rect.y1, top + signature_height)
351
-
352
- if bottom <= top:
353
- bottom = min(page_rect.y1, top + signature_height)
373
+ line_height = max(1.0, (line.bottom - line.top) / scale)
374
+ signature_height = max(70.0, line_height * 6.0)
375
+ upper = min(page_rect.y1, y_bottom - gap)
376
+ upper = max(page_rect.y0, upper)
377
+ lower = max(page_rect.y0, upper - signature_height)
354
378
 
355
379
  if stroke_y is not None:
356
- # Anchor to the detected stroke under the OCR label when available.
380
+ # Anchor to the detected stroke (signature line) beneath the label.
357
381
  sy = (pix_height - stroke_y) / scale
358
- if sy < top:
359
- top = sy
360
- bottom = max(bottom, sy + signature_height)
382
+ field_lower = min(page_rect.y1, max(page_rect.y0, sy + 2.0))
383
+ field_upper = min(page_rect.y1, y_bottom - gap)
384
+ if field_upper > field_lower + 6.0:
385
+ lower = field_lower
386
+ upper = field_upper
387
+ else:
388
+ upper = min(page_rect.y1, field_lower + signature_height)
389
+ lower = max(page_rect.y0, upper - signature_height)
361
390
 
362
- return (float(left), float(top), float(right), float(bottom))
391
+ return (float(left), float(lower), float(right), float(upper))
363
392
 
364
393
 
365
394
  def _stroke_under_line(image: Image.Image, line: OcrLine) -> tuple[bool, float | None]:
@@ -513,14 +542,19 @@ def _signature_rank(signature: Signature) -> tuple[int, int, int]:
513
542
 
514
543
  def _dedupe_wet_signatures(signatures: Sequence[Signature]) -> list[Signature]:
515
544
  best_by_role: dict[str, Signature] = {}
545
+ best_unknown: Signature | None = None
516
546
  for signature in signatures:
517
547
  role = (signature.Role or "unknown").strip().lower()
518
548
  if role == "unknown":
549
+ if best_unknown is None or _signature_rank(signature) > _signature_rank(best_unknown):
550
+ best_unknown = signature
519
551
  continue
520
552
  existing = best_by_role.get(role)
521
553
  if existing is None or _signature_rank(signature) > _signature_rank(existing):
522
554
  best_by_role[role] = signature
523
- return sorted(best_by_role.values(), key=lambda sig: (int(sig.Page or 0), sig.Role or ""))
555
+ if best_by_role:
556
+ return sorted(best_by_role.values(), key=lambda sig: (int(sig.Page or 0), sig.Role or ""))
557
+ return [best_unknown] if best_unknown is not None else []
524
558
 
525
559
 
526
560
  def _mark_manual_review(file_result: FileResult, reason: str) -> None:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sigdetect
3
- Version: 0.5.1
3
+ Version: 0.5.2
4
4
  Summary: Signature detection and role attribution for PDFs
5
5
  Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
6
6
  License: MIT
@@ -1,12 +1,14 @@
1
+ import io
1
2
  from pathlib import Path
2
3
 
3
4
  import pytest
4
5
  from pypdf import PdfWriter
6
+ from PIL import Image, ImageDraw
5
7
  from pypdf.generic import ArrayObject, DictionaryObject, NameObject, NumberObject, TextStringObject
6
8
 
7
9
  from sigdetect.api import CropSignatureImages, DetectPdf
8
10
  from sigdetect.config import DetectConfiguration
9
- from sigdetect.cropping import SignatureCrop, crop_signatures
11
+ from sigdetect.cropping import SignatureCrop, _trim_signature_image_bytes, crop_signatures
10
12
  from sigdetect.detector.pypdf2_engine import PyPDF2Detector
11
13
 
12
14
  pytest.importorskip("fitz")
@@ -43,6 +45,53 @@ def _pdf_with_signature(path: Path) -> None:
43
45
  writer.write(handle)
44
46
 
45
47
 
48
+ def _build_test_crop_bytes() -> bytes:
49
+ image = Image.new("RGB", (200, 100), "white")
50
+ draw = ImageDraw.Draw(image)
51
+ draw.rectangle([20, 10, 80, 20], fill="black")
52
+ draw.rectangle([10, 60, 190, 80], fill="black")
53
+ buffer = io.BytesIO()
54
+ image.save(buffer, format="PNG")
55
+ return buffer.getvalue()
56
+
57
+
58
+ def test_trim_signature_image_bytes_prefers_lower_band() -> None:
59
+ original = _build_test_crop_bytes()
60
+ trimmed = _trim_signature_image_bytes(original, pad_px=2)
61
+
62
+ original_image = Image.open(io.BytesIO(original))
63
+ trimmed_image = Image.open(io.BytesIO(trimmed))
64
+
65
+ assert trimmed_image.height < original_image.height
66
+
67
+ gray = trimmed_image.convert("L")
68
+ pixels = gray.load()
69
+ width, height = gray.size
70
+ top_dark = sum(1 for x in range(width) for y in range(min(8, height)) if pixels[x, y] < 240)
71
+ assert top_dark == 0
72
+
73
+
74
+ def test_trim_signature_image_bytes_respects_horizontal_rule() -> None:
75
+ image = Image.new("RGB", (200, 120), "white")
76
+ draw = ImageDraw.Draw(image)
77
+ # Signature scribble above the line.
78
+ draw.line([20, 20, 180, 30], fill="black", width=3)
79
+ draw.line([25, 28, 140, 18], fill="black", width=2)
80
+ # Horizontal rule separating signature from print name.
81
+ draw.line([10, 50, 190, 50], fill="black", width=2)
82
+ # Text-ish block below the line.
83
+ draw.rectangle([20, 70, 120, 85], fill="black")
84
+
85
+ buffer = io.BytesIO()
86
+ image.save(buffer, format="PNG")
87
+ trimmed = _trim_signature_image_bytes(buffer.getvalue(), pad_px=2)
88
+
89
+ trimmed_image = Image.open(io.BytesIO(trimmed)).convert("L")
90
+ width, height = trimmed_image.size
91
+ # Ensure we trimmed off the lower text block (should be well above original height).
92
+ assert height < 90
93
+
94
+
46
95
  def test_crop_signatures(tmp_path: Path):
47
96
  pdf_path = tmp_path / "doc.pdf"
48
97
  _pdf_with_signature(pdf_path)
@@ -1,15 +1,19 @@
1
1
  from pathlib import Path
2
2
 
3
+ from PIL import Image
4
+
3
5
  from pypdf import PdfWriter
4
6
 
5
7
  from sigdetect.config import DetectConfiguration
6
8
  from sigdetect.detector.file_result_model import FileResult
7
9
  from sigdetect.detector.signature_model import Signature
8
10
  from sigdetect.wet_detection import (
11
+ OcrLine,
9
12
  WetCandidate,
10
13
  _dedupe_wet_signatures,
11
14
  _filter_candidates_for_page,
12
15
  _image_candidates,
16
+ _build_candidates,
13
17
  _refresh_metadata,
14
18
  apply_wet_detection,
15
19
  should_run_wet_pipeline,
@@ -130,6 +134,46 @@ def test_apply_wet_detection_preserves_esign_flags(monkeypatch, tmp_path: Path)
130
134
  assert file_result.MixedContent is False
131
135
 
132
136
 
137
+ def test_apply_wet_detection_defaults_unknown_to_patient(monkeypatch, tmp_path: Path) -> None:
138
+ pdf_path = tmp_path / "doc.pdf"
139
+ _blank_pdf(pdf_path)
140
+ configuration = DetectConfiguration(
141
+ pdf_root=tmp_path,
142
+ out_dir=tmp_path,
143
+ engine="pypdf2",
144
+ profile="hipaa",
145
+ )
146
+ file_result = _empty_file_result("doc.pdf")
147
+
148
+ monkeypatch.setattr("sigdetect.wet_detection._ensure_dependencies", lambda: None)
149
+
150
+ def fake_detect(pdf_path, configuration, file_result, logger=None):
151
+ file_result.Signatures.append(
152
+ Signature(
153
+ Page=1,
154
+ FieldName="wet_signature_detected",
155
+ Role="unknown",
156
+ Score=88,
157
+ Scores={"unknown": 88},
158
+ Evidence=["wet:true"],
159
+ Hint="WetSignatureOCR",
160
+ RenderType="wet",
161
+ BoundingBox=(10.0, 10.0, 100.0, 40.0),
162
+ )
163
+ )
164
+ _refresh_metadata(file_result)
165
+ return True
166
+
167
+ monkeypatch.setattr("sigdetect.wet_detection._detect", fake_detect)
168
+
169
+ applied = apply_wet_detection(pdf_path, configuration, file_result)
170
+
171
+ assert applied is True
172
+ assert file_result.Signatures
173
+ assert file_result.Signatures[0].Role == "patient"
174
+ assert "role_default:patient" in (file_result.Signatures[0].Evidence or [])
175
+
176
+
133
177
  def test_image_candidate_detection_infers_role_from_nearby_text() -> None:
134
178
  class Rect:
135
179
  def __init__(self, x0, y0, x1, y1):
@@ -213,3 +257,57 @@ def test_dedupe_wet_signatures_keeps_best_per_role() -> None:
213
257
  assert filtered[0].Role == "patient"
214
258
  assert filtered[0].Page == 2
215
259
  assert "image_signature:true" in filtered[0].Evidence
260
+
261
+
262
+ def test_dedupe_wet_signatures_keeps_unknown_when_only() -> None:
263
+ def make_signature(page: int, role: str, score: int, evidence: list[str]) -> Signature:
264
+ return Signature(
265
+ Page=page,
266
+ FieldName="wet_signature_detected",
267
+ Role=role,
268
+ Score=score,
269
+ Scores={role: score},
270
+ Evidence=evidence,
271
+ Hint="WetSignatureOCR",
272
+ RenderType="wet",
273
+ BoundingBox=(0.0, 0.0, 10.0, 10.0),
274
+ )
275
+
276
+ unknown = make_signature(1, "unknown", 90, ["ocr_line:signature", "stroke:no"])
277
+ filtered = _dedupe_wet_signatures([unknown])
278
+
279
+ assert len(filtered) == 1
280
+ assert filtered[0].Role == "unknown"
281
+
282
+
283
+ def test_build_candidates_respects_min_y_ratio() -> None:
284
+ class DummyPageRect:
285
+ x0, y0, x1, y1 = 0.0, 0.0, 600.0, 800.0
286
+
287
+ image = Image.new("RGB", (100, 100), "white")
288
+ line = OcrLine(text="Signature", confidence=0.9, left=10, top=10, right=90, bottom=30)
289
+
290
+ candidates_default = list(
291
+ _build_candidates(
292
+ [line],
293
+ image=image,
294
+ page_rect=DummyPageRect(),
295
+ pix_width=100,
296
+ pix_height=100,
297
+ scale=1.0,
298
+ )
299
+ )
300
+ candidates_relaxed = list(
301
+ _build_candidates(
302
+ [line],
303
+ image=image,
304
+ page_rect=DummyPageRect(),
305
+ pix_width=100,
306
+ pix_height=100,
307
+ scale=1.0,
308
+ min_y_ratio=0.2,
309
+ )
310
+ )
311
+
312
+ assert not candidates_default
313
+ assert candidates_relaxed
File without changes
File without changes
File without changes
File without changes