sigdetect 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sigdetect/cropping.py CHANGED
@@ -9,6 +9,8 @@ from dataclasses import dataclass
9
9
  from pathlib import Path
10
10
  from typing import Literal, overload
11
11
 
12
+ from PIL import Image
13
+
12
14
  from .detector.file_result_model import FileResult
13
15
  from .detector.signature_model import Signature
14
16
 
@@ -53,6 +55,7 @@ def crop_signatures(
53
55
  return_bytes: Literal[False] = False,
54
56
  save_files: bool = True,
55
57
  docx: bool = False,
58
+ trim: bool = True,
56
59
  ) -> list[Path]: ...
57
60
 
58
61
 
@@ -67,6 +70,7 @@ def crop_signatures(
67
70
  return_bytes: Literal[True],
68
71
  save_files: bool = True,
69
72
  docx: bool = False,
73
+ trim: bool = True,
70
74
  ) -> list[SignatureCrop]: ...
71
75
 
72
76
 
@@ -80,6 +84,7 @@ def crop_signatures(
80
84
  return_bytes: bool = False,
81
85
  save_files: bool = True,
82
86
  docx: bool = False,
87
+ trim: bool = True,
83
88
  ) -> list[Path] | list[SignatureCrop]:
84
89
  """Render each signature bounding box to a PNG image and optionally wrap it in DOCX.
85
90
 
@@ -87,6 +92,7 @@ def crop_signatures(
87
92
  the files to ``output_dir``. Set ``save_files=False`` to skip writing PNGs to disk.
88
93
  When ``docx=True``, DOCX files are written instead of PNGs. When ``return_bytes`` is True
89
94
  and ``docx=True``, ``SignatureCrop.docx_bytes`` will contain the DOCX payload.
95
+ When ``trim`` is enabled, the crop is tightened around the detected ink where possible.
90
96
  """
91
97
 
92
98
  if fitz is None: # pragma: no cover - exercised when dependency absent
@@ -145,10 +151,12 @@ def crop_signatures(
145
151
  try:
146
152
  image_bytes: bytes | None = None
147
153
  pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
154
+ raw_bytes = pixmap.tobytes("png")
155
+ final_bytes = _trim_signature_image_bytes(raw_bytes) if trim else raw_bytes
148
156
  if save_files and not docx_enabled:
149
- pixmap.save(png_destination)
157
+ png_destination.write_bytes(final_bytes)
150
158
  if return_bytes or docx_enabled:
151
- image_bytes = pixmap.tobytes("png")
159
+ image_bytes = final_bytes
152
160
  except Exception as exc: # pragma: no cover - defensive
153
161
  if logger:
154
162
  logger.warning(
@@ -221,6 +229,233 @@ def _build_docx_bytes(image_bytes: bytes) -> bytes:
221
229
  return buffer.getvalue()
222
230
 
223
231
 
232
def _trim_signature_image_bytes(
    image_bytes: bytes,
    *,
    pad_px: int = 4,
    gap_px: int = 4,
    min_density_ratio: float = 0.004,
) -> bytes:
    """Tighten an encoded signature crop around its densest band of ink.

    The image is analysed in grayscale: rows containing enough dark pixels are
    grouped into horizontal bands, the best-scoring band is selected, grown
    over adjacent faint rows, and the original image is cropped to that band's
    ink extent plus padding.

    Args:
        image_bytes: Encoded image readable by ``PIL.Image.open``.
        pad_px: Padding, in pixels, added on every side of the detected ink.
        gap_px: Maximum row distance between two dense runs for them to be
            merged into a single band.
        min_density_ratio: Fraction of the image width a row's dark-pixel
            count must reach (never fewer than 2 pixels) to count as ink.

    Returns:
        PNG bytes of the trimmed image, or ``image_bytes`` unchanged when the
        background is not light enough, no ink band is found, or the trimmed
        box would be degenerate or implausibly small.
    """
    with Image.open(io.BytesIO(image_bytes)) as image:
        box = _find_signature_trim_box(
            image.convert("L"),
            pad_px=pad_px,
            gap_px=gap_px,
            min_density_ratio=min_density_ratio,
        )
        if box is None:
            return image_bytes
        buffer = io.BytesIO()
        image.crop(box).save(buffer, format="PNG")
    return buffer.getvalue()


def _estimate_white_level(histogram: list[int], total_pixels: int) -> int:
    """Return the lowest gray level at which the cumulative histogram covers 99.5% of pixels."""
    cutoff = int(total_pixels * 0.995)
    cumulative = 0
    for level, count in enumerate(histogram):
        cumulative += count
        if cumulative >= cutoff:
            return level
    return 255


def _dense_row_bands(
    row_density: list[int],
    min_density: int,
    gap_px: int,
) -> list[tuple[int, int]]:
    """Group rows holding at least ``min_density`` dark pixels into bands, merging near neighbours."""
    runs: list[tuple[int, int]] = []
    start: int | None = None
    for y, dark in enumerate(row_density):
        if dark >= min_density:
            if start is None:
                start = y
        elif start is not None:
            runs.append((start, y - 1))
            start = None
    if start is not None:
        runs.append((start, len(row_density) - 1))

    merged: list[list[int]] = []
    for first, last in runs:
        # The distance is measured from the previous band's last row to this run's first row.
        if merged and first - merged[-1][1] <= gap_px:
            merged[-1][1] = last
        else:
            merged.append([first, last])
    return [(first, last) for first, last in merged]


def _dark_column_span(pixels, width: int, rows, threshold: int) -> tuple[int, int] | None:
    """Return ``(min_x, max_x)`` of pixels darker than ``threshold`` over ``rows``, or None."""
    min_x, max_x = width, -1
    for y in rows:
        dark_columns = [x for x in range(width) if pixels[x, y] < threshold]
        if dark_columns:
            min_x = min(min_x, dark_columns[0])
            max_x = max(max_x, dark_columns[-1])
    if max_x < 0:
        return None
    return (min_x, max_x)


def _find_signature_trim_box(
    gray: Image.Image,
    *,
    pad_px: int,
    gap_px: int,
    min_density_ratio: float,
) -> tuple[int, int, int, int] | None:
    """Compute the ``(left, upper, right, lower)`` crop box for a grayscale crop, or None to skip."""
    width, height = gray.size

    white_level = _estimate_white_level(gray.histogram(), width * height)
    if white_level < 200:
        # Background is not light enough for the darkness thresholds below.
        return None

    # Progressively looser darkness thresholds, clamped to [200, 254].
    thresholds = [min(254, max(200, white_level - delta)) for delta in (6, 4, 2, 1, 0)]
    base_threshold = thresholds[-1]
    # Clamping often yields repeated values. A repeated threshold can never beat
    # its first occurrence (comparisons below are strict), so scan each once.
    distinct_thresholds = list(dict.fromkeys(thresholds))

    min_density = max(2, int(width * min_density_ratio))
    pixels = gray.load()

    row_densities: dict[int, list[int]] = {
        threshold: [
            sum(1 for x in range(width) if pixels[x, y] < threshold)
            for y in range(height)
        ]
        for threshold in distinct_thresholds
    }

    line_bounds = _detect_horizontal_rule_cutoff(row_densities[base_threshold], width)
    scan_rows = height  # number of leading rows searched for ink bands
    descender_limit = height - 1
    if line_bounds is not None:
        line_start, line_end = line_bounds
        # Search for bands only above the rule, but let the final band grow
        # slightly past it during expansion.
        scan_rows = min(height, max(0, line_start - 1) + 1)
        descender_limit = min(height - 1, line_end + max(2, int(height * 0.02)))

    min_band_height = max(4, int(height * 0.02))
    # Each candidate is (score, y0, y1, min_x, max_x).
    best: tuple[float, int, int, int, int] | None = None
    best_threshold: int | None = None
    best_small: tuple[float, int, int, int, int] | None = None
    best_small_threshold: int | None = None

    for threshold in distinct_thresholds:
        row_density = row_densities[threshold]
        top: tuple[float, int, int, int, int] | None = None
        for band_y0, band_y1 in _dense_row_bands(row_density[:scan_rows], min_density, gap_px):
            span = _dark_column_span(pixels, width, range(band_y0, band_y1 + 1), threshold)
            if span is None:
                continue
            band_height = band_y1 - band_y0 + 1
            # Dark pixels in the band equal the sum of its cached row densities.
            total_dark = sum(row_density[band_y0 : band_y1 + 1])
            score = total_dark * (band_height**1.3)
            if line_bounds is not None:
                distance = max(0, line_bounds[0] - band_y1)
                proximity = 1.0 / (1.0 + (distance / 20.0))
                score *= 1.0 + 0.5 * proximity
            # Strict comparison keeps the earliest band on ties.
            if top is None or score > top[0]:
                top = (score, band_y0, band_y1, span[0], span[1])

        if top is None:
            continue

        if top[2] - top[1] + 1 >= min_band_height:
            if best is None or top[0] > best[0]:
                best = top
                best_threshold = threshold
        elif best_small is None or top[0] > best_small[0]:
            best_small = top
            best_small_threshold = threshold

    if best is None:
        # No band reached the minimum height; fall back to the best short one.
        best = best_small
        best_threshold = best_small_threshold
    if best is None:
        return None

    _, y0, y1, min_x, max_x = best
    expansion_density = row_densities.get(best_threshold, row_densities[base_threshold])
    expand_threshold = max(1, int(min_density * 0.4))

    # Grow the band over adjacent rows that still carry a little ink.
    while y0 > 0 and expansion_density[y0 - 1] >= expand_threshold:
        y0 -= 1
    while y1 < descender_limit and expansion_density[y1 + 1] >= expand_threshold:
        y1 += 1

    # Re-measure the horizontal extent over the grown band, skipping rows
    # dense enough to be a horizontal rule.
    line_threshold = int(width * 0.6)
    ink_rows = (y for y in range(y0, y1 + 1) if expansion_density[y] < line_threshold)
    span = _dark_column_span(pixels, width, ink_rows, base_threshold)
    if span is None:
        # Nothing measurable in the grown band: keep the original candidate.
        _, y0, y1, min_x, max_x = best
    else:
        min_x, max_x = span

    left = max(0, min_x - pad_px)
    right = min(width - 1, max_x + pad_px)
    upper = max(0, y0 - pad_px)
    lower = min(height - 1, y1 + pad_px)

    if right <= left or lower <= upper:
        return None
    if (right - left) < max(10, int(width * 0.2)) or (lower - upper) < max(6, int(height * 0.08)):
        # Implausibly small result; treat the trim as unreliable.
        return None

    # PIL crop boxes exclude the right and lower edges.
    return (left, upper, right + 1, lower + 1)
413
+
414
+
415
def _detect_horizontal_rule_cutoff(
    row_density: list[int],
    width: int,
) -> tuple[int, int] | None:
    """Find a thin horizontal rule in a per-row dark-pixel profile.

    A rule is a run of at most 4 consecutive rows whose dark-pixel count is at
    least 60% of ``width``. A run is accepted only when at least 40 dark pixels
    lie above it, and either its midpoint sits at or below 35% of the profile
    height, or the ink above it is at least 30% of the ink below it.

    Args:
        row_density: Number of dark pixels in each row, top to bottom.
        width: Row width in pixels, used to derive the rule threshold.

    Returns:
        ``(first_row, last_row)`` of the first accepted run scanning from the
        top, or ``None`` when no run qualifies.
    """
    if not row_density:
        return None

    row_count = len(row_density)
    line_threshold = int(width * 0.6)
    max_thickness = 4

    # Collect maximal runs of consecutive rows dense enough to be a rule.
    segments: list[tuple[int, int]] = []
    start: int | None = None
    for y, density in enumerate(row_density):
        if density >= line_threshold:
            if start is None:
                start = y
        elif start is not None:
            segments.append((start, y - 1))
            start = None
    if start is not None:
        segments.append((start, row_count - 1))

    if not segments:
        return None

    # prefix[i] == sum(row_density[:i]); one pass replaces re-summing slices
    # for every candidate segment.
    prefix = [0]
    for density in row_density:
        prefix.append(prefix[-1] + density)

    total_dark = prefix[-1]
    if total_dark <= 0:
        return None

    for y0, y1 in segments:
        if y1 - y0 + 1 > max_thickness:
            # Too thick to be a rule.
            continue
        above_dark = prefix[y0]
        if above_dark < 40:
            # Too little ink above the run to treat it as a baseline.
            continue
        midpoint_ratio = ((y0 + y1) / 2.0) / row_count
        if midpoint_ratio >= 0.35:
            return (y0, y1)
        below_dark = total_dark - prefix[y1 + 1]
        if above_dark >= max(40, int(below_dark * 0.3)):
            return (y0, y1)
    return None
457
+
458
+
224
459
  def _to_clip_rect(page, bbox: tuple[float, float, float, float]):
225
460
  width = float(page.rect.width)
226
461
  height = float(page.rect.height)
@@ -94,6 +94,17 @@ def apply_wet_detection(
94
94
  original_mixed = file_result.MixedContent
95
95
  try:
96
96
  added = _detect(pdf_path, configuration, file_result, logger=logger)
97
+ if added and configuration.Profile == "hipaa":
98
+ updated = False
99
+ for signature in file_result.Signatures:
100
+ if signature.RenderType == "wet" and (signature.Role or "unknown") == "unknown":
101
+ signature.Role = "patient"
102
+ signature.Scores = {"patient": int(signature.Score or 0)}
103
+ signature.Evidence = list(signature.Evidence or [])
104
+ signature.Evidence.append("role_default:patient")
105
+ updated = True
106
+ if updated:
107
+ _refresh_metadata(file_result)
97
108
  if not added:
98
109
  _mark_manual_review(file_result, "NoHighConfidenceWetSignature")
99
110
  return added
@@ -136,6 +147,18 @@ def _detect(
136
147
  scale=configuration.WetOcrDpi / 72.0,
137
148
  )
138
149
  )
150
+ if not candidates:
151
+ candidates = list(
152
+ _build_candidates(
153
+ ocr_lines,
154
+ image=image,
155
+ page_rect=page.rect,
156
+ pix_width=pixmap.width,
157
+ pix_height=pixmap.height,
158
+ scale=configuration.WetOcrDpi / 72.0,
159
+ min_y_ratio=0.2,
160
+ )
161
+ )
139
162
  candidates.extend(_image_candidates(page))
140
163
  candidates = _filter_candidates_for_page(candidates)
141
164
  accepted = [
@@ -247,6 +270,7 @@ def _build_candidates(
247
270
  pix_width: int,
248
271
  pix_height: int,
249
272
  scale: float,
273
+ min_y_ratio: float = 0.4,
250
274
  ) -> Iterable[WetCandidate]:
251
275
  for line in lines:
252
276
  normalized = line.text.lower()
@@ -255,7 +279,7 @@ def _build_candidates(
255
279
  if len(normalized) > 80:
256
280
  # Ignore long paragraph-like OCR lines
257
281
  continue
258
- if (line.bottom / pix_height) < 0.4:
282
+ if (line.bottom / pix_height) < min_y_ratio:
259
283
  # Ignore lines in the upper section of the page
260
284
  continue
261
285
  role = _infer_role(normalized)
@@ -338,28 +362,33 @@ def _expand_bbox(
338
362
  ) -> tuple[float, float, float, float]:
339
363
  x0 = line.left / scale
340
364
  x1 = line.right / scale
341
- y1 = (pix_height - line.top) / scale
365
+ y_top = (pix_height - line.top) / scale
366
+ y_bottom = (pix_height - line.bottom) / scale
342
367
 
343
368
  pad_x = max(14.0, (x1 - x0) * 0.25)
344
369
  left = max(page_rect.x0, x0 - pad_x)
345
370
  right = min(page_rect.x1, x1 + pad_x)
346
371
 
347
372
  gap = 14.0
348
- signature_height = 70.0
349
- top = min(page_rect.y1, y1 + gap)
350
- bottom = min(page_rect.y1, top + signature_height)
351
-
352
- if bottom <= top:
353
- bottom = min(page_rect.y1, top + signature_height)
373
+ line_height = max(1.0, (line.bottom - line.top) / scale)
374
+ signature_height = max(70.0, line_height * 6.0)
375
+ upper = min(page_rect.y1, y_bottom - gap)
376
+ upper = max(page_rect.y0, upper)
377
+ lower = max(page_rect.y0, upper - signature_height)
354
378
 
355
379
  if stroke_y is not None:
356
- # Anchor to the detected stroke under the OCR label when available.
380
+ # Anchor to the detected stroke (signature line) beneath the label.
357
381
  sy = (pix_height - stroke_y) / scale
358
- if sy < top:
359
- top = sy
360
- bottom = max(bottom, sy + signature_height)
382
+ field_lower = min(page_rect.y1, max(page_rect.y0, sy + 2.0))
383
+ field_upper = min(page_rect.y1, y_bottom - gap)
384
+ if field_upper > field_lower + 6.0:
385
+ lower = field_lower
386
+ upper = field_upper
387
+ else:
388
+ upper = min(page_rect.y1, field_lower + signature_height)
389
+ lower = max(page_rect.y0, upper - signature_height)
361
390
 
362
- return (float(left), float(top), float(right), float(bottom))
391
+ return (float(left), float(lower), float(right), float(upper))
363
392
 
364
393
 
365
394
  def _stroke_under_line(image: Image.Image, line: OcrLine) -> tuple[bool, float | None]:
@@ -513,14 +542,19 @@ def _signature_rank(signature: Signature) -> tuple[int, int, int]:
513
542
 
514
543
def _dedupe_wet_signatures(signatures: Sequence[Signature]) -> list[Signature]:
    """Keep the top-ranked signature per role, falling back to the best unknown-role one.

    Roles are compared after stripping whitespace and lower-casing; a missing
    role counts as ``"unknown"``. When at least one signature has a known role,
    the per-role winners are returned ordered by page and then role, and
    unknown-role signatures are dropped. Otherwise the highest-ranked
    unknown-role signature is returned alone, or an empty list if there is none.
    """
    winners: dict[str, Signature] = {}
    fallback: Signature | None = None

    for candidate in signatures:
        role_key = (candidate.Role or "unknown").strip().lower()
        if role_key != "unknown":
            incumbent = winners.get(role_key)
            if incumbent is None or _signature_rank(candidate) > _signature_rank(incumbent):
                winners[role_key] = candidate
        elif fallback is None or _signature_rank(candidate) > _signature_rank(fallback):
            fallback = candidate

    if not winners:
        return [] if fallback is None else [fallback]

    def _page_then_role(sig):
        return (int(sig.Page or 0), sig.Role or "")

    ordered = list(winners.values())
    ordered.sort(key=_page_then_role)
    return ordered
524
558
 
525
559
 
526
560
  def _mark_manual_review(file_result: FileResult, reason: str) -> None:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sigdetect
3
- Version: 0.5.1
3
+ Version: 0.5.2
4
4
  Summary: Signature detection and role attribution for PDFs
5
5
  Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
6
6
  License: MIT
@@ -2,11 +2,11 @@ sigdetect/__init__.py,sha256=YvnTwlC1jfq83EhQS_1JjiiHK7_wJCCU1JvHv5E1qWY,573
2
2
  sigdetect/api.py,sha256=hDfa6z4SoHth1Dw9HDfSPiytMQrqu_oyBZlXBwSh9g4,11010
3
3
  sigdetect/cli.py,sha256=X5GqZ-PK67vz4OHN5r7h-V0hO886ZblUiUdKDuFowtU,10930
4
4
  sigdetect/config.py,sha256=3SP1rkcWBGXloCDFomBJRMRKZOvXuHQbhIBqpVrzYmY,8365
5
- sigdetect/cropping.py,sha256=HfOJrV2Xv9Eo0lCIl3mukz49agKB6h2TML99B0qQJNc,8837
5
+ sigdetect/cropping.py,sha256=IyiBfIEHBLvOv8t_d-O51BfpljTFpE-dG_RxDxJAzAo,16339
6
6
  sigdetect/eda.py,sha256=S92G1Gjmepri__D0n_V6foq0lQgH-RXI9anW8A58jfw,4681
7
7
  sigdetect/logging_setup.py,sha256=LMF8ao_a-JwH0S522T6aYTFX3e8Ajjv_5ODS2YiBcHA,6404
8
8
  sigdetect/utils.py,sha256=T9rubLf5T9JmjOHYMOba1j34fhOJaWocAXccnGTxRUE,5198
9
- sigdetect/wet_detection.py,sha256=zvi11XUmm_xLZ4BLvxInwMQg8YLcyQzEYAM9QSdJOIs,18259
9
+ sigdetect/wet_detection.py,sha256=ofKijykm4fKrvFaVkEkPPKL9iKeRNvlAiKkD2vHxD8k,20025
10
10
  sigdetect/data/role_rules.retainer.yml,sha256=IFdwKnDBXR2cTkdfrsZ6ku6CXD8S_dg5A3vKRKLW5h8,2532
11
11
  sigdetect/data/role_rules.yml,sha256=HuLKsZR_A6sD9XvY4NHiY_VG3dS5ERNCBF9-Mxawomw,2751
12
12
  sigdetect/data/vendor_patterns.yml,sha256=NRbZNQxcx_GuL6n1jAphBn6MM6ChCpeWGCsjbRx-PEo,384
@@ -17,8 +17,8 @@ sigdetect/detector/file_result_model.py,sha256=j2gTc9Sw3fJOHlexYsR_m5DiwHA8DzIzA
17
17
  sigdetect/detector/pymupdf_engine.py,sha256=N6oxvUa-48VvvhjbMk0R0kfScsggNKS7u5FLSeBRfWw,17358
18
18
  sigdetect/detector/pypdf2_engine.py,sha256=kB8cIp_gMvCla0LIBi9sd19g0361Oc9TjCW_ZViUBJQ,47410
19
19
  sigdetect/detector/signature_model.py,sha256=T2Hmfkfz_hZsDzwOhepxfNmkedxQp3_XHdrP8yGKoCk,1322
20
- sigdetect-0.5.1.dist-info/METADATA,sha256=_Jnyl9_A1yZUrKwWxUxVB-9rcMG3MdUqiN5WX_zlpqQ,14131
21
- sigdetect-0.5.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
22
- sigdetect-0.5.1.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
23
- sigdetect-0.5.1.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
24
- sigdetect-0.5.1.dist-info/RECORD,,
20
+ sigdetect-0.5.2.dist-info/METADATA,sha256=jLin7USVPqeA5tS7KCuPRRt1PLwdt-oJWhWuKSQa6hE,14131
21
+ sigdetect-0.5.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
22
+ sigdetect-0.5.2.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
23
+ sigdetect-0.5.2.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
24
+ sigdetect-0.5.2.dist-info/RECORD,,