sigdetect 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,7 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  from pathlib import Path
6
+ import re
6
7
  from typing import Iterable, cast
7
8
 
8
9
  from .pypdf2_engine import PyPDF2Detector
@@ -26,6 +27,23 @@ class PyMuPDFDetector(PyPDF2Detector):
26
27
  "representative": ("representative", "guardian", "parent"),
27
28
  "attorney": ("attorney", "counsel", "lawyer"),
28
29
  }
30
# Label patterns used to attribute a retainer signature image to a role.
# Each pattern is searched against the lower-cased text of a single page line;
# a matching line becomes a "client" or "firm" anchor for nearby images.
RETAINER_CLIENT_PATTERNS = (
    re.compile(r"\bclient\b", re.IGNORECASE),
    re.compile(r"\bpatient\b", re.IGNORECASE),
    re.compile(r"\bagreed\b", re.IGNORECASE),
    re.compile(r"\baccepted\b", re.IGNORECASE),
    # No trailing \b here: ":" is a non-word character, so r"\bby:\b" only
    # matched when a word character directly followed the colon and therefore
    # missed the common "By: ______" and end-of-line "By:" labels.
    re.compile(r"\bby:", re.IGNORECASE),
    re.compile(r"\bon this date\b", re.IGNORECASE),
    re.compile(r"\bname\b", re.IGNORECASE),
)
RETAINER_FIRM_PATTERNS = (
    re.compile(r"\battorney\b", re.IGNORECASE),
    re.compile(r"\bauthorized\b", re.IGNORECASE),
    re.compile(r"\brepresentative\b", re.IGNORECASE),
    re.compile(r"\bfirm\b", re.IGNORECASE),
    re.compile(r"\bcounsel\b", re.IGNORECASE),
    re.compile(r"\blaw\b", re.IGNORECASE),
)
29
47
 
30
48
  def __init__(self, configuration):
31
49
  if fitz is None: # pragma: no cover - optional dependency
@@ -47,6 +65,7 @@ class PyMuPDFDetector(PyPDF2Detector):
47
65
  widget_map = self._CollectWidgetRects(document)
48
66
  self._ApplyWidgetRects(result.Signatures, widget_map)
49
67
  self._InferPseudoRects(result.Signatures, document)
68
+ self._ExpandRetainerImageSignatures(result, document)
50
69
  return result
51
70
 
52
71
  # ───────────────────────────────── widget helpers ─────────────────────────────────
@@ -116,6 +135,233 @@ class PyMuPDFDetector(PyPDF2Detector):
116
135
  signature.Page = page_index + 1
117
136
  break
118
137
 
138
+ # ───────────────────────── retainer image expansion ─────────────────────────
139
def _ExpandRetainerImageSignatures(self, result, document) -> None:
    """Reconcile signature-like page images with the detected signatures.

    Does nothing unless the profile is ``"retainer"``, pseudo signatures are
    enabled in the configuration, and ``result`` already holds at least one
    signature.

    For every page that has candidate image rectangles and whose text lines
    contain a retainer signature cue, each image rectangle is either

    * merged into the existing signature on that page it overlaps best
      (overlap ratio of at least 0.6): the signature takes over the image's
      bounding box, gets render type ``"drawn"`` if it had none, and may
      take over the role inferred for the image; or
    * appended to ``result.Signatures`` as a new pseudo signature, with the
      role defaulting to ``"firm"`` when none could be inferred.

    When at least one signature was appended, placeholder signatures located
    on pages without such images (field name ``"vendor_or_acro_detected"``
    or no usable bounding box) are dropped and the summary fields of
    ``result`` are recomputed. When only the role of an existing signature
    changed, ``result.Roles`` alone is recomputed so it does not go stale.

    Args:
        result: File-level result; mutated in place.
        document: Object exposing ``page_count`` and ``load_page(index)``
            (presumably an open ``fitz.Document`` -- confirm with caller).
    """
    if self.Profile != "retainer":
        return
    if not self.Configuration.PseudoSignatures:
        return
    if not result.Signatures:
        return

    signatures_by_page: dict[int, list[Signature]] = {}
    for sig in result.Signatures:
        if sig.Page:
            signatures_by_page.setdefault(sig.Page, []).append(sig)

    added = False
    roles_changed = False
    image_pages: set[int] = set()
    for page_index in range(document.page_count):
        page_number = page_index + 1
        page = document.load_page(page_index)
        image_rects = self._FilterSignatureImageRects(
            self._CollectImageRects(page), page.rect
        )
        if not image_rects:
            continue

        lines = self._ExtractLines(page)
        if not self._HasRetainerSignatureCue(lines):
            continue
        image_pages.add(page_number)
        anchors = self._CollectRetainerRoleAnchors(lines)
        existing_on_page = signatures_by_page.get(page_number, [])
        roles_for_rects = self._AssignRolesToImages(
            image_rects,
            anchors,
            existing_on_page,
        )

        # Snapshot the boxes of the signatures known before this page is
        # processed; signatures appended below are not matched against.
        existing_rects: list[tuple[Signature, fitz.Rect]] = [
            (sig, self._PdfTupleToRect(sig.BoundingBox, page.rect.height))
            for sig in existing_on_page
            if sig.BoundingBox
        ]

        for rect, role in zip(image_rects, roles_for_rects):
            matched_sig = None
            best_overlap = 0.0
            for sig, sig_rect in existing_rects:
                overlap = self._OverlapRatio(rect, sig_rect)
                if overlap > best_overlap:
                    best_overlap = overlap
                    matched_sig = sig
            if matched_sig and best_overlap >= 0.6:
                matched_sig.BoundingBox = self._RectToPdfTuple(rect, page.rect.height)
                if not matched_sig.RenderType:
                    matched_sig.RenderType = "drawn"
                # Only overwrite the role of signatures that have none or
                # that are generic vendor/AcroForm placeholders.
                if role != "unknown" and (
                    matched_sig.Role == "unknown"
                    or matched_sig.FieldName == "vendor_or_acro_detected"
                    or matched_sig.Hint == "VendorOrAcroOnly"
                ):
                    if matched_sig.Role != role:
                        roles_changed = True
                    matched_sig.Role = role
                    matched_sig.Score = max(int(matched_sig.Score or 0), 1)
                    matched_sig.Scores = {role: matched_sig.Score}
                matched_sig.Evidence = (matched_sig.Evidence or []) + ["image:retainer"]
                continue

            if role == "unknown":
                role = "firm"
            new_sig = Signature(
                Page=page_number,
                FieldName="vendor_or_acro_detected",
                Role=role,
                Score=1,
                Scores={role: 1},
                Evidence=["image:retainer", "pseudo:true"],
                Hint="VendorOrAcroOnly",
                RenderType="drawn",
                BoundingBox=self._RectToPdfTuple(rect, page.rect.height),
            )
            result.Signatures.append(new_sig)
            signatures_by_page.setdefault(page_number, []).append(new_sig)
            added = True

    if added:
        if image_pages:
            filtered: list[Signature] = []
            for sig in result.Signatures:
                if sig.Page and sig.Page not in image_pages:
                    if sig.FieldName == "vendor_or_acro_detected" or not self._HasBBox(
                        sig.BoundingBox
                    ):
                        continue
                filtered.append(sig)
            result.Signatures = filtered
        pages = sorted({sig.Page for sig in result.Signatures if sig.Page})
        result.SignatureCount = len(result.Signatures)
        result.SignaturePages = ",".join(str(p) for p in pages)
        result.ElectronicSignatureFound = result.SignatureCount > 0
        hints = {h for h in (result.Hints or "").split(";") if h}
        hints |= {sig.Hint for sig in result.Signatures if sig.Hint}
        result.Hints = ";".join(sorted(hints)) if hints else result.Hints

    if added or roles_changed:
        # A merge can change a signature's role without adding anything, so
        # the role summary is refreshed in that case as well.
        roles = sorted(
            {sig.Role for sig in result.Signatures if sig.Role and sig.Role != "unknown"}
        )
        result.Roles = ";".join(roles) if roles else result.Roles
241
+
242
def _CollectImageRects(self, page) -> list[fitz.Rect]:
    """Return the rectangles of every image listed for ``page``.

    Best effort: if listing the page's images fails, an empty list is
    returned; if resolving the rectangles of one image fails, only that
    image is skipped.
    """
    try:
        image_entries = page.get_images(full=True)
    except Exception:
        return []

    collected: list[fitz.Rect] = []
    for entry in image_entries:
        # The first item of each entry is passed to get_image_rects as the
        # image's xref.
        xref = entry[0]
        try:
            collected += page.get_image_rects(xref)
        except Exception:
            continue
    return collected
255
+
256
def _FilterSignatureImageRects(self, rects: list[fitz.Rect], page_rect) -> list[fitz.Rect]:
    """Keep only the image rectangles that could plausibly hold a signature.

    A rectangle is rejected when its area is not positive, when it covers
    more than 35% of the page area, or when it is not clearly wider than
    tall (width / height below 1.3, with both clamped to at least 1.0).
    Every surviving rectangle is returned as a fresh ``fitz.Rect`` copy.
    """
    kept: list[fitz.Rect] = []
    if not rects:
        return kept

    area_cap = page_rect.width * page_rect.height * 0.35
    for candidate in rects:
        box = fitz.Rect(candidate)
        box_area = box.get_area()
        if box_area <= 0 or box_area > area_cap:
            continue
        # Clamp both sides so degenerate boxes cannot divide by zero.
        aspect_ratio = max(1.0, float(box.width)) / max(1.0, float(box.height))
        if aspect_ratio >= 1.3:
            kept.append(box)
    return kept
275
+
276
def _CollectRetainerRoleAnchors(
    self, lines: list[dict[str, float | str]]
) -> dict[str, list[fitz.Rect]]:
    """Group the rectangles of role-label lines by role.

    Each line's ``lower_text`` is searched with the client and the firm
    patterns; the line's rectangle (built from its ``x0``/``y0``/``x1``/``y1``
    entries) is recorded under every role whose patterns match, so one line
    may serve as an anchor for both roles.
    """
    client_boxes: list[fitz.Rect] = []
    firm_boxes: list[fitz.Rect] = []
    for entry in lines:
        text = str(entry["lower_text"])
        box = fitz.Rect(entry["x0"], entry["y0"], entry["x1"], entry["y1"])
        for bucket, patterns in (
            (client_boxes, self.RETAINER_CLIENT_PATTERNS),
            (firm_boxes, self.RETAINER_FIRM_PATTERNS),
        ):
            if any(pattern.search(text) for pattern in patterns):
                bucket.append(box)
    return {"client": client_boxes, "firm": firm_boxes}
288
+
289
def _HasRetainerSignatureCue(self, lines: list[dict[str, float | str]]) -> bool:
    """Tell whether any line's ``lower_text`` contains a signature cue.

    Cues are plain substrings (no word-boundary check), so ``"signed"``
    also matches inside longer words.
    """
    cue_words = ("signature", "signed", "agreed", "accepted", "authorized", "attorney", "by:")
    texts = (str(entry["lower_text"]) for entry in lines)
    return any(word in text for text in texts for word in cue_words)
296
+
297
def _AssignRolesToImages(
    self,
    rects: list[fitz.Rect],
    anchors: dict[str, list[fitz.Rect]],
    existing_signatures: list[Signature],
) -> list[str]:
    """Pick a role for each image rectangle, in the order of ``rects``.

    An image takes the role whose anchors score lower in ``_RoleDistance``
    (ties go to ``"client"``); with anchors for only one role it takes that
    role, and with no anchors at all it starts as ``"unknown"``.

    Two fix-ups follow. If neither the existing signatures nor the images
    provide a client, the image with the greatest ``y1`` is forced to
    ``"client"``. Then, if no image is a firm, every remaining
    ``"unknown"`` becomes ``"firm"``.
    """
    assigned: list[str] = []
    for box in rects:
        client_gap = self._RoleDistance(box, anchors.get("client", []))
        firm_gap = self._RoleDistance(box, anchors.get("firm", []))
        if client_gap is None:
            label = "unknown" if firm_gap is None else "firm"
        elif firm_gap is None:
            label = "client"
        else:
            label = "client" if client_gap <= firm_gap else "firm"
        assigned.append(label)

    has_client = any(sig.Role == "client" for sig in existing_signatures)
    if rects and not has_client and "client" not in assigned:
        # max() keeps the first index on ties, matching a left-to-right scan.
        bottom_index = max(range(len(rects)), key=lambda i: rects[i].y1)
        assigned[bottom_index] = "client"

    if "firm" not in assigned:
        assigned = ["firm" if label == "unknown" else label for label in assigned]
    return assigned
325
+
326
def _RoleDistance(self, rect: fitz.Rect, anchors: list[fitz.Rect]) -> float | None:
    """Return the smallest weighted gap between ``rect`` and any anchor.

    The gap to one anchor is twice the vertical separation plus the
    horizontal separation; each separation is zero when the two boxes
    overlap on that axis. Returns ``None`` when ``anchors`` is empty.
    """
    if not anchors:
        return None

    def weighted_gap(anchor) -> float:
        vertical = max(0.0, anchor.y0 - rect.y1, rect.y0 - anchor.y1)
        horizontal = max(0.0, anchor.x0 - rect.x1, rect.x0 - anchor.x1)
        # Vertical separation counts double.
        return vertical * 2.0 + horizontal

    return min(map(weighted_gap, anchors))
337
+
338
def _OverlapRatio(self, rect: fitz.Rect, other: fitz.Rect) -> float:
    """Return the intersection area relative to the smaller rectangle.

    The denominator is the smaller of the two areas, clamped to at least
    1.0. Returns 0.0 when the intersection cannot be computed, is ``None``,
    or has no positive area.
    """
    try:
        shared = rect & other
    except Exception:
        return 0.0
    if shared is None:
        return 0.0

    shared_area = shared.get_area()
    if shared_area <= 0:
        return 0.0
    smaller_area = min(rect.get_area(), other.get_area())
    return shared_area / max(1.0, smaller_area)
350
+
351
def _PdfTupleToRect(self, bbox: tuple[float, float, float, float], page_height: float) -> fitz.Rect:
    """Build a ``fitz.Rect`` from ``bbox`` with its vertical axis flipped.

    The x values are kept; each y value is replaced by ``page_height - y``
    and the two are swapped so the rectangle stays top-before-bottom.
    (Presumably converts bottom-left-origin PDF coordinates to PyMuPDF's
    top-left-origin page space -- confirm against ``_RectToPdfTuple``.)
    """
    left, y_low, right, y_high = bbox
    return fitz.Rect(left, page_height - y_high, right, page_height - y_low)
356
+
357
def _HasBBox(self, bbox: tuple[float, float, float, float] | None) -> bool:
    """Tell whether ``bbox`` is a four-value box with positive width and height.

    Missing or empty boxes, boxes that do not have exactly four values, and
    the all-zero box are reported as absent.
    """
    if not bbox or len(bbox) != 4:
        return False
    if all(coordinate == 0 for coordinate in bbox):
        return False
    left, low, right, high = bbox
    return (right - left) > 0 and (high - low) > 0
364
+
119
365
  def _FindRoleLineRect(
120
366
  self,
121
367
  page,
@@ -94,6 +94,17 @@ def apply_wet_detection(
94
94
  original_mixed = file_result.MixedContent
95
95
  try:
96
96
  added = _detect(pdf_path, configuration, file_result, logger=logger)
97
+ if added and configuration.Profile == "hipaa":
98
+ updated = False
99
+ for signature in file_result.Signatures:
100
+ if signature.RenderType == "wet" and (signature.Role or "unknown") == "unknown":
101
+ signature.Role = "patient"
102
+ signature.Scores = {"patient": int(signature.Score or 0)}
103
+ signature.Evidence = list(signature.Evidence or [])
104
+ signature.Evidence.append("role_default:patient")
105
+ updated = True
106
+ if updated:
107
+ _refresh_metadata(file_result)
97
108
  if not added:
98
109
  _mark_manual_review(file_result, "NoHighConfidenceWetSignature")
99
110
  return added
@@ -136,6 +147,18 @@ def _detect(
136
147
  scale=configuration.WetOcrDpi / 72.0,
137
148
  )
138
149
  )
150
+ if not candidates:
151
+ candidates = list(
152
+ _build_candidates(
153
+ ocr_lines,
154
+ image=image,
155
+ page_rect=page.rect,
156
+ pix_width=pixmap.width,
157
+ pix_height=pixmap.height,
158
+ scale=configuration.WetOcrDpi / 72.0,
159
+ min_y_ratio=0.2,
160
+ )
161
+ )
139
162
  candidates.extend(_image_candidates(page))
140
163
  candidates = _filter_candidates_for_page(candidates)
141
164
  accepted = [
@@ -247,6 +270,7 @@ def _build_candidates(
247
270
  pix_width: int,
248
271
  pix_height: int,
249
272
  scale: float,
273
+ min_y_ratio: float = 0.4,
250
274
  ) -> Iterable[WetCandidate]:
251
275
  for line in lines:
252
276
  normalized = line.text.lower()
@@ -255,7 +279,7 @@ def _build_candidates(
255
279
  if len(normalized) > 80:
256
280
  # Ignore long paragraph-like OCR lines
257
281
  continue
258
- if (line.bottom / pix_height) < 0.4:
282
+ if (line.bottom / pix_height) < min_y_ratio:
259
283
  # Ignore lines in the upper section of the page
260
284
  continue
261
285
  role = _infer_role(normalized)
@@ -338,28 +362,33 @@ def _expand_bbox(
338
362
  ) -> tuple[float, float, float, float]:
339
363
  x0 = line.left / scale
340
364
  x1 = line.right / scale
341
- y1 = (pix_height - line.top) / scale
365
+ y_top = (pix_height - line.top) / scale
366
+ y_bottom = (pix_height - line.bottom) / scale
342
367
 
343
368
  pad_x = max(14.0, (x1 - x0) * 0.25)
344
369
  left = max(page_rect.x0, x0 - pad_x)
345
370
  right = min(page_rect.x1, x1 + pad_x)
346
371
 
347
372
  gap = 14.0
348
- signature_height = 70.0
349
- top = min(page_rect.y1, y1 + gap)
350
- bottom = min(page_rect.y1, top + signature_height)
351
-
352
- if bottom <= top:
353
- bottom = min(page_rect.y1, top + signature_height)
373
+ line_height = max(1.0, (line.bottom - line.top) / scale)
374
+ signature_height = max(70.0, line_height * 6.0)
375
+ upper = min(page_rect.y1, y_bottom - gap)
376
+ upper = max(page_rect.y0, upper)
377
+ lower = max(page_rect.y0, upper - signature_height)
354
378
 
355
379
  if stroke_y is not None:
356
- # Anchor to the detected stroke under the OCR label when available.
380
+ # Anchor to the detected stroke (signature line) beneath the label.
357
381
  sy = (pix_height - stroke_y) / scale
358
- if sy < top:
359
- top = sy
360
- bottom = max(bottom, sy + signature_height)
382
+ field_lower = min(page_rect.y1, max(page_rect.y0, sy + 2.0))
383
+ field_upper = min(page_rect.y1, y_bottom - gap)
384
+ if field_upper > field_lower + 6.0:
385
+ lower = field_lower
386
+ upper = field_upper
387
+ else:
388
+ upper = min(page_rect.y1, field_lower + signature_height)
389
+ lower = max(page_rect.y0, upper - signature_height)
361
390
 
362
- return (float(left), float(top), float(right), float(bottom))
391
+ return (float(left), float(lower), float(right), float(upper))
363
392
 
364
393
 
365
394
  def _stroke_under_line(image: Image.Image, line: OcrLine) -> tuple[bool, float | None]:
@@ -513,14 +542,19 @@ def _signature_rank(signature: Signature) -> tuple[int, int, int]:
513
542
 
514
543
  def _dedupe_wet_signatures(signatures: Sequence[Signature]) -> list[Signature]:
515
544
  best_by_role: dict[str, Signature] = {}
545
+ best_unknown: Signature | None = None
516
546
  for signature in signatures:
517
547
  role = (signature.Role or "unknown").strip().lower()
518
548
  if role == "unknown":
549
+ if best_unknown is None or _signature_rank(signature) > _signature_rank(best_unknown):
550
+ best_unknown = signature
519
551
  continue
520
552
  existing = best_by_role.get(role)
521
553
  if existing is None or _signature_rank(signature) > _signature_rank(existing):
522
554
  best_by_role[role] = signature
523
- return sorted(best_by_role.values(), key=lambda sig: (int(sig.Page or 0), sig.Role or ""))
555
+ if best_by_role:
556
+ return sorted(best_by_role.values(), key=lambda sig: (int(sig.Page or 0), sig.Role or ""))
557
+ return [best_unknown] if best_unknown is not None else []
524
558
 
525
559
 
526
560
  def _mark_manual_review(file_result: FileResult, reason: str) -> None:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sigdetect
3
- Version: 0.5.1
3
+ Version: 0.5.3
4
4
  Summary: Signature detection and role attribution for PDFs
5
5
  Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
6
6
  License: MIT
@@ -2,11 +2,11 @@ sigdetect/__init__.py,sha256=YvnTwlC1jfq83EhQS_1JjiiHK7_wJCCU1JvHv5E1qWY,573
2
2
  sigdetect/api.py,sha256=hDfa6z4SoHth1Dw9HDfSPiytMQrqu_oyBZlXBwSh9g4,11010
3
3
  sigdetect/cli.py,sha256=X5GqZ-PK67vz4OHN5r7h-V0hO886ZblUiUdKDuFowtU,10930
4
4
  sigdetect/config.py,sha256=3SP1rkcWBGXloCDFomBJRMRKZOvXuHQbhIBqpVrzYmY,8365
5
- sigdetect/cropping.py,sha256=HfOJrV2Xv9Eo0lCIl3mukz49agKB6h2TML99B0qQJNc,8837
5
+ sigdetect/cropping.py,sha256=6O7xuEU0hOlv0Wfb4kr2DJS-JPEw_kDNx4mLeYPuXl8,86869
6
6
  sigdetect/eda.py,sha256=S92G1Gjmepri__D0n_V6foq0lQgH-RXI9anW8A58jfw,4681
7
7
  sigdetect/logging_setup.py,sha256=LMF8ao_a-JwH0S522T6aYTFX3e8Ajjv_5ODS2YiBcHA,6404
8
8
  sigdetect/utils.py,sha256=T9rubLf5T9JmjOHYMOba1j34fhOJaWocAXccnGTxRUE,5198
9
- sigdetect/wet_detection.py,sha256=zvi11XUmm_xLZ4BLvxInwMQg8YLcyQzEYAM9QSdJOIs,18259
9
+ sigdetect/wet_detection.py,sha256=ofKijykm4fKrvFaVkEkPPKL9iKeRNvlAiKkD2vHxD8k,20025
10
10
  sigdetect/data/role_rules.retainer.yml,sha256=IFdwKnDBXR2cTkdfrsZ6ku6CXD8S_dg5A3vKRKLW5h8,2532
11
11
  sigdetect/data/role_rules.yml,sha256=HuLKsZR_A6sD9XvY4NHiY_VG3dS5ERNCBF9-Mxawomw,2751
12
12
  sigdetect/data/vendor_patterns.yml,sha256=NRbZNQxcx_GuL6n1jAphBn6MM6ChCpeWGCsjbRx-PEo,384
@@ -14,11 +14,11 @@ sigdetect/detector/__init__.py,sha256=nT52mCI9s03Rso_RS86mm223rJfl5GlGDFsXwMJ3z3
14
14
  sigdetect/detector/base.py,sha256=L-iXWXqsTetDc4jRZo_wOdbNpKqOY20mX9FefrugdT0,263
15
15
  sigdetect/detector/base_detector.py,sha256=GmAgUWO_fQgIfnihZSoyhR3wpnwZ-X3hS0Kuyz4G6Ys,608
16
16
  sigdetect/detector/file_result_model.py,sha256=j2gTc9Sw3fJOHlexYsR_m5DiwHA8DzIzAMToESfvo4A,1767
17
- sigdetect/detector/pymupdf_engine.py,sha256=N6oxvUa-48VvvhjbMk0R0kfScsggNKS7u5FLSeBRfWw,17358
17
+ sigdetect/detector/pymupdf_engine.py,sha256=ZcjMrCR6qxa4pvlvOf88OGWPQsCXnPmNN7yLyEv23Cc,27840
18
18
  sigdetect/detector/pypdf2_engine.py,sha256=kB8cIp_gMvCla0LIBi9sd19g0361Oc9TjCW_ZViUBJQ,47410
19
19
  sigdetect/detector/signature_model.py,sha256=T2Hmfkfz_hZsDzwOhepxfNmkedxQp3_XHdrP8yGKoCk,1322
20
- sigdetect-0.5.1.dist-info/METADATA,sha256=_Jnyl9_A1yZUrKwWxUxVB-9rcMG3MdUqiN5WX_zlpqQ,14131
21
- sigdetect-0.5.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
22
- sigdetect-0.5.1.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
23
- sigdetect-0.5.1.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
24
- sigdetect-0.5.1.dist-info/RECORD,,
20
+ sigdetect-0.5.3.dist-info/METADATA,sha256=lm6dyZlv6tS2L61G5u94D_vbPXQ3RHJPDRS5LlDDpc0,14131
21
+ sigdetect-0.5.3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
22
+ sigdetect-0.5.3.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
23
+ sigdetect-0.5.3.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
24
+ sigdetect-0.5.3.dist-info/RECORD,,