sigdetect 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,7 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  from pathlib import Path
6
+ import re
6
7
  from typing import Iterable, cast
7
8
 
8
9
  from .pypdf2_engine import PyPDF2Detector
@@ -26,6 +27,23 @@ class PyMuPDFDetector(PyPDF2Detector):
26
27
  "representative": ("representative", "guardian", "parent"),
27
28
  "attorney": ("attorney", "counsel", "lawyer"),
28
29
  }
30
# Case-insensitive cues searched in a text line to treat it as a "client"
# anchor when roles are assigned to retainer signature images.
RETAINER_CLIENT_PATTERNS = (
    re.compile(r"\bclient\b", re.IGNORECASE),
    re.compile(r"\bpatient\b", re.IGNORECASE),
    re.compile(r"\bagreed\b", re.IGNORECASE),
    re.compile(r"\baccepted\b", re.IGNORECASE),
    # No trailing \b here: ":" is a non-word character, so a trailing word
    # boundary only matches when a word character follows immediately
    # ("by:John") and misses the usual "By: John" and bare "By:" forms.
    re.compile(r"\bby:", re.IGNORECASE),
    re.compile(r"\bon this date\b", re.IGNORECASE),
    re.compile(r"\bname\b", re.IGNORECASE),
)
# Case-insensitive cues searched in a text line to treat it as a "firm" anchor.
RETAINER_FIRM_PATTERNS = (
    re.compile(r"\battorney\b", re.IGNORECASE),
    re.compile(r"\bauthorized\b", re.IGNORECASE),
    re.compile(r"\brepresentative\b", re.IGNORECASE),
    re.compile(r"\bfirm\b", re.IGNORECASE),
    re.compile(r"\bcounsel\b", re.IGNORECASE),
    re.compile(r"\blaw\b", re.IGNORECASE),
)
29
47
 
30
48
  def __init__(self, configuration):
31
49
  if fitz is None: # pragma: no cover - optional dependency
@@ -47,6 +65,7 @@ class PyMuPDFDetector(PyPDF2Detector):
47
65
  widget_map = self._CollectWidgetRects(document)
48
66
  self._ApplyWidgetRects(result.Signatures, widget_map)
49
67
  self._InferPseudoRects(result.Signatures, document)
68
+ self._ExpandRetainerImageSignatures(result, document)
50
69
  return result
51
70
 
52
71
  # ───────────────────────────────── widget helpers ─────────────────────────────────
@@ -116,6 +135,233 @@ class PyMuPDFDetector(PyPDF2Detector):
116
135
  signature.Page = page_index + 1
117
136
  break
118
137
 
138
+ # ───────────────────────── retainer image expansion ─────────────────────────
139
def _ExpandRetainerImageSignatures(self, result, document) -> None:
    """Reconcile signature-shaped page images with ``result`` (retainer profile).

    Does nothing unless ``self.Profile`` is ``"retainer"``,
    ``self.Configuration.PseudoSignatures`` is truthy, and ``result`` already
    holds at least one signature.

    For every page that has both a candidate image (see
    ``_FilterSignatureImageRects``) and a textual cue (see
    ``_HasRetainerSignatureCue``), each image is either:

    * merged into the existing signature on that page it overlaps best
      (overlap ratio of at least 0.6): the signature takes over the image's
      box, and an unknown/placeholder role is replaced by the inferred one; or
    * appended to ``result.Signatures`` as a new pseudo signature, using
      ``"firm"`` as the role when none could be inferred.

    Only when at least one signature was appended are the aggregate fields
    (count, pages, roles, hints, found flag) recomputed, after dropping
    placeholder or box-less signatures that sit on pages outside the set of
    cue pages.

    Args:
        result: Detection result; mutated in place.
        document: Open document exposing ``page_count`` and ``load_page``.
    """
    enabled = self.Profile == "retainer" and self.Configuration.PseudoSignatures
    if not enabled or not result.Signatures:
        return

    # Index the signatures that already carry a page number.
    by_page: dict[int, list[Signature]] = {}
    for known in result.Signatures:
        if known.Page:
            by_page.setdefault(known.Page, []).append(known)

    appended_any = False
    cue_pages: set[int] = set()
    for page_number in range(1, document.page_count + 1):
        page = document.load_page(page_number - 1)
        candidates = self._FilterSignatureImageRects(
            self._CollectImageRects(page), page.rect
        )
        if not candidates:
            continue

        text_lines = self._ExtractLines(page)
        if not self._HasRetainerSignatureCue(text_lines):
            continue
        cue_pages.add(page_number)

        page_height = page.rect.height
        on_page = by_page.get(page_number, [])
        role_anchors = self._CollectRetainerRoleAnchors(text_lines)
        inferred_roles = self._AssignRolesToImages(candidates, role_anchors, on_page)

        # Snapshot of the boxed signatures present before this page's images
        # are processed; signatures appended below are not part of it.
        located: list[tuple[Signature, fitz.Rect]] = [
            (known, self._PdfTupleToRect(known.BoundingBox, page_height))
            for known in on_page
            if known.BoundingBox
        ]

        for image_rect, role in zip(candidates, inferred_roles):
            # First signature with the highest overlap ratio, if any.
            best_ratio, best_sig = max(
                (
                    (self._OverlapRatio(image_rect, known_rect), known)
                    for known, known_rect in located
                ),
                key=lambda pair: pair[0],
                default=(0.0, None),
            )

            if best_sig and best_ratio >= 0.6:
                # The image is the visual for an already-known signature.
                best_sig.BoundingBox = self._RectToPdfTuple(image_rect, page_height)
                if not best_sig.RenderType:
                    best_sig.RenderType = "drawn"
                if role != "unknown":
                    if (
                        best_sig.Role == "unknown"
                        or best_sig.FieldName == "vendor_or_acro_detected"
                        or best_sig.Hint == "VendorOrAcroOnly"
                    ):
                        best_sig.Role = role
                        best_sig.Score = max(int(best_sig.Score or 0), 1)
                        best_sig.Scores = {role: best_sig.Score}
                        best_sig.Evidence = (best_sig.Evidence or []) + ["image:retainer"]
                continue

            # No sufficiently overlapping signature: record a new pseudo one.
            final_role = "firm" if role == "unknown" else role
            pseudo = Signature(
                Page=page_number,
                FieldName="vendor_or_acro_detected",
                Role=final_role,
                Score=1,
                Scores={final_role: 1},
                Evidence=["image:retainer", "pseudo:true"],
                Hint="VendorOrAcroOnly",
                RenderType="drawn",
                BoundingBox=self._RectToPdfTuple(image_rect, page_height),
            )
            result.Signatures.append(pseudo)
            by_page.setdefault(page_number, []).append(pseudo)
            appended_any = True

    # NOTE(review): aggregates are refreshed only when something was appended;
    # a role changed by a merge alone is not reflected in result.Roles —
    # confirm this is intended.
    if not appended_any:
        return

    if cue_pages:
        # Drop placeholder or box-less signatures located on other pages.
        result.Signatures = [
            sig
            for sig in result.Signatures
            if not (
                sig.Page
                and sig.Page not in cue_pages
                and (
                    sig.FieldName == "vendor_or_acro_detected"
                    or not self._HasBBox(sig.BoundingBox)
                )
            )
        ]

    page_numbers = sorted({sig.Page for sig in result.Signatures if sig.Page})
    result.SignatureCount = len(result.Signatures)
    result.SignaturePages = ",".join(map(str, page_numbers))
    known_roles = sorted(
        {sig.Role for sig in result.Signatures if sig.Role and sig.Role != "unknown"}
    )
    result.Roles = ";".join(known_roles) if known_roles else result.Roles
    result.ElectronicSignatureFound = result.SignatureCount > 0
    merged_hints = {part for part in (result.Hints or "").split(";") if part}
    merged_hints |= {sig.Hint for sig in result.Signatures if sig.Hint}
    result.Hints = ";".join(sorted(merged_hints)) if merged_hints else result.Hints
241
+
242
def _CollectImageRects(self, page) -> list[fitz.Rect]:
    """Gather the rectangles reported for every image listed on ``page``.

    Best effort: if listing the images raises, an empty list is returned; if
    looking up the rectangles of one image raises, that image is skipped.

    Args:
        page: Object exposing ``get_images(full=True)`` and
            ``get_image_rects(xref)``.

    Returns:
        All rectangles collected, in listing order.
    """
    try:
        image_entries = page.get_images(full=True)
    except Exception:
        return []

    collected: list[fitz.Rect] = []
    for entry in image_entries:
        # The first item of each entry is passed on as the image's xref.
        xref = entry[0]
        try:
            collected.extend(page.get_image_rects(xref))
        except Exception:
            continue
    return collected
255
+
256
def _FilterSignatureImageRects(self, rects: list[fitz.Rect], page_rect) -> list[fitz.Rect]:
    """Keep only the rectangles whose size and shape pass the signature filter.

    A rectangle is kept when its area is positive, does not exceed 35% of the
    page area, and its width/height ratio (each side clamped to at least 1.0)
    is 1.3 or more.

    Args:
        rects: Candidate rectangles; each is copied via ``fitz.Rect``.
        page_rect: Page rectangle exposing ``width`` and ``height``.

    Returns:
        The surviving rectangles (as new ``fitz.Rect`` objects), input order kept.
    """
    if not rects:
        return []

    area_cap = page_rect.width * page_rect.height * 0.35
    kept: list[fitz.Rect] = []
    for candidate in rects:
        box = fitz.Rect(candidate)
        area = box.get_area()
        if area <= 0 or area > area_cap:
            continue
        # Clamp both sides so degenerate boxes cannot divide by zero.
        aspect = max(1.0, float(box.width)) / max(1.0, float(box.height))
        if aspect < 1.3:
            continue
        kept.append(box)
    return kept
275
+
276
def _CollectRetainerRoleAnchors(
    self, lines: list[dict[str, float | str]]
) -> dict[str, list[fitz.Rect]]:
    """Sort text lines into client and firm anchor rectangles.

    A line's rectangle is added under ``"client"`` when any
    ``RETAINER_CLIENT_PATTERNS`` entry matches its ``lower_text`` and under
    ``"firm"`` when any ``RETAINER_FIRM_PATTERNS`` entry does; one line may
    land in both lists.

    Args:
        lines: Dicts with ``lower_text``, ``x0``, ``y0``, ``x1`` and ``y1``.

    Returns:
        ``{"client": [...], "firm": [...]}``; both keys are always present.
    """
    client_boxes: list[fitz.Rect] = []
    firm_boxes: list[fitz.Rect] = []
    for entry in lines:
        text = str(entry["lower_text"])
        box = fitz.Rect(entry["x0"], entry["y0"], entry["x1"], entry["y1"])
        for bucket, patterns in (
            (client_boxes, self.RETAINER_CLIENT_PATTERNS),
            (firm_boxes, self.RETAINER_FIRM_PATTERNS),
        ):
            if any(pattern.search(text) for pattern in patterns):
                bucket.append(box)
    return {"client": client_boxes, "firm": firm_boxes}
288
+
289
def _HasRetainerSignatureCue(self, lines: list[dict[str, float | str]]) -> bool:
    """Report whether any line's ``lower_text`` contains a signature cue.

    The check is a plain, case-sensitive substring test, so it relies on
    ``lower_text`` already being lower-cased (presumably by the line
    extractor; verify against the caller).

    Args:
        lines: Dicts carrying at least a ``lower_text`` entry.

    Returns:
        True as soon as one cue is found in one line, False otherwise.
    """
    cues = ("signature", "signed", "agreed", "accepted", "authorized", "attorney", "by:")
    texts = (str(entry["lower_text"]) for entry in lines)
    return any(cue in text for text in texts for cue in cues)
296
+
297
def _AssignRolesToImages(
    self,
    rects: list[fitz.Rect],
    anchors: dict[str, list[fitz.Rect]],
    existing_signatures: list[Signature],
) -> list[str]:
    """Pick a role (``"client"``, ``"firm"`` or ``"unknown"``) for each image.

    Each rectangle gets the role whose anchors are closest according to
    ``_RoleDistance`` (ties go to ``"client"``); with no anchors at all it
    is ``"unknown"``. Two fallbacks follow:

    * if no existing signature and no image is a client, the image with the
      largest ``y1`` is forced to ``"client"``;
    * if no image ended up as ``"firm"``, every ``"unknown"`` becomes ``"firm"``.

    Args:
        rects: Image rectangles to label.
        anchors: Anchor rectangles keyed by ``"client"`` / ``"firm"``.
        existing_signatures: Signatures already present; only ``Role`` is read.

    Returns:
        One role per rectangle, in the same order as ``rects``.
    """

    def classify(image_rect) -> str:
        to_client = self._RoleDistance(image_rect, anchors.get("client", []))
        to_firm = self._RoleDistance(image_rect, anchors.get("firm", []))
        if to_client is None:
            return "unknown" if to_firm is None else "firm"
        if to_firm is None:
            return "client"
        return "client" if to_client <= to_firm else "firm"

    roles = [classify(image_rect) for image_rect in rects]

    client_known = any(sig.Role == "client" for sig in existing_signatures)
    if not client_known and "client" not in roles and rects:
        # First rectangle with the greatest y1 is taken to be the client's.
        bottom_index, _ = max(enumerate(rects), key=lambda pair: pair[1].y1)
        roles[bottom_index] = "client"

    if "firm" not in roles:
        roles = ["firm" if role == "unknown" else role for role in roles]
    return roles
325
+
326
def _RoleDistance(self, rect: fitz.Rect, anchors: list[fitz.Rect]) -> float | None:
    """Return the smallest weighted gap between ``rect`` and any anchor.

    For one anchor the cost is ``2 * vertical_gap + horizontal_gap``, where
    each gap is the separation along that axis (0 when the spans overlap).

    Args:
        rect: Rectangle exposing ``x0``, ``y0``, ``x1``, ``y1``.
        anchors: Anchor rectangles with the same attributes.

    Returns:
        The minimum cost over all anchors, or ``None`` when ``anchors`` is empty.
    """
    if not anchors:
        return None

    def gap_cost(anchor) -> float:
        vertical = max(0.0, anchor.y0 - rect.y1, rect.y0 - anchor.y1)
        horizontal = max(0.0, anchor.x0 - rect.x1, rect.x0 - anchor.x1)
        # Vertical separation counts double.
        return vertical * 2.0 + horizontal

    return min(gap_cost(anchor) for anchor in anchors)
337
+
338
def _OverlapRatio(self, rect: fitz.Rect, other: fitz.Rect) -> float:
    """Return the intersection area relative to the smaller of the two areas.

    The divisor is clamped to at least 1.0. The result is 0.0 when the
    ``&`` operation raises, yields ``None``, or yields a non-positive area.

    Args:
        rect: First rectangle; must support ``&`` and ``get_area()``.
        other: Second rectangle, same requirements.

    Returns:
        ``intersection_area / max(1.0, min(area(rect), area(other)))``.
    """
    try:
        shared = rect & other
    except Exception:
        return 0.0

    shared_area = shared.get_area() if shared is not None else 0
    if shared_area <= 0:
        return 0.0

    smaller_area = min(rect.get_area(), other.get_area())
    return shared_area / max(1.0, smaller_area)
350
+
351
def _PdfTupleToRect(self, bbox: tuple[float, float, float, float], page_height: float) -> fitz.Rect:
    """Build a ``fitz.Rect`` from ``bbox`` with its y axis flipped.

    The x values are kept; each y value is replaced by ``page_height - y``,
    so the tuple's ``y1`` gives the rectangle's top edge and ``y0`` its bottom.

    Args:
        bbox: ``(x0, y0, x1, y1)`` tuple.
        page_height: Height used for the flip.

    Returns:
        ``fitz.Rect(x0, page_height - y1, x1, page_height - y0)``.
    """
    left, low_y, right, high_y = bbox
    return fitz.Rect(left, page_height - high_y, right, page_height - low_y)
356
+
357
def _HasBBox(self, bbox: tuple[float, float, float, float] | None) -> bool:
    """Tell whether ``bbox`` is a usable, non-degenerate box.

    Args:
        bbox: ``(x0, y0, x1, y1)`` tuple, or ``None``.

    Returns:
        False when ``bbox`` is missing/empty, does not have four items, is
        all zeros, or has a non-positive width or height; True otherwise.
    """
    if not bbox:
        return False
    if len(bbox) != 4:
        return False

    left, bottom, right, top = bbox
    # An all-zero tuple is treated as an unset box.
    if left == 0 and bottom == 0 and right == 0 and top == 0:
        return False

    width = right - left
    height = top - bottom
    return width > 0 and height > 0
364
+
119
365
  def _FindRoleLineRect(
120
366
  self,
121
367
  page,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sigdetect
3
- Version: 0.5.2
3
+ Version: 0.5.3
4
4
  Summary: Signature detection and role attribution for PDFs
5
5
  Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
6
6
  License: MIT
@@ -2,7 +2,7 @@ sigdetect/__init__.py,sha256=YvnTwlC1jfq83EhQS_1JjiiHK7_wJCCU1JvHv5E1qWY,573
2
2
  sigdetect/api.py,sha256=hDfa6z4SoHth1Dw9HDfSPiytMQrqu_oyBZlXBwSh9g4,11010
3
3
  sigdetect/cli.py,sha256=X5GqZ-PK67vz4OHN5r7h-V0hO886ZblUiUdKDuFowtU,10930
4
4
  sigdetect/config.py,sha256=3SP1rkcWBGXloCDFomBJRMRKZOvXuHQbhIBqpVrzYmY,8365
5
- sigdetect/cropping.py,sha256=IyiBfIEHBLvOv8t_d-O51BfpljTFpE-dG_RxDxJAzAo,16339
5
+ sigdetect/cropping.py,sha256=6O7xuEU0hOlv0Wfb4kr2DJS-JPEw_kDNx4mLeYPuXl8,86869
6
6
  sigdetect/eda.py,sha256=S92G1Gjmepri__D0n_V6foq0lQgH-RXI9anW8A58jfw,4681
7
7
  sigdetect/logging_setup.py,sha256=LMF8ao_a-JwH0S522T6aYTFX3e8Ajjv_5ODS2YiBcHA,6404
8
8
  sigdetect/utils.py,sha256=T9rubLf5T9JmjOHYMOba1j34fhOJaWocAXccnGTxRUE,5198
@@ -14,11 +14,11 @@ sigdetect/detector/__init__.py,sha256=nT52mCI9s03Rso_RS86mm223rJfl5GlGDFsXwMJ3z3
14
14
  sigdetect/detector/base.py,sha256=L-iXWXqsTetDc4jRZo_wOdbNpKqOY20mX9FefrugdT0,263
15
15
  sigdetect/detector/base_detector.py,sha256=GmAgUWO_fQgIfnihZSoyhR3wpnwZ-X3hS0Kuyz4G6Ys,608
16
16
  sigdetect/detector/file_result_model.py,sha256=j2gTc9Sw3fJOHlexYsR_m5DiwHA8DzIzAMToESfvo4A,1767
17
- sigdetect/detector/pymupdf_engine.py,sha256=N6oxvUa-48VvvhjbMk0R0kfScsggNKS7u5FLSeBRfWw,17358
17
+ sigdetect/detector/pymupdf_engine.py,sha256=ZcjMrCR6qxa4pvlvOf88OGWPQsCXnPmNN7yLyEv23Cc,27840
18
18
  sigdetect/detector/pypdf2_engine.py,sha256=kB8cIp_gMvCla0LIBi9sd19g0361Oc9TjCW_ZViUBJQ,47410
19
19
  sigdetect/detector/signature_model.py,sha256=T2Hmfkfz_hZsDzwOhepxfNmkedxQp3_XHdrP8yGKoCk,1322
20
- sigdetect-0.5.2.dist-info/METADATA,sha256=jLin7USVPqeA5tS7KCuPRRt1PLwdt-oJWhWuKSQa6hE,14131
21
- sigdetect-0.5.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
22
- sigdetect-0.5.2.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
23
- sigdetect-0.5.2.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
24
- sigdetect-0.5.2.dist-info/RECORD,,
20
+ sigdetect-0.5.3.dist-info/METADATA,sha256=lm6dyZlv6tS2L61G5u94D_vbPXQ3RHJPDRS5LlDDpc0,14131
21
+ sigdetect-0.5.3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
22
+ sigdetect-0.5.3.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
23
+ sigdetect-0.5.3.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
24
+ sigdetect-0.5.3.dist-info/RECORD,,