sigdetect 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sigdetect/cropping.py +2147 -21
- sigdetect/detector/pymupdf_engine.py +246 -0
- {sigdetect-0.5.2.dist-info → sigdetect-0.5.3.dist-info}/METADATA +1 -1
- {sigdetect-0.5.2.dist-info → sigdetect-0.5.3.dist-info}/RECORD +7 -7
- {sigdetect-0.5.2.dist-info → sigdetect-0.5.3.dist-info}/WHEEL +0 -0
- {sigdetect-0.5.2.dist-info → sigdetect-0.5.3.dist-info}/entry_points.txt +0 -0
- {sigdetect-0.5.2.dist-info → sigdetect-0.5.3.dist-info}/top_level.txt +0 -0
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
from pathlib import Path
|
|
6
|
+
import re
|
|
6
7
|
from typing import Iterable, cast
|
|
7
8
|
|
|
8
9
|
from .pypdf2_engine import PyPDF2Detector
|
|
@@ -26,6 +27,23 @@ class PyMuPDFDetector(PyPDF2Detector):
|
|
|
26
27
|
"representative": ("representative", "guardian", "parent"),
|
|
27
28
|
"attorney": ("attorney", "counsel", "lawyer"),
|
|
28
29
|
}
|
|
30
|
+
# Text cues that mark a line as an anchor for the *client* side of a retainer.
# _CollectRetainerRoleAnchors searches each lower-cased line with these.
RETAINER_CLIENT_PATTERNS = (
    re.compile(r"\bclient\b", re.IGNORECASE),
    re.compile(r"\bpatient\b", re.IGNORECASE),
    re.compile(r"\bagreed\b", re.IGNORECASE),
    re.compile(r"\baccepted\b", re.IGNORECASE),
    # No trailing \b here: ":" is a non-word character, so r"\bby:\b" only
    # matched when a word character followed the colon directly ("by:John")
    # and missed the usual "By: ______" and "By:" at the end of a line.
    re.compile(r"\bby:", re.IGNORECASE),
    re.compile(r"\bon this date\b", re.IGNORECASE),
    re.compile(r"\bname\b", re.IGNORECASE),
)
# Text cues that mark a line as an anchor for the *firm* side of a retainer.
RETAINER_FIRM_PATTERNS = (
    re.compile(r"\battorney\b", re.IGNORECASE),
    re.compile(r"\bauthorized\b", re.IGNORECASE),
    re.compile(r"\brepresentative\b", re.IGNORECASE),
    re.compile(r"\bfirm\b", re.IGNORECASE),
    re.compile(r"\bcounsel\b", re.IGNORECASE),
    re.compile(r"\blaw\b", re.IGNORECASE),
)
|
|
29
47
|
|
|
30
48
|
def __init__(self, configuration):
|
|
31
49
|
if fitz is None: # pragma: no cover - optional dependency
|
|
@@ -47,6 +65,7 @@ class PyMuPDFDetector(PyPDF2Detector):
|
|
|
47
65
|
widget_map = self._CollectWidgetRects(document)
|
|
48
66
|
self._ApplyWidgetRects(result.Signatures, widget_map)
|
|
49
67
|
self._InferPseudoRects(result.Signatures, document)
|
|
68
|
+
self._ExpandRetainerImageSignatures(result, document)
|
|
50
69
|
return result
|
|
51
70
|
|
|
52
71
|
# ───────────────────────────────── widget helpers ─────────────────────────────────
|
|
@@ -116,6 +135,233 @@ class PyMuPDFDetector(PyPDF2Detector):
|
|
|
116
135
|
signature.Page = page_index + 1
|
|
117
136
|
break
|
|
118
137
|
|
|
138
|
+
# ───────────────────────── retainer image expansion ─────────────────────────
|
|
139
|
+
def _ExpandRetainerImageSignatures(self, result, document) -> None:
    """Reconcile detected signatures with signature-like images on each page.

    Does nothing unless ``self.Profile`` is ``"retainer"``,
    ``self.Configuration.PseudoSignatures`` is truthy and ``result`` already
    holds at least one signature.

    For every page that has candidate image rectangles (after
    ``_FilterSignatureImageRects``) and whose text lines contain a signature
    cue, each image rectangle is either:

    * merged into the existing signature on that page whose bounding box
      overlaps it best, provided the overlap ratio is at least 0.6, or
    * appended to ``result.Signatures`` as a new pseudo signature.

    When at least one signature was added, pseudo or box-less signatures on
    pages without qualifying images are dropped and the summary fields of
    ``result`` are recomputed.  When only the role of an existing signature
    changed, ``result.Roles`` is recomputed so that it does not go stale.

    Args:
        result: File result whose ``Signatures`` list and summary fields are
            updated in place.
        document: Opened document exposing ``page_count`` and ``load_page``.
    """
    if self.Profile != "retainer":
        return
    if not self.Configuration.PseudoSignatures:
        return
    if not result.Signatures:
        return

    signatures_by_page: dict[int, list[Signature]] = {}
    for sig in result.Signatures:
        if sig.Page:
            signatures_by_page.setdefault(sig.Page, []).append(sig)

    added = False
    role_changed = False
    image_pages: set[int] = set()
    for page_index in range(document.page_count):
        page_number = page_index + 1
        page = document.load_page(page_index)
        image_rects = self._FilterSignatureImageRects(self._CollectImageRects(page), page.rect)
        if not image_rects:
            continue

        lines = self._ExtractLines(page)
        if not self._HasRetainerSignatureCue(lines):
            continue
        image_pages.add(page_number)
        anchors = self._CollectRetainerRoleAnchors(lines)
        existing_on_page = signatures_by_page.get(page_number, [])
        roles_for_rects = self._AssignRolesToImages(image_rects, anchors, existing_on_page)

        # Snapshot of the boxes known before this page is processed; signatures
        # added below are deliberately not matched against later images.
        existing_rects: list[tuple[Signature, fitz.Rect]] = [
            (sig, self._PdfTupleToRect(sig.BoundingBox, page.rect.height))
            for sig in existing_on_page
            if sig.BoundingBox
        ]

        for rect, role in zip(image_rects, roles_for_rects):
            matched_sig = None
            best_overlap = 0.0
            for sig, sig_rect in existing_rects:
                overlap = self._OverlapRatio(rect, sig_rect)
                if overlap > best_overlap:
                    best_overlap = overlap
                    matched_sig = sig

            if matched_sig and best_overlap >= 0.6:
                # The image is the drawn form of an already detected signature:
                # adopt the image's box and, where allowed, its role.
                matched_sig.BoundingBox = self._RectToPdfTuple(rect, page.rect.height)
                if not matched_sig.RenderType:
                    matched_sig.RenderType = "drawn"
                if role != "unknown" and (
                    matched_sig.Role == "unknown"
                    or matched_sig.FieldName == "vendor_or_acro_detected"
                    or matched_sig.Hint == "VendorOrAcroOnly"
                ):
                    if matched_sig.Role != role:
                        role_changed = True
                    matched_sig.Role = role
                    matched_sig.Score = max(int(matched_sig.Score or 0), 1)
                    matched_sig.Scores = {role: matched_sig.Score}
                # Record the evidence once, even if several images match the
                # same signature.
                evidence = list(matched_sig.Evidence or [])
                if "image:retainer" not in evidence:
                    evidence.append("image:retainer")
                matched_sig.Evidence = evidence
                continue

            if role == "unknown":
                role = "firm"
            new_sig = Signature(
                Page=page_number,
                FieldName="vendor_or_acro_detected",
                Role=role,
                Score=1,
                Scores={role: 1},
                Evidence=["image:retainer", "pseudo:true"],
                Hint="VendorOrAcroOnly",
                RenderType="drawn",
                BoundingBox=self._RectToPdfTuple(rect, page.rect.height),
            )
            result.Signatures.append(new_sig)
            signatures_by_page.setdefault(page_number, []).append(new_sig)
            added = True

    if not (added or role_changed):
        return

    if added:
        # ``added`` implies ``image_pages`` is non-empty.  Drop pseudo or
        # box-less signatures that sit on pages without a qualifying image.
        filtered: list[Signature] = []
        for sig in result.Signatures:
            if sig.Page and sig.Page not in image_pages:
                if sig.FieldName == "vendor_or_acro_detected" or not self._HasBBox(sig.BoundingBox):
                    continue
            filtered.append(sig)
        result.Signatures = filtered

        pages = sorted({sig.Page for sig in result.Signatures if sig.Page})
        result.SignatureCount = len(result.Signatures)
        result.SignaturePages = ",".join(str(p) for p in pages)
        result.ElectronicSignatureFound = result.SignatureCount > 0
        hints = {h for h in (result.Hints or "").split(";") if h}
        hints |= {sig.Hint for sig in result.Signatures if sig.Hint}
        if hints:
            result.Hints = ";".join(sorted(hints))

    # Roles are refreshed for additions *and* for in-place role changes; the
    # previous value is kept when no concrete role is known.
    roles = sorted({sig.Role for sig in result.Signatures if sig.Role and sig.Role != "unknown"})
    if roles:
        result.Roles = ";".join(roles)
|
|
241
|
+
|
|
242
|
+
def _CollectImageRects(self, page) -> list[fitz.Rect]:
    """Return the placement rectangles of every image referenced by ``page``.

    Collection is best effort: if the page's image list cannot be read an
    empty list is returned, and an image whose rectangles cannot be resolved
    is skipped.
    """
    collected: list[fitz.Rect] = []
    try:
        image_entries = page.get_images(full=True)
    except Exception:
        return collected

    for entry in image_entries:
        # The first element of each entry is passed on as the image reference.
        image_ref = entry[0]
        try:
            collected += page.get_image_rects(image_ref)
        except Exception:
            continue
    return collected
|
|
255
|
+
|
|
256
|
+
def _FilterSignatureImageRects(self, rects: list[fitz.Rect], page_rect) -> list[fitz.Rect]:
    """Keep only the image rectangles that could plausibly be a signature.

    A rectangle is kept when its area is positive, covers at most 35% of the
    page area, and it is at least 1.3 times as wide as it is tall (width and
    height are each clamped to a minimum of 1.0 before the ratio is taken).
    Every kept item is a fresh ``fitz.Rect`` built from the input rectangle.
    """
    if not rects:
        return []

    area_limit = page_rect.width * page_rect.height * 0.35
    candidates: list[fitz.Rect] = []
    for raw in rects:
        box = fitz.Rect(raw)
        box_area = box.get_area()
        if box_area <= 0 or box_area > area_limit:
            continue
        clamped_width = max(1.0, float(box.width))
        clamped_height = max(1.0, float(box.height))
        if clamped_width / clamped_height < 1.3:
            continue
        candidates.append(box)
    return candidates
|
|
275
|
+
|
|
276
|
+
def _CollectRetainerRoleAnchors(
    self, lines: list[dict[str, float | str]]
) -> dict[str, list[fitz.Rect]]:
    """Group text-line rectangles by the retainer role their wording suggests.

    Each line's ``"lower_text"`` is searched with ``RETAINER_CLIENT_PATTERNS``
    and ``RETAINER_FIRM_PATTERNS``; a line matching both sets is listed under
    both roles.

    Returns:
        ``{"client": [...], "firm": [...]}`` where every entry is a
        ``fitz.Rect`` built from the line's ``x0``/``y0``/``x1``/``y1``.
    """
    client_rects: list[fitz.Rect] = []
    firm_rects: list[fitz.Rect] = []
    anchors: dict[str, list[fitz.Rect]] = {"client": client_rects, "firm": firm_rects}
    for entry in lines:
        text = str(entry["lower_text"])
        line_rect = fitz.Rect(entry["x0"], entry["y0"], entry["x1"], entry["y1"])
        if any(pattern.search(text) for pattern in self.RETAINER_CLIENT_PATTERNS):
            client_rects.append(line_rect)
        if any(pattern.search(text) for pattern in self.RETAINER_FIRM_PATTERNS):
            firm_rects.append(line_rect)
    return anchors
|
|
288
|
+
|
|
289
|
+
def _HasRetainerSignatureCue(self, lines: list[dict[str, float | str]]) -> bool:
    """Tell whether any line's ``"lower_text"`` contains a signature cue word.

    Matching is plain substring containment, so a cue embedded in a longer
    word (e.g. "signed" inside "unsigned") also counts.
    """
    cue_words = ("signature", "signed", "agreed", "accepted", "authorized", "attorney", "by:")
    line_texts = (str(entry["lower_text"]) for entry in lines)
    return any(any(word in text for word in cue_words) for text in line_texts)
|
|
296
|
+
|
|
297
|
+
def _AssignRolesToImages(
    self,
    rects: list[fitz.Rect],
    anchors: dict[str, list[fitz.Rect]],
    existing_signatures: list[Signature],
) -> list[str]:
    """Choose a role ("client", "firm" or "unknown") for each image rectangle.

    Each rectangle takes the role whose anchors score the smaller
    ``_RoleDistance`` (ties go to "client"); with no anchors at all it starts
    as "unknown".  Two fix-ups follow:

    * if no existing signature is a client and no image was assigned
      "client", the image with the largest ``y1`` is forced to "client";
    * if no image ended up as "firm", every remaining "unknown" becomes
      "firm".

    Returns:
        One role per entry of ``rects``, in the same order.
    """

    def pick_role(image_rect) -> str:
        to_client = self._RoleDistance(image_rect, anchors.get("client", []))
        to_firm = self._RoleDistance(image_rect, anchors.get("firm", []))
        if to_client is None:
            return "unknown" if to_firm is None else "firm"
        if to_firm is None:
            return "client"
        return "client" if to_client <= to_firm else "firm"

    assigned = [pick_role(image_rect) for image_rect in rects]

    client_already_detected = any(sig.Role == "client" for sig in existing_signatures)
    if rects and not client_already_detected and "client" not in assigned:
        # max() keeps the first of equal candidates, i.e. the smallest index.
        forced_index, _ = max(enumerate(rects), key=lambda pair: pair[1].y1)
        assigned[forced_index] = "client"

    if "firm" not in assigned:
        assigned = ["firm" if role == "unknown" else role for role in assigned]
    return assigned
|
|
325
|
+
|
|
326
|
+
def _RoleDistance(self, rect: fitz.Rect, anchors: list[fitz.Rect]) -> float | None:
    """Score how far ``rect`` is from its nearest anchor; smaller is closer.

    The score for one anchor is twice the vertical gap plus the horizontal
    gap between the two rectangles, each gap being 0.0 when they overlap on
    that axis.

    Returns:
        The smallest score over ``anchors``, or ``None`` when there are none.
    """
    if not anchors:
        return None

    def axis_gap(anchor_low, anchor_high, rect_low, rect_high):
        # Distance between two intervals on one axis, clamped at zero.
        return max(0.0, max(anchor_low - rect_high, rect_low - anchor_high))

    return min(
        axis_gap(anchor.y0, anchor.y1, rect.y0, rect.y1) * 2.0
        + axis_gap(anchor.x0, anchor.x1, rect.x0, rect.x1)
        for anchor in anchors
    )
|
|
337
|
+
|
|
338
|
+
def _OverlapRatio(self, rect: fitz.Rect, other: fitz.Rect) -> float:
    """Return the intersection area relative to the smaller rectangle.

    The divisor is the smaller of the two areas, but never less than 1.0.
    The result is 0.0 when the intersection cannot be computed, is ``None``,
    or has no positive area.
    """
    try:
        intersection = rect & other
    except Exception:
        intersection = None

    shared_area = 0 if intersection is None else intersection.get_area()
    if shared_area <= 0:
        return 0.0

    smaller_area = min(rect.get_area(), other.get_area())
    return shared_area / max(1.0, smaller_area)
|
|
350
|
+
|
|
351
|
+
def _PdfTupleToRect(self, bbox: tuple[float, float, float, float], page_height: float) -> fitz.Rect:
    """Build a ``fitz.Rect`` from ``bbox`` with its y axis flipped.

    The x values are kept; each y value is replaced by ``page_height - y`` and
    the two swap places, so the larger input y yields the rectangle's top.
    """
    left, y_low, right, y_high = bbox
    return fitz.Rect(left, page_height - y_high, right, page_height - y_low)
|
|
356
|
+
|
|
357
|
+
def _HasBBox(self, bbox: tuple[float, float, float, float] | None) -> bool:
    """Tell whether ``bbox`` is a four-value box with positive width and height.

    Missing, empty, wrong-length, all-zero, and inverted or degenerate boxes
    all yield ``False``.
    """
    if not bbox or len(bbox) != 4:
        return False
    x_min, y_min, x_max, y_max = bbox
    # An all-zero placeholder box has no extent, so this check rejects it too.
    return (x_max - x_min) > 0 and (y_max - y_min) > 0
|
|
364
|
+
|
|
119
365
|
def _FindRoleLineRect(
|
|
120
366
|
self,
|
|
121
367
|
page,
|
|
@@ -2,7 +2,7 @@ sigdetect/__init__.py,sha256=YvnTwlC1jfq83EhQS_1JjiiHK7_wJCCU1JvHv5E1qWY,573
|
|
|
2
2
|
sigdetect/api.py,sha256=hDfa6z4SoHth1Dw9HDfSPiytMQrqu_oyBZlXBwSh9g4,11010
|
|
3
3
|
sigdetect/cli.py,sha256=X5GqZ-PK67vz4OHN5r7h-V0hO886ZblUiUdKDuFowtU,10930
|
|
4
4
|
sigdetect/config.py,sha256=3SP1rkcWBGXloCDFomBJRMRKZOvXuHQbhIBqpVrzYmY,8365
|
|
5
|
-
sigdetect/cropping.py,sha256=
|
|
5
|
+
sigdetect/cropping.py,sha256=6O7xuEU0hOlv0Wfb4kr2DJS-JPEw_kDNx4mLeYPuXl8,86869
|
|
6
6
|
sigdetect/eda.py,sha256=S92G1Gjmepri__D0n_V6foq0lQgH-RXI9anW8A58jfw,4681
|
|
7
7
|
sigdetect/logging_setup.py,sha256=LMF8ao_a-JwH0S522T6aYTFX3e8Ajjv_5ODS2YiBcHA,6404
|
|
8
8
|
sigdetect/utils.py,sha256=T9rubLf5T9JmjOHYMOba1j34fhOJaWocAXccnGTxRUE,5198
|
|
@@ -14,11 +14,11 @@ sigdetect/detector/__init__.py,sha256=nT52mCI9s03Rso_RS86mm223rJfl5GlGDFsXwMJ3z3
|
|
|
14
14
|
sigdetect/detector/base.py,sha256=L-iXWXqsTetDc4jRZo_wOdbNpKqOY20mX9FefrugdT0,263
|
|
15
15
|
sigdetect/detector/base_detector.py,sha256=GmAgUWO_fQgIfnihZSoyhR3wpnwZ-X3hS0Kuyz4G6Ys,608
|
|
16
16
|
sigdetect/detector/file_result_model.py,sha256=j2gTc9Sw3fJOHlexYsR_m5DiwHA8DzIzAMToESfvo4A,1767
|
|
17
|
-
sigdetect/detector/pymupdf_engine.py,sha256=
|
|
17
|
+
sigdetect/detector/pymupdf_engine.py,sha256=ZcjMrCR6qxa4pvlvOf88OGWPQsCXnPmNN7yLyEv23Cc,27840
|
|
18
18
|
sigdetect/detector/pypdf2_engine.py,sha256=kB8cIp_gMvCla0LIBi9sd19g0361Oc9TjCW_ZViUBJQ,47410
|
|
19
19
|
sigdetect/detector/signature_model.py,sha256=T2Hmfkfz_hZsDzwOhepxfNmkedxQp3_XHdrP8yGKoCk,1322
|
|
20
|
-
sigdetect-0.5.
|
|
21
|
-
sigdetect-0.5.
|
|
22
|
-
sigdetect-0.5.
|
|
23
|
-
sigdetect-0.5.
|
|
24
|
-
sigdetect-0.5.
|
|
20
|
+
sigdetect-0.5.3.dist-info/METADATA,sha256=lm6dyZlv6tS2L61G5u94D_vbPXQ3RHJPDRS5LlDDpc0,14131
|
|
21
|
+
sigdetect-0.5.3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
22
|
+
sigdetect-0.5.3.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
|
|
23
|
+
sigdetect-0.5.3.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
|
|
24
|
+
sigdetect-0.5.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|