sigdetect 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1114 @@
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ import re
5
+ import warnings
6
+ import zlib
7
+ from collections import defaultdict
8
+ from collections.abc import Iterator
9
+ from contextlib import contextmanager, redirect_stderr, redirect_stdout, suppress
10
+ from pathlib import Path
11
+
12
+ from pypdf import PdfReader, generic
13
+ from pypdf.errors import PdfReadWarning
14
+
15
+ from ..config import DetectConfiguration
16
+ from ..utils import (
17
+ AsDictionary,
18
+ ChooseRole,
19
+ GetFieldNameFromAncestry,
20
+ HasSignatureFieldInAncestry,
21
+ HasSignatureValue,
22
+ LoadPatterns,
23
+ NormalizeText,
24
+ RolesFromGeneral,
25
+ RolesFromLabels,
26
+ )
27
+ from .base import Detector, FileResult, Signature
28
+
29
# ────────────────────────── silence noisy pdf warnings ──────────────────────────
# Runs at import time: installs an "ignore" filter for PdfReadWarning messages
# matching "Multiple definitions in dictionary ... key /Subtype". Other
# PdfReadWarning messages are not affected by this filter.
warnings.filterwarnings(
    "ignore",
    message=r"Multiple definitions in dictionary.*key /Subtype",
    category=PdfReadWarning,
)
35
+
36
# ---------------- fallbacks (used only if YAML omits them) ----------------
# Regex sources for vendor markers searched in raw PDF bytes. The detector
# encodes each entry to bytes and compiles it case-insensitively, and uses this
# list only when the loaded patterns have no (or an empty) "bytes" entry.
DEFAULT_VENDOR_BYTES = [
    r"/DocuSign",
    r"/Adobe\.PPKLite",
    r"/DocTimeStamp",
    r"/DSS",
    r"/AcrobatSign",
    r"/HelloSign",
    r"/Vinesign",
    r"/PandaDoc",
]
# Regex sources for vendor phrases searched in extracted/decoded text
# (compiled case-insensitively). Used only when the loaded patterns have no
# (or an empty) "text" entry.
DEFAULT_VENDOR_TEXT = [
    r"DocuSign\s+Envelope\s+ID",
    r"Signature\s+Certificate",
    r"Electronic\s+Record\s+and\s+Signature\s+Disclosure",
    r"Adobe\s+Acrobat\s+Sign|Acrobat\s+Sign",
    r"HelloSign|Dropbox\s+Sign",
    r"Vinesign",
    r"Signed\s+with\s+PandaDoc",
    r"Reference\s+number",
    r"Digitally\s+signed\s+by",
]
58
+
59
# Fallback role hints for form-field names, used when the loaded patterns provide
# neither "field_hints" nor "fieldname_hints". Keys are role names; values are
# substrings looked up in the field name after it has been lowercased and stripped
# of every non-alphanumeric character (hence "powerofattorney" with no separators).
DEFAULT_FIELDNAME_HINTS: dict[str, tuple[str, ...]] = {
    "patient": ("patient", "plaintiff", "self", "claimant"),
    "attorney": ("attorney", "lawyer", "counsel"),
    "representative": (
        "representative",
        "rep",
        "guardian",
        "parent",
        "executor",
        "custodian",
        "conservator",
        "poa",
        "powerofattorney",
    ),
    # retainer additions
    "client": ("client", "clientname", "clientsignature", "consumer"),
    "firm": ("firm", "lawfirm", "company", "corp", "authorizedsignatory"),
}

# Add robust “Parent/Guardian” label variant into representative labels.
# The detector ORs this into the "representative" label regex, or uses it alone
# when the loaded patterns define no such label. Matches e.g.
# "signature of parent", "signature of the guardian", "signature of parent / guardian".
REP_EXTRA = r"(signature\s+of\s+(the\s+)?(parent|guardian|parent\s*/\s*guardian))"
80
+
81
# Light retainer page cues (extra to YAML; safe defaults)
# Regex sources for page text that labels the client's signature block.
RETAIN_CLIENT_LABELS = [
    r"\bclient\s+signature\b",
    r"\bname\s+of\s+client\b",
    r"\bclient\b.*\bsignature\b",
    r"\bprint(?:ed)?\s+name\b.*\bclient\b",
]
# Regex sources for page text that labels the firm/attorney signature block.
RETAIN_FIRM_LABELS = [
    # No trailing \b: a word boundary after ":" only exists when a word character
    # follows immediately, so "By: Jane" and "By:" at end of line would never match.
    r"\bby:",
    r"\bfor\s+the\s+firm\b",
    r"\battorney'?s?\s+signature\b",
    r"\bcounsel\s+signature\b",
    r"\besq\.?\b",
]
# Regex sources for weak firm indicators (entity suffixes and "Law").
RETAIN_FIRM_MARKERS = [
    r"\bLLP\b",
    r"\bLLC\b",
    r"\bP\.?C\.?\b",
    r"\bP\.?A\.?\b",
    r"\bAttorneys?\s+at\s+Law\b",
    r"\bLaw\b",
]
103
+
104
# Patterns applied to decoded appearance-stream content.
# AP_DO_PATTERN captures, as group "name", the operand written before a `Do` operator.
AP_DO_PATTERN = re.compile(r"/(?P<name>[^\s]+)\s+Do\b")
# AP_TEXT_PATTERN matches the text-showing operators `Tj` / `TJ` as whole words.
AP_TEXT_PATTERN = re.compile(r"\b(TJ|Tj)\b")
# AP_VECTOR_PATTERN matches the tokens m, l, c and re as whole words.
# NOTE(review): IGNORECASE makes this match uppercase tokens such as `M` too —
# confirm that is intended.
AP_VECTOR_PATTERN = re.compile(r"\b(m|l|c|re)\b", re.IGNORECASE)
107
+
108
+
109
@contextmanager
def _QuietIo():
    """Discard everything written to stdout/stderr inside the ``with`` body.

    Both streams are redirected into one throw-away in-memory buffer for the
    duration of the block and restored on exit.
    """
    discard = io.StringIO()
    with redirect_stdout(discard):
        with redirect_stderr(discard):
            yield
115
+
116
+
117
+ class PyPDF2Detector(Detector):
118
+ Name = "pypdf2"
119
+
120
def __init__(self, configuration: DetectConfiguration):
    """Compile every pattern set the detector needs for the configured profile.

    The profile is read from ``configuration.Profile`` (or ``configuration.profile``)
    and defaults to ``"hipaa"``. Patterns come from ``LoadPatterns(profile)``;
    sections that are missing, empty or ``None`` fall back to module defaults.

    Args:
        configuration: Detection settings. ``RecurseXObjects`` /
            ``recurse_xobjects`` (default ``True``) toggles XObject recursion.
    """
    self.Configuration = configuration
    self.Profile = (
        getattr(configuration, "Profile", getattr(configuration, "profile", "hipaa")) or "hipaa"
    )
    pats = LoadPatterns(self.Profile)

    # Vendor patterns (fallback to defaults if missing)
    vb = pats.get("bytes") or DEFAULT_VENDOR_BYTES
    vt = pats.get("text") or DEFAULT_VENDOR_TEXT
    self.VendorBytePatterns = [re.compile(p.encode(), re.I) for p in vb]
    self.VendorTextPatterns = [re.compile(p, re.I) for p in vt]

    # Allow callers to disable expensive XObject recursion if desired
    self.RecurseXObjects = bool(
        getattr(
            configuration, "RecurseXObjects", getattr(configuration, "recurse_xobjects", True)
        )
    )

    # Role patterns (labels + general). `or {}` tolerates a section present as None.
    labels = dict(pats.get("labels") or {})
    # Ensure HIPAA representative includes Parent/Guardian phrasing
    if "representative" in labels:
        labels["representative"] = f"(?:{labels['representative']}|{REP_EXTRA})"
    else:
        labels["representative"] = REP_EXTRA

    # Retainer: ensure 'client' and 'firm' buckets exist
    if self.Profile == "retainer":
        if "client" not in labels:
            labels["client"] = "|".join(RETAIN_CLIENT_LABELS)
        if "firm" not in labels:
            labels["firm"] = "|".join(RETAIN_FIRM_LABELS)

    self.RoleLabelPatterns = {k: re.compile(v, re.I) for k, v in labels.items()}
    self.GeneralRolePatterns = {
        k: re.compile(v, re.I) for k, v in (pats.get("general") or {}).items()
    }

    # Field hints (accept either key name)
    raw_field_hints = (
        pats.get("field_hints") or pats.get("fieldname_hints") or DEFAULT_FIELDNAME_HINTS
    )
    self.FieldHints: dict[str, tuple[str, ...]] = {
        k: tuple(v) for k, v in raw_field_hints.items()
    }

    # Doc hard rules + weights
    self.DocumentHardRules = {
        k: re.compile(v, re.I) for k, v in (pats.get("doc_hard") or {}).items()
    }
    # Merge configured weights over the defaults so a partial "weights" section
    # cannot cause a KeyError when a scoring method looks up a missing key.
    default_weights = {
        "field": 3,
        "page_label": 2,
        "general": 1,
        "doc_hint_strong": 3,
        "doc_hint_weak": 2,
    }
    self.WeightConfiguration = {**default_weights, **(pats.get("weights") or {})}

    # Retainer page-scoring patterns. Compiled for every profile: the non-HIPAA
    # detection path uses them regardless of the profile name, so defining them
    # only for "retainer" raised AttributeError for any other custom profile.
    self._ClientPagePatterns = [re.compile(p, re.I) for p in RETAIN_CLIENT_LABELS]
    self._FirmPagePatterns = [re.compile(p, re.I) for p in RETAIN_FIRM_LABELS]
    self._FirmMarkerPatterns = [re.compile(p, re.I) for p in RETAIN_FIRM_MARKERS]
    self._SignatureWord = re.compile(r"\bsignature\b", re.I)
    self._DateWord = re.compile(r"\bdate\b", re.I)
    # No trailing \b: after ":" a word boundary exists only when a word character
    # follows directly, so the common "By: <name>" label would never match.
    self._ByWord = re.compile(r"\bby:", re.I)

    # Heuristic to drop false widgets like DocuSign envelope ID
    self._EnvelopeNoise = re.compile(r"envelope[_\s-]*id|envelopeid|certificate|docid", re.I)
194
+
195
+ # ---------------- vendor scanning helpers ----------------
196
+ def _ScanRaw(self, raw: bytes) -> set[str]:
197
+ """Scan bytes (already decompressed if needed) for vendor markers & text."""
198
+ hits: set[str] = set()
199
+ if not raw:
200
+ return hits
201
+ for rx in self.VendorBytePatterns:
202
+ if rx.search(raw):
203
+ try:
204
+ pat = rx.pattern.decode("ascii", "ignore")
205
+ except Exception:
206
+ pat = str(rx.pattern)
207
+ hits.add(f"VendorBytes:{pat}")
208
+ # also search text markers inside bytes
209
+ textish = raw.decode("latin1", "ignore")
210
+ for rx in self.VendorTextPatterns:
211
+ if rx.search(textish):
212
+ hits.add(f"VendorText:{rx.pattern}")
213
+ return hits
214
+
215
def _ScanPageVendors(self, page) -> set[str]:
    """Return vendor hints found in a page's content streams and extracted text.

    Content stream data is scanned with ``_ScanRaw``; the text returned by
    ``page.extract_text()`` is then checked against the vendor text patterns.
    """
    with _QuietIo():
        contents = page.get_contents()
        if contents is None:
            payloads: list[bytes] = []
        elif isinstance(contents, list):
            payloads = [part.get_data() for part in contents if hasattr(part, "get_data")]
        elif hasattr(contents, "get_data"):
            payloads = [contents.get_data()]
        else:
            payloads = []

    markers: set[str] = set()
    for payload in payloads:
        markers.update(self._ScanRaw(payload))

    with _QuietIo():
        extracted = page.extract_text() or ""
    markers.update(
        f"VendorText:{rx.pattern}" for rx in self.VendorTextPatterns if rx.search(extracted)
    )
    return markers
238
+
239
def _IterateFormXObjects(self, page) -> Iterator[generic.DictionaryObject]:
    """Yield every Form XObject reachable from the page's ``/Resources``.

    Forms nested in another form's own ``/Resources`` are yielded right after
    their parent. Each object is visited once. A missing ``/Resources`` or
    ``/XObject`` key simply ends that branch of the walk.
    """
    with suppress(KeyError):
        root_xobjects = page["/Resources"]["/XObject"]
        seen = set()

        def descend(candidate):
            entry = AsDictionary(candidate)
            if not isinstance(entry, generic.DictionaryObject):
                return
            subtype = entry.get("/Subtype")
            marker = (id(entry), subtype)
            if marker in seen:
                return
            seen.add(marker)
            if not (subtype == "/Form"):
                return
            yield entry
            with suppress(KeyError):
                children = entry["/Resources"]["/XObject"]
                for child in children.values():
                    yield from descend(child)

        for top in root_xobjects.values():
            yield from descend(top)
262
+
263
def _CollectXObjectVendorAndText(self, page) -> tuple[set[str], str]:
    """Scan the page's Form XObjects whether or not they are painted.

    Returns:
        ``(vendor_hits, text)`` where ``text`` is the space-joined latin-1
        decoding of every form stream that could be read. Forms whose data
        cannot be read are skipped.
    """
    vendor_hits: set[str] = set()
    decoded_chunks: list[str] = []
    for form in self._IterateFormXObjects(page):
        if not hasattr(form, "get_data"):
            continue
        with suppress(Exception), _QuietIo():
            payload = form.get_data()
            if payload:
                vendor_hits.update(self._ScanRaw(payload))
                decoded_chunks.append(payload.decode("latin1", "ignore"))
    return vendor_hits, " ".join(decoded_chunks)
275
+
276
+ # ---------------- appearance classification helpers ----------------
277
def _ExtractAppearanceStreams(self, candidate: object) -> list[object]:
    """Collect the stream objects reachable from an ``/AP`` entry.

    Anything exposing ``get_data`` counts as a stream and is returned as-is.
    Dictionaries and arrays are searched depth-first, in their own order.
    """

    def walk(node: object | None):
        if node is None:
            return
        resolved = AsDictionary(node)
        if isinstance(resolved, generic.IndirectObject):
            with suppress(Exception):
                resolved = resolved.get_object()
            resolved = AsDictionary(resolved)
        if resolved is None:
            return
        if hasattr(resolved, "get_data"):
            yield resolved
            return
        if isinstance(resolved, generic.DictionaryObject):
            children = resolved.values()
        elif isinstance(resolved, generic.ArrayObject):
            children = resolved
        else:
            return
        for child in children:
            yield from walk(child)

    return list(walk(candidate))
304
+
305
def _ResolveResources(self, stream, page) -> generic.DictionaryObject | None:
    """Find the resource dictionary that applies to an appearance stream.

    The stream's own ``/Resources`` is preferred; when it is not a dictionary
    the page's ``/Resources`` is used instead. Returns ``None`` if neither
    yields a dictionary.
    """

    def _no_lookup(*_):
        return None

    lookup = getattr(stream, "get", _no_lookup)
    own = AsDictionary(lookup("/Resources"))  # type: ignore[arg-type]
    if isinstance(own, generic.IndirectObject):
        with suppress(Exception):
            own = own.get_object()
        own = AsDictionary(own)
    if isinstance(own, generic.DictionaryObject):
        return own
    if page is None:
        return None

    inherited = AsDictionary(page.get("/Resources")) if page else None
    if isinstance(inherited, generic.IndirectObject):
        with suppress(Exception):
            inherited = inherited.get_object()
    if isinstance(inherited, generic.DictionaryObject):
        return inherited
    return None
321
+
322
def _DoTargetsImage(self, name: str, resources: generic.DictionaryObject | None) -> bool:
    """Determine whether ``name`` resolves to an Image XObject.

    Args:
        name: XObject name as used by a ``Do`` operator, with or without "/".
        resources: Resource dictionary to resolve the name in, or ``None``.

    Returns:
        ``True`` when the named XObject has ``/Subtype /Image``. When the name
        resolves to an XObject with any other subtype the answer is ``False``.
        The name-prefix heuristic is used only when the name cannot be resolved
        or the target carries no ``/Subtype``.
    """

    normalized = name.lstrip("/")
    if resources is not None:
        xobjects = AsDictionary(resources.get("/XObject"))
        if isinstance(xobjects, generic.IndirectObject):
            with suppress(Exception):
                xobjects = xobjects.get_object()
        if isinstance(xobjects, generic.DictionaryObject):
            for key, value in xobjects.items():
                key_name = str(key)
                if key_name.startswith("/"):
                    key_name = key_name[1:]
                if key_name != normalized:
                    continue
                target = AsDictionary(value)
                if isinstance(target, generic.IndirectObject):
                    with suppress(Exception):
                        target = target.get_object()
                    target = AsDictionary(target)
                if isinstance(target, generic.DictionaryObject):
                    subtype = target.get("/Subtype")
                    if subtype is not None:
                        # An explicit subtype is authoritative: a Form XObject that
                        # happens to be named "Im..." must not count as an image.
                        return subtype == "/Image"
    # Fallback heuristic: appearance streams typically prefix image XObjects with "Im".
    return normalized.lower().startswith("im")
347
+
348
def _ClassifyAppearance(self, widget: generic.DictionaryObject, page) -> str:
    """Label a widget's normal appearance as drawn, typed, hybrid or unknown.

    Every stream under ``/AP`` ``/N`` is inspected for text operators, vector
    tokens and ``Do`` operators that target an image:

    * image plus text or vector content -> ``"hybrid"``
    * image only -> ``"drawn"``
    * text or vector content only -> ``"typed"``
    * nothing usable -> ``"unknown"``
    """

    appearance = AsDictionary(widget.get("/AP"))
    if not isinstance(appearance, generic.DictionaryObject):
        return "unknown"
    streams = self._ExtractAppearanceStreams(appearance.get("/N"))
    if not streams:
        return "unknown"

    saw_text = saw_vector = saw_image = False
    for stream in streams:
        try:
            payload = stream.get_data()  # type: ignore[attr-defined]
        except Exception:
            continue
        if not payload:
            continue

        content = payload.decode("latin1", "ignore")
        saw_text = saw_text or bool(AP_TEXT_PATTERN.search(content))
        saw_vector = saw_vector or bool(AP_VECTOR_PATTERN.search(content))

        targets = {hit.group("name").lstrip("/") for hit in AP_DO_PATTERN.finditer(content)}
        if targets:
            resources = self._ResolveResources(stream, page)
            if any(self._DoTargetsImage(target, resources) for target in targets):
                saw_image = True

    if saw_image:
        return "hybrid" if (saw_text or saw_vector) else "drawn"
    if saw_text or saw_vector:
        return "typed"
    return "unknown"
392
+
393
+ # ---- file-wide stream scan (compressed or not)
394
+ def _ScanFileStreamsForVendors(self, file_bytes: bytes) -> tuple[set[str], str]:
395
+ """
396
+ Find all 'stream ... endstream' blocks, test raw and decompressed (zlib/gzip),
397
+ and return (vendor_hits, decoded_text_blob).
398
+ """
399
+ hits: set[str] = set()
400
+ texts: list[str] = []
401
+ if not file_bytes:
402
+ return hits, ""
403
+
404
+ # quick pass on the whole file
405
+ hits |= self._ScanRaw(file_bytes)
406
+
407
+ for m in re.finditer(rb"stream\s*[\r\n]+(.*?)\s*endstream", file_bytes, re.DOTALL):
408
+ chunk = m.group(1)
409
+ if not chunk:
410
+ continue
411
+
412
+ # raw scan + raw text
413
+ hits |= self._ScanRaw(chunk)
414
+ texts.append(chunk.decode("latin1", "ignore"))
415
+
416
+ # try decompress with multiple wbits
417
+ for wbits in (15, -15, 31):
418
+ try:
419
+ dec = zlib.decompress(chunk, wbits)
420
+ if dec:
421
+ hits |= self._ScanRaw(dec)
422
+ texts.append(dec.decode("latin1", "ignore"))
423
+ break
424
+ except Exception:
425
+ continue
426
+ return hits, " ".join(texts)
427
+
428
+ # ---------------- helpers for widgets ----------------
429
def _FieldNameForWidget(self, wdict: generic.DictionaryObject) -> str:
    """Return the best available field name for a widget, or ``""``.

    Lookup order: the widget's own name entries, then its ``/Parent``'s, then
    whatever ``GetFieldNameFromAncestry`` reports.
    """
    own_name = self._PickNameAny(wdict)
    if own_name:
        return own_name

    parent = AsDictionary(wdict.get("/Parent"))
    if isinstance(parent, generic.DictionaryObject):
        parent_name = self._PickNameAny(parent)
        if parent_name:
            return parent_name

    inherited = GetFieldNameFromAncestry(wdict)
    if inherited is None:
        return ""
    return str(inherited)
440
+
441
+ @staticmethod
442
+ def _PickNameAny(d: generic.DictionaryObject) -> str | None:
443
+ for key in ("/T", "/TU", "/TM"):
444
+ v = d.get(key)
445
+ if v:
446
+ try:
447
+ return str(v)
448
+ except Exception:
449
+ return None
450
+ return None
451
+
452
def _IsSignatureWidget(self, wdict: generic.DictionaryObject) -> bool:
    """Report whether a widget belongs to a signature field.

    A widget qualifies when its ``/FT`` is ``/Sig``, when
    ``HasSignatureFieldInAncestry`` says so, or when its ``/V`` value is a
    dictionary with ``/Type /Sig``. Any exception raised while checking counts
    as "not a signature widget".
    """
    with suppress(Exception):
        if wdict.get("/FT") == "/Sig" or HasSignatureFieldInAncestry(wdict):
            return True

        # value object might be an indirect sig dict
        value = wdict.get("/V")
        if isinstance(value, generic.IndirectObject):
            value = value.get_object()
        value_dict = AsDictionary(value)
        if isinstance(value_dict, generic.DictionaryObject) and value_dict.get("/Type") == "/Sig":
            return True

        # Heuristic: drop known non-signature fields (DocuSign envelope, cert refs, etc.)
        # NOTE(review): this branch is reached only after the positive checks failed
        # and both of its outcomes return False, so it does not change the result.
        label = (self._FieldNameForWidget(wdict) or "").strip()
        if label and self._EnvelopeNoise.search(label):
            return False
    return False
471
+
472
def _iter_widgets_with_ref(
    self, annots_obj
) -> Iterator[tuple[generic.DictionaryObject, generic.IndirectObject | None]]:
    """Yield ``(widget_dict, reference)`` for each ``/Widget`` under ``annots_obj``.

    ``annots_obj`` may be ``None``, an array, a dictionary or an indirect
    reference to either; arrays are expanded. ``reference`` is the indirect
    object the widget was reached through, or ``None`` for a direct dictionary.
    Items are taken from the end of the pending list first.
    """
    if annots_obj is None:
        return
    pending = [annots_obj]
    while pending:
        node = pending.pop()
        if isinstance(node, generic.IndirectObject):
            reference = node
            target = node.get_object()
        else:
            reference = None
            target = node

        if isinstance(target, generic.ArrayObject):
            pending.extend(target)
        elif isinstance(target, generic.DictionaryObject) and target.get("/Subtype") == "/Widget":
            yield target, reference
492
+
493
def _CollectAcroSignatures(self, reader: PdfReader) -> list[tuple[str, int | None, bool]]:
    """List the signature fields found in the document's ``/AcroForm``.

    Returns:
        One ``(field_name, page_number_or_None, has_kids_widget)`` tuple per
        signature field. ``page_number`` is 1-based and comes from a kid's
        ``/P`` reference; ``has_kids_widget`` is ``True`` when the field has a
        non-empty ``/Kids`` array. Any error while walking the form ends the
        walk and returns what was collected so far.
    """
    found: list[tuple[str, int | None, bool]] = []

    def locate_page(page_ref):
        # 1-based number of the page whose reference equals page_ref, else None.
        try:
            for number, candidate in enumerate(reader.pages, start=1):
                if candidate.indirect_reference == page_ref:
                    return number
        except Exception:
            pass
        return None

    def visit(node) -> None:
        field = AsDictionary(node)
        if not isinstance(field, generic.DictionaryObject):
            return

        is_signature = (
            field.get("/FT") == "/Sig"
            or HasSignatureFieldInAncestry(field)
            or HasSignatureValue(field)
        )
        if is_signature:
            label = (
                self._PickNameAny(field) or (GetFieldNameFromAncestry(field) or "") or "AcroSig"
            )
            page_number: int | None = None
            has_widget_kids = False

            widget_kids = AsDictionary(field.get("/Kids"))
            if isinstance(widget_kids, generic.ArrayObject):
                has_widget_kids = len(widget_kids) > 0
                for kid in widget_kids:
                    kid_dict = AsDictionary(kid)
                    if not isinstance(kid_dict, generic.DictionaryObject):
                        continue
                    with suppress(KeyError, AttributeError):
                        page_ref = kid_dict.get("/P")
                        if isinstance(page_ref, generic.IndirectObject):
                            hit = locate_page(page_ref)
                            if hit is not None:
                                page_number = hit
            found.append((str(label), page_number, has_widget_kids))

        # Descend into child fields regardless of whether this one matched.
        children = AsDictionary(field.get("/Kids"))
        if isinstance(children, generic.ArrayObject):
            for child in children:
                visit(child)

    with suppress(Exception):
        catalog = reader.trailer["/Root"]
        acro_form = catalog.get("/AcroForm")
        top_fields = AsDictionary(acro_form).get("/Fields") if acro_form else None
        if not isinstance(top_fields, generic.ArrayObject):
            return found
        for entry in top_fields:
            visit(entry)

    return found
548
+
549
+ # ---------------- role scoring (HIPAA) ----------------
550
+ def _RolesFromField(self, field_name: str) -> set[str]:
551
+ roles: set[str] = set()
552
+ compact = re.sub(r"[^a-z0-9]+", "", (field_name or "").lower())
553
+ if not compact:
554
+ return roles
555
+ for role, keys in self.FieldHints.items():
556
+ if any(k in compact for k in keys):
557
+ roles.add(role)
558
+ return roles
559
+
560
def _InferRole(self, field_name: str, page_text: str):
    """Score candidate roles from the field name and the page text.

    Evidence is gathered in three passes (field-name hints, page label
    patterns, general role patterns), each adding its configured weight.

    Returns:
        ``(role, evidence, scores, total)``: the role picked by ``ChooseRole``,
        the ``"<source>:<role>"`` evidence strings, the per-role scores as a
        plain dict, and the sum of all scores.
    """
    tally: dict[str, int] = defaultdict(int)
    trail: list[str] = []

    # Each source is evaluated lazily, in this order.
    sources = (
        ("field", lambda: self._RolesFromField(field_name)),
        ("page_label", lambda: RolesFromLabels(page_text, self.RoleLabelPatterns)),
        ("general", lambda: RolesFromGeneral(page_text, self.GeneralRolePatterns)),
    )
    for tag, produce in sources:
        for candidate in produce():
            tally[candidate] += self.WeightConfiguration[tag]
            trail.append(f"{tag}:{candidate}")

    chosen = ChooseRole(tally)
    return chosen, trail, dict(tally), sum(tally.values())
578
+
579
+ # ---------------- retainer utilities (pseudo, vendor-only) ----------------
580
def _RetainerPageScores(
    self,
    text: str,
    vendor_count: int,
    page_index0: int,
    total_pages: int,
) -> tuple[int, int, list[str]]:
    """Score one page as a likely client and/or firm signature page.

    Args:
        text: Page text; it is passed through ``NormalizeText`` before matching.
        vendor_count: Number of vendor hints recorded for this page.
        page_index0: Zero-based page index.
        total_pages: Number of pages in the document.

    Returns:
        ``(client_score, firm_score, evidence)``.
    """
    normalized = NormalizeText(text)
    weights = self.WeightConfiguration
    evidence: list[str] = []
    client_score = 0
    firm_score = 0

    # The same cues are consulted in several places; search once.
    has_signature = bool(self._SignatureWord.search(normalized))
    has_date = bool(self._DateWord.search(normalized))
    has_by = bool(self._ByWord.search(normalized))

    # explicit labels: every matching client pattern adds weight and evidence
    for pattern in self._ClientPagePatterns:
        if pattern.search(normalized):
            client_score += weights["page_label"]
            evidence.append("label:client")

    # firm labels add weight per match but are reported once
    firm_labelled = False
    for pattern in self._FirmPagePatterns:
        if pattern.search(normalized):
            firm_score += weights["page_label"]
            firm_labelled = True
    if firm_labelled:
        evidence.append("label:firm")

    # firm markers (LLP/LLC/etc.) count only past page 1 or next to a signature cue
    marker_allowed = page_index0 > 0 or has_signature or has_by
    marker_boosted = False
    for pattern in self._FirmMarkerPatterns:
        if marker_allowed and pattern.search(normalized):
            firm_score += weights["general"]  # light boost
            evidence.append("marker:firm")
            marker_boosted = True

    # signature word combined with role words / date
    if has_signature:
        if re.search(r"\bclient\b", normalized, re.I):
            client_score += 1
            evidence.append("word:signature+client")
        if has_by or re.search(r"\b(attorney|counsel|firm)\b", normalized, re.I):
            firm_score += 1
            evidence.append("word:signature+firm")
        if has_date:
            # common signature block layout has both
            client_score += 1
            firm_score += 1
            evidence.append("word:signature+date")

    # vendor hits seen on this page (from content/xobject) – weak but helpful
    if vendor_count > 0:
        client_score += 1
        firm_score += 1
        evidence.append("vendor:page_hit")

    # position prior: end of the doc tends to host signature blocks
    if total_pages >= 3 and page_index0 >= (2 * total_pages) // 3 - 1:
        client_score += 1
        firm_score += 1
        evidence.append("prior:end_of_doc")

    # general role regex (if YAML provided)
    for found_role in RolesFromGeneral(normalized, self.GeneralRolePatterns):
        if found_role == "client":
            client_score += weights["general"]
            evidence.append("general:client")
        if found_role in {"firm", "attorney"}:
            firm_score += weights["general"]
            evidence.append("general:firm")

    # FINAL dampener for page 1: undo a marker boost when no signature cue exists
    if marker_boosted and page_index0 == 0 and not has_signature and not has_by:
        firm_score = max(0, firm_score - weights["general"])
        evidence.append("dampen:front_matter")

    return client_score, firm_score, evidence
662
+
663
+ # ---------------- main ----------------
664
def Detect(self, pdf_path: Path) -> FileResult:
    """Analyse one PDF and summarise the signatures found in it.

    Steps: open the file, scan the raw bytes and every page for vendor
    markers, collect AcroForm signature fields and page-level signature
    widgets, then build ``Signature`` entries. The "hipaa" profile delegates to
    ``_DetectHipaaPath``; every other profile uses the retainer logic below.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        A ``FileResult``. Any exception is caught and reported as a result with
        ``Roles="error"`` and ``Hints="ERROR:<message>"`` instead of being raised.
    """
    try:
        with _QuietIo():
            reader = PdfReader(str(pdf_path))
        size_kb = round(pdf_path.stat().st_size / 1024, 1)
        pages = len(reader.pages)

        # file-wide vendor scan (+decompressed streams)
        try:
            _file_bytes = pdf_path.read_bytes()
        except Exception:
            # Unreadable bytes only disable the file-level scan.
            _file_bytes = b""
        file_vendor_hits, _stream_text_blob = self._ScanFileStreamsForVendors(_file_bytes)

        acro_sig_list = self._CollectAcroSignatures(reader)

        # Per-page accumulators; each list gets one entry per page, in page order.
        page_texts: list[str] = []
        vendor_hints: set[str] = set()
        vendor_hits_per_page: list[int] = []
        images_per_page: list[int] = []
        any_text, img_pages = False, 0

        for page in reader.pages:
            # per-page vendor
            pv = self._ScanPageVendors(page)
            x_hits: set[str] = set()
            x_text = ""
            if self.RecurseXObjects:
                x_hits, x_text = self._CollectXObjectVendorAndText(page)
            vendor_hints |= pv | x_hits
            vendor_hits_per_page.append(len(pv) + len(x_hits))

            with _QuietIo():
                txt = page.extract_text() or ""
            # Form XObject text is appended so role scoring can see it as well.
            if x_text:
                txt = f"{txt} {x_text}".strip() if txt else x_text.strip()
            page_texts.append(txt)
            any_text = any_text or bool(txt)

            # image counting
            img_count = 0
            with suppress(KeyError):
                xobjs = page["/Resources"]["/XObject"]
                img_count = sum(
                    1 for obj in xobjs.values() if AsDictionary(obj).get("/Subtype") == "/Image"
                )
            images_per_page.append(img_count)
            img_pages += 1 if img_count > 0 else 0

        # Merge file-level vendor hits (catches unpainted & compressed streams)
        vendor_hints |= file_vendor_hits

        # No text on any page but at least one page with an image.
        scanned_pdf = (not any_text) and (img_pages > 0)

        # --- find signature widgets on pages (strict)
        # Entries are (1-based page number, widget dictionary, indirect reference or None).
        page_widgets: list[
            tuple[int, generic.DictionaryObject, generic.IndirectObject | None]
        ] = []
        for idx, page in enumerate(reader.pages, start=1):
            for wdict, ref in self._iter_widgets_with_ref(page.get("/Annots")):
                if self._IsSignatureWidget(wdict):
                    page_widgets.append((idx, wdict, ref))
                # else: ignore envelope/cert widgets

        has_page_widgets = len(page_widgets) > 0
        has_acro = len(acro_sig_list) > 0
        has_vendor = len(vendor_hints) > 0
        acro_has_kids = any(hk for _, __, hk in acro_sig_list)

        # ───────────────────────────── HIPAA branch ─────────────────────────────
        if self.Profile == "hipaa":
            return self._DetectHipaaPath(
                pdf_path=pdf_path,
                reader=reader,
                page_texts=page_texts,
                vendor_hints=vendor_hints,
                scanned_pdf=scanned_pdf,
                page_widgets=page_widgets,
                acro_sig_list=acro_sig_list,
                has_page_widgets=has_page_widgets,
                has_acro=has_acro,
                has_vendor=has_vendor,
                acro_has_kids=acro_has_kids,
                size_kb=size_kb,
                pages=pages,
            )

        # ───────────────────────────── Retainer branch ─────────────────────────────
        signatures: list[Signature] = []

        if has_page_widgets:
            # Real widgets: infer role from page text; avoid envelope-id noise already filtered
            seen_refs: set[str] = set()
            seen_page_name: set[tuple[int, str]] = set()

            for idx, wdict, ref in page_widgets:
                field_name = self._FieldNameForWidget(wdict)
                page_obj = reader.pages[idx - 1] if 0 <= (idx - 1) < len(reader.pages) else None
                render_type = self._ClassifyAppearance(wdict, page_obj)

                # de-dup by object ref (if present) and (page, name)
                if isinstance(ref, generic.IndirectObject):
                    key = f"{ref.idnum}:{ref.generation}"
                    if key in seen_refs:
                        continue
                    seen_refs.add(key)

                if field_name:
                    key2 = (idx, field_name)
                    if key2 in seen_page_name:
                        continue
                    seen_page_name.add(key2)

                page_text = page_texts[idx - 1] if 0 <= (idx - 1) < len(page_texts) else ""
                c, f, ev = self._RetainerPageScores(
                    page_text, vendor_hits_per_page[idx - 1], idx - 1, pages
                )
                # Ties go to "client"; a zero score never selects a role.
                role = "client" if c >= f and c > 0 else ("firm" if f > 0 else "unknown")

                # fall back to generic role inference if indecisive
                if role == "unknown":
                    role, evidence, scores, total = self._InferRole(field_name, page_text)
                    evidence = evidence or ev
                    scores = scores or ({role: 1} if role != "unknown" else {})
                    total = total or sum(scores.values())
                else:
                    evidence = ev
                    scores = {role: (c if role == "client" else f)}
                    total = scores[role]

                signatures.append(
                    Signature(
                        Page=idx,
                        FieldName=field_name,
                        Role=role,
                        Score=total,
                        Scores=scores,
                        Evidence=evidence,
                        Hint=(f"AcroSig:{field_name}" if field_name else "AcroSig"),
                        RenderType=render_type,
                    )
                )

            # If only one role but page text clearly indicates both, add the second role pseudo.
            if len(signatures) == 1:
                pg = signatures[0].Page or 1
                c, f, ev = self._RetainerPageScores(
                    page_texts[pg - 1], vendor_hits_per_page[pg - 1], pg - 1, pages
                )
                want = None
                have = {signatures[0].Role}
                if "client" not in have and c > 0:
                    want = ("client", c)
                elif "firm" not in have and f > 0:
                    want = ("firm", f)
                if want:
                    r, sc = want
                    signatures.append(
                        Signature(
                            Page=pg,
                            FieldName="vendor_or_acro_detected",
                            Role=r,
                            Score=sc,
                            Scores={r: sc},
                            Evidence=ev + ["pseudo:true"],
                            Hint="VendorOrAcroOnly",
                        )
                    )

        else:
            # No widgets found. Retainers usually have two signees; pick likely pages.
            if self.Configuration.PseudoSignatures and (has_acro or has_vendor):
                # One (page_index0, client_score, firm_score, evidence) tuple per page.
                totals = []
                for i, text in enumerate(page_texts):
                    c, f, ev = self._RetainerPageScores(text, vendor_hits_per_page[i], i, pages)
                    totals.append((i, c, f, ev))

                # Pages with any signal
                candidates = [i for i, c, f, _ in totals if (c > 0 or f > 0)]

                # If page 1 is in candidates but has no signature cue, drop it (anti front-matter).
                def HasSignatureCue(i: int) -> bool:
                    t = page_texts[i]
                    return bool(self._SignatureWord.search(t) or self._ByWord.search(t))

                candidates = [i for i in candidates if not (i == 0 and not HasSignatureCue(i))]

                # If still empty, prefer the last page(s)
                if not candidates:
                    candidates = [p for p in range(max(0, pages - 2), pages)]

                # best client & firm pages
                c_best = max(candidates, key=lambda i: totals[i][1]) if candidates else None
                f_best = max(candidates, key=lambda i: totals[i][2]) if candidates else None

                def emit(page_idx: int | None, role: str, score: int, ev: list[str]):
                    # Appends one pseudo signature; page_idx is zero-based, and None
                    # places the signature on the last page.
                    pg = (page_idx + 1) if page_idx is not None else pages
                    signatures.append(
                        Signature(
                            Page=pg,
                            FieldName="vendor_or_acro_detected",
                            Role=role,
                            Score=score,
                            Scores={role: score} if score > 0 else {},
                            Evidence=ev + ["pseudo:true"],
                            Hint="VendorOrAcroOnly",
                        )
                    )

                if c_best is not None and totals[c_best][1] > 0:
                    emit(c_best, "client", totals[c_best][1], totals[c_best][3])
                if f_best is not None and totals[f_best][2] > 0:
                    emit(f_best, "firm", totals[f_best][2], totals[f_best][3])

                # If nothing yet, emit both roles on the last page as a conservative fallback.
                if not signatures:
                    emit(pages - 1, "client", 0, [])
                    emit(pages - 1, "firm", 0, [])

        # doc-level names for hints
        acro_names = {name for name, _pg, _hk in acro_sig_list if name}

        # scanned/mixed refinement for retainer:
        # If we emitted only pseudo signatures (no widgets on those pages and no vendor on them),
        # and those pages have images, mark scanned and mixed.
        if self.Profile == "retainer" and not has_page_widgets:
            pseudo_pages = [s.Page for s in signatures if s.Page]
            if pseudo_pages:
                # pvendors is True when NONE of the pseudo pages had a vendor hit.
                pvendors = all(
                    vendor_hits_per_page[p - 1] == 0 for p in pseudo_pages if p - 1 >= 0
                )
                pimages = any(images_per_page[p - 1] > 0 for p in pseudo_pages if p - 1 >= 0)
            else:
                pvendors = False
                pimages = False
            if pimages and pvendors:
                scanned_pdf = True  # scanned signatures present

        esign_found = (len(signatures) > 0) or has_vendor or has_acro
        mixed = esign_found and scanned_pdf

        doc_roles: set[str] = {s.Role for s in signatures if s.Role != "unknown"}

        # Hints combine AcroForm field names, vendor markers and per-signature hints.
        hints: set[str] = set()
        hints |= {f"AcroSig:{n}" for n in acro_names}
        hints |= set(vendor_hints)
        hints |= {s.Hint for s in signatures}

        return FileResult(
            File=pdf_path.name,
            SizeKilobytes=size_kb,
            PageCount=pages,
            ElectronicSignatureFound=esign_found,
            ScannedPdf=scanned_pdf,
            MixedContent=mixed,
            SignatureCount=len(signatures),
            SignaturePages=",".join(
                map(str, sorted({signature.Page for signature in signatures if signature.Page}))
            ),
            Roles=";".join(sorted(doc_roles)) if doc_roles else "unknown",
            Hints=";".join(sorted(hints)),
            Signatures=signatures,
        )

    except Exception as exc:  # capture errors per file
        return FileResult(
            File=pdf_path.name,
            SizeKilobytes=None,
            PageCount=0,
            ElectronicSignatureFound=False,
            ScannedPdf=None,
            MixedContent=None,
            SignatureCount=0,
            SignaturePages="",
            Roles="error",
            Hints=f"ERROR:{exc}",
            Signatures=[],
        )
942
+
943
+ # ───────────────────────── HIPAA path ─────────────────────────
944
def _DetectHipaaPath(
    self,
    *,
    pdf_path: Path,
    reader: PdfReader,
    page_texts: list[str],
    vendor_hints: set[str],
    scanned_pdf: bool,
    page_widgets: list[tuple[int, generic.DictionaryObject, generic.IndirectObject | None]],
    acro_sig_list: list[tuple[str, int | None, bool]],
    has_page_widgets: bool,
    has_acro: bool,
    has_vendor: bool,
    acro_has_kids: bool,
    size_kb: float,
    pages: int,
) -> FileResult:
    """Assemble the per-file result for the HIPAA detection path.

    Exactly one of three strategies produces the signature list:

    1. ``has_page_widgets``: one signature per entry of ``page_widgets``,
       de-duplicated by indirect-object reference and by (page, field name).
    2. ``acro_has_kids``: one non-pseudo signature per entry of
       ``acro_sig_list``, with the role inferred from the page text, or from
       the whole-document text when the page is unknown or out of range.
    3. Otherwise, when ``self.Configuration.PseudoSignatures`` is enabled and
       ``has_acro`` or ``has_vendor`` is set: a single pseudo signature
       (``Page=None``) whose role is scored from the document text.

    Args:
        pdf_path: Path of the file; only its ``name`` is reported.
        reader: Open reader, used to fetch page objects for appearance
            classification in the widget path.
        page_texts: Extracted text per page (index 0 is page 1).
        vendor_hints: Vendor markers; merged into ``Hints`` and counted
            towards ``ElectronicSignatureFound``.
        scanned_pdf: Reported as ``ScannedPdf`` and combined with the e-sign
            flag to produce ``MixedContent``.
        page_widgets: ``(page_number, widget_dict, ref)`` tuples; the page
            number is 1-based (it is used as ``page_number - 1`` for lookups).
        acro_sig_list: ``(field_name, page_number_or_None, flag)`` tuples;
            the third element is not used here.
        has_page_widgets: Selects strategy 1.
        has_acro: With ``has_vendor``, gates strategy 3.
        has_vendor: With ``has_acro``, gates strategy 3.
        acro_has_kids: Selects strategy 2 when there are no page widgets.
        size_kb: Reported as ``SizeKilobytes``.
        pages: Reported as ``PageCount``.

    Returns:
        A ``FileResult`` carrying the signatures and document-level summary.
    """
    signatures: list[Signature] = []

    def text_for_page(page_number: int | None) -> str:
        """Return the text of 1-based *page_number*, or "" if unknown/out of range."""
        # Both bounds are checked so that a zero or negative page number can
        # never wrap around to the end of ``page_texts`` (or raise IndexError).
        if page_number is not None and 1 <= page_number <= len(page_texts):
            return page_texts[page_number - 1]
        return ""

    if has_page_widgets:
        # --- real widgets path (NO pseudo allowed later)
        seen_refs: set[str] = set()
        seen_page_name: set[tuple[int, str]] = set()

        for idx, wdict, ref in page_widgets:
            field_name = self._FieldNameForWidget(wdict)

            # De-dup by object ref (if present) and by (page, name).
            if isinstance(ref, generic.IndirectObject):
                ref_key = f"{ref.idnum}:{ref.generation}"
                if ref_key in seen_refs:
                    continue
                seen_refs.add(ref_key)

            if field_name:
                name_key = (idx, field_name)
                if name_key in seen_page_name:
                    continue
                seen_page_name.add(name_key)

            # Classify only widgets that survive de-duplication; the result
            # for a skipped duplicate would be discarded anyway.
            # NOTE(review): assumes `_ClassifyAppearance` has no side effects
            # that later code relies on -- confirm.
            page_obj = reader.pages[idx - 1] if 0 <= (idx - 1) < len(reader.pages) else None
            render_type = self._ClassifyAppearance(wdict, page_obj)

            role, evidence, scores, total = self._InferRole(field_name, text_for_page(idx))
            signatures.append(
                Signature(
                    Page=idx,
                    FieldName=field_name,
                    Role=role,
                    Score=total,
                    Scores=scores,
                    Evidence=evidence,
                    Hint=(f"AcroSig:{field_name}" if field_name else "AcroSig"),
                    RenderType=render_type,
                )
            )

    elif acro_has_kids:
        # There are real /Widget(s) attached to /Sig fields, but we didn't
        # locate them on pages (e.g., /Annots not visible). Emit NON-pseudo
        # signatures from the field names so mixed cases don't get pseudo.
        whole_text = NormalizeText("\n".join(page_texts))
        for fname, pg, _hk in acro_sig_list:
            # Fall back to the whole-document text if the page is unknown,
            # out of range, or has no text.
            base_text = text_for_page(pg) or whole_text
            role, evidence, scores, total = self._InferRole(fname, base_text)
            signatures.append(
                Signature(
                    Page=pg,
                    FieldName=fname,
                    Role=role,
                    Score=total,
                    Scores=scores,
                    Evidence=evidence,
                    Hint=f"AcroSig:{fname}" if fname else "AcroSig",
                )
            )

    elif self.Configuration.PseudoSignatures and (has_acro or has_vendor):
        # --- vendor/acro pseudo path (only when NO page widgets or acro-kids)
        # IMPORTANT: use only page text (not raw decompressed stream dump).
        text_norm = NormalizeText("\n".join(page_texts))
        pseudo_scores: dict[str, int] = defaultdict(int)
        pseudo_evidence: list[str] = []

        # Hard rules
        rel = self.DocumentHardRules.get("rel_label")
        kin = self.DocumentHardRules.get("kin")
        minor = self.DocumentHardRules.get("minor")
        firstp = self.DocumentHardRules.get("first_person")

        rel_hit = bool(rel and rel.search(text_norm))
        kin_hit = bool(kin and kin.search(text_norm))
        # Searched once; reused by the scoring rule and the tie-break below.
        firstp_hit = bool(firstp and firstp.search(text_norm))

        if rel_hit and kin_hit:
            pseudo_scores["representative"] += 100
            pseudo_evidence.append("rule:relationship+kin")
        if minor and minor.search(text_norm):
            pseudo_scores["representative"] += 50
            pseudo_evidence.append("rule:minor/unable_to_sign")
        # `.get` (not indexing) so the defaultdict does not grow a zero entry.
        if pseudo_scores.get("representative", 0) == 0 and firstp_hit:
            pseudo_scores["patient"] += 30
            pseudo_evidence.append("rule:first_person_authorize")

        # Labels across doc
        for r in RolesFromLabels(text_norm, self.RoleLabelPatterns):
            pseudo_scores[r] += self.WeightConfiguration["page_label"]
            pseudo_evidence.append(f"page_label:{r}")

        # General — ignore weak attorney in pseudo
        for r in RolesFromGeneral(text_norm, self.GeneralRolePatterns):
            if r == "attorney":
                continue
            pseudo_scores[r] += self.WeightConfiguration["general"]
            pseudo_evidence.append(f"general:{r}")

        # Boost from acro field names, if any
        for fname, _pg, _hk in acro_sig_list:
            for r in self._RolesFromField(fname):
                pseudo_scores[r] += self.WeightConfiguration["field"]
                pseudo_evidence.append(f"field:{r}")

        role = ChooseRole(pseudo_scores)
        if role == "unknown":
            # Tie-break with the hard rules when scoring is inconclusive.
            if rel_hit and kin_hit:
                role = "representative"
                pseudo_evidence.append("tie:relationship+kin")
            elif firstp_hit:
                role = "patient"
                pseudo_evidence.append("tie:first_person")

        signatures.append(
            Signature(
                Page=None,
                FieldName="vendor_or_acro_detected",
                Role=role,
                Score=sum(pseudo_scores.values()),
                Scores=dict(pseudo_scores),
                Evidence=[*pseudo_evidence, "pseudo:true"],
                Hint="VendorOrAcroOnly",
            )
        )

    # doc-level hints
    acro_names = {name for name, _pg, _hk in acro_sig_list if name}
    esign_found = bool(signatures or vendor_hints or acro_names)
    mixed = esign_found and scanned_pdf

    doc_roles: set[str] = {s.Role for s in signatures if s.Role != "unknown"}

    hints: set[str] = {f"AcroSig:{n}" for n in acro_names}
    hints.update(vendor_hints)
    hints.update(s.Hint for s in signatures)

    # Signatures without a page (None) are left out of the page list.
    signature_pages = sorted({s.Page for s in signatures if s.Page})

    return FileResult(
        File=pdf_path.name,
        SizeKilobytes=size_kb,
        PageCount=pages,
        ElectronicSignatureFound=esign_found,
        ScannedPdf=scanned_pdf,
        MixedContent=mixed,
        SignatureCount=len(signatures),
        SignaturePages=",".join(map(str, signature_pages)),
        Roles=";".join(sorted(doc_roles)) if doc_roles else "unknown",
        Hints=";".join(sorted(hints)),
        Signatures=signatures,
    )