sigdetect 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sigdetect/__init__.py +24 -0
- sigdetect/api.py +139 -0
- sigdetect/cli.py +98 -0
- sigdetect/config.py +117 -0
- sigdetect/data/role_rules.retainer.yml +61 -0
- sigdetect/data/role_rules.yml +71 -0
- sigdetect/data/vendor_patterns.yml +16 -0
- sigdetect/detector/__init__.py +55 -0
- sigdetect/detector/base.py +9 -0
- sigdetect/detector/base_detector.py +22 -0
- sigdetect/detector/file_result_model.py +59 -0
- sigdetect/detector/pymupdf_engine.py +0 -0
- sigdetect/detector/pypdf2_engine.py +1114 -0
- sigdetect/detector/signature_model.py +34 -0
- sigdetect/eda.py +137 -0
- sigdetect/logging_setup.py +218 -0
- sigdetect/utils.py +152 -0
- sigdetect-0.1.0.dist-info/METADATA +394 -0
- sigdetect-0.1.0.dist-info/RECORD +22 -0
- sigdetect-0.1.0.dist-info/WHEEL +5 -0
- sigdetect-0.1.0.dist-info/entry_points.txt +2 -0
- sigdetect-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1114 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import io
|
|
4
|
+
import re
|
|
5
|
+
import warnings
|
|
6
|
+
import zlib
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
from collections.abc import Iterator
|
|
9
|
+
from contextlib import contextmanager, redirect_stderr, redirect_stdout, suppress
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from pypdf import PdfReader, generic
|
|
13
|
+
from pypdf.errors import PdfReadWarning
|
|
14
|
+
|
|
15
|
+
from ..config import DetectConfiguration
|
|
16
|
+
from ..utils import (
|
|
17
|
+
AsDictionary,
|
|
18
|
+
ChooseRole,
|
|
19
|
+
GetFieldNameFromAncestry,
|
|
20
|
+
HasSignatureFieldInAncestry,
|
|
21
|
+
HasSignatureValue,
|
|
22
|
+
LoadPatterns,
|
|
23
|
+
NormalizeText,
|
|
24
|
+
RolesFromGeneral,
|
|
25
|
+
RolesFromLabels,
|
|
26
|
+
)
|
|
27
|
+
from .base import Detector, FileResult, Signature
|
|
28
|
+
|
|
29
|
+
# ────────────────────────── silence noisy pdf warnings ──────────────────────────
|
|
30
|
+
# Ignore PdfReadWarning messages about duplicate "/Subtype" keys in a dictionary.
warnings.filterwarnings(
    action="ignore",
    category=PdfReadWarning,
    message=r"Multiple definitions in dictionary.*key /Subtype",
)
|
|
35
|
+
|
|
36
|
+
# ---------------- fallbacks (used only if YAML omits them) ----------------
|
|
37
|
+
# Byte-level vendor markers (regex sources; compiled against raw stream bytes).
DEFAULT_VENDOR_BYTES = [
    r"/DocuSign", r"/Adobe\.PPKLite", r"/DocTimeStamp", r"/DSS",
    r"/AcrobatSign", r"/HelloSign", r"/Vinesign", r"/PandaDoc",
]

# Text-level vendor markers (regex sources; compiled against decoded text).
DEFAULT_VENDOR_TEXT = [
    r"DocuSign\s+Envelope\s+ID",
    r"Signature\s+Certificate",
    r"Electronic\s+Record\s+and\s+Signature\s+Disclosure",
    r"Adobe\s+Acrobat\s+Sign|Acrobat\s+Sign",
    r"HelloSign|Dropbox\s+Sign",
    r"Vinesign",
    r"Signed\s+with\s+PandaDoc",
    r"Reference\s+number",
    r"Digitally\s+signed\s+by",
]

# Role -> substrings looked for in a lower-cased, alphanumeric-only field name.
DEFAULT_FIELDNAME_HINTS: dict[str, tuple[str, ...]] = {
    "patient": ("patient", "plaintiff", "self", "claimant"),
    "attorney": ("attorney", "lawyer", "counsel"),
    "representative": (
        "representative", "rep", "guardian", "parent", "executor",
        "custodian", "conservator", "poa", "powerofattorney",
    ),
    # retainer additions
    "client": ("client", "clientname", "clientsignature", "consumer"),
    "firm": ("firm", "lawfirm", "company", "corp", "authorizedsignatory"),
}
|
|
77
|
+
|
|
78
|
+
# Add robust “Parent/Guardian” label variant into representative labels.
|
|
79
|
+
REP_EXTRA = r"(signature\s+of\s+(the\s+)?(parent|guardian|parent\s*/\s*guardian))"

# Light retainer page cues (extra to YAML; safe defaults)
RETAIN_CLIENT_LABELS = [
    r"\bclient\s+signature\b",
    r"\bname\s+of\s+client\b",
    r"\bclient\b.*\bsignature\b",
    r"\bprint(?:ed)?\s+name\b.*\bclient\b",
]
RETAIN_FIRM_LABELS = [
    # No trailing \b: a word boundary after ":" only exists when a word
    # character follows immediately, so "By: Jane" would never match.
    r"\bby:",
    r"\bfor\s+the\s+firm\b",
    r"\battorney'?s?\s+signature\b",
    r"\bcounsel\s+signature\b",
    r"\besq\.?\b",
]
RETAIN_FIRM_MARKERS = [
    r"\bLLP\b",
    r"\bLLC\b",
    r"\bP\.?C\.?\b",
    r"\bP\.?A\.?\b",
    r"\bAttorneys?\s+at\s+Law\b",
    r"\bLaw\b",
]
|
|
103
|
+
|
|
104
|
+
# Probes applied to decoded appearance-stream text:
#   "/Name Do"  -> XObject invocation (the name is captured)
#   "Tj" / "TJ" -> text-showing operators
#   m, l, c, re -> path-construction operators (matched case-insensitively)
AP_DO_PATTERN = re.compile(r"/(?P<name>[^\s]+)\s+Do\b")
AP_TEXT_PATTERN = re.compile(r"\b(TJ|Tj)\b")
AP_VECTOR_PATTERN = re.compile(r"\b(m|l|c|re)\b", flags=re.IGNORECASE)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@contextmanager
def _QuietIo():
    """Swallow anything written to stdout/stderr while the block runs."""
    discard = io.StringIO()
    with redirect_stdout(discard):
        with redirect_stderr(discard):
            yield
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class PyPDF2Detector(Detector):
|
|
118
|
+
Name = "pypdf2"
|
|
119
|
+
|
|
120
|
+
def __init__(self, configuration: DetectConfiguration):
    """Compile every pattern the detector needs for the configured profile.

    The profile is read from ``configuration.Profile`` (or ``profile``) and
    defaults to ``"hipaa"``. Patterns come from ``LoadPatterns(profile)``;
    module-level defaults fill in anything the loaded mapping omits.
    """
    self.Configuration = configuration
    self.Profile = (
        getattr(configuration, "Profile", getattr(configuration, "profile", "hipaa")) or "hipaa"
    )
    pats = LoadPatterns(self.Profile)

    # Vendor patterns (fallback to defaults if missing)
    vb = pats.get("bytes") or DEFAULT_VENDOR_BYTES
    vt = pats.get("text") or DEFAULT_VENDOR_TEXT
    self.VendorBytePatterns = [re.compile(p.encode(), re.I) for p in vb]
    self.VendorTextPatterns = [re.compile(p, re.I) for p in vt]

    # Allow callers to disable expensive XObject recursion if desired
    self.RecurseXObjects = bool(
        getattr(
            configuration, "RecurseXObjects", getattr(configuration, "recurse_xobjects", True)
        )
    )

    # Role patterns (labels + general)
    labels = dict(pats.get("labels", {}))
    # Ensure the representative label also covers Parent/Guardian phrasing
    if "representative" in labels:
        labels["representative"] = f"(?:{labels['representative']}|{REP_EXTRA})"
    else:
        labels["representative"] = REP_EXTRA

    # Retainer: ensure 'client' and 'firm' buckets exist
    if self.Profile == "retainer":
        if "client" not in labels:
            labels["client"] = "|".join(RETAIN_CLIENT_LABELS)
        if "firm" not in labels:
            labels["firm"] = "|".join(RETAIN_FIRM_LABELS)

    self.RoleLabelPatterns = {k: re.compile(v, re.I) for k, v in labels.items()}
    self.GeneralRolePatterns = {
        k: re.compile(v, re.I) for k, v in pats.get("general", {}).items()
    }

    # Field hints (accept either key name)
    raw_field_hints = (
        pats.get("field_hints") or pats.get("fieldname_hints") or DEFAULT_FIELDNAME_HINTS
    )
    self.FieldHints: dict[str, tuple[str, ...]] = {
        k: tuple(v) for k, v in raw_field_hints.items()
    }

    # Doc hard rules + weights
    self.DocumentHardRules = {
        k: re.compile(v, re.I) for k, v in pats.get("doc_hard", {}).items()
    }
    # Merge loaded weights over the defaults so a partial (or null) "weights"
    # entry cannot cause a KeyError when a score is looked up later.
    default_weights = {
        "field": 3,
        "page_label": 2,
        "general": 1,
        "doc_hint_strong": 3,
        "doc_hint_weak": 2,
    }
    self.WeightConfiguration = {**default_weights, **(pats.get("weights") or {})}

    # Retainer page cues. Compiled for every profile: Detect() routes any
    # profile other than "hipaa" through the retainer scoring path, which
    # reads all of these attributes.
    self._ClientPagePatterns = [re.compile(p, re.I) for p in RETAIN_CLIENT_LABELS]
    self._FirmPagePatterns = [re.compile(p, re.I) for p in RETAIN_FIRM_LABELS]
    self._FirmMarkerPatterns = [re.compile(p, re.I) for p in RETAIN_FIRM_MARKERS]
    self._SignatureWord = re.compile(r"\bsignature\b", re.I)
    self._DateWord = re.compile(r"\bdate\b", re.I)
    # No trailing \b: after ":" it would demand an immediately following word
    # character, so the usual "By: <name>" layout would never match.
    self._ByWord = re.compile(r"\bby:", re.I)

    # Heuristic to drop false widgets like DocuSign envelope ID
    self._EnvelopeNoise = re.compile(r"envelope[_\s-]*id|envelopeid|certificate|docid", re.I)
|
|
194
|
+
|
|
195
|
+
# ---------------- vendor scanning helpers ----------------
|
|
196
|
+
def _ScanRaw(self, raw: bytes) -> set[str]:
    """Return vendor markers found in ``raw`` (bytes that need no further decoding).

    Byte patterns are reported as ``VendorBytes:<pattern>`` and text patterns,
    matched against a latin-1 decoding of ``raw``, as ``VendorText:<pattern>``.
    """
    found: set[str] = set()
    if not raw:
        return found

    for byte_rx in self.VendorBytePatterns:
        if byte_rx.search(raw) is None:
            continue
        try:
            label = byte_rx.pattern.decode("ascii", "ignore")
        except Exception:
            label = str(byte_rx.pattern)
        found.add(f"VendorBytes:{label}")

    # also search text markers inside bytes
    as_text = raw.decode("latin1", "ignore")
    found.update(
        f"VendorText:{text_rx.pattern}"
        for text_rx in self.VendorTextPatterns
        if text_rx.search(as_text)
    )
    return found
|
|
214
|
+
|
|
215
|
+
def _ScanPageVendors(self, page) -> set[str]:
    """Return vendor markers seen in a page's content streams and extracted text."""
    with _QuietIo():
        contents = page.get_contents()
        if contents is None:
            blobs: list[bytes] = []
        elif isinstance(contents, list):
            blobs = [part.get_data() for part in contents if hasattr(part, "get_data")]
        elif hasattr(contents, "get_data"):
            blobs = [contents.get_data()]
        else:
            blobs = []

    markers: set[str] = set()
    for blob in blobs:
        markers.update(self._ScanRaw(blob))

    with _QuietIo():
        extracted = page.extract_text() or ""
    markers.update(
        f"VendorText:{rx.pattern}" for rx in self.VendorTextPatterns if rx.search(extracted)
    )
    return markers
|
|
238
|
+
|
|
239
|
+
def _IterateFormXObjects(self, page) -> Iterator[generic.DictionaryObject]:
    """Yield Form XObject dictionaries reachable from the page's resources.

    Nested ``/Resources//XObject`` entries are followed recursively. A page
    without ``/Resources`` or ``/XObject`` yields nothing.
    """
    try:
        top_level = page["/Resources"]["/XObject"]
    except KeyError:
        return

    seen = set()

    def descend(candidate):
        entry = AsDictionary(candidate)
        if not isinstance(entry, generic.DictionaryObject):
            return
        subtype = entry.get("/Subtype")
        marker = (id(entry), subtype)
        if marker in seen:
            # already handled; also stops self-referencing resource loops
            return
        seen.add(marker)
        if subtype == "/Form":
            yield entry
        with suppress(KeyError):
            children = entry["/Resources"]["/XObject"]
            for child in children.values():
                yield from descend(child)

    with suppress(KeyError):
        for candidate in top_level.values():
            yield from descend(candidate)
|
|
262
|
+
|
|
263
|
+
def _CollectXObjectVendorAndText(self, page) -> tuple[set[str], str]:
    """Scan every Form XObject on the page, whether or not it is painted.

    Returns the vendor markers found and the latin-1 decoded stream data of
    all readable XObjects joined with single spaces.
    """
    markers: set[str] = set()
    decoded: list[str] = []
    for xobject in self._IterateFormXObjects(page):
        if not hasattr(xobject, "get_data"):
            continue
        # Best effort: an unreadable stream is skipped without stopping the scan.
        with suppress(Exception), _QuietIo():
            payload = xobject.get_data()
            if payload:
                markers |= self._ScanRaw(payload)
                decoded.append(payload.decode("latin1", "ignore"))
    return markers, " ".join(decoded)
|
|
275
|
+
|
|
276
|
+
# ---------------- appearance classification helpers ----------------
|
|
277
|
+
def _ExtractAppearanceStreams(self, candidate: object) -> list[object]:
    """Collect the stream objects reachable from an ``/AP`` entry.

    Anything exposing ``get_data`` counts as a stream. Dictionaries and arrays
    are searched depth-first, in their own iteration order.
    """
    collected: list[object] = []

    def gather(node: object | None) -> None:
        if node is None:
            return
        resolved = AsDictionary(node)
        if isinstance(resolved, generic.IndirectObject):
            with suppress(Exception):
                resolved = resolved.get_object()
            resolved = AsDictionary(resolved)
        if resolved is None:
            return
        if hasattr(resolved, "get_data"):
            collected.append(resolved)
            return
        if isinstance(resolved, generic.DictionaryObject):
            children = resolved.values()
        elif isinstance(resolved, generic.ArrayObject):
            children = resolved
        else:
            return
        for child in children:
            gather(child)

    gather(candidate)
    return collected
|
|
304
|
+
|
|
305
|
+
def _ResolveResources(self, stream, page) -> generic.DictionaryObject | None:
    """Return the stream's own ``/Resources`` dictionary, else the page's, else ``None``."""

    def _nothing(*_):
        return None

    lookup = getattr(stream, "get", _nothing)
    found = AsDictionary(lookup("/Resources"))  # type: ignore[arg-type]
    if isinstance(found, generic.IndirectObject):
        with suppress(Exception):
            found = found.get_object()
        found = AsDictionary(found)

    if not isinstance(found, generic.DictionaryObject) and page is not None:
        # The stream carries no usable resources; fall back to the page.
        inherited = AsDictionary(page.get("/Resources")) if page else None
        if isinstance(inherited, generic.IndirectObject):
            with suppress(Exception):
                inherited = inherited.get_object()
        if isinstance(inherited, generic.DictionaryObject):
            found = inherited

    if isinstance(found, generic.DictionaryObject):
        return found
    return None
|
|
321
|
+
|
|
322
|
+
def _DoTargetsImage(self, name: str, resources: generic.DictionaryObject | None) -> bool:
    """Tell whether the XObject called ``name`` is an image.

    The resource dictionary is consulted first. When it does not confirm an
    ``/Image`` subtype, the name itself decides: names starting with "im"
    (any case) are treated as images.
    """
    wanted = name.lstrip("/")
    if resources is not None:
        table = AsDictionary(resources.get("/XObject"))
        if isinstance(table, generic.IndirectObject):
            with suppress(Exception):
                table = table.get_object()
        if isinstance(table, generic.DictionaryObject):
            for raw_key, entry in table.items():
                label = str(raw_key)
                if label.startswith("/"):
                    label = label[1:]
                if label != wanted:
                    continue
                target = AsDictionary(entry)
                if isinstance(target, generic.IndirectObject):
                    with suppress(Exception):
                        target = target.get_object()
                    target = AsDictionary(target)
                if (
                    isinstance(target, generic.DictionaryObject)
                    and target.get("/Subtype") == "/Image"
                ):
                    return True
    # Fallback heuristic: appearance streams typically prefix image XObjects with "Im".
    return wanted.lower().startswith("im")
|
|
347
|
+
|
|
348
|
+
def _ClassifyAppearance(self, widget: generic.DictionaryObject, page) -> str:
    """Label the widget's normal appearance as drawn, typed, hybrid or unknown.

    "drawn" means an image XObject is painted, "typed" means text or path
    operators are present, "hybrid" means both; "unknown" covers a missing or
    empty ``/AP /N`` entry.
    """
    appearance = AsDictionary(widget.get("/AP"))
    if not isinstance(appearance, generic.DictionaryObject):
        return "unknown"
    streams = self._ExtractAppearanceStreams(appearance.get("/N"))
    if not streams:
        return "unknown"

    saw_text = saw_vector = saw_image = False
    for stream in streams:
        try:
            payload = stream.get_data()  # type: ignore[attr-defined]
        except Exception:
            continue
        if not payload:
            continue

        ops = payload.decode("latin1", "ignore")
        saw_text = saw_text or bool(AP_TEXT_PATTERN.search(ops))
        saw_vector = saw_vector or bool(AP_VECTOR_PATTERN.search(ops))

        invoked = {hit.group("name").lstrip("/") for hit in AP_DO_PATTERN.finditer(ops)}
        if invoked:
            resources = self._ResolveResources(stream, page)
            if any(self._DoTargetsImage(xname, resources) for xname in invoked):
                saw_image = True

    if saw_image:
        return "hybrid" if (saw_text or saw_vector) else "drawn"
    if saw_text or saw_vector:
        return "typed"
    return "unknown"
|
|
392
|
+
|
|
393
|
+
# ---- file-wide stream scan (compressed or not)
|
|
394
|
+
def _ScanFileStreamsForVendors(self, file_bytes: bytes) -> tuple[set[str], str]:
|
|
395
|
+
"""
|
|
396
|
+
Find all 'stream ... endstream' blocks, test raw and decompressed (zlib/gzip),
|
|
397
|
+
and return (vendor_hits, decoded_text_blob).
|
|
398
|
+
"""
|
|
399
|
+
hits: set[str] = set()
|
|
400
|
+
texts: list[str] = []
|
|
401
|
+
if not file_bytes:
|
|
402
|
+
return hits, ""
|
|
403
|
+
|
|
404
|
+
# quick pass on the whole file
|
|
405
|
+
hits |= self._ScanRaw(file_bytes)
|
|
406
|
+
|
|
407
|
+
for m in re.finditer(rb"stream\s*[\r\n]+(.*?)\s*endstream", file_bytes, re.DOTALL):
|
|
408
|
+
chunk = m.group(1)
|
|
409
|
+
if not chunk:
|
|
410
|
+
continue
|
|
411
|
+
|
|
412
|
+
# raw scan + raw text
|
|
413
|
+
hits |= self._ScanRaw(chunk)
|
|
414
|
+
texts.append(chunk.decode("latin1", "ignore"))
|
|
415
|
+
|
|
416
|
+
# try decompress with multiple wbits
|
|
417
|
+
for wbits in (15, -15, 31):
|
|
418
|
+
try:
|
|
419
|
+
dec = zlib.decompress(chunk, wbits)
|
|
420
|
+
if dec:
|
|
421
|
+
hits |= self._ScanRaw(dec)
|
|
422
|
+
texts.append(dec.decode("latin1", "ignore"))
|
|
423
|
+
break
|
|
424
|
+
except Exception:
|
|
425
|
+
continue
|
|
426
|
+
return hits, " ".join(texts)
|
|
427
|
+
|
|
428
|
+
# ---------------- helpers for widgets ----------------
|
|
429
|
+
def _FieldNameForWidget(self, wdict: generic.DictionaryObject) -> str:
    """Return the widget's field name, or "" when none can be found.

    Lookup order: the widget itself, its ``/Parent``, then
    ``GetFieldNameFromAncestry``.
    """
    own = self._PickNameAny(wdict)
    if own:
        return own

    parent = AsDictionary(wdict.get("/Parent"))
    if isinstance(parent, generic.DictionaryObject):
        inherited = self._PickNameAny(parent)
        if inherited:
            return inherited

    ancestral = GetFieldNameFromAncestry(wdict)
    return str(ancestral) if ancestral is not None else ""
|
|
440
|
+
|
|
441
|
+
@staticmethod
|
|
442
|
+
def _PickNameAny(d: generic.DictionaryObject) -> str | None:
|
|
443
|
+
for key in ("/T", "/TU", "/TM"):
|
|
444
|
+
v = d.get(key)
|
|
445
|
+
if v:
|
|
446
|
+
try:
|
|
447
|
+
return str(v)
|
|
448
|
+
except Exception:
|
|
449
|
+
return None
|
|
450
|
+
return None
|
|
451
|
+
|
|
452
|
+
def _IsSignatureWidget(self, wdict: generic.DictionaryObject) -> bool:
    """Return True only for widgets that belong to a signature field.

    A widget qualifies when its ``/FT`` is ``/Sig``, when
    ``HasSignatureFieldInAncestry`` says so, or when its ``/V`` value is a
    dictionary of ``/Type /Sig``. Any error while inspecting it counts as
    "not a signature".
    """
    try:
        if wdict.get("/FT") == "/Sig" or HasSignatureFieldInAncestry(wdict):
            return True

        # value object might be an indirect sig dict
        value = wdict.get("/V")
        if isinstance(value, generic.IndirectObject):
            value = value.get_object()
        value_dict = AsDictionary(value)
        if isinstance(value_dict, generic.DictionaryObject) and value_dict.get("/Type") == "/Sig":
            return True

        # Heuristic: drop known non-signature fields (DocuSign envelope, cert refs, etc.)
        # NOTE(review): this check runs after both positive tests and the
        # function returns False afterwards either way, so it cannot change
        # the result as written.
        label = (self._FieldNameForWidget(wdict) or "").strip()
        if label and self._EnvelopeNoise.search(label):
            return False
    except Exception:
        pass
    return False
|
|
471
|
+
|
|
472
|
+
def _iter_widgets_with_ref(
    self, annots_obj
) -> Iterator[tuple[generic.DictionaryObject, generic.IndirectObject | None]]:
    """Yield ``(widget_dict, indirect_ref_or_None)`` for each widget annotation.

    ``annots_obj`` may be an array, a widget dictionary, or an indirect
    reference to either; arrays are expanded, nested ones included. The
    reference is ``None`` when the widget was stored inline.
    """
    if annots_obj is None:
        return
    pending = [annots_obj]
    while pending:
        item = pending.pop()
        ref = item if isinstance(item, generic.IndirectObject) else None
        target = item.get_object() if ref is not None else item
        if isinstance(target, generic.ArrayObject):
            pending.extend(target)
        elif isinstance(target, generic.DictionaryObject) and target.get("/Subtype") == "/Widget":
            yield target, ref
|
|
492
|
+
|
|
493
|
+
def _CollectAcroSignatures(self, reader: PdfReader) -> list[tuple[str, int | None, bool]]:
    """List signature fields declared in the document's ``/AcroForm``.

    Each entry is ``(field_name, page_number_or_None, has_kids_widget)``.
    The page number is 1-based and comes from a kid's ``/P`` reference;
    ``has_kids_widget`` is True when the field has a non-empty ``/Kids``
    array. Any failure while reading the form ends the walk and returns
    whatever was collected so far.
    """
    collected: list[tuple[str, int | None, bool]] = []

    def page_number_for(page_ref) -> int | None:
        # Linear search of the page list for the referenced page object.
        try:
            for number, pg in enumerate(reader.pages, start=1):
                if pg.indirect_reference == page_ref:
                    return number
        except Exception:
            pass
        return None

    def descend(node) -> None:
        field = AsDictionary(node)
        if not isinstance(field, generic.DictionaryObject):
            return
        is_signature = (
            field.get("/FT") == "/Sig"
            or HasSignatureFieldInAncestry(field)
            or HasSignatureValue(field)
        )
        children = AsDictionary(field.get("/Kids"))
        has_children = isinstance(children, generic.ArrayObject)

        if is_signature:
            label = (
                self._PickNameAny(field) or (GetFieldNameFromAncestry(field) or "") or "AcroSig"
            )
            page_idx: int | None = None
            if has_children:
                for kid in children:
                    kid_dict = AsDictionary(kid)
                    if not isinstance(kid_dict, generic.DictionaryObject):
                        continue
                    with suppress(KeyError, AttributeError):
                        page_ref = kid_dict.get("/P")
                        if isinstance(page_ref, generic.IndirectObject):
                            located = page_number_for(page_ref)
                            if located is not None:
                                page_idx = located
            collected.append((str(label), page_idx, has_children and len(children) > 0))

        # Child fields are visited whether or not this node is a signature.
        if has_children:
            for kid in children:
                descend(kid)

    with suppress(Exception):
        root = reader.trailer["/Root"]
        acro = root.get("/AcroForm")
        fields = AsDictionary(acro).get("/Fields") if acro else None
        if isinstance(fields, generic.ArrayObject):
            for entry in fields:
                descend(entry)

    return collected
|
|
548
|
+
|
|
549
|
+
# ---------------- role scoring (HIPAA) ----------------
|
|
550
|
+
def _RolesFromField(self, field_name: str) -> set[str]:
    """Return the roles whose hint substrings occur in the field name.

    The name is lower-cased and stripped of everything except ``a-z0-9``
    before matching.
    """
    squashed = re.sub(r"[^a-z0-9]+", "", (field_name or "").lower())
    if not squashed:
        return set()
    return {
        role
        for role, needles in self.FieldHints.items()
        if any(needle in squashed for needle in needles)
    }
|
|
559
|
+
|
|
560
|
+
def _InferRole(self, field_name: str, page_text: str):
    """Score candidate roles from the field name and the page text.

    Returns ``(role, evidence, scores, total)``: the role picked by
    ``ChooseRole``, the ``"<source>:<role>"`` evidence strings in the order
    they were found, the per-role scores, and their sum.
    """
    tally: dict[str, int] = defaultdict(int)
    trail: list[str] = []

    # Each source is evaluated lazily, in this order; the tag doubles as the
    # weight key and as the evidence prefix.
    sources = (
        ("field", lambda: self._RolesFromField(field_name)),
        ("page_label", lambda: RolesFromLabels(page_text, self.RoleLabelPatterns)),
        ("general", lambda: RolesFromGeneral(page_text, self.GeneralRolePatterns)),
    )
    for tag, produce in sources:
        for candidate in produce():
            tally[candidate] += self.WeightConfiguration[tag]
            trail.append(f"{tag}:{candidate}")

    return ChooseRole(tally), trail, dict(tally), sum(tally.values())
|
|
578
|
+
|
|
579
|
+
# ---------------- retainer utilities (pseudo, vendor-only) ----------------
|
|
580
|
+
def _RetainerPageScores(
    self,
    text: str,
    vendor_count: int,
    page_index0: int,
    total_pages: int,
) -> tuple[int, int, list[str]]:
    """Return (client_score, firm_score, evidence[]) for a single page.

    ``text`` is passed through ``NormalizeText`` before any pattern is applied.
    ``vendor_count`` is only tested for being positive. ``page_index0`` is the
    0-based page index and ``total_pages`` the document's page count; together
    they drive the front-page and end-of-document adjustments below.

    NOTE(review): indentation was reconstructed from a flattened listing;
    confirm the nesting of the ``firm_label_hit`` and dampener statements.
    """
    t = NormalizeText(text)
    ev: list[str] = []
    cs = fs = 0

    # explicit labels
    # Every matching client pattern adds weight and one evidence entry.
    for rx in self._ClientPagePatterns:
        if rx.search(t):
            cs += self.WeightConfiguration["page_label"]
            ev.append("label:client")

    # Every matching firm pattern adds weight, but evidence is recorded once.
    firm_label_hit = False
    for rx in self._FirmPagePatterns:
        if rx.search(t):
            fs += self.WeightConfiguration["page_label"]
            firm_label_hit = True
    if firm_label_hit:
        ev.append("label:firm")

    # firm markers (LLP/LLC/etc.) boost — BUT ignore on page 1 unless a real signature cue exists
    marker_boosted = False
    for rx in self._FirmMarkerPatterns:
        if rx.search(t):
            if page_index0 > 0 or self._SignatureWord.search(t) or self._ByWord.search(t):
                fs += self.WeightConfiguration["general"]  # light boost
                ev.append("marker:firm")
                marker_boosted = True
    # If only marker on page 1 with no cue, we do not add any boost.

    # signature & date co-occurrence (stronger confidence)
    sig_hit = bool(self._SignatureWord.search(t))
    date_hit = bool(self._DateWord.search(t))
    if sig_hit and re.search(r"\bclient\b", t, re.I):
        cs += 1
        ev.append("word:signature+client")
    if sig_hit and (
        self._ByWord.search(t) or re.search(r"\b(attorney|counsel|firm)\b", t, re.I)
    ):
        fs += 1
        ev.append("word:signature+firm")
    if sig_hit and date_hit:
        # common signature block layout has both
        cs += 1
        fs += 1
        ev.append("word:signature+date")

    # vendor hits seen on this page (from content/xobject) – weak but helpful
    if vendor_count > 0:
        cs += 1
        fs += 1
        ev.append("vendor:page_hit")

    # position prior: end of the doc tends to host signature blocks
    # Applies from 0-based index (2 * total_pages) // 3 - 1 onward, for documents of 3+ pages.
    if total_pages >= 3 and page_index0 >= (2 * total_pages) // 3 - 1:
        cs += 1
        fs += 1
        ev.append("prior:end_of_doc")

    # general role regex (if YAML provided)
    # Both "firm" and "attorney" hits count toward the firm score.
    for r in RolesFromGeneral(t, self.GeneralRolePatterns):
        if r == "client":
            cs += self.WeightConfiguration["general"]
            ev.append("general:client")
        if r in {"firm", "attorney"}:
            fs += self.WeightConfiguration["general"]
            ev.append("general:firm")

    # FINAL dampener for page 1:
    # If page 1 had only weak firm markers (LLP/LLC) and no signature cues, wipe that boost.
    # NOTE(review): on page index 0 the marker boost above is only granted when a
    # signature/"by:" cue matched, which this condition excludes — the branch looks
    # unreachable as written; confirm before relying on it.
    if page_index0 == 0 and not sig_hit and not self._ByWord.search(t):
        if marker_boosted:
            fs = max(0, fs - self.WeightConfiguration["general"])
            ev.append("dampen:front_matter")

    return cs, fs, ev
|
|
662
|
+
|
|
663
|
+
# ---------------- main ----------------
|
|
664
|
+
def Detect(self, pdf_path: Path) -> FileResult:
|
|
665
|
+
try:
|
|
666
|
+
with _QuietIo():
|
|
667
|
+
reader = PdfReader(str(pdf_path))
|
|
668
|
+
size_kb = round(pdf_path.stat().st_size / 1024, 1)
|
|
669
|
+
pages = len(reader.pages)
|
|
670
|
+
|
|
671
|
+
# file-wide vendor scan (+decompressed streams)
|
|
672
|
+
try:
|
|
673
|
+
_file_bytes = pdf_path.read_bytes()
|
|
674
|
+
except Exception:
|
|
675
|
+
_file_bytes = b""
|
|
676
|
+
file_vendor_hits, _stream_text_blob = self._ScanFileStreamsForVendors(_file_bytes)
|
|
677
|
+
|
|
678
|
+
acro_sig_list = self._CollectAcroSignatures(reader)
|
|
679
|
+
|
|
680
|
+
page_texts: list[str] = []
|
|
681
|
+
vendor_hints: set[str] = set()
|
|
682
|
+
vendor_hits_per_page: list[int] = []
|
|
683
|
+
images_per_page: list[int] = []
|
|
684
|
+
any_text, img_pages = False, 0
|
|
685
|
+
|
|
686
|
+
for page in reader.pages:
|
|
687
|
+
# per-page vendor
|
|
688
|
+
pv = self._ScanPageVendors(page)
|
|
689
|
+
x_hits: set[str] = set()
|
|
690
|
+
x_text = ""
|
|
691
|
+
if self.RecurseXObjects:
|
|
692
|
+
x_hits, x_text = self._CollectXObjectVendorAndText(page)
|
|
693
|
+
vendor_hints |= pv | x_hits
|
|
694
|
+
vendor_hits_per_page.append(len(pv) + len(x_hits))
|
|
695
|
+
|
|
696
|
+
with _QuietIo():
|
|
697
|
+
txt = page.extract_text() or ""
|
|
698
|
+
if x_text:
|
|
699
|
+
txt = f"{txt} {x_text}".strip() if txt else x_text.strip()
|
|
700
|
+
page_texts.append(txt)
|
|
701
|
+
any_text = any_text or bool(txt)
|
|
702
|
+
|
|
703
|
+
# image counting
|
|
704
|
+
img_count = 0
|
|
705
|
+
with suppress(KeyError):
|
|
706
|
+
xobjs = page["/Resources"]["/XObject"]
|
|
707
|
+
img_count = sum(
|
|
708
|
+
1 for obj in xobjs.values() if AsDictionary(obj).get("/Subtype") == "/Image"
|
|
709
|
+
)
|
|
710
|
+
images_per_page.append(img_count)
|
|
711
|
+
img_pages += 1 if img_count > 0 else 0
|
|
712
|
+
|
|
713
|
+
# Merge file-level vendor hits (catches unpainted & compressed streams)
|
|
714
|
+
vendor_hints |= file_vendor_hits
|
|
715
|
+
|
|
716
|
+
scanned_pdf = (not any_text) and (img_pages > 0)
|
|
717
|
+
|
|
718
|
+
# --- find signature widgets on pages (strict)
|
|
719
|
+
page_widgets: list[
|
|
720
|
+
tuple[int, generic.DictionaryObject, generic.IndirectObject | None]
|
|
721
|
+
] = []
|
|
722
|
+
for idx, page in enumerate(reader.pages, start=1):
|
|
723
|
+
for wdict, ref in self._iter_widgets_with_ref(page.get("/Annots")):
|
|
724
|
+
if self._IsSignatureWidget(wdict):
|
|
725
|
+
page_widgets.append((idx, wdict, ref))
|
|
726
|
+
# else: ignore envelope/cert widgets
|
|
727
|
+
|
|
728
|
+
has_page_widgets = len(page_widgets) > 0
|
|
729
|
+
has_acro = len(acro_sig_list) > 0
|
|
730
|
+
has_vendor = len(vendor_hints) > 0
|
|
731
|
+
acro_has_kids = any(hk for _, __, hk in acro_sig_list)
|
|
732
|
+
|
|
733
|
+
# ───────────────────────────── HIPAA branch ─────────────────────────────
|
|
734
|
+
if self.Profile == "hipaa":
|
|
735
|
+
return self._DetectHipaaPath(
|
|
736
|
+
pdf_path=pdf_path,
|
|
737
|
+
reader=reader,
|
|
738
|
+
page_texts=page_texts,
|
|
739
|
+
vendor_hints=vendor_hints,
|
|
740
|
+
scanned_pdf=scanned_pdf,
|
|
741
|
+
page_widgets=page_widgets,
|
|
742
|
+
acro_sig_list=acro_sig_list,
|
|
743
|
+
has_page_widgets=has_page_widgets,
|
|
744
|
+
has_acro=has_acro,
|
|
745
|
+
has_vendor=has_vendor,
|
|
746
|
+
acro_has_kids=acro_has_kids,
|
|
747
|
+
size_kb=size_kb,
|
|
748
|
+
pages=pages,
|
|
749
|
+
)
|
|
750
|
+
|
|
751
|
+
# ───────────────────────────── Retainer branch ─────────────────────────────
|
|
752
|
+
signatures: list[Signature] = []
|
|
753
|
+
|
|
754
|
+
if has_page_widgets:
|
|
755
|
+
# Real widgets: infer role from page text; avoid envelope-id noise already filtered
|
|
756
|
+
seen_refs: set[str] = set()
|
|
757
|
+
seen_page_name: set[tuple[int, str]] = set()
|
|
758
|
+
|
|
759
|
+
for idx, wdict, ref in page_widgets:
|
|
760
|
+
field_name = self._FieldNameForWidget(wdict)
|
|
761
|
+
page_obj = reader.pages[idx - 1] if 0 <= (idx - 1) < len(reader.pages) else None
|
|
762
|
+
render_type = self._ClassifyAppearance(wdict, page_obj)
|
|
763
|
+
|
|
764
|
+
# de-dup by object ref (if present) and (page, name)
|
|
765
|
+
if isinstance(ref, generic.IndirectObject):
|
|
766
|
+
key = f"{ref.idnum}:{ref.generation}"
|
|
767
|
+
if key in seen_refs:
|
|
768
|
+
continue
|
|
769
|
+
seen_refs.add(key)
|
|
770
|
+
|
|
771
|
+
if field_name:
|
|
772
|
+
key2 = (idx, field_name)
|
|
773
|
+
if key2 in seen_page_name:
|
|
774
|
+
continue
|
|
775
|
+
seen_page_name.add(key2)
|
|
776
|
+
|
|
777
|
+
page_text = page_texts[idx - 1] if 0 <= (idx - 1) < len(page_texts) else ""
|
|
778
|
+
c, f, ev = self._RetainerPageScores(
|
|
779
|
+
page_text, vendor_hits_per_page[idx - 1], idx - 1, pages
|
|
780
|
+
)
|
|
781
|
+
role = "client" if c >= f and c > 0 else ("firm" if f > 0 else "unknown")
|
|
782
|
+
|
|
783
|
+
# fall back to generic role inference if indecisive
|
|
784
|
+
if role == "unknown":
|
|
785
|
+
role, evidence, scores, total = self._InferRole(field_name, page_text)
|
|
786
|
+
evidence = evidence or ev
|
|
787
|
+
scores = scores or ({role: 1} if role != "unknown" else {})
|
|
788
|
+
total = total or sum(scores.values())
|
|
789
|
+
else:
|
|
790
|
+
evidence = ev
|
|
791
|
+
scores = {role: (c if role == "client" else f)}
|
|
792
|
+
total = scores[role]
|
|
793
|
+
|
|
794
|
+
signatures.append(
|
|
795
|
+
Signature(
|
|
796
|
+
Page=idx,
|
|
797
|
+
FieldName=field_name,
|
|
798
|
+
Role=role,
|
|
799
|
+
Score=total,
|
|
800
|
+
Scores=scores,
|
|
801
|
+
Evidence=evidence,
|
|
802
|
+
Hint=(f"AcroSig:{field_name}" if field_name else "AcroSig"),
|
|
803
|
+
RenderType=render_type,
|
|
804
|
+
)
|
|
805
|
+
)
|
|
806
|
+
|
|
807
|
+
# If only one role but page text clearly indicates both, add the second role pseudo.
|
|
808
|
+
if len(signatures) == 1:
|
|
809
|
+
pg = signatures[0].Page or 1
|
|
810
|
+
c, f, ev = self._RetainerPageScores(
|
|
811
|
+
page_texts[pg - 1], vendor_hits_per_page[pg - 1], pg - 1, pages
|
|
812
|
+
)
|
|
813
|
+
want = None
|
|
814
|
+
have = {signatures[0].Role}
|
|
815
|
+
if "client" not in have and c > 0:
|
|
816
|
+
want = ("client", c)
|
|
817
|
+
elif "firm" not in have and f > 0:
|
|
818
|
+
want = ("firm", f)
|
|
819
|
+
if want:
|
|
820
|
+
r, sc = want
|
|
821
|
+
signatures.append(
|
|
822
|
+
Signature(
|
|
823
|
+
Page=pg,
|
|
824
|
+
FieldName="vendor_or_acro_detected",
|
|
825
|
+
Role=r,
|
|
826
|
+
Score=sc,
|
|
827
|
+
Scores={r: sc},
|
|
828
|
+
Evidence=ev + ["pseudo:true"],
|
|
829
|
+
Hint="VendorOrAcroOnly",
|
|
830
|
+
)
|
|
831
|
+
)
|
|
832
|
+
|
|
833
|
+
else:
|
|
834
|
+
# No widgets found. Retainers usually have two signees; pick likely pages.
|
|
835
|
+
if self.Configuration.PseudoSignatures and (has_acro or has_vendor):
|
|
836
|
+
totals = []
|
|
837
|
+
for i, text in enumerate(page_texts):
|
|
838
|
+
c, f, ev = self._RetainerPageScores(text, vendor_hits_per_page[i], i, pages)
|
|
839
|
+
totals.append((i, c, f, ev))
|
|
840
|
+
|
|
841
|
+
# Pages with any signal
|
|
842
|
+
candidates = [i for i, c, f, _ in totals if (c > 0 or f > 0)]
|
|
843
|
+
|
|
844
|
+
# If page 1 is in candidates but has no signature cue, drop it (anti front-matter).
|
|
845
|
+
def HasSignatureCue(i: int) -> bool:
|
|
846
|
+
t = page_texts[i]
|
|
847
|
+
return bool(self._SignatureWord.search(t) or self._ByWord.search(t))
|
|
848
|
+
|
|
849
|
+
candidates = [i for i in candidates if not (i == 0 and not HasSignatureCue(i))]
|
|
850
|
+
|
|
851
|
+
# If still empty, prefer the last page(s)
|
|
852
|
+
if not candidates:
|
|
853
|
+
candidates = [p for p in range(max(0, pages - 2), pages)]
|
|
854
|
+
|
|
855
|
+
# best client & firm pages
|
|
856
|
+
c_best = max(candidates, key=lambda i: totals[i][1]) if candidates else None
|
|
857
|
+
f_best = max(candidates, key=lambda i: totals[i][2]) if candidates else None
|
|
858
|
+
|
|
859
|
+
def emit(page_idx: int | None, role: str, score: int, ev: list[str]):
|
|
860
|
+
pg = (page_idx + 1) if page_idx is not None else pages
|
|
861
|
+
signatures.append(
|
|
862
|
+
Signature(
|
|
863
|
+
Page=pg,
|
|
864
|
+
FieldName="vendor_or_acro_detected",
|
|
865
|
+
Role=role,
|
|
866
|
+
Score=score,
|
|
867
|
+
Scores={role: score} if score > 0 else {},
|
|
868
|
+
Evidence=ev + ["pseudo:true"],
|
|
869
|
+
Hint="VendorOrAcroOnly",
|
|
870
|
+
)
|
|
871
|
+
)
|
|
872
|
+
|
|
873
|
+
if c_best is not None and totals[c_best][1] > 0:
|
|
874
|
+
emit(c_best, "client", totals[c_best][1], totals[c_best][3])
|
|
875
|
+
if f_best is not None and totals[f_best][2] > 0:
|
|
876
|
+
emit(f_best, "firm", totals[f_best][2], totals[f_best][3])
|
|
877
|
+
|
|
878
|
+
# If nothing yet, emit both roles on the last page as a conservative fallback.
|
|
879
|
+
if not signatures:
|
|
880
|
+
emit(pages - 1, "client", 0, [])
|
|
881
|
+
emit(pages - 1, "firm", 0, [])
|
|
882
|
+
|
|
883
|
+
# doc-level names for hints
|
|
884
|
+
acro_names = {name for name, _pg, _hk in acro_sig_list if name}
|
|
885
|
+
|
|
886
|
+
# scanned/mixed refinement for retainer:
|
|
887
|
+
# If we emitted only pseudo signatures (no widgets on those pages and no vendor on them),
|
|
888
|
+
# and those pages have images, mark scanned and mixed.
|
|
889
|
+
if self.Profile == "retainer" and not has_page_widgets:
|
|
890
|
+
pseudo_pages = [s.Page for s in signatures if s.Page]
|
|
891
|
+
if pseudo_pages:
|
|
892
|
+
pvendors = all(
|
|
893
|
+
vendor_hits_per_page[p - 1] == 0 for p in pseudo_pages if p - 1 >= 0
|
|
894
|
+
)
|
|
895
|
+
pimages = any(images_per_page[p - 1] > 0 for p in pseudo_pages if p - 1 >= 0)
|
|
896
|
+
else:
|
|
897
|
+
pvendors = False
|
|
898
|
+
pimages = False
|
|
899
|
+
if pimages and pvendors:
|
|
900
|
+
scanned_pdf = True # scanned signatures present
|
|
901
|
+
|
|
902
|
+
esign_found = (len(signatures) > 0) or has_vendor or has_acro
|
|
903
|
+
mixed = esign_found and scanned_pdf
|
|
904
|
+
|
|
905
|
+
doc_roles: set[str] = {s.Role for s in signatures if s.Role != "unknown"}
|
|
906
|
+
|
|
907
|
+
hints: set[str] = set()
|
|
908
|
+
hints |= {f"AcroSig:{n}" for n in acro_names}
|
|
909
|
+
hints |= set(vendor_hints)
|
|
910
|
+
hints |= {s.Hint for s in signatures}
|
|
911
|
+
|
|
912
|
+
return FileResult(
|
|
913
|
+
File=pdf_path.name,
|
|
914
|
+
SizeKilobytes=size_kb,
|
|
915
|
+
PageCount=pages,
|
|
916
|
+
ElectronicSignatureFound=esign_found,
|
|
917
|
+
ScannedPdf=scanned_pdf,
|
|
918
|
+
MixedContent=mixed,
|
|
919
|
+
SignatureCount=len(signatures),
|
|
920
|
+
SignaturePages=",".join(
|
|
921
|
+
map(str, sorted({signature.Page for signature in signatures if signature.Page}))
|
|
922
|
+
),
|
|
923
|
+
Roles=";".join(sorted(doc_roles)) if doc_roles else "unknown",
|
|
924
|
+
Hints=";".join(sorted(hints)),
|
|
925
|
+
Signatures=signatures,
|
|
926
|
+
)
|
|
927
|
+
|
|
928
|
+
except Exception as exc: # capture errors per file
|
|
929
|
+
return FileResult(
|
|
930
|
+
File=pdf_path.name,
|
|
931
|
+
SizeKilobytes=None,
|
|
932
|
+
PageCount=0,
|
|
933
|
+
ElectronicSignatureFound=False,
|
|
934
|
+
ScannedPdf=None,
|
|
935
|
+
MixedContent=None,
|
|
936
|
+
SignatureCount=0,
|
|
937
|
+
SignaturePages="",
|
|
938
|
+
Roles="error",
|
|
939
|
+
Hints=f"ERROR:{exc}",
|
|
940
|
+
Signatures=[],
|
|
941
|
+
)
|
|
942
|
+
|
|
943
|
+
# ───────────────────────── HIPAA path ─────────────────────────
|
|
944
|
+
def _DetectHipaaPath(
    self,
    *,
    pdf_path: Path,
    reader: PdfReader,
    page_texts: list[str],
    vendor_hints: set[str],
    scanned_pdf: bool,
    page_widgets: list[tuple[int, generic.DictionaryObject, generic.IndirectObject | None]],
    acro_sig_list: list[tuple[str, int | None, bool]],
    has_page_widgets: bool,
    has_acro: bool,
    has_vendor: bool,
    acro_has_kids: bool,
    size_kb: float,
    pages: int,
) -> FileResult:
    """Assemble the ``FileResult`` for a document handled by the HIPAA path.

    Exactly one of three strategies populates the signature list:

    1. ``has_page_widgets``: one signature per entry of ``page_widgets``,
       de-duplicated by indirect-object reference and by
       ``(page, field name)``.
    2. ``acro_has_kids``: one non-pseudo signature per entry of
       ``acro_sig_list``; used when signature fields have widgets that were
       not located on any page.
    3. Otherwise, when ``Configuration.PseudoSignatures`` is enabled and
       ``has_acro`` or ``has_vendor`` is set: a single page-less pseudo
       signature whose role is scored from the text of the whole document.

    Args:
        pdf_path: Source file; only its ``name`` is reported.
        reader: Reader used to look up the page object of each widget.
        page_texts: Text per page; index 0 corresponds to page number 1.
        vendor_hints: Vendor hint strings merged into the result's hints.
        scanned_pdf: Scanned flag; reported as-is and combined with the
            e-signature flag to produce ``MixedContent``.
        page_widgets: ``(page number, widget dictionary, indirect ref or
            None)`` triples; page numbers are 1-based.
        acro_sig_list: ``(field name, page number or None, flag)`` triples;
            the third element is not used by this method.
        has_page_widgets: Selects strategy 1.
        has_acro: Together with ``has_vendor``, gates strategy 3.
        has_vendor: Together with ``has_acro``, gates strategy 3.
        acro_has_kids: Selects strategy 2 when strategy 1 does not apply.
        size_kb: File size, passed through to the result.
        pages: Page count, passed through to the result.

    Returns:
        The populated ``FileResult``.
    """
    signatures: list[Signature] = []

    if has_page_widgets:
        # --- real widgets path (NO pseudo allowed later)
        seen_refs: set[str] = set()
        seen_page_name: set[tuple[int, str]] = set()

        for idx, wdict, ref in page_widgets:
            field_name = self._FieldNameForWidget(wdict)

            # De-dup by object ref (if present) and by (page, name) *before*
            # doing any further per-widget work, so duplicates are skipped
            # without paying for appearance classification.
            if isinstance(ref, generic.IndirectObject):
                ref_key = f"{ref.idnum}:{ref.generation}"
                if ref_key in seen_refs:
                    continue
                seen_refs.add(ref_key)

            if field_name:
                name_key = (idx, field_name)
                if name_key in seen_page_name:
                    continue
                seen_page_name.add(name_key)

            # Page numbers are 1-based; out-of-range pages yield no page
            # object and empty text rather than an IndexError.
            page_obj = reader.pages[idx - 1] if 0 <= (idx - 1) < len(reader.pages) else None
            render_type = self._ClassifyAppearance(wdict, page_obj)

            page_text = page_texts[idx - 1] if 0 <= (idx - 1) < len(page_texts) else ""
            role, evidence, scores, total = self._InferRole(field_name, page_text)
            signatures.append(
                Signature(
                    Page=idx,
                    FieldName=field_name,
                    Role=role,
                    Score=total,
                    Scores=scores,
                    Evidence=evidence,
                    Hint=(f"AcroSig:{field_name}" if field_name else "AcroSig"),
                    RenderType=render_type,
                )
            )

    elif acro_has_kids:
        # There are real /Widget(s) attached to /Sig fields, but we didn't
        # locate them on pages (e.g., /Annots not visible). Emit NON-pseudo
        # signatures from the field names so mixed cases don't get pseudo.
        whole_text = NormalizeText("\n".join(page_texts))
        for fname, pg, _hk in acro_sig_list:
            # Same bounds check as the widget branch: a missing, zero or
            # negative page number must not wrap around to a page counted
            # from the end of the document.
            if pg is not None and 0 <= (pg - 1) < len(page_texts):
                page_text = page_texts[pg - 1]
            else:
                page_text = ""
            # Fall back to whole-document text if the page is unknown or empty.
            base_text = page_text or whole_text
            role, evidence, scores, total = self._InferRole(fname, base_text)
            signatures.append(
                Signature(
                    Page=pg,
                    FieldName=fname,
                    Role=role,
                    Score=total,
                    Scores=scores,
                    Evidence=evidence,
                    Hint=f"AcroSig:{fname}" if fname else "AcroSig",
                )
            )

    elif self.Configuration.PseudoSignatures and (has_acro or has_vendor):
        # --- vendor/acro pseudo path (only when NO page widgets or acro-kids)
        # IMPORTANT: use only page text (not raw decompressed stream dump).
        text_norm = NormalizeText("\n".join(page_texts))
        scores: dict[str, int] = defaultdict(int)
        evidence: list[str] = []

        # Hard rules; each entry may be absent, in which case it never hits.
        rel = self.DocumentHardRules.get("rel_label")
        kin = self.DocumentHardRules.get("kin")
        minor = self.DocumentHardRules.get("minor")
        firstp = self.DocumentHardRules.get("first_person")

        rel_hit = bool(rel and rel.search(text_norm))
        kin_hit = bool(kin and kin.search(text_norm))
        # Evaluated once: the result feeds both the scoring rule and the
        # tie-break below.
        first_person_hit = bool(firstp and firstp.search(text_norm))

        if rel_hit and kin_hit:
            scores["representative"] += 100
            evidence.append("rule:relationship+kin")
        if minor and minor.search(text_norm):
            scores["representative"] += 50
            evidence.append("rule:minor/unable_to_sign")
        # First-person wording only counts when no representative rule fired.
        if scores.get("representative", 0) == 0 and first_person_hit:
            scores["patient"] += 30
            evidence.append("rule:first_person_authorize")

        # Labels across doc
        for r in RolesFromLabels(text_norm, self.RoleLabelPatterns):
            scores[r] += self.WeightConfiguration["page_label"]
            evidence.append(f"page_label:{r}")

        # General — ignore weak attorney in pseudo
        for r in RolesFromGeneral(text_norm, self.GeneralRolePatterns):
            if r == "attorney":
                continue
            scores[r] += self.WeightConfiguration["general"]
            evidence.append(f"general:{r}")

        # Boost from acro field names, if any
        for fname, _pg, _hk in acro_sig_list:
            for r in self._RolesFromField(fname):
                scores[r] += self.WeightConfiguration["field"]
                evidence.append(f"field:{r}")

        role = ChooseRole(scores)
        if role == "unknown":
            # Tie-break with the hard rules when scoring was inconclusive.
            if rel_hit and kin_hit:
                role = "representative"
                evidence.append("tie:relationship+kin")
            elif first_person_hit:
                role = "patient"
                evidence.append("tie:first_person")

        signatures.append(
            Signature(
                Page=None,
                FieldName="vendor_or_acro_detected",
                Role=role,
                Score=sum(scores.values()),
                Scores=dict(scores),
                Evidence=evidence + ["pseudo:true"],
                Hint="VendorOrAcroOnly",
            )
        )

    # doc-level hints
    acro_names = {name for name, _pg, _hk in acro_sig_list if name}
    esign_found = bool(signatures) or bool(vendor_hints) or bool(acro_names)
    mixed = esign_found and scanned_pdf

    doc_roles: set[str] = {s.Role for s in signatures if s.Role != "unknown"}

    hints: set[str] = {f"AcroSig:{n}" for n in acro_names}
    hints |= set(vendor_hints)
    hints |= {s.Hint for s in signatures}

    return FileResult(
        File=pdf_path.name,
        SizeKilobytes=size_kb,
        PageCount=pages,
        ElectronicSignatureFound=esign_found,
        ScannedPdf=scanned_pdf,
        MixedContent=mixed,
        SignatureCount=len(signatures),
        # Page-less (pseudo) signatures are excluded from the page list.
        SignaturePages=",".join(
            map(str, sorted({signature.Page for signature in signatures if signature.Page}))
        ),
        Roles=";".join(sorted(doc_roles)) if doc_roles else "unknown",
        Hints=";".join(sorted(hints)),
        Signatures=signatures,
    )
|