sigdetect 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sigdetect/api.py +162 -14
- sigdetect/cli.py +154 -20
- sigdetect/config.py +49 -9
- sigdetect/cropping.py +177 -0
- sigdetect/detector/pymupdf_engine.py +420 -0
- sigdetect/detector/pypdf2_engine.py +46 -8
- sigdetect/detector/signature_model.py +4 -0
- {sigdetect-0.1.1.dist-info → sigdetect-0.3.0.dist-info}/METADATA +44 -6
- {sigdetect-0.1.1.dist-info → sigdetect-0.3.0.dist-info}/RECORD +12 -11
- {sigdetect-0.1.1.dist-info → sigdetect-0.3.0.dist-info}/WHEEL +0 -0
- {sigdetect-0.1.1.dist-info → sigdetect-0.3.0.dist-info}/entry_points.txt +0 -0
- {sigdetect-0.1.1.dist-info → sigdetect-0.3.0.dist-info}/top_level.txt +0 -0
|
@@ -212,7 +212,9 @@ class PyPDF2Detector(Detector):
|
|
|
212
212
|
hits.add(f"VendorText:{rx.pattern}")
|
|
213
213
|
return hits
|
|
214
214
|
|
|
215
|
-
def _ScanPageVendors(self, page) -> set[str]:
|
|
215
|
+
def _ScanPageVendors(self, page) -> tuple[set[str], str]:
|
|
216
|
+
"""Return vendor hits along with the extracted page text."""
|
|
217
|
+
|
|
216
218
|
found: set[str] = set()
|
|
217
219
|
|
|
218
220
|
with _QuietIo():
|
|
@@ -234,7 +236,7 @@ class PyPDF2Detector(Detector):
|
|
|
234
236
|
if rx.search(txt):
|
|
235
237
|
found.add(f"VendorText:{rx.pattern}")
|
|
236
238
|
|
|
237
|
-
return found
|
|
239
|
+
return found, txt
|
|
238
240
|
|
|
239
241
|
def _IterateFormXObjects(self, page) -> Iterator[generic.DictionaryObject]:
|
|
240
242
|
"""Yield Form XObject dictionaries recursively from page resources."""
|
|
@@ -438,6 +440,40 @@ class PyPDF2Detector(Detector):
|
|
|
438
440
|
nm = GetFieldNameFromAncestry(wdict)
|
|
439
441
|
return "" if nm is None else str(nm)
|
|
440
442
|
|
|
443
|
+
def _WidgetBoundingBox(
|
|
444
|
+
self, wdict: generic.DictionaryObject
|
|
445
|
+
) -> tuple[float, float, float, float] | None:
|
|
446
|
+
"""Return the widget's ``/Rect`` coordinates normalized as (x0, y0, x1, y1)."""
|
|
447
|
+
|
|
448
|
+
rect = self._RectToTuple(wdict.get("/Rect"))
|
|
449
|
+
if rect:
|
|
450
|
+
return rect
|
|
451
|
+
parent = AsDictionary(wdict.get("/Parent"))
|
|
452
|
+
if isinstance(parent, generic.DictionaryObject):
|
|
453
|
+
return self._RectToTuple(parent.get("/Rect"))
|
|
454
|
+
return None
|
|
455
|
+
|
|
456
|
+
def _RectToTuple(self, candidate) -> tuple[float, float, float, float] | None:
|
|
457
|
+
if candidate is None:
|
|
458
|
+
return None
|
|
459
|
+
if isinstance(candidate, generic.IndirectObject):
|
|
460
|
+
with suppress(Exception):
|
|
461
|
+
candidate = candidate.get_object()
|
|
462
|
+
if isinstance(candidate, generic.ArrayObject) and len(candidate) == 4:
|
|
463
|
+
coords: list[float] = []
|
|
464
|
+
for item in candidate:
|
|
465
|
+
try:
|
|
466
|
+
coords.append(float(item))
|
|
467
|
+
except Exception:
|
|
468
|
+
return None
|
|
469
|
+
x0, y0, x1, y1 = coords
|
|
470
|
+
if x1 < x0:
|
|
471
|
+
x0, x1 = x1, x0
|
|
472
|
+
if y1 < y0:
|
|
473
|
+
y0, y1 = y1, y0
|
|
474
|
+
return x0, y0, x1, y1
|
|
475
|
+
return None
|
|
476
|
+
|
|
441
477
|
@staticmethod
|
|
442
478
|
def _PickNameAny(d: generic.DictionaryObject) -> str | None:
|
|
443
479
|
for key in ("/T", "/TU", "/TM"):
|
|
@@ -685,7 +721,7 @@ class PyPDF2Detector(Detector):
|
|
|
685
721
|
|
|
686
722
|
for page in reader.pages:
|
|
687
723
|
# per-page vendor
|
|
688
|
-
pv = self._ScanPageVendors(page)
|
|
724
|
+
pv, page_text = self._ScanPageVendors(page)
|
|
689
725
|
x_hits: set[str] = set()
|
|
690
726
|
x_text = ""
|
|
691
727
|
if self.RecurseXObjects:
|
|
@@ -693,12 +729,10 @@ class PyPDF2Detector(Detector):
|
|
|
693
729
|
vendor_hints |= pv | x_hits
|
|
694
730
|
vendor_hits_per_page.append(len(pv) + len(x_hits))
|
|
695
731
|
|
|
696
|
-
with _QuietIo():
|
|
697
|
-
txt = page.extract_text() or ""
|
|
698
732
|
if x_text:
|
|
699
|
-
|
|
700
|
-
page_texts.append(
|
|
701
|
-
any_text = any_text or bool(
|
|
733
|
+
page_text = f"{page_text} {x_text}".strip() if page_text else x_text.strip()
|
|
734
|
+
page_texts.append(page_text)
|
|
735
|
+
any_text = any_text or bool(page_text)
|
|
702
736
|
|
|
703
737
|
# image counting
|
|
704
738
|
img_count = 0
|
|
@@ -760,6 +794,7 @@ class PyPDF2Detector(Detector):
|
|
|
760
794
|
field_name = self._FieldNameForWidget(wdict)
|
|
761
795
|
page_obj = reader.pages[idx - 1] if 0 <= (idx - 1) < len(reader.pages) else None
|
|
762
796
|
render_type = self._ClassifyAppearance(wdict, page_obj)
|
|
797
|
+
bounding_box = self._WidgetBoundingBox(wdict)
|
|
763
798
|
|
|
764
799
|
# de-dup by object ref (if present) and (page, name)
|
|
765
800
|
if isinstance(ref, generic.IndirectObject):
|
|
@@ -801,6 +836,7 @@ class PyPDF2Detector(Detector):
|
|
|
801
836
|
Evidence=evidence,
|
|
802
837
|
Hint=(f"AcroSig:{field_name}" if field_name else "AcroSig"),
|
|
803
838
|
RenderType=render_type,
|
|
839
|
+
BoundingBox=bounding_box,
|
|
804
840
|
)
|
|
805
841
|
)
|
|
806
842
|
|
|
@@ -969,6 +1005,7 @@ class PyPDF2Detector(Detector):
|
|
|
969
1005
|
field_name = self._FieldNameForWidget(wdict)
|
|
970
1006
|
page_obj = reader.pages[idx - 1] if 0 <= (idx - 1) < len(reader.pages) else None
|
|
971
1007
|
render_type = self._ClassifyAppearance(wdict, page_obj)
|
|
1008
|
+
bounding_box = self._WidgetBoundingBox(wdict)
|
|
972
1009
|
|
|
973
1010
|
# de-dup by object ref (if present) and (page, name)
|
|
974
1011
|
if isinstance(ref, generic.IndirectObject):
|
|
@@ -995,6 +1032,7 @@ class PyPDF2Detector(Detector):
|
|
|
995
1032
|
Evidence=evidence,
|
|
996
1033
|
Hint=(f"AcroSig:{field_name}" if field_name else "AcroSig"),
|
|
997
1034
|
RenderType=render_type,
|
|
1035
|
+
BoundingBox=bounding_box,
|
|
998
1036
|
)
|
|
999
1037
|
)
|
|
1000
1038
|
|
|
@@ -18,6 +18,8 @@ class Signature:
|
|
|
18
18
|
Evidence: list[str]
|
|
19
19
|
Hint: str
|
|
20
20
|
RenderType: str = "unknown"
|
|
21
|
+
BoundingBox: tuple[float, float, float, float] | None = None
|
|
22
|
+
CropPath: str | None = None
|
|
21
23
|
|
|
22
24
|
def to_dict(self) -> dict[str, Any]:
|
|
23
25
|
"""Return the legacy snake_case representation used in JSON payloads."""
|
|
@@ -31,4 +33,6 @@ class Signature:
|
|
|
31
33
|
"evidence": list(self.Evidence),
|
|
32
34
|
"hint": self.Hint,
|
|
33
35
|
"render_type": self.RenderType,
|
|
36
|
+
"bounding_box": list(self.BoundingBox) if self.BoundingBox else None,
|
|
37
|
+
"crop_path": self.CropPath,
|
|
34
38
|
}
|
|
@@ -1,13 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sigdetect
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Signature detection and role attribution for PDFs
|
|
5
5
|
Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
|
|
6
6
|
License: MIT
|
|
7
7
|
Requires-Python: >=3.9
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
9
|
Requires-Dist: pypdf>=4.0.0
|
|
10
|
-
Requires-Dist: pandas>=2.0
|
|
11
10
|
Requires-Dist: rich>=13.0
|
|
12
11
|
Requires-Dist: typer>=0.12
|
|
13
12
|
Requires-Dist: pydantic>=2.5
|
|
@@ -102,6 +101,8 @@ sigdetect detect \
|
|
|
102
101
|
- `--profile` selects tuned role logic:
|
|
103
102
|
- `hipaa` → patient / representative / attorney
|
|
104
103
|
- `retainer` → client / firm (prefers detecting two signatures)
|
|
104
|
+
- `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
|
|
105
|
+
- `--crop-signatures` enables PNG crops for each detected widget (requires installing the optional `pymupdf` dependency). Use `--crop-dir` to override the destination and `--crop-dpi` to choose rendering quality.
|
|
105
106
|
- If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
|
|
106
107
|
|
|
107
108
|
### EDA (quick aggregate stats)
|
|
@@ -135,7 +136,7 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
|
135
136
|
print(result.to_dict())
|
|
136
137
|
~~~
|
|
137
138
|
|
|
138
|
-
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)).
|
|
139
|
+
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image.
|
|
139
140
|
|
|
140
141
|
---
|
|
141
142
|
|
|
@@ -146,7 +147,17 @@ Import from `sigdetect.api` and get plain dicts out (JSON-ready),
|
|
|
146
147
|
with no I/O side effects by default:
|
|
147
148
|
|
|
148
149
|
~~~python
|
|
149
|
-
from
|
|
150
|
+
from pathlib import Path
|
|
151
|
+
|
|
152
|
+
from sigdetect.api import (
|
|
153
|
+
CropSignatureImages,
|
|
154
|
+
DetectMany,
|
|
155
|
+
DetectPdf,
|
|
156
|
+
ScanDirectory,
|
|
157
|
+
ToCsvRow,
|
|
158
|
+
Version,
|
|
159
|
+
get_detector,
|
|
160
|
+
)
|
|
150
161
|
|
|
151
162
|
print("sigdetect", Version())
|
|
152
163
|
|
|
@@ -178,8 +189,24 @@ for res in ScanDirectory(
|
|
|
178
189
|
# store in DB, print, etc.
|
|
179
190
|
pass
|
|
180
191
|
|
|
192
|
+
# 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
|
|
193
|
+
detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
|
|
194
|
+
file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
195
|
+
crops = CropSignatureImages(
|
|
196
|
+
"/path/to/pdfs/example.pdf",
|
|
197
|
+
file_result,
|
|
198
|
+
outputDirectory="./signature_crops",
|
|
199
|
+
dpi=200,
|
|
200
|
+
returnBytes=True, # also returns in-memory PNG bytes for each crop
|
|
201
|
+
)
|
|
202
|
+
|
|
203
|
+
first_crop = crops[0]
|
|
204
|
+
print(first_crop.path, len(first_crop.image_bytes))
|
|
181
205
|
~~~
|
|
182
206
|
|
|
207
|
+
When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
|
|
208
|
+
PNG bytes, and the originating signature metadata.
|
|
209
|
+
|
|
183
210
|
|
|
184
211
|
## Result schema
|
|
185
212
|
|
|
@@ -205,7 +232,10 @@ High-level summary (per file):
|
|
|
205
232
|
"score": 5,
|
|
206
233
|
"scores": { "field": 3, "page_label": 2 },
|
|
207
234
|
"evidence": ["field:patient", "page_label:patient"],
|
|
208
|
-
"hint": "AcroSig:sig_patient"
|
|
235
|
+
"hint": "AcroSig:sig_patient",
|
|
236
|
+
"render_type": "typed",
|
|
237
|
+
"bounding_box": [10.0, 10.0, 150.0, 40.0],
|
|
238
|
+
"crop_path": "signature_crops/example/sig_01_patient.png"
|
|
209
239
|
},
|
|
210
240
|
{
|
|
211
241
|
"page": null,
|
|
@@ -214,7 +244,10 @@ High-level summary (per file):
|
|
|
214
244
|
"score": 6,
|
|
215
245
|
"scores": { "page_label": 4, "general": 2 },
|
|
216
246
|
"evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
|
|
217
|
-
"hint": "VendorOrAcroOnly"
|
|
247
|
+
"hint": "VendorOrAcroOnly",
|
|
248
|
+
"render_type": "unknown",
|
|
249
|
+
"bounding_box": null,
|
|
250
|
+
"crop_path": null
|
|
218
251
|
}
|
|
219
252
|
]
|
|
220
253
|
}
|
|
@@ -227,6 +260,8 @@ High-level summary (per file):
|
|
|
227
260
|
- **`mixed`** means both `esign_found` and `scanned_pdf` are `true`.
|
|
228
261
|
- **`roles`** summarizes unique non-`unknown` roles across signatures.
|
|
229
262
|
- In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
|
|
263
|
+
- **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
|
|
264
|
+
- **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
|
|
230
265
|
|
|
231
266
|
---
|
|
232
267
|
|
|
@@ -252,6 +287,9 @@ engine: pypdf2
|
|
|
252
287
|
pseudo_signatures: true
|
|
253
288
|
recurse_xobjects: true
|
|
254
289
|
profile: retainer # or: hipaa
|
|
290
|
+
crop_signatures: false # enable to write PNG crops (requires pymupdf)
|
|
291
|
+
# crop_output_dir: ./signature_crops
|
|
292
|
+
crop_image_dpi: 200
|
|
255
293
|
~~~
|
|
256
294
|
|
|
257
295
|
YAML files can be customized or loaded at runtime (see CLI `--config`, if available, or import and pass patterns into the engine).
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
sigdetect/__init__.py,sha256=LhY78mDZ1ClYVNTxW_qtE-vqJoN9N7N5ZcNRDUI_3ss,575
|
|
2
|
-
sigdetect/api.py,sha256=
|
|
3
|
-
sigdetect/cli.py,sha256=
|
|
4
|
-
sigdetect/config.py,sha256=
|
|
2
|
+
sigdetect/api.py,sha256=jIUaq6nslDdluNlRoDSdaX3Dx1lkIIZmIJPHn8Nk2Ko,9192
|
|
3
|
+
sigdetect/cli.py,sha256=NctAnaB-TQrUAT9m-v8kj2_KTNs88kbFOCiX32tHZm8,7920
|
|
4
|
+
sigdetect/config.py,sha256=S0NVKuJYiHJCocL-VNFdGJpasFcjTecavC4EthyS1DQ,5951
|
|
5
|
+
sigdetect/cropping.py,sha256=IyIcQAPH3z58tS6yeplglMDNu9F-iyQtpYQ1Ya2X_8o,5602
|
|
5
6
|
sigdetect/eda.py,sha256=S92G1Gjmepri__D0n_V6foq0lQgH-RXI9anW8A58jfw,4681
|
|
6
7
|
sigdetect/logging_setup.py,sha256=LMF8ao_a-JwH0S522T6aYTFX3e8Ajjv_5ODS2YiBcHA,6404
|
|
7
8
|
sigdetect/utils.py,sha256=T9rubLf5T9JmjOHYMOba1j34fhOJaWocAXccnGTxRUE,5198
|
|
@@ -12,11 +13,11 @@ sigdetect/detector/__init__.py,sha256=up2FCmD09f2bRHcS4WbY-clx3GQbWuk1PM2JlxgusH
|
|
|
12
13
|
sigdetect/detector/base.py,sha256=L-iXWXqsTetDc4jRZo_wOdbNpKqOY20mX9FefrugdT0,263
|
|
13
14
|
sigdetect/detector/base_detector.py,sha256=GmAgUWO_fQgIfnihZSoyhR3wpnwZ-X3hS0Kuyz4G6Ys,608
|
|
14
15
|
sigdetect/detector/file_result_model.py,sha256=j2gTc9Sw3fJOHlexYsR_m5DiwHA8DzIzAMToESfvo4A,1767
|
|
15
|
-
sigdetect/detector/pymupdf_engine.py,sha256=
|
|
16
|
-
sigdetect/detector/pypdf2_engine.py,sha256=
|
|
17
|
-
sigdetect/detector/signature_model.py,sha256=
|
|
18
|
-
sigdetect-0.
|
|
19
|
-
sigdetect-0.
|
|
20
|
-
sigdetect-0.
|
|
21
|
-
sigdetect-0.
|
|
22
|
-
sigdetect-0.
|
|
16
|
+
sigdetect/detector/pymupdf_engine.py,sha256=iyp7JuPlUnydwohH5zbNg4MwH44mBmxbBWOS3ZmArBo,17339
|
|
17
|
+
sigdetect/detector/pypdf2_engine.py,sha256=INWQH06kMLvto2VS-EdLC-EtMC6AG7JmdVYmNgx6_RU,47313
|
|
18
|
+
sigdetect/detector/signature_model.py,sha256=mztb9V5wgv2oohQ5Cxzcv8_Bo6TyWAVIXteaeQ2rywQ,1076
|
|
19
|
+
sigdetect-0.3.0.dist-info/METADATA,sha256=i7rSqbNbViLWyNJFO5si0eghcM01mBdkLrFsVND7xZw,12171
|
|
20
|
+
sigdetect-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
21
|
+
sigdetect-0.3.0.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
|
|
22
|
+
sigdetect-0.3.0.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
|
|
23
|
+
sigdetect-0.3.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|