sigdetect 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -212,7 +212,9 @@ class PyPDF2Detector(Detector):
212
212
  hits.add(f"VendorText:{rx.pattern}")
213
213
  return hits
214
214
 
215
- def _ScanPageVendors(self, page) -> set[str]:
215
+ def _ScanPageVendors(self, page) -> tuple[set[str], str]:
216
+ """Return vendor hits along with the extracted page text."""
217
+
216
218
  found: set[str] = set()
217
219
 
218
220
  with _QuietIo():
@@ -234,7 +236,7 @@ class PyPDF2Detector(Detector):
234
236
  if rx.search(txt):
235
237
  found.add(f"VendorText:{rx.pattern}")
236
238
 
237
- return found
239
+ return found, txt
238
240
 
239
241
  def _IterateFormXObjects(self, page) -> Iterator[generic.DictionaryObject]:
240
242
  """Yield Form XObject dictionaries recursively from page resources."""
@@ -438,6 +440,40 @@ class PyPDF2Detector(Detector):
438
440
  nm = GetFieldNameFromAncestry(wdict)
439
441
  return "" if nm is None else str(nm)
440
442
 
443
+ def _WidgetBoundingBox(
444
+ self, wdict: generic.DictionaryObject
445
+ ) -> tuple[float, float, float, float] | None:
446
+ """Return the widget's ``/Rect`` coordinates normalized as (x0, y0, x1, y1)."""
447
+
448
+ rect = self._RectToTuple(wdict.get("/Rect"))
449
+ if rect:
450
+ return rect
451
+ parent = AsDictionary(wdict.get("/Parent"))
452
+ if isinstance(parent, generic.DictionaryObject):
453
+ return self._RectToTuple(parent.get("/Rect"))
454
+ return None
455
+
456
+ def _RectToTuple(self, candidate) -> tuple[float, float, float, float] | None:
457
+ if candidate is None:
458
+ return None
459
+ if isinstance(candidate, generic.IndirectObject):
460
+ with suppress(Exception):
461
+ candidate = candidate.get_object()
462
+ if isinstance(candidate, generic.ArrayObject) and len(candidate) == 4:
463
+ coords: list[float] = []
464
+ for item in candidate:
465
+ try:
466
+ coords.append(float(item))
467
+ except Exception:
468
+ return None
469
+ x0, y0, x1, y1 = coords
470
+ if x1 < x0:
471
+ x0, x1 = x1, x0
472
+ if y1 < y0:
473
+ y0, y1 = y1, y0
474
+ return x0, y0, x1, y1
475
+ return None
476
+
441
477
  @staticmethod
442
478
  def _PickNameAny(d: generic.DictionaryObject) -> str | None:
443
479
  for key in ("/T", "/TU", "/TM"):
@@ -685,7 +721,7 @@ class PyPDF2Detector(Detector):
685
721
 
686
722
  for page in reader.pages:
687
723
  # per-page vendor
688
- pv = self._ScanPageVendors(page)
724
+ pv, page_text = self._ScanPageVendors(page)
689
725
  x_hits: set[str] = set()
690
726
  x_text = ""
691
727
  if self.RecurseXObjects:
@@ -693,12 +729,10 @@ class PyPDF2Detector(Detector):
693
729
  vendor_hints |= pv | x_hits
694
730
  vendor_hits_per_page.append(len(pv) + len(x_hits))
695
731
 
696
- with _QuietIo():
697
- txt = page.extract_text() or ""
698
732
  if x_text:
699
- txt = f"{txt} {x_text}".strip() if txt else x_text.strip()
700
- page_texts.append(txt)
701
- any_text = any_text or bool(txt)
733
+ page_text = f"{page_text} {x_text}".strip() if page_text else x_text.strip()
734
+ page_texts.append(page_text)
735
+ any_text = any_text or bool(page_text)
702
736
 
703
737
  # image counting
704
738
  img_count = 0
@@ -760,6 +794,7 @@ class PyPDF2Detector(Detector):
760
794
  field_name = self._FieldNameForWidget(wdict)
761
795
  page_obj = reader.pages[idx - 1] if 0 <= (idx - 1) < len(reader.pages) else None
762
796
  render_type = self._ClassifyAppearance(wdict, page_obj)
797
+ bounding_box = self._WidgetBoundingBox(wdict)
763
798
 
764
799
  # de-dup by object ref (if present) and (page, name)
765
800
  if isinstance(ref, generic.IndirectObject):
@@ -801,6 +836,7 @@ class PyPDF2Detector(Detector):
801
836
  Evidence=evidence,
802
837
  Hint=(f"AcroSig:{field_name}" if field_name else "AcroSig"),
803
838
  RenderType=render_type,
839
+ BoundingBox=bounding_box,
804
840
  )
805
841
  )
806
842
 
@@ -969,6 +1005,7 @@ class PyPDF2Detector(Detector):
969
1005
  field_name = self._FieldNameForWidget(wdict)
970
1006
  page_obj = reader.pages[idx - 1] if 0 <= (idx - 1) < len(reader.pages) else None
971
1007
  render_type = self._ClassifyAppearance(wdict, page_obj)
1008
+ bounding_box = self._WidgetBoundingBox(wdict)
972
1009
 
973
1010
  # de-dup by object ref (if present) and (page, name)
974
1011
  if isinstance(ref, generic.IndirectObject):
@@ -995,6 +1032,7 @@ class PyPDF2Detector(Detector):
995
1032
  Evidence=evidence,
996
1033
  Hint=(f"AcroSig:{field_name}" if field_name else "AcroSig"),
997
1034
  RenderType=render_type,
1035
+ BoundingBox=bounding_box,
998
1036
  )
999
1037
  )
1000
1038
 
@@ -18,6 +18,8 @@ class Signature:
18
18
  Evidence: list[str]
19
19
  Hint: str
20
20
  RenderType: str = "unknown"
21
+ BoundingBox: tuple[float, float, float, float] | None = None
22
+ CropPath: str | None = None
21
23
 
22
24
  def to_dict(self) -> dict[str, Any]:
23
25
  """Return the legacy snake_case representation used in JSON payloads."""
@@ -31,4 +33,6 @@ class Signature:
31
33
  "evidence": list(self.Evidence),
32
34
  "hint": self.Hint,
33
35
  "render_type": self.RenderType,
36
+ "bounding_box": list(self.BoundingBox) if self.BoundingBox else None,
37
+ "crop_path": self.CropPath,
34
38
  }
@@ -1,13 +1,12 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sigdetect
3
- Version: 0.1.1
3
+ Version: 0.3.0
4
4
  Summary: Signature detection and role attribution for PDFs
5
5
  Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
6
6
  License: MIT
7
7
  Requires-Python: >=3.9
8
8
  Description-Content-Type: text/markdown
9
9
  Requires-Dist: pypdf>=4.0.0
10
- Requires-Dist: pandas>=2.0
11
10
  Requires-Dist: rich>=13.0
12
11
  Requires-Dist: typer>=0.12
13
12
  Requires-Dist: pydantic>=2.5
@@ -102,6 +101,8 @@ sigdetect detect \
102
101
  - `--profile` selects tuned role logic:
103
102
  - `hipaa` → patient / representative / attorney
104
103
  - `retainer` → client / firm (prefers detecting two signatures)
104
+ - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
105
+ - `--crop-signatures` enables PNG crops for each detected widget (requires installing the optional `pymupdf` dependency). Use `--crop-dir` to override the destination and `--crop-dpi` to choose rendering quality.
105
106
  - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
106
107
 
107
108
  ### EDA (quick aggregate stats)
@@ -135,7 +136,7 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
135
136
  print(result.to_dict())
136
137
  ~~~
137
138
 
138
- `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)).
139
+ `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image.
139
140
 
140
141
  ---
141
142
 
@@ -146,7 +147,17 @@ Import from `sigdetect.api` and get plain dicts out (JSON-ready),
146
147
  with no I/O side effects by default:
147
148
 
148
149
  ~~~python
149
- from sigdetect.api import DetectPdf, DetectMany, ScanDirectory, ToCsvRow, Version
150
+ from pathlib import Path
151
+
152
+ from sigdetect.api import (
153
+ CropSignatureImages,
154
+ DetectMany,
155
+ DetectPdf,
156
+ ScanDirectory,
157
+ ToCsvRow,
158
+ Version,
159
+ get_detector,
160
+ )
150
161
 
151
162
  print("sigdetect", Version())
152
163
 
@@ -178,8 +189,24 @@ for res in ScanDirectory(
178
189
  # store in DB, print, etc.
179
190
  pass
180
191
 
192
+ # 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
193
+ detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
194
+ file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
195
+ crops = CropSignatureImages(
196
+ "/path/to/pdfs/example.pdf",
197
+ file_result,
198
+ outputDirectory="./signature_crops",
199
+ dpi=200,
200
+ returnBytes=True, # also returns in-memory PNG bytes for each crop
201
+ )
202
+
203
+ first_crop = crops[0]
204
+ print(first_crop.path, len(first_crop.image_bytes))
181
205
  ~~~
182
206
 
207
+ When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
208
+ PNG bytes, and the originating signature metadata.
209
+
183
210
 
184
211
  ## Result schema
185
212
 
@@ -205,7 +232,10 @@ High-level summary (per file):
205
232
  "score": 5,
206
233
  "scores": { "field": 3, "page_label": 2 },
207
234
  "evidence": ["field:patient", "page_label:patient"],
208
- "hint": "AcroSig:sig_patient"
235
+ "hint": "AcroSig:sig_patient",
236
+ "render_type": "typed",
237
+ "bounding_box": [10.0, 10.0, 150.0, 40.0],
238
+ "crop_path": "signature_crops/example/sig_01_patient.png"
209
239
  },
210
240
  {
211
241
  "page": null,
@@ -214,7 +244,10 @@ High-level summary (per file):
214
244
  "score": 6,
215
245
  "scores": { "page_label": 4, "general": 2 },
216
246
  "evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
217
- "hint": "VendorOrAcroOnly"
247
+ "hint": "VendorOrAcroOnly",
248
+ "render_type": "unknown",
249
+ "bounding_box": null,
250
+ "crop_path": null
218
251
  }
219
252
  ]
220
253
  }
@@ -227,6 +260,8 @@ High-level summary (per file):
227
260
  - **`mixed`** means both `esign_found` and `scanned_pdf` are `true`.
228
261
  - **`roles`** summarizes unique non-`unknown` roles across signatures.
229
262
  - In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
263
+ - **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
264
+ - **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
230
265
 
231
266
  ---
232
267
 
@@ -252,6 +287,9 @@ engine: pypdf2
252
287
  pseudo_signatures: true
253
288
  recurse_xobjects: true
254
289
  profile: retainer # or: hipaa
290
+ crop_signatures: false # enable to write PNG crops (requires pymupdf)
291
+ # crop_output_dir: ./signature_crops
292
+ crop_image_dpi: 200
255
293
  ~~~
256
294
 
257
295
  YAML files can be customized or loaded at runtime (see CLI `--config`, if available, or import and pass patterns into the engine).
@@ -1,7 +1,8 @@
1
1
  sigdetect/__init__.py,sha256=LhY78mDZ1ClYVNTxW_qtE-vqJoN9N7N5ZcNRDUI_3ss,575
2
- sigdetect/api.py,sha256=Un4SaZHNAmRLPh1aF9bzOfT6ibilT_y9C0xVmNlqHtI,4248
3
- sigdetect/cli.py,sha256=jm7aStuv64MCcZZkzv8ncNVGGg8FYIFKjkTPNfXWUgs,3136
4
- sigdetect/config.py,sha256=d3_AlAEFUHBoXyTbUAHQLTARVqM8q4I8q4xfwakPE0M,4165
2
+ sigdetect/api.py,sha256=jIUaq6nslDdluNlRoDSdaX3Dx1lkIIZmIJPHn8Nk2Ko,9192
3
+ sigdetect/cli.py,sha256=NctAnaB-TQrUAT9m-v8kj2_KTNs88kbFOCiX32tHZm8,7920
4
+ sigdetect/config.py,sha256=S0NVKuJYiHJCocL-VNFdGJpasFcjTecavC4EthyS1DQ,5951
5
+ sigdetect/cropping.py,sha256=IyIcQAPH3z58tS6yeplglMDNu9F-iyQtpYQ1Ya2X_8o,5602
5
6
  sigdetect/eda.py,sha256=S92G1Gjmepri__D0n_V6foq0lQgH-RXI9anW8A58jfw,4681
6
7
  sigdetect/logging_setup.py,sha256=LMF8ao_a-JwH0S522T6aYTFX3e8Ajjv_5ODS2YiBcHA,6404
7
8
  sigdetect/utils.py,sha256=T9rubLf5T9JmjOHYMOba1j34fhOJaWocAXccnGTxRUE,5198
@@ -12,11 +13,11 @@ sigdetect/detector/__init__.py,sha256=up2FCmD09f2bRHcS4WbY-clx3GQbWuk1PM2JlxgusH
12
13
  sigdetect/detector/base.py,sha256=L-iXWXqsTetDc4jRZo_wOdbNpKqOY20mX9FefrugdT0,263
13
14
  sigdetect/detector/base_detector.py,sha256=GmAgUWO_fQgIfnihZSoyhR3wpnwZ-X3hS0Kuyz4G6Ys,608
14
15
  sigdetect/detector/file_result_model.py,sha256=j2gTc9Sw3fJOHlexYsR_m5DiwHA8DzIzAMToESfvo4A,1767
15
- sigdetect/detector/pymupdf_engine.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
- sigdetect/detector/pypdf2_engine.py,sha256=e3JasLxI8K10IkpMcijES2EjA7RluNpKq6027oNROPU,45770
17
- sigdetect/detector/signature_model.py,sha256=nApd53aDRMZhOLdUlmoEPjHO1hs8leM6NysG10v-jVc,857
18
- sigdetect-0.1.1.dist-info/METADATA,sha256=JvjfOiez4frYSsTjPxC55meRQW4qUGAUCyKuBxjULA0,10363
19
- sigdetect-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
20
- sigdetect-0.1.1.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
21
- sigdetect-0.1.1.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
22
- sigdetect-0.1.1.dist-info/RECORD,,
16
+ sigdetect/detector/pymupdf_engine.py,sha256=iyp7JuPlUnydwohH5zbNg4MwH44mBmxbBWOS3ZmArBo,17339
17
+ sigdetect/detector/pypdf2_engine.py,sha256=INWQH06kMLvto2VS-EdLC-EtMC6AG7JmdVYmNgx6_RU,47313
18
+ sigdetect/detector/signature_model.py,sha256=mztb9V5wgv2oohQ5Cxzcv8_Bo6TyWAVIXteaeQ2rywQ,1076
19
+ sigdetect-0.3.0.dist-info/METADATA,sha256=i7rSqbNbViLWyNJFO5si0eghcM01mBdkLrFsVND7xZw,12171
20
+ sigdetect-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
21
+ sigdetect-0.3.0.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
22
+ sigdetect-0.3.0.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
23
+ sigdetect-0.3.0.dist-info/RECORD,,