sigdetect-0.3.1-py3-none-any.whl → sigdetect-0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sigdetect/__init__.py +1 -1
- sigdetect/api.py +43 -11
- sigdetect/cli.py +89 -23
- sigdetect/config.py +48 -3
- sigdetect/cropping.py +72 -12
- sigdetect/detector/__init__.py +27 -8
- sigdetect/detector/pymupdf_engine.py +3 -2
- sigdetect/detector/pypdf2_engine.py +7 -5
- sigdetect/detector/signature_model.py +3 -1
- sigdetect/wet_detection.py +549 -0
- {sigdetect-0.3.1.dist-info → sigdetect-0.5.0.dist-info}/METADATA +28 -25
- sigdetect-0.5.0.dist-info/RECORD +24 -0
- {sigdetect-0.3.1.dist-info → sigdetect-0.5.0.dist-info}/WHEEL +1 -1
- sigdetect-0.3.1.dist-info/RECORD +0 -23
- {sigdetect-0.3.1.dist-info → sigdetect-0.5.0.dist-info}/entry_points.txt +0 -0
- {sigdetect-0.3.1.dist-info → sigdetect-0.5.0.dist-info}/top_level.txt +0 -0
sigdetect/detector/__init__.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import warnings
|
|
5
6
|
from typing import TYPE_CHECKING, Type
|
|
6
7
|
|
|
7
8
|
from .base_detector import Detector
|
|
@@ -21,10 +22,13 @@ ENGINE_REGISTRY: dict[str, Type[Detector]] = {
|
|
|
21
22
|
ENGINE_REGISTRY.setdefault("pypdf", PyPDF2Detector)
|
|
22
23
|
|
|
23
24
|
try: # pragma: no cover - optional dependency
|
|
24
|
-
from .pymupdf_engine import PyMuPDFDetector
|
|
25
|
+
from .pymupdf_engine import PyMuPDFDetector
|
|
26
|
+
from .pymupdf_engine import fitz as pymupdf_fitz # type: ignore
|
|
25
27
|
|
|
26
|
-
if getattr(PyMuPDFDetector, "Name", None):
|
|
28
|
+
if pymupdf_fitz is not None and getattr(PyMuPDFDetector, "Name", None):
|
|
27
29
|
ENGINE_REGISTRY[PyMuPDFDetector.Name] = PyMuPDFDetector
|
|
30
|
+
else:
|
|
31
|
+
PyMuPDFDetector = None # type: ignore
|
|
28
32
|
except Exception:
|
|
29
33
|
PyMuPDFDetector = None # type: ignore
|
|
30
34
|
|
|
@@ -32,12 +36,27 @@ except Exception:
|
|
|
32
36
|
def BuildDetector(configuration: DetectConfiguration) -> Detector:
|
|
33
37
|
"""Instantiate the configured engine or raise a clear error."""
|
|
34
38
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
39
|
+
# Force geometry-capable engine selection (auto prefers PyMuPDF when available).
|
|
40
|
+
engine_name = "auto"
|
|
41
|
+
normalized = str(engine_name).lower()
|
|
42
|
+
|
|
43
|
+
if normalized == "auto":
|
|
44
|
+
detector_cls: Type[Detector] | None = None
|
|
45
|
+
if PyMuPDFDetector is not None:
|
|
46
|
+
detector_cls = (
|
|
47
|
+
ENGINE_REGISTRY.get(getattr(PyMuPDFDetector, "Name", "")) or PyMuPDFDetector
|
|
48
|
+
)
|
|
49
|
+
if detector_cls is None:
|
|
50
|
+
detector_cls = ENGINE_REGISTRY.get(PyPDF2Detector.Name) or ENGINE_REGISTRY.get("pypdf")
|
|
51
|
+
warnings.warn(
|
|
52
|
+
"Engine 'auto' falling back to 'pypdf2' because PyMuPDF is unavailable",
|
|
53
|
+
RuntimeWarning,
|
|
54
|
+
stacklevel=2,
|
|
55
|
+
)
|
|
56
|
+
if detector_cls is None:
|
|
57
|
+
available = ", ".join(sorted(ENGINE_REGISTRY)) or "<none>"
|
|
58
|
+
raise ValueError(f"No available detector engines. Available engines: {available}")
|
|
59
|
+
return detector_cls(configuration)
|
|
41
60
|
|
|
42
61
|
detector_cls = ENGINE_REGISTRY.get(normalized)
|
|
43
62
|
if detector_cls is None:
|
|
@@ -30,8 +30,8 @@ class PyMuPDFDetector(PyPDF2Detector):
|
|
|
30
30
|
def __init__(self, configuration):
|
|
31
31
|
if fitz is None: # pragma: no cover - optional dependency
|
|
32
32
|
raise ValueError(
|
|
33
|
-
"PyMuPDF engine requires the optional 'pymupdf' dependency. Install
|
|
34
|
-
"
|
|
33
|
+
"PyMuPDF engine requires the optional 'pymupdf' dependency. Install 'pymupdf' or add "
|
|
34
|
+
"it to your environment."
|
|
35
35
|
)
|
|
36
36
|
super().__init__(configuration)
|
|
37
37
|
|
|
@@ -111,6 +111,7 @@ class PyMuPDFDetector(PyPDF2Detector):
|
|
|
111
111
|
rect, exclusion, mode = rect_info
|
|
112
112
|
padded = self._PadRect(rect, page.rect, signature.Role, exclusion, mode)
|
|
113
113
|
signature.BoundingBox = self._RectToPdfTuple(padded, page.rect.height)
|
|
114
|
+
signature.RenderType = "drawn"
|
|
114
115
|
if signature.Page is None:
|
|
115
116
|
signature.Page = page_index + 1
|
|
116
117
|
break
|
|
@@ -348,7 +348,7 @@ class PyPDF2Detector(Detector):
|
|
|
348
348
|
return normalized.lower().startswith("im")
|
|
349
349
|
|
|
350
350
|
def _ClassifyAppearance(self, widget: generic.DictionaryObject, page) -> str:
|
|
351
|
-
"""Classify the widget's appearance as drawn
|
|
351
|
+
"""Classify the widget's appearance as drawn or typed."""
|
|
352
352
|
|
|
353
353
|
ap_dict = AsDictionary(widget.get("/AP"))
|
|
354
354
|
if not isinstance(ap_dict, generic.DictionaryObject):
|
|
@@ -356,7 +356,7 @@ class PyPDF2Detector(Detector):
|
|
|
356
356
|
normal = ap_dict.get("/N")
|
|
357
357
|
streams = self._ExtractAppearanceStreams(normal)
|
|
358
358
|
if not streams:
|
|
359
|
-
return "
|
|
359
|
+
return "typed"
|
|
360
360
|
|
|
361
361
|
has_text = False
|
|
362
362
|
has_vector = False
|
|
@@ -384,13 +384,11 @@ class PyPDF2Detector(Detector):
|
|
|
384
384
|
has_image = True
|
|
385
385
|
break
|
|
386
386
|
|
|
387
|
-
if has_image and (has_text or has_vector):
|
|
388
|
-
return "hybrid"
|
|
389
387
|
if has_image:
|
|
390
388
|
return "drawn"
|
|
391
389
|
if has_text or has_vector:
|
|
392
390
|
return "typed"
|
|
393
|
-
return "
|
|
391
|
+
return "typed"
|
|
394
392
|
|
|
395
393
|
# ---- file-wide stream scan (compressed or not)
|
|
396
394
|
def _ScanFileStreamsForVendors(self, file_bytes: bytes) -> tuple[set[str], str]:
|
|
@@ -863,6 +861,7 @@ class PyPDF2Detector(Detector):
|
|
|
863
861
|
Scores={r: sc},
|
|
864
862
|
Evidence=ev + ["pseudo:true"],
|
|
865
863
|
Hint="VendorOrAcroOnly",
|
|
864
|
+
RenderType="typed",
|
|
866
865
|
)
|
|
867
866
|
)
|
|
868
867
|
|
|
@@ -903,6 +902,7 @@ class PyPDF2Detector(Detector):
|
|
|
903
902
|
Scores={role: score} if score > 0 else {},
|
|
904
903
|
Evidence=ev + ["pseudo:true"],
|
|
905
904
|
Hint="VendorOrAcroOnly",
|
|
905
|
+
RenderType="typed",
|
|
906
906
|
)
|
|
907
907
|
)
|
|
908
908
|
|
|
@@ -1055,6 +1055,7 @@ class PyPDF2Detector(Detector):
|
|
|
1055
1055
|
Scores=scores,
|
|
1056
1056
|
Evidence=evidence,
|
|
1057
1057
|
Hint=f"AcroSig:{fname}" if fname else "AcroSig",
|
|
1058
|
+
RenderType="typed",
|
|
1058
1059
|
)
|
|
1059
1060
|
)
|
|
1060
1061
|
|
|
@@ -1120,6 +1121,7 @@ class PyPDF2Detector(Detector):
|
|
|
1120
1121
|
Scores=dict(scores),
|
|
1121
1122
|
Evidence=evidence + ["pseudo:true"],
|
|
1122
1123
|
Hint="VendorOrAcroOnly",
|
|
1124
|
+
RenderType="typed",
|
|
1123
1125
|
)
|
|
1124
1126
|
)
|
|
1125
1127
|
|
|
@@ -17,9 +17,10 @@ class Signature:
|
|
|
17
17
|
Scores: dict[str, int]
|
|
18
18
|
Evidence: list[str]
|
|
19
19
|
Hint: str
|
|
20
|
-
RenderType: str = "
|
|
20
|
+
RenderType: str = "typed"
|
|
21
21
|
BoundingBox: tuple[float, float, float, float] | None = None
|
|
22
22
|
CropPath: str | None = None
|
|
23
|
+
CropBytes: str | None = None
|
|
23
24
|
|
|
24
25
|
def to_dict(self) -> dict[str, Any]:
|
|
25
26
|
"""Return the legacy snake_case representation used in JSON payloads."""
|
|
@@ -35,4 +36,5 @@ class Signature:
|
|
|
35
36
|
"render_type": self.RenderType,
|
|
36
37
|
"bounding_box": list(self.BoundingBox) if self.BoundingBox else None,
|
|
37
38
|
"crop_path": self.CropPath,
|
|
39
|
+
"crop_bytes": self.CropBytes,
|
|
38
40
|
}
|