sigdetect 0.3.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import warnings
5
6
  from typing import TYPE_CHECKING, Type
6
7
 
7
8
  from .base_detector import Detector
@@ -21,10 +22,13 @@ ENGINE_REGISTRY: dict[str, Type[Detector]] = {
21
22
  ENGINE_REGISTRY.setdefault("pypdf", PyPDF2Detector)
22
23
 
23
24
  try: # pragma: no cover - optional dependency
24
- from .pymupdf_engine import PyMuPDFDetector # type: ignore
25
+ from .pymupdf_engine import PyMuPDFDetector
26
+ from .pymupdf_engine import fitz as pymupdf_fitz # type: ignore
25
27
 
26
- if getattr(PyMuPDFDetector, "Name", None):
28
+ if pymupdf_fitz is not None and getattr(PyMuPDFDetector, "Name", None):
27
29
  ENGINE_REGISTRY[PyMuPDFDetector.Name] = PyMuPDFDetector
30
+ else:
31
+ PyMuPDFDetector = None # type: ignore
28
32
  except Exception:
29
33
  PyMuPDFDetector = None # type: ignore
30
34
 
@@ -32,12 +36,27 @@ except Exception:
32
36
  def BuildDetector(configuration: DetectConfiguration) -> Detector:
33
37
  """Instantiate the configured engine or raise a clear error."""
34
38
 
35
- engine_name = (
36
- getattr(configuration, "Engine", None)
37
- or getattr(configuration, "engine", None)
38
- or PyPDF2Detector.Name
39
- )
40
- normalized = engine_name.lower()
39
+ # Force geometry-capable engine selection (auto prefers PyMuPDF when available).
40
+ engine_name = "auto"
41
+ normalized = str(engine_name).lower()
42
+
43
+ if normalized == "auto":
44
+ detector_cls: Type[Detector] | None = None
45
+ if PyMuPDFDetector is not None:
46
+ detector_cls = (
47
+ ENGINE_REGISTRY.get(getattr(PyMuPDFDetector, "Name", "")) or PyMuPDFDetector
48
+ )
49
+ if detector_cls is None:
50
+ detector_cls = ENGINE_REGISTRY.get(PyPDF2Detector.Name) or ENGINE_REGISTRY.get("pypdf")
51
+ warnings.warn(
52
+ "Engine 'auto' falling back to 'pypdf2' because PyMuPDF is unavailable",
53
+ RuntimeWarning,
54
+ stacklevel=2,
55
+ )
56
+ if detector_cls is None:
57
+ available = ", ".join(sorted(ENGINE_REGISTRY)) or "<none>"
58
+ raise ValueError(f"No available detector engines. Available engines: {available}")
59
+ return detector_cls(configuration)
41
60
 
42
61
  detector_cls = ENGINE_REGISTRY.get(normalized)
43
62
  if detector_cls is None:
@@ -30,8 +30,8 @@ class PyMuPDFDetector(PyPDF2Detector):
30
30
  def __init__(self, configuration):
31
31
  if fitz is None: # pragma: no cover - optional dependency
32
32
  raise ValueError(
33
- "PyMuPDF engine requires the optional 'pymupdf' dependency. Install via 'pip install "
34
- "sigdetect[pymupdf]' or add pymupdf to your environment."
33
+ "PyMuPDF engine requires the optional 'pymupdf' dependency. Install 'pymupdf' or add "
34
+ "it to your environment."
35
35
  )
36
36
  super().__init__(configuration)
37
37
 
@@ -111,6 +111,7 @@ class PyMuPDFDetector(PyPDF2Detector):
111
111
  rect, exclusion, mode = rect_info
112
112
  padded = self._PadRect(rect, page.rect, signature.Role, exclusion, mode)
113
113
  signature.BoundingBox = self._RectToPdfTuple(padded, page.rect.height)
114
+ signature.RenderType = "drawn"
114
115
  if signature.Page is None:
115
116
  signature.Page = page_index + 1
116
117
  break
@@ -348,7 +348,7 @@ class PyPDF2Detector(Detector):
348
348
  return normalized.lower().startswith("im")
349
349
 
350
350
  def _ClassifyAppearance(self, widget: generic.DictionaryObject, page) -> str:
351
- """Classify the widget's appearance as drawn/typed/hybrid/unknown."""
351
+ """Classify the widget's appearance as drawn or typed."""
352
352
 
353
353
  ap_dict = AsDictionary(widget.get("/AP"))
354
354
  if not isinstance(ap_dict, generic.DictionaryObject):
@@ -356,7 +356,7 @@ class PyPDF2Detector(Detector):
356
356
  normal = ap_dict.get("/N")
357
357
  streams = self._ExtractAppearanceStreams(normal)
358
358
  if not streams:
359
- return "unknown"
359
+ return "typed"
360
360
 
361
361
  has_text = False
362
362
  has_vector = False
@@ -384,13 +384,11 @@ class PyPDF2Detector(Detector):
384
384
  has_image = True
385
385
  break
386
386
 
387
- if has_image and (has_text or has_vector):
388
- return "hybrid"
389
387
  if has_image:
390
388
  return "drawn"
391
389
  if has_text or has_vector:
392
390
  return "typed"
393
- return "unknown"
391
+ return "typed"
394
392
 
395
393
  # ---- file-wide stream scan (compressed or not)
396
394
  def _ScanFileStreamsForVendors(self, file_bytes: bytes) -> tuple[set[str], str]:
@@ -863,6 +861,7 @@ class PyPDF2Detector(Detector):
863
861
  Scores={r: sc},
864
862
  Evidence=ev + ["pseudo:true"],
865
863
  Hint="VendorOrAcroOnly",
864
+ RenderType="typed",
866
865
  )
867
866
  )
868
867
 
@@ -903,6 +902,7 @@ class PyPDF2Detector(Detector):
903
902
  Scores={role: score} if score > 0 else {},
904
903
  Evidence=ev + ["pseudo:true"],
905
904
  Hint="VendorOrAcroOnly",
905
+ RenderType="typed",
906
906
  )
907
907
  )
908
908
 
@@ -1055,6 +1055,7 @@ class PyPDF2Detector(Detector):
1055
1055
  Scores=scores,
1056
1056
  Evidence=evidence,
1057
1057
  Hint=f"AcroSig:{fname}" if fname else "AcroSig",
1058
+ RenderType="typed",
1058
1059
  )
1059
1060
  )
1060
1061
 
@@ -1120,6 +1121,7 @@ class PyPDF2Detector(Detector):
1120
1121
  Scores=dict(scores),
1121
1122
  Evidence=evidence + ["pseudo:true"],
1122
1123
  Hint="VendorOrAcroOnly",
1124
+ RenderType="typed",
1123
1125
  )
1124
1126
  )
1125
1127
 
@@ -17,9 +17,10 @@ class Signature:
17
17
  Scores: dict[str, int]
18
18
  Evidence: list[str]
19
19
  Hint: str
20
- RenderType: str = "unknown"
20
+ RenderType: str = "typed"
21
21
  BoundingBox: tuple[float, float, float, float] | None = None
22
22
  CropPath: str | None = None
23
+ CropBytes: str | None = None
23
24
 
24
25
  def to_dict(self) -> dict[str, Any]:
25
26
  """Return the legacy snake_case representation used in JSON payloads."""
@@ -35,4 +36,5 @@ class Signature:
35
36
  "render_type": self.RenderType,
36
37
  "bounding_box": list(self.BoundingBox) if self.BoundingBox else None,
37
38
  "crop_path": self.CropPath,
39
+ "crop_bytes": self.CropBytes,
38
40
  }