sigdetect 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
sigdetect/__init__.py CHANGED
@@ -21,4 +21,4 @@ try:
21
21
  except PackageNotFoundError: # pragma: no cover
22
22
  __version__ = "0.0.0"
23
23
 
24
- DEFAULT_ENGINE = "pypdf2"
24
+ DEFAULT_ENGINE = "auto"
sigdetect/api.py CHANGED
@@ -10,7 +10,7 @@ from sigdetect.config import DetectConfiguration
10
10
  from sigdetect.cropping import SignatureCrop
11
11
  from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
12
12
 
13
- EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
13
+ EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
14
14
  ProfileName = Literal["hipaa", "retainer"]
15
15
 
16
16
 
@@ -18,7 +18,7 @@ def DetectPdf(
18
18
  pdfPath: str | Path,
19
19
  *,
20
20
  profileName: ProfileName = "hipaa",
21
- engineName: EngineName = "pypdf2",
21
+ engineName: EngineName = "auto",
22
22
  includePseudoSignatures: bool = True,
23
23
  recurseXObjects: bool = True,
24
24
  detector: Detector | None = None,
@@ -43,7 +43,7 @@ def get_detector(
43
43
  *,
44
44
  pdfRoot: str | Path | None = None,
45
45
  profileName: ProfileName = "hipaa",
46
- engineName: EngineName = "pypdf2",
46
+ engineName: EngineName = "auto",
47
47
  includePseudoSignatures: bool = True,
48
48
  recurseXObjects: bool = True,
49
49
  outputDirectory: str | Path | None = None,
@@ -200,7 +200,9 @@ def CropSignatureImages(
200
200
  outputDirectory: str | Path,
201
201
  dpi: int = 200,
202
202
  returnBytes: Literal[False] = False,
203
- ) -> list[Path]: ...
203
+ saveToDisk: bool = True,
204
+ ) -> list[Path]:
205
+ ...
204
206
 
205
207
 
206
208
  @overload
@@ -211,7 +213,9 @@ def CropSignatureImages(
211
213
  outputDirectory: str | Path,
212
214
  dpi: int,
213
215
  returnBytes: Literal[True],
214
- ) -> list[SignatureCrop]: ...
216
+ saveToDisk: bool,
217
+ ) -> list[SignatureCrop]:
218
+ ...
215
219
 
216
220
 
217
221
  def CropSignatureImages(
@@ -221,12 +225,14 @@ def CropSignatureImages(
221
225
  outputDirectory: str | Path,
222
226
  dpi: int = 200,
223
227
  returnBytes: bool = False,
228
+ saveToDisk: bool = True,
224
229
  ) -> list[Path] | list[SignatureCrop]:
225
230
  """Crop detected signature regions to PNG files.
226
231
 
227
232
  Accepts either a :class:`FileResult` instance or the ``dict`` returned by
228
233
  :func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.
229
- Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop.
234
+ Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop. Set
235
+ ``saveToDisk=False`` to skip writing PNG files while still returning in-memory data.
230
236
  """
231
237
 
232
238
  from sigdetect.cropping import crop_signatures
@@ -238,6 +244,7 @@ def CropSignatureImages(
238
244
  output_dir=Path(outputDirectory),
239
245
  dpi=dpi,
240
246
  return_bytes=returnBytes,
247
+ save_files=saveToDisk,
241
248
  )
242
249
  if original_dict is not None:
243
250
  original_dict.clear()
sigdetect/cli.py CHANGED
@@ -15,6 +15,7 @@ from .cropping import SignatureCroppingUnavailable, crop_signatures
15
15
  from .detector import BuildDetector, FileResult
16
16
  from .eda import RunExploratoryAnalysis
17
17
  from .logging_setup import ConfigureLogging
18
+ from .wet_detection import apply_wet_detection
18
19
 
19
20
  Logger = ConfigureLogging()
20
21
 
@@ -72,6 +73,33 @@ def Detect(
72
73
  help="Rendering DPI for signature crops",
73
74
  show_default=False,
74
75
  ),
76
+ detectWetSignatures: bool | None = typer.Option(
77
+ None,
78
+ "--detect-wet/--no-detect-wet",
79
+ help="Run OCR-backed wet signature detection (requires PyMuPDF + Tesseract)",
80
+ show_default=False,
81
+ ),
82
+ wetOcrDpi: int | None = typer.Option(
83
+ None,
84
+ "--wet-ocr-dpi",
85
+ min=72,
86
+ max=600,
87
+ help="Rendering DPI for OCR pages (wet detection)",
88
+ show_default=False,
89
+ ),
90
+ wetOcrLanguages: str | None = typer.Option(
91
+ None,
92
+ "--wet-ocr-languages",
93
+ help="Tesseract language packs for OCR (e.g., 'eng' or 'eng+spa')",
94
+ ),
95
+ wetPrecisionThreshold: float | None = typer.Option(
96
+ None,
97
+ "--wet-precision-threshold",
98
+ min=0.0,
99
+ max=1.0,
100
+ help="Minimum wet-signature confidence (0-1) to accept a candidate",
101
+ show_default=False,
102
+ ),
75
103
  ) -> None:
76
104
  """Run detection for the configured directory and emit ``results.json``."""
77
105
 
@@ -89,6 +117,14 @@ def Detect(
89
117
  overrides["CropOutputDirectory"] = cropDirectory
90
118
  if cropDpi is not None:
91
119
  overrides["CropImageDpi"] = cropDpi
120
+ if detectWetSignatures is not None:
121
+ overrides["DetectWetSignatures"] = detectWetSignatures
122
+ if wetOcrDpi is not None:
123
+ overrides["WetOcrDpi"] = wetOcrDpi
124
+ if wetOcrLanguages is not None:
125
+ overrides["WetOcrLanguages"] = wetOcrLanguages
126
+ if wetPrecisionThreshold is not None:
127
+ overrides["WetPrecisionThreshold"] = wetPrecisionThreshold
92
128
  if overrides:
93
129
  configuration = configuration.model_copy(update=overrides)
94
130
  configuration = FinalizeConfiguration(configuration)
@@ -182,6 +218,7 @@ def Detect(
182
218
 
183
219
  def _process(pdf_path: Path) -> None:
184
220
  file_result = detector.Detect(pdf_path)
221
+ apply_wet_detection(pdf_path, configuration, file_result, logger=Logger)
185
222
  _append_result(file_result, pdf_path)
186
223
 
187
224
  try:
sigdetect/config.py CHANGED
@@ -10,7 +10,7 @@ from typing import Literal
10
10
  import yaml
11
11
  from pydantic import BaseModel, ConfigDict, Field, field_validator
12
12
 
13
- EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
13
+ EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
14
14
  ProfileName = Literal["hipaa", "retainer"]
15
15
 
16
16
 
@@ -25,13 +25,19 @@ class DetectConfiguration(BaseModel):
25
25
 
26
26
  PdfRoot: Path = Field(default=Path("hipaa_results"), alias="pdf_root")
27
27
  OutputDirectory: Path | None = Field(default=Path("out"), alias="out_dir")
28
- Engine: EngineName = Field(default="pypdf2", alias="engine")
28
+ Engine: EngineName = Field(default="auto", alias="engine")
29
29
  Profile: ProfileName = Field(default="hipaa", alias="profile")
30
30
  PseudoSignatures: bool = Field(default=True, alias="pseudo_signatures")
31
31
  RecurseXObjects: bool = Field(default=True, alias="recurse_xobjects")
32
- CropSignatures: bool = Field(default=False, alias="crop_signatures")
32
+ CropSignatures: bool = Field(default=True, alias="crop_signatures")
33
33
  CropOutputDirectory: Path | None = Field(default=None, alias="crop_output_dir")
34
34
  CropImageDpi: int = Field(default=200, alias="crop_image_dpi", ge=72, le=600)
35
+ DetectWetSignatures: bool = Field(default=True, alias="detect_wet_signatures")
36
+ WetOcrDpi: int = Field(default=200, alias="wet_ocr_dpi", ge=72, le=600)
37
+ WetOcrLanguages: str = Field(default="eng", alias="wet_ocr_languages")
38
+ WetPrecisionThreshold: float = Field(
39
+ default=0.82, alias="wet_precision_threshold", ge=0.0, le=1.0
40
+ )
35
41
 
36
42
  @field_validator("PdfRoot", "OutputDirectory", "CropOutputDirectory", mode="before")
37
43
  @classmethod
@@ -85,6 +91,22 @@ class DetectConfiguration(BaseModel):
85
91
  def crop_image_dpi(self) -> int: # pragma: no cover - simple passthrough
86
92
  return self.CropImageDpi
87
93
 
94
+ @property
95
+ def detect_wet_signatures(self) -> bool: # pragma: no cover - simple passthrough
96
+ return self.DetectWetSignatures
97
+
98
+ @property
99
+ def wet_ocr_dpi(self) -> int: # pragma: no cover - simple passthrough
100
+ return self.WetOcrDpi
101
+
102
+ @property
103
+ def wet_ocr_languages(self) -> str: # pragma: no cover - simple passthrough
104
+ return self.WetOcrLanguages
105
+
106
+ @property
107
+ def wet_precision_threshold(self) -> float: # pragma: no cover - simple passthrough
108
+ return self.WetPrecisionThreshold
109
+
88
110
 
89
111
  def LoadConfiguration(path: Path | None) -> DetectConfiguration:
90
112
  """Load configuration from ``path`` while applying environment overrides.
@@ -108,6 +130,10 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
108
130
  env_crop = os.getenv("SIGDETECT_CROP_SIGNATURES")
109
131
  env_crop_dir = os.getenv("SIGDETECT_CROP_DIR")
110
132
  env_crop_dpi = os.getenv("SIGDETECT_CROP_DPI")
133
+ env_detect_wet = os.getenv("SIGDETECT_DETECT_WET")
134
+ env_wet_dpi = os.getenv("SIGDETECT_WET_OCR_DPI")
135
+ env_wet_lang = os.getenv("SIGDETECT_WET_LANGUAGES")
136
+ env_wet_precision = os.getenv("SIGDETECT_WET_PRECISION")
111
137
 
112
138
  raw_data: dict[str, object] = {}
113
139
  if path and Path(path).exists():
@@ -133,6 +159,20 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
133
159
  if env_crop_dpi:
134
160
  with suppress(ValueError):
135
161
  raw_data["crop_image_dpi"] = int(env_crop_dpi)
162
+ if env_detect_wet is not None:
163
+ lowered = env_detect_wet.lower()
164
+ if lowered in {"1", "true", "yes", "on"}:
165
+ raw_data["detect_wet_signatures"] = True
166
+ elif lowered in {"0", "false", "no", "off"}:
167
+ raw_data["detect_wet_signatures"] = False
168
+ if env_wet_dpi:
169
+ with suppress(ValueError):
170
+ raw_data["wet_ocr_dpi"] = int(env_wet_dpi)
171
+ if env_wet_lang:
172
+ raw_data["wet_ocr_languages"] = env_wet_lang
173
+ if env_wet_precision:
174
+ with suppress(ValueError):
175
+ raw_data["wet_precision_threshold"] = float(env_wet_precision)
136
176
 
137
177
  configuration = DetectConfiguration(**raw_data)
138
178
  return FinalizeConfiguration(configuration)
sigdetect/cropping.py CHANGED
@@ -28,6 +28,7 @@ class SignatureCrop:
28
28
  path: Path
29
29
  image_bytes: bytes
30
30
  signature: Signature
31
+ saved_to_disk: bool = True
31
32
 
32
33
 
33
34
  @overload
@@ -39,7 +40,9 @@ def crop_signatures(
39
40
  dpi: int = 200,
40
41
  logger: logging.Logger | None = None,
41
42
  return_bytes: Literal[False] = False,
42
- ) -> list[Path]: ...
43
+ save_files: bool = True,
44
+ ) -> list[Path]:
45
+ ...
43
46
 
44
47
 
45
48
  @overload
@@ -50,8 +53,10 @@ def crop_signatures(
50
53
  output_dir: Path,
51
54
  dpi: int = 200,
52
55
  logger: logging.Logger | None = None,
53
- return_bytes: Literal[True] = True,
54
- ) -> list[SignatureCrop]: ...
56
+ return_bytes: Literal[True],
57
+ save_files: bool = True,
58
+ ) -> list[SignatureCrop]:
59
+ ...
55
60
 
56
61
 
57
62
  def crop_signatures(
@@ -62,27 +67,32 @@ def crop_signatures(
62
67
  dpi: int = 200,
63
68
  logger: logging.Logger | None = None,
64
69
  return_bytes: bool = False,
70
+ save_files: bool = True,
65
71
  ) -> list[Path] | list[SignatureCrop]:
66
72
  """Render each signature bounding box to a PNG image using PyMuPDF.
67
73
 
68
74
  Set ``return_bytes=True`` to collect in-memory PNG bytes for each crop while also writing
69
- the files to ``output_dir``.
75
+ the files to ``output_dir``. Set ``save_files=False`` to skip writing PNGs to disk.
70
76
  """
71
77
 
72
78
  if fitz is None: # pragma: no cover - exercised when dependency absent
73
79
  raise SignatureCroppingUnavailable(
74
80
  "PyMuPDF is required for PNG crops. Install 'pymupdf' or 'sigdetect[pymupdf]'."
75
81
  )
82
+ if not save_files and not return_bytes:
83
+ raise ValueError("At least one of save_files or return_bytes must be True")
76
84
 
77
85
  pdf_path = Path(pdf_path)
78
86
  output_dir = Path(output_dir)
79
- output_dir.mkdir(parents=True, exist_ok=True)
87
+ if save_files:
88
+ output_dir.mkdir(parents=True, exist_ok=True)
80
89
  generated_paths: list[Path] = []
81
90
  generated_crops: list[SignatureCrop] = []
82
91
 
83
92
  with fitz.open(pdf_path) as document: # type: ignore[attr-defined]
84
93
  per_document_dir = output_dir / pdf_path.stem
85
- per_document_dir.mkdir(parents=True, exist_ok=True)
94
+ if save_files:
95
+ per_document_dir.mkdir(parents=True, exist_ok=True)
86
96
  scale = dpi / 72.0
87
97
  matrix = fitz.Matrix(scale, scale)
88
98
 
@@ -113,7 +123,8 @@ def crop_signatures(
113
123
  try:
114
124
  image_bytes: bytes | None = None
115
125
  pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
116
- pixmap.save(destination)
126
+ if save_files:
127
+ pixmap.save(destination)
117
128
  if return_bytes:
118
129
  image_bytes = pixmap.tobytes("png")
119
130
  except Exception as exc: # pragma: no cover - defensive
@@ -129,8 +140,9 @@ def crop_signatures(
129
140
  )
130
141
  continue
131
142
 
132
- signature.CropPath = str(destination)
133
- generated_paths.append(destination)
143
+ if save_files:
144
+ signature.CropPath = str(destination)
145
+ generated_paths.append(destination)
134
146
  if return_bytes:
135
147
  if image_bytes is None: # pragma: no cover - defensive
136
148
  continue
@@ -139,6 +151,7 @@ def crop_signatures(
139
151
  path=destination,
140
152
  image_bytes=image_bytes,
141
153
  signature=signature,
154
+ saved_to_disk=save_files,
142
155
  )
143
156
  )
144
157
 
@@ -2,6 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import warnings
5
6
  from typing import TYPE_CHECKING, Type
6
7
 
7
8
  from .base_detector import Detector
@@ -37,7 +38,23 @@ def BuildDetector(configuration: DetectConfiguration) -> Detector:
37
38
  or getattr(configuration, "engine", None)
38
39
  or PyPDF2Detector.Name
39
40
  )
40
- normalized = engine_name.lower()
41
+ normalized = str(engine_name).lower()
42
+
43
+ if normalized == "auto":
44
+ detector_cls: Type[Detector] | None = None
45
+ if PyMuPDFDetector is not None:
46
+ detector_cls = ENGINE_REGISTRY.get(getattr(PyMuPDFDetector, "Name", "")) or PyMuPDFDetector
47
+ if detector_cls is None:
48
+ detector_cls = ENGINE_REGISTRY.get(PyPDF2Detector.Name) or ENGINE_REGISTRY.get("pypdf")
49
+ warnings.warn(
50
+ "Engine 'auto' falling back to 'pypdf2' because PyMuPDF is unavailable",
51
+ RuntimeWarning,
52
+ stacklevel=2,
53
+ )
54
+ if detector_cls is None:
55
+ available = ", ".join(sorted(ENGINE_REGISTRY)) or "<none>"
56
+ raise ValueError(f"No available detector engines. Available engines: {available}")
57
+ return detector_cls(configuration)
41
58
 
42
59
  detector_cls = ENGINE_REGISTRY.get(normalized)
43
60
  if detector_cls is None:
@@ -111,6 +111,7 @@ class PyMuPDFDetector(PyPDF2Detector):
111
111
  rect, exclusion, mode = rect_info
112
112
  padded = self._PadRect(rect, page.rect, signature.Role, exclusion, mode)
113
113
  signature.BoundingBox = self._RectToPdfTuple(padded, page.rect.height)
114
+ signature.RenderType = "drawn"
114
115
  if signature.Page is None:
115
116
  signature.Page = page_index + 1
116
117
  break
@@ -348,7 +348,7 @@ class PyPDF2Detector(Detector):
348
348
  return normalized.lower().startswith("im")
349
349
 
350
350
  def _ClassifyAppearance(self, widget: generic.DictionaryObject, page) -> str:
351
- """Classify the widget's appearance as drawn/typed/hybrid/unknown."""
351
+ """Classify the widget's appearance as drawn or typed."""
352
352
 
353
353
  ap_dict = AsDictionary(widget.get("/AP"))
354
354
  if not isinstance(ap_dict, generic.DictionaryObject):
@@ -356,7 +356,7 @@ class PyPDF2Detector(Detector):
356
356
  normal = ap_dict.get("/N")
357
357
  streams = self._ExtractAppearanceStreams(normal)
358
358
  if not streams:
359
- return "unknown"
359
+ return "typed"
360
360
 
361
361
  has_text = False
362
362
  has_vector = False
@@ -384,13 +384,11 @@ class PyPDF2Detector(Detector):
384
384
  has_image = True
385
385
  break
386
386
 
387
- if has_image and (has_text or has_vector):
388
- return "hybrid"
389
387
  if has_image:
390
388
  return "drawn"
391
389
  if has_text or has_vector:
392
390
  return "typed"
393
- return "unknown"
391
+ return "typed"
394
392
 
395
393
  # ---- file-wide stream scan (compressed or not)
396
394
  def _ScanFileStreamsForVendors(self, file_bytes: bytes) -> tuple[set[str], str]:
@@ -863,6 +861,7 @@ class PyPDF2Detector(Detector):
863
861
  Scores={r: sc},
864
862
  Evidence=ev + ["pseudo:true"],
865
863
  Hint="VendorOrAcroOnly",
864
+ RenderType="typed",
866
865
  )
867
866
  )
868
867
 
@@ -903,6 +902,7 @@ class PyPDF2Detector(Detector):
903
902
  Scores={role: score} if score > 0 else {},
904
903
  Evidence=ev + ["pseudo:true"],
905
904
  Hint="VendorOrAcroOnly",
905
+ RenderType="typed",
906
906
  )
907
907
  )
908
908
 
@@ -1055,6 +1055,7 @@ class PyPDF2Detector(Detector):
1055
1055
  Scores=scores,
1056
1056
  Evidence=evidence,
1057
1057
  Hint=f"AcroSig:{fname}" if fname else "AcroSig",
1058
+ RenderType="typed",
1058
1059
  )
1059
1060
  )
1060
1061
 
@@ -1120,6 +1121,7 @@ class PyPDF2Detector(Detector):
1120
1121
  Scores=dict(scores),
1121
1122
  Evidence=evidence + ["pseudo:true"],
1122
1123
  Hint="VendorOrAcroOnly",
1124
+ RenderType="typed",
1123
1125
  )
1124
1126
  )
1125
1127
 
@@ -17,7 +17,7 @@ class Signature:
17
17
  Scores: dict[str, int]
18
18
  Evidence: list[str]
19
19
  Hint: str
20
- RenderType: str = "unknown"
20
+ RenderType: str = "typed"
21
21
  BoundingBox: tuple[float, float, float, float] | None = None
22
22
  CropPath: str | None = None
23
23
 
@@ -0,0 +1,499 @@
1
+ """Wet signature detection via OCR-backed heuristics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import re
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+ from typing import Iterable, Sequence
10
+
11
+ from PIL import Image
12
+
13
+ from sigdetect.config import DetectConfiguration
14
+ from sigdetect.detector.file_result_model import FileResult
15
+ from sigdetect.detector.signature_model import Signature
16
+
17
+ try: # pragma: no cover - optional dependency
18
+ import fitz # type: ignore
19
+ except Exception: # pragma: no cover - optional dependency
20
+ fitz = None # type: ignore[misc]
21
+
22
+ try: # pragma: no cover - optional dependency
23
+ import pytesseract # type: ignore
24
+ from pytesseract import Output as TesseractOutput
25
+ except Exception: # pragma: no cover - optional dependency
26
+ pytesseract = None # type: ignore[assignment]
27
+ TesseractOutput = None # type: ignore[assignment]
28
+
29
+
30
+ LOGGER = logging.getLogger("sigdetect.wet")
31
+
32
+ SIGNATURE_PATTERNS: tuple[re.Pattern[str], ...] = (
33
+ re.compile(r"\bsignature\b"),
34
+ re.compile(r"\bsigned\b"),
35
+ re.compile(r"\bsign\b"),
36
+ re.compile(r"\bsignature\s+of\b"),
37
+ re.compile(r"\bsignature\s*:"),
38
+ re.compile(r"\bsignature\s*-"),
39
+ re.compile(r"\bby:\b"),
40
+ )
41
+
42
+ ROLE_KEYWORDS: dict[str, tuple[str, ...]] = {
43
+ "client": ("client", "consumer", "claimant"),
44
+ "firm": ("firm", "attorney", "counsel", "by:", "esq", "law"),
45
+ "patient": ("patient", "self", "plaintiff"),
46
+ "representative": ("guardian", "representative", "parent", "poa"),
47
+ "attorney": ("attorney", "counsel", "lawyer"),
48
+ }
49
+
50
+
51
+ class WetDetectionUnavailable(RuntimeError):
52
+ """Raised when OCR-backed detection cannot run."""
53
+
54
+
55
+ @dataclass
56
+ class OcrLine:
57
+ """Structured OCR line extracted from pytesseract."""
58
+
59
+ text: str
60
+ confidence: float
61
+ left: int
62
+ top: int
63
+ right: int
64
+ bottom: int
65
+
66
+
67
+ def should_run_wet_pipeline(file_result: FileResult) -> bool:
68
+ """Return ``True`` when the OCR pipeline should run for ``file_result``."""
69
+
70
+ return (
71
+ (not file_result.ElectronicSignatureFound or file_result.SignatureCount == 0)
72
+ or (bool(file_result.ScannedPdf) and not file_result.ElectronicSignatureFound)
73
+ or bool(file_result.MixedContent)
74
+ )
75
+
76
+
77
+ def apply_wet_detection(
78
+ pdf_path: Path,
79
+ configuration: DetectConfiguration,
80
+ file_result: FileResult,
81
+ *,
82
+ logger: logging.Logger | None = None,
83
+ ) -> bool:
84
+ """Augment ``file_result`` with OCR-detected wet signatures when possible."""
85
+
86
+ if not configuration.DetectWetSignatures:
87
+ return False
88
+ if not should_run_wet_pipeline(file_result):
89
+ return False
90
+
91
+ try:
92
+ _ensure_dependencies()
93
+ except WetDetectionUnavailable as exc:
94
+ _mark_manual_review(file_result, str(exc))
95
+ if logger:
96
+ logger.warning("Wet detection unavailable", extra={"error": str(exc)})
97
+ return False
98
+
99
+ try:
100
+ added = _detect(pdf_path, configuration, file_result, logger=logger)
101
+ if not added:
102
+ _mark_manual_review(file_result, "NoHighConfidenceWetSignature")
103
+ return added
104
+ except Exception as exc: # pragma: no cover - defensive
105
+ _mark_manual_review(file_result, "WetDetectionError")
106
+ if logger:
107
+ logger.warning("Wet detection failed", extra={"error": str(exc)})
108
+ return False
109
+
110
+
111
+ def _detect(
112
+ pdf_path: Path,
113
+ configuration: DetectConfiguration,
114
+ file_result: FileResult,
115
+ *,
116
+ logger: logging.Logger | None = None,
117
+ ) -> bool:
118
+ if fitz is None or pytesseract is None:
119
+ raise WetDetectionUnavailable("PyMuPDF or pytesseract not available")
120
+
121
+ document = fitz.open(pdf_path) # type: ignore[attr-defined]
122
+ try:
123
+ new_signatures: list[Signature] = []
124
+ matrix = fitz.Matrix(configuration.WetOcrDpi / 72.0, configuration.WetOcrDpi / 72.0)
125
+ for page_index in range(document.page_count):
126
+ page = document.load_page(page_index)
127
+ pixmap = page.get_pixmap(matrix=matrix, alpha=False)
128
+ image = _pixmap_to_image(pixmap)
129
+ ocr_lines = _extract_ocr_lines(image, configuration.WetOcrLanguages)
130
+ candidates = list(
131
+ _build_candidates(
132
+ ocr_lines,
133
+ image=image,
134
+ page_rect=page.rect,
135
+ pix_width=pixmap.width,
136
+ pix_height=pixmap.height,
137
+ scale=configuration.WetOcrDpi / 72.0,
138
+ )
139
+ )
140
+ candidates.extend(_image_candidates(page))
141
+ accepted = [
142
+ candidate
143
+ for candidate in candidates
144
+ if candidate.Score >= configuration.WetPrecisionThreshold
145
+ ]
146
+ if logger:
147
+ logger.debug(
148
+ "Wet detection page summary",
149
+ extra={
150
+ "pdf": pdf_path.name,
151
+ "page": page_index + 1,
152
+ "candidates": len(candidates),
153
+ "accepted": len(accepted),
154
+ },
155
+ )
156
+ new_signatures.extend(_to_signatures(accepted, page_index + 1))
157
+ if not new_signatures:
158
+ return False
159
+
160
+ file_result.Signatures.extend(new_signatures)
161
+ _refresh_metadata(file_result)
162
+ return True
163
+ finally:
164
+ document.close()
165
+
166
+
167
+ def _ensure_dependencies() -> None:
168
+ if fitz is None:
169
+ raise WetDetectionUnavailable("PyMuPDF is required for wet detection (install 'pymupdf').")
170
+ if pytesseract is None or TesseractOutput is None:
171
+ raise WetDetectionUnavailable(
172
+ "pytesseract is required for wet detection and depends on the Tesseract OCR binary."
173
+ )
174
+
175
+
176
+ def _pixmap_to_image(pixmap) -> Image.Image:
177
+ mode = "RGB"
178
+ if pixmap.alpha:
179
+ mode = "RGBA"
180
+ image = Image.frombytes(mode, [pixmap.width, pixmap.height], pixmap.samples)
181
+ if mode == "RGBA":
182
+ image = image.convert("RGB")
183
+ return image
184
+
185
+
186
+ def _extract_ocr_lines(image: Image.Image, languages: str) -> list[OcrLine]:
187
+ if pytesseract is None or TesseractOutput is None:
188
+ raise WetDetectionUnavailable("pytesseract unavailable")
189
+
190
+ try:
191
+ data = pytesseract.image_to_data(image, lang=languages, output_type=TesseractOutput.DICT)
192
+ except Exception as exc: # pragma: no cover - passthrough to manual review
193
+ raise WetDetectionUnavailable(f"OCR failed: {exc}") from exc
194
+ total = len(data.get("text", []))
195
+ lines: dict[tuple[int, int, int], OcrLine] = {}
196
+ for idx in range(total):
197
+ text = (data["text"][idx] or "").strip()
198
+ if not text:
199
+ continue
200
+ conf_raw = float(data["conf"][idx])
201
+ if conf_raw <= 0:
202
+ continue
203
+ key = (data["block_num"][idx], data["par_num"][idx], data["line_num"][idx])
204
+ left = int(data["left"][idx])
205
+ top = int(data["top"][idx])
206
+ width = int(data["width"][idx])
207
+ height = int(data["height"][idx])
208
+ right = left + width
209
+ bottom = top + height
210
+ existing = lines.get(key)
211
+ if existing is None:
212
+ lines[key] = OcrLine(
213
+ text=text,
214
+ confidence=conf_raw / 100.0,
215
+ left=left,
216
+ top=top,
217
+ right=right,
218
+ bottom=bottom,
219
+ )
220
+ else:
221
+ existing.text = f"{existing.text} {text}"
222
+ existing.confidence = min(1.0, (existing.confidence + conf_raw / 100.0) / 2.0)
223
+ existing.left = min(existing.left, left)
224
+ existing.top = min(existing.top, top)
225
+ existing.right = max(existing.right, right)
226
+ existing.bottom = max(existing.bottom, bottom)
227
+ return list(lines.values())
228
+
229
+
230
+ @dataclass
231
+ class WetCandidate:
232
+ bbox: tuple[float, float, float, float]
233
+ Role: str
234
+ Score: float
235
+ Evidence: list[str]
236
+
237
+
238
+ def _build_candidates(
239
+ lines: Iterable[OcrLine],
240
+ *,
241
+ image: Image.Image,
242
+ page_rect,
243
+ pix_width: int,
244
+ pix_height: int,
245
+ scale: float,
246
+ ) -> Iterable[WetCandidate]:
247
+ for line in lines:
248
+ normalized = line.text.lower()
249
+ if not _has_signature_keyword(normalized):
250
+ continue
251
+ if len(normalized) > 80:
252
+ # Ignore long paragraph-like OCR lines
253
+ continue
254
+ if (line.bottom / pix_height) < 0.4:
255
+ # Ignore lines in the upper section of the page
256
+ continue
257
+ role = _infer_role(normalized)
258
+ stroke_found, stroke_y = _stroke_under_line(image, line)
259
+ bonus = _keyword_bonus(normalized)
260
+ if stroke_found:
261
+ bonus += 0.12
262
+ # Slight positional prior: lines in lower quarter are more likely signatures.
263
+ if (line.bottom / pix_height) > 0.7:
264
+ bonus += 0.05
265
+ confidence = min(1.0, line.confidence + bonus)
266
+ bbox = _expand_bbox(line, page_rect, pix_height, scale, stroke_y=stroke_y)
267
+ yield WetCandidate(
268
+ bbox=bbox,
269
+ Role=role,
270
+ Score=confidence,
271
+ Evidence=[
272
+ f"ocr_line:{line.text.strip()}",
273
+ f"ocr_conf:{confidence:.2f}",
274
+ "wet:true",
275
+ "stroke:yes" if stroke_found else "stroke:no",
276
+ ],
277
+ )
278
+
279
+
280
+ def _infer_role(normalized_text: str) -> str:
281
+ for role, keywords in ROLE_KEYWORDS.items():
282
+ if any(keyword in normalized_text for keyword in keywords):
283
+ return role
284
+ return "unknown"
285
+
286
+
287
+ def _keyword_bonus(normalized_text: str) -> float:
288
+ bonus = 0.0
289
+ if "signature" in normalized_text:
290
+ bonus += 0.05
291
+ if "date" in normalized_text:
292
+ bonus -= 0.02
293
+ if "by:" in normalized_text:
294
+ bonus += 0.03
295
+ return bonus
296
+
297
+
298
+ def _has_signature_keyword(normalized_text: str) -> bool:
299
+ return any(pattern.search(normalized_text) for pattern in SIGNATURE_PATTERNS)
300
+
301
+
302
+ def _expand_bbox(
303
+ line: OcrLine,
304
+ page_rect,
305
+ pix_height: int,
306
+ scale: float,
307
+ *,
308
+ stroke_y: float | None = None,
309
+ ) -> tuple[float, float, float, float]:
310
+ x0 = line.left / scale
311
+ x1 = line.right / scale
312
+ y1 = (pix_height - line.top) / scale
313
+
314
+ pad_x = max(14.0, (x1 - x0) * 0.25)
315
+ left = max(page_rect.x0, x0 - pad_x)
316
+ right = min(page_rect.x1, x1 + pad_x)
317
+
318
+ gap = 14.0
319
+ signature_height = 70.0
320
+ top = min(page_rect.y1, y1 + gap)
321
+ bottom = min(page_rect.y1, top + signature_height)
322
+
323
+ if bottom <= top:
324
+ bottom = min(page_rect.y1, top + signature_height)
325
+
326
+ if stroke_y is not None:
327
+ # Anchor to the detected stroke under the OCR label when available.
328
+ sy = (pix_height - stroke_y) / scale
329
+ if sy < top:
330
+ top = sy
331
+ bottom = max(bottom, sy + signature_height)
332
+
333
+ return (float(left), float(top), float(right), float(bottom))
334
+
335
+
336
+ def _stroke_under_line(image: Image.Image, line: OcrLine) -> tuple[bool, float | None]:
337
+ """Heuristic: look for a dark horizontal stroke beneath the OCR line."""
338
+
339
+ gray = image.convert("L")
340
+ pad_x = 10
341
+ strip_height = 28
342
+ x0 = max(0, line.left - pad_x)
343
+ x1 = min(gray.width, line.right + pad_x)
344
+ y0 = min(gray.height, line.bottom + 2)
345
+ y1 = min(gray.height, y0 + strip_height)
346
+ if x1 <= x0 or y1 <= y0:
347
+ return False, None
348
+
349
+ crop = gray.crop((x0, y0, x1, y1))
350
+ width = crop.width or 1
351
+ max_density = 0.0
352
+ best_row = None
353
+ # Simple density scan: percentage of dark pixels per row.
354
+ threshold = 160
355
+ for row in range(crop.height):
356
+ row_pixels = [crop.getpixel((col, row)) for col in range(width)]
357
+ dark = sum(1 for px in row_pixels if px < threshold)
358
+ density = dark / width
359
+ if density > max_density:
360
+ max_density = density
361
+ best_row = row
362
+ if max_density < 0.32 or best_row is None:
363
+ return False, None
364
+ return True, float(y0 + best_row)
365
+
366
+
367
+ def _image_candidates(page) -> list[WetCandidate]:
368
+ """Heuristic: treat small, wide images near signature areas as wet signatures."""
369
+
370
+ candidates: list[WetCandidate] = []
371
+ page_width = float(page.rect.width)
372
+ page_height = float(page.rect.height)
373
+ page_area = page_width * page_height
374
+ words = page.get_text("words") or []
375
+
376
+ for info in page.get_image_info(xrefs=True) or []:
377
+ rect = info.get("bbox") or info.get("rect")
378
+ if rect is None:
379
+ continue
380
+ if hasattr(rect, "x0"):
381
+ x0, y0, x1, y1 = float(rect.x0), float(rect.y0), float(rect.x1), float(rect.y1)
382
+ elif isinstance(rect, (tuple, list)) and len(rect) == 4:
383
+ x0, y0, x1, y1 = map(float, rect)
384
+ else:
385
+ continue
386
+ width = float(x1 - x0)
387
+ height = float(y1 - y0)
388
+ if width <= 40 or height <= 15:
389
+ # Skip tiny marks/logos
390
+ continue
391
+ aspect = width / height if height else 0.0
392
+ if aspect < 1.6:
393
+ continue
394
+ if (width * height) / page_area > 0.1:
395
+ # Ignore large illustrations/backgrounds
396
+ continue
397
+
398
+ role = _infer_role_nearby(rect, words)
399
+ score = 0.9 if role != "unknown" else 0.84
400
+
401
+ bbox = (x0, float(page_height - y1), x1, float(page_height - y0))
402
+
403
+ evidence = ["image_signature:true"]
404
+ if role != "unknown":
405
+ evidence.append(f"role_hint:{role}")
406
+
407
+ candidates.append(
408
+ WetCandidate(
409
+ bbox=bbox,
410
+ Role=role,
411
+ Score=min(1.0, score),
412
+ Evidence=evidence,
413
+ )
414
+ )
415
+ return candidates
416
+
417
+
418
+ def _infer_role_nearby(rect, words) -> str:
419
+ """Best-effort role inference using text near the image rectangle."""
420
+
421
+ proximity_y = 48.0
422
+ proximity_x = 140.0
423
+ if hasattr(rect, "x0"):
424
+ rx0, ry0, rx1, ry1 = float(rect.x0), float(rect.y0), float(rect.x1), float(rect.y1)
425
+ elif isinstance(rect, (tuple, list)) and len(rect) == 4:
426
+ rx0, ry0, rx1, ry1 = map(float, rect)
427
+ else:
428
+ return "unknown"
429
+
430
+ nearby_tokens: list[str] = []
431
+ for word in words:
432
+ if len(word) < 5:
433
+ continue
434
+ x0, y0, x1, y1, token, *_ = word
435
+ if y1 < ry0 - proximity_y or y0 > ry1 + proximity_y:
436
+ continue
437
+ if x1 < rx0 - proximity_x or x0 > rx1 + proximity_x:
438
+ continue
439
+ nearby_tokens.append(str(token))
440
+ if not nearby_tokens:
441
+ return "unknown"
442
+ normalized = " ".join(nearby_tokens).lower()
443
+ return _infer_role(normalized)
444
+
445
+
446
+ def _needs_wet_enhancement(file_result: FileResult) -> bool:
447
+ """Return True when we should run wet OCR to refine pseudo/unknown signatures."""
448
+
449
+ return False
450
+
451
+
452
+ def _to_signatures(
453
+ candidates: Sequence[WetCandidate],
454
+ page_number: int,
455
+ ) -> list[Signature]:
456
+ signatures: list[Signature] = []
457
+ for candidate in candidates:
458
+ signatures.append(
459
+ Signature(
460
+ Page=page_number,
461
+ FieldName="wet_signature_detected",
462
+ Role=candidate.Role,
463
+ Score=int(round(candidate.Score * 100)),
464
+ Scores={candidate.Role: int(round(candidate.Score * 100))},
465
+ Evidence=candidate.Evidence,
466
+ Hint="WetSignatureOCR",
467
+ RenderType="wet",
468
+ BoundingBox=candidate.bbox,
469
+ )
470
+ )
471
+ return signatures
472
+
473
+
474
+ def _mark_manual_review(file_result: FileResult, reason: str) -> None:
475
+ hints = _split_hints(file_result.Hints)
476
+ hints.add(f"ManualReview:{reason}")
477
+ file_result.Hints = ";".join(sorted(hints)) if hints else file_result.Hints
478
+
479
+
480
+ def _refresh_metadata(file_result: FileResult) -> None:
481
+ file_result.SignatureCount = len(file_result.Signatures)
482
+ signature_pages = sorted({sig.Page for sig in file_result.Signatures if sig.Page})
483
+ file_result.SignaturePages = ",".join(map(str, signature_pages))
484
+ roles = sorted({sig.Role for sig in file_result.Signatures if sig.Role != "unknown"})
485
+ if roles:
486
+ file_result.Roles = ";".join(roles)
487
+ file_result.ElectronicSignatureFound = file_result.SignatureCount > 0
488
+ file_result.MixedContent = (
489
+ file_result.ElectronicSignatureFound and bool(file_result.ScannedPdf)
490
+ )
491
+ hints = _split_hints(file_result.Hints)
492
+ hints |= {sig.Hint for sig in file_result.Signatures if sig.Hint}
493
+ file_result.Hints = ";".join(sorted(hints))
494
+
495
+
496
+ def _split_hints(hints: str | None) -> set[str]:
497
+ if not hints:
498
+ return set()
499
+ return {hint for hint in hints.split(";") if hint}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sigdetect
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: Signature detection and role attribution for PDFs
5
5
  Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
6
6
  License: MIT
@@ -95,14 +95,14 @@ sigdetect detect \
95
95
  ### Notes
96
96
 
97
97
  - The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
98
- - `--engine` supports **pypdf2** (default); a **pymupdf** engine placeholder exists and may be included in a future build.
98
+ - `--engine` accepts **auto** (default; prefers PyMuPDF when installed, falls back to PyPDF2), **pypdf2**, or **pymupdf**.
99
99
  - `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
100
100
  - `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
101
101
  - `--profile` selects tuned role logic:
102
102
  - `hipaa` → patient / representative / attorney
103
103
  - `retainer` → client / firm (prefers detecting two signatures)
104
104
  - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
105
- - `--crop-signatures` enables PNG crops for each detected widget (requires installing the optional `pymupdf` dependency). Use `--crop-dir` to override the destination and `--crop-dpi` to choose rendering quality.
105
+ - Cropping (`--crop-signatures`) and wet detection (`--detect-wet`) are enabled by default for single-pass runs; disable them if you want a light, e-sign-only pass. PyMuPDF is required for crops; PyMuPDF + Tesseract are required for wet detection.
106
106
  - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
107
107
 
108
108
  ### EDA (quick aggregate stats)
@@ -136,15 +136,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
136
136
  print(result.to_dict())
137
137
  ~~~
138
138
 
139
- `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image.
139
+ `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
140
140
 
141
141
  ---
142
142
 
143
143
  ## Library API (embed in another script)
144
144
 
145
- Minimal, plug-and-play API
146
- Import from `sigdetect.api` and get plain dicts out (JSON-ready),
147
- with no I/O side effects by default:
145
+ Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping:
148
146
 
149
147
  ~~~python
150
148
  from pathlib import Path
@@ -192,21 +190,14 @@ for res in ScanDirectory(
192
190
  # 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
193
191
  detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
194
192
  file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
195
- crops = CropSignatureImages(
193
+ CropSignatureImages(
196
194
  "/path/to/pdfs/example.pdf",
197
195
  file_result,
198
196
  outputDirectory="./signature_crops",
199
197
  dpi=200,
200
- returnBytes=True, # also returns in-memory PNG bytes for each crop
201
198
  )
202
-
203
- first_crop = crops[0]
204
- print(first_crop.path, len(first_crop.image_bytes))
205
199
  ~~~
206
200
 
207
- When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
208
- PNG bytes, and the originating signature metadata.
209
-
210
201
 
211
202
  ## Result schema
212
203
 
@@ -245,7 +236,7 @@ High-level summary (per file):
245
236
  "scores": { "page_label": 4, "general": 2 },
246
237
  "evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
247
238
  "hint": "VendorOrAcroOnly",
248
- "render_type": "unknown",
239
+ "render_type": "typed",
249
240
  "bounding_box": null,
250
241
  "crop_path": null
251
242
  }
@@ -290,6 +281,10 @@ profile: retainer # or: hipaa
290
281
  crop_signatures: false # enable to write PNG crops (requires pymupdf)
291
282
  # crop_output_dir: ./signature_crops
292
283
  crop_image_dpi: 200
284
+ detect_wet_signatures: false # opt-in OCR wet detection (PyMuPDF + Tesseract)
285
+ wet_ocr_dpi: 200
286
+ wet_ocr_languages: eng
287
+ wet_precision_threshold: 0.82
293
288
  ~~~
294
289
 
295
290
  YAML files can be customized or loaded at runtime (see CLI `--config`, if available, or import and pass patterns into the engine).
@@ -304,6 +299,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
304
299
  - Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
305
300
  - Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
306
301
  - When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
302
+ - **Wet detection (opt-in):** With `detect_wet_signatures: true`, the CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
307
303
 
308
304
  ---
309
305
 
@@ -0,0 +1,24 @@
1
+ sigdetect/__init__.py,sha256=YvnTwlC1jfq83EhQS_1JjiiHK7_wJCCU1JvHv5E1qWY,573
2
+ sigdetect/api.py,sha256=qLCpbODLvw5AQMEAvpIP6kBNoc03h01ekjilg9tDxuw,9408
3
+ sigdetect/cli.py,sha256=Zco3-r4MAlVEmyEatvPUOZLLamh5ELFrquAK6ovJVlw,9290
4
+ sigdetect/config.py,sha256=-6GCUusdi0Ba-Rt6pwffB5MIz1ApPlBaXVKxpIppbKk,7678
5
+ sigdetect/cropping.py,sha256=zwOXzkG8tt1ZPUaDhJMHfonFEZtVNZZmZOzYQ_4nUAI,6074
6
+ sigdetect/eda.py,sha256=S92G1Gjmepri__D0n_V6foq0lQgH-RXI9anW8A58jfw,4681
7
+ sigdetect/logging_setup.py,sha256=LMF8ao_a-JwH0S522T6aYTFX3e8Ajjv_5ODS2YiBcHA,6404
8
+ sigdetect/utils.py,sha256=T9rubLf5T9JmjOHYMOba1j34fhOJaWocAXccnGTxRUE,5198
9
+ sigdetect/wet_detection.py,sha256=6ciFxMQS3f1nF502w4KLTksoYmjdudzTekh7McfWiIg,16464
10
+ sigdetect/data/role_rules.retainer.yml,sha256=IFdwKnDBXR2cTkdfrsZ6ku6CXD8S_dg5A3vKRKLW5h8,2532
11
+ sigdetect/data/role_rules.yml,sha256=HuLKsZR_A6sD9XvY4NHiY_VG3dS5ERNCBF9-Mxawomw,2751
12
+ sigdetect/data/vendor_patterns.yml,sha256=NRbZNQxcx_GuL6n1jAphBn6MM6ChCpeWGCsjbRx-PEo,384
13
+ sigdetect/detector/__init__.py,sha256=pUVFLwqj65cVO1qjsZy6NJ9BVY5xrJ6sQe-8LAb9O_A,2421
14
+ sigdetect/detector/base.py,sha256=L-iXWXqsTetDc4jRZo_wOdbNpKqOY20mX9FefrugdT0,263
15
+ sigdetect/detector/base_detector.py,sha256=GmAgUWO_fQgIfnihZSoyhR3wpnwZ-X3hS0Kuyz4G6Ys,608
16
+ sigdetect/detector/file_result_model.py,sha256=j2gTc9Sw3fJOHlexYsR_m5DiwHA8DzIzAMToESfvo4A,1767
17
+ sigdetect/detector/pymupdf_engine.py,sha256=SGtJOStKFdfsdBrscoe5zg9u2KGJ_JTRYZ25adL_7Lw,17390
18
+ sigdetect/detector/pypdf2_engine.py,sha256=kB8cIp_gMvCla0LIBi9sd19g0361Oc9TjCW_ZViUBJQ,47410
19
+ sigdetect/detector/signature_model.py,sha256=sdfQiOJzxnrg0WkGJxZCebA0wHqgzZnLI0gOv6ipSZA,1074
20
+ sigdetect-0.4.0.dist-info/METADATA,sha256=WA7OjyLtM3AH7OtdFRmliqBw0ucNlywoD2bykytlnPA,12475
21
+ sigdetect-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
22
+ sigdetect-0.4.0.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
23
+ sigdetect-0.4.0.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
24
+ sigdetect-0.4.0.dist-info/RECORD,,
@@ -1,23 +0,0 @@
1
- sigdetect/__init__.py,sha256=LhY78mDZ1ClYVNTxW_qtE-vqJoN9N7N5ZcNRDUI_3ss,575
2
- sigdetect/api.py,sha256=jIUaq6nslDdluNlRoDSdaX3Dx1lkIIZmIJPHn8Nk2Ko,9192
3
- sigdetect/cli.py,sha256=NctAnaB-TQrUAT9m-v8kj2_KTNs88kbFOCiX32tHZm8,7920
4
- sigdetect/config.py,sha256=S0NVKuJYiHJCocL-VNFdGJpasFcjTecavC4EthyS1DQ,5951
5
- sigdetect/cropping.py,sha256=IyIcQAPH3z58tS6yeplglMDNu9F-iyQtpYQ1Ya2X_8o,5602
6
- sigdetect/eda.py,sha256=S92G1Gjmepri__D0n_V6foq0lQgH-RXI9anW8A58jfw,4681
7
- sigdetect/logging_setup.py,sha256=LMF8ao_a-JwH0S522T6aYTFX3e8Ajjv_5ODS2YiBcHA,6404
8
- sigdetect/utils.py,sha256=T9rubLf5T9JmjOHYMOba1j34fhOJaWocAXccnGTxRUE,5198
9
- sigdetect/data/role_rules.retainer.yml,sha256=IFdwKnDBXR2cTkdfrsZ6ku6CXD8S_dg5A3vKRKLW5h8,2532
10
- sigdetect/data/role_rules.yml,sha256=HuLKsZR_A6sD9XvY4NHiY_VG3dS5ERNCBF9-Mxawomw,2751
11
- sigdetect/data/vendor_patterns.yml,sha256=NRbZNQxcx_GuL6n1jAphBn6MM6ChCpeWGCsjbRx-PEo,384
12
- sigdetect/detector/__init__.py,sha256=up2FCmD09f2bRHcS4WbY-clx3GQbWuk1PM2JlxgusHg,1608
13
- sigdetect/detector/base.py,sha256=L-iXWXqsTetDc4jRZo_wOdbNpKqOY20mX9FefrugdT0,263
14
- sigdetect/detector/base_detector.py,sha256=GmAgUWO_fQgIfnihZSoyhR3wpnwZ-X3hS0Kuyz4G6Ys,608
15
- sigdetect/detector/file_result_model.py,sha256=j2gTc9Sw3fJOHlexYsR_m5DiwHA8DzIzAMToESfvo4A,1767
16
- sigdetect/detector/pymupdf_engine.py,sha256=iyp7JuPlUnydwohH5zbNg4MwH44mBmxbBWOS3ZmArBo,17339
17
- sigdetect/detector/pypdf2_engine.py,sha256=INWQH06kMLvto2VS-EdLC-EtMC6AG7JmdVYmNgx6_RU,47313
18
- sigdetect/detector/signature_model.py,sha256=mztb9V5wgv2oohQ5Cxzcv8_Bo6TyWAVIXteaeQ2rywQ,1076
19
- sigdetect-0.3.0.dist-info/METADATA,sha256=i7rSqbNbViLWyNJFO5si0eghcM01mBdkLrFsVND7xZw,12171
20
- sigdetect-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
21
- sigdetect-0.3.0.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
22
- sigdetect-0.3.0.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
23
- sigdetect-0.3.0.dist-info/RECORD,,