sigdetect 0.3.1__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. {sigdetect-0.3.1 → sigdetect-0.4.0}/PKG-INFO +12 -18
  2. {sigdetect-0.3.1 → sigdetect-0.4.0}/README.md +11 -17
  3. {sigdetect-0.3.1 → sigdetect-0.4.0}/pyproject.toml +1 -1
  4. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/__init__.py +1 -1
  5. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/api.py +7 -5
  6. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/cli.py +37 -0
  7. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/config.py +43 -3
  8. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/cropping.py +7 -3
  9. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/detector/__init__.py +18 -1
  10. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/detector/pymupdf_engine.py +1 -0
  11. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/detector/pypdf2_engine.py +7 -5
  12. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/detector/signature_model.py +1 -1
  13. sigdetect-0.4.0/src/sigdetect/wet_detection.py +499 -0
  14. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect.egg-info/PKG-INFO +12 -18
  15. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect.egg-info/SOURCES.txt +4 -0
  16. sigdetect-0.4.0/tests/test_cli.py +148 -0
  17. sigdetect-0.4.0/tests/test_detector_options.py +82 -0
  18. sigdetect-0.4.0/tests/test_wet_detection.py +111 -0
  19. {sigdetect-0.3.1 → sigdetect-0.4.0}/setup.cfg +0 -0
  20. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/data/role_rules.retainer.yml +0 -0
  21. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/data/role_rules.yml +0 -0
  22. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/data/vendor_patterns.yml +0 -0
  23. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/detector/base.py +0 -0
  24. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/detector/base_detector.py +0 -0
  25. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/detector/file_result_model.py +0 -0
  26. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/eda.py +0 -0
  27. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/logging_setup.py +0 -0
  28. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/utils.py +0 -0
  29. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect.egg-info/dependency_links.txt +0 -0
  30. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect.egg-info/entry_points.txt +0 -0
  31. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect.egg-info/requires.txt +0 -0
  32. {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect.egg-info/top_level.txt +0 -0
  33. {sigdetect-0.3.1 → sigdetect-0.4.0}/tests/test_api.py +0 -0
  34. {sigdetect-0.3.1 → sigdetect-0.4.0}/tests/test_cropping.py +0 -0
  35. {sigdetect-0.3.1 → sigdetect-0.4.0}/tests/test_pymupdf_engine.py +0 -0
  36. {sigdetect-0.3.1 → sigdetect-0.4.0}/tests/test_widget_role_patient_smoke.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sigdetect
3
- Version: 0.3.1
3
+ Version: 0.4.0
4
4
  Summary: Signature detection and role attribution for PDFs
5
5
  Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
6
6
  License: MIT
@@ -95,14 +95,14 @@ sigdetect detect \
95
95
  ### Notes
96
96
 
97
97
  - The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
98
- - `--engine` supports **pypdf2** (default); a **pymupdf** engine placeholder exists and may be included in a future build.
98
+ - `--engine` accepts **auto** (default; prefers PyMuPDF when installed, falls back to PyPDF2), **pypdf2**, or **pymupdf**.
99
99
  - `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
100
100
  - `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
101
101
  - `--profile` selects tuned role logic:
102
102
  - `hipaa` → patient / representative / attorney
103
103
  - `retainer` → client / firm (prefers detecting two signatures)
104
104
  - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
105
- - `--crop-signatures` enables PNG crops for each detected widget (requires installing the optional `pymupdf` dependency). Use `--crop-dir` to override the destination and `--crop-dpi` to choose rendering quality.
105
+ - Cropping (`--crop-signatures`) and wet detection (`--detect-wet`) are enabled by default for single-pass runs; disable them if you want a light, e-sign-only pass. PyMuPDF is required for crops; PyMuPDF + Tesseract are required for wet detection.
106
106
  - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
107
107
 
108
108
  ### EDA (quick aggregate stats)
@@ -136,15 +136,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
136
136
  print(result.to_dict())
137
137
  ~~~
138
138
 
139
- `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image.
139
+ `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
140
140
 
141
141
  ---
142
142
 
143
143
  ## Library API (embed in another script)
144
144
 
145
- Minimal, plug-and-play API
146
- Import from `sigdetect.api` and get plain dicts out (JSON-ready),
147
- with no I/O side effects by default:
145
+ Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping:
148
146
 
149
147
  ~~~python
150
148
  from pathlib import Path
@@ -192,23 +190,14 @@ for res in ScanDirectory(
192
190
  # 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
193
191
  detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
194
192
  file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
195
- crops = CropSignatureImages(
193
+ CropSignatureImages(
196
194
  "/path/to/pdfs/example.pdf",
197
195
  file_result,
198
196
  outputDirectory="./signature_crops",
199
197
  dpi=200,
200
- returnBytes=True, # also returns in-memory PNG bytes for each crop
201
- # saveToDisk=False, # optional: skip writing PNGs to disk
202
198
  )
203
-
204
- first_crop = crops[0]
205
- print(first_crop.path, len(first_crop.image_bytes))
206
199
  ~~~
207
200
 
208
- When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
209
- PNG bytes, and the originating signature metadata.
210
- Pass ``saveToDisk=False`` if you only want in-memory PNG bytes (no files on disk or ``crop_path`` updates).
211
-
212
201
 
213
202
  ## Result schema
214
203
 
@@ -247,7 +236,7 @@ High-level summary (per file):
247
236
  "scores": { "page_label": 4, "general": 2 },
248
237
  "evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
249
238
  "hint": "VendorOrAcroOnly",
250
- "render_type": "unknown",
239
+ "render_type": "typed",
251
240
  "bounding_box": null,
252
241
  "crop_path": null
253
242
  }
@@ -292,6 +281,10 @@ profile: retainer # or: hipaa
292
281
  crop_signatures: false # enable to write PNG crops (requires pymupdf)
293
282
  # crop_output_dir: ./signature_crops
294
283
  crop_image_dpi: 200
284
+ detect_wet_signatures: false # disables OCR wet detection (enabled by default; requires PyMuPDF + Tesseract)
285
+ wet_ocr_dpi: 200
286
+ wet_ocr_languages: eng
287
+ wet_precision_threshold: 0.82
295
288
  ~~~
296
289
 
297
290
  YAML files can be customized or loaded at runtime (see CLI `--config`, if available, or import and pass patterns into the engine).
@@ -306,6 +299,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
306
299
  - Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
307
300
  - Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
308
301
  - When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
302
+ - **Wet detection (enabled by default):** With `detect_wet_signatures: true` (the default), the CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
309
303
 
310
304
  ---
311
305
 
@@ -79,14 +79,14 @@ sigdetect detect \
79
79
  ### Notes
80
80
 
81
81
  - The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
82
- - `--engine` supports **pypdf2** (default); a **pymupdf** engine placeholder exists and may be included in a future build.
82
+ - `--engine` accepts **auto** (default; prefers PyMuPDF when installed, falls back to PyPDF2), **pypdf2**, or **pymupdf**.
83
83
  - `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
84
84
  - `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
85
85
  - `--profile` selects tuned role logic:
86
86
  - `hipaa` → patient / representative / attorney
87
87
  - `retainer` → client / firm (prefers detecting two signatures)
88
88
  - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
89
- - `--crop-signatures` enables PNG crops for each detected widget (requires installing the optional `pymupdf` dependency). Use `--crop-dir` to override the destination and `--crop-dpi` to choose rendering quality.
89
+ - Cropping (`--crop-signatures`) and wet detection (`--detect-wet`) are enabled by default for single-pass runs; disable them if you want a light, e-sign-only pass. PyMuPDF is required for crops; PyMuPDF + Tesseract are required for wet detection.
90
90
  - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
91
91
 
92
92
  ### EDA (quick aggregate stats)
@@ -120,15 +120,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
120
120
  print(result.to_dict())
121
121
  ~~~
122
122
 
123
- `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image.
123
+ `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
124
124
 
125
125
  ---
126
126
 
127
127
  ## Library API (embed in another script)
128
128
 
129
- Minimal, plug-and-play API
130
- Import from `sigdetect.api` and get plain dicts out (JSON-ready),
131
- with no I/O side effects by default:
129
+ Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping:
132
130
 
133
131
  ~~~python
134
132
  from pathlib import Path
@@ -176,23 +174,14 @@ for res in ScanDirectory(
176
174
  # 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
177
175
  detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
178
176
  file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
179
- crops = CropSignatureImages(
177
+ CropSignatureImages(
180
178
  "/path/to/pdfs/example.pdf",
181
179
  file_result,
182
180
  outputDirectory="./signature_crops",
183
181
  dpi=200,
184
- returnBytes=True, # also returns in-memory PNG bytes for each crop
185
- # saveToDisk=False, # optional: skip writing PNGs to disk
186
182
  )
187
-
188
- first_crop = crops[0]
189
- print(first_crop.path, len(first_crop.image_bytes))
190
183
  ~~~
191
184
 
192
- When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
193
- PNG bytes, and the originating signature metadata.
194
- Pass ``saveToDisk=False`` if you only want in-memory PNG bytes (no files on disk or ``crop_path`` updates).
195
-
196
185
 
197
186
  ## Result schema
198
187
 
@@ -231,7 +220,7 @@ High-level summary (per file):
231
220
  "scores": { "page_label": 4, "general": 2 },
232
221
  "evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
233
222
  "hint": "VendorOrAcroOnly",
234
- "render_type": "unknown",
223
+ "render_type": "typed",
235
224
  "bounding_box": null,
236
225
  "crop_path": null
237
226
  }
@@ -276,6 +265,10 @@ profile: retainer # or: hipaa
276
265
  crop_signatures: false # enable to write PNG crops (requires pymupdf)
277
266
  # crop_output_dir: ./signature_crops
278
267
  crop_image_dpi: 200
268
+ detect_wet_signatures: false # disables OCR wet detection (enabled by default; requires PyMuPDF + Tesseract)
269
+ wet_ocr_dpi: 200
270
+ wet_ocr_languages: eng
271
+ wet_precision_threshold: 0.82
279
272
  ~~~
280
273
 
281
274
  YAML files can be customized or loaded at runtime (see CLI `--config`, if available, or import and pass patterns into the engine).
@@ -290,6 +283,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
290
283
  - Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
291
284
  - Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
292
285
  - When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
286
+ - **Wet detection (enabled by default):** With `detect_wet_signatures: true` (the default), the CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
293
287
 
294
288
  ---
295
289
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sigdetect"
7
- version = "0.3.1"
7
+ version = "0.4.0"
8
8
  description = "Signature detection and role attribution for PDFs"
9
9
  readme = "README.md"
10
10
  authors = [{ name = "BT Asmamaw", email = "basmamaw@angeiongroup.com" }]
@@ -21,4 +21,4 @@ try:
21
21
  except PackageNotFoundError: # pragma: no cover
22
22
  __version__ = "0.0.0"
23
23
 
24
- DEFAULT_ENGINE = "pypdf2"
24
+ DEFAULT_ENGINE = "auto"
@@ -10,7 +10,7 @@ from sigdetect.config import DetectConfiguration
10
10
  from sigdetect.cropping import SignatureCrop
11
11
  from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
12
12
 
13
- EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
13
+ EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
14
14
  ProfileName = Literal["hipaa", "retainer"]
15
15
 
16
16
 
@@ -18,7 +18,7 @@ def DetectPdf(
18
18
  pdfPath: str | Path,
19
19
  *,
20
20
  profileName: ProfileName = "hipaa",
21
- engineName: EngineName = "pypdf2",
21
+ engineName: EngineName = "auto",
22
22
  includePseudoSignatures: bool = True,
23
23
  recurseXObjects: bool = True,
24
24
  detector: Detector | None = None,
@@ -43,7 +43,7 @@ def get_detector(
43
43
  *,
44
44
  pdfRoot: str | Path | None = None,
45
45
  profileName: ProfileName = "hipaa",
46
- engineName: EngineName = "pypdf2",
46
+ engineName: EngineName = "auto",
47
47
  includePseudoSignatures: bool = True,
48
48
  recurseXObjects: bool = True,
49
49
  outputDirectory: str | Path | None = None,
@@ -201,7 +201,8 @@ def CropSignatureImages(
201
201
  dpi: int = 200,
202
202
  returnBytes: Literal[False] = False,
203
203
  saveToDisk: bool = True,
204
- ) -> list[Path]: ...
204
+ ) -> list[Path]:
205
+ ...
205
206
 
206
207
 
207
208
  @overload
@@ -213,7 +214,8 @@ def CropSignatureImages(
213
214
  dpi: int,
214
215
  returnBytes: Literal[True],
215
216
  saveToDisk: bool,
216
- ) -> list[SignatureCrop]: ...
217
+ ) -> list[SignatureCrop]:
218
+ ...
217
219
 
218
220
 
219
221
  def CropSignatureImages(
@@ -15,6 +15,7 @@ from .cropping import SignatureCroppingUnavailable, crop_signatures
15
15
  from .detector import BuildDetector, FileResult
16
16
  from .eda import RunExploratoryAnalysis
17
17
  from .logging_setup import ConfigureLogging
18
+ from .wet_detection import apply_wet_detection
18
19
 
19
20
  Logger = ConfigureLogging()
20
21
 
@@ -72,6 +73,33 @@ def Detect(
72
73
  help="Rendering DPI for signature crops",
73
74
  show_default=False,
74
75
  ),
76
+ detectWetSignatures: bool | None = typer.Option(
77
+ None,
78
+ "--detect-wet/--no-detect-wet",
79
+ help="Run OCR-backed wet signature detection (requires PyMuPDF + Tesseract)",
80
+ show_default=False,
81
+ ),
82
+ wetOcrDpi: int | None = typer.Option(
83
+ None,
84
+ "--wet-ocr-dpi",
85
+ min=72,
86
+ max=600,
87
+ help="Rendering DPI for OCR pages (wet detection)",
88
+ show_default=False,
89
+ ),
90
+ wetOcrLanguages: str | None = typer.Option(
91
+ None,
92
+ "--wet-ocr-languages",
93
+ help="Tesseract language packs for OCR (e.g., 'eng' or 'eng+spa')",
94
+ ),
95
+ wetPrecisionThreshold: float | None = typer.Option(
96
+ None,
97
+ "--wet-precision-threshold",
98
+ min=0.0,
99
+ max=1.0,
100
+ help="Minimum wet-signature confidence (0-1) to accept a candidate",
101
+ show_default=False,
102
+ ),
75
103
  ) -> None:
76
104
  """Run detection for the configured directory and emit ``results.json``."""
77
105
 
@@ -89,6 +117,14 @@ def Detect(
89
117
  overrides["CropOutputDirectory"] = cropDirectory
90
118
  if cropDpi is not None:
91
119
  overrides["CropImageDpi"] = cropDpi
120
+ if detectWetSignatures is not None:
121
+ overrides["DetectWetSignatures"] = detectWetSignatures
122
+ if wetOcrDpi is not None:
123
+ overrides["WetOcrDpi"] = wetOcrDpi
124
+ if wetOcrLanguages is not None:
125
+ overrides["WetOcrLanguages"] = wetOcrLanguages
126
+ if wetPrecisionThreshold is not None:
127
+ overrides["WetPrecisionThreshold"] = wetPrecisionThreshold
92
128
  if overrides:
93
129
  configuration = configuration.model_copy(update=overrides)
94
130
  configuration = FinalizeConfiguration(configuration)
@@ -182,6 +218,7 @@ def Detect(
182
218
 
183
219
  def _process(pdf_path: Path) -> None:
184
220
  file_result = detector.Detect(pdf_path)
221
+ apply_wet_detection(pdf_path, configuration, file_result, logger=Logger)
185
222
  _append_result(file_result, pdf_path)
186
223
 
187
224
  try:
@@ -10,7 +10,7 @@ from typing import Literal
10
10
  import yaml
11
11
  from pydantic import BaseModel, ConfigDict, Field, field_validator
12
12
 
13
- EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
13
+ EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
14
14
  ProfileName = Literal["hipaa", "retainer"]
15
15
 
16
16
 
@@ -25,13 +25,19 @@ class DetectConfiguration(BaseModel):
25
25
 
26
26
  PdfRoot: Path = Field(default=Path("hipaa_results"), alias="pdf_root")
27
27
  OutputDirectory: Path | None = Field(default=Path("out"), alias="out_dir")
28
- Engine: EngineName = Field(default="pypdf2", alias="engine")
28
+ Engine: EngineName = Field(default="auto", alias="engine")
29
29
  Profile: ProfileName = Field(default="hipaa", alias="profile")
30
30
  PseudoSignatures: bool = Field(default=True, alias="pseudo_signatures")
31
31
  RecurseXObjects: bool = Field(default=True, alias="recurse_xobjects")
32
- CropSignatures: bool = Field(default=False, alias="crop_signatures")
32
+ CropSignatures: bool = Field(default=True, alias="crop_signatures")
33
33
  CropOutputDirectory: Path | None = Field(default=None, alias="crop_output_dir")
34
34
  CropImageDpi: int = Field(default=200, alias="crop_image_dpi", ge=72, le=600)
35
+ DetectWetSignatures: bool = Field(default=True, alias="detect_wet_signatures")
36
+ WetOcrDpi: int = Field(default=200, alias="wet_ocr_dpi", ge=72, le=600)
37
+ WetOcrLanguages: str = Field(default="eng", alias="wet_ocr_languages")
38
+ WetPrecisionThreshold: float = Field(
39
+ default=0.82, alias="wet_precision_threshold", ge=0.0, le=1.0
40
+ )
35
41
 
36
42
  @field_validator("PdfRoot", "OutputDirectory", "CropOutputDirectory", mode="before")
37
43
  @classmethod
@@ -85,6 +91,22 @@ class DetectConfiguration(BaseModel):
85
91
  def crop_image_dpi(self) -> int: # pragma: no cover - simple passthrough
86
92
  return self.CropImageDpi
87
93
 
94
+ @property
95
+ def detect_wet_signatures(self) -> bool: # pragma: no cover - simple passthrough
96
+ return self.DetectWetSignatures
97
+
98
+ @property
99
+ def wet_ocr_dpi(self) -> int: # pragma: no cover - simple passthrough
100
+ return self.WetOcrDpi
101
+
102
+ @property
103
+ def wet_ocr_languages(self) -> str: # pragma: no cover - simple passthrough
104
+ return self.WetOcrLanguages
105
+
106
+ @property
107
+ def wet_precision_threshold(self) -> float: # pragma: no cover - simple passthrough
108
+ return self.WetPrecisionThreshold
109
+
88
110
 
89
111
  def LoadConfiguration(path: Path | None) -> DetectConfiguration:
90
112
  """Load configuration from ``path`` while applying environment overrides.
@@ -108,6 +130,10 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
108
130
  env_crop = os.getenv("SIGDETECT_CROP_SIGNATURES")
109
131
  env_crop_dir = os.getenv("SIGDETECT_CROP_DIR")
110
132
  env_crop_dpi = os.getenv("SIGDETECT_CROP_DPI")
133
+ env_detect_wet = os.getenv("SIGDETECT_DETECT_WET")
134
+ env_wet_dpi = os.getenv("SIGDETECT_WET_OCR_DPI")
135
+ env_wet_lang = os.getenv("SIGDETECT_WET_LANGUAGES")
136
+ env_wet_precision = os.getenv("SIGDETECT_WET_PRECISION")
111
137
 
112
138
  raw_data: dict[str, object] = {}
113
139
  if path and Path(path).exists():
@@ -133,6 +159,20 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
133
159
  if env_crop_dpi:
134
160
  with suppress(ValueError):
135
161
  raw_data["crop_image_dpi"] = int(env_crop_dpi)
162
+ if env_detect_wet is not None:
163
+ lowered = env_detect_wet.lower()
164
+ if lowered in {"1", "true", "yes", "on"}:
165
+ raw_data["detect_wet_signatures"] = True
166
+ elif lowered in {"0", "false", "no", "off"}:
167
+ raw_data["detect_wet_signatures"] = False
168
+ if env_wet_dpi:
169
+ with suppress(ValueError):
170
+ raw_data["wet_ocr_dpi"] = int(env_wet_dpi)
171
+ if env_wet_lang:
172
+ raw_data["wet_ocr_languages"] = env_wet_lang
173
+ if env_wet_precision:
174
+ with suppress(ValueError):
175
+ raw_data["wet_precision_threshold"] = float(env_wet_precision)
136
176
 
137
177
  configuration = DetectConfiguration(**raw_data)
138
178
  return FinalizeConfiguration(configuration)
@@ -40,7 +40,9 @@ def crop_signatures(
40
40
  dpi: int = 200,
41
41
  logger: logging.Logger | None = None,
42
42
  return_bytes: Literal[False] = False,
43
- ) -> list[Path]: ...
43
+ save_files: bool = True,
44
+ ) -> list[Path]:
45
+ ...
44
46
 
45
47
 
46
48
  @overload
@@ -51,8 +53,10 @@ def crop_signatures(
51
53
  output_dir: Path,
52
54
  dpi: int = 200,
53
55
  logger: logging.Logger | None = None,
54
- return_bytes: Literal[True] = True,
55
- ) -> list[SignatureCrop]: ...
56
+ return_bytes: Literal[True],
57
+ save_files: bool = True,
58
+ ) -> list[SignatureCrop]:
59
+ ...
56
60
 
57
61
 
58
62
  def crop_signatures(
@@ -2,6 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import warnings
5
6
  from typing import TYPE_CHECKING, Type
6
7
 
7
8
  from .base_detector import Detector
@@ -37,7 +38,23 @@ def BuildDetector(configuration: DetectConfiguration) -> Detector:
37
38
  or getattr(configuration, "engine", None)
38
39
  or PyPDF2Detector.Name
39
40
  )
40
- normalized = engine_name.lower()
41
+ normalized = str(engine_name).lower()
42
+
43
+ if normalized == "auto":
44
+ detector_cls: Type[Detector] | None = None
45
+ if PyMuPDFDetector is not None:
46
+ detector_cls = ENGINE_REGISTRY.get(getattr(PyMuPDFDetector, "Name", "")) or PyMuPDFDetector
47
+ if detector_cls is None:
48
+ detector_cls = ENGINE_REGISTRY.get(PyPDF2Detector.Name) or ENGINE_REGISTRY.get("pypdf")
49
+ warnings.warn(
50
+ "Engine 'auto' falling back to 'pypdf2' because PyMuPDF is unavailable",
51
+ RuntimeWarning,
52
+ stacklevel=2,
53
+ )
54
+ if detector_cls is None:
55
+ available = ", ".join(sorted(ENGINE_REGISTRY)) or "<none>"
56
+ raise ValueError(f"No available detector engines. Available engines: {available}")
57
+ return detector_cls(configuration)
41
58
 
42
59
  detector_cls = ENGINE_REGISTRY.get(normalized)
43
60
  if detector_cls is None:
@@ -111,6 +111,7 @@ class PyMuPDFDetector(PyPDF2Detector):
111
111
  rect, exclusion, mode = rect_info
112
112
  padded = self._PadRect(rect, page.rect, signature.Role, exclusion, mode)
113
113
  signature.BoundingBox = self._RectToPdfTuple(padded, page.rect.height)
114
+ signature.RenderType = "drawn"
114
115
  if signature.Page is None:
115
116
  signature.Page = page_index + 1
116
117
  break
@@ -348,7 +348,7 @@ class PyPDF2Detector(Detector):
348
348
  return normalized.lower().startswith("im")
349
349
 
350
350
  def _ClassifyAppearance(self, widget: generic.DictionaryObject, page) -> str:
351
- """Classify the widget's appearance as drawn/typed/hybrid/unknown."""
351
+ """Classify the widget's appearance as drawn or typed."""
352
352
 
353
353
  ap_dict = AsDictionary(widget.get("/AP"))
354
354
  if not isinstance(ap_dict, generic.DictionaryObject):
@@ -356,7 +356,7 @@ class PyPDF2Detector(Detector):
356
356
  normal = ap_dict.get("/N")
357
357
  streams = self._ExtractAppearanceStreams(normal)
358
358
  if not streams:
359
- return "unknown"
359
+ return "typed"
360
360
 
361
361
  has_text = False
362
362
  has_vector = False
@@ -384,13 +384,11 @@ class PyPDF2Detector(Detector):
384
384
  has_image = True
385
385
  break
386
386
 
387
- if has_image and (has_text or has_vector):
388
- return "hybrid"
389
387
  if has_image:
390
388
  return "drawn"
391
389
  if has_text or has_vector:
392
390
  return "typed"
393
- return "unknown"
391
+ return "typed"
394
392
 
395
393
  # ---- file-wide stream scan (compressed or not)
396
394
  def _ScanFileStreamsForVendors(self, file_bytes: bytes) -> tuple[set[str], str]:
@@ -863,6 +861,7 @@ class PyPDF2Detector(Detector):
863
861
  Scores={r: sc},
864
862
  Evidence=ev + ["pseudo:true"],
865
863
  Hint="VendorOrAcroOnly",
864
+ RenderType="typed",
866
865
  )
867
866
  )
868
867
 
@@ -903,6 +902,7 @@ class PyPDF2Detector(Detector):
903
902
  Scores={role: score} if score > 0 else {},
904
903
  Evidence=ev + ["pseudo:true"],
905
904
  Hint="VendorOrAcroOnly",
905
+ RenderType="typed",
906
906
  )
907
907
  )
908
908
 
@@ -1055,6 +1055,7 @@ class PyPDF2Detector(Detector):
1055
1055
  Scores=scores,
1056
1056
  Evidence=evidence,
1057
1057
  Hint=f"AcroSig:{fname}" if fname else "AcroSig",
1058
+ RenderType="typed",
1058
1059
  )
1059
1060
  )
1060
1061
 
@@ -1120,6 +1121,7 @@ class PyPDF2Detector(Detector):
1120
1121
  Scores=dict(scores),
1121
1122
  Evidence=evidence + ["pseudo:true"],
1122
1123
  Hint="VendorOrAcroOnly",
1124
+ RenderType="typed",
1123
1125
  )
1124
1126
  )
1125
1127
 
@@ -17,7 +17,7 @@ class Signature:
17
17
  Scores: dict[str, int]
18
18
  Evidence: list[str]
19
19
  Hint: str
20
- RenderType: str = "unknown"
20
+ RenderType: str = "typed"
21
21
  BoundingBox: tuple[float, float, float, float] | None = None
22
22
  CropPath: str | None = None
23
23