sigdetect 0.3.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. {sigdetect-0.3.0 → sigdetect-0.4.0}/PKG-INFO +12 -16
  2. {sigdetect-0.3.0 → sigdetect-0.4.0}/README.md +11 -15
  3. {sigdetect-0.3.0 → sigdetect-0.4.0}/pyproject.toml +1 -1
  4. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect/__init__.py +1 -1
  5. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect/api.py +13 -6
  6. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect/cli.py +37 -0
  7. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect/config.py +43 -3
  8. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect/cropping.py +22 -9
  9. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect/detector/__init__.py +18 -1
  10. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect/detector/pymupdf_engine.py +1 -0
  11. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect/detector/pypdf2_engine.py +7 -5
  12. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect/detector/signature_model.py +1 -1
  13. sigdetect-0.4.0/src/sigdetect/wet_detection.py +499 -0
  14. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect.egg-info/PKG-INFO +12 -16
  15. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect.egg-info/SOURCES.txt +4 -0
  16. sigdetect-0.4.0/tests/test_cli.py +148 -0
  17. {sigdetect-0.3.0 → sigdetect-0.4.0}/tests/test_cropping.py +41 -0
  18. sigdetect-0.4.0/tests/test_detector_options.py +82 -0
  19. sigdetect-0.4.0/tests/test_wet_detection.py +111 -0
  20. {sigdetect-0.3.0 → sigdetect-0.4.0}/setup.cfg +0 -0
  21. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect/data/role_rules.retainer.yml +0 -0
  22. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect/data/role_rules.yml +0 -0
  23. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect/data/vendor_patterns.yml +0 -0
  24. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect/detector/base.py +0 -0
  25. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect/detector/base_detector.py +0 -0
  26. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect/detector/file_result_model.py +0 -0
  27. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect/eda.py +0 -0
  28. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect/logging_setup.py +0 -0
  29. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect/utils.py +0 -0
  30. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect.egg-info/dependency_links.txt +0 -0
  31. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect.egg-info/entry_points.txt +0 -0
  32. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect.egg-info/requires.txt +0 -0
  33. {sigdetect-0.3.0 → sigdetect-0.4.0}/src/sigdetect.egg-info/top_level.txt +0 -0
  34. {sigdetect-0.3.0 → sigdetect-0.4.0}/tests/test_api.py +0 -0
  35. {sigdetect-0.3.0 → sigdetect-0.4.0}/tests/test_pymupdf_engine.py +0 -0
  36. {sigdetect-0.3.0 → sigdetect-0.4.0}/tests/test_widget_role_patient_smoke.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sigdetect
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: Signature detection and role attribution for PDFs
5
5
  Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
6
6
  License: MIT
@@ -95,14 +95,14 @@ sigdetect detect \
95
95
  ### Notes
96
96
 
97
97
  - The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
98
- - `--engine` supports **pypdf2** (default); a **pymupdf** engine placeholder exists and may be included in a future build.
98
+ - `--engine` accepts **auto** (default; prefers PyMuPDF when installed, falls back to PyPDF2), **pypdf2**, or **pymupdf**.
99
99
  - `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
100
100
  - `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
101
101
  - `--profile` selects tuned role logic:
102
102
  - `hipaa` → patient / representative / attorney
103
103
  - `retainer` → client / firm (prefers detecting two signatures)
104
104
  - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
105
- - `--crop-signatures` enables PNG crops for each detected widget (requires installing the optional `pymupdf` dependency). Use `--crop-dir` to override the destination and `--crop-dpi` to choose rendering quality.
105
+ - Cropping (`--crop-signatures`) and wet detection (`--detect-wet`) are enabled by default for single-pass runs; disable them if you want a light, e-sign-only pass. PyMuPDF is required for crops; PyMuPDF + Tesseract are required for wet detection.
106
106
  - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
107
107
 
108
108
  ### EDA (quick aggregate stats)
@@ -136,15 +136,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
136
136
  print(result.to_dict())
137
137
  ~~~
138
138
 
139
- `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image.
139
+ `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
140
140
 
141
141
  ---
142
142
 
143
143
  ## Library API (embed in another script)
144
144
 
145
- Minimal, plug-and-play API
146
- Import from `sigdetect.api` and get plain dicts out (JSON-ready),
147
- with no I/O side effects by default:
145
+ Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping:
148
146
 
149
147
  ~~~python
150
148
  from pathlib import Path
@@ -192,21 +190,14 @@ for res in ScanDirectory(
192
190
  # 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
193
191
  detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
194
192
  file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
195
- crops = CropSignatureImages(
193
+ CropSignatureImages(
196
194
  "/path/to/pdfs/example.pdf",
197
195
  file_result,
198
196
  outputDirectory="./signature_crops",
199
197
  dpi=200,
200
- returnBytes=True, # also returns in-memory PNG bytes for each crop
201
198
  )
202
-
203
- first_crop = crops[0]
204
- print(first_crop.path, len(first_crop.image_bytes))
205
199
  ~~~
206
200
 
207
- When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
208
- PNG bytes, and the originating signature metadata.
209
-
210
201
 
211
202
  ## Result schema
212
203
 
@@ -245,7 +236,7 @@ High-level summary (per file):
245
236
  "scores": { "page_label": 4, "general": 2 },
246
237
  "evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
247
238
  "hint": "VendorOrAcroOnly",
248
- "render_type": "unknown",
239
+ "render_type": "typed",
249
240
  "bounding_box": null,
250
241
  "crop_path": null
251
242
  }
@@ -290,6 +281,10 @@ profile: retainer # or: hipaa
290
281
  crop_signatures: false # enable to write PNG crops (requires pymupdf)
291
282
  # crop_output_dir: ./signature_crops
292
283
  crop_image_dpi: 200
284
+ detect_wet_signatures: false # on by default; set false to skip OCR wet detection (PyMuPDF + Tesseract)
285
+ wet_ocr_dpi: 200
286
+ wet_ocr_languages: eng
287
+ wet_precision_threshold: 0.82
293
288
  ~~~
294
289
 
295
290
  YAML files can be customized or loaded at runtime (see CLI `--config`, if available, or import and pass patterns into engine).
@@ -304,6 +299,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
304
299
  - Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
305
300
  - Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
306
301
  - When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
302
+ - **Wet detection (enabled by default):** Unless `detect_wet_signatures: false` is set (or `--no-detect-wet` is passed), the CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
307
303
 
308
304
  ---
309
305
 
@@ -79,14 +79,14 @@ sigdetect detect \
79
79
  ### Notes
80
80
 
81
81
  - The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
82
- - `--engine` supports **pypdf2** (default); a **pymupdf** engine placeholder exists and may be included in a future build.
82
+ - `--engine` accepts **auto** (default; prefers PyMuPDF when installed, falls back to PyPDF2), **pypdf2**, or **pymupdf**.
83
83
  - `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
84
84
  - `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
85
85
  - `--profile` selects tuned role logic:
86
86
  - `hipaa` → patient / representative / attorney
87
87
  - `retainer` → client / firm (prefers detecting two signatures)
88
88
  - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
89
- - `--crop-signatures` enables PNG crops for each detected widget (requires installing the optional `pymupdf` dependency). Use `--crop-dir` to override the destination and `--crop-dpi` to choose rendering quality.
89
+ - Cropping (`--crop-signatures`) and wet detection (`--detect-wet`) are enabled by default for single-pass runs; disable them if you want a light, e-sign-only pass. PyMuPDF is required for crops; PyMuPDF + Tesseract are required for wet detection.
90
90
  - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
91
91
 
92
92
  ### EDA (quick aggregate stats)
@@ -120,15 +120,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
120
120
  print(result.to_dict())
121
121
  ~~~
122
122
 
123
- `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image.
123
+ `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
124
124
 
125
125
  ---
126
126
 
127
127
  ## Library API (embed in another script)
128
128
 
129
- Minimal, plug-and-play API
130
- Import from `sigdetect.api` and get plain dicts out (JSON-ready),
131
- with no I/O side effects by default:
129
+ Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping:
132
130
 
133
131
  ~~~python
134
132
  from pathlib import Path
@@ -176,21 +174,14 @@ for res in ScanDirectory(
176
174
  # 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
177
175
  detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
178
176
  file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
179
- crops = CropSignatureImages(
177
+ CropSignatureImages(
180
178
  "/path/to/pdfs/example.pdf",
181
179
  file_result,
182
180
  outputDirectory="./signature_crops",
183
181
  dpi=200,
184
- returnBytes=True, # also returns in-memory PNG bytes for each crop
185
182
  )
186
-
187
- first_crop = crops[0]
188
- print(first_crop.path, len(first_crop.image_bytes))
189
183
  ~~~
190
184
 
191
- When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
192
- PNG bytes, and the originating signature metadata.
193
-
194
185
 
195
186
  ## Result schema
196
187
 
@@ -229,7 +220,7 @@ High-level summary (per file):
229
220
  "scores": { "page_label": 4, "general": 2 },
230
221
  "evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
231
222
  "hint": "VendorOrAcroOnly",
232
- "render_type": "unknown",
223
+ "render_type": "typed",
233
224
  "bounding_box": null,
234
225
  "crop_path": null
235
226
  }
@@ -274,6 +265,10 @@ profile: retainer # or: hipaa
274
265
  crop_signatures: false # enable to write PNG crops (requires pymupdf)
275
266
  # crop_output_dir: ./signature_crops
276
267
  crop_image_dpi: 200
268
+ detect_wet_signatures: false # on by default; set false to skip OCR wet detection (PyMuPDF + Tesseract)
269
+ wet_ocr_dpi: 200
270
+ wet_ocr_languages: eng
271
+ wet_precision_threshold: 0.82
277
272
  ~~~
278
273
 
279
274
  YAML files can be customized or loaded at runtime (see CLI `--config`, if available, or import and pass patterns into engine).
@@ -288,6 +283,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
288
283
  - Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
289
284
  - Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
290
285
  - When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
286
+ - **Wet detection (enabled by default):** Unless `detect_wet_signatures: false` is set (or `--no-detect-wet` is passed), the CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
291
287
 
292
288
  ---
293
289
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sigdetect"
7
- version = "0.3.0"
7
+ version = "0.4.0"
8
8
  description = "Signature detection and role attribution for PDFs"
9
9
  readme = "README.md"
10
10
  authors = [{ name = "BT Asmamaw", email = "basmamaw@angeiongroup.com" }]
@@ -21,4 +21,4 @@ try:
21
21
  except PackageNotFoundError: # pragma: no cover
22
22
  __version__ = "0.0.0"
23
23
 
24
- DEFAULT_ENGINE = "pypdf2"
24
+ DEFAULT_ENGINE = "auto"
@@ -10,7 +10,7 @@ from sigdetect.config import DetectConfiguration
10
10
  from sigdetect.cropping import SignatureCrop
11
11
  from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
12
12
 
13
- EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
13
+ EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
14
14
  ProfileName = Literal["hipaa", "retainer"]
15
15
 
16
16
 
@@ -18,7 +18,7 @@ def DetectPdf(
18
18
  pdfPath: str | Path,
19
19
  *,
20
20
  profileName: ProfileName = "hipaa",
21
- engineName: EngineName = "pypdf2",
21
+ engineName: EngineName = "auto",
22
22
  includePseudoSignatures: bool = True,
23
23
  recurseXObjects: bool = True,
24
24
  detector: Detector | None = None,
@@ -43,7 +43,7 @@ def get_detector(
43
43
  *,
44
44
  pdfRoot: str | Path | None = None,
45
45
  profileName: ProfileName = "hipaa",
46
- engineName: EngineName = "pypdf2",
46
+ engineName: EngineName = "auto",
47
47
  includePseudoSignatures: bool = True,
48
48
  recurseXObjects: bool = True,
49
49
  outputDirectory: str | Path | None = None,
@@ -200,7 +200,9 @@ def CropSignatureImages(
200
200
  outputDirectory: str | Path,
201
201
  dpi: int = 200,
202
202
  returnBytes: Literal[False] = False,
203
- ) -> list[Path]: ...
203
+ saveToDisk: bool = True,
204
+ ) -> list[Path]:
205
+ ...
204
206
 
205
207
 
206
208
  @overload
@@ -211,7 +213,9 @@ def CropSignatureImages(
211
213
  outputDirectory: str | Path,
212
214
  dpi: int,
213
215
  returnBytes: Literal[True],
214
- ) -> list[SignatureCrop]: ...
216
+ saveToDisk: bool,
217
+ ) -> list[SignatureCrop]:
218
+ ...
215
219
 
216
220
 
217
221
  def CropSignatureImages(
@@ -221,12 +225,14 @@ def CropSignatureImages(
221
225
  outputDirectory: str | Path,
222
226
  dpi: int = 200,
223
227
  returnBytes: bool = False,
228
+ saveToDisk: bool = True,
224
229
  ) -> list[Path] | list[SignatureCrop]:
225
230
  """Crop detected signature regions to PNG files.
226
231
 
227
232
  Accepts either a :class:`FileResult` instance or the ``dict`` returned by
228
233
  :func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.
229
- Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop.
234
+ Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop. Set
235
+ ``saveToDisk=False`` to skip writing PNG files while still returning in-memory data.
230
236
  """
231
237
 
232
238
  from sigdetect.cropping import crop_signatures
@@ -238,6 +244,7 @@ def CropSignatureImages(
238
244
  output_dir=Path(outputDirectory),
239
245
  dpi=dpi,
240
246
  return_bytes=returnBytes,
247
+ save_files=saveToDisk,
241
248
  )
242
249
  if original_dict is not None:
243
250
  original_dict.clear()
@@ -15,6 +15,7 @@ from .cropping import SignatureCroppingUnavailable, crop_signatures
15
15
  from .detector import BuildDetector, FileResult
16
16
  from .eda import RunExploratoryAnalysis
17
17
  from .logging_setup import ConfigureLogging
18
+ from .wet_detection import apply_wet_detection
18
19
 
19
20
  Logger = ConfigureLogging()
20
21
 
@@ -72,6 +73,33 @@ def Detect(
72
73
  help="Rendering DPI for signature crops",
73
74
  show_default=False,
74
75
  ),
76
+ detectWetSignatures: bool | None = typer.Option(
77
+ None,
78
+ "--detect-wet/--no-detect-wet",
79
+ help="Run OCR-backed wet signature detection (requires PyMuPDF + Tesseract)",
80
+ show_default=False,
81
+ ),
82
+ wetOcrDpi: int | None = typer.Option(
83
+ None,
84
+ "--wet-ocr-dpi",
85
+ min=72,
86
+ max=600,
87
+ help="Rendering DPI for OCR pages (wet detection)",
88
+ show_default=False,
89
+ ),
90
+ wetOcrLanguages: str | None = typer.Option(
91
+ None,
92
+ "--wet-ocr-languages",
93
+ help="Tesseract language packs for OCR (e.g., 'eng' or 'eng+spa')",
94
+ ),
95
+ wetPrecisionThreshold: float | None = typer.Option(
96
+ None,
97
+ "--wet-precision-threshold",
98
+ min=0.0,
99
+ max=1.0,
100
+ help="Minimum wet-signature confidence (0-1) to accept a candidate",
101
+ show_default=False,
102
+ ),
75
103
  ) -> None:
76
104
  """Run detection for the configured directory and emit ``results.json``."""
77
105
 
@@ -89,6 +117,14 @@ def Detect(
89
117
  overrides["CropOutputDirectory"] = cropDirectory
90
118
  if cropDpi is not None:
91
119
  overrides["CropImageDpi"] = cropDpi
120
+ if detectWetSignatures is not None:
121
+ overrides["DetectWetSignatures"] = detectWetSignatures
122
+ if wetOcrDpi is not None:
123
+ overrides["WetOcrDpi"] = wetOcrDpi
124
+ if wetOcrLanguages is not None:
125
+ overrides["WetOcrLanguages"] = wetOcrLanguages
126
+ if wetPrecisionThreshold is not None:
127
+ overrides["WetPrecisionThreshold"] = wetPrecisionThreshold
92
128
  if overrides:
93
129
  configuration = configuration.model_copy(update=overrides)
94
130
  configuration = FinalizeConfiguration(configuration)
@@ -182,6 +218,7 @@ def Detect(
182
218
 
183
219
  def _process(pdf_path: Path) -> None:
184
220
  file_result = detector.Detect(pdf_path)
221
+ apply_wet_detection(pdf_path, configuration, file_result, logger=Logger)
185
222
  _append_result(file_result, pdf_path)
186
223
 
187
224
  try:
@@ -10,7 +10,7 @@ from typing import Literal
10
10
  import yaml
11
11
  from pydantic import BaseModel, ConfigDict, Field, field_validator
12
12
 
13
- EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
13
+ EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
14
14
  ProfileName = Literal["hipaa", "retainer"]
15
15
 
16
16
 
@@ -25,13 +25,19 @@ class DetectConfiguration(BaseModel):
25
25
 
26
26
  PdfRoot: Path = Field(default=Path("hipaa_results"), alias="pdf_root")
27
27
  OutputDirectory: Path | None = Field(default=Path("out"), alias="out_dir")
28
- Engine: EngineName = Field(default="pypdf2", alias="engine")
28
+ Engine: EngineName = Field(default="auto", alias="engine")
29
29
  Profile: ProfileName = Field(default="hipaa", alias="profile")
30
30
  PseudoSignatures: bool = Field(default=True, alias="pseudo_signatures")
31
31
  RecurseXObjects: bool = Field(default=True, alias="recurse_xobjects")
32
- CropSignatures: bool = Field(default=False, alias="crop_signatures")
32
+ CropSignatures: bool = Field(default=True, alias="crop_signatures")
33
33
  CropOutputDirectory: Path | None = Field(default=None, alias="crop_output_dir")
34
34
  CropImageDpi: int = Field(default=200, alias="crop_image_dpi", ge=72, le=600)
35
+ DetectWetSignatures: bool = Field(default=True, alias="detect_wet_signatures")
36
+ WetOcrDpi: int = Field(default=200, alias="wet_ocr_dpi", ge=72, le=600)
37
+ WetOcrLanguages: str = Field(default="eng", alias="wet_ocr_languages")
38
+ WetPrecisionThreshold: float = Field(
39
+ default=0.82, alias="wet_precision_threshold", ge=0.0, le=1.0
40
+ )
35
41
 
36
42
  @field_validator("PdfRoot", "OutputDirectory", "CropOutputDirectory", mode="before")
37
43
  @classmethod
@@ -85,6 +91,22 @@ class DetectConfiguration(BaseModel):
85
91
  def crop_image_dpi(self) -> int: # pragma: no cover - simple passthrough
86
92
  return self.CropImageDpi
87
93
 
94
+ @property
95
+ def detect_wet_signatures(self) -> bool: # pragma: no cover - simple passthrough
96
+ return self.DetectWetSignatures
97
+
98
+ @property
99
+ def wet_ocr_dpi(self) -> int: # pragma: no cover - simple passthrough
100
+ return self.WetOcrDpi
101
+
102
+ @property
103
+ def wet_ocr_languages(self) -> str: # pragma: no cover - simple passthrough
104
+ return self.WetOcrLanguages
105
+
106
+ @property
107
+ def wet_precision_threshold(self) -> float: # pragma: no cover - simple passthrough
108
+ return self.WetPrecisionThreshold
109
+
88
110
 
89
111
  def LoadConfiguration(path: Path | None) -> DetectConfiguration:
90
112
  """Load configuration from ``path`` while applying environment overrides.
@@ -108,6 +130,10 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
108
130
  env_crop = os.getenv("SIGDETECT_CROP_SIGNATURES")
109
131
  env_crop_dir = os.getenv("SIGDETECT_CROP_DIR")
110
132
  env_crop_dpi = os.getenv("SIGDETECT_CROP_DPI")
133
+ env_detect_wet = os.getenv("SIGDETECT_DETECT_WET")
134
+ env_wet_dpi = os.getenv("SIGDETECT_WET_OCR_DPI")
135
+ env_wet_lang = os.getenv("SIGDETECT_WET_LANGUAGES")
136
+ env_wet_precision = os.getenv("SIGDETECT_WET_PRECISION")
111
137
 
112
138
  raw_data: dict[str, object] = {}
113
139
  if path and Path(path).exists():
@@ -133,6 +159,20 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
133
159
  if env_crop_dpi:
134
160
  with suppress(ValueError):
135
161
  raw_data["crop_image_dpi"] = int(env_crop_dpi)
162
+ if env_detect_wet is not None:
163
+ lowered = env_detect_wet.lower()
164
+ if lowered in {"1", "true", "yes", "on"}:
165
+ raw_data["detect_wet_signatures"] = True
166
+ elif lowered in {"0", "false", "no", "off"}:
167
+ raw_data["detect_wet_signatures"] = False
168
+ if env_wet_dpi:
169
+ with suppress(ValueError):
170
+ raw_data["wet_ocr_dpi"] = int(env_wet_dpi)
171
+ if env_wet_lang:
172
+ raw_data["wet_ocr_languages"] = env_wet_lang
173
+ if env_wet_precision:
174
+ with suppress(ValueError):
175
+ raw_data["wet_precision_threshold"] = float(env_wet_precision)
136
176
 
137
177
  configuration = DetectConfiguration(**raw_data)
138
178
  return FinalizeConfiguration(configuration)
@@ -28,6 +28,7 @@ class SignatureCrop:
28
28
  path: Path
29
29
  image_bytes: bytes
30
30
  signature: Signature
31
+ saved_to_disk: bool = True
31
32
 
32
33
 
33
34
  @overload
@@ -39,7 +40,9 @@ def crop_signatures(
39
40
  dpi: int = 200,
40
41
  logger: logging.Logger | None = None,
41
42
  return_bytes: Literal[False] = False,
42
- ) -> list[Path]: ...
43
+ save_files: bool = True,
44
+ ) -> list[Path]:
45
+ ...
43
46
 
44
47
 
45
48
  @overload
@@ -50,8 +53,10 @@ def crop_signatures(
50
53
  output_dir: Path,
51
54
  dpi: int = 200,
52
55
  logger: logging.Logger | None = None,
53
- return_bytes: Literal[True] = True,
54
- ) -> list[SignatureCrop]: ...
56
+ return_bytes: Literal[True],
57
+ save_files: bool = True,
58
+ ) -> list[SignatureCrop]:
59
+ ...
55
60
 
56
61
 
57
62
  def crop_signatures(
@@ -62,27 +67,32 @@ def crop_signatures(
62
67
  dpi: int = 200,
63
68
  logger: logging.Logger | None = None,
64
69
  return_bytes: bool = False,
70
+ save_files: bool = True,
65
71
  ) -> list[Path] | list[SignatureCrop]:
66
72
  """Render each signature bounding box to a PNG image using PyMuPDF.
67
73
 
68
74
  Set ``return_bytes=True`` to collect in-memory PNG bytes for each crop while also writing
69
- the files to ``output_dir``.
75
+ the files to ``output_dir``. Set ``save_files=False`` to skip writing PNGs to disk.
70
76
  """
71
77
 
72
78
  if fitz is None: # pragma: no cover - exercised when dependency absent
73
79
  raise SignatureCroppingUnavailable(
74
80
  "PyMuPDF is required for PNG crops. Install 'pymupdf' or 'sigdetect[pymupdf]'."
75
81
  )
82
+ if not save_files and not return_bytes:
83
+ raise ValueError("At least one of save_files or return_bytes must be True")
76
84
 
77
85
  pdf_path = Path(pdf_path)
78
86
  output_dir = Path(output_dir)
79
- output_dir.mkdir(parents=True, exist_ok=True)
87
+ if save_files:
88
+ output_dir.mkdir(parents=True, exist_ok=True)
80
89
  generated_paths: list[Path] = []
81
90
  generated_crops: list[SignatureCrop] = []
82
91
 
83
92
  with fitz.open(pdf_path) as document: # type: ignore[attr-defined]
84
93
  per_document_dir = output_dir / pdf_path.stem
85
- per_document_dir.mkdir(parents=True, exist_ok=True)
94
+ if save_files:
95
+ per_document_dir.mkdir(parents=True, exist_ok=True)
86
96
  scale = dpi / 72.0
87
97
  matrix = fitz.Matrix(scale, scale)
88
98
 
@@ -113,7 +123,8 @@ def crop_signatures(
113
123
  try:
114
124
  image_bytes: bytes | None = None
115
125
  pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
116
- pixmap.save(destination)
126
+ if save_files:
127
+ pixmap.save(destination)
117
128
  if return_bytes:
118
129
  image_bytes = pixmap.tobytes("png")
119
130
  except Exception as exc: # pragma: no cover - defensive
@@ -129,8 +140,9 @@ def crop_signatures(
129
140
  )
130
141
  continue
131
142
 
132
- signature.CropPath = str(destination)
133
- generated_paths.append(destination)
143
+ if save_files:
144
+ signature.CropPath = str(destination)
145
+ generated_paths.append(destination)
134
146
  if return_bytes:
135
147
  if image_bytes is None: # pragma: no cover - defensive
136
148
  continue
@@ -139,6 +151,7 @@ def crop_signatures(
139
151
  path=destination,
140
152
  image_bytes=image_bytes,
141
153
  signature=signature,
154
+ saved_to_disk=save_files,
142
155
  )
143
156
  )
144
157
 
@@ -2,6 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import warnings
5
6
  from typing import TYPE_CHECKING, Type
6
7
 
7
8
  from .base_detector import Detector
@@ -37,7 +38,23 @@ def BuildDetector(configuration: DetectConfiguration) -> Detector:
37
38
  or getattr(configuration, "engine", None)
38
39
  or PyPDF2Detector.Name
39
40
  )
40
- normalized = engine_name.lower()
41
+ normalized = str(engine_name).lower()
42
+
43
+ if normalized == "auto":
44
+ detector_cls: Type[Detector] | None = None
45
+ if PyMuPDFDetector is not None:
46
+ detector_cls = ENGINE_REGISTRY.get(getattr(PyMuPDFDetector, "Name", "")) or PyMuPDFDetector
47
+ if detector_cls is None:
48
+ detector_cls = ENGINE_REGISTRY.get(PyPDF2Detector.Name) or ENGINE_REGISTRY.get("pypdf")
49
+ warnings.warn(
50
+ "Engine 'auto' falling back to 'pypdf2' because PyMuPDF is unavailable",
51
+ RuntimeWarning,
52
+ stacklevel=2,
53
+ )
54
+ if detector_cls is None:
55
+ available = ", ".join(sorted(ENGINE_REGISTRY)) or "<none>"
56
+ raise ValueError(f"No available detector engines. Available engines: {available}")
57
+ return detector_cls(configuration)
41
58
 
42
59
  detector_cls = ENGINE_REGISTRY.get(normalized)
43
60
  if detector_cls is None:
@@ -111,6 +111,7 @@ class PyMuPDFDetector(PyPDF2Detector):
111
111
  rect, exclusion, mode = rect_info
112
112
  padded = self._PadRect(rect, page.rect, signature.Role, exclusion, mode)
113
113
  signature.BoundingBox = self._RectToPdfTuple(padded, page.rect.height)
114
+ signature.RenderType = "drawn"
114
115
  if signature.Page is None:
115
116
  signature.Page = page_index + 1
116
117
  break
@@ -348,7 +348,7 @@ class PyPDF2Detector(Detector):
348
348
  return normalized.lower().startswith("im")
349
349
 
350
350
  def _ClassifyAppearance(self, widget: generic.DictionaryObject, page) -> str:
351
- """Classify the widget's appearance as drawn/typed/hybrid/unknown."""
351
+ """Classify the widget's appearance as drawn or typed."""
352
352
 
353
353
  ap_dict = AsDictionary(widget.get("/AP"))
354
354
  if not isinstance(ap_dict, generic.DictionaryObject):
@@ -356,7 +356,7 @@ class PyPDF2Detector(Detector):
356
356
  normal = ap_dict.get("/N")
357
357
  streams = self._ExtractAppearanceStreams(normal)
358
358
  if not streams:
359
- return "unknown"
359
+ return "typed"
360
360
 
361
361
  has_text = False
362
362
  has_vector = False
@@ -384,13 +384,11 @@ class PyPDF2Detector(Detector):
384
384
  has_image = True
385
385
  break
386
386
 
387
- if has_image and (has_text or has_vector):
388
- return "hybrid"
389
387
  if has_image:
390
388
  return "drawn"
391
389
  if has_text or has_vector:
392
390
  return "typed"
393
- return "unknown"
391
+ return "typed"
394
392
 
395
393
  # ---- file-wide stream scan (compressed or not)
396
394
  def _ScanFileStreamsForVendors(self, file_bytes: bytes) -> tuple[set[str], str]:
@@ -863,6 +861,7 @@ class PyPDF2Detector(Detector):
863
861
  Scores={r: sc},
864
862
  Evidence=ev + ["pseudo:true"],
865
863
  Hint="VendorOrAcroOnly",
864
+ RenderType="typed",
866
865
  )
867
866
  )
868
867
 
@@ -903,6 +902,7 @@ class PyPDF2Detector(Detector):
903
902
  Scores={role: score} if score > 0 else {},
904
903
  Evidence=ev + ["pseudo:true"],
905
904
  Hint="VendorOrAcroOnly",
905
+ RenderType="typed",
906
906
  )
907
907
  )
908
908
 
@@ -1055,6 +1055,7 @@ class PyPDF2Detector(Detector):
1055
1055
  Scores=scores,
1056
1056
  Evidence=evidence,
1057
1057
  Hint=f"AcroSig:{fname}" if fname else "AcroSig",
1058
+ RenderType="typed",
1058
1059
  )
1059
1060
  )
1060
1061
 
@@ -1120,6 +1121,7 @@ class PyPDF2Detector(Detector):
1120
1121
  Scores=dict(scores),
1121
1122
  Evidence=evidence + ["pseudo:true"],
1122
1123
  Hint="VendorOrAcroOnly",
1124
+ RenderType="typed",
1123
1125
  )
1124
1126
  )
1125
1127
 
@@ -17,7 +17,7 @@ class Signature:
17
17
  Scores: dict[str, int]
18
18
  Evidence: list[str]
19
19
  Hint: str
20
- RenderType: str = "unknown"
20
+ RenderType: str = "typed"
21
21
  BoundingBox: tuple[float, float, float, float] | None = None
22
22
  CropPath: str | None = None
23
23