sigdetect 0.3.1__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sigdetect-0.3.1 → sigdetect-0.4.0}/PKG-INFO +12 -18
- {sigdetect-0.3.1 → sigdetect-0.4.0}/README.md +11 -17
- {sigdetect-0.3.1 → sigdetect-0.4.0}/pyproject.toml +1 -1
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/__init__.py +1 -1
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/api.py +7 -5
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/cli.py +37 -0
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/config.py +43 -3
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/cropping.py +7 -3
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/detector/__init__.py +18 -1
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/detector/pymupdf_engine.py +1 -0
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/detector/pypdf2_engine.py +7 -5
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/detector/signature_model.py +1 -1
- sigdetect-0.4.0/src/sigdetect/wet_detection.py +499 -0
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect.egg-info/PKG-INFO +12 -18
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect.egg-info/SOURCES.txt +4 -0
- sigdetect-0.4.0/tests/test_cli.py +148 -0
- sigdetect-0.4.0/tests/test_detector_options.py +82 -0
- sigdetect-0.4.0/tests/test_wet_detection.py +111 -0
- {sigdetect-0.3.1 → sigdetect-0.4.0}/setup.cfg +0 -0
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/data/role_rules.retainer.yml +0 -0
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/data/role_rules.yml +0 -0
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/data/vendor_patterns.yml +0 -0
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/detector/base.py +0 -0
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/detector/base_detector.py +0 -0
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/detector/file_result_model.py +0 -0
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/eda.py +0 -0
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/logging_setup.py +0 -0
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect/utils.py +0 -0
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect.egg-info/dependency_links.txt +0 -0
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect.egg-info/entry_points.txt +0 -0
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect.egg-info/requires.txt +0 -0
- {sigdetect-0.3.1 → sigdetect-0.4.0}/src/sigdetect.egg-info/top_level.txt +0 -0
- {sigdetect-0.3.1 → sigdetect-0.4.0}/tests/test_api.py +0 -0
- {sigdetect-0.3.1 → sigdetect-0.4.0}/tests/test_cropping.py +0 -0
- {sigdetect-0.3.1 → sigdetect-0.4.0}/tests/test_pymupdf_engine.py +0 -0
- {sigdetect-0.3.1 → sigdetect-0.4.0}/tests/test_widget_role_patient_smoke.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sigdetect
|
|
3
|
-
Version: 0.3.1
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Signature detection and role attribution for PDFs
|
|
5
5
|
Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
|
|
6
6
|
License: MIT
|
|
@@ -95,14 +95,14 @@ sigdetect detect \
|
|
|
95
95
|
### Notes
|
|
96
96
|
|
|
97
97
|
- The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
|
|
98
|
-
- `--engine`
|
|
98
|
+
- `--engine` accepts **auto** (default; prefers PyMuPDF when installed, falls back to PyPDF2), **pypdf2**, or **pymupdf**.
|
|
99
99
|
- `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
|
|
100
100
|
- `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
|
|
101
101
|
- `--profile` selects tuned role logic:
|
|
102
102
|
- `hipaa` → patient / representative / attorney
|
|
103
103
|
- `retainer` → client / firm (prefers detecting two signatures)
|
|
104
104
|
- `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
|
|
105
|
-
- `--crop-signatures`
|
|
105
|
+
- Cropping (`--crop-signatures`) and wet detection (`--detect-wet`) are enabled by default for single-pass runs; disable them if you want a light, e-sign-only pass. PyMuPDF is required for crops; PyMuPDF + Tesseract are required for wet detection.
|
|
106
106
|
- If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
|
|
107
107
|
|
|
108
108
|
### EDA (quick aggregate stats)
|
|
@@ -136,15 +136,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
|
136
136
|
print(result.to_dict())
|
|
137
137
|
~~~
|
|
138
138
|
|
|
139
|
-
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image.
|
|
139
|
+
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
|
|
140
140
|
|
|
141
141
|
---
|
|
142
142
|
|
|
143
143
|
## Library API (embed in another script)
|
|
144
144
|
|
|
145
|
-
Minimal, plug-and-play API
|
|
146
|
-
Import from `sigdetect.api` and get plain dicts out (JSON-ready),
|
|
147
|
-
with no I/O side effects by default:
|
|
145
|
+
Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping:
|
|
148
146
|
|
|
149
147
|
~~~python
|
|
150
148
|
from pathlib import Path
|
|
@@ -192,23 +190,14 @@ for res in ScanDirectory(
|
|
|
192
190
|
# 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
|
|
193
191
|
detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
|
|
194
192
|
file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
195
|
-
|
|
193
|
+
CropSignatureImages(
|
|
196
194
|
"/path/to/pdfs/example.pdf",
|
|
197
195
|
file_result,
|
|
198
196
|
outputDirectory="./signature_crops",
|
|
199
197
|
dpi=200,
|
|
200
|
-
returnBytes=True, # also returns in-memory PNG bytes for each crop
|
|
201
|
-
# saveToDisk=False, # optional: skip writing PNGs to disk
|
|
202
198
|
)
|
|
203
|
-
|
|
204
|
-
first_crop = crops[0]
|
|
205
|
-
print(first_crop.path, len(first_crop.image_bytes))
|
|
206
199
|
~~~
|
|
207
200
|
|
|
208
|
-
When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
|
|
209
|
-
PNG bytes, and the originating signature metadata.
|
|
210
|
-
Pass ``saveToDisk=False`` if you only want in-memory PNG bytes (no files on disk or ``crop_path`` updates).
|
|
211
|
-
|
|
212
201
|
|
|
213
202
|
## Result schema
|
|
214
203
|
|
|
@@ -247,7 +236,7 @@ High-level summary (per file):
|
|
|
247
236
|
"scores": { "page_label": 4, "general": 2 },
|
|
248
237
|
"evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
|
|
249
238
|
"hint": "VendorOrAcroOnly",
|
|
250
|
-
"render_type": "
|
|
239
|
+
"render_type": "typed",
|
|
251
240
|
"bounding_box": null,
|
|
252
241
|
"crop_path": null
|
|
253
242
|
}
|
|
@@ -292,6 +281,10 @@ profile: retainer # or: hipaa
|
|
|
292
281
|
crop_signatures: false # enable to write PNG crops (requires pymupdf)
|
|
293
282
|
# crop_output_dir: ./signature_crops
|
|
294
283
|
crop_image_dpi: 200
|
|
284
|
+
detect_wet_signatures: false # opt-in OCR wet detection (PyMuPDF + Tesseract)
|
|
285
|
+
wet_ocr_dpi: 200
|
|
286
|
+
wet_ocr_languages: eng
|
|
287
|
+
wet_precision_threshold: 0.82
|
|
295
288
|
~~~
|
|
296
289
|
|
|
297
290
|
YAML files can be customized or load at runtime (see CLI `--config`, if available, or import and pass patterns into engine).
|
|
@@ -306,6 +299,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
|
|
|
306
299
|
- Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
|
|
307
300
|
- Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
|
|
308
301
|
- When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
|
|
302
|
+
- **Wet detection (opt-in):** With `detect_wet_signatures: true`, the CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
|
|
309
303
|
|
|
310
304
|
---
|
|
311
305
|
|
|
@@ -79,14 +79,14 @@ sigdetect detect \
|
|
|
79
79
|
### Notes
|
|
80
80
|
|
|
81
81
|
- The config file controls `pdf_root`, `out_dir`, `engine`, `pseudo_signatures`, `recurse_xobjects`, etc.
|
|
82
|
-
- `--engine`
|
|
82
|
+
- `--engine` accepts **auto** (default; prefers PyMuPDF when installed, falls back to PyPDF2), **pypdf2**, or **pymupdf**.
|
|
83
83
|
- `--pseudo-signatures` enables a vendor/Acro-only pseudo-signature when no actual `/Widget` is present (useful for DocuSign / Acrobat Sign receipts).
|
|
84
84
|
- `--recurse-xobjects` allows scanning Form XObjects for vendor markers and labels embedded in page resources.
|
|
85
85
|
- `--profile` selects tuned role logic:
|
|
86
86
|
- `hipaa` → patient / representative / attorney
|
|
87
87
|
- `retainer` → client / firm (prefers detecting two signatures)
|
|
88
88
|
- `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
|
|
89
|
-
- `--crop-signatures`
|
|
89
|
+
- Cropping (`--crop-signatures`) and wet detection (`--detect-wet`) are enabled by default for single-pass runs; disable them if you want a light, e-sign-only pass. PyMuPDF is required for crops; PyMuPDF + Tesseract are required for wet detection.
|
|
90
90
|
- If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
|
|
91
91
|
|
|
92
92
|
### EDA (quick aggregate stats)
|
|
@@ -120,15 +120,13 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
|
120
120
|
print(result.to_dict())
|
|
121
121
|
~~~
|
|
122
122
|
|
|
123
|
-
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image.
|
|
123
|
+
`Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
|
|
124
124
|
|
|
125
125
|
---
|
|
126
126
|
|
|
127
127
|
## Library API (embed in another script)
|
|
128
128
|
|
|
129
|
-
Minimal, plug-and-play API
|
|
130
|
-
Import from `sigdetect.api` and get plain dicts out (JSON-ready),
|
|
131
|
-
with no I/O side effects by default:
|
|
129
|
+
Minimal, plug-and-play API that returns plain dicts (JSON-ready) without side effects unless you opt into cropping:
|
|
132
130
|
|
|
133
131
|
~~~python
|
|
134
132
|
from pathlib import Path
|
|
@@ -176,23 +174,14 @@ for res in ScanDirectory(
|
|
|
176
174
|
# 3) Crop PNG snippets for FileResult objects (requires PyMuPDF)
|
|
177
175
|
detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
|
|
178
176
|
file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
|
|
179
|
-
|
|
177
|
+
CropSignatureImages(
|
|
180
178
|
"/path/to/pdfs/example.pdf",
|
|
181
179
|
file_result,
|
|
182
180
|
outputDirectory="./signature_crops",
|
|
183
181
|
dpi=200,
|
|
184
|
-
returnBytes=True, # also returns in-memory PNG bytes for each crop
|
|
185
|
-
# saveToDisk=False, # optional: skip writing PNGs to disk
|
|
186
182
|
)
|
|
187
|
-
|
|
188
|
-
first_crop = crops[0]
|
|
189
|
-
print(first_crop.path, len(first_crop.image_bytes))
|
|
190
183
|
~~~
|
|
191
184
|
|
|
192
|
-
When ``returnBytes=True`` the helper returns ``SignatureCrop`` objects containing the saved path,
|
|
193
|
-
PNG bytes, and the originating signature metadata.
|
|
194
|
-
Pass ``saveToDisk=False`` if you only want in-memory PNG bytes (no files on disk or ``crop_path`` updates).
|
|
195
|
-
|
|
196
185
|
|
|
197
186
|
## Result schema
|
|
198
187
|
|
|
@@ -231,7 +220,7 @@ High-level summary (per file):
|
|
|
231
220
|
"scores": { "page_label": 4, "general": 2 },
|
|
232
221
|
"evidence": ["page_label:representative(parent/guardian)", "pseudo:true"],
|
|
233
222
|
"hint": "VendorOrAcroOnly",
|
|
234
|
-
"render_type": "
|
|
223
|
+
"render_type": "typed",
|
|
235
224
|
"bounding_box": null,
|
|
236
225
|
"crop_path": null
|
|
237
226
|
}
|
|
@@ -276,6 +265,10 @@ profile: retainer # or: hipaa
|
|
|
276
265
|
crop_signatures: false # enable to write PNG crops (requires pymupdf)
|
|
277
266
|
# crop_output_dir: ./signature_crops
|
|
278
267
|
crop_image_dpi: 200
|
|
268
|
+
detect_wet_signatures: false # opt-in OCR wet detection (PyMuPDF + Tesseract)
|
|
269
|
+
wet_ocr_dpi: 200
|
|
270
|
+
wet_ocr_languages: eng
|
|
271
|
+
wet_precision_threshold: 0.82
|
|
279
272
|
~~~
|
|
280
273
|
|
|
281
274
|
YAML files can be customized or load at runtime (see CLI `--config`, if available, or import and pass patterns into engine).
|
|
@@ -290,6 +283,7 @@ YAML files can be customized or load at runtime (see CLI `--config`, if availabl
|
|
|
290
283
|
- Looks for client and firm labels/tokens; boosts pages with law-firm markers (LLP/LLC/PA/PC) and “By:” blocks.
|
|
291
284
|
- Applies an anti-front-matter rule to reduce page-1 false positives (e.g., letterheads, firm mastheads).
|
|
292
285
|
- When only vendor/Acro clues exist (no widgets), it will emit two pseudo signatures targeting likely pages.
|
|
286
|
+
- **Wet detection (opt-in):** With `detect_wet_signatures: true`, the CLI runs an OCR-backed pass (PyMuPDF + pytesseract/Tesseract) after e-sign detection. It emits `RenderType="wet"` signatures for high-confidence label/stroke pairs in the lower page region. Missing OCR dependencies add a `ManualReview:*` hint instead of failing.
|
|
293
287
|
|
|
294
288
|
---
|
|
295
289
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "sigdetect"
|
|
7
|
-
version = "0.3.1"
|
|
7
|
+
version = "0.4.0"
|
|
8
8
|
description = "Signature detection and role attribution for PDFs"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
authors = [{ name = "BT Asmamaw", email = "basmamaw@angeiongroup.com" }]
|
|
@@ -10,7 +10,7 @@ from sigdetect.config import DetectConfiguration
|
|
|
10
10
|
from sigdetect.cropping import SignatureCrop
|
|
11
11
|
from sigdetect.detector import BuildDetector, Detector, FileResult, Signature
|
|
12
12
|
|
|
13
|
-
EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
|
|
13
|
+
EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
|
|
14
14
|
ProfileName = Literal["hipaa", "retainer"]
|
|
15
15
|
|
|
16
16
|
|
|
@@ -18,7 +18,7 @@ def DetectPdf(
|
|
|
18
18
|
pdfPath: str | Path,
|
|
19
19
|
*,
|
|
20
20
|
profileName: ProfileName = "hipaa",
|
|
21
|
-
engineName: EngineName = "
|
|
21
|
+
engineName: EngineName = "auto",
|
|
22
22
|
includePseudoSignatures: bool = True,
|
|
23
23
|
recurseXObjects: bool = True,
|
|
24
24
|
detector: Detector | None = None,
|
|
@@ -43,7 +43,7 @@ def get_detector(
|
|
|
43
43
|
*,
|
|
44
44
|
pdfRoot: str | Path | None = None,
|
|
45
45
|
profileName: ProfileName = "hipaa",
|
|
46
|
-
engineName: EngineName = "
|
|
46
|
+
engineName: EngineName = "auto",
|
|
47
47
|
includePseudoSignatures: bool = True,
|
|
48
48
|
recurseXObjects: bool = True,
|
|
49
49
|
outputDirectory: str | Path | None = None,
|
|
@@ -201,7 +201,8 @@ def CropSignatureImages(
|
|
|
201
201
|
dpi: int = 200,
|
|
202
202
|
returnBytes: Literal[False] = False,
|
|
203
203
|
saveToDisk: bool = True,
|
|
204
|
-
) -> list[Path]:
|
|
204
|
+
) -> list[Path]:
|
|
205
|
+
...
|
|
205
206
|
|
|
206
207
|
|
|
207
208
|
@overload
|
|
@@ -213,7 +214,8 @@ def CropSignatureImages(
|
|
|
213
214
|
dpi: int,
|
|
214
215
|
returnBytes: Literal[True],
|
|
215
216
|
saveToDisk: bool,
|
|
216
|
-
) -> list[SignatureCrop]:
|
|
217
|
+
) -> list[SignatureCrop]:
|
|
218
|
+
...
|
|
217
219
|
|
|
218
220
|
|
|
219
221
|
def CropSignatureImages(
|
|
@@ -15,6 +15,7 @@ from .cropping import SignatureCroppingUnavailable, crop_signatures
|
|
|
15
15
|
from .detector import BuildDetector, FileResult
|
|
16
16
|
from .eda import RunExploratoryAnalysis
|
|
17
17
|
from .logging_setup import ConfigureLogging
|
|
18
|
+
from .wet_detection import apply_wet_detection
|
|
18
19
|
|
|
19
20
|
Logger = ConfigureLogging()
|
|
20
21
|
|
|
@@ -72,6 +73,33 @@ def Detect(
|
|
|
72
73
|
help="Rendering DPI for signature crops",
|
|
73
74
|
show_default=False,
|
|
74
75
|
),
|
|
76
|
+
detectWetSignatures: bool | None = typer.Option(
|
|
77
|
+
None,
|
|
78
|
+
"--detect-wet/--no-detect-wet",
|
|
79
|
+
help="Run OCR-backed wet signature detection (requires PyMuPDF + Tesseract)",
|
|
80
|
+
show_default=False,
|
|
81
|
+
),
|
|
82
|
+
wetOcrDpi: int | None = typer.Option(
|
|
83
|
+
None,
|
|
84
|
+
"--wet-ocr-dpi",
|
|
85
|
+
min=72,
|
|
86
|
+
max=600,
|
|
87
|
+
help="Rendering DPI for OCR pages (wet detection)",
|
|
88
|
+
show_default=False,
|
|
89
|
+
),
|
|
90
|
+
wetOcrLanguages: str | None = typer.Option(
|
|
91
|
+
None,
|
|
92
|
+
"--wet-ocr-languages",
|
|
93
|
+
help="Tesseract language packs for OCR (e.g., 'eng' or 'eng+spa')",
|
|
94
|
+
),
|
|
95
|
+
wetPrecisionThreshold: float | None = typer.Option(
|
|
96
|
+
None,
|
|
97
|
+
"--wet-precision-threshold",
|
|
98
|
+
min=0.0,
|
|
99
|
+
max=1.0,
|
|
100
|
+
help="Minimum wet-signature confidence (0-1) to accept a candidate",
|
|
101
|
+
show_default=False,
|
|
102
|
+
),
|
|
75
103
|
) -> None:
|
|
76
104
|
"""Run detection for the configured directory and emit ``results.json``."""
|
|
77
105
|
|
|
@@ -89,6 +117,14 @@ def Detect(
|
|
|
89
117
|
overrides["CropOutputDirectory"] = cropDirectory
|
|
90
118
|
if cropDpi is not None:
|
|
91
119
|
overrides["CropImageDpi"] = cropDpi
|
|
120
|
+
if detectWetSignatures is not None:
|
|
121
|
+
overrides["DetectWetSignatures"] = detectWetSignatures
|
|
122
|
+
if wetOcrDpi is not None:
|
|
123
|
+
overrides["WetOcrDpi"] = wetOcrDpi
|
|
124
|
+
if wetOcrLanguages is not None:
|
|
125
|
+
overrides["WetOcrLanguages"] = wetOcrLanguages
|
|
126
|
+
if wetPrecisionThreshold is not None:
|
|
127
|
+
overrides["WetPrecisionThreshold"] = wetPrecisionThreshold
|
|
92
128
|
if overrides:
|
|
93
129
|
configuration = configuration.model_copy(update=overrides)
|
|
94
130
|
configuration = FinalizeConfiguration(configuration)
|
|
@@ -182,6 +218,7 @@ def Detect(
|
|
|
182
218
|
|
|
183
219
|
def _process(pdf_path: Path) -> None:
|
|
184
220
|
file_result = detector.Detect(pdf_path)
|
|
221
|
+
apply_wet_detection(pdf_path, configuration, file_result, logger=Logger)
|
|
185
222
|
_append_result(file_result, pdf_path)
|
|
186
223
|
|
|
187
224
|
try:
|
|
@@ -10,7 +10,7 @@ from typing import Literal
|
|
|
10
10
|
import yaml
|
|
11
11
|
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
|
12
12
|
|
|
13
|
-
EngineName = Literal["pypdf2", "pypdf", "pymupdf"]
|
|
13
|
+
EngineName = Literal["pypdf2", "pypdf", "pymupdf", "auto"]
|
|
14
14
|
ProfileName = Literal["hipaa", "retainer"]
|
|
15
15
|
|
|
16
16
|
|
|
@@ -25,13 +25,19 @@ class DetectConfiguration(BaseModel):
|
|
|
25
25
|
|
|
26
26
|
PdfRoot: Path = Field(default=Path("hipaa_results"), alias="pdf_root")
|
|
27
27
|
OutputDirectory: Path | None = Field(default=Path("out"), alias="out_dir")
|
|
28
|
-
Engine: EngineName = Field(default="
|
|
28
|
+
Engine: EngineName = Field(default="auto", alias="engine")
|
|
29
29
|
Profile: ProfileName = Field(default="hipaa", alias="profile")
|
|
30
30
|
PseudoSignatures: bool = Field(default=True, alias="pseudo_signatures")
|
|
31
31
|
RecurseXObjects: bool = Field(default=True, alias="recurse_xobjects")
|
|
32
|
-
CropSignatures: bool = Field(default=False, alias="crop_signatures")
|
|
32
|
+
CropSignatures: bool = Field(default=True, alias="crop_signatures")
|
|
33
33
|
CropOutputDirectory: Path | None = Field(default=None, alias="crop_output_dir")
|
|
34
34
|
CropImageDpi: int = Field(default=200, alias="crop_image_dpi", ge=72, le=600)
|
|
35
|
+
DetectWetSignatures: bool = Field(default=True, alias="detect_wet_signatures")
|
|
36
|
+
WetOcrDpi: int = Field(default=200, alias="wet_ocr_dpi", ge=72, le=600)
|
|
37
|
+
WetOcrLanguages: str = Field(default="eng", alias="wet_ocr_languages")
|
|
38
|
+
WetPrecisionThreshold: float = Field(
|
|
39
|
+
default=0.82, alias="wet_precision_threshold", ge=0.0, le=1.0
|
|
40
|
+
)
|
|
35
41
|
|
|
36
42
|
@field_validator("PdfRoot", "OutputDirectory", "CropOutputDirectory", mode="before")
|
|
37
43
|
@classmethod
|
|
@@ -85,6 +91,22 @@ class DetectConfiguration(BaseModel):
|
|
|
85
91
|
def crop_image_dpi(self) -> int: # pragma: no cover - simple passthrough
|
|
86
92
|
return self.CropImageDpi
|
|
87
93
|
|
|
94
|
+
@property
|
|
95
|
+
def detect_wet_signatures(self) -> bool: # pragma: no cover - simple passthrough
|
|
96
|
+
return self.DetectWetSignatures
|
|
97
|
+
|
|
98
|
+
@property
|
|
99
|
+
def wet_ocr_dpi(self) -> int: # pragma: no cover - simple passthrough
|
|
100
|
+
return self.WetOcrDpi
|
|
101
|
+
|
|
102
|
+
@property
|
|
103
|
+
def wet_ocr_languages(self) -> str: # pragma: no cover - simple passthrough
|
|
104
|
+
return self.WetOcrLanguages
|
|
105
|
+
|
|
106
|
+
@property
|
|
107
|
+
def wet_precision_threshold(self) -> float: # pragma: no cover - simple passthrough
|
|
108
|
+
return self.WetPrecisionThreshold
|
|
109
|
+
|
|
88
110
|
|
|
89
111
|
def LoadConfiguration(path: Path | None) -> DetectConfiguration:
|
|
90
112
|
"""Load configuration from ``path`` while applying environment overrides.
|
|
@@ -108,6 +130,10 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
|
|
|
108
130
|
env_crop = os.getenv("SIGDETECT_CROP_SIGNATURES")
|
|
109
131
|
env_crop_dir = os.getenv("SIGDETECT_CROP_DIR")
|
|
110
132
|
env_crop_dpi = os.getenv("SIGDETECT_CROP_DPI")
|
|
133
|
+
env_detect_wet = os.getenv("SIGDETECT_DETECT_WET")
|
|
134
|
+
env_wet_dpi = os.getenv("SIGDETECT_WET_OCR_DPI")
|
|
135
|
+
env_wet_lang = os.getenv("SIGDETECT_WET_LANGUAGES")
|
|
136
|
+
env_wet_precision = os.getenv("SIGDETECT_WET_PRECISION")
|
|
111
137
|
|
|
112
138
|
raw_data: dict[str, object] = {}
|
|
113
139
|
if path and Path(path).exists():
|
|
@@ -133,6 +159,20 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
|
|
|
133
159
|
if env_crop_dpi:
|
|
134
160
|
with suppress(ValueError):
|
|
135
161
|
raw_data["crop_image_dpi"] = int(env_crop_dpi)
|
|
162
|
+
if env_detect_wet is not None:
|
|
163
|
+
lowered = env_detect_wet.lower()
|
|
164
|
+
if lowered in {"1", "true", "yes", "on"}:
|
|
165
|
+
raw_data["detect_wet_signatures"] = True
|
|
166
|
+
elif lowered in {"0", "false", "no", "off"}:
|
|
167
|
+
raw_data["detect_wet_signatures"] = False
|
|
168
|
+
if env_wet_dpi:
|
|
169
|
+
with suppress(ValueError):
|
|
170
|
+
raw_data["wet_ocr_dpi"] = int(env_wet_dpi)
|
|
171
|
+
if env_wet_lang:
|
|
172
|
+
raw_data["wet_ocr_languages"] = env_wet_lang
|
|
173
|
+
if env_wet_precision:
|
|
174
|
+
with suppress(ValueError):
|
|
175
|
+
raw_data["wet_precision_threshold"] = float(env_wet_precision)
|
|
136
176
|
|
|
137
177
|
configuration = DetectConfiguration(**raw_data)
|
|
138
178
|
return FinalizeConfiguration(configuration)
|
|
@@ -40,7 +40,9 @@ def crop_signatures(
|
|
|
40
40
|
dpi: int = 200,
|
|
41
41
|
logger: logging.Logger | None = None,
|
|
42
42
|
return_bytes: Literal[False] = False,
|
|
43
|
-
|
|
43
|
+
save_files: bool = True,
|
|
44
|
+
) -> list[Path]:
|
|
45
|
+
...
|
|
44
46
|
|
|
45
47
|
|
|
46
48
|
@overload
|
|
@@ -51,8 +53,10 @@ def crop_signatures(
|
|
|
51
53
|
output_dir: Path,
|
|
52
54
|
dpi: int = 200,
|
|
53
55
|
logger: logging.Logger | None = None,
|
|
54
|
-
return_bytes: Literal[True]
|
|
55
|
-
|
|
56
|
+
return_bytes: Literal[True],
|
|
57
|
+
save_files: bool = True,
|
|
58
|
+
) -> list[SignatureCrop]:
|
|
59
|
+
...
|
|
56
60
|
|
|
57
61
|
|
|
58
62
|
def crop_signatures(
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import warnings
|
|
5
6
|
from typing import TYPE_CHECKING, Type
|
|
6
7
|
|
|
7
8
|
from .base_detector import Detector
|
|
@@ -37,7 +38,23 @@ def BuildDetector(configuration: DetectConfiguration) -> Detector:
|
|
|
37
38
|
or getattr(configuration, "engine", None)
|
|
38
39
|
or PyPDF2Detector.Name
|
|
39
40
|
)
|
|
40
|
-
normalized = engine_name.lower()
|
|
41
|
+
normalized = str(engine_name).lower()
|
|
42
|
+
|
|
43
|
+
if normalized == "auto":
|
|
44
|
+
detector_cls: Type[Detector] | None = None
|
|
45
|
+
if PyMuPDFDetector is not None:
|
|
46
|
+
detector_cls = ENGINE_REGISTRY.get(getattr(PyMuPDFDetector, "Name", "")) or PyMuPDFDetector
|
|
47
|
+
if detector_cls is None:
|
|
48
|
+
detector_cls = ENGINE_REGISTRY.get(PyPDF2Detector.Name) or ENGINE_REGISTRY.get("pypdf")
|
|
49
|
+
warnings.warn(
|
|
50
|
+
"Engine 'auto' falling back to 'pypdf2' because PyMuPDF is unavailable",
|
|
51
|
+
RuntimeWarning,
|
|
52
|
+
stacklevel=2,
|
|
53
|
+
)
|
|
54
|
+
if detector_cls is None:
|
|
55
|
+
available = ", ".join(sorted(ENGINE_REGISTRY)) or "<none>"
|
|
56
|
+
raise ValueError(f"No available detector engines. Available engines: {available}")
|
|
57
|
+
return detector_cls(configuration)
|
|
41
58
|
|
|
42
59
|
detector_cls = ENGINE_REGISTRY.get(normalized)
|
|
43
60
|
if detector_cls is None:
|
|
@@ -111,6 +111,7 @@ class PyMuPDFDetector(PyPDF2Detector):
|
|
|
111
111
|
rect, exclusion, mode = rect_info
|
|
112
112
|
padded = self._PadRect(rect, page.rect, signature.Role, exclusion, mode)
|
|
113
113
|
signature.BoundingBox = self._RectToPdfTuple(padded, page.rect.height)
|
|
114
|
+
signature.RenderType = "drawn"
|
|
114
115
|
if signature.Page is None:
|
|
115
116
|
signature.Page = page_index + 1
|
|
116
117
|
break
|
|
@@ -348,7 +348,7 @@ class PyPDF2Detector(Detector):
|
|
|
348
348
|
return normalized.lower().startswith("im")
|
|
349
349
|
|
|
350
350
|
def _ClassifyAppearance(self, widget: generic.DictionaryObject, page) -> str:
|
|
351
|
-
"""Classify the widget's appearance as drawn
|
|
351
|
+
"""Classify the widget's appearance as drawn or typed."""
|
|
352
352
|
|
|
353
353
|
ap_dict = AsDictionary(widget.get("/AP"))
|
|
354
354
|
if not isinstance(ap_dict, generic.DictionaryObject):
|
|
@@ -356,7 +356,7 @@ class PyPDF2Detector(Detector):
|
|
|
356
356
|
normal = ap_dict.get("/N")
|
|
357
357
|
streams = self._ExtractAppearanceStreams(normal)
|
|
358
358
|
if not streams:
|
|
359
|
-
return "
|
|
359
|
+
return "typed"
|
|
360
360
|
|
|
361
361
|
has_text = False
|
|
362
362
|
has_vector = False
|
|
@@ -384,13 +384,11 @@ class PyPDF2Detector(Detector):
|
|
|
384
384
|
has_image = True
|
|
385
385
|
break
|
|
386
386
|
|
|
387
|
-
if has_image and (has_text or has_vector):
|
|
388
|
-
return "hybrid"
|
|
389
387
|
if has_image:
|
|
390
388
|
return "drawn"
|
|
391
389
|
if has_text or has_vector:
|
|
392
390
|
return "typed"
|
|
393
|
-
return "
|
|
391
|
+
return "typed"
|
|
394
392
|
|
|
395
393
|
# ---- file-wide stream scan (compressed or not)
|
|
396
394
|
def _ScanFileStreamsForVendors(self, file_bytes: bytes) -> tuple[set[str], str]:
|
|
@@ -863,6 +861,7 @@ class PyPDF2Detector(Detector):
|
|
|
863
861
|
Scores={r: sc},
|
|
864
862
|
Evidence=ev + ["pseudo:true"],
|
|
865
863
|
Hint="VendorOrAcroOnly",
|
|
864
|
+
RenderType="typed",
|
|
866
865
|
)
|
|
867
866
|
)
|
|
868
867
|
|
|
@@ -903,6 +902,7 @@ class PyPDF2Detector(Detector):
|
|
|
903
902
|
Scores={role: score} if score > 0 else {},
|
|
904
903
|
Evidence=ev + ["pseudo:true"],
|
|
905
904
|
Hint="VendorOrAcroOnly",
|
|
905
|
+
RenderType="typed",
|
|
906
906
|
)
|
|
907
907
|
)
|
|
908
908
|
|
|
@@ -1055,6 +1055,7 @@ class PyPDF2Detector(Detector):
|
|
|
1055
1055
|
Scores=scores,
|
|
1056
1056
|
Evidence=evidence,
|
|
1057
1057
|
Hint=f"AcroSig:{fname}" if fname else "AcroSig",
|
|
1058
|
+
RenderType="typed",
|
|
1058
1059
|
)
|
|
1059
1060
|
)
|
|
1060
1061
|
|
|
@@ -1120,6 +1121,7 @@ class PyPDF2Detector(Detector):
|
|
|
1120
1121
|
Scores=dict(scores),
|
|
1121
1122
|
Evidence=evidence + ["pseudo:true"],
|
|
1122
1123
|
Hint="VendorOrAcroOnly",
|
|
1124
|
+
RenderType="typed",
|
|
1123
1125
|
)
|
|
1124
1126
|
)
|
|
1125
1127
|
|