sigdetect 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sigdetect/api.py CHANGED
@@ -229,6 +229,7 @@ def CropSignatureImages(
229
229
  dpi: int = 200,
230
230
  returnBytes: Literal[False] = False,
231
231
  saveToDisk: bool = True,
232
+ docx: bool = False,
232
233
  ) -> list[Path]: ...
233
234
 
234
235
 
@@ -241,6 +242,7 @@ def CropSignatureImages(
241
242
  dpi: int,
242
243
  returnBytes: Literal[True],
243
244
  saveToDisk: bool,
245
+ docx: bool = False,
244
246
  ) -> list[SignatureCrop]: ...
245
247
 
246
248
 
@@ -252,16 +254,17 @@ def CropSignatureImages(
252
254
  dpi: int = 200,
253
255
  returnBytes: bool = False,
254
256
  saveToDisk: bool = True,
257
+ docx: bool = False,
255
258
  ) -> list[Path] | list[SignatureCrop]:
256
- """Create DOCX files containing cropped signature images.
259
+ """Create PNG files containing cropped signature images (or DOCX when enabled).
257
260
 
258
261
  Accepts either a :class:`FileResult` instance or the ``dict`` returned by
259
262
  :func:`DetectPdf`. Requires the optional ``pymupdf`` dependency.
260
263
  Set ``returnBytes=True`` to also receive in-memory PNG bytes for each crop. Set
261
264
  ``saveToDisk=False`` to skip writing PNG files while still returning in-memory data.
262
- When ``saveToDisk`` is enabled, a one-image DOCX file is also written per crop. When
263
- ``returnBytes`` is True and ``python-docx`` is available, the returned
264
- :class:`SignatureCrop` objects include ``docx_bytes``.
265
+ When ``docx`` is True, DOCX files are written instead of PNG files. When ``returnBytes`` is
266
+ True and ``docx`` is enabled, the returned :class:`SignatureCrop` objects include
267
+ ``docx_bytes``.
265
268
  """
266
269
 
267
270
  from sigdetect.cropping import crop_signatures
@@ -274,6 +277,7 @@ def CropSignatureImages(
274
277
  dpi=dpi,
275
278
  return_bytes=returnBytes,
276
279
  save_files=saveToDisk,
280
+ docx=docx,
277
281
  )
278
282
  if original_dict is not None:
279
283
  original_dict.clear()
@@ -305,6 +309,8 @@ def _CoerceFileResult(
305
309
  BoundingBox=tuple(bbox) if bbox else None,
306
310
  CropPath=entry.get("crop_path"),
307
311
  CropBytes=entry.get("crop_bytes"),
312
+ CropDocxPath=entry.get("crop_docx_path"),
313
+ CropDocxBytes=entry.get("crop_docx_bytes"),
308
314
  )
309
315
  )
310
316
 
sigdetect/cli.py CHANGED
@@ -64,13 +64,19 @@ def Detect(
64
64
  cropSignatures: bool | None = typer.Option(
65
65
  None,
66
66
  "--crop-signatures/--no-crop-signatures",
67
- help="Write DOCX files containing cropped signature images (requires PyMuPDF + python-docx)",
67
+ help="Write PNG crops for signature widgets (requires PyMuPDF)",
68
+ show_default=False,
69
+ ),
70
+ cropDocx: bool | None = typer.Option(
71
+ None,
72
+ "--crop-docx/--no-crop-docx",
73
+ help="Write DOCX crops instead of PNG files (requires PyMuPDF + python-docx)",
68
74
  show_default=False,
69
75
  ),
70
76
  cropDirectory: Path | None = typer.Option(
71
77
  None,
72
78
  "--crop-dir",
73
- help="Directory for signature DOCX crops (defaults to out_dir/signature_crops)",
79
+ help="Directory for signature crops (defaults to out_dir/signature_crops)",
74
80
  ),
75
81
  cropDpi: int | None = typer.Option(
76
82
  None,
@@ -83,7 +89,7 @@ def Detect(
83
89
  cropBytes: bool = typer.Option(
84
90
  False,
85
91
  "--crop-bytes/--no-crop-bytes",
86
- help="Embed base64 PNG bytes for signature crops in results JSON",
92
+ help="Embed base64 PNG bytes (and DOCX bytes when --crop-docx) in results JSON",
87
93
  show_default=False,
88
94
  ),
89
95
  detectWetSignatures: bool | None = typer.Option(
@@ -128,6 +134,8 @@ def Detect(
128
134
  overrides["WriteResults"] = writeResults
129
135
  if cropSignatures is not None:
130
136
  overrides["CropSignatures"] = cropSignatures
137
+ if cropDocx is not None:
138
+ overrides["CropDocx"] = cropDocx
131
139
  if cropDirectory is not None:
132
140
  overrides["CropOutputDirectory"] = cropDirectory
133
141
  if cropDpi is not None:
@@ -181,6 +189,7 @@ def Detect(
181
189
  base_dir = configuration.OutputDirectory or configuration.PdfRoot
182
190
  crop_dir = base_dir / "signature_crops"
183
191
  cropping_enabled = configuration.CropSignatures
192
+ docx_enabled = configuration.CropDocx
184
193
  cropping_available = True
185
194
  cropping_attempted = False
186
195
 
@@ -199,6 +208,7 @@ def Detect(
199
208
  logger=Logger,
200
209
  return_bytes=crop_bytes_enabled,
201
210
  save_files=cropping_enabled,
211
+ docx=docx_enabled,
202
212
  )
203
213
  cropping_attempted = True
204
214
  if crop_bytes_enabled:
@@ -206,15 +216,18 @@ def Detect(
206
216
  crop.signature.CropBytes = base64.b64encode(crop.image_bytes).decode(
207
217
  "ascii"
208
218
  )
219
+ if crop.docx_bytes:
220
+ crop.signature.CropDocxBytes = base64.b64encode(
221
+ crop.docx_bytes
222
+ ).decode("ascii")
209
223
  except SignatureCroppingUnavailable as exc:
210
224
  cropping_available = False
211
225
  Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
212
226
  typer.echo(str(exc), err=True)
213
227
  except Exception as exc: # pragma: no cover - defensive
214
- Logger.warning(
215
- "Unexpected error while cropping signatures",
216
- extra={"error": str(exc)},
217
- )
228
+ cropping_available = False
229
+ Logger.warning("Signature cropping unavailable", extra={"error": str(exc)})
230
+ typer.echo(str(exc), err=True)
218
231
 
219
232
  total_bboxes += sum(1 for sig in file_result.Signatures if sig.BoundingBox)
220
233
 
sigdetect/config.py CHANGED
@@ -31,6 +31,7 @@ class DetectConfiguration(BaseModel):
31
31
  PseudoSignatures: bool = Field(default=True, alias="pseudo_signatures")
32
32
  RecurseXObjects: bool = Field(default=True, alias="recurse_xobjects")
33
33
  CropSignatures: bool = Field(default=True, alias="crop_signatures")
34
+ CropDocx: bool = Field(default=False, alias="crop_docx")
34
35
  CropOutputDirectory: Path | None = Field(default=None, alias="crop_output_dir")
35
36
  CropImageDpi: int = Field(default=200, alias="crop_image_dpi", ge=72, le=600)
36
37
  DetectWetSignatures: bool = Field(default=True, alias="detect_wet_signatures")
@@ -88,6 +89,10 @@ class DetectConfiguration(BaseModel):
88
89
  def crop_signatures(self) -> bool: # pragma: no cover - simple passthrough
89
90
  return self.CropSignatures
90
91
 
92
+ @property
93
+ def crop_docx(self) -> bool: # pragma: no cover - simple passthrough
94
+ return self.CropDocx
95
+
91
96
  @property
92
97
  def crop_output_dir(self) -> Path | None: # pragma: no cover - simple passthrough
93
98
  return self.CropOutputDirectory
@@ -133,6 +138,7 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
133
138
  env_out_dir = os.getenv("SIGDETECT_OUT_DIR")
134
139
  env_profile = os.getenv("SIGDETECT_PROFILE")
135
140
  env_crop = os.getenv("SIGDETECT_CROP_SIGNATURES")
141
+ env_crop_docx = os.getenv("SIGDETECT_CROP_DOCX")
136
142
  env_crop_dir = os.getenv("SIGDETECT_CROP_DIR")
137
143
  env_crop_dpi = os.getenv("SIGDETECT_CROP_DPI")
138
144
  env_detect_wet = os.getenv("SIGDETECT_DETECT_WET")
@@ -159,6 +165,12 @@ def LoadConfiguration(path: Path | None) -> DetectConfiguration:
159
165
  raw_data["crop_signatures"] = True
160
166
  elif lowered in {"0", "false", "no", "off"}:
161
167
  raw_data["crop_signatures"] = False
168
+ if env_crop_docx is not None:
169
+ lowered = env_crop_docx.lower()
170
+ if lowered in {"1", "true", "yes", "on"}:
171
+ raw_data["crop_docx"] = True
172
+ elif lowered in {"0", "false", "no", "off"}:
173
+ raw_data["crop_docx"] = False
162
174
  if env_crop_dir:
163
175
  raw_data["crop_output_dir"] = env_crop_dir
164
176
  if env_crop_dpi:
sigdetect/cropping.py CHANGED
@@ -1,4 +1,4 @@
1
- """Helpers for converting signature bounding boxes into DOCX crops."""
1
+ """Helpers for converting signature bounding boxes into PNG or DOCX crops."""
2
2
 
3
3
  from __future__ import annotations
4
4
 
@@ -9,6 +9,8 @@ from dataclasses import dataclass
9
9
  from pathlib import Path
10
10
  from typing import Literal, overload
11
11
 
12
+ from PIL import Image
13
+
12
14
  from .detector.file_result_model import FileResult
13
15
  from .detector.signature_model import Signature
14
16
 
@@ -27,7 +29,7 @@ class SignatureCroppingUnavailable(RuntimeError):
27
29
  """Raised when PNG cropping cannot be performed (e.g., PyMuPDF missing)."""
28
30
 
29
31
 
30
- class SignatureDocxUnavailable(RuntimeError):
32
+ class SignatureDocxUnavailable(SignatureCroppingUnavailable):
31
33
  """Raised when DOCX creation cannot be performed (e.g., python-docx missing)."""
32
34
 
33
35
 
@@ -52,6 +54,8 @@ def crop_signatures(
52
54
  logger: logging.Logger | None = None,
53
55
  return_bytes: Literal[False] = False,
54
56
  save_files: bool = True,
57
+ docx: bool = False,
58
+ trim: bool = True,
55
59
  ) -> list[Path]: ...
56
60
 
57
61
 
@@ -65,6 +69,8 @@ def crop_signatures(
65
69
  logger: logging.Logger | None = None,
66
70
  return_bytes: Literal[True],
67
71
  save_files: bool = True,
72
+ docx: bool = False,
73
+ trim: bool = True,
68
74
  ) -> list[SignatureCrop]: ...
69
75
 
70
76
 
@@ -77,14 +83,16 @@ def crop_signatures(
77
83
  logger: logging.Logger | None = None,
78
84
  return_bytes: bool = False,
79
85
  save_files: bool = True,
86
+ docx: bool = False,
87
+ trim: bool = True,
80
88
  ) -> list[Path] | list[SignatureCrop]:
81
- """Render each signature bounding box to a PNG image and wrap it in a DOCX file.
89
+ """Render each signature bounding box to a PNG image and optionally wrap it in DOCX.
82
90
 
83
91
  Set ``return_bytes=True`` to collect in-memory PNG bytes for each crop while also writing
84
92
  the files to ``output_dir``. Set ``save_files=False`` to skip writing PNGs to disk.
85
- When ``save_files`` is enabled, a one-image DOCX file is also written per signature crop.
86
- When ``return_bytes`` is True and ``python-docx`` is available, ``SignatureCrop.docx_bytes``
87
- will contain the DOCX payload.
93
+ When ``docx=True``, DOCX files are written instead of PNGs. When ``return_bytes`` is True
94
+ and ``docx=True``, ``SignatureCrop.docx_bytes`` will contain the DOCX payload.
95
+ When ``trim`` is enabled, the crop is tightened around the detected ink where possible.
88
96
  """
89
97
 
90
98
  if fitz is None: # pragma: no cover - exercised when dependency absent
@@ -101,14 +109,11 @@ def crop_signatures(
101
109
  generated_paths: list[Path] = []
102
110
  generated_crops: list[SignatureCrop] = []
103
111
 
104
- docx_to_disk = save_files
105
- docx_in_memory = return_bytes
106
- docx_enabled = docx_to_disk or docx_in_memory
112
+ docx_enabled = docx
107
113
  docx_available = Document is not None
108
- if docx_enabled and not docx_available and logger:
109
- logger.warning(
110
- "Signature DOCX output unavailable",
111
- extra={"error": "python-docx is required to generate DOCX outputs"},
114
+ if docx_enabled and not docx_available:
115
+ raise SignatureDocxUnavailable(
116
+ "python-docx is required to generate DOCX outputs for signature crops."
112
117
  )
113
118
 
114
119
  with fitz.open(pdf_path) as document: # type: ignore[attr-defined]
@@ -146,8 +151,12 @@ def crop_signatures(
146
151
  try:
147
152
  image_bytes: bytes | None = None
148
153
  pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
154
+ raw_bytes = pixmap.tobytes("png")
155
+ final_bytes = _trim_signature_image_bytes(raw_bytes) if trim else raw_bytes
156
+ if save_files and not docx_enabled:
157
+ png_destination.write_bytes(final_bytes)
149
158
  if return_bytes or docx_enabled:
150
- image_bytes = pixmap.tobytes("png")
159
+ image_bytes = final_bytes
151
160
  except Exception as exc: # pragma: no cover - defensive
152
161
  if logger:
153
162
  logger.warning(
@@ -162,12 +171,12 @@ def crop_signatures(
162
171
  continue
163
172
 
164
173
  docx_bytes: bytes | None = None
165
- if docx_enabled and docx_available:
174
+ if docx_enabled:
166
175
  if image_bytes is None: # pragma: no cover - defensive
167
176
  continue
168
177
  try:
169
178
  docx_bytes = _build_docx_bytes(image_bytes)
170
- if docx_to_disk:
179
+ if save_files:
171
180
  docx_destination.write_bytes(docx_bytes)
172
181
  except SignatureDocxUnavailable as exc:
173
182
  if logger:
@@ -184,14 +193,20 @@ def crop_signatures(
184
193
  )
185
194
 
186
195
  if save_files:
187
- signature.CropPath = str(docx_destination)
188
- generated_paths.append(docx_destination)
196
+ if docx_enabled:
197
+ signature.CropPath = None
198
+ signature.CropDocxPath = str(docx_destination)
199
+ generated_paths.append(docx_destination)
200
+ else:
201
+ signature.CropDocxPath = None
202
+ signature.CropPath = str(png_destination)
203
+ generated_paths.append(png_destination)
189
204
  if return_bytes:
190
205
  if image_bytes is None: # pragma: no cover - defensive
191
206
  continue
192
207
  generated_crops.append(
193
208
  SignatureCrop(
194
- path=docx_destination,
209
+ path=docx_destination if docx_enabled else png_destination,
195
210
  image_bytes=image_bytes,
196
211
  signature=signature,
197
212
  docx_bytes=docx_bytes,
@@ -214,6 +229,233 @@ def _build_docx_bytes(image_bytes: bytes) -> bytes:
214
229
  return buffer.getvalue()
215
230
 
216
231
 
232
def _trim_signature_image_bytes(
    image_bytes: bytes,
    *,
    pad_px: int = 4,
    gap_px: int = 4,
    min_density_ratio: float = 0.004,
) -> bytes:
    """Tighten a rendered signature crop around its densest band of dark pixels.

    The PNG is converted to grayscale, a background ("white") level is estimated
    from the histogram, and rows are classified as inked at several thresholds
    just below that level. Runs of inked rows are merged into bands, the best
    scoring band is grown vertically, and the image is cropped to the band plus
    padding.

    Args:
        image_bytes: Encoded image readable by ``PIL.Image.open``.
        pad_px: Pixels of padding added on every side of the selected box.
        gap_px: Maximum vertical gap (in rows) across which neighbouring inked
            row runs are merged into one band.
        min_density_ratio: Fraction of the image width that must be dark for a
            row to count as inked (never fewer than 2 pixels).

    Returns:
        PNG bytes of the trimmed crop, or ``image_bytes`` unchanged when the
        estimated background level is below 200, when no inked band is found,
        or when the resulting box is degenerate or smaller than 20% of the
        width (min 10 px) / 8% of the height (min 6 px).
    """
    image = Image.open(io.BytesIO(image_bytes))
    gray = image.convert("L")
    width, height = gray.size

    # Background estimate: the first gray level at which the cumulative
    # histogram reaches 99.5% of all pixels.
    histogram = gray.histogram()
    total_pixels = width * height
    cutoff = int(total_pixels * 0.995)
    cumulative = 0
    white_level = 255
    for idx, count in enumerate(histogram):
        cumulative += count
        if cumulative >= cutoff:
            white_level = idx
            break

    if white_level < 200:
        return image_bytes

    # Clamping to [200, 254] frequently collapses several deltas onto the same
    # value (e.g. white_level == 255 -> 249, 251, 253, 254, 254).  Every
    # threshold costs a full per-pixel scan, and a repeated threshold can never
    # beat its twin under the strict ``>`` score comparison below, so duplicates
    # are dropped.  Order is preserved, so ``thresholds[-1]`` is unchanged.
    thresholds = list(
        dict.fromkeys(
            min(254, max(200, white_level - delta)) for delta in (6, 4, 2, 1, 0)
        )
    )
    min_density = max(2, int(width * min_density_ratio))
    pixels = gray.load()

    # Dark-pixel count per row, for each threshold.
    row_densities: dict[int, list[int]] = {}
    for threshold in thresholds:
        row_density = []
        for y in range(height):
            dark = sum(1 for x in range(width) if pixels[x, y] < threshold)
            row_density.append(dark)
        row_densities[threshold] = row_density

    # If a horizontal rule is detected, bands are only searched above it and
    # the later downward growth is capped slightly below it.
    line_bounds = _detect_horizontal_rule_cutoff(row_densities[thresholds[-1]], width)
    scan_limit = None
    descender_limit = height - 1
    if line_bounds is not None:
        line_start, line_end = line_bounds
        scan_limit = max(0, line_start - 1)
        descender_limit = min(height - 1, line_end + max(2, int(height * 0.02)))

    min_band_height = max(4, int(height * 0.02))
    best = None
    best_small = None
    best_small_threshold = None
    best_threshold = None
    line_threshold = int(width * 0.6)
    for threshold in thresholds:
        row_density = row_densities[threshold]

        # Collect maximal runs of inked rows, stopping at the scan limit.
        segments: list[tuple[int, int]] = []
        start: int | None = None
        for y, dark in enumerate(row_density):
            if scan_limit is not None and y > scan_limit:
                if start is not None:
                    segments.append((start, y - 1))
                    start = None
                break
            if dark >= min_density:
                if start is None:
                    start = y
            else:
                if start is not None:
                    segments.append((start, y - 1))
                    start = None
        if start is not None:
            segments.append((start, height - 1))

        if not segments:
            continue

        # Merge runs separated by at most ``gap_px`` rows.
        merged: list[list[int]] = []
        for seg in segments:
            if not merged:
                merged.append([seg[0], seg[1]])
                continue
            if seg[0] - merged[-1][1] <= gap_px:
                merged[-1][1] = seg[1]
            else:
                merged.append([seg[0], seg[1]])

        candidates = []
        for y0, y1 in merged:
            min_x, max_x = width, -1
            total_dark = 0
            for y in range(y0, y1 + 1):
                for x in range(width):
                    if pixels[x, y] < threshold:
                        total_dark += 1
                        if x < min_x:
                            min_x = x
                        if x > max_x:
                            max_x = x
            if max_x < 0:
                continue
            band_height = y1 - y0 + 1
            band_width = max_x - min_x + 1
            # Score grows with ink mass and (super-linearly) with band height;
            # bands ending close above a detected rule get up to a 1.5x boost.
            score = total_dark * (band_height**1.3)
            if line_bounds is not None:
                distance = max(0, line_bounds[0] - y1)
                proximity = 1.0 / (1.0 + (distance / 20.0))
                score *= 1.0 + 0.5 * proximity
            candidates.append(
                {
                    "y0": y0,
                    "y1": y1,
                    "min_x": min_x,
                    "max_x": max_x,
                    "total": total_dark,
                    "height": band_height,
                    "width": band_width,
                    "score": score,
                }
            )

        if not candidates:
            continue

        candidates.sort(key=lambda item: item["score"], reverse=True)
        top_candidate = candidates[0]
        if top_candidate["height"] >= min_band_height:
            if best is None or top_candidate["score"] > best["score"]:
                best = top_candidate
                best_threshold = threshold
        else:
            # Bands shorter than ``min_band_height`` are kept only as a fallback.
            if best_small is None or top_candidate["score"] > best_small["score"]:
                best_small = top_candidate
                best_small_threshold = threshold

    if best is None:
        best = best_small
        best_threshold = best_small_threshold

    if best is None:
        return image_bytes

    # Grow the band vertically while neighbouring rows still carry some ink,
    # using a looser density requirement than the initial segmentation.
    expansion_density = row_densities.get(best_threshold, row_densities[thresholds[-1]])
    expand_threshold = max(1, int(min_density * 0.4))
    y0 = best["y0"]
    y1 = best["y1"]

    while y0 > 0 and expansion_density[y0 - 1] >= expand_threshold:
        y0 -= 1
    while y1 < descender_limit and expansion_density[y1 + 1] >= expand_threshold:
        y1 += 1

    # Recompute horizontal extent over the grown band, ignoring rows dense
    # enough (>= 60% of the width) to be a rule line.
    min_x, max_x = width, -1
    for y in range(y0, y1 + 1):
        if expansion_density[y] >= line_threshold:
            continue
        for x in range(width):
            if pixels[x, y] < thresholds[-1]:
                if x < min_x:
                    min_x = x
                if x > max_x:
                    max_x = x
    if max_x >= 0:
        best = {
            "y0": y0,
            "y1": y1,
            "min_x": min_x,
            "max_x": max_x,
        }

    x0 = max(0, best["min_x"] - pad_px)
    x1 = min(width - 1, best["max_x"] + pad_px)
    y0 = max(0, best["y0"] - pad_px)
    y1 = min(height - 1, best["y1"] + pad_px)

    # Refuse degenerate or implausibly small boxes and keep the original crop.
    if x1 <= x0 or y1 <= y0:
        return image_bytes
    if (x1 - x0) < max(10, int(width * 0.2)) or (y1 - y0) < max(6, int(height * 0.08)):
        return image_bytes

    cropped = image.crop((x0, y0, x1 + 1, y1 + 1))
    buffer = io.BytesIO()
    cropped.save(buffer, format="PNG")
    return buffer.getvalue()
413
+
414
+
415
+ def _detect_horizontal_rule_cutoff(
416
+ row_density: list[int],
417
+ width: int,
418
+ ) -> tuple[int, int] | None:
419
+ if not row_density:
420
+ return None
421
+ line_threshold = int(width * 0.6)
422
+ max_thickness = 4
423
+ segments: list[tuple[int, int]] = []
424
+ start = None
425
+ for y, density in enumerate(row_density):
426
+ if density >= line_threshold:
427
+ if start is None:
428
+ start = y
429
+ else:
430
+ if start is not None:
431
+ segments.append((start, y - 1))
432
+ start = None
433
+ if start is not None:
434
+ segments.append((start, len(row_density) - 1))
435
+
436
+ if not segments:
437
+ return None
438
+
439
+ total_dark = sum(row_density)
440
+ if total_dark <= 0:
441
+ return None
442
+
443
+ for y0, y1 in segments:
444
+ thickness = y1 - y0 + 1
445
+ if thickness > max_thickness:
446
+ continue
447
+ above_dark = sum(row_density[:y0])
448
+ below_dark = sum(row_density[y1 + 1 :])
449
+ if above_dark < 40:
450
+ continue
451
+ midpoint_ratio = ((y0 + y1) / 2.0) / max(1, len(row_density))
452
+ if midpoint_ratio >= 0.35:
453
+ return (y0, y1)
454
+ if above_dark >= max(40, int(below_dark * 0.3)):
455
+ return (y0, y1)
456
+ return None
457
+
458
+
217
459
  def _to_clip_rect(page, bbox: tuple[float, float, float, float]):
218
460
  width = float(page.rect.width)
219
461
  height = float(page.rect.height)
@@ -21,6 +21,8 @@ class Signature:
21
21
  BoundingBox: tuple[float, float, float, float] | None = None
22
22
  CropPath: str | None = None
23
23
  CropBytes: str | None = None
24
+ CropDocxPath: str | None = None
25
+ CropDocxBytes: str | None = None
24
26
 
25
27
  def to_dict(self) -> dict[str, Any]:
26
28
  """Return the legacy snake_case representation used in JSON payloads."""
@@ -37,4 +39,6 @@ class Signature:
37
39
  "bounding_box": list(self.BoundingBox) if self.BoundingBox else None,
38
40
  "crop_path": self.CropPath,
39
41
  "crop_bytes": self.CropBytes,
42
+ "crop_docx_path": self.CropDocxPath,
43
+ "crop_docx_bytes": self.CropDocxBytes,
40
44
  }
@@ -94,6 +94,17 @@ def apply_wet_detection(
94
94
  original_mixed = file_result.MixedContent
95
95
  try:
96
96
  added = _detect(pdf_path, configuration, file_result, logger=logger)
97
+ if added and configuration.Profile == "hipaa":
98
+ updated = False
99
+ for signature in file_result.Signatures:
100
+ if signature.RenderType == "wet" and (signature.Role or "unknown") == "unknown":
101
+ signature.Role = "patient"
102
+ signature.Scores = {"patient": int(signature.Score or 0)}
103
+ signature.Evidence = list(signature.Evidence or [])
104
+ signature.Evidence.append("role_default:patient")
105
+ updated = True
106
+ if updated:
107
+ _refresh_metadata(file_result)
97
108
  if not added:
98
109
  _mark_manual_review(file_result, "NoHighConfidenceWetSignature")
99
110
  return added
@@ -136,6 +147,18 @@ def _detect(
136
147
  scale=configuration.WetOcrDpi / 72.0,
137
148
  )
138
149
  )
150
+ if not candidates:
151
+ candidates = list(
152
+ _build_candidates(
153
+ ocr_lines,
154
+ image=image,
155
+ page_rect=page.rect,
156
+ pix_width=pixmap.width,
157
+ pix_height=pixmap.height,
158
+ scale=configuration.WetOcrDpi / 72.0,
159
+ min_y_ratio=0.2,
160
+ )
161
+ )
139
162
  candidates.extend(_image_candidates(page))
140
163
  candidates = _filter_candidates_for_page(candidates)
141
164
  accepted = [
@@ -247,6 +270,7 @@ def _build_candidates(
247
270
  pix_width: int,
248
271
  pix_height: int,
249
272
  scale: float,
273
+ min_y_ratio: float = 0.4,
250
274
  ) -> Iterable[WetCandidate]:
251
275
  for line in lines:
252
276
  normalized = line.text.lower()
@@ -255,7 +279,7 @@ def _build_candidates(
255
279
  if len(normalized) > 80:
256
280
  # Ignore long paragraph-like OCR lines
257
281
  continue
258
- if (line.bottom / pix_height) < 0.4:
282
+ if (line.bottom / pix_height) < min_y_ratio:
259
283
  # Ignore lines in the upper section of the page
260
284
  continue
261
285
  role = _infer_role(normalized)
@@ -338,28 +362,33 @@ def _expand_bbox(
338
362
  ) -> tuple[float, float, float, float]:
339
363
  x0 = line.left / scale
340
364
  x1 = line.right / scale
341
- y1 = (pix_height - line.top) / scale
365
+ y_top = (pix_height - line.top) / scale
366
+ y_bottom = (pix_height - line.bottom) / scale
342
367
 
343
368
  pad_x = max(14.0, (x1 - x0) * 0.25)
344
369
  left = max(page_rect.x0, x0 - pad_x)
345
370
  right = min(page_rect.x1, x1 + pad_x)
346
371
 
347
372
  gap = 14.0
348
- signature_height = 70.0
349
- top = min(page_rect.y1, y1 + gap)
350
- bottom = min(page_rect.y1, top + signature_height)
351
-
352
- if bottom <= top:
353
- bottom = min(page_rect.y1, top + signature_height)
373
+ line_height = max(1.0, (line.bottom - line.top) / scale)
374
+ signature_height = max(70.0, line_height * 6.0)
375
+ upper = min(page_rect.y1, y_bottom - gap)
376
+ upper = max(page_rect.y0, upper)
377
+ lower = max(page_rect.y0, upper - signature_height)
354
378
 
355
379
  if stroke_y is not None:
356
- # Anchor to the detected stroke under the OCR label when available.
380
+ # Anchor to the detected stroke (signature line) beneath the label.
357
381
  sy = (pix_height - stroke_y) / scale
358
- if sy < top:
359
- top = sy
360
- bottom = max(bottom, sy + signature_height)
382
+ field_lower = min(page_rect.y1, max(page_rect.y0, sy + 2.0))
383
+ field_upper = min(page_rect.y1, y_bottom - gap)
384
+ if field_upper > field_lower + 6.0:
385
+ lower = field_lower
386
+ upper = field_upper
387
+ else:
388
+ upper = min(page_rect.y1, field_lower + signature_height)
389
+ lower = max(page_rect.y0, upper - signature_height)
361
390
 
362
- return (float(left), float(top), float(right), float(bottom))
391
+ return (float(left), float(lower), float(right), float(upper))
363
392
 
364
393
 
365
394
  def _stroke_under_line(image: Image.Image, line: OcrLine) -> tuple[bool, float | None]:
@@ -513,14 +542,19 @@ def _signature_rank(signature: Signature) -> tuple[int, int, int]:
513
542
 
514
543
  def _dedupe_wet_signatures(signatures: Sequence[Signature]) -> list[Signature]:
515
544
  best_by_role: dict[str, Signature] = {}
545
+ best_unknown: Signature | None = None
516
546
  for signature in signatures:
517
547
  role = (signature.Role or "unknown").strip().lower()
518
548
  if role == "unknown":
549
+ if best_unknown is None or _signature_rank(signature) > _signature_rank(best_unknown):
550
+ best_unknown = signature
519
551
  continue
520
552
  existing = best_by_role.get(role)
521
553
  if existing is None or _signature_rank(signature) > _signature_rank(existing):
522
554
  best_by_role[role] = signature
523
- return sorted(best_by_role.values(), key=lambda sig: (int(sig.Page or 0), sig.Role or ""))
555
+ if best_by_role:
556
+ return sorted(best_by_role.values(), key=lambda sig: (int(sig.Page or 0), sig.Role or ""))
557
+ return [best_unknown] if best_unknown is not None else []
524
558
 
525
559
 
526
560
  def _mark_manual_review(file_result: FileResult, reason: str) -> None:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sigdetect
3
- Version: 0.5.0
3
+ Version: 0.5.2
4
4
  Summary: Signature detection and role attribution for PDFs
5
5
  Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
6
6
  License: MIT
@@ -105,7 +105,7 @@ sigdetect detect \
105
105
  - `retainer` → client / firm (prefers detecting two signatures)
106
106
  - `--recursive/--no-recursive` toggles whether `sigdetect detect` descends into subdirectories when hunting for PDFs (recursive by default).
107
107
  - Results output is disabled by default; set `write_results: true` or pass `--write-results` when you need `results.json` (for EDA).
108
- - Cropping (`--crop-signatures`) writes a one-image `.docx` per signature in the crop output directory (no PNG files on disk); `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` for in-memory use. PyMuPDF is required for crops, and `python-docx` is required for `.docx` output.
108
+ - Cropping (`--crop-signatures`) writes PNG crops to disk by default; enable `--crop-docx` to write DOCX files instead of PNGs. `--crop-bytes` embeds base64 PNG data in `signatures[].crop_bytes` and, when `--crop-docx` is enabled, embeds DOCX bytes in `signatures[].crop_docx_bytes`. PyMuPDF is required for crops, and `python-docx` is required for DOCX output.
109
109
  - Wet detection runs automatically for non-e-sign PDFs when dependencies are available; missing OCR dependencies add a `ManualReview:*` hint instead of failing. PyMuPDF + Tesseract are required for wet detection.
110
110
  - If the executable is not on `PATH`, you can always fall back to `python -m sigdetect.cli ...`.
111
111
 
@@ -142,7 +142,7 @@ result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
142
142
  print(result.to_dict())
143
143
  ~~~
144
144
 
145
- `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When cropping is enabled, `crop_path` points at the generated `.docx`. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
145
+ `Detect(Path)` returns a **FileResult** dataclass; call `.to_dict()` for the JSON-friendly representation (see [Result schema](#result-schema)). Each signature entry now exposes `bounding_box` coordinates (PDF points, origin bottom-left). When PNG cropping is enabled, `crop_path` points at the generated image; when DOCX cropping is enabled, `crop_docx_path` points at the generated doc. Use `Engine="auto"` if you want the single-pass defaults that prefer PyMuPDF (for geometry) when available.
146
146
 
147
147
  ---
148
148
 
@@ -194,7 +194,7 @@ for res in ScanDirectory(
194
194
  # store in DB, print, etc.
195
195
  pass
196
196
 
197
- # 3) Create DOCX crops for FileResult objects (requires PyMuPDF + python-docx)
197
+ # 3) Crop signature snippets for FileResult objects (requires PyMuPDF; DOCX needs python-docx)
198
198
  detector = get_detector(pdfRoot="/path/to/pdfs", profileName="hipaa")
199
199
  file_result = detector.Detect(Path("/path/to/pdfs/example.pdf"))
200
200
  CropSignatureImages(
@@ -233,7 +233,8 @@ High-level summary (per file):
233
233
  "hint": "AcroSig:sig_patient",
234
234
  "render_type": "typed",
235
235
  "bounding_box": [10.0, 10.0, 150.0, 40.0],
236
- "crop_path": "signature_crops/example/sig_01_patient.docx"
236
+ "crop_path": "signature_crops/example/sig_01_patient.png",
237
+ "crop_docx_path": null
237
238
  },
238
239
  {
239
240
  "page": null,
@@ -259,8 +260,10 @@ High-level summary (per file):
259
260
  - **`roles`** summarizes unique non-`unknown` roles across signatures.
260
261
  - In retainer profile, emitter prefers two signatures (client + firm), often on the same page.
261
262
  - **`signatures[].bounding_box`** reports the widget rectangle in PDF points (origin bottom-left).
262
- - **`signatures[].crop_path`** is populated when DOCX crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
263
+ - **`signatures[].crop_path`** is populated when PNG crops are generated (via CLI `--crop-signatures` or `CropSignatureImages`).
264
+ - **`signatures[].crop_docx_path`** is populated when DOCX crops are generated (`--crop-docx` or `docx=True`).
263
265
  - **`signatures[].crop_bytes`** contains base64 PNG data when CLI `--crop-bytes` is enabled.
266
+ - **`signatures[].crop_docx_bytes`** contains base64 DOCX data when `--crop-docx` and `--crop-bytes` are enabled together.
264
267
 
265
268
  ---
266
269
 
@@ -287,7 +290,8 @@ write_results: false
287
290
  pseudo_signatures: true
288
291
  recurse_xobjects: true
289
292
  profile: retainer # or: hipaa
290
- crop_signatures: false # enable to write DOCX crops (requires pymupdf + python-docx)
293
+ crop_signatures: false # enable to write PNG crops (requires pymupdf)
294
+ crop_docx: false # enable to write DOCX crops instead of PNGs (requires python-docx)
291
295
  # crop_output_dir: ./signature_crops
292
296
  crop_image_dpi: 200
293
297
  detect_wet_signatures: false # kept for compatibility; non-e-sign PDFs still trigger OCR
@@ -1,12 +1,12 @@
1
1
  sigdetect/__init__.py,sha256=YvnTwlC1jfq83EhQS_1JjiiHK7_wJCCU1JvHv5E1qWY,573
2
- sigdetect/api.py,sha256=uaU7JbSGpyViiXrrHu-iuifIi8xIes3PGeBZkoLNlPg,10800
3
- sigdetect/cli.py,sha256=d5AznKwQPvYKVzC8RCBDgC9SlB4Goz1_pB2_EFzrsTg,10349
4
- sigdetect/config.py,sha256=rJdlu9pM4aqeoY7Ha5qocPmZ7_UeVOOFepBlqOne2b8,7873
5
- sigdetect/cropping.py,sha256=UeKL6dBY18V1E2DoLSbGjTzdGnjhz2WKPi3l3Q0Brh8,8516
2
+ sigdetect/api.py,sha256=hDfa6z4SoHth1Dw9HDfSPiytMQrqu_oyBZlXBwSh9g4,11010
3
+ sigdetect/cli.py,sha256=X5GqZ-PK67vz4OHN5r7h-V0hO886ZblUiUdKDuFowtU,10930
4
+ sigdetect/config.py,sha256=3SP1rkcWBGXloCDFomBJRMRKZOvXuHQbhIBqpVrzYmY,8365
5
+ sigdetect/cropping.py,sha256=IyiBfIEHBLvOv8t_d-O51BfpljTFpE-dG_RxDxJAzAo,16339
6
6
  sigdetect/eda.py,sha256=S92G1Gjmepri__D0n_V6foq0lQgH-RXI9anW8A58jfw,4681
7
7
  sigdetect/logging_setup.py,sha256=LMF8ao_a-JwH0S522T6aYTFX3e8Ajjv_5ODS2YiBcHA,6404
8
8
  sigdetect/utils.py,sha256=T9rubLf5T9JmjOHYMOba1j34fhOJaWocAXccnGTxRUE,5198
9
- sigdetect/wet_detection.py,sha256=zvi11XUmm_xLZ4BLvxInwMQg8YLcyQzEYAM9QSdJOIs,18259
9
+ sigdetect/wet_detection.py,sha256=ofKijykm4fKrvFaVkEkPPKL9iKeRNvlAiKkD2vHxD8k,20025
10
10
  sigdetect/data/role_rules.retainer.yml,sha256=IFdwKnDBXR2cTkdfrsZ6ku6CXD8S_dg5A3vKRKLW5h8,2532
11
11
  sigdetect/data/role_rules.yml,sha256=HuLKsZR_A6sD9XvY4NHiY_VG3dS5ERNCBF9-Mxawomw,2751
12
12
  sigdetect/data/vendor_patterns.yml,sha256=NRbZNQxcx_GuL6n1jAphBn6MM6ChCpeWGCsjbRx-PEo,384
@@ -16,9 +16,9 @@ sigdetect/detector/base_detector.py,sha256=GmAgUWO_fQgIfnihZSoyhR3wpnwZ-X3hS0Kuy
16
16
  sigdetect/detector/file_result_model.py,sha256=j2gTc9Sw3fJOHlexYsR_m5DiwHA8DzIzAMToESfvo4A,1767
17
17
  sigdetect/detector/pymupdf_engine.py,sha256=N6oxvUa-48VvvhjbMk0R0kfScsggNKS7u5FLSeBRfWw,17358
18
18
  sigdetect/detector/pypdf2_engine.py,sha256=kB8cIp_gMvCla0LIBi9sd19g0361Oc9TjCW_ZViUBJQ,47410
19
- sigdetect/detector/signature_model.py,sha256=0SEUc34wvOvrzy_fDzzD42A9LsSzIOeZ4rERPDHimsA,1149
20
- sigdetect-0.5.0.dist-info/METADATA,sha256=-Jgo6JZwWA18uqhjBv2mqZc43y9KHLfpMoPec7ObGow,13628
21
- sigdetect-0.5.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
22
- sigdetect-0.5.0.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
23
- sigdetect-0.5.0.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
24
- sigdetect-0.5.0.dist-info/RECORD,,
19
+ sigdetect/detector/signature_model.py,sha256=T2Hmfkfz_hZsDzwOhepxfNmkedxQp3_XHdrP8yGKoCk,1322
20
+ sigdetect-0.5.2.dist-info/METADATA,sha256=jLin7USVPqeA5tS7KCuPRRt1PLwdt-oJWhWuKSQa6hE,14131
21
+ sigdetect-0.5.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
22
+ sigdetect-0.5.2.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
23
+ sigdetect-0.5.2.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
24
+ sigdetect-0.5.2.dist-info/RECORD,,