sigdetect 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,34 @@
1
+ """Signature model returned by detection engines."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Any
7
+
8
+
9
+ @dataclass(slots=True)
10
+ class Signature:
11
+ """Metadata describing a detected signature field."""
12
+
13
+ Page: int | None
14
+ FieldName: str
15
+ Role: str
16
+ Score: int
17
+ Scores: dict[str, int]
18
+ Evidence: list[str]
19
+ Hint: str
20
+ RenderType: str = "unknown"
21
+
22
+ def to_dict(self) -> dict[str, Any]:
23
+ """Return the legacy snake_case representation used in JSON payloads."""
24
+
25
+ return {
26
+ "page": self.Page,
27
+ "field_name": self.FieldName,
28
+ "role": self.Role,
29
+ "score": self.Score,
30
+ "scores": self.Scores,
31
+ "evidence": list(self.Evidence),
32
+ "hint": self.Hint,
33
+ "render_type": self.RenderType,
34
+ }
sigdetect/eda.py ADDED
@@ -0,0 +1,137 @@
1
+ """Exploratory data analysis helpers for signature detection output."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import statistics
7
+ from collections import Counter
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ from rich.console import Console
12
+ from rich.table import Table
13
+
14
+ from .config import DetectConfiguration
15
+
16
# Module-level Rich console shared by the functions below that print output.
ConsoleInstance = Console()
17
+
18
+
19
+ def _SafeNumber(value: Any, defaultValue: float | None = None) -> float | None:
20
+ """Attempt to coerce ``value`` to ``float`` while tolerating bad input."""
21
+
22
+ try:
23
+ return float(value)
24
+ except Exception:
25
+ return defaultValue
26
+
27
+
28
+ def _FormatSizeStatistics(sizeValues: list[float]) -> str:
29
+ """Return a ``min / median / max`` summary for ``sizeValues``."""
30
+
31
+ if not sizeValues:
32
+ return "—"
33
+ sortedValues = sorted(value for value in sizeValues if value is not None)
34
+ if not sortedValues:
35
+ return "—"
36
+ minimum = int(round(sortedValues[0]))
37
+ median = int(round(statistics.median(sortedValues)))
38
+ maximum = int(round(sortedValues[-1]))
39
+ return f"{minimum} / {median} / {maximum}"
40
+
41
+
42
def _LoadResults(resultsPath: Path) -> list[dict[str, Any]]:
    """Load ``results.json`` from disk and guard against malformed content.

    Returns the rows of the top-level JSON list that are objects.  An empty
    list is returned (after printing a console message) when the file is
    missing, cannot be read or parsed, or does not hold a list.
    """

    if not resultsPath.exists():
        ConsoleInstance.print(f"[yellow]No results.json found at {resultsPath}[/yellow]")
        return []
    try:
        # Decode as UTF-8 explicitly so the result does not depend on the
        # platform's locale encoding.
        data = json.loads(resultsPath.read_text(encoding="utf-8"))
    except Exception as exc:
        ConsoleInstance.print(f"[red]Failed to read {resultsPath}: {exc}[/red]")
        return []
    if not isinstance(data, list):
        ConsoleInstance.print(f"[red]results.json is not a list: {type(data)}[/red]")
        return []

    # Callers use ``row.get(...)``; anything that is not a JSON object would
    # crash them, so such entries are dropped here and reported once.
    rows = [row for row in data if isinstance(row, dict)]
    skippedCount = len(data) - len(rows)
    if skippedCount:
        ConsoleInstance.print(
            f"[yellow]Ignored {skippedCount} non-object entries in {resultsPath}[/yellow]"
        )
    return rows
57
+
58
+
59
+ def _FlattenSignatures(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
60
+ """Collate signature dictionaries found within ``rows``."""
61
+
62
+ signatures: list[dict[str, Any]] = []
63
+ for row in rows:
64
+ for signature in row.get("signatures") or []:
65
+ if isinstance(signature, dict):
66
+ signatures.append(signature)
67
+ return signatures
68
+
69
+
70
def RunExploratoryAnalysis(configuration: DetectConfiguration) -> None:
    """Print a compact console summary of the detection output for ``configuration``.

    ``results.json`` is looked up in ``configuration.OutputDirectory`` when that
    is set, otherwise in ``configuration.PdfRoot``.  A one-row table of document
    counts and size statistics is printed first, followed by a per-role tally of
    every signature found in the rows.
    """

    baseDirectory = configuration.OutputDirectory or configuration.PdfRoot
    records = _LoadResults(baseDirectory / "results.json")
    if not records:
        ConsoleInstance.print("[yellow]No results to summarize.[/yellow]")
        return

    def _CountFlagged(flagName: str) -> int:
        """Count the records whose ``flagName`` entry is truthy."""
        return sum(1 for record in records if bool(record.get(flagName)))

    documentTotal = len(records)
    esignTotal = _CountFlagged("esign_found")
    # Every document not flagged as e-signed is reported in the "Wet" column.
    wetTotal = documentTotal - esignTotal
    scannedTotal = _CountFlagged("scanned_pdf")
    mixedTotal = _CountFlagged("mixed")
    sizes = [
        size
        for record in records
        if (size := _SafeNumber(record.get("size_kb"))) is not None
    ]

    summary = Table(show_header=True, header_style="bold")
    for heading, alignment in (
        ("Total", "right"),
        ("E-sign", "right"),
        ("Wet", "right"),
        ("Scans", "right"),
        ("Mixed", "right"),
        ("Size KB (min/med/max)", "left"),
    ):
        summary.add_column(heading, justify=alignment)
    summary.add_row(
        *(
            str(count)
            for count in (documentTotal, esignTotal, wetTotal, scannedTotal, mixedTotal)
        ),
        _FormatSizeStatistics(sizes),
    )
    ConsoleInstance.print(summary)

    allSignatures = _FlattenSignatures(records)
    # A missing or empty role is tallied under "unknown".
    tally = Counter((entry.get("role") or "unknown") for entry in allSignatures)
    if not allSignatures:
        ConsoleInstance.print("\n[dim]No signatures found to break down by role.[/dim]\n")
        return

    ConsoleInstance.print("\nSignature roles (per-signature) — including unknown:")
    preferredOrder = (
        "patient",
        "representative",
        "client",
        "firm",
        "attorney",
        "unknown",
    )
    # Known roles keep their fixed order; any other role follows, sorted.
    leadingRoles = [role for role in preferredOrder if role in tally]
    trailingRoles = [role for role in sorted(tally) if role not in preferredOrder]

    lines = [f" • {role:<13} — {tally[role]}" for role in leadingRoles + trailingRoles]
    ConsoleInstance.print("\n".join(lines))
    ConsoleInstance.print(f"(total signatures tallied: {sum(tally.values())})\n")
@@ -0,0 +1,218 @@
1
+ """Logging helpers configured for the CaseWorks standards."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ import os
8
+ from logging.handlers import RotatingFileHandler
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ from rich.logging import RichHandler
13
+
14
+ _LEVEL_MAP = {
15
+ "CRITICAL": logging.CRITICAL,
16
+ "ERROR": logging.ERROR,
17
+ "WARNING": logging.WARNING,
18
+ "INFO": logging.INFO,
19
+ "DEBUG": logging.DEBUG,
20
+ "NOTSET": logging.NOTSET,
21
+ }
22
+
23
+
24
+ def _CoerceLevel(levelValue: str | int | None) -> int:
25
+ """Translate the provided logging level into a numeric value."""
26
+
27
+ if isinstance(levelValue, int):
28
+ return levelValue
29
+ if isinstance(levelValue, str):
30
+ return _LEVEL_MAP.get(levelValue.upper(), logging.INFO)
31
+ return logging.INFO
32
+
33
+
34
class JsonFormatter(logging.Formatter):
    """Minimal JSON formatter with deterministic keys.

    Each record becomes one JSON object with a fixed set of core keys
    (``time``, ``level``, ``name``, ``message``, ``module``, ``func``,
    ``line``, ``process``, ``thread``).  Attributes attached through
    ``extra=`` are added when they do not clash with a core key, and a
    formatted traceback is stored under ``exc_info`` when one is present.
    """

    # Used by ``formatTime`` when the formatter was created without ``datefmt``.
    default_time_format = "%Y-%m-%dT%H:%M:%S"
    default_msec_format = "%s.%03d"

    # Attributes the logging machinery itself puts on a record; they are never
    # copied as extras.  ``asctime`` is set by other formatters that handle the
    # same record and ``taskName`` exists on Python 3.12+, so both are listed to
    # keep the key set independent of handler order and interpreter version.
    _RESERVED_KEYS = frozenset(
        {
            "name",
            "msg",
            "args",
            "levelname",
            "levelno",
            "pathname",
            "filename",
            "module",
            "exc_info",
            "exc_text",
            "stack_info",
            "lineno",
            "funcName",
            "created",
            "msecs",
            "relativeCreated",
            "thread",
            "threadName",
            "process",
            "processName",
            "message",
            "asctime",
            "taskName",
        }
    )

    def format(self, record: logging.LogRecord) -> str:  # noqa: D401 (formatter contract)
        """Serialise ``record`` to a single-line JSON string."""

        payload: dict[str, Any] = {
            # Passing ``self.datefmt`` (``None`` by default) lets ``formatTime``
            # apply ``default_time_format`` *and* ``default_msec_format``;
            # passing the time format explicitly would drop the milliseconds.
            "time": self.formatTime(record, self.datefmt),
            "level": record.levelname,
            "name": record.name,
            "message": record.getMessage(),
            "module": record.module,
            "func": record.funcName,
            "line": record.lineno,
            "process": record.process,
            "thread": record.threadName,
        }

        for key, value in record.__dict__.items():
            if key.startswith("_") or key in self._RESERVED_KEYS:
                continue
            # ``setdefault`` keeps the core keys authoritative over extras.
            payload.setdefault(key, value)

        if record.exc_info:
            payload["exc_info"] = self.formatException(record.exc_info)

        # ``default=str`` is deliberate: an extra that is not JSON-serialisable
        # must not make the handler fail and lose the whole record.
        return json.dumps(payload, ensure_ascii=False, default=str)
86
+
87
+
88
def _CreateRichHandler(levelValue: int) -> RichHandler:
    """Build the Rich console handler used for styled terminal output."""

    handlerOptions: dict[str, Any] = {
        "level": levelValue,
        "markup": True,
        "rich_tracebacks": True,
        "show_time": True,
        "show_path": False,
        "log_time_format": "[%Y-%m-%d %H:%M:%S]",
    }
    return RichHandler(**handlerOptions)
99
+
100
+
101
+ def _CreateFileHandler(
102
+ logfile: Path, levelValue: int, jsonFormat: bool, maxBytes: int, backupCount: int
103
+ ) -> RotatingFileHandler:
104
+ """Create a rotating file handler that optionally emits JSON."""
105
+
106
+ logfile.parent.mkdir(parents=True, exist_ok=True)
107
+ handler = RotatingFileHandler(
108
+ filename=str(logfile),
109
+ maxBytes=maxBytes,
110
+ backupCount=backupCount,
111
+ encoding="utf-8",
112
+ )
113
+ handler.setLevel(levelValue)
114
+ if jsonFormat:
115
+ handler.setFormatter(JsonFormatter())
116
+ else:
117
+ handler.setFormatter(
118
+ logging.Formatter(
119
+ fmt="%(asctime)s %(levelname)s %(name)s:%(lineno)d | %(message)s",
120
+ datefmt="%Y-%m-%d %H:%M:%S",
121
+ )
122
+ )
123
+ return handler
124
+
125
+
126
def ConfigureLogging(
    levelValue: str | int | None = None,
    *,
    logfile: str | Path | None = None,
    jsonLogs: bool | None = None,
    useRich: bool | None = None,
    maxBytes: int | None = None,
    backupCount: int | None = None,
    loggerName: str = "sigdetect",
) -> logging.Logger:
    """Initialise logging with precedence ``arguments > env vars > defaults``.

    Environment variables consulted when the matching argument is not given:
    ``SIGDETECT_LOG_LEVEL``, ``SIGDETECT_LOG_FILE``, ``SIGDETECT_LOG_JSON``,
    ``SIGDETECT_LOG_RICH``, ``SIGDETECT_LOG_MAX_BYTES`` and
    ``SIGDETECT_LOG_BACKUPS``.

    A logger that this function has already configured is returned unchanged;
    the arguments of later calls are ignored.

    Raises:
        ValueError: if ``SIGDETECT_LOG_MAX_BYTES`` or ``SIGDETECT_LOG_BACKUPS``
            is consulted and is not an integer.
    """

    logger = logging.getLogger(loggerName)
    # Checked before anything else so repeat calls neither re-read nor
    # re-validate the environment.
    if getattr(logger, "_configured", False):
        return logger

    truthyTokens = {"1", "true", "yes"}

    # Only ``None``/empty counts as "not given": an explicit ``0``
    # (``logging.NOTSET``) must win over the environment variable.
    requestedLevel = (
        levelValue if levelValue not in (None, "") else os.getenv("SIGDETECT_LOG_LEVEL")
    )
    resolvedLevel = _CoerceLevel(requestedLevel)

    requestedLogfile = logfile if logfile is not None else os.getenv("SIGDETECT_LOG_FILE")
    resolvedLogfile = Path(requestedLogfile) if requestedLogfile else None

    resolvedJson = (
        jsonLogs
        if jsonLogs is not None
        else os.getenv("SIGDETECT_LOG_JSON", "false").lower() in truthyTokens
    )
    resolvedRich = (
        useRich
        if useRich is not None
        else os.getenv("SIGDETECT_LOG_RICH", "true").lower() in truthyTokens
    )
    resolvedMaxBytes = (
        maxBytes if maxBytes is not None else int(os.getenv("SIGDETECT_LOG_MAX_BYTES", "1048576"))
    )
    resolvedBackups = (
        backupCount if backupCount is not None else int(os.getenv("SIGDETECT_LOG_BACKUPS", "5"))
    )

    logger.setLevel(resolvedLevel)
    # Handlers are attached to this logger directly; do not also hand the
    # records to ancestor loggers.
    logger.propagate = False

    handlers: list[logging.Handler] = []

    if resolvedRich:
        handlers.append(_CreateRichHandler(resolvedLevel))
    else:
        streamHandler = logging.StreamHandler()
        streamHandler.setLevel(resolvedLevel)
        streamHandler.setFormatter(
            logging.Formatter(
                fmt="%(asctime)s %(levelname)s %(name)s:%(lineno)d | %(message)s",
                datefmt="%H:%M:%S",
            )
        )
        handlers.append(streamHandler)

    if resolvedLogfile:
        handlers.append(
            _CreateFileHandler(
                resolvedLogfile, resolvedLevel, resolvedJson, resolvedMaxBytes, resolvedBackups
            )
        )

    for handler in handlers:
        logger.addHandler(handler)

    # Marker read by the guard at the top of this function.
    logger._configured = True  # type: ignore[attr-defined]
    logger.debug(
        "Logging initialized",
        extra={
            "level": resolvedLevel,
            "logfile": str(resolvedLogfile) if resolvedLogfile else None,
            "json_logs": resolvedJson,
            "use_rich": resolvedRich,
        },
    )
    return logger
204
+
205
+
206
def SetVerbosity(logger: logging.Logger, *, verbose: bool = False, quiet: bool = False) -> None:
    """Apply a verbosity level to ``logger`` and every handler attached to it.

    ``quiet`` selects WARNING and takes precedence over ``verbose``, which
    selects DEBUG; with neither flag the level is INFO.
    """

    chosenLevel = logging.WARNING if quiet else (logging.DEBUG if verbose else logging.INFO)

    logger.setLevel(chosenLevel)
    for attached in logger.handlers:
        attached.setLevel(chosenLevel)
sigdetect/utils.py ADDED
@@ -0,0 +1,152 @@
1
+ """Utility helpers shared across detectors."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from collections.abc import Iterator
7
+ from contextlib import suppress
8
+ from importlib import resources
9
+ from typing import Any, Pattern
10
+
11
+ import yaml
12
+ from pypdf import generic
13
+
14
# Package whose bundled resources hold the YAML pattern files read by LoadPatterns.
_PACKAGE_NAME = "sigdetect.data"
# Resource name of the vendor pattern file inside that package.
_VENDOR_FILE = "vendor_patterns.yml"
16
+
17
+
18
def LoadPatterns(profileName: str | None = None) -> dict[str, Any]:
    """Return the role patterns for ``profileName`` merged with the vendor patterns.

    The profile-specific ``role_rules.<profile>.yml`` is preferred when
    ``profileName`` is given and the resource exists; otherwise
    ``role_rules.yml`` is used.  The vendor file's ``bytes`` and ``text``
    entries are filled in only where the role file does not define them.
    """

    def _ReadYaml(resourceName: str) -> Any:
        """Parse one packaged YAML resource; an empty document yields ``{}``."""
        with resources.files(_PACKAGE_NAME).joinpath(resourceName).open("rb") as stream:
            return yaml.safe_load(stream) or {}

    candidateFiles = [f"role_rules.{profileName}.yml"] if profileName else []
    candidateFiles.append("role_rules.yml")

    merged: dict[str, Any] = {}
    for resourceName in candidateFiles:
        try:
            merged = _ReadYaml(resourceName)
        except FileNotFoundError:
            continue
        # First candidate that exists wins.
        break

    vendorPatterns = _ReadYaml(_VENDOR_FILE)
    for sectionName in ("bytes", "text"):
        merged.setdefault(sectionName, vendorPatterns.get(sectionName))
    return merged
41
+
42
+
43
def NormalizeText(value: str) -> str:
    """Collapse every whitespace run to one space and trim both ends.

    A falsy ``value`` (``None`` or ``""``) yields the empty string.
    """

    rawText = value if value else ""
    collapsed = re.sub(r"\s+", " ", rawText)
    return collapsed.strip()
47
+
48
+
49
def AsDictionary(candidate: Any) -> Any:
    """Resolve a pypdf indirect object to the object it refers to.

    Anything that is not an ``IndirectObject`` is returned as-is, and so is an
    indirect object whose resolution raises.
    """

    if not isinstance(candidate, generic.IndirectObject):
        return candidate
    try:
        return candidate.get_object()
    except Exception:
        # Best effort: an unresolvable reference is handed back untouched.
        return candidate
56
+
57
+
58
def IterateWidgets(candidate: Any) -> Iterator[Any]:
    """Yield every dictionary object reachable from ``candidate``.

    Indirect objects are resolved and arrays are walked recursively; ``None``
    and any other kind of value produce nothing.
    """

    if candidate is None:
        return

    if isinstance(candidate, generic.IndirectObject):
        yield from IterateWidgets(candidate.get_object())
        return

    if isinstance(candidate, generic.ArrayObject):
        for element in candidate:
            yield from IterateWidgets(element)
        return

    if isinstance(candidate, generic.DictionaryObject):
        yield candidate
70
+
71
+
72
def HasSignatureFieldInAncestry(candidate: Any, maxHops: int = 12) -> bool:
    """Report whether ``candidate`` or one of its ``/Parent`` ancestors has ``/FT`` equal to ``/Sig``.

    The starting dictionary plus at most ``maxHops`` parents are inspected.
    """

    node = AsDictionary(candidate)
    hopsLeft = maxHops
    while isinstance(node, generic.DictionaryObject) and hopsLeft >= 0:
        if node.get("/FT") == "/Sig":
            return True
        node = AsDictionary(node.get("/Parent"))
        hopsLeft -= 1
    return False
83
+
84
+
85
def HasSignatureValue(candidate: Any) -> bool:
    """Report whether the widget, or its direct parent, holds a signature value.

    A ``/V`` dictionary counts as a signature value when its ``/Type`` is
    ``/Sig`` or when it has a truthy ``/SubFilter`` or ``/Filter`` entry.
    """

    def _HoldsSignature(holder: Any) -> bool:
        """Apply the ``/V`` test to one (already resolved) dictionary."""
        if not isinstance(holder, generic.DictionaryObject):
            return False
        value = AsDictionary(holder.get("/V"))
        if not isinstance(value, generic.DictionaryObject):
            return False
        return bool(
            value.get("/Type") == "/Sig" or value.get("/SubFilter") or value.get("/Filter")
        )

    widget = AsDictionary(candidate)
    if not isinstance(widget, generic.DictionaryObject):
        return False

    # ``or`` keeps the parent lookup lazy: it happens only when the widget
    # itself has no signature value.
    return _HoldsSignature(widget) or _HoldsSignature(AsDictionary(widget.get("/Parent")))
112
+
113
+
114
def GetFieldNameFromAncestry(candidate: Any, maxHops: int = 12) -> str | None:
    """Return the nearest truthy ``/T`` entry as a string, walking up ``/Parent`` links.

    The starting dictionary plus at most ``maxHops`` parents are inspected.
    ``None`` is returned when no name is found or when the name cannot be
    converted to ``str``.
    """

    node = AsDictionary(candidate)
    hopsLeft = maxHops
    while isinstance(node, generic.DictionaryObject) and hopsLeft >= 0:
        rawName = node.get("/T")
        if not rawName:
            node = AsDictionary(node.get("/Parent"))
            hopsLeft -= 1
            continue
        # The first name found ends the search, even if it cannot be stringified.
        try:
            return str(rawName)
        except Exception:
            return None
    return None
129
+
130
+
131
def RolesFromLabels(text: str, labelPatterns: dict[str, Pattern[str]]) -> set[str]:
    """Return the roles whose label pattern is found in the normalized ``text``."""

    haystack = NormalizeText(text)
    matchedRoles: set[str] = set()
    for roleName, labelPattern in labelPatterns.items():
        if labelPattern.search(haystack):
            matchedRoles.add(roleName)
    return matchedRoles
136
+
137
+
138
def RolesFromGeneral(text: str, generalPatterns: dict[str, Pattern[str]]) -> set[str]:
    """Return the roles whose free-form pattern is found in the normalized ``text``."""

    haystack = NormalizeText(text)
    matchedRoles: set[str] = set()
    for roleName, generalPattern in generalPatterns.items():
        if generalPattern.search(haystack):
            matchedRoles.add(roleName)
    return matchedRoles
143
+
144
+
145
def ChooseRole(scores: dict[str, int]) -> str:
    """Pick the single highest-scoring role from ``scores``.

    ``"unknown"`` is returned when ``scores`` is empty, when the best score is
    not positive, or when several roles share the best score.
    """

    if not scores:
        return "unknown"

    bestScore = max(scores.values())
    if not bestScore > 0:
        return "unknown"

    leaders = [roleName for roleName, points in scores.items() if points == bestScore]
    if len(leaders) != 1:
        return "unknown"
    return leaders[0]