scrufflehog 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ """scrufflehog — deterministically verify that your redactors actually redact.
2
+
3
+ Everyone scans for secrets that leaked. scrufflehog tests whether the redaction
4
+ you rely on actually works: it runs adversarial probes through your own redactor
5
+ and asserts the secret is gone and not trivially reversible, and checks your
6
+ field denylist/allow-list covers the sensitive names you think it does.
7
+ """
8
+ from .oracles import Defect, assert_output, reversible
9
+ from .probes import Probe, get_probe_set
10
+ from .engine import RunResult, run
11
+
12
+ __version__ = "0.1.0"
13
+ __all__ = ["Defect", "assert_output", "reversible", "Probe", "get_probe_set",
14
+ "RunResult", "run"]
scrufflehog/advisor.py ADDED
@@ -0,0 +1,39 @@
1
+ """Optional advisor interface — the ONLY place non-determinism may enter.
2
+
3
+ An advisor proposes INPUTS and HYPOTHESES; it never renders a verdict. The
4
+ deterministic oracle still decides every defect. The default NoopAdvisor makes
5
+ the engine byte-identical to a pure deterministic run — the agentic layer is
6
+ strictly additive and opt-in. See docs/AGENTIC.md.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ from pathlib import Path
11
+ from typing import Protocol, runtime_checkable
12
+
13
+ from .probes import Probe
14
+
15
+
16
class CoverageVerdict:
    """String constants an advisor returns for a suspected coverage gap."""

    # A real field with this name reaches the redactor.
    CONFIRMED = "confirmed"
    # No such field is logged here — drop the finding.
    REFUTED = "refuted"
    # Can't tell — the finding stands as a hypothesis.
    UNCONFIRMED = "unconfirmed"
20
+
21
+
22
@runtime_checkable
class Advisor(Protocol):
    """Structural interface for the optional, non-deterministic advisor.

    Any object exposing these three methods satisfies the protocol; because
    it is ``runtime_checkable``, ``isinstance(obj, Advisor)`` checks method
    presence only, not signatures.
    """

    def propose_probes(self, redactor_src: str, entry: dict) -> list[Probe]:
        """Return extra probes for the redactor described by ``entry``."""
        ...

    def discover_redactors(self, target: Path) -> list[dict]:
        """Return candidate config entries found under ``target``."""
        ...

    def confirm_coverage_gap(self, target: Path, field: str, redactor: str) -> str:
        """Return one of the ``CoverageVerdict`` strings for ``field``."""
        ...
27
+
28
+
29
class NoopAdvisor:
    """Default advisor: contributes nothing, so the run stays deterministic."""

    def propose_probes(self, redactor_src: str, entry: dict) -> list[Probe]:
        """Propose no additional probes."""
        no_probes: list[Probe] = []
        return no_probes

    def discover_redactors(self, target: Path) -> list[dict]:
        """Discover no redactors."""
        no_entries: list[dict] = []
        return no_entries

    def confirm_coverage_gap(self, target: Path, field: str, redactor: str) -> str:
        """Leave every coverage gap as an unconfirmed hypothesis."""
        verdict = CoverageVerdict.UNCONFIRMED
        return verdict
@@ -0,0 +1,2 @@
1
+ """Optional advisors. Import the concrete one you want; the core never imports
2
+ these, so the base package stays dependency-free and deterministic."""
@@ -0,0 +1,167 @@
1
+ """LLMAdvisor — the optional agentic layer.
2
+
3
+ Provider-agnostic: you pass a `complete(prompt: str) -> str` callable, so this
4
+ works with any backend (Anthropic, OpenAI, Bedrock, a local model) with no hard
5
+ SDK dependency. The core package never imports this module.
6
+
7
+ Invariant (see docs/AGENTIC.md): the advisor only proposes INPUTS and
8
+ HYPOTHESES. It NEVER renders a verdict — the deterministic oracle still decides
9
+ every defect. So:
10
+ - propose_probes: returns extra probes (planted secrets we control) for the
11
+ engine to run through the redactor + oracle. The model picks realistic
12
+ SHAPES; ground truth (the secret) is still ours.
13
+ - discover_redactors: returns candidate config entries for a human to confirm.
14
+ - confirm_coverage_gap: returns CONFIRMED/REFUTED/UNCONFIRMED by asking the
15
+ model to check real field usage — but a coverage finding only ever
16
+ DOWNGRADES to refuted or stays a hypothesis; the model can't invent one.
17
+
18
+ Every method degrades to the deterministic default (empty / UNCONFIRMED) on any
19
+ error, malformed output, or timeout. A broken advisor can never fail a run or
20
+ manufacture a finding.
21
+ """
22
+ from __future__ import annotations
23
+
24
+ import json
25
+ import re
26
+ from pathlib import Path
27
+ from typing import Callable
28
+
29
+ from ..advisor import CoverageVerdict
30
+ from ..probes import Probe, weak_secret_space
31
+
32
+ CompleteFn = Callable[[str], str]
33
+
34
+ _MAX_SRC = 6000 # cap redactor source sent to the model
35
+ _MAX_PROBES = 6 # cap generated probes per redactor
36
+
37
+
38
def _extract_json(text: str):
    """Pull the first JSON array/object out of a model reply.

    Tolerant of surrounding prose and of ```json fences: when a fenced block
    is present only its contents are searched, otherwise the whole reply is.

    The value that starts EARLIEST in the text wins. Trying arrays before
    objects regardless of position would return the inner list of a reply
    such as ``{"verdict": "refuted", "evidence": ["a.py"]}`` and lose the
    verdict.

    Returns the parsed value, or None when the reply is empty or no opener
    position decodes as JSON.
    """
    if not text:
        return None
    fenced = re.search(r"```(?:json)?\s*(.*?)```", text, re.DOTALL)
    candidate = fenced.group(1) if fenced else text
    decoder = json.JSONDecoder()
    # Try each '[' / '{' in order of appearance. raw_decode stops at the end
    # of the first complete value, so trailing prose is ignored and stray
    # brackets in leading prose are skipped.
    for opener in re.finditer(r"[\[{]", candidate):
        try:
            value, _end = decoder.raw_decode(candidate, opener.start())
        except json.JSONDecodeError:
            continue
        return value
    return None
54
+
55
+
56
class LLMAdvisor:
    """Advisor backed by a caller-supplied ``complete(prompt) -> str`` callable.

    Every method falls back to the deterministic default (``[]`` or
    ``CoverageVerdict.UNCONFIRMED``) when the backend raises or its reply
    cannot be parsed, so a broken backend cannot fail a run.
    """

    def __init__(self, complete: CompleteFn):
        self._complete = complete

    # --- 1. domain-matched probe generation -------------------------------
    def propose_probes(self, redactor_src: str, entry: dict) -> list[Probe]:
        """Ask the model for input SHAPES matching the redactor's domain.

        Args:
            redactor_src: source text of the redactor; only the first
                ``_MAX_SRC`` characters are sent. Empty source returns [].
            entry: the config entry for the redactor (not used here).

        Returns:
            At most ``_MAX_PROBES`` probes. Items that are not objects, lack
            string ``input``/``secret``, or whose secret is not a substring
            of the input are dropped.
        """
        if not redactor_src:
            return []
        prompt = (
            "You are helping test a redaction function. Given its source, return "
            "a JSON array of test INPUTS shaped like the data this redactor is "
            "meant to process (e.g. if it redacts URLs, produce URL strings; if "
            "log field values, produce those). For each, give a JSON object "
            '{"name": str, "input": str, "secret": str} where `secret` is a '
            "substring of `input` that MUST be redacted. Use the literal token "
            "SECRET_MARKER as the sensitive value inside each input so it is "
            "unambiguous. Return ONLY the JSON array.\n\n"
            f"Redactor source:\n```\n{redactor_src[:_MAX_SRC]}\n```")
        try:
            parsed = _extract_json(self._complete(prompt))
        except Exception:  # noqa: BLE001 — backend failure degrades to "no probes"
            return []
        if not isinstance(parsed, list):
            return []
        # Substitute OUR controlled secret for the model's marker so ground
        # truth is ours, not the model's — it only chose the SHAPE.
        controlled = "test1234"
        probes: list[Probe] = []
        for i, item in enumerate(parsed[:_MAX_PROBES]):
            if not isinstance(item, dict):
                continue
            inp = item.get("input")
            secret = item.get("secret")
            if not isinstance(inp, str) or not isinstance(secret, str) or not secret:
                continue
            inp2 = inp.replace("SECRET_MARKER", controlled)
            sec2 = secret.replace("SECRET_MARKER", controlled)
            if sec2 not in inp2:
                continue  # discard shapes where the secret isn't actually present
            # A missing, null or empty name falls back to a positional label
            # instead of becoming the string "None".
            name = item.get("name") or f"llm-probe-{i}"
            probes.append(Probe(
                name=str(name)[:60],
                input=inp2, secret=sec2, secret_space=weak_secret_space()))
        return probes

    # --- 2. redactor discovery (proposals for human/config confirmation) ---
    def discover_redactors(self, target: Path) -> list[dict]:
        """Return no candidates; this is a hook for subclasses.

        Intentionally conservative: discovery needs a repo walk the caller
        opts into, so the base implementation scans nothing.
        """
        return []

    # --- 3. coverage-gap confirmation -------------------------------------
    def confirm_coverage_gap(self, target: Path, field: str, redactor: str) -> str:
        """Ask whether a field named `field` plausibly reaches this redactor.

        Source lines mentioning the field are collected deterministically and
        handed to the model. The list may be empty: absence in source does
        not prove the field is never logged, so the model is still asked.

        Returns:
            One of the ``CoverageVerdict`` strings, taken from the model's
            ``{"verdict": ...}`` reply. Any failure (file walk, backend,
            unparseable or unexpected reply) yields ``UNCONFIRMED``.
        """
        try:
            hits = self._grep_field(target, field)
        except Exception:  # noqa: BLE001
            return CoverageVerdict.UNCONFIRMED
        prompt = (
            "A log/PII redactor does NOT redact a field named "
            f"'{field}'. Here are code lines mentioning it (may be empty):\n"
            + "\n".join(hits[:20]) +
            "\n\nBased ONLY on this, is a field literally named "
            f"'{field}' plausibly present in data this redactor processes? "
            'Answer with JSON {"verdict": "confirmed"|"refuted"|"unconfirmed"}. '
            "Use 'refuted' only if the name clearly does not occur as a data "
            "field here.")
        try:
            parsed = _extract_json(self._complete(prompt))
        except Exception:  # noqa: BLE001
            return CoverageVerdict.UNCONFIRMED
        if isinstance(parsed, dict):
            v = str(parsed.get("verdict", "")).lower()
            if v in (CoverageVerdict.CONFIRMED, CoverageVerdict.REFUTED,
                     CoverageVerdict.UNCONFIRMED):
                return v
        return CoverageVerdict.UNCONFIRMED

    @staticmethod
    def _grep_field(target: Path, field: str) -> list[str]:
        """Collect up to 50 source lines under `target` that mention `field`.

        Matching is a case-insensitive literal search. Only files with a
        known source extension are read. Vendored/build directories are
        pruned by name BELOW `target` only, so a checkout that itself lives
        under e.g. ``build/`` or ``target/`` is still scanned. Directories
        and files are visited in sorted order so the selected lines do not
        depend on filesystem listing order.

        Returns:
            Strings of the form ``"<file name>: <stripped line, max 160 chars>"``.
        """
        import os  # function-scope: only this helper needs it

        out: list[str] = []
        skip = {"node_modules", ".git", "dist", "build", "target", ".venv", "vendor"}
        exts = {".go", ".py", ".ts", ".js", ".rs", ".java", ".rb", ".proto"}
        limit = 50
        pat = re.compile(re.escape(field), re.IGNORECASE)
        for root, dirnames, filenames in os.walk(target):
            # Prune in place so os.walk never descends into skipped trees.
            dirnames[:] = sorted(d for d in dirnames if d not in skip)
            for fname in sorted(filenames):
                p = Path(root, fname)
                if p.suffix not in exts or not p.is_file():
                    continue
                try:
                    text = p.read_text(encoding="utf-8", errors="replace")
                except OSError:
                    continue  # unreadable file: skip it, keep scanning
                for ln in text.splitlines():
                    if pat.search(ln):
                        out.append(f"{p.name}: {ln.strip()[:160]}")
                        if len(out) >= limit:
                            return out
        return out
scrufflehog/cli.py ADDED
@@ -0,0 +1,82 @@
1
+ """scrufflehog CLI — verify a target's redactors against a config."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ from .config import load_config
9
+ from .engine import run
10
+ from .output import FORMATTERS
11
+
12
+
13
def _load_advisor(name: str):
    """Build the advisor selected by ``--advisor``.

    ``"none"`` gives a ``NoopAdvisor``. ``"llm"`` gives an ``LLMAdvisor``
    wrapping the backend named by SCRUFFLEHOG_LLM; when no backend resolves,
    a hint is printed to stderr and the process exits with status 2. Any
    other name exits with an "unknown advisor" message.
    """
    if name not in ("none", "llm"):
        raise SystemExit(f"unknown advisor: {name!r}")

    if name == "none":
        from .advisor import NoopAdvisor
        return NoopAdvisor()

    # Imported lazily so the optional layer is only loaded on request.
    from .advisors.llm import LLMAdvisor
    backend = _resolve_completion_backend()
    if backend is not None:
        return LLMAdvisor(backend)

    print("--advisor llm needs a completion backend. Set SCRUFFLEHOG_LLM "
          "to a 'module:function' path resolving to a complete(prompt)->str "
          "callable (e.g. your Anthropic/OpenAI/Bedrock wrapper).",
          file=sys.stderr)
    raise SystemExit(2)
28
+
29
+
30
def _resolve_completion_backend():
    """Resolve a `complete(prompt)->str` callable from SCRUFFLEHOG_LLM.

    The variable must look like 'package.module:function'. Provider-agnostic:
    the user points it at their own model wrapper, so scrufflehog needs no
    SDK dependency.

    Returns:
        The callable, or None when the variable is unset, has no ':', names
        a module or attribute that cannot be loaded, or names something that
        is not callable. Load failures are reported on stderr.
    """
    import importlib
    import os

    spec = os.environ.get("SCRUFFLEHOG_LLM")
    if not spec or ":" not in spec:
        return None
    mod_name, fn_name = spec.split(":", 1)
    try:
        candidate = getattr(importlib.import_module(mod_name), fn_name)
    except (ImportError, AttributeError, ValueError, TypeError) as e:
        # ValueError: empty module name (":fn"); TypeError: relative module
        # name (".mod:fn"). Both are bad user input, not crashes.
        print(f"could not load SCRUFFLEHOG_LLM={spec!r}: {e}", file=sys.stderr)
        return None
    if not callable(candidate):
        # Reject now instead of failing on the first advisor call.
        print(f"could not load SCRUFFLEHOG_LLM={spec!r}: "
              f"{fn_name!r} is not callable", file=sys.stderr)
        return None
    return candidate
46
+
47
+
48
def main(argv: list[str] | None = None) -> int:
    """Entry point for the ``scrufflehog`` command line.

    Args:
        argv: argument list to parse; None lets argparse read ``sys.argv``.

    Returns:
        0 on a completed run, 1 when ``--fail-on-defect`` is set and defects
        were found, 2 for usage problems (target is not a directory, config
        missing or invalid).
    """
    p = argparse.ArgumentParser(
        prog="scrufflehog",
        description="Deterministically verify that your redactors actually redact.")
    sub = p.add_subparsers(dest="cmd", required=True)

    v = sub.add_parser("verify", help="verify redactors declared in a config")
    v.add_argument("--config", required=True, type=Path)
    v.add_argument("--target", required=True, type=Path,
                   help="repo checkout the config's module paths are relative to")
    v.add_argument("--format", choices=sorted(FORMATTERS), default="text")
    v.add_argument("--advisor", choices=["none", "llm"], default="none",
                   help="optional agentic assist (proposes probes / confirms "
                        "coverage gaps); verdicts stay deterministic")
    v.add_argument("--fail-on-defect", action="store_true",
                   help="exit non-zero if any defect is found (for CI gating)")

    args = p.parse_args(argv)

    if args.cmd == "verify":
        if not args.target.is_dir():
            print(f"target not a directory: {args.target}", file=sys.stderr)
            return 2
        try:
            config = load_config(args.config)
        except (OSError, ValueError) as exc:
            # A missing/unreadable file (OSError) or invalid TOML/entries
            # (ValueError; TOMLDecodeError subclasses it) is a usage error,
            # reported like a bad --target instead of as a traceback.
            print(f"could not load config {args.config}: {exc}", file=sys.stderr)
            return 2
        advisor = _load_advisor(args.advisor)
        result = run(args.target, config, advisor)
        print(FORMATTERS[args.format](result, str(args.target)))
        if args.fail_on_defect and result.defects:
            return 1
        return 0
    return 2
79
+
80
+
81
+ if __name__ == "__main__":
82
+ raise SystemExit(main())
scrufflehog/config.py ADDED
@@ -0,0 +1,55 @@
1
+ """Load a scrufflehog config — the externalised redactor registry.
2
+
3
+ TOML shape (see examples/):
4
+
5
+ [[transform]]
6
+ lang = "python" # python | go | rust | node
7
+ module = "app/redact.py"
8
+ fn = "redact"
9
+ kind = "value" # value | row | tree
10
+ probe_set = "value" # value | url_apikey
11
+ # go extras: import = "...", wrap = "error"
12
+ # rust extras: call = "mycrate::redact(&line)"
13
+ # node extras: export = "default"
14
+
15
+ [[coverage]]
16
+ module = "app/redact.py"
17
+ symbol = "SECRET_KEYS"
18
+ extract = "py_collection" # go_map_keys | py_collection | ts_redact_paths | rust_str_set
19
+ match = "exact_ci" # exact_ci | substring_ci | field_substring_ci
20
+ doc_claims_substring = false
21
+ # corpus = ["ssn", "cvv", ...] # optional; defaults to the built-in list
22
+ """
23
+ from __future__ import annotations
24
+
25
+ from pathlib import Path
26
+ from typing import Any
27
+
28
+ try:
29
+ import tomllib # py3.11+
30
+ except ModuleNotFoundError: # pragma: no cover
31
+ import tomli as tomllib # type: ignore
32
+
33
+
34
def load_config(path: Path) -> dict[str, Any]:
    """Read the TOML config at `path` and return its validated sections.

    Returns:
        ``{"transform": [...], "coverage": [...]}``; a section absent from
        the file becomes an empty list.

    Raises:
        FileNotFoundError: `path` does not exist.
        ValueError: an entry fails ``_validate``.
    """
    if not path.exists():
        raise FileNotFoundError(f"config not found: {path}")
    with open(path, "rb") as handle:
        parsed = tomllib.load(handle)
    sections: dict[str, Any] = {}
    for section in ("transform", "coverage"):
        sections[section] = parsed.get(section, [])
    _validate(sections)
    return sections
43
+
44
+
45
def _validate(config: dict[str, Any]) -> None:
    """Check that every config entry has the keys the engine reads.

    Args:
        config: mapping with "transform" and "coverage" sections.

    Raises:
        ValueError: a section is not a list of tables, a transform entry
            lacks 'lang' or 'module', a non-rust transform has neither 'fn'
            nor export='default', or a coverage entry lacks 'module',
            'symbol' or 'extract'.
    """
    # Shape check first: a `[transform]` table (instead of `[[transform]]`)
    # or a scalar entry would otherwise surface as a TypeError or as a
    # misleading "missing key" message about a bare string.
    for section in ("transform", "coverage"):
        entries = config[section]
        if not isinstance(entries, list):
            raise ValueError(
                f"{section!r} must be an array of tables ([[{section}]]), "
                f"got {type(entries).__name__}")
        for entry in entries:
            if not isinstance(entry, dict):
                raise ValueError(
                    f"{section} entry must be a table, "
                    f"got {type(entry).__name__}: {entry!r}")

    for e in config["transform"]:
        for req in ("lang", "module"):
            if req not in e:
                raise ValueError(f"transform entry missing {req!r}: {e}")
        # Rust entries are exempt from 'fn'; node entries may use
        # export='default' in its place.
        if e["lang"] != "rust" and "fn" not in e and e.get("export") != "default":
            raise ValueError(f"transform entry needs 'fn' (or export='default'): {e}")
    for s in config["coverage"]:
        for req in ("module", "symbol", "extract"):
            if req not in s:
                raise ValueError(f"coverage entry missing {req!r}: {s}")
@@ -0,0 +1,4 @@
1
+ from .extract import extract_key_set, DEFAULT_SENSITIVE_FIELDS
2
+ from .semantics import covered
3
+
4
+ __all__ = ["extract_key_set", "covered", "DEFAULT_SENSITIVE_FIELDS"]
@@ -0,0 +1,104 @@
1
+ """Static extraction of a redactor's field-name denylist / allow-list.
2
+
3
+ A field-name list is DATA, not behaviour — so we can extract it from source and
4
+ check coverage WITHOUT executing the target's language. This is what makes
5
+ coverage mode language-agnostic.
6
+
7
+ Extractors:
8
+ go_map_keys — keys of `var X = map[string]struct{}{ "k": {}, ... }`
9
+ py_collection — string entries of a Python list/tuple/set literal
10
+ ts_redact_paths — leaf field names from a path allow-list
11
+ (e.g. redact.req('body.email') -> email;
12
+ body.message[*].phone_number -> phone_number)
13
+ rust_str_set — string literals in a Rust slice/array/HashSet-from literal
14
+
15
+ Fails LOUD on an empty extraction: an empty set from source that plainly has
16
+ entries means the extractor missed the literal, and silently returning it would
17
+ false-positive "everything is missed" on a good list. A genuinely empty list is
18
+ reportable, but must be proven, not inferred from a parse miss.
19
+ """
20
+ from __future__ import annotations
21
+
22
+ import re
23
+ from pathlib import Path
24
+ from typing import Any
25
+
26
+ # A generic, public-knowledge corpus of field names a PII/secret redactor is
27
+ # reasonably expected to cover. Users can override per-target in config.
28
+ DEFAULT_SENSITIVE_FIELDS = [
29
+ "ssn", "social_security_number", "cvv", "cvc", "card_number", "pan",
30
+ "iban", "swift", "routing_number", "account_number", "phone_number",
31
+ "phone", "mobile", "passport", "passport_number", "drivers_license",
32
+ "dob", "date_of_birth", "email", "email_address", "password", "secret",
33
+ "token", "api_key", "apikey", "authorization", "pin", "security_code",
34
+ "credit_card", "bank_account", "tax_id",
35
+ ]
36
+
37
+ _GO_MAP_KEY_RE = re.compile(r'"([^"\\]+)"\s*:\s*\{\}')
38
+ _STR_RE = re.compile(r'"([^"\\]+)"|\'([^\'\\]+)\'')
39
+
40
+
41
# A bracket group directly followed by a single '=' (not '==' or '=>') is a
# type annotation such as `list[str] =`, `&[&str] =` or `string[] =`.
_ANNOTATION_TAIL_RE = re.compile(r'\s*=(?![=>])')


def _balanced_end(src: str, start: int, symbol: str) -> int:
    """Return the index of the bracket closing the opener at `src[start]`.

    Only the opener's own bracket kind is counted; string and comment
    contents are not interpreted.

    Raises:
        ValueError: the opener is never closed.
    """
    opener = src[start]
    closer = {"{": "}", "[": "]", "(": ")"}[opener]
    depth = 0
    for i in range(start, len(src)):
        ch = src[i]
        if ch == opener:
            depth += 1
        elif ch == closer:
            depth -= 1
            if depth == 0:
                return i
    raise ValueError(f"unbalanced {opener!r} in literal for {symbol!r}")


def _scoped_block(src: str, symbol: str, extract: str) -> str:
    """Return the literal block belonging to `symbol`, brace/bracket-balanced.

    Anchors on a declaration (`symbol =` / `symbol:` / map decl) and falls
    back to the first plain occurrence of `symbol` when no declaration
    matches. Handles Go's `map[string]struct{}{` (first `{` is struct{}'s
    body). For the other extractors, bracket groups that belong to a type
    annotation are skipped so the returned block is the assigned literal.

    Raises:
        ValueError: the symbol is absent, no literal follows it, or the
            literal's brackets never balance.
    """
    m = re.search(rf'\b{re.escape(symbol)}\b\s*(?:=|:?=|:|\bmap\b|\[\])', src)
    idx = m.start() if m else src.find(symbol)
    if idx == -1:
        raise ValueError(f"symbol {symbol!r} not found")

    if extract == "go_map_keys":
        open_re = re.search(r'\}\s*\{', src[idx:])
        if not open_re:
            raise ValueError(f"could not locate map literal opening for {symbol!r}")
        brace = idx + open_re.end() - 1
        return src[brace:_balanced_end(src, brace, symbol) + 1]

    search_from = idx
    while True:
        candidates = [p for p in (src.find("{", search_from),
                                  src.find("[", search_from),
                                  src.find("(", search_from)) if p != -1]
        if not candidates:
            raise ValueError(f"no literal block after {symbol!r}")
        brace = min(candidates)
        end = _balanced_end(src, brace, symbol)
        tail = _ANNOTATION_TAIL_RE.match(src, end + 1)
        if tail is None:
            return src[brace:end + 1]
        # The group was part of the declared type; the real literal starts
        # after the '='. search_from strictly advances, so this terminates.
        search_from = tail.end()
74
+
75
+
76
def extract_key_set(target: Path, spec: dict[str, Any]) -> set[str]:
    """Statically extract the lower-cased field names a redactor lists.

    Args:
        target: checkout root that ``spec["module"]`` is relative to.
        spec: coverage entry; reads "module", "extract" and "symbol".

    Raises:
        FileNotFoundError: the module file does not exist.
        ValueError: unknown extract kind, the literal could not be located,
            or zero keys were extracted (fail-loud, never an empty set).
    """
    module_path = target / spec["module"]
    if not module_path.exists():
        raise FileNotFoundError(f"coverage module not found: {spec['module']}")
    source = module_path.read_text(encoding="utf-8", errors="replace")
    kind = spec["extract"]
    literal = _scoped_block(source, spec["symbol"], kind)

    def _path_leaf(raw: str) -> str:
        # 'body.message[*].phone_number' -> 'phone_number'
        without_index = re.sub(r'\[[^\]]*\]', '', raw)
        last = without_index.rstrip('.').split('.')[-1]
        return last.strip().strip('"\'')

    if kind == "go_map_keys":
        found = {key.lower() for key in _GO_MAP_KEY_RE.findall(literal)}
    elif kind in ("py_collection", "rust_str_set"):
        found = {(dq or sq).lower() for dq, sq in _STR_RE.findall(literal)}
    elif kind == "ts_redact_paths":
        leaves = (_path_leaf(dq or sq) for dq, sq in _STR_RE.findall(literal))
        found = {leaf.lower() for leaf in leaves if leaf}
    else:
        raise ValueError(f"unknown extract kind: {kind!r}")

    if found:
        return found
    raise ValueError(
        f"extracted 0 keys from {spec['symbol']!r} in {spec['module']} — "
        f"the extractor likely failed to locate the literal (fail-loud: "
        f"refusing to report a false 'everything missed').")
@@ -0,0 +1,19 @@
1
+ """Match semantics — how a redactor decides whether a field name is covered.
2
+
3
+ Mirrors the target redactor's own matching so the coverage verdict reflects what
4
+ the redactor actually does, not what we assume.
5
+ """
6
+ from __future__ import annotations
7
+
8
+
9
def covered(field_name: str, keys: set[str], match: str) -> bool:
    """Report whether `field_name` is covered by `keys` under `match`.

    The field name is lower-cased before comparison; `keys` are compared as
    given.

    Modes:
        exact_ci: the name is one of the keys.
        substring_ci: some key occurs inside the name.
        field_substring_ci: the name occurs inside some key.

    Raises:
        ValueError: `match` is not one of the modes above.
    """
    needle = field_name.lower()

    if match == "exact_ci":
        return needle in keys

    if match == "substring_ci":
        # The redactor redacts a field if ANY key is a substring of its name.
        for key in keys:
            if key in needle:
                return True
        return False

    if match == "field_substring_ci":
        # Inverse: redacted if the field name is a substring of any key.
        for key in keys:
            if needle in key:
                return True
        return False

    raise ValueError(f"unknown match semantics: {match!r}")
scrufflehog/engine.py ADDED
@@ -0,0 +1,134 @@
1
+ """Engine — ties runners + coverage + oracles into a verification run.
2
+
3
+ Two families:
4
+ transform-strength — execute each registered redactor on its probe set,
5
+ apply the oracles to the output.
6
+ coverage — statically extract each denylist/allow-list and check a
7
+ sensitive-field corpus against it.
8
+
9
+ The advisor (default no-op) may add probes, discover redactors, or confirm
10
+ coverage gaps — but the deterministic oracle renders every verdict.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ from dataclasses import dataclass, field
15
+ from pathlib import Path
16
+ from typing import Any
17
+
18
+ from .advisor import Advisor, CoverageVerdict, NoopAdvisor
19
+ from .coverage import covered, extract_key_set, DEFAULT_SENSITIVE_FIELDS
20
+ from .oracles import (COVERAGE_GAP, REDACTOR_ERRORED, Defect, assert_output)
21
+ from .probes import Probe, get_probe_set
22
+ from .runners import make_producer
23
+
24
+
25
@dataclass
class RunResult:
    """Accumulated outcome of one verification run."""

    # Defects collected from the transform and coverage checks.
    defects: list[Defect] = field(default_factory=list)
    # One human-readable summary line per check performed.
    notes: list[str] = field(default_factory=list)
29
+
30
+
31
def _transform_probes(target: Path, entry: dict, advisor: Advisor) -> list[Probe]:
    """Return the deterministic probe set plus any advisor-proposed probes.

    The built-in set named by ``entry["probe_set"]`` (default "value") is
    always included. The advisor receives the redactor module's source, or
    "" when the module is missing or unreadable; if the advisor raises, the
    deterministic set is returned unchanged.
    """
    baseline = get_probe_set(entry.get("probe_set", "value"))

    source_text = ""
    module_rel = entry.get("module")
    if module_rel:
        module_path = target / module_rel
        if module_path.exists():
            try:
                source_text = module_path.read_text(encoding="utf-8", errors="replace")
            except OSError:
                source_text = ""

    try:
        return baseline + advisor.propose_probes(source_text, entry)
    except Exception:  # noqa: BLE001 — advisor failure degrades to deterministic
        return baseline
46
+
47
+
48
def verify_transform(target: Path, entry: dict, advisor: Advisor) -> tuple[list[Defect], str]:
    """Run one registered redactor over its probes and collect defects.

    Args:
        target: checkout root the entry's module path is relative to.
        entry: transform config entry; reads "lang", "module", "fn", "kind".
        advisor: may contribute extra probes via ``_transform_probes``.

    Returns:
        ``(defects, note)``. If the producer cannot be prepared, a single
        REDACTOR_ERRORED defect is returned. A probe on which the redactor
        raises is recorded as REDACTOR_ERRORED and the remaining probes
        still run.
    """
    lang = entry.get("lang", "python")
    label = f"{entry.get('module', '?')}:{entry.get('fn', '?')}"
    kind = entry.get("kind", "value")
    try:
        produce = make_producer(target, entry)
    except (FileNotFoundError, AttributeError, RuntimeError, ValueError, ImportError) as e:
        return ([Defect(label, REDACTOR_ERRORED, "-",
                        f"[{lang}] could not prepare redactor: {e}")],
                f"[transform:{lang}] {label}: unprepared")
    defects: list[Defect] = []
    try:
        # Built inside the try so the producer's cleanup hook still runs if
        # probe construction itself raises.
        probes = _transform_probes(target, entry, advisor)
        for p in probes:
            try:
                out = produce(p)
            except Exception as exc:  # noqa: BLE001 — a crashing redactor IS a defect
                defects.append(Defect(label, REDACTOR_ERRORED, p.name,
                                      f"redactor raised {type(exc).__name__}: {exc}"))
                continue
            d = assert_output(out, p, label, kind)
            if d is not None:
                defects.append(d)
    finally:
        # Producers may expose an optional `_cleanup` callable.
        cleanup = getattr(produce, "_cleanup", None)
        if callable(cleanup):
            cleanup()
    return defects, f"[transform:{lang}] {label}: {len(defects)} defect(s)"
76
+
77
+
78
def verify_coverage(target: Path, spec: dict, advisor: Advisor) -> tuple[list[Defect], str]:
    """Check a sensitive-field corpus against one extracted key list.

    Args:
        target: checkout root the spec's module path is relative to.
        spec: coverage entry; reads "module", "symbol", and optionally
            "match" (default "exact_ci"), "corpus" (default
            DEFAULT_SENSITIVE_FIELDS) and "doc_claims_substring".
        advisor: consulted for each uncovered field. A REFUTED verdict drops
            the finding; any other verdict keeps it, and a verdict other
            than UNCONFIRMED is appended to the message as a tag.

    Returns:
        ``(defects, note)``. If extraction fails, a single REDACTOR_ERRORED
        defect is returned.
    """
    label = f"{spec['module']}:{spec['symbol']}"
    try:
        keys = extract_key_set(target, spec)
    except (FileNotFoundError, ValueError) as e:
        unresolved = Defect(label, REDACTOR_ERRORED, "-",
                            f"coverage target did not resolve: {e}")
        return [unresolved], f"[coverage] {label}: unresolved"

    match = spec.get("match", "exact_ci")
    corpus = spec.get("corpus", DEFAULT_SENSITIVE_FIELDS)
    gaps: list[Defect] = []
    for name in corpus:
        if covered(name, keys, match):
            continue
        # Advisor may confirm/refute the hypothesis; default leaves it standing.
        try:
            verdict = advisor.confirm_coverage_gap(target, name, label)
        except Exception:  # noqa: BLE001
            verdict = CoverageVerdict.UNCONFIRMED
        if verdict == CoverageVerdict.REFUTED:
            continue
        if verdict == CoverageVerdict.UNCONFIRMED:
            suffix = ""
        else:
            suffix = f" [{verdict}]"
        message = (
            f"sensitive field {name!r} not caught by the {len(keys)}-key list "
            f"under {match} matching — it is not redacted{suffix}")
        gaps.append(Defect(label, COVERAGE_GAP, name, message))

    # Only reported alongside at least one real gap under exact matching.
    docs_overclaim = spec.get("doc_claims_substring") and match == "exact_ci"
    if docs_overclaim and gaps:
        gaps.append(Defect(
            label, COVERAGE_GAP, "<doc-mismatch>",
            "redactor docs claim partial/substring matching but the behaviour is "
            "exact — fields expected to be covered by substring are NOT"))
    return gaps, f"[coverage] {label}: {len(gaps)} gap(s)"
110
+
111
+
112
def run(target: Path, config: dict[str, Any], advisor: Advisor | None = None) -> RunResult:
    """Run all transform + coverage checks in `config` against `target`.

    config = {"transform": [entry, ...], "coverage": [spec, ...]}

    A falsy `advisor` is replaced by ``NoopAdvisor()``. With both sections
    empty, the result carries a single explanatory note and no defects.
    Transform checks run before coverage checks, each in config order.
    """
    advisor = advisor or NoopAdvisor()
    outcome = RunResult()

    transform_entries = config.get("transform", [])
    coverage_specs = config.get("coverage", [])
    if not (transform_entries or coverage_specs):
        outcome.notes.append("no redactors configured — nothing to verify")
        return outcome

    plan = ((verify_transform, transform_entries),
            (verify_coverage, coverage_specs))
    for check, items in plan:
        for item in items:
            found, note = check(target, item, advisor)
            outcome.defects.extend(found)
            outcome.notes.append(note)
    return outcome