mcp-warden-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. mcp_warden/__init__.py +25 -0
  2. mcp_warden/capture.py +197 -0
  3. mcp_warden/check_core.py +98 -0
  4. mcp_warden/checks.py +163 -0
  5. mcp_warden/checks_secret.py +129 -0
  6. mcp_warden/checks_supply.py +146 -0
  7. mcp_warden/cli.py +367 -0
  8. mcp_warden/cli_diff.py +245 -0
  9. mcp_warden/cli_guard.py +301 -0
  10. mcp_warden/cli_lock.py +124 -0
  11. mcp_warden/cli_sign.py +304 -0
  12. mcp_warden/drift.py +272 -0
  13. mcp_warden/emit_res.py +102 -0
  14. mcp_warden/emitters.py +169 -0
  15. mcp_warden/framing.py +338 -0
  16. mcp_warden/guard.py +397 -0
  17. mcp_warden/guard_banner.py +211 -0
  18. mcp_warden/guard_io.py +70 -0
  19. mcp_warden/guard_lifecycle.py +320 -0
  20. mcp_warden/guard_list_gate.py +95 -0
  21. mcp_warden/guard_loop.py +332 -0
  22. mcp_warden/guard_result.py +249 -0
  23. mcp_warden/guard_strict.py +159 -0
  24. mcp_warden/hashing.py +102 -0
  25. mcp_warden/inspector.py +169 -0
  26. mcp_warden/lockfile.py +299 -0
  27. mcp_warden/models.py +300 -0
  28. mcp_warden/net_rules.py +80 -0
  29. mcp_warden/policy_eval.py +264 -0
  30. mcp_warden/policy_model.py +261 -0
  31. mcp_warden/precommit.py +259 -0
  32. mcp_warden/provenance.py +199 -0
  33. mcp_warden/redact.py +52 -0
  34. mcp_warden/res_catalog.py +198 -0
  35. mcp_warden/res_net.py +227 -0
  36. mcp_warden/res_rules.py +176 -0
  37. mcp_warden/result_inspection.py +201 -0
  38. mcp_warden/schema_diff.py +548 -0
  39. mcp_warden/signing.py +291 -0
  40. mcp_warden/tokenizer.py +199 -0
  41. mcp_warden/wire_block.py +258 -0
  42. mcp_warden_cli-1.0.0.dist-info/METADATA +517 -0
  43. mcp_warden_cli-1.0.0.dist-info/RECORD +46 -0
  44. mcp_warden_cli-1.0.0.dist-info/WHEEL +4 -0
  45. mcp_warden_cli-1.0.0.dist-info/entry_points.txt +3 -0
  46. mcp_warden_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
mcp_warden/__init__.py ADDED
@@ -0,0 +1,25 @@
1
"""mcp-warden — CI-first MCP supply-chain integrity gate.

mcp-warden pins and verifies the *declared* tool/resource/prompt surface of an
MCP server (the ``(name, description, inputSchema)`` metadata returned by
``tools/list`` / ``resources/list`` / ``prompts/list``), then fails CI when that
surface drifts from an approved baseline. It operates on **definitions**, never
on runtime tool behavior or tool results. See ``docs/THREAT_MODEL.md``.
"""

#: Package version string.
__version__ = "1.0.0"
#: Lock schema version. Bumped 2 → 3 for #29 (in-document ``$ref`` resolution in
#: ``schema_diff.extract_skeleton``). Following refs changes the skeleton of any
#: ref-using tool → its ``entry_digest`` and the ``overall_digest`` (which embeds
#: ``schema_version``, lockfile.py:167). The bump makes that digest change a
#: declared schema-format migration rather than a silent surface change; drift.py
#: emits an additive ``schema-version-migrated`` advisory alongside (never in
#: place of) the ``unapproved-change`` finding so re-attestation is required.
SCHEMA_VERSION = 3
#: Provenance-block version (#19). Lives INSIDE the ``pin`` block, OUTSIDE the
#: ``overall_digest`` payload, so it can evolve for #16/#23 without changing any
#: server's digest. Deliberately distinct from ``SCHEMA_VERSION`` (which is in
#: the digest payload — bumping that would falsely trip drift on v2 baselines).
PROVENANCE_VERSION = 1

#: Names exported by ``from mcp_warden import *``.
__all__ = ["__version__", "SCHEMA_VERSION", "PROVENANCE_VERSION"]
mcp_warden/capture.py ADDED
@@ -0,0 +1,197 @@
1
+ """MCP stdio capture client.
2
+
3
+ Spawns the target MCP server **over stdio as an argv array, never via a shell**
4
+ (WARDEN_LOCK_SCHEMA.md §10.4), runs ``initialize`` + ``tools/list`` +
5
+ ``resources/list`` + ``prompts/list``, and captures the declared surface.
6
+
7
+ A server that hangs, crashes, or exits nonzero must produce a clear
8
+ ``CaptureError``, not a traceback.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import logging
14
+ from typing import Any
15
+
16
+ import anyio
17
+ from mcp import ClientSession, StdioServerParameters
18
+ from mcp.client.stdio import stdio_client
19
+
20
+ from .models import (
21
+ CapturedPrompt,
22
+ CapturedResource,
23
+ CapturedSurface,
24
+ CapturedTool,
25
+ )
26
+
27
+ logger = logging.getLogger("mcp_warden.capture")
28
+
29
+ #: Hard wall-clock timeout for the entire capture handshake (seconds).
30
+ DEFAULT_TIMEOUT_S = 30.0
31
+
32
+
33
class CaptureError(Exception):
    """Signal that the target MCP server's surface could not be captured.

    The message is written for direct display by the CLI, so callers can show
    ``str(exc)`` rather than a raw traceback from the child process.
    """
39
+
40
+
41
+ def _model_dump(obj: Any) -> dict[str, Any]:
42
+ """Best-effort dict view of an MCP SDK model across pydantic versions."""
43
+ if hasattr(obj, "model_dump"):
44
+ return obj.model_dump() # pydantic v2
45
+ if hasattr(obj, "dict"):
46
+ return obj.dict() # pydantic v1 fallback
47
+ return dict(obj)
48
+
49
+
50
async def _capture_async(command: str, args: list[str], timeout_s: float) -> CapturedSurface:
    """Perform the handshake and the three ``*/list`` calls against one server.

    No timeout is enforced here: ``timeout_s`` is accepted but not used in this
    body. :func:`capture_surface` wraps the call in the wall-clock limit.
    """
    # command + args reach the OS as an argv array (StdioServerParameters); the
    # MCP SDK does NOT spawn through a shell. This is the §10.4 guarantee.
    launch = StdioServerParameters(command=command, args=list(args))

    async with stdio_client(launch) as (reader, writer), ClientSession(reader, writer) as session:
        handshake = await session.initialize()
        negotiated = getattr(handshake, "protocolVersion", "")

        # Listing order is fixed: tools, then resources, then prompts.
        tool_entries = await _list_tools(session)
        resource_entries = await _list_resources(session)
        prompt_entries = await _list_prompts(session)

        surface = CapturedSurface(
            command=command,
            args=list(args),
            protocol_version=str(negotiated or ""),
            tools=tool_entries,
            resources=resource_entries,
            prompts=prompt_entries,
        )
        return surface
73
+
74
+
75
async def _list_tools(session: ClientSession) -> list[CapturedTool]:
    """Call ``tools/list`` and convert each entry to a :class:`CapturedTool`.

    Any exception from the call is logged at INFO and treated as "no tools"
    (a server may not declare the tools capability), so the result is ``[]``.
    """
    try:
        listing = await session.list_tools()
    except Exception as exc:  # server may not declare the tools capability
        logger.info("tools/list unavailable: %s", exc)
        return []

    # A missing or None ``tools`` attribute is treated as an empty listing.
    dumped = (_model_dump(entry) for entry in getattr(listing, "tools", []) or [])
    return [
        CapturedTool(
            name=str(fields.get("name", "")),
            description=fields.get("description"),
            input_schema=fields.get("inputSchema"),
        )
        for fields in dumped
    ]
93
+
94
+
95
async def _list_resources(session: ClientSession) -> list[CapturedResource]:
    """Call ``resources/list`` and convert each entry to a :class:`CapturedResource`.

    Any exception from the call is logged at INFO and yields an empty list.
    """
    try:
        listing = await session.list_resources()
    except Exception as exc:
        logger.info("resources/list unavailable: %s", exc)
        return []

    # A missing or None ``resources`` attribute is treated as an empty listing.
    dumped = (_model_dump(entry) for entry in getattr(listing, "resources", []) or [])
    return [
        CapturedResource(
            # The URI is always stored as ``str``, whatever type the SDK model uses.
            uri=str(fields.get("uri", "")),
            name=fields.get("name"),
            description=fields.get("description"),
            mime_type=fields.get("mimeType"),
        )
        for fields in dumped
    ]
114
+
115
+
116
async def _list_prompts(session: ClientSession) -> list[CapturedPrompt]:
    """Call ``prompts/list`` and convert each entry to a :class:`CapturedPrompt`.

    Any exception from the call is logged at INFO and yields an empty list.
    Prompt ``arguments`` are kept only when they form a list; each element that
    is not already a dict is converted with :func:`_model_dump`.
    """
    try:
        listing = await session.list_prompts()
    except Exception as exc:
        logger.info("prompts/list unavailable: %s", exc)
        return []

    captured: list[CapturedPrompt] = []
    for entry in getattr(listing, "prompts", []) or []:
        fields = _model_dump(entry)
        raw_arguments = fields.get("arguments")
        arguments: list[dict[str, Any]] | None = (
            [item if isinstance(item, dict) else _model_dump(item) for item in raw_arguments]
            if isinstance(raw_arguments, list)
            else None
        )
        captured.append(
            CapturedPrompt(
                name=str(fields.get("name", "")),
                description=fields.get("description"),
                arguments=arguments,
            )
        )
    return captured
138
+
139
+
140
async def capture_surface(
    command: str,
    args: list[str],
    timeout_s: float = DEFAULT_TIMEOUT_S,
) -> CapturedSurface:
    """Launch an MCP server over stdio and return its declared surface.

    Args:
        command: ``argv[0]`` of the server launch (no shell expansion performed).
        args: Remaining argv, order preserved.
        timeout_s: Wall-clock limit applied to the whole capture.

    Returns:
        The :class:`CapturedSurface` holding tools, resources and prompts.

    Raises:
        CaptureError: On timeout, a missing command, or any other failure
            during spawn or handshake. The original exception is chained as
            ``__cause__``; the message itself is CLI-safe.
    """
    # NOTE(review): ``args`` is logged verbatim at DEBUG; launch arguments are
    # also scanned for secrets elsewhere, so they may be sensitive — confirm
    # debug logs are not collected in CI.
    logger.debug("spawning MCP server: command=%r args=%r", command, args)
    try:
        with anyio.fail_after(timeout_s):
            surface = await _capture_async(command, args, timeout_s)
    except TimeoutError as exc:
        # Must precede the generic handlers so a timeout keeps its own message.
        message = (
            f"MCP server '{command}' did not complete the handshake within {timeout_s:.0f}s "
            f"(it may be hung or waiting on input)."
        )
        raise CaptureError(message) from exc
    except CaptureError:
        # Already CLI-safe; pass through untouched instead of re-wrapping.
        raise
    except FileNotFoundError as exc:
        raise CaptureError(f"MCP server command not found: '{command}' ({exc}).") from exc
    except Exception as exc:
        # Covers nonzero exit, broken pipe, protocol error, decode failure, etc.
        message = f"Failed to capture MCP server '{command}': {type(exc).__name__}: {exc}"
        raise CaptureError(message) from exc
    else:
        return surface
177
+
178
+
179
def capture_surface_sync(
    command: str,
    args: list[str],
    timeout_s: float = DEFAULT_TIMEOUT_S,
) -> CapturedSurface:
    """Blocking entry point for the CLI: run :func:`capture_surface` to completion.

    Args:
        command: ``argv[0]`` of the server launch.
        args: Remaining argv.
        timeout_s: Wall-clock timeout, forwarded unchanged.

    Returns:
        The captured surface.

    Raises:
        CaptureError: On any capture failure (see :func:`capture_surface`).
    """
    # ``anyio.run`` forwards the positional arguments to the coroutine function.
    surface: CapturedSurface = anyio.run(capture_surface, command, args, timeout_s)
    return surface
mcp_warden/check_core.py ADDED
@@ -0,0 +1,98 @@
1
+ """Shared check core: the single source of truth for the ``check`` verdict.
2
+
3
+ Both ``cli.py:check`` and the pre-commit wrapper (``precommit.py``) call
4
+ :func:`run_check` so a local hook and CI can never disagree on a drift verdict
5
+ (issue: "a hook that disagrees with CI is worse than no hook").
6
+
7
+ The sequence here mirrors what ``check`` has always done:
8
+ ``read_lock`` -> ``capture_surface_sync`` -> ``run_checks`` -> ``build_lock``
9
+ (an in-memory CURRENT lock, never persisted) -> ``compute_drift``.
10
+
11
+ # INTERNAL STABILITY NOTE: the pre-commit wrapper (precommit.py) depends on this
12
+ # function's signature and exception contract (CaptureError for spawn/timeout
13
+ # failures; FileNotFoundError / ValueError for a missing/invalid lock). Do not
14
+ # change either without updating precommit.py.
15
+ #
16
+ # DETERMINISM: this shared verdict path MUST stay free of environment-dependent
17
+ # behavior (cwd-, time-, locale-, or env-var-conditioned branches). The local
18
+ # pre-commit hook and CI both reach the drift verdict through this exact code, so
19
+ # any non-deterministic branch here would let a local hook verdict diverge from
20
+ # CI — the precise failure ("a hook that disagrees with CI") this module exists
21
+ # to prevent.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ from dataclasses import dataclass
27
+ from pathlib import Path
28
+
29
+ from .capture import capture_surface_sync
30
+ from .checks import run_checks
31
+ from .drift import DriftItem, compute_drift
32
+ from .lockfile import build_lock, read_lock
33
+ from .models import Finding
34
+
35
+
36
@dataclass(frozen=True)
class CheckResult:
    """Immutable outcome of one check run: static findings plus the drift verdict.

    Callers that only need the verdict can read ``drift``; the CLI's SARIF/JSON
    emitters also need ``findings``.
    """

    # Static-check findings computed on the CURRENT captured surface.
    findings: list[Finding]
    # Drift items between baseline and current lock; an empty list means clean.
    drift: list[DriftItem]
46
+
47
+
48
def run_check_full(
    command: str,
    args: list[str],
    lock_path: Path,
    timeout_s: float,
) -> CheckResult:
    """Produce the ``check`` verdict: read lock -> capture -> checks -> drift.

    This is the one code path both ``cli.py:check`` and the pre-commit wrapper
    go through. The CLI uses ``findings`` for SARIF/JSON output; the thinner
    :func:`run_check` keeps only ``drift``.

    Args:
        command: The MCP server launch command (argv[0]).
        args: The remaining server launch argv.
        lock_path: Path to the baseline ``warden.lock``.
        timeout_s: Capture timeout in seconds.

    Returns:
        A :class:`CheckResult` (``drift`` empty == clean).

    Raises:
        FileNotFoundError: The lock file does not exist.
        ValueError: The lock file is invalid JSON or fails schema validation.
        CaptureError: The server could not be spawned or did not respond in time.
    """
    # The baseline is read first, so a missing/invalid lock fails before any
    # server process is spawned.
    baseline_lock = read_lock(lock_path)
    captured = capture_surface_sync(command, args, timeout_s=timeout_s)
    static_findings = run_checks(captured)
    # The current lock exists IN MEMORY only, as the right-hand side of the
    # diff; nothing is written to disk on the check path.
    current_lock = build_lock(captured, static_findings)
    return CheckResult(
        findings=static_findings,
        drift=compute_drift(baseline_lock, current_lock),
    )
82
+
83
+
84
def run_check(
    command: str,
    args: list[str],
    lock_path: Path,
    timeout_s: float,
) -> list[DriftItem]:
    """Return just the drift verdict of a check run.

    Thin wrapper over :func:`run_check_full` for callers (the pre-commit hook)
    that have no use for the static findings.

    Raises:
        FileNotFoundError, ValueError, CaptureError: see :func:`run_check_full`.
    """
    full_result = run_check_full(command, args, lock_path, timeout_s)
    return full_result.drift
mcp_warden/checks.py ADDED
@@ -0,0 +1,163 @@
1
+ """Static-check engine orchestrator (CHECKS.md).
2
+
3
+ Runs the full ``WRD-*`` catalog over a captured surface:
4
+ - capability checks ``WRD-CAP-*`` (via the shared tokenizer),
5
+ - secret checks ``WRD-SEC-*`` (checks_secret),
6
+ - supply-chain checks ``WRD-SUP-*`` (checks_supply),
7
+ - robustness ``WRD-SCHEMA-MALFORMED``.
8
+
9
+ Findings are returned sorted by ``(target, rule_id)`` for deterministic output
10
+ (CHECKS.md §5.1). CUT items (fuzzy/NLP, result scanning, etc.) are NOT here.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from typing import Any
16
+
17
+ from .checks_secret import scan_field
18
+ from .checks_supply import check_launch_command
19
+ from .models import CapturedSurface, Finding
20
+ from .tokenizer import capability_evidence, derive_capabilities
21
+
22
# Capability flag -> (rule_id, severity) per CHECKS.md §4.1.
# ``run_checks`` indexes this table directly with each flag returned by
# ``derive_capabilities``, so a flag without an entry here raises ``KeyError``.
_CAP_RULES: dict[str, tuple[str, str]] = {
    "shell-exec": ("WRD-CAP-SHELL", "critical"),
    "fs-write": ("WRD-CAP-FS-WRITE", "high"),
    "fs-read": ("WRD-CAP-FS-READ", "medium"),
    "http-request": ("WRD-CAP-HTTP", "high"),
    "sql-query": ("WRD-CAP-SQL", "high"),
}
30
+
31
+
32
+ def _string_values_from_schema(schema: dict[str, Any]) -> list[str]:
33
+ """Collect string ``default``/``enum``/``examples`` values from a JSON Schema.
34
+
35
+ Recurses nested schemas (``properties``, ``items``, ``$defs``, etc.). Property
36
+ *keys* are intentionally NOT scanned (CHECKS.md §4.2 — a key named ``api_key``
37
+ is not a leak).
38
+
39
+ Args:
40
+ schema: A JSON Schema fragment.
41
+
42
+ Returns:
43
+ Flat list of candidate string values to run secret scans over.
44
+ """
45
+ out: list[str] = []
46
+
47
+ def walk(node: Any) -> None:
48
+ if isinstance(node, dict):
49
+ if isinstance(node.get("default"), str):
50
+ out.append(node["default"])
51
+ enum = node.get("enum")
52
+ if isinstance(enum, list):
53
+ out.extend(v for v in enum if isinstance(v, str))
54
+ examples = node.get("examples")
55
+ if isinstance(examples, list):
56
+ out.extend(v for v in examples if isinstance(v, str))
57
+ for key, val in node.items():
58
+ if key in ("default", "enum", "examples"):
59
+ continue
60
+ walk(val)
61
+ elif isinstance(node, list):
62
+ for item in node:
63
+ walk(item)
64
+
65
+ walk(schema)
66
+ return out
67
+
68
+
69
+ def _schema_is_malformed(schema: Any) -> bool:
70
+ """Return True if an inputSchema is present but not analyzable (not an object)."""
71
+ return schema is not None and not isinstance(schema, dict)
72
+
73
+
74
def run_checks(surface: CapturedSurface) -> list[Finding]:
    """Apply every static check to a captured surface and return the findings.

    Covers, in order: the launch command (supply-chain + secret scan of each
    argv element), every tool (malformed-schema, capability, and secret checks),
    every resource, and every prompt.

    Args:
        surface: The captured declared surface.

    Returns:
        De-duplicated findings, deterministically sorted by ``target`` then
        ``rule_id``. Secret snippets are redacted by the scanners.
    """
    collected: list[Finding] = list(check_launch_command(surface.command, surface.args))

    # --- Launch argv: each element is secret-scanned under one shared target ---
    for launch_part in [surface.command, *surface.args]:
        collected += scan_field(launch_part, "launch/command")

    # --- Tools ---
    for tool in surface.tools:
        tool_target = f"tools/{tool.name}"
        malformed = _schema_is_malformed(tool.input_schema)
        # A malformed schema is reported once and then treated as absent.
        schema_obj: dict[str, Any] | None = None if malformed else tool.input_schema
        if malformed:
            collected.append(
                Finding(
                    rule_id="WRD-SCHEMA-MALFORMED",
                    severity="low",
                    target=tool_target,
                    message="inputSchema is present but not a JSON object; capability analysis skipped",
                    snippet=f"inputSchema type={type(tool.input_schema).__name__}",
                )
            )

        # Capability flags come from the shared tokenizer; the table lookup
        # assumes every derived flag has an entry in _CAP_RULES.
        for flag in derive_capabilities(tool.name, schema_obj):
            rule_id, severity = _CAP_RULES[flag]
            evidence = capability_evidence(tool.name, schema_obj, flag)
            collected.append(
                Finding(
                    rule_id=rule_id,
                    severity=severity,
                    target=tool_target,
                    message=f"Tool derives capability '{flag}' ({evidence})",
                    snippet=evidence,
                )
            )

        # Secret scan: name, then description (if any), then schema strings.
        collected += scan_field(tool.name, tool_target)
        if tool.description:
            collected += scan_field(tool.description, tool_target)
        if isinstance(schema_obj, dict):
            for schema_text in _string_values_from_schema(schema_obj):
                collected += scan_field(schema_text, tool_target)

    # --- Resources: only non-empty uri/name/description are scanned ---
    for resource in surface.resources:
        resource_target = f"resources/{resource.uri}"
        for text in filter(None, (resource.uri, resource.name, resource.description)):
            collected += scan_field(text, resource_target)

    # --- Prompts: name always, description when present ---
    for prompt in surface.prompts:
        prompt_target = f"prompts/{prompt.name}"
        collected += scan_field(prompt.name, prompt_target)
        if prompt.description:
            collected += scan_field(prompt.description, prompt_target)

    return _dedupe_and_sort(collected)
146
+
147
+
148
+ def _dedupe_and_sort(findings: list[Finding]) -> list[Finding]:
149
+ """Collapse duplicate (rule_id, target, snippet) and sort by (target, rule_id).
150
+
151
+ CHECKS.md §5.1/§5.2: one finding per (rule_id, target, match-location);
152
+ emitted sorted by ``(target, rule_id)``.
153
+ """
154
+ seen: set[tuple[str, str, str]] = set()
155
+ unique: list[Finding] = []
156
+ for f in findings:
157
+ key = (f.rule_id, f.target, f.snippet)
158
+ if key in seen:
159
+ continue
160
+ seen.add(key)
161
+ unique.append(f)
162
+ unique.sort(key=lambda f: (f.target, f.rule_id, f.snippet))
163
+ return unique
mcp_warden/checks_secret.py ADDED
@@ -0,0 +1,129 @@
1
+ """Secret-leakage checks (MCP-SECRET) — ``WRD-SEC-*`` (CHECKS.md §4.2).
2
+
3
+ Deterministic regex + entropy heuristics over the declared surface's string
4
+ fields. Snippets are ALWAYS redacted (CHECKS.md §8.2).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import math
10
+ import re
11
+ from collections import Counter
12
+
13
+ from .models import Finding
14
+ from .redact import redact_secret
15
+
16
+ # --- Vendor patterns (CHECKS.md §4.2; case-sensitive unless noted) -----------
17
+
18
+ _VENDOR_PATTERNS: list[tuple[str, str, re.Pattern[str]]] = [
19
+ ("WRD-SEC-OPENAI", "critical", re.compile(r"\bsk-[A-Za-z0-9]{20,}\b")),
20
+ # GitHub: ghp_ (36) plus gho_/ghu_/ghs_/ghr_ OAuth/app tokens.
21
+ ("WRD-SEC-GITHUB", "critical", re.compile(r"\bgh[pousr]_[A-Za-z0-9]{36}\b")),
22
+ ("WRD-SEC-AWS-AKID", "critical", re.compile(r"\bAKIA[0-9A-Z]{16}\b")),
23
+ ("WRD-SEC-SLACK", "critical", re.compile(r"\bxox[baprs]-[A-Za-z0-9-]{10,}\b")),
24
+ (
25
+ "WRD-SEC-PRIVKEY",
26
+ "critical",
27
+ re.compile(r"-----BEGIN (RSA |EC |OPENSSH |DSA |PGP )?PRIVATE KEY-----"),
28
+ ),
29
+ (
30
+ "WRD-SEC-JWT",
31
+ "high",
32
+ re.compile(r"\beyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b"),
33
+ ),
34
+ ]
35
+
36
#: Entropy candidate token pattern (CHECKS.md §4.2).
_ENTROPY_TOKEN = re.compile(r"[A-Za-z0-9+/_=-]{20,}")

#: Splitter for the entropy pass: whitespace + chars outside the candidate set.
#: Note that ``.`` is NOT a split character here although it is absent from
#: ``_ENTROPY_TOKEN``; ``scan_field`` requires a ``fullmatch`` of the token
#: pattern, so any piece that still contains a dot is skipped by the entropy pass.
#: NOTE(review): this also skips a candidate directly followed by a sentence-ending
#: period — confirm that is intended by CHECKS.md §4.2.
_ENTROPY_SPLIT = re.compile(r"[^A-Za-z0-9+/_=.-]+")

#: Minimum Shannon entropy (bits/char) for a candidate to be reported.
ENTROPY_THRESHOLD = 4.0
#: Candidates shorter than this many characters are skipped.
ENTROPY_MIN_LEN = 24
#: Minimum fraction of ASCII alphanumeric characters a candidate must have.
ALNUM_DOMINANCE = 0.80
45
+
46
+
47
def shannon_entropy(token: str) -> float:
    """Return the Shannon entropy of *token* in bits per character.

    Uses the character frequency distribution: ``H = -Σ p_i log2 p_i``.

    Args:
        token: The candidate string.

    Returns:
        Entropy in bits per character; ``0.0`` for an empty string.
    """
    if not token:
        return 0.0
    length = len(token)
    # One probability per distinct character, in first-seen order.
    probabilities = [occurrences / length for occurrences in Counter(token).values()]
    return -sum(p * math.log2(p) for p in probabilities)
63
+
64
+
65
+ def _alnum_ratio(token: str) -> float:
66
+ """Fraction of characters in ``[A-Za-z0-9]``."""
67
+ if not token:
68
+ return 0.0
69
+ alnum = sum(1 for ch in token if ch.isalnum() and ch.isascii())
70
+ return alnum / len(token)
71
+
72
+
73
def scan_field(value: str, target: str) -> list[Finding]:
    """Look for secrets in a single string field and report them redacted.

    Vendor patterns run first. The entropy heuristic then examines the pieces
    of ``value`` produced by ``_ENTROPY_SPLIT`` and skips any piece that
    overlaps (by substring, either direction) something already reported.

    Args:
        value: The string field content to scan.
        target: The finding target, e.g. ``"tools/<name>"`` or ``"launch/command"``.

    Returns:
        A list of :class:`Finding` with redacted snippets. May be empty.
    """
    if not value:
        return []

    results: list[Finding] = []
    # Raw text already reported; entropy candidates are de-duplicated against it.
    covered: set[str] = set()

    # 1) Explicit vendor patterns.
    for rule_id, severity, pattern in _VENDOR_PATTERNS:
        for hit in pattern.finditer(value):
            secret = hit.group(0)
            covered.add(secret)
            results.append(
                Finding(
                    rule_id=rule_id,
                    severity=severity,
                    target=target,
                    message=f"{rule_id}: possible secret in field",
                    snippet=redact_secret(secret),
                )
            )

    # 2) Entropy heuristic, de-duped against vendor matches.
    for candidate in _ENTROPY_SPLIT.split(value):
        # Length and character-set gates; a piece containing '.' fails fullmatch.
        if len(candidate) < ENTROPY_MIN_LEN or _ENTROPY_TOKEN.fullmatch(candidate) is None:
            continue
        if any(candidate in seen or seen in candidate for seen in covered):
            continue  # already covered by a vendor rule or an earlier candidate
        if _alnum_ratio(candidate) < ALNUM_DOMINANCE:
            continue
        if shannon_entropy(candidate) < ENTROPY_THRESHOLD:
            continue
        results.append(
            Finding(
                rule_id="WRD-SEC-ENTROPY",
                severity="high",
                target=target,
                message="WRD-SEC-ENTROPY: high-entropy token (possible secret)",
                snippet=redact_secret(candidate),
            )
        )
        covered.add(candidate)

    return results