@intentsolutions/audit-harness 1.1.7 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,143 @@
1
+ #!/usr/bin/env bash
2
+ # caa-check.sh — verify a namespace publishes CAA records (and, when configured,
3
+ # pins the EXPECTED certificate authority) before a production signed attestation
4
+ # is anchored against it.
5
+ #
6
+ # WHY THIS EXISTS (CISO binding, DR-010 Q5 / ISEDC v1 Q1 2026-05-10):
7
+ # CAA (RFC 8659) records constrain which CAs may issue certificates for a
8
+ # namespace. Pinning the CA on evals.intentsolutions.io closes the mis-issuance
9
+ # path an attacker could otherwise use to obtain a look-alike cert and present
10
+ # forged attestation infrastructure. This must be verified BEFORE the first
11
+ # production attestation. This script is that gate — read-only, fail-closed.
12
+ #
13
+ # WHY IT QUERIES AN EXPLICIT RESOLVER (the bug this version fixes):
14
+ # Querying the LOCAL STUB RESOLVER (plain `dig`, no `@server`) FALSE-NEGATIVES
15
+ # on hosts whose stub resolver lags CAA propagation or strips the record type
16
+ # (systemd-resolved, many CI runners, dev boxes). On such a host a correctly
17
+ # CAA-pinned zone looks like it has no CAA, and the gate refuses a legitimate
18
+ # production sign. The fix is to query a TRUSTED PUBLIC resolver. The gate
19
+ # stays fail-closed: PASS only on a positive matching CAA record from a trusted
20
+ # resolver; absence / mismatch / unreachable => non-zero.
21
+ #
22
+ # Usage:
23
+ # bash scripts/caa-check.sh [DOMAIN]
24
+ # EXPECTED_CAA_ISSUER=letsencrypt.org bash scripts/caa-check.sh evals.intentsolutions.io
25
+ #
26
+ # Resolution order for the domain:
27
+ # 1. $1 (positional)
28
+ # 2. $CAA_CHECK_DOMAIN
29
+ # 3. default: evals.intentsolutions.io
30
+ #
31
+ # Issuer policy:
32
+ # - EXPECTED_CAA_ISSUER (env) — when set, at least one CAA `issue` (or
33
+ # `issuewild`) record MUST name this CA, else the check FAILS (exit 1).
34
+ # Default: letsencrypt.org (the CA the IS public-namespace certs are issued
35
+ # by). Override per-deployment.
36
+ # - EXPECTED_CAA_ISSUER=ANY (case-insensitive) — relax to "any CAA record is
37
+ # acceptable"; presence of ANY CAA record passes, absence fails, and a
38
+ # warning is emitted that no specific CA is being pinned.
39
+ #
40
+ # Exit codes:
41
+ # 0 — CAA verified (present at a trusted resolver, and matches
42
+ # EXPECTED_CAA_ISSUER when a specific issuer is required)
43
+ # 1 — CAA NOT verified (no CAA records, or expected issuer not present, from
44
+ # any trusted resolver)
45
+ # 2 — UNKNOWN/UNREACHABLE (no resolver tool installed)
46
+ #
47
+ # Override knobs:
48
+ # CAA_CHECK_RESOLVERS — space-separated list of trusted public resolvers to
49
+ # query in order (default: "1.1.1.1 8.8.8.8").
50
+ # CAA_CHECK_DIG_CMD — command used in place of `dig` (default: dig)
51
+
52
+ set -euo pipefail
53
+
54
+ DOMAIN="${1:-${CAA_CHECK_DOMAIN:-evals.intentsolutions.io}}"
55
+ EXPECTED_CAA_ISSUER="${EXPECTED_CAA_ISSUER:-letsencrypt.org}"
56
+ DIG_CMD="${CAA_CHECK_DIG_CMD:-dig}"
57
+ # Trusted public resolvers, queried in order, until one returns a CAA record.
58
+ RESOLVERS="${CAA_CHECK_RESOLVERS:-1.1.1.1 8.8.8.8}"
59
+
60
+ log() { printf 'caa-check: %s\n' "$1" >&2; }
61
+
# print_usage [FILE] — print the leading comment header of FILE (default: this
# script): every line after the shebang, up to (not including) the first line
# that does not start with '#'.
# Scanning for the end of the header, instead of printing a fixed line range,
# keeps --help from spilling into the code that follows the header.
print_usage() {
  awk 'NR > 1 { if ($0 !~ /^#/) exit; print }' "${1:-$0}"
}

# The help flag arrives in the domain slot ($1), so it is detected via DOMAIN.
if [[ "$DOMAIN" == "-h" || "$DOMAIN" == "--help" ]]; then
  print_usage
  exit 0
fi
66
+
67
+ have() { command -v "$1" >/dev/null 2>&1; }
68
+
69
+ if ! have "$DIG_CMD"; then
70
+ log "UNKNOWN/UNREACHABLE — '$DIG_CMD' is not installed; cannot look up CAA for '$DOMAIN'"
71
+ log " failing closed (production must not sign on UNKNOWN)"
72
+ log " remediation: install bind9-dnsutils (provides dig) on the signing host"
73
+ exit 2
74
+ fi
75
+
# issuer_matches CAA_TEXT -> 0 if a matching issue/issuewild record is present.
#
# CAA_TEXT is the `dig +short CAA` output, one record per line, e.g.:
#   0 issue "letsencrypt.org"
#   0 issuewild "letsencrypt.org; validationmethods=dns-01"
#
# Only `issue` / `issuewild` records are considered. For each one the issuer
# domain is taken from the value (quotes removed, parameters after ';' dropped,
# whitespace and a trailing dot stripped) and compared to $EXPECTED_CAA_ISSUER
# case-insensitively and EXACTLY. A substring comparison would let
# "notletsencrypt.org" or "letsencrypt.org.evil.example" satisfy a pin on
# "letsencrypt.org", which defeats the purpose of pinning.
#
# Globals: EXPECTED_CAA_ISSUER (read)
# Returns: 0 when at least one record names the expected issuer, 1 otherwise.
issuer_matches() {
  # awk consumes all of its input, so the pipeline cannot fail under pipefail
  # through an early-exiting reader.
  printf '%s\n' "$1" | awk -v want="$EXPECTED_CAA_ISSUER" '
    BEGIN {
      want = tolower(want)
      sub(/\.$/, "", want)
      hit = 0
    }
    {
      line = tolower($0)
      # The tag must be a whole word: " issue " or " issuewild ".
      if (!match(line, /[ \t]issue(wild)?[ \t]+/)) next
      value = substr(line, RSTART + RLENGTH)
      gsub(/"/, "", value)          # values are quoted in presentation format
      sub(/;.*$/, "", value)        # drop issuer parameters
      gsub(/[ \t\r]+/, "", value)
      sub(/\.$/, "", value)
      if (want != "" && value == want) hit = 1
    }
    END { exit(hit ? 0 : 1) }
  '
}
84
+
# is_blank CAA_TEXT -> 0 when the text holds nothing but spaces, tabs, carriage
# returns and newlines (the empty string included); 1 otherwise.
is_blank() {
  # True unless some character outside the whitespace set is present.
  [[ "$1" != *[!$' \t\r\n']* ]]
}
89
+
last_caa_out=""   # records from the last resolver that returned ANY CAA records
saw_records=0     # set to 1 once any trusted resolver returned CAA records

# Decide once whether the issuer policy is relaxed to "any CAA record". The
# comparison is case-insensitive, so nocasematch is enabled only around it.
shopt -s nocasematch
relax_any=0
[[ "$EXPECTED_CAA_ISSUER" == "ANY" ]] && relax_any=1
shopt -u nocasematch

# caa_record_lines DIG_OUTPUT -> prints only the lines shaped like a CAA record
# in presentation format: `<flags> <tag> <value>`, e.g. `0 issue "ca.example"`.
#
# `dig +short` output is not guaranteed to contain only CAA rdata: when the
# name is an alias the CNAME target is printed as its own line, and `;;`
# diagnostic lines can show up in the captured output as well. Counting such
# lines as "CAA records" would make the presence-only (ANY) policy pass for a
# zone that publishes no CAA at all, so they are dropped here.
# Always returns 0; an empty result means "no CAA records".
caa_record_lines() {
  local record_re='^[[:space:]]*[0-9]+[[:space:]]+[A-Za-z0-9]+[[:space:]]+[^[:space:]]'
  local line
  while IFS= read -r line; do
    if [[ "$line" =~ $record_re ]]; then
      printf '%s\n' "$line"
    fi
  done <<<"$1"
  return 0
}

# RESOLVERS is a space-separated list; word-splitting it is intended.
# shellcheck disable=SC2086
for resolver in $RESOLVERS; do
  log "looking up CAA records for '$DOMAIN' via $DIG_CMD @$resolver"
  # `dig @resolver +short CAA` prints one line per record, e.g.:
  #   0 issue "letsencrypt.org"
  #   0 issuewild ";"
  # A failing dig is tolerated here (|| true): it simply yields no records for
  # this resolver, and the gate stays fail-closed via the exit 1 after the loop.
  raw_out="$("$DIG_CMD" "@$resolver" +short CAA "$DOMAIN" 2>/dev/null || true)"
  caa_out="$(caa_record_lines "$raw_out")"

  if is_blank "$caa_out"; then
    log " no CAA records returned by @$resolver"
    continue
  fi

  saw_records=1
  last_caa_out="$caa_out"

  # --- ANY-issuer relaxation: any CAA record present passes ---
  if [[ "$relax_any" -eq 1 ]]; then
    log "VERIFIED (presence only) — CAA records exist for '$DOMAIN' (via @$resolver)"
    log " WARNING: EXPECTED_CAA_ISSUER=ANY — no specific CA is being pinned."
    log " Records found:"
    printf '%s\n' "$caa_out" | sed 's/^/ /' >&2
    exit 0
  fi

  # --- Specific-issuer pinning ---
  if issuer_matches "$caa_out"; then
    log "VERIFIED — '$DOMAIN' pins issuance to '$EXPECTED_CAA_ISSUER' (via @$resolver)"
    exit 0
  fi

  log " CAA records exist at @$resolver but none pin '$EXPECTED_CAA_ISSUER'; trying next resolver"
done
130
+
131
+ # No trusted resolver yielded a matching CAA record -> fail-closed (exit 1).
132
+ if [[ "$saw_records" -eq 1 ]]; then
133
+ log "NOT VERIFIED — CAA records exist for '$DOMAIN' but none pin '$EXPECTED_CAA_ISSUER'"
134
+ log " Records found:"
135
+ printf '%s\n' "$last_caa_out" | sed 's/^/ /' >&2
136
+ log " remediation: add a CAA record pinning the expected CA, or set"
137
+ log " EXPECTED_CAA_ISSUER to the CA actually published (or ANY to accept any CAA)."
138
+ else
139
+ log "NOT VERIFIED — no CAA records found for '$DOMAIN' (resolvers tried: $RESOLVERS)"
140
+ log " remediation: publish a CAA record pinning the issuing CA, e.g.:"
141
+ log " $DOMAIN. CAA 0 issue \"$EXPECTED_CAA_ISSUER\""
142
+ fi
143
+ exit 1
@@ -0,0 +1,120 @@
1
+ #!/usr/bin/env bash
2
+ # check-wrapper-sync.sh — assert the bundled wrapper-script mirrors are byte-identical
3
+ # to their canonical source under scripts/.
4
+ #
5
+ # WHY THIS EXISTS
6
+ # ---------------
7
+ # The Node package (bin/audit-harness.js) dispatches to the CANONICAL scripts under
8
+ # scripts/. The Python wrapper (intent-audit-harness on PyPI) and the Rust wrapper
9
+ # (intent-audit-harness on crates.io) cannot reach those canonical files at install
10
+ # time, so each BUNDLES a copy:
11
+ #
12
+ # * python/src/intent_audit_harness/scripts/<name> (packaged into the wheel)
13
+ # * rust/scripts/<name> (include_bytes!'d into the binary)
14
+ #
15
+ # Those copies are hand-maintained. On 2026-05-24 they were found ~1 month stale:
16
+ # the bundled crap-score.py was missing v1.1.1's --json evidence envelope, the
17
+ # `which_or_none("go")` PATH guard (silent crash on Go-less hosts), and the
18
+ # rglob->os.walk directory pruning. A user running
19
+ # `pip install intent-audit-harness && audit-harness crap` got the OLD gate.
20
+ # (Tracking bead: iah-python-wrapper-scripts-sync / bd_000-projects-65k4.)
21
+ #
22
+ # This gate makes that class of drift IMPOSSIBLE to merge silently: every bundled
23
+ # mirror MUST be a byte-for-byte copy of its canonical source. There is no
24
+ # wrapper-only delta — both wrappers invoke the script verbatim via bash/python3.
25
+ #
26
+ # RESYNC (when this gate REDs)
27
+ # ----------------------------
28
+ # bash scripts/check-wrapper-sync.sh --fix # copy canonical -> both mirrors
29
+ # then review + commit the result.
30
+ #
31
+ # Exit codes:
32
+ # 0 all mirrors in sync (or --fix completed)
33
+ # 1 drift detected (and not in --fix mode)
34
+ set -euo pipefail
35
+
36
+ # Resolve repo root from this script's own location so the gate works regardless
37
+ # of the caller's CWD (CI runs it from the repo root; a dev may run it elsewhere).
38
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
39
+ REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
40
+ CANONICAL_DIR="${REPO_ROOT}/scripts"
41
+
42
+ # The set of scripts the Python + Rust wrappers DISPATCH. Keep this in lock-step
43
+ # with:
44
+ # * python/src/intent_audit_harness/cli.py (COMMANDS dict)
45
+ # * rust/src/main.rs (SCRIPTS array)
46
+ # If a wrapper starts dispatching a new canonical script, add it here AND to both
47
+ # wrapper sources, and copy it into both mirror dirs.
48
+ MIRRORED_SCRIPTS=(
49
+ "harness-hash.sh"
50
+ "escape-scan.sh"
51
+ "arch-check.sh"
52
+ "bias-count.sh"
53
+ "gherkin-lint.sh"
54
+ "crap-score.py"
55
+ )
56
+
57
+ # Each mirror directory that bundles a copy of the canonical scripts.
58
+ MIRROR_DIRS=(
59
+ "python/src/intent_audit_harness/scripts"
60
+ "rust/scripts"
61
+ )
62
+
63
+ FIX=0
64
+ if [[ "${1:-}" == "--fix" ]]; then
65
+ FIX=1
66
+ fi
67
+
drift_found=0         # set to 1 when any mirror is missing or differs
missing_canonical=0   # set to 1 when a canonical source itself is missing

# check_mirror CANONICAL MIRROR MIRROR_LABEL CANONICAL_LABEL
#
# Compare one bundled mirror against its canonical source, byte for byte.
#   CANONICAL / MIRROR               paths of the two files
#   MIRROR_LABEL / CANONICAL_LABEL   repo-relative names used in messages
# Globals: FIX (read), drift_found (set to 1 on a missing or differing mirror)
# In --fix mode the canonical file is copied over the mirror. The mirror's
# parent directory is created first, so a brand-new mirror dir does not make
# `cp` fail and abort the whole run under `set -e`.
check_mirror() {
  local canonical=$1 mirror=$2 mirror_label=$3 canonical_label=$4
  local verb

  if [[ ! -f "${mirror}" ]]; then
    echo "DRIFT: missing mirror ${mirror_label} (expected a copy of ${canonical_label})" >&2
    verb="created"
  elif ! cmp -s "${canonical}" "${mirror}"; then
    # cmp -s is a pure byte comparison; any non-zero status counts as drift.
    echo "DRIFT: ${mirror_label} differs from canonical ${canonical_label}" >&2
    verb="resynced"
  else
    return 0
  fi

  drift_found=1
  if [[ "${FIX}" -eq 1 ]]; then
    mkdir -p -- "$(dirname -- "${mirror}")"
    cp -f -- "${canonical}" "${mirror}"
    echo " fixed: ${verb} ${mirror_label}"
  fi
}

# Walk every (script, mirror dir) pair. A missing canonical source is recorded
# and skipped: there is nothing to compare its mirrors against.
for name in "${MIRRORED_SCRIPTS[@]}"; do
  canonical="${CANONICAL_DIR}/${name}"
  if [[ ! -f "${canonical}" ]]; then
    echo "ERROR: canonical source missing: scripts/${name}" >&2
    missing_canonical=1
    continue
  fi
  for mdir in "${MIRROR_DIRS[@]}"; do
    check_mirror "${canonical}" "${REPO_ROOT}/${mdir}/${name}" \
      "${mdir}/${name}" "scripts/${name}"
  done
done
99
+
100
+ if [[ "${missing_canonical}" -eq 1 ]]; then
101
+ echo "FAIL: one or more canonical scripts are missing — cannot verify mirror sync." >&2
102
+ exit 1
103
+ fi
104
+
105
+ if [[ "${FIX}" -eq 1 ]]; then
106
+ echo "check-wrapper-sync: --fix complete. Review + commit the resynced mirrors."
107
+ exit 0
108
+ fi
109
+
110
+ if [[ "${drift_found}" -eq 1 ]]; then
111
+ echo "" >&2
112
+ echo "FAIL: bundled wrapper mirrors are out of sync with canonical scripts/." >&2
113
+ echo " The Python (PyPI) and Rust (crates.io) packages would ship STALE gates." >&2
114
+ echo " Resync with: bash scripts/check-wrapper-sync.sh --fix" >&2
115
+ echo " then review + commit the result." >&2
116
+ exit 1
117
+ fi
118
+
119
+ echo "check-wrapper-sync: OK — all ${#MIRRORED_SCRIPTS[@]} bundled mirrors match canonical in ${#MIRROR_DIRS[@]} wrapper dirs."
120
+ exit 0
@@ -50,6 +50,22 @@ EXCLUDED_DIRS = {
50
50
  }
51
51
 
52
52
 
def is_excluded_dir(name: str) -> bool:
    """Return True when a tree walk must not descend into directory `name`.

    This is the one exclusion rule used by both the candidate-discovery walk
    and the --json input-hash walk. The two walks have to skip the same
    directories: if they disagree, the files that feed the CRAP score and the
    files that feed the input_hash drift apart, and the hash ends up covering
    files the score never saw (or the reverse).

    A directory is excluded when it is a dot-dir (`.git`, `.idea`, `.svn`, ...)
    or when it is one of the build/vendor names listed in EXCLUDED_DIRS. Before
    this predicate existed, discovery skipped every dot-dir while the hash walk
    skipped only the named subset, so a dot-dir missing from EXCLUDED_DIRS was
    hashed but never scored.
    """
    if name.startswith("."):
        return True
    return name in EXCLUDED_DIRS
67
+
68
+
53
69
  def crap(complexity: int, coverage_pct: float) -> float:
54
70
  cov = max(0.0, min(100.0, coverage_pct)) / 100.0
55
71
  return (complexity ** 2) * ((1.0 - cov) ** 3) + complexity
@@ -98,8 +114,7 @@ def score_python(root: Path, kind: str) -> list[MethodScore]:
98
114
  scanned = [
99
115
  p.name for p in root.iterdir()
100
116
  if p.is_dir()
101
- and not p.name.startswith(".")
102
- and p.name not in EXCLUDED_DIRS
117
+ and not is_excluded_dir(p.name)
103
118
  and p.name not in test_dirs
104
119
  and any(p.rglob("*.py"))
105
120
  ]
@@ -165,7 +180,15 @@ def score_go(root: Path, kind: str) -> list[MethodScore]:
165
180
  print("[crap-score] gocyclo not installed", file=sys.stderr)
166
181
  return []
167
182
 
168
- rc, out, _ = run(["gocyclo", "-ignore", "_test.go" if kind == "src" else ".*\\.go$", "."], root)
183
+ # For kind="src", ignore *_test.go at the gocyclo level. For kind="test",
184
+ # do NOT pass -ignore: a pattern like `.*\.go$` matches every analyzable
185
+ # file (gocyclo only reads .go files), which silenced all test-kind output.
186
+ # The include-filter below keeps only *_test.go rows for kind="test".
187
+ gocyclo_cmd = ["gocyclo"]
188
+ if kind == "src":
189
+ gocyclo_cmd += ["-ignore", "_test.go"]
190
+ gocyclo_cmd.append(".")
191
+ rc, out, _ = run(gocyclo_cmd, root)
169
192
  complexity: list[tuple[str, str, int]] = []
170
193
  for line in out.splitlines():
171
194
  parts = line.strip().split()
@@ -187,11 +210,28 @@ def score_go(root: Path, kind: str) -> list[MethodScore]:
187
210
  if not cov_out.is_file() and which_or_none("go"):
188
211
  run(["go", "test", "-coverprofile=coverage.out", "-covermode=atomic", "./..."], root)
189
212
  if cov_out.is_file() and which_or_none("go"):
213
+ # `go tool cover -func` reports module-qualified paths
214
+ # (github.com/user/repo/pkg/file.go) while gocyclo reports repo-relative
215
+ # paths (pkg/file.go). Strip the module prefix read from go.mod so the
216
+ # coverage keys join the complexity keys.
217
+ module_prefix = ""
218
+ go_mod = root / "go.mod"
219
+ if go_mod.is_file():
220
+ try:
221
+ for mod_line in go_mod.read_text().splitlines():
222
+ mod_line = mod_line.strip()
223
+ if mod_line.startswith("module ") or mod_line.startswith("module\t"):
224
+ module_prefix = mod_line.split(None, 1)[1].strip() + "/"
225
+ break
226
+ except OSError:
227
+ pass
190
228
  rc, out, _ = run(["go", "tool", "cover", "-func=coverage.out"], root)
191
229
  for line in out.splitlines():
192
230
  parts = line.split()
193
231
  if len(parts) >= 3 and parts[-1].endswith("%"):
194
232
  fpath = parts[0].split(":", 1)[0]
233
+ if module_prefix and fpath.startswith(module_prefix):
234
+ fpath = fpath[len(module_prefix):]
195
235
  try:
196
236
  pct = float(parts[-1].rstrip("%"))
197
237
  except ValueError:
@@ -228,6 +268,17 @@ def score_js(root: Path, kind: str) -> list[MethodScore]:
228
268
  except json.JSONDecodeError:
229
269
  return []
230
270
 
271
+ # c8/istanbul's json-summary reporter keys files by ABSOLUTE path while
272
+ # complexity-report (run with a repo-relative target) reports repo-relative
273
+ # paths. Normalize both sides to repo-relative so the coverage join works.
274
+ def _rel_to_root(p: str) -> str:
275
+ if os.path.isabs(p):
276
+ try:
277
+ return os.path.relpath(p, str(root))
278
+ except ValueError:
279
+ return p # e.g. different drive on Windows — keep as-is
280
+ return p
281
+
231
282
  cov_path = root / "coverage" / "coverage-summary.json"
232
283
  coverage: dict[str, float] = {}
233
284
  if cov_path.is_file():
@@ -237,14 +288,14 @@ def score_js(root: Path, kind: str) -> list[MethodScore]:
237
288
  if fpath == "total":
238
289
  continue
239
290
  lines_pct = summary.get("lines", {}).get("pct", 0.0)
240
- coverage[fpath] = float(lines_pct)
291
+ coverage[_rel_to_root(fpath)] = float(lines_pct)
241
292
  except (OSError, json.JSONDecodeError):
242
293
  pass
243
294
 
244
295
  scores: list[MethodScore] = []
245
296
  for report in data.get("reports", []):
246
297
  fpath = report.get("path", "")
247
- cov = coverage.get(fpath, 0.0)
298
+ cov = coverage.get(_rel_to_root(fpath), 0.0)
248
299
  for func in report.get("functions", []):
249
300
  c = int(func.get("cyclomatic", 1))
250
301
  scores.append(
@@ -403,7 +454,7 @@ def main() -> int:
403
454
  exts = (".py", ".ts", ".tsx", ".js", ".jsx", ".go", ".rs", ".java", ".kt", ".cs", ".php", ".rb")
404
455
  collected: list[Path] = []
405
456
  for dirpath, dirs, files in os.walk(root):
406
- dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS]
457
+ dirs[:] = [d for d in dirs if not is_excluded_dir(d)]
407
458
  for fn in files:
408
459
  if fn.endswith(exts):
409
460
  collected.append(Path(dirpath) / fn)
@@ -0,0 +1,238 @@
1
+ #!/usr/bin/env bash
2
+ # cred-gate.sh — Provider credential PASS/FAIL gate (iah-E08).
3
+ #
4
+ # CISO non-negotiable per DR-010 S1Q5: before any provider abstraction is allowed
5
+ # to flow data into an Evidence Bundle / OTel signal / gate-result envelope, two
6
+ # things MUST hold and are gated here, deterministically and offline:
7
+ #
8
+ # 1. CREDENTIAL REDACTION — no provider secret VALUE appears verbatim in the
9
+ # candidate artifact (the JSON the runner is about to sign, the OTel line it
10
+ # is about to emit, any log it captures). A leaked API key in a signed,
11
+ # Rekor-anchored Statement is irreversible.
12
+ #
13
+ # 2. ENV-VAR SPILLOVER — the candidate artifact does not blindly serialize the
14
+ # process environment (e.g. an `env` dump, a `process.env` spread, or a
15
+ # "context": {<all env>} block). A provider key need not be named to leak:
16
+ # a wholesale env dump spills every secret at once.
17
+ #
18
+ # This gate is READ-ONLY and OFFLINE. It never contacts a provider, never reads
19
+ # a real key from disk, and never writes. It inspects the candidate artifact you
20
+ # hand it (stdin or --input) against the secret values present in the environment
21
+ # (referenced by NAME via --secret-env, so the values never appear on the command
22
+ # line) plus a built-in catalog of provider-key SHAPES.
23
+ #
24
+ # It emits a gate-result/v1 envelope on stdout (--json) suitable for piping to
25
+ # emit-evidence, OR a human-readable PASS/FAIL summary (default).
26
+ #
27
+ # Usage:
28
+ # bash cred-gate.sh --input candidate.json
29
+ # <producer> | bash cred-gate.sh # candidate on stdin
30
+ # bash cred-gate.sh --secret-env ANTHROPIC_API_KEY --secret-env OPENAI_API_KEY < cand.json
31
+ # bash cred-gate.sh --json < candidate.json | bash emit-evidence.sh
32
+ #
33
+ # Flags:
34
+ # --input PATH Read the candidate artifact from PATH instead of stdin.
35
+ # --secret-env NAME Treat $NAME's VALUE as a secret that must NOT appear in the
36
+ # candidate. Repeatable. The value is read from the
37
+ # environment by name — it is never passed on argv.
38
+ # --json Emit a gate-result/v1 envelope (JSON) instead of text.
39
+ # --gate-id ID Override the gate_id in the envelope (default: provider-cred-gate).
40
+ # --help, -h Print help.
41
+ #
42
+ # Exit codes:
43
+ # 0 — PASS (no secret value present; no env-var spillover detected)
44
+ # 1 — FAIL (a secret value leaked OR an env-var spillover pattern matched)
45
+ # 2 — usage / input error (no candidate, unreadable --input)
46
+ #
47
+ # Failure-mode docs (iah-E08d): see docs/cred-gate.md for the catalog of detected
48
+ # shapes, the spillover heuristics, the false-positive posture, and remediation.
49
+
50
+ set -euo pipefail
51
+
52
+ # Bash version floor: align with the rest of the harness (jcgw).
53
+ [ "${BASH_VERSINFO:-0}" -ge 4 ] || { echo 'audit-harness requires bash >= 4' >&2; exit 2; }
54
+
55
+ INPUT="-"
56
+ EMIT_JSON=0
57
+ GATE_ID="provider-cred-gate"
58
+ SECRET_ENVS=()
59
+
# require_value FLAG ARGC — exit 2 (usage error) when FLAG is the last argument
# and therefore has no value after it. Without this check the `$2` reference
# trips `set -u`, and the script dies with a shell error instead of the
# documented usage exit code.
require_value() {
  if [[ "$2" -lt 2 ]]; then
    echo "cred-gate: $1 requires a value" >&2
    exit 2
  fi
}

# parse_args ARGS... — consume the command-line flags.
# Globals written: INPUT, SECRET_ENVS (appended), EMIT_JSON, GATE_ID.
# Exits 0 after printing help; exits 2 on an unknown flag or a missing value.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --input)
        require_value "$1" "$#"
        INPUT="$2"
        shift 2
        ;;
      --secret-env)
        require_value "$1" "$#"
        SECRET_ENVS+=("$2")
        shift 2
        ;;
      --json)
        EMIT_JSON=1
        shift
        ;;
      --gate-id)
        require_value "$1" "$#"
        GATE_ID="$2"
        shift 2
        ;;
      --help|-h)
        sed -n '2,46p' "$0"
        exit 0
        ;;
      *)
        echo "cred-gate: unknown flag $1" >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
70
+
71
+ # --- Read the candidate artifact ---
72
+ if [[ "$INPUT" == "-" ]]; then
73
+ CANDIDATE=$(cat)
74
+ else
75
+ if [[ ! -r "$INPUT" ]]; then
76
+ echo "cred-gate: cannot read $INPUT" >&2
77
+ exit 2
78
+ fi
79
+ CANDIDATE=$(cat "$INPUT")
80
+ fi
81
+
82
+ if [[ -z "$CANDIDATE" ]]; then
83
+ echo "cred-gate: empty candidate artifact" >&2
84
+ exit 2
85
+ fi
86
+
87
+ # Resolve the gate input hash: sha256 of the candidate text exactly as held in
88
+ # $CANDIDATE (the capture above strips trailing newlines), so the envelope's input_hash is coherent with what was actually inspected.
89
+ INPUT_HASH="sha256:$(printf '%s' "$CANDIDATE" | sha256sum | cut -d' ' -f1)"
90
+ # The policy is this script's own bytes — a content address of the gate logic.
91
+ POLICY_HASH="sha256:$(sha256sum "$0" | cut -d' ' -f1)"
92
+
# --- Deterministic analysis in python (offline; values via env, not argv) ---
#
# analyze_candidate — run the redaction / shape / spillover checks and print
# one JSON object {"result": "PASS"|"FAIL", "findings": [...]} on stdout.
#
# Globals read: CANDIDATE, SECRET_ENVS, GATE_ID.
#
# Secrets are handed to python BY NAME (newline-separated, in
# SECRET_ENV_NAMES); python then reads each value from its own environment.
# The values are deliberately not concatenated into one shell string: a bash
# string cannot contain a NUL byte, so `"$val"$'\0'` appends no delimiter at
# all. With two or more --secret-env flags the values would fuse into a single
# string that matches nothing, and a leaked key would pass the gate.
# Passing names also keeps every secret value off argv and out of `ps`.
# The named variables must be present in the process environment, which is how
# they reach this script in the first place.
analyze_candidate() {
  local names="" name
  for name in "${SECRET_ENVS[@]:-}"; do
    if [[ -n "$name" ]]; then
      names+="${name}"$'\n'
    fi
  done

  CANDIDATE="$CANDIDATE" \
  SECRET_ENV_NAMES="$names" \
  GATE_ID="$GATE_ID" \
  python3 - <<'PY'
import hashlib
import json
import os
import re

candidate = os.environ["CANDIDATE"]

findings = []  # list of {"kind": ..., "detail": ...}

# --- 1. Credential redaction: explicit secret VALUES must not appear verbatim ---
secret_names = [n for n in os.environ.get("SECRET_ENV_NAMES", "").split("\n") if n]
for secret_name in secret_names:
    val = os.environ.get(secret_name, "")
    # Skip empty / trivially short values: a 1-char "secret" would false-positive
    # on virtually any artifact and is not a real credential.
    if len(val) < 8:
        continue
    if val in candidate:
        # NEVER echo the secret. Report only its length + a non-reversible
        # fingerprint so the finding is actionable without re-leaking.
        fp = hashlib.sha256(val.encode("utf-8")).hexdigest()[:12]
        findings.append(
            {
                "kind": "secret-value-leak",
                "detail": (
                    "a declared secret value (len=%d, sha256:%s...) appears "
                    "verbatim in the candidate artifact" % (len(val), fp)
                ),
            }
        )

# --- 2. Credential redaction: provider-key SHAPES (value-agnostic catalog) ---
# Each pattern matches the literal on-the-wire shape of a known provider key.
# A match means a raw key is embedded even if it was not declared via
# --secret-env. Patterns are intentionally specific to keep the FP rate low.
SHAPE_PATTERNS = [
    ("anthropic-key", r"sk-ant-[A-Za-z0-9_-]{20,}"),
    # OpenAI keys start sk- but NOT sk-ant- (that's anthropic, matched above).
    # The negative lookahead keeps the two findings disjoint.
    ("openai-key", r"sk-(?!ant-)(?:proj-)?[A-Za-z0-9_-]{20,}"),
    ("groq-key", r"gsk_[A-Za-z0-9]{20,}"),
    ("nvidia-key", r"nvapi-[A-Za-z0-9_-]{20,}"),
    ("aws-access-key-id", r"AKIA[0-9A-Z]{16}"),
    ("google-api-key", r"AIza[0-9A-Za-z_-]{35}"),
    ("github-token", r"gh[posru]_[A-Za-z0-9]{36,}"),
    ("slack-token", r"xox[baprs]-[A-Za-z0-9-]{10,}"),
    ("private-key-block", r"-----BEGIN (?:RSA |EC |OPENSSH )?PRIVATE KEY-----"),
]
for name, pattern in SHAPE_PATTERNS:
    if re.search(pattern, candidate):
        findings.append(
            {
                "kind": "secret-shape-match",
                "detail": "candidate contains a value matching the %s key shape"
                % name,
            }
        )

# --- 3. Env-var spillover: wholesale environment serialization ---
# A provider key need not be NAMED to leak — a blanket env dump spills every
# secret at once. We flag the structural patterns that serialize the whole
# environment into the artifact.
# NOTE(review): in "printenv-capture" the leading \b sits in front of "/", a
# non-word character, so the /usr/bin/env alternative only matches when a word
# character comes right before the slash. Confirm whether that is intended
# before widening it (a plain `#!/usr/bin/env` shebang does not match today).
SPILLOVER_PATTERNS = [
    ("process-env-spread", r"\.\.\.\s*process\.env\b"),
    ("os-environ-dump", r"\bdict\(\s*os\.environ\s*\)|\bos\.environ\b\s*[,}\]]"),
    ("env-block-key", r'"(?:env|environ|environment)"\s*:\s*\{'),
    ("printenv-capture", r"\b(?:printenv|/usr/bin/env)\b"),
]
# These are heuristics, but any spillover match is recorded as a finding, so
# the gate FAILs on it: an env dump in a to-be-signed artifact is exactly the
# irreversible leak this gate exists to stop.
for name, pattern in SPILLOVER_PATTERNS:
    if re.search(pattern, candidate):
        findings.append(
            {
                "kind": "env-spillover",
                "detail": "candidate serializes the process environment via "
                "the %s pattern" % name,
            }
        )

result = "FAIL" if findings else "PASS"
print(json.dumps({"result": result, "findings": findings}))
PY
}

RESULT=$(analyze_candidate)
199
+
200
+ # --- Parse the python result ---
201
+ GATE_RESULT=$(printf '%s' "$RESULT" | python3 -c "import json,sys; print(json.load(sys.stdin)['result'])")
202
+ FINDINGS_JSON=$(printf '%s' "$RESULT" | python3 -c "import json,sys; print(json.dumps(json.load(sys.stdin)['findings']))")
203
+ FINDING_COUNT=$(printf '%s' "$RESULT" | python3 -c "import json,sys; print(len(json.load(sys.stdin)['findings']))")
204
+
205
+ # --- Emit ---
206
+ if [[ "$EMIT_JSON" -eq 1 ]]; then
207
+ GATE_ID="$GATE_ID" GATE_RESULT="$GATE_RESULT" INPUT_HASH="$INPUT_HASH" \
208
+ POLICY_HASH="$POLICY_HASH" FINDINGS_JSON="$FINDINGS_JSON" \
209
+ python3 - <<'PY'
210
+ import json
211
+ import os
212
+
213
+ env = {
214
+ "gate_id": os.environ["GATE_ID"],
215
+ "result": os.environ["GATE_RESULT"],
216
+ "input_hash": os.environ["INPUT_HASH"],
217
+ "policy_hash": os.environ["POLICY_HASH"],
218
+ "metadata": {"findings": json.loads(os.environ["FINDINGS_JSON"])},
219
+ }
220
+ if env["result"] == "FAIL":
221
+ env["failure_mode"] = "provider_credential_leak"
222
+ print(json.dumps(env, separators=(",", ":")))
223
+ PY
224
+ else
225
+ if [[ "$GATE_RESULT" == "PASS" ]]; then
226
+ echo "cred-gate: PASS — no provider secret value present, no env-var spillover detected"
227
+ else
228
+ echo "cred-gate: FAIL — $FINDING_COUNT credential finding(s):" >&2
229
+ printf '%s' "$FINDINGS_JSON" | python3 -c "
230
+ import json, sys
231
+ for f in json.load(sys.stdin):
232
+ sys.stderr.write(' ⛔ [%s] %s\n' % (f['kind'], f['detail']))
233
+ "
234
+ echo "cred-gate: see docs/cred-gate.md for remediation (iah-E08d)." >&2
235
+ fi
236
+ fi
237
+
238
+ [[ "$GATE_RESULT" == "PASS" ]] && exit 0 || exit 1