code-audit-validator 1.2.0__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: code-audit-validator
3
- Version: 1.2.0
3
+ Version: 1.3.0
4
4
  Summary: Deterministic conformance checker for AUDIT.md agent-audit outputs — CyberSkill code-audit-framework
5
5
  Author-email: CyberSkill <info@cyberskill.world>
6
6
  License:
@@ -256,8 +256,9 @@ python3 evals/validate.py --all # full suite, human output
256
256
  python3 evals/validate.py --all --json # machine-readable
257
257
  ./evals/run-evals.sh --record # run + pin baseline to current AUDIT.md
258
258
  python3 evals/validate.py --run <dir> # validate a real run's docs/ output
259
- python3 evals/validate.py --run <dir> --report json # structured findings export (loops, tasks, metrics, violations)
259
+ python3 evals/validate.py --run <dir> --report json # structured findings export (schemas/report.v1.json)
260
260
  python3 evals/validate.py --run <dir> --report sarif # GitHub code-scanning format
261
+ python3 evals/validate.py --aggregate r1.json r2.json # portfolio roll-up over report JSONs
261
262
  python3 evals/scripts/retro-summary.py # retro scores per protocol version (did each release help?)
262
263
  ```
263
264
 
@@ -265,6 +266,24 @@ Point `--run` at the target repo root (or its `docs/`): if the target's
265
266
  `AUDIT.md` is found, its CONFIG is preflighted and `PROTECTED_AREAS` is loaded
266
267
  automatically; `--protected` extends it.
267
268
 
269
+ **Waivers.** A target repo may carry `docs/AUDIT-WAIVERS.yaml` — audit-trailed,
270
+ *expiring* suppressions (`code` + optional `file`/`match` + `reason` +
271
+ `approved_by` + mandatory ISO `expires`). A valid waiver suppresses the matched
272
+ violation and is reported separately; an expired or undated one un-suppresses
273
+ it AND flags the stale waiver (`WAIVER-EXPIRED`). This is the sanctioned
274
+ exception channel — eval fixtures, by contrast, may never be weakened.
275
+
276
+ **Parsing notes (precision boundaries, pinned by fixtures).** Tables inside
277
+ ``` fences are raw evidence, never artifacts (G07/B19). Tables must use
278
+ leading-pipe GFM rows — the exact Phase 2 template shape; pipeless variants
279
+ read as nonconformant. Protected-area matching is case-insensitive substring —
280
+ keep CONFIG entries specific (`src/billing/`, not `src/`). Artifacts must be
281
+ UTF-8 and ≤ 10 MB (`MALFORMED-FILE` otherwise, never a crash).
282
+
283
+ **Version pinning.** The validator checks the *current* protocol's template.
284
+ Validating artifacts produced under an older protocol? Pin the matching tag
285
+ (validator and protocol release in lockstep — e.g. validator `v1.2.0` ↔ protocol v1.2.0).
286
+
268
287
  ## Adding a fixture
269
288
 
270
289
  1. Create `evals/fixtures/<Gnn|Bnn>-<slug>/` with `fixture.yaml` + `docs/BACKLOG.md` (and `docs/HANDOFF.md` if relevant).
@@ -13,10 +13,10 @@ weakens a rule.
13
13
 
14
14
  | | |
15
15
  |---|---|
16
- | Protocol | [`AUDIT.md`](./AUDIT.md) — current release **v1.2.0** |
16
+ | Protocol | [`AUDIT.md`](./AUDIT.md) — current release **v1.3.0** |
17
17
  | History | [`CHANGELOG.md`](./CHANGELOG.md) · immutable copies in [`improve/versions/`](./improve/versions/) |
18
18
  | Self-improvement | [`improve/CRITIC.md`](./improve/CRITIC.md) — one evidenced change per cycle |
19
- | Regression gate | [`evals/`](./evals/) — **24 fixtures, 24/24 green** at v1.2.0, stdlib-only Python; enforced in CI on every push |
19
+ | Regression gate | [`evals/`](./evals/) — **34 fixtures, 34/34 green** at v1.3.0, stdlib-only Python; enforced in CI on every push |
20
20
  | For agents | [`AGENTS.md`](./AGENTS.md) — machine-facing operating rules for this repo |
21
21
  | License | [Apache-2.0](./LICENSE) · [`CONTRIBUTING.md`](./CONTRIBUTING.md) · [`SECURITY.md`](./SECURITY.md) |
22
22
 
@@ -152,8 +152,11 @@ the GitHub Action below. Re-running the same kickoff prompt resumes idempotently
152
152
  **No clone needed — two distribution channels for step 3:**
153
153
 
154
154
  ```bash
155
- # One-off (uv) — or pipx install the same URL for a persistent command.
156
- # @v1 = floating major tag; pin an exact release tag (> v1.2.0) for immutability.
155
+ # From PyPI (https://pypi.org/project/code-audit-validator/):
156
+ pipx install code-audit-validator # or: uvx --from code-audit-validator code-audit-validate --run .
157
+ code-audit-validate --run . --report json
158
+
159
+ # Or straight from the repo (@v1 = floating major tag; pin a release tag for immutability):
157
160
  uvx --from git+https://github.com/cyberskill-official/code-audit-framework@v1 \
158
161
  code-audit-validate --run . --report json
159
162
  ```
@@ -166,8 +169,16 @@ uvx --from git+https://github.com/cyberskill-official/code-audit-framework@v1 \
166
169
  report: json # optional; also writes audit-report.json
167
170
  ```
168
171
 
169
- (The packaged entry point covers `--run`/`--report`; the fixture suite
170
- `--all` stays repo-only, since fixtures ship with the repo, not the wheel.)
172
+ (The packaged entry point covers `--run`/`--report`/`--aggregate`; the fixture
173
+ suite `--all` stays repo-only, since fixtures ship with the repo, not the wheel.)
174
+
175
+ Two operational notes: accepted exceptions go in the target's
176
+ `docs/AUDIT-WAIVERS.yaml` — audit-trailed suppressions with a reason, an
177
+ approver, and a **mandatory expiry** (expired waivers re-raise the finding and
178
+ flag the stale waiver). And the validator is **offline by design**: stdlib-only,
179
+ no network calls, no telemetry — nothing about the audited codebase leaves the
180
+ machine, which makes it safe for air-gapped and regulated environments
181
+ (see [`COMPLIANCE.md`](./COMPLIANCE.md)).
171
182
 
172
183
  **Improving the protocol itself, scripted the same way** (Job B in
173
184
  [`AGENTS.md`](./AGENTS.md) — the file agents are pointed at once they're
@@ -215,7 +226,7 @@ regression-tested, and changed only with evidence.
215
226
  improve/CRITIC.md ── ONE minimal change; PATCH/MINOR/MAJOR
216
227
 
217
228
 
218
- evals/validate.py --all ── 24 fixtures must stay green
229
+ evals/validate.py --all ── 34 fixtures must stay green
219
230
 
220
231
 
221
232
  CHANGELOG.md + improve/versions/AUDIT-vX.Y.Z.md (immutable release)
@@ -262,7 +273,7 @@ Full evidence trail: [`CHANGELOG.md`](./CHANGELOG.md),
262
273
  ## The regression harness
263
274
 
264
275
  ```bash
265
- python3 evals/validate.py --all # 24 fixtures: G* must pass, B* must trip
276
+ python3 evals/validate.py --all # 34 fixtures: G* must pass, B* must trip
266
277
  ./evals/run-evals.sh --record # run + pin baseline.json to AUDIT.md's sha256
267
278
  python3 evals/validate.py --run DIR # validate any real run's docs/ output
268
279
  python3 evals/validate.py --run DIR --report json # structured findings export (or: sarif)
@@ -35,8 +35,9 @@ python3 evals/validate.py --all # full suite, human output
35
35
  python3 evals/validate.py --all --json # machine-readable
36
36
  ./evals/run-evals.sh --record # run + pin baseline to current AUDIT.md
37
37
  python3 evals/validate.py --run <dir> # validate a real run's docs/ output
38
- python3 evals/validate.py --run <dir> --report json # structured findings export (loops, tasks, metrics, violations)
38
+ python3 evals/validate.py --run <dir> --report json # structured findings export (schemas/report.v1.json)
39
39
  python3 evals/validate.py --run <dir> --report sarif # GitHub code-scanning format
40
+ python3 evals/validate.py --aggregate r1.json r2.json # portfolio roll-up over report JSONs
40
41
  python3 evals/scripts/retro-summary.py # retro scores per protocol version (did each release help?)
41
42
  ```
42
43
 
@@ -44,6 +45,24 @@ Point `--run` at the target repo root (or its `docs/`): if the target's
44
45
  `AUDIT.md` is found, its CONFIG is preflighted and `PROTECTED_AREAS` is loaded
45
46
  automatically; `--protected` extends it.
46
47
 
48
+ **Waivers.** A target repo may carry `docs/AUDIT-WAIVERS.yaml` — audit-trailed,
49
+ *expiring* suppressions (`code` + optional `file`/`match` + `reason` +
50
+ `approved_by` + mandatory ISO `expires`). A valid waiver suppresses the matched
51
+ violation and is reported separately; an expired or undated one un-suppresses
52
+ it AND flags the stale waiver (`WAIVER-EXPIRED`). This is the sanctioned
53
+ exception channel — eval fixtures, by contrast, may never be weakened.
54
+
55
+ **Parsing notes (precision boundaries, pinned by fixtures).** Tables inside
56
+ ``` fences are raw evidence, never artifacts (G07/B19). Tables must use
57
+ leading-pipe GFM rows — the exact Phase 2 template shape; pipeless variants
58
+ read as nonconformant. Protected-area matching is case-insensitive substring —
59
+ keep CONFIG entries specific (`src/billing/`, not `src/`). Artifacts must be
60
+ UTF-8 and ≤ 10 MB (`MALFORMED-FILE` otherwise, never a crash).
61
+
62
+ **Version pinning.** The validator checks the *current* protocol's template.
63
+ Validating artifacts produced under an older protocol? Pin the matching tag
64
+ (validator and protocol release in lockstep — e.g. validator `v1.2.0` ↔ protocol v1.2.0).
65
+
47
66
  ## Adding a fixture
48
67
 
49
68
  1. Create `evals/fixtures/<Gnn|Bnn>-<slug>/` with `fixture.yaml` + `docs/BACKLOG.md` (and `docs/HANDOFF.md` if relevant).
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: code-audit-validator
3
- Version: 1.2.0
3
+ Version: 1.3.0
4
4
  Summary: Deterministic conformance checker for AUDIT.md agent-audit outputs — CyberSkill code-audit-framework
5
5
  Author-email: CyberSkill <info@cyberskill.world>
6
6
  License:
@@ -256,8 +256,9 @@ python3 evals/validate.py --all # full suite, human output
256
256
  python3 evals/validate.py --all --json # machine-readable
257
257
  ./evals/run-evals.sh --record # run + pin baseline to current AUDIT.md
258
258
  python3 evals/validate.py --run <dir> # validate a real run's docs/ output
259
- python3 evals/validate.py --run <dir> --report json # structured findings export (loops, tasks, metrics, violations)
259
+ python3 evals/validate.py --run <dir> --report json # structured findings export (schemas/report.v1.json)
260
260
  python3 evals/validate.py --run <dir> --report sarif # GitHub code-scanning format
261
+ python3 evals/validate.py --aggregate r1.json r2.json # portfolio roll-up over report JSONs
261
262
  python3 evals/scripts/retro-summary.py # retro scores per protocol version (did each release help?)
262
263
  ```
263
264
 
@@ -265,6 +266,24 @@ Point `--run` at the target repo root (or its `docs/`): if the target's
265
266
  `AUDIT.md` is found, its CONFIG is preflighted and `PROTECTED_AREAS` is loaded
266
267
  automatically; `--protected` extends it.
267
268
 
269
+ **Waivers.** A target repo may carry `docs/AUDIT-WAIVERS.yaml` — audit-trailed,
270
+ *expiring* suppressions (`code` + optional `file`/`match` + `reason` +
271
+ `approved_by` + mandatory ISO `expires`). A valid waiver suppresses the matched
272
+ violation and is reported separately; an expired or undated one un-suppresses
273
+ it AND flags the stale waiver (`WAIVER-EXPIRED`). This is the sanctioned
274
+ exception channel — eval fixtures, by contrast, may never be weakened.
275
+
276
+ **Parsing notes (precision boundaries, pinned by fixtures).** Tables inside
277
+ ``` fences are raw evidence, never artifacts (G07/B19). Tables must use
278
+ leading-pipe GFM rows — the exact Phase 2 template shape; pipeless variants
279
+ read as nonconformant. Protected-area matching is case-insensitive substring —
280
+ keep CONFIG entries specific (`src/billing/`, not `src/`). Artifacts must be
281
+ UTF-8 and ≤ 10 MB (`MALFORMED-FILE` otherwise, never a crash).
282
+
283
+ **Version pinning.** The validator checks the *current* protocol's template.
284
+ Validating artifacts produced under an older protocol? Pin the matching tag
285
+ (validator and protocol release in lockstep — e.g. validator `v1.2.0` ↔ protocol v1.2.0).
286
+
268
287
  ## Adding a fixture
269
288
 
270
289
  1. Create `evals/fixtures/<Gnn|Bnn>-<slug>/` with `fixture.yaml` + `docs/BACKLOG.md` (and `docs/HANDOFF.md` if relevant).
@@ -29,8 +29,15 @@ machine-checkable subset of AUDIT.md's core rules:
29
29
  unedited <placeholder> text (Phase 0 preflight)
30
30
  CONFIG-BAD-ENUM MODE / DEPTH / BENCHMARK_MODE / SEVERITY_FLOOR outside its
31
31
  allowed set (Phase 0 preflight)
32
+ MALFORMED-FILE artifact is not valid UTF-8 text or exceeds the size
33
+ ceiling — a verdict, never a traceback
34
+ WAIVER-EXPIRED a docs/AUDIT-WAIVERS.yaml entry matched a violation but is
35
+ expired/undated; the original violation stays active
32
36
 
33
37
  A loop with zero findings is VALID (R7): absence of tasks is never a violation.
38
+ Waivers (docs/AUDIT-WAIVERS.yaml in the target repo) suppress matched
39
+ violations with an audit trail: code + reason + approved_by + expires (ISO
40
+ date, mandatory). Expired waivers un-suppress and are themselves flagged.
34
41
 
35
42
  When the run directory (or its parent, if you point --run at docs/ itself)
36
43
  contains the target's AUDIT.md, the CONFIG block is preflighted and
@@ -41,6 +48,7 @@ Usage:
41
48
  python3 evals/validate.py --run <dir-containing-docs> [--protected p1,p2]
42
49
  python3 evals/validate.py --run <dir> --report json # structured findings export
43
50
  python3 evals/validate.py --run <dir> --report sarif # GitHub code-scanning format
51
+ python3 evals/validate.py --aggregate r1.json r2.json # portfolio roll-up of report JSONs
44
52
  python3 evals/validate.py --all # run every fixture, compare to expectations
45
53
  python3 evals/validate.py --all --json # machine-readable results
46
54
 
@@ -110,11 +118,25 @@ def split_cells(line: str):
110
118
 
111
119
 
112
120
  def parse_tables(text: str):
113
- """Yield (header_cells, rows, end_line_idx) for every markdown table."""
121
+ """Yield (header_cells, rows, end_line_idx) for every markdown table.
122
+
123
+ Fence-aware (architect review F-1): R1 *requires* pasting raw tool output
124
+ into ``` fences, and that output may itself contain GFM-table-shaped lines
125
+ (markdown-emitting coverage/lint tools, `gh` CLI). Quoted lines inside a
126
+ fence are raw evidence, not run artifacts — they must neither trip
127
+ task/benchmark checks nor count toward template conformance.
128
+ `section_fences` keeps its own independent fence walk."""
114
129
  lines = text.splitlines()
115
- i = 0
130
+ i, in_fence = 0, False
116
131
  while i < len(lines):
117
132
  line = lines[i].strip()
133
+ if line.startswith("```"):
134
+ in_fence = not in_fence
135
+ i += 1
136
+ continue
137
+ if in_fence:
138
+ i += 1
139
+ continue
118
140
  if line.startswith("|") and i + 1 < len(lines) and re.match(r"^\|[\s:|-]+\|?$", lines[i + 1].strip()):
119
141
  header = split_cells(line)
120
142
  rows, j = [], i + 2
@@ -214,11 +236,13 @@ def check_benchmark_like_table(header, rows, end_idx, text, violations, src, is_
214
236
  fences = section_fences(text, end_idx)
215
237
  if has_measured_row and not fences:
216
238
  violations.append(("R1-NO-OUTPUT", src, "table has MEASURED/measured rows but no fenced raw-output block before next heading"))
217
- # …and each measured row must be traceable to ITS verify command
239
+ # …and each measured row must be traceable to ITS verify command.
240
+ # Whitespace-normalized containment (architect review F-6): a long command
241
+ # re-wrapped across lines inside the fence is still the same command.
218
242
  elif fences:
219
- joined = "\n".join(fences)
243
+ joined_ws = " ".join("\n".join(fences).split())
220
244
  for metric, verify in measured_rows:
221
- if verify and verify not in {"—", "-", ""} and verify not in joined:
245
+ if verify and verify not in {"—", "-", ""} and " ".join(verify.split()) not in joined_ws:
222
246
  violations.append(("R1-UNLINKED-OUTPUT", src, f"measured metric '{metric}': verify command '{verify}' appears in no fenced output block"))
223
247
 
224
248
 
@@ -242,14 +266,16 @@ def check_task_table(header, rows, violations, src, protected):
242
266
  if status == "BLOCKED" and "root cause" not in " ".join(r).lower():
243
267
  violations.append(("R6-NO-ROOTCAUSE", src, f"BLOCKED task '{tid}' has no 'Root cause:' note"))
244
268
  if status == "DONE" and protected:
245
- joined = " ".join(r)
269
+ joined = " ".join(r).casefold() # case-insensitive: src/Billing == src/billing (F-6)
246
270
  for p in protected:
247
- if p and p in joined:
271
+ if p and p.casefold() in joined:
248
272
  violations.append(("R3-PROTECTED", src, f"DONE task '{tid}' touches protected path '{p}'"))
249
273
 
250
274
 
251
275
  APPROVED_RE = re.compile(r"^Approved:\s*(.+)$", re.MULTILINE)
252
- MODE_GATED_RE = re.compile(r"(?mi)^\s*-?\s*Mode:\s*gated\b")
276
+ # `Mode:` may open the Scope line or follow another field (`Protocol: … | Mode: …`
277
+ # since v1.3.0) — match at line start or after a `|` separator, never mid-prose.
278
+ MODE_GATED_RE = re.compile(r"(?mi)(?:^|\|)\s*-?\s*Mode:\s*gated\b")
253
279
  EXECUTED_STATUSES = {"DONE", "IN-PROGRESS", "BLOCKED"}
254
280
 
255
281
 
@@ -294,19 +320,30 @@ def check_secrets(text, violations, src):
294
320
  violations.append(("R8-SECRET", src, f"unredacted {kind} matching '{m.group(0)[:12]}…'"))
295
321
 
296
322
 
297
- MODE_LINE_RE = re.compile(r"(?mi)^\s*-?\s*Mode:\s*\S+")
323
+ MODE_LINE_RE = re.compile(r"(?mi)(?:^|\|)\s*-?\s*Mode:\s*\S+")
324
+ PROTO_LINE_RE = re.compile(r"(?mi)(?:^|\|)\s*-?\s*Protocol:\s*v(\d+)\.(\d+)\.(\d+)\b")
298
325
  NO_FINDINGS_RE = re.compile(r"No significant findings", re.IGNORECASE)
299
326
 
327
+ # The template requirements this validator enforces, keyed to the protocol
328
+ # release it ships with (kept in lockstep by check-docs-sync.py). Artifacts
329
+ # that echo an older `Protocol:` are judged by THAT version's template —
330
+ # version-aware validation, architect review F-5. Artifacts without the echo
331
+ # are assumed current (and, from v1.3.0 on, flagged for omitting it).
332
+ CURRENT_PROTOCOL = (1, 3, 0)
333
+ MODE_ECHO_SINCE = (1, 1, 0)
334
+ PROTO_ECHO_SINCE = (1, 3, 0)
335
+
300
336
 
301
337
  def check_template_conformance(text, violations, src):
302
338
  """BLINDSPOTS BS-12 — the meta-tripwire. Every other check activates only
303
339
  when output LOOKS like the Phase 2 template (pipe tables, headings, Mode
304
340
  echo); a run that emits prose instead silently escapes all of them. This
305
341
  converts that silent escape into a violation, making the rest of the rule
306
- set load-bearing. Per loop section the template requires a `Mode:` line in
307
- Scope & method, and EITHER (benchmark table AND task table) OR the R7
308
- "No significant findings" line (a tabled-baselines-but-no-tasks loop also
309
- carries that line, so benchmark-table-only sections remain conformant)."""
342
+ set load-bearing. Per loop section the template requires (since v1.3.0) a
343
+ `Protocol:` echo, (since v1.1.0) a `Mode:` line, and (in all versions)
344
+ EITHER (benchmark table AND task table) OR the R7 "No significant
345
+ findings" line. Requirements are gated on the section's stated protocol
346
+ version, so older artifacts are judged by their own template (F-5)."""
310
347
  sections = re.split(r"(?m)^##\s+(?=Loop\b)", text)[1:]
311
348
  if not sections:
312
349
  violations.append(("TEMPLATE-NONCONFORMANT", src,
@@ -314,7 +351,13 @@ def check_template_conformance(text, violations, src):
314
351
  return
315
352
  for sec in sections:
316
353
  loop_id = (sec.splitlines() or ["?"])[0].strip()
317
- if not MODE_LINE_RE.search(sec):
354
+ pm = PROTO_LINE_RE.search(sec)
355
+ proto = tuple(int(g) for g in pm.groups()) if pm else CURRENT_PROTOCOL
356
+ if pm is None and CURRENT_PROTOCOL >= PROTO_ECHO_SINCE:
357
+ violations.append(("TEMPLATE-NONCONFORMANT", src,
358
+ f"'{loop_id}': Scope & method has no 'Protocol:' echo (required since v1.3.0; "
359
+ f"artifacts from older protocol versions validate with the matching release tag)"))
360
+ if proto >= MODE_ECHO_SINCE and not MODE_LINE_RE.search(sec):
318
361
  violations.append(("TEMPLATE-NONCONFORMANT", src,
319
362
  f"'{loop_id}': Scope & method has no 'Mode:' line (required since v1.1.0)"))
320
363
  tables = list(parse_tables(sec))
@@ -332,15 +375,28 @@ CONFIG_ENUMS = {
332
375
  "BENCHMARK_MODE": {"auto", "provided", "none"},
333
376
  "SEVERITY_FLOOR": {"Critical", "High", "Medium", "Low"},
334
377
  }
335
- PLACEHOLDER_RE = re.compile(r"<[^<>\n]*>")
378
+ # Architect review F-4: `<...>` alone misreads Java/TS generics (List<OrderDTO>)
379
+ # and shell redirection (< seed.txt > out.log) as placeholders. A placeholder is
380
+ # either the WHOLE value wrapped in <...>, or text carrying one of the canonical
381
+ # template stems below (the literal phrasings shipped in AUDIT.md's CONFIG).
382
+ TEMPLATE_STEMS = ("<e.g.", "<one line", "<paths/", "<how to", "<constraints", "<optional:")
336
383
  CONFIG_KEY_RE = re.compile(r"^([A-Z][A-Z_]+):\s*(.*)$")
337
384
 
338
385
 
386
+ def is_placeholder(value: str) -> bool:
387
+ v = value.strip()
388
+ if len(v) > 2 and v.startswith("<") and v.endswith(">"):
389
+ return True
390
+ low = v.lower()
391
+ return any(stem in low for stem in TEMPLATE_STEMS)
392
+
393
+
339
394
  def parse_audit_config(audit_md: Path):
340
395
  """Flat KEY: value parse of the CONFIG block in a target repo's AUDIT.md.
341
- Trailing `# comment` text is stripped; placeholder text is preserved."""
396
+ Comments are stripped only at >=2 spaces before '#' (the template's own
397
+ column style) so values like 'ticket #4211' survive intact (F-4)."""
342
398
  cfg, in_config = {}, False
343
- for line in audit_md.read_text(encoding="utf-8").splitlines():
399
+ for line in audit_md.read_text(encoding="utf-8", errors="replace").splitlines():
344
400
  if re.match(r"^##\s*CONFIG\b", line):
345
401
  in_config = True
346
402
  continue
@@ -352,34 +408,136 @@ def parse_audit_config(audit_md: Path):
352
408
  if not m:
353
409
  continue
354
410
  key, raw = m.groups()
355
- cfg[key] = re.split(r"\s+#", raw, 1)[0].strip()
411
+ cfg[key] = re.split(r"\s{2,}#", raw, 1)[0].strip()
356
412
  return cfg
357
413
 
358
414
 
359
415
  def check_config_preflight(target_root: Path, violations, protected):
360
416
  """Phase 0 CONFIG preflight (review gap G-D) + PROTECTED_AREAS auto-load
361
417
  (gap G-F). Runs only when the target's AUDIT.md is present; placeholder
362
- values never silently configure anything."""
418
+ values never silently configure anything. Enum values are compared on the
419
+ first whitespace-delimited token, so an inline trailing comment can't fail the enum."""
363
420
  audit = target_root / "AUDIT.md"
364
421
  if not audit.exists():
365
422
  return
366
423
  cfg = parse_audit_config(audit)
367
424
  for key, val in cfg.items():
368
- if PLACEHOLDER_RE.search(val):
425
+ if is_placeholder(val):
369
426
  violations.append(("CONFIG-PLACEHOLDER", "AUDIT.md",
370
427
  f"{key} still contains unedited template text: '{val[:60]}'"))
371
- elif key in CONFIG_ENUMS and val and val not in CONFIG_ENUMS[key]:
372
- violations.append(("CONFIG-BAD-ENUM", "AUDIT.md",
373
- f"{key} '{val}' not in {sorted(CONFIG_ENUMS[key])}"))
428
+ elif key in CONFIG_ENUMS and val:
429
+ token = val.split()[0]
430
+ if token not in CONFIG_ENUMS[key]:
431
+ violations.append(("CONFIG-BAD-ENUM", "AUDIT.md",
432
+ f"{key} '{token}' not in {sorted(CONFIG_ENUMS[key])}"))
374
433
  areas = cfg.get("PROTECTED_AREAS", "")
375
- if areas and not PLACEHOLDER_RE.search(areas):
434
+ if areas and not is_placeholder(areas):
376
435
  for p in areas.split(","):
377
436
  p = p.strip()
378
437
  if p and p not in protected:
379
438
  protected.append(p)
380
439
 
381
440
 
382
- def validate_run(run_dir: Path, protected=None):
441
+ MAX_ARTIFACT_BYTES = 10 * 1024 * 1024 # 10 MB ceiling — a "report" beyond this is not a report
442
+
443
+
444
+ def read_artifact(path: Path, violations, src):
445
+ """Guarded reader (architect review F-3): artifact problems must become
446
+ VERDICTS, never tracebacks — a gate that crashes is neither pass nor fail
447
+ and invites `|| true` workarounds. Returns text, or None after recording
448
+ a MALFORMED-FILE violation."""
449
+ try:
450
+ if path.stat().st_size > MAX_ARTIFACT_BYTES:
451
+ violations.append(("MALFORMED-FILE", src,
452
+ f"file is {path.stat().st_size} bytes — exceeds the {MAX_ARTIFACT_BYTES // (1024*1024)} MB artifact ceiling"))
453
+ return None
454
+ return path.read_text(encoding="utf-8")
455
+ except UnicodeDecodeError as e:
456
+ violations.append(("MALFORMED-FILE", src,
457
+ f"not valid UTF-8 (decode error at byte {e.start}) — artifacts must be UTF-8 text"))
458
+ return None
459
+ except OSError as e:
460
+ violations.append(("MALFORMED-FILE", src, f"unreadable: {e.__class__.__name__}"))
461
+ return None
462
+
463
+
464
+ def load_waivers(docs: Path):
465
+ """docs/AUDIT-WAIVERS.yaml in the TARGET repo — audit-trailed, expiring
466
+ suppressions (architect review §3.1). Deliberately different from eval
467
+ fixtures (which may never be weakened): waivers live in the audited repo,
468
+ name an approver, and MUST expire. Minimal YAML subset, stdlib-only:
469
+
470
+ - code: R2-UNCITED # required: violation code to waive
471
+ file: BACKLOG.md # optional: artifact filename
472
+ match: "Palantir" # optional: substring of the detail
473
+ reason: "approved comparison for marketing deck"
474
+ approved_by: "name@company"
475
+ expires: 2026-09-01 # required: ISO date
476
+ """
477
+ f = docs / "AUDIT-WAIVERS.yaml"
478
+ entries, cur = [], None
479
+ if not f.exists():
480
+ return entries
481
+ for raw in f.read_text(encoding="utf-8", errors="replace").splitlines():
482
+ line = raw.strip()
483
+ if not line or line.startswith("#"):
484
+ continue
485
+ if line.startswith("- "):
486
+ cur = {}
487
+ entries.append(cur)
488
+ line = line[2:].strip()
489
+ if cur is None or ":" not in line:
490
+ continue
491
+ k, _, v = line.partition(":")
492
+ cur[k.strip()] = v.split(" #")[0].strip().strip("\"'")
493
+ return entries
494
+
495
+
496
+ def apply_waivers(docs: Path, violations, waived_out=None):
497
+ """Partition violations into active vs waived. An expired (or undated)
498
+ waiver does NOT suppress — the original violation stays active and the
499
+ waiver itself becomes a WAIVER-EXPIRED violation. WAIVER-EXPIRED is not
500
+ itself waivable."""
501
+ import datetime
502
+ waivers = load_waivers(docs)
503
+ if not waivers:
504
+ return violations
505
+ today = datetime.date.today()
506
+ active, flagged = [], set()
507
+ for code, src, detail in violations:
508
+ match = None
509
+ for i, w in enumerate(waivers):
510
+ if w.get("code") != code:
511
+ continue
512
+ if w.get("file") and w["file"] != src:
513
+ continue
514
+ if w.get("match") and w["match"] not in detail:
515
+ continue
516
+ match = (i, w)
517
+ break
518
+ if match is None:
519
+ active.append((code, src, detail))
520
+ continue
521
+ i, w = match
522
+ try:
523
+ valid = datetime.date.fromisoformat(w.get("expires", "")) >= today
524
+ except ValueError:
525
+ valid = False
526
+ if valid:
527
+ if waived_out is not None:
528
+ waived_out.append({"code": code, "file": src, "detail": detail,
529
+ "reason": w.get("reason", ""), "approved_by": w.get("approved_by", ""),
530
+ "expires": w.get("expires", "")})
531
+ else:
532
+ active.append((code, src, detail))
533
+ if i not in flagged:
534
+ flagged.add(i)
535
+ active.append(("WAIVER-EXPIRED", "AUDIT-WAIVERS.yaml",
536
+ f"waiver for {code} ('{w.get('reason', 'no reason')}') expired or has no valid 'expires:' date — renew it or fix the violation"))
537
+ return active
538
+
539
+
540
+ def validate_run(run_dir: Path, protected=None, waived_out=None):
383
541
  """Validate one run directory (containing docs/BACKLOG.md, docs/HANDOFF.md)."""
384
542
  protected = list(protected or [])
385
543
  violations = []
@@ -392,26 +550,32 @@ def validate_run(run_dir: Path, protected=None):
392
550
  if not backlog.exists():
393
551
  violations.append(("MISSING-FILE", "docs/BACKLOG.md", "file not found"))
394
552
  if backlog.exists():
395
- text = backlog.read_text(encoding="utf-8")
396
- check_template_conformance(text, violations, "BACKLOG.md")
397
- check_secrets(text, violations, "BACKLOG.md")
398
- check_approvals(text, violations, "BACKLOG.md")
399
- for header, rows, end in parse_tables(text):
400
- if col(header, "metric") is not None and col(header, "final") is None:
401
- check_benchmark_like_table(header, rows, end, text, violations, "BACKLOG.md", is_handoff=False)
402
- elif col(header, "status") is not None and col(header, "id") is not None:
403
- check_task_table(header, rows, violations, "BACKLOG.md", protected)
553
+ text = read_artifact(backlog, violations, "BACKLOG.md")
554
+ if text is not None:
555
+ check_template_conformance(text, violations, "BACKLOG.md")
556
+ check_secrets(text, violations, "BACKLOG.md")
557
+ check_approvals(text, violations, "BACKLOG.md")
558
+ for header, rows, end in parse_tables(text):
559
+ # Architect review F-2: a metric table in the BACKLOG is ALWAYS
560
+ # checked — column shape selects semantics; it never disables
561
+ # the check (the `Final`-column escape hatch is closed).
562
+ if col(header, "metric") is not None:
563
+ handoff_shaped = col(header, "final") is not None and col(header, "status") is not None
564
+ check_benchmark_like_table(header, rows, end, text, violations, "BACKLOG.md", is_handoff=handoff_shaped)
565
+ elif col(header, "status") is not None and col(header, "id") is not None:
566
+ check_task_table(header, rows, violations, "BACKLOG.md", protected)
404
567
  if handoff.exists():
405
- text = handoff.read_text(encoding="utf-8")
406
- check_secrets(text, violations, "HANDOFF.md")
407
- if not STOP_RE.search(text):
408
- violations.append(("P5-NO-STOP-REASON", "HANDOFF.md", "no 'Stop condition: (a|b|c)' line"))
409
- for header, rows, end in parse_tables(text):
410
- if col(header, "metric") is not None:
411
- check_benchmark_like_table(header, rows, end, text, violations, "HANDOFF.md", is_handoff=(col(header, "final") is not None))
412
- elif col(header, "status") is not None and col(header, "id") is not None:
413
- check_task_table(header, rows, violations, "HANDOFF.md", protected)
414
- return violations
568
+ text = read_artifact(handoff, violations, "HANDOFF.md")
569
+ if text is not None:
570
+ check_secrets(text, violations, "HANDOFF.md")
571
+ if not STOP_RE.search(text):
572
+ violations.append(("P5-NO-STOP-REASON", "HANDOFF.md", "no 'Stop condition: (a|b|c)' line"))
573
+ for header, rows, end in parse_tables(text):
574
+ if col(header, "metric") is not None:
575
+ check_benchmark_like_table(header, rows, end, text, violations, "HANDOFF.md", is_handoff=(col(header, "final") is not None))
576
+ elif col(header, "status") is not None and col(header, "id") is not None:
577
+ check_task_table(header, rows, violations, "HANDOFF.md", protected)
578
+ return apply_waivers(docs, violations, waived_out)
415
579
 
416
580
 
417
581
  LOOP_HEAD_RE = re.compile(r"^Loop\s+(\d+)\s*(?:—|-)?\s*(.*)$")
@@ -446,11 +610,13 @@ def build_report(run_dir: Path, protected, violations):
446
610
  for sec in re.split(r"(?m)^##\s+(?=Loop\b)", text)[1:]:
447
611
  first = (sec.splitlines() or [""])[0]
448
612
  hm = LOOP_HEAD_RE.match(first.strip())
449
- mode_m = re.search(r"(?mi)^\s*-?\s*Mode:\s*(\S+)", sec)
613
+ mode_m = re.search(r"(?mi)(?:^|\|)\s*-?\s*Mode:\s*(\S+)", sec)
614
+ proto_m = PROTO_LINE_RE.search(sec)
450
615
  appr_m = APPROVED_RE.search(sec)
451
616
  loop = {
452
617
  "loop": int(hm.group(1)) if hm else None,
453
618
  "date": (hm.group(2).strip() or None) if hm else None,
619
+ "protocol": f"v{'.'.join(proto_m.groups())}" if proto_m else None,
454
620
  "mode": mode_m.group(1) if mode_m else None,
455
621
  "approved": ([] if appr_m.group(1).strip().lower() == "none"
456
622
  else [norm(x) for x in appr_m.group(1).split(",") if x.strip()]) if appr_m else None,
@@ -485,10 +651,17 @@ def build_report(run_dir: Path, protected, violations):
485
651
  def cell(ix):
486
652
  return r[ix] if ix is not None and ix < len(r) else ""
487
653
  if cell(mi):
488
- report["metrics"].append({
654
+ entry = {
489
655
  "metric": cell(mi), "baseline": cell(bi), "final": cell(fi),
490
656
  "delta": cell(di), "target": cell(ti), "verify": cell(vi), "status": cell(si),
491
- })
657
+ }
658
+ # Computed delta when both ends parse as numbers (review §2):
659
+ # the reported Delta cell is echoed, never trusted as math.
660
+ num = lambda s: (re.search(r"-?\d+(?:\.\d+)?", s) or [None]) and re.search(r"-?\d+(?:\.\d+)?", s) # noqa: E731
661
+ b_m, f_m = num(entry["baseline"]), num(entry["final"])
662
+ if b_m and f_m:
663
+ entry["delta_computed"] = round(float(f_m.group(0)) - float(b_m.group(0)), 6)
664
+ report["metrics"].append(entry)
492
665
  tasks = [t for l in report["loops"] for t in l["tasks"]]
493
666
  by = lambda key: {k: sum(1 for t in tasks if t[key] == k) # noqa: E731
494
667
  for k in sorted({t[key] for t in tasks if t[key]})}
@@ -552,6 +725,8 @@ def load_fixture_meta(fdir: Path):
552
725
  meta[k] = [x.strip() for x in v.strip("[]").split(",") if x.strip()]
553
726
  elif k in ("id", "expect", "description"):
554
727
  meta[k] = v
728
+ if meta["expect"] not in ("pass", "fail"): # F-6: a typo must not silently change semantics
729
+ raise SystemExit(f"fixture {fdir.name}: expect '{meta['expect']}' must be 'pass' or 'fail'")
555
730
  return meta
556
731
 
557
732
 
@@ -616,6 +791,38 @@ def run_all(as_json=False):
616
791
  return 0 if ok else 1
617
792
 
618
793
 
794
def aggregate_reports(paths):
    """Portfolio roll-up over per-run report JSONs (architect review §3.2).

    Args:
        paths: iterable of paths to per-run ``--report json`` files.

    Returns:
        A dict with ``schema``, ``generated_at`` (UTC ISO-8601, seconds
        precision), ``runs`` (one row per input, in input order) and
        ``totals`` (counts summed over all runs; the per-code and
        per-severity breakdowns are sorted by key).

    Raises:
        ValueError: a file is not valid JSON, or its top level / ``summary``
            is not a JSON object -- e.g. the plain ``--json`` violation
            list was passed instead of a ``--report json`` export.
        OSError: a file cannot be read.
    """
    import datetime
    from collections import Counter

    runs = []
    by_code, by_sev = Counter(), Counter()
    for p in paths:
        r = json.loads(Path(p).read_text(encoding="utf-8"))
        if not isinstance(r, dict):
            # Fail with the offending path instead of an opaque AttributeError.
            raise ValueError(f"{p}: not a report JSON object (got {type(r).__name__})")
        # A JSON null is treated like a missing key throughout, so one
        # sparse report cannot crash the whole roll-up.
        s = r.get("summary") or {}
        if not isinstance(s, dict):
            raise ValueError(f"{p}: 'summary' must be a JSON object (got {type(s).__name__})")
        runs.append({
            "run_dir": r.get("run_dir"), "protocol_version": r.get("protocol_version"),
            "clean": s.get("clean"), "violations": s.get("violations") or 0,
            "waived": len(r.get("waived") or []), "tasks": s.get("tasks") or 0,
            "loops": s.get("loops") or 0,
        })
        by_code.update(s.get("violations_by_code") or {})
        by_sev.update(s.get("tasks_by_severity") or {})
    return {
        "schema": "code-audit-framework/portfolio@1",
        "generated_at": datetime.datetime.now(datetime.timezone.utc).isoformat(timespec="seconds"),
        "runs": runs,
        "totals": {
            "runs": len(runs),
            "clean_runs": sum(1 for r in runs if r["clean"]),
            "violations": sum(r["violations"] for r in runs),
            "waived": sum(r["waived"] for r in runs),
            "violations_by_code": dict(sorted(by_code.items())),
            "tasks_by_severity": dict(sorted(by_sev.items())),
        },
    }
824
+
825
+
619
826
  def main():
620
827
  ap = argparse.ArgumentParser()
621
828
  ap.add_argument("--run", help="validate one run directory (containing docs/)")
@@ -623,22 +830,51 @@ def main():
623
830
  ap.add_argument("--json", action="store_true")
624
831
  ap.add_argument("--report", choices=["json", "sarif"],
625
832
  help="with --run: emit a structured findings report instead of plain violations")
833
+ ap.add_argument("--aggregate", nargs="+", metavar="REPORT_JSON",
834
+ help="portfolio roll-up over per-run --report json files")
626
835
  ap.add_argument("--protected", default="", help="comma-separated protected paths (extends the target AUDIT.md's PROTECTED_AREAS)")
627
836
  args = ap.parse_args()
628
837
  if args.all:
629
838
  sys.exit(run_all(as_json=args.json))
839
+ if args.aggregate:
840
+ missing = [p for p in args.aggregate if not Path(p).is_file()]
841
+ if missing:
842
+ print(f"usage error: report file(s) not found: {', '.join(missing)}", file=sys.stderr)
843
+ sys.exit(2)
844
+ agg = aggregate_reports(args.aggregate)
845
+ if args.json:
846
+ print(json.dumps(agg, indent=2))
847
+ else:
848
+ t = agg["totals"]
849
+ print(f"{'Run':40s} {'proto':8s} {'clean':5s} {'viol':>4s} {'waived':>6s} {'tasks':>5s}")
850
+ for r in agg["runs"]:
851
+ print(f"{str(r['run_dir'])[:40]:40s} {str(r['protocol_version']):8s} "
852
+ f"{'yes' if r['clean'] else 'NO':5s} {r['violations']:4d} {r['waived']:6d} {r['tasks']:5d}")
853
+ print(f"\n{t['clean_runs']}/{t['runs']} runs clean — {t['violations']} active violation(s), "
854
+ f"{t['waived']} waived — by code: {t['violations_by_code'] or '{}'}")
855
+ sys.exit(0)
630
856
  if args.run:
857
+ run_path = Path(args.run)
858
+ if not run_path.exists():
859
+ print(f"usage error: --run path does not exist: {run_path}", file=sys.stderr)
860
+ sys.exit(2)
631
861
  protected = [p for p in args.protected.split(",") if p]
632
- v = validate_run(Path(args.run), protected=protected)
862
+ waived = []
863
+ v = validate_run(run_path, protected=protected, waived_out=waived)
633
864
  if args.report:
634
- report = build_report(Path(args.run), protected, v)
865
+ report = build_report(run_path, protected, v)
866
+ report["waived"] = waived
867
+ report["summary"]["waived"] = len(waived)
635
868
  print(json.dumps(to_sarif(report) if args.report == "sarif" else report, indent=2))
636
869
  elif args.json:
637
870
  print(json.dumps([{"code": c, "file": s, "detail": d} for c, s, d in v], indent=2))
638
871
  else:
639
872
  for c, s, d in v:
640
873
  print(f"VIOLATION {c} [{s}] {d}")
641
- print("CLEAN no violations" if not v else f"{len(v)} violation(s)")
874
+ for w in waived:
875
+ print(f"WAIVED {w['code']} [{w['file']}] until {w['expires']} — {w['reason']} (approved by {w['approved_by']})")
876
+ tail = "CLEAN — no violations" if not v else f"{len(v)} violation(s)"
877
+ print(tail + (f" ({len(waived)} waived)" if waived else ""))
642
878
  sys.exit(0 if not v else 1)
643
879
  ap.print_help()
644
880
  sys.exit(2)
@@ -19,7 +19,7 @@ build-backend = "setuptools.build_meta"
19
19
 
20
20
  [project]
21
21
  name = "code-audit-validator"
22
- version = "1.2.0"
22
+ version = "1.3.0"
23
23
  description = "Deterministic conformance checker for AUDIT.md agent-audit outputs — CyberSkill code-audit-framework"
24
24
  readme = { file = "evals/README.md", content-type = "text/markdown" }
25
25
  license = { file = "LICENSE" }