devlyn-cli 2.3.0 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +1 -1
- package/CLAUDE.md +2 -2
- package/README.md +82 -29
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
- package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
- package/benchmark/auto-resolve/README.md +307 -44
- package/benchmark/auto-resolve/RUBRIC.md +23 -14
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
- package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
- package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
- package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
- package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
- package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
- package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
- package/benchmark/auto-resolve/scripts/judge.sh +153 -26
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
- package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
- package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
- package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
- package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
- package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
- package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
- package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
- package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
- package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
- package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
- package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
- package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
- package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
- package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
- package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
- package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
- package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
- package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
- package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
- package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
- package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
- package/bin/devlyn.js +211 -18
- package/config/skills/_shared/adapters/README.md +3 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
- package/config/skills/_shared/adapters/opus-4-7.md +9 -1
- package/config/skills/_shared/archive_run.py +78 -6
- package/config/skills/_shared/codex-config.md +3 -2
- package/config/skills/_shared/codex-monitored.sh +46 -1
- package/config/skills/_shared/collect-codex-findings.py +20 -5
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/runtime-principles.md +5 -8
- package/config/skills/_shared/spec-verify-check.py +2664 -107
- package/config/skills/_shared/verify-merge-findings.py +1369 -19
- package/config/skills/devlyn:ideate/SKILL.md +7 -4
- package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
- package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
- package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
- package/config/skills/devlyn:resolve/SKILL.md +49 -18
- package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
- package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
- package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
- package/package.json +47 -2
- package/scripts/lint-fixtures.sh +349 -0
- package/scripts/lint-shadow-fixtures.sh +58 -0
- package/scripts/lint-skills.sh +3642 -92
- /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
|
@@ -5,11 +5,20 @@ from __future__ import annotations
|
|
|
5
5
|
|
|
6
6
|
import argparse
|
|
7
7
|
import json
|
|
8
|
+
import math
|
|
8
9
|
import re
|
|
9
10
|
from collections import Counter
|
|
10
11
|
from pathlib import Path
|
|
11
12
|
from typing import Any
|
|
12
13
|
|
|
14
|
+
from pair_evidence_contract import (
|
|
15
|
+
all_known_pair_trigger_reasons,
|
|
16
|
+
has_canonical_pair_trigger_reason,
|
|
17
|
+
has_known_pair_trigger_reason,
|
|
18
|
+
loads_strict_json_object,
|
|
19
|
+
path_has_actionable_solo_headroom_hypothesis,
|
|
20
|
+
)
|
|
21
|
+
|
|
13
22
|
|
|
14
23
|
RANK = {
|
|
15
24
|
"PASS": 0,
|
|
@@ -24,7 +33,29 @@ def rank(verdict: str | None) -> int:
|
|
|
24
33
|
|
|
25
34
|
|
|
26
35
|
def load_json(path: Path) -> dict[str, Any]:
|
|
27
|
-
|
|
36
|
+
try:
|
|
37
|
+
return loads_strict_json_object(path.read_text(encoding="utf8"))
|
|
38
|
+
except (json.JSONDecodeError, ValueError):
|
|
39
|
+
return {}
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def object_field(payload: dict[str, Any], key: str) -> dict[str, Any]:
|
|
43
|
+
value = payload.get(key)
|
|
44
|
+
return value if isinstance(value, dict) else {}
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def verdict_field(payload: dict[str, Any], key: str) -> str | None:
|
|
48
|
+
value = payload.get(key)
|
|
49
|
+
return value if isinstance(value, str) else None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def number_field(payload: dict[str, Any], key: str) -> int | float | None:
|
|
53
|
+
value = payload.get(key)
|
|
54
|
+
if isinstance(value, bool):
|
|
55
|
+
return None
|
|
56
|
+
if not isinstance(value, (int, float)) or not math.isfinite(value):
|
|
57
|
+
return None
|
|
58
|
+
return value
|
|
28
59
|
|
|
29
60
|
|
|
30
61
|
def transcript_failure_reason(results_root: Path, run_id: str, arm: str) -> str | None:
|
|
@@ -51,16 +82,112 @@ def infer_fixture_id(results_root: Path, run_id: str) -> str:
|
|
|
51
82
|
def elapsed_ratio(pair_elapsed: Any, solo_elapsed: Any) -> float | None:
|
|
52
83
|
if not isinstance(pair_elapsed, (int, float)) or not isinstance(solo_elapsed, (int, float)):
|
|
53
84
|
return None
|
|
54
|
-
if solo_elapsed <= 0:
|
|
85
|
+
if pair_elapsed <= 0 or solo_elapsed <= 0:
|
|
55
86
|
return None
|
|
56
87
|
return pair_elapsed / solo_elapsed
|
|
57
88
|
|
|
58
89
|
|
|
90
|
+
def is_true(value: Any) -> bool:
|
|
91
|
+
return value is True
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def pair_trigger_failures(
|
|
95
|
+
pair: dict[str, Any],
|
|
96
|
+
*,
|
|
97
|
+
fixture_spec: Path | None = None,
|
|
98
|
+
require_hypothesis_trigger: bool = False,
|
|
99
|
+
) -> list[str]:
|
|
100
|
+
trigger = pair.get("pair_trigger")
|
|
101
|
+
if not isinstance(trigger, dict):
|
|
102
|
+
return ["pair_trigger missing or malformed"]
|
|
103
|
+
eligible = trigger.get("eligible")
|
|
104
|
+
reasons = trigger.get("reasons")
|
|
105
|
+
skipped_reason = trigger.get("skipped_reason")
|
|
106
|
+
if not isinstance(eligible, bool):
|
|
107
|
+
return ["pair_trigger.eligible malformed"]
|
|
108
|
+
if not isinstance(reasons, list) or not all(isinstance(reason, str) for reason in reasons):
|
|
109
|
+
return ["pair_trigger.reasons malformed"]
|
|
110
|
+
if skipped_reason is not None and not isinstance(skipped_reason, str):
|
|
111
|
+
return ["pair_trigger.skipped_reason malformed"]
|
|
112
|
+
if eligible is True:
|
|
113
|
+
failures = []
|
|
114
|
+
if not reasons:
|
|
115
|
+
failures.append("pair_trigger eligible with empty reasons")
|
|
116
|
+
if reasons and not has_known_pair_trigger_reason(reasons):
|
|
117
|
+
failures.append("pair_trigger reasons missing known trigger reason")
|
|
118
|
+
if (
|
|
119
|
+
reasons
|
|
120
|
+
and has_known_pair_trigger_reason(reasons)
|
|
121
|
+
and not all_known_pair_trigger_reasons(reasons)
|
|
122
|
+
):
|
|
123
|
+
failures.append("pair_trigger reasons contain unknown trigger reason")
|
|
124
|
+
if (
|
|
125
|
+
reasons
|
|
126
|
+
and all_known_pair_trigger_reasons(reasons)
|
|
127
|
+
and not has_canonical_pair_trigger_reason(reasons)
|
|
128
|
+
):
|
|
129
|
+
failures.append("pair_trigger reasons missing canonical trigger reason")
|
|
130
|
+
if skipped_reason is not None:
|
|
131
|
+
failures.append("pair_trigger eligible with skipped_reason")
|
|
132
|
+
if (
|
|
133
|
+
require_hypothesis_trigger
|
|
134
|
+
and fixture_spec is not None
|
|
135
|
+
and path_has_actionable_solo_headroom_hypothesis(fixture_spec)
|
|
136
|
+
and "spec.solo_headroom_hypothesis" not in reasons
|
|
137
|
+
):
|
|
138
|
+
failures.append("pair_trigger missing spec.solo_headroom_hypothesis")
|
|
139
|
+
return failures
|
|
140
|
+
if reasons:
|
|
141
|
+
return ["pair_trigger ineligible with reasons"]
|
|
142
|
+
return []
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def pair_trigger_eligible(pair: dict[str, Any]) -> bool:
|
|
146
|
+
trigger = pair.get("pair_trigger")
|
|
147
|
+
return (
|
|
148
|
+
isinstance(trigger, dict)
|
|
149
|
+
and trigger.get("eligible") is True
|
|
150
|
+
and isinstance(trigger.get("reasons"), list)
|
|
151
|
+
and all(isinstance(reason, str) for reason in trigger["reasons"])
|
|
152
|
+
and len(trigger["reasons"]) > 0
|
|
153
|
+
and all_known_pair_trigger_reasons(trigger["reasons"])
|
|
154
|
+
and has_canonical_pair_trigger_reason(trigger["reasons"])
|
|
155
|
+
and trigger.get("skipped_reason") is None
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def pair_trigger_reasons(pair: dict[str, Any]) -> list[str]:
|
|
160
|
+
trigger = pair.get("pair_trigger")
|
|
161
|
+
if not isinstance(trigger, dict):
|
|
162
|
+
return []
|
|
163
|
+
reasons = trigger.get("reasons")
|
|
164
|
+
if not isinstance(reasons, list) or not all(isinstance(reason, str) for reason in reasons):
|
|
165
|
+
return []
|
|
166
|
+
return reasons
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def pair_trigger_label(row: dict[str, Any]) -> str:
|
|
170
|
+
if row["pair_trigger_missed"]:
|
|
171
|
+
return "missed"
|
|
172
|
+
failures = row.get("pair_trigger_failures") or []
|
|
173
|
+
if failures:
|
|
174
|
+
return "malformed"
|
|
175
|
+
if row["pair_trigger_eligible"]:
|
|
176
|
+
return "eligible"
|
|
177
|
+
return "not_eligible"
|
|
178
|
+
|
|
179
|
+
|
|
59
180
|
def load_gate_rows(gate_json: Path | None) -> dict[str, dict[str, Any]]:
|
|
60
181
|
if gate_json is None:
|
|
61
182
|
return {}
|
|
62
183
|
doc = load_json(gate_json)
|
|
63
|
-
|
|
184
|
+
rows = doc.get("rows")
|
|
185
|
+
if not isinstance(rows, list):
|
|
186
|
+
return {}
|
|
187
|
+
return {
|
|
188
|
+
row["run_id"]: row for row in rows
|
|
189
|
+
if isinstance(row, dict) and isinstance(row.get("run_id"), str)
|
|
190
|
+
}
|
|
64
191
|
|
|
65
192
|
|
|
66
193
|
def min_gate_rate(value: str) -> float:
|
|
@@ -98,8 +225,21 @@ def classify(row: dict[str, Any], included: bool) -> str:
|
|
|
98
225
|
return "failed attempt: timeout"
|
|
99
226
|
if row.get("solo_failure_reason") == "provider_limit" or row.get("pair_failure_reason") == "provider_limit":
|
|
100
227
|
return "failed attempt: provider limit"
|
|
228
|
+
if row.get("solo_environment_contamination") or row.get("pair_environment_contamination"):
|
|
229
|
+
return "failed attempt: environment contamination"
|
|
230
|
+
if row.get("solo_disqualifier") or row.get("pair_disqualifier"):
|
|
231
|
+
return "failed attempt: disqualifier"
|
|
232
|
+
if row.get("solo_invoke_failure") or row.get("pair_invoke_failure"):
|
|
233
|
+
return "failed attempt: invoke failure"
|
|
101
234
|
if row.get("solo_invoke_exit") not in (None, 0) or row.get("pair_invoke_exit") not in (None, 0):
|
|
102
235
|
return "failed attempt: nonzero invoke exit"
|
|
236
|
+
if row.get("malformed_compare"):
|
|
237
|
+
return "failed attempt: malformed compare"
|
|
238
|
+
if row.get("pair_trigger_missed"):
|
|
239
|
+
return "failed attempt: pair trigger missed"
|
|
240
|
+
trigger_failures = row.get("pair_trigger_failures") or []
|
|
241
|
+
if trigger_failures:
|
|
242
|
+
return "failed attempt: pair trigger contract: " + "; ".join(trigger_failures)
|
|
103
243
|
if row["solo_mechanical"] == "FAIL":
|
|
104
244
|
return "excluded: solo mechanical dominated"
|
|
105
245
|
if row["external_lift"] or row["internal_lift"]:
|
|
@@ -114,47 +254,95 @@ def classify(row: dict[str, Any], included: bool) -> str:
|
|
|
114
254
|
return "no verdict lift"
|
|
115
255
|
|
|
116
256
|
|
|
117
|
-
def build_row(
|
|
257
|
+
def build_row(
|
|
258
|
+
results_root: Path,
|
|
259
|
+
run_id: str,
|
|
260
|
+
gate_rows_by_id: dict[str, dict[str, Any]],
|
|
261
|
+
*,
|
|
262
|
+
fixtures_root: Path | None,
|
|
263
|
+
require_hypothesis_trigger: bool,
|
|
264
|
+
) -> dict[str, Any]:
|
|
118
265
|
compare_path = results_root / run_id / "compare.json"
|
|
266
|
+
malformed_compare = False
|
|
119
267
|
if compare_path.exists():
|
|
120
268
|
compare = load_json(compare_path)
|
|
269
|
+
malformed_compare = not bool(compare)
|
|
121
270
|
else:
|
|
122
271
|
compare = {
|
|
123
272
|
"solo": {},
|
|
124
273
|
"pair": {},
|
|
125
274
|
"comparison": {"compare_missing": True},
|
|
126
275
|
}
|
|
127
|
-
solo = compare
|
|
128
|
-
pair = compare
|
|
129
|
-
comparison = compare
|
|
130
|
-
|
|
276
|
+
solo = object_field(compare, "solo")
|
|
277
|
+
pair = object_field(compare, "pair")
|
|
278
|
+
comparison = object_field(compare, "comparison")
|
|
279
|
+
malformed_compare = malformed_compare or any(
|
|
280
|
+
key in compare and not isinstance(compare.get(key), dict)
|
|
281
|
+
for key in ("solo", "pair", "comparison")
|
|
282
|
+
)
|
|
283
|
+
pair_ratio = elapsed_ratio(
|
|
284
|
+
number_field(pair, "elapsed_seconds"),
|
|
285
|
+
number_field(solo, "elapsed_seconds"),
|
|
286
|
+
)
|
|
131
287
|
gate_row = gate_rows_by_id.get(run_id) or {}
|
|
288
|
+
solo_verdict = (
|
|
289
|
+
verdict_field(comparison, "solo_verdict")
|
|
290
|
+
or verdict_field(solo, "verify_verdict")
|
|
291
|
+
)
|
|
292
|
+
pair_verdict = (
|
|
293
|
+
verdict_field(comparison, "pair_verdict")
|
|
294
|
+
or verdict_field(pair, "verify_verdict")
|
|
295
|
+
)
|
|
296
|
+
solo_sub = object_field(solo, "sub_verdicts")
|
|
297
|
+
pair_sub = object_field(pair, "sub_verdicts")
|
|
298
|
+
fixture_id = infer_fixture_id(results_root, run_id)
|
|
299
|
+
fixture_spec = None
|
|
300
|
+
if fixtures_root is not None and fixture_id != "unknown":
|
|
301
|
+
fixture_spec = fixtures_root / fixture_id / "spec.md"
|
|
302
|
+
trigger_failures = pair_trigger_failures(
|
|
303
|
+
pair,
|
|
304
|
+
fixture_spec=fixture_spec,
|
|
305
|
+
require_hypothesis_trigger=require_hypothesis_trigger,
|
|
306
|
+
)
|
|
307
|
+
trigger_reasons = pair_trigger_reasons(pair)
|
|
132
308
|
row = {
|
|
133
|
-
"fixture_id":
|
|
309
|
+
"fixture_id": fixture_id,
|
|
134
310
|
"run_id": run_id,
|
|
135
|
-
"solo_verdict":
|
|
136
|
-
"pair_verdict":
|
|
137
|
-
"pair_mode":
|
|
138
|
-
"
|
|
139
|
-
"
|
|
140
|
-
"
|
|
141
|
-
"
|
|
142
|
-
"
|
|
311
|
+
"solo_verdict": solo_verdict,
|
|
312
|
+
"pair_verdict": pair_verdict,
|
|
313
|
+
"pair_mode": is_true(pair.get("pair_mode")),
|
|
314
|
+
"pair_trigger_eligible": pair_trigger_eligible(pair),
|
|
315
|
+
"pair_trigger_reasons": trigger_reasons,
|
|
316
|
+
"pair_trigger_has_canonical_reason": has_canonical_pair_trigger_reason(trigger_reasons),
|
|
317
|
+
"pair_trigger_missed": is_true(comparison.get("pair_trigger_missed")),
|
|
318
|
+
"pair_trigger_failures": trigger_failures,
|
|
319
|
+
"external_lift": is_true(comparison.get("pair_verdict_lift")),
|
|
320
|
+
"internal_lift": is_true(comparison.get("pair_internal_verdict_lift")),
|
|
321
|
+
"pair_found_more_findings": is_true(comparison.get("pair_found_more_findings")),
|
|
322
|
+
"pair_found_more_low_or_worse": is_true(comparison.get("pair_found_more_low_or_worse")),
|
|
323
|
+
"row_failed_before_compare": is_true(comparison.get("row_failed_before_compare")),
|
|
143
324
|
"row_exit": comparison.get("row_exit"),
|
|
144
|
-
"compare_missing":
|
|
325
|
+
"compare_missing": is_true(comparison.get("compare_missing")),
|
|
145
326
|
"solo_invoke_exit": solo.get("invoke_exit"),
|
|
146
327
|
"pair_invoke_exit": pair.get("invoke_exit"),
|
|
147
328
|
"solo_failure_reason": solo.get("invoke_failure_reason")
|
|
148
329
|
or transcript_failure_reason(results_root, run_id, "solo"),
|
|
149
330
|
"pair_failure_reason": pair.get("invoke_failure_reason")
|
|
150
331
|
or transcript_failure_reason(results_root, run_id, "pair"),
|
|
151
|
-
"
|
|
152
|
-
"
|
|
332
|
+
"solo_invoke_failure": is_true(solo.get("invoke_failure")),
|
|
333
|
+
"pair_invoke_failure": is_true(pair.get("invoke_failure")),
|
|
334
|
+
"solo_environment_contamination": is_true(solo.get("environment_contamination")),
|
|
335
|
+
"pair_environment_contamination": is_true(pair.get("environment_contamination")),
|
|
336
|
+
"solo_disqualifier": is_true(solo.get("disqualifier")),
|
|
337
|
+
"pair_disqualifier": is_true(pair.get("disqualifier")),
|
|
338
|
+
"solo_timed_out": is_true(solo.get("timed_out")),
|
|
339
|
+
"pair_timed_out": is_true(pair.get("timed_out")),
|
|
153
340
|
"pair_solo_wall_ratio": pair_ratio,
|
|
154
|
-
"solo_mechanical": (
|
|
155
|
-
"pair_mechanical": (
|
|
341
|
+
"solo_mechanical": verdict_field(solo_sub, "mechanical"),
|
|
342
|
+
"pair_mechanical": verdict_field(pair_sub, "mechanical"),
|
|
156
343
|
"included_in_gate": gate_row.get("status") == "PASS",
|
|
157
344
|
"gate_failures": gate_row.get("failures") or [],
|
|
345
|
+
"malformed_compare": malformed_compare,
|
|
158
346
|
}
|
|
159
347
|
row["classification"] = classify(row, row["included_in_gate"])
|
|
160
348
|
return row
|
|
@@ -164,6 +352,12 @@ def fmt_ratio(value: Any) -> str:
|
|
|
164
352
|
return f"{value:.2f}x" if isinstance(value, (int, float)) else "n/a"
|
|
165
353
|
|
|
166
354
|
|
|
355
|
+
def fmt_trigger_reasons(value: Any) -> str:
    """Format pair-trigger reasons for a single report table cell.

    Returns the items joined with commas when ``value`` is a list made up
    entirely of strings; any other input yields an empty string.
    """
    if isinstance(value, list):
        if all(isinstance(item, str) for item in value):
            return ",".join(value)
    return ""
|
|
359
|
+
|
|
360
|
+
|
|
167
361
|
def write_md(path: Path, report: dict[str, Any]) -> None:
|
|
168
362
|
lines = [
|
|
169
363
|
f"# {report['title']}",
|
|
@@ -189,14 +383,16 @@ def write_md(path: Path, report: dict[str, Any]) -> None:
|
|
|
189
383
|
lines.extend(
|
|
190
384
|
[
|
|
191
385
|
"",
|
|
192
|
-
"| Fixture | Solo | Pair | Pair mode | Wall ratio | External lift | Internal lift | Included | Classification |",
|
|
193
|
-
"
|
|
386
|
+
"| Fixture | Solo VERIFY | Pair VERIFY | Pair mode | Pair trigger | Triggers | Wall ratio | External lift | Internal lift | Included | Classification |",
|
|
387
|
+
"|---|---|---|---|---|---|---:|---|---|---|---|",
|
|
194
388
|
]
|
|
195
389
|
)
|
|
196
390
|
for row in report["rows"]:
|
|
197
391
|
lines.append(
|
|
198
392
|
f"| {row['fixture_id']} | {row['solo_verdict']} | {row['pair_verdict']} | "
|
|
199
|
-
f"{str(row['pair_mode']).lower()} | {
|
|
393
|
+
f"{str(row['pair_mode']).lower()} | {pair_trigger_label(row)} | "
|
|
394
|
+
f"{fmt_trigger_reasons(row.get('pair_trigger_reasons'))} | "
|
|
395
|
+
f"{fmt_ratio(row.get('pair_solo_wall_ratio'))} | "
|
|
200
396
|
f"{str(row['external_lift']).lower()} | {str(row['internal_lift']).lower()} | "
|
|
201
397
|
f"{str(row['included_in_gate']).lower()} | {row['classification']} |"
|
|
202
398
|
)
|
|
@@ -207,18 +403,35 @@ def write_md(path: Path, report: dict[str, Any]) -> None:
|
|
|
207
403
|
def main() -> int:
|
|
208
404
|
parser = argparse.ArgumentParser()
|
|
209
405
|
parser.add_argument("--results-root", default="benchmark/auto-resolve/results", type=Path)
|
|
406
|
+
parser.add_argument("--fixtures-root", type=Path)
|
|
210
407
|
parser.add_argument("--run-id", action="append", required=True)
|
|
211
408
|
parser.add_argument("--gate-json", type=Path)
|
|
212
409
|
parser.add_argument("--title", required=True)
|
|
213
410
|
parser.add_argument("--verdict", required=True)
|
|
214
411
|
parser.add_argument("--min-gate-rate", type=min_gate_rate)
|
|
215
412
|
parser.add_argument("--max-trailing-non-gate", type=non_negative_int)
|
|
413
|
+
parser.add_argument(
|
|
414
|
+
"--require-hypothesis-trigger",
|
|
415
|
+
action="store_true",
|
|
416
|
+
help="require fixtures with actionable solo-headroom hypotheses to expose spec.solo_headroom_hypothesis in pair_trigger.reasons",
|
|
417
|
+
)
|
|
216
418
|
parser.add_argument("--out-json", required=True, type=Path)
|
|
217
419
|
parser.add_argument("--out-md", required=True, type=Path)
|
|
218
420
|
args = parser.parse_args()
|
|
421
|
+
if args.require_hypothesis_trigger and args.fixtures_root is None:
|
|
422
|
+
parser.error("--require-hypothesis-trigger requires --fixtures-root")
|
|
219
423
|
|
|
220
424
|
gate_rows_by_id = load_gate_rows(args.gate_json)
|
|
221
|
-
rows = [
|
|
425
|
+
rows = [
|
|
426
|
+
build_row(
|
|
427
|
+
args.results_root,
|
|
428
|
+
run_id,
|
|
429
|
+
gate_rows_by_id,
|
|
430
|
+
fixtures_root=args.fixtures_root,
|
|
431
|
+
require_hypothesis_trigger=args.require_hypothesis_trigger,
|
|
432
|
+
)
|
|
433
|
+
for run_id in args.run_id
|
|
434
|
+
]
|
|
222
435
|
gate_rows = sum(1 for row in rows if row["included_in_gate"])
|
|
223
436
|
trailing_non_gate_rows = 0
|
|
224
437
|
for row in reversed(rows):
|
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Regression tests for audit-headroom-rejections.py.
|
|
3
|
+
|
|
4
|
+
set -euo pipefail
|
|
5
|
+
|
|
6
|
+
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
|
7
|
+
SCRIPT="$SCRIPT_DIR/audit-headroom-rejections.py"
|
|
8
|
+
TMP_DIR="$(mktemp -d /tmp/audit-headroom-rejections-test.XXXXXX)"
|
|
9
|
+
trap 'rm -rf "$TMP_DIR"' EXIT
|
|
10
|
+
|
|
11
|
+
fixtures="$TMP_DIR/fixtures"
|
|
12
|
+
results="$TMP_DIR/results"
|
|
13
|
+
registry="$TMP_DIR/pair-rejected-fixtures.sh"
|
|
14
|
+
mkdir -p "$fixtures/F16-cli-quote-tax-rules" \
|
|
15
|
+
"$fixtures/F33-cli-new-candidate" \
|
|
16
|
+
"$fixtures/F34-cli-rejected-candidate" \
|
|
17
|
+
"$fixtures/F35-cli-missing-judge" \
|
|
18
|
+
"$fixtures/F36-unsupported-rejection" \
|
|
19
|
+
"$results/old-f16" \
|
|
20
|
+
"$results/f33-headroom" \
|
|
21
|
+
"$results/f33-weak-pair-pass" \
|
|
22
|
+
"$results/f34-headroom" \
|
|
23
|
+
"$results/f35-missing-judge" \
|
|
24
|
+
"$results/20260512-f36-headroom" \
|
|
25
|
+
"$results/bad-json-headroom" \
|
|
26
|
+
"$results/malformed-headroom" \
|
|
27
|
+
"$results/f16-pair-pass"
|
|
28
|
+
|
|
29
|
+
cat > "$registry" <<'SH'
|
|
30
|
+
rejected_pair_fixture_reason() {
|
|
31
|
+
local fid="$1"
|
|
32
|
+
case "$fid" in
|
|
33
|
+
F34-*|F34)
|
|
34
|
+
echo "measured solo ceiling"
|
|
35
|
+
;;
|
|
36
|
+
F36-*|F36)
|
|
37
|
+
echo "bare 33 / solo_claude 98 in 20260512-missing-headroom"
|
|
38
|
+
;;
|
|
39
|
+
*)
|
|
40
|
+
return 1
|
|
41
|
+
;;
|
|
42
|
+
esac
|
|
43
|
+
}
|
|
44
|
+
SH
|
|
45
|
+
|
|
46
|
+
s_only_registry="$TMP_DIR/s-only-registry.sh"
|
|
47
|
+
cat > "$s_only_registry" <<'SH'
|
|
48
|
+
rejected_pair_fixture_reason() {
|
|
49
|
+
local fid="$1"
|
|
50
|
+
case "$fid" in
|
|
51
|
+
S3-*|S3)
|
|
52
|
+
echo "shadow solo ceiling"
|
|
53
|
+
;;
|
|
54
|
+
*)
|
|
55
|
+
return 1
|
|
56
|
+
;;
|
|
57
|
+
esac
|
|
58
|
+
}
|
|
59
|
+
SH
|
|
60
|
+
python3 - "$SCRIPT" "$s_only_registry" <<'PY'
|
|
61
|
+
import importlib.util
|
|
62
|
+
import pathlib
|
|
63
|
+
import sys
|
|
64
|
+
|
|
65
|
+
spec = importlib.util.spec_from_file_location("audit_headroom_rejections", sys.argv[1])
|
|
66
|
+
module = importlib.util.module_from_spec(spec)
|
|
67
|
+
assert spec.loader is not None
|
|
68
|
+
spec.loader.exec_module(module)
|
|
69
|
+
assert module.registry_short_ids(pathlib.Path(sys.argv[2])) == {"S3"}
|
|
70
|
+
PY
|
|
71
|
+
|
|
72
|
+
#######################################
# Write a failing headroom-gate.json with a single FAIL row for one run.
# Globals:   results (read) - root directory that holds per-run folders
# Arguments: $1 - run id (directory under $results, must already exist)
#            $2 - fixture id recorded in the row
#            $3 - bare score (emitted as a raw JSON number)
#            $4 - solo score (emitted as a raw JSON number and in the reason)
# Outputs:   overwrites $results/<run id>/headroom-gate.json
#######################################
write_headroom_fail() {
  local run_id="$1" fixture="$2" bare="$3" solo="$4"
  local gate_path="$results/$run_id/headroom-gate.json"

  # Unquoted delimiter on purpose: the arguments are expanded into the JSON.
  cat > "$gate_path" <<JSON
{
  "run_id": "$run_id",
  "verdict": "FAIL",
  "rows": [
    {
      "fixture": "$fixture",
      "status": "FAIL",
      "bare_score": $bare,
      "solo_score": $solo,
      "reason": "solo_claude score $solo > 80"
    }
  ]
}
JSON
}
|
|
93
|
+
|
|
94
|
+
write_headroom_fail old-f16 F16-cli-quote-tax-rules 50 98
|
|
95
|
+
write_headroom_fail f33-headroom F33-cli-new-candidate 33 98
|
|
96
|
+
write_headroom_fail f34-headroom F34-cli-rejected-candidate 33 98
|
|
97
|
+
|
|
98
|
+
cat > "$results/f35-missing-judge/headroom-gate.json" <<'JSON'
|
|
99
|
+
{
|
|
100
|
+
"run_id": "f35-missing-judge",
|
|
101
|
+
"verdict": "FAIL",
|
|
102
|
+
"rows": [
|
|
103
|
+
{
|
|
104
|
+
"fixture": "F35-cli-missing-judge",
|
|
105
|
+
"status": "MISSING_JUDGE",
|
|
106
|
+
"reason": "judge.json missing"
|
|
107
|
+
}
|
|
108
|
+
]
|
|
109
|
+
}
|
|
110
|
+
JSON
|
|
111
|
+
|
|
112
|
+
cat > "$results/malformed-headroom/headroom-gate.json" <<'JSON'
|
|
113
|
+
{
|
|
114
|
+
"run_id": "malformed-headroom",
|
|
115
|
+
"verdict": "FAIL",
|
|
116
|
+
"rows": []
|
|
117
|
+
}
|
|
118
|
+
JSON
|
|
119
|
+
|
|
120
|
+
printf '{not-json\n' > "$results/bad-json-headroom/headroom-gate.json"
|
|
121
|
+
|
|
122
|
+
cat > "$results/f16-pair-pass/full-pipeline-pair-gate.json" <<'JSON'
|
|
123
|
+
{
|
|
124
|
+
"run_id": "f16-pair-pass",
|
|
125
|
+
"verdict": "PASS",
|
|
126
|
+
"pair_arm": "l2_risk_probes",
|
|
127
|
+
"rows": [
|
|
128
|
+
{
|
|
129
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
130
|
+
"status": "PASS",
|
|
131
|
+
"bare_score": 50,
|
|
132
|
+
"solo_score": 75,
|
|
133
|
+
"pair_score": 96,
|
|
134
|
+
"pair_margin": 21,
|
|
135
|
+
"pair_mode": true,
|
|
136
|
+
"pair_trigger_eligible": true,
|
|
137
|
+
"pair_solo_wall_ratio": 1.28
|
|
138
|
+
}
|
|
139
|
+
]
|
|
140
|
+
}
|
|
141
|
+
JSON
|
|
142
|
+
mkdir -p "$results/f16-pair-pass/F16-cli-quote-tax-rules/l2_risk_probes"
|
|
143
|
+
cat > "$results/f16-pair-pass/F16-cli-quote-tax-rules/l2_risk_probes/result.json" <<'JSON'
|
|
144
|
+
{
|
|
145
|
+
"pair_trigger": {
|
|
146
|
+
"eligible": true,
|
|
147
|
+
"reasons": ["complexity.high"],
|
|
148
|
+
"skipped_reason": null
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
JSON
|
|
152
|
+
python3 - "$SCRIPT" "$results" <<'PY'
|
|
153
|
+
import importlib.util
|
|
154
|
+
import pathlib
|
|
155
|
+
import sys
|
|
156
|
+
|
|
157
|
+
spec = importlib.util.spec_from_file_location("audit_headroom_rejections", sys.argv[1])
|
|
158
|
+
module = importlib.util.module_from_spec(spec)
|
|
159
|
+
assert spec.loader is not None
|
|
160
|
+
spec.loader.exec_module(module)
|
|
161
|
+
results_root = pathlib.Path(sys.argv[2])
|
|
162
|
+
kwargs = {
|
|
163
|
+
"results_root": results_root,
|
|
164
|
+
"run_id": "f16-pair-pass",
|
|
165
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
166
|
+
"pair_arm": "l2_risk_probes",
|
|
167
|
+
}
|
|
168
|
+
assert module.pair_result_trigger_reasons(**kwargs) == ["complexity.high"]
|
|
169
|
+
path = (
|
|
170
|
+
results_root
|
|
171
|
+
/ "f16-pair-pass"
|
|
172
|
+
/ "F16-cli-quote-tax-rules"
|
|
173
|
+
/ "l2_risk_probes"
|
|
174
|
+
/ "result.json"
|
|
175
|
+
)
|
|
176
|
+
path.write_text(
|
|
177
|
+
'{"pair_trigger":{"eligible":true,"reasons":["risk high"],"skipped_reason":null}}\n',
|
|
178
|
+
encoding="utf8",
|
|
179
|
+
)
|
|
180
|
+
assert module.pair_result_trigger_reasons(**kwargs) == []
|
|
181
|
+
path.write_text(
|
|
182
|
+
'{"pair_trigger":{"eligible":true,"reasons":["complexity.high"],"skipped_reason":null}}\n',
|
|
183
|
+
encoding="utf8",
|
|
184
|
+
)
|
|
185
|
+
PY
|
|
186
|
+
|
|
187
|
+
cat > "$results/f33-weak-pair-pass/full-pipeline-pair-gate.json" <<'JSON'
|
|
188
|
+
{
|
|
189
|
+
"run_id": "f33-weak-pair-pass",
|
|
190
|
+
"verdict": "PASS",
|
|
191
|
+
"pair_arm": "l2_risk_probes",
|
|
192
|
+
"rows": [
|
|
193
|
+
{
|
|
194
|
+
"fixture": "F33-cli-new-candidate",
|
|
195
|
+
"status": "PASS",
|
|
196
|
+
"bare_score": 33,
|
|
197
|
+
"solo_score": 98,
|
|
198
|
+
"pair_score": 96,
|
|
199
|
+
"pair_margin": -2,
|
|
200
|
+
"pair_mode": true,
|
|
201
|
+
"pair_trigger_eligible": true,
|
|
202
|
+
"pair_solo_wall_ratio": 1.1
|
|
203
|
+
}
|
|
204
|
+
]
|
|
205
|
+
}
|
|
206
|
+
JSON
|
|
207
|
+
|
|
208
|
+
if python3 "$SCRIPT" \
|
|
209
|
+
--fixtures-root "$fixtures" \
|
|
210
|
+
--registry "$registry" \
|
|
211
|
+
--results-root "$results" \
|
|
212
|
+
--out-json "$TMP_DIR/audit.json" > "$TMP_DIR/audit.out" 2> "$TMP_DIR/audit.err"; then
|
|
213
|
+
echo "expected unrecorded F33 failure" >&2
|
|
214
|
+
exit 1
|
|
215
|
+
fi
|
|
216
|
+
grep -Fq 'F33-cli-new-candidate' "$TMP_DIR/audit.err"
|
|
217
|
+
grep -Fq 'F35-cli-missing-judge' "$TMP_DIR/audit.err"
|
|
218
|
+
grep -Fq 'status=MISSING_JUDGE' "$TMP_DIR/audit.err"
|
|
219
|
+
grep -Fq 'malformed-headroom <unknown>' "$TMP_DIR/audit.err"
|
|
220
|
+
grep -Fq 'status=MALFORMED_ROWS' "$TMP_DIR/audit.err"
|
|
221
|
+
grep -Fq 'bad-json-headroom <unknown>' "$TMP_DIR/audit.err"
|
|
222
|
+
grep -Fq 'status=MALFORMED_JSON' "$TMP_DIR/audit.err"
|
|
223
|
+
grep -Fq 'unsupported registry rejection(s)' "$TMP_DIR/audit.err"
|
|
224
|
+
grep -Fq 'F36-unsupported-rejection' "$TMP_DIR/audit.err"
|
|
225
|
+
grep -Fq 'expected_run=20260512-missing-headroom' "$TMP_DIR/audit.err"
|
|
226
|
+
grep -Fq 'solo_claude=98' "$TMP_DIR/audit.err"
|
|
227
|
+
grep -Fq 'expected_solo_claude=98' "$TMP_DIR/audit.err"
|
|
228
|
+
grep -Fq '"verdict": "FAIL"' "$TMP_DIR/audit.json"
|
|
229
|
+
grep -Fq '"fixture": "F33-cli-new-candidate"' "$TMP_DIR/audit.json"
|
|
230
|
+
grep -Fq '"fixture": "F35-cli-missing-judge"' "$TMP_DIR/audit.json"
|
|
231
|
+
grep -Fq '"fixture": "<unknown>"' "$TMP_DIR/audit.json"
|
|
232
|
+
grep -Fq '"unsupported_registry_rejections"' "$TMP_DIR/audit.json"
|
|
233
|
+
if grep -Fq 'F16-cli-quote-tax-rules' "$TMP_DIR/audit.err"; then
|
|
234
|
+
echo "F16 has passing pair evidence and must not be reported" >&2
|
|
235
|
+
cat "$TMP_DIR/audit.err" >&2
|
|
236
|
+
exit 1
|
|
237
|
+
fi
|
|
238
|
+
if grep -Fq 'F34-cli-rejected-candidate' "$TMP_DIR/audit.err"; then
|
|
239
|
+
echo "F34 is rejected and must not be reported" >&2
|
|
240
|
+
cat "$TMP_DIR/audit.err" >&2
|
|
241
|
+
exit 1
|
|
242
|
+
fi
|
|
243
|
+
|
|
244
|
+
python3 - "$registry" <<'PY'
|
|
245
|
+
from pathlib import Path
|
|
246
|
+
import sys
|
|
247
|
+
path = Path(sys.argv[1])
|
|
248
|
+
text = path.read_text()
|
|
249
|
+
text = text.replace(
|
|
250
|
+
' F34-*|F34)',
|
|
251
|
+
' F33-*|F33)\n'
|
|
252
|
+
' echo "measured solo ceiling"\n'
|
|
253
|
+
' ;;\n'
|
|
254
|
+
' F35-*|F35)\n'
|
|
255
|
+
' echo "missing judge artifact"\n'
|
|
256
|
+
' ;;\n'
|
|
257
|
+
' F34-*|F34)'
|
|
258
|
+
)
|
|
259
|
+
path.write_text(text)
|
|
260
|
+
PY
|
|
261
|
+
|
|
262
|
+
rm -rf "$results/malformed-headroom"
|
|
263
|
+
rm -rf "$results/bad-json-headroom"
|
|
264
|
+
|
|
265
|
+
write_headroom_fail 20260512-f36-headroom F36-unsupported-rejection 33 98
|
|
266
|
+
python3 - "$registry" <<'PY'
|
|
267
|
+
from pathlib import Path
|
|
268
|
+
import sys
|
|
269
|
+
path = Path(sys.argv[1])
|
|
270
|
+
text = path.read_text()
|
|
271
|
+
text = text.replace(
|
|
272
|
+
"bare 33 / solo_claude 98 in 20260512-missing-headroom",
|
|
273
|
+
"bare 33 / solo_claude 98 in 20260512-f36-headroom",
|
|
274
|
+
)
|
|
275
|
+
path.write_text(text)
|
|
276
|
+
PY
|
|
277
|
+
|
|
278
|
+
python3 "$SCRIPT" \
|
|
279
|
+
--fixtures-root "$fixtures" \
|
|
280
|
+
--registry "$registry" \
|
|
281
|
+
--results-root "$results" \
|
|
282
|
+
--out-json "$TMP_DIR/audit-pass.json" \
|
|
283
|
+
> "$TMP_DIR/audit-pass.out"
|
|
284
|
+
grep -Fq 'PASS audit-headroom-rejections' "$TMP_DIR/audit-pass.out"
|
|
285
|
+
grep -Fq '"verdict": "PASS"' "$TMP_DIR/audit-pass.json"
|
|
286
|
+
grep -Fq '"unsupported_registry_rejections": []' "$TMP_DIR/audit-pass.json"
|
|
287
|
+
|
|
288
|
+
echo "PASS test-audit-headroom-rejections"
|