devlyn-cli 2.3.0 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +1 -1
- package/CLAUDE.md +2 -2
- package/README.md +82 -29
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
- package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
- package/benchmark/auto-resolve/README.md +307 -44
- package/benchmark/auto-resolve/RUBRIC.md +23 -14
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
- package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
- package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
- package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
- package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
- package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
- package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
- package/benchmark/auto-resolve/scripts/judge.sh +153 -26
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
- package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
- package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
- package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
- package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
- package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
- package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
- package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
- package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
- package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
- package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
- package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
- package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
- package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
- package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
- package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
- package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
- package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
- package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
- package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
- package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
- package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
- package/bin/devlyn.js +211 -18
- package/config/skills/_shared/adapters/README.md +3 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
- package/config/skills/_shared/adapters/opus-4-7.md +9 -1
- package/config/skills/_shared/archive_run.py +78 -6
- package/config/skills/_shared/codex-config.md +3 -2
- package/config/skills/_shared/codex-monitored.sh +46 -1
- package/config/skills/_shared/collect-codex-findings.py +20 -5
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/runtime-principles.md +5 -8
- package/config/skills/_shared/spec-verify-check.py +2664 -107
- package/config/skills/_shared/verify-merge-findings.py +1369 -19
- package/config/skills/devlyn:ideate/SKILL.md +7 -4
- package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
- package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
- package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
- package/config/skills/devlyn:resolve/SKILL.md +49 -18
- package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
- package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
- package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
- package/package.json +47 -2
- package/scripts/lint-fixtures.sh +349 -0
- package/scripts/lint-shadow-fixtures.sh +58 -0
- package/scripts/lint-skills.sh +3642 -92
- /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
|
@@ -0,0 +1,1672 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Regression tests for audit-pair-evidence.py.
|
|
3
|
+
|
|
4
|
+
set -euo pipefail
|
|
5
|
+
|
|
6
|
+
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
|
7
|
+
SCRIPT="$SCRIPT_DIR/audit-pair-evidence.py"
|
|
8
|
+
TMP_DIR="$(mktemp -d /tmp/audit-pair-evidence-test.XXXXXX)"
|
|
9
|
+
trap 'rm -rf "$TMP_DIR"' EXIT
|
|
10
|
+
|
|
11
|
+
# expect_fail_contains LABEL NEEDLE CMD [ARGS...]
#
# Runs CMD with ARGS, capturing stdout and stderr together in
# "$TMP_DIR/LABEL.out". The check passes only when CMD exits non-zero AND the
# captured output contains NEEDLE as a fixed string. On any other outcome the
# reason and the captured output are written to stderr and the script exits 1.
expect_fail_contains() {
  local case_name="$1" expected_text="$2"
  shift 2
  local capture="$TMP_DIR/$case_name.out"
  local problem=""

  if "$@" > "$capture" 2>&1; then
    # The command succeeded although a failure was required.
    problem="expected failure for $case_name"
  elif ! grep -Fq -- "$expected_text" "$capture"; then
    # The command failed, but not with the diagnostic we are pinning.
    # `--` keeps needles that begin with a dash from being parsed as options.
    problem="missing expected text for $case_name: $expected_text"
  fi

  if [ -n "$problem" ]; then
    echo "$problem" >&2
    cat "$capture" >&2
    exit 1
  fi
}
|
|
27
|
+
|
|
28
|
+
fixtures="$TMP_DIR/fixtures"
|
|
29
|
+
results="$TMP_DIR/results"
|
|
30
|
+
registry="$TMP_DIR/pair-rejected-fixtures.sh"
|
|
31
|
+
mkdir -p "$fixtures/F16-cli-quote-tax-rules" \
|
|
32
|
+
"$fixtures/F21-cli-scheduler-priority" \
|
|
33
|
+
"$fixtures/F34-cli-rejected-candidate" \
|
|
34
|
+
"$results/pair-pass" \
|
|
35
|
+
"$results/pair-pass-2" \
|
|
36
|
+
"$results/rejected-headroom"
|
|
37
|
+
|
|
38
|
+
cat > "$registry" <<'SH'
|
|
39
|
+
rejected_pair_fixture_reason() {
|
|
40
|
+
local fid="$1"
|
|
41
|
+
case "$fid" in
|
|
42
|
+
F34-*|F34)
|
|
43
|
+
echo "measured solo ceiling"
|
|
44
|
+
;;
|
|
45
|
+
*)
|
|
46
|
+
return 1
|
|
47
|
+
;;
|
|
48
|
+
esac
|
|
49
|
+
}
|
|
50
|
+
SH
|
|
51
|
+
|
|
52
|
+
cat > "$results/pair-pass/full-pipeline-pair-gate.json" <<'JSON'
|
|
53
|
+
{
|
|
54
|
+
"run_id": "pair-pass",
|
|
55
|
+
"verdict": "PASS",
|
|
56
|
+
"pair_arm": "l2_risk_probes",
|
|
57
|
+
"rows": [
|
|
58
|
+
{
|
|
59
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
60
|
+
"status": "PASS",
|
|
61
|
+
"bare_score": 50,
|
|
62
|
+
"solo_score": 75,
|
|
63
|
+
"pair_score": 96,
|
|
64
|
+
"pair_margin": 21,
|
|
65
|
+
"pair_mode": true,
|
|
66
|
+
"pair_trigger_eligible": true,
|
|
67
|
+
"pair_trigger_reasons": ["complexity.high"],
|
|
68
|
+
"pair_trigger_has_canonical_reason": true,
|
|
69
|
+
"pair_solo_wall_ratio": 1.28
|
|
70
|
+
}
|
|
71
|
+
]
|
|
72
|
+
}
|
|
73
|
+
JSON
|
|
74
|
+
|
|
75
|
+
cat > "$results/rejected-headroom/headroom-gate.json" <<'JSON'
|
|
76
|
+
{
|
|
77
|
+
"run_id": "rejected-headroom",
|
|
78
|
+
"verdict": "FAIL",
|
|
79
|
+
"rows": [
|
|
80
|
+
{
|
|
81
|
+
"fixture": "F34-cli-rejected-candidate",
|
|
82
|
+
"status": "FAIL",
|
|
83
|
+
"bare_score": 33,
|
|
84
|
+
"solo_score": 98,
|
|
85
|
+
"reason": "solo_claude score 98 > 80"
|
|
86
|
+
}
|
|
87
|
+
]
|
|
88
|
+
}
|
|
89
|
+
JSON
|
|
90
|
+
|
|
91
|
+
expect_fail_contains unmeasured "unmeasured candidate fixture(s): F21-cli-scheduler-priority" \
|
|
92
|
+
python3 "$SCRIPT" \
|
|
93
|
+
--fixtures-root "$fixtures" \
|
|
94
|
+
--registry "$registry" \
|
|
95
|
+
--results-root "$results" \
|
|
96
|
+
--out-dir "$TMP_DIR/out-fail"
|
|
97
|
+
|
|
98
|
+
expect_fail_contains bad-min-pair-evidence "--min-pair-evidence must be >= 1" \
|
|
99
|
+
python3 "$SCRIPT" \
|
|
100
|
+
--fixtures-root "$fixtures" \
|
|
101
|
+
--registry "$registry" \
|
|
102
|
+
--results-root "$results" \
|
|
103
|
+
--min-pair-evidence 0
|
|
104
|
+
|
|
105
|
+
expect_fail_contains bad-min-pair-margin "--min-pair-margin must be >= 1" \
|
|
106
|
+
python3 "$SCRIPT" \
|
|
107
|
+
--fixtures-root "$fixtures" \
|
|
108
|
+
--registry "$registry" \
|
|
109
|
+
--results-root "$results" \
|
|
110
|
+
--min-pair-margin 0
|
|
111
|
+
|
|
112
|
+
expect_fail_contains bad-max-wall-ratio "--max-pair-solo-wall-ratio must be finite and > 0" \
|
|
113
|
+
python3 "$SCRIPT" \
|
|
114
|
+
--fixtures-root "$fixtures" \
|
|
115
|
+
--registry "$registry" \
|
|
116
|
+
--results-root "$results" \
|
|
117
|
+
--max-pair-solo-wall-ratio 0
|
|
118
|
+
|
|
119
|
+
expect_fail_contains nan-max-wall-ratio "--max-pair-solo-wall-ratio must be finite and > 0" \
|
|
120
|
+
python3 "$SCRIPT" \
|
|
121
|
+
--fixtures-root "$fixtures" \
|
|
122
|
+
--registry "$registry" \
|
|
123
|
+
--results-root "$results" \
|
|
124
|
+
--max-pair-solo-wall-ratio NaN
|
|
125
|
+
|
|
126
|
+
cat > "$results/pair-pass-2/full-pipeline-pair-gate.json" <<'JSON'
|
|
127
|
+
{
|
|
128
|
+
"run_id": "pair-pass-2",
|
|
129
|
+
"verdict": "PASS",
|
|
130
|
+
"pair_arm": "l2_risk_probes",
|
|
131
|
+
"rows": [
|
|
132
|
+
{
|
|
133
|
+
"fixture": "F21-cli-scheduler-priority",
|
|
134
|
+
"status": "PASS",
|
|
135
|
+
"bare_score": 33,
|
|
136
|
+
"solo_score": 66,
|
|
137
|
+
"pair_score": 99,
|
|
138
|
+
"pair_margin": 33,
|
|
139
|
+
"pair_mode": true,
|
|
140
|
+
"pair_trigger_eligible": true,
|
|
141
|
+
"pair_trigger_reasons": ["complexity.high", "risk_profile.high_risk"],
|
|
142
|
+
"pair_trigger_has_canonical_reason": true,
|
|
143
|
+
"pair_solo_wall_ratio": 1.47
|
|
144
|
+
}
|
|
145
|
+
]
|
|
146
|
+
}
|
|
147
|
+
JSON
|
|
148
|
+
|
|
149
|
+
expect_fail_contains pair-evidence-hypotheses "pair evidence hypotheses missing for fixture(s): F16-cli-quote-tax-rules, F21-cli-scheduler-priority" \
|
|
150
|
+
python3 "$SCRIPT" \
|
|
151
|
+
--fixtures-root "$fixtures" \
|
|
152
|
+
--registry "$registry" \
|
|
153
|
+
--results-root "$results" \
|
|
154
|
+
--min-pair-evidence 2 \
|
|
155
|
+
--out-dir "$TMP_DIR/out-hypothesis-fail"
|
|
156
|
+
|
|
157
|
+
for fixture in F16-cli-quote-tax-rules F21-cli-scheduler-priority; do
|
|
158
|
+
cat > "$fixtures/$fixture/spec.md" <<'EOF'
|
|
159
|
+
# Spec
|
|
160
|
+
|
|
161
|
+
## Verification
|
|
162
|
+
|
|
163
|
+
- Visible pair-evidence fixture.
|
|
164
|
+
|
|
165
|
+
## Solo-headroom hypothesis
|
|
166
|
+
|
|
167
|
+
A capable solo_claude baseline is expected to miss the ordering interaction;
|
|
168
|
+
observable command `node "$BENCH_FIXTURE_DIR/verifiers/visible.js"` exposes the miss.
|
|
169
|
+
EOF
|
|
170
|
+
cat > "$fixtures/$fixture/NOTES.md" <<'EOF'
|
|
171
|
+
# Notes
|
|
172
|
+
EOF
|
|
173
|
+
cat > "$fixtures/$fixture/expected.json" <<'EOF'
|
|
174
|
+
{
|
|
175
|
+
"verification_commands": [
|
|
176
|
+
{
|
|
177
|
+
"cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/visible.js\"",
|
|
178
|
+
"exit_code": 0
|
|
179
|
+
}
|
|
180
|
+
]
|
|
181
|
+
}
|
|
182
|
+
EOF
|
|
183
|
+
done
|
|
184
|
+
|
|
185
|
+
python3 "$SCRIPT" \
|
|
186
|
+
--fixtures-root "$fixtures" \
|
|
187
|
+
--registry "$registry" \
|
|
188
|
+
--results-root "$results" \
|
|
189
|
+
--min-pair-evidence 2 \
|
|
190
|
+
--out-dir "$TMP_DIR/out-pass" \
|
|
191
|
+
> "$TMP_DIR/pass.out"
|
|
192
|
+
grep -Fq 'PASS audit-pair-evidence' "$TMP_DIR/pass.out"
|
|
193
|
+
test -f "$TMP_DIR/out-pass/frontier.json"
|
|
194
|
+
test -f "$TMP_DIR/out-pass/frontier.stdout"
|
|
195
|
+
test -f "$TMP_DIR/out-pass/frontier.stderr"
|
|
196
|
+
test -f "$TMP_DIR/out-pass/headroom-audit.json"
|
|
197
|
+
test -f "$TMP_DIR/out-pass/headroom-rejections.stdout"
|
|
198
|
+
test -f "$TMP_DIR/out-pass/headroom-rejections.stderr"
|
|
199
|
+
test -f "$TMP_DIR/out-pass/audit.json"
|
|
200
|
+
grep -Fq 'F16-cli-quote-tax-rules: bare=50 solo_claude=75 pair=96 arm=l2_risk_probes margin=+21' "$TMP_DIR/out-pass/frontier.stdout"
|
|
201
|
+
grep -Fq 'pair_margin_avg=+27.00 pair_margin_min=+21 wall_avg=1.38x wall_max=1.47x' "$TMP_DIR/out-pass/frontier.stdout"
|
|
202
|
+
grep -Fq 'verdict=pair_evidence_passed' "$TMP_DIR/out-pass/frontier.stdout"
|
|
203
|
+
grep -Fq 'PASS pair-candidate-frontier' "$TMP_DIR/out-pass/frontier.stdout"
|
|
204
|
+
grep -Fq 'headroom_rejections=PASS verdict=PASS unrecorded=0 unsupported=0' "$TMP_DIR/pass.out"
|
|
205
|
+
grep -Fq 'pair_evidence_quality=PASS min_pair_margin_actual=+21 min_pair_margin_required=+5 max_wall_actual=1.47x max_wall_allowed=3.00x' "$TMP_DIR/pass.out"
|
|
206
|
+
grep -Fq 'pair_trigger_reasons=PASS canonical=2 historical_alias=1 exposed=2 total=2 summary=2 rows_match=true' "$TMP_DIR/pass.out"
|
|
207
|
+
grep -Fq 'pair_trigger_historical_aliases=F21-cli-scheduler-priority=risk_profile.high_risk' "$TMP_DIR/pass.out"
|
|
208
|
+
grep -Fq 'pair_evidence_hypotheses=PASS documented=2 total=2' "$TMP_DIR/pass.out"
|
|
209
|
+
grep -Fq 'pair_evidence_hypothesis_triggers=WARN matched=0 documented=2 total=2' "$TMP_DIR/pass.out"
|
|
210
|
+
grep -Fq 'pair_evidence_hypothesis_trigger_gaps=F16-cli-quote-tax-rules=complexity.high;F21-cli-scheduler-priority=complexity.high,risk_profile.high_risk' "$TMP_DIR/pass.out"
|
|
211
|
+
python3 - "$TMP_DIR/out-pass/audit.json" <<'PY'
|
|
212
|
+
import json
|
|
213
|
+
import sys
|
|
214
|
+
|
|
215
|
+
report = json.load(open(sys.argv[1], encoding="utf8"))
|
|
216
|
+
assert report["verdict"] == "PASS"
|
|
217
|
+
assert report["min_pair_evidence"] == 2
|
|
218
|
+
assert report["min_pair_margin"] == 5
|
|
219
|
+
assert report["max_pair_solo_wall_ratio"] == 3.0
|
|
220
|
+
assert report["frontier_summary"]["min_pair_margin"] == 5
|
|
221
|
+
assert report["frontier_summary"]["max_pair_solo_wall_ratio"] == 3.0
|
|
222
|
+
assert report["frontier_summary"]["fixtures_total"] == 3
|
|
223
|
+
assert report["frontier_summary"]["candidate_count"] == 2
|
|
224
|
+
assert report["frontier_summary"]["pair_evidence_count"] == 2
|
|
225
|
+
assert report["frontier_summary"]["unmeasured_count"] == 0
|
|
226
|
+
assert report["frontier_summary"]["pair_margin_avg"] == 27
|
|
227
|
+
assert report["frontier_summary"]["pair_margin_min"] == 21
|
|
228
|
+
assert report["frontier_summary"]["pair_solo_wall_ratio_avg"] == 1.38
|
|
229
|
+
assert report["frontier_summary"]["pair_solo_wall_ratio_max"] == 1.47
|
|
230
|
+
assert report["pair_evidence_rows"] == [
|
|
231
|
+
{
|
|
232
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
233
|
+
"verdict": "pair_evidence_passed",
|
|
234
|
+
"run_id": "pair-pass",
|
|
235
|
+
"pair_arm": "l2_risk_probes",
|
|
236
|
+
"bare_score": 50,
|
|
237
|
+
"solo_score": 75,
|
|
238
|
+
"pair_score": 96,
|
|
239
|
+
"pair_margin": 21,
|
|
240
|
+
"pair_mode": True,
|
|
241
|
+
"pair_trigger_eligible": True,
|
|
242
|
+
"pair_trigger_reasons": ["complexity.high"],
|
|
243
|
+
"pair_trigger_has_canonical_reason": True,
|
|
244
|
+
"pair_trigger_has_hypothesis_reason": False,
|
|
245
|
+
"pair_solo_wall_ratio": 1.28,
|
|
246
|
+
},
|
|
247
|
+
{
|
|
248
|
+
"fixture": "F21-cli-scheduler-priority",
|
|
249
|
+
"verdict": "pair_evidence_passed",
|
|
250
|
+
"run_id": "pair-pass-2",
|
|
251
|
+
"pair_arm": "l2_risk_probes",
|
|
252
|
+
"bare_score": 33,
|
|
253
|
+
"solo_score": 66,
|
|
254
|
+
"pair_score": 99,
|
|
255
|
+
"pair_margin": 33,
|
|
256
|
+
"pair_mode": True,
|
|
257
|
+
"pair_trigger_eligible": True,
|
|
258
|
+
"pair_trigger_reasons": ["complexity.high", "risk_profile.high_risk"],
|
|
259
|
+
"pair_trigger_has_canonical_reason": True,
|
|
260
|
+
"pair_trigger_has_hypothesis_reason": False,
|
|
261
|
+
"pair_solo_wall_ratio": 1.47,
|
|
262
|
+
},
|
|
263
|
+
]
|
|
264
|
+
assert report["checks"]["frontier"]["status"] == "PASS"
|
|
265
|
+
assert report["checks"]["headroom_rejections"]["status"] == "PASS"
|
|
266
|
+
assert report["checks"]["headroom_rejections"]["exit_code"] == 0
|
|
267
|
+
assert report["checks"]["headroom_rejections"]["report_check_exit_code"] == 0
|
|
268
|
+
assert report["checks"]["headroom_rejections"]["verdict"] == "PASS"
|
|
269
|
+
assert report["checks"]["headroom_rejections"]["unrecorded_failure_count"] == 0
|
|
270
|
+
assert report["checks"]["headroom_rejections"]["unsupported_registry_rejection_count"] == 0
|
|
271
|
+
assert report["checks"]["frontier_report"]["status"] == "PASS"
|
|
272
|
+
assert report["checks"]["frontier_report"]["verdict"] == "PASS"
|
|
273
|
+
assert report["checks"]["frontier_report"]["unmeasured_count"] == 0
|
|
274
|
+
assert report["checks"]["frontier_stdout"]["status"] == "PASS"
|
|
275
|
+
assert report["checks"]["frontier_stdout"]["summary_rows"] == 1
|
|
276
|
+
assert report["checks"]["frontier_stdout"]["aggregate_rows"] == 1
|
|
277
|
+
assert report["checks"]["frontier_stdout"]["final_verdict_rows"] == 1
|
|
278
|
+
assert report["checks"]["frontier_stdout"]["expected_rows"] == 2
|
|
279
|
+
assert report["checks"]["frontier_stdout"]["stdout_rows"] == 2
|
|
280
|
+
assert report["checks"]["frontier_stdout"]["trigger_rows"] == 2
|
|
281
|
+
assert report["checks"]["frontier_stdout"]["hypothesis_trigger_rows"] == 2
|
|
282
|
+
assert report["checks"]["frontier_stdout"]["rows_match_count"] is True
|
|
283
|
+
assert report["checks"]["frontier_stdout"]["trigger_rows_match_count"] is True
|
|
284
|
+
assert report["checks"]["frontier_stdout"]["hypothesis_trigger_rows_match_count"] is True
|
|
285
|
+
assert report["checks"]["min_pair_evidence"]["status"] == "PASS"
|
|
286
|
+
assert report["checks"]["min_pair_evidence"]["required"] == 2
|
|
287
|
+
assert report["checks"]["min_pair_evidence"]["actual"] == 2
|
|
288
|
+
assert report["checks"]["min_pair_evidence"]["actual_rows"] == 2
|
|
289
|
+
assert report["checks"]["min_pair_evidence"]["rows_match_count"] is True
|
|
290
|
+
assert report["checks"]["pair_evidence_quality"]["status"] == "PASS"
|
|
291
|
+
assert report["checks"]["pair_evidence_quality"]["min_pair_margin_required"] == 5
|
|
292
|
+
assert report["checks"]["pair_evidence_quality"]["min_pair_margin_actual"] == 21
|
|
293
|
+
assert report["checks"]["pair_evidence_quality"]["max_pair_solo_wall_ratio_allowed"] == 3.0
|
|
294
|
+
assert report["checks"]["pair_evidence_quality"]["max_pair_solo_wall_ratio_actual"] == 1.47
|
|
295
|
+
assert report["checks"]["pair_evidence_quality"]["summary_min_pair_margin"] == 21
|
|
296
|
+
assert report["checks"]["pair_evidence_quality"]["summary_max_pair_solo_wall_ratio"] == 1.47
|
|
297
|
+
assert report["checks"]["pair_trigger_reasons"]["status"] == "PASS"
|
|
298
|
+
assert report["checks"]["pair_trigger_reasons"]["summary_pair_evidence_count"] == 2
|
|
299
|
+
assert report["checks"]["pair_trigger_reasons"]["canonical_rows"] == 2
|
|
300
|
+
assert report["checks"]["pair_trigger_reasons"]["historical_alias_rows"] == 1
|
|
301
|
+
assert report["checks"]["pair_trigger_reasons"]["historical_alias_details"] == [
|
|
302
|
+
{"fixture": "F21-cli-scheduler-priority", "aliases": ["risk_profile.high_risk"]}
|
|
303
|
+
]
|
|
304
|
+
assert report["checks"]["pair_trigger_reasons"]["exposed_rows"] == 2
|
|
305
|
+
assert report["checks"]["pair_trigger_reasons"]["total_rows"] == 2
|
|
306
|
+
assert report["checks"]["pair_trigger_reasons"]["rows_match_count"] is True
|
|
307
|
+
assert report["checks"]["pair_evidence_hypothesis_triggers"]["status"] == "WARN"
|
|
308
|
+
assert report["checks"]["pair_evidence_hypothesis_triggers"]["exit_code"] == 0
|
|
309
|
+
assert report["checks"]["pair_evidence_hypothesis_triggers"]["required"] is False
|
|
310
|
+
assert report["checks"]["pair_evidence_hypothesis_triggers"]["matched_rows"] == 0
|
|
311
|
+
assert report["checks"]["pair_evidence_hypothesis_triggers"]["documented_rows"] == 2
|
|
312
|
+
assert report["checks"]["pair_evidence_hypothesis_triggers"]["total_rows"] == 2
|
|
313
|
+
assert report["checks"]["pair_evidence_hypothesis_triggers"]["gap_details"] == [
|
|
314
|
+
{
|
|
315
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
316
|
+
"pair_trigger_reasons": ["complexity.high"],
|
|
317
|
+
},
|
|
318
|
+
{
|
|
319
|
+
"fixture": "F21-cli-scheduler-priority",
|
|
320
|
+
"pair_trigger_reasons": ["complexity.high", "risk_profile.high_risk"],
|
|
321
|
+
},
|
|
322
|
+
]
|
|
323
|
+
assert report["artifacts"] == {
|
|
324
|
+
"frontier_json": "frontier.json",
|
|
325
|
+
"frontier_stdout": "frontier.stdout",
|
|
326
|
+
"frontier_stderr": "frontier.stderr",
|
|
327
|
+
"headroom_audit_json": "headroom-audit.json",
|
|
328
|
+
"headroom_rejections_stdout": "headroom-rejections.stdout",
|
|
329
|
+
"headroom_rejections_stderr": "headroom-rejections.stderr",
|
|
330
|
+
"audit_json": "audit.json",
|
|
331
|
+
}
|
|
332
|
+
PY
|
|
333
|
+
|
|
334
|
+
if python3 "$SCRIPT" \
|
|
335
|
+
--fixtures-root "$fixtures" \
|
|
336
|
+
--registry "$registry" \
|
|
337
|
+
--results-root "$results" \
|
|
338
|
+
--min-pair-evidence 2 \
|
|
339
|
+
--require-hypothesis-trigger \
|
|
340
|
+
--out-dir "$TMP_DIR/out-strict-trigger" \
|
|
341
|
+
> "$TMP_DIR/strict-trigger.out" 2>&1; then
|
|
342
|
+
echo "audit must fail when --require-hypothesis-trigger sees trigger gaps" >&2
|
|
343
|
+
cat "$TMP_DIR/strict-trigger.out" >&2
|
|
344
|
+
exit 1
|
|
345
|
+
fi
|
|
346
|
+
grep -Fq 'pair evidence hypothesis triggers missing for fixture(s): F16-cli-quote-tax-rules, F21-cli-scheduler-priority' "$TMP_DIR/strict-trigger.out"
|
|
347
|
+
grep -Fq 'pair_evidence_hypothesis_triggers=FAIL matched=0 documented=2 total=2' "$TMP_DIR/strict-trigger.out"
|
|
348
|
+
grep -Fq 'pair_evidence_hypothesis_trigger_gaps=F16-cli-quote-tax-rules=complexity.high;F21-cli-scheduler-priority=complexity.high,risk_profile.high_risk' "$TMP_DIR/strict-trigger.out"
|
|
349
|
+
grep -Fq 'FAIL audit-pair-evidence' "$TMP_DIR/strict-trigger.out"
|
|
350
|
+
python3 - "$TMP_DIR/out-strict-trigger/audit.json" <<'PY'
|
|
351
|
+
import json
|
|
352
|
+
import sys
|
|
353
|
+
|
|
354
|
+
report = json.load(open(sys.argv[1], encoding="utf8"))
|
|
355
|
+
assert report["verdict"] == "FAIL"
|
|
356
|
+
assert report["checks"]["pair_evidence_hypothesis_triggers"]["status"] == "FAIL"
|
|
357
|
+
assert report["checks"]["pair_evidence_hypothesis_triggers"]["exit_code"] == 1
|
|
358
|
+
assert report["checks"]["pair_evidence_hypothesis_triggers"]["required"] is True
|
|
359
|
+
assert report["checks"]["pair_evidence_hypothesis_triggers"]["gap_details"] == [
|
|
360
|
+
{
|
|
361
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
362
|
+
"pair_trigger_reasons": ["complexity.high"],
|
|
363
|
+
},
|
|
364
|
+
{
|
|
365
|
+
"fixture": "F21-cli-scheduler-priority",
|
|
366
|
+
"pair_trigger_reasons": ["complexity.high", "risk_profile.high_risk"],
|
|
367
|
+
},
|
|
368
|
+
]
|
|
369
|
+
PY
|
|
370
|
+
|
|
371
|
+
python3 - "$SCRIPT" "$TMP_DIR/out-pass/frontier.json" "$TMP_DIR/out-pass/frontier.stdout" <<'PY'
|
|
372
|
+
import importlib.util
|
|
373
|
+
import pathlib
|
|
374
|
+
import sys
|
|
375
|
+
|
|
376
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
377
|
+
module = importlib.util.module_from_spec(spec)
|
|
378
|
+
assert spec.loader is not None
|
|
379
|
+
spec.loader.exec_module(module)
|
|
380
|
+
sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
|
|
381
|
+
PY
|
|
382
|
+
|
|
383
|
+
cat > "$TMP_DIR/missing-trigger-reasons.json" <<'JSON'
|
|
384
|
+
{
|
|
385
|
+
"pair_evidence_count": 1,
|
|
386
|
+
"rows": [
|
|
387
|
+
{
|
|
388
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
389
|
+
"status": "pair_evidence_passed",
|
|
390
|
+
"passing_pair_evidence": [
|
|
391
|
+
{
|
|
392
|
+
"run_id": "pair-pass",
|
|
393
|
+
"pair_arm": "l2_risk_probes",
|
|
394
|
+
"bare_score": 50,
|
|
395
|
+
"solo_score": 75,
|
|
396
|
+
"pair_score": 96,
|
|
397
|
+
"pair_margin": 21,
|
|
398
|
+
"pair_mode": true,
|
|
399
|
+
"pair_trigger_eligible": true,
|
|
400
|
+
"pair_solo_wall_ratio": 1.28
|
|
401
|
+
}
|
|
402
|
+
]
|
|
403
|
+
}
|
|
404
|
+
]
|
|
405
|
+
}
|
|
406
|
+
JSON
|
|
407
|
+
expect_fail_contains missing-trigger-reasons "pair trigger reason rows 0 do not match summary count 1" \
|
|
408
|
+
python3 - "$SCRIPT" "$TMP_DIR/missing-trigger-reasons.json" <<'PY'
|
|
409
|
+
import importlib.util
|
|
410
|
+
import pathlib
|
|
411
|
+
import sys
|
|
412
|
+
|
|
413
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
414
|
+
module = importlib.util.module_from_spec(spec)
|
|
415
|
+
assert spec.loader is not None
|
|
416
|
+
spec.loader.exec_module(module)
|
|
417
|
+
sys.exit(module.check_pair_trigger_reasons(pathlib.Path(sys.argv[2])))
|
|
418
|
+
PY
|
|
419
|
+
|
|
420
|
+
cat > "$TMP_DIR/malformed-trigger-reason-rows.json" <<'JSON'
|
|
421
|
+
{
|
|
422
|
+
"pair_evidence_count": 1,
|
|
423
|
+
"rows": [
|
|
424
|
+
{
|
|
425
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
426
|
+
"status": "pair_evidence_passed",
|
|
427
|
+
"passing_pair_evidence": [
|
|
428
|
+
{
|
|
429
|
+
"run_id": "pair-pass",
|
|
430
|
+
"pair_arm": "l2_risk_probes",
|
|
431
|
+
"bare_score": 50,
|
|
432
|
+
"solo_score": 75,
|
|
433
|
+
"pair_score": 96,
|
|
434
|
+
"pair_margin": 21,
|
|
435
|
+
"pair_mode": true,
|
|
436
|
+
"pair_trigger_eligible": true,
|
|
437
|
+
"pair_trigger_reasons": [],
|
|
438
|
+
"pair_trigger_has_canonical_reason": true,
|
|
439
|
+
"pair_solo_wall_ratio": 1.28
|
|
440
|
+
}
|
|
441
|
+
]
|
|
442
|
+
}
|
|
443
|
+
]
|
|
444
|
+
}
|
|
445
|
+
JSON
|
|
446
|
+
expect_fail_contains malformed-trigger-reason-rows "pair trigger reason rows 0 do not match summary count 1" \
|
|
447
|
+
python3 - "$SCRIPT" "$TMP_DIR/malformed-trigger-reason-rows.json" <<'PY'
|
|
448
|
+
import importlib.util
|
|
449
|
+
import pathlib
|
|
450
|
+
import sys
|
|
451
|
+
|
|
452
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
453
|
+
module = importlib.util.module_from_spec(spec)
|
|
454
|
+
assert spec.loader is not None
|
|
455
|
+
spec.loader.exec_module(module)
|
|
456
|
+
sys.exit(module.check_pair_trigger_reasons(pathlib.Path(sys.argv[2])))
|
|
457
|
+
PY
|
|
458
|
+
|
|
459
|
+
cat > "$TMP_DIR/mixed-unknown-trigger-reason-rows.json" <<'JSON'
|
|
460
|
+
{
|
|
461
|
+
"pair_evidence_count": 1,
|
|
462
|
+
"rows": [
|
|
463
|
+
{
|
|
464
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
465
|
+
"status": "pair_evidence_passed",
|
|
466
|
+
"passing_pair_evidence": [
|
|
467
|
+
{
|
|
468
|
+
"run_id": "pair-pass",
|
|
469
|
+
"pair_arm": "l2_risk_probes",
|
|
470
|
+
"bare_score": 50,
|
|
471
|
+
"solo_score": 75,
|
|
472
|
+
"pair_score": 96,
|
|
473
|
+
"pair_margin": 21,
|
|
474
|
+
"pair_mode": true,
|
|
475
|
+
"pair_trigger_eligible": true,
|
|
476
|
+
"pair_trigger_reasons": ["complexity.high", "looks-hard"],
|
|
477
|
+
"pair_trigger_has_canonical_reason": true,
|
|
478
|
+
"pair_solo_wall_ratio": 1.28
|
|
479
|
+
}
|
|
480
|
+
]
|
|
481
|
+
}
|
|
482
|
+
]
|
|
483
|
+
}
|
|
484
|
+
JSON
|
|
485
|
+
expect_fail_contains mixed-unknown-trigger-reason-rows "pair trigger reason rows 0 do not match summary count 1" \
|
|
486
|
+
python3 - "$SCRIPT" "$TMP_DIR/mixed-unknown-trigger-reason-rows.json" <<'PY'
|
|
487
|
+
import importlib.util
|
|
488
|
+
import pathlib
|
|
489
|
+
import sys
|
|
490
|
+
|
|
491
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
492
|
+
module = importlib.util.module_from_spec(spec)
|
|
493
|
+
assert spec.loader is not None
|
|
494
|
+
spec.loader.exec_module(module)
|
|
495
|
+
sys.exit(module.check_pair_trigger_reasons(pathlib.Path(sys.argv[2])))
|
|
496
|
+
PY
|
|
497
|
+
|
|
498
|
+
cat > "$TMP_DIR/normalized-canonical-trigger-reason-rows.json" <<'JSON'
|
|
499
|
+
{
|
|
500
|
+
"pair_evidence_count": 1,
|
|
501
|
+
"rows": [
|
|
502
|
+
{
|
|
503
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
504
|
+
"status": "pair_evidence_passed",
|
|
505
|
+
"passing_pair_evidence": [
|
|
506
|
+
{
|
|
507
|
+
"run_id": "pair-pass",
|
|
508
|
+
"pair_arm": "l2_risk_probes",
|
|
509
|
+
"bare_score": 50,
|
|
510
|
+
"solo_score": 75,
|
|
511
|
+
"pair_score": 96,
|
|
512
|
+
"pair_margin": 21,
|
|
513
|
+
"pair_mode": true,
|
|
514
|
+
"pair_trigger_eligible": true,
|
|
515
|
+
"pair_trigger_reasons": ["risk high"],
|
|
516
|
+
"pair_trigger_has_canonical_reason": true,
|
|
517
|
+
"pair_solo_wall_ratio": 1.28
|
|
518
|
+
}
|
|
519
|
+
]
|
|
520
|
+
}
|
|
521
|
+
]
|
|
522
|
+
}
|
|
523
|
+
JSON
|
|
524
|
+
expect_fail_contains normalized-canonical-trigger-reason-rows "pair trigger reason rows 0 do not match summary count 1" \
|
|
525
|
+
python3 - "$SCRIPT" "$TMP_DIR/normalized-canonical-trigger-reason-rows.json" <<'PY'
|
|
526
|
+
import importlib.util
|
|
527
|
+
import pathlib
|
|
528
|
+
import sys
|
|
529
|
+
|
|
530
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
531
|
+
module = importlib.util.module_from_spec(spec)
|
|
532
|
+
assert spec.loader is not None
|
|
533
|
+
spec.loader.exec_module(module)
|
|
534
|
+
sys.exit(module.check_pair_trigger_reasons(pathlib.Path(sys.argv[2])))
|
|
535
|
+
PY
|
|
536
|
+
|
|
537
|
+
grep -Fv 'PASS pair-candidate-frontier' "$TMP_DIR/out-pass/frontier.stdout" \
|
|
538
|
+
> "$TMP_DIR/missing-final-verdict-frontier.stdout"
|
|
539
|
+
expect_fail_contains missing-final-frontier-verdict "frontier stdout final verdict row count is not exactly 1" \
|
|
540
|
+
python3 - "$SCRIPT" "$TMP_DIR/out-pass/frontier.json" "$TMP_DIR/missing-final-verdict-frontier.stdout" <<'PY'
|
|
541
|
+
import importlib.util
|
|
542
|
+
import pathlib
|
|
543
|
+
import sys
|
|
544
|
+
|
|
545
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
546
|
+
module = importlib.util.module_from_spec(spec)
|
|
547
|
+
assert spec.loader is not None
|
|
548
|
+
spec.loader.exec_module(module)
|
|
549
|
+
sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
|
|
550
|
+
PY
|
|
551
|
+
|
|
552
|
+
cp "$TMP_DIR/out-pass/frontier.stdout" "$TMP_DIR/duplicate-final-verdict-frontier.stdout"
|
|
553
|
+
printf 'PASS pair-candidate-frontier\n' >> "$TMP_DIR/duplicate-final-verdict-frontier.stdout"
|
|
554
|
+
expect_fail_contains duplicate-final-frontier-verdict "frontier stdout final verdict row count is not exactly 1" \
|
|
555
|
+
python3 - "$SCRIPT" "$TMP_DIR/out-pass/frontier.json" "$TMP_DIR/duplicate-final-verdict-frontier.stdout" <<'PY'
|
|
556
|
+
import importlib.util
|
|
557
|
+
import pathlib
|
|
558
|
+
import sys
|
|
559
|
+
|
|
560
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
561
|
+
module = importlib.util.module_from_spec(spec)
|
|
562
|
+
assert spec.loader is not None
|
|
563
|
+
spec.loader.exec_module(module)
|
|
564
|
+
sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
|
|
565
|
+
PY
|
|
566
|
+
|
|
567
|
+
printf 'fixtures=3 rejected=1 candidates=2 pair_evidence=2 unmeasured=0 verdict=PASS\n' \
|
|
568
|
+
> "$TMP_DIR/bad-frontier.stdout"
|
|
569
|
+
printf 'pair_margin_avg=+27.00 pair_margin_min=+21 wall_avg=1.38x wall_max=1.47x\n' \
|
|
570
|
+
>> "$TMP_DIR/bad-frontier.stdout"
|
|
571
|
+
printf 'F16-cli-quote-tax-rules: bare=50 solo_claude=75 pair=95 arm=l2_risk_probes margin=+20 wall=1.28x run=pair-pass verdict=pair_evidence_passed\n' \
|
|
572
|
+
>> "$TMP_DIR/bad-frontier.stdout"
|
|
573
|
+
printf 'F21-cli-scheduler-priority: bare=33 solo_claude=66 pair=99 arm=l2_risk_probes margin=+33 wall=1.47x run=pair-pass-2 verdict=pair_evidence_passed\n' \
|
|
574
|
+
>> "$TMP_DIR/bad-frontier.stdout"
|
|
575
|
+
expect_fail_contains missing-frontier-score-row "frontier stdout missing score row for F16-cli-quote-tax-rules" \
|
|
576
|
+
python3 - "$SCRIPT" "$TMP_DIR/out-pass/frontier.json" "$TMP_DIR/bad-frontier.stdout" <<'PY'
|
|
577
|
+
import importlib.util
|
|
578
|
+
import pathlib
|
|
579
|
+
import sys
|
|
580
|
+
|
|
581
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
582
|
+
module = importlib.util.module_from_spec(spec)
|
|
583
|
+
assert spec.loader is not None
|
|
584
|
+
spec.loader.exec_module(module)
|
|
585
|
+
sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
|
|
586
|
+
PY
|
|
587
|
+
|
|
588
|
+
sed -E 's/ triggers=[^[:space:]]+//' "$TMP_DIR/out-pass/frontier.stdout" \
|
|
589
|
+
> "$TMP_DIR/no-trigger-frontier.stdout"
|
|
590
|
+
expect_fail_contains missing-frontier-triggers "frontier stdout missing score row for F16-cli-quote-tax-rules" \
|
|
591
|
+
python3 - "$SCRIPT" "$TMP_DIR/out-pass/frontier.json" "$TMP_DIR/no-trigger-frontier.stdout" <<'PY'
|
|
592
|
+
import importlib.util
|
|
593
|
+
import pathlib
|
|
594
|
+
import sys
|
|
595
|
+
|
|
596
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
597
|
+
module = importlib.util.module_from_spec(spec)
|
|
598
|
+
assert spec.loader is not None
|
|
599
|
+
spec.loader.exec_module(module)
|
|
600
|
+
sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
|
|
601
|
+
PY
|
|
602
|
+
|
|
603
|
+
cat > "$TMP_DIR/no-aggregate-frontier.stdout" <<'OUT'
|
|
604
|
+
fixtures=3 rejected=1 candidates=2 pair_evidence=2 unmeasured=0 verdict=PASS
|
|
605
|
+
F16-cli-quote-tax-rules: bare=50 solo_claude=75 pair=96 arm=l2_risk_probes margin=+21 wall=1.28x run=pair-pass verdict=pair_evidence_passed
|
|
606
|
+
F21-cli-scheduler-priority: bare=33 solo_claude=66 pair=99 arm=l2_risk_probes margin=+33 wall=1.47x run=pair-pass-2 verdict=pair_evidence_passed
|
|
607
|
+
OUT
|
|
608
|
+
expect_fail_contains missing-frontier-aggregate-row "frontier stdout aggregate score row count is not exactly 1" \
|
|
609
|
+
python3 - "$SCRIPT" "$TMP_DIR/out-pass/frontier.json" "$TMP_DIR/no-aggregate-frontier.stdout" <<'PY'
|
|
610
|
+
import importlib.util
|
|
611
|
+
import pathlib
|
|
612
|
+
import sys
|
|
613
|
+
|
|
614
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
615
|
+
module = importlib.util.module_from_spec(spec)
|
|
616
|
+
assert spec.loader is not None
|
|
617
|
+
spec.loader.exec_module(module)
|
|
618
|
+
sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
|
|
619
|
+
PY
|
|
620
|
+
|
|
621
|
+
cp "$TMP_DIR/out-pass/frontier.stdout" "$TMP_DIR/duplicate-summary-frontier.stdout"
|
|
622
|
+
printf 'fixtures=3 rejected=1 candidates=2 pair_evidence=2 unmeasured=0 verdict=PASS\n' \
|
|
623
|
+
>> "$TMP_DIR/duplicate-summary-frontier.stdout"
|
|
624
|
+
expect_fail_contains duplicate-frontier-summary-row "frontier stdout summary score row count is not exactly 1" \
|
|
625
|
+
python3 - "$SCRIPT" "$TMP_DIR/out-pass/frontier.json" "$TMP_DIR/duplicate-summary-frontier.stdout" <<'PY'
|
|
626
|
+
import importlib.util
|
|
627
|
+
import pathlib
|
|
628
|
+
import sys
|
|
629
|
+
|
|
630
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
631
|
+
module = importlib.util.module_from_spec(spec)
|
|
632
|
+
assert spec.loader is not None
|
|
633
|
+
spec.loader.exec_module(module)
|
|
634
|
+
sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
|
|
635
|
+
PY
|
|
636
|
+
|
|
637
|
+
cp "$TMP_DIR/out-pass/frontier.stdout" "$TMP_DIR/duplicate-aggregate-frontier.stdout"
|
|
638
|
+
printf 'pair_margin_avg=+27.00 pair_margin_min=+21 wall_avg=1.38x wall_max=1.47x\n' \
|
|
639
|
+
>> "$TMP_DIR/duplicate-aggregate-frontier.stdout"
|
|
640
|
+
expect_fail_contains duplicate-frontier-aggregate-row "frontier stdout aggregate score row count is not exactly 1" \
|
|
641
|
+
python3 - "$SCRIPT" "$TMP_DIR/out-pass/frontier.json" "$TMP_DIR/duplicate-aggregate-frontier.stdout" <<'PY'
|
|
642
|
+
import importlib.util
|
|
643
|
+
import pathlib
|
|
644
|
+
import sys
|
|
645
|
+
|
|
646
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
647
|
+
module = importlib.util.module_from_spec(spec)
|
|
648
|
+
assert spec.loader is not None
|
|
649
|
+
spec.loader.exec_module(module)
|
|
650
|
+
sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
|
|
651
|
+
PY
|
|
652
|
+
|
|
653
|
+
cat > "$TMP_DIR/partial-frontier.stdout" <<'OUT'
|
|
654
|
+
fixtures=3 rejected=1 candidates=2 pair_evidence=2 unmeasured=0 verdict=PASS
|
|
655
|
+
pair_margin_avg=+27.00 pair_margin_min=+21 wall_avg=1.38x wall_max=1.47x
|
|
656
|
+
F16-cli-quote-tax-rules: bare=50 solo_claude=75 pair=96 arm=l2_risk_probes margin=+21 verdict=pair_evidence_passed
|
|
657
|
+
F21-cli-scheduler-priority: bare=33 solo_claude=66 pair=99 arm=l2_risk_probes margin=+33 wall=1.47x run=pair-pass-2 verdict=pair_evidence_passed
|
|
658
|
+
OUT
|
|
659
|
+
expect_fail_contains partial-frontier-score-row "frontier stdout missing score row for F16-cli-quote-tax-rules" \
|
|
660
|
+
python3 - "$SCRIPT" "$TMP_DIR/out-pass/frontier.json" "$TMP_DIR/partial-frontier.stdout" <<'PY'
|
|
661
|
+
import importlib.util
|
|
662
|
+
import pathlib
|
|
663
|
+
import sys
|
|
664
|
+
|
|
665
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
666
|
+
module = importlib.util.module_from_spec(spec)
|
|
667
|
+
assert spec.loader is not None
|
|
668
|
+
spec.loader.exec_module(module)
|
|
669
|
+
sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
|
|
670
|
+
PY
|
|
671
|
+
|
|
672
|
+
cp "$TMP_DIR/out-pass/frontier.stdout" "$TMP_DIR/extra-frontier.stdout"
|
|
673
|
+
printf 'F99-stale-fixture: bare=1 solo_claude=2 pair=3 arm=l2_risk_probes margin=+1 wall=1.00x run=stale verdict=pair_evidence_passed\n' \
|
|
674
|
+
>> "$TMP_DIR/extra-frontier.stdout"
|
|
675
|
+
expect_fail_contains extra-frontier-score-row "frontier stdout score row count 3 does not match frontier evidence row count 2" \
|
|
676
|
+
python3 - "$SCRIPT" "$TMP_DIR/out-pass/frontier.json" "$TMP_DIR/extra-frontier.stdout" <<'PY'
|
|
677
|
+
import importlib.util
|
|
678
|
+
import pathlib
|
|
679
|
+
import sys
|
|
680
|
+
|
|
681
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
682
|
+
module = importlib.util.module_from_spec(spec)
|
|
683
|
+
assert spec.loader is not None
|
|
684
|
+
spec.loader.exec_module(module)
|
|
685
|
+
sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
|
|
686
|
+
PY
|
|
687
|
+
|
|
688
|
+
cat > "$TMP_DIR/malformed-frontier-summary.json" <<'JSON'
|
|
689
|
+
{
|
|
690
|
+
"verdict": "PASS"
|
|
691
|
+
}
|
|
692
|
+
JSON
|
|
693
|
+
expect_fail_contains malformed-frontier-stdout-summary "frontier stdout check missing summary fields" \
|
|
694
|
+
python3 - "$SCRIPT" "$TMP_DIR/malformed-frontier-summary.json" "$TMP_DIR/bad-frontier.stdout" <<'PY'
|
|
695
|
+
import importlib.util
|
|
696
|
+
import pathlib
|
|
697
|
+
import sys
|
|
698
|
+
|
|
699
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
700
|
+
module = importlib.util.module_from_spec(spec)
|
|
701
|
+
assert spec.loader is not None
|
|
702
|
+
spec.loader.exec_module(module)
|
|
703
|
+
sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
|
|
704
|
+
PY
|
|
705
|
+
|
|
706
|
+
cat > "$TMP_DIR/malformed-frontier-count.json" <<'JSON'
|
|
707
|
+
{
|
|
708
|
+
"verdict": "PASS",
|
|
709
|
+
"fixtures_total": 3,
|
|
710
|
+
"rejected_count": 1,
|
|
711
|
+
"candidate_count": 2,
|
|
712
|
+
"pair_evidence_count": "2",
|
|
713
|
+
"unmeasured_count": 0
|
|
714
|
+
}
|
|
715
|
+
JSON
|
|
716
|
+
expect_fail_contains malformed-frontier-stdout-counts "frontier stdout summary counts malformed" \
|
|
717
|
+
python3 - "$SCRIPT" "$TMP_DIR/malformed-frontier-count.json" "$TMP_DIR/bad-frontier.stdout" <<'PY'
|
|
718
|
+
import importlib.util
|
|
719
|
+
import pathlib
|
|
720
|
+
import sys
|
|
721
|
+
|
|
722
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
723
|
+
module = importlib.util.module_from_spec(spec)
|
|
724
|
+
assert spec.loader is not None
|
|
725
|
+
spec.loader.exec_module(module)
|
|
726
|
+
sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
|
|
727
|
+
PY
|
|
728
|
+
|
|
729
|
+
cat > "$TMP_DIR/malformed-frontier-aggregate.json" <<'JSON'
|
|
730
|
+
{
|
|
731
|
+
"verdict": "PASS",
|
|
732
|
+
"fixtures_total": 3,
|
|
733
|
+
"rejected_count": 1,
|
|
734
|
+
"candidate_count": 2,
|
|
735
|
+
"pair_evidence_count": 2,
|
|
736
|
+
"unmeasured_count": 0,
|
|
737
|
+
"pair_margin_avg": "27",
|
|
738
|
+
"pair_margin_min": 21,
|
|
739
|
+
"pair_solo_wall_ratio_avg": 1.38,
|
|
740
|
+
"pair_solo_wall_ratio_max": 1.47,
|
|
741
|
+
"rows": []
|
|
742
|
+
}
|
|
743
|
+
JSON
|
|
744
|
+
expect_fail_contains malformed-frontier-stdout-aggregate "frontier stdout aggregate fields malformed" \
|
|
745
|
+
python3 - "$SCRIPT" "$TMP_DIR/malformed-frontier-aggregate.json" "$TMP_DIR/bad-frontier.stdout" <<'PY'
|
|
746
|
+
import importlib.util
|
|
747
|
+
import pathlib
|
|
748
|
+
import sys
|
|
749
|
+
|
|
750
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
751
|
+
module = importlib.util.module_from_spec(spec)
|
|
752
|
+
assert spec.loader is not None
|
|
753
|
+
spec.loader.exec_module(module)
|
|
754
|
+
sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
|
|
755
|
+
PY
|
|
756
|
+
|
|
757
|
+
cat > "$TMP_DIR/frontier-fail-verdict.json" <<'JSON'
|
|
758
|
+
{
|
|
759
|
+
"verdict": "FAIL",
|
|
760
|
+
"unmeasured_count": 1,
|
|
761
|
+
"pair_evidence_count": 1,
|
|
762
|
+
"rows": []
|
|
763
|
+
}
|
|
764
|
+
JSON
|
|
765
|
+
expect_fail_contains frontier-fail-verdict "frontier verdict 'FAIL' is not PASS" \
|
|
766
|
+
python3 - "$SCRIPT" "$TMP_DIR/frontier-fail-verdict.json" <<'PY'
|
|
767
|
+
import importlib.util
|
|
768
|
+
import pathlib
|
|
769
|
+
import sys
|
|
770
|
+
|
|
771
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
772
|
+
module = importlib.util.module_from_spec(spec)
|
|
773
|
+
assert spec.loader is not None
|
|
774
|
+
spec.loader.exec_module(module)
|
|
775
|
+
sys.exit(module.check_frontier_report(pathlib.Path(sys.argv[2])))
|
|
776
|
+
PY
|
|
777
|
+
|
|
778
|
+
cat > "$TMP_DIR/frontier-unmeasured.json" <<'JSON'
|
|
779
|
+
{
|
|
780
|
+
"verdict": "PASS",
|
|
781
|
+
"unmeasured_count": 1,
|
|
782
|
+
"pair_evidence_count": 1,
|
|
783
|
+
"rows": []
|
|
784
|
+
}
|
|
785
|
+
JSON
|
|
786
|
+
expect_fail_contains frontier-unmeasured "frontier has 1 unmeasured candidate fixture(s)" \
|
|
787
|
+
python3 - "$SCRIPT" "$TMP_DIR/frontier-unmeasured.json" <<'PY'
|
|
788
|
+
import importlib.util
|
|
789
|
+
import pathlib
|
|
790
|
+
import sys
|
|
791
|
+
|
|
792
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
793
|
+
module = importlib.util.module_from_spec(spec)
|
|
794
|
+
assert spec.loader is not None
|
|
795
|
+
spec.loader.exec_module(module)
|
|
796
|
+
sys.exit(module.check_frontier_report(pathlib.Path(sys.argv[2])))
|
|
797
|
+
PY
|
|
798
|
+
|
|
799
|
+
cat > "$TMP_DIR/frontier-malformed-unmeasured.json" <<'JSON'
|
|
800
|
+
{
|
|
801
|
+
"verdict": "PASS",
|
|
802
|
+
"unmeasured_count": true,
|
|
803
|
+
"pair_evidence_count": 1,
|
|
804
|
+
"rows": []
|
|
805
|
+
}
|
|
806
|
+
JSON
|
|
807
|
+
expect_fail_contains frontier-malformed-unmeasured "frontier unmeasured count missing or malformed" \
|
|
808
|
+
python3 - "$SCRIPT" "$TMP_DIR/frontier-malformed-unmeasured.json" <<'PY'
|
|
809
|
+
import importlib.util
|
|
810
|
+
import pathlib
|
|
811
|
+
import sys
|
|
812
|
+
|
|
813
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
814
|
+
module = importlib.util.module_from_spec(spec)
|
|
815
|
+
assert spec.loader is not None
|
|
816
|
+
spec.loader.exec_module(module)
|
|
817
|
+
sys.exit(module.check_frontier_report(pathlib.Path(sys.argv[2])))
|
|
818
|
+
PY
|
|
819
|
+
|
|
820
|
+
cat > "$TMP_DIR/headroom-fail-verdict.json" <<'JSON'
|
|
821
|
+
{
|
|
822
|
+
"verdict": "FAIL",
|
|
823
|
+
"unrecorded_failures": [],
|
|
824
|
+
"unsupported_registry_rejections": []
|
|
825
|
+
}
|
|
826
|
+
JSON
|
|
827
|
+
expect_fail_contains headroom-fail-verdict "headroom audit verdict 'FAIL' is not PASS" \
|
|
828
|
+
python3 - "$SCRIPT" "$TMP_DIR/headroom-fail-verdict.json" <<'PY'
|
|
829
|
+
import importlib.util
|
|
830
|
+
import pathlib
|
|
831
|
+
import sys
|
|
832
|
+
|
|
833
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
834
|
+
module = importlib.util.module_from_spec(spec)
|
|
835
|
+
assert spec.loader is not None
|
|
836
|
+
spec.loader.exec_module(module)
|
|
837
|
+
sys.exit(module.check_headroom_audit_report(pathlib.Path(sys.argv[2])))
|
|
838
|
+
PY
|
|
839
|
+
|
|
840
|
+
cat > "$TMP_DIR/headroom-missing-unsupported.json" <<'JSON'
|
|
841
|
+
{
|
|
842
|
+
"verdict": "PASS",
|
|
843
|
+
"unrecorded_failures": []
|
|
844
|
+
}
|
|
845
|
+
JSON
|
|
846
|
+
expect_fail_contains headroom-missing-unsupported "headroom audit unsupported registry rejection count missing or malformed" \
|
|
847
|
+
python3 - "$SCRIPT" "$TMP_DIR/headroom-missing-unsupported.json" <<'PY'
|
|
848
|
+
import importlib.util
|
|
849
|
+
import pathlib
|
|
850
|
+
import sys
|
|
851
|
+
|
|
852
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
853
|
+
module = importlib.util.module_from_spec(spec)
|
|
854
|
+
assert spec.loader is not None
|
|
855
|
+
spec.loader.exec_module(module)
|
|
856
|
+
sys.exit(module.check_headroom_audit_report(pathlib.Path(sys.argv[2])))
|
|
857
|
+
PY
|
|
858
|
+
|
|
859
|
+
cat > "$TMP_DIR/headroom-unsupported.json" <<'JSON'
|
|
860
|
+
{
|
|
861
|
+
"verdict": "PASS",
|
|
862
|
+
"unrecorded_failures": [],
|
|
863
|
+
"unsupported_registry_rejections": [{"fixture": "F36-unsupported-rejection"}]
|
|
864
|
+
}
|
|
865
|
+
JSON
|
|
866
|
+
expect_fail_contains headroom-unsupported "headroom audit has 1 unsupported registry rejection(s)" \
|
|
867
|
+
python3 - "$SCRIPT" "$TMP_DIR/headroom-unsupported.json" <<'PY'
|
|
868
|
+
import importlib.util
|
|
869
|
+
import pathlib
|
|
870
|
+
import sys
|
|
871
|
+
|
|
872
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
873
|
+
module = importlib.util.module_from_spec(spec)
|
|
874
|
+
assert spec.loader is not None
|
|
875
|
+
spec.loader.exec_module(module)
|
|
876
|
+
sys.exit(module.check_headroom_audit_report(pathlib.Path(sys.argv[2])))
|
|
877
|
+
PY
|
|
878
|
+
|
|
879
|
+
python3 - "$SCRIPT" "$TMP_DIR/headroom-unsupported.json" > "$TMP_DIR/headroom-summary.out" <<'PY'
|
|
880
|
+
import importlib.util
|
|
881
|
+
import pathlib
|
|
882
|
+
import sys
|
|
883
|
+
|
|
884
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
885
|
+
module = importlib.util.module_from_spec(spec)
|
|
886
|
+
assert spec.loader is not None
|
|
887
|
+
spec.loader.exec_module(module)
|
|
888
|
+
module.print_headroom_rejections_summary(pathlib.Path(sys.argv[2]), status=1)
|
|
889
|
+
PY
|
|
890
|
+
grep -Fq 'headroom_rejections=FAIL verdict=PASS unrecorded=0 unsupported=1' "$TMP_DIR/headroom-summary.out"
|
|
891
|
+
|
|
892
|
+
cat > "$TMP_DIR/frontier-incomplete-best.json" <<'JSON'
|
|
893
|
+
{
|
|
894
|
+
"pair_evidence_count": 1,
|
|
895
|
+
"rows": [
|
|
896
|
+
{
|
|
897
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
898
|
+
"status": "pair_evidence_passed",
|
|
899
|
+
"passing_pair_evidence": [
|
|
900
|
+
{
|
|
901
|
+
"run_id": "higher-incomplete",
|
|
902
|
+
"bare_score": 50,
|
|
903
|
+
"solo_score": 75,
|
|
904
|
+
"pair_score": 98,
|
|
905
|
+
"pair_margin": 23,
|
|
906
|
+
"pair_mode": true,
|
|
907
|
+
"pair_trigger_eligible": true,
|
|
908
|
+
"pair_solo_wall_ratio": 1.32
|
|
909
|
+
},
|
|
910
|
+
{
|
|
911
|
+
"run_id": "lower-complete",
|
|
912
|
+
"pair_arm": "l2_risk_probes",
|
|
913
|
+
"bare_score": 50,
|
|
914
|
+
"solo_score": 75,
|
|
915
|
+
"pair_score": 96,
|
|
916
|
+
"pair_margin": 21,
|
|
917
|
+
"pair_mode": true,
|
|
918
|
+
"pair_trigger_eligible": true,
|
|
919
|
+
"pair_trigger_reasons": ["complexity.high"],
|
|
920
|
+
"pair_trigger_has_canonical_reason": true,
|
|
921
|
+
"pair_solo_wall_ratio": 1.28
|
|
922
|
+
}
|
|
923
|
+
]
|
|
924
|
+
}
|
|
925
|
+
]
|
|
926
|
+
}
|
|
927
|
+
JSON
|
|
928
|
+
python3 - "$SCRIPT" "$TMP_DIR/frontier-incomplete-best.json" <<'PY'
|
|
929
|
+
import importlib.util
|
|
930
|
+
import pathlib
|
|
931
|
+
import sys
|
|
932
|
+
|
|
933
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
934
|
+
module = importlib.util.module_from_spec(spec)
|
|
935
|
+
assert spec.loader is not None
|
|
936
|
+
spec.loader.exec_module(module)
|
|
937
|
+
rows = module.load_pair_evidence_rows(pathlib.Path(sys.argv[2]))
|
|
938
|
+
assert rows == [
|
|
939
|
+
{
|
|
940
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
941
|
+
"verdict": "pair_evidence_passed",
|
|
942
|
+
"run_id": "lower-complete",
|
|
943
|
+
"pair_arm": "l2_risk_probes",
|
|
944
|
+
"bare_score": 50,
|
|
945
|
+
"solo_score": 75,
|
|
946
|
+
"pair_score": 96,
|
|
947
|
+
"pair_margin": 21,
|
|
948
|
+
"pair_mode": True,
|
|
949
|
+
"pair_trigger_eligible": True,
|
|
950
|
+
"pair_trigger_reasons": ["complexity.high"],
|
|
951
|
+
"pair_trigger_has_canonical_reason": True,
|
|
952
|
+
"pair_trigger_has_hypothesis_reason": False,
|
|
953
|
+
"pair_solo_wall_ratio": 1.28,
|
|
954
|
+
}
|
|
955
|
+
]
|
|
956
|
+
PY
|
|
957
|
+
|
|
958
|
+
cat > "$TMP_DIR/bad-frontier-rows.json" <<'JSON'
|
|
959
|
+
{
|
|
960
|
+
"pair_evidence_count": 2,
|
|
961
|
+
"rows": [
|
|
962
|
+
{
|
|
963
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
964
|
+
"status": "pair_evidence_passed",
|
|
965
|
+
"passing_pair_evidence": []
|
|
966
|
+
},
|
|
967
|
+
{
|
|
968
|
+
"fixture": "F21-cli-scheduler-priority",
|
|
969
|
+
"status": "pair_evidence_passed",
|
|
970
|
+
"passing_pair_evidence": "malformed"
|
|
971
|
+
}
|
|
972
|
+
]
|
|
973
|
+
}
|
|
974
|
+
JSON
|
|
975
|
+
expect_fail_contains missing-pair-evidence-rows "pair evidence rows 0 do not match summary count 2" \
|
|
976
|
+
python3 - "$SCRIPT" "$TMP_DIR/bad-frontier-rows.json" <<'PY'
|
|
977
|
+
import importlib.util
|
|
978
|
+
import pathlib
|
|
979
|
+
import sys
|
|
980
|
+
|
|
981
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
982
|
+
module = importlib.util.module_from_spec(spec)
|
|
983
|
+
assert spec.loader is not None
|
|
984
|
+
spec.loader.exec_module(module)
|
|
985
|
+
sys.exit(module.check_min_pair_evidence(pathlib.Path(sys.argv[2]), 2))
|
|
986
|
+
PY
|
|
987
|
+
|
|
988
|
+
cat > "$TMP_DIR/bad-frontier-row-fields.json" <<'JSON'
|
|
989
|
+
{
|
|
990
|
+
"pair_evidence_count": 2,
|
|
991
|
+
"rows": [
|
|
992
|
+
{
|
|
993
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
994
|
+
"status": "pair_evidence_passed",
|
|
995
|
+
"passing_pair_evidence": [
|
|
996
|
+
{
|
|
997
|
+
"run_id": "pair-pass",
|
|
998
|
+
"pair_arm": "l2_risk_probes",
|
|
999
|
+
"bare_score": null,
|
|
1000
|
+
"solo_score": 75,
|
|
1001
|
+
"pair_score": 96,
|
|
1002
|
+
"pair_margin": 21,
|
|
1003
|
+
"pair_mode": true,
|
|
1004
|
+
"pair_trigger_eligible": true,
|
|
1005
|
+
"pair_solo_wall_ratio": 1.28
|
|
1006
|
+
}
|
|
1007
|
+
]
|
|
1008
|
+
},
|
|
1009
|
+
{
|
|
1010
|
+
"fixture": "F21-cli-scheduler-priority",
|
|
1011
|
+
"status": "pair_evidence_passed",
|
|
1012
|
+
"passing_pair_evidence": [
|
|
1013
|
+
{
|
|
1014
|
+
"run_id": "pair-pass-2",
|
|
1015
|
+
"pair_arm": "l2_risk_probes",
|
|
1016
|
+
"bare_score": 33,
|
|
1017
|
+
"solo_score": 66,
|
|
1018
|
+
"pair_score": 99,
|
|
1019
|
+
"pair_margin": 33,
|
|
1020
|
+
"pair_mode": true,
|
|
1021
|
+
"pair_trigger_eligible": true,
|
|
1022
|
+
"pair_solo_wall_ratio": true
|
|
1023
|
+
}
|
|
1024
|
+
]
|
|
1025
|
+
}
|
|
1026
|
+
]
|
|
1027
|
+
}
|
|
1028
|
+
JSON
|
|
1029
|
+
expect_fail_contains malformed-pair-evidence-row-fields "pair evidence rows 0 do not match summary count 2" \
|
|
1030
|
+
python3 - "$SCRIPT" "$TMP_DIR/bad-frontier-row-fields.json" <<'PY'
|
|
1031
|
+
import importlib.util
|
|
1032
|
+
import pathlib
|
|
1033
|
+
import sys
|
|
1034
|
+
|
|
1035
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
1036
|
+
module = importlib.util.module_from_spec(spec)
|
|
1037
|
+
assert spec.loader is not None
|
|
1038
|
+
spec.loader.exec_module(module)
|
|
1039
|
+
sys.exit(module.check_min_pair_evidence(pathlib.Path(sys.argv[2]), 2))
|
|
1040
|
+
PY
|
|
1041
|
+
|
|
1042
|
+
cat > "$TMP_DIR/nan-frontier-row-fields.json" <<'JSON'
|
|
1043
|
+
{
|
|
1044
|
+
"pair_evidence_count": 1,
|
|
1045
|
+
"rows": [
|
|
1046
|
+
{
|
|
1047
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
1048
|
+
"status": "pair_evidence_passed",
|
|
1049
|
+
"passing_pair_evidence": [
|
|
1050
|
+
{
|
|
1051
|
+
"run_id": "nan-wall-run",
|
|
1052
|
+
"pair_arm": "l2_risk_probes",
|
|
1053
|
+
"bare_score": 50,
|
|
1054
|
+
"solo_score": 75,
|
|
1055
|
+
"pair_score": 96,
|
|
1056
|
+
"pair_margin": 21,
|
|
1057
|
+
"pair_mode": true,
|
|
1058
|
+
"pair_trigger_eligible": true,
|
|
1059
|
+
"pair_solo_wall_ratio": NaN
|
|
1060
|
+
}
|
|
1061
|
+
]
|
|
1062
|
+
}
|
|
1063
|
+
]
|
|
1064
|
+
}
|
|
1065
|
+
JSON
|
|
1066
|
+
expect_fail_contains nan-pair-evidence-row-fields "pair evidence count missing or malformed from frontier report" \
|
|
1067
|
+
python3 - "$SCRIPT" "$TMP_DIR/nan-frontier-row-fields.json" <<'PY'
|
|
1068
|
+
import importlib.util
|
|
1069
|
+
import pathlib
|
|
1070
|
+
import sys
|
|
1071
|
+
|
|
1072
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
1073
|
+
module = importlib.util.module_from_spec(spec)
|
|
1074
|
+
assert spec.loader is not None
|
|
1075
|
+
spec.loader.exec_module(module)
|
|
1076
|
+
sys.exit(module.check_min_pair_evidence(pathlib.Path(sys.argv[2]), 1))
|
|
1077
|
+
PY
|
|
1078
|
+
|
|
1079
|
+
cat > "$TMP_DIR/mismatched-margin-row-fields.json" <<'JSON'
|
|
1080
|
+
{
|
|
1081
|
+
"pair_evidence_count": 1,
|
|
1082
|
+
"rows": [
|
|
1083
|
+
{
|
|
1084
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
1085
|
+
"status": "pair_evidence_passed",
|
|
1086
|
+
"passing_pair_evidence": [
|
|
1087
|
+
{
|
|
1088
|
+
"run_id": "inflated-margin-run",
|
|
1089
|
+
"pair_arm": "l2_risk_probes",
|
|
1090
|
+
"bare_score": 50,
|
|
1091
|
+
"solo_score": 75,
|
|
1092
|
+
"pair_score": 76,
|
|
1093
|
+
"pair_margin": 21,
|
|
1094
|
+
"pair_mode": true,
|
|
1095
|
+
"pair_trigger_eligible": true,
|
|
1096
|
+
"pair_solo_wall_ratio": 1.28
|
|
1097
|
+
}
|
|
1098
|
+
]
|
|
1099
|
+
}
|
|
1100
|
+
]
|
|
1101
|
+
}
|
|
1102
|
+
JSON
|
|
1103
|
+
expect_fail_contains mismatched-margin-row-fields "pair evidence rows 0 do not match summary count 1" \
|
|
1104
|
+
python3 - "$SCRIPT" "$TMP_DIR/mismatched-margin-row-fields.json" <<'PY'
|
|
1105
|
+
import importlib.util
|
|
1106
|
+
import pathlib
|
|
1107
|
+
import sys
|
|
1108
|
+
|
|
1109
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
1110
|
+
module = importlib.util.module_from_spec(spec)
|
|
1111
|
+
assert spec.loader is not None
|
|
1112
|
+
spec.loader.exec_module(module)
|
|
1113
|
+
sys.exit(module.check_min_pair_evidence(pathlib.Path(sys.argv[2]), 1))
|
|
1114
|
+
PY
|
|
1115
|
+
|
|
1116
|
+
cat > "$TMP_DIR/overrange-score-row-fields.json" <<'JSON'
|
|
1117
|
+
{
|
|
1118
|
+
"pair_evidence_count": 1,
|
|
1119
|
+
"rows": [
|
|
1120
|
+
{
|
|
1121
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
1122
|
+
"status": "pair_evidence_passed",
|
|
1123
|
+
"passing_pair_evidence": [
|
|
1124
|
+
{
|
|
1125
|
+
"run_id": "overrange-score-run",
|
|
1126
|
+
"pair_arm": "l2_risk_probes",
|
|
1127
|
+
"bare_score": 50,
|
|
1128
|
+
"solo_score": 75,
|
|
1129
|
+
"pair_score": 101,
|
|
1130
|
+
"pair_margin": 26,
|
|
1131
|
+
"pair_mode": true,
|
|
1132
|
+
"pair_trigger_eligible": true,
|
|
1133
|
+
"pair_solo_wall_ratio": 1.28
|
|
1134
|
+
}
|
|
1135
|
+
]
|
|
1136
|
+
}
|
|
1137
|
+
]
|
|
1138
|
+
}
|
|
1139
|
+
JSON
|
|
1140
|
+
expect_fail_contains overrange-score-row-fields "pair evidence rows 0 do not match summary count 1" \
|
|
1141
|
+
python3 - "$SCRIPT" "$TMP_DIR/overrange-score-row-fields.json" <<'PY'
|
|
1142
|
+
import importlib.util
|
|
1143
|
+
import pathlib
|
|
1144
|
+
import sys
|
|
1145
|
+
|
|
1146
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
1147
|
+
module = importlib.util.module_from_spec(spec)
|
|
1148
|
+
assert spec.loader is not None
|
|
1149
|
+
spec.loader.exec_module(module)
|
|
1150
|
+
sys.exit(module.check_min_pair_evidence(pathlib.Path(sys.argv[2]), 1))
|
|
1151
|
+
PY
|
|
1152
|
+
|
|
1153
|
+
cat > "$TMP_DIR/invalid-pair-arm-row-fields.json" <<'JSON'
|
|
1154
|
+
{
|
|
1155
|
+
"pair_evidence_count": 1,
|
|
1156
|
+
"rows": [
|
|
1157
|
+
{
|
|
1158
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
1159
|
+
"status": "pair_evidence_passed",
|
|
1160
|
+
"passing_pair_evidence": [
|
|
1161
|
+
{
|
|
1162
|
+
"run_id": "invalid-arm-run",
|
|
1163
|
+
"pair_arm": "bare",
|
|
1164
|
+
"bare_score": 50,
|
|
1165
|
+
"solo_score": 75,
|
|
1166
|
+
"pair_score": 96,
|
|
1167
|
+
"pair_margin": 21,
|
|
1168
|
+
"pair_mode": true,
|
|
1169
|
+
"pair_trigger_eligible": true,
|
|
1170
|
+
"pair_solo_wall_ratio": 1.28
|
|
1171
|
+
}
|
|
1172
|
+
]
|
|
1173
|
+
}
|
|
1174
|
+
]
|
|
1175
|
+
}
|
|
1176
|
+
JSON
|
|
1177
|
+
expect_fail_contains invalid-pair-arm-row-fields "pair evidence rows 0 do not match summary count 1" \
|
|
1178
|
+
python3 - "$SCRIPT" "$TMP_DIR/invalid-pair-arm-row-fields.json" <<'PY'
|
|
1179
|
+
import importlib.util
|
|
1180
|
+
import pathlib
|
|
1181
|
+
import sys
|
|
1182
|
+
|
|
1183
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
1184
|
+
module = importlib.util.module_from_spec(spec)
|
|
1185
|
+
assert spec.loader is not None
|
|
1186
|
+
spec.loader.exec_module(module)
|
|
1187
|
+
sys.exit(module.check_min_pair_evidence(pathlib.Path(sys.argv[2]), 1))
|
|
1188
|
+
PY
|
|
1189
|
+
|
|
1190
|
+
cat > "$TMP_DIR/false-pair-mode-row-fields.json" <<'JSON'
|
|
1191
|
+
{
|
|
1192
|
+
"pair_evidence_count": 1,
|
|
1193
|
+
"rows": [
|
|
1194
|
+
{
|
|
1195
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
1196
|
+
"status": "pair_evidence_passed",
|
|
1197
|
+
"passing_pair_evidence": [
|
|
1198
|
+
{
|
|
1199
|
+
"run_id": "false-pair-mode-run",
|
|
1200
|
+
"pair_arm": "l2_risk_probes",
|
|
1201
|
+
"bare_score": 50,
|
|
1202
|
+
"solo_score": 75,
|
|
1203
|
+
"pair_score": 96,
|
|
1204
|
+
"pair_margin": 21,
|
|
1205
|
+
"pair_mode": false,
|
|
1206
|
+
"pair_solo_wall_ratio": 1.28
|
|
1207
|
+
}
|
|
1208
|
+
]
|
|
1209
|
+
}
|
|
1210
|
+
]
|
|
1211
|
+
}
|
|
1212
|
+
JSON
|
|
1213
|
+
expect_fail_contains false-pair-mode-row-fields "pair evidence rows 0 do not match summary count 1" \
|
|
1214
|
+
python3 - "$SCRIPT" "$TMP_DIR/false-pair-mode-row-fields.json" <<'PY'
|
|
1215
|
+
import importlib.util
|
|
1216
|
+
import pathlib
|
|
1217
|
+
import sys
|
|
1218
|
+
|
|
1219
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
1220
|
+
module = importlib.util.module_from_spec(spec)
|
|
1221
|
+
assert spec.loader is not None
|
|
1222
|
+
spec.loader.exec_module(module)
|
|
1223
|
+
sys.exit(module.check_min_pair_evidence(pathlib.Path(sys.argv[2]), 1))
|
|
1224
|
+
PY
|
|
1225
|
+
|
|
1226
|
+
cat > "$TMP_DIR/missing-pair-trigger-row-fields.json" <<'JSON'
|
|
1227
|
+
{
|
|
1228
|
+
"pair_evidence_count": 1,
|
|
1229
|
+
"rows": [
|
|
1230
|
+
{
|
|
1231
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
1232
|
+
"status": "pair_evidence_passed",
|
|
1233
|
+
"passing_pair_evidence": [
|
|
1234
|
+
{
|
|
1235
|
+
"run_id": "stale-gate-run",
|
|
1236
|
+
"pair_arm": "l2_risk_probes",
|
|
1237
|
+
"bare_score": 50,
|
|
1238
|
+
"solo_score": 75,
|
|
1239
|
+
"pair_score": 96,
|
|
1240
|
+
"pair_margin": 21,
|
|
1241
|
+
"pair_mode": true,
|
|
1242
|
+
"pair_solo_wall_ratio": 1.28
|
|
1243
|
+
}
|
|
1244
|
+
]
|
|
1245
|
+
}
|
|
1246
|
+
]
|
|
1247
|
+
}
|
|
1248
|
+
JSON
|
|
1249
|
+
expect_fail_contains missing-pair-trigger-row-fields "pair evidence rows 0 do not match summary count 1" \
|
|
1250
|
+
python3 - "$SCRIPT" "$TMP_DIR/missing-pair-trigger-row-fields.json" <<'PY'
|
|
1251
|
+
import importlib.util
|
|
1252
|
+
import pathlib
|
|
1253
|
+
import sys
|
|
1254
|
+
|
|
1255
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
1256
|
+
module = importlib.util.module_from_spec(spec)
|
|
1257
|
+
assert spec.loader is not None
|
|
1258
|
+
spec.loader.exec_module(module)
|
|
1259
|
+
sys.exit(module.check_min_pair_evidence(pathlib.Path(sys.argv[2]), 1))
|
|
1260
|
+
PY
|
|
1261
|
+
|
|
1262
|
+
cat > "$TMP_DIR/zero-wall-row-fields.json" <<'JSON'
|
|
1263
|
+
{
|
|
1264
|
+
"pair_evidence_count": 1,
|
|
1265
|
+
"rows": [
|
|
1266
|
+
{
|
|
1267
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
1268
|
+
"status": "pair_evidence_passed",
|
|
1269
|
+
"passing_pair_evidence": [
|
|
1270
|
+
{
|
|
1271
|
+
"run_id": "zero-wall-run",
|
|
1272
|
+
"pair_arm": "l2_risk_probes",
|
|
1273
|
+
"bare_score": 50,
|
|
1274
|
+
"solo_score": 75,
|
|
1275
|
+
"pair_score": 96,
|
|
1276
|
+
"pair_margin": 21,
|
|
1277
|
+
"pair_mode": true,
|
|
1278
|
+
"pair_trigger_eligible": true,
|
|
1279
|
+
"pair_solo_wall_ratio": 0
|
|
1280
|
+
}
|
|
1281
|
+
]
|
|
1282
|
+
}
|
|
1283
|
+
]
|
|
1284
|
+
}
|
|
1285
|
+
JSON
|
|
1286
|
+
expect_fail_contains zero-wall-row-fields "pair evidence rows 0 do not match summary count 1" \
|
|
1287
|
+
python3 - "$SCRIPT" "$TMP_DIR/zero-wall-row-fields.json" <<'PY'
|
|
1288
|
+
import importlib.util
|
|
1289
|
+
import pathlib
|
|
1290
|
+
import sys
|
|
1291
|
+
|
|
1292
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
1293
|
+
module = importlib.util.module_from_spec(spec)
|
|
1294
|
+
assert spec.loader is not None
|
|
1295
|
+
spec.loader.exec_module(module)
|
|
1296
|
+
sys.exit(module.check_min_pair_evidence(pathlib.Path(sys.argv[2]), 1))
|
|
1297
|
+
PY
|
|
1298
|
+
|
|
1299
|
+
cat > "$TMP_DIR/bool-frontier-count.json" <<'JSON'
|
|
1300
|
+
{
|
|
1301
|
+
"pair_evidence_count": true,
|
|
1302
|
+
"rows": [
|
|
1303
|
+
{
|
|
1304
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
1305
|
+
"status": "pair_evidence_passed",
|
|
1306
|
+
"passing_pair_evidence": [
|
|
1307
|
+
{
|
|
1308
|
+
"run_id": "pair-pass",
|
|
1309
|
+
"pair_arm": "l2_risk_probes",
|
|
1310
|
+
"bare_score": 50,
|
|
1311
|
+
"solo_score": 75,
|
|
1312
|
+
"pair_score": 96,
|
|
1313
|
+
"pair_margin": 21,
|
|
1314
|
+
"pair_mode": true,
|
|
1315
|
+
"pair_trigger_eligible": true,
|
|
1316
|
+
"pair_trigger_reasons": ["complexity.high"],
|
|
1317
|
+
"pair_trigger_has_canonical_reason": true,
|
|
1318
|
+
"pair_solo_wall_ratio": 1.28
|
|
1319
|
+
}
|
|
1320
|
+
]
|
|
1321
|
+
}
|
|
1322
|
+
]
|
|
1323
|
+
}
|
|
1324
|
+
JSON
|
|
1325
|
+
expect_fail_contains malformed-pair-evidence-count "pair evidence count missing or malformed from frontier report" \
|
|
1326
|
+
python3 - "$SCRIPT" "$TMP_DIR/bool-frontier-count.json" <<'PY'
|
|
1327
|
+
import importlib.util
|
|
1328
|
+
import pathlib
|
|
1329
|
+
import sys
|
|
1330
|
+
|
|
1331
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
1332
|
+
module = importlib.util.module_from_spec(spec)
|
|
1333
|
+
assert spec.loader is not None
|
|
1334
|
+
spec.loader.exec_module(module)
|
|
1335
|
+
sys.exit(module.check_min_pair_evidence(pathlib.Path(sys.argv[2]), 1))
|
|
1336
|
+
PY
|
|
1337
|
+
|
|
1338
|
+
cat > "$TMP_DIR/mismatched-frontier-rows.json" <<'JSON'
|
|
1339
|
+
{
|
|
1340
|
+
"pair_evidence_count": 2,
|
|
1341
|
+
"rows": [
|
|
1342
|
+
{
|
|
1343
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
1344
|
+
"status": "pair_evidence_passed",
|
|
1345
|
+
"passing_pair_evidence": [
|
|
1346
|
+
{
|
|
1347
|
+
"run_id": "pair-pass",
|
|
1348
|
+
"pair_arm": "l2_risk_probes",
|
|
1349
|
+
"bare_score": 50,
|
|
1350
|
+
"solo_score": 75,
|
|
1351
|
+
"pair_score": 96,
|
|
1352
|
+
"pair_margin": 21,
|
|
1353
|
+
"pair_mode": true,
|
|
1354
|
+
"pair_trigger_eligible": true,
|
|
1355
|
+
"pair_trigger_reasons": ["complexity.high"],
|
|
1356
|
+
"pair_trigger_has_canonical_reason": true,
|
|
1357
|
+
"pair_solo_wall_ratio": 1.28
|
|
1358
|
+
}
|
|
1359
|
+
]
|
|
1360
|
+
},
|
|
1361
|
+
{
|
|
1362
|
+
"fixture": "F21-cli-scheduler-priority",
|
|
1363
|
+
"status": "pair_evidence_passed",
|
|
1364
|
+
"passing_pair_evidence": [
|
|
1365
|
+
{
|
|
1366
|
+
"run_id": "incomplete-row",
|
|
1367
|
+
"bare_score": 33,
|
|
1368
|
+
"solo_score": 66,
|
|
1369
|
+
"pair_score": 99,
|
|
1370
|
+
"pair_margin": 33,
|
|
1371
|
+
"pair_mode": true,
|
|
1372
|
+
"pair_trigger_eligible": true,
|
|
1373
|
+
"pair_solo_wall_ratio": 1.47
|
|
1374
|
+
}
|
|
1375
|
+
]
|
|
1376
|
+
}
|
|
1377
|
+
]
|
|
1378
|
+
}
|
|
1379
|
+
JSON
|
|
1380
|
+
expect_fail_contains mismatched-pair-evidence-rows "pair evidence rows 1 do not match summary count 2" \
|
|
1381
|
+
python3 - "$SCRIPT" "$TMP_DIR/mismatched-frontier-rows.json" <<'PY'
|
|
1382
|
+
import importlib.util
|
|
1383
|
+
import pathlib
|
|
1384
|
+
import sys
|
|
1385
|
+
|
|
1386
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
1387
|
+
module = importlib.util.module_from_spec(spec)
|
|
1388
|
+
assert spec.loader is not None
|
|
1389
|
+
spec.loader.exec_module(module)
|
|
1390
|
+
sys.exit(module.check_min_pair_evidence(pathlib.Path(sys.argv[2]), 1))
|
|
1391
|
+
PY
|
|
1392
|
+
|
|
1393
|
+
expect_fail_contains min-pair-evidence "pair evidence count 2 below required minimum 4" \
|
|
1394
|
+
python3 "$SCRIPT" \
|
|
1395
|
+
--fixtures-root "$fixtures" \
|
|
1396
|
+
--registry "$registry" \
|
|
1397
|
+
--results-root "$results" \
|
|
1398
|
+
--out-dir "$TMP_DIR/out-low-evidence"
|
|
1399
|
+
grep -Fq 'FAIL audit-pair-evidence' "$TMP_DIR/min-pair-evidence.out"
|
|
1400
|
+
grep -Fq 'headroom_rejections=PASS verdict=PASS unrecorded=0 unsupported=0' "$TMP_DIR/min-pair-evidence.out"
|
|
1401
|
+
grep -Fq 'pair_evidence_quality=PASS min_pair_margin_actual=+21 min_pair_margin_required=+5 max_wall_actual=1.47x max_wall_allowed=3.00x' "$TMP_DIR/min-pair-evidence.out"
|
|
1402
|
+
grep -Fq 'pair_trigger_reasons=PASS canonical=2 historical_alias=1 exposed=2 total=2 summary=2 rows_match=true' "$TMP_DIR/min-pair-evidence.out"
|
|
1403
|
+
grep -Fq 'pair_trigger_historical_aliases=F21-cli-scheduler-priority=risk_profile.high_risk' "$TMP_DIR/min-pair-evidence.out"
|
|
1404
|
+
grep -Fq 'pair_evidence_hypothesis_triggers=WARN matched=0 documented=2 total=2' "$TMP_DIR/min-pair-evidence.out"
|
|
1405
|
+
grep -Fq 'pair_evidence_hypothesis_trigger_gaps=F16-cli-quote-tax-rules=complexity.high;F21-cli-scheduler-priority=complexity.high,risk_profile.high_risk' "$TMP_DIR/min-pair-evidence.out"
|
|
1406
|
+
python3 - "$TMP_DIR/out-low-evidence/audit.json" <<'PY'
|
|
1407
|
+
import json
|
|
1408
|
+
import sys
|
|
1409
|
+
|
|
1410
|
+
report = json.load(open(sys.argv[1], encoding="utf8"))
|
|
1411
|
+
assert report["verdict"] == "FAIL"
|
|
1412
|
+
assert report["checks"]["frontier"]["status"] == "PASS"
|
|
1413
|
+
assert report["checks"]["headroom_rejections"]["status"] == "PASS"
|
|
1414
|
+
assert report["checks"]["headroom_rejections"]["report_check_exit_code"] == 0
|
|
1415
|
+
assert report["checks"]["headroom_rejections"]["verdict"] == "PASS"
|
|
1416
|
+
assert report["checks"]["headroom_rejections"]["unrecorded_failure_count"] == 0
|
|
1417
|
+
assert report["checks"]["headroom_rejections"]["unsupported_registry_rejection_count"] == 0
|
|
1418
|
+
assert report["checks"]["min_pair_evidence"]["status"] == "FAIL"
|
|
1419
|
+
assert report["checks"]["min_pair_evidence"]["required"] == 4
|
|
1420
|
+
assert report["checks"]["min_pair_evidence"]["actual_rows"] == 2
|
|
1421
|
+
assert report["checks"]["pair_evidence_quality"]["status"] == "PASS"
|
|
1422
|
+
assert report["checks"]["pair_evidence_quality"]["min_pair_margin_actual"] == 21
|
|
1423
|
+
assert report["checks"]["pair_evidence_quality"]["max_pair_solo_wall_ratio_actual"] == 1.47
|
|
1424
|
+
assert report["checks"]["pair_trigger_reasons"]["status"] == "PASS"
|
|
1425
|
+
assert report["checks"]["pair_trigger_reasons"]["summary_pair_evidence_count"] == 2
|
|
1426
|
+
assert report["checks"]["pair_trigger_reasons"]["canonical_rows"] == 2
|
|
1427
|
+
assert report["checks"]["pair_trigger_reasons"]["historical_alias_rows"] == 1
|
|
1428
|
+
assert report["checks"]["pair_trigger_reasons"]["historical_alias_details"] == [
|
|
1429
|
+
{"fixture": "F21-cli-scheduler-priority", "aliases": ["risk_profile.high_risk"]}
|
|
1430
|
+
]
|
|
1431
|
+
assert report["checks"]["pair_trigger_reasons"]["exposed_rows"] == 2
|
|
1432
|
+
assert report["checks"]["pair_trigger_reasons"]["total_rows"] == 2
|
|
1433
|
+
assert report["checks"]["pair_trigger_reasons"]["rows_match_count"] is True
|
|
1434
|
+
assert report["checks"]["pair_evidence_hypotheses"]["status"] == "PASS"
|
|
1435
|
+
assert report["checks"]["pair_evidence_hypotheses"]["documented_rows"] == 2
|
|
1436
|
+
assert report["checks"]["pair_evidence_hypotheses"]["total_rows"] == 2
|
|
1437
|
+
assert report["checks"]["pair_evidence_hypothesis_triggers"]["status"] == "WARN"
|
|
1438
|
+
assert report["checks"]["pair_evidence_hypothesis_triggers"]["exit_code"] == 0
|
|
1439
|
+
assert report["checks"]["pair_evidence_hypothesis_triggers"]["required"] is False
|
|
1440
|
+
assert report["checks"]["pair_evidence_hypothesis_triggers"]["matched_rows"] == 0
|
|
1441
|
+
assert report["checks"]["pair_evidence_hypothesis_triggers"]["documented_rows"] == 2
|
|
1442
|
+
assert report["checks"]["pair_evidence_hypothesis_triggers"]["total_rows"] == 2
|
|
1443
|
+
assert report["checks"]["pair_evidence_hypothesis_triggers"]["gap_details"] == [
|
|
1444
|
+
{
|
|
1445
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
1446
|
+
"pair_trigger_reasons": ["complexity.high"],
|
|
1447
|
+
},
|
|
1448
|
+
{
|
|
1449
|
+
"fixture": "F21-cli-scheduler-priority",
|
|
1450
|
+
"pair_trigger_reasons": ["complexity.high", "risk_profile.high_risk"],
|
|
1451
|
+
},
|
|
1452
|
+
]
|
|
1453
|
+
PY
|
|
1454
|
+
|
|
1455
|
+
cat > "$TMP_DIR/low-quality-frontier.json" <<'JSON'
|
|
1456
|
+
{
|
|
1457
|
+
"pair_margin_min": 4,
|
|
1458
|
+
"pair_solo_wall_ratio_max": 1.2,
|
|
1459
|
+
"rows": [
|
|
1460
|
+
{
|
|
1461
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
1462
|
+
"status": "pair_evidence_passed",
|
|
1463
|
+
"passing_pair_evidence": [
|
|
1464
|
+
{
|
|
1465
|
+
"run_id": "low-quality-run",
|
|
1466
|
+
"pair_arm": "l2_risk_probes",
|
|
1467
|
+
"bare_score": 50,
|
|
1468
|
+
"solo_score": 75,
|
|
1469
|
+
"pair_score": 79,
|
|
1470
|
+
"pair_margin": 4,
|
|
1471
|
+
"pair_mode": true,
|
|
1472
|
+
"pair_trigger_eligible": true,
|
|
1473
|
+
"pair_trigger_reasons": ["complexity.high"],
|
|
1474
|
+
"pair_trigger_has_canonical_reason": true,
|
|
1475
|
+
"pair_solo_wall_ratio": 1.2
|
|
1476
|
+
}
|
|
1477
|
+
]
|
|
1478
|
+
}
|
|
1479
|
+
]
|
|
1480
|
+
}
|
|
1481
|
+
JSON
|
|
1482
|
+
expect_fail_contains low-quality-pair-evidence "pair evidence margin below minimum for fixture(s): F16-cli-quote-tax-rules" \
|
|
1483
|
+
python3 - "$SCRIPT" "$TMP_DIR/low-quality-frontier.json" <<'PY'
|
|
1484
|
+
import importlib.util
|
|
1485
|
+
import pathlib
|
|
1486
|
+
import sys
|
|
1487
|
+
|
|
1488
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
1489
|
+
module = importlib.util.module_from_spec(spec)
|
|
1490
|
+
assert spec.loader is not None
|
|
1491
|
+
spec.loader.exec_module(module)
|
|
1492
|
+
sys.exit(module.check_pair_evidence_quality(
|
|
1493
|
+
pathlib.Path(sys.argv[2]),
|
|
1494
|
+
min_pair_margin=5,
|
|
1495
|
+
max_pair_solo_wall_ratio=3.0,
|
|
1496
|
+
))
|
|
1497
|
+
PY
|
|
1498
|
+
python3 - "$SCRIPT" "$TMP_DIR/low-quality-frontier.json" > "$TMP_DIR/low-quality-quality-row.out" <<'PY'
|
|
1499
|
+
import importlib.util
|
|
1500
|
+
import pathlib
|
|
1501
|
+
import sys
|
|
1502
|
+
|
|
1503
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
1504
|
+
module = importlib.util.module_from_spec(spec)
|
|
1505
|
+
assert spec.loader is not None
|
|
1506
|
+
spec.loader.exec_module(module)
|
|
1507
|
+
module.print_pair_evidence_quality(
|
|
1508
|
+
pathlib.Path(sys.argv[2]),
|
|
1509
|
+
min_pair_margin=5,
|
|
1510
|
+
max_pair_solo_wall_ratio=3.0,
|
|
1511
|
+
status=1,
|
|
1512
|
+
)
|
|
1513
|
+
PY
|
|
1514
|
+
grep -Fq 'pair_evidence_quality=FAIL min_pair_margin_actual=+4 min_pair_margin_required=+5 max_wall_actual=1.20x max_wall_allowed=3.00x' "$TMP_DIR/low-quality-quality-row.out"
|
|
1515
|
+
|
|
1516
|
+
cat > "$TMP_DIR/no-quality-rows-frontier.json" <<'JSON'
|
|
1517
|
+
{
|
|
1518
|
+
"pair_margin_min": 21,
|
|
1519
|
+
"pair_solo_wall_ratio_max": 1.2,
|
|
1520
|
+
"rows": []
|
|
1521
|
+
}
|
|
1522
|
+
JSON
|
|
1523
|
+
expect_fail_contains no-quality-rows "pair evidence quality check has no complete rows" \
|
|
1524
|
+
python3 - "$SCRIPT" "$TMP_DIR/no-quality-rows-frontier.json" <<'PY'
|
|
1525
|
+
import importlib.util
|
|
1526
|
+
import pathlib
|
|
1527
|
+
import sys
|
|
1528
|
+
|
|
1529
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
1530
|
+
module = importlib.util.module_from_spec(spec)
|
|
1531
|
+
assert spec.loader is not None
|
|
1532
|
+
spec.loader.exec_module(module)
|
|
1533
|
+
sys.exit(module.check_pair_evidence_quality(
|
|
1534
|
+
pathlib.Path(sys.argv[2]),
|
|
1535
|
+
min_pair_margin=5,
|
|
1536
|
+
max_pair_solo_wall_ratio=3.0,
|
|
1537
|
+
))
|
|
1538
|
+
PY
|
|
1539
|
+
|
|
1540
|
+
cat > "$TMP_DIR/high-wall-frontier.json" <<'JSON'
|
|
1541
|
+
{
|
|
1542
|
+
"pair_margin_min": 21,
|
|
1543
|
+
"pair_solo_wall_ratio_max": 3.5,
|
|
1544
|
+
"rows": [
|
|
1545
|
+
{
|
|
1546
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
1547
|
+
"status": "pair_evidence_passed",
|
|
1548
|
+
"passing_pair_evidence": [
|
|
1549
|
+
{
|
|
1550
|
+
"run_id": "high-wall-run",
|
|
1551
|
+
"pair_arm": "l2_risk_probes",
|
|
1552
|
+
"bare_score": 50,
|
|
1553
|
+
"solo_score": 75,
|
|
1554
|
+
"pair_score": 96,
|
|
1555
|
+
"pair_margin": 21,
|
|
1556
|
+
"pair_mode": true,
|
|
1557
|
+
"pair_trigger_eligible": true,
|
|
1558
|
+
"pair_trigger_reasons": ["complexity.high"],
|
|
1559
|
+
"pair_trigger_has_canonical_reason": true,
|
|
1560
|
+
"pair_solo_wall_ratio": 3.5
|
|
1561
|
+
}
|
|
1562
|
+
]
|
|
1563
|
+
}
|
|
1564
|
+
]
|
|
1565
|
+
}
|
|
1566
|
+
JSON
|
|
1567
|
+
expect_fail_contains high-wall-pair-evidence "pair evidence wall ratio above maximum for fixture(s): F16-cli-quote-tax-rules" \
|
|
1568
|
+
python3 - "$SCRIPT" "$TMP_DIR/high-wall-frontier.json" <<'PY'
|
|
1569
|
+
import importlib.util
|
|
1570
|
+
import pathlib
|
|
1571
|
+
import sys
|
|
1572
|
+
|
|
1573
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
1574
|
+
module = importlib.util.module_from_spec(spec)
|
|
1575
|
+
assert spec.loader is not None
|
|
1576
|
+
spec.loader.exec_module(module)
|
|
1577
|
+
sys.exit(module.check_pair_evidence_quality(
|
|
1578
|
+
pathlib.Path(sys.argv[2]),
|
|
1579
|
+
min_pair_margin=5,
|
|
1580
|
+
max_pair_solo_wall_ratio=3.0,
|
|
1581
|
+
))
|
|
1582
|
+
PY
|
|
1583
|
+
|
|
1584
|
+
cat > "$TMP_DIR/summary-mismatch-frontier.json" <<'JSON'
|
|
1585
|
+
{
|
|
1586
|
+
"pair_margin_min": 22,
|
|
1587
|
+
"pair_solo_wall_ratio_max": 1.2,
|
|
1588
|
+
"rows": [
|
|
1589
|
+
{
|
|
1590
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
1591
|
+
"status": "pair_evidence_passed",
|
|
1592
|
+
"passing_pair_evidence": [
|
|
1593
|
+
{
|
|
1594
|
+
"run_id": "summary-mismatch-run",
|
|
1595
|
+
"pair_arm": "l2_risk_probes",
|
|
1596
|
+
"bare_score": 50,
|
|
1597
|
+
"solo_score": 75,
|
|
1598
|
+
"pair_score": 96,
|
|
1599
|
+
"pair_margin": 21,
|
|
1600
|
+
"pair_mode": true,
|
|
1601
|
+
"pair_trigger_eligible": true,
|
|
1602
|
+
"pair_trigger_reasons": ["complexity.high"],
|
|
1603
|
+
"pair_trigger_has_canonical_reason": true,
|
|
1604
|
+
"pair_solo_wall_ratio": 1.2
|
|
1605
|
+
}
|
|
1606
|
+
]
|
|
1607
|
+
}
|
|
1608
|
+
]
|
|
1609
|
+
}
|
|
1610
|
+
JSON
|
|
1611
|
+
expect_fail_contains summary-margin-mismatch "frontier pair_margin_min does not match pair evidence rows" \
|
|
1612
|
+
python3 - "$SCRIPT" "$TMP_DIR/summary-mismatch-frontier.json" <<'PY'
|
|
1613
|
+
import importlib.util
|
|
1614
|
+
import pathlib
|
|
1615
|
+
import sys
|
|
1616
|
+
|
|
1617
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
1618
|
+
module = importlib.util.module_from_spec(spec)
|
|
1619
|
+
assert spec.loader is not None
|
|
1620
|
+
spec.loader.exec_module(module)
|
|
1621
|
+
sys.exit(module.check_pair_evidence_quality(
|
|
1622
|
+
pathlib.Path(sys.argv[2]),
|
|
1623
|
+
min_pair_margin=5,
|
|
1624
|
+
max_pair_solo_wall_ratio=3.0,
|
|
1625
|
+
))
|
|
1626
|
+
PY
|
|
1627
|
+
|
|
1628
|
+
cat > "$TMP_DIR/summary-wall-mismatch-frontier.json" <<'JSON'
|
|
1629
|
+
{
|
|
1630
|
+
"pair_margin_min": 21,
|
|
1631
|
+
"pair_solo_wall_ratio_max": 1.3,
|
|
1632
|
+
"rows": [
|
|
1633
|
+
{
|
|
1634
|
+
"fixture": "F16-cli-quote-tax-rules",
|
|
1635
|
+
"status": "pair_evidence_passed",
|
|
1636
|
+
"passing_pair_evidence": [
|
|
1637
|
+
{
|
|
1638
|
+
"run_id": "summary-wall-mismatch-run",
|
|
1639
|
+
"pair_arm": "l2_risk_probes",
|
|
1640
|
+
"bare_score": 50,
|
|
1641
|
+
"solo_score": 75,
|
|
1642
|
+
"pair_score": 96,
|
|
1643
|
+
"pair_margin": 21,
|
|
1644
|
+
"pair_mode": true,
|
|
1645
|
+
"pair_trigger_eligible": true,
|
|
1646
|
+
"pair_trigger_reasons": ["complexity.high"],
|
|
1647
|
+
"pair_trigger_has_canonical_reason": true,
|
|
1648
|
+
"pair_solo_wall_ratio": 1.2
|
|
1649
|
+
}
|
|
1650
|
+
]
|
|
1651
|
+
}
|
|
1652
|
+
]
|
|
1653
|
+
}
|
|
1654
|
+
JSON
|
|
1655
|
+
expect_fail_contains summary-wall-mismatch "frontier pair_solo_wall_ratio_max does not match pair evidence rows" \
|
|
1656
|
+
python3 - "$SCRIPT" "$TMP_DIR/summary-wall-mismatch-frontier.json" <<'PY'
|
|
1657
|
+
import importlib.util
|
|
1658
|
+
import pathlib
|
|
1659
|
+
import sys
|
|
1660
|
+
|
|
1661
|
+
spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
|
|
1662
|
+
module = importlib.util.module_from_spec(spec)
|
|
1663
|
+
assert spec.loader is not None
|
|
1664
|
+
spec.loader.exec_module(module)
|
|
1665
|
+
sys.exit(module.check_pair_evidence_quality(
|
|
1666
|
+
pathlib.Path(sys.argv[2]),
|
|
1667
|
+
min_pair_margin=5,
|
|
1668
|
+
max_pair_solo_wall_ratio=3.0,
|
|
1669
|
+
))
|
|
1670
|
+
PY
|
|
1671
|
+
|
|
1672
|
+
echo "PASS test-audit-pair-evidence"
|