devlyn-cli 2.3.0 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +1 -1
- package/CLAUDE.md +2 -2
- package/README.md +82 -29
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
- package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
- package/benchmark/auto-resolve/README.md +307 -44
- package/benchmark/auto-resolve/RUBRIC.md +23 -14
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
- package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
- package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
- package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
- package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
- package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
- package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
- package/benchmark/auto-resolve/scripts/judge.sh +153 -26
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
- package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
- package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
- package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
- package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
- package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
- package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
- package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
- package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
- package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
- package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
- package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
- package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
- package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
- package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
- package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
- package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
- package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
- package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
- package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
- package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
- package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
- package/bin/devlyn.js +211 -18
- package/config/skills/_shared/adapters/README.md +3 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
- package/config/skills/_shared/adapters/opus-4-7.md +9 -1
- package/config/skills/_shared/archive_run.py +78 -6
- package/config/skills/_shared/codex-config.md +3 -2
- package/config/skills/_shared/codex-monitored.sh +46 -1
- package/config/skills/_shared/collect-codex-findings.py +20 -5
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/runtime-principles.md +5 -8
- package/config/skills/_shared/spec-verify-check.py +2664 -107
- package/config/skills/_shared/verify-merge-findings.py +1369 -19
- package/config/skills/devlyn:ideate/SKILL.md +7 -4
- package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
- package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
- package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
- package/config/skills/devlyn:resolve/SKILL.md +49 -18
- package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
- package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
- package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
- package/package.json +47 -2
- package/scripts/lint-fixtures.sh +349 -0
- package/scripts/lint-shadow-fixtures.sh +58 -0
- package/scripts/lint-skills.sh +3642 -92
- /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
|
@@ -17,28 +17,30 @@ write_fixture() {
|
|
|
17
17
|
local pair_mode="${6:-true}"
|
|
18
18
|
local pair_elapsed="${7:-200}"
|
|
19
19
|
local solo_elapsed="${8:-100}"
|
|
20
|
-
local pair_arm="${9:-
|
|
20
|
+
local pair_arm="${9:-l2_risk_probes}"
|
|
21
21
|
local dir="$TMP_DIR/$run_id/$fixture"
|
|
22
22
|
mkdir -p "$dir/bare" "$dir/solo_claude" "$dir/$pair_arm"
|
|
23
23
|
cat > "$dir/judge.json" <<EOF
|
|
24
24
|
{
|
|
25
25
|
"scores_by_arm": {"bare": $bare, "solo_claude": $solo, "$pair_arm": $pair},
|
|
26
|
+
"_blind_mapping": {"A": "bare", "B": "solo_claude", "C": "$pair_arm", "seed": 1},
|
|
26
27
|
"disqualifiers_by_arm": {}
|
|
27
28
|
}
|
|
28
29
|
EOF
|
|
29
30
|
for arm in bare solo_claude "$pair_arm"; do
|
|
30
31
|
cat > "$dir/$arm/verify.json" <<'EOF'
|
|
31
|
-
{"disqualifier": false}
|
|
32
|
+
{"disqualifier": false, "verify_score": 1.0}
|
|
32
33
|
EOF
|
|
34
|
+
: > "$dir/$arm/diff.patch"
|
|
33
35
|
done
|
|
34
36
|
cat > "$dir/bare/result.json" <<'EOF'
|
|
35
37
|
{"timed_out": false, "invoke_failure": false, "disqualifier": false, "elapsed_seconds": 20}
|
|
36
38
|
EOF
|
|
37
39
|
cat > "$dir/solo_claude/result.json" <<EOF
|
|
38
|
-
{"timed_out": false, "invoke_failure": false, "disqualifier": false, "elapsed_seconds": $solo_elapsed}
|
|
40
|
+
{"timed_out": false, "invoke_failure": false, "disqualifier": false, "elapsed_seconds": $solo_elapsed, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
|
|
39
41
|
EOF
|
|
40
42
|
cat > "$dir/$pair_arm/result.json" <<EOF
|
|
41
|
-
{"timed_out": false, "invoke_failure": false, "disqualifier": false, "elapsed_seconds": $pair_elapsed, "pair_mode": $pair_mode}
|
|
43
|
+
{"timed_out": false, "invoke_failure": false, "disqualifier": false, "elapsed_seconds": $pair_elapsed, "pair_mode": $pair_mode, "pair_trigger": {"eligible": true, "reasons": ["complexity.high"], "skipped_reason": null}, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
|
|
42
44
|
EOF
|
|
43
45
|
}
|
|
44
46
|
|
|
@@ -60,39 +62,455 @@ expect_fail_contains() {
|
|
|
60
62
|
}
|
|
61
63
|
|
|
62
64
|
write_fixture pass F21 50 75 82 true 220 110
|
|
63
|
-
write_fixture pass
|
|
65
|
+
write_fixture pass F23 55 75 83 true 280 140
|
|
66
|
+
expect_fail_contains missing-rejected-registry "rejected fixture registry missing" \
|
|
67
|
+
env PAIR_REJECTED_FIXTURES_REGISTRY="$TMP_DIR/missing-registry.sh" \
|
|
68
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id pass --min-fixtures 1
|
|
69
|
+
empty_registry="$TMP_DIR/empty-registry.sh"
|
|
70
|
+
: > "$empty_registry"
|
|
71
|
+
expect_fail_contains empty-rejected-registry "rejected fixture registry has no fixture entries" \
|
|
72
|
+
env PAIR_REJECTED_FIXTURES_REGISTRY="$empty_registry" \
|
|
73
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id pass --min-fixtures 1
|
|
64
74
|
python3 "$GATE" --results-root "$TMP_DIR" --run-id pass \
|
|
65
75
|
--max-pair-solo-wall-ratio 3 \
|
|
66
76
|
--out-json "$TMP_DIR/pass.json" \
|
|
67
77
|
--out-md "$TMP_DIR/pass.md"
|
|
68
78
|
grep -Fq '"verdict": "PASS"' "$TMP_DIR/pass.json"
|
|
79
|
+
grep -Fq '"avg_pair_margin": 7.5' "$TMP_DIR/pass.json"
|
|
69
80
|
grep -Fq '"avg_pair_solo_wall_ratio": 2.0' "$TMP_DIR/pass.json"
|
|
81
|
+
grep -Fq '"max_pair_solo_wall_ratio": 3.0' "$TMP_DIR/pass.json"
|
|
82
|
+
grep -Fq '"max_observed_pair_solo_wall_ratio": 2.0' "$TMP_DIR/pass.json"
|
|
83
|
+
grep -Fq '"require_hypothesis_trigger": false' "$TMP_DIR/pass.json"
|
|
84
|
+
grep -Fq '"pair_trigger_has_canonical_reason": true' "$TMP_DIR/pass.json"
|
|
85
|
+
grep -Fq '"pair_trigger_has_hypothesis_reason": false' "$TMP_DIR/pass.json"
|
|
86
|
+
grep -Fq 'pair_trigger eligible with a canonical reason' "$TMP_DIR/pass.json"
|
|
70
87
|
grep -Fq 'Verdict: **PASS**' "$TMP_DIR/pass.md"
|
|
88
|
+
grep -Fq 'Fixtures passed: 2/2 (minimum required: 2)' "$TMP_DIR/pass.md"
|
|
89
|
+
grep -Fq 'Average pair margin: +7.5' "$TMP_DIR/pass.md"
|
|
90
|
+
grep -Fq 'Allowed pair/solo wall ratio: 3.00x' "$TMP_DIR/pass.md"
|
|
91
|
+
grep -Fq 'Maximum observed pair/solo wall ratio: 2.00x' "$TMP_DIR/pass.md"
|
|
92
|
+
grep -Fq 'Hypothesis trigger required: false' "$TMP_DIR/pass.md"
|
|
93
|
+
grep -Fq 'pair_trigger eligible with canonical reason' "$TMP_DIR/pass.md"
|
|
94
|
+
grep -Fq '"min_bare_headroom_required": 5' "$TMP_DIR/pass.json"
|
|
95
|
+
grep -Fq '"min_solo_headroom_required": 5' "$TMP_DIR/pass.json"
|
|
96
|
+
grep -Fq '| Fixture | Bare | Bare headroom | Solo_claude | Solo_claude headroom | Pair | Margin | Pair mode | Hypothesis trigger | Triggers | Wall ratio | Status | Reason |' "$TMP_DIR/pass.md"
|
|
97
|
+
grep -Fq '| F21 | 50 | 10 | 75 | 5 | 82 | +7 | true | false | complexity.high | 2.00x | PASS | |' "$TMP_DIR/pass.md"
|
|
98
|
+
grep -Fq '| F23 | 55 | 5 | 75 | 5 | 83 | +8 | true | false | complexity.high | 2.00x | PASS | |' "$TMP_DIR/pass.md"
|
|
99
|
+
|
|
100
|
+
write_fixture nan-result F21 50 75 85 true
|
|
101
|
+
cat > "$TMP_DIR/nan-result/F21/l2_risk_probes/result.json" <<'EOF'
|
|
102
|
+
{"timed_out": false, "invoke_failure": false, "disqualifier": false, "elapsed_seconds": NaN, "pair_mode": true, "pair_trigger": {"eligible": true, "reasons": ["complexity.high"], "skipped_reason": null}, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
|
|
103
|
+
EOF
|
|
104
|
+
expect_fail_contains nan-result-json "l2_risk_probes result.json malformed" \
|
|
105
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id nan-result --min-fixtures 1
|
|
106
|
+
|
|
107
|
+
write_fixture rejected-direct F2 50 75 85 true
|
|
108
|
+
write_fixture rejected-direct F21 50 75 85 true
|
|
109
|
+
expect_fail_contains rejected-direct "fixture rejected for pair-candidate runs" \
|
|
110
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id rejected-direct --min-fixtures 1
|
|
111
|
+
|
|
112
|
+
write_fixture rejected-shadow-direct S3-cli-ticket-assignment 50 75 85 true
|
|
113
|
+
write_fixture rejected-shadow-direct F21 50 75 85 true
|
|
114
|
+
expect_fail_contains rejected-shadow-direct "fixture rejected for pair-candidate runs" \
|
|
115
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id rejected-shadow-direct --min-fixtures 1
|
|
116
|
+
|
|
117
|
+
write_fixture partial-baseline F21 50 75 85 true
|
|
118
|
+
write_fixture partial-baseline F23 55 75 90 true
|
|
119
|
+
python3 - "$TMP_DIR/partial-baseline/F21/solo_claude/verify.json" <<'PY'
|
|
120
|
+
import json, sys
|
|
121
|
+
path = sys.argv[1]
|
|
122
|
+
data = json.load(open(path))
|
|
123
|
+
data["verify_score"] = 0.75
|
|
124
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
125
|
+
PY
|
|
126
|
+
python3 - "$TMP_DIR/partial-baseline/F21/solo_claude/result.json" <<'PY'
|
|
127
|
+
import json, sys
|
|
128
|
+
path = sys.argv[1]
|
|
129
|
+
data = json.load(open(path))
|
|
130
|
+
data["terminal_verdict"] = "FAIL"
|
|
131
|
+
data["verify_verdict"] = "FAIL"
|
|
132
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
133
|
+
PY
|
|
134
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id partial-baseline \
|
|
135
|
+
--max-pair-solo-wall-ratio 3 \
|
|
136
|
+
--out-json "$TMP_DIR/partial-baseline.json" \
|
|
137
|
+
--out-md "$TMP_DIR/partial-baseline.md"
|
|
138
|
+
grep -Fq '"verdict": "PASS"' "$TMP_DIR/partial-baseline.json"
|
|
139
|
+
grep -Fq '| F21 | 50 | 10 | 75 | 5 | 85 | +10 | true | false | complexity.high | 2.00x | PASS | |' "$TMP_DIR/partial-baseline.md"
|
|
71
140
|
|
|
72
141
|
write_fixture no-headroom F21 50 81 90 true
|
|
73
|
-
write_fixture no-headroom
|
|
142
|
+
write_fixture no-headroom F23 55 75 83 true
|
|
74
143
|
expect_fail_contains no-headroom "solo_claude score 81 > 80" \
|
|
75
144
|
python3 "$GATE" --results-root "$TMP_DIR" --run-id no-headroom
|
|
76
145
|
|
|
77
|
-
write_fixture
|
|
78
|
-
write_fixture
|
|
146
|
+
write_fixture marginal-headroom F21 59 66 85 true
|
|
147
|
+
write_fixture marginal-headroom F23 50 75 82 true
|
|
148
|
+
expect_fail_contains marginal-headroom "bare headroom 1 < 5" \
|
|
149
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id marginal-headroom
|
|
150
|
+
|
|
151
|
+
write_fixture dirty-bare F21 50 75 85 true
|
|
152
|
+
python3 - "$TMP_DIR/dirty-bare/F21/bare/result.json" <<'PY'
|
|
153
|
+
import json, sys
|
|
154
|
+
path = sys.argv[1]
|
|
155
|
+
data = json.load(open(path))
|
|
156
|
+
data["disqualifier"] = True
|
|
157
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
158
|
+
PY
|
|
159
|
+
expect_fail_contains dirty-bare "bare result disqualifier" \
|
|
160
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id dirty-bare --min-fixtures 1
|
|
161
|
+
|
|
162
|
+
write_fixture dirty-solo F21 50 75 85 true
|
|
163
|
+
python3 - "$TMP_DIR/dirty-solo/F21/solo_claude/verify.json" <<'PY'
|
|
164
|
+
import json, sys
|
|
165
|
+
path = sys.argv[1]
|
|
166
|
+
data = json.load(open(path))
|
|
167
|
+
data["disqualifier"] = True
|
|
168
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
169
|
+
PY
|
|
170
|
+
expect_fail_contains dirty-solo "solo_claude verify disqualifier" \
|
|
171
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id dirty-solo --min-fixtures 1
|
|
172
|
+
|
|
173
|
+
write_fixture control-ceiling F22-cli-ledger-close 94 98 99 true 140 100 l2_risk_probes
|
|
174
|
+
write_fixture control-ceiling F26-cli-payout-ledger-rules 25 98 99 true 140 100 l2_risk_probes
|
|
175
|
+
expect_fail_contains control-ceiling "solo_claude score 98 > 80" \
|
|
176
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id control-ceiling \
|
|
177
|
+
--pair-arm l2_risk_probes --min-fixtures 2
|
|
178
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id control-ceiling \
|
|
179
|
+
--pair-arm l2_risk_probes --min-fixtures 2 \
|
|
180
|
+
--out-json "$TMP_DIR/control-ceiling.json" \
|
|
181
|
+
--out-md "$TMP_DIR/control-ceiling.md" >/dev/null 2>&1 || true
|
|
182
|
+
grep -Fq '"verdict": "FAIL"' "$TMP_DIR/control-ceiling.json"
|
|
183
|
+
grep -Fq 'F22-cli-ledger-close' "$TMP_DIR/control-ceiling.md"
|
|
184
|
+
grep -Fq 'F26-cli-payout-ledger-rules' "$TMP_DIR/control-ceiling.md"
|
|
185
|
+
|
|
186
|
+
write_fixture no-pair-mode F21 50 75 85 false 200 100 l2_gated
|
|
187
|
+
write_fixture no-pair-mode F23 55 75 85 true 200 100 l2_gated
|
|
79
188
|
expect_fail_contains no-pair-mode "l2_gated pair_mode not true" \
|
|
80
|
-
python3 "$GATE" --results-root "$TMP_DIR" --run-id no-pair-mode
|
|
189
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id no-pair-mode \
|
|
190
|
+
--pair-arm l2_gated
|
|
191
|
+
|
|
192
|
+
write_fixture malformed-pair-trigger F21 50 75 85 true
|
|
193
|
+
python3 - "$TMP_DIR/malformed-pair-trigger/F21/l2_risk_probes/result.json" <<'PY'
|
|
194
|
+
import json, sys
|
|
195
|
+
path = sys.argv[1]
|
|
196
|
+
data = json.load(open(path))
|
|
197
|
+
data["pair_trigger"] = {"eligible": True, "reasons": "complexity.high", "skipped_reason": None}
|
|
198
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
199
|
+
PY
|
|
200
|
+
expect_fail_contains malformed-pair-trigger "l2_risk_probes pair_trigger.reasons malformed" \
|
|
201
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-pair-trigger --min-fixtures 1
|
|
202
|
+
|
|
203
|
+
write_fixture unknown-pair-trigger-reason F21 50 75 85 true
|
|
204
|
+
python3 - "$TMP_DIR/unknown-pair-trigger-reason/F21/l2_risk_probes/result.json" <<'PY'
|
|
205
|
+
import json, sys
|
|
206
|
+
path = sys.argv[1]
|
|
207
|
+
data = json.load(open(path))
|
|
208
|
+
data["pair_trigger"] = {"eligible": True, "reasons": ["looks-hard"], "skipped_reason": None}
|
|
209
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
210
|
+
PY
|
|
211
|
+
expect_fail_contains unknown-pair-trigger-reason "l2_risk_probes pair_trigger reasons missing known trigger reason" \
|
|
212
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id unknown-pair-trigger-reason --min-fixtures 1
|
|
213
|
+
|
|
214
|
+
write_fixture mixed-unknown-pair-trigger-reason F21 50 75 85 true
|
|
215
|
+
python3 - "$TMP_DIR/mixed-unknown-pair-trigger-reason/F21/l2_risk_probes/result.json" <<'PY'
|
|
216
|
+
import json, sys
|
|
217
|
+
path = sys.argv[1]
|
|
218
|
+
data = json.load(open(path))
|
|
219
|
+
data["pair_trigger"] = {"eligible": True, "reasons": ["complexity.high", "looks-hard"], "skipped_reason": None}
|
|
220
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
221
|
+
PY
|
|
222
|
+
expect_fail_contains mixed-unknown-pair-trigger-reason "l2_risk_probes pair_trigger reasons contain unknown trigger reason" \
|
|
223
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id mixed-unknown-pair-trigger-reason --min-fixtures 1
|
|
224
|
+
|
|
225
|
+
write_fixture normalized-canonical-pair-trigger-reason F21 50 75 85 true
|
|
226
|
+
python3 - "$TMP_DIR/normalized-canonical-pair-trigger-reason/F21/l2_risk_probes/result.json" <<'PY'
|
|
227
|
+
import json, sys
|
|
228
|
+
path = sys.argv[1]
|
|
229
|
+
data = json.load(open(path))
|
|
230
|
+
data["pair_trigger"] = {"eligible": True, "reasons": ["risk high"], "skipped_reason": None}
|
|
231
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
232
|
+
PY
|
|
233
|
+
expect_fail_contains normalized-canonical-pair-trigger-reason "l2_risk_probes pair_trigger reasons missing known trigger reason" \
|
|
234
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id normalized-canonical-pair-trigger-reason --min-fixtures 1
|
|
235
|
+
|
|
236
|
+
write_fixture historical-only-pair-trigger-reason F21 50 75 85 true
|
|
237
|
+
python3 - "$TMP_DIR/historical-only-pair-trigger-reason/F21/l2_risk_probes/result.json" <<'PY'
|
|
238
|
+
import json, sys
|
|
239
|
+
path = sys.argv[1]
|
|
240
|
+
data = json.load(open(path))
|
|
241
|
+
data["pair_trigger"] = {"eligible": True, "reasons": ["risk_profile.high_risk"], "skipped_reason": None}
|
|
242
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
243
|
+
PY
|
|
244
|
+
expect_fail_contains historical-only-pair-trigger-reason "l2_risk_probes pair_trigger reasons missing canonical trigger reason" \
|
|
245
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id historical-only-pair-trigger-reason --min-fixtures 1
|
|
246
|
+
|
|
247
|
+
write_fixture missing-hypothesis-trigger F16-cli-quote-tax-rules 50 75 85 true
|
|
248
|
+
expect_fail_contains missing-hypothesis-trigger "l2_risk_probes pair_trigger missing spec.solo_headroom_hypothesis" \
|
|
249
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id missing-hypothesis-trigger --min-fixtures 1 --require-hypothesis-trigger
|
|
250
|
+
python3 - "$TMP_DIR/missing-hypothesis-trigger/F16-cli-quote-tax-rules/l2_risk_probes/result.json" <<'PY'
|
|
251
|
+
import json, sys
|
|
252
|
+
path = sys.argv[1]
|
|
253
|
+
data = json.load(open(path))
|
|
254
|
+
data["pair_trigger"]["reasons"] = ["complexity.high", "spec.solo_headroom_hypothesis"]
|
|
255
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
256
|
+
PY
|
|
257
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id missing-hypothesis-trigger \
|
|
258
|
+
--min-fixtures 1 \
|
|
259
|
+
--require-hypothesis-trigger \
|
|
260
|
+
--out-json "$TMP_DIR/hypothesis-trigger-pass.json" \
|
|
261
|
+
--out-md "$TMP_DIR/hypothesis-trigger-pass.md"
|
|
262
|
+
grep -Fq '"verdict": "PASS"' "$TMP_DIR/hypothesis-trigger-pass.json"
|
|
263
|
+
grep -Fq '"require_hypothesis_trigger": true' "$TMP_DIR/hypothesis-trigger-pass.json"
|
|
264
|
+
grep -Fq '"pair_trigger_has_hypothesis_reason": true' "$TMP_DIR/hypothesis-trigger-pass.json"
|
|
265
|
+
grep -Fq 'Hypothesis trigger required: true' "$TMP_DIR/hypothesis-trigger-pass.md"
|
|
266
|
+
grep -Fq '| F16-cli-quote-tax-rules | 50 | 10 | 75 | 5 | 85 | +10 | true | true | complexity.high,spec.solo_headroom_hypothesis | 2.00x | PASS | |' "$TMP_DIR/hypothesis-trigger-pass.md"
|
|
267
|
+
grep -Fq 'complexity.high,spec.solo_headroom_hypothesis' "$TMP_DIR/hypothesis-trigger-pass.md"
|
|
81
268
|
|
|
82
269
|
write_fixture weak-margin F21 50 75 79 true
|
|
83
|
-
write_fixture weak-margin
|
|
84
|
-
expect_fail_contains weak-margin "
|
|
270
|
+
write_fixture weak-margin F23 55 75 88 true
|
|
271
|
+
expect_fail_contains weak-margin "l2_risk_probes margin +4 < +5" \
|
|
85
272
|
python3 "$GATE" --results-root "$TMP_DIR" --run-id weak-margin
|
|
86
273
|
|
|
87
|
-
write_fixture
|
|
88
|
-
|
|
274
|
+
write_fixture dirty-pair F21 50 75 85 true
|
|
275
|
+
python3 - "$TMP_DIR/dirty-pair/F21/l2_risk_probes/verify.json" <<'PY'
|
|
276
|
+
import json, sys
|
|
277
|
+
path = sys.argv[1]
|
|
278
|
+
data = json.load(open(path))
|
|
279
|
+
data["disqualifier"] = True
|
|
280
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
281
|
+
PY
|
|
282
|
+
expect_fail_contains dirty-pair "l2_risk_probes verify disqualifier" \
|
|
283
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id dirty-pair --min-fixtures 1
|
|
284
|
+
|
|
285
|
+
write_fixture dirty-pair-verify-score F21 50 75 85 true
|
|
286
|
+
python3 - "$TMP_DIR/dirty-pair-verify-score/F21/l2_risk_probes/verify.json" <<'PY'
|
|
287
|
+
import json, sys
|
|
288
|
+
path = sys.argv[1]
|
|
289
|
+
data = json.load(open(path))
|
|
290
|
+
data["verify_score"] = 0.75
|
|
291
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
292
|
+
PY
|
|
293
|
+
expect_fail_contains dirty-pair-verify-score "l2_risk_probes verify_score < 1.0" \
|
|
294
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id dirty-pair-verify-score --min-fixtures 1
|
|
295
|
+
|
|
296
|
+
write_fixture boolean-pair-verify-score F21 50 75 85 true
|
|
297
|
+
python3 - "$TMP_DIR/boolean-pair-verify-score/F21/l2_risk_probes/verify.json" <<'PY'
|
|
298
|
+
import json, sys
|
|
299
|
+
path = sys.argv[1]
|
|
300
|
+
data = json.load(open(path))
|
|
301
|
+
data["verify_score"] = True
|
|
302
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
303
|
+
PY
|
|
304
|
+
expect_fail_contains boolean-pair-verify-score "l2_risk_probes verify_score < 1.0" \
|
|
305
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id boolean-pair-verify-score --min-fixtures 1
|
|
306
|
+
|
|
307
|
+
write_fixture dirty-pair-verdict F21 50 75 85 true
|
|
308
|
+
python3 - "$TMP_DIR/dirty-pair-verdict/F21/l2_risk_probes/result.json" <<'PY'
|
|
309
|
+
import json, sys
|
|
310
|
+
path = sys.argv[1]
|
|
311
|
+
data = json.load(open(path))
|
|
312
|
+
data["terminal_verdict"] = "BLOCKED:probe-derive-malformed"
|
|
313
|
+
data["verify_verdict"] = "BLOCKED"
|
|
314
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
315
|
+
PY
|
|
316
|
+
expect_fail_contains dirty-pair-verdict "l2_risk_probes terminal verdict not pass" \
|
|
317
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id dirty-pair-verdict --min-fixtures 1
|
|
318
|
+
|
|
319
|
+
write_fixture dirty-pair-axis F21 50 75 85 true
|
|
320
|
+
python3 - "$TMP_DIR/dirty-pair-axis/F21/judge.json" <<'PY'
|
|
321
|
+
import json, sys
|
|
322
|
+
path = sys.argv[1]
|
|
323
|
+
data = json.load(open(path))
|
|
324
|
+
data["_blind_mapping"] = {"A": "bare", "B": "solo_claude", "C": "l2_risk_probes", "seed": 1}
|
|
325
|
+
data["_axis_validation"] = {
|
|
326
|
+
"out_of_range_count": 1,
|
|
327
|
+
"out_of_range_cells": [{"breakdown": "c_breakdown", "axis": "quality", "value": 26}],
|
|
328
|
+
"axis_range": [0, 25],
|
|
329
|
+
}
|
|
330
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
331
|
+
PY
|
|
332
|
+
expect_fail_contains dirty-pair-axis "l2_risk_probes judge axis-invalid (1)" \
|
|
333
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id dirty-pair-axis --min-fixtures 1
|
|
334
|
+
|
|
335
|
+
write_fixture dirty-solo-axis F21 50 75 85 true
|
|
336
|
+
python3 - "$TMP_DIR/dirty-solo-axis/F21/judge.json" <<'PY'
|
|
337
|
+
import json, sys
|
|
338
|
+
path = sys.argv[1]
|
|
339
|
+
data = json.load(open(path))
|
|
340
|
+
data["_blind_mapping"] = {"A": "bare", "B": "solo_claude", "C": "l2_risk_probes", "seed": 1}
|
|
341
|
+
data["_axis_validation"] = {
|
|
342
|
+
"out_of_range_count": 1,
|
|
343
|
+
"out_of_range_cells": [{"breakdown": "b_breakdown", "axis": "quality", "value": 26}],
|
|
344
|
+
"axis_range": [0, 25],
|
|
345
|
+
}
|
|
346
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
347
|
+
PY
|
|
348
|
+
expect_fail_contains dirty-solo-axis "solo_claude judge axis-invalid (1)" \
|
|
349
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id dirty-solo-axis --min-fixtures 1
|
|
350
|
+
|
|
351
|
+
write_fixture unmapped-axis F21 50 75 85 true
|
|
352
|
+
python3 - "$TMP_DIR/unmapped-axis/F21/judge.json" <<'PY'
|
|
353
|
+
import json, sys
|
|
354
|
+
path = sys.argv[1]
|
|
355
|
+
data = json.load(open(path))
|
|
356
|
+
data["_blind_mapping"] = {"A": "bare", "B": "solo_claude", "C": "l2_forced", "seed": 1}
|
|
357
|
+
data["_axis_validation"] = {
|
|
358
|
+
"out_of_range_count": 1,
|
|
359
|
+
"out_of_range_cells": [{"breakdown": "c_breakdown", "axis": "quality", "value": 26}],
|
|
360
|
+
"axis_range": [0, 25],
|
|
361
|
+
}
|
|
362
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
363
|
+
PY
|
|
364
|
+
expect_fail_contains unmapped-axis "judge axis-invalid unmapped (1)" \
|
|
365
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id unmapped-axis --min-fixtures 1
|
|
366
|
+
|
|
367
|
+
write_fixture missing-mapping F21 50 75 85 true
|
|
368
|
+
python3 - "$TMP_DIR/missing-mapping/F21/judge.json" <<'PY'
|
|
369
|
+
import json, sys
|
|
370
|
+
path = sys.argv[1]
|
|
371
|
+
data = json.load(open(path))
|
|
372
|
+
del data["_blind_mapping"]
|
|
373
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
374
|
+
PY
|
|
375
|
+
expect_fail_contains missing-mapping "judge blind mapping missing" \
|
|
376
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id missing-mapping --min-fixtures 1
|
|
377
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id missing-mapping --min-fixtures 1 \
|
|
378
|
+
--out-json "$TMP_DIR/missing-mapping.json" >/dev/null 2>&1 || true
|
|
379
|
+
grep -Fq '"bare_score": null' "$TMP_DIR/missing-mapping.json"
|
|
380
|
+
grep -Fq '"solo_score": null' "$TMP_DIR/missing-mapping.json"
|
|
381
|
+
grep -Fq '"pair_score": null' "$TMP_DIR/missing-mapping.json"
|
|
382
|
+
|
|
383
|
+
write_fixture malformed-mapping-axis F21 50 75 85 true
|
|
384
|
+
python3 - "$TMP_DIR/malformed-mapping-axis/F21/judge.json" <<'PY'
|
|
385
|
+
import json, sys
|
|
386
|
+
path = sys.argv[1]
|
|
387
|
+
data = json.load(open(path))
|
|
388
|
+
data["_blind_mapping"] = "not-a-dict"
|
|
389
|
+
data["_axis_validation"] = {
|
|
390
|
+
"out_of_range_count": 1,
|
|
391
|
+
"out_of_range_cells": [{"breakdown": "c_breakdown", "axis": "quality", "value": 26}],
|
|
392
|
+
"axis_range": [0, 25],
|
|
393
|
+
}
|
|
394
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
395
|
+
PY
|
|
396
|
+
expect_fail_contains malformed-mapping-axis "judge blind mapping missing" \
|
|
397
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-mapping-axis --min-fixtures 1
|
|
398
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-mapping-axis --min-fixtures 1 \
|
|
399
|
+
--out-json "$TMP_DIR/malformed-mapping-axis.json" >/dev/null 2>&1 || true
|
|
400
|
+
grep -Fq '"bare_score": null' "$TMP_DIR/malformed-mapping-axis.json"
|
|
401
|
+
grep -Fq '"solo_score": null' "$TMP_DIR/malformed-mapping-axis.json"
|
|
402
|
+
grep -Fq '"pair_score": null' "$TMP_DIR/malformed-mapping-axis.json"
|
|
403
|
+
|
|
404
|
+
write_fixture wrong-pair-mapping F21 50 75 85 true
|
|
405
|
+
python3 - "$TMP_DIR/wrong-pair-mapping/F21/judge.json" <<'PY'
|
|
406
|
+
import json, sys
|
|
407
|
+
path = sys.argv[1]
|
|
408
|
+
data = json.load(open(path))
|
|
409
|
+
data["_blind_mapping"] = {"A": "bare", "B": "solo_claude", "C": "l2_gated", "seed": 1}
|
|
410
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
411
|
+
PY
|
|
412
|
+
expect_fail_contains wrong-pair-mapping "judge blind mapping missing arm(s): l2_risk_probes" \
|
|
413
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id wrong-pair-mapping --min-fixtures 1
|
|
414
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id wrong-pair-mapping --min-fixtures 1 \
|
|
415
|
+
--out-json "$TMP_DIR/wrong-pair-mapping.json" >/dev/null 2>&1 || true
|
|
416
|
+
grep -Fq '"bare_score": 50' "$TMP_DIR/wrong-pair-mapping.json"
|
|
417
|
+
grep -Fq '"solo_score": 75' "$TMP_DIR/wrong-pair-mapping.json"
|
|
418
|
+
grep -Fq '"pair_score": null' "$TMP_DIR/wrong-pair-mapping.json"
|
|
419
|
+
grep -Fq '"pair_margin": null' "$TMP_DIR/wrong-pair-mapping.json"
|
|
420
|
+
|
|
421
|
+
write_fixture malformed-scores F21 50 75 85 true
|
|
422
|
+
python3 - "$TMP_DIR/malformed-scores/F21/judge.json" <<'PY'
|
|
423
|
+
import json, sys
|
|
424
|
+
path = sys.argv[1]
|
|
425
|
+
data = json.load(open(path))
|
|
426
|
+
data["scores_by_arm"] = ["not", "a", "dict"]
|
|
427
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
428
|
+
PY
|
|
429
|
+
expect_fail_contains malformed-scores "bare score missing" \
|
|
430
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-scores --min-fixtures 1
|
|
431
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-scores --min-fixtures 1 \
|
|
432
|
+
--out-json "$TMP_DIR/malformed-scores.json" >/dev/null 2>&1 || true
|
|
433
|
+
grep -Fq '"bare_score": null' "$TMP_DIR/malformed-scores.json"
|
|
434
|
+
grep -Fq '"solo_score": null' "$TMP_DIR/malformed-scores.json"
|
|
435
|
+
grep -Fq '"pair_score": null' "$TMP_DIR/malformed-scores.json"
|
|
436
|
+
grep -Fq '"pair_margin": null' "$TMP_DIR/malformed-scores.json"
|
|
437
|
+
|
|
438
|
+
write_fixture overrange-score F21 50 75 101 true
|
|
439
|
+
expect_fail_contains overrange-score "l2_risk_probes score missing" \
|
|
440
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id overrange-score --min-fixtures 1
|
|
441
|
+
|
|
442
|
+
write_fixture boolean-score F21 true 75 85 true
|
|
443
|
+
expect_fail_contains boolean-score "bare score missing" \
|
|
444
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id boolean-score --min-fixtures 1
|
|
445
|
+
|
|
446
|
+
write_fixture boolean-wall-time F21 50 75 85 true true 100
|
|
447
|
+
expect_fail_contains boolean-wall-time "pair/solo wall ratio missing" \
|
|
448
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id boolean-wall-time --min-fixtures 1
|
|
449
|
+
|
|
450
|
+
write_fixture dirty-pair-env F21 50 75 85 true
|
|
451
|
+
python3 - "$TMP_DIR/dirty-pair-env/F21/l2_risk_probes/result.json" <<'PY'
|
|
452
|
+
import json, sys
|
|
453
|
+
path = sys.argv[1]
|
|
454
|
+
data = json.load(open(path))
|
|
455
|
+
data["environment_contamination"] = True
|
|
456
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
457
|
+
PY
|
|
458
|
+
expect_fail_contains dirty-pair-env "l2_risk_probes environment contamination" \
|
|
459
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id dirty-pair-env --min-fixtures 1
|
|
460
|
+
|
|
461
|
+
write_fixture malformed-pair-bool F21 50 75 85 true
|
|
462
|
+
python3 - "$TMP_DIR/malformed-pair-bool/F21/l2_risk_probes/result.json" <<'PY'
|
|
463
|
+
import json, sys
|
|
464
|
+
path = sys.argv[1]
|
|
465
|
+
data = json.load(open(path))
|
|
466
|
+
data["timed_out"] = "false"
|
|
467
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
468
|
+
PY
|
|
469
|
+
expect_fail_contains malformed-pair-bool "l2_risk_probes result timed_out malformed" \
|
|
470
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-pair-bool --min-fixtures 1
|
|
471
|
+
|
|
472
|
+
write_fixture malformed-judge-bool F21 50 75 85 true
|
|
473
|
+
python3 - "$TMP_DIR/malformed-judge-bool/F21/judge.json" <<'PY'
|
|
474
|
+
import json, sys
|
|
475
|
+
path = sys.argv[1]
|
|
476
|
+
data = json.load(open(path))
|
|
477
|
+
data["disqualifiers_by_arm"] = {"l2_risk_probes": {"disqualifier": "false"}}
|
|
478
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
479
|
+
PY
|
|
480
|
+
expect_fail_contains malformed-judge-bool "l2_risk_probes judge disqualifier malformed" \
|
|
481
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-judge-bool --min-fixtures 1
|
|
482
|
+
|
|
483
|
+
write_fixture missing-pair-diff F21 50 75 85 true
|
|
484
|
+
rm "$TMP_DIR/missing-pair-diff/F21/l2_risk_probes/diff.patch"
|
|
485
|
+
expect_fail_contains missing-pair-diff "l2_risk_probes diff.patch missing" \
|
|
486
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id missing-pair-diff --min-fixtures 1
|
|
487
|
+
|
|
488
|
+
write_fixture malformed-result-artifact F21 50 75 85 true
|
|
489
|
+
printf '["not", "a", "dict"]\n' > "$TMP_DIR/malformed-result-artifact/F21/l2_risk_probes/result.json"
|
|
490
|
+
expect_fail_contains malformed-result-artifact "l2_risk_probes result.json malformed" \
|
|
491
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-result-artifact --min-fixtures 1
|
|
492
|
+
|
|
493
|
+
write_fixture malformed-verify-artifact F21 50 75 85 true
|
|
494
|
+
printf '["not", "a", "dict"]\n' > "$TMP_DIR/malformed-verify-artifact/F21/l2_risk_probes/verify.json"
|
|
495
|
+
expect_fail_contains malformed-verify-artifact "l2_risk_probes verify.json malformed" \
|
|
496
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-verify-artifact --min-fixtures 1
|
|
497
|
+
|
|
498
|
+
write_fixture malformed-judge-artifact F21 50 75 85 true
|
|
499
|
+
printf '["not", "a", "dict"]\n' > "$TMP_DIR/malformed-judge-artifact/F21/judge.json"
|
|
500
|
+
expect_fail_contains malformed-judge-artifact "judge.json malformed" \
|
|
501
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-judge-artifact --min-fixtures 1
|
|
502
|
+
|
|
503
|
+
write_fixture custom-pair-arm F21 50 75 82 true 220 110 l2_gated
|
|
504
|
+
write_fixture custom-pair-arm F23 55 75 83 true 280 140 l2_gated
|
|
89
505
|
python3 "$GATE" --results-root "$TMP_DIR" --run-id custom-pair-arm \
|
|
90
|
-
--pair-arm
|
|
506
|
+
--pair-arm l2_gated \
|
|
91
507
|
--max-pair-solo-wall-ratio 3 \
|
|
92
508
|
--out-json "$TMP_DIR/custom-pair-arm.json" \
|
|
93
509
|
--out-md "$TMP_DIR/custom-pair-arm.md"
|
|
94
|
-
grep -Fq '"pair_arm": "
|
|
95
|
-
grep -Fq '
|
|
510
|
+
grep -Fq '"pair_arm": "l2_gated"' "$TMP_DIR/custom-pair-arm.json"
|
|
511
|
+
grep -Fq 'l2_gated must be evidence-clean' "$TMP_DIR/custom-pair-arm.json"
|
|
512
|
+
grep -Fq 'pair_trigger eligible with a canonical reason' "$TMP_DIR/custom-pair-arm.json"
|
|
513
|
+
grep -Fq 'l2_gated - solo_claude >= 5' "$TMP_DIR/custom-pair-arm.md"
|
|
96
514
|
|
|
97
515
|
write_fixture provider-limit F21 50 75 85 true 37 100 l2_risk_probes
|
|
98
516
|
python3 - "$TMP_DIR/provider-limit/F21/l2_risk_probes/result.json" <<'PY'
|
|
@@ -111,6 +529,7 @@ python3 "$GATE" --results-root "$TMP_DIR" --run-id provider-limit \
|
|
|
111
529
|
--out-json "$TMP_DIR/provider-limit.json" \
|
|
112
530
|
--out-md "$TMP_DIR/provider-limit.md" >/dev/null 2>&1 || true
|
|
113
531
|
grep -Fq '"pair_margin": null' "$TMP_DIR/provider-limit.json"
|
|
532
|
+
grep -Fq '"avg_pair_margin": null' "$TMP_DIR/provider-limit.json"
|
|
114
533
|
grep -Fq '"pair_solo_wall_ratio": null' "$TMP_DIR/provider-limit.json"
|
|
115
534
|
if grep -Fq 'margin -' "$TMP_DIR/provider-limit.md"; then
|
|
116
535
|
echo "provider-limit row must not report quality margin" >&2
|
|
@@ -119,13 +538,73 @@ if grep -Fq 'margin -' "$TMP_DIR/provider-limit.md"; then
|
|
|
119
538
|
fi
|
|
120
539
|
|
|
121
540
|
write_fixture slow-pair F21 50 75 85 true 401 100
|
|
122
|
-
write_fixture slow-pair
|
|
541
|
+
write_fixture slow-pair F23 55 75 83 true 280 140
|
|
123
542
|
expect_fail_contains slow-pair "pair/solo wall ratio 4.01 > 3.00" \
|
|
124
|
-
python3 "$GATE" --results-root "$TMP_DIR" --run-id slow-pair
|
|
543
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id slow-pair
|
|
544
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id slow-pair \
|
|
545
|
+
--max-pair-solo-wall-ratio 5 \
|
|
546
|
+
--out-json "$TMP_DIR/slow-pair-diagnostic.json" >/dev/null
|
|
547
|
+
grep -Fq '"verdict": "PASS"' "$TMP_DIR/slow-pair-diagnostic.json"
|
|
125
548
|
|
|
126
549
|
write_fixture one-fixture F21 50 75 85 true
|
|
127
550
|
expect_fail_contains one-fixture "fixture_count_ok" \
|
|
128
551
|
python3 "$GATE" --results-root "$TMP_DIR" --run-id one-fixture --out-json "$TMP_DIR/one-fixture.json"
|
|
129
552
|
grep -Fq '"fixture_count_ok": false' "$TMP_DIR/one-fixture.json"
|
|
130
553
|
|
|
554
|
+
write_fixture malformed-dq F21 50 75 85 true
|
|
555
|
+
python3 - "$TMP_DIR/malformed-dq/F21/judge.json" <<'PY'
|
|
556
|
+
import json, sys
|
|
557
|
+
path = sys.argv[1]
|
|
558
|
+
data = json.load(open(path))
|
|
559
|
+
data["disqualifiers_by_arm"] = ["not", "a", "dict"]
|
|
560
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
561
|
+
PY
|
|
562
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-dq --min-fixtures 1 \
|
|
563
|
+
--out-json "$TMP_DIR/malformed-dq.json" >/dev/null
|
|
564
|
+
grep -Fq '"verdict": "PASS"' "$TMP_DIR/malformed-dq.json"
|
|
565
|
+
|
|
566
|
+
write_fixture malformed-dq-entry F21 50 75 85 true
|
|
567
|
+
python3 - "$TMP_DIR/malformed-dq-entry/F21/judge.json" <<'PY'
|
|
568
|
+
import json, sys
|
|
569
|
+
path = sys.argv[1]
|
|
570
|
+
data = json.load(open(path))
|
|
571
|
+
data["disqualifiers_by_arm"] = {"l2_risk_probes": True}
|
|
572
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
573
|
+
PY
|
|
574
|
+
expect_fail_contains malformed-dq-entry "l2_risk_probes judge disqualifier" \
|
|
575
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-dq-entry --min-fixtures 1
|
|
576
|
+
|
|
577
|
+
write_fixture malformed-axis-wrapper F21 50 75 85 true
|
|
578
|
+
python3 - "$TMP_DIR/malformed-axis-wrapper/F21/judge.json" <<'PY'
|
|
579
|
+
import json, sys
|
|
580
|
+
path = sys.argv[1]
|
|
581
|
+
data = json.load(open(path))
|
|
582
|
+
data["_axis_validation"] = ["not", "a", "dict"]
|
|
583
|
+
json.dump(data, open(path, "w"), indent=2)
|
|
584
|
+
PY
|
|
585
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-axis-wrapper --min-fixtures 1 \
|
|
586
|
+
--out-json "$TMP_DIR/malformed-axis-wrapper.json" >/dev/null
|
|
587
|
+
grep -Fq '"verdict": "PASS"' "$TMP_DIR/malformed-axis-wrapper.json"
|
|
588
|
+
|
|
589
|
+
expect_fail_contains invalid-min-pair-margin "value must be > 0" \
|
|
590
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id pass --min-pair-margin 0
|
|
591
|
+
|
|
592
|
+
expect_fail_contains invalid-max-wall-ratio "value must be finite and > 0" \
|
|
593
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id pass --max-pair-solo-wall-ratio nan
|
|
594
|
+
|
|
595
|
+
expect_fail_contains invalid-min-fixtures "value must be > 0" \
|
|
596
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id pass --min-fixtures 0
|
|
597
|
+
|
|
598
|
+
expect_fail_contains invalid-min-bare-headroom "value must be >= 0" \
|
|
599
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id pass --min-bare-headroom -1
|
|
600
|
+
|
|
601
|
+
expect_fail_contains invalid-min-solo-headroom "value must be >= 0" \
|
|
602
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id pass --min-solo-headroom -1
|
|
603
|
+
|
|
604
|
+
expect_fail_contains invalid-pair-arm "pair-arm must be one of" \
|
|
605
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id pass --pair-arm variant
|
|
606
|
+
|
|
607
|
+
expect_fail_contains retired-pair-arm "pair-arm l2_forced is retired" \
|
|
608
|
+
python3 "$GATE" --results-root "$TMP_DIR" --run-id pass --pair-arm l2_forced
|
|
609
|
+
|
|
131
610
|
echo "PASS test-full-pipeline-pair-gate"
|