devlyn-cli 2.2.2 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +2 -2
- package/CLAUDE.md +4 -4
- package/README.md +85 -34
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
- package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
- package/benchmark/auto-resolve/README.md +307 -44
- package/benchmark/auto-resolve/RUBRIC.md +23 -14
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
- package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
- package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
- package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
- package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
- package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
- package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
- package/benchmark/auto-resolve/scripts/judge.sh +153 -26
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
- package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
- package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
- package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
- package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
- package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
- package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
- package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
- package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
- package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
- package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
- package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
- package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
- package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
- package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
- package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
- package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
- package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
- package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
- package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
- package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
- package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
- package/bin/devlyn.js +221 -17
- package/config/skills/_shared/adapters/README.md +3 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
- package/config/skills/_shared/adapters/opus-4-7.md +9 -1
- package/config/skills/_shared/archive_run.py +78 -6
- package/config/skills/_shared/codex-config.md +5 -4
- package/config/skills/_shared/codex-monitored.sh +46 -1
- package/config/skills/_shared/collect-codex-findings.py +20 -5
- package/config/skills/_shared/engine-preflight.md +17 -13
- package/config/skills/_shared/runtime-principles.md +6 -9
- package/config/skills/_shared/spec-verify-check.py +2664 -107
- package/config/skills/_shared/verify-merge-findings.py +1369 -19
- package/config/skills/devlyn:design-ui/SKILL.md +364 -0
- package/config/skills/devlyn:ideate/SKILL.md +7 -4
- package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
- package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
- package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
- package/config/skills/devlyn:resolve/SKILL.md +78 -26
- package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
- package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
- package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
- package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
- package/package.json +47 -2
- package/scripts/lint-fixtures.sh +349 -0
- package/scripts/lint-shadow-fixtures.sh +58 -0
- package/scripts/lint-skills.sh +3645 -95
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Regression tests for run-headroom-candidate.sh argument and output guards.
#
# Globals defined here and used by the rest of the script:
#   SCRIPT_DIR  - directory containing this test script
#   RUNNER      - the run-headroom-candidate.sh script under test
#   REJECTED    - the pair-rejected-fixtures.sh file whose text is inspected
#   TMP_DIR     - scratch directory for captured command output
#   BENCH_ROOT  - parent directory of SCRIPT_DIR
#   TEST_RUN    - run-id prefix, made unique by reusing TMP_DIR's basename
#   TEST_SHADOW - path of the synthetic shadow fixture this test may create

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
RUNNER="$SCRIPT_DIR/run-headroom-candidate.sh"
REJECTED="$SCRIPT_DIR/pair-rejected-fixtures.sh"
TMP_DIR="$(mktemp -d /tmp/run-headroom-candidate-test.XXXXXX)"
BENCH_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TEST_RUN="headroom-cli-replay-$(basename "$TMP_DIR")"
TEST_SHADOW="$BENCH_ROOT/shadow-fixtures/S98-runner-hypothesis"

# Remove everything this test may have created: the scratch directory, any
# results directories whose name starts with this run's id, and the synthetic
# shadow fixture. Variables are expanded when the trap fires, not here.
cleanup_headroom_candidate_test() {
  rm -rf "$TMP_DIR" "$BENCH_ROOT/results/$TEST_RUN"* "$TEST_SHADOW"
}
trap cleanup_headroom_candidate_test EXIT
|
|
14
|
+
|
|
15
|
+
#######################################
# Assert that a command fails AND that its combined stdout/stderr contains a
# given literal string.
# Globals:   TMP_DIR (read) - directory that receives the captured output
# Arguments: $1  - label; also names the capture file "$TMP_DIR/<label>.out"
#            $2  - fixed string (matched with grep -F) expected in the output
#            $3+ - the command to run, with its arguments
# Outputs:   on a violated expectation, a one-line diagnostic followed by the
#            captured output, both on stderr
# Returns:   0 when the command failed and the text was present; otherwise
#            terminates the script with exit status 1
#######################################
expect_fail_contains() {
  local label="$1" needle="$2"
  shift 2
  local capture="$TMP_DIR/$label.out"
  local complaint

  if "$@" > "$capture" 2>&1; then
    # The command was supposed to be rejected but exited 0.
    complaint="expected failure for $label"
  elif grep -Fq -- "$needle" "$capture"; then
    return 0
  else
    complaint="missing expected text for $label: $needle"
  fi

  printf '%s\n' "$complaint" >&2
  cat "$capture" >&2
  exit 1
}
|
|
31
|
+
|
|
32
|
+
# Static contract checks: the runner's --help text, the runner's own source,
# and the rejected-fixtures registry must each contain a set of literal strings.
#
# A bare `grep -Fq` under `set -e` would abort the script with no indication
# of which string was missing, so every check goes through a helper that names
# the file and the missing text before exiting.

#######################################
# Assert that a file contains every given fixed string.
# Arguments: $1  - file to search
#            $2+ - literal strings (grep -F) that must each occur in the file
# Outputs:   on the first missing string, a diagnostic on stderr
# Returns:   0 when all strings are present; otherwise exits the script with 1
#######################################
assert_file_has() {
  local file="$1"
  shift
  local expected_text
  for expected_text in "$@"; do
    if ! grep -Fq -- "$expected_text" "$file"; then
      echo "missing expected text in $file: $expected_text" >&2
      exit 1
    fi
  done
}

if ! bash "$RUNNER" --help > "$TMP_DIR/help.out" 2>&1; then
  echo "expected success for --help" >&2
  cat "$TMP_DIR/help.out" >&2
  exit 1
fi

# Options that the usage text must advertise.
assert_file_has "$TMP_DIR/help.out" \
  'usage:' \
  '--bare-max N' \
  '--solo-max N' \
  '--min-bare-headroom N' \
  '--min-solo-headroom N' \
  '--min-fixtures N' \
  '--allow-rejected-fixtures' \
  '--dry-run'

# Source-level guards in the runner. The needles are single-quoted on purpose:
# `$RUN_ID`, `$0`, etc. must be matched literally, not expanded here.
# shellcheck disable=SC2016
assert_file_has "$RUNNER" \
  'print_command' \
  'Command: ' \
  'DEVLYN_BENCHMARK_CLI_SUBCOMMAND' \
  'cmd=(npx devlyn-cli benchmark headroom --run-id "$RUN_ID")' \
  'cmd=(bash "$0" --run-id "$RUN_ID")' \
  'cmd+=(--bare-max "$BARE_MAX")' \
  'cmd+=(--solo-max "$SOLO_MAX")' \
  'cmd+=(--min-bare-headroom "$MIN_BARE_HEADROOM")' \
  'cmd+=(--min-solo-headroom "$MIN_SOLO_HEADROOM")' \
  'cmd+=(--min-fixtures "$MIN_FIXTURES")' \
  'cmd+=(--allow-rejected-fixtures)' \
  'cmd+=(--dry-run)' \
  'baseline evidence-complete' \
  'headroom gate passed — candidate set accepted' \
  'headroom gate failed — candidate set rejected' \
  '--bare-max "$BARE_MAX"' \
  '--solo-max "$SOLO_MAX"' \
  '--min-bare-headroom "$MIN_BARE_HEADROOM"' \
  '--min-solo-headroom "$MIN_SOLO_HEADROOM"' \
  '--min-fixtures "$MIN_FIXTURES"' \
  'cat "$BENCH_ROOT/results/$RUN_ID/headroom-gate.md"' \
  'headroom gate report missing' \
  'validate_fixtures' \
  'fixture_has_solo_ceiling_avoidance_note' \
  'shadow fixture NOTES.md needs ## Solo ceiling avoidance' \
  'fixture not found in fixtures/ or shadow-fixtures/' \
  '[FS][0-9]*) FIXTURES+=("$1")' \
  'retired_fixture_exists' \
  'fixture is retired and is not rerun by pair-candidate runners' \
  'fixture_smoke_only' \
  'fixture is smoke-only and cannot run providers' \
  'rejected_pair_fixture_reason' \
  'source "$BENCH_ROOT/scripts/pair-rejected-fixtures.sh"' \
  'declare -F rejected_pair_fixture_reason'

# Run ids that the rejected-fixtures registry must mention.
assert_file_has "$REJECTED" \
  '20260511-f3-http-error-headroom' \
  '20260507-f10-f11-tier1-full-pipeline' \
  '20260511-f12-webhook-headroom' \
  '20260511-f15-concurrency-headroom' \
  '20260511-f28-policy-oraclefix-reverified-pair' \
  '20260511-f30-headroom-v1' \
  '20260513-s2-inventory-headroom' \
  '20260513-s3-ticket-headroom' \
  '20260513-s4-return-headroom' \
  '20260513-s5-credit-headroom'

assert_file_has "$RUNNER" \
  'Use --allow-rejected-fixtures for diagnostics only'
|
|
86
|
+
|
|
87
|
+
# Invocations the runner must reject. Each case names a label (also the name
# of the capture file), the literal text expected in the output, and the
# arguments passed to the runner.

# Thin wrapper: run "$RUNNER" under bash with the given arguments and require
# failure plus the expected text.
#   $1 - label, $2 - expected text, $3+ - runner arguments
expect_runner_rejects() {
  local label="$1" needle="$2"
  shift 2
  expect_fail_contains "$label" "$needle" bash "$RUNNER" "$@"
}

# --- argument parsing and validation ---------------------------------------
expect_runner_rejects missing-fixture 'usage:' \
  --run-id headroom-arg-test

expect_runner_rejects unknown-arg 'unknown arg: --bad-flag' \
  --bad-flag F21-cli-scheduler-priority

expect_runner_rejects missing-bare-max-value '--bare-max requires a value' \
  --bare-max

expect_runner_rejects invalid-bare-max '--bare-max must be an integer: nope' \
  --bare-max nope F21-cli-scheduler-priority

expect_runner_rejects invalid-min-fixtures '--min-fixtures must be >= 1' \
  --min-fixtures 0 F21-cli-scheduler-priority

expect_runner_rejects invalid-min-bare-headroom \
  '--min-bare-headroom must be an integer: nope' \
  --min-bare-headroom nope F21-cli-scheduler-priority

expect_runner_rejects negative-min-bare-headroom \
  '--min-bare-headroom must be an integer: -1' \
  --min-bare-headroom -1 F21-cli-scheduler-priority

expect_runner_rejects negative-min-solo-headroom \
  '--min-solo-headroom must be an integer: -1' \
  --min-solo-headroom -1 F21-cli-scheduler-priority

expect_runner_rejects missing-fixture-fast \
  'fixture not found in fixtures/ or shadow-fixtures/: F999-not-a-fixture' \
  --run-id "$TEST_RUN-missing" F999-not-a-fixture

# --- fixtures on the rejected list ------------------------------------------
# Rows are "label|run-id suffix|fixture id"; every row is run as a dry run
# with --min-fixtures 1 and must be refused with the same message shape.
rejected_cases=(
  'rejected-f1-fixture|rejected-f1|F1-cli-trivial-flag'
  'rejected-f2-fixture|rejected-f2|F2-cli-medium-subcommand'
  'rejected-fixture|rejected|F26-cli-payout-ledger-rules'
  'rejected-f3-fixture|rejected-f3|F3-backend-contract-risk'
  'rejected-f4-fixture|rejected-f4|F4-web-browser-design'
  'rejected-f5-fixture|rejected-f5|F5-fix-loop-red-green'
  'rejected-f6-fixture|rejected-f6|F6-dep-audit-native-module'
  'rejected-f7-fixture|rejected-f7|F7-out-of-scope-trap'
  'rejected-f8-fixture|rejected-f8|F8-known-limit-ambiguous'
  'rejected-f9-fixture|rejected-f9|F9-e2e-ideate-to-resolve'
  'rejected-f10-fixture|rejected-f10|F10-persist-write-collision'
  'rejected-f11-fixture|rejected-f11|F11-batch-import-all-or-nothing'
  'rejected-f12-fixture|rejected-f12|F12-webhook-raw-body-signature'
  'rejected-f15-fixture|rejected-f15|F15-frozen-diff-race-review'
  'rejected-f31-fixture|rejected-f31|F31-cli-seat-rebalance'
  'rejected-f32-fixture|rejected-f32|F32-cli-subscription-renewal'
  'rejected-s2-shadow-fixture|rejected-s2|S2-cli-inventory-reservation'
  'rejected-s3-shadow-fixture|rejected-s3|S3-cli-ticket-assignment'
  'rejected-s4-shadow-fixture|rejected-s4|S4-cli-return-routing'
  'rejected-s5-shadow-fixture|rejected-s5|S5-cli-credit-grant-ledger'
  'rejected-s6-shadow-fixture|rejected-s6|S6-cli-refund-window-ledger'
)
for rejected_case in "${rejected_cases[@]}"; do
  # A here-string feeds only `read`, so the runner's stdin is left untouched.
  IFS='|' read -r case_label run_suffix fixture_id <<< "$rejected_case"
  expect_runner_rejects "$case_label" \
    "fixture rejected for pair-candidate runs: $fixture_id" \
    --run-id "$TEST_RUN-$run_suffix" --dry-run --min-fixtures 1 "$fixture_id"
done

# --- retired and smoke-only fixtures ----------------------------------------
expect_runner_rejects retired-fixture \
  'fixture is retired and is not rerun by pair-candidate runners: F28-cli-return-authorization' \
  --run-id "$TEST_RUN-retired" --dry-run --min-fixtures 1 F28-cli-return-authorization

# No --dry-run here: the smoke-only fixture must be refused for a provider run.
expect_runner_rejects smoke-only-s1-provider-run \
  'fixture is smoke-only and cannot run providers: S1-cli-lang-flag' \
  --run-id "$TEST_RUN-smoke-only" --min-fixtures 1 S1-cli-lang-flag

# --- replay command printed when DEVLYN_BENCHMARK_CLI_SUBCOMMAND is set -----
# Goes through `env`, so it calls expect_fail_contains directly.
expect_fail_contains cli-replay-command \
  "Command: npx devlyn-cli benchmark headroom --run-id $TEST_RUN" \
  env DEVLYN_BENCHMARK_CLI_SUBCOMMAND=headroom \
  bash "$RUNNER" --run-id "$TEST_RUN" --min-fixtures 2 F999-not-a-fixture

# --- dry run without an explicit --min-fixtures must fail -------------------
expect_runner_rejects dry-run-min-fixtures \
  '[headroom] DRY RUN failed' \
  --run-id "$TEST_RUN-dry-run-fail" --dry-run F21-cli-scheduler-priority
|
|
215
|
+
|
|
216
|
+
# Successful dry runs: the runner must exit 0 and its output must contain the
# expected lines. Failures are reported explicitly (message plus captured
# output) instead of letting `set -e` abort the script silently.

# Dry run against a regular fixture.
dry_run_out="$TMP_DIR/dry-run.out"
if ! bash "$RUNNER" --run-id "$TEST_RUN-dry-run" --dry-run --min-fixtures 1 F21-cli-scheduler-priority \
  > "$dry_run_out" 2>&1; then
  echo "expected success for dry-run" >&2
  cat "$dry_run_out" >&2
  exit 1
fi
for expected_text in \
  'Mode: DRY RUN (no model/provider invocations)' \
  'Command: ' \
  '--dry-run' \
  '--min-bare-headroom 5' \
  '--min-solo-headroom 5' \
  '--min-fixtures 1' \
  '[headroom] DRY RUN complete'; do
  if ! grep -Fq -- "$expected_text" "$dry_run_out"; then
    echo "missing expected text for dry-run: $expected_text" >&2
    cat "$dry_run_out" >&2
    exit 1
  fi
done

# Dry run against a shadow fixture.
shadow_dry_run_out="$TMP_DIR/shadow-dry-run.out"
if ! bash "$RUNNER" --run-id "$TEST_RUN-shadow-dry-run" --dry-run --min-fixtures 1 S1-cli-lang-flag \
  > "$shadow_dry_run_out" 2>&1; then
  echo "expected success for shadow-dry-run" >&2
  cat "$shadow_dry_run_out" >&2
  exit 1
fi
for expected_text in \
  'Fixtures: S1-cli-lang-flag' \
  '[headroom] DRY RUN complete'; do
  if ! grep -Fq -- "$expected_text" "$shadow_dry_run_out"; then
    echo "missing expected text for shadow-dry-run: $expected_text" >&2
    cat "$shadow_dry_run_out" >&2
    exit 1
  fi
done
|
|
230
|
+
|
|
231
|
+
# Synthetic shadow fixture S98-runner-hypothesis, written under $TEST_SHADOW
# (set earlier in this script). The fixture starts minimal and is strengthened
# one append at a time; after each step the runner must still reject it with
# the relevant guard message, until the final dry run completes.
# The steps are order-dependent: every append builds on the previous file state.
hypothesis_guard_msg='fixture spec.md needs a solo-headroom hypothesis with solo_claude miss and observable command from expected.json before provider spend: S98-runner-hypothesis'
ceiling_guard_msg='shadow fixture NOTES.md needs ## Solo ceiling avoidance with solo_claude, a rejected/solo-saturated control comparison, and headroom reasoning before provider spend: S98-runner-hypothesis'
shadow_spec="$TEST_SHADOW/spec.md"
shadow_notes="$TEST_SHADOW/NOTES.md"

mkdir -p "$TEST_SHADOW"

cat > "$TEST_SHADOW/metadata.json" <<'EOF'
{
  "id": "S98-runner-hypothesis",
  "category": "high-risk"
}
EOF

cat > "$shadow_spec" <<'EOF'
# Runner hypothesis fixture

Add idempotency handling for duplicate requests.
EOF

cat > "$TEST_SHADOW/expected.json" <<'EOF'
{
  "verification_commands": [
    {
      "cmd": "node -e \"process.exit(0)\"",
      "exit_code": 0
    }
  ]
}
EOF

cat > "$shadow_notes" <<'EOF'
# Notes

Synthetic runner guard fixture.
EOF

# Step 1: spec.md has no hypothesis section at all.
expect_fail_contains missing-solo-headroom-hypothesis \
  "$hypothesis_guard_msg" \
  bash "$RUNNER" --run-id "${TEST_RUN}-missing-hypothesis" --dry-run --min-fixtures 1 \
  S98-runner-hypothesis

# Step 2: the section exists and names a solo_claude miss, but no command.
cat >> "$shadow_spec" <<'EOF'

## Solo-headroom hypothesis

A capable solo_claude baseline is expected to miss duplicate idempotency ordering.
EOF
expect_fail_contains weak-solo-headroom-hypothesis \
  "$hypothesis_guard_msg" \
  bash "$RUNNER" --run-id "${TEST_RUN}-weak-hypothesis" --dry-run --min-fixtures 1 \
  S98-runner-hypothesis

# Step 3: a backticked token that is not a command from expected.json.
cat >> "$shadow_spec" <<'EOF'

Implementation marker: `duplicate-idempotency`.
EOF
expect_fail_contains unrelated-backtick-solo-headroom-hypothesis \
  "$hypothesis_guard_msg" \
  bash "$RUNNER" --run-id "${TEST_RUN}-unrelated-backtick-hypothesis" --dry-run --min-fixtures 1 \
  S98-runner-hypothesis

# Step 4: the expected.json command is quoted, but that line does not mention a miss.
cat >> "$shadow_spec" <<'EOF'

Observable command: `node -e "process.exit(0)"` exposes behavior.
EOF
expect_fail_contains observable-without-miss-solo-headroom-hypothesis \
  "$hypothesis_guard_msg" \
  bash "$RUNNER" --run-id "${TEST_RUN}-observable-without-miss-hypothesis" --dry-run --min-fixtures 1 \
  S98-runner-hypothesis

# Step 5: the command line now mentions the miss; the failure moves on to the
# NOTES.md "Solo ceiling avoidance" guard.
cat >> "$shadow_spec" <<'EOF'

Observable command: `node -e "process.exit(0)"` exposes the miss.
EOF
expect_fail_contains missing-solo-ceiling-avoidance \
  "$ceiling_guard_msg" \
  bash "$RUNNER" --run-id "${TEST_RUN}-missing-ceiling" --dry-run --min-fixtures 1 \
  S98-runner-hypothesis

# Step 6: the heading and solo_claude are present, but no control comparison.
cat >> "$shadow_notes" <<'EOF'

## Solo ceiling avoidance

This candidate mentions solo_claude but gives no control comparison.
EOF
expect_fail_contains weak-solo-ceiling-avoidance \
  "$ceiling_guard_msg" \
  bash "$RUNNER" --run-id "${TEST_RUN}-weak-ceiling" --dry-run --min-fixtures 1 \
  S98-runner-hypothesis

# Step 7: add the control comparison and headroom reasoning; the dry run must pass.
cat >> "$shadow_notes" <<'EOF'

Unlike solo-saturated S2-S6 controls, this fixture should preserve
solo_claude headroom because it targets a multi-run state dependency.
EOF
bash "$RUNNER" --run-id "${TEST_RUN}-hypothesis" --dry-run --min-fixtures 1 \
  S98-runner-hypothesis >"$TMP_DIR/hypothesis.out" 2>&1
grep -Fq '[headroom] DRY RUN complete' "$TMP_DIR/hypothesis.out"
|
|
308
|
+
|
|
309
|
+
# --allow-rejected-fixtures override: a fixture passed with this flag must
# dry-run to completion, and the flag must be echoed in the captured output.

# Numbered (F-series) fixture.
rejected_override_log="$TMP_DIR/rejected-override.out"
bash "$RUNNER" \
  --run-id "${TEST_RUN}-rejected-override" \
  --dry-run \
  --min-fixtures 1 \
  --allow-rejected-fixtures \
  F26-cli-payout-ledger-rules >"$rejected_override_log" 2>&1
grep -Fq -- '--allow-rejected-fixtures' "$rejected_override_log"
grep -Fq '[headroom] DRY RUN complete' "$rejected_override_log"

# Shadow (S-series) fixture.
shadow_override_log="$TMP_DIR/shadow-rejected-override.out"
bash "$RUNNER" \
  --run-id "${TEST_RUN}-shadow-rejected-override" \
  --dry-run \
  --min-fixtures 1 \
  --allow-rejected-fixtures \
  S3-cli-ticket-assignment >"$shadow_override_log" 2>&1
grep -Fq -- '--allow-rejected-fixtures' "$shadow_override_log"
grep -Fq '[headroom] DRY RUN complete' "$shadow_override_log"
|
|
320
|
+
|
|
321
|
+
# Assemble a throwaway repository layout under $TMP_DIR containing copies of
# the real runner and rejected-fixture scripts plus stub collaborators
# (run-fixture.sh, judge.sh, headroom-gate.py), so the non-dry-run path can be
# exercised against the stubs.
STUB_REPO="$TMP_DIR/stub-repo"
STUB_BENCH="$STUB_REPO/benchmark/auto-resolve"

stub_dirs=(
  "$STUB_BENCH/scripts"
  "$STUB_BENCH/fixtures/F21-cli-scheduler-priority"
  "$STUB_REPO/config/skills/devlyn:resolve"
)
mkdir -p "${stub_dirs[@]}"

# Real scripts under test, copied into the stub tree.
cp "$RUNNER" "$STUB_BENCH/scripts/run-headroom-candidate.sh"
cp "$REJECTED" "$STUB_BENCH/scripts/pair-rejected-fixtures.sh"
chmod +x \
  "$STUB_BENCH/scripts/run-headroom-candidate.sh" \
  "$STUB_BENCH/scripts/pair-rejected-fixtures.sh"

# Minimal skill file containing only front matter.
printf -- '---\nname: devlyn:resolve\n---\n' > "$STUB_REPO/config/skills/devlyn:resolve/SKILL.md"

# Stub fixture runner: echoes its arguments so the test can assert on them.
cat > "$STUB_BENCH/scripts/run-fixture.sh" <<'EOF'
#!/usr/bin/env bash
set -euo pipefail
echo "[stub-run-fixture] $*"
EOF
chmod +x "$STUB_BENCH/scripts/run-fixture.sh"

# Stub judge: echoes its arguments.
cat > "$STUB_BENCH/scripts/judge.sh" <<'EOF'
#!/usr/bin/env bash
set -euo pipefail
echo "[stub-judge] $*"
EOF
chmod +x "$STUB_BENCH/scripts/judge.sh"

# Stub headroom gate: writes a fixed report to the --out-json / --out-md paths
# and exits with the status given in STUB_HEADROOM_EXIT (default 0).
# Not marked executable here.
# NOTE(review): presumably the runner invokes it through python3 — confirm.
cat > "$STUB_BENCH/scripts/headroom-gate.py" <<'PY'
#!/usr/bin/env python3
import json
import os
import pathlib
import sys

out_json = None
out_md = None
args = sys.argv[1:]
for index, arg in enumerate(args):
    if arg == "--out-json":
        out_json = pathlib.Path(args[index + 1])
    if arg == "--out-md":
        out_md = pathlib.Path(args[index + 1])
payload = {"verdict": "PASS" if os.environ.get("STUB_HEADROOM_EXIT", "0") == "0" else "FAIL"}
if out_json:
    out_json.parent.mkdir(parents=True, exist_ok=True)
    out_json.write_text(json.dumps(payload) + "\n", encoding="utf8")
if out_md:
    out_md.parent.mkdir(parents=True, exist_ok=True)
    out_md.write_text(
        "# stub headroom\n\n"
        "Verdict: **%s**\n\n"
        "| fixture | bare | solo_claude | solo_claude-bare |\n"
        "| --- | ---: | ---: | ---: |\n"
        "| F21-cli-scheduler-priority | 50 | 75 | 25 |\n"
        % payload["verdict"],
        encoding="utf8",
    )
sys.exit(int(os.environ.get("STUB_HEADROOM_EXIT", "0")))
PY
|
|
376
|
+
|
|
377
|
+
# Run the copied runner inside the stub repository, once with the stub gate
# exiting 0 and once with it exiting 1, and check the messages printed.
STUB_RUNNER="$STUB_BENCH/scripts/run-headroom-candidate.sh"

# Gate exits 0: acceptance message, the report table row, and both arm invocations.
stub_success_log="$TMP_DIR/stub-success.out"
STUB_HEADROOM_EXIT=0 bash "$STUB_RUNNER" --run-id "${TEST_RUN}-stub-success" \
  --min-fixtures 1 F21-cli-scheduler-priority >"$stub_success_log" 2>&1
for expected_text in \
  '[headroom] headroom gate passed — candidate set accepted.' \
  '| F21-cli-scheduler-priority | 50 | 75 | 25 |' \
  '[stub-run-fixture] --fixture F21-cli-scheduler-priority --arm bare' \
  '[stub-run-fixture] --fixture F21-cli-scheduler-priority --arm solo_claude'; do
  grep -Fq -- "$expected_text" "$stub_success_log"
done

# Gate exits 1: the runner itself must exit non-zero.
stub_fail_log="$TMP_DIR/stub-fail.out"
stub_fail_status=0
STUB_HEADROOM_EXIT=1 bash "$STUB_RUNNER" --run-id "${TEST_RUN}-stub-fail" \
  --min-fixtures 1 F21-cli-scheduler-priority >"$stub_fail_log" 2>&1 || stub_fail_status=$?
if [[ "$stub_fail_status" -eq 0 ]]; then
  echo "expected stub headroom gate failure" >&2
  cat "$stub_fail_log" >&2
  exit 1
fi

# The rejection message and the report row are still printed on failure...
for expected_text in \
  '[headroom] headroom gate failed — candidate set rejected.' \
  '| F21-cli-scheduler-priority | 50 | 75 | 25 |'; do
  grep -Fq -- "$expected_text" "$stub_fail_log"
done

# ...and the acceptance message must not appear.
if grep -Fq '[headroom] headroom gate passed — candidate set accepted.' "$stub_fail_log"; then
  echo "accepted message must not print after headroom gate failure" >&2
  cat "$stub_fail_log" >&2
  exit 1
fi

echo "PASS test-run-headroom-candidate"
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Regression tests for run-swebench-solver-batch.sh argument guards.
#
# Every case runs the runner script that sits next to this file and checks the
# text it prints. Scratch files live in a private temporary directory that is
# removed when the script exits.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
RUNNER="$SCRIPT_DIR/run-swebench-solver-batch.sh"

# Honour TMPDIR (sandboxed CI, macOS per-user temp dirs) and fall back to /tmp.
TMP_DIR="$(mktemp -d "${TMPDIR:-/tmp}/run-swebench-solver-batch-test.XXXXXX")"
# ":?" aborts the cleanup instead of expanding to an empty path.
trap 'rm -rf -- "${TMP_DIR:?}"' EXIT

# Stand-in "claude" executable: later cases put $FAKEBIN first on PATH, so a
# run that gets as far as invoking claude fails with a recognisable message.
FAKEBIN="$TMP_DIR/fakebin"
mkdir -p "$FAKEBIN"
cat > "$FAKEBIN/claude" <<'EOF'
#!/usr/bin/env bash
echo "fake claude should not be reached" >&2
exit 1
EOF
chmod +x "$FAKEBIN/claude"
|
|
18
|
+
|
|
19
|
+
#######################################
# Assert that a command fails AND that its output contains a literal string.
# Globals:   TMP_DIR (read) - directory that receives the capture file
# Arguments: $1 - case label; output is captured in "$TMP_DIR/<label>.out"
#            $2 - fixed string (not a regex) that must appear in the output
#            $@ - remaining words: the command to run
# Outputs:   on an assertion failure, a diagnostic plus the captured output
#            on stderr
# Returns:   0 when both checks hold; otherwise exits the whole script with 1
#######################################
expect_fail_contains() {
  local case_name="$1" expected_text="$2"
  shift 2
  local capture="$TMP_DIR/$case_name.out"
  local status=0

  # Capture stdout and stderr together; "|| status=$?" records a failure
  # without tripping errexit.
  "$@" >"$capture" 2>&1 || status=$?

  if [[ "$status" -eq 0 ]]; then
    echo "expected failure for $case_name" >&2
    cat "$capture" >&2
    exit 1
  fi

  # -F: literal match; "--" protects needles that start with dashes.
  if grep -Fq -- "$expected_text" "$capture"; then
    return 0
  fi

  echo "missing expected text for $case_name: $expected_text" >&2
  cat "$capture" >&2
  exit 1
}
|
|
35
|
+
|
|
36
|
+
# --help must succeed and advertise the usage line plus the documented flags.
help_log="$TMP_DIR/help.out"
bash "$RUNNER" --help >"$help_log" 2>&1
for expected_text in \
  'usage:' \
  '--instances-jsonl <path>' \
  '--predictions-out <path>' \
  '--timeout-seconds N'; do
  grep -Fq -- "$expected_text" "$help_log"
done

# Source-level check: the runner must define a require_value() helper.
grep -Fq 'require_value()' "$RUNNER"
|
|
42
|
+
|
|
43
|
+
# Each value-taking flag, given as the last word with no value after it, must
# be rejected with "<flag> requires a value".
# instances.jsonl has not been created at this point in the script.
base_args=(
  --instances-jsonl "$TMP_DIR/instances.jsonl"
  --predictions-out "$TMP_DIR/predictions.jsonl"
)

expect_fail_contains missing-instances-jsonl-value \
  '--instances-jsonl requires a value' \
  bash "$RUNNER" --instances-jsonl

expect_fail_contains missing-predictions-out-value \
  '--predictions-out requires a value' \
  bash "$RUNNER" --instances-jsonl "$TMP_DIR/instances.jsonl" --predictions-out

# "<case label> <flag>" pairs; the two required flags are supplied first.
for value_case in \
  'missing-model-name-value --model-name' \
  'missing-repos-root-value --repos-root' \
  'missing-worktrees-root-value --worktrees-root' \
  'missing-timeout-value --timeout-seconds' \
  'missing-limit-value --limit' \
  'missing-instance-id-value --instance-id'; do
  case_label="${value_case%% *}"
  bare_flag="${value_case#* }"
  expect_fail_contains "$case_label" \
    "${bare_flag} requires a value" \
    bash "$RUNNER" "${base_args[@]}" "$bare_flag"
done
|
|
74
|
+
|
|
75
|
+
# Numeric flag validation. The instances file is created (empty) first, so
# from here on the path passed to --instances-jsonl exists.
touch "$TMP_DIR/instances.jsonl"
base_args=(
  --instances-jsonl "$TMP_DIR/instances.jsonl"
  --predictions-out "$TMP_DIR/predictions.jsonl"
)

# --timeout-seconds: non-integer, then zero.
expect_fail_contains invalid-timeout \
  '--timeout-seconds must be an integer' \
  bash "$RUNNER" "${base_args[@]}" --timeout-seconds nope

expect_fail_contains zero-timeout \
  '--timeout-seconds must be > 0' \
  bash "$RUNNER" "${base_args[@]}" --timeout-seconds 0

# --limit: non-integer, then zero.
expect_fail_contains invalid-limit \
  '--limit must be an integer' \
  bash "$RUNNER" "${base_args[@]}" --limit nope

expect_fail_contains zero-limit \
  '--limit must be > 0' \
  bash "$RUNNER" "${base_args[@]}" --limit 0
|
|
91
|
+
|
|
92
|
+
# With PATH limited to /usr/bin:/bin the runner must report that claude is
# missing. This assumes no claude binary is installed in those two directories.
expect_fail_contains missing-claude \
  'claude command not found' \
  env PATH="/usr/bin:/bin" \
  bash "$RUNNER" --instances-jsonl "$TMP_DIR/instances.jsonl" \
  --predictions-out "$TMP_DIR/predictions.jsonl"

# Malformed instance rows. The fake claude is first on PATH, so each run gets
# past the claude lookup and must fail with the row-level message instead.
fake_path="$FAKEBIN:/usr/bin:/bin"
predictions_out="$TMP_DIR/predictions.jsonl"

# Row is a JSON array, not an object.
printf '%s\n' '[]' > "$TMP_DIR/non-object-instances.jsonl"
expect_fail_contains non-object-instance-row \
  'expected JSON object' \
  env PATH="$fake_path" \
  bash "$RUNNER" --instances-jsonl "$TMP_DIR/non-object-instances.jsonl" \
  --predictions-out "$predictions_out"

# Row uses the non-standard NaN constant.
printf '%s\n' '{"instance_id": NaN}' > "$TMP_DIR/nan-instances.jsonl"
expect_fail_contains nan-instance-row \
  'invalid JSON numeric constant: NaN' \
  env PATH="$fake_path" \
  bash "$RUNNER" --instances-jsonl "$TMP_DIR/nan-instances.jsonl" \
  --predictions-out "$predictions_out"

# Row is an object but has no instance_id key.
printf '%s\n' '{"repo":"local/repo"}' > "$TMP_DIR/missing-id-instances.jsonl"
expect_fail_contains missing-instance-id-row \
  'missing instance_id' \
  env PATH="$fake_path" \
  bash "$RUNNER" --instances-jsonl "$TMP_DIR/missing-id-instances.jsonl" \
  --predictions-out "$predictions_out"

echo "PASS test-run-swebench-solver-batch"
|