devlyn-cli 2.2.2 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +2 -2
- package/CLAUDE.md +4 -4
- package/README.md +85 -34
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
- package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
- package/benchmark/auto-resolve/README.md +307 -44
- package/benchmark/auto-resolve/RUBRIC.md +23 -14
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
- package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
- package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
- package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
- package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
- package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
- package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
- package/benchmark/auto-resolve/scripts/judge.sh +153 -26
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
- package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
- package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
- package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
- package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
- package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
- package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
- package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
- package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
- package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
- package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
- package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
- package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
- package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
- package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
- package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
- package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
- package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
- package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
- package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
- package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
- package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
- package/bin/devlyn.js +221 -17
- package/config/skills/_shared/adapters/README.md +3 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
- package/config/skills/_shared/adapters/opus-4-7.md +9 -1
- package/config/skills/_shared/archive_run.py +78 -6
- package/config/skills/_shared/codex-config.md +5 -4
- package/config/skills/_shared/codex-monitored.sh +46 -1
- package/config/skills/_shared/collect-codex-findings.py +20 -5
- package/config/skills/_shared/engine-preflight.md +17 -13
- package/config/skills/_shared/runtime-principles.md +6 -9
- package/config/skills/_shared/spec-verify-check.py +2664 -107
- package/config/skills/_shared/verify-merge-findings.py +1369 -19
- package/config/skills/devlyn:design-ui/SKILL.md +364 -0
- package/config/skills/devlyn:ideate/SKILL.md +7 -4
- package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
- package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
- package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
- package/config/skills/devlyn:resolve/SKILL.md +78 -26
- package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
- package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
- package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
- package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
- package/package.json +47 -2
- package/scripts/lint-fixtures.sh +349 -0
- package/scripts/lint-shadow-fixtures.sh +58 -0
- package/scripts/lint-skills.sh +3645 -95
|
@@ -0,0 +1,497 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Regression tests for run-full-pipeline-pair-candidate.sh argument guards.
#
# The script runs the runner that lives next to it and asserts on its exit
# status and output. Captured output is written under a private temp
# directory; that directory and any paths derived from this run's id are
# removed when the script exits.

set -euo pipefail

# Most assertions below are bare commands (e.g. `grep -Fq ...`). Under `set -e`
# a failing one would end the script without saying which check failed, so
# report the command and line before the shell exits.
trap 'echo "assertion failed at line $LINENO: $BASH_COMMAND" >&2' ERR

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
RUNNER="$SCRIPT_DIR/run-full-pipeline-pair-candidate.sh"
REJECTED="$SCRIPT_DIR/pair-rejected-fixtures.sh"
BENCH_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_DIR="$(mktemp -d /tmp/run-full-pipeline-pair-candidate-test.XXXXXX)"
# The run id embeds the random temp-dir name, so each invocation of this test
# uses a distinct prefix under results/.
TEST_RUN="arg-test-$(basename "$TMP_DIR")"
TEST_SHADOW="$BENCH_ROOT/shadow-fixtures/S97-runner-hypothesis"

# Cleanup on every exit path. `${VAR:?}` makes rm abort instead of expanding to
# a broader path if a variable is ever empty; `--` ends option parsing.
trap 'rm -rf -- "${TMP_DIR:?}" "${BENCH_ROOT:?}/results/${TEST_RUN:?}"* "${BENCH_ROOT:?}/results/src-${TEST_RUN:?}" "${TEST_SHADOW:?}"' EXIT

#######################################
# Run a command that is expected to fail and check what it printed.
# Globals:
#   TMP_DIR (read) - directory that receives the captured output
# Arguments:
#   $1   - label; the capture file is "$TMP_DIR/<label>.out"
#   $2   - text that must appear in the command's combined stdout/stderr
#          (matched as a fixed string, not a regex)
#   $3.. - the command to run, with its arguments
# Outputs:
#   On a failed check, writes a diagnostic and the captured output to stderr.
# Returns:
#   0 when the command exits non-zero and its output contains the text.
#   Otherwise exits the whole script: 1 for a failed check, 2 for misuse.
#######################################
expect_fail_contains() {
  # Without a command, `"$@" > file` degenerates to a bare redirection that
  # succeeds, which would be misreported as "expected failure". Reject it.
  if [ "$#" -lt 3 ]; then
    echo "usage: expect_fail_contains LABEL NEEDLE COMMAND [ARG...]" >&2
    exit 2
  fi
  local label="$1"
  local needle="$2"
  shift 2
  local out="$TMP_DIR/$label.out"
  # Running the command as an `if` condition keeps `set -e` from aborting on
  # the non-zero status we are hoping for.
  if "$@" > "$out" 2>&1; then
    echo "expected failure for $label" >&2
    cat "$out" >&2
    exit 1
  fi
  # `--` protects needles that begin with a dash (e.g. "--bare-max ...").
  if ! grep -Fq -- "$needle" "$out"; then
    echo "missing expected text for $label: $needle" >&2
    cat "$out" >&2
    exit 1
  fi
}

# --help must succeed; its output is captured so the advertised defaults and
# flags can be checked as fixed strings.
help_capture="$TMP_DIR/help.out"
bash "$RUNNER" --help > "$help_capture" 2>&1

help_snippets=(
  'default: l2_risk_probes'
  '--min-bare-headroom N'
  '--min-solo-headroom N'
  '--max-pair-solo-wall-ratio N (default: 3)'
  '--allow-rejected-fixtures'
  '--dry-run'
)
for help_snippet in "${help_snippets[@]}"; do
  grep -Fq -- "$help_snippet" "$help_capture"
done

# Static checks: the runner's source text must still contain these literal
# fragments. The strings are single-quoted so `$` and `"` stay literal.
runner_snippets=(
  'run_gate_with_report'
  'Command: '
  'DEVLYN_BENCHMARK_CLI_SUBCOMMAND'
  'cmd=(npx devlyn-cli benchmark pair --run-id "$RUN_ID")'
  'cmd=(bash "$0" --run-id "$RUN_ID")'
  'cmd+=(--min-bare-headroom "$MIN_BARE_HEADROOM")'
  'cmd+=(--min-solo-headroom "$MIN_SOLO_HEADROOM")'
  'cmd+=(--allow-rejected-fixtures)'
  'cmd+=(--dry-run)'
  'baseline evidence-complete'
  '$PAIR_ARM evidence-clean'
  'MAX_PAIR_SOLO_WALL_RATIO=3'
  'headroom-gate.md'
  'full-pipeline-pair-gate.md'
  'cat "$report"'
  'headroom gate failed — pair arm not executed'
  'headroom gate passed — executing $PAIR_ARM'
  'pair gate failed — pair evidence rejected'
  'pair gate passed — pair evidence accepted'
  'if ! run_gate_with_report \'
  'mirror_skills()'
  'validate_fixtures'
  'fixture_has_solo_ceiling_avoidance_note'
  'shadow fixture NOTES.md needs ## Solo ceiling avoidance'
  'fixture not found in fixtures/ or shadow-fixtures/'
  '[FS][0-9]*) FIXTURES+=("$1")'
  'retired_fixture_exists'
  'fixture is retired and is not rerun by pair-candidate runners'
  'fixture_smoke_only'
  'fixture is smoke-only and cannot run providers'
  'rejected_pair_fixture_reason'
  'source "$BENCH_ROOT/scripts/pair-rejected-fixtures.sh"'
  'declare -F rejected_pair_fixture_reason'
)
for runner_snippet in "${runner_snippets[@]}"; do
  grep -Fq -- "$runner_snippet" "$RUNNER"
done

# The rejected-fixtures file must still mention each of these run ids.
rejected_run_ids=(
  '20260511-f3-http-error-headroom'
  '20260507-f10-f11-tier1-full-pipeline'
  '20260511-f12-webhook-headroom'
  '20260511-f15-concurrency-headroom'
  '20260511-f28-policy-oraclefix-reverified-pair'
  '20260511-f30-headroom-v1'
  '20260513-s2-inventory-headroom'
  '20260513-s3-ticket-headroom'
  '20260513-s4-return-headroom'
  '20260513-s5-credit-headroom'
)
for rejected_run_id in "${rejected_run_ids[@]}"; do
  grep -Fq -- "$rejected_run_id" "$REJECTED"
done

# Remaining runner source fragments, checked after the rejected-fixtures file.
runner_tail_snippets=(
  'Use --allow-rejected-fixtures for diagnostics only'
  'if [ -z "$REUSE_CALIBRATED_FROM" ]; then'
  'if [ -n "$REUSE_CALIBRATED_FROM" ]; then'
)
for runner_snippet in "${runner_tail_snippets[@]}"; do
  grep -Fq -- "$runner_snippet" "$RUNNER"
done

# Argument-guard regressions: every invocation below must exit non-zero and
# print the listed diagnostic.
guard_fixture=F21-cli-scheduler-priority

# label|--pair-arm value|expected diagnostic
pair_arm_cases=(
  'invalid-pair-arm|variant|pair-arm must be l2_risk_probes or l2_gated'
  'retired-pair-arm|l2_forced|pair-arm l2_forced is retired'
)
for guard_case in "${pair_arm_cases[@]}"; do
  # A here-string feeds `read`, so the runner's own stdin is left untouched.
  IFS='|' read -r guard_label guard_value guard_needle <<< "$guard_case"
  expect_fail_contains "$guard_label" "$guard_needle" \
    bash "$RUNNER" --run-id arg-test --pair-arm "$guard_value" "$guard_fixture"
done

# --bare-max passed as the final argument, with no value after it.
expect_fail_contains missing-bare-max-value "--bare-max requires a value" \
  bash "$RUNNER" --bare-max

# label|flag|value|expected diagnostic
flag_value_cases=(
  'invalid-bare-max|--bare-max|nope|--bare-max must be an integer: nope'
  'invalid-min-fixtures|--min-fixtures|0|--min-fixtures must be >= 1'
  'invalid-min-solo-headroom|--min-solo-headroom|nope|--min-solo-headroom must be an integer: nope'
  'negative-min-bare-headroom|--min-bare-headroom|-1|--min-bare-headroom must be an integer: -1'
  'negative-min-solo-headroom|--min-solo-headroom|-1|--min-solo-headroom must be an integer: -1'
  'invalid-wall-ratio|--max-pair-solo-wall-ratio|nope|--max-pair-solo-wall-ratio must be a positive number: nope'
  'zero-wall-ratio|--max-pair-solo-wall-ratio|0|--max-pair-solo-wall-ratio must be > 0'
)
for guard_case in "${flag_value_cases[@]}"; do
  IFS='|' read -r guard_label guard_flag guard_value guard_needle <<< "$guard_case"
  expect_fail_contains "$guard_label" "$guard_needle" \
    bash "$RUNNER" "$guard_flag" "$guard_value" "$guard_fixture"
done

# A fixture name that exists in neither fixtures/ nor shadow-fixtures/.
expect_fail_contains missing-fixture-fast \
  "fixture not found in fixtures/ or shadow-fixtures/: F999-not-a-fixture" \
  bash "$RUNNER" --run-id "${TEST_RUN}-missing-fixture" F999-not-a-fixture

# Fixtures on the rejected list must be refused even in --dry-run mode.
# label|run-id suffix|fixture
rejected_cases=(
  'rejected-f1-fixture|rejected-f1|F1-cli-trivial-flag'
  'rejected-f2-fixture|rejected-f2|F2-cli-medium-subcommand'
  'rejected-fixture|rejected|F26-cli-payout-ledger-rules'
  'rejected-f3-fixture|rejected-f3|F3-backend-contract-risk'
  'rejected-f4-fixture|rejected-f4|F4-web-browser-design'
  'rejected-f5-fixture|rejected-f5|F5-fix-loop-red-green'
  'rejected-f6-fixture|rejected-f6|F6-dep-audit-native-module'
  'rejected-f7-fixture|rejected-f7|F7-out-of-scope-trap'
  'rejected-f8-fixture|rejected-f8|F8-known-limit-ambiguous'
  'rejected-f9-fixture|rejected-f9|F9-e2e-ideate-to-resolve'
  'rejected-f10-fixture|rejected-f10|F10-persist-write-collision'
  'rejected-f11-fixture|rejected-f11|F11-batch-import-all-or-nothing'
  'rejected-f12-fixture|rejected-f12|F12-webhook-raw-body-signature'
  'rejected-f15-fixture|rejected-f15|F15-frozen-diff-race-review'
  'rejected-f31-fixture|rejected-f31|F31-cli-seat-rebalance'
  'rejected-f32-fixture|rejected-f32|F32-cli-subscription-renewal'
  'rejected-s2-shadow-fixture|rejected-s2|S2-cli-inventory-reservation'
  'rejected-s3-shadow-fixture|rejected-s3|S3-cli-ticket-assignment'
  'rejected-s4-shadow-fixture|rejected-s4|S4-cli-return-routing'
  'rejected-s5-shadow-fixture|rejected-s5|S5-cli-credit-grant-ledger'
  'rejected-s6-shadow-fixture|rejected-s6|S6-cli-refund-window-ledger'
)
for rejected_case in "${rejected_cases[@]}"; do
  # A here-string feeds `read`, so the runner's own stdin is left untouched.
  IFS='|' read -r rejected_label rejected_suffix rejected_fixture <<< "$rejected_case"
  expect_fail_contains "$rejected_label" \
    "fixture rejected for pair-candidate runs: $rejected_fixture" \
    bash "$RUNNER" --run-id "${TEST_RUN}-${rejected_suffix}" \
      --dry-run --min-fixtures 1 "$rejected_fixture"
done

# A retired fixture is refused with its own diagnostic.
retired_fixture=F28-cli-return-authorization
expect_fail_contains retired-fixture \
  "fixture is retired and is not rerun by pair-candidate runners: $retired_fixture" \
  bash "$RUNNER" --run-id "${TEST_RUN}-retired" --dry-run --min-fixtures 1 "$retired_fixture"

# Without --dry-run, the smoke-only shadow fixture must be refused.
smoke_fixture=S1-cli-lang-flag
expect_fail_contains smoke-only-s1-provider-run \
  "fixture is smoke-only and cannot run providers: $smoke_fixture" \
  bash "$RUNNER" --run-id "${TEST_RUN}-smoke-only" --min-fixtures 1 "$smoke_fixture"

# Both reuse cases point --reuse-calibrated-from at the same source run id.
# The first case asserts the runner reports it as missing.
reuse_tail=(--reuse-calibrated-from "src-${TEST_RUN}-missing" F21-cli-scheduler-priority)

expect_fail_contains reuse-source-missing "reuse source missing" \
  bash "$RUNNER" --run-id "${TEST_RUN}-source-missing" "${reuse_tail[@]}"

# With DEVLYN_BENCHMARK_CLI_SUBCOMMAND=pair in the environment, the output
# must include the replay command in its `npx devlyn-cli benchmark pair` form.
expect_fail_contains cli-replay-command \
  "Command: npx devlyn-cli benchmark pair --run-id ${TEST_RUN}-cli-replay" \
  env DEVLYN_BENCHMARK_CLI_SUBCOMMAND=pair \
  bash "$RUNNER" --run-id "${TEST_RUN}-cli-replay" "${reuse_tail[@]}"

# A dry run with one fixture and no --min-fixtures override must fail.
expect_fail_contains dry-run-min-fixtures '[full-pipeline-pair] DRY RUN failed' \
  bash "$RUNNER" --run-id "${TEST_RUN}-dry-run-fail" --dry-run F21-cli-scheduler-priority

bash "$RUNNER" --run-id "$TEST_RUN-dry-run" --dry-run --min-fixtures 1 F21-cli-scheduler-priority \
|
|
240
|
+
> "$TMP_DIR/dry-run.out" 2>&1
|
|
241
|
+
grep -Fq 'Mode: DRY RUN (no model/provider invocations)' "$TMP_DIR/dry-run.out"
|
|
242
|
+
grep -Fq 'Command: ' "$TMP_DIR/dry-run.out"
|
|
243
|
+
grep -Fq -- '--dry-run' "$TMP_DIR/dry-run.out"
|
|
244
|
+
grep -Fq -- '--min-bare-headroom 5' "$TMP_DIR/dry-run.out"
|
|
245
|
+
grep -Fq -- '--min-solo-headroom 5' "$TMP_DIR/dry-run.out"
|
|
246
|
+
grep -Fq -- '--min-fixtures 1' "$TMP_DIR/dry-run.out"
|
|
247
|
+
grep -Fq -- '--max-pair-solo-wall-ratio 3' "$TMP_DIR/dry-run.out"
|
|
248
|
+
grep -Fq 'Pair: l2_risk_probes evidence-clean, canonical trigger, margin >= +5, wall ratio <= 3' "$TMP_DIR/dry-run.out"
|
|
249
|
+
grep -Fq '[full-pipeline-pair] DRY RUN complete' "$TMP_DIR/dry-run.out"
|
|
250
|
+
|
|
251
|
+
bash "$RUNNER" --run-id "$TEST_RUN-shadow-dry-run" --dry-run --min-fixtures 1 S1-cli-lang-flag \
|
|
252
|
+
> "$TMP_DIR/shadow-dry-run.out" 2>&1
|
|
253
|
+
grep -Fq 'Fixtures: S1-cli-lang-flag' "$TMP_DIR/shadow-dry-run.out"
|
|
254
|
+
grep -Fq '[full-pipeline-pair] DRY RUN complete' "$TMP_DIR/shadow-dry-run.out"
|
|
255
|
+
|
|
256
|
+
mkdir -p "$TEST_SHADOW"
|
|
257
|
+
cat > "$TEST_SHADOW/metadata.json" <<'EOF'
|
|
258
|
+
{
|
|
259
|
+
"id": "S97-runner-hypothesis",
|
|
260
|
+
"category": "high-risk"
|
|
261
|
+
}
|
|
262
|
+
EOF
|
|
263
|
+
cat > "$TEST_SHADOW/spec.md" <<'EOF'
|
|
264
|
+
# Runner hypothesis fixture
|
|
265
|
+
|
|
266
|
+
Add idempotency handling for duplicate requests.
|
|
267
|
+
EOF
|
|
268
|
+
cat > "$TEST_SHADOW/expected.json" <<'EOF'
|
|
269
|
+
{
|
|
270
|
+
"verification_commands": [
|
|
271
|
+
{
|
|
272
|
+
"cmd": "node -e \"process.exit(0)\"",
|
|
273
|
+
"exit_code": 0
|
|
274
|
+
}
|
|
275
|
+
]
|
|
276
|
+
}
|
|
277
|
+
EOF
|
|
278
|
+
cat > "$TEST_SHADOW/NOTES.md" <<'EOF'
|
|
279
|
+
# Notes
|
|
280
|
+
|
|
281
|
+
Synthetic runner guard fixture.
|
|
282
|
+
EOF
|
|
283
|
+
expect_fail_contains missing-solo-headroom-hypothesis \
|
|
284
|
+
'fixture spec.md needs a solo-headroom hypothesis with solo_claude miss and observable command from expected.json before provider spend: S97-runner-hypothesis' \
|
|
285
|
+
bash "$RUNNER" --run-id "$TEST_RUN-missing-hypothesis" --dry-run --min-fixtures 1 S97-runner-hypothesis
|
|
286
|
+
cat >> "$TEST_SHADOW/spec.md" <<'EOF'
|
|
287
|
+
|
|
288
|
+
## Solo-headroom hypothesis
|
|
289
|
+
|
|
290
|
+
A capable solo_claude baseline is expected to miss duplicate idempotency ordering.
|
|
291
|
+
EOF
|
|
292
|
+
expect_fail_contains weak-solo-headroom-hypothesis \
|
|
293
|
+
'fixture spec.md needs a solo-headroom hypothesis with solo_claude miss and observable command from expected.json before provider spend: S97-runner-hypothesis' \
|
|
294
|
+
bash "$RUNNER" --run-id "$TEST_RUN-weak-hypothesis" --dry-run --min-fixtures 1 S97-runner-hypothesis
|
|
295
|
+
cat >> "$TEST_SHADOW/spec.md" <<'EOF'
|
|
296
|
+
|
|
297
|
+
Implementation marker: `duplicate-idempotency`.
|
|
298
|
+
EOF
|
|
299
|
+
expect_fail_contains unrelated-backtick-solo-headroom-hypothesis \
|
|
300
|
+
'fixture spec.md needs a solo-headroom hypothesis with solo_claude miss and observable command from expected.json before provider spend: S97-runner-hypothesis' \
|
|
301
|
+
bash "$RUNNER" --run-id "$TEST_RUN-unrelated-backtick-hypothesis" --dry-run --min-fixtures 1 S97-runner-hypothesis
|
|
302
|
+
cat >> "$TEST_SHADOW/spec.md" <<'EOF'
|
|
303
|
+
|
|
304
|
+
Observable command: `node -e "process.exit(0)"` exposes behavior.
|
|
305
|
+
EOF
|
|
306
|
+
expect_fail_contains observable-without-miss-solo-headroom-hypothesis \
|
|
307
|
+
'fixture spec.md needs a solo-headroom hypothesis with solo_claude miss and observable command from expected.json before provider spend: S97-runner-hypothesis' \
|
|
308
|
+
bash "$RUNNER" --run-id "$TEST_RUN-observable-without-miss-hypothesis" --dry-run --min-fixtures 1 S97-runner-hypothesis
|
|
309
|
+
cat >> "$TEST_SHADOW/spec.md" <<'EOF'
|
|
310
|
+
|
|
311
|
+
Observable command: `node -e "process.exit(0)"` exposes the miss.
|
|
312
|
+
EOF
|
|
313
|
+
expect_fail_contains missing-solo-ceiling-avoidance \
|
|
314
|
+
'shadow fixture NOTES.md needs ## Solo ceiling avoidance with solo_claude, a rejected/solo-saturated control comparison, and headroom reasoning before provider spend: S97-runner-hypothesis' \
|
|
315
|
+
bash "$RUNNER" --run-id "$TEST_RUN-missing-ceiling" --dry-run --min-fixtures 1 S97-runner-hypothesis
|
|
316
|
+
cat >> "$TEST_SHADOW/NOTES.md" <<'EOF'
|
|
317
|
+
|
|
318
|
+
## Solo ceiling avoidance
|
|
319
|
+
|
|
320
|
+
This candidate mentions solo_claude but gives no control comparison.
|
|
321
|
+
EOF
|
|
322
|
+
expect_fail_contains weak-solo-ceiling-avoidance \
|
|
323
|
+
'shadow fixture NOTES.md needs ## Solo ceiling avoidance with solo_claude, a rejected/solo-saturated control comparison, and headroom reasoning before provider spend: S97-runner-hypothesis' \
|
|
324
|
+
bash "$RUNNER" --run-id "$TEST_RUN-weak-ceiling" --dry-run --min-fixtures 1 S97-runner-hypothesis
|
|
325
|
+
cat >> "$TEST_SHADOW/NOTES.md" <<'EOF'
|
|
326
|
+
|
|
327
|
+
Unlike solo-saturated S2-S6 controls, this fixture should preserve
|
|
328
|
+
solo_claude headroom because it targets a multi-run state dependency.
|
|
329
|
+
EOF
|
|
330
|
+
bash "$RUNNER" --run-id "$TEST_RUN-hypothesis" --dry-run --min-fixtures 1 S97-runner-hypothesis \
|
|
331
|
+
> "$TMP_DIR/hypothesis.out" 2>&1
|
|
332
|
+
grep -Fq '[full-pipeline-pair] DRY RUN complete' "$TMP_DIR/hypothesis.out"
|
|
333
|
+
|
|
334
|
+
bash "$RUNNER" --run-id "$TEST_RUN-shadow-rejected-override" --dry-run --min-fixtures 1 \
|
|
335
|
+
--allow-rejected-fixtures S3-cli-ticket-assignment \
|
|
336
|
+
> "$TMP_DIR/shadow-rejected-override.out" 2>&1
|
|
337
|
+
grep -Fq -- '--allow-rejected-fixtures' "$TMP_DIR/shadow-rejected-override.out"
|
|
338
|
+
grep -Fq '[full-pipeline-pair] DRY RUN complete' "$TMP_DIR/shadow-rejected-override.out"
|
|
339
|
+
|
|
340
|
+
bash "$RUNNER" --run-id "$TEST_RUN-rejected-override" --dry-run --min-fixtures 1 \
|
|
341
|
+
--allow-rejected-fixtures F26-cli-payout-ledger-rules \
|
|
342
|
+
> "$TMP_DIR/rejected-override.out" 2>&1
|
|
343
|
+
grep -Fq -- '--allow-rejected-fixtures' "$TMP_DIR/rejected-override.out"
|
|
344
|
+
grep -Fq '[full-pipeline-pair] DRY RUN complete' "$TMP_DIR/rejected-override.out"
|
|
345
|
+
|
|
346
|
+
STUB_REPO="$TMP_DIR/stub-repo"
|
|
347
|
+
STUB_BENCH="$STUB_REPO/benchmark/auto-resolve"
|
|
348
|
+
mkdir -p \
|
|
349
|
+
"$STUB_BENCH/scripts" \
|
|
350
|
+
"$STUB_BENCH/fixtures/F21-cli-scheduler-priority" \
|
|
351
|
+
"$STUB_REPO/config/skills/devlyn:resolve"
|
|
352
|
+
cp "$RUNNER" "$STUB_BENCH/scripts/run-full-pipeline-pair-candidate.sh"
|
|
353
|
+
cp "$REJECTED" "$STUB_BENCH/scripts/pair-rejected-fixtures.sh"
|
|
354
|
+
chmod +x "$STUB_BENCH/scripts/run-full-pipeline-pair-candidate.sh"
|
|
355
|
+
chmod +x "$STUB_BENCH/scripts/pair-rejected-fixtures.sh"
|
|
356
|
+
printf -- '---\nname: devlyn:resolve\n---\n' > "$STUB_REPO/config/skills/devlyn:resolve/SKILL.md"
|
|
357
|
+
cat > "$STUB_BENCH/scripts/run-fixture.sh" <<'EOF'
|
|
358
|
+
#!/usr/bin/env bash
|
|
359
|
+
set -euo pipefail
|
|
360
|
+
echo "[stub-run-fixture] $*"
|
|
361
|
+
EOF
|
|
362
|
+
chmod +x "$STUB_BENCH/scripts/run-fixture.sh"
|
|
363
|
+
cat > "$STUB_BENCH/scripts/judge.sh" <<'EOF'
|
|
364
|
+
#!/usr/bin/env bash
|
|
365
|
+
set -euo pipefail
|
|
366
|
+
echo "[stub-judge] $*"
|
|
367
|
+
EOF
|
|
368
|
+
chmod +x "$STUB_BENCH/scripts/judge.sh"
|
|
369
|
+
cat > "$STUB_BENCH/scripts/headroom-gate.py" <<'PY'
|
|
370
|
+
#!/usr/bin/env python3
|
|
371
|
+
import json
|
|
372
|
+
import os
|
|
373
|
+
import pathlib
|
|
374
|
+
import sys
|
|
375
|
+
|
|
376
|
+
out_json = None
|
|
377
|
+
out_md = None
|
|
378
|
+
args = sys.argv[1:]
|
|
379
|
+
for index, arg in enumerate(args):
|
|
380
|
+
if arg == "--out-json":
|
|
381
|
+
out_json = pathlib.Path(args[index + 1])
|
|
382
|
+
if arg == "--out-md":
|
|
383
|
+
out_md = pathlib.Path(args[index + 1])
|
|
384
|
+
payload = {"verdict": "PASS" if os.environ.get("STUB_HEADROOM_EXIT", "0") == "0" else "FAIL"}
|
|
385
|
+
if out_json:
|
|
386
|
+
out_json.parent.mkdir(parents=True, exist_ok=True)
|
|
387
|
+
out_json.write_text(json.dumps(payload) + "\n", encoding="utf8")
|
|
388
|
+
if out_md:
|
|
389
|
+
out_md.parent.mkdir(parents=True, exist_ok=True)
|
|
390
|
+
out_md.write_text(
|
|
391
|
+
"# stub headroom\n\n"
|
|
392
|
+
"Verdict: **%s**\n\n"
|
|
393
|
+
"| fixture | bare | solo_claude | solo_claude-bare |\n"
|
|
394
|
+
"| --- | ---: | ---: | ---: |\n"
|
|
395
|
+
"| F21-cli-scheduler-priority | 50 | 75 | 25 |\n"
|
|
396
|
+
% payload["verdict"],
|
|
397
|
+
encoding="utf8",
|
|
398
|
+
)
|
|
399
|
+
sys.exit(int(os.environ.get("STUB_HEADROOM_EXIT", "0")))
|
|
400
|
+
PY
|
|
401
|
+
cat > "$STUB_BENCH/scripts/full-pipeline-pair-gate.py" <<'PY'
|
|
402
|
+
#!/usr/bin/env python3
|
|
403
|
+
import json
|
|
404
|
+
import os
|
|
405
|
+
import pathlib
|
|
406
|
+
import sys
|
|
407
|
+
|
|
408
|
+
out_json = None
|
|
409
|
+
out_md = None
|
|
410
|
+
args = sys.argv[1:]
|
|
411
|
+
for index, arg in enumerate(args):
|
|
412
|
+
if arg == "--out-json":
|
|
413
|
+
out_json = pathlib.Path(args[index + 1])
|
|
414
|
+
if arg == "--out-md":
|
|
415
|
+
out_md = pathlib.Path(args[index + 1])
|
|
416
|
+
payload = {"verdict": "PASS" if os.environ.get("STUB_PAIR_EXIT", "0") == "0" else "FAIL"}
|
|
417
|
+
print("stub-pair-gate args: " + " ".join(args))
|
|
418
|
+
if out_json:
|
|
419
|
+
out_json.parent.mkdir(parents=True, exist_ok=True)
|
|
420
|
+
out_json.write_text(json.dumps(payload) + "\n", encoding="utf8")
|
|
421
|
+
if out_md:
|
|
422
|
+
out_md.parent.mkdir(parents=True, exist_ok=True)
|
|
423
|
+
out_md.write_text(
|
|
424
|
+
"# stub pair\n\n"
|
|
425
|
+
"Verdict: **%s**\n\n"
|
|
426
|
+
"| fixture | bare | solo_claude | pair | pair-solo_claude |\n"
|
|
427
|
+
"| --- | ---: | ---: | ---: | ---: |\n"
|
|
428
|
+
"| F21-cli-scheduler-priority | 50 | 75 | 96 | 21 |\n"
|
|
429
|
+
% payload["verdict"],
|
|
430
|
+
encoding="utf8",
|
|
431
|
+
)
|
|
432
|
+
sys.exit(int(os.environ.get("STUB_PAIR_EXIT", "0")))
|
|
433
|
+
PY
|
|
434
|
+
|
|
435
|
+
STUB_RUNNER="$STUB_BENCH/scripts/run-full-pipeline-pair-candidate.sh"
|
|
436
|
+
STUB_HEADROOM_EXIT=0 STUB_PAIR_EXIT=0 \
|
|
437
|
+
bash "$STUB_RUNNER" --run-id "$TEST_RUN-stub-success" --min-fixtures 1 F21-cli-scheduler-priority \
|
|
438
|
+
> "$TMP_DIR/stub-success.out" 2>&1
|
|
439
|
+
grep -Fq '[full-pipeline-pair] headroom gate passed — executing l2_risk_probes.' "$TMP_DIR/stub-success.out"
|
|
440
|
+
grep -Fq '[full-pipeline-pair] pair gate passed — pair evidence accepted.' "$TMP_DIR/stub-success.out"
|
|
441
|
+
grep -Fq '[full-pipeline-pair] release audit: npx devlyn-cli benchmark audit --require-hypothesis-trigger --out-dir /tmp/devlyn-benchmark-audit-strict' "$TMP_DIR/stub-success.out"
|
|
442
|
+
grep -Fq '| F21-cli-scheduler-priority | 50 | 75 | 25 |' "$TMP_DIR/stub-success.out"
|
|
443
|
+
grep -Fq '| F21-cli-scheduler-priority | 50 | 75 | 96 | 21 |' "$TMP_DIR/stub-success.out"
|
|
444
|
+
grep -Fq '[stub-run-fixture] --fixture F21-cli-scheduler-priority --arm l2_risk_probes' "$TMP_DIR/stub-success.out"
|
|
445
|
+
grep -Fq -- '--require-hypothesis-trigger' "$TMP_DIR/stub-success.out"
|
|
446
|
+
|
|
447
|
+
if STUB_HEADROOM_EXIT=1 STUB_PAIR_EXIT=0 \
|
|
448
|
+
bash "$STUB_RUNNER" --run-id "$TEST_RUN-stub-headroom-fail" --min-fixtures 1 F21-cli-scheduler-priority \
|
|
449
|
+
> "$TMP_DIR/stub-headroom-fail.out" 2>&1; then
|
|
450
|
+
echo "expected stub headroom failure" >&2
|
|
451
|
+
cat "$TMP_DIR/stub-headroom-fail.out" >&2
|
|
452
|
+
exit 1
|
|
453
|
+
fi
|
|
454
|
+
grep -Fq '[full-pipeline-pair] headroom gate failed — pair arm not executed.' "$TMP_DIR/stub-headroom-fail.out"
|
|
455
|
+
grep -Fq '| F21-cli-scheduler-priority | 50 | 75 | 25 |' "$TMP_DIR/stub-headroom-fail.out"
|
|
456
|
+
if grep -Fq '[stub-run-fixture] --fixture F21-cli-scheduler-priority --arm l2_risk_probes' "$TMP_DIR/stub-headroom-fail.out"; then
|
|
457
|
+
echo "pair arm must not run after headroom failure" >&2
|
|
458
|
+
cat "$TMP_DIR/stub-headroom-fail.out" >&2
|
|
459
|
+
exit 1
|
|
460
|
+
fi
|
|
461
|
+
|
|
462
|
+
if STUB_HEADROOM_EXIT=0 STUB_PAIR_EXIT=1 \
|
|
463
|
+
bash "$STUB_RUNNER" --run-id "$TEST_RUN-stub-pair-fail" --min-fixtures 1 F21-cli-scheduler-priority \
|
|
464
|
+
> "$TMP_DIR/stub-pair-fail.out" 2>&1; then
|
|
465
|
+
echo "expected stub pair gate failure" >&2
|
|
466
|
+
cat "$TMP_DIR/stub-pair-fail.out" >&2
|
|
467
|
+
exit 1
|
|
468
|
+
fi
|
|
469
|
+
grep -Fq '[full-pipeline-pair] headroom gate passed — executing l2_risk_probes.' "$TMP_DIR/stub-pair-fail.out"
|
|
470
|
+
grep -Fq '[full-pipeline-pair] pair gate failed — pair evidence rejected.' "$TMP_DIR/stub-pair-fail.out"
|
|
471
|
+
grep -Fq '| F21-cli-scheduler-priority | 50 | 75 | 25 |' "$TMP_DIR/stub-pair-fail.out"
|
|
472
|
+
grep -Fq '| F21-cli-scheduler-priority | 50 | 75 | 96 | 21 |' "$TMP_DIR/stub-pair-fail.out"
|
|
473
|
+
if grep -Fq '[full-pipeline-pair] pair gate passed — pair evidence accepted.' "$TMP_DIR/stub-pair-fail.out"; then
|
|
474
|
+
echo "pair accepted message must not print after pair gate failure" >&2
|
|
475
|
+
cat "$TMP_DIR/stub-pair-fail.out" >&2
|
|
476
|
+
exit 1
|
|
477
|
+
fi
|
|
478
|
+
|
|
479
|
+
mkdir -p "$BENCH_ROOT/results/src-$TEST_RUN/F21-cli-scheduler-priority/bare"
|
|
480
|
+
printf '{}\n' > "$BENCH_ROOT/results/src-$TEST_RUN/F21-cli-scheduler-priority/bare/result.json"
|
|
481
|
+
printf '{}\n' > "$BENCH_ROOT/results/src-$TEST_RUN/F21-cli-scheduler-priority/bare/verify.json"
|
|
482
|
+
expect_fail_contains reuse-source-incomplete \
|
|
483
|
+
"reuse source missing diff.patch" \
|
|
484
|
+
bash "$RUNNER" --run-id "$TEST_RUN-source-incomplete" \
|
|
485
|
+
--reuse-calibrated-from "src-$TEST_RUN" \
|
|
486
|
+
F21-cli-scheduler-priority
|
|
487
|
+
|
|
488
|
+
mkdir -p "$BENCH_ROOT/results/$TEST_RUN-destination-incomplete/F21-cli-scheduler-priority/bare"
|
|
489
|
+
printf '{}\n' > "$BENCH_ROOT/results/$TEST_RUN-destination-incomplete/F21-cli-scheduler-priority/bare/result.json"
|
|
490
|
+
printf '{}\n' > "$BENCH_ROOT/results/$TEST_RUN-destination-incomplete/F21-cli-scheduler-priority/bare/verify.json"
|
|
491
|
+
expect_fail_contains reuse-destination-incomplete \
|
|
492
|
+
"reuse destination incomplete diff.patch" \
|
|
493
|
+
bash "$RUNNER" --run-id "$TEST_RUN-destination-incomplete" \
|
|
494
|
+
--reuse-calibrated-from "src-$TEST_RUN" \
|
|
495
|
+
F21-cli-scheduler-priority
|
|
496
|
+
|
|
497
|
+
echo "PASS test-run-full-pipeline-pair-candidate"
|