devlyn-cli 2.3.0 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +1 -1
- package/CLAUDE.md +2 -2
- package/README.md +82 -29
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
- package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
- package/benchmark/auto-resolve/README.md +307 -44
- package/benchmark/auto-resolve/RUBRIC.md +23 -14
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
- package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
- package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
- package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
- package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
- package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
- package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
- package/benchmark/auto-resolve/scripts/judge.sh +153 -26
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
- package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
- package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
- package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
- package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
- package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
- package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
- package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
- package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
- package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
- package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
- package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
- package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
- package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
- package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
- package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
- package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
- package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
- package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
- package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
- package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
- package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
- package/bin/devlyn.js +211 -18
- package/config/skills/_shared/adapters/README.md +3 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
- package/config/skills/_shared/adapters/opus-4-7.md +9 -1
- package/config/skills/_shared/archive_run.py +78 -6
- package/config/skills/_shared/codex-config.md +3 -2
- package/config/skills/_shared/codex-monitored.sh +46 -1
- package/config/skills/_shared/collect-codex-findings.py +20 -5
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/runtime-principles.md +5 -8
- package/config/skills/_shared/spec-verify-check.py +2664 -107
- package/config/skills/_shared/verify-merge-findings.py +1369 -19
- package/config/skills/devlyn:ideate/SKILL.md +7 -4
- package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
- package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
- package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
- package/config/skills/devlyn:resolve/SKILL.md +49 -18
- package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
- package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
- package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
- package/package.json +47 -2
- package/scripts/lint-fixtures.sh +349 -0
- package/scripts/lint-shadow-fixtures.sh +58 -0
- package/scripts/lint-skills.sh +3642 -92
- /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
|
@@ -0,0 +1,933 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Regression tests for benchmark runner argument parsing.
set -euo pipefail

ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
TMP="$(mktemp -d)"
BENCH_ROOT="$ROOT/benchmark/auto-resolve"

# Every results/<run-id> directory the checks below may create. All of them,
# plus the scratch dir and stray /tmp worktrees, are removed on exit.
ARG_PARSE_RESULT_DIRS=(
  arg-parse-command-test
  arg-parse-discovery-test
  arg-parse-shadow-suite-dry-run
  arg-parse-shadow-cli-suite-dry-run
  arg-parse-variant-path
  arg-parse-headroom-cli-replay
  arg-parse-pair-cli-replay
  arg-parse-shadow-judge
  arg-parse-opus-bad-mapping
  arg-parse-opus-malformed-mapping
  arg-parse-opus-malformed-score
  arg-parse-opus-invalid-generated-score
  arg-parse-opus-invalid-generated-dq
  arg-parse-opus-summary-mapping
  arg-parse-opus-summary-null-margin
)

# Remove all test residue regardless of how the script exits.
cleanup() {
  rm -rf "$TMP"
  local run_dir
  for run_dir in "${ARG_PARSE_RESULT_DIRS[@]}"; do
    rm -rf "$BENCH_ROOT/results/$run_dir"
  done
  rm -rf /tmp/bench-arg-parse-variant-path-* /tmp/bench-arg-parse-headroom-cli-replay-*
}
trap cleanup EXIT
# Run a command that is expected to fail, capture its combined output, and
# assert the output contains a marker string. Aborts the whole test run on
# an unexpected success or a missing marker.
#   $1   - test name (also names the captured-output file under $TMP)
#   $2   - literal substring expected in the command's output
#   $3.. - command to execute
expect_fail_contains() {
  local name="$1" expected="$2"
  shift 2
  local status=0
  # Temporarily drop errexit so a failing command can be observed.
  set +e
  "$@" > "$TMP/$name.out" 2>&1
  status=$?
  set -e
  if [ "$status" -eq 0 ]; then
    echo "expected failure for $name" >&2
    exit 1
  fi
  if ! grep -Fq -- "$expected" "$TMP/$name.out"; then
    echo "missing expected output for $name: $expected" >&2
    cat "$TMP/$name.out" >&2
    exit 1
  fi
}
|
28
|
+
|
|
29
|
+
# --- run-suite.sh: argument parsing regressions ---
RUN_SUITE="$ROOT/benchmark/auto-resolve/scripts/run-suite.sh"

expect_fail_contains suite-missing-n "--n requires a value" \
  bash "$RUN_SUITE" --n
expect_fail_contains suite-bad-n "error: --n must be an integer" \
  bash "$RUN_SUITE" --n abc --dry-run
expect_fail_contains suite-missing-run-id "--run-id requires a value" \
  bash "$RUN_SUITE" --judge-only --run-id

# --help must document the shadow-suite entry point and its guard rail.
bash "$RUN_SUITE" --help > "$TMP/run-suite-help.out" 2>&1
grep -Fq 'run-suite.sh --suite shadow --dry-run' "$TMP/run-suite-help.out"
grep -Fq 'shadow suite refuses provider/judge runs' "$TMP/run-suite-help.out"
# run-fixture.sh: value-taking options must reject a missing value.
expect_fail_contains fixture-missing-arm "--arm requires a value" \
  bash "$ROOT/benchmark/auto-resolve/scripts/run-fixture.sh" --fixture F1 --arm
expect_fail_contains fixture-missing-resolve-skill "--resolve-skill requires a value" \
  bash "$ROOT/benchmark/auto-resolve/scripts/run-fixture.sh" \
  --fixture F1 --arm bare --run-id arg-parse --resolve-skill

# judge.sh: same missing-value checks for its required options.
expect_fail_contains judge-missing-fixture "--fixture requires a value" \
  bash "$ROOT/benchmark/auto-resolve/scripts/judge.sh" --fixture
expect_fail_contains judge-missing-run-id "--run-id requires a value" \
  bash "$ROOT/benchmark/auto-resolve/scripts/judge.sh" --fixture F1 --run-id

# Static check: judge.sh must reference the shadow-fixtures lookup path
# (literal '$FIXTURE' is intentional — we grep the script text, not expand it).
grep -Fq 'shadow-fixtures/$FIXTURE' "$ROOT/benchmark/auto-resolve/scripts/judge.sh"
|
|
53
|
+
# --- judge.sh end-to-end against a shadow fixture with a fake codex CLI ---
SHADOW_JUDGE_DIR="$BENCH_ROOT/results/arg-parse-shadow-judge/S1-cli-lang-flag"
mkdir -p "$SHADOW_JUDGE_DIR/bare" "$SHADOW_JUDGE_DIR/solo_claude" "$TMP/fakebin"
# Minimal per-arm artifacts: a one-line diff and a verify.json for each arm.
cat > "$SHADOW_JUDGE_DIR/bare/diff.patch" <<'EOF'
diff --git a/bin/cli.js b/bin/cli.js
--- a/bin/cli.js
+++ b/bin/cli.js
@@ -1 +1 @@
-old
+bare
EOF
cat > "$SHADOW_JUDGE_DIR/solo_claude/diff.patch" <<'EOF'
diff --git a/bin/cli.js b/bin/cli.js
--- a/bin/cli.js
+++ b/bin/cli.js
@@ -1 +1 @@
-old
+solo
EOF
printf '{"arm":"bare","verify_score":0.5}\n' > "$SHADOW_JUDGE_DIR/bare/verify.json"
printf '{"arm":"solo_claude","verify_score":0.75}\n' > "$SHADOW_JUDGE_DIR/solo_claude/verify.json"
# Fake `codex` binary: answers --version, captures the --output-last-message
# target path, and emits a canned judge verdict (B/solo wins) to both the
# captured path and stdout.
cat > "$TMP/fakebin/codex" <<'EOF'
#!/usr/bin/env bash
if [ "${1:-}" = "--version" ]; then
  echo "codex-cli fake"
  exit 0
fi
last=""
while [ $# -gt 0 ]; do
  if [ "$1" = "--output-last-message" ]; then
    last="$2"
    shift 2
    continue
  fi
  shift
done
json='{"a_score":50,"b_score":75,"winner":"B","a_breakdown":{"spec":12,"constraint":13,"scope":12,"quality":13,"notes":"ok"},"b_breakdown":{"spec":19,"constraint":19,"scope":18,"quality":19,"notes":"ok"},"critical_findings":{"A":[],"B":[]},"disqualifiers":{"A":false,"A_reason":"","B":false,"B_reason":""},"overall_reasoning":"fake judge output for shadow fixture resolver regression."}'
[ -z "$last" ] || printf '%s\n' "$json" > "$last"
printf '%s\n' "$json"
EOF
chmod +x "$TMP/fakebin/codex"
# Run judge.sh against the shadow fixture with the fake codex on PATH.
PATH="$TMP/fakebin:$PATH" \
  bash "$ROOT/benchmark/auto-resolve/scripts/judge.sh" --fixture S1-cli-lang-flag --run-id arg-parse-shadow-judge \
  > "$TMP/shadow-judge.out" 2>&1
# It must log under the [judge] tag and record the solo_claude arm in judge.json.
grep -Fq '[judge]' "$TMP/shadow-judge.out"
grep -Fq '"solo_claude"' "$SHADOW_JUDGE_DIR/judge.json"
# --- judge-opus-pass.sh: arg parsing and judge.json validation ---
expect_fail_contains opus-missing-run-id "--run-id requires a value" \
  bash "$ROOT/benchmark/auto-resolve/scripts/judge-opus-pass.sh" --run-id

# Blind mapping that omits an arm (solo_claude) must be rejected.
OPUS_BAD_MAPPING_DIR="$BENCH_ROOT/results/arg-parse-opus-bad-mapping/F9-e2e-ideate-to-resolve"
mkdir -p "$OPUS_BAD_MAPPING_DIR"
: > "$OPUS_BAD_MAPPING_DIR/judge-prompt.txt"
cat > "$OPUS_BAD_MAPPING_DIR/judge.json" <<'JSON'
{
  "_blind_mapping": {"A": "variant", "B": "bare", "seed": 1},
  "scores_by_arm": {"variant": 70, "bare": 50, "solo_claude": 60}
}
JSON
expect_fail_contains opus-bad-mapping "judge blind mapping missing arm(s): solo_claude" \
  bash "$ROOT/benchmark/auto-resolve/scripts/judge-opus-pass.sh" --run-id arg-parse-opus-bad-mapping

# A blind mapping that is not an object must fail the same validation.
OPUS_MALFORMED_MAPPING_DIR="$BENCH_ROOT/results/arg-parse-opus-malformed-mapping/F9-e2e-ideate-to-resolve"
mkdir -p "$OPUS_MALFORMED_MAPPING_DIR"
: > "$OPUS_MALFORMED_MAPPING_DIR/judge-prompt.txt"
cat > "$OPUS_MALFORMED_MAPPING_DIR/judge.json" <<'JSON'
{
  "_blind_mapping": "not-a-dict",
  "scores_by_arm": {"variant": 70, "bare": 50, "solo_claude": 60}
}
JSON
expect_fail_contains opus-malformed-mapping "judge blind mapping missing" \
  bash "$ROOT/benchmark/auto-resolve/scripts/judge-opus-pass.sh" --run-id arg-parse-opus-malformed-mapping

# Non-numeric / out-of-range scores (true, 101) must be reported per arm.
OPUS_MALFORMED_SCORE_DIR="$BENCH_ROOT/results/arg-parse-opus-malformed-score/F9-e2e-ideate-to-resolve"
mkdir -p "$OPUS_MALFORMED_SCORE_DIR"
: > "$OPUS_MALFORMED_SCORE_DIR/judge-prompt.txt"
cat > "$OPUS_MALFORMED_SCORE_DIR/judge.json" <<'JSON'
{
  "_blind_mapping": {"A": "bare", "B": "solo_claude", "C": "variant", "seed": 1},
  "scores_by_arm": {"bare": 50, "solo_claude": true, "variant": 101}
}
JSON
expect_fail_contains opus-malformed-score "scores_by_arm malformed score(s): solo_claude, variant" \
  bash "$ROOT/benchmark/auto-resolve/scripts/judge-opus-pass.sh" --run-id arg-parse-opus-malformed-score
|
|
137
|
+
|
|
138
|
+
# --- cross-judge summary: two-arm run with deliberately stale variant fields ---
OPUS_SUMMARY_MAPPING_DIR="$BENCH_ROOT/results/arg-parse-opus-summary-mapping/F99-opus-summary-mapping"
mkdir -p "$OPUS_SUMMARY_MAPPING_DIR"
: > "$OPUS_SUMMARY_MAPPING_DIR/judge-prompt.txt"
# judge.json carries bogus "margins"/"winner_arm" that mention a variant arm
# absent from the mapping; the summary must recompute rather than trust them.
cat > "$OPUS_SUMMARY_MAPPING_DIR/judge.json" <<'JSON'
{
  "_blind_mapping": {"A": "bare", "B": "solo_claude", "seed": 1},
  "scores_by_arm": {"bare": 50, "solo_claude": 60},
  "margins": {"solo_over_bare": 999, "variant_over_bare": 888},
  "winner_arm": "variant",
  "breakdowns_by_arm": {
    "bare": {"spec": 10, "constraint": 10, "scope": 10, "quality": 10},
    "solo_claude": {"spec": 11, "constraint": 11, "scope": 11, "quality": 11}
  }
}
JSON
# Fake `claude` binary; env flags select one of three canned verdicts:
#   FAKE_CLAUDE_INVALID_SCORE=1 -> non-numeric / >100 scores
#   FAKE_CLAUDE_INVALID_DQ=1    -> disqualifier flag as a string, not a bool
#   default                     -> disqualifiers as a list (also malformed)
FAKE_CLAUDE_DIR="$TMP/fake-claude-bin"
mkdir -p "$FAKE_CLAUDE_DIR"
cat > "$FAKE_CLAUDE_DIR/claude" <<'EOF'
#!/usr/bin/env bash
if [ "${1:-}" = "--version" ]; then
  echo "claude fake"
  exit 0
fi
if [ "${FAKE_CLAUDE_INVALID_SCORE:-}" = "1" ]; then
  cat <<'JSON'
{
  "a_score": true,
  "b_score": 101,
  "winner": "B",
  "disqualifiers": {"A": false, "A_reason": "", "B": false, "B_reason": ""},
  "critical_findings": {"A": [], "B": []},
  "a_breakdown": {"spec": 10, "constraint": 10, "scope": 10, "quality": 10},
  "b_breakdown": {"spec": 11, "constraint": 11, "scope": 11, "quality": 11},
  "overall_reasoning": "invalid scores for regression test"
}
JSON
  exit 0
fi
if [ "${FAKE_CLAUDE_INVALID_DQ:-}" = "1" ]; then
  cat <<'JSON'
{
  "a_score": 40,
  "b_score": 70,
  "winner": "B",
  "disqualifiers": {"A": "false", "A_reason": "", "B": false, "B_reason": ""},
  "critical_findings": {"A": [], "B": []},
  "a_breakdown": {"spec": 10, "constraint": 10, "scope": 10, "quality": 10},
  "b_breakdown": {"spec": 11, "constraint": 11, "scope": 11, "quality": 11},
  "overall_reasoning": "invalid disqualifier for regression test"
}
JSON
  exit 0
fi
cat <<'JSON'
{
  "a_score": 40,
  "b_score": 70,
  "winner": "B",
  "disqualifiers": ["not", "a", "dict"],
  "a_breakdown": {"spec": 10, "constraint": 10, "scope": 10, "quality": 10},
  "b_breakdown": {"spec": 11, "constraint": 11, "scope": 11, "quality": 11}
}
JSON
EOF
chmod +x "$FAKE_CLAUDE_DIR/claude"
# Opus output with invalid score values must name the offending fields.
OPUS_INVALID_GENERATED_SCORE_DIR="$BENCH_ROOT/results/arg-parse-opus-invalid-generated-score/F99-opus-invalid-generated-score"
mkdir -p "$OPUS_INVALID_GENERATED_SCORE_DIR"
: > "$OPUS_INVALID_GENERATED_SCORE_DIR/judge-prompt.txt"
cat > "$OPUS_INVALID_GENERATED_SCORE_DIR/judge.json" <<'JSON'
{
  "_blind_mapping": {"A": "bare", "B": "solo_claude", "seed": 1},
  "scores_by_arm": {"bare": 50, "solo_claude": 60}
}
JSON
expect_fail_contains opus-invalid-generated-score "invalid opus score value(s): a_score, b_score" \
  env FAKE_CLAUDE_INVALID_SCORE=1 PATH="$FAKE_CLAUDE_DIR:$PATH" \
  bash "$ROOT/benchmark/auto-resolve/scripts/judge-opus-pass.sh" \
  --run-id arg-parse-opus-invalid-generated-score
# Opus output with a non-boolean disqualifier must name the offending key.
OPUS_INVALID_GENERATED_DQ_DIR="$BENCH_ROOT/results/arg-parse-opus-invalid-generated-dq/F99-opus-invalid-generated-dq"
mkdir -p "$OPUS_INVALID_GENERATED_DQ_DIR"
: > "$OPUS_INVALID_GENERATED_DQ_DIR/judge-prompt.txt"
cat > "$OPUS_INVALID_GENERATED_DQ_DIR/judge.json" <<'JSON'
{
  "_blind_mapping": {"A": "bare", "B": "solo_claude", "seed": 1},
  "scores_by_arm": {"bare": 50, "solo_claude": 60}
}
JSON
expect_fail_contains opus-invalid-generated-dq "invalid opus disqualifier value(s): A" \
  env FAKE_CLAUDE_INVALID_DQ=1 PATH="$FAKE_CLAUDE_DIR:$PATH" \
  bash "$ROOT/benchmark/auto-resolve/scripts/judge-opus-pass.sh" \
  --run-id arg-parse-opus-invalid-generated-dq
# Successful run over the two-arm fixture; then verify the summary JSON.
PATH="$FAKE_CLAUDE_DIR:$PATH" \
  bash "$ROOT/benchmark/auto-resolve/scripts/judge-opus-pass.sh" \
  --run-id arg-parse-opus-summary-mapping > "$TMP/opus-summary-mapping.out" 2>&1
# gpt_* fields must be recomputed from scores_by_arm (not the bogus "margins"),
# variant-related values must be null, and winners must disagree.
python3 - "$BENCH_ROOT/results/arg-parse-opus-summary-mapping/cross-judge-summary.json" <<'PY'
import json
import pathlib
import sys

summary = json.loads(pathlib.Path(sys.argv[1]).read_text())
row = summary["rows"][0]
assert row["gpt_scores"] == {"bare": 50, "solo_claude": 60}, row
assert row["gpt_margin_l1_l0"] == 10, row
assert row["gpt_margin_v_l0"] is None, row
assert row["gpt_winner"] is None, row
assert row["opus_winner"] == "solo_claude", row
assert row["winner_agree"] is False, row
assert summary["sign_valid_count_variant_over_bare"] == 0, summary
PY
|
|
247
|
+
|
|
248
|
+
OPUS_SUMMARY_NULL_MARGIN_DIR="$BENCH_ROOT/results/arg-parse-opus-summary-null-margin/F99-opus-summary-null-margin"
|
|
249
|
+
mkdir -p "$OPUS_SUMMARY_NULL_MARGIN_DIR"
|
|
250
|
+
: > "$OPUS_SUMMARY_NULL_MARGIN_DIR/judge-prompt.txt"
|
|
251
|
+
cat > "$OPUS_SUMMARY_NULL_MARGIN_DIR/judge.json" <<'JSON'
|
|
252
|
+
{
|
|
253
|
+
"_blind_mapping": {"A": "bare", "B": "solo_claude", "seed": 1},
|
|
254
|
+
"breakdowns_by_arm": {
|
|
255
|
+
"bare": {"spec": 10, "constraint": 10, "scope": 10, "quality": 10},
|
|
256
|
+
"solo_claude": {"spec": 11, "constraint": 11, "scope": 11, "quality": 11}
|
|
257
|
+
}
|
|
258
|
+
}
|
|
259
|
+
JSON
|
|
260
|
+
PATH="$FAKE_CLAUDE_DIR:$PATH" \
|
|
261
|
+
bash "$ROOT/benchmark/auto-resolve/scripts/judge-opus-pass.sh" \
|
|
262
|
+
--run-id arg-parse-opus-summary-null-margin > "$TMP/opus-summary-null-margin.out" 2>&1
|
|
263
|
+
grep -Fq 'gpt_l1_l0_avg=na' "$TMP/opus-summary-null-margin.out"
|
|
264
|
+
grep -Fq 'suite_avg_diff=na' "$TMP/opus-summary-null-margin.out"
|
|
265
|
+
python3 - "$BENCH_ROOT/results/arg-parse-opus-summary-null-margin/cross-judge-summary.json" <<'PY'
|
|
266
|
+
import json
|
|
267
|
+
import pathlib
|
|
268
|
+
import sys
|
|
269
|
+
|
|
270
|
+
summary = json.loads(pathlib.Path(sys.argv[1]).read_text())
|
|
271
|
+
row = summary["rows"][0]
|
|
272
|
+
assert row["gpt_scores"] == {}, row
|
|
273
|
+
assert row["gpt_margin_l1_l0"] is None, row
|
|
274
|
+
assert summary["suite_avg_l1_l0"]["gpt"] is None, summary
|
|
275
|
+
assert summary["suite_avg_l1_l0"]["gpt_valid_count"] == 0, summary
|
|
276
|
+
PY
|
|
277
|
+
|
|
278
|
+
bash "$ROOT/benchmark/auto-resolve/scripts/run-suite.sh" \
|
|
279
|
+
--dry-run \
|
|
280
|
+
--run-id arg-parse-command-test \
|
|
281
|
+
F0 > "$TMP/suite-command.out" 2>&1
|
|
282
|
+
grep -Fq 'Command: ' "$TMP/suite-command.out"
|
|
283
|
+
grep -Fq -- '--dry-run' "$TMP/suite-command.out"
|
|
284
|
+
grep -Fq -- '--run-id arg-parse-command-test' "$TMP/suite-command.out"
|
|
285
|
+
|
|
286
|
+
bash "$ROOT/benchmark/auto-resolve/scripts/run-suite.sh" \
|
|
287
|
+
--dry-run \
|
|
288
|
+
--judge-only \
|
|
289
|
+
--run-id arg-parse-discovery-test > "$TMP/suite-discovery.out" 2>&1
|
|
290
|
+
grep -Fq 'F25-cli-cart-promotion-rules' "$TMP/suite-discovery.out"
|
|
291
|
+
if grep -Fq 'F27-cli-subscription-proration' "$TMP/suite-discovery.out"; then
|
|
292
|
+
echo "retired F27 must not be auto-discovered by the golden suite" >&2
|
|
293
|
+
cat "$TMP/suite-discovery.out" >&2
|
|
294
|
+
exit 1
|
|
295
|
+
fi
|
|
296
|
+
if grep -Fq 'F28-cli-return-authorization' "$TMP/suite-discovery.out"; then
|
|
297
|
+
echo "retired F28 must not be auto-discovered by the golden suite" >&2
|
|
298
|
+
cat "$TMP/suite-discovery.out" >&2
|
|
299
|
+
exit 1
|
|
300
|
+
fi
|
|
301
|
+
|
|
302
|
+
bash "$ROOT/benchmark/auto-resolve/scripts/run-suite.sh" \
|
|
303
|
+
--suite shadow \
|
|
304
|
+
--dry-run \
|
|
305
|
+
--run-id arg-parse-shadow-suite-dry-run > "$TMP/shadow-suite-dry-run.out" 2>&1
|
|
306
|
+
grep -Fq 'Suite: shadow' "$TMP/shadow-suite-dry-run.out"
|
|
307
|
+
grep -Fq 'S1-cli-lang-flag' "$TMP/shadow-suite-dry-run.out"
|
|
308
|
+
grep -Fq '[suite] DRY RUN complete' "$TMP/shadow-suite-dry-run.out"
|
|
309
|
+
grep -Fq 'Use benchmark headroom/pair with explicit S* candidates for real provider measurement.' "$TMP/shadow-suite-dry-run.out"
|
|
310
|
+
if grep -Fq 'Run without --dry-run to invoke models.' "$TMP/shadow-suite-dry-run.out"; then
|
|
311
|
+
echo "shadow suite dry-run must not invite a blocked non-dry-run suite invocation" >&2
|
|
312
|
+
cat "$TMP/shadow-suite-dry-run.out" >&2
|
|
313
|
+
exit 1
|
|
314
|
+
fi
|
|
315
|
+
|
|
316
|
+
expect_fail_contains shadow-suite-provider-run \
|
|
317
|
+
"shadow suite run-suite is dry-run only" \
|
|
318
|
+
bash "$ROOT/benchmark/auto-resolve/scripts/run-suite.sh" \
|
|
319
|
+
--suite shadow \
|
|
320
|
+
--run-id arg-parse-shadow-suite-block
|
|
321
|
+
|
|
322
|
+
expect_fail_contains shadow-suite-judge-only-provider-run \
|
|
323
|
+
"shadow suite run-suite is dry-run only" \
|
|
324
|
+
bash "$ROOT/benchmark/auto-resolve/scripts/run-suite.sh" \
|
|
325
|
+
--suite shadow \
|
|
326
|
+
--judge-only \
|
|
327
|
+
--run-id arg-parse-shadow-suite-judge-only-block
|
|
328
|
+
|
|
329
|
+
node "$ROOT/bin/devlyn.js" benchmark suite \
|
|
330
|
+
--suite shadow \
|
|
331
|
+
--dry-run \
|
|
332
|
+
--run-id arg-parse-shadow-cli-suite-dry-run > "$TMP/shadow-cli-suite-dry-run.out" 2>&1
|
|
333
|
+
grep -Fq 'Suite: shadow' "$TMP/shadow-cli-suite-dry-run.out"
|
|
334
|
+
grep -Fq 'S1-cli-lang-flag' "$TMP/shadow-cli-suite-dry-run.out"
|
|
335
|
+
grep -Fq 'Use benchmark headroom/pair with explicit S* candidates for real provider measurement.' "$TMP/shadow-cli-suite-dry-run.out"
|
|
336
|
+
|
|
337
|
+
node "$ROOT/bin/devlyn.js" --help > "$TMP/devlyn-help.out" 2>&1
|
|
338
|
+
grep -Fq 'npx devlyn-cli benchmark Run the resolve benchmark suite' "$TMP/devlyn-help.out"
|
|
339
|
+
grep -Fq 'npx devlyn-cli benchmark recent Show compact recent benchmark results' "$TMP/devlyn-help.out"
|
|
340
|
+
grep -Fq 'npx devlyn-cli benchmark frontier Show pair candidate frontier scores/triggers without providers' "$TMP/devlyn-help.out"
|
|
341
|
+
grep -Fq 'npx devlyn-cli benchmark audit Audit pair evidence readiness' "$TMP/devlyn-help.out"
|
|
342
|
+
grep -Fq 'npx devlyn-cli benchmark audit-headroom Audit failed headroom results' "$TMP/devlyn-help.out"
|
|
343
|
+
grep -Fq 'npx devlyn-cli benchmark headroom <fixtures...> Score bare vs solo_claude headroom' "$TMP/devlyn-help.out"
|
|
344
|
+
grep -Fq 'npx devlyn-cli benchmark pair <fixtures...> Score solo_claude vs pair path' "$TMP/devlyn-help.out"
|
|
345
|
+
if grep -Fq -- '--n 3' "$TMP/devlyn-help.out"; then
|
|
346
|
+
echo "help must not advertise unsupported --n 3 benchmark runs" >&2
|
|
347
|
+
cat "$TMP/devlyn-help.out" >&2
|
|
348
|
+
exit 1
|
|
349
|
+
fi
|
|
350
|
+
node "$ROOT/bin/devlyn.js" benchmark --help > "$TMP/devlyn-benchmark-help.out" 2>&1
|
|
351
|
+
grep -Fq 'npx devlyn-cli benchmark [suite] [options] [fixtures...]' "$TMP/devlyn-benchmark-help.out"
|
|
352
|
+
grep -Fq 'npx devlyn-cli benchmark recent [options]' "$TMP/devlyn-benchmark-help.out"
|
|
353
|
+
grep -Fq 'npx devlyn-cli benchmark frontier [options]' "$TMP/devlyn-benchmark-help.out"
|
|
354
|
+
grep -Fq 'npx devlyn-cli benchmark audit [options]' "$TMP/devlyn-benchmark-help.out"
|
|
355
|
+
grep -Fq 'npx devlyn-cli benchmark audit-headroom [options]' "$TMP/devlyn-benchmark-help.out"
|
|
356
|
+
grep -Fq 'npx devlyn-cli benchmark suite --suite shadow --dry-run' "$TMP/devlyn-benchmark-help.out"
|
|
357
|
+
grep -Fq 'use headroom/pair with explicit S* ids for real measurement' "$TMP/devlyn-benchmark-help.out"
|
|
358
|
+
grep -Fq 'Show compact, wrap-safe recent benchmark results' "$TMP/devlyn-benchmark-help.out"
|
|
359
|
+
grep -Fq 'npx devlyn-cli benchmark headroom [options] <fixtures...>' "$TMP/devlyn-benchmark-help.out"
|
|
360
|
+
grep -Fq 'npx devlyn-cli benchmark pair [options] <fixtures...>' "$TMP/devlyn-benchmark-help.out"
|
|
361
|
+
grep -Fq 'Show active rejected/evidence/unmeasured pair candidates, scores, and triggers without providers' "$TMP/devlyn-benchmark-help.out"
|
|
362
|
+
grep -Fq 'Fail on unmeasured pair candidates and invalid headroom rejections' "$TMP/devlyn-benchmark-help.out"
|
|
363
|
+
grep -Fq 'Prints frontier score rows plus headroom and pair quality handoff rows' "$TMP/devlyn-benchmark-help.out"
|
|
364
|
+
grep -Fq 'Fail on active failed or unsupported headroom rejections' "$TMP/devlyn-benchmark-help.out"
|
|
365
|
+
grep -Fq 'Score bare vs solo_claude before spending the pair arm' "$TMP/devlyn-benchmark-help.out"
|
|
366
|
+
grep -Fq 'Score solo_claude vs the selected pair path and print gate tables' "$TMP/devlyn-benchmark-help.out"
|
|
367
|
+
grep -Fq 'npx devlyn-cli benchmark recent --out-md /tmp/devlyn-recent-benchmark.md' "$TMP/devlyn-benchmark-help.out"
|
|
368
|
+
grep -Fq 'npx devlyn-cli benchmark pair --min-fixtures 3 --max-pair-solo-wall-ratio 3 F16-cli-quote-tax-rules F23-cli-fulfillment-wave F25-cli-cart-promotion-rules' "$TMP/devlyn-benchmark-help.out"
|
|
369
|
+
|
|
370
|
+
node "$ROOT/bin/devlyn.js" benchmark recent --help > "$TMP/devlyn-benchmark-recent-help.out" 2>&1
|
|
371
|
+
grep -Fq 'npx devlyn-cli benchmark recent [options]' "$TMP/devlyn-benchmark-recent-help.out"
|
|
372
|
+
grep -Fq -- '--out-json PATH' "$TMP/devlyn-benchmark-recent-help.out"
|
|
373
|
+
grep -Fq -- '--out-md PATH' "$TMP/devlyn-benchmark-recent-help.out"
|
|
374
|
+
grep -Fq -- '--fixtures-root PATH' "$TMP/devlyn-benchmark-recent-help.out"
|
|
375
|
+
grep -Fq -- '--registry PATH' "$TMP/devlyn-benchmark-recent-help.out"
|
|
376
|
+
grep -Fq -- '--results-root PATH' "$TMP/devlyn-benchmark-recent-help.out"
|
|
377
|
+
grep -Fq -- '--max-width N default: 92' "$TMP/devlyn-benchmark-recent-help.out"
|
|
378
|
+
grep -Fq 'Prints compact, wrap-safe benchmark status and pair-evidence cards without wide tables' "$TMP/devlyn-benchmark-recent-help.out"
|
|
379
|
+
grep -Fq 'npx devlyn-cli benchmark recent --out-md /tmp/devlyn-recent-benchmark.md' "$TMP/devlyn-benchmark-recent-help.out"
|
|
380
|
+
|
|
381
|
+
node "$ROOT/bin/devlyn.js" benchmark audit --help > "$TMP/devlyn-benchmark-audit-help.out" 2>&1
|
|
382
|
+
grep -Fq 'npx devlyn-cli benchmark audit [options]' "$TMP/devlyn-benchmark-audit-help.out"
|
|
383
|
+
grep -Fq -- '--out-dir PATH' "$TMP/devlyn-benchmark-audit-help.out"
|
|
384
|
+
grep -Fq -- '--fixtures-root PATH' "$TMP/devlyn-benchmark-audit-help.out"
|
|
385
|
+
grep -Fq -- '--registry PATH' "$TMP/devlyn-benchmark-audit-help.out"
|
|
386
|
+
grep -Fq -- '--results-root PATH' "$TMP/devlyn-benchmark-audit-help.out"
|
|
387
|
+
grep -Fq -- '--min-pair-evidence N default: 4' "$TMP/devlyn-benchmark-audit-help.out"
|
|
388
|
+
grep -Fq -- '--min-pair-margin N default: 5' "$TMP/devlyn-benchmark-audit-help.out"
|
|
389
|
+
grep -Fq -- '--max-pair-solo-wall-ratio N default: 3' "$TMP/devlyn-benchmark-audit-help.out"
|
|
390
|
+
grep -Fq -- '--require-hypothesis-trigger' "$TMP/devlyn-benchmark-audit-help.out"
|
|
391
|
+
grep -Fq 'Prints frontier score rows plus headroom_rejections=PASS/FAIL, pair_evidence_quality=PASS/FAIL, pair_trigger_reasons=PASS/FAIL, pair_evidence_hypotheses=PASS/FAIL, pair_evidence_hypothesis_triggers=PASS/WARN/FAIL, historical-alias, and hypothesis-trigger gap handoff rows' "$TMP/devlyn-benchmark-audit-help.out"
|
|
392
|
+
grep -Fq 'npx devlyn-cli benchmark audit --out-dir /tmp/devlyn-benchmark-audit' "$TMP/devlyn-benchmark-audit-help.out"
|
|
393
|
+
grep -Fq 'npx devlyn-cli benchmark audit --require-hypothesis-trigger --out-dir /tmp/devlyn-benchmark-audit-strict' "$TMP/devlyn-benchmark-audit-help.out"
|
|
394
|
+
|
|
395
|
+
node "$ROOT/bin/devlyn.js" benchmark frontier --help > "$TMP/devlyn-benchmark-frontier-help.out" 2>&1
|
|
396
|
+
grep -Fq 'npx devlyn-cli benchmark frontier [options]' "$TMP/devlyn-benchmark-frontier-help.out"
|
|
397
|
+
grep -Fq -- '--out-json PATH' "$TMP/devlyn-benchmark-frontier-help.out"
|
|
398
|
+
grep -Fq -- '--out-md PATH' "$TMP/devlyn-benchmark-frontier-help.out"
|
|
399
|
+
grep -Fq -- '--fixtures-root PATH' "$TMP/devlyn-benchmark-frontier-help.out"
|
|
400
|
+
grep -Fq -- '--registry PATH' "$TMP/devlyn-benchmark-frontier-help.out"
|
|
401
|
+
grep -Fq -- '--results-root PATH' "$TMP/devlyn-benchmark-frontier-help.out"
|
|
402
|
+
grep -Fq -- '--fail-on-unmeasured' "$TMP/devlyn-benchmark-frontier-help.out"
|
|
403
|
+
grep -Fq -- '--min-pair-margin N default: 5' "$TMP/devlyn-benchmark-frontier-help.out"
|
|
404
|
+
grep -Fq -- '--max-pair-solo-wall-ratio N default: 3' "$TMP/devlyn-benchmark-frontier-help.out"
|
|
405
|
+
grep -Fq 'Prints pair evidence score rows with trigger reasons; --out-md includes a Triggers column' "$TMP/devlyn-benchmark-frontier-help.out"
|
|
406
|
+
grep -Fq 'npx devlyn-cli benchmark frontier --out-md /tmp/devlyn-pair-frontier.md' "$TMP/devlyn-benchmark-frontier-help.out"
|
|
407
|
+
|
|
408
|
+
node "$ROOT/bin/devlyn.js" benchmark audit-headroom --help > "$TMP/devlyn-benchmark-audit-headroom-help.out" 2>&1
|
|
409
|
+
grep -Fq 'npx devlyn-cli benchmark audit-headroom [options]' "$TMP/devlyn-benchmark-audit-headroom-help.out"
|
|
410
|
+
grep -Fq -- '--out-json PATH' "$TMP/devlyn-benchmark-audit-headroom-help.out"
|
|
411
|
+
grep -Fq -- '--fixtures-root PATH' "$TMP/devlyn-benchmark-audit-headroom-help.out"
|
|
412
|
+
grep -Fq -- '--registry PATH' "$TMP/devlyn-benchmark-audit-headroom-help.out"
|
|
413
|
+
grep -Fq -- '--results-root PATH' "$TMP/devlyn-benchmark-audit-headroom-help.out"
|
|
414
|
+
grep -Fq 'npx devlyn-cli benchmark audit-headroom --out-json /tmp/devlyn-headroom-audit.json' "$TMP/devlyn-benchmark-audit-headroom-help.out"
|
|
415
|
+
|
|
416
|
+
node "$ROOT/bin/devlyn.js" benchmark audit-headroom --out-json "$TMP/headroom-audit.json" > "$TMP/devlyn-benchmark-audit-headroom.out" 2>&1
|
|
417
|
+
grep -Fq 'PASS audit-headroom-rejections' "$TMP/devlyn-benchmark-audit-headroom.out"
|
|
418
|
+
python3 - "$TMP/headroom-audit.json" <<'PY'
|
|
419
|
+
import json
|
|
420
|
+
import sys
|
|
421
|
+
|
|
422
|
+
report = json.load(open(sys.argv[1], encoding="utf8"))
|
|
423
|
+
assert report["verdict"] == "PASS"
|
|
424
|
+
assert report["unrecorded_failures"] == []
|
|
425
|
+
assert report["unsupported_registry_rejections"] == []
|
|
426
|
+
PY
|
|
427
|
+
|
|
428
|
+
node "$ROOT/bin/devlyn.js" benchmark recent \
|
|
429
|
+
--out-json "$TMP/recent.json" \
|
|
430
|
+
--out-md "$TMP/recent.md" \
|
|
431
|
+
--max-width 92 > "$TMP/devlyn-benchmark-recent.out" 2>&1
|
|
432
|
+
grep -Fq 'Recent Benchmark Snapshot' "$TMP/devlyn-benchmark-recent.out"
|
|
433
|
+
grep -Fq 'Pair evidence rows: 4' "$TMP/devlyn-benchmark-recent.out"
|
|
434
|
+
grep -Fq 'Unmeasured candidates: 0' "$TMP/devlyn-benchmark-recent.out"
|
|
435
|
+
grep -Fq 'F21 cli scheduler priority' "$TMP/devlyn-benchmark-recent.out"
|
|
436
|
+
grep -Fq 'triggers: complexity.high, risk.high, risk_probes.enabled, spec.solo_headroom_hypothesis' "$TMP/devlyn-benchmark-recent.out"
|
|
437
|
+
grep -Fq '# Recent Benchmark Snapshot' "$TMP/recent.md"
|
|
438
|
+
grep -Fq '## Pair Evidence' "$TMP/recent.md"
|
|
439
|
+
if grep -Fq '| Fixture |' "$TMP/recent.md"; then
|
|
440
|
+
echo "recent benchmark markdown must use wrap-safe cards, not a wide table" >&2
|
|
441
|
+
cat "$TMP/recent.md" >&2
|
|
442
|
+
exit 1
|
|
443
|
+
fi
|
|
444
|
+
python3 - "$TMP/devlyn-benchmark-recent.out" "$TMP/recent.json" <<'PY'
|
|
445
|
+
import json
|
|
446
|
+
import pathlib
|
|
447
|
+
import sys
|
|
448
|
+
|
|
449
|
+
text = pathlib.Path(sys.argv[1]).read_text(encoding="utf8")
|
|
450
|
+
long_lines = [(i, len(line), line) for i, line in enumerate(text.splitlines(), 1) if len(line) > 92]
|
|
451
|
+
assert not long_lines, long_lines
|
|
452
|
+
report = json.load(open(sys.argv[2], encoding="utf8"))
|
|
453
|
+
assert report["verdict"] == "PASS"
|
|
454
|
+
assert report["pair_evidence_count"] == 4
|
|
455
|
+
assert report["unmeasured_count"] == 0
|
|
456
|
+
assert report["pair_margin_avg"] == 27.25
|
|
457
|
+
assert report["pair_solo_wall_ratio_max"] == 2.25
|
|
458
|
+
PY
|
|
459
|
+
|
|
460
|
+
node "$ROOT/bin/devlyn.js" benchmark audit --out-dir "$TMP/audit" > "$TMP/devlyn-benchmark-audit.out" 2>&1
|
|
461
|
+
grep -Fq '[audit] frontier' "$TMP/devlyn-benchmark-audit.out"
|
|
462
|
+
grep -Fq 'fixtures=21 rejected=17 candidates=4 pair_evidence=4 unmeasured=0 verdict=PASS' "$TMP/devlyn-benchmark-audit.out"
|
|
463
|
+
grep -Fq 'F16-cli-quote-tax-rules: bare=50 solo_claude=75 pair=96 arm=l2_risk_probes margin=+21' "$TMP/devlyn-benchmark-audit.out"
|
|
464
|
+
grep -Fq 'verdict=pair_evidence_passed' "$TMP/devlyn-benchmark-audit.out"
|
|
465
|
+
grep -Fq 'headroom_rejections=PASS verdict=PASS unrecorded=0 unsupported=0' "$TMP/devlyn-benchmark-audit.out"
|
|
466
|
+
grep -Fq 'pair_evidence_quality=PASS min_pair_margin_actual=+21 min_pair_margin_required=+5 max_wall_actual=2.25x max_wall_allowed=3.00x' "$TMP/devlyn-benchmark-audit.out"
|
|
467
|
+
grep -Fq 'pair_trigger_reasons=PASS canonical=4 historical_alias=0 exposed=4 total=4 summary=4 rows_match=true' "$TMP/devlyn-benchmark-audit.out"
|
|
468
|
+
grep -Fq 'pair_evidence_hypothesis_triggers=PASS matched=4 documented=4 total=4' "$TMP/devlyn-benchmark-audit.out"
|
|
469
|
+
if grep -Fq 'pair_trigger_historical_aliases=' "$TMP/devlyn-benchmark-audit.out" \
|
|
470
|
+
|| grep -Fq 'pair_evidence_hypothesis_trigger_gaps=' "$TMP/devlyn-benchmark-audit.out"; then
|
|
471
|
+
echo "current benchmark audit must not report historical aliases or hypothesis-trigger gaps" >&2
|
|
472
|
+
cat "$TMP/devlyn-benchmark-audit.out" >&2
|
|
473
|
+
exit 1
|
|
474
|
+
fi
|
|
475
|
+
grep -Fq 'PASS audit-pair-evidence' "$TMP/devlyn-benchmark-audit.out"
|
|
476
|
+
test -f "$TMP/audit/frontier.json"
|
|
477
|
+
test -f "$TMP/audit/frontier.stdout"
|
|
478
|
+
test -f "$TMP/audit/frontier.stderr"
|
|
479
|
+
test -f "$TMP/audit/headroom-audit.json"
|
|
480
|
+
test -f "$TMP/audit/headroom-rejections.stdout"
|
|
481
|
+
test -f "$TMP/audit/headroom-rejections.stderr"
|
|
482
|
+
test -f "$TMP/audit/audit.json"
|
|
483
|
+
grep -Fq 'F16-cli-quote-tax-rules: bare=50 solo_claude=75 pair=96 arm=l2_risk_probes margin=+21' "$TMP/audit/frontier.stdout"
|
|
484
|
+
grep -Fq 'verdict=pair_evidence_passed' "$TMP/audit/frontier.stdout"
|
|
485
|
+
python3 - "$TMP/audit/audit.json" "$TMP/audit/frontier.json" <<'PY'
|
|
486
|
+
import json
|
|
487
|
+
import sys
|
|
488
|
+
|
|
489
|
+
audit = json.load(open(sys.argv[1], encoding="utf8"))
|
|
490
|
+
frontier = json.load(open(sys.argv[2], encoding="utf8"))
|
|
491
|
+
assert audit["verdict"] == "PASS"
|
|
492
|
+
assert audit["min_pair_evidence"] == 4
|
|
493
|
+
assert audit["min_pair_margin"] == 5
|
|
494
|
+
assert audit["max_pair_solo_wall_ratio"] == 3.0
|
|
495
|
+
assert audit["checks"]["frontier"]["status"] == "PASS"
|
|
496
|
+
assert audit["checks"]["headroom_rejections"]["status"] == "PASS"
|
|
497
|
+
assert audit["checks"]["headroom_rejections"]["exit_code"] == 0
|
|
498
|
+
assert audit["checks"]["headroom_rejections"]["report_check_exit_code"] == 0
|
|
499
|
+
assert audit["checks"]["headroom_rejections"]["verdict"] == "PASS"
|
|
500
|
+
assert audit["checks"]["headroom_rejections"]["unrecorded_failure_count"] == 0
|
|
501
|
+
assert audit["checks"]["headroom_rejections"]["unsupported_registry_rejection_count"] == 0
|
|
502
|
+
assert audit["checks"]["frontier_report"]["status"] == "PASS"
|
|
503
|
+
assert audit["checks"]["frontier_report"]["verdict"] == frontier["verdict"]
|
|
504
|
+
assert audit["checks"]["frontier_report"]["unmeasured_count"] == frontier["unmeasured_count"]
|
|
505
|
+
assert audit["checks"]["frontier_stdout"]["status"] == "PASS"
|
|
506
|
+
assert audit["checks"]["frontier_stdout"]["summary_rows"] == 1
|
|
507
|
+
assert audit["checks"]["frontier_stdout"]["aggregate_rows"] == 1
|
|
508
|
+
assert audit["checks"]["frontier_stdout"]["final_verdict_rows"] == 1
|
|
509
|
+
assert audit["checks"]["frontier_stdout"]["expected_rows"] == len(audit["pair_evidence_rows"])
|
|
510
|
+
assert audit["checks"]["frontier_stdout"]["stdout_rows"] == len(audit["pair_evidence_rows"])
|
|
511
|
+
assert audit["checks"]["frontier_stdout"]["trigger_rows"] == len(audit["pair_evidence_rows"])
|
|
512
|
+
assert audit["checks"]["frontier_stdout"]["hypothesis_trigger_rows"] == len(audit["pair_evidence_rows"])
|
|
513
|
+
assert audit["checks"]["frontier_stdout"]["rows_match_count"] is True
|
|
514
|
+
assert audit["checks"]["frontier_stdout"]["trigger_rows_match_count"] is True
|
|
515
|
+
assert audit["checks"]["frontier_stdout"]["hypothesis_trigger_rows_match_count"] is True
|
|
516
|
+
assert audit["checks"]["min_pair_evidence"]["status"] == "PASS"
|
|
517
|
+
assert audit["checks"]["min_pair_evidence"]["actual_rows"] == len(audit["pair_evidence_rows"])
|
|
518
|
+
assert audit["checks"]["min_pair_evidence"]["actual_rows"] >= audit["min_pair_evidence"]
|
|
519
|
+
assert audit["checks"]["pair_evidence_quality"]["status"] == "PASS"
|
|
520
|
+
assert audit["checks"]["pair_evidence_quality"]["min_pair_margin_actual"] == frontier["pair_margin_min"]
|
|
521
|
+
assert audit["checks"]["pair_evidence_quality"]["max_pair_solo_wall_ratio_actual"] == frontier["pair_solo_wall_ratio_max"]
|
|
522
|
+
assert audit["checks"]["pair_trigger_reasons"]["status"] == "PASS"
|
|
523
|
+
assert audit["checks"]["pair_trigger_reasons"]["summary_pair_evidence_count"] == 4
|
|
524
|
+
assert audit["checks"]["pair_trigger_reasons"]["canonical_rows"] == 4
|
|
525
|
+
assert audit["checks"]["pair_trigger_reasons"]["historical_alias_rows"] == 0
|
|
526
|
+
assert audit["checks"]["pair_trigger_reasons"]["historical_alias_details"] == []
|
|
527
|
+
assert audit["checks"]["pair_trigger_reasons"]["exposed_rows"] == 4
|
|
528
|
+
assert audit["checks"]["pair_trigger_reasons"]["total_rows"] == 4
|
|
529
|
+
assert audit["checks"]["pair_trigger_reasons"]["rows_match_count"] is True
|
|
530
|
+
assert audit["checks"]["pair_evidence_hypothesis_triggers"]["status"] == "PASS"
|
|
531
|
+
assert audit["checks"]["pair_evidence_hypothesis_triggers"]["exit_code"] == 0
|
|
532
|
+
assert audit["checks"]["pair_evidence_hypothesis_triggers"]["required"] is False
|
|
533
|
+
assert audit["checks"]["pair_evidence_hypothesis_triggers"]["matched_rows"] == 4
|
|
534
|
+
assert audit["checks"]["pair_evidence_hypothesis_triggers"]["documented_rows"] == 4
|
|
535
|
+
assert audit["checks"]["pair_evidence_hypothesis_triggers"]["total_rows"] == 4
|
|
536
|
+
assert audit["checks"]["pair_evidence_hypothesis_triggers"]["gap_details"] == []
|
|
537
|
+
assert audit["artifacts"]["frontier_stdout"] == "frontier.stdout"
|
|
538
|
+
assert audit["artifacts"]["headroom_rejections_stdout"] == "headroom-rejections.stdout"
|
|
539
|
+
assert audit["frontier_summary"]["pair_margin_avg"] == frontier["pair_margin_avg"]
|
|
540
|
+
assert audit["frontier_summary"]["unmeasured_count"] == frontier["unmeasured_count"]
|
|
541
|
+
assert len(audit["pair_evidence_rows"]) == frontier["pair_evidence_count"]
|
|
542
|
+
for row in audit["pair_evidence_rows"]:
|
|
543
|
+
assert isinstance(row["fixture"], str) and row["fixture"]
|
|
544
|
+
assert row["verdict"] == "pair_evidence_passed"
|
|
545
|
+
assert isinstance(row["run_id"], str) and row["run_id"]
|
|
546
|
+
assert isinstance(row["pair_arm"], str) and row["pair_arm"]
|
|
547
|
+
assert isinstance(row["bare_score"], int) and not isinstance(row["bare_score"], bool)
|
|
548
|
+
assert isinstance(row["solo_score"], int) and not isinstance(row["solo_score"], bool)
|
|
549
|
+
assert isinstance(row["pair_score"], int) and not isinstance(row["pair_score"], bool)
|
|
550
|
+
assert isinstance(row["pair_margin"], int) and not isinstance(row["pair_margin"], bool)
|
|
551
|
+
assert row["pair_mode"] is True
|
|
552
|
+
assert row["pair_trigger_eligible"] is True
|
|
553
|
+
assert isinstance(row["pair_solo_wall_ratio"], (int, float))
|
|
554
|
+
assert not isinstance(row["pair_solo_wall_ratio"], bool)
|
|
555
|
+
assert frontier["verdict"] == "PASS"
|
|
556
|
+
assert frontier["min_pair_margin"] == 5
|
|
557
|
+
assert frontier["max_pair_solo_wall_ratio"] == 3.0
|
|
558
|
+
assert frontier["unmeasured_count"] == 0
|
|
559
|
+
assert frontier["pair_margin_avg"] is not None
|
|
560
|
+
assert frontier["pair_margin_min"] is not None
|
|
561
|
+
PY
|
|
562
|
+
|
|
563
|
+
actual_pair_evidence=$(python3 - "$TMP/audit/audit.json" <<'PY'
|
|
564
|
+
import json
|
|
565
|
+
import sys
|
|
566
|
+
|
|
567
|
+
audit = json.load(open(sys.argv[1], encoding="utf8"))
|
|
568
|
+
actual = audit["checks"]["min_pair_evidence"]["actual_rows"]
|
|
569
|
+
assert isinstance(actual, int) and not isinstance(actual, bool)
|
|
570
|
+
print(actual)
|
|
571
|
+
PY
|
|
572
|
+
)
|
|
573
|
+
required_pair_evidence=$((actual_pair_evidence + 1))
|
|
574
|
+
if node "$ROOT/bin/devlyn.js" benchmark audit \
|
|
575
|
+
--min-pair-evidence "$required_pair_evidence" \
|
|
576
|
+
--out-dir "$TMP/audit-fail" \
|
|
577
|
+
> "$TMP/devlyn-benchmark-audit-fail.out" 2>&1; then
|
|
578
|
+
echo "benchmark audit must fail when min pair evidence exceeds current evidence rows" >&2
|
|
579
|
+
cat "$TMP/devlyn-benchmark-audit-fail.out" >&2
|
|
580
|
+
exit 1
|
|
581
|
+
fi
|
|
582
|
+
grep -Fq "pair evidence count ${actual_pair_evidence} below required minimum ${required_pair_evidence}" "$TMP/devlyn-benchmark-audit-fail.out"
|
|
583
|
+
grep -Fq 'pair_margin_avg=+27.25 pair_margin_min=+21 wall_avg=1.66x wall_max=2.25x' "$TMP/devlyn-benchmark-audit-fail.out"
|
|
584
|
+
grep -Fq 'F16-cli-quote-tax-rules: bare=50 solo_claude=75 pair=96 arm=l2_risk_probes margin=+21' "$TMP/devlyn-benchmark-audit-fail.out"
|
|
585
|
+
grep -Fq 'headroom_rejections=PASS verdict=PASS unrecorded=0 unsupported=0' "$TMP/devlyn-benchmark-audit-fail.out"
|
|
586
|
+
grep -Fq 'pair_evidence_quality=PASS min_pair_margin_actual=+21 min_pair_margin_required=+5 max_wall_actual=2.25x max_wall_allowed=3.00x' "$TMP/devlyn-benchmark-audit-fail.out"
|
|
587
|
+
grep -Fq 'pair_trigger_reasons=PASS canonical=4 historical_alias=0 exposed=4 total=4 summary=4 rows_match=true' "$TMP/devlyn-benchmark-audit-fail.out"
|
|
588
|
+
grep -Fq 'pair_evidence_hypothesis_triggers=PASS matched=4 documented=4 total=4' "$TMP/devlyn-benchmark-audit-fail.out"
|
|
589
|
+
grep -Fq 'FAIL audit-pair-evidence' "$TMP/devlyn-benchmark-audit-fail.out"
|
|
590
|
+
python3 - "$TMP/audit-fail/audit.json" "$actual_pair_evidence" "$required_pair_evidence" <<'PY'
|
|
591
|
+
import json
|
|
592
|
+
import sys
|
|
593
|
+
|
|
594
|
+
audit = json.load(open(sys.argv[1], encoding="utf8"))
|
|
595
|
+
actual = int(sys.argv[2])
|
|
596
|
+
required = int(sys.argv[3])
|
|
597
|
+
assert audit["verdict"] == "FAIL"
|
|
598
|
+
assert audit["checks"]["frontier"]["status"] == "PASS"
|
|
599
|
+
assert audit["checks"]["headroom_rejections"]["status"] == "PASS"
|
|
600
|
+
assert audit["checks"]["headroom_rejections"]["report_check_exit_code"] == 0
|
|
601
|
+
assert audit["checks"]["headroom_rejections"]["verdict"] == "PASS"
|
|
602
|
+
assert audit["checks"]["headroom_rejections"]["unrecorded_failure_count"] == 0
|
|
603
|
+
assert audit["checks"]["headroom_rejections"]["unsupported_registry_rejection_count"] == 0
|
|
604
|
+
assert audit["checks"]["min_pair_evidence"]["status"] == "FAIL"
|
|
605
|
+
assert audit["checks"]["min_pair_evidence"]["required"] == required
|
|
606
|
+
assert audit["checks"]["min_pair_evidence"]["actual_rows"] == actual
|
|
607
|
+
assert audit["checks"]["pair_evidence_quality"]["status"] == "PASS"
|
|
608
|
+
assert audit["checks"]["pair_trigger_reasons"]["status"] == "PASS"
|
|
609
|
+
assert audit["checks"]["pair_trigger_reasons"]["summary_pair_evidence_count"] == actual
|
|
610
|
+
assert audit["checks"]["pair_trigger_reasons"]["historical_alias_rows"] == 0
|
|
611
|
+
assert audit["checks"]["pair_trigger_reasons"]["rows_match_count"] is True
|
|
612
|
+
assert audit["checks"]["pair_evidence_hypothesis_triggers"]["status"] == "PASS"
|
|
613
|
+
assert audit["checks"]["pair_evidence_hypothesis_triggers"]["matched_rows"] == actual
|
|
614
|
+
PY
|
|
615
|
+
|
|
616
|
+
node "$ROOT/bin/devlyn.js" benchmark audit \
|
|
617
|
+
--require-hypothesis-trigger \
|
|
618
|
+
--out-dir "$TMP/audit-strict-trigger" \
|
|
619
|
+
> "$TMP/devlyn-benchmark-audit-strict-trigger.out" 2>&1
|
|
620
|
+
grep -Fq 'pair_evidence_hypothesis_triggers=PASS matched=4 documented=4 total=4' "$TMP/devlyn-benchmark-audit-strict-trigger.out"
|
|
621
|
+
grep -Fq 'PASS audit-pair-evidence' "$TMP/devlyn-benchmark-audit-strict-trigger.out"
|
|
622
|
+
if grep -Fq 'pair_evidence_hypothesis_trigger_gaps=' "$TMP/devlyn-benchmark-audit-strict-trigger.out"; then
|
|
623
|
+
echo "strict benchmark audit must not report current hypothesis-trigger gaps" >&2
|
|
624
|
+
cat "$TMP/devlyn-benchmark-audit-strict-trigger.out" >&2
|
|
625
|
+
exit 1
|
|
626
|
+
fi
|
|
627
|
+
python3 - "$TMP/audit-strict-trigger/audit.json" <<'PY'
|
|
628
|
+
import json
|
|
629
|
+
import sys
|
|
630
|
+
|
|
631
|
+
audit = json.load(open(sys.argv[1], encoding="utf8"))
|
|
632
|
+
assert audit["verdict"] == "PASS"
|
|
633
|
+
assert audit["checks"]["pair_evidence_hypothesis_triggers"]["status"] == "PASS"
|
|
634
|
+
assert audit["checks"]["pair_evidence_hypothesis_triggers"]["exit_code"] == 0
|
|
635
|
+
assert audit["checks"]["pair_evidence_hypothesis_triggers"]["required"] is True
|
|
636
|
+
assert audit["checks"]["pair_evidence_hypothesis_triggers"]["matched_rows"] == 4
|
|
637
|
+
assert audit["checks"]["pair_evidence_hypothesis_triggers"]["documented_rows"] == 4
|
|
638
|
+
assert audit["checks"]["pair_evidence_hypothesis_triggers"]["total_rows"] == 4
|
|
639
|
+
assert audit["checks"]["pair_evidence_hypothesis_triggers"]["gap_details"] == []
|
|
640
|
+
PY
|
|
641
|
+
|
|
642
|
+
node "$ROOT/bin/devlyn.js" benchmark frontier --out-json "$TMP/frontier.json" > "$TMP/devlyn-benchmark-frontier.out" 2>&1
|
|
643
|
+
grep -Fq 'fixtures=' "$TMP/devlyn-benchmark-frontier.out"
|
|
644
|
+
grep -Fq 'rejected=' "$TMP/devlyn-benchmark-frontier.out"
|
|
645
|
+
grep -Fq 'candidates=' "$TMP/devlyn-benchmark-frontier.out"
|
|
646
|
+
grep -Fq 'pair_evidence=' "$TMP/devlyn-benchmark-frontier.out"
|
|
647
|
+
grep -Fq 'pair_margin_avg=' "$TMP/devlyn-benchmark-frontier.out"
|
|
648
|
+
grep -Fq 'PASS pair-candidate-frontier' "$TMP/devlyn-benchmark-frontier.out"
|
|
649
|
+
python3 - "$TMP/frontier.json" <<'PY'
|
|
650
|
+
import json
|
|
651
|
+
import sys
|
|
652
|
+
|
|
653
|
+
report = json.load(open(sys.argv[1], encoding="utf8"))
|
|
654
|
+
assert report["verdict"] in {"PASS", "FAIL"}
|
|
655
|
+
assert report["fixtures_total"] >= 1
|
|
656
|
+
assert "unmeasured_count" in report
|
|
657
|
+
assert "pair_margin_avg" in report
|
|
658
|
+
assert "rows" in report
|
|
659
|
+
PY
|
|
660
|
+
|
|
661
|
+
frontier_fail_fixtures="$TMP/frontier-fail-fixtures"
|
|
662
|
+
frontier_fail_results="$TMP/frontier-fail-results"
|
|
663
|
+
frontier_fail_registry="$TMP/frontier-fail-rejected.sh"
|
|
664
|
+
mkdir -p "$frontier_fail_fixtures/F21-cli-scheduler-priority" "$frontier_fail_results"
|
|
665
|
+
cat > "$frontier_fail_registry" <<'SH'
|
|
666
|
+
rejected_pair_fixture_reason() {
|
|
667
|
+
local fid="$1"
|
|
668
|
+
case "$fid" in
|
|
669
|
+
F2-*|F2)
|
|
670
|
+
echo "measured ceiling"
|
|
671
|
+
;;
|
|
672
|
+
*)
|
|
673
|
+
return 1
|
|
674
|
+
;;
|
|
675
|
+
esac
|
|
676
|
+
}
|
|
677
|
+
SH
|
|
678
|
+
if node "$ROOT/bin/devlyn.js" benchmark frontier \
|
|
679
|
+
--fixtures-root "$frontier_fail_fixtures" \
|
|
680
|
+
--registry "$frontier_fail_registry" \
|
|
681
|
+
--results-root "$frontier_fail_results" \
|
|
682
|
+
--fail-on-unmeasured \
|
|
683
|
+
--out-json "$TMP/frontier-fail.json" \
|
|
684
|
+
> "$TMP/devlyn-benchmark-frontier-fail.out" 2>&1; then
|
|
685
|
+
echo "benchmark frontier must fail when active unmeasured candidates remain" >&2
|
|
686
|
+
cat "$TMP/devlyn-benchmark-frontier-fail.out" >&2
|
|
687
|
+
exit 1
|
|
688
|
+
fi
|
|
689
|
+
grep -Fq 'fixtures=1 rejected=0 candidates=1 pair_evidence=0 unmeasured=1 verdict=FAIL' "$TMP/devlyn-benchmark-frontier-fail.out"
|
|
690
|
+
grep -Fq 'unmeasured candidate fixture(s): F21-cli-scheduler-priority' "$TMP/devlyn-benchmark-frontier-fail.out"
|
|
691
|
+
grep -Fq 'FAIL pair-candidate-frontier' "$TMP/devlyn-benchmark-frontier-fail.out"
|
|
692
|
+
# Validate the machine-readable failure report written by --out-json.
python3 - "$TMP/frontier-fail.json" <<'PY'
import json
import sys

# Context manager closes the report file deterministically (the original
# relied on GC to close the bare open() handle).
with open(sys.argv[1], encoding="utf8") as fh:
    report = json.load(fh)
assert report["verdict"] == "FAIL"
assert report["fixtures_total"] == 1
assert report["candidate_count"] == 1
assert report["unmeasured_count"] == 1
assert report["rows"][0]["status"] == "candidate_unmeasured"
PY
|
|
703
|
+
|
|
704
|
+
# Without --out-json the command must stream pure JSON on stdout and keep all
# human-readable diagnostics (including the final verdict line) on stderr.
# Capture the exit status manually so set -e does not abort the script.
set +e
node "$ROOT/bin/devlyn.js" benchmark frontier \
  --fixtures-root "$frontier_fail_fixtures" \
  --registry "$frontier_fail_registry" \
  --results-root "$frontier_fail_results" \
  --fail-on-unmeasured \
  > "$TMP/devlyn-benchmark-frontier-json-fail.json" \
  2> "$TMP/devlyn-benchmark-frontier-json-fail.stderr"
frontier_json_fail_status=$?
set -e
if [ "$frontier_json_fail_status" -eq 0 ]; then
  echo "benchmark frontier pure JSON failure path must fail" >&2
  exit 1
fi
# Diagnostics belong on stderr ...
grep -Fq 'unmeasured candidate fixture(s): F21-cli-scheduler-priority' "$TMP/devlyn-benchmark-frontier-json-fail.stderr"
grep -Fq 'FAIL pair-candidate-frontier' "$TMP/devlyn-benchmark-frontier-json-fail.stderr"
# ... and must NOT leak into the JSON on stdout.
if grep -Fq 'FAIL pair-candidate-frontier' "$TMP/devlyn-benchmark-frontier-json-fail.json"; then
  echo "benchmark frontier pure JSON stdout must not include final text verdict" >&2
  cat "$TMP/devlyn-benchmark-frontier-json-fail.json" >&2
  exit 1
fi
python3 - "$TMP/devlyn-benchmark-frontier-json-fail.json" <<'PY'
import json
import sys

# Context manager closes the report file deterministically (the original
# relied on GC to close the bare open() handle).
with open(sys.argv[1], encoding="utf8") as fh:
    report = json.load(fh)
assert report["verdict"] == "FAIL"
assert report["fixtures_total"] == 1
assert report["unmeasured_count"] == 1
assert report["rows"][0]["status"] == "candidate_unmeasured"
PY
|
|
735
|
+
|
|
736
|
+
# benchmark suite dry-run must print the banner and echo back the --run-id.
suite_out="$TMP/devlyn-benchmark-suite.out"
node "$ROOT/bin/devlyn.js" benchmark suite --dry-run --run-id arg-parse-command-test F0 \
  > "$suite_out" 2>&1
grep -Fq '═══ Benchmark Suite Run ═══' "$suite_out"
grep -Fq -- '--run-id arg-parse-command-test' "$suite_out"
|
|
740
|
+
|
|
741
|
+
# headroom --help must advertise the public CLI surface (usage line, guidance
# text, flags, worked example) and must not leak the internal runner name.
headroom_help_out="$TMP/devlyn-benchmark-headroom-help.out"
node "$ROOT/bin/devlyn.js" benchmark headroom --help > "$headroom_help_out" 2>&1
headroom_help_expected=(
  'npx devlyn-cli benchmark headroom [options] <fixtures...>'
  'use 3 for F16/F23/F25 proof reruns; audit requires 4 passing evidence rows'
  'npx devlyn-cli benchmark headroom --min-fixtures 3 F16-cli-quote-tax-rules F23-cli-fulfillment-wave F25-cli-cart-promotion-rules'
  '--min-bare-headroom N'
  '--min-solo-headroom N'
  '--allow-rejected-fixtures'
  '--dry-run'
)
for headroom_help_pattern in "${headroom_help_expected[@]}"; do
  grep -Fq -- "$headroom_help_pattern" "$headroom_help_out"
done
if grep -Fq 'run-headroom-candidate.sh' "$headroom_help_out"; then
  echo "headroom CLI help must not expose internal runner name" >&2
  cat "$headroom_help_out" >&2
  exit 1
fi
|
|
754
|
+
# pair --help must advertise the public CLI surface (usage line, guidance,
# flags with defaults, worked example) and must not leak the internal runner.
pair_help_out="$TMP/devlyn-benchmark-pair-help.out"
node "$ROOT/bin/devlyn.js" benchmark pair --help > "$pair_help_out" 2>&1
pair_help_expected=(
  'npx devlyn-cli benchmark pair [options] <fixtures...>'
  'use 3 for F16/F23/F25 proof reruns; audit requires 4 passing evidence rows'
  'default: l2_risk_probes; l2_gated is diagnostic'
  '--min-bare-headroom N'
  '--min-solo-headroom N'
  '--max-pair-solo-wall-ratio N default: 3'
  '--allow-rejected-fixtures'
  'npx devlyn-cli benchmark pair --min-fixtures 3 --max-pair-solo-wall-ratio 3 F16-cli-quote-tax-rules F23-cli-fulfillment-wave F25-cli-cart-promotion-rules'
  '--dry-run'
)
for pair_help_pattern in "${pair_help_expected[@]}"; do
  grep -Fq -- "$pair_help_pattern" "$pair_help_out"
done
if grep -Fq 'run-full-pipeline-pair-candidate.sh' "$pair_help_out"; then
  echo "pair CLI help must not expose internal runner name" >&2
  cat "$pair_help_out" >&2
  exit 1
fi
# The benchmark subcommand dispatch marker must remain present in the CLI entry.
grep -Fq 'DEVLYN_BENCHMARK_CLI_SUBCOMMAND: benchmarkMode' "$ROOT/bin/devlyn.js"
|
|
770
|
+
|
|
771
|
+
# An unknown fixture must fail, and the error output must include the replay
# command carrying the original --run-id (expect_fail_contains is a helper
# defined earlier in this script: label, required substring, then the command).
expect_fail_contains devlyn-headroom-cli-replay \
  'Command: npx devlyn-cli benchmark headroom --run-id arg-parse-headroom-cli-replay' \
  node "$ROOT/bin/devlyn.js" benchmark headroom \
  --run-id arg-parse-headroom-cli-replay \
  --min-fixtures 2 \
  F999-not-a-fixture

# Reusing calibration from a run that never happened must fail the same way,
# with the pair replay command surfaced in the error output.
expect_fail_contains devlyn-pair-cli-replay \
  'Command: npx devlyn-cli benchmark pair --run-id arg-parse-pair-cli-replay' \
  node "$ROOT/bin/devlyn.js" benchmark pair \
  --run-id arg-parse-pair-cli-replay \
  --reuse-calibrated-from arg-parse-missing-calibration \
  F21-cli-scheduler-priority
|
|
784
|
+
|
|
785
|
+
# headroom --dry-run on a real fixture: replay command, defaulted thresholds,
# the --dry-run echo, and the completion banner must all appear.
headroom_dry_out="$TMP/devlyn-headroom-dry-run.out"
node "$ROOT/bin/devlyn.js" benchmark headroom \
  --run-id arg-parse-headroom-dry-run \
  --dry-run \
  --min-fixtures 1 \
  F21-cli-scheduler-priority > "$headroom_dry_out" 2>&1
grep -Fq 'Command: npx devlyn-cli benchmark headroom --run-id arg-parse-headroom-dry-run' "$headroom_dry_out"
grep -Fq -- '--min-bare-headroom 5' "$headroom_dry_out"
grep -Fq -- '--min-solo-headroom 5' "$headroom_dry_out"
grep -Fq -- '--dry-run' "$headroom_dry_out"
grep -Fq '[headroom] DRY RUN complete' "$headroom_dry_out"

# A smoke-only shadow fixture is allowed through in dry-run mode ...
shadow_headroom_dry_out="$TMP/devlyn-shadow-headroom-dry-run.out"
node "$ROOT/bin/devlyn.js" benchmark headroom \
  --run-id arg-parse-shadow-headroom-dry-run \
  --dry-run \
  --min-fixtures 1 \
  S1-cli-lang-flag > "$shadow_headroom_dry_out" 2>&1
grep -Fq 'Fixtures: S1-cli-lang-flag' "$shadow_headroom_dry_out"
grep -Fq '[headroom] DRY RUN complete' "$shadow_headroom_dry_out"

# ... but a real (non-dry) run against it must be blocked with a clear error.
expect_fail_contains smoke-only-s1-cli-headroom \
  "fixture is smoke-only and cannot run providers: S1-cli-lang-flag" \
  node "$ROOT/bin/devlyn.js" benchmark headroom \
  --run-id arg-parse-shadow-headroom-block \
  --min-fixtures 1 \
  S1-cli-lang-flag
|
|
810
|
+
|
|
811
|
+
# pair --dry-run on a real fixture: replay command, defaulted thresholds
# (including the pair/solo wall-clock ratio), the --dry-run echo, and the
# completion banner must all appear.
pair_dry_out="$TMP/devlyn-pair-dry-run.out"
node "$ROOT/bin/devlyn.js" benchmark pair \
  --run-id arg-parse-pair-dry-run \
  --dry-run \
  --min-fixtures 1 \
  F21-cli-scheduler-priority > "$pair_dry_out" 2>&1
grep -Fq 'Command: npx devlyn-cli benchmark pair --run-id arg-parse-pair-dry-run' "$pair_dry_out"
grep -Fq -- '--min-bare-headroom 5' "$pair_dry_out"
grep -Fq -- '--min-solo-headroom 5' "$pair_dry_out"
grep -Fq -- '--max-pair-solo-wall-ratio 3' "$pair_dry_out"
grep -Fq -- '--dry-run' "$pair_dry_out"
grep -Fq '[full-pipeline-pair] DRY RUN complete' "$pair_dry_out"

# A smoke-only shadow fixture is allowed through in dry-run mode ...
shadow_pair_dry_out="$TMP/devlyn-shadow-pair-dry-run.out"
node "$ROOT/bin/devlyn.js" benchmark pair \
  --run-id arg-parse-shadow-pair-dry-run \
  --dry-run \
  --min-fixtures 1 \
  S1-cli-lang-flag > "$shadow_pair_dry_out" 2>&1
grep -Fq 'Fixtures: S1-cli-lang-flag' "$shadow_pair_dry_out"
grep -Fq '[full-pipeline-pair] DRY RUN complete' "$shadow_pair_dry_out"

# ... but a real (non-dry) run against it must be blocked with a clear error.
expect_fail_contains smoke-only-s1-cli-pair \
  "fixture is smoke-only and cannot run providers: S1-cli-lang-flag" \
  node "$ROOT/bin/devlyn.js" benchmark pair \
  --run-id arg-parse-shadow-pair-block \
  --min-fixtures 1 \
  S1-cli-lang-flag
|
|
837
|
+
|
|
838
|
+
# The variant arm's generated prompt must use the current engine route and
# must not fall back to the retired '--engine auto' route.
variant_input_md="$BENCH_ROOT/results/arg-parse-variant-path/F1-cli-trivial-flag/variant/input.md"
bash "$ROOT/benchmark/auto-resolve/scripts/run-fixture.sh" \
  --fixture F1-cli-trivial-flag \
  --arm variant \
  --run-id arg-parse-variant-path \
  --dry-run > "$TMP/variant-dry-run.out" 2>&1
grep -Fq -- '--engine claude --risk-probes' "$variant_input_md"
if grep -Fq -- '--engine auto' "$variant_input_md"; then
  echo "variant arm must not use retired --engine auto route" >&2
  cat "$variant_input_md" >&2
  exit 1
fi
|
|
851
|
+
# Fixtures whose metadata.json or expected.json contain the non-standard JSON
# constant NaN must be rejected up front with a precise parse error.
mkdir -p "$BENCH_ROOT/shadow-fixtures"
nan_scratch_dirs=(
  "$BENCH_ROOT/shadow-fixtures/arg-parse-nan-metadata"
  "$BENCH_ROOT/shadow-fixtures/arg-parse-nan-expected"
  "$BENCH_ROOT/results/arg-parse-nan-metadata"
  "$BENCH_ROOT/results/arg-parse-nan-expected"
)
rm -rf -- "${nan_scratch_dirs[@]}"
# NaN in metadata.json (bare arm).
cp -R "$BENCH_ROOT/fixtures/F1-cli-trivial-flag" "$BENCH_ROOT/shadow-fixtures/arg-parse-nan-metadata"
printf '{"timeout_seconds": NaN}\n' > "$BENCH_ROOT/shadow-fixtures/arg-parse-nan-metadata/metadata.json"
expect_fail_contains fixture-nan-metadata "invalid JSON numeric constant: NaN" \
  bash "$ROOT/benchmark/auto-resolve/scripts/run-fixture.sh" \
  --fixture arg-parse-nan-metadata \
  --arm bare \
  --run-id arg-parse-nan-metadata \
  --dry-run
# NaN in expected.json (variant arm).
cp -R "$BENCH_ROOT/fixtures/F1-cli-trivial-flag" "$BENCH_ROOT/shadow-fixtures/arg-parse-nan-expected"
printf '{"verification_commands": NaN}\n' > "$BENCH_ROOT/shadow-fixtures/arg-parse-nan-expected/expected.json"
expect_fail_contains fixture-nan-expected "invalid JSON numeric constant: NaN" \
  bash "$ROOT/benchmark/auto-resolve/scripts/run-fixture.sh" \
  --fixture arg-parse-nan-expected \
  --arm variant \
  --run-id arg-parse-nan-expected \
  --dry-run
# Clean up the scratch fixtures and any results they produced.
rm -rf -- "${nan_scratch_dirs[@]}"
|
|
876
|
+
# Guard-rail greps: run-fixture.sh must keep its strict-JSON loading, oracle
# error handling, and disqualifier code paths. Each marker is a literal line
# expected somewhere in the script.
run_fixture_script="$ROOT/benchmark/auto-resolve/scripts/run-fixture.sh"
run_fixture_markers=(
  'data = raw_oracle'
  'expected = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())'
  'oracle artifact malformed or unreadable'
  'findings = raw_findings if isinstance(raw_findings, list) else []'
  'if not isinstance(finding, dict):'
  'loads_strict_json_object(pathlib.Path(result_dir, "timing.json").read_text())'
  'loads_strict_json_object(pathlib.Path(result_dir, "verify.json").read_text())'
  'loads_strict_json_object(pathlib.Path(state_path).read_text())'
  '"type": "oracle-error"'
  'verify["oracle_disqualifier"] = True'
)
for run_fixture_marker in "${run_fixture_markers[@]}"; do
  grep -Fq -- "$run_fixture_marker" "$run_fixture_script"
done
|
|
896
|
+
|
|
897
|
+
# Create a one-commit throwaway git repo used by the scope-oracle tests below;
# SCOPE_SHA records its HEAD so the oracles can diff against a fixed scaffold.
SCOPE_REPO="$TMP/scope-repo"
mkdir -p "$SCOPE_REPO"
git -C "$SCOPE_REPO" init -q
# Local identity so 'git commit' works in CI environments with no global config.
git -C "$SCOPE_REPO" config user.email bench@example.com
git -C "$SCOPE_REPO" config user.name bench
printf 'console.log("ok")\n' > "$SCOPE_REPO/app.js"
git -C "$SCOPE_REPO" add app.js
git -C "$SCOPE_REPO" commit -q -m base
SCOPE_SHA="$(git -C "$SCOPE_REPO" rev-parse HEAD)"
|
|
906
|
+
|
|
907
|
+
# Both scope oracles must surface a NaN in expected.json as a structured
# "unreadable" error in their JSON output rather than crashing or ignoring it.
cat > "$TMP/expected-nan.json" <<'JSON'
{"tier_a_waivers": NaN, "spec_output_files": ["app.js"]}
JSON
nan_error_marker='"error": "expected.json unreadable: invalid JSON numeric constant: NaN"'
python3 "$ROOT/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py" \
  --work "$SCOPE_REPO" \
  --scaffold "$SCOPE_SHA" \
  --expected "$TMP/expected-nan.json" > "$TMP/scope-tier-a-nan.json"
grep -Fq -- "$nan_error_marker" "$TMP/scope-tier-a-nan.json"
python3 "$ROOT/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py" \
  --work "$SCOPE_REPO" \
  --scaffold "$SCOPE_SHA" \
  --expected "$TMP/expected-nan.json" > "$TMP/scope-tier-b-nan.json"
grep -Fq -- "$nan_error_marker" "$TMP/scope-tier-b-nan.json"

# A scalar where a string array is required must yield a "malformed" error.
cat > "$TMP/expected-bad-tier-c.json" <<'JSON'
{"tier_a_waivers": [], "spec_output_files": "app.js"}
JSON
python3 "$ROOT/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py" \
  --work "$SCOPE_REPO" \
  --scaffold "$SCOPE_SHA" \
  --expected "$TMP/expected-bad-tier-c.json" > "$TMP/scope-tier-b-bad-tier-c.json"
grep -Fq '"error": "expected.json malformed: spec_output_files must be a string array"' \
  "$TMP/scope-tier-b-bad-tier-c.json"

echo "PASS test-benchmark-arg-parsing"
|