devlyn-cli 2.2.2 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +2 -2
- package/CLAUDE.md +4 -4
- package/README.md +85 -34
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
- package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
- package/benchmark/auto-resolve/README.md +307 -44
- package/benchmark/auto-resolve/RUBRIC.md +23 -14
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
- package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
- package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
- package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
- package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
- package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
- package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
- package/benchmark/auto-resolve/scripts/judge.sh +153 -26
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
- package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
- package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
- package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
- package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
- package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
- package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
- package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
- package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
- package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
- package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
- package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
- package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
- package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
- package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
- package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
- package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
- package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
- package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
- package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
- package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
- package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
- package/bin/devlyn.js +221 -17
- package/config/skills/_shared/adapters/README.md +3 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
- package/config/skills/_shared/adapters/opus-4-7.md +9 -1
- package/config/skills/_shared/archive_run.py +78 -6
- package/config/skills/_shared/codex-config.md +5 -4
- package/config/skills/_shared/codex-monitored.sh +46 -1
- package/config/skills/_shared/collect-codex-findings.py +20 -5
- package/config/skills/_shared/engine-preflight.md +17 -13
- package/config/skills/_shared/runtime-principles.md +6 -9
- package/config/skills/_shared/spec-verify-check.py +2664 -107
- package/config/skills/_shared/verify-merge-findings.py +1369 -19
- package/config/skills/devlyn:design-ui/SKILL.md +364 -0
- package/config/skills/devlyn:ideate/SKILL.md +7 -4
- package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
- package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
- package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
- package/config/skills/devlyn:resolve/SKILL.md +78 -26
- package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
- package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
- package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
- package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
- package/package.json +47 -2
- package/scripts/lint-fixtures.sh +349 -0
- package/scripts/lint-shadow-fixtures.sh +58 -0
- package/scripts/lint-skills.sh +3645 -95
|
@@ -8,25 +8,93 @@ set -euo pipefail
|
|
|
8
8
|
|
|
9
9
|
usage() {
|
|
10
10
|
local code="${1:-1}"
|
|
11
|
-
|
|
11
|
+
cat >&2 <<'EOF'
|
|
12
|
+
usage: run-headroom-candidate.sh [options] <fixture> [<fixture> ...]
|
|
13
|
+
|
|
14
|
+
Options:
|
|
15
|
+
--run-id ID
|
|
16
|
+
--bare-max N (default: 60)
|
|
17
|
+
--solo-max N (default: 80)
|
|
18
|
+
--min-bare-headroom N (default: 5)
|
|
19
|
+
--min-solo-headroom N (default: 5)
|
|
20
|
+
--min-fixtures N (default: 2)
|
|
21
|
+
--allow-rejected-fixtures
|
|
22
|
+
allow rejected/ceiling fixtures for diagnostics only
|
|
23
|
+
--dry-run validate args/fixtures and print replay command only
|
|
24
|
+
EOF
|
|
12
25
|
exit "$code"
|
|
13
26
|
}
|
|
14
27
|
|
|
28
|
+
require_value() {
|
|
29
|
+
local flag="$1"
|
|
30
|
+
local value="${2:-}"
|
|
31
|
+
if [ -z "$value" ] || [[ "$value" == --* ]]; then
|
|
32
|
+
echo "$flag requires a value" >&2
|
|
33
|
+
exit 1
|
|
34
|
+
fi
|
|
35
|
+
}
|
|
36
|
+
|
|
15
37
|
RUN_ID=""
|
|
38
|
+
BARE_MAX=60
|
|
39
|
+
SOLO_MAX=80
|
|
40
|
+
MIN_BARE_HEADROOM=5
|
|
41
|
+
MIN_SOLO_HEADROOM=5
|
|
42
|
+
MIN_FIXTURES=2
|
|
43
|
+
ALLOW_REJECTED_FIXTURES=0
|
|
44
|
+
DRY_RUN=0
|
|
16
45
|
FIXTURES=()
|
|
17
46
|
while [ $# -gt 0 ]; do
|
|
18
47
|
case "$1" in
|
|
19
|
-
--run-id) RUN_ID="$2"; shift 2;;
|
|
48
|
+
--run-id) require_value "$1" "${2:-}"; RUN_ID="$2"; shift 2;;
|
|
49
|
+
--bare-max) require_value "$1" "${2:-}"; BARE_MAX="$2"; shift 2;;
|
|
50
|
+
--solo-max) require_value "$1" "${2:-}"; SOLO_MAX="$2"; shift 2;;
|
|
51
|
+
--min-bare-headroom) require_value "$1" "${2:-}"; MIN_BARE_HEADROOM="$2"; shift 2;;
|
|
52
|
+
--min-solo-headroom) require_value "$1" "${2:-}"; MIN_SOLO_HEADROOM="$2"; shift 2;;
|
|
53
|
+
--min-fixtures) require_value "$1" "${2:-}"; MIN_FIXTURES="$2"; shift 2;;
|
|
54
|
+
--allow-rejected-fixtures) ALLOW_REJECTED_FIXTURES=1; shift;;
|
|
55
|
+
--dry-run) DRY_RUN=1; shift;;
|
|
20
56
|
-h|--help) usage 0;;
|
|
21
|
-
|
|
57
|
+
[FS][0-9]*) FIXTURES+=("$1"); shift;;
|
|
22
58
|
*) echo "unknown arg: $1" >&2; usage;;
|
|
23
59
|
esac
|
|
24
60
|
done
|
|
25
61
|
|
|
62
|
+
for threshold in BARE_MAX SOLO_MAX MIN_BARE_HEADROOM MIN_SOLO_HEADROOM MIN_FIXTURES; do
|
|
63
|
+
value="${!threshold}"
|
|
64
|
+
case "$threshold" in
|
|
65
|
+
BARE_MAX) flag="bare-max" ;;
|
|
66
|
+
SOLO_MAX) flag="solo-max" ;;
|
|
67
|
+
MIN_BARE_HEADROOM) flag="min-bare-headroom" ;;
|
|
68
|
+
MIN_SOLO_HEADROOM) flag="min-solo-headroom" ;;
|
|
69
|
+
MIN_FIXTURES) flag="min-fixtures" ;;
|
|
70
|
+
esac
|
|
71
|
+
if [[ ! "$value" =~ ^[0-9]+$ ]]; then
|
|
72
|
+
echo "--$flag must be an integer: $value" >&2
|
|
73
|
+
exit 1
|
|
74
|
+
fi
|
|
75
|
+
done
|
|
76
|
+
if [ "$MIN_FIXTURES" -lt 1 ]; then
|
|
77
|
+
echo "--min-fixtures must be >= 1" >&2
|
|
78
|
+
exit 1
|
|
79
|
+
fi
|
|
80
|
+
if [ "$MIN_BARE_HEADROOM" -lt 0 ]; then
|
|
81
|
+
echo "--min-bare-headroom must be >= 0" >&2
|
|
82
|
+
exit 1
|
|
83
|
+
fi
|
|
84
|
+
if [ "$MIN_SOLO_HEADROOM" -lt 0 ]; then
|
|
85
|
+
echo "--min-solo-headroom must be >= 0" >&2
|
|
86
|
+
exit 1
|
|
87
|
+
fi
|
|
88
|
+
|
|
26
89
|
[ ${#FIXTURES[@]} -gt 0 ] || usage
|
|
27
90
|
|
|
28
91
|
BENCH_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
|
|
29
92
|
REPO_ROOT="$(cd "$BENCH_ROOT/../.." && pwd)"
|
|
93
|
+
source "$BENCH_ROOT/scripts/pair-rejected-fixtures.sh"
|
|
94
|
+
if ! declare -F rejected_pair_fixture_reason >/dev/null; then
|
|
95
|
+
echo "rejected fixture registry must define rejected_pair_fixture_reason" >&2
|
|
96
|
+
exit 1
|
|
97
|
+
fi
|
|
30
98
|
|
|
31
99
|
if [ -z "$RUN_ID" ]; then
|
|
32
100
|
TS=$(date -u +%Y%m%dT%H%M%SZ)
|
|
@@ -34,16 +102,177 @@ if [ -z "$RUN_ID" ]; then
|
|
|
34
102
|
RUN_ID="${TS}-${SHA}-headroom"
|
|
35
103
|
fi
|
|
36
104
|
|
|
105
|
+
print_command() {
|
|
106
|
+
local cmd
|
|
107
|
+
if [ "${DEVLYN_BENCHMARK_CLI_SUBCOMMAND:-}" = "headroom" ]; then
|
|
108
|
+
cmd=(npx devlyn-cli benchmark headroom --run-id "$RUN_ID")
|
|
109
|
+
else
|
|
110
|
+
cmd=(bash "$0" --run-id "$RUN_ID")
|
|
111
|
+
fi
|
|
112
|
+
cmd+=(--bare-max "$BARE_MAX")
|
|
113
|
+
cmd+=(--solo-max "$SOLO_MAX")
|
|
114
|
+
cmd+=(--min-bare-headroom "$MIN_BARE_HEADROOM")
|
|
115
|
+
cmd+=(--min-solo-headroom "$MIN_SOLO_HEADROOM")
|
|
116
|
+
cmd+=(--min-fixtures "$MIN_FIXTURES")
|
|
117
|
+
[ "$ALLOW_REJECTED_FIXTURES" -eq 0 ] || cmd+=(--allow-rejected-fixtures)
|
|
118
|
+
[ "$DRY_RUN" -eq 0 ] || cmd+=(--dry-run)
|
|
119
|
+
cmd+=("${FIXTURES[@]}")
|
|
120
|
+
printf 'Command: '
|
|
121
|
+
printf '%q ' "${cmd[@]}"
|
|
122
|
+
printf '\n'
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
fixture_exists() {
|
|
126
|
+
local fid="$1"
|
|
127
|
+
[ -d "$BENCH_ROOT/fixtures/$fid" ] || [ -d "$BENCH_ROOT/shadow-fixtures/$fid" ]
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
fixture_dir() {
|
|
131
|
+
local fid="$1"
|
|
132
|
+
if [ -d "$BENCH_ROOT/fixtures/$fid" ]; then
|
|
133
|
+
printf '%s\n' "$BENCH_ROOT/fixtures/$fid"
|
|
134
|
+
else
|
|
135
|
+
printf '%s\n' "$BENCH_ROOT/shadow-fixtures/$fid"
|
|
136
|
+
fi
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
is_shadow_fixture() {
|
|
140
|
+
local fid="$1"
|
|
141
|
+
[ -d "$BENCH_ROOT/shadow-fixtures/$fid" ]
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
retired_fixture_exists() {
|
|
145
|
+
local fid="$1"
|
|
146
|
+
[ -d "$BENCH_ROOT/fixtures/retired/$fid" ]
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
fixture_smoke_only() {
|
|
150
|
+
local fid="$1"
|
|
151
|
+
[[ "$fid" == S1 || "$fid" == S1-* ]]
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
fixture_category() {
|
|
155
|
+
local dir="$1"
|
|
156
|
+
python3 - "$dir/metadata.json" <<'PY'
|
|
157
|
+
import json
|
|
158
|
+
import sys
|
|
159
|
+
|
|
160
|
+
try:
|
|
161
|
+
with open(sys.argv[1], encoding="utf-8") as handle:
|
|
162
|
+
print(json.load(handle).get("category", ""))
|
|
163
|
+
except FileNotFoundError:
|
|
164
|
+
print("")
|
|
165
|
+
PY
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
fixture_has_solo_headroom_hypothesis() {
|
|
169
|
+
local dir="$1"
|
|
170
|
+
python3 "$BENCH_ROOT/scripts/solo-headroom-hypothesis.py" --expected-json "$dir/expected.json" "$dir/spec.md"
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
fixture_has_solo_ceiling_avoidance_note() {
|
|
174
|
+
local dir="$1"
|
|
175
|
+
python3 "$BENCH_ROOT/scripts/solo-ceiling-avoidance.py" "$dir/NOTES.md"
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
fixture_has_pair_evidence() {
|
|
179
|
+
local fid="$1"
|
|
180
|
+
python3 - "$BENCH_ROOT/results" "$fid" <<'PY'
|
|
181
|
+
import json
|
|
182
|
+
import pathlib
|
|
183
|
+
import sys
|
|
184
|
+
|
|
185
|
+
results = pathlib.Path(sys.argv[1])
|
|
186
|
+
fixture = sys.argv[2]
|
|
187
|
+
if not results.is_dir():
|
|
188
|
+
sys.exit(1)
|
|
189
|
+
for path in results.glob("*/full-pipeline-pair-gate.json"):
|
|
190
|
+
try:
|
|
191
|
+
data = json.loads(path.read_text(encoding="utf-8"))
|
|
192
|
+
except (OSError, json.JSONDecodeError):
|
|
193
|
+
continue
|
|
194
|
+
if data.get("verdict") != "PASS":
|
|
195
|
+
continue
|
|
196
|
+
rows = data.get("rows")
|
|
197
|
+
if not isinstance(rows, list):
|
|
198
|
+
continue
|
|
199
|
+
for row in rows:
|
|
200
|
+
if isinstance(row, dict) and row.get("fixture") == fixture and row.get("status") == "PASS":
|
|
201
|
+
sys.exit(0)
|
|
202
|
+
sys.exit(1)
|
|
203
|
+
PY
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
validate_fixtures() {
|
|
207
|
+
local missing=0
|
|
208
|
+
local fid reason dir category
|
|
209
|
+
for fid in "${FIXTURES[@]}"; do
|
|
210
|
+
if ! fixture_exists "$fid"; then
|
|
211
|
+
if retired_fixture_exists "$fid"; then
|
|
212
|
+
echo "fixture is retired and is not rerun by pair-candidate runners: $fid. Use preserved results/docs for historical replay." >&2
|
|
213
|
+
missing=1
|
|
214
|
+
continue
|
|
215
|
+
fi
|
|
216
|
+
echo "fixture not found in fixtures/ or shadow-fixtures/: $fid" >&2
|
|
217
|
+
missing=1
|
|
218
|
+
continue
|
|
219
|
+
fi
|
|
220
|
+
if [ "$DRY_RUN" -eq 0 ] && fixture_smoke_only "$fid"; then
|
|
221
|
+
echo "fixture is smoke-only and cannot run providers: $fid. Use --dry-run for runner/package validation." >&2
|
|
222
|
+
missing=1
|
|
223
|
+
continue
|
|
224
|
+
fi
|
|
225
|
+
reason="$(rejected_pair_fixture_reason "$fid" || true)"
|
|
226
|
+
if [ "$ALLOW_REJECTED_FIXTURES" -eq 0 ]; then
|
|
227
|
+
if [ -n "$reason" ]; then
|
|
228
|
+
echo "fixture rejected for pair-candidate runs: $fid ($reason). Use --allow-rejected-fixtures for diagnostics only." >&2
|
|
229
|
+
missing=1
|
|
230
|
+
continue
|
|
231
|
+
fi
|
|
232
|
+
fi
|
|
233
|
+
if [ -z "$reason" ]; then
|
|
234
|
+
dir="$(fixture_dir "$fid")"
|
|
235
|
+
category="$(fixture_category "$dir")"
|
|
236
|
+
if [ "$category" = "high-risk" ] && ! fixture_has_pair_evidence "$fid"; then
|
|
237
|
+
if ! fixture_has_solo_headroom_hypothesis "$dir"; then
|
|
238
|
+
echo "fixture spec.md needs a solo-headroom hypothesis with solo_claude miss and observable command from expected.json before provider spend: $fid" >&2
|
|
239
|
+
missing=1
|
|
240
|
+
fi
|
|
241
|
+
if is_shadow_fixture "$fid" && ! fixture_has_solo_ceiling_avoidance_note "$dir"; then
|
|
242
|
+
echo "shadow fixture NOTES.md needs ## Solo ceiling avoidance with solo_claude, a rejected/solo-saturated control comparison, and headroom reasoning before provider spend: $fid" >&2
|
|
243
|
+
missing=1
|
|
244
|
+
fi
|
|
245
|
+
fi
|
|
246
|
+
fi
|
|
247
|
+
done
|
|
248
|
+
[ "$missing" -eq 0 ] || exit 1
|
|
249
|
+
}
|
|
250
|
+
|
|
37
251
|
echo ""
|
|
38
252
|
echo "═══ Headroom Candidate Run ═══"
|
|
39
253
|
echo "Run-id: $RUN_ID"
|
|
40
254
|
echo "Fixtures: ${FIXTURES[*]}"
|
|
41
255
|
echo "Arms: bare solo_claude"
|
|
42
|
-
|
|
43
|
-
|
|
256
|
+
echo "Gate: bare <= $BARE_MAX (headroom >= $MIN_BARE_HEADROOM), solo_claude <= $SOLO_MAX (headroom >= $MIN_SOLO_HEADROOM), baseline evidence-complete, min fixtures $MIN_FIXTURES"
|
|
257
|
+
[ "$DRY_RUN" -eq 0 ] || echo "Mode: DRY RUN (no model/provider invocations)"
|
|
258
|
+
print_command
|
|
259
|
+
if [ ${#FIXTURES[@]} -lt "$MIN_FIXTURES" ]; then
|
|
260
|
+
echo "Gate: will FAIL set gate unless at least $MIN_FIXTURES fixtures are supplied"
|
|
44
261
|
fi
|
|
45
262
|
echo ""
|
|
46
263
|
|
|
264
|
+
validate_fixtures
|
|
265
|
+
|
|
266
|
+
if [ "$DRY_RUN" -eq 1 ] && [ "${#FIXTURES[@]}" -lt "$MIN_FIXTURES" ]; then
|
|
267
|
+
echo "[headroom] DRY RUN failed — ${#FIXTURES[@]} fixture(s) supplied, --min-fixtures requires $MIN_FIXTURES." >&2
|
|
268
|
+
exit 1
|
|
269
|
+
fi
|
|
270
|
+
|
|
271
|
+
if [ "$DRY_RUN" -eq 1 ]; then
|
|
272
|
+
echo "[headroom] DRY RUN complete — fixtures resolved, no arms or judges executed."
|
|
273
|
+
exit 0
|
|
274
|
+
fi
|
|
275
|
+
|
|
47
276
|
SRC_SKILLS="$REPO_ROOT/config/skills"
|
|
48
277
|
DST_SKILLS="$REPO_ROOT/.claude/skills"
|
|
49
278
|
mkdir -p "$DST_SKILLS"
|
|
@@ -84,10 +313,24 @@ echo ""
|
|
|
84
313
|
set +e
|
|
85
314
|
python3 "$BENCH_ROOT/scripts/headroom-gate.py" \
|
|
86
315
|
--run-id "$RUN_ID" \
|
|
316
|
+
--bare-max "$BARE_MAX" \
|
|
317
|
+
--solo-max "$SOLO_MAX" \
|
|
318
|
+
--min-bare-headroom "$MIN_BARE_HEADROOM" \
|
|
319
|
+
--min-solo-headroom "$MIN_SOLO_HEADROOM" \
|
|
320
|
+
--min-fixtures "$MIN_FIXTURES" \
|
|
87
321
|
--out-json "$BENCH_ROOT/results/$RUN_ID/headroom-gate.json" \
|
|
88
322
|
--out-md "$BENCH_ROOT/results/$RUN_ID/headroom-gate.md"
|
|
89
323
|
GATE_EXIT=$?
|
|
90
324
|
set -e
|
|
91
325
|
|
|
92
|
-
|
|
326
|
+
if [ -f "$BENCH_ROOT/results/$RUN_ID/headroom-gate.md" ]; then
|
|
327
|
+
cat "$BENCH_ROOT/results/$RUN_ID/headroom-gate.md"
|
|
328
|
+
else
|
|
329
|
+
echo "[headroom] headroom gate report missing: $BENCH_ROOT/results/$RUN_ID/headroom-gate.md" >&2
|
|
330
|
+
fi
|
|
331
|
+
if [ "$GATE_EXIT" -eq 0 ]; then
|
|
332
|
+
echo "[headroom] headroom gate passed — candidate set accepted."
|
|
333
|
+
else
|
|
334
|
+
echo "[headroom] headroom gate failed — candidate set rejected."
|
|
335
|
+
fi
|
|
93
336
|
exit "$GATE_EXIT"
|
|
@@ -96,16 +96,29 @@ echo "[run-iter-0033c] RUN_ID=$RUN_ID"
|
|
|
96
96
|
echo "[run-iter-0033c] RESULTS_DIR=$RESULTS_DIR"
|
|
97
97
|
|
|
98
98
|
# --- Determine pair-eligible set from manifest input bundle ---
|
|
99
|
-
#
|
|
100
|
-
#
|
|
101
|
-
#
|
|
99
|
+
# Pair eligibility is pre-registered from C1/F9 before any iter-0033c arms run.
|
|
100
|
+
# The later L1 rerun summary is archived into the final manifest for provenance;
|
|
101
|
+
# it must not change the arm-selection set after execution has begun.
|
|
102
102
|
DRAFT_MANIFEST="$RESULTS_DIR/manifest-draft.json"
|
|
103
103
|
python3 benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py \
|
|
104
104
|
--c1-summary "$C1_SUMMARY" \
|
|
105
105
|
--f9-judge "$F9_JUDGE" \
|
|
106
106
|
--l1-rerun-summary "$C1_SUMMARY" \
|
|
107
107
|
--output "$DRAFT_MANIFEST"
|
|
108
|
-
PAIR_ELIGIBLE=$(python3 -
|
|
108
|
+
PAIR_ELIGIBLE=$(python3 - "$DRAFT_MANIFEST" "$REPO_ROOT/benchmark/auto-resolve/scripts" <<'PY'
|
|
109
|
+
import pathlib
|
|
110
|
+
import sys
|
|
111
|
+
|
|
112
|
+
sys.path.insert(0, sys.argv[2])
|
|
113
|
+
from pair_evidence_contract import loads_strict_json_object
|
|
114
|
+
|
|
115
|
+
manifest = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())
|
|
116
|
+
fixtures = manifest.get("fixtures_pair_eligible")
|
|
117
|
+
if not isinstance(fixtures, list) or not all(isinstance(item, str) for item in fixtures):
|
|
118
|
+
raise SystemExit("manifest fixtures_pair_eligible must be a string array")
|
|
119
|
+
print(" ".join(fixtures))
|
|
120
|
+
PY
|
|
121
|
+
)
|
|
109
122
|
echo "[run-iter-0033c] pair-eligible: $PAIR_ELIGIBLE"
|
|
110
123
|
|
|
111
124
|
# --- Per-fixture interleaved arm loop ---
|
|
@@ -161,50 +174,11 @@ done
|
|
|
161
174
|
|
|
162
175
|
# --- Build L1 rerun summary from solo_claude arm result.json + judge.json ---
|
|
163
176
|
L1_RERUN_SUMMARY="$RESULTS_DIR/l1-rerun-summary.json"
|
|
164
|
-
python3 -
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
run_id = sys.argv[3]
|
|
170
|
-
head_sha = sys.argv[4]
|
|
171
|
-
rows = []
|
|
172
|
-
for fx_dir in sorted(results_dir.iterdir()):
|
|
173
|
-
if not fx_dir.is_dir():
|
|
174
|
-
continue
|
|
175
|
-
judge_p = fx_dir / "judge.json"
|
|
176
|
-
if not judge_p.is_file():
|
|
177
|
-
continue
|
|
178
|
-
judge = json.loads(judge_p.read_text())
|
|
179
|
-
mapping = judge.get("_blind_mapping") or {}
|
|
180
|
-
inv = {v: k for k, v in mapping.items()}
|
|
181
|
-
arms = {}
|
|
182
|
-
for arm_name in ("solo_claude", "l2_gated", "l2_forced", "bare"):
|
|
183
|
-
letter = inv.get(arm_name)
|
|
184
|
-
if not letter:
|
|
185
|
-
continue
|
|
186
|
-
arm_dir = fx_dir / arm_name
|
|
187
|
-
result = {}
|
|
188
|
-
if (arm_dir / "result.json").is_file():
|
|
189
|
-
result = json.loads((arm_dir / "result.json").read_text())
|
|
190
|
-
arms[arm_name] = {
|
|
191
|
-
"score": judge.get(f"{letter}_score"),
|
|
192
|
-
"wall_s": result.get("elapsed_seconds"),
|
|
193
|
-
"verify_score": result.get("verify_score"),
|
|
194
|
-
"files_changed": result.get("files_changed"),
|
|
195
|
-
"timed_out": result.get("timed_out"),
|
|
196
|
-
"disqualifier": result.get("disqualifier"),
|
|
197
|
-
}
|
|
198
|
-
rows.append({"fixture": fx_dir.name, "arms": arms})
|
|
199
|
-
out = {
|
|
200
|
-
"run_id": run_id,
|
|
201
|
-
"git_sha": head_sha,
|
|
202
|
-
"fixtures_total": len(rows),
|
|
203
|
-
"rows": rows,
|
|
204
|
-
}
|
|
205
|
-
out_path.write_text(json.dumps(out, indent=2) + "\n")
|
|
206
|
-
print(f"[l1-rerun-summary] wrote {out_path} (fixtures={len(rows)})")
|
|
207
|
-
PY
|
|
177
|
+
python3 benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py \
|
|
178
|
+
--results-dir "$RESULTS_DIR" \
|
|
179
|
+
--out "$L1_RERUN_SUMMARY" \
|
|
180
|
+
--run-id "$RUN_ID" \
|
|
181
|
+
--git-sha "$HEAD_SHA"
|
|
208
182
|
|
|
209
183
|
# --- Build final manifest with real L1 rerun summary ---
|
|
210
184
|
FINAL_MANIFEST="$RESULTS_DIR/iter-0033c-pair-eligible.json"
|
|
@@ -6,13 +6,13 @@
|
|
|
6
6
|
#
|
|
7
7
|
# Usage:
|
|
8
8
|
# run-suite.sh # all fixtures, n=1 smoke
|
|
9
|
-
# run-suite.sh --n 3 # 3 runs per fixture for ship decisions
|
|
10
9
|
# run-suite.sh F2 F5 # specific fixtures only
|
|
11
10
|
# run-suite.sh --dry-run # skip model invocations, validate setup
|
|
12
11
|
# run-suite.sh --judge-only --run-id X # re-judge an existing run
|
|
13
12
|
# run-suite.sh --label v3.6 # tag this run
|
|
14
13
|
# run-suite.sh --bless # if ship-gate PASS, promote to baselines/shipped.json
|
|
15
14
|
# run-suite.sh --resolve-skill new # invoke /devlyn:resolve --spec (the only supported value post iter-0034 cutover; flag kept as accepted no-op for historical runners)
|
|
15
|
+
# run-suite.sh --suite shadow --dry-run # list shadow tasks; shadow suite refuses provider/judge runs
|
|
16
16
|
#
|
|
17
17
|
# Exits 0 on PASS, 1 on FAIL.
|
|
18
18
|
|
|
@@ -32,17 +32,26 @@ SUITE="golden"
|
|
|
32
32
|
RESOLVE_SKILL="new"
|
|
33
33
|
FIXTURES=()
|
|
34
34
|
|
|
35
|
+
require_value() {
|
|
36
|
+
local flag="$1"
|
|
37
|
+
local value="${2:-}"
|
|
38
|
+
if [ -z "$value" ] || [[ "$value" == --* ]]; then
|
|
39
|
+
echo "$flag requires a value" >&2
|
|
40
|
+
exit 1
|
|
41
|
+
fi
|
|
42
|
+
}
|
|
43
|
+
|
|
35
44
|
while [ $# -gt 0 ]; do
|
|
36
45
|
case "$1" in
|
|
37
|
-
--n) N="$2"; shift 2;;
|
|
38
|
-
--label) LABEL="$2"; shift 2;;
|
|
46
|
+
--n) require_value "$1" "${2:-}"; N="$2"; shift 2;;
|
|
47
|
+
--label) require_value "$1" "${2:-}"; LABEL="$2"; shift 2;;
|
|
39
48
|
--dry-run) DRY_RUN=1; shift;;
|
|
40
49
|
--judge-only) JUDGE_ONLY=1; shift;;
|
|
41
|
-
--run-id) RUN_ID_ARG="$2"; shift 2;;
|
|
50
|
+
--run-id) require_value "$1" "${2:-}"; RUN_ID_ARG="$2"; shift 2;;
|
|
42
51
|
--bless) BLESS=1; shift;;
|
|
43
52
|
--accept-missing) ACCEPT_MISSING=1; shift;;
|
|
44
|
-
--suite) SUITE="$2"; shift 2;;
|
|
45
|
-
--resolve-skill) RESOLVE_SKILL="$2"; shift 2;;
|
|
53
|
+
--suite) require_value "$1" "${2:-}"; SUITE="$2"; shift 2;;
|
|
54
|
+
--resolve-skill) require_value "$1" "${2:-}"; RESOLVE_SKILL="$2"; shift 2;;
|
|
46
55
|
-h|--help)
|
|
47
56
|
head -22 "$0" | sed -n '3,22p'; exit 0;;
|
|
48
57
|
[FS][0-9]*) FIXTURES+=("$1"); shift;;
|
|
@@ -69,8 +78,15 @@ case "$SUITE" in
|
|
|
69
78
|
*) echo "error: --suite must be 'golden' or 'shadow' (got '$SUITE')" >&2; exit 1;;
|
|
70
79
|
esac
|
|
71
80
|
|
|
81
|
+
if [ "$SUITE" = "shadow" ] && [ "$DRY_RUN" -eq 0 ]; then
|
|
82
|
+
echo "shadow suite run-suite is dry-run only. Use benchmark headroom/pair with explicit S* candidates for real provider measurement." >&2
|
|
83
|
+
exit 1
|
|
84
|
+
fi
|
|
85
|
+
|
|
72
86
|
# n must be 1 while iteration semantics aren't wired through judge/report.
|
|
73
87
|
# Remove this block when compile-report.py gains multi-iter aggregation.
|
|
88
|
+
case "$N" in ''|*[!0-9]*) echo "error: --n must be an integer" >&2; exit 1;; esac
|
|
89
|
+
[ "$N" -gt 0 ] || { echo "error: --n must be > 0" >&2; exit 1; }
|
|
74
90
|
if [ "$N" -ne 1 ]; then
|
|
75
91
|
echo "error: --n $N not yet supported — judge/report currently expect a single iteration per fixture." >&2
|
|
76
92
|
echo " Track progress in benchmark/auto-resolve/BENCHMARK-DESIGN.md (#multi-iter-roadmap)." >&2
|
|
@@ -101,6 +117,22 @@ fi
|
|
|
101
117
|
RES_DIR="$BENCH_ROOT/results/$RUN_ID"
|
|
102
118
|
mkdir -p "$RES_DIR"
|
|
103
119
|
|
|
120
|
+
print_command() {
|
|
121
|
+
local cmd=(bash "$0" --n "$N" --suite "$SUITE" --resolve-skill "$RESOLVE_SKILL")
|
|
122
|
+
[ -z "$LABEL" ] || cmd+=(--label "$LABEL")
|
|
123
|
+
cmd+=(--run-id "$RUN_ID")
|
|
124
|
+
[ $DRY_RUN -eq 0 ] || cmd+=(--dry-run)
|
|
125
|
+
[ $JUDGE_ONLY -eq 0 ] || cmd+=(--judge-only)
|
|
126
|
+
[ $BLESS -eq 0 ] || cmd+=(--bless)
|
|
127
|
+
[ $ACCEPT_MISSING -eq 0 ] || cmd+=(--accept-missing)
|
|
128
|
+
if [ ${#FIXTURES[@]} -gt 0 ]; then
|
|
129
|
+
cmd+=("${FIXTURES[@]}")
|
|
130
|
+
fi
|
|
131
|
+
printf 'Command: '
|
|
132
|
+
printf '%q ' "${cmd[@]}"
|
|
133
|
+
printf '\n'
|
|
134
|
+
}
|
|
135
|
+
|
|
104
136
|
echo ""
|
|
105
137
|
echo "═══ Benchmark Suite Run ═══"
|
|
106
138
|
echo "Run-id: $RUN_ID"
|
|
@@ -111,6 +143,7 @@ echo "n: $N"
|
|
|
111
143
|
echo "Resolve skill: $RESOLVE_SKILL"
|
|
112
144
|
[ $DRY_RUN -eq 1 ] && echo "Mode: DRY RUN (no model invocations)"
|
|
113
145
|
[ $JUDGE_ONLY -eq 1 ] && echo "Mode: JUDGE ONLY (re-judging existing artifacts)"
|
|
146
|
+
print_command
|
|
114
147
|
echo ""
|
|
115
148
|
|
|
116
149
|
# ---- Mirror committed skills into .claude/skills (iter-0017) --------------
|
|
@@ -201,7 +234,11 @@ done
|
|
|
201
234
|
if [ $DRY_RUN -eq 1 ]; then
|
|
202
235
|
echo ""
|
|
203
236
|
echo "[suite] DRY RUN complete — results in $RES_DIR"
|
|
204
|
-
|
|
237
|
+
if [ "$SUITE" = "shadow" ]; then
|
|
238
|
+
echo "Use benchmark headroom/pair with explicit S* candidates for real provider measurement."
|
|
239
|
+
else
|
|
240
|
+
echo "Run without --dry-run to invoke models."
|
|
241
|
+
fi
|
|
205
242
|
exit 0
|
|
206
243
|
fi
|
|
207
244
|
|
|
@@ -19,6 +19,15 @@ EOF
|
|
|
19
19
|
exit "${1:-1}"
|
|
20
20
|
}
|
|
21
21
|
|
|
22
|
+
require_value() {
|
|
23
|
+
local flag="$1"
|
|
24
|
+
local value="${2:-}"
|
|
25
|
+
if [ -z "$value" ] || [[ "$value" == --* ]]; then
|
|
26
|
+
echo "$flag requires a value" >&2
|
|
27
|
+
exit 1
|
|
28
|
+
fi
|
|
29
|
+
}
|
|
30
|
+
|
|
22
31
|
MANIFEST=""
|
|
23
32
|
RUN_PREFIX=""
|
|
24
33
|
PAIR_MODE="gated"
|
|
@@ -33,18 +42,18 @@ RUN_IDS_OUT=""
|
|
|
33
42
|
RESUME_COMPLETED_ARMS=0
|
|
34
43
|
while [ $# -gt 0 ]; do
|
|
35
44
|
case "$1" in
|
|
36
|
-
--manifest) MANIFEST="$2"; shift 2;;
|
|
37
|
-
--run-prefix) RUN_PREFIX="$2"; shift 2;;
|
|
38
|
-
--pair-mode) PAIR_MODE="$2"; shift 2;;
|
|
39
|
-
--min-runs) MIN_RUNS="$2"; shift 2;;
|
|
40
|
-
--out-json) OUT_JSON="$2"; shift 2;;
|
|
41
|
-
--out-md) OUT_MD="$2"; shift 2;;
|
|
42
|
-
--max-pair-solo-wall-ratio) MAX_PAIR_SOLO_WALL_RATIO="$2"; shift 2;;
|
|
43
|
-
--timeout-seconds) TIMEOUT_SECONDS="$2"; shift 2;;
|
|
44
|
-
--run-ids-out) RUN_IDS_OUT="$2"; shift 2;;
|
|
45
|
+
--manifest) require_value "$1" "${2:-}"; MANIFEST="$2"; shift 2;;
|
|
46
|
+
--run-prefix) require_value "$1" "${2:-}"; RUN_PREFIX="$2"; shift 2;;
|
|
47
|
+
--pair-mode) require_value "$1" "${2:-}"; PAIR_MODE="$2"; shift 2;;
|
|
48
|
+
--min-runs) require_value "$1" "${2:-}"; MIN_RUNS="$2"; shift 2;;
|
|
49
|
+
--out-json) require_value "$1" "${2:-}"; OUT_JSON="$2"; shift 2;;
|
|
50
|
+
--out-md) require_value "$1" "${2:-}"; OUT_MD="$2"; shift 2;;
|
|
51
|
+
--max-pair-solo-wall-ratio) require_value "$1" "${2:-}"; MAX_PAIR_SOLO_WALL_RATIO="$2"; shift 2;;
|
|
52
|
+
--timeout-seconds) require_value "$1" "${2:-}"; TIMEOUT_SECONDS="$2"; shift 2;;
|
|
53
|
+
--run-ids-out) require_value "$1" "${2:-}"; RUN_IDS_OUT="$2"; shift 2;;
|
|
45
54
|
--resume-completed-arms) RESUME_COMPLETED_ARMS=1; shift;;
|
|
46
55
|
--prepare-only) PREPARE_ONLY=1; shift;;
|
|
47
|
-
--gate-only-run-ids) GATE_ONLY_RUN_IDS="$2"; shift 2;;
|
|
56
|
+
--gate-only-run-ids) require_value "$1" "${2:-}"; GATE_ONLY_RUN_IDS="$2"; shift 2;;
|
|
48
57
|
-h|--help) usage 0;;
|
|
49
58
|
*) echo "unknown arg: $1" >&2; usage 1;;
|
|
50
59
|
esac
|
|
@@ -79,12 +88,91 @@ if [ -z "$RUN_PREFIX" ]; then
|
|
|
79
88
|
RUN_PREFIX="$(date -u +%Y%m%dT%H%M%SZ)-swebench-frozen"
|
|
80
89
|
fi
|
|
81
90
|
|
|
91
|
+
print_command() {
|
|
92
|
+
local cmd=(bash "$0" --manifest "$MANIFEST" --run-prefix "$RUN_PREFIX")
|
|
93
|
+
cmd+=(--pair-mode "$PAIR_MODE")
|
|
94
|
+
cmd+=(--min-runs "$MIN_RUNS")
|
|
95
|
+
[ -z "$OUT_JSON" ] || cmd+=(--out-json "$OUT_JSON")
|
|
96
|
+
[ -z "$OUT_MD" ] || cmd+=(--out-md "$OUT_MD")
|
|
97
|
+
[ -z "$MAX_PAIR_SOLO_WALL_RATIO" ] || cmd+=(--max-pair-solo-wall-ratio "$MAX_PAIR_SOLO_WALL_RATIO")
|
|
98
|
+
[ -z "$TIMEOUT_SECONDS" ] || cmd+=(--timeout-seconds "$TIMEOUT_SECONDS")
|
|
99
|
+
[ -z "$RUN_IDS_OUT" ] || cmd+=(--run-ids-out "$RUN_IDS_OUT")
|
|
100
|
+
[ "$RESUME_COMPLETED_ARMS" -eq 0 ] || cmd+=(--resume-completed-arms)
|
|
101
|
+
[ "$PREPARE_ONLY" -eq 0 ] || cmd+=(--prepare-only)
|
|
102
|
+
[ -z "$GATE_ONLY_RUN_IDS" ] || cmd+=(--gate-only-run-ids "$GATE_ONLY_RUN_IDS")
|
|
103
|
+
printf 'Command: '
|
|
104
|
+
printf '%q ' "${cmd[@]}"
|
|
105
|
+
printf '\n'
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
echo ""
|
|
109
|
+
echo "═══ SWE-bench Frozen VERIFY Corpus Run ═══"
|
|
110
|
+
echo "Run-prefix: $RUN_PREFIX"
|
|
111
|
+
echo "Pair mode: $PAIR_MODE"
|
|
112
|
+
echo "Min runs: $MIN_RUNS"
|
|
113
|
+
[ -z "$MAX_PAIR_SOLO_WALL_RATIO" ] || echo "Wall cap: pair/solo <= ${MAX_PAIR_SOLO_WALL_RATIO}x"
|
|
114
|
+
print_command
|
|
115
|
+
echo ""
|
|
116
|
+
|
|
82
117
|
TMP_RUN_IDS="$(mktemp)"
|
|
83
118
|
trap 'rm -f "$TMP_RUN_IDS"' EXIT
|
|
84
119
|
ROW_FAILURES=0
|
|
85
120
|
|
|
121
|
+
python3 - "$MANIFEST" "$GATE_ONLY_RUN_IDS" "$SCRIPT_DIR" <<'PY'
|
|
122
|
+
import pathlib
|
|
123
|
+
import sys
|
|
124
|
+
|
|
125
|
+
sys.path.insert(0, sys.argv[3])
|
|
126
|
+
from pair_evidence_contract import loads_strict_json_object
|
|
127
|
+
|
|
128
|
+
manifest_path = pathlib.Path(sys.argv[1])
|
|
129
|
+
gate_only_run_ids = sys.argv[2]
|
|
130
|
+
try:
|
|
131
|
+
manifest = loads_strict_json_object(manifest_path.read_text())
|
|
132
|
+
except ValueError as exc:
|
|
133
|
+
if str(exc) == "top-level JSON value must be an object":
|
|
134
|
+
raise SystemExit("manifest malformed: expected JSON object") from exc
|
|
135
|
+
raise
|
|
136
|
+
if not isinstance(manifest, dict):
|
|
137
|
+
raise SystemExit("manifest malformed: expected JSON object")
|
|
138
|
+
cases_root = manifest.get("cases_root")
|
|
139
|
+
if not isinstance(cases_root, str) or not cases_root.strip():
|
|
140
|
+
raise SystemExit("manifest malformed: missing non-empty cases_root")
|
|
141
|
+
if gate_only_run_ids:
|
|
142
|
+
raise SystemExit(0)
|
|
143
|
+
prepared = manifest.get("prepared")
|
|
144
|
+
if not isinstance(prepared, list) or not prepared:
|
|
145
|
+
raise SystemExit("manifest malformed: prepared must be a non-empty array")
|
|
146
|
+
for index, row in enumerate(prepared, start=1):
|
|
147
|
+
if not isinstance(row, dict):
|
|
148
|
+
raise SystemExit(f"manifest malformed: prepared[{index}] expected JSON object")
|
|
149
|
+
for key in ("instance_id", "case_dir", "repo_dir"):
|
|
150
|
+
value = row.get(key)
|
|
151
|
+
if not isinstance(value, str) or not value.strip():
|
|
152
|
+
raise SystemExit(f"manifest malformed: prepared[{index}] missing non-empty {key}")
|
|
153
|
+
PY
|
|
154
|
+
|
|
86
155
|
if [ -n "$GATE_ONLY_RUN_IDS" ]; then
|
|
87
|
-
|
|
156
|
+
python3 - "$GATE_ONLY_RUN_IDS" "$TMP_RUN_IDS" <<'PY'
|
|
157
|
+
import pathlib
|
|
158
|
+
import re
|
|
159
|
+
import sys
|
|
160
|
+
|
|
161
|
+
source = pathlib.Path(sys.argv[1])
|
|
162
|
+
dest = pathlib.Path(sys.argv[2])
|
|
163
|
+
safe = re.compile(r"^[A-Za-z0-9_.-]+$")
|
|
164
|
+
run_ids: list[str] = []
|
|
165
|
+
for line_no, line in enumerate(source.read_text(encoding="utf8").splitlines(), start=1):
|
|
166
|
+
run_id = line.strip()
|
|
167
|
+
if not run_id:
|
|
168
|
+
raise SystemExit(f"run ids malformed: line {line_no} is empty")
|
|
169
|
+
if not safe.match(run_id):
|
|
170
|
+
raise SystemExit(f"run ids malformed: line {line_no} has unsafe run id")
|
|
171
|
+
run_ids.append(run_id)
|
|
172
|
+
if not run_ids:
|
|
173
|
+
raise SystemExit("run ids malformed: no run ids")
|
|
174
|
+
dest.write_text("\n".join(run_ids) + "\n", encoding="utf8")
|
|
175
|
+
PY
|
|
88
176
|
else
|
|
89
177
|
while IFS=$'\t' read -r index instance_id cases_root repo_dir diff_path; do
|
|
90
178
|
[ -n "$instance_id" ] || continue
|
|
@@ -157,10 +245,14 @@ if not compare_path.exists():
|
|
|
157
245
|
PY
|
|
158
246
|
fi
|
|
159
247
|
printf '%s\n' "$safe_run_id" >> "$TMP_RUN_IDS"
|
|
160
|
-
done < <(python3 - "$MANIFEST" <<'PY'
|
|
161
|
-
import
|
|
162
|
-
|
|
163
|
-
|
|
248
|
+
done < <(python3 - "$MANIFEST" "$SCRIPT_DIR" <<'PY'
|
|
249
|
+
import pathlib, sys
|
|
250
|
+
|
|
251
|
+
sys.path.insert(0, sys.argv[2])
|
|
252
|
+
from pair_evidence_contract import loads_strict_json_object
|
|
253
|
+
|
|
254
|
+
manifest = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())
|
|
255
|
+
for index, row in enumerate(manifest["prepared"], start=1):
|
|
164
256
|
instance_id = row["instance_id"]
|
|
165
257
|
case_dir = pathlib.Path(row["case_dir"])
|
|
166
258
|
repo_dir = pathlib.Path(row["repo_dir"])
|
|
@@ -192,13 +284,22 @@ fi
|
|
|
192
284
|
run_count="$(wc -l < "$TMP_RUN_IDS" | tr -d ' ')"
|
|
193
285
|
[ "$run_count" -gt 0 ] || { echo "manifest prepared no runs" >&2; exit 1; }
|
|
194
286
|
|
|
195
|
-
fixtures_root="$(python3 - "$MANIFEST" <<'PY'
|
|
196
|
-
import
|
|
197
|
-
|
|
287
|
+
fixtures_root="$(python3 - "$MANIFEST" "$SCRIPT_DIR" <<'PY'
|
|
288
|
+
import pathlib, sys
|
|
289
|
+
|
|
290
|
+
sys.path.insert(0, sys.argv[2])
|
|
291
|
+
from pair_evidence_contract import loads_strict_json_object
|
|
292
|
+
|
|
293
|
+
manifest = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())
|
|
198
294
|
print(manifest["cases_root"])
|
|
199
295
|
PY
|
|
200
296
|
)"
|
|
201
|
-
gate_args=(
|
|
297
|
+
gate_args=(
|
|
298
|
+
python3 "$SCRIPT_DIR/frozen-verify-gate.py"
|
|
299
|
+
--fixtures-root "$fixtures_root"
|
|
300
|
+
--min-runs "$MIN_RUNS"
|
|
301
|
+
--require-hypothesis-trigger
|
|
302
|
+
)
|
|
202
303
|
[ -z "$OUT_JSON" ] || gate_args+=(--out-json "$OUT_JSON")
|
|
203
304
|
[ -z "$OUT_MD" ] || gate_args+=(--out-md "$OUT_MD")
|
|
204
305
|
[ -z "$MAX_PAIR_SOLO_WALL_RATIO" ] || gate_args+=(--max-pair-solo-wall-ratio "$MAX_PAIR_SOLO_WALL_RATIO")
|