devlyn-cli 2.3.0 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +1 -1
- package/CLAUDE.md +2 -2
- package/README.md +82 -29
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
- package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
- package/benchmark/auto-resolve/README.md +307 -44
- package/benchmark/auto-resolve/RUBRIC.md +23 -14
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
- package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
- package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
- package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
- package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
- package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
- package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
- package/benchmark/auto-resolve/scripts/judge.sh +153 -26
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
- package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
- package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
- package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
- package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
- package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
- package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
- package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
- package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
- package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
- package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
- package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
- package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
- package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
- package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
- package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
- package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
- package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
- package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
- package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
- package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
- package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
- package/bin/devlyn.js +211 -18
- package/config/skills/_shared/adapters/README.md +3 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
- package/config/skills/_shared/adapters/opus-4-7.md +9 -1
- package/config/skills/_shared/archive_run.py +78 -6
- package/config/skills/_shared/codex-config.md +3 -2
- package/config/skills/_shared/codex-monitored.sh +46 -1
- package/config/skills/_shared/collect-codex-findings.py +20 -5
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/runtime-principles.md +5 -8
- package/config/skills/_shared/spec-verify-check.py +2664 -107
- package/config/skills/_shared/verify-merge-findings.py +1369 -19
- package/config/skills/devlyn:ideate/SKILL.md +7 -4
- package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
- package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
- package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
- package/config/skills/devlyn:resolve/SKILL.md +49 -18
- package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
- package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
- package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
- package/package.json +47 -2
- package/scripts/lint-fixtures.sh +349 -0
- package/scripts/lint-shadow-fixtures.sh +58 -0
- package/scripts/lint-skills.sh +3642 -92
- /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Shared rejected/ceiling pair-candidate fixture registry.
|
|
3
|
+
# Includes active fixtures and calibrated shadow controls that should not spend
|
|
4
|
+
# pair-candidate runs unless explicitly requested for diagnostics.
|
|
5
|
+
|
|
6
|
+
# Print why fixture "$1" is excluded from pair-candidate runs.
#
# Accepts either the bare fixture id (e.g. "F2") or the id followed by a
# "-slug" suffix (e.g. "F2-cli-medium-subcommand").
# Prints the recorded reason on stdout and returns 0 when the fixture is in
# the registry; prints nothing and returns 1 otherwise.
rejected_pair_fixture_reason() {
  local fixture_id="$1"
  local reason

  case "$fixture_id" in
    F1|F1-*)   reason="trivial calibration fixture; every arm is expected to one-shot it" ;;
    F2|F2-*)   reason="bare 83 / solo_claude 95 in 20260512-f2-medium-headroom" ;;
    F3|F3-*)   reason="bare 97 / solo_claude 99 in 20260511-f3-http-error-headroom" ;;
    F4|F4-*)   reason="bare 70 / solo_claude 92 with bare disqualifier in 20260512-f4-web-headroom" ;;
    F5|F5-*)   reason="bare 99 / solo_claude 99 in 20260512-f5-fixloop-headroom" ;;
    F6|F6-*)   reason="bare 97 / solo_claude 96 in 20260512-f6-checksum-headroom" ;;
    F7|F7-*)   reason="bare 99 / solo_claude 100 in 20260512-f7-scope-headroom" ;;
    F8|F8-*)   reason="known-limit ambiguity fixture; expected margin is [-3,+3], not pair-lift evidence" ;;
    F9|F9-*)   reason="bare 60 / solo_claude 90 with bare headroom 0 and bare judge disqualifier in 20260512-f9-e2e-headroom" ;;
    F10|F10-*) reason="bare 75 / solo_claude 94 in 20260507-f10-f11-tier1-full-pipeline" ;;
    F11|F11-*) reason="bare 98 / solo_claude 97 in 20260507-f10-f11-tier1-full-pipeline" ;;
    F12|F12-*) reason="bare 85 / solo_claude 99 in 20260511-f12-webhook-headroom" ;;
    F15|F15-*) reason="bare 99 / solo_claude 94 in 20260511-f15-concurrency-headroom" ;;
    F22|F22-*) reason="bare 94 / solo_claude 98 in 20260508-f22-exact-error-headroom" ;;
    F26|F26-*) reason="solo_claude scored 98 in 20260508-f26-headroom" ;;
    F27|F27-*) reason="solo_claude scored 94 in 20260511-f27-headroom-smoke-061401" ;;
    F28|F28-*) reason="corrected-oracle reverify scored solo_claude 98 in 20260511-f28-policy-oraclefix-reverified-pair" ;;
    F29|F29-*) reason="corrected headroom scored solo_claude 92 in 20260510-f29-headroom-v2" ;;
    F30|F30-*) reason="solo_claude scored 98 in 20260511-f30-headroom-v1" ;;
    F31|F31-*) reason="solo_claude scored 98 with bare disqualifiers in 20260512-f31-seat-rebalance-headroom" ;;
    F32|F32-*) reason="bare 33 / solo_claude 98 in 20260512-f32-subscription-renewal-headroom" ;;
    S2|S2-*)   reason="bare 33 / solo_claude 99 with solo timeout in 20260513-s2-inventory-headroom" ;;
    S3|S3-*)   reason="bare 33 / solo_claude 99 with solo timeout in 20260513-s3-ticket-headroom" ;;
    S4|S4-*)   reason="bare 33 / solo_claude 98 with solo timeout in 20260513-s4-return-headroom" ;;
    S5|S5-*)   reason="bare 33 / solo_claude 98 with solo timeout in 20260513-s5-credit-headroom" ;;
    S6|S6-*)   reason="bare 33 / solo_claude 98 with solo timeout in 20260514-s6-refund-headroom-v1" ;;
    *)
      # Not in the registry: signal "no rejection recorded" to the caller.
      return 1
      ;;
  esac

  printf '%s\n' "$reason"
}
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
"""Shared pair-evidence contract for benchmark audits."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import math
|
|
6
|
+
import re
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Pair arms accepted by normalize_pair_evidence_row.
ALLOWED_PAIR_ARMS = {"l2_gated", "l2_risk_probes"}

# Trigger reasons that count as canonical; matched verbatim.
CANONICAL_PAIR_TRIGGER_REASONS = {
    # mode
    "mode.pair-verify", "mode.verify-only",
    # complexity
    "complexity.high", "complexity.large",
    "spec.complexity.high", "spec.complexity.large",
    # spec hypothesis
    "spec.solo_headroom_hypothesis",
    # risk
    "risk.high", "risk_probes.enabled", "risk_probes.present",
    # coverage / warnings
    "coverage.failed", "judge.warning", "mechanical.warning",
}

# Historical spellings matched verbatim.
HISTORICAL_PAIR_TRIGGER_REASON_ALIASES = {"risk_probes_enabled", "risk_profile.high_risk"}

# Historical spellings compared against the output of
# normalized_pair_trigger_reason (lower-cased, dot-separated).
HISTORICAL_NORMALIZED_PAIR_TRIGGER_REASON_ALIASES = {
    "complexity.high.spec.frontmatter",
    "frontmatter.complexity.high",
    "spec.frontmatter.complexity.high",
    "state.complexity.high",
    "high.complexity.spec",
    "high.risk.profile",
}

# Benchmark readers accept historical aliases only for archived artifacts.
# Runtime /devlyn:resolve state must continue to emit canonical reasons.
KNOWN_PAIR_TRIGGER_REASONS = CANONICAL_PAIR_TRIGGER_REASONS.union(
    HISTORICAL_PAIR_TRIGGER_REASON_ALIASES
)

# A line must contain one of these substrings (and "miss") before its
# backticked spans are considered by actionable_observable_commands.
OBSERVABLE_COMMAND_MARKERS = ("command", "observable", "expose")

# One inline code span: backtick, one or more non-backtick/non-newline chars, backtick.
BACKTICKED_TEXT_RE = re.compile(r"`[^`\n]+`")

# Backticked terms that are never treated as commands.
RESERVED_BACKTICK_TERMS = {"miss", "solo_claude", "solo-headroom hypothesis"}

# First words that mark a backticked span as a command.
COMMAND_PREFIXES = {
    "bash", "sh", "printf", "make", "git",
    "node", "npm", "pnpm", "yarn", "bun", "jest", "vitest",
    "python", "python3", "pytest", "ruff", "uv",
    "cargo", "go",
}
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def reject_json_constant(token: str) -> None:
    """Refuse a non-standard JSON numeric constant.

    Intended as the ``parse_constant`` hook of ``json.loads``, which invokes
    it for the literals ``NaN``, ``Infinity`` and ``-Infinity``.

    Raises:
        ValueError: always; the message names the offending token.
    """
    message = f"invalid JSON numeric constant: {token}"
    raise ValueError(message)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def loads_strict_json_object(text: str) -> dict[str, Any]:
    """Parse *text* as JSON and return it only when it is a JSON object.

    ``NaN``/``Infinity`` literals are rejected through ``reject_json_constant``.

    Raises:
        ValueError: if the text is not valid JSON, contains a rejected
            numeric constant, or its top-level value is not an object.
    """
    parsed = json.loads(text, parse_constant=reject_json_constant)
    if isinstance(parsed, dict):
        return parsed
    raise ValueError("top-level JSON value must be an object")
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def normalized_pair_trigger_reason(reason: str) -> str:
    """Return *reason* lower-cased with separators collapsed to single dots.

    Every run of characters outside ``a-z0-9`` becomes one ``.``; leading and
    trailing dots are removed.
    """
    lowered = reason.lower()
    dotted = re.sub(r"[^a-z0-9]+", ".", lowered)
    return dotted.strip(".")
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def is_known_pair_trigger_reason(reason: str) -> bool:
    """Return True if *reason* is canonical or a recognised historical alias.

    Canonical reasons and plain historical aliases are matched verbatim; the
    normalized historical aliases are matched against the normalized form of
    *reason*.
    """
    normalized = normalized_pair_trigger_reason(reason)
    if reason in CANONICAL_PAIR_TRIGGER_REASONS:
        return True
    if reason in HISTORICAL_PAIR_TRIGGER_REASON_ALIASES:
        return True
    return normalized in HISTORICAL_NORMALIZED_PAIR_TRIGGER_REASON_ALIASES
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def is_canonical_pair_trigger_reason(reason: str) -> bool:
    """Return True if *reason* is, verbatim, one of the canonical trigger reasons."""
    canonical = CANONICAL_PAIR_TRIGGER_REASONS
    return reason in canonical
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def is_historical_pair_trigger_reason(reason: str) -> bool:
    """Return True if *reason* matches a historical alias.

    The plain alias set is checked against *reason* verbatim; the normalized
    alias set is checked against the normalized form of *reason*.
    """
    normalized = normalized_pair_trigger_reason(reason)
    if reason in HISTORICAL_PAIR_TRIGGER_REASON_ALIASES:
        return True
    return normalized in HISTORICAL_NORMALIZED_PAIR_TRIGGER_REASON_ALIASES
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def has_known_pair_trigger_reason(reasons: list[str]) -> bool:
    """Return True if at least one entry of *reasons* is a known trigger reason."""
    for candidate in reasons:
        if is_known_pair_trigger_reason(candidate):
            return True
    return False
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def all_known_pair_trigger_reasons(reasons: list[str]) -> bool:
    """Return True if every entry of *reasons* is a known trigger reason.

    An empty list yields True.
    """
    for candidate in reasons:
        if not is_known_pair_trigger_reason(candidate):
            return False
    return True
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def has_canonical_pair_trigger_reason(reasons: list[str]) -> bool:
    """Return True if at least one entry of *reasons* is a canonical trigger reason."""
    for candidate in reasons:
        if is_canonical_pair_trigger_reason(candidate):
            return True
    return False
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def has_historical_pair_trigger_reason(reasons: list[str]) -> bool:
    """Return True if at least one entry of *reasons* is a historical alias."""
    for candidate in reasons:
        if is_historical_pair_trigger_reason(candidate):
            return True
    return False
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def is_command_like_backtick(value: str) -> bool:
    """Heuristically decide whether backticked text *value* looks like a command.

    Blank text and the reserved terms are never commands. Otherwise the text
    qualifies when its first word is a known command prefix, when it contains
    a path/shell marker character, or when it ends with a script extension.
    """
    candidate = value.strip()
    folded = candidate.lower()
    if not candidate:
        return False
    if folded in RESERVED_BACKTICK_TERMS:
        return False

    leading_word = folded.split(maxsplit=1)[0]
    if leading_word in COMMAND_PREFIXES:
        return True

    # Markers and extensions are checked on the original-case text.
    for shell_marker in ("/", "$", "=", "|", "&&", ";"):
        if shell_marker in candidate:
            return True
    return candidate.endswith((".js", ".py", ".sh"))
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def actionable_observable_commands(text: str) -> list[str]:
    """Collect command-like backticked spans from qualifying lines of *text*.

    A line qualifies when, case-insensitively, it contains ``miss`` and at
    least one of ``OBSERVABLE_COMMAND_MARKERS``. Spans are returned without
    their backticks, in order of appearance.
    """
    found: list[str] = []
    for raw_line in text.splitlines():
        folded = raw_line.lower()
        if "miss" in folded and any(
            marker in folded for marker in OBSERVABLE_COMMAND_MARKERS
        ):
            spans = (
                hit.group(0).strip("`")
                for hit in BACKTICKED_TEXT_RE.finditer(raw_line)
            )
            found.extend(span for span in spans if is_command_like_backtick(span))
    return found
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def has_actionable_solo_headroom_hypothesis_text(text: str) -> bool:
    """Return True if *text* states an actionable solo-headroom hypothesis.

    The lower-cased text must contain ``solo-headroom hypothesis``,
    ``solo_claude`` and ``miss``, and ``actionable_observable_commands`` must
    find at least one command in it.
    """
    folded = text.lower()
    for required in ("solo-headroom hypothesis", "solo_claude", "miss"):
        if required not in folded:
            return False
    return bool(actionable_observable_commands(text))
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def path_has_actionable_solo_headroom_hypothesis(path: Path) -> bool:
    """Return True if the file at *path* states an actionable solo-headroom hypothesis.

    The file is read as UTF-8 and checked with
    ``has_actionable_solo_headroom_hypothesis_text``.

    Returns:
        False when the file cannot be read (``OSError``) or is not valid
        UTF-8; otherwise the result of the text check.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # An undecodable file carries no usable hypothesis text; treat it
        # like a missing file rather than aborting the caller.
        return False
    return has_actionable_solo_headroom_hypothesis_text(text)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def normalize_pair_evidence_row(
    *,
    fixture: str,
    run_id: str,
    pair_arm: object,
    row: dict[str, Any],
) -> dict[str, Any] | None:
    """Validate one pair-evidence row and return its normalized form.

    Args:
        fixture: fixture identifier; only checked for being non-empty.
        run_id: run identifier; must be non-empty and is copied to the result.
        pair_arm: must be a string contained in ``ALLOWED_PAIR_ARMS``.
        row: mapping holding the score, margin, mode, trigger and wall-ratio
            fields.

    Returns:
        A new dict with the validated fields plus the two
        ``pair_trigger_has_*`` flags, or None if any check fails.
    """
    (
        bare,
        solo,
        pair,
        margin,
        mode,
        eligible,
        reasons,
        ratio,
    ) = (
        row.get(field)
        for field in (
            "bare_score",
            "solo_score",
            "pair_score",
            "pair_margin",
            "pair_mode",
            "pair_trigger_eligible",
            "pair_trigger_reasons",
            "pair_solo_wall_ratio",
        )
    )

    if not (fixture and run_id):
        return None
    if not (isinstance(pair_arm, str) and pair_arm in ALLOWED_PAIR_ARMS):
        return None
    if not (is_score(bare) and is_score(solo) and is_score(pair)):
        return None
    # The recorded margin must be an int and agree with the recorded scores.
    if not is_strict_int(margin) or margin != pair - solo:
        return None
    # Both flags must be the literal True, not merely truthy.
    if mode is not True or eligible is not True:
        return None
    if not isinstance(reasons, list) or not reasons:
        return None
    if not all(isinstance(entry, str) for entry in reasons):
        return None
    if not all_known_pair_trigger_reasons(reasons):
        return None
    if not has_canonical_pair_trigger_reason(reasons):
        return None
    if not is_strict_number(ratio):
        return None

    return {
        "run_id": run_id,
        "pair_arm": pair_arm,
        "bare_score": bare,
        "solo_score": solo,
        "pair_score": pair,
        "pair_margin": margin,
        "pair_mode": mode,
        "pair_trigger_eligible": eligible,
        "pair_trigger_reasons": reasons,
        "pair_trigger_has_canonical_reason": True,
        "pair_trigger_has_hypothesis_reason": (
            "spec.solo_headroom_hypothesis" in reasons
        ),
        "pair_solo_wall_ratio": ratio,
    }
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def best_pair_evidence(evidence: list[object]) -> dict[str, Any] | None:
    """Return the strongest valid pair-evidence row from *evidence*.

    Entries that are not dicts, lack a string ``run_id``, or fail
    ``normalize_pair_evidence_row`` are ignored. Remaining rows are ranked by
    ``pair_margin``, then ``pair_score``, then ``run_id``; the highest-ranked
    normalized row is returned, or None when no row is valid.
    """
    candidates: list[dict[str, Any]] = []
    for item in evidence:
        if not isinstance(item, dict):
            continue
        if not isinstance(item.get("run_id"), str):
            continue
        normalized = normalize_pair_evidence_row(
            fixture="_",  # placeholder: the row check only needs a non-empty fixture
            run_id=item["run_id"],
            pair_arm=item.get("pair_arm"),
            row=item,
        )
        if normalized is not None:
            candidates.append(normalized)

    if not candidates:
        return None

    def rank(candidate: dict[str, Any]) -> tuple[int, int, str]:
        margin = candidate.get("pair_margin")
        score = candidate.get("pair_score")
        if not isinstance(margin, int):
            margin = -10_000
        if not isinstance(score, int):
            score = -10_000
        return (margin, score, str(candidate.get("run_id") or ""))

    return max(candidates, key=rank)
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def is_strict_int(value: object) -> bool:
    """Return True when *value* is an ``int`` but not a ``bool``."""
    # bool is a subclass of int, so it has to be excluded explicitly.
    if isinstance(value, bool):
        return False
    return isinstance(value, int)
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def is_score(value: object) -> bool:
    """Return True when *value* is a strict integer in the range 0..100 inclusive."""
    if not is_strict_int(value):
        return False
    return 0 <= value <= 100
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def is_strict_number(value: object) -> bool:
    """Return True when *value* is a positive, finite ``int`` or ``float``.

    Booleans are rejected even though ``bool`` subclasses ``int``. NaN,
    infinities, zero and negative numbers are rejected as well.
    """
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return False
    # math.isfinite() converts its argument to float and raises OverflowError
    # for ints too large to represent; ints are always finite, so only floats
    # need the check.
    if isinstance(value, float) and not math.isfinite(value):
        return False
    return value > 0
|
|
@@ -17,8 +17,12 @@ import subprocess
|
|
|
17
17
|
from pathlib import Path
|
|
18
18
|
from typing import Any
|
|
19
19
|
|
|
20
|
+
from pair_evidence_contract import loads_strict_json_object
|
|
21
|
+
|
|
20
22
|
|
|
21
23
|
SAFE_ID = re.compile(r"^[A-Za-z0-9_.-]+$")
|
|
24
|
+
SAFE_REPO = re.compile(r"^[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+$")
|
|
25
|
+
SAFE_COMMIT = re.compile(r"^[0-9a-fA-F]{7,40}$")
|
|
22
26
|
|
|
23
27
|
|
|
24
28
|
def run(cmd: list[str], cwd: Path | None = None) -> None:
|
|
@@ -26,11 +30,12 @@ def run(cmd: list[str], cwd: Path | None = None) -> None:
|
|
|
26
30
|
|
|
27
31
|
|
|
28
32
|
def read_json(path: Path) -> dict[str, Any]:
    """Read *path* as UTF-8 and parse it with ``loads_strict_json_object``.

    Raises:
        ValueError: re-raised from the parser. When the parser's message is
            exactly "top-level JSON value must be an object", it is replaced
            by an error that names the offending file.
    """
    try:
        payload = loads_strict_json_object(path.read_text(encoding="utf8"))
    except ValueError as exc:
        if str(exc) != "top-level JSON value must be an object":
            raise
        raise ValueError(f"expected JSON object: {path}") from exc
    return payload
|
|
34
39
|
|
|
35
40
|
|
|
36
41
|
def require_text(instance: dict[str, Any], key: str) -> str:
|
|
@@ -40,14 +45,38 @@ def require_text(instance: dict[str, Any], key: str) -> str:
|
|
|
40
45
|
return value.strip()
|
|
41
46
|
|
|
42
47
|
|
|
48
|
+
def positive_int(value: str) -> int:
    """argparse ``type=`` converter that accepts only integers greater than zero.

    Raises:
        argparse.ArgumentTypeError: if *value* is not an integer or is <= 0.
    """
    try:
        number = int(value)
    except ValueError as exc:
        raise argparse.ArgumentTypeError("must be an integer") from exc
    if number > 0:
        return number
    raise argparse.ArgumentTypeError("must be > 0")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def require_safe_repo(instance: dict[str, Any]) -> str:
    """Return the instance's ``repo`` text after checking it against ``SAFE_REPO``.

    Raises:
        ValueError: if the value does not match the ``owner/name`` pattern.
    """
    candidate = require_text(instance, "repo")
    if SAFE_REPO.match(candidate) is None:
        raise ValueError(f"unsafe SWE-bench repo: {candidate!r}")
    return candidate
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def require_safe_base_commit(instance: dict[str, Any]) -> str:
    """Return the instance's ``base_commit`` text after checking it against ``SAFE_COMMIT``.

    Raises:
        ValueError: if the value is not a 7-40 character hexadecimal string.
    """
    candidate = require_text(instance, "base_commit")
    if SAFE_COMMIT.match(candidate) is None:
        raise ValueError(f"unsafe SWE-bench base_commit: {candidate!r}")
    return candidate
|
|
70
|
+
|
|
71
|
+
|
|
43
72
|
def repo_cache_name(repo: str, base_commit: str) -> str:
    """Build a cache directory name from a repo slug and a commit.

    Every "/" in *repo* becomes "__", and only the first 12 characters of
    *base_commit* are kept.
    """
    flattened = "__".join(repo.split("/"))
    short_commit = base_commit[:12]
    return f"{flattened}-{short_commit}"
|
|
46
75
|
|
|
47
76
|
|
|
48
77
|
def prepare_repo(instance: dict[str, Any], repo_dir: Path | None, repos_root: Path) -> Path:
|
|
49
|
-
repo =
|
|
50
|
-
base_commit =
|
|
78
|
+
repo = require_safe_repo(instance)
|
|
79
|
+
base_commit = require_safe_base_commit(instance)
|
|
51
80
|
repos_root.mkdir(parents=True, exist_ok=True)
|
|
52
81
|
dest = repos_root / repo_cache_name(repo, base_commit)
|
|
53
82
|
|
|
@@ -72,8 +101,8 @@ def write_case_files(
|
|
|
72
101
|
timeout_seconds: int,
|
|
73
102
|
) -> None:
|
|
74
103
|
instance_id = require_text(instance, "instance_id")
|
|
75
|
-
repo =
|
|
76
|
-
base_commit =
|
|
104
|
+
repo = require_safe_repo(instance)
|
|
105
|
+
base_commit = require_safe_base_commit(instance)
|
|
77
106
|
problem = require_text(instance, "problem_statement")
|
|
78
107
|
case_dir.mkdir(parents=True, exist_ok=True)
|
|
79
108
|
|
|
@@ -196,7 +225,7 @@ def main() -> int:
|
|
|
196
225
|
type=Path,
|
|
197
226
|
help="Local clone/source repo to copy instead of cloning GitHub; useful for tests and cached runs.",
|
|
198
227
|
)
|
|
199
|
-
parser.add_argument("--timeout-seconds", type=
|
|
228
|
+
parser.add_argument("--timeout-seconds", type=positive_int, default=2400)
|
|
200
229
|
args = parser.parse_args()
|
|
201
230
|
|
|
202
231
|
instance = read_json(args.instance_json)
|
|
@@ -10,6 +10,8 @@ import tempfile
|
|
|
10
10
|
from pathlib import Path
|
|
11
11
|
from typing import Any
|
|
12
12
|
|
|
13
|
+
from pair_evidence_contract import loads_strict_json_object, reject_json_constant
|
|
14
|
+
|
|
13
15
|
|
|
14
16
|
def read_jsonl(path: Path) -> list[dict[str, Any]]:
|
|
15
17
|
rows: list[dict[str, Any]] = []
|
|
@@ -17,7 +19,7 @@ def read_jsonl(path: Path) -> list[dict[str, Any]]:
|
|
|
17
19
|
for line_no, line in enumerate(f, start=1):
|
|
18
20
|
if not line.strip():
|
|
19
21
|
continue
|
|
20
|
-
value = json.loads(line)
|
|
22
|
+
value = json.loads(line, parse_constant=reject_json_constant)
|
|
21
23
|
if not isinstance(value, dict):
|
|
22
24
|
raise ValueError(f"{path}:{line_no}: expected JSON object")
|
|
23
25
|
rows.append(value)
|
|
@@ -31,6 +33,32 @@ def require_text(row: dict[str, Any], key: str, source: str) -> str:
|
|
|
31
33
|
return value.strip()
|
|
32
34
|
|
|
33
35
|
|
|
36
|
+
def positive_int(value: str) -> int:
    """argparse ``type=`` converter that accepts only integers greater than zero.

    Raises:
        argparse.ArgumentTypeError: if *value* is not an integer or is <= 0.
    """
    try:
        number = int(value)
    except ValueError as exc:
        raise argparse.ArgumentTypeError("must be an integer") from exc
    if number > 0:
        return number
    raise argparse.ArgumentTypeError("must be > 0")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def parse_prepared_case(stdout: str, source: str) -> dict[str, Any]:
    """Parse *stdout* as a strict JSON object and check its required fields.

    ``instance_id``, ``case_dir`` and ``repo_dir`` must be non-blank strings;
    ``run_command`` must be a non-empty list. *source* only labels error
    messages.

    Raises:
        ValueError: if the payload is not a JSON object or a required field
            is missing or empty; other parser errors are re-raised unchanged.
    """
    try:
        payload = loads_strict_json_object(stdout)
    except ValueError as exc:
        if str(exc) != "top-level JSON value must be an object":
            raise
        raise ValueError(f"{source}: expected JSON object") from exc

    # String fields are checked first, in the same order as the original key tuple.
    for key in ("instance_id", "case_dir", "repo_dir"):
        text = payload.get(key)
        if not isinstance(text, str) or not text.strip():
            raise ValueError(f"{source}: missing non-empty {key!r}")

    key = "run_command"
    command = payload.get(key)
    if not isinstance(command, list) or not command:
        raise ValueError(f"{source}: missing non-empty {key!r}")
    return payload
|
|
60
|
+
|
|
61
|
+
|
|
34
62
|
def main() -> int:
|
|
35
63
|
parser = argparse.ArgumentParser()
|
|
36
64
|
parser.add_argument("--instances-jsonl", required=True, type=Path)
|
|
@@ -47,8 +75,8 @@ def main() -> int:
|
|
|
47
75
|
)
|
|
48
76
|
parser.add_argument("--repo-dir", type=Path, help="Use one local repo clone for every selected instance.")
|
|
49
77
|
parser.add_argument("--instance-id", action="append", help="Prepare only these instance ids.")
|
|
50
|
-
parser.add_argument("--limit", type=
|
|
51
|
-
parser.add_argument("--timeout-seconds", type=
|
|
78
|
+
parser.add_argument("--limit", type=positive_int, help="Prepare at most N matched instances after filtering.")
|
|
79
|
+
parser.add_argument("--timeout-seconds", type=positive_int, default=2400)
|
|
52
80
|
parser.add_argument("--out-manifest", type=Path)
|
|
53
81
|
args = parser.parse_args()
|
|
54
82
|
|
|
@@ -61,6 +89,8 @@ def main() -> int:
|
|
|
61
89
|
predictions[instance_id] = row
|
|
62
90
|
|
|
63
91
|
selected_ids = args.instance_id or list(predictions)
|
|
92
|
+
if not selected_ids:
|
|
93
|
+
raise ValueError("no prediction instances selected")
|
|
64
94
|
script = Path(__file__).with_name("prepare-swebench-frozen-case.py")
|
|
65
95
|
prepared: list[dict[str, Any]] = []
|
|
66
96
|
with tempfile.TemporaryDirectory() as tmp:
|
|
@@ -98,7 +128,7 @@ def main() -> int:
|
|
|
98
128
|
if args.repo_dir is not None:
|
|
99
129
|
cmd.extend(["--repo-dir", str(args.repo_dir)])
|
|
100
130
|
completed = subprocess.run(cmd, check=True, text=True, capture_output=True)
|
|
101
|
-
prepared.append(
|
|
131
|
+
prepared.append(parse_prepared_case(completed.stdout, f"prepared case {instance_id}"))
|
|
102
132
|
|
|
103
133
|
manifest = {
|
|
104
134
|
"instances_jsonl": str(args.instances_jsonl),
|
|
@@ -11,8 +11,12 @@ import subprocess
|
|
|
11
11
|
from pathlib import Path
|
|
12
12
|
from typing import Any
|
|
13
13
|
|
|
14
|
+
from pair_evidence_contract import reject_json_constant
|
|
15
|
+
|
|
14
16
|
|
|
15
17
|
SAFE_ID = re.compile(r"^[A-Za-z0-9_.-]+$")
|
|
18
|
+
SAFE_REPO = re.compile(r"^[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+$")
|
|
19
|
+
SAFE_COMMIT = re.compile(r"^[0-9a-fA-F]{7,40}$")
|
|
16
20
|
|
|
17
21
|
|
|
18
22
|
def run(cmd: list[str], cwd: Path | None = None) -> None:
|
|
@@ -25,7 +29,7 @@ def read_instances(path: Path) -> list[dict[str, Any]]:
|
|
|
25
29
|
for line_no, line in enumerate(f, start=1):
|
|
26
30
|
if not line.strip():
|
|
27
31
|
continue
|
|
28
|
-
value = json.loads(line)
|
|
32
|
+
value = json.loads(line, parse_constant=reject_json_constant)
|
|
29
33
|
if not isinstance(value, dict):
|
|
30
34
|
raise ValueError(f"{path}:{line_no}: expected JSON object")
|
|
31
35
|
rows.append(value)
|
|
@@ -39,6 +43,20 @@ def require_text(instance: dict[str, Any], key: str) -> str:
|
|
|
39
43
|
return value.strip()
|
|
40
44
|
|
|
41
45
|
|
|
46
|
+
def require_safe_repo(instance: dict[str, Any]) -> str:
    """Return the instance's ``repo`` text after checking it against ``SAFE_REPO``.

    Raises:
        ValueError: if the value does not match the ``owner/name`` pattern.
    """
    slug = require_text(instance, "repo")
    if SAFE_REPO.match(slug) is None:
        raise ValueError(f"unsafe SWE-bench repo: {slug!r}")
    return slug
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def require_safe_base_commit(instance: dict[str, Any]) -> str:
    """Return the instance's ``base_commit`` text after checking it against ``SAFE_COMMIT``.

    Raises:
        ValueError: if the value is not a 7-40 character hexadecimal string.
    """
    sha = require_text(instance, "base_commit")
    if SAFE_COMMIT.match(sha) is None:
        raise ValueError(f"unsafe SWE-bench base_commit: {sha!r}")
    return sha
|
|
58
|
+
|
|
59
|
+
|
|
42
60
|
def pick_instance(path: Path, instance_id: str) -> dict[str, Any]:
|
|
43
61
|
matches = [row for row in read_instances(path) if row.get("instance_id") == instance_id]
|
|
44
62
|
if len(matches) != 1:
|
|
@@ -51,8 +69,8 @@ def repo_cache_name(repo: str, base_commit: str) -> str:
|
|
|
51
69
|
|
|
52
70
|
|
|
53
71
|
def prepare_repo(instance: dict[str, Any], repos_root: Path) -> Path:
|
|
54
|
-
repo =
|
|
55
|
-
base_commit =
|
|
72
|
+
repo = require_safe_repo(instance)
|
|
73
|
+
base_commit = require_safe_base_commit(instance)
|
|
56
74
|
repos_root.mkdir(parents=True, exist_ok=True)
|
|
57
75
|
dest = repos_root / repo_cache_name(repo, base_commit)
|
|
58
76
|
|
|
@@ -77,8 +95,8 @@ def copy_worktree(repo_path: Path, worktree: Path) -> None:
|
|
|
77
95
|
|
|
78
96
|
def write_spec(instance: dict[str, Any], worktree: Path) -> Path:
|
|
79
97
|
instance_id = require_text(instance, "instance_id")
|
|
80
|
-
repo =
|
|
81
|
-
base_commit =
|
|
98
|
+
repo = require_safe_repo(instance)
|
|
99
|
+
base_commit = require_safe_base_commit(instance)
|
|
82
100
|
problem = require_text(instance, "problem_statement")
|
|
83
101
|
spec_path = worktree / "docs" / "roadmap" / "phase-1" / f"{instance_id}.md"
|
|
84
102
|
spec_path.parent.mkdir(parents=True, exist_ok=True)
|