devlyn-cli 2.2.2 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +2 -2
- package/CLAUDE.md +4 -4
- package/README.md +85 -34
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
- package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
- package/benchmark/auto-resolve/README.md +307 -44
- package/benchmark/auto-resolve/RUBRIC.md +23 -14
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
- package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
- package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
- package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
- package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
- package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
- package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
- package/benchmark/auto-resolve/scripts/judge.sh +153 -26
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
- package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
- package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
- package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
- package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
- package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
- package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
- package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
- package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
- package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
- package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
- package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
- package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
- package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
- package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
- package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
- package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
- package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
- package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
- package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
- package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
- package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
- package/bin/devlyn.js +221 -17
- package/config/skills/_shared/adapters/README.md +3 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
- package/config/skills/_shared/adapters/opus-4-7.md +9 -1
- package/config/skills/_shared/archive_run.py +78 -6
- package/config/skills/_shared/codex-config.md +5 -4
- package/config/skills/_shared/codex-monitored.sh +46 -1
- package/config/skills/_shared/collect-codex-findings.py +20 -5
- package/config/skills/_shared/engine-preflight.md +17 -13
- package/config/skills/_shared/runtime-principles.md +6 -9
- package/config/skills/_shared/spec-verify-check.py +2664 -107
- package/config/skills/_shared/verify-merge-findings.py +1369 -19
- package/config/skills/devlyn:design-ui/SKILL.md +364 -0
- package/config/skills/devlyn:ideate/SKILL.md +7 -4
- package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
- package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
- package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
- package/config/skills/devlyn:resolve/SKILL.md +78 -26
- package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
- package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
- package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
- package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
- package/package.json +47 -2
- package/scripts/lint-fixtures.sh +349 -0
- package/scripts/lint-shadow-fixtures.sh +58 -0
- package/scripts/lint-skills.sh +3645 -95
|
@@ -25,6 +25,24 @@ EOF
|
|
|
25
25
|
exit "${1:-1}"
|
|
26
26
|
}
|
|
27
27
|
|
|
28
|
+
require_value() {
|
|
29
|
+
local flag="$1"
|
|
30
|
+
local value="${2:-}"
|
|
31
|
+
if [ -z "$value" ] || [[ "$value" == --* ]]; then
|
|
32
|
+
echo "$flag requires a value" >&2
|
|
33
|
+
exit 1
|
|
34
|
+
fi
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
require_safe_id() {
|
|
38
|
+
local label="$1"
|
|
39
|
+
local value="$2"
|
|
40
|
+
if [[ ! "$value" =~ ^[A-Za-z0-9_.-]+$ ]]; then
|
|
41
|
+
echo "$label must match [A-Za-z0-9_.-]+: $value" >&2
|
|
42
|
+
exit 1
|
|
43
|
+
fi
|
|
44
|
+
}
|
|
45
|
+
|
|
28
46
|
FIXTURE=""
|
|
29
47
|
DIFF_PATH=""
|
|
30
48
|
RUN_ID=""
|
|
@@ -36,13 +54,13 @@ TIMEOUT_OVERRIDE=""
|
|
|
36
54
|
RESUME_COMPLETED_ARMS=0
|
|
37
55
|
while [ $# -gt 0 ]; do
|
|
38
56
|
case "$1" in
|
|
39
|
-
--fixture) FIXTURE="$2"; shift 2;;
|
|
40
|
-
--diff) DIFF_PATH="$2"; shift 2;;
|
|
41
|
-
--run-id) RUN_ID="$2"; shift 2;;
|
|
42
|
-
--pair-mode) PAIR_MODE="$2"; shift 2;;
|
|
43
|
-
--fixtures-root) FIXTURES_ROOT="$2"; shift 2;;
|
|
44
|
-
--base-repo) BASE_REPO="$2"; shift 2;;
|
|
45
|
-
--timeout-seconds) TIMEOUT_OVERRIDE="$2"; shift 2;;
|
|
57
|
+
--fixture) require_value "$1" "${2:-}"; FIXTURE="$2"; shift 2;;
|
|
58
|
+
--diff) require_value "$1" "${2:-}"; DIFF_PATH="$2"; shift 2;;
|
|
59
|
+
--run-id) require_value "$1" "${2:-}"; RUN_ID="$2"; shift 2;;
|
|
60
|
+
--pair-mode) require_value "$1" "${2:-}"; PAIR_MODE="$2"; shift 2;;
|
|
61
|
+
--fixtures-root) require_value "$1" "${2:-}"; FIXTURES_ROOT="$2"; shift 2;;
|
|
62
|
+
--base-repo) require_value "$1" "${2:-}"; BASE_REPO="$2"; shift 2;;
|
|
63
|
+
--timeout-seconds) require_value "$1" "${2:-}"; TIMEOUT_OVERRIDE="$2"; shift 2;;
|
|
46
64
|
--prepare-only) PREPARE_ONLY=1; shift;;
|
|
47
65
|
--resume-completed-arms) RESUME_COMPLETED_ARMS=1; shift;;
|
|
48
66
|
-h|--help) usage 0;;
|
|
@@ -51,6 +69,7 @@ while [ $# -gt 0 ]; do
|
|
|
51
69
|
done
|
|
52
70
|
|
|
53
71
|
[ -n "$FIXTURE" ] && [ -n "$DIFF_PATH" ] || usage 1
|
|
72
|
+
require_safe_id "--fixture" "$FIXTURE"
|
|
54
73
|
[ -f "$DIFF_PATH" ] || { echo "diff not found: $DIFF_PATH" >&2; exit 1; }
|
|
55
74
|
[ -s "$DIFF_PATH" ] || { echo "diff is empty: $DIFF_PATH" >&2; exit 1; }
|
|
56
75
|
[ "$PAIR_MODE" = "forced" ] || [ "$PAIR_MODE" = "gated" ] || { echo "--pair-mode must be forced|gated (got '$PAIR_MODE')" >&2; exit 1; }
|
|
@@ -74,7 +93,20 @@ for f in "$META" "$EXPECTED" "$SPEC" "$TASK" "$SETUP"; do
|
|
|
74
93
|
[ -f "$f" ] || { echo "fixture missing required file: $f" >&2; exit 1; }
|
|
75
94
|
done
|
|
76
95
|
|
|
77
|
-
TIMEOUT=$(python3 -
|
|
96
|
+
TIMEOUT=$(python3 - "$META" "$BENCH_ROOT/scripts" <<'PY'
|
|
97
|
+
import pathlib
|
|
98
|
+
import sys
|
|
99
|
+
|
|
100
|
+
sys.path.insert(0, sys.argv[2])
|
|
101
|
+
from pair_evidence_contract import loads_strict_json_object
|
|
102
|
+
|
|
103
|
+
metadata = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())
|
|
104
|
+
timeout = metadata.get("timeout_seconds")
|
|
105
|
+
if not isinstance(timeout, int) or isinstance(timeout, bool) or timeout <= 0:
|
|
106
|
+
raise SystemExit("metadata timeout_seconds must be a positive integer")
|
|
107
|
+
print(timeout)
|
|
108
|
+
PY
|
|
109
|
+
)
|
|
78
110
|
if [ -n "$TIMEOUT_OVERRIDE" ]; then
|
|
79
111
|
case "$TIMEOUT_OVERRIDE" in ''|*[!0-9]*) echo "--timeout-seconds must be an integer" >&2; exit 1;; esac
|
|
80
112
|
[ "$TIMEOUT_OVERRIDE" -gt 0 ] || { echo "--timeout-seconds must be > 0" >&2; exit 1; }
|
|
@@ -85,10 +117,28 @@ if [ -z "$RUN_ID" ]; then
|
|
|
85
117
|
SHA=$(git -C "$REPO_ROOT" rev-parse --short HEAD 2>/dev/null || echo nogit)
|
|
86
118
|
RUN_ID="${TS}-${SHA}-frozen-verify"
|
|
87
119
|
fi
|
|
120
|
+
require_safe_id "--run-id" "$RUN_ID"
|
|
88
121
|
|
|
89
122
|
RESULT_ROOT="$BENCH_ROOT/results/$RUN_ID"
|
|
90
123
|
mkdir -p "$RESULT_ROOT"
|
|
91
124
|
|
|
125
|
+
print_command() {
|
|
126
|
+
local cmd=(bash "$0"
|
|
127
|
+
--fixture "$FIXTURE"
|
|
128
|
+
--fixtures-root "$FIXTURES_ROOT"
|
|
129
|
+
--base-repo "$BASE_REPO"
|
|
130
|
+
--diff "$DIFF_PATH"
|
|
131
|
+
--run-id "$RUN_ID"
|
|
132
|
+
--pair-mode "$PAIR_MODE"
|
|
133
|
+
--timeout-seconds "$TIMEOUT"
|
|
134
|
+
)
|
|
135
|
+
[ "$PREPARE_ONLY" -eq 0 ] || cmd+=(--prepare-only)
|
|
136
|
+
[ "$RESUME_COMPLETED_ARMS" -eq 0 ] || cmd+=(--resume-completed-arms)
|
|
137
|
+
printf 'Command: '
|
|
138
|
+
printf '%q ' "${cmd[@]}"
|
|
139
|
+
printf '\n'
|
|
140
|
+
}
|
|
141
|
+
|
|
92
142
|
echo ""
|
|
93
143
|
echo "═══ Frozen Verify Pair Run ═══"
|
|
94
144
|
echo "Run-id: $RUN_ID"
|
|
@@ -99,6 +149,7 @@ echo "Diff: $DIFF_PATH"
|
|
|
99
149
|
echo "Pair: $PAIR_MODE"
|
|
100
150
|
echo "Timeout: ${TIMEOUT}s per arm"
|
|
101
151
|
[ "$PREPARE_ONLY" -eq 0 ] || echo "Mode: prepare-only"
|
|
152
|
+
print_command
|
|
102
153
|
echo ""
|
|
103
154
|
|
|
104
155
|
mirror_skills() {
|
|
@@ -195,17 +246,36 @@ summarize_arm() {
|
|
|
195
246
|
local result_dir="$1"
|
|
196
247
|
local elapsed="$2"
|
|
197
248
|
local invoke_exit="$3"
|
|
198
|
-
python3 - "$result_dir" "$elapsed" "$invoke_exit" <<'PY'
|
|
249
|
+
python3 - "$result_dir" "$elapsed" "$invoke_exit" "$BENCH_ROOT/scripts" <<'PY'
|
|
199
250
|
import json, pathlib, sys
|
|
251
|
+
sys.path.insert(0, sys.argv[4])
|
|
252
|
+
from pair_evidence_contract import loads_strict_json_object, reject_json_constant
|
|
253
|
+
|
|
200
254
|
result_dir = pathlib.Path(sys.argv[1])
|
|
201
255
|
elapsed = int(sys.argv[2])
|
|
202
256
|
invoke_exit = int(sys.argv[3])
|
|
203
257
|
archive = result_dir / "run-archive"
|
|
204
258
|
state_path = archive / "pipeline.state.json"
|
|
205
|
-
|
|
206
|
-
|
|
259
|
+
def as_dict(value):
|
|
260
|
+
return value if isinstance(value, dict) else {}
|
|
261
|
+
|
|
262
|
+
def strict_nonnegative_int(value):
|
|
263
|
+
return isinstance(value, int) and not isinstance(value, bool) and value >= 0
|
|
264
|
+
|
|
265
|
+
state = as_dict(loads_strict_json_object(state_path.read_text())) if state_path.is_file() else {}
|
|
266
|
+
phases = as_dict(state.get("phases"))
|
|
267
|
+
verify = as_dict(phases.get("verify"))
|
|
268
|
+
legacy_verify = as_dict(state.get("verify"))
|
|
207
269
|
sub_verdicts = verify.get("sub_verdicts")
|
|
208
|
-
pair_trigger = verify.get("pair_trigger") or
|
|
270
|
+
pair_trigger = verify.get("pair_trigger") or legacy_verify.get("pair_trigger")
|
|
271
|
+
PAIR_VERDICTS = {"PASS", "PASS_WITH_ISSUES", "NEEDS_WORK", "BLOCKED", "FAIL"}
|
|
272
|
+
|
|
273
|
+
def has_pair_judge_verdict(sub_verdicts):
|
|
274
|
+
return isinstance(sub_verdicts, dict) and (
|
|
275
|
+
sub_verdicts.get("judge_codex") in PAIR_VERDICTS
|
|
276
|
+
or sub_verdicts.get("pair_judge") in PAIR_VERDICTS
|
|
277
|
+
)
|
|
278
|
+
|
|
209
279
|
findings = []
|
|
210
280
|
finding_paths = []
|
|
211
281
|
merged_path = archive / "verify-merged.findings.jsonl"
|
|
@@ -231,7 +301,7 @@ for findings_path in finding_paths:
|
|
|
231
301
|
for line in findings_path.read_text().splitlines():
|
|
232
302
|
if line.strip():
|
|
233
303
|
try:
|
|
234
|
-
parsed = json.loads(line)
|
|
304
|
+
parsed = json.loads(line, parse_constant=reject_json_constant)
|
|
235
305
|
except json.JSONDecodeError:
|
|
236
306
|
continue
|
|
237
307
|
if not isinstance(parsed, dict):
|
|
@@ -240,9 +310,10 @@ for findings_path in finding_paths:
|
|
|
240
310
|
if sev not in finding_severities:
|
|
241
311
|
continue
|
|
242
312
|
findings.append(parsed)
|
|
243
|
-
merged =
|
|
313
|
+
merged = as_dict(verify.get("merged"))
|
|
244
314
|
merged_findings_count = sum(
|
|
245
|
-
|
|
315
|
+
merged.get(k) if strict_nonnegative_int(merged.get(k)) else 0
|
|
316
|
+
for k in ("critical", "high", "medium", "low")
|
|
246
317
|
)
|
|
247
318
|
findings_count = len(findings) if findings else merged_findings_count
|
|
248
319
|
severity_counts = {}
|
|
@@ -262,15 +333,12 @@ summary = {
|
|
|
262
333
|
"invoke_exit": invoke_exit,
|
|
263
334
|
"timed_out": invoke_exit == 124,
|
|
264
335
|
"invoke_failure_reason": invoke_failure_reason,
|
|
265
|
-
"terminal_verdict": (
|
|
336
|
+
"terminal_verdict": as_dict(phases.get("final_report")).get("verdict"),
|
|
266
337
|
"verify_verdict": verify.get("verdict"),
|
|
267
338
|
"sub_verdicts": sub_verdicts,
|
|
268
339
|
"pair_trigger": pair_trigger,
|
|
269
|
-
"pair_mode":
|
|
270
|
-
|
|
271
|
-
or sub_verdicts.get("pair_judge") is not None
|
|
272
|
-
))
|
|
273
|
-
or bool(verify.get("pair_mode")),
|
|
340
|
+
"pair_mode": has_pair_judge_verdict(sub_verdicts)
|
|
341
|
+
or verify.get("pair_mode") is True,
|
|
274
342
|
"verify_findings_count": findings_count,
|
|
275
343
|
"verify_findings_source": findings_source if finding_paths else (
|
|
276
344
|
"state.merged" if merged_findings_count else "missing"
|
|
@@ -302,11 +370,13 @@ run_arm() {
|
|
|
302
370
|
local result_dir="$RESULT_ROOT/$arm"
|
|
303
371
|
local work_dir="/tmp/bench-${RUN_ID}-${FIXTURE}-${arm}"
|
|
304
372
|
if [ "$RESUME_COMPLETED_ARMS" -eq 1 ] && [ "$PREPARE_ONLY" -eq 0 ] && [ -f "$result_dir/summary.json" ]; then
|
|
305
|
-
if python3 - "$result_dir/summary.json" <<'PY'
|
|
306
|
-
import
|
|
373
|
+
if python3 - "$result_dir/summary.json" "$BENCH_ROOT/scripts" <<'PY'
|
|
374
|
+
import pathlib
|
|
307
375
|
import sys
|
|
376
|
+
sys.path.insert(0, sys.argv[2])
|
|
377
|
+
from pair_evidence_contract import loads_strict_json_object
|
|
308
378
|
|
|
309
|
-
summary =
|
|
379
|
+
summary = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())
|
|
310
380
|
raise SystemExit(0 if summary.get("invoke_exit") == 0 else 1)
|
|
311
381
|
PY
|
|
312
382
|
then
|
|
@@ -330,9 +400,12 @@ PY
|
|
|
330
400
|
mkdir -p "$work_dir/docs/roadmap/phase-1" "$work_dir/.devlyn"
|
|
331
401
|
cp "$SPEC" "$work_dir/docs/roadmap/phase-1/$FIXTURE.md"
|
|
332
402
|
cp "$DIFF_PATH" "$work_dir/.devlyn/external-diff.patch"
|
|
333
|
-
python3 - "$EXPECTED" "$work_dir/.devlyn/spec-verify.json" <<'PY'
|
|
334
|
-
import json, os, sys
|
|
335
|
-
|
|
403
|
+
python3 - "$EXPECTED" "$work_dir/.devlyn/spec-verify.json" "$BENCH_ROOT/scripts" <<'PY'
|
|
404
|
+
import json, os, pathlib, sys
|
|
405
|
+
sys.path.insert(0, sys.argv[3])
|
|
406
|
+
from pair_evidence_contract import loads_strict_json_object
|
|
407
|
+
|
|
408
|
+
expected = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())
|
|
336
409
|
out_path = sys.argv[2]
|
|
337
410
|
commands = expected.get("verification_commands", [])
|
|
338
411
|
if not commands:
|
|
@@ -462,14 +535,17 @@ else
|
|
|
462
535
|
run_arm pair ""
|
|
463
536
|
fi
|
|
464
537
|
|
|
465
|
-
python3 - "$RESULT_ROOT" "$PAIR_MODE" <<'PY'
|
|
466
|
-
import json, pathlib, sys
|
|
538
|
+
python3 - "$RESULT_ROOT" "$PAIR_MODE" "$BENCH_ROOT/scripts" <<'PY'
|
|
539
|
+
import json, math, pathlib, sys
|
|
540
|
+
sys.path.insert(0, sys.argv[3])
|
|
541
|
+
from pair_evidence_contract import loads_strict_json_object
|
|
542
|
+
|
|
467
543
|
root = pathlib.Path(sys.argv[1])
|
|
468
544
|
pair_mode_requested = sys.argv[2]
|
|
469
545
|
out = {}
|
|
470
546
|
for arm in ("solo", "pair"):
|
|
471
547
|
path = root / arm / "summary.json"
|
|
472
|
-
out[arm] =
|
|
548
|
+
out[arm] = loads_strict_json_object(path.read_text()) if path.is_file() else {"missing": True}
|
|
473
549
|
solo = out.get("solo", {})
|
|
474
550
|
pair = out.get("pair", {})
|
|
475
551
|
rank = {
|
|
@@ -481,31 +557,127 @@ rank = {
|
|
|
481
557
|
}
|
|
482
558
|
solo_rank = rank.get(solo.get("verify_verdict"), 0)
|
|
483
559
|
pair_rank = rank.get(pair.get("verify_verdict"), 0)
|
|
484
|
-
|
|
560
|
+
raw_pair_sub = pair.get("sub_verdicts")
|
|
561
|
+
pair_sub = raw_pair_sub if isinstance(raw_pair_sub, dict) else {}
|
|
485
562
|
pair_primary_verdict = pair_sub.get("judge")
|
|
486
563
|
pair_judge_verdict = pair_sub.get("pair_judge")
|
|
487
564
|
pair_primary_rank = rank.get(pair_primary_verdict, 0)
|
|
488
565
|
pair_judge_rank = rank.get(pair_judge_verdict, 0)
|
|
566
|
+
def strict_positive_number(value):
|
|
567
|
+
return (
|
|
568
|
+
isinstance(value, (int, float))
|
|
569
|
+
and not isinstance(value, bool)
|
|
570
|
+
and math.isfinite(value)
|
|
571
|
+
and value > 0
|
|
572
|
+
)
|
|
573
|
+
|
|
574
|
+
def elapsed_ratio(pair_elapsed, solo_elapsed):
|
|
575
|
+
if not strict_positive_number(pair_elapsed) or not strict_positive_number(solo_elapsed):
|
|
576
|
+
return None
|
|
577
|
+
return pair_elapsed / solo_elapsed
|
|
578
|
+
|
|
579
|
+
def strict_nonnegative_int(value):
|
|
580
|
+
return isinstance(value, int) and not isinstance(value, bool) and value >= 0
|
|
581
|
+
|
|
582
|
+
def summary_findings_count(data):
|
|
583
|
+
value = data.get("verify_findings_count")
|
|
584
|
+
return value if strict_nonnegative_int(value) else None
|
|
585
|
+
|
|
586
|
+
def severity_count_sum(data):
|
|
587
|
+
raw_counts = data.get("severity_counts")
|
|
588
|
+
if not isinstance(raw_counts, dict):
|
|
589
|
+
return None
|
|
590
|
+
total = 0
|
|
591
|
+
for key in ("LOW", "MEDIUM", "HIGH", "CRITICAL"):
|
|
592
|
+
value = raw_counts.get(key, 0)
|
|
593
|
+
if not strict_nonnegative_int(value):
|
|
594
|
+
return None
|
|
595
|
+
total += value
|
|
596
|
+
return total
|
|
597
|
+
|
|
598
|
+
def strict_greater(left, right):
    """Return ``left > right``, or False when either operand is None."""
    if left is None or right is None:
        return False
    return left > right
|
|
600
|
+
|
|
601
|
+
# Derived inputs for the solo-vs-pair comparison, read from each arm's summary.
wall_ratio = elapsed_ratio(
    pair.get("elapsed_seconds"),
    solo.get("elapsed_seconds"),
)
pair_mode_true = pair.get("pair_mode") is True

# Only a dict-shaped trigger record is trusted; anything else counts as empty.
raw_pair_trigger = pair.get("pair_trigger")
pair_trigger = {}
if isinstance(raw_pair_trigger, dict):
    pair_trigger = raw_pair_trigger

solo_findings_count = summary_findings_count(solo)
pair_findings_count = summary_findings_count(pair)
solo_low_or_worse = severity_count_sum(solo)
pair_low_or_worse = severity_count_sum(pair)

# "gated" was requested and the trigger record is eligible with reasons,
# yet the pair arm did not report pair_mode as true.
trigger_missed = bool(
    not pair_mode_true
    and pair_mode_requested == "gated"
    and pair_trigger.get("eligible") is True
    and pair_trigger.get("reasons")
)

# External lift: the pair arm's final verdict outranks the solo arm's.
external_lift = (
    pair_mode_true
    and pair_rank > solo_rank
    and pair_rank >= rank["NEEDS_WORK"]
)

# Internal lift: within the pair arm, pair_judge outranks the primary judge.
internal_lift = (
    pair_mode_true
    and pair_judge_rank > pair_primary_rank
    and pair_rank >= rank["NEEDS_WORK"]
)

out["comparison"] = {
    "pair_mode_requested": pair_mode_requested,
    "pair_trigger_missed": trigger_missed,
    "pair_found_more_findings": strict_greater(pair_findings_count, solo_findings_count),
    "pair_found_more_low_or_worse": strict_greater(pair_low_or_worse, solo_low_or_worse),
    "pair_verdict_lift": external_lift,
    "pair_internal_verdict_lift": internal_lift,
    "solo_verdict": solo.get("verify_verdict"),
    "pair_verdict": pair.get("verify_verdict"),
    "pair_primary_verdict": pair_primary_verdict,
    "pair_judge_verdict": pair_judge_verdict,
    "pair_solo_wall_ratio": wall_ratio,
}

# Serialize once; the same text goes to compare.json and to stdout.
compare_json = json.dumps(out, indent=2)
(root / "compare.json").write_text(compare_json + "\n")
print(compare_json)
|
|
631
|
+
|
|
632
|
+
def fmt_bool(value):
    """Render ``value`` as "true" only when it is exactly True, else "false"."""
    return "true" if value is True else "false"
|
|
634
|
+
|
|
635
|
+
def fmt_ratio(value):
    """Format a ratio with two decimals and an "x" suffix, or "n/a" if invalid.

    A value is valid when ``strict_positive_number`` accepts it.
    """
    if not strict_positive_number(value):
        return "n/a"
    return "{:.2f}x".format(value)
|
|
637
|
+
|
|
638
|
+
def fmt_seconds(value):
    """Format a duration as "<value>s", or "n/a" if it is not a valid number.

    A value is valid when ``strict_positive_number`` accepts it.
    """
    if not strict_positive_number(value):
        return "n/a"
    return "{}s".format(value)
|
|
640
|
+
|
|
641
|
+
def fmt_trigger_reasons(value):
    """Join ``value["reasons"]`` with commas for display.

    Returns an empty string unless ``value`` is a dict whose ``reasons`` entry
    is a list made up entirely of strings.
    """
    reasons = value.get("reasons") if isinstance(value, dict) else None
    if isinstance(reasons, list) and all(isinstance(item, str) for item in reasons):
        return ",".join(reasons)
    return ""
|
|
648
|
+
|
|
649
|
+
def arm_row(name, data):
    """Render one arm's summary dict as a single markdown table row.

    The cells are, in order: arm name, verify verdict, pair mode, trigger
    reasons, findings count, elapsed seconds, invoke exit and failure reason.
    Missing values are shown as "n/a".
    """
    def cell(key):
        # dict.get's default only applies when the key is absent, so an
        # explicit None (JSON null) must be mapped to "n/a" here. A plain
        # `or` would also hide a legitimate 0, so only None is replaced.
        value = data.get(key)
        return "n/a" if value is None else value

    return (
        f"| {name} | {data.get('verify_verdict') or 'n/a'} | "
        f"{fmt_bool(data.get('pair_mode'))} | "
        f"{fmt_trigger_reasons(data.get('pair_trigger'))} | "
        f"{cell('verify_findings_count')} | "
        f"{fmt_seconds(data.get('elapsed_seconds'))} | {cell('invoke_exit')} | "
        f"{data.get('invoke_failure_reason') or 'n/a'} |"
    )
|
|
658
|
+
|
|
659
|
+
# Build the markdown summary: heading, per-arm table, then the lift table.
comparison = out["comparison"]
lift_cells = [
    fmt_ratio(wall_ratio),
    fmt_bool(comparison["pair_verdict_lift"]),
    fmt_bool(comparison["pair_internal_verdict_lift"]),
    fmt_bool(comparison["pair_trigger_missed"]),
]

summary_lines = [
    "# Frozen VERIFY Pair Summary",
    "",
    f"Run id: `{root.name}`",
    f"Pair mode requested: `{pair_mode_requested}`",
    "",
    "| Arm | Verdict | Pair mode | Triggers | Findings | Elapsed | Invoke exit | Failure |",
    "|---|---|---:|---|---:|---:|---:|---|",
]
summary_lines += [arm_row(label, arm) for label, arm in (("solo", solo), ("pair", pair))]
summary_lines += [
    "",
    "| Wall ratio | External lift | Internal lift | Trigger missed |",
    "|---:|---:|---:|---:|",
    "| " + " | ".join(lift_cells) + " |",
    "",
]

# The same text is written to compare.md and echoed to stdout.
summary_text = "\n".join(summary_lines)
(root / "compare.md").write_text(summary_text, encoding="utf8")
print(summary_text)
|
|
511
683
|
PY
|