devlyn-cli 2.3.0 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +1 -1
- package/CLAUDE.md +2 -2
- package/README.md +82 -29
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
- package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
- package/benchmark/auto-resolve/README.md +307 -44
- package/benchmark/auto-resolve/RUBRIC.md +23 -14
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
- package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
- package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
- package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
- package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
- package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
- package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
- package/benchmark/auto-resolve/scripts/judge.sh +153 -26
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
- package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
- package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
- package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
- package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
- package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
- package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
- package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
- package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
- package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
- package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
- package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
- package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
- package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
- package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
- package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
- package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
- package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
- package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
- package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
- package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
- package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
- package/bin/devlyn.js +211 -18
- package/config/skills/_shared/adapters/README.md +3 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
- package/config/skills/_shared/adapters/opus-4-7.md +9 -1
- package/config/skills/_shared/archive_run.py +78 -6
- package/config/skills/_shared/codex-config.md +3 -2
- package/config/skills/_shared/codex-monitored.sh +46 -1
- package/config/skills/_shared/collect-codex-findings.py +20 -5
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/runtime-principles.md +5 -8
- package/config/skills/_shared/spec-verify-check.py +2664 -107
- package/config/skills/_shared/verify-merge-findings.py +1369 -19
- package/config/skills/devlyn:ideate/SKILL.md +7 -4
- package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
- package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
- package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
- package/config/skills/devlyn:resolve/SKILL.md +49 -18
- package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
- package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
- package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
- package/package.json +47 -2
- package/scripts/lint-fixtures.sh +349 -0
- package/scripts/lint-shadow-fixtures.sh +58 -0
- package/scripts/lint-skills.sh +3642 -92
- /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
|
@@ -23,9 +23,9 @@
|
|
|
23
23
|
# `_axis_validation`, same shape as judge.sh).
|
|
24
24
|
# - Always re-judges (no skip-on-exists) so cross-judge results never go
|
|
25
25
|
# stale.
|
|
26
|
-
# - Aggregator computes per-axis L1-L0 disagreement vs
|
|
27
|
-
# metric per Codex R0 Q1 — falsification rule: any axis
|
|
28
|
-
# means iter-0021/0023 L1 readout is single-judge artifact).
|
|
26
|
+
# - Aggregator computes per-axis solo_claude-bare (L1-L0) disagreement vs
|
|
27
|
+
# GPT (the decisive metric per Codex R0 Q1 — falsification rule: any axis
|
|
28
|
+
# disagreement >2 means iter-0021/0023 L1 readout is single-judge artifact).
|
|
29
29
|
#
|
|
30
30
|
# Usage:
|
|
31
31
|
# judge-opus-pass.sh --run-id <ID>
|
|
@@ -38,10 +38,19 @@
|
|
|
38
38
|
set -euo pipefail
|
|
39
39
|
|
|
40
40
|
usage() { echo "usage: $0 --run-id <ID>"; exit 1; }
|
|
41
|
+
require_value() {
|
|
42
|
+
local flag="$1"
|
|
43
|
+
local value="${2:-}"
|
|
44
|
+
if [ -z "$value" ] || [[ "$value" == --* ]]; then
|
|
45
|
+
echo "$flag requires a value" >&2
|
|
46
|
+
exit 1
|
|
47
|
+
fi
|
|
48
|
+
}
|
|
49
|
+
|
|
41
50
|
RUN_ID=""
|
|
42
51
|
while [ $# -gt 0 ]; do
|
|
43
52
|
case "$1" in
|
|
44
|
-
--run-id) RUN_ID="$2"; shift 2;;
|
|
53
|
+
--run-id) require_value "$1" "${2:-}"; RUN_ID="$2"; shift 2;;
|
|
45
54
|
*) usage;;
|
|
46
55
|
esac
|
|
47
56
|
done
|
|
@@ -51,6 +60,50 @@ BENCH_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
|
|
|
51
60
|
RES_ROOT="$BENCH_ROOT/results/$RUN_ID"
|
|
52
61
|
[ -d "$RES_ROOT" ] || { echo "no results dir: $RES_ROOT"; exit 1; }
|
|
53
62
|
|
|
63
|
+
python3 - "$RES_ROOT" "$BENCH_ROOT/scripts" <<'PY'
|
|
64
|
+
import json
|
|
65
|
+
import pathlib
|
|
66
|
+
import sys
|
|
67
|
+
sys.path.insert(0, sys.argv[2])
|
|
68
|
+
from pair_evidence_contract import loads_strict_json_object
|
|
69
|
+
|
|
70
|
+
def is_score(value):
|
|
71
|
+
return isinstance(value, int) and not isinstance(value, bool) and 0 <= value <= 100
|
|
72
|
+
|
|
73
|
+
res_root = pathlib.Path(sys.argv[1])
|
|
74
|
+
errors = []
|
|
75
|
+
for fixture_dir in sorted(p for p in res_root.glob("F*/") if p.is_dir()):
|
|
76
|
+
judge_path = fixture_dir / "judge.json"
|
|
77
|
+
prompt_path = fixture_dir / "judge-prompt.txt"
|
|
78
|
+
if not judge_path.exists() or not prompt_path.exists():
|
|
79
|
+
continue
|
|
80
|
+
judge = loads_strict_json_object(judge_path.read_text())
|
|
81
|
+
mapping = judge.get("_blind_mapping")
|
|
82
|
+
if not isinstance(mapping, dict):
|
|
83
|
+
errors.append(f"{fixture_dir.name}: judge blind mapping missing")
|
|
84
|
+
continue
|
|
85
|
+
mapped_arms = {arm for slot, arm in mapping.items() if slot in {"A", "B", "C"}}
|
|
86
|
+
required = {"bare", "solo_claude"}
|
|
87
|
+
raw_scores = judge.get("scores_by_arm")
|
|
88
|
+
scores = raw_scores if isinstance(raw_scores, dict) else {}
|
|
89
|
+
if "variant" in scores:
|
|
90
|
+
required.add("variant")
|
|
91
|
+
malformed_scores = sorted(arm for arm, score in scores.items() if not is_score(score))
|
|
92
|
+
if malformed_scores:
|
|
93
|
+
errors.append(f"{fixture_dir.name}: scores_by_arm malformed score(s): {', '.join(malformed_scores)}")
|
|
94
|
+
missing = sorted(required - mapped_arms)
|
|
95
|
+
if missing:
|
|
96
|
+
errors.append(f"{fixture_dir.name}: judge blind mapping missing arm(s): {', '.join(missing)}")
|
|
97
|
+
unmapped_scores = sorted(set(scores) - mapped_arms)
|
|
98
|
+
if unmapped_scores:
|
|
99
|
+
errors.append(f"{fixture_dir.name}: scores_by_arm without blind mapping: {', '.join(unmapped_scores)}")
|
|
100
|
+
|
|
101
|
+
if errors:
|
|
102
|
+
for error in errors:
|
|
103
|
+
print(f"[opus-judge] ✗ {error}", file=sys.stderr)
|
|
104
|
+
raise SystemExit(2)
|
|
105
|
+
PY
|
|
106
|
+
|
|
54
107
|
command -v claude >/dev/null 2>&1 || { echo "claude CLI not on PATH"; exit 1; }
|
|
55
108
|
|
|
56
109
|
CLAUDE_CLI_VER=$(claude --version 2>/dev/null || echo "claude-cli unknown")
|
|
@@ -105,22 +158,32 @@ for fixture_dir in "$RES_ROOT"/F*/; do
|
|
|
105
158
|
continue
|
|
106
159
|
fi
|
|
107
160
|
|
|
108
|
-
python3 - "$opus_out_raw" "$gpt_judge_f" "$opus_judge_f" "$CLAUDE_CLI_VER" "$JUDGE_MODEL_ALIAS" <<'PY' || { echo "[opus-judge] ✗ $fid parse failed"; failed=$((failed + 1)); continue; }
|
|
161
|
+
python3 - "$opus_out_raw" "$gpt_judge_f" "$opus_judge_f" "$CLAUDE_CLI_VER" "$JUDGE_MODEL_ALIAS" "$BENCH_ROOT/scripts" <<'PY' || { echo "[opus-judge] ✗ $fid parse failed"; failed=$((failed + 1)); continue; }
|
|
109
162
|
import sys, json, pathlib
|
|
163
|
+
sys.path.insert(0, sys.argv[6])
|
|
164
|
+
from pair_evidence_contract import loads_strict_json_object, reject_json_constant
|
|
165
|
+
|
|
166
|
+
def is_score(value):
|
|
167
|
+
return isinstance(value, int) and not isinstance(value, bool) and 0 <= value <= 100
|
|
168
|
+
|
|
169
|
+
def is_bool(value):
|
|
170
|
+
return isinstance(value, bool)
|
|
110
171
|
|
|
111
172
|
raw = pathlib.Path(sys.argv[1]).read_text()
|
|
112
|
-
gpt =
|
|
173
|
+
gpt = loads_strict_json_object(pathlib.Path(sys.argv[2]).read_text())
|
|
113
174
|
target = pathlib.Path(sys.argv[3])
|
|
114
175
|
cli_ver = sys.argv[4].strip()
|
|
115
176
|
model_alias = sys.argv[5].strip()
|
|
116
177
|
|
|
117
178
|
# Robust JSON extraction — last valid {} block with required score keys.
|
|
118
|
-
mapping = gpt.get("_blind_mapping")
|
|
179
|
+
mapping = gpt.get("_blind_mapping")
|
|
180
|
+
if not isinstance(mapping, dict):
|
|
181
|
+
raise SystemExit("gpt judge.json _blind_mapping must be an object")
|
|
119
182
|
required_score_keys = ["a_score", "b_score"]
|
|
120
183
|
if "C" in mapping:
|
|
121
184
|
required_score_keys.append("c_score")
|
|
122
185
|
|
|
123
|
-
decoder = json.JSONDecoder()
|
|
186
|
+
decoder = json.JSONDecoder(parse_constant=reject_json_constant)
|
|
124
187
|
brace_positions = [i for i, c in enumerate(raw) if c == '{']
|
|
125
188
|
chosen = None
|
|
126
189
|
for pos in reversed(brace_positions):
|
|
@@ -135,6 +198,9 @@ if chosen is None:
|
|
|
135
198
|
raise SystemExit(
|
|
136
199
|
f"no valid JSON with keys {required_score_keys} in opus output: {sys.argv[1]}"
|
|
137
200
|
)
|
|
201
|
+
invalid_scores = [key for key in required_score_keys if not is_score(chosen.get(key))]
|
|
202
|
+
if invalid_scores:
|
|
203
|
+
raise SystemExit(f"invalid opus score value(s): {', '.join(invalid_scores)}")
|
|
138
204
|
|
|
139
205
|
# Axis validation — mirror judge.sh post iter-0023.
|
|
140
206
|
AXIS_KEYS = ("spec", "constraint", "scope", "quality")
|
|
@@ -147,9 +213,9 @@ for bk in BREAKDOWN_KEYS:
|
|
|
147
213
|
if axis not in chosen[bk]:
|
|
148
214
|
continue
|
|
149
215
|
v = chosen[bk][axis]
|
|
150
|
-
if not isinstance(v, (int, float)) or v < 0 or v > 25:
|
|
216
|
+
if isinstance(v, bool) or not isinstance(v, (int, float)) or v < 0 or v > 25:
|
|
151
217
|
axis_invalid_cells.append({"breakdown": bk, "axis": axis, "value": v})
|
|
152
|
-
chosen[bk][axis] = max(0, min(25, int(v) if isinstance(v, (int, float)) else 0))
|
|
218
|
+
chosen[bk][axis] = max(0, min(25, int(v) if not isinstance(v, bool) and isinstance(v, (int, float)) else 0))
|
|
153
219
|
chosen["_axis_validation"] = {
|
|
154
220
|
"out_of_range_count": len(axis_invalid_cells),
|
|
155
221
|
"out_of_range_cells": axis_invalid_cells,
|
|
@@ -172,7 +238,7 @@ slot_letters = ["A", "B", "C"]
|
|
|
172
238
|
scores_by_arm = {}
|
|
173
239
|
for letter, key in zip(slot_letters, slot_keys):
|
|
174
240
|
arm = mapping.get(letter)
|
|
175
|
-
if arm is not None and key in chosen:
|
|
241
|
+
if arm is not None and key in chosen and is_score(chosen[key]):
|
|
176
242
|
scores_by_arm[arm] = chosen[key]
|
|
177
243
|
chosen["scores_by_arm"] = scores_by_arm
|
|
178
244
|
|
|
@@ -196,18 +262,26 @@ for letter, bk in zip(slot_letters, BREAKDOWN_KEYS):
|
|
|
196
262
|
chosen["breakdowns_by_arm"] = breakdowns_by_arm
|
|
197
263
|
|
|
198
264
|
# Per-arm critical_findings + disqualifiers (same shape judge.sh emits).
|
|
199
|
-
|
|
265
|
+
raw_findings_letters = chosen.get("critical_findings")
|
|
266
|
+
findings_letters = raw_findings_letters if isinstance(raw_findings_letters, dict) else {}
|
|
200
267
|
chosen["findings_by_arm"] = {
|
|
201
268
|
mapping[l]: findings_letters.get(l, []) for l in slot_letters if l in mapping
|
|
202
269
|
}
|
|
203
|
-
|
|
270
|
+
raw_dq_letters = chosen.get("disqualifiers")
|
|
271
|
+
dq_letters = raw_dq_letters if isinstance(raw_dq_letters, dict) else {}
|
|
272
|
+
invalid_dq_letters = [
|
|
273
|
+
letter for letter in slot_letters
|
|
274
|
+
if letter in dq_letters and not is_bool(dq_letters.get(letter))
|
|
275
|
+
]
|
|
276
|
+
if invalid_dq_letters:
|
|
277
|
+
raise SystemExit(f"invalid opus disqualifier value(s): {', '.join(invalid_dq_letters)}")
|
|
204
278
|
dq_by_arm = {}
|
|
205
279
|
for l in slot_letters:
|
|
206
280
|
if l not in mapping:
|
|
207
281
|
continue
|
|
208
282
|
arm = mapping[l]
|
|
209
283
|
dq_by_arm[arm] = {
|
|
210
|
-
"disqualifier":
|
|
284
|
+
"disqualifier": dq_letters.get(l, False) is True,
|
|
211
285
|
"reason": str(dq_letters.get(f"{l}_reason", "") or ""),
|
|
212
286
|
}
|
|
213
287
|
chosen["disqualifiers_by_arm"] = dq_by_arm
|
|
@@ -236,7 +310,8 @@ target.write_text(json.dumps(chosen, indent=2))
|
|
|
236
310
|
print(
|
|
237
311
|
f"[opus-judge] {target.parent.name} "
|
|
238
312
|
f"v={chosen.get('variant_score')} l1={chosen.get('solo_score')} l0={chosen.get('bare_score')} "
|
|
239
|
-
f"
|
|
313
|
+
f"solo_claude-bare={chosen['margins']['solo_over_bare']} "
|
|
314
|
+
f"variant-solo_claude={chosen['margins']['variant_over_solo']}"
|
|
240
315
|
)
|
|
241
316
|
PY
|
|
242
317
|
processed=$((processed + 1))
|
|
@@ -244,9 +319,14 @@ done
|
|
|
244
319
|
|
|
245
320
|
echo "[opus-judge] judge passes: processed=$processed skipped=$skipped failed=$failed"
|
|
246
321
|
|
|
247
|
-
# Aggregate cross-judge agreement, including per-axis L1-L0 disagreement.
|
|
248
|
-
python3 - "$RES_ROOT" <<'PY'
|
|
322
|
+
# Aggregate cross-judge agreement, including per-axis solo_claude-bare (L1-L0) disagreement.
|
|
323
|
+
python3 - "$RES_ROOT" "$BENCH_ROOT/scripts" <<'PY'
|
|
249
324
|
import json, pathlib, sys, math
|
|
325
|
+
sys.path.insert(0, sys.argv[2])
|
|
326
|
+
from pair_evidence_contract import loads_strict_json_object
|
|
327
|
+
|
|
328
|
+
def is_score(value):
|
|
329
|
+
return isinstance(value, int) and not isinstance(value, bool) and 0 <= value <= 100
|
|
250
330
|
|
|
251
331
|
res_root = pathlib.Path(sys.argv[1])
|
|
252
332
|
rows = []
|
|
@@ -257,21 +337,30 @@ for fdir in sorted(res_root.glob("F*/")):
|
|
|
257
337
|
o_f = fdir / "judge-opus.json"
|
|
258
338
|
if not g_f.exists() or not o_f.exists():
|
|
259
339
|
continue
|
|
260
|
-
g =
|
|
261
|
-
o =
|
|
340
|
+
g = loads_strict_json_object(g_f.read_text())
|
|
341
|
+
o = loads_strict_json_object(o_f.read_text())
|
|
262
342
|
|
|
263
|
-
# Per-axis L1-L0
|
|
343
|
+
# Per-axis solo_claude-bare (L1-L0) for both judges.
|
|
264
344
|
# Codex R1 #1: judge.sh historically writes `a/b/c_breakdown` plus
|
|
265
345
|
# `_blind_mapping`, NOT `breakdowns_by_arm`. iter-0020 judge.json files
|
|
266
346
|
# are in that historical shape. Derive per-arm breakdowns from letter
|
|
267
347
|
# fields when `breakdowns_by_arm` is absent; fail loudly when neither
|
|
268
348
|
# source is available so axis disagreement never silently falls to zero.
|
|
349
|
+
def blind_mapping(j):
|
|
350
|
+
raw_mapping = j.get("_blind_mapping")
|
|
351
|
+
return raw_mapping if isinstance(raw_mapping, dict) else {}
|
|
352
|
+
|
|
353
|
+
def mapped_arm_set(j):
|
|
354
|
+
mapping = blind_mapping(j)
|
|
355
|
+
return {arm for slot, arm in mapping.items() if slot in {"A", "B", "C"}}
|
|
356
|
+
|
|
269
357
|
def axis_l1_l0(j, label):
|
|
358
|
+
mapped_arms = mapped_arm_set(j)
|
|
270
359
|
bka = j.get("breakdowns_by_arm") or {}
|
|
271
|
-
if "solo_claude" in bka and "bare" in bka:
|
|
360
|
+
if {"solo_claude", "bare"}.issubset(mapped_arms) and "solo_claude" in bka and "bare" in bka:
|
|
272
361
|
l1 = bka["solo_claude"]; l0 = bka["bare"]
|
|
273
362
|
else:
|
|
274
|
-
mapping = j
|
|
363
|
+
mapping = blind_mapping(j)
|
|
275
364
|
slot_letters = ["A", "B", "C"]
|
|
276
365
|
slot_breakdowns = ["a_breakdown", "b_breakdown", "c_breakdown"]
|
|
277
366
|
derived = {}
|
|
@@ -287,16 +376,35 @@ for fdir in sorted(res_root.glob("F*/")):
|
|
|
287
376
|
l1 = derived["solo_claude"]; l0 = derived["bare"]
|
|
288
377
|
return {a: (l1.get(a, 0) - l0.get(a, 0)) for a in axis_keys}
|
|
289
378
|
|
|
379
|
+
def mapped_scores(j):
|
|
380
|
+
mapped_arms = mapped_arm_set(j)
|
|
381
|
+
raw_scores = j.get("scores_by_arm")
|
|
382
|
+
scores = raw_scores if isinstance(raw_scores, dict) else {}
|
|
383
|
+
return {arm: score for arm, score in scores.items() if arm in mapped_arms and is_score(score)}
|
|
384
|
+
|
|
385
|
+
def margin_from_scores(scores, left, right):
|
|
386
|
+
if left in scores and right in scores:
|
|
387
|
+
return scores[left] - scores[right]
|
|
388
|
+
return None
|
|
389
|
+
|
|
390
|
+
def mapped_winner(j, scores):
|
|
391
|
+
winner = j.get("winner_arm")
|
|
392
|
+
if winner == "tie" or winner in scores:
|
|
393
|
+
return winner
|
|
394
|
+
return None
|
|
395
|
+
|
|
290
396
|
g_axes = axis_l1_l0(g, f"gpt {fdir.name}")
|
|
291
397
|
o_axes = axis_l1_l0(o, f"opus {fdir.name}")
|
|
292
398
|
axis_disagreement = {a: o_axes[a] - g_axes[a] for a in axis_keys}
|
|
293
399
|
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
g_l1_l0 =
|
|
297
|
-
o_l1_l0 =
|
|
298
|
-
g_v_l0 =
|
|
299
|
-
o_v_l0 =
|
|
400
|
+
g_scores = mapped_scores(g)
|
|
401
|
+
o_scores = mapped_scores(o)
|
|
402
|
+
g_l1_l0 = margin_from_scores(g_scores, "solo_claude", "bare")
|
|
403
|
+
o_l1_l0 = margin_from_scores(o_scores, "solo_claude", "bare")
|
|
404
|
+
g_v_l0 = margin_from_scores(g_scores, "variant", "bare")
|
|
405
|
+
o_v_l0 = margin_from_scores(o_scores, "variant", "bare")
|
|
406
|
+
g_winner = mapped_winner(g, g_scores)
|
|
407
|
+
o_winner = mapped_winner(o, o_scores)
|
|
300
408
|
margin_l1_l0_diff = (
|
|
301
409
|
abs(g_l1_l0 - o_l1_l0) if g_l1_l0 is not None and o_l1_l0 is not None else None
|
|
302
410
|
)
|
|
@@ -306,8 +414,8 @@ for fdir in sorted(res_root.glob("F*/")):
|
|
|
306
414
|
|
|
307
415
|
rows.append({
|
|
308
416
|
"fixture": fdir.name,
|
|
309
|
-
"gpt_scores":
|
|
310
|
-
"opus_scores":
|
|
417
|
+
"gpt_scores": g_scores,
|
|
418
|
+
"opus_scores": o_scores,
|
|
311
419
|
"gpt_margin_l1_l0": g_l1_l0,
|
|
312
420
|
"opus_margin_l1_l0": o_l1_l0,
|
|
313
421
|
"margin_l1_l0_diff": margin_l1_l0_diff,
|
|
@@ -317,9 +425,9 @@ for fdir in sorted(res_root.glob("F*/")):
|
|
|
317
425
|
"gpt_axis_l1_l0": g_axes,
|
|
318
426
|
"opus_axis_l1_l0": o_axes,
|
|
319
427
|
"axis_disagreement": axis_disagreement,
|
|
320
|
-
"winner_agree":
|
|
321
|
-
"gpt_winner":
|
|
322
|
-
"opus_winner":
|
|
428
|
+
"winner_agree": g_winner is not None and o_winner is not None and g_winner == o_winner,
|
|
429
|
+
"gpt_winner": g_winner,
|
|
430
|
+
"opus_winner": o_winner,
|
|
323
431
|
})
|
|
324
432
|
|
|
325
433
|
if not rows:
|
|
@@ -328,7 +436,7 @@ if not rows:
|
|
|
328
436
|
|
|
329
437
|
n = len(rows)
|
|
330
438
|
|
|
331
|
-
# Suite-level per-axis L1-L0 sum (both judges) and disagreement.
|
|
439
|
+
# Suite-level per-axis solo_claude-bare (L1-L0) sum (both judges) and disagreement.
|
|
332
440
|
g_axis_sum = {a: sum(r["gpt_axis_l1_l0"][a] for r in rows) for a in axis_keys}
|
|
333
441
|
o_axis_sum = {a: sum(r["opus_axis_l1_l0"][a] for r in rows) for a in axis_keys}
|
|
334
442
|
axis_sum_disagreement = {a: o_axis_sum[a] - g_axis_sum[a] for a in axis_keys}
|
|
@@ -339,7 +447,7 @@ THRESHOLD = 2
|
|
|
339
447
|
falsified_by_axis = max_abs_axis_disagreement > THRESHOLD
|
|
340
448
|
flipped_axes = [a for a, v in axis_sum_disagreement.items() if abs(v) > THRESHOLD]
|
|
341
449
|
|
|
342
|
-
# Suite avg L1-L0
|
|
450
|
+
# Suite avg solo_claude-bare (L1-L0, both judges) — Codex R1 #3: divide by valid-count, report denom.
|
|
343
451
|
gpt_l1_l0_valid = [r["gpt_margin_l1_l0"] for r in rows if r["gpt_margin_l1_l0"] is not None]
|
|
344
452
|
opus_l1_l0_valid = [r["opus_margin_l1_l0"] for r in rows if r["opus_margin_l1_l0"] is not None]
|
|
345
453
|
gpt_l1_l0_avg = (sum(gpt_l1_l0_valid) / len(gpt_l1_l0_valid)) if gpt_l1_l0_valid else None
|
|
@@ -408,18 +516,22 @@ summary = {
|
|
|
408
516
|
out = res_root / "cross-judge-summary.json"
|
|
409
517
|
out.write_text(json.dumps(summary, indent=2))
|
|
410
518
|
|
|
519
|
+
def fmt_metric(value):
|
|
520
|
+
return f"{value:.2f}" if value is not None else "na"
|
|
521
|
+
|
|
411
522
|
print(
|
|
412
523
|
f"[cross-judge] n={n} "
|
|
413
524
|
f"falsified={falsified_by_axis} flipped_axes={flipped_axes} "
|
|
414
525
|
f"max_axis_disagreement={max_abs_axis_disagreement} "
|
|
415
|
-
f"gpt_l1_l0_avg={gpt_l1_l0_avg
|
|
416
|
-
f"
|
|
526
|
+
f"gpt_l1_l0_avg={fmt_metric(gpt_l1_l0_avg)} "
|
|
527
|
+
f"opus_l1_l0_avg={fmt_metric(opus_l1_l0_avg)} "
|
|
528
|
+
f"suite_avg_diff={fmt_metric(suite_avg_diff)}"
|
|
417
529
|
)
|
|
418
530
|
print(f"[cross-judge] axis_sum_l1_l0: gpt={g_axis_sum} opus={o_axis_sum} disagree={axis_sum_disagreement}")
|
|
419
531
|
print(f"[cross-judge] wrote {out}")
|
|
420
532
|
PY
|
|
421
533
|
|
|
422
|
-
# Hard-fail summary if
|
|
534
|
+
# Hard-fail summary if any fixture in the selected result set lacks a paired judgement.
|
|
423
535
|
EXPECTED_FIXTURES=$(ls -d "$RES_ROOT"/F*/ 2>/dev/null | wc -l | awk '{print $1}')
|
|
424
536
|
PAIRED=$(find "$RES_ROOT" -maxdepth 2 -name 'judge-opus.json' | wc -l | awk '{print $1}')
|
|
425
537
|
echo "[opus-judge] expected_fixtures=$EXPECTED_FIXTURES paired=$PAIRED"
|
|
@@ -7,30 +7,46 @@
|
|
|
7
7
|
# Reads:
|
|
8
8
|
# results/<run-id>/<fixture>/variant/diff.patch + verify.json
|
|
9
9
|
# results/<run-id>/<fixture>/bare/diff.patch + verify.json
|
|
10
|
-
# fixtures/<fixture>/spec.md + expected.json
|
|
10
|
+
# fixtures/<fixture>/spec.md + expected.json, or shadow-fixtures/<fixture>/...
|
|
11
11
|
# RUBRIC.md (stable rubric)
|
|
12
12
|
#
|
|
13
13
|
# Writes:
|
|
14
14
|
# results/<run-id>/<fixture>/judge.json
|
|
15
15
|
#
|
|
16
|
-
# Blind:
|
|
16
|
+
# Blind: arm-to-slot assignment randomized per fixture, seed stored in judge.json.
|
|
17
17
|
|
|
18
18
|
set -euo pipefail
|
|
19
19
|
|
|
20
20
|
usage() { echo "usage: $0 --fixture <FID> --run-id <ID>"; exit 1; }
|
|
21
|
+
require_value() {
|
|
22
|
+
local flag="$1"
|
|
23
|
+
local value="${2:-}"
|
|
24
|
+
if [ -z "$value" ] || [[ "$value" == --* ]]; then
|
|
25
|
+
echo "$flag requires a value" >&2
|
|
26
|
+
exit 1
|
|
27
|
+
fi
|
|
28
|
+
}
|
|
29
|
+
|
|
21
30
|
FIXTURE=""; RUN_ID=""
|
|
22
31
|
while [ $# -gt 0 ]; do
|
|
23
32
|
case "$1" in
|
|
24
|
-
--fixture) FIXTURE="$2"; shift 2;;
|
|
25
|
-
--run-id) RUN_ID="$2"; shift 2;;
|
|
33
|
+
--fixture) require_value "$1" "${2:-}"; FIXTURE="$2"; shift 2;;
|
|
34
|
+
--run-id) require_value "$1" "${2:-}"; RUN_ID="$2"; shift 2;;
|
|
26
35
|
*) usage;;
|
|
27
36
|
esac
|
|
28
37
|
done
|
|
29
38
|
[ -n "$FIXTURE" ] && [ -n "$RUN_ID" ] || usage
|
|
30
39
|
|
|
31
40
|
BENCH_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
|
|
32
|
-
FIX_DIR="$BENCH_ROOT/fixtures/$FIXTURE"
|
|
33
41
|
RES_DIR="$BENCH_ROOT/results/$RUN_ID/$FIXTURE"
|
|
42
|
+
FIX_DIR=""
|
|
43
|
+
for candidate in "$BENCH_ROOT/fixtures/$FIXTURE" "$BENCH_ROOT/shadow-fixtures/$FIXTURE"; do
|
|
44
|
+
if [ -d "$candidate" ]; then
|
|
45
|
+
FIX_DIR="$candidate"
|
|
46
|
+
break
|
|
47
|
+
fi
|
|
48
|
+
done
|
|
49
|
+
[ -n "$FIX_DIR" ] || { echo "fixture not found in fixtures/ or shadow-fixtures/: $FIXTURE"; exit 1; }
|
|
34
50
|
|
|
35
51
|
# iter-0019: 3 arms — variant (L2), solo_claude (L1), bare (L0). The judge
|
|
36
52
|
# scores all three in a single pass with the same prompt + same model so
|
|
@@ -43,10 +59,11 @@ ARMS_PRESENT=()
|
|
|
43
59
|
# iter-0033c: l2_gated/l2_forced added for NEW L2 vs NEW L1 measurement.
|
|
44
60
|
# iter-0037: l2_risk_probes adds bounded visible-verification probes before
|
|
45
61
|
# IMPLEMENT; judge treats it as another blind arm when artifacts exist.
|
|
46
|
-
# Slot count is still A/B/C max 3
|
|
47
|
-
# {
|
|
48
|
-
#
|
|
49
|
-
#
|
|
62
|
+
# Slot count is still A/B/C max 3. Current pair-candidate proof runs supply
|
|
63
|
+
# {bare, solo_claude, selected pair arm} where the selected pair arm is usually
|
|
64
|
+
# l2_risk_probes; older diagnostic runs may instead supply l2_gated/l2_forced.
|
|
65
|
+
# The blind-shuffle slot mapping below already tolerates arbitrary
|
|
66
|
+
# ARMS_PRESENT counts >=2.
|
|
50
67
|
for arm in variant solo_claude bare l2_gated l2_risk_probes l2_forced; do
|
|
51
68
|
if [ -f "$RES_DIR/$arm/diff.patch" ] && [ -f "$RES_DIR/$arm/verify.json" ]; then
|
|
52
69
|
ARMS_PRESENT+=("$arm")
|
|
@@ -100,12 +117,15 @@ fi
|
|
|
100
117
|
# pipeline-commit markers, .devlyn/ archive lines) don't leak to the judge.
|
|
101
118
|
# Judge sees only file-content changes; the transcript, arm label, NOTES.md,
|
|
102
119
|
# and all process artifacts stay out of the prompt.
|
|
103
|
-
python3 - "$PROMPT_FILE" "$FIX_DIR/spec.md" "$FIX_DIR/expected.json" "$BENCH_ROOT/RUBRIC.md" "$A_DIFF" "$B_DIFF" "$A_VERIFY" "$B_VERIFY" "$C_DIFF" "$C_VERIFY" <<'PY'
|
|
120
|
+
python3 - "$PROMPT_FILE" "$FIX_DIR/spec.md" "$FIX_DIR/expected.json" "$BENCH_ROOT/RUBRIC.md" "$A_DIFF" "$B_DIFF" "$A_VERIFY" "$B_VERIFY" "$C_DIFF" "$C_VERIFY" "$BENCH_ROOT/scripts" <<'PY'
|
|
104
121
|
import sys, pathlib, re, json
|
|
105
122
|
args = sys.argv[1:]
|
|
106
123
|
out_p, spec_p, exp_p, rubric_p = map(pathlib.Path, args[:4])
|
|
107
124
|
a_diff, b_diff, a_ver, b_ver = map(pathlib.Path, args[4:8])
|
|
108
125
|
c_diff_arg, c_ver_arg = args[8], args[9]
|
|
126
|
+
sys.path.insert(0, args[10])
|
|
127
|
+
from pair_evidence_contract import loads_strict_json_object
|
|
128
|
+
|
|
109
129
|
c_diff = pathlib.Path(c_diff_arg) if c_diff_arg else None
|
|
110
130
|
c_ver = pathlib.Path(c_ver_arg) if c_ver_arg else None
|
|
111
131
|
out = out_p
|
|
@@ -139,7 +159,7 @@ def sanitize(diff: str) -> str:
|
|
|
139
159
|
|
|
140
160
|
# Also strip arm-identifying fields from verify.json before passing to judge.
|
|
141
161
|
def sanitize_verify(path: pathlib.Path) -> str:
|
|
142
|
-
data =
|
|
162
|
+
data = loads_strict_json_object(path.read_text())
|
|
143
163
|
# Remove anything that could name the arm
|
|
144
164
|
data.pop("arm", None)
|
|
145
165
|
return json.dumps(data, indent=2)
|
|
@@ -253,8 +273,88 @@ rm -rf "$JUDGE_CWD"
|
|
|
253
273
|
mkdir -p "$JUDGE_CWD"
|
|
254
274
|
|
|
255
275
|
JUDGE_OUT="$RES_DIR/judge-output.txt"
|
|
276
|
+
JUDGE_LAST="$RES_DIR/judge-last-message.txt"
|
|
277
|
+
JUDGE_SCHEMA="$RES_DIR/judge-output.schema.json"
|
|
278
|
+
python3 - "$JUDGE_SCHEMA" "$C_ARM" <<'PY'
|
|
279
|
+
import json
|
|
280
|
+
import pathlib
|
|
281
|
+
import sys
|
|
282
|
+
|
|
283
|
+
out = pathlib.Path(sys.argv[1])
|
|
284
|
+
have_c = bool(sys.argv[2])
|
|
285
|
+
letters = ["A", "B"] + (["C"] if have_c else [])
|
|
286
|
+
|
|
287
|
+
breakdown = {
|
|
288
|
+
"type": "object",
|
|
289
|
+
"required": ["spec", "constraint", "scope", "quality", "notes"],
|
|
290
|
+
"properties": {
|
|
291
|
+
"spec": {"type": "integer", "minimum": 0, "maximum": 25},
|
|
292
|
+
"constraint": {"type": "integer", "minimum": 0, "maximum": 25},
|
|
293
|
+
"scope": {"type": "integer", "minimum": 0, "maximum": 25},
|
|
294
|
+
"quality": {"type": "integer", "minimum": 0, "maximum": 25},
|
|
295
|
+
"notes": {"type": "string"},
|
|
296
|
+
},
|
|
297
|
+
"additionalProperties": False,
|
|
298
|
+
}
|
|
299
|
+
findings = {
|
|
300
|
+
"type": "object",
|
|
301
|
+
"required": letters,
|
|
302
|
+
"properties": {letter: {"type": "array", "items": {"type": "string"}} for letter in letters},
|
|
303
|
+
"additionalProperties": False,
|
|
304
|
+
}
|
|
305
|
+
disqualifier_props = {}
|
|
306
|
+
for letter in letters:
|
|
307
|
+
disqualifier_props[letter] = {"type": "boolean"}
|
|
308
|
+
disqualifier_props[f"{letter}_reason"] = {"type": "string"}
|
|
309
|
+
schema = {
|
|
310
|
+
"type": "object",
|
|
311
|
+
"required": [
|
|
312
|
+
"a_score",
|
|
313
|
+
"b_score",
|
|
314
|
+
"winner",
|
|
315
|
+
"a_breakdown",
|
|
316
|
+
"b_breakdown",
|
|
317
|
+
"critical_findings",
|
|
318
|
+
"disqualifiers",
|
|
319
|
+
"overall_reasoning",
|
|
320
|
+
] + (["c_score", "c_breakdown"] if have_c else []),
|
|
321
|
+
"properties": {
|
|
322
|
+
"a_score": {"type": "integer", "minimum": 0, "maximum": 100},
|
|
323
|
+
"b_score": {"type": "integer", "minimum": 0, "maximum": 100},
|
|
324
|
+
"winner": {"type": "string", "enum": letters + ["tie"]},
|
|
325
|
+
"a_breakdown": breakdown,
|
|
326
|
+
"b_breakdown": breakdown,
|
|
327
|
+
"critical_findings": findings,
|
|
328
|
+
"disqualifiers": {
|
|
329
|
+
"type": "object",
|
|
330
|
+
"required": list(disqualifier_props),
|
|
331
|
+
"properties": disqualifier_props,
|
|
332
|
+
"additionalProperties": False,
|
|
333
|
+
},
|
|
334
|
+
"overall_reasoning": {"type": "string"},
|
|
335
|
+
},
|
|
336
|
+
"additionalProperties": False,
|
|
337
|
+
}
|
|
338
|
+
if have_c:
|
|
339
|
+
schema["properties"]["c_score"] = {"type": "integer", "minimum": 0, "maximum": 100}
|
|
340
|
+
schema["properties"]["c_breakdown"] = breakdown
|
|
341
|
+
out.write_text(json.dumps(schema, indent=2))
|
|
342
|
+
PY
|
|
256
343
|
set +e
|
|
257
|
-
cat "$PROMPT_FILE" | (
|
|
344
|
+
cat "$PROMPT_FILE" | (
|
|
345
|
+
cd "$JUDGE_CWD" && codex exec \
|
|
346
|
+
--ignore-user-config \
|
|
347
|
+
--ignore-rules \
|
|
348
|
+
--ephemeral \
|
|
349
|
+
--disable codex_hooks \
|
|
350
|
+
--disable hooks \
|
|
351
|
+
-s read-only \
|
|
352
|
+
--skip-git-repo-check \
|
|
353
|
+
-c model_reasoning_effort=xhigh \
|
|
354
|
+
--output-schema "$JUDGE_SCHEMA" \
|
|
355
|
+
--output-last-message "$JUDGE_LAST" \
|
|
356
|
+
-
|
|
357
|
+
) > "$JUDGE_OUT" 2>&1
|
|
258
358
|
JUDGE_EXIT=$?
|
|
259
359
|
set -e
|
|
260
360
|
rm -rf "$JUDGE_CWD"
|
|
@@ -264,18 +364,23 @@ if [ $JUDGE_EXIT -ne 0 ]; then
|
|
|
264
364
|
fi
|
|
265
365
|
|
|
266
366
|
# Extract JSON (codex wraps with banners; pick the last {...} block)
|
|
267
|
-
python3 - "$JUDGE_OUT" "$RES_DIR/judge.json" "$A_ARM" "$B_ARM" "$C_ARM" "$SEED" "$CODEX_CLI_VER" "$JUDGE_MODEL" <<'PY'
|
|
367
|
+
python3 - "$JUDGE_OUT" "$JUDGE_LAST" "$RES_DIR/judge.json" "$A_ARM" "$B_ARM" "$C_ARM" "$SEED" "$CODEX_CLI_VER" "$JUDGE_MODEL" "$BENCH_ROOT/scripts" <<'PY'
|
|
268
368
|
import math
|
|
269
369
|
import sys, re, json, pathlib
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
370
|
+
out_path = pathlib.Path(sys.argv[1])
|
|
371
|
+
last_path = pathlib.Path(sys.argv[2])
|
|
372
|
+
target = pathlib.Path(sys.argv[3])
|
|
373
|
+
a_arm, b_arm, c_arm, seed, codex_ver, judge_model = sys.argv[4:10]
|
|
374
|
+
sys.path.insert(0, sys.argv[10])
|
|
375
|
+
from pair_evidence_contract import loads_strict_json_object, reject_json_constant
|
|
376
|
+
|
|
377
|
+
out = last_path.read_text() if last_path.is_file() and last_path.stat().st_size else out_path.read_text()
|
|
273
378
|
|
|
274
379
|
# Extract the last valid judgment JSON. A naive brace-counter breaks on
|
|
275
380
|
# `{`/`}` that appear inside strings (e.g. JS source embedded in the arms'
|
|
276
381
|
# diffs), so use json.JSONDecoder.raw_decode starting at each `{` position
|
|
277
382
|
# and keep the last successful parse with the required keys.
|
|
278
|
-
decoder = json.JSONDecoder()
|
|
383
|
+
decoder = json.JSONDecoder(parse_constant=reject_json_constant)
|
|
279
384
|
brace_positions = [i for i, c in enumerate(out) if c == '{']
|
|
280
385
|
chosen = None
|
|
281
386
|
for pos in reversed(brace_positions):
|
|
@@ -287,7 +392,21 @@ for pos in reversed(brace_positions):
|
|
|
287
392
|
chosen = obj
|
|
288
393
|
break
|
|
289
394
|
if chosen is None:
|
|
290
|
-
raise SystemExit(f"no valid JSON in judge output; see {
|
|
395
|
+
raise SystemExit(f"no valid JSON in judge output; see {out_path}")
|
|
396
|
+
|
|
397
|
+
def is_score(value):
|
|
398
|
+
return isinstance(value, int) and not isinstance(value, bool) and 0 <= value <= 100
|
|
399
|
+
|
|
400
|
+
def is_bool(value):
|
|
401
|
+
return isinstance(value, bool)
|
|
402
|
+
|
|
403
|
+
def is_axis_score(value):
|
|
404
|
+
return isinstance(value, int) and not isinstance(value, bool) and 0 <= value <= 25
|
|
405
|
+
|
|
406
|
+
required_score_keys = ["a_score", "b_score"] + (["c_score"] if c_arm else [])
|
|
407
|
+
invalid_scores = [key for key in required_score_keys if not is_score(chosen.get(key))]
|
|
408
|
+
if invalid_scores:
|
|
409
|
+
raise SystemExit(f"invalid judge score value(s): {', '.join(invalid_scores)}")
|
|
291
410
|
|
|
292
411
|
# Decode blind labels — record full mapping so summary code can iterate
|
|
293
412
|
mapping = {"A": a_arm, "B": b_arm}
|
|
@@ -313,9 +432,9 @@ for bk in BREAKDOWN_KEYS:
|
|
|
313
432
|
if axis not in chosen[bk]:
|
|
314
433
|
continue
|
|
315
434
|
v = chosen[bk][axis]
|
|
316
|
-
if not
|
|
435
|
+
if not is_axis_score(v):
|
|
317
436
|
axis_invalid_cells.append({"breakdown": bk, "axis": axis, "value": v})
|
|
318
|
-
chosen[bk][axis] = max(0, min(25, int(v) if isinstance(v, (int, float)) else 0))
|
|
437
|
+
chosen[bk][axis] = max(0, min(25, int(v) if not isinstance(v, bool) and isinstance(v, (int, float)) else 0))
|
|
319
438
|
chosen["_axis_validation"] = {
|
|
320
439
|
"out_of_range_count": len(axis_invalid_cells),
|
|
321
440
|
"out_of_range_cells": axis_invalid_cells,
|
|
@@ -335,7 +454,7 @@ def arm_verify_score(arm: str):
|
|
|
335
454
|
path = target.parent / arm / "verify.json"
|
|
336
455
|
if not path.is_file():
|
|
337
456
|
return None
|
|
338
|
-
data =
|
|
457
|
+
data = loads_strict_json_object(path.read_text())
|
|
339
458
|
value = data.get("verify_score")
|
|
340
459
|
return float(value) if isinstance(value, (int, float)) else None
|
|
341
460
|
|
|
@@ -367,7 +486,7 @@ for letter, score_key, breakdown_key in (
|
|
|
367
486
|
"score_capped": False,
|
|
368
487
|
"spec_capped": False,
|
|
369
488
|
}
|
|
370
|
-
if
|
|
489
|
+
if is_score(raw_score) and raw_score > score_cap:
|
|
371
490
|
chosen[score_key] = score_cap
|
|
372
491
|
row["score_capped"] = True
|
|
373
492
|
breakdown = chosen.get(breakdown_key)
|
|
@@ -386,23 +505,31 @@ slot_keys = ["a_score", "b_score", "c_score"]
|
|
|
386
505
|
slot_letters = ["A", "B", "C"]
|
|
387
506
|
for letter, key in zip(slot_letters, slot_keys):
|
|
388
507
|
arm = mapping.get(letter)
|
|
389
|
-
if arm is not None and key in chosen:
|
|
508
|
+
if arm is not None and key in chosen and is_score(chosen[key]):
|
|
390
509
|
scores_by_arm[arm] = chosen[key]
|
|
391
510
|
chosen["scores_by_arm"] = scores_by_arm
|
|
392
511
|
|
|
393
512
|
# Per-letter critical_findings / disqualifiers also rotated to per-arm.
|
|
394
|
-
|
|
513
|
+
raw_findings_letters = chosen.get("critical_findings")
|
|
514
|
+
findings_letters = raw_findings_letters if isinstance(raw_findings_letters, dict) else {}
|
|
395
515
|
findings_by_arm = {mapping[l]: findings_letters.get(l, []) for l in slot_letters if l in mapping}
|
|
396
516
|
chosen["findings_by_arm"] = findings_by_arm
|
|
397
517
|
|
|
398
|
-
|
|
518
|
+
raw_dq_letters = chosen.get("disqualifiers")
|
|
519
|
+
dq_letters = raw_dq_letters if isinstance(raw_dq_letters, dict) else {}
|
|
520
|
+
invalid_dq_letters = [
|
|
521
|
+
letter for letter in slot_letters
|
|
522
|
+
if letter in dq_letters and not is_bool(dq_letters.get(letter))
|
|
523
|
+
]
|
|
524
|
+
if invalid_dq_letters:
|
|
525
|
+
raise SystemExit(f"invalid judge disqualifier value(s): {', '.join(invalid_dq_letters)}")
|
|
399
526
|
dq_by_arm = {}
|
|
400
527
|
for l in slot_letters:
|
|
401
528
|
if l not in mapping:
|
|
402
529
|
continue
|
|
403
530
|
arm = mapping[l]
|
|
404
531
|
dq_by_arm[arm] = {
|
|
405
|
-
"disqualifier":
|
|
532
|
+
"disqualifier": dq_letters.get(l, False) is True,
|
|
406
533
|
"reason": str(dq_letters.get(f"{l}_reason", "") or ""),
|
|
407
534
|
}
|
|
408
535
|
chosen["disqualifiers_by_arm"] = dq_by_arm
|