devlyn-cli 2.3.0 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +1 -1
- package/CLAUDE.md +2 -2
- package/README.md +82 -29
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
- package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
- package/benchmark/auto-resolve/README.md +307 -44
- package/benchmark/auto-resolve/RUBRIC.md +23 -14
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
- package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
- package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
- package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
- package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
- package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
- package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
- package/benchmark/auto-resolve/scripts/judge.sh +153 -26
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
- package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
- package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
- package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
- package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
- package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
- package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
- package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
- package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
- package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
- package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
- package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
- package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
- package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
- package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
- package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
- package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
- package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
- package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
- package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
- package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
- package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
- package/bin/devlyn.js +211 -18
- package/config/skills/_shared/adapters/README.md +3 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
- package/config/skills/_shared/adapters/opus-4-7.md +9 -1
- package/config/skills/_shared/archive_run.py +78 -6
- package/config/skills/_shared/codex-config.md +3 -2
- package/config/skills/_shared/codex-monitored.sh +46 -1
- package/config/skills/_shared/collect-codex-findings.py +20 -5
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/runtime-principles.md +5 -8
- package/config/skills/_shared/spec-verify-check.py +2664 -107
- package/config/skills/_shared/verify-merge-findings.py +1369 -19
- package/config/skills/devlyn:ideate/SKILL.md +7 -4
- package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
- package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
- package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
- package/config/skills/devlyn:resolve/SKILL.md +49 -18
- package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
- package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
- package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
- package/package.json +47 -2
- package/scripts/lint-fixtures.sh +349 -0
- package/scripts/lint-shadow-fixtures.sh +58 -0
- package/scripts/lint-skills.sh +3642 -92
- /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Print a compact, wrap-safe benchmark snapshot from local artifacts."""
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import importlib.util
|
|
7
|
+
import json
|
|
8
|
+
import pathlib
|
|
9
|
+
import sys
|
|
10
|
+
import textwrap
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
# Directory that holds this script; sibling benchmark scripts are resolved against it.
SCRIPT_DIR = pathlib.Path(__file__).resolve().parent
# Location of the frontier script that is loaded by file path further down
# (its hyphenated file name cannot be used in a regular ``import`` statement).
FRONTIER_PATH = SCRIPT_DIR / "pair-candidate-frontier.py"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def load_frontier_module() -> Any:
    """Load the script at ``FRONTIER_PATH`` as the module ``pair_candidate_frontier``.

    Returns:
        The executed module object.

    Raises:
        RuntimeError: If no import spec (or no loader) can be built for
            ``FRONTIER_PATH``.
    """
    module_name = "pair_candidate_frontier"
    spec = importlib.util.spec_from_file_location(module_name, FRONTIER_PATH)
    if spec is None or spec.loader is None:
        raise RuntimeError(f"cannot load frontier module: {FRONTIER_PATH}")
    module = importlib.util.module_from_spec(spec)
    # Register the module before executing it, as the importlib recipe for
    # loading a source file directly does: code that looks itself up through
    # ``sys.modules`` while the module body runs (for example dataclasses
    # resolving deferred annotations) otherwise fails.
    previous = sys.modules.get(module_name)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind; restore whatever
        # was registered under this name before the attempt.
        if previous is None:
            sys.modules.pop(module_name, None)
        else:
            sys.modules[module_name] = previous
        raise
    return module
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Frontier helpers, loaded once when this module is imported; the functions
# below call FRONTIER.best_pair_evidence and FRONTIER.build_report.
FRONTIER = load_frontier_module()
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def best_rows(report: dict[str, Any]) -> list[dict[str, Any]]:
    """Collect the best pair-evidence record for each passing fixture row.

    Rows of ``report["rows"]`` whose ``status`` is not
    ``"pair_evidence_passed"`` are skipped, as are rows for which
    ``FRONTIER.best_pair_evidence`` returns ``None``.

    Args:
        report: Frontier report; a missing ``"rows"`` key is treated as empty.

    Returns:
        One dict per kept row: the row's ``fixture`` value merged with the
        fields of its best evidence record (evidence fields are applied last).

    Raises:
        KeyError: If a kept row has no ``"fixture"`` key.
    """
    rows: list[dict[str, Any]] = []
    for row in report.get("rows", []):
        if row.get("status") != "pair_evidence_passed":
            continue
        best = FRONTIER.best_pair_evidence(row.get("passing_pair_evidence", []))
        if best is None:
            continue
        rows.append({"fixture": row["fixture"], **best})
    return rows
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def display_fixture(fixture: str) -> str:
    """Turn a fixture id such as ``F1-cli-trivial-flag`` into a display label.

    The text before the first hyphen is kept as-is and every remaining hyphen
    becomes a space. An id with nothing after the first hyphen (or with no
    hyphen at all) is returned unchanged.
    """
    short, _, rest = fixture.partition("-")
    if not rest:
        return fixture
    return f"{short} {rest.replace('-', ' ')}"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def fmt_margin(value: Any) -> str:
    """Format an integer margin with an explicit sign, e.g. ``+5``.

    Anything that is not an ``int`` (booleans included) yields ``"n/a"``.
    """
    if isinstance(value, bool) or not isinstance(value, int):
        return "n/a"
    return f"{value:+d}"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def fmt_decimal_margin(value: Any) -> str:
    """Format a numeric margin with an explicit sign and two decimals.

    Anything that is not an ``int`` or ``float`` (booleans included) yields
    ``"n/a"``.
    """
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return "n/a"
    return f"{value:+.2f}"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def fmt_wall(value: Any) -> str:
    """Format a wall-time ratio with two decimals and an ``x`` suffix.

    Anything that is not an ``int`` or ``float`` (booleans included) yields
    ``"n/a"``.
    """
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return "n/a"
    return f"{value:.2f}x"
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def fmt_score(value: Any) -> str:
    """Render an integer score as text.

    Anything that is not an ``int`` (booleans included) yields ``"n/a"``.
    """
    if isinstance(value, bool) or not isinstance(value, int):
        return "n/a"
    return str(value)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def wrap_item(prefix: str, text: str, *, width: int) -> list[str]:
    """Wrap ``text`` to ``width`` columns behind a hanging ``prefix``.

    The first line starts with ``prefix``; continuation lines are indented by
    the same number of spaces. Words are never split, neither when they are
    longer than the width nor at hyphens, so a single long token may exceed
    ``width``.

    Returns:
        The wrapped lines. When ``text`` produces no lines at all, a single
        line holding the prefix without trailing whitespace.
    """
    wrapped = textwrap.wrap(
        text,
        width=width,
        initial_indent=prefix,
        subsequent_indent=" " * len(prefix),
        break_long_words=False,
        break_on_hyphens=False,
    )
    return wrapped or [prefix.rstrip()]
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def render_text(report: dict[str, Any], *, width: int) -> str:
    """Render the report as a plain-text snapshot.

    The output has three sections: "Status", "Pair Lift" and "Pair Evidence".
    Only the per-fixture run and trigger lines are wrapped to ``width``; every
    other line is emitted as-is.

    Args:
        report: Frontier report whose summary keys are read with ``.get``.
        width: Wrap width handed to ``wrap_item``.

    Returns:
        The snapshot text, ending with a single newline.
    """
    # NOTE(review): leading whitespace inside the literals below is reproduced
    # as displayed; confirm against the packaged file that runs of spaces were
    # not collapsed by the diff rendering.
    rows = best_rows(report)
    gate_margin = fmt_margin(report.get("min_pair_margin"))
    gate_wall = fmt_wall(report.get("max_pair_solo_wall_ratio"))
    lines = [
        "Recent Benchmark Snapshot",
        "=========================",
        "",
        "Status",
        f" Verdict: {report.get('verdict', 'n/a')}",
        f" Active fixtures: {report.get('fixtures_total', 'n/a')}",
        f" Rejected controls: {report.get('rejected_total', 'n/a')}",
        f" Pair evidence rows: {report.get('pair_evidence_total', 'n/a')}",
        f" Unmeasured candidates: {report.get('unmeasured_candidate_total', 'n/a')}",
        "",
        "Pair Lift",
        f" Average margin: {fmt_decimal_margin(report.get('pair_margin_avg'))}",
        f" Minimum margin: {fmt_margin(report.get('pair_margin_min'))}",
        f" Average wall ratio: {fmt_wall(report.get('pair_solo_wall_ratio_avg'))}",
        f" Maximum wall ratio: {fmt_wall(report.get('pair_solo_wall_ratio_max'))}",
        f" Gate: margin >= {gate_margin}; wall <= {gate_wall}",
        "",
        "Pair Evidence",
    ]
    if not rows:
        lines.append(" No passing pair evidence rows found.")
        return "\n".join(lines) + "\n"

    for item in rows:
        lines.append(f" {display_fixture(item['fixture'])}")
        lines.append(
            " scores: bare {bare} | solo_claude {solo} | pair {pair}".format(
                bare=fmt_score(item.get("bare_score")),
                solo=fmt_score(item.get("solo_score")),
                pair=fmt_score(item.get("pair_score")),
            )
        )
        lines.append(
            " lift: {margin} | wall {wall} | arm {arm}".format(
                margin=fmt_margin(item.get("pair_margin")),
                wall=fmt_wall(item.get("pair_solo_wall_ratio")),
                arm=item.get("pair_arm") or "n/a",
            )
        )
        # Run ids and trigger lists can be long, so these two go through the wrapper.
        lines.extend(wrap_item(" run: ", str(item.get("run_id") or "n/a"), width=width))
        triggers = ", ".join(item.get("pair_trigger_reasons") or [])
        lines.extend(wrap_item(" triggers: ", triggers or "n/a", width=width))
    return "\n".join(lines) + "\n"
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def render_markdown(report: dict[str, Any], *, width: int) -> str:
    """Render the report as a Markdown snapshot.

    The document has "Status", "Pair Lift" and "Pair Evidence" sections, with
    one ``###`` heading per fixture that has passing pair evidence. Only the
    per-fixture "Triggers" bullet is wrapped to ``width``.

    Args:
        report: Frontier report whose summary keys are read with ``.get``.
        width: Wrap width handed to ``wrap_item``.

    Returns:
        The Markdown text, ending with a single newline.
    """
    rows = best_rows(report)
    gate_margin = fmt_margin(report.get("min_pair_margin"))
    gate_wall = fmt_wall(report.get("max_pair_solo_wall_ratio"))
    lines = [
        "# Recent Benchmark Snapshot",
        "",
        "## Status",
        "",
        f"- Verdict: **{report.get('verdict', 'n/a')}**",
        f"- Active fixtures: {report.get('fixtures_total', 'n/a')}",
        f"- Rejected controls: {report.get('rejected_total', 'n/a')}",
        f"- Pair evidence rows: {report.get('pair_evidence_total', 'n/a')}",
        f"- Unmeasured candidates: {report.get('unmeasured_candidate_total', 'n/a')}",
        "",
        "## Pair Lift",
        "",
        f"- Average margin: **{fmt_decimal_margin(report.get('pair_margin_avg'))}**",
        f"- Minimum margin: **{fmt_margin(report.get('pair_margin_min'))}**",
        f"- Average wall ratio: {fmt_wall(report.get('pair_solo_wall_ratio_avg'))}",
        f"- Maximum wall ratio: {fmt_wall(report.get('pair_solo_wall_ratio_max'))}",
        f"- Gate: margin >= {gate_margin}; wall <= {gate_wall}",
        "",
        "## Pair Evidence",
        "",
    ]
    if not rows:
        lines.append("No passing pair evidence rows found.")
        return "\n".join(lines) + "\n"

    for item in rows:
        lines.extend(
            [
                f"### {display_fixture(item['fixture'])}",
                "",
                f"- Scores: bare {fmt_score(item.get('bare_score'))}, solo_claude {fmt_score(item.get('solo_score'))}, pair {fmt_score(item.get('pair_score'))}.",
                f"- Lift: {fmt_margin(item.get('pair_margin'))}; wall {fmt_wall(item.get('pair_solo_wall_ratio'))}; arm `{item.get('pair_arm') or 'n/a'}`.",
                f"- Run: `{item.get('run_id') or 'n/a'}`.",
            ]
        )
        triggers = ", ".join(item.get("pair_trigger_reasons") or [])
        wrapped = wrap_item("- Triggers: ", triggers or "n/a", width=width)
        lines.extend(wrapped)
        # Blank line between fixture sections; the trailing one is removed by
        # the rstrip below.
        # NOTE(review): placement inside the loop is inferred from the
        # flattened diff view - confirm against the packaged file.
        lines.append("")
    return "\n".join(lines).rstrip() + "\n"
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def main() -> int:
    """Build the frontier report and print a text snapshot of it.

    Command-line options select the fixtures root, rejected-fixture registry
    and results root, the wrap width, and the two gate thresholds passed to
    ``FRONTIER.build_report``. When ``--out-json`` / ``--out-md`` are given,
    the raw report and its Markdown rendering are also written to those paths
    (parent directories are created as needed). The text snapshot is always
    printed to stdout.

    Returns:
        ``0`` on success; ``2`` when ``--max-width`` is below 60 or when
        ``FRONTIER.build_report`` raises ``ValueError`` (the message is
        printed to stderr).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--fixtures-root",
        type=pathlib.Path,
        default=pathlib.Path("benchmark/auto-resolve/fixtures"),
    )
    parser.add_argument(
        "--registry",
        type=pathlib.Path,
        default=SCRIPT_DIR / "pair-rejected-fixtures.sh",
    )
    parser.add_argument(
        "--results-root",
        type=pathlib.Path,
        default=pathlib.Path("benchmark/auto-resolve/results"),
    )
    parser.add_argument("--out-json", type=pathlib.Path)
    parser.add_argument("--out-md", type=pathlib.Path)
    parser.add_argument(
        "--max-width",
        type=int,
        default=92,
        help="target maximum line width for text and markdown output",
    )
    parser.add_argument(
        "--min-pair-margin",
        type=int,
        default=5,
        help="minimum pair-over-solo margin required to count passing pair evidence",
    )
    parser.add_argument(
        "--max-pair-solo-wall-ratio",
        type=float,
        default=3.0,
        help="maximum pair/solo wall-time ratio allowed to count passing pair evidence",
    )
    args = parser.parse_args()
    if args.max_width < 60:
        print("error: --max-width must be >= 60", file=sys.stderr)
        return 2

    try:
        report = FRONTIER.build_report(
            fixtures_root=args.fixtures_root,
            registry=args.registry,
            results_root=args.results_root,
            min_pair_margin=args.min_pair_margin,
            max_pair_solo_wall_ratio=args.max_pair_solo_wall_ratio,
        )
    except ValueError as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 2

    # Optional artifacts are written before the snapshot is printed.
    if args.out_json:
        args.out_json.parent.mkdir(parents=True, exist_ok=True)
        args.out_json.write_text(json.dumps(report, indent=2) + "\n", encoding="utf8")
    if args.out_md:
        args.out_md.parent.mkdir(parents=True, exist_ok=True)
        args.out_md.write_text(render_markdown(report, width=args.max_width), encoding="utf8")

    # The rendered text already ends with a newline, so suppress print's own.
    print(render_text(report, width=args.max_width), end="")
    return 0
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
# Script entry point: the return value of main() becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -5,8 +5,8 @@
|
|
|
5
5
|
# subprocess (isolated session), then captures artifacts + runs verification.
|
|
6
6
|
#
|
|
7
7
|
# Usage:
|
|
8
|
-
# run-fixture.sh --fixture <FID> --arm <variant|bare> --run-id <ID>
|
|
9
|
-
# run-fixture.sh --fixture <FID> --arm <variant|bare> --run-id <ID> --dry-run
|
|
8
|
+
# run-fixture.sh --fixture <FID> --arm <variant|solo_claude|bare|l2_gated|l2_risk_probes|l2_forced> --run-id <ID>
|
|
9
|
+
# run-fixture.sh --fixture <FID> --arm <variant|solo_claude|bare|l2_gated|l2_risk_probes|l2_forced> --run-id <ID> --dry-run
|
|
10
10
|
#
|
|
11
11
|
# Outputs to benchmark/auto-resolve/results/<run-id>/<fixture>/<arm>/:
|
|
12
12
|
# input.md, transcript.txt, diff.patch, changed-files.txt, verify.json,
|
|
@@ -19,6 +19,15 @@ usage() {
|
|
|
19
19
|
exit 1
|
|
20
20
|
}
|
|
21
21
|
|
|
22
|
+
# Abort unless the option named in $1 was given a usable value in $2.
# A missing or empty value, or one that itself starts with "--" (i.e. looks
# like the next option), prints "<flag> requires a value" to stderr and exits
# the script with status 1.
require_value() {
  local flag="$1"
  local value="${2:-}"
  if [ -z "$value" ] || [[ "$value" == --* ]]; then
    echo "$flag requires a value" >&2
    exit 1
  fi
}
|
|
30
|
+
|
|
22
31
|
kill_worktree_processes() {
|
|
23
32
|
local work_dir="$1"
|
|
24
33
|
local signal="$2"
|
|
@@ -40,16 +49,16 @@ FIXTURE=""; ARM=""; RUN_ID=""; DRY_RUN=0
|
|
|
40
49
|
RESOLVE_SKILL="new"
|
|
41
50
|
while [ $# -gt 0 ]; do
|
|
42
51
|
case "$1" in
|
|
43
|
-
--fixture) FIXTURE="$2"; shift 2;;
|
|
44
|
-
--arm) ARM="$2"; shift 2;;
|
|
45
|
-
--run-id) RUN_ID="$2"; shift 2;;
|
|
46
|
-
--resolve-skill) RESOLVE_SKILL="$2"; shift 2;;
|
|
52
|
+
--fixture) require_value "$1" "${2:-}"; FIXTURE="$2"; shift 2;;
|
|
53
|
+
--arm) require_value "$1" "${2:-}"; ARM="$2"; shift 2;;
|
|
54
|
+
--run-id) require_value "$1" "${2:-}"; RUN_ID="$2"; shift 2;;
|
|
55
|
+
--resolve-skill) require_value "$1" "${2:-}"; RESOLVE_SKILL="$2"; shift 2;;
|
|
47
56
|
--dry-run) DRY_RUN=1; shift;;
|
|
48
57
|
*) usage;;
|
|
49
58
|
esac
|
|
50
59
|
done
|
|
51
60
|
[ -n "$FIXTURE" ] && [ -n "$ARM" ] && [ -n "$RUN_ID" ] || usage
|
|
52
|
-
# iter-0019:
|
|
61
|
+
# iter-0019/0037: 3 smoke arms — variant (L2: Claude orchestrator + risk-probes pair path),
|
|
53
62
|
# solo_claude (L1: Claude orchestrator, codex blocked by shim+wrapper enforcement),
|
|
54
63
|
# bare (L0: direct claude -p, no skill, no codex).
|
|
55
64
|
# iter-0033c (Codex R0-infra adoption, 2026-05-02): two L2 diagnostic arms for /devlyn:resolve —
|
|
@@ -99,8 +108,21 @@ for f in "$META" "$EXPECTED" "$SPEC" "$TASK"; do
|
|
|
99
108
|
[ -f "$f" ] || { echo "fixture missing required file: $f (see SCHEMA.md)"; exit 1; }
|
|
100
109
|
done
|
|
101
110
|
|
|
102
|
-
TIMEOUT=$(python3 -
|
|
103
|
-
|
|
111
|
+
# Read timeout_seconds from the fixture metadata through the shared strict
# JSON loader; the embedded script exits with an error message unless the
# value is a positive integer (booleans are rejected explicitly).
TIMEOUT=$(python3 - "$META" "$BENCH_ROOT/scripts" <<'PY'
import pathlib
import sys

# Make the benchmark scripts directory importable for the contract helper.
sys.path.insert(0, sys.argv[2])
from pair_evidence_contract import loads_strict_json_object

# Decode explicitly as UTF-8 rather than relying on the locale default.
metadata = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text(encoding="utf-8"))
timeout = metadata.get("timeout_seconds")
if not isinstance(timeout, int) or isinstance(timeout, bool) or timeout <= 0:
    raise SystemExit("metadata timeout_seconds must be a positive integer")
print(timeout)
PY
)
|
|
125
|
+
if [ "$ARM" = "variant" ] || [ "$ARM" = "l2_risk_probes" ]; then
|
|
104
126
|
# This arm adds a bounded Codex probe-derive phase before IMPLEMENT and a
|
|
105
127
|
# bounded Codex pair-JUDGE during VERIFY. The full-pipeline gate still
|
|
106
128
|
# enforces wall-time efficiency by pair/solo ratio; this budget prevents a
|
|
@@ -119,19 +141,18 @@ WORK_DIR="/tmp/bench-${RUN_ID}-${FIXTURE}-${ARM}"
|
|
|
119
141
|
rm -rf "$WORK_DIR"
|
|
120
142
|
cp -R "$BENCH_ROOT/fixtures/test-repo" "$WORK_DIR"
|
|
121
143
|
|
|
122
|
-
# All skill-driven arms (variant / solo_claude / l2_gated /
|
|
123
|
-
# devlyn skills + project CLAUDE.md pre-baseline + codex shim
|
|
124
|
-
# wrapper. Bare gets nothing (no skill, no shim, no env).
|
|
144
|
+
# All skill-driven arms (variant / solo_claude / l2_gated / l2_risk_probes /
|
|
145
|
+
# l2_forced) get devlyn skills + project CLAUDE.md pre-baseline + codex shim
|
|
146
|
+
# + monitored wrapper. Bare gets nothing (no skill, no shim, no env).
|
|
125
147
|
#
|
|
126
148
|
# iter-0019: solo_claude (L1) shares variant-arm staging because the L1 arm
|
|
127
149
|
# runs the same orchestrator on the same skills — only difference is codex
|
|
128
150
|
# is blocked. Shim catches PATH resolution; wrapper catches direct-path
|
|
129
151
|
# invocations.
|
|
130
|
-
# iter-0033c (Codex R0-infra Q6):
|
|
131
|
-
# (codex unblocked, shim+wrapper routing).
|
|
132
|
-
#
|
|
133
|
-
#
|
|
134
|
-
# /devlyn:resolve VERIFY phase pulls Codex via the OTHER-engine rule.
|
|
152
|
+
# iter-0033c/0037 (Codex R0-infra Q6 + risk probes): pair arms share variant
|
|
153
|
+
# staging (codex unblocked, shim+wrapper routing). The smoke `variant` arm now
|
|
154
|
+
# follows the current measured risk-probes path rather than an older
|
|
155
|
+
# auto-engine implement route.
|
|
135
156
|
if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
136
157
|
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; then
|
|
137
158
|
mkdir -p "$WORK_DIR/.claude"
|
|
@@ -183,7 +204,7 @@ if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
|
183
204
|
# the bypass weapon. Across iter-0019 paid 5-fixture run the bypass
|
|
184
205
|
# was OPEN but never exercised; this closes the surface preemptively
|
|
185
206
|
# before iter-0020's 9-fixture L0/L1/L2 run.
|
|
186
|
-
# iter-0033c (Codex R0-infra Q5):
|
|
207
|
+
# iter-0033c/0037 (Codex R0-infra Q5 + risk probes): l2_* arms are codex-UNBLOCKED
|
|
187
208
|
# (codex must be reachable for VERIFY pair-JUDGE). They take the variant
|
|
188
209
|
# path: ARM_CODEX_BLOCKED=0 → python writer omits CODEX_BLOCKED from env
|
|
189
210
|
# entirely (the shim refuses on any non-empty value, so 0 ≠ unset).
|
|
@@ -209,11 +230,12 @@ if codex_blocked == "1":
|
|
|
209
230
|
# CODEX_BLOCKED enforcement gap.
|
|
210
231
|
env["CODEX_BLOCKED"] = "1"
|
|
211
232
|
else:
|
|
212
|
-
# variant
|
|
213
|
-
#
|
|
233
|
+
# variant / pair arms — codex routes through wrapper for risk-probe
|
|
234
|
+
# derivation and VERIFY pair-JUDGE; both vars are required by the
|
|
235
|
+
# shim/wrapper handshake.
|
|
214
236
|
env["CODEX_REAL_BIN"] = real_bin
|
|
215
237
|
env["CODEX_MONITORED_PATH"] = monitored
|
|
216
|
-
if arm
|
|
238
|
+
if arm in ("variant", "l2_risk_probes"):
|
|
217
239
|
# Risk-probe derivation is a bounded contract-conversion step. A long
|
|
218
240
|
# Codex run is a harness failure, not useful extra quality signal.
|
|
219
241
|
env["CODEX_MONITORED_TIMEOUT_SEC"] = "300"
|
|
@@ -273,9 +295,12 @@ fi
|
|
|
273
295
|
# files. Those commands still run in the post-run verifier below.
|
|
274
296
|
if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
275
297
|
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; then
|
|
276
|
-
python3 - "$EXPECTED" "$WORK_DIR/.devlyn/spec-verify.json" <<'PY'
|
|
277
|
-
import json, os, sys
|
|
278
|
-
|
|
298
|
+
python3 - "$EXPECTED" "$WORK_DIR/.devlyn/spec-verify.json" "$BENCH_ROOT/scripts" <<'PY'
|
|
299
|
+
import json, os, pathlib, sys
|
|
300
|
+
sys.path.insert(0, sys.argv[3])
|
|
301
|
+
from pair_evidence_contract import loads_strict_json_object
|
|
302
|
+
|
|
303
|
+
expected = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())
|
|
279
304
|
out_path = sys.argv[2]
|
|
280
305
|
visible_commands = [
|
|
281
306
|
cmd for cmd in expected.get("verification_commands", [])
|
|
@@ -301,11 +326,11 @@ fi
|
|
|
301
326
|
# 2. Spec-mode `/devlyn:resolve --spec <path>` for the rest (post iter-0034
|
|
302
327
|
# Phase 4 cutover the OLD `/devlyn:auto-resolve` route was deleted).
|
|
303
328
|
PROMPT_FILE="$RESULT_DIR/input.md"
|
|
304
|
-
# Variant uses
|
|
305
|
-
#
|
|
306
|
-
#
|
|
307
|
-
#
|
|
308
|
-
#
|
|
329
|
+
# Variant uses the current measured risk-probes pair path; solo_claude uses
|
|
330
|
+
# --engine claude explicitly so the orchestrator routes every implementation
|
|
331
|
+
# phase to Claude and never tries to invoke codex. The CODEX_BLOCKED shim
|
|
332
|
+
# enforces this at the binary layer if the orchestrator misroutes. Both arms
|
|
333
|
+
# pass the engine flag explicitly so they survive future runtime-default
|
|
309
334
|
# changes (post iter-0020 close-out: default flipped to claude).
|
|
310
335
|
if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
311
336
|
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; then
|
|
@@ -315,8 +340,8 @@ if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
|
315
340
|
ENGINE_PROMPT_HINT="Run with \`--engine claude\` for every phase. Codex must not be invoked — the harness has blocked it at the binary layer for this run."
|
|
316
341
|
;;
|
|
317
342
|
variant)
|
|
318
|
-
ENGINE_CLAUSE="--engine
|
|
319
|
-
ENGINE_PROMPT_HINT="Run with \`--engine
|
|
343
|
+
ENGINE_CLAUSE="--engine claude --risk-probes"
|
|
344
|
+
ENGINE_PROMPT_HINT="Run with \`--engine claude --risk-probes\` so the smoke L2 arm uses the current measured pair path: Claude implements, Codex derives bounded visible-verification probes and can act as VERIFY pair-JUDGE."
|
|
320
345
|
;;
|
|
321
346
|
l2_gated)
|
|
322
347
|
# NEW L2 with natural pair-mode triggers. Claude does IMPLEMENT;
|
|
@@ -484,7 +509,7 @@ else
|
|
|
484
509
|
# iter-0009 + iter-0019: prepend codex shim PATH for any arm that staged
|
|
485
510
|
# one. variant routes through codex-monitored.sh; solo_claude refuses on
|
|
486
511
|
# CODEX_BLOCKED=1; bare has no shim.
|
|
487
|
-
# iter-0033c (Codex R0-infra Q6):
|
|
512
|
+
# iter-0033c/0037 (Codex R0-infra Q6 + risk probes): l2_* arms ALSO need the shim
|
|
488
513
|
# PATH — they route Claude IMPLEMENT but Codex pair-JUDGE in VERIFY hits
|
|
489
514
|
# `codex exec` through the wrapper for starvation safety.
|
|
490
515
|
if { [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
@@ -652,10 +677,13 @@ fi
|
|
|
652
677
|
# Run verification commands + forbidden pattern scan + deps check. Uses
|
|
653
678
|
# the operator's real HOME (same as the arm saw). Fixtures that need HOME
|
|
654
679
|
# isolation override it inline per verification command.
|
|
655
|
-
python3 - "$EXPECTED" "$RESULT_DIR" "$WORK_DIR" <<'PY'
|
|
656
|
-
import json, os, re, subprocess, sys
|
|
680
|
+
python3 - "$EXPECTED" "$RESULT_DIR" "$WORK_DIR" "$BENCH_ROOT/scripts" <<'PY'
|
|
681
|
+
import json, os, pathlib, re, subprocess, sys
|
|
682
|
+
|
|
683
|
+
sys.path.insert(0, sys.argv[4])
|
|
684
|
+
from pair_evidence_contract import loads_strict_json_object
|
|
657
685
|
|
|
658
|
-
expected =
|
|
686
|
+
expected = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())
|
|
659
687
|
result_dir = sys.argv[2]
|
|
660
688
|
work = sys.argv[3]
|
|
661
689
|
|
|
@@ -771,12 +799,39 @@ for oracle_file in (
|
|
|
771
799
|
"oracle-scope-tier-b.json",
|
|
772
800
|
"oracle-test-fidelity.json",
|
|
773
801
|
):
|
|
802
|
+
oracle_path = os.path.join(result_dir, oracle_file)
|
|
774
803
|
try:
|
|
775
|
-
|
|
776
|
-
except
|
|
804
|
+
raw_oracle = loads_strict_json_object(pathlib.Path(oracle_path).read_text())
|
|
805
|
+
except (OSError, ValueError) as exc:
|
|
806
|
+
oracle_name = oracle_file.removesuffix(".json")
|
|
807
|
+
verify["oracle_findings"].append({
|
|
808
|
+
"oracle": oracle_name,
|
|
809
|
+
"type": "oracle-error",
|
|
810
|
+
"severity": "hard",
|
|
811
|
+
"verdict": "Deterministic oracle failed or emitted an invalid artifact",
|
|
812
|
+
"error": f"oracle artifact malformed or unreadable: {exc}",
|
|
813
|
+
})
|
|
814
|
+
verify["oracle_disqualifier"] = True
|
|
777
815
|
continue
|
|
816
|
+
data = raw_oracle
|
|
778
817
|
oracle_name = data.get("oracle") or oracle_file.removesuffix(".json")
|
|
779
|
-
|
|
818
|
+
if not isinstance(oracle_name, str) or not oracle_name:
|
|
819
|
+
oracle_name = oracle_file.removesuffix(".json")
|
|
820
|
+
oracle_error = data.get("error")
|
|
821
|
+
if isinstance(oracle_error, str) and oracle_error:
|
|
822
|
+
verify["oracle_findings"].append({
|
|
823
|
+
"oracle": oracle_name,
|
|
824
|
+
"type": "oracle-error",
|
|
825
|
+
"severity": "hard",
|
|
826
|
+
"verdict": "Deterministic oracle failed or emitted an invalid artifact",
|
|
827
|
+
"error": oracle_error,
|
|
828
|
+
})
|
|
829
|
+
verify["oracle_disqualifier"] = True
|
|
830
|
+
raw_findings = data.get("findings")
|
|
831
|
+
findings = raw_findings if isinstance(raw_findings, list) else []
|
|
832
|
+
for finding in findings:
|
|
833
|
+
if not isinstance(finding, dict):
|
|
834
|
+
continue
|
|
780
835
|
item = dict(finding)
|
|
781
836
|
item["oracle"] = oracle_name
|
|
782
837
|
verify["oracle_findings"].append(item)
|
|
@@ -796,12 +851,15 @@ PY
|
|
|
796
851
|
|
|
797
852
|
# Timing + aggregate
|
|
798
853
|
export INVOKE_EXIT WATCHDOG_FIRED
|
|
799
|
-
python3 - "$RESULT_DIR" "$FIXTURE" "$ARM" "$RUN_ID" "$T_END" "$ELAPSED" "$TIMEOUT" <<'PY'
|
|
800
|
-
import json, os, sys
|
|
854
|
+
python3 - "$RESULT_DIR" "$FIXTURE" "$ARM" "$RUN_ID" "$T_END" "$ELAPSED" "$TIMEOUT" "$BENCH_ROOT/scripts" <<'PY'
|
|
855
|
+
import json, os, pathlib, sys
|
|
801
856
|
result_dir, fixture, arm, run_id = sys.argv[1:5]
|
|
802
857
|
t_end, elapsed, timeout = int(sys.argv[5]), int(sys.argv[6]), int(sys.argv[7])
|
|
803
858
|
|
|
804
|
-
|
|
859
|
+
sys.path.insert(0, sys.argv[8])
|
|
860
|
+
from pair_evidence_contract import loads_strict_json_object
|
|
861
|
+
|
|
862
|
+
timing = loads_strict_json_object(pathlib.Path(result_dir, "timing.json").read_text())
|
|
805
863
|
timing["end_epoch"] = t_end
|
|
806
864
|
timing["elapsed_seconds"] = elapsed
|
|
807
865
|
timing["timeout_seconds"] = timeout
|
|
@@ -812,7 +870,10 @@ timing["timeout_seconds"] = timeout
|
|
|
812
870
|
timing["timed_out"] = os.environ.get("WATCHDOG_FIRED", "0") == "1"
|
|
813
871
|
json.dump(timing, open(os.path.join(result_dir, "timing.json"), "w"), indent=2)
|
|
814
872
|
|
|
815
|
-
|
|
873
|
+
def as_dict(value):
|
|
874
|
+
return value if isinstance(value, dict) else {}
|
|
875
|
+
|
|
876
|
+
verify = as_dict(loads_strict_json_object(pathlib.Path(result_dir, "verify.json").read_text()))
|
|
816
877
|
try:
|
|
817
878
|
with open(os.path.join(result_dir, "diff.patch")) as f: diff_size = len(f.read())
|
|
818
879
|
except Exception: diff_size = 0
|
|
@@ -825,15 +886,21 @@ except Exception:
|
|
|
825
886
|
state = {}
|
|
826
887
|
state_path = os.path.join(result_dir, "run-archive", "pipeline.state.json")
|
|
827
888
|
if os.path.isfile(state_path):
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
verify_phase = (
|
|
889
|
+
state = as_dict(loads_strict_json_object(pathlib.Path(state_path).read_text()))
|
|
890
|
+
phases = as_dict(state.get("phases"))
|
|
891
|
+
verify_phase = as_dict(phases.get("verify"))
|
|
892
|
+
legacy_verify = as_dict(state.get("verify"))
|
|
831
893
|
sub_verdicts = verify_phase.get("sub_verdicts")
|
|
832
|
-
pair_trigger = verify_phase.get("pair_trigger") or
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
)
|
|
894
|
+
pair_trigger = verify_phase.get("pair_trigger") or legacy_verify.get("pair_trigger")
|
|
895
|
+
PAIR_VERDICTS = {"PASS", "PASS_WITH_ISSUES", "NEEDS_WORK", "BLOCKED", "FAIL"}
|
|
896
|
+
|
|
897
|
+
def has_pair_judge_verdict(sub_verdicts):
|
|
898
|
+
return isinstance(sub_verdicts, dict) and (
|
|
899
|
+
sub_verdicts.get("judge_codex") in PAIR_VERDICTS
|
|
900
|
+
or sub_verdicts.get("pair_judge") in PAIR_VERDICTS
|
|
901
|
+
)
|
|
902
|
+
|
|
903
|
+
pair_mode = has_pair_judge_verdict(sub_verdicts) or verify_phase.get("pair_mode") is True
|
|
837
904
|
|
|
838
905
|
invoke_exit = int(os.environ.get("INVOKE_EXIT", "0"))
|
|
839
906
|
plugin_contamination = False
|
|
@@ -893,7 +960,7 @@ result = {
|
|
|
893
960
|
"invoke_exit": invoke_exit,
|
|
894
961
|
"invoke_failure": invoke_failure,
|
|
895
962
|
"invoke_failure_reason": invoke_failure_reason,
|
|
896
|
-
"terminal_verdict": (
|
|
963
|
+
"terminal_verdict": as_dict(phases.get("final_report")).get("verdict"),
|
|
897
964
|
"verify_verdict": verify_phase.get("verdict"),
|
|
898
965
|
"pair_trigger": pair_trigger,
|
|
899
966
|
"pair_mode": pair_mode,
|