devlyn-cli 2.1.0 → 2.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +1 -1
- package/benchmark/auto-resolve/README.md +321 -2
- package/benchmark/auto-resolve/RUBRIC.md +6 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +51 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +50 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +57 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +51 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +57 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +61 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +64 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +64 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +68 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +0 -3
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
- package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
- package/benchmark/auto-resolve/scripts/judge.sh +82 -3
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +0 -11
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +0 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +257 -43
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
- package/config/skills/_shared/archive_run.py +3 -0
- package/config/skills/_shared/codex-config.md +2 -2
- package/config/skills/_shared/codex-monitored.sh +72 -7
- package/config/skills/_shared/collect-codex-findings.py +125 -0
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/expected.schema.json +18 -0
- package/config/skills/_shared/spec-verify-check.py +363 -10
- package/config/skills/_shared/verify-merge-findings.py +327 -0
- package/config/skills/devlyn:resolve/SKILL.md +69 -8
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +183 -0
- package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
- package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
- package/package.json +1 -1
- package/scripts/lint-skills.sh +69 -20
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Fetch SWE-bench instances as JSONL without Hugging Face Python deps."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import argparse
|
|
7
|
+
import json
|
|
8
|
+
import sys
|
|
9
|
+
import urllib.parse
|
|
10
|
+
import urllib.request
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Hugging Face dataset ids for each supported SWE-bench flavor. "lite" is the
# default in main(); --dataset-id can override with an arbitrary dataset id.
DATASETS = {
    "lite": "princeton-nlp/SWE-bench_Lite",
    "verified": "princeton-nlp/SWE-bench_Verified",
    "full": "princeton-nlp/SWE-bench",
}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def fetch_rows(dataset: str, split: str, offset: int, length: int) -> dict[str, Any]:
    """Fetch one page of rows from the Hugging Face datasets-server API.

    Returns the decoded JSON payload for the requested window of *split*.
    """
    query = {
        "dataset": dataset,
        "config": "default",
        "split": split,
        "offset": offset,
        "length": length,
    }
    endpoint = "https://datasets-server.huggingface.co/rows"
    url = endpoint + "?" + urllib.parse.urlencode(query)
    # A 60s timeout guards against a hung connection to the datasets server.
    with urllib.request.urlopen(url, timeout=60) as response:
        return json.load(response)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def main() -> int:
    """Download SWE-bench rows page-by-page and write them as JSONL.

    Returns 0 on success; raises ValueError on bad flags or when requested
    --instance-id values are absent from the fetched split.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", choices=sorted(DATASETS), default="lite")
    parser.add_argument("--dataset-id", help="Override the Hugging Face dataset id.")
    parser.add_argument("--split", default="test")
    parser.add_argument("--limit", type=int, help="Fetch at most N rows.")
    parser.add_argument("--page-size", type=int, default=100)
    parser.add_argument("--instance-id", action="append", help="Keep only these instance ids.")
    parser.add_argument("--out", required=True, type=Path)
    args = parser.parse_args()

    if args.page_size <= 0:
        raise ValueError("--page-size must be > 0")
    # Explicit dataset id wins over the named flavor shortcut.
    dataset = args.dataset_id or DATASETS[args.dataset]
    keep = set(args.instance_id or [])
    rows: list[dict[str, Any]] = []
    offset = 0
    # Total row count in the split; learned from the first page response.
    total: int | None = None

    while True:
        # Shrink the page so we never request past --limit; a zero-sized
        # request means the limit is already satisfied.
        remaining = args.page_size
        if args.limit is not None:
            remaining = min(remaining, max(args.limit - len(rows), 0))
        if remaining == 0:
            break
        page = fetch_rows(dataset, args.split, offset, remaining)
        if total is None:
            total = int(page.get("num_rows_total") or 0)
        page_rows = page.get("rows") or []
        if not page_rows:
            break
        for wrapper in page_rows:
            # Each entry wraps the actual record under a "row" key.
            row = wrapper.get("row")
            if not isinstance(row, dict):
                continue
            instance_id = row.get("instance_id")
            # When --instance-id filters are given, drop everything else.
            if keep and instance_id not in keep:
                continue
            rows.append(row)
            if args.limit is not None and len(rows) >= args.limit:
                break
        # Advance by the fetched page size (not by kept rows) so paging
        # stays aligned with server-side offsets even while filtering.
        offset += len(page_rows)
        if offset >= total:
            break

    if keep:
        # Fail loudly if any explicitly requested id never appeared.
        found = {row.get("instance_id") for row in rows}
        missing = sorted(keep - found)
        if missing:
            raise ValueError(f"requested instance ids not found in fetched split: {', '.join(missing)}")
    args.out.parent.mkdir(parents=True, exist_ok=True)
    # One JSON object per line (JSONL); ensure_ascii=False keeps raw Unicode.
    with args.out.open("w", encoding="utf8") as f:
        for row in rows:
            f.write(json.dumps(row, ensure_ascii=False) + "\n")
    # Emit a machine-readable summary of what was written.
    print(
        json.dumps(
            {
                "dataset": dataset,
                "split": args.split,
                "rows_written": len(rows),
                "rows_total": total,
                "out": str(args.out),
            },
            indent=2,
        )
    )
    return 0
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
if __name__ == "__main__":
    # Report any failure as a one-line stderr message and exit 1. The
    # success-path SystemExit is raised outside the try so it is never
    # confused with an error.
    try:
        exit_code = main()
    except Exception as exc:
        print(f"fetch-swebench-instances: {exc}", file=sys.stderr)
        raise SystemExit(1)
    raise SystemExit(exit_code)
|
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Gate frozen VERIFY solo-vs-pair evidence.
|
|
3
|
+
|
|
4
|
+
This gate is intentionally narrower than headroom-gate.py. It does not claim
|
|
5
|
+
full-pipeline pair superiority. It verifies the leak-free thing we can measure:
|
|
6
|
+
given a fixed external diff, gated pair VERIFY fires and contributes a stricter
|
|
7
|
+
verdict-binding result. That can be either stricter than the separate solo arm
|
|
8
|
+
or stricter than the pair run's own primary judge, which avoids stochastic
|
|
9
|
+
solo-vs-pair confounding. Passing evidence must come from distinct fixture ids
|
|
10
|
+
with runner input metadata present.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import json
|
|
17
|
+
import re
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Strictness ordering for VERIFY verdicts: a higher rank is a stricter
# (worse) outcome. Unknown or missing verdicts rank below all of these.
VERDICT_RANK = {
    "PASS": 0,
    "PASS_WITH_ISSUES": 1,
    "NEEDS_WORK": 2,
    "BLOCKED": 3,
}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def load_compare(results_root: Path, run_id: str) -> dict[str, Any]:
    """Load <results_root>/<run_id>/compare.json, raising when it is absent."""
    compare_path = results_root / run_id / "compare.json"
    if compare_path.exists():
        with compare_path.open() as handle:
            return json.load(handle)
    raise FileNotFoundError(f"missing compare.json for {run_id}: {compare_path}")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def rank(verdict: str | None) -> int:
    """Return the strictness rank of *verdict*; None/unknown values rank -1."""
    key = verdict if verdict else ""
    return VERDICT_RANK.get(key, -1)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def elapsed_ratio(pair_elapsed: Any, solo_elapsed: Any) -> float | None:
|
|
43
|
+
if not isinstance(pair_elapsed, (int, float)) or not isinstance(solo_elapsed, (int, float)):
|
|
44
|
+
return None
|
|
45
|
+
if solo_elapsed <= 0:
|
|
46
|
+
return None
|
|
47
|
+
return pair_elapsed / solo_elapsed
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def infer_fixture_id(results_root: Path, run_id: str) -> str | None:
|
|
51
|
+
run_root = results_root / run_id
|
|
52
|
+
for arm in ("pair", "solo"):
|
|
53
|
+
input_path = run_root / arm / "input.md"
|
|
54
|
+
if not input_path.exists():
|
|
55
|
+
continue
|
|
56
|
+
match = re.search(r"docs/roadmap/phase-1/([^`\s]+)\.md", input_path.read_text())
|
|
57
|
+
if match:
|
|
58
|
+
return match.group(1)
|
|
59
|
+
return None
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def transcript_failure_reason(results_root: Path, run_id: str, arm: str) -> str | None:
|
|
63
|
+
transcript_path = results_root / run_id / arm / "transcript.txt"
|
|
64
|
+
if not transcript_path.is_file():
|
|
65
|
+
return None
|
|
66
|
+
transcript = transcript_path.read_text(encoding="utf8", errors="replace")
|
|
67
|
+
if "You've hit your limit" in transcript:
|
|
68
|
+
return "provider_limit"
|
|
69
|
+
return None
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def evaluate_run(
    results_root: Path,
    fixtures_root: Path,
    run_id: str,
    max_pair_solo_wall_ratio: float | None,
) -> dict[str, Any]:
    """Evaluate one solo-vs-pair run and return a gate row dict.

    A row has status "PASS" only when no failure condition fires: clean
    invocations, pair mode actually triggered, a stricter verdict-binding
    pair result (external or internal lift), an optional wall-ratio cap,
    and a resolvable fixture id under *fixtures_root*.
    """
    try:
        compare = load_compare(results_root, run_id)
    except FileNotFoundError as exc:
        # No compare.json: emit a fully-populated FAIL row so downstream
        # reporting never has to special-case missing keys.
        # NOTE(review): this early row omits the solo/pair_failure_reason
        # keys that the success path includes — confirm consumers use .get().
        fixture_id = infer_fixture_id(results_root, run_id)
        return {
            "run_id": run_id,
            "fixture_id": fixture_id,
            "status": "FAIL",
            "failures": [str(exc)],
            "solo_verdict": None,
            "pair_verdict": None,
            "pair_mode": False,
            "pair_trigger_missed": False,
            "pair_verdict_lift": False,
            "pair_internal_verdict_lift": False,
            "pair_primary_verdict": None,
            "pair_judge_verdict": None,
            "solo_elapsed_seconds": None,
            "pair_elapsed_seconds": None,
            "pair_solo_wall_ratio": None,
            "pair_severity_counts": {},
        }
    solo = compare.get("solo") or {}
    pair = compare.get("pair") or {}
    comparison = compare.get("comparison") or {}
    # Prefer the recorded invoke failure reason; fall back to scanning the
    # arm's transcript for known signatures.
    solo_failure_reason = solo.get("invoke_failure_reason") or transcript_failure_reason(
        results_root, run_id, "solo"
    )
    pair_failure_reason = pair.get("invoke_failure_reason") or transcript_failure_reason(
        results_root, run_id, "pair"
    )

    # Accumulate every violated condition; an empty list means PASS.
    failures: list[str] = []
    if solo.get("timed_out"):
        failures.append("solo timed out")
    if pair.get("timed_out"):
        failures.append("pair timed out")
    if solo_failure_reason == "provider_limit":
        failures.append("solo provider limit")
    if pair_failure_reason == "provider_limit":
        failures.append("pair provider limit")
    if solo.get("invoke_exit") != 0:
        failures.append(f"solo invoke_exit={solo.get('invoke_exit')}")
    if pair.get("invoke_exit") != 0:
        failures.append(f"pair invoke_exit={pair.get('invoke_exit')}")
    if not pair.get("pair_mode"):
        failures.append("pair_mode false")
    if comparison.get("pair_trigger_missed"):
        failures.append("pair trigger missed")
    # Either lift flavor counts: stricter than the solo arm (external) or
    # stricter than the pair run's own primary judge (internal).
    external_lift = bool(comparison.get("pair_verdict_lift"))
    internal_lift = bool(comparison.get("pair_internal_verdict_lift"))
    if not (external_lift or internal_lift):
        failures.append("pair verdict lift false")

    # Verdicts fall back through successively less specific fields.
    solo_verdict = (
        comparison.get("solo_verdict")
        or solo.get("verify_verdict")
        or solo.get("terminal_verdict")
    )
    pair_verdict = (
        comparison.get("pair_verdict")
        or pair.get("verify_verdict")
        or pair.get("terminal_verdict")
    )
    pair_primary_verdict = comparison.get("pair_primary_verdict")
    pair_judge_verdict = comparison.get("pair_judge_verdict")
    # A claimed lift must be backed by an actually stricter verdict.
    if external_lift and rank(pair_verdict) <= rank(solo_verdict):
        failures.append(f"pair verdict {pair_verdict} not stricter than solo {solo_verdict}")
    if internal_lift and rank(pair_judge_verdict) <= rank(pair_primary_verdict):
        failures.append(
            f"pair_judge verdict {pair_judge_verdict} not stricter than primary {pair_primary_verdict}"
        )
    # Verdict-binding means at least NEEDS_WORK severity.
    if rank(pair_verdict) < VERDICT_RANK["NEEDS_WORK"]:
        failures.append(f"pair verdict {pair_verdict} is not verdict-binding")
    pair_elapsed = pair.get("elapsed_seconds")
    solo_elapsed = solo.get("elapsed_seconds")
    wall_ratio = elapsed_ratio(pair_elapsed, solo_elapsed)
    # Optional efficiency cap: when set, the ratio must exist and be under it.
    if max_pair_solo_wall_ratio is not None:
        if wall_ratio is None:
            failures.append("pair/solo wall ratio missing")
        elif wall_ratio > max_pair_solo_wall_ratio:
            failures.append(
                f"pair/solo wall ratio {wall_ratio:.2f} exceeds {max_pair_solo_wall_ratio:.2f}"
            )
    fixture_id = infer_fixture_id(results_root, run_id)
    if not fixture_id:
        failures.append("fixture_id missing")
    elif not (fixtures_root / fixture_id).is_dir():
        failures.append(f"fixture_id not found: {fixture_id}")

    return {
        "run_id": run_id,
        "fixture_id": fixture_id,
        "status": "PASS" if not failures else "FAIL",
        "failures": failures,
        "solo_verdict": solo_verdict,
        "pair_verdict": pair_verdict,
        "pair_mode": bool(pair.get("pair_mode")),
        "pair_trigger_missed": bool(comparison.get("pair_trigger_missed")),
        "pair_verdict_lift": external_lift,
        "pair_internal_verdict_lift": internal_lift,
        "pair_primary_verdict": pair_primary_verdict,
        "pair_judge_verdict": pair_judge_verdict,
        "solo_elapsed_seconds": solo_elapsed,
        "pair_elapsed_seconds": pair_elapsed,
        "pair_solo_wall_ratio": wall_ratio,
        "solo_failure_reason": solo_failure_reason,
        "pair_failure_reason": pair_failure_reason,
        "pair_severity_counts": pair.get("severity_counts") or {},
    }
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def format_ratio(value: Any) -> str:
    """Render a numeric ratio as e.g. '1.25x'; non-numeric values become 'n/a'."""
    if not isinstance(value, (int, float)):
        return "n/a"
    return f"{value:.2f}x"
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def write_markdown(path: Path, report: dict[str, Any]) -> None:
    """Write the gate report to *path* as a Markdown summary with a per-run table."""
    # Header, overall verdict, the gating rule, thresholds, then table header.
    lines = [
        f"# Frozen VERIFY Gate — {report['run_ids_label']}",
        "",
        f"Verdict: **{report['verdict']}**",
        "",
        "Rule: every supplied run must be clean, each run must cover a distinct fixture, "
        "gated pair VERIFY must fire, and pair must contribute a stricter "
        "verdict-binding result than either the separate solo arm or the pair "
        "run's own primary judge.",
        "",
        f"Minimum passing runs: {report['min_runs']}",
        f"Max pair/solo wall ratio: {format_ratio(report.get('max_pair_solo_wall_ratio'))}",
        f"Average pair/solo wall ratio: {format_ratio(report.get('avg_pair_solo_wall_ratio'))}",
        "",
        "| Run | Fixture | Solo | Pair | Pair mode | Wall ratio | External lift | Internal lift | Status | Reason |",
        "|---|---|---|---|---|---|---|---|---|---|",
    ]
    for row in report["rows"]:
        # Join all failure strings into one cell; "ok" marks a clean row.
        reason = "; ".join(row["failures"]) if row["failures"] else "ok"
        lines.append(
            f"| {row['run_id']} | {row.get('fixture_id') or 'unknown'} | "
            f"{row['solo_verdict']} | {row['pair_verdict']} | "
            f"{str(row['pair_mode']).lower()} | {format_ratio(row.get('pair_solo_wall_ratio'))} | "
            f"{str(row['pair_verdict_lift']).lower()} | "
            f"{str(row['pair_internal_verdict_lift']).lower()} | "
            f"{row['status']} | {reason} |"
        )
    # Trailing blank line so the file ends with a newline.
    lines.append("")
    path.write_text("\n".join(lines), encoding="utf8")
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def main() -> int:
    """Run the frozen VERIFY gate over the supplied run ids.

    Returns 0 when the overall verdict is PASS, 1 otherwise. Optionally
    writes the report as JSON (--out-json) and Markdown (--out-md).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--results-root", default="benchmark/auto-resolve/results")
    parser.add_argument("--fixtures-root", default="benchmark/auto-resolve/fixtures")
    parser.add_argument("--run-id", action="append", required=True)
    parser.add_argument("--min-runs", type=int, default=2)
    parser.add_argument(
        "--max-pair-solo-wall-ratio",
        type=float,
        help="Optional efficiency cap. When set, every run must include elapsed_seconds and pair/solo wall ratio must not exceed this value.",
    )
    parser.add_argument("--out-json")
    parser.add_argument("--out-md")
    args = parser.parse_args()

    results_root = Path(args.results_root)
    fixtures_root = Path(args.fixtures_root)
    rows = [
        evaluate_run(results_root, fixtures_root, run_id, args.max_pair_solo_wall_ratio)
        for run_id in args.run_id
    ]
    # Count how many runs claim each fixture so duplicates can be rejected:
    # passing evidence must come from distinct fixtures.
    fixture_counts: dict[str, int] = {}
    for row in rows:
        fixture_id = row.get("fixture_id")
        if fixture_id:
            fixture_counts[fixture_id] = fixture_counts.get(fixture_id, 0) + 1
    for row in rows:
        fixture_id = row.get("fixture_id")
        if fixture_id and fixture_counts.get(fixture_id, 0) > 1:
            # Every run sharing the fixture is failed, not just the second.
            row["failures"].append(f"duplicate fixture_id={fixture_id}")
            row["status"] = "FAIL"
    passing = [row for row in rows if row["status"] == "PASS"]
    # PASS requires both enough runs and that every supplied run passed.
    verdict = "PASS" if len(passing) >= args.min_runs and len(passing) == len(rows) else "FAIL"
    ratios = [
        row["pair_solo_wall_ratio"]
        for row in rows
        if isinstance(row.get("pair_solo_wall_ratio"), (int, float))
    ]

    report = {
        "run_ids_label": ", ".join(args.run_id),
        "rule": "clean frozen diff; distinct fixture per run; gated pair VERIFY fires; pair contributes a stricter verdict-binding result; optional pair/solo wall-ratio cap",
        "min_runs": args.min_runs,
        "max_pair_solo_wall_ratio": args.max_pair_solo_wall_ratio,
        "avg_pair_solo_wall_ratio": (sum(ratios) / len(ratios)) if ratios else None,
        "verdict": verdict,
        "runs_total": len(rows),
        "runs_passed": len(passing),
        "rows": rows,
    }

    if args.out_json:
        Path(args.out_json).write_text(json.dumps(report, indent=2) + "\n", encoding="utf8")
    if args.out_md:
        write_markdown(Path(args.out_md), report)

    # Always echo the full report to stdout for log capture.
    print(json.dumps(report, indent=2))
    return 0 if verdict == "PASS" else 1
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
# Script entry point: exit status mirrors the gate verdict (0 PASS, 1 FAIL).
if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Gate full-pipeline L2/pair evidence against L1 solo.
|
|
3
|
+
|
|
4
|
+
This is stricter than headroom-gate.py. Headroom only says a candidate set is
|
|
5
|
+
worth measuring. This gate says the measured L2 arm is usable evidence:
|
|
6
|
+
bare and solo leave headroom, l2_gated is clean, gated pair actually fired, and
|
|
7
|
+
the blind judge scores l2_gated materially above solo_claude.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import argparse
|
|
12
|
+
import json
|
|
13
|
+
import pathlib
|
|
14
|
+
import sys
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def load_json(path: pathlib.Path) -> dict[str, Any] | None:
|
|
19
|
+
if not path.is_file():
|
|
20
|
+
return None
|
|
21
|
+
return json.loads(path.read_text())
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def score_for(judge: dict[str, Any], arm: str) -> int | None:
|
|
25
|
+
value = (judge.get("scores_by_arm") or {}).get(arm)
|
|
26
|
+
return value if isinstance(value, int) else None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def clean_failures(fixture_dir: pathlib.Path, judge: dict[str, Any], arm: str) -> list[str]:
    """Collect cleanliness violations for one arm of a fixture run.

    Checks for missing artifacts, judge/result/verify disqualifiers,
    timeouts, and invoke failures; an empty list means the arm is clean.
    """
    failures: list[str] = []
    result = load_json(fixture_dir / arm / "result.json")
    verify = load_json(fixture_dir / arm / "verify.json")
    if result is None:
        failures.append(f"{arm} result.json missing")
    if verify is None:
        failures.append(f"{arm} verify.json missing")

    # The blind judge may disqualify an arm independently of its own result.
    dq_by_arm = judge.get("disqualifiers_by_arm") or {}
    if bool((dq_by_arm.get(arm) or {}).get("disqualifier")):
        failures.append(f"{arm} judge disqualifier")
    if result is not None:
        if bool(result.get("disqualifier")):
            failures.append(f"{arm} result disqualifier")
        if bool(result.get("timed_out")):
            failures.append(f"{arm} timed out")
        if bool(result.get("invoke_failure")):
            # Include the recorded reason when one exists and is non-empty.
            reason = result.get("invoke_failure_reason")
            if isinstance(reason, str) and reason:
                failures.append(f"{arm} invoke failure ({reason})")
            else:
                failures.append(f"{arm} invoke failure")
    if verify is not None and bool(verify.get("disqualifier")):
        failures.append(f"{arm} verify disqualifier")
    return failures
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def elapsed_ratio(pair_result: dict[str, Any] | None, solo_result: dict[str, Any] | None) -> float | None:
|
|
58
|
+
if pair_result is None or solo_result is None:
|
|
59
|
+
return None
|
|
60
|
+
pair_elapsed = pair_result.get("elapsed_seconds")
|
|
61
|
+
solo_elapsed = solo_result.get("elapsed_seconds")
|
|
62
|
+
if not isinstance(pair_elapsed, (int, float)) or not isinstance(solo_elapsed, (int, float)):
|
|
63
|
+
return None
|
|
64
|
+
if solo_elapsed <= 0:
|
|
65
|
+
return None
|
|
66
|
+
return pair_elapsed / solo_elapsed
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def provider_limited(result: dict[str, Any] | None) -> bool:
|
|
70
|
+
return result is not None and result.get("invoke_failure_reason") == "provider_limit"
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def evaluate_fixture(
    fixture_dir: pathlib.Path,
    *,
    pair_arm: str,
    bare_max: int,
    solo_max: int,
    min_pair_margin: int,
    max_pair_solo_wall_ratio: float | None,
) -> dict[str, Any]:
    """Evaluate one fixture directory against the pair-gate rule.

    Reads ``judge.json`` plus the per-arm ``result.json`` files under
    *fixture_dir* and returns a row dict with the scores, the pair-vs-solo
    margin, the wall-clock ratio, and a "PASS"/"FAIL" status whose ``reason``
    is a "; "-joined list of every failed check.

    A pair run that was provider-rate-limited (see provider_limited) is
    excused from the score/margin, pair_mode, and wall-ratio checks.
    """
    judge = load_json(fixture_dir / "judge.json")
    if judge is None:
        # No judge output means no scores at all; fail the fixture outright.
        return {
            "fixture": fixture_dir.name,
            "status": "FAIL",
            "reason": "judge.json missing",
        }

    # Scores for the three arms; score_for may return None when absent.
    bare = score_for(judge, "bare")
    solo = score_for(judge, "solo_claude")
    pair = score_for(judge, pair_arm)
    solo_result = load_json(fixture_dir / "solo_claude" / "result.json")
    pair_result = load_json(fixture_dir / pair_arm / "result.json")
    ratio = elapsed_ratio(pair_result, solo_result)
    pair_provider_limited = provider_limited(pair_result)
    if pair_provider_limited:
        # A rate-limited pair run's timing is meaningless; drop the ratio.
        ratio = None

    reasons: list[str] = []
    # "Headroom" checks: the unassisted arms must score low enough that the
    # pair arm has room to demonstrate an improvement.
    if bare is None:
        reasons.append("bare score missing")
    elif bare > bare_max:
        reasons.append(f"bare score {bare} > {bare_max}")
    if solo is None:
        reasons.append("solo_claude score missing")
    elif solo > solo_max:
        reasons.append(f"solo_claude score {solo} > {solo_max}")
    if pair_provider_limited:
        # Excused: no score/margin requirement when the provider throttled us.
        pass
    elif pair is None:
        reasons.append(f"{pair_arm} score missing")
    elif solo is not None and pair - solo < min_pair_margin:
        reasons.append(f"{pair_arm} margin {pair - solo:+d} < +{min_pair_margin}")

    # Disqualifier / timeout / invoke-failure checks for every arm.
    reasons.extend(clean_failures(fixture_dir, judge, "bare"))
    reasons.extend(clean_failures(fixture_dir, judge, "solo_claude"))
    reasons.extend(clean_failures(fixture_dir, judge, pair_arm))

    # The pair arm must have actually run in pair mode (strict identity check
    # against True, so truthy-but-wrong values like 1 or "yes" still fail).
    pair_mode = None if pair_result is None else pair_result.get("pair_mode")
    if pair_mode is not True and not pair_provider_limited:
        reasons.append(f"{pair_arm} pair_mode not true")

    # Optional wall-clock budget: pair must not be too much slower than solo.
    if max_pair_solo_wall_ratio is not None and not pair_provider_limited:
        if ratio is None:
            reasons.append("pair/solo wall ratio missing")
        elif ratio > max_pair_solo_wall_ratio:
            reasons.append(f"pair/solo wall ratio {ratio:.2f} > {max_pair_solo_wall_ratio:.2f}")

    return {
        "fixture": fixture_dir.name,
        "status": "PASS" if not reasons else "FAIL",
        "bare_score": bare,
        "solo_score": solo,
        "pair_score": pair,
        # Margin is None when excused (provider limit) or either score is missing.
        "pair_margin": (
            None if pair_provider_limited
            else pair - solo if isinstance(pair, int) and isinstance(solo, int)
            else None
        ),
        "pair_mode": pair_mode,
        "pair_solo_wall_ratio": ratio,
        "reason": "; ".join(reasons),
    }
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def fmt_ratio(value: Any) -> str:
    """Format a numeric ratio as e.g. ``1.25x``; anything non-numeric becomes ``n/a``."""
    if isinstance(value, (int, float)):
        return f"{value:.2f}x"
    return "n/a"
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def write_md(path: pathlib.Path, report: dict[str, Any]) -> None:
    """Render *report* as a Markdown gate summary and write it to *path*."""
    header = [
        f"# Full-Pipeline Pair Gate - {report['run_id']}",
        "",
        f"Verdict: **{report['verdict']}**",
        "",
        f"Rule: at least {report['min_fixtures']} fixtures; bare <= {report['bare_max']}; "
        f"solo_claude <= {report['solo_max']}; {report['pair_arm']} clean; pair_mode true; "
        f"{report['pair_arm']} - solo_claude >= {report['min_pair_margin']}.",
        f"Max pair/solo wall ratio: {fmt_ratio(report['max_pair_solo_wall_ratio'])}",
        f"Average pair/solo wall ratio: {fmt_ratio(report['avg_pair_solo_wall_ratio'])}",
        "",
        "| Fixture | Bare | Solo | Pair | Margin | Pair mode | Wall ratio | Status | Reason |",
        "|---|---:|---:|---:|---:|---|---:|---|---|",
    ]
    body: list[str] = []
    for row in report["rows"]:
        margin = row.get("pair_margin")
        margin_cell = f"{margin:+d}" if isinstance(margin, int) else "n/a"
        # str() on each cell matches f-string interpolation (None -> "None").
        cells = (
            row["fixture"],
            row.get("bare_score"),
            row.get("solo_score"),
            row.get("pair_score"),
            margin_cell,
            str(row.get("pair_mode")).lower(),
            fmt_ratio(row.get("pair_solo_wall_ratio")),
            row["status"],
            row.get("reason", ""),
        )
        body.append("| " + " | ".join(str(cell) for cell in cells) + " |")
    # Trailing "" keeps the file newline-terminated, as the original did.
    path.write_text("\n".join(header + body + [""]), encoding="utf8")
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def positive_float(value: str) -> float:
    """argparse ``type`` callable: parse *value* as a strictly positive float.

    Raises argparse.ArgumentTypeError when the parsed value is not > 0.
    Uses ``not parsed > 0`` rather than ``parsed <= 0`` so that NaN is
    rejected too (every comparison with NaN is false, so ``nan <= 0`` would
    have let ``--max-pair-solo-wall-ratio nan`` slip through as "positive").
    A plain ValueError from float() is left for argparse to report.
    """
    parsed = float(value)
    if not parsed > 0:
        raise argparse.ArgumentTypeError("value must be > 0")
    return parsed
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def main() -> int:
    """CLI entry point: evaluate every fixture in a run and emit the gate report.

    Returns 0 when the overall verdict is PASS, 1 when it is FAIL, and 2
    when the results directory for the requested run does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--run-id", required=True)
    parser.add_argument("--results-root", default="benchmark/auto-resolve/results", type=pathlib.Path)
    parser.add_argument("--bare-max", type=int, default=60)
    parser.add_argument("--solo-max", type=int, default=80)
    parser.add_argument("--min-pair-margin", type=int, default=5)
    parser.add_argument("--min-fixtures", type=int, default=2)
    parser.add_argument("--pair-arm", default="l2_gated")
    # Optional timing budget; validated > 0 by positive_float.
    parser.add_argument("--max-pair-solo-wall-ratio", type=positive_float)
    parser.add_argument("--out-json", type=pathlib.Path)
    parser.add_argument("--out-md", type=pathlib.Path)
    args = parser.parse_args()

    run_root = args.results_root / args.run_id
    if not run_root.is_dir():
        print(f"no results dir: {run_root}", file=sys.stderr)
        return 2

    # One row per fixture subdirectory, in sorted (deterministic) order.
    rows = [
        evaluate_fixture(
            fixture_dir,
            pair_arm=args.pair_arm,
            bare_max=args.bare_max,
            solo_max=args.solo_max,
            min_pair_margin=args.min_pair_margin,
            max_pair_solo_wall_ratio=args.max_pair_solo_wall_ratio,
        )
        for fixture_dir in sorted(p for p in run_root.iterdir() if p.is_dir())
    ]
    pass_count = sum(1 for row in rows if row["status"] == "PASS")
    fixture_count_ok = len(rows) >= args.min_fixtures
    # PASS requires: at least one fixture, enough fixtures, and all of them passing.
    verdict = "PASS" if rows and fixture_count_ok and pass_count == len(rows) else "FAIL"
    # Only numeric ratios contribute to the average (missing/excused ones are skipped).
    ratios = [
        row["pair_solo_wall_ratio"]
        for row in rows
        if isinstance(row.get("pair_solo_wall_ratio"), (int, float))
    ]
    report = {
        "run_id": args.run_id,
        "rule": "headroom candidates only; l2_gated must be clean, pair_mode true, and beat solo_claude by the configured margin",
        "verdict": verdict,
        "fixtures_total": len(rows),
        "fixtures_passed": pass_count,
        "min_fixtures": args.min_fixtures,
        "fixture_count_ok": fixture_count_ok,
        "bare_max": args.bare_max,
        "solo_max": args.solo_max,
        "min_pair_margin": args.min_pair_margin,
        "pair_arm": args.pair_arm,
        "max_pair_solo_wall_ratio": args.max_pair_solo_wall_ratio,
        "avg_pair_solo_wall_ratio": (sum(ratios) / len(ratios)) if ratios else None,
        "rows": rows,
    }

    # With --out-json/--out-md the report goes to files; otherwise to stdout.
    if args.out_json:
        args.out_json.write_text(json.dumps(report, indent=2) + "\n", encoding="utf8")
    if args.out_md:
        write_md(args.out_md, report)
    else:
        print(json.dumps(report, indent=2))
    return 0 if verdict == "PASS" else 1
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
if __name__ == "__main__":
    # Propagate the gate verdict (0/1/2) as the process exit status.
    raise SystemExit(main())
|