devlyn-cli 2.0.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. package/CLAUDE.md +1 -1
  2. package/README.md +1 -1
  3. package/benchmark/auto-resolve/README.md +318 -2
  4. package/benchmark/auto-resolve/RUBRIC.md +6 -0
  5. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
  6. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
  7. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
  8. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +52 -0
  10. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
  11. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
  12. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
  13. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
  14. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
  15. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
  16. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
  17. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +51 -0
  18. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
  19. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
  20. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
  21. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
  22. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
  23. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +58 -0
  24. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
  25. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
  26. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
  27. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
  28. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
  29. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
  30. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +52 -0
  31. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
  32. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
  33. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
  34. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
  35. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
  36. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +58 -0
  37. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
  38. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
  39. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
  40. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
  41. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
  42. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
  43. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
  44. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
  45. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +62 -0
  46. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
  47. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
  48. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
  49. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
  50. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
  51. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
  52. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
  53. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +65 -0
  54. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
  55. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
  56. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
  57. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
  58. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
  59. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
  60. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
  61. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +71 -0
  62. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
  63. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
  64. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
  65. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
  66. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
  67. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
  68. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
  69. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +65 -0
  70. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
  71. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
  72. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
  73. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
  74. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
  75. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
  76. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
  77. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
  78. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +69 -0
  79. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
  80. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
  81. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
  82. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
  83. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/NOTES.md +24 -0
  84. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/expected.json +66 -0
  85. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/metadata.json +10 -0
  86. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/setup.sh +22 -0
  87. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/spec.md +62 -0
  88. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/task.txt +9 -0
  89. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/exact-success.js +48 -0
  90. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/insufficient-balance.js +36 -0
  91. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/rules-source.js +55 -0
  92. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/NOTES.md +20 -0
  93. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/expected.json +66 -0
  94. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/metadata.json +10 -0
  95. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/setup.sh +23 -0
  96. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/spec.md +66 -0
  97. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/task.txt +11 -0
  98. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/exact-success.js +44 -0
  99. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/rules-source.js +58 -0
  100. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/unavailable-inventory.js +35 -0
  101. package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
  102. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
  103. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
  104. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
  105. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
  106. package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
  107. package/benchmark/auto-resolve/scripts/judge.sh +82 -3
  108. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
  109. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
  110. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
  111. package/benchmark/auto-resolve/scripts/run-fixture.sh +234 -40
  112. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
  113. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
  114. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
  115. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
  116. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
  117. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
  118. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
  119. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
  120. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
  121. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
  122. package/bin/devlyn.js +56 -10
  123. package/config/skills/_shared/archive_run.py +3 -0
  124. package/config/skills/_shared/codex-config.md +2 -2
  125. package/config/skills/_shared/codex-monitored.sh +72 -7
  126. package/config/skills/_shared/collect-codex-findings.py +125 -0
  127. package/config/skills/_shared/engine-preflight.md +1 -1
  128. package/config/skills/_shared/expected.schema.json +18 -0
  129. package/config/skills/_shared/spec-verify-check.py +312 -10
  130. package/config/skills/_shared/verify-merge-findings.py +327 -0
  131. package/config/skills/devlyn:ideate/SKILL.md +1 -1
  132. package/config/skills/devlyn:resolve/SKILL.md +62 -8
  133. package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
  134. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +164 -0
  135. package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
  136. package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
  137. package/package.json +1 -1
  138. package/scripts/lint-skills.sh +32 -0
@@ -0,0 +1,111 @@
1
+ #!/usr/bin/env python3
2
+ """Fetch SWE-bench instances as JSONL without Hugging Face Python deps."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import sys
9
+ import urllib.parse
10
+ import urllib.request
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+
15
# Short dataset aliases mapped to their Hugging Face dataset ids.
DATASETS = dict(
    lite="princeton-nlp/SWE-bench_Lite",
    verified="princeton-nlp/SWE-bench_Verified",
    full="princeton-nlp/SWE-bench",
)
20
+
21
+
22
def fetch_rows(dataset: str, split: str, offset: int, length: int) -> dict[str, Any]:
    """Fetch one page of rows from the Hugging Face datasets-server.

    Args:
        dataset: Hugging Face dataset id.
        split: Split name to read.
        offset: Index of the first row to request.
        length: Number of rows to request.

    Returns:
        The decoded JSON body of the ``/rows`` response.
    """
    query = {
        "dataset": dataset,
        "config": "default",
        "split": split,
        "offset": offset,
        "length": length,
    }
    encoded_query = urllib.parse.urlencode(query)
    url = f"https://datasets-server.huggingface.co/rows?{encoded_query}"
    with urllib.request.urlopen(url, timeout=60) as response:
        payload = json.load(response)
    return payload
35
+
36
+
37
def main() -> int:
    """Fetch SWE-bench rows page by page and write them to ``--out`` as JSONL.

    Rows are requested through ``fetch_rows`` in pages of ``--page-size``.
    ``--limit`` caps the number of rows kept, and ``--instance-id`` (repeatable)
    keeps only rows whose ``instance_id`` was requested. A JSON summary of the
    run is printed to stdout.

    Returns:
        0 on success.

    Raises:
        ValueError: if ``--page-size`` is not positive, or if a requested
            instance id was not found in the fetched rows.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", choices=sorted(DATASETS), default="lite")
    parser.add_argument("--dataset-id", help="Override the Hugging Face dataset id.")
    parser.add_argument("--split", default="test")
    parser.add_argument("--limit", type=int, help="Fetch at most N rows.")
    parser.add_argument("--page-size", type=int, default=100)
    parser.add_argument("--instance-id", action="append", help="Keep only these instance ids.")
    parser.add_argument("--out", required=True, type=Path)
    args = parser.parse_args()

    if args.page_size <= 0:
        raise ValueError("--page-size must be > 0")
    dataset = args.dataset_id or DATASETS[args.dataset]
    keep = set(args.instance_id or [])
    # Requested ids not seen yet; lets the loop stop as soon as all are found.
    pending = set(keep)
    rows: list[dict[str, Any]] = []
    offset = 0
    total: int | None = None

    while True:
        remaining = args.page_size
        if args.limit is not None:
            remaining = min(remaining, max(args.limit - len(rows), 0))
        if remaining == 0:
            break
        page = fetch_rows(dataset, args.split, offset, remaining)
        if total is None:
            total = int(page.get("num_rows_total") or 0)
        page_rows = page.get("rows") or []
        if not page_rows:
            break
        for wrapper in page_rows:
            # Skip malformed entries instead of crashing on a non-dict wrapper.
            row = wrapper.get("row") if isinstance(wrapper, dict) else None
            if not isinstance(row, dict):
                continue
            instance_id = row.get("instance_id")
            if keep and instance_id not in keep:
                continue
            rows.append(row)
            pending.discard(instance_id)
            if args.limit is not None and len(rows) >= args.limit:
                break
        if keep and not pending:
            # Every requested instance id has been collected; no need to
            # download the rest of the split.
            break
        offset += len(page_rows)
        # Only trust the reported total when it is a positive number; if it is
        # missing or zero, keep paging until the server returns an empty page.
        if total and offset >= total:
            break

    if keep:
        found = {row.get("instance_id") for row in rows}
        missing = sorted(keep - found)
        if missing:
            raise ValueError(f"requested instance ids not found in fetched split: {', '.join(missing)}")
    args.out.parent.mkdir(parents=True, exist_ok=True)
    with args.out.open("w", encoding="utf8") as f:
        for row in rows:
            f.write(json.dumps(row, ensure_ascii=False) + "\n")
    print(
        json.dumps(
            {
                "dataset": dataset,
                "split": args.split,
                "rows_written": len(rows),
                "rows_total": total,
                "out": str(args.out),
            },
            indent=2,
        )
    )
    return 0
104
+
105
+
106
if __name__ == "__main__":
    # Any failure is reported as a single line on stderr with exit status 1.
    try:
        status = main()
    except Exception as exc:
        print(f"fetch-swebench-instances: {exc}", file=sys.stderr)
        raise SystemExit(1)
    raise SystemExit(status)
@@ -0,0 +1,289 @@
1
+ #!/usr/bin/env python3
2
+ """Gate frozen VERIFY solo-vs-pair evidence.
3
+
4
+ This gate is intentionally narrower than headroom-gate.py. It does not claim
5
+ full-pipeline pair superiority. It verifies the leak-free thing we can measure:
6
+ given a fixed external diff, gated pair VERIFY fires and contributes a stricter
7
+ verdict-binding result. That can be either stricter than the separate solo arm
8
+ or stricter than the pair run's own primary judge, which avoids stochastic
9
+ solo-vs-pair confounding. Passing evidence must come from distinct fixture ids
10
+ with runner input metadata present.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import argparse
16
+ import json
17
+ import re
18
+ from pathlib import Path
19
+ from typing import Any
20
+
21
+
22
# Verdict names ordered by rank; a higher rank is treated as a stricter verdict.
VERDICT_RANK = {
    name: level
    for level, name in enumerate(("PASS", "PASS_WITH_ISSUES", "NEEDS_WORK", "BLOCKED"))
}
28
+
29
+
30
def load_compare(results_root: Path, run_id: str) -> dict[str, Any]:
    """Load ``<results_root>/<run_id>/compare.json``.

    Args:
        results_root: Directory that contains one sub-directory per run.
        run_id: Name of the run directory.

    Returns:
        The parsed JSON content of the run's ``compare.json``.

    Raises:
        FileNotFoundError: if ``compare.json`` does not exist for the run.
    """
    compare_path = results_root / run_id / "compare.json"
    if not compare_path.exists():
        raise FileNotFoundError(f"missing compare.json for {run_id}: {compare_path}")
    # Decode explicitly as UTF-8 so parsing does not depend on the locale's
    # default encoding.
    with compare_path.open(encoding="utf8") as f:
        return json.load(f)
36
+
37
+
38
def rank(verdict: str | None) -> int:
    """Return the ``VERDICT_RANK`` value for ``verdict``, or -1 if it is empty or unknown."""
    key = verdict if verdict else ""
    return VERDICT_RANK.get(key, -1)
40
+
41
+
42
def elapsed_ratio(pair_elapsed: Any, solo_elapsed: Any) -> float | None:
    """Return ``pair_elapsed / solo_elapsed``, or ``None`` when it is not meaningful.

    ``None`` is returned when either value is not a real number (booleans are
    rejected even though ``bool`` subclasses ``int``), when either value is a
    non-finite float (NaN or infinity), or when ``solo_elapsed`` is not positive.
    """
    import math

    for value in (pair_elapsed, solo_elapsed):
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return None
        # A NaN ratio compares false against any cap, so a non-finite input
        # would otherwise slip past a "ratio > limit" check.
        if isinstance(value, float) and not math.isfinite(value):
            return None
    if solo_elapsed <= 0:
        return None
    return pair_elapsed / solo_elapsed
48
+
49
+
50
def infer_fixture_id(results_root: Path, run_id: str) -> str | None:
    """Infer a run's fixture id from the arm input files.

    Looks at ``<results_root>/<run_id>/<arm>/input.md`` for the ``pair`` arm
    first and then the ``solo`` arm, and returns the file stem of the first
    ``docs/roadmap/phase-1/<id>.md`` reference found.

    Returns:
        The fixture id, or ``None`` when no input file contains such a reference.
    """
    run_root = results_root / run_id
    for arm in ("pair", "solo"):
        input_path = run_root / arm / "input.md"
        # is_file() also skips a directory that happens to be named input.md.
        if not input_path.is_file():
            continue
        # Decode as UTF-8 and tolerate stray bytes so an oddly encoded input
        # file cannot crash the gate.
        text = input_path.read_text(encoding="utf8", errors="replace")
        match = re.search(r"docs/roadmap/phase-1/([^`\s]+)\.md", text)
        if match:
            return match.group(1)
    return None
60
+
61
+
62
def transcript_failure_reason(results_root: Path, run_id: str, arm: str) -> str | None:
    """Classify a failure from ``<results_root>/<run_id>/<arm>/transcript.txt``.

    Returns:
        ``"provider_limit"`` when the transcript contains the text
        ``You've hit your limit``; ``None`` when the transcript is missing or
        does not contain it.
    """
    transcript_file = results_root.joinpath(run_id, arm, "transcript.txt")
    if transcript_file.is_file():
        text = transcript_file.read_text(encoding="utf8", errors="replace")
        if "You've hit your limit" in text:
            return "provider_limit"
    return None
70
+
71
+
72
def _unusable_compare_row(results_root: Path, run_id: str, reason: str) -> dict[str, Any]:
    """Build the FAIL row for a run whose compare.json is missing or unusable."""
    return {
        "run_id": run_id,
        "fixture_id": infer_fixture_id(results_root, run_id),
        "status": "FAIL",
        "failures": [reason],
        "solo_verdict": None,
        "pair_verdict": None,
        "pair_mode": False,
        "pair_trigger_missed": False,
        "pair_verdict_lift": False,
        "pair_internal_verdict_lift": False,
        "pair_primary_verdict": None,
        "pair_judge_verdict": None,
        "solo_elapsed_seconds": None,
        "pair_elapsed_seconds": None,
        "pair_solo_wall_ratio": None,
        # Same keys as a fully evaluated row so consumers see one schema.
        "solo_failure_reason": None,
        "pair_failure_reason": None,
        "pair_severity_counts": {},
    }


def evaluate_run(
    results_root: Path,
    fixtures_root: Path,
    run_id: str,
    max_pair_solo_wall_ratio: float | None,
) -> dict[str, Any]:
    """Evaluate one frozen VERIFY run and return its report row.

    The run passes only when no failure is recorded. Failures are recorded when:
    either arm timed out, hit the provider limit, or has a non-zero
    ``invoke_exit``; pair mode did not run or its trigger was missed; neither
    an external nor an internal verdict lift is reported; a reported lift is
    not backed by a strictly higher verdict rank; the pair verdict ranks below
    ``NEEDS_WORK``; the optional wall-ratio cap is exceeded or cannot be
    computed; or the fixture id is missing or has no directory under
    ``fixtures_root``.

    Args:
        results_root: Directory that contains one sub-directory per run.
        fixtures_root: Directory that contains one sub-directory per fixture.
        run_id: Name of the run directory to evaluate.
        max_pair_solo_wall_ratio: Optional cap on pair/solo elapsed time.

    Returns:
        A row dict with ``status`` (``"PASS"``/``"FAIL"``), the list of
        ``failures``, and the verdict, lift, and timing fields used by the
        report. A missing, malformed, or non-object ``compare.json`` yields a
        FAIL row instead of raising.
    """
    try:
        compare = load_compare(results_root, run_id)
    except FileNotFoundError as exc:
        return _unusable_compare_row(results_root, run_id, str(exc))
    except ValueError as exc:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses;
        # one corrupt file should fail its own row, not abort the whole gate.
        return _unusable_compare_row(
            results_root, run_id, f"invalid compare.json for {run_id}: {exc}"
        )
    if not isinstance(compare, dict):
        return _unusable_compare_row(
            results_root,
            run_id,
            f"invalid compare.json for {run_id}: top-level value is not an object",
        )

    solo = compare.get("solo") or {}
    pair = compare.get("pair") or {}
    comparison = compare.get("comparison") or {}
    # Prefer the recorded failure reason; fall back to scanning the transcript.
    solo_failure_reason = solo.get("invoke_failure_reason") or transcript_failure_reason(
        results_root, run_id, "solo"
    )
    pair_failure_reason = pair.get("invoke_failure_reason") or transcript_failure_reason(
        results_root, run_id, "pair"
    )

    failures: list[str] = []
    if solo.get("timed_out"):
        failures.append("solo timed out")
    if pair.get("timed_out"):
        failures.append("pair timed out")
    if solo_failure_reason == "provider_limit":
        failures.append("solo provider limit")
    if pair_failure_reason == "provider_limit":
        failures.append("pair provider limit")
    if solo.get("invoke_exit") != 0:
        failures.append(f"solo invoke_exit={solo.get('invoke_exit')}")
    if pair.get("invoke_exit") != 0:
        failures.append(f"pair invoke_exit={pair.get('invoke_exit')}")
    if not pair.get("pair_mode"):
        failures.append("pair_mode false")
    if comparison.get("pair_trigger_missed"):
        failures.append("pair trigger missed")
    external_lift = bool(comparison.get("pair_verdict_lift"))
    internal_lift = bool(comparison.get("pair_internal_verdict_lift"))
    if not (external_lift or internal_lift):
        failures.append("pair verdict lift false")

    solo_verdict = (
        comparison.get("solo_verdict")
        or solo.get("verify_verdict")
        or solo.get("terminal_verdict")
    )
    pair_verdict = (
        comparison.get("pair_verdict")
        or pair.get("verify_verdict")
        or pair.get("terminal_verdict")
    )
    pair_primary_verdict = comparison.get("pair_primary_verdict")
    pair_judge_verdict = comparison.get("pair_judge_verdict")
    # A reported lift must be backed by a strictly higher verdict rank.
    if external_lift and rank(pair_verdict) <= rank(solo_verdict):
        failures.append(f"pair verdict {pair_verdict} not stricter than solo {solo_verdict}")
    if internal_lift and rank(pair_judge_verdict) <= rank(pair_primary_verdict):
        failures.append(
            f"pair_judge verdict {pair_judge_verdict} not stricter than primary {pair_primary_verdict}"
        )
    if rank(pair_verdict) < VERDICT_RANK["NEEDS_WORK"]:
        failures.append(f"pair verdict {pair_verdict} is not verdict-binding")
    pair_elapsed = pair.get("elapsed_seconds")
    solo_elapsed = solo.get("elapsed_seconds")
    wall_ratio = elapsed_ratio(pair_elapsed, solo_elapsed)
    if max_pair_solo_wall_ratio is not None:
        if wall_ratio is None:
            failures.append("pair/solo wall ratio missing")
        elif wall_ratio > max_pair_solo_wall_ratio:
            failures.append(
                f"pair/solo wall ratio {wall_ratio:.2f} exceeds {max_pair_solo_wall_ratio:.2f}"
            )
    fixture_id = infer_fixture_id(results_root, run_id)
    if not fixture_id:
        failures.append("fixture_id missing")
    elif not (fixtures_root / fixture_id).is_dir():
        failures.append(f"fixture_id not found: {fixture_id}")

    return {
        "run_id": run_id,
        "fixture_id": fixture_id,
        "status": "PASS" if not failures else "FAIL",
        "failures": failures,
        "solo_verdict": solo_verdict,
        "pair_verdict": pair_verdict,
        "pair_mode": bool(pair.get("pair_mode")),
        "pair_trigger_missed": bool(comparison.get("pair_trigger_missed")),
        "pair_verdict_lift": external_lift,
        "pair_internal_verdict_lift": internal_lift,
        "pair_primary_verdict": pair_primary_verdict,
        "pair_judge_verdict": pair_judge_verdict,
        "solo_elapsed_seconds": solo_elapsed,
        "pair_elapsed_seconds": pair_elapsed,
        "pair_solo_wall_ratio": wall_ratio,
        "solo_failure_reason": solo_failure_reason,
        "pair_failure_reason": pair_failure_reason,
        "pair_severity_counts": pair.get("severity_counts") or {},
    }
188
+
189
+
190
def format_ratio(value: Any) -> str:
    """Format a numeric ratio with two decimals and an ``x`` suffix; ``"n/a"`` for anything else."""
    return f"{value:.2f}x" if isinstance(value, (int, float)) else "n/a"
194
+
195
+
196
def write_markdown(path: Path, report: dict[str, Any]) -> None:
    """Write the gate report to ``path`` as a Markdown document.

    The document has a title, the overall verdict, the gate rule, the
    configured thresholds, and one table row per entry in ``report["rows"]``.
    """
    rule_text = (
        "Rule: every supplied run must be clean, each run must cover a distinct fixture, "
        "gated pair VERIFY must fire, and pair must contribute a stricter "
        "verdict-binding result than either the separate solo arm or the pair "
        "run's own primary judge."
    )
    max_ratio_text = format_ratio(report.get("max_pair_solo_wall_ratio"))
    avg_ratio_text = format_ratio(report.get("avg_pair_solo_wall_ratio"))
    lines = [
        f"# Frozen VERIFY Gate — {report['run_ids_label']}",
        "",
        f"Verdict: **{report['verdict']}**",
        "",
        rule_text,
        "",
        f"Minimum passing runs: {report['min_runs']}",
        f"Max pair/solo wall ratio: {max_ratio_text}",
        f"Average pair/solo wall ratio: {avg_ratio_text}",
        "",
        "| Run | Fixture | Solo | Pair | Pair mode | Wall ratio | External lift | Internal lift | Status | Reason |",
        "|---|---|---|---|---|---|---|---|---|---|",
    ]
    for entry in report["rows"]:
        problems = entry["failures"]
        reason = "; ".join(problems) if problems else "ok"
        cells = [
            f"{entry['run_id']}",
            f"{entry.get('fixture_id') or 'unknown'}",
            f"{entry['solo_verdict']}",
            f"{entry['pair_verdict']}",
            str(entry["pair_mode"]).lower(),
            format_ratio(entry.get("pair_solo_wall_ratio")),
            str(entry["pair_verdict_lift"]).lower(),
            str(entry["pair_internal_verdict_lift"]).lower(),
            f"{entry['status']}",
            reason,
        ]
        lines.append("| " + " | ".join(cells) + " |")
    # Trailing empty item so the joined text ends with a newline.
    lines.append("")
    path.write_text("\n".join(lines), encoding="utf8")
226
+
227
+
228
def main() -> int:
    """Evaluate every ``--run-id`` and emit the frozen VERIFY gate report.

    Each run is evaluated with ``evaluate_run``. Runs that share a fixture id
    are all marked as failed. The gate verdict is ``PASS`` only when every
    supplied run passes and at least ``--min-runs`` runs pass. The report is
    printed to stdout as JSON and optionally written to ``--out-json`` and
    ``--out-md``.

    Returns:
        0 when the gate verdict is ``PASS``, otherwise 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--results-root", default="benchmark/auto-resolve/results")
    parser.add_argument("--fixtures-root", default="benchmark/auto-resolve/fixtures")
    parser.add_argument("--run-id", action="append", required=True)
    parser.add_argument("--min-runs", type=int, default=2)
    parser.add_argument(
        "--max-pair-solo-wall-ratio",
        type=float,
        help="Optional efficiency cap. When set, every run must include elapsed_seconds and pair/solo wall ratio must not exceed this value.",
    )
    parser.add_argument("--out-json")
    parser.add_argument("--out-md")
    args = parser.parse_args()

    results_root = Path(args.results_root)
    fixtures_root = Path(args.fixtures_root)
    rows = [
        evaluate_run(results_root, fixtures_root, run_id, args.max_pair_solo_wall_ratio)
        for run_id in args.run_id
    ]
    # Passing evidence must come from distinct fixtures: count how often each
    # fixture id appears, then fail every run that shares one.
    fixture_counts: dict[str, int] = {}
    for row in rows:
        fixture_id = row.get("fixture_id")
        if fixture_id:
            fixture_counts[fixture_id] = fixture_counts.get(fixture_id, 0) + 1
    for row in rows:
        fixture_id = row.get("fixture_id")
        if fixture_id and fixture_counts.get(fixture_id, 0) > 1:
            row["failures"].append(f"duplicate fixture_id={fixture_id}")
            row["status"] = "FAIL"
    passing = [row for row in rows if row["status"] == "PASS"]
    verdict = "PASS" if len(passing) >= args.min_runs and len(passing) == len(rows) else "FAIL"
    ratios = [
        row["pair_solo_wall_ratio"]
        for row in rows
        if isinstance(row.get("pair_solo_wall_ratio"), (int, float))
    ]

    report = {
        "run_ids_label": ", ".join(args.run_id),
        "rule": "clean frozen diff; distinct fixture per run; gated pair VERIFY fires; pair contributes a stricter verdict-binding result; optional pair/solo wall-ratio cap",
        "min_runs": args.min_runs,
        "max_pair_solo_wall_ratio": args.max_pair_solo_wall_ratio,
        "avg_pair_solo_wall_ratio": (sum(ratios) / len(ratios)) if ratios else None,
        "verdict": verdict,
        "runs_total": len(rows),
        "runs_passed": len(passing),
        "rows": rows,
    }

    # Create missing output directories so a fresh results location does not
    # make the write fail before the report is printed.
    if args.out_json:
        out_json = Path(args.out_json)
        out_json.parent.mkdir(parents=True, exist_ok=True)
        out_json.write_text(json.dumps(report, indent=2) + "\n", encoding="utf8")
    if args.out_md:
        out_md = Path(args.out_md)
        out_md.parent.mkdir(parents=True, exist_ok=True)
        write_markdown(out_md, report)

    print(json.dumps(report, indent=2))
    return 0 if verdict == "PASS" else 1
286
+
287
+
288
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
@@ -0,0 +1,250 @@
1
+ #!/usr/bin/env python3
2
+ """Gate full-pipeline L2/pair evidence against L1 solo.
3
+
4
+ This is stricter than headroom-gate.py. Headroom only says a candidate set is
5
+ worth measuring. This gate says the measured L2 arm is usable evidence:
6
+ bare and solo leave headroom, l2_gated is clean, gated pair actually fired, and
7
+ the blind judge scores l2_gated materially above solo_claude.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import json
13
+ import pathlib
14
+ import sys
15
+ from typing import Any
16
+
17
+
18
def load_json(path: pathlib.Path) -> dict[str, Any] | None:
    """Read and parse the JSON document stored at ``path``.

    Args:
        path: Location of the JSON file to load.

    Returns:
        The parsed JSON value, or ``None`` when ``path`` is not an existing
        regular file (missing, or a directory).

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid
            JSON.
    """
    if not path.is_file():
        return None
    # Decode explicitly as UTF-8, the same encoding this script uses when it
    # writes its own reports, instead of depending on the platform's locale
    # default encoding.
    return json.loads(path.read_text(encoding="utf-8"))
22
+
23
+
24
def score_for(judge: dict[str, Any], arm: str) -> int | None:
    """Return the integer score the judge recorded for ``arm``.

    Args:
        judge: Parsed judge document; scores are read from its
            ``"scores_by_arm"`` mapping.
        arm: Name of the arm whose score is wanted.

    Returns:
        The score when it is present and is an integer, otherwise ``None``
        (mapping absent or empty, arm absent, or a non-integer value).
    """
    scores = judge.get("scores_by_arm") or {}
    value = scores.get(arm)
    # bool is a subclass of int, so a JSON true/false would otherwise be
    # accepted as a score of 1/0; treat it as "no score" instead.
    if isinstance(value, bool) or not isinstance(value, int):
        return None
    return value
27
+
28
+
29
def clean_failures(fixture_dir: pathlib.Path, judge: dict[str, Any], arm: str) -> list[str]:
    """List every reason why ``arm``'s run under ``fixture_dir`` is not clean.

    Reads ``<fixture_dir>/<arm>/result.json`` and ``verify.json`` and checks
    them, together with the judge's ``"disqualifiers_by_arm"`` entry for the
    arm, for missing files, disqualifier flags, timeouts and invoke failures.

    Args:
        fixture_dir: Directory holding one sub-directory per arm.
        judge: Parsed judge document for the fixture.
        arm: Name of the arm to inspect.

    Returns:
        Human-readable failure descriptions, each prefixed with the arm name;
        empty when nothing was flagged.
    """
    arm_dir = fixture_dir / arm
    result = load_json(arm_dir / "result.json")
    verify = load_json(arm_dir / "verify.json")

    problems: list[str] = []

    # Missing artefacts are reported first, result before verify.
    for payload, filename in ((result, "result.json"), (verify, "verify.json")):
        if payload is None:
            problems.append(f"{arm} {filename} missing")

    judge_entry = (judge.get("disqualifiers_by_arm") or {}).get(arm) or {}
    if judge_entry.get("disqualifier"):
        problems.append(f"{arm} judge disqualifier")

    if result is not None:
        if result.get("disqualifier"):
            problems.append(f"{arm} result disqualifier")
        if result.get("timed_out"):
            problems.append(f"{arm} timed out")
        if result.get("invoke_failure"):
            reason = result.get("invoke_failure_reason")
            # Only a non-empty string reason is worth echoing.
            has_reason = isinstance(reason, str) and bool(reason)
            problems.append(
                f"{arm} invoke failure ({reason})" if has_reason else f"{arm} invoke failure"
            )

    if verify is not None and verify.get("disqualifier"):
        problems.append(f"{arm} verify disqualifier")

    return problems
55
+
56
+
57
def elapsed_ratio(pair_result: dict[str, Any] | None, solo_result: dict[str, Any] | None) -> float | None:
    """Compute the pair/solo wall-clock ratio from two result documents.

    Args:
        pair_result: Parsed result of the pair arm, or ``None`` if unavailable.
        solo_result: Parsed result of the solo arm, or ``None`` if unavailable.

    Returns:
        ``pair elapsed_seconds / solo elapsed_seconds``, or ``None`` when
        either document is missing or either timing is not a usable number:
        non-numeric, a bool, NaN, infinite, a negative pair time, or a
        non-positive solo time.
    """
    if pair_result is None or solo_result is None:
        return None
    pair_elapsed = pair_result.get("elapsed_seconds")
    solo_elapsed = solo_result.get("elapsed_seconds")
    for value in (pair_elapsed, solo_elapsed):
        # bool is an int subclass; a JSON true/false is not a duration.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return None
    infinity = float("inf")
    # Chained comparisons are False for NaN, so NaN, infinities and negative
    # durations are all rejected here. Otherwise they would produce a ratio
    # (negative, zero or NaN) that can never exceed a configured cap.
    if not 0 <= pair_elapsed < infinity:
        return None
    if not 0 < solo_elapsed < infinity:
        return None
    return pair_elapsed / solo_elapsed
67
+
68
+
69
def provider_limited(result: dict[str, Any] | None) -> bool:
    """Tell whether ``result`` records a ``provider_limit`` invoke failure.

    Args:
        result: Parsed result document, or ``None`` if unavailable.

    Returns:
        ``True`` only when ``result`` exists and its
        ``"invoke_failure_reason"`` equals ``"provider_limit"``.
    """
    if result is None:
        return False
    return result.get("invoke_failure_reason") == "provider_limit"
71
+
72
+
73
def evaluate_fixture(
    fixture_dir: pathlib.Path,
    *,
    pair_arm: str,
    bare_max: int,
    solo_max: int,
    min_pair_margin: int,
    max_pair_solo_wall_ratio: float | None,
) -> dict[str, Any]:
    """Evaluate one fixture's results against the pair-gate rules.

    Args:
        fixture_dir: Fixture result directory containing ``judge.json`` and
            one sub-directory per arm.
        pair_arm: Name of the pair arm compared against ``solo_claude``.
        bare_max: Highest ``bare`` score that is still accepted.
        solo_max: Highest ``solo_claude`` score that is still accepted.
        min_pair_margin: Minimum required ``pair - solo`` score difference.
        max_pair_solo_wall_ratio: Optional cap on the pair/solo wall ratio;
            ``None`` disables the check.

    Returns:
        A row dict with ``"fixture"``, ``"status"`` (``"PASS"``/``"FAIL"``)
        and ``"reason"``; unless ``judge.json`` is missing it also carries
        the scores, margin, pair mode and wall ratio.
    """
    judge = load_json(fixture_dir / "judge.json")
    if judge is None:
        return {
            "fixture": fixture_dir.name,
            "status": "FAIL",
            "reason": "judge.json missing",
        }

    bare_score = score_for(judge, "bare")
    solo_score = score_for(judge, "solo_claude")
    pair_score = score_for(judge, pair_arm)

    solo_result = load_json(fixture_dir / "solo_claude" / "result.json")
    pair_result = load_json(fixture_dir / pair_arm / "result.json")

    measured_ratio = elapsed_ratio(pair_result, solo_result)
    limited = provider_limited(pair_result)
    # A provider-limited pair run gets no wall ratio, and the pair-specific
    # score, pair_mode and ratio checks below are skipped for it.
    wall_ratio = None if limited else measured_ratio

    problems: list[str] = []

    # Headroom checks: bare first, then solo.
    for label, score, ceiling in (
        ("bare", bare_score, bare_max),
        ("solo_claude", solo_score, solo_max),
    ):
        if score is None:
            problems.append(f"{label} score missing")
        elif score > ceiling:
            problems.append(f"{label} score {score} > {ceiling}")

    if not limited:
        if pair_score is None:
            problems.append(f"{pair_arm} score missing")
        elif solo_score is not None and pair_score - solo_score < min_pair_margin:
            problems.append(
                f"{pair_arm} margin {pair_score - solo_score:+d} < +{min_pair_margin}"
            )

    for arm in ("bare", "solo_claude", pair_arm):
        problems.extend(clean_failures(fixture_dir, judge, arm))

    pair_mode = pair_result.get("pair_mode") if pair_result is not None else None
    if not limited and pair_mode is not True:
        problems.append(f"{pair_arm} pair_mode not true")

    if not limited and max_pair_solo_wall_ratio is not None:
        if wall_ratio is None:
            problems.append("pair/solo wall ratio missing")
        elif wall_ratio > max_pair_solo_wall_ratio:
            problems.append(
                f"pair/solo wall ratio {wall_ratio:.2f} > {max_pair_solo_wall_ratio:.2f}"
            )

    if limited:
        margin = None
    elif isinstance(pair_score, int) and isinstance(solo_score, int):
        margin = pair_score - solo_score
    else:
        margin = None

    return {
        "fixture": fixture_dir.name,
        "status": "FAIL" if problems else "PASS",
        "bare_score": bare_score,
        "solo_score": solo_score,
        "pair_score": pair_score,
        "pair_margin": margin,
        "pair_mode": pair_mode,
        "pair_solo_wall_ratio": wall_ratio,
        "reason": "; ".join(problems),
    }
145
+
146
+
147
def fmt_ratio(value: Any) -> str:
    """Format a wall ratio for display.

    Args:
        value: The ratio, or any non-numeric placeholder.

    Returns:
        The number with two decimals followed by ``x`` (e.g. ``"1.50x"``),
        or ``"n/a"`` when ``value`` is not an int or float.
    """
    if not isinstance(value, (int, float)):
        return "n/a"
    return f"{value:.2f}x"
149
+
150
+
151
def write_md(path: pathlib.Path, report: dict[str, Any]) -> None:
    """Render ``report`` as a Markdown summary and write it to ``path``.

    The document contains a title with the run id, the verdict, a sentence
    describing the configured rule, the maximum and average pair/solo wall
    ratios, and a table with one row per entry of ``report["rows"]``.

    Args:
        path: Destination file; written as UTF-8.
        report: Gate report. Reads ``run_id``, ``verdict``, ``min_fixtures``,
            ``bare_max``, ``solo_max``, ``pair_arm``, ``min_pair_margin``,
            ``max_pair_solo_wall_ratio``, ``avg_pair_solo_wall_ratio`` and
            ``rows``.
    """

    def cell(text: Any) -> str:
        # Free-text cells (fixture names, failure reasons) may contain "|" or
        # line breaks, either of which would split the table row. Collapse
        # line breaks to spaces and escape pipes so each row stays intact.
        return " ".join(str(text).splitlines()).replace("|", "\\|")

    lines = [
        f"# Full-Pipeline Pair Gate - {report['run_id']}",
        "",
        f"Verdict: **{report['verdict']}**",
        "",
        f"Rule: at least {report['min_fixtures']} fixtures; bare <= {report['bare_max']}; "
        f"solo_claude <= {report['solo_max']}; {report['pair_arm']} clean; pair_mode true; "
        f"{report['pair_arm']} - solo_claude >= {report['min_pair_margin']}.",
        f"Max pair/solo wall ratio: {fmt_ratio(report['max_pair_solo_wall_ratio'])}",
        f"Average pair/solo wall ratio: {fmt_ratio(report['avg_pair_solo_wall_ratio'])}",
        "",
        "| Fixture | Bare | Solo | Pair | Margin | Pair mode | Wall ratio | Status | Reason |",
        "|---|---:|---:|---:|---:|---|---:|---|---|",
    ]
    for row in report["rows"]:
        margin = row.get("pair_margin")
        margin_text = f"{margin:+d}" if isinstance(margin, int) else "n/a"
        # Rows produced when judge.json is missing lack the score keys, hence
        # the .get() lookups.
        lines.append(
            f"| {cell(row['fixture'])} | {row.get('bare_score')} | {row.get('solo_score')} | "
            f"{row.get('pair_score')} | {margin_text} | {str(row.get('pair_mode')).lower()} | "
            f"{fmt_ratio(row.get('pair_solo_wall_ratio'))} | {row['status']} | "
            f"{cell(row.get('reason', ''))} |"
        )
    # The trailing empty entry makes the file end with a newline.
    lines.append("")
    path.write_text("\n".join(lines), encoding="utf8")
176
+
177
+
178
def positive_float(value: str) -> float:
    """argparse ``type=`` converter accepting only finite floats above zero.

    Args:
        value: Raw command-line text.

    Returns:
        The parsed float.

    Raises:
        ValueError: If ``value`` is not a float literal.
        argparse.ArgumentTypeError: If the number is zero, negative, NaN or
            infinite.
    """
    parsed = float(value)
    # The chained comparison is False for NaN, so "nan" is rejected too. A
    # NaN cap would never be exceeded, which silently disables the check, and
    # an infinite cap serialises as invalid JSON in the report.
    if not 0 < parsed < float("inf"):
        raise argparse.ArgumentTypeError("value must be a finite number > 0")
    return parsed
183
+
184
+
185
def main() -> int:
    """Run the full-pipeline pair gate over every fixture of one run.

    Parses the command line, evaluates each sub-directory of
    ``<results-root>/<run-id>`` with ``evaluate_fixture``, builds the report,
    and writes it to the requested outputs.

    Returns:
        ``0`` when the verdict is PASS, ``1`` when it is FAIL, and ``2`` when
        the run's results directory does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--run-id", required=True)
    parser.add_argument("--results-root", default="benchmark/auto-resolve/results", type=pathlib.Path)
    parser.add_argument("--bare-max", type=int, default=60)
    parser.add_argument("--solo-max", type=int, default=80)
    parser.add_argument("--min-pair-margin", type=int, default=5)
    parser.add_argument("--min-fixtures", type=int, default=2)
    parser.add_argument("--pair-arm", default="l2_gated")
    parser.add_argument("--max-pair-solo-wall-ratio", type=positive_float)
    parser.add_argument("--out-json", type=pathlib.Path)
    parser.add_argument("--out-md", type=pathlib.Path)
    args = parser.parse_args()

    run_root = args.results_root / args.run_id
    if not run_root.is_dir():
        print(f"no results dir: {run_root}", file=sys.stderr)
        return 2

    # Sorted so the report rows come out in a stable order.
    rows = [
        evaluate_fixture(
            fixture_dir,
            pair_arm=args.pair_arm,
            bare_max=args.bare_max,
            solo_max=args.solo_max,
            min_pair_margin=args.min_pair_margin,
            max_pair_solo_wall_ratio=args.max_pair_solo_wall_ratio,
        )
        for fixture_dir in sorted(p for p in run_root.iterdir() if p.is_dir())
    ]
    pass_count = sum(1 for row in rows if row["status"] == "PASS")
    fixture_count_ok = len(rows) >= args.min_fixtures
    # PASS needs at least one fixture, enough fixtures, and every one passing.
    verdict = "PASS" if rows and fixture_count_ok and pass_count == len(rows) else "FAIL"
    ratios = [
        row["pair_solo_wall_ratio"]
        for row in rows
        if isinstance(row.get("pair_solo_wall_ratio"), (int, float))
    ]
    report = {
        "run_id": args.run_id,
        # Name the arm that was actually gated; with the default --pair-arm
        # this is the same text as before ("l2_gated ...").
        "rule": (
            f"headroom candidates only; {args.pair_arm} must be clean, pair_mode true, "
            "and beat solo_claude by the configured margin"
        ),
        "verdict": verdict,
        "fixtures_total": len(rows),
        "fixtures_passed": pass_count,
        "min_fixtures": args.min_fixtures,
        "fixture_count_ok": fixture_count_ok,
        "bare_max": args.bare_max,
        "solo_max": args.solo_max,
        "min_pair_margin": args.min_pair_margin,
        "pair_arm": args.pair_arm,
        "max_pair_solo_wall_ratio": args.max_pair_solo_wall_ratio,
        "avg_pair_solo_wall_ratio": (sum(ratios) / len(ratios)) if ratios else None,
        "rows": rows,
    }

    if args.out_json:
        args.out_json.write_text(json.dumps(report, indent=2) + "\n", encoding="utf8")
    if args.out_md:
        write_md(args.out_md, report)
    else:
        # The JSON report is echoed to stdout only when no Markdown output
        # was requested.
        print(json.dumps(report, indent=2))
    return 0 if verdict == "PASS" else 1
247
+
248
+
249
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())