devlyn-cli 2.3.0 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +1 -1
- package/CLAUDE.md +2 -2
- package/README.md +82 -29
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
- package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
- package/benchmark/auto-resolve/README.md +307 -44
- package/benchmark/auto-resolve/RUBRIC.md +23 -14
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
- package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
- package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
- package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
- package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
- package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
- package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
- package/benchmark/auto-resolve/scripts/judge.sh +153 -26
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
- package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
- package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
- package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
- package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
- package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
- package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
- package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
- package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
- package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
- package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
- package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
- package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
- package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
- package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
- package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
- package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
- package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
- package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
- package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
- package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
- package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
- package/bin/devlyn.js +211 -18
- package/config/skills/_shared/adapters/README.md +3 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
- package/config/skills/_shared/adapters/opus-4-7.md +9 -1
- package/config/skills/_shared/archive_run.py +78 -6
- package/config/skills/_shared/codex-config.md +3 -2
- package/config/skills/_shared/codex-monitored.sh +46 -1
- package/config/skills/_shared/collect-codex-findings.py +20 -5
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/runtime-principles.md +5 -8
- package/config/skills/_shared/spec-verify-check.py +2664 -107
- package/config/skills/_shared/verify-merge-findings.py +1369 -19
- package/config/skills/devlyn:ideate/SKILL.md +7 -4
- package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
- package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
- package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
- package/config/skills/devlyn:resolve/SKILL.md +49 -18
- package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
- package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
- package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
- package/package.json +47 -2
- package/scripts/lint-fixtures.sh +349 -0
- package/scripts/lint-shadow-fixtures.sh +58 -0
- package/scripts/lint-skills.sh +3642 -92
- /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
|
@@ -3,54 +3,298 @@
|
|
|
3
3
|
|
|
4
4
|
This is stricter than headroom-gate.py. Headroom only says a candidate set is
|
|
5
5
|
worth measuring. This gate says the measured L2 arm is usable evidence:
|
|
6
|
-
bare and solo leave headroom
|
|
7
|
-
|
|
6
|
+
bare and solo leave headroom with complete comparable artifacts, the selected
|
|
7
|
+
pair arm is evidence-clean, pair mode actually fired for a canonical trigger
|
|
8
|
+
reason, and the blind judge scores the selected pair arm materially above
|
|
9
|
+
solo_claude.
|
|
8
10
|
"""
|
|
9
11
|
from __future__ import annotations
|
|
10
12
|
|
|
11
13
|
import argparse
|
|
12
14
|
import json
|
|
15
|
+
import os
|
|
13
16
|
import pathlib
|
|
17
|
+
import re
|
|
14
18
|
import sys
|
|
15
19
|
from typing import Any
|
|
16
20
|
|
|
21
|
+
SCRIPT_DIR = pathlib.Path(__file__).resolve().parent
|
|
22
|
+
if str(SCRIPT_DIR) not in sys.path:
|
|
23
|
+
sys.path.insert(0, str(SCRIPT_DIR))
|
|
24
|
+
FIXTURES_ROOT = SCRIPT_DIR.parent / "fixtures"
|
|
17
25
|
|
|
18
|
-
|
|
26
|
+
from pair_evidence_contract import (
|
|
27
|
+
ALLOWED_PAIR_ARMS,
|
|
28
|
+
all_known_pair_trigger_reasons,
|
|
29
|
+
has_canonical_pair_trigger_reason,
|
|
30
|
+
has_known_pair_trigger_reason,
|
|
31
|
+
is_score,
|
|
32
|
+
is_strict_number,
|
|
33
|
+
loads_strict_json_object,
|
|
34
|
+
path_has_actionable_solo_headroom_hypothesis,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
KNOWN_ARMS = {"bare", "solo_claude"} | ALLOWED_PAIR_ARMS
|
|
38
|
+
PASS_VERDICTS = {"PASS", "PASS_WITH_ISSUES"}
|
|
39
|
+
REJECTED_REGISTRY = pathlib.Path(__file__).with_name("pair-rejected-fixtures.sh")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def load_json(path: pathlib.Path) -> tuple[dict[str, Any] | None, str | None]:
    """Load a strict JSON object from ``path``.

    Returns:
        ``(data, None)`` when the file parses, ``(None, "missing")`` when
        ``path`` is not a regular file, and ``(None, "malformed")`` when
        decoding or ``loads_strict_json_object`` raises ``ValueError``.
    """
    if not path.is_file():
        return None, "missing"
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # locale's preferred encoding of the machine running the gate.
        data = loads_strict_json_object(path.read_text(encoding="utf-8"))
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError both subclass ValueError.
        return None, "malformed"
    return data, None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def fixture_short(name: str) -> str:
    """Return the part of ``name`` before its first ``-`` (all of it if none)."""
    short_id, _, _ = name.partition("-")
    return short_id
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def rejected_registry_path() -> pathlib.Path:
    """Return the rejected-fixture registry path.

    A non-empty ``PAIR_REJECTED_FIXTURES_REGISTRY`` environment variable takes
    precedence; otherwise the module-level ``REJECTED_REGISTRY`` is used.
    """
    env_path = os.environ.get("PAIR_REJECTED_FIXTURES_REGISTRY")
    if env_path:
        return pathlib.Path(env_path)
    return REJECTED_REGISTRY
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def load_rejected_short_ids(path: pathlib.Path) -> set[str]:
    """Collect rejected fixture short ids from the registry file at ``path``.

    A line contributes an id when it starts (after optional whitespace) with
    ``<ID>-*|<ID>)`` and both ids are identical, e.g. ``F27-*|F27)``.

    Raises:
        ValueError: if ``path`` is not a file, or no line yields an id.
    """
    if not path.is_file():
        raise ValueError(f"rejected fixture registry missing: {path}")
    entry_pattern = re.compile(r"\s*([FS]\d+)-\*\|([FS]\d+)\)")
    candidates = (entry_pattern.match(text) for text in path.read_text().splitlines())
    short_ids = {
        found.group(1)
        for found in candidates
        if found and found.group(1) == found.group(2)
    }
    if not short_ids:
        raise ValueError(f"rejected fixture registry has no fixture entries: {path}")
    return short_ids
|
|
22
72
|
|
|
23
73
|
|
|
24
74
|
def score_for(judge: dict[str, Any], arm: str) -> int | None:
    """Return the judge score for ``arm``, or None when it is unusable.

    The score is only returned when ``judge["_blind_mapping"]`` is a dict that
    maps one of the slots ``A``/``B``/``C`` to ``arm`` and
    ``judge["scores_by_arm"][arm]`` passes ``is_score``.
    """
    mapping = judge.get("_blind_mapping")
    if not isinstance(mapping, dict):
        return None
    # Compare by equality rather than building a set: mapping values come from
    # judge JSON and may be unhashable (list/dict), which would raise TypeError
    # in a set comprehension instead of being treated as "arm not mapped".
    arm_is_mapped = any(
        slot in {"A", "B", "C"} and mapped == arm
        for slot, mapped in mapping.items()
    )
    if not arm_is_mapped:
        return None
    raw_scores = judge.get("scores_by_arm")
    scores = raw_scores if isinstance(raw_scores, dict) else {}
    value = scores.get(arm)
    return value if is_score(value) else None
|
|
27
84
|
|
|
28
85
|
|
|
29
|
-
def
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
86
|
+
def verify_score_clean(payload: dict[str, Any] | None) -> bool:
    """Return True when ``payload["verify_score"]`` is a strict number >= 1.0.

    A missing payload (None) is never clean.
    """
    if payload is None:
        return False
    score = payload.get("verify_score")
    if not is_strict_number(score):
        return False
    return score >= 1.0
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def bool_flag_failure(value: Any, true_reason: str, malformed_reason: str) -> str | None:
    """Translate a boolean-ish flag into a failure reason.

    Returns None for exactly ``False`` or ``None``, ``true_reason`` for exactly
    ``True``, and ``malformed_reason`` for every other value (identity checks,
    so ``0``/``1`` count as malformed).
    """
    if value is None or value is False:
        return None
    return true_reason if value is True else malformed_reason
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def pair_trigger_failures(result: dict[str, Any] | None, arm: str) -> list[str]:
    """Return at most one reason why ``result["pair_trigger"]`` is unusable.

    Checks run in a fixed order and the first one that fails is reported:
    field shape (``eligible`` bool, ``reasons`` list of str, ``skipped_reason``
    str or None), then eligibility, non-empty reasons, known/canonical trigger
    reasons, and finally that no ``skipped_reason`` is set. A None ``result``
    or a fully valid trigger yields an empty list.
    """
    if result is None:
        return []
    trigger = result.get("pair_trigger")
    if not isinstance(trigger, dict):
        return [f"{arm} pair_trigger missing or malformed"]

    eligible = trigger.get("eligible")
    reasons = trigger.get("reasons")
    skipped_reason = trigger.get("skipped_reason")

    reasons_well_formed = isinstance(reasons, list) and all(
        isinstance(reason, str) for reason in reasons
    )
    if not isinstance(eligible, bool):
        message = f"{arm} pair_trigger.eligible malformed"
    elif not reasons_well_formed:
        message = f"{arm} pair_trigger.reasons malformed"
    elif not (skipped_reason is None or isinstance(skipped_reason, str)):
        message = f"{arm} pair_trigger.skipped_reason malformed"
    elif not eligible:
        message = f"{arm} pair_trigger not eligible"
    elif not reasons:
        message = f"{arm} pair_trigger eligible with empty reasons"
    elif not has_known_pair_trigger_reason(reasons):
        message = f"{arm} pair_trigger reasons missing known trigger reason"
    elif not all_known_pair_trigger_reasons(reasons):
        message = f"{arm} pair_trigger reasons contain unknown trigger reason"
    elif not has_canonical_pair_trigger_reason(reasons):
        message = f"{arm} pair_trigger reasons missing canonical trigger reason"
    elif skipped_reason is not None:
        message = f"{arm} pair_trigger eligible with skipped_reason"
    else:
        return []
    return [message]
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def pair_trigger_eligible(result: dict[str, Any] | None) -> bool:
    """Return True when ``result["pair_trigger"]`` is a fully valid trigger.

    Valid means: a dict with ``eligible`` exactly True, a non-empty list of
    string ``reasons`` that are all known and include a canonical reason, and
    no ``skipped_reason``.
    """
    if result is None:
        return False
    trigger = result.get("pair_trigger")
    if not isinstance(trigger, dict) or trigger.get("eligible") is not True:
        return False
    reasons = trigger.get("reasons")
    if not isinstance(reasons, list) or not reasons:
        return False
    if not all(isinstance(reason, str) for reason in reasons):
        return False
    return (
        all_known_pair_trigger_reasons(reasons)
        and has_canonical_pair_trigger_reason(reasons)
        and trigger.get("skipped_reason") is None
    )
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def pair_trigger_reasons(result: dict[str, Any] | None) -> list[str]:
    """Return ``result["pair_trigger"]["reasons"]`` when it is a list of str.

    Any missing or wrongly typed layer (result, trigger, reasons, or a
    non-string element) yields an empty list.
    """
    trigger = result.get("pair_trigger") if result is not None else None
    reasons = trigger.get("reasons") if isinstance(trigger, dict) else None
    if isinstance(reasons, list) and all(isinstance(item, str) for item in reasons):
        return reasons
    return []
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def fixture_spec_has_solo_headroom_hypothesis(fixture: str) -> bool:
    """Check the fixture's ``spec.md`` under ``FIXTURES_ROOT`` for a hypothesis.

    Delegates the decision to ``path_has_actionable_solo_headroom_hypothesis``.
    """
    spec_path = FIXTURES_ROOT / fixture / "spec.md"
    return path_has_actionable_solo_headroom_hypothesis(spec_path)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def skill_verdict_failures(result: dict[str, Any] | None, arm: str) -> list[str]:
    """Return failures for non-passing ``terminal_verdict`` / ``verify_verdict``.

    The ``bare`` arm and a missing ``result`` are exempt and yield no failures.
    A verdict passes only when it is a string contained in ``PASS_VERDICTS``.
    """
    if result is None or arm == "bare":
        return []
    failures: list[str] = []
    terminal = result.get("terminal_verdict")
    verify = result.get("verify_verdict")
    # The isinstance guard keeps unhashable JSON values (list/dict) from
    # raising TypeError in the set membership test; they are simply "not pass".
    if not (isinstance(terminal, str) and terminal in PASS_VERDICTS):
        failures.append(f"{arm} terminal verdict not pass")
    if not (isinstance(verify, str) and verify in PASS_VERDICTS):
        failures.append(f"{arm} verify verdict not pass")
    return failures
|
|
174
|
+
|
|
37
175
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
176
|
+
def axis_validation_counts(judge: dict[str, Any]) -> tuple[dict[str, int], int]:
    """Attribute the judge's out-of-range axis cells to arms.

    Reads ``judge["_axis_validation"]`` (``out_of_range_cells`` and
    ``out_of_range_count``) and ``judge["_blind_mapping"]``.

    Returns:
        ``(counts, unmapped)``: ``counts`` maps each arm in ``KNOWN_ARMS`` to
        the number of cells whose ``breakdown`` resolves to it via the blind
        mapping; ``unmapped`` is the remainder of the larger of the declared
        count and the cell-list length, floored at zero.
    """
    raw_mapping = judge.get("_blind_mapping")
    mapping = raw_mapping if isinstance(raw_mapping, dict) else {}
    raw_validation = judge.get("_axis_validation")
    validation = raw_validation if isinstance(raw_validation, dict) else {}
    cells = validation.get("out_of_range_cells") or []
    declared_count = validation.get("out_of_range_count")
    total_invalid = max(
        declared_count if isinstance(declared_count, int) else 0,
        len(cells) if isinstance(cells, list) else 0,
    )
    breakdown_to_letter = {
        "a_breakdown": "A",
        "b_breakdown": "B",
        "c_breakdown": "C",
    }
    counts: dict[str, int] = {}
    if not isinstance(cells, list):
        return counts, total_invalid
    mapped_count = 0
    for cell in cells:
        if not isinstance(cell, dict):
            continue
        breakdown = cell.get("breakdown")
        # Only strings can name a breakdown; guarding here also keeps
        # unhashable JSON values (list/dict) from raising TypeError in .get().
        letter = breakdown_to_letter.get(breakdown) if isinstance(breakdown, str) else None
        arm = mapping.get(letter) if letter else None
        # Same guard for the mapped arm before the set membership test.
        if isinstance(arm, str) and arm in KNOWN_ARMS:
            counts[arm] = counts.get(arm, 0) + 1
            mapped_count += 1
    return counts, max(0, total_invalid - mapped_count)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def axis_invalid_count(judge: dict[str, Any], arm: str) -> int:
    """Return how many out-of-range axis cells are attributed to ``arm``."""
    per_arm = axis_validation_counts(judge)[0]
    return per_arm.get(arm, 0)
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def axis_unmapped_invalid_count(judge: dict[str, Any]) -> int:
    """Return the number of out-of-range axis cells not attributed to any arm."""
    return axis_validation_counts(judge)[1]
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def blind_mapping_failures(judge: dict[str, Any], required_arms: set[str]) -> list[str]:
    """Report required arms that the judge's blind mapping does not cover.

    Only the slots ``A``/``B``/``C`` of ``judge["_blind_mapping"]`` count.
    Returns a single-element list describing the problem, or an empty list
    when every arm in ``required_arms`` is mapped.
    """
    mapping = judge.get("_blind_mapping")
    if not isinstance(mapping, dict):
        return ["judge blind mapping missing"]
    # Keep only string values: required arms are strings, and unhashable JSON
    # values (list/dict) would otherwise raise TypeError while building the set.
    mapped_arms = {
        arm
        for key, arm in mapping.items()
        if key in {"A", "B", "C"} and isinstance(arm, str)
    }
    missing = sorted(required_arms - mapped_arms)
    if missing:
        return [f"judge blind mapping missing arm(s): {', '.join(missing)}"]
    return []
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def clean_failures(
    fixture_dir: pathlib.Path,
    judge: dict[str, Any],
    arm: str,
    *,
    require_correctness: bool,
) -> list[str]:
    """Collect every reason the artifacts of ``arm`` are not evidence-clean.

    Inspects ``result.json``, ``verify.json`` and ``diff.patch`` under
    ``fixture_dir / arm`` plus the judge's per-arm disqualifier and axis
    validation. With ``require_correctness`` the skill verdicts and the
    verify score are checked as well.

    Returns:
        Failure reasons in check order; an empty list means the arm is clean.
    """
    arm_dir = fixture_dir / arm
    failures: list[str] = []

    def record(reason: str | None) -> None:
        # Append only real reasons; helpers return None for "no failure".
        if reason:
            failures.append(reason)

    # Artifact presence / parseability.
    result, result_error = load_json(arm_dir / "result.json")
    verify, verify_error = load_json(arm_dir / "verify.json")
    if result_error:
        failures.append(f"{arm} result.json {result_error}")
    if verify_error:
        failures.append(f"{arm} verify.json {verify_error}")
    if not (arm_dir / "diff.patch").is_file():
        failures.append(f"{arm} diff.patch missing")

    # Judge-side flags: the per-arm entry is either a dict carrying a
    # "disqualifier" key or the flag value itself.
    by_arm = judge.get("disqualifiers_by_arm")
    entry = by_arm.get(arm) if isinstance(by_arm, dict) else None
    judge_flag = entry.get("disqualifier") if isinstance(entry, dict) else entry
    record(
        bool_flag_failure(
            judge_flag,
            f"{arm} judge disqualifier",
            f"{arm} judge disqualifier malformed",
        )
    )
    invalid_axes = axis_invalid_count(judge, arm)
    if invalid_axes > 0:
        failures.append(f"{arm} judge axis-invalid ({invalid_axes})")

    if result is not None:
        flag_reasons = {
            "disqualifier": f"{arm} result disqualifier",
            "timed_out": f"{arm} timed out",
            "environment_contamination": f"{arm} environment contamination",
        }
        for field, true_reason in flag_reasons.items():
            record(
                bool_flag_failure(
                    result.get(field),
                    true_reason,
                    f"{arm} result {field} malformed",
                )
            )
        invoked_reason = f"{arm} invoke failure"
        invoke_failure = bool_flag_failure(
            result.get("invoke_failure"),
            invoked_reason,
            f"{arm} result invoke_failure malformed",
        )
        detail = result.get("invoke_failure_reason")
        if invoke_failure == invoked_reason and isinstance(detail, str) and detail:
            # A real invoke failure with a usable reason gets the detailed message.
            failures.append(f"{arm} invoke failure ({detail})")
        else:
            record(invoke_failure)
        if require_correctness:
            failures.extend(skill_verdict_failures(result, arm))

    if verify is not None:
        record(
            bool_flag_failure(
                verify.get("disqualifier"),
                f"{arm} verify disqualifier",
                f"{arm} verify disqualifier malformed",
            )
        )
        if require_correctness and not verify_score_clean(verify):
            failures.append(f"{arm} verify_score < 1.0")
    return failures
|
|
55
299
|
|
|
56
300
|
|
|
@@ -59,9 +303,7 @@ def elapsed_ratio(pair_result: dict[str, Any] | None, solo_result: dict[str, Any
|
|
|
59
303
|
return None
|
|
60
304
|
pair_elapsed = pair_result.get("elapsed_seconds")
|
|
61
305
|
solo_elapsed = solo_result.get("elapsed_seconds")
|
|
62
|
-
if not
|
|
63
|
-
return None
|
|
64
|
-
if solo_elapsed <= 0:
|
|
306
|
+
if not is_strict_number(pair_elapsed) or not is_strict_number(solo_elapsed):
|
|
65
307
|
return None
|
|
66
308
|
return pair_elapsed / solo_elapsed
|
|
67
309
|
|
|
@@ -73,53 +315,78 @@ def provider_limited(result: dict[str, Any] | None) -> bool:
|
|
|
73
315
|
def evaluate_fixture(
|
|
74
316
|
fixture_dir: pathlib.Path,
|
|
75
317
|
*,
|
|
318
|
+
rejected_short_ids: set[str],
|
|
76
319
|
pair_arm: str,
|
|
77
320
|
bare_max: int,
|
|
78
321
|
solo_max: int,
|
|
322
|
+
min_bare_headroom: int,
|
|
323
|
+
min_solo_headroom: int,
|
|
79
324
|
min_pair_margin: int,
|
|
80
325
|
max_pair_solo_wall_ratio: float | None,
|
|
326
|
+
require_hypothesis_trigger: bool,
|
|
81
327
|
) -> dict[str, Any]:
|
|
82
|
-
judge = load_json(fixture_dir / "judge.json")
|
|
328
|
+
judge, judge_error = load_json(fixture_dir / "judge.json")
|
|
83
329
|
if judge is None:
|
|
84
330
|
return {
|
|
85
331
|
"fixture": fixture_dir.name,
|
|
86
332
|
"status": "FAIL",
|
|
87
|
-
"reason": "judge.json
|
|
333
|
+
"reason": f"judge.json {judge_error}",
|
|
88
334
|
}
|
|
89
335
|
|
|
90
336
|
bare = score_for(judge, "bare")
|
|
91
337
|
solo = score_for(judge, "solo_claude")
|
|
92
338
|
pair = score_for(judge, pair_arm)
|
|
93
|
-
|
|
94
|
-
|
|
339
|
+
bare_headroom = bare_max - bare if isinstance(bare, int) else None
|
|
340
|
+
solo_headroom = solo_max - solo if isinstance(solo, int) else None
|
|
341
|
+
solo_result, _ = load_json(fixture_dir / "solo_claude" / "result.json")
|
|
342
|
+
pair_result, _ = load_json(fixture_dir / pair_arm / "result.json")
|
|
95
343
|
ratio = elapsed_ratio(pair_result, solo_result)
|
|
96
344
|
pair_provider_limited = provider_limited(pair_result)
|
|
97
345
|
if pair_provider_limited:
|
|
98
346
|
ratio = None
|
|
99
347
|
|
|
100
348
|
reasons: list[str] = []
|
|
349
|
+
if fixture_short(fixture_dir.name) in rejected_short_ids:
|
|
350
|
+
reasons.append("fixture rejected for pair-candidate runs")
|
|
101
351
|
if bare is None:
|
|
102
352
|
reasons.append("bare score missing")
|
|
103
353
|
elif bare > bare_max:
|
|
104
354
|
reasons.append(f"bare score {bare} > {bare_max}")
|
|
355
|
+
elif bare_headroom is not None and bare_headroom < min_bare_headroom:
|
|
356
|
+
reasons.append(f"bare headroom {bare_headroom} < {min_bare_headroom}")
|
|
105
357
|
if solo is None:
|
|
106
358
|
reasons.append("solo_claude score missing")
|
|
107
359
|
elif solo > solo_max:
|
|
108
360
|
reasons.append(f"solo_claude score {solo} > {solo_max}")
|
|
361
|
+
elif solo_headroom is not None and solo_headroom < min_solo_headroom:
|
|
362
|
+
reasons.append(f"solo_claude headroom {solo_headroom} < {min_solo_headroom}")
|
|
109
363
|
if pair_provider_limited:
|
|
110
364
|
pass
|
|
111
365
|
elif pair is None:
|
|
112
366
|
reasons.append(f"{pair_arm} score missing")
|
|
113
367
|
elif solo is not None and pair - solo < min_pair_margin:
|
|
114
368
|
reasons.append(f"{pair_arm} margin {pair - solo:+d} < +{min_pair_margin}")
|
|
369
|
+
unmapped_axis_invalid = axis_unmapped_invalid_count(judge)
|
|
370
|
+
if unmapped_axis_invalid > 0:
|
|
371
|
+
reasons.append(f"judge axis-invalid unmapped ({unmapped_axis_invalid})")
|
|
372
|
+
reasons.extend(blind_mapping_failures(judge, {"bare", "solo_claude", pair_arm}))
|
|
115
373
|
|
|
116
|
-
reasons.extend(clean_failures(fixture_dir, judge, "bare"))
|
|
117
|
-
reasons.extend(clean_failures(fixture_dir, judge, "solo_claude"))
|
|
118
|
-
reasons.extend(clean_failures(fixture_dir, judge, pair_arm))
|
|
374
|
+
reasons.extend(clean_failures(fixture_dir, judge, "bare", require_correctness=False))
|
|
375
|
+
reasons.extend(clean_failures(fixture_dir, judge, "solo_claude", require_correctness=False))
|
|
376
|
+
reasons.extend(clean_failures(fixture_dir, judge, pair_arm, require_correctness=True))
|
|
119
377
|
|
|
120
378
|
pair_mode = None if pair_result is None else pair_result.get("pair_mode")
|
|
121
379
|
if pair_mode is not True and not pair_provider_limited:
|
|
122
380
|
reasons.append(f"{pair_arm} pair_mode not true")
|
|
381
|
+
if not pair_provider_limited:
|
|
382
|
+
reasons.extend(pair_trigger_failures(pair_result, pair_arm))
|
|
383
|
+
if (
|
|
384
|
+
require_hypothesis_trigger
|
|
385
|
+
and
|
|
386
|
+
fixture_spec_has_solo_headroom_hypothesis(fixture_dir.name)
|
|
387
|
+
and "spec.solo_headroom_hypothesis" not in pair_trigger_reasons(pair_result)
|
|
388
|
+
):
|
|
389
|
+
reasons.append(f"{pair_arm} pair_trigger missing spec.solo_headroom_hypothesis")
|
|
123
390
|
|
|
124
391
|
if max_pair_solo_wall_ratio is not None and not pair_provider_limited:
|
|
125
392
|
if ratio is None:
|
|
@@ -131,7 +398,9 @@ def evaluate_fixture(
|
|
|
131
398
|
"fixture": fixture_dir.name,
|
|
132
399
|
"status": "PASS" if not reasons else "FAIL",
|
|
133
400
|
"bare_score": bare,
|
|
401
|
+
"bare_headroom": bare_headroom,
|
|
134
402
|
"solo_score": solo,
|
|
403
|
+
"solo_headroom": solo_headroom,
|
|
135
404
|
"pair_score": pair,
|
|
136
405
|
"pair_margin": (
|
|
137
406
|
None if pair_provider_limited
|
|
@@ -139,6 +408,14 @@ def evaluate_fixture(
|
|
|
139
408
|
else None
|
|
140
409
|
),
|
|
141
410
|
"pair_mode": pair_mode,
|
|
411
|
+
"pair_trigger_eligible": pair_trigger_eligible(pair_result),
|
|
412
|
+
"pair_trigger_reasons": pair_trigger_reasons(pair_result),
|
|
413
|
+
"pair_trigger_has_canonical_reason": has_canonical_pair_trigger_reason(
|
|
414
|
+
pair_trigger_reasons(pair_result)
|
|
415
|
+
),
|
|
416
|
+
"pair_trigger_has_hypothesis_reason": (
|
|
417
|
+
"spec.solo_headroom_hypothesis" in pair_trigger_reasons(pair_result)
|
|
418
|
+
),
|
|
142
419
|
"pair_solo_wall_ratio": ratio,
|
|
143
420
|
"reason": "; ".join(reasons),
|
|
144
421
|
}
|
|
@@ -148,27 +425,50 @@ def fmt_ratio(value: Any) -> str:
|
|
|
148
425
|
return f"{value:.2f}x" if isinstance(value, (int, float)) else "n/a"
|
|
149
426
|
|
|
150
427
|
|
|
428
|
+
def fmt_margin(value: Any) -> str:
|
|
429
|
+
return f"{value:+.1f}" if isinstance(value, (int, float)) else "n/a"
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
def fmt_trigger_reasons(value: Any) -> str:
|
|
433
|
+
if not isinstance(value, list) or not all(isinstance(item, str) for item in value):
|
|
434
|
+
return ""
|
|
435
|
+
return ",".join(value)
|
|
436
|
+
|
|
437
|
+
|
|
151
438
|
def write_md(path: pathlib.Path, report: dict[str, Any]) -> None:
|
|
152
439
|
lines = [
|
|
153
440
|
f"# Full-Pipeline Pair Gate - {report['run_id']}",
|
|
154
441
|
"",
|
|
155
442
|
f"Verdict: **{report['verdict']}**",
|
|
156
443
|
"",
|
|
444
|
+
f"Fixtures passed: {report['fixtures_passed']}/{report['fixtures_total']} "
|
|
445
|
+
f"(minimum required: {report['min_fixtures']})",
|
|
446
|
+
"",
|
|
157
447
|
f"Rule: at least {report['min_fixtures']} fixtures; bare <= {report['bare_max']}; "
|
|
158
|
-
f"
|
|
448
|
+
f"bare headroom >= {report['min_bare_headroom_required']}; "
|
|
449
|
+
f"solo_claude <= {report['solo_max']}; "
|
|
450
|
+
f"solo_claude headroom >= {report['min_solo_headroom_required']}; "
|
|
451
|
+
f"{report['pair_arm']} evidence-clean; pair_mode true; "
|
|
452
|
+
"pair_trigger eligible with canonical reason; "
|
|
159
453
|
f"{report['pair_arm']} - solo_claude >= {report['min_pair_margin']}.",
|
|
160
|
-
f"
|
|
454
|
+
f"Average pair margin: {fmt_margin(report['avg_pair_margin'])}",
|
|
455
|
+
f"Allowed pair/solo wall ratio: {fmt_ratio(report['max_pair_solo_wall_ratio'])}",
|
|
456
|
+
f"Maximum observed pair/solo wall ratio: {fmt_ratio(report['max_observed_pair_solo_wall_ratio'])}",
|
|
161
457
|
f"Average pair/solo wall ratio: {fmt_ratio(report['avg_pair_solo_wall_ratio'])}",
|
|
458
|
+
f"Hypothesis trigger required: {str(report['require_hypothesis_trigger']).lower()}",
|
|
162
459
|
"",
|
|
163
|
-
"| Fixture | Bare |
|
|
164
|
-
"
|
|
460
|
+
"| Fixture | Bare | Bare headroom | Solo_claude | Solo_claude headroom | Pair | Margin | Pair mode | Hypothesis trigger | Triggers | Wall ratio | Status | Reason |",
|
|
461
|
+
"|---|---:|---:|---:|---:|---:|---:|---|---|---|---:|---|---|",
|
|
165
462
|
]
|
|
166
463
|
for row in report["rows"]:
|
|
167
464
|
margin = row.get("pair_margin")
|
|
168
465
|
margin_text = f"{margin:+d}" if isinstance(margin, int) else "n/a"
|
|
169
466
|
lines.append(
|
|
170
|
-
f"| {row['fixture']} | {row.get('bare_score')} | {row.get('
|
|
467
|
+
f"| {row['fixture']} | {row.get('bare_score')} | {row.get('bare_headroom')} | "
|
|
468
|
+
f"{row.get('solo_score')} | {row.get('solo_headroom')} | "
|
|
171
469
|
f"{row.get('pair_score')} | {margin_text} | {str(row.get('pair_mode')).lower()} | "
|
|
470
|
+
f"{str(row.get('pair_trigger_has_hypothesis_reason')).lower()} | "
|
|
471
|
+
f"{fmt_trigger_reasons(row.get('pair_trigger_reasons'))} | "
|
|
172
472
|
f"{fmt_ratio(row.get('pair_solo_wall_ratio'))} | {row['status']} | {row.get('reason', '')} |"
|
|
173
473
|
)
|
|
174
474
|
lines.append("")
|
|
@@ -177,38 +477,82 @@ def write_md(path: pathlib.Path, report: dict[str, Any]) -> None:
|
|
|
177
477
|
|
|
178
478
|
def positive_float(value: str) -> float:
|
|
179
479
|
parsed = float(value)
|
|
480
|
+
if not is_strict_number(parsed):
|
|
481
|
+
raise argparse.ArgumentTypeError("value must be finite and > 0")
|
|
482
|
+
return parsed
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
def positive_int(value: str) -> int:
|
|
486
|
+
parsed = int(value)
|
|
180
487
|
if parsed <= 0:
|
|
181
488
|
raise argparse.ArgumentTypeError("value must be > 0")
|
|
182
489
|
return parsed
|
|
183
490
|
|
|
184
491
|
|
|
492
|
+
def non_negative_int(value: str) -> int:
|
|
493
|
+
parsed = int(value)
|
|
494
|
+
if parsed < 0:
|
|
495
|
+
raise argparse.ArgumentTypeError("value must be >= 0")
|
|
496
|
+
return parsed
|
|
497
|
+
|
|
498
|
+
|
|
185
499
|
def main() -> int:
|
|
186
500
|
parser = argparse.ArgumentParser()
|
|
187
501
|
parser.add_argument("--run-id", required=True)
|
|
188
502
|
parser.add_argument("--results-root", default="benchmark/auto-resolve/results", type=pathlib.Path)
|
|
189
503
|
parser.add_argument("--bare-max", type=int, default=60)
|
|
190
504
|
parser.add_argument("--solo-max", type=int, default=80)
|
|
191
|
-
parser.add_argument("--min-
|
|
192
|
-
parser.add_argument("--min-
|
|
193
|
-
parser.add_argument("--pair-
|
|
194
|
-
parser.add_argument("--
|
|
505
|
+
parser.add_argument("--min-bare-headroom", type=non_negative_int, default=5)
|
|
506
|
+
parser.add_argument("--min-solo-headroom", type=non_negative_int, default=5)
|
|
507
|
+
parser.add_argument("--min-pair-margin", type=positive_int, default=5)
|
|
508
|
+
parser.add_argument("--min-fixtures", type=positive_int, default=2)
|
|
509
|
+
parser.add_argument("--pair-arm", default="l2_risk_probes")
|
|
510
|
+
parser.add_argument("--max-pair-solo-wall-ratio", type=positive_float, default=3.0)
|
|
511
|
+
parser.add_argument(
|
|
512
|
+
"--require-hypothesis-trigger",
|
|
513
|
+
action="store_true",
|
|
514
|
+
help="require fixtures with actionable solo-headroom hypotheses to expose spec.solo_headroom_hypothesis in pair_trigger.reasons",
|
|
515
|
+
)
|
|
195
516
|
parser.add_argument("--out-json", type=pathlib.Path)
|
|
196
517
|
parser.add_argument("--out-md", type=pathlib.Path)
|
|
197
518
|
args = parser.parse_args()
|
|
198
519
|
|
|
520
|
+
if args.pair_arm == "l2_forced":
|
|
521
|
+
print(
|
|
522
|
+
"pair-arm l2_forced is retired: it leaks pair-awareness before IMPLEMENT; "
|
|
523
|
+
"use l2_risk_probes for current proof runs or l2_gated for diagnostics.",
|
|
524
|
+
file=sys.stderr,
|
|
525
|
+
)
|
|
526
|
+
return 2
|
|
527
|
+
if args.pair_arm not in ALLOWED_PAIR_ARMS:
|
|
528
|
+
print(
|
|
529
|
+
f"pair-arm must be one of {sorted(ALLOWED_PAIR_ARMS)}: {args.pair_arm}",
|
|
530
|
+
file=sys.stderr,
|
|
531
|
+
)
|
|
532
|
+
return 2
|
|
533
|
+
|
|
199
534
|
run_root = args.results_root / args.run_id
|
|
200
535
|
if not run_root.is_dir():
|
|
201
536
|
print(f"no results dir: {run_root}", file=sys.stderr)
|
|
202
537
|
return 2
|
|
203
538
|
|
|
539
|
+
try:
|
|
540
|
+
rejected_short_ids = load_rejected_short_ids(rejected_registry_path())
|
|
541
|
+
except ValueError as exc:
|
|
542
|
+
print(str(exc), file=sys.stderr)
|
|
543
|
+
return 2
|
|
204
544
|
rows = [
|
|
205
545
|
evaluate_fixture(
|
|
206
546
|
fixture_dir,
|
|
547
|
+
rejected_short_ids=rejected_short_ids,
|
|
207
548
|
pair_arm=args.pair_arm,
|
|
208
549
|
bare_max=args.bare_max,
|
|
209
550
|
solo_max=args.solo_max,
|
|
551
|
+
min_bare_headroom=args.min_bare_headroom,
|
|
552
|
+
min_solo_headroom=args.min_solo_headroom,
|
|
210
553
|
min_pair_margin=args.min_pair_margin,
|
|
211
554
|
max_pair_solo_wall_ratio=args.max_pair_solo_wall_ratio,
|
|
555
|
+
require_hypothesis_trigger=args.require_hypothesis_trigger,
|
|
212
556
|
)
|
|
213
557
|
for fixture_dir in sorted(p for p in run_root.iterdir() if p.is_dir())
|
|
214
558
|
]
|
|
@@ -218,11 +562,24 @@ def main() -> int:
|
|
|
218
562
|
ratios = [
|
|
219
563
|
row["pair_solo_wall_ratio"]
|
|
220
564
|
for row in rows
|
|
221
|
-
if
|
|
565
|
+
if is_strict_number(row.get("pair_solo_wall_ratio"))
|
|
566
|
+
]
|
|
567
|
+
margins = [
|
|
568
|
+
row["pair_margin"]
|
|
569
|
+
for row in rows
|
|
570
|
+
if isinstance(row.get("pair_margin"), int)
|
|
222
571
|
]
|
|
572
|
+
rule = (
|
|
573
|
+
"headroom candidates only; "
|
|
574
|
+
f"bare headroom >= {args.min_bare_headroom}; "
|
|
575
|
+
f"solo_claude headroom >= {args.min_solo_headroom}; "
|
|
576
|
+
f"{args.pair_arm} must be evidence-clean, pair_mode true, "
|
|
577
|
+
"pair_trigger eligible with a canonical reason, and beat solo_claude "
|
|
578
|
+
"by the configured margin"
|
|
579
|
+
)
|
|
223
580
|
report = {
|
|
224
581
|
"run_id": args.run_id,
|
|
225
|
-
"rule":
|
|
582
|
+
"rule": rule,
|
|
226
583
|
"verdict": verdict,
|
|
227
584
|
"fixtures_total": len(rows),
|
|
228
585
|
"fixtures_passed": pass_count,
|
|
@@ -230,9 +587,14 @@ def main() -> int:
|
|
|
230
587
|
"fixture_count_ok": fixture_count_ok,
|
|
231
588
|
"bare_max": args.bare_max,
|
|
232
589
|
"solo_max": args.solo_max,
|
|
590
|
+
"min_bare_headroom_required": args.min_bare_headroom,
|
|
591
|
+
"min_solo_headroom_required": args.min_solo_headroom,
|
|
233
592
|
"min_pair_margin": args.min_pair_margin,
|
|
234
593
|
"pair_arm": args.pair_arm,
|
|
594
|
+
"require_hypothesis_trigger": args.require_hypothesis_trigger,
|
|
235
595
|
"max_pair_solo_wall_ratio": args.max_pair_solo_wall_ratio,
|
|
596
|
+
"max_observed_pair_solo_wall_ratio": max(ratios) if ratios else None,
|
|
597
|
+
"avg_pair_margin": (sum(margins) / len(margins)) if margins else None,
|
|
236
598
|
"avg_pair_solo_wall_ratio": (sum(ratios) / len(ratios)) if ratios else None,
|
|
237
599
|
"rows": rows,
|
|
238
600
|
}
|