devlyn-cli 2.3.0 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +1 -1
- package/CLAUDE.md +2 -2
- package/README.md +80 -29
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
- package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
- package/benchmark/auto-resolve/README.md +307 -44
- package/benchmark/auto-resolve/RUBRIC.md +23 -14
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
- package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
- package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
- package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
- package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
- package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
- package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
- package/benchmark/auto-resolve/scripts/judge.sh +153 -26
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
- package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
- package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
- package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
- package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
- package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
- package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
- package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
- package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
- package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
- package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
- package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
- package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
- package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
- package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
- package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
- package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
- package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
- package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
- package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
- package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
- package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
- package/bin/devlyn.js +210 -17
- package/config/skills/_shared/adapters/README.md +3 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
- package/config/skills/_shared/adapters/opus-4-7.md +9 -1
- package/config/skills/_shared/archive_run.py +78 -6
- package/config/skills/_shared/codex-config.md +3 -2
- package/config/skills/_shared/codex-monitored.sh +46 -1
- package/config/skills/_shared/collect-codex-findings.py +20 -5
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/runtime-principles.md +5 -8
- package/config/skills/_shared/spec-verify-check.py +2664 -107
- package/config/skills/_shared/verify-merge-findings.py +1369 -19
- package/config/skills/devlyn:ideate/SKILL.md +7 -4
- package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
- package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
- package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
- package/config/skills/devlyn:resolve/SKILL.md +49 -18
- package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
- package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
- package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
- package/package.json +47 -2
- package/scripts/lint-fixtures.sh +349 -0
- package/scripts/lint-shadow-fixtures.sh +58 -0
- package/scripts/lint-skills.sh +3642 -92
- /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
|
@@ -10,52 +10,210 @@ from __future__ import annotations
|
|
|
10
10
|
|
|
11
11
|
import argparse
|
|
12
12
|
import json
|
|
13
|
+
import os
|
|
13
14
|
import pathlib
|
|
15
|
+
import re
|
|
14
16
|
import sys
|
|
15
17
|
|
|
18
|
+
SCRIPT_DIR = pathlib.Path(__file__).resolve().parent
|
|
19
|
+
if str(SCRIPT_DIR) not in sys.path:
|
|
20
|
+
sys.path.insert(0, str(SCRIPT_DIR))
|
|
16
21
|
|
|
17
|
-
|
|
22
|
+
from pair_evidence_contract import is_score, loads_strict_json_object
|
|
23
|
+
|
|
24
|
+
KNOWN_ARMS = {"bare", "solo_claude"}
|
|
25
|
+
REJECTED_REGISTRY = pathlib.Path(__file__).with_name("pair-rejected-fixtures.sh")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def load_json(path: pathlib.Path) -> tuple[dict | None, str | None]:
|
|
18
29
|
if not path.is_file():
|
|
30
|
+
return None, "missing"
|
|
31
|
+
try:
|
|
32
|
+
data = loads_strict_json_object(path.read_text())
|
|
33
|
+
except (ValueError, json.JSONDecodeError):
|
|
34
|
+
return None, "malformed"
|
|
35
|
+
return data, None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def bool_flag_failure(value: object, true_reason: str, malformed_reason: str) -> str | None:
|
|
39
|
+
if value is True:
|
|
40
|
+
return true_reason
|
|
41
|
+
if value is False or value is None:
|
|
19
42
|
return None
|
|
20
|
-
return
|
|
43
|
+
return malformed_reason
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def fixture_short(name: str) -> str:
|
|
47
|
+
return name.split("-", 1)[0] if "-" in name else name
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def rejected_registry_path() -> pathlib.Path:
|
|
51
|
+
override = os.environ.get("PAIR_REJECTED_FIXTURES_REGISTRY")
|
|
52
|
+
return pathlib.Path(override) if override else REJECTED_REGISTRY
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def load_rejected_short_ids(path: pathlib.Path) -> set[str]:
|
|
56
|
+
if not path.is_file():
|
|
57
|
+
raise ValueError(f"rejected fixture registry missing: {path}")
|
|
58
|
+
rejected = set()
|
|
59
|
+
for line in path.read_text().splitlines():
|
|
60
|
+
match = re.match(r"\s*([FS]\d+)-\*\|([FS]\d+)\)", line)
|
|
61
|
+
if match and match.group(1) == match.group(2):
|
|
62
|
+
rejected.add(match.group(1))
|
|
63
|
+
if not rejected:
|
|
64
|
+
raise ValueError(f"rejected fixture registry has no fixture entries: {path}")
|
|
65
|
+
return rejected
|
|
21
66
|
|
|
22
67
|
|
|
23
68
|
def score_for(judge: dict, arm: str) -> int | None:
|
|
24
|
-
|
|
69
|
+
mapping = judge.get("_blind_mapping")
|
|
70
|
+
if not isinstance(mapping, dict):
|
|
71
|
+
return None
|
|
72
|
+
if arm not in {mapped for slot, mapped in mapping.items() if slot in {"A", "B", "C"}}:
|
|
73
|
+
return None
|
|
74
|
+
raw_scores = judge.get("scores_by_arm")
|
|
75
|
+
scores = raw_scores if isinstance(raw_scores, dict) else {}
|
|
25
76
|
value = scores.get(arm)
|
|
26
|
-
return value if
|
|
77
|
+
return value if is_score(value) else None
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def axis_validation_counts(judge: dict) -> tuple[dict[str, int], int]:
|
|
81
|
+
raw_mapping = judge.get("_blind_mapping")
|
|
82
|
+
mapping = raw_mapping if isinstance(raw_mapping, dict) else {}
|
|
83
|
+
raw_validation = judge.get("_axis_validation")
|
|
84
|
+
validation = raw_validation if isinstance(raw_validation, dict) else {}
|
|
85
|
+
cells = validation.get("out_of_range_cells") or []
|
|
86
|
+
declared_count = validation.get("out_of_range_count")
|
|
87
|
+
total_invalid = max(
|
|
88
|
+
declared_count if isinstance(declared_count, int) else 0,
|
|
89
|
+
len(cells) if isinstance(cells, list) else 0,
|
|
90
|
+
)
|
|
91
|
+
breakdown_to_letter = {
|
|
92
|
+
"a_breakdown": "A",
|
|
93
|
+
"b_breakdown": "B",
|
|
94
|
+
"c_breakdown": "C",
|
|
95
|
+
}
|
|
96
|
+
counts: dict[str, int] = {}
|
|
97
|
+
mapped_count = 0
|
|
98
|
+
if not isinstance(cells, list):
|
|
99
|
+
return counts, total_invalid
|
|
100
|
+
for cell in cells:
|
|
101
|
+
if not isinstance(cell, dict):
|
|
102
|
+
continue
|
|
103
|
+
letter = breakdown_to_letter.get(cell.get("breakdown"))
|
|
104
|
+
arm = mapping.get(letter) if letter else None
|
|
105
|
+
if arm in KNOWN_ARMS:
|
|
106
|
+
counts[arm] = counts.get(arm, 0) + 1
|
|
107
|
+
mapped_count += 1
|
|
108
|
+
return counts, max(0, total_invalid - mapped_count)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def axis_invalid_count(judge: dict, arm: str) -> int:
|
|
112
|
+
counts, _ = axis_validation_counts(judge)
|
|
113
|
+
return counts.get(arm, 0)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def axis_unmapped_invalid_count(judge: dict) -> int:
|
|
117
|
+
_, unmapped = axis_validation_counts(judge)
|
|
118
|
+
return unmapped
|
|
27
119
|
|
|
28
120
|
|
|
29
|
-
def
|
|
121
|
+
def blind_mapping_failures(judge: dict, required_arms: set[str]) -> list[str]:
|
|
122
|
+
mapping = judge.get("_blind_mapping")
|
|
123
|
+
if not isinstance(mapping, dict):
|
|
124
|
+
return ["judge blind mapping missing"]
|
|
125
|
+
mapped_arms = {arm for key, arm in mapping.items() if key in {"A", "B", "C"}}
|
|
126
|
+
missing = sorted(required_arms - mapped_arms)
|
|
127
|
+
if missing:
|
|
128
|
+
return [f"judge blind mapping missing arm(s): {', '.join(missing)}"]
|
|
129
|
+
return []
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def arm_complete_failures(fixture_dir: pathlib.Path, judge: dict, arm: str) -> list[str]:
|
|
30
133
|
failures: list[str] = []
|
|
31
|
-
result = load_json(fixture_dir / arm / "result.json")
|
|
32
|
-
verify = load_json(fixture_dir / arm / "verify.json")
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
if
|
|
39
|
-
failures.append(f"{arm}
|
|
134
|
+
result, result_error = load_json(fixture_dir / arm / "result.json")
|
|
135
|
+
verify, verify_error = load_json(fixture_dir / arm / "verify.json")
|
|
136
|
+
diff = fixture_dir / arm / "diff.patch"
|
|
137
|
+
if result_error:
|
|
138
|
+
failures.append(f"{arm} result.json {result_error}")
|
|
139
|
+
if verify_error:
|
|
140
|
+
failures.append(f"{arm} verify.json {verify_error}")
|
|
141
|
+
if not diff.is_file():
|
|
142
|
+
failures.append(f"{arm} diff.patch missing")
|
|
143
|
+
raw_dq_by_arm = judge.get("disqualifiers_by_arm")
|
|
144
|
+
dq_by_arm = raw_dq_by_arm if isinstance(raw_dq_by_arm, dict) else {}
|
|
145
|
+
dq_entry = dq_by_arm.get(arm)
|
|
146
|
+
dq_value = dq_entry.get("disqualifier") if isinstance(dq_entry, dict) else dq_entry
|
|
147
|
+
judge_dq_failure = bool_flag_failure(
|
|
148
|
+
dq_value,
|
|
149
|
+
f"{arm} judge disqualifier",
|
|
150
|
+
f"{arm} judge disqualifier malformed",
|
|
151
|
+
)
|
|
152
|
+
if judge_dq_failure:
|
|
153
|
+
failures.append(judge_dq_failure)
|
|
154
|
+
axis_invalid = axis_invalid_count(judge, arm)
|
|
155
|
+
if axis_invalid > 0:
|
|
156
|
+
failures.append(f"{arm} judge axis-invalid ({axis_invalid})")
|
|
40
157
|
if result is not None:
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
158
|
+
for field, true_reason in (
|
|
159
|
+
("disqualifier", f"{arm} result disqualifier"),
|
|
160
|
+
("timed_out", f"{arm} timed out"),
|
|
161
|
+
("invoke_failure", f"{arm} invoke failure"),
|
|
162
|
+
("environment_contamination", f"{arm} environment contamination"),
|
|
163
|
+
):
|
|
164
|
+
failure = bool_flag_failure(
|
|
165
|
+
result.get(field),
|
|
166
|
+
true_reason,
|
|
167
|
+
f"{arm} result {field} malformed",
|
|
168
|
+
)
|
|
169
|
+
if failure:
|
|
170
|
+
failures.append(failure)
|
|
171
|
+
if verify is not None:
|
|
172
|
+
verify_dq_failure = bool_flag_failure(
|
|
173
|
+
verify.get("disqualifier"),
|
|
174
|
+
f"{arm} verify disqualifier",
|
|
175
|
+
f"{arm} verify disqualifier malformed",
|
|
176
|
+
)
|
|
177
|
+
if verify_dq_failure:
|
|
178
|
+
failures.append(verify_dq_failure)
|
|
49
179
|
return failures
|
|
50
180
|
|
|
51
181
|
|
|
182
|
+
def positive_int(value: str) -> int:
|
|
183
|
+
parsed = int(value)
|
|
184
|
+
if parsed <= 0:
|
|
185
|
+
raise argparse.ArgumentTypeError("value must be > 0")
|
|
186
|
+
return parsed
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def non_negative_int(value: str) -> int:
|
|
190
|
+
parsed = int(value)
|
|
191
|
+
if parsed < 0:
|
|
192
|
+
raise argparse.ArgumentTypeError("value must be >= 0")
|
|
193
|
+
return parsed
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def remaining_headroom(score: int | None, max_score: int) -> int | None:
|
|
197
|
+
return max_score - score if isinstance(score, int) else None
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def average(values: list[int]) -> float | None:
|
|
201
|
+
return (sum(values) / len(values)) if values else None
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def fmt_float(value: float | None) -> str:
|
|
205
|
+
return f"{value:.1f}" if isinstance(value, (int, float)) else "n/a"
|
|
206
|
+
|
|
207
|
+
|
|
52
208
|
def main() -> int:
|
|
53
209
|
parser = argparse.ArgumentParser()
|
|
54
210
|
parser.add_argument("--run-id", required=True)
|
|
55
211
|
parser.add_argument("--results-root", default="benchmark/auto-resolve/results")
|
|
56
212
|
parser.add_argument("--bare-max", type=int, default=60)
|
|
57
213
|
parser.add_argument("--solo-max", type=int, default=80)
|
|
58
|
-
parser.add_argument("--min-
|
|
214
|
+
parser.add_argument("--min-bare-headroom", type=non_negative_int, default=5)
|
|
215
|
+
parser.add_argument("--min-solo-headroom", type=non_negative_int, default=5)
|
|
216
|
+
parser.add_argument("--min-fixtures", type=positive_int, default=2)
|
|
59
217
|
parser.add_argument("--out-json", default=None)
|
|
60
218
|
parser.add_argument("--out-md", default=None)
|
|
61
219
|
args = parser.parse_args()
|
|
@@ -66,52 +224,117 @@ def main() -> int:
|
|
|
66
224
|
return 2
|
|
67
225
|
|
|
68
226
|
rows = []
|
|
227
|
+
try:
|
|
228
|
+
rejected_short_ids = load_rejected_short_ids(rejected_registry_path())
|
|
229
|
+
except ValueError as exc:
|
|
230
|
+
print(str(exc), file=sys.stderr)
|
|
231
|
+
return 2
|
|
69
232
|
for fixture_dir in sorted(p for p in res_root.iterdir() if p.is_dir()):
|
|
70
|
-
judge = load_json(fixture_dir / "judge.json")
|
|
233
|
+
judge, judge_error = load_json(fixture_dir / "judge.json")
|
|
71
234
|
if judge is None:
|
|
72
235
|
rows.append({
|
|
73
236
|
"fixture": fixture_dir.name,
|
|
74
237
|
"status": "MISSING_JUDGE",
|
|
75
|
-
"reason": "judge.json
|
|
238
|
+
"reason": f"judge.json {judge_error}",
|
|
76
239
|
})
|
|
77
240
|
continue
|
|
78
241
|
bare = score_for(judge, "bare")
|
|
79
242
|
solo = score_for(judge, "solo_claude")
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
243
|
+
bare_headroom = remaining_headroom(bare, args.bare_max)
|
|
244
|
+
solo_headroom = remaining_headroom(solo, args.solo_max)
|
|
245
|
+
bare_complete_failures = arm_complete_failures(fixture_dir, judge, "bare")
|
|
246
|
+
solo_complete_failures = arm_complete_failures(fixture_dir, judge, "solo_claude")
|
|
247
|
+
unmapped_axis_invalid = axis_unmapped_invalid_count(judge)
|
|
248
|
+
mapping_failures = blind_mapping_failures(judge, KNOWN_ARMS)
|
|
249
|
+
rejected = fixture_short(fixture_dir.name) in rejected_short_ids
|
|
250
|
+
bare_headroom_ok = (
|
|
251
|
+
isinstance(bare_headroom, int)
|
|
252
|
+
and bare_headroom >= args.min_bare_headroom
|
|
253
|
+
)
|
|
254
|
+
solo_headroom_ok = (
|
|
255
|
+
isinstance(solo_headroom, int)
|
|
256
|
+
and solo_headroom >= args.min_solo_headroom
|
|
257
|
+
)
|
|
258
|
+
bare_ok = (
|
|
259
|
+
bare is not None
|
|
260
|
+
and bare <= args.bare_max
|
|
261
|
+
and bare_headroom_ok
|
|
262
|
+
and not bare_complete_failures
|
|
263
|
+
)
|
|
264
|
+
solo_ok = (
|
|
265
|
+
solo is not None
|
|
266
|
+
and solo <= args.solo_max
|
|
267
|
+
and solo_headroom_ok
|
|
268
|
+
and not solo_complete_failures
|
|
269
|
+
)
|
|
270
|
+
judge_ok = unmapped_axis_invalid == 0 and not mapping_failures
|
|
271
|
+
status = "PASS" if bare_ok and solo_ok and judge_ok and not rejected else "FAIL"
|
|
85
272
|
reasons = []
|
|
86
273
|
if bare is None:
|
|
87
274
|
reasons.append("bare score missing")
|
|
88
275
|
elif bare > args.bare_max:
|
|
89
276
|
reasons.append(f"bare score {bare} > {args.bare_max}")
|
|
277
|
+
elif bare_headroom is not None and bare_headroom < args.min_bare_headroom:
|
|
278
|
+
reasons.append(
|
|
279
|
+
f"bare headroom {bare_headroom} < {args.min_bare_headroom}"
|
|
280
|
+
)
|
|
90
281
|
if solo is None:
|
|
91
282
|
reasons.append("solo_claude score missing")
|
|
92
283
|
elif solo > args.solo_max:
|
|
93
284
|
reasons.append(f"solo_claude score {solo} > {args.solo_max}")
|
|
94
|
-
|
|
95
|
-
|
|
285
|
+
elif solo_headroom is not None and solo_headroom < args.min_solo_headroom:
|
|
286
|
+
reasons.append(
|
|
287
|
+
f"solo_claude headroom {solo_headroom} < {args.min_solo_headroom}"
|
|
288
|
+
)
|
|
289
|
+
if unmapped_axis_invalid > 0:
|
|
290
|
+
reasons.append(f"judge axis-invalid unmapped ({unmapped_axis_invalid})")
|
|
291
|
+
reasons.extend(mapping_failures)
|
|
292
|
+
if rejected:
|
|
293
|
+
reasons.append("fixture rejected for pair-candidate runs")
|
|
294
|
+
reasons.extend(bare_complete_failures)
|
|
295
|
+
reasons.extend(solo_complete_failures)
|
|
96
296
|
rows.append({
|
|
97
297
|
"fixture": fixture_dir.name,
|
|
98
298
|
"status": status,
|
|
99
299
|
"bare_score": bare,
|
|
100
300
|
"solo_score": solo,
|
|
301
|
+
"bare_headroom": bare_headroom,
|
|
302
|
+
"solo_headroom": solo_headroom,
|
|
101
303
|
"reason": "; ".join(reasons) if reasons else "",
|
|
102
304
|
})
|
|
103
305
|
|
|
104
306
|
pass_count = sum(1 for row in rows if row["status"] == "PASS")
|
|
105
307
|
fixture_count_ok = len(rows) >= args.min_fixtures
|
|
106
308
|
verdict = "PASS" if pass_count == len(rows) and rows and fixture_count_ok else "FAIL"
|
|
309
|
+
bare_headrooms = [
|
|
310
|
+
value for row in rows
|
|
311
|
+
if isinstance((value := row.get("bare_headroom")), int)
|
|
312
|
+
]
|
|
313
|
+
solo_headrooms = [
|
|
314
|
+
value for row in rows
|
|
315
|
+
if isinstance((value := row.get("solo_headroom")), int)
|
|
316
|
+
]
|
|
107
317
|
payload = {
|
|
108
318
|
"run_id": args.run_id,
|
|
109
|
-
"rule":
|
|
319
|
+
"rule": (
|
|
320
|
+
f"at least {args.min_fixtures} candidate fixtures; each must satisfy "
|
|
321
|
+
f"bare <= {args.bare_max} with headroom >= {args.min_bare_headroom}, "
|
|
322
|
+
f"solo_claude <= {args.solo_max} with headroom >= {args.min_solo_headroom}, "
|
|
323
|
+
"with both baseline arms evidence-complete"
|
|
324
|
+
),
|
|
110
325
|
"verdict": verdict,
|
|
111
326
|
"fixtures_total": len(rows),
|
|
112
327
|
"fixtures_passed": pass_count,
|
|
113
328
|
"min_fixtures": args.min_fixtures,
|
|
329
|
+
"bare_max": args.bare_max,
|
|
330
|
+
"solo_max": args.solo_max,
|
|
331
|
+
"min_bare_headroom_required": args.min_bare_headroom,
|
|
332
|
+
"min_solo_headroom_required": args.min_solo_headroom,
|
|
114
333
|
"fixture_count_ok": fixture_count_ok,
|
|
334
|
+
"avg_bare_headroom": average(bare_headrooms),
|
|
335
|
+
"min_bare_headroom": min(bare_headrooms) if bare_headrooms else None,
|
|
336
|
+
"avg_solo_headroom": average(solo_headrooms),
|
|
337
|
+
"min_solo_headroom": min(solo_headrooms) if solo_headrooms else None,
|
|
115
338
|
"rows": rows,
|
|
116
339
|
}
|
|
117
340
|
|
|
@@ -123,16 +346,24 @@ def main() -> int:
|
|
|
123
346
|
"",
|
|
124
347
|
f"Verdict: **{verdict}**",
|
|
125
348
|
"",
|
|
126
|
-
f"
|
|
127
|
-
|
|
349
|
+
f"Fixtures passed: {pass_count}/{len(rows)} (minimum required: {args.min_fixtures})",
|
|
350
|
+
"",
|
|
351
|
+
f"Rule: at least {args.min_fixtures} fixtures; bare <= {args.bare_max} "
|
|
352
|
+
f"with headroom >= {args.min_bare_headroom}, solo_claude <= {args.solo_max} "
|
|
353
|
+
f"with headroom >= {args.min_solo_headroom}, both baseline arms evidence-complete.",
|
|
354
|
+
f"Average bare headroom: {fmt_float(payload['avg_bare_headroom'])}",
|
|
355
|
+
f"Minimum bare headroom: {payload['min_bare_headroom'] if payload['min_bare_headroom'] is not None else 'n/a'}",
|
|
356
|
+
f"Average solo_claude headroom: {fmt_float(payload['avg_solo_headroom'])}",
|
|
357
|
+
f"Minimum solo_claude headroom: {payload['min_solo_headroom'] if payload['min_solo_headroom'] is not None else 'n/a'}",
|
|
128
358
|
"",
|
|
129
|
-
"| Fixture | Bare |
|
|
130
|
-
"
|
|
359
|
+
"| Fixture | Bare | Bare headroom | Solo_claude | Solo_claude headroom | Status | Reason |",
|
|
360
|
+
"|---|---:|---:|---:|---:|---|---|",
|
|
131
361
|
]
|
|
132
362
|
for row in rows:
|
|
133
363
|
lines.append(
|
|
134
|
-
f"| {row['fixture']} | {row.get('bare_score')} | {row.get('
|
|
135
|
-
f"{row
|
|
364
|
+
f"| {row['fixture']} | {row.get('bare_score')} | {row.get('bare_headroom')} | "
|
|
365
|
+
f"{row.get('solo_score')} | {row.get('solo_headroom')} | {row['status']} | "
|
|
366
|
+
f"{row.get('reason', '')} |"
|
|
136
367
|
)
|
|
137
368
|
report = "\n".join(lines) + "\n"
|
|
138
369
|
if args.out_md:
|