devlyn-cli 2.2.2 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +2 -2
- package/CLAUDE.md +4 -4
- package/README.md +85 -34
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
- package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
- package/benchmark/auto-resolve/README.md +307 -44
- package/benchmark/auto-resolve/RUBRIC.md +23 -14
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
- package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
- package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
- package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
- package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
- package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
- package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
- package/benchmark/auto-resolve/scripts/judge.sh +153 -26
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
- package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
- package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
- package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
- package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
- package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
- package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
- package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
- package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
- package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
- package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
- package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
- package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
- package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
- package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
- package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
- package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
- package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
- package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
- package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
- package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
- package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
- package/bin/devlyn.js +221 -17
- package/config/skills/_shared/adapters/README.md +3 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
- package/config/skills/_shared/adapters/opus-4-7.md +9 -1
- package/config/skills/_shared/archive_run.py +78 -6
- package/config/skills/_shared/codex-config.md +5 -4
- package/config/skills/_shared/codex-monitored.sh +46 -1
- package/config/skills/_shared/collect-codex-findings.py +20 -5
- package/config/skills/_shared/engine-preflight.md +17 -13
- package/config/skills/_shared/runtime-principles.md +6 -9
- package/config/skills/_shared/spec-verify-check.py +2664 -107
- package/config/skills/_shared/verify-merge-findings.py +1369 -19
- package/config/skills/devlyn:design-ui/SKILL.md +364 -0
- package/config/skills/devlyn:ideate/SKILL.md +7 -4
- package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
- package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
- package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
- package/config/skills/devlyn:resolve/SKILL.md +78 -26
- package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
- package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
- package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
- package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
- package/package.json +47 -2
- package/scripts/lint-fixtures.sh +349 -0
- package/scripts/lint-shadow-fixtures.sh +58 -0
- package/scripts/lint-skills.sh +3645 -95
|
@@ -14,10 +14,19 @@ from __future__ import annotations
|
|
|
14
14
|
|
|
15
15
|
import argparse
|
|
16
16
|
import json
|
|
17
|
+
import math
|
|
17
18
|
import re
|
|
18
19
|
from pathlib import Path
|
|
19
20
|
from typing import Any
|
|
20
21
|
|
|
22
|
+
from pair_evidence_contract import (
|
|
23
|
+
all_known_pair_trigger_reasons,
|
|
24
|
+
has_canonical_pair_trigger_reason,
|
|
25
|
+
has_known_pair_trigger_reason,
|
|
26
|
+
path_has_actionable_solo_headroom_hypothesis,
|
|
27
|
+
reject_json_constant,
|
|
28
|
+
)
|
|
29
|
+
|
|
21
30
|
|
|
22
31
|
VERDICT_RANK = {
|
|
23
32
|
"PASS": 0,
|
|
@@ -31,8 +40,35 @@ def load_compare(results_root: Path, run_id: str) -> dict[str, Any]:
|
|
|
31
40
|
compare_path = results_root / run_id / "compare.json"
|
|
32
41
|
if not compare_path.exists():
|
|
33
42
|
raise FileNotFoundError(f"missing compare.json for {run_id}: {compare_path}")
|
|
34
|
-
|
|
35
|
-
|
|
43
|
+
try:
|
|
44
|
+
data = json.loads(
|
|
45
|
+
compare_path.read_text(encoding="utf8"),
|
|
46
|
+
parse_constant=reject_json_constant,
|
|
47
|
+
)
|
|
48
|
+
except (json.JSONDecodeError, ValueError) as exc:
|
|
49
|
+
raise ValueError(f"malformed compare.json for {run_id}: invalid JSON") from exc
|
|
50
|
+
if not isinstance(data, dict):
|
|
51
|
+
raise ValueError(f"malformed compare.json for {run_id}: expected object")
|
|
52
|
+
return data
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def object_field(payload: dict[str, Any], key: str) -> dict[str, Any]:
|
|
56
|
+
value = payload.get(key)
|
|
57
|
+
return value if isinstance(value, dict) else {}
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def verdict_field(payload: dict[str, Any], key: str) -> str | None:
|
|
61
|
+
value = payload.get(key)
|
|
62
|
+
return value if isinstance(value, str) else None
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def number_field(payload: dict[str, Any], key: str) -> int | float | None:
|
|
66
|
+
value = payload.get(key)
|
|
67
|
+
if isinstance(value, bool):
|
|
68
|
+
return None
|
|
69
|
+
if not isinstance(value, (int, float)) or not math.isfinite(value):
|
|
70
|
+
return None
|
|
71
|
+
return value
|
|
36
72
|
|
|
37
73
|
|
|
38
74
|
def rank(verdict: str | None) -> int:
|
|
@@ -42,11 +78,49 @@ def rank(verdict: str | None) -> int:
|
|
|
42
78
|
def elapsed_ratio(pair_elapsed: Any, solo_elapsed: Any) -> float | None:
|
|
43
79
|
if not isinstance(pair_elapsed, (int, float)) or not isinstance(solo_elapsed, (int, float)):
|
|
44
80
|
return None
|
|
45
|
-
if solo_elapsed <= 0:
|
|
81
|
+
if pair_elapsed <= 0 or solo_elapsed <= 0:
|
|
46
82
|
return None
|
|
47
83
|
return pair_elapsed / solo_elapsed
|
|
48
84
|
|
|
49
85
|
|
|
86
|
+
def pair_trigger_failures(pair: dict[str, Any]) -> list[str]:
|
|
87
|
+
trigger = pair.get("pair_trigger")
|
|
88
|
+
if not isinstance(trigger, dict):
|
|
89
|
+
return ["pair_trigger missing or malformed"]
|
|
90
|
+
eligible = trigger.get("eligible")
|
|
91
|
+
reasons = trigger.get("reasons")
|
|
92
|
+
skipped_reason = trigger.get("skipped_reason")
|
|
93
|
+
if not isinstance(eligible, bool):
|
|
94
|
+
return ["pair_trigger.eligible malformed"]
|
|
95
|
+
if not isinstance(reasons, list) or not all(isinstance(reason, str) for reason in reasons):
|
|
96
|
+
return ["pair_trigger.reasons malformed"]
|
|
97
|
+
if skipped_reason is not None and not isinstance(skipped_reason, str):
|
|
98
|
+
return ["pair_trigger.skipped_reason malformed"]
|
|
99
|
+
if eligible is not True:
|
|
100
|
+
return ["pair_trigger not eligible"]
|
|
101
|
+
if not reasons:
|
|
102
|
+
return ["pair_trigger eligible with empty reasons"]
|
|
103
|
+
if not has_known_pair_trigger_reason(reasons):
|
|
104
|
+
return ["pair_trigger reasons missing known trigger reason"]
|
|
105
|
+
if not all_known_pair_trigger_reasons(reasons):
|
|
106
|
+
return ["pair_trigger reasons contain unknown trigger reason"]
|
|
107
|
+
if not has_canonical_pair_trigger_reason(reasons):
|
|
108
|
+
return ["pair_trigger reasons missing canonical trigger reason"]
|
|
109
|
+
if skipped_reason is not None:
|
|
110
|
+
return ["pair_trigger eligible with skipped_reason"]
|
|
111
|
+
return []
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def pair_trigger_reasons(pair: dict[str, Any]) -> list[str]:
|
|
115
|
+
trigger = pair.get("pair_trigger")
|
|
116
|
+
if not isinstance(trigger, dict):
|
|
117
|
+
return []
|
|
118
|
+
reasons = trigger.get("reasons")
|
|
119
|
+
if not isinstance(reasons, list) or not all(isinstance(reason, str) for reason in reasons):
|
|
120
|
+
return []
|
|
121
|
+
return reasons
|
|
122
|
+
|
|
123
|
+
|
|
50
124
|
def infer_fixture_id(results_root: Path, run_id: str) -> str | None:
|
|
51
125
|
run_root = results_root / run_id
|
|
52
126
|
for arm in ("pair", "solo"):
|
|
@@ -74,10 +148,11 @@ def evaluate_run(
|
|
|
74
148
|
fixtures_root: Path,
|
|
75
149
|
run_id: str,
|
|
76
150
|
max_pair_solo_wall_ratio: float | None,
|
|
151
|
+
require_hypothesis_trigger: bool,
|
|
77
152
|
) -> dict[str, Any]:
|
|
78
153
|
try:
|
|
79
154
|
compare = load_compare(results_root, run_id)
|
|
80
|
-
except FileNotFoundError as exc:
|
|
155
|
+
except (FileNotFoundError, ValueError) as exc:
|
|
81
156
|
fixture_id = infer_fixture_id(results_root, run_id)
|
|
82
157
|
return {
|
|
83
158
|
"run_id": run_id,
|
|
@@ -97,9 +172,9 @@ def evaluate_run(
|
|
|
97
172
|
"pair_solo_wall_ratio": None,
|
|
98
173
|
"pair_severity_counts": {},
|
|
99
174
|
}
|
|
100
|
-
solo = compare
|
|
101
|
-
pair = compare
|
|
102
|
-
comparison = compare
|
|
175
|
+
solo = object_field(compare, "solo")
|
|
176
|
+
pair = object_field(compare, "pair")
|
|
177
|
+
comparison = object_field(compare, "comparison")
|
|
103
178
|
solo_failure_reason = solo.get("invoke_failure_reason") or transcript_failure_reason(
|
|
104
179
|
results_root, run_id, "solo"
|
|
105
180
|
)
|
|
@@ -112,6 +187,20 @@ def evaluate_run(
|
|
|
112
187
|
failures.append("solo timed out")
|
|
113
188
|
if pair.get("timed_out"):
|
|
114
189
|
failures.append("pair timed out")
|
|
190
|
+
if solo.get("invoke_failure"):
|
|
191
|
+
reason = solo.get("invoke_failure_reason")
|
|
192
|
+
failures.append(f"solo invoke failure ({reason})" if reason else "solo invoke failure")
|
|
193
|
+
if pair.get("invoke_failure"):
|
|
194
|
+
reason = pair.get("invoke_failure_reason")
|
|
195
|
+
failures.append(f"pair invoke failure ({reason})" if reason else "pair invoke failure")
|
|
196
|
+
if solo.get("environment_contamination"):
|
|
197
|
+
failures.append("solo environment contamination")
|
|
198
|
+
if pair.get("environment_contamination"):
|
|
199
|
+
failures.append("pair environment contamination")
|
|
200
|
+
if solo.get("disqualifier"):
|
|
201
|
+
failures.append("solo disqualifier")
|
|
202
|
+
if pair.get("disqualifier"):
|
|
203
|
+
failures.append("pair disqualifier")
|
|
115
204
|
if solo_failure_reason == "provider_limit":
|
|
116
205
|
failures.append("solo provider limit")
|
|
117
206
|
if pair_failure_reason == "provider_limit":
|
|
@@ -120,27 +209,39 @@ def evaluate_run(
|
|
|
120
209
|
failures.append(f"solo invoke_exit={solo.get('invoke_exit')}")
|
|
121
210
|
if pair.get("invoke_exit") != 0:
|
|
122
211
|
failures.append(f"pair invoke_exit={pair.get('invoke_exit')}")
|
|
123
|
-
|
|
212
|
+
pair_mode = pair.get("pair_mode") is True
|
|
213
|
+
if not pair_mode:
|
|
124
214
|
failures.append("pair_mode false")
|
|
125
|
-
|
|
215
|
+
failures.extend(pair_trigger_failures(pair))
|
|
216
|
+
trigger_reasons = pair_trigger_reasons(pair)
|
|
217
|
+
pair_trigger_missed = comparison.get("pair_trigger_missed") is True
|
|
218
|
+
if pair_trigger_missed:
|
|
126
219
|
failures.append("pair trigger missed")
|
|
127
|
-
external_lift =
|
|
128
|
-
internal_lift =
|
|
220
|
+
external_lift = comparison.get("pair_verdict_lift") is True
|
|
221
|
+
internal_lift = comparison.get("pair_internal_verdict_lift") is True
|
|
129
222
|
if not (external_lift or internal_lift):
|
|
130
223
|
failures.append("pair verdict lift false")
|
|
131
224
|
|
|
132
225
|
solo_verdict = (
|
|
133
|
-
comparison
|
|
134
|
-
or solo
|
|
135
|
-
or solo
|
|
226
|
+
verdict_field(comparison, "solo_verdict")
|
|
227
|
+
or verdict_field(solo, "verify_verdict")
|
|
228
|
+
or verdict_field(solo, "terminal_verdict")
|
|
136
229
|
)
|
|
137
230
|
pair_verdict = (
|
|
138
|
-
comparison
|
|
139
|
-
or pair
|
|
140
|
-
or pair
|
|
231
|
+
verdict_field(comparison, "pair_verdict")
|
|
232
|
+
or verdict_field(pair, "verify_verdict")
|
|
233
|
+
or verdict_field(pair, "terminal_verdict")
|
|
141
234
|
)
|
|
142
|
-
pair_primary_verdict = comparison
|
|
143
|
-
pair_judge_verdict = comparison
|
|
235
|
+
pair_primary_verdict = verdict_field(comparison, "pair_primary_verdict")
|
|
236
|
+
pair_judge_verdict = verdict_field(comparison, "pair_judge_verdict")
|
|
237
|
+
if solo_verdict is None:
|
|
238
|
+
failures.append("solo verdict missing or malformed")
|
|
239
|
+
if pair_verdict is None:
|
|
240
|
+
failures.append("pair verdict missing or malformed")
|
|
241
|
+
if internal_lift and pair_primary_verdict is None:
|
|
242
|
+
failures.append("pair primary verdict missing or malformed")
|
|
243
|
+
if internal_lift and pair_judge_verdict is None:
|
|
244
|
+
failures.append("pair judge verdict missing or malformed")
|
|
144
245
|
if external_lift and rank(pair_verdict) <= rank(solo_verdict):
|
|
145
246
|
failures.append(f"pair verdict {pair_verdict} not stricter than solo {solo_verdict}")
|
|
146
247
|
if internal_lift and rank(pair_judge_verdict) <= rank(pair_primary_verdict):
|
|
@@ -149,8 +250,8 @@ def evaluate_run(
|
|
|
149
250
|
)
|
|
150
251
|
if rank(pair_verdict) < VERDICT_RANK["NEEDS_WORK"]:
|
|
151
252
|
failures.append(f"pair verdict {pair_verdict} is not verdict-binding")
|
|
152
|
-
pair_elapsed = pair
|
|
153
|
-
solo_elapsed = solo
|
|
253
|
+
pair_elapsed = number_field(pair, "elapsed_seconds")
|
|
254
|
+
solo_elapsed = number_field(solo, "elapsed_seconds")
|
|
154
255
|
wall_ratio = elapsed_ratio(pair_elapsed, solo_elapsed)
|
|
155
256
|
if max_pair_solo_wall_ratio is not None:
|
|
156
257
|
if wall_ratio is None:
|
|
@@ -164,6 +265,12 @@ def evaluate_run(
|
|
|
164
265
|
failures.append("fixture_id missing")
|
|
165
266
|
elif not (fixtures_root / fixture_id).is_dir():
|
|
166
267
|
failures.append(f"fixture_id not found: {fixture_id}")
|
|
268
|
+
elif (
|
|
269
|
+
require_hypothesis_trigger
|
|
270
|
+
and path_has_actionable_solo_headroom_hypothesis(fixtures_root / fixture_id / "spec.md")
|
|
271
|
+
and "spec.solo_headroom_hypothesis" not in trigger_reasons
|
|
272
|
+
):
|
|
273
|
+
failures.append("pair_trigger missing spec.solo_headroom_hypothesis")
|
|
167
274
|
|
|
168
275
|
return {
|
|
169
276
|
"run_id": run_id,
|
|
@@ -172,8 +279,10 @@ def evaluate_run(
|
|
|
172
279
|
"failures": failures,
|
|
173
280
|
"solo_verdict": solo_verdict,
|
|
174
281
|
"pair_verdict": pair_verdict,
|
|
175
|
-
"pair_mode":
|
|
176
|
-
"
|
|
282
|
+
"pair_mode": pair_mode,
|
|
283
|
+
"pair_trigger_reasons": trigger_reasons,
|
|
284
|
+
"pair_trigger_has_canonical_reason": has_canonical_pair_trigger_reason(trigger_reasons),
|
|
285
|
+
"pair_trigger_missed": pair_trigger_missed,
|
|
177
286
|
"pair_verdict_lift": external_lift,
|
|
178
287
|
"pair_internal_verdict_lift": internal_lift,
|
|
179
288
|
"pair_primary_verdict": pair_primary_verdict,
|
|
@@ -183,7 +292,7 @@ def evaluate_run(
|
|
|
183
292
|
"pair_solo_wall_ratio": wall_ratio,
|
|
184
293
|
"solo_failure_reason": solo_failure_reason,
|
|
185
294
|
"pair_failure_reason": pair_failure_reason,
|
|
186
|
-
"pair_severity_counts": pair
|
|
295
|
+
"pair_severity_counts": object_field(pair, "severity_counts"),
|
|
187
296
|
}
|
|
188
297
|
|
|
189
298
|
|
|
@@ -193,6 +302,12 @@ def format_ratio(value: Any) -> str:
|
|
|
193
302
|
return "n/a"
|
|
194
303
|
|
|
195
304
|
|
|
305
|
+
def format_trigger_reasons(value: Any) -> str:
|
|
306
|
+
if not isinstance(value, list) or not all(isinstance(item, str) for item in value):
|
|
307
|
+
return ""
|
|
308
|
+
return ",".join(value)
|
|
309
|
+
|
|
310
|
+
|
|
196
311
|
def write_markdown(path: Path, report: dict[str, Any]) -> None:
|
|
197
312
|
lines = [
|
|
198
313
|
f"# Frozen VERIFY Gate — {report['run_ids_label']}",
|
|
@@ -208,15 +323,17 @@ def write_markdown(path: Path, report: dict[str, Any]) -> None:
|
|
|
208
323
|
f"Max pair/solo wall ratio: {format_ratio(report.get('max_pair_solo_wall_ratio'))}",
|
|
209
324
|
f"Average pair/solo wall ratio: {format_ratio(report.get('avg_pair_solo_wall_ratio'))}",
|
|
210
325
|
"",
|
|
211
|
-
"| Run | Fixture | Solo | Pair | Pair mode | Wall ratio | External lift | Internal lift | Status | Reason |",
|
|
212
|
-
"
|
|
326
|
+
"| Run | Fixture | Solo VERIFY | Pair VERIFY | Pair mode | Triggers | Wall ratio | External lift | Internal lift | Status | Reason |",
|
|
327
|
+
"|---|---|---|---|---|---|---|---|---|---|---|",
|
|
213
328
|
]
|
|
214
329
|
for row in report["rows"]:
|
|
215
330
|
reason = "; ".join(row["failures"]) if row["failures"] else "ok"
|
|
216
331
|
lines.append(
|
|
217
332
|
f"| {row['run_id']} | {row.get('fixture_id') or 'unknown'} | "
|
|
218
333
|
f"{row['solo_verdict']} | {row['pair_verdict']} | "
|
|
219
|
-
f"{str(row['pair_mode']).lower()} |
|
|
334
|
+
f"{str(row['pair_mode']).lower()} | "
|
|
335
|
+
f"{format_trigger_reasons(row.get('pair_trigger_reasons'))} | "
|
|
336
|
+
f"{format_ratio(row.get('pair_solo_wall_ratio'))} | "
|
|
220
337
|
f"{str(row['pair_verdict_lift']).lower()} | "
|
|
221
338
|
f"{str(row['pair_internal_verdict_lift']).lower()} | "
|
|
222
339
|
f"{row['status']} | {reason} |"
|
|
@@ -225,17 +342,36 @@ def write_markdown(path: Path, report: dict[str, Any]) -> None:
|
|
|
225
342
|
path.write_text("\n".join(lines), encoding="utf8")
|
|
226
343
|
|
|
227
344
|
|
|
345
|
+
def positive_int(value: str) -> int:
|
|
346
|
+
parsed = int(value)
|
|
347
|
+
if parsed <= 0:
|
|
348
|
+
raise argparse.ArgumentTypeError("value must be > 0")
|
|
349
|
+
return parsed
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
def positive_float(value: str) -> float:
|
|
353
|
+
parsed = float(value)
|
|
354
|
+
if parsed <= 0:
|
|
355
|
+
raise argparse.ArgumentTypeError("value must be > 0")
|
|
356
|
+
return parsed
|
|
357
|
+
|
|
358
|
+
|
|
228
359
|
def main() -> int:
|
|
229
360
|
parser = argparse.ArgumentParser()
|
|
230
361
|
parser.add_argument("--results-root", default="benchmark/auto-resolve/results")
|
|
231
362
|
parser.add_argument("--fixtures-root", default="benchmark/auto-resolve/fixtures")
|
|
232
363
|
parser.add_argument("--run-id", action="append", required=True)
|
|
233
|
-
parser.add_argument("--min-runs", type=
|
|
364
|
+
parser.add_argument("--min-runs", type=positive_int, default=2)
|
|
234
365
|
parser.add_argument(
|
|
235
366
|
"--max-pair-solo-wall-ratio",
|
|
236
|
-
type=
|
|
367
|
+
type=positive_float,
|
|
237
368
|
help="Optional efficiency cap. When set, every run must include elapsed_seconds and pair/solo wall ratio must not exceed this value.",
|
|
238
369
|
)
|
|
370
|
+
parser.add_argument(
|
|
371
|
+
"--require-hypothesis-trigger",
|
|
372
|
+
action="store_true",
|
|
373
|
+
help="require fixtures with actionable solo-headroom hypotheses to expose spec.solo_headroom_hypothesis in pair_trigger.reasons",
|
|
374
|
+
)
|
|
239
375
|
parser.add_argument("--out-json")
|
|
240
376
|
parser.add_argument("--out-md")
|
|
241
377
|
args = parser.parse_args()
|
|
@@ -243,7 +379,13 @@ def main() -> int:
|
|
|
243
379
|
results_root = Path(args.results_root)
|
|
244
380
|
fixtures_root = Path(args.fixtures_root)
|
|
245
381
|
rows = [
|
|
246
|
-
evaluate_run(
|
|
382
|
+
evaluate_run(
|
|
383
|
+
results_root,
|
|
384
|
+
fixtures_root,
|
|
385
|
+
run_id,
|
|
386
|
+
args.max_pair_solo_wall_ratio,
|
|
387
|
+
args.require_hypothesis_trigger,
|
|
388
|
+
)
|
|
247
389
|
for run_id in args.run_id
|
|
248
390
|
]
|
|
249
391
|
fixture_counts: dict[str, int] = {}
|
|
@@ -262,6 +404,9 @@ def main() -> int:
|
|
|
262
404
|
row["pair_solo_wall_ratio"]
|
|
263
405
|
for row in rows
|
|
264
406
|
if isinstance(row.get("pair_solo_wall_ratio"), (int, float))
|
|
407
|
+
and not isinstance(row.get("pair_solo_wall_ratio"), bool)
|
|
408
|
+
and math.isfinite(row["pair_solo_wall_ratio"])
|
|
409
|
+
and row["pair_solo_wall_ratio"] > 0
|
|
265
410
|
]
|
|
266
411
|
|
|
267
412
|
report = {
|