devlyn-cli 2.2.2 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +2 -2
- package/CLAUDE.md +4 -4
- package/README.md +85 -34
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
- package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
- package/benchmark/auto-resolve/README.md +307 -44
- package/benchmark/auto-resolve/RUBRIC.md +23 -14
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
- package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
- package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
- package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
- package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
- package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
- package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
- package/benchmark/auto-resolve/scripts/judge.sh +153 -26
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
- package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
- package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
- package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
- package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
- package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
- package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
- package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
- package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
- package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
- package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
- package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
- package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
- package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
- package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
- package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
- package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
- package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
- package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
- package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
- package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
- package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
- package/bin/devlyn.js +221 -17
- package/config/skills/_shared/adapters/README.md +3 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
- package/config/skills/_shared/adapters/opus-4-7.md +9 -1
- package/config/skills/_shared/archive_run.py +78 -6
- package/config/skills/_shared/codex-config.md +5 -4
- package/config/skills/_shared/codex-monitored.sh +46 -1
- package/config/skills/_shared/collect-codex-findings.py +20 -5
- package/config/skills/_shared/engine-preflight.md +17 -13
- package/config/skills/_shared/runtime-principles.md +6 -9
- package/config/skills/_shared/spec-verify-check.py +2664 -107
- package/config/skills/_shared/verify-merge-findings.py +1369 -19
- package/config/skills/devlyn:design-ui/SKILL.md +364 -0
- package/config/skills/devlyn:ideate/SKILL.md +7 -4
- package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
- package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
- package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
- package/config/skills/devlyn:resolve/SKILL.md +78 -26
- package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
- package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
- package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
- package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
- package/package.json +47 -2
- package/scripts/lint-fixtures.sh +349 -0
- package/scripts/lint-shadow-fixtures.sh +58 -0
- package/scripts/lint-skills.sh +3645 -95
|
@@ -14,7 +14,7 @@ ensure_ascii=False, allow_nan=False`, then sha256 the bytes.
|
|
|
14
14
|
Inputs (all required):
|
|
15
15
|
--c1-summary <path> iter-0033 (C1) summary.json (selection grounds; never a comparison baseline)
|
|
16
16
|
--f9-judge <path> iter-0033a F9 judge.json (F9 inclusion proof)
|
|
17
|
-
--l1-rerun-summary <path> L1 rerun summary
|
|
17
|
+
--l1-rerun-summary <path> L1 rerun summary archived for provenance, not selection
|
|
18
18
|
--output <path> destination .devlyn/manifests/iter-0033c-pair-eligible.json
|
|
19
19
|
|
|
20
20
|
Selection rule (frozen pre-registration, iter-0033c §"Pair-eligible fixture set"):
|
|
@@ -25,18 +25,38 @@ Selection rule (frozen pre-registration, iter-0033c §"Pair-eligible fixture set
|
|
|
25
25
|
pair_eligible = high_value ∪ promoted_by_l1_le_l0 ∪ {F9 if iter-0033a passed}
|
|
26
26
|
− reporting_only
|
|
27
27
|
− conditional_excluded that did not get promoted
|
|
28
|
+
− current rejected/ceiling registry
|
|
28
29
|
"""
|
|
29
30
|
import argparse
|
|
30
31
|
import copy
|
|
31
32
|
import hashlib
|
|
32
33
|
import json
|
|
34
|
+
import re
|
|
33
35
|
import subprocess
|
|
34
36
|
import sys
|
|
35
37
|
from pathlib import Path
|
|
36
38
|
|
|
39
|
+
SCRIPT_DIR = Path(__file__).resolve().parent
|
|
40
|
+
if str(SCRIPT_DIR) not in sys.path:
|
|
41
|
+
sys.path.insert(0, str(SCRIPT_DIR))
|
|
42
|
+
|
|
43
|
+
from pair_evidence_contract import is_score, reject_json_constant
|
|
44
|
+
|
|
37
45
|
HIGH_VALUE = ["F2", "F3", "F4", "F6", "F7"]
|
|
38
46
|
CONDITIONAL = ["F1", "F5"]
|
|
39
47
|
REPORTING_ONLY = ["F8"]
|
|
48
|
+
REJECTED_REGISTRY = Path(__file__).with_name("pair-rejected-fixtures.sh")
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def exact_bool(value: object) -> bool | None:
|
|
52
|
+
return value if isinstance(value, bool) else None
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def disqualifier_flag(value: object, *, default: bool = False) -> bool:
    """Interpret a raw ``disqualifier`` field, failing closed on bad data.

    Args:
        value: The raw field value as read from JSON.
        default: Result to use when the field is absent (``None``).

    Returns:
        ``default`` when ``value`` is ``None``; the value itself when it is a
        real ``bool``; ``True`` for anything else.
    """
    if value is None:
        return default
    parsed = exact_bool(value)
    # A present-but-non-boolean value is treated as disqualifying rather than
    # being silently read as "not disqualified".
    return True if parsed is None else parsed
|
|
40
60
|
|
|
41
61
|
|
|
42
62
|
def file_sha256(path: Path) -> str:
|
|
@@ -62,28 +82,111 @@ def fixture_short_id(full: str) -> str:
|
|
|
62
82
|
return full.split("-", 1)[0] if "-" in full else full
|
|
63
83
|
|
|
64
84
|
|
|
85
|
+
def load_rejected_fixture_reasons(path: Path) -> dict[str, str]:
    """Parse the rejected-fixture shell registry into ``{short_id: reason}``.

    The registry is scanned line by line. A label line shaped like
    ``F16-*|F16)`` (the same id on both sides of the ``|``) opens an entry,
    and the next line shaped like ``echo "<reason>"`` supplies its reason and
    closes the entry. ``echo`` lines seen while no entry is open are ignored,
    and a label whose two ids differ does not open an entry.

    Args:
        path: Location of the registry script.

    Returns:
        Mapping of short fixture id to reason text, ordered by id letter and
        then by numeric suffix (so ``F2`` sorts before ``F10``).

    Raises:
        ValueError: If ``path`` is not an existing regular file. Undecodable
            bytes surface as ``UnicodeDecodeError``, itself a ``ValueError``.
    """
    if not path.is_file():
        raise ValueError(f"rejected fixture registry not found: {path}")

    # Compile once instead of re-resolving both patterns on every line.
    label_re = re.compile(r"\s*([FS]\d+)-\*\|([FS]\d+)\)")
    reason_re = re.compile(r'\s*echo "([^"]+)"')

    rejected: dict[str, str] = {}
    current: str | None = None
    # Decode explicitly as UTF-8 so the extracted reason text does not depend
    # on the locale of the machine running the script.
    for line in path.read_text(encoding="utf-8").splitlines():
        label = label_re.match(line)
        if label and label.group(1) == label.group(2):
            current = label.group(1)
            continue
        reason = reason_re.match(line)
        if current and reason:
            rejected[current] = reason.group(1)
            current = None

    def _id_order(item: tuple[str, str]) -> tuple[str, int]:
        short_id = item[0]
        return short_id[0], int(short_id[1:])

    return dict(sorted(rejected.items(), key=_id_order))
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def load_rejected_short_ids(path: Path) -> list[str]:
    """Return only the short fixture ids listed in the rejected registry.

    The ids come back in the same order as the mapping produced by
    ``load_rejected_fixture_reasons``; any error raised there propagates.
    """
    return list(load_rejected_fixture_reasons(path))
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def load_json_object(path: Path, label: str) -> dict:
    """Read ``path`` and return its top-level JSON object.

    Args:
        path: File to read.
        label: Human-readable input name used as the prefix of error messages.

    Returns:
        The decoded top-level object.

    Raises:
        ValueError: If the file cannot be read, is not valid JSON, or its
            top-level value is not an object. Every failure is reported as
            ``ValueError`` so a caller can handle them with one clause.
    """
    try:
        text = path.read_text()
    except OSError as exc:
        # Permission problems, a directory in place of a file, etc. would
        # otherwise escape as an unlabelled traceback.
        raise ValueError(f"{label} unreadable: {exc}") from exc
    except ValueError as exc:
        # Undecodable bytes (UnicodeDecodeError is a ValueError subclass).
        raise ValueError(f"{label} malformed: invalid JSON") from exc

    try:
        # parse_constant is invoked for the NaN/Infinity/-Infinity tokens;
        # the hook comes from pair_evidence_contract and presumably rejects
        # them -- confirm against that module.
        data = json.loads(text, parse_constant=reject_json_constant)
    except ValueError as exc:
        # json.JSONDecodeError is a ValueError subclass, so one clause covers
        # both decoder errors and whatever the hook raises.
        raise ValueError(f"{label} malformed: invalid JSON") from exc

    if not isinstance(data, dict):
        raise ValueError(f"{label} malformed: expected object")
    return data
|
|
114
|
+
|
|
115
|
+
|
|
65
116
|
def compute_promoted_l1_le_l0(c1_rows: list) -> list:
    """Return short fixture IDs (e.g. 'F3') where solo_claude.score <= bare.score in C1.

    Rows are skipped, never raised on, when they are malformed:

    * the row itself is not a dict;
    * either arm carries a disqualifier (or a non-boolean disqualifier value);
    * either score fails ``is_score`` (imported from pair_evidence_contract;
      its exact acceptance rules are not visible here);
    * the ``fixture`` field is not a string.

    A missing or non-dict ``arms`` / arm entry is treated as an empty dict,
    which then fails the score check.
    """
    promoted = []
    for row in c1_rows:
        if not isinstance(row, dict):
            continue

        raw_arms = row.get("arms")
        arms = raw_arms if isinstance(raw_arms, dict) else {}
        raw_solo = arms.get("solo_claude")
        raw_bare = arms.get("bare")
        solo_arm = raw_solo if isinstance(raw_solo, dict) else {}
        bare_arm = raw_bare if isinstance(raw_bare, dict) else {}

        if (
            disqualifier_flag(solo_arm.get("disqualifier"))
            or disqualifier_flag(bare_arm.get("disqualifier"))
        ):
            continue

        solo = solo_arm.get("score")
        bare = bare_arm.get("score")
        if not is_score(solo) or not is_score(bare):
            continue

        if solo <= bare:
            fixture = row.get("fixture")
            if isinstance(fixture, str):
                promoted.append(fixture_short_id(fixture))
    return promoted
|
|
77
142
|
|
|
78
143
|
|
|
144
|
+
def mapped_score(judge: dict, arm: str) -> int | None:
|
|
145
|
+
mapping = judge.get("_blind_mapping")
|
|
146
|
+
if not isinstance(mapping, dict):
|
|
147
|
+
return None
|
|
148
|
+
letter = next((slot for slot, mapped in mapping.items() if mapped == arm), None)
|
|
149
|
+
if letter is None:
|
|
150
|
+
return None
|
|
151
|
+
raw_scores = judge.get("scores_by_arm")
|
|
152
|
+
scores = raw_scores if isinstance(raw_scores, dict) else {}
|
|
153
|
+
score = scores.get(arm)
|
|
154
|
+
if is_score(score):
|
|
155
|
+
return score
|
|
156
|
+
legacy = judge.get(f"{letter.lower()}_score")
|
|
157
|
+
return legacy if is_score(legacy) else None
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def mapped_disqualifier(judge: dict, arm: str) -> bool:
    """Report whether ``arm`` is disqualified, failing closed on bad data.

    Resolution order:

    1. ``True`` if ``_blind_mapping`` is not a dict or maps no slot to ``arm``.
    2. ``True`` if ``disqualifiers_by_arm`` is present but not a dict.
    3. If ``disqualifiers_by_arm`` has an entry for ``arm``, that entry decides:
       a dict entry is read through its ``"disqualifier"`` key, any other
       entry is interpreted directly.
    4. ``True`` if the legacy ``disqualifiers`` table is present but not a dict.
    5. Otherwise the legacy table is read by slot letter; a missing entry
       means not disqualified.

    Args:
        judge: Decoded judge document.
        arm: Arm name to resolve.

    Returns:
        ``True`` when the arm is disqualified or the data cannot be trusted.
    """
    mapping = judge.get("_blind_mapping")
    if not isinstance(mapping, dict):
        return True
    letter = next((slot for slot, mapped in mapping.items() if mapped == arm), None)
    if letter is None:
        return True

    raw_by_arm = judge.get("disqualifiers_by_arm")
    if raw_by_arm is not None and not isinstance(raw_by_arm, dict):
        return True
    by_arm = raw_by_arm if isinstance(raw_by_arm, dict) else {}
    if arm in by_arm:
        entry = by_arm.get(arm)
        return disqualifier_flag(
            entry.get("disqualifier") if isinstance(entry, dict) else entry
        )

    raw_legacy = judge.get("disqualifiers")
    if raw_legacy is not None and not isinstance(raw_legacy, dict):
        return True
    legacy = raw_legacy if isinstance(raw_legacy, dict) else {}
    return disqualifier_flag(legacy.get(letter))
|
|
181
|
+
|
|
182
|
+
|
|
79
183
|
def f9_passed(f9_judge: dict) -> bool:
    """iter-0033a passed iff solo_claude beats bare and solo is not disqualified.

    Returns ``False`` when either arm's score cannot be resolved through
    ``mapped_score``; a tie does not count as a pass.
    """
    solo = mapped_score(f9_judge, "solo_claude")
    bare = mapped_score(f9_judge, "bare")
    if solo is None or bare is None:
        return False
    return solo > bare and not mapped_disqualifier(f9_judge, "solo_claude")
|
|
87
190
|
|
|
88
191
|
|
|
89
192
|
def head_sha() -> str:
|
|
@@ -114,10 +217,21 @@ def main() -> int:
|
|
|
114
217
|
print(f"error: {label} not found: {p}", file=sys.stderr)
|
|
115
218
|
return 2
|
|
116
219
|
|
|
117
|
-
|
|
118
|
-
|
|
220
|
+
try:
|
|
221
|
+
c1 = load_json_object(c1_path, "c1-summary")
|
|
222
|
+
f9 = load_json_object(f9_path, "f9-judge")
|
|
223
|
+
rejected_reasons = load_rejected_fixture_reasons(REJECTED_REGISTRY)
|
|
224
|
+
rejected_short_ids = list(rejected_reasons)
|
|
225
|
+
except ValueError as exc:
|
|
226
|
+
print(f"error: {exc}", file=sys.stderr)
|
|
227
|
+
return 2
|
|
228
|
+
|
|
229
|
+
c1_rows = c1.get("rows")
|
|
230
|
+
if not isinstance(c1_rows, list):
|
|
231
|
+
print("error: c1-summary malformed: rows must be an array", file=sys.stderr)
|
|
232
|
+
return 2
|
|
119
233
|
|
|
120
|
-
promoted = compute_promoted_l1_le_l0(
|
|
234
|
+
promoted = compute_promoted_l1_le_l0(c1_rows)
|
|
121
235
|
f9_in = f9_passed(f9)
|
|
122
236
|
|
|
123
237
|
pair_eligible = list(HIGH_VALUE) # frozen high-value list, ordered
|
|
@@ -127,10 +241,23 @@ def main() -> int:
|
|
|
127
241
|
if f9_in and "F9" not in pair_eligible:
|
|
128
242
|
pair_eligible.append("F9")
|
|
129
243
|
pair_eligible = [fx for fx in pair_eligible if fx not in REPORTING_ONLY]
|
|
244
|
+
rejected_excluded = sorted(
|
|
245
|
+
{fx for fx in pair_eligible if fx in rejected_short_ids},
|
|
246
|
+
key=lambda s: (s[0], int(s[1:])),
|
|
247
|
+
)
|
|
248
|
+
pair_eligible = [fx for fx in pair_eligible if fx not in rejected_short_ids]
|
|
130
249
|
|
|
131
250
|
conditional_promoted = [fx for fx in CONDITIONAL if fx in promoted]
|
|
132
251
|
conditional_excluded = [fx for fx in CONDITIONAL if fx not in promoted]
|
|
133
252
|
pair_eligible_sorted = sorted(pair_eligible, key=lambda s: (s[0], int(s[1:])))
|
|
253
|
+
if not pair_eligible_sorted:
|
|
254
|
+
rejected_text = ", ".join(rejected_excluded) if rejected_excluded else "none"
|
|
255
|
+
print(
|
|
256
|
+
"error: no pair-eligible fixtures remain after rejected-registry filtering "
|
|
257
|
+
f"(rejected_excluded={rejected_text})",
|
|
258
|
+
file=sys.stderr,
|
|
259
|
+
)
|
|
260
|
+
return 1
|
|
134
261
|
|
|
135
262
|
gate3_total = len(pair_eligible_sorted)
|
|
136
263
|
gate3_threshold = (gate3_total + 1) // 2 # ≥50% — ceil(gate3_total / 2)
|
|
@@ -152,6 +279,11 @@ def main() -> int:
|
|
|
152
279
|
"reporting_only": REPORTING_ONLY,
|
|
153
280
|
"conditional_excluded": conditional_excluded,
|
|
154
281
|
"conditional_promoted": conditional_promoted,
|
|
282
|
+
"rejected_registry": str(REJECTED_REGISTRY),
|
|
283
|
+
"rejected_excluded": rejected_excluded,
|
|
284
|
+
"rejected_excluded_reasons": {
|
|
285
|
+
fixture: rejected_reasons[fixture] for fixture in rejected_excluded
|
|
286
|
+
},
|
|
155
287
|
},
|
|
156
288
|
"fixtures_pair_eligible": pair_eligible_sorted,
|
|
157
289
|
"gate3_threshold_count": gate3_threshold,
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
|
-
"""F9
|
|
2
|
+
"""F9 skill-driven arm artifact + transcript fingerprint check.
|
|
3
3
|
|
|
4
4
|
Out-of-band per Codex R0.5 §B (iter-0033a): expected.json.verification_commands
|
|
5
5
|
apply to ALL arms (run-fixture.sh:472), so a `docs/specs/**` check there would
|
|
6
|
-
punish bare. This script runs AFTER run-fixture.sh and asserts
|
|
6
|
+
punish bare. This script runs AFTER run-fixture.sh and asserts skill-driven
|
|
7
7
|
arms produced the artifacts the 2-skill ideate→resolve chain should emit.
|
|
8
8
|
|
|
9
9
|
Bare arm is exempt by construction.
|
|
@@ -13,7 +13,7 @@ Usage:
|
|
|
13
13
|
|
|
14
14
|
Exits:
|
|
15
15
|
0 — all checks pass (or bare arm — exempt).
|
|
16
|
-
1 —
|
|
16
|
+
1 — skill-driven arm but artifact contract violated.
|
|
17
17
|
2 — invalid invocation (missing args, missing dir).
|
|
18
18
|
|
|
19
19
|
Emits a small JSON report at <result-dir>/check-f9-artifacts.json.
|
|
@@ -25,8 +25,10 @@ import re
|
|
|
25
25
|
import sys
|
|
26
26
|
from pathlib import Path
|
|
27
27
|
|
|
28
|
+
from pair_evidence_contract import loads_strict_json_object
|
|
28
29
|
|
|
29
|
-
|
|
30
|
+
|
|
31
|
+
SKILL_DRIVEN_ARMS = {"variant", "solo_claude", "l2_gated", "l2_risk_probes", "l2_forced"}
|
|
30
32
|
EXEMPT_ARMS = {"bare"}
|
|
31
33
|
|
|
32
34
|
SPEC_DIR_GLOB = "docs/specs/*/spec.md"
|
|
@@ -39,6 +41,18 @@ RE_AUTO_RESOLVE = re.compile(r"/devlyn:auto-resolve\b")
|
|
|
39
41
|
RE_PREFLIGHT = re.compile(r"/devlyn:preflight\b")
|
|
40
42
|
|
|
41
43
|
|
|
44
|
+
def _load_json_object(path: Path) -> tuple[dict | None, str | None]:
|
|
45
|
+
try:
|
|
46
|
+
data = loads_strict_json_object(path.read_text())
|
|
47
|
+
except json.JSONDecodeError as exc:
|
|
48
|
+
return None, f"{exc.__class__.__name__}: {exc}"
|
|
49
|
+
except ValueError as exc:
|
|
50
|
+
if str(exc) == "top-level JSON value must be an object":
|
|
51
|
+
return None, "expected JSON object"
|
|
52
|
+
return None, f"{exc.__class__.__name__}: {exc}"
|
|
53
|
+
return data, None
|
|
54
|
+
|
|
55
|
+
|
|
42
56
|
def main() -> int:
|
|
43
57
|
p = argparse.ArgumentParser(description=__doc__.split("\n", 1)[0])
|
|
44
58
|
p.add_argument("--result-dir", required=True,
|
|
@@ -71,8 +85,8 @@ def main() -> int:
|
|
|
71
85
|
_write_report(result_dir, report)
|
|
72
86
|
return 0
|
|
73
87
|
|
|
74
|
-
if arm not in
|
|
75
|
-
print(f"error: unknown arm '{arm}' (expected one of {
|
|
88
|
+
if arm not in SKILL_DRIVEN_ARMS:
|
|
89
|
+
print(f"error: unknown arm '{arm}' (expected one of {SKILL_DRIVEN_ARMS | EXEMPT_ARMS})",
|
|
76
90
|
file=sys.stderr)
|
|
77
91
|
return 2
|
|
78
92
|
|
|
@@ -81,13 +95,13 @@ def main() -> int:
|
|
|
81
95
|
timing_path = result_dir / "timing.json"
|
|
82
96
|
work_dir: Path
|
|
83
97
|
if timing_path.is_file():
|
|
84
|
-
|
|
85
|
-
|
|
98
|
+
timing, _timing_error = _load_json_object(timing_path)
|
|
99
|
+
if timing is not None:
|
|
86
100
|
work_dir = Path(timing.get("work_dir", ""))
|
|
87
|
-
|
|
88
|
-
work_dir = Path("")
|
|
101
|
+
else:
|
|
102
|
+
work_dir = Path("__invalid_timing_work_dir__")
|
|
89
103
|
else:
|
|
90
|
-
work_dir = Path("")
|
|
104
|
+
work_dir = Path("__missing_timing_work_dir__")
|
|
91
105
|
|
|
92
106
|
if not work_dir.is_dir():
|
|
93
107
|
report["checks"].append({
|
|
@@ -163,16 +177,14 @@ def main() -> int:
|
|
|
163
177
|
else:
|
|
164
178
|
# Read the most recent run.
|
|
165
179
|
state_path = sorted(state_paths)[-1]
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
except Exception as exc:
|
|
180
|
+
state, state_error = _load_json_object(state_path)
|
|
181
|
+
if state is None:
|
|
169
182
|
report["checks"].append({
|
|
170
183
|
"name": "pipeline.state.json-parses",
|
|
171
184
|
"pass": False,
|
|
172
|
-
"reason":
|
|
185
|
+
"reason": state_error,
|
|
173
186
|
})
|
|
174
187
|
report["pass"] = False
|
|
175
|
-
state = None
|
|
176
188
|
|
|
177
189
|
if state is not None:
|
|
178
190
|
archived = "/runs/" in str(state_path)
|
|
@@ -8,6 +8,8 @@ import json
|
|
|
8
8
|
from pathlib import Path
|
|
9
9
|
from typing import Any
|
|
10
10
|
|
|
11
|
+
from pair_evidence_contract import reject_json_constant
|
|
12
|
+
|
|
11
13
|
|
|
12
14
|
def read_jsonl(path: Path) -> list[dict[str, Any]]:
|
|
13
15
|
rows: list[dict[str, Any]] = []
|
|
@@ -15,7 +17,7 @@ def read_jsonl(path: Path) -> list[dict[str, Any]]:
|
|
|
15
17
|
for line_no, line in enumerate(f, start=1):
|
|
16
18
|
if not line.strip():
|
|
17
19
|
continue
|
|
18
|
-
value = json.loads(line)
|
|
20
|
+
value = json.loads(line, parse_constant=reject_json_constant)
|
|
19
21
|
if not isinstance(value, dict):
|
|
20
22
|
raise ValueError(f"{path}:{line_no}: expected JSON object")
|
|
21
23
|
rows.append(value)
|
|
@@ -36,11 +38,17 @@ def instance_ids_from_jsonl(path: Path | None) -> set[str] | None:
|
|
|
36
38
|
|
|
37
39
|
def collect_from_root(root: Path, patch_name: str, keep: set[str] | None) -> list[tuple[str, Path]]:
|
|
38
40
|
patches: list[tuple[str, Path]] = []
|
|
41
|
+
seen: set[str] = set()
|
|
39
42
|
for patch_path in sorted(root.glob(f"*/{patch_name}")):
|
|
40
43
|
instance_id = patch_path.parent.name
|
|
41
44
|
if keep is not None and instance_id not in keep:
|
|
42
45
|
continue
|
|
46
|
+
seen.add(instance_id)
|
|
43
47
|
patches.append((instance_id, patch_path))
|
|
48
|
+
if keep is not None:
|
|
49
|
+
missing = sorted(keep - seen)
|
|
50
|
+
if missing:
|
|
51
|
+
raise ValueError(f"missing {patch_name} for instance ids: {', '.join(missing)}")
|
|
44
52
|
return patches
|
|
45
53
|
|
|
46
54
|
|
|
@@ -81,6 +89,8 @@ def main() -> int:
|
|
|
81
89
|
+ "\n"
|
|
82
90
|
)
|
|
83
91
|
written += 1
|
|
92
|
+
if written == 0:
|
|
93
|
+
raise ValueError("no non-empty patches collected")
|
|
84
94
|
|
|
85
95
|
report = {
|
|
86
96
|
"patch_root": str(args.patch_root),
|