devlyn-cli 2.2.2 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +2 -2
- package/CLAUDE.md +4 -4
- package/README.md +85 -34
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
- package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
- package/benchmark/auto-resolve/README.md +307 -44
- package/benchmark/auto-resolve/RUBRIC.md +23 -14
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
- package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
- package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
- package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
- package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
- package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
- package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
- package/benchmark/auto-resolve/scripts/judge.sh +153 -26
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
- package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
- package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
- package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
- package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
- package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
- package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
- package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
- package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
- package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
- package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
- package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
- package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
- package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
- package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
- package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
- package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
- package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
- package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
- package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
- package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
- package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
- package/bin/devlyn.js +221 -17
- package/config/skills/_shared/adapters/README.md +3 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
- package/config/skills/_shared/adapters/opus-4-7.md +9 -1
- package/config/skills/_shared/archive_run.py +78 -6
- package/config/skills/_shared/codex-config.md +5 -4
- package/config/skills/_shared/codex-monitored.sh +46 -1
- package/config/skills/_shared/collect-codex-findings.py +20 -5
- package/config/skills/_shared/engine-preflight.md +17 -13
- package/config/skills/_shared/runtime-principles.md +6 -9
- package/config/skills/_shared/spec-verify-check.py +2664 -107
- package/config/skills/_shared/verify-merge-findings.py +1369 -19
- package/config/skills/devlyn:design-ui/SKILL.md +364 -0
- package/config/skills/devlyn:ideate/SKILL.md +7 -4
- package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
- package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
- package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
- package/config/skills/devlyn:resolve/SKILL.md +78 -26
- package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
- package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
- package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
- package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
- package/package.json +47 -2
- package/scripts/lint-fixtures.sh +349 -0
- package/scripts/lint-shadow-fixtures.sh +58 -0
- package/scripts/lint-skills.sh +3645 -95
|
@@ -27,12 +27,12 @@ Gates per iter-0033c §"Acceptance gate":
|
|
|
27
27
|
6 trigger discipline (fixture-level): for each pair-eligible fixture, if
|
|
28
28
|
l2_forced lifts ≥ +5 OR catches categorical rescue, AND forced is not
|
|
29
29
|
impl-confounded, AND forced.pair_judge present → l2_gated MUST also have
|
|
30
|
-
pair_judge
|
|
30
|
+
recognized pair_judge verdict on that fixture.
|
|
31
31
|
7 attribution (4-class, data-only): per-fixture classify into
|
|
32
32
|
{no_material_lift, implementation_confounded, tool_or_trigger_lift,
|
|
33
33
|
deliberation_lift}. Reporting only; not pass/fail.
|
|
34
|
-
8 artifact contract: pair_judge
|
|
35
|
-
pair findings distinguishable from solo judge findings.
|
|
34
|
+
8 artifact contract: recognized pair_judge verdict for every fixture where
|
|
35
|
+
pair fired; pair findings distinguishable from solo judge findings.
|
|
36
36
|
|
|
37
37
|
Ship-blockers: 1a, 1b, 1c, 2, 3, 4, 6.
|
|
38
38
|
Quality gates: 5, 8 (failure → root-cause iter; Phase 4 holds).
|
|
@@ -43,19 +43,55 @@ import json
|
|
|
43
43
|
import sys
|
|
44
44
|
from pathlib import Path
|
|
45
45
|
|
|
46
|
+
SCRIPT_DIR = Path(__file__).resolve().parent
|
|
47
|
+
if str(SCRIPT_DIR) not in sys.path:
|
|
48
|
+
sys.path.insert(0, str(SCRIPT_DIR))
|
|
49
|
+
|
|
50
|
+
from pair_evidence_contract import (
|
|
51
|
+
is_score,
|
|
52
|
+
is_strict_number,
|
|
53
|
+
loads_strict_json_object,
|
|
54
|
+
reject_json_constant,
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
PAIR_VERDICTS = {"PASS", "PASS_WITH_ISSUES", "NEEDS_WORK", "BLOCKED", "FAIL"}
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def exact_bool(value: object) -> bool | None:
|
|
61
|
+
return value if isinstance(value, bool) else None
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def bool_flag(value: object, *, default: bool = False) -> bool:
|
|
65
|
+
if value is None:
|
|
66
|
+
return default
|
|
67
|
+
parsed = exact_bool(value)
|
|
68
|
+
return parsed if parsed is not None else True
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def is_pair_judge_verdict(value: object) -> bool:
|
|
72
|
+
return value in PAIR_VERDICTS
|
|
73
|
+
|
|
46
74
|
|
|
47
75
|
def load_judge(results_dir: Path, fixture: str) -> dict | None:
|
|
48
76
|
p = results_dir / fixture / "judge.json"
|
|
49
77
|
if not p.is_file():
|
|
50
78
|
return None
|
|
51
|
-
|
|
79
|
+
try:
|
|
80
|
+
data = loads_strict_json_object(p.read_text())
|
|
81
|
+
except (ValueError, json.JSONDecodeError):
|
|
82
|
+
return None
|
|
83
|
+
return data
|
|
52
84
|
|
|
53
85
|
|
|
54
86
|
def load_result(results_dir: Path, fixture: str, arm: str) -> dict | None:
|
|
55
87
|
p = results_dir / fixture / arm / "result.json"
|
|
56
88
|
if not p.is_file():
|
|
57
89
|
return None
|
|
58
|
-
|
|
90
|
+
try:
|
|
91
|
+
data = loads_strict_json_object(p.read_text())
|
|
92
|
+
except (ValueError, json.JSONDecodeError):
|
|
93
|
+
return None
|
|
94
|
+
return data
|
|
59
95
|
|
|
60
96
|
|
|
61
97
|
def load_state(work_dir_root: Path, run_id: str, fixture: str, arm: str) -> dict | None:
|
|
@@ -67,7 +103,11 @@ def load_state(work_dir_root: Path, run_id: str, fixture: str, arm: str) -> dict
|
|
|
67
103
|
candidates = sorted(runs.glob("*/pipeline.state.json"))
|
|
68
104
|
if not candidates:
|
|
69
105
|
return None
|
|
70
|
-
|
|
106
|
+
try:
|
|
107
|
+
data = loads_strict_json_object(candidates[-1].read_text())
|
|
108
|
+
except (ValueError, json.JSONDecodeError):
|
|
109
|
+
return None
|
|
110
|
+
return data
|
|
71
111
|
|
|
72
112
|
|
|
73
113
|
def archive_run_dir(work_dir_root: Path, run_id: str, fixture: str, arm: str) -> Path | None:
|
|
@@ -154,19 +194,24 @@ def find_results_dir_fixtures(results_dir: Path) -> list[str]:
|
|
|
154
194
|
|
|
155
195
|
|
|
156
196
|
def get_score(judge: dict, arm: str) -> int | None:
|
|
157
|
-
"""Score for a given arm
|
|
158
|
-
|
|
159
|
-
|
|
197
|
+
"""Score for a given arm, only when the arm is present in `_blind_mapping`.
|
|
198
|
+
|
|
199
|
+
`scores_by_arm` is accepted only as a decoded view of the blind A/B/C slots;
|
|
200
|
+
a score for an arm absent from the blind mapping is not score evidence.
|
|
201
|
+
"""
|
|
160
202
|
if not judge:
|
|
161
203
|
return None
|
|
162
|
-
|
|
163
|
-
if
|
|
164
|
-
return sba[arm]
|
|
165
|
-
mapping = judge.get("_blind_mapping") or {}
|
|
204
|
+
raw_mapping = judge.get("_blind_mapping")
|
|
205
|
+
mapping = raw_mapping if isinstance(raw_mapping, dict) else {}
|
|
166
206
|
letter = next((k for k, v in mapping.items() if v == arm), None)
|
|
167
207
|
if not letter:
|
|
168
208
|
return None
|
|
169
|
-
|
|
209
|
+
raw_scores = judge.get("scores_by_arm")
|
|
210
|
+
sba = raw_scores if isinstance(raw_scores, dict) else {}
|
|
211
|
+
if is_score(sba.get(arm)):
|
|
212
|
+
return sba[arm]
|
|
213
|
+
legacy = judge.get(f"{letter.lower()}_score")
|
|
214
|
+
return legacy if is_score(legacy) else None
|
|
170
215
|
|
|
171
216
|
|
|
172
217
|
def get_disqualifier(judge: dict, arm: str) -> bool:
|
|
@@ -174,15 +219,31 @@ def get_disqualifier(judge: dict, arm: str) -> bool:
|
|
|
174
219
|
line 314-323; fall back to blind A/B/C with case-correct letter."""
|
|
175
220
|
if not judge:
|
|
176
221
|
return False
|
|
177
|
-
|
|
178
|
-
if
|
|
179
|
-
return bool(dba[arm].get("disqualifier", False))
|
|
180
|
-
dqs = judge.get("disqualifiers") or {}
|
|
181
|
-
mapping = judge.get("_blind_mapping") or {}
|
|
222
|
+
raw_mapping = judge.get("_blind_mapping")
|
|
223
|
+
mapping = raw_mapping if isinstance(raw_mapping, dict) else {}
|
|
182
224
|
letter = next((k for k, v in mapping.items() if v == arm), None)
|
|
183
225
|
if not letter:
|
|
184
|
-
|
|
185
|
-
|
|
226
|
+
raw_scores = judge.get("scores_by_arm")
|
|
227
|
+
sba = raw_scores if isinstance(raw_scores, dict) else {}
|
|
228
|
+
raw_dba = judge.get("disqualifiers_by_arm")
|
|
229
|
+
if raw_dba is not None and not isinstance(raw_dba, dict):
|
|
230
|
+
return True
|
|
231
|
+
dba = raw_dba if isinstance(raw_dba, dict) else {}
|
|
232
|
+
return arm in sba or arm in dba
|
|
233
|
+
raw_dba = judge.get("disqualifiers_by_arm")
|
|
234
|
+
if raw_dba is not None and not isinstance(raw_dba, dict):
|
|
235
|
+
return True
|
|
236
|
+
dba = raw_dba if isinstance(raw_dba, dict) else {}
|
|
237
|
+
if arm in dba:
|
|
238
|
+
entry = dba[arm]
|
|
239
|
+
return bool_flag(
|
|
240
|
+
entry.get("disqualifier") if isinstance(entry, dict) else entry
|
|
241
|
+
)
|
|
242
|
+
raw_dqs = judge.get("disqualifiers")
|
|
243
|
+
if raw_dqs is not None and not isinstance(raw_dqs, dict):
|
|
244
|
+
return True
|
|
245
|
+
dqs = raw_dqs if isinstance(raw_dqs, dict) else {}
|
|
246
|
+
return bool_flag(dqs.get(letter))
|
|
186
247
|
|
|
187
248
|
|
|
188
249
|
def gate_2_no_regression(rows: list[dict]) -> dict:
|
|
@@ -268,9 +329,11 @@ def load_mechanical_findings(work_dir_root: Path, run_id: str, fixture: str, arm
|
|
|
268
329
|
if not ln:
|
|
269
330
|
continue
|
|
270
331
|
try:
|
|
271
|
-
|
|
332
|
+
parsed = json.loads(ln, parse_constant=reject_json_constant)
|
|
272
333
|
except json.JSONDecodeError:
|
|
273
334
|
continue
|
|
335
|
+
if isinstance(parsed, dict):
|
|
336
|
+
out.append(parsed)
|
|
274
337
|
return out
|
|
275
338
|
|
|
276
339
|
|
|
@@ -401,7 +464,7 @@ def gate_8_artifact_contract(rows: list[dict]) -> dict:
|
|
|
401
464
|
return {
|
|
402
465
|
"gate": "8-artifact-contract",
|
|
403
466
|
"status": "PASS" if not failures else "FAIL",
|
|
404
|
-
"rule": "pair_judge
|
|
467
|
+
"rule": "recognized pair_judge verdict when fired; pair findings distinguishable from solo",
|
|
405
468
|
"failures": failures,
|
|
406
469
|
}
|
|
407
470
|
|
|
@@ -418,12 +481,18 @@ def build_rows(results_dir: Path, work_dir_root: Path, run_id: str) -> list[dict
|
|
|
418
481
|
forced_state = load_state(work_dir_root, run_id, fx, "l2_forced")
|
|
419
482
|
|
|
420
483
|
def pair_judge_present(state: dict | None) -> bool:
|
|
421
|
-
if not state:
|
|
484
|
+
if not isinstance(state, dict):
|
|
485
|
+
return False
|
|
486
|
+
phases = state.get("phases")
|
|
487
|
+
if not isinstance(phases, dict):
|
|
488
|
+
return False
|
|
489
|
+
verify = phases.get("verify")
|
|
490
|
+
if not isinstance(verify, dict):
|
|
491
|
+
return False
|
|
492
|
+
sub = verify.get("sub_verdicts")
|
|
493
|
+
if not isinstance(sub, dict):
|
|
422
494
|
return False
|
|
423
|
-
|
|
424
|
-
verify = phases.get("verify") or {}
|
|
425
|
-
sub = verify.get("sub_verdicts") or {}
|
|
426
|
-
return sub.get("pair_judge") is not None
|
|
495
|
+
return is_pair_judge_verdict(sub.get("pair_judge"))
|
|
427
496
|
|
|
428
497
|
# Pair findings distinguishability — checked from archive of whichever
|
|
429
498
|
# arm fired pair-mode. l2_forced always fires (when present); l2_gated
|
|
@@ -444,10 +513,10 @@ def build_rows(results_dir: Path, work_dir_root: Path, run_id: str) -> list[dict
|
|
|
444
513
|
"solo_dq": get_disqualifier(judge, "solo_claude"),
|
|
445
514
|
"l2_gated_dq": get_disqualifier(judge, "l2_gated"),
|
|
446
515
|
"l2_forced_dq": get_disqualifier(judge, "l2_forced"),
|
|
447
|
-
"solo_wall": (solo_r
|
|
448
|
-
"l2_gated_wall": (gated_r
|
|
449
|
-
"solo_timeout":
|
|
450
|
-
"l2_gated_timeout":
|
|
516
|
+
"solo_wall": strict_elapsed_seconds(solo_r),
|
|
517
|
+
"l2_gated_wall": strict_elapsed_seconds(gated_r),
|
|
518
|
+
"solo_timeout": timeout_flag(solo_r),
|
|
519
|
+
"l2_gated_timeout": timeout_flag(gated_r),
|
|
451
520
|
"l2_gated_pair_judge_present": pair_judge_present(gated_state),
|
|
452
521
|
"l2_forced_pair_judge_present": pair_judge_present(forced_state),
|
|
453
522
|
"pair_fired": pair_judge_present(gated_state) or pair_judge_present(forced_state),
|
|
@@ -466,6 +535,57 @@ def build_rows(results_dir: Path, work_dir_root: Path, run_id: str) -> list[dict
|
|
|
466
535
|
return rows
|
|
467
536
|
|
|
468
537
|
|
|
538
|
+
def strict_elapsed_seconds(result: dict | None) -> float | int | None:
|
|
539
|
+
if not result:
|
|
540
|
+
return None
|
|
541
|
+
value = result.get("elapsed_seconds")
|
|
542
|
+
return value if is_strict_number(value) else None
|
|
543
|
+
|
|
544
|
+
|
|
545
|
+
def timeout_flag(result: dict | None) -> bool:
|
|
546
|
+
if not result:
|
|
547
|
+
return False
|
|
548
|
+
return bool_flag(result.get("timed_out"))
|
|
549
|
+
|
|
550
|
+
|
|
551
|
+
def validate_manifest(manifest: object) -> tuple[dict | None, str | None]:
|
|
552
|
+
if not isinstance(manifest, dict):
|
|
553
|
+
return None, "manifest malformed: expected object"
|
|
554
|
+
raw_eligible = manifest.get("fixtures_pair_eligible")
|
|
555
|
+
if not isinstance(raw_eligible, list) or not all(isinstance(fx, str) for fx in raw_eligible):
|
|
556
|
+
return None, "manifest malformed: fixtures_pair_eligible must be a string array"
|
|
557
|
+
if not raw_eligible:
|
|
558
|
+
return None, "manifest malformed: fixtures_pair_eligible must not be empty"
|
|
559
|
+
threshold = manifest.get("gate3_threshold_count")
|
|
560
|
+
total = manifest.get("gate3_total")
|
|
561
|
+
if not isinstance(threshold, int) or isinstance(threshold, bool) or threshold <= 0:
|
|
562
|
+
return None, "manifest malformed: gate3_threshold_count must be a positive integer"
|
|
563
|
+
if not isinstance(total, int) or isinstance(total, bool) or total <= 0:
|
|
564
|
+
return None, "manifest malformed: gate3_total must be a positive integer"
|
|
565
|
+
if total != len(raw_eligible):
|
|
566
|
+
return None, "manifest malformed: gate3_total must equal fixtures_pair_eligible length"
|
|
567
|
+
if threshold > total:
|
|
568
|
+
return None, "manifest malformed: gate3_threshold_count must be <= gate3_total"
|
|
569
|
+
rule = manifest.get("selection_rule")
|
|
570
|
+
if rule is not None:
|
|
571
|
+
if not isinstance(rule, dict):
|
|
572
|
+
return None, "manifest malformed: selection_rule must be an object"
|
|
573
|
+
rejected = rule.get("rejected_excluded")
|
|
574
|
+
reasons = rule.get("rejected_excluded_reasons")
|
|
575
|
+
if rejected is not None:
|
|
576
|
+
if not isinstance(rejected, list) or not all(isinstance(fx, str) for fx in rejected):
|
|
577
|
+
return None, "manifest malformed: selection_rule.rejected_excluded must be a string array"
|
|
578
|
+
if reasons is not None:
|
|
579
|
+
if (
|
|
580
|
+
not isinstance(reasons, dict)
|
|
581
|
+
or not all(isinstance(fx, str) and isinstance(reason, str) and reason for fx, reason in reasons.items())
|
|
582
|
+
):
|
|
583
|
+
return None, "manifest malformed: selection_rule.rejected_excluded_reasons must map fixture ids to non-empty strings"
|
|
584
|
+
if rejected is not None and set(reasons) != set(rejected):
|
|
585
|
+
return None, "manifest malformed: selection_rule.rejected_excluded_reasons keys must match rejected_excluded"
|
|
586
|
+
return manifest, None
|
|
587
|
+
|
|
588
|
+
|
|
469
589
|
def render_markdown(gates: list[dict], rows: list[dict]) -> str:
|
|
470
590
|
lines = ["# iter-0033c gate table\n"]
|
|
471
591
|
lines.append("| fixture | solo | l2_gated | Δ | l2_forced | l2g pair? | l2f pair? | wall_ratio |")
|
|
@@ -511,7 +631,18 @@ def main() -> int:
|
|
|
511
631
|
ap.add_argument("--out-md", required=True)
|
|
512
632
|
args = ap.parse_args()
|
|
513
633
|
|
|
514
|
-
|
|
634
|
+
try:
|
|
635
|
+
raw_manifest = json.loads(
|
|
636
|
+
Path(args.manifest).read_text(),
|
|
637
|
+
parse_constant=reject_json_constant,
|
|
638
|
+
)
|
|
639
|
+
except (ValueError, json.JSONDecodeError) as exc:
|
|
640
|
+
print(f"error: manifest malformed: invalid JSON: {exc}", file=sys.stderr)
|
|
641
|
+
return 2
|
|
642
|
+
manifest, manifest_error = validate_manifest(raw_manifest)
|
|
643
|
+
if manifest is None:
|
|
644
|
+
print(f"error: {manifest_error}", file=sys.stderr)
|
|
645
|
+
return 2
|
|
515
646
|
rows = build_rows(Path(args.results_dir), Path(args.work_dir_root), args.run_id)
|
|
516
647
|
|
|
517
648
|
gates = [
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Build iter-0033c L1 rerun summary from per-fixture judge/result artifacts."""
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import json
|
|
7
|
+
import sys
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
SCRIPT_DIR = Path(__file__).resolve().parent
|
|
12
|
+
if str(SCRIPT_DIR) not in sys.path:
|
|
13
|
+
sys.path.insert(0, str(SCRIPT_DIR))
|
|
14
|
+
|
|
15
|
+
from pair_evidence_contract import is_score, is_strict_number, loads_strict_json_object
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Arm names that build_summary looks up for every fixture, in this order.
SCORE_ARMS = ("solo_claude", "l2_gated", "l2_forced", "bare")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def load_json(path: Path) -> dict[str, Any]:
    """Read *path* as UTF-8 and parse it with ``loads_strict_json_object``.

    Best-effort loader: any ``ValueError`` raised while reading or parsing
    (this includes ``json.JSONDecodeError`` and ``UnicodeDecodeError``) is
    swallowed and an empty dict is returned instead, so callers can treat an
    unreadable artifact like a missing one. Other errors (e.g. ``OSError``)
    still propagate.
    """
    try:
        return loads_strict_json_object(path.read_text(encoding="utf8"))
    except (ValueError, json.JSONDecodeError):
        return {}
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def score_for(judge: dict[str, Any], arm: str, mapping: dict[str, Any]) -> int | None:
    """Return the judge's score for *arm*, or ``None`` when none is usable.

    *mapping* is the blind mapping from slot letter to arm name. Only the
    slots ``"A"``, ``"B"`` and ``"C"`` are considered; if *arm* is not mapped
    to one of them the result is ``None``.

    The score is taken from ``judge["scores_by_arm"][arm]`` when that value
    passes ``is_score``; otherwise the legacy per-slot key
    ``"<slot letter lowercased>_score"`` is tried under the same check.
    """
    slot_letter = None
    for slot, mapped_arm in mapping.items():
        if slot in {"A", "B", "C"} and mapped_arm == arm:
            slot_letter = slot
            break
    if slot_letter is None:
        return None

    by_arm = judge.get("scores_by_arm")
    # A non-dict scores_by_arm is treated the same as an absent one.
    candidate = by_arm.get(arm) if isinstance(by_arm, dict) else None
    if is_score(candidate):
        return candidate

    # Fall back to the legacy layout keyed by blind slot letter.
    legacy_value = judge.get(f"{slot_letter.lower()}_score")
    if is_score(legacy_value):
        return legacy_value
    return None
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def strict_number(value: object) -> object:
    """Pass *value* through if ``is_strict_number`` accepts it, else ``None``."""
    if is_strict_number(value):
        return value
    return None
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def build_summary(results_dir: Path, run_id: str, git_sha: str) -> dict[str, Any]:
    """Build the rerun summary from per-fixture judge/result artifacts.

    Every sub-directory of *results_dir* that contains a ``judge.json`` file
    contributes one row, in sorted path order. For each arm in ``SCORE_ARMS``
    the row records the judge score (via ``score_for``) together with selected
    fields of ``<fixture>/<arm>/result.json`` when that file exists. An arm is
    left out of the row only when it has no score *and* does not appear among
    the values of the judge's ``_blind_mapping``.

    Args:
        results_dir: Directory holding one sub-directory per fixture.
        run_id: Copied verbatim into the summary.
        git_sha: Copied verbatim into the summary.

    Returns:
        A dict with ``run_id``, ``git_sha``, ``fixtures_total`` (number of
        rows) and ``rows`` (list of ``{"fixture": name, "arms": {...}}``).
    """
    rows = []
    for fx_dir in sorted(p for p in results_dir.iterdir() if p.is_dir()):
        judge_path = fx_dir / "judge.json"
        if not judge_path.is_file():
            continue
        judge = load_json(judge_path)
        raw_mapping = judge.get("_blind_mapping")
        mapping = raw_mapping if isinstance(raw_mapping, dict) else {}
        # Membership is tested by equality on a list rather than through a
        # set: the mapping comes from JSON, so a malformed file may carry
        # unhashable values (lists/objects) that would make set() raise.
        mapped_values = list(mapping.values())
        arms = {}
        for arm_name in SCORE_ARMS:
            score = score_for(judge, arm_name, mapping)
            if score is None and arm_name not in mapped_values:
                continue
            result_path = fx_dir / arm_name / "result.json"
            # A missing result file still yields an entry, with None fields.
            result = load_json(result_path) if result_path.is_file() else {}
            arms[arm_name] = {
                "score": score,
                "wall_s": strict_number(result.get("elapsed_seconds")),
                "verify_score": strict_number(result.get("verify_score")),
                "files_changed": result.get("files_changed"),
                "timed_out": result.get("timed_out"),
                "disqualifier": result.get("disqualifier"),
            }
        rows.append({"fixture": fx_dir.name, "arms": arms})
    return {
        "run_id": run_id,
        "git_sha": git_sha,
        "fixtures_total": len(rows),
        "rows": rows,
    }
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def main() -> int:
    """Command-line entry point: build the summary and write it as JSON.

    Requires ``--results-dir``, ``--out``, ``--run-id`` and ``--git-sha``.
    The summary is written to ``--out`` as 2-space-indented JSON with a
    trailing newline, a one-line confirmation is printed, and 0 is returned.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--results-dir", required=True, type=Path)
    cli.add_argument("--out", required=True, type=Path)
    cli.add_argument("--run-id", required=True)
    cli.add_argument("--git-sha", required=True)
    opts = cli.parse_args()

    summary = build_summary(opts.results_dir, opts.run_id, opts.git_sha)
    payload = json.dumps(summary, indent=2) + "\n"
    opts.out.write_text(payload, encoding="utf8")

    fixture_count = summary["fixtures_total"]
    print(f"[l1-rerun-summary] wrote {opts.out} (fixtures={fixture_count})")
    return 0
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# Script entry point: exit with the status code returned by main().
if __name__ == "__main__":
    sys.exit(main())
|