devlyn-cli 2.2.2 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +2 -2
- package/CLAUDE.md +4 -4
- package/README.md +85 -34
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
- package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
- package/benchmark/auto-resolve/README.md +307 -44
- package/benchmark/auto-resolve/RUBRIC.md +23 -14
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
- package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
- package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
- package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
- package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
- package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
- package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
- package/benchmark/auto-resolve/scripts/judge.sh +153 -26
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
- package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
- package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
- package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
- package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
- package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
- package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
- package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
- package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
- package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
- package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
- package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
- package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
- package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
- package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
- package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
- package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
- package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
- package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
- package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
- package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
- package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
- package/bin/devlyn.js +221 -17
- package/config/skills/_shared/adapters/README.md +3 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
- package/config/skills/_shared/adapters/opus-4-7.md +9 -1
- package/config/skills/_shared/archive_run.py +78 -6
- package/config/skills/_shared/codex-config.md +5 -4
- package/config/skills/_shared/codex-monitored.sh +46 -1
- package/config/skills/_shared/collect-codex-findings.py +20 -5
- package/config/skills/_shared/engine-preflight.md +17 -13
- package/config/skills/_shared/runtime-principles.md +6 -9
- package/config/skills/_shared/spec-verify-check.py +2664 -107
- package/config/skills/_shared/verify-merge-findings.py +1369 -19
- package/config/skills/devlyn:design-ui/SKILL.md +364 -0
- package/config/skills/devlyn:ideate/SKILL.md +7 -4
- package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
- package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
- package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
- package/config/skills/devlyn:resolve/SKILL.md +78 -26
- package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
- package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
- package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
- package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
- package/package.json +47 -2
- package/scripts/lint-fixtures.sh +349 -0
- package/scripts/lint-shadow-fixtures.sh +58 -0
- package/scripts/lint-skills.sh +3645 -95
|
@@ -28,18 +28,27 @@ RESUME=0
|
|
|
28
28
|
LIMIT=""
|
|
29
29
|
INSTANCE_IDS=()
|
|
30
30
|
|
|
31
|
+
# require_value FLAG [VALUE]
#
# Guard for options that take an argument. Aborts the script with status 1
# and prints "<FLAG> requires a value" on stderr when VALUE is missing,
# empty, or begins with "--" (i.e. looks like the next option rather than an
# argument). Otherwise does nothing and returns success.
require_value() {
  local opt_name="$1"
  # Default to empty so the check also works when the flag is the last argument.
  local candidate="${2:-}"
  case "$candidate" in
    ''|--*)
      echo "$opt_name requires a value" >&2
      exit 1
      ;;
  esac
}
|
|
39
|
+
|
|
31
40
|
while [ $# -gt 0 ]; do
|
|
32
41
|
case "$1" in
|
|
33
|
-
--instances-jsonl) INSTANCES_JSONL="$2"; shift 2;;
|
|
34
|
-
--predictions-out) PREDICTIONS_OUT="$2"; shift 2;;
|
|
35
|
-
--model-name) MODEL_NAME="$2"; shift 2;;
|
|
36
|
-
--repos-root) REPOS_ROOT="$2"; shift 2;;
|
|
37
|
-
--worktrees-root) WORKTREES_ROOT="$2"; shift 2;;
|
|
38
|
-
--timeout-seconds) TIMEOUT_SECONDS="$2"; shift 2;;
|
|
42
|
+
--instances-jsonl) require_value "$1" "${2:-}"; INSTANCES_JSONL="$2"; shift 2;;
|
|
43
|
+
--predictions-out) require_value "$1" "${2:-}"; PREDICTIONS_OUT="$2"; shift 2;;
|
|
44
|
+
--model-name) require_value "$1" "${2:-}"; MODEL_NAME="$2"; shift 2;;
|
|
45
|
+
--repos-root) require_value "$1" "${2:-}"; REPOS_ROOT="$2"; shift 2;;
|
|
46
|
+
--worktrees-root) require_value "$1" "${2:-}"; WORKTREES_ROOT="$2"; shift 2;;
|
|
47
|
+
--timeout-seconds) require_value "$1" "${2:-}"; TIMEOUT_SECONDS="$2"; shift 2;;
|
|
39
48
|
--copy-devlyn-context) COPY_DEVLYN_CONTEXT=1; shift;;
|
|
40
49
|
--resume) RESUME=1; shift;;
|
|
41
|
-
--limit) LIMIT="$2"; shift 2;;
|
|
42
|
-
--instance-id) INSTANCE_IDS+=("$2"); shift 2;;
|
|
50
|
+
--limit) require_value "$1" "${2:-}"; LIMIT="$2"; shift 2;;
|
|
51
|
+
--instance-id) require_value "$1" "${2:-}"; INSTANCE_IDS+=("$2"); shift 2;;
|
|
43
52
|
-h|--help) usage 0;;
|
|
44
53
|
*) echo "unknown arg: $1" >&2; usage 1;;
|
|
45
54
|
esac
|
|
@@ -62,22 +71,31 @@ TMP_IDS="$(mktemp)"
|
|
|
62
71
|
TMP_SELECTED_INSTANCES="$(mktemp)"
|
|
63
72
|
trap 'rm -f "$TMP_IDS" "$TMP_SELECTED_INSTANCES"' EXIT
|
|
64
73
|
|
|
65
|
-
|
|
74
|
+
selection_args=("$INSTANCES_JSONL" "$TMP_SELECTED_INSTANCES" "$LIMIT")
|
|
75
|
+
if [ "${#INSTANCE_IDS[@]}" -gt 0 ]; then
|
|
76
|
+
selection_args+=("${INSTANCE_IDS[@]}")
|
|
77
|
+
fi
|
|
78
|
+
|
|
79
|
+
python3 - "$SCRIPT_DIR" "${selection_args[@]}" > "$TMP_IDS" <<'PY'
|
|
66
80
|
import json
|
|
67
81
|
import sys
|
|
68
82
|
from pathlib import Path
|
|
83
|
+
sys.path.insert(0, sys.argv[1])
|
|
84
|
+
from pair_evidence_contract import reject_json_constant
|
|
69
85
|
|
|
70
|
-
instances_path = Path(sys.argv[
|
|
71
|
-
selected_path = Path(sys.argv[
|
|
72
|
-
limit = int(sys.argv[
|
|
73
|
-
requested = sys.argv[
|
|
86
|
+
instances_path = Path(sys.argv[2])
|
|
87
|
+
selected_path = Path(sys.argv[3])
|
|
88
|
+
limit = int(sys.argv[4]) if sys.argv[4] else None
|
|
89
|
+
requested = sys.argv[5:]
|
|
74
90
|
requested_set = set(requested)
|
|
75
91
|
rows = []
|
|
76
92
|
with instances_path.open(encoding="utf8") as f:
|
|
77
93
|
for line_no, line in enumerate(f, start=1):
|
|
78
94
|
if not line.strip():
|
|
79
95
|
continue
|
|
80
|
-
row = json.loads(line)
|
|
96
|
+
row = json.loads(line, parse_constant=reject_json_constant)
|
|
97
|
+
if not isinstance(row, dict):
|
|
98
|
+
raise SystemExit(f"{instances_path}:{line_no}: expected JSON object")
|
|
81
99
|
instance_id = row.get("instance_id")
|
|
82
100
|
if not isinstance(instance_id, str) or not instance_id:
|
|
83
101
|
raise SystemExit(f"{instances_path}:{line_no}: missing instance_id")
|
|
@@ -11,6 +11,83 @@ Exits 0 on PASS, 1 on FAIL.
|
|
|
11
11
|
from __future__ import annotations
|
|
12
12
|
import argparse, json, pathlib, sys, shutil, datetime
|
|
13
13
|
|
|
14
|
+
SCRIPT_DIR = pathlib.Path(__file__).resolve().parent
|
|
15
|
+
if str(SCRIPT_DIR) not in sys.path:
|
|
16
|
+
sys.path.insert(0, str(SCRIPT_DIR))
|
|
17
|
+
|
|
18
|
+
from pair_evidence_contract import reject_json_constant
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def load_dict_json(path: pathlib.Path) -> tuple[dict | None, str | None]:
    """Read *path* and return its content as a JSON object.

    Returns:
        ``(data, None)`` when the file decodes to a JSON object, otherwise
        ``(None, reason)`` where *reason* is a short human-readable string:
        ``"unreadable file"``, ``"invalid JSON"`` or ``"expected object"``.

    The function never raises for I/O or decoding problems, so callers can
    treat every failure mode uniformly as "no usable data".
    """
    try:
        # JSON interchange text is UTF-8; do not depend on the locale default.
        text = path.read_text(encoding="utf-8")
        # reject_json_constant comes from pair_evidence_contract; presumably it
        # raises ValueError for NaN/Infinity constants — confirm in that module.
        data = json.loads(text, parse_constant=reject_json_constant)
    except OSError:
        # Missing/unreadable file (e.g. removed or permission-denied after the
        # caller's exists() check) is reported rather than crashing the gate.
        return None, "unreadable file"
    except ValueError:
        # Covers json.JSONDecodeError and UnicodeDecodeError (both subclasses).
        return None, "invalid JSON"
    if not isinstance(data, dict):
        return None, "expected object"
    return data, None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def object_or_empty(value) -> dict:
    """Return *value* unchanged when it is a dict, otherwise a new empty dict."""
    if isinstance(value, dict):
        return value
    return {}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def rows_from_summary(summary: dict, failures: list[str]) -> list[dict]:
    """Extract the per-fixture row objects from *summary*.

    Args:
        summary: Parsed summary mapping; its ``"rows"`` entry is inspected.
        failures: Mutable list that receives a message when ``"rows"`` is not
            a list, or when the list holds entries that are not dicts.

    Returns:
        The dict entries of ``summary["rows"]`` in their original order, or an
        empty list when ``"rows"`` is absent or not a list.
    """
    candidates = summary.get("rows")
    if not isinstance(candidates, list):
        failures.append("summary rows missing or malformed — measurement invalid")
        return []

    kept: list[dict] = []
    saw_non_object = False
    for entry in candidates:
        if isinstance(entry, dict):
            kept.append(entry)
        else:
            saw_non_object = True

    # Report once, regardless of how many bad entries were dropped.
    if saw_non_object:
        failures.append("summary rows contain non-object entries — measurement invalid")
    return kept
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def int_or_none(value) -> int | None:
    """Return *value* when it is a genuine int; ``None`` for bools and all other types."""
    # bool is a subclass of int, so it has to be excluded explicitly.
    if isinstance(value, bool) or not isinstance(value, int):
        return None
    return value
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def number_or_none(value) -> int | float | None:
    """Return *value* when it is a usable number, otherwise ``None``.

    A usable number is an ``int`` or a finite ``float``. Bools are rejected
    (``bool`` subclasses ``int``), and so are ``inf``/``-inf``/``nan``.

    Non-finite floats must be rejected here because an overflowing JSON
    literal such as ``1e999`` is converted by the float parser into ``inf``
    without ever going through ``parse_constant``; an infinite margin would
    otherwise compare as "not below the floor" and slip past threshold checks.
    """
    import math  # local import: only this helper needs it

    if isinstance(value, bool):
        return None
    if isinstance(value, float):
        return value if math.isfinite(value) else None
    # Plain ints are always finite; arbitrarily large values are kept as-is.
    return value if isinstance(value, int) else None
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def bool_or_none(value) -> bool | None:
    """Return *value* when it is exactly a bool; ``None`` for every other type."""
    if not isinstance(value, bool):
        return None
    return value
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def axis_invalid_count(rows: list[dict], arm: str, failures: list[str]) -> int:
    """Count rows whose *arm* payload reports out-of-range axis cells.

    For each row, reads ``row["arms"][arm]["_axis_validation_out_of_range_count"]``
    (missing containers and a missing key are treated as a count of 0).

    Args:
        rows: Row dicts to inspect.
        arm: Name of the arm entry to look up under each row's ``"arms"``.
        failures: Mutable list; receives one message per row whose count is
            present but not a number.

    Returns:
        The number of rows with a positive count — rows, not cells.
    """
    flagged = 0
    for entry in rows:
        arm_payload = object_or_empty(object_or_empty(entry.get("arms")).get(arm))
        parsed = number_or_none(arm_payload.get("_axis_validation_out_of_range_count", 0))
        if parsed is None:
            failures.append(f"{arm} axis count malformed — measurement invalid")
            continue
        if parsed > 0:
            flagged += 1
    return flagged
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def unmapped_axis_invalid_count(rows: list[dict], failures: list[str]) -> int:
    """Count rows that report out-of-range axis cells not attributed to an arm.

    Reads the row-level ``"_axis_validation_unmapped_out_of_range_count"``
    value (a missing key is treated as 0).

    Args:
        rows: Row dicts to inspect.
        failures: Mutable list; receives one message per row whose count is
            present but not a number.

    Returns:
        The number of rows with a positive count — rows, not cells.
    """
    flagged = 0
    for entry in rows:
        parsed = number_or_none(entry.get("_axis_validation_unmapped_out_of_range_count", 0))
        if parsed is None:
            failures.append("unmapped axis count malformed — measurement invalid")
            continue
        if parsed > 0:
            flagged += 1
    return flagged
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def is_known_limit(row: dict) -> bool:
    """Tell whether *row* is categorised as ``edge`` or ``known-limit``.

    The ``"category"`` value is compared case-insensitively; a missing or
    non-string category never matches.
    """
    label = row.get("category")
    if not isinstance(label, str):
        return False
    return label.lower() in ("edge", "known-limit")
|
|
90
|
+
|
|
14
91
|
|
|
15
92
|
def main() -> int:
|
|
16
93
|
p = argparse.ArgumentParser()
|
|
@@ -25,68 +102,134 @@ def main() -> int:
|
|
|
25
102
|
summary_p = root / "results" / args.run_id / "summary.json"
|
|
26
103
|
if not summary_p.exists():
|
|
27
104
|
print(f"no summary at {summary_p}", file=sys.stderr); return 1
|
|
28
|
-
summary =
|
|
105
|
+
summary, summary_error = load_dict_json(summary_p)
|
|
106
|
+
if summary is None:
|
|
107
|
+
print(f"measurement invalid: malformed summary.json ({summary_error})", file=sys.stderr)
|
|
108
|
+
return 1
|
|
29
109
|
|
|
30
110
|
baseline_p = root / "history" / "baselines" / "shipped.json"
|
|
31
111
|
baseline = None
|
|
32
112
|
if baseline_p.exists():
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
except Exception:
|
|
113
|
+
baseline, _ = load_dict_json(baseline_p)
|
|
114
|
+
if baseline is None:
|
|
36
115
|
baseline = None
|
|
37
116
|
|
|
38
117
|
failures: list[str] = []
|
|
39
118
|
warnings: list[str] = []
|
|
119
|
+
rows = rows_from_summary(summary, failures)
|
|
40
120
|
|
|
41
121
|
# Hard floor 1: no disqualifier in variant
|
|
42
|
-
|
|
43
|
-
|
|
122
|
+
hard_floor_violations = int_or_none(summary.get("hard_floor_violations"))
|
|
123
|
+
if hard_floor_violations is None:
|
|
124
|
+
failures.append("summary hard_floor_violations missing or malformed — measurement invalid")
|
|
125
|
+
elif hard_floor_violations > 0:
|
|
126
|
+
failures.append(f"{hard_floor_violations} variant disqualifier(s) — see report")
|
|
127
|
+
variant_axis_invalid = axis_invalid_count(rows, "variant", failures)
|
|
128
|
+
if variant_axis_invalid > 0:
|
|
129
|
+
failures.append(
|
|
130
|
+
f"variant axis-invalid: {variant_axis_invalid} fixture(s) have out-of-range axis cells — "
|
|
131
|
+
"re-judge before trusting L2 margins"
|
|
132
|
+
)
|
|
133
|
+
bare_axis_invalid = axis_invalid_count(rows, "bare", failures)
|
|
134
|
+
if bare_axis_invalid > 0:
|
|
135
|
+
failures.append(
|
|
136
|
+
f"bare axis-invalid: {bare_axis_invalid} fixture(s) have out-of-range axis cells — "
|
|
137
|
+
"re-judge before trusting margins"
|
|
138
|
+
)
|
|
139
|
+
unmapped_axis_invalid = unmapped_axis_invalid_count(rows, failures)
|
|
140
|
+
if unmapped_axis_invalid > 0:
|
|
141
|
+
failures.append(
|
|
142
|
+
f"judge axis-invalid unmapped: {unmapped_axis_invalid} fixture(s) have out-of-range axis cells "
|
|
143
|
+
"that could not be mapped to an arm — re-judge before trusting margins"
|
|
144
|
+
)
|
|
44
145
|
|
|
45
146
|
# Hard floor 2: F9 must pass (skipped during bootstrap via --accept-missing)
|
|
46
147
|
# Variant arm legacy gate kept for L2 baseline comparability.
|
|
47
148
|
# iter-0033a (2026-04-30): renamed F9 dir from -to-preflight to -to-resolve to
|
|
48
149
|
# match the shipped 2-skill contract (no preflight). The OLD pre-rename id
|
|
49
150
|
# is preserved in fixtures/retired/ for replay.
|
|
50
|
-
f9_row = next((r for r in
|
|
151
|
+
f9_row = next((r for r in rows if r.get("fixture") == "F9-e2e-ideate-to-resolve"), None)
|
|
51
152
|
if f9_row is None:
|
|
52
153
|
if not args.accept_missing:
|
|
53
154
|
failures.append("F9 (E2E novice flow) missing — add fixture or run with --accept-missing")
|
|
54
155
|
else:
|
|
55
|
-
|
|
156
|
+
f9_margin = number_or_none(f9_row.get("margin"))
|
|
157
|
+
if f9_margin is None:
|
|
158
|
+
failures.append("F9 (E2E novice flow) margin missing or malformed — measurement invalid")
|
|
159
|
+
elif f9_margin < 5:
|
|
56
160
|
failures.append("F9 (E2E novice flow) must have variant margin ≥ +5")
|
|
57
161
|
|
|
58
|
-
|
|
162
|
+
for row in rows:
|
|
163
|
+
if not is_known_limit(row):
|
|
164
|
+
continue
|
|
165
|
+
margin = number_or_none(row.get("margin"))
|
|
166
|
+
if margin is not None and (margin < -3 or margin > 3):
|
|
167
|
+
warnings.append(
|
|
168
|
+
f"{row.get('fixture')} known-limit margin {margin:+g} outside expected [-3,+3] range"
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
# Hard floor 3: at least 7 gated fixtures with margin ≥ +5
|
|
59
172
|
# (skipped during bootstrap via --accept-missing)
|
|
60
|
-
|
|
173
|
+
gated_fixtures = int_or_none(summary.get("gated_fixtures"))
|
|
174
|
+
margin_ge_5_count = int_or_none(summary.get("margin_ge_5_count"))
|
|
175
|
+
if gated_fixtures is None or margin_ge_5_count is None:
|
|
176
|
+
failures.append("summary gated fixture counts missing or malformed — measurement invalid")
|
|
177
|
+
elif gated_fixtures > 0 and margin_ge_5_count < 7:
|
|
61
178
|
if not args.accept_missing:
|
|
62
179
|
failures.append(
|
|
63
|
-
f"only {
|
|
180
|
+
f"only {margin_ge_5_count} of {gated_fixtures} "
|
|
64
181
|
f"gated fixtures have variant margin ≥ +5 (need ≥ 7)"
|
|
65
182
|
)
|
|
66
183
|
|
|
67
184
|
# iter-0023 — L1 (solo_claude) gates per NORTH-STAR.md ops test #1.
|
|
68
185
|
# Codex R1 (this iter) caught that ship-gate enforced only legacy L2
|
|
69
186
|
# `variant` margin and never read `solo_over_bare`. Now NORTH-STAR's
|
|
70
|
-
# documented L1 floor (≥ +5
|
|
187
|
+
# documented L1 floor (≥ +5 on at least 7 gated fixtures, F9 ≥ +5, no L1
|
|
71
188
|
# disqualifier) is mechanically enforced.
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
189
|
+
raw_arms_present = summary.get("arms_present")
|
|
190
|
+
if raw_arms_present is not None and not isinstance(raw_arms_present, dict):
|
|
191
|
+
failures.append("summary arms_present malformed — measurement invalid")
|
|
192
|
+
arms_present = object_or_empty(raw_arms_present)
|
|
193
|
+
raw_margins_avg = summary.get("margins_avg")
|
|
194
|
+
margins_avg = object_or_empty(raw_margins_avg)
|
|
195
|
+
raw_solo_present = arms_present.get("solo_claude")
|
|
196
|
+
solo_present = bool_or_none(raw_solo_present)
|
|
197
|
+
if raw_solo_present is not None and solo_present is None:
|
|
198
|
+
failures.append("summary arms_present.solo_claude malformed — measurement invalid")
|
|
199
|
+
if solo_present is True:
|
|
200
|
+
if raw_margins_avg is not None and not isinstance(raw_margins_avg, dict):
|
|
201
|
+
failures.append("summary margins_avg malformed — measurement invalid")
|
|
202
|
+
l1_dq_by_fixture: dict[str, bool] = {}
|
|
203
|
+
for r in rows:
|
|
204
|
+
fixture = str(r.get("fixture"))
|
|
205
|
+
l1 = object_or_empty(object_or_empty(r.get("arms")).get("solo_claude"))
|
|
206
|
+
raw_l1_dq = l1.get("disqualifier")
|
|
207
|
+
parsed_l1_dq = bool_or_none(raw_l1_dq)
|
|
208
|
+
if raw_l1_dq is not None and parsed_l1_dq is None:
|
|
209
|
+
failures.append(f"{fixture} L1 disqualifier malformed — measurement invalid")
|
|
210
|
+
l1_dq_by_fixture[fixture] = True
|
|
211
|
+
else:
|
|
212
|
+
l1_dq_by_fixture[fixture] = parsed_l1_dq is True
|
|
213
|
+
|
|
75
214
|
l1_avg = margins_avg.get("solo_over_bare")
|
|
76
|
-
if l1_avg is not None and l1_avg
|
|
215
|
+
if l1_avg is not None and number_or_none(l1_avg) is None:
|
|
216
|
+
failures.append("L1 (solo_over_bare) suite avg malformed — measurement invalid")
|
|
217
|
+
elif l1_avg is not None and l1_avg < 5:
|
|
77
218
|
warnings.append(
|
|
78
219
|
f"L1 (solo_over_bare) suite avg {l1_avg:+.1f} below NORTH-STAR floor +5 "
|
|
79
220
|
"(reporting only — per-fixture L1 gates below are decisive)"
|
|
80
221
|
)
|
|
81
222
|
# F9 L1 floor
|
|
82
223
|
if f9_row is not None:
|
|
83
|
-
f9_l1 = (f9_row.get("margins")
|
|
224
|
+
f9_l1 = object_or_empty(f9_row.get("margins")).get("solo_over_bare")
|
|
84
225
|
if f9_l1 is None:
|
|
85
226
|
if not args.accept_missing:
|
|
86
227
|
failures.append("F9 L1 (solo_over_bare) margin missing — measurement invalid")
|
|
228
|
+
elif number_or_none(f9_l1) is None:
|
|
229
|
+
failures.append("F9 L1 (solo_over_bare) margin malformed — measurement invalid")
|
|
87
230
|
elif f9_l1 < 5:
|
|
88
|
-
failures.append(f"F9 L1 (solo_over_bare) margin {f9_l1:+
|
|
89
|
-
# 7-
|
|
231
|
+
failures.append(f"F9 L1 (solo_over_bare) margin {f9_l1:+g} < +5 floor")
|
|
232
|
+
# 7-fixture L1 floor — headroom-aware (added 2026-05-02 per iter-0033 R4
|
|
90
233
|
# Codex collab + NORTH-STAR amendment + RUBRIC hard-floor 3 update).
|
|
91
234
|
# A fixture is excluded from the denominator when 100 - L0_score < 5
|
|
92
235
|
# AND L1_score >= 95 AND the L1 arm has no disqualifier / CRITICAL-HIGH
|
|
@@ -96,25 +239,26 @@ def main() -> int:
|
|
|
96
239
|
l1_ge_5 = 0
|
|
97
240
|
l1_gated = 0
|
|
98
241
|
l1_excluded_headroom = []
|
|
99
|
-
for r in
|
|
100
|
-
if (r
|
|
242
|
+
for r in rows:
|
|
243
|
+
if is_known_limit(r):
|
|
101
244
|
continue
|
|
102
|
-
arms = r.get("arms")
|
|
103
|
-
l0 = arms.get("bare")
|
|
104
|
-
l1 = arms.get("solo_claude")
|
|
105
|
-
l0_score = l0.get("score")
|
|
106
|
-
l1_score = l1.get("score")
|
|
107
|
-
m = (r.get("margins")
|
|
245
|
+
arms = object_or_empty(r.get("arms"))
|
|
246
|
+
l0 = object_or_empty(arms.get("bare"))
|
|
247
|
+
l1 = object_or_empty(arms.get("solo_claude"))
|
|
248
|
+
l0_score = number_or_none(l0.get("score"))
|
|
249
|
+
l1_score = number_or_none(l1.get("score"))
|
|
250
|
+
m = number_or_none(object_or_empty(r.get("margins")).get("solo_over_bare"))
|
|
108
251
|
if m is None:
|
|
109
252
|
continue
|
|
110
253
|
# Headroom carve-out — must satisfy ALL conditions:
|
|
111
254
|
# (a) bare ceiling-near (100 - L0 < 5)
|
|
112
255
|
# (b) L1 also ceiling-near (>=95)
|
|
113
256
|
# (c) L1 arm clean (no disqualifier, no axis-invalid, fix-loop didn't fail)
|
|
114
|
-
l1_dq_here =
|
|
115
|
-
|
|
257
|
+
l1_dq_here = l1_dq_by_fixture.get(str(r.get("fixture")), False)
|
|
258
|
+
l1_axis_count = number_or_none(l1.get("_axis_validation_out_of_range_count", 0))
|
|
259
|
+
l1_axis_inv = bool(l1_axis_count is not None and l1_axis_count > 0)
|
|
116
260
|
if (
|
|
117
|
-
|
|
261
|
+
l0_score is not None and l1_score is not None
|
|
118
262
|
and (100 - l0_score) < 5 and l1_score >= 95
|
|
119
263
|
and not l1_dq_here and not l1_axis_inv
|
|
120
264
|
):
|
|
@@ -136,14 +280,14 @@ def main() -> int:
|
|
|
136
280
|
warnings.append(
|
|
137
281
|
"L1 headroom-excluded (saturation candidates per RUBRIC two-shipped-version rule): "
|
|
138
282
|
+ ", ".join(
|
|
139
|
-
f"{x['fixture']} (L0={x['l0_score']} L1={x['l1_score']} margin={x['margin']:+
|
|
283
|
+
f"{x['fixture']} (L0={x['l0_score']} L1={x['l1_score']} margin={x['margin']:+g})"
|
|
140
284
|
for x in l1_excluded_headroom
|
|
141
285
|
)
|
|
142
286
|
)
|
|
143
287
|
# L1 disqualifier floor
|
|
144
288
|
l1_dq = sum(
|
|
145
|
-
1 for r in
|
|
146
|
-
if ((r.get("
|
|
289
|
+
1 for r in rows
|
|
290
|
+
if l1_dq_by_fixture.get(str(r.get("fixture")), False)
|
|
147
291
|
)
|
|
148
292
|
if l1_dq > 0:
|
|
149
293
|
failures.append(f"L1 disqualifier(s): {l1_dq} solo_claude arm(s) hit a disqualifier")
|
|
@@ -151,10 +295,13 @@ def main() -> int:
|
|
|
151
295
|
# `_axis_validation` per fixture). If any L1 row has invalid axis data,
|
|
152
296
|
# the L1 score for that row is not trustworthy.
|
|
153
297
|
l1_axis_invalid = 0
|
|
154
|
-
for r in
|
|
155
|
-
av = (r.get("arms")
|
|
298
|
+
for r in rows:
|
|
299
|
+
av = object_or_empty(object_or_empty(r.get("arms")).get("solo_claude"))
|
|
156
300
|
inv = av.get("_axis_validation_out_of_range_count")
|
|
157
|
-
|
|
301
|
+
count = number_or_none(inv)
|
|
302
|
+
if inv is not None and count is None:
|
|
303
|
+
failures.append("L1 axis count malformed — measurement invalid")
|
|
304
|
+
elif count is not None and count > 0:
|
|
158
305
|
l1_axis_invalid += 1
|
|
159
306
|
if l1_axis_invalid > 0:
|
|
160
307
|
failures.append(
|
|
@@ -164,31 +311,53 @@ def main() -> int:
|
|
|
164
311
|
|
|
165
312
|
# Hard floor 4: no per-fixture regression worse than −5 vs shipped baseline
|
|
166
313
|
if baseline:
|
|
167
|
-
prev_rows = {
|
|
168
|
-
|
|
314
|
+
prev_rows = {
|
|
315
|
+
r["fixture"]: r for r in baseline.get("rows", [])
|
|
316
|
+
if isinstance(r, dict) and isinstance(r.get("fixture"), str)
|
|
317
|
+
}
|
|
318
|
+
for r in rows:
|
|
319
|
+
if is_known_limit(r):
|
|
320
|
+
continue
|
|
169
321
|
fid = r.get("fixture")
|
|
170
322
|
prev = prev_rows.get(fid)
|
|
171
|
-
|
|
172
|
-
|
|
323
|
+
current_score = number_or_none(r.get("variant_score"))
|
|
324
|
+
previous_score = number_or_none(prev.get("variant_score")) if prev else None
|
|
325
|
+
if prev and current_score is not None and previous_score is not None:
|
|
326
|
+
delta = current_score - previous_score
|
|
173
327
|
if delta < -5:
|
|
174
|
-
failures.append(f"{fid} regressed {delta:+
|
|
328
|
+
failures.append(f"{fid} regressed {delta:+g} vs shipped (floor: −5)")
|
|
175
329
|
|
|
176
330
|
# Soft gate: suite average margin drop > 3
|
|
177
331
|
if baseline:
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
332
|
+
current_margin_avg = number_or_none(summary.get("margin_avg"))
|
|
333
|
+
baseline_margin_avg = number_or_none(baseline.get("margin_avg"))
|
|
334
|
+
if current_margin_avg is None:
|
|
335
|
+
failures.append("suite margin missing — measurement invalid")
|
|
336
|
+
elif baseline_margin_avg is None:
|
|
337
|
+
warnings.append("shipped baseline margin malformed; skipping suite margin delta")
|
|
338
|
+
else:
|
|
339
|
+
margin_delta = current_margin_avg - baseline_margin_avg
|
|
340
|
+
if margin_delta < -3:
|
|
341
|
+
warnings.append(f"suite margin dropped {margin_delta:+.1f} vs shipped (soft gate: > −3)")
|
|
181
342
|
|
|
182
343
|
# Soft gate: any fixture that was > +5 before is now ≤ 0
|
|
183
344
|
if baseline:
|
|
184
|
-
prev_rows = {
|
|
185
|
-
|
|
345
|
+
prev_rows = {
|
|
346
|
+
r["fixture"]: r for r in baseline.get("rows", [])
|
|
347
|
+
if isinstance(r, dict) and isinstance(r.get("fixture"), str)
|
|
348
|
+
}
|
|
349
|
+
for r in rows:
|
|
186
350
|
fid = r.get("fixture")
|
|
187
351
|
prev = prev_rows.get(fid)
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
352
|
+
prev_margin = number_or_none(prev.get("margin")) if prev else None
|
|
353
|
+
current_margin = number_or_none(r.get("margin"))
|
|
354
|
+
if prev and prev_margin is not None and prev_margin > 5:
|
|
355
|
+
if current_margin is None:
|
|
356
|
+
warnings.append(f"{fid} margin missing; was {prev_margin:+g}")
|
|
357
|
+
elif current_margin <= 0:
|
|
358
|
+
warnings.append(
|
|
359
|
+
f"{fid} lost its margin: was {prev_margin:+g}, now {current_margin:+g}"
|
|
360
|
+
)
|
|
192
361
|
|
|
193
362
|
verdict = "PASS" if not failures else "FAIL"
|
|
194
363
|
print(f"\n═══ SHIP-GATE VERDICT: {verdict} ═══\n")
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Validate a shadow fixture solo ceiling avoidance note."""
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import pathlib
|
|
7
|
+
import re
|
|
8
|
+
import sys
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
SECTION_RE = re.compile(r"(?ms)^##[ \t]+Solo ceiling avoidance\b[^\n]*\n(.*?)(?=^##[ \t]+|\Z)")
|
|
12
|
+
CONTROL_RE = re.compile(r"\bS[2-6]\b|S2-S6|solo-saturated|rejected controls?", re.IGNORECASE)
|
|
13
|
+
REASON_RE = re.compile(r"\bdiffer(?:s|ent|ence)?\b|\bunlike\b|\bbecause\b|\bpreserve\b|\bheadroom\b", re.IGNORECASE)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def read_text(path: pathlib.Path) -> str:
    """Return the contents of *path* decoded as UTF-8.

    On failure a one-line diagnostic is written to stderr and the process
    exits with status 2.

    Raises:
        SystemExit: status 2 when the file is not valid UTF-8 or cannot be
            read at all.
    """
    try:
        return path.read_text(encoding="utf-8")
    except UnicodeDecodeError as exc:
        complaint = f"{path}: expected UTF-8 text ({exc})"
    except OSError as exc:
        complaint = f"{path}: unable to read ({exc})"
    # Both failure modes share the same reporting and exit path.
    print(complaint, file=sys.stderr)
    raise SystemExit(2) from None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def solo_ceiling_avoidance_error(text: str) -> str | None:
    """Return the first problem found in the note's solo-ceiling section.

    The section is located with ``SECTION_RE``; its body must mention
    ``solo_claude``, match ``CONTROL_RE`` and match ``REASON_RE``. Checks are
    reported in that order, and ``None`` means every check passed.
    """
    located = SECTION_RE.search(text)
    if located is None:
        return "missing ## Solo ceiling avoidance section"

    body = located.group(1)
    # (passed?, message when it did not) — order decides which error wins.
    checks = (
        (
            "solo_claude" in body,
            "solo ceiling avoidance must mention solo_claude",
        ),
        (
            CONTROL_RE.search(body) is not None,
            "solo ceiling avoidance must compare against rejected or solo-saturated controls such as S2-S6",
        ),
        (
            REASON_RE.search(body) is not None,
            "solo ceiling avoidance must state difference/headroom reasoning",
        ),
    )
    for passed, message in checks:
        if not passed:
            return message
    return None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def main(argv: list[str]) -> int:
    """Validate the note at the path given on the command line.

    Args:
        argv: command-line arguments, excluding the program name.

    Returns:
        0 when the note passes validation; 1 after printing the validation
        error to stderr.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("path", type=pathlib.Path)
    options = cli.parse_args(argv)

    problem = solo_ceiling_avoidance_error(read_text(options.path))
    if not problem:
        return 0
    print(f"{options.path}: {problem}", file=sys.stderr)
    return 1
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
if __name__ == "__main__":
|
|
53
|
+
raise SystemExit(main(sys.argv[1:]))
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Validate that a pair-candidate fixture states an actionable solo-headroom hypothesis."""
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import json
|
|
7
|
+
import pathlib
|
|
8
|
+
import sys
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
from pair_evidence_contract import (
|
|
12
|
+
actionable_observable_commands,
|
|
13
|
+
has_actionable_solo_headroom_hypothesis_text,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def combined_text(paths: list[pathlib.Path]) -> str:
    """Concatenate the UTF-8 text of every regular file in *paths*.

    Paths that are not regular files are skipped. The remaining files are
    read in the given order and joined with a single newline.

    Raises:
        SystemExit: status 2, after a diagnostic on stderr, when a file is
            not valid UTF-8 or cannot be read.
    """
    chunks: list[str] = []
    for path in paths:
        if not path.is_file():
            continue
        try:
            chunks.append(path.read_text(encoding="utf-8"))
        except UnicodeDecodeError as exc:
            print(f"{path}: expected UTF-8 text ({exc})", file=sys.stderr)
            raise SystemExit(2) from None
        except OSError as exc:
            # An uncaught OSError would end in a traceback with exit status 1,
            # which this script also uses to mean "validation failed". Report
            # it as an input error (status 2) like the decoding failure above.
            print(f"{path}: unable to read ({exc})", file=sys.stderr)
            raise SystemExit(2) from None
    return "\n".join(chunks)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def has_actionable_hypothesis(text: str) -> bool:
    """Report whether *text* states an actionable solo-headroom hypothesis.

    The decision is delegated entirely to
    ``has_actionable_solo_headroom_hypothesis_text`` from
    ``pair_evidence_contract``; its result is returned unchanged.
    """
    verdict = has_actionable_solo_headroom_hypothesis_text(text)
    return verdict
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def expected_commands(path: pathlib.Path) -> set[str]:
    """Return the set of ``cmd`` strings under ``verification_commands``.

    *path* must be a UTF-8 JSON file whose top level is an object with a
    ``verification_commands`` list; each entry must be an object with a
    string ``cmd``.

    Raises:
        SystemExit: status 2, after a diagnostic on stderr, when the file
            cannot be read, is not UTF-8, is not valid JSON, or does not
            have the shape described above.
    """
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except UnicodeDecodeError as exc:
        print(f"{path}: expected UTF-8 JSON ({exc})", file=sys.stderr)
        raise SystemExit(2) from None
    except json.JSONDecodeError as exc:
        print(f"{path}: invalid JSON ({exc})", file=sys.stderr)
        raise SystemExit(2) from None
    except OSError as exc:
        # A missing or unreadable file is an input error, not a crash.
        print(f"{path}: unable to read ({exc})", file=sys.stderr)
        raise SystemExit(2) from None

    # Valid JSON may still be a list, string or number at the top level;
    # calling .get() on those would raise AttributeError.
    if not isinstance(data, dict):
        print(f"{path}: expected a top-level JSON object", file=sys.stderr)
        raise SystemExit(2)

    commands = data.get("verification_commands")
    if not isinstance(commands, list):
        print(f"{path}: verification_commands must be a list", file=sys.stderr)
        raise SystemExit(2)

    result: set[str] = set()
    for index, command in enumerate(commands):
        if not isinstance(command, dict) or not isinstance(command.get("cmd"), str):
            print(f"{path}: verification_commands[{index}].cmd must be a string", file=sys.stderr)
            raise SystemExit(2)
        result.add(command["cmd"])
    return result
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def main(argv: list[str]) -> int:
    """Check the given fixture files for an actionable solo-headroom hypothesis.

    Args:
        argv: command-line arguments, excluding the program name.

    Returns:
        1 when the combined text has no actionable hypothesis. Otherwise 0,
        unless ``--expected-json`` is given: then 0 only when at least one
        observable command found in the text is listed in that file, else 1.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--expected-json",
        type=pathlib.Path,
        help="Require the observable hypothesis command to match expected.json verification_commands[].cmd.",
    )
    cli.add_argument("paths", nargs="+", type=pathlib.Path)
    options = cli.parse_args(argv)

    notes = combined_text(options.paths)
    if not has_actionable_hypothesis(notes):
        return 1
    if options.expected_json is None:
        return 0

    allowed = expected_commands(options.expected_json)
    # Succeed on the first observable command that expected.json also lists.
    for candidate in actionable_observable_commands(notes):
        if candidate in allowed:
            return 0
    return 1
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
if __name__ == "__main__":
|
|
77
|
+
raise SystemExit(main(sys.argv[1:]))
|