devlyn-cli 2.3.0 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +1 -1
- package/CLAUDE.md +2 -2
- package/README.md +82 -29
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
- package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
- package/benchmark/auto-resolve/README.md +307 -44
- package/benchmark/auto-resolve/RUBRIC.md +23 -14
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
- package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
- package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
- package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
- package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
- package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
- package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
- package/benchmark/auto-resolve/scripts/judge.sh +153 -26
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
- package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
- package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
- package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
- package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
- package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
- package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
- package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
- package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
- package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
- package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
- package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
- package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
- package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
- package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
- package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
- package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
- package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
- package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
- package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
- package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
- package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
- package/bin/devlyn.js +211 -18
- package/config/skills/_shared/adapters/README.md +3 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
- package/config/skills/_shared/adapters/opus-4-7.md +9 -1
- package/config/skills/_shared/archive_run.py +78 -6
- package/config/skills/_shared/codex-config.md +3 -2
- package/config/skills/_shared/codex-monitored.sh +46 -1
- package/config/skills/_shared/collect-codex-findings.py +20 -5
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/runtime-principles.md +5 -8
- package/config/skills/_shared/spec-verify-check.py +2664 -107
- package/config/skills/_shared/verify-merge-findings.py +1369 -19
- package/config/skills/devlyn:ideate/SKILL.md +7 -4
- package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
- package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
- package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
- package/config/skills/devlyn:resolve/SKILL.md +49 -18
- package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
- package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
- package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
- package/package.json +47 -2
- package/scripts/lint-fixtures.sh +349 -0
- package/scripts/lint-shadow-fixtures.sh +58 -0
- package/scripts/lint-skills.sh +3642 -92
- /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
|
@@ -14,7 +14,7 @@ anchoring is left-only, which is what we want. Per-oracle convention
|
|
|
14
14
|
documented here; step 1's content oracle uses regex instead.
|
|
15
15
|
|
|
16
16
|
Fixtures can waive any Tier A pattern via `expected.json::tier_a_waivers`
|
|
17
|
-
(list of fnmatch globs). Load-bearing case: F9 e2e-ideate-to-
|
|
17
|
+
(list of fnmatch globs). Load-bearing case: F9 e2e-ideate-to-resolve
|
|
18
18
|
legitimately creates docs/VISION.md, docs/ROADMAP.md, docs/roadmap/**.
|
|
19
19
|
|
|
20
20
|
Step 2 scope: findings only. Scoring integration is a later step.
|
|
@@ -27,6 +27,8 @@ import pathlib
|
|
|
27
27
|
import subprocess
|
|
28
28
|
import sys
|
|
29
29
|
|
|
30
|
+
from pair_evidence_contract import loads_strict_json_object
|
|
31
|
+
|
|
30
32
|
ORACLE_NAME = "scope-tier-a"
|
|
31
33
|
|
|
32
34
|
# iter-0022: stable category enumeration. See header comment in
|
|
@@ -222,27 +224,32 @@ def main():
|
|
|
222
224
|
|
|
223
225
|
waivers = []
|
|
224
226
|
fixture_id = None
|
|
227
|
+
expected_error = None
|
|
225
228
|
if args.expected:
|
|
226
229
|
exp_path = pathlib.Path(args.expected)
|
|
227
230
|
# fixture_id = parent directory name of expected.json
|
|
228
231
|
fixture_id = exp_path.parent.name
|
|
229
232
|
try:
|
|
230
|
-
expected =
|
|
233
|
+
expected = loads_strict_json_object(exp_path.read_text())
|
|
231
234
|
raw = expected.get("tier_a_waivers", [])
|
|
232
235
|
if isinstance(raw, list):
|
|
233
236
|
waivers = [w for w in raw if isinstance(w, str)]
|
|
234
|
-
except (OSError, json.JSONDecodeError) as e:
|
|
237
|
+
except (OSError, json.JSONDecodeError, ValueError) as e:
|
|
238
|
+
expected_error = f"expected.json unreadable: {e}"
|
|
235
239
|
sys.stderr.write(
|
|
236
240
|
f"[oracle-scope-tier-a] could not read waivers from {args.expected}: {e}\n"
|
|
237
241
|
)
|
|
238
242
|
|
|
239
243
|
findings = analyze(args.work, args.scaffold, waivers, fixture_id=fixture_id)
|
|
240
|
-
|
|
244
|
+
report = {
|
|
241
245
|
"oracle": "scope-tier-a",
|
|
242
246
|
"waivers": waivers,
|
|
243
247
|
"fixture_id": fixture_id,
|
|
244
248
|
"findings": findings,
|
|
245
|
-
}
|
|
249
|
+
}
|
|
250
|
+
if expected_error:
|
|
251
|
+
report["error"] = expected_error
|
|
252
|
+
print(json.dumps(report, indent=2))
|
|
246
253
|
|
|
247
254
|
|
|
248
255
|
if __name__ == "__main__":
|
|
@@ -34,6 +34,8 @@ import re
|
|
|
34
34
|
import subprocess
|
|
35
35
|
import sys
|
|
36
36
|
|
|
37
|
+
from pair_evidence_contract import loads_strict_json_object
|
|
38
|
+
|
|
37
39
|
ORACLE_NAME = "scope-tier-b"
|
|
38
40
|
|
|
39
41
|
# iter-0022: stable category enumeration. tier-b-reachable is `info` severity
|
|
@@ -221,8 +223,8 @@ def main():
|
|
|
221
223
|
ap.error("--work, --scaffold, and --expected are required unless --list-categories is set")
|
|
222
224
|
|
|
223
225
|
try:
|
|
224
|
-
expected =
|
|
225
|
-
except (OSError, json.JSONDecodeError) as e:
|
|
226
|
+
expected = loads_strict_json_object(pathlib.Path(args.expected).read_text())
|
|
227
|
+
except (OSError, json.JSONDecodeError, ValueError) as e:
|
|
226
228
|
sys.stderr.write(f"[oracle-scope-tier-b] cannot read expected: {e}\n")
|
|
227
229
|
print(json.dumps({
|
|
228
230
|
"oracle": "scope-tier-b",
|
|
@@ -238,6 +240,27 @@ def main():
|
|
|
238
240
|
# fixture_id = parent directory name of expected.json
|
|
239
241
|
fixture_id = pathlib.Path(args.expected).parent.name
|
|
240
242
|
|
|
243
|
+
if not isinstance(tier_c, list) or not all(isinstance(item, str) for item in tier_c):
|
|
244
|
+
print(json.dumps({
|
|
245
|
+
"oracle": "scope-tier-b",
|
|
246
|
+
"trace_method": TRACE_METHOD,
|
|
247
|
+
"tier_c_seeds_matched": [],
|
|
248
|
+
"fixture_id": fixture_id,
|
|
249
|
+
"findings": [],
|
|
250
|
+
"error": "expected.json malformed: spec_output_files must be a string array",
|
|
251
|
+
}, indent=2))
|
|
252
|
+
return
|
|
253
|
+
if not isinstance(waivers, list) or not all(isinstance(item, str) for item in waivers):
|
|
254
|
+
print(json.dumps({
|
|
255
|
+
"oracle": "scope-tier-b",
|
|
256
|
+
"trace_method": TRACE_METHOD,
|
|
257
|
+
"tier_c_seeds_matched": [],
|
|
258
|
+
"fixture_id": fixture_id,
|
|
259
|
+
"findings": [],
|
|
260
|
+
"error": "expected.json malformed: tier_a_waivers must be a string array",
|
|
261
|
+
}, indent=2))
|
|
262
|
+
return
|
|
263
|
+
|
|
241
264
|
if not tier_c:
|
|
242
265
|
print(json.dumps({
|
|
243
266
|
"oracle": "scope-tier-b",
|
|
@@ -0,0 +1,469 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Report active pair-candidate fixture frontier.
|
|
3
|
+
|
|
4
|
+
This is a spending guard for solo<pair work. It answers three questions before
|
|
5
|
+
new provider calls:
|
|
6
|
+
- which active fixtures are already rejected by measured headroom/design,
|
|
7
|
+
- which active fixtures remain pair-candidate eligible,
|
|
8
|
+
- which eligible fixtures already have passing full-pipeline pair evidence.
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import json
|
|
14
|
+
import math
|
|
15
|
+
import pathlib
|
|
16
|
+
import re
|
|
17
|
+
import subprocess
|
|
18
|
+
import sys
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
SCRIPT_DIR = pathlib.Path(__file__).resolve().parent
|
|
22
|
+
if str(SCRIPT_DIR) not in sys.path:
|
|
23
|
+
sys.path.insert(0, str(SCRIPT_DIR))
|
|
24
|
+
|
|
25
|
+
from pair_evidence_contract import (
|
|
26
|
+
all_known_pair_trigger_reasons,
|
|
27
|
+
best_pair_evidence,
|
|
28
|
+
has_canonical_pair_trigger_reason,
|
|
29
|
+
has_known_pair_trigger_reason,
|
|
30
|
+
is_strict_number,
|
|
31
|
+
loads_strict_json_object,
|
|
32
|
+
normalize_pair_evidence_row,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def fixture_short(name: str) -> str:
    """Return the short fixture id: everything before the first ``-``.

    A name that contains no ``-`` is returned unchanged.
    """
    head, _sep, _rest = name.partition("-")
    return head
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def sort_fixture_key(name: str) -> tuple[int, str]:
    """Build the sort key used to order fixture names.

    The first element is the integer from a short id of the exact form
    ``F<digits>``; any other short id gets the fixed rank 10_000. The full
    name is the second element and acts as the tie-breaker.
    """
    numbered = re.fullmatch(r"F(\d+)", fixture_short(name))
    if numbered is None:
        return (10_000, name)
    return (int(numbered.group(1)), name)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def active_fixtures(fixtures_root: pathlib.Path) -> list[str]:
    """List fixture directory names found directly under *fixtures_root*.

    Only immediate subdirectories whose name fully matches ``F<digits>-<rest>``
    are kept; the result is ordered with ``sort_fixture_key``.

    Raises:
        ValueError: if *fixtures_root* is not a directory.
    """
    if not fixtures_root.is_dir():
        raise ValueError(f"fixtures root missing: {fixtures_root}")
    names: list[str] = []
    for entry in fixtures_root.iterdir():
        if entry.is_dir() and re.fullmatch(r"F\d+-.+", entry.name):
            names.append(entry.name)
    names.sort(key=sort_fixture_key)
    return names
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def registry_short_ids(registry: pathlib.Path) -> set[str]:
    """Collect the short ids that appear as entries in the registry file.

    A line counts as an entry when, after optional leading whitespace, it
    starts with ``<ID>-*|<ID>)`` and both ids are the same ``F<digits>`` or
    ``S<digits>`` token.

    Raises:
        ValueError: if *registry* is not a file, or if no line is an entry.
    """
    if not registry.is_file():
        raise ValueError(f"rejected fixture registry missing: {registry}")
    entry_pattern = re.compile(r"\s*([FS]\d+)-\*\|([FS]\d+)\)")
    hits = (entry_pattern.match(text) for text in registry.read_text().splitlines())
    short_ids = {hit.group(1) for hit in hits if hit and hit.group(1) == hit.group(2)}
    if not short_ids:
        raise ValueError(f"rejected fixture registry has no fixture entries: {registry}")
    return short_ids
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def rejected_reason(registry: pathlib.Path, fixture: str) -> str | None:
    """Ask the registry script for the rejection reason of *fixture*.

    Runs bash, sources *registry*, and calls ``rejected_pair_fixture_reason``
    with the fixture name. Both values are passed as positional arguments
    (``$1``/``$2``) rather than interpolated into the command string.

    Returns:
        The stripped stdout when the command exits with status 0, else None.
    """
    command = [
        "bash",
        "-c",
        'source "$1"; rejected_pair_fixture_reason "$2"',
        "bash",
        str(registry),
        fixture,
    ]
    # capture_output=True is the documented shorthand for piping both streams.
    completed = subprocess.run(command, text=True, capture_output=True, check=False)
    return completed.stdout.strip() if completed.returncode == 0 else None
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def load_json_object(path: pathlib.Path) -> dict[str, Any]:
    """Read *path* and parse its text with ``loads_strict_json_object``.

    Raises:
        ValueError: naming *path*, whenever reading or parsing raises OSError,
            ValueError or json.JSONDecodeError. The original cause is
            suppressed (``from None``).
    """
    try:
        return loads_strict_json_object(path.read_text())
    except (OSError, ValueError, json.JSONDecodeError):
        raise ValueError(f"pair evidence artifact malformed: {path}") from None
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def pair_gate_rows(path: pathlib.Path, gate: dict[str, Any]) -> list[dict[str, Any]]:
    """Return ``gate["rows"]`` after validating its shape.

    Args:
        path: Location of the gate artifact; used only in the error message.
        gate: Parsed gate artifact.

    Raises:
        ValueError: if ``rows`` is missing, not a list, empty, or contains a
            non-dict element.
    """
    rows = gate.get("rows")
    well_formed = (
        isinstance(rows, list)
        and bool(rows)
        and all(isinstance(entry, dict) for entry in rows)
    )
    if not well_formed:
        raise ValueError(f"pair evidence artifact rows malformed: {path}")
    return rows
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def pair_result_trigger_reasons(
    results_root: pathlib.Path,
    *,
    run_id: str,
    fixture: str,
    pair_arm: str,
) -> list[str]:
    """Read pair-trigger reasons from a pair arm's ``result.json``.

    The file is looked up at
    ``<results_root>/<run_id>/<fixture>/<pair_arm>/result.json``.

    Returns:
        ``pair_trigger.reasons`` from that file when it is a non-empty list of
        strings that passes all three ``pair_evidence_contract`` predicates
        used below. In every other case, including an unreadable or
        unparsable file, an empty list.
    """
    result_path = results_root / run_id / fixture / pair_arm / "result.json"
    try:
        payload = loads_strict_json_object(result_path.read_text())
    except (OSError, ValueError, json.JSONDecodeError):
        return []

    trigger = payload.get("pair_trigger")
    if not isinstance(trigger, dict):
        return []

    reasons = trigger.get("reasons")
    if not isinstance(reasons, list) or not reasons:
        return []
    if not all(isinstance(reason, str) for reason in reasons):
        return []

    # Predicates run in the same order as before and stop at the first failure.
    contract_checks = (
        has_known_pair_trigger_reason,
        all_known_pair_trigger_reasons,
        has_canonical_pair_trigger_reason,
    )
    if not all(check(reasons) for check in contract_checks):
        return []
    return reasons
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def passing_pair_evidence(
    results_root: pathlib.Path,
    *,
    min_pair_margin: int,
    max_pair_solo_wall_ratio: float,
) -> dict[str, list[dict[str, Any]]]:
    """Gather passing full-pipeline pair evidence rows, grouped by fixture.

    Scans ``<results_root>/*/full-pipeline-pair-gate.json`` in sorted order.
    Only gates whose verdict is ``PASS`` and rows whose status is ``PASS`` with
    a string ``fixture`` are considered. When a row has no
    ``pair_trigger_reasons`` and the gate names a string ``pair_arm``, reasons
    are looked up in that arm's ``result.json`` and copied onto a shallow copy
    of the row. Rows are normalized with ``normalize_pair_evidence_row`` and
    dropped when normalization returns None, the margin is below
    *min_pair_margin*, or the wall ratio exceeds *max_pair_solo_wall_ratio*.

    Returns:
        Mapping of fixture name to its accepted evidence rows; empty when
        *results_root* is not a directory.

    Raises:
        ValueError: propagated from ``load_json_object`` / ``pair_gate_rows``
            for a malformed gate artifact.
    """
    collected: dict[str, list[dict[str, Any]]] = {}
    if not results_root.is_dir():
        return collected

    for gate_file in sorted(results_root.glob("*/full-pipeline-pair-gate.json")):
        gate = load_json_object(gate_file)
        if gate.get("verdict") != "PASS":
            continue
        # Fall back to the run directory name when the gate omits run_id.
        run_id = str(gate.get("run_id") or gate_file.parent.name)
        pair_arm = gate.get("pair_arm")

        for gate_row in pair_gate_rows(gate_file, gate):
            if gate_row.get("status") != "PASS":
                continue
            fixture = gate_row.get("fixture")
            if not isinstance(fixture, str):
                continue

            candidate = gate_row
            needs_backfill = (
                gate_row.get("pair_trigger_reasons") is None
                and isinstance(pair_arm, str)
            )
            if needs_backfill:
                backfilled = pair_result_trigger_reasons(
                    results_root,
                    run_id=run_id,
                    fixture=fixture,
                    pair_arm=pair_arm,
                )
                if backfilled:
                    # Copy so the parsed gate row itself is never mutated.
                    candidate = {
                        **gate_row,
                        "pair_trigger_reasons": backfilled,
                        "pair_trigger_has_canonical_reason": True,
                    }

            normalized = normalize_pair_evidence_row(
                fixture=fixture,
                run_id=run_id,
                pair_arm=pair_arm,
                row=candidate,
            )
            if normalized is None:
                continue

            margin = normalized["pair_margin"]
            ratio = normalized["pair_solo_wall_ratio"]
            if margin < min_pair_margin or ratio > max_pair_solo_wall_ratio:
                continue
            collected.setdefault(fixture, []).append(normalized)
    return collected
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def build_report(
    *,
    fixtures_root: pathlib.Path,
    registry: pathlib.Path,
    results_root: pathlib.Path,
    min_pair_margin: int = 5,
    max_pair_solo_wall_ratio: float = 3.0,
) -> dict[str, Any]:
    """Assemble the pair-candidate frontier report.

    Each active fixture gets one row with a status:

    - ``rejected``: its short id is in the registry and the registry script
      returned a non-empty reason;
    - ``pair_evidence_passed``: not rejected, with at least one passing
      evidence row;
    - ``candidate_unmeasured``: everything else.

    The verdict is ``PASS`` only when no fixture is ``candidate_unmeasured``.
    Margin and wall-ratio aggregates use the best evidence row per fixture, as
    chosen by ``best_pair_evidence``. Each total is also emitted under a
    ``*_count`` alias key.

    Raises:
        ValueError: propagated from the fixture, registry and evidence loaders.
    """
    fixtures = active_fixtures(fixtures_root)
    rejected_short = registry_short_ids(registry)
    evidence_by_fixture = passing_pair_evidence(
        results_root,
        min_pair_margin=min_pair_margin,
        max_pair_solo_wall_ratio=max_pair_solo_wall_ratio,
    )

    rows: list[dict[str, Any]] = []
    for fixture in fixtures:
        short_id = fixture_short(fixture)
        # Only consult the registry script for ids the registry file lists.
        reason = rejected_reason(registry, fixture) if short_id in rejected_short else None
        evidence = evidence_by_fixture.get(fixture, [])
        if reason:
            status = "rejected"
        elif evidence:
            status = "pair_evidence_passed"
        else:
            status = "candidate_unmeasured"
        rows.append(
            {
                "fixture": fixture,
                "short_id": short_id,
                "status": status,
                "rejected_reason": reason,
                "passing_pair_evidence": evidence,
            }
        )

    statuses = [row["status"] for row in rows]
    rejected_total = statuses.count("rejected")
    candidate_total = len(statuses) - rejected_total
    pair_evidence_total = statuses.count("pair_evidence_passed")
    unmeasured_candidate_total = statuses.count("candidate_unmeasured")

    best_pairs = []
    for row in rows:
        if row["status"] != "pair_evidence_passed":
            continue
        best = best_pair_evidence(row["passing_pair_evidence"])
        if best is not None:
            best_pairs.append(best)

    pair_margins = [
        entry["pair_margin"]
        for entry in best_pairs
        if isinstance(entry.get("pair_margin"), int)
    ]
    wall_ratios = [
        entry["pair_solo_wall_ratio"]
        for entry in best_pairs
        if is_strict_number(entry.get("pair_solo_wall_ratio"))
    ]

    verdict = "FAIL" if unmeasured_candidate_total else "PASS"
    return {
        "verdict": verdict,
        "min_pair_margin": min_pair_margin,
        "max_pair_solo_wall_ratio": max_pair_solo_wall_ratio,
        "fixtures_total": len(rows),
        "rejected_total": rejected_total,
        "candidate_total": candidate_total,
        "pair_evidence_total": pair_evidence_total,
        "unmeasured_candidate_total": unmeasured_candidate_total,
        "rejected_count": rejected_total,
        "candidate_count": candidate_total,
        "pair_evidence_count": pair_evidence_total,
        "unmeasured_count": unmeasured_candidate_total,
        "pair_margin_avg": average(pair_margins),
        "pair_margin_min": min(pair_margins) if pair_margins else None,
        "pair_solo_wall_ratio_avg": average(wall_ratios),
        "pair_solo_wall_ratio_max": round(max(wall_ratios), 2) if wall_ratios else None,
        "rows": rows,
    }
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def write_markdown(path: pathlib.Path, report: dict[str, Any]) -> None:
    """Write ``report`` to ``path`` as a Markdown summary followed by a per-fixture table.

    The summary lines are read from the aggregate keys of ``report``; the table
    has one row per entry in ``report["rows"]``, filled from that row's best
    passing pair evidence (blank cells when there is none).

    Args:
        path: Destination file; it is overwritten with UTF-8 text.
        report: Report mapping. Must contain ``fixtures_total``, ``verdict``,
            ``rejected_total``, ``candidate_total``, ``pair_evidence_total``,
            ``unmeasured_candidate_total`` and ``rows``; the threshold and
            average/min/max keys are optional and render blank when absent.
    """

    def cell(value: Any) -> str:
        # A raw "|" would end the table cell early and a newline would end the
        # row, so free-text values are neutralised before interpolation.
        text = "" if value is None else str(value)
        return text.replace("|", "\\|").replace("\r", " ").replace("\n", " ")

    lines = [
        "# Pair Candidate Frontier",
        "",
        f"Active fixtures: {report['fixtures_total']}",
        f"Verdict: {report['verdict']}",
        f"Rejected fixtures: {report['rejected_total']}",
        f"Candidate fixtures: {report['candidate_total']}",
        f"Candidates with passing pair evidence: {report['pair_evidence_total']}",
        f"Unmeasured candidates: {report['unmeasured_candidate_total']}",
        f"Minimum pair margin required: {format_margin(report.get('min_pair_margin'))}",
        f"Maximum pair/solo wall ratio allowed: {format_wall_ratio(report.get('max_pair_solo_wall_ratio'))}",
        f"Average pair margin: {format_decimal_margin(report.get('pair_margin_avg'))}",
        f"Minimum pair margin: {format_margin(report.get('pair_margin_min'))}",
        f"Average pair/solo wall ratio: {format_wall_ratio(report.get('pair_solo_wall_ratio_avg'))}",
        f"Maximum pair/solo wall ratio: {format_wall_ratio(report.get('pair_solo_wall_ratio_max'))}",
        "",
        "| Fixture | Status | Verdict | Evidence | Pair arm | Triggers | Hypothesis trigger | Bare | Solo_claude | Pair | Margin | Wall ratio | Rejected reason |",
        "|---|---|---|---|---|---|---|---:|---:|---:|---:|---:|---|",
    ]
    for row in report["rows"]:
        best = best_pair_evidence(row["passing_pair_evidence"])
        # ``or ""`` (instead of a ``.get`` default) also blanks a key that is
        # present but None, which would otherwise be rendered as "None"; this
        # mirrors how print_summary treats the same fields.
        evidence_text = cell(best.get("run_id") or "") if best else ""
        pair_arm = cell(best.get("pair_arm") or "") if best else ""
        triggers = cell(format_trigger_reasons(best.get("pair_trigger_reasons"))) if best else ""
        rejected_reason = cell(row.get("rejected_reason") or "")
        # The Status and Verdict columns both carry the row status.
        lines.append(
            f"| {row['fixture']} | {row['status']} | {row['status']} | {evidence_text} | {pair_arm} | {triggers} | "
            f"{format_bool(best.get('pair_trigger_has_hypothesis_reason') if best else None)} | "
            f"{format_number(best.get('bare_score') if best else None)} | "
            f"{format_number(best.get('solo_score') if best else None)} | "
            f"{format_number(best.get('pair_score') if best else None)} | "
            f"{format_margin(best.get('pair_margin') if best else None)} | "
            f"{format_wall_ratio(best.get('pair_solo_wall_ratio') if best else None)} | "
            f"{rejected_reason} |"
        )
    path.write_text("\n".join(lines) + "\n", encoding="utf8")
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def average(values: list[int | float]) -> float | None:
|
|
309
|
+
return round(sum(values) / len(values), 2) if values else None
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def format_number(value: Any) -> str:
    """Return ``str(value)`` when ``value`` is an ``int``, otherwise ``""``.

    Note that ``bool`` is a subclass of ``int``, so booleans are rendered too.
    """
    if not isinstance(value, int):
        return ""
    return str(value)
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def format_decimal_margin(value: Any) -> str:
    """Format a numeric margin with an explicit sign and two decimals.

    Anything that is not an ``int`` or ``float`` yields an empty string.
    """
    if isinstance(value, (int, float)):
        return f"{value:+.2f}"
    return ""
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
def format_margin(value: Any) -> str:
    """Format an integer margin with an explicit sign (e.g. ``+5``, ``-1``).

    Non-``int`` values, including floats and ``None``, yield an empty string.
    """
    if not isinstance(value, int):
        return ""
    return f"{value:+d}"
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def format_wall_ratio(value: Any) -> str:
    """Format a wall-time ratio as ``<two decimals>x`` (e.g. ``1.50x``).

    Values rejected by ``is_strict_number`` yield an empty string.
    """
    if not is_strict_number(value):
        return ""
    return f"{value:.2f}x"
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def format_trigger_reasons(value: Any) -> str:
    """Join a list of trigger-reason strings with commas.

    Returns ``""`` unless ``value`` is a ``list`` whose items are all ``str``.
    """
    if not isinstance(value, list):
        return ""
    for reason in value:
        if not isinstance(reason, str):
            return ""
    return ",".join(value)
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def format_bool(value: Any) -> str:
    """Render a real ``bool`` as ``"true"`` / ``"false"``; anything else as ``""``."""
    if value is True:
        return "true"
    if value is False:
        return "false"
    return ""
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def print_summary(report: dict[str, Any]) -> None:
    """Print a compact plain-text summary of ``report`` to stdout.

    Emits one headline with the fixture counters and verdict, then (only when
    ``pair_evidence_total`` is truthy) one line of margin and wall-ratio
    aggregates, then one line per row whose status is
    ``pair_evidence_passed`` and that has a best passing evidence entry.
    """
    headline = (
        "fixtures={fixtures_total} rejected={rejected_total} "
        "candidates={candidate_total} pair_evidence={pair_evidence_total} "
        "unmeasured={unmeasured_candidate_total} verdict={verdict}"
    )
    print(headline.format(**report))

    if report.get("pair_evidence_total"):
        aggregates = {
            "avg": format_decimal_margin(report.get("pair_margin_avg")),
            "min_margin": format_margin(report.get("pair_margin_min")),
            "wall_avg": format_wall_ratio(report.get("pair_solo_wall_ratio_avg")),
            "wall_max": format_wall_ratio(report.get("pair_solo_wall_ratio_max")),
        }
        print(
            "pair_margin_avg={avg} pair_margin_min={min_margin} "
            "wall_avg={wall_avg} wall_max={wall_max}".format(**aggregates)
        )

    row_template = (
        "{fixture}: bare={bare} solo_claude={solo} pair={pair} arm={arm} margin={margin} "
        "wall={wall} run={run} verdict=pair_evidence_passed triggers={triggers} "
        "hypothesis_trigger={hypothesis_trigger}"
    )
    passed_rows = (
        entry for entry in report["rows"] if entry["status"] == "pair_evidence_passed"
    )
    for entry in passed_rows:
        best = best_pair_evidence(entry["passing_pair_evidence"])
        if not best:
            # Nothing to report for a row without a usable evidence entry.
            continue
        fields = {
            "fixture": entry["fixture"],
            "bare": format_number(best.get("bare_score")),
            "solo": format_number(best.get("solo_score")),
            "pair": format_number(best.get("pair_score")),
            "arm": best.get("pair_arm") or "",
            "margin": format_margin(best.get("pair_margin")),
            "wall": format_wall_ratio(best.get("pair_solo_wall_ratio")),
            "run": best.get("run_id") or "",
            "triggers": format_trigger_reasons(best.get("pair_trigger_reasons")),
            "hypothesis_trigger": format_bool(best.get("pair_trigger_has_hypothesis_reason")),
        }
        print(row_template.format(**fields))
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
def print_final_verdict(report: dict[str, Any]) -> None:
    """Print the final PASS/FAIL banner for the frontier check to stdout.

    Any ``verdict`` other than the exact string ``"PASS"`` (including a
    missing key) is reported as FAIL. Output is flushed immediately.
    """
    passed = report.get("verdict") == "PASS"
    banner = "PASS pair-candidate-frontier" if passed else "FAIL pair-candidate-frontier"
    print(banner, flush=True)
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def main() -> int:
    """Command-line entry point: build the frontier report and emit it.

    Without ``--out-json`` / ``--out-md`` the report is printed to stdout as
    JSON; otherwise the requested files are written and a text summary plus
    the final verdict banner are printed instead.

    Returns:
        ``2`` for an invalid threshold argument or a ``ValueError`` raised by
        ``build_report``; ``1`` when ``--fail-on-unmeasured`` is set and
        unmeasured candidates remain; ``0`` otherwise.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--fixtures-root",
        type=pathlib.Path,
        default=pathlib.Path("benchmark/auto-resolve/fixtures"),
    )
    cli.add_argument(
        "--registry",
        type=pathlib.Path,
        default=pathlib.Path(__file__).with_name("pair-rejected-fixtures.sh"),
    )
    cli.add_argument(
        "--results-root",
        type=pathlib.Path,
        default=pathlib.Path("benchmark/auto-resolve/results"),
    )
    cli.add_argument("--out-json", type=pathlib.Path)
    cli.add_argument("--out-md", type=pathlib.Path)
    cli.add_argument(
        "--fail-on-unmeasured",
        action="store_true",
        help="exit 1 when active candidate_unmeasured fixtures remain",
    )
    cli.add_argument(
        "--min-pair-margin",
        type=int,
        default=5,
        help="minimum pair-over-solo margin required to count passing pair evidence",
    )
    cli.add_argument(
        "--max-pair-solo-wall-ratio",
        type=float,
        default=3.0,
        help="maximum pair/solo wall-time ratio allowed to count passing pair evidence",
    )
    opts = cli.parse_args()

    # Threshold sanity checks: usage errors exit with status 2.
    if opts.min_pair_margin < 1:
        print("error: --min-pair-margin must be >= 1", file=sys.stderr)
        return 2
    ratio_cap = opts.max_pair_solo_wall_ratio
    if ratio_cap <= 0 or not math.isfinite(ratio_cap):
        print("error: --max-pair-solo-wall-ratio must be finite and > 0", file=sys.stderr)
        return 2

    try:
        report = build_report(
            fixtures_root=opts.fixtures_root,
            registry=opts.registry,
            results_root=opts.results_root,
            min_pair_margin=opts.min_pair_margin,
            max_pair_solo_wall_ratio=ratio_cap,
        )
    except ValueError as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 2

    json_target = opts.out_json
    md_target = opts.out_md
    if json_target:
        json_target.parent.mkdir(parents=True, exist_ok=True)
        json_target.write_text(json.dumps(report, indent=2) + "\n", encoding="utf8")
    if md_target:
        md_target.parent.mkdir(parents=True, exist_ok=True)
        write_markdown(md_target, report)

    wrote_files = bool(json_target or md_target)
    if wrote_files:
        print_summary(report)
        print_final_verdict(report)
    else:
        # No output file requested: stdout carries the JSON report only.
        print(json.dumps(report, indent=2))

    if not (opts.fail_on_unmeasured and report["unmeasured_candidate_total"] > 0):
        return 0

    pending = [
        entry["fixture"]
        for entry in report["rows"]
        if entry["status"] == "candidate_unmeasured"
    ]
    print(
        "unmeasured candidate fixture(s): " + ", ".join(pending),
        file=sys.stderr,
    )
    if not wrote_files:
        # In JSON-to-stdout mode the banner was not printed above, so the
        # failure is announced on stderr.
        print("FAIL pair-candidate-frontier", file=sys.stderr, flush=True)
    return 1
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -31,6 +31,8 @@ import re
|
|
|
31
31
|
import subprocess
|
|
32
32
|
import sys
|
|
33
33
|
|
|
34
|
+
from pair_evidence_contract import loads_strict_json_object
|
|
35
|
+
|
|
34
36
|
ORACLE_SCRIPTS = {
|
|
35
37
|
"test-fidelity": "oracle-test-fidelity.py",
|
|
36
38
|
"scope-tier-a": "oracle-scope-tier-a.py",
|
|
@@ -157,7 +159,7 @@ def list_oracle_categories(scripts_dir, oracle_name):
|
|
|
157
159
|
text=True,
|
|
158
160
|
check=True,
|
|
159
161
|
)
|
|
160
|
-
payload =
|
|
162
|
+
payload = loads_strict_json_object(r.stdout)
|
|
161
163
|
if payload.get("oracle") != oracle_name:
|
|
162
164
|
raise ValueError(
|
|
163
165
|
f"oracle name mismatch: expected {oracle_name}, got {payload.get('oracle')}"
|
|
@@ -173,10 +175,8 @@ def build_registry(fixture_dir, scripts_dir, generated_at, repo_root):
|
|
|
173
175
|
expected_path = fixture_dir / "expected.json"
|
|
174
176
|
metadata_path = fixture_dir / "metadata.json"
|
|
175
177
|
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
with open(metadata_path, "r", encoding="utf-8") as f:
|
|
179
|
-
metadata = json.load(f)
|
|
178
|
+
expected = loads_strict_json_object(expected_path.read_text(encoding="utf-8"))
|
|
179
|
+
metadata = loads_strict_json_object(metadata_path.read_text(encoding="utf-8"))
|
|
180
180
|
|
|
181
181
|
fixture_id = metadata.get("id") or fixture_dir.name
|
|
182
182
|
|
|
@@ -25,6 +25,8 @@ import json
|
|
|
25
25
|
import pathlib
|
|
26
26
|
import sys
|
|
27
27
|
|
|
28
|
+
from pair_evidence_contract import reject_json_constant
|
|
29
|
+
|
|
28
30
|
SCHEMA_VERSION = "1"
|
|
29
31
|
AUTHORITY_ORDER_CANONICAL = [
|
|
30
32
|
"spec.md",
|
|
@@ -67,7 +69,11 @@ def _strict_pairs(pairs):
|
|
|
67
69
|
|
|
68
70
|
def load_strict_json(path):
|
|
69
71
|
with open(path, "r", encoding="utf-8") as f:
|
|
70
|
-
return json.load(
|
|
72
|
+
return json.load(
|
|
73
|
+
f,
|
|
74
|
+
object_pairs_hook=_strict_pairs,
|
|
75
|
+
parse_constant=reject_json_constant,
|
|
76
|
+
)
|
|
71
77
|
|
|
72
78
|
|
|
73
79
|
# ---------------------------------------------------------------------------
|
|
@@ -396,7 +402,8 @@ def lint(plan_path, registry_override=None):
|
|
|
396
402
|
return {"ok": False, "errors": [{"code": "plan_invalid_json",
|
|
397
403
|
"message": f"plan parse error: {e}"}]}
|
|
398
404
|
except ValueError as e:
|
|
399
|
-
|
|
405
|
+
code = "plan_duplicate_keys" if "duplicate key" in str(e) else "plan_invalid_json"
|
|
406
|
+
return {"ok": False, "errors": [{"code": code,
|
|
400
407
|
"message": str(e)}]}
|
|
401
408
|
except FileNotFoundError:
|
|
402
409
|
return {"ok": False, "errors": [{"code": "plan_not_found",
|