devlyn-cli 2.3.0 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +1 -1
- package/CLAUDE.md +2 -2
- package/README.md +82 -29
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
- package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
- package/benchmark/auto-resolve/README.md +307 -44
- package/benchmark/auto-resolve/RUBRIC.md +23 -14
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
- package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
- package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
- package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
- package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
- package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
- package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
- package/benchmark/auto-resolve/scripts/judge.sh +153 -26
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
- package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
- package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
- package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
- package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
- package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
- package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
- package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
- package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
- package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
- package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
- package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
- package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
- package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
- package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
- package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
- package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
- package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
- package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
- package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
- package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
- package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
- package/bin/devlyn.js +211 -18
- package/config/skills/_shared/adapters/README.md +3 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
- package/config/skills/_shared/adapters/opus-4-7.md +9 -1
- package/config/skills/_shared/archive_run.py +78 -6
- package/config/skills/_shared/codex-config.md +3 -2
- package/config/skills/_shared/codex-monitored.sh +46 -1
- package/config/skills/_shared/collect-codex-findings.py +20 -5
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/runtime-principles.md +5 -8
- package/config/skills/_shared/spec-verify-check.py +2664 -107
- package/config/skills/_shared/verify-merge-findings.py +1369 -19
- package/config/skills/devlyn:ideate/SKILL.md +7 -4
- package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
- package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
- package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
- package/config/skills/devlyn:resolve/SKILL.md +49 -18
- package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
- package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
- package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
- package/package.json +47 -2
- package/scripts/lint-fixtures.sh +349 -0
- package/scripts/lint-shadow-fixtures.sh +58 -0
- package/scripts/lint-skills.sh +3642 -92
- /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
|
@@ -14,15 +14,18 @@ Default mode (BUILD_GATE invocation, no args):
|
|
|
14
14
|
benchmark truth is `commits`/`authors`) silently overwrote the
|
|
15
15
|
authoritative benchmark contract. For benchmarks, expected.json is
|
|
16
16
|
canonical.
|
|
17
|
-
(2) Otherwise,
|
|
18
|
-
|
|
17
|
+
(2) Otherwise, real-user spec mode first reads sibling `spec.expected.json`
|
|
18
|
+
next to `spec.md`; if it exists, validate it and stage its
|
|
19
|
+
`verification_commands`. A malformed sibling fails closed. If absent,
|
|
20
|
+
fall back to source markdown extract.
|
|
21
|
+
(3) For generated criteria and legacy handwritten specs without a sibling,
|
|
22
|
+
source markdown extract reads `pipeline.state.json:
|
|
23
|
+
source.{spec_path | criteria_path}` and extracts a `## Verification`
|
|
19
24
|
```json``` block. If present, overwrite `.devlyn/spec-verify.json`.
|
|
20
|
-
|
|
21
|
-
killed prior run is stale and must not be trusted in real-user mode.
|
|
22
|
-
(3) If no json block in source AND source.type=="generated": emit
|
|
25
|
+
(4) If no json block in source AND source.type=="generated": emit
|
|
23
26
|
CRITICAL `correctness.spec-verify-malformed` so the fix-loop reruns
|
|
24
27
|
BUILD.
|
|
25
|
-
(
|
|
28
|
+
(5) If no sibling/json block in source AND source.type=="spec": benchmark mode
|
|
26
29
|
with a pre-staged file would have hit branch (1). Without the
|
|
27
30
|
pre-staged file, benchmark falls through to no-op (rare — fixture
|
|
28
31
|
mis-config). Real-user mode silent no-op + drops any stale
|
|
@@ -35,11 +38,28 @@ Default mode (BUILD_GATE invocation, no args):
|
|
|
35
38
|
|
|
36
39
|
Check mode (`--check <markdown_path>`):
|
|
37
40
|
- Used by /devlyn:ideate after writing each item spec to validate that the
|
|
38
|
-
generated `## Verification` ```json``` block parses + matches the schema
|
|
41
|
+
generated `## Verification` ```json``` block parses + matches the schema,
|
|
42
|
+
and that present `complexity` frontmatter has a supported value.
|
|
39
43
|
- Exits 0 if the block is well-formed (or absent — ideate's check applies
|
|
40
44
|
to both new specs that include the block and pre-carrier handwritten
|
|
41
45
|
specs that omit it; absence is not failure here, only malformed JSON or
|
|
42
|
-
shape error is). Exits 2 on malformed json
|
|
46
|
+
shape error is). Exits 2 on malformed json, shape error, or unsupported
|
|
47
|
+
`complexity` value.
|
|
48
|
+
|
|
49
|
+
Expected-contract check mode (`--check-expected <json_path>`):
|
|
50
|
+
- Used by /devlyn:ideate after writing sibling `spec.expected.json`.
|
|
51
|
+
- Exits 0 if the file is valid JSON and matches `_shared/expected.schema.json`
|
|
52
|
+
shape, and if sibling `spec.md` has supported `complexity` frontmatter.
|
|
53
|
+
Exits 2 on unreadable, malformed, unsupported fields, or unsupported sibling
|
|
54
|
+
spec complexity.
|
|
55
|
+
|
|
56
|
+
Output routing:
|
|
57
|
+
- Default BUILD_GATE output writes `.devlyn/spec-verify-findings.jsonl` with
|
|
58
|
+
`phase: build_gate` and `BGATE-*` ids.
|
|
59
|
+
- VERIFY may set `SPEC_VERIFY_PHASE=verify_mechanical`,
|
|
60
|
+
`SPEC_VERIFY_FINDINGS_FILE=verify-mechanical.findings.jsonl`, and
|
|
61
|
+
`SPEC_VERIFY_FINDING_PREFIX=VERIFY-MECH` so `verify-merge-findings.py` consumes
|
|
62
|
+
deterministic blockers directly.
|
|
43
63
|
|
|
44
64
|
Why: iter-0018.5's prompt-only contract enforcement was empirically dead
|
|
45
65
|
(F9 verify=0.4 across all engines in iter-0019). Same lesson as iter-0008
|
|
@@ -50,11 +70,15 @@ markdown directly — closes NORTH-STAR test #14.
|
|
|
50
70
|
|
|
51
71
|
Exit codes:
|
|
52
72
|
- 0: silent no-op (no source carrier, real-user mode) OR --check passed
|
|
53
|
-
OR all commands passed.
|
|
54
|
-
|
|
73
|
+
OR all commands passed. Non-blocking expected-contract findings may be
|
|
74
|
+
written with exit 0.
|
|
75
|
+
- 1: at least one command failed, carrier malformed (generated source
|
|
55
76
|
required carrier, generated source had invalid json/shape, or pre-staged
|
|
56
|
-
file failed shape validation)
|
|
57
|
-
`.devlyn
|
|
77
|
+
file failed shape validation), or a blocking expected-contract finding
|
|
78
|
+
was emitted. Findings are written to the routed `.devlyn/` findings file:
|
|
79
|
+
`.devlyn/spec-verify-findings.jsonl` by default, or the file selected by
|
|
80
|
+
`SPEC_VERIFY_FINDINGS_FILE` (for example, VERIFY uses
|
|
81
|
+
`.devlyn/verify-mechanical.findings.jsonl`).
|
|
58
82
|
- 2: invocation error (unreadable spec-verify.json, missing markdown in
|
|
59
83
|
--check mode, etc.)
|
|
60
84
|
"""
|
|
@@ -62,6 +86,7 @@ Exit codes:
|
|
|
62
86
|
from __future__ import annotations
|
|
63
87
|
|
|
64
88
|
import json
|
|
89
|
+
import hashlib
|
|
65
90
|
import os
|
|
66
91
|
import re
|
|
67
92
|
import subprocess
|
|
@@ -70,6 +95,26 @@ import tempfile
|
|
|
70
95
|
from pathlib import Path
|
|
71
96
|
|
|
72
97
|
|
|
98
|
+
def reject_json_constant(token: str) -> None:
|
|
99
|
+
raise ValueError(f"invalid JSON numeric constant: {token}")
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def loads_strict_json(text: str):
|
|
103
|
+
return json.loads(text, parse_constant=reject_json_constant)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def output_phase() -> str:
|
|
107
|
+
return os.environ.get("SPEC_VERIFY_PHASE", "build_gate")
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def output_findings_name() -> str:
|
|
111
|
+
return os.environ.get("SPEC_VERIFY_FINDINGS_FILE", "spec-verify-findings.jsonl")
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def output_finding_prefix() -> str:
|
|
115
|
+
return os.environ.get("SPEC_VERIFY_FINDING_PREFIX", "BGATE")
|
|
116
|
+
|
|
117
|
+
|
|
73
118
|
VERIFICATION_SECTION_RE = re.compile(
|
|
74
119
|
r'(?ms)^##[ \t]+Verification\b[^\n]*\n(.*?)(?=^##[ \t]+|\Z)'
|
|
75
120
|
)
|
|
@@ -78,6 +123,39 @@ FORBIDDEN_RISK_PROBE_CMD_RE = re.compile(
|
|
|
78
123
|
r'BENCH_FIXTURE_DIR|benchmark/auto-resolve/fixtures|/verifiers/|verifiers/'
|
|
79
124
|
)
|
|
80
125
|
EXTERNAL_URL_RE = re.compile(r"https?://([^/\s\"']+)", re.IGNORECASE)
|
|
126
|
+
INLINE_JSON_OBJECT_RE = re.compile(r'`?\{\s*"[^"\n]+"\s*:', re.IGNORECASE)
|
|
127
|
+
BACKTICKED_TEXT_RE = re.compile(r"`[^`\n]+`")
|
|
128
|
+
OBSERVABLE_COMMAND_MARKERS = ("command", "observable", "expose")
|
|
129
|
+
RESERVED_BACKTICK_TERMS = {"solo-headroom hypothesis", "solo_claude", "miss"}
|
|
130
|
+
SOLO_CEILING_CONTROL_RE = re.compile(
|
|
131
|
+
r'\bS[2-6]\b|S2-S6|solo-saturated|rejected controls?|solo ceiling',
|
|
132
|
+
re.IGNORECASE,
|
|
133
|
+
)
|
|
134
|
+
SOLO_CEILING_DIFFERENCE_RE = re.compile(
|
|
135
|
+
r'\bdiffer(?:s|ent|ence)?\b|\bunlike\b|\bbecause\b|\bpreserve\b|\bheadroom\b',
|
|
136
|
+
re.IGNORECASE,
|
|
137
|
+
)
|
|
138
|
+
COMMAND_PREFIXES = {
|
|
139
|
+
"bash",
|
|
140
|
+
"bun",
|
|
141
|
+
"cargo",
|
|
142
|
+
"git",
|
|
143
|
+
"go",
|
|
144
|
+
"jest",
|
|
145
|
+
"make",
|
|
146
|
+
"node",
|
|
147
|
+
"npm",
|
|
148
|
+
"pnpm",
|
|
149
|
+
"printf",
|
|
150
|
+
"pytest",
|
|
151
|
+
"python",
|
|
152
|
+
"python3",
|
|
153
|
+
"ruff",
|
|
154
|
+
"sh",
|
|
155
|
+
"uv",
|
|
156
|
+
"vitest",
|
|
157
|
+
"yarn",
|
|
158
|
+
}
|
|
81
159
|
LOCAL_URL_HOSTS = {
|
|
82
160
|
'localhost',
|
|
83
161
|
'127.0.0.1',
|
|
@@ -93,6 +171,11 @@ RISK_PROBE_TAGS = {
|
|
|
93
171
|
"positive_remaining",
|
|
94
172
|
"stdout_stderr_contract",
|
|
95
173
|
"error_contract",
|
|
174
|
+
"http_error_contract",
|
|
175
|
+
"auth_signature_contract",
|
|
176
|
+
"idempotency_replay",
|
|
177
|
+
"concurrent_state_consistency",
|
|
178
|
+
"atomic_batch_state",
|
|
96
179
|
"shape_contract",
|
|
97
180
|
}
|
|
98
181
|
RISK_PROBE_REQUIRED_EVIDENCE = {
|
|
@@ -117,7 +200,59 @@ RISK_PROBE_REQUIRED_EVIDENCE = {
|
|
|
117
200
|
"asserts_full_remaining_state",
|
|
118
201
|
"zero_quantity_rows_absent",
|
|
119
202
|
},
|
|
203
|
+
"stdout_stderr_contract": {
|
|
204
|
+
"asserts_named_stream_output",
|
|
205
|
+
},
|
|
206
|
+
"error_contract": {
|
|
207
|
+
"asserts_error_payload_or_stderr",
|
|
208
|
+
"asserts_nonzero_or_exit_2",
|
|
209
|
+
},
|
|
210
|
+
"http_error_contract": {
|
|
211
|
+
"asserts_http_error_status",
|
|
212
|
+
"asserts_error_payload_body",
|
|
213
|
+
},
|
|
214
|
+
"auth_signature_contract": {
|
|
215
|
+
"asserts_signature_over_exact_bytes",
|
|
216
|
+
"asserts_tampered_or_missing_signature_rejected",
|
|
217
|
+
},
|
|
218
|
+
"idempotency_replay": {
|
|
219
|
+
"first_delivery_then_duplicate",
|
|
220
|
+
"duplicate_id_rejected_regardless_of_body",
|
|
221
|
+
},
|
|
222
|
+
"concurrent_state_consistency": {
|
|
223
|
+
"overlapping_mutations_exercised",
|
|
224
|
+
"all_successful_responses_reflected",
|
|
225
|
+
"distinct_identifiers_asserted",
|
|
226
|
+
},
|
|
227
|
+
"atomic_batch_state": {
|
|
228
|
+
"mixed_valid_invalid_batch",
|
|
229
|
+
"asserts_store_unchanged_after_failure",
|
|
230
|
+
"asserts_success_order_and_distinct_ids",
|
|
231
|
+
},
|
|
120
232
|
}
|
|
233
|
+
SHAPE_CONTRACT_REQUIRED_EVIDENCE = {
|
|
234
|
+
"uses_visible_input_key_names",
|
|
235
|
+
"asserts_visible_output_key_names",
|
|
236
|
+
"asserts_no_unexpected_output_keys",
|
|
237
|
+
}
|
|
238
|
+
EXPECTED_TOP_LEVEL_KEYS = {
|
|
239
|
+
"verification_commands",
|
|
240
|
+
"forbidden_patterns",
|
|
241
|
+
"required_files",
|
|
242
|
+
"forbidden_files",
|
|
243
|
+
"tier_a_waivers",
|
|
244
|
+
"spec_output_files",
|
|
245
|
+
"max_deps_added",
|
|
246
|
+
}
|
|
247
|
+
EXPECTED_VERIFICATION_COMMAND_KEYS = {
|
|
248
|
+
"cmd",
|
|
249
|
+
"exit_code",
|
|
250
|
+
"stdout_contains",
|
|
251
|
+
"stdout_not_contains",
|
|
252
|
+
"contract_refs",
|
|
253
|
+
}
|
|
254
|
+
PURE_DESIGN_ESCAPE = "all Requirements are pure-design"
|
|
255
|
+
SPEC_COMPLEXITY_VALUES = {"trivial", "medium", "high", "large"}
|
|
121
256
|
|
|
122
257
|
|
|
123
258
|
def extract_verification_block(text: str) -> str | None:
|
|
@@ -139,6 +274,157 @@ def extract_verification_text(text: str) -> str:
|
|
|
139
274
|
return section.group(1) if section else ""
|
|
140
275
|
|
|
141
276
|
|
|
277
|
+
def extract_frontmatter_field(text: str, field: str) -> str | None:
|
|
278
|
+
if not text.startswith("---"):
|
|
279
|
+
return None
|
|
280
|
+
end = text.find("\n---", 3)
|
|
281
|
+
if end == -1:
|
|
282
|
+
return None
|
|
283
|
+
pattern = re.compile(rf"\s*{re.escape(field)}\s*:\s*[\"']?([^\"'\n#]+)")
|
|
284
|
+
for line in text[3:end].splitlines():
|
|
285
|
+
match = pattern.match(line)
|
|
286
|
+
if match:
|
|
287
|
+
return match.group(1).strip().lower()
|
|
288
|
+
return None
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def validate_present_spec_complexity(text: str) -> str | None:
|
|
292
|
+
complexity = extract_frontmatter_field(text, "complexity")
|
|
293
|
+
if complexity is None or complexity in SPEC_COMPLEXITY_VALUES:
|
|
294
|
+
return None
|
|
295
|
+
values = ", ".join(sorted(SPEC_COMPLEXITY_VALUES))
|
|
296
|
+
return f"frontmatter complexity must be one of: {values}"
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def backticked_observable_miss_commands(text: str) -> list[str]:
|
|
300
|
+
commands: list[str] = []
|
|
301
|
+
for line in text.splitlines():
|
|
302
|
+
lower = line.lower()
|
|
303
|
+
if "miss" not in lower or not any(marker in lower for marker in OBSERVABLE_COMMAND_MARKERS):
|
|
304
|
+
continue
|
|
305
|
+
for match in BACKTICKED_TEXT_RE.finditer(line):
|
|
306
|
+
value = match.group(0).strip("`")
|
|
307
|
+
if is_command_like_backtick(value):
|
|
308
|
+
commands.append(value)
|
|
309
|
+
return commands
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def is_command_like_backtick(value: str) -> bool:
|
|
313
|
+
stripped = value.strip()
|
|
314
|
+
lower = stripped.lower()
|
|
315
|
+
if not stripped or lower in RESERVED_BACKTICK_TERMS:
|
|
316
|
+
return False
|
|
317
|
+
first = lower.split(maxsplit=1)[0]
|
|
318
|
+
return (
|
|
319
|
+
first in COMMAND_PREFIXES
|
|
320
|
+
or any(marker in stripped for marker in ("/", "$", "=", "|", "&&", ";"))
|
|
321
|
+
or stripped.endswith((".js", ".py", ".sh"))
|
|
322
|
+
)
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def has_backticked_observable_miss_command(text: str) -> bool:
|
|
326
|
+
return bool(backticked_observable_miss_commands(text))
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def validate_present_solo_headroom_hypothesis(text: str) -> str | None:
|
|
330
|
+
lower = text.lower()
|
|
331
|
+
if "solo-headroom hypothesis" not in lower and not ("solo_claude" in lower and "miss" in lower):
|
|
332
|
+
return None
|
|
333
|
+
if (
|
|
334
|
+
"solo-headroom hypothesis" in lower
|
|
335
|
+
and "solo_claude" in lower
|
|
336
|
+
and "miss" in lower
|
|
337
|
+
and has_backticked_observable_miss_command(text)
|
|
338
|
+
):
|
|
339
|
+
return None
|
|
340
|
+
return (
|
|
341
|
+
"solo-headroom hypothesis must include `solo-headroom hypothesis`, "
|
|
342
|
+
"`solo_claude`, `miss`, and a backticked command/observable line "
|
|
343
|
+
"that exposes the miss"
|
|
344
|
+
)
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def validate_present_solo_ceiling_avoidance(text: str) -> str | None:
    """Validate the solo ceiling avoidance section; None means valid or absent."""
    folded = text.lower()
    if "solo ceiling avoidance" not in folded:
        return None
    has_control = SOLO_CEILING_CONTROL_RE.search(text) is not None
    has_difference = SOLO_CEILING_DIFFERENCE_RE.search(text) is not None
    if "solo_claude" in folded and has_control and has_difference:
        return None
    return (
        "solo ceiling avoidance must include `solo ceiling avoidance`, "
        "`solo_claude`, and a concrete difference from rejected or "
        "solo-saturated controls such as `S2`-`S6`"
    )
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def validate_solo_headroom_commands_against_expected(
    spec_text: str,
    commands: object,
    expected_label: str,
) -> str | None:
    """Require one hypothesis observable command to appear in the expected list.

    Skipped entirely when the spec does not mention the hypothesis triggers.
    """
    folded = spec_text.lower()
    hypothesis_present = "solo-headroom hypothesis" in folded
    solo_miss_present = "solo_claude" in folded and "miss" in folded
    if not hypothesis_present and not solo_miss_present:
        return None
    expected_cmds: set[str] = set()
    if isinstance(commands, list):
        for entry in commands:
            if isinstance(entry, dict):
                cmd = entry.get("cmd")
                if isinstance(cmd, str):
                    expected_cmds.add(cmd)
    for candidate in backticked_observable_miss_commands(spec_text):
        if candidate in expected_cmds:
            return None
    return (
        "solo-headroom hypothesis observable command must match "
        f"{expected_label} verification_commands[].cmd"
    )
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def command_contains_expected(actual: str, expected: str) -> bool:
    """Whitespace-normalized, token-boundary-aware containment check.

    *expected* must occur inside *actual* without being glued to adjacent
    identifier/path characters; an empty expectation never matches.
    """
    squeezed_actual = " ".join(actual.split())
    squeezed_expected = " ".join(expected.split())
    if not squeezed_expected:
        return False
    boundary = r"[A-Za-z0-9_.:/=-]"
    needle = re.compile(
        rf"(?<!{boundary}){re.escape(squeezed_expected)}(?!{boundary})"
    )
    return needle.search(squeezed_actual) is not None
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def validate_risk_probes_cover_solo_headroom_hypothesis(
    probes: list[dict],
    verification_text: str,
) -> str | None:
    """Require the first risk probe to trace back to a hypothesis command."""
    hypothesis_cmds = backticked_observable_miss_commands(verification_text)
    if not hypothesis_cmds or not probes:
        # No hypothesis commands to cover, or no probes to check.
        return None
    first_probe = probes[0]
    derived_from = first_probe.get("derived_from")
    derived_ok = (
        isinstance(derived_from, str)
        and "solo-headroom hypothesis" in derived_from.lower()
        and any(
            command_contains_expected(derived_from, candidate)
            for candidate in hypothesis_cmds
        )
    )
    if not derived_ok:
        return (
            "risk-probes[0].derived_from must reference the solo-headroom "
            "hypothesis bullet and observable command"
        )
    cmd = first_probe.get("cmd")
    if isinstance(cmd, str) and any(
        command_contains_expected(cmd, candidate) for candidate in hypothesis_cmds
    ):
        return None
    return (
        "risk-probes[0].cmd must contain a "
        "solo-headroom hypothesis observable command"
    )
|
|
426
|
+
|
|
427
|
+
|
|
142
428
|
def external_url_hosts(text: str) -> list[str]:
|
|
143
429
|
hosts: list[str] = []
|
|
144
430
|
for match in EXTERNAL_URL_RE.finditer(text or ''):
|
|
@@ -183,6 +469,120 @@ def validate_shape(data) -> str | None:
|
|
|
183
469
|
return None
|
|
184
470
|
|
|
185
471
|
|
|
472
|
+
def validate_string_list(data: object, key: str) -> str | None:
|
|
473
|
+
value = data.get(key, []) if isinstance(data, dict) else None
|
|
474
|
+
if not isinstance(value, list) or not all(isinstance(item, str) and item for item in value):
|
|
475
|
+
return f"{key} must be a list of non-empty strings"
|
|
476
|
+
return None
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
def validate_expected_shape(data) -> str | None:
    """Return None if shape matches the sibling spec.expected.json schema.

    Keep this dependency-free: it mirrors `_shared/expected.schema.json` enough
    to catch malformed ideate output before /devlyn:resolve consumes it.
    """
    if not isinstance(data, dict):
        return "top-level must be a JSON object"
    # Unknown keys are rejected; sorted() keeps the error message stable.
    unknown = sorted(set(data) - EXPECTED_TOP_LEVEL_KEYS)
    if unknown:
        return f"unknown top-level key(s): {', '.join(unknown)}"
    if "verification_commands" in data:
        commands = data["verification_commands"]
        if not isinstance(commands, list):
            return "verification_commands must be a list"
        if commands:
            # Reuse the inline-carrier validator for each command's core shape.
            err = validate_shape({"verification_commands": commands})
            if err:
                return err
            for i, command in enumerate(commands):
                unknown_command_keys = sorted(set(command) - EXPECTED_VERIFICATION_COMMAND_KEYS)
                if unknown_command_keys:
                    return (
                        f"verification_commands[{i}] unknown key(s): "
                        f"{', '.join(unknown_command_keys)}"
                    )
                # contract_refs is optional; when present it must be non-empty strings.
                contract_refs = command.get("contract_refs", [])
                if not isinstance(contract_refs, list) or not all(
                    isinstance(item, str) and item for item in contract_refs
                ):
                    return f"verification_commands[{i}].contract_refs must be a list of non-empty strings"
    for key in ("required_files", "forbidden_files", "tier_a_waivers", "spec_output_files"):
        err = validate_string_list(data, key)
        if err:
            return err
    # bool is an int subclass in Python, so exclude it before the int check.
    max_deps = data.get("max_deps_added", 0)
    if isinstance(max_deps, bool) or not isinstance(max_deps, int) or max_deps < 0:
        return "max_deps_added must be a non-negative integer"
    patterns = data.get("forbidden_patterns", [])
    if not isinstance(patterns, list):
        return "forbidden_patterns must be a list"
    for i, pattern in enumerate(patterns):
        if not isinstance(pattern, dict):
            return f"forbidden_patterns[{i}] must be an object"
        unknown_pattern_keys = sorted(set(pattern) - {"pattern", "description", "files", "severity"})
        if unknown_pattern_keys:
            return (
                f"forbidden_patterns[{i}] unknown key(s): "
                f"{', '.join(unknown_pattern_keys)}"
            )
        for key in ("pattern", "description", "severity"):
            value = pattern.get(key)
            if not isinstance(value, str) or not value:
                return f"forbidden_patterns[{i}].{key} must be a non-empty string"
        # Safe direct index: the loop above guarantees "severity" exists.
        if pattern["severity"] not in {"disqualifier", "warning"}:
            return f"forbidden_patterns[{i}].severity must be disqualifier or warning"
        # files is optional; when present it scopes the pattern to those paths.
        files = pattern.get("files", [])
        if not isinstance(files, list) or not all(isinstance(item, str) and item for item in files):
            return f"forbidden_patterns[{i}].files must be a list of non-empty strings"
    return None
|
|
539
|
+
|
|
540
|
+
|
|
541
|
+
def validate_expected_against_sibling_spec(expected_path: Path, data: object) -> str | None:
    """Cross-check a spec.expected.json payload against its sibling spec.md."""
    if not isinstance(data, dict):
        return None
    spec_path = expected_path.with_name("spec.md")
    if not spec_path.is_file():
        return None
    try:
        spec_text = spec_path.read_text(encoding="utf-8")
    except OSError:
        # An unreadable sibling is treated as absent rather than a hard failure.
        return None
    first_error = validate_present_solo_headroom_hypothesis(spec_text)
    if first_error:
        return first_error
    first_error = validate_present_solo_ceiling_avoidance(spec_text)
    if first_error:
        return first_error
    commands = data.get("verification_commands", [])
    first_error = validate_solo_headroom_commands_against_expected(
        spec_text,
        commands,
        "spec.expected.json",
    )
    if first_error:
        return first_error
    if commands or PURE_DESIGN_ESCAPE in spec_text:
        return None
    return (
        "verification_commands must contain at least one entry unless sibling "
        "spec.md declares all Requirements are pure-design"
    )
|
|
573
|
+
|
|
574
|
+
|
|
575
|
+
def validate_sibling_spec_complexity(expected_path: Path) -> str | None:
    """Run the spec-complexity check on the sibling spec.md, if readable."""
    spec_path = expected_path.with_name("spec.md")
    if not spec_path.is_file():
        return None
    try:
        contents = spec_path.read_text(encoding="utf-8")
    except OSError:
        return None
    return validate_present_spec_complexity(contents)
|
|
584
|
+
|
|
585
|
+
|
|
186
586
|
def validate_risk_probe(probe: object, index: int, verification_text: str) -> str | None:
|
|
187
587
|
if not isinstance(probe, dict):
|
|
188
588
|
return f"risk-probes[{index}] must be a JSON object"
|
|
@@ -221,13 +621,22 @@ def validate_risk_probe(probe: object, index: int, verification_text: str) -> st
|
|
|
221
621
|
if unknown_tags:
|
|
222
622
|
return f"risk-probes[{index}].tags contains unknown tag(s): {', '.join(unknown_tags)}"
|
|
223
623
|
if "error_contract" in tags and not re.search(
|
|
224
|
-
r'
|
|
624
|
+
r'stderr|exit[ `]*2|(?:prints?|writes?).{0,40}json[ -]?error|'
|
|
625
|
+
r'json[ -]?error.{0,40}(?:stderr|exit)',
|
|
225
626
|
derived_from,
|
|
226
627
|
re.IGNORECASE,
|
|
227
628
|
):
|
|
228
629
|
return (
|
|
229
630
|
f"risk-probes[{index}].derived_from for error_contract must name "
|
|
230
|
-
"
|
|
631
|
+
"a stderr, JSON-error stream, or exit-2 verification bullet"
|
|
632
|
+
)
|
|
633
|
+
if "http_error_contract" in tags and not (
|
|
634
|
+
re.search(r'\b(?:400|401|403|404|409|422|5[0-9][0-9])\b|http|status', derived_from, re.IGNORECASE)
|
|
635
|
+
and re.search(r'error|invalid', derived_from, re.IGNORECASE)
|
|
636
|
+
):
|
|
637
|
+
return (
|
|
638
|
+
f"risk-probes[{index}].derived_from for http_error_contract must name "
|
|
639
|
+
"an HTTP/status error response verification bullet"
|
|
231
640
|
)
|
|
232
641
|
evidence = probe.get("tag_evidence")
|
|
233
642
|
if not isinstance(evidence, dict):
|
|
@@ -245,22 +654,93 @@ def validate_risk_probe(probe: object, index: int, verification_text: str) -> st
|
|
|
245
654
|
f"risk-probes[{index}].tag_evidence.{tag} missing required "
|
|
246
655
|
f"item(s): {', '.join(missing_evidence)}"
|
|
247
656
|
)
|
|
657
|
+
if "shape_contract" in tags and shape_contract_requires_evidence(derived_from):
|
|
658
|
+
actual = evidence.get("shape_contract")
|
|
659
|
+
if not isinstance(actual, list) or not all(isinstance(item, str) for item in actual):
|
|
660
|
+
return f"risk-probes[{index}].tag_evidence.shape_contract must be a list of strings"
|
|
661
|
+
required_shape = set(SHAPE_CONTRACT_REQUIRED_EVIDENCE)
|
|
662
|
+
if re.search(r'error object|error body', derived_from, re.IGNORECASE) or (
|
|
663
|
+
INLINE_JSON_OBJECT_RE.search(derived_from)
|
|
664
|
+
and re.search(r'error|invalid', derived_from, re.IGNORECASE)
|
|
665
|
+
):
|
|
666
|
+
required_shape.add("asserts_exact_error_object")
|
|
667
|
+
missing_shape = sorted(required_shape - set(actual))
|
|
668
|
+
if missing_shape:
|
|
669
|
+
return (
|
|
670
|
+
f"risk-probes[{index}].tag_evidence.shape_contract missing required "
|
|
671
|
+
f"item(s): {', '.join(missing_shape)}"
|
|
672
|
+
)
|
|
248
673
|
return None
|
|
249
674
|
|
|
250
675
|
|
|
676
|
+
def shape_contract_requires_evidence(text: str) -> bool:
    """Does the bullet describe concrete output shape that demands evidence?"""
    shape_words = re.search(
        r'\b(?:keys?|fields?|rows?|shape|object|json[ -]?object|'
        r'error body|stdout|stderr|response body)\b|'
        r'\b(?:applied|rejected|accounts|scheduled|accepted|remaining)\b',
        text,
        re.IGNORECASE,
    )
    if shape_words is not None:
        return True
    # An inline JSON object literal also counts as a shape claim.
    return INLINE_JSON_OBJECT_RE.search(text) is not None
|
|
684
|
+
|
|
685
|
+
|
|
251
686
|
def required_risk_probe_tags(verification_text: str) -> set[str]:
    """Derive the probe tags a verification text implicitly demands.

    Each regex below maps characteristic spec wording to one required tag;
    matching is case-insensitive via the pre-lowered `text` (except the
    INLINE_JSON_OBJECT_RE check, which runs on the original text).
    """
    text = verification_text.lower()
    required: set[str] = set()
    if re.search(r'priority|higher-priority|ordered by|ordering|appears first|input order', text):
        required.add("ordering_inversion")
    if re.search(r'blocked|overlap|forbidden[ -]+window', text):
        required.add("boundary_overlap")
    if re.search(r'rolls? back|rollback|all-or-nothing|tentative', text):
        required.add("rollback_state")
    # Broader than rollback_state: also stock/balance language about what is
    # left for later operations.
    if re.search(
        r'rolls? back|rollback|all-or-nothing|tentative|reduce[s]? stock|'
        r'available to later|later orders|remaining|'
        r'(?:stock|inventory|balance|availability).{0,80}(?:later|remaining|after failures)',
        text,
    ):
        required.add("prior_consumption")
    if "remaining" in text:
        required.add("positive_remaining")
    # CLI error channel: stderr, exit 2, or a JSON error near either.
    if re.search(
        r'stderr|exit[ `]*2|(?:prints?|writes?).{0,40}json[ -]?error|'
        r'json[ -]?error.{0,40}(?:stderr|exit)',
        text,
    ):
        required.add("error_contract")
    # HTTP error contract needs both a status signal and an error-body signal.
    if re.search(
        r'\b(?:400|401|403|404|409|422|5[0-9][0-9])\b|http status|status code',
        text,
    ) and re.search(r'json error|error object|error body|invalid_query|error.*field', text):
        required.add("http_error_contract")
    if re.search(
        r'\b(?:keys?|fields?|rows?|shape|json[ -]?object|'
        r'error object|error body|response body)\b|'
        r'\b(?:applied|rejected|accounts|scheduled|accepted|remaining)\b',
        text,
    ) or INLINE_JSON_OBJECT_RE.search(verification_text):
        required.add("shape_contract")
    if re.search(r'stderr|stdout|exit `?2`?', text):
        required.add("stdout_stderr_contract")
    if re.search(r'signature|signing|signed|x-signature|hmac|raw[ -]?body|timingsafeequal', text):
        required.add("auth_signature_contract")
    if re.search(
        r'replay|same.{0,40}`?id`?|already-seen|idempotent|re-delivery|'
        r'duplicate[ -]+(?:delivery|event|id)',
        text,
    ):
        required.add("idempotency_replay")
    if re.search(
        r'concurrent|close together|simultaneous|parallel|race|lost update|'
        r'many at once|several .{0,40}requests',
        text,
    ):
        required.add("concurrent_state_consistency")
    if re.search(
        r'one valid \+ one invalid|valid \+ one invalid|all-or-nothing|'
        r'same list as before|0 inserts|no partial updates',
        text,
    ):
        required.add("atomic_batch_state")
    return required
|
|
265
745
|
|
|
266
746
|
|
|
@@ -287,8 +767,8 @@ def load_risk_probes(
|
|
|
287
767
|
if not line.strip():
|
|
288
768
|
continue
|
|
289
769
|
try:
|
|
290
|
-
probe =
|
|
291
|
-
except
|
|
770
|
+
probe = loads_strict_json(line)
|
|
771
|
+
except ValueError as e:
|
|
292
772
|
return ([], f"risk-probes[{index}] invalid JSON: {e}")
|
|
293
773
|
err = validate_risk_probe(probe, index, verification_text)
|
|
294
774
|
if err:
|
|
@@ -306,6 +786,12 @@ def load_risk_probes(
|
|
|
306
786
|
missing_tags = sorted(required_risk_probe_tags(verification_text) - present_tags)
|
|
307
787
|
if missing_tags:
|
|
308
788
|
return ([], f"risk-probes.jsonl missing required probe tag(s): {', '.join(missing_tags)}")
|
|
789
|
+
solo_headroom_probe_err = validate_risk_probes_cover_solo_headroom_hypothesis(
|
|
790
|
+
probes,
|
|
791
|
+
verification_text,
|
|
792
|
+
)
|
|
793
|
+
if solo_headroom_probe_err:
|
|
794
|
+
return ([], solo_headroom_probe_err)
|
|
309
795
|
return (probes, None)
|
|
310
796
|
|
|
311
797
|
|
|
@@ -318,8 +804,8 @@ def read_source(work: Path, devlyn_dir: Path) -> tuple[str | None, Path | None]:
|
|
|
318
804
|
if not state_path.is_file():
|
|
319
805
|
return (None, None)
|
|
320
806
|
try:
|
|
321
|
-
state =
|
|
322
|
-
except (
|
|
807
|
+
state = loads_strict_json(state_path.read_text())
|
|
808
|
+
except (ValueError, OSError):
|
|
323
809
|
return (None, None)
|
|
324
810
|
src = state.get("source") or {}
|
|
325
811
|
src_type = src.get("type")
|
|
@@ -337,6 +823,75 @@ def read_source(work: Path, devlyn_dir: Path) -> tuple[str | None, Path | None]:
|
|
|
337
823
|
return (src_type, md if md.is_file() else None)
|
|
338
824
|
|
|
339
825
|
|
|
826
|
+
def read_state(devlyn_dir: Path) -> dict:
    """Load .devlyn/pipeline.state.json, returning {} on any failure."""
    state_path = devlyn_dir / "pipeline.state.json"
    if not state_path.is_file():
        return {}
    try:
        parsed = loads_strict_json(state_path.read_text())
    except (ValueError, OSError):
        return {}
    if isinstance(parsed, dict):
        return parsed
    return {}
|
|
835
|
+
|
|
836
|
+
|
|
837
|
+
def state_requires_risk_probes(state: dict) -> bool:
    """True only when risk_profile.risk_probes_enabled is literally True."""
    profile = state.get("risk_profile")
    if not isinstance(profile, dict):
        return False
    return profile.get("risk_probes_enabled") is True
|
|
840
|
+
|
|
841
|
+
|
|
842
|
+
def risk_probes_state_error(state: dict) -> str | None:
|
|
843
|
+
if "risk_profile" not in state:
|
|
844
|
+
return None
|
|
845
|
+
risk_profile = state.get("risk_profile")
|
|
846
|
+
if not isinstance(risk_profile, dict):
|
|
847
|
+
return "pipeline.state.json risk_profile must be an object"
|
|
848
|
+
if "risk_probes_enabled" not in risk_profile:
|
|
849
|
+
return None
|
|
850
|
+
if not isinstance(risk_profile.get("risk_probes_enabled"), bool):
|
|
851
|
+
return "pipeline.state.json risk_profile.risk_probes_enabled must be boolean"
|
|
852
|
+
return None
|
|
853
|
+
|
|
854
|
+
|
|
855
|
+
def source_integrity_error(src_type: str | None, state: dict, source_md: Path | None) -> str | None:
|
|
856
|
+
if source_md is None:
|
|
857
|
+
return None
|
|
858
|
+
src = state.get("source") if isinstance(state.get("source"), dict) else {}
|
|
859
|
+
if src_type == "generated":
|
|
860
|
+
field = "criteria_sha256"
|
|
861
|
+
required = True
|
|
862
|
+
elif src_type == "spec":
|
|
863
|
+
field = "spec_sha256"
|
|
864
|
+
required = False
|
|
865
|
+
else:
|
|
866
|
+
return None
|
|
867
|
+
expected = src.get(field)
|
|
868
|
+
qualified = f"source.{field}"
|
|
869
|
+
if not isinstance(expected, str) or not expected:
|
|
870
|
+
if required:
|
|
871
|
+
return f"{qualified} is required for generated criteria source integrity."
|
|
872
|
+
return None
|
|
873
|
+
try:
|
|
874
|
+
actual = hashlib.sha256(source_md.read_bytes()).hexdigest()
|
|
875
|
+
except OSError as exc:
|
|
876
|
+
return f"could not read {source_md} for source integrity check: {exc}"
|
|
877
|
+
if expected != actual:
|
|
878
|
+
return f"{qualified} mismatch for {source_md}: expected {expected}, actual {actual}."
|
|
879
|
+
return None
|
|
880
|
+
|
|
881
|
+
|
|
882
|
+
def load_expected_contract(expected_path: Path) -> tuple[dict | None, str | None]:
    """Read and schema-check spec.expected.json; returns (data, error)."""
    try:
        parsed = loads_strict_json(expected_path.read_text())
    except ValueError as e:
        return (None, f"{expected_path} has invalid JSON: {e}")
    except OSError as e:
        return (None, f"{expected_path} is unreadable: {e}")
    shape_err = validate_expected_shape(parsed)
    if shape_err:
        return (None, f"{expected_path}: {shape_err}")
    return (parsed, None)
|
|
893
|
+
|
|
894
|
+
|
|
340
895
|
def stage_from_source(md: Path, devlyn_dir: Path) -> tuple[bool, str | None]:
|
|
341
896
|
"""Materialize .devlyn/spec-verify.json from the json block in `md`.
|
|
342
897
|
|
|
@@ -349,8 +904,8 @@ def stage_from_source(md: Path, devlyn_dir: Path) -> tuple[bool, str | None]:
|
|
|
349
904
|
if block is None:
|
|
350
905
|
return (False, None)
|
|
351
906
|
try:
|
|
352
|
-
data =
|
|
353
|
-
except
|
|
907
|
+
data = loads_strict_json(block)
|
|
908
|
+
except ValueError as e:
|
|
354
909
|
return (False, f"`## Verification` ```json``` block in {md} has invalid JSON: {e}")
|
|
355
910
|
err = validate_shape(data)
|
|
356
911
|
if err:
|
|
@@ -361,13 +916,44 @@ def stage_from_source(md: Path, devlyn_dir: Path) -> tuple[bool, str | None]:
|
|
|
361
916
|
return (True, None)
|
|
362
917
|
|
|
363
918
|
|
|
919
|
+
def stage_from_expected(
    md: Path,
    devlyn_dir: Path,
) -> tuple[bool, bool, str | None, Path, dict | None]:
    """Materialize .devlyn/spec-verify.json from sibling spec.expected.json.

    Returns (found, staged, error, expected_path, expected_data).
    - found=False: no sibling file; caller may fall back to legacy inline carrier.
    - found=True, error: sibling exists but is malformed; caller must fail closed.
    - found=True, staged=False: valid pure-design contract with no commands.
    - found=True, staged=True: wrote verification_commands into spec-verify.json.
    """
    expected_path = md.with_name("spec.expected.json")
    if not expected_path.is_file():
        return (False, False, None, expected_path, None)
    data, err = load_expected_contract(expected_path)
    if err:
        return (True, False, err, expected_path, None)
    assert data is not None
    commands = data.get("verification_commands")
    staging_target = devlyn_dir / "spec-verify.json"
    if not commands:
        # Pure-design contract: drop any stale staged file so the gate stays off.
        if staging_target.exists():
            staging_target.unlink()
        return (True, False, None, expected_path, data)
    devlyn_dir.mkdir(parents=True, exist_ok=True)
    payload = json.dumps({"verification_commands": commands}, indent=2) + "\n"
    staging_target.write_text(payload)
    return (True, True, None, expected_path, data)
|
|
948
|
+
|
|
949
|
+
|
|
364
950
|
def write_malformed_finding(devlyn_dir: Path, error: str, source_path: Path | None) -> None:
|
|
365
951
|
"""Emit a single CRITICAL finding for a malformed verification carrier."""
|
|
366
952
|
devlyn_dir.mkdir(parents=True, exist_ok=True)
|
|
367
|
-
findings_path = devlyn_dir /
|
|
953
|
+
findings_path = devlyn_dir / output_findings_name()
|
|
368
954
|
file_ref = str(source_path) if source_path else ".devlyn/pipeline.state.json"
|
|
369
955
|
finding = {
|
|
370
|
-
"id": "
|
|
956
|
+
"id": f"{output_finding_prefix()}-0001",
|
|
371
957
|
"rule_id": "correctness.spec-verify-malformed",
|
|
372
958
|
"level": "error",
|
|
373
959
|
"severity": "CRITICAL",
|
|
@@ -375,11 +961,11 @@ def write_malformed_finding(devlyn_dir: Path, error: str, source_path: Path | No
|
|
|
375
961
|
"message": f"Verification contract carrier is malformed: {error}",
|
|
376
962
|
"file": file_ref,
|
|
377
963
|
"line": 1,
|
|
378
|
-
"phase":
|
|
964
|
+
"phase": output_phase(),
|
|
379
965
|
"criterion_ref": "spec-verify://carrier",
|
|
380
966
|
"fix_hint": (
|
|
381
|
-
"Fix the
|
|
382
|
-
"a non-empty `verification_commands` array of "
|
|
967
|
+
"Fix the sibling `spec.expected.json` file or the `## Verification` "
|
|
968
|
+
"```json``` block: a JSON object with a non-empty `verification_commands` array of "
|
|
383
969
|
"{cmd, exit_code?, stdout_contains?, stdout_not_contains?} "
|
|
384
970
|
"entries. See references/build-gate.md § 'Spec literal check'."
|
|
385
971
|
),
|
|
@@ -390,6 +976,200 @@ def write_malformed_finding(devlyn_dir: Path, error: str, source_path: Path | No
|
|
|
390
976
|
fh.write(json.dumps(finding) + "\n")
|
|
391
977
|
|
|
392
978
|
|
|
979
|
+
def slice_diff_to_files(diff_text: str, files: list[str]) -> str:
    """Keep only the unified-diff sections whose `diff --git` header mentions *files*.

    An empty *files* list means no filtering: the diff is returned unchanged.
    """
    if not files:
        return diff_text
    kept: list[str] = []
    in_wanted_section = False
    for line in diff_text.splitlines(keepends=True):
        if line.startswith("diff --git "):
            # Substring match: a path entry selects its whole file section.
            in_wanted_section = any(path in line for path in files)
        if in_wanted_section:
            kept.append(line)
    return "".join(kept)
|
|
990
|
+
|
|
991
|
+
|
|
992
|
+
def diff_text_for_expected(work: Path, devlyn_dir: Path, state: dict) -> tuple[str, str | None]:
    """Return (diff_text, error): prefer an external patch file, else run git diff."""
    external_diff = devlyn_dir / "external-diff.patch"
    if external_diff.is_file():
        try:
            return (external_diff.read_text(), None)
        except OSError as e:
            return ("", f"cannot read {external_diff}: {e}")
    base_sha = ((state.get("base_ref") or {}).get("sha") or "").strip()
    git_cmd = ["git", "diff"]
    if base_sha:
        git_cmd.append(base_sha)
    proc = subprocess.run(git_cmd, cwd=str(work), capture_output=True, text=True)
    if proc.returncode != 0:
        message = (proc.stderr or proc.stdout or "git diff failed").strip()
        return ("", message)
    return (proc.stdout or "", None)
|
|
1007
|
+
|
|
1008
|
+
|
|
1009
|
+
def count_deps_added(work: Path, state: dict) -> int:
    """Count dependency lines added to package.json since the base ref.

    Heuristic scan of `git diff -- package.json`: while inside a
    "dependencies"/"devDependencies" section, each added `"name": "version"`
    pair counts as one new dependency. Returns 0 when git itself fails.
    """
    base_sha = ((state.get("base_ref") or {}).get("sha") or "").strip()
    cmd = ["git", "diff"]
    if base_sha:
        cmd.append(base_sha)
    cmd.extend(["--", "package.json"])
    proc = subprocess.run(cmd, cwd=str(work), capture_output=True, text=True)
    if proc.returncode != 0:
        return 0
    in_deps = False
    count = 0
    for line in (proc.stdout or "").splitlines():
        # Skip diff metadata; only hunk content lines matter.
        if line.startswith(("diff ", "index ", "---", "+++", "@@")):
            continue
        marker = line[:1]
        content = line[1:] if marker in {"+", "-", " "} else line
        if '"dependencies"' in content or '"devDependencies"' in content:
            in_deps = True
        elif content.strip().startswith("}"):
            # A closing brace ends the current dependencies section.
            in_deps = False
        elif in_deps and marker == "+":
            # An added "name": "version" pair inside the section.
            if re.search(r'"[^"]+"\s*:\s*"[^"]+"', content):
                count += 1
    return count
|
|
1033
|
+
|
|
1034
|
+
|
|
1035
|
+
def changed_files(work: Path, state: dict, devlyn_dir: Path) -> list[str]:
    """List changed file paths, from external-diff.patch if present, else git."""
    external_diff = devlyn_dir / "external-diff.patch"
    if external_diff.is_file():
        try:
            external_text = external_diff.read_text()
        except OSError:
            return []
        names: list[str] = []
        for line in external_text.splitlines():
            if not line.startswith("diff --git "):
                continue
            parts = line.split()
            # `diff --git a/path b/path` -> take the b/ side, strip the prefix.
            if len(parts) >= 4:
                names.append(parts[3].removeprefix("b/"))
        return names
    base_sha = ((state.get("base_ref") or {}).get("sha") or "").strip()
    cmd = ["git", "diff", "--name-only"]
    if base_sha:
        cmd.append(base_sha)
    proc = subprocess.run(cmd, cwd=str(work), capture_output=True, text=True)
    if proc.returncode != 0:
        return []
    return [entry.strip() for entry in (proc.stdout or "").splitlines() if entry.strip()]
|
|
1057
|
+
|
|
1058
|
+
|
|
1059
|
+
def expected_contract_findings(
    expected_data: dict | None,
    expected_path: Path | None,
    work: Path,
    devlyn_dir: Path,
    state: dict,
    finding_start: int,
) -> tuple[list[dict], int]:
    """Evaluate the spec.expected.json contract against the working diff.

    Emits findings for: forbidden diff patterns, missing required files,
    forbidden files touched by the diff, and dependency-budget overruns.
    Returns (findings, next_sequence_number) so the caller can keep
    allocating finding ids from where this left off.
    """
    if not expected_data:
        return ([], finding_start)
    findings: list[dict] = []
    seq = finding_start
    diff_text, diff_error = diff_text_for_expected(work, devlyn_dir, state)
    # Fail closed: pattern/file checks are meaningless without a diff.
    if diff_error and (
        expected_data.get("forbidden_patterns") or expected_data.get("forbidden_files")
    ):
        findings.append({
            "id": f"{output_finding_prefix()}-{seq:04d}",
            "rule_id": "correctness.expected-contract-unverifiable",
            "level": "error",
            "severity": "CRITICAL",
            "confidence": 1.0,
            "message": f"Cannot compute diff for forbidden_patterns: {diff_error}",
            "file": str(expected_path or "spec.expected.json"),
            "line": 1,
            "phase": output_phase(),
            "criterion_ref": "spec.expected.json/forbidden_patterns",
            "fix_hint": "Ensure pipeline.state.json has base_ref.sha or provide .devlyn/external-diff.patch.",
            "blocking": True,
            "status": "open",
        })
        seq += 1
    for i, pattern in enumerate(expected_data.get("forbidden_patterns", []) or []):
        # Empty files list means the pattern applies to the whole diff.
        # NOTE(review): pattern["pattern"] is user-supplied regex; re.error here
        # would propagate uncaught — confirm upstream validation is sufficient.
        scope = slice_diff_to_files(diff_text, pattern.get("files") or [])
        if not re.search(pattern["pattern"], scope):
            continue
        is_disqualifier = pattern.get("severity") == "disqualifier"
        findings.append({
            "id": f"{output_finding_prefix()}-{seq:04d}",
            "rule_id": "correctness.forbidden-pattern",
            "level": "error" if is_disqualifier else "warning",
            "severity": "CRITICAL" if is_disqualifier else "MEDIUM",
            "confidence": 1.0,
            "message": pattern.get("description") or f"Forbidden pattern matched: {pattern['pattern']}",
            "file": str(expected_path or "spec.expected.json"),
            "line": 1,
            "phase": output_phase(),
            "criterion_ref": f"spec.expected.json/forbidden_patterns/{i}",
            "fix_hint": "Remove the forbidden diff pattern or change the spec.expected.json contract explicitly.",
            "blocking": is_disqualifier,
            "status": "open",
        })
        seq += 1
    changed = set(changed_files(work, state, devlyn_dir))
    # required_files checks on-disk existence, not diff membership.
    for i, required in enumerate(expected_data.get("required_files", []) or []):
        if (work / required).exists():
            continue
        findings.append({
            "id": f"{output_finding_prefix()}-{seq:04d}",
            "rule_id": "correctness.required-file-missing",
            "level": "error",
            "severity": "CRITICAL",
            "confidence": 1.0,
            "message": f"Required file is missing: {required}",
            "file": str(expected_path or "spec.expected.json"),
            "line": 1,
            "phase": output_phase(),
            "criterion_ref": f"spec.expected.json/required_files/{i}",
            "fix_hint": "Create the required file or remove it from the expected contract.",
            "blocking": True,
            "status": "open",
        })
        seq += 1
    # forbidden_files checks diff membership (exact path match).
    for i, forbidden in enumerate(expected_data.get("forbidden_files", []) or []):
        if forbidden not in changed:
            continue
        findings.append({
            "id": f"{output_finding_prefix()}-{seq:04d}",
            "rule_id": "scope.forbidden-file-touched",
            "level": "error",
            "severity": "CRITICAL",
            "confidence": 1.0,
            "message": f"Forbidden file appears in the diff: {forbidden}",
            "file": str(expected_path or "spec.expected.json"),
            "line": 1,
            "phase": output_phase(),
            "criterion_ref": f"spec.expected.json/forbidden_files/{i}",
            "fix_hint": "Remove that file from the diff or update the expected contract.",
            "blocking": True,
            "status": "open",
        })
        seq += 1
    max_deps = expected_data.get("max_deps_added", 0)
    deps_added = count_deps_added(work, state)
    if deps_added > max_deps:
        findings.append({
            "id": f"{output_finding_prefix()}-{seq:04d}",
            "rule_id": "scope.max-deps-added-exceeded",
            "level": "error",
            "severity": "CRITICAL",
            "confidence": 1.0,
            "message": f"Added {deps_added} package dependencies; max_deps_added is {max_deps}.",
            "file": str(expected_path or "spec.expected.json"),
            "line": 1,
            "phase": output_phase(),
            "criterion_ref": "spec.expected.json/max_deps_added",
            "fix_hint": "Remove the new dependency or explicitly license it in spec.expected.json.",
            "blocking": True,
            "status": "open",
        })
        seq += 1
    return (findings, seq)
|
|
1171
|
+
|
|
1172
|
+
|
|
393
1173
|
def run_check_mode(md_path: Path) -> int:
|
|
394
1174
|
"""`--check <markdown>` — validate the verification carrier without
|
|
395
1175
|
running any commands. Used by /devlyn:ideate after item-spec write.
|
|
@@ -400,15 +1180,28 @@ def run_check_mode(md_path: Path) -> int:
|
|
|
400
1180
|
if not md_path.is_file():
|
|
401
1181
|
print(f"[spec-verify --check] error: {md_path} not found", file=sys.stderr)
|
|
402
1182
|
return 2
|
|
403
|
-
|
|
1183
|
+
text = md_path.read_text()
|
|
1184
|
+
frontmatter_err = validate_present_spec_complexity(text)
|
|
1185
|
+
if frontmatter_err:
|
|
1186
|
+
print(f"[spec-verify --check] {md_path}: {frontmatter_err}", file=sys.stderr)
|
|
1187
|
+
return 2
|
|
1188
|
+
solo_headroom_err = validate_present_solo_headroom_hypothesis(text)
|
|
1189
|
+
if solo_headroom_err:
|
|
1190
|
+
print(f"[spec-verify --check] {md_path}: {solo_headroom_err}", file=sys.stderr)
|
|
1191
|
+
return 2
|
|
1192
|
+
solo_ceiling_err = validate_present_solo_ceiling_avoidance(text)
|
|
1193
|
+
if solo_ceiling_err:
|
|
1194
|
+
print(f"[spec-verify --check] {md_path}: {solo_ceiling_err}", file=sys.stderr)
|
|
1195
|
+
return 2
|
|
1196
|
+
block = extract_verification_block(text)
|
|
404
1197
|
if block is None:
|
|
405
1198
|
# Section absent or no json block — opt-in nature preserved for
|
|
406
1199
|
# ideate (a spec without machine verification is still valid; it
|
|
407
1200
|
# just won't activate the BUILD_GATE gate).
|
|
408
1201
|
return 0
|
|
409
1202
|
try:
|
|
410
|
-
data =
|
|
411
|
-
except
|
|
1203
|
+
data = loads_strict_json(block)
|
|
1204
|
+
except ValueError as e:
|
|
412
1205
|
print(
|
|
413
1206
|
f"[spec-verify --check] {md_path}: invalid JSON in `## Verification` "
|
|
414
1207
|
f"```json``` block: {e}",
|
|
@@ -419,6 +1212,33 @@ def run_check_mode(md_path: Path) -> int:
|
|
|
419
1212
|
if err:
|
|
420
1213
|
print(f"[spec-verify --check] {md_path}: shape error: {err}", file=sys.stderr)
|
|
421
1214
|
return 2
|
|
1215
|
+
solo_headroom_command_err = validate_solo_headroom_commands_against_expected(
|
|
1216
|
+
text,
|
|
1217
|
+
data.get("verification_commands", []),
|
|
1218
|
+
"`## Verification` JSON carrier",
|
|
1219
|
+
)
|
|
1220
|
+
if solo_headroom_command_err:
|
|
1221
|
+
print(f"[spec-verify --check] {md_path}: {solo_headroom_command_err}", file=sys.stderr)
|
|
1222
|
+
return 2
|
|
1223
|
+
return 0
|
|
1224
|
+
|
|
1225
|
+
|
|
1226
|
+
def run_check_expected_mode(expected_path: Path) -> int:
|
|
1227
|
+
if not expected_path.is_file():
|
|
1228
|
+
print(f"[spec-verify --check-expected] error: {expected_path} not found", file=sys.stderr)
|
|
1229
|
+
return 2
|
|
1230
|
+
_data, err = load_expected_contract(expected_path)
|
|
1231
|
+
if err:
|
|
1232
|
+
print(f"[spec-verify --check-expected] {expected_path}: shape error: {err}", file=sys.stderr)
|
|
1233
|
+
return 2
|
|
1234
|
+
complexity_err = validate_sibling_spec_complexity(expected_path)
|
|
1235
|
+
if complexity_err:
|
|
1236
|
+
print(f"[spec-verify --check-expected] {expected_path}: shape error: {complexity_err}", file=sys.stderr)
|
|
1237
|
+
return 2
|
|
1238
|
+
sibling_err = validate_expected_against_sibling_spec(expected_path, _data)
|
|
1239
|
+
if sibling_err:
|
|
1240
|
+
print(f"[spec-verify --check-expected] {expected_path}: shape error: {sibling_err}", file=sys.stderr)
|
|
1241
|
+
return 2
|
|
422
1242
|
return 0
|
|
423
1243
|
|
|
424
1244
|
|
|
@@ -461,70 +1281,1756 @@ def run_self_test() -> int:
|
|
|
461
1281
|
print(good.stderr, file=sys.stderr)
|
|
462
1282
|
return 1
|
|
463
1283
|
|
|
464
|
-
(devlyn / "risk-probes.jsonl").
|
|
465
|
-
|
|
466
|
-
"
|
|
467
|
-
"
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
[sys.executable, script_path, "--validate-risk-probes"],
|
|
1284
|
+
(devlyn / "risk-probes.jsonl").unlink()
|
|
1285
|
+
(devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
1286
|
+
"source": {"type": "spec", "spec_path": str(spec_md)},
|
|
1287
|
+
"risk_profile": {"risk_probes_enabled": True},
|
|
1288
|
+
}))
|
|
1289
|
+
missing_required_probe = subprocess.run(
|
|
1290
|
+
[sys.executable, script_path, "--include-risk-probes"],
|
|
472
1291
|
cwd=work,
|
|
473
1292
|
env=env,
|
|
474
1293
|
capture_output=True,
|
|
475
1294
|
text=True,
|
|
476
1295
|
)
|
|
477
|
-
if
|
|
478
|
-
print("
|
|
1296
|
+
if missing_required_probe.returncode == 0:
|
|
1297
|
+
print("--include-risk-probes accepted missing required risk-probes.jsonl", file=sys.stderr)
|
|
1298
|
+
return 1
|
|
1299
|
+
if "risk-probes.jsonl is required when --risk-probes is enabled" not in missing_required_probe.stderr:
|
|
1300
|
+
print("--include-risk-probes missing required probe had the wrong error", file=sys.stderr)
|
|
1301
|
+
print(missing_required_probe.stderr, file=sys.stderr)
|
|
479
1302
|
return 1
|
|
480
1303
|
|
|
481
|
-
(devlyn / "
|
|
482
|
-
"
|
|
483
|
-
"
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
"tag_evidence": {"error_contract": []},
|
|
488
|
-
}) + "\n")
|
|
489
|
-
bad_error_ref = subprocess.run(
|
|
490
|
-
[sys.executable, script_path, "--validate-risk-probes"],
|
|
1304
|
+
(devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
1305
|
+
"source": {"type": "spec", "spec_path": str(spec_md)},
|
|
1306
|
+
"risk_profile": {"risk_probes_enabled": False},
|
|
1307
|
+
}))
|
|
1308
|
+
missing_optional_probe = subprocess.run(
|
|
1309
|
+
[sys.executable, script_path, "--include-risk-probes"],
|
|
491
1310
|
cwd=work,
|
|
492
1311
|
env=env,
|
|
493
1312
|
capture_output=True,
|
|
494
1313
|
text=True,
|
|
495
1314
|
)
|
|
496
|
-
if
|
|
497
|
-
print("
|
|
1315
|
+
if missing_optional_probe.returncode != 0:
|
|
1316
|
+
print("--include-risk-probes rejected optional missing risk-probes.jsonl", file=sys.stderr)
|
|
1317
|
+
print(missing_optional_probe.stderr, file=sys.stderr)
|
|
1318
|
+
return 1
|
|
1319
|
+
|
|
1320
|
+
(devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
1321
|
+
"source": {"type": "spec", "spec_path": str(spec_md)},
|
|
1322
|
+
"risk_profile": {"risk_probes_enabled": "true"},
|
|
1323
|
+
}))
|
|
1324
|
+
malformed_risk_probe_state = subprocess.run(
|
|
1325
|
+
[sys.executable, script_path, "--include-risk-probes"],
|
|
1326
|
+
cwd=work,
|
|
1327
|
+
env=env,
|
|
1328
|
+
capture_output=True,
|
|
1329
|
+
text=True,
|
|
1330
|
+
)
|
|
1331
|
+
if malformed_risk_probe_state.returncode == 0:
|
|
1332
|
+
print("--include-risk-probes accepted non-boolean risk_probes_enabled", file=sys.stderr)
|
|
1333
|
+
return 1
|
|
1334
|
+
if "risk_profile.risk_probes_enabled must be boolean" not in malformed_risk_probe_state.stderr:
|
|
1335
|
+
print("--include-risk-probes malformed risk_probes_enabled had the wrong error", file=sys.stderr)
|
|
1336
|
+
print(malformed_risk_probe_state.stderr, file=sys.stderr)
|
|
1337
|
+
return 1
|
|
1338
|
+
|
|
1339
|
+
(devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
1340
|
+
"source": {"type": "spec", "spec_path": str(spec_md)},
|
|
1341
|
+
"risk_profile": "enabled",
|
|
1342
|
+
}))
|
|
1343
|
+
malformed_risk_profile = subprocess.run(
|
|
1344
|
+
[sys.executable, script_path, "--include-risk-probes"],
|
|
1345
|
+
cwd=work,
|
|
1346
|
+
env=env,
|
|
1347
|
+
capture_output=True,
|
|
1348
|
+
text=True,
|
|
1349
|
+
)
|
|
1350
|
+
if malformed_risk_profile.returncode == 0:
|
|
1351
|
+
print("--include-risk-probes accepted non-object risk_profile", file=sys.stderr)
|
|
1352
|
+
return 1
|
|
1353
|
+
if "risk_profile must be an object" not in malformed_risk_profile.stderr:
|
|
1354
|
+
print("--include-risk-probes malformed risk_profile had the wrong error", file=sys.stderr)
|
|
1355
|
+
print(malformed_risk_profile.stderr, file=sys.stderr)
|
|
498
1356
|
return 1
|
|
499
1357
|
|
|
1358
|
+
(devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
1359
|
+
"source": {"type": "spec", "spec_path": str(spec_md)}
|
|
1360
|
+
}))
|
|
500
1361
|
(devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
501
|
-
"id": "
|
|
1362
|
+
"id": "P1",
|
|
502
1363
|
"derived_from": "probe must pass visible marker.",
|
|
503
|
-
"cmd": "printf
|
|
1364
|
+
"cmd": "printf probe-ok",
|
|
504
1365
|
"exit_code": 0,
|
|
505
|
-
"
|
|
506
|
-
"
|
|
1366
|
+
"stdout_contains": ["probe-ok"],
|
|
1367
|
+
"stdout_not_contains": [],
|
|
1368
|
+
"tags": ["shape_contract"],
|
|
1369
|
+
"tag_evidence": {},
|
|
507
1370
|
}) + "\n")
|
|
508
|
-
|
|
509
|
-
|
|
1371
|
+
|
|
1372
|
+
good_complexity = work / "good-complexity.md"
|
|
1373
|
+
good_complexity.write_text(
|
|
1374
|
+
"---\nid: good\ncomplexity: large\n---\n\n# Good\n\n## Verification\n\n- ok\n",
|
|
1375
|
+
encoding="utf-8",
|
|
1376
|
+
)
|
|
1377
|
+
good_complexity_check = subprocess.run(
|
|
1378
|
+
[sys.executable, script_path, "--check", str(good_complexity)],
|
|
510
1379
|
cwd=work,
|
|
511
|
-
env=env,
|
|
512
1380
|
capture_output=True,
|
|
513
1381
|
text=True,
|
|
514
1382
|
)
|
|
515
|
-
if
|
|
516
|
-
print(
|
|
1383
|
+
if good_complexity_check.returncode != 0:
|
|
1384
|
+
print(good_complexity_check.stderr, file=sys.stderr)
|
|
517
1385
|
return 1
|
|
518
|
-
return 0
|
|
519
1386
|
|
|
1387
|
+
bad_complexity = work / "bad-complexity.md"
|
|
1388
|
+
bad_complexity.write_text(
|
|
1389
|
+
"---\nid: bad\ncomplexity: hihg\n---\n\n# Bad\n\n## Verification\n\n- ok\n",
|
|
1390
|
+
encoding="utf-8",
|
|
1391
|
+
)
|
|
1392
|
+
bad_complexity_check = subprocess.run(
|
|
1393
|
+
[sys.executable, script_path, "--check", str(bad_complexity)],
|
|
1394
|
+
cwd=work,
|
|
1395
|
+
capture_output=True,
|
|
1396
|
+
text=True,
|
|
1397
|
+
)
|
|
1398
|
+
if bad_complexity_check.returncode == 0:
|
|
1399
|
+
print("unsupported spec complexity was accepted", file=sys.stderr)
|
|
1400
|
+
return 1
|
|
1401
|
+
if "frontmatter complexity must be one of" not in bad_complexity_check.stderr:
|
|
1402
|
+
print("unsupported spec complexity did not report the allowed values", file=sys.stderr)
|
|
1403
|
+
print(bad_complexity_check.stderr, file=sys.stderr)
|
|
1404
|
+
return 1
|
|
520
1405
|
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
1406
|
+
weak_solo_headroom = work / "weak-solo-headroom.md"
|
|
1407
|
+
weak_solo_headroom.write_text(
|
|
1408
|
+
"# Weak\n\n## Verification\n\n"
|
|
1409
|
+
"- solo-headroom hypothesis: solo_claude should miss duplicate handling.\n"
|
|
1410
|
+
"- Observable command: `node check.js` exposes behavior.\n",
|
|
1411
|
+
encoding="utf-8",
|
|
1412
|
+
)
|
|
1413
|
+
weak_solo_check = subprocess.run(
|
|
1414
|
+
[sys.executable, script_path, "--check", str(weak_solo_headroom)],
|
|
1415
|
+
cwd=work,
|
|
1416
|
+
capture_output=True,
|
|
1417
|
+
text=True,
|
|
1418
|
+
)
|
|
1419
|
+
if weak_solo_check.returncode == 0:
|
|
1420
|
+
print("weak solo-headroom hypothesis was accepted by --check", file=sys.stderr)
|
|
1421
|
+
return 1
|
|
1422
|
+
if "backticked command/observable line that exposes the miss" not in weak_solo_check.stderr:
|
|
1423
|
+
print("--check did not report weak solo-headroom hypothesis", file=sys.stderr)
|
|
1424
|
+
print(weak_solo_check.stderr, file=sys.stderr)
|
|
1425
|
+
return 1
|
|
1426
|
+
|
|
1427
|
+
weak_descriptive_backtick = work / "weak-descriptive-backtick.md"
|
|
1428
|
+
weak_descriptive_backtick.write_text(
|
|
1429
|
+
"# Weak descriptive backtick\n\n## Verification\n\n"
|
|
1430
|
+
"- solo-headroom hypothesis: solo_claude should miss behavior where observable `priority rollback` exposes the miss.\n",
|
|
1431
|
+
encoding="utf-8",
|
|
1432
|
+
)
|
|
1433
|
+
weak_descriptive_check = subprocess.run(
|
|
1434
|
+
[sys.executable, script_path, "--check", str(weak_descriptive_backtick)],
|
|
1435
|
+
cwd=work,
|
|
1436
|
+
capture_output=True,
|
|
1437
|
+
text=True,
|
|
1438
|
+
)
|
|
1439
|
+
if weak_descriptive_check.returncode == 0:
|
|
1440
|
+
print("descriptive backtick solo-headroom hypothesis was accepted by --check", file=sys.stderr)
|
|
1441
|
+
return 1
|
|
1442
|
+
|
|
1443
|
+
strong_solo_headroom = work / "strong-solo-headroom.md"
|
|
1444
|
+
strong_solo_headroom.write_text(
|
|
1445
|
+
"# Strong\n\n## Verification\n\n"
|
|
1446
|
+
"- solo-headroom hypothesis: solo_claude should miss duplicate handling exposed by `node check.js`.\n",
|
|
1447
|
+
encoding="utf-8",
|
|
1448
|
+
)
|
|
1449
|
+
strong_solo_check = subprocess.run(
|
|
1450
|
+
[sys.executable, script_path, "--check", str(strong_solo_headroom)],
|
|
1451
|
+
cwd=work,
|
|
1452
|
+
capture_output=True,
|
|
1453
|
+
text=True,
|
|
1454
|
+
)
|
|
1455
|
+
if strong_solo_check.returncode != 0:
|
|
1456
|
+
print("actionable solo-headroom hypothesis was rejected by --check", file=sys.stderr)
|
|
1457
|
+
print(strong_solo_check.stderr, file=sys.stderr)
|
|
1458
|
+
return 1
|
|
1459
|
+
|
|
1460
|
+
docs_style_solo_headroom = work / "docs-style-solo-headroom.md"
|
|
1461
|
+
docs_style_solo_headroom.write_text(
|
|
1462
|
+
"# Docs style\n\n## Verification\n\n"
|
|
1463
|
+
"- Solo-headroom hypothesis: the spec must literally contain `solo_claude`, `miss`, and an observable command; "
|
|
1464
|
+
"`node check.js` exposes the miss.\n",
|
|
1465
|
+
encoding="utf-8",
|
|
1466
|
+
)
|
|
1467
|
+
docs_style_solo_check = subprocess.run(
|
|
1468
|
+
[sys.executable, script_path, "--check", str(docs_style_solo_headroom)],
|
|
1469
|
+
cwd=work,
|
|
1470
|
+
capture_output=True,
|
|
1471
|
+
text=True,
|
|
1472
|
+
)
|
|
1473
|
+
if docs_style_solo_check.returncode != 0:
|
|
1474
|
+
print("docs-style solo-headroom hypothesis was rejected by --check", file=sys.stderr)
|
|
1475
|
+
print(docs_style_solo_check.stderr, file=sys.stderr)
|
|
1476
|
+
return 1
|
|
1477
|
+
|
|
1478
|
+
weak_solo_ceiling = work / "weak-solo-ceiling.md"
|
|
1479
|
+
weak_solo_ceiling.write_text(
|
|
1480
|
+
"# Weak ceiling\n\n## Verification\n\n"
|
|
1481
|
+
"- solo ceiling avoidance: this is not like the previous ones.\n",
|
|
1482
|
+
encoding="utf-8",
|
|
1483
|
+
)
|
|
1484
|
+
weak_solo_ceiling_check = subprocess.run(
|
|
1485
|
+
[sys.executable, script_path, "--check", str(weak_solo_ceiling)],
|
|
1486
|
+
cwd=work,
|
|
1487
|
+
capture_output=True,
|
|
1488
|
+
text=True,
|
|
1489
|
+
)
|
|
1490
|
+
if weak_solo_ceiling_check.returncode == 0:
|
|
1491
|
+
print("weak solo ceiling avoidance was accepted by --check", file=sys.stderr)
|
|
1492
|
+
return 1
|
|
1493
|
+
if "concrete difference from rejected or solo-saturated controls" not in weak_solo_ceiling_check.stderr:
|
|
1494
|
+
print("--check did not report weak solo ceiling avoidance", file=sys.stderr)
|
|
1495
|
+
print(weak_solo_ceiling_check.stderr, file=sys.stderr)
|
|
1496
|
+
return 1
|
|
1497
|
+
|
|
1498
|
+
strong_solo_ceiling = work / "strong-solo-ceiling.md"
|
|
1499
|
+
strong_solo_ceiling.write_text(
|
|
1500
|
+
"# Strong ceiling\n\n## Verification\n\n"
|
|
1501
|
+
"- solo ceiling avoidance: unlike solo-saturated `S2`-`S6`, this uses a cross-run "
|
|
1502
|
+
"state leak because solo_claude headroom should be preserved.\n",
|
|
1503
|
+
encoding="utf-8",
|
|
1504
|
+
)
|
|
1505
|
+
strong_solo_ceiling_check = subprocess.run(
|
|
1506
|
+
[sys.executable, script_path, "--check", str(strong_solo_ceiling)],
|
|
1507
|
+
cwd=work,
|
|
1508
|
+
capture_output=True,
|
|
1509
|
+
text=True,
|
|
1510
|
+
)
|
|
1511
|
+
if strong_solo_ceiling_check.returncode != 0:
|
|
1512
|
+
print("actionable solo ceiling avoidance was rejected by --check", file=sys.stderr)
|
|
1513
|
+
print(strong_solo_ceiling_check.stderr, file=sys.stderr)
|
|
1514
|
+
return 1
|
|
1515
|
+
|
|
1516
|
+
inline_mismatched_solo = work / "inline-mismatched-solo.md"
|
|
1517
|
+
inline_mismatched_solo.write_text(
|
|
1518
|
+
"# Inline mismatch\n\n## Verification\n\n"
|
|
1519
|
+
"- solo-headroom hypothesis: solo_claude should miss duplicate handling; "
|
|
1520
|
+
"`node check.js` exposes the miss.\n\n"
|
|
1521
|
+
"```json\n"
|
|
1522
|
+
+ json.dumps({"verification_commands": [{"cmd": "printf ok"}]})
|
|
1523
|
+
+ "\n```\n",
|
|
1524
|
+
encoding="utf-8",
|
|
1525
|
+
)
|
|
1526
|
+
inline_mismatched_check = subprocess.run(
|
|
1527
|
+
[sys.executable, script_path, "--check", str(inline_mismatched_solo)],
|
|
1528
|
+
cwd=work,
|
|
1529
|
+
capture_output=True,
|
|
1530
|
+
text=True,
|
|
1531
|
+
)
|
|
1532
|
+
if inline_mismatched_check.returncode == 0:
|
|
1533
|
+
print("mismatched inline solo-headroom command was accepted by --check", file=sys.stderr)
|
|
1534
|
+
return 1
|
|
1535
|
+
if "observable command must match `## Verification` JSON carrier" not in inline_mismatched_check.stderr:
|
|
1536
|
+
print("--check did not report mismatched inline solo-headroom command", file=sys.stderr)
|
|
1537
|
+
print(inline_mismatched_check.stderr, file=sys.stderr)
|
|
1538
|
+
return 1
|
|
1539
|
+
|
|
1540
|
+
inline_matched_solo = work / "inline-matched-solo.md"
|
|
1541
|
+
inline_matched_solo.write_text(
|
|
1542
|
+
"# Inline match\n\n## Verification\n\n"
|
|
1543
|
+
"- solo-headroom hypothesis: solo_claude should miss duplicate handling; "
|
|
1544
|
+
"`printf ok` exposes the miss.\n\n"
|
|
1545
|
+
"```json\n"
|
|
1546
|
+
+ json.dumps({"verification_commands": [{"cmd": "printf ok"}]})
|
|
1547
|
+
+ "\n```\n",
|
|
1548
|
+
encoding="utf-8",
|
|
1549
|
+
)
|
|
1550
|
+
inline_matched_check = subprocess.run(
|
|
1551
|
+
[sys.executable, script_path, "--check", str(inline_matched_solo)],
|
|
1552
|
+
cwd=work,
|
|
1553
|
+
capture_output=True,
|
|
1554
|
+
text=True,
|
|
1555
|
+
)
|
|
1556
|
+
if inline_matched_check.returncode != 0:
|
|
1557
|
+
print("matched inline solo-headroom command was rejected by --check", file=sys.stderr)
|
|
1558
|
+
print(inline_matched_check.stderr, file=sys.stderr)
|
|
1559
|
+
return 1
|
|
1560
|
+
|
|
1561
|
+
(devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
1562
|
+
"id": "P2",
|
|
1563
|
+
"derived_from": "probe must pass visible marker.",
|
|
1564
|
+
"cmd": "node $BENCH_FIXTURE_DIR/verifiers/hidden.js",
|
|
1565
|
+
"exit_code": 0,
|
|
1566
|
+
}) + "\n")
|
|
1567
|
+
bad = subprocess.run(
|
|
1568
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
1569
|
+
cwd=work,
|
|
1570
|
+
env=env,
|
|
1571
|
+
capture_output=True,
|
|
1572
|
+
text=True,
|
|
1573
|
+
)
|
|
1574
|
+
if bad.returncode == 0:
|
|
1575
|
+
print("hidden verifier path was accepted", file=sys.stderr)
|
|
1576
|
+
return 1
|
|
1577
|
+
|
|
1578
|
+
(devlyn / "risk-probes.jsonl").write_text('{"id":NaN}\n')
|
|
1579
|
+
bad_probe_nan = subprocess.run(
|
|
1580
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
1581
|
+
cwd=work,
|
|
1582
|
+
env=env,
|
|
1583
|
+
capture_output=True,
|
|
1584
|
+
text=True,
|
|
1585
|
+
)
|
|
1586
|
+
if bad_probe_nan.returncode == 0:
|
|
1587
|
+
print("NaN risk-probes JSONL was accepted", file=sys.stderr)
|
|
1588
|
+
return 1
|
|
1589
|
+
if "invalid JSON numeric constant: NaN" not in bad_probe_nan.stderr:
|
|
1590
|
+
print("NaN risk-probes JSONL did not report invalid numeric constant", file=sys.stderr)
|
|
1591
|
+
print(bad_probe_nan.stderr, file=sys.stderr)
|
|
1592
|
+
return 1
|
|
1593
|
+
|
|
1594
|
+
(devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
1595
|
+
"id": "P3",
|
|
1596
|
+
"derived_from": "probe must pass visible marker.",
|
|
1597
|
+
"cmd": "printf bad-error-derived-from",
|
|
1598
|
+
"exit_code": 0,
|
|
1599
|
+
"tags": ["error_contract"],
|
|
1600
|
+
"tag_evidence": {"error_contract": []},
|
|
1601
|
+
}) + "\n")
|
|
1602
|
+
bad_error_ref = subprocess.run(
|
|
1603
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
1604
|
+
cwd=work,
|
|
1605
|
+
env=env,
|
|
1606
|
+
capture_output=True,
|
|
1607
|
+
text=True,
|
|
1608
|
+
)
|
|
1609
|
+
if bad_error_ref.returncode == 0:
|
|
1610
|
+
print("error_contract with unrelated derived_from was accepted", file=sys.stderr)
|
|
1611
|
+
return 1
|
|
1612
|
+
|
|
1613
|
+
spec_md.write_text(
|
|
1614
|
+
"# Spec\n\n## Verification\n\n"
|
|
1615
|
+
"- solo-headroom hypothesis: solo_claude should miss duplicate handling; "
|
|
1616
|
+
"`printf ok` exposes the miss.\n"
|
|
1617
|
+
"- probe must pass visible marker.\n",
|
|
1618
|
+
encoding="utf-8",
|
|
1619
|
+
)
|
|
1620
|
+
(devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
1621
|
+
"id": "P4",
|
|
1622
|
+
"derived_from": "solo-headroom hypothesis: solo_claude should miss duplicate handling; `printf ok` exposes the miss.",
|
|
1623
|
+
"cmd": "printf unrelated",
|
|
1624
|
+
"exit_code": 0,
|
|
1625
|
+
"stdout_contains": ["unrelated"],
|
|
1626
|
+
"stdout_not_contains": [],
|
|
1627
|
+
"tags": ["shape_contract"],
|
|
1628
|
+
"tag_evidence": {},
|
|
1629
|
+
}) + "\n")
|
|
1630
|
+
bad_solo_headroom_probe = subprocess.run(
|
|
1631
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
1632
|
+
cwd=work,
|
|
1633
|
+
env=env,
|
|
1634
|
+
capture_output=True,
|
|
1635
|
+
text=True,
|
|
1636
|
+
)
|
|
1637
|
+
if bad_solo_headroom_probe.returncode == 0:
|
|
1638
|
+
print("risk probe missing solo-headroom command coverage was accepted", file=sys.stderr)
|
|
1639
|
+
return 1
|
|
1640
|
+
if "risk-probes[0].cmd must contain a solo-headroom hypothesis observable command" not in bad_solo_headroom_probe.stderr:
|
|
1641
|
+
print("solo-headroom risk-probe coverage failure had the wrong error", file=sys.stderr)
|
|
1642
|
+
print(bad_solo_headroom_probe.stderr, file=sys.stderr)
|
|
1643
|
+
return 1
|
|
1644
|
+
|
|
1645
|
+
(devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
1646
|
+
"id": "P4a",
|
|
1647
|
+
"derived_from": "probe must pass visible marker.",
|
|
1648
|
+
"cmd": "bash -lc 'printf ok'",
|
|
1649
|
+
"exit_code": 0,
|
|
1650
|
+
"stdout_contains": ["ok"],
|
|
1651
|
+
"stdout_not_contains": [],
|
|
1652
|
+
"tags": ["shape_contract"],
|
|
1653
|
+
"tag_evidence": {},
|
|
1654
|
+
}) + "\n")
|
|
1655
|
+
bad_solo_headroom_derived_from = subprocess.run(
|
|
1656
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
1657
|
+
cwd=work,
|
|
1658
|
+
env=env,
|
|
1659
|
+
capture_output=True,
|
|
1660
|
+
text=True,
|
|
1661
|
+
)
|
|
1662
|
+
if bad_solo_headroom_derived_from.returncode == 0:
|
|
1663
|
+
print("risk probe with unrelated solo-headroom derived_from was accepted", file=sys.stderr)
|
|
1664
|
+
return 1
|
|
1665
|
+
if "risk-probes[0].derived_from must reference the solo-headroom hypothesis bullet" not in bad_solo_headroom_derived_from.stderr:
|
|
1666
|
+
print("solo-headroom risk-probe derived_from failure had the wrong error", file=sys.stderr)
|
|
1667
|
+
print(bad_solo_headroom_derived_from.stderr, file=sys.stderr)
|
|
1668
|
+
return 1
|
|
1669
|
+
|
|
1670
|
+
(devlyn / "risk-probes.jsonl").write_text(
|
|
1671
|
+
json.dumps({
|
|
1672
|
+
"id": "P5a",
|
|
1673
|
+
"derived_from": "solo-headroom hypothesis: solo_claude should miss duplicate handling; `printf ok` exposes the miss.",
|
|
1674
|
+
"cmd": "printf first-unrelated",
|
|
1675
|
+
"exit_code": 0,
|
|
1676
|
+
"stdout_contains": ["first-unrelated"],
|
|
1677
|
+
"stdout_not_contains": [],
|
|
1678
|
+
"tags": ["shape_contract"],
|
|
1679
|
+
"tag_evidence": {},
|
|
1680
|
+
}) + "\n" + json.dumps({
|
|
1681
|
+
"id": "P5b",
|
|
1682
|
+
"derived_from": "probe must pass visible marker.",
|
|
1683
|
+
"cmd": "bash -lc 'printf ok'",
|
|
1684
|
+
"exit_code": 0,
|
|
1685
|
+
"stdout_contains": ["ok"],
|
|
1686
|
+
"stdout_not_contains": [],
|
|
1687
|
+
"tags": ["shape_contract"],
|
|
1688
|
+
"tag_evidence": {},
|
|
1689
|
+
}) + "\n"
|
|
1690
|
+
)
|
|
1691
|
+
late_solo_headroom_probe = subprocess.run(
|
|
1692
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
1693
|
+
cwd=work,
|
|
1694
|
+
env=env,
|
|
1695
|
+
capture_output=True,
|
|
1696
|
+
text=True,
|
|
1697
|
+
)
|
|
1698
|
+
if late_solo_headroom_probe.returncode == 0:
|
|
1699
|
+
print("solo-headroom command in a later risk probe was accepted", file=sys.stderr)
|
|
1700
|
+
return 1
|
|
1701
|
+
|
|
1702
|
+
(devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
1703
|
+
"id": "P5c",
|
|
1704
|
+
"derived_from": "solo-headroom hypothesis: solo_claude should miss duplicate handling; `printf ok` exposes the miss.",
|
|
1705
|
+
"cmd": "printf ok2",
|
|
1706
|
+
"exit_code": 0,
|
|
1707
|
+
"stdout_contains": ["ok2"],
|
|
1708
|
+
"stdout_not_contains": [],
|
|
1709
|
+
"tags": ["shape_contract"],
|
|
1710
|
+
"tag_evidence": {},
|
|
1711
|
+
}) + "\n")
|
|
1712
|
+
prefix_solo_headroom_probe = subprocess.run(
|
|
1713
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
1714
|
+
cwd=work,
|
|
1715
|
+
env=env,
|
|
1716
|
+
capture_output=True,
|
|
1717
|
+
text=True,
|
|
1718
|
+
)
|
|
1719
|
+
if prefix_solo_headroom_probe.returncode == 0:
|
|
1720
|
+
print("solo-headroom command prefix match was accepted", file=sys.stderr)
|
|
1721
|
+
return 1
|
|
1722
|
+
|
|
1723
|
+
(devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
1724
|
+
"id": "P5",
|
|
1725
|
+
"derived_from": "solo-headroom hypothesis: solo_claude should miss duplicate handling; `printf ok` exposes the miss.",
|
|
1726
|
+
"cmd": "bash -lc 'printf ok'",
|
|
1727
|
+
"exit_code": 0,
|
|
1728
|
+
"stdout_contains": ["ok"],
|
|
1729
|
+
"stdout_not_contains": [],
|
|
1730
|
+
"tags": ["shape_contract"],
|
|
1731
|
+
"tag_evidence": {},
|
|
1732
|
+
}) + "\n")
|
|
1733
|
+
good_solo_headroom_probe = subprocess.run(
|
|
1734
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
1735
|
+
cwd=work,
|
|
1736
|
+
env=env,
|
|
1737
|
+
capture_output=True,
|
|
1738
|
+
text=True,
|
|
1739
|
+
)
|
|
1740
|
+
if good_solo_headroom_probe.returncode != 0:
|
|
1741
|
+
print("risk probe covering solo-headroom command was rejected", file=sys.stderr)
|
|
1742
|
+
print(good_solo_headroom_probe.stderr, file=sys.stderr)
|
|
1743
|
+
return 1
|
|
1744
|
+
|
|
1745
|
+
expected_json = work / "spec.expected.json"
|
|
1746
|
+
expected_json.write_text(json.dumps({
|
|
1747
|
+
"verification_commands": [
|
|
1748
|
+
{"cmd": "printf ok", "exit_code": 0, "stdout_contains": ["ok"]}
|
|
1749
|
+
],
|
|
1750
|
+
"forbidden_patterns": [
|
|
1751
|
+
{
|
|
1752
|
+
"pattern": "catch\\s*\\{\\s*\\}",
|
|
1753
|
+
"description": "silent catch hides failures",
|
|
1754
|
+
"severity": "disqualifier",
|
|
1755
|
+
}
|
|
1756
|
+
],
|
|
1757
|
+
"required_files": ["bin/cli.js"],
|
|
1758
|
+
"forbidden_files": [],
|
|
1759
|
+
"max_deps_added": 0,
|
|
1760
|
+
}) + "\n")
|
|
1761
|
+
spec_md.write_text(
|
|
1762
|
+
"# Spec\n\n## Verification\n\n"
|
|
1763
|
+
"- solo-headroom hypothesis: solo_claude should miss duplicate handling.\n"
|
|
1764
|
+
"- Observable command: `node check.js` exposes behavior.\n"
|
|
1765
|
+
)
|
|
1766
|
+
weak_sibling_solo = subprocess.run(
|
|
1767
|
+
[sys.executable, script_path, "--check-expected", str(expected_json)],
|
|
1768
|
+
cwd=work,
|
|
1769
|
+
capture_output=True,
|
|
1770
|
+
text=True,
|
|
1771
|
+
)
|
|
1772
|
+
if weak_sibling_solo.returncode == 0:
|
|
1773
|
+
print("weak sibling solo-headroom hypothesis was accepted by --check-expected", file=sys.stderr)
|
|
1774
|
+
return 1
|
|
1775
|
+
if "backticked command/observable line that exposes the miss" not in weak_sibling_solo.stderr:
|
|
1776
|
+
print("--check-expected did not report weak sibling solo-headroom hypothesis", file=sys.stderr)
|
|
1777
|
+
print(weak_sibling_solo.stderr, file=sys.stderr)
|
|
1778
|
+
return 1
|
|
1779
|
+
|
|
1780
|
+
spec_md.write_text(
|
|
1781
|
+
"# Spec\n\n## Verification\n\n"
|
|
1782
|
+
"- solo-headroom hypothesis: solo_claude should miss duplicate handling; "
|
|
1783
|
+
"`node check.js` exposes the miss.\n",
|
|
1784
|
+
encoding="utf-8",
|
|
1785
|
+
)
|
|
1786
|
+
mismatched_sibling_solo = subprocess.run(
|
|
1787
|
+
[sys.executable, script_path, "--check-expected", str(expected_json)],
|
|
1788
|
+
cwd=work,
|
|
1789
|
+
capture_output=True,
|
|
1790
|
+
text=True,
|
|
1791
|
+
)
|
|
1792
|
+
if mismatched_sibling_solo.returncode == 0:
|
|
1793
|
+
print("mismatched sibling solo-headroom command was accepted by --check-expected", file=sys.stderr)
|
|
1794
|
+
return 1
|
|
1795
|
+
if "observable command must match spec.expected.json" not in mismatched_sibling_solo.stderr:
|
|
1796
|
+
print("--check-expected did not report mismatched sibling solo-headroom command", file=sys.stderr)
|
|
1797
|
+
print(mismatched_sibling_solo.stderr, file=sys.stderr)
|
|
1798
|
+
return 1
|
|
1799
|
+
|
|
1800
|
+
spec_md.write_text(
|
|
1801
|
+
"# Spec\n\n## Verification\n\n"
|
|
1802
|
+
"- solo-headroom hypothesis: solo_claude should miss duplicate handling; "
|
|
1803
|
+
"`printf ok` exposes the miss.\n",
|
|
1804
|
+
encoding="utf-8",
|
|
1805
|
+
)
|
|
1806
|
+
matched_sibling_solo = subprocess.run(
|
|
1807
|
+
[sys.executable, script_path, "--check-expected", str(expected_json)],
|
|
1808
|
+
cwd=work,
|
|
1809
|
+
capture_output=True,
|
|
1810
|
+
text=True,
|
|
1811
|
+
)
|
|
1812
|
+
if matched_sibling_solo.returncode != 0:
|
|
1813
|
+
print("matched sibling solo-headroom command was rejected by --check-expected", file=sys.stderr)
|
|
1814
|
+
print(matched_sibling_solo.stderr, file=sys.stderr)
|
|
1815
|
+
return 1
|
|
1816
|
+
|
|
1817
|
+
spec_md.write_text(
|
|
1818
|
+
"# Spec\n\n## Verification\n\n"
|
|
1819
|
+
"- Solo-headroom hypothesis: the spec must literally contain `solo_claude`, `miss`, and an observable command; "
|
|
1820
|
+
"`printf ok` exposes the miss.\n",
|
|
1821
|
+
encoding="utf-8",
|
|
1822
|
+
)
|
|
1823
|
+
docs_style_sibling_solo = subprocess.run(
|
|
1824
|
+
[sys.executable, script_path, "--check-expected", str(expected_json)],
|
|
1825
|
+
cwd=work,
|
|
1826
|
+
capture_output=True,
|
|
1827
|
+
text=True,
|
|
1828
|
+
)
|
|
1829
|
+
if docs_style_sibling_solo.returncode != 0:
|
|
1830
|
+
print("docs-style sibling solo-headroom command was rejected by --check-expected", file=sys.stderr)
|
|
1831
|
+
print(docs_style_sibling_solo.stderr, file=sys.stderr)
|
|
1832
|
+
return 1
|
|
1833
|
+
|
|
1834
|
+
spec_md.write_text(
|
|
1835
|
+
"# Spec\n\n## Verification\n\n"
|
|
1836
|
+
"- solo ceiling avoidance: this differs from controls but omits the required baseline.\n",
|
|
1837
|
+
encoding="utf-8",
|
|
1838
|
+
)
|
|
1839
|
+
weak_sibling_solo_ceiling = subprocess.run(
|
|
1840
|
+
[sys.executable, script_path, "--check-expected", str(expected_json)],
|
|
1841
|
+
cwd=work,
|
|
1842
|
+
capture_output=True,
|
|
1843
|
+
text=True,
|
|
1844
|
+
)
|
|
1845
|
+
if weak_sibling_solo_ceiling.returncode == 0:
|
|
1846
|
+
print("weak sibling solo ceiling avoidance was accepted by --check-expected", file=sys.stderr)
|
|
1847
|
+
return 1
|
|
1848
|
+
if "concrete difference from rejected or solo-saturated controls" not in weak_sibling_solo_ceiling.stderr:
|
|
1849
|
+
print("--check-expected did not report weak sibling solo ceiling avoidance", file=sys.stderr)
|
|
1850
|
+
print(weak_sibling_solo_ceiling.stderr, file=sys.stderr)
|
|
1851
|
+
return 1
|
|
1852
|
+
|
|
1853
|
+
spec_md.write_text(
|
|
1854
|
+
"# Spec\n\n## Verification\n\n"
|
|
1855
|
+
"- solo ceiling avoidance: unlike solo-saturated `S2`-`S6`, this includes a "
|
|
1856
|
+
"multi-run temporal dependency because solo_claude headroom should remain.\n",
|
|
1857
|
+
encoding="utf-8",
|
|
1858
|
+
)
|
|
1859
|
+
strong_sibling_solo_ceiling = subprocess.run(
|
|
1860
|
+
[sys.executable, script_path, "--check-expected", str(expected_json)],
|
|
1861
|
+
cwd=work,
|
|
1862
|
+
capture_output=True,
|
|
1863
|
+
text=True,
|
|
1864
|
+
)
|
|
1865
|
+
if strong_sibling_solo_ceiling.returncode != 0:
|
|
1866
|
+
print("actionable sibling solo ceiling avoidance was rejected by --check-expected", file=sys.stderr)
|
|
1867
|
+
print(strong_sibling_solo_ceiling.stderr, file=sys.stderr)
|
|
1868
|
+
return 1
|
|
1869
|
+
|
|
1870
|
+
spec_md.write_text("# Spec\n\n## Verification\n\n- probe must pass visible marker.\n")
|
|
1871
|
+
expected_good = subprocess.run(
|
|
1872
|
+
[sys.executable, script_path, "--check-expected", str(expected_json)],
|
|
1873
|
+
cwd=work,
|
|
1874
|
+
capture_output=True,
|
|
1875
|
+
text=True,
|
|
1876
|
+
)
|
|
1877
|
+
if expected_good.returncode != 0:
|
|
1878
|
+
print(expected_good.stderr, file=sys.stderr)
|
|
1879
|
+
return 1
|
|
1880
|
+
|
|
1881
|
+
spec_md.write_text(
|
|
1882
|
+
"---\nid: bad-sibling\ncomplexity: hihg\n---\n\n# Bad sibling\n\n## Verification\n\n- ok\n",
|
|
1883
|
+
encoding="utf-8",
|
|
1884
|
+
)
|
|
1885
|
+
expected_bad_sibling_complexity = subprocess.run(
|
|
1886
|
+
[sys.executable, script_path, "--check-expected", str(expected_json)],
|
|
1887
|
+
cwd=work,
|
|
1888
|
+
capture_output=True,
|
|
1889
|
+
text=True,
|
|
1890
|
+
)
|
|
1891
|
+
if expected_bad_sibling_complexity.returncode == 0:
|
|
1892
|
+
print("unsupported sibling spec complexity was accepted by --check-expected", file=sys.stderr)
|
|
1893
|
+
return 1
|
|
1894
|
+
if "frontmatter complexity must be one of" not in expected_bad_sibling_complexity.stderr:
|
|
1895
|
+
print("--check-expected did not report unsupported sibling spec complexity", file=sys.stderr)
|
|
1896
|
+
print(expected_bad_sibling_complexity.stderr, file=sys.stderr)
|
|
1897
|
+
return 1
|
|
1898
|
+
spec_md.write_text("# Spec\n\n## Verification\n\n- probe must pass visible marker.\n")
|
|
1899
|
+
|
|
1900
|
+
expected_json.write_text(json.dumps({"verification_commands": []}) + "\n")
|
|
1901
|
+
expected_empty_runtime = subprocess.run(
|
|
1902
|
+
[sys.executable, script_path, "--check-expected", str(expected_json)],
|
|
1903
|
+
cwd=work,
|
|
1904
|
+
capture_output=True,
|
|
1905
|
+
text=True,
|
|
1906
|
+
)
|
|
1907
|
+
if expected_empty_runtime.returncode == 0:
|
|
1908
|
+
print("empty verification_commands should fail for runtime specs", file=sys.stderr)
|
|
1909
|
+
return 1
|
|
1910
|
+
|
|
1911
|
+
pure_root = work / "pure-design"
|
|
1912
|
+
pure_root.mkdir()
|
|
1913
|
+
pure_spec = pure_root / "spec.md"
|
|
1914
|
+
pure_spec.write_text(
|
|
1915
|
+
"# Pure design\n\n## Verification\n\n- (all Requirements are pure-design; no runtime verification commands)\n",
|
|
1916
|
+
encoding="utf-8",
|
|
1917
|
+
)
|
|
1918
|
+
pure_expected = pure_root / "spec.expected.json"
|
|
1919
|
+
pure_expected.write_text(json.dumps({"verification_commands": []}) + "\n", encoding="utf-8")
|
|
1920
|
+
expected_empty_design = subprocess.run(
|
|
1921
|
+
[sys.executable, script_path, "--check-expected", str(pure_expected)],
|
|
1922
|
+
cwd=work,
|
|
1923
|
+
capture_output=True,
|
|
1924
|
+
text=True,
|
|
1925
|
+
)
|
|
1926
|
+
if expected_empty_design.returncode != 0:
|
|
1927
|
+
print("empty verification_commands should be valid for pure-design specs", file=sys.stderr)
|
|
1928
|
+
print(expected_empty_design.stderr, file=sys.stderr)
|
|
1929
|
+
return 1
|
|
1930
|
+
|
|
1931
|
+
expected_json.write_text(json.dumps({"unknown": True}) + "\n")
|
|
1932
|
+
expected_bad = subprocess.run(
|
|
1933
|
+
[sys.executable, script_path, "--check-expected", str(expected_json)],
|
|
1934
|
+
cwd=work,
|
|
1935
|
+
capture_output=True,
|
|
1936
|
+
text=True,
|
|
1937
|
+
)
|
|
1938
|
+
if expected_bad.returncode == 0:
|
|
1939
|
+
print("spec.expected.json with unknown key was accepted", file=sys.stderr)
|
|
1940
|
+
return 1
|
|
1941
|
+
|
|
1942
|
+
expected_json.write_text(json.dumps({
|
|
1943
|
+
"verification_commands": [{"cmd": "printf ok", "stdout_contians": ["ok"]}]
|
|
1944
|
+
}) + "\n")
|
|
1945
|
+
expected_bad_command = subprocess.run(
|
|
1946
|
+
[sys.executable, script_path, "--check-expected", str(expected_json)],
|
|
1947
|
+
cwd=work,
|
|
1948
|
+
capture_output=True,
|
|
1949
|
+
text=True,
|
|
1950
|
+
)
|
|
1951
|
+
if expected_bad_command.returncode == 0:
|
|
1952
|
+
print("spec.expected.json command with unknown key was accepted", file=sys.stderr)
|
|
1953
|
+
return 1
|
|
1954
|
+
|
|
1955
|
+
expected_json.write_text("[1]\n")
|
|
1956
|
+
expected_non_object = subprocess.run(
|
|
1957
|
+
[sys.executable, script_path, "--check-expected", str(expected_json)],
|
|
1958
|
+
cwd=work,
|
|
1959
|
+
capture_output=True,
|
|
1960
|
+
text=True,
|
|
1961
|
+
)
|
|
1962
|
+
if expected_non_object.returncode == 0:
|
|
1963
|
+
print("spec.expected.json top-level array was accepted", file=sys.stderr)
|
|
1964
|
+
return 1
|
|
1965
|
+
if "top-level must be a JSON object" not in expected_non_object.stderr:
|
|
1966
|
+
print("spec.expected.json top-level array did not report object shape error", file=sys.stderr)
|
|
1967
|
+
print(expected_non_object.stderr, file=sys.stderr)
|
|
1968
|
+
return 1
|
|
1969
|
+
if "Traceback" in expected_non_object.stderr:
|
|
1970
|
+
print("spec.expected.json top-level array produced a traceback", file=sys.stderr)
|
|
1971
|
+
return 1
|
|
1972
|
+
|
|
1973
|
+
expected_json.write_text("{broken\n")
|
|
1974
|
+
expected_invalid_json = subprocess.run(
|
|
1975
|
+
[sys.executable, script_path, "--check-expected", str(expected_json)],
|
|
1976
|
+
cwd=work,
|
|
1977
|
+
capture_output=True,
|
|
1978
|
+
text=True,
|
|
1979
|
+
)
|
|
1980
|
+
if expected_invalid_json.returncode == 0:
|
|
1981
|
+
print("invalid spec.expected.json was accepted", file=sys.stderr)
|
|
1982
|
+
return 1
|
|
1983
|
+
if "has invalid JSON" not in expected_invalid_json.stderr:
|
|
1984
|
+
print("invalid spec.expected.json did not report JSON parse error", file=sys.stderr)
|
|
1985
|
+
print(expected_invalid_json.stderr, file=sys.stderr)
|
|
1986
|
+
return 1
|
|
1987
|
+
if "Traceback" in expected_invalid_json.stderr:
|
|
1988
|
+
print("invalid spec.expected.json produced a traceback", file=sys.stderr)
|
|
1989
|
+
return 1
|
|
1990
|
+
|
|
1991
|
+
expected_json.write_text('{"verification_commands": NaN}\n')
|
|
1992
|
+
expected_nan_json = subprocess.run(
|
|
1993
|
+
[sys.executable, script_path, "--check-expected", str(expected_json)],
|
|
1994
|
+
cwd=work,
|
|
1995
|
+
capture_output=True,
|
|
1996
|
+
text=True,
|
|
1997
|
+
)
|
|
1998
|
+
if expected_nan_json.returncode == 0:
|
|
1999
|
+
print("NaN spec.expected.json was accepted", file=sys.stderr)
|
|
2000
|
+
return 1
|
|
2001
|
+
if "invalid JSON numeric constant: NaN" not in expected_nan_json.stderr:
|
|
2002
|
+
print("NaN spec.expected.json did not report invalid numeric constant", file=sys.stderr)
|
|
2003
|
+
print(expected_nan_json.stderr, file=sys.stderr)
|
|
2004
|
+
return 1
|
|
2005
|
+
|
|
2006
|
+
spec_integrity = work / "spec-integrity"
|
|
2007
|
+
spec_integrity.mkdir()
|
|
2008
|
+
spec_integrity_devlyn = spec_integrity / ".devlyn"
|
|
2009
|
+
spec_integrity_devlyn.mkdir()
|
|
2010
|
+
integrity_spec = spec_integrity / "spec.md"
|
|
2011
|
+
integrity_spec.write_text(
|
|
2012
|
+
"# Spec\n\n## Verification\n\n```json\n"
|
|
2013
|
+
"{\"verification_commands\":[{\"cmd\":\"printf spec-hash-ok\",\"stdout_contains\":[\"spec-hash-ok\"]}]}\n"
|
|
2014
|
+
"```\n",
|
|
2015
|
+
encoding="utf-8",
|
|
2016
|
+
)
|
|
2017
|
+
(spec_integrity_devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
2018
|
+
"source": {
|
|
2019
|
+
"type": "spec",
|
|
2020
|
+
"spec_path": str(integrity_spec),
|
|
2021
|
+
"spec_sha256": "0" * 64,
|
|
2022
|
+
}
|
|
2023
|
+
}))
|
|
2024
|
+
spec_bad_hash_run = subprocess.run(
|
|
2025
|
+
[sys.executable, script_path],
|
|
2026
|
+
cwd=spec_integrity,
|
|
2027
|
+
capture_output=True,
|
|
2028
|
+
text=True,
|
|
2029
|
+
)
|
|
2030
|
+
if spec_bad_hash_run.returncode == 0:
|
|
2031
|
+
print("spec source with mismatched source.spec_sha256 was accepted", file=sys.stderr)
|
|
2032
|
+
return 1
|
|
2033
|
+
if "source.spec_sha256 mismatch" not in spec_bad_hash_run.stderr:
|
|
2034
|
+
print("spec source hash mismatch did not report source integrity", file=sys.stderr)
|
|
2035
|
+
print(spec_bad_hash_run.stderr, file=sys.stderr)
|
|
2036
|
+
return 1
|
|
2037
|
+
|
|
2038
|
+
spec_hash = hashlib.sha256(integrity_spec.read_bytes()).hexdigest()
|
|
2039
|
+
(spec_integrity_devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
2040
|
+
"source": {
|
|
2041
|
+
"type": "spec",
|
|
2042
|
+
"spec_path": str(integrity_spec),
|
|
2043
|
+
"spec_sha256": spec_hash,
|
|
2044
|
+
}
|
|
2045
|
+
}))
|
|
2046
|
+
spec_hash_run = subprocess.run(
|
|
2047
|
+
[sys.executable, script_path],
|
|
2048
|
+
cwd=spec_integrity,
|
|
2049
|
+
capture_output=True,
|
|
2050
|
+
text=True,
|
|
2051
|
+
)
|
|
2052
|
+
if spec_hash_run.returncode != 0:
|
|
2053
|
+
print(spec_hash_run.stderr, file=sys.stderr)
|
|
2054
|
+
return 1
|
|
2055
|
+
staged_spec_hash = loads_strict_json((spec_integrity_devlyn / "spec-verify.json").read_text())
|
|
2056
|
+
if staged_spec_hash.get("verification_commands", [{}])[0].get("cmd") != "printf spec-hash-ok":
|
|
2057
|
+
print("spec source with matching source.spec_sha256 was not staged", file=sys.stderr)
|
|
2058
|
+
return 1
|
|
2059
|
+
|
|
2060
|
+
generated_user = work / "generated-user"
|
|
2061
|
+
generated_user.mkdir()
|
|
2062
|
+
generated_devlyn = generated_user / ".devlyn"
|
|
2063
|
+
generated_devlyn.mkdir()
|
|
2064
|
+
generated_criteria = generated_user / ".devlyn" / "criteria.generated.md"
|
|
2065
|
+
generated_criteria.write_text(
|
|
2066
|
+
"# Criteria\n\n## Verification\n\n```json\n"
|
|
2067
|
+
"{\"verification_commands\":[{\"cmd\":\"printf generated-ok\",\"stdout_contains\":[\"generated-ok\"]}]}\n"
|
|
2068
|
+
"```\n",
|
|
2069
|
+
encoding="utf-8",
|
|
2070
|
+
)
|
|
2071
|
+
(generated_devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
2072
|
+
"source": {"type": "generated", "criteria_path": str(generated_criteria)}
|
|
2073
|
+
}))
|
|
2074
|
+
generated_missing_hash_run = subprocess.run(
|
|
2075
|
+
[sys.executable, script_path],
|
|
2076
|
+
cwd=generated_user,
|
|
2077
|
+
capture_output=True,
|
|
2078
|
+
text=True,
|
|
2079
|
+
)
|
|
2080
|
+
if generated_missing_hash_run.returncode == 0:
|
|
2081
|
+
print("generated criteria without source.criteria_sha256 was accepted", file=sys.stderr)
|
|
2082
|
+
return 1
|
|
2083
|
+
if "source.criteria_sha256 is required" not in generated_missing_hash_run.stderr:
|
|
2084
|
+
print("generated criteria without source.criteria_sha256 did not report source integrity", file=sys.stderr)
|
|
2085
|
+
print(generated_missing_hash_run.stderr, file=sys.stderr)
|
|
2086
|
+
return 1
|
|
2087
|
+
|
|
2088
|
+
(generated_devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
2089
|
+
"source": {
|
|
2090
|
+
"type": "generated",
|
|
2091
|
+
"criteria_path": str(generated_criteria),
|
|
2092
|
+
"criteria_sha256": "0" * 64,
|
|
2093
|
+
}
|
|
2094
|
+
}))
|
|
2095
|
+
generated_bad_hash_run = subprocess.run(
|
|
2096
|
+
[sys.executable, script_path],
|
|
2097
|
+
cwd=generated_user,
|
|
2098
|
+
capture_output=True,
|
|
2099
|
+
text=True,
|
|
2100
|
+
)
|
|
2101
|
+
if generated_bad_hash_run.returncode == 0:
|
|
2102
|
+
print("generated criteria with mismatched source.criteria_sha256 was accepted", file=sys.stderr)
|
|
2103
|
+
return 1
|
|
2104
|
+
if "source.criteria_sha256 mismatch" not in generated_bad_hash_run.stderr:
|
|
2105
|
+
print("generated criteria hash mismatch did not report source integrity", file=sys.stderr)
|
|
2106
|
+
print(generated_bad_hash_run.stderr, file=sys.stderr)
|
|
2107
|
+
return 1
|
|
2108
|
+
|
|
2109
|
+
generated_hash = hashlib.sha256(generated_criteria.read_bytes()).hexdigest()
|
|
2110
|
+
(generated_devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
2111
|
+
"source": {
|
|
2112
|
+
"type": "generated",
|
|
2113
|
+
"criteria_path": str(generated_criteria),
|
|
2114
|
+
"criteria_sha256": generated_hash,
|
|
2115
|
+
}
|
|
2116
|
+
}))
|
|
2117
|
+
generated_run = subprocess.run(
|
|
2118
|
+
[sys.executable, script_path],
|
|
2119
|
+
cwd=generated_user,
|
|
2120
|
+
capture_output=True,
|
|
2121
|
+
text=True,
|
|
2122
|
+
)
|
|
2123
|
+
if generated_run.returncode != 0:
|
|
2124
|
+
print(generated_run.stderr, file=sys.stderr)
|
|
2125
|
+
return 1
|
|
2126
|
+
staged_generated = loads_strict_json((generated_devlyn / "spec-verify.json").read_text())
|
|
2127
|
+
if staged_generated.get("verification_commands", [{}])[0].get("cmd") != "printf generated-ok":
|
|
2128
|
+
print("generated criteria carrier was not staged into .devlyn/spec-verify.json", file=sys.stderr)
|
|
2129
|
+
return 1
|
|
2130
|
+
|
|
2131
|
+
generated_criteria.write_text(
|
|
2132
|
+
"# Criteria\n\n## Verification\n\n- generated criteria omitted its machine-readable carrier.\n",
|
|
2133
|
+
encoding="utf-8",
|
|
2134
|
+
)
|
|
2135
|
+
malformed_generated_hash = hashlib.sha256(generated_criteria.read_bytes()).hexdigest()
|
|
2136
|
+
(generated_devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
2137
|
+
"source": {
|
|
2138
|
+
"type": "generated",
|
|
2139
|
+
"criteria_path": str(generated_criteria),
|
|
2140
|
+
"criteria_sha256": malformed_generated_hash,
|
|
2141
|
+
}
|
|
2142
|
+
}))
|
|
2143
|
+
malformed_generated_run = subprocess.run(
|
|
2144
|
+
[sys.executable, script_path],
|
|
2145
|
+
cwd=generated_user,
|
|
2146
|
+
capture_output=True,
|
|
2147
|
+
text=True,
|
|
2148
|
+
)
|
|
2149
|
+
if malformed_generated_run.returncode == 0:
|
|
2150
|
+
print("generated criteria without a JSON carrier was accepted", file=sys.stderr)
|
|
2151
|
+
return 1
|
|
2152
|
+
if "Generated criteria were written without one" not in malformed_generated_run.stderr:
|
|
2153
|
+
print("generated criteria without a JSON carrier did not report the generated-source contract", file=sys.stderr)
|
|
2154
|
+
print(malformed_generated_run.stderr, file=sys.stderr)
|
|
2155
|
+
return 1
|
|
2156
|
+
|
|
2157
|
+
real_user = work / "real-user"
|
|
2158
|
+
real_user.mkdir()
|
|
2159
|
+
real_devlyn = real_user / ".devlyn"
|
|
2160
|
+
real_devlyn.mkdir()
|
|
2161
|
+
real_spec = real_user / "spec.md"
|
|
2162
|
+
real_spec.write_text(
|
|
2163
|
+
"# Spec\n\n## Verification\n\n- sibling command must print sibling-ok.\n"
|
|
2164
|
+
)
|
|
2165
|
+
(real_user / "spec.expected.json").write_text(json.dumps({
|
|
2166
|
+
"verification_commands": [
|
|
2167
|
+
{"cmd": "printf sibling-ok", "stdout_contains": ["sibling-ok"]}
|
|
2168
|
+
]
|
|
2169
|
+
}) + "\n")
|
|
2170
|
+
(real_devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
2171
|
+
"source": {"type": "spec", "spec_path": str(real_spec)}
|
|
2172
|
+
}))
|
|
2173
|
+
sibling_run = subprocess.run(
|
|
2174
|
+
[sys.executable, script_path],
|
|
2175
|
+
cwd=real_user,
|
|
2176
|
+
capture_output=True,
|
|
2177
|
+
text=True,
|
|
2178
|
+
)
|
|
2179
|
+
if sibling_run.returncode != 0:
|
|
2180
|
+
print(sibling_run.stderr, file=sys.stderr)
|
|
2181
|
+
return 1
|
|
2182
|
+
staged = loads_strict_json((real_devlyn / "spec-verify.json").read_text())
|
|
2183
|
+
if staged.get("verification_commands", [{}])[0].get("cmd") != "printf sibling-ok":
|
|
2184
|
+
print("sibling spec.expected.json was not staged into .devlyn/spec-verify.json", file=sys.stderr)
|
|
2185
|
+
return 1
|
|
2186
|
+
|
|
2187
|
+
malformed = work / "malformed-sibling"
|
|
2188
|
+
malformed.mkdir()
|
|
2189
|
+
malformed_devlyn = malformed / ".devlyn"
|
|
2190
|
+
malformed_devlyn.mkdir()
|
|
2191
|
+
malformed_spec = malformed / "spec.md"
|
|
2192
|
+
malformed_spec.write_text(
|
|
2193
|
+
"# Spec\n\n## Verification\n\n```json\n"
|
|
2194
|
+
"{\"verification_commands\":[{\"cmd\":\"printf inline-ok\"}]}\n"
|
|
2195
|
+
"```\n"
|
|
2196
|
+
)
|
|
2197
|
+
(malformed / "spec.expected.json").write_text(json.dumps({"unknown": True}) + "\n")
|
|
2198
|
+
(malformed_devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
2199
|
+
"source": {"type": "spec", "spec_path": str(malformed_spec)}
|
|
2200
|
+
}))
|
|
2201
|
+
malformed_run = subprocess.run(
|
|
2202
|
+
[sys.executable, script_path],
|
|
2203
|
+
cwd=malformed,
|
|
2204
|
+
capture_output=True,
|
|
2205
|
+
text=True,
|
|
2206
|
+
)
|
|
2207
|
+
if malformed_run.returncode == 0:
|
|
2208
|
+
print("malformed sibling spec.expected.json fell back to inline carrier", file=sys.stderr)
|
|
2209
|
+
return 1
|
|
2210
|
+
|
|
2211
|
+
bench_spec = work / "bench-spec.md"
|
|
2212
|
+
bench_spec.write_text("# Spec\n\n## Verification\n\n- benchmark pre-staged wins.\n")
|
|
2213
|
+
(work / "spec.expected.json").write_text(json.dumps({
|
|
2214
|
+
"verification_commands": [
|
|
2215
|
+
{"cmd": "printf wrong", "stdout_contains": ["wrong"]}
|
|
2216
|
+
]
|
|
2217
|
+
}) + "\n")
|
|
2218
|
+
(devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
2219
|
+
"source": {"type": "spec", "spec_path": str(bench_spec)}
|
|
2220
|
+
}))
|
|
2221
|
+
(devlyn / "spec-verify.json").write_text(json.dumps({
|
|
2222
|
+
"verification_commands": [
|
|
2223
|
+
{"cmd": "printf bench-staged", "stdout_contains": ["bench-staged"]}
|
|
2224
|
+
]
|
|
2225
|
+
}) + "\n")
|
|
2226
|
+
bench_pre_staged = subprocess.run(
|
|
2227
|
+
[sys.executable, script_path],
|
|
2228
|
+
cwd=work,
|
|
2229
|
+
env=env,
|
|
2230
|
+
capture_output=True,
|
|
2231
|
+
text=True,
|
|
2232
|
+
)
|
|
2233
|
+
if bench_pre_staged.returncode != 0:
|
|
2234
|
+
print(bench_pre_staged.stderr, file=sys.stderr)
|
|
2235
|
+
return 1
|
|
2236
|
+
staged_bench = loads_strict_json((devlyn / "spec-verify.json").read_text())
|
|
2237
|
+
if staged_bench.get("verification_commands", [{}])[0].get("cmd") != "printf bench-staged":
|
|
2238
|
+
print("benchmark pre-staged contract was overwritten", file=sys.stderr)
|
|
2239
|
+
return 1
|
|
2240
|
+
|
|
2241
|
+
verify_output = work / "verify-output"
|
|
2242
|
+
verify_output.mkdir()
|
|
2243
|
+
verify_devlyn = verify_output / ".devlyn"
|
|
2244
|
+
verify_devlyn.mkdir()
|
|
2245
|
+
verify_spec = verify_output / "spec.md"
|
|
2246
|
+
verify_spec.write_text("# Spec\n\n## Verification\n\n- verify mechanical output.\n")
|
|
2247
|
+
(verify_output / "spec.expected.json").write_text(json.dumps({
|
|
2248
|
+
"verification_commands": [
|
|
2249
|
+
{"cmd": "printf wrong", "stdout_contains": ["expected"]}
|
|
2250
|
+
]
|
|
2251
|
+
}) + "\n")
|
|
2252
|
+
(verify_devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
2253
|
+
"source": {"type": "spec", "spec_path": str(verify_spec)}
|
|
2254
|
+
}))
|
|
2255
|
+
verify_env = os.environ.copy()
|
|
2256
|
+
verify_env.update({
|
|
2257
|
+
"SPEC_VERIFY_PHASE": "verify_mechanical",
|
|
2258
|
+
"SPEC_VERIFY_FINDINGS_FILE": "verify-mechanical.findings.jsonl",
|
|
2259
|
+
"SPEC_VERIFY_FINDING_PREFIX": "VERIFY-MECH",
|
|
2260
|
+
})
|
|
2261
|
+
verify_output_run = subprocess.run(
|
|
2262
|
+
[sys.executable, script_path],
|
|
2263
|
+
cwd=verify_output,
|
|
2264
|
+
env=verify_env,
|
|
2265
|
+
capture_output=True,
|
|
2266
|
+
text=True,
|
|
2267
|
+
)
|
|
2268
|
+
if verify_output_run.returncode == 0:
|
|
2269
|
+
print("VERIFY output-mode failing command was accepted", file=sys.stderr)
|
|
2270
|
+
return 1
|
|
2271
|
+
verify_findings = (verify_devlyn / "verify-mechanical.findings.jsonl").read_text()
|
|
2272
|
+
if '"phase": "verify_mechanical"' not in verify_findings or "VERIFY-MECH-" not in verify_findings:
|
|
2273
|
+
print("VERIFY output-mode did not route findings to verify-mechanical", file=sys.stderr)
|
|
2274
|
+
return 1
|
|
2275
|
+
|
|
2276
|
+
contract_root = work / "expected-contract"
|
|
2277
|
+
contract_root.mkdir()
|
|
2278
|
+
contract_devlyn = contract_root / ".devlyn"
|
|
2279
|
+
contract_devlyn.mkdir()
|
|
2280
|
+
(contract_root / "package.json").write_text(
|
|
2281
|
+
'{\n "dependencies": {},\n "devDependencies": {}\n}\n'
|
|
2282
|
+
)
|
|
2283
|
+
subprocess.run(["git", "init", "-q"], cwd=contract_root, check=True)
|
|
2284
|
+
subprocess.run(["git", "add", "-A"], cwd=contract_root, check=True)
|
|
2285
|
+
subprocess.run(
|
|
2286
|
+
["git", "-c", "user.email=t@t", "-c", "user.name=t", "commit", "-q", "-m", "base"],
|
|
2287
|
+
cwd=contract_root,
|
|
2288
|
+
check=True,
|
|
2289
|
+
)
|
|
2290
|
+
base_sha = subprocess.check_output(
|
|
2291
|
+
["git", "rev-parse", "HEAD"],
|
|
2292
|
+
cwd=contract_root,
|
|
2293
|
+
text=True,
|
|
2294
|
+
).strip()
|
|
2295
|
+
contract_spec = contract_root / "spec.md"
|
|
2296
|
+
contract_spec.write_text("# Spec\n\n## Verification\n\n- expected contract checks.\n")
|
|
2297
|
+
(contract_root / "app.js").write_text("try { work(); } catch { return null; }\n")
|
|
2298
|
+
(contract_root / "forbidden.txt").write_text("forbidden\n")
|
|
2299
|
+
(contract_root / "package.json").write_text(
|
|
2300
|
+
'{\n "dependencies": {\n "left-pad": "1.3.0"\n },\n'
|
|
2301
|
+
' "devDependencies": {}\n}\n'
|
|
2302
|
+
)
|
|
2303
|
+
(contract_root / "spec.expected.json").write_text(json.dumps({
|
|
2304
|
+
"verification_commands": [{"cmd": "printf ok", "stdout_contains": ["ok"]}],
|
|
2305
|
+
"forbidden_patterns": [{
|
|
2306
|
+
"pattern": "catch\\s*\\{\\s*return null",
|
|
2307
|
+
"description": "silent catch fallback",
|
|
2308
|
+
"severity": "disqualifier",
|
|
2309
|
+
}],
|
|
2310
|
+
"required_files": ["required.txt"],
|
|
2311
|
+
"forbidden_files": ["forbidden.txt"],
|
|
2312
|
+
"max_deps_added": 0,
|
|
2313
|
+
}) + "\n")
|
|
2314
|
+
subprocess.run(["git", "add", "-A"], cwd=contract_root, check=True)
|
|
2315
|
+
(contract_devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
2316
|
+
"source": {"type": "spec", "spec_path": str(contract_spec)},
|
|
2317
|
+
"base_ref": {"sha": base_sha},
|
|
2318
|
+
}))
|
|
2319
|
+
contract_run = subprocess.run(
|
|
2320
|
+
[sys.executable, script_path],
|
|
2321
|
+
cwd=contract_root,
|
|
2322
|
+
capture_output=True,
|
|
2323
|
+
text=True,
|
|
2324
|
+
)
|
|
2325
|
+
if contract_run.returncode == 0:
|
|
2326
|
+
print("expected contract violations were accepted", file=sys.stderr)
|
|
2327
|
+
return 1
|
|
2328
|
+
findings_text = (contract_devlyn / "spec-verify-findings.jsonl").read_text()
|
|
2329
|
+
for rule_id in (
|
|
2330
|
+
"correctness.forbidden-pattern",
|
|
2331
|
+
"correctness.required-file-missing",
|
|
2332
|
+
"scope.forbidden-file-touched",
|
|
2333
|
+
"scope.max-deps-added-exceeded",
|
|
2334
|
+
):
|
|
2335
|
+
if rule_id not in findings_text:
|
|
2336
|
+
print(f"expected contract finding missing: {rule_id}", file=sys.stderr)
|
|
2337
|
+
return 1
|
|
2338
|
+
|
|
2339
|
+
(devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
2340
|
+
"id": "P4",
|
|
2341
|
+
"derived_from": "probe must pass visible marker.",
|
|
2342
|
+
"cmd": "printf weak-boundary",
|
|
2343
|
+
"exit_code": 0,
|
|
2344
|
+
"tags": ["boundary_overlap"],
|
|
2345
|
+
"tag_evidence": {"boundary_overlap": ["one_minute_overlap"]},
|
|
2346
|
+
}) + "\n")
|
|
2347
|
+
weak = subprocess.run(
|
|
2348
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
2349
|
+
cwd=work,
|
|
2350
|
+
env=env,
|
|
2351
|
+
capture_output=True,
|
|
2352
|
+
text=True,
|
|
2353
|
+
)
|
|
2354
|
+
if weak.returncode == 0:
|
|
2355
|
+
print("incomplete boundary_overlap evidence was accepted", file=sys.stderr)
|
|
2356
|
+
return 1
|
|
2357
|
+
|
|
2358
|
+
rollback_root = work / "rollback-risk-probe"
|
|
2359
|
+
rollback_root.mkdir()
|
|
2360
|
+
rollback_devlyn = rollback_root / ".devlyn"
|
|
2361
|
+
rollback_devlyn.mkdir()
|
|
2362
|
+
rollback_spec = rollback_root / "spec.md"
|
|
2363
|
+
rollback_spec.write_text(
|
|
2364
|
+
"# Spec\n\n## Verification\n\n"
|
|
2365
|
+
"- A failed all-or-nothing operation must roll back tentative state "
|
|
2366
|
+
"so later orders can use the released stock.\n"
|
|
2367
|
+
)
|
|
2368
|
+
(rollback_devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
2369
|
+
"source": {"type": "spec", "spec_path": str(rollback_spec)}
|
|
2370
|
+
}))
|
|
2371
|
+
(rollback_devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
2372
|
+
"id": "P5",
|
|
2373
|
+
"derived_from": (
|
|
2374
|
+
"A failed all-or-nothing operation must roll back tentative "
|
|
2375
|
+
"state so later orders can use the released stock."
|
|
2376
|
+
),
|
|
2377
|
+
"cmd": "printf weak-rollback",
|
|
2378
|
+
"exit_code": 0,
|
|
2379
|
+
"tags": ["prior_consumption"],
|
|
2380
|
+
"tag_evidence": {
|
|
2381
|
+
"prior_consumption": [
|
|
2382
|
+
"same_resource_consumed_first",
|
|
2383
|
+
"later_entity_fails_or_reroutes",
|
|
2384
|
+
],
|
|
2385
|
+
},
|
|
2386
|
+
}) + "\n")
|
|
2387
|
+
weak_rollback = subprocess.run(
|
|
2388
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
2389
|
+
cwd=rollback_root,
|
|
2390
|
+
capture_output=True,
|
|
2391
|
+
text=True,
|
|
2392
|
+
)
|
|
2393
|
+
if weak_rollback.returncode == 0:
|
|
2394
|
+
print("rollback verification text did not require rollback_state probe tag", file=sys.stderr)
|
|
2395
|
+
return 1
|
|
2396
|
+
|
|
2397
|
+
error_root = work / "error-contract-risk-probe"
|
|
2398
|
+
error_root.mkdir()
|
|
2399
|
+
error_devlyn = error_root / ".devlyn"
|
|
2400
|
+
error_devlyn.mkdir()
|
|
2401
|
+
error_spec = error_root / "spec.md"
|
|
2402
|
+
error_spec.write_text(
|
|
2403
|
+
"# Spec\n\n## Verification\n\n"
|
|
2404
|
+
"- Invalid input must print a JSON error object to stderr and exit 2.\n"
|
|
2405
|
+
)
|
|
2406
|
+
(error_devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
2407
|
+
"source": {"type": "spec", "spec_path": str(error_spec)}
|
|
2408
|
+
}))
|
|
2409
|
+
(error_devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
2410
|
+
"id": "P6",
|
|
2411
|
+
"derived_from": "Invalid input must print a JSON error object to stderr and exit 2.",
|
|
2412
|
+
"cmd": "printf weak-error-contract",
|
|
2413
|
+
"exit_code": 0,
|
|
2414
|
+
"tags": ["stdout_stderr_contract", "error_contract"],
|
|
2415
|
+
"tag_evidence": {
|
|
2416
|
+
"stdout_stderr_contract": ["asserts_named_stream_output"],
|
|
2417
|
+
"error_contract": ["asserts_error_payload_or_stderr"],
|
|
2418
|
+
},
|
|
2419
|
+
}) + "\n")
|
|
2420
|
+
weak_error_contract = subprocess.run(
|
|
2421
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
2422
|
+
cwd=error_root,
|
|
2423
|
+
capture_output=True,
|
|
2424
|
+
text=True,
|
|
2425
|
+
)
|
|
2426
|
+
if weak_error_contract.returncode == 0:
|
|
2427
|
+
print("error_contract without exit-code evidence was accepted", file=sys.stderr)
|
|
2428
|
+
return 1
|
|
2429
|
+
|
|
2430
|
+
(error_devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
2431
|
+
"id": "P7",
|
|
2432
|
+
"derived_from": "Invalid input must print a JSON error object to stderr and exit 2.",
|
|
2433
|
+
"cmd": "printf weak-stdio-contract",
|
|
2434
|
+
"exit_code": 2,
|
|
2435
|
+
"tags": ["stdout_stderr_contract", "error_contract"],
|
|
2436
|
+
"tag_evidence": {
|
|
2437
|
+
"stdout_stderr_contract": [],
|
|
2438
|
+
"error_contract": [
|
|
2439
|
+
"asserts_error_payload_or_stderr",
|
|
2440
|
+
"asserts_nonzero_or_exit_2",
|
|
2441
|
+
],
|
|
2442
|
+
},
|
|
2443
|
+
}) + "\n")
|
|
2444
|
+
weak_stdio_contract = subprocess.run(
|
|
2445
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
2446
|
+
cwd=error_root,
|
|
2447
|
+
capture_output=True,
|
|
2448
|
+
text=True,
|
|
2449
|
+
)
|
|
2450
|
+
if weak_stdio_contract.returncode == 0:
|
|
2451
|
+
print("stdout_stderr_contract without stream evidence was accepted", file=sys.stderr)
|
|
2452
|
+
return 1
|
|
2453
|
+
|
|
2454
|
+
(error_devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
2455
|
+
"id": "P7b",
|
|
2456
|
+
"derived_from": "Invalid input must print a JSON error object to stderr and exit 2.",
|
|
2457
|
+
"cmd": "printf weak-json-error-shape-contract",
|
|
2458
|
+
"exit_code": 2,
|
|
2459
|
+
"tags": ["stdout_stderr_contract", "error_contract"],
|
|
2460
|
+
"tag_evidence": {
|
|
2461
|
+
"stdout_stderr_contract": ["asserts_named_stream_output"],
|
|
2462
|
+
"error_contract": [
|
|
2463
|
+
"asserts_error_payload_or_stderr",
|
|
2464
|
+
"asserts_nonzero_or_exit_2",
|
|
2465
|
+
],
|
|
2466
|
+
},
|
|
2467
|
+
}) + "\n")
|
|
2468
|
+
weak_json_error_shape_contract = subprocess.run(
|
|
2469
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
2470
|
+
cwd=error_root,
|
|
2471
|
+
capture_output=True,
|
|
2472
|
+
text=True,
|
|
2473
|
+
)
|
|
2474
|
+
if weak_json_error_shape_contract.returncode == 0:
|
|
2475
|
+
print("JSON error object text did not require shape_contract tag", file=sys.stderr)
|
|
2476
|
+
return 1
|
|
2477
|
+
|
|
2478
|
+
(error_devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
2479
|
+
"id": "P7c",
|
|
2480
|
+
"derived_from": "Invalid input must print a JSON error object to stderr and exit 2.",
|
|
2481
|
+
"cmd": "printf json-error-shape-contract",
|
|
2482
|
+
"exit_code": 2,
|
|
2483
|
+
"tags": ["stdout_stderr_contract", "error_contract", "shape_contract"],
|
|
2484
|
+
"tag_evidence": {
|
|
2485
|
+
"stdout_stderr_contract": ["asserts_named_stream_output"],
|
|
2486
|
+
"error_contract": [
|
|
2487
|
+
"asserts_error_payload_or_stderr",
|
|
2488
|
+
"asserts_nonzero_or_exit_2",
|
|
2489
|
+
],
|
|
2490
|
+
"shape_contract": [
|
|
2491
|
+
"uses_visible_input_key_names",
|
|
2492
|
+
"asserts_visible_output_key_names",
|
|
2493
|
+
"asserts_no_unexpected_output_keys",
|
|
2494
|
+
"asserts_exact_error_object",
|
|
2495
|
+
],
|
|
2496
|
+
},
|
|
2497
|
+
}) + "\n")
|
|
2498
|
+
strong_json_error_shape_contract = subprocess.run(
|
|
2499
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
2500
|
+
cwd=error_root,
|
|
2501
|
+
capture_output=True,
|
|
2502
|
+
text=True,
|
|
2503
|
+
)
|
|
2504
|
+
if strong_json_error_shape_contract.returncode != 0:
|
|
2505
|
+
print("JSON error object shape_contract with exact object evidence was rejected", file=sys.stderr)
|
|
2506
|
+
print(strong_json_error_shape_contract.stderr, file=sys.stderr)
|
|
2507
|
+
return 1
|
|
2508
|
+
|
|
2509
|
+
inline_json_error = (
|
|
2510
|
+
'Invalid input prints JSON error `{ "error": "invalid" }` to stderr and exits 2.'
|
|
2511
|
+
)
|
|
2512
|
+
error_spec.write_text("# Spec\n\n## Verification\n\n- " + inline_json_error + "\n")
|
|
2513
|
+
(error_devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
2514
|
+
"id": "P7d",
|
|
2515
|
+
"derived_from": inline_json_error,
|
|
2516
|
+
"cmd": "printf weak-inline-json-error-shape-contract",
|
|
2517
|
+
"exit_code": 2,
|
|
2518
|
+
"tags": ["stdout_stderr_contract", "error_contract"],
|
|
2519
|
+
"tag_evidence": {
|
|
2520
|
+
"stdout_stderr_contract": ["asserts_named_stream_output"],
|
|
2521
|
+
"error_contract": [
|
|
2522
|
+
"asserts_error_payload_or_stderr",
|
|
2523
|
+
"asserts_nonzero_or_exit_2",
|
|
2524
|
+
],
|
|
2525
|
+
},
|
|
2526
|
+
}) + "\n")
|
|
2527
|
+
weak_inline_json_error_shape_contract = subprocess.run(
|
|
2528
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
2529
|
+
cwd=error_root,
|
|
2530
|
+
capture_output=True,
|
|
2531
|
+
text=True,
|
|
2532
|
+
)
|
|
2533
|
+
if weak_inline_json_error_shape_contract.returncode == 0:
|
|
2534
|
+
print("inline JSON error text did not require shape_contract tag", file=sys.stderr)
|
|
2535
|
+
return 1
|
|
2536
|
+
|
|
2537
|
+
(error_devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
2538
|
+
"id": "P7e",
|
|
2539
|
+
"derived_from": inline_json_error,
|
|
2540
|
+
"cmd": "printf inline-json-error-shape-contract",
|
|
2541
|
+
"exit_code": 2,
|
|
2542
|
+
"tags": ["stdout_stderr_contract", "error_contract", "shape_contract"],
|
|
2543
|
+
"tag_evidence": {
|
|
2544
|
+
"stdout_stderr_contract": ["asserts_named_stream_output"],
|
|
2545
|
+
"error_contract": [
|
|
2546
|
+
"asserts_error_payload_or_stderr",
|
|
2547
|
+
"asserts_nonzero_or_exit_2",
|
|
2548
|
+
],
|
|
2549
|
+
"shape_contract": [
|
|
2550
|
+
"uses_visible_input_key_names",
|
|
2551
|
+
"asserts_visible_output_key_names",
|
|
2552
|
+
"asserts_no_unexpected_output_keys",
|
|
2553
|
+
"asserts_exact_error_object",
|
|
2554
|
+
],
|
|
2555
|
+
},
|
|
2556
|
+
}) + "\n")
|
|
2557
|
+
strong_inline_json_error_shape_contract = subprocess.run(
|
|
2558
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
2559
|
+
cwd=error_root,
|
|
2560
|
+
capture_output=True,
|
|
2561
|
+
text=True,
|
|
2562
|
+
)
|
|
2563
|
+
if strong_inline_json_error_shape_contract.returncode != 0:
|
|
2564
|
+
print("inline JSON error shape_contract with exact object evidence was rejected", file=sys.stderr)
|
|
2565
|
+
print(strong_inline_json_error_shape_contract.stderr, file=sys.stderr)
|
|
2566
|
+
return 1
|
|
2567
|
+
|
|
2568
|
+
http_error_root = work / "http-error-contract-risk-probe"
|
|
2569
|
+
http_error_root.mkdir()
|
|
2570
|
+
http_error_devlyn = http_error_root / ".devlyn"
|
|
2571
|
+
http_error_devlyn.mkdir()
|
|
2572
|
+
http_error_spec = http_error_root / "spec.md"
|
|
2573
|
+
http_error_spec.write_text(
|
|
2574
|
+
"# Spec\n\n## Verification\n\n"
|
|
2575
|
+
"- An invalid query returns HTTP 400 with JSON error body `{ \"error\": \"invalid_query\", \"field\": \"per_page\" }`.\n"
|
|
2576
|
+
)
|
|
2577
|
+
(http_error_devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
2578
|
+
"source": {"type": "spec", "spec_path": str(http_error_spec)}
|
|
2579
|
+
}))
|
|
2580
|
+
(http_error_devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
2581
|
+
"id": "P8",
|
|
2582
|
+
"derived_from": (
|
|
2583
|
+
"An invalid query returns HTTP 400 with JSON error body "
|
|
2584
|
+
"`{ \"error\": \"invalid_query\", \"field\": \"per_page\" }`."
|
|
2585
|
+
),
|
|
2586
|
+
"cmd": "printf weak-http-error-contract",
|
|
2587
|
+
"exit_code": 0,
|
|
2588
|
+
"tags": ["shape_contract"],
|
|
2589
|
+
"tag_evidence": {},
|
|
2590
|
+
}) + "\n")
|
|
2591
|
+
weak_http_error_contract = subprocess.run(
|
|
2592
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
2593
|
+
cwd=http_error_root,
|
|
2594
|
+
capture_output=True,
|
|
2595
|
+
text=True,
|
|
2596
|
+
)
|
|
2597
|
+
if weak_http_error_contract.returncode == 0:
|
|
2598
|
+
print("http error text did not require http_error_contract tag", file=sys.stderr)
|
|
2599
|
+
return 1
|
|
2600
|
+
|
|
2601
|
+
(http_error_devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
2602
|
+
"id": "P8b",
|
|
2603
|
+
"derived_from": (
|
|
2604
|
+
"An invalid query returns HTTP 400 with JSON error body "
|
|
2605
|
+
"`{ \"error\": \"invalid_query\", \"field\": \"per_page\" }`."
|
|
2606
|
+
),
|
|
2607
|
+
"cmd": "printf http-error-contract",
|
|
2608
|
+
"exit_code": 0,
|
|
2609
|
+
"tags": ["http_error_contract"],
|
|
2610
|
+
"tag_evidence": {
|
|
2611
|
+
"http_error_contract": ["asserts_http_error_status"],
|
|
2612
|
+
},
|
|
2613
|
+
}) + "\n")
|
|
2614
|
+
incomplete_http_error_contract = subprocess.run(
|
|
2615
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
2616
|
+
cwd=http_error_root,
|
|
2617
|
+
capture_output=True,
|
|
2618
|
+
text=True,
|
|
2619
|
+
)
|
|
2620
|
+
if incomplete_http_error_contract.returncode == 0:
|
|
2621
|
+
print("http_error_contract without payload evidence was accepted", file=sys.stderr)
|
|
2622
|
+
return 1
|
|
2623
|
+
|
|
2624
|
+
(http_error_devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
2625
|
+
"id": "P8c",
|
|
2626
|
+
"derived_from": (
|
|
2627
|
+
"An invalid query returns HTTP 400 with JSON error body "
|
|
2628
|
+
"`{ \"error\": \"invalid_query\", \"field\": \"per_page\" }`."
|
|
2629
|
+
),
|
|
2630
|
+
"cmd": "printf weak-exact-error-shape-contract",
|
|
2631
|
+
"exit_code": 0,
|
|
2632
|
+
"tags": ["http_error_contract", "shape_contract"],
|
|
2633
|
+
"tag_evidence": {
|
|
2634
|
+
"http_error_contract": [
|
|
2635
|
+
"asserts_http_error_status",
|
|
2636
|
+
"asserts_error_payload_body",
|
|
2637
|
+
],
|
|
2638
|
+
"shape_contract": [
|
|
2639
|
+
"uses_visible_input_key_names",
|
|
2640
|
+
"asserts_visible_output_key_names",
|
|
2641
|
+
"asserts_no_unexpected_output_keys",
|
|
2642
|
+
],
|
|
2643
|
+
},
|
|
2644
|
+
}) + "\n")
|
|
2645
|
+
weak_exact_error_shape_contract = subprocess.run(
|
|
2646
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
2647
|
+
cwd=http_error_root,
|
|
2648
|
+
capture_output=True,
|
|
2649
|
+
text=True,
|
|
2650
|
+
)
|
|
2651
|
+
if weak_exact_error_shape_contract.returncode == 0:
|
|
2652
|
+
print("exact error body shape_contract without exact object evidence was accepted", file=sys.stderr)
|
|
2653
|
+
return 1
|
|
2654
|
+
|
|
2655
|
+
(http_error_devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
2656
|
+
"id": "P8d",
|
|
2657
|
+
"derived_from": (
|
|
2658
|
+
"An invalid query returns HTTP 400 with JSON error body "
|
|
2659
|
+
"`{ \"error\": \"invalid_query\", \"field\": \"per_page\" }`."
|
|
2660
|
+
),
|
|
2661
|
+
"cmd": "printf exact-error-shape-contract",
|
|
2662
|
+
"exit_code": 0,
|
|
2663
|
+
"tags": ["http_error_contract", "shape_contract"],
|
|
2664
|
+
"tag_evidence": {
|
|
2665
|
+
"http_error_contract": [
|
|
2666
|
+
"asserts_http_error_status",
|
|
2667
|
+
"asserts_error_payload_body",
|
|
2668
|
+
],
|
|
2669
|
+
"shape_contract": [
|
|
2670
|
+
"uses_visible_input_key_names",
|
|
2671
|
+
"asserts_visible_output_key_names",
|
|
2672
|
+
"asserts_no_unexpected_output_keys",
|
|
2673
|
+
"asserts_exact_error_object",
|
|
2674
|
+
],
|
|
2675
|
+
},
|
|
2676
|
+
}) + "\n")
|
|
2677
|
+
strong_exact_error_shape_contract = subprocess.run(
|
|
2678
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
2679
|
+
cwd=http_error_root,
|
|
2680
|
+
capture_output=True,
|
|
2681
|
+
text=True,
|
|
2682
|
+
)
|
|
2683
|
+
if strong_exact_error_shape_contract.returncode != 0:
|
|
2684
|
+
print("exact error body shape_contract with exact object evidence was rejected", file=sys.stderr)
|
|
2685
|
+
print(strong_exact_error_shape_contract.stderr, file=sys.stderr)
|
|
2686
|
+
return 1
|
|
2687
|
+
|
|
2688
|
+
shape_root = work / "exact-shape-risk-probe"
|
|
2689
|
+
shape_root.mkdir()
|
|
2690
|
+
shape_devlyn = shape_root / ".devlyn"
|
|
2691
|
+
shape_devlyn.mkdir()
|
|
2692
|
+
shape_spec = shape_root / "spec.md"
|
|
2693
|
+
shape_spec.write_text(
|
|
2694
|
+
"# Spec\n\n## Verification\n\n"
|
|
2695
|
+
"- On success, output is one JSON object with keys `applied`, `rejected`, and `accounts`; "
|
|
2696
|
+
"`rejected` rows have keys `id` and `reason`.\n"
|
|
2697
|
+
)
|
|
2698
|
+
(shape_devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
2699
|
+
"source": {"type": "spec", "spec_path": str(shape_spec)}
|
|
2700
|
+
}))
|
|
2701
|
+
(shape_devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
2702
|
+
"id": "P8c",
|
|
2703
|
+
"derived_from": (
|
|
2704
|
+
"On success, output is one JSON object with keys `applied`, `rejected`, and `accounts`; "
|
|
2705
|
+
"`rejected` rows have keys `id` and `reason`."
|
|
2706
|
+
),
|
|
2707
|
+
"cmd": "printf weak-shape-contract",
|
|
2708
|
+
"exit_code": 0,
|
|
2709
|
+
"tags": ["shape_contract"],
|
|
2710
|
+
"tag_evidence": {},
|
|
2711
|
+
}) + "\n")
|
|
2712
|
+
weak_shape_contract = subprocess.run(
|
|
2713
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
2714
|
+
cwd=shape_root,
|
|
2715
|
+
capture_output=True,
|
|
2716
|
+
text=True,
|
|
2717
|
+
)
|
|
2718
|
+
if weak_shape_contract.returncode == 0:
|
|
2719
|
+
print("shape_contract without exact key evidence was accepted", file=sys.stderr)
|
|
2720
|
+
return 1
|
|
2721
|
+
|
|
2722
|
+
(shape_devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
2723
|
+
"id": "P8d",
|
|
2724
|
+
"derived_from": (
|
|
2725
|
+
"On success, output is one JSON object with keys `applied`, `rejected`, and `accounts`; "
|
|
2726
|
+
"`rejected` rows have keys `id` and `reason`."
|
|
2727
|
+
),
|
|
2728
|
+
"cmd": "printf shape-contract",
|
|
2729
|
+
"exit_code": 0,
|
|
2730
|
+
"tags": ["shape_contract"],
|
|
2731
|
+
"tag_evidence": {
|
|
2732
|
+
"shape_contract": [
|
|
2733
|
+
"uses_visible_input_key_names",
|
|
2734
|
+
"asserts_visible_output_key_names",
|
|
2735
|
+
"asserts_no_unexpected_output_keys",
|
|
2736
|
+
],
|
|
2737
|
+
},
|
|
2738
|
+
}) + "\n")
|
|
2739
|
+
strong_shape_contract = subprocess.run(
|
|
2740
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
2741
|
+
cwd=shape_root,
|
|
2742
|
+
capture_output=True,
|
|
2743
|
+
text=True,
|
|
2744
|
+
)
|
|
2745
|
+
if strong_shape_contract.returncode != 0:
|
|
2746
|
+
print("shape_contract with exact key evidence was rejected", file=sys.stderr)
|
|
2747
|
+
print(strong_shape_contract.stderr, file=sys.stderr)
|
|
2748
|
+
return 1
|
|
2749
|
+
|
|
2750
|
+
inline_json_success = 'On success, stdout is `{ "id": "acct_1", "status": "accepted" }`.'
|
|
2751
|
+
shape_spec.write_text("# Spec\n\n## Verification\n\n- " + inline_json_success + "\n")
|
|
2752
|
+
(shape_devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
2753
|
+
"id": "P8e",
|
|
2754
|
+
"derived_from": inline_json_success,
|
|
2755
|
+
"cmd": "printf weak-inline-json-shape-contract",
|
|
2756
|
+
"exit_code": 0,
|
|
2757
|
+
"tags": ["stdout_stderr_contract"],
|
|
2758
|
+
"tag_evidence": {
|
|
2759
|
+
"stdout_stderr_contract": ["asserts_named_stream_output"],
|
|
2760
|
+
},
|
|
2761
|
+
}) + "\n")
|
|
2762
|
+
weak_inline_json_shape_contract = subprocess.run(
|
|
2763
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
2764
|
+
cwd=shape_root,
|
|
2765
|
+
capture_output=True,
|
|
2766
|
+
text=True,
|
|
2767
|
+
)
|
|
2768
|
+
if weak_inline_json_shape_contract.returncode == 0:
|
|
2769
|
+
print("inline JSON object text did not require shape_contract tag", file=sys.stderr)
|
|
2770
|
+
return 1
|
|
2771
|
+
|
|
2772
|
+
(shape_devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
2773
|
+
"id": "P8f",
|
|
2774
|
+
"derived_from": inline_json_success,
|
|
2775
|
+
"cmd": "printf inline-json-shape-contract",
|
|
2776
|
+
"exit_code": 0,
|
|
2777
|
+
"tags": ["stdout_stderr_contract", "shape_contract"],
|
|
2778
|
+
"tag_evidence": {
|
|
2779
|
+
"stdout_stderr_contract": ["asserts_named_stream_output"],
|
|
2780
|
+
"shape_contract": [
|
|
2781
|
+
"uses_visible_input_key_names",
|
|
2782
|
+
"asserts_visible_output_key_names",
|
|
2783
|
+
"asserts_no_unexpected_output_keys",
|
|
2784
|
+
],
|
|
2785
|
+
},
|
|
2786
|
+
}) + "\n")
|
|
2787
|
+
strong_inline_json_shape_contract = subprocess.run(
|
|
2788
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
2789
|
+
cwd=shape_root,
|
|
2790
|
+
capture_output=True,
|
|
2791
|
+
text=True,
|
|
2792
|
+
)
|
|
2793
|
+
if strong_inline_json_shape_contract.returncode != 0:
|
|
2794
|
+
print("inline JSON object shape_contract with key evidence was rejected", file=sys.stderr)
|
|
2795
|
+
print(strong_inline_json_shape_contract.stderr, file=sys.stderr)
|
|
2796
|
+
return 1
|
|
2797
|
+
|
|
2798
|
+
forbidden_text_root = work / "forbidden-pattern-risk-probe"
|
|
2799
|
+
forbidden_text_root.mkdir()
|
|
2800
|
+
forbidden_text_devlyn = forbidden_text_root / ".devlyn"
|
|
2801
|
+
forbidden_text_devlyn.mkdir()
|
|
2802
|
+
forbidden_text_spec = forbidden_text_root / "spec.md"
|
|
2803
|
+
forbidden_text_spec.write_text(
|
|
2804
|
+
"# Spec\n\n## Verification\n\n"
|
|
2805
|
+
"- The diff must not add forbidden fallback patterns.\n"
|
|
2806
|
+
)
|
|
2807
|
+
(forbidden_text_devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
2808
|
+
"source": {"type": "spec", "spec_path": str(forbidden_text_spec)}
|
|
2809
|
+
}))
|
|
2810
|
+
(forbidden_text_devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
2811
|
+
"id": "P8",
|
|
2812
|
+
"derived_from": "The diff must not add forbidden fallback patterns.",
|
|
2813
|
+
"cmd": "printf forbidden-pattern-static-check",
|
|
2814
|
+
"exit_code": 0,
|
|
2815
|
+
"tags": ["shape_contract"],
|
|
2816
|
+
"tag_evidence": {},
|
|
2817
|
+
}) + "\n")
|
|
2818
|
+
forbidden_pattern_probe = subprocess.run(
|
|
2819
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
2820
|
+
cwd=forbidden_text_root,
|
|
2821
|
+
capture_output=True,
|
|
2822
|
+
text=True,
|
|
2823
|
+
)
|
|
2824
|
+
if forbidden_pattern_probe.returncode != 0:
|
|
2825
|
+
print("generic forbidden-pattern verification text incorrectly required boundary_overlap", file=sys.stderr)
|
|
2826
|
+
print(forbidden_pattern_probe.stderr, file=sys.stderr)
|
|
2827
|
+
return 1
|
|
2828
|
+
|
|
2829
|
+
stock_validation_root = work / "stock-validation-risk-probe"
|
|
2830
|
+
stock_validation_root.mkdir()
|
|
2831
|
+
stock_validation_devlyn = stock_validation_root / ".devlyn"
|
|
2832
|
+
stock_validation_devlyn.mkdir()
|
|
2833
|
+
stock_validation_spec = stock_validation_root / "spec.md"
|
|
2834
|
+
stock_validation_spec.write_text(
|
|
2835
|
+
"# Spec\n\n## Verification\n\n"
|
|
2836
|
+
"- A quote over combined stock exits `2`, prints one JSON error to stderr, and prints no stdout.\n"
|
|
2837
|
+
)
|
|
2838
|
+
(stock_validation_devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
2839
|
+
"source": {"type": "spec", "spec_path": str(stock_validation_spec)}
|
|
2840
|
+
}))
|
|
2841
|
+
(stock_validation_devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
2842
|
+
"id": "P9",
|
|
2843
|
+
"derived_from": (
|
|
2844
|
+
"A quote over combined stock exits `2`, prints one JSON error "
|
|
2845
|
+
"to stderr, and prints no stdout."
|
|
2846
|
+
),
|
|
2847
|
+
"cmd": "printf stock-validation-error",
|
|
2848
|
+
"exit_code": 2,
|
|
2849
|
+
"tags": ["stdout_stderr_contract", "error_contract"],
|
|
2850
|
+
"tag_evidence": {
|
|
2851
|
+
"stdout_stderr_contract": ["asserts_named_stream_output"],
|
|
2852
|
+
"error_contract": [
|
|
2853
|
+
"asserts_error_payload_or_stderr",
|
|
2854
|
+
"asserts_nonzero_or_exit_2",
|
|
2855
|
+
],
|
|
2856
|
+
},
|
|
2857
|
+
}) + "\n")
|
|
2858
|
+
stock_validation_probe = subprocess.run(
|
|
2859
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
2860
|
+
cwd=stock_validation_root,
|
|
2861
|
+
capture_output=True,
|
|
2862
|
+
text=True,
|
|
2863
|
+
)
|
|
2864
|
+
if stock_validation_probe.returncode != 0:
|
|
2865
|
+
print("stock validation error text incorrectly required prior_consumption", file=sys.stderr)
|
|
2866
|
+
print(stock_validation_probe.stderr, file=sys.stderr)
|
|
2867
|
+
return 1
|
|
2868
|
+
|
|
2869
|
+
webhook_root = work / "webhook-risk-probe"
|
|
2870
|
+
webhook_root.mkdir()
|
|
2871
|
+
webhook_devlyn = webhook_root / ".devlyn"
|
|
2872
|
+
webhook_devlyn.mkdir()
|
|
2873
|
+
webhook_spec = webhook_root / "spec.md"
|
|
2874
|
+
webhook_spec.write_text(
|
|
2875
|
+
"# Spec\n\n## Verification\n\n"
|
|
2876
|
+
"- A POST whose body has been modified after signing returns 401.\n"
|
|
2877
|
+
"- A second POST with the same accepted `id` returns 409 even if the duplicate body would otherwise fail shape validation.\n"
|
|
2878
|
+
)
|
|
2879
|
+
(webhook_devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
2880
|
+
"source": {"type": "spec", "spec_path": str(webhook_spec)}
|
|
2881
|
+
}))
|
|
2882
|
+
(webhook_devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
2883
|
+
"id": "P10",
|
|
2884
|
+
"derived_from": "A POST whose body has been modified after signing returns 401.",
|
|
2885
|
+
"cmd": "printf weak-webhook",
|
|
2886
|
+
"exit_code": 0,
|
|
2887
|
+
"tags": ["shape_contract"],
|
|
2888
|
+
"tag_evidence": {},
|
|
2889
|
+
}) + "\n")
|
|
2890
|
+
weak_webhook_probe = subprocess.run(
|
|
2891
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
2892
|
+
cwd=webhook_root,
|
|
2893
|
+
capture_output=True,
|
|
2894
|
+
text=True,
|
|
2895
|
+
)
|
|
2896
|
+
if weak_webhook_probe.returncode == 0:
|
|
2897
|
+
print("webhook signature/replay text did not require auth/idempotency probe tags", file=sys.stderr)
|
|
2898
|
+
return 1
|
|
2899
|
+
|
|
2900
|
+
duplicate_sku_root = work / "duplicate-sku-risk-probe"
|
|
2901
|
+
duplicate_sku_root.mkdir()
|
|
2902
|
+
duplicate_sku_devlyn = duplicate_sku_root / ".devlyn"
|
|
2903
|
+
duplicate_sku_devlyn.mkdir()
|
|
2904
|
+
duplicate_sku_spec = duplicate_sku_root / "spec.md"
|
|
2905
|
+
duplicate_sku_spec.write_text(
|
|
2906
|
+
"# Spec\n\n## Verification\n\n"
|
|
2907
|
+
"- A cart with duplicate SKUs combines quantities before stock validation.\n"
|
|
2908
|
+
)
|
|
2909
|
+
(duplicate_sku_devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
2910
|
+
"source": {"type": "spec", "spec_path": str(duplicate_sku_spec)}
|
|
2911
|
+
}))
|
|
2912
|
+
(duplicate_sku_devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
2913
|
+
"id": "P11",
|
|
2914
|
+
"derived_from": "A cart with duplicate SKUs combines quantities before stock validation.",
|
|
2915
|
+
"cmd": "printf duplicate-sku-shape",
|
|
2916
|
+
"exit_code": 0,
|
|
2917
|
+
"tags": ["shape_contract"],
|
|
2918
|
+
"tag_evidence": {},
|
|
2919
|
+
}) + "\n")
|
|
2920
|
+
duplicate_sku_probe = subprocess.run(
|
|
2921
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
2922
|
+
cwd=duplicate_sku_root,
|
|
2923
|
+
capture_output=True,
|
|
2924
|
+
text=True,
|
|
2925
|
+
)
|
|
2926
|
+
if duplicate_sku_probe.returncode != 0:
|
|
2927
|
+
print("duplicate SKU verification text incorrectly required idempotency_replay", file=sys.stderr)
|
|
2928
|
+
print(duplicate_sku_probe.stderr, file=sys.stderr)
|
|
2929
|
+
return 1
|
|
2930
|
+
|
|
2931
|
+
concurrent_root = work / "concurrent-state-risk-probe"
|
|
2932
|
+
concurrent_root.mkdir()
|
|
2933
|
+
concurrent_devlyn = concurrent_root / ".devlyn"
|
|
2934
|
+
concurrent_devlyn.mkdir()
|
|
2935
|
+
concurrent_spec = concurrent_root / "spec.md"
|
|
2936
|
+
concurrent_spec.write_text(
|
|
2937
|
+
"# Spec\n\n## Verification\n\n"
|
|
2938
|
+
"- Several POST requests close together must all appear exactly once "
|
|
2939
|
+
"in GET output with distinct ids.\n"
|
|
2940
|
+
)
|
|
2941
|
+
(concurrent_devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
2942
|
+
"source": {"type": "spec", "spec_path": str(concurrent_spec)}
|
|
2943
|
+
}))
|
|
2944
|
+
(concurrent_devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
2945
|
+
"id": "P12",
|
|
2946
|
+
"derived_from": (
|
|
2947
|
+
"Several POST requests close together must all appear exactly "
|
|
2948
|
+
"once in GET output with distinct ids."
|
|
2949
|
+
),
|
|
2950
|
+
"cmd": "printf weak-concurrent-state",
|
|
2951
|
+
"exit_code": 0,
|
|
2952
|
+
"tags": ["shape_contract"],
|
|
2953
|
+
"tag_evidence": {},
|
|
2954
|
+
}) + "\n")
|
|
2955
|
+
weak_concurrent_probe = subprocess.run(
|
|
2956
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
2957
|
+
cwd=concurrent_root,
|
|
2958
|
+
capture_output=True,
|
|
2959
|
+
text=True,
|
|
2960
|
+
)
|
|
2961
|
+
if weak_concurrent_probe.returncode == 0:
|
|
2962
|
+
print("concurrent state text did not require concurrent_state_consistency tag", file=sys.stderr)
|
|
2963
|
+
return 1
|
|
2964
|
+
|
|
2965
|
+
atomic_batch_root = work / "atomic-batch-risk-probe"
|
|
2966
|
+
atomic_batch_root.mkdir()
|
|
2967
|
+
atomic_batch_devlyn = atomic_batch_root / ".devlyn"
|
|
2968
|
+
atomic_batch_devlyn.mkdir()
|
|
2969
|
+
atomic_batch_spec = atomic_batch_root / "spec.md"
|
|
2970
|
+
atomic_batch_spec.write_text(
|
|
2971
|
+
"# Spec\n\n## Verification\n\n"
|
|
2972
|
+
"- A POST with one valid + one invalid item returns `400`, AND a subsequent GET returns the same list as before the import.\n"
|
|
2973
|
+
"- A POST with all-valid items returns `201`, and the items appear in GET output in order with distinct ids.\n"
|
|
2974
|
+
)
|
|
2975
|
+
(atomic_batch_devlyn / "pipeline.state.json").write_text(json.dumps({
|
|
2976
|
+
"source": {"type": "spec", "spec_path": str(atomic_batch_spec)}
|
|
2977
|
+
}))
|
|
2978
|
+
(atomic_batch_devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
2979
|
+
"id": "P13",
|
|
2980
|
+
"derived_from": (
|
|
2981
|
+
"A POST with one valid + one invalid item returns `400`, AND "
|
|
2982
|
+
"a subsequent GET returns the same list as before the import."
|
|
2983
|
+
),
|
|
2984
|
+
"cmd": "printf weak-atomic-batch",
|
|
2985
|
+
"exit_code": 0,
|
|
2986
|
+
"tags": ["shape_contract"],
|
|
2987
|
+
"tag_evidence": {},
|
|
2988
|
+
}) + "\n")
|
|
2989
|
+
weak_atomic_batch_probe = subprocess.run(
|
|
2990
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
2991
|
+
cwd=atomic_batch_root,
|
|
2992
|
+
capture_output=True,
|
|
2993
|
+
text=True,
|
|
2994
|
+
)
|
|
2995
|
+
if weak_atomic_batch_probe.returncode == 0:
|
|
2996
|
+
print("atomic batch text did not require atomic_batch_state tag", file=sys.stderr)
|
|
2997
|
+
return 1
|
|
2998
|
+
|
|
2999
|
+
(atomic_batch_devlyn / "risk-probes.jsonl").write_text(json.dumps({
|
|
3000
|
+
"id": "P13b",
|
|
3001
|
+
"derived_from": (
|
|
3002
|
+
"A POST with one valid + one invalid item returns `400`, AND "
|
|
3003
|
+
"a subsequent GET returns the same list as before the import."
|
|
3004
|
+
),
|
|
3005
|
+
"cmd": "printf incomplete-atomic-batch",
|
|
3006
|
+
"exit_code": 0,
|
|
3007
|
+
"tags": ["atomic_batch_state"],
|
|
3008
|
+
"tag_evidence": {
|
|
3009
|
+
"atomic_batch_state": [
|
|
3010
|
+
"mixed_valid_invalid_batch",
|
|
3011
|
+
"asserts_store_unchanged_after_failure",
|
|
3012
|
+
],
|
|
3013
|
+
},
|
|
3014
|
+
}) + "\n")
|
|
3015
|
+
incomplete_atomic_batch_probe = subprocess.run(
|
|
3016
|
+
[sys.executable, script_path, "--validate-risk-probes"],
|
|
3017
|
+
cwd=atomic_batch_root,
|
|
3018
|
+
capture_output=True,
|
|
3019
|
+
text=True,
|
|
3020
|
+
)
|
|
3021
|
+
if incomplete_atomic_batch_probe.returncode == 0:
|
|
3022
|
+
print("atomic_batch_state without success-order evidence was accepted", file=sys.stderr)
|
|
3023
|
+
return 1
|
|
3024
|
+
return 0
|
|
3025
|
+
|
|
3026
|
+
|
|
3027
|
+
def main() -> int:
|
|
3028
|
+
include_risk_probes = False
|
|
3029
|
+
validate_risk_probes_only = False
|
|
3030
|
+
if "--include-risk-probes" in sys.argv[1:]:
|
|
3031
|
+
include_risk_probes = True
|
|
3032
|
+
sys.argv = [arg for arg in sys.argv if arg != "--include-risk-probes"]
|
|
3033
|
+
if "--validate-risk-probes" in sys.argv[1:]:
|
|
528
3034
|
validate_risk_probes_only = True
|
|
529
3035
|
sys.argv = [arg for arg in sys.argv if arg != "--validate-risk-probes"]
|
|
530
3036
|
|
|
@@ -537,6 +3043,12 @@ def main() -> int:
|
|
|
537
3043
|
return 2
|
|
538
3044
|
return run_check_mode(Path(sys.argv[2]))
|
|
539
3045
|
|
|
3046
|
+
if len(sys.argv) >= 2 and sys.argv[1] == "--check-expected":
|
|
3047
|
+
if len(sys.argv) != 3:
|
|
3048
|
+
print("usage: spec-verify-check.py --check-expected <json-path>", file=sys.stderr)
|
|
3049
|
+
return 2
|
|
3050
|
+
return run_check_expected_mode(Path(sys.argv[2]))
|
|
3051
|
+
|
|
540
3052
|
bench_mode = "BENCH_WORKDIR" in os.environ
|
|
541
3053
|
work = Path(os.environ.get("BENCH_WORKDIR") or os.getcwd())
|
|
542
3054
|
devlyn_dir = work / ".devlyn"
|
|
@@ -550,15 +3062,17 @@ def main() -> int:
|
|
|
550
3062
|
# source-extract entirely. iter-0019.9 closes the F9 regression where
|
|
551
3063
|
# source-extract from an ideate-generated spec overwrote the
|
|
552
3064
|
# benchmark contract — for benchmarks, expected.json is canonical.
|
|
553
|
-
# 2. Otherwise,
|
|
3065
|
+
# 2. Otherwise, real-user source.type=="spec" first attempts the sibling
|
|
3066
|
+
# spec.expected.json next to spec.md. If present, validate it and stage
|
|
3067
|
+
# its verification_commands. If malformed, fail closed. If absent,
|
|
3068
|
+
# continue to legacy source-extract.
|
|
3069
|
+
# 3. Source-extract reads
|
|
554
3070
|
# `pipeline.state.json:source.{spec_path | criteria_path}`. If it has
|
|
555
|
-
# a json block, overwrite .devlyn/spec-verify.json with it.
|
|
556
|
-
#
|
|
557
|
-
# is stale (from a killed prior run) and must NOT be trusted.
|
|
558
|
-
# 3. If source has no json block AND source.type=="generated":
|
|
3071
|
+
# a json block, overwrite .devlyn/spec-verify.json with it.
|
|
3072
|
+
# 4. If source has no json block AND source.type=="generated":
|
|
559
3073
|
# CRITICAL spec-verify-malformed — generated criteria must ship a
|
|
560
|
-
# verifiable contract per
|
|
561
|
-
#
|
|
3074
|
+
# verifiable contract per the generated-criteria output contract.
|
|
3075
|
+
# 5. If source has no sibling/json block AND source.type=="spec":
|
|
562
3076
|
# - Real-user mode: silent no-op (preserves iter-0019.6 backward
|
|
563
3077
|
# compat for handwritten specs without the carrier). Drop any
|
|
564
3078
|
# stale pre-staged file.
|
|
@@ -568,6 +3082,14 @@ def main() -> int:
|
|
|
568
3082
|
pre_staged = spec_path.is_file() # captured BEFORE any potential write
|
|
569
3083
|
trust_bench_staged = bench_mode and pre_staged
|
|
570
3084
|
src_type, source_md = read_source(work, devlyn_dir)
|
|
3085
|
+
state = read_state(devlyn_dir)
|
|
3086
|
+
integrity_error = source_integrity_error(src_type, state, source_md)
|
|
3087
|
+
if integrity_error:
|
|
3088
|
+
print(f"[spec-verify] carrier malformed: {integrity_error}", file=sys.stderr)
|
|
3089
|
+
write_malformed_finding(devlyn_dir, integrity_error, source_md)
|
|
3090
|
+
return 1
|
|
3091
|
+
expected_data: dict | None = None
|
|
3092
|
+
expected_path: Path | None = None
|
|
571
3093
|
if validate_risk_probes_only:
|
|
572
3094
|
_risk_probes, risk_error = load_risk_probes(
|
|
573
3095
|
devlyn_dir, source_md, require_present=True
|
|
@@ -579,7 +3101,20 @@ def main() -> int:
|
|
|
579
3101
|
print("[spec-verify] risk probes valid", file=sys.stderr)
|
|
580
3102
|
return 0
|
|
581
3103
|
if source_md is not None and not trust_bench_staged:
|
|
582
|
-
|
|
3104
|
+
if src_type == "spec":
|
|
3105
|
+
expected_found, expected_staged, expected_error, expected_path, expected_data = stage_from_expected(
|
|
3106
|
+
source_md, devlyn_dir
|
|
3107
|
+
)
|
|
3108
|
+
if expected_error is not None:
|
|
3109
|
+
print(f"[spec-verify] carrier malformed: {expected_error}", file=sys.stderr)
|
|
3110
|
+
write_malformed_finding(devlyn_dir, expected_error, expected_path)
|
|
3111
|
+
return 1
|
|
3112
|
+
if expected_staged:
|
|
3113
|
+
staged, error = (True, None)
|
|
3114
|
+
else:
|
|
3115
|
+
staged, error = stage_from_source(source_md, devlyn_dir)
|
|
3116
|
+
else:
|
|
3117
|
+
staged, error = stage_from_source(source_md, devlyn_dir)
|
|
583
3118
|
if error is not None:
|
|
584
3119
|
print(f"[spec-verify] carrier malformed: {error}", file=sys.stderr)
|
|
585
3120
|
write_malformed_finding(devlyn_dir, error, source_md)
|
|
@@ -589,13 +3124,13 @@ def main() -> int:
|
|
|
589
3124
|
msg = (
|
|
590
3125
|
f"generated {source_md.name} must include a "
|
|
591
3126
|
"`## Verification` ```json``` block (verification_commands "
|
|
592
|
-
"array).
|
|
3127
|
+
"array). Generated criteria were written without one."
|
|
593
3128
|
)
|
|
594
3129
|
print(f"[spec-verify] {msg}", file=sys.stderr)
|
|
595
3130
|
write_malformed_finding(devlyn_dir, msg, source_md)
|
|
596
3131
|
return 1
|
|
597
3132
|
# source.type=="spec", no block in spec markdown.
|
|
598
|
-
if not bench_mode:
|
|
3133
|
+
if not bench_mode and expected_data is None:
|
|
599
3134
|
# Real-user handwritten spec: silent no-op. Drop any stale
|
|
600
3135
|
# pre-staged file so a killed prior run cannot poison this
|
|
601
3136
|
# run's gate.
|
|
@@ -615,31 +3150,42 @@ def main() -> int:
|
|
|
615
3150
|
spec_path.unlink()
|
|
616
3151
|
return 0
|
|
617
3152
|
|
|
3153
|
+
commands: list[dict] = []
|
|
618
3154
|
if not spec_path.exists():
|
|
619
3155
|
# No source markdown carrier AND no pre-staged file. Silent no-op
|
|
620
3156
|
# for benchmark misconfigurations (no fixture to gate against) and
|
|
621
3157
|
# for real-user runs without spec/criteria. Generated source case
|
|
622
3158
|
# is handled above.
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
3159
|
+
if expected_data is None:
|
|
3160
|
+
return 0
|
|
3161
|
+
else:
|
|
3162
|
+
try:
|
|
3163
|
+
spec = loads_strict_json(spec_path.read_text())
|
|
3164
|
+
except (ValueError, OSError) as e:
|
|
3165
|
+
print(f"[spec-verify] error: cannot parse {spec_path}: {e}", file=sys.stderr)
|
|
3166
|
+
return 2
|
|
630
3167
|
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
3168
|
+
# iter-0019.8 (Codex R2 #2): apply full shape validation to pre-staged
|
|
3169
|
+
# carriers too — bool exit_code, empty list, whitespace-only cmd were
|
|
3170
|
+
# silently accepted on the benchmark path. Empty list is rejected
|
|
3171
|
+
# because "all 0 commands passed" is vacuously true.
|
|
3172
|
+
shape_err = validate_shape(spec)
|
|
3173
|
+
if shape_err:
|
|
3174
|
+
print(f"[spec-verify] error: {spec_path}: {shape_err}", file=sys.stderr)
|
|
3175
|
+
write_malformed_finding(devlyn_dir, f"{spec_path}: {shape_err}", None)
|
|
3176
|
+
return 1
|
|
3177
|
+
commands = list(spec["verification_commands"])
|
|
641
3178
|
if include_risk_probes:
|
|
642
|
-
|
|
3179
|
+
risk_state_error = risk_probes_state_error(state)
|
|
3180
|
+
if risk_state_error:
|
|
3181
|
+
print(f"[spec-verify] risk probes malformed: {risk_state_error}", file=sys.stderr)
|
|
3182
|
+
write_malformed_finding(devlyn_dir, risk_state_error, Path("pipeline.state.json"))
|
|
3183
|
+
return 1
|
|
3184
|
+
risk_probes, risk_error = load_risk_probes(
|
|
3185
|
+
devlyn_dir,
|
|
3186
|
+
source_md,
|
|
3187
|
+
require_present=state_requires_risk_probes(state),
|
|
3188
|
+
)
|
|
643
3189
|
if risk_error:
|
|
644
3190
|
print(f"[spec-verify] risk probes malformed: {risk_error}", file=sys.stderr)
|
|
645
3191
|
write_malformed_finding(devlyn_dir, risk_error, devlyn_dir / "risk-probes.jsonl")
|
|
@@ -648,7 +3194,7 @@ def main() -> int:
|
|
|
648
3194
|
|
|
649
3195
|
devlyn_dir.mkdir(parents=True, exist_ok=True)
|
|
650
3196
|
results_path = devlyn_dir / "spec-verify.results.json"
|
|
651
|
-
findings_path = devlyn_dir /
|
|
3197
|
+
findings_path = devlyn_dir / output_findings_name()
|
|
652
3198
|
|
|
653
3199
|
verify_env = os.environ.copy()
|
|
654
3200
|
verify_env["BENCH_WORKDIR"] = str(work)
|
|
@@ -759,7 +3305,7 @@ def main() -> int:
|
|
|
759
3305
|
)
|
|
760
3306
|
|
|
761
3307
|
findings.append({
|
|
762
|
-
"id": f"
|
|
3308
|
+
"id": f"{output_finding_prefix()}-{finding_seq:04d}",
|
|
763
3309
|
"rule_id": rule_id,
|
|
764
3310
|
"level": "error",
|
|
765
3311
|
"severity": "CRITICAL",
|
|
@@ -767,7 +3313,7 @@ def main() -> int:
|
|
|
767
3313
|
"message": msg,
|
|
768
3314
|
"file": file_ref,
|
|
769
3315
|
"line": 1,
|
|
770
|
-
"phase":
|
|
3316
|
+
"phase": output_phase(),
|
|
771
3317
|
"criterion_ref": criterion_ref,
|
|
772
3318
|
"fix_hint": fix_hint,
|
|
773
3319
|
"blocking": True,
|
|
@@ -784,7 +3330,7 @@ def main() -> int:
|
|
|
784
3330
|
else "correctness.spec-literal-mismatch"
|
|
785
3331
|
)
|
|
786
3332
|
findings.append({
|
|
787
|
-
"id": f"
|
|
3333
|
+
"id": f"{output_finding_prefix()}-{finding_seq:04d}",
|
|
788
3334
|
"rule_id": rule_id,
|
|
789
3335
|
"level": "error",
|
|
790
3336
|
"severity": "CRITICAL",
|
|
@@ -794,7 +3340,7 @@ def main() -> int:
|
|
|
794
3340
|
),
|
|
795
3341
|
"file": ".devlyn/risk-probes.jsonl" if vc.get("_risk_probe") else ".devlyn/spec-verify.json",
|
|
796
3342
|
"line": 1,
|
|
797
|
-
"phase":
|
|
3343
|
+
"phase": output_phase(),
|
|
798
3344
|
"criterion_ref": (
|
|
799
3345
|
f"risk-probe:{vc.get('id')}"
|
|
800
3346
|
if vc.get("_risk_probe")
|
|
@@ -817,7 +3363,7 @@ def main() -> int:
|
|
|
817
3363
|
else "correctness.spec-literal-mismatch"
|
|
818
3364
|
)
|
|
819
3365
|
findings.append({
|
|
820
|
-
"id": f"
|
|
3366
|
+
"id": f"{output_finding_prefix()}-{finding_seq:04d}",
|
|
821
3367
|
"rule_id": rule_id,
|
|
822
3368
|
"level": "error",
|
|
823
3369
|
"severity": "CRITICAL",
|
|
@@ -828,7 +3374,7 @@ def main() -> int:
|
|
|
828
3374
|
),
|
|
829
3375
|
"file": ".devlyn/risk-probes.jsonl" if vc.get("_risk_probe") else ".devlyn/spec-verify.json",
|
|
830
3376
|
"line": 1,
|
|
831
|
-
"phase":
|
|
3377
|
+
"phase": output_phase(),
|
|
832
3378
|
"criterion_ref": (
|
|
833
3379
|
f"risk-probe:{vc.get('id')}"
|
|
834
3380
|
if vc.get("_risk_probe")
|
|
@@ -843,6 +3389,16 @@ def main() -> int:
|
|
|
843
3389
|
})
|
|
844
3390
|
finding_seq += 1
|
|
845
3391
|
|
|
3392
|
+
expected_findings, finding_seq = expected_contract_findings(
|
|
3393
|
+
expected_data,
|
|
3394
|
+
expected_path,
|
|
3395
|
+
work,
|
|
3396
|
+
devlyn_dir,
|
|
3397
|
+
state,
|
|
3398
|
+
finding_seq,
|
|
3399
|
+
)
|
|
3400
|
+
findings.extend(expected_findings)
|
|
3401
|
+
|
|
846
3402
|
results_path.write_text(json.dumps({"commands": results}, indent=2) + "\n")
|
|
847
3403
|
|
|
848
3404
|
# Append findings (jsonl). BUILD_GATE merge step concatenates this onto
|
|
@@ -853,10 +3409,11 @@ def main() -> int:
|
|
|
853
3409
|
fh.write(json.dumps(f) + "\n")
|
|
854
3410
|
|
|
855
3411
|
failed = [r for r in results if r.get("pass") is False]
|
|
856
|
-
if
|
|
3412
|
+
blocking_findings = [f for f in findings if f.get("severity") in {"CRITICAL", "HIGH"}]
|
|
3413
|
+
if failed or blocking_findings:
|
|
857
3414
|
print(
|
|
858
3415
|
f"[spec-verify] {len(failed)}/{len(results)} command(s) failed; "
|
|
859
|
-
f"{len(findings)}
|
|
3416
|
+
f"{len(findings)} finding(s) written to {findings_path}",
|
|
860
3417
|
file=sys.stderr,
|
|
861
3418
|
)
|
|
862
3419
|
return 1
|