devlyn-cli 2.0.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +1 -1
- package/README.md +1 -1
- package/benchmark/auto-resolve/README.md +318 -2
- package/benchmark/auto-resolve/RUBRIC.md +6 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +52 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +51 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +52 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +62 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +65 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +71 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +65 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/NOTES.md +24 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/setup.sh +22 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/spec.md +62 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/exact-success.js +48 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/insufficient-balance.js +36 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/rules-source.js +55 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/NOTES.md +20 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/setup.sh +23 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/spec.md +66 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/task.txt +11 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/exact-success.js +44 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/rules-source.js +58 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/unavailable-inventory.js +35 -0
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
- package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
- package/benchmark/auto-resolve/scripts/judge.sh +82 -3
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +234 -40
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
- package/bin/devlyn.js +56 -10
- package/config/skills/_shared/archive_run.py +3 -0
- package/config/skills/_shared/codex-config.md +2 -2
- package/config/skills/_shared/codex-monitored.sh +72 -7
- package/config/skills/_shared/collect-codex-findings.py +125 -0
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/expected.schema.json +18 -0
- package/config/skills/_shared/spec-verify-check.py +312 -10
- package/config/skills/_shared/verify-merge-findings.py +327 -0
- package/config/skills/devlyn:ideate/SKILL.md +1 -1
- package/config/skills/devlyn:resolve/SKILL.md +62 -8
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +164 -0
- package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
- package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
- package/package.json +1 -1
- package/scripts/lint-skills.sh +32 -0
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Normalize raw Codex pair-JUDGE stdout into canonical VERIFY JSONL."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import argparse
|
|
7
|
+
import json
|
|
8
|
+
import pathlib
|
|
9
|
+
import sys
|
|
10
|
+
import tempfile
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
FINDING_SEVERITIES = {"CRITICAL", "HIGH", "MEDIUM", "LOW", "INFO"}
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def atomic_write(path: pathlib.Path, text: str) -> None:
    """Atomically replace *path* with *text*.

    Writes to a NamedTemporaryFile in the destination directory (same
    filesystem, so the final rename is atomic) and then renames it over
    *path*. Parent directories are created as needed.

    Fix: the original leaked the ``delete=False`` temp file when the
    write itself failed (e.g. disk full); we now unlink it before
    re-raising so failed runs leave no stray files next to *path*.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = tempfile.NamedTemporaryFile(
        "w", encoding="utf-8", dir=path.parent, delete=False
    )
    try:
        with tmp:
            tmp.write(text)
    except BaseException:
        # Best-effort cleanup of the orphaned temp file, then propagate.
        pathlib.Path(tmp.name).unlink(missing_ok=True)
        raise
    # Atomic on POSIX: readers see either the old or the new content.
    pathlib.Path(tmp.name).replace(path)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def collect(stdout_path: pathlib.Path) -> tuple[list[dict[str, Any]], dict[str, Any] | None]:
    """Parse Codex pair-JUDGE stdout into ``(findings, summary)``.

    Each non-comment line must be a JSON object with a valid severity;
    a ``# SUMMARY {...}`` line supplies the summary object. Any malformed
    line, an empty transcript, or a non-PASS summary without findings
    aborts via SystemExit with a located error message.
    """
    findings: list[dict[str, Any]] = []
    summary: dict[str, Any] | None = None
    with stdout_path.open(encoding="utf-8") as stream:
        for line_no, raw_line in enumerate(stream, 1):
            text = raw_line.strip()
            if not text:
                continue
            is_summary = text.startswith("# SUMMARY ")
            if text.startswith("#") and not is_summary:
                # Ordinary comment line — ignore.
                continue
            payload = text.removeprefix("# SUMMARY ").strip() if is_summary else text
            try:
                item = json.loads(payload)
            except json.JSONDecodeError as exc:
                if is_summary:
                    raise SystemExit(f"error: invalid SUMMARY JSON at {stdout_path}:{line_no}: {exc}")
                raise SystemExit(f"error: invalid JSONL at {stdout_path}:{line_no}: {exc}")
            if not isinstance(item, dict):
                if is_summary:
                    raise SystemExit(f"error: SUMMARY is not an object at {stdout_path}:{line_no}")
                raise SystemExit(f"error: JSONL item is not an object at {stdout_path}:{line_no}")
            if is_summary:
                summary = item
                continue
            severity = str(item.get("severity") or "").upper()
            if severity not in FINDING_SEVERITIES:
                raise SystemExit(f"error: finding missing valid severity at {stdout_path}:{line_no}")
            findings.append(item)
    if not findings and summary is None:
        raise SystemExit("error: Codex pair-JUDGE stdout contained no JSONL findings or PASS line")
    if summary and summary.get("verdict") in {"NEEDS_WORK", "FAIL", "BLOCKED"} and not findings:
        raise SystemExit("error: non-PASS SUMMARY without JSONL findings")
    return findings, summary
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def self_test() -> int:
    """Exercise the collect() → write_outputs() round-trip and the
    empty-stdout rejection path inside a throwaway directory.

    Returns 0 on success; assertion failures propagate.
    """
    with tempfile.TemporaryDirectory() as scratch:
        base = pathlib.Path(scratch)
        stdout_path = base / "codex-judge.stdout"
        findings_path = base / "verify.pair.findings.jsonl"
        summary_file = base / "codex-judge.summary.json"

        # Happy path: one HIGH finding plus a NEEDS_WORK summary line.
        transcript = json.dumps({"id": "a", "severity": "HIGH"}) + "\n"
        transcript += '# SUMMARY {"verdict":"NEEDS_WORK"}\n'
        stdout_path.write_text(transcript, encoding="utf-8")
        findings, summary = collect(stdout_path)
        write_outputs(findings, summary, findings_path, summary_file)
        assert findings_path.read_text(encoding="utf-8").count("\n") == 1
        assert json.loads(summary_file.read_text(encoding="utf-8"))["verdict"] == "NEEDS_WORK"

        # Failure path: an empty transcript must never normalize to PASS.
        stdout_path.write_text("", encoding="utf-8")
        try:
            collect(stdout_path)
        except SystemExit as exc:
            assert "no JSONL findings" in str(exc)
        else:
            raise AssertionError("empty Codex stdout must not normalize to PASS")
    return 0
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def write_outputs(
    findings: list[dict[str, Any]],
    summary: dict[str, Any] | None,
    out_path: pathlib.Path,
    summary_path: pathlib.Path,
) -> None:
    """Persist normalized findings as compact sorted-key JSONL and, when a
    summary exists, the summary as pretty-printed JSON.

    Both writes go through atomic_write so partially written artifacts are
    never observed.
    """
    lines = [
        json.dumps(entry, sort_keys=True, separators=(",", ":")) + "\n"
        for entry in findings
    ]
    atomic_write(out_path, "".join(lines))
    if summary is None:
        return
    atomic_write(summary_path, json.dumps(summary, indent=2, sort_keys=True) + "\n")
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def main() -> int:
    """CLI entry point: normalize Codex pair-JUDGE stdout under --devlyn-dir,
    or run the built-in --self-test. Returns a process exit status."""
    parser = argparse.ArgumentParser(description=__doc__)
    for flag, default in (
        ("--devlyn-dir", ".devlyn"),
        ("--stdout-file", "codex-judge.stdout"),
        ("--out", "verify.pair.findings.jsonl"),
        ("--summary-out", "codex-judge.summary.json"),
    ):
        parser.add_argument(flag, default=default)
    parser.add_argument("--self-test", action="store_true")
    args = parser.parse_args()

    if args.self_test:
        return self_test()

    base_dir = pathlib.Path(args.devlyn_dir)
    raw_path = base_dir / args.stdout_file
    if not raw_path.is_file():
        sys.stderr.write(f"error: {raw_path} not found\n")
        return 1

    findings, summary = collect(raw_path)
    write_outputs(findings, summary, base_dir / args.out, base_dir / args.summary_out)
    # Machine-readable result line for the caller.
    print(json.dumps({"findings_count": len(findings), "summary": summary}, sort_keys=True))
    return 0
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
# Script entry point: propagate main()'s integer status as the exit code.
if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -14,7 +14,7 @@ When the resolved engine is `auto` or `codex`, on entry (before spawning any pha
|
|
|
14
14
|
|
|
15
15
|
Never prompt the user. Never abort the run on missing CLI.
|
|
16
16
|
|
|
17
|
-
Per-skill defaults: `/devlyn:resolve` defaults to `claude` (post iter-0020 close-out — Codex BUILD/IMPLEMENT below quality floor; iter-0033g + iter-0034 close-out — PLAN-pair research-only until container/sandbox infra justifies a measurement)
|
|
17
|
+
Per-skill defaults: `/devlyn:resolve` defaults to `claude` for PLAN/IMPLEMENT (post iter-0020 close-out — Codex BUILD/IMPLEMENT below quality floor; iter-0033g + iter-0034 close-out — PLAN-pair research-only until container/sandbox infra justifies a measurement). `/devlyn:resolve` VERIFY is the exception: gated pair-JUDGE may invoke the OTHER engine when its SKILL.md trigger policy fires. `/devlyn:ideate` defaults to `auto` for the CHALLENGE phase's cross-model GAN-critic dynamic. Each skill's SKILL.md flag block is the source of truth for that skill's default.
|
|
18
18
|
|
|
19
19
|
## Why this is the one permitted silent fallback
|
|
20
20
|
|
|
@@ -35,6 +35,12 @@
|
|
|
35
35
|
"description": "None of these substrings may appear in (stdout + stderr) for pass.",
|
|
36
36
|
"items": { "type": "string", "minLength": 1 },
|
|
37
37
|
"default": []
|
|
38
|
+
},
|
|
39
|
+
"contract_refs": {
|
|
40
|
+
"type": "array",
|
|
41
|
+
"description": "For hidden BENCH_FIXTURE_DIR commands, exact substrings from spec.md that this oracle verifies. Hidden oracles may test only visible spec clauses.",
|
|
42
|
+
"items": { "type": "string", "minLength": 1 },
|
|
43
|
+
"default": []
|
|
38
44
|
}
|
|
39
45
|
}
|
|
40
46
|
}
|
|
@@ -83,6 +89,18 @@
|
|
|
83
89
|
"items": { "type": "string", "minLength": 1 },
|
|
84
90
|
"default": []
|
|
85
91
|
},
|
|
92
|
+
"tier_a_waivers": {
|
|
93
|
+
"type": "array",
|
|
94
|
+
"description": "Optional fnmatch globs exempted from Tier A scope-oracle path checks when the spec explicitly authorizes those files.",
|
|
95
|
+
"items": { "type": "string", "minLength": 1 },
|
|
96
|
+
"default": []
|
|
97
|
+
},
|
|
98
|
+
"spec_output_files": {
|
|
99
|
+
"type": "array",
|
|
100
|
+
"description": "Files or globs that define the spec-authorized output surface for scope oracles. Touched files outside this set must be reachable from it via static imports or separately waived.",
|
|
101
|
+
"items": { "type": "string", "minLength": 1 },
|
|
102
|
+
"default": []
|
|
103
|
+
},
|
|
86
104
|
"max_deps_added": {
|
|
87
105
|
"type": "integer",
|
|
88
106
|
"description": "Hard cap on new entries under dependencies/devDependencies in package.json. Exceeds → DQ.",
|
|
@@ -66,6 +66,7 @@ import os
|
|
|
66
66
|
import re
|
|
67
67
|
import subprocess
|
|
68
68
|
import sys
|
|
69
|
+
import tempfile
|
|
69
70
|
from pathlib import Path
|
|
70
71
|
|
|
71
72
|
|
|
@@ -73,6 +74,42 @@ VERIFICATION_SECTION_RE = re.compile(
|
|
|
73
74
|
r'(?ms)^##[ \t]+Verification\b[^\n]*\n(.*?)(?=^##[ \t]+|\Z)'
|
|
74
75
|
)
|
|
75
76
|
JSON_FENCE_RE = re.compile(r'(?ms)^```json[ \t]*\n(.*?)\n```[ \t]*$')
|
|
77
|
+
FORBIDDEN_RISK_PROBE_CMD_RE = re.compile(
|
|
78
|
+
r'BENCH_FIXTURE_DIR|benchmark/auto-resolve/fixtures|/verifiers/|verifiers/'
|
|
79
|
+
)
|
|
80
|
+
RISK_PROBE_TAGS = {
|
|
81
|
+
"ordering_inversion",
|
|
82
|
+
"boundary_overlap",
|
|
83
|
+
"prior_consumption",
|
|
84
|
+
"rollback_state",
|
|
85
|
+
"positive_remaining",
|
|
86
|
+
"stdout_stderr_contract",
|
|
87
|
+
"error_contract",
|
|
88
|
+
"shape_contract",
|
|
89
|
+
}
|
|
90
|
+
RISK_PROBE_REQUIRED_EVIDENCE = {
|
|
91
|
+
"ordering_inversion": {
|
|
92
|
+
"input_order_would_choose_wrong_winner",
|
|
93
|
+
"asserts_processing_order_result",
|
|
94
|
+
},
|
|
95
|
+
"boundary_overlap": {
|
|
96
|
+
"starts_at_blocked_start",
|
|
97
|
+
"ends_at_blocked_end",
|
|
98
|
+
"one_minute_overlap",
|
|
99
|
+
},
|
|
100
|
+
"prior_consumption": {
|
|
101
|
+
"same_resource_consumed_first",
|
|
102
|
+
"later_entity_fails_or_reroutes",
|
|
103
|
+
},
|
|
104
|
+
"rollback_state": {
|
|
105
|
+
"failed_entity_tentative_state_absent",
|
|
106
|
+
"later_entity_uses_released_state",
|
|
107
|
+
},
|
|
108
|
+
"positive_remaining": {
|
|
109
|
+
"asserts_full_remaining_state",
|
|
110
|
+
"zero_quantity_rows_absent",
|
|
111
|
+
},
|
|
112
|
+
}
|
|
76
113
|
|
|
77
114
|
|
|
78
115
|
def extract_verification_block(text: str) -> str | None:
|
|
@@ -89,6 +126,11 @@ def extract_verification_block(text: str) -> str | None:
|
|
|
89
126
|
return fence.group(1) if fence else None
|
|
90
127
|
|
|
91
128
|
|
|
129
|
+
def extract_verification_text(text: str) -> str:
    """Return the raw body of the markdown '## Verification' section,
    or the empty string when no such section exists."""
    match = VERIFICATION_SECTION_RE.search(text)
    if match is None:
        return ""
    return match.group(1)
|
|
132
|
+
|
|
133
|
+
|
|
92
134
|
def validate_shape(data) -> str | None:
|
|
93
135
|
"""Return None if shape matches the canonical verification_commands
|
|
94
136
|
schema; else a human-readable error string.
|
|
@@ -124,6 +166,117 @@ def validate_shape(data) -> str | None:
|
|
|
124
166
|
return None
|
|
125
167
|
|
|
126
168
|
|
|
169
|
+
def validate_risk_probe(probe: object, index: int, verification_text: str) -> str | None:
    """Validate one risk-probe object; return None if valid, else an error string.

    Checks run in a fixed order and the FIRST failure's message is returned,
    so callers surface one precise complaint per probe:
      object shape → id → derived_from (must quote the visible spec) →
      canonical command shape → no hidden-path cmd → cmd length →
      tags → tag_evidence completeness.
    """
    if not isinstance(probe, dict):
        return f"risk-probes[{index}] must be a JSON object"
    probe_id = probe.get("id")
    if not isinstance(probe_id, str) or not probe_id.strip():
        return f"risk-probes[{index}].id must be a non-empty string"
    derived_from = probe.get("derived_from")
    if not isinstance(derived_from, str) or not derived_from.strip():
        return f"risk-probes[{index}].derived_from must be a non-empty string"
    # Anti-gaming: the probe must be traceable to an exact quote from the
    # visible ## Verification text, not invented wholesale.
    if derived_from not in verification_text:
        return (
            f"risk-probes[{index}].derived_from must be an exact substring "
            "of the source ## Verification section"
        )
    # Reuse the canonical verification_commands schema check by wrapping the
    # probe as a one-element command list.
    shape_err = validate_shape({"verification_commands": [probe]})
    if shape_err:
        return f"risk-probes[{index}]: {shape_err}"
    cmd = probe.get("cmd", "")
    # Probes may only exercise visible spec clauses — never the hidden
    # benchmark fixture/verifier files.
    if FORBIDDEN_RISK_PROBE_CMD_RE.search(cmd):
        return (
            f"risk-probes[{index}].cmd references hidden fixture/verifier paths; "
            "risk probes must derive from visible spec text only"
        )
    if len(cmd) > 4000:
        return f"risk-probes[{index}].cmd exceeds 4000 characters"
    tags = probe.get("tags")
    if not isinstance(tags, list) or not tags or not all(isinstance(t, str) for t in tags):
        return f"risk-probes[{index}].tags must be a non-empty list of strings"
    unknown_tags = sorted(set(tags) - RISK_PROBE_TAGS)
    if unknown_tags:
        return f"risk-probes[{index}].tags contains unknown tag(s): {', '.join(unknown_tags)}"
    evidence = probe.get("tag_evidence")
    if not isinstance(evidence, dict):
        return f"risk-probes[{index}].tag_evidence must be an object"
    # For tags that carry a required-evidence contract, every required item
    # must appear in the probe's evidence list; extra items are allowed.
    for tag in tags:
        required_evidence = RISK_PROBE_REQUIRED_EVIDENCE.get(tag)
        if not required_evidence:
            # Tags without an evidence contract (e.g. shape_contract) pass.
            continue
        actual = evidence.get(tag)
        if not isinstance(actual, list) or not all(isinstance(item, str) for item in actual):
            return f"risk-probes[{index}].tag_evidence.{tag} must be a list of strings"
        missing_evidence = sorted(required_evidence - set(actual))
        if missing_evidence:
            return (
                f"risk-probes[{index}].tag_evidence.{tag} missing required "
                f"item(s): {', '.join(missing_evidence)}"
            )
    return None
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def required_risk_probe_tags(verification_text: str) -> set[str]:
    """Infer which probe tags the spec's Verification text demands.

    Keyword heuristics over the lower-cased section body; a probe set is
    later rejected unless it covers every tag returned here.
    """
    lowered = verification_text.lower()
    heuristics: tuple[tuple[str, str], ...] = (
        ("ordering_inversion", r'priority|higher-priority|ordered by|ordering|appears first|input order'),
        ("boundary_overlap", r'blocked|overlap|forbidden|window'),
        ("prior_consumption", r'rolls? back|reduce[s]? stock|available to later|later orders|remaining|stock'),
        ("positive_remaining", r'remaining'),
        ("stdout_stderr_contract", r'stderr|stdout|exit `?2`?|json error'),
    )
    return {tag for tag, pattern in heuristics if re.search(pattern, lowered)}
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def load_risk_probes(
    devlyn_dir: Path,
    source_md: Path | None,
    *,
    require_present: bool = False,
) -> tuple[list[dict], str | None]:
    """Load and validate .devlyn/risk-probes.jsonl against the source spec.

    Returns ``(probes, error)``: on success ``error`` is None and each probe
    dict carries ``_risk_probe``/``_risk_probe_index`` markers; on any
    failure the list is empty and ``error`` is a human-readable message.
    With ``require_present=True`` (strict mode) the file must exist, contain
    at least one probe, and cover every tag that
    required_risk_probe_tags() derives from the Verification section.
    """
    probes_path = devlyn_dir / "risk-probes.jsonl"
    if not probes_path.is_file():
        # A missing file is only an error in strict (--risk-probes) mode.
        if require_present:
            return ([], "risk-probes.jsonl is required when --risk-probes is enabled")
        return ([], None)
    # Probes exist but can't be checked against their spec → hard error,
    # never silently accept unvalidated probes.
    if source_md is None or not source_md.is_file():
        return ([], "risk-probes.jsonl exists but source markdown is unavailable")

    verification_text = extract_verification_text(source_md.read_text())
    if not verification_text:
        return ([], "risk-probes.jsonl exists but source has no ## Verification section")

    probes: list[dict] = []
    # NOTE: index enumerates file lines (blank lines included), so error
    # messages and _risk_probe_index refer to 0-based line positions.
    for index, line in enumerate(probes_path.read_text().splitlines()):
        if not line.strip():
            continue
        try:
            probe = json.loads(line)
        except json.JSONDecodeError as e:
            return ([], f"risk-probes[{index}] invalid JSON: {e}")
        err = validate_risk_probe(probe, index, verification_text)
        if err:
            return ([], err)
        # Copy before annotating so the caller-visible dict is ours alone.
        normalized = dict(probe)
        normalized["_risk_probe"] = True
        normalized["_risk_probe_index"] = index
        probes.append(normalized)
    if len(probes) > 3:
        return ([], "risk-probes.jsonl has more than 3 probes")
    if require_present and not probes:
        return ([], "risk-probes.jsonl must contain at least one probe")
    if require_present:
        # Strict mode: the union of probe tags must cover every tag the
        # Verification text demands.
        present_tags = {tag for probe in probes for tag in probe.get("tags", [])}
        missing_tags = sorted(required_risk_probe_tags(verification_text) - present_tags)
        if missing_tags:
            return ([], f"risk-probes.jsonl missing required probe tag(s): {', '.join(missing_tags)}")
    return (probes, None)
|
|
278
|
+
|
|
279
|
+
|
|
127
280
|
def read_source(work: Path, devlyn_dir: Path) -> tuple[str | None, Path | None]:
|
|
128
281
|
"""Return (source_type, markdown_path) from .devlyn/pipeline.state.json,
|
|
129
282
|
or (None, None) if state is absent/unreadable. The markdown path is
|
|
@@ -237,7 +390,96 @@ def run_check_mode(md_path: Path) -> int:
|
|
|
237
390
|
return 0
|
|
238
391
|
|
|
239
392
|
|
|
393
|
+
def run_self_test() -> int:
    """End-to-end self-test: re-invoke this script as a subprocess against a
    synthetic workdir and check accept/reject behavior for risk probes.

    Three scenarios: (1) a well-formed probe passes --include-risk-probes;
    (2) a probe whose cmd references hidden verifier paths is rejected by
    --validate-risk-probes; (3) a boundary_overlap probe with incomplete
    tag_evidence is rejected. Returns 0 on success, 1 on any failure.
    """
    script_path = str(Path(__file__).resolve())
    with tempfile.TemporaryDirectory() as td:
        work = Path(td)
        devlyn = work / ".devlyn"
        devlyn.mkdir()
        spec_md = work / "spec.md"
        # Minimal spec whose Verification clause the probes must quote.
        spec_md.write_text("# Spec\n\n## Verification\n\n- probe must pass visible marker.\n")
        (devlyn / "pipeline.state.json").write_text(json.dumps({
            "source": {"type": "spec", "spec_path": str(spec_md)}
        }))
        (devlyn / "spec-verify.json").write_text(json.dumps({
            "verification_commands": [
                {"cmd": "printf ok", "exit_code": 0, "stdout_contains": ["ok"]}
            ]
        }) + "\n")
        # Scenario 1: valid probe (shape_contract has no evidence contract).
        (devlyn / "risk-probes.jsonl").write_text(json.dumps({
            "id": "P1",
            "derived_from": "probe must pass visible marker.",
            "cmd": "printf probe-ok",
            "exit_code": 0,
            "stdout_contains": ["probe-ok"],
            "stdout_not_contains": [],
            "tags": ["shape_contract"],
            "tag_evidence": {},
        }) + "\n")
        env = os.environ.copy()
        env["BENCH_WORKDIR"] = str(work)
        good = subprocess.run(
            [sys.executable, script_path, "--include-risk-probes"],
            cwd=work,
            env=env,
            capture_output=True,
            text=True,
        )
        if good.returncode != 0:
            print(good.stderr, file=sys.stderr)
            return 1

        # Scenario 2: cmd points at hidden fixture verifiers → must fail.
        (devlyn / "risk-probes.jsonl").write_text(json.dumps({
            "id": "P2",
            "derived_from": "probe must pass visible marker.",
            "cmd": "node $BENCH_FIXTURE_DIR/verifiers/hidden.js",
            "exit_code": 0,
        }) + "\n")
        bad = subprocess.run(
            [sys.executable, script_path, "--validate-risk-probes"],
            cwd=work,
            env=env,
            capture_output=True,
            text=True,
        )
        if bad.returncode == 0:
            print("hidden verifier path was accepted", file=sys.stderr)
            return 1

        # Scenario 3: boundary_overlap evidence is missing required items
        # (only one_minute_overlap supplied) → must fail.
        (devlyn / "risk-probes.jsonl").write_text(json.dumps({
            "id": "P3",
            "derived_from": "probe must pass visible marker.",
            "cmd": "printf weak-boundary",
            "exit_code": 0,
            "tags": ["boundary_overlap"],
            "tag_evidence": {"boundary_overlap": ["one_minute_overlap"]},
        }) + "\n")
        weak = subprocess.run(
            [sys.executable, script_path, "--validate-risk-probes"],
            cwd=work,
            env=env,
            capture_output=True,
            text=True,
        )
        if weak.returncode == 0:
            print("incomplete boundary_overlap evidence was accepted", file=sys.stderr)
            return 1
    return 0
|
|
468
|
+
|
|
469
|
+
|
|
240
470
|
def main() -> int:
|
|
471
|
+
include_risk_probes = False
|
|
472
|
+
validate_risk_probes_only = False
|
|
473
|
+
if "--include-risk-probes" in sys.argv[1:]:
|
|
474
|
+
include_risk_probes = True
|
|
475
|
+
sys.argv = [arg for arg in sys.argv if arg != "--include-risk-probes"]
|
|
476
|
+
if "--validate-risk-probes" in sys.argv[1:]:
|
|
477
|
+
validate_risk_probes_only = True
|
|
478
|
+
sys.argv = [arg for arg in sys.argv if arg != "--validate-risk-probes"]
|
|
479
|
+
|
|
480
|
+
if len(sys.argv) == 2 and sys.argv[1] == "--self-test":
|
|
481
|
+
return run_self_test()
|
|
482
|
+
|
|
241
483
|
if len(sys.argv) >= 2 and sys.argv[1] == "--check":
|
|
242
484
|
if len(sys.argv) != 3:
|
|
243
485
|
print("usage: spec-verify-check.py --check <markdown-path>", file=sys.stderr)
|
|
@@ -275,6 +517,16 @@ def main() -> int:
|
|
|
275
517
|
pre_staged = spec_path.is_file() # captured BEFORE any potential write
|
|
276
518
|
trust_bench_staged = bench_mode and pre_staged
|
|
277
519
|
src_type, source_md = read_source(work, devlyn_dir)
|
|
520
|
+
if validate_risk_probes_only:
|
|
521
|
+
_risk_probes, risk_error = load_risk_probes(
|
|
522
|
+
devlyn_dir, source_md, require_present=True
|
|
523
|
+
)
|
|
524
|
+
if risk_error:
|
|
525
|
+
print(f"[spec-verify] risk probes malformed: {risk_error}", file=sys.stderr)
|
|
526
|
+
write_malformed_finding(devlyn_dir, risk_error, devlyn_dir / "risk-probes.jsonl")
|
|
527
|
+
return 1
|
|
528
|
+
print("[spec-verify] risk probes valid", file=sys.stderr)
|
|
529
|
+
return 0
|
|
278
530
|
if source_md is not None and not trust_bench_staged:
|
|
279
531
|
staged, error = stage_from_source(source_md, devlyn_dir)
|
|
280
532
|
if error is not None:
|
|
@@ -334,7 +586,14 @@ def main() -> int:
|
|
|
334
586
|
print(f"[spec-verify] error: {spec_path}: {shape_err}", file=sys.stderr)
|
|
335
587
|
write_malformed_finding(devlyn_dir, f"{spec_path}: {shape_err}", None)
|
|
336
588
|
return 1
|
|
337
|
-
commands = spec["verification_commands"]
|
|
589
|
+
commands = list(spec["verification_commands"])
|
|
590
|
+
if include_risk_probes:
|
|
591
|
+
risk_probes, risk_error = load_risk_probes(devlyn_dir, source_md)
|
|
592
|
+
if risk_error:
|
|
593
|
+
print(f"[spec-verify] risk probes malformed: {risk_error}", file=sys.stderr)
|
|
594
|
+
write_malformed_finding(devlyn_dir, risk_error, devlyn_dir / "risk-probes.jsonl")
|
|
595
|
+
return 1
|
|
596
|
+
commands.extend(risk_probes)
|
|
338
597
|
|
|
339
598
|
devlyn_dir.mkdir(parents=True, exist_ok=True)
|
|
340
599
|
results_path = devlyn_dir / "spec-verify.results.json"
|
|
@@ -354,6 +613,7 @@ def main() -> int:
|
|
|
354
613
|
"reason": "missing_cmd"})
|
|
355
614
|
continue
|
|
356
615
|
|
|
616
|
+
is_risk_probe = bool(vc.get("_risk_probe"))
|
|
357
617
|
expected_exit = vc.get("exit_code", 0)
|
|
358
618
|
stdout_contains = vc.get("stdout_contains", []) or []
|
|
359
619
|
stdout_not_contains = vc.get("stdout_not_contains", []) or []
|
|
@@ -423,17 +683,41 @@ def main() -> int:
|
|
|
423
683
|
f"contains={stdout_contains}, not_contains={stdout_not_contains})."
|
|
424
684
|
)
|
|
425
685
|
|
|
686
|
+
rule_id = (
|
|
687
|
+
"correctness.risk-probe-failed"
|
|
688
|
+
if is_risk_probe
|
|
689
|
+
else "correctness.spec-literal-mismatch"
|
|
690
|
+
)
|
|
691
|
+
criterion_ref = (
|
|
692
|
+
f"risk-probe:{vc.get('id')}"
|
|
693
|
+
if is_risk_probe
|
|
694
|
+
else f"spec-verify://verification_commands/{idx}"
|
|
695
|
+
)
|
|
696
|
+
file_ref = (
|
|
697
|
+
".devlyn/risk-probes.jsonl"
|
|
698
|
+
if is_risk_probe
|
|
699
|
+
else ".devlyn/spec-verify.json"
|
|
700
|
+
)
|
|
701
|
+
if is_risk_probe:
|
|
702
|
+
fix_hint = (
|
|
703
|
+
f"Risk probe `{vc.get('id')}` derived from "
|
|
704
|
+
f"{vc.get('derived_from')!r} failed. See "
|
|
705
|
+
".devlyn/spec-verify.results.json for captured output "
|
|
706
|
+
"and update the implementation to satisfy the visible "
|
|
707
|
+
"verification bullet."
|
|
708
|
+
)
|
|
709
|
+
|
|
426
710
|
findings.append({
|
|
427
711
|
"id": f"BGATE-{finding_seq:04d}",
|
|
428
|
-
"rule_id":
|
|
712
|
+
"rule_id": rule_id,
|
|
429
713
|
"level": "error",
|
|
430
714
|
"severity": "CRITICAL",
|
|
431
715
|
"confidence": 1.0,
|
|
432
716
|
"message": msg,
|
|
433
|
-
"file":
|
|
717
|
+
"file": file_ref,
|
|
434
718
|
"line": 1,
|
|
435
719
|
"phase": "build_gate",
|
|
436
|
-
"criterion_ref":
|
|
720
|
+
"criterion_ref": criterion_ref,
|
|
437
721
|
"fix_hint": fix_hint,
|
|
438
722
|
"blocking": True,
|
|
439
723
|
"status": "open",
|
|
@@ -443,19 +727,28 @@ def main() -> int:
|
|
|
443
727
|
except subprocess.TimeoutExpired:
|
|
444
728
|
results.append({"index": idx, "cmd": cmd, "pass": False,
|
|
445
729
|
"reason": "timeout"})
|
|
730
|
+
rule_id = (
|
|
731
|
+
"correctness.risk-probe-failed"
|
|
732
|
+
if vc.get("_risk_probe")
|
|
733
|
+
else "correctness.spec-literal-mismatch"
|
|
734
|
+
)
|
|
446
735
|
findings.append({
|
|
447
736
|
"id": f"BGATE-{finding_seq:04d}",
|
|
448
|
-
"rule_id":
|
|
737
|
+
"rule_id": rule_id,
|
|
449
738
|
"level": "error",
|
|
450
739
|
"severity": "CRITICAL",
|
|
451
740
|
"confidence": 1.0,
|
|
452
741
|
"message": (
|
|
453
742
|
f"Verification command #{idx + 1} timed out after 60s."
|
|
454
743
|
),
|
|
455
|
-
"file": ".devlyn/spec-verify.json",
|
|
744
|
+
"file": ".devlyn/risk-probes.jsonl" if vc.get("_risk_probe") else ".devlyn/spec-verify.json",
|
|
456
745
|
"line": 1,
|
|
457
746
|
"phase": "build_gate",
|
|
458
|
-
"criterion_ref":
|
|
747
|
+
"criterion_ref": (
|
|
748
|
+
f"risk-probe:{vc.get('id')}"
|
|
749
|
+
if vc.get("_risk_probe")
|
|
750
|
+
else f"spec-verify://verification_commands/{idx}"
|
|
751
|
+
),
|
|
459
752
|
"fix_hint": (
|
|
460
753
|
f"Command `{cmd}` exceeded 60s. Reduce work or fix a "
|
|
461
754
|
f"hang in the implementation."
|
|
@@ -467,9 +760,14 @@ def main() -> int:
|
|
|
467
760
|
except Exception as e: # noqa: BLE001 — surface any harness error explicitly
|
|
468
761
|
results.append({"index": idx, "cmd": cmd, "pass": False,
|
|
469
762
|
"reason": f"error:{e.__class__.__name__}:{e}"})
|
|
763
|
+
rule_id = (
|
|
764
|
+
"correctness.risk-probe-failed"
|
|
765
|
+
if vc.get("_risk_probe")
|
|
766
|
+
else "correctness.spec-literal-mismatch"
|
|
767
|
+
)
|
|
470
768
|
findings.append({
|
|
471
769
|
"id": f"BGATE-{finding_seq:04d}",
|
|
472
|
-
"rule_id":
|
|
770
|
+
"rule_id": rule_id,
|
|
473
771
|
"level": "error",
|
|
474
772
|
"severity": "CRITICAL",
|
|
475
773
|
"confidence": 1.0,
|
|
@@ -477,10 +775,14 @@ def main() -> int:
|
|
|
477
775
|
f"Verification command #{idx + 1} raised "
|
|
478
776
|
f"{e.__class__.__name__}: {e}."
|
|
479
777
|
),
|
|
480
|
-
"file": ".devlyn/spec-verify.json",
|
|
778
|
+
"file": ".devlyn/risk-probes.jsonl" if vc.get("_risk_probe") else ".devlyn/spec-verify.json",
|
|
481
779
|
"line": 1,
|
|
482
780
|
"phase": "build_gate",
|
|
483
|
-
"criterion_ref":
|
|
781
|
+
"criterion_ref": (
|
|
782
|
+
f"risk-probe:{vc.get('id')}"
|
|
783
|
+
if vc.get("_risk_probe")
|
|
784
|
+
else f"spec-verify://verification_commands/{idx}"
|
|
785
|
+
),
|
|
484
786
|
"fix_hint": (
|
|
485
787
|
f"Command `{cmd}` could not be executed. Check the work-dir "
|
|
486
788
|
f"state and any environment setup the command requires."
|