devlyn-cli 2.1.0 → 2.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +1 -1
- package/benchmark/auto-resolve/README.md +321 -2
- package/benchmark/auto-resolve/RUBRIC.md +6 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +51 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +50 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +57 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +51 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +57 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +61 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +64 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +64 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +68 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +0 -3
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
- package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
- package/benchmark/auto-resolve/scripts/judge.sh +82 -3
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +0 -11
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +0 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +257 -43
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
- package/config/skills/_shared/archive_run.py +3 -0
- package/config/skills/_shared/codex-config.md +2 -2
- package/config/skills/_shared/codex-monitored.sh +72 -7
- package/config/skills/_shared/collect-codex-findings.py +125 -0
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/expected.schema.json +18 -0
- package/config/skills/_shared/spec-verify-check.py +363 -10
- package/config/skills/_shared/verify-merge-findings.py +327 -0
- package/config/skills/devlyn:resolve/SKILL.md +69 -8
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +183 -0
- package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
- package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
- package/package.json +1 -1
- package/scripts/lint-skills.sh +69 -20
|
@@ -66,6 +66,7 @@ import os
|
|
|
66
66
|
import re
|
|
67
67
|
import subprocess
|
|
68
68
|
import sys
|
|
69
|
+
import tempfile
|
|
69
70
|
from pathlib import Path
|
|
70
71
|
|
|
71
72
|
|
|
@@ -73,6 +74,50 @@ VERIFICATION_SECTION_RE = re.compile(
|
|
|
73
74
|
r'(?ms)^##[ \t]+Verification\b[^\n]*\n(.*?)(?=^##[ \t]+|\Z)'
|
|
74
75
|
)
|
|
75
76
|
JSON_FENCE_RE = re.compile(r'(?ms)^```json[ \t]*\n(.*?)\n```[ \t]*$')
|
|
77
|
+
# Substrings a risk-probe command may not contain: anything that points at the
# benchmark fixture tree or its verifier scripts. (The "/verifiers/" alternative
# is already covered by "verifiers/"; both are kept as originally written.)
FORBIDDEN_RISK_PROBE_CMD_RE = re.compile(
    "|".join(
        (
            r"BENCH_FIXTURE_DIR",
            r"benchmark/auto-resolve/fixtures",
            r"/verifiers/",
            r"verifiers/",
        )
    )
)

# Captures the authority part of an http(s) URL: everything after "://" up to
# the first slash, whitespace character, or quote.
EXTERNAL_URL_RE = re.compile(r"https?://([^/\s\"']+)", re.IGNORECASE)

# Hosts that are treated as local when scanning a probe command for URLs.
LOCAL_URL_HOSTS = {
    "localhost",
    "127.0.0.1",
    "0.0.0.0",
    "[::1]",
    "::1",
}

# The closed set of tags a risk probe is allowed to carry.
RISK_PROBE_TAGS = {
    "ordering_inversion",
    "boundary_overlap",
    "prior_consumption",
    "rollback_state",
    "positive_remaining",
    "stdout_stderr_contract",
    "error_contract",
    "shape_contract",
}

# For each tag listed here, the evidence items that must all appear under
# tag_evidence[<tag>]. Tags without an entry have no required evidence items.
RISK_PROBE_REQUIRED_EVIDENCE = {
    "ordering_inversion": {
        "input_order_would_choose_wrong_winner",
        "asserts_processing_order_result",
    },
    "boundary_overlap": {
        "starts_at_blocked_start",
        "ends_at_blocked_end",
        "one_minute_overlap",
    },
    "prior_consumption": {
        "same_resource_consumed_first",
        "later_entity_fails_or_reroutes",
    },
    "rollback_state": {
        "failed_entity_tentative_state_absent",
        "later_entity_uses_released_state",
    },
    "positive_remaining": {
        "asserts_full_remaining_state",
        "zero_quantity_rows_absent",
    },
}
|
|
76
121
|
|
|
77
122
|
|
|
78
123
|
def extract_verification_block(text: str) -> str | None:
|
|
@@ -89,6 +134,20 @@ def extract_verification_block(text: str) -> str | None:
|
|
|
89
134
|
return fence.group(1) if fence else None
|
|
90
135
|
|
|
91
136
|
|
|
137
|
+
def extract_verification_text(text: str) -> str:
    """Return the raw body of the ``## Verification`` section of *text*.

    The body is whatever ``VERIFICATION_SECTION_RE`` captures in group 1.
    An empty string is returned when the pattern does not match.
    """
    found = VERIFICATION_SECTION_RE.search(text)
    if found is None:
        return ""
    return found.group(1)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def external_url_hosts(text: str) -> list[str]:
    """List the non-local hosts referenced by http(s) URLs in *text*.

    Each URL authority found by ``EXTERNAL_URL_RE`` is reduced to its host:
    any ``user:pass@`` prefix and any ``:port`` suffix are dropped and the
    result is lower-cased. Hosts present in ``LOCAL_URL_HOSTS`` are ignored.

    Args:
        text: Text to scan (typically a probe command). ``None`` or an empty
            string yields an empty list.

    Returns:
        The distinct non-local hosts, in order of first appearance.
    """
    hosts: list[str] = []
    for match in EXTERNAL_URL_RE.finditer(text or ''):
        authority = match.group(1).split('@')[-1].lower()
        if authority.startswith('['):
            # Bracketed IPv6 literal such as "[::1]:8080". Splitting on the
            # first ':' would leave just "[", so the "[::1]" entry in
            # LOCAL_URL_HOSTS could never match. Keep the whole bracketed
            # literal and drop only what follows the closing bracket.
            closing = authority.find(']')
            host = authority[:closing + 1] if closing != -1 else authority
        else:
            host = authority.split(':')[0]
        if host not in LOCAL_URL_HOSTS and host not in hosts:
            hosts.append(host)
    return hosts
|
|
149
|
+
|
|
150
|
+
|
|
92
151
|
def validate_shape(data) -> str | None:
|
|
93
152
|
"""Return None if shape matches the canonical verification_commands
|
|
94
153
|
schema; else a human-readable error string.
|
|
@@ -124,6 +183,132 @@ def validate_shape(data) -> str | None:
|
|
|
124
183
|
return None
|
|
125
184
|
|
|
126
185
|
|
|
186
|
+
def validate_risk_probe(probe: object, index: int, verification_text: str) -> str | None:
|
|
187
|
+
if not isinstance(probe, dict):
|
|
188
|
+
return f"risk-probes[{index}] must be a JSON object"
|
|
189
|
+
probe_id = probe.get("id")
|
|
190
|
+
if not isinstance(probe_id, str) or not probe_id.strip():
|
|
191
|
+
return f"risk-probes[{index}].id must be a non-empty string"
|
|
192
|
+
derived_from = probe.get("derived_from")
|
|
193
|
+
if not isinstance(derived_from, str) or not derived_from.strip():
|
|
194
|
+
return f"risk-probes[{index}].derived_from must be a non-empty string"
|
|
195
|
+
if derived_from not in verification_text:
|
|
196
|
+
return (
|
|
197
|
+
f"risk-probes[{index}].derived_from must be an exact substring "
|
|
198
|
+
"of the source ## Verification section"
|
|
199
|
+
)
|
|
200
|
+
shape_err = validate_shape({"verification_commands": [probe]})
|
|
201
|
+
if shape_err:
|
|
202
|
+
return f"risk-probes[{index}]: {shape_err}"
|
|
203
|
+
cmd = probe.get("cmd", "")
|
|
204
|
+
if FORBIDDEN_RISK_PROBE_CMD_RE.search(cmd):
|
|
205
|
+
return (
|
|
206
|
+
f"risk-probes[{index}].cmd references hidden fixture/verifier paths; "
|
|
207
|
+
"risk probes must derive from visible spec text only"
|
|
208
|
+
)
|
|
209
|
+
external_hosts = external_url_hosts(cmd)
|
|
210
|
+
if external_hosts:
|
|
211
|
+
return (
|
|
212
|
+
f"risk-probes[{index}].cmd references external URL(s): "
|
|
213
|
+
f"{', '.join(external_hosts)}; use only worktree-local or localhost resources"
|
|
214
|
+
)
|
|
215
|
+
if len(cmd) > 4000:
|
|
216
|
+
return f"risk-probes[{index}].cmd exceeds 4000 characters"
|
|
217
|
+
tags = probe.get("tags")
|
|
218
|
+
if not isinstance(tags, list) or not tags or not all(isinstance(t, str) for t in tags):
|
|
219
|
+
return f"risk-probes[{index}].tags must be a non-empty list of strings"
|
|
220
|
+
unknown_tags = sorted(set(tags) - RISK_PROBE_TAGS)
|
|
221
|
+
if unknown_tags:
|
|
222
|
+
return f"risk-probes[{index}].tags contains unknown tag(s): {', '.join(unknown_tags)}"
|
|
223
|
+
if "error_contract" in tags and not re.search(
|
|
224
|
+
r'invalid|stderr|json[ -]?error|error object|exit[ `]*2',
|
|
225
|
+
derived_from,
|
|
226
|
+
re.IGNORECASE,
|
|
227
|
+
):
|
|
228
|
+
return (
|
|
229
|
+
f"risk-probes[{index}].derived_from for error_contract must name "
|
|
230
|
+
"an invalid-input, stderr, JSON-error, or exit-2 verification bullet"
|
|
231
|
+
)
|
|
232
|
+
evidence = probe.get("tag_evidence")
|
|
233
|
+
if not isinstance(evidence, dict):
|
|
234
|
+
return f"risk-probes[{index}].tag_evidence must be an object"
|
|
235
|
+
for tag in tags:
|
|
236
|
+
required_evidence = RISK_PROBE_REQUIRED_EVIDENCE.get(tag)
|
|
237
|
+
if not required_evidence:
|
|
238
|
+
continue
|
|
239
|
+
actual = evidence.get(tag)
|
|
240
|
+
if not isinstance(actual, list) or not all(isinstance(item, str) for item in actual):
|
|
241
|
+
return f"risk-probes[{index}].tag_evidence.{tag} must be a list of strings"
|
|
242
|
+
missing_evidence = sorted(required_evidence - set(actual))
|
|
243
|
+
if missing_evidence:
|
|
244
|
+
return (
|
|
245
|
+
f"risk-probes[{index}].tag_evidence.{tag} missing required "
|
|
246
|
+
f"item(s): {', '.join(missing_evidence)}"
|
|
247
|
+
)
|
|
248
|
+
return None
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def required_risk_probe_tags(verification_text: str) -> set[str]:
    """Infer which probe tags a ``## Verification`` section calls for.

    The text is lower-cased and scanned for keyword patterns; each pattern
    that matches contributes its tag. The word "remaining" additionally
    contributes ``positive_remaining``.

    Args:
        verification_text: Body of the source ``## Verification`` section.

    Returns:
        The set of tags whose trigger keywords occur in the text (possibly
        empty).
    """
    lowered = verification_text.lower()
    keyword_triggers = (
        (
            "ordering_inversion",
            r'priority|higher-priority|ordered by|ordering|appears first|input order',
        ),
        ("boundary_overlap", r'blocked|overlap|forbidden|window'),
        (
            "prior_consumption",
            r'rolls? back|reduce[s]? stock|available to later|later orders|remaining|stock',
        ),
        ("stdout_stderr_contract", r'stderr|stdout|exit `?2`?|json error'),
    )
    needed = {tag for tag, pattern in keyword_triggers if re.search(pattern, lowered)}
    if "remaining" in lowered:
        needed.add("positive_remaining")
    return needed
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def load_risk_probes(
|
|
268
|
+
devlyn_dir: Path,
|
|
269
|
+
source_md: Path | None,
|
|
270
|
+
*,
|
|
271
|
+
require_present: bool = False,
|
|
272
|
+
) -> tuple[list[dict], str | None]:
|
|
273
|
+
probes_path = devlyn_dir / "risk-probes.jsonl"
|
|
274
|
+
if not probes_path.is_file():
|
|
275
|
+
if require_present:
|
|
276
|
+
return ([], "risk-probes.jsonl is required when --risk-probes is enabled")
|
|
277
|
+
return ([], None)
|
|
278
|
+
if source_md is None or not source_md.is_file():
|
|
279
|
+
return ([], "risk-probes.jsonl exists but source markdown is unavailable")
|
|
280
|
+
|
|
281
|
+
verification_text = extract_verification_text(source_md.read_text())
|
|
282
|
+
if not verification_text:
|
|
283
|
+
return ([], "risk-probes.jsonl exists but source has no ## Verification section")
|
|
284
|
+
|
|
285
|
+
probes: list[dict] = []
|
|
286
|
+
for index, line in enumerate(probes_path.read_text().splitlines()):
|
|
287
|
+
if not line.strip():
|
|
288
|
+
continue
|
|
289
|
+
try:
|
|
290
|
+
probe = json.loads(line)
|
|
291
|
+
except json.JSONDecodeError as e:
|
|
292
|
+
return ([], f"risk-probes[{index}] invalid JSON: {e}")
|
|
293
|
+
err = validate_risk_probe(probe, index, verification_text)
|
|
294
|
+
if err:
|
|
295
|
+
return ([], err)
|
|
296
|
+
normalized = dict(probe)
|
|
297
|
+
normalized["_risk_probe"] = True
|
|
298
|
+
normalized["_risk_probe_index"] = index
|
|
299
|
+
probes.append(normalized)
|
|
300
|
+
if len(probes) > 3:
|
|
301
|
+
return ([], "risk-probes.jsonl has more than 3 probes")
|
|
302
|
+
if require_present and not probes:
|
|
303
|
+
return ([], "risk-probes.jsonl must contain at least one probe")
|
|
304
|
+
if require_present:
|
|
305
|
+
present_tags = {tag for probe in probes for tag in probe.get("tags", [])}
|
|
306
|
+
missing_tags = sorted(required_risk_probe_tags(verification_text) - present_tags)
|
|
307
|
+
if missing_tags:
|
|
308
|
+
return ([], f"risk-probes.jsonl missing required probe tag(s): {', '.join(missing_tags)}")
|
|
309
|
+
return (probes, None)
|
|
310
|
+
|
|
311
|
+
|
|
127
312
|
def read_source(work: Path, devlyn_dir: Path) -> tuple[str | None, Path | None]:
|
|
128
313
|
"""Return (source_type, markdown_path) from .devlyn/pipeline.state.json,
|
|
129
314
|
or (None, None) if state is absent/unreadable. The markdown path is
|
|
@@ -237,7 +422,115 @@ def run_check_mode(md_path: Path) -> int:
|
|
|
237
422
|
return 0
|
|
238
423
|
|
|
239
424
|
|
|
425
|
+
def run_self_test() -> int:
    """Run the built-in risk-probe scenarios against a temporary work dir.

    A scratch directory is populated with a spec, a pipeline state file and a
    baseline ``spec-verify.json``. This script is then re-run in a subprocess
    (with ``BENCH_WORKDIR`` set to the scratch directory) once per scenario:
    one well-formed probe that must be accepted under
    ``--include-risk-probes``, followed by three malformed probes that must be
    rejected under ``--validate-risk-probes``.

    Returns:
        ``0`` if every scenario behaves as expected; ``1`` at the first one
        that does not, after printing a diagnostic to stderr.
    """
    script = str(Path(__file__).resolve())
    with tempfile.TemporaryDirectory() as scratch:
        work = Path(scratch)
        devlyn = work / ".devlyn"
        devlyn.mkdir()
        probes_file = devlyn / "risk-probes.jsonl"

        spec_md = work / "spec.md"
        spec_md.write_text("# Spec\n\n## Verification\n\n- probe must pass visible marker.\n")
        state = {"source": {"type": "spec", "spec_path": str(spec_md)}}
        (devlyn / "pipeline.state.json").write_text(json.dumps(state))
        baseline = {
            "verification_commands": [
                {"cmd": "printf ok", "exit_code": 0, "stdout_contains": ["ok"]}
            ]
        }
        (devlyn / "spec-verify.json").write_text(json.dumps(baseline) + "\n")

        child_env = os.environ.copy()
        child_env["BENCH_WORKDIR"] = str(work)

        def run_with_probe(probe: dict, flag: str) -> subprocess.CompletedProcess:
            # Replace the probe file with a single probe, then re-run the script.
            probes_file.write_text(json.dumps(probe) + "\n")
            return subprocess.run(
                [sys.executable, script, flag],
                cwd=work,
                env=child_env,
                capture_output=True,
                text=True,
            )

        # Positive case: a well-formed probe must run cleanly.
        accepted = run_with_probe(
            {
                "id": "P1",
                "derived_from": "probe must pass visible marker.",
                "cmd": "printf probe-ok",
                "exit_code": 0,
                "stdout_contains": ["probe-ok"],
                "stdout_not_contains": [],
                "tags": ["shape_contract"],
                "tag_evidence": {},
            },
            "--include-risk-probes",
        )
        if accepted.returncode != 0:
            print(accepted.stderr, file=sys.stderr)
            return 1

        # Negative cases: each probe must make validation exit non-zero.
        must_be_rejected = (
            (
                {
                    "id": "P2",
                    "derived_from": "probe must pass visible marker.",
                    "cmd": "node $BENCH_FIXTURE_DIR/verifiers/hidden.js",
                    "exit_code": 0,
                },
                "hidden verifier path was accepted",
            ),
            (
                {
                    "id": "P3",
                    "derived_from": "probe must pass visible marker.",
                    "cmd": "printf bad-error-derived-from",
                    "exit_code": 0,
                    "tags": ["error_contract"],
                    "tag_evidence": {"error_contract": []},
                },
                "error_contract with unrelated derived_from was accepted",
            ),
            (
                {
                    "id": "P4",
                    "derived_from": "probe must pass visible marker.",
                    "cmd": "printf weak-boundary",
                    "exit_code": 0,
                    "tags": ["boundary_overlap"],
                    "tag_evidence": {"boundary_overlap": ["one_minute_overlap"]},
                },
                "incomplete boundary_overlap evidence was accepted",
            ),
        )
        for probe, complaint in must_be_rejected:
            outcome = run_with_probe(probe, "--validate-risk-probes")
            if outcome.returncode == 0:
                print(complaint, file=sys.stderr)
                return 1
    return 0
|
|
519
|
+
|
|
520
|
+
|
|
240
521
|
def main() -> int:
|
|
522
|
+
include_risk_probes = False
|
|
523
|
+
validate_risk_probes_only = False
|
|
524
|
+
if "--include-risk-probes" in sys.argv[1:]:
|
|
525
|
+
include_risk_probes = True
|
|
526
|
+
sys.argv = [arg for arg in sys.argv if arg != "--include-risk-probes"]
|
|
527
|
+
if "--validate-risk-probes" in sys.argv[1:]:
|
|
528
|
+
validate_risk_probes_only = True
|
|
529
|
+
sys.argv = [arg for arg in sys.argv if arg != "--validate-risk-probes"]
|
|
530
|
+
|
|
531
|
+
if len(sys.argv) == 2 and sys.argv[1] == "--self-test":
|
|
532
|
+
return run_self_test()
|
|
533
|
+
|
|
241
534
|
if len(sys.argv) >= 2 and sys.argv[1] == "--check":
|
|
242
535
|
if len(sys.argv) != 3:
|
|
243
536
|
print("usage: spec-verify-check.py --check <markdown-path>", file=sys.stderr)
|
|
@@ -275,6 +568,16 @@ def main() -> int:
|
|
|
275
568
|
pre_staged = spec_path.is_file() # captured BEFORE any potential write
|
|
276
569
|
trust_bench_staged = bench_mode and pre_staged
|
|
277
570
|
src_type, source_md = read_source(work, devlyn_dir)
|
|
571
|
+
if validate_risk_probes_only:
|
|
572
|
+
_risk_probes, risk_error = load_risk_probes(
|
|
573
|
+
devlyn_dir, source_md, require_present=True
|
|
574
|
+
)
|
|
575
|
+
if risk_error:
|
|
576
|
+
print(f"[spec-verify] risk probes malformed: {risk_error}", file=sys.stderr)
|
|
577
|
+
write_malformed_finding(devlyn_dir, risk_error, devlyn_dir / "risk-probes.jsonl")
|
|
578
|
+
return 1
|
|
579
|
+
print("[spec-verify] risk probes valid", file=sys.stderr)
|
|
580
|
+
return 0
|
|
278
581
|
if source_md is not None and not trust_bench_staged:
|
|
279
582
|
staged, error = stage_from_source(source_md, devlyn_dir)
|
|
280
583
|
if error is not None:
|
|
@@ -334,7 +637,14 @@ def main() -> int:
|
|
|
334
637
|
print(f"[spec-verify] error: {spec_path}: {shape_err}", file=sys.stderr)
|
|
335
638
|
write_malformed_finding(devlyn_dir, f"{spec_path}: {shape_err}", None)
|
|
336
639
|
return 1
|
|
337
|
-
commands = spec["verification_commands"]
|
|
640
|
+
commands = list(spec["verification_commands"])
|
|
641
|
+
if include_risk_probes:
|
|
642
|
+
risk_probes, risk_error = load_risk_probes(devlyn_dir, source_md)
|
|
643
|
+
if risk_error:
|
|
644
|
+
print(f"[spec-verify] risk probes malformed: {risk_error}", file=sys.stderr)
|
|
645
|
+
write_malformed_finding(devlyn_dir, risk_error, devlyn_dir / "risk-probes.jsonl")
|
|
646
|
+
return 1
|
|
647
|
+
commands.extend(risk_probes)
|
|
338
648
|
|
|
339
649
|
devlyn_dir.mkdir(parents=True, exist_ok=True)
|
|
340
650
|
results_path = devlyn_dir / "spec-verify.results.json"
|
|
@@ -354,6 +664,7 @@ def main() -> int:
|
|
|
354
664
|
"reason": "missing_cmd"})
|
|
355
665
|
continue
|
|
356
666
|
|
|
667
|
+
is_risk_probe = bool(vc.get("_risk_probe"))
|
|
357
668
|
expected_exit = vc.get("exit_code", 0)
|
|
358
669
|
stdout_contains = vc.get("stdout_contains", []) or []
|
|
359
670
|
stdout_not_contains = vc.get("stdout_not_contains", []) or []
|
|
@@ -423,17 +734,41 @@ def main() -> int:
|
|
|
423
734
|
f"contains={stdout_contains}, not_contains={stdout_not_contains})."
|
|
424
735
|
)
|
|
425
736
|
|
|
737
|
+
rule_id = (
|
|
738
|
+
"correctness.risk-probe-failed"
|
|
739
|
+
if is_risk_probe
|
|
740
|
+
else "correctness.spec-literal-mismatch"
|
|
741
|
+
)
|
|
742
|
+
criterion_ref = (
|
|
743
|
+
f"risk-probe:{vc.get('id')}"
|
|
744
|
+
if is_risk_probe
|
|
745
|
+
else f"spec-verify://verification_commands/{idx}"
|
|
746
|
+
)
|
|
747
|
+
file_ref = (
|
|
748
|
+
".devlyn/risk-probes.jsonl"
|
|
749
|
+
if is_risk_probe
|
|
750
|
+
else ".devlyn/spec-verify.json"
|
|
751
|
+
)
|
|
752
|
+
if is_risk_probe:
|
|
753
|
+
fix_hint = (
|
|
754
|
+
f"Risk probe `{vc.get('id')}` derived from "
|
|
755
|
+
f"{vc.get('derived_from')!r} failed. See "
|
|
756
|
+
".devlyn/spec-verify.results.json for captured output "
|
|
757
|
+
"and update the implementation to satisfy the visible "
|
|
758
|
+
"verification bullet."
|
|
759
|
+
)
|
|
760
|
+
|
|
426
761
|
findings.append({
|
|
427
762
|
"id": f"BGATE-{finding_seq:04d}",
|
|
428
|
-
"rule_id":
|
|
763
|
+
"rule_id": rule_id,
|
|
429
764
|
"level": "error",
|
|
430
765
|
"severity": "CRITICAL",
|
|
431
766
|
"confidence": 1.0,
|
|
432
767
|
"message": msg,
|
|
433
|
-
"file":
|
|
768
|
+
"file": file_ref,
|
|
434
769
|
"line": 1,
|
|
435
770
|
"phase": "build_gate",
|
|
436
|
-
"criterion_ref":
|
|
771
|
+
"criterion_ref": criterion_ref,
|
|
437
772
|
"fix_hint": fix_hint,
|
|
438
773
|
"blocking": True,
|
|
439
774
|
"status": "open",
|
|
@@ -443,19 +778,28 @@ def main() -> int:
|
|
|
443
778
|
except subprocess.TimeoutExpired:
|
|
444
779
|
results.append({"index": idx, "cmd": cmd, "pass": False,
|
|
445
780
|
"reason": "timeout"})
|
|
781
|
+
rule_id = (
|
|
782
|
+
"correctness.risk-probe-failed"
|
|
783
|
+
if vc.get("_risk_probe")
|
|
784
|
+
else "correctness.spec-literal-mismatch"
|
|
785
|
+
)
|
|
446
786
|
findings.append({
|
|
447
787
|
"id": f"BGATE-{finding_seq:04d}",
|
|
448
|
-
"rule_id":
|
|
788
|
+
"rule_id": rule_id,
|
|
449
789
|
"level": "error",
|
|
450
790
|
"severity": "CRITICAL",
|
|
451
791
|
"confidence": 1.0,
|
|
452
792
|
"message": (
|
|
453
793
|
f"Verification command #{idx + 1} timed out after 60s."
|
|
454
794
|
),
|
|
455
|
-
"file": ".devlyn/spec-verify.json",
|
|
795
|
+
"file": ".devlyn/risk-probes.jsonl" if vc.get("_risk_probe") else ".devlyn/spec-verify.json",
|
|
456
796
|
"line": 1,
|
|
457
797
|
"phase": "build_gate",
|
|
458
|
-
"criterion_ref":
|
|
798
|
+
"criterion_ref": (
|
|
799
|
+
f"risk-probe:{vc.get('id')}"
|
|
800
|
+
if vc.get("_risk_probe")
|
|
801
|
+
else f"spec-verify://verification_commands/{idx}"
|
|
802
|
+
),
|
|
459
803
|
"fix_hint": (
|
|
460
804
|
f"Command `{cmd}` exceeded 60s. Reduce work or fix a "
|
|
461
805
|
f"hang in the implementation."
|
|
@@ -467,9 +811,14 @@ def main() -> int:
|
|
|
467
811
|
except Exception as e: # noqa: BLE001 — surface any harness error explicitly
|
|
468
812
|
results.append({"index": idx, "cmd": cmd, "pass": False,
|
|
469
813
|
"reason": f"error:{e.__class__.__name__}:{e}"})
|
|
814
|
+
rule_id = (
|
|
815
|
+
"correctness.risk-probe-failed"
|
|
816
|
+
if vc.get("_risk_probe")
|
|
817
|
+
else "correctness.spec-literal-mismatch"
|
|
818
|
+
)
|
|
470
819
|
findings.append({
|
|
471
820
|
"id": f"BGATE-{finding_seq:04d}",
|
|
472
|
-
"rule_id":
|
|
821
|
+
"rule_id": rule_id,
|
|
473
822
|
"level": "error",
|
|
474
823
|
"severity": "CRITICAL",
|
|
475
824
|
"confidence": 1.0,
|
|
@@ -477,10 +826,14 @@ def main() -> int:
|
|
|
477
826
|
f"Verification command #{idx + 1} raised "
|
|
478
827
|
f"{e.__class__.__name__}: {e}."
|
|
479
828
|
),
|
|
480
|
-
"file": ".devlyn/spec-verify.json",
|
|
829
|
+
"file": ".devlyn/risk-probes.jsonl" if vc.get("_risk_probe") else ".devlyn/spec-verify.json",
|
|
481
830
|
"line": 1,
|
|
482
831
|
"phase": "build_gate",
|
|
483
|
-
"criterion_ref":
|
|
832
|
+
"criterion_ref": (
|
|
833
|
+
f"risk-probe:{vc.get('id')}"
|
|
834
|
+
if vc.get("_risk_probe")
|
|
835
|
+
else f"spec-verify://verification_commands/{idx}"
|
|
836
|
+
),
|
|
484
837
|
"fix_hint": (
|
|
485
838
|
f"Command `{cmd}` could not be executed. Check the work-dir "
|
|
486
839
|
f"state and any environment setup the command requires."
|