devlyn-cli 2.1.0 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. package/CLAUDE.md +1 -1
  2. package/benchmark/auto-resolve/README.md +321 -2
  3. package/benchmark/auto-resolve/RUBRIC.md +6 -0
  4. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +0 -1
  5. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
  6. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
  7. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
  8. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +51 -0
  10. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
  11. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
  12. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
  13. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
  14. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
  15. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
  16. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
  17. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +50 -0
  18. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
  19. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
  20. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
  21. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
  22. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
  23. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +57 -0
  24. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
  25. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
  26. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
  27. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
  28. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
  29. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
  30. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +51 -0
  31. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
  32. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
  33. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
  34. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
  35. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
  36. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +57 -0
  37. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
  38. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
  39. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
  40. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
  41. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +0 -1
  42. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
  43. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
  44. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
  45. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
  46. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +61 -0
  47. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
  48. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
  49. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
  50. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
  51. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
  52. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
  53. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
  54. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +64 -0
  55. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
  56. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
  57. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
  58. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
  59. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
  60. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
  61. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
  62. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +70 -0
  63. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
  64. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
  65. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
  66. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
  67. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
  68. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
  69. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
  70. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +64 -0
  71. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
  72. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
  73. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
  74. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
  75. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
  76. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
  77. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
  78. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
  79. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +68 -0
  80. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
  81. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
  82. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
  83. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
  84. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +0 -1
  85. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +0 -1
  86. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +0 -1
  87. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +0 -1
  88. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +0 -1
  89. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +0 -3
  90. package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
  91. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
  92. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
  93. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
  94. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
  95. package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
  96. package/benchmark/auto-resolve/scripts/judge.sh +82 -3
  97. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +0 -11
  98. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +0 -10
  99. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
  100. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
  101. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
  102. package/benchmark/auto-resolve/scripts/run-fixture.sh +257 -43
  103. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
  104. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
  105. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
  106. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
  107. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
  108. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
  109. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
  110. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
  111. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
  112. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
  113. package/config/skills/_shared/archive_run.py +3 -0
  114. package/config/skills/_shared/codex-config.md +2 -2
  115. package/config/skills/_shared/codex-monitored.sh +72 -7
  116. package/config/skills/_shared/collect-codex-findings.py +125 -0
  117. package/config/skills/_shared/engine-preflight.md +1 -1
  118. package/config/skills/_shared/expected.schema.json +18 -0
  119. package/config/skills/_shared/spec-verify-check.py +363 -10
  120. package/config/skills/_shared/verify-merge-findings.py +327 -0
  121. package/config/skills/devlyn:resolve/SKILL.md +69 -8
  122. package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
  123. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +183 -0
  124. package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
  125. package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
  126. package/package.json +1 -1
  127. package/scripts/lint-skills.sh +69 -20
@@ -66,6 +66,7 @@ import os
66
66
  import re
67
67
  import subprocess
68
68
  import sys
69
+ import tempfile
69
70
  from pathlib import Path
70
71
 
71
72
 
@@ -73,6 +74,50 @@ VERIFICATION_SECTION_RE = re.compile(
73
74
  r'(?ms)^##[ \t]+Verification\b[^\n]*\n(.*?)(?=^##[ \t]+|\Z)'
74
75
  )
75
76
  JSON_FENCE_RE = re.compile(r'(?ms)^```json[ \t]*\n(.*?)\n```[ \t]*$')
77
# --- Risk-probe validation tables -------------------------------------------

# A probe command matching this pattern is rejected because it points at
# benchmark fixture / verifier paths rather than at visible spec text.
FORBIDDEN_RISK_PROBE_CMD_RE = re.compile(
    r'BENCH_FIXTURE_DIR|benchmark/auto-resolve/fixtures|/verifiers/|verifiers/'
)

# Captures the authority component (everything up to the first slash,
# whitespace or quote) of every http(s) URL found in a command string.
EXTERNAL_URL_RE = re.compile(r"https?://([^/\s\"']+)", re.IGNORECASE)

# Hosts a probe command is allowed to contact.
LOCAL_URL_HOSTS = {
    'localhost',
    '127.0.0.1',
    '0.0.0.0',
    '[::1]',
    '::1',
}

# Closed vocabulary accepted in a probe's "tags" list.
RISK_PROBE_TAGS = {
    "ordering_inversion",
    "boundary_overlap",
    "prior_consumption",
    "rollback_state",
    "positive_remaining",
    "stdout_stderr_contract",
    "error_contract",
    "shape_contract",
}

# For each tag listed here, the probe's tag_evidence[tag] list must contain
# every item of the corresponding set. Tags without an entry need no evidence.
RISK_PROBE_REQUIRED_EVIDENCE = {
    "ordering_inversion": {
        "input_order_would_choose_wrong_winner",
        "asserts_processing_order_result",
    },
    "boundary_overlap": {
        "starts_at_blocked_start",
        "ends_at_blocked_end",
        "one_minute_overlap",
    },
    "prior_consumption": {
        "same_resource_consumed_first",
        "later_entity_fails_or_reroutes",
    },
    "rollback_state": {
        "failed_entity_tentative_state_absent",
        "later_entity_uses_released_state",
    },
    "positive_remaining": {
        "asserts_full_remaining_state",
        "zero_quantity_rows_absent",
    },
}
76
121
 
77
122
 
78
123
  def extract_verification_block(text: str) -> str | None:
@@ -89,6 +134,20 @@ def extract_verification_block(text: str) -> str | None:
89
134
  return fence.group(1) if fence else None
90
135
 
91
136
 
137
def extract_verification_text(text: str) -> str:
    """Return the text captured by VERIFICATION_SECTION_RE's first group.

    Yields the empty string when the pattern does not match ``text``.
    """
    found = VERIFICATION_SECTION_RE.search(text)
    if not found:
        return ""
    return found.group(1)
140
+
141
+
142
def external_url_hosts(text: str) -> list[str]:
    """Return the distinct non-local hosts of http(s) URLs found in ``text``.

    Hosts are lower-cased, listed in first-seen order, and exclude anything
    in LOCAL_URL_HOSTS. ``None``/empty input yields an empty list.
    """
    hosts: list[str] = []
    for match in EXTERNAL_URL_RE.finditer(text or ''):
        # The regex stops at '/', whitespace or a quote, so a URL without a
        # path can still carry "?query" or "#fragment" in the captured group.
        authority = re.split(r'[?#]', match.group(1), maxsplit=1)[0]
        hostport = authority.split('@')[-1]  # drop optional userinfo
        if hostport.startswith('['):
            # Bracketed IPv6 literal: the host runs through the closing
            # bracket. Splitting on ':' would reduce "[::1]:8080" to "[" and
            # misreport a loopback address as an external host.
            closing = hostport.find(']')
            host = hostport[:closing + 1] if closing != -1 else hostport
        else:
            host = hostport.split(':')[0]  # drop optional port
        host = host.lower()
        if host not in LOCAL_URL_HOSTS and host not in hosts:
            hosts.append(host)
    return hosts
149
+
150
+
92
151
  def validate_shape(data) -> str | None:
93
152
  """Return None if shape matches the canonical verification_commands
94
153
  schema; else a human-readable error string.
@@ -124,6 +183,132 @@ def validate_shape(data) -> str | None:
124
183
  return None
125
184
 
126
185
 
186
+ def validate_risk_probe(probe: object, index: int, verification_text: str) -> str | None:
187
+ if not isinstance(probe, dict):
188
+ return f"risk-probes[{index}] must be a JSON object"
189
+ probe_id = probe.get("id")
190
+ if not isinstance(probe_id, str) or not probe_id.strip():
191
+ return f"risk-probes[{index}].id must be a non-empty string"
192
+ derived_from = probe.get("derived_from")
193
+ if not isinstance(derived_from, str) or not derived_from.strip():
194
+ return f"risk-probes[{index}].derived_from must be a non-empty string"
195
+ if derived_from not in verification_text:
196
+ return (
197
+ f"risk-probes[{index}].derived_from must be an exact substring "
198
+ "of the source ## Verification section"
199
+ )
200
+ shape_err = validate_shape({"verification_commands": [probe]})
201
+ if shape_err:
202
+ return f"risk-probes[{index}]: {shape_err}"
203
+ cmd = probe.get("cmd", "")
204
+ if FORBIDDEN_RISK_PROBE_CMD_RE.search(cmd):
205
+ return (
206
+ f"risk-probes[{index}].cmd references hidden fixture/verifier paths; "
207
+ "risk probes must derive from visible spec text only"
208
+ )
209
+ external_hosts = external_url_hosts(cmd)
210
+ if external_hosts:
211
+ return (
212
+ f"risk-probes[{index}].cmd references external URL(s): "
213
+ f"{', '.join(external_hosts)}; use only worktree-local or localhost resources"
214
+ )
215
+ if len(cmd) > 4000:
216
+ return f"risk-probes[{index}].cmd exceeds 4000 characters"
217
+ tags = probe.get("tags")
218
+ if not isinstance(tags, list) or not tags or not all(isinstance(t, str) for t in tags):
219
+ return f"risk-probes[{index}].tags must be a non-empty list of strings"
220
+ unknown_tags = sorted(set(tags) - RISK_PROBE_TAGS)
221
+ if unknown_tags:
222
+ return f"risk-probes[{index}].tags contains unknown tag(s): {', '.join(unknown_tags)}"
223
+ if "error_contract" in tags and not re.search(
224
+ r'invalid|stderr|json[ -]?error|error object|exit[ `]*2',
225
+ derived_from,
226
+ re.IGNORECASE,
227
+ ):
228
+ return (
229
+ f"risk-probes[{index}].derived_from for error_contract must name "
230
+ "an invalid-input, stderr, JSON-error, or exit-2 verification bullet"
231
+ )
232
+ evidence = probe.get("tag_evidence")
233
+ if not isinstance(evidence, dict):
234
+ return f"risk-probes[{index}].tag_evidence must be an object"
235
+ for tag in tags:
236
+ required_evidence = RISK_PROBE_REQUIRED_EVIDENCE.get(tag)
237
+ if not required_evidence:
238
+ continue
239
+ actual = evidence.get(tag)
240
+ if not isinstance(actual, list) or not all(isinstance(item, str) for item in actual):
241
+ return f"risk-probes[{index}].tag_evidence.{tag} must be a list of strings"
242
+ missing_evidence = sorted(required_evidence - set(actual))
243
+ if missing_evidence:
244
+ return (
245
+ f"risk-probes[{index}].tag_evidence.{tag} missing required "
246
+ f"item(s): {', '.join(missing_evidence)}"
247
+ )
248
+ return None
249
+
250
+
251
def required_risk_probe_tags(verification_text: str) -> set[str]:
    """Return the probe tags implied by keywords in ``verification_text``.

    The text is lower-cased, then each trigger pattern below is searched
    for; every pattern that matches contributes its tag to the result.
    """
    lowered = verification_text.lower()
    triggers = (
        (r'priority|higher-priority|ordered by|ordering|appears first|input order',
         "ordering_inversion"),
        (r'blocked|overlap|forbidden|window',
         "boundary_overlap"),
        (r'rolls? back|reduce[s]? stock|available to later|later orders|remaining|stock',
         "prior_consumption"),
        (r'remaining',
         "positive_remaining"),
        (r'stderr|stdout|exit `?2`?|json error',
         "stdout_stderr_contract"),
    )
    return {tag for pattern, tag in triggers if re.search(pattern, lowered)}
265
+
266
+
267
+ def load_risk_probes(
268
+ devlyn_dir: Path,
269
+ source_md: Path | None,
270
+ *,
271
+ require_present: bool = False,
272
+ ) -> tuple[list[dict], str | None]:
273
+ probes_path = devlyn_dir / "risk-probes.jsonl"
274
+ if not probes_path.is_file():
275
+ if require_present:
276
+ return ([], "risk-probes.jsonl is required when --risk-probes is enabled")
277
+ return ([], None)
278
+ if source_md is None or not source_md.is_file():
279
+ return ([], "risk-probes.jsonl exists but source markdown is unavailable")
280
+
281
+ verification_text = extract_verification_text(source_md.read_text())
282
+ if not verification_text:
283
+ return ([], "risk-probes.jsonl exists but source has no ## Verification section")
284
+
285
+ probes: list[dict] = []
286
+ for index, line in enumerate(probes_path.read_text().splitlines()):
287
+ if not line.strip():
288
+ continue
289
+ try:
290
+ probe = json.loads(line)
291
+ except json.JSONDecodeError as e:
292
+ return ([], f"risk-probes[{index}] invalid JSON: {e}")
293
+ err = validate_risk_probe(probe, index, verification_text)
294
+ if err:
295
+ return ([], err)
296
+ normalized = dict(probe)
297
+ normalized["_risk_probe"] = True
298
+ normalized["_risk_probe_index"] = index
299
+ probes.append(normalized)
300
+ if len(probes) > 3:
301
+ return ([], "risk-probes.jsonl has more than 3 probes")
302
+ if require_present and not probes:
303
+ return ([], "risk-probes.jsonl must contain at least one probe")
304
+ if require_present:
305
+ present_tags = {tag for probe in probes for tag in probe.get("tags", [])}
306
+ missing_tags = sorted(required_risk_probe_tags(verification_text) - present_tags)
307
+ if missing_tags:
308
+ return ([], f"risk-probes.jsonl missing required probe tag(s): {', '.join(missing_tags)}")
309
+ return (probes, None)
310
+
311
+
127
312
  def read_source(work: Path, devlyn_dir: Path) -> tuple[str | None, Path | None]:
128
313
  """Return (source_type, markdown_path) from .devlyn/pipeline.state.json,
129
314
  or (None, None) if state is absent/unreadable. The markdown path is
@@ -237,7 +422,115 @@ def run_check_mode(md_path: Path) -> int:
237
422
  return 0
238
423
 
239
424
 
425
def run_self_test() -> int:
    """Exercise the risk-probe paths of this script in a temporary work dir.

    Re-invokes this script as a subprocess: once with
    ``--include-risk-probes`` on a well-formed probe (must exit 0), then with
    ``--validate-risk-probes`` on three malformed probes. Each malformed
    probe must be rejected, and stderr must contain the validation message
    for the defect under test, so that a rejection for some other reason
    does not count as a pass.

    Returns:
        0 when every case behaves as expected, 1 otherwise (with a
        diagnostic on stderr).
    """
    script_path = str(Path(__file__).resolve())
    with tempfile.TemporaryDirectory() as td:
        work = Path(td)
        devlyn = work / ".devlyn"
        devlyn.mkdir()
        spec_md = work / "spec.md"
        spec_md.write_text("# Spec\n\n## Verification\n\n- probe must pass visible marker.\n")
        (devlyn / "pipeline.state.json").write_text(json.dumps({
            "source": {"type": "spec", "spec_path": str(spec_md)}
        }))
        (devlyn / "spec-verify.json").write_text(json.dumps({
            "verification_commands": [
                {"cmd": "printf ok", "exit_code": 0, "stdout_contains": ["ok"]}
            ]
        }) + "\n")
        probes_path = devlyn / "risk-probes.jsonl"
        env = os.environ.copy()
        env["BENCH_WORKDIR"] = str(work)

        def run_with_probe(probe: dict, flag: str) -> subprocess.CompletedProcess:
            # Stage a single probe record, then run this script with ``flag``.
            probes_path.write_text(json.dumps(probe) + "\n")
            return subprocess.run(
                [sys.executable, script_path, flag],
                cwd=work,
                env=env,
                capture_output=True,
                text=True,
            )

        good = run_with_probe({
            "id": "P1",
            "derived_from": "probe must pass visible marker.",
            "cmd": "printf probe-ok",
            "exit_code": 0,
            "stdout_contains": ["probe-ok"],
            "stdout_not_contains": [],
            "tags": ["shape_contract"],
            "tag_evidence": {},
        }, "--include-risk-probes")
        if good.returncode != 0:
            print(good.stderr, file=sys.stderr)
            return 1

        # (probe, fragment of the expected rejection message, text printed
        # when the probe is wrongly accepted)
        negative_cases = [
            (
                {
                    "id": "P2",
                    "derived_from": "probe must pass visible marker.",
                    "cmd": "node $BENCH_FIXTURE_DIR/verifiers/hidden.js",
                    "exit_code": 0,
                },
                "references hidden fixture/verifier paths",
                "hidden verifier path was accepted",
            ),
            (
                {
                    "id": "P3",
                    "derived_from": "probe must pass visible marker.",
                    "cmd": "printf bad-error-derived-from",
                    "exit_code": 0,
                    "tags": ["error_contract"],
                    "tag_evidence": {"error_contract": []},
                },
                "derived_from for error_contract must name",
                "error_contract with unrelated derived_from was accepted",
            ),
            (
                {
                    "id": "P4",
                    "derived_from": "probe must pass visible marker.",
                    "cmd": "printf weak-boundary",
                    "exit_code": 0,
                    "tags": ["boundary_overlap"],
                    "tag_evidence": {"boundary_overlap": ["one_minute_overlap"]},
                },
                "tag_evidence.boundary_overlap missing required",
                "incomplete boundary_overlap evidence was accepted",
            ),
        ]
        for probe, expected_fragment, accepted_message in negative_cases:
            result = run_with_probe(probe, "--validate-risk-probes")
            if result.returncode == 0:
                print(accepted_message, file=sys.stderr)
                return 1
            if expected_fragment not in result.stderr:
                # Rejected, but not for the defect this case targets.
                print(
                    f"probe {probe['id']} was rejected for an unexpected reason: "
                    f"{result.stderr.strip()}",
                    file=sys.stderr,
                )
                return 1
        return 0
519
+
520
+
240
521
  def main() -> int:
522
+ include_risk_probes = False
523
+ validate_risk_probes_only = False
524
+ if "--include-risk-probes" in sys.argv[1:]:
525
+ include_risk_probes = True
526
+ sys.argv = [arg for arg in sys.argv if arg != "--include-risk-probes"]
527
+ if "--validate-risk-probes" in sys.argv[1:]:
528
+ validate_risk_probes_only = True
529
+ sys.argv = [arg for arg in sys.argv if arg != "--validate-risk-probes"]
530
+
531
+ if len(sys.argv) == 2 and sys.argv[1] == "--self-test":
532
+ return run_self_test()
533
+
241
534
  if len(sys.argv) >= 2 and sys.argv[1] == "--check":
242
535
  if len(sys.argv) != 3:
243
536
  print("usage: spec-verify-check.py --check <markdown-path>", file=sys.stderr)
@@ -275,6 +568,16 @@ def main() -> int:
275
568
  pre_staged = spec_path.is_file() # captured BEFORE any potential write
276
569
  trust_bench_staged = bench_mode and pre_staged
277
570
  src_type, source_md = read_source(work, devlyn_dir)
571
+ if validate_risk_probes_only:
572
+ _risk_probes, risk_error = load_risk_probes(
573
+ devlyn_dir, source_md, require_present=True
574
+ )
575
+ if risk_error:
576
+ print(f"[spec-verify] risk probes malformed: {risk_error}", file=sys.stderr)
577
+ write_malformed_finding(devlyn_dir, risk_error, devlyn_dir / "risk-probes.jsonl")
578
+ return 1
579
+ print("[spec-verify] risk probes valid", file=sys.stderr)
580
+ return 0
278
581
  if source_md is not None and not trust_bench_staged:
279
582
  staged, error = stage_from_source(source_md, devlyn_dir)
280
583
  if error is not None:
@@ -334,7 +637,14 @@ def main() -> int:
334
637
  print(f"[spec-verify] error: {spec_path}: {shape_err}", file=sys.stderr)
335
638
  write_malformed_finding(devlyn_dir, f"{spec_path}: {shape_err}", None)
336
639
  return 1
337
- commands = spec["verification_commands"]
640
+ commands = list(spec["verification_commands"])
641
+ if include_risk_probes:
642
+ risk_probes, risk_error = load_risk_probes(devlyn_dir, source_md)
643
+ if risk_error:
644
+ print(f"[spec-verify] risk probes malformed: {risk_error}", file=sys.stderr)
645
+ write_malformed_finding(devlyn_dir, risk_error, devlyn_dir / "risk-probes.jsonl")
646
+ return 1
647
+ commands.extend(risk_probes)
338
648
 
339
649
  devlyn_dir.mkdir(parents=True, exist_ok=True)
340
650
  results_path = devlyn_dir / "spec-verify.results.json"
@@ -354,6 +664,7 @@ def main() -> int:
354
664
  "reason": "missing_cmd"})
355
665
  continue
356
666
 
667
+ is_risk_probe = bool(vc.get("_risk_probe"))
357
668
  expected_exit = vc.get("exit_code", 0)
358
669
  stdout_contains = vc.get("stdout_contains", []) or []
359
670
  stdout_not_contains = vc.get("stdout_not_contains", []) or []
@@ -423,17 +734,41 @@ def main() -> int:
423
734
  f"contains={stdout_contains}, not_contains={stdout_not_contains})."
424
735
  )
425
736
 
737
+ rule_id = (
738
+ "correctness.risk-probe-failed"
739
+ if is_risk_probe
740
+ else "correctness.spec-literal-mismatch"
741
+ )
742
+ criterion_ref = (
743
+ f"risk-probe:{vc.get('id')}"
744
+ if is_risk_probe
745
+ else f"spec-verify://verification_commands/{idx}"
746
+ )
747
+ file_ref = (
748
+ ".devlyn/risk-probes.jsonl"
749
+ if is_risk_probe
750
+ else ".devlyn/spec-verify.json"
751
+ )
752
+ if is_risk_probe:
753
+ fix_hint = (
754
+ f"Risk probe `{vc.get('id')}` derived from "
755
+ f"{vc.get('derived_from')!r} failed. See "
756
+ ".devlyn/spec-verify.results.json for captured output "
757
+ "and update the implementation to satisfy the visible "
758
+ "verification bullet."
759
+ )
760
+
426
761
  findings.append({
427
762
  "id": f"BGATE-{finding_seq:04d}",
428
- "rule_id": "correctness.spec-literal-mismatch",
763
+ "rule_id": rule_id,
429
764
  "level": "error",
430
765
  "severity": "CRITICAL",
431
766
  "confidence": 1.0,
432
767
  "message": msg,
433
- "file": ".devlyn/spec-verify.json",
768
+ "file": file_ref,
434
769
  "line": 1,
435
770
  "phase": "build_gate",
436
- "criterion_ref": f"spec-verify://verification_commands/{idx}",
771
+ "criterion_ref": criterion_ref,
437
772
  "fix_hint": fix_hint,
438
773
  "blocking": True,
439
774
  "status": "open",
@@ -443,19 +778,28 @@ def main() -> int:
443
778
  except subprocess.TimeoutExpired:
444
779
  results.append({"index": idx, "cmd": cmd, "pass": False,
445
780
  "reason": "timeout"})
781
+ rule_id = (
782
+ "correctness.risk-probe-failed"
783
+ if vc.get("_risk_probe")
784
+ else "correctness.spec-literal-mismatch"
785
+ )
446
786
  findings.append({
447
787
  "id": f"BGATE-{finding_seq:04d}",
448
- "rule_id": "correctness.spec-literal-mismatch",
788
+ "rule_id": rule_id,
449
789
  "level": "error",
450
790
  "severity": "CRITICAL",
451
791
  "confidence": 1.0,
452
792
  "message": (
453
793
  f"Verification command #{idx + 1} timed out after 60s."
454
794
  ),
455
- "file": ".devlyn/spec-verify.json",
795
+ "file": ".devlyn/risk-probes.jsonl" if vc.get("_risk_probe") else ".devlyn/spec-verify.json",
456
796
  "line": 1,
457
797
  "phase": "build_gate",
458
- "criterion_ref": f"spec-verify://verification_commands/{idx}",
798
+ "criterion_ref": (
799
+ f"risk-probe:{vc.get('id')}"
800
+ if vc.get("_risk_probe")
801
+ else f"spec-verify://verification_commands/{idx}"
802
+ ),
459
803
  "fix_hint": (
460
804
  f"Command `{cmd}` exceeded 60s. Reduce work or fix a "
461
805
  f"hang in the implementation."
@@ -467,9 +811,14 @@ def main() -> int:
467
811
  except Exception as e: # noqa: BLE001 — surface any harness error explicitly
468
812
  results.append({"index": idx, "cmd": cmd, "pass": False,
469
813
  "reason": f"error:{e.__class__.__name__}:{e}"})
814
+ rule_id = (
815
+ "correctness.risk-probe-failed"
816
+ if vc.get("_risk_probe")
817
+ else "correctness.spec-literal-mismatch"
818
+ )
470
819
  findings.append({
471
820
  "id": f"BGATE-{finding_seq:04d}",
472
- "rule_id": "correctness.spec-literal-mismatch",
821
+ "rule_id": rule_id,
473
822
  "level": "error",
474
823
  "severity": "CRITICAL",
475
824
  "confidence": 1.0,
@@ -477,10 +826,14 @@ def main() -> int:
477
826
  f"Verification command #{idx + 1} raised "
478
827
  f"{e.__class__.__name__}: {e}."
479
828
  ),
480
- "file": ".devlyn/spec-verify.json",
829
+ "file": ".devlyn/risk-probes.jsonl" if vc.get("_risk_probe") else ".devlyn/spec-verify.json",
481
830
  "line": 1,
482
831
  "phase": "build_gate",
483
- "criterion_ref": f"spec-verify://verification_commands/{idx}",
832
+ "criterion_ref": (
833
+ f"risk-probe:{vc.get('id')}"
834
+ if vc.get("_risk_probe")
835
+ else f"spec-verify://verification_commands/{idx}"
836
+ ),
484
837
  "fix_hint": (
485
838
  f"Command `{cmd}` could not be executed. Check the work-dir "
486
839
  f"state and any environment setup the command requires."