devlyn-cli 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. package/CLAUDE.md +1 -1
  2. package/benchmark/auto-resolve/README.md +318 -2
  3. package/benchmark/auto-resolve/RUBRIC.md +6 -0
  4. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
  5. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
  6. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
  7. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
  8. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +52 -0
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
  10. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
  11. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
  12. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
  13. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
  14. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
  15. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
  16. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +51 -0
  17. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
  18. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
  19. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
  20. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
  21. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
  22. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +58 -0
  23. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
  24. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
  25. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
  26. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
  27. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
  28. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
  29. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +52 -0
  30. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
  31. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
  32. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
  33. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
  34. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
  35. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +58 -0
  36. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
  37. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
  38. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
  39. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
  40. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
  41. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
  42. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
  43. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
  44. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +62 -0
  45. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
  46. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
  47. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
  48. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
  49. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
  50. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
  51. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
  52. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +65 -0
  53. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
  54. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
  55. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
  56. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
  57. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
  58. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
  59. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
  60. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +71 -0
  61. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
  62. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
  63. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
  64. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
  65. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
  66. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
  67. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
  68. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +65 -0
  69. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
  70. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
  71. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
  72. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
  73. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
  74. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
  75. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
  76. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
  77. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +69 -0
  78. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
  79. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
  80. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
  81. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
  82. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/NOTES.md +24 -0
  83. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/expected.json +66 -0
  84. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/metadata.json +10 -0
  85. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/setup.sh +22 -0
  86. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/spec.md +62 -0
  87. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/task.txt +9 -0
  88. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/exact-success.js +48 -0
  89. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/insufficient-balance.js +36 -0
  90. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/rules-source.js +55 -0
  91. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/NOTES.md +20 -0
  92. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/expected.json +66 -0
  93. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/metadata.json +10 -0
  94. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/setup.sh +23 -0
  95. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/spec.md +66 -0
  96. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/task.txt +11 -0
  97. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/exact-success.js +44 -0
  98. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/rules-source.js +58 -0
  99. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/unavailable-inventory.js +35 -0
  100. package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
  101. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
  102. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
  103. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
  104. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
  105. package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
  106. package/benchmark/auto-resolve/scripts/judge.sh +82 -3
  107. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
  108. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
  109. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
  110. package/benchmark/auto-resolve/scripts/run-fixture.sh +234 -40
  111. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
  112. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
  113. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
  114. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
  115. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
  116. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
  117. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
  118. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
  119. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
  120. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
  121. package/config/skills/_shared/archive_run.py +3 -0
  122. package/config/skills/_shared/codex-config.md +2 -2
  123. package/config/skills/_shared/codex-monitored.sh +72 -7
  124. package/config/skills/_shared/collect-codex-findings.py +125 -0
  125. package/config/skills/_shared/engine-preflight.md +1 -1
  126. package/config/skills/_shared/expected.schema.json +18 -0
  127. package/config/skills/_shared/spec-verify-check.py +312 -10
  128. package/config/skills/_shared/verify-merge-findings.py +327 -0
  129. package/config/skills/devlyn:resolve/SKILL.md +62 -8
  130. package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
  131. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +164 -0
  132. package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
  133. package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
  134. package/package.json +1 -1
  135. package/scripts/lint-skills.sh +32 -0
@@ -66,6 +66,7 @@ import os
66
66
  import re
67
67
  import subprocess
68
68
  import sys
69
+ import tempfile
69
70
  from pathlib import Path
70
71
 
71
72
 
@@ -73,6 +74,42 @@ VERIFICATION_SECTION_RE = re.compile(
73
74
  r'(?ms)^##[ \t]+Verification\b[^\n]*\n(.*?)(?=^##[ \t]+|\Z)'
74
75
  )
75
76
  JSON_FENCE_RE = re.compile(r'(?ms)^```json[ \t]*\n(.*?)\n```[ \t]*$')
77
# Pattern for probe commands that mention benchmark fixture or verifier paths.
# validate_risk_probe rejects any probe whose `cmd` matches it.
FORBIDDEN_RISK_PROBE_CMD_RE = re.compile(
    r'BENCH_FIXTURE_DIR|benchmark/auto-resolve/fixtures|/verifiers/|verifiers/'
)
# Closed vocabulary of probe tags; validate_risk_probe reports any tag outside
# this set as unknown.
RISK_PROBE_TAGS = {
    "ordering_inversion",
    "boundary_overlap",
    "prior_consumption",
    "rollback_state",
    "positive_remaining",
    "stdout_stderr_contract",
    "error_contract",
    "shape_contract",
}
# Evidence items a probe must list under `tag_evidence[tag]` for each tag it
# carries. Tags with no entry here ("stdout_stderr_contract", "error_contract",
# "shape_contract") have no evidence requirement.
RISK_PROBE_REQUIRED_EVIDENCE = {
    "ordering_inversion": {
        "input_order_would_choose_wrong_winner",
        "asserts_processing_order_result",
    },
    "boundary_overlap": {
        "starts_at_blocked_start",
        "ends_at_blocked_end",
        "one_minute_overlap",
    },
    "prior_consumption": {
        "same_resource_consumed_first",
        "later_entity_fails_or_reroutes",
    },
    "rollback_state": {
        "failed_entity_tentative_state_absent",
        "later_entity_uses_released_state",
    },
    "positive_remaining": {
        "asserts_full_remaining_state",
        "zero_quantity_rows_absent",
    },
}
76
113
 
77
114
 
78
115
  def extract_verification_block(text: str) -> str | None:
@@ -89,6 +126,11 @@ def extract_verification_block(text: str) -> str | None:
89
126
  return fence.group(1) if fence else None
90
127
 
91
128
 
129
def extract_verification_text(text: str) -> str:
    """Return the body of the first ``## Verification`` section of ``text``.

    The body is whatever ``VERIFICATION_SECTION_RE`` captures: the lines after
    the heading up to the next ``## `` heading or the end of the document.
    An empty string is returned when no such section exists.
    """
    match = VERIFICATION_SECTION_RE.search(text)
    if match is None:
        return ""
    return match.group(1)
132
+
133
+
92
134
  def validate_shape(data) -> str | None:
93
135
  """Return None if shape matches the canonical verification_commands
94
136
  schema; else a human-readable error string.
@@ -124,6 +166,117 @@ def validate_shape(data) -> str | None:
124
166
  return None
125
167
 
126
168
 
169
+ def validate_risk_probe(probe: object, index: int, verification_text: str) -> str | None:
170
+ if not isinstance(probe, dict):
171
+ return f"risk-probes[{index}] must be a JSON object"
172
+ probe_id = probe.get("id")
173
+ if not isinstance(probe_id, str) or not probe_id.strip():
174
+ return f"risk-probes[{index}].id must be a non-empty string"
175
+ derived_from = probe.get("derived_from")
176
+ if not isinstance(derived_from, str) or not derived_from.strip():
177
+ return f"risk-probes[{index}].derived_from must be a non-empty string"
178
+ if derived_from not in verification_text:
179
+ return (
180
+ f"risk-probes[{index}].derived_from must be an exact substring "
181
+ "of the source ## Verification section"
182
+ )
183
+ shape_err = validate_shape({"verification_commands": [probe]})
184
+ if shape_err:
185
+ return f"risk-probes[{index}]: {shape_err}"
186
+ cmd = probe.get("cmd", "")
187
+ if FORBIDDEN_RISK_PROBE_CMD_RE.search(cmd):
188
+ return (
189
+ f"risk-probes[{index}].cmd references hidden fixture/verifier paths; "
190
+ "risk probes must derive from visible spec text only"
191
+ )
192
+ if len(cmd) > 4000:
193
+ return f"risk-probes[{index}].cmd exceeds 4000 characters"
194
+ tags = probe.get("tags")
195
+ if not isinstance(tags, list) or not tags or not all(isinstance(t, str) for t in tags):
196
+ return f"risk-probes[{index}].tags must be a non-empty list of strings"
197
+ unknown_tags = sorted(set(tags) - RISK_PROBE_TAGS)
198
+ if unknown_tags:
199
+ return f"risk-probes[{index}].tags contains unknown tag(s): {', '.join(unknown_tags)}"
200
+ evidence = probe.get("tag_evidence")
201
+ if not isinstance(evidence, dict):
202
+ return f"risk-probes[{index}].tag_evidence must be an object"
203
+ for tag in tags:
204
+ required_evidence = RISK_PROBE_REQUIRED_EVIDENCE.get(tag)
205
+ if not required_evidence:
206
+ continue
207
+ actual = evidence.get(tag)
208
+ if not isinstance(actual, list) or not all(isinstance(item, str) for item in actual):
209
+ return f"risk-probes[{index}].tag_evidence.{tag} must be a list of strings"
210
+ missing_evidence = sorted(required_evidence - set(actual))
211
+ if missing_evidence:
212
+ return (
213
+ f"risk-probes[{index}].tag_evidence.{tag} missing required "
214
+ f"item(s): {', '.join(missing_evidence)}"
215
+ )
216
+ return None
217
+
218
+
219
def required_risk_probe_tags(verification_text: str) -> set[str]:
    """Infer which probe tags the verification prose calls for.

    The text is lower-cased and scanned for keyword patterns; each pattern
    that matches anywhere adds its tag. ``positive_remaining`` is added
    whenever the word "remaining" occurs. Returns an empty set when nothing
    matches.
    """
    lowered = verification_text.lower()
    keyword_rules = (
        ("ordering_inversion",
         r'priority|higher-priority|ordered by|ordering|appears first|input order'),
        ("boundary_overlap",
         r'blocked|overlap|forbidden|window'),
        ("prior_consumption",
         r'rolls? back|reduce[s]? stock|available to later|later orders|remaining|stock'),
        ("stdout_stderr_contract",
         r'stderr|stdout|exit `?2`?|json error'),
    )
    needed: set[str] = {
        tag for tag, pattern in keyword_rules if re.search(pattern, lowered)
    }
    if lowered.find("remaining") != -1:
        needed.add("positive_remaining")
    return needed
233
+
234
+
235
+ def load_risk_probes(
236
+ devlyn_dir: Path,
237
+ source_md: Path | None,
238
+ *,
239
+ require_present: bool = False,
240
+ ) -> tuple[list[dict], str | None]:
241
+ probes_path = devlyn_dir / "risk-probes.jsonl"
242
+ if not probes_path.is_file():
243
+ if require_present:
244
+ return ([], "risk-probes.jsonl is required when --risk-probes is enabled")
245
+ return ([], None)
246
+ if source_md is None or not source_md.is_file():
247
+ return ([], "risk-probes.jsonl exists but source markdown is unavailable")
248
+
249
+ verification_text = extract_verification_text(source_md.read_text())
250
+ if not verification_text:
251
+ return ([], "risk-probes.jsonl exists but source has no ## Verification section")
252
+
253
+ probes: list[dict] = []
254
+ for index, line in enumerate(probes_path.read_text().splitlines()):
255
+ if not line.strip():
256
+ continue
257
+ try:
258
+ probe = json.loads(line)
259
+ except json.JSONDecodeError as e:
260
+ return ([], f"risk-probes[{index}] invalid JSON: {e}")
261
+ err = validate_risk_probe(probe, index, verification_text)
262
+ if err:
263
+ return ([], err)
264
+ normalized = dict(probe)
265
+ normalized["_risk_probe"] = True
266
+ normalized["_risk_probe_index"] = index
267
+ probes.append(normalized)
268
+ if len(probes) > 3:
269
+ return ([], "risk-probes.jsonl has more than 3 probes")
270
+ if require_present and not probes:
271
+ return ([], "risk-probes.jsonl must contain at least one probe")
272
+ if require_present:
273
+ present_tags = {tag for probe in probes for tag in probe.get("tags", [])}
274
+ missing_tags = sorted(required_risk_probe_tags(verification_text) - present_tags)
275
+ if missing_tags:
276
+ return ([], f"risk-probes.jsonl missing required probe tag(s): {', '.join(missing_tags)}")
277
+ return (probes, None)
278
+
279
+
127
280
  def read_source(work: Path, devlyn_dir: Path) -> tuple[str | None, Path | None]:
128
281
  """Return (source_type, markdown_path) from .devlyn/pipeline.state.json,
129
282
  or (None, None) if state is absent/unreadable. The markdown path is
@@ -237,7 +390,96 @@ def run_check_mode(md_path: Path) -> int:
237
390
  return 0
238
391
 
239
392
 
393
def run_self_test() -> int:
    """Exercise the risk-probe paths end to end by re-invoking this script.

    Builds a throwaway work dir holding a spec, a pipeline state file, a
    ``spec-verify.json`` and a ``risk-probes.jsonl``, then runs three child
    processes with ``BENCH_WORKDIR`` set to that dir:

    1. ``--include-risk-probes`` with a well-formed probe; must exit 0.
    2. ``--validate-risk-probes`` with a probe whose ``cmd`` references a
       hidden verifier path; must be rejected for that reason.
    3. ``--validate-risk-probes`` with incomplete ``boundary_overlap``
       evidence; must be rejected for that reason.

    The negative cases check the rejection reason on stderr as well as the
    exit code, so an unrelated failure cannot pass as a correct rejection.

    Returns:
        0 when all three runs behave as expected, otherwise 1 after printing
        a diagnostic to stderr.
    """
    script_path = str(Path(__file__).resolve())
    with tempfile.TemporaryDirectory() as td:
        work = Path(td)
        devlyn = work / ".devlyn"
        devlyn.mkdir()
        spec_md = work / "spec.md"
        spec_md.write_text("# Spec\n\n## Verification\n\n- probe must pass visible marker.\n")
        (devlyn / "pipeline.state.json").write_text(json.dumps({
            "source": {"type": "spec", "spec_path": str(spec_md)}
        }))
        (devlyn / "spec-verify.json").write_text(json.dumps({
            "verification_commands": [
                {"cmd": "printf ok", "exit_code": 0, "stdout_contains": ["ok"]}
            ]
        }) + "\n")
        env = os.environ.copy()
        env["BENCH_WORKDIR"] = str(work)

        def run_with_probe(probe: dict, flag: str) -> subprocess.CompletedProcess:
            """Write ``probe`` as the only JSONL entry and run this script with ``flag``."""
            (devlyn / "risk-probes.jsonl").write_text(json.dumps(probe) + "\n")
            return subprocess.run(
                [sys.executable, script_path, flag],
                cwd=work,
                env=env,
                capture_output=True,
                text=True,
            )

        def rejected_for(proc: subprocess.CompletedProcess, reason: str, accepted_msg: str) -> bool:
            """True only if ``proc`` failed and its stderr names ``reason``."""
            if proc.returncode == 0:
                print(accepted_msg, file=sys.stderr)
                return False
            if reason not in proc.stderr:
                print(
                    f"probe was rejected, but not for the expected reason ({reason!r}):\n"
                    f"{proc.stderr}",
                    file=sys.stderr,
                )
                return False
            return True

        good = run_with_probe({
            "id": "P1",
            "derived_from": "probe must pass visible marker.",
            "cmd": "printf probe-ok",
            "exit_code": 0,
            "stdout_contains": ["probe-ok"],
            "stdout_not_contains": [],
            "tags": ["shape_contract"],
            "tag_evidence": {},
        }, "--include-risk-probes")
        if good.returncode != 0:
            print(good.stderr, file=sys.stderr)
            return 1

        bad = run_with_probe({
            "id": "P2",
            "derived_from": "probe must pass visible marker.",
            "cmd": "node $BENCH_FIXTURE_DIR/verifiers/hidden.js",
            "exit_code": 0,
        }, "--validate-risk-probes")
        if not rejected_for(
            bad,
            "references hidden fixture/verifier paths",
            "hidden verifier path was accepted",
        ):
            return 1

        weak = run_with_probe({
            "id": "P3",
            "derived_from": "probe must pass visible marker.",
            "cmd": "printf weak-boundary",
            "exit_code": 0,
            "tags": ["boundary_overlap"],
            "tag_evidence": {"boundary_overlap": ["one_minute_overlap"]},
        }, "--validate-risk-probes")
        if not rejected_for(
            weak,
            "tag_evidence.boundary_overlap missing required",
            "incomplete boundary_overlap evidence was accepted",
        ):
            return 1
    return 0
468
+
469
+
240
470
  def main() -> int:
471
+ include_risk_probes = False
472
+ validate_risk_probes_only = False
473
+ if "--include-risk-probes" in sys.argv[1:]:
474
+ include_risk_probes = True
475
+ sys.argv = [arg for arg in sys.argv if arg != "--include-risk-probes"]
476
+ if "--validate-risk-probes" in sys.argv[1:]:
477
+ validate_risk_probes_only = True
478
+ sys.argv = [arg for arg in sys.argv if arg != "--validate-risk-probes"]
479
+
480
+ if len(sys.argv) == 2 and sys.argv[1] == "--self-test":
481
+ return run_self_test()
482
+
241
483
  if len(sys.argv) >= 2 and sys.argv[1] == "--check":
242
484
  if len(sys.argv) != 3:
243
485
  print("usage: spec-verify-check.py --check <markdown-path>", file=sys.stderr)
@@ -275,6 +517,16 @@ def main() -> int:
275
517
  pre_staged = spec_path.is_file() # captured BEFORE any potential write
276
518
  trust_bench_staged = bench_mode and pre_staged
277
519
  src_type, source_md = read_source(work, devlyn_dir)
520
+ if validate_risk_probes_only:
521
+ _risk_probes, risk_error = load_risk_probes(
522
+ devlyn_dir, source_md, require_present=True
523
+ )
524
+ if risk_error:
525
+ print(f"[spec-verify] risk probes malformed: {risk_error}", file=sys.stderr)
526
+ write_malformed_finding(devlyn_dir, risk_error, devlyn_dir / "risk-probes.jsonl")
527
+ return 1
528
+ print("[spec-verify] risk probes valid", file=sys.stderr)
529
+ return 0
278
530
  if source_md is not None and not trust_bench_staged:
279
531
  staged, error = stage_from_source(source_md, devlyn_dir)
280
532
  if error is not None:
@@ -334,7 +586,14 @@ def main() -> int:
334
586
  print(f"[spec-verify] error: {spec_path}: {shape_err}", file=sys.stderr)
335
587
  write_malformed_finding(devlyn_dir, f"{spec_path}: {shape_err}", None)
336
588
  return 1
337
- commands = spec["verification_commands"]
589
+ commands = list(spec["verification_commands"])
590
+ if include_risk_probes:
591
+ risk_probes, risk_error = load_risk_probes(devlyn_dir, source_md)
592
+ if risk_error:
593
+ print(f"[spec-verify] risk probes malformed: {risk_error}", file=sys.stderr)
594
+ write_malformed_finding(devlyn_dir, risk_error, devlyn_dir / "risk-probes.jsonl")
595
+ return 1
596
+ commands.extend(risk_probes)
338
597
 
339
598
  devlyn_dir.mkdir(parents=True, exist_ok=True)
340
599
  results_path = devlyn_dir / "spec-verify.results.json"
@@ -354,6 +613,7 @@ def main() -> int:
354
613
  "reason": "missing_cmd"})
355
614
  continue
356
615
 
616
+ is_risk_probe = bool(vc.get("_risk_probe"))
357
617
  expected_exit = vc.get("exit_code", 0)
358
618
  stdout_contains = vc.get("stdout_contains", []) or []
359
619
  stdout_not_contains = vc.get("stdout_not_contains", []) or []
@@ -423,17 +683,41 @@ def main() -> int:
423
683
  f"contains={stdout_contains}, not_contains={stdout_not_contains})."
424
684
  )
425
685
 
686
+ rule_id = (
687
+ "correctness.risk-probe-failed"
688
+ if is_risk_probe
689
+ else "correctness.spec-literal-mismatch"
690
+ )
691
+ criterion_ref = (
692
+ f"risk-probe:{vc.get('id')}"
693
+ if is_risk_probe
694
+ else f"spec-verify://verification_commands/{idx}"
695
+ )
696
+ file_ref = (
697
+ ".devlyn/risk-probes.jsonl"
698
+ if is_risk_probe
699
+ else ".devlyn/spec-verify.json"
700
+ )
701
+ if is_risk_probe:
702
+ fix_hint = (
703
+ f"Risk probe `{vc.get('id')}` derived from "
704
+ f"{vc.get('derived_from')!r} failed. See "
705
+ ".devlyn/spec-verify.results.json for captured output "
706
+ "and update the implementation to satisfy the visible "
707
+ "verification bullet."
708
+ )
709
+
426
710
  findings.append({
427
711
  "id": f"BGATE-{finding_seq:04d}",
428
- "rule_id": "correctness.spec-literal-mismatch",
712
+ "rule_id": rule_id,
429
713
  "level": "error",
430
714
  "severity": "CRITICAL",
431
715
  "confidence": 1.0,
432
716
  "message": msg,
433
- "file": ".devlyn/spec-verify.json",
717
+ "file": file_ref,
434
718
  "line": 1,
435
719
  "phase": "build_gate",
436
- "criterion_ref": f"spec-verify://verification_commands/{idx}",
720
+ "criterion_ref": criterion_ref,
437
721
  "fix_hint": fix_hint,
438
722
  "blocking": True,
439
723
  "status": "open",
@@ -443,19 +727,28 @@ def main() -> int:
443
727
  except subprocess.TimeoutExpired:
444
728
  results.append({"index": idx, "cmd": cmd, "pass": False,
445
729
  "reason": "timeout"})
730
+ rule_id = (
731
+ "correctness.risk-probe-failed"
732
+ if vc.get("_risk_probe")
733
+ else "correctness.spec-literal-mismatch"
734
+ )
446
735
  findings.append({
447
736
  "id": f"BGATE-{finding_seq:04d}",
448
- "rule_id": "correctness.spec-literal-mismatch",
737
+ "rule_id": rule_id,
449
738
  "level": "error",
450
739
  "severity": "CRITICAL",
451
740
  "confidence": 1.0,
452
741
  "message": (
453
742
  f"Verification command #{idx + 1} timed out after 60s."
454
743
  ),
455
- "file": ".devlyn/spec-verify.json",
744
+ "file": ".devlyn/risk-probes.jsonl" if vc.get("_risk_probe") else ".devlyn/spec-verify.json",
456
745
  "line": 1,
457
746
  "phase": "build_gate",
458
- "criterion_ref": f"spec-verify://verification_commands/{idx}",
747
+ "criterion_ref": (
748
+ f"risk-probe:{vc.get('id')}"
749
+ if vc.get("_risk_probe")
750
+ else f"spec-verify://verification_commands/{idx}"
751
+ ),
459
752
  "fix_hint": (
460
753
  f"Command `{cmd}` exceeded 60s. Reduce work or fix a "
461
754
  f"hang in the implementation."
@@ -467,9 +760,14 @@ def main() -> int:
467
760
  except Exception as e: # noqa: BLE001 — surface any harness error explicitly
468
761
  results.append({"index": idx, "cmd": cmd, "pass": False,
469
762
  "reason": f"error:{e.__class__.__name__}:{e}"})
763
+ rule_id = (
764
+ "correctness.risk-probe-failed"
765
+ if vc.get("_risk_probe")
766
+ else "correctness.spec-literal-mismatch"
767
+ )
470
768
  findings.append({
471
769
  "id": f"BGATE-{finding_seq:04d}",
472
- "rule_id": "correctness.spec-literal-mismatch",
770
+ "rule_id": rule_id,
473
771
  "level": "error",
474
772
  "severity": "CRITICAL",
475
773
  "confidence": 1.0,
@@ -477,10 +775,14 @@ def main() -> int:
477
775
  f"Verification command #{idx + 1} raised "
478
776
  f"{e.__class__.__name__}: {e}."
479
777
  ),
480
- "file": ".devlyn/spec-verify.json",
778
+ "file": ".devlyn/risk-probes.jsonl" if vc.get("_risk_probe") else ".devlyn/spec-verify.json",
481
779
  "line": 1,
482
780
  "phase": "build_gate",
483
- "criterion_ref": f"spec-verify://verification_commands/{idx}",
781
+ "criterion_ref": (
782
+ f"risk-probe:{vc.get('id')}"
783
+ if vc.get("_risk_probe")
784
+ else f"spec-verify://verification_commands/{idx}"
785
+ ),
484
786
  "fix_hint": (
485
787
  f"Command `{cmd}` could not be executed. Check the work-dir "
486
788
  f"state and any environment setup the command requires."