devlyn-cli 1.15.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158) hide show
  1. package/AGENTS.md +104 -0
  2. package/CLAUDE.md +135 -21
  3. package/README.md +43 -125
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
  5. package/benchmark/auto-resolve/README.md +114 -0
  6. package/benchmark/auto-resolve/RUBRIC.md +162 -0
  7. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
  9. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
  10. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
  11. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
  12. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
  13. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
  14. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
  16. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
  17. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
  18. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
  19. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
  20. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
  21. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
  22. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
  23. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
  27. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
  28. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
  29. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
  30. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
  31. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
  32. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
  33. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
  34. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
  35. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
  37. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
  39. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
  40. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
  41. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
  42. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
  43. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
  44. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
  46. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
  47. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
  48. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
  49. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
  50. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
  51. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
  52. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
  53. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
  54. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
  55. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
  57. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
  58. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
  59. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
  60. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
  61. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
  62. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
  63. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
  64. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
  65. package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
  66. package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
  67. package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
  68. package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
  69. package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
  70. package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
  71. package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
  72. package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
  73. package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
  74. package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
  75. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
  76. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
  77. package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
  78. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
  79. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
  80. package/benchmark/auto-resolve/scripts/judge.sh +359 -0
  81. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
  82. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
  83. package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
  84. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
  85. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
  86. package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
  87. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
  88. package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
  89. package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
  90. package/bin/devlyn.js +129 -17
  91. package/config/skills/_shared/adapters/README.md +64 -0
  92. package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
  93. package/config/skills/_shared/adapters/opus-4-7.md +29 -0
  94. package/config/skills/{devlyn:auto-resolve/scripts → _shared}/archive_run.py +26 -0
  95. package/config/skills/_shared/codex-config.md +54 -0
  96. package/config/skills/_shared/codex-monitored.sh +141 -0
  97. package/config/skills/_shared/engine-preflight.md +35 -0
  98. package/config/skills/_shared/expected.schema.json +93 -0
  99. package/config/skills/_shared/pair-plan-schema.md +298 -0
  100. package/config/skills/_shared/runtime-principles.md +110 -0
  101. package/config/skills/_shared/spec-verify-check.py +519 -0
  102. package/config/skills/devlyn:ideate/SKILL.md +99 -429
  103. package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
  104. package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
  105. package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
  106. package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
  107. package/config/skills/devlyn:resolve/SKILL.md +172 -184
  108. package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
  109. package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
  110. package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
  111. package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
  112. package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
  113. package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
  114. package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
  115. package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
  116. package/{config/skills → optional-skills}/devlyn:reap/SKILL.md +1 -0
  117. package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
  118. package/package.json +12 -2
  119. package/scripts/lint-skills.sh +431 -0
  120. package/config/skills/devlyn:auto-resolve/SKILL.md +0 -252
  121. package/config/skills/devlyn:auto-resolve/evals/evals.json +0 -21
  122. package/config/skills/devlyn:auto-resolve/evals/task-doctor-subcommand.md +0 -42
  123. package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -130
  124. package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -82
  125. package/config/skills/devlyn:auto-resolve/references/findings-schema.md +0 -103
  126. package/config/skills/devlyn:auto-resolve/references/phases/phase-1-build.md +0 -54
  127. package/config/skills/devlyn:auto-resolve/references/phases/phase-2-evaluate.md +0 -45
  128. package/config/skills/devlyn:auto-resolve/references/phases/phase-3-critic.md +0 -84
  129. package/config/skills/devlyn:auto-resolve/references/pipeline-routing.md +0 -114
  130. package/config/skills/devlyn:auto-resolve/references/pipeline-state.md +0 -201
  131. package/config/skills/devlyn:auto-resolve/scripts/terminal_verdict.py +0 -96
  132. package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
  133. package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
  134. package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
  135. package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
  136. package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
  137. package/config/skills/devlyn:clean/SKILL.md +0 -285
  138. package/config/skills/devlyn:design-ui/SKILL.md +0 -351
  139. package/config/skills/devlyn:discover-product/SKILL.md +0 -124
  140. package/config/skills/devlyn:evaluate/SKILL.md +0 -564
  141. package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
  142. package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
  143. package/config/skills/devlyn:ideate/references/codex-critic-template.md +0 -42
  144. package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
  145. package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
  146. package/config/skills/devlyn:preflight/SKILL.md +0 -355
  147. package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
  148. package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -86
  149. package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
  150. package/config/skills/devlyn:product-spec/SKILL.md +0 -603
  151. package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
  152. package/config/skills/devlyn:review/SKILL.md +0 -161
  153. package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
  154. package/config/skills/devlyn:team-review/SKILL.md +0 -493
  155. package/config/skills/devlyn:update-docs/SKILL.md +0 -463
  156. package/config/skills/workflow-routing/SKILL.md +0 -73
  157. /package/{config/skills → optional-skills}/devlyn:reap/scripts/reap.sh +0 -0
  158. /package/{config/skills → optional-skills}/devlyn:reap/scripts/scan.sh +0 -0
@@ -0,0 +1,519 @@
1
+ #!/usr/bin/env python3
2
+ """Spec literal verification gate (iter-0019.6 + iter-0019.8 + iter-0019.9
3
+ carrier).
4
+
5
+ Default mode (BUILD_GATE invocation, no args):
6
+ - Resolves the contract carrier in this priority order (iter-0019.8 + Codex
7
+ R2 + iter-0019.9 Codex R-phaseA fix):
8
+ (1) **Benchmark mode trust** (iter-0019.9 fix for the F9 regression): when
9
+ `BENCH_WORKDIR` is set AND `.devlyn/spec-verify.json` already exists
10
+ at script start, trust it as the run-fixture.sh-staged contract from
11
+ `expected.json` and skip source-extract entirely. Without this guard,
12
+ an ideate-generated spec's `## Verification` ```json``` block (e.g.
13
+ F9 e2e novice flow generates `commitCount`/`topAuthors` while
14
+ benchmark truth is `commits`/`authors`) silently overwrote the
15
+ authoritative benchmark contract. For benchmarks, expected.json is
16
+ canonical.
17
+ (2) Otherwise, source markdown extract — read `pipeline.state.json:
18
+ source.{spec_path | criteria_path}` and extract a `## Verification`
19
+ ```json``` block. If present, overwrite `.devlyn/spec-verify.json`.
20
+ This is the real-user carrier path; a pre-existing file from a
21
+ killed prior run is stale and must not be trusted in real-user mode.
22
+ (3) If no json block in source AND source.type=="generated": emit
23
+ CRITICAL `correctness.spec-verify-malformed` so the fix-loop reruns
24
+ BUILD.
25
+ (4) If no json block in source AND source.type=="spec": benchmark mode
26
+ with a pre-staged file would have hit branch (1). Without the
27
+ pre-staged file, benchmark falls through to no-op (rare — fixture
28
+ mis-config). Real-user mode silent no-op + drops any stale
29
+ pre-staged file (preserves iter-0019.6 backward compat for
30
+ handwritten specs without the carrier).
31
+ - For each verification_commands entry, runs the command in the work-dir,
32
+ captures combined stdout+stderr, and asserts exit_code matches +
33
+ stdout_contains all required literals + stdout_not_contains none of the
34
+ forbidden literals. Mirrors run-fixture.sh's post-run verifier semantics.
35
+
36
+ Check mode (`--check <markdown_path>`):
37
+ - Used by /devlyn:ideate after writing each item spec to validate that the
38
+ generated `## Verification` ```json``` block parses + matches the schema.
39
+ - Exits 0 if the block is well-formed (or absent — ideate's check applies
40
+ to both new specs that include the block and pre-carrier handwritten
41
+ specs that omit it; absence is not failure here, only malformed JSON or
42
+ shape error is). Exits 2 on malformed json or shape error.
43
+
44
+ Why: iter-0018.5's prompt-only contract enforcement was empirically dead
45
+ (F9 verify=0.4 across all engines in iter-0019). Same lesson as iter-0008
46
+ prompt-only engine constraint. Mechanical bash-gate enforcement is the
47
+ only working pattern. iter-0019.8 extends iter-0019.6 from benchmark-only
48
+ to real-user runs by extracting the contract from the spec/criteria
49
+ markdown directly — closes NORTH-STAR test #14.
50
+
51
+ Exit codes:
52
+ - 0: silent no-op (no source carrier, real-user mode) OR --check passed
53
+ OR all commands passed.
54
+ - 1: at least one command failed OR carrier malformed (generated source
55
+ required carrier, generated source had invalid json/shape, or pre-staged
56
+ file failed shape validation). All paths emit a CRITICAL finding to
57
+ `.devlyn/spec-verify-findings.jsonl`.
58
+ - 2: invocation error (unreadable spec-verify.json, missing markdown in
59
+ --check mode, etc.)
60
+ """
61
+
62
+ from __future__ import annotations
63
+
64
+ import json
65
+ import os
66
+ import re
67
+ import subprocess
68
+ import sys
69
+ from pathlib import Path
70
+
71
+
72
# Body of the first `## Verification` H2 section: everything after the heading
# line up to (not including) the next H2 heading or end of text.
VERIFICATION_SECTION_RE = re.compile(
    r'(?ms)^##[ \t]+Verification\b[^\n]*\n(.*?)(?=^##[ \t]+|\Z)'
)
# A ```json fenced block. The content group is lazily optional (`??`) so an
# empty fence (opening line immediately followed by the closing line) matches
# as an empty block instead of failing to match or swallowing text up to some
# later fence.
JSON_FENCE_RE = re.compile(r'(?ms)^```json[ \t]*\n(?:(.*?)\n)??```[ \t]*$')


def extract_verification_block(text: str) -> str | None:
    """Return the contents of the first ```json``` fenced block under the
    first `## Verification` H2 heading, or None if not found.

    Boundary: the fenced block must appear AFTER the `## Verification`
    heading and BEFORE the next H2 (`## ...`) heading or end-of-file.

    An empty fenced block yields the empty string (not None), so callers
    that feed the result to `json.loads` report it as malformed JSON rather
    than treating the carrier as absent.
    """
    section = VERIFICATION_SECTION_RE.search(text)
    if not section:
        return None
    fence = JSON_FENCE_RE.search(section.group(1))
    if not fence:
        return None
    # group(1) is None when the fence is empty (content group skipped).
    return fence.group(1) or ""
90
+
91
+
92
def validate_shape(data) -> str | None:
    """Check `data` against the verification_commands schema.

    Returns None when the shape is valid, otherwise a human-readable
    description of the first violation found.

    Expected shape: a JSON object whose `verification_commands` is a
    non-empty list of objects. Every entry needs a non-blank string `cmd`.
    `exit_code` is optional (defaults to 0) and must be an int that is not a
    bool — `bool` subclasses `int` in Python, so `True`/`False` are rejected
    explicitly. `stdout_contains` and `stdout_not_contains` are optional
    (default: empty list) and must be lists containing only strings.
    """
    if not isinstance(data, dict):
        return "top-level must be a JSON object"

    entries = data.get("verification_commands")
    if not isinstance(entries, list):
        return "verification_commands must be a list"
    if len(entries) == 0:
        return "verification_commands must contain at least one entry"

    literal_fields = ("stdout_contains", "stdout_not_contains")
    for pos, entry in enumerate(entries):
        where = f"verification_commands[{pos}]"

        if not isinstance(entry, dict):
            return f"{where} must be an object"

        command = entry.get("cmd")
        if not (isinstance(command, str) and command.strip()):
            return f"{where}.cmd must be a non-empty string"

        code = entry.get("exit_code", 0)
        # The bool test must come first: isinstance(True, int) is True.
        if isinstance(code, bool) or not isinstance(code, int):
            return f"{where}.exit_code must be int (not bool)"

        for field in literal_fields:
            literals = entry.get(field, [])
            if not isinstance(literals, list):
                return f"{where}.{field} must be a list of strings"
            if any(not isinstance(item, str) for item in literals):
                return f"{where}.{field} must be a list of strings"

    return None
125
+
126
+
127
def read_source(work: Path, devlyn_dir: Path) -> tuple[str | None, Path | None]:
    """Return (source_type, markdown_path) from .devlyn/pipeline.state.json.

    Returns (None, None) if the state file is absent, unreadable, not valid
    UTF-8 JSON, or not shaped as an object with an object-valued `source`.

    The markdown path is taken from `source.spec_path` when
    `source.type == "spec"` and from `source.criteria_path` when
    `source.type == "generated"`; any other type yields no path. A relative
    path is resolved against `work`. The path element of the result is None
    when the field is missing, empty, not a string, or does not point at an
    existing file.
    """
    state_path = devlyn_dir / "pipeline.state.json"
    if not state_path.is_file():
        return (None, None)
    try:
        state = json.loads(state_path.read_text(encoding="utf-8"))
    except (ValueError, OSError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return (None, None)
    # Valid JSON is not necessarily an object; a list/string/number state
    # file is treated the same as an unreadable one instead of crashing.
    if not isinstance(state, dict):
        return (None, None)
    src = state.get("source") or {}
    if not isinstance(src, dict):
        return (None, None)

    src_type = src.get("type")
    if src_type == "spec":
        md_path = src.get("spec_path")
    elif src_type == "generated":
        md_path = src.get("criteria_path")
    else:
        md_path = None
    # Guard against non-string values (e.g. a number), which Path() rejects.
    if not md_path or not isinstance(md_path, str):
        return (src_type, None)

    md = Path(md_path)
    if not md.is_absolute():
        md = work / md
    return (src_type, md if md.is_file() else None)
153
+
154
+
155
def stage_from_source(md: Path, devlyn_dir: Path) -> tuple[bool, str | None]:
    """Materialize .devlyn/spec-verify.json from the json block in `md`.

    Returns (staged, error):
      - (True, None): the block parsed and validated; spec-verify.json was
        written containing only the `verification_commands` key.
      - (False, <message>): the source could not be read, or the block was
        found but is invalid JSON / fails shape validation. The caller is
        expected to emit a CRITICAL finding.
      - (False, None): no json block in the source (handwritten spec or
        generated source missing the contract).

    The markdown is read as UTF-8 rather than the locale encoding so the
    result does not depend on the host's locale settings.
    """
    try:
        text = md.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        # Report read failures through the normal error channel instead of
        # letting a traceback escape without a finding being written.
        return (False, f"cannot read {md}: {e}")

    block = extract_verification_block(text)
    if block is None:
        return (False, None)
    try:
        data = json.loads(block)
    except json.JSONDecodeError as e:
        return (False, f"`## Verification` ```json``` block in {md} has invalid JSON: {e}")
    err = validate_shape(data)
    if err:
        return (False, f"`## Verification` ```json``` block in {md}: {err}")

    # Drop any extra top-level keys; only the command list is the contract.
    normalized = {"verification_commands": data["verification_commands"]}
    devlyn_dir.mkdir(parents=True, exist_ok=True)
    (devlyn_dir / "spec-verify.json").write_text(
        json.dumps(normalized, indent=2) + "\n", encoding="utf-8"
    )
    return (True, None)
177
+
178
+
179
def write_malformed_finding(devlyn_dir: Path, error: str, source_path: Path | None) -> None:
    """Write one blocking CRITICAL `spec-verify-malformed` finding.

    The finding is written as a single JSON line to
    `<devlyn_dir>/spec-verify-findings.jsonl`, replacing any previous
    contents of that file. `devlyn_dir` is created if it does not exist.
    The finding's `file` field is `source_path` when given, otherwise
    `.devlyn/pipeline.state.json`.
    """
    if source_path:
        referenced_file = str(source_path)
    else:
        referenced_file = ".devlyn/pipeline.state.json"

    hint = (
        "Fix the `## Verification` ```json``` block: a JSON object with "
        "a non-empty `verification_commands` array of "
        "{cmd, exit_code?, stdout_contains?, stdout_not_contains?} "
        "entries. See references/build-gate.md § 'Spec literal check'."
    )
    record = {
        "id": "BGATE-0001",
        "rule_id": "correctness.spec-verify-malformed",
        "level": "error",
        "severity": "CRITICAL",
        "confidence": 1.0,
        "message": f"Verification contract carrier is malformed: {error}",
        "file": referenced_file,
        "line": 1,
        "phase": "build_gate",
        "criterion_ref": "spec-verify://carrier",
        "fix_hint": hint,
        "blocking": True,
        "status": "open",
    }

    devlyn_dir.mkdir(parents=True, exist_ok=True)
    target = devlyn_dir / "spec-verify-findings.jsonl"
    # Truncating write: the file holds exactly this one finding afterwards.
    target.write_text(json.dumps(record) + "\n")
206
+
207
+
208
def run_check_mode(md_path: Path) -> int:
    """`--check <markdown>` — validate the verification carrier without
    running any commands. Used by /devlyn:ideate after item-spec write.

    Exit 0: no `## Verification` json block found, or the block is present
            and well-formed.
    Exit 2: the markdown file is missing or cannot be read as UTF-8, or the
            block is present but has invalid JSON / fails shape validation
            (so ideate can re-prompt).

    Diagnostics are printed to stderr.
    """
    if not md_path.is_file():
        print(f"[spec-verify --check] error: {md_path} not found", file=sys.stderr)
        return 2
    try:
        # Explicit UTF-8: the result must not depend on the host locale.
        text = md_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        print(f"[spec-verify --check] error: cannot read {md_path}: {e}", file=sys.stderr)
        return 2

    block = extract_verification_block(text)
    if block is None:
        # Section absent or no json block — opt-in nature preserved for
        # ideate (a spec without machine verification is still valid; it
        # just won't activate the BUILD_GATE gate).
        return 0
    try:
        data = json.loads(block)
    except json.JSONDecodeError as e:
        print(
            f"[spec-verify --check] {md_path}: invalid JSON in `## Verification` "
            f"```json``` block: {e}",
            file=sys.stderr,
        )
        return 2
    err = validate_shape(data)
    if err:
        print(f"[spec-verify --check] {md_path}: shape error: {err}", file=sys.stderr)
        return 2
    return 0
238
+
239
+
240
def _literal_finding(seq: int, idx: int, message: str, fix_hint: str) -> dict:
    """Build one blocking CRITICAL `spec-literal-mismatch` finding record
    for the verification command at zero-based position `idx`."""
    return {
        "id": f"BGATE-{seq:04d}",
        "rule_id": "correctness.spec-literal-mismatch",
        "level": "error",
        "severity": "CRITICAL",
        "confidence": 1.0,
        "message": message,
        "file": ".devlyn/spec-verify.json",
        "line": 1,
        "phase": "build_gate",
        "criterion_ref": f"spec-verify://verification_commands/{idx}",
        "fix_hint": fix_hint,
        "blocking": True,
        "status": "open",
    }


def main() -> int:
    """Entry point. Returns the process exit code.

    With `--check <markdown-path>` it delegates to `run_check_mode`.

    Otherwise it resolves the verification contract, runs every
    `verification_commands` entry through the shell in the work-dir
    (`BENCH_WORKDIR` if set and non-empty, else the current directory) with
    a 60 second timeout, and compares exit code and combined stdout+stderr
    against the contract. Results go to `.devlyn/spec-verify.results.json`
    and findings to `.devlyn/spec-verify-findings.jsonl`.

    Returns 0 on no-op or when all commands pass, 1 when the carrier is
    malformed or any command fails, 2 on usage error or when the staged
    spec-verify.json cannot be read/parsed.
    """
    if len(sys.argv) >= 2 and sys.argv[1] == "--check":
        if len(sys.argv) != 3:
            print("usage: spec-verify-check.py --check <markdown-path>", file=sys.stderr)
            return 2
        return run_check_mode(Path(sys.argv[2]))

    bench_mode = "BENCH_WORKDIR" in os.environ
    work = Path(os.environ.get("BENCH_WORKDIR") or os.getcwd())
    devlyn_dir = work / ".devlyn"
    spec_path = devlyn_dir / "spec-verify.json"

    # iter-0019.8 + iter-0019.9 (Codex R-phaseA): determine the contract
    # carrier source for THIS run. Order:
    #   1. Benchmark mode (BENCH_WORKDIR set) AND a pre-staged
    #      .devlyn/spec-verify.json exists at script start: TRUST it (this is
    #      the run-fixture.sh contract staged from expected.json). Skip
    #      source-extract entirely. iter-0019.9 closes the F9 regression where
    #      source-extract from an ideate-generated spec overwrote the
    #      benchmark contract — for benchmarks, expected.json is canonical.
    #   2. Otherwise, attempt source-extract from
    #      `pipeline.state.json:source.{spec_path | criteria_path}`. If it has
    #      a json block, overwrite .devlyn/spec-verify.json with it. This is
    #      the real-user carrier path; in real-user mode a pre-existing file
    #      is stale (from a killed prior run) and must NOT be trusted.
    #   3. If source has no json block AND source.type=="generated":
    #      CRITICAL spec-verify-malformed — generated criteria must ship a
    #      verifiable contract.
    #   4. If source has no json block AND source.type=="spec":
    #      - Real-user mode: silent no-op (backward compat for handwritten
    #        specs without the carrier). Drop any stale pre-staged file.
    #      - Benchmark mode: a pre-staged file would have taken branch 1;
    #        without one, fall through to the silent no-op below.
    pre_staged = spec_path.is_file()  # captured BEFORE any potential write
    trust_bench_staged = bench_mode and pre_staged
    src_type, source_md = read_source(work, devlyn_dir)
    if source_md is not None and not trust_bench_staged:
        staged, error = stage_from_source(source_md, devlyn_dir)
        if error is not None:
            print(f"[spec-verify] carrier malformed: {error}", file=sys.stderr)
            write_malformed_finding(devlyn_dir, error, source_md)
            return 1
        if not staged:
            if src_type == "generated":
                msg = (
                    f"generated {source_md.name} must include a "
                    "`## Verification` ```json``` block (verification_commands "
                    "array). PHASE 1 BUILD generated criteria without one."
                )
                print(f"[spec-verify] {msg}", file=sys.stderr)
                write_malformed_finding(devlyn_dir, msg, source_md)
                return 1
            # source.type=="spec", no block in spec markdown.
            if not bench_mode:
                # Real-user handwritten spec: silent no-op. Drop any stale
                # pre-staged file so a killed prior run cannot poison this
                # run's gate.
                if spec_path.exists():
                    spec_path.unlink()
                return 0
            # Benchmark mode with no source block AND no pre-staged file
            # (rare — fixture mis-config) falls through to the no-pre-staged
            # silent no-op branch below.

    # iter-0019.9 (Codex R2 caveat): close the real-user no-source-md
    # stale-orphan gap. If pipeline.state.json is absent or has no source,
    # but a stale .devlyn/spec-verify.json exists in real-user mode, drop
    # it — the only legitimate path that reaches here with a pre-staged
    # file is benchmark mode (run-fixture.sh staged it).
    if source_md is None and not bench_mode and spec_path.exists():
        spec_path.unlink()
        return 0

    if not spec_path.exists():
        # No source markdown carrier AND no pre-staged file. Silent no-op
        # for benchmark misconfigurations (no fixture to gate against) and
        # for real-user runs without spec/criteria. Generated source case
        # is handled above.
        return 0

    try:
        spec = json.loads(spec_path.read_text())
    except (json.JSONDecodeError, OSError) as e:
        print(f"[spec-verify] error: cannot parse {spec_path}: {e}", file=sys.stderr)
        return 2

    # iter-0019.8 (Codex R2 #2): apply full shape validation to pre-staged
    # carriers too — bool exit_code, empty list, whitespace-only cmd were
    # silently accepted on the benchmark path. Empty list is rejected
    # because "all 0 commands passed" is vacuously true.
    shape_err = validate_shape(spec)
    if shape_err:
        print(f"[spec-verify] error: {spec_path}: {shape_err}", file=sys.stderr)
        write_malformed_finding(devlyn_dir, f"{spec_path}: {shape_err}", None)
        return 1
    commands = spec["verification_commands"]

    devlyn_dir.mkdir(parents=True, exist_ok=True)
    results_path = devlyn_dir / "spec-verify.results.json"
    findings_path = devlyn_dir / "spec-verify-findings.jsonl"

    # Commands always see BENCH_WORKDIR pointing at the resolved work-dir,
    # including in real-user mode where the variable was not set.
    verify_env = os.environ.copy()
    verify_env["BENCH_WORKDIR"] = str(work)

    timeout_s = 60
    results: list[dict] = []
    findings: list[dict] = []

    for idx, vc in enumerate(commands):
        cmd = vc.get("cmd")
        if not cmd:
            # Defensive only: validate_shape above already rejects entries
            # without a non-empty cmd.
            results.append({"index": idx, "cmd": None, "pass": False,
                            "reason": "missing_cmd"})
            continue

        expected_exit = vc.get("exit_code", 0)
        stdout_contains = vc.get("stdout_contains", []) or []
        stdout_not_contains = vc.get("stdout_not_contains", []) or []

        try:
            proc = subprocess.run(
                cmd,
                cwd=str(work),
                shell=True,
                env=verify_env,
                capture_output=True,
                text=True,
                # Undecodable bytes in the command's output must not abort
                # the check with a UnicodeDecodeError (which would be
                # reported as "could not be executed"); substitute them.
                errors="replace",
                timeout=timeout_s,
            )
        except subprocess.TimeoutExpired:
            results.append({"index": idx, "cmd": cmd, "pass": False,
                            "reason": "timeout"})
            findings.append(_literal_finding(
                len(findings) + 1,
                idx,
                f"Verification command #{idx + 1} timed out after {timeout_s}s.",
                (
                    f"Command `{cmd}` exceeded {timeout_s}s. Reduce work or fix a "
                    f"hang in the implementation."
                ),
            ))
            continue
        except Exception as e:  # noqa: BLE001 — surface any harness error explicitly
            results.append({"index": idx, "cmd": cmd, "pass": False,
                            "reason": f"error:{e.__class__.__name__}:{e}"})
            findings.append(_literal_finding(
                len(findings) + 1,
                idx,
                (
                    f"Verification command #{idx + 1} raised "
                    f"{e.__class__.__name__}: {e}."
                ),
                (
                    f"Command `{cmd}` could not be executed. Check the work-dir "
                    f"state and any environment setup the command requires."
                ),
            ))
            continue

        # Mirror run-fixture.sh post-run verifier: combined stdout+stderr.
        out = (proc.stdout or "") + (proc.stderr or "")
        ok_exit = proc.returncode == expected_exit
        missing = [s for s in stdout_contains if s not in out]
        forbidden = [s for s in stdout_not_contains if s in out]
        passed = ok_exit and not missing and not forbidden

        # Only the first failing aspect is reported, in the order
        # exit code -> required literals -> forbidden literals.
        if passed:
            reason = None
        elif not ok_exit:
            reason = "exit"
        elif missing:
            reason = "missing_contains"
        else:
            reason = "unexpected_text"

        results.append({
            "index": idx,
            "cmd": cmd,
            "expected_exit": expected_exit,
            "actual_exit": proc.returncode,
            "stdout_contains": stdout_contains,
            "stdout_not_contains": stdout_not_contains,
            "pass": passed,
            "reason": reason,
            "stdout_tail": out[-500:],
        })

        if passed:
            continue

        # Construct fine-grained message naming the specific failure.
        if not ok_exit:
            msg = (
                f"Verification command #{idx + 1} failed: expected exit "
                f"{expected_exit}, got {proc.returncode}."
            )
        elif missing:
            msg = (
                f"Verification command #{idx + 1} failed: expected "
                f"output to contain {missing!r}."
            )
        else:
            msg = (
                f"Verification command #{idx + 1} failed: output "
                f"contained forbidden literal(s) {forbidden!r}."
            )

        fix_hint = (
            f"See .devlyn/spec-verify.results.json for the captured "
            f"output. Update implementation so `{cmd}` matches the "
            f"contract (exit_code={expected_exit}, "
            f"contains={stdout_contains}, not_contains={stdout_not_contains})."
        )
        findings.append(_literal_finding(len(findings) + 1, idx, msg, fix_hint))

    results_path.write_text(json.dumps({"commands": results}, indent=2) + "\n")

    # Write findings as JSON lines. This file is a per-round artifact, so it
    # is truncated and rewritten on every run (mode "w"), even when there
    # are no findings. The orchestrator's own gate findings live in a
    # separate file and are never touched here.
    with findings_path.open("w") as fh:
        for f in findings:
            fh.write(json.dumps(f) + "\n")

    failed = [r for r in results if r.get("pass") is False]
    if failed:
        print(
            f"[spec-verify] {len(failed)}/{len(results)} command(s) failed; "
            f"{len(findings)} CRITICAL finding(s) written to {findings_path}",
            file=sys.stderr,
        )
        return 1

    print(
        f"[spec-verify] all {len(results)} command(s) passed",
        file=sys.stderr,
    )
    return 0
516
+
517
+
518
# Script entry point: run main() and use its return value as the process
# exit status.
if __name__ == "__main__":
    raise SystemExit(main())