devlyn-cli 1.15.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +104 -0
- package/CLAUDE.md +135 -21
- package/README.md +43 -125
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
- package/benchmark/auto-resolve/README.md +114 -0
- package/benchmark/auto-resolve/RUBRIC.md +162 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
- package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
- package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
- package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
- package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
- package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
- package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
- package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
- package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
- package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
- package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
- package/benchmark/auto-resolve/scripts/judge.sh +359 -0
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
- package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
- package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
- package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
- package/bin/devlyn.js +129 -17
- package/config/skills/_shared/adapters/README.md +64 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
- package/config/skills/_shared/adapters/opus-4-7.md +29 -0
- package/config/skills/{devlyn:auto-resolve/scripts → _shared}/archive_run.py +26 -0
- package/config/skills/_shared/codex-config.md +54 -0
- package/config/skills/_shared/codex-monitored.sh +141 -0
- package/config/skills/_shared/engine-preflight.md +35 -0
- package/config/skills/_shared/expected.schema.json +93 -0
- package/config/skills/_shared/pair-plan-schema.md +298 -0
- package/config/skills/_shared/runtime-principles.md +110 -0
- package/config/skills/_shared/spec-verify-check.py +519 -0
- package/config/skills/devlyn:ideate/SKILL.md +99 -429
- package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
- package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
- package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
- package/config/skills/devlyn:resolve/SKILL.md +172 -184
- package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
- package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
- package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
- package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
- package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
- package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
- package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
- package/{config/skills → optional-skills}/devlyn:reap/SKILL.md +1 -0
- package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
- package/package.json +12 -2
- package/scripts/lint-skills.sh +431 -0
- package/config/skills/devlyn:auto-resolve/SKILL.md +0 -252
- package/config/skills/devlyn:auto-resolve/evals/evals.json +0 -21
- package/config/skills/devlyn:auto-resolve/evals/task-doctor-subcommand.md +0 -42
- package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -130
- package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -82
- package/config/skills/devlyn:auto-resolve/references/findings-schema.md +0 -103
- package/config/skills/devlyn:auto-resolve/references/phases/phase-1-build.md +0 -54
- package/config/skills/devlyn:auto-resolve/references/phases/phase-2-evaluate.md +0 -45
- package/config/skills/devlyn:auto-resolve/references/phases/phase-3-critic.md +0 -84
- package/config/skills/devlyn:auto-resolve/references/pipeline-routing.md +0 -114
- package/config/skills/devlyn:auto-resolve/references/pipeline-state.md +0 -201
- package/config/skills/devlyn:auto-resolve/scripts/terminal_verdict.py +0 -96
- package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
- package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
- package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
- package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
- package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
- package/config/skills/devlyn:clean/SKILL.md +0 -285
- package/config/skills/devlyn:design-ui/SKILL.md +0 -351
- package/config/skills/devlyn:discover-product/SKILL.md +0 -124
- package/config/skills/devlyn:evaluate/SKILL.md +0 -564
- package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
- package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
- package/config/skills/devlyn:ideate/references/codex-critic-template.md +0 -42
- package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
- package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
- package/config/skills/devlyn:preflight/SKILL.md +0 -355
- package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
- package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -86
- package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
- package/config/skills/devlyn:product-spec/SKILL.md +0 -603
- package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
- package/config/skills/devlyn:review/SKILL.md +0 -161
- package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
- package/config/skills/devlyn:team-review/SKILL.md +0 -493
- package/config/skills/devlyn:update-docs/SKILL.md +0 -463
- package/config/skills/workflow-routing/SKILL.md +0 -73
- /package/{config/skills → optional-skills}/devlyn:reap/scripts/reap.sh +0 -0
- /package/{config/skills → optional-skills}/devlyn:reap/scripts/scan.sh +0 -0
|
@@ -0,0 +1,519 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Spec literal verification gate (iter-0019.6 + iter-0019.8 + iter-0019.9
|
|
3
|
+
carrier).
|
|
4
|
+
|
|
5
|
+
Default mode (BUILD_GATE invocation, no args):
|
|
6
|
+
- Resolves the contract carrier in this priority order (iter-0019.8 + Codex
|
|
7
|
+
R2 + iter-0019.9 Codex R-phaseA fix):
|
|
8
|
+
(1) **Benchmark mode trust** (iter-0019.9 fix for the F9 regression): when
|
|
9
|
+
`BENCH_WORKDIR` is set AND `.devlyn/spec-verify.json` already exists
|
|
10
|
+
at script start, trust it as the run-fixture.sh-staged contract from
|
|
11
|
+
`expected.json` and skip source-extract entirely. Without this guard,
|
|
12
|
+
an ideate-generated spec's `## Verification` ```json``` block (e.g.
|
|
13
|
+
F9 e2e novice flow generates `commitCount`/`topAuthors` while
|
|
14
|
+
benchmark truth is `commits`/`authors`) silently overwrote the
|
|
15
|
+
authoritative benchmark contract. For benchmarks, expected.json is
|
|
16
|
+
canonical.
|
|
17
|
+
(2) Otherwise, source markdown extract — read `pipeline.state.json:
|
|
18
|
+
source.{spec_path | criteria_path}` and extract a `## Verification`
|
|
19
|
+
```json``` block. If present, overwrite `.devlyn/spec-verify.json`.
|
|
20
|
+
This is the real-user carrier path; a pre-existing file from a
|
|
21
|
+
killed prior run is stale and must not be trusted in real-user mode.
|
|
22
|
+
(3) If no json block in source AND source.type=="generated": emit
|
|
23
|
+
CRITICAL `correctness.spec-verify-malformed` so the fix-loop reruns
|
|
24
|
+
BUILD.
|
|
25
|
+
(4) If no json block in source AND source.type=="spec": benchmark mode
|
|
26
|
+
with a pre-staged file would have hit branch (1). Without the
|
|
27
|
+
pre-staged file, benchmark falls through to no-op (rare — fixture
|
|
28
|
+
mis-config). Real-user mode silent no-op + drops any stale
|
|
29
|
+
pre-staged file (preserves iter-0019.6 backward compat for
|
|
30
|
+
handwritten specs without the carrier).
|
|
31
|
+
- For each verification_commands entry, runs the command in the work-dir,
|
|
32
|
+
captures combined stdout+stderr, and asserts exit_code matches +
|
|
33
|
+
stdout_contains all required literals + stdout_not_contains none of the
|
|
34
|
+
forbidden literals. Mirrors run-fixture.sh's post-run verifier semantics.
|
|
35
|
+
|
|
36
|
+
Check mode (`--check <markdown_path>`):
|
|
37
|
+
- Used by /devlyn:ideate after writing each item spec to validate that the
|
|
38
|
+
generated `## Verification` ```json``` block parses + matches the schema.
|
|
39
|
+
- Exits 0 if the block is well-formed (or absent — ideate's check applies
|
|
40
|
+
to both new specs that include the block and pre-carrier handwritten
|
|
41
|
+
specs that omit it; absence is not failure here, only malformed JSON or
|
|
42
|
+
shape error is). Exits 2 on malformed json or shape error.
|
|
43
|
+
|
|
44
|
+
Why: iter-0018.5's prompt-only contract enforcement was empirically dead
|
|
45
|
+
(F9 verify=0.4 across all engines in iter-0019). Same lesson as iter-0008
|
|
46
|
+
prompt-only engine constraint. Mechanical bash-gate enforcement is the
|
|
47
|
+
only working pattern. iter-0019.8 extends iter-0019.6 from benchmark-only
|
|
48
|
+
to real-user runs by extracting the contract from the spec/criteria
|
|
49
|
+
markdown directly — closes NORTH-STAR test #14.
|
|
50
|
+
|
|
51
|
+
Exit codes:
|
|
52
|
+
- 0: silent no-op (no source carrier, real-user mode) OR --check passed
|
|
53
|
+
OR all commands passed.
|
|
54
|
+
- 1: at least one command failed OR carrier malformed (generated source is
|
|
55
|
+
missing the required carrier, any source had invalid json/shape, or pre-staged
|
|
56
|
+
file failed shape validation). All paths emit a CRITICAL finding to
|
|
57
|
+
`.devlyn/spec-verify-findings.jsonl`.
|
|
58
|
+
- 2: invocation error (unreadable spec-verify.json, missing markdown in
|
|
59
|
+
--check mode, etc.)
|
|
60
|
+
"""
|
|
61
|
+
|
|
62
|
+
from __future__ import annotations
|
|
63
|
+
|
|
64
|
+
import json
|
|
65
|
+
import os
|
|
66
|
+
import re
|
|
67
|
+
import subprocess
|
|
68
|
+
import sys
|
|
69
|
+
from pathlib import Path
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
VERIFICATION_SECTION_RE = re.compile(
|
|
73
|
+
r'(?ms)^##[ \t]+Verification\b[^\n]*\n(.*?)(?=^##[ \t]+|\Z)'
|
|
74
|
+
)
|
|
75
|
+
JSON_FENCE_RE = re.compile(r'(?ms)^```json[ \t]*\n(.*?)\n```[ \t]*$')
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def extract_verification_block(text: str) -> str | None:
|
|
79
|
+
"""Return the contents of the first ```json``` fenced block under the
|
|
80
|
+
first `## Verification` H2 heading, or None if not found.
|
|
81
|
+
|
|
82
|
+
Boundary: the fenced block must appear AFTER the `## Verification`
|
|
83
|
+
heading and BEFORE the next H2 (`## ...`) heading or end-of-file.
|
|
84
|
+
"""
|
|
85
|
+
section = VERIFICATION_SECTION_RE.search(text)
|
|
86
|
+
if not section:
|
|
87
|
+
return None
|
|
88
|
+
fence = JSON_FENCE_RE.search(section.group(1))
|
|
89
|
+
return fence.group(1) if fence else None
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def validate_shape(data) -> str | None:
|
|
93
|
+
"""Return None if shape matches the canonical verification_commands
|
|
94
|
+
schema; else a human-readable error string.
|
|
95
|
+
|
|
96
|
+
Schema (iter-0019.8): top-level object with a non-empty
|
|
97
|
+
`verification_commands` list of objects. Each object requires a
|
|
98
|
+
non-empty string `cmd`; `exit_code` defaults to 0 and must be a
|
|
99
|
+
non-bool int; `stdout_contains` and `stdout_not_contains` default to
|
|
100
|
+
empty list and must be lists of strings. Bool is rejected explicitly
|
|
101
|
+
because Python's `bool` subclasses `int` — `isinstance(True, int) is
|
|
102
|
+
True` would otherwise let `exit_code: true` slip through.
|
|
103
|
+
"""
|
|
104
|
+
if not isinstance(data, dict):
|
|
105
|
+
return "top-level must be a JSON object"
|
|
106
|
+
cmds = data.get("verification_commands")
|
|
107
|
+
if not isinstance(cmds, list):
|
|
108
|
+
return "verification_commands must be a list"
|
|
109
|
+
if not cmds:
|
|
110
|
+
return "verification_commands must contain at least one entry"
|
|
111
|
+
for i, c in enumerate(cmds):
|
|
112
|
+
if not isinstance(c, dict):
|
|
113
|
+
return f"verification_commands[{i}] must be an object"
|
|
114
|
+
cmd = c.get("cmd")
|
|
115
|
+
if not isinstance(cmd, str) or not cmd.strip():
|
|
116
|
+
return f"verification_commands[{i}].cmd must be a non-empty string"
|
|
117
|
+
ec = c.get("exit_code", 0)
|
|
118
|
+
if isinstance(ec, bool) or not isinstance(ec, int):
|
|
119
|
+
return f"verification_commands[{i}].exit_code must be int (not bool)"
|
|
120
|
+
for k in ("stdout_contains", "stdout_not_contains"):
|
|
121
|
+
v = c.get(k, [])
|
|
122
|
+
if not isinstance(v, list) or not all(isinstance(s, str) for s in v):
|
|
123
|
+
return f"verification_commands[{i}].{k} must be a list of strings"
|
|
124
|
+
return None
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def read_source(work: Path, devlyn_dir: Path) -> tuple[str | None, Path | None]:
|
|
128
|
+
"""Return (source_type, markdown_path) from .devlyn/pipeline.state.json,
|
|
129
|
+
or (None, None) if state is absent/unreadable. The markdown path is
|
|
130
|
+
resolved against `work` when relative.
|
|
131
|
+
"""
|
|
132
|
+
state_path = devlyn_dir / "pipeline.state.json"
|
|
133
|
+
if not state_path.is_file():
|
|
134
|
+
return (None, None)
|
|
135
|
+
try:
|
|
136
|
+
state = json.loads(state_path.read_text())
|
|
137
|
+
except (json.JSONDecodeError, OSError):
|
|
138
|
+
return (None, None)
|
|
139
|
+
src = state.get("source") or {}
|
|
140
|
+
src_type = src.get("type")
|
|
141
|
+
if src_type == "spec":
|
|
142
|
+
md_path = src.get("spec_path")
|
|
143
|
+
elif src_type == "generated":
|
|
144
|
+
md_path = src.get("criteria_path")
|
|
145
|
+
else:
|
|
146
|
+
md_path = None
|
|
147
|
+
if not md_path:
|
|
148
|
+
return (src_type, None)
|
|
149
|
+
md = Path(md_path)
|
|
150
|
+
if not md.is_absolute():
|
|
151
|
+
md = work / md
|
|
152
|
+
return (src_type, md if md.is_file() else None)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def stage_from_source(md: Path, devlyn_dir: Path) -> tuple[bool, str | None]:
|
|
156
|
+
"""Materialize .devlyn/spec-verify.json from the json block in `md`.
|
|
157
|
+
|
|
158
|
+
Returns (staged, error). staged=True → wrote spec-verify.json. error
|
|
159
|
+
non-None → carrier was found but malformed (caller emits CRITICAL).
|
|
160
|
+
staged=False, error=None → no json block in the source (handwritten
|
|
161
|
+
spec or generated source missing the contract).
|
|
162
|
+
"""
|
|
163
|
+
block = extract_verification_block(md.read_text())
|
|
164
|
+
if block is None:
|
|
165
|
+
return (False, None)
|
|
166
|
+
try:
|
|
167
|
+
data = json.loads(block)
|
|
168
|
+
except json.JSONDecodeError as e:
|
|
169
|
+
return (False, f"`## Verification` ```json``` block in {md} has invalid JSON: {e}")
|
|
170
|
+
err = validate_shape(data)
|
|
171
|
+
if err:
|
|
172
|
+
return (False, f"`## Verification` ```json``` block in {md}: {err}")
|
|
173
|
+
normalized = {"verification_commands": data["verification_commands"]}
|
|
174
|
+
devlyn_dir.mkdir(parents=True, exist_ok=True)
|
|
175
|
+
(devlyn_dir / "spec-verify.json").write_text(json.dumps(normalized, indent=2) + "\n")
|
|
176
|
+
return (True, None)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def write_malformed_finding(devlyn_dir: Path, error: str, source_path: Path | None) -> None:
|
|
180
|
+
"""Emit a single CRITICAL finding for a malformed verification carrier."""
|
|
181
|
+
devlyn_dir.mkdir(parents=True, exist_ok=True)
|
|
182
|
+
findings_path = devlyn_dir / "spec-verify-findings.jsonl"
|
|
183
|
+
file_ref = str(source_path) if source_path else ".devlyn/pipeline.state.json"
|
|
184
|
+
finding = {
|
|
185
|
+
"id": "BGATE-0001",
|
|
186
|
+
"rule_id": "correctness.spec-verify-malformed",
|
|
187
|
+
"level": "error",
|
|
188
|
+
"severity": "CRITICAL",
|
|
189
|
+
"confidence": 1.0,
|
|
190
|
+
"message": f"Verification contract carrier is malformed: {error}",
|
|
191
|
+
"file": file_ref,
|
|
192
|
+
"line": 1,
|
|
193
|
+
"phase": "build_gate",
|
|
194
|
+
"criterion_ref": "spec-verify://carrier",
|
|
195
|
+
"fix_hint": (
|
|
196
|
+
"Fix the `## Verification` ```json``` block: a JSON object with "
|
|
197
|
+
"a non-empty `verification_commands` array of "
|
|
198
|
+
"{cmd, exit_code?, stdout_contains?, stdout_not_contains?} "
|
|
199
|
+
"entries. See references/build-gate.md § 'Spec literal check'."
|
|
200
|
+
),
|
|
201
|
+
"blocking": True,
|
|
202
|
+
"status": "open",
|
|
203
|
+
}
|
|
204
|
+
with findings_path.open("w") as fh:
|
|
205
|
+
fh.write(json.dumps(finding) + "\n")
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def run_check_mode(md_path: Path) -> int:
    """Validate the verification carrier in `md_path` without running commands.

    Backs the `--check <markdown>` invocation. Diagnostics go to stderr.

    Args:
        md_path: Markdown file to inspect.

    Returns:
        0 when the file has no json block under `## Verification`, or has
        one that parses and passes shape validation.
        2 when the file is missing, cannot be read or decoded, contains
        invalid JSON in the block, or fails shape validation.
    """
    if not md_path.is_file():
        print(f"[spec-verify --check] error: {md_path} not found", file=sys.stderr)
        return 2

    try:
        # Read as UTF-8 so the check does not depend on the platform locale.
        text = md_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        # An unreadable file is an invocation error, not a crash.
        print(f"[spec-verify --check] error: cannot read {md_path}: {e}", file=sys.stderr)
        return 2

    block = extract_verification_block(text)
    if block is None:
        # No section or no json block: the carrier is opt-in, so a spec
        # without machine verification still passes the check.
        return 0

    try:
        data = json.loads(block)
    except json.JSONDecodeError as e:
        print(
            f"[spec-verify --check] {md_path}: invalid JSON in `## Verification` "
            f"```json``` block: {e}",
            file=sys.stderr,
        )
        return 2

    err = validate_shape(data)
    if err:
        print(f"[spec-verify --check] {md_path}: shape error: {err}", file=sys.stderr)
        return 2
    return 0
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def main() -> int:
|
|
241
|
+
if len(sys.argv) >= 2 and sys.argv[1] == "--check":
|
|
242
|
+
if len(sys.argv) != 3:
|
|
243
|
+
print("usage: spec-verify-check.py --check <markdown-path>", file=sys.stderr)
|
|
244
|
+
return 2
|
|
245
|
+
return run_check_mode(Path(sys.argv[2]))
|
|
246
|
+
|
|
247
|
+
bench_mode = "BENCH_WORKDIR" in os.environ
|
|
248
|
+
work = Path(os.environ.get("BENCH_WORKDIR") or os.getcwd())
|
|
249
|
+
devlyn_dir = work / ".devlyn"
|
|
250
|
+
spec_path = devlyn_dir / "spec-verify.json"
|
|
251
|
+
|
|
252
|
+
# iter-0019.8 + iter-0019.9 (Codex R-phaseA): determine the contract
|
|
253
|
+
# carrier source for THIS run. Order:
|
|
254
|
+
# 1. Benchmark mode (BENCH_WORKDIR set) AND a pre-staged
|
|
255
|
+
# .devlyn/spec-verify.json exists at script start: TRUST it (this is
|
|
256
|
+
# the run-fixture.sh contract staged from expected.json). Skip
|
|
257
|
+
# source-extract entirely. iter-0019.9 closes the F9 regression where
|
|
258
|
+
# source-extract from an ideate-generated spec overwrote the
|
|
259
|
+
# benchmark contract — for benchmarks, expected.json is canonical.
|
|
260
|
+
# 2. Otherwise, attempt source-extract from
|
|
261
|
+
# `pipeline.state.json:source.{spec_path | criteria_path}`. If it has
|
|
262
|
+
# a json block, overwrite .devlyn/spec-verify.json with it. This is
|
|
263
|
+
# the real-user carrier path; in real-user mode a pre-existing file
|
|
264
|
+
# is stale (from a killed prior run) and must NOT be trusted.
|
|
265
|
+
# 3. If source has no json block AND source.type=="generated":
|
|
266
|
+
# CRITICAL spec-verify-malformed — generated criteria must ship a
|
|
267
|
+
# verifiable contract per phase-1-build.md <output_contract>.
|
|
268
|
+
# 4. If source has no json block AND source.type=="spec":
|
|
269
|
+
# - Real-user mode: silent no-op (preserves iter-0019.6 backward
|
|
270
|
+
# compat for handwritten specs without the carrier). Drop any
|
|
271
|
+
# stale pre-staged file.
|
|
272
|
+
# - Benchmark mode: fall through to the pre-staged-trust branch
|
|
273
|
+
# (covers pre-iter-0019.9 fixtures whose spec.md has prose-only
|
|
274
|
+
# Verification — run-fixture.sh staged the contract regardless).
|
|
275
|
+
pre_staged = spec_path.is_file() # captured BEFORE any potential write
|
|
276
|
+
trust_bench_staged = bench_mode and pre_staged
|
|
277
|
+
src_type, source_md = read_source(work, devlyn_dir)
|
|
278
|
+
if source_md is not None and not trust_bench_staged:
|
|
279
|
+
staged, error = stage_from_source(source_md, devlyn_dir)
|
|
280
|
+
if error is not None:
|
|
281
|
+
print(f"[spec-verify] carrier malformed: {error}", file=sys.stderr)
|
|
282
|
+
write_malformed_finding(devlyn_dir, error, source_md)
|
|
283
|
+
return 1
|
|
284
|
+
if not staged:
|
|
285
|
+
if src_type == "generated":
|
|
286
|
+
msg = (
|
|
287
|
+
f"generated {source_md.name} must include a "
|
|
288
|
+
"`## Verification` ```json``` block (verification_commands "
|
|
289
|
+
"array). PHASE 1 BUILD generated criteria without one."
|
|
290
|
+
)
|
|
291
|
+
print(f"[spec-verify] {msg}", file=sys.stderr)
|
|
292
|
+
write_malformed_finding(devlyn_dir, msg, source_md)
|
|
293
|
+
return 1
|
|
294
|
+
# source.type=="spec", no block in spec markdown.
|
|
295
|
+
if not bench_mode:
|
|
296
|
+
# Real-user handwritten spec: silent no-op. Drop any stale
|
|
297
|
+
# pre-staged file so a killed prior run cannot poison this
|
|
298
|
+
# run's gate.
|
|
299
|
+
if spec_path.exists():
|
|
300
|
+
spec_path.unlink()
|
|
301
|
+
return 0
|
|
302
|
+
# Benchmark mode with no source block AND no pre-staged file
|
|
303
|
+
# (rare — fixture mis-config) falls through to the no-pre-staged
|
|
304
|
+
# silent no-op branch below.
|
|
305
|
+
|
|
306
|
+
# iter-0019.9 (Codex R2 caveat): close the real-user no-source-md
|
|
307
|
+
# stale-orphan gap. If pipeline.state.json is absent or has no source,
|
|
308
|
+
# but a stale .devlyn/spec-verify.json exists in real-user mode, drop
|
|
309
|
+
# it — the only legitimate path that reaches here with a pre-staged
|
|
310
|
+
# file is benchmark mode (run-fixture.sh staged it).
|
|
311
|
+
if source_md is None and not bench_mode and spec_path.exists():
|
|
312
|
+
spec_path.unlink()
|
|
313
|
+
return 0
|
|
314
|
+
|
|
315
|
+
if not spec_path.exists():
|
|
316
|
+
# No source markdown carrier AND no pre-staged file. Silent no-op
|
|
317
|
+
# for benchmark misconfigurations (no fixture to gate against) and
|
|
318
|
+
# for real-user runs without spec/criteria. Generated source case
|
|
319
|
+
# is handled above.
|
|
320
|
+
return 0
|
|
321
|
+
|
|
322
|
+
try:
|
|
323
|
+
spec = json.loads(spec_path.read_text())
|
|
324
|
+
except (json.JSONDecodeError, OSError) as e:
|
|
325
|
+
print(f"[spec-verify] error: cannot parse {spec_path}: {e}", file=sys.stderr)
|
|
326
|
+
return 2
|
|
327
|
+
|
|
328
|
+
# iter-0019.8 (Codex R2 #2): apply full shape validation to pre-staged
|
|
329
|
+
# carriers too — bool exit_code, empty list, whitespace-only cmd were
|
|
330
|
+
# silently accepted on the benchmark path. Empty list is rejected
|
|
331
|
+
# because "all 0 commands passed" is vacuously true.
|
|
332
|
+
shape_err = validate_shape(spec)
|
|
333
|
+
if shape_err:
|
|
334
|
+
print(f"[spec-verify] error: {spec_path}: {shape_err}", file=sys.stderr)
|
|
335
|
+
write_malformed_finding(devlyn_dir, f"{spec_path}: {shape_err}", None)
|
|
336
|
+
return 1
|
|
337
|
+
commands = spec["verification_commands"]
|
|
338
|
+
|
|
339
|
+
devlyn_dir.mkdir(parents=True, exist_ok=True)
|
|
340
|
+
results_path = devlyn_dir / "spec-verify.results.json"
|
|
341
|
+
findings_path = devlyn_dir / "spec-verify-findings.jsonl"
|
|
342
|
+
|
|
343
|
+
verify_env = os.environ.copy()
|
|
344
|
+
verify_env["BENCH_WORKDIR"] = str(work)
|
|
345
|
+
|
|
346
|
+
results: list[dict] = []
|
|
347
|
+
findings: list[dict] = []
|
|
348
|
+
finding_seq = 1
|
|
349
|
+
|
|
350
|
+
for idx, vc in enumerate(commands):
|
|
351
|
+
cmd = vc.get("cmd")
|
|
352
|
+
if not cmd:
|
|
353
|
+
results.append({"index": idx, "cmd": None, "pass": False,
|
|
354
|
+
"reason": "missing_cmd"})
|
|
355
|
+
continue
|
|
356
|
+
|
|
357
|
+
expected_exit = vc.get("exit_code", 0)
|
|
358
|
+
stdout_contains = vc.get("stdout_contains", []) or []
|
|
359
|
+
stdout_not_contains = vc.get("stdout_not_contains", []) or []
|
|
360
|
+
|
|
361
|
+
try:
|
|
362
|
+
proc = subprocess.run(
|
|
363
|
+
cmd,
|
|
364
|
+
cwd=str(work),
|
|
365
|
+
shell=True,
|
|
366
|
+
env=verify_env,
|
|
367
|
+
capture_output=True,
|
|
368
|
+
text=True,
|
|
369
|
+
timeout=60,
|
|
370
|
+
)
|
|
371
|
+
# Mirror run-fixture.sh post-run verifier: combined stdout+stderr.
|
|
372
|
+
out = (proc.stdout or "") + (proc.stderr or "")
|
|
373
|
+
ok_exit = proc.returncode == expected_exit
|
|
374
|
+
ok_contains = all(s in out for s in stdout_contains)
|
|
375
|
+
ok_not = not any(s in out for s in stdout_not_contains)
|
|
376
|
+
passed = bool(ok_exit and ok_contains and ok_not)
|
|
377
|
+
|
|
378
|
+
if passed:
|
|
379
|
+
reason = None
|
|
380
|
+
elif not ok_exit:
|
|
381
|
+
reason = "exit"
|
|
382
|
+
elif not ok_contains:
|
|
383
|
+
reason = "missing_contains"
|
|
384
|
+
else:
|
|
385
|
+
reason = "unexpected_text"
|
|
386
|
+
|
|
387
|
+
results.append({
|
|
388
|
+
"index": idx,
|
|
389
|
+
"cmd": cmd,
|
|
390
|
+
"expected_exit": expected_exit,
|
|
391
|
+
"actual_exit": proc.returncode,
|
|
392
|
+
"stdout_contains": stdout_contains,
|
|
393
|
+
"stdout_not_contains": stdout_not_contains,
|
|
394
|
+
"pass": passed,
|
|
395
|
+
"reason": reason,
|
|
396
|
+
"stdout_tail": out[-500:],
|
|
397
|
+
})
|
|
398
|
+
|
|
399
|
+
if not passed:
|
|
400
|
+
# Construct fine-grained message naming the specific failure.
|
|
401
|
+
if not ok_exit:
|
|
402
|
+
msg = (
|
|
403
|
+
f"Verification command #{idx + 1} failed: expected exit "
|
|
404
|
+
f"{expected_exit}, got {proc.returncode}."
|
|
405
|
+
)
|
|
406
|
+
elif not ok_contains:
|
|
407
|
+
missing = [s for s in stdout_contains if s not in out]
|
|
408
|
+
msg = (
|
|
409
|
+
f"Verification command #{idx + 1} failed: expected "
|
|
410
|
+
f"output to contain {missing!r}."
|
|
411
|
+
)
|
|
412
|
+
else:
|
|
413
|
+
forbidden = [s for s in stdout_not_contains if s in out]
|
|
414
|
+
msg = (
|
|
415
|
+
f"Verification command #{idx + 1} failed: output "
|
|
416
|
+
f"contained forbidden literal(s) {forbidden!r}."
|
|
417
|
+
)
|
|
418
|
+
|
|
419
|
+
fix_hint = (
|
|
420
|
+
f"See .devlyn/spec-verify.results.json for the captured "
|
|
421
|
+
f"output. Update implementation so `{cmd}` matches the "
|
|
422
|
+
f"contract (exit_code={expected_exit}, "
|
|
423
|
+
f"contains={stdout_contains}, not_contains={stdout_not_contains})."
|
|
424
|
+
)
|
|
425
|
+
|
|
426
|
+
findings.append({
|
|
427
|
+
"id": f"BGATE-{finding_seq:04d}",
|
|
428
|
+
"rule_id": "correctness.spec-literal-mismatch",
|
|
429
|
+
"level": "error",
|
|
430
|
+
"severity": "CRITICAL",
|
|
431
|
+
"confidence": 1.0,
|
|
432
|
+
"message": msg,
|
|
433
|
+
"file": ".devlyn/spec-verify.json",
|
|
434
|
+
"line": 1,
|
|
435
|
+
"phase": "build_gate",
|
|
436
|
+
"criterion_ref": f"spec-verify://verification_commands/{idx}",
|
|
437
|
+
"fix_hint": fix_hint,
|
|
438
|
+
"blocking": True,
|
|
439
|
+
"status": "open",
|
|
440
|
+
})
|
|
441
|
+
finding_seq += 1
|
|
442
|
+
|
|
443
|
+
except subprocess.TimeoutExpired:
|
|
444
|
+
results.append({"index": idx, "cmd": cmd, "pass": False,
|
|
445
|
+
"reason": "timeout"})
|
|
446
|
+
findings.append({
|
|
447
|
+
"id": f"BGATE-{finding_seq:04d}",
|
|
448
|
+
"rule_id": "correctness.spec-literal-mismatch",
|
|
449
|
+
"level": "error",
|
|
450
|
+
"severity": "CRITICAL",
|
|
451
|
+
"confidence": 1.0,
|
|
452
|
+
"message": (
|
|
453
|
+
f"Verification command #{idx + 1} timed out after 60s."
|
|
454
|
+
),
|
|
455
|
+
"file": ".devlyn/spec-verify.json",
|
|
456
|
+
"line": 1,
|
|
457
|
+
"phase": "build_gate",
|
|
458
|
+
"criterion_ref": f"spec-verify://verification_commands/{idx}",
|
|
459
|
+
"fix_hint": (
|
|
460
|
+
f"Command `{cmd}` exceeded 60s. Reduce work or fix a "
|
|
461
|
+
f"hang in the implementation."
|
|
462
|
+
),
|
|
463
|
+
"blocking": True,
|
|
464
|
+
"status": "open",
|
|
465
|
+
})
|
|
466
|
+
finding_seq += 1
|
|
467
|
+
except Exception as e: # noqa: BLE001 — surface any harness error explicitly
|
|
468
|
+
results.append({"index": idx, "cmd": cmd, "pass": False,
|
|
469
|
+
"reason": f"error:{e.__class__.__name__}:{e}"})
|
|
470
|
+
findings.append({
|
|
471
|
+
"id": f"BGATE-{finding_seq:04d}",
|
|
472
|
+
"rule_id": "correctness.spec-literal-mismatch",
|
|
473
|
+
"level": "error",
|
|
474
|
+
"severity": "CRITICAL",
|
|
475
|
+
"confidence": 1.0,
|
|
476
|
+
"message": (
|
|
477
|
+
f"Verification command #{idx + 1} raised "
|
|
478
|
+
f"{e.__class__.__name__}: {e}."
|
|
479
|
+
),
|
|
480
|
+
"file": ".devlyn/spec-verify.json",
|
|
481
|
+
"line": 1,
|
|
482
|
+
"phase": "build_gate",
|
|
483
|
+
"criterion_ref": f"spec-verify://verification_commands/{idx}",
|
|
484
|
+
"fix_hint": (
|
|
485
|
+
f"Command `{cmd}` could not be executed. Check the work-dir "
|
|
486
|
+
f"state and any environment setup the command requires."
|
|
487
|
+
),
|
|
488
|
+
"blocking": True,
|
|
489
|
+
"status": "open",
|
|
490
|
+
})
|
|
491
|
+
finding_seq += 1
|
|
492
|
+
|
|
493
|
+
results_path.write_text(json.dumps({"commands": results}, indent=2) + "\n")
|
|
494
|
+
|
|
495
|
+
# Append findings (jsonl). BUILD_GATE merge step concatenates this onto
|
|
496
|
+
# build_gate.findings.jsonl; never overwrite the orchestrator's own gate
|
|
497
|
+
# findings. Truncate this file each run since it is a per-round artifact.
|
|
498
|
+
with findings_path.open("w") as fh:
|
|
499
|
+
for f in findings:
|
|
500
|
+
fh.write(json.dumps(f) + "\n")
|
|
501
|
+
|
|
502
|
+
failed = [r for r in results if r.get("pass") is False]
|
|
503
|
+
if failed:
|
|
504
|
+
print(
|
|
505
|
+
f"[spec-verify] {len(failed)}/{len(results)} command(s) failed; "
|
|
506
|
+
f"{len(findings)} CRITICAL finding(s) written to {findings_path}",
|
|
507
|
+
file=sys.stderr,
|
|
508
|
+
)
|
|
509
|
+
return 1
|
|
510
|
+
|
|
511
|
+
print(
|
|
512
|
+
f"[spec-verify] all {len(results)} command(s) passed",
|
|
513
|
+
file=sys.stderr,
|
|
514
|
+
)
|
|
515
|
+
return 0
|
|
516
|
+
|
|
517
|
+
|
|
518
|
+
# Script entry point: when executed directly (not imported), run main() and
# use its return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|