devlyn-cli 1.15.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +104 -0
- package/CLAUDE.md +135 -21
- package/README.md +43 -125
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
- package/benchmark/auto-resolve/README.md +114 -0
- package/benchmark/auto-resolve/RUBRIC.md +162 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
- package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
- package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
- package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
- package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
- package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
- package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
- package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
- package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
- package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
- package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
- package/benchmark/auto-resolve/scripts/judge.sh +359 -0
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
- package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
- package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
- package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
- package/bin/devlyn.js +175 -17
- package/config/skills/_shared/adapters/README.md +64 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
- package/config/skills/_shared/adapters/opus-4-7.md +29 -0
- package/config/skills/{devlyn:auto-resolve/scripts → _shared}/archive_run.py +26 -0
- package/config/skills/_shared/codex-config.md +54 -0
- package/config/skills/_shared/codex-monitored.sh +141 -0
- package/config/skills/_shared/engine-preflight.md +35 -0
- package/config/skills/_shared/expected.schema.json +93 -0
- package/config/skills/_shared/pair-plan-schema.md +298 -0
- package/config/skills/_shared/runtime-principles.md +110 -0
- package/config/skills/_shared/spec-verify-check.py +519 -0
- package/config/skills/devlyn:ideate/SKILL.md +99 -429
- package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
- package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
- package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
- package/config/skills/devlyn:resolve/SKILL.md +172 -184
- package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
- package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
- package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
- package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
- package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
- package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
- package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
- package/{config/skills → optional-skills}/devlyn:reap/SKILL.md +1 -0
- package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
- package/package.json +12 -2
- package/scripts/lint-skills.sh +431 -0
- package/config/skills/devlyn:auto-resolve/SKILL.md +0 -252
- package/config/skills/devlyn:auto-resolve/evals/evals.json +0 -21
- package/config/skills/devlyn:auto-resolve/evals/task-doctor-subcommand.md +0 -42
- package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -130
- package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -82
- package/config/skills/devlyn:auto-resolve/references/findings-schema.md +0 -103
- package/config/skills/devlyn:auto-resolve/references/phases/phase-1-build.md +0 -54
- package/config/skills/devlyn:auto-resolve/references/phases/phase-2-evaluate.md +0 -45
- package/config/skills/devlyn:auto-resolve/references/phases/phase-3-critic.md +0 -84
- package/config/skills/devlyn:auto-resolve/references/pipeline-routing.md +0 -114
- package/config/skills/devlyn:auto-resolve/references/pipeline-state.md +0 -201
- package/config/skills/devlyn:auto-resolve/scripts/terminal_verdict.py +0 -96
- package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
- package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
- package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
- package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
- package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
- package/config/skills/devlyn:clean/SKILL.md +0 -285
- package/config/skills/devlyn:design-ui/SKILL.md +0 -351
- package/config/skills/devlyn:discover-product/SKILL.md +0 -124
- package/config/skills/devlyn:evaluate/SKILL.md +0 -564
- package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
- package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
- package/config/skills/devlyn:ideate/references/codex-critic-template.md +0 -42
- package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
- package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
- package/config/skills/devlyn:preflight/SKILL.md +0 -355
- package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
- package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -86
- package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
- package/config/skills/devlyn:product-spec/SKILL.md +0 -603
- package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
- package/config/skills/devlyn:review/SKILL.md +0 -161
- package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
- package/config/skills/devlyn:team-review/SKILL.md +0 -493
- package/config/skills/devlyn:update-docs/SKILL.md +0 -463
- package/config/skills/workflow-routing/SKILL.md +0 -73
- /package/{config/skills → optional-skills}/devlyn:reap/scripts/reap.sh +0 -0
- /package/{config/skills → optional-skills}/devlyn:reap/scripts/scan.sh +0 -0
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
# Runtime principles — sub-agent contract
|
|
2
|
+
|
|
3
|
+
The runtime contract every sub-agent inside `/devlyn:resolve` (PLAN / IMPLEMENT / BUILD_GATE / CLEANUP / VERIFY) and `/devlyn:ideate` (FRAME / EXPLORE / SPEC / CHALLENGE) must satisfy. Source of truth for sub-agent behavior on user tasks. NOT for autoresearch-loop / harness-developer concerns (see `autoresearch/PRINCIPLES.md`).
|
|
4
|
+
|
|
5
|
+
The four sections below mirror the corresponding CLAUDE.md sections (Subtractive-first editing, Goal-locked execution, No-workaround discipline, Evidence over claim). Each section is wrapped in `<!-- runtime-principles:section=NAME:begin -->` / `:end -->` markers in BOTH this file and CLAUDE.md; lint Check 12 (added in iter-0019.A Step 5) extracts each named block from both files and diffs to detect drift.
|
|
6
|
+
|
|
7
|
+
<!-- runtime-principles:contract:begin -->
|
|
8
|
+
## Subtractive-first editing — perfection = nothing left to remove
|
|
9
|
+
<!-- runtime-principles:section=subtractive-first:begin -->
|
|
10
|
+
|
|
11
|
+
> "Perfection is achieved not when there is nothing more to add, but when there is nothing left to take away." — Saint-Exupéry. **This is the operating definition of "done" in this repo.** A change is finished when no further line, branch, flag, or doc paragraph can be removed without breaking a learned failure mode. Not before.
|
|
12
|
+
|
|
13
|
+
This rule overrides instinct. LLMs (including you) are trained on corpora that reward elaborate, defensive, "thorough" code — so the default impulse is to add. That impulse is wrong here. Read the rules below as hard tests, not aesthetic preferences. They are not optional, not negotiable, and not satisfiable by writing more careful additions.
|
|
14
|
+
|
|
15
|
+
**Mandatory pre-edit question.** Before writing any change, you must answer in this order:
|
|
16
|
+
|
|
17
|
+
1. **What can I delete that makes the addition unnecessary?** If the addition becomes redundant after the deletion, ship the deletion alone.
|
|
18
|
+
2. **What can I delete that makes the addition smaller?** Trim the surrounding accretion before adding.
|
|
19
|
+
3. **Only then**, what is the minimum addition required?
|
|
20
|
+
|
|
21
|
+
If you skip question 1 or 2, you are violating this rule even if the resulting code looks clean.
|
|
22
|
+
|
|
23
|
+
**Hard tests every edit must pass:**
|
|
24
|
+
|
|
25
|
+
- **Net-negative is the default; pure-addition needs a citation.** A diff that adds N lines and removes 0 must point to a specific cause: a previously-observed failure mode (commit hash, fixture ID, finding ID, user-reported incident), OR an explicit user request / spec requirement that demands new user-visible behavior. The latter is a sufficient citation — do not block legitimate requested additions on the absence of a past failure. What is rejected: vague justifications like "it seems clearer," "for future flexibility," "just in case," "to be safe," "for completeness," "to handle edge cases" — these are the exact phrases that produce accretion.
|
|
26
|
+
- **Delete the line that makes the bug impossible, not the line that catches it.** Defensive wrappers, validation layers, error normalizers, and `try/catch` shells are usually evidence that an upstream contract is unclear. Fix the contract upstream and remove the defenses downstream. The trap: adding the wrapper feels like progress because it makes a test pass. The wrapper is debt; the contract fix is the work. **Scope guard**: if the upstream contract fix is outside the user's stated scope, stop and surface the scope expansion to the user before editing — Goal-locked execution overrides this. The right scope-expansion outcome is "user authorizes the upstream fix" or "user accepts a scoped local fix and a follow-up for upstream"; never silently restructure something the user didn't ask you to.
|
|
27
|
+
- **A new flag, branch, or option is admitting two failures**: (a) the default was wrong, (b) every reader pays attention cost forever. Default-fix-and-delete-flag beats add-flag-with-better-default. The bar for adding a configuration knob is "I have observed two real users with genuinely conflicting needs," not "this might be useful someday."
|
|
28
|
+
- **Doc additions are subject to the same rule.** Before adding a section to any `.md` file (CLAUDE.md, SKILL.md, README, references/), find the now-stale sentence or section the new one supersedes — delete that first. A growing instructions file dilutes the instructions that actually need to be followed; readers (human and LLM) skim long files and miss load-bearing rules.
|
|
29
|
+
- **A "cleaner" refactor that grows line count is not cleaner.** It is a sideways move that increases context, parsing, and review cost. **For refactor-only changes**, line count must drop unless a cited observed failure requires the new shape. **Never delete tests, contracts, public API, comments documenting non-obvious WHY, or user-facing behavior just to win the count** — that is gaming the metric, not honoring the principle. The metric serves complexity reduction; if a deletion would lose information not recoverable from code + commit history, it is the wrong deletion.
|
|
30
|
+
- **Stop adding when no further deletion is possible.** This is the Saint-Exupéry test inverted into a stopping rule: if you have made an addition and you cannot identify anything else that can be removed, examine the addition itself — is part of it still removable? Iterate until the diff is irreducible.
|
|
31
|
+
|
|
32
|
+
**Anti-rationalization clause** — explicitly guarding against LLM-style hedging:
|
|
33
|
+
|
|
34
|
+
- "More explicit is safer" is **not** a justification. Explicitness has a cost in attention and rot. Required-explicit goes in; nice-to-explicit gets cut.
|
|
35
|
+
- "Adding context for future readers" is **not** a justification. Future readers benefit more from shorter files than from explanatory prose. The code and the commit message together carry the why.
|
|
36
|
+
- "Defense-in-depth" is **not** a justification at the harness layer. Two layers that catch the same bug are evidence one of them should be the only layer.
|
|
37
|
+
- If you find yourself writing the phrase "in case" in a comment, code reviewer note, or doc, **stop and re-evaluate** — that phrase predicts an unjustified addition.
|
|
38
|
+
|
|
39
|
+
**Stopping rule.** A change is done when (a) all hypotheses it was meant to close are closed, AND (b) you have attempted at least one further deletion and confirmed it would break something. If you have not tried to delete more, you are not done. If nothing can be deleted to justify the current addition, the addition itself is too large — re-scope or surface the conflict to the user before proceeding.
|
|
40
|
+
|
|
41
|
+
**Never grow surface area silently.** Every accretion-shaped change must be visible: in the commit message, in the iteration file, or in a flagged review. Silent growth is the failure mode this rule exists to prevent.
|
|
42
|
+
<!-- runtime-principles:section=subtractive-first:end -->
|
|
43
|
+
|
|
44
|
+
## Goal-locked execution — stay on the North Star, do not wander
|
|
45
|
+
<!-- runtime-principles:section=goal-locked:begin -->
|
|
46
|
+
|
|
47
|
+
Even with a North Star defined, work drifts off-course ("산으로 간다" / "삼천포로 빠진다" — going up the wrong mountain instead of forward). The harness must **actively block** this drift at run time, not merely discourage it. The default is ruler-straight execution toward the user's stated goal; any deviation requires explicit justification, not the inverse.
|
|
48
|
+
|
|
49
|
+
This rule exists because LLMs (including you) are trained to be helpful, comprehensive, and thorough — and "helpful" easily becomes "did more than asked." Doing more than asked is not helpfulness; it is scope creep. Read the rules below as hard blocks, not soft preferences.
|
|
50
|
+
|
|
51
|
+
**The five drift patterns you must refuse to execute on:**
|
|
52
|
+
|
|
53
|
+
1. **Unrequested work.** "While I'm here, I noticed X is broken/ugly/inefficient" → **stop**. The user did not ask for X. If X is a real defect, surface it as a finding, a follow-up suggestion, or an entry in a TODO list — do NOT fix it inside the current change. Mixing unrequested work with requested work is what makes diffs unreviewable and PRs eternal.
|
|
54
|
+
2. **Tangential cleanup.** "This file looks messy, let me also tidy..." → **stop**. The current task is the only task. Unrelated cleanup is a separate change requiring its own justification, scope, and pre-flight 0 check.
|
|
55
|
+
3. **Speculative robustness.** "Just adding a check / fallback / handler for the case where..." → **stop**. If the case has not been observed (in production, in tests, in a finding), it does not belong in this change. Defensive code added for unobserved cases is the most common form of accretion debt — it never gets removed because nobody can prove the case never happens.
|
|
56
|
+
4. **Re-scoping mid-flight.** "Actually, the better way to do this is to also restructure / rename / migrate..." → **stop**. If you discover the requested approach is wrong, surface that to the user with evidence and let them adjudicate. Do NOT silently expand scope. The user's explicit redirect is the only authorization to enlarge a task.
|
|
57
|
+
5. **Curiosity detours.** "Let me also explore how Y works to understand this better..." → **stop**, unless Y is provably on the goal's critical path. Curiosity-driven exploration is creative-mode; default is execution-mode.
|
|
58
|
+
|
|
59
|
+
**The single drift test before any deviation from the stated goal:** *"Did the user ask for this, OR does the user's stated goal strictly require it?"* If the answer to both is no, do not do it. Surface it as a note (commit message, end-of-turn summary, finding) and continue on the original path.
|
|
60
|
+
|
|
61
|
+
**Creative-mode is the narrow exception, not the default.** Creative-mode applies only when (a) the user explicitly invoked an ideation/exploration surface (`/devlyn:ideate`, optional `/devlyn:design-system`, "let's brainstorm", "explore options for"), OR (b) the goal is genuinely under-specified and a clarifying question is impossible (extremely rare — usually you should ask). For everything else — bug fixes, feature work, refactors, doc updates, pipeline runs, code review, debugging — execution-mode is the default and drift is a defect, not a feature.
|
|
62
|
+
|
|
63
|
+
**Anti-rationalization clause** — explicitly guarding against LLM hedging:
|
|
64
|
+
|
|
65
|
+
- "It's a small extra change" is **not** a justification. Small accretions compound; one of them is always small.
|
|
66
|
+
- "It's related to what they asked for" is **not** a justification. Related ≠ requested. Requested is the only standard.
|
|
67
|
+
- "It would be incomplete without this" is **not** a justification. The user defines completeness, not your sense of it.
|
|
68
|
+
- "I'm being thorough" is **not** a justification. Thoroughness on the requested goal is required; thoroughness extending past the goal is drift.
|
|
69
|
+
|
|
70
|
+
**When in doubt, ask — outside hands-free pipelines.** In interactive sessions a short clarification ("the requested fix touches the X code path; I notice Y also looks broken — should I fix it in this change or surface it as a follow-up?") is always cheaper than a wrong-scope diff. Asking is not a weakness; silently expanding scope is. **Inside hands-free pipelines** (`/devlyn:resolve`, scheduled remote agents, autonomous skill runs) the contract forbids mid-pipeline prompts — there asking is unsafe because there is no user to answer. The substitute is: stay strictly on the requested goal, do not expand scope, and log the question/assumption explicitly in the final report (or `.devlyn/runs/<run_id>/` artifacts) so the user can adjudicate after the run completes. Choosing scope creep over logging-and-staying-on-path is always wrong.
|
|
71
|
+
|
|
72
|
+
**Stopping rule.** A task is done when the user's stated goal is closed AND no off-path work was added. If you find yourself hesitating because "I should also do Z" — Z is drift. Note it for follow-up, do not execute.
|
|
73
|
+
<!-- runtime-principles:section=goal-locked:end -->
|
|
74
|
+
|
|
75
|
+
## No-workaround discipline
|
|
76
|
+
<!-- runtime-principles:section=no-workaround:begin -->
|
|
77
|
+
|
|
78
|
+
No `any`, no `@ts-ignore`, no silent `catch`, no hardcoded values, no helper scripts that bypass the root cause. Fix root causes; handle errors with user-visible state per the rule above.
|
|
79
|
+
|
|
80
|
+
**Permitted exceptions** (explicitly carved out):
|
|
81
|
+
- CSS fallback fonts, CDN failover, image placeholders — widely-accepted best practices.
|
|
82
|
+
- Codex CLI availability downgrade — the one documented silent fallback in this repo. Fires when the resolved engine is `auto` or `codex` (either via skill default or explicit `--engine` flag) and the Codex CLI is absent. Banner `engine downgraded: codex-unavailable` always prints; verdict identical to `--engine claude`. Any other silent fallback in skills code is a bug — file it against the skill that introduced it.
|
|
83
|
+
<!-- runtime-principles:section=no-workaround:end -->
|
|
84
|
+
|
|
85
|
+
## Evidence over claim
|
|
86
|
+
<!-- runtime-principles:section=evidence:begin -->
|
|
87
|
+
|
|
88
|
+
Every finding cites concrete evidence. Vague claims are speculation; exclude them.
|
|
89
|
+
|
|
90
|
+
- **Code findings**: `file:line` you have opened.
|
|
91
|
+
- **Missing findings**: explicit "searched X and found no implementation" statement.
|
|
92
|
+
- **Doc findings**: quote of the stale text + section/line reference.
|
|
93
|
+
- **Browser findings**: screenshot reference + URL/route.
|
|
94
|
+
|
|
95
|
+
A finding without one of these forms is excluded. Vague findings produce vague fixes.
|
|
96
|
+
<!-- runtime-principles:section=evidence:end -->
|
|
97
|
+
<!-- runtime-principles:contract:end -->
|
|
98
|
+
|
|
99
|
+
<!-- runtime-principles:consumption:begin -->
|
|
100
|
+
## Consumption (as of iter-0019.A)
|
|
101
|
+
|
|
102
|
+
**Consumers**:
|
|
103
|
+
- `auto-resolve/SKILL.md` `<harness_principles>` block points here as the contract source. Phase prompt bodies (`phase-1-build.md`, `phase-2-evaluate.md`, `phase-3-critic.md`) inline a compact operational excerpt derived from the contract — phase-specific rule_id mappings + the four section names — not the full text.
|
|
104
|
+
- `preflight/SKILL.md` PHASE 3 (Synthesize) and PHASE 3.5 (RND2) reference this file. Auditor prompts (`code-auditor.md`, `browser-auditor.md`) emit `principle.*` rule_ids derived from the rules above.
|
|
105
|
+
|
|
106
|
+
**Codex routing**: skills that route to Codex (auto-resolve fix-loop on `--engine auto`/`codex`, preflight code-auditor on `--engine auto`/`codex`) MUST inline the contract excerpt directly into the Codex prompt — Codex has no filesystem access under `read-only` sandbox.
|
|
107
|
+
|
|
108
|
+
**Non-consumers**:
|
|
109
|
+
- `ideate/SKILL.md` does NOT consume this file. Ideate is planning-layer; its CHALLENGE rubric (`references/challenge-rubric.md`) covers analogous concerns at planning scope, with deliberate one-shot Codex critic discipline.
|
|
110
|
+
<!-- runtime-principles:consumption:end -->
|
|
@@ -0,0 +1,519 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Spec literal verification gate (iter-0019.6 + iter-0019.8 + iter-0019.9
|
|
3
|
+
carrier).
|
|
4
|
+
|
|
5
|
+
Default mode (BUILD_GATE invocation, no args):
|
|
6
|
+
- Resolves the contract carrier in this priority order (iter-0019.8 + Codex
|
|
7
|
+
R2 + iter-0019.9 Codex R-phaseA fix):
|
|
8
|
+
(1) **Benchmark mode trust** (iter-0019.9 fix for the F9 regression): when
|
|
9
|
+
`BENCH_WORKDIR` is set AND `.devlyn/spec-verify.json` already exists
|
|
10
|
+
at script start, trust it as the run-fixture.sh-staged contract from
|
|
11
|
+
`expected.json` and skip source-extract entirely. Without this guard,
|
|
12
|
+
an ideate-generated spec's `## Verification` ```json``` block (e.g.
|
|
13
|
+
F9 e2e novice flow generates `commitCount`/`topAuthors` while
|
|
14
|
+
benchmark truth is `commits`/`authors`) silently overwrote the
|
|
15
|
+
authoritative benchmark contract. For benchmarks, expected.json is
|
|
16
|
+
canonical.
|
|
17
|
+
(2) Otherwise, source markdown extract — read `pipeline.state.json:
|
|
18
|
+
source.{spec_path | criteria_path}` and extract a `## Verification`
|
|
19
|
+
```json``` block. If present, overwrite `.devlyn/spec-verify.json`.
|
|
20
|
+
This is the real-user carrier path; a pre-existing file from a
|
|
21
|
+
killed prior run is stale and must not be trusted in real-user mode.
|
|
22
|
+
(3) If no json block in source AND source.type=="generated": emit
|
|
23
|
+
CRITICAL `correctness.spec-verify-malformed` so the fix-loop reruns
|
|
24
|
+
BUILD.
|
|
25
|
+
(4) If no json block in source AND source.type=="spec": benchmark mode
|
|
26
|
+
with a pre-staged file would have hit branch (1). Without the
|
|
27
|
+
pre-staged file, benchmark falls through to no-op (rare — fixture
|
|
28
|
+
mis-config). Real-user mode is a silent no-op and drops any stale
|
|
29
|
+
pre-staged file (preserves iter-0019.6 backward compat for
|
|
30
|
+
handwritten specs without the carrier).
|
|
31
|
+
- For each verification_commands entry, runs the command in the work-dir,
|
|
32
|
+
captures combined stdout+stderr, and asserts exit_code matches +
|
|
33
|
+
stdout_contains all required literals + stdout_not_contains none of the
|
|
34
|
+
forbidden literals. Mirrors run-fixture.sh's post-run verifier semantics.
|
|
35
|
+
|
|
36
|
+
Check mode (`--check <markdown_path>`):
|
|
37
|
+
- Used by /devlyn:ideate after writing each item spec to validate that the
|
|
38
|
+
generated `## Verification` ```json``` block parses + matches the schema.
|
|
39
|
+
- Exits 0 if the block is well-formed (or absent — ideate's check applies
|
|
40
|
+
to both new specs that include the block and pre-carrier handwritten
|
|
41
|
+
specs that omit it; absence is not failure here, only malformed JSON or
|
|
42
|
+
shape error is). Exits 2 on malformed json or shape error.
|
|
43
|
+
|
|
44
|
+
Why: iter-0018.5's prompt-only contract enforcement was empirically dead
|
|
45
|
+
(F9 verify=0.4 across all engines in iter-0019). Same lesson as iter-0008
|
|
46
|
+
prompt-only engine constraint. Mechanical bash-gate enforcement is the
|
|
47
|
+
only working pattern. iter-0019.8 extends iter-0019.6 from benchmark-only
|
|
48
|
+
to real-user runs by extracting the contract from the spec/criteria
|
|
49
|
+
markdown directly — closes NORTH-STAR test #14.
|
|
50
|
+
|
|
51
|
+
Exit codes:
|
|
52
|
+
- 0: silent no-op (no source carrier, real-user mode) OR --check passed
|
|
53
|
+
OR all commands passed.
|
|
54
|
+
- 1: at least one command failed OR carrier malformed (generated source lacked the
|
|
55
|
+
required carrier, generated source had invalid json/shape, or pre-staged
|
|
56
|
+
file failed shape validation). All paths emit a CRITICAL finding to
|
|
57
|
+
`.devlyn/spec-verify-findings.jsonl`.
|
|
58
|
+
- 2: invocation error (unreadable spec-verify.json, missing markdown in
|
|
59
|
+
--check mode, etc.) OR malformed json / shape error found in --check mode.
|
|
60
|
+
"""
|
|
61
|
+
|
|
62
|
+
from __future__ import annotations
|
|
63
|
+
|
|
64
|
+
import json
|
|
65
|
+
import os
|
|
66
|
+
import re
|
|
67
|
+
import subprocess
|
|
68
|
+
import sys
|
|
69
|
+
from pathlib import Path
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# Matches a heading line that starts with `##`, then spaces/tabs, then the
# word `Verification` (anything may follow on that line), and lazily captures
# everything after the heading line up to the next line that starts with
# `##` + space/tab, or end of text. `(?ms)`: `^` anchors at every line start
# and `.` also matches newlines. Deeper headings such as `### ...` do not
# terminate the capture because `#` is not a space/tab.
VERIFICATION_SECTION_RE = re.compile(
|
|
73
|
+
r'(?ms)^##[ \t]+Verification\b[^\n]*\n(.*?)(?=^##[ \t]+|\Z)'
|
|
74
|
+
)
|
|
75
|
+
# Matches a fenced block whose opening line is ```json (optionally followed
# by spaces/tabs) and whose closing ``` sits at the start of its own line;
# group 1 is the text strictly between those two fence lines.
JSON_FENCE_RE = re.compile(r'(?ms)^```json[ \t]*\n(.*?)\n```[ \t]*$')
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def extract_verification_block(text: str) -> str | None:
    """Pull the raw JSON text of the verification contract out of markdown.

    Looks for the first `## Verification` H2 section in `text` and, inside
    that section only, the first ```json fenced block.

    Args:
        text: Full markdown document contents.

    Returns:
        The text between the ```json fence lines, or None when either the
        section or a json fence inside it is missing. The fence must sit
        after the `## Verification` heading and before the next `## ...`
        heading (or end of file) to count.
    """
    section_match = VERIFICATION_SECTION_RE.search(text)
    if section_match is None:
        return None

    # Search only the section body so a json fence under a different
    # heading is never mistaken for the contract.
    fence_match = JSON_FENCE_RE.search(section_match.group(1))
    if fence_match is None:
        return None
    return fence_match.group(1)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def validate_shape(data) -> str | None:
    """Check parsed JSON against the canonical verification_commands schema.

    Schema (iter-0019.8): a top-level object holding a non-empty
    `verification_commands` list of objects. Per entry:

    - `cmd`: required, a string that is non-empty after stripping whitespace.
    - `exit_code`: optional (defaults to 0), must be an int and not a bool.
      Bool is rejected explicitly because `bool` subclasses `int` in Python,
      so `exit_code: true` would otherwise pass an `isinstance(..., int)`
      check.
    - `stdout_contains` / `stdout_not_contains`: optional (default empty
      list), each must be a list containing only strings.

    Args:
        data: Any value produced by `json.loads`.

    Returns:
        None when `data` conforms; otherwise a human-readable message
        describing the first violation found.
    """
    if not isinstance(data, dict):
        return "top-level must be a JSON object"

    entries = data.get("verification_commands")
    if not isinstance(entries, list):
        return "verification_commands must be a list"
    if len(entries) == 0:
        return "verification_commands must contain at least one entry"

    for i, entry in enumerate(entries):
        if not isinstance(entry, dict):
            return f"verification_commands[{i}] must be an object"

        command = entry.get("cmd")
        has_command = isinstance(command, str) and bool(command.strip())
        if not has_command:
            return f"verification_commands[{i}].cmd must be a non-empty string"

        expected = entry.get("exit_code", 0)
        is_plain_int = isinstance(expected, int) and not isinstance(expected, bool)
        if not is_plain_int:
            return f"verification_commands[{i}].exit_code must be int (not bool)"

        for k in ("stdout_contains", "stdout_not_contains"):
            literals = entry.get(k, [])
            if not isinstance(literals, list):
                return f"verification_commands[{i}].{k} must be a list of strings"
            if any(not isinstance(item, str) for item in literals):
                return f"verification_commands[{i}].{k} must be a list of strings"

    return None
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def read_source(work: Path, devlyn_dir: Path) -> tuple[str | None, Path | None]:
    """Return (source_type, markdown_path) from .devlyn/pipeline.state.json.

    The markdown path comes from `source.spec_path` when `source.type` is
    "spec" and from `source.criteria_path` when it is "generated"; any other
    type has no markdown carrier. A relative path is resolved against `work`.

    Args:
        work: Working directory used to resolve a relative markdown path.
        devlyn_dir: Directory expected to contain `pipeline.state.json`.

    Returns:
        - `(None, None)` when the state file is absent, cannot be read or
          decoded, is not valid JSON, is not a JSON object, or has no
          object-valued `source`.
        - `(source_type, None)` when the type is known but the path is
          missing, is not a string, or does not point at an existing file.
        - `(source_type, path)` otherwise.
    """
    state_path = devlyn_dir / "pipeline.state.json"
    if not state_path.is_file():
        return (None, None)
    try:
        state = json.loads(state_path.read_text())
    except (ValueError, OSError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError,
        # so a state file with undecodable bytes is "unreadable", not a crash.
        return (None, None)

    # A syntactically valid but wrongly shaped state file (e.g. a top-level
    # list, or `source` given as a string) is treated like an unreadable one
    # instead of raising AttributeError on `.get`.
    if not isinstance(state, dict):
        return (None, None)
    src = state.get("source")
    if not isinstance(src, dict):
        return (None, None)

    src_type = src.get("type")
    if src_type == "spec":
        md_path = src.get("spec_path")
    elif src_type == "generated":
        md_path = src.get("criteria_path")
    else:
        md_path = None

    # Non-string paths (numbers, lists, ...) cannot be turned into a Path;
    # treat them the same as a missing path.
    if not isinstance(md_path, str) or not md_path:
        return (src_type, None)

    md = Path(md_path)
    if not md.is_absolute():
        md = work / md
    return (src_type, md if md.is_file() else None)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def stage_from_source(md: Path, devlyn_dir: Path) -> tuple[bool, str | None]:
    """Materialize .devlyn/spec-verify.json from the json block in `md`.

    Args:
        md: Markdown file that may carry a `## Verification` json block.
        devlyn_dir: Directory that receives `spec-verify.json`; created
            (including parents) when a valid block is staged.

    Returns:
        `(staged, error)`:

        - `(True, None)`: the block parsed, passed `validate_shape`, and
          `spec-verify.json` was written containing only the
          `verification_commands` key.
        - `(False, <message>)`: the carrier could not be used -- the file
          could not be read or decoded, the block held invalid JSON, or the
          shape was wrong. The caller emits a CRITICAL finding from the
          message.
        - `(False, None)`: no json block in the source (handwritten spec or
          generated source missing the contract).
    """
    # Read before anything else and report failures through the normal
    # error channel; an uncaught exception here would end the run without
    # any finding being written.
    try:
        text = md.read_text()
    except (OSError, UnicodeDecodeError) as e:
        return (False, f"cannot read {md}: {e}")

    block = extract_verification_block(text)
    if block is None:
        return (False, None)
    try:
        data = json.loads(block)
    except json.JSONDecodeError as e:
        return (False, f"`## Verification` ```json``` block in {md} has invalid JSON: {e}")
    err = validate_shape(data)
    if err:
        return (False, f"`## Verification` ```json``` block in {md}: {err}")

    # Persist only the contract key; any extra top-level keys are dropped.
    normalized = {"verification_commands": data["verification_commands"]}
    devlyn_dir.mkdir(parents=True, exist_ok=True)
    (devlyn_dir / "spec-verify.json").write_text(json.dumps(normalized, indent=2) + "\n")
    return (True, None)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def write_malformed_finding(devlyn_dir: Path, error: str, source_path: Path | None) -> None:
    """Record one CRITICAL finding describing a malformed verification carrier.

    Replaces the contents of `<devlyn_dir>/spec-verify-findings.jsonl` with a
    single JSON line; the directory is created (with parents) if needed.

    Args:
        devlyn_dir: Directory that holds the findings file.
        error: Description of what is wrong; embedded in the finding message.
        source_path: File the finding should point at. When falsy, the
            finding references `.devlyn/pipeline.state.json` instead.
    """
    devlyn_dir.mkdir(parents=True, exist_ok=True)

    if source_path:
        file_ref = str(source_path)
    else:
        file_ref = ".devlyn/pipeline.state.json"

    fix_hint = (
        "Fix the `## Verification` ```json``` block: a JSON object with "
        "a non-empty `verification_commands` array of "
        "{cmd, exit_code?, stdout_contains?, stdout_not_contains?} "
        "entries. See references/build-gate.md § 'Spec literal check'."
    )

    # Key order is kept stable because it is the order json.dumps emits.
    record = {
        "id": "BGATE-0001",
        "rule_id": "correctness.spec-verify-malformed",
        "level": "error",
        "severity": "CRITICAL",
        "confidence": 1.0,
        "message": f"Verification contract carrier is malformed: {error}",
        "file": file_ref,
        "line": 1,
        "phase": "build_gate",
        "criterion_ref": "spec-verify://carrier",
        "fix_hint": fix_hint,
        "blocking": True,
        "status": "open",
    }

    target = devlyn_dir / "spec-verify-findings.jsonl"
    target.write_text(json.dumps(record) + "\n")
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def run_check_mode(md_path: Path) -> int:
    """`--check <markdown>` -- validate the verification carrier without
    running any commands. Used by /devlyn:ideate after item-spec write.

    Args:
        md_path: Markdown file to inspect.

    Returns:
        0 when the `## Verification` json block is absent, or present and
        well-formed. 2 when the file does not exist, cannot be read or
        decoded, or the block is present but malformed (invalid JSON or
        shape error) so ideate can re-prompt. A diagnostic is printed to
        stderr for every non-zero result.
    """
    if not md_path.is_file():
        print(f"[spec-verify --check] error: {md_path} not found", file=sys.stderr)
        return 2

    # Treat an unreadable / undecodable file as an invocation error (exit 2)
    # rather than letting the exception escape as a traceback.
    try:
        text = md_path.read_text()
    except (OSError, UnicodeDecodeError) as e:
        print(f"[spec-verify --check] error: cannot read {md_path}: {e}", file=sys.stderr)
        return 2

    block = extract_verification_block(text)
    if block is None:
        # Section absent or no json block -- opt-in nature preserved for
        # ideate (a spec without machine verification is still valid; it
        # just won't activate the BUILD_GATE gate).
        return 0
    try:
        data = json.loads(block)
    except json.JSONDecodeError as e:
        print(
            f"[spec-verify --check] {md_path}: invalid JSON in `## Verification` "
            f"```json``` block: {e}",
            file=sys.stderr,
        )
        return 2
    err = validate_shape(data)
    if err:
        print(f"[spec-verify --check] {md_path}: shape error: {err}", file=sys.stderr)
        return 2
    return 0
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _command_finding(seq: int, idx: int, message: str, fix_hint: str) -> dict:
    """Build one CRITICAL spec-literal-mismatch finding for command `idx`."""
    return {
        "id": f"BGATE-{seq:04d}",
        "rule_id": "correctness.spec-literal-mismatch",
        "level": "error",
        "severity": "CRITICAL",
        "confidence": 1.0,
        "message": message,
        "file": ".devlyn/spec-verify.json",
        "line": 1,
        "phase": "build_gate",
        "criterion_ref": f"spec-verify://verification_commands/{idx}",
        "fix_hint": fix_hint,
        "blocking": True,
        "status": "open",
    }


def _run_verification_command(
    idx: int, vc: dict, work: Path, env: dict
) -> tuple[dict, tuple[str, str] | None]:
    """Run one validated verification command and compare it to its contract.

    Returns `(result, failure)`: `result` is the record stored in
    spec-verify.results.json; `failure` is None on success, otherwise a
    `(message, fix_hint)` pair used to build the finding.
    """
    cmd = vc["cmd"]
    expected_exit = vc.get("exit_code", 0)
    stdout_contains = vc.get("stdout_contains", []) or []
    stdout_not_contains = vc.get("stdout_not_contains", []) or []

    try:
        # NOTE: the contract defines `cmd` as a shell command line, so it is
        # executed through the shell by design; the carrier is therefore as
        # trusted as the spec it came from.
        proc = subprocess.run(
            cmd,
            cwd=str(work),
            shell=True,
            env=env,
            capture_output=True,
            text=True,
            # Output that is not valid in the locale encoding must not abort
            # the check with UnicodeDecodeError; undecodable bytes are
            # replaced so the literal comparison can still run.
            errors="replace",
            timeout=60,
        )
    except subprocess.TimeoutExpired:
        return (
            {"index": idx, "cmd": cmd, "pass": False, "reason": "timeout"},
            (
                f"Verification command #{idx + 1} timed out after 60s.",
                f"Command `{cmd}` exceeded 60s. Reduce work or fix a "
                f"hang in the implementation.",
            ),
        )
    except Exception as e:  # noqa: BLE001 — surface any harness error explicitly
        return (
            {"index": idx, "cmd": cmd, "pass": False,
             "reason": f"error:{e.__class__.__name__}:{e}"},
            (
                f"Verification command #{idx + 1} raised "
                f"{e.__class__.__name__}: {e}.",
                f"Command `{cmd}` could not be executed. Check the work-dir "
                f"state and any environment setup the command requires.",
            ),
        )

    # Mirror run-fixture.sh post-run verifier: combined stdout+stderr.
    out = (proc.stdout or "") + (proc.stderr or "")
    ok_exit = proc.returncode == expected_exit
    ok_contains = all(s in out for s in stdout_contains)
    ok_not = not any(s in out for s in stdout_not_contains)
    passed = bool(ok_exit and ok_contains and ok_not)

    if passed:
        reason = None
    elif not ok_exit:
        reason = "exit"
    elif not ok_contains:
        reason = "missing_contains"
    else:
        reason = "unexpected_text"

    result = {
        "index": idx,
        "cmd": cmd,
        "expected_exit": expected_exit,
        "actual_exit": proc.returncode,
        "stdout_contains": stdout_contains,
        "stdout_not_contains": stdout_not_contains,
        "pass": passed,
        "reason": reason,
        "stdout_tail": out[-500:],
    }
    if passed:
        return (result, None)

    # Construct fine-grained message naming the specific failure.
    if not ok_exit:
        msg = (
            f"Verification command #{idx + 1} failed: expected exit "
            f"{expected_exit}, got {proc.returncode}."
        )
    elif not ok_contains:
        missing = [s for s in stdout_contains if s not in out]
        msg = (
            f"Verification command #{idx + 1} failed: expected "
            f"output to contain {missing!r}."
        )
    else:
        forbidden = [s for s in stdout_not_contains if s in out]
        msg = (
            f"Verification command #{idx + 1} failed: output "
            f"contained forbidden literal(s) {forbidden!r}."
        )

    fix_hint = (
        f"See .devlyn/spec-verify.results.json for the captured "
        f"output. Update implementation so `{cmd}` matches the "
        f"contract (exit_code={expected_exit}, "
        f"contains={stdout_contains}, not_contains={stdout_not_contains})."
    )
    return (result, (msg, fix_hint))


def main() -> int:
    """Entry point: `--check <markdown>` validation, or the full gate run.

    With `--check`, delegates to `run_check_mode`. Otherwise resolves the
    verification contract (pre-staged file in benchmark mode, or extracted
    from the source markdown), runs every command in it from the work dir,
    and writes `.devlyn/spec-verify.results.json` plus
    `.devlyn/spec-verify-findings.jsonl`.

    Returns:
        0 on a silent no-op or when all commands passed; 1 when a command
        failed or the carrier is malformed; 2 on an invocation error
        (bad `--check` usage, unreadable/unparseable spec-verify.json).
    """
    if len(sys.argv) >= 2 and sys.argv[1] == "--check":
        if len(sys.argv) != 3:
            print("usage: spec-verify-check.py --check <markdown-path>", file=sys.stderr)
            return 2
        return run_check_mode(Path(sys.argv[2]))

    bench_mode = "BENCH_WORKDIR" in os.environ
    work = Path(os.environ.get("BENCH_WORKDIR") or os.getcwd())
    devlyn_dir = work / ".devlyn"
    spec_path = devlyn_dir / "spec-verify.json"

    # iter-0019.8 + iter-0019.9 (Codex R-phaseA): determine the contract
    # carrier source for THIS run. Order:
    #   1. Benchmark mode (BENCH_WORKDIR set) AND a pre-staged
    #      .devlyn/spec-verify.json exists at script start: TRUST it (this is
    #      the run-fixture.sh contract staged from expected.json). Skip
    #      source-extract entirely. iter-0019.9 closes the F9 regression where
    #      source-extract from an ideate-generated spec overwrote the
    #      benchmark contract — for benchmarks, expected.json is canonical.
    #   2. Otherwise, attempt source-extract from
    #      `pipeline.state.json:source.{spec_path | criteria_path}`. If it has
    #      a json block, overwrite .devlyn/spec-verify.json with it. This is
    #      the real-user carrier path; in real-user mode a pre-existing file
    #      is stale (from a killed prior run) and must NOT be trusted.
    #   3. If source has no json block AND source.type=="generated":
    #      CRITICAL spec-verify-malformed — generated criteria must ship a
    #      verifiable contract per phase-1-build.md <output_contract>.
    #   4. If source has no json block AND source.type=="spec":
    #      - Real-user mode: silent no-op (preserves iter-0019.6 backward
    #        compat for handwritten specs without the carrier). Drop any
    #        stale pre-staged file.
    #      - Benchmark mode: fall through to the pre-staged-trust branch
    #        (covers pre-iter-0019.9 fixtures whose spec.md has prose-only
    #        Verification — run-fixture.sh staged the contract regardless).
    pre_staged = spec_path.is_file()  # captured BEFORE any potential write
    trust_bench_staged = bench_mode and pre_staged
    src_type, source_md = read_source(work, devlyn_dir)
    if source_md is not None and not trust_bench_staged:
        staged, error = stage_from_source(source_md, devlyn_dir)
        if error is not None:
            print(f"[spec-verify] carrier malformed: {error}", file=sys.stderr)
            write_malformed_finding(devlyn_dir, error, source_md)
            return 1
        if not staged:
            if src_type == "generated":
                msg = (
                    f"generated {source_md.name} must include a "
                    "`## Verification` ```json``` block (verification_commands "
                    "array). PHASE 1 BUILD generated criteria without one."
                )
                print(f"[spec-verify] {msg}", file=sys.stderr)
                write_malformed_finding(devlyn_dir, msg, source_md)
                return 1
            # source.type=="spec", no block in spec markdown.
            if not bench_mode:
                # Real-user handwritten spec: silent no-op. Drop any stale
                # pre-staged file so a killed prior run cannot poison this
                # run's gate.
                if spec_path.exists():
                    spec_path.unlink()
                return 0
            # Benchmark mode with no source block AND no pre-staged file
            # (rare — fixture mis-config) falls through to the no-pre-staged
            # silent no-op branch below.

    # iter-0019.9 (Codex R2 caveat): close the real-user no-source-md
    # stale-orphan gap. If pipeline.state.json is absent or has no source,
    # but a stale .devlyn/spec-verify.json exists in real-user mode, drop
    # it — the only legitimate path that reaches here with a pre-staged
    # file is benchmark mode (run-fixture.sh staged it).
    if source_md is None and not bench_mode and spec_path.exists():
        spec_path.unlink()
        return 0

    if not spec_path.exists():
        # No source markdown carrier AND no pre-staged file. Silent no-op
        # for benchmark misconfigurations (no fixture to gate against) and
        # for real-user runs without spec/criteria. Generated source case
        # is handled above.
        return 0

    try:
        spec = json.loads(spec_path.read_text())
    except (ValueError, OSError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"[spec-verify] error: cannot parse {spec_path}: {e}", file=sys.stderr)
        return 2

    # iter-0019.8 (Codex R2 #2): apply full shape validation to pre-staged
    # carriers too — bool exit_code, empty list, whitespace-only cmd were
    # silently accepted on the benchmark path. Empty list is rejected
    # because "all 0 commands passed" is vacuously true.
    shape_err = validate_shape(spec)
    if shape_err:
        print(f"[spec-verify] error: {spec_path}: {shape_err}", file=sys.stderr)
        write_malformed_finding(devlyn_dir, f"{spec_path}: {shape_err}", None)
        return 1
    commands = spec["verification_commands"]

    devlyn_dir.mkdir(parents=True, exist_ok=True)
    results_path = devlyn_dir / "spec-verify.results.json"
    findings_path = devlyn_dir / "spec-verify-findings.jsonl"

    # Commands always see BENCH_WORKDIR pointing at the work dir, even when
    # the variable was not set for this process.
    verify_env = os.environ.copy()
    verify_env["BENCH_WORKDIR"] = str(work)

    results: list[dict] = []
    findings: list[dict] = []

    # validate_shape above guarantees every entry is a dict with a non-empty
    # string `cmd`, so no per-entry "missing cmd" handling is needed here.
    for idx, vc in enumerate(commands):
        result, failure = _run_verification_command(idx, vc, work, verify_env)
        results.append(result)
        if failure is not None:
            message, fix_hint = failure
            # Finding ids are numbered 1..N in the order failures occur.
            findings.append(_command_finding(len(findings) + 1, idx, message, fix_hint))

    results_path.write_text(json.dumps({"commands": results}, indent=2) + "\n")

    # Append findings (jsonl). BUILD_GATE merge step concatenates this onto
    # build_gate.findings.jsonl; never overwrite the orchestrator's own gate
    # findings. Truncate this file each run since it is a per-round artifact.
    with findings_path.open("w") as fh:
        for f in findings:
            fh.write(json.dumps(f) + "\n")

    failed = [r for r in results if r.get("pass") is False]
    if failed:
        print(
            f"[spec-verify] {len(failed)}/{len(results)} command(s) failed; "
            f"{len(findings)} CRITICAL finding(s) written to {findings_path}",
            file=sys.stderr,
        )
        return 1

    print(
        f"[spec-verify] all {len(results)} command(s) passed",
        file=sys.stderr,
    )
    return 0
|
|
516
|
+
|
|
517
|
+
|
|
518
|
+
# Script entry point: main()'s integer result becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|