devlyn-cli 1.13.0 → 1.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +28 -149
- package/README.md +30 -1
- package/config/skills/devlyn:auto-resolve/SKILL.md +167 -453
- package/config/skills/devlyn:auto-resolve/evals/evals.json +21 -0
- package/config/skills/devlyn:auto-resolve/evals/task-doctor-subcommand.md +42 -0
- package/config/skills/devlyn:auto-resolve/references/build-gate.md +36 -22
- package/config/skills/devlyn:auto-resolve/references/engine-routing.md +43 -165
- package/config/skills/devlyn:auto-resolve/references/findings-schema.md +103 -0
- package/config/skills/devlyn:auto-resolve/references/phases/phase-1-build.md +54 -0
- package/config/skills/devlyn:auto-resolve/references/phases/phase-2-evaluate.md +45 -0
- package/config/skills/devlyn:auto-resolve/references/phases/phase-3-critic.md +84 -0
- package/config/skills/devlyn:auto-resolve/references/pipeline-routing.md +114 -0
- package/config/skills/devlyn:auto-resolve/references/pipeline-state.md +201 -0
- package/config/skills/devlyn:auto-resolve/scripts/archive_run.py +104 -0
- package/config/skills/devlyn:auto-resolve/scripts/terminal_verdict.py +96 -0
- package/config/skills/devlyn:ideate/SKILL.md +17 -78
- package/config/skills/devlyn:ideate/references/codex-critic-template.md +42 -0
- package/config/skills/devlyn:ideate/references/templates/item-spec.md +4 -0
- package/config/skills/devlyn:preflight/SKILL.md +25 -40
- package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +6 -10
- package/config/skills/devlyn:reap/SKILL.md +104 -0
- package/config/skills/devlyn:reap/scripts/reap.sh +129 -0
- package/config/skills/devlyn:reap/scripts/scan.sh +116 -0
- package/package.json +5 -1
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# PHASE 3 — CRITIC (agent prompt body)
|
|
2
|
+
|
|
3
|
+
Spawned when PHASE 3 runs. Engine: CRITIC row of `engine-routing.md` — design sub-pass always Claude; security sub-pass Dual on `--engine auto`, single on others.
|
|
4
|
+
|
|
5
|
+
**Findings-only**: CRITIC does NOT write code. Orchestrator routes `NEEDS_WORK`/`BLOCKED` findings into PHASE 2.5 with `triggered_by: "critic"`. No bespoke mini-loop inside CRITIC.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
<spec_integrity_check>
|
|
10
|
+
Before reading anything: verify source hash per `references/phases/phase-1-build.md#spec_integrity_check`.
|
|
11
|
+
</spec_integrity_check>
|
|
12
|
+
|
|
13
|
+
<goal>
|
|
14
|
+
One post-EVAL critic pass with two parallel sub-concerns. Produce a single `.devlyn/critic.findings.jsonl` tagged by rule_id prefix, plus a single `.devlyn/critic.log.md`.
|
|
15
|
+
</goal>
|
|
16
|
+
|
|
17
|
+
<input>
|
|
18
|
+
- Change surface: `git diff <pipeline.state.json:base_ref.sha>`. Read every changed file in full, not just the hunks.
|
|
19
|
+
- `package.json` / `requirements.txt` / lockfiles (`package-lock.json`, `pnpm-lock.yaml`, `yarn.lock`, `Pipfile.lock`, `poetry.lock`, `Cargo.lock`, `go.sum`) — for dependency audit.
|
|
20
|
+
</input>
|
|
21
|
+
|
|
22
|
+
## Sub-pass 1: DESIGN (always Claude)
|
|
23
|
+
|
|
24
|
+
<design_goal>
|
|
25
|
+
Read the diff cold — no checklist, no prior-phase context. Find what a staff engineer would block before this PR ships. Any hesitation is a finding.
|
|
26
|
+
</design_goal>
|
|
27
|
+
|
|
28
|
+
<design_quality_bar>
|
|
29
|
+
- Every finding anchored to `file:line` in code you have opened, with a concrete fix. Vague ≠ finding.
|
|
30
|
+
- `fix_hint` is a specific change ("change X to Y because Z"), never "consider improving".
|
|
31
|
+
- Interrogate: would this survive 10x traffic? A midnight oncall page? A junior dev in 6 months? Are baked-in assumptions stated out loud (hardcoded limits, implicit ordering, missed business-logic edges)? Is error handling actually helpful or does it prevent crashes while leaving users confused? Are there simpler idiomatic approaches — not "clever" but genuinely better?
|
|
32
|
+
- Do not open with praise.
|
|
33
|
+
- Rule_ids: `design.non-atomic-transaction`, `design.duplicate-pattern`, `design.hidden-assumption`, `design.unidiomatic-pattern`, `design.missing-integration`, etc.
|
|
34
|
+
- Severities: CRITICAL / HIGH / MEDIUM — no LOW (design is ship/no-ship).
|
|
35
|
+
|
|
36
|
+
**Design sub-verdict**: `PASS` only if zero design findings. Any open design finding → `NEEDS_WORK`.
|
|
37
|
+
</design_quality_bar>
|
|
38
|
+
|
|
39
|
+
## Sub-pass 2: SECURITY (Dual on `--engine auto`, single otherwise)
|
|
40
|
+
|
|
41
|
+
<security_goal>
|
|
42
|
+
Dedicated security audit of all recent changes. NOT a general code review — focus exclusively on security concerns. File:line evidence for every finding.
|
|
43
|
+
</security_goal>
|
|
44
|
+
|
|
45
|
+
<security_quality_bar>
|
|
46
|
+
Check every changed file for:
|
|
47
|
+
1. **Input validation**: trace every user input entry → storage/output. SQL injection, XSS, command injection, path traversal, SSRF.
|
|
48
|
+
2. **Auth & authorization**: new endpoints protected? Auth checks consistent? Privilege escalation / BOLA paths?
|
|
49
|
+
3. **Secrets & credentials**: grep for hardcoded API keys, tokens, passwords, private keys. Secrets from env vars. `.gitignore` covers sensitive files.
|
|
50
|
+
4. **Data exposure**: error messages leaking internal details? Logs capturing sensitive data? API responses returning more than needed?
|
|
51
|
+
5. **Dependencies** — **MANDATORY** when any dep manifest or lockfile changed (see `<input>` list above). Run the package manager's audit command:
|
|
52
|
+
- `npm audit --json` (Node/pnpm/yarn — all write to `npm audit`-compatible JSON)
|
|
53
|
+
- `pip-audit --format json`
|
|
54
|
+
- `cargo audit --json`
|
|
55
|
+
- `govulncheck -json ./...`
|
|
56
|
+
Report findings at CRITICAL/HIGH as blocking. Record the command run and its JSON output in `critic.log.md`.
|
|
57
|
+
6. **CSRF/CORS**: new endpoints with side effects → CSRF protection. CORS not overly permissive.
|
|
58
|
+
|
|
59
|
+
Rule_ids: `security.sql-injection`, `security.xss`, `security.path-traversal`, `security.ssrf`, `security.hardcoded-credential`, `security.missing-input-validation`, `security.missing-auth-check`, `security.privilege-escalation`, `security.data-exposure`, `security.insecure-dependency`, `security.missing-csrf`, `security.permissive-cors`.
|
|
60
|
+
|
|
61
|
+
**Security sub-verdict** (stricter than general — same as v3.2 SECURITY):
|
|
62
|
+
- `PASS` — zero findings
|
|
63
|
+
- `PASS_WITH_ISSUES` — LOW only
|
|
64
|
+
- `NEEDS_WORK` — HIGH or MEDIUM present (security MEDIUM is blocking by design)
|
|
65
|
+
- `BLOCKED` — any CRITICAL
|
|
66
|
+
|
|
67
|
+
**Dual merging** (when `--engine auto`): same finding from both models → keep more detailed wording, mark "confirmed by both". Codex-only → prefix message with `[codex]`. Conflicts → keep both. Take the MORE SEVERE severity between the two.
|
|
68
|
+
</security_quality_bar>
|
|
69
|
+
|
|
70
|
+
## Output contract
|
|
71
|
+
|
|
72
|
+
- **`.devlyn/critic.findings.jsonl`** — one JSONL file containing BOTH sub-passes' findings. Every line carries `phase: "critic"`. Rule_id prefix (`design.*` vs `security.*`) distinguishes sub-pass. ID prefix: `CRIT-<4digit>` (single sequence shared by both sub-passes for simplicity).
|
|
73
|
+
- **`.devlyn/critic.log.md`** — single prose summary: two sections ("Design" + "Security"). Each section: verdict + top 3 concerns framed actionably. Security section records the dep-audit command and its result.
|
|
74
|
+
- **state.json phases.critic** — record both sub-verdicts AND the combined verdict. Combined verdict = WORSE of the two:
|
|
75
|
+
- Any `BLOCKED` → `BLOCKED`
|
|
76
|
+
- Any `NEEDS_WORK` → `NEEDS_WORK`
|
|
77
|
+
- Any `PASS_WITH_ISSUES` → `PASS_WITH_ISSUES`
|
|
78
|
+
- Both `PASS` → `PASS`
|
|
79
|
+
|
|
80
|
+
## Principles
|
|
81
|
+
|
|
82
|
+
- Cold eyes catch what structured reviews miss. For design: "would I ship this with my name on it?" is the only question.
|
|
83
|
+
- For security: OWASP-anchored findings, file:line evidence. Speculative security concerns without a concrete attack vector are noise.
|
|
84
|
+
- Do NOT write code changes. Do NOT commit. Orchestrator handles routing.
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# Pipeline Routing — 3 Routes + Stage A + Stage B LITE
|
|
2
|
+
|
|
3
|
+
Auto-resolve adapts its pipeline shape to each task. This file is the single source of truth for route selection: the orchestrator reads it, and SKILL.md does not restate the rules.
|
|
4
|
+
|
|
5
|
+
## The 3 routes
|
|
6
|
+
|
|
7
|
+
| Route | Intended for | Phases that run |
|
|
8
|
+
|-------|-------------|-----------------|
|
|
9
|
+
| `fast` | Trivial / low-complexity, zero risk signals | PARSE → BUILD → BUILD GATE → [BROWSER if web] → EVAL → [FIX if findings] → FINAL REPORT |
|
|
10
|
+
| `standard` | Default for medium work | `fast` + CRITIC (findings-only) + DOCS |
|
|
11
|
+
| `strict` | High-complexity OR risk signals present OR escalated | `standard` + team-assembled BUILD + BUILD GATE strict mode |
|
|
12
|
+
|
|
13
|
+
Every route runs PARSE, BUILD, BUILD GATE, EVAL, and FINAL REPORT. Routes differ in whether CRITIC/DOCS run and whether BUILD assembles a team.
|
|
14
|
+
|
|
15
|
+
**Findings-only** (CRITIC) means the phase emits a `.findings.jsonl` + `.log.md` but does not write code. The orchestrator routes any NEEDS_WORK/BLOCKED findings through the unified fix loop (see SKILL.md `PHASE 2.5`), which re-runs EVAL. This enforces the post-EVAL invariant: all semantic changes go through EVAL.
|
|
16
|
+
|
|
17
|
+
## Default guardrails (route-invariant under `auto`)
|
|
18
|
+
|
|
19
|
+
These hold across all three routes with no `--bypass`:
|
|
20
|
+
|
|
21
|
+
1. **BUILD GATE PASS** — `fast` runs the gate too.
|
|
22
|
+
2. **Independent EVAL PASS** — file:line evidence required.
|
|
23
|
+
3. **Every criterion terminal** (`verified` or `failed`).
|
|
24
|
+
4. **Zero open HIGH/CRITICAL findings** at pipeline exit (subject to `--max-rounds` — see exhaustion table).
|
|
25
|
+
5. **Web file changes force BROWSER VALIDATE** (`.tsx/.jsx/.vue/.svelte/.css/.html`, `page.*/layout.*/route.*`).
|
|
26
|
+
6. **Post-BUILD risk detection auto-escalates** via Stage B LITE.
|
|
27
|
+
|
|
28
|
+
## The `--bypass` flag
|
|
29
|
+
|
|
30
|
+
Semantics: `--bypass <phase>[,<phase>...]`. Bypassable phases: `build-gate`, `browser`, `critic`, `docs`.
|
|
31
|
+
|
|
32
|
+
Every bypass is recorded in `state.route.bypasses` and surfaced in the final report's `Guardrails bypassed:` line.
|
|
33
|
+
|
|
34
|
+
**Deprecated aliases** (still accepted; a warning is logged once): `--skip-build-gate`, `--skip-browser`, `--skip-review`, `--skip-clean`, `--skip-docs`, `--security-review skip`, `--bypass simplify|review|clean|security|challenge`. Aliases in the post-EVAL group map to `--bypass critic`; every other alias maps to `--bypass <phase>` for its corresponding phase. All aliases will be removed in the next minor version.
|
|
35
|
+
|
|
36
|
+
## `--max-rounds` exhaustion
|
|
37
|
+
|
|
38
|
+
When the fix loop exhausts `max_rounds` with findings still open:
|
|
39
|
+
|
|
40
|
+
| `triggered_by` | exhaustion behavior |
|
|
41
|
+
|---|---|
|
|
42
|
+
| `build_gate` | **halt** — skip to FINAL REPORT with `BUILD GATE EXHAUSTED` banner |
|
|
43
|
+
| `browser_validate` | **halt** — skip to FINAL REPORT with `BROWSER EXHAUSTED` banner |
|
|
44
|
+
| `evaluate` | **proceed_with_warning** — FINAL REPORT shows `EVAL EXHAUSTED` banner + open findings |
|
|
45
|
+
| `critic` | **proceed_with_warning** — FINAL REPORT shows `CRITIC EXHAUSTED` banner + open findings |
|
|
46
|
+
|
|
47
|
+
Guardrail #4 is suspended under `proceed_with_warning` exhaustion: the FINAL REPORT banner shows what remains unresolved.
|
|
48
|
+
|
|
49
|
+
## Stage A — Pre-build (PHASE 0)
|
|
50
|
+
|
|
51
|
+
Decision order (first match wins):
|
|
52
|
+
|
|
53
|
+
1. **User override** (`--route fast|standard|strict`): set `route.selected`, `route.user_override: true`. Stage B LITE will not run.
|
|
54
|
+
2. **Hard blocker**: missing spec or unmet internal deps → halt BLOCKED.
|
|
55
|
+
3. **Risk keywords in source**: grep source body (spec body for spec-driven, task description for generated) for `auth, login, session, token, secret, password, crypto, api, env, permission, access, database, migration, payment`. Any hit → `strict`. Record matched keywords.
|
|
56
|
+
4. **Complexity-based** (spec-driven only):
|
|
57
|
+
- `spec.frontmatter.complexity == "high"` → `strict`
|
|
58
|
+
- `spec.frontmatter.complexity == "medium"` → `standard`
|
|
59
|
+
- `spec.frontmatter.complexity == "low"` → `fast`
|
|
60
|
+
5. **Generated tasks**: default to `standard`, Stage B LITE may escalate after BUILD.
|
|
61
|
+
|
|
62
|
+
Stage A writes to `state.route.stage_a.{at, reasons}`.
|
|
63
|
+
|
|
64
|
+
## Stage B LITE — Post-BUILD-GATE (PHASE 1.4)
|
|
65
|
+
|
|
66
|
+
**One rule** (simplified from v3.2's multi-heuristic machinery). Does not run if `route.user_override == true`. Only escalates, never de-escalates.
|
|
67
|
+
|
|
68
|
+
**Rule**: escalate to `strict` if `git diff <state.base_ref.sha>` meets ANY of:
|
|
69
|
+
|
|
70
|
+
- **Risk keyword in diff content** — matches any of the 14 Stage A risk keywords.
|
|
71
|
+
- **API surface** — changed files include paths under `src/api/`, `routes/`, `handlers/`, `app/api/`.
|
|
72
|
+
- **Dependency change** — any of: `package.json`, `requirements.txt`, `package-lock.json`, `pnpm-lock.yaml`, `yarn.lock`, `Pipfile.lock`, `poetry.lock`, `Cargo.toml`, `Cargo.lock`, `go.mod`, `go.sum`.
|
|
73
|
+
|
|
74
|
+
Stage B LITE writes to `state.route.stage_b.{at, escalated_from, reasons}`. No escalation → `stage_b.at` remains `null`.
|
|
75
|
+
|
|
76
|
+
## Phase inclusion matrix
|
|
77
|
+
|
|
78
|
+
| Phase | `fast` | `standard` | `strict` |
|
|
79
|
+
|-------|--------|-----------|----------|
|
|
80
|
+
| 0 PARSE + PREFLIGHT + ROUTE | ✓ | ✓ | ✓ |
|
|
81
|
+
| 1 BUILD (solo) | ✓ | ✓ | — (team) |
|
|
82
|
+
| 1 BUILD (team) | `--team` | `--team` | ✓ |
|
|
83
|
+
| 1.4 BUILD GATE (auto) | ✓ | ✓ | — (strict+docker) |
|
|
84
|
+
| 1.4 BUILD GATE (strict+docker) | — | — | ✓ |
|
|
85
|
+
| 1.5 BROWSER VALIDATE | ✓ (web) | ✓ (web) | ✓ (web) |
|
|
86
|
+
| 2 EVALUATE | ✓ | ✓ | ✓ |
|
|
87
|
+
| 2.5 UNIFIED FIX LOOP | ✓ (if findings) | ✓ (if findings) | ✓ (if findings) |
|
|
88
|
+
| 3 CRITIC (findings-only) | — | ✓ | ✓ (Dual security sub-pass) |
|
|
89
|
+
| 4 DOCS (doc-files only) | — | ✓ | ✓ |
|
|
90
|
+
| 5 FINAL REPORT + ARCHIVE | ✓ | ✓ | ✓ |
|
|
91
|
+
|
|
92
|
+
Legend: ✓ runs, — skipped by route. `--bypass <phase>` forces skip on any route. `fast` skips CRITIC and DOCS.
|
|
93
|
+
|
|
94
|
+
## Terminal-state algorithm (PHASE 5)
|
|
95
|
+
|
|
96
|
+
Final verdict computed across all findings files in precedence order:
|
|
97
|
+
|
|
98
|
+
1. **BUILD GATE FAIL** at exhaustion → `BLOCKED` with `BUILD GATE EXHAUSTED`.
|
|
99
|
+
2. **BROWSER VALIDATE BLOCKED** at exhaustion → `BLOCKED`.
|
|
100
|
+
3. **Any unresolved CRITICAL** in any `<phase>.findings.jsonl` → `BLOCKED`.
|
|
101
|
+
4. **Any unresolved HIGH** with `rule_id` prefix `correctness.*` / `security.*` / `design.*` → `NEEDS_WORK`.
|
|
102
|
+
5. **Any unresolved HIGH** (other categories) → `NEEDS_WORK`.
|
|
103
|
+
6. **Any unresolved MEDIUM security.*** → `NEEDS_WORK` (security stricter than general by design).
|
|
104
|
+
7. **Any unresolved MEDIUM** (other categories) → `PASS_WITH_ISSUES`.
|
|
105
|
+
8. **Only LOW / none** → `PASS`.
|
|
106
|
+
|
|
107
|
+
"Unresolved" means `status == "open"` in the latest round's file.
|
|
108
|
+
|
|
109
|
+
## Non-goals
|
|
110
|
+
|
|
111
|
+
- Per-criterion routing (every phase sees every criterion).
|
|
112
|
+
- Route re-evaluation mid-round.
|
|
113
|
+
- De-escalation.
|
|
114
|
+
- Replacing `--bypass` (bypass is an orthogonal opt-out).
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
# Pipeline State — `.devlyn/pipeline.state.json`
|
|
2
|
+
|
|
3
|
+
Control plane for a single auto-resolve run. Contains pointers and state only — never copied content from the spec or findings files.
|
|
4
|
+
|
|
5
|
+
## Purpose
|
|
6
|
+
|
|
7
|
+
Every phase reads `pipeline.state.json` to answer:
|
|
8
|
+
- What base git SHA am I diffing against? (prevents diff-scope drift across phases)
|
|
9
|
+
- Where is the canonical criteria source? (spec file path or generated file path)
|
|
10
|
+
- What route was selected and why?
|
|
11
|
+
- Which criteria are verified / failed and with what evidence?
|
|
12
|
+
- What is the current fix-loop round and max?
|
|
13
|
+
- Where are the artifacts from phases that already ran?
|
|
14
|
+
- What SHA did EVALUATE first pass at? (post-EVAL invariant check)
|
|
15
|
+
|
|
16
|
+
State.json is the only cross-phase mutable state. Spec files and `<phase>.findings.jsonl` are immutable within a run.
|
|
17
|
+
|
|
18
|
+
## File location
|
|
19
|
+
|
|
20
|
+
`.devlyn/pipeline.state.json` during a run; moved to `.devlyn/runs/<run_id>/pipeline.state.json` at PHASE 5 (archive).
|
|
21
|
+
|
|
22
|
+
Created by PHASE 0 on run start. At PHASE 5, the entire `.devlyn/` run artifact set is **moved** (not deleted) into `.devlyn/runs/<run_id>/`. See `## Archive contract` below.
|
|
23
|
+
|
|
24
|
+
## Canonical schema (v1.2)
|
|
25
|
+
|
|
26
|
+
```json
|
|
27
|
+
{
|
|
28
|
+
"version": "1.2",
|
|
29
|
+
"run_id": "ar-<ISO8601-compact>-<uuidv7-short>",
|
|
30
|
+
"started_at": "<ISO-8601 UTC>",
|
|
31
|
+
"engine": "auto" | "codex" | "claude",
|
|
32
|
+
"base_ref": {
|
|
33
|
+
"branch": "<string, e.g. 'main'>",
|
|
34
|
+
"sha": "<full 40-char git sha captured at Phase 0 start>"
|
|
35
|
+
},
|
|
36
|
+
"eval_passed_sha": "<git sha recorded when PHASE 2 first returns PASS or PASS_WITH_ISSUES>" | null,
|
|
37
|
+
"route": {
|
|
38
|
+
"selected": "fast" | "standard" | "strict" | null,
|
|
39
|
+
"user_override": true | false,
|
|
40
|
+
"bypasses": ["<phase-name>", "..."],
|
|
41
|
+
"stage_a": {
|
|
42
|
+
"at": "<ISO-8601 UTC>" | null,
|
|
43
|
+
"reasons": ["<string>", "..."]
|
|
44
|
+
},
|
|
45
|
+
"stage_b": {
|
|
46
|
+
"at": "<ISO-8601 UTC>" | null,
|
|
47
|
+
"escalated_from": "fast" | "standard" | null,
|
|
48
|
+
"reasons": ["<string>", "..."]
|
|
49
|
+
}
|
|
50
|
+
},
|
|
51
|
+
"source": {
|
|
52
|
+
"type": "spec" | "generated",
|
|
53
|
+
"spec_path": "<string path>" | null,
|
|
54
|
+
"spec_sha256": "<hex>" | null,
|
|
55
|
+
"criteria_path": "<string path>" | null,
|
|
56
|
+
"criteria_sha256": "<hex>" | null,
|
|
57
|
+
"criteria_anchors": ["spec://requirements", "..."]
|
|
58
|
+
},
|
|
59
|
+
"criteria": [
|
|
60
|
+
{
|
|
61
|
+
"id": "C1",
|
|
62
|
+
"ref": "<anchor>",
|
|
63
|
+
"status": "pending" | "implemented" | "verified" | "failed",
|
|
64
|
+
"evidence": [
|
|
65
|
+
{"file": "<string>", "line": <int>, "note": "<string>"}
|
|
66
|
+
],
|
|
67
|
+
"failed_by_finding_ids": ["<string>"]
|
|
68
|
+
}
|
|
69
|
+
],
|
|
70
|
+
"phases": {
|
|
71
|
+
"<phase_name>": {
|
|
72
|
+
"verdict": "PASS" | "PASS_WITH_ISSUES" | "NEEDS_WORK" | "FAIL" | "BLOCKED" | null,
|
|
73
|
+
"engine": "codex" | "claude" | "bash" | "dual" | null,
|
|
74
|
+
"model": "<string>" | null,
|
|
75
|
+
"started_at": "<ISO-8601 UTC>" | null,
|
|
76
|
+
"completed_at": "<ISO-8601 UTC>" | null,
|
|
77
|
+
"duration_ms": <int> | null,
|
|
78
|
+
"round": <int>,
|
|
79
|
+
"triggered_by": "<phase-name>" | null,
|
|
80
|
+
"pre_sha": "<git sha captured before this phase spawned; used for per-phase diff invariant>" | null,
|
|
81
|
+
"artifacts": {
|
|
82
|
+
"findings_file": "<path>" | null,
|
|
83
|
+
"log_file": "<path>" | null
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
},
|
|
87
|
+
"rounds": {
|
|
88
|
+
"global": <int>,
|
|
89
|
+
"max_rounds": <int>
|
|
90
|
+
},
|
|
91
|
+
"perf": { // OPTIONAL — present only when --perf flag is passed (v3.4 demoted from mandatory)
|
|
92
|
+
"wall_ms": <int>,
|
|
93
|
+
"tokens_total": <int>,
|
|
94
|
+
"per_phase": [
|
|
95
|
+
{"phase": "<name>", "engine": "codex" | "claude" | "bash" | "dual", "wall_ms": <int>, "tokens": <int>, "round": <int>, "triggered_by": "<phase>" | null}
|
|
96
|
+
]
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Field semantics
|
|
102
|
+
|
|
103
|
+
### Top-level
|
|
104
|
+
|
|
105
|
+
- `version` — schema version; current value `1.2`. Orchestrators must refuse incompatible versions.
|
|
106
|
+
- `run_id` — unique, time-sortable run identifier in format `ar-<UTC-compact>-<12 hex>`. Example: `ar-20260423T163044Z-018f4c2a1b9c`.
|
|
107
|
+
- `started_at` — Phase 0 start, ISO-8601 UTC.
|
|
108
|
+
- `engine` — user-provided `--engine` flag value, or `auto` default.
|
|
109
|
+
- `base_ref` — git state captured at Phase 0. **All subsequent `git diff` commands use this SHA**, not `HEAD~1` or `main`. This eliminates diff-scope drift.
|
|
110
|
+
- `eval_passed_sha` — `null` until PHASE 2 first returns `PASS` or `PASS_WITH_ISSUES`. At that moment the orchestrator records `git rev-parse HEAD` here. After this field is populated, the **post-EVAL findings-only invariant** applies: PHASE 3 (CRITIC) must not write any non-doc files (reverted on violation), and PHASE 4 (DOCS) may only touch doc-allowlist paths. See `invariants` section of the skill.
|
|
111
|
+
|
|
112
|
+
### Route
|
|
113
|
+
|
|
114
|
+
- `selected` — `fast` / `standard` / `strict`, or `null` before Phase 0 decides.
|
|
115
|
+
- `user_override` — `true` if user passed `--route <value>`.
|
|
116
|
+
- `bypasses` — list of phase names the user explicitly bypassed via `--bypass <phase>`. Surfaced in the final report's `Guardrails bypassed` line. Empty list if no bypass.
|
|
117
|
+
- `stage_a` — initial routing at Phase 0, based on spec frontmatter + content scan.
|
|
118
|
+
- `stage_b` — post-BUILD checkpoint at Phase 1.4 completion. **Can only escalate** (`fast` or `standard` → `strict`), never de-escalate. `at` is `null` if no escalation.
|
|
119
|
+
- `reasons` — human-readable decision rationale, surfaced in final report.
|
|
120
|
+
|
|
121
|
+
### Source
|
|
122
|
+
|
|
123
|
+
- `type` — `spec` (roadmap spec file) or `generated` (ad-hoc task).
|
|
124
|
+
- `spec_path` + `spec_sha256` — canonical spec pointer + integrity hash for spec runs. Each phase re-computes and compares before reading. Mismatch → phase writes `verdict: "BLOCKED"` with reason `spec_sha256 mismatch`.
|
|
125
|
+
- `criteria_path` + `criteria_sha256` — same pair for generated runs. `criteria_sha256` is populated by PHASE 1 BUILD after it creates `criteria.generated.md`. Subsequent phases verify it the same way.
|
|
126
|
+
- `criteria_anchors` — enumerated anchors downstream phases may reference.
|
|
127
|
+
|
|
128
|
+
### Criteria
|
|
129
|
+
|
|
130
|
+
One entry per testable criterion extracted from the source. State machine: `pending → implemented → verified | failed`.
|
|
131
|
+
|
|
132
|
+
### Phases
|
|
133
|
+
|
|
134
|
+
Key is phase name (v3.4 set): `build`, `build_gate`, `browser_validate`, `evaluate`, `fix_loop`, `critic`, `docs`, `final_report`.
|
|
135
|
+
|
|
136
|
+
- `verdict` — `PASS` / `PASS_WITH_ISSUES` / `NEEDS_WORK` / `FAIL` / `BLOCKED` / `null`. **Single canonical verdict source** — orchestrator branches on this, never by parsing artifact files.
|
|
137
|
+
- `engine` / `model` — which model ran this phase. `bash` for build-gate. `dual` for `critic` security sub-pass on `--engine auto`.
|
|
138
|
+
- `round` — which fix-loop round this execution belongs to. Phases that run once: `1`. `build_gate`, `browser_validate`, `evaluate`, `critic` increment with fix-loop iterations.
|
|
139
|
+
- `triggered_by` — for phases re-run via the unified fix loop (PHASE 2.5), records the triggering phase name (`build_gate` / `browser_validate` / `evaluate` / `critic`). Also written on fix-loop entries themselves. `null` for the first run.
|
|
140
|
+
- `pre_sha` — captured by the orchestrator immediately before spawning a post-EVAL phase (`git rev-parse HEAD`). Used by the post-EVAL invariant to diff **only what this phase touched**. Applies to `critic` and `docs`. `null` for PARSE/BUILD/BUILD_GATE/BROWSER/EVAL (those use `base_ref.sha`).
|
|
141
|
+
- `artifacts` — pointers to phase output files. Phases that emit structured findings write both `findings_file` and `log_file`. `critic` writes a single `.devlyn/critic.findings.jsonl` carrying both design and security rule_id prefixes. DOCS leaves both `null` (its output is git commits).
|
|
142
|
+
- `sub_verdicts` (only on `critic`) — `{"design": <verdict>, "security": <verdict>}`; overall `verdict` = WORSE of the two per `references/phases/phase-3-critic.md`.
|
|
143
|
+
- `dep_audit` (only on `critic`) — `{"ran": bool, "command": "<cmd>", "high": N, "critical": N}` populated when critic's security sub-pass ran `npm audit` / `pip-audit` / equivalent.
|
|
144
|
+
|
|
145
|
+
### Rounds
|
|
146
|
+
|
|
147
|
+
- `global` — shared round counter across all fix-loop invocations regardless of trigger. Increments once per fix-loop iteration.
|
|
148
|
+
- `max_rounds` — cap from `--max-rounds` flag (default 4).
|
|
149
|
+
|
|
150
|
+
### Perf (opt-in via `--perf`, v3.4)
|
|
151
|
+
|
|
152
|
+
When `--perf` is passed, the orchestrator records wall-time and token consumption per phase for retrospective benchmarking. When the flag is omitted (the default), the `perf` block is absent from state.json and the orchestrator skips timing/token bookkeeping — Karpathy P2 (Simplicity First) applied: no mandatory meta-measurement.
|
|
153
|
+
|
|
154
|
+
When enabled:
|
|
155
|
+
- `wall_ms` — total wall-clock from PHASE 0 start to PHASE 5 end, in milliseconds.
|
|
156
|
+
- `tokens_total` — sum of `per_phase[].tokens`.
|
|
157
|
+
- `per_phase` — one entry per phase execution. Fields: `phase`, `engine`, `wall_ms`, `tokens` (from subagent `total_tokens` or Codex usage; `bash` reports 0), `round`, `triggered_by`.
|
|
158
|
+
|
|
159
|
+
Written at phase completion; totals roll up at PHASE 5.
|
|
160
|
+
|
|
161
|
+
## Anchor syntax
|
|
162
|
+
|
|
163
|
+
Format: `<scheme>://<section>[/<index>]`. `scheme` is `spec` or `criteria.generated`. `section` is slug-lowercased H2. `index` is optional 0-based position.
|
|
164
|
+
|
|
165
|
+
## Write protocol
|
|
166
|
+
|
|
167
|
+
- **Phase 0 (PARSE + PREFLIGHT + ROUTE)** — creates state.json with `version`, `run_id`, `started_at`, `engine`, `base_ref`, `rounds.max_rounds`, empty `phases`, and (after preflight step) populates `source`, `criteria[]` with `status: pending`, `route.selected`, `route.stage_a`, `route.bypasses`. `eval_passed_sha` remains `null`.
|
|
168
|
+
- **Each phase start** — orchestrator writes `phases.<name>.started_at`, `round`, `triggered_by` (if re-run).
|
|
169
|
+
- **Each phase end** — phase writes `phases.<name>.{verdict, completed_at, duration_ms, artifacts}`. Build and Evaluate additionally update `criteria[]` state. **When EVALUATE first returns PASS/PASS_WITH_ISSUES**, orchestrator sets `state.eval_passed_sha = git rev-parse HEAD` — this is the reference point for the post-EVAL invariant.
|
|
170
|
+
- **Phase 1.4 completion checkpoint** — orchestrator runs Stage B LITE routing check; writes `route.stage_b` on escalation.
|
|
171
|
+
- **Phase 5 (FINAL REPORT + ARCHIVE)** — reads state.json for the report, renders the report, then archives (see below).
|
|
172
|
+
|
|
173
|
+
## Archive contract (PHASE 5)
|
|
174
|
+
|
|
175
|
+
Best-effort move-and-prune. Replaces the previous "delete `.devlyn/`" behavior.
|
|
176
|
+
|
|
177
|
+
1. Create `.devlyn/runs/<run_id>/` with `mkdir -p`.
|
|
178
|
+
2. Move `.devlyn/pipeline.state.json`, every `.devlyn/<phase>.findings.jsonl`, every `.devlyn/<phase>.log.md`, every `.devlyn/fix-batch.round-*.json`, and `.devlyn/criteria.generated.md` (if exists) into that directory. Use `mv` (atomic within a filesystem).
|
|
179
|
+
3. Prune to the last 10 completed runs. List `.devlyn/runs/*/pipeline.state.json`, sort by enclosing `run_id` (lexicographic = chronological because run_ids start with a compact ISO8601 timestamp), and delete the oldest directories until at most 10 remain. **Never delete a directory whose `pipeline.state.json` has `phases.final_report.verdict == null`** — those are still in flight.
|
|
180
|
+
4. Kill any dev-server process spawned by PHASE 1.5 (BROWSER VALIDATE).
|
|
181
|
+
|
|
182
|
+
Best-effort; no cross-process lock. Pruning is idempotent on sorted run_id list, so concurrent runs at worst delete a run already slated for pruning.
|
|
183
|
+
|
|
184
|
+
## Integrity invariants
|
|
185
|
+
|
|
186
|
+
The orchestrator enforces:
|
|
187
|
+
|
|
188
|
+
1. `base_ref.sha` never changes after Phase 0.
|
|
189
|
+
2. `source.spec_sha256` (or `source.criteria_sha256` for generated runs) is re-verified at every phase start. Mismatch → the phase writes `verdict: "BLOCKED"` with reason. Missing hash is allowed ONLY on the phase that first populates it (PHASE 0 for spec; PHASE 1 for generated).
|
|
190
|
+
3. `route.selected` can only escalate via `stage_b`. No de-escalation.
|
|
191
|
+
4. `rounds.global` never exceeds `rounds.max_rounds`.
|
|
192
|
+
5. `criteria[].status` progression is monotonic per round: `pending → implemented → verified | failed`. A `failed` criterion can return to `implemented` via a subsequent fix-loop round, then be re-evaluated.
|
|
193
|
+
6. **Post-EVAL findings-only** (per-phase diff, not cumulative): once `eval_passed_sha` is non-null, each post-EVAL phase (CRITIC, DOCS) records `phases.<phase>.pre_sha = git rev-parse HEAD` at spawn time. After completion, the orchestrator runs `git diff --name-only <phases.<phase>.pre_sha> -- ':!.devlyn/**'`. For CRITIC (findings-only), any non-empty diff triggers `git reset --hard <pre_sha>` + `invariant.post-eval-code-mutation` finding + fix-loop entry. For DOCS, only doc-file-allowlist paths are legal; everything else triggers the same flow. `pre_sha` (not cumulative `eval_passed_sha`) is the correct baseline because fix-loop commits between EVAL and CRITIC are legitimate — they were re-EVALed. The `:!.devlyn/**` pathspec excludes orchestrator bookkeeping writes.
|
|
194
|
+
|
|
195
|
+
Violations indicate a bug in the orchestrator. Do not attempt silent recovery.
|
|
196
|
+
|
|
197
|
+
## Non-goals
|
|
198
|
+
|
|
199
|
+
- Crash-resume / workflow-engine semantics. State.json enables audit and orchestrator branching, not resume-from-crash.
|
|
200
|
+
- Full SARIF export from state.json. `<phase>.findings.jsonl` is the SARIF-aligned surface; state.json is internal.
|
|
201
|
+
- Per-finding history across runs. Current run's findings live in its `runs/<run_id>/` directory; cross-run comparison is manual.
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Archive auto-resolve run artifacts per references/pipeline-state.md#archive-contract.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
python3 scripts/archive_run.py [--devlyn-dir .devlyn]
|
|
6
|
+
|
|
7
|
+
Reads run_id from .devlyn/pipeline.state.json, moves per-run artifacts into
|
|
8
|
+
.devlyn/runs/<run_id>/, then best-effort prunes to last 10 completed runs
|
|
9
|
+
(in-flight runs — phases.final_report.verdict == null — are never deleted).
|
|
10
|
+
|
|
11
|
+
The contract lives in pipeline-state.md. This script implements it so that
|
|
12
|
+
archive behavior is identical across every invocation.
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import json
|
|
18
|
+
import pathlib
|
|
19
|
+
import shutil
|
|
20
|
+
import sys
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Glob patterns, evaluated against the devlyn directory, that select the
# artifacts belonging to a single run. Anything not matched here is left alone.
PER_RUN_PATTERNS = (
    "pipeline.state.json",     # control-plane state for the run
    "*.findings.jsonl",        # per-phase structured findings
    "*.log.md",                # per-phase prose logs
    "fix-batch.round-*.json",  # fix-loop batches, one per round
    "criteria.generated.md",   # criteria file, present only for generated runs
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def read_run_id(devlyn: pathlib.Path) -> str:
    """Return the ``run_id`` recorded in ``<devlyn>/pipeline.state.json``.

    The caller joins the returned value onto ``<devlyn>/runs/`` as a single
    directory name, so the value is validated here: it must be a non-empty
    string that is exactly one path component (no separators, not ``.`` or
    ``..``). Anything else would either crash the path join or place the
    archive outside ``runs/``.

    Args:
        devlyn: Path to the ``.devlyn`` directory.

    Returns:
        The validated ``run_id`` string.

    Raises:
        SystemExit: with an ``error: ...`` message if the state file is
            missing, unreadable, not valid JSON, not a JSON object, or its
            ``run_id`` is missing or invalid.
    """
    state_path = devlyn / "pipeline.state.json"
    if not state_path.is_file():
        raise SystemExit(f"error: {state_path} not found")
    try:
        state = json.loads(state_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        raise SystemExit(f"error: {state_path} is not valid JSON: {e}")
    except (OSError, UnicodeDecodeError) as e:
        raise SystemExit(f"error: cannot read {state_path}: {e}")
    # A JSON array/scalar at the root has no .get(); report it like any
    # other bad state file instead of crashing with AttributeError.
    if not isinstance(state, dict):
        raise SystemExit(f"error: {state_path} is not a JSON object")
    run_id = state.get("run_id")
    if not run_id:
        raise SystemExit(f"error: {state_path} has no run_id")
    # PurePath(...).name equals the input only for a single path component;
    # ".." is a single component too, so it is rejected explicitly.
    if (
        not isinstance(run_id, str)
        or run_id in (".", "..")
        or pathlib.PurePath(run_id).name != run_id
    ):
        raise SystemExit(f"error: {state_path} has invalid run_id: {run_id!r}")
    return run_id
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def move_artifacts(devlyn: pathlib.Path, dest: pathlib.Path) -> int:
    """Move every per-run artifact file from `devlyn` into `dest`.

    `dest` is created (with parents) if it does not exist. Each pattern in
    PER_RUN_PATTERNS is globbed directly inside `devlyn`; matches that are
    regular files are moved into `dest` under their original file name.

    Returns:
        The number of files moved.
    """
    dest.mkdir(parents=True, exist_ok=True)
    # Lazily walk the patterns in their declared order.
    matches = (
        path
        for pattern in PER_RUN_PATTERNS
        for path in devlyn.glob(pattern)
    )
    count = 0
    for path in matches:
        if not path.is_file():
            continue
        target = dest / path.name
        shutil.move(str(path), str(target))
        count += 1
    return count
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def prune(runs_dir: pathlib.Path, keep: int = 10) -> int:
    """Delete oldest completed runs beyond `keep`. In-flight runs never removed.

    A run directory is a pruning candidate only when its
    ``pipeline.state.json`` parses as a JSON object whose
    ``phases.final_report.verdict`` is not null. Directories with a missing,
    unreadable, malformed, or unexpectedly shaped state file are left alone,
    because their flight state cannot be decided safely.

    Pruning is best-effort: a directory that cannot be removed is reported on
    stderr and skipped rather than aborting the caller.

    Args:
        runs_dir: Directory containing one sub-directory per run.
        keep: Number of most recent completed runs to retain. Assumes run
            directory names sort chronologically.

    Returns:
        The number of run directories actually deleted.
    """
    candidates = []
    for d in sorted(runs_dir.glob("*/"), key=lambda p: p.name):
        state_file = d / "pipeline.state.json"
        if not state_file.is_file():
            continue
        try:
            s = json.loads(state_file.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers JSONDecodeError and UnicodeDecodeError.
            # Can't decide flight-state safely; skip (never prune)
            continue
        # Walk phases -> final_report -> verdict tolerating null or non-object
        # nodes (e.g. "final_report": null before the report phase has run).
        node = s
        for key in ("phases", "final_report"):
            node = node.get(key) if isinstance(node, dict) else None
        verdict = node.get("verdict") if isinstance(node, dict) else None
        if verdict is None:
            continue  # in-flight (or undecidable): never prune
        candidates.append(d)
    over = len(candidates) - keep
    if over <= 0:
        return 0
    pruned = 0
    for d in candidates[:over]:  # oldest first (lex sort = chronological)
        try:
            shutil.rmtree(d)
        except OSError as e:
            sys.stderr.write(f"warn: could not prune {d}: {e}\n")
            continue
        pruned += 1
    return pruned
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def main() -> int:
    """Archive the current run's artifacts and prune old completed runs.

    Parses ``--devlyn-dir`` and ``--keep``, moves the per-run artifacts into
    ``<devlyn-dir>/runs/<run_id>/``, prunes completed runs beyond ``--keep``,
    and prints a one-line summary to stdout.

    Returns:
        0 on success, 1 if the devlyn directory does not exist.
    """
    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    parser.add_argument("--devlyn-dir", default=".devlyn")
    parser.add_argument(
        "--keep",
        type=int,
        default=10,
        help="keep N most recent completed runs",
    )
    options = parser.parse_args()

    devlyn = pathlib.Path(options.devlyn_dir)
    if devlyn.is_dir():
        run_id = read_run_id(devlyn)
        runs_root = devlyn / "runs"
        moved = move_artifacts(devlyn, runs_root / run_id)
        pruned = prune(runs_root, keep=options.keep)
        print(f"archived run_id={run_id} files={moved} pruned={pruned}", file=sys.stdout)
        return 0

    print(f"error: {devlyn} is not a directory", file=sys.stderr)
    return 1
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
if __name__ == "__main__":
|
|
104
|
+
raise SystemExit(main())
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Compute auto-resolve terminal verdict per references/pipeline-routing.md#terminal-state-algorithm.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
python3 scripts/terminal_verdict.py [--devlyn-dir .devlyn] [--json]
|
|
6
|
+
|
|
7
|
+
Reads every `.devlyn/<phase>.findings.jsonl`, filters `status == "open"`, applies the
|
|
8
|
+
precedence list, and prints the verdict (stdout) and exit code.
|
|
9
|
+
|
|
10
|
+
Exit codes: 0 = PASS | 1 = PASS_WITH_ISSUES | 2 = NEEDS_WORK | 3 = BLOCKED
|
|
11
|
+
|
|
12
|
+
The pipeline routing file defines the authoritative precedence. This script implements
|
|
13
|
+
it deterministically so the orchestrator does not re-reason through the rule set per run.
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import json
|
|
19
|
+
import pathlib
|
|
20
|
+
import sys
|
|
21
|
+
from collections import Counter
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _any_with(severity, rule_prefixes=None):
    """Build a predicate that is true when any finding has `severity`.

    When `rule_prefixes` (a str or tuple of str) is given, the finding's
    ``rule_id`` (default ``""``) must additionally start with one of them.
    """
    def predicate(fs):
        for f in fs:
            if f["severity"] != severity:
                continue
            if rule_prefixes is None or f.get("rule_id", "").startswith(rule_prefixes):
                return True
        return False

    return predicate


# Ordered (label, predicate on finding list) pairs — the first predicate that
# returns True decides the verdict.
PRECEDENCE = [
    ("BLOCKED", _any_with("CRITICAL")),
    ("NEEDS_WORK", _any_with("HIGH", ("correctness.", "security.", "design."))),
    ("NEEDS_WORK", _any_with("HIGH")),
    ("NEEDS_WORK", _any_with("MEDIUM", "security.")),
    ("PASS_WITH_ISSUES", _any_with("MEDIUM")),
    ("PASS_WITH_ISSUES", _any_with("LOW")),
    ("PASS", lambda _fs: True),  # fallthrough
]

# Process exit code reported for each verdict label.
EXIT = dict(PASS=0, PASS_WITH_ISSUES=1, NEEDS_WORK=2, BLOCKED=3)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def collect_open(devlyn: pathlib.Path) -> list[dict]:
    """Return every finding with ``status == "open"`` from `devlyn`.

    Each ``*.findings.jsonl`` file directly inside `devlyn` is read as JSON
    Lines: one JSON object per physical line, blank lines ignored. Files are
    processed in sorted path order so the result order is deterministic.

    A line that is not valid JSON, or is valid JSON but not an object (for
    example ``[]`` or ``null``), is reported on stderr and skipped.

    Args:
        devlyn: Path to the ``.devlyn`` directory.

    Returns:
        The open findings as dicts, in file order then line order.
    """
    open_findings: list[dict] = []
    for jsonl in sorted(devlyn.glob("*.findings.jsonl")):
        # Iterate the file object rather than str.splitlines(): splitlines()
        # also breaks on U+2028/U+2029/U+0085, which may appear unescaped
        # inside JSON strings and would cut a valid finding in two.
        with jsonl.open(encoding="utf-8") as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    f = json.loads(line)
                except json.JSONDecodeError:
                    f = None  # reported together with non-object values below
                if not isinstance(f, dict):
                    # Malformed line surfaces explicitly rather than silently dropping.
                    sys.stderr.write(f"warn: malformed finding in {jsonl}: {line[:80]}\n")
                    continue
                if f.get("status") == "open":
                    open_findings.append(f)
    return open_findings
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def compute(findings: list[dict]) -> str:
    """Return the label of the first PRECEDENCE rule whose predicate accepts `findings`.

    Falls back to "PASS" if no rule matches.
    """
    matching = (label for label, pred in PRECEDENCE if pred(findings))
    return next(matching, "PASS")
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def main() -> int:
    """Compute and print the terminal verdict; return its exit code.

    Parses ``--devlyn-dir`` and ``--json``, collects the open findings,
    applies the precedence list, and writes either a JSON summary or a
    two-line text summary to stdout.

    Returns:
        The EXIT code for the verdict (0 PASS, 1 PASS_WITH_ISSUES,
        2 NEEDS_WORK, 3 BLOCKED). 3 is also returned when the devlyn
        directory is missing or the verdict cannot be computed.
    """
    p = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    p.add_argument("--devlyn-dir", default=".devlyn", help="path to .devlyn/ (default: ./.devlyn)")
    p.add_argument("--json", action="store_true", help="emit JSON summary to stdout")
    args = p.parse_args()

    devlyn = pathlib.Path(args.devlyn_dir)
    if not devlyn.is_dir():
        sys.stderr.write(f"error: {devlyn} is not a directory\n")
        return EXIT["BLOCKED"]

    # An uncaught exception would make Python exit with status 1, which this
    # script's contract defines as PASS_WITH_ISSUES. Catch everything at this
    # top-level boundary and fail closed as BLOCKED instead.
    try:
        findings = collect_open(devlyn)
        verdict = compute(findings)
        by_sev = Counter(f["severity"] for f in findings)
        if args.json:
            report = json.dumps(
                {"verdict": verdict, "open": len(findings), "by_severity": dict(by_sev)}
            ) + "\n"
        else:
            breakdown = " ".join(f"{k}={v}" for k, v in sorted(by_sev.items()))
            report = f"{verdict}\nopen: {len(findings)} ({breakdown})\n"
        code = EXIT[verdict]
    except Exception as e:
        sys.stderr.write(f"error: could not compute verdict: {type(e).__name__}: {e}\n")
        return EXIT["BLOCKED"]

    sys.stdout.write(report)
    return code
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
if __name__ == "__main__":
|
|
96
|
+
raise SystemExit(main())
|