npm - valent-pipeline - Versions diffs - 0.1.16 → 0.1.17 - Mend

valent-pipeline 0.1.16 → 0.1.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (75) hide show

package/package.json +1 -1
package/pipeline/agents-manifest.yaml +13 -15
package/pipeline/docs/agent-reference.md +3 -3
package/pipeline/docs/communication-standard.md +2 -2
package/pipeline/docs/lead-lifecycle.md +9 -9
package/pipeline/docs/pipeline-overview.md +12 -12
package/pipeline/docs/pipeline-state-schema.md +32 -2
package/pipeline/docs/task-graph.md +14 -15
package/pipeline/docs/template-skeleton.md +12 -11
package/pipeline/prompts/bend.md +1 -1
package/pipeline/prompts/fend.md +4 -3
package/pipeline/prompts/judge.md +64 -0
package/pipeline/prompts/lead.md +195 -35
package/pipeline/prompts/qa-a.md +2 -2
package/pipeline/prompts/qa-b.md +2 -2
package/pipeline/prompts/readiness.md +70 -0
package/pipeline/prompts/reqs.md +1 -1
package/pipeline/prompts/retrospective.md +12 -3
package/pipeline/prompts/uxa.md +1 -1
package/pipeline/scripts/embed-sqlite.ts +4 -3
package/pipeline/steps/bend/estimate.md +50 -0
package/pipeline/steps/critic/test-review.md +10 -1
package/pipeline/steps/fend/estimate.md +51 -0
package/pipeline/steps/{judge-g1/pass2-review.md → judge/bug-review.md} +11 -12
package/pipeline/steps/{judge-g2 → judge}/evidence-review.md +20 -18
package/pipeline/steps/judge/ship-decision.md +39 -0
package/pipeline/steps/orchestration/adopt-lead-and-create-team.md +23 -3
package/pipeline/steps/orchestration/sprint-execute.md +57 -0
package/pipeline/steps/orchestration/sprint-groom.md +61 -0
package/pipeline/steps/orchestration/sprint-init.md +64 -0
package/pipeline/steps/orchestration/sprint-plan.md +87 -0
package/pipeline/steps/orchestration/sprint-review.md +70 -0
package/pipeline/steps/orchestration/sprint-size.md +35 -0
package/pipeline/steps/orchestration/update-backlog-status.md +2 -2
package/pipeline/steps/qa-a/ui.md +57 -0
package/pipeline/steps/qa-b/ui.md +58 -0
package/pipeline/steps/qa-b/write-report.md +4 -5
package/pipeline/steps/readiness/sprint-review.md +46 -0
package/pipeline/steps/{judge-g1/pass1-review.md → readiness/standalone-review.md} +13 -10
package/pipeline/steps/retrospective/calibration.md +52 -0
package/pipeline/steps/retrospective/directives.md +23 -0
package/pipeline/steps/retrospective/report.md +14 -0
package/pipeline/task-graphs/backend-api.yaml +32 -39
package/pipeline/task-graphs/data-pipeline.yaml +32 -39
package/pipeline/task-graphs/document-generation.yaml +32 -39
package/pipeline/task-graphs/frontend-only.yaml +31 -38
package/pipeline/task-graphs/fullstack-web.yaml +34 -41
package/pipeline/task-graphs/library.yaml +32 -39
package/pipeline/task-graphs/mcp-server.yaml +32 -39
package/pipeline/templates/bugs.template.md +4 -4
package/pipeline/templates/estimation.template.md +30 -0
package/pipeline/templates/execution-report.template.md +1 -1
package/pipeline/templates/{judge-g2-decision.template.md → judge-decision.template.md} +7 -7
package/pipeline/templates/judge-review.template.md +49 -0
package/pipeline/templates/pmcp-evidence.template.md +2 -2
package/pipeline/templates/qa-test-spec.template.md +1 -1
package/pipeline/templates/{judge-g1-review.template.md → readiness-review.template.md} +28 -49
package/pipeline/templates/reqs-brief.template.md +1 -1
package/pipeline/templates/sprint-plan.template.md +35 -0
package/pipeline/templates/sprint-status.template.yaml +53 -0
package/pipeline/templates/story-report.template.md +5 -5
package/pipeline/templates/traceability-matrix.template.md +2 -2
package/pipeline/templates/uxa-spec.template.md +1 -1
package/pipeline/templates/visual-validation-checklist.template.md +2 -2
package/skills/valent-configure/SKILL.md +1 -1
package/skills/valent-help/SKILL.md +3 -3
package/skills/valent-run-epic/SKILL.md +3 -1
package/skills/valent-run-retrospective/SKILL.md +2 -2
package/skills/valent-run-story/SKILL.md +1 -1
package/src/commands/db-init.js +25 -1
package/src/commands/db-rebuild.js +31 -3
package/src/lib/config-schema.js +37 -1
package/pipeline/prompts/judge-g1.md +0 -65
package/pipeline/prompts/judge-g2.md +0 -61
package/pipeline/steps/judge-g2/ship-decision.md +0 -34

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "valent-pipeline",
-  "version": "0.1.16",
+  "version": "0.1.17",
   "description": "v3 multi-agent AI pipeline for software development lifecycle",
   "type": "module",
   "bin": {

package/pipeline/agents-manifest.yaml CHANGED Viewed

@@ -62,17 +62,15 @@ agents:
     reads_from: [reqs-brief.md, uxa-spec.md]
     writes_to: [qa-test-spec.md, visual-validation-checklist.md]
-  judge_g1:
-    name: JUDGE-G1
+  readiness:
+    name: READINESS
     model: sonnet
     lifecycle: per-story
-    role: "Quality gate — validates reqs, UXA spec, test specs (Pass 1) and bug priorities (Pass 2)"
-    prompt_template: .valent-pipeline/prompts/judge-g1.md
-    passes:
-      pass1_review_order: [reqs-validation, uxa-validation, qa-spec-validation]  # sequential, stop on first failure
-      pass2: bug-review
-    reads_from: [reqs-brief.md, uxa-spec.md, qa-test-spec.md, bugs.md, execution-report.md]
-    writes_to: [judge-g1-review.md]
+    role: "Spec quality gate — validates reqs, UXA spec, and test specs are implementation-ready"
+    prompt_template: .valent-pipeline/prompts/readiness.md
+    review_order: [reqs-validation, uxa-validation, qa-spec-validation]  # sequential, stop on first failure
+    reads_from: [reqs-brief.md, uxa-spec.md, qa-test-spec.md]
+    writes_to: [readiness-review.md]
   bend:
     name: BEND
@@ -113,14 +111,14 @@ agents:
     writes_to: [execution-report.md, bugs.md, traceability-matrix.md]
     can_request_spawn: [pmcp]  # asks lead to spawn PMCP
-  judge_g2:
-    name: JUDGE-G2
+  judge:
+    name: JUDGE
     model: sonnet
     lifecycle: per-story
-    role: "Final ship gate — evidence-based approval or rejection"
-    prompt_template: .valent-pipeline/prompts/judge-g2.md
-    reads_from: [execution-report.md, traceability-matrix.md, pmcp-evidence.md, bugs.md, judge-g1-review.md, qa-test-spec.md]  # critic-review.md intentionally excluded — G2 validates test/execution evidence, not code review; qa-test-spec.md used as reference for assertion cross-check
-    writes_to: [judge-g2-decision.md, story-report.md]
+    role: "Final quality gate — bug priority review + evidence-based ship decision"
+    prompt_template: .valent-pipeline/prompts/judge.md
+    reads_from: [execution-report.md, traceability-matrix.md, pmcp-evidence.md, bugs.md, qa-test-spec.md]  # critic-review.md intentionally excluded — JUDGE validates test/execution evidence, not code review; qa-test-spec.md used as reference for assertion cross-check
+    writes_to: [judge-review.md, judge-decision.md, story-report.md]
   knowledge:
     name: Knowledge

package/pipeline/docs/agent-reference.md CHANGED Viewed

@@ -16,12 +16,12 @@ Spawned fresh for each story and torn down after the story ships or is cancelled
 | REQS | Sonnet | Requirements analyst -- translates ACs into implementation brief | story-input (ACs, trigger-map, architecture-decisions, UX spec) | `reqs-brief.md` | Brainstorms ambiguity resolutions; escalates only when options have genuinely competing tradeoffs |
 | UXA | Sonnet | UX specification -- translates UX spec into component specs | `reqs-brief.md`, ux-spec, trigger-map, scenarios | `uxa-spec.md` | Runs translation-only mode without trigger-map or scenarios; skipped for backend-only projects |
 | QA-A | Sonnet | QA spec writer -- produces behavioral test specifications | `reqs-brief.md`, `uxa-spec.md` | `qa-test-spec.md`, `visual-validation-checklist.md` | Writes test specs before code exists; tests are specified, not implemented |
-| JUDGE-G1 | Sonnet | Quality gate -- validates specs (Pass 1) and bug priorities (Pass 2) | `reqs-brief.md`, `uxa-spec.md`, `qa-test-spec.md`, `bugs.md`, `execution-report.md` | `judge-g1-review.md` | Sequential review: stops on first failure in Pass 1 |
+| READINESS | Sonnet | Spec quality gate -- validates specs before execution begins | `reqs-brief.md`, `uxa-spec.md`, `qa-test-spec.md` | `readiness-review.md` | Sequential review: stops on first failure |
 | BEND | Opus | Backend developer -- implements production code and tests | `reqs-brief.md`, `qa-test-spec.md` | `bend-handoff.md` | Implements to QA-A test spec; coordinates with FEND via inbox for shared files |
 | FEND | Opus | Frontend developer -- implements UI components and tests | `reqs-brief.md`, `uxa-spec.md`, `qa-test-spec.md` | `fend-handoff.md` | Implements to UXA component spec; skipped for backend-only projects |
 | CRITIC | Opus | Code reviewer -- 3-pass adversarial review | git-diff, `reqs-brief.md`, `qa-test-spec.md` | `critic-review.md` | 3-pass sequential review (blind hunt, edge-case hunt, acceptance audit) + triage |
 | QA-B | Sonnet | Test executor -- runs tests, validates spec alignment, files bugs | `qa-test-spec.md`, `critic-review.md`, `reqs-brief.md` | `execution-report.md`, `bugs.md`, `traceability-matrix.md` | Runs tests against real infrastructure; can request PMCP spawn for visual validation |
-| JUDGE-G2 | Sonnet | Final ship gate -- evidence-based approval or rejection | `execution-report.md`, `traceability-matrix.md`, `pmcp-evidence.md`, `bugs.md`, `judge-g1-review.md` | `judge-g2-decision.md` | Evidence over assertion -- independently verifies every upstream claim |
+| JUDGE | Sonnet | Final quality gate -- bug review + ship decision | `execution-report.md`, `traceability-matrix.md`, `pmcp-evidence.md`, `bugs.md`, `qa-test-spec.md` | `judge-review.md`, `judge-decision.md`, `story-report.md` | Evidence over assertion -- independently verifies every upstream claim |
 | Knowledge | Haiku | Knowledge retrieval -- answers queries from persistent data sources | chromadb, curated-knowledge-files, correction-directives | _(none -- inbox only)_ | Responds via inbox only; no file output |
 ### Persistent Agent (1)
@@ -66,7 +66,7 @@ Not all agents run for every project type. The Lead reads `project_type` from `p
 | Tier | Agents | Use Case | Cost |
 |------|--------|----------|------|
 | Opus | Lead, BEND, FEND, CRITIC | Complex code generation, orchestration, nuanced multi-pass review | Highest |
-| Sonnet | REQS, UXA, QA-A, QA-B, JUDGE-G1, JUDGE-G2, PMCP, Retrospective | Analysis, spec writing, test execution, judgment, coordination | Balanced |
+| Sonnet | REQS, UXA, QA-A, QA-B, READINESS, JUDGE, PMCP, Retrospective | Analysis, spec writing, test execution, judgment, coordination | Balanced |
 | Haiku | Knowledge, Embed, Help | Mechanical retrieval, indexing instructions, documentation lookups | Lowest |
 Model assignments are configurable in `pipeline-config.yaml` under the `models` section. Move agents between tiers to adjust the quality/cost tradeoff for your project.

package/pipeline/docs/communication-standard.md CHANGED Viewed

@@ -150,7 +150,7 @@ Every handoff file begins with this YAML frontmatter block. It is the machine-re
 ```yaml
 ---
-agent: {agent-name}          # producing agent: reqs | uxa | qa-a | bend | fend | critic | qa-b | judge-g1 | judge-g2 | pmcp | retrospective
+agent: {agent-name}          # producing agent: reqs | uxa | qa-a | readiness | bend | fend | critic | qa-b | judge | pmcp | retrospective
 story: {story-id}            # story identifier, e.g. STORY-042
 status: {status}             # in_progress | completed
 stepsCompleted: []           # list of completed sub-steps within this agent's phase
@@ -278,7 +278,7 @@ Design Council is a structured deliberation protocol using existing inbox primit
 - REQS flags a high-ambiguity decision where brainstormed options have genuinely competing tradeoffs and no clear winner
 - CRITIC rejects code for the second time on the same issue, suggesting a deeper design disagreement
-- JUDGE G1 rejects a test spec but the rejection reason is debatable -- the spec author (QA-A) disagrees with the reviewer's interpretation
+- READINESS rejects a test spec but the rejection reason is debatable -- the spec author (QA-A) disagrees with the reviewer's interpretation
 ### Step 1: Initiator sends structured question

package/pipeline/docs/lead-lifecycle.md CHANGED Viewed

@@ -8,7 +8,7 @@
 ### Persistent vs Per-Story Agents
-The lead is the **only persistent agent** in the pipeline. It carries `pipeline-state.json` and backlog position forward across stories. All other agents (REQS, UXA, QA-A, BEND, FEND, CRITIC, QA-B, JUDGE-G1, JUDGE-G2, Knowledge) are **per-story** -- spawned fresh when a story starts, torn down when it ships.
+The lead is the **only persistent agent** in the pipeline. It carries `pipeline-state.json` and backlog position forward across stories. All other agents (REQS, UXA, QA-A, BEND, FEND, CRITIC, QA-B, READINESS, JUDGE, Knowledge) are **per-story** -- spawned fresh when a story starts, torn down when it ships.
 The Knowledge Agent's value is in its persistent data sources (ChromaDB collections and curated knowledge files on disk), not its conversation history. A fresh spawn reads from the same store.
@@ -79,7 +79,7 @@ The lead watches the board, not the work. It reads the shared task list to track
 - Read handoff documents to decide what happens next (task dependencies handle sequencing)
 - Relay messages between agents (they use inbox directly)
-- Judge output quality (JUDGE gates and CRITIC handle this -- except on G2 rejection)
+- Judge output quality (READINESS, JUDGE, and CRITIC handle this -- except on JUDGE rejection)
 - Customize templates per spawn (agents read their role from the manifest)
 ### Stall Detection
@@ -111,7 +111,7 @@ All code committed and pushed to the branch specified by the user. The pipeline
 ### Ship Sequence
-1. JUDGE G2 approves
+1. JUDGE approves
 2. Code committed and pushed to user-specified branch
 3. All agent outputs persist in the story folder (handoff files, reviews, bug reports, execution reports, PMCP evidence)
 4. Lead writes `story-report.md`: task completion times, rejection cycles, cost metrics
@@ -133,16 +133,16 @@ The lead maintains YAML frontmatter in `story-report.md` during story execution.
 | Rejection Source | What Failed | Re-entry Action |
 |-----------------|-------------|-----------------|
-| **JUDGE G1 Pass 1** | REQS brief inadequate | Re-queue REQS to revise brief. UXA and QA-A do not proceed. |
-| **JUDGE G1 Pass 1** | UXA spec incomplete (fails Developer Trust Test) | Re-queue UXA to revise spec. QA-A does not proceed. |
-| **JUDGE G1 Pass 1** | Test specs insufficient | Re-queue QA-A to revise specs. |
+| **READINESS** | REQS brief inadequate | Re-queue REQS to revise brief. UXA and QA-A do not proceed. |
+| **READINESS** | UXA spec incomplete (fails Developer Trust Test) | Re-queue UXA to revise spec. QA-A does not proceed. |
+| **READINESS** | Test specs insufficient | Re-queue QA-A to revise specs. |
 | **CRITIC** | BEND code rejected | CRITIC sends rejection to BEND via inbox. BEND fixes. CRITIC re-reviews. |
 | **CRITIC** | FEND code rejected | CRITIC sends rejection to FEND via inbox. FEND fixes. CRITIC re-reviews. |
 | **QA-B** | P1-P3 bugs found | QA-B routes bugs to BEND/FEND via inbox. Devs fix. QA-B re-runs. |
-| **JUDGE G1 Pass 2** | Bug priority reclassified (P4 -> P1-P3) | QA-B routes reclassified bug to devs. Devs fix. QA-B re-runs. |
-| **JUDGE G2** | Final gate rejection | **Lead takes ownership.** Lead reads the full rejection, diagnoses root cause, determines which agents need to act, and orchestrates the fix. G2 rejections are non-routine -- they mean something slipped through the entire chain. |
+| **JUDGE** | Bug priority reclassified (P4 -> P1-P3) | QA-B routes reclassified bug to devs. Devs fix. QA-B re-runs. |
+| **JUDGE** | Final gate rejection | **Lead takes ownership.** Lead reads the full rejection, diagnoses root cause, determines which agents need to act, and orchestrates the fix. JUDGE rejections are non-routine -- they mean something slipped through the entire chain. |
-**Note:** G1 Pass 1 reviews REQS -> UXA -> QA sequentially, stopping on first failure. Only one rejection fires per pass -- downstream specs are not reviewed if an upstream spec fails.
+**Note:** READINESS reviews REQS -> UXA -> QA sequentially, stopping on first failure. Only one rejection fires per review -- downstream specs are not reviewed if an upstream spec fails.
 ---

package/pipeline/docs/pipeline-overview.md CHANGED Viewed

@@ -8,7 +8,7 @@
 v3 is a story execution pipeline built on Claude Code agent teams. It takes a user story with acceptance criteria and produces committed, tested code plus a full artifact trail -- requirements briefs, test specs, code reviews, execution reports, and traceability matrices.
-The pipeline is orchestrated by a persistent Lead agent that spawns a fresh team of specialist agents per story. Each agent reads structured handoff documents from upstream agents, does its work, and writes its own handoff document for downstream consumers. Quality gates (JUDGE agents) enforce pass/fail checkpoints before work proceeds.
+The pipeline is orchestrated by a persistent Lead agent that spawns a fresh team of specialist agents per story. Each agent reads structured handoff documents from upstream agents, does its work, and writes its own handoff document for downstream consumers. Quality gates (READINESS and JUDGE) enforce pass/fail checkpoints before work proceeds.
 ---
@@ -41,8 +41,9 @@ For each story, the pipeline writes to `stories/{story-id}/output/`:
 | `execution-report.md` | QA-B | Test execution results |
 | `bugs.md` | QA-B | Filed bugs with priorities |
 | `traceability-matrix.md` | QA-B | AC-to-test coverage map |
-| `judge-g1-review.md` | JUDGE-G1 | Spec quality gate results |
-| `judge-g2-decision.md` | JUDGE-G2 | Final ship/reject decision |
+| `readiness-review.md` | READINESS | Spec quality gate results |
+| `judge-review.md` | JUDGE | Bug review findings |
+| `judge-decision.md` | JUDGE | Final ship/reject decision |
 | `pmcp-evidence.md` | PMCP | Visual validation screenshots |
 | `story-report.md` | Lead | Story completion summary |
@@ -57,7 +58,7 @@ The Lead agent reads `agents-manifest.yaml` and `pipeline-config.yaml` at startu
 ### Pipeline Flow
 ```
-REQS → UXA → QA-A → JUDGE-G1 → BEND+FEND → CRITIC → QA-B → JUDGE-G1 → JUDGE-G2 → SHIP
+REQS → UXA → QA-A → READINESS → BEND+FEND → CRITIC → QA-B → JUDGE → SHIP
 ```
 1. **REQS** reads the story input and produces `reqs-brief.md` -- the implementation brief that all downstream agents treat as the source of truth for business requirements.
@@ -66,7 +67,7 @@ REQS → UXA → QA-A → JUDGE-G1 → BEND+FEND → CRITIC → QA-B → JUDGE-G
 3. **QA-A** writes behavioral test specifications and a visual validation checklist before any code is written. Tests are specified, not implemented.
-4. **JUDGE-G1 Pass 1** validates the spec chain: reqs brief, UXA spec, and QA test spec. Stops on first failure -- upstream agent must rework before the pipeline proceeds.
+4. **READINESS** validates the spec chain: reqs brief, UXA spec, and QA test spec. Stops on first failure -- upstream agent must rework before the pipeline proceeds.
 5. **BEND + FEND** implement production code and tests in parallel. BEND handles backend; FEND handles frontend (skipped for backend-only projects). Both read the reqs brief and test spec.
@@ -74,11 +75,9 @@ REQS → UXA → QA-A → JUDGE-G1 → BEND+FEND → CRITIC → QA-B → JUDGE-G
 7. **QA-B** executes the full test suite against real infrastructure, cross-references results against the QA-A test spec, files bugs, and builds the traceability matrix.
-8. **JUDGE-G1 Pass 2** reviews any filed bugs and validates their priorities.
+8. **JUDGE** reviews any filed bugs, validates their priorities, and makes the final SHIP or REJECT decision based on evidence: test results, traceability matrix, bug status, and PMCP visual evidence (if applicable). Evidence over assertion -- every upstream claim is independently verified.
-9. **JUDGE-G2** makes the final SHIP or REJECT decision based on evidence: test results, traceability matrix, bug status, and PMCP visual evidence (if applicable). Evidence over assertion -- every upstream claim is independently verified.
-10. **SHIP** -- Lead commits code, writes `story-report.md`, tears down the team, and picks the next story.
+9. **SHIP** -- Lead commits code, writes `story-report.md`, tears down the team, and picks the next story.
 ### Rejection Loops
@@ -101,7 +100,7 @@ See `.valent-pipeline/docs/communication-standard.md` for the full specification
 - **Distilled communication** -- All handoff artifacts are structured for machine consumption: YAML frontmatter, orchestrator summary, facts-only prose. No filler.
 - **Behavioral test specs** -- QA-A writes test specifications before code exists. Developers implement to spec; QA-B verifies against spec.
 - **Multi-pass code review** -- CRITIC runs three independent review passes (blind hunt, edge-case hunt, acceptance audit) then triages all findings by severity.
-- **Quality gates** -- JUDGE-G1 validates specs; JUDGE-G2 validates execution evidence. Both are pass/fail checkpoints that block the pipeline.
+- **Quality gates** -- READINESS validates specs; JUDGE validates execution evidence and makes the ship decision. Both are pass/fail checkpoints that block the pipeline.
 - **Correction directives** -- Learned rules from past stories (maintained by the Retrospective agent) that are injected into agent prompts to prevent recurring mistakes.
 ---
@@ -156,8 +155,9 @@ stories/
       execution-report.md             # QA-B output
       bugs.md                         # QA-B output
       traceability-matrix.md          # QA-B output
-      judge-g1-review.md              # JUDGE-G1 output
-      judge-g2-decision.md            # JUDGE-G2 output
+      readiness-review.md             # READINESS output
+      judge-review.md                 # JUDGE output
+      judge-decision.md               # JUDGE output
       pmcp-evidence.md                # PMCP output
       story-report.md                 # Lead output
       decisions.md                    # Design Council decisions

package/pipeline/docs/pipeline-state-schema.md CHANGED Viewed

@@ -53,7 +53,7 @@ Defines the JSON schema for `pipeline-state.json`, the Lead agent's persistent s
 |-------|------|-------------|---------|------------|
 | `id` | string | Story identifier (e.g., `STORY-042`) | Lead, all teammates (via story context) | Lead (on story start) |
 | `status` | enum | `in_progress`, `completed`, `cancelled`, `blocked-on-user` | Lead (for state machine transitions) | Lead (on phase transitions) |
-| `phase` | enum | `kick-off` (spawning agents), `monitoring` (execution in progress), `ship-teardown` (JUDGE-G2 passed, cleanup) | Lead (to determine allowed actions) | Lead (on phase transitions) |
+| `phase` | enum | `kick-off` (spawning agents), `monitoring` (execution in progress), `ship-teardown` (JUDGE passed, cleanup) | Lead (to determine allowed actions) | Lead (on phase transitions) |
 | `active_teammates` | string[] | Names of currently spawned teammates | Lead (for health checks, stall detection) | Lead (on spawn/termination) |
 | `task_graph_snapshot` | string | Pointer to the shared task list that tracks agent dependencies | Lead (on crash recovery to rebuild state) | Lead (on task graph changes) |
 | `started_at` | ISO-8601 | Timestamp when story execution began | Lead (for duration tracking in story-report) | Lead (on story start) |
@@ -64,7 +64,7 @@ Defines the JSON schema for `pipeline-state.json`, the Lead agent's persistent s
 | Field | Type | Description | Read by | Written by |
 |-------|------|-------------|---------|------------|
 | `id` | string | Story identifier | Lead (for scheduling) | Lead (on user submission) |
-| `status` | enum | `pending`, `in_progress`, `completed`, `blocked`, `blocked-on-user`, `cancelled` | Lead (to select next story) | Lead (on status changes) |
+| `status` | enum | Granular phase statuses: `pending`, `requirements-spec`, `ux-spec`, `test-case-development`, `readiness-review`, `groomed`, `sizing`, `sprint-planned`, `development`, `code-review`, `qa-validation`, `final-review`, `shipped`, `blocked`, `blocked-on-user`, `cancelled` | Lead (to select next story) | Lead (on status changes) |
 | `depends_on` | string[] | Story IDs that must complete before this story can start | Lead (for dependency resolution) | Lead (on user submission) |
 | `blocked_reason` | string | Human-readable reason for blocked status; empty or absent when not blocked | Lead (for user reporting) | Lead (when blocking occurs) |
@@ -92,6 +92,36 @@ The Lead creates `pipeline-state.json` on first pipeline initialization with `pi
 ---
+## Sprint State (Epic/Project Mode)
+When `is_sprint_mode` is true, `pipeline-state.json` includes a `current_sprint` object:
+| Field | Type | Description | Read by | Written by |
+|-------|------|-------------|---------|------------|
+| `id` | string | Sprint identifier (e.g., `kanban-mvp-sprint-3`) | Lead | Lead (on sprint init) |
+| `number` | number | Sequential sprint number within this run | Lead | Lead (on sprint init) |
+| `phase` | enum | `grooming`, `sizing`, `planning`, `executing`, `reviewing`, `completed` | Lead (for crash recovery) | Lead (on sprint phase transitions) |
+| `stories_planned` | string[] | Story IDs packed into this sprint | Lead | Lead (on sprint plan) |
+| `stories_completed` | string[] | Story IDs shipped in this sprint | Lead | Lead (on story ship) |
+| `velocity` | number | Current velocity in story points | Lead, Retrospective | Lead (on sprint init) |
+| `elapsed_execution_minutes` | number | Cumulative execution time (grooming excluded) | Lead (for budget check) | Lead (on story completion) |
+| `points_planned` | number | Total story points planned | Lead | Lead (on sprint plan) |
+| `points_completed` | number | Total story points shipped | Lead, Retrospective | Lead (on story ship) |
+### Sprint Crash Recovery
+If the Lead restarts and finds `current_sprint.phase` is not `completed`:
+1. Read `current_sprint.phase` to determine the sprint sub-phase.
+2. Resume from the appropriate orchestration step file:
+   - `grooming` → resume `sprint-groom.md` from last un-groomed story
+   - `sizing` → resume `sprint-size.md` from last un-sized story
+   - `planning` → re-run `sprint-plan.md` (idempotent)
+   - `executing` → resume `sprint-execute.md` from current story
+   - `reviewing` → re-run `sprint-review.md` (idempotent)
+---
 ## Crash Recovery Protocol
 If the Lead restarts and finds `current_story.status == "in_progress"`:

package/pipeline/docs/task-graph.md CHANGED Viewed

@@ -32,7 +32,7 @@ CRITIC reads `git-diff` (produced by BEND and FEND's code changes), so CRITIC is
 The full default pipeline sequence for a `fullstack-web` project:
 ```
-REQS -> UXA -> QA-A -> JUDGE-G1-Pass1 -> BEND + FEND [parallel] -> CRITIC -> QA-B -> JUDGE-G1-Pass2 -> JUDGE-G2
+REQS -> UXA -> QA-A -> READINESS -> BEND + FEND [parallel] -> CRITIC -> QA-B -> JUDGE
 ```
 Expanded with dependency notation:
@@ -42,15 +42,14 @@ Expanded with dependency notation:
 | REQS | (none) | First task, starts immediately |
 | UXA | REQS | Reads `reqs-brief.md` |
 | QA-A | UXA | Reads `reqs-brief.md`, `uxa-spec.md` |
-| JUDGE-G1 Pass 1 | QA-A | Reviews `reqs-brief.md`, `uxa-spec.md`, `qa-test-spec.md` sequentially |
-| BEND | JUDGE-G1 Pass 1 | Reads `reqs-brief.md`, `qa-test-spec.md` |
-| FEND | JUDGE-G1 Pass 1 | Reads `reqs-brief.md`, `uxa-spec.md`, `qa-test-spec.md` |
+| READINESS | QA-A | Reviews `reqs-brief.md`, `uxa-spec.md`, `qa-test-spec.md` sequentially |
+| BEND | READINESS | Reads `reqs-brief.md`, `qa-test-spec.md` |
+| FEND | READINESS | Reads `reqs-brief.md`, `uxa-spec.md`, `qa-test-spec.md` |
 | CRITIC | BEND, FEND | Reads `git-diff` (produced by both), `reqs-brief.md`, `qa-test-spec.md` |
 | QA-B | CRITIC | Reads `qa-test-spec.md`, `critic-review.md` |
-| JUDGE-G1 Pass 2 | QA-B | Reviews `bugs.md`, `execution-report.md` |
-| JUDGE-G2 | JUDGE-G1 Pass 2 | Reviews `execution-report.md`, `traceability-matrix.md`, `pmcp-evidence.md`, `bugs.md`, `judge-g1-review.md` |
+| JUDGE | QA-B | Reviews `execution-report.md`, `traceability-matrix.md`, `pmcp-evidence.md`, `bugs.md`, `qa-test-spec.md` |
-BEND and FEND run in **parallel** -- they are both blocked only by JUDGE-G1 Pass 1 and have no dependency on each other. CRITIC waits for both to complete.
+BEND and FEND run in **parallel** -- they are both blocked only by READINESS and have no dependency on each other. CRITIC waits for both to complete.
 ---
@@ -112,11 +111,11 @@ Skipped agents: UXA, FEND, PMCP
 Modified chain:
 ```
-REQS -> QA-A -> JUDGE-G1-Pass1 -> BEND -> CRITIC -> QA-B -> JUDGE-G1-Pass2 -> JUDGE-G2
+REQS -> QA-A -> READINESS -> BEND -> CRITIC -> QA-B -> JUDGE
 ```
 - QA-A is `blockedBy` REQS directly (UXA is skipped)
-- JUDGE-G1 Pass 1 skips UXA validation
+- READINESS skips UXA validation
 - CRITIC is `blockedBy` BEND only (no FEND)
 - Visual validation checklist is not produced; PMCP is not spawned
@@ -126,7 +125,7 @@ Skipped agents: BEND
 Modified chain:
 ```
-REQS -> UXA -> QA-A -> JUDGE-G1-Pass1 -> FEND -> CRITIC -> QA-B -> JUDGE-G1-Pass2 -> JUDGE-G2
+REQS -> UXA -> QA-A -> READINESS -> FEND -> CRITIC -> QA-B -> JUDGE
 ```
 - CRITIC is `blockedBy` FEND only (no BEND)
@@ -150,17 +149,17 @@ When a gate rejects, the lead modifies the graph to re-queue the rejected task.
 ### Rejection Scenarios
-**JUDGE G1 Pass 1 rejects REQS:**
+**READINESS rejects REQS:**
 - REQS task -> `pending`
 - UXA, QA-A, and everything downstream -> `pending`
 - REQS revises, re-completes. Pipeline resumes from UXA.
-**JUDGE G1 Pass 1 rejects UXA:**
+**READINESS rejects UXA:**
 - UXA task -> `pending`
 - QA-A and everything downstream -> `pending`
 - UXA revises, re-completes. Pipeline resumes from QA-A.
-**JUDGE G1 Pass 1 rejects QA-A:**
+**READINESS rejects QA-A:**
 - QA-A task -> `pending`
 - Everything downstream -> `pending`
 - QA-A revises, re-completes. Pipeline resumes from BEND/FEND.
@@ -176,9 +175,9 @@ When a gate rejects, the lead modifies the graph to re-queue the rejected task.
 - Devs fix without formal task re-queue (inbox-driven fix cycle)
 - QA-B re-runs tests after fixes
-**JUDGE G1 Pass 2 reclassifies bugs:**
+**JUDGE reclassifies bugs:**
 - Similar to QA-B bug routing -- devs fix, QA-B re-runs
-**JUDGE G2 rejects:**
+**JUDGE rejects:**
 - Lead takes ownership and determines which agents need to re-execute
 - Lead manually re-queues the appropriate tasks based on root cause diagnosis

package/pipeline/docs/template-skeleton.md CHANGED Viewed

@@ -9,7 +9,7 @@ Universal structure that all 16 handoff document templates follow. Consult this
 Templates define the **output format** of every agent-to-agent handoff artifact. They exist to:
 - **Decouple format from agent logic.** Adding a section to a template does not require editing the producing agent's prompt.
-- **Give consuming agents a predictable schema.** JUDGE G1 reads `qa-test-spec.template.md` to know exactly which sections it reviews.
+- **Give consuming agents a predictable schema.** READINESS reads `qa-test-spec.template.md` to know exactly which sections it reviews.
 - **Enforce the distilled communication standard structurally.** YAML frontmatter, orchestrator summary blocks, and machine-consumption formatting are baked into the template itself, not left to agent discretion.
 - **Serve as the handoff contract.** If a template marks a section `-- required` and the agent's output omits it, CRITIC or JUDGE flags the gap by comparing output against template.
@@ -85,7 +85,7 @@ Immediately after frontmatter. This is the TL;DR the lead scans without reading
 **Field notes:**
 - `Verdict`: `pass` means the agent completed successfully with no issues. `fail` means the agent could not complete or output failed self-checks. `needs-review` means output is complete but contains flagged concerns.
-- `State transition`: References pipeline phases (e.g., `reqs -> uxa`, `dev -> critic-review`, `qa-execution -> judge-g2`).
+- `State transition`: References pipeline phases (e.g., `reqs -> uxa`, `dev -> critic-review`, `qa-execution -> judge`).
 - `Flags`: Terse alerts. Examples: `"AC-3 ambiguous -- interpreted as X"`, `"no E2E for file upload -- mock only"`, `"none"`.
 ### 2.4 Required Sections
@@ -263,19 +263,20 @@ The 16 templates in `.valent-pipeline/templates/`, mapped to their producing age
 | Template | Producing Agent | Primary Consumers |
 |----------|----------------|-------------------|
-| `reqs-brief.template.md` | REQS | UXA, QA-A, BEND, FEND, CRITIC, JUDGE-G1 |
-| `uxa-spec.template.md` | UXA | QA-A, FEND, JUDGE-G1 |
-| `qa-test-spec.template.md` | QA-A | JUDGE-G1, BEND, FEND, CRITIC, QA-B |
+| `reqs-brief.template.md` | REQS | UXA, QA-A, BEND, FEND, CRITIC, READINESS |
+| `uxa-spec.template.md` | UXA | QA-A, FEND, READINESS |
+| `qa-test-spec.template.md` | QA-A | READINESS, BEND, FEND, CRITIC, QA-B |
 | `visual-validation-checklist.template.md` | QA-A | PMCP |
 | `bend-handoff.template.md` | BEND | CRITIC, QA-B |
 | `fend-handoff.template.md` | FEND | CRITIC, QA-B |
 | `critic-review.template.md` | CRITIC | BEND, FEND, QA-B |
-| `bugs.template.md` | QA-B | BEND, FEND, JUDGE-G1 (Pass 2) |
-| `execution-report.template.md` | QA-B | JUDGE-G2 |
-| `traceability-matrix.template.md` | QA-B | JUDGE-G2 |
-| `judge-g1-review.template.md` | JUDGE-G1 | Lead, REQS/UXA/QA-A (on rejection) |
-| `judge-g2-decision.template.md` | JUDGE-G2 | Lead |
+| `bugs.template.md` | QA-B | BEND, FEND, JUDGE (bug review) |
+| `execution-report.template.md` | QA-B | JUDGE |
+| `traceability-matrix.template.md` | QA-B | JUDGE |
+| `readiness-review.template.md` | READINESS | Lead, REQS/UXA/QA-A (on rejection) |
+| `judge-review.template.md` | JUDGE | Lead, BEND/FEND (on rejection) |
+| `judge-decision.template.md` | JUDGE | Lead |
 | `story-report.template.md` | Lead | User |
-| `pmcp-evidence.template.md` | PMCP | JUDGE-G2 |
+| `pmcp-evidence.template.md` | PMCP | JUDGE |
 | `retrospective.template.md` | Retrospective Agent | Lead, Knowledge Agent |
 | `embed-instructions.template.md` | Lead | Embed Agent |

package/pipeline/prompts/bend.md CHANGED Viewed

@@ -9,7 +9,7 @@ Read `.valent-pipeline/steps/common/agent-protocol.md` for Communication Standar
 You are spawned at story kick-off but do NOT begin work immediately.
-- **Wait for:** `[JUDGE-G1-APPROVAL]` (Pass 1) from JUDGE-G1
+- **Wait for:** `[READINESS-APPROVAL]` from READINESS
 - **On completion:** Send `[HANDOFF]` to CRITIC. CC Lead. If FEND is active, CRITIC waits for both -- send your handoff; CRITIC starts when it has both.
 - **On rejection received (from CRITIC):** Read rejection at critic-review.md. Fix code. Re-send `[HANDOFF]` to CRITIC.
 - **On bug received (from QA-B):** Fix bug. Notify QA-B when fixed.

package/pipeline/prompts/fend.md CHANGED Viewed

@@ -9,7 +9,7 @@ Read `.valent-pipeline/steps/common/agent-protocol.md` for Communication Standar
 You are spawned at story kick-off but do NOT begin work immediately.
-- **Wait for:** `[JUDGE-G1-APPROVAL]` (Pass 1) from JUDGE-G1
+- **Wait for:** `[READINESS-APPROVAL]` from READINESS
 - **On completion:** Send `[HANDOFF]` to CRITIC. CC Lead. CRITIC waits for both BEND and FEND -- send your handoff; CRITIC starts when it has both.
 - **On rejection received (from CRITIC):** Read rejection at critic-review.md. Fix code. Re-send `[HANDOFF]` to CRITIC.
 - **On bug received (from QA-B):** Fix bug. Notify QA-B when fixed.
@@ -49,8 +49,9 @@ These are non-negotiable. CRITIC and QA-B enforce them.
 - **Explicit assertions in test bodies** -- never hide assertions in helpers. Every test body must contain at least one visible `expect`/`assert`.
 - **Parallel-safe** -- no shared mutable state between tests. Must run cleanly with `--workers=4`.
 - **API-first setup** -- never use UI for test precondition setup. Seed via API calls or direct database insertion.
-- **Network-first pattern** -- intercept network routes BEFORE navigation to prevent race conditions.
-- **Zero mocks** -- tests hit real infrastructure. No mocking databases, APIs, or external services.
+- **Network-first setup** -- when using Playwright route handlers (for error simulation or offline testing), register them BEFORE `page.goto()` to prevent race conditions. Route handlers are acceptable for simulating error states (500s, timeouts, network failures) but MUST NOT be used to mock happy-path API responses.
+- **Real API for happy paths** -- happy-path E2E tests MUST hit the real running API server. No `route.fulfill()` with canned success responses for the primary AC flow. Unit tests MAY mock fetch for isolated component rendering logic, but every mocked unit test for an API-calling AC must be paired with a real-API E2E test for the same AC.
+- **API infrastructure** -- before running E2E tests, ensure the API server is running. If BEND is skipped, the existing API still needs to be live for real integration testing. Run `docker compose up -d db api` and verify `GET /api/health` returns 200 before executing E2E tests.
 ## UX-Specific Standards

package/pipeline/prompts/judge.md ADDED Viewed

@@ -0,0 +1,64 @@
+# JUDGE
+<!-- Prompt version: 1.0 | Model: Sonnet | Lifecycle: per-story -->
+You are **JUDGE**, the final quality gate. You review bug priorities from QA-B's execution, then make the binary SHIP or REJECT decision based on evidence, not trust. Every claim from upstream agents must be independently verified against artifacts.
+Your mandate: **evidence over assertion**. If an agent says "all tests pass," you verify against the execution report. If the traceability matrix says "100% coverage," you cross-reference against the test spec. Trust nothing; verify everything.
+Read `.valent-pipeline/steps/common/agent-protocol.md` for Communication Standard, Context Discipline, Inbox Protocol, Design Council Protocol, Knowledge-First Principle, Correction Directives, and YAML Frontmatter.
+## Trigger Protocol
+You are spawned when CRITIC starts reviewing (wave 3) but do NOT begin work immediately.
+- **Wait for:** `[HANDOFF]` from QA-B. Do NOT begin if CRITIC task is still `in_progress` (rejection/bug cycle ongoing).
+- **On bug review approval (no reclassifications to P1-P3):** Proceed directly to evidence review. No external message needed — this is an internal transition.
+- **On bug reclassification (P4 escalated to P1-P3):** Send `[JUDGE-RECLASS]` to the responsible dev (BEND or FEND per root cause) AND to Lead. Do NOT proceed to evidence review until bugs are fixed and QA-B re-runs.
+- **On SHIP verdict:** Send `[JUDGE-SHIP]` to Lead. Mark task completed. Lead owns ship/teardown.
+- **On REJECT verdict:** Send `[JUDGE-REJECT]` to Lead. Mark task completed. Lead owns JUDGE rejection routing — this is non-routine.
+- **Escalate to:** Lead — for `[BLOCKER]` or any issue you cannot resolve.
+## Output
+Write outputs to `{story_output_dir}/`:
+- `judge-review.md` using the template at `.valent-pipeline/templates/judge-review.template.md`
+- `judge-decision.md` using the template at `.valent-pipeline/templates/judge-decision.template.md`
+- `story-report.md` using the template at `.valent-pipeline/templates/story-report.template.md` (SHIP verdict only)
+## Inputs
+- `{story_output_dir}/execution-report.md` — REQUIRED
+- `{story_output_dir}/traceability-matrix.md` — REQUIRED
+- `{story_output_dir}/pmcp-evidence.md` — REQUIRED if UI story; N/A for backend-only
+- `{story_output_dir}/bugs.md` — REQUIRED
+- `{story_output_dir}/qa-test-spec.md` — reference for assertion cross-check and AC mapping
+## Context Variables
+- `{story_id}`, `{story_output_dir}`, `{correction_directives}`
+- `{tech_stack.test_framework_unit}`, `{tech_stack.test_framework_e2e}`
+- `{project_type}` — fullstack-web | backend-only | frontend-only
+## Step Sequence
+| Step | File | Condition |
+|------|------|-----------|
+| Bug Review (Steps 1-4) | `.valent-pipeline/steps/judge/bug-review.md` | Always |
+| Evidence Review (Steps 5-12) | `.valent-pipeline/steps/judge/evidence-review.md` | After bug review approves |
+| Ship Decision (Steps 13-14b) | `.valent-pipeline/steps/judge/ship-decision.md` | Always |
+## Verdict Principles
+1. **No partial ships.** The decision is SHIP or REJECT. Shipping with known issues is permitted only when every known issue is P4.
+2. **Evidence over assertion.** If an agent claims something but the artifact does not support the claim, the artifact is the truth.
+3. **Socratic doubt is mandatory.** Do not skip Socratic validation even if all checks pass.
+4. **JUDGE rejection is an escalation.** Your rejection report must diagnose how the issue slipped through upstream gates.
+5. **Confidence level matters.** If uncertain about evidence, mark confidence as low or medium and explain what would raise it.
+6. **Priority accuracy matters.** Do not rubber-stamp QA-B priority assignments. A P4 that should be P3 is a risk that could slip to production.
+## Error Handling
+- If a required input file is missing or malformed: set blocker, message lead with `[BLOCKER]`, STOP.
+- If crash recovery detects partial output: resume from last completed step per frontmatter.
+- If you receive a correction directive mid-review: apply it, re-evaluate affected checks, update frontmatter.