valent-pipeline 0.1.16 → 0.1.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/package.json +1 -1
  2. package/pipeline/agents-manifest.yaml +13 -15
  3. package/pipeline/docs/agent-reference.md +3 -3
  4. package/pipeline/docs/communication-standard.md +2 -2
  5. package/pipeline/docs/lead-lifecycle.md +9 -9
  6. package/pipeline/docs/pipeline-overview.md +12 -12
  7. package/pipeline/docs/pipeline-state-schema.md +32 -2
  8. package/pipeline/docs/task-graph.md +14 -15
  9. package/pipeline/docs/template-skeleton.md +12 -11
  10. package/pipeline/prompts/bend.md +1 -1
  11. package/pipeline/prompts/fend.md +4 -3
  12. package/pipeline/prompts/judge.md +64 -0
  13. package/pipeline/prompts/lead.md +195 -35
  14. package/pipeline/prompts/qa-a.md +2 -2
  15. package/pipeline/prompts/qa-b.md +2 -2
  16. package/pipeline/prompts/readiness.md +70 -0
  17. package/pipeline/prompts/reqs.md +1 -1
  18. package/pipeline/prompts/retrospective.md +12 -3
  19. package/pipeline/prompts/uxa.md +1 -1
  20. package/pipeline/scripts/embed-sqlite.ts +4 -3
  21. package/pipeline/steps/bend/estimate.md +50 -0
  22. package/pipeline/steps/critic/test-review.md +10 -1
  23. package/pipeline/steps/fend/estimate.md +51 -0
  24. package/pipeline/steps/{judge-g1/pass2-review.md → judge/bug-review.md} +11 -12
  25. package/pipeline/steps/{judge-g2 → judge}/evidence-review.md +20 -18
  26. package/pipeline/steps/judge/ship-decision.md +39 -0
  27. package/pipeline/steps/orchestration/adopt-lead-and-create-team.md +23 -3
  28. package/pipeline/steps/orchestration/sprint-execute.md +57 -0
  29. package/pipeline/steps/orchestration/sprint-groom.md +61 -0
  30. package/pipeline/steps/orchestration/sprint-init.md +64 -0
  31. package/pipeline/steps/orchestration/sprint-plan.md +87 -0
  32. package/pipeline/steps/orchestration/sprint-review.md +70 -0
  33. package/pipeline/steps/orchestration/sprint-size.md +35 -0
  34. package/pipeline/steps/orchestration/update-backlog-status.md +2 -2
  35. package/pipeline/steps/qa-a/ui.md +57 -0
  36. package/pipeline/steps/qa-b/ui.md +58 -0
  37. package/pipeline/steps/qa-b/write-report.md +4 -5
  38. package/pipeline/steps/readiness/sprint-review.md +46 -0
  39. package/pipeline/steps/{judge-g1/pass1-review.md → readiness/standalone-review.md} +13 -10
  40. package/pipeline/steps/retrospective/calibration.md +52 -0
  41. package/pipeline/steps/retrospective/directives.md +23 -0
  42. package/pipeline/steps/retrospective/report.md +14 -0
  43. package/pipeline/task-graphs/backend-api.yaml +32 -39
  44. package/pipeline/task-graphs/data-pipeline.yaml +32 -39
  45. package/pipeline/task-graphs/document-generation.yaml +32 -39
  46. package/pipeline/task-graphs/frontend-only.yaml +31 -38
  47. package/pipeline/task-graphs/fullstack-web.yaml +34 -41
  48. package/pipeline/task-graphs/library.yaml +32 -39
  49. package/pipeline/task-graphs/mcp-server.yaml +32 -39
  50. package/pipeline/templates/bugs.template.md +4 -4
  51. package/pipeline/templates/estimation.template.md +30 -0
  52. package/pipeline/templates/execution-report.template.md +1 -1
  53. package/pipeline/templates/{judge-g2-decision.template.md → judge-decision.template.md} +7 -7
  54. package/pipeline/templates/judge-review.template.md +49 -0
  55. package/pipeline/templates/pmcp-evidence.template.md +2 -2
  56. package/pipeline/templates/qa-test-spec.template.md +1 -1
  57. package/pipeline/templates/{judge-g1-review.template.md → readiness-review.template.md} +28 -49
  58. package/pipeline/templates/reqs-brief.template.md +1 -1
  59. package/pipeline/templates/sprint-plan.template.md +35 -0
  60. package/pipeline/templates/sprint-status.template.yaml +53 -0
  61. package/pipeline/templates/story-report.template.md +5 -5
  62. package/pipeline/templates/traceability-matrix.template.md +2 -2
  63. package/pipeline/templates/uxa-spec.template.md +1 -1
  64. package/pipeline/templates/visual-validation-checklist.template.md +2 -2
  65. package/skills/valent-configure/SKILL.md +1 -1
  66. package/skills/valent-help/SKILL.md +3 -3
  67. package/skills/valent-run-epic/SKILL.md +3 -1
  68. package/skills/valent-run-retrospective/SKILL.md +2 -2
  69. package/skills/valent-run-story/SKILL.md +1 -1
  70. package/src/commands/db-init.js +25 -1
  71. package/src/commands/db-rebuild.js +31 -3
  72. package/src/lib/config-schema.js +37 -1
  73. package/pipeline/prompts/judge-g1.md +0 -65
  74. package/pipeline/prompts/judge-g2.md +0 -61
  75. package/pipeline/steps/judge-g2/ship-decision.md +0 -34
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "valent-pipeline",
3
- "version": "0.1.16",
3
+ "version": "0.1.17",
4
4
  "description": "v3 multi-agent AI pipeline for software development lifecycle",
5
5
  "type": "module",
6
6
  "bin": {
@@ -62,17 +62,15 @@ agents:
62
62
  reads_from: [reqs-brief.md, uxa-spec.md]
63
63
  writes_to: [qa-test-spec.md, visual-validation-checklist.md]
64
64
 
65
- judge_g1:
66
- name: JUDGE-G1
65
+ readiness:
66
+ name: READINESS
67
67
  model: sonnet
68
68
  lifecycle: per-story
69
- role: "Quality gate — validates reqs, UXA spec, test specs (Pass 1) and bug priorities (Pass 2)"
70
- prompt_template: .valent-pipeline/prompts/judge-g1.md
71
- passes:
72
- pass1_review_order: [reqs-validation, uxa-validation, qa-spec-validation] # sequential, stop on first failure
73
- pass2: bug-review
74
- reads_from: [reqs-brief.md, uxa-spec.md, qa-test-spec.md, bugs.md, execution-report.md]
75
- writes_to: [judge-g1-review.md]
69
+ role: "Spec quality gate — validates reqs, UXA spec, and test specs are implementation-ready"
70
+ prompt_template: .valent-pipeline/prompts/readiness.md
71
+ review_order: [reqs-validation, uxa-validation, qa-spec-validation] # sequential, stop on first failure
72
+ reads_from: [reqs-brief.md, uxa-spec.md, qa-test-spec.md]
73
+ writes_to: [readiness-review.md]
76
74
 
77
75
  bend:
78
76
  name: BEND
@@ -113,14 +111,14 @@ agents:
113
111
  writes_to: [execution-report.md, bugs.md, traceability-matrix.md]
114
112
  can_request_spawn: [pmcp] # asks lead to spawn PMCP
115
113
 
116
- judge_g2:
117
- name: JUDGE-G2
114
+ judge:
115
+ name: JUDGE
118
116
  model: sonnet
119
117
  lifecycle: per-story
120
- role: "Final ship gate — evidence-based approval or rejection"
121
- prompt_template: .valent-pipeline/prompts/judge-g2.md
122
- reads_from: [execution-report.md, traceability-matrix.md, pmcp-evidence.md, bugs.md, judge-g1-review.md, qa-test-spec.md] # critic-review.md intentionally excluded — G2 validates test/execution evidence, not code review; qa-test-spec.md used as reference for assertion cross-check
123
- writes_to: [judge-g2-decision.md, story-report.md]
118
+ role: "Final quality gate — bug priority review + evidence-based ship decision"
119
+ prompt_template: .valent-pipeline/prompts/judge.md
120
+ reads_from: [execution-report.md, traceability-matrix.md, pmcp-evidence.md, bugs.md, qa-test-spec.md] # critic-review.md intentionally excluded — JUDGE validates test/execution evidence, not code review; qa-test-spec.md used as reference for assertion cross-check
121
+ writes_to: [judge-review.md, judge-decision.md, story-report.md]
124
122
 
125
123
  knowledge:
126
124
  name: Knowledge
@@ -16,12 +16,12 @@ Spawned fresh for each story and torn down after the story ships or is cancelled
16
16
  | REQS | Sonnet | Requirements analyst -- translates ACs into implementation brief | story-input (ACs, trigger-map, architecture-decisions, UX spec) | `reqs-brief.md` | Brainstorms ambiguity resolutions; escalates only when options have genuinely competing tradeoffs |
17
17
  | UXA | Sonnet | UX specification -- translates UX spec into component specs | `reqs-brief.md`, ux-spec, trigger-map, scenarios | `uxa-spec.md` | Runs translation-only mode without trigger-map or scenarios; skipped for backend-only projects |
18
18
  | QA-A | Sonnet | QA spec writer -- produces behavioral test specifications | `reqs-brief.md`, `uxa-spec.md` | `qa-test-spec.md`, `visual-validation-checklist.md` | Writes test specs before code exists; tests are specified, not implemented |
19
- | JUDGE-G1 | Sonnet | Quality gate -- validates specs (Pass 1) and bug priorities (Pass 2) | `reqs-brief.md`, `uxa-spec.md`, `qa-test-spec.md`, `bugs.md`, `execution-report.md` | `judge-g1-review.md` | Sequential review: stops on first failure in Pass 1 |
19
+ | READINESS | Sonnet | Spec quality gate -- validates specs before execution begins | `reqs-brief.md`, `uxa-spec.md`, `qa-test-spec.md` | `readiness-review.md` | Sequential review: stops on first failure |
20
20
  | BEND | Opus | Backend developer -- implements production code and tests | `reqs-brief.md`, `qa-test-spec.md` | `bend-handoff.md` | Implements to QA-A test spec; coordinates with FEND via inbox for shared files |
21
21
  | FEND | Opus | Frontend developer -- implements UI components and tests | `reqs-brief.md`, `uxa-spec.md`, `qa-test-spec.md` | `fend-handoff.md` | Implements to UXA component spec; skipped for backend-only projects |
22
22
  | CRITIC | Opus | Code reviewer -- 3-pass adversarial review | git-diff, `reqs-brief.md`, `qa-test-spec.md` | `critic-review.md` | 3-pass sequential review (blind hunt, edge-case hunt, acceptance audit) + triage |
23
23
  | QA-B | Sonnet | Test executor -- runs tests, validates spec alignment, files bugs | `qa-test-spec.md`, `critic-review.md`, `reqs-brief.md` | `execution-report.md`, `bugs.md`, `traceability-matrix.md` | Runs tests against real infrastructure; can request PMCP spawn for visual validation |
24
- | JUDGE-G2 | Sonnet | Final ship gate -- evidence-based approval or rejection | `execution-report.md`, `traceability-matrix.md`, `pmcp-evidence.md`, `bugs.md`, `judge-g1-review.md` | `judge-g2-decision.md` | Evidence over assertion -- independently verifies every upstream claim |
24
+ | JUDGE | Sonnet | Final quality gate -- bug review + ship decision | `execution-report.md`, `traceability-matrix.md`, `pmcp-evidence.md`, `bugs.md`, `qa-test-spec.md` | `judge-review.md`, `judge-decision.md`, `story-report.md` | Evidence over assertion -- independently verifies every upstream claim |
25
25
  | Knowledge | Haiku | Knowledge retrieval -- answers queries from persistent data sources | chromadb, curated-knowledge-files, correction-directives | _(none -- inbox only)_ | Responds via inbox only; no file output |
26
26
 
27
27
  ### Persistent Agent (1)
@@ -66,7 +66,7 @@ Not all agents run for every project type. The Lead reads `project_type` from `p
66
66
  | Tier | Agents | Use Case | Cost |
67
67
  |------|--------|----------|------|
68
68
  | Opus | Lead, BEND, FEND, CRITIC | Complex code generation, orchestration, nuanced multi-pass review | Highest |
69
- | Sonnet | REQS, UXA, QA-A, QA-B, JUDGE-G1, JUDGE-G2, PMCP, Retrospective | Analysis, spec writing, test execution, judgment, coordination | Balanced |
69
+ | Sonnet | REQS, UXA, QA-A, QA-B, READINESS, JUDGE, PMCP, Retrospective | Analysis, spec writing, test execution, judgment, coordination | Balanced |
70
70
  | Haiku | Knowledge, Embed, Help | Mechanical retrieval, indexing instructions, documentation lookups | Lowest |
71
71
 
72
72
  Model assignments are configurable in `pipeline-config.yaml` under the `models` section. Move agents between tiers to adjust the quality/cost tradeoff for your project.
@@ -150,7 +150,7 @@ Every handoff file begins with this YAML frontmatter block. It is the machine-re
150
150
 
151
151
  ```yaml
152
152
  ---
153
- agent: {agent-name} # producing agent: reqs | uxa | qa-a | bend | fend | critic | qa-b | judge-g1 | judge-g2 | pmcp | retrospective
153
+ agent: {agent-name} # producing agent: reqs | uxa | qa-a | readiness | bend | fend | critic | qa-b | judge | pmcp | retrospective
154
154
  story: {story-id} # story identifier, e.g. STORY-042
155
155
  status: {status} # in_progress | completed
156
156
  stepsCompleted: [] # list of completed sub-steps within this agent's phase
@@ -278,7 +278,7 @@ Design Council is a structured deliberation protocol using existing inbox primit
278
278
 
279
279
  - REQS flags a high-ambiguity decision where brainstormed options have genuinely competing tradeoffs and no clear winner
280
280
  - CRITIC rejects code for the second time on the same issue, suggesting a deeper design disagreement
281
- - JUDGE G1 rejects a test spec but the rejection reason is debatable -- the spec author (QA-A) disagrees with the reviewer's interpretation
281
+ - READINESS rejects a test spec but the rejection reason is debatable -- the spec author (QA-A) disagrees with the reviewer's interpretation
282
282
 
283
283
  ### Step 1: Initiator sends structured question
284
284
 
@@ -8,7 +8,7 @@
8
8
 
9
9
  ### Persistent vs Per-Story Agents
10
10
 
11
- The lead is the **only persistent agent** in the pipeline. It carries `pipeline-state.json` and backlog position forward across stories. All other agents (REQS, UXA, QA-A, BEND, FEND, CRITIC, QA-B, JUDGE-G1, JUDGE-G2, Knowledge) are **per-story** -- spawned fresh when a story starts, torn down when it ships.
11
+ The lead is the **only persistent agent** in the pipeline. It carries `pipeline-state.json` and backlog position forward across stories. All other agents (REQS, UXA, QA-A, BEND, FEND, CRITIC, QA-B, READINESS, JUDGE, Knowledge) are **per-story** -- spawned fresh when a story starts, torn down when it ships.
12
12
 
13
13
  The Knowledge Agent's value is in its persistent data sources (ChromaDB collections and curated knowledge files on disk), not its conversation history. A fresh spawn reads from the same store.
14
14
 
@@ -79,7 +79,7 @@ The lead watches the board, not the work. It reads the shared task list to track
79
79
 
80
80
  - Read handoff documents to decide what happens next (task dependencies handle sequencing)
81
81
  - Relay messages between agents (they use inbox directly)
82
- - Judge output quality (JUDGE gates and CRITIC handle this -- except on G2 rejection)
82
+ - Judge output quality (READINESS, JUDGE, and CRITIC handle this -- except on JUDGE rejection)
83
83
  - Customize templates per spawn (agents read their role from the manifest)
84
84
 
85
85
  ### Stall Detection
@@ -111,7 +111,7 @@ All code committed and pushed to the branch specified by the user. The pipeline
111
111
 
112
112
  ### Ship Sequence
113
113
 
114
- 1. JUDGE G2 approves
114
+ 1. JUDGE approves
115
115
  2. Code committed and pushed to user-specified branch
116
116
  3. All agent outputs persist in the story folder (handoff files, reviews, bug reports, execution reports, PMCP evidence)
117
117
  4. Lead writes `story-report.md`: task completion times, rejection cycles, cost metrics
@@ -133,16 +133,16 @@ The lead maintains YAML frontmatter in `story-report.md` during story execution.
133
133
 
134
134
  | Rejection Source | What Failed | Re-entry Action |
135
135
  |-----------------|-------------|-----------------|
136
- | **JUDGE G1 Pass 1** | REQS brief inadequate | Re-queue REQS to revise brief. UXA and QA-A do not proceed. |
137
- | **JUDGE G1 Pass 1** | UXA spec incomplete (fails Developer Trust Test) | Re-queue UXA to revise spec. QA-A does not proceed. |
138
- | **JUDGE G1 Pass 1** | Test specs insufficient | Re-queue QA-A to revise specs. |
136
+ | **READINESS** | REQS brief inadequate | Re-queue REQS to revise brief. UXA and QA-A do not proceed. |
137
+ | **READINESS** | UXA spec incomplete (fails Developer Trust Test) | Re-queue UXA to revise spec. QA-A does not proceed. |
138
+ | **READINESS** | Test specs insufficient | Re-queue QA-A to revise specs. |
139
139
  | **CRITIC** | BEND code rejected | CRITIC sends rejection to BEND via inbox. BEND fixes. CRITIC re-reviews. |
140
140
  | **CRITIC** | FEND code rejected | CRITIC sends rejection to FEND via inbox. FEND fixes. CRITIC re-reviews. |
141
141
  | **QA-B** | P1-P3 bugs found | QA-B routes bugs to BEND/FEND via inbox. Devs fix. QA-B re-runs. |
142
- | **JUDGE G1 Pass 2** | Bug priority reclassified (P4 -> P1-P3) | QA-B routes reclassified bug to devs. Devs fix. QA-B re-runs. |
143
- | **JUDGE G2** | Final gate rejection | **Lead takes ownership.** Lead reads the full rejection, diagnoses root cause, determines which agents need to act, and orchestrates the fix. G2 rejections are non-routine -- they mean something slipped through the entire chain. |
142
+ | **JUDGE** | Bug priority reclassified (P4 -> P1-P3) | QA-B routes reclassified bug to devs. Devs fix. QA-B re-runs. |
143
+ | **JUDGE** | Final gate rejection | **Lead takes ownership.** Lead reads the full rejection, diagnoses root cause, determines which agents need to act, and orchestrates the fix. JUDGE rejections are non-routine -- they mean something slipped through the entire chain. |
144
144
 
145
- **Note:** G1 Pass 1 reviews REQS -> UXA -> QA sequentially, stopping on first failure. Only one rejection fires per pass -- downstream specs are not reviewed if an upstream spec fails.
145
+ **Note:** READINESS reviews REQS -> UXA -> QA sequentially, stopping on first failure. Only one rejection fires per pass -- downstream specs are not reviewed if an upstream spec fails.
146
146
 
147
147
  ---
148
148
 
@@ -8,7 +8,7 @@
8
8
 
9
9
  v3 is a story execution pipeline built on Claude Code agent teams. It takes a user story with acceptance criteria and produces committed, tested code plus a full artifact trail -- requirements briefs, test specs, code reviews, execution reports, and traceability matrices.
10
10
 
11
- The pipeline is orchestrated by a persistent Lead agent that spawns a fresh team of specialist agents per story. Each agent reads structured handoff documents from upstream agents, does its work, and writes its own handoff document for downstream consumers. Quality gates (JUDGE agents) enforce pass/fail checkpoints before work proceeds.
11
+ The pipeline is orchestrated by a persistent Lead agent that spawns a fresh team of specialist agents per story. Each agent reads structured handoff documents from upstream agents, does its work, and writes its own handoff document for downstream consumers. Quality gates (READINESS and JUDGE) enforce pass/fail checkpoints before work proceeds.
12
12
 
13
13
  ---
14
14
 
@@ -41,8 +41,9 @@ For each story, the pipeline writes to `stories/{story-id}/output/`:
41
41
  | `execution-report.md` | QA-B | Test execution results |
42
42
  | `bugs.md` | QA-B | Filed bugs with priorities |
43
43
  | `traceability-matrix.md` | QA-B | AC-to-test coverage map |
44
- | `judge-g1-review.md` | JUDGE-G1 | Spec quality gate results |
45
- | `judge-g2-decision.md` | JUDGE-G2 | Final ship/reject decision |
44
+ | `readiness-review.md` | READINESS | Spec quality gate results |
45
+ | `judge-review.md` | JUDGE | Bug review findings |
46
+ | `judge-decision.md` | JUDGE | Final ship/reject decision |
46
47
  | `pmcp-evidence.md` | PMCP | Visual validation screenshots |
47
48
  | `story-report.md` | Lead | Story completion summary |
48
49
 
@@ -57,7 +58,7 @@ The Lead agent reads `agents-manifest.yaml` and `pipeline-config.yaml` at startu
57
58
  ### Pipeline Flow
58
59
 
59
60
  ```
60
- REQS → UXA → QA-A → JUDGE-G1 → BEND+FEND → CRITIC → QA-B → JUDGE-G1JUDGE-G2 → SHIP
61
+ REQS → UXA → QA-A → READINESS → BEND+FEND → CRITIC → QA-B → JUDGE → SHIP
61
62
  ```
62
63
 
63
64
  1. **REQS** reads the story input and produces `reqs-brief.md` -- the implementation brief that all downstream agents treat as the source of truth for business requirements.
@@ -66,7 +67,7 @@ REQS → UXA → QA-A → JUDGE-G1 → BEND+FEND → CRITIC → QA-B → JUDGE-G
66
67
 
67
68
  3. **QA-A** writes behavioral test specifications and a visual validation checklist before any code is written. Tests are specified, not implemented.
68
69
 
69
- 4. **JUDGE-G1 Pass 1** validates the spec chain: reqs brief, UXA spec, and QA test spec. Stops on first failure -- upstream agent must rework before the pipeline proceeds.
70
+ 4. **READINESS** validates the spec chain: reqs brief, UXA spec, and QA test spec. Stops on first failure -- upstream agent must rework before the pipeline proceeds.
70
71
 
71
72
  5. **BEND + FEND** implement production code and tests in parallel. BEND handles backend; FEND handles frontend (skipped for backend-only projects). Both read the reqs brief and test spec.
72
73
 
@@ -74,11 +75,9 @@ REQS → UXA → QA-A → JUDGE-G1 → BEND+FEND → CRITIC → QA-B → JUDGE-G
74
75
 
75
76
  7. **QA-B** executes the full test suite against real infrastructure, cross-references results against the QA-A test spec, files bugs, and builds the traceability matrix.
76
77
 
77
- 8. **JUDGE-G1 Pass 2** reviews any filed bugs and validates their priorities.
78
+ 8. **JUDGE** reviews any filed bugs, validates their priorities, and makes the final SHIP or REJECT decision based on evidence: test results, traceability matrix, bug status, and PMCP visual evidence (if applicable). Evidence over assertion -- every upstream claim is independently verified.
78
79
 
79
- 9. **JUDGE-G2** makes the final SHIP or REJECT decision based on evidence: test results, traceability matrix, bug status, and PMCP visual evidence (if applicable). Evidence over assertion -- every upstream claim is independently verified.
80
-
81
- 10. **SHIP** -- Lead commits code, writes `story-report.md`, tears down the team, and picks the next story.
80
+ 9. **SHIP** -- Lead commits code, writes `story-report.md`, tears down the team, and picks the next story.
82
81
 
83
82
  ### Rejection Loops
84
83
 
@@ -101,7 +100,7 @@ See `.valent-pipeline/docs/communication-standard.md` for the full specification
101
100
  - **Distilled communication** -- All handoff artifacts are structured for machine consumption: YAML frontmatter, orchestrator summary, facts-only prose. No filler.
102
101
  - **Behavioral test specs** -- QA-A writes test specifications before code exists. Developers implement to spec; QA-B verifies against spec.
103
102
  - **Multi-pass code review** -- CRITIC runs three independent review passes (blind hunt, edge-case hunt, acceptance audit) then triages all findings by severity.
104
- - **Quality gates** -- JUDGE-G1 validates specs; JUDGE-G2 validates execution evidence. Both are pass/fail checkpoints that block the pipeline.
103
+ - **Quality gates** -- READINESS validates specs; JUDGE validates execution evidence and makes the ship decision. Both are pass/fail checkpoints that block the pipeline.
105
104
  - **Correction directives** -- Learned rules from past stories (maintained by the Retrospective agent) that are injected into agent prompts to prevent recurring mistakes.
106
105
 
107
106
  ---
@@ -156,8 +155,9 @@ stories/
156
155
  execution-report.md # QA-B output
157
156
  bugs.md # QA-B output
158
157
  traceability-matrix.md # QA-B output
159
- judge-g1-review.md # JUDGE-G1 output
160
- judge-g2-decision.md # JUDGE-G2 output
158
+ readiness-review.md # READINESS output
159
+ judge-review.md # JUDGE output
160
+ judge-decision.md # JUDGE output
161
161
  pmcp-evidence.md # PMCP output
162
162
  story-report.md # Lead output
163
163
  decisions.md # Design Council decisions
@@ -53,7 +53,7 @@ Defines the JSON schema for `pipeline-state.json`, the Lead agent's persistent s
53
53
  |-------|------|-------------|---------|------------|
54
54
  | `id` | string | Story identifier (e.g., `STORY-042`) | Lead, all teammates (via story context) | Lead (on story start) |
55
55
  | `status` | enum | `in_progress`, `completed`, `cancelled`, `blocked-on-user` | Lead (for state machine transitions) | Lead (on phase transitions) |
56
- | `phase` | enum | `kick-off` (spawning agents), `monitoring` (execution in progress), `ship-teardown` (JUDGE-G2 passed, cleanup) | Lead (to determine allowed actions) | Lead (on phase transitions) |
56
+ | `phase` | enum | `kick-off` (spawning agents), `monitoring` (execution in progress), `ship-teardown` (JUDGE passed, cleanup) | Lead (to determine allowed actions) | Lead (on phase transitions) |
57
57
  | `active_teammates` | string[] | Names of currently spawned teammates | Lead (for health checks, stall detection) | Lead (on spawn/termination) |
58
58
  | `task_graph_snapshot` | string | Pointer to the shared task list that tracks agent dependencies | Lead (on crash recovery to rebuild state) | Lead (on task graph changes) |
59
59
  | `started_at` | ISO-8601 | Timestamp when story execution began | Lead (for duration tracking in story-report) | Lead (on story start) |
@@ -64,7 +64,7 @@ Defines the JSON schema for `pipeline-state.json`, the Lead agent's persistent s
64
64
  | Field | Type | Description | Read by | Written by |
65
65
  |-------|------|-------------|---------|------------|
66
66
  | `id` | string | Story identifier | Lead (for scheduling) | Lead (on user submission) |
67
- | `status` | enum | `pending`, `in_progress`, `completed`, `blocked`, `blocked-on-user`, `cancelled` | Lead (to select next story) | Lead (on status changes) |
67
+ | `status` | enum | Granular phase statuses: `pending`, `requirements-spec`, `ux-spec`, `test-case-development`, `readiness-review`, `groomed`, `sizing`, `sprint-planned`, `development`, `code-review`, `qa-validation`, `final-review`, `shipped`, `blocked`, `blocked-on-user`, `cancelled` | Lead (to select next story) | Lead (on status changes) |
68
68
  | `depends_on` | string[] | Story IDs that must complete before this story can start | Lead (for dependency resolution) | Lead (on user submission) |
69
69
  | `blocked_reason` | string | Human-readable reason for blocked status; empty or absent when not blocked | Lead (for user reporting) | Lead (when blocking occurs) |
70
70
 
@@ -92,6 +92,36 @@ The Lead creates `pipeline-state.json` on first pipeline initialization with `pi
92
92
 
93
93
  ---
94
94
 
95
+ ## Sprint State (Epic/Project Mode)
96
+
97
+ When `is_sprint_mode` is true, `pipeline-state.json` includes a `current_sprint` object:
98
+
99
+ | Field | Type | Description | Read by | Written by |
100
+ |-------|------|-------------|---------|------------|
101
+ | `id` | string | Sprint identifier (e.g., `kanban-mvp-sprint-3`) | Lead | Lead (on sprint init) |
102
+ | `number` | number | Sequential sprint number within this run | Lead | Lead (on sprint init) |
103
+ | `phase` | enum | `grooming`, `sizing`, `planning`, `executing`, `reviewing`, `completed` | Lead (for crash recovery) | Lead (on sprint phase transitions) |
104
+ | `stories_planned` | string[] | Story IDs packed into this sprint | Lead | Lead (on sprint plan) |
105
+ | `stories_completed` | string[] | Story IDs shipped in this sprint | Lead | Lead (on story ship) |
106
+ | `velocity` | number | Current velocity in story points | Lead, Retrospective | Lead (on sprint init) |
107
+ | `elapsed_execution_minutes` | number | Cumulative execution time (grooming excluded) | Lead (for budget check) | Lead (on story completion) |
108
+ | `points_planned` | number | Total story points planned | Lead | Lead (on sprint plan) |
109
+ | `points_completed` | number | Total story points shipped | Lead, Retrospective | Lead (on story ship) |
110
+
111
+ ### Sprint Crash Recovery
112
+
113
+ If the Lead restarts and finds `current_sprint.phase` is not `completed`:
114
+
115
+ 1. Read `current_sprint.phase` to determine the sprint sub-phase.
116
+ 2. Resume from the appropriate orchestration step file:
117
+ - `grooming` → resume `sprint-groom.md` from last un-groomed story
118
+ - `sizing` → resume `sprint-size.md` from last un-sized story
119
+ - `planning` → re-run `sprint-plan.md` (idempotent)
120
+ - `executing` → resume `sprint-execute.md` from current story
121
+ - `reviewing` → re-run `sprint-review.md` (idempotent)
122
+
123
+ ---
124
+
95
125
  ## Crash Recovery Protocol
96
126
 
97
127
  If the Lead restarts and finds `current_story.status == "in_progress"`:
@@ -32,7 +32,7 @@ CRITIC reads `git-diff` (produced by BEND and FEND's code changes), so CRITIC is
32
32
  The full default pipeline sequence for a `fullstack-web` project:
33
33
 
34
34
  ```
35
- REQS -> UXA -> QA-A -> JUDGE-G1-Pass1 -> BEND + FEND [parallel] -> CRITIC -> QA-B -> JUDGE-G1-Pass2 -> JUDGE-G2
35
+ REQS -> UXA -> QA-A -> READINESS -> BEND + FEND [parallel] -> CRITIC -> QA-B -> JUDGE
36
36
  ```
37
37
 
38
38
  Expanded with dependency notation:
@@ -42,15 +42,14 @@ Expanded with dependency notation:
42
42
  | REQS | (none) | First task, starts immediately |
43
43
  | UXA | REQS | Reads `reqs-brief.md` |
44
44
  | QA-A | UXA | Reads `reqs-brief.md`, `uxa-spec.md` |
45
- | JUDGE-G1 Pass 1 | QA-A | Reviews `reqs-brief.md`, `uxa-spec.md`, `qa-test-spec.md` sequentially |
46
- | BEND | JUDGE-G1 Pass 1 | Reads `reqs-brief.md`, `qa-test-spec.md` |
47
- | FEND | JUDGE-G1 Pass 1 | Reads `reqs-brief.md`, `uxa-spec.md`, `qa-test-spec.md` |
45
+ | READINESS | QA-A | Reviews `reqs-brief.md`, `uxa-spec.md`, `qa-test-spec.md` sequentially |
46
+ | BEND | READINESS | Reads `reqs-brief.md`, `qa-test-spec.md` |
47
+ | FEND | READINESS | Reads `reqs-brief.md`, `uxa-spec.md`, `qa-test-spec.md` |
48
48
  | CRITIC | BEND, FEND | Reads `git-diff` (produced by both), `reqs-brief.md`, `qa-test-spec.md` |
49
49
  | QA-B | CRITIC | Reads `qa-test-spec.md`, `critic-review.md` |
50
- | JUDGE-G1 Pass 2 | QA-B | Reviews `bugs.md`, `execution-report.md` |
51
- | JUDGE-G2 | JUDGE-G1 Pass 2 | Reviews `execution-report.md`, `traceability-matrix.md`, `pmcp-evidence.md`, `bugs.md`, `judge-g1-review.md` |
50
+ | JUDGE | QA-B | Reviews `execution-report.md`, `traceability-matrix.md`, `pmcp-evidence.md`, `bugs.md`, `qa-test-spec.md` |
52
51
 
53
- BEND and FEND run in **parallel** -- they are both blocked only by JUDGE-G1 Pass 1 and have no dependency on each other. CRITIC waits for both to complete.
52
+ BEND and FEND run in **parallel** -- they are both blocked only by READINESS and have no dependency on each other. CRITIC waits for both to complete.
54
53
 
55
54
  ---
56
55
 
@@ -112,11 +111,11 @@ Skipped agents: UXA, FEND, PMCP
112
111
 
113
112
  Modified chain:
114
113
  ```
115
- REQS -> QA-A -> JUDGE-G1-Pass1 -> BEND -> CRITIC -> QA-B -> JUDGE-G1-Pass2 -> JUDGE-G2
114
+ REQS -> QA-A -> READINESS -> BEND -> CRITIC -> QA-B -> JUDGE
116
115
  ```
117
116
 
118
117
  - QA-A is `blockedBy` REQS directly (UXA is skipped)
119
- - JUDGE-G1 Pass 1 skips UXA validation
118
+ - READINESS skips UXA validation
120
119
  - CRITIC is `blockedBy` BEND only (no FEND)
121
120
  - Visual validation checklist is not produced; PMCP is not spawned
122
121
 
@@ -126,7 +125,7 @@ Skipped agents: BEND
126
125
 
127
126
  Modified chain:
128
127
  ```
129
- REQS -> UXA -> QA-A -> JUDGE-G1-Pass1 -> FEND -> CRITIC -> QA-B -> JUDGE-G1-Pass2 -> JUDGE-G2
128
+ REQS -> UXA -> QA-A -> READINESS -> FEND -> CRITIC -> QA-B -> JUDGE
130
129
  ```
131
130
 
132
131
  - CRITIC is `blockedBy` FEND only (no BEND)
@@ -150,17 +149,17 @@ When a gate rejects, the lead modifies the graph to re-queue the rejected task.
150
149
 
151
150
  ### Rejection Scenarios
152
151
 
153
- **JUDGE G1 Pass 1 rejects REQS:**
152
+ **READINESS rejects REQS:**
154
153
  - REQS task -> `pending`
155
154
  - UXA, QA-A, and everything downstream -> `pending`
156
155
  - REQS revises, re-completes. Pipeline resumes from UXA.
157
156
 
158
- **JUDGE G1 Pass 1 rejects UXA:**
157
+ **READINESS rejects UXA:**
159
158
  - UXA task -> `pending`
160
159
  - QA-A and everything downstream -> `pending`
161
160
  - UXA revises, re-completes. Pipeline resumes from QA-A.
162
161
 
163
- **JUDGE G1 Pass 1 rejects QA-A:**
162
+ **READINESS rejects QA-A:**
164
163
  - QA-A task -> `pending`
165
164
  - Everything downstream -> `pending`
166
165
  - QA-A revises, re-completes. Pipeline resumes from BEND/FEND.
@@ -176,9 +175,9 @@ When a gate rejects, the lead modifies the graph to re-queue the rejected task.
176
175
  - Devs fix without formal task re-queue (inbox-driven fix cycle)
177
176
  - QA-B re-runs tests after fixes
178
177
 
179
- **JUDGE G1 Pass 2 reclassifies bugs:**
178
+ **JUDGE reclassifies bugs:**
180
179
  - Similar to QA-B bug routing -- devs fix, QA-B re-runs
181
180
 
182
- **JUDGE G2 rejects:**
181
+ **JUDGE rejects:**
183
182
  - Lead takes ownership and determines which agents need to re-execute
184
183
  - Lead manually re-queues the appropriate tasks based on root cause diagnosis
@@ -9,7 +9,7 @@ Universal structure that all 16 handoff document templates follow. Consult this
9
9
  Templates define the **output format** of every agent-to-agent handoff artifact. They exist to:
10
10
 
11
11
  - **Decouple format from agent logic.** Adding a section to a template does not require editing the producing agent's prompt.
12
- - **Give consuming agents a predictable schema.** JUDGE G1 reads `qa-test-spec.template.md` to know exactly which sections it reviews.
12
+ - **Give consuming agents a predictable schema.** READINESS reads `qa-test-spec.template.md` to know exactly which sections it reviews.
13
13
  - **Enforce the distilled communication standard structurally.** YAML frontmatter, orchestrator summary blocks, and machine-consumption formatting are baked into the template itself, not left to agent discretion.
14
14
  - **Serve as the handoff contract.** If a template marks a section `-- required` and the agent's output omits it, CRITIC or JUDGE flags the gap by comparing output against template.
15
15
 
@@ -85,7 +85,7 @@ Immediately after frontmatter. This is the TL;DR the lead scans without reading
85
85
 
86
86
  **Field notes:**
87
87
  - `Verdict`: `pass` means the agent completed successfully with no issues. `fail` means the agent could not complete or output failed self-checks. `needs-review` means output is complete but contains flagged concerns.
88
- - `State transition`: References pipeline phases (e.g., `reqs -> uxa`, `dev -> critic-review`, `qa-execution -> judge-g2`).
88
+ - `State transition`: References pipeline phases (e.g., `reqs -> uxa`, `dev -> critic-review`, `qa-execution -> judge`).
89
89
  - `Flags`: Terse alerts. Examples: `"AC-3 ambiguous -- interpreted as X"`, `"no E2E for file upload -- mock only"`, `"none"`.
90
90
 
91
91
  ### 2.4 Required Sections
@@ -263,19 +263,20 @@ The 16 templates in `.valent-pipeline/templates/`, mapped to their producing age
263
263
 
264
264
  | Template | Producing Agent | Primary Consumers |
265
265
  |----------|----------------|-------------------|
266
- | `reqs-brief.template.md` | REQS | UXA, QA-A, BEND, FEND, CRITIC, JUDGE-G1 |
267
- | `uxa-spec.template.md` | UXA | QA-A, FEND, JUDGE-G1 |
268
- | `qa-test-spec.template.md` | QA-A | JUDGE-G1, BEND, FEND, CRITIC, QA-B |
266
+ | `reqs-brief.template.md` | REQS | UXA, QA-A, BEND, FEND, CRITIC, READINESS |
267
+ | `uxa-spec.template.md` | UXA | QA-A, FEND, READINESS |
268
+ | `qa-test-spec.template.md` | QA-A | READINESS, BEND, FEND, CRITIC, QA-B |
269
269
  | `visual-validation-checklist.template.md` | QA-A | PMCP |
270
270
  | `bend-handoff.template.md` | BEND | CRITIC, QA-B |
271
271
  | `fend-handoff.template.md` | FEND | CRITIC, QA-B |
272
272
  | `critic-review.template.md` | CRITIC | BEND, FEND, QA-B |
273
- | `bugs.template.md` | QA-B | BEND, FEND, JUDGE-G1 (Pass 2) |
274
- | `execution-report.template.md` | QA-B | JUDGE-G2 |
275
- | `traceability-matrix.template.md` | QA-B | JUDGE-G2 |
276
- | `judge-g1-review.template.md` | JUDGE-G1 | Lead, REQS/UXA/QA-A (on rejection) |
277
- | `judge-g2-decision.template.md` | JUDGE-G2 | Lead |
273
+ | `bugs.template.md` | QA-B | BEND, FEND, JUDGE (bug review) |
274
+ | `execution-report.template.md` | QA-B | JUDGE |
275
+ | `traceability-matrix.template.md` | QA-B | JUDGE |
276
+ | `readiness-review.template.md` | READINESS | Lead, REQS/UXA/QA-A (on rejection) |
277
+ | `judge-review.template.md` | JUDGE | Lead, BEND/FEND (on rejection) |
278
+ | `judge-decision.template.md` | JUDGE | Lead |
278
279
  | `story-report.template.md` | Lead | User |
279
- | `pmcp-evidence.template.md` | PMCP | JUDGE-G2 |
280
+ | `pmcp-evidence.template.md` | PMCP | JUDGE |
280
281
  | `retrospective.template.md` | Retrospective Agent | Lead, Knowledge Agent |
281
282
  | `embed-instructions.template.md` | Lead | Embed Agent |
@@ -9,7 +9,7 @@ Read `.valent-pipeline/steps/common/agent-protocol.md` for Communication Standar
9
9
 
10
10
  You are spawned at story kick-off but do NOT begin work immediately.
11
11
 
12
- - **Wait for:** `[JUDGE-G1-APPROVAL]` (Pass 1) from JUDGE-G1
12
+ - **Wait for:** `[READINESS-APPROVAL]` from READINESS
13
13
  - **On completion:** Send `[HANDOFF]` to CRITIC. CC Lead. If FEND is active, CRITIC waits for both -- send your handoff; CRITIC starts when it has both.
14
14
  - **On rejection received (from CRITIC):** Read rejection at critic-review.md. Fix code. Re-send `[HANDOFF]` to CRITIC.
15
15
  - **On bug received (from QA-B):** Fix bug. Notify QA-B when fixed.
@@ -9,7 +9,7 @@ Read `.valent-pipeline/steps/common/agent-protocol.md` for Communication Standar
9
9
 
10
10
  You are spawned at story kick-off but do NOT begin work immediately.
11
11
 
12
- - **Wait for:** `[JUDGE-G1-APPROVAL]` (Pass 1) from JUDGE-G1
12
+ - **Wait for:** `[READINESS-APPROVAL]` from READINESS
13
13
  - **On completion:** Send `[HANDOFF]` to CRITIC. CC Lead. CRITIC waits for both BEND and FEND -- send your handoff; CRITIC starts when it has both.
14
14
  - **On rejection received (from CRITIC):** Read rejection at critic-review.md. Fix code. Re-send `[HANDOFF]` to CRITIC.
15
15
  - **On bug received (from QA-B):** Fix bug. Notify QA-B when fixed.
@@ -49,8 +49,9 @@ These are non-negotiable. CRITIC and QA-B enforce them.
49
49
  - **Explicit assertions in test bodies** -- never hide assertions in helpers. Every test body must contain at least one visible `expect`/`assert`.
50
50
  - **Parallel-safe** -- no shared mutable state between tests. Must run cleanly with `--workers=4`.
51
51
  - **API-first setup** -- never use UI for test precondition setup. Seed via API calls or direct database insertion.
52
- - **Network-first pattern** -- intercept network routes BEFORE navigation to prevent race conditions.
53
- - **Zero mocks** -- tests hit real infrastructure. No mocking databases, APIs, or external services.
52
+ - **Network-first setup** -- when using Playwright route handlers (for error simulation or offline testing), register them BEFORE `page.goto()` to prevent race conditions. Route handlers are acceptable for simulating error states (500s, timeouts, network failures) but MUST NOT be used to mock happy-path API responses.
53
+ - **Real API for happy paths** -- happy-path E2E tests MUST hit the real running API server. No `route.fulfill()` with canned success responses for the primary AC flow. Unit tests MAY mock fetch for isolated component rendering logic, but every mocked unit test for an API-calling AC must be paired with a real-API E2E test for the same AC.
54
+ - **API infrastructure** -- before running E2E tests, ensure the API server is running. If BEND is skipped, the existing API still needs to be live for real integration testing. Run `docker compose up -d db api` and verify `GET /api/health` returns 200 before executing E2E tests.
54
55
 
55
56
  ## UX-Specific Standards
56
57
 
@@ -0,0 +1,64 @@
1
+ # JUDGE
2
+
3
+ <!-- Prompt version: 1.0 | Model: Sonnet | Lifecycle: per-story -->
4
+
5
+ You are **JUDGE**, the final quality gate. You review bug priorities from QA-B's execution, then make the binary SHIP or REJECT decision based on evidence, not trust. Every claim from upstream agents must be independently verified against artifacts.
6
+
7
+ Your mandate: **evidence over assertion**. If an agent says "all tests pass," you verify against the execution report. If the traceability matrix says "100% coverage," you cross-reference against the test spec. Trust nothing; verify everything.
8
+
9
+ Read `.valent-pipeline/steps/common/agent-protocol.md` for Communication Standard, Context Discipline, Inbox Protocol, Design Council Protocol, Knowledge-First Principle, Correction Directives, and YAML Frontmatter.
10
+
11
+ ## Trigger Protocol
12
+
13
+ You are spawned when CRITIC starts reviewing (wave 3) but do NOT begin work immediately.
14
+
15
+ - **Wait for:** `[HANDOFF]` from QA-B. Do NOT begin if CRITIC task is still `in_progress` (rejection/bug cycle ongoing).
16
+ - **On bug review approval (no reclassifications to P1-P3):** Proceed directly to evidence review. No external message needed — this is an internal transition.
17
+ - **On bug reclassification (P4 escalated to P1-P3):** Send `[JUDGE-RECLASS]` to the responsible dev (BEND or FEND per root cause) AND to Lead. Do NOT proceed to evidence review until bugs are fixed and QA-B re-runs.
18
+ - **On SHIP verdict:** Send `[JUDGE-SHIP]` to Lead. Mark task completed. Lead owns ship/teardown.
19
+ - **On REJECT verdict:** Send `[JUDGE-REJECT]` to Lead. Mark task completed. Lead owns JUDGE rejection routing — this is non-routine.
20
+ - **Escalate to:** Lead — for `[BLOCKER]` or any issue you cannot resolve.
21
+
22
+ ## Output
23
+
24
+ Write outputs to `{story_output_dir}/`:
25
+ - `judge-review.md` using the template at `.valent-pipeline/templates/judge-review.template.md`
26
+ - `judge-decision.md` using the template at `.valent-pipeline/templates/judge-decision.template.md`
27
+ - `story-report.md` using the template at `.valent-pipeline/templates/story-report.template.md` (SHIP verdict only)
28
+
29
+ ## Inputs
30
+
31
+ - `{story_output_dir}/execution-report.md` — REQUIRED
32
+ - `{story_output_dir}/traceability-matrix.md` — REQUIRED
33
+ - `{story_output_dir}/pmcp-evidence.md` — REQUIRED if UI story; N/A for backend-only
34
+ - `{story_output_dir}/bugs.md` — REQUIRED
35
+ - `{story_output_dir}/qa-test-spec.md` — reference for assertion cross-check and AC mapping
36
+
37
+ ## Context Variables
38
+
39
+ - `{story_id}`, `{story_output_dir}`, `{correction_directives}`
40
+ - `{tech_stack.test_framework_unit}`, `{tech_stack.test_framework_e2e}`
41
+ - `{project_type}` — fullstack-web | backend-only | frontend-only
42
+
43
+ ## Step Sequence
44
+
45
+ | Step | File | Condition |
46
+ |------|------|-----------|
47
+ | Bug Review (Steps 1-4) | `.valent-pipeline/steps/judge/bug-review.md` | Always |
48
+ | Evidence Review (Steps 5-12) | `.valent-pipeline/steps/judge/evidence-review.md` | After bug review approves |
49
+ | Ship Decision (Steps 13-14b) | `.valent-pipeline/steps/judge/ship-decision.md` | Always |
50
+
51
+ ## Verdict Principles
52
+
53
+ 1. **No partial ships.** The decision is SHIP or REJECT. There is no "ship with known issues" unless all known issues are P4.
54
+ 2. **Evidence over assertion.** If an agent claims something but the artifact does not support the claim, the artifact is the truth.
55
+ 3. **Socratic doubt is mandatory.** Do not skip Socratic validation even if all checks pass.
56
+ 4. **JUDGE rejection is an escalation.** Your rejection report must diagnose how the issue slipped through upstream gates.
57
+ 5. **Confidence level matters.** If uncertain about evidence, mark confidence as low or medium and explain what would raise it.
58
+ 6. **Priority accuracy matters.** Do not rubber-stamp QA-B priority assignments. A P4 that should be P3 is a risk that could slip to production.
59
+
60
+ ## Error Handling
61
+
62
+ - If a required input file is missing or malformed: set blocker, message lead with `[BLOCKER]`, STOP.
63
+ - If crash recovery detects partial output: resume from last completed step per frontmatter.
64
+ - If you receive a correction directive mid-review: apply it, re-evaluate affected checks, update frontmatter.