npm - valent-pipeline - Versions diffs - 0.3.4 → 0.4.2 - Mend

valent-pipeline 0.3.4 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

package/bin/cli.js +80 -0
package/package.json +7 -5
package/pipeline/docs/design/provider-adapter-guide.md +6 -7
package/pipeline/orchestrators/claude-code/README.md +99 -0
package/pipeline/orchestrators/claude-code/plan.workflow.js +284 -0
package/pipeline/orchestrators/claude-code/retro.workflow.js +274 -0
package/pipeline/orchestrators/claude-code/sprint.workflow.js +354 -0
package/pipeline/orchestrators/codex/README.md +52 -0
package/pipeline/orchestrators/codex/lead-loop.md +115 -0
package/pipeline/prompts/critic.md +2 -0
package/pipeline/prompts/lead.md +1 -1
package/pipeline/schemas/handoff.schema.json +19 -0
package/pipeline/schemas/task-graph.schema.json +53 -0
package/pipeline/schemas/verdict.schema.json +20 -0
package/pipeline/steps/common/distilled-handoff-format.md +15 -0
package/pipeline/steps/critic/acceptance-audit.md +1 -1
package/pipeline/steps/critic/edge-case-hunt.md +2 -2
package/pipeline/steps/critic/triage.md +2 -2
package/pipeline/steps/orchestration/adopt-lead-and-create-team.md +13 -12
package/pipeline/steps/orchestration/sprint-plan.md +28 -31
package/pipeline/steps/retrospective/calibration.md +18 -31
package/pipeline/task-graphs/backend-api.yaml +1 -1
package/pipeline/task-graphs/data-pipeline.yaml +1 -1
package/pipeline/task-graphs/document-generation.yaml +1 -1
package/pipeline/task-graphs/frontend-only.yaml +9 -8
package/pipeline/task-graphs/fullstack-web.yaml +11 -10
package/pipeline/task-graphs/library.yaml +1 -1
package/pipeline/task-graphs/mcp-server.yaml +1 -1
package/pipeline/task-graphs/mobile-app.yaml +8 -7
package/pipeline/templates/bend-handoff.template.md +11 -0
package/pipeline/templates/critic-review.template.md +15 -1
package/pipeline/templates/data-handoff.template.md +11 -0
package/pipeline/templates/docgen-handoff.template.md +11 -0
package/pipeline/templates/execution-report.template.md +11 -0
package/pipeline/templates/fend-handoff.template.md +11 -0
package/pipeline/templates/iac-handoff.template.md +11 -0
package/pipeline/templates/judge-decision.template.md +13 -0
package/pipeline/templates/libdev-handoff.template.md +11 -0
package/pipeline/templates/mcp-dev-handoff.template.md +11 -0
package/pipeline/templates/mobile-handoff.template.md +11 -0
package/pipeline/templates/qa-test-spec.template.md +11 -0
package/pipeline/templates/readiness-review.template.md +13 -0
package/pipeline/templates/reqs-brief.template.md +11 -0
package/pipeline/templates/uxa-spec.template.md +11 -0
package/skills/valent-run-story/SKILL.md +12 -0
package/src/commands/calibrate.js +86 -0
package/src/commands/init.js +1 -1
package/src/commands/rejection-cap.js +70 -0
package/src/commands/resolve-graph.js +79 -0
package/src/commands/sprint-pack.js +62 -0
package/src/commands/validate-handoff.js +32 -0
package/src/commands/validate-sprint.js +55 -0
package/src/lib/graph.js +98 -0
package/src/lib/handoff.js +99 -0
package/src/lib/rejection.js +38 -0
package/src/lib/sprint.js +312 -0

package/pipeline/orchestrators/claude-code/retro.workflow.js ADDED Viewed

@@ -0,0 +1,274 @@
+/**
+ * valent-pipeline RETROSPECTIVE orchestrator — Claude Code (native Workflow) provider.
+ *
+ * STATUS: Step 7 (reimplementation-plan §5b, feedback R5). Reviewable; control flow validated
+ * by scripts/test-workflow.js. Opt-in, not the default. The Codex provider keeps the
+ * markdown-skill Lead (hybrid, R3).
+ *
+ * The retrospective is the ONE place the meta-loop adds genuine *quality*, not just reliable
+ * structure (R5): the prose retro is a fixed single pass; here the aggregate review runs
+ * LOOP-UNTIL-DRY (keep reviewing until K consecutive rounds surface nothing new) followed by a
+ * COMPLETENESS-CRITIC ("which pattern did we not check?"). That is the same rigor that makes
+ * CRITIC's 3-pass and JUDGE the strongest existing features, applied to the learning loop.
+ *
+ * Flow: calibrate (CLI) -> analyze -> aggregate-review (loop-until-dry) -> completeness-critic
+ *       -> directives (agent proposes; CODE enforces impact gating + the architectural-invariant
+ *          guard) -> embed (CLI).
+ *
+ * The deterministic pieces are NOT in this script: calibration arithmetic is
+ * `valent-pipeline calibrate` (src/lib/sprint.js); embedding is `valent-pipeline db embed`.
+ * Both run through agents (a Workflow script has no CLI/fs access). The directive IMPACT
+ * GATING and INVARIANT GUARD are deterministic policy, so they are enforced HERE in code —
+ * the agent only proposes; the script decides what gets applied vs. surfaced for approval.
+ *
+ * args: { batchNumber, sprintId?, storyOutputDirs?: string[], dryRounds?: number, maxRounds?: number }
+ *   sprintId present => sprint-mode (calibration runs). dryRounds = consecutive empty rounds
+ *   that end the loop-until-dry (default 2). maxRounds caps it (default 5).
+ */
+// Workflow metadata: name, one-line description, and the ordered phase list. Each phase
+// title here matches a phase('...') call made further down in this script.
+export const meta = {
+  name: 'valent-retro',
+  description: 'Retrospective: calibrate, loop-until-dry aggregate review, gated directives, embed (Workflow)',
+  phases: [
+    { title: 'Calibrate', detail: 'valent-pipeline calibrate (estimation accuracy, in code) — sprint mode' },
+    { title: 'Analyze', detail: 'CRITIC/QA/JUDGE batch outputs + cost' },
+    { title: 'Aggregate', detail: 'loop-until-dry 3-pass aggregate review + completeness critic (R5)' },
+    { title: 'Directives', detail: 'agent proposes; code enforces impact gating + invariant guard' },
+    { title: 'Embed', detail: 'valent-pipeline db embed (persist curated patterns)' },
+  ],
+}
+// --- schemas (inlined) ---
+// Return shape for the analyze, aggregate-review, and targeted gap-review agents:
+// { schema: 1, findings: [...] }. Each finding must carry `id` and `summary`; `severity`
+// and `stories` are optional. Extra top-level keys are allowed.
+const FINDINGS_SCHEMA = {
+  type: 'object',
+  required: ['schema', 'findings'],
+  additionalProperties: true,
+  properties: {
+    schema: { const: 1 },
+    findings: {
+      type: 'array',
+      items: {
+        type: 'object',
+        required: ['id', 'summary'],
+        properties: {
+          id: { type: 'string' },
+          summary: { type: 'string' },
+          severity: { type: 'string' },
+          stories: { type: 'array', items: { type: 'string' } },
+        },
+      },
+    },
+  },
+}
+// Return shape for the completeness-critic agent: { schema: 1, gaps: string[] }.
+const COMPLETENESS_SCHEMA = {
+  type: 'object',
+  required: ['schema', 'gaps'],
+  additionalProperties: true,
+  // gaps = review angles NOT yet covered (e.g. "no security-boundary scan run"). Empty => complete.
+  properties: {
+    schema: { const: 1 },
+    gaps: { type: 'array', items: { type: 'string' } },
+  },
+}
+// Return shape for the draft-directives agent: { schema: 1, directives: [...] }. Every
+// directive must carry target_agent, directive, reason, and impact_level (low|medium|high).
+// `touchesInvariant` and `category` are NOT in the required list.
+// NOTE(review): because touchesInvariant is optional, a directive that omits it is read as
+// falsy by the gate below and is therefore treated as not touching an invariant — confirm
+// that this is the intended default.
+const DIRECTIVES_SCHEMA = {
+  type: 'object',
+  required: ['schema', 'directives'],
+  additionalProperties: true,
+  properties: {
+    schema: { const: 1 },
+    directives: {
+      type: 'array',
+      items: {
+        type: 'object',
+        required: ['target_agent', 'directive', 'reason', 'impact_level'],
+        properties: {
+          target_agent: { type: 'string' },
+          directive: { type: 'string' },
+          reason: { type: 'string' },
+          impact_level: { enum: ['low', 'medium', 'high'] },
+          // Agent flags whether the directive touches an Architectural Invariant (skip tests,
+          // ship without evidence, weaken a gate, exempt mandatory tests). The CODE decides
+          // what to do with that flag — see the gate below.
+          touchesInvariant: { type: 'boolean' },
+          category: { type: 'string' },
+        },
+      },
+    },
+  },
+}
+// Minimal acknowledgement shape ({ schema: 1 }, extra keys allowed). Used for the agents
+// that apply directives or write proposals and only need to report that they finished.
+const HANDOFF_SCHEMA = {
+  type: 'object',
+  required: ['schema'],
+  additionalProperties: true,
+  properties: { schema: { const: 1 } },
+}
+// --- args ---
+// `args` is not declared in this script; presumably it is injected by the Workflow runtime — confirm.
+const a = args || {}
+const batchNumber = a.batchNumber
+// No sprintId => the Calibrate phase below is skipped entirely.
+const sprintId = a.sprintId || null
+// `??` keeps an explicit 0: dryRounds=0 or maxRounds=0 makes the aggregate loop run zero rounds.
+const dryRounds = a.dryRounds ?? 2
+const maxRounds = a.maxRounds ?? 5
+if (batchNumber == null) throw new Error('args must include { batchNumber }')
+// Builds a RETROSPECTIVE agent prompt: fixed preamble, then the task instruction, then the
+// return contract (a default contract sentence is used when none is passed).
+const retroPrompt = (instruction, returnContract) =>
+  `You are **RETROSPECTIVE**, analyzing story batch ${batchNumber} in the valent-pipeline. ` +
+  `Read \`.valent-pipeline/prompts/retrospective.md\` and the step file named in the task. ${instruction} ` +
+  (returnContract || 'Return your findings as the JSON object specified.')
+// A stable de-dup key so loop-until-dry converges (don't re-count the same finding).
+// The key is the lower-cased, trimmed summary cut to 80 characters; `id` is not part of it,
+// so two findings whose summaries share their first 80 characters count as one finding.
+const findingKey = (f) => `${(f.summary || '').toLowerCase().trim().slice(0, 80)}`
+// ---------------------------------------------------------------------------
+// Calibration runs only in sprint mode. `calibration` stays null otherwise; later it is
+// used only to decide whether the directives prompt mentions the calibration metrics.
+let calibration = null
+if (sprintId) {
+  phase('Calibrate')
+  // Estimation-accuracy arithmetic lives in code (src/lib/sprint.js); run it via the CLI.
+  calibration = await agent(
+    `Run exactly: \`valent-pipeline calibrate --sprint ${sprintId}\` in the project root and return its stdout JSON verbatim ` +
+      `(fields: ratios, flagged_pairs, surface_averages, velocity). This feeds calibration directives.`,
+    { label: 'calibrate', phase: 'Calibrate', schema: { type: 'object', additionalProperties: true } },
+  )
+  // The schema above is fully open, so both fields are read defensively.
+  log(`calibration: ${(calibration.flagged_pairs || []).length} flagged pair(s); velocity unstable=${calibration.velocity?.unstable}`)
+}
+phase('Analyze')
+// The analyze result is awaited but not stored: nothing below reads its findings, and they
+// are not added to the de-dup set used by the aggregate loop.
+await agent(
+  retroPrompt(
+    'Run analyze.md: read all CRITIC reviews, QA-B bug reports, JUDGE rejections, and cost data; categorize rejection/bug patterns.',
+    'Return ONLY { schema:1, findings:[{id,summary,severity,stories}] } as JSON.',
+  ),
+  { label: 'analyze', phase: 'Analyze', schema: FINDINGS_SCHEMA },
+)
+phase('Aggregate')
+// LOOP-UNTIL-DRY (R5): re-run the 3-pass aggregate review until `dryRounds` consecutive
+// rounds surface nothing new, deduping against everything already seen. A simple
+// fixed-pass review (the prose behavior) misses the tail; this does not.
+// State: `seen` holds finding keys, `confirmed` the accepted findings, `dry` the current
+// streak of empty rounds, `round` the total rounds run (hard-capped by maxRounds).
+const seen = new Set()
+const confirmed = []
+let dry = 0
+let round = 0
+while (dry < dryRounds && round < maxRounds) {
+  round += 1
+  const r = await agent(
+    retroPrompt(
+      `Run aggregate-review.md (round ${round}): 3-pass CRITIC-style review of the aggregate diff (last retro tag to HEAD) — ` +
+        `correctness across story boundaries, convention/pattern drift, architecture/integration. ` +
+        `Report ONLY findings not already reported in earlier rounds.`,
+      'Return ONLY { schema:1, findings:[{id,summary,severity,stories}] } as JSON.',
+    ),
+    { label: `aggregate:round-${round}`, phase: 'Aggregate', schema: FINDINGS_SCHEMA },
+  )
+  // "New" is decided here by key, not by the agent: repeats of earlier findings are dropped.
+  const fresh = (r.findings || []).filter((f) => !seen.has(findingKey(f)))
+  if (!fresh.length) {
+    dry += 1
+    log(`aggregate round ${round}: dry (${dry}/${dryRounds})`)
+    continue
+  }
+  // Any new finding resets the dry streak, so the loop ends only on consecutive empty rounds
+  // (or on the maxRounds cap).
+  dry = 0
+  // Keys are added after filtering, so two findings with the same key returned in the SAME
+  // round are both kept in `confirmed`.
+  for (const f of fresh) seen.add(findingKey(f))
+  confirmed.push(...fresh)
+  log(`aggregate round ${round}: +${fresh.length} new finding(s) (${confirmed.length} total)`)
+}
+// COMPLETENESS-CRITIC (R5): ask what review angle we never ran. Each named gap gets one
+// targeted review round; anything it surfaces joins the confirmed set.
+const critic = await agent(
+  retroPrompt(
+    `We ran ${round} aggregate-review round(s) and found ${confirmed.length} finding(s). ` +
+      `What review angle was NOT covered (e.g. a modality, a security boundary, a contract surface)? ` +
+      `List only genuine gaps — empty if coverage is complete.`,
+    'Return ONLY { schema:1, gaps:["..."] } as JSON.',
+  ),
+  { label: 'completeness-critic', phase: 'Aggregate', schema: COMPLETENESS_SCHEMA },
+)
+if ((critic.gaps || []).length) {
+  log(`completeness-critic surfaced ${critic.gaps.length} gap(s) — running targeted reviews`)
+  // One targeted review per gap, all launched together through parallel().
+  const extra = await parallel(
+    critic.gaps.map((gap, i) => () =>
+      agent(
+        retroPrompt(`Targeted aggregate review for the previously-uncovered angle: "${gap}". Report only findings not already reported.`,
+          'Return ONLY { schema:1, findings:[{id,summary,severity,stories}] } as JSON.'),
+        { label: `aggregate:gap-${i + 1}`, phase: 'Aggregate', schema: FINDINGS_SCHEMA },
+      )),
+  )
+  // filter(Boolean) skips falsy entries — presumably parallel() can yield an empty slot for
+  // a branch that produced no result; confirm against the runtime's contract.
+  for (const r of extra.filter(Boolean)) {
+    for (const f of (r.findings || [])) {
+      // Same key-based de-dup as the main loop, applied one finding at a time.
+      if (!seen.has(findingKey(f))) { seen.add(findingKey(f)); confirmed.push(f) }
+    }
+  }
+}
+log(`aggregate review complete: ${confirmed.length} confirmed finding(s)`)
+phase('Directives')
+// The agent PROPOSES directives (with impact_level + a touchesInvariant flag). The CODE
+// enforces the policy — deterministic, uncheatable — per the §5b determinism map:
+//   - touchesInvariant      -> ARCHITECTURE-CONFLICT: never auto-applied, surfaced to the user
+//   - impact_level 'high'   -> proposal only, requires user approval
+//   - 'low' / 'medium'      -> auto-applied (medium also notifies the Lead)
+// NOTE(review): nothing in this block treats 'medium' differently from 'low' — no Lead
+// notification is issued here. Confirm whether that happens elsewhere or is still missing.
+const drafted = await agent(
+  retroPrompt(
+    `Run directives.md against the ${confirmed.length} confirmed finding(s)` +
+      (calibration ? ' and the calibration metrics' : '') +
+      `. For EACH proposed directive set impact_level (low|medium|high) and touchesInvariant=true if it would skip test ` +
+      `execution, allow shipping without evidence, weaken a quality gate, or exempt mandatory tests. Do NOT self-censor — ` +
+      `propose it and flag it; the orchestrator decides what gets applied.`,
+    'Return ONLY { schema:1, directives:[{target_agent,directive,reason,impact_level,touchesInvariant,category}] } as JSON.',
+  ),
+  { label: 'draft-directives', phase: 'Directives', schema: DIRECTIVES_SCHEMA },
+)
+// Three disjoint buckets: the invariant flag wins over impact level, so a flagged directive
+// is always a conflict regardless of its impact_level.
+const all = drafted.directives || []
+const conflicts = all.filter((d) => d.touchesInvariant)
+const highImpact = all.filter((d) => !d.touchesInvariant && d.impact_level === 'high')
+const applied = all.filter((d) => !d.touchesInvariant && d.impact_level !== 'high')
+const proposals = [...conflicts, ...highImpact]
+log(`directives: ${applied.length} auto-applied, ${proposals.length} require user approval ` +
+  `(${conflicts.length} architecture-conflict, ${highImpact.length} high-impact)`)
+// Each write is skipped when its bucket is empty, so no agent is spawned for a no-op.
+if (applied.length) {
+  await agent(
+    `Append these APPROVED correction directives to \`correction-directives.yaml\` (status: active, created_batch: ${batchNumber}). ` +
+      `They have passed the impact gate (low/medium only). Directives (JSON): ${JSON.stringify(applied)}. ` +
+      `Return { schema:1 } when done.`,
+    { label: 'apply-directives', phase: 'Directives', schema: HANDOFF_SCHEMA },
+  )
+}
+if (proposals.length) {
+  // Surfaced, never silently applied — this is the [ARCHITECTURE-CONFLICT] / high-impact path.
+  await agent(
+    `Write these directive PROPOSALS to \`retrospective-batch-${batchNumber}.md\` under "## Pending Approval" — do NOT add them to ` +
+      `correction-directives.yaml. For each, document the proposed directive, why it needs approval (architecture-conflict or high-impact), ` +
+      `evidence, risk, and an alternative. Proposals (JSON): ${JSON.stringify(proposals)}. Return { schema:1 } when done.`,
+    { label: 'surface-proposals', phase: 'Directives', schema: HANDOFF_SCHEMA },
+  )
+}
+phase('Embed')
+// Persisting curated knowledge is deterministic ingestion — write the manifest, then run the CLI.
+const embed = await agent(
+  `Run embed-instructions.md: write \`embed-instructions.md\` (curated recurring patterns / novel decisions / bug patterns / ` +
+    `broadly-applicable directives only — NOT one-offs) in the most recent story output dir, then run ` +
+    `\`valent-pipeline db embed --file <that path>\`. Return { schema:1, embedded:<int count> }.`,
+  { label: 'embed', phase: 'Embed', schema: { type: 'object', additionalProperties: true } },
+)
+// Workflow result: counts only, not the findings or directives themselves.
+// `embedded` is null when the embed agent's (unvalidated) reply has no `embedded` field.
+return {
+  batchNumber,
+  sprintId,
+  aggregate_findings: confirmed.length,
+  aggregate_rounds: round,
+  completeness_gaps: (critic.gaps || []).length,
+  directives_applied: applied.length,
+  directives_pending_approval: proposals.length,
+  architecture_conflicts: conflicts.length,
+  embedded: embed.embedded ?? null,
+}

package/pipeline/orchestrators/claude-code/sprint.workflow.js ADDED Viewed

@@ -0,0 +1,354 @@
+/**
+ * valent-pipeline sprint orchestrator — Claude Code (native Workflow) provider.
+ *
+ * STATUS: Step 6 (sprint loop over the batch + 3b parallel CRITIC + full spawn-context),
+ * building on the Step 4 per-story slice. Reviewable; control flow is validated by
+ * scripts/test-workflow.js but it has NOT yet been exercised end-to-end against a live
+ * story, so it is opt-in, not the default (see README + skills/valent-run-story).
+ * The Codex provider keeps the markdown-skill Lead (hybrid, per reimplementation-plan.md
+ * R3); this script is the Claude Code deployment over the same shared substrate
+ * (prompts/steps/task-graphs/schemas).
+ *
+ * How it composes the substrate built in steps 1-3:
+ *   - resolve-graph (step 2) produces each story's stage list — conditional/skip_when
+ *     predicates evaluated and blockedBy pruned in code, never by model judgment.
+ *   - gates (readiness/critic/judge) are schema-validated verification stages; the
+ *     verdict invariant (verdict:pass => highFindingsOpen:0) is enforced both by the
+ *     schema and by assertGate(), so a gate cannot assert "pass" over open High findings.
+ *   - CRITIC's three passes (step 3b) run as INDEPENDENT parallel agents — each reads only
+ *     its own pass step file — then a triage barrier dedups and writes the verdict. This is
+ *     real perspective-diverse verify; the passes cannot anchor on each other.
+ *   - the CRITIC rejection loop is a real JS while-loop with a code-owned cap, replacing
+ *     the model-counted circuit breaker (lead.md) that could miscount.
+ *   - resume is journal-based for free: relaunch with resumeFromRunId and the unchanged
+ *     prefix of agent() calls replays from the journal. No disk-state rehydration.
+ *
+ * Sprint loop: the planned batch runs SEQUENTIALLY (a for-loop, not pipeline()). Executing
+ * stories share one git branch, so they cannot overlap without per-story worktrees — the
+ * for-loop is the sequentiality guarantee. Parallelism lives WITHIN a story (dev fan-out,
+ * CRITIC passes), never across executing stories. A rejected story (JUDGE fail / cap trip)
+ * is recorded as rolled-over and the batch continues with the next story.
+ *
+ * Workflow runtime note: this script body has NO filesystem/import access. Every side
+ * effect (running `valent-pipeline resolve-graph`, reading inputs, writing handoffs,
+ * git) is performed by the agents it spawns. The script only sequences them and validates
+ * their structured returns.
+ *
+ * args (either form):
+ *   { stories: [{ storyId, projectType?, profiles? }, ...], projectType?, profiles?, maxRejectionCycles? }
+ *   { storyId, projectType, profiles?, maxRejectionCycles? }   // single-story (back-compat)
+ */
+// Workflow metadata: name, one-line description, and the ordered phase list. Each phase
+// title here matches a phase('...') call made inside runStory().
+export const meta = {
+  name: 'valent-sprint',
+  description: 'Run a valent-pipeline sprint batch as a deterministic Workflow with schema-validated quality gates',
+  phases: [
+    { title: 'Resolve', detail: 'resolve-graph -> per-story stage list (predicates + pruning in code)' },
+    { title: 'Spec', detail: 'reqs -> uxa -> qa-a' },
+    { title: 'Readiness', detail: 'pre-dev quality gate' },
+    { title: 'Build', detail: 'dev agents in parallel (barrier before CRITIC)' },
+    { title: 'Critic', detail: 'three independent passes in parallel -> triage -> rejection loop (code-owned cap)' },
+    { title: 'QA', detail: 'execute tests against real infra' },
+    { title: 'Judge', detail: 'evidence-based ship decision' },
+  ],
+}
+// --- Structured-output schemas (mirror pipeline/schemas/*.json; inlined because a
+//     Workflow script cannot read files). The gate schema carries the pass-invariant. ---
+// HANDOFF_SCHEMA is the default return shape for spawned agents (see spawn() in runStory):
+// schema/agent/story are required; files, nextAgent and flags are optional.
+const HANDOFF_SCHEMA = {
+  type: 'object',
+  required: ['schema', 'agent', 'story'],
+  additionalProperties: true,
+  properties: {
+    schema: { const: 1 },
+    agent: { type: 'string' },
+    story: { type: 'string' },
+    files: { type: 'array', items: { type: 'string' } },
+    nextAgent: { type: ['string', 'null'] },
+    flags: { type: 'array', items: { type: 'string' } },
+  },
+}
+// Return shape for gate agents (READINESS, CRITIC triage, JUDGE): a verdict plus the count
+// of still-open High findings. rejectionTarget is optional and may be null.
+const VERDICT_SCHEMA = {
+  type: 'object',
+  required: ['schema', 'agent', 'story', 'verdict', 'highFindingsOpen'],
+  additionalProperties: true,
+  // Enforced post-validation in code as well (a JSON-Schema-only encoding of an
+  // implication is awkward); see assertGate().
+  properties: {
+    schema: { const: 1 },
+    agent: { type: 'string' },
+    story: { type: 'string' },
+    verdict: { enum: ['pass', 'fail', 'needs-review'] },
+    highFindingsOpen: { type: 'integer', minimum: 0 },
+    rejectionTarget: { type: ['string', 'null'] },
+    files: { type: 'array', items: { type: 'string' } },
+    flags: { type: 'array', items: { type: 'string' } },
+  },
+}
+// A single CRITIC pass returns its findings (no verdict — only triage decides). Kept loose:
+// passes intentionally overlap, and dedup/severity is triage's job.
+// Only `summary` is required per finding; `file` and `severity` (High|Med|Low) are optional.
+const FINDINGS_SCHEMA = {
+  type: 'object',
+  required: ['schema', 'agent', 'story', 'pass', 'findings'],
+  additionalProperties: true,
+  properties: {
+    schema: { const: 1 },
+    agent: { type: 'string' },
+    story: { type: 'string' },
+    pass: { enum: ['blind', 'edge', 'acceptance'] },
+    findings: {
+      type: 'array',
+      items: {
+        type: 'object',
+        required: ['summary'],
+        properties: {
+          summary: { type: 'string' },
+          file: { type: 'string' },
+          severity: { enum: ['High', 'Med', 'Low'] },
+        },
+      },
+    },
+  },
+}
+// Shape of the resolve-graph output relayed by the resolve agent: the tasks that remain
+// (each with ref, agent, blockedBy; subject/description optional) and the refs skipped.
+const RESOLVED_GRAPH_SCHEMA = {
+  type: 'object',
+  required: ['tasks', 'skipped'],
+  properties: {
+    tasks: {
+      type: 'array',
+      items: {
+        type: 'object',
+        required: ['ref', 'agent', 'blockedBy'],
+        properties: {
+          ref: { type: 'string' },
+          agent: { type: 'string' },
+          subject: { type: 'string' },
+          description: { type: 'string' },
+          blockedBy: { type: 'array', items: { type: 'string' } },
+        },
+      },
+    },
+    skipped: { type: 'array', items: { type: 'string' } },
+  },
+}
+// Agent names treated as dev agents: resolved tasks whose `agent` is in this set form the
+// Build fan-out and are the agents asked to rework after a CRITIC rejection.
+const DEV_AGENTS = new Set(['BEND', 'FEND', 'IAC', 'DATA', 'DOCGEN', 'LIBDEV', 'MCP-DEV', 'MOBILE'])
+// CRITIC's three independent passes (step 3b). Each reads ONLY its own pass step file and
+// the diff/artifacts it is told to — never another pass's output — so they cannot anchor.
+// Fields: `pass` is the pass name (matches FINDINGS_SCHEMA.pass), `step` the step file under
+// steps/critic/, and `reads` the wording spliced into the pass prompt.
+const CRITIC_PASSES = [
+  { pass: 'blind', step: 'blind-hunt.md', reads: 'ONLY the git diff (do NOT read reqs-brief or qa-test-spec)' },
+  { pass: 'edge', step: 'edge-case-hunt.md', reads: 'the diff plus reqs-brief.md (hunt boundary/error/concurrency cases)' },
+  { pass: 'acceptance', step: 'acceptance-audit.md', reads: 'the diff plus qa-test-spec.md and reqs-brief.md (audit every AC)' },
+]
+// --- arg normalization: accept a batch or a single story ---------------------
+// `args` is not declared in this script; presumably it is injected by the Workflow runtime — confirm.
+const a = args || {}
+// Batch form: per-story projectType/profiles fall back to the top-level values.
+// A missing or EMPTY `stories` array falls through to the single-story form.
+const batch = Array.isArray(a.stories) && a.stories.length
+  ? a.stories.map((s) => ({
+      storyId: s.storyId,
+      projectType: s.projectType || a.projectType,
+      profiles: s.profiles || a.profiles || [],
+    }))
+  : [{ storyId: a.storyId, projectType: a.projectType, profiles: a.profiles || [] }]
+// Shared cap for every rejection loop (READINESS and CRITIC); `??` keeps an explicit 0.
+const maxRejectionCycles = a.maxRejectionCycles ?? 5
+// Validate the whole batch up front so a bad entry fails before any agent is spawned.
+for (const s of batch) {
+  if (!s.storyId || !s.projectType) {
+    throw new Error('each story needs { storyId, projectType }; profiles[] optional')
+  }
+}
+// --- prompt builder: mirrors providers/claude-code/spawn.template.md so spawned agents
+//     get full pipeline context (core prompt + shared context + step-at-execution + the
+//     handoff contract), not a terse one-liner. ------------------------------------------
+// Returns one newline-joined prompt string. `trigger`, `completion` and `returnContract`
+// each fall back to a default sentence when omitted. `taskRef` only adds a "Task <ref>: "
+// prefix; the spawn() helper visible in this file never passes it.
+function buildPrompt({ role, promptFile, storyId, taskRef, taskSubject, trigger, completion, returnContract }) {
+  // All per-story artifacts are addressed relative to this directory.
+  const outputDir = `stories/${storyId}/output`
+  return [
+    `You are **${role}**, for story ${storyId} in the valent-pipeline.`,
+    '',
+    '## Setup',
+    `1. Read your core prompt: \`.valent-pipeline/prompts/${promptFile}\` — identity, protocols, step sequence.`,
+    `2. Read shared context: \`${outputDir}/pipeline-context.md\` (and correction directives if present).`,
+    '3. Read each step file at the point of execution, not before. Check decision gates first.',
+    '',
+    '## Task Assignment',
+    `${taskRef ? `Task ${taskRef}: ` : ''}${taskSubject}`,
+    '',
+    '## Trigger',
+    trigger || 'Begin now.',
+    '',
+    '## On Completion',
+    completion ||
+      'Write your handoff artifact as usual.',
+    '',
+    returnContract ||
+      'Return ONLY the fields of your `valent:handoff` machine block as a JSON object.',
+  ].join('\n')
+}
+// A gate's verdict must satisfy the pass-invariant even though the schema can't express the
+// implication directly. This is the KANBAN-002 guard, enforced in the orchestrator.
+// Throws when verdict is 'pass' while highFindingsOpen > 0; otherwise returns `v` unchanged
+// so the call can wrap an awaited agent result inline. `gate` is only used in the message.
+function assertGate(v, gate) {
+  if (v.verdict === 'pass' && v.highFindingsOpen > 0) {
+    throw new Error(`${gate}: illegal verdict — pass with highFindingsOpen=${v.highFindingsOpen}`)
+  }
+  return v
+}
+// ---------------------------------------------------------------------------
+// Sprint loop: stories run strictly one after another — each runStory() is awaited before
+// the next begins. runStory is a function declaration further down, so it is hoisted and
+// callable here.
+const results = []
+for (let i = 0; i < batch.length; i++) {
+  if (i > 0) log(`--- story boundary: shared branch advances to ${batch[i].storyId} (sequential) ---`)
+  results.push(await runStory(batch[i]))
+}
+const shippedCount = results.filter((r) => r.shipped).length
+log(`sprint complete: ${shippedCount}/${results.length} shipped`)
+// Workflow result: `shipped` is true only when every story shipped; anything not shipped
+// is counted as rolled over.
+return {
+  shipped: results.every((r) => r.shipped),
+  stories_shipped: shippedCount,
+  stories_rolled_over: results.length - shippedCount,
+  results,
+}
+// ===========================================================================
+// runStory: the per-story pipeline (kept inline, not a nested workflow(), so the single
+// workflow() nesting level stays free for plan/retro — see reimplementation-plan §5b).
+async function runStory(story) {
+  const { storyId, projectType, profiles } = story
+  const profilesCsv = profiles.join(',')
+  phase('Resolve')
+  // The script cannot run the CLI itself; an agent runs resolve-graph and returns its JSON.
+  const graph = await agent(
+    `Run exactly: \`valent-pipeline resolve-graph --type ${projectType} --profiles ${profilesCsv}\` ` +
+      `in the project root for story ${storyId} and return its stdout JSON verbatim (fields: tasks, skipped).`,
+    { label: `resolve:${storyId}`, phase: 'Resolve', schema: RESOLVED_GRAPH_SCHEMA },
+  )
+  const has = (ref) => graph.tasks.some((t) => t.ref === ref)
+  const devTasks = graph.tasks.filter((t) => DEV_AGENTS.has(t.agent))
+  log(`${storyId}: resolved ${graph.tasks.length} tasks; skipped: ${graph.skipped.join(', ') || 'none'}`)
+  const spawn = (role, promptFile, taskSubject, opts = {}) =>
+    agent(buildPrompt({ role, promptFile, storyId, taskSubject, ...opts }), {
+      label: opts.label || `${role.toLowerCase()}:${storyId}`,
+      phase: opts.phase,
+      schema: opts.schema || HANDOFF_SCHEMA,
+    })
+  phase('Spec')
+  await spawn('REQS', 'reqs.md', 'Analyze the story and produce reqs-brief.md.', { phase: 'Spec' })
+  if (has('uxa')) {
+    await spawn('UXA', 'uxa.md', 'Translate the brief into uxa-spec.md.', { phase: 'Spec' })
+  }
+  await spawn('QA-A', 'qa-a.md', 'Produce qa-test-spec.md before any code is written.', { phase: 'Spec' })
+  phase('Readiness')
+  await runGate(storyId, 'READINESS', 'readiness.md', 'Validate the spec chain (reqs/uxa/qa) is implementation-ready.',
+    'Readiness', (verdict) => {
+      const target = verdict.rejectionTarget || 'REQS'
+      return spawn(target, `${target.toLowerCase()}.md`, 'Address the READINESS rejection and rewrite the affected spec.', {
+        label: `rework:${target.toLowerCase()}:${storyId}`,
+        phase: 'Readiness',
+      })
+    })
+  phase('Build')
+  // Genuine barrier: CRITIC needs ALL active dev agents' work before reviewing.
+  await parallel(
+    devTasks.map((t) => () =>
+      spawn(t.agent, `${t.agent.toLowerCase()}.md`, t.subject || 'Implement production code and tests per the brief and test spec.', {
+        label: `build:${t.ref}:${storyId}`,
+        phase: 'Build',
+      })),
+  )
+  phase('Critic')
+  await runCriticGate(storyId, devTasks)
+  phase('QA')
+  await spawn('QA-B', 'qa-b.md', 'Execute the full test suite, file bugs, build the traceability matrix.', { phase: 'QA' })
+  phase('Judge')
+  const decision = await runGate(storyId, 'JUDGE', 'judge.md',
+    'Review evidence (tests, traceability, bugs) and make the ship decision.', 'Judge', null)
+  return { storyId, shipped: decision.verdict === 'pass', verdict: decision.verdict, skipped: graph.skipped }
+  // --- per-story closures over storyId/devTasks ----------------------------
+  // runGate: a schema-validated verification stage with a code-owned rejection loop.
+  // `reworkThunk` (or null for terminal gates) produces the fix work before re-gating.
+  async function runGate(sid, role, promptFile, instruction, gatePhase, reworkThunk) {
+    let rejections = 0
+    while (true) {
+      const verdict = assertGate(
+        await spawn(role, promptFile, instruction, { label: `gate:${role.toLowerCase()}:${sid}`, phase: gatePhase, schema: VERDICT_SCHEMA }),
+        role,
+      )
+      if (verdict.verdict === 'pass') return verdict
+      if (!reworkThunk) return verdict // terminal gate (JUDGE): reject is the answer, Lead diagnoses
+      rejections += 1
+      if (rejections >= maxRejectionCycles) {
+        log(`${sid}/${role}: circuit breaker tripped after ${rejections} rejections — escalating`)
+        return { ...verdict, escalated: true }
+      }
+      log(`${sid}/${role}: rejection ${rejections}/${maxRejectionCycles} — reworking`)
+      await reworkThunk(verdict)
+    }
+  }
+  // runCriticGate (step 3b): three INDEPENDENT passes in parallel, then a triage barrier
+  // that dedups and writes the verdict. The whole thing is wrapped in the code-owned
+  // rejection loop; on reject, the routed dev agents rework and the passes re-run.
+  async function runCriticGate(sid, devs) {
+    let rejections = 0
+    while (true) {
+      // Independent perspective-diverse verify: each pass reads only its own inputs.
+      await parallel(
+        CRITIC_PASSES.map((p) => () =>
+          spawn('CRITIC', 'critic.md',
+            `Run pass ${p.pass} per \`.valent-pipeline/steps/critic/${p.step}\`. Read ${p.reads}. ` +
+              `Do NOT read any other pass's output. Record findings only — do NOT deduplicate or set a verdict.`,
+            {
+              label: `critic:${p.pass}:${sid}`,
+              phase: 'Critic',
+              schema: FINDINGS_SCHEMA,
+              returnContract: 'Return ONLY { schema:1, agent:"critic", story, pass, findings:[...] } as JSON.',
+            })),
+      )
+      // Triage barrier: the single point of deduplication; produces the schema-validated verdict.
+      const verdict = assertGate(
+        await spawn('CRITIC', 'critic.md',
+          'Triage per `.valent-pipeline/steps/critic/triage.md`: gather findings from ALL three passes, ' +
+            'collapse duplicates (same root cause) into one, classify final severity, then write the verdict.',
+          { label: `gate:critic:${sid}`, phase: 'Critic', schema: VERDICT_SCHEMA }),
+        'CRITIC',
+      )
+      if (verdict.verdict === 'pass') return verdict
+      rejections += 1
+      if (rejections >= maxRejectionCycles) {
+        log(`${sid}/CRITIC: circuit breaker tripped after ${rejections} rejections — escalating`)
+        return { ...verdict, escalated: true }
+      }
+      log(`${sid}/CRITIC: rejection ${rejections}/${maxRejectionCycles} — reworking`)
+      // Route fixes to the owning dev agent(s), then re-run the passes.
+      await parallel(
+        devs.map((t) => () =>
+          spawn(t.agent, `${t.agent.toLowerCase()}.md`, 'Fix every High finding CRITIC routed to you.', {
+            label: `rework:${t.ref}:${sid}`,
+            phase: 'Critic',
+          })),
+      )
+    }
+  }
+}

package/pipeline/orchestrators/codex/README.md ADDED Viewed

@@ -0,0 +1,52 @@
+# Codex orchestrator (thin prose Lead)
+This is the Codex deployment of the valent-pipeline orchestrator, per the hybrid target in
+[`../../../docs-feedback/reimplementation-plan.md`](../../../docs-feedback/reimplementation-plan.md)
+(R3): the Claude Code provider runs a deterministic **Workflow script**
+(`../claude-code/`), while the Codex provider runs a **thin prose Lead loop** — because Codex
+has no Workflow / team / inbox / cron primitives and genuinely needs an explicit loop. Both
+consume the same shared substrate (`prompts/`, `steps/`, `task-graphs/`, `schemas/`, templates).
+## The file
+| File | Role |
+|---|---|
+| `lead-loop.md` | The thin Codex orchestrator: resolve the DAG (CLI) → walk it honoring `blockedBy`, spawning Codex threads → validate each handoff (CLI) → gates with a code-owned rejection cap (CLI) → steer-when-alive. |
+## Status
+`lead-loop.md` is the **greenfield thin shell** that will replace the 1207-line `prompts/lead.md`
+for Codex. It is **opt-in and not the default**: `prompts/lead.md` remains the production Codex
+orchestrator until the step-9 live validation passes and step-11 cutover deletes it. It was
+written *from the plan*, not by trimming `lead.md` (the plan §0 rule — reading 1000+ lines of
+prose control flow re-absorbs the drift we're removing).
+## What it demonstrates (parity with the Claude Code engine, over the same substrate)
+| Concern | How (Codex shell) | Claude Code equivalent |
+|---|---|---|
+| DAG resolution | `valent-pipeline resolve-graph` (predicates + pruning in code) | same CLI, via an agent |
+| Quality gates | `valent-pipeline validate-handoff --gate` (schema + pass-invariant) | `agent(_, { schema })` + `assertGate()` |
+| Rejection cap | `valent-pipeline rejection-cap --increment` (file-backed counter, non-zero exit when tripped) | JS `while` loop with a code-owned counter (stops once `rejections >= maxRejectionCycles`) |
+| Spawn vs. steer | **decided once**: steer-when-alive, spawn-when-not | n/a (every `agent()` is a fresh spawn) |
+| State of record | `task-registry.yaml` + handoff files on disk | the run journal (`resumeFromRunId`) |
+The point of the hybrid: the **deterministic decisions are the same CLIs** in both shells. The
+shells differ only in their spawn/await primitives (JS `agent()`/`parallel()` vs. Codex threads).
+## The contradiction this resolves
+`lead.md`'s Codex rejection path said *spawn a new subagent* on rejection, while
+`providers/codex/runtime.md` (Thread Persistence) and `lead.md`'s own sprint-mode `[STORY-RESET]`
+rule say *steer the persistent thread*. `lead-loop.md` decides it once — **steer-when-alive,
+spawn-when-not** — and `lead.md`'s lone contradictory line is reconciled (see that file's Codex
+rejection section). The duplicated legacy prose is deleted at cutover.
+## Known simplifications (next slices)
+- The thin shell covers the per-story execute loop. The sprint/plan/retro meta loop for Codex
+  reuses the same CLIs (`sprint-pack` / `calibrate` / `validate-sprint` / `db embed`) the prose
+  Lead already calls; folding those into a thin `plan`/`retro` prose loop mirrors the Claude Code
+  `plan.workflow.js` / `retro.workflow.js` and lands with cutover.
+- Cloud-multi thread persistence is handled by the spawn-when-not branch (threads can't cross
+  container boundaries), matching `runtime.md`'s documented fallback.