npm - valent-pipeline - Versions diffs - 0.17.1 → 0.18.0 - Mend

valent-pipeline 0.17.1 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34)

package/README.md +3 -1
package/package.json +1 -1
package/pipeline/docs/agent-reference.md +5 -2
package/pipeline/docs/pipeline-overview.md +2 -2
package/pipeline/docs/task-graph.md +1 -1
package/pipeline/orchestrators/claude-code/plan.workflow.js +3 -1
package/pipeline/orchestrators/claude-code/sprint.workflow.js +3 -3
package/pipeline/prompts/cli-dev.md +61 -0
package/pipeline/prompts/critic.md +3 -3
package/pipeline/prompts/qa-a.md +1 -1
package/pipeline/prompts/qa-b.md +1 -1
package/pipeline/prompts/reqs.md +1 -1
package/pipeline/steps/cli-dev/estimate.md +49 -0
package/pipeline/steps/cli-dev/handoff.md +12 -0
package/pipeline/steps/cli-dev/implement.md +19 -0
package/pipeline/steps/cli-dev/read-inputs.md +13 -0
package/pipeline/steps/cli-dev/write-tests.md +22 -0
package/pipeline/steps/critic/cli-tool.md +25 -0
package/pipeline/steps/judge/evidence-review.md +1 -0
package/pipeline/steps/orchestration/validate-story-inputs.md +4 -1
package/pipeline/steps/qa-a/cli-tool.md +42 -0
package/pipeline/steps/qa-b/cli-tool.md +56 -0
package/pipeline/steps/qa-b/execute-tests.md +1 -0
package/pipeline/steps/qa-b/write-report.md +1 -1
package/pipeline/steps/reqs/cli-tool.md +46 -0
package/pipeline/task-graphs/cli-tool.yaml +60 -0
package/pipeline/templates/cli-dev-handoff.template.md +89 -0
package/pipeline/templates/reqs-brief.template.md +11 -0
package/skills/valent-configure/SKILL.md +2 -1
package/skills/valent-setup-backlog/SKILL.md +1 -0
package/src/commands/db-rebuild.js +1 -0
package/src/commands/init.js +2 -2
package/src/lib/config-schema.js +4 -4
package/src/lib/detect.js +2 -4

package/README.md CHANGED Viewed

@@ -50,7 +50,7 @@ Quality gates (**SPECCHECK**, **RED**, **STATIC**, **CRITIC**, **GREEN**, **EVID
 ## Project Types
-The pipeline supports 7 project types, each with a tailored task graph and specialized developer agent:
+The pipeline supports 9 project types, each with a tailored task graph and specialized developer agent:
 | Project Type | Developer Agent | Agents Skipped |
 |---|---|---|
@@ -61,6 +61,7 @@ The pipeline supports 7 project types, each with a tailored task graph and speci
 | `mcp-server` | MCP-DEV | UXA, FEND, PMCP |
 | `document-generation` | DOCGEN | UXA, FEND, PMCP |
 | `library` | LIBDEV | UXA, FEND, PMCP |
+| `cli-tool` | CLI-DEV | UXA, FEND, PMCP |
 | `mobile-app` | MOBILE | *(conditional)* |
 The workflow selects which agents to spawn based on `project.type` in your `pipeline-config.yaml` and the story's `testing_profiles` (resolved deterministically by `resolve-graph`).
@@ -94,6 +95,7 @@ Specialized agents that replace BEND for non-API project types:
 | DATA | Sonnet | `data-pipeline` | `data-handoff.md` |
 | MCP-DEV | Sonnet | `mcp-server` | `mcp-dev-handoff.md` |
 | LIBDEV | Sonnet | `library` | `libdev-handoff.md` |
+| CLI-DEV | Sonnet | `cli-tool` | `cli-dev-handoff.md` |
 | DOCGEN | Sonnet | `document-generation` | `docgen-handoff.md` |
 | IAC | Sonnet | Cross-cutting (any type) | `iac-handoff.md` |
 | MOBILE | Sonnet | `mobile-app` | `mobile-handoff.md` |

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "valent-pipeline",
-  "version": "0.17.1",
+  "version": "0.18.0",
   "description": "v3 multi-agent AI pipeline for software development lifecycle",
   "type": "module",
   "bin": {

package/pipeline/docs/agent-reference.md CHANGED Viewed

@@ -64,12 +64,13 @@ domain-specific QA-A / QA-B / CRITIC steps.
 | DATA | Sonnet | Data pipeline developer | `data-pipeline` | `reqs-brief.md`, `qa-test-spec.md` | `data-handoff.md` | ETL/transforms, idempotency, checkpointing, row-level logging |
 | MCP-DEV | Sonnet | Protocol developer | `mcp-server` | `reqs-brief.md`, `qa-test-spec.md` | `mcp-dev-handoff.md` | JSON-RPC/stdio, two-tier error model, tool registration |
 | LIBDEV | Sonnet | Library developer | `library` | `reqs-brief.md`, `qa-test-spec.md` | `libdev-handoff.md` | Public API, exports/packaging, CJS/ESM, semver, type declarations |
+| CLI-DEV | Sonnet | CLI developer | `cli-tool` | `reqs-brief.md`, `qa-test-spec.md` | `cli-dev-handoff.md` | Command surface, argv parsing, exit codes, stdout/stderr contracts, bin entry |
 | DOCGEN | Sonnet | Document generation developer | `document-generation` | `reqs-brief.md`, `qa-test-spec.md` | `docgen-handoff.md` | Template engine, render pipeline, encoding, assets |
 | IAC | Sonnet | Infrastructure developer | `iac` profile (cross-cutting, any type) | `reqs-brief.md`, `qa-test-spec.md` | `iac-handoff.md` | Terraform/Pulumi/CloudFormation, K8s, CI/CD, IAM |
 | MOBILE | Sonnet | Mobile developer | `mobile-app` | `reqs-brief.md`, `uxa-spec.md`, `qa-test-spec.md` | `mobile-handoff.md` | React Native/Flutter, Maestro E2E, emulator lifecycle, iOS deferral |
 **Notes:**
-- DATA, MCP-DEV, LIBDEV, DOCGEN each replace BEND in their dedicated task graph.
+- DATA, MCP-DEV, LIBDEV, CLI-DEV, DOCGEN each replace BEND in their dedicated task graph.
 - IAC is cross-cutting — it slots into ANY task graph when `iac` is in `testing_profiles`, running in parallel with the primary developer agent.
 - MOBILE replaces BEND for mobile-app projects; BEND can still be conditionally included when `testing_profiles` includes `api`.
 - See the agent prompts in `pipeline/prompts/` and step files in `pipeline/steps/` for full implementation details.
@@ -98,6 +99,7 @@ the story's `testing_profiles` do not require.
 | `mcp-server` | MCP-DEV | UXA, FEND | `mcp-server.yaml` |
 | `document-generation` | DOCGEN | UXA, FEND | `document-generation.yaml` |
 | `library` | LIBDEV | UXA, FEND | `library.yaml` |
+| `cli-tool` | CLI-DEV | UXA, FEND | `cli-tool.yaml` |
 | `mobile-app` | MOBILE (+ BEND if `api` profile) | _(conditional)_ | `mobile-app.yaml` |
 **Conditional agents (any project type):**
@@ -113,7 +115,7 @@ Default assignments from `pipeline-config.yaml` `models:` (validated by `src/lib
 | Tier | Agents | Use Case | Cost |
 |------|--------|----------|------|
 | Opus | INTEGRATION, CRITIC-TRIAGE, JUDGE-DECIDE | Quality-gate adjudication + the binding ship call | Highest |
-| Sonnet | REQS, UXA, QA-A, QA-B, BEND, FEND, DATA, MCP-DEV, LIBDEV, DOCGEN, IAC, MOBILE | Spec writing, implementation, test execution | Balanced |
+| Sonnet | REQS, UXA, QA-A, QA-B, BEND, FEND, DATA, MCP-DEV, LIBDEV, CLI-DEV, DOCGEN, IAC, MOBILE | Spec writing, implementation, test execution | Balanced |
 | Haiku | Knowledge, CLI-runner steps (resolve/pack/validate/calibrate/persist) | Mechanical retrieval and IO; no reasoning | Lowest |
 Embedding runs as the `db index-curated` CLI step, not an agent. Model assignments are configurable in
@@ -149,6 +151,7 @@ story's `testing_profiles`:
 | `data-pipeline` | `qa-a/data-pipeline.md` | `qa-b/data-pipeline.md` | `critic/data-pipeline.md` | `reqs/data-pipeline.md` |
 | `mcp-server` | `qa-a/mcp-server.md` | `qa-b/mcp-server.md` | `critic/mcp-server.md` | `reqs/mcp-server.md` |
 | `library` | `qa-a/library.md` | `qa-b/library.md` | `critic/library.md` | `reqs/library.md` |
+| `cli-tool` | `qa-a/cli-tool.md` | `qa-b/cli-tool.md` | `critic/cli-tool.md` | `reqs/cli-tool.md` |
 | `document-generation` | `qa-a/document-generation.md` | `qa-b/document-generation.md` | `critic/document-generation.md` | `reqs/document-generation.md` |
 | `iac` | `qa-a/iac.md` | `qa-b/iac.md` | `critic/iac.md` | `reqs/iac.md` |
 | `mobile-app` | `qa-a/mobile-app.md` | `qa-b/mobile-app.md` | `critic/mobile-app.md` | `reqs/mobile-app.md` |

package/pipeline/docs/pipeline-overview.md CHANGED Viewed

@@ -68,7 +68,7 @@ resolve-graph → REQS → UXA? → QA-A → SPECCHECK → RED → dev fan-out
 4. **SPECCHECK** validates the spec chain mechanically (`valent spec check` artifact matrix + `trace check` AC coverage); rework routes to the CLI-named owner. **RED** (ATDD) then proves the acceptance suite fails pre-implementation and freezes it by hash; **GREEN** (post-CRITIC) proves the implementation satisfies it — red+green+diff is the story proof object (`proof.json`).
-5. **Dev fan-out** implements production code and tests in parallel. The dev agents (BEND/FEND/DATA/MCP-DEV/LIBDEV/DOCGEN/IAC/MOBILE) are selected by the story's `testing_profiles` and the resolved task graph. All read the reqs brief and test spec.
+5. **Dev fan-out** implements production code and tests in parallel. The dev agents (BEND/FEND/DATA/MCP-DEV/LIBDEV/CLI-DEV/DOCGEN/IAC/MOBILE) are selected by the story's `testing_profiles` and the resolved task graph. All read the reqs brief and test spec.
 6. **CRITIC** runs three independent parallel review passes (blind hunt, edge-case hunt, acceptance audit) followed by triage and a verdict. Rejects code back to the dev agents if High-severity findings exist.
@@ -133,7 +133,7 @@ This invokes the workflow, which reads the story input from `stories/STORY-ID/`,
 Run `/valent-configure` to interactively set:
-- **Project type** -- fullstack-web, backend-api, frontend-only, data-pipeline, mcp-server, document-generation, library
+- **Project type** -- fullstack-web, backend-api, frontend-only, data-pipeline, mcp-server, document-generation, library, cli-tool, mobile-app
 - **Tech stack** -- language, frameworks, test tools, browser automation
 - **Model assignments** -- the `models:` tier→roles map (opus/sonnet/haiku) controlling which Claude tier each agent runs on
 - **Quality thresholds** -- max rejection cycles, retrospective frequency, stall detection timeout

package/pipeline/docs/task-graph.md CHANGED Viewed

@@ -12,7 +12,7 @@ The dev task graph is **not** assembled at runtime from a manifest. It is a stat
 pipeline/task-graphs/<project-type>.yaml
 ```
-(e.g. `fullstack-web.yaml`, `backend-api.yaml`, `frontend-only.yaml`, `library.yaml`, `mcp-server.yaml`, `data-pipeline.yaml`, `document-generation.yaml`, `mobile-app.yaml`).
+(e.g. `fullstack-web.yaml`, `backend-api.yaml`, `frontend-only.yaml`, `library.yaml`, `cli-tool.yaml`, `mcp-server.yaml`, `data-pipeline.yaml`, `document-generation.yaml`, `mobile-app.yaml`).
 Each file contains a `tasks:` DAG. Every task declares:

package/pipeline/orchestrators/claude-code/plan.workflow.js CHANGED Viewed

@@ -149,7 +149,8 @@ const VALIDATE_SCHEMA = {
 // Which estimator agent owns each testing profile.
 const PROFILE_ESTIMATORS = {
   api: 'BEND', ui: 'FEND', 'data-pipeline': 'DATA', 'mcp-server': 'MCP-DEV',
-  library: 'LIBDEV', 'document-generation': 'DOCGEN', iac: 'IAC',
+  library: 'LIBDEV', 'cli-tool': 'CLI-DEV', 'document-generation': 'DOCGEN', iac: 'IAC',
+  'mobile-app': 'MOBILE', // was missing — a mobile-only story sized to 0 points
 }
 // --- args ---
@@ -339,6 +340,7 @@ if (toGroom.length) {
       '- `data-pipeline` — ETL, data transformation, or batch processing',
       '- `mcp-server` — MCP server tools, handlers, or protocol work',
       '- `library` — shared library/package (exports, packaging, versioning)',
+      '- `cli-tool` — command-line surface (subcommands, flags, exit codes, terminal output contracts)',
       '- `document-generation` — document/report template or generation pipeline work',
       '- `iac` — infrastructure (Terraform, CloudFormation, Kubernetes, CI/CD)',
       "Tag a profile only when the story OWNS that surface. A story that merely CONSUMES another story's API endpoint (no endpoint/DB change of its own) is NOT `api`.",

package/pipeline/orchestrators/claude-code/sprint.workflow.js CHANGED Viewed

@@ -545,7 +545,7 @@ const RESUME_CHECK_SCHEMA = {
   },
 }
-const DEV_AGENTS = new Set(['BEND', 'FEND', 'IAC', 'DATA', 'DOCGEN', 'LIBDEV', 'MCP-DEV', 'MOBILE'])
+const DEV_AGENTS = new Set(['BEND', 'FEND', 'IAC', 'DATA', 'DOCGEN', 'LIBDEV', 'CLI-DEV', 'MCP-DEV', 'MOBILE'])
 // CRITIC's three independent passes (step 3b). Each reads ONLY its own pass step file and
 // the diff/artifacts it is told to — never another pass's output — so they cannot anchor.
@@ -886,7 +886,7 @@ const DEFAULT_MODELS = {
   JUDGE: 'opus', 'JUDGE-EVIDENCE': 'sonnet', 'JUDGE-DECIDE': 'opus',
   REQS: 'sonnet', UXA: 'sonnet', 'QA-A': 'sonnet', 'QA-B': 'sonnet',
   BEND: 'sonnet', FEND: 'sonnet', DATA: 'sonnet', 'MCP-DEV': 'sonnet',
-  LIBDEV: 'sonnet', DOCGEN: 'sonnet', IAC: 'sonnet', MOBILE: 'sonnet',
+  LIBDEV: 'sonnet', 'CLI-DEV': 'sonnet', DOCGEN: 'sonnet', IAC: 'sonnet', MOBILE: 'sonnet',
   // PMCP is mechanical: crawl each route, run the exact checklist checkpoints, screenshot, and check
   // against explicit pass criteria — no open-ended judgment, so the cheapest tier (like STATIC/RESOLVE).
   // PERSIST is the sprint-end backlog writer — mechanical YAML edit, same tier as in plan/retro.
@@ -967,7 +967,7 @@ const reasoningFor = (role) => REASONING[String(role).toUpperCase()]
 // without Ref configured — which is why it can default ON: it only ever helps, never blocks. From
 // pipeline-config.yaml `ref` ({ enabled?, roles? }); set enabled:false to suppress it, or override
 // `roles` to change which agents get it. Static + args only => journal-replay safe.
-const DEFAULT_REF_ROLES = ['BEND', 'FEND', 'DATA', 'MCP-DEV', 'LIBDEV', 'DOCGEN', 'IAC', 'MOBILE', 'CRITIC']
+const DEFAULT_REF_ROLES = ['BEND', 'FEND', 'DATA', 'MCP-DEV', 'LIBDEV', 'CLI-DEV', 'DOCGEN', 'IAC', 'MOBILE', 'CRITIC']
 function buildRefRoleSet(cfg) {
   if (cfg && cfg.enabled === false) return new Set()
   const roles = cfg && Array.isArray(cfg.roles) && cfg.roles.length ? cfg.roles : DEFAULT_REF_ROLES

package/pipeline/prompts/cli-dev.md ADDED Viewed

@@ -0,0 +1,61 @@
+# CLI-DEV
+<!-- Prompt version: 1.0 | spawned per task by the Workflow orchestrator -->
+You are CLI-DEV, the command-line tool developer agent. You implement command surfaces: subcommands, flags, argument parsing, exit codes, stdout/stderr contracts, and the executable bin entry.
+Read `.valent-pipeline/steps/common/agent-protocol.md` for Communication Standard, Context Discipline, Design Council Protocol, Knowledge-First Principle, Correction Directives, and YAML Frontmatter.
+## Task Lifecycle
+You are spawned for exactly one task; your spawn instructions are your trigger. The orchestrator has already run the SPECCHECK spec gate before spawning you.
+- **On completion:** Write your handoff file with verdict, then return your machine block. The orchestrator routes it to CRITIC.
+- **On rejection rework:** Your spawn instructions carry CRITIC's open fix-list. Read `critic-review.md`, fix the code, write an updated handoff.
+- **On bug fix:** Your spawn instructions name the bug(s). Fix them, update your handoff; QA-B re-verifies on its next run.
+- **Escalate:** Write `status: blocked` (with the blocker) to your output frontmatter and return. The orchestrator reads it.
+## Context
+- **Story:** {story_id}
+- **Language:** {tech_stack.language}
+- **Package manager:** {tech_stack.package_manager}
+- **Module system:** {tech_stack.module_system}
+- **Type system:** {tech_stack.type_system}
+- **Unit test framework:** {tech_stack.test_framework_unit}
+- **Project type:** {project_type}
+## Inputs
+| Artifact | Purpose |
+|----------|---------|
+| `reqs-brief.md` | Acceptance criteria, business rules, command surface, flag contracts, exit-code semantics |
+| `qa-test-spec.md` | Behavioral test specifications for each AC -- what tests to write |
+## Output
+Write `cli-dev-handoff.md` using the template at `.valent-pipeline/templates/cli-dev-handoff.template.md`. Update YAML frontmatter as you complete each step.
+## Quality Standards
+Read `.valent-pipeline/steps/common/quality-standards.md` for universal standards enforced by CRITIC and QA-B.
+Additional CLI-DEV-specific standards:
+- **Exit codes are the API** -- 0 means success and ONLY success; every failure mode exits non-zero. A command that prints an error and exits 0 is a defect, full stop.
+- **stdout/stderr discipline** -- results (the output a script would pipe or parse) go to stdout; diagnostics, progress, and warnings go to stderr. Machine-readable output modes (`--json` etc.) must stay byte-clean on stdout.
+- **Help text matches behavior** -- every documented command, flag, and default does what `--help` says; every implemented flag appears in help. Unknown flags/subcommands error non-zero, never silently ignore.
+- **Non-interactive safe** -- the tool must never hang waiting for input when stdin is not a TTY; everything a prompt asks for is also settable by flag or environment variable.
+- **Destructive actions guarded** -- deletes/overwrites/irreversible operations require an explicit flag (`--force`/`--yes`) or refuse in non-interactive mode; the refusal names the flag.
+- **The bin entry works installed** -- the package.json `bin` mapping resolves, the entry has the correct shebang, and the tool runs via the package-manager execution path (`npx <name>`/equivalent), not just `node src/...`.
+## Step Sequence
+Update `stepsCompleted` and `pendingSteps` in frontmatter as you progress.
+### Steps
+| Step | File | Summary |
+|------|------|---------|
+| 1. Read Inputs | `.valent-pipeline/steps/cli-dev/read-inputs.md` | Read reqs-brief, qa-test-spec, correction directives, knowledge queries |
+| 2. Implement | `.valent-pipeline/steps/cli-dev/implement.md` | Command surface, argument parsing, exit codes, output contracts, bin entry |
+| 3. Write Tests | `.valent-pipeline/steps/cli-dev/write-tests.md` | Black-box subprocess tests, exit-code/output assertions, execution |
+| 4. Handoff | `.valent-pipeline/steps/cli-dev/handoff.md` | Write cli-dev-handoff.md, final verification |

package/pipeline/prompts/critic.md CHANGED Viewed

@@ -53,10 +53,10 @@ After triage-depth, execute only the passes indicated by your selected depth lev
 | 2. Pass 1: Blind Hunt | `.valent-pipeline/steps/critic/blind-hunt.md` | standard, deep |
 | 2b. Query Knowledge Base | (inline) | Always |
 | 3. Pass 2: Edge Case Hunt | `.valent-pipeline/steps/critic/edge-case-hunt.md` | deep only |
-| 3b. Load profile steps for edge-case-hunt | Conditional per `{testing_profiles}`: `.valent-pipeline/steps/critic/data-pipeline.md`, `mcp-server.md`, `library.md`, `document-generation.md`, `iac.md`, `mobile-app.md` | deep only |
+| 3b. Load profile steps for edge-case-hunt | Conditional per `{testing_profiles}`: `.valent-pipeline/steps/critic/data-pipeline.md`, `mcp-server.md`, `library.md`, `cli-tool.md`, `document-generation.md`, `iac.md`, `mobile-app.md` | deep only |
 | 4. Pass 3: Acceptance Audit | `.valent-pipeline/steps/critic/acceptance-audit.md` | Always |
 | 5. Test Code Review | `.valent-pipeline/steps/critic/test-review.md` | standard, deep |
-| 5b. Load profile steps for test-review | Conditional per `{testing_profiles}`: `.valent-pipeline/steps/critic/data-pipeline.md`, `mcp-server.md`, `library.md`, `document-generation.md`, `iac.md`, `mobile-app.md` | standard, deep |
+| 5b. Load profile steps for test-review | Conditional per `{testing_profiles}`: `.valent-pipeline/steps/critic/data-pipeline.md`, `mcp-server.md`, `library.md`, `cli-tool.md`, `document-generation.md`, `iac.md`, `mobile-app.md` | standard, deep |
 | 6. Triage | `.valent-pipeline/steps/critic/triage.md` | Always |
 | 7. Write Verdict | `.valent-pipeline/steps/critic/write-verdict.md` | Always |
@@ -67,7 +67,7 @@ The story runs on its own branch off the target branch (git flow — `.valent-pi
 Read the curated knowledge files (directory named in your spawn instructions, Setup step 2b) for recurring code quality issues, known anti-patterns, and correction directives relevant to CRITIC reviewing code for {story_id}. Also run `node .valent-pipeline/bin/cli.js db search --query "<topic>"` for prior-story lessons, and `node .valent-pipeline/bin/cli.js db query-directives --agent CRITIC`. If no relevant knowledge found, proceed without.
 ### Step 3b: Load Profile Steps for Edge Case Hunt (Conditional)
-For edge-case-hunt, also read profile-specific step files based on `{testing_profiles}`: `.valent-pipeline/steps/critic/data-pipeline.md`, `mcp-server.md`, `library.md`, `document-generation.md`, `iac.md`, `mobile-app.md`. If a profile step file does not exist, note it and proceed. Apply domain-specific focus areas alongside the generic ones.
+For edge-case-hunt, also read profile-specific step files based on `{testing_profiles}`: `.valent-pipeline/steps/critic/data-pipeline.md`, `mcp-server.md`, `library.md`, `cli-tool.md`, `document-generation.md`, `iac.md`, `mobile-app.md`. If a profile step file does not exist, note it and proceed. Apply domain-specific focus areas alongside the generic ones.
 ### Step 5b: Load Profile Steps for Test Review (Conditional)
 For test-review, also read profile-specific step files based on `{testing_profiles}` from `.valent-pipeline/steps/critic/`. Apply domain-specific test review criteria alongside the generic ones. If a profile step file does not exist, note it and proceed.

package/pipeline/prompts/qa-a.md CHANGED Viewed

@@ -55,7 +55,7 @@ Always include this table in the output for downstream agent calibration.
 | 1b | Query knowledge base | `.valent-pipeline/steps/qa-a/read-inputs.md` |
 | 2 | Risk classification per AC | `.valent-pipeline/steps/qa-a/read-inputs.md` |
 | 3 | Write Given-When-Then test cases | `.valent-pipeline/steps/qa-a/write-spec.md` |
-| 3b | Load testing profile step files | Conditional per `{testing_profiles}`: `.valent-pipeline/steps/qa-a/api.md`, `ui.md`, `data-pipeline.md`, `mcp-server.md`, `library.md`, `document-generation.md`, `iac.md` |
+| 3b | Load testing profile step files | Conditional per `{testing_profiles}`: `.valent-pipeline/steps/qa-a/api.md`, `ui.md`, `data-pipeline.md`, `mcp-server.md`, `library.md`, `cli-tool.md`, `document-generation.md`, `iac.md` |
 | 4 | Database state verification | `.valent-pipeline/steps/qa-a/write-spec.md` |
 | 5 | Seed data and fixture requirements | `.valent-pipeline/steps/qa-a/write-spec.md` |
 | 6 | Negative and edge case tests (P0-P1) | `.valent-pipeline/steps/qa-a/write-spec.md` |

package/pipeline/prompts/qa-b.md CHANGED Viewed

@@ -46,7 +46,7 @@ Write outputs to `{story_output_dir}/` using templates:
 | 2 | Read CRITIC review | `.valent-pipeline/steps/qa-b/execute-tests.md` |
 | 3 | Discover implemented tests | `.valent-pipeline/steps/qa-b/execute-tests.md` |
 | 4 | Run full test suite | `.valent-pipeline/steps/qa-b/execute-tests.md` |
-| 4b | Load and execute testing profile steps | Conditional per `{testing_profiles}`: `.valent-pipeline/steps/qa-b/api.md`, `ui.md`, `data-pipeline.md`, `mcp-server.md`, `library.md`, `document-generation.md`, `iac.md` |
+| 4b | Load and execute testing profile steps | Conditional per `{testing_profiles}`: `.valent-pipeline/steps/qa-b/api.md`, `ui.md`, `data-pipeline.md`, `mcp-server.md`, `library.md`, `cli-tool.md`, `document-generation.md`, `iac.md` |
 | 5 | Spec-implementation alignment check | `.valent-pipeline/steps/qa-b/execute-tests.md` |
 | 6 | Build traceability matrix | `.valent-pipeline/steps/qa-b/write-report.md` |
 | 7 | File bugs | `.valent-pipeline/steps/qa-b/file-bugs.md` |

package/pipeline/prompts/reqs.md CHANGED Viewed

@@ -23,7 +23,7 @@ Write output to `{story_output_dir}/reqs-brief.md` using the template at `.valen
 - `{story_id}`, `{story_output_dir}`, `knowledge/correction-directives.yaml`
 - `{tech_stack.language}`, `{tech_stack.backend_framework}`, `{tech_stack.frontend_framework}`
 - `{tech_stack.database}`
-- `{project_type}` -- fullstack-web | backend-api | frontend-only | data-pipeline | mcp-server | library | document-generation | mobile-app
+- `{project_type}` -- fullstack-web | backend-api | frontend-only | data-pipeline | mcp-server | library | cli-tool | document-generation | mobile-app
 - `{testing_profiles}` -- active testing profiles (e.g., `[api]`, `[api, ui]`, `[data-pipeline]`). Determines which domain step files to load.
 ## Step Sequence

package/pipeline/steps/cli-dev/estimate.md ADDED Viewed

@@ -0,0 +1,49 @@
+# CLI Tool Estimation
+**Purpose:** Assign a Fibonacci story point estimate for CLI implementation complexity. This is a lightweight estimation step — no code tools, no implementation. Read specs, assess complexity, output a number with rationale.
+**Fibonacci scale:** 1, 2, 3, 5, 8, 13, 21
+## Step 1: Read Groomed Specs
+Read and assess:
+- `{story_output_dir}/reqs-brief.md` — REQUIRED
+- `{story_output_dir}/qa-test-spec.md` — REQUIRED
+## Step 2: Assess Complexity Factors
+Evaluate each factor and record your assessment:
+| Factor | Assessment | Weight |
+|--------|-----------|--------|
+| **AC count and complexity** | How many ACs? Are they simple (add a flag) or complex (new subcommand family, breaking surface change)? | High |
+| **New patterns vs established** | Greenfield (new command framework, new output mode) vs incremental (extend an existing subcommand)? | High |
+| **Command surface** | How many new subcommands/flags? Interactions between flags? Breaking changes requiring migration/deprecation? | Medium |
+| **Contract complexity** | Distinct exit-code semantics? Machine-readable output modes? stdin/pipe handling? Cross-platform shell concerns? | Medium |
+| **Test complexity** | How hard will subprocess tests be? Long-running/stateful commands? Environment/filesystem fixtures? | Medium |
+## Step 3: Select Fibonacci Value
+Map your assessment to the Fibonacci scale:
+| Points | Typical CLI Scope |
+|--------|-------------------|
+| 1 | Single flag addition or help-text fix, no contract change |
+| 2 | Simple new flag with behavior, trivial output change |
+| 3 | New subcommand with a few flags, moderate test coverage |
+| 5 | New subcommand family or machine-readable output mode, exit-code rework, subprocess test suite |
+| 8 | Major surface change, breaking changes with deprecation path, cross-platform contract work |
+| 13 | Large CLI restructure, new dispatch/framework layer, extensive contract surface |
+| 21 | Epic-scale: new tool or complete command-surface overhaul (consider splitting the story) |
+**Calibration context (if `{estimation_model}` is `calibrated`):**
+If calibration directives are provided in `knowledge/correction-directives.yaml`, factor them into your estimate. These are learned patterns from prior sprints — e.g., "stories adding output modes consistently under-pointed by 1 tier."
+## Step 4: Write Estimate
+Write to `{story_output_dir}/cli-dev-estimation.md` using `.valent-pipeline/templates/estimation.template.md`:
+- Fibonacci value with brief rationale (2-3 sentences)
+- Factor assessments from Step 2
+- Calibration adjustments applied (if any)
+Send: `[ESTIMATION] CLI-DEV estimates {story_id} at {points} points. See cli-dev-estimation.md.`

package/pipeline/steps/cli-dev/handoff.md ADDED Viewed

@@ -0,0 +1,12 @@
+# CLI-DEV Step: Handoff
+Read `.valent-pipeline/steps/common/distilled-handoff-format.md` before writing output.
+## Pre-handoff smudge check
+Before writing the handoff, re-read the pitfalls primer for your stack in the curated knowledge directory (named in your spawn instructions, Setup step 2b) if one exists (e.g. `pitfalls-<profile>-<stack>.md`) and scan *your diff* against each item. Fix any gaps now — a fix is cheap here, but costs a reject→rework cycle after handoff. The primer is a starter, not exhaustive: also handle obvious similar issues that aren't listed. This is a ~10-second self-pass, not a substitute for CRITIC.
+## Step 13: Write cli-dev-handoff.md
+Complete all sections of the handoff document using the template at `.valent-pipeline/templates/cli-dev-handoff.template.md`. Set `status: completed` in frontmatter.
+## Independent Verification Requirement
+You must independently verify: all tests pass, the bin entry launches via the package-manager exec path, `--help`/`--version` exit 0 and are accurate, every failure path exits non-zero, and no touched command blocks on a prompt without a TTY -- before marking your task complete. Do not rely on CRITIC or QA-B to catch your failures.

package/pipeline/steps/cli-dev/implement.md ADDED Viewed

@@ -0,0 +1,19 @@
+# CLI-DEV Step: Implement
+## Step 4: Plan implementation approach
+Order: command surface design (subcommands/flags/arguments) -> core command logic -> exit-code and error paths -> output contracts (stdout/stderr, machine-readable modes) -> bin entry and packaging. Identify which existing commands or shared helpers the story touches.
+## Step 5: Design the command surface
+Per reqs-brief: define each subcommand, its flags (long form always; short aliases only where the brief asks), argument shapes, and defaults. Every flag must be intentional -- no dead options, no undocumented behavior. Record in `cli-dev-handoff.md#command-surface`.
+## Step 6: Implement command logic
+Per reqs-brief: implement the commands' core logic. Keep parsing/validation at the edge and testable logic in importable modules -- the bin entry should be a thin dispatcher. Record in `cli-dev-handoff.md#files-created-modified`.
+## Step 7: Implement exit codes and error paths
+Per reqs-brief: every failure mode exits non-zero with a one-line actionable error on stderr. Distinct exit codes where the brief assigns semantics to them. Never `process.exit(0)`/return-success after printing an error. Unknown flags or subcommands must error, not pass through.
+## Step 8: Implement output contracts
+Results to stdout; diagnostics/progress/warnings to stderr. If the brief specifies a machine-readable mode (`--json`, `--quiet`, etc.), its stdout must contain ONLY the contracted output -- no banners, no color codes when piped (honor NO_COLOR / non-TTY detection where the project does).
+## Step 9: Wire the bin entry and verify non-interactive safety
+Configure the package `bin` mapping and shebang; verify the tool launches via the package-manager exec path (`npx <name>` or equivalent), `--help` and `--version` exit 0 with accurate text. Verify no code path blocks on a prompt when stdin is not a TTY -- every interactive question has a flag/env override, and destructive actions refuse (naming the flag) rather than hang or proceed. Record decisions in `cli-dev-handoff.md#implementation-decisions`.

package/pipeline/steps/cli-dev/read-inputs.md ADDED Viewed

@@ -0,0 +1,13 @@
+# CLI-DEV Step: Read Inputs
+## Step 1: Read reqs-brief.md
+Understand: acceptance criteria, business rules, command surface requirements (subcommands, flags, arguments, defaults), exit-code semantics, output contracts (human vs machine-readable modes), interactivity constraints, cross-cutting concerns.
+## Step 2: Read qa-test-spec.md
+Understand: what tests to write for each AC, expected assertions, subprocess-invocation requirements, test case names and structure.
+## Step 3: Read correction directives
+Read `knowledge/correction-directives.yaml`. Apply all directives targeting CLI-DEV. Note any conflicts with default behavior and follow the directive.
+## Step 3b: Query Knowledge Base
+Read the curated knowledge files (directory named in your spawn instructions, Setup step 2b) for codebase conventions, implementation patterns, and known pitfalls relevant to CLI-DEV implementing {story_id} using {tech_stack.language} with {tech_stack.module_system} module system and {tech_stack.type_system}. Also run `node .valent-pipeline/bin/cli.js db search --query "<topic>"` for prior-story lessons, and `node .valent-pipeline/bin/cli.js db query-directives --agent CLI-DEV`. If no relevant knowledge found, proceed without.

package/pipeline/steps/cli-dev/write-tests.md ADDED Viewed

@@ -0,0 +1,22 @@
+# CLI-DEV Step: Write Tests
+## Rule 0 (ATDD): Acceptance tests are READ-ONLY
+When the story has QA-A-authored acceptance tests (manifest cases with `kind: acceptance` — files under the configured acceptance dir for this story plus any manifest-declared helpers), you MUST NOT edit, move, delete, or skip them. They are the spec. Every acceptance source is snapshot-hashed at the red run; the green gate compares hashes and ANY change auto-REJECTs the story to QA-A arbitration — making an acceptance test pass by changing it is mechanically impossible. If you believe an acceptance test is wrong, say so in your handoff (QA-A arbitrates: restore or rebaseline-with-reason); keep implementing against the rest. Your own unit/integration tests are yours — write and refactor them freely; the read-only rule covers only the acceptance tier.
+## Step 10: Write test code
+Satisfy qa-test-spec for each AC. Every test case named in qa-test-spec must have a corresponding test. Follow quality standards from the core prompt. Record in `cli-dev-handoff.md#test-files-written`.
+**Critical rule:** Behavioral tests must exercise the CLI the way a user would -- spawn the real bin entry as a subprocess and assert on stdout, stderr, and the exit code. No calling command handler functions directly for behavior the spec describes at the command line, and no mocking the process boundary. Unit tests on the importable core logic are fine IN ADDITION, never INSTEAD.
+## Step 11: Write contract tests
+Beyond per-AC tests, write subprocess tests that pin the tool's general contract:
+1. `--help` (root and each touched subcommand) exits 0 and names every implemented flag
+2. `--version` exits 0 and matches the package version
+3. An unknown flag and an unknown subcommand each exit non-zero with an error on stderr
+4. A machine-readable mode touched by this story (if any) produces byte-clean, parseable stdout
+5. With stdin not a TTY, no touched command hangs on a prompt (run every subprocess test with stdin closed or piped -- a hang here is the bug)
+## Step 12: Run tests, verify all pass
+Run the full test suite. All tests must pass. Record results in `cli-dev-handoff.md#test-results-summary`. If tests fail, fix the code -- do not skip or weaken tests.

package/pipeline/steps/critic/cli-tool.md ADDED Viewed

@@ -0,0 +1,25 @@
+# CRITIC Domain: CLI Tool Review
+**Applies to:** Stories where CLI-DEV is the implementing agent.
+## Edge Case Focus Areas
+In addition to the standard edge case hunt (Pass 2), scrutinize these CLI-specific risks:
+- **Exit 0 on failure** -- any path that prints an error (or catches and swallows one) and still exits 0. Scripts and CI branch on exit codes; this is the CLI equivalent of a swallowed exception. Any occurrence is a High finding.
+- **Contract output polluted** -- log lines, banners, progress, deprecation warnings, or color escape codes written to stdout in a machine-readable mode (or interleaved with pipeable output). Consumers parse stdout; pollution breaks them silently. High finding.
+- **Hidden interactivity** -- a code path that can reach a prompt (readline, inquirer, confirm) with no flag/env override or non-TTY guard. In CI this hangs forever. High finding.
+- **Unknown input passes silently** -- unknown flags or subcommands ignored instead of erroring non-zero; typos become no-ops that look like success. High finding.
+- **Accidental breaking changes** -- renamed/removed flags or subcommands, changed defaults, changed exit codes, or changed output shape that existing scripts depend on, without a deprecation path. Compare against the prior surface. High finding.
+- **Help text drift** -- implemented behavior that contradicts `--help` (missing flags, wrong defaults, stale examples). Med finding.
+- **Argument parsing edge cases** -- values that look like flags (`--name --verbose`), empty strings, `--` separator handling, repeated flags, quoting/spaces in paths (especially Windows). Probe each touched option.
+- **Partial-failure state** -- a command that mutates files/state, fails midway, and leaves the mutation half-applied with exit 0 or no way to resume/rollback. High finding for destructive commands.
+## Test Code Review Additions
+In addition to the standard test code review checklist:
+- **Subprocess tests exist** -- behavioral coverage must spawn the real bin entry and assert exit code + stdout + stderr. Tests that only call handler functions in-process are testing implementation, not the contract. Missing subprocess coverage is a High finding.
+- **Error paths asserted by exit code** -- failure tests must assert the non-zero exit code, not just the error message. Message-only assertions miss exit-0-on-failure defects. Med finding.
+- **Non-TTY coverage** -- touched commands must have at least one test with stdin closed/piped. Missing it is a Med finding (High if the command prompts).
+- **Machine output parsed, not pattern-matched** -- tests for `--json`-style modes must parse stdout with a real parser. Regex-only assertions are a Med finding.

package/pipeline/steps/judge/evidence-review.md CHANGED Viewed

@@ -82,6 +82,7 @@ PASSED:
 | `mcp-server` | `## Protocol Compliance Results` | server spawned, protocol spoken over real transport, tool responses asserted |
 | `iac` | `## Infrastructure Validation Results` | plan/apply against a sandbox; resources + drift asserted |
 | `library` | `## Export Audit Results` | library installed in an isolated consumer; real import/contract |
+| `cli-tool` | `## Command Invocation Results` + `## Contract Checks` | packed binary installed + spawned as subprocesses; exit codes/stdout/stderr asserted |
 | `document-generation` | `## Render Validation Results` | real render pipeline invoked; output structure asserted |
 | `mobile-app` | `## Mobile Platform Coverage Audit` | real emulator/simulator + real API traffic |

package/pipeline/steps/orchestration/validate-story-inputs.md CHANGED Viewed

@@ -36,6 +36,7 @@ Based on the story scope and project type, determine which testing profiles are
 | Story has data pipeline work (ETL, transformations, migrations) | `data-pipeline` |
 | Story has MCP server tools, handlers, or protocol work | `mcp-server` |
 | Story is shared library/package (exports, packaging, versioning) | `library` |
+| Story has command-line surface work (subcommands, flags, exit codes, terminal output contracts) | `cli-tool` |
 | Story has document/report template or generation pipeline work | `document-generation` |
 | Story has infrastructure work (Terraform, CloudFormation, Kubernetes, CI/CD) | `iac` |
@@ -46,6 +47,7 @@ Multiple profiles can be active. Examples:
 - Data pipeline story: `[data-pipeline]`
 - MCP server story: `[mcp-server]`
 - Library/package story: `[library]`
+- CLI tool story: `[cli-tool]`
 - Document generation story: `[document-generation]`
 - Infrastructure story: `[iac]`
 - Fullstack story with infrastructure: `[api, ui, iac]`
@@ -56,7 +58,8 @@ Set `{testing_profiles}` for use in shared context.
 **Each active profile carries a mandatory black-box, real-dependency verification** — the "prove it
 works for real" gate for that surface (live HTTP + real DB for `api`, real browser for `ui`, real
 pipeline run for `data-pipeline`, spawned server for `mcp-server`, applied infra for `iac`, installed
-package for `library`, real render for `document-generation`, real emulator for `mobile-app`). QA-B
+package for `library`, spawned binary for `cli-tool`, real render for `document-generation`, real
+emulator for `mobile-app`). QA-B
 executes it (`qa-b/{profile}.md`, Step 4b) and JUDGE gates on it (evidence-review Step 9): its **absence
 is a coverage gap, not a silent pass**. This is what catches boundary defects — CORS, serialization,
 auth headers, wire-format mismatch, migration drift — that mocked unit tests pass straight through.

package/pipeline/steps/qa-a/cli-tool.md ADDED Viewed

@@ -0,0 +1,42 @@
+# QA-A Step: CLI Tool Testing
+## Command Invocation Test Specification
+For every command/flag surface in this story, write a **Command Invocation Test** table:
+```
+## Command Invocation Tests
+| ID | Invocation | stdin | Expected Exit Code | Expected stdout | Expected stderr |
+```
+Rules:
+- One row per behavioral contract: the exact command line a user would type (including the bin name, not an internal function call)
+- stdin: `closed`, `piped: <content>`, or `tty` — every touched command needs at least one `closed`/`piped` row (non-interactive safety is part of the spec)
+- Expected exit code: an exact integer; "non-zero" only when the brief leaves the code unassigned
+- Expected stdout: the contracted output a consumer/script reads (exact string, pattern, or parseable shape for `--json`-style modes); state "empty" explicitly when nothing may print
+- Expected stderr: error/diagnostic expectations (pattern), or "empty"
+- Error paths are first-class: every failure mode in the brief gets its own row (bad input, missing file, unknown flag, unknown subcommand)
+- Contract rows are mandatory: `--help` (exit 0, names every new flag), `--version` (exit 0, matches the package version), unknown-flag and unknown-subcommand rows (non-zero, error on stderr)
+## Machine-Readable Output Specification
+If the story ships or touches a machine-readable mode (`--json`, `--quiet`, `--porcelain`, ...), write a **Machine Output Test** table:
+```
+## Machine Output Tests
+| ID | Invocation | Parse As | Required Fields | Cleanliness Check |
+```
+Rules:
+- Parse As: the format a consumer parses (JSON, NDJSON, TSV...) — the test must actually parse it, not regex it
+- Cleanliness: stdout contains ONLY the contracted output — no banners, progress, color codes, or log lines (those belong on stderr)
+## Quality Gate Additions
+- [ ] Command invocation table covers every new/changed command and flag, including every error path
+- [ ] `--help`, `--version`, unknown-flag, unknown-subcommand contract rows present
+- [ ] Every touched command has a non-TTY stdin row (closed or piped)
+- [ ] Destructive operations have a refusal row (no `--force`/`--yes`) and an execution row (with it)
+- [ ] Machine output tests present if any machine-readable mode is touched

package/pipeline/steps/qa-b/cli-tool.md ADDED Viewed

@@ -0,0 +1,56 @@
+# QA-B Step: CLI Tool Testing
+## Black-Box Binary Tests
+Mandatory for all stories on a CLI tool. Install the tool the way a user gets it and exercise the real binary as subprocesses — never call command handlers in-process for this evidence.
+**Procedure:**
+1. **Create isolated temp directory.** Outside the source tree.
+2. **Install the tool as a consumer.** Pack and install the real artifact (`npm pack` then `npm install <tarball>` in the temp project, `pipx install ../path`, or equivalent) so the `bin` mapping, shebang, and packed file list are what's under test — not the working tree.
+3. **Execute the Command Invocation Test table.** For each row in qa-test-spec.md: spawn the installed binary with the row's argv and stdin condition; assert exit code, stdout, and stderr against the row. Record PASS/FAIL per row.
+4. **Run the contract rows.** `--help` (exits 0, names every new flag), `--version` (exits 0, matches the package version), unknown flag and unknown subcommand (non-zero, error on stderr).
+5. **Verify non-interactive safety.** Run every touched command with stdin closed and a timeout: any command that hangs waiting for input FAILS the row (file the bug; do not re-run with a TTY and call it a pass).
+6. **Verify machine-readable modes.** For each Machine Output Test row, parse stdout with a real parser (JSON.parse etc.) and assert the required fields; any non-contracted bytes on stdout are a FAIL.
+7. **Verify destructive guards.** Destructive operations refuse without their `--force`/`--yes` flag and act with it, per spec rows.
+8. **Clean up temp directory.**
+**Record results** in the `## Command Invocation Results` and `## Contract Checks` sections of execution-report.md:
+```
+## Command Invocation Results
+| ID | Invocation | Exit (expected/actual) | stdout | stderr | Result |
+|----|-----------|------------------------|--------|--------|--------|
+| {id} | {command line} | {n}/{n} | {PASS/FAIL} | {PASS/FAIL} | {overall} |
+## Contract Checks
+| Check | Result | Notes |
+|-------|--------|-------|
+| Installed via package path (npx/bin mapping) | {PASS/FAIL} | {install command + tarball} |
+| --help accurate | {PASS/FAIL} | {missing/extra flags if any} |
+| --version matches package | {PASS/FAIL} | {versions} |
+| Unknown flag/subcommand rejected | {PASS/FAIL} | {details} |
+| No hang with stdin closed | {PASS/FAIL} | {commands timed out, if any} |
+| Machine output byte-clean + parseable | {PASS/FAIL or n/a} | {parser + fields} |
+```
+Include raw commands and full output for reproducibility.
+**Failure handling:**
+- Pack/install fails: file P1 bug, record error, continue against the source-tree bin entry (note the downgrade loudly).
+- A command exits 0 on a failure path: file P1 bug (exit codes are the API).
+- A command hangs without a TTY: file P1 bug.
+- Help/version drift: file P2 bug, continue.
+- Non-clean machine output: file P2 bug, continue.
+**This step cannot be skipped.** If qa-test-spec.md lacks a Command Invocation Tests section, construct the table from the command surface in reqs-brief.md and execute.
+## Execution Report Additions
+The execution report MUST include:
+- `## Command Invocation Results` table with per-row exit/stdout/stderr verdicts
+- `## Contract Checks` table
+- Raw pack/install/spawn commands with full output
+- Isolated temp directory path and cleanup confirmation

package/pipeline/steps/qa-b/execute-tests.md CHANGED Viewed

@@ -71,6 +71,7 @@ Read testing profile step file(s) from `.valent-pipeline/steps/qa-b/` based on `
 | `mcp-server` | `.valent-pipeline/steps/qa-b/mcp-server.md` | Story has MCP tools/protocol work |
 | `iac` | `.valent-pipeline/steps/qa-b/iac.md` | Story has infrastructure work |
 | `library` | `.valent-pipeline/steps/qa-b/library.md` | Story ships a package/exports |
+| `cli-tool` | `.valent-pipeline/steps/qa-b/cli-tool.md` | Story ships a command-line surface |
 | `document-generation` | `.valent-pipeline/steps/qa-b/document-generation.md` | Story renders documents/reports |
 | `mobile-app` | `.valent-pipeline/steps/qa-b/mobile-app.md` | Story has mobile app flows |

package/pipeline/steps/qa-b/write-report.md CHANGED Viewed

@@ -29,7 +29,7 @@ Rules:
 - SKIP only with documented justification
 - DB State Verified = Yes only if test code contains explicit DB assertions
 - **Real Coverage** -- classify each AC against the "prove it works for real" dimension:
-  - `Real` -- at least one **passing** test for this AC exercises the real dependency at the boundary (the active profile's real-dependency step covers it: live HTTP + real DB for `api`, real browser for `ui`, real pipeline run for `data-pipeline`, spawned server for `mcp-server`, applied infra for `iac`, installed package for `library`, real render for `document-generation`). Use the profile-step evidence you produced in Step 4b — an AC is `Real` only if it appears in that step's real-execution results.
+  - `Real` -- at least one **passing** test for this AC exercises the real dependency at the boundary (the active profile's real-dependency step covers it: live HTTP + real DB for `api`, real browser for `ui`, real pipeline run for `data-pipeline`, spawned server for `mcp-server`, applied infra for `iac`, installed package for `library`, spawned binary for `cli-tool`, real render for `document-generation`). Use the profile-step evidence you produced in Step 4b — an AC is `Real` only if it appears in that step's real-execution results.
   - `Mocked-only` -- the AC is only exercised by tests that mock/stub the boundary (mocked `fetch` in jsdom, Playwright `route.fulfill()` on the happy path, stubbed transport). This is a **coverage gap**, not a pass. Carry every `Mocked-only` AC into the Real-Dependency Coverage section, and file a bug per the `ui.md`/profile-step rules (P1 if a whole surface is mock-only, P2 per AC otherwise).
   - `N/A` -- the AC has no external boundary (pure in-process logic). Justify it in the Real-Dependency Coverage section; do NOT use `N/A` to wave past an unmocked-but-untested boundary.
 - Populate the **Real-dependency coverage**, **Mocked-only ACs**, and **Mocked-only P0/P1 ACs** summary lines, and the **Real-Dependency Coverage** section, per the template. These are what JUDGE gates on — an absent or empty Real Coverage column reads as an evidence gap, not a pass.

package/pipeline/steps/reqs/cli-tool.md ADDED Viewed

@@ -0,0 +1,46 @@
+# REQS Step: CLI Tool Requirements
+## Step 4b: CLI-Specific Requirement Extraction
+For stories with `cli-tool` in `testing_profiles`, extract additional requirements:
+### Command Surface
+- Subcommands touched or added, with their argument shapes (positional vs flag)
+- Flags: long form, short alias (if any), value type, default, required/optional
+- Flag interactions (mutually exclusive groups, flags implying other flags)
+- Environment variable overrides and their precedence vs flags
+### Exit Code Semantics
+- Exit code for success (0) and each distinct failure mode
+- Which failures share a generic non-zero code vs which get assigned codes
+- Codes consumers/scripts are known to branch on (these are the breaking-change surface)
+### Output Contract
+- What goes to stdout (the parseable/pipeable result) vs stderr (diagnostics, progress, warnings)
+- Machine-readable modes (`--json`, `--quiet`, `--porcelain`, ...) and their exact schema
+- Color/TTY behavior (NO_COLOR, non-TTY detection) if the project handles it
+- Stability expectations: which output is contract (scripts parse it) vs informational
+### Interactivity Constraints
+- Prompts the command may show, and the flag/env override for each
+- Non-TTY behavior: what each prompt does when stdin is not a terminal (use default, refuse, fail)
+- Destructive operations and their confirmation flags (`--force`/`--yes`)
+### Backwards Compatibility Constraints
+- Previously documented commands/flags/output that must remain stable
+- Deprecated flags and their migration path (alias period, warning text)
+- Current version and expected version after this story ships
+### Packaging and Invocation
+- `bin` mapping name(s) and the package-manager exec path (`npx <name>`, global install)
+- Minimum supported runtime versions
+- Shell-portability constraints (Windows/POSIX) if the brief raises them
+## Quality Gate Additions
+- [ ] Every command/flag has defined behavior, default, and error handling
+- [ ] Exit code assigned to every failure mode named in the ACs
+- [ ] stdout vs stderr contract stated for every touched command
+- [ ] Machine-readable output schema specified if a mode is touched
+- [ ] Every prompt has a non-interactive override; destructive ops have confirmation flags
+- [ ] Breaking-change impact on existing flags/output/exit codes classified

package/pipeline/task-graphs/cli-tool.yaml ADDED Viewed

@@ -0,0 +1,60 @@
+# Task graph template for cli-tool projects
+# Resolved deterministically by `resolve-graph`; the Workflow orchestrator wires dependencies from `tasks`.
+# Skipped agents: UXA, FEND
+#
+# Variables resolved at runtime:
+#   {{story_id}} — current story identifier
+#   {{story_output_dir}} — resolved story output directory
+project_type: "cli-tool"
+tasks:
+  - ref: reqs
+    agent: REQS
+    subject: "REQS: Analyze story and produce implementation brief"
+    description: "Read {{story_id}} story inputs, translate ACs into structured implementation brief (reqs-brief.md)."
+    activeForm: "REQS analyzing requirements"
+    blockedBy: []
+  - ref: qa_a
+    agent: QA-A
+    subject: "QA-A: Produce behavioral test specifications"
+    description: "Read reqs-brief.md, produce qa-test-spec.md. No visual-validation-checklist for cli-tool."
+    activeForm: "QA-A writing test specifications"
+    blockedBy: [reqs]
+  - ref: cli_dev
+    agent: CLI-DEV
+    subject: "CLI-DEV: Implement command surface production code and tests"
+    description: "Read reqs-brief.md and qa-test-spec.md, implement commands/flags/exit codes and bin entry, produce cli-dev-handoff.md."
+    activeForm: "CLI-DEV implementing CLI tool"
+    blockedBy: [qa_a]
+  - ref: iac
+    agent: IAC
+    subject: "IAC: Implement infrastructure definitions and tests"
+    description: "Read reqs-brief.md and qa-test-spec.md, implement infrastructure code, produce iac-handoff.md."
+    activeForm: "IAC implementing infrastructure"
+    blockedBy: [qa_a]
+    conditional: { includes: { testing_profiles: iac } }
+  - ref: critic
+    agent: CRITIC
+    subject: "CRITIC: Adversarial code review"
+    description: "Read git-diff, reqs-brief.md, qa-test-spec.md. Run blind-hunt, edge-case-hunt, acceptance-audit, triage passes. Produce critic-review.md."
+    activeForm: "CRITIC reviewing code"
+    blockedBy: [cli_dev, iac]
+  - ref: qa_b
+    agent: QA-B
+    subject: "QA-B: Execute tests and file bugs"
+    description: "Read qa-test-spec.md, critic-review.md, reqs-brief.md. Run tests, produce execution-report.md, bugs.md, traceability-matrix.md."
+    activeForm: "QA-B executing tests"
+    blockedBy: [critic]
+  - ref: judge
+    agent: JUDGE
+    subject: "JUDGE: Bug review and ship decision"
+    description: "Review bugs.md for priority accuracy, then review execution-report.md, traceability-matrix.md, qa-test-spec.md. Approve or reject ship."
+    activeForm: "JUDGE reviewing evidence and making ship decision"
+    blockedBy: [qa_b]

package/pipeline/templates/cli-dev-handoff.template.md ADDED Viewed

@@ -0,0 +1,89 @@
+# cli-dev-handoff
+<!-- Template version: 1.0 | Used by: CLI-DEV | Read by: CRITIC, QA-B -->
+---
+agent: {agent-name}
+story: {story-id}
+status: {in_progress | completed}
+stepsCompleted: []
+pendingSteps: []
+lastCheckpoint: {ISO-8601 timestamp}
+inputsRead: []
+outputsWritten: []
+blockers: []
+---
+## Orchestrator Summary -- required
+- **Agent:** {agent-name}
+- **Story:** {story-id}
+- **Verdict:** {pass | fail | needs-review}
+- **State transition:** {from-phase} -> {to-phase}
+- **Files created/modified:** {list of file paths}
+- **Flags:** {alerts for downstream agents, or "none"}
+## Machine Summary -- required
+<!-- Authoritative structured output (validated by `node .valent-pipeline/bin/cli.js validate-handoff`). The prose tables below are the human-readable companion. -->
+```yaml valent:handoff
+schema: 1
+agent: cli-dev
+story: STORY-ID
+files: []
+nextAgent: critic
+flags: []
+```
+## Files Created/Modified -- required
+<!-- All production and test files created or modified, with purpose. -->
+| File | Action | Purpose |
+|------|--------|---------|
+| {file-path} | {created \| modified} | {one-line description} |
+## Command Surface -- required
+<!-- Every subcommand/flag implemented or changed in this story. CRITIC and QA-B use this to validate coverage and contracts. -->
+| Command / Flag | Behavior | Exit Codes | Output (stdout) | Usage Example |
+|----------------|----------|------------|-----------------|---------------|
+| {command or --flag} | {one-line behavior} | {0 on ..., N on ...} | {what a consumer/script reads} | {one-line invocation} |
+## Contract Verification -- required
+<!-- Verified by actually running the bin entry, not by reading the code. -->
+- **Bin entry:** {package.json bin mapping; verified via `npx <name>` / equivalent: yes|no}
+- **`--help` / `--version`:** {exit 0, text accurate: yes|no}
+- **Unknown flag/subcommand:** {errors non-zero on stderr: yes|no}
+- **Non-interactive (stdin not a TTY):** {no touched command hangs; prompt overrides: list flags/env}
+- **Machine-readable mode:** {mode + byte-clean stdout verified, or "n/a"}
+## Breaking Changes -- conditional
+<!-- Only include if this story changes or removes existing commands/flags/output a consumer may depend on. -->
+| Change | Before | After | Migration / Deprecation |
+|--------|--------|-------|-------------------------|
+| {removed/renamed flag or changed output/exit code} | {previous contract} | {new contract} | {deprecation alias, warning, or "breaking"} |
+## Test Files Written -- required
+<!-- All test files created or modified, mapped to the spec cases they satisfy. -->
+| Test File | Test Cases | Spec Reference |
+|-----------|-----------|----------------|
+| {file-path} | {list of test case names} | `qa-test-spec.md#{ac-id}` |
+## Test Results Summary -- required
+<!-- Aggregate results from running the test suite. -->
+| Suite | Total | Passed | Failed | Skipped | Duration |
+|-------|-------|--------|--------|---------|----------|
+| {suite-name} | {count} | {count} | {count} | {count} | {duration} |
+## Implementation Decisions -- required
+<!-- Key decisions made during implementation. Each entry: decision, rationale, alternatives rejected. -->
+- **Decision:** {what was decided}
+  - **Rationale:** {why}
+  - **Rejected:** {alternatives considered and why they lost}
+## Cross-References -- required
+<!-- Explicit pointers to upstream artifacts consumed. -->
+- Reqs brief: `reqs-brief.md#{section}`
+- Test spec: `qa-test-spec.md#{section}`

package/pipeline/templates/reqs-brief.template.md CHANGED Viewed

@@ -118,6 +118,17 @@ database_changes:
 - **Module system:** {CJS / ESM / dual}
 - **Peer dependencies:** {required peer deps and version ranges}
+## Command Surface -- conditional
+<!-- Only include if cli-tool in testing_profiles -->
+| Command / Flag | Behavior | Exit Codes | stdout Contract | Non-TTY Behavior | Breaking Change? |
+|----------------|----------|------------|-----------------|------------------|------------------|
+| {command or --flag} | {behavior + default} | {0 on ..., N on ...} | {what scripts parse, or "human only"} | {default / refuse / prompt-override flag} | {yes/no — vs prior version} |
+- **Machine-readable modes:** {--json etc. + exact schema, or "none"}
+- **Destructive operations:** {operation -> confirmation flag, or "none"}
+- **Env overrides:** {ENV_VAR -> flag it overrides}
 ## Template Specifications -- conditional
 <!-- Only include if document-generation in testing_profiles -->

package/skills/valent-configure/SKILL.md CHANGED Viewed

@@ -33,6 +33,7 @@ Ask the user to pick one:
 | `mcp-server` | Model Context Protocol server (skips FEND, UXA, PMCP) |
 | `document-generation` | Templates, content pipelines, output validation |
 | `library` | Reusable package/module with public API (skips FEND, UXA, PMCP) |
+| `cli-tool` | Command-line tool with a `bin` entry (skips FEND, UXA, PMCP) |
 Default: `fullstack-web`
@@ -40,7 +41,7 @@ Default: `fullstack-web`
 Ask about each field. Use the project type to skip irrelevant questions:
-- **If project type has no frontend** (`backend-api`, `data-pipeline`, `mcp-server`, `library`): skip `frontend_framework`, `state_management`, and set them to `"none"`.
+- **If project type has no frontend** (`backend-api`, `data-pipeline`, `mcp-server`, `library`, `cli-tool`): skip `frontend_framework`, `state_management`, and set them to `"none"`.
 - **If project type has no backend** (`frontend-only`): skip `database_orm` and set it to `"none"`.
 Fields to ask (in order):

package/skills/valent-setup-backlog/SKILL.md CHANGED Viewed

@@ -95,6 +95,7 @@ Tag each story with the `testing_profiles` it **owns** (the same criteria the pi
 | ETL, data transformation, or batch processing | `data-pipeline` |
 | MCP server tools, handlers, or protocol work | `mcp-server` |
 | Shared library/package (exports, packaging, versioning) | `library` |
+| Command-line surface (subcommands, flags, exit codes, terminal output contracts) | `cli-tool` |
 | Document/report template or generation pipeline work | `document-generation` |
 | Infrastructure (Terraform, CloudFormation, Kubernetes, CI/CD) | `iac` |

package/src/commands/db-rebuild.js CHANGED Viewed

@@ -12,6 +12,7 @@ const ARTIFACT_MAP = {
   'data-handoff.md': { type: 'data-handoff', agent: 'DATA' },
   'mcp-dev-handoff.md': { type: 'mcp-dev-handoff', agent: 'MCP-DEV' },
   'libdev-handoff.md': { type: 'libdev-handoff', agent: 'LIBDEV' },
+  'cli-dev-handoff.md': { type: 'cli-dev-handoff', agent: 'CLI-DEV' },
   'docgen-handoff.md': { type: 'docgen-handoff', agent: 'DOCGEN' },
   'iac-handoff.md': { type: 'iac-handoff', agent: 'IAC' },
   'critic-review.md': { type: 'critic-review', agent: 'CRITIC' },

package/src/commands/init.js CHANGED Viewed

@@ -248,7 +248,7 @@ async function runWizard() {
     message: 'Project type:',
     choices: [
       'fullstack-web', 'backend-api', 'frontend-only',
-      'data-pipeline', 'mcp-server', 'document-generation', 'library', 'mobile-app'
+      'data-pipeline', 'mcp-server', 'document-generation', 'library', 'cli-tool', 'mobile-app'
     ],
     default: 'fullstack-web',
   }]);
@@ -268,7 +268,7 @@ async function runWizard() {
   }]);
   config.tech_stack.backend_framework = backendFramework;
-  if (!['backend-api', 'data-pipeline', 'mcp-server', 'library'].includes(projectType)) {
+  if (!['backend-api', 'data-pipeline', 'mcp-server', 'library', 'cli-tool'].includes(projectType)) {
     const { frontendFramework } = await inquirer.prompt([{
       type: 'input', name: 'frontendFramework', message: 'Frontend framework:', default: 'React',
     }]);

package/src/lib/config-schema.js CHANGED Viewed

@@ -11,7 +11,7 @@ export const KNOWN_ROLES = [
   'CRITIC', 'CRITIC-BLIND', 'CRITIC-EDGE', 'CRITIC-ACCEPT', 'CRITIC-TRIAGE', 'CRITIC-REVERIFY',
   'JUDGE', 'JUDGE-EVIDENCE', 'JUDGE-DECIDE',
   'REQS', 'UXA', 'QA-A', 'QA-B',
-  'BEND', 'FEND', 'DATA', 'MCP-DEV', 'LIBDEV', 'DOCGEN', 'IAC', 'MOBILE',
+  'BEND', 'FEND', 'DATA', 'MCP-DEV', 'LIBDEV', 'CLI-DEV', 'DOCGEN', 'IAC', 'MOBILE',
   'RESOLVE', 'STATIC', 'RESUME', 'PMCP', 'PERSIST', 'EVIDENCE', 'CROSSCHECK', 'GIT',
   'SPECCHECK', 'RED', 'GREEN',
   // plan
@@ -43,7 +43,7 @@ export function validateConfig(config) {
   }
   // Project section
-  const validProjectTypes = ['fullstack-web', 'backend-api', 'frontend-only', 'data-pipeline', 'mcp-server', 'document-generation', 'library', 'mobile-app'];
+  const validProjectTypes = ['fullstack-web', 'backend-api', 'frontend-only', 'data-pipeline', 'mcp-server', 'document-generation', 'library', 'cli-tool', 'mobile-app'];
   if (config.project?.type && !validProjectTypes.includes(config.project.type)) {
     errors.push(`Invalid project.type: "${config.project.type}". Must be one of: ${validProjectTypes.join(', ')}`);
   }
@@ -334,7 +334,7 @@ export const defaults = {
     opus: ['INTEGRATION', 'CRITIC-TRIAGE', 'JUDGE-DECIDE'],
     // Spec + build agents, the *finding* half of the gates (CRITIC edge/acceptance hunting, JUDGE
     // evidence cross-referencing), and lighter retro reasoning — mid tier.
-    sonnet: ['REQS', 'UXA', 'QA-A', 'QA-B', 'BEND', 'FEND', 'DATA', 'MCP-DEV', 'LIBDEV', 'DOCGEN', 'IAC', 'MOBILE', 'RETRO', 'CRITIC-EDGE', 'CRITIC-ACCEPT', 'CRITIC-REVERIFY', 'JUDGE-EVIDENCE'],
+    sonnet: ['REQS', 'UXA', 'QA-A', 'QA-B', 'BEND', 'FEND', 'DATA', 'MCP-DEV', 'LIBDEV', 'CLI-DEV', 'DOCGEN', 'IAC', 'MOBILE', 'RETRO', 'CRITIC-EDGE', 'CRITIC-ACCEPT', 'CRITIC-REVERIFY', 'JUDGE-EVIDENCE'],
     // Mechanical retrieval / CLI-runner / IO steps — no reasoning, cheapest tier.
     // RESOLVE/PACK/VALIDATE/CALIBRATE/PERSIST are the Workflow orchestrators' CLI-runner agents.
     // STATIC runs the deterministic pre-CRITIC gate (lint/type/static analysis) — pure command runner.
@@ -439,7 +439,7 @@ export const defaults = {
   // suppress, or trim `roles` to scope it. Install Ref as an MCP server to benefit.
   ref: {
     enabled: true,
-    roles: ['BEND', 'FEND', 'DATA', 'MCP-DEV', 'LIBDEV', 'DOCGEN', 'IAC', 'MOBILE', 'CRITIC'],
+    roles: ['BEND', 'FEND', 'DATA', 'MCP-DEV', 'LIBDEV', 'CLI-DEV', 'DOCGEN', 'IAC', 'MOBILE', 'CRITIC'],
   },
   // Visual validation (PMCP). For a story whose testing_profiles include `ui`, the sprint workflow's
   // Visual phase drives the browser-automation MCP (tech_stack.browser_automation_mcp) over QA-A's

package/src/lib/detect.js CHANGED Viewed

@@ -312,10 +312,8 @@ export function detectProject(root) {
   else if (hasFrontend && hasBackend) projectType = 'fullstack-web';
   else if (hasFrontend) projectType = 'frontend-only';
   else if (hasBackend) projectType = 'backend-api';
-  else if (pkg?.bin) {
-    projectType = 'library';
-    warnings.push('package.json has a `bin` (CLI tool) — suggesting `library`; there is no dedicated cli-tool type yet');
-  } else if (pkg && (pkg.main || pkg.exports)) projectType = 'library';
+  else if (pkg?.bin) projectType = 'cli-tool'; // a `bin` entry IS the product surface — black-box CLI evidence, not library exports
+  else if (pkg && (pkg.main || pkg.exports)) projectType = 'library';
   // Cross-workspace type incoherence (review pass-3 #29): a mobile signal in one workspace plus
   // a frontend framework in ANOTHER (the vue-docs + react-native repro) cannot coherently yield
   // a single project.type — withdraw the conclusion rather than answer confidently.