valent-pipeline 0.17.1 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. package/README.md +3 -1
  2. package/package.json +1 -1
  3. package/pipeline/docs/agent-reference.md +5 -2
  4. package/pipeline/docs/pipeline-overview.md +2 -2
  5. package/pipeline/docs/task-graph.md +1 -1
  6. package/pipeline/orchestrators/claude-code/plan.workflow.js +3 -1
  7. package/pipeline/orchestrators/claude-code/sprint.workflow.js +3 -3
  8. package/pipeline/prompts/cli-dev.md +61 -0
  9. package/pipeline/prompts/critic.md +3 -3
  10. package/pipeline/prompts/qa-a.md +1 -1
  11. package/pipeline/prompts/qa-b.md +1 -1
  12. package/pipeline/prompts/reqs.md +1 -1
  13. package/pipeline/steps/cli-dev/estimate.md +49 -0
  14. package/pipeline/steps/cli-dev/handoff.md +12 -0
  15. package/pipeline/steps/cli-dev/implement.md +19 -0
  16. package/pipeline/steps/cli-dev/read-inputs.md +13 -0
  17. package/pipeline/steps/cli-dev/write-tests.md +22 -0
  18. package/pipeline/steps/critic/cli-tool.md +25 -0
  19. package/pipeline/steps/judge/evidence-review.md +1 -0
  20. package/pipeline/steps/orchestration/validate-story-inputs.md +4 -1
  21. package/pipeline/steps/qa-a/cli-tool.md +42 -0
  22. package/pipeline/steps/qa-b/cli-tool.md +56 -0
  23. package/pipeline/steps/qa-b/execute-tests.md +1 -0
  24. package/pipeline/steps/qa-b/write-report.md +1 -1
  25. package/pipeline/steps/reqs/cli-tool.md +46 -0
  26. package/pipeline/task-graphs/cli-tool.yaml +60 -0
  27. package/pipeline/templates/cli-dev-handoff.template.md +89 -0
  28. package/pipeline/templates/reqs-brief.template.md +11 -0
  29. package/skills/valent-configure/SKILL.md +2 -1
  30. package/skills/valent-setup-backlog/SKILL.md +1 -0
  31. package/src/commands/db-rebuild.js +1 -0
  32. package/src/commands/init.js +2 -2
  33. package/src/lib/config-schema.js +4 -4
  34. package/src/lib/detect.js +2 -4
package/README.md CHANGED
@@ -50,7 +50,7 @@ Quality gates (**SPECCHECK**, **RED**, **STATIC**, **CRITIC**, **GREEN**, **EVID
50
50
 
51
51
  ## Project Types
52
52
 
53
- The pipeline supports 7 project types, each with a tailored task graph and specialized developer agent:
53
+ The pipeline supports 9 project types, each with a tailored task graph and specialized developer agent:
54
54
 
55
55
  | Project Type | Developer Agent | Agents Skipped |
56
56
  |---|---|---|
@@ -61,6 +61,7 @@ The pipeline supports 7 project types, each with a tailored task graph and speci
61
61
  | `mcp-server` | MCP-DEV | UXA, FEND, PMCP |
62
62
  | `document-generation` | DOCGEN | UXA, FEND, PMCP |
63
63
  | `library` | LIBDEV | UXA, FEND, PMCP |
64
+ | `cli-tool` | CLI-DEV | UXA, FEND, PMCP |
64
65
  | `mobile-app` | MOBILE | *(conditional)* |
65
66
 
66
67
  The workflow selects which agents to spawn based on `project.type` in your `pipeline-config.yaml` and the story's `testing_profiles` (resolved deterministically by `resolve-graph`).
@@ -94,6 +95,7 @@ Specialized agents that replace BEND for non-API project types:
94
95
  | DATA | Sonnet | `data-pipeline` | `data-handoff.md` |
95
96
  | MCP-DEV | Sonnet | `mcp-server` | `mcp-dev-handoff.md` |
96
97
  | LIBDEV | Sonnet | `library` | `libdev-handoff.md` |
98
+ | CLI-DEV | Sonnet | `cli-tool` | `cli-dev-handoff.md` |
97
99
  | DOCGEN | Sonnet | `document-generation` | `docgen-handoff.md` |
98
100
  | IAC | Sonnet | Cross-cutting (any type) | `iac-handoff.md` |
99
101
  | MOBILE | Sonnet | `mobile-app` | `mobile-handoff.md` |
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "valent-pipeline",
3
- "version": "0.17.1",
3
+ "version": "0.18.0",
4
4
  "description": "v3 multi-agent AI pipeline for software development lifecycle",
5
5
  "type": "module",
6
6
  "bin": {
@@ -64,12 +64,13 @@ domain-specific QA-A / QA-B / CRITIC steps.
64
64
  | DATA | Sonnet | Data pipeline developer | `data-pipeline` | `reqs-brief.md`, `qa-test-spec.md` | `data-handoff.md` | ETL/transforms, idempotency, checkpointing, row-level logging |
65
65
  | MCP-DEV | Sonnet | Protocol developer | `mcp-server` | `reqs-brief.md`, `qa-test-spec.md` | `mcp-dev-handoff.md` | JSON-RPC/stdio, two-tier error model, tool registration |
66
66
  | LIBDEV | Sonnet | Library developer | `library` | `reqs-brief.md`, `qa-test-spec.md` | `libdev-handoff.md` | Public API, exports/packaging, CJS/ESM, semver, type declarations |
67
+ | CLI-DEV | Sonnet | CLI developer | `cli-tool` | `reqs-brief.md`, `qa-test-spec.md` | `cli-dev-handoff.md` | Command surface, argv parsing, exit codes, stdout/stderr contracts, bin entry |
67
68
  | DOCGEN | Sonnet | Document generation developer | `document-generation` | `reqs-brief.md`, `qa-test-spec.md` | `docgen-handoff.md` | Template engine, render pipeline, encoding, assets |
68
69
  | IAC | Sonnet | Infrastructure developer | `iac` profile (cross-cutting, any type) | `reqs-brief.md`, `qa-test-spec.md` | `iac-handoff.md` | Terraform/Pulumi/CloudFormation, K8s, CI/CD, IAM |
69
70
  | MOBILE | Sonnet | Mobile developer | `mobile-app` | `reqs-brief.md`, `uxa-spec.md`, `qa-test-spec.md` | `mobile-handoff.md` | React Native/Flutter, Maestro E2E, emulator lifecycle, iOS deferral |
70
71
 
71
72
  **Notes:**
72
- - DATA, MCP-DEV, LIBDEV, DOCGEN each replace BEND in their dedicated task graph.
73
+ - DATA, MCP-DEV, LIBDEV, CLI-DEV, DOCGEN each replace BEND in their dedicated task graph.
73
74
  - IAC is cross-cutting — it slots into ANY task graph when `iac` is in `testing_profiles`, running in parallel with the primary developer agent.
74
75
  - MOBILE replaces BEND for mobile-app projects; BEND can still be conditionally included when `testing_profiles` includes `api`.
75
76
  - See the agent prompts in `pipeline/prompts/` and step files in `pipeline/steps/` for full implementation details.
@@ -98,6 +99,7 @@ the story's `testing_profiles` do not require.
98
99
  | `mcp-server` | MCP-DEV | UXA, FEND | `mcp-server.yaml` |
99
100
  | `document-generation` | DOCGEN | UXA, FEND | `document-generation.yaml` |
100
101
  | `library` | LIBDEV | UXA, FEND | `library.yaml` |
102
+ | `cli-tool` | CLI-DEV | UXA, FEND | `cli-tool.yaml` |
101
103
  | `mobile-app` | MOBILE (+ BEND if `api` profile) | _(conditional)_ | `mobile-app.yaml` |
102
104
 
103
105
  **Conditional agents (any project type):**
@@ -113,7 +115,7 @@ Default assignments from `pipeline-config.yaml` `models:` (validated by `src/lib
113
115
  | Tier | Agents | Use Case | Cost |
114
116
  |------|--------|----------|------|
115
117
  | Opus | INTEGRATION, CRITIC-TRIAGE, JUDGE-DECIDE | Quality-gate adjudication + the binding ship call | Highest |
116
- | Sonnet | REQS, UXA, QA-A, QA-B, BEND, FEND, DATA, MCP-DEV, LIBDEV, DOCGEN, IAC, MOBILE | Spec writing, implementation, test execution | Balanced |
118
+ | Sonnet | REQS, UXA, QA-A, QA-B, BEND, FEND, DATA, MCP-DEV, LIBDEV, CLI-DEV, DOCGEN, IAC, MOBILE | Spec writing, implementation, test execution | Balanced |
117
119
  | Haiku | Knowledge, CLI-runner steps (resolve/pack/validate/calibrate/persist) | Mechanical retrieval and IO; no reasoning | Lowest |
118
120
 
119
121
  Embedding runs as the `db index-curated` CLI step, not an agent. Model assignments are configurable in
@@ -149,6 +151,7 @@ story's `testing_profiles`:
149
151
  | `data-pipeline` | `qa-a/data-pipeline.md` | `qa-b/data-pipeline.md` | `critic/data-pipeline.md` | `reqs/data-pipeline.md` |
150
152
  | `mcp-server` | `qa-a/mcp-server.md` | `qa-b/mcp-server.md` | `critic/mcp-server.md` | `reqs/mcp-server.md` |
151
153
  | `library` | `qa-a/library.md` | `qa-b/library.md` | `critic/library.md` | `reqs/library.md` |
154
+ | `cli-tool` | `qa-a/cli-tool.md` | `qa-b/cli-tool.md` | `critic/cli-tool.md` | `reqs/cli-tool.md` |
152
155
  | `document-generation` | `qa-a/document-generation.md` | `qa-b/document-generation.md` | `critic/document-generation.md` | `reqs/document-generation.md` |
153
156
  | `iac` | `qa-a/iac.md` | `qa-b/iac.md` | `critic/iac.md` | `reqs/iac.md` |
154
157
  | `mobile-app` | `qa-a/mobile-app.md` | `qa-b/mobile-app.md` | `critic/mobile-app.md` | `reqs/mobile-app.md` |
@@ -68,7 +68,7 @@ resolve-graph → REQS → UXA? → QA-A → SPECCHECK → RED → dev fan-out
68
68
 
69
69
  4. **SPECCHECK** validates the spec chain mechanically (`valent spec check` artifact matrix + `trace check` AC coverage); rework routes to the CLI-named owner. **RED** (ATDD) then proves the acceptance suite fails pre-implementation and freezes it by hash; **GREEN** (post-CRITIC) proves the implementation satisfies it — red+green+diff is the story proof object (`proof.json`).
70
70
 
71
- 5. **Dev fan-out** implements production code and tests in parallel. The dev agents (BEND/FEND/DATA/MCP-DEV/LIBDEV/DOCGEN/IAC/MOBILE) are selected by the story's `testing_profiles` and the resolved task graph. All read the reqs brief and test spec.
71
+ 5. **Dev fan-out** implements production code and tests in parallel. The dev agents (BEND/FEND/DATA/MCP-DEV/LIBDEV/CLI-DEV/DOCGEN/IAC/MOBILE) are selected by the story's `testing_profiles` and the resolved task graph. All read the reqs brief and test spec.
72
72
 
73
73
  6. **CRITIC** runs three independent parallel review passes (blind hunt, edge-case hunt, acceptance audit) followed by triage and a verdict. Rejects code back to the dev agents if High-severity findings exist.
74
74
 
@@ -133,7 +133,7 @@ This invokes the workflow, which reads the story input from `stories/STORY-ID/`,
133
133
 
134
134
  Run `/valent-configure` to interactively set:
135
135
 
136
- - **Project type** -- fullstack-web, backend-api, frontend-only, data-pipeline, mcp-server, document-generation, library
136
+ - **Project type** -- fullstack-web, backend-api, frontend-only, data-pipeline, mcp-server, document-generation, library, cli-tool, mobile-app
137
137
  - **Tech stack** -- language, frameworks, test tools, browser automation
138
138
  - **Model assignments** -- the `models:` tier→roles map (opus/sonnet/haiku) controlling which Claude tier each agent runs on
139
139
  - **Quality thresholds** -- max rejection cycles, retrospective frequency, stall detection timeout
@@ -12,7 +12,7 @@ The dev task graph is **not** assembled at runtime from a manifest. It is a stat
12
12
  pipeline/task-graphs/<project-type>.yaml
13
13
  ```
14
14
 
15
- (e.g. `fullstack-web.yaml`, `backend-api.yaml`, `frontend-only.yaml`, `library.yaml`, `mcp-server.yaml`, `data-pipeline.yaml`, `document-generation.yaml`, `mobile-app.yaml`).
15
+ (e.g. `fullstack-web.yaml`, `backend-api.yaml`, `frontend-only.yaml`, `library.yaml`, `cli-tool.yaml`, `mcp-server.yaml`, `data-pipeline.yaml`, `document-generation.yaml`, `mobile-app.yaml`).
16
16
 
17
17
  Each file contains a `tasks:` DAG. Every task declares:
18
18
 
@@ -149,7 +149,8 @@ const VALIDATE_SCHEMA = {
149
149
  // Which estimator agent owns each testing profile.
150
150
  const PROFILE_ESTIMATORS = {
151
151
  api: 'BEND', ui: 'FEND', 'data-pipeline': 'DATA', 'mcp-server': 'MCP-DEV',
152
- library: 'LIBDEV', 'document-generation': 'DOCGEN', iac: 'IAC',
152
+ library: 'LIBDEV', 'cli-tool': 'CLI-DEV', 'document-generation': 'DOCGEN', iac: 'IAC',
153
+ 'mobile-app': 'MOBILE', // was missing — a mobile-only story sized to 0 points
153
154
  }
154
155
 
155
156
  // --- args ---
@@ -339,6 +340,7 @@ if (toGroom.length) {
339
340
  '- `data-pipeline` — ETL, data transformation, or batch processing',
340
341
  '- `mcp-server` — MCP server tools, handlers, or protocol work',
341
342
  '- `library` — shared library/package (exports, packaging, versioning)',
343
+ '- `cli-tool` — command-line surface (subcommands, flags, exit codes, terminal output contracts)',
342
344
  '- `document-generation` — document/report template or generation pipeline work',
343
345
  '- `iac` — infrastructure (Terraform, CloudFormation, Kubernetes, CI/CD)',
344
346
  "Tag a profile only when the story OWNS that surface. A story that merely CONSUMES another story's API endpoint (no endpoint/DB change of its own) is NOT `api`.",
@@ -545,7 +545,7 @@ const RESUME_CHECK_SCHEMA = {
545
545
  },
546
546
  }
547
547
 
548
- const DEV_AGENTS = new Set(['BEND', 'FEND', 'IAC', 'DATA', 'DOCGEN', 'LIBDEV', 'MCP-DEV', 'MOBILE'])
548
+ const DEV_AGENTS = new Set(['BEND', 'FEND', 'IAC', 'DATA', 'DOCGEN', 'LIBDEV', 'CLI-DEV', 'MCP-DEV', 'MOBILE'])
549
549
 
550
550
  // CRITIC's three independent passes (step 3b). Each reads ONLY its own pass step file and
551
551
  // the diff/artifacts it is told to — never another pass's output — so they cannot anchor.
@@ -886,7 +886,7 @@ const DEFAULT_MODELS = {
886
886
  JUDGE: 'opus', 'JUDGE-EVIDENCE': 'sonnet', 'JUDGE-DECIDE': 'opus',
887
887
  REQS: 'sonnet', UXA: 'sonnet', 'QA-A': 'sonnet', 'QA-B': 'sonnet',
888
888
  BEND: 'sonnet', FEND: 'sonnet', DATA: 'sonnet', 'MCP-DEV': 'sonnet',
889
- LIBDEV: 'sonnet', DOCGEN: 'sonnet', IAC: 'sonnet', MOBILE: 'sonnet',
889
+ LIBDEV: 'sonnet', 'CLI-DEV': 'sonnet', DOCGEN: 'sonnet', IAC: 'sonnet', MOBILE: 'sonnet',
890
890
  // PMCP is mechanical: crawl each route, run the exact checklist checkpoints, screenshot, and check
891
891
  // against explicit pass criteria — no open-ended judgment, so the cheapest tier (like STATIC/RESOLVE).
892
892
  // PERSIST is the sprint-end backlog writer — mechanical YAML edit, same tier as in plan/retro.
@@ -967,7 +967,7 @@ const reasoningFor = (role) => REASONING[String(role).toUpperCase()]
967
967
  // without Ref configured — which is why it can default ON: it only ever helps, never blocks. From
968
968
  // pipeline-config.yaml `ref` ({ enabled?, roles? }); set enabled:false to suppress it, or override
969
969
  // `roles` to change which agents get it. Static + args only => journal-replay safe.
970
- const DEFAULT_REF_ROLES = ['BEND', 'FEND', 'DATA', 'MCP-DEV', 'LIBDEV', 'DOCGEN', 'IAC', 'MOBILE', 'CRITIC']
970
+ const DEFAULT_REF_ROLES = ['BEND', 'FEND', 'DATA', 'MCP-DEV', 'LIBDEV', 'CLI-DEV', 'DOCGEN', 'IAC', 'MOBILE', 'CRITIC']
971
971
  function buildRefRoleSet(cfg) {
972
972
  if (cfg && cfg.enabled === false) return new Set()
973
973
  const roles = cfg && Array.isArray(cfg.roles) && cfg.roles.length ? cfg.roles : DEFAULT_REF_ROLES
@@ -0,0 +1,61 @@
1
+ # CLI-DEV
2
+ <!-- Prompt version: 1.0 | spawned per task by the Workflow orchestrator -->
3
+
4
+ You are CLI-DEV, the command-line tool developer agent. You implement command surfaces: subcommands, flags, argument parsing, exit codes, stdout/stderr contracts, and the executable bin entry.
5
+
6
+ Read `.valent-pipeline/steps/common/agent-protocol.md` for Communication Standard, Context Discipline, Design Council Protocol, Knowledge-First Principle, Correction Directives, and YAML Frontmatter.
7
+
8
+ ## Task Lifecycle
9
+
10
+ You are spawned for exactly one task; your spawn instructions are your trigger. The orchestrator has already run the SPECCHECK spec gate before spawning you.
11
+
12
+ - **On completion:** Write your handoff file with verdict, then return your machine block. The orchestrator routes it to CRITIC.
13
+ - **On rejection rework:** Your spawn instructions carry CRITIC's open fix-list. Read `critic-review.md`, fix the code, write an updated handoff.
14
+ - **On bug fix:** Your spawn instructions name the bug(s). Fix them, update your handoff; QA-B re-verifies on its next run.
15
+ - **Escalate:** Write `status: blocked` (with the blocker) to your output frontmatter and return. The orchestrator reads it.
16
+
17
+ ## Context
18
+
19
+ - **Story:** {story_id}
20
+ - **Language:** {tech_stack.language}
21
+ - **Package manager:** {tech_stack.package_manager}
22
+ - **Module system:** {tech_stack.module_system}
23
+ - **Type system:** {tech_stack.type_system}
24
+ - **Unit test framework:** {tech_stack.test_framework_unit}
25
+ - **Project type:** {project_type}
26
+
27
+ ## Inputs
28
+
29
+ | Artifact | Purpose |
30
+ |----------|---------|
31
+ | `reqs-brief.md` | Acceptance criteria, business rules, command surface, flag contracts, exit-code semantics |
32
+ | `qa-test-spec.md` | Behavioral test specifications for each AC -- what tests to write |
33
+
34
+ ## Output
35
+
36
+ Write `cli-dev-handoff.md` using the template at `.valent-pipeline/templates/cli-dev-handoff.template.md`. Update YAML frontmatter as you complete each step.
37
+
38
+ ## Quality Standards
39
+
40
+ Read `.valent-pipeline/steps/common/quality-standards.md` for universal standards enforced by CRITIC and QA-B.
41
+
42
+ Additional CLI-DEV-specific standards:
43
+ - **Exit codes are the API** -- 0 means success and ONLY success; every failure mode exits non-zero. A command that prints an error and exits 0 is a defect, full stop.
44
+ - **stdout/stderr discipline** -- results (the output a script would pipe or parse) go to stdout; diagnostics, progress, and warnings go to stderr. Machine-readable output modes (`--json` etc.) must stay byte-clean on stdout.
45
+ - **Help text matches behavior** -- every documented command, flag, and default does what `--help` says; every implemented flag appears in help. Unknown flags/subcommands must fail with a non-zero exit and are never silently ignored.
46
+ - **Non-interactive safe** -- the tool must never hang waiting for input when stdin is not a TTY; everything a prompt asks for is also settable by flag or environment variable.
47
+ - **Destructive actions guarded** -- deletes/overwrites/irreversible operations require an explicit flag (`--force`/`--yes`) or refuse in non-interactive mode; the refusal names the flag.
48
+ - **The bin entry works installed** -- the package.json `bin` mapping resolves, the entry has the correct shebang, and the tool runs via the package-manager execution path (`npx <name>`/equivalent), not just `node src/...`.
49
+
50
+ ## Step Sequence
51
+
52
+ Update `stepsCompleted` and `pendingSteps` in frontmatter as you progress.
53
+
54
+ ### Steps
55
+
56
+ | Step | File | Summary |
57
+ |------|------|---------|
58
+ | 1. Read Inputs | `.valent-pipeline/steps/cli-dev/read-inputs.md` | Read reqs-brief, qa-test-spec, correction directives, knowledge queries |
59
+ | 2. Implement | `.valent-pipeline/steps/cli-dev/implement.md` | Command surface, argument parsing, exit codes, output contracts, bin entry |
60
+ | 3. Write Tests | `.valent-pipeline/steps/cli-dev/write-tests.md` | Black-box subprocess tests, exit-code/output assertions, execution |
61
+ | 4. Handoff | `.valent-pipeline/steps/cli-dev/handoff.md` | Write cli-dev-handoff.md, final verification |
@@ -53,10 +53,10 @@ After triage-depth, execute only the passes indicated by your selected depth lev
53
53
  | 2. Pass 1: Blind Hunt | `.valent-pipeline/steps/critic/blind-hunt.md` | standard, deep |
54
54
  | 2b. Query Knowledge Base | (inline) | Always |
55
55
  | 3. Pass 2: Edge Case Hunt | `.valent-pipeline/steps/critic/edge-case-hunt.md` | deep only |
56
- | 3b. Load profile steps for edge-case-hunt | Conditional per `{testing_profiles}`: `.valent-pipeline/steps/critic/data-pipeline.md`, `mcp-server.md`, `library.md`, `document-generation.md`, `iac.md`, `mobile-app.md` | deep only |
56
+ | 3b. Load profile steps for edge-case-hunt | Conditional per `{testing_profiles}`: `.valent-pipeline/steps/critic/data-pipeline.md`, `mcp-server.md`, `library.md`, `cli-tool.md`, `document-generation.md`, `iac.md`, `mobile-app.md` | deep only |
57
57
  | 4. Pass 3: Acceptance Audit | `.valent-pipeline/steps/critic/acceptance-audit.md` | Always |
58
58
  | 5. Test Code Review | `.valent-pipeline/steps/critic/test-review.md` | standard, deep |
59
- | 5b. Load profile steps for test-review | Conditional per `{testing_profiles}`: `.valent-pipeline/steps/critic/data-pipeline.md`, `mcp-server.md`, `library.md`, `document-generation.md`, `iac.md`, `mobile-app.md` | standard, deep |
59
+ | 5b. Load profile steps for test-review | Conditional per `{testing_profiles}`: `.valent-pipeline/steps/critic/data-pipeline.md`, `mcp-server.md`, `library.md`, `cli-tool.md`, `document-generation.md`, `iac.md`, `mobile-app.md` | standard, deep |
60
60
  | 6. Triage | `.valent-pipeline/steps/critic/triage.md` | Always |
61
61
  | 7. Write Verdict | `.valent-pipeline/steps/critic/write-verdict.md` | Always |
62
62
 
@@ -67,7 +67,7 @@ The story runs on its own branch off the target branch (git flow — `.valent-pi
67
67
  Read the curated knowledge files (directory named in your spawn instructions, Setup step 2b) for recurring code quality issues, known anti-patterns, and correction directives relevant to CRITIC reviewing code for {story_id}. Also run `node .valent-pipeline/bin/cli.js db search --query "<topic>"` for prior-story lessons, and `node .valent-pipeline/bin/cli.js db query-directives --agent CRITIC`. If no relevant knowledge found, proceed without.
68
68
 
69
69
  ### Step 3b: Load Profile Steps for Edge Case Hunt (Conditional)
70
- For edge-case-hunt, also read profile-specific step files based on `{testing_profiles}`: `.valent-pipeline/steps/critic/data-pipeline.md`, `mcp-server.md`, `library.md`, `document-generation.md`, `iac.md`, `mobile-app.md`. If a profile step file does not exist, note it and proceed. Apply domain-specific focus areas alongside the generic ones.
70
+ For edge-case-hunt, also read profile-specific step files based on `{testing_profiles}`: `.valent-pipeline/steps/critic/data-pipeline.md`, `mcp-server.md`, `library.md`, `cli-tool.md`, `document-generation.md`, `iac.md`, `mobile-app.md`. If a profile step file does not exist, note it and proceed. Apply domain-specific focus areas alongside the generic ones.
71
71
 
72
72
  ### Step 5b: Load Profile Steps for Test Review (Conditional)
73
73
  For test-review, also read profile-specific step files based on `{testing_profiles}` from `.valent-pipeline/steps/critic/`. Apply domain-specific test review criteria alongside the generic ones. If a profile step file does not exist, note it and proceed.
@@ -55,7 +55,7 @@ Always include this table in the output for downstream agent calibration.
55
55
  | 1b | Query knowledge base | `.valent-pipeline/steps/qa-a/read-inputs.md` |
56
56
  | 2 | Risk classification per AC | `.valent-pipeline/steps/qa-a/read-inputs.md` |
57
57
  | 3 | Write Given-When-Then test cases | `.valent-pipeline/steps/qa-a/write-spec.md` |
58
- | 3b | Load testing profile step files | Conditional per `{testing_profiles}`: `.valent-pipeline/steps/qa-a/api.md`, `ui.md`, `data-pipeline.md`, `mcp-server.md`, `library.md`, `document-generation.md`, `iac.md` |
58
+ | 3b | Load testing profile step files | Conditional per `{testing_profiles}`: `.valent-pipeline/steps/qa-a/api.md`, `ui.md`, `data-pipeline.md`, `mcp-server.md`, `library.md`, `cli-tool.md`, `document-generation.md`, `iac.md` |
59
59
  | 4 | Database state verification | `.valent-pipeline/steps/qa-a/write-spec.md` |
60
60
  | 5 | Seed data and fixture requirements | `.valent-pipeline/steps/qa-a/write-spec.md` |
61
61
  | 6 | Negative and edge case tests (P0-P1) | `.valent-pipeline/steps/qa-a/write-spec.md` |
@@ -46,7 +46,7 @@ Write outputs to `{story_output_dir}/` using templates:
46
46
  | 2 | Read CRITIC review | `.valent-pipeline/steps/qa-b/execute-tests.md` |
47
47
  | 3 | Discover implemented tests | `.valent-pipeline/steps/qa-b/execute-tests.md` |
48
48
  | 4 | Run full test suite | `.valent-pipeline/steps/qa-b/execute-tests.md` |
49
- | 4b | Load and execute testing profile steps | Conditional per `{testing_profiles}`: `.valent-pipeline/steps/qa-b/api.md`, `ui.md`, `data-pipeline.md`, `mcp-server.md`, `library.md`, `document-generation.md`, `iac.md` |
49
+ | 4b | Load and execute testing profile steps | Conditional per `{testing_profiles}`: `.valent-pipeline/steps/qa-b/api.md`, `ui.md`, `data-pipeline.md`, `mcp-server.md`, `library.md`, `cli-tool.md`, `document-generation.md`, `iac.md` |
50
50
  | 5 | Spec-implementation alignment check | `.valent-pipeline/steps/qa-b/execute-tests.md` |
51
51
  | 6 | Build traceability matrix | `.valent-pipeline/steps/qa-b/write-report.md` |
52
52
  | 7 | File bugs | `.valent-pipeline/steps/qa-b/file-bugs.md` |
@@ -23,7 +23,7 @@ Write output to `{story_output_dir}/reqs-brief.md` using the template at `.valen
23
23
  - `{story_id}`, `{story_output_dir}`, `knowledge/correction-directives.yaml`
24
24
  - `{tech_stack.language}`, `{tech_stack.backend_framework}`, `{tech_stack.frontend_framework}`
25
25
  - `{tech_stack.database}`
26
- - `{project_type}` -- fullstack-web | backend-api | frontend-only | data-pipeline | mcp-server | library | document-generation | mobile-app
26
+ - `{project_type}` -- fullstack-web | backend-api | frontend-only | data-pipeline | mcp-server | library | cli-tool | document-generation | mobile-app
27
27
  - `{testing_profiles}` -- active testing profiles (e.g., `[api]`, `[api, ui]`, `[data-pipeline]`). Determines which domain step files to load.
28
28
 
29
29
  ## Step Sequence
@@ -0,0 +1,49 @@
1
+ # CLI Tool Estimation
2
+
3
+ **Purpose:** Assign a Fibonacci story point estimate for CLI implementation complexity. This is a lightweight estimation step — no code tools, no implementation. Read specs, assess complexity, output a number with rationale.
4
+
5
+ **Fibonacci scale:** 1, 2, 3, 5, 8, 13, 21
6
+
7
+ ## Step 1: Read Groomed Specs
8
+
9
+ Read and assess:
10
+ - `{story_output_dir}/reqs-brief.md` — REQUIRED
11
+ - `{story_output_dir}/qa-test-spec.md` — REQUIRED
12
+
13
+ ## Step 2: Assess Complexity Factors
14
+
15
+ Evaluate each factor and record your assessment:
16
+
17
+ | Factor | Assessment | Weight |
18
+ |--------|-----------|--------|
19
+ | **AC count and complexity** | How many ACs? Are they simple (add a flag) or complex (new subcommand family, breaking surface change)? | High |
20
+ | **New patterns vs established** | Greenfield (new command framework, new output mode) vs incremental (extend an existing subcommand)? | High |
21
+ | **Command surface** | How many new subcommands/flags? Interactions between flags? Breaking changes requiring migration/deprecation? | Medium |
22
+ | **Contract complexity** | Distinct exit-code semantics? Machine-readable output modes? stdin/pipe handling? Cross-platform shell concerns? | Medium |
23
+ | **Test complexity** | How hard will subprocess tests be? Long-running/stateful commands? Environment/filesystem fixtures? | Medium |
24
+
25
+ ## Step 3: Select Fibonacci Value
26
+
27
+ Map your assessment to the Fibonacci scale:
28
+
29
+ | Points | Typical CLI Scope |
30
+ |--------|-------------------|
31
+ | 1 | Single flag addition or help-text fix, no contract change |
32
+ | 2 | Simple new flag with behavior, trivial output change |
33
+ | 3 | New subcommand with a few flags, moderate test coverage |
34
+ | 5 | New subcommand family or machine-readable output mode, exit-code rework, subprocess test suite |
35
+ | 8 | Major surface change, breaking changes with deprecation path, cross-platform contract work |
36
+ | 13 | Large CLI restructure, new dispatch/framework layer, extensive contract surface |
37
+ | 21 | Epic-scale: new tool or complete command-surface overhaul (consider splitting the story) |
38
+
39
+ **Calibration context (if `{estimation_model}` is `calibrated`):**
40
+ If calibration directives are provided in `knowledge/correction-directives.yaml`, factor them into your estimate. These are learned patterns from prior sprints — e.g., "stories adding output modes consistently under-pointed by 1 tier."
41
+
42
+ ## Step 4: Write Estimate
43
+
44
+ Write to `{story_output_dir}/cli-dev-estimation.md` using `.valent-pipeline/templates/estimation.template.md`:
45
+ - Fibonacci value with brief rationale (2-3 sentences)
46
+ - Factor assessments from Step 2
47
+ - Calibration adjustments applied (if any)
48
+
49
+ Send: `[ESTIMATION] CLI-DEV estimates {story_id} at {points} points. See cli-dev-estimation.md.`
@@ -0,0 +1,12 @@
1
+ # CLI-DEV Step: Handoff
2
+
3
+ Read `.valent-pipeline/steps/common/distilled-handoff-format.md` before writing output.
4
+
5
+ ## Pre-handoff smudge check
6
+ Before writing the handoff, re-read the pitfalls primer for your stack in the curated knowledge directory (named in your spawn instructions, Setup step 2b) if one exists (e.g. `pitfalls-<profile>-<stack>.md`) and scan *your diff* against each item. Fix any gaps now — a fix is cheap here, but costs a full reject→rework cycle after handoff. The primer is a starter, not exhaustive: also handle obvious issues of the same kind that aren't listed. This is a ~10-second self-pass, not a substitute for CRITIC.
7
+
8
+ ## Step 13: Write cli-dev-handoff.md
9
+ Complete all sections of the handoff document using the template at `.valent-pipeline/templates/cli-dev-handoff.template.md`. Set `status: completed` in frontmatter.
10
+
11
+ ## Independent Verification Requirement
12
+ You must independently verify: all tests pass, the bin entry launches via the package-manager exec path, `--help`/`--version` exit 0 and are accurate, every failure path exits non-zero, and no touched command blocks on a prompt without a TTY -- before marking your task complete. Do not rely on CRITIC or QA-B to catch your failures.
@@ -0,0 +1,19 @@
1
+ # CLI-DEV Step: Implement
2
+
3
+ ## Step 4: Plan implementation approach
4
+ Order: command surface design (subcommands/flags/arguments) -> core command logic -> exit-code and error paths -> output contracts (stdout/stderr, machine-readable modes) -> bin entry and packaging. Identify which existing commands or shared helpers the story touches.
5
+
6
+ ## Step 5: Design the command surface
7
+ Per reqs-brief: define each subcommand, its flags (long form always; short aliases only where the brief asks), argument shapes, and defaults. Every flag must be intentional -- no dead options, no undocumented behavior. Record in `cli-dev-handoff.md#command-surface`.
8
+
9
+ ## Step 6: Implement command logic
10
+ Per reqs-brief: implement the commands' core logic. Keep parsing/validation at the edge and testable logic in importable modules -- the bin entry should be a thin dispatcher. Record in `cli-dev-handoff.md#files-created-modified`.
11
+
12
+ ## Step 7: Implement exit codes and error paths
13
+ Per reqs-brief: every failure mode exits non-zero with a one-line actionable error on stderr. Use distinct exit codes where the brief assigns semantics to them. Never call `process.exit(0)` or otherwise return success after printing an error. Unknown flags or subcommands must error, not pass through.
14
+
15
+ ## Step 8: Implement output contracts
16
+ Results to stdout; diagnostics/progress/warnings to stderr. If the brief specifies a machine-readable mode (`--json`, `--quiet`, etc.), its stdout must contain ONLY the contracted output -- no banners, no color codes when piped (honor NO_COLOR / non-TTY detection where the project does).
17
+
18
+ ## Step 9: Wire the bin entry and verify non-interactive safety
19
+ Configure the package `bin` mapping and shebang; verify the tool launches via the package-manager exec path (`npx <name>` or equivalent), `--help` and `--version` exit 0 with accurate text. Verify no code path blocks on a prompt when stdin is not a TTY -- every interactive question has a flag/env override, and destructive actions refuse (naming the flag) rather than hang or proceed. Record decisions in `cli-dev-handoff.md#implementation-decisions`.
@@ -0,0 +1,13 @@
1
+ # CLI-DEV Step: Read Inputs
2
+
3
+ ## Step 1: Read reqs-brief.md
4
+ Understand: acceptance criteria, business rules, command surface requirements (subcommands, flags, arguments, defaults), exit-code semantics, output contracts (human vs machine-readable modes), interactivity constraints, cross-cutting concerns.
5
+
6
+ ## Step 2: Read qa-test-spec.md
7
+ Understand: what tests to write for each AC, expected assertions, subprocess-invocation requirements, test case names and structure.
8
+
9
+ ## Step 3: Read correction directives
10
+ Read `knowledge/correction-directives.yaml`. Apply all directives targeting CLI-DEV. Note any conflicts with default behavior and follow the directive.
11
+
12
+ ## Step 3b: Query Knowledge Base
13
+ Read the curated knowledge files (directory named in your spawn instructions, Setup step 2b) for codebase conventions, implementation patterns, and known pitfalls relevant to CLI-DEV implementing {story_id} using {tech_stack.language} with {tech_stack.module_system} module system and {tech_stack.type_system}. Also run `node .valent-pipeline/bin/cli.js db search --query "<topic>"` for prior-story lessons, and `node .valent-pipeline/bin/cli.js db query-directives --agent CLI-DEV`. If no relevant knowledge is found, proceed without it.
@@ -0,0 +1,22 @@
1
+ # CLI-DEV Step: Write Tests
2
+
3
+ ## Rule 0 (ATDD): Acceptance tests are READ-ONLY
4
+
5
+ When the story has QA-A-authored acceptance tests (manifest cases with `kind: acceptance` — files under the configured acceptance dir for this story plus any manifest-declared helpers), you MUST NOT edit, move, delete, or skip them. They are the spec. Every acceptance source is snapshot-hashed at the red run; the green gate compares hashes and ANY change auto-REJECTs the story to QA-A arbitration — making an acceptance test pass by changing it is mechanically impossible. If you believe an acceptance test is wrong, say so in your handoff (QA-A arbitrates: restore or rebaseline-with-reason); keep implementing against the rest. Your own unit/integration tests are yours — write and refactor them freely; the read-only rule covers only the acceptance tier.
6
+
7
+
8
+ ## Step 10: Write test code
9
+ Satisfy qa-test-spec for each AC. Every test case named in qa-test-spec must have a corresponding test. Follow quality standards from the core prompt. Record in `cli-dev-handoff.md#test-files-written`.
10
+
11
+ **Critical rule:** Behavioral tests must exercise the CLI the way a user would -- spawn the real bin entry as a subprocess and assert on stdout, stderr, and the exit code. No calling command handler functions directly for behavior the spec describes at the command line, and no mocking the process boundary. Unit tests on the importable core logic are fine IN ADDITION, never INSTEAD.
12
+
13
+ ## Step 11: Write contract tests
14
+ Beyond per-AC tests, write subprocess tests that pin the tool's general contract:
15
+ 1. `--help` (root and each touched subcommand) exits 0 and names every implemented flag
16
+ 2. `--version` exits 0 and matches the package version
17
+ 3. An unknown flag and an unknown subcommand each exit non-zero with an error on stderr
18
+ 4. A machine-readable mode touched by this story (if any) produces byte-clean, parseable stdout
19
+ 5. With stdin not a TTY, no touched command hangs on a prompt (run every subprocess test with stdin closed or piped -- a hang here is the bug)
20
+
21
+ ## Step 12: Run tests, verify all pass
22
+ Run the full test suite. All tests must pass. Record results in `cli-dev-handoff.md#test-results-summary`. If tests fail, fix the code -- do not skip or weaken tests.
@@ -0,0 +1,25 @@
1
+ # CRITIC Domain: CLI Tool Review
2
+
3
+ **Applies to:** Stories where CLI-DEV is the implementing agent.
4
+
5
+ ## Edge Case Focus Areas
6
+
7
+ In addition to the standard edge case hunt (Pass 2), scrutinize these CLI-specific risks:
8
+
9
+ - **Exit 0 on failure** -- any path that prints an error (or catches and swallows one) and still exits 0. Scripts and CI branch on exit codes; this is the CLI equivalent of a swallowed exception. Any occurrence is a High finding.
10
+ - **Contract output polluted** -- log lines, banners, progress, deprecation warnings, or color escape codes written to stdout in a machine-readable mode (or interleaved with pipeable output). Consumers parse stdout; pollution breaks them silently. High finding.
11
+ - **Hidden interactivity** -- a code path that can reach a prompt (readline, inquirer, confirm) with no flag/env override or non-TTY guard. In CI this hangs forever. High finding.
12
+ - **Unknown input passes silently** -- unknown flags or subcommands ignored instead of erroring non-zero; typos become no-ops that look like success. High finding.
13
+ - **Accidental breaking changes** -- renamed/removed flags or subcommands, changed defaults, changed exit codes, or changed output shape that existing scripts depend on, without a deprecation path. Compare against the prior surface. High finding.
14
+ - **Help text drift** -- implemented behavior that contradicts `--help` (missing flags, wrong defaults, stale examples). Med finding.
15
+ - **Argument parsing edge cases** -- values that look like flags (`--name --verbose`), empty strings, `--` separator handling, repeated flags, quoting/spaces in paths (especially Windows). Probe each touched option.
16
+ - **Partial-failure state** -- a command that mutates files/state, fails midway, and leaves the mutation half-applied with exit 0 or no way to resume/rollback. High finding for destructive commands.
17
+
18
+ ## Test Code Review Additions
19
+
20
+ In addition to the standard test code review checklist:
21
+
22
+ - **Subprocess tests exist** -- behavioral coverage must spawn the real bin entry and assert exit code + stdout + stderr. Tests that only call handler functions in-process are testing implementation, not the contract. Missing subprocess coverage is a High finding.
23
+ - **Error paths asserted by exit code** -- failure tests must assert the non-zero exit code, not just the error message. Message-only assertions miss exit-0-on-failure defects. Med finding.
24
+ - **Non-TTY coverage** -- touched commands must have at least one test with stdin closed/piped. Missing it is a Med finding (High if the command prompts).
25
+ - **Machine output parsed, not pattern-matched** -- tests for `--json`-style modes must parse stdout with a real parser. Regex-only assertions are a Med finding.
@@ -82,6 +82,7 @@ PASSED:
82
82
  | `mcp-server` | `## Protocol Compliance Results` | server spawned, protocol spoken over real transport, tool responses asserted |
83
83
  | `iac` | `## Infrastructure Validation Results` | plan/apply against a sandbox; resources + drift asserted |
84
84
  | `library` | `## Export Audit Results` | library installed in an isolated consumer; real import/contract |
85
+ | `cli-tool` | `## Command Invocation Results` + `## Contract Checks` | packed binary installed + spawned as subprocesses; exit codes/stdout/stderr asserted |
85
86
  | `document-generation` | `## Render Validation Results` | real render pipeline invoked; output structure asserted |
86
87
  | `mobile-app` | `## Mobile Platform Coverage Audit` | real emulator/simulator + real API traffic |
87
88
 
@@ -36,6 +36,7 @@ Based on the story scope and project type, determine which testing profiles are
36
36
  | Story has data pipeline work (ETL, transformations, migrations) | `data-pipeline` |
37
37
  | Story has MCP server tools, handlers, or protocol work | `mcp-server` |
38
38
  | Story is shared library/package (exports, packaging, versioning) | `library` |
39
+ | Story has command-line surface work (subcommands, flags, exit codes, terminal output contracts) | `cli-tool` |
39
40
  | Story has document/report template or generation pipeline work | `document-generation` |
40
41
  | Story has infrastructure work (Terraform, CloudFormation, Kubernetes, CI/CD) | `iac` |
41
42
 
@@ -46,6 +47,7 @@ Multiple profiles can be active. Examples:
46
47
  - Data pipeline story: `[data-pipeline]`
47
48
  - MCP server story: `[mcp-server]`
48
49
  - Library/package story: `[library]`
50
+ - CLI tool story: `[cli-tool]`
49
51
  - Document generation story: `[document-generation]`
50
52
  - Infrastructure story: `[iac]`
51
53
  - Fullstack story with infrastructure: `[api, ui, iac]`
@@ -56,7 +58,8 @@ Set `{testing_profiles}` for use in shared context.
56
58
  **Each active profile carries a mandatory black-box, real-dependency verification** — the "prove it
57
59
  works for real" gate for that surface (live HTTP + real DB for `api`, real browser for `ui`, real
58
60
  pipeline run for `data-pipeline`, spawned server for `mcp-server`, applied infra for `iac`, installed
59
- package for `library`, real render for `document-generation`, real emulator for `mobile-app`). QA-B
61
+ package for `library`, spawned binary for `cli-tool`, real render for `document-generation`, real
62
+ emulator for `mobile-app`). QA-B
60
63
  executes it (`qa-b/{profile}.md`, Step 4b) and JUDGE gates on it (evidence-review Step 9): its **absence
61
64
  is a coverage gap, not a silent pass**. This is what catches boundary defects — CORS, serialization,
62
65
  auth headers, wire-format mismatch, migration drift — that mocked unit tests pass straight through.
@@ -0,0 +1,42 @@
1
+ # QA-A Step: CLI Tool Testing
2
+
3
+ ## Command Invocation Test Specification
4
+
5
+ For every command/flag surface in this story, write a **Command Invocation Test** table:
6
+
7
+ ```
8
+ ## Command Invocation Tests
9
+
10
+ | ID | Invocation | stdin | Expected Exit Code | Expected stdout | Expected stderr |
11
+ ```
12
+
13
+ Rules:
14
+ - One row per behavioral contract: the exact command line a user would type (including the bin name, not an internal function call)
15
+ - stdin: `closed`, `piped: <content>`, or `tty` — every touched command needs at least one `closed`/`piped` row (non-interactive safety is part of the spec)
16
+ - Expected exit code: an exact integer; "non-zero" only when the brief leaves the code unassigned
17
+ - Expected stdout: the contracted output a consumer/script reads (exact string, pattern, or parseable shape for `--json`-style modes); state "empty" explicitly when nothing may print
18
+ - Expected stderr: error/diagnostic expectations (pattern), or "empty"
19
+ - Error paths are first-class: every failure mode in the brief gets its own row (bad input, missing file, unknown flag, unknown subcommand)
20
+ - Contract rows are mandatory: `--help` (exit 0, names every new flag), `--version` (exit 0), unknown-flag and unknown-subcommand rows (non-zero, error on stderr)
21
+
22
+ ## Machine-Readable Output Specification
23
+
24
+ If the story ships or touches a machine-readable mode (`--json`, `--quiet`, `--porcelain`, ...), write a **Machine Output Test** table:
25
+
26
+ ```
27
+ ## Machine Output Tests
28
+
29
+ | ID | Invocation | Parse As | Required Fields | Cleanliness Check |
30
+ ```
31
+
32
+ Rules:
33
+ - Parse As: the format a consumer parses (JSON, NDJSON, TSV...) — the test must actually parse it, not regex it
34
+ - Cleanliness: stdout contains ONLY the contracted output — no banners, progress, color codes, or log lines (those belong on stderr)
35
+
36
+ ## Quality Gate Additions
37
+
38
+ - [ ] Command invocation table covers every new/changed command and flag, including every error path
39
+ - [ ] `--help`, `--version`, unknown-flag, unknown-subcommand contract rows present
40
+ - [ ] Every touched command has a non-TTY stdin row (closed or piped)
41
+ - [ ] Destructive operations have a refusal row (no `--force`/`--yes`) and an execution row (with it)
42
+ - [ ] Machine output tests present if any machine-readable mode is touched
@@ -0,0 +1,56 @@
1
+ # QA-B Step: CLI Tool Testing
2
+
3
+ ## Black-Box Binary Tests
4
+
5
+ Mandatory for all stories on a CLI tool. Install the tool the way a user gets it and exercise the real binary as subprocesses — never call command handlers in-process for this evidence.
6
+
7
+ **Procedure:**
8
+
9
+ 1. **Create isolated temp directory.** Outside the source tree.
10
+ 2. **Install the tool as a consumer.** Pack and install the real artifact (`npm pack` then `npm install <tarball>` in the temp project, `pipx install ../path`, or the equivalent for your ecosystem) so the `bin` mapping, shebang, and packed file list are what's under test — not the working tree.
11
+ 3. **Execute the Command Invocation Test table.** For each row in qa-test-spec.md: spawn the installed binary with the row's argv and stdin condition; assert exit code, stdout, and stderr against the row. Record PASS/FAIL per row.
12
+ 4. **Run the contract rows.** `--help` (exits 0, names every new flag), `--version` (exits 0, matches the package version), unknown flag and unknown subcommand (non-zero, error on stderr).
13
+ 5. **Verify non-interactive safety.** Run every touched command with stdin closed and a timeout: any command that hangs waiting for input FAILS the row (file the bug; do not re-run with a TTY and call it a pass).
14
+ 6. **Verify machine-readable modes.** For each Machine Output Test row, parse stdout with a real parser (JSON.parse etc.) and assert the required fields; any non-contracted bytes on stdout are a FAIL.
15
+ 7. **Verify destructive guards.** Destructive operations refuse without their `--force`/`--yes` flag and act with it, per spec rows.
16
+ 8. **Clean up temp directory.**
17
+
18
+ **Record results** in the `## Command Invocation Results` and `## Contract Checks` sections of execution-report.md:
19
+
20
+ ```
21
+ ## Command Invocation Results
22
+
23
+ | ID | Invocation | Exit (expected/actual) | stdout | stderr | Result |
24
+ |----|-----------|------------------------|--------|--------|--------|
25
+ | {id} | {command line} | {n}/{n} | {PASS/FAIL} | {PASS/FAIL} | {overall} |
26
+
27
+ ## Contract Checks
28
+
29
+ | Check | Result | Notes |
30
+ |-------|--------|-------|
31
+ | Installed via package path (npx/bin mapping) | {PASS/FAIL} | {install command + tarball} |
32
+ | --help accurate | {PASS/FAIL} | {missing/extra flags if any} |
33
+ | --version matches package | {PASS/FAIL} | {versions} |
34
+ | Unknown flag/subcommand rejected | {PASS/FAIL} | {details} |
35
+ | No hang with stdin closed | {PASS/FAIL} | {commands timed out, if any} |
36
+ | Machine output byte-clean + parseable | {PASS/FAIL or n/a} | {parser + fields} |
37
+ ```
38
+
39
+ Include raw commands and full output for reproducibility.
40
+
41
+ **Failure handling:**
42
+ - Pack/install fails: file P1 bug, record error, continue against the source-tree bin entry (note the downgrade loudly).
43
+ - A command exits 0 on a failure path: file P1 bug (exit codes are the API).
44
+ - A command hangs without a TTY: file P1 bug.
45
+ - Help/version drift: file P2 bug, continue.
46
+ - Non-clean machine output: file P2 bug, continue.
47
+
48
+ **This step cannot be skipped.** If qa-test-spec.md lacks a Command Invocation Tests section, construct the table from the command surface in reqs-brief.md and execute.
49
+
50
+ ## Execution Report Additions
51
+
52
+ The execution report MUST include:
53
+ - `## Command Invocation Results` table with per-row exit/stdout/stderr verdicts
54
+ - `## Contract Checks` table
55
+ - Raw pack/install/spawn commands with full output
56
+ - Isolated temp directory path and cleanup confirmation
@@ -71,6 +71,7 @@ Read testing profile step file(s) from `.valent-pipeline/steps/qa-b/` based on `
71
71
  | `mcp-server` | `.valent-pipeline/steps/qa-b/mcp-server.md` | Story has MCP tools/protocol work |
72
72
  | `iac` | `.valent-pipeline/steps/qa-b/iac.md` | Story has infrastructure work |
73
73
  | `library` | `.valent-pipeline/steps/qa-b/library.md` | Story ships a package/exports |
74
+ | `cli-tool` | `.valent-pipeline/steps/qa-b/cli-tool.md` | Story ships a command-line surface |
74
75
  | `document-generation` | `.valent-pipeline/steps/qa-b/document-generation.md` | Story renders documents/reports |
75
76
  | `mobile-app` | `.valent-pipeline/steps/qa-b/mobile-app.md` | Story has mobile app flows |
76
77
 
@@ -29,7 +29,7 @@ Rules:
29
29
  - SKIP only with documented justification
30
30
  - DB State Verified = Yes only if test code contains explicit DB assertions
31
31
  - **Real Coverage** -- classify each AC against the "prove it works for real" dimension:
32
- - `Real` -- at least one **passing** test for this AC exercises the real dependency at the boundary (the active profile's real-dependency step covers it: live HTTP + real DB for `api`, real browser for `ui`, real pipeline run for `data-pipeline`, spawned server for `mcp-server`, applied infra for `iac`, installed package for `library`, real render for `document-generation`). Use the profile-step evidence you produced in Step 4b — an AC is `Real` only if it appears in that step's real-execution results.
32
+ - `Real` -- at least one **passing** test for this AC exercises the real dependency at the boundary (the active profile's real-dependency step covers it: live HTTP + real DB for `api`, real browser for `ui`, real pipeline run for `data-pipeline`, spawned server for `mcp-server`, applied infra for `iac`, installed package for `library`, spawned binary for `cli-tool`, real render for `document-generation`, real emulator for `mobile-app`). Use the profile-step evidence you produced in Step 4b — an AC is `Real` only if it appears in that step's real-execution results.
33
33
  - `Mocked-only` -- the AC is only exercised by tests that mock/stub the boundary (mocked `fetch` in jsdom, Playwright `route.fulfill()` on the happy path, stubbed transport). This is a **coverage gap**, not a pass. Carry every `Mocked-only` AC into the Real-Dependency Coverage section, and file a bug per the `ui.md`/profile-step rules (P1 if a whole surface is mock-only, P2 per AC otherwise).
34
34
  - `N/A` -- the AC has no external boundary (pure in-process logic). Justify it in the Real-Dependency Coverage section; do NOT use `N/A` to wave past an unmocked-but-untested boundary.
35
35
  - Populate the **Real-dependency coverage**, **Mocked-only ACs**, and **Mocked-only P0/P1 ACs** summary lines, and the **Real-Dependency Coverage** section, per the template. These are what JUDGE gates on — an absent or empty Real Coverage column reads as an evidence gap, not a pass.
@@ -0,0 +1,46 @@
1
+ # REQS Step: CLI Tool Requirements
2
+
3
+ ## Step 4b: CLI-Specific Requirement Extraction
4
+
5
+ For stories with `cli-tool` in `testing_profiles`, extract additional requirements:
6
+
7
+ ### Command Surface
8
+ - Subcommands touched or added, with their argument shapes (positional vs flag)
9
+ - Flags: long form, short alias (if any), value type, default, required/optional
10
+ - Flag interactions (mutually exclusive groups, flags implying other flags)
11
+ - Environment variable overrides and their precedence vs flags
12
+
13
+ ### Exit Code Semantics
14
+ - Exit code for success (0) and each distinct failure mode
15
+ - Which failures share a generic non-zero code vs which get assigned codes
16
+ - Codes consumers/scripts are known to branch on (these are the breaking-change surface)
17
+
18
+ ### Output Contract
19
+ - What goes to stdout (the parseable/pipeable result) vs stderr (diagnostics, progress, warnings)
20
+ - Machine-readable modes (`--json`, `--quiet`, `--porcelain`, ...) and their exact schema
21
+ - Color/TTY behavior (NO_COLOR, non-TTY detection) if the project handles it
22
+ - Stability expectations: which output is contract (scripts parse it) vs informational
23
+
24
+ ### Interactivity Constraints
25
+ - Prompts the command may show, and the flag/env override for each
26
+ - Non-TTY behavior: what each prompt does when stdin is not a terminal (use default, refuse, fail)
27
+ - Destructive operations and their confirmation flags (`--force`/`--yes`)
28
+
29
+ ### Backwards Compatibility Constraints
30
+ - Previously documented commands/flags/output that must remain stable
31
+ - Deprecated flags and their migration path (alias period, warning text)
32
+ - Current version and expected version after this story ships
33
+
34
+ ### Packaging and Invocation
35
+ - `bin` mapping name(s) and the package-manager exec path (`npx <name>`, global install)
36
+ - Minimum supported runtime versions
37
+ - Shell-portability constraints (Windows/POSIX) if the brief raises them
38
+
39
+ ## Quality Gate Additions
40
+
41
+ - [ ] Every command/flag has defined behavior, default, and error handling
42
+ - [ ] Exit code assigned to every failure mode named in the ACs
43
+ - [ ] stdout vs stderr contract stated for every touched command
44
+ - [ ] Machine-readable output schema specified if a mode is touched
45
+ - [ ] Every prompt has a non-interactive override; destructive ops have confirmation flags
46
+ - [ ] Breaking-change impact on existing flags/output/exit codes classified
@@ -0,0 +1,60 @@
1
+ # Task graph template for cli-tool projects
2
+ # Resolved deterministically by `resolve-graph`; the Workflow orchestrator wires dependencies from `tasks`.
3
+ # Skipped agents: UXA, FEND
4
+ #
5
+ # Variables resolved at runtime:
6
+ # {{story_id}} — current story identifier
7
+ # {{story_output_dir}} — resolved story output directory
8
+
9
+ project_type: "cli-tool"
10
+
11
+ tasks:
12
+ - ref: reqs
13
+ agent: REQS
14
+ subject: "REQS: Analyze story and produce implementation brief"
15
+ description: "Read {{story_id}} story inputs, translate ACs into structured implementation brief (reqs-brief.md)."
16
+ activeForm: "REQS analyzing requirements"
17
+ blockedBy: []
18
+
19
+ - ref: qa_a
20
+ agent: QA-A
21
+ subject: "QA-A: Produce behavioral test specifications"
22
+ description: "Read reqs-brief.md, produce qa-test-spec.md. No visual-validation-checklist for cli-tool."
23
+ activeForm: "QA-A writing test specifications"
24
+ blockedBy: [reqs]
25
+
26
+ - ref: cli_dev
27
+ agent: CLI-DEV
28
+ subject: "CLI-DEV: Implement command surface production code and tests"
29
+ description: "Read reqs-brief.md and qa-test-spec.md, implement commands/flags/exit codes and bin entry, produce cli-dev-handoff.md."
30
+ activeForm: "CLI-DEV implementing CLI tool"
31
+ blockedBy: [qa_a]
32
+
33
+ - ref: iac
34
+ agent: IAC
35
+ subject: "IAC: Implement infrastructure definitions and tests"
36
+ description: "Read reqs-brief.md and qa-test-spec.md, implement infrastructure code, produce iac-handoff.md."
37
+ activeForm: "IAC implementing infrastructure"
38
+ blockedBy: [qa_a]
39
+ conditional: { includes: { testing_profiles: iac } }
40
+
41
+ - ref: critic
42
+ agent: CRITIC
43
+ subject: "CRITIC: Adversarial code review"
44
+ description: "Read git-diff, reqs-brief.md, qa-test-spec.md. Run blind-hunt, edge-case-hunt, acceptance-audit, triage passes. Produce critic-review.md."
45
+ activeForm: "CRITIC reviewing code"
46
+ blockedBy: [cli_dev, iac]
47
+
48
+ - ref: qa_b
49
+ agent: QA-B
50
+ subject: "QA-B: Execute tests and file bugs"
51
+ description: "Read qa-test-spec.md, critic-review.md, reqs-brief.md. Run tests, produce execution-report.md, bugs.md, traceability-matrix.md."
52
+ activeForm: "QA-B executing tests"
53
+ blockedBy: [critic]
54
+
55
+ - ref: judge
56
+ agent: JUDGE
57
+ subject: "JUDGE: Bug review and ship decision"
58
+ description: "Review bugs.md for priority accuracy, then review execution-report.md, traceability-matrix.md, qa-test-spec.md. Approve or reject ship."
59
+ activeForm: "JUDGE reviewing evidence and making ship decision"
60
+ blockedBy: [qa_b]
@@ -0,0 +1,89 @@
1
+ # cli-dev-handoff
2
+ <!-- Template version: 1.0 | Used by: CLI-DEV | Read by: CRITIC, QA-B -->
3
+
4
+ ---
5
+ agent: {agent-name}
6
+ story: {story-id}
7
+ status: {in_progress | completed}
8
+ stepsCompleted: []
9
+ pendingSteps: []
10
+ lastCheckpoint: {ISO-8601 timestamp}
11
+ inputsRead: []
12
+ outputsWritten: []
13
+ blockers: []
14
+ ---
15
+
16
+ ## Orchestrator Summary -- required
17
+ - **Agent:** {agent-name}
18
+ - **Story:** {story-id}
19
+ - **Verdict:** {pass | fail | needs-review}
20
+ - **State transition:** {from-phase} -> {to-phase}
21
+ - **Files created/modified:** {list of file paths}
22
+ - **Flags:** {alerts for downstream agents, or "none"}
23
+
24
+ ## Machine Summary -- required
25
+ <!-- Authoritative structured output (validated by `node .valent-pipeline/bin/cli.js validate-handoff`). The prose tables below are the human-readable companion. -->
26
+ ```yaml valent:handoff
27
+ schema: 1
28
+ agent: cli-dev
29
+ story: STORY-ID
30
+ files: []
31
+ nextAgent: critic
32
+ flags: []
33
+ ```
34
+
35
+ ## Files Created/Modified -- required
36
+ <!-- All production and test files created or modified, with purpose. -->
37
+
38
+ | File | Action | Purpose |
39
+ |------|--------|---------|
40
+ | {file-path} | {created \| modified} | {one-line description} |
41
+
42
+ ## Command Surface -- required
43
+ <!-- Every subcommand/flag implemented or changed in this story. CRITIC and QA-B use this to validate coverage and contracts. -->
44
+
45
+ | Command / Flag | Behavior | Exit Codes | Output (stdout) | Usage Example |
46
+ |----------------|----------|------------|-----------------|---------------|
47
+ | {command or --flag} | {one-line behavior} | {0 on ..., N on ...} | {what a consumer/script reads} | {one-line invocation} |
48
+
49
+ ## Contract Verification -- required
50
+ <!-- Verified by actually running the bin entry, not by reading the code. -->
51
+
52
+ - **Bin entry:** {package.json bin mapping; verified via `npx <name>` / equivalent: yes|no}
53
+ - **`--help` / `--version`:** {exit 0, text accurate: yes|no}
54
+ - **Unknown flag/subcommand:** {errors non-zero on stderr: yes|no}
55
+ - **Non-interactive (stdin not a TTY):** {no touched command hangs; prompt overrides: list flags/env}
56
+ - **Machine-readable mode:** {mode + byte-clean stdout verified, or "n/a"}
57
+
58
+ ## Breaking Changes -- conditional
59
+ <!-- Only include if this story changes or removes existing commands/flags/output a consumer may depend on. -->
60
+
61
+ | Change | Before | After | Migration / Deprecation |
62
+ |--------|--------|-------|-------------------------|
63
+ | {removed/renamed flag or changed output/exit code} | {previous contract} | {new contract} | {deprecation alias, warning, or "breaking"} |
64
+
65
+ ## Test Files Written -- required
66
+ <!-- All test files created or modified, mapped to the spec cases they satisfy. -->
67
+
68
+ | Test File | Test Cases | Spec Reference |
69
+ |-----------|-----------|----------------|
70
+ | {file-path} | {list of test case names} | `qa-test-spec.md#{ac-id}` |
71
+
72
+ ## Test Results Summary -- required
73
+ <!-- Aggregate results from running the test suite. -->
74
+
75
+ | Suite | Total | Passed | Failed | Skipped | Duration |
76
+ |-------|-------|--------|--------|---------|----------|
77
+ | {suite-name} | {count} | {count} | {count} | {count} | {duration} |
78
+
79
+ ## Implementation Decisions -- required
80
+ <!-- Key decisions made during implementation. Each entry: decision, rationale, alternatives rejected. -->
81
+
82
+ - **Decision:** {what was decided}
83
+ - **Rationale:** {why}
84
+ - **Rejected:** {alternatives considered and why they lost}
85
+
86
+ ## Cross-References -- required
87
+ <!-- Explicit pointers to upstream artifacts consumed. -->
88
+ - Reqs brief: `reqs-brief.md#{section}`
89
+ - Test spec: `qa-test-spec.md#{section}`
@@ -118,6 +118,17 @@ database_changes:
118
118
  - **Module system:** {CJS / ESM / dual}
119
119
  - **Peer dependencies:** {required peer deps and version ranges}
120
120
 
121
+ ## Command Surface -- conditional
122
+ <!-- Only include if cli-tool in testing_profiles -->
123
+
124
+ | Command / Flag | Behavior | Exit Codes | stdout Contract | Non-TTY Behavior | Breaking Change? |
125
+ |----------------|----------|------------|-----------------|------------------|------------------|
126
+ | {command or --flag} | {behavior + default} | {0 on ..., N on ...} | {what scripts parse, or "human only"} | {default / refuse / prompt-override flag} | {yes/no — vs prior version} |
127
+
128
+ - **Machine-readable modes:** {--json etc. + exact schema, or "none"}
129
+ - **Destructive operations:** {operation -> confirmation flag, or "none"}
130
+ - **Env overrides:** {ENV_VAR -> flag it overrides}
131
+
121
132
  ## Template Specifications -- conditional
122
133
  <!-- Only include if document-generation in testing_profiles -->
123
134
 
@@ -33,6 +33,7 @@ Ask the user to pick one:
33
33
  | `mcp-server` | Model Context Protocol server (skips FEND, UXA, PMCP) |
34
34
  | `document-generation` | Templates, content pipelines, output validation |
35
35
  | `library` | Reusable package/module with public API (skips FEND, UXA, PMCP) |
36
+ | `cli-tool` | Command-line tool with a `bin` entry (skips FEND, UXA, PMCP) |
36
37
 
37
38
  Default: `fullstack-web`
38
39
 
@@ -40,7 +41,7 @@ Default: `fullstack-web`
40
41
 
41
42
  Ask about each field. Use the project type to skip irrelevant questions:
42
43
 
43
- - **If project type has no frontend** (`backend-api`, `data-pipeline`, `mcp-server`, `library`): skip `frontend_framework`, `state_management`, and set them to `"none"`.
44
+ - **If project type has no frontend** (`backend-api`, `data-pipeline`, `mcp-server`, `library`, `cli-tool`): skip `frontend_framework`, `state_management`, and set them to `"none"`.
44
45
  - **If project type has no backend** (`frontend-only`): skip `database_orm` and set it to `"none"`.
45
46
 
46
47
  Fields to ask (in order):
@@ -95,6 +95,7 @@ Tag each story with the `testing_profiles` it **owns** (the same criteria the pi
95
95
  | ETL, data transformation, or batch processing | `data-pipeline` |
96
96
  | MCP server tools, handlers, or protocol work | `mcp-server` |
97
97
  | Shared library/package (exports, packaging, versioning) | `library` |
98
+ | Command-line surface (subcommands, flags, exit codes, terminal output contracts) | `cli-tool` |
98
99
  | Document/report template or generation pipeline work | `document-generation` |
99
100
  | Infrastructure (Terraform, CloudFormation, Kubernetes, CI/CD) | `iac` |
100
101
 
@@ -12,6 +12,7 @@ const ARTIFACT_MAP = {
12
12
  'data-handoff.md': { type: 'data-handoff', agent: 'DATA' },
13
13
  'mcp-dev-handoff.md': { type: 'mcp-dev-handoff', agent: 'MCP-DEV' },
14
14
  'libdev-handoff.md': { type: 'libdev-handoff', agent: 'LIBDEV' },
15
+ 'cli-dev-handoff.md': { type: 'cli-dev-handoff', agent: 'CLI-DEV' },
15
16
  'docgen-handoff.md': { type: 'docgen-handoff', agent: 'DOCGEN' },
16
17
  'iac-handoff.md': { type: 'iac-handoff', agent: 'IAC' },
17
18
  'critic-review.md': { type: 'critic-review', agent: 'CRITIC' },
@@ -248,7 +248,7 @@ async function runWizard() {
248
248
  message: 'Project type:',
249
249
  choices: [
250
250
  'fullstack-web', 'backend-api', 'frontend-only',
251
- 'data-pipeline', 'mcp-server', 'document-generation', 'library', 'mobile-app'
251
+ 'data-pipeline', 'mcp-server', 'document-generation', 'library', 'cli-tool', 'mobile-app'
252
252
  ],
253
253
  default: 'fullstack-web',
254
254
  }]);
@@ -268,7 +268,7 @@ async function runWizard() {
268
268
  }]);
269
269
  config.tech_stack.backend_framework = backendFramework;
270
270
 
271
- if (!['backend-api', 'data-pipeline', 'mcp-server', 'library'].includes(projectType)) {
271
+ if (!['backend-api', 'data-pipeline', 'mcp-server', 'library', 'cli-tool'].includes(projectType)) {
272
272
  const { frontendFramework } = await inquirer.prompt([{
273
273
  type: 'input', name: 'frontendFramework', message: 'Frontend framework:', default: 'React',
274
274
  }]);
@@ -11,7 +11,7 @@ export const KNOWN_ROLES = [
11
11
  'CRITIC', 'CRITIC-BLIND', 'CRITIC-EDGE', 'CRITIC-ACCEPT', 'CRITIC-TRIAGE', 'CRITIC-REVERIFY',
12
12
  'JUDGE', 'JUDGE-EVIDENCE', 'JUDGE-DECIDE',
13
13
  'REQS', 'UXA', 'QA-A', 'QA-B',
14
- 'BEND', 'FEND', 'DATA', 'MCP-DEV', 'LIBDEV', 'DOCGEN', 'IAC', 'MOBILE',
14
+ 'BEND', 'FEND', 'DATA', 'MCP-DEV', 'LIBDEV', 'CLI-DEV', 'DOCGEN', 'IAC', 'MOBILE',
15
15
  'RESOLVE', 'STATIC', 'RESUME', 'PMCP', 'PERSIST', 'EVIDENCE', 'CROSSCHECK', 'GIT',
16
16
  'SPECCHECK', 'RED', 'GREEN',
17
17
  // plan
@@ -43,7 +43,7 @@ export function validateConfig(config) {
43
43
  }
44
44
 
45
45
  // Project section
46
- const validProjectTypes = ['fullstack-web', 'backend-api', 'frontend-only', 'data-pipeline', 'mcp-server', 'document-generation', 'library', 'mobile-app'];
46
+ const validProjectTypes = ['fullstack-web', 'backend-api', 'frontend-only', 'data-pipeline', 'mcp-server', 'document-generation', 'library', 'cli-tool', 'mobile-app'];
47
47
  if (config.project?.type && !validProjectTypes.includes(config.project.type)) {
48
48
  errors.push(`Invalid project.type: "${config.project.type}". Must be one of: ${validProjectTypes.join(', ')}`);
49
49
  }
@@ -334,7 +334,7 @@ export const defaults = {
334
334
  opus: ['INTEGRATION', 'CRITIC-TRIAGE', 'JUDGE-DECIDE'],
335
335
  // Spec + build agents, the *finding* half of the gates (CRITIC edge/acceptance hunting, JUDGE
336
336
  // evidence cross-referencing), and lighter retro reasoning — mid tier.
337
- sonnet: ['REQS', 'UXA', 'QA-A', 'QA-B', 'BEND', 'FEND', 'DATA', 'MCP-DEV', 'LIBDEV', 'DOCGEN', 'IAC', 'MOBILE', 'RETRO', 'CRITIC-EDGE', 'CRITIC-ACCEPT', 'CRITIC-REVERIFY', 'JUDGE-EVIDENCE'],
337
+ sonnet: ['REQS', 'UXA', 'QA-A', 'QA-B', 'BEND', 'FEND', 'DATA', 'MCP-DEV', 'LIBDEV', 'CLI-DEV', 'DOCGEN', 'IAC', 'MOBILE', 'RETRO', 'CRITIC-EDGE', 'CRITIC-ACCEPT', 'CRITIC-REVERIFY', 'JUDGE-EVIDENCE'],
338
338
  // Mechanical retrieval / CLI-runner / IO steps — no reasoning, cheapest tier.
339
339
  // RESOLVE/PACK/VALIDATE/CALIBRATE/PERSIST are the Workflow orchestrators' CLI-runner agents.
340
340
  // STATIC runs the deterministic pre-CRITIC gate (lint/type/static analysis) — pure command runner.
@@ -439,7 +439,7 @@ export const defaults = {
439
439
  // suppress, or trim `roles` to scope it. Install Ref as an MCP server to benefit.
440
440
  ref: {
441
441
  enabled: true,
442
- roles: ['BEND', 'FEND', 'DATA', 'MCP-DEV', 'LIBDEV', 'DOCGEN', 'IAC', 'MOBILE', 'CRITIC'],
442
+ roles: ['BEND', 'FEND', 'DATA', 'MCP-DEV', 'LIBDEV', 'CLI-DEV', 'DOCGEN', 'IAC', 'MOBILE', 'CRITIC'],
443
443
  },
444
444
  // Visual validation (PMCP). For a story whose testing_profiles include `ui`, the sprint workflow's
445
445
  // Visual phase drives the browser-automation MCP (tech_stack.browser_automation_mcp) over QA-A's
package/src/lib/detect.js CHANGED
@@ -312,10 +312,8 @@ export function detectProject(root) {
312
312
  else if (hasFrontend && hasBackend) projectType = 'fullstack-web';
313
313
  else if (hasFrontend) projectType = 'frontend-only';
314
314
  else if (hasBackend) projectType = 'backend-api';
315
- else if (pkg?.bin) {
316
- projectType = 'library';
317
- warnings.push('package.json has a `bin` (CLI tool) — suggesting `library`; there is no dedicated cli-tool type yet');
318
- } else if (pkg && (pkg.main || pkg.exports)) projectType = 'library';
315
+ else if (pkg?.bin) projectType = 'cli-tool'; // a `bin` entry IS the product surface — black-box CLI evidence, not library exports
316
+ else if (pkg && (pkg.main || pkg.exports)) projectType = 'library';
319
317
  // Cross-workspace type incoherence (review pass-3 #29): a mobile signal in one workspace plus
320
318
  // a frontend framework in ANOTHER (the vue-docs + react-native repro) cannot coherently yield
321
319
  // a single project.type — withdraw the conclusion rather than answer confidently.