@curdx/flow 2.0.0-beta.1 → 2.0.0-beta.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/.claude-plugin/marketplace.json +1 -1
  2. package/.claude-plugin/plugin.json +3 -10
  3. package/CHANGELOG.md +61 -0
  4. package/README.zh.md +2 -2
  5. package/agent-preamble/preamble.md +81 -11
  6. package/agents/flow-adversary.md +40 -55
  7. package/agents/flow-architect.md +23 -10
  8. package/agents/flow-debugger.md +2 -2
  9. package/agents/flow-edge-hunter.md +20 -6
  10. package/agents/flow-executor.md +3 -3
  11. package/agents/flow-planner.md +51 -48
  12. package/agents/flow-product-designer.md +14 -1
  13. package/agents/flow-qa-engineer.md +1 -1
  14. package/agents/flow-researcher.md +17 -2
  15. package/agents/flow-reviewer.md +5 -1
  16. package/agents/flow-security-auditor.md +1 -1
  17. package/agents/flow-triage-analyst.md +1 -1
  18. package/agents/flow-ui-researcher.md +2 -2
  19. package/agents/flow-ux-designer.md +1 -1
  20. package/agents/flow-verifier.md +47 -14
  21. package/bin/curdx-flow.js +13 -1
  22. package/cli/doctor.js +73 -13
  23. package/cli/install.js +62 -36
  24. package/cli/protocols.js +63 -10
  25. package/cli/registry.js +73 -0
  26. package/cli/uninstall.js +9 -11
  27. package/cli/upgrade.js +6 -10
  28. package/cli/utils.js +150 -56
  29. package/commands/fast.md +1 -1
  30. package/commands/implement.md +4 -4
  31. package/commands/init.md +14 -3
  32. package/commands/review.md +14 -5
  33. package/commands/spec.md +26 -2
  34. package/commands/start.md +47 -17
  35. package/commands/verify.md +13 -0
  36. package/gates/adversarial-review-gate.md +19 -19
  37. package/gates/devex-gate.md +4 -5
  38. package/gates/edge-case-gate.md +1 -1
  39. package/hooks/hooks.json +0 -11
  40. package/hooks/scripts/quick-mode-guard.sh +12 -9
  41. package/hooks/scripts/session-start.sh +1 -1
  42. package/hooks/scripts/stop-watcher.sh +25 -15
  43. package/knowledge/execution-strategies.md +6 -5
  44. package/knowledge/spec-driven-development.md +8 -7
  45. package/knowledge/two-stage-review.md +4 -3
  46. package/package.json +4 -2
  47. package/skills/brownfield-index/SKILL.md +62 -0
  48. package/skills/browser-qa/SKILL.md +50 -0
  49. package/skills/epic/SKILL.md +68 -0
  50. package/skills/security-audit/SKILL.md +50 -0
  51. package/skills/ui-sketch/SKILL.md +49 -0
  52. package/templates/config.json.tmpl +1 -1
  53. package/templates/design.md.tmpl +32 -112
  54. package/templates/requirements.md.tmpl +25 -43
  55. package/templates/research.md.tmpl +37 -68
  56. package/templates/tasks.md.tmpl +27 -84
  57. package/hooks/scripts/fail-tracker.sh +0 -31
package/cli/utils.js CHANGED
@@ -108,39 +108,6 @@ export function confirm(message, defaultYes = true) {
108
108
  });
109
109
  }
110
110
 
111
- /**
112
- * Ask user to pick from a list. Returns selected value or null if aborted.
113
- */
114
- export function select(message, choices, defaultIndex = 0) {
115
- return new Promise((resolve) => {
116
- console.log(`${color.cyan("?")} ${message}`);
117
- choices.forEach((ch, i) => {
118
- const marker = i === defaultIndex ? color.green("▸") : " ";
119
- console.log(` ${marker} ${color.bold(String(i + 1))}. ${ch.label}`);
120
- });
121
-
122
- const rl = createInterface({
123
- input: process.stdin,
124
- output: process.stdout,
125
- });
126
- rl.question(
127
- ` ${color.dim(`(default: ${defaultIndex + 1}, q to abort) `)}`,
128
- (ans) => {
129
- rl.close();
130
- const v = ans.trim().toLowerCase();
131
- if (v === "q") return resolve(null);
132
- if (v === "") return resolve(choices[defaultIndex].value);
133
- const n = parseInt(v, 10);
134
- if (Number.isInteger(n) && n >= 1 && n <= choices.length) {
135
- return resolve(choices[n - 1].value);
136
- }
137
- console.log(color.yellow(" (invalid, using default)"));
138
- resolve(choices[defaultIndex].value);
139
- }
140
- );
141
- });
142
- }
143
-
144
111
  /**
145
112
  * Multi-select (checkbox-style via comma-separated input).
146
113
  * Returns array of selected values.
@@ -199,47 +166,170 @@ export function claudeVersion() {
199
166
  return m ? m[1] : res.stdout.trim().split("\n")[0];
200
167
  }
201
168
 
202
- /** List installed plugins via `claude plugin list`. Returns array of { name, version, status }. */
169
+ /**
170
+ * List installed plugins. Prefers the structured `claude plugin list --json`
171
+ * output (stable machine-readable format; confirmed present in claude
172
+ * 2.1.117+). Falls back to parsing the human-readable stream-text output
173
+ * for older CLI versions; note that the text parser is brittle.
174
+ *
175
+ * Returns array of { name, version, status }.
176
+ */
203
177
  export function listPlugins() {
204
- const res = runSync("claude", ["plugin", "list"]);
205
- if (res.code !== 0) return [];
206
- const out = res.stdout;
207
- const plugins = [];
208
- // Parse format like:
178
+ // Preferred: structured JSON output.
179
+ const j = runSync("claude", ["plugin", "list", "--json"]);
180
+ if (j.code === 0 && j.stdout.trim().startsWith("[")) {
181
+ try {
182
+ const arr = JSON.parse(j.stdout);
183
+ return arr.map((p) => ({
184
+ // id has form "name@marketplace" — name is stable for dedup/lookup.
185
+ name: String(p.id || "").split("@")[0],
186
+ version: p.version,
187
+ status: p.enabled === false ? "disabled" : "enabled",
188
+ raw: JSON.stringify(p),
189
+ }));
190
+ } catch {
191
+ // JSON parse failed — fall through to legacy text parser.
192
+ }
193
+ }
194
+
195
+ // Legacy fallback: parse the human-readable format.
209
196
  // ❯ curdx-flow@curdx-flow-marketplace
210
197
  // Version: 1.1.1
211
- // Scope: user
212
198
  // Status: ✔ enabled
213
- const blocks = out.split(/\n\s*❯\s*/).slice(1);
199
+ // Fragile: matches unicode markers. Kept only for older claude CLIs.
200
+ const res = runSync("claude", ["plugin", "list"]);
201
+ if (res.code !== 0) return [];
202
+ const plugins = [];
203
+ const blocks = res.stdout.split(/\n\s*❯\s*/).slice(1);
214
204
  for (const block of blocks) {
215
205
  const lines = block.split("\n");
216
206
  const name = lines[0].trim().split("@")[0];
217
207
  const version = (block.match(/Version:\s*(\S+)/) || [])[1];
218
- const status = block.includes("✔") ? "enabled" : block.includes("✘") ? "failed" : "unknown";
208
+ const status = block.includes("✔")
209
+ ? "enabled"
210
+ : block.includes("✘")
211
+ ? "failed"
212
+ : "unknown";
219
213
  plugins.push({ name, version, status, raw: block });
220
214
  }
221
215
  return plugins;
222
216
  }
223
217
 
224
- /** List MCPs via `claude mcp list`. Returns array of { name, status }. */
218
+ /**
219
+ * Read the user-level MCP registrations from ~/.claude.json. These are the
220
+ * MCPs the user added manually via `claude mcp add …` — distinct from
221
+ * plugin-bundled MCPs (which live in plugin.json).
222
+ *
223
+ * Returns a Map keyed by server name with the raw config object. Returns
224
+ * an empty Map if the file is missing / unreadable / has no mcpServers
225
+ * section — all of which are normal states and not errors.
226
+ */
227
+ export function readUserMcpConfig() {
228
+ try {
229
+ const path = join(HOME, ".claude.json");
230
+ if (!existsSync(path)) return new Map();
231
+ const cfg = JSON.parse(readFileSync(path, "utf-8"));
232
+ const servers = cfg?.mcpServers || {};
233
+ return new Map(Object.entries(servers));
234
+ } catch {
235
+ return new Map();
236
+ }
237
+ }
238
+
239
+ /**
240
+ * Given the output of listMcps() and a user-level MCP config map, find
241
+ * MCPs that are registered BOTH as user-level AND as plugin-bundled.
242
+ * The plugin-bundled form shows up as `plugin:<plugin>:<name>` in
243
+ * listMcps output, so a user-level "context7" and a plugin-level
244
+ * "plugin:curdx-flow:context7" are a duplicate pair.
245
+ *
246
+ * Returns array of { name, userConfig, pluginEntry }.
247
+ */
248
+ export function findDuplicateMcps(mcps, userConfig) {
249
+ const duplicates = [];
250
+ for (const m of mcps) {
251
+ // Only look at plugin-prefixed entries — they're the reference for
252
+ // what's bundled. Check if user has their own non-prefixed version.
253
+ if (m.plugin && userConfig.has(m.name)) {
254
+ duplicates.push({
255
+ name: m.name,
256
+ userConfig: userConfig.get(m.name),
257
+ pluginEntry: m,
258
+ });
259
+ }
260
+ }
261
+ return duplicates;
262
+ }
263
+
264
+ /**
265
+ * List MCP servers registered with the `claude` CLI. Returns array of
266
+ * { name, plugin, fullName, status, command }
267
+ * where `plugin` is set when the MCP came from a plugin (real name is
268
+ * `plugin:<plugin>:<mcp>`), `name` is the trailing segment, and `fullName`
269
+ * is the original as reported by claude.
270
+ *
271
+ * Fixture captured from `claude mcp list` (2.1.117):
272
+ * Checking MCP server health…
273
+ *
274
+ * plugin:curdx-flow:context7: npx -y @upstash/context7-mcp@latest - ✓ Connected
275
+ * context7: npx -y @upstash/context7-mcp --api-key ... - ✓ Connected
276
+ * claude.ai Gmail: https://gmailmcp... - ✓ Connected
277
+ *
278
+ * `claude mcp list --json` does not exist on 2.1.117 (verified), so this
279
+ * parser is the primary path. It is fixture-tested in test/utils.test.js
280
+ * so format regressions get caught in CI.
281
+ */
225
282
  export function listMcps() {
226
283
  const res = runSync("claude", ["mcp", "list"]);
227
284
  if (res.code !== 0) return [];
228
- const lines = res.stdout.split("\n");
285
+ return parseMcpList(res.stdout);
286
+ }
287
+
288
+ /** Exported for testing against a fixed input. */
289
+ export function parseMcpList(output) {
229
290
  const mcps = [];
230
- for (const line of lines) {
231
- // Rough parse — adjust if format differs
232
- const m = line.match(/^\s*([a-z0-9-]+)\s*[:\-]/i);
233
- if (m) mcps.push({ name: m[1], status: "registered" });
291
+ for (const raw of output.split("\n")) {
292
+ const line = raw.trimEnd();
293
+ if (!line) continue;
294
+ // skip the health-check header line
295
+ if (line.startsWith("Checking") || line.startsWith("checking")) continue;
296
+ // Expected format: "<fullName>: <command-or-url> - <status>"
297
+ // fullName may itself contain colons when prefixed with "plugin:<p>:<m>".
298
+ // Match from the end to find the status sentinel " - ", then split off
299
+ // the name at the first ": " after the identifier prefix.
300
+ const statusSplit = line.lastIndexOf(" - ");
301
+ if (statusSplit === -1) continue;
302
+ const statusRaw = line.slice(statusSplit + 3).trim();
303
+ const beforeStatus = line.slice(0, statusSplit);
304
+ // Find the first ": " that separates name from command. Note the space
305
+ // after the colon — this disambiguates from the colons inside
306
+ // "plugin:foo:bar".
307
+ const nameSplit = beforeStatus.indexOf(": ");
308
+ if (nameSplit === -1) continue;
309
+ const fullName = beforeStatus.slice(0, nameSplit).trim();
310
+ const command = beforeStatus.slice(nameSplit + 2).trim();
311
+
312
+ let plugin = null;
313
+ let name = fullName;
314
+ if (fullName.startsWith("plugin:")) {
315
+ const parts = fullName.split(":");
316
+ if (parts.length >= 3) {
317
+ plugin = parts[1];
318
+ name = parts.slice(2).join(":");
319
+ }
320
+ }
321
+
322
+ const status = /Connected|✓/.test(statusRaw)
323
+ ? "connected"
324
+ : /Failed|✗/.test(statusRaw)
325
+ ? "failed"
326
+ : "unknown";
327
+
328
+ mcps.push({ name, plugin, fullName, status, command });
234
329
  }
235
330
  return mcps;
236
331
  }
237
332
 
238
- // ---------- Paths ----------
239
- export function pluginCacheDir(pluginName = "curdx-flow", marketplace = "curdx-flow-marketplace") {
240
- return `${process.env.HOME}/.claude/plugins/cache/${marketplace}/${pluginName}`;
241
- }
242
-
243
333
  // ---------- Runtime PATH guards (bun / uv) ----------
244
334
  // claude-mem hard-codes `command: "bun"` in its .mcp.json, but bun installs to
245
335
  // ~/.bun/bin which is not on PATH when Claude Code spawns MCP servers
@@ -247,10 +337,14 @@ export function pluginCacheDir(pluginName = "curdx-flow", marketplace = "curdx-f
247
337
  // detection + self-healing: create a symlink to the user-level bun install
248
338
  // in a PATH-visible directory.
249
339
 
250
- import { mkdirSync, symlinkSync, lstatSync, unlinkSync, readlinkSync } from "node:fs";
251
- // `existsSync` and `join` already imported at the top of this file.
340
+ import { existsSync, mkdirSync, symlinkSync, lstatSync, unlinkSync, readlinkSync } from "node:fs";
341
+ import { homedir } from "node:os";
342
+ // `join` already imported at the top of this file.
252
343
 
253
- const HOME = process.env.HOME || "";
344
+ // os.homedir() is sourced from the OS-level user record and works even
345
+ // when $HOME is empty (non-login shells, some CI containers). See the
346
+ // same rationale in cli/protocols.js.
347
+ const HOME = homedir();
254
348
 
255
349
  /** Candidate bun install locations (priority order) */
256
350
  const BUN_CANDIDATES = [
package/commands/fast.md CHANGED
@@ -123,6 +123,6 @@ Choosing the right scenario matters more than forcing the flow.
123
123
  ## Forbidden
124
124
 
125
125
  - ✗ Committing without running verification
126
- - ✗ Changes touching more than 5 files (means it is no longer fast — run the full flow)
126
+ - ✗ Changes touching many unrelated files or modules (means it is no longer fast — run the full flow)
127
127
  - ✗ Writing library APIs from memory
128
128
  - ✗ Skipping the Step 2 5-question clarification (even when "obvious," explicit statement still has value)
@@ -15,7 +15,7 @@ Execute spec tasks per tasks.md. Select the best execution strategy based on arg
15
15
  ## Step 1: Preflight Checks
16
16
 
17
17
  ```bash
18
- [ ! -d ".flow" ] && { echo " Not a CurDX-Flow project. Run /curdx-flow:init first"; exit 1; }
18
+ [ ! -d ".flow" ] && { echo " Not a CurDX-Flow project. Run /curdx-flow:init first"; exit 1; }
19
19
 
20
20
  ARGS="$ARGUMENTS"
21
21
  SPEC_NAME=""
@@ -35,10 +35,10 @@ for arg in $ARGS; do
35
35
  done
36
36
 
37
37
  [ -z "$SPEC_NAME" ] && SPEC_NAME=$(cat .flow/.active-spec 2>/dev/null)
38
- [ -z "$SPEC_NAME" ] && { echo " No active spec. Run /curdx-flow:start first"; exit 1; }
38
+ [ -z "$SPEC_NAME" ] && { echo " No active spec. Run /curdx-flow:start first"; exit 1; }
39
39
 
40
40
  DIR=".flow/specs/$SPEC_NAME"
41
- [ ! -f "$DIR/tasks.md" ] && { echo " Missing tasks.md. Run /curdx-flow:spec first (or /curdx-flow:spec --phase=tasks to rebuild just the tasks phase)"; exit 1; }
41
+ [ ! -f "$DIR/tasks.md" ] && { echo " Missing tasks.md. Run /curdx-flow:spec first (or /curdx-flow:spec --phase=tasks to rebuild just the tasks phase)"; exit 1; }
42
42
  ```
43
43
 
44
44
  ## Step 2: Parse Task Characteristics from tasks.md
@@ -330,7 +330,7 @@ Prerequisites:
330
330
 
331
331
  ## Step 6: Progress Feedback
332
332
 
333
- Every 5 tasks or every wave, print status:
333
+ At each wave boundary (or periodically during long linear runs), print status:
334
334
 
335
335
  ```
336
336
  ═════ Progress ═════
package/commands/init.md CHANGED
@@ -71,9 +71,20 @@ Append (if not already present):
71
71
 
72
72
  ### Step 5: Health Check
73
73
 
74
- Run `npx @curdx/flow doctor` (or inline its checks) to verify:
75
- - 3 MCPs started (context7 / sequential-thinking / chrome-devtools)
76
- - Recommended plugins status (pua / claude-mem / frontend-design)
74
+ Do NOT shell out to a new terminal for this step — you are already inside
75
+ Claude Code. Verify inline via the information the plugin already has:
76
+
77
+ - Read `~/.claude/plugins/data/curdx-flow/.deps-checked` (optional — the
78
+ SessionStart hook already refreshes this once per day).
79
+ - If the user asks for the full report, suggest they run
80
+ `npx @curdx/flow doctor` in a separate terminal — don't try to spawn
81
+ it from inside the Claude Code session (output won't render cleanly
82
+ and the user has to alt-tab to see it).
83
+
84
+ Items the CLI doctor covers (for user reference):
85
+ - 2 bundled MCPs (context7 / sequential-thinking) — visible in `claude mcp list`
86
+ - 4 recommended plugins (pua / claude-mem / frontend-design / chrome-devtools-mcp)
87
+ - Runtime PATH guards for `bun` / `uv` (relevant only when claude-mem is installed)
77
88
 
78
89
  ### Step 6: Prompt Next Steps
79
90
 
@@ -16,8 +16,8 @@ Distinct from `/curdx-flow:verify`:
16
16
  | Flag | Default | Purpose |
17
17
  |------|---------|---------|
18
18
  | `--stage=<1\|2\|both>` | `both` | Stage 1 = spec compliance only. Stage 2 = code quality only. `both` = sequential. |
19
- | `--adversarial` | off | Add an adversarial review pass (6 dimensions × 2 sequential-thinking rounds). Zero-findings forbidden. |
20
- | `--edge-case` | off | Add edge-case hunting across the 7 categories. Produces a test-gap checklist. |
19
+ | `--adversarial` | off | Add an adversarial review pass across applicable categories (zero findings requires proof-of-checking, not fabrication). |
20
+ | `--edge-case` | off | Add edge-case hunting across applicable categories. Produces a test-gap checklist. |
21
21
 
22
22
  ## Preflight
23
23
 
@@ -65,7 +65,7 @@ Output: Stage-2 section of the report.
65
65
  ## Optional: adversarial review
66
66
 
67
67
  If `--adversarial`:
68
- Dispatch `flow-adversary`. It runs 6 dimensions × 2 rounds of `sequential-thinking`:
68
+ Dispatch `flow-adversary`. It scans the applicable categories (Architecture / Implementation / Testing / Security / Maintainability / UX — skip N/A with reason) using `sequential-thinking` proportional to the residual uncertainty, probing:
69
69
  1. What's missing?
70
70
  2. What's overengineered?
71
71
  3. What would break first in production?
@@ -73,12 +73,12 @@ Dispatch `flow-adversary`. It runs 6 dimensions × 2 rounds of `sequential-think
73
73
  5. What decision locks us out of a future option?
74
74
  6. What would a skeptical reviewer reject?
75
75
 
76
- **Zero findings are forbidden** — if the agent reports "all good", re-dispatch with stronger skepticism. Per `@${CLAUDE_PLUGIN_ROOT}/gates/adversarial-review-gate.md`.
76
+ **Zero findings requires proof-of-checking, not fabrication** — honest "clean" verdicts are fine if the agent lists what it examined. Per `@${CLAUDE_PLUGIN_ROOT}/gates/adversarial-review-gate.md`.
77
77
 
78
78
  ## Optional: edge-case hunting
79
79
 
80
80
  If `--edge-case`:
81
- Dispatch `flow-edge-hunter` across the 7 categories:
81
+ Dispatch `flow-edge-hunter` across the applicable categories (skip N/A with one-line reason):
82
82
  1. Boundary values (0, MAX, empty, one-over-limit)
83
83
  2. Concurrency / race conditions
84
84
  3. Network failure / partial failure
@@ -91,6 +91,15 @@ Output: test-gap checklist with suggested test cases.
91
91
 
92
92
  ## Report
93
93
 
94
+ **Landing check**: sub-agent responses can be truncated. After dispatching review agents, verify the report actually landed on disk:
95
+
96
+ ```bash
97
+ REPORT=".flow/specs/$SPEC_NAME/review-report.md"
98
+ if [ ! -f "$REPORT" ] || [ "$(wc -c < "$REPORT" 2>/dev/null | tr -d ' ')" -lt 300 ]; then
99
+ echo "⚠ Report missing or truncated. Re-dispatching flow-reviewer with a terse 'Write the report now, no narration' prompt."
100
+ fi
101
+ ```
102
+
94
103
  Consolidated output: `.flow/specs/$SPEC_NAME/review-report.md`:
95
104
 
96
105
  ```markdown
package/commands/spec.md CHANGED
@@ -82,7 +82,7 @@ Output: `requirements.md` with user stories (US-NN), acceptance criteria (AC-N.N
82
82
 
83
83
  ### design → `flow-architect`
84
84
  Inputs: `research.md` + `requirements.md`.
85
- Output: `design.md` with architecture decisions (AD-NN), component boundaries, data models, error-path design, mermaid diagrams. Must use `sequential-thinking` MCP (≥8 thoughts).
85
+ Output: `design.md` with architecture decisions (AD-NN), component boundaries, data models, error-path design, mermaid diagrams (when they clarify). Uses `sequential-thinking` MCP proportional to the genuine tradeoff surface.
86
86
 
87
87
  ### tasks → `flow-planner`
88
88
  Inputs: all three prior files + `.flow/PROJECT.md` tech stack.
@@ -94,10 +94,34 @@ After each phase completes successfully, update `.state.json`:
94
94
  {
95
95
  "phase": "<just-completed-phase>",
96
96
  "phase_status": { "<phase>": "completed" },
97
- "updated_at": "<ISO8601 timestamp>"
97
+ "updated": "<ISO8601 timestamp>"
98
98
  }
99
99
  ```
100
100
 
101
+ ### Artifact landing check (mandatory after every phase)
102
+
103
+ Sub-agent responses can be truncated by the model's output-length limit, which means the `Write` tool call for the phase's Markdown artifact may never fire. Do NOT trust the agent's return value alone — always verify the file actually landed.
104
+
105
+ For each phase just dispatched, run:
106
+
107
+ ```bash
108
+ ARTIFACT=".flow/specs/$SPEC_NAME/<phase>.md"
109
+ if [ ! -f "$ARTIFACT" ]; then
110
+ echo "⚠ $ARTIFACT did not land. Re-dispatching <phase> agent with an explicit 'write the file' prompt."
111
+ # Re-dispatch the same agent, but in the prompt, front-load:
112
+ # "Your ONLY job is to call the Write tool with the full <phase>.md content now.
113
+ # Do not explain. Do not narrate. Write the file and stop."
114
+ # This pattern produces an artifact even when prior verbosity caused truncation.
115
+ fi
116
+
117
+ # Minimum-size sanity check — if the file is <500 bytes, the write likely truncated
118
+ if [ -f "$ARTIFACT" ] && [ "$(wc -c < "$ARTIFACT" | tr -d ' ')" -lt 500 ]; then
119
+ echo "⚠ $ARTIFACT looks truncated (<500 bytes). Re-dispatching to complete it."
120
+ fi
121
+ ```
122
+
123
+ Only advance `.state.json.phase` after both the file exists AND passes the size sanity check. If a re-dispatch also fails to produce the artifact, stop and surface the issue to the user instead of silently advancing — that prevents later phases from consuming an empty upstream file.
124
+
101
125
  ## Optional planning review
102
126
 
103
127
  If `--review` (or `--review=<dims>`) is present:
package/commands/start.md CHANGED
@@ -32,23 +32,45 @@ Entry point for every feature. Works in four modes depending on flags and existi
32
32
 
33
33
  ## Flag parsing
34
34
 
35
- ```bash
36
- FLAG_RESUME=$(echo "$ARGUMENTS" | grep -q -- '--resume' && echo 1 || echo 0)
37
- FLAG_LIST=$(echo "$ARGUMENTS" | grep -q -- '--list' && echo 1 || echo 0)
38
- FLAG_MODE=$(echo "$ARGUMENTS" | grep -oP -- '--mode=\K[^\s]+' || echo "standard")
39
-
40
- # Strip flags from ARGUMENTS to leave the positional args
41
- POS=$(echo "$ARGUMENTS" | sed -E 's/--[a-z-]+(=[^ ]+)?//g' | xargs)
42
- SPEC_NAME=$(echo "$POS" | awk '{print $1}')
43
- GOAL=$(echo "$POS" | awk '{$1=""; print $0}' | sed 's/^"//; s/"$//' | xargs)
44
- ```
45
-
46
- Mode must be `fast`, `standard`, or `enterprise`. Invalid → default to `standard` with a warning.
35
+ **Do not shell-split `$ARGUMENTS`.** It is a user-supplied string that may
36
+ contain quoted substrings with spaces, `$`-signs, or embedded quotes.
37
+ `xargs`, naive `awk`, and `sed`-based quote stripping all mis-parse at
38
+ least one of those cases (e.g. `my-feature "Fix user's login bug"` breaks with
39
+ `xargs: unmatched quote`). Parse the string as a model task instead:
40
+
41
+ 1. **Flags** (order-independent, each is self-delimited):
42
+ - `--resume` / `--list` boolean presence
43
+ - `--mode=<fast|standard|enterprise>` value after `=`
44
+ Detect each with a single regex over the full `$ARGUMENTS` string and
45
+ remove the matched span from your working copy. Flags not in the list
46
+ above are errors — surface them to the user.
47
+
48
+ 2. **Positional args** (after flags removed):
49
+ - First whitespace-separated token → `SPEC_NAME` (kebab-case `[a-z0-9-]+`).
50
+ - Remainder of the string, trimmed and with one layer of outer `"..."`
51
+ or `'...'` quotes stripped → `GOAL`. Preserve inner quotes as-is.
52
+
53
+ 3. If `SPEC_NAME` does not match `^[a-z0-9][a-z0-9-]*$` (per
54
+ `schemas/spec-state.schema.json`), stop and ask the user to pick a
55
+ valid kebab-case name.
56
+
57
+ Mode must be `fast`, `standard`, or `enterprise`. Invalid → default to
58
+ `standard` with a warning.
59
+
60
+ Example inputs and their parse:
61
+
62
+ | `$ARGUMENTS` | SPEC_NAME | GOAL | flags |
63
+ |-------------------------------------------------|--------------|-------------------------------|---------------|
64
+ | `my-feature "Add JWT auth"` | `my-feature` | `Add JWT auth` | — |
65
+ | `my-feature --mode=fast "Add JWT auth"` | `my-feature` | `Add JWT auth` | mode=fast |
66
+ | `my-feature "Fix user's login bug"` | `my-feature` | `Fix user's login bug` | — |
67
+ | `--list` | — | — | list=true |
68
+ | `--resume` | — | — | resume=true |
47
69
 
48
70
  ## Branch logic
49
71
 
50
72
  ### Branch A: `--list`
51
- Enumerate every directory under `.flow/specs/`, read each `.state.json` for `phase` and `updated_at`, print a numbered list, then `AskUserQuestion` to pick one. Picking sets `.flow/.active-spec` and exits.
73
+ Enumerate every directory under `.flow/specs/`, read each `.state.json` for `phase` and `updated` (per `schemas/spec-state.schema.json`), print a numbered list, then `AskUserQuestion` to pick one. Picking sets `.flow/.active-spec` and exits.
52
74
 
53
75
  ### Branch B: `--resume` (no name)
54
76
  Read `.flow/.active-spec`. If it points to a valid spec dir, report its current phase and next suggested command (`/curdx-flow:spec` if incomplete, `/curdx-flow:implement` if tasks ready). If `.active-spec` is empty or stale, fall back to Branch A.
@@ -61,17 +83,25 @@ Create a new spec:
61
83
 
62
84
  ```bash
63
85
  mkdir -p ".flow/specs/$SPEC_NAME"
86
+ # NOTE: field names MUST match schemas/spec-state.schema.json:
87
+ # - spec_name (not "spec")
88
+ # - created (date, not "created_at")
89
+ # - updated (date-time, not "updated_at")
90
+ # - phase must be one of the enum values; the initial phase is "research"
91
+ # (there is no "created" phase — that was schema drift pre-beta.9)
92
+ # - version is required
64
93
  cat > ".flow/specs/$SPEC_NAME/.state.json" <<JSON
65
94
  {
66
- "spec": "$SPEC_NAME",
95
+ "version": "1.0",
96
+ "spec_name": "$SPEC_NAME",
67
97
  "goal": "$GOAL",
68
98
  "mode": "$FLAG_MODE",
69
- "phase": "created",
99
+ "phase": "research",
70
100
  "phase_status": {},
71
101
  "strategy": "auto",
72
102
  "execute_state": {},
73
- "created_at": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
74
- "updated_at": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
103
+ "created": "$(date -u +%Y-%m-%d)",
104
+ "updated": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
75
105
  }
76
106
  JSON
77
107
  echo "$SPEC_NAME" > .flow/.active-spec
@@ -67,6 +67,19 @@ If `--strict`:
67
67
 
68
68
  ### Step 4: Produce `verification-report.md`
69
69
 
70
+ **Landing check**: sub-agent responses can be truncated by the model's output-length limit. After dispatching `flow-verifier`, verify the report actually landed:
71
+
72
+ ```bash
73
+ REPORT=".flow/specs/$SPEC_NAME/verification-report.md"
74
+ if [ ! -f "$REPORT" ] || [ "$(wc -c < "$REPORT" 2>/dev/null | tr -d ' ')" -lt 300 ]; then
75
+ echo "⚠ Report missing or truncated. Re-dispatching flow-verifier with a terse 'write the report now' prompt."
76
+ # Re-dispatch pattern:
77
+ # "Your only job right now is to Write the verification-report.md using the
78
+ # findings you already gathered. Do not re-scan. Do not narrate. Write
79
+ # the file and stop."
80
+ fi
81
+ ```
82
+
70
83
  Write to `.flow/specs/$SPEC_NAME/verification-report.md`:
71
84
 
72
85
  ```markdown
@@ -33,19 +33,19 @@ A reviewer agent's output of "everything looks fine, no issues found" is an **in
33
33
  - "Looks good" is usually confirmation bias (the agent only checked the obvious)
34
34
  - AI tends to please the user ("great job!") — fight this tendency
35
35
 
36
- **Forced actions**:
37
- 1. If the agent outputs "no issues", automatically trigger a second round
38
- 2. The second round requires the agent to perform deeper analysis via sequential-thinking
39
- 3. If both rounds yield no findings, the agent must **prove** it checked:
40
- - List the dimensions examined (at least 5)
41
- - For each dimension, give the specific code/file locations inspected
42
- - Provide counterfactual hypotheses of "what it would look like if there were a problem"
36
+ **Forced actions when the agent reports "no issues"**:
37
+ 1. Automatically trigger a second round framed as "what would a senior skeptic reject in this PR?"
38
+ 2. If both rounds still honestly yield no findings, the agent must emit a **proof-of-checking report**:
39
+ - Every category it examined (with "N/A" for categories that don't apply)
40
+ - For each examined category, the specific code/file locations inspected
41
+ - Counterfactual hypotheses of "what this would look like if there were a problem" and why that signature is absent
42
+ 3. Fabricating findings to avoid the proof-of-checking step is a violation of L3 red line #2 (fact-driven). Better to emit "clean verdict with proof" than invent issues.
43
43
 
44
44
  ---
45
45
 
46
- ### Rule 2: Findings in at Least 3 Categories
46
+ ### Rule 2: Coverage proportional to feature scope
47
47
 
48
- A complete adversarial review must cover (find issues in at least 3 of these categories):
48
+ A complete adversarial review covers every category that applies to the feature, marks the rest as N/A with reason. Number of findings per category is proportional to real issues, not a quota:
49
49
 
50
50
  1. **Architecture layer**: Are decisions sound? Future-extensible? Lock-in risks?
51
51
  2. **Implementation layer**: Code quality? Error handling? Performance?
@@ -86,22 +86,22 @@ Not allowed:
86
86
  Input: object under review (code range / spec / PR diff)
87
87
 
88
88
  Round 1 (agent self-analysis):
89
- - Use sequential-thinking 6 rounds
90
- - Scan all 6 categories
89
+ - Use sequential-thinking proportional to the surface being probed
90
+ - Scan each applicable category; mark N/A ones with reason
91
91
  - Output findings list
92
92
 
93
93
  Decision:
94
- Findings ≥ 3? → output report
95
- - Findings < 3? → force Round 2
94
+ - Any real findings? → output report with findings
95
+ - Zero findings after honest Round 1? → force Round 2 framed as skeptic
96
96
 
97
97
  Round 2 (deep analysis):
98
- - sequential-thinking for another 6 rounds
98
+ - sequential-thinking proportional to residual uncertainty
99
99
  - Focus on "seemingly no issues" parts (trust but verify)
100
- - May introduce external perspectives (read issues from similar projects)
100
+ - Optionally introduce external perspectives (read issues from similar projects)
101
101
 
102
102
  Decision:
103
- - Still < 3? → agent must explicitly prove it checked
104
- - Otherwise → output report
103
+ - Still zero findings? → agent must emit proof-of-checking report (NOT invent findings)
104
+ - Findings exist? → output report
105
105
 
106
106
  Output: review-report.md
107
107
  ```
@@ -190,10 +190,10 @@ Fix loop:
190
190
 
191
191
  ## Failure Recovery
192
192
 
193
- If after 2 rounds there are still < 3 findings:
193
+ If after Round 2 the honest verdict is still zero findings, emit a proof-of-checking report (do NOT fabricate to hit a quota — there is no quota):
194
194
 
195
195
  ```markdown
196
- ## Adversarial Review — Insufficient Findings
196
+ ## Adversarial Review — Proof of Checking (zero findings)
197
197
 
198
198
  I have examined the following dimensions across 2 rounds of analysis:
199
199
 
@@ -195,12 +195,12 @@ Reading these test names = reading API behavior documentation.
195
195
 
196
196
  ### Agent Automatic
197
197
 
198
- When `flow-ux-designer` / `flow-reviewer` applies this gate, use sequential-thinking 4 rounds to scan the 8 dimensions.
198
+ When `flow-ux-designer` / `flow-reviewer` applies this gate, use sequential-thinking proportional to the complexity of the codebase being scanned.
199
199
 
200
200
  ### Human Review
201
201
 
202
202
  Attach a DevEx checklist at PR time:
203
- - [ ] Clear naming (reviewed at least 3 times)
203
+ - [ ] Clear naming (re-read until obvious to a new maintainer)
204
204
  - [ ] Critical comments exist
205
205
  - [ ] Consistent structure
206
206
  - [ ] Actionable error messages
@@ -210,7 +210,7 @@ Attach a DevEx checklist at PR time:
210
210
 
211
211
  ## Scoring
212
212
 
213
- Each dimension 0-10 points:
213
+ Score each **applicable** dimension 0-10 (N/A dimensions are excluded from the total):
214
214
 
215
215
  ```
216
216
  10 = best practice
@@ -220,8 +220,7 @@ Each dimension 0-10 points:
220
220
  0 = serious issue
221
221
  ```
222
222
 
223
- Total 40+ / 80 = pass (warning, non-blocking).
224
- Total < 40 = blocked, improvement required.
223
+ Emit the per-dimension scores with evidence. The gate itself does not block on a numeric threshold; it surfaces the weaknesses for the user (or the reviewing agent) to decide whether any of them rise to a blocker. A single 0/10 on a material dimension is a blocker regardless of the total.
225
224
 
226
225
  ---
227
226
 
@@ -104,7 +104,7 @@ Q4. If no test, what test should be added to cover it?
104
104
  Input: object under review (function / component / API) + requirements + tests
105
105
 
106
106
  For each category (1-7):
107
- 1. Use sequential-thinking to list at least 3 possible edge scenarios
107
+ 1. Use sequential-thinking to list every plausible edge scenario for this category — stop when you've covered the real risk surface, don't pad to a quota, don't fabricate scenarios that won't occur in production
108
108
  2. Check whether each scenario has corresponding coverage in tests
109
109
  3. Add uncovered ones to the "gap list"
110
110