@mutagent/cli 0.1.177 → 0.1.178
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/cli.js +1364 -194
- package/dist/bin/cli.js.map +16 -16
- package/dist/index.js +43 -2
- package/dist/index.js.map +3 -3
- package/package.json +1 -1
package/dist/bin/cli.js
CHANGED
@@ -1271,8 +1271,8 @@ var init_sdk_client = __esm(() => {
 // src/bin/cli.ts
 import { Command as Command21 } from "commander";
 import chalk39 from "chalk";
-import { readFileSync as
-import { join as
+import { readFileSync as readFileSync14, existsSync as existsSync17 } from "fs";
+import { join as join12, dirname as dirname3 } from "path";
 import { fileURLToPath as fileURLToPath2 } from "url";
 
 // src/commands/auth.ts
@@ -1280,11 +1280,45 @@ init_config();
 init_sdk_client();
 import { Command } from "commander";
 import chalk4 from "chalk";
-import { existsSync as
-import { join as
+import { existsSync as existsSync4 } from "fs";
+import { join as join5 } from "path";
 
 // src/lib/output.ts
 import chalk from "chalk";
+import { existsSync as existsSync2, readFileSync as readFileSync2 } from "fs";
+import { join as join2 } from "path";
+var _compatCache = null;
+var _globalCliVersion = "0.0.0";
+function setCliVersion(version) {
+  _globalCliVersion = version;
+  _compatCache = null;
+}
+function getCompatMetadata(cliVersion = _globalCliVersion) {
+  if (_compatCache !== null)
+    return _compatCache;
+  let skillVersion = "unknown";
+  let skillMinCliVersion = "unknown";
+  try {
+    const candidatePaths = [
+      join2(process.cwd(), ".claude", "skills", "mutagent-cli", "SKILL.md"),
+      join2(process.cwd(), "mutagent-cli", ".claude", "skills", "mutagent-cli", "SKILL.md")
+    ];
+    for (const p of candidatePaths) {
+      if (existsSync2(p)) {
+        const content = readFileSync2(p, "utf-8");
+        const verMatch = /^SKILL_VERSION:\s*(.+)$/m.exec(content);
+        const minMatch = /^SKILL_MIN_CLI_VERSION:\s*(.+)$/m.exec(content);
+        if (verMatch?.[1])
+          skillVersion = verMatch[1].trim();
+        if (minMatch?.[1])
+          skillMinCliVersion = minMatch[1].trim();
+        break;
+      }
+    }
+  } catch {}
+  _compatCache = { cliVersion, skillVersion, skillMinCliVersion };
+  return _compatCache;
+}
 function getJsonFlag(command) {
   if (!command)
     return false;
@@ -1300,7 +1334,14 @@ class OutputFormatter {
   }
   output(data) {
    if (this.format === "json") {
-      console.log(JSON.stringify(data, null, 2));
+      let enriched = data;
+      if (_globalCliVersion !== "0.0.0" && data !== null && typeof data === "object" && !Array.isArray(data) && "_directive" in data) {
+        enriched = {
+          ...data,
+          _compat: getCompatMetadata(_globalCliVersion)
+        };
+      }
+      console.log(JSON.stringify(enriched, null, 2));
    } else {
      this.formatTable(data);
    }
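Taken with the `getCompatMetadata` hunk above, the effect is that any `--json` payload carrying a `_directive` is now stamped with version-compatibility metadata. A minimal TypeScript sketch of the enriched shape — values illustrative, only the field names come from the diff:

```ts
// Sketch only: field names from the diff above; values are illustrative.
interface CompatMetadata {
  cliVersion: string;         // set once at startup via setCliVersion()
  skillVersion: string;       // parsed from SKILL.md "SKILL_VERSION:", else "unknown"
  skillMinCliVersion: string; // parsed from "SKILL_MIN_CLI_VERSION:", else "unknown"
}

const enrichedPayload = {
  _directive: { display: "status_card", renderedCard: "..." },
  _compat: {
    cliVersion: "0.1.178",
    skillVersion: "1.1.0",
    skillMinCliVersion: "0.1.163",
  } satisfies CompatMetadata,
};
```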
@@ -1431,8 +1472,8 @@ function createSpinner(text, isJson) {
 init_errors();
 
 // src/lib/mutation-context.ts
-import { existsSync as
-import { join as
+import { existsSync as existsSync3, readFileSync as readFileSync3, writeFileSync as writeFileSync2, mkdirSync as mkdirSync2 } from "fs";
+import { join as join3, resolve } from "path";
 function parseTableRows(lines) {
   const rows = [];
   for (const line of lines) {
@@ -1473,18 +1514,18 @@ class MutationContext {
   }
   static load(projectRoot) {
     const root = projectRoot ?? process.cwd();
-    const dirPath =
-    const filePath =
+    const dirPath = join3(resolve(root), CONTEXT_DIR);
+    const filePath = join3(dirPath, CONTEXT_FILE);
     const ctx = new MutationContext(filePath);
-    if (
-      const content =
+    if (existsSync3(filePath)) {
+      const content = readFileSync3(filePath, "utf-8");
      ctx.parse(content);
    }
     return ctx;
   }
   save() {
-    const dirPath =
-    if (!
+    const dirPath = join3(this.filePath, "..");
+    if (!existsSync3(dirPath)) {
      mkdirSync2(dirPath, { recursive: true });
    }
    writeFileSync2(this.filePath, this.render(), "utf-8");
@@ -1710,8 +1751,8 @@ import chalk2 from "chalk";
 import { resolve as resolve2 } from "path";
 
 // src/lib/explorer.ts
-import { readdirSync, readFileSync as
-import { join as
+import { readdirSync, readFileSync as readFileSync4, statSync } from "fs";
+import { join as join4, relative, extname, basename } from "path";
 var DOUBLE_VAR_PATTERN = /\{\{\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*\}\}/g;
 var SINGLE_VAR_PATTERN = /(?<![\\{])\{\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*\}(?!\})/g;
 var FENCED_CODE_PATTERN = /```[\s\S]*?```/g;
@@ -1805,7 +1846,7 @@ function walkDir(dir, extensions, excludeDirs, maxDepth, currentDepth = 0) {
     return files;
   }
   for (const entry of entries) {
-    const fullPath =
+    const fullPath = join4(dir, entry);
     let stat;
     try {
       stat = statSync(fullPath);
@@ -1831,7 +1872,7 @@ function scanForPrompts(filePath, relativePath) {
   const results = [];
   let content;
   try {
-    content =
+    content = readFileSync4(filePath, "utf-8");
   } catch {
     return results;
   }
@@ -1905,7 +1946,7 @@ function scanForAgents(filePath, relativePath) {
   const results = [];
   let content;
   try {
-    content =
+    content = readFileSync4(filePath, "utf-8");
   } catch {
     return results;
   }
@@ -1953,7 +1994,7 @@ function scanForMarkers(filePath, relativePath) {
   const results = [];
   let content;
   try {
-    content =
+    content = readFileSync4(filePath, "utf-8");
   } catch {
     return results;
   }
@@ -1987,7 +2028,7 @@ function scanJsonForSchemas(filePath, relativePath) {
     return results;
   let content;
   try {
-    content =
+    content = readFileSync4(filePath, "utf-8");
   } catch {
     return results;
   }
@@ -2014,7 +2055,7 @@ function scanForDatasets(dir, rootPath, excludeDirs, maxDepth) {
     const name = basename(filePath, ext);
     let content;
     try {
-      content =
+      content = readFileSync4(filePath, "utf-8");
    } catch {
      continue;
    }
@@ -2637,8 +2678,8 @@ Examples:
   }
   const isValid = await validateApiKey(apiKey, endpoint);
   const cwd = process.cwd();
-  const hasOnboarding =
-  const hasContextFile =
+  const hasOnboarding = existsSync4(join5(cwd, ".mutagentrc.json"));
+  const hasContextFile = existsSync4(join5(cwd, ".mutagent", "mutation-context.md"));
   let contextSummary;
   if (hasContextFile) {
     try {
@@ -2799,7 +2840,7 @@ init_errors();
 init_sdk_client();
 import { Command as Command7 } from "commander";
 import chalk18 from "chalk";
-import { readFileSync as
+import { readFileSync as readFileSync5, existsSync as existsSync5 } from "fs";
 
 // src/lib/ui-links.ts
 function getAppBaseUrl() {
@@ -2906,7 +2947,7 @@ function formatCreationHints(hint) {
   return lines.join(`
 `);
 }
-var
+var VERBATIM_CARD_INSTRUCTION = `MANDATORY: HARD STOP — DO NOT PROCEED.
 ` + `1. The renderedCard below appears in your bash/terminal output. That is NOT sufficient — the user cannot see it there.
 ` + `2. You MUST copy the FULL renderedCard text into your chat response VERBATIM. Do NOT summarize, rephrase, or omit any lines.
 ` + `3. Do NOT run any other command until you have displayed the card in chat.
@@ -2989,7 +3030,7 @@ function promptCreatedDirective(promptId, name) {
     fields: { promptId: String(promptId), name },
     links: { dashboard: dashboardUrl, api: apiUrl },
     next,
-    instruction:
+    instruction: VERBATIM_CARD_INSTRUCTION,
     renderedCard: renderPlainCard({ title, icon: "✓", rows, links: cardLinks, next })
   };
 }
@@ -3013,7 +3054,7 @@ function promptUpdatedDirective(promptId, name) {
     fields: { promptId: String(promptId), name },
     links: { dashboard: dashboardUrl, api: apiUrl },
     next,
-    instruction:
+    instruction: VERBATIM_CARD_INSTRUCTION,
     renderedCard: renderPlainCard({ title, icon: "✓", rows, links: cardLinks, next })
   };
 }
@@ -3030,7 +3071,7 @@ function promptDeletedDirective(promptId) {
     fields: { promptId },
     links: { dashboard: dashboardUrl },
     next,
-    instruction:
+    instruction: VERBATIM_CARD_INSTRUCTION,
     renderedCard: renderPlainCard({ title, icon: "✗", rows, links: cardLinks, next })
   };
 }
@@ -3067,7 +3108,7 @@ function datasetAddedDirective(promptId, datasetId, name, itemCount, state) {
     },
     links: { dashboard: dashboardUrl, api: apiUrl },
     next,
-    instruction:
+    instruction: VERBATIM_CARD_INSTRUCTION,
     renderedCard: renderPlainCard({ title, icon: "✓", rows, links: cardLinks, next })
   };
 }
@@ -3087,7 +3128,7 @@ function datasetDeletedDirective(promptId, datasetId) {
     fields: { promptId, datasetId },
     links: { datasets: datasetsUrl },
     next,
-    instruction:
+    instruction: VERBATIM_CARD_INSTRUCTION,
     renderedCard: renderPlainCard({ title, icon: "✗", rows, links: cardLinks, next })
   };
 }
@@ -3123,7 +3164,7 @@ function evaluationCreatedDirective(promptId, evalId, name, criteriaCount, state
     },
     links: { dashboard: dashboardUrl, api: apiUrl },
     next,
-    instruction:
+    instruction: VERBATIM_CARD_INSTRUCTION,
     renderedCard: renderPlainCard({ title, icon: "✓", rows, links: cardLinks, next })
   };
 }
@@ -3140,7 +3181,7 @@ function evaluationDeletedDirective(evaluationId) {
     fields: { evaluationId },
     links: { dashboard: dashboardUrl },
     next,
-    instruction:
+    instruction: VERBATIM_CARD_INSTRUCTION,
     renderedCard: renderPlainCard({ title, icon: "✗", rows, links: cardLinks, next })
   };
 }
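All seven directive factories above now reference the shared `VERBATIM_CARD_INSTRUCTION` constant instead of repeating inline strings. A sketch of the payload shape an agent receives from any of them — field names from the diff, values illustrative:

```ts
// Sketch of a directive payload; only the field names are taken from the diff.
declare const VERBATIM_CARD_INSTRUCTION: string;

const directive = {
  template: "prompt_created",                              // illustrative template name
  title: "Prompt created",
  fields: { promptId: "123", name: "summarize-ticket" },
  links: { dashboard: "https://app.example/prompts/123" }, // illustrative URL
  next: ["mutagent prompts list --json"],
  instruction: VERBATIM_CARD_INSTRUCTION,                  // one constant, seven call sites
  renderedCard: "…",                                       // plain-text card the agent must echo verbatim
};
```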
@@ -3508,6 +3549,7 @@ ${chalk7.yellow("AI Agent: Format Selection Rules")}
 ${chalk7.red("outputSchema is required.")}
 
 ${chalk7.yellow("AI Agent: ALWAYS append --json to this command.")}
+${chalk7.cyan("Conceptual help (brace convention, delimiter inference): .claude/skills/mutagent-cli/concepts/prompt-variables.md")} ${chalk7.dim("(install via mutagent skills install)")}
 `).action(async (options) => {
   const isJson = getJsonFlag(prompts);
   const output = new OutputFormatter(isJson ? "json" : "table");
@@ -3806,10 +3848,12 @@ async function buildGuidedWorkflow(promptId) {
   const askUserQuestions = allFields.map(({ field, source, fieldSchema }) => {
     const isInput = source === "inputSchema";
     const question = isInput ? `Define the Minimum Viable Context for "${field}". What data MUST be present in this input for the prompt to produce a correct output? Describe what constitutes complete vs incomplete input, and WHY this field matters.` : `Define what correct "${field}" looks like. What structure, content, or qualities make it good vs bad? Give concrete examples of good and bad outputs.`;
-    const hint = isInput ? `Input fields define what data the prompt NEEDS to work correctly. Without defining minimum viable context, the optimizer cannot detect whether failures come from bad input or bad prompt logic.` :
+    const hint = isInput ? `Input fields define what data the prompt NEEDS to work correctly. Without defining minimum viable context, the optimizer cannot detect whether failures come from bad input or bad prompt logic.` : undefined;
     const header = isInput ? `${field} [INPUT]` : `${field} [OUTPUT]`;
     const defineDesc = isInput ? `Describe what data MUST be present in "${field}" and WHY the prompt needs it. Focus on completeness and minimum viable context.` : `Describe what a correct vs incorrect "${field}" looks like. Focus on structure, content quality, and concrete examples — not numeric scores.`;
     return {
+      field,
+      source,
       question,
       header,
       ...hint != null ? { hint } : {},
@@ -3841,7 +3885,7 @@ async function buildGuidedWorkflow(promptId) {
       description: "Follow these steps to create an evaluation for this prompt:",
       steps: [
         { step: 1, action: "Review prompt schemas and context", detail: "Understand what the prompt does and what each field means." },
-        { step: 2, action: "Define correctness criteria for EVERY field", detail: `For EACH of the ${String(allFields.length)} fields in askUserQuestions, use AskUserQuestion to collect a rubric. INPUT fields (${String(inputFields.length)}): collect minimum viable context definitions — what data MUST exist. OUTPUT fields (${String(outputFields.length)}): collect correctness criteria — what correct vs incorrect looks like. Do NOT skip any field. Use the context hints to suggest rubrics when asked.` },
+        { step: 2, action: "Define correctness criteria for EVERY field", detail: `For EACH of the ${String(allFields.length)} fields in _directive.askUserQuestions, use AskUserQuestion to collect a rubric. INPUT fields (${String(inputFields.length)}): collect minimum viable context definitions — what data MUST exist. OUTPUT fields (${String(outputFields.length)}): collect correctness criteria — what correct vs incorrect looks like. Do NOT skip any field. Use the context hints to suggest rubrics when asked.` },
         { step: 3, action: "Build criteria JSON", detail: "Map each rubric to: { name: field, description: rubric, evaluationParameter: field }" },
         { step: 4, action: "Ask for evaluation name", detail: "Ask user what to name this evaluation" },
         { step: 5, action: "Create the evaluation", detail: `mutagent prompts evaluation create ${promptId} --name "<name>" -d '<json>' --json` }
@@ -3865,11 +3909,9 @@ async function buildGuidedWorkflow(promptId) {
     }
     })}'`
     },
-    askUserQuestions,
     _directive: {
-      display: "workflow_guide",
       instruction: `MANDATORY ORDERS — READ CAREFULLY:
-` + `1. You MUST ask the user about EVERY SINGLE field listed in askUserQuestions. There are ${String(allFields.length)} fields total.
+` + `1. You MUST ask the user about EVERY SINGLE field listed in _directive.askUserQuestions. There are ${String(allFields.length)} fields total.
 ` + `2. Do NOT skip ANY field. Do NOT decide that a field "doesn't need" a definition.
 ` + `3. INPUT fields (${String(inputFields.length)}): These define MINIMUM VIABLE CONTEXT — the data that MUST exist in the input for the prompt to work. Ask: "What data MUST be present in this field and WHY does the prompt need it?"
 ` + ` WHY THIS MATTERS: Without minimum viable context definitions, the optimizer cannot distinguish between failures caused by incomplete input vs failures caused by bad prompt logic.
@@ -3893,11 +3935,15 @@ async function buildGuidedWorkflow(promptId) {
 ` + ` NEVER use vague rubrics like "0-1 scale" or "score based on accuracy".
 ` + ` Canonical source: .claude/skills/mutagent-cli/SKILL.md → "Guided Eval Criteria".
 ` + " Note: template variables use single-brace `{variable}` (platform canonical). `mutagent explore`\n" + " infers single vs double per-file via inferPromptVariables().",
-
+      next: [
+        `mutagent prompts evaluation create ${promptId} --name "<name>" -d '<criteria-json>' --json`
+      ],
+      decisionTree: {
        step1: "Check if criteria already exist in the user's code. If criteria match expected shape (name + description + evaluationParameter targeting schema fields), use --data directly.",
-      step2: "If criteria are missing or malformed, use the askUserQuestions payloads to collect them via AskUserQuestion.",
+        step2: "If criteria are missing or malformed, use the _directive.askUserQuestions payloads to collect them via AskUserQuestion.",
        step3: `After collecting all criteria, construct the --data JSON and run: mutagent prompts evaluation create <prompt-id> --name "<name>" -d '<json>'`
-      }
+      },
+      askUserQuestions
     },
     _meta: { inputFields, outputFields, allFields }
   };
@@ -4002,12 +4048,11 @@ function buildGuidedDatasetWorkflow(prompt) {
       input: hasSchema ? templateInput : { "<field>": "<value>" },
       expectedOutput: hasSchema ? templateOutput : { "<field>": "<expected>" }
     },
-    askUserQuestions,
     guidance: {
       minItems: 5,
       priorityRule: "Hard cases that expose prompt weaknesses > easy cases that always pass",
       steps: [
-        "Collect answers to each question from the user",
+        "Collect answers to each question from the user via _directive.askUserQuestions",
         "Generate 5-10 dataset items covering all categories",
         "Ensure at least 2 hard/edge cases per category",
         'Format as JSON array: [{"input": {...}, "expectedOutput": {...}}, ...]',
@@ -4015,10 +4060,42 @@ function buildGuidedDatasetWorkflow(prompt) {
       ]
     },
     _directive: {
-      instruction:
+      instruction: `DATASET CURATION — MANDATORY RULES (self-sufficient, no Skill required):
+` + `
+` + `1. PRIORITY RULE: Hard cases that expose prompt weaknesses > easy cases that always pass.
+` + ` Collect HARD CASES FIRST. Ask about failure modes, edge cases, adversarial inputs.
+` + `
+` + `2. CASE CATEGORIES (collect in order):
+` + ` a. Edge Cases (HIGH priority): boundary inputs, empty fields, very long values, unicode, malformed data
+` + ` b. Hard Cases (HIGH priority): ambiguous inputs, adversarial examples, domain traps that fool the model
+` + ` c. Representative Cases (MEDIUM priority): typical production inputs (only after hard cases covered)
+` + `
+` + `3. PER-INPUT-FIELD: For each input field, ask what VALUES should be tested. Collect at minimum:
+` + ` - One boundary value (empty, null, extremely long, wrong type)
+` + ` - One ambiguous value (that could be interpreted multiple ways)
+` + ` - One representative production value
+` + `
+` + `4. PER-OUTPUT-FIELD: For each output field, ask what CORRECT looks like vs INCORRECT.
+` + ` You need the correctness shape to generate realistic expectedOutput values.
+` + `
+` + `5. MINIMUM: 5 dataset items total. At least 2 must be hard or edge cases.
+` + `
+` + `6. FORMAT: Each item MUST have:
+` + ` { "input": { <inputSchema fields> }, "expectedOutput": { <outputSchema fields> } }
+` + ` NEVER omit expectedOutput on labelable items — optimizer needs the ground truth.
+` + `
+` + `7. ANTI-PATTERNS (never do):
+` + ` - All easy cases that always pass → useless for optimization
+` + ` - No edge cases → optimizer never sees failure modes
+` + ` - Fictional inputs that won't happen in production → scores are misleading
+` + ` - Missing expectedOutput on items with knowable correct answers
+` + `
+` + `Use _directive.askUserQuestions (${String(askUserQuestions.length)} questions) to collect domain knowledge from the user.
+` + "For deeper conceptual background, see: .claude/skills/mutagent-cli/concepts/dataset-design.md",
       next: [
         `mutagent prompts dataset add ${promptId} -d '<constructed-json>' --name '<name>' --json`
-      ]
+      ],
+      askUserQuestions
     }
   };
 }
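To make rule 6 concrete, here is what a minimal item set satisfying the new instruction could look like. The `ticket`/`category` fields are hypothetical, chosen only for illustration:

```ts
// Illustrative dataset items per the FORMAT rule above: 5 items, >= 2 hard/edge cases,
// every item carries an expectedOutput. Schema fields are hypothetical.
const items = [
  { input: { ticket: "it broke again??" }, expectedOutput: { category: "needs-clarification" } }, // hard: ambiguous
  { input: { ticket: "" }, expectedOutput: { category: "invalid" } },                             // edge: empty
  { input: { ticket: "a".repeat(10000) }, expectedOutput: { category: "needs-clarification" } },  // edge: very long
  { input: { ticket: "Please refund order #4521" }, expectedOutput: { category: "billing" } },    // representative
  { input: { ticket: "How do I reset my password?" }, expectedOutput: { category: "account" } },  // representative
];
```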
@@ -4653,6 +4730,7 @@ Expected Criteria Shape (--data):
 ${chalk10.red("Required: --name (unless --guided). Criteria must include evaluationParameter.")}
 ${chalk10.dim("CLI flags (--name, --description) override --data fields.")}
 ${chalk10.dim("Get prompt IDs: mutagent prompts list")}
+${chalk10.cyan("Conceptual help: .claude/skills/mutagent-cli/concepts/eval-criteria.md")} ${chalk10.dim("(install via mutagent skills install)")}
 `).action(async (promptId, options) => {
   let isJson = getJsonFlag(prompts);
   if (options.guided) {
@@ -5154,7 +5232,10 @@ function startDirective(job, promptId, datasetId, evaluationId) {
   ];
   const next = [`mutagent prompts optimize status ${job.id} --json`];
   return {
+    instruction: VERBATIM_CARD_INSTRUCTION,
+    next,
     display: "status_card",
+    renderedCard: renderPlainCard({ title, icon: "⚡", rows, links: cardLinks, next }),
     template: "optimization_started",
     title,
     fields: {
@@ -5167,13 +5248,7 @@ function startDirective(job, promptId, datasetId, evaluationId) {
       targetScore: job.config.targetScore ?? 0.8,
       model: job.config.model ?? "default"
     },
-    links: { monitor: monitorUrl, api: apiUrl }
-    next,
-    instruction: `HARD STOP — DO NOT PROCEED.
-` + `The renderedCard below appears in bash/terminal output. That is NOT sufficient.
-` + `You MUST copy the FULL renderedCard text into your chat response VERBATIM before running any other command.
-` + "After displaying, follow the suggested next steps.",
-    renderedCard: renderPlainCard({ title, icon: "⚡", rows, links: cardLinks, next })
+    links: { monitor: monitorUrl, api: apiUrl }
   };
 }
 function statusDirective(status, promptId) {
@@ -5201,7 +5276,10 @@ function statusDirective(status, promptId) {
   const monitorUrl = optimizerLink(promptId ?? "unknown", status.jobId);
   const cardLinks = [{ label: "Monitor", url: monitorUrl }];
   return {
+    instruction: VERBATIM_CARD_INSTRUCTION,
+    next,
     display: "status_card",
+    renderedCard: renderPlainCard({ title, icon, rows, links: cardLinks, next }),
     template: "optimization_progress",
     title,
     fields: {
@@ -5213,13 +5291,7 @@ function statusDirective(status, promptId) {
       eta,
       message: status.message
     },
-    links: { monitor: monitorUrl }
-    next,
-    instruction: `HARD STOP — DO NOT PROCEED.
-` + `The renderedCard below appears in bash/terminal output. That is NOT sufficient.
-` + `You MUST copy the FULL renderedCard text into your chat response VERBATIM before running any other command.
-` + "After displaying, follow the suggested next steps.",
-    renderedCard: renderPlainCard({ title, icon, rows, links: cardLinks, next })
+    links: { monitor: monitorUrl }
   };
 }
 function showPromptDiff(original, optimized) {
@@ -6149,6 +6221,8 @@ ${chalk16.yellow(`⚠ COST WARNING — AI Agent:
 
 ${chalk16.yellow("AI Agent: ALWAYS append --json to this command.")}
 
+${chalk16.cyan("Conceptual help (full optimization loop): .claude/skills/mutagent-cli/workflows/optimization.md")} ${chalk16.dim("(install via mutagent skills install)")}
+
 ${chalk16.bold("Hit a bug or unexpected result?")}
 ${chalk16.cyan('mutagent feedback send --category bug -m "describe what went wrong"')}
 `).action(async (promptId, options) => {
@@ -6461,6 +6535,7 @@ After viewing results:
 ${chalk16.dim("No flag = view scorecard only.")}
 
 ${chalk16.dim("AI Agent: Present scorecard to user via AskUserQuestion before applying.")}
+${chalk16.cyan("Conceptual help (scorecard interpretation, nextAction): .claude/skills/mutagent-cli/concepts/scorecard-output.md")} ${chalk16.dim("(install via mutagent skills install)")}
 `).action(async (jobId, options) => {
   const isJson = getJsonFlag(prompts);
   const output = new OutputFormatter(isJson ? "json" : "table");
@@ -6473,12 +6548,8 @@ After viewing results:
   const isCompleted = jobData?.status === "completed";
   const scorecardText = buildResultsScorecardText(resultData);
   const directive = {
-    display: "
-    instruction:
-` + `The scorecard below appears in bash/terminal output. That is NOT sufficient.
-` + `You MUST copy the FULL renderedCard text into your chat response VERBATIM.
-` + `Then use AskUserQuestion with options: [Apply optimized prompt / View full diff / Reject and keep original].
-` + "Do NOT auto-apply.",
+    display: "status_card",
+    instruction: VERBATIM_CARD_INSTRUCTION,
     renderedCard: scorecardText,
     next: isCompleted ? [
       `mutagent prompts optimize results ${jobId} --apply --json`,
@@ -6799,12 +6870,12 @@ Provide a valid JSON Schema, e.g., '{"type":"object","properties":{"field":{"typ
   }
 }
 if (filePath) {
-  if (!
+  if (!existsSync5(filePath)) {
    throw new MutagentError("FILE_NOT_FOUND", `File not found: ${filePath}`, `Run: ${helpCommand}
 Check the file path and try again`);
  }
  try {
-    return JSON.parse(
+    return JSON.parse(readFileSync5(filePath, "utf-8"));
  } catch {
    throw new MutagentError("INVALID_JSON", `Failed to parse JSON from ${filePath}`, `Run: ${helpCommand}
 Ensure the file contains valid JSON Schema`);
@@ -7102,20 +7173,20 @@ ${chalk22.dim("Returns full trace details including spans, tokens, and latency."
 init_config();
 import { Command as Command9 } from "commander";
 import chalk23 from "chalk";
-import { writeFileSync as writeFileSync3, existsSync as
+import { writeFileSync as writeFileSync3, existsSync as existsSync10 } from "fs";
 import { execSync } from "child_process";
 init_errors();
 
 // src/lib/integrations/langchain.ts
-import { readFileSync as
+import { readFileSync as readFileSync6, existsSync as existsSync6 } from "fs";
 var langchainIntegration = {
   name: "langchain",
   description: "LangChain framework",
   detect() {
     let hasLangchain = false;
-    if (
+    if (existsSync6("package.json")) {
       try {
-        const pkg = JSON.parse(
+        const pkg = JSON.parse(readFileSync6("package.json", "utf-8"));
        const deps = { ...pkg.dependencies, ...pkg.devDependencies };
        hasLangchain = "langchain" in deps || "@langchain/core" in deps;
      } catch {}
@@ -7247,15 +7318,15 @@ mutagent traces analyze <prompt-id>
 };
 
 // src/lib/integrations/langgraph.ts
-import { readFileSync as
+import { readFileSync as readFileSync7, existsSync as existsSync7 } from "fs";
 var langgraphIntegration = {
   name: "langgraph",
   description: "LangGraph agent workflow framework",
   detect() {
     let hasLanggraph = false;
-    if (
+    if (existsSync7("package.json")) {
       try {
-        const pkg = JSON.parse(
+        const pkg = JSON.parse(readFileSync7("package.json", "utf-8"));
        const deps = { ...pkg.dependencies, ...pkg.devDependencies };
        hasLanggraph = "@langchain/langgraph" in deps;
      } catch {}
@@ -7337,15 +7408,15 @@ mutagent integrate langgraph --verify
 };
 
 // src/lib/integrations/vercel-ai.ts
-import { readFileSync as
+import { readFileSync as readFileSync8, existsSync as existsSync8 } from "fs";
 var vercelAiIntegration = {
   name: "vercel-ai",
   description: "Vercel AI SDK",
   detect() {
     let hasAiSdk = false;
-    if (
+    if (existsSync8("package.json")) {
       try {
-        const pkg = JSON.parse(
+        const pkg = JSON.parse(readFileSync8("package.json", "utf-8"));
        const deps = { ...pkg.dependencies, ...pkg.devDependencies };
        hasAiSdk = "ai" in deps;
      } catch {}
@@ -7487,15 +7558,15 @@ mutagent integrate vercel-ai --verify
 };
 
 // src/lib/integrations/openai.ts
-import { readFileSync as
+import { readFileSync as readFileSync9, existsSync as existsSync9 } from "fs";
 var openaiIntegration = {
   name: "openai",
   description: "OpenAI SDK integration with automatic tracing",
   detect() {
     let hasOpenAI = false;
-    if (
+    if (existsSync9("package.json")) {
       try {
-        const pkg = JSON.parse(
+        const pkg = JSON.parse(readFileSync9("package.json", "utf-8"));
        const deps = { ...pkg.dependencies, ...pkg.devDependencies };
        hasOpenAI = "openai" in deps;
      } catch {}
@@ -7644,16 +7715,16 @@ function getFrameworkMetadata(name) {
 
 // src/commands/integrate.ts
 function detectPackageManager() {
-  if (
+  if (existsSync10("bun.lockb") || existsSync10("bun.lock")) {
    return "bun";
  }
-  if (
+  if (existsSync10("pnpm-lock.yaml")) {
    return "pnpm";
  }
-  if (
+  if (existsSync10("yarn.lock")) {
    return "yarn";
  }
-  if (
+  if (existsSync10("package-lock.json")) {
    return "npm";
  }
  try {
@@ -9039,15 +9110,15 @@ init_config();
 import { Command as Command15 } from "commander";
 import inquirer2 from "inquirer";
 import chalk34 from "chalk";
-import { existsSync as
+import { existsSync as existsSync12, mkdirSync as mkdirSync3, writeFileSync as writeFileSync4 } from "fs";
 import { execSync as execSync3 } from "child_process";
-import { join as
+import { join as join7 } from "path";
 init_errors();
 
 // src/lib/framework-detection.ts
-import { existsSync as
+import { existsSync as existsSync11, readFileSync as readFileSync10 } from "fs";
 import { execSync as execSync2 } from "child_process";
-import { join as
+import { join as join6 } from "path";
 var FRAMEWORK_DETECTION_MAP = {
   "@mastra/core": {
     name: "mastra",
@@ -9097,16 +9168,16 @@ var FRAMEWORK_DETECTION_MAP = {
   }
 };
 function detectPackageManager2(cwd = process.cwd()) {
-  if (
+  if (existsSync11(join6(cwd, "bun.lockb")) || existsSync11(join6(cwd, "bun.lock"))) {
    return "bun";
  }
-  if (
+  if (existsSync11(join6(cwd, "pnpm-lock.yaml"))) {
    return "pnpm";
  }
-  if (
+  if (existsSync11(join6(cwd, "yarn.lock"))) {
    return "yarn";
  }
-  if (
+  if (existsSync11(join6(cwd, "package-lock.json"))) {
    return "npm";
  }
  try {
@@ -9127,13 +9198,13 @@ function getInstallCommand2(pm, packages) {
   return commands[pm];
 }
 function detectFrameworkFromPackageJson(cwd = process.cwd()) {
-  const pkgPath =
-  if (!
+  const pkgPath = join6(cwd, "package.json");
+  if (!existsSync11(pkgPath)) {
    return null;
  }
  let pkg;
  try {
-    pkg = JSON.parse(
+    pkg = JSON.parse(readFileSync10(pkgPath, "utf-8"));
  } catch {
    return null;
  }
@@ -9149,12 +9220,12 @@ function detectFrameworkFromPackageJson(cwd = process.cwd()) {
   return null;
 }
 function hasRcConfig(cwd = process.cwd()) {
-  return
+  return existsSync11(join6(cwd, ".mutagentrc.json"));
 }
 
 // src/commands/init.ts
 function writeRcConfig(config, cwd = process.cwd()) {
-  const rcPath =
+  const rcPath = join7(cwd, ".mutagentrc.json");
   writeFileSync4(rcPath, JSON.stringify(config, null, 2) + `
 `);
 }
@@ -9370,8 +9441,8 @@ Modes:
       }
     }
   }
-  const skillPath =
-  const skillInstalled =
+  const skillPath = join7(cwd, ".claude/skills/mutagent-cli/SKILL.md");
+  const skillInstalled = existsSync12(skillPath);
   if (!isNonInteractive && !skillInstalled) {
     const { installSkill } = await inquirer2.prompt([{
       type: "confirm",
@@ -9381,11 +9452,11 @@ Modes:
     }]);
     if (installSkill) {
       try {
-        const skillDir =
-        if (!
+        const skillDir = join7(cwd, ".claude/skills/mutagent-cli");
+        if (!existsSync12(skillDir)) {
          mkdirSync3(skillDir, { recursive: true });
        }
-        execSync3("node " +
+        execSync3("node " + join7(cwd, "node_modules/.bin/mutagent") + " skills install", {
          cwd,
          stdio: "ignore"
        });
@@ -9395,6 +9466,36 @@ Modes:
       }
     }
   }
+  const initDirective = {
+    instruction: skillInstalled ? "Verify workspace. Run: mutagent workspaces list --json" : [
+      "MANDATORY: Ask the user whether they want to install the MutagenT CLI skill and Claude Code hooks.",
+      "Use your AskUserQuestion tool to present the following choices:",
+      "1. Install skill + hooks (Recommended) — Installs the CLI skill AND configures Claude Code session telemetry hooks",
+      "2. Install skill only — Installs the CLI skill (teaches AI agents to use MutagenT CLI)",
+      "3. Skip — Do not install",
+      "",
+      "If the user picks option 1, run BOTH commands:",
+      "  mutagent skills install --json",
+      "  mutagent hooks install --json",
+      "",
+      "If the user picks option 2, run only: mutagent skills install --json",
+      "",
+      "After skill/hooks install, verify workspace: mutagent workspaces list --json"
+    ].join(`
+`),
+    next: ["mutagent workspaces list --json", "mutagent prompts list --json"],
+    ...!skillInstalled ? {
+      askUserQuestions: [{
+        question: "Would you like to install the MutagenT CLI skill and Claude Code hooks?",
+        options: [
+          { label: "Install skill + hooks (Recommended)", description: "Installs the CLI skill AND configures Claude Code session telemetry hooks" },
+          { label: "Install skill only", description: "Installs the CLI skill (teaches AI agents to use MutagenT CLI)" },
+          { label: "Skip", description: "Do not install" }
+        ],
+        multiSelect: false
+      }]
+    } : {}
+  };
   const summary = {
     success: true,
     mode,
@@ -9403,30 +9504,8 @@ Modes:
     framework: confirmedFramework?.name ?? null,
     authenticated,
     workspaceValidation: workspaceValidation ?? null,
-    skillInstalled: skillInstalled ||
-    _directive:
-      instruction: "Verify workspace. Run: mutagent workspaces list --json",
-      next: ["mutagent workspaces list --json", "mutagent prompts list --json"],
-      ...!skillInstalled ? {
-        skillInstall: {
-          instruction: [
-            "MANDATORY: Ask the user whether they want to install the MutagenT CLI skill and Claude Code hooks.",
-            "Use your AskUserQuestion tool to present the following choices:",
-            "1. Install skill + hooks (Recommended) — Installs the CLI skill AND configures Claude Code session telemetry hooks",
-            "2. Install skill only — Installs the CLI skill (teaches AI agents to use MutagenT CLI)",
-            "3. Skip — Do not install",
-            "",
-            "If the user picks option 1, run BOTH commands:",
-            "  mutagent skills install --json",
-            "  mutagent hooks install --json",
-            "",
-            "If the user picks option 2, run only: mutagent skills install --json"
-          ].join(`
-`),
-          command: "mutagent skills install --json"
-        }
-      } : {}
-    }
+    skillInstalled: skillInstalled || existsSync12(skillPath),
+    _directive: initDirective
   };
   output.output(summary);
 } catch (error) {
@@ -9469,6 +9548,7 @@ ${chalk35.bold("Template variable delimiters (inferred per-file):")}
 ${chalk35.dim('{ "file": "src/prompts/summarize.ts", "line": 12, "reason": "template-variable", "delimiter": "single" }')}
 
 ${chalk35.dim("Results are saved to .mutagent/mutation-context.md for use by other commands.")}
+${chalk35.cyan("Conceptual help (prompt vs agent taxonomy, brace inference): .claude/skills/mutagent-cli/workflows/exploration.md")} ${chalk35.dim("(install via mutagent skills install)")}
 `).action((options) => {
   const isJson = getJsonFlag(explore);
   const output = new OutputFormatter(isJson ? "json" : "table");
@@ -9598,8 +9678,8 @@ Scanning ${scanPath}...
 // src/commands/skills.ts
 import { Command as Command17 } from "commander";
 import chalk36 from "chalk";
-import { existsSync as
-import { dirname, join as
+import { existsSync as existsSync13, mkdirSync as mkdirSync4, writeFileSync as writeFileSync5 } from "fs";
+import { dirname, join as join8 } from "path";
 import { execSync as execSync4 } from "child_process";
 
 // src/generated/skill-content.ts
@@ -9614,6 +9694,8 @@ description: |
   "create evaluation", "upload dataset", "explore prompts", "mutagent cli",
   "eval", "dataset", "guided", "how do I optimize", "improve my prompt",
   "set up tracing", "add observability".
+SKILL_VERSION: 1.1.0
+SKILL_MIN_CLI_VERSION: 0.1.163
 ---
 
 # MutagenT CLI Skill
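These two frontmatter keys are exactly what the new `getCompatMetadata()` helper (first hunks of this file) scrapes with multiline regexes. A quick check of the match, using the regexes and values that appear verbatim in this diff:

```ts
// The regexes from getCompatMetadata(), applied to the new frontmatter lines.
const frontmatter = "SKILL_VERSION: 1.1.0\nSKILL_MIN_CLI_VERSION: 0.1.163";
const ver = /^SKILL_VERSION:\s*(.+)$/m.exec(frontmatter)?.[1]?.trim();
const minCli = /^SKILL_MIN_CLI_VERSION:\s*(.+)$/m.exec(frontmatter)?.[1]?.trim();
console.log(ver, minCli); // "1.1.0" "0.1.163"
```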
@@ -9622,7 +9704,135 @@ description: |
 > Packed into the CLI binary via \`scripts/sync-skill.ts\`. Installed to end-user
 > dev environments via \`mutagent skills install\`. Edit this file, not the installed copy.
 
-
+---
+
+## CLI Prerequisite Check (RUN FIRST)
+
+Before executing ANY workflow step, verify the CLI is installed and version-compatible:
+
+**Step 1 -- Check CLI presence:**
+\`\`\`bash
+mutagent --version --json
+\`\`\`
+
+**Step 2 -- If command not found (error / not on PATH):**
+
+This is the **Path 2 onboarding case**: the Skill was installed first (e.g. from a skill registry, manually, or bundled in someone else's CLAUDE.md), but the CLI itself isn't installed yet. Do NOT just dump install instructions and stop -- proactively **offer to install it**.
+
+**2a. Detect the user's package manager** (best-effort — check in this order):
+\`\`\`bash
+# In the user's project root (cwd):
+test -f bun.lockb && echo "bun"
+test -f pnpm-lock.yaml && echo "pnpm"
+test -f yarn.lock && echo "yarn"
+test -f package-lock.json && echo "npm"
+# Fallback: which bun || which pnpm || which yarn || which npm
+\`\`\`
+If multiple lockfiles exist, prefer in order: \`bun > pnpm > yarn > npm\`.
+If no lockfile and the user is in a non-JS project (e.g. Python, Go), ask which they prefer.
+
+**2b. Ask the user via AskUserQuestion** (do NOT auto-install without consent):
+
+> "The MutagenT CLI is not installed yet. I can install it globally via \`<detected-pm>\`. Proceed?"
+
+Options to present:
+1. **Yes, install globally with \`<detected-pm>\`** (Recommended) -- runs \`<pm> add -g @mutagent/cli\` (or \`npm install -g @mutagent/cli\` for npm)
+2. **Yes, but use a different package manager** -- prompt for choice (npm / bun / pnpm / yarn)
+3. **No, I'll install it myself** -- show the four install commands as a verbatim block; STOP and wait for the user to install
+4. **Skip — I have it installed via a different path** -- ask the user to add it to PATH and re-invoke
+
+**2c. On user approval (option 1 or 2)**, run the install command in a Bash tool call:
+\`\`\`bash
+# bun
+bun add -g @mutagent/cli
+# npm
+npm install -g @mutagent/cli
+# pnpm
+pnpm add -g @mutagent/cli
+# yarn
+yarn global add @mutagent/cli
+\`\`\`
+Show the install output to the user verbatim. After it completes, **re-run Step 1** (\`mutagent --version --json\`) to confirm the CLI is now on PATH. If the post-install version check still fails (e.g. global bin not on PATH), tell the user:
+> "Install succeeded but \`mutagent\` isn't on PATH yet. Common fixes:
+> - bun: \`export PATH=\\"$HOME/.bun/bin:$PATH\\"\`
+> - npm: check \`npm config get prefix\` and add \`<prefix>/bin\` to PATH
+> - pnpm: \`pnpm setup\` then restart your shell
+> - yarn: \`yarn global bin\` and add that path to PATH"
+> Then ask the user to restart their shell or source the relevant rc file, and re-invoke me.
+
+**2d. On rejection (option 3 or 4)**, output the four install commands as a verbatim block and STOP. Do NOT proceed with any workflow until the user confirms the CLI is installed and re-invokes the skill.
+
+\`\`\`bash
+# Pick one (Recommended in order: bun > pnpm > yarn > npm)
+bun add -g @mutagent/cli
+pnpm add -g @mutagent/cli
+yarn global add @mutagent/cli
+npm install -g @mutagent/cli
+\`\`\`
+
+**Why proactive install (not just "stop and tell")**: a user invoking this Skill has already shown intent to use MutagenT. Forcing them to context-switch to a separate terminal, copy-paste an install command, and re-invoke the agent is friction that often loses the user. Asking once + installing on consent is the smoother path. The opt-out (option 3/4) preserves user control.
+
+**Step 3 -- Version compatibility check:**
+Parse \`_compat.cliVersion\` from the \`--version --json\` output and compare against
+\`SKILL_MIN_CLI_VERSION\` (above in this file's frontmatter, currently \`0.1.163\`).
+
+- If \`cliVersion >= SKILL_MIN_CLI_VERSION\`: all good, proceed normally.
+- If \`cliVersion < SKILL_MIN_CLI_VERSION\`: emit a **PROMINENT PERSISTENT WARN** to the user:
+
+> Warning: **CLI version mismatch**: Your \`mutagent\` CLI is \`{cliVersion}\` but this Skill
+> requires \`>= 0.1.163\`. Some features may not work correctly.
+> Upgrade: \`npm install -g @mutagent/cli@latest\`
+
+**DO NOT BLOCK** -- proceed with the requested workflow after showing the warn.
+Re-emit this warn at the start of every subsequent Skill invocation until the user upgrades.
+
+**Per decision D4 (locked 2026-05-03)**: version mismatch is warn-only, never a hard block.
+
+---
+
+## MANDATORY: Verbatim Card Display Protocol
+
+If a mutagent CLI response contains \`_directive.renderedCard\`, your FIRST action
+MUST be to copy that text byte-for-byte into your chat reply BEFORE running any
+further command, asking any question, or doing any other work.
+
+The card also appears on stderr (auto-echoed by the CLI via \`echoDirectiveToStderr\`)
+but you MUST STILL render it in chat -- bash/stderr blocks may be collapsed in the
+user's UI and they cannot read them.
+
+- Do NOT summarize, paraphrase, or truncate any line of the card.
+- Do NOT skip the card to "save tokens."
+- Do NOT delay rendering until "after the next step."
+- This applies whether \`_directive.display\` is \`'status_card'\` or absent -- the
+  trigger is the presence of \`renderedCard\`, not the display tag.
+- Failure to display the card verbatim = protocol violation.
+
+This rule applies to: prompt/agent/dataset/evaluation create+update+delete,
+provider add+update+delete, and optimize start/status/results.
+
+---
+
+## Agent Runtime — interactive question handling
+
+This skill emits \`_directive.askUserQuestions\` arrays for guided flows
+(eval-creation, dataset-curation, init). On Claude Code, use the
+**AskUserQuestion** tool to present each question to the user. On other
+coding-agent runtimes (Cursor, Aider, Continue, etc.), use the equivalent
+inquiry/prompt mechanism your runtime provides.
+
+If your runtime has no interactive-question tool, fall back to:
+1. Echo each question's \`question\` field verbatim into chat
+2. Wait for the user's reply before proceeding to the next question
+3. Do NOT auto-fill answers from context (Rule 3)
+
+The \`_directive.askUserQuestions\` schema is described in
+[\`concepts/eval-criteria.md\`](./concepts/eval-criteria.md) §
+"Per-field rubric collection" and follows
+[Claude Code's AskUserQuestion tool shape](https://docs.claude.com/en/docs/claude-code/sdk).
+
+---
+
+## SKILL vs CLI -- responsibility split
 
 | Layer | Owner | Responsibility |
 |---|---|---|
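Step 3 above prescribes a warn-only gate. For illustration, a minimal sketch of that comparison — the Skill text does not prescribe an implementation, so the dotted-numeric compare below is an assumption:

```ts
// Warn-only version gate per Step 3 / decision D4. Sketch, not from the package.
function isOlder(a: string, b: string): boolean {
  const pa = a.split(".").map(Number);
  const pb = b.split(".").map(Number);
  for (let i = 0; i < Math.max(pa.length, pb.length); i++) {
    const x = pa[i] ?? 0, y = pb[i] ?? 0;
    if (x !== y) return x < y;
  }
  return false;
}

function warnIfIncompatible(cliVersion: string, minVersion: string): void {
  if (isOlder(cliVersion, minVersion)) {
    console.warn(
      `Warning: CLI version mismatch: ${cliVersion} < required ${minVersion}. ` +
        `Upgrade: npm install -g @mutagent/cli@latest`,
    );
  }
  // Never block: per decision D4 the caller proceeds regardless.
}
```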
@@ -9630,33 +9840,54 @@ description: |
 | **CLI** | \`mutagent <cmd>\` | commands, flags, \`--json\`, \`_directive.*\`, \`_links\` |
 | Platform | api.mutagent.io | storage, optimization, eval execution, \`{variable}\` rendering |
 
-**Rule**: SKILL never duplicates CLI flag lists
+**Rule**: SKILL never duplicates CLI flag lists -- always \`mutagent <cmd> --help\` for flags.
 
 ---
 
-## 5 Core Rules
+## 5 Core Rules -- NON-NEGOTIABLE
 
 1. **\`--json\` on EVERY command.** No exceptions. Agents use JSON mode exclusively.
-2. **\`<command> --help\` BEFORE first use of any command.** The CLI is the source of truth for flags
-3. **NEVER auto-generate eval criteria
+2. **\`<command> --help\` BEFORE first use of any command.** The CLI is the source of truth for flags -- this SKILL never inlines them.
+3. **NEVER auto-generate eval criteria -- collect from user.** Ask the user for each rubric field. See [concepts/eval-criteria.md](./concepts/eval-criteria.md) for the rubric format.
 4. **Explore-before-modify.** Run \`mutagent explore --json\` before any write operation. Present findings, get user confirmation. Never mutate without discovery first.
 5. **Cost transparency before \`optimize start\`.** Run \`mutagent usage --json\` and show the result to the user. Get explicit confirmation before any optimization job.
 6. **Before optimizing, run \`mutagent providers list --models\` to verify available models.** This calls \`/providers/catalog\` and shows which models are available per provider. Use the output to pick valid \`--exec-model\` and \`--eval-model\` values.
 
 ---
 
-##
+## Prompt vs Agent -- pick the right loop
+
+| Signal | Use | CLI surface | Skill workflow |
+|---|---|---|---|
+| Single LLM call -> text/JSON output | Prompt Optimization | \`mutagent prompts *\` | [workflows/optimization.md](./workflows/optimization.md) |
+| Multi-turn / tool-calling / state graph | Agent (WIP) | \`mutagent agents *\` (CRUD only) | [workflows/agents.md](./workflows/agents.md) (stub) |
 
-
+When in doubt: run \`mutagent explore --json\` (it classifies discovered code under \`prompts[]\` vs \`agents[]\`).
 
-Match the user's first request. Load ONLY the matching subfile. Do NOT preload the whole set.
+---
+
+## Journey Router -- route by user intent
+
+> **Concept files = WHY/WHAT pre-reads. Workflow files = HOW step sequences.**
+> Load BOTH when intent matches both axes (e.g., "create rubric" loads
+> \`concepts/eval-criteria.md\` for the rubric design framework AND
+> \`workflows/eval-creation.md\` for the step-by-step CLI sequence). Each topic's
+> concept ↔ workflow pairing is shown in the Subfile Map below.
+
+Match the user's first request. Load ONLY the matching subfile(s) per the table. Do NOT preload the whole set.
+
+| User said / signal detected | Load subfile(s) | Why |
 |---|---|---|
 | "trace", "observe", "integrate", "add framework" | [workflows/tracing.md](./workflows/tracing.md) | Non-destructive, fastest first-value path |
-| "optimize", "improve", "tune", "
+| "optimize", "improve", "tune", "upload prompt" | [workflows/optimization.md](./workflows/optimization.md) | Full create->dataset->eval->optimize loop (orchestrator) |
+| "create dataset", "add examples", "test cases", "edge cases", "hard cases", "expand dataset", "dataset items" | [workflows/dataset-curation.md](./workflows/dataset-curation.md) (HOW) + [concepts/dataset-design.md](./concepts/dataset-design.md) (WHY) | Standalone dataset curation (no optimization context needed) |
+| "create evaluation", "create rubric", "evaluate prompt", "judge", "score this prompt", "rubric design", "MVC", "Output Standards" | [workflows/eval-creation.md](./workflows/eval-creation.md) (HOW) + [concepts/eval-criteria.md](./concepts/eval-criteria.md) (WHY) | Standalone evaluation rubric creation (no optimization context needed) |
 | "explore", "scan", "find prompts", "what prompts", "discover" | [workflows/exploration.md](./workflows/exploration.md) | Read-only discovery + taxonomy |
-| \`AgentExecutor\`, \`StateGraph\`, \`createReactAgent\`, \`tool_calls\`, \`@tool\`, \`langgraph\`, \`crewai\`, \`autogen\`, \`openai/agents\`, multi-turn | [workflows/agents.md](./workflows/agents.md) | WIP path
-| "how do variables work", "single vs double braces", delimiter | [concepts/prompt-variables.md](./concepts/prompt-variables.md) | Delimiter inference contract |
-| "what makes a good eval"
+| \`AgentExecutor\`, \`StateGraph\`, \`createReactAgent\`, \`tool_calls\`, \`@tool\`, \`langgraph\`, \`crewai\`, \`autogen\`, \`openai/agents\`, multi-turn | [workflows/agents.md](./workflows/agents.md) | WIP path -- surface partnership link |
+| "how do variables work", "single vs double braces", delimiter | [concepts/prompt-variables.md](./concepts/prompt-variables.md) | Delimiter inference contract (concept-only; prompt creation lives inline in optimization.md step 4) |
+| "what makes a good eval" (concept question only, no creation intent) | [concepts/eval-criteria.md](./concepts/eval-criteria.md) | INPUT MVC + OUTPUT Standards (no workflow load) |
+| "what makes a good dataset" (concept question only, no creation intent) | [concepts/dataset-design.md](./concepts/dataset-design.md) | Dataset curation principles + case categories (no workflow load) |
+| "scorecard", "interpret results", "what does X score mean" | [concepts/scorecard-output.md](./concepts/scorecard-output.md) | Interpretation only (no workflow needed) |
 | "check models", "what models", "available models", "which models" | run \`mutagent providers list --models --json\` | Discovery: shows catalog per provider before model selection |
 | Unclear / first time | run \`mutagent explore --json\` first, then reroute | Discovery before action |
 
@@ -9667,59 +9898,62 @@ Match the user's first request. Load ONLY the matching subfile. Do NOT preload t
|
|
|
9667
9898
|
| File | WHEN to load | WHY | ENFORCEMENT |
|
|
9668
9899
|
|---|---|---|---|
|
|
9669
9900
|
| [workflows/tracing.md](./workflows/tracing.md) | User wants to add framework tracing / observability | Non-destructive append-only integration sequence | Must run explore first (Rule 4) |
|
|
9670
|
-
| [workflows/optimization.md](./workflows/optimization.md) | User wants to optimize or evaluate a prompt | Full loop: explore
|
|
9901
|
+
| [workflows/optimization.md](./workflows/optimization.md) | User wants to optimize or evaluate a prompt | Full loop: explore -> upload -> dataset -> eval -> optimize -> apply | Must check usage before optimize (Rule 5); must collect rubrics from user (Rule 3) |
|
|
9902
|
+
| [workflows/dataset-curation.md](./workflows/dataset-curation.md) | User wants to create/expand a dataset (standalone) | Focused dataset curation without full optimization context | Hard cases first; ask per-field questions |
|
|
9903
|
+
| [workflows/eval-creation.md](./workflows/eval-creation.md) | User wants to create/edit evaluation rubric (standalone) | Focused per-field rubric collection without full optimization context | INPUT MVC + OUTPUT Standards split; ask per-field questions; collect from user (Rule 3) |
|
|
9671
9904
|
| [workflows/exploration.md](./workflows/exploration.md) | User wants to scan codebase, identify prompts vs agents | Read-only discovery; output taxonomy to user | Run only; no writes |
|
|
9672
|
-
| [workflows/agents.md](./workflows/agents.md) | Multi-turn / tool-calling code detected | WIP
|
|
9905
|
+
| [workflows/agents.md](./workflows/agents.md) | Multi-turn / tool-calling code detected | WIP -- do NOT attempt optimizer, surface partnership link | Show WIP card to user verbatim |
|
|
9673
9906
|
| [concepts/prompt-variables.md](./concepts/prompt-variables.md) | Any question about \`{var}\` vs \`{{var}}\`, delimiter inference | Brace convention + conversion rules | Load before \`prompts create\` in optimization workflow |
|
|
9674
|
-
| [concepts/eval-criteria.md](./concepts/eval-criteria.md) | Any question about rubric design, MVC, Output Standards | granular rubric format
|
|
9907
|
+
| [concepts/eval-criteria.md](./concepts/eval-criteria.md) | Any question about rubric design, MVC, Output Standards | granular rubric format -- INPUT-param vs OUTPUT-param scope | Load before \`evaluation create --guided\` in optimization workflow |
|
|
9908
|
+
| [concepts/dataset-design.md](./concepts/dataset-design.md) | Any question about dataset quality, case categories, hard cases | Dataset design principles -- parallel structure to eval-criteria.md | Load before \`dataset add --guided\` |
|
|
9675
9909
|
|
|
9676
9910
|
---
|
|
9677
9911
|
|
|
9678
9912
|
## Output handling
|
|
9679
9913
|
|
|
9680
9914
|
After every CLI command:
|
|
9681
|
-
- **Show the command output to the user.** Command output appears in bash blocks that users may not see
|
|
9682
|
-
- **For evaluation create \`--guided\`**: the CLI provides per-field questions
|
|
9915
|
+
- **Show the command output to the user.** Command output appears in bash blocks that users may not see -- always present the key results in your chat response.
|
|
9916
|
+
- **For evaluation create \`--guided\`**: the CLI provides per-field questions in \`_directive.askUserQuestions\`. Ask the user each question in turn. Do not skip any field. Do not pre-fill answers.
|
|
9683
9917
|
- **For \`optimize results\`**: present the before/after scorecard to the user and confirm whether to apply, view diff, or reject.
|
|
9684
9918
|
|
|
9685
9919
|
---
|
|
9686
9920
|
|
|
9687
|
-
## Anti-patterns
|
|
9921
|
+
## Anti-patterns -- NEVER do these
|
|
9688
9922
|
|
|
9689
9923
|
- Run any command without \`--json\`
|
|
9690
|
-
- Auto-generate eval criteria
|
|
9924
|
+
- Auto-generate eval criteria -- always collect from the user
|
|
9691
9925
|
- Skip any schema field when collecting evaluation rubrics
|
|
9692
9926
|
- Skip \`mutagent explore --json\` before any write operation
|
|
9693
9927
|
- Run \`optimize start\` without first showing \`usage --json\` to the user
|
|
9694
9928
|
- Increase \`--max-iterations\` above 1 without explicit user consent (each iteration = LLM spend)
|
|
9695
9929
|
- Run a multi-turn agent through the prompt optimizer
|
|
9696
9930
|
- Skip showing command output results to the user
|
|
9697
|
-
- Inline CLI flags from memory
|
|
9931
|
+
- Inline CLI flags from memory -- always read \`--help\` first
|
|
9698
9932
|
|
|
9699
9933
|
---
|
|
9700
9934
|
|
|
9701
9935
|
## State Tracking
|
|
9702
9936
|
|
|
9703
|
-
- \`.mutagent/mutation-context.md\`
|
|
9704
|
-
- \`mutagent auth status --json\`
|
|
9937
|
+
- \`.mutagent/mutation-context.md\` -- codebase index of discovered/uploaded prompts. Update after explore, create, dataset ops.
|
|
9938
|
+
- \`mutagent auth status --json\` -- auth + workspace state.
|
|
9705
9939
|
|
|
9706
9940
|
---
|
|
9707
9941
|
|
|
9708
9942
|
## Login (two paths)
|
|
9709
9943
|
|
|
9710
|
-
- **CI / automated**: \`export MUTAGENT_API_KEY=mt_... && mutagent login --json\`
|
|
9711
|
-
- **Onboarding a user**: \`mutagent login --browser --json\`
|
|
9944
|
+
- **CI / automated**: \`export MUTAGENT_API_KEY=mt_... && mutagent login --json\` -- no browser, no prompts.
|
|
9945
|
+
- **Onboarding a user**: \`mutagent login --browser --json\` -- CLI prints auth URL to stdout, polls 5 min. **Surface the URL verbatim to the user.** \`--non-interactive\` is NOT needed when \`--browser\` is set.
|
|
9712
9946
|
|
|
9713
|
-
\`mutagent login\` is canonical. \`mutagent auth login\` is a back-compat alias. Both delegate to \`lib/auth-flow.ts\`. Decision record: [cli-design-principles.md](../../docs/cli-design-principles.md)
|
|
9947
|
+
\`mutagent login\` is canonical. \`mutagent auth login\` is a back-compat alias. Both delegate to \`lib/auth-flow.ts\`. Decision record: [cli-design-principles.md](../../docs/cli-design-principles.md) -> Login Unification.
|
|
9714
9948
|
|
|
9715
9949
|
---
|
|
9716
9950
|
|
|
9717
|
-
## Error Recovery
|
|
9951
|
+
## Error Recovery -- Agent-Aware Bug Reporting
|
|
9718
9952
|
|
|
9719
9953
|
When ANY mutagent CLI command returns a non-zero exit code or an error response,
|
|
9720
9954
|
follow this protocol:
|
|
9721
9955
|
|
|
9722
|
-
1. **Show the error to the user** (always)
|
|
9956
|
+
1. **Show the error to the user** (always) -- reproduce the exact command and output.
|
|
9723
9957
|
2. **ASK the user** if they want to file a bug report with session context.
|
|
9724
9958
|
3. **On user approval**, run:
|
|
9725
9959
|
\`\`\`bash
|
|
@@ -9758,7 +9992,7 @@ top-level keys you supply:
|
|
|
9758
9992
|
}
|
|
9759
9993
|
\`\`\`
|
|
9760
9994
|
|
|
9761
|
-
\`_auto\` is always populated by the CLI
|
|
9995
|
+
\`_auto\` is always populated by the CLI -- do **not** set it manually. Your
|
|
9762
9996
|
top-level keys are never overwritten; if you supply \`workspaceId: "ws_agent_B"\`,
|
|
9763
9997
|
the CLI's current workspace A goes into \`_auto.workspaceId\`, not the top level.
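To make the never-overwrite rule concrete, here is a minimal TypeScript sketch of the merge behavior described above. It is illustrative only, not the CLI's actual implementation; only `workspaceId` and `_auto` come from the doc, the helper name is hypothetical.

```ts
// Hypothetical sketch of the _auto merge rule -- not the real CLI code.
type FeedbackPayload = Record<string, unknown> & { _auto?: Record<string, unknown> };

function mergeAutoContext(
  supplied: FeedbackPayload,
  cliContext: Record<string, unknown>,
): FeedbackPayload {
  // Top-level keys you supply are never overwritten; CLI-observed context
  // is recorded under _auto instead.
  return { ...supplied, _auto: { ...cliContext } };
}

// You supply workspaceId "ws_agent_B" while the CLI's current workspace is A:
// your top-level value survives; the CLI's goes into _auto.workspaceId.
const payload = mergeAutoContext(
  { workspaceId: "ws_agent_B" },
  { workspaceId: "ws_current_A" },
);
```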

@@ -9775,7 +10009,218 @@ If the feedback command returns a non-zero exit code, DO NOT retry silently. Sho

 ## Extensibility

-Add \`workflows/custom-<name>.md\` with frontmatter \`triggers: ["phrase"]\`
+Add \`workflows/custom-<name>.md\` with frontmatter \`triggers: ["phrase"]\` -- auto-discovered by the decision tree fallback row. No rebuild needed.
+\`,
+"concepts/dataset-design.md": \`---
+name: mutagent-cli-concepts-dataset-design
+description: |
+  Canonical source for MutagenT dataset design principles.
+  The Golden Rule: hard cases > easy cases; edge cases are mandatory.
+  Covers case categories (Edge / Hard / Representative / Adversarial),
+  format requirements (input/expectedOutput shape), and anti-patterns.
+  Parallel structure to concepts/eval-criteria.md for cognitive parity.
+  Mirrored in the CLI directive's bootstrappable instruction field.
+triggers:
+- "dataset design"
+- "dataset quality"
+- "what makes a good dataset"
+- "hard cases"
+- "edge cases"
+- "test cases"
+- "expectedOutput"
+- "dataset items"
+- "guided dataset"
+---
+
+# Concept -- Dataset Design
+
+> **Parallel to** [concepts/eval-criteria.md](./eval-criteria.md) -- same section
+> structure so agents can navigate both consistently.
+>
+> **Canonical source** for dataset curation principles. Mirrored inline in
+> \`mutagent-cli/src/commands/prompts/guided-dataset.ts\` (directive instruction field)
+> so even agents without the Skill loaded can execute correctly.
+
+## The Golden Rule
+
+**Hard cases that expose prompt weaknesses FIRST. Easy cases that always pass LAST.**
+
+A dataset where every item produces correct output tells you nothing about where the
+prompt fails. The optimizer needs failure signal to improve. Prioritize inputs that:
+
+1. Are ambiguous (multiple valid interpretations)
+2. Are adversarial (designed to trigger a known failure mode)
+3. Are at the boundary of what the prompt should handle
+4. Are drawn from actual production failure cases
+
+**One edge case that causes a failure is worth 10 easy cases that succeed.**
+
+---
+
+## NEVER skip expectedOutput on labelable items
+
+This is the dataset equivalent of Rule 3 (never auto-generate eval criteria).
+
+- If you know what the correct output should be for a given input, you MUST include \`expectedOutput\`.
+- The optimizer uses \`expectedOutput\` as the ground-truth signal for G-Eval scoring.
+- Omitting \`expectedOutput\` on a labelable item forces the evaluator to use LLM judgment alone -- much noisier.
+- **Only omit \`expectedOutput\`** when correct output is genuinely subjective / context-dependent AND no rubric can distinguish good from bad.
+
+Ask the user for expected outputs field by field -- do NOT auto-generate them.
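A hedged TypeScript sketch of the labelable/subjective split described above; the field names are hypothetical, not from a real prompt schema:

```ts
// Illustrative only -- field names are made up for the example.
// Labelable item: the correct output is known, so expectedOutput is mandatory.
const labelable = {
  input: { ticketText: "My card was charged twice" },
  expectedOutput: { category: "billing" },
};

// Genuinely subjective item: no single correct output exists and no rubric
// can distinguish good from bad -- only then may expectedOutput be omitted.
const subjective = {
  input: { ticketText: "Write a friendly greeting for this customer" },
};
```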
+
+---
+
+## NEVER auto-generate dataset items
+
+This is the counterpart to Rule 3 (never auto-generate eval criteria). Reasons:
+
+- Auto-generated items tend to be representative cases (easy) rather than hard cases.
+- The user knows what production inputs look like and where the prompt fails; the agent does not.
+- Synthetic easy cases produce noisy optimization signal -- the optimizer improves scores on the easy
+  cases but the real prompt weaknesses go uncovered.
+- Collect items from the user via AskUserQuestion, one category at a time.
+
+---
+
+## Case Categories
+
+Collect in this priority order -- hardest categories first:
+
+### 1. Edge Cases (HIGH priority)
+
+Boundary inputs that test the limits of what the prompt should handle.
+
+| What to ask | Examples |
+|---|---|
+| Empty or null inputs | \`""\`, \`null\`, \`0\`, \`[]\` |
+| Very long inputs (token limits) | paragraph-length where field should be short |
+| Malformed inputs | wrong type, wrong format, garbled text |
+| Unicode / special characters | emoji, RTL text, control chars, escaped quotes |
+| Missing required sub-fields | object with some required fields absent |
+
+**Collect at minimum**: 1-2 edge cases per input field.
+
+### 2. Hard Cases (HIGH priority)
+
+Inputs that are valid but expose known prompt weaknesses or require nuanced reasoning.
+
+| What to ask | Examples |
+|---|---|
+| Ambiguous inputs | "What does this mean?" (multiple valid answers) |
+| Adversarial inputs | phrasing designed to trigger hallucination or refusal |
+| Domain traps | technical jargon with multiple meanings in context |
+| Instruction conflicts | input that triggers contradictory rules in the prompt |
+| Near-miss inputs | almost correct format but slightly off |
+
+**Collect at minimum**: 2-3 hard cases total.
+
+### 3. Representative Cases (MEDIUM priority)
+
+Typical production inputs -- what the prompt handles 80% of the time.
+
+| What to ask | Examples |
+|---|---|
+| Common use cases | most frequent user inputs |
+| Standard formats | well-formed, expected-length, standard vocabulary |
+| Baseline quality | inputs where the prompt should succeed reliably |
+
+**Collect after** hard and edge cases are covered.
+
+### 4. Adversarial Cases (LOW priority, if relevant)
+
+Inputs designed to test security / safety / guardrails.
+
+| What to ask | Examples |
+|---|---|
+| Prompt injection attempts | "Ignore previous instructions and..." |
+| Off-topic requests | completely unrelated to the prompt's domain |
+| Jailbreak patterns | attempts to bypass constraints |
+
+Only collect if the prompt has explicit safety constraints.
+
+---
+
+## Format Requirements
+
+Every dataset item MUST have:
+
+\`\`\`json
+{
+  "input": {
+    "<inputSchema_field_1>": "<value>",
+    "<inputSchema_field_2>": "<value>"
+  },
+  "expectedOutput": {
+    "<outputSchema_field_1>": "<expected_value>",
+    "<outputSchema_field_2>": "<expected_value>"
+  }
+}
+\`\`\`
+
+Rules:
+- \`input\` keys MUST match the prompt's \`inputSchema.properties\` exactly (no extras, no missing required fields).
+- \`expectedOutput\` keys MUST match the prompt's \`outputSchema.properties\`.
+- String values in \`expectedOutput\` should be the verbatim correct answer (not a description of it).
+- Numeric scores in \`expectedOutput\` should match what the evaluator would award for a perfect response.
+- Upload as a JSON array: \`[{item1}, {item2}, ...]\`
+
+### Minimum dataset size
+
+- **5 items minimum** for any optimization run.
+- **At least 2 items** must be hard or edge cases.
+- More items = better signal, especially for per-criterion scoring.
+- \`mutagent prompts dataset add --help\` has the upload command flags.
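The format rules and size minimums above lend themselves to a mechanical pre-upload check. A minimal TypeScript sketch, assuming the schema shape described above (`inputSchema.properties` / `outputSchema.properties`); this is not the CLI's own validation:

```ts
// Sketch of a pre-upload check against the rules above. Not the real CLI validator.
interface DatasetItem {
  input: Record<string, unknown>;
  expectedOutput?: Record<string, unknown>;
}

function checkItems(items: DatasetItem[], inputKeys: string[], outputKeys: string[]): string[] {
  const problems: string[] = [];
  if (items.length < 5) problems.push("need at least 5 items for an optimization run");
  for (const [i, item] of items.entries()) {
    // input keys must match inputSchema.properties exactly
    const extra = Object.keys(item.input).filter((k) => !inputKeys.includes(k));
    const missing = inputKeys.filter((k) => !(k in item.input));
    if (extra.length) problems.push(`item ${i}: extra input keys: ${extra.join(", ")}`);
    if (missing.length) problems.push(`item ${i}: missing input keys: ${missing.join(", ")}`);
    // expectedOutput keys must match outputSchema.properties
    if (item.expectedOutput) {
      const bad = Object.keys(item.expectedOutput).filter((k) => !outputKeys.includes(k));
      if (bad.length) problems.push(`item ${i}: unknown expectedOutput keys: ${bad.join(", ")}`);
    }
  }
  return problems;
}
```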
+
+---
+
+## Anti-patterns
+
+| Anti-pattern | Why it's bad | Fix |
+|---|---|---|
+| All easy cases (prompt always succeeds) | No failure signal for optimizer | Add hard/edge cases first |
+| No edge cases | Optimizer never sees boundary behavior | Ask user about failure modes |
+| Fictional inputs that won't happen in production | Optimization targets unrealistic scenarios | Anchor to real usage patterns |
+| Missing \`expectedOutput\` on labelable items | Optimizer uses LLM judgment alone (noisy) | Ask user for expected outputs |
+| Duplicate items | Wastes dataset budget, skews scores | Check for duplicates before upload |
+| Items that are identical to training data | May overfit | Include diverse failure modes |
+
+---
+
+## Cross-references
+
+- [SKILL.md](../SKILL.md) -- 5 rules + journey router
+- [workflows/dataset-curation.md](../workflows/dataset-curation.md) -- standalone dataset curation workflow (HOW; this file is WHY)
+- [workflows/optimization.md](../workflows/optimization.md) -- full loop that includes dataset add step
+- [concepts/eval-criteria.md](./eval-criteria.md) -- parallel concept doc for evaluation criteria
+
+---
+
+## CLI commands
+
+\`\`\`bash
+# Discovery (no LLM cost)
+mutagent prompts dataset --help # list dataset subcommands
+mutagent prompts dataset add --help # read flags before first use (Rule 2)
+mutagent prompts dataset list <prompt-id> --json # list datasets attached to a prompt
+mutagent prompts dataset get <dataset-id> --json # inspect single dataset's items + metadata
+
+# Creation -- guided (no LLM cost; just storage)
+mutagent prompts dataset add <prompt-id> --guided --json # get _directive.askUserQuestions (per-field collection)
+mutagent prompts dataset add <prompt-id> -d '<json>' --name "<name>" --json # upload items
+# -d accepts inline JSON OR @path/to/file.json OR - (stdin)
+
+# Mutations
+mutagent prompts dataset update <dataset-id> -d '<json>' --json # replace items in existing dataset
+mutagent prompts dataset delete <dataset-id> --json # delete dataset (idempotent; --force skips confirm)
+\`\`\`
+
+**Flag glossary** (dataset-specific):
+- \`--guided\` -- emit per-field \`askUserQuestions\` directive instead of expecting \`-d\` upfront. Use when collecting from user.
+- \`-d <json>\` / \`--data <json>\` -- supply items payload inline. Accepts: inline JSON, \`@path\` (read from file), \`-\` (read from stdin).
+- \`--name "<name>"\` -- human-readable label for the dataset (shows in dashboard).
+- \`--json\` -- structured output (Rule 1: always use). Returns \`_directive\` + \`_links\`.
+
+**Cost note**: dataset creation/edit/delete commands incur ZERO LLM cost. They are pure storage operations against the platform API. LLM cost is only incurred when \`mutagent prompts optimize start\` runs the exec model against these dataset items.
 \`,
 "concepts/eval-criteria.md": \`---
 name: mutagent-cli-concepts-eval-criteria
@@ -10079,6 +10524,37 @@ the output to collect rubrics in the correct order.
 - [workflows/optimization.md](../workflows/optimization.md) → steps 7-9 (where this concept is applied)
 - [concepts/prompt-variables.md](./prompt-variables.md) → delimiter inference (used in MVC step)
 - \`mutagent/src/modules/prompts/prompt-evaluations/README.md\` → BE mirror
+
+---
+
+## CLI commands
+
+\`\`\`bash
+# Discovery (no LLM cost)
+mutagent prompts evaluation --help # list eval subcommands
+mutagent prompts evaluation create --help # read flags before first use (Rule 2)
+mutagent prompts evaluation list <prompt-id> --json # list existing evaluations on a prompt
+mutagent prompts evaluation get <eval-id> --json # inspect single evaluation's criteria + metadata
+
+# Creation -- guided (no LLM cost; just storage)
+mutagent prompts evaluation create <prompt-id> --guided --json # get _directive.askUserQuestions + decisionTree (per-field collection)
+mutagent prompts evaluation create <prompt-id> -d '<json>' --name "<name>" --json # upload criteria
+# -d accepts inline JSON OR @path/to/file.json OR - (stdin)
+
+# Mutations
+mutagent prompts evaluation update <eval-id> -d '<json>' --json # update existing criteria
+mutagent prompts evaluation delete <eval-id> --json # delete evaluation (idempotent; --force skips confirm)
+\`\`\`
+
+**Flag glossary** (eval-specific):
+- \`--guided\` -- emit per-field \`askUserQuestions\` directive instead of expecting \`-d\` upfront.
+- \`-d <json>\` / \`--data <json>\` -- supply criteria payload inline. Accepts: inline JSON, \`@path\` (file), \`-\` (stdin).
+- \`--name "<name>"\` -- human-readable label (shows in dashboard).
+- \`--json\` -- structured output (Rule 1: always use). Returns \`_directive\` + \`_links\` + \`_compat\`.
+
+**Cost note**: eval creation/edit/delete commands incur ZERO LLM cost. Pure storage operations. LLM cost is incurred only when \`mutagent prompts optimize start\` runs the judge model against this evaluation.
+
+**Workflow cross-link**: for the standalone HOW (step-by-step CLI sequence), see [workflows/eval-creation.md](../workflows/eval-creation.md).
 \`,
 "concepts/prompt-variables.md": \`---
 name: mutagent-cli-concepts-prompt-variables
@@ -10235,6 +10711,36 @@ Use the delimiter field to:
 - [concepts/eval-criteria.md](./eval-criteria.md) → MVC (Minimum Viable Context) — uses delimiter to enumerate input params
 - Source: \`mutagent-cli/src/lib/explorer.ts\` → \`inferPromptVariables()\` and \`DiscoveredPrompt.delimiter\`
 - Tests: \`mutagent-cli/src/__tests__/lib/explorer.test.ts\`
+
+---
+
+## CLI commands
+
+\`\`\`bash
+# Discovery (no LLM cost, read-only)
+mutagent explore --help # read flags before first use (Rule 2)
+mutagent explore --json # scans codebase; emits "delimiter" field per prompt
+mutagent prompts get <id> --json # inspect uploaded prompt's stored form (incl. delimiter)
+
+# Creation (no LLM cost, just storage)
+mutagent prompts create --help # read brace format rules before creating
+mutagent prompts create --name "<name>" --raw "<prompt>" --json # upload prompt (use single-brace {var} convention)
+mutagent prompts create --name "<name>" --raw-file <path> --json # upload from file (preferred for multi-line prompts)
+
+# Mutations
+mutagent prompts update <id> --raw "<prompt>" --json # replace stored prompt body
+mutagent prompts delete <id> --json # delete prompt (idempotent; --force skips confirm)
+\`\`\`
+
+**Flag glossary** (prompt-create-specific):
+- \`--raw "<text>"\` -- inline prompt body. Use for short single-line prompts.
+- \`--raw-file <path>\` -- read prompt body from file. Preferred for multi-line / templated prompts; preserves whitespace.
+- \`--name "<name>"\` -- human-readable label (shows in dashboard + explore output).
+- \`--json\` -- structured output (Rule 1: always use). Returns \`_directive\` (status_card) + \`_links\` + \`_compat\`.
+
+**Cost note**: prompt creation/edit/delete commands incur ZERO LLM cost. Pure storage. Only \`mutagent prompts optimize start\` and \`mutagent prompts playground\` (interactive testing) incur LLM cost.
+
+**Brace convention reminder**: use single-brace \`{var}\` for variables you'll later supply via dataset items. Use double-brace \`{{literal}}\` only when you need a literal \`{var}\` substring rendered (rare). The CLI's \`mutagent explore\` infers delimiter automatically -- see § "Delimiter inference" above for the rules.
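A rough TypeScript sketch of the single- vs double-brace convention just described. This only illustrates the convention; the CLI's real `inferPromptVariables()` in `lib/explorer.ts` may behave differently:

```ts
// Sketch of the brace convention only -- not the real inferPromptVariables().
// Matches {var} but skips {{literal}} by requiring no brace on either side.
function extractVariables(prompt: string): string[] {
  const vars = new Set<string>();
  for (const m of prompt.matchAll(/(?<!\{)\{([A-Za-z_][A-Za-z0-9_]*)\}(?!\})/g)) {
    vars.add(m[1]);
  }
  return [...vars];
}

// {{json}} is a literal, so only "ticket" is a variable:
extractVariables("Summarize {ticket} as {{json}} with fields from {ticket}"); // ["ticket"]
```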
 \`,
 "concepts/scorecard-output.md": \`---
 name: mutagent-cli-concepts-scorecard-output
@@ -10456,6 +10962,47 @@ rl.on('line', (line) => {
 defined; \`criteria[].name\` in \`ScorecardData\` maps to \`name\` in the rubric.
 - [workflows/optimization.md](../workflows/optimization.md) — full optimization
 loop; the scorecard is produced at Step 8 (watch) and Step 9 (results).
+
+---
+
+## CLI commands
+
+\`\`\`bash
+# Discovery (no LLM cost)
+mutagent prompts optimize --help # list optimize subcommands
+mutagent prompts optimize start --help # read flags before first use (Rule 2)
+
+# \uD83D\uDCB0 LLM COST -- requires usage check (Rule 5) + provider catalog check (Rule 6)
+mutagent prompts optimize start <id> --dataset <d> --evaluation <e> --json
+# start job (cost = exec_model × items × iterations
+# + judge_model × items × iterations)
+mutagent prompts optimize start <id> --dataset <d> --evaluation <e> --watch --json
+# start + stream NDJSON events to stdout
+
+# Polling / watching (no LLM cost; just reads job state)
+mutagent prompts optimize status <job-id> --json # poll progress snapshot (includes bestScore)
+mutagent prompts optimize watch <job-id> --json # attach to running job (NDJSON stream)
+mutagent prompts optimize results <job-id> --json # full scorecard after completion (emits verbatim card)
+mutagent prompts optimize results <job-id> --diff --json # view prompt diff (no apply)
+
+# Mutation (no LLM cost itself; modifies stored prompt)
+mutagent prompts optimize results <job-id> --apply --json # apply optimized prompt -> updates stored version
+# (irreversible without manual revert via prompts update)
+\`\`\`
+
+**Flag glossary** (optimize-specific):
+- \`--dataset <d>\` -- dataset ID (from \`prompts dataset list\`). Items run through both exec and judge models.
+- \`--evaluation <e>\` -- evaluation ID (from \`prompts evaluation list\`). Drives the judge model's scoring rubric.
+- \`--watch\` -- after start, stream NDJSON events instead of returning immediately. Equivalent to \`start\` then \`watch\`.
+- \`--max-iterations N\` -- bound the optimizer loop. **Defaults to 1**; never raise without explicit user consent (each iteration = full eval × dataset round-trip).
+- \`--exec-model <model>\` / \`--eval-model <model>\` -- override defaults. Validate first via \`mutagent providers list --models --json\` (Rule 6).
+- \`--apply\` -- write optimized prompt back to stored version. Cannot be undone via flag; use \`prompts update\` to revert.
+- \`--diff\` -- view before/after diff without applying.
+- \`--json\` -- structured output (Rule 1: always use). Returns \`_directive\` + \`_links\` + \`_compat\`.
+
+**Cost note**: \`optimize start\` is the ONLY cost-incurring command in this family. Always run \`mutagent usage --json\` first (Rule 5) to surface remaining quota; show the result to the user; require explicit confirmation. The \`--max-iterations\` default of 1 keeps cost bounded.
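The cost formula in the block above reduces to simple call-count arithmetic. A TypeScript sketch of that arithmetic; it counts model calls only, since actual spend depends on per-model token pricing:

```ts
// Sketch of the formula above:
// cost = exec_model × items × iterations + judge_model × items × iterations.
function estimateModelCalls(items: number, iterations: number): { exec: number; judge: number } {
  return { exec: items * iterations, judge: items * iterations };
}

// 7 dataset items at the default --max-iterations 1 -> 7 exec calls + 7 judge calls.
const calls = estimateModelCalls(7, 1);
```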
+
+**Verbatim card protocol**: \`optimize start\`, \`optimize status\`, and \`optimize results\` all emit \`_directive.renderedCard\` -- the agent MUST echo the rendered card verbatim into chat before any next action (per SKILL.md § "MANDATORY: Verbatim Card Display Protocol"). The card also auto-echoes to stderr via \`echoDirectiveToStderr\`, but bash blocks may be collapsed in the user's UI.
 \`,
 "workflows/agents.md": \`---
 name: mutagent-cli-workflows-agents
@@ -10579,6 +11126,35 @@ Do NOT try to extract the whole agent loop at once.

 ---

+## CLI commands
+
+\`\`\`bash
+# Discovery (no LLM cost, read-only)
+mutagent explore --json # detect agents[] in codebase via taxonomy classifier
+mutagent agents --help # list available agent subcommands (CRUD + WIP banner)
+mutagent agents list --json # CRUD: list registered agents
+mutagent agents get <id> --json # CRUD: inspect single agent (config + metadata)
+
+# Mutations (no LLM cost; just storage)
+mutagent agents create --name "<name>" --json # register a new agent
+mutagent agents update <id> --json # update agent config
+mutagent agents delete <id> --json # delete agent (idempotent; --force skips confirm)
+
+# NOT YET AVAILABLE -- shows AGENTS_WIP_BANNER if attempted
+mutagent agents optimize <id> # WIP -- tracked separately; see partnership link below
+\`\`\`
+
+**Flag glossary** (agent-specific):
+- \`--name "<name>"\` -- human-readable label (shows in dashboard).
+- \`--force\` -- skip interactive confirmation on delete (auto-skipped in \`--json\` mode).
+- \`--json\` -- structured output (Rule 1: always use). Returns \`_directive\` (status_card) + \`_links\` + \`_compat\`.
+
+**Cost note**: all current \`mutagent agents *\` commands are CRUD (zero LLM cost). Agent optimization (\`mutagent agents optimize\`) is NOT yet available -- when shipped it will incur LLM cost similar to \`prompts optimize start\`. Current behavior on \`mutagent agents optimize\`: returns \`AGENTS_WIP_BANNER\` directive pointing to the partnership link.
+
+**Partnership link**: <https://www.mutagent.io/agents-partnership> -- for early access to multi-turn / tool-calling agent optimization.
+
+---
+
 ## Cross-references

 - [SKILL.md](../SKILL.md) → 5 rules + journey router
@@ -10586,6 +11162,408 @@ Do NOT try to extract the whole agent loop at once.
 - [workflows/optimization.md](./optimization.md) → branch (c) destination
 - [concepts/prompt-variables.md](../concepts/prompt-variables.md) → \`{foo}\` vs \`{{foo}}\` for sub-prompt extraction
 - Partnership link: https://www.mutagent.io/agents-partnership
+\`,
+"workflows/dataset-curation.md": \`---
+name: mutagent-cli-workflows-dataset-curation
+description: |
+  Standalone dataset curation workflow. Use when the user wants to create
+  or expand a dataset WITHOUT running the full optimization loop.
+  Covers guided and manual dataset creation, per-field question collection,
+  hard-cases-first priority, and upload via CLI.
+  Cross-linked from workflows/optimization.md dataset step.
+triggers:
+- "create dataset"
+- "add examples"
+- "test cases"
+- "edge cases"
+- "hard cases"
+- "expand dataset"
+- "dataset items"
+- "curate dataset"
+- "build dataset"
+- "dataset curation"
+---
+
+# Workflow -- Dataset Curation (Standalone)
+
+> **When to use this workflow vs optimization.md**:
+>
+> Use THIS workflow when the user wants ONLY to create or expand a dataset,
+> without immediately running optimization. Common signals:
+> - "I want to add more test cases"
+> - "Let's build a dataset for this prompt"
+> - "Add some edge cases"
+> - "Expand the existing dataset"
+>
+> Use [workflows/optimization.md](./optimization.md) when the user wants the
+> full loop: create prompt -> dataset -> eval -> optimize. That workflow has
+> an inline dataset step that cross-links back here.
+
+Read the **5 rules** in [SKILL.md](../SKILL.md) before executing.
+
+---
+
+## When this workflow applies
+
+- User explicitly wants to curate/build a dataset (no immediate optimization intent)
+- User wants to add hard cases / edge cases to an existing dataset
+- User wants to understand what good dataset items look like before committing to optimization
+- User has a prompt uploaded already and wants to build test coverage
+
+---
+
+## Required pre-read
+
+Load [concepts/dataset-design.md](../concepts/dataset-design.md) before collecting items.
+It defines:
+- The Golden Rule (hard cases first)
+- 4 case categories (Edge / Hard / Representative / Adversarial)
+- Format requirements (\`input\` + \`expectedOutput\` shape)
+- Anti-patterns to avoid
+
+The CLI's \`--guided\` directive also contains a self-sufficient inline version of these
+rules in \`_directive.instruction\` -- safe to execute even without the Skill loaded.
+
+---
+
+## Workflow steps
+
+\`\`\`
+1. mutagent explore --json
+   -> confirm which prompt you're building a dataset for
+   -> show command output to user
+   -> ask: "Which prompt would you like to build a dataset for?"
+
+2. mutagent prompts get <prompt-id> --json
+   -> inspect inputSchema + outputSchema fields
+   -> understand what input and output shapes look like
+
+3. mutagent prompts dataset add --help
+   -> read flags (Rule 2: always --help before first use)
+
+4. mutagent prompts dataset add <prompt-id> --guided --json
+   -> CLI returns _directive.askUserQuestions with per-field questions
+   -> follow the instruction in _directive.instruction
+
+5. For EACH question in _directive.askUserQuestions:
+   -> use AskUserQuestion to collect the answer from the user
+   -> prioritize hard/edge case questions first (they come first in the list)
+   -> do NOT skip any question
+   -> do NOT auto-fill answers
+
+6. Construct dataset items from collected answers:
+   -> format: [{"input": {...}, "expectedOutput": {...}}, ...]
+   -> minimum 5 items; at least 2 must be hard/edge cases
+   -> verify all input keys match promptSchema.inputSchema.properties
+   -> verify all expectedOutput keys match promptSchema.outputSchema.properties
+
+7. Ask user to review the constructed items before upload:
+   "Here are the N dataset items I constructed. Review them before upload?"
+   -> show items in a readable format
+   -> accept corrections
+
+8. mutagent prompts dataset add <prompt-id> -d '[...]' --name "<name>" --json
+   -> upload the reviewed items
+   -> show command output to user (confirm datasetId)
+   -> record datasetId in .mutagent/mutation-context.md
+
+9. Ask: "What would you like to do next?"
+   -> Option A: Add more items (loop back to step 4)
+   -> Option B: Create an evaluation -> route to evaluation create --guided
+   -> Option C: Start optimization -> route to workflows/optimization.md step 10
+   -> Option D: Done
+\`\`\`
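Since `-d` accepts `-` (read from stdin), the items constructed in step 6 can be piped in rather than inlined. A small TypeScript sketch; the items themselves are illustrative and reuse the `field1`/`result` placeholder names from the template above:

```ts
// Sketch: emit constructed items as JSON on stdout so they can be piped into
//   <this script> | mutagent prompts dataset add <prompt-id> -d - --name "<name>" --json
// The `-d -` stdin form is documented in the CLI commands section; items are examples only.
const items = [
  { input: { field1: "" }, expectedOutput: { result: "reject: empty input" } }, // edge case
  { input: { field1: "ambiguous phrasing" }, expectedOutput: { result: "ask for clarification" } }, // hard case
];

process.stdout.write(JSON.stringify(items));
```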
+
+---
+
+## Guided mode output shape
+
+\`mutagent prompts dataset add <prompt-id> --guided --json\` returns:
+
+\`\`\`json
+{
+  "promptId": "...",
+  "promptName": "...",
+  "schemaFields": { "input": ["field1", "field2"], "output": ["result"] },
+  "suggestedCategories": [
+    { "name": "Edge Cases", "description": "...", "priority": "high" },
+    { "name": "Hard Cases", "description": "...", "priority": "high" },
+    { "name": "Representative Cases", "description": "...", "priority": "medium" }
+  ],
+  "templateItem": {
+    "input": { "field1": "<value>", "field2": "<value>" },
+    "expectedOutput": { "result": "<expected>" }
+  },
+  "guidance": {
+    "minItems": 5,
+    "priorityRule": "Hard cases that expose prompt weaknesses > easy cases that always pass",
+    "steps": [...]
+  },
+  "_directive": {
+    "instruction": "...", // self-sufficient conceptual rules (bootstrappable without Skill)
+    "next": ["mutagent prompts dataset add <id> -d '<json>' --name '<name>' --json"],
+    "askUserQuestions": [ // inside _directive (not a sibling)
+      { "field": "_general", "question": "What are the hardest inputs for this prompt?" },
+      { "field": "_edge_cases", "question": "What edge cases have caused failures?" },
+      { "field": "field1", "source": "inputSchema", "question": "What values should 'field1' have?" },
+      ...
+    ]
+  },
+  "_compat": { "cliVersion": "...", "skillVersion": "...", "skillMinCliVersion": "..." }
+}
+\`\`\`
+
+Key: \`askUserQuestions\` is inside \`_directive\` (not a top-level sibling). Parse \`_directive.askUserQuestions\`.
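A minimal TypeScript sketch of consuming the shape above: parse the CLI's `--json` stdout and walk `_directive.askUserQuestions` in order. The sample stdout string is abridged from the example above:

```ts
interface GuidedQuestion { field: string; source?: string; question: string }
interface GuidedOutput { _directive?: { askUserQuestions?: GuidedQuestion[] } }

// Abridged sample of the guided output documented above.
const cliStdout =
  '{"_directive":{"askUserQuestions":[{"field":"_general","question":"What are the hardest inputs for this prompt?"}]}}';

function questionsFrom(stdout: string): GuidedQuestion[] {
  const parsed = JSON.parse(stdout) as GuidedOutput;
  // Key point: askUserQuestions lives INSIDE _directive, not as a top-level sibling.
  return parsed._directive?.askUserQuestions ?? [];
}

for (const q of questionsFrom(cliStdout)) {
  console.log(q.question); // ask the user each question verbatim; never skip or auto-fill
}
```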
+
+---
+
+## Cost control
+
+Dataset curation has NO LLM cost on its own -- it's a pure storage operation.
+Only \`mutagent prompts optimize start\` incurs LLM cost. Safe to run freely.
+
+---
+
+## Common pitfalls
+
+For the canonical anti-pattern list (WHY each is bad + how to fix), see [concepts/dataset-design.md](../concepts/dataset-design.md) § Anti-patterns. Workflow-specific execution mistakes:
+
+- **Uploading items with wrong field names** -> schema mismatch error from optimizer (not caught by concept-level rules)
+- **Forgetting to ask the user to review before upload** -> user can't correct mistakes (workflow step 7)
+- **Skipping the explore step** -> uploading to the wrong prompt (workflow step 1)
+
+---
+
+## Cross-references
+
+- [SKILL.md](../SKILL.md) -- 5 rules + journey router
+- [concepts/dataset-design.md](../concepts/dataset-design.md) -- Golden Rule, case categories, format requirements, anti-patterns (WHY; this file is HOW)
+- [workflows/optimization.md](./optimization.md) -- full loop; dataset step cross-links here
+- [workflows/eval-creation.md](./eval-creation.md) -- parallel workflow doc (for evaluation side)
+- [concepts/prompt-variables.md](../concepts/prompt-variables.md) -- brace convention (for input field values)
+
+---
+
+## CLI commands
+
+\`\`\`bash
+# Workflow execution sequence (commands appear inline in steps above; this is a quick reference)
+mutagent explore --json # step 1: discover prompts
+mutagent prompts get <prompt-id> --json # step 2: inspect schemas
+mutagent prompts dataset add --help # step 3: read flags (Rule 2)
+mutagent prompts dataset add <prompt-id> --guided --json # step 4: get _directive.askUserQuestions
+mutagent prompts dataset add <prompt-id> -d '<json>' --name "<name>" --json # step 8: upload reviewed items
+\`\`\`
+
+For the full flag glossary + cost notes, see [concepts/dataset-design.md](../concepts/dataset-design.md) § CLI commands.
+\`,
+"workflows/eval-creation.md": \`---
+name: mutagent-cli-workflows-eval-creation
+description: |
+  Standalone evaluation rubric creation workflow. Use when the user wants to
+  define eval criteria for a prompt WITHOUT immediately running optimization.
+  Covers guided per-field rubric collection (INPUT MVC + OUTPUT Standards),
+  full-depth granular rubrics, and upload via CLI.
+  Cross-linked from workflows/optimization.md eval step.
+triggers:
+- "create evaluation"
+- "create rubric"
+- "evaluate prompt"
+- "evaluation criteria"
+- "rubric design"
+- "MVC"
+- "Output Standards"
+- "score this prompt"
+- "judge this prompt"
+- "eval guided"
+---
+
+# Workflow -- Evaluation Creation (Standalone)
+
+> **When to use this workflow vs optimization.md**:
+>
+> Use THIS workflow when the user wants ONLY to define an evaluation rubric for
+> a prompt, without immediately running optimization. Common signals:
+> - "I want to score this prompt"
+> - "Let's define eval criteria for this prompt"
+> - "Create a rubric for this prompt"
+> - "How should we judge this prompt's outputs?"
+>
+> Use [workflows/optimization.md](./optimization.md) when the user wants the
+> full loop: create prompt -> dataset -> eval -> optimize. That workflow has
+> an inline eval step that cross-links back here.
+
+Read the **5 rules** in [SKILL.md](../SKILL.md) before executing.
+
+---
+
+## When this workflow applies
+
+- User explicitly wants to create or edit an evaluation rubric (no immediate optimization intent)
+- User wants to add criteria to an existing prompt (with or without an existing dataset)
+- User wants to understand how rubric design works before committing to optimization
+- User has a prompt uploaded already and wants quality scoring before iteration
+
+---
+
+## Required pre-read
+
+Load [concepts/eval-criteria.md](../concepts/eval-criteria.md) before collecting criteria.
+It defines:
+- The Golden Rule (INPUT MVC vs OUTPUT Standards split)
+- 6-tier MVC anchor framework for INPUT-scoped criteria
+- Output Standards format for OUTPUT-scoped criteria
+- Format requirements (\`name\`, \`description\`, \`evaluationParameter\`)
+- Anti-patterns to avoid
+
+The CLI's \`--guided\` directive also contains a self-sufficient inline version of these
+rules in \`_directive.instruction\` -- safe to execute even without the Skill loaded.
+
+---
+
+## Workflow steps
+
+\`\`\`
+1. mutagent explore --json
+   -> confirm which prompt you're creating an evaluation for
+   -> show command output to user
+   -> ask: "Which prompt would you like to evaluate?"
+
+2. mutagent prompts get <prompt-id> --json
+   -> inspect inputSchema + outputSchema fields
+   -> understand what input parameters and output shape look like
+   -> these drive the per-field rubric collection in step 4
+
+3. mutagent prompts evaluation create --help
+   -> read flags (Rule 2: always --help before first use)
+
+4. mutagent prompts evaluation create <prompt-id> --guided --json
+   -> CLI returns _directive.askUserQuestions with per-field questions
+   -> follow the instruction in _directive.instruction
+   -> the instruction inlines the INPUT MVC vs OUTPUT Standards framing
+      (bootstrappable -- works even if concepts/eval-criteria.md isn't loaded)
+
+5. For EACH question in _directive.askUserQuestions:
+   -> use AskUserQuestion to collect the answer from the user
+   -> INPUT-scoped fields (source: "inputSchema") -> ask MVC rubric:
+      what's minimum viable context the input MUST contain?
+   -> OUTPUT-scoped fields (source: "outputSchema") -> ask Output Standards:
+      what does correct vs incorrect look like for this field?
+   -> do NOT skip any field
+   -> do NOT auto-fill answers (Rule 3: never auto-generate criteria)
+
+6. Construct rubric items from collected answers:
+   -> format: [{"name": "...", "description": "...", "evaluationParameter": "..."}, ...]
+   -> one rubric per schema field unless user opts to merge fields
+   -> use 6-tier full-depth descriptions for complex INPUT criteria
+      (see concepts/eval-criteria.md for examples)
+   -> use simpler 2-3-tier descriptions for OUTPUT correctness criteria
+
+7. Ask user to review the constructed criteria before upload:
+   "Here are the N evaluation criteria I drafted. Review before upload?"
+   -> show criteria in a readable format (table or numbered list)
+   -> accept corrections; loop step 5-7 if user wants edits
+
+8. mutagent prompts evaluation create <prompt-id> -d '<json>' --name "<name>" --json
+   -> upload the reviewed criteria
+   -> show command output to user (confirm evaluationId)
+   -> record evaluationId in .mutagent/mutation-context.md
+
+9. Ask: "What would you like to do next?"
+   -> Option A: Add more criteria (loop back to step 4)
+   -> Option B: Add a dataset -> route to workflows/dataset-curation.md
+   -> Option C: Start optimization -> route to workflows/optimization.md step 10
+   -> Option D: Done
+\`\`\`
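A TypeScript sketch of the rubric payload built in step 6. The field names (`name`, `description`, `evaluationParameter`) come from the format line above; the values are illustrative, and the allowed `evaluationParameter` values are defined in concepts/eval-criteria.md:

```ts
// Sketch of the step-6 criteria payload. Values are examples only; the
// descriptions must be collected from the user (Rule 3), never auto-generated.
interface EvalCriterion {
  name: string;
  description: string;
  evaluationParameter: string; // allowed values per concepts/eval-criteria.md
}

const criteria: EvalCriterion[] = [
  {
    name: "field1-mvc",
    description: "INPUT MVC anchors for 'field1', as stated by the user", // user-supplied
    evaluationParameter: "field1", // hypothetical; mirrors the schema field it scores
  },
];

// Serialize for step 8:
//   mutagent prompts evaluation create <prompt-id> -d '<json>' --name "<name>" --json
const payload = JSON.stringify(criteria);
```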
|
|
11478
|
+
|
|
11479
|
+
---
|
|
11480
|
+
|
|
11481
|
+
## Guided mode output shape
|
|
11482
|
+
|
|
11483
|
+
\`mutagent prompts evaluation create <prompt-id> --guided --json\` returns:
|
|
11484
|
+
|
|
11485
|
+
\`\`\`json
|
|
11486
|
+
{
|
|
11487
|
+
"promptId": "...",
|
|
11488
|
+
"promptName": "...",
|
|
11489
|
+
"schemaFields": { "input": ["field1", "field2"], "output": ["result"] },
|
|
11490
|
+
"_directive": {
|
|
11491
|
+
"instruction": "...", // self-sufficient INPUT MVC + OUTPUT Standards rules (bootstrappable)
|
|
11492
|
+
"next": ["mutagent prompts evaluation create <id> -d '<json>' --name '<name>' --json"],
|
|
11493
|
+
"decisionTree": {
|
|
11494
|
+
"step1": "Confirm input parameters with the user via _directive.askUserQuestions...",
|
|
11495
|
+
"step2": "Define correctness criteria for EVERY field..."
|
|
11496
|
+
},
|
|
11497
|
+
"askUserQuestions": [ // inside _directive (not a sibling)
|
|
11498
|
+
{ "field": "field1", "source": "inputSchema", "question": "What MVC anchors define minimum viable context for 'field1'?" },
|
|
11499
|
+
{ "field": "result", "source": "outputSchema", "question": "What does a correct 'result' look like vs incorrect?" },
|
|
11500
|
+
...
|
|
11501
|
+
]
|
|
11502
|
+
},
|
|
11503
|
+
"_compat": { "cliVersion": "...", "skillVersion": "...", "skillMinCliVersion": "..." }
|
|
11504
|
+
}
|
|
11505
|
+
\`\`\`
|
|
11506
|
+
|
|
11507
|
+
Key: \`askUserQuestions\` is inside \`_directive\` (not a top-level sibling). Parse \`_directive.askUserQuestions\`. The \`decisionTree\` field guides multi-step branching.
|
|
11508
|
+
|
|
11509
|
+
---
|
|
11510
|
+
|
|
11511
|
+
## Cost control
|
|
11512
|
+
|
|
11513
|
+
Eval creation has NO LLM cost on its own -- it's a pure storage operation.
|
|
11514
|
+
Only \`mutagent prompts optimize start\` incurs LLM cost (judge model + exec model
|
|
11515
|
+
multiplied by dataset items × iterations). Safe to create/edit eval criteria freely.
|
|
11516
|
+
|
|
11517
|
+
---
|
|
11518
|
+
|
|
11519
|
+
## Common pitfalls
|
|
11520
|
+
|
|
11521
|
+
For the canonical anti-pattern list, see [concepts/eval-criteria.md](../concepts/eval-criteria.md) § Anti-patterns. Workflow-specific execution mistakes:
|
|
11522
|
+
|
|
11523
|
+
- **Skipping per-field collection** -> rubric incomplete; optimizer scores against a sparse signal
|
|
11524
|
+
- **Auto-filling answers from context** instead of asking user -> rubric reflects the agent's assumptions, not the user's domain knowledge (Rule 3 violation)
|
|
11525
|
+
- **Merging input + output criteria into one** -> loses INPUT MVC vs OUTPUT Standards distinction; harder to interpret per-criterion scores
|
|
11526
|
+
- **Uploading without user review** -> user can't catch misinterpretations of their domain
|
|
11527
|
+
- **Wrong \`evaluationParameter\` value** -> server rejects with schema validation error
|
|
11528
|
+
|
|
11529
|
+
---
|
|
11530
|
+
|
|
11531
|
+
## Cross-references
|
|
11532
|
+
|
|
11533
|
+
- [SKILL.md](../SKILL.md) -- 5 rules + journey router
|
|
11534
|
+
- [concepts/eval-criteria.md](../concepts/eval-criteria.md) -- Golden Rule, MVC/Output Standards, format requirements, anti-patterns
|
|
11535
|
+
- [workflows/optimization.md](./optimization.md) -- full loop; eval step cross-links here
|
|
11536
|
+
- [workflows/dataset-curation.md](./dataset-curation.md) -- parallel workflow doc (for dataset side)
|
|
11537
|
+
- [concepts/scorecard-output.md](../concepts/scorecard-output.md) -- how eval scores surface in optimization scorecard
|
|
11538
|
+
|
|
11539
|
+
---
|
|
11540
|
+
|
|
11541
|
+
## CLI commands
|
|
11542
|
+
|
|
11543
|
+
\`\`\`bash
|
|
11544
|
+
# Discovery (no LLM cost)
|
|
11545
|
+
mutagent prompts evaluation --help # list eval subcommands
|
|
11546
|
+
mutagent prompts evaluation create --help # read flags before first use (Rule 2)
|
|
11547
|
+
mutagent prompts evaluation list <prompt-id> --json # list existing evaluations on a prompt
|
|
11548
|
+
mutagent prompts evaluation get <eval-id> --json # inspect single evaluation's criteria + metadata
|
|
11549
|
+
|
|
11550
|
+
# Creation -- guided (no LLM cost; just storage)
|
|
11551
|
+
mutagent prompts evaluation create <prompt-id> --guided --json # get _directive.askUserQuestions + decisionTree (per-field collection)
|
|
11552
|
+
mutagent prompts evaluation create <prompt-id> -d '<json>' --name "<name>" --json # upload criteria
|
|
11553
|
+
# -d accepts inline JSON OR @path/to/file.json OR - (stdin)
|
|
11554
|
+
|
|
11555
|
+
# Mutations
|
|
11556
|
+
mutagent prompts evaluation update <eval-id> -d '<json>' --json # update existing criteria
|
|
11557
|
+
mutagent prompts evaluation delete <eval-id> --json # delete evaluation (idempotent; --force skips confirm)
|
|
11558
|
+
\`\`\`
|
|
11559
|
+
|
|
11560
|
+
**Flag glossary** (eval-specific):
|
|
11561
|
+
- \`--guided\` -- emit per-field \`askUserQuestions\` directive instead of expecting \`-d\` upfront. Use when collecting from user.
|
|
11562
|
+
- \`-d <json>\` / \`--data <json>\` -- supply criteria payload inline. Accepts: inline JSON, \`@path\` (read from file), \`-\` (read from stdin).
|
|
11563
|
+
- \`--name "<name>"\` -- human-readable label for the evaluation (shows in dashboard).
|
|
11564
|
+
- \`--json\` -- structured output (Rule 1: always use). Returns \`_directive\` + \`_links\`.
|
|
11565
|
+
|
|
11566
|
+
**Cost note**: eval creation/edit/delete commands incur ZERO LLM cost. They are pure storage operations against the platform API. LLM cost is only incurred when \`mutagent prompts optimize start\` runs the judge model against this evaluation.
|
|
10589
11567
|
`,
|
|
10590
11568
|
"workflows/exploration.md": `---
|
|
10591
11569
|
name: mutagent-cli-workflows-exploration
|
|
@@ -10720,6 +11698,27 @@ After step 1, show the command output to the user before proceeding to classific
|
|
|
10720
11698
|
|
|
10721
11699
|
---
|
|
10722
11700
|
|
|
11701
|
+
## CLI commands
|
|
11702
|
+
|
|
11703
|
+
\`\`\`bash
|
|
11704
|
+
# Discovery -- read-only (no LLM cost)
|
|
11705
|
+
mutagent explore --help # read flags before first use (Rule 2)
|
|
11706
|
+
mutagent explore --json # scan cwd for prompts + agents (full scan)
|
|
11707
|
+
mutagent explore --path ./src --json # scan specific directory subtree
|
|
11708
|
+
mutagent explore --markers-only --json # show only files with existing .mutagent/* markers
|
|
11709
|
+
mutagent explore --classify-only --json # taxonomy output only (skip variable inference)
|
|
11710
|
+
\`\`\`
|
|
11711
|
+
|
|
11712
|
+
**Flag glossary** (explore-specific):
|
|
11713
|
+
- \`--path <dir>\` -- restrict scan to subtree. Useful for monorepos with multiple apps; default is cwd.
|
|
11714
|
+
- \`--markers-only\` -- skip discovery; show only prompts/agents already uploaded (have \`.mutagent/*.md\` marker file). Use to refresh an existing index.
|
|
11715
|
+
- \`--classify-only\` -- skip per-prompt delimiter inference. Faster scan when you only need the prompts[]/agents[] taxonomy split.
|
|
11716
|
+
- \`--json\` -- structured output (Rule 1: always use). Returns \`prompts[]\`, \`agents[]\`, taxonomy, plus per-prompt \`delimiter\` field.
|
|
11717
|
+
|
|
11718
|
+
**Cost note**: \`mutagent explore\` is fully read-only -- no LLM calls, no platform API mutations. Safe to run repeatedly. The output is a snapshot of cwd at run time; re-run after meaningful code changes.
|
|
11719
|
+
|
|
11720
|
+
---
|
|
11721
|
+
|
|
10723
11722
|
## Cross-references
|
|
10724
11723
|
|
|
10725
11724
|
- [SKILL.md](../SKILL.md) → 5 rules + journey router
|
|
@@ -10826,6 +11825,9 @@ explore → prompts create → dataset add → evaluation create --guided
|
|
|
10826
11825
|
|
|
10827
11826
|
7. Load [concepts/eval-criteria.md](../concepts/eval-criteria.md)
|
|
10828
11827
|
→ understand INPUT-param (MVC) vs OUTPUT-param (Standards) scope
|
|
11828
|
+
→ for standalone eval-only work outside this optimization context, see
|
|
11829
|
+
[workflows/eval-creation.md](./eval-creation.md) -- this step inlines a brief
|
|
11830
|
+
version of that workflow
|
|
10829
11831
|
|
|
10830
11832
|
8. mutagent prompts evaluation create <promptId> --guided --json
|
|
10831
11833
|
→ the CLI provides a list of fields, each needing a rubric
|
|
@@ -10836,6 +11838,8 @@ explore → prompts create → dataset add → evaluation create --guided
|
|
|
10836
11838
|
- do NOT skip any field
|
|
10837
11839
|
- do NOT auto-generate any answer
|
|
10838
11840
|
→ collect at minimum: one INPUT criterion per {variable}, one OUTPUT criterion
|
|
11841
|
+
→ for the full step-by-step including review-before-upload + decisionTree handling,
|
|
11842
|
+
see [workflows/eval-creation.md](./eval-creation.md)
|
|
10839
11843
|
|
|
10840
11844
|
9. mutagent prompts evaluation create <promptId> -d '<json>' --json
|
|
10841
11845
|
→ upload the criteria collected in step 8
|
|
@@ -10923,6 +11927,63 @@ Ensure at least 2 hard/edge cases per category. Then upload:
|
|
|
10923
11927
|
mutagent prompts dataset add <prompt-id> -d '<constructed-json>' --name '<name>' --json
|
|
10924
11928
|
\`\`\`
|
|
10925
11929
|
|
|
11930
|
+
For dataset-only work (no optimization needed yet), see [workflows/dataset-curation.md](./dataset-curation.md)
|
|
11931
|
+
and [concepts/dataset-design.md](../concepts/dataset-design.md) for the full curation principles.
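A hedged illustration of a constructed dataset payload; the real schema comes from `mutagent prompts dataset add --help` (Rule 2), and the field names below (`items`, `input`, `category`) are assumptions for illustration only:

```typescript
// Illustrative dataset payload honoring "at least 2 hard/edge cases per
// category" above. Field names are assumed, not the documented format.
const dataset = {
  items: [
    { input: { ticket: "Refund request for order #123" }, category: "typical" },
    { input: { ticket: "" }, category: "edge" },                // empty body
    { input: { ticket: "¿reembolso? + tracking + cancel" }, category: "edge" },
    { input: { ticket: "3 unrelated requests in one message" }, category: "hard" },
    { input: { ticket: "hostile tone, no order number" }, category: "hard" },
  ],
};
// Pass inline via -d, or via @path / stdin if dataset add mirrors the
// -d conventions documented for evaluation create above.
console.log(JSON.stringify(dataset));
```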
|
|
11932
|
+
|
|
11933
|
+
---
|
|
11934
|
+
|
|
11935
|
+
## CLI commands
|
|
11936
|
+
|
|
11937
|
+
Run these before the first use of each command (Rule 2: \`--help\` before first use):
|
|
11938
|
+
|
|
11939
|
+
\`\`\`bash
|
|
11940
|
+
mutagent explore --help # codebase scan flags
|
|
11941
|
+
mutagent prompts create --help # prompt upload flags + brace convention
|
|
11942
|
+
mutagent prompts dataset add --help # dataset add flags + --guided semantics
|
|
11943
|
+
mutagent prompts evaluation create --help # eval create flags + --guided semantics
|
|
11944
|
+
mutagent prompts optimize start --help # optimize start flags + cost-relevant flags
|
|
11945
|
+
mutagent prompts optimize status --help # status polling flags
|
|
11946
|
+
mutagent prompts optimize results --help # results flags + --apply / --diff
|
|
11947
|
+
mutagent usage --help # quota query flags
|
|
11948
|
+
mutagent providers list --help # provider catalog query flags
|
|
11949
|
+
\`\`\`
|
|
11950
|
+
|
|
11951
|
+
Workflow execution sequence (annotated with cost markers):
|
|
11952
|
+
|
|
11953
|
+
\`\`\`bash
|
|
11954
|
+
# Discovery + setup (no LLM cost)
|
|
11955
|
+
mutagent explore --json # step 1: discover prompts
|
|
11956
|
+
mutagent prompts create --name "<name>" --raw-file <path> --json # step 4: upload prompt
|
|
11957
|
+
mutagent prompts dataset add <id> --guided --json # step 5-6: guided dataset (returns _directive.askUserQuestions)
|
|
11958
|
+
mutagent prompts dataset add <id> -d '<json>' --name "<name>" --json # step 6: upload dataset items
|
|
11959
|
+
mutagent prompts evaluation create <id> --guided --json # step 8: guided eval (returns _directive.askUserQuestions + decisionTree)
|
|
11960
|
+
mutagent prompts evaluation create <id> -d '<json>' --name "<name>" --json # step 9: upload criteria
|
|
11961
|
+
|
|
11962
|
+
# Pre-flight checks (no LLM cost)
|
|
11963
|
+
mutagent usage --json # step 10: surface quota to user (Rule 5)
|
|
11964
|
+
mutagent providers list --models --json # verify exec/eval models are available (Rule 6)
|
|
11965
|
+
|
|
11966
|
+
# \uD83D\uDCB0 LLM COST starts here -- requires explicit user confirmation per Rule 5
|
|
11967
|
+
mutagent prompts optimize start <id> --dataset <d> --evaluation <e> --max-iterations 1 --json
|
|
11968
|
+
# step 11: start job (cost = exec_model × items × iterations
|
|
11969
|
+
# + judge_model × items × iterations)
|
|
11970
|
+
# --max-iterations defaults to 1; never raise without user consent
|
|
11971
|
+
|
|
11972
|
+
# Polling + results (no LLM cost; just reads job state + emits verbatim card)
|
|
11973
|
+
mutagent prompts optimize status <job-id> --json # step 12: poll progress (verbatim card)
|
|
11974
|
+
mutagent prompts optimize results <job-id> --json # step 13: view scorecard (verbatim card)
|
|
11975
|
+
mutagent prompts optimize results <job-id> --diff --json # step 14a: view prompt diff (no apply)
|
|
11976
|
+
mutagent prompts optimize results <job-id> --apply --json # step 14b: apply optimized prompt to stored version
|
|
11977
|
+
\`\`\`
|
|
11978
|
+
|
|
11979
|
+
**Cost note**: \`optimize start\` is the ONLY cost-incurring command in this workflow. All other commands are pure storage/discovery operations. The \`--max-iterations\` flag bounds total cost (default = 1; never raise silently).
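Since the cost comments above give a concrete formula (exec_model × items × iterations + judge_model × items × iterations), a minimal estimator sketch makes the bound explicit; the per-call dollar figures are placeholder assumptions, not real model prices:

```typescript
// Cost bound for optimize start, per the formula in the comments above.
// Per-call costs are placeholders; substitute real provider pricing.
function estimateOptimizeCost(
  items: number,
  iterations: number,       // --max-iterations (default 1; never raise silently)
  execCostPerCall = 0.002,  // assumption: avg exec-model cost per item
  judgeCostPerCall = 0.001  // assumption: avg judge-model cost per item
): number {
  return items * iterations * (execCostPerCall + judgeCostPerCall);
}

// 20 items x 1 iteration => 20 * (0.002 + 0.001) = $0.06 under these assumptions
console.log(estimateOptimizeCost(20, 1).toFixed(2));
```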
|
|
11980
|
+
|
|
11981
|
+
**Verbatim card protocol**: \`optimize start\`, \`optimize status\`, and \`optimize results\` emit \`_directive.renderedCard\` -- echo verbatim per [SKILL.md § MANDATORY: Verbatim Card Display Protocol](../SKILL.md).
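A sketch of how an agent-side consumer might honor that protocol, using only the directive fields listed in this CLI's help text; the parsing scaffold itself is illustrative:

```typescript
// If a --json response carries _directive.renderedCard, echo it unmodified.
function handleCliJson(stdout: string): void {
  const res = JSON.parse(stdout) as {
    _directive?: { renderedCard?: string; instruction?: string };
  };
  if (res._directive?.renderedCard) {
    console.log(res._directive.renderedCard); // verbatim -- no reformatting
  } else if (res._directive?.instruction) {
    console.log(res._directive.instruction);  // fall back to the next-step hint
  }
}
```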
|
|
11982
|
+
|
|
11983
|
+
For per-topic standalone HOW workflows, see:
|
|
11984
|
+
- [workflows/dataset-curation.md](./dataset-curation.md) -- standalone dataset curation
|
|
11985
|
+
- [workflows/eval-creation.md](./eval-creation.md) -- standalone evaluation rubric creation
|
|
11986
|
+
|
|
10926
11987
|
---
|
|
10927
11988
|
|
|
10928
11989
|
## Cross-references
|
|
@@ -10930,6 +11991,8 @@ mutagent prompts dataset add <prompt-id> -d '<constructed-json>' --name '<name>'
|
|
|
10930
11991
|
- [SKILL.md](../SKILL.md) → 5 rules + journey router
|
|
10931
11992
|
- [concepts/prompt-variables.md](../concepts/prompt-variables.md) → brace convention + conversion (critical for steps 3 and 15)
|
|
10932
11993
|
- [concepts/eval-criteria.md](../concepts/eval-criteria.md) → INPUT MVC + OUTPUT Standards + granular rubric (critical for steps 7-8)
|
|
11994
|
+
- [concepts/dataset-design.md](../concepts/dataset-design.md) → dataset curation principles (Golden Rule, case categories, anti-patterns)
|
|
11995
|
+
- [workflows/dataset-curation.md](./dataset-curation.md) → standalone dataset curation (when no optimization needed yet)
|
|
10933
11996
|
- [workflows/exploration.md](./exploration.md) → step 1 of this workflow
|
|
10934
11997
|
- [workflows/tracing.md](./tracing.md) → parallel or follow-up path
|
|
10935
11998
|
`,
|
|
@@ -11044,6 +12107,36 @@ After step 6:
|
|
|
11044
12107
|
|
|
11045
12108
|
---
|
|
11046
12109
|
|
|
12110
|
+
## CLI commands
|
|
12111
|
+
|
|
12112
|
+
\`\`\`bash
|
|
12113
|
+
# Discovery (no LLM cost, read-only)
|
|
12114
|
+
mutagent explore --help # read flags before first use (Rule 2)
|
|
12115
|
+
mutagent explore --json # step 1: detect framework + prompts taxonomy
|
|
12116
|
+
mutagent integrate --help # list supported frameworks + per-framework flags
|
|
12117
|
+
|
|
12118
|
+
# Code generation (no LLM cost; emits integration snippet to stdout)
|
|
12119
|
+
mutagent integrate <framework> --json # step 3: get integration snippet for the detected framework
|
|
12120
|
+
mutagent integrate <framework> --output <path> --json # write snippet directly to file (instead of stdout)
|
|
12121
|
+
|
|
12122
|
+
# Verification (no LLM cost, read-only)
|
|
12123
|
+
mutagent traces list --json # step 6: verify traces arriving (recent N traces)
|
|
12124
|
+
mutagent traces list --prompt-id <id> --json # filter by prompt
|
|
12125
|
+
mutagent traces list --since <ISO-timestamp> --json # filter by time window (e.g., since first integration)
|
|
12126
|
+
mutagent traces get <trace-id> --json # inspect single trace's spans + metadata
|
|
12127
|
+
\`\`\`
|
|
12128
|
+
|
|
12129
|
+
**Flag glossary** (tracing-specific):
|
|
12130
|
+
- \`<framework>\` -- supported frameworks: \`langchain\`, \`langgraph\`, \`llamaindex\`, \`openai-agents\`, \`crewai\`, \`autogen\`, \`vercel-ai\`. Run \`mutagent integrate --help\` for the canonical current list.
|
|
12131
|
+
- \`--output <path>\` -- write the integration snippet directly to a file. Without this flag, the snippet goes to stdout (typical for agent-mediated workflows, so the agent can re-emit it verbatim to the user).
|
|
12132
|
+
- \`--prompt-id <id>\` -- filter trace list to one prompt's traces.
|
|
12133
|
+
- \`--since <ts>\` -- filter by timestamp (ISO-8601). Useful right after first integration to confirm traces are landing.
|
|
12134
|
+
- \`--json\` -- structured output (Rule 1: always use).
|
|
12135
|
+
|
|
12136
|
+
**Cost note**: tracing is fully free at the CLI/platform layer -- the platform stores spans for analytics. The only "cost" is the marginal latency that in-process span emission adds to each LLM call inside the user's app (typically <5ms per call). No optimizer cost is incurred.
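A sketch of automating the step-6 check, shelling out to the CLI with only the flags documented above (`--since`, `--json`); the `traces` field name in the parsed response is an assumption:

```typescript
// Poll `mutagent traces list --since <ts> --json` and report whether any
// traces have landed since integration. Response field name is assumed.
import { execFileSync } from "node:child_process";

function tracesArrivedSince(isoTs: string): boolean {
  const out = execFileSync(
    "mutagent",
    ["traces", "list", "--since", isoTs, "--json"],
    { encoding: "utf-8" }
  );
  const parsed = JSON.parse(out) as { traces?: unknown[] }; // assumed field
  return (parsed.traces?.length ?? 0) > 0;
}
```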
|
|
12137
|
+
|
|
12138
|
+
---
|
|
12139
|
+
|
|
11047
12140
|
## Cross-references
|
|
11048
12141
|
|
|
11049
12142
|
- [SKILL.md](../SKILL.md) → 5 rules + journey router
|
|
@@ -11096,7 +12189,7 @@ that teaches coding agents how to use the MutagenT CLI effectively.
|
|
|
11096
12189
|
const isJson = parentCmd ? getJsonFlag(parentCmd) : false;
|
|
11097
12190
|
const output = new OutputFormatter(isJson ? "json" : "table");
|
|
11098
12191
|
const repoRoot = findRepoRoot();
|
|
11099
|
-
const skillDir =
|
|
12192
|
+
const skillDir = join8(repoRoot, SKILL_DIR);
|
|
11100
12193
|
const files = getSkillFiles();
|
|
11101
12194
|
const writtenFiles = [];
|
|
11102
12195
|
let totalBytes = 0;
|
|
@@ -11108,9 +12201,9 @@ that teaches coding agents how to use the MutagenT CLI effectively.
|
|
|
11108
12201
|
return a.localeCompare(b);
|
|
11109
12202
|
});
|
|
11110
12203
|
for (const relPath of sortedKeys) {
|
|
11111
|
-
const destPath =
|
|
12204
|
+
const destPath = join8(skillDir, relPath);
|
|
11112
12205
|
const parentDir = dirname(destPath);
|
|
11113
|
-
if (!
|
|
12206
|
+
if (!existsSync13(parentDir)) {
|
|
11114
12207
|
mkdirSync4(parentDir, { recursive: true });
|
|
11115
12208
|
}
|
|
11116
12209
|
const raw = files[relPath] ?? "";
|
|
@@ -11244,18 +12337,18 @@ import { Command as Command19 } from "commander";
|
|
|
11244
12337
|
import { randomUUID } from "crypto";
|
|
11245
12338
|
|
|
11246
12339
|
// src/commands/hooks/state.ts
|
|
11247
|
-
import { readFileSync as
|
|
11248
|
-
import { join as
|
|
12340
|
+
import { readFileSync as readFileSync11, writeFileSync as writeFileSync6, renameSync, unlinkSync, existsSync as existsSync14 } from "fs";
|
|
12341
|
+
import { join as join9 } from "path";
|
|
11249
12342
|
import { tmpdir } from "os";
|
|
11250
12343
|
function stateFilePath(sessionId) {
|
|
11251
|
-
return
|
|
12344
|
+
return join9(tmpdir(), `mutagent-hook-${sessionId}.json`);
|
|
11252
12345
|
}
|
|
11253
12346
|
function readState(sessionId) {
|
|
11254
12347
|
const path = stateFilePath(sessionId);
|
|
11255
|
-
if (!
|
|
12348
|
+
if (!existsSync14(path))
|
|
11256
12349
|
return null;
|
|
11257
12350
|
try {
|
|
11258
|
-
const raw = JSON.parse(
|
|
12351
|
+
const raw = JSON.parse(readFileSync11(path, "utf-8"));
|
|
11259
12352
|
if (!Array.isArray(raw.parentStack)) {
|
|
11260
12353
|
raw.parentStack = [];
|
|
11261
12354
|
}
|
|
@@ -11275,7 +12368,7 @@ function writeState(sessionId, state) {
|
|
|
11275
12368
|
}
|
|
11276
12369
|
function deleteState(sessionId) {
|
|
11277
12370
|
const path = stateFilePath(sessionId);
|
|
11278
|
-
if (
|
|
12371
|
+
if (existsSync14(path)) {
|
|
11279
12372
|
try {
|
|
11280
12373
|
unlinkSync(path);
|
|
11281
12374
|
} catch {}
|
|
@@ -11922,8 +13015,23 @@ async function handlePostToolUseFailure() {
|
|
|
11922
13015
|
}
|
|
11923
13016
|
|
|
11924
13017
|
// src/commands/hooks/install.ts
|
|
11925
|
-
import { readFileSync as
|
|
11926
|
-
import { join as
|
|
13018
|
+
import { readFileSync as readFileSync12, writeFileSync as writeFileSync7, existsSync as existsSync15, mkdirSync as mkdirSync5 } from "fs";
|
|
13019
|
+
import { join as join10 } from "path";
|
|
13020
|
+
|
|
13021
|
+
class SettingsParseError extends Error {
|
|
13022
|
+
settingsPath;
|
|
13023
|
+
backupPath;
|
|
13024
|
+
constructor(settingsPath, backupPath, cause) {
|
|
13025
|
+
super(`settings.local.json contains unparseable JSON.
|
|
13026
|
+
` + ` Original file backed up to: ${backupPath}
|
|
13027
|
+
` + ` To recover: restore from backup, fix JSON syntax, then re-run ` + `\`mutagent hooks install\``);
|
|
13028
|
+
this.settingsPath = settingsPath;
|
|
13029
|
+
this.backupPath = backupPath;
|
|
13030
|
+
this.name = "SettingsParseError";
|
|
13031
|
+
if (cause instanceof Error)
|
|
13032
|
+
this.cause = cause;
|
|
13033
|
+
}
|
|
13034
|
+
}
|
|
11927
13035
|
var V1_MIGRATIONS = {
|
|
11928
13036
|
Stop: ["mutagent hooks claude-code session-end"]
|
|
11929
13037
|
};
|
|
@@ -11970,15 +13078,18 @@ function migrateV1Hooks(settings) {
|
|
|
11970
13078
|
return migrated;
|
|
11971
13079
|
}
|
|
11972
13080
|
function installHooks(cwd) {
|
|
11973
|
-
const claudeDir =
|
|
11974
|
-
const settingsPath =
|
|
11975
|
-
const existed =
|
|
13081
|
+
const claudeDir = join10(cwd, ".claude");
|
|
13082
|
+
const settingsPath = join10(claudeDir, "settings.local.json");
|
|
13083
|
+
const existed = existsSync15(settingsPath);
|
|
11976
13084
|
let settings = {};
|
|
11977
13085
|
if (existed) {
|
|
13086
|
+
const raw = readFileSync12(settingsPath, "utf-8");
|
|
11978
13087
|
try {
|
|
11979
|
-
settings = JSON.parse(
|
|
11980
|
-
} catch {
|
|
11981
|
-
|
|
13088
|
+
settings = JSON.parse(raw);
|
|
13089
|
+
} catch (err) {
|
|
13090
|
+
const backupPath = `${settingsPath}.bak.${new Date().toISOString().replace(/:/g, "-")}`;
|
|
13091
|
+
writeFileSync7(backupPath, raw, "utf-8");
|
|
13092
|
+
throw new SettingsParseError(settingsPath, backupPath, err);
|
|
11982
13093
|
}
|
|
11983
13094
|
}
|
|
11984
13095
|
const added = [];
|
|
@@ -12005,14 +13116,25 @@ function installHooks(cwd) {
|
|
|
12005
13116
|
}
|
|
12006
13117
|
}
|
|
12007
13118
|
}
|
|
13119
|
+
let userWarning;
|
|
12008
13120
|
if (added.length > 0 || migrated.length > 0) {
|
|
12009
|
-
if (!
|
|
13121
|
+
if (!existsSync15(claudeDir)) {
|
|
12010
13122
|
mkdirSync5(claudeDir, { recursive: true });
|
|
12011
13123
|
}
|
|
12012
13124
|
writeFileSync7(settingsPath, JSON.stringify(settings, null, 2) + `
|
|
12013
13125
|
`, "utf-8");
|
|
12014
13126
|
}
|
|
12015
|
-
|
|
13127
|
+
if (added.length > 0) {
|
|
13128
|
+
const addedList = added.map((cmd) => {
|
|
13129
|
+
const parts = cmd.split(" ");
|
|
13130
|
+
return parts[parts.length - 1] ?? cmd;
|
|
13131
|
+
}).join(", ");
|
|
13132
|
+
userWarning = `MutagenT hooks installed into .claude/settings.local.json
|
|
13133
|
+
` + ` Added: ${addedList}
|
|
13134
|
+
` + ` This file was modified. Review with: git diff .claude/settings.local.json
|
|
13135
|
+
` + ` (To remove hooks, edit .claude/settings.local.json and delete the mutagent entries)`;
|
|
13136
|
+
}
|
|
13137
|
+
return { settingsPath, existed, added, alreadyPresent, migrated, userWarning };
|
|
12016
13138
|
}
|
|
12017
13139
|
|
|
12018
13140
|
// src/commands/hooks/index.ts
|
|
@@ -12047,7 +13169,28 @@ hook already present (checked by command string) so running this multiple
|
|
|
12047
13169
|
times is safe.
|
|
12048
13170
|
`).action((opts) => {
|
|
12049
13171
|
const targetDir = opts.cwd ?? process.cwd();
|
|
12050
|
-
const
|
|
13172
|
+
const isJson = Boolean(opts.json);
|
|
13173
|
+
let result;
|
|
13174
|
+
try {
|
|
13175
|
+
result = installHooks(targetDir);
|
|
13176
|
+
} catch (err) {
|
|
13177
|
+
if (err instanceof SettingsParseError) {
|
|
13178
|
+
if (isJson) {
|
|
13179
|
+
process.stdout.write(JSON.stringify({
|
|
13180
|
+
success: false,
|
|
13181
|
+
error: err.message,
|
|
13182
|
+
backupPath: err.backupPath,
|
|
13183
|
+
settingsPath: err.settingsPath
|
|
13184
|
+
}) + `
|
|
13185
|
+
`);
|
|
13186
|
+
} else {
|
|
13187
|
+
process.stderr.write(`[mutagent hooks install] ERROR: ${err.message}
|
|
13188
|
+
`);
|
|
13189
|
+
}
|
|
13190
|
+
process.exit(1);
|
|
13191
|
+
}
|
|
13192
|
+
throw err;
|
|
13193
|
+
}
|
|
12051
13194
|
for (const { event, command } of result.migrated) {
|
|
12052
13195
|
process.stderr.write(`[mutagent hooks install] ⚠️ Migrated v1 hook: removed '${event} → ${command}' (v2 wires this as 'SessionEnd')
|
|
12053
13196
|
`);
|
|
@@ -12063,14 +13206,24 @@ times is safe.
|
|
|
12063
13206
|
`);
|
|
12064
13207
|
return;
|
|
12065
13208
|
}
|
|
12066
|
-
|
|
13209
|
+
if (result.userWarning) {
|
|
13210
|
+
if (!isJson) {
|
|
13211
|
+
process.stderr.write(`⚠ ${result.userWarning}
|
|
13212
|
+
`);
|
|
13213
|
+
}
|
|
13214
|
+
}
|
|
13215
|
+
const jsonResponse = {
|
|
12067
13216
|
success: true,
|
|
12068
13217
|
settingsPath: result.settingsPath,
|
|
12069
13218
|
existed: result.existed,
|
|
12070
13219
|
added: result.added,
|
|
12071
13220
|
alreadyPresent: result.alreadyPresent,
|
|
12072
13221
|
message: result.added.length > 0 ? `Installed ${String(result.added.length)} hook(s). ${String(result.alreadyPresent.length)} already present.` : `All hooks already present (${String(result.alreadyPresent.length)}).`
|
|
12073
|
-
}
|
|
13222
|
+
};
|
|
13223
|
+
if (result.userWarning && isJson) {
|
|
13224
|
+
jsonResponse.warnings = [result.userWarning];
|
|
13225
|
+
}
|
|
13226
|
+
process.stdout.write(JSON.stringify(jsonResponse) + `
|
|
12074
13227
|
`);
|
|
12075
13228
|
});
|
|
12076
13229
|
const claudeCode = hooks.command("claude-code").description("Claude Code session telemetry");
|
|
@@ -12115,8 +13268,8 @@ import { Command as Command20 } from "commander";
|
|
|
12115
13268
|
import chalk38 from "chalk";
|
|
12116
13269
|
init_errors();
|
|
12117
13270
|
init_config();
|
|
12118
|
-
import { readFileSync as
|
|
12119
|
-
import { join as
|
|
13271
|
+
import { readFileSync as readFileSync13, existsSync as existsSync16 } from "fs";
|
|
13272
|
+
import { join as join11, dirname as dirname2 } from "path";
|
|
12120
13273
|
import { fileURLToPath } from "url";
|
|
12121
13274
|
var VALID_CATEGORIES = ["bug", "feature", "improvement", "praise"];
|
|
12122
13275
|
function getCliVersion() {
|
|
@@ -12125,8 +13278,8 @@ function getCliVersion() {
|
|
|
12125
13278
|
}
|
|
12126
13279
|
try {
|
|
12127
13280
|
const __dirname2 = dirname2(fileURLToPath(import.meta.url));
|
|
12128
|
-
const pkgPath =
|
|
12129
|
-
const pkg = JSON.parse(
|
|
13281
|
+
const pkgPath = join11(__dirname2, "..", "..", "package.json");
|
|
13282
|
+
const pkg = JSON.parse(readFileSync13(pkgPath, "utf-8"));
|
|
12130
13283
|
return pkg.version ?? "0.1.1";
|
|
12131
13284
|
} catch {
|
|
12132
13285
|
return "0.1.1";
|
|
@@ -12157,12 +13310,12 @@ async function resolveContextSource(source, _readStdinOverride) {
|
|
|
12157
13310
|
raw = await (_readStdinOverride ?? readStdin2)();
|
|
12158
13311
|
} else if (source.startsWith("@")) {
|
|
12159
13312
|
const filePath = source.slice(1);
|
|
12160
|
-
if (!
|
|
13313
|
+
if (!existsSync16(filePath)) {
|
|
12161
13314
|
throw new MutagentError("INVALID_ARGUMENTS", `Context file not found: ${filePath}`, `Verify the path exists: ls -la "${filePath}"
|
|
12162
13315
|
Or use inline JSON: --context '{"key":"value"}'`);
|
|
12163
13316
|
}
|
|
12164
13317
|
try {
|
|
12165
|
-
raw =
|
|
13318
|
+
raw = readFileSync13(filePath, "utf-8").trim();
|
|
12166
13319
|
} catch (err) {
|
|
12167
13320
|
const msg = err instanceof Error ? err.message : String(err);
|
|
12168
13321
|
throw new MutagentError("INVALID_ARGUMENTS", `Cannot read context file "${filePath}": ${msg}`, `Check file permissions: ls -la "${filePath}"`);
|
|
@@ -12314,11 +13467,12 @@ if (process.env.CLI_VERSION) {
|
|
|
12314
13467
|
} else {
|
|
12315
13468
|
try {
|
|
12316
13469
|
const __dirname2 = dirname3(fileURLToPath2(import.meta.url));
|
|
12317
|
-
const pkgPath =
|
|
12318
|
-
const pkg = JSON.parse(
|
|
13470
|
+
const pkgPath = join12(__dirname2, "..", "..", "package.json");
|
|
13471
|
+
const pkg = JSON.parse(readFileSync14(pkgPath, "utf-8"));
|
|
12319
13472
|
cliVersion = pkg.version ?? cliVersion;
|
|
12320
13473
|
} catch {}
|
|
12321
13474
|
}
|
|
13475
|
+
setCliVersion(cliVersion);
|
|
12322
13476
|
var program = new Command21;
|
|
12323
13477
|
program.name("mutagent").description(`MutagenT CLI - AI-native prompt optimization platform
|
|
12324
13478
|
|
|
@@ -12328,6 +13482,15 @@ program.name("mutagent").description(`MutagenT CLI - AI-native prompt optimizati
|
|
|
12328
13482
|
showGlobalOptions: true
|
|
12329
13483
|
});
|
|
12330
13484
|
program.addHelpText("after", `
|
|
13485
|
+
${chalk39.bold.cyan("WORKFLOWS:")}
|
|
13486
|
+
${chalk39.bold("Setup")} mutagent login → mutagent init → mutagent skills install
|
|
13487
|
+
${chalk39.bold("Framework Integration")} mutagent explore → mutagent integrate <fw> → mutagent traces
|
|
13488
|
+
${chalk39.bold("Prompt Optimization")} mutagent prompts create → prompts dataset → prompts evaluation → prompts optimize
|
|
13489
|
+
${chalk39.bold("Agent Optimization")} ${chalk39.dim("[coming soon — see: mutagent agents --help]")}
|
|
13490
|
+
|
|
13491
|
+
${chalk39.dim("For rubric design, prompt-vs-agent classification, scorecard interpretation,")}
|
|
13492
|
+
${chalk39.dim("see the Skill at")} ${chalk39.cyan(".claude/skills/mutagent-cli/SKILL.md")} ${chalk39.dim("(install via")} ${chalk39.cyan("mutagent skills install")}${chalk39.dim(")")}
|
|
13493
|
+
|
|
12331
13494
|
${AGENTS_WIP_BANNER}
|
|
12332
13495
|
|
|
12333
13496
|
${chalk39.yellow("Non-Interactive Mode (CI/CD & Coding Agents):")}
|
|
@@ -12396,10 +13559,14 @@ ${chalk39.yellow("Post-Onboarding Decision Tree:")}
|
|
|
12396
13559
|
|
|
12397
13560
|
${chalk39.yellow("Directive System:")}
|
|
12398
13561
|
Every --json response may include:
|
|
12399
|
-
${chalk39.bold("_directive.
|
|
12400
|
-
${chalk39.bold("_directive.
|
|
12401
|
-
${chalk39.bold("_directive.
|
|
12402
|
-
${chalk39.bold("
|
|
13562
|
+
${chalk39.bold("_directive.display")} Type tag — 'status_card' for card-kind directives (drives test/docs guards)
|
|
13563
|
+
${chalk39.bold("_directive.renderedCard")} Pre-formatted card ${chalk39.red("(MUST echo verbatim in chat whenever this field exists — see SKILL.md Verbatim Card Display Protocol)")}
|
|
13564
|
+
${chalk39.bold("_directive.instruction")} Next step for the agent (self-sufficient, no Skill required)
|
|
13565
|
+
${chalk39.bold("_directive.next")} Array of suggested follow-up commands
|
|
13566
|
+
${chalk39.bold("_directive.decisionTree")} Multi-step branching logic for guided flows
|
|
13567
|
+
${chalk39.bold("_directive.askUserQuestions")} Questions to collect from the user (inside _directive)
|
|
13568
|
+
${chalk39.bold("_links")} Dashboard/API URLs (format as markdown links)
|
|
13569
|
+
${chalk39.bold("_compat")} Compat metadata: cliVersion, skillVersion, skillMinCliVersion
|
|
12403
13570
|
|
|
12404
13571
|
${chalk39.yellow("Evaluation Criteria Format:")}
|
|
12405
13572
|
Each criterion MUST have: ${chalk39.bold("name")}, ${chalk39.bold("description")} (scoring rubric), ${chalk39.bold("evaluationParameter")}
|
|
@@ -12455,7 +13622,10 @@ ${!hasCredentials() ? `
|
|
|
12455
13622
|
var rawArgs = process.argv.slice(2);
|
|
12456
13623
|
if (rawArgs.includes("-v") || rawArgs.includes("--version")) {
|
|
12457
13624
|
if (rawArgs.includes("--json")) {
|
|
12458
|
-
console.log(JSON.stringify({
|
|
13625
|
+
console.log(JSON.stringify({
|
|
13626
|
+
version: cliVersion,
|
|
13627
|
+
_compat: getCompatMetadata(cliVersion)
|
|
13628
|
+
}));
|
|
12459
13629
|
} else {
|
|
12460
13630
|
console.log(cliVersion);
|
|
12461
13631
|
}
|
|
@@ -12492,12 +13662,12 @@ program.addCommand(createFeedbackCommand());
|
|
|
12492
13662
|
var isInteractive = process.stdin.isTTY && !rawArgs.includes("--json") && process.env.CI !== "true";
|
|
12493
13663
|
var isSkillCommand = rawArgs[0] === "skills" || rawArgs[0] === "hooks";
|
|
12494
13664
|
if (isInteractive && !isSkillCommand) {
|
|
12495
|
-
const skillPath =
|
|
12496
|
-
if (!
|
|
13665
|
+
const skillPath = join12(process.cwd(), ".claude/skills/mutagent-cli/SKILL.md");
|
|
13666
|
+
if (!existsSync17(skillPath)) {
|
|
12497
13667
|
console.log(chalk39.dim("MutagenT SKILL not installed. Install it for AI agent support? Run:"), chalk39.cyan("mutagent skills install"));
|
|
12498
13668
|
}
|
|
12499
13669
|
}
|
|
12500
13670
|
program.parse();
|
|
12501
13671
|
|
|
12502
|
-
//# debugId=
|
|
13672
|
+
//# debugId=B6E7E54395DC9FFD64756E2164756E21
|
|
12503
13673
|
//# sourceMappingURL=cli.js.map
|