@mutagent/cli 0.1.177 → 0.1.179

This diff shows the contents of publicly released versions of this package as they appear in their public registry. It is provided for informational purposes only.
package/dist/bin/cli.js CHANGED
@@ -1271,8 +1271,8 @@ var init_sdk_client = __esm(() => {
  // src/bin/cli.ts
  import { Command as Command21 } from "commander";
  import chalk39 from "chalk";
- import { readFileSync as readFileSync13, existsSync as existsSync16 } from "fs";
- import { join as join11, dirname as dirname3 } from "path";
+ import { readFileSync as readFileSync14, existsSync as existsSync17 } from "fs";
+ import { join as join12, dirname as dirname3 } from "path";
  import { fileURLToPath as fileURLToPath2 } from "url";
 
  // src/commands/auth.ts
@@ -1280,11 +1280,45 @@ init_config();
  init_sdk_client();
  import { Command } from "commander";
  import chalk4 from "chalk";
- import { existsSync as existsSync3 } from "fs";
- import { join as join4 } from "path";
+ import { existsSync as existsSync4 } from "fs";
+ import { join as join5 } from "path";
 
  // src/lib/output.ts
  import chalk from "chalk";
+ import { existsSync as existsSync2, readFileSync as readFileSync2 } from "fs";
+ import { join as join2 } from "path";
+ var _compatCache = null;
+ var _globalCliVersion = "0.0.0";
+ function setCliVersion(version) {
+ _globalCliVersion = version;
+ _compatCache = null;
+ }
+ function getCompatMetadata(cliVersion = _globalCliVersion) {
+ if (_compatCache !== null)
+ return _compatCache;
+ let skillVersion = "unknown";
+ let skillMinCliVersion = "unknown";
+ try {
+ const candidatePaths = [
+ join2(process.cwd(), ".claude", "skills", "mutagent-cli", "SKILL.md"),
+ join2(process.cwd(), "mutagent-cli", ".claude", "skills", "mutagent-cli", "SKILL.md")
+ ];
+ for (const p of candidatePaths) {
+ if (existsSync2(p)) {
+ const content = readFileSync2(p, "utf-8");
+ const verMatch = /^SKILL_VERSION:\s*(.+)$/m.exec(content);
+ const minMatch = /^SKILL_MIN_CLI_VERSION:\s*(.+)$/m.exec(content);
+ if (verMatch?.[1])
+ skillVersion = verMatch[1].trim();
+ if (minMatch?.[1])
+ skillMinCliVersion = minMatch[1].trim();
+ break;
+ }
+ }
+ } catch {}
+ _compatCache = { cliVersion, skillVersion, skillMinCliVersion };
+ return _compatCache;
+ }
  function getJsonFlag(command) {
  if (!command)
  return false;
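The new `getCompatMetadata` helper lazily parses `SKILL_VERSION` / `SKILL_MIN_CLI_VERSION` out of an installed `SKILL.md` and memoizes the result until `setCliVersion` resets the cache. A minimal usage sketch; the frontmatter values shown match the generated skill content added later in this diff, but the call sequence is illustrative, not CLI output:

```js
// Sketch of the new compat-cache behavior (illustrative, assuming a SKILL.md whose
// frontmatter contains the two lines the regexes above match):
//   SKILL_VERSION: 1.1.0
//   SKILL_MIN_CLI_VERSION: 0.1.163
setCliVersion("0.1.179");            // records the CLI version and clears _compatCache
const compat = getCompatMetadata();  // first call scans the candidate SKILL.md paths
// -> { cliVersion: "0.1.179", skillVersion: "1.1.0", skillMinCliVersion: "0.1.163" }
getCompatMetadata();                 // later calls return the memoized _compatCache
// If no SKILL.md exists, both skill fields degrade to "unknown" instead of throwing.
```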
@@ -1300,7 +1334,14 @@ class OutputFormatter {
  }
  output(data) {
  if (this.format === "json") {
- console.log(JSON.stringify(data, null, 2));
+ let enriched = data;
+ if (_globalCliVersion !== "0.0.0" && data !== null && typeof data === "object" && !Array.isArray(data) && "_directive" in data) {
+ enriched = {
+ ...data,
+ _compat: getCompatMetadata(_globalCliVersion)
+ };
+ }
+ console.log(JSON.stringify(enriched, null, 2));
  } else {
  this.formatTable(data);
  }
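The `output()` change stamps a `_compat` block onto every JSON payload carrying a `_directive` key, and skips enrichment when `setCliVersion` was never called (`_globalCliVersion` still `"0.0.0"`). A sketch of the enriched shape; field values are illustrative:

```js
// Hypothetical payload as printed by output() in --json mode after this change:
const enriched = {
  success: true,
  _directive: { instruction: "…", next: ["mutagent workspaces list --json"] },
  _compat: {
    cliVersion: "0.1.179",         // from setCliVersion() at CLI startup
    skillVersion: "1.1.0",         // parsed from installed SKILL.md frontmatter
    skillMinCliVersion: "0.1.163"  // "unknown" when no SKILL.md is found
  }
};
// Arrays and payloads without _directive pass through unchanged.
```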
@@ -1431,8 +1472,8 @@ function createSpinner(text, isJson) {
  init_errors();
 
  // src/lib/mutation-context.ts
- import { existsSync as existsSync2, readFileSync as readFileSync2, writeFileSync as writeFileSync2, mkdirSync as mkdirSync2 } from "fs";
- import { join as join2, resolve } from "path";
+ import { existsSync as existsSync3, readFileSync as readFileSync3, writeFileSync as writeFileSync2, mkdirSync as mkdirSync2 } from "fs";
+ import { join as join3, resolve } from "path";
  function parseTableRows(lines) {
  const rows = [];
  for (const line of lines) {
@@ -1473,18 +1514,18 @@ class MutationContext {
  }
  static load(projectRoot) {
  const root = projectRoot ?? process.cwd();
- const dirPath = join2(resolve(root), CONTEXT_DIR);
- const filePath = join2(dirPath, CONTEXT_FILE);
+ const dirPath = join3(resolve(root), CONTEXT_DIR);
+ const filePath = join3(dirPath, CONTEXT_FILE);
  const ctx = new MutationContext(filePath);
- if (existsSync2(filePath)) {
- const content = readFileSync2(filePath, "utf-8");
+ if (existsSync3(filePath)) {
+ const content = readFileSync3(filePath, "utf-8");
  ctx.parse(content);
  }
  return ctx;
  }
  save() {
- const dirPath = join2(this.filePath, "..");
- if (!existsSync2(dirPath)) {
+ const dirPath = join3(this.filePath, "..");
+ if (!existsSync3(dirPath)) {
  mkdirSync2(dirPath, { recursive: true });
  }
  writeFileSync2(this.filePath, this.render(), "utf-8");
@@ -1710,8 +1751,8 @@ import chalk2 from "chalk";
  import { resolve as resolve2 } from "path";
 
  // src/lib/explorer.ts
- import { readdirSync, readFileSync as readFileSync3, statSync } from "fs";
- import { join as join3, relative, extname, basename } from "path";
+ import { readdirSync, readFileSync as readFileSync4, statSync } from "fs";
+ import { join as join4, relative, extname, basename } from "path";
  var DOUBLE_VAR_PATTERN = /\{\{\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*\}\}/g;
  var SINGLE_VAR_PATTERN = /(?<![\\{])\{\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*\}(?!\})/g;
  var FENCED_CODE_PATTERN = /```[\s\S]*?```/g;
@@ -1805,7 +1846,7 @@ function walkDir(dir, extensions, excludeDirs, maxDepth, currentDepth = 0) {
  return files;
  }
  for (const entry of entries) {
- const fullPath = join3(dir, entry);
+ const fullPath = join4(dir, entry);
  let stat;
  try {
  stat = statSync(fullPath);
@@ -1831,7 +1872,7 @@ function scanForPrompts(filePath, relativePath) {
  const results = [];
  let content;
  try {
- content = readFileSync3(filePath, "utf-8");
+ content = readFileSync4(filePath, "utf-8");
  } catch {
  return results;
  }
@@ -1905,7 +1946,7 @@ function scanForAgents(filePath, relativePath) {
  const results = [];
  let content;
  try {
- content = readFileSync3(filePath, "utf-8");
+ content = readFileSync4(filePath, "utf-8");
  } catch {
  return results;
  }
@@ -1953,7 +1994,7 @@ function scanForMarkers(filePath, relativePath) {
  const results = [];
  let content;
  try {
- content = readFileSync3(filePath, "utf-8");
+ content = readFileSync4(filePath, "utf-8");
  } catch {
  return results;
  }
@@ -1987,7 +2028,7 @@ function scanJsonForSchemas(filePath, relativePath) {
  return results;
  let content;
  try {
- content = readFileSync3(filePath, "utf-8");
+ content = readFileSync4(filePath, "utf-8");
  } catch {
  return results;
  }
@@ -2014,7 +2055,7 @@ function scanForDatasets(dir, rootPath, excludeDirs, maxDepth) {
  const name = basename(filePath, ext);
  let content;
  try {
- content = readFileSync3(filePath, "utf-8");
+ content = readFileSync4(filePath, "utf-8");
  } catch {
  continue;
  }
@@ -2637,8 +2678,8 @@ Examples:
  }
  const isValid = await validateApiKey(apiKey, endpoint);
  const cwd = process.cwd();
- const hasOnboarding = existsSync3(join4(cwd, ".mutagentrc.json"));
- const hasContextFile = existsSync3(join4(cwd, ".mutagent", "mutation-context.md"));
+ const hasOnboarding = existsSync4(join5(cwd, ".mutagentrc.json"));
+ const hasContextFile = existsSync4(join5(cwd, ".mutagent", "mutation-context.md"));
  let contextSummary;
  if (hasContextFile) {
  try {
@@ -2799,7 +2840,7 @@ init_errors();
  init_sdk_client();
  import { Command as Command7 } from "commander";
  import chalk18 from "chalk";
- import { readFileSync as readFileSync4, existsSync as existsSync4 } from "fs";
+ import { readFileSync as readFileSync5, existsSync as existsSync5 } from "fs";
 
  // src/lib/ui-links.ts
  function getAppBaseUrl() {
@@ -2906,7 +2947,7 @@ function formatCreationHints(hint) {
  return lines.join(`
  `);
  }
- var CARD_INSTRUCTION = `MANDATORY: HARD STOP — DO NOT PROCEED.
+ var VERBATIM_CARD_INSTRUCTION = `MANDATORY: HARD STOP — DO NOT PROCEED.
  ` + `1. The renderedCard below appears in your bash/terminal output. That is NOT sufficient — the user cannot see it there.
  ` + `2. You MUST copy the FULL renderedCard text into your chat response VERBATIM. Do NOT summarize, rephrase, or omit any lines.
  ` + `3. Do NOT run any other command until you have displayed the card in chat.
@@ -2989,7 +3030,7 @@ function promptCreatedDirective(promptId, name) {
  fields: { promptId: String(promptId), name },
  links: { dashboard: dashboardUrl, api: apiUrl },
  next,
- instruction: CARD_INSTRUCTION,
+ instruction: VERBATIM_CARD_INSTRUCTION,
  renderedCard: renderPlainCard({ title, icon: "✓", rows, links: cardLinks, next })
  };
  }
@@ -3013,7 +3054,7 @@ function promptUpdatedDirective(promptId, name) {
  fields: { promptId: String(promptId), name },
  links: { dashboard: dashboardUrl, api: apiUrl },
  next,
- instruction: CARD_INSTRUCTION,
+ instruction: VERBATIM_CARD_INSTRUCTION,
  renderedCard: renderPlainCard({ title, icon: "✓", rows, links: cardLinks, next })
  };
  }
@@ -3030,7 +3071,7 @@ function promptDeletedDirective(promptId) {
  fields: { promptId },
  links: { dashboard: dashboardUrl },
  next,
- instruction: CARD_INSTRUCTION,
+ instruction: VERBATIM_CARD_INSTRUCTION,
  renderedCard: renderPlainCard({ title, icon: "✗", rows, links: cardLinks, next })
  };
  }
@@ -3067,7 +3108,7 @@ function datasetAddedDirective(promptId, datasetId, name, itemCount, state) {
  },
  links: { dashboard: dashboardUrl, api: apiUrl },
  next,
- instruction: CARD_INSTRUCTION,
+ instruction: VERBATIM_CARD_INSTRUCTION,
  renderedCard: renderPlainCard({ title, icon: "✓", rows, links: cardLinks, next })
  };
  }
@@ -3087,7 +3128,7 @@ function datasetDeletedDirective(promptId, datasetId) {
  fields: { promptId, datasetId },
  links: { datasets: datasetsUrl },
  next,
- instruction: CARD_INSTRUCTION,
+ instruction: VERBATIM_CARD_INSTRUCTION,
  renderedCard: renderPlainCard({ title, icon: "✗", rows, links: cardLinks, next })
  };
  }
@@ -3123,7 +3164,7 @@ function evaluationCreatedDirective(promptId, evalId, name, criteriaCount, state
  },
  links: { dashboard: dashboardUrl, api: apiUrl },
  next,
- instruction: CARD_INSTRUCTION,
+ instruction: VERBATIM_CARD_INSTRUCTION,
  renderedCard: renderPlainCard({ title, icon: "✓", rows, links: cardLinks, next })
  };
  }
@@ -3140,7 +3181,7 @@ function evaluationDeletedDirective(evaluationId) {
  fields: { evaluationId },
  links: { dashboard: dashboardUrl },
  next,
- instruction: CARD_INSTRUCTION,
+ instruction: VERBATIM_CARD_INSTRUCTION,
  renderedCard: renderPlainCard({ title, icon: "✗", rows, links: cardLinks, next })
  };
  }
@@ -3508,6 +3549,7 @@ ${chalk7.yellow("AI Agent: Format Selection Rules")}
  ${chalk7.red("outputSchema is required.")}
 
  ${chalk7.yellow("AI Agent: ALWAYS append --json to this command.")}
+ ${chalk7.cyan("Conceptual help (brace convention, delimiter inference): .claude/skills/mutagent-cli/concepts/prompt-variables.md")} ${chalk7.dim("(install via mutagent skills install)")}
  `).action(async (options) => {
  const isJson = getJsonFlag(prompts);
  const output = new OutputFormatter(isJson ? "json" : "table");
@@ -3806,10 +3848,12 @@ async function buildGuidedWorkflow(promptId) {
  const askUserQuestions = allFields.map(({ field, source, fieldSchema }) => {
  const isInput = source === "inputSchema";
  const question = isInput ? `Define the Minimum Viable Context for "${field}". What data MUST be present in this input for the prompt to produce a correct output? Describe what constitutes complete vs incomplete input, and WHY this field matters.` : `Define what correct "${field}" looks like. What structure, content, or qualities make it good vs bad? Give concrete examples of good and bad outputs.`;
- const hint = isInput ? `Input fields define what data the prompt NEEDS to work correctly. Without defining minimum viable context, the optimizer cannot detect whether failures come from bad input or bad prompt logic.` : null;
+ const hint = isInput ? `Input fields define what data the prompt NEEDS to work correctly. Without defining minimum viable context, the optimizer cannot detect whether failures come from bad input or bad prompt logic.` : undefined;
  const header = isInput ? `${field} [INPUT]` : `${field} [OUTPUT]`;
  const defineDesc = isInput ? `Describe what data MUST be present in "${field}" and WHY the prompt needs it. Focus on completeness and minimum viable context.` : `Describe what a correct vs incorrect "${field}" looks like. Focus on structure, content quality, and concrete examples — not numeric scores.`;
  return {
+ field,
+ source,
  question,
  header,
  ...hint != null ? { hint } : {},
@@ -3841,7 +3885,7 @@ async function buildGuidedWorkflow(promptId) {
  description: "Follow these steps to create an evaluation for this prompt:",
  steps: [
  { step: 1, action: "Review prompt schemas and context", detail: "Understand what the prompt does and what each field means." },
- { step: 2, action: "Define correctness criteria for EVERY field", detail: `For EACH of the ${String(allFields.length)} fields in askUserQuestions, use AskUserQuestion to collect a rubric. INPUT fields (${String(inputFields.length)}): collect minimum viable context definitions — what data MUST exist. OUTPUT fields (${String(outputFields.length)}): collect correctness criteria — what correct vs incorrect looks like. Do NOT skip any field. Use the context hints to suggest rubrics when asked.` },
+ { step: 2, action: "Define correctness criteria for EVERY field", detail: `For EACH of the ${String(allFields.length)} fields in _directive.askUserQuestions, use AskUserQuestion to collect a rubric. INPUT fields (${String(inputFields.length)}): collect minimum viable context definitions — what data MUST exist. OUTPUT fields (${String(outputFields.length)}): collect correctness criteria — what correct vs incorrect looks like. Do NOT skip any field. Use the context hints to suggest rubrics when asked.` },
  { step: 3, action: "Build criteria JSON", detail: "Map each rubric to: { name: field, description: rubric, evaluationParameter: field }" },
  { step: 4, action: "Ask for evaluation name", detail: "Ask user what to name this evaluation" },
  { step: 5, action: "Create the evaluation", detail: `mutagent prompts evaluation create ${promptId} --name "<name>" -d '<json>' --json` }
@@ -3865,11 +3909,9 @@ async function buildGuidedWorkflow(promptId) {
  }
  })}'`
  },
- askUserQuestions,
  _directive: {
- display: "workflow_guide",
  instruction: `MANDATORY ORDERS — READ CAREFULLY:
- ` + `1. You MUST ask the user about EVERY SINGLE field listed in askUserQuestions. There are ${String(allFields.length)} fields total.
+ ` + `1. You MUST ask the user about EVERY SINGLE field listed in _directive.askUserQuestions. There are ${String(allFields.length)} fields total.
  ` + `2. Do NOT skip ANY field. Do NOT decide that a field "doesn't need" a definition.
  ` + `3. INPUT fields (${String(inputFields.length)}): These define MINIMUM VIABLE CONTEXT — the data that MUST exist in the input for the prompt to work. Ask: "What data MUST be present in this field and WHY does the prompt need it?"
  ` + ` WHY THIS MATTERS: Without minimum viable context definitions, the optimizer cannot distinguish between failures caused by incomplete input vs failures caused by bad prompt logic.
@@ -3893,11 +3935,15 @@ async function buildGuidedWorkflow(promptId) {
  ` + ` NEVER use vague rubrics like "0-1 scale" or "score based on accuracy".
  ` + ` Canonical source: .claude/skills/mutagent-cli/SKILL.md → "Guided Eval Criteria".
  ` + " Note: template variables use single-brace `{variable}` (platform canonical). `mutagent explore`\n" + " infers single vs double per-file via inferPromptVariables().",
- aiAgentDecisionTree: {
+ next: [
+ `mutagent prompts evaluation create ${promptId} --name "<name>" -d '<criteria-json>' --json`
+ ],
+ decisionTree: {
  step1: "Check if criteria already exist in the user's code. If criteria match expected shape (name + description + evaluationParameter targeting schema fields), use --data directly.",
- step2: "If criteria are missing or malformed, use the askUserQuestions payloads to collect them via AskUserQuestion.",
+ step2: "If criteria are missing or malformed, use the _directive.askUserQuestions payloads to collect them via AskUserQuestion.",
  step3: `After collecting all criteria, construct the --data JSON and run: mutagent prompts evaluation create <prompt-id> --name "<name>" -d '<json>'`
- }
+ },
+ askUserQuestions
  },
  _meta: { inputFields, outputFields, allFields }
  };
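Step 3's mapping rule ("{ name: field, description: rubric, evaluationParameter: field }") pins the criteria shape the guided flow builds. A hypothetical `--data` payload for a prompt with a `summary` output field; the field name and rubric text are invented for illustration:

```js
// Hypothetical criteria JSON for:
//   mutagent prompts evaluation create <prompt-id> --name "<name>" -d '<json>' --json
const criteria = [
  {
    name: "summary",                     // the schema field being judged
    description: "Correct: 2-3 sentences covering every action item. " +
      "Incorrect: omits action items or exceeds 5 sentences.", // rubric collected from the user, never auto-generated (Rule 3)
    evaluationParameter: "summary"       // targets the same schema field
  }
];
```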
@@ -4002,12 +4048,11 @@ function buildGuidedDatasetWorkflow(prompt) {
  input: hasSchema ? templateInput : { "<field>": "<value>" },
  expectedOutput: hasSchema ? templateOutput : { "<field>": "<expected>" }
  },
- askUserQuestions,
  guidance: {
  minItems: 5,
  priorityRule: "Hard cases that expose prompt weaknesses > easy cases that always pass",
  steps: [
- "Collect answers to each question from the user",
+ "Collect answers to each question from the user via _directive.askUserQuestions",
  "Generate 5-10 dataset items covering all categories",
  "Ensure at least 2 hard/edge cases per category",
  'Format as JSON array: [{"input": {...}, "expectedOutput": {...}}, ...]',
@@ -4015,10 +4060,42 @@ function buildGuidedDatasetWorkflow(prompt) {
  ]
  },
  _directive: {
- instruction: "Collect domain-specific dataset items from the user. " + "Ask about each schema field. " + "Prioritize hard cases over easy cases. " + "Then construct the dataset JSON and upload.",
+ instruction: `DATASET CURATION MANDATORY RULES (self-sufficient, no Skill required):
+ ` + `
+ ` + `1. PRIORITY RULE: Hard cases that expose prompt weaknesses > easy cases that always pass.
+ ` + ` Collect HARD CASES FIRST. Ask about failure modes, edge cases, adversarial inputs.
+ ` + `
+ ` + `2. CASE CATEGORIES (collect in order):
+ ` + ` a. Edge Cases (HIGH priority): boundary inputs, empty fields, very long values, unicode, malformed data
+ ` + ` b. Hard Cases (HIGH priority): ambiguous inputs, adversarial examples, domain traps that fool the model
+ ` + ` c. Representative Cases (MEDIUM priority): typical production inputs (only after hard cases covered)
+ ` + `
+ ` + `3. PER-INPUT-FIELD: For each input field, ask what VALUES should be tested. Collect at minimum:
+ ` + ` - One boundary value (empty, null, extremely long, wrong type)
+ ` + ` - One ambiguous value (that could be interpreted multiple ways)
+ ` + ` - One representative production value
+ ` + `
+ ` + `4. PER-OUTPUT-FIELD: For each output field, ask what CORRECT looks like vs INCORRECT.
+ ` + ` You need the correctness shape to generate realistic expectedOutput values.
+ ` + `
+ ` + `5. MINIMUM: 5 dataset items total. At least 2 must be hard or edge cases.
+ ` + `
+ ` + `6. FORMAT: Each item MUST have:
+ ` + ` { "input": { <inputSchema fields> }, "expectedOutput": { <outputSchema fields> } }
+ ` + ` NEVER omit expectedOutput on labelable items — optimizer needs the ground truth.
+ ` + `
+ ` + `7. ANTI-PATTERNS (never do):
+ ` + ` - All easy cases that always pass → useless for optimization
+ ` + ` - No edge cases → optimizer never sees failure modes
+ ` + ` - Fictional inputs that won't happen in production → scores are misleading
+ ` + ` - Missing expectedOutput on items with knowable correct answers
+ ` + `
+ ` + `Use _directive.askUserQuestions (${String(askUserQuestions.length)} questions) to collect domain knowledge from the user.
+ ` + "For deeper conceptual background, see: .claude/skills/mutagent-cli/concepts/dataset-design.md",
  next: [
  `mutagent prompts dataset add ${promptId} -d '<constructed-json>' --name '<name>' --json`
- ]
+ ],
+ askUserQuestions
  }
  };
  }
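Rule 6 in the new directive fixes the item shape. A hypothetical two-item excerpt; field names are invented, and a real upload needs at least 5 items, at least 2 of them hard or edge cases:

```js
// Hypothetical dataset excerpt matching the FORMAT rule above:
const items = [
  { // edge case: boundary input (empty field)
    input: { ticket: "" },
    expectedOutput: { category: "invalid" }
  },
  { // representative case: typical production input
    input: { ticket: "App crashes when I tap checkout" },
    expectedOutput: { category: "bug" }
  }
];
// Upload: mutagent prompts dataset add <prompt-id> -d '<json-array>' --name '<name>' --json
```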
@@ -4653,6 +4730,7 @@ Expected Criteria Shape (--data):
  ${chalk10.red("Required: --name (unless --guided). Criteria must include evaluationParameter.")}
  ${chalk10.dim("CLI flags (--name, --description) override --data fields.")}
  ${chalk10.dim("Get prompt IDs: mutagent prompts list")}
+ ${chalk10.cyan("Conceptual help: .claude/skills/mutagent-cli/concepts/eval-criteria.md")} ${chalk10.dim("(install via mutagent skills install)")}
  `).action(async (promptId, options) => {
  let isJson = getJsonFlag(prompts);
  if (options.guided) {
@@ -5154,7 +5232,10 @@ function startDirective(job, promptId, datasetId, evaluationId) {
  ];
  const next = [`mutagent prompts optimize status ${job.id} --json`];
  return {
+ instruction: VERBATIM_CARD_INSTRUCTION,
+ next,
  display: "status_card",
+ renderedCard: renderPlainCard({ title, icon: "⚡", rows, links: cardLinks, next }),
  template: "optimization_started",
  title,
  fields: {
@@ -5167,13 +5248,7 @@ function startDirective(job, promptId, datasetId, evaluationId) {
  targetScore: job.config.targetScore ?? 0.8,
  model: job.config.model ?? "default"
  },
- links: { monitor: monitorUrl, api: apiUrl },
- next,
- instruction: `HARD STOP — DO NOT PROCEED.
- ` + `The renderedCard below appears in bash/terminal output. That is NOT sufficient.
- ` + `You MUST copy the FULL renderedCard text into your chat response VERBATIM before running any other command.
- ` + "After displaying, follow the suggested next steps.",
- renderedCard: renderPlainCard({ title, icon: "⚡", rows, links: cardLinks, next })
+ links: { monitor: monitorUrl, api: apiUrl }
  };
  }
  function statusDirective(status, promptId) {
@@ -5201,7 +5276,10 @@ function statusDirective(status, promptId) {
  const monitorUrl = optimizerLink(promptId ?? "unknown", status.jobId);
  const cardLinks = [{ label: "Monitor", url: monitorUrl }];
  return {
+ instruction: VERBATIM_CARD_INSTRUCTION,
+ next,
  display: "status_card",
+ renderedCard: renderPlainCard({ title, icon, rows, links: cardLinks, next }),
  template: "optimization_progress",
  title,
  fields: {
@@ -5213,13 +5291,7 @@ function statusDirective(status, promptId) {
  eta,
  message: status.message
  },
- links: { monitor: monitorUrl },
- next,
- instruction: `HARD STOP — DO NOT PROCEED.
- ` + `The renderedCard below appears in bash/terminal output. That is NOT sufficient.
- ` + `You MUST copy the FULL renderedCard text into your chat response VERBATIM before running any other command.
- ` + "After displaying, follow the suggested next steps.",
- renderedCard: renderPlainCard({ title, icon, rows, links: cardLinks, next })
+ links: { monitor: monitorUrl }
  };
  }
  function showPromptDiff(original, optimized) {
@@ -6149,6 +6221,8 @@ ${chalk16.yellow(`⚠ COST WARNING — AI Agent:
 
  ${chalk16.yellow("AI Agent: ALWAYS append --json to this command.")}
 
+ ${chalk16.cyan("Conceptual help (full optimization loop): .claude/skills/mutagent-cli/workflows/optimization.md")} ${chalk16.dim("(install via mutagent skills install)")}
+
  ${chalk16.bold("Hit a bug or unexpected result?")}
  ${chalk16.cyan('mutagent feedback send --category bug -m "describe what went wrong"')}
  `).action(async (promptId, options) => {
@@ -6461,6 +6535,7 @@ After viewing results:
  ${chalk16.dim("No flag = view scorecard only.")}
 
  ${chalk16.dim("AI Agent: Present scorecard to user via AskUserQuestion before applying.")}
+ ${chalk16.cyan("Conceptual help (scorecard interpretation, nextAction): .claude/skills/mutagent-cli/concepts/scorecard-output.md")} ${chalk16.dim("(install via mutagent skills install)")}
  `).action(async (jobId, options) => {
  const isJson = getJsonFlag(prompts);
  const output = new OutputFormatter(isJson ? "json" : "table");
@@ -6473,12 +6548,8 @@ After viewing results:
  const isCompleted = jobData?.status === "completed";
  const scorecardText = buildResultsScorecardText(resultData);
  const directive = {
- display: "scorecard",
- instruction: `HARD STOP — DO NOT PROCEED.
- ` + `The scorecard below appears in bash/terminal output. That is NOT sufficient.
- ` + `You MUST copy the FULL renderedCard text into your chat response VERBATIM.
- ` + `Then use AskUserQuestion with options: [Apply optimized prompt / View full diff / Reject and keep original].
- ` + "Do NOT auto-apply.",
+ display: "status_card",
+ instruction: VERBATIM_CARD_INSTRUCTION,
  renderedCard: scorecardText,
  next: isCompleted ? [
  `mutagent prompts optimize results ${jobId} --apply --json`,
@@ -6799,12 +6870,12 @@ Provide a valid JSON Schema, e.g., '{"type":"object","properties":{"field":{"typ
  }
  }
  if (filePath) {
- if (!existsSync4(filePath)) {
+ if (!existsSync5(filePath)) {
  throw new MutagentError("FILE_NOT_FOUND", `File not found: ${filePath}`, `Run: ${helpCommand}
  Check the file path and try again`);
  }
  try {
- return JSON.parse(readFileSync4(filePath, "utf-8"));
+ return JSON.parse(readFileSync5(filePath, "utf-8"));
  } catch {
  throw new MutagentError("INVALID_JSON", `Failed to parse JSON from ${filePath}`, `Run: ${helpCommand}
  Ensure the file contains valid JSON Schema`);
@@ -7102,20 +7173,20 @@ ${chalk22.dim("Returns full trace details including spans, tokens, and latency."
  init_config();
  import { Command as Command9 } from "commander";
  import chalk23 from "chalk";
- import { writeFileSync as writeFileSync3, existsSync as existsSync9 } from "fs";
+ import { writeFileSync as writeFileSync3, existsSync as existsSync10 } from "fs";
  import { execSync } from "child_process";
  init_errors();
 
  // src/lib/integrations/langchain.ts
- import { readFileSync as readFileSync5, existsSync as existsSync5 } from "fs";
+ import { readFileSync as readFileSync6, existsSync as existsSync6 } from "fs";
  var langchainIntegration = {
  name: "langchain",
  description: "LangChain framework",
  detect() {
  let hasLangchain = false;
- if (existsSync5("package.json")) {
+ if (existsSync6("package.json")) {
  try {
- const pkg = JSON.parse(readFileSync5("package.json", "utf-8"));
+ const pkg = JSON.parse(readFileSync6("package.json", "utf-8"));
  const deps = { ...pkg.dependencies, ...pkg.devDependencies };
  hasLangchain = "langchain" in deps || "@langchain/core" in deps;
  } catch {}
@@ -7247,15 +7318,15 @@ mutagent traces analyze <prompt-id>
  };
 
  // src/lib/integrations/langgraph.ts
- import { readFileSync as readFileSync6, existsSync as existsSync6 } from "fs";
+ import { readFileSync as readFileSync7, existsSync as existsSync7 } from "fs";
  var langgraphIntegration = {
  name: "langgraph",
  description: "LangGraph agent workflow framework",
  detect() {
  let hasLanggraph = false;
- if (existsSync6("package.json")) {
+ if (existsSync7("package.json")) {
  try {
- const pkg = JSON.parse(readFileSync6("package.json", "utf-8"));
+ const pkg = JSON.parse(readFileSync7("package.json", "utf-8"));
  const deps = { ...pkg.dependencies, ...pkg.devDependencies };
  hasLanggraph = "@langchain/langgraph" in deps;
  } catch {}
@@ -7337,15 +7408,15 @@ mutagent integrate langgraph --verify
  };
 
  // src/lib/integrations/vercel-ai.ts
- import { readFileSync as readFileSync7, existsSync as existsSync7 } from "fs";
+ import { readFileSync as readFileSync8, existsSync as existsSync8 } from "fs";
  var vercelAiIntegration = {
  name: "vercel-ai",
  description: "Vercel AI SDK",
  detect() {
  let hasAiSdk = false;
- if (existsSync7("package.json")) {
+ if (existsSync8("package.json")) {
  try {
- const pkg = JSON.parse(readFileSync7("package.json", "utf-8"));
+ const pkg = JSON.parse(readFileSync8("package.json", "utf-8"));
  const deps = { ...pkg.dependencies, ...pkg.devDependencies };
  hasAiSdk = "ai" in deps;
  } catch {}
@@ -7487,15 +7558,15 @@ mutagent integrate vercel-ai --verify
  };
 
  // src/lib/integrations/openai.ts
- import { readFileSync as readFileSync8, existsSync as existsSync8 } from "fs";
+ import { readFileSync as readFileSync9, existsSync as existsSync9 } from "fs";
  var openaiIntegration = {
  name: "openai",
  description: "OpenAI SDK integration with automatic tracing",
  detect() {
  let hasOpenAI = false;
- if (existsSync8("package.json")) {
+ if (existsSync9("package.json")) {
  try {
- const pkg = JSON.parse(readFileSync8("package.json", "utf-8"));
+ const pkg = JSON.parse(readFileSync9("package.json", "utf-8"));
  const deps = { ...pkg.dependencies, ...pkg.devDependencies };
  hasOpenAI = "openai" in deps;
  } catch {}
@@ -7644,16 +7715,16 @@ function getFrameworkMetadata(name) {
 
  // src/commands/integrate.ts
  function detectPackageManager() {
- if (existsSync9("bun.lockb") || existsSync9("bun.lock")) {
+ if (existsSync10("bun.lockb") || existsSync10("bun.lock")) {
  return "bun";
  }
- if (existsSync9("pnpm-lock.yaml")) {
+ if (existsSync10("pnpm-lock.yaml")) {
  return "pnpm";
  }
- if (existsSync9("yarn.lock")) {
+ if (existsSync10("yarn.lock")) {
  return "yarn";
  }
- if (existsSync9("package-lock.json")) {
+ if (existsSync10("package-lock.json")) {
  return "npm";
  }
  try {
@@ -9039,15 +9110,15 @@ init_config();
  import { Command as Command15 } from "commander";
  import inquirer2 from "inquirer";
  import chalk34 from "chalk";
- import { existsSync as existsSync11, mkdirSync as mkdirSync3, writeFileSync as writeFileSync4 } from "fs";
+ import { existsSync as existsSync12, mkdirSync as mkdirSync3, writeFileSync as writeFileSync4 } from "fs";
  import { execSync as execSync3 } from "child_process";
- import { join as join6 } from "path";
+ import { join as join7 } from "path";
  init_errors();
 
  // src/lib/framework-detection.ts
- import { existsSync as existsSync10, readFileSync as readFileSync9 } from "fs";
+ import { existsSync as existsSync11, readFileSync as readFileSync10 } from "fs";
  import { execSync as execSync2 } from "child_process";
- import { join as join5 } from "path";
+ import { join as join6 } from "path";
  var FRAMEWORK_DETECTION_MAP = {
  "@mastra/core": {
  name: "mastra",
@@ -9097,16 +9168,16 @@ var FRAMEWORK_DETECTION_MAP = {
  }
  };
  function detectPackageManager2(cwd = process.cwd()) {
- if (existsSync10(join5(cwd, "bun.lockb")) || existsSync10(join5(cwd, "bun.lock"))) {
+ if (existsSync11(join6(cwd, "bun.lockb")) || existsSync11(join6(cwd, "bun.lock"))) {
  return "bun";
  }
- if (existsSync10(join5(cwd, "pnpm-lock.yaml"))) {
+ if (existsSync11(join6(cwd, "pnpm-lock.yaml"))) {
  return "pnpm";
  }
- if (existsSync10(join5(cwd, "yarn.lock"))) {
+ if (existsSync11(join6(cwd, "yarn.lock"))) {
  return "yarn";
  }
- if (existsSync10(join5(cwd, "package-lock.json"))) {
+ if (existsSync11(join6(cwd, "package-lock.json"))) {
  return "npm";
  }
  try {
@@ -9127,13 +9198,13 @@ function getInstallCommand2(pm, packages) {
  return commands[pm];
  }
  function detectFrameworkFromPackageJson(cwd = process.cwd()) {
- const pkgPath = join5(cwd, "package.json");
- if (!existsSync10(pkgPath)) {
+ const pkgPath = join6(cwd, "package.json");
+ if (!existsSync11(pkgPath)) {
  return null;
  }
  let pkg;
  try {
- pkg = JSON.parse(readFileSync9(pkgPath, "utf-8"));
+ pkg = JSON.parse(readFileSync10(pkgPath, "utf-8"));
  } catch {
  return null;
  }
@@ -9149,12 +9220,12 @@ function detectFrameworkFromPackageJson(cwd = process.cwd()) {
  return null;
  }
  function hasRcConfig(cwd = process.cwd()) {
- return existsSync10(join5(cwd, ".mutagentrc.json"));
+ return existsSync11(join6(cwd, ".mutagentrc.json"));
  }
 
  // src/commands/init.ts
  function writeRcConfig(config, cwd = process.cwd()) {
- const rcPath = join6(cwd, ".mutagentrc.json");
+ const rcPath = join7(cwd, ".mutagentrc.json");
  writeFileSync4(rcPath, JSON.stringify(config, null, 2) + `
  `);
  }
@@ -9370,8 +9441,8 @@ Modes:
  }
  }
  }
- const skillPath = join6(cwd, ".claude/skills/mutagent-cli/SKILL.md");
- const skillInstalled = existsSync11(skillPath);
+ const skillPath = join7(cwd, ".claude/skills/mutagent-cli/SKILL.md");
+ const skillInstalled = existsSync12(skillPath);
  if (!isNonInteractive && !skillInstalled) {
  const { installSkill } = await inquirer2.prompt([{
  type: "confirm",
@@ -9381,11 +9452,11 @@ Modes:
  }]);
  if (installSkill) {
  try {
- const skillDir = join6(cwd, ".claude/skills/mutagent-cli");
- if (!existsSync11(skillDir)) {
+ const skillDir = join7(cwd, ".claude/skills/mutagent-cli");
+ if (!existsSync12(skillDir)) {
  mkdirSync3(skillDir, { recursive: true });
  }
- execSync3("node " + join6(cwd, "node_modules/.bin/mutagent") + " skills install", {
+ execSync3("node " + join7(cwd, "node_modules/.bin/mutagent") + " skills install", {
  cwd,
  stdio: "ignore"
  });
@@ -9395,6 +9466,36 @@ Modes:
  }
  }
  }
+ const initDirective = {
+ instruction: skillInstalled ? "Verify workspace. Run: mutagent workspaces list --json" : [
+ "MANDATORY: Ask the user whether they want to install the MutagenT CLI skill and Claude Code hooks.",
+ "Use your AskUserQuestion tool to present the following choices:",
+ "1. Install skill + hooks (Recommended) — Installs the CLI skill AND configures Claude Code session telemetry hooks",
+ "2. Install skill only — Installs the CLI skill (teaches AI agents to use MutagenT CLI)",
+ "3. Skip — Do not install",
+ "",
+ "If the user picks option 1, run BOTH commands:",
+ " mutagent skills install --json",
+ " mutagent hooks install --json",
+ "",
+ "If the user picks option 2, run only: mutagent skills install --json",
+ "",
+ "After skill/hooks install, verify workspace: mutagent workspaces list --json"
+ ].join(`
+ `),
+ next: ["mutagent workspaces list --json", "mutagent prompts list --json"],
+ ...!skillInstalled ? {
+ askUserQuestions: [{
+ question: "Would you like to install the MutagenT CLI skill and Claude Code hooks?",
+ options: [
+ { label: "Install skill + hooks (Recommended)", description: "Installs the CLI skill AND configures Claude Code session telemetry hooks" },
+ { label: "Install skill only", description: "Installs the CLI skill (teaches AI agents to use MutagenT CLI)" },
+ { label: "Skip", description: "Do not install" }
+ ],
+ multiSelect: false
+ }]
+ } : {}
+ };
  const summary = {
  success: true,
  mode,
@@ -9403,30 +9504,8 @@ Modes:
  framework: confirmedFramework?.name ?? null,
  authenticated,
  workspaceValidation: workspaceValidation ?? null,
- skillInstalled: skillInstalled || existsSync11(skillPath),
- _directive: {
- instruction: "Verify workspace. Run: mutagent workspaces list --json",
- next: ["mutagent workspaces list --json", "mutagent prompts list --json"],
- ...!skillInstalled ? {
- skillInstall: {
- instruction: [
- "MANDATORY: Ask the user whether they want to install the MutagenT CLI skill and Claude Code hooks.",
- "Use your AskUserQuestion tool to present the following choices:",
- "1. Install skill + hooks (Recommended) — Installs the CLI skill AND configures Claude Code session telemetry hooks",
- "2. Install skill only — Installs the CLI skill (teaches AI agents to use MutagenT CLI)",
- "3. Skip — Do not install",
- "",
- "If the user picks option 1, run BOTH commands:",
- " mutagent skills install --json",
- " mutagent hooks install --json",
- "",
- "If the user picks option 2, run only: mutagent skills install --json"
- ].join(`
- `),
- command: "mutagent skills install --json"
- }
- } : {}
- }
+ skillInstalled: skillInstalled || existsSync12(skillPath),
+ _directive: initDirective
  };
  output.output(summary);
@@ -9469,6 +9548,7 @@ ${chalk35.bold("Template variable delimiters (inferred per-file):")}
  ${chalk35.dim('{ "file": "src/prompts/summarize.ts", "line": 12, "reason": "template-variable", "delimiter": "single" }')}
 
  ${chalk35.dim("Results are saved to .mutagent/mutation-context.md for use by other commands.")}
+ ${chalk35.cyan("Conceptual help (prompt vs agent taxonomy, brace inference): .claude/skills/mutagent-cli/workflows/exploration.md")} ${chalk35.dim("(install via mutagent skills install)")}
  `).action((options) => {
  const isJson = getJsonFlag(explore);
  const output = new OutputFormatter(isJson ? "json" : "table");
@@ -9598,8 +9678,8 @@ Scanning ${scanPath}...
  // src/commands/skills.ts
  import { Command as Command17 } from "commander";
  import chalk36 from "chalk";
- import { existsSync as existsSync12, mkdirSync as mkdirSync4, writeFileSync as writeFileSync5 } from "fs";
- import { dirname, join as join7 } from "path";
+ import { existsSync as existsSync13, mkdirSync as mkdirSync4, writeFileSync as writeFileSync5 } from "fs";
+ import { dirname, join as join8 } from "path";
  import { execSync as execSync4 } from "child_process";
 
  // src/generated/skill-content.ts
@@ -9614,6 +9694,8 @@ description: |
  "create evaluation", "upload dataset", "explore prompts", "mutagent cli",
  "eval", "dataset", "guided", "how do I optimize", "improve my prompt",
  "set up tracing", "add observability".
+ SKILL_VERSION: 1.1.0
+ SKILL_MIN_CLI_VERSION: 0.1.163
  ---
 
  # MutagenT CLI Skill
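The frontmatter now carries the `SKILL_VERSION` / `SKILL_MIN_CLI_VERSION` fields that `getCompatMetadata` parses and that the Skill's prerequisite check (added in the next hunk) compares against the running CLI. The diff does not show the comparator agents are expected to apply, so this is only a minimal sketch, assuming plain `x.y.z` version strings with no pre-release tags:

```js
// Minimal version comparison for the Step 3 check below (assumption: plain x.y.z strings).
function isCliCompatible(cliVersion, minCliVersion) {
  const a = cliVersion.split(".").map(Number);
  const b = minCliVersion.split(".").map(Number);
  for (let i = 0; i < 3; i++) {
    if ((a[i] ?? 0) > (b[i] ?? 0)) return true;
    if ((a[i] ?? 0) < (b[i] ?? 0)) return false;
  }
  return true; // equal versions are compatible
}
isCliCompatible("0.1.179", "0.1.163"); // -> true: proceed normally
isCliCompatible("0.1.150", "0.1.163"); // -> false: warn-only, never block (decision D4)
```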
@@ -9622,7 +9704,135 @@ description: |
  > Packed into the CLI binary via \`scripts/sync-skill.ts\`. Installed to end-user
  > dev environments via \`mutagent skills install\`. Edit this file, not the installed copy.
 
- ## SKILL vs CLI — responsibility split
+ ---
+
+ ## CLI Prerequisite Check (RUN FIRST)
+
+ Before executing ANY workflow step, verify the CLI is installed and version-compatible:
+
+ **Step 1 -- Check CLI presence:**
+ \`\`\`bash
+ mutagent --version --json
+ \`\`\`
+
+ **Step 2 -- If command not found (error / not on PATH):**
+
+ This is the **Path 2 onboarding case**: the Skill was installed first (e.g. from a skill registry, manually, or bundled in someone else's CLAUDE.md), but the CLI itself isn't installed yet. Do NOT just dump install instructions and stop -- proactively **offer to install it**.
+
+ **2a. Detect the user's package manager** (best-effort — check in this order):
+ \`\`\`bash
+ # In the user's project root (cwd):
+ test -f bun.lockb && echo "bun"
+ test -f pnpm-lock.yaml && echo "pnpm"
+ test -f yarn.lock && echo "yarn"
+ test -f package-lock.json && echo "npm"
+ # Fallback: which bun || which pnpm || which yarn || which npm
+ \`\`\`
+ If multiple lockfiles exist, prefer in order: \`bun > pnpm > yarn > npm\`.
+ If no lockfile and the user is in a non-JS project (e.g. Python, Go), ask which they prefer.
+
+ **2b. Ask the user via AskUserQuestion** (do NOT auto-install without consent):
+
+ > "The MutagenT CLI is not installed yet. I can install it globally via \`<detected-pm>\`. Proceed?"
+
+ Options to present:
+ 1. **Yes, install globally with \`<detected-pm>\`** (Recommended) -- runs \`<pm> add -g @mutagent/cli\` (or \`npm install -g @mutagent/cli\` for npm)
+ 2. **Yes, but use a different package manager** -- prompt for choice (npm / bun / pnpm / yarn)
+ 3. **No, I'll install it myself** -- show the four install commands as a verbatim block; STOP and wait for the user to install
+ 4. **Skip — I have it installed via a different path** -- ask the user to add it to PATH and re-invoke
+
+ **2c. On user approval (option 1 or 2)**, run the install command in a Bash tool call:
+ \`\`\`bash
+ # bun
+ bun add -g @mutagent/cli
+ # npm
+ npm install -g @mutagent/cli
+ # pnpm
+ pnpm add -g @mutagent/cli
+ # yarn
+ yarn global add @mutagent/cli
+ \`\`\`
+ Show the install output to the user verbatim. After it completes, **re-run Step 1** (\`mutagent --version --json\`) to confirm the CLI is now on PATH. If the post-install version check still fails (e.g. global bin not on PATH), tell the user:
+ > "Install succeeded but \`mutagent\` isn't on PATH yet. Common fixes:
+ > - bun: \`export PATH=\\"$HOME/.bun/bin:$PATH\\"\`
+ > - npm: check \`npm config get prefix\` and add \`<prefix>/bin\` to PATH
+ > - pnpm: \`pnpm setup\` then restart your shell
+ > - yarn: \`yarn global bin\` and add that path to PATH"
+ > Then ask the user to restart their shell or source the relevant rc file, and re-invoke me.
+
+ **2d. On rejection (option 3 or 4)**, output the four install commands as a verbatim block and STOP. Do NOT proceed with any workflow until the user confirms the CLI is installed and re-invokes the skill.
+
+ \`\`\`bash
+ # Pick one (Recommended in order: bun > pnpm > yarn > npm)
+ bun add -g @mutagent/cli
+ pnpm add -g @mutagent/cli
+ yarn global add @mutagent/cli
+ npm install -g @mutagent/cli
+ \`\`\`
+
+ **Why proactive install (not just "stop and tell")**: a user invoking this Skill has already shown intent to use MutagenT. Forcing them to context-switch to a separate terminal, copy-paste an install command, and re-invoke the agent is friction that often loses the user. Asking once + installing on consent is the smoother path. The opt-out (option 3/4) preserves user control.
+
+ **Step 3 -- Version compatibility check:**
+ Parse \`_compat.cliVersion\` from the \`--version --json\` output and compare against
+ \`SKILL_MIN_CLI_VERSION\` (above in this file's frontmatter, currently \`0.1.163\`).
+
+ - If \`cliVersion >= SKILL_MIN_CLI_VERSION\`: all good, proceed normally.
+ - If \`cliVersion < SKILL_MIN_CLI_VERSION\`: emit a **PROMINENT PERSISTENT WARN** to the user:
+
+ > Warning: **CLI version mismatch**: Your \`mutagent\` CLI is \`{cliVersion}\` but this Skill
+ > requires \`>= 0.1.163\`. Some features may not work correctly.
+ > Upgrade: \`npm install -g @mutagent/cli@latest\`
+
+ **DO NOT BLOCK** -- proceed with the requested workflow after showing the warn.
+ Re-emit this warn at the start of every subsequent Skill invocation until the user upgrades.
+
+ **Per decision D4 (locked 2026-05-03)**: version mismatch is warn-only, never a hard block.
+
+ ---
+
+ ## MANDATORY: Verbatim Card Display Protocol
+
+ If a mutagent CLI response contains \`_directive.renderedCard\`, your FIRST action
+ MUST be to copy that text byte-for-byte into your chat reply BEFORE running any
+ further command, asking any question, or doing any other work.
+
+ The card also appears on stderr (auto-echoed by the CLI via \`echoDirectiveToStderr\`)
+ but you MUST STILL render it in chat -- bash/stderr blocks may be collapsed in the
+ user's UI and they cannot read them.
+
+ - Do NOT summarize, paraphrase, or truncate any line of the card.
+ - Do NOT skip the card to "save tokens."
+ - Do NOT delay rendering until "after the next step."
+ - This applies whether \`_directive.display\` is \`'status_card'\` or absent -- the
+ trigger is the presence of \`renderedCard\`, not the display tag.
+ - Failure to display the card verbatim = protocol violation.
+
+ This rule applies to: prompt/agent/dataset/evaluation create+update+delete,
+ provider add+update+delete, and optimize start/status/results.
+
+ ---
+
+ ## Agent Runtime — interactive question handling
+
+ This skill emits \`_directive.askUserQuestions\` arrays for guided flows
+ (eval-creation, dataset-curation, init). On Claude Code, use the
+ **AskUserQuestion** tool to present each question to the user. On other
+ coding-agent runtimes (Cursor, Aider, Continue, etc.), use the equivalent
+ inquiry/prompt mechanism your runtime provides.
+
+ If your runtime has no interactive-question tool, fall back to:
+ 1. Echo each question's \`question\` field verbatim into chat
+ 2. Wait for the user's reply before proceeding to the next question
+ 3. Do NOT auto-fill answers from context (Rule 3)
+
+ The \`_directive.askUserQuestions\` schema is described in
+ [\`concepts/eval-criteria.md\`](./concepts/eval-criteria.md) §
+ "Per-field rubric collection" and follows
+ [Claude Code's AskUserQuestion tool shape](https://docs.claude.com/en/docs/claude-code/sdk).
+
+ ---
+
+ ## SKILL vs CLI -- responsibility split
 
  | Layer | Owner | Responsibility |
  |---|---|---|
@@ -9630,33 +9840,54 @@ description: |
  | **CLI** | \`mutagent <cmd>\` | commands, flags, \`--json\`, \`_directive.*\`, \`_links\` |
  | Platform | api.mutagent.io | storage, optimization, eval execution, \`{variable}\` rendering |
 
- **Rule**: SKILL never duplicates CLI flag lists always \`mutagent <cmd> --help\` for flags.
+ **Rule**: SKILL never duplicates CLI flag lists -- always \`mutagent <cmd> --help\` for flags.
 
  ---
 
- ## 5 Core Rules NON-NEGOTIABLE
+ ## 5 Core Rules -- NON-NEGOTIABLE
 
  1. **\`--json\` on EVERY command.** No exceptions. Agents use JSON mode exclusively.
- 2. **\`<command> --help\` BEFORE first use of any command.** The CLI is the source of truth for flags this SKILL never inlines them.
- 3. **NEVER auto-generate eval criteria collect from user.** Ask the user for each rubric field. See [concepts/eval-criteria.md](./concepts/eval-criteria.md) for the rubric format.
+ 2. **\`<command> --help\` BEFORE first use of any command.** The CLI is the source of truth for flags -- this SKILL never inlines them.
+ 3. **NEVER auto-generate eval criteria -- collect from user.** Ask the user for each rubric field. See [concepts/eval-criteria.md](./concepts/eval-criteria.md) for the rubric format.
  4. **Explore-before-modify.** Run \`mutagent explore --json\` before any write operation. Present findings, get user confirmation. Never mutate without discovery first.
  5. **Cost transparency before \`optimize start\`.** Run \`mutagent usage --json\` and show the result to the user. Get explicit confirmation before any optimization job.
  6. **Before optimizing, run \`mutagent providers list --models\` to verify available models.** This calls \`/providers/catalog\` and shows which models are available per provider. Use the output to pick valid \`--exec-model\` and \`--eval-model\` values.
 
  ---
 
- ## Journey Router route by user intent
+ ## Prompt vs Agent -- pick the right loop
+
+ | Signal | Use | CLI surface | Skill workflow |
+ |---|---|---|---|
+ | Single LLM call -> text/JSON output | Prompt Optimization | \`mutagent prompts *\` | [workflows/optimization.md](./workflows/optimization.md) |
+ | Multi-turn / tool-calling / state graph | Agent (WIP) | \`mutagent agents *\` (CRUD only) | [workflows/agents.md](./workflows/agents.md) (stub) |
 
- Match the user's first request. Load ONLY the matching subfile. Do NOT preload the whole set.
+ When in doubt: run \`mutagent explore --json\` (it classifies discovered code under \`prompts[]\` vs \`agents[]\`).
 
- | User said / signal detected | Load subfile | Why |
+ ---
+
+ ## Journey Router -- route by user intent
+
+ > **Concept files = WHY/WHAT pre-reads. Workflow files = HOW step sequences.**
+ > Load BOTH when intent matches both axes (e.g., "create rubric" loads
+ > \`concepts/eval-criteria.md\` for the rubric design framework AND
+ > \`workflows/eval-creation.md\` for the step-by-step CLI sequence). Each topic's
+ > concept ↔ workflow pairing is shown in the Subfile Map below.
+
+ Match the user's first request. Load ONLY the matching subfile(s) per the table. Do NOT preload the whole set.
+
+ | User said / signal detected | Load subfile(s) | Why |
  |---|---|---|
  | "trace", "observe", "integrate", "add framework" | [workflows/tracing.md](./workflows/tracing.md) | Non-destructive, fastest first-value path |
- | "optimize", "improve", "tune", "evaluate", "upload prompt" | [workflows/optimization.md](./workflows/optimization.md) | Full createdatasetevaloptimize loop |
+ | "optimize", "improve", "tune", "upload prompt" | [workflows/optimization.md](./workflows/optimization.md) | Full create->dataset->eval->optimize loop (orchestrator) |
+ | "create dataset", "add examples", "test cases", "edge cases", "hard cases", "expand dataset", "dataset items" | [workflows/dataset-curation.md](./workflows/dataset-curation.md) (HOW) + [concepts/dataset-design.md](./concepts/dataset-design.md) (WHY) | Standalone dataset curation (no optimization context needed) |
+ | "create evaluation", "create rubric", "evaluate prompt", "judge", "score this prompt", "rubric design", "MVC", "Output Standards" | [workflows/eval-creation.md](./workflows/eval-creation.md) (HOW) + [concepts/eval-criteria.md](./concepts/eval-criteria.md) (WHY) | Standalone evaluation rubric creation (no optimization context needed) |
  | "explore", "scan", "find prompts", "what prompts", "discover" | [workflows/exploration.md](./workflows/exploration.md) | Read-only discovery + taxonomy |
- | \`AgentExecutor\`, \`StateGraph\`, \`createReactAgent\`, \`tool_calls\`, \`@tool\`, \`langgraph\`, \`crewai\`, \`autogen\`, \`openai/agents\`, multi-turn | [workflows/agents.md](./workflows/agents.md) | WIP path surface partnership link |
- | "how do variables work", "single vs double braces", delimiter | [concepts/prompt-variables.md](./concepts/prompt-variables.md) | Delimiter inference contract |
- | "what makes a good eval", "rubric", "evaluation criteria" | [concepts/eval-criteria.md](./concepts/eval-criteria.md) | INPUT MVC + OUTPUT Standards |
+ | \`AgentExecutor\`, \`StateGraph\`, \`createReactAgent\`, \`tool_calls\`, \`@tool\`, \`langgraph\`, \`crewai\`, \`autogen\`, \`openai/agents\`, multi-turn | [workflows/agents.md](./workflows/agents.md) | WIP path -- surface partnership link |
+ | "how do variables work", "single vs double braces", delimiter | [concepts/prompt-variables.md](./concepts/prompt-variables.md) | Delimiter inference contract (concept-only; prompt creation lives inline in optimization.md step 4) |
+ | "what makes a good eval" (concept question only, no creation intent) | [concepts/eval-criteria.md](./concepts/eval-criteria.md) | INPUT MVC + OUTPUT Standards (no workflow load) |
+ | "what makes a good dataset" (concept question only, no creation intent) | [concepts/dataset-design.md](./concepts/dataset-design.md) | Dataset curation principles + case categories (no workflow load) |
+ | "scorecard", "interpret results", "what does X score mean" | [concepts/scorecard-output.md](./concepts/scorecard-output.md) | Interpretation only (no workflow needed) |
  | "check models", "what models", "available models", "which models" | run \`mutagent providers list --models --json\` | Discovery: shows catalog per provider before model selection |
  | Unclear / first time | run \`mutagent explore --json\` first, then reroute | Discovery before action |
 
@@ -9667,59 +9898,62 @@ Match the user's first request. Load ONLY the matching subfile. Do NOT preload t
  | File | WHEN to load | WHY | ENFORCEMENT |
  |---|---|---|---|
  | [workflows/tracing.md](./workflows/tracing.md) | User wants to add framework tracing / observability | Non-destructive append-only integration sequence | Must run explore first (Rule 4) |
- | [workflows/optimization.md](./workflows/optimization.md) | User wants to optimize or evaluate a prompt | Full loop: explore upload dataset eval optimize apply | Must check usage before optimize (Rule 5); must collect rubrics from user (Rule 3) |
+ | [workflows/optimization.md](./workflows/optimization.md) | User wants to optimize or evaluate a prompt | Full loop: explore -> upload -> dataset -> eval -> optimize -> apply | Must check usage before optimize (Rule 5); must collect rubrics from user (Rule 3) |
+ | [workflows/dataset-curation.md](./workflows/dataset-curation.md) | User wants to create/expand a dataset (standalone) | Focused dataset curation without full optimization context | Hard cases first; ask per-field questions |
+ | [workflows/eval-creation.md](./workflows/eval-creation.md) | User wants to create/edit evaluation rubric (standalone) | Focused per-field rubric collection without full optimization context | INPUT MVC + OUTPUT Standards split; ask per-field questions; collect from user (Rule 3) |
  | [workflows/exploration.md](./workflows/exploration.md) | User wants to scan codebase, identify prompts vs agents | Read-only discovery; output taxonomy to user | Run only; no writes |
- | [workflows/agents.md](./workflows/agents.md) | Multi-turn / tool-calling code detected | WIP do NOT attempt optimizer, surface partnership link | Show WIP card to user verbatim |
+ | [workflows/agents.md](./workflows/agents.md) | Multi-turn / tool-calling code detected | WIP -- do NOT attempt optimizer, surface partnership link | Show WIP card to user verbatim |
  | [concepts/prompt-variables.md](./concepts/prompt-variables.md) | Any question about \`{var}\` vs \`{{var}}\`, delimiter inference | Brace convention + conversion rules | Load before \`prompts create\` in optimization workflow |
- | [concepts/eval-criteria.md](./concepts/eval-criteria.md) | Any question about rubric design, MVC, Output Standards | granular rubric format INPUT-param vs OUTPUT-param scope | Load before \`evaluation create --guided\` in optimization workflow |
+ | [concepts/eval-criteria.md](./concepts/eval-criteria.md) | Any question about rubric design, MVC, Output Standards | granular rubric format -- INPUT-param vs OUTPUT-param scope | Load before \`evaluation create --guided\` in optimization workflow |
+ | [concepts/dataset-design.md](./concepts/dataset-design.md) | Any question about dataset quality, case categories, hard cases | Dataset design principles -- parallel structure to eval-criteria.md | Load before \`dataset add --guided\` |
 
  ---
 
  ## Output handling
 
  After every CLI command:
- - **Show the command output to the user.** Command output appears in bash blocks that users may not see always present the key results in your chat response.
- - **For evaluation create \`--guided\`**: the CLI provides per-field questions for each schema field. Ask the user each question in turn. Do not skip any field. Do not pre-fill answers.
+ - **Show the command output to the user.** Command output appears in bash blocks that users may not see -- always present the key results in your chat response.
+ - **For evaluation create \`--guided\`**: the CLI provides per-field questions in \`_directive.askUserQuestions\`. Ask the user each question in turn. Do not skip any field. Do not pre-fill answers.
  - **For \`optimize results\`**: present the before/after scorecard to the user and confirm whether to apply, view diff, or reject.
 
  ---
 
- ## Anti-patterns NEVER do these
+ ## Anti-patterns -- NEVER do these
 
  - Run any command without \`--json\`
- - Auto-generate eval criteria always collect from the user
9924
+ - Auto-generate eval criteria -- always collect from the user
9691
9925
  - Skip any schema field when collecting evaluation rubrics
9692
9926
  - Skip \`mutagent explore --json\` before any write operation
9693
9927
  - Run \`optimize start\` without first showing \`usage --json\` to the user
9694
9928
  - Increase \`--max-iterations\` above 1 without explicit user consent (each iteration = LLM spend)
9695
9929
  - Run a multi-turn agent through the prompt optimizer
9696
9930
  - Skip showing command output results to the user
9697
- - Inline CLI flags from memory always read \`--help\` first
9931
+ - Inline CLI flags from memory -- always read \`--help\` first
9698
9932
 
9699
9933
  ---
9700
9934
 
9701
9935
  ## State Tracking
9702
9936
 
9703
- - \`.mutagent/mutation-context.md\` codebase index of discovered/uploaded prompts. Update after explore, create, dataset ops.
9704
- - \`mutagent auth status --json\` auth + workspace state.
9937
+ - \`.mutagent/mutation-context.md\` -- codebase index of discovered/uploaded prompts. Update after explore, create, dataset ops.
9938
+ - \`mutagent auth status --json\` -- auth + workspace state.
9705
9939
 
9706
9940
  ---
9707
9941
 
9708
9942
  ## Login (two paths)
9709
9943
 
9710
- - **CI / automated**: \`export MUTAGENT_API_KEY=mt_... && mutagent login --json\` no browser, no prompts.
9711
- - **Onboarding a user**: \`mutagent login --browser --json\` CLI prints auth URL to stdout, polls 5 min. **Surface the URL verbatim to the user.** \`--non-interactive\` is NOT needed when \`--browser\` is set.
9944
+ - **CI / automated**: \`export MUTAGENT_API_KEY=mt_... && mutagent login --json\` -- no browser, no prompts.
9945
+ - **Onboarding a user**: \`mutagent login --browser --json\` -- CLI prints auth URL to stdout, polls 5 min. **Surface the URL verbatim to the user.** \`--non-interactive\` is NOT needed when \`--browser\` is set.
9712
9946
 
9713
- \`mutagent login\` is canonical. \`mutagent auth login\` is a back-compat alias. Both delegate to \`lib/auth-flow.ts\`. Decision record: [cli-design-principles.md](../../docs/cli-design-principles.md) Login Unification.
9947
+ \`mutagent login\` is canonical. \`mutagent auth login\` is a back-compat alias. Both delegate to \`lib/auth-flow.ts\`. Decision record: [cli-design-principles.md](../../docs/cli-design-principles.md) -> Login Unification.
9714
9948
 
9715
9949
  ---
9716
9950
 
9717
- ## Error Recovery Agent-Aware Bug Reporting
9951
+ ## Error Recovery -- Agent-Aware Bug Reporting
9718
9952
 
9719
9953
  When ANY mutagent CLI command returns a non-zero exit code or an error response,
9720
9954
  follow this protocol:
9721
9955
 
9722
- 1. **Show the error to the user** (always) reproduce the exact command and output.
9956
+ 1. **Show the error to the user** (always) -- reproduce the exact command and output.
9723
9957
  2. **ASK the user** if they want to file a bug report with session context.
9724
9958
  3. **On user approval**, run:
9725
9959
  \`\`\`bash
@@ -9758,7 +9992,7 @@ top-level keys you supply:
9758
9992
  }
9759
9993
  \`\`\`
9760
9994
 
9761
- \`_auto\` is always populated by the CLI do **not** set it manually. Your
9995
+ \`_auto\` is always populated by the CLI -- do **not** set it manually. Your
9762
9996
  top-level keys are never overwritten; if you supply \`workspaceId: "ws_agent_B"\`,
9763
9997
  the CLI's current workspace A goes into \`_auto.workspaceId\`, not the top level.
9764
9998
 
@@ -9775,7 +10009,218 @@ If the feedback command returns a non-zero exit code, DO NOT retry silently. Sho
9775
10009
 
9776
10010
  ## Extensibility
9777
10011
 
9778
- Add \`workflows/custom-<name>.md\` with frontmatter \`triggers: ["phrase"]\` auto-discovered by the decision tree fallback row. No rebuild needed.
10012
+ Add \`workflows/custom-<name>.md\` with frontmatter \`triggers: ["phrase"]\` -- auto-discovered by the decision tree fallback row. No rebuild needed.
10013
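+
+ A minimal sketch of such a file's frontmatter (hypothetical name and trigger phrases), mirroring the bundled workflow files:
+
+ \`\`\`
+ ---
+ name: mutagent-cli-workflows-custom-release-notes
+ triggers: ["release notes", "changelog"]
+ ---
+ \`\`\`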
+ `,
10014
+ "concepts/dataset-design.md": `---
10015
+ name: mutagent-cli-concepts-dataset-design
10016
+ description: |
10017
+ Canonical source for MutagenT dataset design principles.
10018
+ The Golden Rule: hard cases > easy cases; edge cases are mandatory.
10019
+ Covers case categories (Edge / Hard / Representative / Adversarial),
10020
+ format requirements (input/expectedOutput shape), and anti-patterns.
10021
+ Parallel structure to concepts/eval-criteria.md for cognitive parity.
10022
+ Mirrored in the CLI directive's bootstrappable instruction field.
10023
+ triggers:
10024
+ - "dataset design"
10025
+ - "dataset quality"
10026
+ - "what makes a good dataset"
10027
+ - "hard cases"
10028
+ - "edge cases"
10029
+ - "test cases"
10030
+ - "expectedOutput"
10031
+ - "dataset items"
10032
+ - "guided dataset"
10033
+ ---
10034
+
10035
+ # Concept -- Dataset Design
10036
+
10037
+ > **Parallel to** [concepts/eval-criteria.md](./eval-criteria.md) -- same section
10038
+ > structure so agents can navigate both consistently.
10039
+ >
10040
+ > **Canonical source** for dataset curation principles. Mirrored inline in
10041
+ > \`mutagent-cli/src/commands/prompts/guided-dataset.ts\` (directive instruction field)
10042
+ > so even agents without the Skill loaded can execute correctly.
10043
+
10044
+ ## The Golden Rule
10045
+
10046
+ **Hard cases that expose prompt weaknesses FIRST. Easy cases that always pass LAST.**
10047
+
10048
+ A dataset where every item produces correct output tells you nothing about where the
10049
+ prompt fails. The optimizer needs failure signal to improve. Prioritize inputs that:
10050
+
10051
+ 1. Are ambiguous (multiple valid interpretations)
10052
+ 2. Are adversarial (designed to trigger a known failure mode)
10053
+ 3. Are at the boundary of what the prompt should handle
10054
+ 4. Are drawn from actual production failure cases
10055
+
10056
+ **One edge case that causes a failure is worth 10 easy cases that succeed.**
10057
+
10058
+ ---
10059
+
10060
+ ## NEVER skip expectedOutput on labelable items
10061
+
10062
+ This is the dataset equivalent of Rule 3 (never auto-generate eval criteria).
10063
+
10064
+ - If you know what the correct output should be for a given input, you MUST include \`expectedOutput\`.
10065
+ - The optimizer uses \`expectedOutput\` as the ground-truth signal for G-Eval scoring.
10066
+ - Omitting \`expectedOutput\` on a labelable item forces the evaluator to use LLM judgment alone -- much noisier.
10067
+ - **Only omit \`expectedOutput\`** when correct output is genuinely subjective / context-dependent AND no rubric can distinguish good from bad.
10068
+
10069
+ Ask the user for expected outputs field by field -- do NOT auto-generate them.
10070
+
10071
+ ---
10072
+
10073
+ ## NEVER auto-generate dataset items
10074
+
10075
+ This is the counterpart to Rule 3 (never auto-generate eval criteria). Reasons:
10076
+
10077
+ - Auto-generated items tend to be representative cases (easy) rather than hard cases.
10078
+ - The user knows what production inputs look like and where the prompt fails; the agent does not.
10079
+ - Synthetic easy cases produce noisy optimization signal -- the optimizer improves scores on the easy
10080
+ cases while the real prompt weaknesses stay hidden.
10081
+ - Collect items from the user via AskUserQuestion, one category at a time.
10082
+
10083
+ ---
10084
+
10085
+ ## Case Categories
10086
+
10087
+ Collect in this priority order -- hardest categories first:
10088
+
10089
+ ### 1. Edge Cases (HIGH priority)
10090
+
10091
+ Boundary inputs that test the limits of what the prompt should handle.
10092
+
10093
+ | What to ask | Examples |
10094
+ |---|---|
10095
+ | Empty or null inputs | \`""\`, \`null\`, \`0\`, \`[]\` |
10096
+ | Very long inputs (token limits) | paragraph-length where field should be short |
10097
+ | Malformed inputs | wrong type, wrong format, garbled text |
10098
+ | Unicode / special characters | emoji, RTL text, control chars, escaped quotes |
10099
+ | Missing required sub-fields | object with some required fields absent |
10100
+
10101
+ **Collect at minimum**: 1-2 edge cases per input field.
10102
+
10103
+ ### 2. Hard Cases (HIGH priority)
10104
+
10105
+ Inputs that are valid but expose known prompt weaknesses or require nuanced reasoning.
10106
+
10107
+ | What to ask | Examples |
10108
+ |---|---|
10109
+ | Ambiguous inputs | "What does this mean?" (multiple valid answers) |
10110
+ | Adversarial inputs | phrasing designed to trigger hallucination or refusal |
10111
+ | Domain traps | technical jargon with multiple meanings in context |
10112
+ | Instruction conflicts | input that triggers contradictory rules in the prompt |
10113
+ | Near-miss inputs | almost correct format but slightly off |
10114
+
10115
+ **Collect at minimum**: 2-3 hard cases total.
10116
+
10117
+ ### 3. Representative Cases (MEDIUM priority)
10118
+
10119
+ Typical production inputs -- what the prompt handles 80% of the time.
10120
+
10121
+ | What to ask | Examples |
10122
+ |---|---|
10123
+ | Common use cases | most frequent user inputs |
10124
+ | Standard formats | well-formed, expected-length, standard vocabulary |
10125
+ | Baseline quality | inputs where the prompt should succeed reliably |
10126
+
10127
+ **Collect after** hard and edge cases are covered.
10128
+
10129
+ ### 4. Adversarial Cases (LOW priority, if relevant)
10130
+
10131
+ Inputs designed to test security / safety / guardrails.
10132
+
10133
+ | What to ask | Examples |
10134
+ |---|---|
10135
+ | Prompt injection attempts | "Ignore previous instructions and..." |
10136
+ | Off-topic requests | completely unrelated to the prompt's domain |
10137
+ | Jailbreak patterns | attempts to bypass constraints |
10138
+
10139
+ Only collect if the prompt has explicit safety constraints.
10140
+
10141
+ ---
10142
+
10143
+ ## Format Requirements
10144
+
10145
+ Every dataset item MUST have:
10146
+
10147
+ \`\`\`json
10148
+ {
10149
+ "input": {
10150
+ "<inputSchema_field_1>": "<value>",
10151
+ "<inputSchema_field_2>": "<value>"
10152
+ },
10153
+ "expectedOutput": {
10154
+ "<outputSchema_field_1>": "<expected_value>",
10155
+ "<outputSchema_field_2>": "<expected_value>"
10156
+ }
10157
+ }
10158
+ \`\`\`
10159
+
10160
+ Rules:
10161
+ - \`input\` keys MUST match the prompt's \`inputSchema.properties\` exactly (no extras, no missing required fields).
10162
+ - \`expectedOutput\` keys MUST match the prompt's \`outputSchema.properties\`.
10163
+ - String values in \`expectedOutput\` should be the verbatim correct answer (not a description of it).
10164
+ - Numeric scores in \`expectedOutput\` should match what the evaluator would award for a perfect response.
10165
+ - Upload as a JSON array: \`[{item1}, {item2}, ...]\`
10166
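+
+ A minimal sketch of a compliant payload, assuming a hypothetical prompt \`pr_123\` whose \`inputSchema\` has one field (\`ticket\`) and whose \`outputSchema\` has one field (\`category\`). Two items shown for brevity -- a real upload needs the 5-item minimum below:
+
+ \`\`\`bash
+ # hypothetical upload -- keys mirror the schemas exactly; expectedOutput holds verbatim answers
+ mutagent prompts dataset add pr_123 -d '[
+   {"input": {"ticket": ""},                     "expectedOutput": {"category": "invalid"}},
+   {"input": {"ticket": "App crashes on login"}, "expectedOutput": {"category": "bug"}}
+ ]' --name "triage-cases" --json
+ \`\`\`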
+
10167
+ ### Minimum dataset size
10168
+
10169
+ - **5 items minimum** for any optimization run.
10170
+ - **At least 2 items** must be hard or edge cases.
10171
+ - More items = better signal, especially for per-criterion scoring.
10172
+ - \`mutagent prompts dataset add --help\` has the upload command flags.
10173
+
10174
+ ---
10175
+
10176
+ ## Anti-patterns
10177
+
10178
+ | Anti-pattern | Why it's bad | Fix |
10179
+ |---|---|---|
10180
+ | All easy cases (prompt always succeeds) | No failure signal for optimizer | Add hard/edge cases first |
10181
+ | No edge cases | Optimizer never sees boundary behavior | Ask user about failure modes |
10182
+ | Fictional inputs that won't happen in production | Optimization targets unrealistic scenarios | Anchor to real usage patterns |
10183
+ | Missing \`expectedOutput\` on labelable items | Optimizer uses LLM judgment alone (noisy) | Ask user for expected outputs |
10184
+ | Duplicate items | Wastes dataset budget, skews scores | Check for duplicates before upload |
10185
+ | Items that are identical to training data | May overfit | Include diverse failure modes |
10186
+
10187
+ ---
10188
+
10189
+ ## Cross-references
10190
+
10191
+ - [SKILL.md](../SKILL.md) -- 5 rules + journey router
10192
+ - [workflows/dataset-curation.md](../workflows/dataset-curation.md) -- standalone dataset curation workflow (HOW; this file is WHY)
10193
+ - [workflows/optimization.md](../workflows/optimization.md) -- full loop that includes dataset add step
10194
+ - [concepts/eval-criteria.md](./eval-criteria.md) -- parallel concept doc for evaluation criteria
10195
+
10196
+ ---
10197
+
10198
+ ## CLI commands
10199
+
10200
+ \`\`\`bash
10201
+ # Discovery (no LLM cost)
10202
+ mutagent prompts dataset --help # list dataset subcommands
10203
+ mutagent prompts dataset add --help # read flags before first use (Rule 2)
10204
+ mutagent prompts dataset list <prompt-id> --json # list datasets attached to a prompt
10205
+ mutagent prompts dataset get <dataset-id> --json # inspect single dataset's items + metadata
10206
+
10207
+ # Creation -- guided (no LLM cost; just storage)
10208
+ mutagent prompts dataset add <prompt-id> --guided --json # get _directive.askUserQuestions (per-field collection)
10209
+ mutagent prompts dataset add <prompt-id> -d '<json>' --name "<name>" --json # upload items
10210
+ # -d accepts inline JSON OR @path/to/file.json OR - (stdin)
10211
+
10212
+ # Mutations
10213
+ mutagent prompts dataset update <dataset-id> -d '<json>' --json # replace items in existing dataset
10214
+ mutagent prompts dataset delete <dataset-id> --json # delete dataset (idempotent; --force skips confirm)
10215
+ \`\`\`
10216
+
10217
+ **Flag glossary** (dataset-specific):
10218
+ - \`--guided\` -- emit per-field \`askUserQuestions\` directive instead of expecting \`-d\` upfront. Use when collecting from user.
10219
+ - \`-d <json>\` / \`--data <json>\` -- supply items payload inline. Accepts: inline JSON, \`@path\` (read from file), \`-\` (read from stdin).
10220
+ - \`--name "<name>"\` -- human-readable label for the dataset (shows in dashboard).
10221
+ - \`--json\` -- structured output (Rule 1: always use). Returns \`_directive\` + \`_links\`.
10222
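+
+ Illustrative invocations of the three \`-d\` input forms (\`pr_123\`, \`items.json\`, and the toy payload are hypothetical):
+
+ \`\`\`bash
+ mutagent prompts dataset add pr_123 -d '[{"input":{"q":"hi"},"expectedOutput":{"a":"hello"}}]' --name "inline" --json
+ mutagent prompts dataset add pr_123 -d @items.json --name "from-file" --json
+ cat items.json | mutagent prompts dataset add pr_123 -d - --name "from-stdin" --json
+ \`\`\`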
+
10223
+ **Cost note**: dataset creation/edit/delete commands incur ZERO LLM cost. They are pure storage operations against the platform API. LLM cost is only incurred when \`mutagent prompts optimize start\` runs the exec model against these dataset items.
9779
10224
  `,
9780
10225
  "concepts/eval-criteria.md": `---
9781
10226
  name: mutagent-cli-concepts-eval-criteria
@@ -10079,6 +10524,37 @@ the output to collect rubrics in the correct order.
10079
10524
  - [workflows/optimization.md](../workflows/optimization.md) → steps 7-9 (where this concept is applied)
10080
10525
  - [concepts/prompt-variables.md](./prompt-variables.md) → delimiter inference (used in MVC step)
10081
10526
  - \`mutagent/src/modules/prompts/prompt-evaluations/README.md\` → BE mirror
10527
+
10528
+ ---
10529
+
10530
+ ## CLI commands
10531
+
10532
+ \`\`\`bash
10533
+ # Discovery (no LLM cost)
10534
+ mutagent prompts evaluation --help # list eval subcommands
10535
+ mutagent prompts evaluation create --help # read flags before first use (Rule 2)
10536
+ mutagent prompts evaluation list <prompt-id> --json # list existing evaluations on a prompt
10537
+ mutagent prompts evaluation get <eval-id> --json # inspect single evaluation's criteria + metadata
10538
+
10539
+ # Creation -- guided (no LLM cost; just storage)
10540
+ mutagent prompts evaluation create <prompt-id> --guided --json # get _directive.askUserQuestions + decisionTree (per-field collection)
10541
+ mutagent prompts evaluation create <prompt-id> -d '<json>' --name "<name>" --json # upload criteria
10542
+ # -d accepts inline JSON OR @path/to/file.json OR - (stdin)
10543
+
10544
+ # Mutations
10545
+ mutagent prompts evaluation update <eval-id> -d '<json>' --json # update existing criteria
10546
+ mutagent prompts evaluation delete <eval-id> --json # delete evaluation (idempotent; --force skips confirm)
10547
+ \`\`\`
10548
+
10549
+ **Flag glossary** (eval-specific):
10550
+ - \`--guided\` -- emit per-field \`askUserQuestions\` directive instead of expecting \`-d\` upfront.
10551
+ - \`-d <json>\` / \`--data <json>\` -- supply criteria payload inline. Accepts: inline JSON, \`@path\` (file), \`-\` (stdin).
10552
+ - \`--name "<name>"\` -- human-readable label (shows in dashboard).
10553
+ - \`--json\` -- structured output (Rule 1: always use). Returns \`_directive\` + \`_links\` + \`_compat\`.
10554
+
10555
+ **Cost note**: eval creation/edit/delete commands incur ZERO LLM cost. Pure storage operations. LLM cost is incurred only when \`mutagent prompts optimize start\` runs the judge model against this evaluation.
10556
+
10557
+ **Workflow cross-link**: for the standalone HOW (step-by-step CLI sequence), see [workflows/eval-creation.md](../workflows/eval-creation.md).
10082
10558
  `,
10083
10559
  "concepts/prompt-variables.md": `---
10084
10560
  name: mutagent-cli-concepts-prompt-variables
@@ -10235,6 +10711,36 @@ Use the delimiter field to:
10235
10711
  - [concepts/eval-criteria.md](./eval-criteria.md) → MVC (Minimum Viable Context) — uses delimiter to enumerate input params
10236
10712
  - Source: \`mutagent-cli/src/lib/explorer.ts\` → \`inferPromptVariables()\` and \`DiscoveredPrompt.delimiter\`
10237
10713
  - Tests: \`mutagent-cli/src/__tests__/lib/explorer.test.ts\`
10714
+
10715
+ ---
10716
+
10717
+ ## CLI commands
10718
+
10719
+ \`\`\`bash
10720
+ # Discovery (no LLM cost, read-only)
10721
+ mutagent explore --help # read flags before first use (Rule 2)
10722
+ mutagent explore --json # scans codebase; emits "delimiter" field per prompt
10723
+ mutagent prompts get <id> --json # inspect uploaded prompt's stored form (incl. delimiter)
10724
+
10725
+ # Creation (no LLM cost, just storage)
10726
+ mutagent prompts create --help # read brace format rules before creating
10727
+ mutagent prompts create --name "<name>" --raw "<prompt>" --json # upload prompt (use single-brace {var} convention)
10728
+ mutagent prompts create --name "<name>" --raw-file <path> --json # upload from file (preferred for multi-line prompts)
10729
+
10730
+ # Mutations
10731
+ mutagent prompts update <id> --raw "<prompt>" --json # replace stored prompt body
10732
+ mutagent prompts delete <id> --json # delete prompt (idempotent; --force skips confirm)
10733
+ \`\`\`
10734
+
10735
+ **Flag glossary** (prompt-create-specific):
10736
+ - \`--raw "<text>"\` -- inline prompt body. Use for short single-line prompts.
10737
+ - \`--raw-file <path>\` -- read prompt body from file. Preferred for multi-line / templated prompts; preserves whitespace.
10738
+ - \`--name "<name>"\` -- human-readable label (shows in dashboard + explore output).
10739
+ - \`--json\` -- structured output (Rule 1: always use). Returns \`_directive\` (status_card) + \`_links\` + \`_compat\`.
10740
+
10741
+ **Cost note**: prompt creation/edit/delete commands incur ZERO LLM cost. Pure storage. Only \`mutagent prompts optimize start\` and \`mutagent prompts playground\` (interactive testing) incur LLM cost.
10742
+
10743
+ **Brace convention reminder**: use single-brace \`{var}\` for variables you'll later supply via dataset items. Use double-brace \`{{literal}}\` only when you need a literal \`{var}\` substring rendered (rare). The CLI's \`mutagent explore\` infers delimiter automatically -- see § "Delimiter inference" above for the rules.
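+
+ A quick sketch of the convention (hypothetical prompt body):
+
+ \`\`\`bash
+ # {user_message} -> variable, filled later from dataset items
+ # {{done}}       -> renders as the literal text {done}
+ mutagent prompts create --name "greeting" --raw "Reply to {user_message}, then emit the literal marker {{done}}." --json
+ \`\`\`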
10238
10744
  `,
10239
10745
  "concepts/scorecard-output.md": `---
10240
10746
  name: mutagent-cli-concepts-scorecard-output
@@ -10456,6 +10962,47 @@ rl.on('line', (line) => {
10456
10962
  defined; \`criteria[].name\` in \`ScorecardData\` maps to \`name\` in the rubric.
10457
10963
  - [workflows/optimization.md](../workflows/optimization.md) — full optimization
10458
10964
  loop; the scorecard is produced at Step 8 (watch) and Step 9 (results).
10965
+
10966
+ ---
10967
+
10968
+ ## CLI commands
10969
+
10970
+ \`\`\`bash
10971
+ # Discovery (no LLM cost)
10972
+ mutagent prompts optimize --help # list optimize subcommands
10973
+ mutagent prompts optimize start --help # read flags before first use (Rule 2)
10974
+
10975
+ # \uD83D\uDCB0 LLM COST -- requires usage check (Rule 5) + provider catalog check (Rule 6)
10976
+ mutagent prompts optimize start <id> --dataset <d> --evaluation <e> --json
10977
+ # start job (cost = exec_model × items × iterations
10978
+ # + judge_model × items × iterations)
10979
+ mutagent prompts optimize start <id> --dataset <d> --evaluation <e> --watch --json
10980
+ # start + stream NDJSON events to stdout
10981
+
10982
+ # Polling / watching (no LLM cost; just reads job state)
10983
+ mutagent prompts optimize status <job-id> --json # poll progress snapshot (includes bestScore)
10984
+ mutagent prompts optimize watch <job-id> --json # attach to running job (NDJSON stream)
10985
+ mutagent prompts optimize results <job-id> --json # full scorecard after completion (emits verbatim card)
10986
+ mutagent prompts optimize results <job-id> --diff --json # view prompt diff (no apply)
10987
+
10988
+ # Mutation (no LLM cost itself; modifies stored prompt)
10989
+ mutagent prompts optimize results <job-id> --apply --json # apply optimized prompt -> updates stored version
10990
+ # (irreversible without manual revert via prompts update)
10991
+ \`\`\`
10992
+
10993
+ **Flag glossary** (optimize-specific):
10994
+ - \`--dataset <d>\` -- dataset ID (from \`prompts dataset list\`). Items run through both exec and judge models.
10995
+ - \`--evaluation <e>\` -- evaluation ID (from \`prompts evaluation list\`). Drives the judge model's scoring rubric.
10996
+ - \`--watch\` -- after start, stream NDJSON events instead of returning immediately. Equivalent to \`start\` then \`watch\`.
10997
+ - \`--max-iterations N\` -- bound the optimizer loop. **Defaults to 1**; never raise without explicit user consent (each iteration = full eval × dataset round-trip).
10998
+ - \`--exec-model <model>\` / \`--eval-model <model>\` -- override defaults. Validate first via \`mutagent providers list --models --json\` (Rule 6).
10999
+ - \`--apply\` -- write optimized prompt back to stored version. Cannot be undone via flag; use \`prompts update\` to revert.
11000
+ - \`--diff\` -- view before/after diff without applying.
11001
+ - \`--json\` -- structured output (Rule 1: always use). Returns \`_directive\` + \`_links\` + \`_compat\`.
11002
+
11003
+ **Cost note**: \`optimize start\` is the ONLY cost-incurring command in this family. Always run \`mutagent usage --json\` first (Rule 5) to surface remaining quota; show the result to the user; require explicit confirmation. The \`--max-iterations\` default of 1 keeps cost bounded.
11004
+
11005
+ **Verbatim card protocol**: \`optimize start\`, \`optimize status\`, and \`optimize results\` all emit \`_directive.renderedCard\` -- the agent MUST echo the rendered card verbatim into chat before any next action (per SKILL.md § "MANDATORY: Verbatim Card Display Protocol"). The card also auto-echoes to stderr via \`echoDirectiveToStderr\`, but bash blocks may be collapsed in the user's UI.
10459
11006
  `,
10460
11007
  "workflows/agents.md": `---
10461
11008
  name: mutagent-cli-workflows-agents
@@ -10579,6 +11126,35 @@ Do NOT try to extract the whole agent loop at once.
10579
11126
 
10580
11127
  ---
10581
11128
 
11129
+ ## CLI commands
11130
+
11131
+ \`\`\`bash
11132
+ # Discovery (no LLM cost, read-only)
11133
+ mutagent explore --json # detect agents[] in codebase via taxonomy classifier
11134
+ mutagent agents --help # list available agent subcommands (CRUD + WIP banner)
11135
+ mutagent agents list --json # CRUD: list registered agents
11136
+ mutagent agents get <id> --json # CRUD: inspect single agent (config + metadata)
11137
+
11138
+ # Mutations (no LLM cost; just storage)
11139
+ mutagent agents create --name "<name>" --json # register a new agent
11140
+ mutagent agents update <id> --json # update agent config
11141
+ mutagent agents delete <id> --json # delete agent (idempotent; --force skips confirm)
11142
+
11143
+ # NOT YET AVAILABLE -- shows AGENTS_WIP_BANNER if attempted
11144
+ mutagent agents optimize <id> # WIP -- tracked separately; see partnership link below
11145
+ \`\`\`
11146
+
11147
+ **Flag glossary** (agent-specific):
11148
+ - \`--name "<name>"\` -- human-readable label (shows in dashboard).
11149
+ - \`--force\` -- skip interactive confirmation on delete (auto-skipped in \`--json\` mode).
11150
+ - \`--json\` -- structured output (Rule 1: always use). Returns \`_directive\` (status_card) + \`_links\` + \`_compat\`.
11151
+
11152
+ **Cost note**: all current \`mutagent agents *\` commands are CRUD (zero LLM cost). Agent optimization (\`mutagent agents optimize\`) is NOT yet available -- when shipped it will incur LLM cost similar to \`prompts optimize start\`. Current behavior on \`mutagent agents optimize\`: returns \`AGENTS_WIP_BANNER\` directive pointing to the partnership link.
11153
+
11154
+ **Partnership link**: <https://www.mutagent.io/agents-partnership> -- for early access to multi-turn / tool-calling agent optimization.
11155
+
11156
+ ---
11157
+
10582
11158
  ## Cross-references
10583
11159
 
10584
11160
  - [SKILL.md](../SKILL.md) → 5 rules + journey router
@@ -10586,6 +11162,408 @@ Do NOT try to extract the whole agent loop at once.
10586
11162
  - [workflows/optimization.md](./optimization.md) → branch (c) destination
10587
11163
  - [concepts/prompt-variables.md](../concepts/prompt-variables.md) → \`{foo}\` vs \`{{foo}}\` for sub-prompt extraction
10588
11164
  - Partnership link: https://www.mutagent.io/agents-partnership
11165
+ `,
11166
+ "workflows/dataset-curation.md": `---
11167
+ name: mutagent-cli-workflows-dataset-curation
11168
+ description: |
11169
+ Standalone dataset curation workflow. Use when the user wants to create
11170
+ or expand a dataset WITHOUT running the full optimization loop.
11171
+ Covers guided and manual dataset creation, per-field question collection,
11172
+ hard-cases-first priority, and upload via CLI.
11173
+ Cross-linked from workflows/optimization.md dataset step.
11174
+ triggers:
11175
+ - "create dataset"
11176
+ - "add examples"
11177
+ - "test cases"
11178
+ - "edge cases"
11179
+ - "hard cases"
11180
+ - "expand dataset"
11181
+ - "dataset items"
11182
+ - "curate dataset"
11183
+ - "build dataset"
11184
+ - "dataset curation"
11185
+ ---
11186
+
11187
+ # Workflow -- Dataset Curation (Standalone)
11188
+
11189
+ > **When to use this workflow vs optimization.md**:
11190
+ >
11191
+ > Use THIS workflow when the user wants ONLY to create or expand a dataset,
11192
+ > without immediately running optimization. Common signals:
11193
+ > - "I want to add more test cases"
11194
+ > - "Let's build a dataset for this prompt"
11195
+ > - "Add some edge cases"
11196
+ > - "Expand the existing dataset"
11197
+ >
11198
+ > Use [workflows/optimization.md](./optimization.md) when the user wants the
11199
+ > full loop: create prompt -> dataset -> eval -> optimize. That workflow has
11200
+ > an inline dataset step that cross-links back here.
11201
+
11202
+ Read the **5 rules** in [SKILL.md](../SKILL.md) before executing.
11203
+
11204
+ ---
11205
+
11206
+ ## When this workflow applies
11207
+
11208
+ - User explicitly wants to curate/build a dataset (no immediate optimization intent)
11209
+ - User wants to add hard cases / edge cases to an existing dataset
11210
+ - User wants to understand what good dataset items look like before committing to optimization
11211
+ - User has a prompt uploaded already and wants to build test coverage
11212
+
11213
+ ---
11214
+
11215
+ ## Required pre-read
11216
+
11217
+ Load [concepts/dataset-design.md](../concepts/dataset-design.md) before collecting items.
11218
+ It defines:
11219
+ - The Golden Rule (hard cases first)
11220
+ - 4 case categories (Edge / Hard / Representative / Adversarial)
11221
+ - Format requirements (\`input\` + \`expectedOutput\` shape)
11222
+ - Anti-patterns to avoid
11223
+
11224
+ The CLI's \`--guided\` directive also contains a self-sufficient inline version of these
11225
+ rules in \`_directive.instruction\` -- safe to execute even without the Skill loaded.
11226
+
11227
+ ---
11228
+
11229
+ ## Workflow steps
11230
+
11231
+ \`\`\`
11232
+ 1. mutagent explore --json
11233
+ -> confirm which prompt you're building a dataset for
11234
+ -> show command output to user
11235
+ -> ask: "Which prompt would you like to build a dataset for?"
11236
+
11237
+ 2. mutagent prompts get <prompt-id> --json
11238
+ -> inspect inputSchema + outputSchema fields
11239
+ -> understand what input and output shapes look like
11240
+
11241
+ 3. mutagent prompts dataset add --help
11242
+ -> read flags (Rule 2: always --help before first use)
11243
+
11244
+ 4. mutagent prompts dataset add <prompt-id> --guided --json
11245
+ -> CLI returns _directive.askUserQuestions with per-field questions
11246
+ -> follow the instruction in _directive.instruction
11247
+
11248
+ 5. For EACH question in _directive.askUserQuestions:
11249
+ -> use AskUserQuestion to collect the answer from the user
11250
+ -> prioritize hard/edge case questions first (they come first in the list)
11251
+ -> do NOT skip any question
11252
+ -> do NOT auto-fill answers
11253
+
11254
+ 6. Construct dataset items from collected answers:
11255
+ -> format: [{"input": {...}, "expectedOutput": {...}}, ...]
11256
+ -> minimum 5 items; at least 2 must be hard/edge cases
11257
+ -> verify all input keys match promptSchema.inputSchema.properties
11258
+ -> verify all expectedOutput keys match promptSchema.outputSchema.properties
11259
+
11260
+ 7. Ask user to review the constructed items before upload:
11261
+ "Here are the 7 dataset items I constructed. Review them before upload?"
11262
+ -> show items in a readable format
11263
+ -> accept corrections
11264
+
11265
+ 8. mutagent prompts dataset add <prompt-id> -d '[...]' --name "<name>" --json
11266
+ -> upload the reviewed items
11267
+ -> show command output to user (confirm datasetId)
11268
+ -> record datasetId in .mutagent/mutation-context.md
11269
+
11270
+ 9. Ask: "What would you like to do next?"
11271
+ -> Option A: Add more items (loop back to step 4)
11272
+ -> Option B: Create an evaluation -> route to evaluation create --guided
11273
+ -> Option C: Start optimization -> route to workflows/optimization.md step 10
11274
+ -> Option D: Done
11275
+ \`\`\`
11276
+
11277
+ ---
11278
+
11279
+ ## Guided mode output shape
11280
+
11281
+ \`mutagent prompts dataset add <prompt-id> --guided --json\` returns:
11282
+
11283
+ \`\`\`json
11284
+ {
11285
+ "promptId": "...",
11286
+ "promptName": "...",
11287
+ "schemaFields": { "input": ["field1", "field2"], "output": ["result"] },
11288
+ "suggestedCategories": [
11289
+ { "name": "Edge Cases", "description": "...", "priority": "high" },
11290
+ { "name": "Hard Cases", "description": "...", "priority": "high" },
11291
+ { "name": "Representative Cases", "description": "...", "priority": "medium" }
11292
+ ],
11293
+ "templateItem": {
11294
+ "input": { "field1": "<value>", "field2": "<value>" },
11295
+ "expectedOutput": { "result": "<expected>" }
11296
+ },
11297
+ "guidance": {
11298
+ "minItems": 5,
11299
+ "priorityRule": "Hard cases that expose prompt weaknesses > easy cases that always pass",
11300
+ "steps": [...]
11301
+ },
11302
+ "_directive": {
11303
+ "instruction": "...", // self-sufficient conceptual rules (bootstrappable without Skill)
11304
+ "next": ["mutagent prompts dataset add <id> -d '<json>' --name '<name>' --json"],
11305
+ "askUserQuestions": [ // inside _directive (not a sibling)
11306
+ { "field": "_general", "question": "What are the hardest inputs for this prompt?" },
11307
+ { "field": "_edge_cases", "question": "What edge cases have caused failures?" },
11308
+ { "field": "field1", "source": "inputSchema", "question": "What values should "field1" have?" },
11309
+ ...
11310
+ ]
11311
+ },
11312
+ "_compat": { "cliVersion": "...", "skillVersion": "...", "skillMinCliVersion": "..." }
11313
+ }
11314
+ \`\`\`
11315
+
11316
+ Key: \`askUserQuestions\` is inside \`_directive\` (not a top-level sibling). Parse \`_directive.askUserQuestions\`.
11317
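+
+ One way to pull the questions out in a shell pipeline (sketch -- assumes \`jq\` is installed and \`pr_123\` is your prompt id):
+
+ \`\`\`bash
+ mutagent prompts dataset add pr_123 --guided --json | jq '._directive.askUserQuestions[] | .question'
+ \`\`\`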
+
11318
+ ---
11319
+
11320
+ ## Cost control
11321
+
11322
+ Dataset curation has NO LLM cost on its own -- it's a pure storage operation.
11323
+ Only \`mutagent prompts optimize start\` incurs LLM cost. Safe to run freely.
11324
+
11325
+ ---
11326
+
11327
+ ## Common pitfalls
11328
+
11329
+ For the canonical anti-pattern list (WHY each is bad + how to fix), see [concepts/dataset-design.md](../concepts/dataset-design.md) § Anti-patterns. Workflow-specific execution mistakes:
11330
+
11331
+ - **Uploading items with wrong field names** -> schema mismatch error from optimizer (not caught by concept-level rules)
11332
+ - **Forgetting to ask the user to review before upload** -> user can't correct mistakes (workflow step 7)
11333
+ - **Skipping the explore step** -> uploading to the wrong prompt (workflow step 1)
11334
+
11335
+ ---
11336
+
11337
+ ## Cross-references
11338
+
11339
+ - [SKILL.md](../SKILL.md) -- 5 rules + journey router
11340
+ - [concepts/dataset-design.md](../concepts/dataset-design.md) -- Golden Rule, case categories, format requirements, anti-patterns (WHY; this file is HOW)
11341
+ - [workflows/optimization.md](./optimization.md) -- full loop; dataset step cross-links here
11342
+ - [workflows/eval-creation.md](./eval-creation.md) -- parallel workflow doc (for evaluation side)
11343
+ - [concepts/prompt-variables.md](../concepts/prompt-variables.md) -- brace convention (for input field values)
11344
+
11345
+ ---
11346
+
11347
+ ## CLI commands
11348
+
11349
+ \`\`\`bash
11350
+ # Workflow execution sequence (commands appear inline in steps above; this is a quick reference)
11351
+ mutagent explore --json # step 1: discover prompts
11352
+ mutagent prompts get <prompt-id> --json # step 2: inspect schemas
11353
+ mutagent prompts dataset add --help # step 3: read flags (Rule 2)
11354
+ mutagent prompts dataset add <prompt-id> --guided --json # step 4: get _directive.askUserQuestions
11355
+ mutagent prompts dataset add <prompt-id> -d '<json>' --name "<name>" --json # step 8: upload reviewed items
11356
+ \`\`\`
11357
+
11358
+ For the full flag glossary + cost notes, see [concepts/dataset-design.md](../concepts/dataset-design.md) § CLI commands.
11359
+ `,
11360
+ "workflows/eval-creation.md": `---
11361
+ name: mutagent-cli-workflows-eval-creation
11362
+ description: |
11363
+ Standalone evaluation rubric creation workflow. Use when the user wants to
11364
+ define eval criteria for a prompt WITHOUT immediately running optimization.
11365
+ Covers guided per-field rubric collection (INPUT MVC + OUTPUT Standards),
11366
+ full-depth granular rubrics, and upload via CLI.
11367
+ Cross-linked from workflows/optimization.md eval step.
11368
+ triggers:
11369
+ - "create evaluation"
11370
+ - "create rubric"
11371
+ - "evaluate prompt"
11372
+ - "evaluation criteria"
11373
+ - "rubric design"
11374
+ - "MVC"
11375
+ - "Output Standards"
11376
+ - "score this prompt"
11377
+ - "judge this prompt"
11378
+ - "eval guided"
11379
+ ---
11380
+
11381
+ # Workflow -- Evaluation Creation (Standalone)
11382
+
11383
+ > **When to use this workflow vs optimization.md**:
11384
+ >
11385
+ > Use THIS workflow when the user wants ONLY to define an evaluation rubric for
11386
+ > a prompt, without immediately running optimization. Common signals:
11387
+ > - "I want to score this prompt"
11388
+ > - "Let's define eval criteria for this prompt"
11389
+ > - "Create a rubric for this prompt"
11390
+ > - "How should we judge this prompt's outputs?"
11391
+ >
11392
+ > Use [workflows/optimization.md](./optimization.md) when the user wants the
11393
+ > full loop: create prompt -> dataset -> eval -> optimize. That workflow has
11394
+ > an inline eval step that cross-links back here.
11395
+
11396
+ Read the **5 rules** in [SKILL.md](../SKILL.md) before executing.
11397
+
11398
+ ---
11399
+
11400
+ ## When this workflow applies
11401
+
11402
+ - User explicitly wants to create or edit an evaluation rubric (no immediate optimization intent)
11403
+ - User wants to add criteria to an existing prompt (with or without an existing dataset)
11404
+ - User wants to understand how rubric design works before committing to optimization
11405
+ - User has a prompt uploaded already and wants quality scoring before iteration
11406
+
11407
+ ---
11408
+
11409
+ ## Required pre-read
11410
+
11411
+ Load [concepts/eval-criteria.md](../concepts/eval-criteria.md) before collecting criteria.
11412
+ It defines:
11413
+ - The Golden Rule (INPUT MVC vs OUTPUT Standards split)
11414
+ - 6-tier MVC anchor framework for INPUT-scoped criteria
11415
+ - Output Standards format for OUTPUT-scoped criteria
11416
+ - Format requirements (\`name\`, \`description\`, \`evaluationParameter\`)
11417
+ - Anti-patterns to avoid
11418
+
11419
+ The CLI's \`--guided\` directive also contains a self-sufficient inline version of these
11420
+ rules in \`_directive.instruction\` -- safe to execute even without the Skill loaded.
11421
+
11422
+ ---
11423
+
11424
+ ## Workflow steps
11425
+
11426
+ \`\`\`
11427
+ 1. mutagent explore --json
11428
+ -> confirm which prompt you're creating an evaluation for
11429
+ -> show command output to user
11430
+ -> ask: "Which prompt would you like to evaluate?"
11431
+
11432
+ 2. mutagent prompts get <prompt-id> --json
11433
+ -> inspect inputSchema + outputSchema fields
11434
+ -> understand what input parameters and output shape look like
11435
+ -> these drive the per-field rubric collection in step 4
11436
+
11437
+ 3. mutagent prompts evaluation create --help
11438
+ -> read flags (Rule 2: always --help before first use)
11439
+
11440
+ 4. mutagent prompts evaluation create <prompt-id> --guided --json
11441
+ -> CLI returns _directive.askUserQuestions with per-field questions
11442
+ -> follow the instruction in _directive.instruction
11443
+ -> the instruction inlines the INPUT MVC vs OUTPUT Standards framing
11444
+ (bootstrappable -- works even if concepts/eval-criteria.md isn't loaded)
11445
+
11446
+ 5. For EACH question in _directive.askUserQuestions:
11447
+ -> use AskUserQuestion to collect the answer from the user
11448
+ -> INPUT-scoped fields (source: "inputSchema") -> ask MVC rubric:
11449
+ what's minimum viable context the input MUST contain?
11450
+ -> OUTPUT-scoped fields (source: "outputSchema") -> ask Output Standards:
11451
+ what does correct vs incorrect look like for this field?
11452
+ -> do NOT skip any field
11453
+ -> do NOT auto-fill answers (Rule 3: never auto-generate criteria)
11454
+
11455
+ 6. Construct rubric items from collected answers:
11456
+ -> format: [{"name": "...", "description": "...", "evaluationParameter": "..."}, ...]
11457
+ -> one rubric per schema field unless user opts to merge fields
11458
+ -> use 6-tier full-depth descriptions for complex INPUT criteria
11459
+ (see concepts/eval-criteria.md for examples)
11460
+ -> use simpler 2-3-tier descriptions for OUTPUT correctness criteria
11461
+
11462
+ 7. Ask user to review the constructed criteria before upload:
11463
+ "Here are the N evaluation criteria I drafted. Review before upload?"
11464
+ -> show criteria in a readable format (table or numbered list)
11465
+ -> accept corrections; loop steps 5-7 if user wants edits
11466
+
11467
+ 8. mutagent prompts evaluation create <prompt-id> -d '<json>' --name "<name>" --json
11468
+ -> upload the reviewed criteria
11469
+ -> show command output to user (confirm evaluationId)
11470
+ -> record evaluationId in .mutagent/mutation-context.md
11471
+
11472
+ 9. Ask: "What would you like to do next?"
11473
+ -> Option A: Add more criteria (loop back to step 4)
11474
+ -> Option B: Add a dataset -> route to workflows/dataset-curation.md
11475
+ -> Option C: Start optimization -> route to workflows/optimization.md step 10
11476
+ -> Option D: Done
11477
+ \`\`\`
11478
+
11479
+ ---
11480
+
11481
+ ## Guided mode output shape
11482
+
11483
+ \`mutagent prompts evaluation create <prompt-id> --guided --json\` returns:
11484
+
11485
+ \`\`\`json
11486
+ {
11487
+ "promptId": "...",
11488
+ "promptName": "...",
11489
+ "schemaFields": { "input": ["field1", "field2"], "output": ["result"] },
11490
+ "_directive": {
11491
+ "instruction": "...", // self-sufficient INPUT MVC + OUTPUT Standards rules (bootstrappable)
11492
+ "next": ["mutagent prompts evaluation create <id> -d '<json>' --name '<name>' --json"],
11493
+ "decisionTree": {
11494
+ "step1": "Confirm input parameters with the user via _directive.askUserQuestions...",
11495
+ "step2": "Define correctness criteria for EVERY field..."
11496
+ },
11497
+ "askUserQuestions": [ // inside _directive (not a sibling)
11498
+ { "field": "field1", "source": "inputSchema", "question": "What MVC anchors define minimum viable context for 'field1'?" },
11499
+ { "field": "result", "source": "outputSchema", "question": "What does a correct 'result' look like vs incorrect?" },
11500
+ ...
11501
+ ]
11502
+ },
11503
+ "_compat": { "cliVersion": "...", "skillVersion": "...", "skillMinCliVersion": "..." }
11504
+ }
11505
+ \`\`\`
11506
+
11507
+ Key: \`askUserQuestions\` is inside \`_directive\` (not a top-level sibling). Parse \`_directive.askUserQuestions\`. The \`decisionTree\` field guides multi-step branching.
11508
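+
+ For example (sketch -- assumes \`jq\`; \`pr_123\` is hypothetical), to surface both routing fields at once:
+
+ \`\`\`bash
+ mutagent prompts evaluation create pr_123 --guided --json | jq '._directive | {decisionTree, askUserQuestions}'
+ \`\`\`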
+
11509
+ ---
11510
+
11511
+ ## Cost control
11512
+
11513
+ Eval creation has NO LLM cost on its own -- it's a pure storage operation.
11514
+ Only \`mutagent prompts optimize start\` incurs LLM cost (judge model + exec model
11515
+ multiplied by dataset items × iterations). Safe to create/edit eval criteria freely.
11516
+
11517
+ ---
11518
+
11519
+ ## Common pitfalls
11520
+
11521
+ For the canonical anti-pattern list, see [concepts/eval-criteria.md](../concepts/eval-criteria.md) § Anti-patterns. Workflow-specific execution mistakes:
11522
+
11523
+ - **Skipping per-field collection** -> rubric incomplete; optimizer scores against a sparse signal
11524
+ - **Auto-filling answers from context** instead of asking user -> rubric reflects the agent's assumptions, not the user's domain knowledge (Rule 3 violation)
11525
+ - **Merging input + output criteria into one** -> loses INPUT MVC vs OUTPUT Standards distinction; harder to interpret per-criterion scores
11526
+ - **Uploading without user review** -> user can't catch misinterpretations of their domain
11527
+ - **Wrong \`evaluationParameter\` value** -> server rejects with schema validation error
11528
+
11529
+ ---
11530
+
11531
+ ## Cross-references
11532
+
11533
+ - [SKILL.md](../SKILL.md) -- 5 rules + journey router
11534
+ - [concepts/eval-criteria.md](../concepts/eval-criteria.md) -- Golden Rule, MVC/Output Standards, format requirements, anti-patterns
11535
+ - [workflows/optimization.md](./optimization.md) -- full loop; eval step cross-links here
11536
+ - [workflows/dataset-curation.md](./dataset-curation.md) -- parallel workflow doc (for dataset side)
11537
+ - [concepts/scorecard-output.md](../concepts/scorecard-output.md) -- how eval scores surface in optimization scorecard
11538
+
11539
+ ---
11540
+
11541
+ ## CLI commands
11542
+
11543
+ \`\`\`bash
11544
+ # Discovery (no LLM cost)
11545
+ mutagent prompts evaluation --help # list eval subcommands
11546
+ mutagent prompts evaluation create --help # read flags before first use (Rule 2)
11547
+ mutagent prompts evaluation list <prompt-id> --json # list existing evaluations on a prompt
11548
+ mutagent prompts evaluation get <eval-id> --json # inspect single evaluation's criteria + metadata
11549
+
11550
+ # Creation -- guided (no LLM cost; just storage)
11551
+ mutagent prompts evaluation create <prompt-id> --guided --json # get _directive.askUserQuestions + decisionTree (per-field collection)
11552
+ mutagent prompts evaluation create <prompt-id> -d '<json>' --name "<name>" --json # upload criteria
11553
+ # -d accepts inline JSON OR @path/to/file.json OR - (stdin)
11554
+
11555
+ # Mutations
11556
+ mutagent prompts evaluation update <eval-id> -d '<json>' --json # update existing criteria
11557
+ mutagent prompts evaluation delete <eval-id> --json # delete evaluation (idempotent; --force skips confirm)
11558
+ \`\`\`
11559
+
11560
+ **Flag glossary** (eval-specific):
11561
+ - \`--guided\` -- emit per-field \`askUserQuestions\` directive instead of expecting \`-d\` upfront. Use when collecting from user.
11562
+ - \`-d <json>\` / \`--data <json>\` -- supply criteria payload inline. Accepts: inline JSON, \`@path\` (read from file), \`-\` (read from stdin).
11563
+ - \`--name "<name>"\` -- human-readable label for the evaluation (shows in dashboard).
11564
+ - \`--json\` -- structured output (Rule 1: always use). Returns \`_directive\` + \`_links\`.
11565
+
11566
+ **Cost note**: eval creation/edit/delete commands incur ZERO LLM cost. They are pure storage operations against the platform API. LLM cost is only incurred when \`mutagent prompts optimize start\` runs the judge model against this evaluation.
10589
11567
  `,
10590
11568
  "workflows/exploration.md": `---
10591
11569
  name: mutagent-cli-workflows-exploration
@@ -10720,6 +11698,27 @@ After step 1, show the command output to the user before proceeding to classific
10720
11698
 
10721
11699
  ---
10722
11700
 
11701
+ ## CLI commands
11702
+
11703
+ \`\`\`bash
11704
+ # Discovery -- read-only (no LLM cost)
11705
+ mutagent explore --help # read flags before first use (Rule 2)
11706
+ mutagent explore --json # scan cwd for prompts + agents (full scan)
11707
+ mutagent explore --path ./src --json # scan specific directory subtree
11708
+ mutagent explore --markers-only --json # show only files with existing .mutagent/* markers
11709
+ mutagent explore --classify-only --json # taxonomy output only (skip variable inference)
11710
+ \`\`\`
11711
+
11712
+ **Flag glossary** (explore-specific):
11713
+ - \`--path <dir>\` -- restrict scan to subtree. Useful for monorepos with multiple apps; default is cwd.
11714
+ - \`--markers-only\` -- skip discovery; show only prompts/agents already uploaded (have \`.mutagent/*.md\` marker file). Use to refresh an existing index.
11715
+ - \`--classify-only\` -- skip per-prompt delimiter inference. Faster scan when you only need the prompts[]/agents[] taxonomy split.
11716
+ - \`--json\` -- structured output (Rule 1: always use). Returns \`prompts[]\`, \`agents[]\`, taxonomy, plus per-prompt \`delimiter\` field.
11717
+
11718
+ **Cost note**: \`mutagent explore\` is fully read-only -- no LLM calls, no platform API mutations. Safe to run repeatedly. The output is a snapshot of cwd at run time; re-run after meaningful code changes.
11719
+
11720
+ ---
11721
+
10723
11722
  ## Cross-references
10724
11723
 
10725
11724
  - [SKILL.md](../SKILL.md) → 5 rules + journey router
@@ -10826,6 +11825,9 @@ explore → prompts create → dataset add → evaluation create --guided
10826
11825
 
10827
11826
  7. Load [concepts/eval-criteria.md](../concepts/eval-criteria.md)
10828
11827
  → understand INPUT-param (MVC) vs OUTPUT-param (Standards) scope
11828
+ → for standalone eval-only work outside this optimization context, see
11829
+ [workflows/eval-creation.md](./eval-creation.md) -- this step inlines a brief
11830
+ version of that workflow
10829
11831
 
10830
11832
  8. mutagent prompts evaluation create <promptId> --guided --json
10831
11833
  → the CLI provides a list of fields, each needing a rubric
@@ -10836,6 +11838,8 @@ explore → prompts create → dataset add → evaluation create --guided
10836
11838
  - do NOT skip any field
10837
11839
  - do NOT auto-generate any answer
10838
11840
  → collect at minimum: one INPUT criterion per {variable}, one OUTPUT criterion
11841
+ → for the full step-by-step including review-before-upload + decisionTree handling,
11842
+ see [workflows/eval-creation.md](./eval-creation.md)
10839
11843
 
10840
11844
  9. mutagent prompts evaluation create <promptId> -d '<json>' --json
10841
11845
  → upload the criteria collected in step 8
@@ -10923,6 +11927,63 @@ Ensure at least 2 hard/edge cases per category. Then upload:
10923
11927
  mutagent prompts dataset add <prompt-id> -d '<constructed-json>' --name '<name>' --json
10924
11928
  \`\`\`
10925
11929
 
11930
+ For dataset-only work (no optimization needed yet), see [workflows/dataset-curation.md](./dataset-curation.md)
11931
+ and [concepts/dataset-design.md](../concepts/dataset-design.md) for the full curation principles.
11932
+
11933
+ ---
11934
+
11935
+ ## CLI commands
11936
+
11937
+ Run these before the first use of each command (Rule 2: \`--help\` before first use):
11938
+
11939
+ \`\`\`bash
11940
+ mutagent explore --help # codebase scan flags
11941
+ mutagent prompts create --help # prompt upload flags + brace convention
11942
+ mutagent prompts dataset add --help # dataset add flags + --guided semantics
11943
+ mutagent prompts evaluation create --help # eval create flags + --guided semantics
11944
+ mutagent prompts optimize start --help # optimize start flags + cost-relevant flags
11945
+ mutagent prompts optimize status --help # status polling flags
11946
+ mutagent prompts optimize results --help # results flags + --apply / --diff
11947
+ mutagent usage --help # quota query flags
11948
+ mutagent providers list --help # provider catalog query flags
11949
+ \`\`\`
11950
+
11951
+ Workflow execution sequence (annotated with cost markers):
11952
+
11953
+ \`\`\`bash
11954
+ # Discovery + setup (no LLM cost)
11955
+ mutagent explore --json # step 1: discover prompts
11956
+ mutagent prompts create --name "<name>" --raw-file <path> --json # step 4: upload prompt
11957
+ mutagent prompts dataset add <id> --guided --json # step 5-6: guided dataset (returns _directive.askUserQuestions)
11958
+ mutagent prompts dataset add <id> -d '<json>' --name "<name>" --json # step 6: upload dataset items
11959
+ mutagent prompts evaluation create <id> --guided --json # step 8: guided eval (returns _directive.askUserQuestions + decisionTree)
11960
+ mutagent prompts evaluation create <id> -d '<json>' --name "<name>" --json # step 9: upload criteria
11961
+
11962
+ # Pre-flight checks (no LLM cost)
11963
+ mutagent usage --json # step 10: surface quota to user (Rule 5)
11964
+ mutagent providers list --models --json # verify exec/eval models are available (Rule 6)
11965
+
11966
+ # \uD83D\uDCB0 LLM COST starts here -- requires explicit user confirmation per Rule 5
11967
+ mutagent prompts optimize start <id> --dataset <d> --evaluation <e> --max-iterations 1 --json
11968
+ # step 11: start job (cost = exec_model × items × iterations
11969
+ # + judge_model × items × iterations)
11970
+ # --max-iterations defaults to 1; never raise without user consent
11971
+
11972
+ # Polling + results (no LLM cost; just reads job state + emits verbatim card)
11973
+ mutagent prompts optimize status <job-id> --json # step 12: poll progress (verbatim card)
11974
+ mutagent prompts optimize results <job-id> --json # step 13: view scorecard (verbatim card)
11975
+ mutagent prompts optimize results <job-id> --diff --json # step 14a: view prompt diff (no apply)
11976
+ mutagent prompts optimize results <job-id> --apply --json # step 14b: apply optimized prompt to stored version
11977
+ \`\`\`
11978
+
11979
+ **Cost note**: \`optimize start\` is the ONLY cost-incurring command in this workflow. All other commands are pure storage/discovery operations. The \`--max-iterations\` flag bounds total cost (default = 1; never raise silently).
11980
+
11981
+ **Verbatim card protocol**: \`optimize start\`, \`optimize status\`, and \`optimize results\` emit \`_directive.renderedCard\` -- echo verbatim per [SKILL.md § MANDATORY: Verbatim Card Display Protocol](../SKILL.md).
11982
+
11983
+ For per-topic standalone HOW workflows, see:
11984
+ - [workflows/dataset-curation.md](./dataset-curation.md) -- standalone dataset curation
11985
+ - [workflows/eval-creation.md](./eval-creation.md) -- standalone evaluation rubric creation
11986
+
10926
11987
  ---
10927
11988
 
10928
11989
  ## Cross-references
@@ -10930,6 +11991,8 @@ mutagent prompts dataset add <prompt-id> -d '<constructed-json>' --name '<name>'
10930
11991
  - [SKILL.md](../SKILL.md) → 5 rules + journey router
10931
11992
  - [concepts/prompt-variables.md](../concepts/prompt-variables.md) → brace convention + conversion (critical for steps 3 and 15)
10932
11993
  - [concepts/eval-criteria.md](../concepts/eval-criteria.md) → INPUT MVC + OUTPUT Standards + granular rubric (critical for steps 7-8)
11994
+ - [concepts/dataset-design.md](../concepts/dataset-design.md) → dataset curation principles (Golden Rule, case categories, anti-patterns)
11995
+ - [workflows/dataset-curation.md](./dataset-curation.md) → standalone dataset curation (when no optimization needed yet)
10933
11996
  - [workflows/exploration.md](./exploration.md) → step 1 of this workflow
10934
11997
  - [workflows/tracing.md](./tracing.md) → parallel or follow-up path
10935
11998
  `,
@@ -11044,6 +12107,36 @@ After step 6:
11044
12107
 
11045
12108
  ---
11046
12109
 
12110
+ ## CLI commands
12111
+
12112
+ \`\`\`bash
12113
+ # Discovery (no LLM cost, read-only)
12114
+ mutagent explore --help # read flags before first use (Rule 2)
12115
+ mutagent explore --json # step 1: detect framework + prompts taxonomy
12116
+ mutagent integrate --help # list supported frameworks + per-framework flags
12117
+
12118
+ # Code generation (no LLM cost; emits integration snippet to stdout)
12119
+ mutagent integrate <framework> --json # step 3: get integration snippet for the detected framework
12120
+ mutagent integrate <framework> --output <path> --json # write snippet directly to file (instead of stdout)
12121
+
12122
+ # Verification (no LLM cost, read-only)
12123
+ mutagent traces list --json # step 6: verify traces arriving (recent N traces)
12124
+ mutagent traces list --prompt-id <id> --json # filter by prompt
12125
+ mutagent traces list --since <ISO-timestamp> --json # filter by time window (e.g., since first integration)
12126
+ mutagent traces get <trace-id> --json # inspect single trace's spans + metadata
12127
+ \`\`\`
12128
+
12129
+ **Flag glossary** (tracing-specific):
12130
+ - \`<framework>\` -- supported frameworks: \`langchain\`, \`langgraph\`, \`llamaindex\`, \`openai-agents\`, \`crewai\`, \`autogen\`, \`vercel-ai\`. Run \`mutagent integrate --help\` for the canonical current list.
12131
+ - \`--output <path>\` -- write the integration snippet directly to a file. Without this flag, the snippet goes to stdout (typical for agent-mediated workflows, so the agent can re-emit it verbatim to the user).
12132
+ - \`--prompt-id <id>\` -- filter trace list to one prompt's traces.
12133
+ - \`--since <ts>\` -- filter by timestamp (ISO-8601). Useful right after first integration to confirm traces are landing.
12134
+ - \`--json\` -- structured output (Rule 1: always use).
12135
+
12136
+ **Cost note**: tracing is fully free at the CLI/platform layer -- the platform stores spans for analytics. The only "cost" is the marginal latency that in-process span emission adds to each LLM call inside the user's app (typically <5ms per call). No optimizer cost is incurred.
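A minimal step-6 verification sketch, assuming `mutagent` is on PATH and using only the flags from the glossary above; the parsed output shape is not specified here, so it is handled defensively.

```ts
import { execFileSync } from "node:child_process";

// Ask for traces from the last ten minutes, per the --since glossary entry.
const since = new Date(Date.now() - 10 * 60 * 1000).toISOString();
const out = execFileSync(
  "mutagent",
  ["traces", "list", "--since", since, "--json"],
  { encoding: "utf-8" }
);
const parsed: unknown = JSON.parse(out);
console.log(
  Array.isArray(parsed) ? `${parsed.length} trace(s) since ${since}` : parsed
);
```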
12137
+
12138
+ ---
12139
+
11047
12140
  ## Cross-references
11048
12141
 
11049
12142
  - [SKILL.md](../SKILL.md) → 5 rules + journey router
@@ -11096,7 +12189,7 @@ that teaches coding agents how to use the MutagenT CLI effectively.
11096
12189
  const isJson = parentCmd ? getJsonFlag(parentCmd) : false;
11097
12190
  const output = new OutputFormatter(isJson ? "json" : "table");
11098
12191
  const repoRoot = findRepoRoot();
11099
- const skillDir = join7(repoRoot, SKILL_DIR);
12192
+ const skillDir = join8(repoRoot, SKILL_DIR);
11100
12193
  const files = getSkillFiles();
11101
12194
  const writtenFiles = [];
11102
12195
  let totalBytes = 0;
@@ -11108,9 +12201,9 @@ that teaches coding agents how to use the MutagenT CLI effectively.
11108
12201
  return a.localeCompare(b);
11109
12202
  });
11110
12203
  for (const relPath of sortedKeys) {
11111
- const destPath = join7(skillDir, relPath);
12204
+ const destPath = join8(skillDir, relPath);
11112
12205
  const parentDir = dirname(destPath);
11113
- if (!existsSync12(parentDir)) {
12206
+ if (!existsSync13(parentDir)) {
11114
12207
  mkdirSync4(parentDir, { recursive: true });
11115
12208
  }
11116
12209
  const raw = files[relPath] ?? "";
@@ -11244,18 +12337,18 @@ import { Command as Command19 } from "commander";
11244
12337
  import { randomUUID } from "crypto";
11245
12338
 
11246
12339
  // src/commands/hooks/state.ts
11247
- import { readFileSync as readFileSync10, writeFileSync as writeFileSync6, renameSync, unlinkSync, existsSync as existsSync13 } from "fs";
11248
- import { join as join8 } from "path";
12340
+ import { readFileSync as readFileSync11, writeFileSync as writeFileSync6, renameSync, unlinkSync, existsSync as existsSync14 } from "fs";
12341
+ import { join as join9 } from "path";
11249
12342
  import { tmpdir } from "os";
11250
12343
  function stateFilePath(sessionId) {
11251
- return join8(tmpdir(), `mutagent-hook-${sessionId}.json`);
12344
+ return join9(tmpdir(), `mutagent-hook-${sessionId}.json`);
11252
12345
  }
11253
12346
  function readState(sessionId) {
11254
12347
  const path = stateFilePath(sessionId);
11255
- if (!existsSync13(path))
12348
+ if (!existsSync14(path))
11256
12349
  return null;
11257
12350
  try {
11258
- const raw = JSON.parse(readFileSync10(path, "utf-8"));
12351
+ const raw = JSON.parse(readFileSync11(path, "utf-8"));
11259
12352
  if (!Array.isArray(raw.parentStack)) {
11260
12353
  raw.parentStack = [];
11261
12354
  }
@@ -11275,7 +12368,7 @@ function writeState(sessionId, state) {
11275
12368
  }
11276
12369
  function deleteState(sessionId) {
11277
12370
  const path = stateFilePath(sessionId);
11278
- if (existsSync13(path)) {
12371
+ if (existsSync14(path)) {
11279
12372
  try {
11280
12373
  unlinkSync(path);
11281
12374
  } catch {}
@@ -11922,8 +13015,23 @@ async function handlePostToolUseFailure() {
11922
13015
  }
11923
13016
 
11924
13017
  // src/commands/hooks/install.ts
11925
- import { readFileSync as readFileSync11, writeFileSync as writeFileSync7, existsSync as existsSync14, mkdirSync as mkdirSync5 } from "fs";
11926
- import { join as join9 } from "path";
13018
+ import { readFileSync as readFileSync12, writeFileSync as writeFileSync7, existsSync as existsSync15, mkdirSync as mkdirSync5 } from "fs";
13019
+ import { join as join10 } from "path";
13020
+
13021
+ class SettingsParseError extends Error {
13022
+ settingsPath;
13023
+ backupPath;
13024
+ constructor(settingsPath, backupPath, cause) {
13025
+ super(`settings.local.json contains unparseable JSON.
13026
+ ` + ` Original file backed up to: ${backupPath}
13027
+ ` + ` To recover: restore from backup, fix JSON syntax, then re-run ` + `\`mutagent hooks install\``);
13028
+ this.settingsPath = settingsPath;
13029
+ this.backupPath = backupPath;
13030
+ this.name = "SettingsParseError";
13031
+ if (cause instanceof Error)
13032
+ this.cause = cause;
13033
+ }
13034
+ }
11927
13035
  var V1_MIGRATIONS = {
11928
13036
  Stop: ["mutagent hooks claude-code session-end"]
11929
13037
  };
@@ -11970,15 +13078,18 @@ function migrateV1Hooks(settings) {
11970
13078
  return migrated;
11971
13079
  }
11972
13080
  function installHooks(cwd) {
11973
- const claudeDir = join9(cwd, ".claude");
11974
- const settingsPath = join9(claudeDir, "settings.local.json");
11975
- const existed = existsSync14(settingsPath);
13081
+ const claudeDir = join10(cwd, ".claude");
13082
+ const settingsPath = join10(claudeDir, "settings.local.json");
13083
+ const existed = existsSync15(settingsPath);
11976
13084
  let settings = {};
11977
13085
  if (existed) {
13086
+ const raw = readFileSync12(settingsPath, "utf-8");
11978
13087
  try {
11979
- settings = JSON.parse(readFileSync11(settingsPath, "utf-8"));
11980
- } catch {
11981
- settings = {};
13088
+ settings = JSON.parse(raw);
13089
+ } catch (err) {
13090
+ const backupPath = `${settingsPath}.bak.${new Date().toISOString().replace(/:/g, "-")}`;
13091
+ writeFileSync7(backupPath, raw, "utf-8");
13092
+ throw new SettingsParseError(settingsPath, backupPath, err);
11982
13093
  }
11983
13094
  }
11984
13095
  const added = [];
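For reference, the backup-path expression in the hunk above swaps the colons out of the ISO timestamp so the name is filesystem-safe. A small sketch with a fixed, invented date:

```ts
// Evaluates the same expression as the diff above; the date is illustrative.
const settingsPath = ".claude/settings.local.json";
const stamp = new Date("2026-02-03T12:34:56.789Z").toISOString().replace(/:/g, "-");
console.log(`${settingsPath}.bak.${stamp}`);
// → .claude/settings.local.json.bak.2026-02-03T12-34-56.789Z
```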
@@ -12005,14 +13116,25 @@ function installHooks(cwd) {
12005
13116
  }
12006
13117
  }
12007
13118
  }
13119
+ let userWarning;
12008
13120
  if (added.length > 0 || migrated.length > 0) {
12009
- if (!existsSync14(claudeDir)) {
13121
+ if (!existsSync15(claudeDir)) {
12010
13122
  mkdirSync5(claudeDir, { recursive: true });
12011
13123
  }
12012
13124
  writeFileSync7(settingsPath, JSON.stringify(settings, null, 2) + `
12013
13125
  `, "utf-8");
12014
13126
  }
12015
- return { settingsPath, existed, added, alreadyPresent, migrated };
13127
+ if (added.length > 0) {
13128
+ const addedList = added.map((cmd) => {
13129
+ const parts = cmd.split(" ");
13130
+ return parts[parts.length - 1] ?? cmd;
13131
+ }).join(", ");
13132
+ userWarning = `MutagenT hooks installed into .claude/settings.local.json
13133
+ ` + ` Added: ${addedList}
13134
+ ` + ` This file was modified. Review with: git diff .claude/settings.local.json
13135
+ ` + ` (To remove hooks, edit .claude/settings.local.json and delete the mutagent entries)`;
13136
+ }
13137
+ return { settingsPath, existed, added, alreadyPresent, migrated, userWarning };
12016
13138
  }
12017
13139
 
12018
13140
  // src/commands/hooks/index.ts
@@ -12047,7 +13169,28 @@ hook already present (checked by command string) so running this multiple
12047
13169
  times is safe.
12048
13170
  `).action((opts) => {
12049
13171
  const targetDir = opts.cwd ?? process.cwd();
12050
- const result = installHooks(targetDir);
13172
+ const isJson = Boolean(opts.json);
13173
+ let result;
13174
+ try {
13175
+ result = installHooks(targetDir);
13176
+ } catch (err) {
13177
+ if (err instanceof SettingsParseError) {
13178
+ if (isJson) {
13179
+ process.stdout.write(JSON.stringify({
13180
+ success: false,
13181
+ error: err.message,
13182
+ backupPath: err.backupPath,
13183
+ settingsPath: err.settingsPath
13184
+ }) + `
13185
+ `);
13186
+ } else {
13187
+ process.stderr.write(`[mutagent hooks install] ERROR: ${err.message}
13188
+ `);
13189
+ }
13190
+ process.exit(1);
13191
+ }
13192
+ throw err;
13193
+ }
12051
13194
  for (const { event, command } of result.migrated) {
12052
13195
  process.stderr.write(`[mutagent hooks install] ⚠️ Migrated v1 hook: removed '${event} → ${command}' (v2 wires this as 'SessionEnd')
12053
13196
  `);
@@ -12063,14 +13206,24 @@ times is safe.
12063
13206
  `);
12064
13207
  return;
12065
13208
  }
12066
- process.stdout.write(JSON.stringify({
13209
+ if (result.userWarning) {
13210
+ if (isJson) {} else {
13211
+ process.stderr.write(`⚠ ${result.userWarning}
13212
+ `);
13213
+ }
13214
+ }
13215
+ const jsonResponse = {
12067
13216
  success: true,
12068
13217
  settingsPath: result.settingsPath,
12069
13218
  existed: result.existed,
12070
13219
  added: result.added,
12071
13220
  alreadyPresent: result.alreadyPresent,
12072
13221
  message: result.added.length > 0 ? `Installed ${String(result.added.length)} hook(s). ${String(result.alreadyPresent.length)} already present.` : `All hooks already present (${String(result.alreadyPresent.length)}).`
12073
- }) + `
13222
+ };
13223
+ if (result.userWarning && isJson) {
13224
+ jsonResponse.warnings = [result.userWarning];
13225
+ }
13226
+ process.stdout.write(JSON.stringify(jsonResponse) + `
12074
13227
  `);
12075
13228
  });
12076
13229
  const claudeCode = hooks.command("claude-code").description("Claude Code session telemetry");
@@ -12115,8 +13268,8 @@ import { Command as Command20 } from "commander";
12115
13268
  import chalk38 from "chalk";
12116
13269
  init_errors();
12117
13270
  init_config();
12118
- import { readFileSync as readFileSync12, existsSync as existsSync15 } from "fs";
12119
- import { join as join10, dirname as dirname2 } from "path";
13271
+ import { readFileSync as readFileSync13, existsSync as existsSync16 } from "fs";
13272
+ import { join as join11, dirname as dirname2 } from "path";
12120
13273
  import { fileURLToPath } from "url";
12121
13274
  var VALID_CATEGORIES = ["bug", "feature", "improvement", "praise"];
12122
13275
  function getCliVersion() {
@@ -12125,8 +13278,8 @@ function getCliVersion() {
12125
13278
  }
12126
13279
  try {
12127
13280
  const __dirname2 = dirname2(fileURLToPath(import.meta.url));
12128
- const pkgPath = join10(__dirname2, "..", "..", "package.json");
12129
- const pkg = JSON.parse(readFileSync12(pkgPath, "utf-8"));
13281
+ const pkgPath = join11(__dirname2, "..", "..", "package.json");
13282
+ const pkg = JSON.parse(readFileSync13(pkgPath, "utf-8"));
12130
13283
  return pkg.version ?? "0.1.1";
12131
13284
  } catch {
12132
13285
  return "0.1.1";
@@ -12157,12 +13310,12 @@ async function resolveContextSource(source, _readStdinOverride) {
12157
13310
  raw = await (_readStdinOverride ?? readStdin2)();
12158
13311
  } else if (source.startsWith("@")) {
12159
13312
  const filePath = source.slice(1);
12160
- if (!existsSync15(filePath)) {
13313
+ if (!existsSync16(filePath)) {
12161
13314
  throw new MutagentError("INVALID_ARGUMENTS", `Context file not found: ${filePath}`, `Verify the path exists: ls -la "${filePath}"
12162
13315
  Or use inline JSON: --context '{"key":"value"}'`);
12163
13316
  }
12164
13317
  try {
12165
- raw = readFileSync12(filePath, "utf-8").trim();
13318
+ raw = readFileSync13(filePath, "utf-8").trim();
12166
13319
  } catch (err) {
12167
13320
  const msg = err instanceof Error ? err.message : String(err);
12168
13321
  throw new MutagentError("INVALID_ARGUMENTS", `Cannot read context file "${filePath}": ${msg}`, `Check file permissions: ls -la "${filePath}"`);
@@ -12314,11 +13467,12 @@ if (process.env.CLI_VERSION) {
12314
13467
  } else {
12315
13468
  try {
12316
13469
  const __dirname2 = dirname3(fileURLToPath2(import.meta.url));
12317
- const pkgPath = join11(__dirname2, "..", "..", "package.json");
12318
- const pkg = JSON.parse(readFileSync13(pkgPath, "utf-8"));
13470
+ const pkgPath = join12(__dirname2, "..", "..", "package.json");
13471
+ const pkg = JSON.parse(readFileSync14(pkgPath, "utf-8"));
12319
13472
  cliVersion = pkg.version ?? cliVersion;
12320
13473
  } catch {}
12321
13474
  }
13475
+ setCliVersion(cliVersion);
12322
13476
  var program = new Command21;
12323
13477
  program.name("mutagent").description(`MutagenT CLI - AI-native prompt optimization platform
12324
13478
 
@@ -12328,6 +13482,15 @@ program.name("mutagent").description(`MutagenT CLI - AI-native prompt optimizati
12328
13482
  showGlobalOptions: true
12329
13483
  });
12330
13484
  program.addHelpText("after", `
13485
+ ${chalk39.bold.cyan("WORKFLOWS:")}
13486
+ ${chalk39.bold("Setup")} mutagent login → mutagent init → mutagent skills install
13487
+ ${chalk39.bold("Framework Integration")} mutagent explore → mutagent integrate <fw> → mutagent traces
13488
+ ${chalk39.bold("Prompt Optimization")} mutagent prompts create → prompts dataset → prompts evaluation → prompts optimize
13489
+ ${chalk39.bold("Agent Optimization")} ${chalk39.dim("[coming soon — see: mutagent agents --help]")}
13490
+
13491
+ ${chalk39.dim("For rubric design, prompt-vs-agent classification, scorecard interpretation,")}
13492
+ ${chalk39.dim("see the Skill at")} ${chalk39.cyan(".claude/skills/mutagent-cli/SKILL.md")} ${chalk39.dim("(install via")} ${chalk39.cyan("mutagent skills install")}${chalk39.dim(")")}
13493
+
12331
13494
  ${AGENTS_WIP_BANNER}
12332
13495
 
12333
13496
  ${chalk39.yellow("Non-Interactive Mode (CI/CD & Coding Agents):")}
@@ -12396,10 +13559,14 @@ ${chalk39.yellow("Post-Onboarding Decision Tree:")}
12396
13559
 
12397
13560
  ${chalk39.yellow("Directive System:")}
12398
13561
  Every --json response may include:
12399
- ${chalk39.bold("_directive.renderedCard")} Pre-formatted card for the user ${chalk39.red("(MUST be shown in chat)")}
12400
- ${chalk39.bold("_directive.instruction")} Next step for the agent
12401
- ${chalk39.bold("_directive.next")} Array of suggested follow-up commands
12402
- ${chalk39.bold("_links")} Dashboard/API URLs (format as markdown links)
13562
+ ${chalk39.bold("_directive.display")} Type tag — 'status_card' for card-kind directives (drives test/docs guards)
13563
+ ${chalk39.bold("_directive.renderedCard")} Pre-formatted card ${chalk39.red("(MUST echo verbatim in chat whenever this field exists — see SKILL.md Verbatim Card Display Protocol)")}
13564
+ ${chalk39.bold("_directive.instruction")} Next step for the agent (self-sufficient, no Skill required)
13565
+ ${chalk39.bold("_directive.next")} Array of suggested follow-up commands
13566
+ ${chalk39.bold("_directive.decisionTree")} Multi-step branching logic for guided flows
13567
+ ${chalk39.bold("_directive.askUserQuestions")} Questions to collect from the user (inside _directive)
13568
+ ${chalk39.bold("_links")} Dashboard/API URLs (format as markdown links)
13569
+ ${chalk39.bold("_compat")} Compat metadata: cliVersion, skillVersion, skillMinCliVersion
12403
13570
 
12404
13571
  ${chalk39.yellow("Evaluation Criteria Format:")}
12405
13572
  Each criterion MUST have: ${chalk39.bold("name")}, ${chalk39.bold("description")} (scoring rubric), ${chalk39.bold("evaluationParameter")}
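Assembling the Directive System fields above, a guided-flow response might be shaped roughly like this sketch; only the key names come from the help text, and every value is invented for illustration.

```ts
// Sketch of a _directive envelope using the documented key names.
const response = {
  _directive: {
    display: "status_card",
    renderedCard: "Optimization job opt-123 started (1 iteration)", // echo verbatim
    instruction: "Poll job status until it completes.",
    next: ["mutagent prompts optimize status opt-123 --json"],
    decisionTree: { onFailure: "surface the error card to the user" },
    askUserQuestions: []
  },
  _links: { dashboard: "https://example.invalid/jobs/opt-123" } // hypothetical URL
};
console.log(JSON.stringify(response, null, 2));
```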
@@ -12455,7 +13622,10 @@ ${!hasCredentials() ? `
12455
13622
  var rawArgs = process.argv.slice(2);
12456
13623
  if (rawArgs.includes("-v") || rawArgs.includes("--version")) {
12457
13624
  if (rawArgs.includes("--json")) {
12458
- console.log(JSON.stringify({ version: cliVersion }));
13625
+ console.log(JSON.stringify({
13626
+ version: cliVersion,
13627
+ _compat: getCompatMetadata(cliVersion)
13628
+ }));
12459
13629
  } else {
12460
13630
  console.log(cliVersion);
12461
13631
  }
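With the branch above in place, `mutagent --version --json` prints a payload shaped roughly like this sketch; the version strings are illustrative, and `_compat` carries the cliVersion/skillVersion/skillMinCliVersion triple described in the help text.

```ts
// Illustrative shape of the enriched --version --json output.
const example = {
  version: "0.1.179",
  _compat: {
    cliVersion: "0.1.179",
    skillVersion: "1.2.0",        // hypothetical value read from SKILL.md
    skillMinCliVersion: "0.1.170" // hypothetical value
  }
};
console.log(JSON.stringify(example));
```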
@@ -12492,12 +13662,12 @@ program.addCommand(createFeedbackCommand());
12492
13662
  var isInteractive = process.stdin.isTTY && !rawArgs.includes("--json") && process.env.CI !== "true";
12493
13663
  var isSkillCommand = rawArgs[0] === "skills" || rawArgs[0] === "hooks";
12494
13664
  if (isInteractive && !isSkillCommand) {
12495
- const skillPath = join11(process.cwd(), ".claude/skills/mutagent-cli/SKILL.md");
12496
- if (!existsSync16(skillPath)) {
13665
+ const skillPath = join12(process.cwd(), ".claude/skills/mutagent-cli/SKILL.md");
13666
+ if (!existsSync17(skillPath)) {
12497
13667
  console.log(chalk39.dim("MutagenT SKILL not installed. Install it for AI agent support? Run:"), chalk39.cyan("mutagent skills install"));
12498
13668
  }
12499
13669
  }
12500
13670
  program.parse();
12501
13671
 
12502
- //# debugId=5E2BEB108643839E64756E2164756E21
13672
+ //# debugId=B6E7E54395DC9FFD64756E2164756E21
12503
13673
  //# sourceMappingURL=cli.js.map