@mutagent/cli 0.1.177 → 0.1.178
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/cli.js +1364 -194
- package/dist/bin/cli.js.map +16 -16
- package/dist/index.js +43 -2
- package/dist/index.js.map +3 -3
- package/package.json +1 -1
package/dist/bin/cli.js
CHANGED
@@ -1271,8 +1271,8 @@ var init_sdk_client = __esm(() => {
 // src/bin/cli.ts
 import { Command as Command21 } from "commander";
 import chalk39 from "chalk";
-import { readFileSync as
-import { join as
+import { readFileSync as readFileSync14, existsSync as existsSync17 } from "fs";
+import { join as join12, dirname as dirname3 } from "path";
 import { fileURLToPath as fileURLToPath2 } from "url";
 
 // src/commands/auth.ts
@@ -1280,11 +1280,45 @@ init_config();
 init_sdk_client();
 import { Command } from "commander";
 import chalk4 from "chalk";
-import { existsSync as
-import { join as
+import { existsSync as existsSync4 } from "fs";
+import { join as join5 } from "path";
 
 // src/lib/output.ts
 import chalk from "chalk";
+import { existsSync as existsSync2, readFileSync as readFileSync2 } from "fs";
+import { join as join2 } from "path";
+var _compatCache = null;
+var _globalCliVersion = "0.0.0";
+function setCliVersion(version) {
+  _globalCliVersion = version;
+  _compatCache = null;
+}
+function getCompatMetadata(cliVersion = _globalCliVersion) {
+  if (_compatCache !== null)
+    return _compatCache;
+  let skillVersion = "unknown";
+  let skillMinCliVersion = "unknown";
+  try {
+    const candidatePaths = [
+      join2(process.cwd(), ".claude", "skills", "mutagent-cli", "SKILL.md"),
+      join2(process.cwd(), "mutagent-cli", ".claude", "skills", "mutagent-cli", "SKILL.md")
+    ];
+    for (const p of candidatePaths) {
+      if (existsSync2(p)) {
+        const content = readFileSync2(p, "utf-8");
+        const verMatch = /^SKILL_VERSION:\s*(.+)$/m.exec(content);
+        const minMatch = /^SKILL_MIN_CLI_VERSION:\s*(.+)$/m.exec(content);
+        if (verMatch?.[1])
+          skillVersion = verMatch[1].trim();
+        if (minMatch?.[1])
+          skillMinCliVersion = minMatch[1].trim();
+        break;
+      }
+    }
+  } catch {}
+  _compatCache = { cliVersion, skillVersion, skillMinCliVersion };
+  return _compatCache;
+}
 function getJsonFlag(command) {
   if (!command)
     return false;
@@ -1300,7 +1334,14 @@ class OutputFormatter {
   }
   output(data) {
    if (this.format === "json") {
-      console.log(JSON.stringify(data, null, 2));
+      let enriched = data;
+      if (_globalCliVersion !== "0.0.0" && data !== null && typeof data === "object" && !Array.isArray(data) && "_directive" in data) {
+        enriched = {
+          ...data,
+          _compat: getCompatMetadata(_globalCliVersion)
+        };
+      }
+      console.log(JSON.stringify(enriched, null, 2));
    } else {
      this.formatTable(data);
    }
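Taken with the `getCompatMetadata` hunk above, the effect is that any `--json` payload carrying a `_directive` is now stamped with version-compatibility metadata. A minimal TypeScript sketch of the enriched shape — values illustrative, only the field names come from the diff:

```ts
// Sketch only: field names from the diff above; values are illustrative.
interface CompatMetadata {
  cliVersion: string;         // set once at startup via setCliVersion()
  skillVersion: string;       // parsed from SKILL.md "SKILL_VERSION:", else "unknown"
  skillMinCliVersion: string; // parsed from "SKILL_MIN_CLI_VERSION:", else "unknown"
}

const enrichedPayload = {
  _directive: { display: "status_card", renderedCard: "..." },
  _compat: {
    cliVersion: "0.1.178",
    skillVersion: "1.1.0",
    skillMinCliVersion: "0.1.163",
  } satisfies CompatMetadata,
};
```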
@@ -1431,8 +1472,8 @@ function createSpinner(text, isJson) {
 init_errors();
 
 // src/lib/mutation-context.ts
-import { existsSync as
-import { join as
+import { existsSync as existsSync3, readFileSync as readFileSync3, writeFileSync as writeFileSync2, mkdirSync as mkdirSync2 } from "fs";
+import { join as join3, resolve } from "path";
 function parseTableRows(lines) {
   const rows = [];
   for (const line of lines) {
@@ -1473,18 +1514,18 @@ class MutationContext {
   }
   static load(projectRoot) {
     const root = projectRoot ?? process.cwd();
-    const dirPath =
-    const filePath =
+    const dirPath = join3(resolve(root), CONTEXT_DIR);
+    const filePath = join3(dirPath, CONTEXT_FILE);
     const ctx = new MutationContext(filePath);
-    if (
-      const content =
+    if (existsSync3(filePath)) {
+      const content = readFileSync3(filePath, "utf-8");
      ctx.parse(content);
    }
     return ctx;
   }
   save() {
-    const dirPath =
-    if (!
+    const dirPath = join3(this.filePath, "..");
+    if (!existsSync3(dirPath)) {
      mkdirSync2(dirPath, { recursive: true });
    }
    writeFileSync2(this.filePath, this.render(), "utf-8");
@@ -1710,8 +1751,8 @@ import chalk2 from "chalk";
 import { resolve as resolve2 } from "path";
 
 // src/lib/explorer.ts
-import { readdirSync, readFileSync as
-import { join as
+import { readdirSync, readFileSync as readFileSync4, statSync } from "fs";
+import { join as join4, relative, extname, basename } from "path";
 var DOUBLE_VAR_PATTERN = /\{\{\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*\}\}/g;
 var SINGLE_VAR_PATTERN = /(?<![\\{])\{\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*\}(?!\})/g;
 var FENCED_CODE_PATTERN = /```[\s\S]*?```/g;
@@ -1805,7 +1846,7 @@ function walkDir(dir, extensions, excludeDirs, maxDepth, currentDepth = 0) {
     return files;
   }
   for (const entry of entries) {
-    const fullPath =
+    const fullPath = join4(dir, entry);
     let stat;
     try {
       stat = statSync(fullPath);
@@ -1831,7 +1872,7 @@ function scanForPrompts(filePath, relativePath) {
   const results = [];
   let content;
   try {
-    content =
+    content = readFileSync4(filePath, "utf-8");
   } catch {
     return results;
   }
@@ -1905,7 +1946,7 @@ function scanForAgents(filePath, relativePath) {
   const results = [];
   let content;
   try {
-    content =
+    content = readFileSync4(filePath, "utf-8");
   } catch {
     return results;
   }
@@ -1953,7 +1994,7 @@ function scanForMarkers(filePath, relativePath) {
   const results = [];
   let content;
   try {
-    content =
+    content = readFileSync4(filePath, "utf-8");
   } catch {
     return results;
   }
@@ -1987,7 +2028,7 @@ function scanJsonForSchemas(filePath, relativePath) {
     return results;
   let content;
   try {
-    content =
+    content = readFileSync4(filePath, "utf-8");
   } catch {
     return results;
   }
@@ -2014,7 +2055,7 @@ function scanForDatasets(dir, rootPath, excludeDirs, maxDepth) {
     const name = basename(filePath, ext);
     let content;
     try {
-      content =
+      content = readFileSync4(filePath, "utf-8");
    } catch {
      continue;
    }
@@ -2637,8 +2678,8 @@ Examples:
   }
   const isValid = await validateApiKey(apiKey, endpoint);
   const cwd = process.cwd();
-  const hasOnboarding =
-  const hasContextFile =
+  const hasOnboarding = existsSync4(join5(cwd, ".mutagentrc.json"));
+  const hasContextFile = existsSync4(join5(cwd, ".mutagent", "mutation-context.md"));
   let contextSummary;
   if (hasContextFile) {
     try {
@@ -2799,7 +2840,7 @@ init_errors();
 init_sdk_client();
 import { Command as Command7 } from "commander";
 import chalk18 from "chalk";
-import { readFileSync as
+import { readFileSync as readFileSync5, existsSync as existsSync5 } from "fs";
 
 // src/lib/ui-links.ts
 function getAppBaseUrl() {
@@ -2906,7 +2947,7 @@ function formatCreationHints(hint) {
   return lines.join(`
 `);
 }
-var
+var VERBATIM_CARD_INSTRUCTION = `MANDATORY: HARD STOP — DO NOT PROCEED.
 ` + `1. The renderedCard below appears in your bash/terminal output. That is NOT sufficient — the user cannot see it there.
 ` + `2. You MUST copy the FULL renderedCard text into your chat response VERBATIM. Do NOT summarize, rephrase, or omit any lines.
 ` + `3. Do NOT run any other command until you have displayed the card in chat.
@@ -2989,7 +3030,7 @@ function promptCreatedDirective(promptId, name) {
     fields: { promptId: String(promptId), name },
     links: { dashboard: dashboardUrl, api: apiUrl },
     next,
-    instruction:
+    instruction: VERBATIM_CARD_INSTRUCTION,
     renderedCard: renderPlainCard({ title, icon: "✓", rows, links: cardLinks, next })
   };
 }
@@ -3013,7 +3054,7 @@ function promptUpdatedDirective(promptId, name) {
     fields: { promptId: String(promptId), name },
     links: { dashboard: dashboardUrl, api: apiUrl },
     next,
-    instruction:
+    instruction: VERBATIM_CARD_INSTRUCTION,
     renderedCard: renderPlainCard({ title, icon: "✓", rows, links: cardLinks, next })
   };
 }
@@ -3030,7 +3071,7 @@ function promptDeletedDirective(promptId) {
     fields: { promptId },
     links: { dashboard: dashboardUrl },
     next,
-    instruction:
+    instruction: VERBATIM_CARD_INSTRUCTION,
     renderedCard: renderPlainCard({ title, icon: "✗", rows, links: cardLinks, next })
   };
 }
@@ -3067,7 +3108,7 @@ function datasetAddedDirective(promptId, datasetId, name, itemCount, state) {
     },
     links: { dashboard: dashboardUrl, api: apiUrl },
     next,
-    instruction:
+    instruction: VERBATIM_CARD_INSTRUCTION,
     renderedCard: renderPlainCard({ title, icon: "✓", rows, links: cardLinks, next })
   };
 }
@@ -3087,7 +3128,7 @@ function datasetDeletedDirective(promptId, datasetId) {
     fields: { promptId, datasetId },
     links: { datasets: datasetsUrl },
     next,
-    instruction:
+    instruction: VERBATIM_CARD_INSTRUCTION,
     renderedCard: renderPlainCard({ title, icon: "✗", rows, links: cardLinks, next })
   };
 }
@@ -3123,7 +3164,7 @@ function evaluationCreatedDirective(promptId, evalId, name, criteriaCount, state
     },
     links: { dashboard: dashboardUrl, api: apiUrl },
     next,
-    instruction:
+    instruction: VERBATIM_CARD_INSTRUCTION,
     renderedCard: renderPlainCard({ title, icon: "✓", rows, links: cardLinks, next })
   };
 }
@@ -3140,7 +3181,7 @@ function evaluationDeletedDirective(evaluationId) {
     fields: { evaluationId },
     links: { dashboard: dashboardUrl },
     next,
-    instruction:
+    instruction: VERBATIM_CARD_INSTRUCTION,
     renderedCard: renderPlainCard({ title, icon: "✗", rows, links: cardLinks, next })
   };
 }
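All seven directive factories above now reference the shared `VERBATIM_CARD_INSTRUCTION` constant instead of repeating inline strings. A sketch of the payload shape an agent receives from any of them — field names from the diff, values illustrative:

```ts
// Sketch of a directive payload; only the field names are taken from the diff.
declare const VERBATIM_CARD_INSTRUCTION: string;

const directive = {
  template: "prompt_created",                              // illustrative template name
  title: "Prompt created",
  fields: { promptId: "123", name: "summarize-ticket" },
  links: { dashboard: "https://app.example/prompts/123" }, // illustrative URL
  next: ["mutagent prompts list --json"],
  instruction: VERBATIM_CARD_INSTRUCTION,                  // one constant, seven call sites
  renderedCard: "…",                                       // plain-text card the agent must echo verbatim
};
```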
@@ -3508,6 +3549,7 @@ ${chalk7.yellow("AI Agent: Format Selection Rules")}
 ${chalk7.red("outputSchema is required.")}
 
 ${chalk7.yellow("AI Agent: ALWAYS append --json to this command.")}
+${chalk7.cyan("Conceptual help (brace convention, delimiter inference): .claude/skills/mutagent-cli/concepts/prompt-variables.md")} ${chalk7.dim("(install via mutagent skills install)")}
 `).action(async (options) => {
   const isJson = getJsonFlag(prompts);
   const output = new OutputFormatter(isJson ? "json" : "table");
@@ -3806,10 +3848,12 @@ async function buildGuidedWorkflow(promptId) {
   const askUserQuestions = allFields.map(({ field, source, fieldSchema }) => {
     const isInput = source === "inputSchema";
     const question = isInput ? `Define the Minimum Viable Context for "${field}". What data MUST be present in this input for the prompt to produce a correct output? Describe what constitutes complete vs incomplete input, and WHY this field matters.` : `Define what correct "${field}" looks like. What structure, content, or qualities make it good vs bad? Give concrete examples of good and bad outputs.`;
-    const hint = isInput ? `Input fields define what data the prompt NEEDS to work correctly. Without defining minimum viable context, the optimizer cannot detect whether failures come from bad input or bad prompt logic.` :
+    const hint = isInput ? `Input fields define what data the prompt NEEDS to work correctly. Without defining minimum viable context, the optimizer cannot detect whether failures come from bad input or bad prompt logic.` : undefined;
     const header = isInput ? `${field} [INPUT]` : `${field} [OUTPUT]`;
     const defineDesc = isInput ? `Describe what data MUST be present in "${field}" and WHY the prompt needs it. Focus on completeness and minimum viable context.` : `Describe what a correct vs incorrect "${field}" looks like. Focus on structure, content quality, and concrete examples — not numeric scores.`;
     return {
+      field,
+      source,
       question,
       header,
       ...hint != null ? { hint } : {},
@@ -3841,7 +3885,7 @@ async function buildGuidedWorkflow(promptId) {
       description: "Follow these steps to create an evaluation for this prompt:",
       steps: [
         { step: 1, action: "Review prompt schemas and context", detail: "Understand what the prompt does and what each field means." },
-        { step: 2, action: "Define correctness criteria for EVERY field", detail: `For EACH of the ${String(allFields.length)} fields in askUserQuestions, use AskUserQuestion to collect a rubric. INPUT fields (${String(inputFields.length)}): collect minimum viable context definitions — what data MUST exist. OUTPUT fields (${String(outputFields.length)}): collect correctness criteria — what correct vs incorrect looks like. Do NOT skip any field. Use the context hints to suggest rubrics when asked.` },
+        { step: 2, action: "Define correctness criteria for EVERY field", detail: `For EACH of the ${String(allFields.length)} fields in _directive.askUserQuestions, use AskUserQuestion to collect a rubric. INPUT fields (${String(inputFields.length)}): collect minimum viable context definitions — what data MUST exist. OUTPUT fields (${String(outputFields.length)}): collect correctness criteria — what correct vs incorrect looks like. Do NOT skip any field. Use the context hints to suggest rubrics when asked.` },
         { step: 3, action: "Build criteria JSON", detail: "Map each rubric to: { name: field, description: rubric, evaluationParameter: field }" },
         { step: 4, action: "Ask for evaluation name", detail: "Ask user what to name this evaluation" },
         { step: 5, action: "Create the evaluation", detail: `mutagent prompts evaluation create ${promptId} --name "<name>" -d '<json>' --json` }
@@ -3865,11 +3909,9 @@ async function buildGuidedWorkflow(promptId) {
     }
     })}'`
     },
-    askUserQuestions,
     _directive: {
-      display: "workflow_guide",
       instruction: `MANDATORY ORDERS — READ CAREFULLY:
-` + `1. You MUST ask the user about EVERY SINGLE field listed in askUserQuestions. There are ${String(allFields.length)} fields total.
+` + `1. You MUST ask the user about EVERY SINGLE field listed in _directive.askUserQuestions. There are ${String(allFields.length)} fields total.
 ` + `2. Do NOT skip ANY field. Do NOT decide that a field "doesn't need" a definition.
 ` + `3. INPUT fields (${String(inputFields.length)}): These define MINIMUM VIABLE CONTEXT — the data that MUST exist in the input for the prompt to work. Ask: "What data MUST be present in this field and WHY does the prompt need it?"
 ` + ` WHY THIS MATTERS: Without minimum viable context definitions, the optimizer cannot distinguish between failures caused by incomplete input vs failures caused by bad prompt logic.
@@ -3893,11 +3935,15 @@ async function buildGuidedWorkflow(promptId) {
 ` + ` NEVER use vague rubrics like "0-1 scale" or "score based on accuracy".
 ` + ` Canonical source: .claude/skills/mutagent-cli/SKILL.md → "Guided Eval Criteria".
 ` + " Note: template variables use single-brace `{variable}` (platform canonical). `mutagent explore`\n" + " infers single vs double per-file via inferPromptVariables().",
-
+      next: [
+        `mutagent prompts evaluation create ${promptId} --name "<name>" -d '<criteria-json>' --json`
+      ],
+      decisionTree: {
        step1: "Check if criteria already exist in the user's code. If criteria match expected shape (name + description + evaluationParameter targeting schema fields), use --data directly.",
-      step2: "If criteria are missing or malformed, use the askUserQuestions payloads to collect them via AskUserQuestion.",
+        step2: "If criteria are missing or malformed, use the _directive.askUserQuestions payloads to collect them via AskUserQuestion.",
        step3: `After collecting all criteria, construct the --data JSON and run: mutagent prompts evaluation create <prompt-id> --name "<name>" -d '<json>'`
-      }
+      },
+      askUserQuestions
     },
     _meta: { inputFields, outputFields, allFields }
   };
@@ -4002,12 +4048,11 @@ function buildGuidedDatasetWorkflow(prompt) {
       input: hasSchema ? templateInput : { "<field>": "<value>" },
       expectedOutput: hasSchema ? templateOutput : { "<field>": "<expected>" }
     },
-    askUserQuestions,
     guidance: {
       minItems: 5,
       priorityRule: "Hard cases that expose prompt weaknesses > easy cases that always pass",
       steps: [
-        "Collect answers to each question from the user",
+        "Collect answers to each question from the user via _directive.askUserQuestions",
         "Generate 5-10 dataset items covering all categories",
         "Ensure at least 2 hard/edge cases per category",
         'Format as JSON array: [{"input": {...}, "expectedOutput": {...}}, ...]',
@@ -4015,10 +4060,42 @@ function buildGuidedDatasetWorkflow(prompt) {
       ]
     },
     _directive: {
-      instruction:
+      instruction: `DATASET CURATION — MANDATORY RULES (self-sufficient, no Skill required):
+` + `
+` + `1. PRIORITY RULE: Hard cases that expose prompt weaknesses > easy cases that always pass.
+` + ` Collect HARD CASES FIRST. Ask about failure modes, edge cases, adversarial inputs.
+` + `
+` + `2. CASE CATEGORIES (collect in order):
+` + ` a. Edge Cases (HIGH priority): boundary inputs, empty fields, very long values, unicode, malformed data
+` + ` b. Hard Cases (HIGH priority): ambiguous inputs, adversarial examples, domain traps that fool the model
+` + ` c. Representative Cases (MEDIUM priority): typical production inputs (only after hard cases covered)
+` + `
+` + `3. PER-INPUT-FIELD: For each input field, ask what VALUES should be tested. Collect at minimum:
+` + ` - One boundary value (empty, null, extremely long, wrong type)
+` + ` - One ambiguous value (that could be interpreted multiple ways)
+` + ` - One representative production value
+` + `
+` + `4. PER-OUTPUT-FIELD: For each output field, ask what CORRECT looks like vs INCORRECT.
+` + ` You need the correctness shape to generate realistic expectedOutput values.
+` + `
+` + `5. MINIMUM: 5 dataset items total. At least 2 must be hard or edge cases.
+` + `
+` + `6. FORMAT: Each item MUST have:
+` + ` { "input": { <inputSchema fields> }, "expectedOutput": { <outputSchema fields> } }
+` + ` NEVER omit expectedOutput on labelable items — optimizer needs the ground truth.
+` + `
+` + `7. ANTI-PATTERNS (never do):
+` + ` - All easy cases that always pass → useless for optimization
+` + ` - No edge cases → optimizer never sees failure modes
+` + ` - Fictional inputs that won't happen in production → scores are misleading
+` + ` - Missing expectedOutput on items with knowable correct answers
+` + `
+` + `Use _directive.askUserQuestions (${String(askUserQuestions.length)} questions) to collect domain knowledge from the user.
+` + "For deeper conceptual background, see: .claude/skills/mutagent-cli/concepts/dataset-design.md",
       next: [
         `mutagent prompts dataset add ${promptId} -d '<constructed-json>' --name '<name>' --json`
-      ]
+      ],
+      askUserQuestions
     }
   };
 }
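To make rule 6 concrete, here is what a minimal item set satisfying the new instruction could look like. The `ticket`/`category` fields are hypothetical, chosen only for illustration:

```ts
// Illustrative dataset items per the FORMAT rule above: 5 items, >= 2 hard/edge cases,
// every item carries an expectedOutput. Schema fields are hypothetical.
const items = [
  { input: { ticket: "it broke again??" }, expectedOutput: { category: "needs-clarification" } }, // hard: ambiguous
  { input: { ticket: "" }, expectedOutput: { category: "invalid" } },                             // edge: empty
  { input: { ticket: "a".repeat(10000) }, expectedOutput: { category: "needs-clarification" } },  // edge: very long
  { input: { ticket: "Please refund order #4521" }, expectedOutput: { category: "billing" } },    // representative
  { input: { ticket: "How do I reset my password?" }, expectedOutput: { category: "account" } },  // representative
];
```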
@@ -4653,6 +4730,7 @@ Expected Criteria Shape (--data):
 ${chalk10.red("Required: --name (unless --guided). Criteria must include evaluationParameter.")}
 ${chalk10.dim("CLI flags (--name, --description) override --data fields.")}
 ${chalk10.dim("Get prompt IDs: mutagent prompts list")}
+${chalk10.cyan("Conceptual help: .claude/skills/mutagent-cli/concepts/eval-criteria.md")} ${chalk10.dim("(install via mutagent skills install)")}
 `).action(async (promptId, options) => {
   let isJson = getJsonFlag(prompts);
   if (options.guided) {
@@ -5154,7 +5232,10 @@ function startDirective(job, promptId, datasetId, evaluationId) {
   ];
   const next = [`mutagent prompts optimize status ${job.id} --json`];
   return {
+    instruction: VERBATIM_CARD_INSTRUCTION,
+    next,
     display: "status_card",
+    renderedCard: renderPlainCard({ title, icon: "⚡", rows, links: cardLinks, next }),
     template: "optimization_started",
     title,
     fields: {
@@ -5167,13 +5248,7 @@ function startDirective(job, promptId, datasetId, evaluationId) {
       targetScore: job.config.targetScore ?? 0.8,
       model: job.config.model ?? "default"
     },
-    links: { monitor: monitorUrl, api: apiUrl }
-    next,
-    instruction: `HARD STOP — DO NOT PROCEED.
-` + `The renderedCard below appears in bash/terminal output. That is NOT sufficient.
-` + `You MUST copy the FULL renderedCard text into your chat response VERBATIM before running any other command.
-` + "After displaying, follow the suggested next steps.",
-    renderedCard: renderPlainCard({ title, icon: "⚡", rows, links: cardLinks, next })
+    links: { monitor: monitorUrl, api: apiUrl }
   };
 }
 function statusDirective(status, promptId) {
@@ -5201,7 +5276,10 @@ function statusDirective(status, promptId) {
   const monitorUrl = optimizerLink(promptId ?? "unknown", status.jobId);
   const cardLinks = [{ label: "Monitor", url: monitorUrl }];
   return {
+    instruction: VERBATIM_CARD_INSTRUCTION,
+    next,
     display: "status_card",
+    renderedCard: renderPlainCard({ title, icon, rows, links: cardLinks, next }),
     template: "optimization_progress",
     title,
     fields: {
@@ -5213,13 +5291,7 @@ function statusDirective(status, promptId) {
       eta,
       message: status.message
     },
-    links: { monitor: monitorUrl }
-    next,
-    instruction: `HARD STOP — DO NOT PROCEED.
-` + `The renderedCard below appears in bash/terminal output. That is NOT sufficient.
-` + `You MUST copy the FULL renderedCard text into your chat response VERBATIM before running any other command.
-` + "After displaying, follow the suggested next steps.",
-    renderedCard: renderPlainCard({ title, icon, rows, links: cardLinks, next })
+    links: { monitor: monitorUrl }
   };
 }
 function showPromptDiff(original, optimized) {
@@ -6149,6 +6221,8 @@ ${chalk16.yellow(`⚠ COST WARNING — AI Agent:
 
 ${chalk16.yellow("AI Agent: ALWAYS append --json to this command.")}
 
+${chalk16.cyan("Conceptual help (full optimization loop): .claude/skills/mutagent-cli/workflows/optimization.md")} ${chalk16.dim("(install via mutagent skills install)")}
+
 ${chalk16.bold("Hit a bug or unexpected result?")}
 ${chalk16.cyan('mutagent feedback send --category bug -m "describe what went wrong"')}
 `).action(async (promptId, options) => {
@@ -6461,6 +6535,7 @@ After viewing results:
 ${chalk16.dim("No flag = view scorecard only.")}
 
 ${chalk16.dim("AI Agent: Present scorecard to user via AskUserQuestion before applying.")}
+${chalk16.cyan("Conceptual help (scorecard interpretation, nextAction): .claude/skills/mutagent-cli/concepts/scorecard-output.md")} ${chalk16.dim("(install via mutagent skills install)")}
 `).action(async (jobId, options) => {
   const isJson = getJsonFlag(prompts);
   const output = new OutputFormatter(isJson ? "json" : "table");
@@ -6473,12 +6548,8 @@ After viewing results:
   const isCompleted = jobData?.status === "completed";
   const scorecardText = buildResultsScorecardText(resultData);
   const directive = {
-    display: "
-    instruction:
-` + `The scorecard below appears in bash/terminal output. That is NOT sufficient.
-` + `You MUST copy the FULL renderedCard text into your chat response VERBATIM.
-` + `Then use AskUserQuestion with options: [Apply optimized prompt / View full diff / Reject and keep original].
-` + "Do NOT auto-apply.",
+    display: "status_card",
+    instruction: VERBATIM_CARD_INSTRUCTION,
     renderedCard: scorecardText,
     next: isCompleted ? [
       `mutagent prompts optimize results ${jobId} --apply --json`,
@@ -6799,12 +6870,12 @@ Provide a valid JSON Schema, e.g., '{"type":"object","properties":{"field":{"typ
   }
 }
 if (filePath) {
-  if (!
+  if (!existsSync5(filePath)) {
    throw new MutagentError("FILE_NOT_FOUND", `File not found: ${filePath}`, `Run: ${helpCommand}
 Check the file path and try again`);
  }
  try {
-    return JSON.parse(
+    return JSON.parse(readFileSync5(filePath, "utf-8"));
  } catch {
    throw new MutagentError("INVALID_JSON", `Failed to parse JSON from ${filePath}`, `Run: ${helpCommand}
 Ensure the file contains valid JSON Schema`);
@@ -7102,20 +7173,20 @@ ${chalk22.dim("Returns full trace details including spans, tokens, and latency."
 init_config();
 import { Command as Command9 } from "commander";
 import chalk23 from "chalk";
-import { writeFileSync as writeFileSync3, existsSync as
+import { writeFileSync as writeFileSync3, existsSync as existsSync10 } from "fs";
 import { execSync } from "child_process";
 init_errors();
 
 // src/lib/integrations/langchain.ts
-import { readFileSync as
+import { readFileSync as readFileSync6, existsSync as existsSync6 } from "fs";
 var langchainIntegration = {
   name: "langchain",
   description: "LangChain framework",
   detect() {
     let hasLangchain = false;
-    if (
+    if (existsSync6("package.json")) {
       try {
-        const pkg = JSON.parse(
+        const pkg = JSON.parse(readFileSync6("package.json", "utf-8"));
        const deps = { ...pkg.dependencies, ...pkg.devDependencies };
        hasLangchain = "langchain" in deps || "@langchain/core" in deps;
      } catch {}
@@ -7247,15 +7318,15 @@ mutagent traces analyze <prompt-id>
 };
 
 // src/lib/integrations/langgraph.ts
-import { readFileSync as
+import { readFileSync as readFileSync7, existsSync as existsSync7 } from "fs";
 var langgraphIntegration = {
   name: "langgraph",
   description: "LangGraph agent workflow framework",
   detect() {
     let hasLanggraph = false;
-    if (
+    if (existsSync7("package.json")) {
       try {
-        const pkg = JSON.parse(
+        const pkg = JSON.parse(readFileSync7("package.json", "utf-8"));
        const deps = { ...pkg.dependencies, ...pkg.devDependencies };
        hasLanggraph = "@langchain/langgraph" in deps;
      } catch {}
@@ -7337,15 +7408,15 @@ mutagent integrate langgraph --verify
 };
 
 // src/lib/integrations/vercel-ai.ts
-import { readFileSync as
+import { readFileSync as readFileSync8, existsSync as existsSync8 } from "fs";
 var vercelAiIntegration = {
   name: "vercel-ai",
   description: "Vercel AI SDK",
   detect() {
     let hasAiSdk = false;
-    if (
+    if (existsSync8("package.json")) {
       try {
-        const pkg = JSON.parse(
+        const pkg = JSON.parse(readFileSync8("package.json", "utf-8"));
        const deps = { ...pkg.dependencies, ...pkg.devDependencies };
        hasAiSdk = "ai" in deps;
      } catch {}
@@ -7487,15 +7558,15 @@ mutagent integrate vercel-ai --verify
 };
 
 // src/lib/integrations/openai.ts
-import { readFileSync as
+import { readFileSync as readFileSync9, existsSync as existsSync9 } from "fs";
 var openaiIntegration = {
   name: "openai",
   description: "OpenAI SDK integration with automatic tracing",
   detect() {
     let hasOpenAI = false;
-    if (
+    if (existsSync9("package.json")) {
       try {
-        const pkg = JSON.parse(
+        const pkg = JSON.parse(readFileSync9("package.json", "utf-8"));
        const deps = { ...pkg.dependencies, ...pkg.devDependencies };
        hasOpenAI = "openai" in deps;
      } catch {}
@@ -7644,16 +7715,16 @@ function getFrameworkMetadata(name) {
 
 // src/commands/integrate.ts
 function detectPackageManager() {
-  if (
+  if (existsSync10("bun.lockb") || existsSync10("bun.lock")) {
    return "bun";
  }
-  if (
+  if (existsSync10("pnpm-lock.yaml")) {
    return "pnpm";
  }
-  if (
+  if (existsSync10("yarn.lock")) {
    return "yarn";
  }
-  if (
+  if (existsSync10("package-lock.json")) {
    return "npm";
  }
  try {
@@ -9039,15 +9110,15 @@ init_config();
 import { Command as Command15 } from "commander";
 import inquirer2 from "inquirer";
 import chalk34 from "chalk";
-import { existsSync as
+import { existsSync as existsSync12, mkdirSync as mkdirSync3, writeFileSync as writeFileSync4 } from "fs";
 import { execSync as execSync3 } from "child_process";
-import { join as
+import { join as join7 } from "path";
 init_errors();
 
 // src/lib/framework-detection.ts
-import { existsSync as
+import { existsSync as existsSync11, readFileSync as readFileSync10 } from "fs";
 import { execSync as execSync2 } from "child_process";
-import { join as
+import { join as join6 } from "path";
 var FRAMEWORK_DETECTION_MAP = {
   "@mastra/core": {
     name: "mastra",
@@ -9097,16 +9168,16 @@ var FRAMEWORK_DETECTION_MAP = {
   }
 };
 function detectPackageManager2(cwd = process.cwd()) {
-  if (
+  if (existsSync11(join6(cwd, "bun.lockb")) || existsSync11(join6(cwd, "bun.lock"))) {
    return "bun";
  }
-  if (
+  if (existsSync11(join6(cwd, "pnpm-lock.yaml"))) {
    return "pnpm";
  }
-  if (
+  if (existsSync11(join6(cwd, "yarn.lock"))) {
    return "yarn";
  }
-  if (
+  if (existsSync11(join6(cwd, "package-lock.json"))) {
    return "npm";
  }
  try {
@@ -9127,13 +9198,13 @@ function getInstallCommand2(pm, packages) {
   return commands[pm];
 }
 function detectFrameworkFromPackageJson(cwd = process.cwd()) {
-  const pkgPath =
-  if (!
+  const pkgPath = join6(cwd, "package.json");
+  if (!existsSync11(pkgPath)) {
    return null;
  }
  let pkg;
  try {
-    pkg = JSON.parse(
+    pkg = JSON.parse(readFileSync10(pkgPath, "utf-8"));
  } catch {
    return null;
  }
@@ -9149,12 +9220,12 @@ function detectFrameworkFromPackageJson(cwd = process.cwd()) {
   return null;
 }
 function hasRcConfig(cwd = process.cwd()) {
-  return
+  return existsSync11(join6(cwd, ".mutagentrc.json"));
 }
 
 // src/commands/init.ts
 function writeRcConfig(config, cwd = process.cwd()) {
-  const rcPath =
+  const rcPath = join7(cwd, ".mutagentrc.json");
   writeFileSync4(rcPath, JSON.stringify(config, null, 2) + `
 `);
 }
@@ -9370,8 +9441,8 @@ Modes:
       }
     }
   }
-  const skillPath =
-  const skillInstalled =
+  const skillPath = join7(cwd, ".claude/skills/mutagent-cli/SKILL.md");
+  const skillInstalled = existsSync12(skillPath);
   if (!isNonInteractive && !skillInstalled) {
     const { installSkill } = await inquirer2.prompt([{
       type: "confirm",
@@ -9381,11 +9452,11 @@ Modes:
     }]);
     if (installSkill) {
       try {
-        const skillDir =
-        if (!
+        const skillDir = join7(cwd, ".claude/skills/mutagent-cli");
+        if (!existsSync12(skillDir)) {
          mkdirSync3(skillDir, { recursive: true });
        }
-        execSync3("node " +
+        execSync3("node " + join7(cwd, "node_modules/.bin/mutagent") + " skills install", {
          cwd,
          stdio: "ignore"
        });
@@ -9395,6 +9466,36 @@ Modes:
       }
     }
   }
+  const initDirective = {
+    instruction: skillInstalled ? "Verify workspace. Run: mutagent workspaces list --json" : [
+      "MANDATORY: Ask the user whether they want to install the MutagenT CLI skill and Claude Code hooks.",
+      "Use your AskUserQuestion tool to present the following choices:",
+      "1. Install skill + hooks (Recommended) — Installs the CLI skill AND configures Claude Code session telemetry hooks",
+      "2. Install skill only — Installs the CLI skill (teaches AI agents to use MutagenT CLI)",
+      "3. Skip — Do not install",
+      "",
+      "If the user picks option 1, run BOTH commands:",
+      "  mutagent skills install --json",
+      "  mutagent hooks install --json",
+      "",
+      "If the user picks option 2, run only: mutagent skills install --json",
+      "",
+      "After skill/hooks install, verify workspace: mutagent workspaces list --json"
+    ].join(`
+`),
+    next: ["mutagent workspaces list --json", "mutagent prompts list --json"],
+    ...!skillInstalled ? {
+      askUserQuestions: [{
+        question: "Would you like to install the MutagenT CLI skill and Claude Code hooks?",
+        options: [
+          { label: "Install skill + hooks (Recommended)", description: "Installs the CLI skill AND configures Claude Code session telemetry hooks" },
+          { label: "Install skill only", description: "Installs the CLI skill (teaches AI agents to use MutagenT CLI)" },
+          { label: "Skip", description: "Do not install" }
+        ],
+        multiSelect: false
+      }]
+    } : {}
+  };
   const summary = {
     success: true,
     mode,
@@ -9403,30 +9504,8 @@ Modes:
     framework: confirmedFramework?.name ?? null,
     authenticated,
     workspaceValidation: workspaceValidation ?? null,
-    skillInstalled: skillInstalled ||
-    _directive:
-      instruction: "Verify workspace. Run: mutagent workspaces list --json",
-      next: ["mutagent workspaces list --json", "mutagent prompts list --json"],
-      ...!skillInstalled ? {
-        skillInstall: {
-          instruction: [
-            "MANDATORY: Ask the user whether they want to install the MutagenT CLI skill and Claude Code hooks.",
-            "Use your AskUserQuestion tool to present the following choices:",
-            "1. Install skill + hooks (Recommended) — Installs the CLI skill AND configures Claude Code session telemetry hooks",
-            "2. Install skill only — Installs the CLI skill (teaches AI agents to use MutagenT CLI)",
-            "3. Skip — Do not install",
-            "",
-            "If the user picks option 1, run BOTH commands:",
-            "  mutagent skills install --json",
-            "  mutagent hooks install --json",
-            "",
-            "If the user picks option 2, run only: mutagent skills install --json"
-          ].join(`
-`),
-          command: "mutagent skills install --json"
-        }
-      } : {}
-    }
+    skillInstalled: skillInstalled || existsSync12(skillPath),
+    _directive: initDirective
   };
   output.output(summary);
 } catch (error) {
@@ -9469,6 +9548,7 @@ ${chalk35.bold("Template variable delimiters (inferred per-file):")}
 ${chalk35.dim('{ "file": "src/prompts/summarize.ts", "line": 12, "reason": "template-variable", "delimiter": "single" }')}
 
 ${chalk35.dim("Results are saved to .mutagent/mutation-context.md for use by other commands.")}
+${chalk35.cyan("Conceptual help (prompt vs agent taxonomy, brace inference): .claude/skills/mutagent-cli/workflows/exploration.md")} ${chalk35.dim("(install via mutagent skills install)")}
 `).action((options) => {
   const isJson = getJsonFlag(explore);
   const output = new OutputFormatter(isJson ? "json" : "table");
@@ -9598,8 +9678,8 @@ Scanning ${scanPath}...
 // src/commands/skills.ts
 import { Command as Command17 } from "commander";
 import chalk36 from "chalk";
-import { existsSync as
-import { dirname, join as
+import { existsSync as existsSync13, mkdirSync as mkdirSync4, writeFileSync as writeFileSync5 } from "fs";
+import { dirname, join as join8 } from "path";
 import { execSync as execSync4 } from "child_process";
 
 // src/generated/skill-content.ts
@@ -9614,6 +9694,8 @@ description: |
   "create evaluation", "upload dataset", "explore prompts", "mutagent cli",
   "eval", "dataset", "guided", "how do I optimize", "improve my prompt",
   "set up tracing", "add observability".
+SKILL_VERSION: 1.1.0
+SKILL_MIN_CLI_VERSION: 0.1.163
 ---
 
 # MutagenT CLI Skill
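These two frontmatter keys are exactly what the new `getCompatMetadata()` helper (first hunks of this file) scrapes with multiline regexes. A quick check of the match, using the regexes and values that appear verbatim in this diff:

```ts
// The regexes from getCompatMetadata(), applied to the new frontmatter lines.
const frontmatter = "SKILL_VERSION: 1.1.0\nSKILL_MIN_CLI_VERSION: 0.1.163";
const ver = /^SKILL_VERSION:\s*(.+)$/m.exec(frontmatter)?.[1]?.trim();
const minCli = /^SKILL_MIN_CLI_VERSION:\s*(.+)$/m.exec(frontmatter)?.[1]?.trim();
console.log(ver, minCli); // "1.1.0" "0.1.163"
```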
@@ -9622,7 +9704,135 @@ description: |
 > Packed into the CLI binary via \`scripts/sync-skill.ts\`. Installed to end-user
 > dev environments via \`mutagent skills install\`. Edit this file, not the installed copy.
 
-
+---
+
+## CLI Prerequisite Check (RUN FIRST)
+
+Before executing ANY workflow step, verify the CLI is installed and version-compatible:
+
+**Step 1 -- Check CLI presence:**
+\`\`\`bash
+mutagent --version --json
+\`\`\`
+
+**Step 2 -- If command not found (error / not on PATH):**
+
+This is the **Path 2 onboarding case**: the Skill was installed first (e.g. from a skill registry, manually, or bundled in someone else's CLAUDE.md), but the CLI itself isn't installed yet. Do NOT just dump install instructions and stop -- proactively **offer to install it**.
+
+**2a. Detect the user's package manager** (best-effort — check in this order):
+\`\`\`bash
+# In the user's project root (cwd):
+test -f bun.lockb && echo "bun"
+test -f pnpm-lock.yaml && echo "pnpm"
+test -f yarn.lock && echo "yarn"
+test -f package-lock.json && echo "npm"
+# Fallback: which bun || which pnpm || which yarn || which npm
+\`\`\`
+If multiple lockfiles exist, prefer in order: \`bun > pnpm > yarn > npm\`.
+If no lockfile and the user is in a non-JS project (e.g. Python, Go), ask which they prefer.
+
+**2b. Ask the user via AskUserQuestion** (do NOT auto-install without consent):
+
+> "The MutagenT CLI is not installed yet. I can install it globally via \`<detected-pm>\`. Proceed?"
+
+Options to present:
+1. **Yes, install globally with \`<detected-pm>\`** (Recommended) -- runs \`<pm> add -g @mutagent/cli\` (or \`npm install -g @mutagent/cli\` for npm)
+2. **Yes, but use a different package manager** -- prompt for choice (npm / bun / pnpm / yarn)
+3. **No, I'll install it myself** -- show the four install commands as a verbatim block; STOP and wait for the user to install
+4. **Skip — I have it installed via a different path** -- ask the user to add it to PATH and re-invoke
+
+**2c. On user approval (option 1 or 2)**, run the install command in a Bash tool call:
+\`\`\`bash
+# bun
+bun add -g @mutagent/cli
+# npm
+npm install -g @mutagent/cli
+# pnpm
+pnpm add -g @mutagent/cli
+# yarn
+yarn global add @mutagent/cli
+\`\`\`
+Show the install output to the user verbatim. After it completes, **re-run Step 1** (\`mutagent --version --json\`) to confirm the CLI is now on PATH. If the post-install version check still fails (e.g. global bin not on PATH), tell the user:
+> "Install succeeded but \`mutagent\` isn't on PATH yet. Common fixes:
+> - bun: \`export PATH=\\"$HOME/.bun/bin:$PATH\\"\`
+> - npm: check \`npm config get prefix\` and add \`<prefix>/bin\` to PATH
+> - pnpm: \`pnpm setup\` then restart your shell
+> - yarn: \`yarn global bin\` and add that path to PATH"
+> Then ask the user to restart their shell or source the relevant rc file, and re-invoke me.
+
+**2d. On rejection (option 3 or 4)**, output the four install commands as a verbatim block and STOP. Do NOT proceed with any workflow until the user confirms the CLI is installed and re-invokes the skill.
+
+\`\`\`bash
+# Pick one (Recommended in order: bun > pnpm > yarn > npm)
+bun add -g @mutagent/cli
+pnpm add -g @mutagent/cli
+yarn global add @mutagent/cli
+npm install -g @mutagent/cli
+\`\`\`
+
+**Why proactive install (not just "stop and tell")**: a user invoking this Skill has already shown intent to use MutagenT. Forcing them to context-switch to a separate terminal, copy-paste an install command, and re-invoke the agent is friction that often loses the user. Asking once + installing on consent is the smoother path. The opt-out (option 3/4) preserves user control.
+
+**Step 3 -- Version compatibility check:**
+Parse \`_compat.cliVersion\` from the \`--version --json\` output and compare against
+\`SKILL_MIN_CLI_VERSION\` (above in this file's frontmatter, currently \`0.1.163\`).
+
+- If \`cliVersion >= SKILL_MIN_CLI_VERSION\`: all good, proceed normally.
+- If \`cliVersion < SKILL_MIN_CLI_VERSION\`: emit a **PROMINENT PERSISTENT WARN** to the user:
+
+> Warning: **CLI version mismatch**: Your \`mutagent\` CLI is \`{cliVersion}\` but this Skill
+> requires \`>= 0.1.163\`. Some features may not work correctly.
+> Upgrade: \`npm install -g @mutagent/cli@latest\`
+
+**DO NOT BLOCK** -- proceed with the requested workflow after showing the warn.
+Re-emit this warn at the start of every subsequent Skill invocation until the user upgrades.
+
+**Per decision D4 (locked 2026-05-03)**: version mismatch is warn-only, never a hard block.
+
+---
+
+## MANDATORY: Verbatim Card Display Protocol
+
+If a mutagent CLI response contains \`_directive.renderedCard\`, your FIRST action
+MUST be to copy that text byte-for-byte into your chat reply BEFORE running any
+further command, asking any question, or doing any other work.
+
+The card also appears on stderr (auto-echoed by the CLI via \`echoDirectiveToStderr\`)
+but you MUST STILL render it in chat -- bash/stderr blocks may be collapsed in the
+user's UI and they cannot read them.
+
+- Do NOT summarize, paraphrase, or truncate any line of the card.
+- Do NOT skip the card to "save tokens."
+- Do NOT delay rendering until "after the next step."
+- This applies whether \`_directive.display\` is \`'status_card'\` or absent -- the
+  trigger is the presence of \`renderedCard\`, not the display tag.
+- Failure to display the card verbatim = protocol violation.
+
+This rule applies to: prompt/agent/dataset/evaluation create+update+delete,
+provider add+update+delete, and optimize start/status/results.
+
+---
+
+## Agent Runtime — interactive question handling
+
+This skill emits \`_directive.askUserQuestions\` arrays for guided flows
+(eval-creation, dataset-curation, init). On Claude Code, use the
+**AskUserQuestion** tool to present each question to the user. On other
+coding-agent runtimes (Cursor, Aider, Continue, etc.), use the equivalent
+inquiry/prompt mechanism your runtime provides.
+
+If your runtime has no interactive-question tool, fall back to:
+1. Echo each question's \`question\` field verbatim into chat
+2. Wait for the user's reply before proceeding to the next question
+3. Do NOT auto-fill answers from context (Rule 3)
+
+The \`_directive.askUserQuestions\` schema is described in
+[\`concepts/eval-criteria.md\`](./concepts/eval-criteria.md) §
+"Per-field rubric collection" and follows
+[Claude Code's AskUserQuestion tool shape](https://docs.claude.com/en/docs/claude-code/sdk).
+
+---
+
+## SKILL vs CLI -- responsibility split
 
 | Layer | Owner | Responsibility |
 |---|---|---|
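Step 3 above prescribes a warn-only gate. For illustration, a minimal sketch of that comparison — the Skill text does not prescribe an implementation, so the dotted-numeric compare below is an assumption:

```ts
// Warn-only version gate per Step 3 / decision D4. Sketch, not from the package.
function isOlder(a: string, b: string): boolean {
  const pa = a.split(".").map(Number);
  const pb = b.split(".").map(Number);
  for (let i = 0; i < Math.max(pa.length, pb.length); i++) {
    const x = pa[i] ?? 0, y = pb[i] ?? 0;
    if (x !== y) return x < y;
  }
  return false;
}

function warnIfIncompatible(cliVersion: string, minVersion: string): void {
  if (isOlder(cliVersion, minVersion)) {
    console.warn(
      `Warning: CLI version mismatch: ${cliVersion} < required ${minVersion}. ` +
        `Upgrade: npm install -g @mutagent/cli@latest`,
    );
  }
  // Never block: per decision D4 the caller proceeds regardless.
}
```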
@@ -9630,33 +9840,54 @@ description: |
 | **CLI** | \`mutagent <cmd>\` | commands, flags, \`--json\`, \`_directive.*\`, \`_links\` |
 | Platform | api.mutagent.io | storage, optimization, eval execution, \`{variable}\` rendering |
 
-**Rule**: SKILL never duplicates CLI flag lists
+**Rule**: SKILL never duplicates CLI flag lists -- always \`mutagent <cmd> --help\` for flags.
 
 ---
 
-## 5 Core Rules
+## 5 Core Rules -- NON-NEGOTIABLE
 
 1. **\`--json\` on EVERY command.** No exceptions. Agents use JSON mode exclusively.
-2. **\`<command> --help\` BEFORE first use of any command.** The CLI is the source of truth for flags
-3. **NEVER auto-generate eval criteria
+2. **\`<command> --help\` BEFORE first use of any command.** The CLI is the source of truth for flags -- this SKILL never inlines them.
+3. **NEVER auto-generate eval criteria -- collect from user.** Ask the user for each rubric field. See [concepts/eval-criteria.md](./concepts/eval-criteria.md) for the rubric format.
 4. **Explore-before-modify.** Run \`mutagent explore --json\` before any write operation. Present findings, get user confirmation. Never mutate without discovery first.
 5. **Cost transparency before \`optimize start\`.** Run \`mutagent usage --json\` and show the result to the user. Get explicit confirmation before any optimization job.
 6. **Before optimizing, run \`mutagent providers list --models\` to verify available models.** This calls \`/providers/catalog\` and shows which models are available per provider. Use the output to pick valid \`--exec-model\` and \`--eval-model\` values.
 
 ---
 
-##
+## Prompt vs Agent -- pick the right loop
+
+| Signal | Use | CLI surface | Skill workflow |
+|---|---|---|---|
+| Single LLM call -> text/JSON output | Prompt Optimization | \`mutagent prompts *\` | [workflows/optimization.md](./workflows/optimization.md) |
+| Multi-turn / tool-calling / state graph | Agent (WIP) | \`mutagent agents *\` (CRUD only) | [workflows/agents.md](./workflows/agents.md) (stub) |
 
-
+When in doubt: run \`mutagent explore --json\` (it classifies discovered code under \`prompts[]\` vs \`agents[]\`).
 
-Match the user's first request. Load ONLY the matching subfile. Do NOT preload the whole set.
+---
+
+## Journey Router -- route by user intent
+
+> **Concept files = WHY/WHAT pre-reads. Workflow files = HOW step sequences.**
+> Load BOTH when intent matches both axes (e.g., "create rubric" loads
+> \`concepts/eval-criteria.md\` for the rubric design framework AND
+> \`workflows/eval-creation.md\` for the step-by-step CLI sequence). Each topic's
+> concept ↔ workflow pairing is shown in the Subfile Map below.
+
+Match the user's first request. Load ONLY the matching subfile(s) per the table. Do NOT preload the whole set.
+
+| User said / signal detected | Load subfile(s) | Why |
 |---|---|---|
 | "trace", "observe", "integrate", "add framework" | [workflows/tracing.md](./workflows/tracing.md) | Non-destructive, fastest first-value path |
-| "optimize", "improve", "tune", "
+| "optimize", "improve", "tune", "upload prompt" | [workflows/optimization.md](./workflows/optimization.md) | Full create->dataset->eval->optimize loop (orchestrator) |
+| "create dataset", "add examples", "test cases", "edge cases", "hard cases", "expand dataset", "dataset items" | [workflows/dataset-curation.md](./workflows/dataset-curation.md) (HOW) + [concepts/dataset-design.md](./concepts/dataset-design.md) (WHY) | Standalone dataset curation (no optimization context needed) |
+| "create evaluation", "create rubric", "evaluate prompt", "judge", "score this prompt", "rubric design", "MVC", "Output Standards" | [workflows/eval-creation.md](./workflows/eval-creation.md) (HOW) + [concepts/eval-criteria.md](./concepts/eval-criteria.md) (WHY) | Standalone evaluation rubric creation (no optimization context needed) |
 | "explore", "scan", "find prompts", "what prompts", "discover" | [workflows/exploration.md](./workflows/exploration.md) | Read-only discovery + taxonomy |
-| \`AgentExecutor\`, \`StateGraph\`, \`createReactAgent\`, \`tool_calls\`, \`@tool\`, \`langgraph\`, \`crewai\`, \`autogen\`, \`openai/agents\`, multi-turn | [workflows/agents.md](./workflows/agents.md) | WIP path
-| "how do variables work", "single vs double braces", delimiter | [concepts/prompt-variables.md](./concepts/prompt-variables.md) | Delimiter inference contract |
-| "what makes a good eval"
+| \`AgentExecutor\`, \`StateGraph\`, \`createReactAgent\`, \`tool_calls\`, \`@tool\`, \`langgraph\`, \`crewai\`, \`autogen\`, \`openai/agents\`, multi-turn | [workflows/agents.md](./workflows/agents.md) | WIP path -- surface partnership link |
+| "how do variables work", "single vs double braces", delimiter | [concepts/prompt-variables.md](./concepts/prompt-variables.md) | Delimiter inference contract (concept-only; prompt creation lives inline in optimization.md step 4) |
+| "what makes a good eval" (concept question only, no creation intent) | [concepts/eval-criteria.md](./concepts/eval-criteria.md) | INPUT MVC + OUTPUT Standards (no workflow load) |
+| "what makes a good dataset" (concept question only, no creation intent) | [concepts/dataset-design.md](./concepts/dataset-design.md) | Dataset curation principles + case categories (no workflow load) |
+| "scorecard", "interpret results", "what does X score mean" | [concepts/scorecard-output.md](./concepts/scorecard-output.md) | Interpretation only (no workflow needed) |
 | "check models", "what models", "available models", "which models" | run \`mutagent providers list --models --json\` | Discovery: shows catalog per provider before model selection |
 | Unclear / first time | run \`mutagent explore --json\` first, then reroute | Discovery before action |
 
@@ -9667,59 +9898,62 @@ Match the user's first request. Load ONLY the matching subfile. Do NOT preload t
|
|
|
9667
9898
|
| File | WHEN to load | WHY | ENFORCEMENT |
|
|
9668
9899
|
|---|---|---|---|
|
|
9669
9900
|
| [workflows/tracing.md](./workflows/tracing.md) | User wants to add framework tracing / observability | Non-destructive append-only integration sequence | Must run explore first (Rule 4) |
|
|
9670
|
-
| [workflows/optimization.md](./workflows/optimization.md) | User wants to optimize or evaluate a prompt | Full loop: explore
|
|
9901
|
+
| [workflows/optimization.md](./workflows/optimization.md) | User wants to optimize or evaluate a prompt | Full loop: explore -> upload -> dataset -> eval -> optimize -> apply | Must check usage before optimize (Rule 5); must collect rubrics from user (Rule 3) |
|
|
9902
|
+
| [workflows/dataset-curation.md](./workflows/dataset-curation.md) | User wants to create/expand a dataset (standalone) | Focused dataset curation without full optimization context | Hard cases first; ask per-field questions |
|
|
9903
|
+
| [workflows/eval-creation.md](./workflows/eval-creation.md) | User wants to create/edit evaluation rubric (standalone) | Focused per-field rubric collection without full optimization context | INPUT MVC + OUTPUT Standards split; ask per-field questions; collect from user (Rule 3) |
|
|
9671
9904
|
| [workflows/exploration.md](./workflows/exploration.md) | User wants to scan codebase, identify prompts vs agents | Read-only discovery; output taxonomy to user | Run only; no writes |
|
|
9672
|
-
| [workflows/agents.md](./workflows/agents.md) | Multi-turn / tool-calling code detected | WIP
|
|
9905
|
+
| [workflows/agents.md](./workflows/agents.md) | Multi-turn / tool-calling code detected | WIP -- do NOT attempt optimizer, surface partnership link | Show WIP card to user verbatim |
|
|
9673
9906
|
| [concepts/prompt-variables.md](./concepts/prompt-variables.md) | Any question about \`{var}\` vs \`{{var}}\`, delimiter inference | Brace convention + conversion rules | Load before \`prompts create\` in optimization workflow |
|
|
9674
|
-
| [concepts/eval-criteria.md](./concepts/eval-criteria.md) | Any question about rubric design, MVC, Output Standards | granular rubric format
|
|
9907
|
+
| [concepts/eval-criteria.md](./concepts/eval-criteria.md) | Any question about rubric design, MVC, Output Standards | granular rubric format -- INPUT-param vs OUTPUT-param scope | Load before \`evaluation create --guided\` in optimization workflow |
|
|
9908
|
+
| [concepts/dataset-design.md](./concepts/dataset-design.md) | Any question about dataset quality, case categories, hard cases | Dataset design principles -- parallel structure to eval-criteria.md | Load before \`dataset add --guided\` |
|
|
9675
9909
|
|
|
9676
9910
|
---
|
|
9677
9911
|
|
|
9678
9912
|
## Output handling
|
|
9679
9913
|
|
|
9680
9914
|
After every CLI command:
|
|
9681
|
-
- **Show the command output to the user.** Command output appears in bash blocks that users may not see
|
|
9682
|
-
- **For evaluation create \`--guided\`**: the CLI provides per-field questions
|
|
9915
|
+
- **Show the command output to the user.** Command output appears in bash blocks that users may not see -- always present the key results in your chat response.
|
|
9916
|
+
- **For evaluation create \`--guided\`**: the CLI provides per-field questions in \`_directive.askUserQuestions\`. Ask the user each question in turn. Do not skip any field. Do not pre-fill answers.
|
|
9683
9917
|
- **For \`optimize results\`**: present the before/after scorecard to the user and confirm whether to apply, view diff, or reject.
|
|
9684
9918
|
|
|
9685
9919
|
---
|
|
9686
9920
|
|
|
9687
|
-
## Anti-patterns
|
|
9921
|
+
## Anti-patterns -- NEVER do these
|
|
9688
9922
|
|
|
9689
9923
|
- Run any command without \`--json\`
|
|
9690
|
-
- Auto-generate eval criteria
|
|
9924
|
+
- Auto-generate eval criteria -- always collect from the user
|
|
9691
9925
|
- Skip any schema field when collecting evaluation rubrics
|
|
9692
9926
|
- Skip \`mutagent explore --json\` before any write operation
|
|
9693
9927
|
- Run \`optimize start\` without first showing \`usage --json\` to the user
|
|
9694
9928
|
- Increase \`--max-iterations\` above 1 without explicit user consent (each iteration = LLM spend)
|
|
9695
9929
|
- Run a multi-turn agent through the prompt optimizer
|
|
9696
9930
|
- Skip showing command output results to the user
|
|
9697
|
-
- Inline CLI flags from memory
|
|
9931
|
+
- Inline CLI flags from memory -- always read \`--help\` first
|
|
9698
9932
|
|
|
9699
9933
|
---
|
|
9700
9934
|
|
|
9701
9935
|
## State Tracking
|
|
9702
9936
|
|
|
9703
|
-
- \`.mutagent/mutation-context.md\`
|
|
9704
|
-
- \`mutagent auth status --json\`
|
|
9937
|
+
- \`.mutagent/mutation-context.md\` -- codebase index of discovered/uploaded prompts. Update after explore, create, dataset ops.
|
|
9938
|
+
- \`mutagent auth status --json\` -- auth + workspace state.
|
|
9705
9939
|
|
|
9706
9940
|
---
|
|
9707
9941
|
|
|
9708
9942
|
## Login (two paths)
|
|
9709
9943
|
|
|
9710
|
-
- **CI / automated**: \`export MUTAGENT_API_KEY=mt_... && mutagent login --json\`
|
|
9711
|
-
- **Onboarding a user**: \`mutagent login --browser --json\`
|
|
9944
|
+
- **CI / automated**: \`export MUTAGENT_API_KEY=mt_... && mutagent login --json\` -- no browser, no prompts.
|
|
9945
|
+
- **Onboarding a user**: \`mutagent login --browser --json\` -- CLI prints auth URL to stdout, polls 5 min. **Surface the URL verbatim to the user.** \`--non-interactive\` is NOT needed when \`--browser\` is set.
|
|
9712
9946
|
|
|
9713
|
-
\`mutagent login\` is canonical. \`mutagent auth login\` is a back-compat alias. Both delegate to \`lib/auth-flow.ts\`. Decision record: [cli-design-principles.md](../../docs/cli-design-principles.md)
|
|
9947
|
+
\`mutagent login\` is canonical. \`mutagent auth login\` is a back-compat alias. Both delegate to \`lib/auth-flow.ts\`. Decision record: [cli-design-principles.md](../../docs/cli-design-principles.md) -> Login Unification.
|
|
9714
9948
|
|
|
9715
9949
|
---
|
|
9716
9950
|
|
|
9717
|
-
## Error Recovery
|
|
9951
|
+
## Error Recovery -- Agent-Aware Bug Reporting
|
|
9718
9952
|
|
|
9719
9953
|
When ANY mutagent CLI command returns a non-zero exit code or an error response,
|
|
9720
9954
|
follow this protocol:
|
|
9721
9955
|
|
|
9722
|
-
1. **Show the error to the user** (always)
|
|
9956
|
+
1. **Show the error to the user** (always) -- reproduce the exact command and output.
|
|
9723
9957
|
2. **ASK the user** if they want to file a bug report with session context.
|
|
9724
9958
|
3. **On user approval**, run:
|
|
9725
9959
|
\`\`\`bash
|
|
@@ -9758,7 +9992,7 @@ top-level keys you supply:
|
|
|
9758
9992
|
}
|
|
9759
9993
|
\`\`\`
|
|
9760
9994
|
|
|
9761
|
-
\`_auto\` is always populated by the CLI
|
|
9995
|
+
\`_auto\` is always populated by the CLI -- do **not** set it manually. Your
|
|
9762
9996
|
top-level keys are never overwritten; if you supply \`workspaceId: "ws_agent_B"\`,
|
|
9763
9997
|
the CLI's current workspace A goes into \`_auto.workspaceId\`, not the top level.
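To make the never-overwrite rule concrete, here is a minimal TypeScript sketch of the merge behavior described above. It is illustrative only, not the CLI's actual implementation; only `workspaceId` and `_auto` come from the doc, the helper name is hypothetical.

```ts
// Hypothetical sketch of the _auto merge rule -- not the real CLI code.
type FeedbackPayload = Record<string, unknown> & { _auto?: Record<string, unknown> };

function mergeAutoContext(
  supplied: FeedbackPayload,
  cliContext: Record<string, unknown>,
): FeedbackPayload {
  // Top-level keys you supply are never overwritten; CLI-observed context
  // is recorded under _auto instead.
  return { ...supplied, _auto: { ...cliContext } };
}

// You supply workspaceId "ws_agent_B" while the CLI's current workspace is A:
// your top-level value survives; the CLI's goes into _auto.workspaceId.
const payload = mergeAutoContext(
  { workspaceId: "ws_agent_B" },
  { workspaceId: "ws_current_A" },
);
```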

@@ -9775,7 +10009,218 @@ If the feedback command returns a non-zero exit code, DO NOT retry silently. Sho

 ## Extensibility

-Add \`workflows/custom-<name>.md\` with frontmatter \`triggers: ["phrase"]\`
+Add \`workflows/custom-<name>.md\` with frontmatter \`triggers: ["phrase"]\` -- auto-discovered by the decision tree fallback row. No rebuild needed.
+\`,
+"concepts/dataset-design.md": \`---
+name: mutagent-cli-concepts-dataset-design
+description: |
+  Canonical source for MutagenT dataset design principles.
+  The Golden Rule: hard cases > easy cases; edge cases are mandatory.
+  Covers case categories (Edge / Hard / Representative / Adversarial),
+  format requirements (input/expectedOutput shape), and anti-patterns.
+  Parallel structure to concepts/eval-criteria.md for cognitive parity.
+  Mirrored in the CLI directive's bootstrappable instruction field.
+triggers:
+- "dataset design"
+- "dataset quality"
+- "what makes a good dataset"
+- "hard cases"
+- "edge cases"
+- "test cases"
+- "expectedOutput"
+- "dataset items"
+- "guided dataset"
+---
+
+# Concept -- Dataset Design
+
+> **Parallel to** [concepts/eval-criteria.md](./eval-criteria.md) -- same section
+> structure so agents can navigate both consistently.
+>
+> **Canonical source** for dataset curation principles. Mirrored inline in
+> \`mutagent-cli/src/commands/prompts/guided-dataset.ts\` (directive instruction field)
+> so even agents without the Skill loaded can execute correctly.
+
+## The Golden Rule
+
+**Hard cases that expose prompt weaknesses FIRST. Easy cases that always pass LAST.**
+
+A dataset where every item produces correct output tells you nothing about where the
+prompt fails. The optimizer needs failure signal to improve. Prioritize inputs that:
+
+1. Are ambiguous (multiple valid interpretations)
+2. Are adversarial (designed to trigger a known failure mode)
+3. Are at the boundary of what the prompt should handle
+4. Are drawn from actual production failure cases
+
+**One edge case that causes a failure is worth 10 easy cases that succeed.**
+
+---
+
+## NEVER skip expectedOutput on labelable items
+
+This is the dataset equivalent of Rule 3 (never auto-generate eval criteria).
+
+- If you know what the correct output should be for a given input, you MUST include \`expectedOutput\`.
+- The optimizer uses \`expectedOutput\` as the ground-truth signal for G-Eval scoring.
+- Omitting \`expectedOutput\` on a labelable item forces the evaluator to use LLM judgment alone -- much noisier.
+- **Only omit \`expectedOutput\`** when correct output is genuinely subjective / context-dependent AND no rubric can distinguish good from bad.
+
+Ask the user for expected outputs field by field -- do NOT auto-generate them.
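A hedged TypeScript sketch of the labelable/subjective split described above; the field names are hypothetical, not from a real prompt schema:

```ts
// Illustrative only -- field names are made up for the example.
// Labelable item: the correct output is known, so expectedOutput is mandatory.
const labelable = {
  input: { ticketText: "My card was charged twice" },
  expectedOutput: { category: "billing" },
};

// Genuinely subjective item: no single correct output exists and no rubric
// can distinguish good from bad -- only then may expectedOutput be omitted.
const subjective = {
  input: { ticketText: "Write a friendly greeting for this customer" },
};
```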
+
+---
+
+## NEVER auto-generate dataset items
+
+This is the counterpart to Rule 3 (never auto-generate eval criteria). Reasons:
+
+- Auto-generated items tend to be representative cases (easy) rather than hard cases.
+- The user knows what production inputs look like and where the prompt fails; the agent does not.
+- Synthetic easy cases produce noisy optimization signal -- the optimizer improves scores on the easy
+  cases but the real prompt weaknesses go uncovered.
+- Collect items from the user via AskUserQuestion, one category at a time.
+
+---
+
+## Case Categories
+
+Collect in this priority order -- hardest categories first:
+
+### 1. Edge Cases (HIGH priority)
+
+Boundary inputs that test the limits of what the prompt should handle.
+
+| What to ask | Examples |
+|---|---|
+| Empty or null inputs | \`""\`, \`null\`, \`0\`, \`[]\` |
+| Very long inputs (token limits) | paragraph-length where field should be short |
+| Malformed inputs | wrong type, wrong format, garbled text |
+| Unicode / special characters | emoji, RTL text, control chars, escaped quotes |
+| Missing required sub-fields | object with some required fields absent |
+
+**Collect at minimum**: 1-2 edge cases per input field.
+
+### 2. Hard Cases (HIGH priority)
+
+Inputs that are valid but expose known prompt weaknesses or require nuanced reasoning.
+
+| What to ask | Examples |
+|---|---|
+| Ambiguous inputs | "What does this mean?" (multiple valid answers) |
+| Adversarial inputs | phrasing designed to trigger hallucination or refusal |
+| Domain traps | technical jargon with multiple meanings in context |
+| Instruction conflicts | input that triggers contradictory rules in the prompt |
+| Near-miss inputs | almost correct format but slightly off |
+
+**Collect at minimum**: 2-3 hard cases total.
+
+### 3. Representative Cases (MEDIUM priority)
+
+Typical production inputs -- what the prompt handles 80% of the time.
+
+| What to ask | Examples |
+|---|---|
+| Common use cases | most frequent user inputs |
+| Standard formats | well-formed, expected-length, standard vocabulary |
+| Baseline quality | inputs where the prompt should succeed reliably |
+
+**Collect after** hard and edge cases are covered.
+
+### 4. Adversarial Cases (LOW priority, if relevant)
+
+Inputs designed to test security / safety / guardrails.
+
+| What to ask | Examples |
+|---|---|
+| Prompt injection attempts | "Ignore previous instructions and..." |
+| Off-topic requests | completely unrelated to the prompt's domain |
+| Jailbreak patterns | attempts to bypass constraints |
+
+Only collect if the prompt has explicit safety constraints.
+
+---
+
+## Format Requirements
+
+Every dataset item MUST have:
+
+\`\`\`json
+{
+  "input": {
+    "<inputSchema_field_1>": "<value>",
+    "<inputSchema_field_2>": "<value>"
+  },
+  "expectedOutput": {
+    "<outputSchema_field_1>": "<expected_value>",
+    "<outputSchema_field_2>": "<expected_value>"
+  }
+}
+\`\`\`
+
+Rules:
+- \`input\` keys MUST match the prompt's \`inputSchema.properties\` exactly (no extras, no missing required fields).
+- \`expectedOutput\` keys MUST match the prompt's \`outputSchema.properties\`.
+- String values in \`expectedOutput\` should be the verbatim correct answer (not a description of it).
+- Numeric scores in \`expectedOutput\` should match what the evaluator would award for a perfect response.
+- Upload as a JSON array: \`[{item1}, {item2}, ...]\`
+
+### Minimum dataset size
+
+- **5 items minimum** for any optimization run.
+- **At least 2 items** must be hard or edge cases.
+- More items = better signal, especially for per-criterion scoring.
+- \`mutagent prompts dataset add --help\` has the upload command flags.
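The format rules and size minimums above lend themselves to a mechanical pre-upload check. A minimal TypeScript sketch, assuming the schema shape described above (`inputSchema.properties` / `outputSchema.properties`); this is not the CLI's own validation:

```ts
// Sketch of a pre-upload check against the rules above. Not the real CLI validator.
interface DatasetItem {
  input: Record<string, unknown>;
  expectedOutput?: Record<string, unknown>;
}

function checkItems(items: DatasetItem[], inputKeys: string[], outputKeys: string[]): string[] {
  const problems: string[] = [];
  if (items.length < 5) problems.push("need at least 5 items for an optimization run");
  for (const [i, item] of items.entries()) {
    // input keys must match inputSchema.properties exactly
    const extra = Object.keys(item.input).filter((k) => !inputKeys.includes(k));
    const missing = inputKeys.filter((k) => !(k in item.input));
    if (extra.length) problems.push(`item ${i}: extra input keys: ${extra.join(", ")}`);
    if (missing.length) problems.push(`item ${i}: missing input keys: ${missing.join(", ")}`);
    // expectedOutput keys must match outputSchema.properties
    if (item.expectedOutput) {
      const bad = Object.keys(item.expectedOutput).filter((k) => !outputKeys.includes(k));
      if (bad.length) problems.push(`item ${i}: unknown expectedOutput keys: ${bad.join(", ")}`);
    }
  }
  return problems;
}
```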
+
+---
+
+## Anti-patterns
+
+| Anti-pattern | Why it's bad | Fix |
+|---|---|---|
+| All easy cases (prompt always succeeds) | No failure signal for optimizer | Add hard/edge cases first |
+| No edge cases | Optimizer never sees boundary behavior | Ask user about failure modes |
+| Fictional inputs that won't happen in production | Optimization targets unrealistic scenarios | Anchor to real usage patterns |
+| Missing \`expectedOutput\` on labelable items | Optimizer uses LLM judgment alone (noisy) | Ask user for expected outputs |
+| Duplicate items | Wastes dataset budget, skews scores | Check for duplicates before upload |
+| Items that are identical to training data | May overfit | Include diverse failure modes |
+
+---
+
+## Cross-references
+
+- [SKILL.md](../SKILL.md) -- 5 rules + journey router
+- [workflows/dataset-curation.md](../workflows/dataset-curation.md) -- standalone dataset curation workflow (HOW; this file is WHY)
+- [workflows/optimization.md](../workflows/optimization.md) -- full loop that includes dataset add step
+- [concepts/eval-criteria.md](./eval-criteria.md) -- parallel concept doc for evaluation criteria
+
+---
+
+## CLI commands
+
+\`\`\`bash
+# Discovery (no LLM cost)
+mutagent prompts dataset --help # list dataset subcommands
+mutagent prompts dataset add --help # read flags before first use (Rule 2)
+mutagent prompts dataset list <prompt-id> --json # list datasets attached to a prompt
+mutagent prompts dataset get <dataset-id> --json # inspect single dataset's items + metadata
+
+# Creation -- guided (no LLM cost; just storage)
+mutagent prompts dataset add <prompt-id> --guided --json # get _directive.askUserQuestions (per-field collection)
+mutagent prompts dataset add <prompt-id> -d '<json>' --name "<name>" --json # upload items
+# -d accepts inline JSON OR @path/to/file.json OR - (stdin)
+
+# Mutations
+mutagent prompts dataset update <dataset-id> -d '<json>' --json # replace items in existing dataset
+mutagent prompts dataset delete <dataset-id> --json # delete dataset (idempotent; --force skips confirm)
+\`\`\`
+
+**Flag glossary** (dataset-specific):
+- \`--guided\` -- emit per-field \`askUserQuestions\` directive instead of expecting \`-d\` upfront. Use when collecting from user.
+- \`-d <json>\` / \`--data <json>\` -- supply items payload inline. Accepts: inline JSON, \`@path\` (read from file), \`-\` (read from stdin).
+- \`--name "<name>"\` -- human-readable label for the dataset (shows in dashboard).
+- \`--json\` -- structured output (Rule 1: always use). Returns \`_directive\` + \`_links\`.
+
+**Cost note**: dataset creation/edit/delete commands incur ZERO LLM cost. They are pure storage operations against the platform API. LLM cost is only incurred when \`mutagent prompts optimize start\` runs the exec model against these dataset items.
 \`,
 "concepts/eval-criteria.md": \`---
 name: mutagent-cli-concepts-eval-criteria
@@ -10079,6 +10524,37 @@ the output to collect rubrics in the correct order.
 - [workflows/optimization.md](../workflows/optimization.md) → steps 7-9 (where this concept is applied)
 - [concepts/prompt-variables.md](./prompt-variables.md) → delimiter inference (used in MVC step)
 - \`mutagent/src/modules/prompts/prompt-evaluations/README.md\` → BE mirror
+
+---
+
+## CLI commands
+
+\`\`\`bash
+# Discovery (no LLM cost)
+mutagent prompts evaluation --help # list eval subcommands
+mutagent prompts evaluation create --help # read flags before first use (Rule 2)
+mutagent prompts evaluation list <prompt-id> --json # list existing evaluations on a prompt
+mutagent prompts evaluation get <eval-id> --json # inspect single evaluation's criteria + metadata
+
+# Creation -- guided (no LLM cost; just storage)
+mutagent prompts evaluation create <prompt-id> --guided --json # get _directive.askUserQuestions + decisionTree (per-field collection)
+mutagent prompts evaluation create <prompt-id> -d '<json>' --name "<name>" --json # upload criteria
+# -d accepts inline JSON OR @path/to/file.json OR - (stdin)
+
+# Mutations
+mutagent prompts evaluation update <eval-id> -d '<json>' --json # update existing criteria
+mutagent prompts evaluation delete <eval-id> --json # delete evaluation (idempotent; --force skips confirm)
+\`\`\`
+
+**Flag glossary** (eval-specific):
+- \`--guided\` -- emit per-field \`askUserQuestions\` directive instead of expecting \`-d\` upfront.
+- \`-d <json>\` / \`--data <json>\` -- supply criteria payload inline. Accepts: inline JSON, \`@path\` (file), \`-\` (stdin).
+- \`--name "<name>"\` -- human-readable label (shows in dashboard).
+- \`--json\` -- structured output (Rule 1: always use). Returns \`_directive\` + \`_links\` + \`_compat\`.
+
+**Cost note**: eval creation/edit/delete commands incur ZERO LLM cost. Pure storage operations. LLM cost is incurred only when \`mutagent prompts optimize start\` runs the judge model against this evaluation.
+
+**Workflow cross-link**: for the standalone HOW (step-by-step CLI sequence), see [workflows/eval-creation.md](../workflows/eval-creation.md).
 \`,
 "concepts/prompt-variables.md": \`---
 name: mutagent-cli-concepts-prompt-variables
@@ -10235,6 +10711,36 @@ Use the delimiter field to:
 - [concepts/eval-criteria.md](./eval-criteria.md) → MVC (Minimum Viable Context) — uses delimiter to enumerate input params
 - Source: \`mutagent-cli/src/lib/explorer.ts\` → \`inferPromptVariables()\` and \`DiscoveredPrompt.delimiter\`
 - Tests: \`mutagent-cli/src/__tests__/lib/explorer.test.ts\`
+
+---
+
+## CLI commands
+
+\`\`\`bash
+# Discovery (no LLM cost, read-only)
+mutagent explore --help # read flags before first use (Rule 2)
+mutagent explore --json # scans codebase; emits "delimiter" field per prompt
+mutagent prompts get <id> --json # inspect uploaded prompt's stored form (incl. delimiter)
+
+# Creation (no LLM cost, just storage)
+mutagent prompts create --help # read brace format rules before creating
+mutagent prompts create --name "<name>" --raw "<prompt>" --json # upload prompt (use single-brace {var} convention)
+mutagent prompts create --name "<name>" --raw-file <path> --json # upload from file (preferred for multi-line prompts)
+
+# Mutations
+mutagent prompts update <id> --raw "<prompt>" --json # replace stored prompt body
+mutagent prompts delete <id> --json # delete prompt (idempotent; --force skips confirm)
+\`\`\`
+
+**Flag glossary** (prompt-create-specific):
+- \`--raw "<text>"\` -- inline prompt body. Use for short single-line prompts.
+- \`--raw-file <path>\` -- read prompt body from file. Preferred for multi-line / templated prompts; preserves whitespace.
+- \`--name "<name>"\` -- human-readable label (shows in dashboard + explore output).
+- \`--json\` -- structured output (Rule 1: always use). Returns \`_directive\` (status_card) + \`_links\` + \`_compat\`.
+
+**Cost note**: prompt creation/edit/delete commands incur ZERO LLM cost. Pure storage. Only \`mutagent prompts optimize start\` and \`mutagent prompts playground\` (interactive testing) incur LLM cost.
+
+**Brace convention reminder**: use single-brace \`{var}\` for variables you'll later supply via dataset items. Use double-brace \`{{literal}}\` only when you need a literal \`{var}\` substring rendered (rare). The CLI's \`mutagent explore\` infers delimiter automatically -- see § "Delimiter inference" above for the rules.
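A rough TypeScript sketch of the single- vs double-brace convention just described. This only illustrates the convention; the CLI's real `inferPromptVariables()` in `lib/explorer.ts` may behave differently:

```ts
// Sketch of the brace convention only -- not the real inferPromptVariables().
// Matches {var} but skips {{literal}} by requiring no brace on either side.
function extractVariables(prompt: string): string[] {
  const vars = new Set<string>();
  for (const m of prompt.matchAll(/(?<!\{)\{([A-Za-z_][A-Za-z0-9_]*)\}(?!\})/g)) {
    vars.add(m[1]);
  }
  return [...vars];
}

// {{json}} is a literal, so only "ticket" is a variable:
extractVariables("Summarize {ticket} as {{json}} with fields from {ticket}"); // ["ticket"]
```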
 \`,
 "concepts/scorecard-output.md": \`---
 name: mutagent-cli-concepts-scorecard-output
@@ -10456,6 +10962,47 @@ rl.on('line', (line) => {
 defined; \`criteria[].name\` in \`ScorecardData\` maps to \`name\` in the rubric.
 - [workflows/optimization.md](../workflows/optimization.md) — full optimization
 loop; the scorecard is produced at Step 8 (watch) and Step 9 (results).
+
+---
+
+## CLI commands
+
+\`\`\`bash
+# Discovery (no LLM cost)
+mutagent prompts optimize --help # list optimize subcommands
+mutagent prompts optimize start --help # read flags before first use (Rule 2)
+
+# \uD83D\uDCB0 LLM COST -- requires usage check (Rule 5) + provider catalog check (Rule 6)
+mutagent prompts optimize start <id> --dataset <d> --evaluation <e> --json
+# start job (cost = exec_model × items × iterations
+# + judge_model × items × iterations)
+mutagent prompts optimize start <id> --dataset <d> --evaluation <e> --watch --json
+# start + stream NDJSON events to stdout
+
+# Polling / watching (no LLM cost; just reads job state)
+mutagent prompts optimize status <job-id> --json # poll progress snapshot (includes bestScore)
+mutagent prompts optimize watch <job-id> --json # attach to running job (NDJSON stream)
+mutagent prompts optimize results <job-id> --json # full scorecard after completion (emits verbatim card)
+mutagent prompts optimize results <job-id> --diff --json # view prompt diff (no apply)
+
+# Mutation (no LLM cost itself; modifies stored prompt)
+mutagent prompts optimize results <job-id> --apply --json # apply optimized prompt -> updates stored version
+# (irreversible without manual revert via prompts update)
+\`\`\`
+
+**Flag glossary** (optimize-specific):
+- \`--dataset <d>\` -- dataset ID (from \`prompts dataset list\`). Items run through both exec and judge models.
+- \`--evaluation <e>\` -- evaluation ID (from \`prompts evaluation list\`). Drives the judge model's scoring rubric.
+- \`--watch\` -- after start, stream NDJSON events instead of returning immediately. Equivalent to \`start\` then \`watch\`.
+- \`--max-iterations N\` -- bound the optimizer loop. **Defaults to 1**; never raise without explicit user consent (each iteration = full eval × dataset round-trip).
+- \`--exec-model <model>\` / \`--eval-model <model>\` -- override defaults. Validate first via \`mutagent providers list --models --json\` (Rule 6).
+- \`--apply\` -- write optimized prompt back to stored version. Cannot be undone via flag; use \`prompts update\` to revert.
+- \`--diff\` -- view before/after diff without applying.
+- \`--json\` -- structured output (Rule 1: always use). Returns \`_directive\` + \`_links\` + \`_compat\`.
+
+**Cost note**: \`optimize start\` is the ONLY cost-incurring command in this family. Always run \`mutagent usage --json\` first (Rule 5) to surface remaining quota; show the result to the user; require explicit confirmation. The \`--max-iterations\` default of 1 keeps cost bounded.
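The cost formula in the block above reduces to simple call-count arithmetic. A TypeScript sketch of that arithmetic; it counts model calls only, since actual spend depends on per-model token pricing:

```ts
// Sketch of the formula above:
// cost = exec_model × items × iterations + judge_model × items × iterations.
function estimateModelCalls(items: number, iterations: number): { exec: number; judge: number } {
  return { exec: items * iterations, judge: items * iterations };
}

// 7 dataset items at the default --max-iterations 1 -> 7 exec calls + 7 judge calls.
const calls = estimateModelCalls(7, 1);
```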
+
+**Verbatim card protocol**: \`optimize start\`, \`optimize status\`, and \`optimize results\` all emit \`_directive.renderedCard\` -- the agent MUST echo the rendered card verbatim into chat before any next action (per SKILL.md § "MANDATORY: Verbatim Card Display Protocol"). The card also auto-echoes to stderr via \`echoDirectiveToStderr\`, but bash blocks may be collapsed in the user's UI.
 \`,
 "workflows/agents.md": \`---
 name: mutagent-cli-workflows-agents
@@ -10579,6 +11126,35 @@ Do NOT try to extract the whole agent loop at once.

 ---

+## CLI commands
+
+\`\`\`bash
+# Discovery (no LLM cost, read-only)
+mutagent explore --json # detect agents[] in codebase via taxonomy classifier
+mutagent agents --help # list available agent subcommands (CRUD + WIP banner)
+mutagent agents list --json # CRUD: list registered agents
+mutagent agents get <id> --json # CRUD: inspect single agent (config + metadata)
+
+# Mutations (no LLM cost; just storage)
+mutagent agents create --name "<name>" --json # register a new agent
+mutagent agents update <id> --json # update agent config
+mutagent agents delete <id> --json # delete agent (idempotent; --force skips confirm)
+
+# NOT YET AVAILABLE -- shows AGENTS_WIP_BANNER if attempted
+mutagent agents optimize <id> # WIP -- tracked separately; see partnership link below
+\`\`\`
+
+**Flag glossary** (agent-specific):
+- \`--name "<name>"\` -- human-readable label (shows in dashboard).
+- \`--force\` -- skip interactive confirmation on delete (auto-skipped in \`--json\` mode).
+- \`--json\` -- structured output (Rule 1: always use). Returns \`_directive\` (status_card) + \`_links\` + \`_compat\`.
+
+**Cost note**: all current \`mutagent agents *\` commands are CRUD (zero LLM cost). Agent optimization (\`mutagent agents optimize\`) is NOT yet available -- when shipped it will incur LLM cost similar to \`prompts optimize start\`. Current behavior on \`mutagent agents optimize\`: returns \`AGENTS_WIP_BANNER\` directive pointing to the partnership link.
+
+**Partnership link**: <https://www.mutagent.io/agents-partnership> -- for early access to multi-turn / tool-calling agent optimization.
+
+---
+
 ## Cross-references

 - [SKILL.md](../SKILL.md) → 5 rules + journey router
@@ -10586,6 +11162,408 @@ Do NOT try to extract the whole agent loop at once.
 - [workflows/optimization.md](./optimization.md) → branch (c) destination
 - [concepts/prompt-variables.md](../concepts/prompt-variables.md) → \`{foo}\` vs \`{{foo}}\` for sub-prompt extraction
 - Partnership link: https://www.mutagent.io/agents-partnership
+\`,
+"workflows/dataset-curation.md": \`---
+name: mutagent-cli-workflows-dataset-curation
+description: |
+  Standalone dataset curation workflow. Use when the user wants to create
+  or expand a dataset WITHOUT running the full optimization loop.
+  Covers guided and manual dataset creation, per-field question collection,
+  hard-cases-first priority, and upload via CLI.
+  Cross-linked from workflows/optimization.md dataset step.
+triggers:
+- "create dataset"
+- "add examples"
+- "test cases"
+- "edge cases"
+- "hard cases"
+- "expand dataset"
+- "dataset items"
+- "curate dataset"
+- "build dataset"
+- "dataset curation"
+---
+
+# Workflow -- Dataset Curation (Standalone)
+
+> **When to use this workflow vs optimization.md**:
+>
+> Use THIS workflow when the user wants ONLY to create or expand a dataset,
+> without immediately running optimization. Common signals:
+> - "I want to add more test cases"
+> - "Let's build a dataset for this prompt"
+> - "Add some edge cases"
+> - "Expand the existing dataset"
+>
+> Use [workflows/optimization.md](./optimization.md) when the user wants the
+> full loop: create prompt -> dataset -> eval -> optimize. That workflow has
+> an inline dataset step that cross-links back here.
+
+Read the **5 rules** in [SKILL.md](../SKILL.md) before executing.
+
+---
+
+## When this workflow applies
+
+- User explicitly wants to curate/build a dataset (no immediate optimization intent)
+- User wants to add hard cases / edge cases to an existing dataset
+- User wants to understand what good dataset items look like before committing to optimization
+- User has a prompt uploaded already and wants to build test coverage
+
+---
+
+## Required pre-read
+
+Load [concepts/dataset-design.md](../concepts/dataset-design.md) before collecting items.
+It defines:
+- The Golden Rule (hard cases first)
+- 4 case categories (Edge / Hard / Representative / Adversarial)
+- Format requirements (\`input\` + \`expectedOutput\` shape)
+- Anti-patterns to avoid
+
+The CLI's \`--guided\` directive also contains a self-sufficient inline version of these
+rules in \`_directive.instruction\` -- safe to execute even without the Skill loaded.
+
+---
+
+## Workflow steps
+
+\`\`\`
+1. mutagent explore --json
+   -> confirm which prompt you're building a dataset for
+   -> show command output to user
+   -> ask: "Which prompt would you like to build a dataset for?"
+
+2. mutagent prompts get <prompt-id> --json
+   -> inspect inputSchema + outputSchema fields
+   -> understand what input and output shapes look like
+
+3. mutagent prompts dataset add --help
+   -> read flags (Rule 2: always --help before first use)
+
+4. mutagent prompts dataset add <prompt-id> --guided --json
+   -> CLI returns _directive.askUserQuestions with per-field questions
+   -> follow the instruction in _directive.instruction
+
+5. For EACH question in _directive.askUserQuestions:
+   -> use AskUserQuestion to collect the answer from the user
+   -> prioritize hard/edge case questions first (they come first in the list)
+   -> do NOT skip any question
+   -> do NOT auto-fill answers
+
+6. Construct dataset items from collected answers:
+   -> format: [{"input": {...}, "expectedOutput": {...}}, ...]
+   -> minimum 5 items; at least 2 must be hard/edge cases
+   -> verify all input keys match promptSchema.inputSchema.properties
+   -> verify all expectedOutput keys match promptSchema.outputSchema.properties
+
+7. Ask user to review the constructed items before upload:
+   "Here are the N dataset items I constructed. Review them before upload?"
+   -> show items in a readable format
+   -> accept corrections
+
+8. mutagent prompts dataset add <prompt-id> -d '[...]' --name "<name>" --json
+   -> upload the reviewed items
+   -> show command output to user (confirm datasetId)
+   -> record datasetId in .mutagent/mutation-context.md
+
+9. Ask: "What would you like to do next?"
+   -> Option A: Add more items (loop back to step 4)
+   -> Option B: Create an evaluation -> route to evaluation create --guided
+   -> Option C: Start optimization -> route to workflows/optimization.md step 10
+   -> Option D: Done
+\`\`\`
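Since `-d` accepts `-` (read from stdin), the items constructed in step 6 can be piped in rather than inlined. A small TypeScript sketch; the items themselves are illustrative and reuse the `field1`/`result` placeholder names from the template above:

```ts
// Sketch: emit constructed items as JSON on stdout so they can be piped into
//   <this script> | mutagent prompts dataset add <prompt-id> -d - --name "<name>" --json
// The `-d -` stdin form is documented in the CLI commands section; items are examples only.
const items = [
  { input: { field1: "" }, expectedOutput: { result: "reject: empty input" } }, // edge case
  { input: { field1: "ambiguous phrasing" }, expectedOutput: { result: "ask for clarification" } }, // hard case
];

process.stdout.write(JSON.stringify(items));
```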
+
+---
+
+## Guided mode output shape
+
+\`mutagent prompts dataset add <prompt-id> --guided --json\` returns:
+
+\`\`\`json
+{
+  "promptId": "...",
+  "promptName": "...",
+  "schemaFields": { "input": ["field1", "field2"], "output": ["result"] },
+  "suggestedCategories": [
+    { "name": "Edge Cases", "description": "...", "priority": "high" },
+    { "name": "Hard Cases", "description": "...", "priority": "high" },
+    { "name": "Representative Cases", "description": "...", "priority": "medium" }
+  ],
+  "templateItem": {
+    "input": { "field1": "<value>", "field2": "<value>" },
+    "expectedOutput": { "result": "<expected>" }
+  },
+  "guidance": {
+    "minItems": 5,
+    "priorityRule": "Hard cases that expose prompt weaknesses > easy cases that always pass",
+    "steps": [...]
+  },
+  "_directive": {
+    "instruction": "...", // self-sufficient conceptual rules (bootstrappable without Skill)
+    "next": ["mutagent prompts dataset add <id> -d '<json>' --name '<name>' --json"],
+    "askUserQuestions": [ // inside _directive (not a sibling)
+      { "field": "_general", "question": "What are the hardest inputs for this prompt?" },
+      { "field": "_edge_cases", "question": "What edge cases have caused failures?" },
+      { "field": "field1", "source": "inputSchema", "question": "What values should 'field1' have?" },
+      ...
+    ]
+  },
+  "_compat": { "cliVersion": "...", "skillVersion": "...", "skillMinCliVersion": "..." }
+}
+\`\`\`
+
+Key: \`askUserQuestions\` is inside \`_directive\` (not a top-level sibling). Parse \`_directive.askUserQuestions\`.
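A minimal TypeScript sketch of consuming the shape above: parse the CLI's `--json` stdout and walk `_directive.askUserQuestions` in order. The sample stdout string is abridged from the example above:

```ts
interface GuidedQuestion { field: string; source?: string; question: string }
interface GuidedOutput { _directive?: { askUserQuestions?: GuidedQuestion[] } }

// Abridged sample of the guided output documented above.
const cliStdout =
  '{"_directive":{"askUserQuestions":[{"field":"_general","question":"What are the hardest inputs for this prompt?"}]}}';

function questionsFrom(stdout: string): GuidedQuestion[] {
  const parsed = JSON.parse(stdout) as GuidedOutput;
  // Key point: askUserQuestions lives INSIDE _directive, not as a top-level sibling.
  return parsed._directive?.askUserQuestions ?? [];
}

for (const q of questionsFrom(cliStdout)) {
  console.log(q.question); // ask the user each question verbatim; never skip or auto-fill
}
```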
+
+---
+
+## Cost control
+
+Dataset curation has NO LLM cost on its own -- it's a pure storage operation.
+Only \`mutagent prompts optimize start\` incurs LLM cost. Safe to run freely.
+
+---
+
+## Common pitfalls
+
+For the canonical anti-pattern list (WHY each is bad + how to fix), see [concepts/dataset-design.md](../concepts/dataset-design.md) § Anti-patterns. Workflow-specific execution mistakes:
+
+- **Uploading items with wrong field names** -> schema mismatch error from optimizer (not caught by concept-level rules)
+- **Forgetting to ask the user to review before upload** -> user can't correct mistakes (workflow step 7)
+- **Skipping the explore step** -> uploading to the wrong prompt (workflow step 1)
+
+---
+
+## Cross-references
+
+- [SKILL.md](../SKILL.md) -- 5 rules + journey router
+- [concepts/dataset-design.md](../concepts/dataset-design.md) -- Golden Rule, case categories, format requirements, anti-patterns (WHY; this file is HOW)
+- [workflows/optimization.md](./optimization.md) -- full loop; dataset step cross-links here
+- [workflows/eval-creation.md](./eval-creation.md) -- parallel workflow doc (for evaluation side)
+- [concepts/prompt-variables.md](../concepts/prompt-variables.md) -- brace convention (for input field values)
+
+---
+
+## CLI commands
+
+\`\`\`bash
+# Workflow execution sequence (commands appear inline in steps above; this is a quick reference)
+mutagent explore --json # step 1: discover prompts
+mutagent prompts get <prompt-id> --json # step 2: inspect schemas
+mutagent prompts dataset add --help # step 3: read flags (Rule 2)
+mutagent prompts dataset add <prompt-id> --guided --json # step 4: get _directive.askUserQuestions
+mutagent prompts dataset add <prompt-id> -d '<json>' --name "<name>" --json # step 8: upload reviewed items
+\`\`\`
+
+For the full flag glossary + cost notes, see [concepts/dataset-design.md](../concepts/dataset-design.md) § CLI commands.
+\`,
+"workflows/eval-creation.md": \`---
+name: mutagent-cli-workflows-eval-creation
+description: |
+  Standalone evaluation rubric creation workflow. Use when the user wants to
+  define eval criteria for a prompt WITHOUT immediately running optimization.
+  Covers guided per-field rubric collection (INPUT MVC + OUTPUT Standards),
+  full-depth granular rubrics, and upload via CLI.
+  Cross-linked from workflows/optimization.md eval step.
+triggers:
+- "create evaluation"
+- "create rubric"
+- "evaluate prompt"
+- "evaluation criteria"
+- "rubric design"
+- "MVC"
+- "Output Standards"
+- "score this prompt"
+- "judge this prompt"
+- "eval guided"
+---
+
+# Workflow -- Evaluation Creation (Standalone)
+
+> **When to use this workflow vs optimization.md**:
+>
+> Use THIS workflow when the user wants ONLY to define an evaluation rubric for
+> a prompt, without immediately running optimization. Common signals:
+> - "I want to score this prompt"
+> - "Let's define eval criteria for this prompt"
+> - "Create a rubric for this prompt"
+> - "How should we judge this prompt's outputs?"
+>
+> Use [workflows/optimization.md](./optimization.md) when the user wants the
+> full loop: create prompt -> dataset -> eval -> optimize. That workflow has
+> an inline eval step that cross-links back here.
+
+Read the **5 rules** in [SKILL.md](../SKILL.md) before executing.
+
+---
+
+## When this workflow applies
+
+- User explicitly wants to create or edit an evaluation rubric (no immediate optimization intent)
+- User wants to add criteria to an existing prompt (with or without an existing dataset)
+- User wants to understand how rubric design works before committing to optimization
+- User has a prompt uploaded already and wants quality scoring before iteration
+
+---
+
+## Required pre-read
+
+Load [concepts/eval-criteria.md](../concepts/eval-criteria.md) before collecting criteria.
+It defines:
+- The Golden Rule (INPUT MVC vs OUTPUT Standards split)
+- 6-tier MVC anchor framework for INPUT-scoped criteria
+- Output Standards format for OUTPUT-scoped criteria
+- Format requirements (\`name\`, \`description\`, \`evaluationParameter\`)
+- Anti-patterns to avoid
+
+The CLI's \`--guided\` directive also contains a self-sufficient inline version of these
+rules in \`_directive.instruction\` -- safe to execute even without the Skill loaded.
+
+---
+
+## Workflow steps
+
+\`\`\`
+1. mutagent explore --json
+   -> confirm which prompt you're creating an evaluation for
+   -> show command output to user
+   -> ask: "Which prompt would you like to evaluate?"
+
+2. mutagent prompts get <prompt-id> --json
+   -> inspect inputSchema + outputSchema fields
+   -> understand what input parameters and output shape look like
+   -> these drive the per-field rubric collection in step 4
+
+3. mutagent prompts evaluation create --help
+   -> read flags (Rule 2: always --help before first use)
+
+4. mutagent prompts evaluation create <prompt-id> --guided --json
+   -> CLI returns _directive.askUserQuestions with per-field questions
+   -> follow the instruction in _directive.instruction
+   -> the instruction inlines the INPUT MVC vs OUTPUT Standards framing
+      (bootstrappable -- works even if concepts/eval-criteria.md isn't loaded)
+
+5. For EACH question in _directive.askUserQuestions:
+   -> use AskUserQuestion to collect the answer from the user
+   -> INPUT-scoped fields (source: "inputSchema") -> ask MVC rubric:
+      what's minimum viable context the input MUST contain?
+   -> OUTPUT-scoped fields (source: "outputSchema") -> ask Output Standards:
+      what does correct vs incorrect look like for this field?
+   -> do NOT skip any field
+   -> do NOT auto-fill answers (Rule 3: never auto-generate criteria)
+
+6. Construct rubric items from collected answers:
+   -> format: [{"name": "...", "description": "...", "evaluationParameter": "..."}, ...]
+   -> one rubric per schema field unless user opts to merge fields
+   -> use 6-tier full-depth descriptions for complex INPUT criteria
+      (see concepts/eval-criteria.md for examples)
+   -> use simpler 2-3-tier descriptions for OUTPUT correctness criteria
+
+7. Ask user to review the constructed criteria before upload:
+   "Here are the N evaluation criteria I drafted. Review before upload?"
+   -> show criteria in a readable format (table or numbered list)
+   -> accept corrections; loop step 5-7 if user wants edits
+
+8. mutagent prompts evaluation create <prompt-id> -d '<json>' --name "<name>" --json
+   -> upload the reviewed criteria
+   -> show command output to user (confirm evaluationId)
+   -> record evaluationId in .mutagent/mutation-context.md
+
+9. Ask: "What would you like to do next?"
+   -> Option A: Add more criteria (loop back to step 4)
+   -> Option B: Add a dataset -> route to workflows/dataset-curation.md
+   -> Option C: Start optimization -> route to workflows/optimization.md step 10
+   -> Option D: Done
+\`\`\`
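A TypeScript sketch of the rubric payload built in step 6. The field names (`name`, `description`, `evaluationParameter`) come from the format line above; the values are illustrative, and the allowed `evaluationParameter` values are defined in concepts/eval-criteria.md:

```ts
// Sketch of the step-6 criteria payload. Values are examples only; the
// descriptions must be collected from the user (Rule 3), never auto-generated.
interface EvalCriterion {
  name: string;
  description: string;
  evaluationParameter: string; // allowed values per concepts/eval-criteria.md
}

const criteria: EvalCriterion[] = [
  {
    name: "field1-mvc",
    description: "INPUT MVC anchors for 'field1', as stated by the user", // user-supplied
    evaluationParameter: "field1", // hypothetical; mirrors the schema field it scores
  },
];

// Serialize for step 8:
//   mutagent prompts evaluation create <prompt-id> -d '<json>' --name "<name>" --json
const payload = JSON.stringify(criteria);
```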
|
|
11478
|
+
|
|
11479
|
+
---
|
|
11480
|
+
|
|
11481
|
+
## Guided mode output shape
|
|
11482
|
+
|
|
11483
|
+
\`mutagent prompts evaluation create <prompt-id> --guided --json\` returns:
|
|
11484
|
+
|
|
11485
|
+
\`\`\`json
|
|
11486
|
+
{
|
|
11487
|
+
"promptId": "...",
|
|
11488
|
+
"promptName": "...",
|
|
11489
|
+
"schemaFields": { "input": ["field1", "field2"], "output": ["result"] },
|
|
11490
|
+
"_directive": {
|
|
11491
|
+
"instruction": "...", // self-sufficient INPUT MVC + OUTPUT Standards rules (bootstrappable)
|
|
11492
|
+
"next": ["mutagent prompts evaluation create <id> -d '<json>' --name '<name>' --json"],
|
|
11493
|
+
"decisionTree": {
|
|
11494
|
+
"step1": "Confirm input parameters with the user via _directive.askUserQuestions...",
|
|
11495
|
+
"step2": "Define correctness criteria for EVERY field..."
|
|
11496
|
+
},
|
|
11497
|
+
"askUserQuestions": [ // inside _directive (not a sibling)
|
|
11498
|
+
{ "field": "field1", "source": "inputSchema", "question": "What MVC anchors define minimum viable context for 'field1'?" },
|
|
11499
|
+
{ "field": "result", "source": "outputSchema", "question": "What does a correct 'result' look like vs incorrect?" },
|
|
11500
|
+
...
|
|
11501
|
+
]
|
|
11502
|
+
},
|
|
11503
|
+
"_compat": { "cliVersion": "...", "skillVersion": "...", "skillMinCliVersion": "..." }
|
|
11504
|
+
}
|
|
11505
|
+
\`\`\`
|
|
11506
|
+
|
|
11507
|
+
Key: \`askUserQuestions\` is inside \`_directive\` (not a top-level sibling). Parse \`_directive.askUserQuestions\`. The \`decisionTree\` field guides multi-step branching.
|
|
11508
|
+
|
|
11509
|
+
---
|
|
11510
|
+
|
|
11511
|
+
## Cost control
|
|
11512
|
+
|
|
11513
|
+
Eval creation has NO LLM cost on its own -- it's a pure storage operation.
|
|
11514
|
+
Only \`mutagent prompts optimize start\` incurs LLM cost (judge model + exec model
|
|
11515
|
+
multiplied by dataset items × iterations). Safe to create/edit eval criteria freely.
|
|
11516
|
+
|
|
11517
|
+
---
|
|
11518
|
+
|
|
11519
|
+
## Common pitfalls
|
|
11520
|
+
|
|
11521
|
+
For the canonical anti-pattern list, see [concepts/eval-criteria.md](../concepts/eval-criteria.md) § Anti-patterns. Workflow-specific execution mistakes:
|
|
11522
|
+
|
|
11523
|
+
- **Skipping per-field collection** -> rubric incomplete; optimizer scores against a sparse signal
|
|
11524
|
+
- **Auto-filling answers from context** instead of asking user -> rubric reflects the agent's assumptions, not the user's domain knowledge (Rule 3 violation)
|
|
11525
|
+
- **Merging input + output criteria into one** -> loses INPUT MVC vs OUTPUT Standards distinction; harder to interpret per-criterion scores
|
|
11526
|
+
- **Uploading without user review** -> user can't catch misinterpretations of their domain
|
|
11527
|
+
- **Wrong \`evaluationParameter\` value** -> server rejects with schema validation error
|
|
11528
|
+
|
|
11529
|
+
---
|
|
11530
|
+
|
|
11531
|
+
## Cross-references
|
|
11532
|
+
|
|
11533
|
+
- [SKILL.md](../SKILL.md) -- 5 rules + journey router
|
|
11534
|
+
- [concepts/eval-criteria.md](../concepts/eval-criteria.md) -- Golden Rule, MVC/Output Standards, format requirements, anti-patterns
|
|
11535
|
+
- [workflows/optimization.md](./optimization.md) -- full loop; eval step cross-links here
|
|
11536
|
+
- [workflows/dataset-curation.md](./dataset-curation.md) -- parallel workflow doc (for dataset side)
|
|
11537
|
+
- [concepts/scorecard-output.md](../concepts/scorecard-output.md) -- how eval scores surface in optimization scorecard
|
|
11538
|
+
|
|
11539
|
+
---
|
|
11540
|
+
|
|
11541
|
+
## CLI commands
|
|
11542
|
+
|
|
11543
|
+
\`\`\`bash
|
|
11544
|
+
# Discovery (no LLM cost)
|
|
11545
|
+
mutagent prompts evaluation --help # list eval subcommands
|
|
11546
|
+
mutagent prompts evaluation create --help # read flags before first use (Rule 2)
|
|
11547
|
+
mutagent prompts evaluation list <prompt-id> --json # list existing evaluations on a prompt
|
|
11548
|
+
mutagent prompts evaluation get <eval-id> --json # inspect single evaluation's criteria + metadata
|
|
11549
|
+
|
|
11550
|
+
# Creation -- guided (no LLM cost; just storage)
|
|
11551
|
+
mutagent prompts evaluation create <prompt-id> --guided --json # get _directive.askUserQuestions + decisionTree (per-field collection)
|
|
11552
|
+
mutagent prompts evaluation create <prompt-id> -d '<json>' --name "<name>" --json # upload criteria
|
|
11553
|
+
# -d accepts inline JSON OR @path/to/file.json OR - (stdin)
|
|
11554
|
+
|
|
11555
|
+
# Mutations
|
|
11556
|
+
mutagent prompts evaluation update <eval-id> -d '<json>' --json # update existing criteria
|
|
11557
|
+
mutagent prompts evaluation delete <eval-id> --json # delete evaluation (idempotent; --force skips confirm)
|
|
11558
|
+
\`\`\`
|
|
11559
|
+
|
|
11560
|
+
**Flag glossary** (eval-specific):
|
|
11561
|
+
- \`--guided\` -- emit per-field \`askUserQuestions\` directive instead of expecting \`-d\` upfront. Use when collecting from user.
|
|
11562
|
+
- \`-d <json>\` / \`--data <json>\` -- supply criteria payload inline. Accepts: inline JSON, \`@path\` (read from file), \`-\` (read from stdin).
|
|
11563
|
+
- \`--name "<name>"\` -- human-readable label for the evaluation (shows in dashboard).
|
|
11564
|
+
- \`--json\` -- structured output (Rule 1: always use). Returns \`_directive\` + \`_links\`.
|
|
11565
|
+
|
|
11566
|
+
**Cost note**: eval creation/edit/delete commands incur ZERO LLM cost. They are pure storage operations against the platform API. LLM cost is only incurred when \`mutagent prompts optimize start\` runs the judge model against this evaluation.
|
|
10589
11567
|
`,
|
|
10590
11568
|
"workflows/exploration.md": `---
|
|
10591
11569
|
name: mutagent-cli-workflows-exploration
|
|
@@ -10720,6 +11698,27 @@ After step 1, show the command output to the user before proceeding to classific
|
|
|
10720
11698
|
|
|
10721
11699
|
---
|
|
10722
11700
|
|
|
11701
|
+
## CLI commands
|
|
11702
|
+
|
|
11703
|
+
\`\`\`bash
|
|
11704
|
+
# Discovery -- read-only (no LLM cost)
|
|
11705
|
+
mutagent explore --help # read flags before first use (Rule 2)
|
|
11706
|
+
mutagent explore --json # scan cwd for prompts + agents (full scan)
|
|
11707
|
+
mutagent explore --path ./src --json # scan specific directory subtree
|
|
11708
|
+
mutagent explore --markers-only --json # show only files with existing .mutagent/* markers
|
|
11709
|
+
mutagent explore --classify-only --json # taxonomy output only (skip variable inference)
|
|
11710
|
+
\`\`\`
|
|
11711
|
+
|
|
11712
|
+
**Flag glossary** (explore-specific):
|
|
11713
|
+
- \`--path <dir>\` -- restrict scan to subtree. Useful for monorepos with multiple apps; default is cwd.
|
|
11714
|
+
- \`--markers-only\` -- skip discovery; show only prompts/agents already uploaded (have \`.mutagent/*.md\` marker file). Use to refresh an existing index.
|
|
11715
|
+
- \`--classify-only\` -- skip per-prompt delimiter inference. Faster scan when you only need the prompts[]/agents[] taxonomy split.
|
|
11716
|
+
- \`--json\` -- structured output (Rule 1: always use). Returns \`prompts[]\`, \`agents[]\`, taxonomy, plus per-prompt \`delimiter\` field.
|
|
11717
|
+
|
|
11718
|
+
**Cost note**: \`mutagent explore\` is fully read-only -- no LLM calls, no platform API mutations. Safe to run repeatedly. The output is a snapshot of cwd at run time; re-run after meaningful code changes.
|
|
11719
|
+
|
|
11720
|
+
---
|
|
11721
|
+
|
|
10723
11722
|
## Cross-references
|
|
10724
11723
|
|
|
10725
11724
|
- [SKILL.md](../SKILL.md) → 5 rules + journey router
|
|
@@ -10826,6 +11825,9 @@ explore → prompts create → dataset add → evaluation create --guided
|
|
|
10826
11825
|
|
|
10827
11826
|
7. Load [concepts/eval-criteria.md](../concepts/eval-criteria.md)
|
|
10828
11827
|
→ understand INPUT-param (MVC) vs OUTPUT-param (Standards) scope
|
|
11828
|
+
→ for standalone eval-only work outside this optimization context, see
|
|
11829
|
+
[workflows/eval-creation.md](./eval-creation.md) -- this step inlines a brief
|
|
11830
|
+
version of that workflow
|
|
10829
11831
|
|
|
10830
11832
|
8. mutagent prompts evaluation create <promptId> --guided --json
|
|
10831
11833
|
→ the CLI provides a list of fields, each needing a rubric
|
|
@@ -10836,6 +11838,8 @@ explore → prompts create → dataset add → evaluation create --guided
|
|
|
10836
11838
|
- do NOT skip any field
|
|
10837
11839
|
- do NOT auto-generate any answer
|
|
10838
11840
|
→ collect at minimum: one INPUT criterion per {variable}, one OUTPUT criterion
|
|
11841
|
+
→ for the full step-by-step including review-before-upload + decisionTree handling,
|
|
11842
|
+
see [workflows/eval-creation.md](./eval-creation.md)
|
|
10839
11843
|
|
|
10840
11844
|
9. mutagent prompts evaluation create <promptId> -d '<json>' --json
|
|
10841
11845
|
→ upload the criteria collected in step 8
|
|
@@ -10923,6 +11927,63 @@ Ensure at least 2 hard/edge cases per category. Then upload:
|
|
|
10923
11927
|
mutagent prompts dataset add <prompt-id> -d '<constructed-json>' --name '<name>' --json
|
|
10924
11928
|
\`\`\`
|
|
10925
11929
|
|
|
11930
|
+
For dataset-only work (no optimization needed yet), see [workflows/dataset-curation.md](./dataset-curation.md)
|
|
11931
|
+
and [concepts/dataset-design.md](../concepts/dataset-design.md) for the full curation principles.
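A hedged illustration of a constructed dataset payload; the real schema comes from `mutagent prompts dataset add --help` (Rule 2), and the field names below (`items`, `input`, `category`) are assumptions for illustration only:

```typescript
// Illustrative dataset payload honoring "at least 2 hard/edge cases per
// category" above. Field names are assumed, not the documented format.
const dataset = {
  items: [
    { input: { ticket: "Refund request for order #123" }, category: "typical" },
    { input: { ticket: "" }, category: "edge" },                // empty body
    { input: { ticket: "¿reembolso? + tracking + cancel" }, category: "edge" },
    { input: { ticket: "3 unrelated requests in one message" }, category: "hard" },
    { input: { ticket: "hostile tone, no order number" }, category: "hard" },
  ],
};
// Pass inline via -d, or via @path / stdin if dataset add mirrors the
// -d conventions documented for evaluation create above.
console.log(JSON.stringify(dataset));
```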
|
|
11932
|
+
|
|
11933
|
+
---
|
|
11934
|
+
|
|
11935
|
+
## CLI commands
|
|
11936
|
+
|
|
11937
|
+
Run these before the first use of each command (Rule 2: \`--help\` before first use):
|
|
11938
|
+
|
|
11939
|
+
\`\`\`bash
|
|
11940
|
+
mutagent explore --help # codebase scan flags
|
|
11941
|
+
mutagent prompts create --help # prompt upload flags + brace convention
|
|
11942
|
+
mutagent prompts dataset add --help # dataset add flags + --guided semantics
|
|
11943
|
+
mutagent prompts evaluation create --help # eval create flags + --guided semantics
|
|
11944
|
+
mutagent prompts optimize start --help # optimize start flags + cost-relevant flags
|
|
11945
|
+
mutagent prompts optimize status --help # status polling flags
|
|
11946
|
+
mutagent prompts optimize results --help # results flags + --apply / --diff
|
|
11947
|
+
mutagent usage --help # quota query flags
|
|
11948
|
+
mutagent providers list --help # provider catalog query flags
|
|
11949
|
+
\`\`\`
|
|
11950
|
+
|
|
11951
|
+
Workflow execution sequence (annotated with cost markers):
|
|
11952
|
+
|
|
11953
|
+
\`\`\`bash
|
|
11954
|
+
# Discovery + setup (no LLM cost)
|
|
11955
|
+
mutagent explore --json # step 1: discover prompts
|
|
11956
|
+
mutagent prompts create --name "<name>" --raw-file <path> --json # step 4: upload prompt
|
|
11957
|
+
mutagent prompts dataset add <id> --guided --json # step 5-6: guided dataset (returns _directive.askUserQuestions)
|
|
11958
|
+
mutagent prompts dataset add <id> -d '<json>' --name "<name>" --json # step 6: upload dataset items
|
|
11959
|
+
mutagent prompts evaluation create <id> --guided --json # step 8: guided eval (returns _directive.askUserQuestions + decisionTree)
|
|
11960
|
+
mutagent prompts evaluation create <id> -d '<json>' --name "<name>" --json # step 9: upload criteria
|
|
11961
|
+
|
|
11962
|
+
# Pre-flight checks (no LLM cost)
|
|
11963
|
+
mutagent usage --json # step 10: surface quota to user (Rule 5)
|
|
11964
|
+
mutagent providers list --models --json # verify exec/eval models are available (Rule 6)
|
|
11965
|
+
|
|
11966
|
+
# \uD83D\uDCB0 LLM COST starts here -- requires explicit user confirmation per Rule 5
|
|
11967
|
+
mutagent prompts optimize start <id> --dataset <d> --evaluation <e> --max-iterations 1 --json
|
|
11968
|
+
# step 11: start job (cost = exec_model × items × iterations
|
|
11969
|
+
# + judge_model × items × iterations)
|
|
11970
|
+
# --max-iterations defaults to 1; never raise without user consent
|
|
11971
|
+
|
|
11972
|
+
# Polling + results (no LLM cost; just reads job state + emits verbatim card)
|
|
11973
|
+
mutagent prompts optimize status <job-id> --json # step 12: poll progress (verbatim card)
|
|
11974
|
+
mutagent prompts optimize results <job-id> --json # step 13: view scorecard (verbatim card)
|
|
11975
|
+
mutagent prompts optimize results <job-id> --diff --json # step 14a: view prompt diff (no apply)
|
|
11976
|
+
mutagent prompts optimize results <job-id> --apply --json # step 14b: apply optimized prompt to stored version
|
|
11977
|
+
\`\`\`
|
|
11978
|
+
|
|
11979
|
+
**Cost note**: \`optimize start\` is the ONLY cost-incurring command in this workflow. All other commands are pure storage/discovery operations. The \`--max-iterations\` flag bounds total cost (default = 1; never raise silently).
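Since the cost comments above give a concrete formula (exec_model × items × iterations + judge_model × items × iterations), a minimal estimator sketch makes the bound explicit; the per-call dollar figures are placeholder assumptions, not real model prices:

```typescript
// Cost bound for optimize start, per the formula in the comments above.
// Per-call costs are placeholders; substitute real provider pricing.
function estimateOptimizeCost(
  items: number,
  iterations: number,       // --max-iterations (default 1; never raise silently)
  execCostPerCall = 0.002,  // assumption: avg exec-model cost per item
  judgeCostPerCall = 0.001  // assumption: avg judge-model cost per item
): number {
  return items * iterations * (execCostPerCall + judgeCostPerCall);
}

// 20 items x 1 iteration => 20 * (0.002 + 0.001) = $0.06 under these assumptions
console.log(estimateOptimizeCost(20, 1).toFixed(2));
```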
|
|
11980
|
+
|
|
11981
|
+
**Verbatim card protocol**: \`optimize start\`, \`optimize status\`, and \`optimize results\` emit \`_directive.renderedCard\` -- echo verbatim per [SKILL.md § MANDATORY: Verbatim Card Display Protocol](../SKILL.md).
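A sketch of how an agent-side consumer might honor that protocol, using only the directive fields listed in this CLI's help text; the parsing scaffold itself is illustrative:

```typescript
// If a --json response carries _directive.renderedCard, echo it unmodified.
function handleCliJson(stdout: string): void {
  const res = JSON.parse(stdout) as {
    _directive?: { renderedCard?: string; instruction?: string };
  };
  if (res._directive?.renderedCard) {
    console.log(res._directive.renderedCard); // verbatim -- no reformatting
  } else if (res._directive?.instruction) {
    console.log(res._directive.instruction);  // fall back to the next-step hint
  }
}
```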
|
|
11982
|
+
|
|
11983
|
+
For per-topic standalone HOW workflows, see:
|
|
11984
|
+
- [workflows/dataset-curation.md](./dataset-curation.md) -- standalone dataset curation
|
|
11985
|
+
- [workflows/eval-creation.md](./eval-creation.md) -- standalone evaluation rubric creation
|
|
11986
|
+
|
|
10926
11987
|
---
|
|
10927
11988
|
|
|
10928
11989
|
## Cross-references
|
|
@@ -10930,6 +11991,8 @@ mutagent prompts dataset add <prompt-id> -d '<constructed-json>' --name '<name>'
|
|
|
10930
11991
|
- [SKILL.md](../SKILL.md) → 5 rules + journey router
|
|
10931
11992
|
- [concepts/prompt-variables.md](../concepts/prompt-variables.md) → brace convention + conversion (critical for steps 3 and 15)
|
|
10932
11993
|
- [concepts/eval-criteria.md](../concepts/eval-criteria.md) → INPUT MVC + OUTPUT Standards + granular rubric (critical for steps 7-8)
|
|
11994
|
+
- [concepts/dataset-design.md](../concepts/dataset-design.md) → dataset curation principles (Golden Rule, case categories, anti-patterns)
|
|
11995
|
+
- [workflows/dataset-curation.md](./dataset-curation.md) → standalone dataset curation (when no optimization needed yet)
|
|
10933
11996
|
- [workflows/exploration.md](./exploration.md) → step 1 of this workflow
|
|
10934
11997
|
- [workflows/tracing.md](./tracing.md) → parallel or follow-up path
|
|
10935
11998
|
`,
|
|
@@ -11044,6 +12107,36 @@ After step 6:
|
|
|
11044
12107
|
|
|
11045
12108
|
---
|
|
11046
12109
|
|
|
12110
|
+
## CLI commands
|
|
12111
|
+
|
|
12112
|
+
\`\`\`bash
|
|
12113
|
+
# Discovery (no LLM cost, read-only)
|
|
12114
|
+
mutagent explore --help # read flags before first use (Rule 2)
|
|
12115
|
+
mutagent explore --json # step 1: detect framework + prompts taxonomy
|
|
12116
|
+
mutagent integrate --help # list supported frameworks + per-framework flags
|
|
12117
|
+
|
|
12118
|
+
# Code generation (no LLM cost; emits integration snippet to stdout)
|
|
12119
|
+
mutagent integrate <framework> --json # step 3: get integration snippet for the detected framework
|
|
12120
|
+
mutagent integrate <framework> --output <path> --json # write snippet directly to file (instead of stdout)
|
|
12121
|
+
|
|
12122
|
+
# Verification (no LLM cost, read-only)
|
|
12123
|
+
mutagent traces list --json # step 6: verify traces arriving (recent N traces)
|
|
12124
|
+
mutagent traces list --prompt-id <id> --json # filter by prompt
|
|
12125
|
+
mutagent traces list --since <ISO-timestamp> --json # filter by time window (e.g., since first integration)
|
|
12126
|
+
mutagent traces get <trace-id> --json # inspect single trace's spans + metadata
|
|
12127
|
+
\`\`\`
|
|
12128
|
+
|
|
12129
|
+
**Flag glossary** (tracing-specific):
|
|
12130
|
+
- \`<framework>\` -- supported frameworks: \`langchain\`, \`langgraph\`, \`llamaindex\`, \`openai-agents\`, \`crewai\`, \`autogen\`, \`vercel-ai\`. Run \`mutagent integrate --help\` for the canonical current list.
|
|
12131
|
+
- \`--output <path>\` -- write the integration snippet directly to a file. Without this flag, the snippet goes to stdout (typical for agent-mediated workflows, so the agent can re-emit it verbatim to the user).
|
|
12132
|
+
- \`--prompt-id <id>\` -- filter trace list to one prompt's traces.
|
|
12133
|
+
- \`--since <ts>\` -- filter by timestamp (ISO-8601). Useful right after first integration to confirm traces are landing.
|
|
12134
|
+
- \`--json\` -- structured output (Rule 1: always use).
|
|
12135
|
+
|
|
12136
|
+
**Cost note**: tracing is fully free at the CLI/platform layer -- the platform stores spans for analytics. The only "cost" is the marginal latency that in-process span emission adds to each LLM call inside the user's app (typically <5ms per call). No optimizer cost is incurred.
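A sketch of automating the step-6 check, shelling out to the CLI with only the flags documented above (`--since`, `--json`); the `traces` field name in the parsed response is an assumption:

```typescript
// Poll `mutagent traces list --since <ts> --json` and report whether any
// traces have landed since integration. Response field name is assumed.
import { execFileSync } from "node:child_process";

function tracesArrivedSince(isoTs: string): boolean {
  const out = execFileSync(
    "mutagent",
    ["traces", "list", "--since", isoTs, "--json"],
    { encoding: "utf-8" }
  );
  const parsed = JSON.parse(out) as { traces?: unknown[] }; // assumed field
  return (parsed.traces?.length ?? 0) > 0;
}
```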
|
|
12137
|
+
|
|
12138
|
+
---
|
|
12139
|
+
|
|
11047
12140
|
## Cross-references
|
|
11048
12141
|
|
|
11049
12142
|
- [SKILL.md](../SKILL.md) → 5 rules + journey router
|
|
@@ -11096,7 +12189,7 @@ that teaches coding agents how to use the MutagenT CLI effectively.
|
|
|
11096
12189
|
const isJson = parentCmd ? getJsonFlag(parentCmd) : false;
|
|
11097
12190
|
const output = new OutputFormatter(isJson ? "json" : "table");
|
|
11098
12191
|
const repoRoot = findRepoRoot();
|
|
11099
|
-
const skillDir =
|
|
12192
|
+
const skillDir = join8(repoRoot, SKILL_DIR);
|
|
11100
12193
|
const files = getSkillFiles();
|
|
11101
12194
|
const writtenFiles = [];
|
|
11102
12195
|
let totalBytes = 0;
|
|
@@ -11108,9 +12201,9 @@ that teaches coding agents how to use the MutagenT CLI effectively.
|
|
|
11108
12201
|
return a.localeCompare(b);
|
|
11109
12202
|
});
|
|
11110
12203
|
for (const relPath of sortedKeys) {
|
|
11111
|
-
const destPath =
|
|
12204
|
+
const destPath = join8(skillDir, relPath);
|
|
11112
12205
|
const parentDir = dirname(destPath);
|
|
11113
|
-
if (!
|
|
12206
|
+
if (!existsSync13(parentDir)) {
|
|
11114
12207
|
mkdirSync4(parentDir, { recursive: true });
|
|
11115
12208
|
}
|
|
11116
12209
|
const raw = files[relPath] ?? "";
|
|
@@ -11244,18 +12337,18 @@ import { Command as Command19 } from "commander";
|
|
|
11244
12337
|
import { randomUUID } from "crypto";
|
|
11245
12338
|
|
|
11246
12339
|
// src/commands/hooks/state.ts
|
|
11247
|
-
import { readFileSync as
|
|
11248
|
-
import { join as
|
|
12340
|
+
import { readFileSync as readFileSync11, writeFileSync as writeFileSync6, renameSync, unlinkSync, existsSync as existsSync14 } from "fs";
|
|
12341
|
+
import { join as join9 } from "path";
|
|
11249
12342
|
import { tmpdir } from "os";
|
|
11250
12343
|
function stateFilePath(sessionId) {
|
|
11251
|
-
return
|
|
12344
|
+
return join9(tmpdir(), `mutagent-hook-${sessionId}.json`);
|
|
11252
12345
|
}
|
|
11253
12346
|
function readState(sessionId) {
|
|
11254
12347
|
const path = stateFilePath(sessionId);
|
|
11255
|
-
if (!
|
|
12348
|
+
if (!existsSync14(path))
|
|
11256
12349
|
return null;
|
|
11257
12350
|
try {
|
|
11258
|
-
const raw = JSON.parse(
|
|
12351
|
+
const raw = JSON.parse(readFileSync11(path, "utf-8"));
|
|
11259
12352
|
if (!Array.isArray(raw.parentStack)) {
|
|
11260
12353
|
raw.parentStack = [];
|
|
11261
12354
|
}
|
|
@@ -11275,7 +12368,7 @@ function writeState(sessionId, state) {
|
|
|
11275
12368
|
}
|
|
11276
12369
|
function deleteState(sessionId) {
|
|
11277
12370
|
const path = stateFilePath(sessionId);
|
|
11278
|
-
if (
|
|
12371
|
+
if (existsSync14(path)) {
|
|
11279
12372
|
try {
|
|
11280
12373
|
unlinkSync(path);
|
|
11281
12374
|
} catch {}
|
|
@@ -11922,8 +13015,23 @@ async function handlePostToolUseFailure() {
|
|
|
11922
13015
|
}
|
|
11923
13016
|
|
|
11924
13017
|
// src/commands/hooks/install.ts
|
|
11925
|
-
import { readFileSync as
|
|
11926
|
-
import { join as
|
|
13018
|
+
import { readFileSync as readFileSync12, writeFileSync as writeFileSync7, existsSync as existsSync15, mkdirSync as mkdirSync5 } from "fs";
|
|
13019
|
+
import { join as join10 } from "path";
|
|
13020
|
+
|
|
13021
|
+
class SettingsParseError extends Error {
|
|
13022
|
+
settingsPath;
|
|
13023
|
+
backupPath;
|
|
13024
|
+
constructor(settingsPath, backupPath, cause) {
|
|
13025
|
+
super(`settings.local.json contains unparseable JSON.
|
|
13026
|
+
` + ` Original file backed up to: ${backupPath}
|
|
13027
|
+
` + ` To recover: restore from backup, fix JSON syntax, then re-run ` + `\`mutagent hooks install\``);
|
|
13028
|
+
this.settingsPath = settingsPath;
|
|
13029
|
+
this.backupPath = backupPath;
|
|
13030
|
+
this.name = "SettingsParseError";
|
|
13031
|
+
if (cause instanceof Error)
|
|
13032
|
+
this.cause = cause;
|
|
13033
|
+
}
|
|
13034
|
+
}
|
|
11927
13035
|
var V1_MIGRATIONS = {
|
|
11928
13036
|
Stop: ["mutagent hooks claude-code session-end"]
|
|
11929
13037
|
};
|
|
@@ -11970,15 +13078,18 @@ function migrateV1Hooks(settings) {
|
|
|
11970
13078
|
return migrated;
|
|
11971
13079
|
}
|
|
11972
13080
|
function installHooks(cwd) {
|
|
11973
|
-
const claudeDir =
|
|
11974
|
-
const settingsPath =
|
|
11975
|
-
const existed =
|
|
13081
|
+
const claudeDir = join10(cwd, ".claude");
|
|
13082
|
+
const settingsPath = join10(claudeDir, "settings.local.json");
|
|
13083
|
+
const existed = existsSync15(settingsPath);
|
|
11976
13084
|
let settings = {};
|
|
11977
13085
|
if (existed) {
|
|
13086
|
+
const raw = readFileSync12(settingsPath, "utf-8");
|
|
11978
13087
|
try {
|
|
11979
|
-
settings = JSON.parse(
|
|
11980
|
-
} catch {
|
|
11981
|
-
|
|
13088
|
+
settings = JSON.parse(raw);
|
|
13089
|
+
} catch (err) {
|
|
13090
|
+
const backupPath = `${settingsPath}.bak.${new Date().toISOString().replace(/:/g, "-")}`;
|
|
13091
|
+
writeFileSync7(backupPath, raw, "utf-8");
|
|
13092
|
+
throw new SettingsParseError(settingsPath, backupPath, err);
|
|
11982
13093
|
}
|
|
11983
13094
|
}
|
|
11984
13095
|
const added = [];
|
|
@@ -12005,14 +13116,25 @@ function installHooks(cwd) {
|
|
|
12005
13116
|
}
|
|
12006
13117
|
}
|
|
12007
13118
|
}
|
|
13119
|
+
let userWarning;
|
|
12008
13120
|
if (added.length > 0 || migrated.length > 0) {
|
|
12009
|
-
if (!
|
|
13121
|
+
if (!existsSync15(claudeDir)) {
|
|
12010
13122
|
mkdirSync5(claudeDir, { recursive: true });
|
|
12011
13123
|
}
|
|
12012
13124
|
writeFileSync7(settingsPath, JSON.stringify(settings, null, 2) + `
|
|
12013
13125
|
`, "utf-8");
|
|
12014
13126
|
}
|
|
12015
|
-
|
|
13127
|
+
if (added.length > 0) {
|
|
13128
|
+
const addedList = added.map((cmd) => {
|
|
13129
|
+
const parts = cmd.split(" ");
|
|
13130
|
+
return parts[parts.length - 1] ?? cmd;
|
|
13131
|
+
}).join(", ");
|
|
13132
|
+
userWarning = `MutagenT hooks installed into .claude/settings.local.json
|
|
13133
|
+
` + ` Added: ${addedList}
|
|
13134
|
+
` + ` This file was modified. Review with: git diff .claude/settings.local.json
|
|
13135
|
+
` + ` (To remove hooks, edit .claude/settings.local.json and delete the mutagent entries)`;
|
|
13136
|
+
}
|
|
13137
|
+
return { settingsPath, existed, added, alreadyPresent, migrated, userWarning };
|
|
12016
13138
|
}
|
|
12017
13139
|
|
|
12018
13140
|
// src/commands/hooks/index.ts
|
|
@@ -12047,7 +13169,28 @@ hook already present (checked by command string) so running this multiple
|
|
|
12047
13169
|
times is safe.
|
|
12048
13170
|
`).action((opts) => {
|
|
12049
13171
|
const targetDir = opts.cwd ?? process.cwd();
|
|
12050
|
-
const
|
|
13172
|
+
const isJson = Boolean(opts.json);
|
|
13173
|
+
let result;
|
|
13174
|
+
try {
|
|
13175
|
+
result = installHooks(targetDir);
|
|
13176
|
+
} catch (err) {
|
|
13177
|
+
if (err instanceof SettingsParseError) {
|
|
13178
|
+
if (isJson) {
|
|
13179
|
+
process.stdout.write(JSON.stringify({
|
|
13180
|
+
success: false,
|
|
13181
|
+
error: err.message,
|
|
13182
|
+
backupPath: err.backupPath,
|
|
13183
|
+
settingsPath: err.settingsPath
|
|
13184
|
+
}) + `
|
|
13185
|
+
`);
|
|
13186
|
+
} else {
|
|
13187
|
+
process.stderr.write(`[mutagent hooks install] ERROR: ${err.message}
|
|
13188
|
+
`);
|
|
13189
|
+
}
|
|
13190
|
+
process.exit(1);
|
|
13191
|
+
}
|
|
13192
|
+
throw err;
|
|
13193
|
+
}
|
|
12051
13194
|
for (const { event, command } of result.migrated) {
|
|
12052
13195
|
process.stderr.write(`[mutagent hooks install] ⚠️ Migrated v1 hook: removed '${event} → ${command}' (v2 wires this as 'SessionEnd')
|
|
12053
13196
|
`);
|
|
@@ -12063,14 +13206,24 @@ times is safe.
|
|
|
12063
13206
|
`);
|
|
12064
13207
|
return;
|
|
12065
13208
|
}
|
|
12066
|
-
|
|
13209
|
+
if (result.userWarning) {
|
|
13210
|
+
if (!isJson) {
|
|
13211
|
+
process.stderr.write(`⚠ ${result.userWarning}
|
|
13212
|
+
`);
|
|
13213
|
+
}
|
|
13214
|
+
}
|
|
13215
|
+
const jsonResponse = {
|
|
12067
13216
|
success: true,
|
|
12068
13217
|
settingsPath: result.settingsPath,
|
|
12069
13218
|
existed: result.existed,
|
|
12070
13219
|
added: result.added,
|
|
12071
13220
|
alreadyPresent: result.alreadyPresent,
|
|
12072
13221
|
message: result.added.length > 0 ? `Installed ${String(result.added.length)} hook(s). ${String(result.alreadyPresent.length)} already present.` : `All hooks already present (${String(result.alreadyPresent.length)}).`
|
|
12073
|
-
}
|
|
13222
|
+
};
|
|
13223
|
+
if (result.userWarning && isJson) {
|
|
13224
|
+
jsonResponse.warnings = [result.userWarning];
|
|
13225
|
+
}
|
|
13226
|
+
process.stdout.write(JSON.stringify(jsonResponse) + `
|
|
12074
13227
|
`);
|
|
12075
13228
|
});
|
|
12076
13229
|
const claudeCode = hooks.command("claude-code").description("Claude Code session telemetry");
|
|
@@ -12115,8 +13268,8 @@ import { Command as Command20 } from "commander";
|
|
|
12115
13268
|
import chalk38 from "chalk";
|
|
12116
13269
|
init_errors();
|
|
12117
13270
|
init_config();
|
|
12118
|
-
import { readFileSync as
|
|
12119
|
-
import { join as
|
|
13271
|
+
import { readFileSync as readFileSync13, existsSync as existsSync16 } from "fs";
|
|
13272
|
+
import { join as join11, dirname as dirname2 } from "path";
|
|
12120
13273
|
import { fileURLToPath } from "url";
|
|
12121
13274
|
var VALID_CATEGORIES = ["bug", "feature", "improvement", "praise"];
|
|
12122
13275
|
function getCliVersion() {
|
|
@@ -12125,8 +13278,8 @@ function getCliVersion() {
|
|
|
12125
13278
|
}
|
|
12126
13279
|
try {
|
|
12127
13280
|
const __dirname2 = dirname2(fileURLToPath(import.meta.url));
|
|
12128
|
-
const pkgPath =
|
|
12129
|
-
const pkg = JSON.parse(
|
|
13281
|
+
const pkgPath = join11(__dirname2, "..", "..", "package.json");
|
|
13282
|
+
const pkg = JSON.parse(readFileSync13(pkgPath, "utf-8"));
|
|
12130
13283
|
return pkg.version ?? "0.1.1";
|
|
12131
13284
|
} catch {
|
|
12132
13285
|
return "0.1.1";
|
|
@@ -12157,12 +13310,12 @@ async function resolveContextSource(source, _readStdinOverride) {
|
|
|
12157
13310
|
raw = await (_readStdinOverride ?? readStdin2)();
|
|
12158
13311
|
} else if (source.startsWith("@")) {
|
|
12159
13312
|
const filePath = source.slice(1);
|
|
12160
|
-
if (!
|
|
13313
|
+
if (!existsSync16(filePath)) {
|
|
12161
13314
|
throw new MutagentError("INVALID_ARGUMENTS", `Context file not found: ${filePath}`, `Verify the path exists: ls -la "${filePath}"
|
|
12162
13315
|
Or use inline JSON: --context '{"key":"value"}'`);
|
|
12163
13316
|
}
|
|
12164
13317
|
try {
|
|
12165
|
-
raw =
|
|
13318
|
+
raw = readFileSync13(filePath, "utf-8").trim();
|
|
12166
13319
|
} catch (err) {
|
|
12167
13320
|
const msg = err instanceof Error ? err.message : String(err);
|
|
12168
13321
|
throw new MutagentError("INVALID_ARGUMENTS", `Cannot read context file "${filePath}": ${msg}`, `Check file permissions: ls -la "${filePath}"`);
|
|
@@ -12314,11 +13467,12 @@ if (process.env.CLI_VERSION) {
|
|
|
12314
13467
|
} else {
|
|
12315
13468
|
try {
|
|
12316
13469
|
const __dirname2 = dirname3(fileURLToPath2(import.meta.url));
|
|
12317
|
-
const pkgPath =
|
|
12318
|
-
const pkg = JSON.parse(
|
|
13470
|
+
const pkgPath = join12(__dirname2, "..", "..", "package.json");
|
|
13471
|
+
const pkg = JSON.parse(readFileSync14(pkgPath, "utf-8"));
|
|
12319
13472
|
cliVersion = pkg.version ?? cliVersion;
|
|
12320
13473
|
} catch {}
|
|
12321
13474
|
}
|
|
13475
|
+
setCliVersion(cliVersion);
|
|
12322
13476
|
var program = new Command21;
|
|
12323
13477
|
program.name("mutagent").description(`MutagenT CLI - AI-native prompt optimization platform
|
|
12324
13478
|
|
|
@@ -12328,6 +13482,15 @@ program.name("mutagent").description(`MutagenT CLI - AI-native prompt optimizati
|
|
|
12328
13482
|
showGlobalOptions: true
|
|
12329
13483
|
});
|
|
12330
13484
|
program.addHelpText("after", `
|
|
13485
|
+
${chalk39.bold.cyan("WORKFLOWS:")}
|
|
13486
|
+
${chalk39.bold("Setup")} mutagent login → mutagent init → mutagent skills install
|
|
13487
|
+
${chalk39.bold("Framework Integration")} mutagent explore → mutagent integrate <fw> → mutagent traces
|
|
13488
|
+
${chalk39.bold("Prompt Optimization")} mutagent prompts create → prompts dataset → prompts evaluation → prompts optimize
|
|
13489
|
+
${chalk39.bold("Agent Optimization")} ${chalk39.dim("[coming soon — see: mutagent agents --help]")}
|
|
13490
|
+
|
|
13491
|
+
${chalk39.dim("For rubric design, prompt-vs-agent classification, scorecard interpretation,")}
|
|
13492
|
+
${chalk39.dim("see the Skill at")} ${chalk39.cyan(".claude/skills/mutagent-cli/SKILL.md")} ${chalk39.dim("(install via")} ${chalk39.cyan("mutagent skills install")}${chalk39.dim(")")}
|
|
13493
|
+
|
|
12331
13494
|
${AGENTS_WIP_BANNER}
|
|
12332
13495
|
|
|
12333
13496
|
${chalk39.yellow("Non-Interactive Mode (CI/CD & Coding Agents):")}
|
|
@@ -12396,10 +13559,14 @@ ${chalk39.yellow("Post-Onboarding Decision Tree:")}
|
|
|
12396
13559
|
|
|
12397
13560
|
${chalk39.yellow("Directive System:")}
|
|
12398
13561
|
Every --json response may include:
|
|
12399
|
-
${chalk39.bold("_directive.
|
|
12400
|
-
${chalk39.bold("_directive.
|
|
12401
|
-
${chalk39.bold("_directive.
|
|
12402
|
-
${chalk39.bold("
|
|
13562
|
+
${chalk39.bold("_directive.display")} Type tag — 'status_card' for card-kind directives (drives test/docs guards)
|
|
13563
|
+
${chalk39.bold("_directive.renderedCard")} Pre-formatted card ${chalk39.red("(MUST echo verbatim in chat whenever this field exists — see SKILL.md Verbatim Card Display Protocol)")}
|
|
13564
|
+
${chalk39.bold("_directive.instruction")} Next step for the agent (self-sufficient, no Skill required)
|
|
13565
|
+
${chalk39.bold("_directive.next")} Array of suggested follow-up commands
|
|
13566
|
+
${chalk39.bold("_directive.decisionTree")} Multi-step branching logic for guided flows
|
|
13567
|
+
${chalk39.bold("_directive.askUserQuestions")} Questions to collect from the user (inside _directive)
|
|
13568
|
+
${chalk39.bold("_links")} Dashboard/API URLs (format as markdown links)
|
|
13569
|
+
${chalk39.bold("_compat")} Compat metadata: cliVersion, skillVersion, skillMinCliVersion
|
|
12403
13570
|
|
|
12404
13571
|
${chalk39.yellow("Evaluation Criteria Format:")}
|
|
12405
13572
|
Each criterion MUST have: ${chalk39.bold("name")}, ${chalk39.bold("description")} (scoring rubric), ${chalk39.bold("evaluationParameter")}
|
|
@@ -12455,7 +13622,10 @@ ${!hasCredentials() ? `
|
|
|
12455
13622
|
var rawArgs = process.argv.slice(2);
|
|
12456
13623
|
if (rawArgs.includes("-v") || rawArgs.includes("--version")) {
|
|
12457
13624
|
if (rawArgs.includes("--json")) {
|
|
12458
|
-
console.log(JSON.stringify({
|
|
13625
|
+
console.log(JSON.stringify({
|
|
13626
|
+
version: cliVersion,
|
|
13627
|
+
_compat: getCompatMetadata(cliVersion)
|
|
13628
|
+
}));
|
|
12459
13629
|
} else {
|
|
12460
13630
|
console.log(cliVersion);
|
|
12461
13631
|
}
|
|
@@ -12492,12 +13662,12 @@ program.addCommand(createFeedbackCommand());
|
|
|
12492
13662
|
var isInteractive = process.stdin.isTTY && !rawArgs.includes("--json") && process.env.CI !== "true";
|
|
12493
13663
|
var isSkillCommand = rawArgs[0] === "skills" || rawArgs[0] === "hooks";
|
|
12494
13664
|
if (isInteractive && !isSkillCommand) {
|
|
12495
|
-
const skillPath =
|
|
12496
|
-
if (!
|
|
13665
|
+
const skillPath = join12(process.cwd(), ".claude/skills/mutagent-cli/SKILL.md");
|
|
13666
|
+
if (!existsSync17(skillPath)) {
|
|
12497
13667
|
console.log(chalk39.dim("MutagenT SKILL not installed. Install it for AI agent support? Run:"), chalk39.cyan("mutagent skills install"));
|
|
12498
13668
|
}
|
|
12499
13669
|
}
|
|
12500
13670
|
program.parse();
|
|
12501
13671
|
|
|
12502
|
-
//# debugId=
|
|
13672
|
+
//# debugId=B6E7E54395DC9FFD64756E2164756E21
|
|
12503
13673
|
//# sourceMappingURL=cli.js.map
|