@mutagent/cli 0.1.146 → 0.1.148
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/cli.js +272 -4
- package/dist/bin/cli.js.map +7 -7
- package/dist/index.js +15 -2
- package/dist/index.js.map +3 -3
- package/package.json +1 -1
package/dist/bin/cli.js
CHANGED
|
@@ -863,12 +863,24 @@ class SDKClientWrapper {
|
|
|
863
863
|
this.handleError(error);
|
|
864
864
|
}
|
|
865
865
|
}
|
|
866
|
+
async getOptimizationScorecard(jobId) {
|
|
867
|
+
try {
|
|
868
|
+
const res = await this.request(`/api/optimization/${jobId}/results`);
|
|
869
|
+
if (res.scorecard?.rendered) {
|
|
870
|
+
return { rendered: res.scorecard.rendered, data: res.scorecard.data ?? [] };
|
|
871
|
+
}
|
|
872
|
+
return null;
|
|
873
|
+
} catch {
|
|
874
|
+
return null;
|
|
875
|
+
}
|
|
876
|
+
}
|
|
866
877
|
async getOptimizationResults(jobId) {
|
|
867
878
|
try {
|
|
868
879
|
const job = await this.request(`/api/optimization/${jobId}`);
|
|
869
880
|
const progress = await this.request(`/api/optimization/${jobId}/progress`);
|
|
870
881
|
const prompt = await this.getPrompt(String(job.promptId ?? ""));
|
|
871
882
|
const statesRes = await this.request(`/api/optimization/${jobId}/states`).catch(() => ({ states: [] }));
|
|
883
|
+
const scorecardDigest = await this.getOptimizationScorecard(jobId);
|
|
872
884
|
const latestState = statesRes.states[statesRes.states.length - 1];
|
|
873
885
|
const rawState = latestState?.state ?? {};
|
|
874
886
|
const iterCtx = rawState.iterationContext ?? rawState.current?.context;
|
|
@@ -895,7 +907,8 @@ class SDKClientWrapper {
|
|
|
895
907
|
datasetResults: extracted.datasetResults,
|
|
896
908
|
failureModes: extracted.failureModes,
|
|
897
909
|
mutations: extracted.mutations,
|
|
898
|
-
evaluationDetails: extracted.evaluationDetails
|
|
910
|
+
evaluationDetails: extracted.evaluationDetails,
|
|
911
|
+
...scorecardDigest ? { scorecard: scorecardDigest } : {}
|
|
899
912
|
};
|
|
900
913
|
} catch (error) {
|
|
901
914
|
this.handleError(error);
|
|
@@ -5253,7 +5266,7 @@ function createWatchClient(opts) {
|
|
|
5253
5266
|
}
|
|
5254
5267
|
if (msg.type !== "event" || !msg.event)
|
|
5255
5268
|
return;
|
|
5256
|
-
const { eventType, iteration, stage, data } = msg.event;
|
|
5269
|
+
const { eventType, iteration, stage, data, scorecardPayload } = msg.event;
|
|
5257
5270
|
switch (eventType) {
|
|
5258
5271
|
case "stage:completed":
|
|
5259
5272
|
opts.onStageComplete(stage ?? "unknown", iteration ?? 0, data?.stageResult ?? data ?? { type: "unknown" });
|
|
@@ -5271,6 +5284,11 @@ function createWatchClient(opts) {
|
|
|
5271
5284
|
opts.onError(new Error(data?.error ?? "Optimization job failed"));
|
|
5272
5285
|
close();
|
|
5273
5286
|
break;
|
|
5287
|
+
case "scorecard:update":
|
|
5288
|
+
if (scorecardPayload && opts.onScorecardUpdate) {
|
|
5289
|
+
opts.onScorecardUpdate(scorecardPayload);
|
|
5290
|
+
}
|
|
5291
|
+
break;
|
|
5274
5292
|
default:
|
|
5275
5293
|
break;
|
|
5276
5294
|
}
|
|
@@ -5824,6 +5842,30 @@ async function startWatchStream(jobId, isJson, maxIterations, baselineScore) {
|
|
|
5824
5842
|
console.error(chalk14.red(`✗ Watch error: ${error.message}`));
|
|
5825
5843
|
}
|
|
5826
5844
|
reject(error);
|
|
5845
|
+
},
|
|
5846
|
+
// Handles one scorecard payload delivered by the watch client.
// Reads `payload.error`, `payload.iteration`, `payload.scorecard` and
// `payload.renderedScorecard`. In JSON mode it prints a single
// "scorecard.update" line to stdout carrying either the error or the
// scorecard; otherwise it prints a yellow warning (error case) or the
// rendered scorecard followed by a blank line.
onScorecardUpdate: (payload) => {
  const failure = payload.error;
  if (isJson) {
    const line = failure
      ? { type: "scorecard.update", iteration: payload.iteration, error: failure }
      : { type: "scorecard.update", iteration: payload.iteration, scorecard: payload.scorecard };
    console.log(JSON.stringify(line));
    return;
  }
  if (failure) {
    console.warn(chalk14.yellow(`⚠ Scorecard unavailable for iteration ${String(payload.iteration)} (${failure.type})`));
    return;
  }
  console.log(payload.renderedScorecard);
  console.log("");
}
|
|
5828
5870
|
});
|
|
5829
5871
|
client.connect().catch(reject);
|
|
@@ -6370,7 +6412,12 @@ After viewing results:
|
|
|
6370
6412
|
output.output({ ...resultData, _links: { optimizer: optimizerLink(resultsPromptId, jobId) }, _directive: directive });
|
|
6371
6413
|
echoDirectiveToStderr(directive);
|
|
6372
6414
|
} else {
|
|
6373
|
-
|
|
6415
|
+
const serverScorecard = resultData.scorecard;
|
|
6416
|
+
if (serverScorecard?.rendered) {
|
|
6417
|
+
console.log(serverScorecard.rendered);
|
|
6418
|
+
} else {
|
|
6419
|
+
renderScorecard(resultData);
|
|
6420
|
+
}
|
|
6374
6421
|
const jobData = resultData.job;
|
|
6375
6422
|
const isCompleted = jobData?.status === "completed";
|
|
6376
6423
|
if (options.diff) {
|
|
@@ -9978,6 +10025,227 @@ Use the delimiter field to:
|
|
|
9978
10025
|
- [concepts/eval-criteria.md](./eval-criteria.md) → MVC (Minimum Viable Context) — uses delimiter to enumerate input params
|
|
9979
10026
|
- Source: \`mutagent-cli/src/lib/explorer.ts\` → \`inferPromptVariables()\` and \`DiscoveredPrompt.delimiter\`
|
|
9980
10027
|
- Tests: \`mutagent-cli/src/__tests__/lib/explorer.test.ts\`
|
|
10028
|
+
`,
|
|
10029
|
+
"concepts/scorecard-output.md": `---
|
|
10030
|
+
name: mutagent-cli-concepts-scorecard-output
|
|
10031
|
+
description: |
|
|
10032
|
+
Per-iteration structured scorecard emitted by the optimizer.
|
|
10033
|
+
Covers the ScorecardData shape, where agents see it (watch vs results),
|
|
10034
|
+
how to consume NDJSON streams, cost-gate patterns, and criterion drill-down.
|
|
10035
|
+
triggers:
|
|
10036
|
+
- "scorecard"
|
|
10037
|
+
- "scorecard output"
|
|
10038
|
+
- "optimize scorecard"
|
|
10039
|
+
- "scorecard data"
|
|
10040
|
+
- "iteration scorecard"
|
|
10041
|
+
- "nextAction"
|
|
10042
|
+
- "stop-stagnation"
|
|
10043
|
+
- "cumulativeUsd"
|
|
10044
|
+
- "optimizer results json"
|
|
10045
|
+
- "watch optimizer"
|
|
10046
|
+
---
|
|
10047
|
+
|
|
10048
|
+
# Concept — Scorecard Output
|
|
10049
|
+
|
|
10050
|
+
> Per-iteration structured scorecard emitted by the optimizer, available
|
|
10051
|
+
> progressively via \`--watch\` and as a post-hoc digest via \`optimize results\`.
|
|
10052
|
+
|
|
10053
|
+
## What it is
|
|
10054
|
+
|
|
10055
|
+
Every completed optimizer iteration produces a \`ScorecardData\` record: a
|
|
10056
|
+
structured snapshot of scores, criterion pass-rates, cumulative cost, and the
|
|
10057
|
+
optimizer's own next-action decision. The scorecard is the primary signal
|
|
10058
|
+
surface for agents monitoring or reacting to a running optimization job.
|
|
10059
|
+
|
|
10060
|
+
Two delivery modes:
|
|
10061
|
+
|
|
10062
|
+
- **Progressive (streaming)** — emitted once per completed iteration over the
|
|
10063
|
+
WebSocket event bus while the job runs.
|
|
10064
|
+
- **Post-hoc (batch)** — the full collection of per-iteration scorecards
|
|
10065
|
+
returned in the \`/api/optimization/:id/results\` response after the job
|
|
10066
|
+
finishes (or to inspect a paused job).
|
|
10067
|
+
|
|
10068
|
+
---
|
|
10069
|
+
|
|
10070
|
+
## Where an agent sees it
|
|
10071
|
+
|
|
10072
|
+
### 1. \`optimize start --watch --json\`
|
|
10073
|
+
|
|
10074
|
+
Streams NDJSON to stdout while the job runs. Each iteration emits one line:
|
|
10075
|
+
|
|
10076
|
+
\`\`\`
|
|
10077
|
+
{ "type": "scorecard.update", "iteration": 1, "scorecard": { ...ScorecardData } }
|
|
10078
|
+
\`\`\`
|
|
10079
|
+
|
|
10080
|
+
Other line types (e.g. \`job.started\`, \`stage.completed\`) may appear in the
|
|
10081
|
+
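stream — branch on \`type === "scorecard.update"\` to isolate scorecard events.
When the scorecard is unavailable for an iteration, the line carries an
\`error\` object in place of \`scorecard\`
(\`{ "type": "scorecard.update", "iteration": 1, "error": { "type": "..." } }\`),
so check that \`scorecard\` is present before reading its fields.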
|
|
10082
|
+
|
|
10083
|
+
### 2. \`optimize watch <id> --json\`
|
|
10084
|
+
|
|
10085
|
+
Attaches to an already-running job and streams the same NDJSON shape. Useful
|
|
10086
|
+
when the agent started the job in a previous session or on behalf of a
|
|
10087
|
+
background process.
|
|
10088
|
+
|
|
10089
|
+
### 3. \`optimize results <id> --json\`
|
|
10090
|
+
|
|
10091
|
+
Returns the complete result digest after job completion:
|
|
10092
|
+
|
|
10093
|
+
\`\`\`json
|
|
10094
|
+
{
|
|
10095
|
+
"job": { "id": "...", "status": "completed", ... },
|
|
10096
|
+
"prompt": { ... },
|
|
10097
|
+
"scorecard": {
|
|
10098
|
+
"rendered": "<ASCII scorecard string>",
|
|
10099
|
+
"data": [ ...ScorecardData[] ]
|
|
10100
|
+
}
|
|
10101
|
+
}
|
|
10102
|
+
\`\`\`
|
|
10103
|
+
|
|
10104
|
+
The \`scorecard.data\` array contains one entry per completed iteration in
|
|
10105
|
+
chronological order. \`scorecard.rendered\` is the ASCII terminal render of the
|
|
10106
|
+
final iteration (same bytes as the terminal output), included as a convenience
|
|
10107
|
+
for agents that want to surface a human-readable summary without re-rendering.
|
|
10108
|
+
|
|
10109
|
+
---
|
|
10110
|
+
|
|
10111
|
+
## \`ScorecardData\` shape
|
|
10112
|
+
|
|
10113
|
+
Reproduced from
|
|
10114
|
+
\`mutagent/src/framework/metatuner/output/scorecard.ts\`:
|
|
10115
|
+
|
|
10116
|
+
\`\`\`typescript
|
|
10117
|
+
interface ScorecardData {
|
|
10118
|
+
jobId: string;
|
|
10119
|
+
iteration: number;
|
|
10120
|
+
totalIterations: number | null; // null when maxIterations not configured
|
|
10121
|
+
timestamp: string; // ISO-8601
|
|
10122
|
+
stage: string; // always "result-analysis" for completed iterations
|
|
10123
|
+
|
|
10124
|
+
scores: {
|
|
10125
|
+
bestScore: number; // highest score seen across all iterations so far
|
|
10126
|
+
currentScore: number; // score for this iteration
|
|
10127
|
+
targetScore: number | null; // convergence threshold; null if not set
|
|
10128
|
+
};
|
|
10129
|
+
|
|
10130
|
+
criteria: Array<{
|
|
10131
|
+
name: string; // criterion name from the evaluation rubric
|
|
10132
|
+
scoreAvg: number; // mean LLM score for this criterion this iteration
|
|
10133
|
+
passRate: number; // fraction of dataset items passing (0.0–1.0)
|
|
10134
|
+
}>;
|
|
10135
|
+
|
|
10136
|
+
costs: {
|
|
10137
|
+
iterationUsd: number; // LLM cost for this iteration only
|
|
10138
|
+
cumulativeUsd: number; // total cost from job start through this iteration
|
|
10139
|
+
};
|
|
10140
|
+
|
|
10141
|
+
durations: {
|
|
10142
|
+
iterationMs: number; // wall-clock duration of this iteration
|
|
10143
|
+
stageBreakdown: Record<string, number>; // token counts keyed by stage name
|
|
10144
|
+
};
|
|
10145
|
+
|
|
10146
|
+
nextAction:
|
|
10147
|
+
| 'continue' // optimizer will run another iteration
|
|
10148
|
+
| 'stop-converged' // currentScore >= targetScore
|
|
10149
|
+
| 'stop-budget' // cost or token budget exhausted
|
|
10150
|
+
| 'stop-stagnation' // stagnationCount >= patience threshold
|
|
10151
|
+
| 'stop-max-iter'; // iteration count reached maxIterations
|
|
10152
|
+
}
|
|
10153
|
+
\`\`\`
|
|
10154
|
+
|
|
10155
|
+
Key fields at a glance:
|
|
10156
|
+
|
|
10157
|
+
| Field | Why it matters |
|
|
10158
|
+
|---|---|
|
|
10159
|
+
| \`scores.bestScore\` | The best the optimizer has achieved — compare to \`targetScore\` to estimate progress |
|
|
10160
|
+
| \`scores.currentScore\` | This iteration's score — use to detect regressions |
|
|
10161
|
+
| \`scores.targetScore\` | Convergence threshold; \`null\` means run until \`maxIterations\` |
|
|
10162
|
+
| \`criteria[].passRate\` | Per-criterion health — values below \`0.5\` flag a specific rubric as blocking |
|
|
10163
|
+
| \`costs.cumulativeUsd\` | Running cost — gate against any user-approved budget |
|
|
10164
|
+
| \`nextAction\` | The optimizer's own routing decision — surface \`stop-stagnation\` proactively |
|
|
10165
|
+
|
|
10166
|
+
---
|
|
10167
|
+
|
|
10168
|
+
## How an agent should use it
|
|
10169
|
+
|
|
10170
|
+
### Per-iteration monitoring (streaming)
|
|
10171
|
+
|
|
10172
|
+
While consuming \`optimize start --watch --json\` or \`optimize watch <id> --json\`:
|
|
10173
|
+
|
|
10174
|
+
1. **Score trend** — compare \`scores.currentScore\` against the previous
|
|
10175
|
+
iteration's \`currentScore\`. A sustained downward trend (3+ iterations)
|
|
10176
|
+
warrants surfacing to the user before the budget is spent.
|
|
10177
|
+
|
|
10178
|
+
2. **\`nextAction\` gate** — if \`nextAction === "stop-stagnation"\`, the optimizer
|
|
10179
|
+
has detected a plateau and is about to halt. Surface this to the user with
|
|
10180
|
+
the current \`scores.bestScore\` and \`costs.cumulativeUsd\` so they can decide
|
|
10181
|
+
whether to continue with a higher patience budget or accept the result.
|
|
10182
|
+
|
|
10183
|
+
3. **Cost gate** — if \`costs.cumulativeUsd\` crosses a threshold the user
|
|
10184
|
+
approved for, alert immediately. The optimizer does not know about
|
|
10185
|
+
user-defined soft budgets; the agent is the enforcement layer.
|
|
10186
|
+
|
|
10187
|
+
### Post-hoc analysis (\`optimize results\`)
|
|
10188
|
+
|
|
10189
|
+
1. **Criterion drill-down** — iterate \`scorecard.data\` (all iterations) and
|
|
10190
|
+
collect \`criteria[].passRate\` per criterion. Any criterion with a mean
|
|
10191
|
+
\`passRate < 0.5\` across iterations did not benefit from optimization and
|
|
10192
|
+
should be flagged back to the user as potentially mis-specified or
|
|
10193
|
+
conflicting with another criterion.
|
|
10194
|
+
|
|
10195
|
+
2. **Best-iteration identification** — find the entry where
|
|
10196
|
+
\`scores.currentScore === Math.max(...data.map(d => d.scores.currentScore))\`.
|
|
10197
|
+
This is the iteration the optimizer treats as \`bestIteration\`. Compare its
|
|
10198
|
+
\`criteria\` snapshot to the final iteration to detect regressions in
|
|
10199
|
+
individual rubrics even when the composite score improved.
|
|
10200
|
+
|
|
10201
|
+
3. **Cost/iteration trade-off** — divide \`costs.cumulativeUsd\` by
|
|
10202
|
+
\`iteration\` to get average cost per iteration. Present this when asking the
|
|
10203
|
+
user whether to re-run with more iterations.
|
|
10204
|
+
|
|
10205
|
+
---
|
|
10206
|
+
|
|
10207
|
+
## Parsing example
|
|
10208
|
+
|
|
10209
|
+
Minimal Node.js / Bun snippet — consume NDJSON from \`optimize start --watch --json\`
|
|
10210
|
+
and branch on \`nextAction\`:
|
|
10211
|
+
|
|
10212
|
+
\`\`\`js
|
|
10213
|
+
import { spawn } from 'node:child_process';
|
|
10214
|
+
import * as readline from 'node:readline';
|
|
10215
|
+
|
|
10216
|
+
const proc = spawn('mutagent', ['optimize', 'start', JOB_ID, '--watch', '--json']);
|
|
10217
|
+
const rl = readline.createInterface({ input: proc.stdout });
|
|
10218
|
+
|
|
10219
|
+
rl.on('line', (line) => {
|
|
10220
|
+
let event;
|
|
10221
|
+
try { event = JSON.parse(line); } catch { return; }
|
|
10222
|
+
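// Lines for iterations whose scorecard is unavailable carry \`error\` instead of \`scorecard\` — skip them.
if (event.type !== 'scorecard.update' || !event.scorecard) return;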
|
|
10223
|
+
|
|
10224
|
+
const { scorecard } = event;
|
|
10225
|
+
const { scores, costs, criteria, nextAction } = scorecard;
|
|
10226
|
+
|
|
10227
|
+
console.log(\`Iter \${scorecard.iteration}: score=\${scores.currentScore.toFixed(3)} cost=$\${costs.cumulativeUsd.toFixed(4)}\`);
|
|
10228
|
+
|
|
10229
|
+
if (nextAction === 'stop-stagnation') {
|
|
10230
|
+
// Surface to user — optimizer is about to halt without converging
|
|
10231
|
+
console.warn(\`[ALERT] Optimizer stagnated at score \${scores.bestScore}. Consider raising patience.\`);
|
|
10232
|
+
}
|
|
10233
|
+
|
|
10234
|
+
const weakCriteria = criteria.filter(c => c.passRate < 0.5).map(c => c.name);
|
|
10235
|
+
if (weakCriteria.length > 0) {
|
|
10236
|
+
console.warn(\`[ALERT] Low-pass criteria: \${weakCriteria.join(', ')}\`);
|
|
10237
|
+
}
|
|
10238
|
+
});
|
|
10239
|
+
\`\`\`
|
|
10240
|
+
|
|
10241
|
+
---
|
|
10242
|
+
|
|
10243
|
+
## Cross-references
|
|
10244
|
+
|
|
10245
|
+
- [concepts/eval-criteria.md](./eval-criteria.md) — how evaluation criteria are
|
|
10246
|
+
defined; \`criteria[].name\` in \`ScorecardData\` maps to \`name\` in the rubric.
|
|
10247
|
+
- [workflows/optimization.md](../workflows/optimization.md) — full optimization
|
|
10248
|
+
loop; the scorecard is produced at Step 8 (watch) and Step 9 (results).
|
|
9981
10249
|
`,
|
|
9982
10250
|
"workflows/agents.md": `---
|
|
9983
10251
|
name: mutagent-cli-workflows-agents
|
|
@@ -12018,5 +12286,5 @@ if (isInteractive && !isSkillCommand) {
|
|
|
12018
12286
|
}
|
|
12019
12287
|
program.parse();
|
|
12020
12288
|
|
|
12021
|
-
//# debugId=
|
|
12289
|
+
//# debugId=D625C9A316600B1764756E2164756E21
|
|
12022
12290
|
//# sourceMappingURL=cli.js.map
|