@mutagent/cli 0.1.146 → 0.1.147

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/bin/cli.js CHANGED
@@ -863,12 +863,24 @@ class SDKClientWrapper {
863
863
  this.handleError(error);
864
864
  }
865
865
  }
866
+ async getOptimizationScorecard(jobId) {
867
+ try {
868
+ const res = await this.request(`/api/optimization/${jobId}/results`);
869
+ if (res.scorecard?.rendered) {
870
+ return { rendered: res.scorecard.rendered, data: res.scorecard.data ?? [] };
871
+ }
872
+ return null;
873
+ } catch {
874
+ return null;
875
+ }
876
+ }
866
877
  async getOptimizationResults(jobId) {
867
878
  try {
868
879
  const job = await this.request(`/api/optimization/${jobId}`);
869
880
  const progress = await this.request(`/api/optimization/${jobId}/progress`);
870
881
  const prompt = await this.getPrompt(String(job.promptId ?? ""));
871
882
  const statesRes = await this.request(`/api/optimization/${jobId}/states`).catch(() => ({ states: [] }));
883
+ const scorecardDigest = await this.getOptimizationScorecard(jobId);
872
884
  const latestState = statesRes.states[statesRes.states.length - 1];
873
885
  const rawState = latestState?.state ?? {};
874
886
  const iterCtx = rawState.iterationContext ?? rawState.current?.context;
@@ -895,7 +907,8 @@ class SDKClientWrapper {
895
907
  datasetResults: extracted.datasetResults,
896
908
  failureModes: extracted.failureModes,
897
909
  mutations: extracted.mutations,
898
- evaluationDetails: extracted.evaluationDetails
910
+ evaluationDetails: extracted.evaluationDetails,
911
+ ...scorecardDigest ? { scorecard: scorecardDigest } : {}
899
912
  };
900
913
  } catch (error) {
901
914
  this.handleError(error);
@@ -5253,7 +5266,7 @@ function createWatchClient(opts) {
5253
5266
  }
5254
5267
  if (msg.type !== "event" || !msg.event)
5255
5268
  return;
5256
- const { eventType, iteration, stage, data } = msg.event;
5269
+ const { eventType, iteration, stage, data, scorecardPayload } = msg.event;
5257
5270
  switch (eventType) {
5258
5271
  case "stage:completed":
5259
5272
  opts.onStageComplete(stage ?? "unknown", iteration ?? 0, data?.stageResult ?? data ?? { type: "unknown" });
@@ -5271,6 +5284,11 @@ function createWatchClient(opts) {
5271
5284
  opts.onError(new Error(data?.error ?? "Optimization job failed"));
5272
5285
  close();
5273
5286
  break;
5287
+ case "scorecard:update":
5288
+ if (scorecardPayload && opts.onScorecardUpdate) {
5289
+ opts.onScorecardUpdate(scorecardPayload);
5290
+ }
5291
+ break;
5274
5292
  default:
5275
5293
  break;
5276
5294
  }
@@ -5824,6 +5842,30 @@ async function startWatchStream(jobId, isJson, maxIterations, baselineScore) {
5824
5842
  console.error(chalk14.red(`✗ Watch error: ${error.message}`));
5825
5843
  }
5826
5844
  reject(error);
5845
+ },
5846
+ onScorecardUpdate: (payload) => {
5847
+ if (payload.error) {
5848
+ if (isJson) {
5849
+ console.log(JSON.stringify({
5850
+ type: "scorecard.update",
5851
+ iteration: payload.iteration,
5852
+ error: payload.error
5853
+ }));
5854
+ } else {
5855
+ console.warn(chalk14.yellow(`⚠ Scorecard unavailable for iteration ${String(payload.iteration)} (${payload.error.type})`));
5856
+ }
5857
+ return;
5858
+ }
5859
+ if (isJson) {
5860
+ console.log(JSON.stringify({
5861
+ type: "scorecard.update",
5862
+ iteration: payload.iteration,
5863
+ scorecard: payload.scorecard
5864
+ }));
5865
+ } else {
5866
+ console.log(payload.renderedScorecard);
5867
+ console.log("");
5868
+ }
5827
5869
  }
5828
5870
  });
5829
5871
  client.connect().catch(reject);
@@ -6370,7 +6412,12 @@ After viewing results:
6370
6412
  output.output({ ...resultData, _links: { optimizer: optimizerLink(resultsPromptId, jobId) }, _directive: directive });
6371
6413
  echoDirectiveToStderr(directive);
6372
6414
  } else {
6373
- renderScorecard(resultData);
6415
+ const serverScorecard = resultData.scorecard;
6416
+ if (serverScorecard?.rendered) {
6417
+ console.log(serverScorecard.rendered);
6418
+ } else {
6419
+ renderScorecard(resultData);
6420
+ }
6374
6421
  const jobData = resultData.job;
6375
6422
  const isCompleted = jobData?.status === "completed";
6376
6423
  if (options.diff) {
@@ -9978,6 +10025,227 @@ Use the delimiter field to:
9978
10025
  - [concepts/eval-criteria.md](./eval-criteria.md) → MVC (Minimum Viable Context) — uses delimiter to enumerate input params
9979
10026
  - Source: \`mutagent-cli/src/lib/explorer.ts\` → \`inferPromptVariables()\` and \`DiscoveredPrompt.delimiter\`
9980
10027
  - Tests: \`mutagent-cli/src/__tests__/lib/explorer.test.ts\`
10028
+ `,
10029
+ "concepts/scorecard-output.md": `---
10030
+ name: mutagent-cli-concepts-scorecard-output
10031
+ description: |
10032
+ Per-iteration structured scorecard emitted by the optimizer.
10033
+ Covers the ScorecardData shape, where agents see it (watch vs results),
10034
+ how to consume NDJSON streams, cost-gate patterns, and criterion drill-down.
10035
+ triggers:
10036
+ - "scorecard"
10037
+ - "scorecard output"
10038
+ - "optimize scorecard"
10039
+ - "scorecard data"
10040
+ - "iteration scorecard"
10041
+ - "nextAction"
10042
+ - "stop-stagnation"
10043
+ - "cumulativeUsd"
10044
+ - "optimizer results json"
10045
+ - "watch optimizer"
10046
+ ---
10047
+
10048
+ # Concept — Scorecard Output
10049
+
10050
+ > Per-iteration structured scorecard emitted by the optimizer, available
10051
+ > progressively via \`--watch\` and as a post-hoc digest via \`optimize results\`.
10052
+
10053
+ ## What it is
10054
+
10055
+ Every completed optimizer iteration produces a \`ScorecardData\` record: a
10056
+ structured snapshot of scores, criterion pass-rates, cumulative cost, and the
10057
+ optimizer's own next-action decision. The scorecard is the primary signal
10058
+ surface for agents monitoring or reacting to a running optimization job.
10059
+
10060
+ Two delivery modes:
10061
+
10062
+ - **Progressive (streaming)** — emitted once per completed iteration over the
10063
+ WebSocket event bus while the job runs.
10064
+ - **Post-hoc (batch)** — the full collection of per-iteration scorecards
10065
+ returned in the \`/api/optimization/:id/results\` response after the job
10066
+ finishes (or to inspect a paused job).
10067
+
10068
+ ---
10069
+
10070
+ ## Where an agent sees it
10071
+
10072
+ ### 1. \`optimize start --watch --json\`
10073
+
10074
+ Streams NDJSON to stdout while the job runs. Each iteration emits one line:
10075
+
10076
+ \`\`\`
10077
+ { "type": "scorecard.update", "iteration": 1, "scorecard": { ...ScorecardData } }
10078
+ \`\`\`
10079
+
10080
+ Other line types (e.g. \`job.started\`, \`stage.completed\`) may appear in the
10081
+ stream — branch on \`type === "scorecard.update"\` to isolate scorecard events.
10082
+
10083
+ ### 2. \`optimize watch <id> --json\`
10084
+
10085
+ Attaches to an already-running job and streams the same NDJSON shape. Useful
10086
+ when the agent started the job in a previous session or on behalf of a
10087
+ background process.
10088
+
10089
+ ### 3. \`optimize results <id> --json\`
10090
+
10091
+ Returns the complete result digest after job completion:
10092
+
10093
+ \`\`\`json
10094
+ {
10095
+ "job": { "id": "...", "status": "completed", ... },
10096
+ "prompt": { ... },
10097
+ "scorecard": {
10098
+ "rendered": "<ASCII scorecard string>",
10099
+ "data": [ ...ScorecardData[] ]
10100
+ }
10101
+ }
10102
+ \`\`\`
10103
+
10104
+ The \`scorecard.data\` array contains one entry per completed iteration in
10105
+ chronological order. \`scorecard.rendered\` is the ASCII terminal render of the
10106
+ final iteration (same bytes as the terminal output), included as a convenience
10107
+ for agents that want to surface a human-readable summary without re-rendering.
10108
+
10109
+ ---
10110
+
10111
+ ## \`ScorecardData\` shape
10112
+
10113
+ Reproduced from
10114
+ \`mutagent/src/framework/metatuner/output/scorecard.ts\`:
10115
+
10116
+ \`\`\`typescript
10117
+ interface ScorecardData {
10118
+ jobId: string;
10119
+ iteration: number;
10120
+ totalIterations: number | null; // null when maxIterations not configured
10121
+ timestamp: string; // ISO-8601
10122
+ stage: string; // always "result-analysis" for completed iterations
10123
+
10124
+ scores: {
10125
+ bestScore: number; // highest score seen across all iterations so far
10126
+ currentScore: number; // score for this iteration
10127
+ targetScore: number | null; // convergence threshold; null if not set
10128
+ };
10129
+
10130
+ criteria: Array<{
10131
+ name: string; // criterion name from the evaluation rubric
10132
+ scoreAvg: number; // mean LLM score for this criterion this iteration
10133
+ passRate: number; // fraction of dataset items passing (0.0–1.0)
10134
+ }>;
10135
+
10136
+ costs: {
10137
+ iterationUsd: number; // LLM cost for this iteration only
10138
+ cumulativeUsd: number; // total cost from job start through this iteration
10139
+ };
10140
+
10141
+ durations: {
10142
+ iterationMs: number; // wall-clock duration of this iteration
10143
+ stageBreakdown: Record<string, number>; // per-stage breakdown keyed by stage name (presumably milliseconds, like iterationMs — confirm against scorecard.ts)
10144
+ };
10145
+
10146
+ nextAction:
10147
+ | 'continue' // optimizer will run another iteration
10148
+ | 'stop-converged' // currentScore >= targetScore
10149
+ | 'stop-budget' // cost or token budget exhausted
10150
+ | 'stop-stagnation' // stagnationCount >= patience threshold
10151
+ | 'stop-max-iter'; // iteration count reached maxIterations
10152
+ }
10153
+ \`\`\`
10154
+
10155
+ Key fields at a glance:
10156
+
10157
+ | Field | Why it matters |
10158
+ |---|---|
10159
+ | \`scores.bestScore\` | The best the optimizer has achieved — compare to \`targetScore\` to estimate progress |
10160
+ | \`scores.currentScore\` | This iteration's score — use to detect regressions |
10161
+ | \`scores.targetScore\` | Convergence threshold; \`null\` means run until \`maxIterations\` |
10162
+ | \`criteria[].passRate\` | Per-criterion health — values below \`0.5\` flag a specific rubric as blocking |
10163
+ | \`costs.cumulativeUsd\` | Running cost — gate against any user-approved budget |
10164
+ | \`nextAction\` | The optimizer's own routing decision — surface \`stop-stagnation\` proactively |
10165
+
10166
+ ---
10167
+
10168
+ ## How an agent should use it
10169
+
10170
+ ### Per-iteration monitoring (streaming)
10171
+
10172
+ While consuming \`optimize start --watch --json\` or \`optimize watch <id> --json\`:
10173
+
10174
+ 1. **Score trend** — compare \`scores.currentScore\` against the previous
10175
+ iteration's \`currentScore\`. A sustained downward trend (3+ iterations)
10176
+ warrants surfacing to the user before the budget is spent.
10177
+
10178
+ 2. **\`nextAction\` gate** — if \`nextAction === "stop-stagnation"\`, the optimizer
10179
+ has detected a plateau and is about to halt. Surface this to the user with
10180
+ the current \`scores.bestScore\` and \`costs.cumulativeUsd\` so they can decide
10181
+ whether to continue with a higher patience budget or accept the result.
10182
+
10183
+ 3. **Cost gate** — if \`costs.cumulativeUsd\` crosses a threshold the user
10184
+ approved, alert immediately. The optimizer does not know about
10185
+ user-defined soft budgets; the agent is the enforcement layer.
10186
+
10187
+ ### Post-hoc analysis (\`optimize results\`)
10188
+
10189
+ 1. **Criterion drill-down** — iterate \`scorecard.data\` (all iterations) and
10190
+ collect \`criteria[].passRate\` per criterion. Any criterion with a mean
10191
+ \`passRate < 0.5\` across iterations did not benefit from optimization and
10192
+ should be flagged back to the user as potentially mis-specified or
10193
+ conflicting with another criterion.
10194
+
10195
+ 2. **Best-iteration identification** — find the entry where
10196
+ \`scores.currentScore === Math.max(...data.map(d => d.scores.currentScore))\`.
10197
+ This is the iteration the optimizer treats as \`bestIteration\`. Compare its
10198
+ \`criteria\` snapshot to the final iteration to detect regressions in
10199
+ individual rubrics even when the composite score improved.
10200
+
10201
+ 3. **Cost/iteration trade-off** — divide the final entry's \`costs.cumulativeUsd\`
10202
+ by its \`iteration\` to get the average cost per iteration. Present this when asking the
10203
+ user whether to re-run with more iterations.
10204
+
10205
+ ---
10206
+
10207
+ ## Parsing example
10208
+
10209
+ Minimal Node.js / Bun snippet — consume NDJSON from \`optimize start --watch --json\`
10210
+ and branch on \`nextAction\`:
10211
+
10212
+ \`\`\`js
10213
+ import { spawn } from 'node:child_process';
10214
+ import * as readline from 'node:readline';
10215
+
10216
+ const proc = spawn('mutagent', ['optimize', 'start', JOB_ID, '--watch', '--json']);
10217
+ const rl = readline.createInterface({ input: proc.stdout });
10218
+
10219
+ rl.on('line', (line) => {
10220
+ let event;
10221
+ try { event = JSON.parse(line); } catch { return; }
10222
+ if (event.type !== 'scorecard.update') return;
10223
+
10224
+ const { scorecard } = event;
10225
+ const { scores, costs, criteria, nextAction } = scorecard;
10226
+
10227
+ console.log(\`Iter \${scorecard.iteration}: score=\${scores.currentScore.toFixed(3)} cost=$\${costs.cumulativeUsd.toFixed(4)}\`);
10228
+
10229
+ if (nextAction === 'stop-stagnation') {
10230
+ // Surface to user — optimizer is about to halt without converging
10231
+ console.warn(\`[ALERT] Optimizer stagnated at score \${scores.bestScore}. Consider raising patience.\`);
10232
+ }
10233
+
10234
+ const weakCriteria = criteria.filter(c => c.passRate < 0.5).map(c => c.name);
10235
+ if (weakCriteria.length > 0) {
10236
+ console.warn(\`[ALERT] Low-pass criteria: \${weakCriteria.join(', ')}\`);
10237
+ }
10238
+ });
10239
+ \`\`\`
10240
+
10241
+ ---
10242
+
10243
+ ## Cross-references
10244
+
10245
+ - [concepts/eval-criteria.md](./eval-criteria.md) — how evaluation criteria are
10246
+ defined; \`criteria[].name\` in \`ScorecardData\` maps to \`name\` in the rubric.
10247
+ - [workflows/optimization.md](../workflows/optimization.md) — full optimization
10248
+ loop; the scorecard is produced at Step 8 (watch) and Step 9 (results).
9981
10249
  `,
9982
10250
  "workflows/agents.md": `---
9983
10251
  name: mutagent-cli-workflows-agents
@@ -12018,5 +12286,5 @@ if (isInteractive && !isSkillCommand) {
12018
12286
  }
12019
12287
  program.parse();
12020
12288
 
12021
- //# debugId=7B63A61BDC73994664756E2164756E21
12289
+ //# debugId=D625C9A316600B1764756E2164756E21
12022
12290
  //# sourceMappingURL=cli.js.map