npm - @mutagent/cli - Versions diffs - 0.1.146 → 0.1.147 - Mend

@mutagent/cli 0.1.146 → 0.1.147

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/dist/bin/cli.js CHANGED Viewed

@@ -863,12 +863,24 @@ class SDKClientWrapper {
       this.handleError(error);
     }
   }
+  async getOptimizationScorecard(jobId) {
+    try {
+      const res = await this.request(`/api/optimization/${jobId}/results`);
+      if (res.scorecard?.rendered) {
+        return { rendered: res.scorecard.rendered, data: res.scorecard.data ?? [] };
+      }
+      return null;
+    } catch {
+      return null;
+    }
+  }
   async getOptimizationResults(jobId) {
     try {
       const job = await this.request(`/api/optimization/${jobId}`);
       const progress = await this.request(`/api/optimization/${jobId}/progress`);
       const prompt = await this.getPrompt(String(job.promptId ?? ""));
       const statesRes = await this.request(`/api/optimization/${jobId}/states`).catch(() => ({ states: [] }));
+      const scorecardDigest = await this.getOptimizationScorecard(jobId);
       const latestState = statesRes.states[statesRes.states.length - 1];
       const rawState = latestState?.state ?? {};
       const iterCtx = rawState.iterationContext ?? rawState.current?.context;
@@ -895,7 +907,8 @@ class SDKClientWrapper {
         datasetResults: extracted.datasetResults,
         failureModes: extracted.failureModes,
         mutations: extracted.mutations,
-        evaluationDetails: extracted.evaluationDetails
+        evaluationDetails: extracted.evaluationDetails,
+        ...scorecardDigest ? { scorecard: scorecardDigest } : {}
       };
     } catch (error) {
       this.handleError(error);
@@ -5253,7 +5266,7 @@ function createWatchClient(opts) {
     }
     if (msg.type !== "event" || !msg.event)
       return;
-    const { eventType, iteration, stage, data } = msg.event;
+    const { eventType, iteration, stage, data, scorecardPayload } = msg.event;
     switch (eventType) {
       case "stage:completed":
         opts.onStageComplete(stage ?? "unknown", iteration ?? 0, data?.stageResult ?? data ?? { type: "unknown" });
@@ -5271,6 +5284,11 @@ function createWatchClient(opts) {
         opts.onError(new Error(data?.error ?? "Optimization job failed"));
         close();
         break;
+      case "scorecard:update":
+        if (scorecardPayload && opts.onScorecardUpdate) {
+          opts.onScorecardUpdate(scorecardPayload);
+        }
+        break;
       default:
         break;
     }
@@ -5824,6 +5842,30 @@ async function startWatchStream(jobId, isJson, maxIterations, baselineScore) {
           console.error(chalk14.red(`✗ Watch error: ${error.message}`));
         }
         reject(error);
+      },
+      onScorecardUpdate: (payload) => {
+        if (payload.error) {
+          if (isJson) {
+            console.log(JSON.stringify({
+              type: "scorecard.update",
+              iteration: payload.iteration,
+              error: payload.error
+            }));
+          } else {
+            console.warn(chalk14.yellow(`⚠ Scorecard unavailable for iteration ${String(payload.iteration)} (${payload.error.type})`));
+          }
+          return;
+        }
+        if (isJson) {
+          console.log(JSON.stringify({
+            type: "scorecard.update",
+            iteration: payload.iteration,
+            scorecard: payload.scorecard
+          }));
+        } else {
+          console.log(payload.renderedScorecard);
+          console.log("");
+        }
       }
     });
     client.connect().catch(reject);
@@ -6370,7 +6412,12 @@ After viewing results:
         output.output({ ...resultData, _links: { optimizer: optimizerLink(resultsPromptId, jobId) }, _directive: directive });
         echoDirectiveToStderr(directive);
       } else {
-        renderScorecard(resultData);
+        const serverScorecard = resultData.scorecard;
+        if (serverScorecard?.rendered) {
+          console.log(serverScorecard.rendered);
+        } else {
+          renderScorecard(resultData);
+        }
         const jobData = resultData.job;
         const isCompleted = jobData?.status === "completed";
         if (options.diff) {
@@ -9978,6 +10025,227 @@ Use the delimiter field to:
 - [concepts/eval-criteria.md](./eval-criteria.md) → MVC (Minimum Viable Context) — uses delimiter to enumerate input params
 - Source: \`mutagent-cli/src/lib/explorer.ts\` → \`inferPromptVariables()\` and \`DiscoveredPrompt.delimiter\`
 - Tests: \`mutagent-cli/src/__tests__/lib/explorer.test.ts\`
+`,
+  "concepts/scorecard-output.md": `---
+name: mutagent-cli-concepts-scorecard-output
+description: |
+  Per-iteration structured scorecard emitted by the optimizer.
+  Covers the ScorecardData shape, where agents see it (watch vs results),
+  how to consume NDJSON streams, cost-gate patterns, and criterion drill-down.
+triggers:
+  - "scorecard"
+  - "scorecard output"
+  - "optimize scorecard"
+  - "scorecard data"
+  - "iteration scorecard"
+  - "nextAction"
+  - "stop-stagnation"
+  - "cumulativeUsd"
+  - "optimizer results json"
+  - "watch optimizer"
+---
+# Concept — Scorecard Output
+> Per-iteration structured scorecard emitted by the optimizer, available
+> progressively via \`--watch\` and as a post-hoc digest via \`optimize results\`.
+## What it is
+Every completed optimizer iteration produces a \`ScorecardData\` record: a
+structured snapshot of scores, criterion pass-rates, cumulative cost, and the
+optimizer's own next-action decision. The scorecard is the primary signal
+surface for agents monitoring or reacting to a running optimization job.
+Two delivery modes:
+- **Progressive (streaming)** — emitted once per completed iteration over the
+  WebSocket event bus while the job runs.
+- **Post-hoc (batch)** — the full collection of per-iteration scorecards
+  returned in the \`/api/optimization/:id/results\` response after the job
+  finishes (or to inspect a paused job).
+---
+## Where an agent sees it
+### 1. \`optimize start --watch --json\`
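+Streams NDJSON to stdout while the job runs. Each completed iteration emits one line (when the scorecard is unavailable, the line carries \`error\` in place of \`scorecard\`):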
+\`\`\`
+{ "type": "scorecard.update", "iteration": 1, "scorecard": { ...ScorecardData } }
+\`\`\`
+Other line types (e.g. \`job.started\`, \`stage.completed\`) may appear in the
+stream — branch on \`type === "scorecard.update"\` to isolate scorecard events.
+### 2. \`optimize watch <id> --json\`
+Attaches to an already-running job and streams the same NDJSON shape. Useful
+when the agent started the job in a previous session or on behalf of a
+background process.
+### 3. \`optimize results <id> --json\`
+Returns the complete result digest after job completion:
+\`\`\`json
+{
+  "job": { "id": "...", "status": "completed", ... },
+  "prompt": { ... },
+  "scorecard": {
+    "rendered": "<ASCII scorecard string>",
+    "data": [ ...ScorecardData[] ]
+  }
+}
+\`\`\`
+The \`scorecard.data\` array contains one entry per completed iteration in
+chronological order. \`scorecard.rendered\` is the ASCII terminal render of the
+final iteration (same bytes as the terminal output), included as a convenience
+for agents that want to surface a human-readable summary without re-rendering.
+---
+## \`ScorecardData\` shape
+Reproduced from
+\`mutagent/src/framework/metatuner/output/scorecard.ts\`:
+\`\`\`typescript
+interface ScorecardData {
+  jobId: string;
+  iteration: number;
+  totalIterations: number | null;  // null when maxIterations not configured
+  timestamp: string;               // ISO-8601
+  stage: string;                   // always "result-analysis" for completed iterations
+  scores: {
+    bestScore: number;             // highest score seen across all iterations so far
+    currentScore: number;          // score for this iteration
+    targetScore: number | null;    // convergence threshold; null if not set
+  };
+  criteria: Array<{
+    name: string;                  // criterion name from the evaluation rubric
+    scoreAvg: number;              // mean LLM score for this criterion this iteration
+    passRate: number;              // fraction of dataset items passing (0.0–1.0)
+  }>;
+  costs: {
+    iterationUsd: number;          // LLM cost for this iteration only
+    cumulativeUsd: number;         // total cost from job start through this iteration
+  };
+  durations: {
+    iterationMs: number;           // wall-clock duration of this iteration
+    stageBreakdown: Record<string, number>; // token counts keyed by stage name
+  };
+  nextAction:
+    | 'continue'          // optimizer will run another iteration
+    | 'stop-converged'    // currentScore >= targetScore
+    | 'stop-budget'       // cost or token budget exhausted
+    | 'stop-stagnation'   // stagnationCount >= patience threshold
+    | 'stop-max-iter';    // iteration count reached maxIterations
+}
+\`\`\`
+Key fields at a glance:
+| Field | Why it matters |
+|---|---|
+| \`scores.bestScore\` | The best the optimizer has achieved — compare to \`targetScore\` to estimate progress |
+| \`scores.currentScore\` | This iteration's score — use to detect regressions |
+| \`scores.targetScore\` | Convergence threshold; \`null\` disables \`stop-converged\`, so the job runs until \`maxIterations\`, budget, or stagnation ends it |
+| \`criteria[].passRate\` | Per-criterion health — values below \`0.5\` flag a specific rubric as blocking |
+| \`costs.cumulativeUsd\` | Running cost — gate against any user-approved budget |
+| \`nextAction\` | The optimizer's own routing decision — surface \`stop-stagnation\` proactively |
+---
+## How an agent should use it
+### Per-iteration monitoring (streaming)
+While consuming \`optimize start --watch --json\` or \`optimize watch <id> --json\`:
+1. **Score trend** — compare \`scores.currentScore\` against the previous
+   iteration's \`currentScore\`. A sustained downward trend (3+ iterations)
+   warrants surfacing to the user before the budget is spent.
+2. **\`nextAction\` gate** — if \`nextAction === "stop-stagnation"\`, the optimizer
+   has detected a plateau and is about to halt. Surface this to the user with
+   the current \`scores.bestScore\` and \`costs.cumulativeUsd\` so they can decide
+   whether to continue with a higher patience budget or accept the result.
+3. **Cost gate** — if \`costs.cumulativeUsd\` crosses the budget the user
+   approved, alert immediately. The optimizer does not know about
+   user-defined soft budgets; the agent is the enforcement layer.
+### Post-hoc analysis (\`optimize results\`)
+1. **Criterion drill-down** — iterate \`scorecard.data\` (all iterations) and
+   collect \`criteria[].passRate\` per criterion. Any criterion with a mean
+   \`passRate < 0.5\` across iterations did not benefit from optimization and
+   should be flagged back to the user as potentially mis-specified or
+   conflicting with another criterion.
+2. **Best-iteration identification** — find the entry where
+   \`scores.currentScore === Math.max(...data.map(d => d.scores.currentScore))\`.
+   This is the iteration the optimizer treats as \`bestIteration\`. Compare its
+   \`criteria\` snapshot to the final iteration to detect regressions in
+   individual rubrics even when the composite score improved.
+3. **Cost/iteration trade-off** — divide the final entry's
+   \`costs.cumulativeUsd\` by its \`iteration\` to get the average cost per
+   iteration. Present this when asking the user whether to re-run with more iterations.
+---
+## Parsing example
+Minimal Node.js / Bun snippet — consume NDJSON from \`optimize start --watch --json\`
+and branch on \`nextAction\`:
+\`\`\`js
+import { spawn } from 'node:child_process';
+import * as readline from 'node:readline';
+const proc = spawn('mutagent', ['optimize', 'start', JOB_ID, '--watch', '--json']);
+const rl = readline.createInterface({ input: proc.stdout });
+rl.on('line', (line) => {
+  let event;
+  try { event = JSON.parse(line); } catch { return; }
+  if (event.type !== 'scorecard.update' || !event.scorecard) return; // error lines carry no scorecard
+  const { scorecard } = event;
+  const { scores, costs, criteria, nextAction } = scorecard;
+  console.log(\`Iter \${scorecard.iteration}: score=\${scores.currentScore.toFixed(3)} cost=$\${costs.cumulativeUsd.toFixed(4)}\`);
+  if (nextAction === 'stop-stagnation') {
+    // Surface to user — optimizer is about to halt without converging
+    console.warn(\`[ALERT] Optimizer stagnated at score \${scores.bestScore}. Consider raising patience.\`);
+  }
+  const weakCriteria = criteria.filter(c => c.passRate < 0.5).map(c => c.name);
+  if (weakCriteria.length > 0) {
+    console.warn(\`[ALERT] Low-pass criteria: \${weakCriteria.join(', ')}\`);
+  }
+});
+\`\`\`
+---
+## Cross-references
+- [concepts/eval-criteria.md](./eval-criteria.md) — how evaluation criteria are
+  defined; \`criteria[].name\` in \`ScorecardData\` maps to \`name\` in the rubric.
+- [workflows/optimization.md](../workflows/optimization.md) — full optimization
+  loop; the scorecard is produced at Step 8 (watch) and Step 9 (results).
 `,
   "workflows/agents.md": `---
 name: mutagent-cli-workflows-agents
@@ -12018,5 +12286,5 @@ if (isInteractive && !isSkillCommand) {
 }
 program.parse();
-//# debugId=7B63A61BDC73994664756E2164756E21
+//# debugId=D625C9A316600B1764756E2164756E21
 //# sourceMappingURL=cli.js.map