npm - @axlsdk/studio - Versions diffs - 0.16.1 → 0.17.0 - Mend

@axlsdk/studio 0.16.1 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25)

package/README.md +3 -3
package/dist/{chunk-RE6VPUXA.js → chunk-GADFO7DZ.js} +159 -71
package/dist/chunk-GADFO7DZ.js.map +1 -0
package/dist/cli.cjs +173 -126
package/dist/cli.cjs.map +1 -1
package/dist/cli.js +36 -10
package/dist/cli.js.map +1 -1
package/dist/client/assets/index-C3yGF34O.js +313 -0
package/dist/client/assets/index-DNRVA4F2.css +1 -0
package/dist/client/index.html +2 -2
package/dist/middleware.cjs +205 -174
package/dist/middleware.cjs.map +1 -1
package/dist/middleware.js +37 -67
package/dist/middleware.js.map +1 -1
package/dist/server/index.cjs +158 -70
package/dist/server/index.cjs.map +1 -1
package/dist/server/index.d.cts +13 -0
package/dist/server/index.d.ts +13 -0
package/dist/server/index.js +1 -1
package/package.json +4 -4
package/dist/chunk-JGQ3MSIG.js +0 -80
package/dist/chunk-JGQ3MSIG.js.map +0 -1
package/dist/chunk-RE6VPUXA.js.map +0 -1
package/dist/client/assets/index-BzQe3w-R.js +0 -313
package/dist/client/assets/index-C2nTRFWX.css +0 -1

package/README.md CHANGED

@@ -153,10 +153,10 @@ Studio exposes a REST API that the SPA consumes. You can also call these directl
 | `DELETE /api/memory/:scope/:key` | Delete memory entry |
 | `GET /api/evals` | List registered eval configs |
 | `GET /api/evals/history` | List eval run history |
-| `POST /api/evals/:name/run` | Run a registered eval by name. Body: `{ runs?: N, stream?: true, captureTraces?: true }` (`runs` capped at 25). When `stream: true`, returns `{ evalRunId }` immediately and broadcasts progress over the `eval:{evalRunId}` WS channel (`item_done` / `run_done` / `done` / `error`). The `done` event carries only `{ evalResultId, runGroupId? }` — a pointer, not the full result — so multi-item evals don't hit the 64KB WS frame cap. Clients refetch the full result from history. `captureTraces: true` populates per-item `EvalItem.traces` on every item (success + failure); the Eval Runner panel renders these inline on item detail. Synchronous mode (default) is unchanged |
+| `POST /api/evals/:name/run` | Run a registered eval by name. Body: `{ runs?: N, stream?: true, captureTraces?: true }` (`runs` capped at 25). When `stream: true`, returns `{ evalRunId }` immediately and broadcasts progress over the `eval:{evalRunId}` WS channel: `item_done` per item, `run_done` per successful run, `run_failed` on a provider error, `run_cancelled` on user-initiated abort, terminal `done` (carrying only `{ evalResultId, runGroupId? }` plus `partial: true / batchCompleted / batchAttempted` and either `cancelled: true` OR `batchFailure` — never both — when the batch is partial), or terminal `error` if no runs completed. Clients refetch the full result from history. `captureTraces: true` populates per-item `EvalItem.traces` on every item (success + failure); the Eval Runner panel renders these inline on item detail. Synchronous mode (default) returns the full `EvalResult` enriched with `_multiRun.partial` markers when applicable |
 | `POST /api/evals/runs/:evalRunId/cancel` | Abort an active streaming eval run. The cancelled run appears in history with remaining items marked as cancelled |
 | `POST /api/evals/:name/rescore` | Re-score a history entry with the eval's current scorers |
-| `POST /api/evals/import` | Import a CLI eval artifact (parsed `EvalResult` JSON) into runtime history |
+| `POST /api/evals/import` | Import a CLI eval artifact (parsed `EvalResult` JSON) into runtime history. Body: `{ result: EvalResult \| EvalResult[], eval? }`. The CLI's `--output` writes a JSON array when `--runs N > 1` (including for partial batches), so array form is supported — each entry imports as its own history entry with shared `runGroupId`, rendering as a coherent group in the History tab. Single-object response is `{ id, eval, timestamp }`; array response is `{ imported: [{ id, eval, timestamp }, ...] }`. Per-entry validation; import is all-or-nothing |
 | `DELETE /api/evals/history/:id` | Delete a single history entry. Blocked in readOnly |
 | `POST /api/evals/compare` | Compare two eval results by history ID. Body: `{ baselineId, candidateId, options? }` where each ID is `string` (single run) or `string[]` (pooled multi-run). Resolves IDs server-side from `runtime.getEvalHistory()` so the wire payload stays small |
 | `POST /api/playground/chat` | Chat with an agent directly (no workflow required). Accepts `{ message, agent?, sessionId? }`. Streams results via WebSocket |
@@ -522,7 +522,7 @@ src/
 **Client:** React 19 SPA with Tailwind CSS v4, TanStack Query, and react-router-dom. Pre-built at publish time and served as static assets. Reads `window.__AXL_STUDIO_BASE__` for runtime base path configuration.
-**CLI:** Auto-detects and loads the user's config via tsx's `tsImport()` API (handles ESM/CJS correctly for TypeScript files without process-wide side effects), validates the runtime, starts the server, and optionally opens the browser.
+**CLI:** Auto-detects and loads the user's config. TypeScript files activate tsx's ESM loader hooks process-wide (registered once per process via `tsx/esm/api`'s `register()`), so chained imports of `.ts` workspace sources (e.g. resolved via `--conditions development`) are also transformed. Validates the runtime, starts the server, and optionally opens the browser.
 **Middleware:** `createStudioMiddleware()` wraps the Hono app as a Node.js `(req, res)` handler via `@hono/node-server`. Adds `verifyUpgrade` for WS auth, `readOnly` mode, and `basePath` injection into the SPA.

package/dist/{chunk-RE6VPUXA.js → chunk-GADFO7DZ.js} RENAMED

@@ -100,8 +100,11 @@ function redactEvalItem(item) {
 }
 function redactEvalResult(result, redact) {
   if (!redact) return result;
+  const meta = result.metadata;
+  const scrubbedMetadata = meta && typeof meta.batchFailure === "string" ? { ...meta, batchFailure: REDACTED } : result.metadata;
   return {
     ...result,
+    metadata: scrubbedMetadata,
     items: result.items.map(redactEvalItem)
   };
 }
@@ -896,13 +899,18 @@ function reduceEvalTrends(acc, entry) {
   const cost = extractCost(entry.data);
   const model = extractModel(entry.data);
   const duration = extractDuration(entry.data);
+  const metadata = entry.data?.metadata;
+  const runGroupId = typeof metadata?.runGroupId === "string" ? metadata.runGroupId : void 0;
+  const batchAttempted = typeof metadata?.batchAttempted === "number" && Number.isFinite(metadata.batchAttempted) ? metadata.batchAttempted : void 0;
   const run = {
     timestamp: entry.timestamp,
     id: entry.id,
     scores,
     cost,
     ...model !== void 0 ? { model } : {},
-    ...duration !== void 0 ? { duration } : {}
+    ...duration !== void 0 ? { duration } : {},
+    ...runGroupId !== void 0 ? { runGroupId } : {},
+    ...batchAttempted !== void 0 ? { batchAttempted } : {}
   };
   const byEval = { ...acc.byEval };
   const prev = byEval[entry.eval];
@@ -1585,33 +1593,79 @@ function createEvalRoutes(connMgr, evalLoader) {
           if (runs > 1) {
             const runGroupId = randomUUID();
             const results = [];
+            let runFailure;
+            let cancelled = false;
             for (let r = 0; r < runs; r++) {
-              if (ac.signal.aborted) break;
-              const result = await runtime.runRegisteredEval(name, {
-                metadata: { runGroupId, runIndex: r },
-                signal: ac.signal,
-                captureTraces,
-                onProgress: (event) => {
-                  if (event.type === "run_done") return;
+              if (ac.signal.aborted) {
+                cancelled = true;
+                break;
+              }
+              try {
+                const result = await runtime.runRegisteredEval(name, {
+                  metadata: { runGroupId, runIndex: r, batchAttempted: runs },
+                  signal: ac.signal,
+                  captureTraces,
+                  onProgress: (event) => {
+                    if (event.type === "run_done") return;
+                    connMgr.broadcastWithWildcard(`eval:${evalRunId}`, {
+                      ...event,
+                      run: r + 1,
+                      totalRuns: runs
+                    });
+                  }
+                });
+                results.push(result);
+                connMgr.broadcastWithWildcard(`eval:${evalRunId}`, {
+                  type: "run_done",
+                  run: r + 1,
+                  totalRuns: runs
+                });
+              } catch (err) {
+                const isAbort = ac.signal.aborted || err instanceof Error && err.name === "AbortError";
+                if (isAbort) {
+                  cancelled = true;
                   connMgr.broadcastWithWildcard(`eval:${evalRunId}`, {
-                    ...event,
+                    type: "run_cancelled",
                     run: r + 1,
                     totalRuns: runs
                   });
+                  break;
                 }
-              });
-              results.push(result);
-              connMgr.broadcastWithWildcard(`eval:${evalRunId}`, {
-                type: "run_done",
-                run: r + 1,
-                totalRuns: runs
-              });
+                runFailure = err instanceof Error ? err : new Error(String(err));
+                connMgr.broadcastWithWildcard(`eval:${evalRunId}`, {
+                  type: "run_failed",
+                  run: r + 1,
+                  totalRuns: runs,
+                  message: redactErrorMessage(runFailure, redactOn)
+                });
+                break;
+              }
             }
             if (results.length > 0) {
+              const partial = results.length < runs;
+              const failureMsg = runFailure ? redactErrorMessage(runFailure, redactOn) || String(runFailure) || void 0 : void 0;
               connMgr.broadcastWithWildcard(`eval:${evalRunId}`, {
                 type: "done",
                 evalResultId: results[0].id,
-                runGroupId
+                runGroupId,
+                ...partial && {
+                  partial: true,
+                  batchCompleted: results.length,
+                  batchAttempted: runs,
+                  // `cancelled` and `batchFailure` are mutually exclusive:
+                  // the catch block sets at most one of {cancelled,
+                  // runFailure}. The client uses `cancelled` to render a
+                  // neutral "Cancelled — X of N runs completed" caption
+                  // instead of the amber "Stopped after: <message>"
+                  // failure caption.
+                  ...cancelled ? { cancelled: true } : {},
+                  ...failureMsg ? { batchFailure: failureMsg } : {}
+                }
+              });
+            } else if (runFailure) {
+              connMgr.broadcastWithWildcard(`eval:${evalRunId}`, {
+                type: "error",
+                message: redactErrorMessage(runFailure, redactOn)
               });
             } else {
               connMgr.broadcastWithWildcard(`eval:${evalRunId}`, {
@@ -1649,19 +1703,38 @@ function createEvalRoutes(connMgr, evalLoader) {
         const { aggregateRuns } = await import("@axlsdk/eval");
         const runGroupId = randomUUID();
         const results = [];
+        let runFailure;
         for (let r = 0; r < runs; r++) {
-          const result2 = await runtime.runRegisteredEval(name, {
-            metadata: { runGroupId, runIndex: r },
-            captureTraces
-          });
-          results.push(result2);
+          try {
+            const result2 = await runtime.runRegisteredEval(name, {
+              metadata: { runGroupId, runIndex: r, batchAttempted: runs },
+              captureTraces
+            });
+            results.push(result2);
+          } catch (err) {
+            runFailure = err instanceof Error ? err : new Error(String(err));
+            break;
+          }
+        }
+        if (results.length === 0) {
+          throw runFailure ?? new Error("No runs completed");
         }
-        const typedResults = results;
-        const aggregate = aggregateRuns(typedResults);
-        const first = typedResults[0];
+        const aggregate = aggregateRuns(results);
+        const first = results[0];
+        const partial = results.length < runs;
+        const failureMsg = runFailure ? redactErrorMessage(runFailure, redactOn) || String(runFailure) || void 0 : void 0;
         const result = {
           ...first,
-          _multiRun: { aggregate, allRuns: typedResults }
+          _multiRun: {
+            aggregate,
+            allRuns: results,
+            ...partial && {
+              partial: true,
+              batchCompleted: results.length,
+              batchAttempted: runs,
+              ...failureMsg ? { batchFailure: failureMsg } : {}
+            }
+          }
         };
         return c.json({
           ok: true,
@@ -1833,60 +1906,75 @@ function createEvalRoutes(connMgr, evalLoader) {
     const runtime = c.get("runtime");
     const body = await c.req.json();
     const bad = (message) => c.json({ ok: false, error: { code: "BAD_REQUEST", message } }, 400);
-    if (!body.result || typeof body.result !== "object") {
+    if (body.result === void 0 || body.result === null) {
       return bad("result is required");
     }
-    const result = body.result;
-    if (!Array.isArray(result.items)) {
-      return bad("result.items must be an array");
-    }
-    if (typeof result.summary !== "object" || result.summary == null) {
-      return bad("result.summary must be an object");
-    }
-    if (typeof result.dataset !== "string" || !result.dataset) {
-      return bad("result.dataset must be a non-empty string (required for compare)");
+    const resultsRaw = Array.isArray(body.result) ? body.result : [body.result];
+    if (resultsRaw.length === 0) {
+      return bad("result must be a non-empty array or object");
     }
-    const summary = result.summary;
-    if (typeof summary.scorers !== "object" || summary.scorers == null) {
-      return bad("result.summary.scorers must be an object");
-    }
-    const summaryScorerNames = Object.keys(summary.scorers);
-    const items = result.items;
-    const summaryScorerSet = new Set(summaryScorerNames);
-    const uncoveredAcrossItems = /* @__PURE__ */ new Set();
-    for (const item of items) {
-      const itemScores = item?.scores;
-      if (itemScores && typeof itemScores === "object") {
-        for (const name of Object.keys(itemScores)) {
-          if (!summaryScorerSet.has(name)) uncoveredAcrossItems.add(name);
+    const validatedResults = [];
+    for (let i = 0; i < resultsRaw.length; i++) {
+      const entry = resultsRaw[i];
+      const prefix = resultsRaw.length > 1 ? `result[${i}]` : "result";
+      if (!entry || typeof entry !== "object") {
+        return bad(`${prefix} must be an object`);
+      }
+      const r = entry;
+      if (!Array.isArray(r.items)) {
+        return bad(`${prefix}.items must be an array`);
+      }
+      if (typeof r.summary !== "object" || r.summary == null) {
+        return bad(`${prefix}.summary must be an object`);
+      }
+      if (typeof r.dataset !== "string" || !r.dataset) {
+        return bad(`${prefix}.dataset must be a non-empty string (required for compare)`);
+      }
+      const summary = r.summary;
+      if (typeof summary.scorers !== "object" || summary.scorers == null) {
+        return bad(`${prefix}.summary.scorers must be an object`);
+      }
+      const summaryScorerNames = Object.keys(summary.scorers);
+      const items = r.items;
+      const summaryScorerSet = new Set(summaryScorerNames);
+      const uncoveredAcrossItems = /* @__PURE__ */ new Set();
+      for (const item of items) {
+        const itemScores = item?.scores;
+        if (itemScores && typeof itemScores === "object") {
+          for (const name of Object.keys(itemScores)) {
+            if (!summaryScorerSet.has(name)) uncoveredAcrossItems.add(name);
+          }
         }
       }
-    }
-    if (uncoveredAcrossItems.size > 0) {
-      return bad(
-        `item scores reference scorer(s) not in summary.scorers: ${[...uncoveredAcrossItems].join(", ")}`
-      );
+      if (uncoveredAcrossItems.size > 0) {
+        return bad(
+          `${prefix} item scores reference scorer(s) not in summary.scorers: ${[...uncoveredAcrossItems].join(", ")}`
+        );
+      }
+      validatedResults.push(r);
     }
     const trim = (v) => typeof v === "string" && v.trim() !== "" ? v.trim() : void 0;
-    const metadataObj = typeof result.metadata === "object" && result.metadata != null ? result.metadata : {};
+    const firstResult = validatedResults[0];
+    const metadataObj = typeof firstResult.metadata === "object" && firstResult.metadata != null ? firstResult.metadata : {};
     const workflowsFromMeta = Array.isArray(metadataObj.workflows) ? metadataObj.workflows : [];
     const primaryWorkflow = workflowsFromMeta.find((w) => typeof w === "string");
-    const evalName = trim(body.eval) ?? trim(primaryWorkflow) ?? // Legacy fallback: pre-0.14 CLI artifacts had workflow at the top level.
-    trim(result.workflow) ?? "imported";
-    const id = randomUUID();
+    const evalName = trim(body.eval) ?? trim(primaryWorkflow) ?? trim(firstResult.workflow) ?? "imported";
     const timestamp = Date.now();
-    const imported = {
-      ...result,
-      id,
-      metadata: typeof result.metadata === "object" && result.metadata != null ? result.metadata : {}
-    };
-    await runtime.saveEvalResult({
-      id,
-      eval: evalName,
-      timestamp,
-      data: imported
-    });
-    return c.json({ ok: true, data: { id, eval: evalName, timestamp } });
+    const imported = [];
+    for (const r of validatedResults) {
+      const id = randomUUID();
+      const entry = {
+        ...r,
+        id,
+        metadata: typeof r.metadata === "object" && r.metadata != null ? r.metadata : {}
+      };
+      await runtime.saveEvalResult({ id, eval: evalName, timestamp, data: entry });
+      imported.push({ id, eval: evalName, timestamp });
+    }
+    if (imported.length === 1) {
+      return c.json({ ok: true, data: imported[0] });
+    }
+    return c.json({ ok: true, data: { imported } });
   });
   function closeActiveRuns() {
     for (const ac of activeRuns.values()) ac.abort();
@@ -2210,4 +2298,4 @@ export {
   EvalAggregator,
   createServer
 };
-//# sourceMappingURL=chunk-RE6VPUXA.js.map
+//# sourceMappingURL=chunk-GADFO7DZ.js.map