@axlsdk/studio 0.16.1 → 0.17.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -153,10 +153,10 @@ Studio exposes a REST API that the SPA consumes. You can also call these directl
153
153
  | `DELETE /api/memory/:scope/:key` | Delete memory entry |
154
154
  | `GET /api/evals` | List registered eval configs |
155
155
  | `GET /api/evals/history` | List eval run history |
156
- | `POST /api/evals/:name/run` | Run a registered eval by name. Body: `{ runs?: N, stream?: true, captureTraces?: true }` (`runs` capped at 25). When `stream: true`, returns `{ evalRunId }` immediately and broadcasts progress over the `eval:{evalRunId}` WS channel (`item_done` / `run_done` / `done` / `error`). The `done` event carries only `{ evalResultId, runGroupId? }` — a pointer, not the full result — so multi-item evals don't hit the 64KB WS frame cap. Clients refetch the full result from history. `captureTraces: true` populates per-item `EvalItem.traces` on every item (success + failure); the Eval Runner panel renders these inline on item detail. Synchronous mode (default) is unchanged |
156
+ | `POST /api/evals/:name/run` | Run a registered eval by name. Body: `{ runs?: N, stream?: true, captureTraces?: true }` (`runs` capped at 25). When `stream: true`, returns `{ evalRunId }` immediately and broadcasts progress over the `eval:{evalRunId}` WS channel: `item_done` per item, `run_done` per successful run, `run_failed` on a provider error, `run_cancelled` on user-initiated abort, terminal `done` (carrying only `{ evalResultId, runGroupId? }` plus `partial: true / batchCompleted / batchAttempted` and either `cancelled: true` OR `batchFailure` — never both — when the batch is partial), or terminal `error` if no runs completed. Clients refetch the full result from history. `captureTraces: true` populates per-item `EvalItem.traces` on every item (success + failure); the Eval Runner panel renders these inline on item detail. Synchronous mode (default) returns the full `EvalResult` enriched with `_multiRun.partial` markers when applicable |
157
157
  | `POST /api/evals/runs/:evalRunId/cancel` | Abort an active streaming eval run. The cancelled run appears in history with remaining items marked as cancelled |
158
158
  | `POST /api/evals/:name/rescore` | Re-score a history entry with the eval's current scorers |
159
- | `POST /api/evals/import` | Import a CLI eval artifact (parsed `EvalResult` JSON) into runtime history |
159
+ | `POST /api/evals/import` | Import a CLI eval artifact (parsed `EvalResult` JSON) into runtime history. Body: `{ result: EvalResult \| EvalResult[], eval? }`. The CLI's `--output` writes a JSON array when `--runs N > 1` (including for partial batches), so array form is supported — each entry imports as its own history entry with shared `runGroupId`, rendering as a coherent group in the History tab. Single-object response is `{ id, eval, timestamp }`; array response is `{ imported: [{ id, eval, timestamp }, ...] }`. Per-entry validation; import is all-or-nothing |
160
160
  | `DELETE /api/evals/history/:id` | Delete a single history entry. Blocked in readOnly |
161
161
  | `POST /api/evals/compare` | Compare two eval results by history ID. Body: `{ baselineId, candidateId, options? }` where each ID is `string` (single run) or `string[]` (pooled multi-run). Resolves IDs server-side from `runtime.getEvalHistory()` so the wire payload stays small |
162
162
  | `POST /api/playground/chat` | Chat with an agent directly (no workflow required). Accepts `{ message, agent?, sessionId? }`. Streams results via WebSocket |
@@ -522,7 +522,7 @@ src/
522
522
 
523
523
  **Client:** React 19 SPA with Tailwind CSS v4, TanStack Query, and react-router-dom. Pre-built at publish time and served as static assets. Reads `window.__AXL_STUDIO_BASE__` for runtime base path configuration.
524
524
 
525
- **CLI:** Auto-detects and loads the user's config via tsx's `tsImport()` API (handles ESM/CJS correctly for TypeScript files without process-wide side effects), validates the runtime, starts the server, and optionally opens the browser.
525
+ **CLI:** Auto-detects and loads the user's config. TypeScript files activate tsx's ESM loader hooks process-wide (registered once per process via `tsx/esm/api`'s `register()`), so chained imports of `.ts` workspace sources (e.g. resolved via `--conditions development`) are also transformed. Validates the runtime, starts the server, and optionally opens the browser.
526
526
 
527
527
  **Middleware:** `createStudioMiddleware()` wraps the Hono app as a Node.js `(req, res)` handler via `@hono/node-server`. Adds `verifyUpgrade` for WS auth, `readOnly` mode, and `basePath` injection into the SPA.
528
528
 
@@ -100,8 +100,11 @@ function redactEvalItem(item) {
100
100
  }
101
101
  function redactEvalResult(result, redact) {
102
102
  if (!redact) return result;
103
+ const meta = result.metadata;
104
+ const scrubbedMetadata = meta && typeof meta.batchFailure === "string" ? { ...meta, batchFailure: REDACTED } : result.metadata;
103
105
  return {
104
106
  ...result,
107
+ metadata: scrubbedMetadata,
105
108
  items: result.items.map(redactEvalItem)
106
109
  };
107
110
  }
@@ -896,13 +899,18 @@ function reduceEvalTrends(acc, entry) {
896
899
  const cost = extractCost(entry.data);
897
900
  const model = extractModel(entry.data);
898
901
  const duration = extractDuration(entry.data);
902
+ const metadata = entry.data?.metadata;
903
+ const runGroupId = typeof metadata?.runGroupId === "string" ? metadata.runGroupId : void 0;
904
+ const batchAttempted = typeof metadata?.batchAttempted === "number" && Number.isFinite(metadata.batchAttempted) ? metadata.batchAttempted : void 0;
899
905
  const run = {
900
906
  timestamp: entry.timestamp,
901
907
  id: entry.id,
902
908
  scores,
903
909
  cost,
904
910
  ...model !== void 0 ? { model } : {},
905
- ...duration !== void 0 ? { duration } : {}
911
+ ...duration !== void 0 ? { duration } : {},
912
+ ...runGroupId !== void 0 ? { runGroupId } : {},
913
+ ...batchAttempted !== void 0 ? { batchAttempted } : {}
906
914
  };
907
915
  const byEval = { ...acc.byEval };
908
916
  const prev = byEval[entry.eval];
@@ -1585,33 +1593,79 @@ function createEvalRoutes(connMgr, evalLoader) {
1585
1593
  if (runs > 1) {
1586
1594
  const runGroupId = randomUUID();
1587
1595
  const results = [];
1596
+ let runFailure;
1597
+ let cancelled = false;
1588
1598
  for (let r = 0; r < runs; r++) {
1589
- if (ac.signal.aborted) break;
1590
- const result = await runtime.runRegisteredEval(name, {
1591
- metadata: { runGroupId, runIndex: r },
1592
- signal: ac.signal,
1593
- captureTraces,
1594
- onProgress: (event) => {
1595
- if (event.type === "run_done") return;
1599
+ if (ac.signal.aborted) {
1600
+ cancelled = true;
1601
+ break;
1602
+ }
1603
+ try {
1604
+ const result = await runtime.runRegisteredEval(name, {
1605
+ metadata: { runGroupId, runIndex: r, batchAttempted: runs },
1606
+ signal: ac.signal,
1607
+ captureTraces,
1608
+ onProgress: (event) => {
1609
+ if (event.type === "run_done") return;
1610
+ connMgr.broadcastWithWildcard(`eval:${evalRunId}`, {
1611
+ ...event,
1612
+ run: r + 1,
1613
+ totalRuns: runs
1614
+ });
1615
+ }
1616
+ });
1617
+ results.push(result);
1618
+ connMgr.broadcastWithWildcard(`eval:${evalRunId}`, {
1619
+ type: "run_done",
1620
+ run: r + 1,
1621
+ totalRuns: runs
1622
+ });
1623
+ } catch (err) {
1624
+ const isAbort = ac.signal.aborted || err instanceof Error && err.name === "AbortError";
1625
+ if (isAbort) {
1626
+ cancelled = true;
1596
1627
  connMgr.broadcastWithWildcard(`eval:${evalRunId}`, {
1597
- ...event,
1628
+ type: "run_cancelled",
1598
1629
  run: r + 1,
1599
1630
  totalRuns: runs
1600
1631
  });
1632
+ break;
1601
1633
  }
1602
- });
1603
- results.push(result);
1604
- connMgr.broadcastWithWildcard(`eval:${evalRunId}`, {
1605
- type: "run_done",
1606
- run: r + 1,
1607
- totalRuns: runs
1608
- });
1634
+ runFailure = err instanceof Error ? err : new Error(String(err));
1635
+ connMgr.broadcastWithWildcard(`eval:${evalRunId}`, {
1636
+ type: "run_failed",
1637
+ run: r + 1,
1638
+ totalRuns: runs,
1639
+ message: redactErrorMessage(runFailure, redactOn)
1640
+ });
1641
+ break;
1642
+ }
1609
1643
  }
1610
1644
  if (results.length > 0) {
1645
+ const partial = results.length < runs;
1646
+ const failureMsg = runFailure ? redactErrorMessage(runFailure, redactOn) || String(runFailure) || void 0 : void 0;
1611
1647
  connMgr.broadcastWithWildcard(`eval:${evalRunId}`, {
1612
1648
  type: "done",
1613
1649
  evalResultId: results[0].id,
1614
- runGroupId
1650
+ runGroupId,
1651
+ ...partial && {
1652
+ partial: true,
1653
+ batchCompleted: results.length,
1654
+ batchAttempted: runs,
1655
+ // `cancelled` and `batchFailure` are mutually exclusive:
1656
+ // the catch block sets at most one of {cancelled,
1657
+ // runFailure}. The client uses `cancelled` to render a
1658
+ // neutral "Cancelled — X of N runs completed" caption
1659
+ // instead of the amber "Stopped after: <message>"
1660
+ // failure caption.
1661
+ ...cancelled ? { cancelled: true } : {},
1662
+ ...failureMsg ? { batchFailure: failureMsg } : {}
1663
+ }
1664
+ });
1665
+ } else if (runFailure) {
1666
+ connMgr.broadcastWithWildcard(`eval:${evalRunId}`, {
1667
+ type: "error",
1668
+ message: redactErrorMessage(runFailure, redactOn)
1615
1669
  });
1616
1670
  } else {
1617
1671
  connMgr.broadcastWithWildcard(`eval:${evalRunId}`, {
@@ -1649,19 +1703,38 @@ function createEvalRoutes(connMgr, evalLoader) {
1649
1703
  const { aggregateRuns } = await import("@axlsdk/eval");
1650
1704
  const runGroupId = randomUUID();
1651
1705
  const results = [];
1706
+ let runFailure;
1652
1707
  for (let r = 0; r < runs; r++) {
1653
- const result2 = await runtime.runRegisteredEval(name, {
1654
- metadata: { runGroupId, runIndex: r },
1655
- captureTraces
1656
- });
1657
- results.push(result2);
1708
+ try {
1709
+ const result2 = await runtime.runRegisteredEval(name, {
1710
+ metadata: { runGroupId, runIndex: r, batchAttempted: runs },
1711
+ captureTraces
1712
+ });
1713
+ results.push(result2);
1714
+ } catch (err) {
1715
+ runFailure = err instanceof Error ? err : new Error(String(err));
1716
+ break;
1717
+ }
1718
+ }
1719
+ if (results.length === 0) {
1720
+ throw runFailure ?? new Error("No runs completed");
1658
1721
  }
1659
- const typedResults = results;
1660
- const aggregate = aggregateRuns(typedResults);
1661
- const first = typedResults[0];
1722
+ const aggregate = aggregateRuns(results);
1723
+ const first = results[0];
1724
+ const partial = results.length < runs;
1725
+ const failureMsg = runFailure ? redactErrorMessage(runFailure, redactOn) || String(runFailure) || void 0 : void 0;
1662
1726
  const result = {
1663
1727
  ...first,
1664
- _multiRun: { aggregate, allRuns: typedResults }
1728
+ _multiRun: {
1729
+ aggregate,
1730
+ allRuns: results,
1731
+ ...partial && {
1732
+ partial: true,
1733
+ batchCompleted: results.length,
1734
+ batchAttempted: runs,
1735
+ ...failureMsg ? { batchFailure: failureMsg } : {}
1736
+ }
1737
+ }
1665
1738
  };
1666
1739
  return c.json({
1667
1740
  ok: true,
@@ -1833,60 +1906,75 @@ function createEvalRoutes(connMgr, evalLoader) {
1833
1906
  const runtime = c.get("runtime");
1834
1907
  const body = await c.req.json();
1835
1908
  const bad = (message) => c.json({ ok: false, error: { code: "BAD_REQUEST", message } }, 400);
1836
- if (!body.result || typeof body.result !== "object") {
1909
+ if (body.result === void 0 || body.result === null) {
1837
1910
  return bad("result is required");
1838
1911
  }
1839
- const result = body.result;
1840
- if (!Array.isArray(result.items)) {
1841
- return bad("result.items must be an array");
1842
- }
1843
- if (typeof result.summary !== "object" || result.summary == null) {
1844
- return bad("result.summary must be an object");
1845
- }
1846
- if (typeof result.dataset !== "string" || !result.dataset) {
1847
- return bad("result.dataset must be a non-empty string (required for compare)");
1912
+ const resultsRaw = Array.isArray(body.result) ? body.result : [body.result];
1913
+ if (resultsRaw.length === 0) {
1914
+ return bad("result must be a non-empty array or object");
1848
1915
  }
1849
- const summary = result.summary;
1850
- if (typeof summary.scorers !== "object" || summary.scorers == null) {
1851
- return bad("result.summary.scorers must be an object");
1852
- }
1853
- const summaryScorerNames = Object.keys(summary.scorers);
1854
- const items = result.items;
1855
- const summaryScorerSet = new Set(summaryScorerNames);
1856
- const uncoveredAcrossItems = /* @__PURE__ */ new Set();
1857
- for (const item of items) {
1858
- const itemScores = item?.scores;
1859
- if (itemScores && typeof itemScores === "object") {
1860
- for (const name of Object.keys(itemScores)) {
1861
- if (!summaryScorerSet.has(name)) uncoveredAcrossItems.add(name);
1916
+ const validatedResults = [];
1917
+ for (let i = 0; i < resultsRaw.length; i++) {
1918
+ const entry = resultsRaw[i];
1919
+ const prefix = resultsRaw.length > 1 ? `result[${i}]` : "result";
1920
+ if (!entry || typeof entry !== "object") {
1921
+ return bad(`${prefix} must be an object`);
1922
+ }
1923
+ const r = entry;
1924
+ if (!Array.isArray(r.items)) {
1925
+ return bad(`${prefix}.items must be an array`);
1926
+ }
1927
+ if (typeof r.summary !== "object" || r.summary == null) {
1928
+ return bad(`${prefix}.summary must be an object`);
1929
+ }
1930
+ if (typeof r.dataset !== "string" || !r.dataset) {
1931
+ return bad(`${prefix}.dataset must be a non-empty string (required for compare)`);
1932
+ }
1933
+ const summary = r.summary;
1934
+ if (typeof summary.scorers !== "object" || summary.scorers == null) {
1935
+ return bad(`${prefix}.summary.scorers must be an object`);
1936
+ }
1937
+ const summaryScorerNames = Object.keys(summary.scorers);
1938
+ const items = r.items;
1939
+ const summaryScorerSet = new Set(summaryScorerNames);
1940
+ const uncoveredAcrossItems = /* @__PURE__ */ new Set();
1941
+ for (const item of items) {
1942
+ const itemScores = item?.scores;
1943
+ if (itemScores && typeof itemScores === "object") {
1944
+ for (const name of Object.keys(itemScores)) {
1945
+ if (!summaryScorerSet.has(name)) uncoveredAcrossItems.add(name);
1946
+ }
1862
1947
  }
1863
1948
  }
1864
- }
1865
- if (uncoveredAcrossItems.size > 0) {
1866
- return bad(
1867
- `item scores reference scorer(s) not in summary.scorers: ${[...uncoveredAcrossItems].join(", ")}`
1868
- );
1949
+ if (uncoveredAcrossItems.size > 0) {
1950
+ return bad(
1951
+ `${prefix} item scores reference scorer(s) not in summary.scorers: ${[...uncoveredAcrossItems].join(", ")}`
1952
+ );
1953
+ }
1954
+ validatedResults.push(r);
1869
1955
  }
1870
1956
  const trim = (v) => typeof v === "string" && v.trim() !== "" ? v.trim() : void 0;
1871
- const metadataObj = typeof result.metadata === "object" && result.metadata != null ? result.metadata : {};
1957
+ const firstResult = validatedResults[0];
1958
+ const metadataObj = typeof firstResult.metadata === "object" && firstResult.metadata != null ? firstResult.metadata : {};
1872
1959
  const workflowsFromMeta = Array.isArray(metadataObj.workflows) ? metadataObj.workflows : [];
1873
1960
  const primaryWorkflow = workflowsFromMeta.find((w) => typeof w === "string");
1874
- const evalName = trim(body.eval) ?? trim(primaryWorkflow) ?? // Legacy fallback: pre-0.14 CLI artifacts had workflow at the top level.
1875
- trim(result.workflow) ?? "imported";
1876
- const id = randomUUID();
1961
+ const evalName = trim(body.eval) ?? trim(primaryWorkflow) ?? trim(firstResult.workflow) ?? "imported";
1877
1962
  const timestamp = Date.now();
1878
- const imported = {
1879
- ...result,
1880
- id,
1881
- metadata: typeof result.metadata === "object" && result.metadata != null ? result.metadata : {}
1882
- };
1883
- await runtime.saveEvalResult({
1884
- id,
1885
- eval: evalName,
1886
- timestamp,
1887
- data: imported
1888
- });
1889
- return c.json({ ok: true, data: { id, eval: evalName, timestamp } });
1963
+ const imported = [];
1964
+ for (const r of validatedResults) {
1965
+ const id = randomUUID();
1966
+ const entry = {
1967
+ ...r,
1968
+ id,
1969
+ metadata: typeof r.metadata === "object" && r.metadata != null ? r.metadata : {}
1970
+ };
1971
+ await runtime.saveEvalResult({ id, eval: evalName, timestamp, data: entry });
1972
+ imported.push({ id, eval: evalName, timestamp });
1973
+ }
1974
+ if (imported.length === 1) {
1975
+ return c.json({ ok: true, data: imported[0] });
1976
+ }
1977
+ return c.json({ ok: true, data: { imported } });
1890
1978
  });
1891
1979
  function closeActiveRuns() {
1892
1980
  for (const ac of activeRuns.values()) ac.abort();
@@ -2210,4 +2298,4 @@ export {
2210
2298
  EvalAggregator,
2211
2299
  createServer
2212
2300
  };
2213
- //# sourceMappingURL=chunk-RE6VPUXA.js.map
2301
+ //# sourceMappingURL=chunk-GADFO7DZ.js.map