@axlsdk/studio 0.16.1 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/dist/{chunk-RE6VPUXA.js → chunk-GADFO7DZ.js} +159 -71
- package/dist/chunk-GADFO7DZ.js.map +1 -0
- package/dist/cli.cjs +173 -126
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +36 -10
- package/dist/cli.js.map +1 -1
- package/dist/client/assets/index-C3yGF34O.js +313 -0
- package/dist/client/assets/index-DNRVA4F2.css +1 -0
- package/dist/client/index.html +2 -2
- package/dist/middleware.cjs +205 -174
- package/dist/middleware.cjs.map +1 -1
- package/dist/middleware.js +37 -67
- package/dist/middleware.js.map +1 -1
- package/dist/server/index.cjs +158 -70
- package/dist/server/index.cjs.map +1 -1
- package/dist/server/index.d.cts +13 -0
- package/dist/server/index.d.ts +13 -0
- package/dist/server/index.js +1 -1
- package/package.json +4 -4
- package/dist/chunk-JGQ3MSIG.js +0 -80
- package/dist/chunk-JGQ3MSIG.js.map +0 -1
- package/dist/chunk-RE6VPUXA.js.map +0 -1
- package/dist/client/assets/index-BzQe3w-R.js +0 -313
- package/dist/client/assets/index-C2nTRFWX.css +0 -1
package/README.md
CHANGED
|
@@ -153,10 +153,10 @@ Studio exposes a REST API that the SPA consumes. You can also call these directl
|
|
|
153
153
|
| `DELETE /api/memory/:scope/:key` | Delete memory entry |
|
|
154
154
|
| `GET /api/evals` | List registered eval configs |
|
|
155
155
|
| `GET /api/evals/history` | List eval run history |
|
|
156
|
-
| `POST /api/evals/:name/run` | Run a registered eval by name. Body: `{ runs?: N, stream?: true, captureTraces?: true }` (`runs` capped at 25). When `stream: true`, returns `{ evalRunId }` immediately and broadcasts progress over the `eval:{evalRunId}` WS channel
|
|
156
|
+
| `POST /api/evals/:name/run` | Run a registered eval by name. Body: `{ runs?: N, stream?: true, captureTraces?: true }` (`runs` capped at 25). When `stream: true`, returns `{ evalRunId }` immediately and broadcasts progress over the `eval:{evalRunId}` WS channel: `item_done` per item, `run_done` per successful run, `run_failed` on a provider error, `run_cancelled` on user-initiated abort, terminal `done` (carrying only `{ evalResultId, runGroupId? }` plus `partial: true / batchCompleted / batchAttempted` and either `cancelled: true` OR `batchFailure` — never both — when the batch is partial), or terminal `error` if no runs completed. Clients refetch the full result from history. `captureTraces: true` populates per-item `EvalItem.traces` on every item (success + failure); the Eval Runner panel renders these inline on item detail. Synchronous mode (default) returns the full `EvalResult` enriched with `_multiRun.partial` markers when applicable |
|
|
157
157
|
| `POST /api/evals/runs/:evalRunId/cancel` | Abort an active streaming eval run. The cancelled run appears in history with remaining items marked as cancelled |
|
|
158
158
|
| `POST /api/evals/:name/rescore` | Re-score a history entry with the eval's current scorers |
|
|
159
|
-
| `POST /api/evals/import` | Import a CLI eval artifact (parsed `EvalResult` JSON) into runtime history |
|
|
159
|
+
| `POST /api/evals/import` | Import a CLI eval artifact (parsed `EvalResult` JSON) into runtime history. Body: `{ result: EvalResult \| EvalResult[], eval? }`. The CLI's `--output` writes a JSON array when `--runs N > 1` (including for partial batches), so array form is supported — each entry imports as its own history entry with shared `runGroupId`, rendering as a coherent group in the History tab. A single result (a bare object or a one-element array) responds with `{ id, eval, timestamp }`; a multi-entry array responds with `{ imported: [{ id, eval, timestamp }, ...] }`. Each entry is validated individually, and every entry must pass validation before any is saved, so a validation failure imports nothing |
|
|
160
160
|
| `DELETE /api/evals/history/:id` | Delete a single history entry. Blocked in readOnly |
|
|
161
161
|
| `POST /api/evals/compare` | Compare two eval results by history ID. Body: `{ baselineId, candidateId, options? }` where each ID is `string` (single run) or `string[]` (pooled multi-run). Resolves IDs server-side from `runtime.getEvalHistory()` so the wire payload stays small |
|
|
162
162
|
| `POST /api/playground/chat` | Chat with an agent directly (no workflow required). Accepts `{ message, agent?, sessionId? }`. Streams results via WebSocket |
|
|
@@ -522,7 +522,7 @@ src/
|
|
|
522
522
|
|
|
523
523
|
**Client:** React 19 SPA with Tailwind CSS v4, TanStack Query, and react-router-dom. Pre-built at publish time and served as static assets. Reads `window.__AXL_STUDIO_BASE__` for runtime base path configuration.
|
|
524
524
|
|
|
525
|
-
**CLI:** Auto-detects and loads the user's config via tsx's `
|
|
525
|
+
**CLI:** Auto-detects and loads the user's config. TypeScript files activate tsx's ESM loader hooks process-wide (registered once per process via `tsx/esm/api`'s `register()`), so chained imports of `.ts` workspace sources (e.g. resolved via `--conditions development`) are also transformed. Validates the runtime, starts the server, and optionally opens the browser.
|
|
526
526
|
|
|
527
527
|
**Middleware:** `createStudioMiddleware()` wraps the Hono app as a Node.js `(req, res)` handler via `@hono/node-server`. Adds `verifyUpgrade` for WS auth, `readOnly` mode, and `basePath` injection into the SPA.
|
|
528
528
|
|
|
@@ -100,8 +100,11 @@ function redactEvalItem(item) {
|
|
|
100
100
|
}
|
|
101
101
|
function redactEvalResult(result, redact) {
|
|
102
102
|
if (!redact) return result;
|
|
103
|
+
const meta = result.metadata;
|
|
104
|
+
const scrubbedMetadata = meta && typeof meta.batchFailure === "string" ? { ...meta, batchFailure: REDACTED } : result.metadata;
|
|
103
105
|
return {
|
|
104
106
|
...result,
|
|
107
|
+
metadata: scrubbedMetadata,
|
|
105
108
|
items: result.items.map(redactEvalItem)
|
|
106
109
|
};
|
|
107
110
|
}
|
|
@@ -896,13 +899,18 @@ function reduceEvalTrends(acc, entry) {
|
|
|
896
899
|
const cost = extractCost(entry.data);
|
|
897
900
|
const model = extractModel(entry.data);
|
|
898
901
|
const duration = extractDuration(entry.data);
|
|
902
|
+
const metadata = entry.data?.metadata;
|
|
903
|
+
const runGroupId = typeof metadata?.runGroupId === "string" ? metadata.runGroupId : void 0;
|
|
904
|
+
const batchAttempted = typeof metadata?.batchAttempted === "number" && Number.isFinite(metadata.batchAttempted) ? metadata.batchAttempted : void 0;
|
|
899
905
|
const run = {
|
|
900
906
|
timestamp: entry.timestamp,
|
|
901
907
|
id: entry.id,
|
|
902
908
|
scores,
|
|
903
909
|
cost,
|
|
904
910
|
...model !== void 0 ? { model } : {},
|
|
905
|
-
...duration !== void 0 ? { duration } : {}
|
|
911
|
+
...duration !== void 0 ? { duration } : {},
|
|
912
|
+
...runGroupId !== void 0 ? { runGroupId } : {},
|
|
913
|
+
...batchAttempted !== void 0 ? { batchAttempted } : {}
|
|
906
914
|
};
|
|
907
915
|
const byEval = { ...acc.byEval };
|
|
908
916
|
const prev = byEval[entry.eval];
|
|
@@ -1585,33 +1593,79 @@ function createEvalRoutes(connMgr, evalLoader) {
|
|
|
1585
1593
|
if (runs > 1) {
|
|
1586
1594
|
const runGroupId = randomUUID();
|
|
1587
1595
|
const results = [];
|
|
1596
|
+
let runFailure;
|
|
1597
|
+
let cancelled = false;
|
|
1588
1598
|
for (let r = 0; r < runs; r++) {
|
|
1589
|
-
if (ac.signal.aborted)
|
|
1590
|
-
|
|
1591
|
-
|
|
1592
|
-
|
|
1593
|
-
|
|
1594
|
-
|
|
1595
|
-
|
|
1599
|
+
if (ac.signal.aborted) {
|
|
1600
|
+
cancelled = true;
|
|
1601
|
+
break;
|
|
1602
|
+
}
|
|
1603
|
+
try {
|
|
1604
|
+
const result = await runtime.runRegisteredEval(name, {
|
|
1605
|
+
metadata: { runGroupId, runIndex: r, batchAttempted: runs },
|
|
1606
|
+
signal: ac.signal,
|
|
1607
|
+
captureTraces,
|
|
1608
|
+
onProgress: (event) => {
|
|
1609
|
+
if (event.type === "run_done") return;
|
|
1610
|
+
connMgr.broadcastWithWildcard(`eval:${evalRunId}`, {
|
|
1611
|
+
...event,
|
|
1612
|
+
run: r + 1,
|
|
1613
|
+
totalRuns: runs
|
|
1614
|
+
});
|
|
1615
|
+
}
|
|
1616
|
+
});
|
|
1617
|
+
results.push(result);
|
|
1618
|
+
connMgr.broadcastWithWildcard(`eval:${evalRunId}`, {
|
|
1619
|
+
type: "run_done",
|
|
1620
|
+
run: r + 1,
|
|
1621
|
+
totalRuns: runs
|
|
1622
|
+
});
|
|
1623
|
+
} catch (err) {
|
|
1624
|
+
const isAbort = ac.signal.aborted || err instanceof Error && err.name === "AbortError";
|
|
1625
|
+
if (isAbort) {
|
|
1626
|
+
cancelled = true;
|
|
1596
1627
|
connMgr.broadcastWithWildcard(`eval:${evalRunId}`, {
|
|
1597
|
-
|
|
1628
|
+
type: "run_cancelled",
|
|
1598
1629
|
run: r + 1,
|
|
1599
1630
|
totalRuns: runs
|
|
1600
1631
|
});
|
|
1632
|
+
break;
|
|
1601
1633
|
}
|
|
1602
|
-
|
|
1603
|
-
|
|
1604
|
-
|
|
1605
|
-
|
|
1606
|
-
|
|
1607
|
-
|
|
1608
|
-
|
|
1634
|
+
runFailure = err instanceof Error ? err : new Error(String(err));
|
|
1635
|
+
connMgr.broadcastWithWildcard(`eval:${evalRunId}`, {
|
|
1636
|
+
type: "run_failed",
|
|
1637
|
+
run: r + 1,
|
|
1638
|
+
totalRuns: runs,
|
|
1639
|
+
message: redactErrorMessage(runFailure, redactOn)
|
|
1640
|
+
});
|
|
1641
|
+
break;
|
|
1642
|
+
}
|
|
1609
1643
|
}
|
|
1610
1644
|
if (results.length > 0) {
|
|
1645
|
+
const partial = results.length < runs;
|
|
1646
|
+
const failureMsg = runFailure ? redactErrorMessage(runFailure, redactOn) || String(runFailure) || void 0 : void 0;
|
|
1611
1647
|
connMgr.broadcastWithWildcard(`eval:${evalRunId}`, {
|
|
1612
1648
|
type: "done",
|
|
1613
1649
|
evalResultId: results[0].id,
|
|
1614
|
-
runGroupId
|
|
1650
|
+
runGroupId,
|
|
1651
|
+
...partial && {
|
|
1652
|
+
partial: true,
|
|
1653
|
+
batchCompleted: results.length,
|
|
1654
|
+
batchAttempted: runs,
|
|
1655
|
+
// `cancelled` and `batchFailure` are mutually exclusive:
|
|
1656
|
+
// the catch block sets at most one of {cancelled,
|
|
1657
|
+
// runFailure}. The client uses `cancelled` to render a
|
|
1658
|
+
// neutral "Cancelled — X of N runs completed" caption
|
|
1659
|
+
// instead of the amber "Stopped after: <message>"
|
|
1660
|
+
// failure caption.
|
|
1661
|
+
...cancelled ? { cancelled: true } : {},
|
|
1662
|
+
...failureMsg ? { batchFailure: failureMsg } : {}
|
|
1663
|
+
}
|
|
1664
|
+
});
|
|
1665
|
+
} else if (runFailure) {
|
|
1666
|
+
connMgr.broadcastWithWildcard(`eval:${evalRunId}`, {
|
|
1667
|
+
type: "error",
|
|
1668
|
+
message: redactErrorMessage(runFailure, redactOn)
|
|
1615
1669
|
});
|
|
1616
1670
|
} else {
|
|
1617
1671
|
connMgr.broadcastWithWildcard(`eval:${evalRunId}`, {
|
|
@@ -1649,19 +1703,38 @@ function createEvalRoutes(connMgr, evalLoader) {
|
|
|
1649
1703
|
const { aggregateRuns } = await import("@axlsdk/eval");
|
|
1650
1704
|
const runGroupId = randomUUID();
|
|
1651
1705
|
const results = [];
|
|
1706
|
+
let runFailure;
|
|
1652
1707
|
for (let r = 0; r < runs; r++) {
|
|
1653
|
-
|
|
1654
|
-
|
|
1655
|
-
|
|
1656
|
-
|
|
1657
|
-
|
|
1708
|
+
try {
|
|
1709
|
+
const result2 = await runtime.runRegisteredEval(name, {
|
|
1710
|
+
metadata: { runGroupId, runIndex: r, batchAttempted: runs },
|
|
1711
|
+
captureTraces
|
|
1712
|
+
});
|
|
1713
|
+
results.push(result2);
|
|
1714
|
+
} catch (err) {
|
|
1715
|
+
runFailure = err instanceof Error ? err : new Error(String(err));
|
|
1716
|
+
break;
|
|
1717
|
+
}
|
|
1718
|
+
}
|
|
1719
|
+
if (results.length === 0) {
|
|
1720
|
+
throw runFailure ?? new Error("No runs completed");
|
|
1658
1721
|
}
|
|
1659
|
-
const
|
|
1660
|
-
const
|
|
1661
|
-
const
|
|
1722
|
+
const aggregate = aggregateRuns(results);
|
|
1723
|
+
const first = results[0];
|
|
1724
|
+
const partial = results.length < runs;
|
|
1725
|
+
const failureMsg = runFailure ? redactErrorMessage(runFailure, redactOn) || String(runFailure) || void 0 : void 0;
|
|
1662
1726
|
const result = {
|
|
1663
1727
|
...first,
|
|
1664
|
-
_multiRun: {
|
|
1728
|
+
_multiRun: {
|
|
1729
|
+
aggregate,
|
|
1730
|
+
allRuns: results,
|
|
1731
|
+
...partial && {
|
|
1732
|
+
partial: true,
|
|
1733
|
+
batchCompleted: results.length,
|
|
1734
|
+
batchAttempted: runs,
|
|
1735
|
+
...failureMsg ? { batchFailure: failureMsg } : {}
|
|
1736
|
+
}
|
|
1737
|
+
}
|
|
1665
1738
|
};
|
|
1666
1739
|
return c.json({
|
|
1667
1740
|
ok: true,
|
|
@@ -1833,60 +1906,75 @@ function createEvalRoutes(connMgr, evalLoader) {
|
|
|
1833
1906
|
const runtime = c.get("runtime");
|
|
1834
1907
|
const body = await c.req.json();
|
|
1835
1908
|
const bad = (message) => c.json({ ok: false, error: { code: "BAD_REQUEST", message } }, 400);
|
|
1836
|
-
if (
|
|
1909
|
+
if (body.result === void 0 || body.result === null) {
|
|
1837
1910
|
return bad("result is required");
|
|
1838
1911
|
}
|
|
1839
|
-
const
|
|
1840
|
-
if (
|
|
1841
|
-
return bad("result
|
|
1842
|
-
}
|
|
1843
|
-
if (typeof result.summary !== "object" || result.summary == null) {
|
|
1844
|
-
return bad("result.summary must be an object");
|
|
1845
|
-
}
|
|
1846
|
-
if (typeof result.dataset !== "string" || !result.dataset) {
|
|
1847
|
-
return bad("result.dataset must be a non-empty string (required for compare)");
|
|
1912
|
+
const resultsRaw = Array.isArray(body.result) ? body.result : [body.result];
|
|
1913
|
+
if (resultsRaw.length === 0) {
|
|
1914
|
+
return bad("result must be a non-empty array or object");
|
|
1848
1915
|
}
|
|
1849
|
-
const
|
|
1850
|
-
|
|
1851
|
-
|
|
1852
|
-
|
|
1853
|
-
|
|
1854
|
-
|
|
1855
|
-
|
|
1856
|
-
|
|
1857
|
-
|
|
1858
|
-
|
|
1859
|
-
|
|
1860
|
-
|
|
1861
|
-
|
|
1916
|
+
const validatedResults = [];
|
|
1917
|
+
for (let i = 0; i < resultsRaw.length; i++) {
|
|
1918
|
+
const entry = resultsRaw[i];
|
|
1919
|
+
const prefix = resultsRaw.length > 1 ? `result[${i}]` : "result";
|
|
1920
|
+
if (!entry || typeof entry !== "object") {
|
|
1921
|
+
return bad(`${prefix} must be an object`);
|
|
1922
|
+
}
|
|
1923
|
+
const r = entry;
|
|
1924
|
+
if (!Array.isArray(r.items)) {
|
|
1925
|
+
return bad(`${prefix}.items must be an array`);
|
|
1926
|
+
}
|
|
1927
|
+
if (typeof r.summary !== "object" || r.summary == null) {
|
|
1928
|
+
return bad(`${prefix}.summary must be an object`);
|
|
1929
|
+
}
|
|
1930
|
+
if (typeof r.dataset !== "string" || !r.dataset) {
|
|
1931
|
+
return bad(`${prefix}.dataset must be a non-empty string (required for compare)`);
|
|
1932
|
+
}
|
|
1933
|
+
const summary = r.summary;
|
|
1934
|
+
if (typeof summary.scorers !== "object" || summary.scorers == null) {
|
|
1935
|
+
return bad(`${prefix}.summary.scorers must be an object`);
|
|
1936
|
+
}
|
|
1937
|
+
const summaryScorerNames = Object.keys(summary.scorers);
|
|
1938
|
+
const items = r.items;
|
|
1939
|
+
const summaryScorerSet = new Set(summaryScorerNames);
|
|
1940
|
+
const uncoveredAcrossItems = /* @__PURE__ */ new Set();
|
|
1941
|
+
for (const item of items) {
|
|
1942
|
+
const itemScores = item?.scores;
|
|
1943
|
+
if (itemScores && typeof itemScores === "object") {
|
|
1944
|
+
for (const name of Object.keys(itemScores)) {
|
|
1945
|
+
if (!summaryScorerSet.has(name)) uncoveredAcrossItems.add(name);
|
|
1946
|
+
}
|
|
1862
1947
|
}
|
|
1863
1948
|
}
|
|
1864
|
-
|
|
1865
|
-
|
|
1866
|
-
|
|
1867
|
-
|
|
1868
|
-
|
|
1949
|
+
if (uncoveredAcrossItems.size > 0) {
|
|
1950
|
+
return bad(
|
|
1951
|
+
`${prefix} item scores reference scorer(s) not in summary.scorers: ${[...uncoveredAcrossItems].join(", ")}`
|
|
1952
|
+
);
|
|
1953
|
+
}
|
|
1954
|
+
validatedResults.push(r);
|
|
1869
1955
|
}
|
|
1870
1956
|
const trim = (v) => typeof v === "string" && v.trim() !== "" ? v.trim() : void 0;
|
|
1871
|
-
const
|
|
1957
|
+
const firstResult = validatedResults[0];
|
|
1958
|
+
const metadataObj = typeof firstResult.metadata === "object" && firstResult.metadata != null ? firstResult.metadata : {};
|
|
1872
1959
|
const workflowsFromMeta = Array.isArray(metadataObj.workflows) ? metadataObj.workflows : [];
|
|
1873
1960
|
const primaryWorkflow = workflowsFromMeta.find((w) => typeof w === "string");
|
|
1874
|
-
const evalName = trim(body.eval) ?? trim(primaryWorkflow) ??
|
|
1875
|
-
trim(result.workflow) ?? "imported";
|
|
1876
|
-
const id = randomUUID();
|
|
1961
|
+
const evalName = trim(body.eval) ?? trim(primaryWorkflow) ?? trim(firstResult.workflow) ?? "imported";
|
|
1877
1962
|
const timestamp = Date.now();
|
|
1878
|
-
const imported =
|
|
1879
|
-
|
|
1880
|
-
id
|
|
1881
|
-
|
|
1882
|
-
|
|
1883
|
-
|
|
1884
|
-
|
|
1885
|
-
|
|
1886
|
-
timestamp,
|
|
1887
|
-
|
|
1888
|
-
}
|
|
1889
|
-
|
|
1963
|
+
const imported = [];
|
|
1964
|
+
for (const r of validatedResults) {
|
|
1965
|
+
const id = randomUUID();
|
|
1966
|
+
const entry = {
|
|
1967
|
+
...r,
|
|
1968
|
+
id,
|
|
1969
|
+
metadata: typeof r.metadata === "object" && r.metadata != null ? r.metadata : {}
|
|
1970
|
+
};
|
|
1971
|
+
await runtime.saveEvalResult({ id, eval: evalName, timestamp, data: entry });
|
|
1972
|
+
imported.push({ id, eval: evalName, timestamp });
|
|
1973
|
+
}
|
|
1974
|
+
if (imported.length === 1) {
|
|
1975
|
+
return c.json({ ok: true, data: imported[0] });
|
|
1976
|
+
}
|
|
1977
|
+
return c.json({ ok: true, data: { imported } });
|
|
1890
1978
|
});
|
|
1891
1979
|
function closeActiveRuns() {
|
|
1892
1980
|
for (const ac of activeRuns.values()) ac.abort();
|
|
@@ -2210,4 +2298,4 @@ export {
|
|
|
2210
2298
|
EvalAggregator,
|
|
2211
2299
|
createServer
|
|
2212
2300
|
};
|
|
2213
|
-
//# sourceMappingURL=chunk-RE6VPUXA.js.map
|
|
2301
|
+
//# sourceMappingURL=chunk-GADFO7DZ.js.map
|