@axlsdk/studio 0.17.9 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +51 -385
- package/dist/{chunk-WUCCIBQ6.js → chunk-CY4BTXRZ.js} +23 -1
- package/dist/chunk-CY4BTXRZ.js.map +1 -0
- package/dist/cli.cjs +22 -0
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +1 -1
- package/dist/client/assets/index-B90nxXYQ.js +313 -0
- package/dist/client/index.html +1 -1
- package/dist/middleware.cjs +22 -0
- package/dist/middleware.cjs.map +1 -1
- package/dist/middleware.js +1 -1
- package/dist/server/index.cjs +22 -0
- package/dist/server/index.cjs.map +1 -1
- package/dist/server/index.js +1 -1
- package/package.json +4 -4
- package/dist/chunk-WUCCIBQ6.js.map +0 -1
- package/dist/client/assets/index-Cskx93hn.js +0 -313
package/README.md
CHANGED
|
@@ -4,6 +4,10 @@
|
|
|
4
4
|
|
|
5
5
|
Local development UI for debugging, testing, and iterating on [Axl](https://github.com/axl-sdk/axl) agents and workflows.
|
|
6
6
|
|
|
7
|
+
<p align="center">
|
|
8
|
+
<img src="../../docs/assets/studio-trace-explorer.gif" alt="Axl Studio trace explorer — a waterfall of agent calls, tool calls, and retries with cost and duration per step" width="840">
|
|
9
|
+
</p>
|
|
10
|
+
|
|
7
11
|
## Installation
|
|
8
12
|
|
|
9
13
|
```bash
|
|
@@ -85,36 +89,55 @@ The `--conditions` flag is useful in monorepos where workspace packages use the
|
|
|
85
89
|
|
|
86
90
|
## Panels
|
|
87
91
|
|
|
92
|
+
Eight panels, all live over WebSocket. The screenshots below are placeholders — see [`docs/assets/CAPTURE.md`](../../docs/assets/CAPTURE.md) for how to record the real thing.
|
|
93
|
+
|
|
88
94
|
### Agent Playground
|
|
89
|
-
Chat with any registered agent in real-time. See streaming tokens, tool calls with expandable input/output, and multi-turn conversation history.
|
|
90
95
|
|
|
91
|
-
|
|
92
|
-
|
|
96
|
+
Chat with any registered agent in real time. Streaming tokens, tool calls with expandable input/output, and multi-turn history.
|
|
97
|
+
|
|
98
|
+
<p align="center">
|
|
99
|
+
<img src="../../docs/assets/studio-playground.gif" alt="Axl Studio agent playground — structured output streaming in char-by-char (typewriter)" width="760">
|
|
100
|
+
</p>
|
|
93
101
|
|
|
94
102
|
### Trace Explorer
|
|
95
|
-
Waterfall visualization of execution traces. Filter by type, agent, or tool. View token counts, cost per step, and duration.
|
|
96
103
|
|
|
97
|
-
|
|
104
|
+
Waterfall visualization of execution traces. Filter by type, agent, or tool; see token counts, cost per step, and duration. The Stats view shows the event-type distribution, top tools, and retry stacks.
|
|
105
|
+
|
|
106
|
+
> **Studio vs `ctx.events`.** Studio consumes the same `AxlEvent` firehose via `runtime.on('trace', …)` — every event from every execution. Inside a workflow handler, `ctx.events` is the in-handler counterpart (per-context, scoped to the current workflow). The two coexist: Studio is for cross-execution observability and replay; `ctx.events` is for in-handler streaming UIs. See [`docs/observability.md`](../../docs/observability.md#observation-paths).
|
|
98
107
|
|
|
99
108
|
### Cost Dashboard
|
|
100
|
-
Track spending across agents, models, workflows, and embedders with time-window filtering (24h/7d/30d/all). Live cost updates via WebSocket; all breakdown tables are user-sortable by any column. Two sections appear conditionally:
|
|
101
109
|
|
|
102
|
-
- **Retry Overhead**
|
|
103
|
-
|
|
110
|
+
Track spending across agents, models, workflows, and embedders with time-window filtering (24h/7d/30d/all). Sortable breakdown tables, a **Retry Overhead** section that decomposes cost by `retryReason` when retries happen, and a **Memory (Embedder)** section bucketed by embedder model. Sub-cent values use tiered precision so embedder costs don't collapse to `$0.0000`.
|
|
111
|
+
|
|
112
|
+
<p align="center">
|
|
113
|
+
<img src="../../docs/assets/studio-cost-dashboard.gif" alt="Axl Studio cost dashboard — spend by agent, model, and workflow with time-window filtering" width="840">
|
|
114
|
+
</p>
|
|
115
|
+
|
|
116
|
+
### Eval Runner
|
|
117
|
+
|
|
118
|
+
Run evaluations from the UI, watch items stream in, and drill into per-item scores, timing, cost, and LLM-judge reasoning. Compare two runs (baseline vs candidate) with bootstrap-CI significance, a score-distribution chart, and an item-level diff table. The History tab groups multi-run results and tracks mean scores across runs. Toggle **Capture traces** to render each item's events inline. Requires `@axlsdk/eval` as an optional peer dependency.
|
|
104
119
|
|
|
105
|
-
|
|
120
|
+
Amber banners flag runs you shouldn't fully trust — a scorer that failed on too many items, a subset run, or annotations dropped by the dataset schema — so a thinned or misconfigured run can't quietly look clean.
|
|
121
|
+
|
|
122
|
+
<p align="center">
|
|
123
|
+
<img src="../../docs/assets/studio-eval-runner.gif" alt="Axl Studio eval runner — score and duration trends by scorer and model across runs" width="840">
|
|
124
|
+
</p>
|
|
125
|
+
|
|
126
|
+
### Workflow Runner
|
|
127
|
+
|
|
128
|
+
Execute workflows with custom JSON input. View execution timelines showing each agent call, tool invocation, and cost. A Stats tab surfaces per-workflow p50/p95 and failure rates.
|
|
106
129
|
|
|
107
130
|
### Memory Browser
|
|
108
|
-
|
|
131
|
+
|
|
132
|
+
View and manage agent memory (session and global scope). Create, edit, and delete entries; test semantic recall queries.
|
|
109
133
|
|
|
110
134
|
### Session Manager
|
|
111
|
-
|
|
135
|
+
|
|
136
|
+
Browse active sessions with conversation history. Replay sessions step by step; view handoff chains between agents. Each assistant message is badged with its originating agent.
|
|
112
137
|
|
|
113
138
|
### Tool Inspector
|
|
114
|
-
Browse all registered tools with their schemas rendered as forms. Test any tool directly with custom input and see the result.
|
|
115
139
|
|
|
116
|
-
|
|
117
|
-
Run evaluations from the UI. Toggle **Capture traces** in the command bar to populate per-item `EvalItem.traces` — the item detail panel renders each captured event inline with type, agent/tool, duration, and cost (success and failure paths both). View per-item results with scores, timing, and cost. Drill into individual items to see LLM scorer reasoning, per-scorer timing/cost, and annotations. Filter items by error state or score threshold, sort by score/duration/cost. Score distribution chart shows how scores are spread across bins. Compare two runs with the run picker (baseline/candidate selection from history), timing/cost tradeoff analysis, item-level comparison table, and expandable regression detail showing side-by-side outputs and reasoning. History tab groups multi-run results and tracks mean scores across runs with an eval name filter. Multi-run switcher navigates between individual runs. LLM scorer badges distinguish LLM-judged from deterministic scorers. Significance tooltips explain bootstrap CI methodology. Requires `@axlsdk/eval` as an optional peer dependency.
|
|
140
|
+
Browse all registered tools with their schemas rendered as forms. Test any tool directly with custom input and see the result.
|
|
118
141
|
|
|
119
142
|
## What gets registered
|
|
120
143
|
|
|
@@ -129,83 +152,19 @@ Studio discovers your project through the `AxlRuntime` instance. Use these metho
|
|
|
129
152
|
|
|
130
153
|
Workflows are required for execution. Agents and tools are optional but recommended — they power the Playground agent picker and Tool Inspector panels. Evals require `@axlsdk/eval` as a peer dependency.
|
|
131
154
|
|
|
132
|
-
## API Endpoints
|
|
133
|
-
|
|
134
|
-
Studio exposes a REST API that the SPA consumes. You can also call these directly for scripting or testing.
|
|
135
|
-
|
|
136
|
-
| Endpoint | Description |
|
|
137
|
-
|----------|-------------|
|
|
138
|
-
| `GET /api/health` | Server status, registered workflow/agent/tool counts |
|
|
139
|
-
| `GET /api/workflows` | List all workflows with input/output schemas |
|
|
140
|
-
| `GET /api/workflows/:name` | Workflow detail |
|
|
141
|
-
| `POST /api/workflows/:name/execute` | Execute a workflow |
|
|
142
|
-
| `GET /api/agents` | List all agents |
|
|
143
|
-
| `GET /api/agents/:name` | Agent detail with config |
|
|
144
|
-
| `GET /api/tools` | List all tools with JSON Schema |
|
|
145
|
-
| `GET /api/tools/:name` | Tool detail |
|
|
146
|
-
| `POST /api/tools/:name/test` | Test a tool with `{ input: {...} }` |
|
|
147
|
-
| `GET /api/sessions` | List sessions |
|
|
148
|
-
| `GET /api/executions` | List executions |
|
|
149
|
-
| `GET /api/executions/:id` | Execution detail. `?since={step}` filters `events` to those with `step > since` (polling tail) |
|
|
150
|
-
| `POST /api/executions/:id/abort` | Abort a running execution (signal-driven; wakes paused `ctx.awaitHuman`) |
|
|
151
|
-
| `DELETE /api/executions/:id` | Delete an execution from history (GDPR scrub). Calls `runtime.deleteExecution` AND scrubs the WS replay buffer for `execution:{id}`. Returns `{ id, deleted: true }` or 404. Blocked in readOnly |
|
|
152
|
-
| `GET /api/costs?window=24h\|7d\|30d\|all` | Aggregated cost data for a time window (default `7d`). `?windows=all` returns all four windows at once for debugging |
|
|
153
|
-
| `GET /api/eval-trends?window=` | Per-eval score trends (latest, mean, std), cost totals, recent runs with `model`/`duration` |
|
|
154
|
-
| `GET /api/workflow-stats?window=` | Per-workflow totals, completed/failed counts, p50/p95/avg duration, failure rate |
|
|
155
|
-
| `GET /api/trace-stats?window=` | Event-type distribution, tool call counts (calls/approved/denied), retry breakdown by agent |
|
|
156
|
-
| `GET /api/memory/:scope/:key` | Read memory entry |
|
|
157
|
-
| `PUT /api/memory/:scope/:key` | Save memory entry |
|
|
158
|
-
| `DELETE /api/memory/:scope/:key` | Delete memory entry |
|
|
159
|
-
| `GET /api/evals` | List registered eval configs |
|
|
160
|
-
| `GET /api/evals/history` | List eval run history |
|
|
161
|
-
| `POST /api/evals/:name/run` | Run a registered eval by name. Body: `{ runs?: N, stream?: true, captureTraces?: true }` (`runs` capped at 25). When `stream: true`, returns `{ evalRunId }` immediately and broadcasts progress over the `eval:{evalRunId}` WS channel: `item_done` per item, `run_done` per successful run, `run_failed` on a provider error, `run_cancelled` on user-initiated abort, terminal `done` (carrying only `{ evalResultId, runGroupId? }` plus `partial: true / batchCompleted / batchAttempted` and either `cancelled: true` OR `batchFailure` — never both — when the batch is partial), or terminal `error` if no runs completed. Clients refetch the full result from history. `captureTraces: true` populates per-item `EvalItem.traces` on every item (success + failure); the Eval Runner panel renders these inline on item detail. Synchronous mode (default) returns the full `EvalResult` enriched with `_multiRun.partial` markers when applicable |
|
|
162
|
-
| `POST /api/evals/runs/:evalRunId/cancel` | Abort an active streaming eval run. The cancelled run appears in history with remaining items marked as cancelled |
|
|
163
|
-
| `POST /api/evals/:name/rescore` | Re-score a history entry with the eval's current scorers |
|
|
164
|
-
| `POST /api/evals/import` | Import a CLI eval artifact (parsed `EvalResult` JSON) into runtime history. Body: `{ result: EvalResult \| EvalResult[], eval? }`. The CLI's `--output` writes a JSON array when `--runs N > 1` (including for partial batches), so array form is supported — each entry imports as its own history entry with shared `runGroupId`, rendering as a coherent group in the History tab. Single-object response is `{ id, eval, timestamp }`; array response is `{ imported: [{ id, eval, timestamp }, ...] }`. Per-entry validation; import is all-or-nothing |
|
|
165
|
-
| `DELETE /api/evals/history/:id` | Delete a single history entry. Blocked in readOnly |
|
|
166
|
-
| `POST /api/evals/compare` | Compare two eval results by history ID. Body: `{ baselineId, candidateId, options? }` where each ID is `string` (single run) or `string[]` (pooled multi-run). Resolves IDs server-side from `runtime.getEvalHistory()` so the wire payload stays small |
|
|
167
|
-
| `POST /api/playground/chat` | Chat with an agent directly (no workflow required). Accepts `{ message, agent?, sessionId? }`. Streams results via WebSocket |
|
|
168
|
-
| `GET /api/decisions` | List pending decisions |
|
|
169
|
-
| `POST /api/decisions/:id/resolve` | Resolve a pending decision |
|
|
170
|
-
|
|
171
|
-
All endpoints return `{ ok: true, data: {...} }` on success or `{ ok: false, error: { code, message } }` on error.
|
|
172
|
-
|
|
173
|
-
### WebSocket
|
|
174
|
-
|
|
175
|
-
Single endpoint at `ws://localhost:4400/ws` with channel multiplexing:
|
|
176
|
-
|
|
177
|
-
```json
|
|
178
|
-
{ "type": "subscribe", "channel": "trace:*" }
|
|
179
|
-
{ "type": "event", "channel": "trace:abc-123", "data": { ... } }
|
|
180
|
-
```
|
|
181
|
-
|
|
182
|
-
Channels: `execution:{id}`, `trace:{id}`, `trace:*`, `eval:{id}`, `eval:{evalRunId}`, `eval:*`, `costs`, `eval-trends`, `workflow-stats`, `trace-stats`, `decisions`. Execution and eval channels have replay buffering — late subscribers receive the full event history (capped at 1000 events by default; tunable via `bufferCaps`, see below). Buffers are cleaned up 30s after the stream completes. Aggregate channels (`costs`, `eval-trends`, `workflow-stats`, `trace-stats`) broadcast `{ snapshots: Record<WindowId, State>, updatedAt }` on every fold or rebuild.
|
|
183
|
-
|
|
184
|
-
**Outbound frame budget.** The WS broadcast layer enforces a 64KB soft cap via `truncateIfOversized`. Oversized verbose-mode `agent_call_end.data.messages` snapshots are replaced with a `{ __truncated: true, originalBytes, maxBytes, hint }` placeholder that preserves the event's `type`/`step`/`agent`/`tool` so the Trace Explorer still renders the row. The 64KB threshold matches the inbound message reject limit in the WS protocol (shared constant).
|
|
185
|
-
|
|
186
|
-
### Migrating from 0.14
|
|
187
|
-
|
|
188
|
-
- **`POST /api/costs/reset` has been removed.** Any script hitting the old endpoint gets `404`. Use window selection (`?window=`) instead — snapshots evict automatically as their window slides.
|
|
189
|
-
- **`CostAggregator` class is no longer exported** from `@axlsdk/studio`. Replaced by `TraceAggregator<CostData>` configured with a pure `reduceCost` reducer. Behavior is preserved.
|
|
190
|
-
- **`costs` WS channel payload shape changed** from `CostData` to `{ snapshots: Record<WindowId, CostData>, updatedAt: number }`. Clients that read the old shape must select a window (typically `snapshots['7d']`).
|
|
191
|
-
|
|
192
155
|
## Embeddable Middleware
|
|
193
156
|
|
|
194
|
-
For
|
|
157
|
+
For apps using dependency injection (NestJS, etc.) or an existing HTTP server, Studio mounts as middleware instead of running as a standalone CLI. Works with Express, Fastify, Koa, NestJS, raw `http.Server`, and Hono-in-Hono.
|
|
195
158
|
|
|
196
159
|
```typescript
|
|
197
160
|
import express from 'express';
|
|
198
161
|
import { AxlRuntime } from '@axlsdk/axl';
|
|
199
162
|
import { createStudioMiddleware } from '@axlsdk/studio/middleware';
|
|
200
163
|
|
|
201
|
-
const runtime = new AxlRuntime({ providers: ['openai'] });
|
|
202
|
-
// ... register workflows, agents, tools ...
|
|
203
|
-
|
|
204
164
|
const studio = createStudioMiddleware({
|
|
205
165
|
runtime,
|
|
206
166
|
basePath: '/studio',
|
|
207
|
-
//
|
|
208
|
-
// This runs on WebSocket upgrades, which bypass Express middleware.
|
|
167
|
+
// WebSocket upgrades bypass Express middleware — always authenticate here.
|
|
209
168
|
verifyUpgrade: (req) => {
|
|
210
169
|
const url = new URL(req.url!, `http://${req.headers.host}`);
|
|
211
170
|
return url.searchParams.get('token') === process.env.MY_SECRET;
|
|
@@ -214,324 +173,31 @@ const studio = createStudioMiddleware({
|
|
|
214
173
|
|
|
215
174
|
const app = express();
|
|
216
175
|
app.use('/studio', studio.handler);
|
|
217
|
-
|
|
218
176
|
const server = app.listen(3000);
|
|
219
|
-
studio.upgradeWebSocket(server);
|
|
220
|
-
```
|
|
221
|
-
|
|
222
|
-
### Options
|
|
223
|
-
|
|
224
|
-
| Option | Type | Default | Description |
|
|
225
|
-
|--------|------|---------|-------------|
|
|
226
|
-
| `runtime` | `AxlRuntime` | required | The runtime instance to observe and control |
|
|
227
|
-
| `basePath` | `string` | `''` | URL path prefix (e.g., `'/studio'`) |
|
|
228
|
-
| `serveClient` | `boolean` | `true` | Serve the pre-built SPA |
|
|
229
|
-
| `verifyUpgrade` | `(req) => boolean \| { allowed: boolean, metadata?: unknown } \| Promise<...>` | — | Auth callback for WebSocket upgrades. The object form attaches `metadata` (tenant/user id / role) to the connection, available to `filterTraceEvent` on every outbound broadcast. Bare boolean still works (back-compat) |
|
|
230
|
-
| `filterTraceEvent` | `(event, metadata) => boolean` | — | Per-connection broadcast filter for multi-tenant deployments. Called on every outbound trace event (and on replay buffer events for late subscribers, so historical cross-tenant events can't leak on reconnect). Predicate errors are fail-closed — event is dropped |
|
|
231
|
-
| `readOnly` | `boolean` | `false` | Disable all mutating endpoints. `POST /api/evals/compare` is allowed (pure computation); `POST /api/evals/import`, `POST /api/evals/:name/run`, `POST /api/evals/:name/rescore`, `POST /api/evals/runs/:evalRunId/cancel`, `DELETE /api/evals/history/:id`, and `DELETE /api/executions/:id` are blocked (405 with `error.code: 'READ_ONLY'`) |
|
|
232
|
-
| `evals` | `string \| string[] \| { files, conditions? }` | — | Lazy-load eval files for the Eval Runner panel |
|
|
233
|
-
| `bufferCaps` | `{ maxEventsPerBuffer?, maxBytesPerBuffer?, maxActiveBuffers? }` | `{ 1000, 4 MiB, 256 }` | Override the default WebSocket replay-buffer resource caps for high-churn deployments. Worst-case memory is roughly `maxActiveBuffers × maxBytesPerBuffer` (≈1 GiB at defaults). Terminal `done`/`error` events are always buffered regardless of caps |
|
|
234
|
-
|
|
235
|
-
### Return value
|
|
236
|
-
|
|
237
|
-
| Property | Description |
|
|
238
|
-
|----------|-------------|
|
|
239
|
-
| `handler` | Node.js `(req, res)` handler for Express/Fastify/Koa/raw HTTP |
|
|
240
|
-
| `handleWebSocket(ws)` | Handle an individual WebSocket (framework-agnostic) |
|
|
241
|
-
| `upgradeWebSocket(server)` | Attach WS upgrade handling to an `http.Server` |
|
|
242
|
-
| `app` | Underlying Hono app (for Hono-in-Hono mounting) |
|
|
243
|
-
| `connectionManager` | WS connection/channel manager |
|
|
244
|
-
| `close()` | Shut down middleware (removes listeners, closes connections) |
|
|
245
|
-
|
|
246
|
-
**Note:** `upgradeWebSocket(server)` is required for real-time features (trace streaming, cost updates, execution events, decision resolution). Without it, the Studio SPA loads but panels relying on live data will show no updates. If your framework manages WebSocket connections itself (NestJS gateway, Fastify plugin), use `handleWebSocket()` instead.
|
|
247
|
-
|
|
248
|
-
### Host body limits
|
|
249
|
-
|
|
250
|
-
Studio's API uses small request bodies — the eval comparison flow sends history IDs (~100 bytes), not full result payloads — so the default body limits in Express, NestJS, Fastify, and Koa (typically 100KB) are sufficient for normal use.
|
|
251
|
-
|
|
252
|
-
The one exception is `POST /api/evals/import`, which accepts a full `EvalResult` JSON (typically a CLI artifact from `axl-eval --output result.json`). If you import sizeable eval files through Studio, raise your host framework's JSON body limit *on the Studio sub-mount only*.
|
|
253
|
-
|
|
254
|
-
**Express:**
|
|
255
|
-
|
|
256
|
-
```typescript
|
|
257
|
-
import express from 'express';
|
|
258
|
-
const app = express();
|
|
259
|
-
// Larger limit just for Studio; the rest of the app keeps its defaults.
|
|
260
|
-
app.use('/studio', express.json({ limit: '10mb' }), studio.handler);
|
|
261
|
-
```
|
|
262
|
-
|
|
263
|
-
**NestJS:** NestJS registers its own body-parser at bootstrap, so `app.use(express.json(...))` added after `NestFactory.create()` does *not* override it — the built-in parser runs first and still rejects with `PayloadTooLargeError`. Disable the built-in parser and register a conditional one:
|
|
264
|
-
|
|
265
|
-
```typescript
|
|
266
|
-
// main.ts
|
|
267
|
-
import { NestFactory, HttpAdapterHost } from '@nestjs/core';
|
|
268
|
-
import { json } from 'express';
|
|
269
|
-
import { AppModule } from './app.module';
|
|
270
|
-
import { createStudioMiddleware } from '@axlsdk/studio/middleware';
|
|
271
|
-
|
|
272
|
-
async function bootstrap() {
|
|
273
|
-
// Disable Nest's built-in body parser so we control limits ourselves.
|
|
274
|
-
const app = await NestFactory.create(AppModule, { bodyParser: false });
|
|
275
|
-
|
|
276
|
-
// Apply 10 MB limit to the Studio sub-mount only; rest of the app keeps
|
|
277
|
-
// the 100 KB default. This is the maintainer-endorsed pattern for
|
|
278
|
-
// per-route body limits in NestJS (see nestjs/nest#14734).
|
|
279
|
-
const studioJson = json({ limit: '10mb' });
|
|
280
|
-
const defaultJson = json();
|
|
281
|
-
app.use((req, res, next) =>
|
|
282
|
-
req.url.startsWith('/studio') ? studioJson(req, res, next) : defaultJson(req, res, next),
|
|
283
|
-
);
|
|
284
|
-
|
|
285
|
-
const studio = createStudioMiddleware({ runtime });
|
|
286
|
-
const expressApp = app.get(HttpAdapterHost).httpAdapter.getInstance();
|
|
287
|
-
expressApp.use('/studio', studio.handler);
|
|
288
|
-
studio.upgradeWebSocket(app.getHttpServer());
|
|
289
|
-
|
|
290
|
-
await app.listen(3000);
|
|
291
|
-
}
|
|
292
|
-
bootstrap();
|
|
293
|
-
```
|
|
294
|
-
|
|
295
|
-
> `app.useBodyParser('json', { limit })` raises the limit **globally**, not per-route — avoid it if you want the larger limit scoped to Studio.
|
|
296
|
-
|
|
297
|
-
**Fastify:** set `bodyLimit` on the Fastify instance or pass it via `fastify({ bodyLimit: 10 * 1024 * 1024 })`. There's no per-route equivalent as clean as Express's; if Studio is the only route that needs a larger limit, either raise the global limit or mount Studio on a separate Fastify instance.
|
|
298
|
-
|
|
299
|
-
### Framework examples
|
|
300
|
-
|
|
301
|
-
#### NestJS
|
|
302
|
-
|
|
303
|
-
```typescript
|
|
304
|
-
import { Module, OnModuleInit, OnModuleDestroy } from '@nestjs/common';
|
|
305
|
-
import { HttpAdapterHost } from '@nestjs/core';
|
|
306
|
-
import { createStudioMiddleware, type StudioMiddleware } from '@axlsdk/studio/middleware';
|
|
307
|
-
|
|
308
|
-
@Module({ /* ... */ })
|
|
309
|
-
export class AppModule implements OnModuleInit, OnModuleDestroy {
|
|
310
|
-
private studio!: StudioMiddleware;
|
|
311
|
-
|
|
312
|
-
constructor(
|
|
313
|
-
private readonly httpAdapterHost: HttpAdapterHost,
|
|
314
|
-
private readonly runtime: AxlRuntime, // injected via custom provider
|
|
315
|
-
) {}
|
|
316
|
-
|
|
317
|
-
onModuleInit() {
|
|
318
|
-
this.studio = createStudioMiddleware({
|
|
319
|
-
runtime: this.runtime,
|
|
320
|
-
basePath: '/studio',
|
|
321
|
-
verifyUpgrade: (req) => req.headers['authorization'] === `Bearer ${process.env.MY_SECRET}`,
|
|
322
|
-
});
|
|
323
|
-
|
|
324
|
-
// Mount on the underlying Express instance — this is the recommended
|
|
325
|
-
// NestJS pattern for sub-application mounting (see NestJS HTTP adapter docs).
|
|
326
|
-
const expressApp = this.httpAdapterHost.httpAdapter.getInstance();
|
|
327
|
-
expressApp.use('/studio', this.studio.handler);
|
|
328
|
-
this.studio.upgradeWebSocket(this.httpAdapterHost.httpAdapter.getHttpServer());
|
|
329
|
-
}
|
|
330
|
-
|
|
331
|
-
onModuleDestroy() {
|
|
332
|
-
this.studio.close();
|
|
333
|
-
}
|
|
334
|
-
}
|
|
335
|
-
```
|
|
336
|
-
|
|
337
|
-
#### Fastify
|
|
338
|
-
|
|
339
|
-
```typescript
|
|
340
|
-
import Fastify from 'fastify';
|
|
341
|
-
import middie from '@fastify/middie';
|
|
342
|
-
import { createStudioMiddleware } from '@axlsdk/studio/middleware';
|
|
343
|
-
|
|
344
|
-
const studio = createStudioMiddleware({ runtime, basePath: '/studio' });
|
|
345
|
-
const fastify = Fastify();
|
|
346
|
-
|
|
347
|
-
await fastify.register(middie);
|
|
348
|
-
fastify.use('/studio', studio.handler);
|
|
349
|
-
|
|
350
|
-
await fastify.listen({ port: 3000 });
|
|
351
|
-
studio.upgradeWebSocket(fastify.server);
|
|
352
|
-
```
|
|
353
|
-
|
|
354
|
-
#### Raw Node.js
|
|
355
|
-
|
|
356
|
-
```typescript
|
|
357
|
-
import { createServer } from 'node:http';
|
|
358
|
-
import { createStudioMiddleware } from '@axlsdk/studio/middleware';
|
|
359
|
-
|
|
360
|
-
const studio = createStudioMiddleware({ runtime });
|
|
361
|
-
const server = createServer(studio.handler);
|
|
362
|
-
studio.upgradeWebSocket(server);
|
|
363
|
-
server.listen(3000);
|
|
364
|
-
```
|
|
365
|
-
|
|
366
|
-
#### Hono-in-Hono
|
|
367
|
-
|
|
368
|
-
```typescript
|
|
369
|
-
import { Hono } from 'hono';
|
|
370
|
-
import { createStudioMiddleware, handleWsMessage } from '@axlsdk/studio/middleware';
|
|
371
|
-
|
|
372
|
-
const studio = createStudioMiddleware({ runtime, basePath: '/studio' });
|
|
373
|
-
const app = new Hono();
|
|
374
|
-
app.route('/studio', studio.app);
|
|
375
|
-
// Wire WebSocket via Hono's native WS support — see spec for full example
|
|
376
|
-
```
|
|
377
|
-
|
|
378
|
-
### Important: `basePath` must match your mount path
|
|
379
|
-
|
|
380
|
-
`basePath` tells the SPA where it's mounted in the browser URL. It must match the path in your framework's mount call:
|
|
381
|
-
|
|
382
|
-
```typescript
|
|
383
|
-
// These must match:
|
|
384
|
-
createStudioMiddleware({ basePath: '/studio' }) // tells the SPA
|
|
385
|
-
app.use('/studio', studio.handler) // tells Express
|
|
386
|
-
```
|
|
387
|
-
|
|
388
|
-
If they don't match, the SPA will load but API calls will fail (the SPA sends requests to the wrong path).
|
|
389
|
-
|
|
390
|
-
### Lazy eval loading
|
|
391
|
-
|
|
392
|
-
In monorepos, eval files often import from domain modules (prompt builders, validators, fixture datasets) that would create circular dependencies if statically imported from the module that owns the runtime. The `evals` option solves this by dynamically importing eval files on first access to the Eval Runner panel — never during normal API operation.
|
|
393
|
-
|
|
394
|
-
```typescript
|
|
395
|
-
const studio = createStudioMiddleware({
|
|
396
|
-
runtime,
|
|
397
|
-
basePath: '/studio',
|
|
398
|
-
evals: 'evals/**/*.eval.ts',
|
|
399
|
-
});
|
|
400
|
-
```
|
|
401
|
-
|
|
402
|
-
Eval files are standalone entry points (like `axl.config.ts`). They can import from any module without creating circular deps in the static module graph, and `@axlsdk/eval` can remain a `devDependency` since bundlers can't see dynamic `import()` calls.
|
|
403
|
-
|
|
404
|
-
**Multiple patterns or explicit paths:**
|
|
405
|
-
|
|
406
|
-
```typescript
|
|
407
|
-
evals: ['evals/*.eval.ts', 'tests/evals/*.eval.ts']
|
|
408
|
-
```
|
|
409
|
-
|
|
410
|
-
**Monorepo import conditions** (process-wide via `module.register()`):
|
|
411
|
-
|
|
412
|
-
```typescript
|
|
413
|
-
evals: {
|
|
414
|
-
files: 'libs/api/evals/*.eval.ts',
|
|
415
|
-
conditions: ['development'],
|
|
416
|
-
}
|
|
417
|
-
```
|
|
418
|
-
|
|
419
|
-
Each file should `export default` a config with `{ workflow, dataset, scorers }` (the result of `defineEval()`). By default, the runtime executes the named workflow for each dataset item. For self-contained evals that don't depend on a registered workflow, export an `executeWorkflow` function — it will be called instead of `runtime.execute()`. See the [`@axlsdk/eval` README](../axl-eval/README.md#defineevalconfig) for details.
|
|
420
|
-
|
|
421
|
-
Eval names are the file's path relative to the project root (`cwd`), minus the `.eval.*` suffix:
|
|
422
|
-
|
|
423
|
-
```
|
|
424
|
-
evals/suggestions.eval.ts → "evals/suggestions"
|
|
425
|
-
evals/api/accuracy.eval.ts → "evals/api/accuracy"
|
|
426
|
-
libs/search/accuracy.eval.ts → "libs/search/accuracy"
|
|
177
|
+
studio.upgradeWebSocket(server); // required for live data
|
|
427
178
|
```
|
|
428
179
|
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
Lazy-loaded evals coexist with evals registered directly via `runtime.registerEval()`.
|
|
432
|
-
|
|
433
|
-
**Important notes:**
|
|
180
|
+
Key options: `readOnly` (disable mutating endpoints for production monitoring), `evals` (lazy-load eval files), `filterTraceEvent` (per-tenant broadcast scoping), `bufferCaps` (WS replay-buffer limits). `basePath` must match your framework's mount path.
|
|
434
181
|
|
|
435
|
-
|
|
436
|
-
- **Running nested evals**: Names containing `/` must be URL-encoded in the run endpoint: `POST /api/evals/api%2Faccuracy/run`.
|
|
437
|
-
- **Name stability**: Names are project-relative paths, so they never change when other files or patterns are added/removed.
|
|
438
|
-
- **Supported glob patterns**: `dir/*.eval.ts` (single directory), `dir/**/*.eval.ts` (recursive), `**/*.eval.ts` (recursive from cwd). Multi-segment `**` (e.g., `a/**/b/**/*.ts`) is not supported.
|
|
182
|
+
**See [`docs/studio-api.md`](../../docs/studio-api.md) for the full reference:** every REST endpoint, the WebSocket protocol, the complete middleware options/return tables, NestJS/Fastify/Hono examples, host body-limit guidance, lazy eval loading, multi-tenant setup, and the internal architecture.
|
|
439
183
|
|
|
440
|
-
|
|
184
|
+
## Security
|
|
441
185
|
|
|
442
|
-
- **Always** provide `verifyUpgrade` — WebSocket upgrades bypass Express/Fastify/Koa middleware, so your auth middleware does **not** protect WebSocket connections
|
|
443
|
-
- Consider `readOnly: true` for production monitoring — view traces, costs, and schemas without execution capability
|
|
444
|
-
- CORS is not applied in embedded mode — the host framework owns CORS policy
|
|
445
|
-
- `basePath` is validated against unsafe characters and path traversal
|
|
186
|
+
- **Always** provide `verifyUpgrade` — WebSocket upgrades bypass Express/Fastify/Koa middleware, so your auth middleware does **not** protect WebSocket connections.
|
|
187
|
+
- Consider `readOnly: true` for production monitoring — view traces, costs, and schemas without execution capability.
|
|
188
|
+
- CORS is not applied in embedded mode — the host framework owns CORS policy.
|
|
189
|
+
- `basePath` is validated against unsafe characters and path traversal.
|
|
446
190
|
|
|
447
|
-
###
|
|
191
|
+
### Redaction
|
|
448
192
|
|
|
449
|
-
When the runtime is constructed with `config.trace.redact: true`, Studio scrubs user/LLM content at three
|
|
193
|
+
When the runtime is constructed with `config.trace.redact: true`, Studio scrubs user/LLM content at three boundaries (trace emission, REST serialization, WS broadcast) while preserving structural metadata (IDs, names, roles, cost/token/duration, timestamps).
|
|
450
194
|
|
|
451
195
|
```typescript
|
|
452
196
|
const runtime = new AxlRuntime({ trace: { redact: true } });
|
|
453
197
|
const studio = createStudioMiddleware({ runtime });
|
|
454
198
|
```
|
|
455
199
|
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
**`DELETE /api/executions/:id` is a second cleanup boundary** alongside redaction. Redaction scrubs *content* on read; the delete endpoint removes the *whole row + indexes + checkpoints + suspended state + streaming buffer + pending decisions* AND scrubs the WebSocket replay buffer for `execution:{id}` so late subscribers can't reconstruct events for a deleted run. Audit via `runtime.on('execution_deleted', ...)`.
|
|
459
|
-
|
|
460
|
-
Studio checks the flag via `runtime.isRedactEnabled(): boolean` — it does **not** reach into the config object directly, because `Readonly<AxlConfig>` is shallow and consumers could mutate the nested `trace.redact` field via sub-object access. `GET /api/health` also reports `readOnly: boolean` so clients can gate mutating UI affordances.
|
|
461
|
-
|
|
462
|
-
See [`docs/observability.md`](../../docs/observability.md#pii-and-redaction) for the complete scrubbed/preserved field table.
|
|
463
|
-
|
|
464
|
-
### Multi-tenant deployments
|
|
465
|
-
|
|
466
|
-
Combine `verifyUpgrade` returning `{ allowed, metadata }` with `filterTraceEvent` to scope each WebSocket connection to a tenant/user:
|
|
467
|
-
|
|
468
|
-
```typescript
|
|
469
|
-
const studio = createStudioMiddleware({
|
|
470
|
-
runtime,
|
|
471
|
-
verifyUpgrade: (req) => {
|
|
472
|
-
const userId = authenticate(req);
|
|
473
|
-
if (!userId) return { allowed: false };
|
|
474
|
-
return { allowed: true, metadata: { userId, tenantId: lookupTenant(userId) } };
|
|
475
|
-
},
|
|
476
|
-
filterTraceEvent: (event, metadata) => {
|
|
477
|
-
// Scope the trace firehose: only let a connection see its own tenant's events.
|
|
478
|
-
return event.metadata?.tenantId === metadata?.tenantId;
|
|
479
|
-
},
|
|
480
|
-
});
|
|
481
|
-
```
|
|
482
|
-
|
|
483
|
-
The filter runs on live broadcasts **and** on replay buffer events delivered to late subscribers, so historical cross-tenant events can't leak on reconnect. Predicate errors are fail-closed (event dropped).
|
|
484
|
-
|
|
485
|
-
### Migrating from the standalone CLI
|
|
486
|
-
|
|
487
|
-
If you currently use `npx @axlsdk/studio` with a config file:
|
|
488
|
-
|
|
489
|
-
1. Move runtime creation from `axl.config.ts` into your app's initialization code
|
|
490
|
-
2. Register workflows, agents, and tools on the runtime where they have access to your services
|
|
491
|
-
3. Call `createStudioMiddleware({ runtime, basePath: '/studio' })` and mount the handler
|
|
492
|
-
4. Call `upgradeWebSocket(server)` for WebSocket support
|
|
493
|
-
5. Remove the `axl-studio` CLI from your dev scripts
|
|
494
|
-
|
|
495
|
-
The `axl.config.ts` file is no longer needed. The standalone CLI continues to work for projects that don't need embedded middleware.
|
|
496
|
-
|
|
497
|
-
## Architecture
|
|
498
|
-
|
|
499
|
-
```
|
|
500
|
-
src/
|
|
501
|
-
cli.ts CLI entry — loads config, starts server
|
|
502
|
-
middleware.ts Embeddable middleware: createStudioMiddleware()
|
|
503
|
-
resolve-runtime.ts Config module interop (ESM default, CJS wrapping, named exports)
|
|
504
|
-
server/
|
|
505
|
-
index.ts createServer() — Hono app composition (basePath, readOnly, cors)
|
|
506
|
-
types.ts API types, WebSocket message types
|
|
507
|
-
aggregates/
|
|
508
|
-
aggregate-snapshots.ts AggregateSnapshots<State> helper (per-window state, fold, replace, broadcastTransform)
|
|
509
|
-
trace-aggregator.ts TraceAggregator<State> — AxlEvent consumer (costs, trace-stats)
|
|
510
|
-
execution-aggregator.ts ExecutionAggregator<State> — ExecutionInfo consumer (workflow-stats)
|
|
511
|
-
eval-aggregator.ts EvalAggregator<State> — EvalHistoryEntry consumer (eval-trends)
|
|
512
|
-
reducers.ts Pure reducers: reduceCost, reduceWorkflowStats, reduceTraceStats, reduceEvalTrends + enrichWorkflowStats
|
|
513
|
-
middleware/
|
|
514
|
-
error-handler.ts Axl errors → JSON error envelope
|
|
515
|
-
routes/ One file per resource (health, workflows, agents, tools, costs, eval-trends, workflow-stats, trace-stats, evals, etc.)
|
|
516
|
-
ws/
|
|
517
|
-
handler.ts WebSocket message routing (Hono adapter)
|
|
518
|
-
connection-manager.ts Channel subscriptions + broadcast (BroadcastTarget) + replay buffer for execution channels
|
|
519
|
-
protocol.ts Shared WS protocol: handleWsMessage(), channel validation
|
|
520
|
-
client/
|
|
521
|
-
App.tsx React SPA — sidebar + 8 panel routes
|
|
522
|
-
lib/
|
|
523
|
-
api.ts Typed fetch wrappers (reads window.__AXL_STUDIO_BASE__)
|
|
524
|
-
ws.ts WebSocket client with channel subscriptions (reads base path)
|
|
525
|
-
panels/ One directory per panel
|
|
526
|
-
```
|
|
527
|
-
|
|
528
|
-
**Server:** Hono HTTP server wrapping the user's `AxlRuntime`. REST endpoints for CRUD, WebSocket for live streaming. Supports standalone CLI and embeddable middleware modes.
|
|
529
|
-
|
|
530
|
-
**Client:** React 19 SPA with Tailwind CSS v4, TanStack Query, and react-router-dom. Pre-built at publish time and served as static assets. Reads `window.__AXL_STUDIO_BASE__` for runtime base path configuration.
|
|
531
|
-
|
|
532
|
-
**CLI:** Auto-detects and loads the user's config. TypeScript files activate tsx's loader hooks process-wide (registered once per process via both `tsx/esm/api`'s and `tsx/cjs/api`'s `register()`), so chained `import()` AND transitive `require('./x.ts')` calls from CJS workspace deps are transformed. Validates the runtime, starts the server, and optionally opens the browser.
|
|
533
|
-
|
|
534
|
-
**Middleware:** `createStudioMiddleware()` wraps the Hono app as a Node.js `(req, res)` handler via `@hono/node-server`. Adds `verifyUpgrade` for WS auth, `readOnly` mode, and `basePath` injection into the SPA.
|
|
200
|
+
See the [redaction section in the API reference](../../docs/studio-api.md#observability-boundary-redaction) and the [scrubbed/preserved field table](../../docs/observability.md#pii-and-redaction).
|
|
535
201
|
|
|
536
202
|
## Development
|
|
537
203
|
|
|
@@ -542,7 +208,7 @@ pnpm install
|
|
|
542
208
|
# Build everything (client then server)
|
|
543
209
|
pnpm --filter @axlsdk/studio build
|
|
544
210
|
|
|
545
|
-
# Dev mode (Vite HMR + server watch)
|
|
211
|
+
# Dev mode (Vite HMR + server watch, seeded with dev-fixtures)
|
|
546
212
|
pnpm --filter @axlsdk/studio dev
|
|
547
213
|
|
|
548
214
|
# Type check
|
|
@@ -1799,8 +1799,13 @@ function createEvalRoutes(connMgr, evalLoader) {
|
|
|
1799
1799
|
const first = results[0];
|
|
1800
1800
|
const partial2 = results.length < runs;
|
|
1801
1801
|
const failureMsg = runFailure ? redactErrorMessage(runFailure, redactOn) || String(runFailure) || void 0 : void 0;
|
|
1802
|
+
const aggDegraded = unionDegradedScorers(results);
|
|
1802
1803
|
const result = {
|
|
1803
1804
|
...first,
|
|
1805
|
+
summary: {
|
|
1806
|
+
...first.summary,
|
|
1807
|
+
...aggDegraded.length > 0 ? { degraded: aggDegraded } : {}
|
|
1808
|
+
},
|
|
1804
1809
|
_multiRun: {
|
|
1805
1810
|
aggregate,
|
|
1806
1811
|
allRuns: results,
|
|
@@ -2058,6 +2063,23 @@ function createEvalRoutes(connMgr, evalLoader) {
|
|
|
2058
2063
|
}
|
|
2059
2064
|
return { app: app5, closeActiveRuns };
|
|
2060
2065
|
}
|
|
2066
|
+
/**
 * Merge the per-run `summary.degraded` lists of a multi-run eval into a
 * single list holding one entry per scorer.
 *
 * For each scorer the entry with the highest `rate` across all runs is kept
 * (ties keep the earliest one seen), and `runsAffected` counts how many
 * degraded entries were seen for that scorer.
 *
 * @param {Array<{summary?: {degraded?: Array<{scorer: unknown, rate?: number}>}}>} results
 *   Per-run results; runs without an array `summary.degraded` are ignored.
 * @returns {Array<object>} One merged entry per scorer, in first-seen order.
 */
function unionDegradedScorers(results) {
  const byScorer = /* @__PURE__ */ new Map();
  // A missing or NaN rate ranks below every real rate, so a malformed
  // first-seen entry can never shadow a later, well-formed one.
  const rankOf = (entry) =>
    typeof entry.rate === "number" && !Number.isNaN(entry.rate) ? entry.rate : -Infinity;
  for (const run of results) {
    const degraded = run?.summary?.degraded;
    if (!Array.isArray(degraded)) continue;
    for (const d of degraded) {
      // Skip malformed entries instead of throwing mid-aggregation.
      if (d === null || typeof d !== "object") continue;
      const existing = byScorer.get(d.scorer);
      if (!existing) {
        byScorer.set(d.scorer, { ...d, runsAffected: 1 });
        continue;
      }
      const worse = rankOf(d) > rankOf(existing) ? d : existing;
      byScorer.set(d.scorer, { ...worse, runsAffected: existing.runsAffected + 1 });
    }
  }
  return [...byScorer.values()];
}
|
|
2061
2083
|
|
|
2062
2084
|
// src/server/routes/playground.ts
|
|
2063
2085
|
import { Hono as Hono11 } from "hono";
|
|
@@ -16155,4 +16177,4 @@ export {
|
|
|
16155
16177
|
EvalAggregator,
|
|
16156
16178
|
createServer
|
|
16157
16179
|
};
|
|
16158
|
-
//# sourceMappingURL=chunk-WUCCIBQ6.js.map
|
|
16180
|
+
//# sourceMappingURL=chunk-CY4BTXRZ.js.map
|