@alexkroman1/aai 1.4.5 → 1.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/.turbo/turbo-build.log +10 -10
  2. package/CHANGELOG.md +19 -0
  3. package/dist/{_internal-types-3p3OJZPb.js → _internal-types-DFL07G3f.js} +2 -0
  4. package/dist/assemblyai-C969QGi4.js +35 -0
  5. package/dist/cartesia-BfQPOQ7Y.js +37 -0
  6. package/dist/host/_pipeline-test-fakes.d.ts +3 -1
  7. package/dist/host/providers/stt/deepgram.d.ts +28 -0
  8. package/dist/host/providers/tts/cartesia.d.ts +1 -1
  9. package/dist/host/providers/tts/rime.d.ts +44 -0
  10. package/dist/host/runtime-barrel.d.ts +4 -2
  11. package/dist/host/runtime-barrel.js +1434 -1209
  12. package/dist/host/runtime.d.ts +2 -2
  13. package/dist/host/s2s.d.ts +16 -16
  14. package/dist/host/session-core.d.ts +37 -0
  15. package/dist/host/transports/pipeline-transport.d.ts +48 -0
  16. package/dist/host/transports/s2s-transport.d.ts +19 -0
  17. package/dist/host/transports/types.d.ts +45 -0
  18. package/dist/host/ws-handler.d.ts +14 -10
  19. package/dist/sdk/_internal-types.d.ts +2 -0
  20. package/dist/sdk/manifest-barrel.js +1 -1
  21. package/dist/sdk/protocol.d.ts +6 -5
  22. package/dist/sdk/providers/llm-barrel.js +1 -1
  23. package/dist/sdk/providers/stt/deepgram.d.ts +35 -0
  24. package/dist/sdk/providers/stt-barrel.d.ts +1 -0
  25. package/dist/sdk/providers/stt-barrel.js +2 -2
  26. package/dist/sdk/providers/tts/cartesia.d.ts +12 -4
  27. package/dist/sdk/providers/tts/rime.d.ts +42 -0
  28. package/dist/sdk/providers/tts-barrel.d.ts +1 -0
  29. package/dist/sdk/providers/tts-barrel.js +2 -2
  30. package/host/_pipeline-test-fakes.ts +6 -3
  31. package/host/_test-utils.ts +209 -128
  32. package/host/builtin-tools.ts +1 -0
  33. package/host/cleanup.test.ts +25 -298
  34. package/host/integration/pipeline-reference.integration.test.ts +30 -35
  35. package/host/providers/resolve.ts +10 -2
  36. package/host/providers/stt/deepgram.test.ts +229 -0
  37. package/host/providers/stt/deepgram.ts +172 -0
  38. package/host/providers/tts/cartesia.ts +7 -3
  39. package/host/providers/tts/rime.test.ts +251 -0
  40. package/host/providers/tts/rime.ts +322 -0
  41. package/host/runtime-barrel.ts +4 -2
  42. package/host/runtime.test.ts +16 -47
  43. package/host/runtime.ts +131 -23
  44. package/host/s2s.test.ts +122 -131
  45. package/host/s2s.ts +44 -52
  46. package/host/session-core.test.ts +257 -0
  47. package/host/session-core.ts +262 -0
  48. package/host/to-vercel-tools.test.ts +9 -1
  49. package/host/transports/pipeline-transport.test.ts +653 -0
  50. package/host/transports/pipeline-transport.ts +532 -0
  51. package/host/{fixture-replay.test.ts → transports/s2s-transport-fixtures.test.ts} +76 -106
  52. package/host/transports/s2s-transport.test.ts +56 -0
  53. package/host/transports/s2s-transport.ts +116 -0
  54. package/host/transports/types.test.ts +22 -0
  55. package/host/transports/types.ts +51 -0
  56. package/host/ws-handler.test.ts +324 -242
  57. package/host/ws-handler.ts +56 -59
  58. package/package.json +2 -1
  59. package/sdk/__snapshots__/exports.test.ts.snap +3 -3
  60. package/sdk/__snapshots__/schema-shapes.test.ts.snap +1 -0
  61. package/sdk/_internal-types.ts +3 -0
  62. package/sdk/protocol-compat.test.ts +8 -0
  63. package/sdk/protocol.ts +6 -5
  64. package/sdk/providers/stt/deepgram.ts +43 -0
  65. package/sdk/providers/stt-barrel.ts +2 -0
  66. package/sdk/providers/tts/cartesia.ts +15 -5
  67. package/sdk/providers/tts/rime.ts +52 -0
  68. package/sdk/providers/tts-barrel.ts +2 -0
  69. package/sdk/schema-alignment.test.ts +18 -6
  70. package/dist/assemblyai-Cxg9eobY.js +0 -18
  71. package/dist/cartesia-DwDk2tEu.js +0 -10
  72. package/dist/host/pipeline-session-ctx.d.ts +0 -24
  73. package/dist/host/pipeline-session.d.ts +0 -52
  74. package/dist/host/session-ctx.d.ts +0 -73
  75. package/dist/host/session.d.ts +0 -62
  76. package/host/pipeline-session-ctx.test.ts +0 -31
  77. package/host/pipeline-session-ctx.ts +0 -36
  78. package/host/pipeline-session.test.ts +0 -672
  79. package/host/pipeline-session.ts +0 -533
  80. package/host/s2s-fixtures.test.ts +0 -237
  81. package/host/session-ctx.test.ts +0 -387
  82. package/host/session-ctx.ts +0 -134
  83. package/host/session-fixture-replay.test.ts +0 -128
  84. package/host/session.test.ts +0 -634
  85. package/host/session.ts +0 -412
  86. /package/dist/{anthropic-BrUCPKUc.js → anthropic-CcLZygAr.js} +0 -0
@@ -1,23 +1,24 @@
1
1
  import { r as DEFAULT_SYSTEM_PROMPT } from "../types-KUgezM6u.js";
2
- import { _ as TOOL_EXECUTION_TIMEOUT_MS, a as DEFAULT_SHUTDOWN_TIMEOUT_MS, c as FETCH_TIMEOUT_MS, d as MAX_PAGE_CHARS, f as MAX_TOOL_RESULT_CHARS, g as RUN_CODE_TIMEOUT_MS, h as PIPELINE_FLUSH_TIMEOUT_MS, l as MAX_HTML_BYTES, m as MAX_WS_PAYLOAD_BYTES, o as DEFAULT_STT_SAMPLE_RATE, p as MAX_VALUE_SIZE, s as DEFAULT_TTS_SAMPLE_RATE, t as AGENT_CSP } from "../constants-C2nirZUI.js";
2
+ import { _ as TOOL_EXECUTION_TIMEOUT_MS, a as DEFAULT_SHUTDOWN_TIMEOUT_MS, c as FETCH_TIMEOUT_MS, d as MAX_PAGE_CHARS, g as RUN_CODE_TIMEOUT_MS, h as PIPELINE_FLUSH_TIMEOUT_MS, l as MAX_HTML_BYTES, m as MAX_WS_PAYLOAD_BYTES, o as DEFAULT_STT_SAMPLE_RATE, p as MAX_VALUE_SIZE, s as DEFAULT_TTS_SAMPLE_RATE, t as AGENT_CSP } from "../constants-C2nirZUI.js";
3
3
  import { i as toolError, n as errorDetail, r as errorMessage, t as parseWsUpgradeParams } from "../ws-upgrade-BeOQ7fXL.js";
4
4
  import { ClientMessageSchema, buildReadyConfig, lenientParse } from "../sdk/protocol.js";
5
- import { a as toAgentConfig, c as makeSttError, i as agentToolsToSchemas, l as makeTtsError, n as EMPTY_PARAMS, s as assertProviderTriple } from "../_internal-types-3p3OJZPb.js";
6
- import { t as ANTHROPIC_KIND } from "../anthropic-BrUCPKUc.js";
7
- import { t as ASSEMBLYAI_KIND } from "../assemblyai-Cxg9eobY.js";
8
- import { t as CARTESIA_KIND } from "../cartesia-DwDk2tEu.js";
5
+ import { a as toAgentConfig, c as makeSttError, i as agentToolsToSchemas, l as makeTtsError, n as EMPTY_PARAMS, s as assertProviderTriple } from "../_internal-types-DFL07G3f.js";
6
+ import { r as DEEPGRAM_KIND, t as ASSEMBLYAI_KIND } from "../assemblyai-C969QGi4.js";
7
+ import { a as RIME_KIND, n as CARTESIA_KIND } from "../cartesia-BfQPOQ7Y.js";
8
+ import { t as ANTHROPIC_KIND } from "../anthropic-CcLZygAr.js";
9
9
  import { z } from "zod";
10
10
  import { convert } from "html-to-text";
11
11
  import vm from "node:vm";
12
12
  import pTimeout from "p-timeout";
13
13
  import { createStorage, prefixStorage } from "unstorage";
14
- import { jsonSchema, stepCountIs, streamText, tool } from "ai";
15
14
  import { createAnthropic } from "@ai-sdk/anthropic";
16
15
  import { AssemblyAI } from "assemblyai";
17
16
  import { createNanoEvents } from "nanoevents";
17
+ import { DeepgramClient } from "@deepgram/sdk";
18
18
  import { randomUUID } from "node:crypto";
19
19
  import { Cartesia } from "@cartesia/cartesia-js";
20
20
  import WsWebSocket, { WebSocketServer } from "ws";
21
+ import { jsonSchema, stepCountIs, streamText, tool } from "ai";
21
22
  import fs from "node:fs";
22
23
  import http from "node:http";
23
24
  import path from "node:path";
@@ -333,6 +334,7 @@ function resolveAllBuiltins(names, opts) {
333
334
  for (const name of names) for (const [toolName, def] of resolveBuiltin(name, opts)) {
334
335
  defs[toolName] = def;
335
336
  schemas.push({
337
+ type: "function",
336
338
  name: toolName,
337
339
  description: def.description,
338
340
  parameters: z.toJSONSchema(def.parameters ?? EMPTY_PARAMS)
@@ -378,712 +380,236 @@ function buildSystemPrompt(config, opts) {
378
380
  return DEFAULT_SYSTEM_PROMPT + `\n\nToday's date is ${getFormattedDate()}.` + agentInstructions + toolPreamble + guidance + (opts.voice ? VOICE_RULES : "");
379
381
  }
380
382
  //#endregion
381
- //#region host/session-ctx.ts
382
- function _buildBaseCtx(opts) {
383
- const { agentConfig, log } = opts;
384
- const maxHistory = opts.maxHistory ?? 200;
385
- const ctx = {
386
- ...opts,
387
- reply: {
388
- pendingTools: [],
389
- toolCallCount: 0,
390
- currentReplyId: null
391
- },
392
- turnPromise: null,
393
- conversationMessages: [],
394
- maxHistory,
395
- consumeToolCallStep(_name, replyId) {
396
- if (replyId === null || replyId !== ctx.reply.currentReplyId) return toolError("Reply was interrupted. Discarding stale tool call.");
397
- const maxSteps = agentConfig.maxSteps;
398
- ctx.reply.toolCallCount++;
399
- if (maxSteps !== void 0 && ctx.reply.toolCallCount > maxSteps) {
400
- log.info("maxSteps exceeded, refusing tool call", {
401
- toolCallCount: ctx.reply.toolCallCount,
402
- maxSteps
403
- });
404
- return toolError("Maximum tool steps reached. Please respond to the user now.");
383
+ //#region host/providers/stt/assemblyai.ts
384
+ /**
385
+ * AssemblyAI Universal-Streaming STT opener (host-only).
386
+ *
387
+ * The user-facing descriptor factory (`assemblyAI(...)`) lives in
388
+ * `sdk/providers/stt/assemblyai.ts`. This module is the host-side
389
+ * counterpart: it takes the descriptor options + an API key and
390
+ * returns an {@link SttOpener} that the pipeline session drives.
391
+ *
392
+ * Default model: `"u3pro-rt"` (Universal-3 Pro Real-Time). The adapter
393
+ * maps that to the SDK's `"u3-rt-pro"` `speechModel` value; any other
394
+ * string is forwarded verbatim.
395
+ */
396
+ /** Translate the descriptor's model alias to the SDK's `speechModel` value. */
397
+ function resolveSpeechModel(model) {
398
+ if (model === "u3pro-rt") return "u3-rt-pro";
399
+ return model;
400
+ }
401
+ /** Build an {@link SttOpener} from resolved AssemblyAI descriptor options. */
402
+ function openAssemblyAI(opts = {}) {
403
+ return {
404
+ name: "assemblyai",
405
+ async open(openOpts) {
406
+ const apiKey = openOpts.apiKey || process.env.ASSEMBLYAI_API_KEY;
407
+ if (!apiKey) throw makeSttError("stt_auth_failed", "AssemblyAI STT: missing API key. Set ASSEMBLYAI_API_KEY in the agent env.");
408
+ const client = new AssemblyAI({ apiKey });
409
+ const speechModel = resolveSpeechModel(opts.model ?? "u3pro-rt");
410
+ const transcriber = client.streaming.transcriber({
411
+ sampleRate: openOpts.sampleRate,
412
+ speechModel,
413
+ ...openOpts.sttPrompt ? { prompt: openOpts.sttPrompt } : {}
414
+ });
415
+ const emitter = createNanoEvents();
416
+ let closed = false;
417
+ transcriber.on("turn", (event) => {
418
+ if (closed) return;
419
+ const text = event.transcript ?? "";
420
+ if (event.end_of_turn) {
421
+ if (text.length > 0) emitter.emit("final", text);
422
+ } else if (text.length > 0) emitter.emit("partial", text);
423
+ });
424
+ transcriber.on("error", (err) => {
425
+ if (closed) return;
426
+ emitter.emit("error", makeSttError("stt_stream_error", err?.message ?? String(err)));
427
+ });
428
+ transcriber.on("close", (code) => {
429
+ if (closed) return;
430
+ if (code !== 1e3) emitter.emit("error", makeSttError("stt_stream_error", `socket closed ${code}`));
431
+ });
432
+ try {
433
+ await transcriber.connect();
434
+ } catch (cause) {
435
+ throw makeSttError("stt_connect_failed", `AssemblyAI STT: connect failed: ${cause instanceof Error ? cause.message : String(cause)}`);
405
436
  }
406
- return null;
407
- },
408
- pushMessages(...msgs) {
409
- ctx.conversationMessages.push(...msgs);
410
- if (maxHistory > 0 && ctx.conversationMessages.length > maxHistory) ctx.conversationMessages.splice(0, ctx.conversationMessages.length - maxHistory);
411
- },
412
- beginReply(replyId) {
413
- ctx.reply = {
414
- pendingTools: [],
415
- toolCallCount: 0,
416
- currentReplyId: replyId
437
+ const close = async () => {
438
+ if (closed) return;
439
+ closed = true;
440
+ try {
441
+ await transcriber.close();
442
+ } catch {}
417
443
  };
418
- ctx.turnPromise = null;
419
- },
420
- cancelReply() {
421
- ctx.reply = {
422
- pendingTools: [],
423
- toolCallCount: 0,
424
- currentReplyId: null
444
+ if (openOpts.signal.aborted) close();
445
+ else openOpts.signal.addEventListener("abort", () => void close(), { once: true });
446
+ return {
447
+ sendAudio(pcm) {
448
+ if (closed) return;
449
+ const copy = new Uint8Array(pcm.byteLength);
450
+ copy.set(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
451
+ transcriber.sendAudio(copy.buffer);
452
+ },
453
+ on(event, fn) {
454
+ return emitter.on(event, fn);
455
+ },
456
+ close,
457
+ _transcriber: transcriber
425
458
  };
426
- },
427
- chainTurn(p) {
428
- ctx.turnPromise = (ctx.turnPromise ?? Promise.resolve()).then(() => p);
429
459
  }
430
460
  };
431
- return ctx;
432
- }
433
- function buildCtx(opts) {
434
- const base = _buildBaseCtx(opts);
435
- base.s2s = null;
436
- return base;
437
- }
438
- //#endregion
439
- //#region host/pipeline-session-ctx.ts
440
- function buildPipelineCtx(opts) {
441
- const base = _buildBaseCtx(opts);
442
- base.stt = null;
443
- base.tts = null;
444
- return base;
445
461
  }
446
462
  //#endregion
447
- //#region host/runtime-config.ts
463
+ //#region host/providers/stt/deepgram.ts
448
464
  /**
449
- * Runtime dependencies injected into the session pipeline.
465
+ * Deepgram Nova streaming STT opener (host-only).
450
466
  *
451
- * Defines the {@link Logger} interface, a default {@link consoleLogger},
452
- * and the {@link S2SConfig} for Speech-to-Speech endpoint configuration.
467
+ * The user-facing descriptor factory (`deepgram(...)`) lives in
468
+ * `sdk/providers/stt/deepgram.ts`. This module is the host-side
469
+ * counterpart: it takes the descriptor options + an API key and
470
+ * returns an {@link SttOpener} that the pipeline session drives.
471
+ *
472
+ * Default model: `"nova-3"`. Any string is forwarded verbatim to the SDK.
473
+ *
474
+ * This adapter targets the Deepgram SDK v5 (`@deepgram/sdk@^5`). The v5
475
+ * streaming API is:
476
+ * `client.listen.v1.connect(args)` → `Promise<V1Socket>`
477
+ * followed by:
478
+ * `socket.connect()` + `socket.waitForOpen()` to establish the connection.
453
479
  */
454
- function consoleLog(fn) {
455
- return (msg, ctx) => ctx ? fn(msg, ctx) : fn(msg);
456
- }
457
- /** Default console-backed logger. */
458
- const consoleLogger = {
459
- info: consoleLog(console.log),
460
- warn: consoleLog(console.warn),
461
- error: consoleLog(console.error),
462
- debug: consoleLog(console.debug)
463
- };
464
480
  /**
465
- * Structured JSON logger for production diagnostics. Each log entry is a
466
- * single-line JSON object with `timestamp`, `level`, `msg`, and any
467
- * caller-provided context fields.
481
+ * Handle an incoming Deepgram transcript message, emitting `partial` or
482
+ * `final` events on the emitter. Empty transcripts are silently dropped.
468
483
  */
469
- function jsonLog(level) {
470
- return (msg, ctx) => {
471
- const entry = {
472
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
473
- level,
474
- msg
475
- };
476
- if (ctx) Object.assign(entry, ctx);
477
- (level === "error" || level === "warn" ? process.stderr : process.stdout).write(`${JSON.stringify(entry)}\n`);
484
+ function handleMessage(data, closed, emitter) {
485
+ if (closed) return;
486
+ if (data.type !== "Results") return;
487
+ const result = data;
488
+ const text = result.channel?.alternatives?.[0]?.transcript ?? "";
489
+ if (result.is_final) {
490
+ if (text.length > 0) emitter.emit("final", text);
491
+ } else if (text.length > 0) emitter.emit("partial", text);
492
+ }
493
+ /** Wire Deepgram socket events onto the nanoevents emitter. */
494
+ function wireSocketEvents(connection, emitter, getIsClosed) {
495
+ connection.on("message", (data) => handleMessage(data, getIsClosed(), emitter));
496
+ connection.on("error", (err) => {
497
+ if (getIsClosed()) return;
498
+ emitter.emit("error", makeSttError("stt_stream_error", err?.message ?? String(err)));
499
+ });
500
+ connection.on("close", (event) => {
501
+ if (getIsClosed()) return;
502
+ const code = event?.code;
503
+ if (code !== void 0 && code !== 1e3) emitter.emit("error", makeSttError("stt_stream_error", `socket closed ${code}`));
504
+ });
505
+ }
506
+ /** Wire the AbortSignal to the close function. */
507
+ function wireAbortSignal(signal, close) {
508
+ if (signal.aborted) close();
509
+ else signal.addEventListener("abort", () => void close(), { once: true });
510
+ }
511
+ /** Build an {@link SttOpener} from resolved Deepgram descriptor options. */
512
+ function openDeepgram(opts = {}) {
513
+ return {
514
+ name: "deepgram",
515
+ async open(openOpts) {
516
+ const apiKey = openOpts.apiKey || process.env.DEEPGRAM_API_KEY;
517
+ if (!apiKey) throw makeSttError("stt_auth_failed", "Deepgram STT: missing API key. Set DEEPGRAM_API_KEY in the agent env.");
518
+ const client = new DeepgramClient({ apiKey });
519
+ let connection;
520
+ try {
521
+ connection = await client.listen.v1.connect({
522
+ model: opts.model ?? "nova-3",
523
+ language: opts.language ?? "en",
524
+ encoding: "linear16",
525
+ sample_rate: openOpts.sampleRate,
526
+ channels: 1,
527
+ interim_results: "true",
528
+ smart_format: "true",
529
+ endpointing: 300,
530
+ utterance_end_ms: "1000",
531
+ Authorization: apiKey
532
+ });
533
+ } catch (cause) {
534
+ throw makeSttError("stt_connect_failed", `Deepgram STT: connect failed: ${cause instanceof Error ? cause.message : String(cause)}`);
535
+ }
536
+ const emitter = createNanoEvents();
537
+ let closed = false;
538
+ wireSocketEvents(connection, emitter, () => closed);
539
+ connection.connect();
540
+ try {
541
+ await connection.waitForOpen();
542
+ } catch (cause) {
543
+ throw makeSttError("stt_connect_failed", `Deepgram STT: WebSocket open failed: ${cause instanceof Error ? cause.message : String(cause)}`);
544
+ }
545
+ const close = async () => {
546
+ if (closed) return;
547
+ closed = true;
548
+ try {
549
+ connection.close();
550
+ } catch {}
551
+ };
552
+ wireAbortSignal(openOpts.signal, close);
553
+ return {
554
+ sendAudio(pcm) {
555
+ if (closed) return;
556
+ connection.sendMedia(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
557
+ },
558
+ on(event, fn) {
559
+ return emitter.on(event, fn);
560
+ },
561
+ close,
562
+ _connection: connection
563
+ };
564
+ }
478
565
  };
479
566
  }
480
- const jsonLogger = {
481
- info: jsonLog("info"),
482
- warn: jsonLog("warn"),
483
- error: jsonLog("error"),
484
- debug: jsonLog("debug")
485
- };
486
- /** Default S2S endpoint configuration. */
487
- const DEFAULT_S2S_CONFIG = {
488
- wssUrl: "wss://agents.assemblyai.com/v1/voice",
489
- inputSampleRate: DEFAULT_STT_SAMPLE_RATE,
490
- outputSampleRate: DEFAULT_TTS_SAMPLE_RATE
491
- };
492
567
  //#endregion
493
- //#region host/to-vercel-tools.ts
568
+ //#region host/providers/tts/cartesia.ts
494
569
  /**
495
- * Converts agent {@link ToolSchema}[] to Vercel AI SDK tools with `execute`
496
- * delegation to the agent's {@link ExecuteTool} function.
570
+ * Cartesia TTS opener (host-only).
497
571
  *
498
- * The pipeline orchestrator passes the output to `streamText({ tools })`.
499
- * Each produced tool's `execute` closure calls
500
- * `ctx.executeTool(name, args, sessionId, messages(), { signal, toolCallId })`,
501
- * so the existing agent tool infrastructure (argument validation, KV, hooks,
502
- * timeout) remains the single source of truth for tool behavior.
572
+ * The user-facing descriptor factory (`cartesia(...)`) lives in
573
+ * `sdk/providers/tts/cartesia.ts`. This module is the host-side
574
+ * counterpart: it takes the descriptor options + an API key and
575
+ * returns a {@link TtsOpener} that the pipeline session drives.
503
576
  *
504
- * Per-call `options.abortSignal` (forwarded by `streamText` when the
505
- * outer turn is aborted, e.g. barge-in) takes precedence over the
506
- * bag-level `ctx.signal` so individual invocations respect streamText
507
- * aborts.
508
- */
509
- /**
510
- * Convert an array of {@link ToolSchema} to a Vercel AI SDK `ToolSet`
511
- * (record keyed by tool name).
577
+ * Wraps `@cartesia/cartesia-js`'s `TTSWS` / `TTSWSContext` and normalizes it
578
+ * onto the {@link TtsEvents} contract consumed by the pipeline orchestrator.
512
579
  *
513
- * Uses the v6 `tool()` helper with `inputSchema: jsonSchema(...)` wrapping
514
- * the agent's JSON Schema `parameters`. Execution is delegated to
515
- * `ctx.executeTool` so validation, KV, timeouts, and hooks keep working.
580
+ * **Per-turn context lifecycle.** Each `sendText(...)` within the same turn
581
+ * appends to the same Cartesia context. On `flush()` or `cancel()`, a new
582
+ * context is minted for the next turn so concurrent `cancel({ contextId })`
583
+ * only targets the in-flight turn, never the one that follows.
584
+ *
585
+ * **Audio format.** The adapter requests `raw` / `pcm_s16le` at the
586
+ * negotiated `sampleRate` so it can forward chunks as `Int16Array` with no
587
+ * conversion.
516
588
  */
517
- function toVercelTools(schemas, ctx) {
518
- const out = {};
519
- for (const schema of schemas) out[schema.name] = tool({
520
- description: schema.description,
521
- inputSchema: jsonSchema(schema.parameters),
522
- execute: async (args, options) => {
523
- const input = args ?? {};
524
- const signal = options.abortSignal ?? ctx.signal;
525
- const opts = {};
526
- if (signal !== void 0) opts.signal = signal;
527
- if (options.toolCallId !== void 0) opts.toolCallId = options.toolCallId;
528
- return ctx.executeTool(schema.name, input, ctx.sessionId, ctx.messages().slice(), opts);
529
- }
530
- });
531
- return out;
589
+ /** PCM16 sample rates supported by Cartesia's `raw` output format. */
590
+ const CARTESIA_PCM16_RATES = [
591
+ 8e3,
592
+ 16e3,
593
+ 22050,
594
+ 24e3,
595
+ 44100,
596
+ 48e3
597
+ ];
598
+ function assertSupportedSampleRate$1(rate) {
599
+ if (CARTESIA_PCM16_RATES.includes(rate)) return rate;
600
+ throw makeTtsError("tts_connect_failed", `Cartesia TTS: unsupported sample rate ${rate}. Supported: ${CARTESIA_PCM16_RATES.join(", ")}.`);
532
601
  }
533
- //#endregion
534
- //#region host/pipeline-session.ts
535
- function toModelMessage(m) {
536
- if (m.role === "user") return {
537
- role: "user",
538
- content: m.content
539
- };
540
- if (m.role === "assistant") return {
541
- role: "assistant",
542
- content: m.content
543
- };
544
- return {
545
- role: "assistant",
546
- content: m.content
547
- };
548
- }
549
- function emitError(client, code, message) {
550
- client.event({
551
- type: "error",
552
- code,
553
- message
554
- });
555
- }
556
- function handleStreamPart(part, deps) {
557
- switch (part.type) {
558
- case "text-delta": {
559
- const delta = part.text ?? "";
560
- if (delta.length === 0) return;
561
- deps.onTextDelta(delta);
562
- deps.tts?.sendText(delta);
563
- return;
564
- }
565
- case "tool-call": {
566
- const input = part.input ?? {};
567
- deps.client.event({
568
- type: "tool_call",
569
- toolCallId: part.toolCallId ?? "",
570
- toolName: part.toolName ?? "",
571
- args: input
572
- });
573
- return;
574
- }
575
- case "tool-result": {
576
- const output = part.output;
577
- const resultString = typeof output === "string" ? output : JSON.stringify(output);
578
- deps.client.event({
579
- type: "tool_call_done",
580
- toolCallId: part.toolCallId ?? "",
581
- result: resultString
582
- });
583
- return;
584
- }
585
- case "error": {
586
- const msg = errorMessage(part.error);
587
- deps.log.error("LLM stream error", {
588
- message: msg,
589
- sessionId: deps.sessionId
590
- });
591
- emitError(deps.client, "llm", msg);
592
- return;
593
- }
594
- default: return;
595
- }
596
- }
597
- /** Create a pluggable-provider voice session. */
598
- function createPipelineSession(opts) {
599
- const log = opts.logger ?? consoleLogger;
600
- const sttSampleRate = opts.sttSampleRate ?? 16e3;
601
- const ttsSampleRate = opts.ttsSampleRate ?? 24e3;
602
- const { client, agentConfig, toolSchemas, executeTool } = opts;
603
- const systemPrompt = buildSystemPrompt(agentConfig, {
604
- hasTools: toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0,
605
- voice: true,
606
- toolGuidance: opts.toolGuidance
607
- });
608
- const ctx = buildPipelineCtx({
609
- id: opts.id,
610
- agent: opts.agent,
611
- client,
612
- agentConfig,
613
- executeTool,
614
- log,
615
- maxHistory: opts.maxHistory
616
- });
617
- const sessionAbort = new AbortController();
618
- let audioReady = false;
619
- let terminated = false;
620
- let turnController = null;
621
- let nextReplyId = 0;
622
- const sttSubs = [];
623
- const ttsSubs = [];
624
- /**
625
- * Tear down the session after an unrecoverable provider error. Aborts the
626
- * in-flight turn, cancels TTS, signals providers to close via sessionAbort,
627
- * and flips `terminated` so future STT events and audio frames become
628
- * no-ops. Idempotent.
629
- */
630
- function terminate() {
631
- if (terminated) return;
632
- terminated = true;
633
- if (turnController !== null) {
634
- turnController.abort();
635
- turnController = null;
636
- }
637
- ctx.tts?.cancel();
638
- ctx.cancelReply();
639
- sessionAbort.abort();
640
- }
641
- function onSttPartial(_text) {
642
- if (terminated) return;
643
- if (turnController === null) return;
644
- log.info("Pipeline barge-in", { sessionId: opts.id });
645
- turnController.abort();
646
- turnController = null;
647
- ctx.tts?.cancel();
648
- ctx.cancelReply();
649
- client.event({ type: "cancelled" });
650
- }
651
- function onSttFinal(text) {
652
- if (terminated) return;
653
- const trimmed = text.trim();
654
- if (trimmed.length === 0) return;
655
- if (turnController !== null) {
656
- log.info("Pipeline replacing in-flight turn", { sessionId: opts.id });
657
- turnController.abort();
658
- turnController = null;
659
- ctx.tts?.cancel();
660
- ctx.cancelReply();
661
- client.event({ type: "cancelled" });
662
- }
663
- client.event({
664
- type: "user_transcript",
665
- text
666
- });
667
- const turn = runTurn(trimmed).catch((err) => {
668
- log.error("Pipeline turn crashed", {
669
- error: errorMessage(err),
670
- sessionId: opts.id
671
- });
672
- });
673
- ctx.chainTurn(turn);
674
- }
675
- function onSttError(err) {
676
- if (terminated) return;
677
- log.error("STT error", {
678
- code: err.code,
679
- message: err.message,
680
- sessionId: opts.id
681
- });
682
- emitError(client, "stt", err.message);
683
- terminate();
684
- }
685
- function onTtsError(err) {
686
- if (terminated) return;
687
- log.error("TTS error", {
688
- code: err.code,
689
- message: err.message,
690
- sessionId: opts.id
691
- });
692
- emitError(client, "tts", err.message);
693
- terminate();
694
- }
695
- async function consumeLlmStream(ctl, messages, tools, onDelta) {
696
- const deps = {
697
- client,
698
- tts: ctx.tts,
699
- log,
700
- sessionId: opts.id,
701
- onTextDelta: onDelta
702
- };
703
- try {
704
- const maxSteps = agentConfig.maxSteps ?? 5;
705
- const result = streamText({
706
- model: opts.llm,
707
- system: systemPrompt,
708
- messages,
709
- tools,
710
- stopWhen: stepCountIs(maxSteps),
711
- abortSignal: ctl.signal
712
- });
713
- for await (const part of result.fullStream) {
714
- if (ctl.signal.aborted) break;
715
- handleStreamPart(part, deps);
716
- }
717
- } catch (err) {
718
- if (!ctl.signal.aborted) {
719
- const msg = errorMessage(err);
720
- log.error("LLM streamText failed", {
721
- error: msg,
722
- sessionId: opts.id
723
- });
724
- emitError(client, "llm", msg);
725
- }
726
- }
727
- }
728
- /**
729
- * Flush TTS and wait for drain. Resolves on any of:
730
- * - TTS emits `done`
731
- * - `signal` aborts (barge-in, provider error, session stop)
732
- * - `PIPELINE_FLUSH_TIMEOUT_MS` elapses
733
- * Resolves immediately if no TTS session.
734
- */
735
- function flushTtsAndWait(signal) {
736
- const tts = ctx.tts;
737
- if (!tts) return Promise.resolve();
738
- return new Promise((resolve) => {
739
- let off = null;
740
- let timer = null;
741
- const cleanup = () => {
742
- if (off) {
743
- off();
744
- off = null;
745
- }
746
- if (timer) {
747
- clearTimeout(timer);
748
- timer = null;
749
- }
750
- signal.removeEventListener("abort", onAbort);
751
- };
752
- const finish = () => {
753
- cleanup();
754
- resolve();
755
- };
756
- const onAbort = () => finish();
757
- if (signal.aborted) {
758
- resolve();
759
- return;
760
- }
761
- signal.addEventListener("abort", onAbort, { once: true });
762
- off = tts.on("done", finish);
763
- timer = setTimeout(() => {
764
- log.warn("TTS flush timeout", {
765
- sessionId: opts.id,
766
- timeoutMs: PIPELINE_FLUSH_TIMEOUT_MS
767
- });
768
- finish();
769
- }, PIPELINE_FLUSH_TIMEOUT_MS);
770
- tts.flush();
771
- });
772
- }
773
- async function runTurn(userText) {
774
- const replyId = `pipeline-${++nextReplyId}`;
775
- ctx.beginReply(replyId);
776
- ctx.pushMessages({
777
- role: "user",
778
- content: userText
779
- });
780
- const ctl = new AbortController();
781
- turnController = ctl;
782
- const tools = toVercelTools(toolSchemas, {
783
- executeTool,
784
- sessionId: opts.id,
785
- messages: () => ctx.conversationMessages,
786
- signal: ctl.signal
787
- });
788
- const messages = ctx.conversationMessages.map(toModelMessage);
789
- let accumulated = "";
790
- await consumeLlmStream(ctl, messages, tools, (delta) => {
791
- accumulated += delta;
792
- });
793
- if (ctl.signal.aborted) {
794
- if (turnController === ctl) turnController = null;
795
- return;
796
- }
797
- if (accumulated.length > 0) {
798
- client.event({
799
- type: "agent_transcript",
800
- text: accumulated
801
- });
802
- ctx.pushMessages({
803
- role: "assistant",
804
- content: accumulated
805
- });
806
- }
807
- await flushTtsAndWait(ctl.signal);
808
- if (ctl.signal.aborted) {
809
- if (turnController === ctl) turnController = null;
810
- return;
811
- }
812
- client.playAudioDone();
813
- client.event({ type: "reply_done" });
814
- if (turnController === ctl) turnController = null;
815
- }
816
- async function runGreeting(text) {
817
- const replyId = `pipeline-greeting-${++nextReplyId}`;
818
- ctx.beginReply(replyId);
819
- const ctl = new AbortController();
820
- turnController = ctl;
821
- client.event({
822
- type: "agent_transcript",
823
- text
824
- });
825
- ctx.pushMessages({
826
- role: "assistant",
827
- content: text
828
- });
829
- ctx.tts?.sendText(text);
830
- await flushTtsAndWait(ctl.signal);
831
- if (ctl.signal.aborted) {
832
- if (turnController === ctl) turnController = null;
833
- return;
834
- }
835
- client.playAudioDone();
836
- client.event({ type: "reply_done" });
837
- if (turnController === ctl) turnController = null;
838
- }
839
- function reportOpenRejection(which, reason) {
840
- const msg = errorMessage(reason);
841
- log.error(`${which === "stt" ? "STT" : "TTS"} open failed`, {
842
- error: msg,
843
- sessionId: opts.id
844
- });
845
- emitError(client, which, msg);
846
- }
847
- async function adoptStt(sttSession, teardown) {
848
- if (teardown) {
849
- await sttSession.close().catch(() => void 0);
850
- return;
851
- }
852
- ctx.stt = sttSession;
853
- sttSubs.push(sttSession.on("partial", onSttPartial));
854
- sttSubs.push(sttSession.on("final", onSttFinal));
855
- sttSubs.push(sttSession.on("error", onSttError));
856
- }
857
- async function adoptTts(ttsSession, teardown) {
858
- if (teardown) {
859
- await ttsSession.close().catch(() => void 0);
860
- return;
861
- }
862
- ctx.tts = ttsSession;
863
- ttsSubs.push(ttsSession.on("audio", (pcm) => {
864
- client.playAudioChunk(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
865
- }));
866
- ttsSubs.push(ttsSession.on("error", onTtsError));
867
- }
868
- async function openProviders() {
869
- const [sttResult, ttsResult] = await Promise.allSettled([opts.stt.open({
870
- sampleRate: sttSampleRate,
871
- apiKey: opts.sttApiKey,
872
- sttPrompt: agentConfig.sttPrompt,
873
- signal: sessionAbort.signal
874
- }), opts.tts.open({
875
- sampleRate: ttsSampleRate,
876
- apiKey: opts.ttsApiKey,
877
- signal: sessionAbort.signal
878
- })]);
879
- if (sttResult.status === "rejected") reportOpenRejection("stt", sttResult.reason);
880
- if (ttsResult.status === "rejected") reportOpenRejection("tts", ttsResult.reason);
881
- const aborted = sessionAbort.signal.aborted;
882
- const sttFailed = sttResult.status === "rejected";
883
- const ttsFailed = ttsResult.status === "rejected";
884
- const teardown = aborted || sttFailed || ttsFailed;
885
- if (sttResult.status === "fulfilled") await adoptStt(sttResult.value, teardown);
886
- if (ttsResult.status === "fulfilled") await adoptTts(ttsResult.value, teardown);
887
- if (!aborted && (sttFailed || ttsFailed)) terminate();
888
- }
889
- return {
890
- async start() {
891
- await openProviders();
892
- },
893
- async stop() {
894
- if (sessionAbort.signal.aborted) return;
895
- sessionAbort.abort();
896
- turnController?.abort();
897
- for (const off of sttSubs) off();
898
- for (const off of ttsSubs) off();
899
- sttSubs.length = 0;
900
- ttsSubs.length = 0;
901
- if (ctx.turnPromise !== null) await ctx.turnPromise;
902
- await ctx.stt?.close().catch(() => {});
903
- await ctx.tts?.close().catch(() => {});
904
- },
905
- onAudio(data) {
906
- if (terminated || !audioReady) return;
907
- const offset = data.byteOffset;
908
- const length = data.byteLength;
909
- let pcm;
910
- if (offset % 2 === 0 && length % 2 === 0) pcm = new Int16Array(data.buffer, offset, length / 2);
911
- else {
912
- const copy = new Uint8Array(length - length % 2);
913
- copy.set(data.subarray(0, copy.byteLength));
914
- pcm = new Int16Array(copy.buffer);
915
- }
916
- ctx.stt?.sendAudio(pcm);
917
- },
918
- onAudioReady() {
919
- if (audioReady || terminated) return;
920
- audioReady = true;
921
- if (opts.skipGreeting) return;
922
- const greeting = agentConfig.greeting;
923
- if (!greeting) return;
924
- const turn = runGreeting(greeting).catch((err) => {
925
- log.error("Pipeline greeting failed", {
926
- error: errorMessage(err),
927
- sessionId: opts.id
928
- });
929
- });
930
- ctx.chainTurn(turn);
931
- },
932
- onCancel() {
933
- if (terminated) return;
934
- turnController?.abort();
935
- turnController = null;
936
- ctx.tts?.cancel();
937
- ctx.cancelReply();
938
- client.event({ type: "cancelled" });
939
- },
940
- onReset() {
941
- if (terminated) return;
942
- turnController?.abort();
943
- turnController = null;
944
- ctx.tts?.cancel();
945
- ctx.cancelReply();
946
- ctx.conversationMessages = [];
947
- ctx.turnPromise = null;
948
- client.event({ type: "reset" });
949
- },
950
- onHistory(incoming) {
951
- if (terminated) return;
952
- ctx.pushMessages(...incoming.map((m) => ({
953
- role: m.role,
954
- content: m.content
955
- })));
956
- },
957
- waitForTurn() {
958
- return ctx.turnPromise ?? Promise.resolve();
959
- }
960
- };
961
- }
962
- //#endregion
963
- //#region host/providers/stt/assemblyai.ts
964
- /**
965
- * AssemblyAI Universal-Streaming STT opener (host-only).
966
- *
967
- * The user-facing descriptor factory (`assemblyAI(...)`) lives in
968
- * `sdk/providers/stt/assemblyai.ts`. This module is the host-side
969
- * counterpart: it takes the descriptor options + an API key and
970
- * returns an {@link SttOpener} that the pipeline session drives.
971
- *
972
- * Default model: `"u3pro-rt"` (Universal-3 Pro Real-Time). The adapter
973
- * maps that to the SDK's `"u3-rt-pro"` `speechModel` value; any other
974
- * string is forwarded verbatim.
975
- */
976
/**
 * Map the descriptor-level model alias onto the `speechModel` string handed
 * to the AssemblyAI SDK. Only `"u3pro-rt"` is rewritten (to `"u3-rt-pro"`);
 * every other value, including `undefined`, is returned unchanged.
 */
function resolveSpeechModel(model) {
  const isAlias = model === "u3pro-rt";
  return isAlias ? "u3-rt-pro" : model;
}
981
/**
 * Create the host-side {@link SttOpener} for AssemblyAI streaming STT.
 *
 * @param opts Resolved descriptor options. Only `opts.model` is read here
 *   (defaults to `"u3pro-rt"`).
 * @returns An opener named `"assemblyai"` whose `open(openOpts)` connects a
 *   streaming transcriber and resolves to a session exposing `sendAudio`,
 *   `on`, `close`, and the raw `_transcriber`.
 */
function openAssemblyAI(opts = {}) {
  async function open(openOpts) {
    const apiKey = openOpts.apiKey || process.env.ASSEMBLYAI_API_KEY;
    if (!apiKey) throw makeSttError("stt_auth_failed", "AssemblyAI STT: missing API key. Set ASSEMBLYAI_API_KEY in the agent env.");
    const client = new AssemblyAI({ apiKey });
    const transcriberParams = {
      sampleRate: openOpts.sampleRate,
      speechModel: resolveSpeechModel(opts.model ?? "u3pro-rt")
    };
    // The prompt is forwarded only when one was configured.
    if (openOpts.sttPrompt) transcriberParams.prompt = openOpts.sttPrompt;
    const transcriber = client.streaming.transcriber(transcriberParams);
    const bus = createNanoEvents();
    let isClosed = false;
    transcriber.on("turn", (event) => {
      if (isClosed) return;
      const transcript = event.transcript ?? "";
      // Empty transcripts are never surfaced, whether partial or final.
      if (transcript.length === 0) return;
      bus.emit(event.end_of_turn ? "final" : "partial", transcript);
    });
    transcriber.on("error", (err) => {
      if (isClosed) return;
      const detail = err?.message ?? String(err);
      bus.emit("error", makeSttError("stt_stream_error", detail));
    });
    transcriber.on("close", (code) => {
      // Close code 1000 is not reported as an error.
      if (isClosed || code === 1000) return;
      bus.emit("error", makeSttError("stt_stream_error", `socket closed ${code}`));
    });
    try {
      await transcriber.connect();
    } catch (cause) {
      const reason = cause instanceof Error ? cause.message : String(cause);
      throw makeSttError("stt_connect_failed", `AssemblyAI STT: connect failed: ${reason}`);
    }
    const close = async () => {
      if (isClosed) return;
      isClosed = true;
      try {
        await transcriber.close();
      } catch {
        // Best-effort shutdown: failures while closing are deliberately ignored.
      }
    };
    // Tie the session lifetime to the caller's abort signal.
    if (!openOpts.signal.aborted) openOpts.signal.addEventListener("abort", () => void close(), { once: true });
    else close();
    return {
      sendAudio(pcm) {
        if (isClosed) return;
        // Hand the SDK a private copy of exactly the bytes covered by `pcm`.
        const bytes = new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength).slice();
        transcriber.sendAudio(bytes.buffer);
      },
      on(event, fn) {
        return bus.on(event, fn);
      },
      close,
      _transcriber: transcriber
    };
  }
  return {
    name: "assemblyai",
    open
  };
}
1042
- //#endregion
1043
- //#region host/providers/tts/cartesia.ts
1044
- /**
1045
- * Cartesia TTS opener (host-only).
1046
- *
1047
- * The user-facing descriptor factory (`cartesia(...)`) lives in
1048
- * `sdk/providers/tts/cartesia.ts`. This module is the host-side
1049
- * counterpart: it takes the descriptor options + an API key and
1050
- * returns a {@link TtsOpener} that the pipeline session drives.
1051
- *
1052
- * Wraps `@cartesia/cartesia-js`'s `TTSWS` / `TTSWSContext` and normalizes it
1053
- * onto the {@link TtsEvents} contract consumed by the pipeline orchestrator.
1054
- *
1055
- * **Per-turn context lifecycle.** Each `sendText(...)` within the same turn
1056
- * appends to the same Cartesia context. On `flush()` or `cancel()`, a new
1057
- * context is minted for the next turn — so concurrent `cancel({ contextId })`
1058
- * only targets the in-flight turn, never the one that follows.
1059
- *
1060
- * **Audio format.** The adapter requests `raw` / `pcm_s16le` at the
1061
- * negotiated `sampleRate` so it can forward chunks as `Int16Array` with no
1062
- * conversion.
1063
- */
1064
/** Sample rates (Hz) accepted for Cartesia's `raw` PCM16 output. */
const CARTESIA_PCM16_RATES = [8000, 16000, 22050, 24000, 44100, 48000];
/**
 * Return `rate` unchanged when it is one of {@link CARTESIA_PCM16_RATES};
 * otherwise throw the `tts_connect_failed` error built by `makeTtsError`,
 * whose message lists the accepted rates.
 */
function assertSupportedSampleRate(rate) {
  if (!CARTESIA_PCM16_RATES.includes(rate)) {
    throw makeTtsError("tts_connect_failed", `Cartesia TTS: unsupported sample rate ${rate}. Supported: ${CARTESIA_PCM16_RATES.join(", ")}.`);
  }
  return rate;
}
1077
- /** Build a {@link TtsOpener} from resolved Cartesia descriptor options. */
1078
- function openCartesia(opts) {
602
+ /** Build a {@link TtsOpener} from resolved Cartesia descriptor options. */
603
+ function openCartesia(opts) {
1079
604
  return {
1080
605
  name: "cartesia",
1081
606
  async open(openOpts) {
1082
607
  const apiKey = openOpts.apiKey || process.env.CARTESIA_API_KEY;
1083
608
  if (!apiKey) throw makeTtsError("tts_auth_failed", "Cartesia TTS: missing API key. Set CARTESIA_API_KEY in the agent env.");
1084
- const sampleRate = assertSupportedSampleRate(openOpts.sampleRate);
609
+ const sampleRate = assertSupportedSampleRate$1(openOpts.sampleRate);
1085
610
  const model = opts.model ?? "sonic-2";
1086
611
  const language = opts.language ?? "en";
612
+ const voice = opts.voice ?? "f786b574-daa5-4673-aa0c-cbe3e8534c02";
1087
613
  const client = new Cartesia({ apiKey });
1088
614
  let ws;
1089
615
  try {
@@ -1098,7 +624,7 @@ function openCartesia(opts) {
1098
624
  model_id: model,
1099
625
  voice: {
1100
626
  mode: "id",
1101
- id: opts.voice
627
+ id: voice
1102
628
  },
1103
629
  output_format: {
1104
630
  container: "raw",
@@ -1169,7 +695,7 @@ function openCartesia(opts) {
1169
695
  model_id: model,
1170
696
  voice: {
1171
697
  mode: "id",
1172
- id: opts.voice
698
+ id: voice
1173
699
  },
1174
700
  output_format: {
1175
701
  container: "raw",
@@ -1178,38 +704,247 @@ function openCartesia(opts) {
1178
704
  },
1179
705
  language
1180
706
  };
1181
- const ignoreRejection = (_err) => {};
707
+ const ignoreRejection = (_err) => {};
708
+ return {
709
+ sendText(text) {
710
+ if (closed || text.length === 0) return;
711
+ rotateIfPending();
712
+ context.send({
713
+ ...baseRequest,
714
+ transcript: text,
715
+ continue: true
716
+ }).catch(ignoreRejection);
717
+ },
718
+ flush() {
719
+ if (closed || rotatePending) return;
720
+ context.send({
721
+ ...baseRequest,
722
+ transcript: "",
723
+ continue: false
724
+ }).catch(ignoreRejection);
725
+ rotatePending = true;
726
+ },
727
+ cancel() {
728
+ if (closed) return;
729
+ if (!doneEmitted) context.cancel().catch(ignoreRejection);
730
+ emitDoneOnce();
731
+ rotatePending = true;
732
+ },
733
+ on(event, fn) {
734
+ return emitter.on(event, fn);
735
+ },
736
+ close,
737
+ _ws: ws,
738
+ _currentContextId: () => context.contextId
739
+ };
740
+ }
741
+ };
742
+ }
743
+ //#endregion
744
+ //#region host/providers/tts/rime.ts
745
+ /**
746
+ * Rime TTS opener (host-only).
747
+ *
748
+ * The user-facing descriptor factory (`rime(...)`) lives in
749
+ * `sdk/providers/tts/rime.ts`. This module is the host-side
750
+ * counterpart: it takes the descriptor options + an API key and
751
+ * returns a {@link TtsOpener} that the pipeline session drives.
752
+ *
753
+ * **Protocol.** Connects to Rime's `ws2` JSON WebSocket endpoint
754
+ * (`wss://users-ws.rime.ai/ws2`). Client-to-server messages are JSON:
755
+ * - `{ "text": "..." }` — append text to the synthesis buffer
756
+ * - `{ "operation": "clear" }` — drop buffered text (barge-in)
757
+ * - `{ "operation": "eos" }` — drain buffer, close connection (NOT used
758
+ * during a session: it would tear down the WS, forcing reconnect per
759
+ * turn). We force end-of-turn synthesis with a trailing `"."` instead.
760
+ * The server responds with JSON frames:
761
+ * - `{ type: "chunk", data: <base64 PCM16 LE>, contextId: string | null }`
762
+ * - `{ type: "timestamps", ... }` (ignored)
763
+ * - `{ type: "error", message: string }` (surfaced as `tts_stream_error`)
764
+ *
765
+ * **Single long-lived connection per session.** Rime buffers text until it
766
+ * sees terminal punctuation (`.`, `?`, `!`), so we use one WebSocket per
767
+ * `open()` call and reuse it across turns. `clear` resets the buffer
768
+ * between cancellations.
769
+ *
770
+ * **Done detection.** After `flush()` sends a trailing `"."` to force the
771
+ * server to synthesize any half-buffered text, we arm a quiescence timer
772
+ * that fires 500 ms after the last received audio chunk. When it fires,
773
+ * `done` is emitted.
774
+ *
775
+ * **Audio format.** The URL requests `audioFormat=pcm` at the negotiated
776
+ * `sampleRate`, which returns raw PCM16 little-endian. We decode the base64
777
+ * payload and construct a zero-copy `Int16Array` view over the decoded bytes.
778
+ */
779
/** Sample rates (Hz) accepted for PCM16 output from the Rime `ws2` endpoint. */
const RIME_PCM16_RATES = [8000, 16000, 22050, 24000, 44100, 48000];
/**
 * Return `rate` unchanged when it is one of {@link RIME_PCM16_RATES};
 * otherwise throw the `tts_connect_failed` error built by `makeTtsError`,
 * whose message lists the accepted rates.
 */
function assertSupportedSampleRate(rate) {
  if (!RIME_PCM16_RATES.includes(rate)) {
    throw makeTtsError("tts_connect_failed", `Rime TTS: unsupported sample rate ${rate}. Supported: ${RIME_PCM16_RATES.join(", ")}.`);
  }
  return rate;
}
792
/**
 * Decode a base64 string from Rime into an `Int16Array` of PCM16 samples.
 *
 * Rime's `ws2` endpoint returns base64-encoded PCM16 LE in each chunk.
 * `Buffer.from(base64, "base64")` yields a Node.js Buffer (a `Uint8Array`
 * subclass) that may be carved out of Node's shared allocation pool, so its
 * `byteOffset` is NOT guaranteed to be `0`. When the offset is 2-byte aligned
 * the result is a zero-copy view over the decoded bytes; otherwise the bytes
 * are copied into a fresh buffer, because an `Int16Array` view cannot start
 * at an odd offset (the constructor would throw a `RangeError`).
 *
 * A trailing odd byte, if any, is dropped. Samples are read in host byte
 * order.
 *
 * @param data Base64-encoded PCM16 payload.
 * @returns The decoded samples; an empty array when fewer than 2 bytes decode.
 */
function base64ToPcm(data) {
  const bytes = Buffer.from(data, "base64");
  const sampleCount = Math.floor(bytes.byteLength / Int16Array.BYTES_PER_ELEMENT);
  if (sampleCount === 0) return new Int16Array(0);
  if (bytes.byteOffset % Int16Array.BYTES_PER_ELEMENT === 0) {
    return new Int16Array(bytes.buffer, bytes.byteOffset, sampleCount);
  }
  // Misaligned slice of the pool: realign by copying the sample bytes.
  const aligned = new Uint8Array(sampleCount * Int16Array.BYTES_PER_ELEMENT);
  aligned.set(bytes.subarray(0, aligned.byteLength));
  return new Int16Array(aligned.buffer);
}
806
/**
 * Quiescence window in ms: how long to wait after the most recent audio
 * chunk before `done` is emitted.
 */
const QUIESCENCE_MS = 500;
/**
 * Upper bound in ms on the wait for the FIRST audio chunk after `flush()`
 * before giving up and emitting `done`. `flush()` can run immediately after
 * `sendText()` (greetings, short replies), in which case time-to-first-byte
 * can exceed the 500 ms quiescence window; once the first chunk arrives the
 * shorter {@link QUIESCENCE_MS} timeout takes over.
 */
const FIRST_AUDIO_TIMEOUT_MS = 5000;
816
/**
 * Resolve once `ws` emits `open`, or reject with a `tts_connect_failed`
 * error (built by `makeTtsError`) on the first `error` event. Whichever
 * event fires first removes the listener registered for the other one.
 */
function waitForOpen(ws) {
  return new Promise((resolve, reject) => {
    function handleOpen() {
      ws.removeListener("error", handleError);
      resolve();
    }
    function handleError(err) {
      ws.removeListener("open", handleOpen);
      const detail = err?.message ?? String(err);
      reject(makeTtsError("tts_connect_failed", `Rime TTS: connect failed: ${detail}`));
    }
    ws.once("open", handleOpen);
    ws.once("error", handleError);
  });
}
831
/**
 * Handle one incoming WebSocket message frame from Rime.
 *
 * Kept as a top-level function so `open()` stays small; the session state it
 * needs is passed in as callbacks.
 *
 * Behavior:
 * - Frames that are not valid JSON, or whose JSON value is not an object
 *   (e.g. `null`, a number, a string), are ignored.
 * - `{ type: "chunk", data: <string> }` is decoded with `base64ToPcm`; a
 *   non-empty result is emitted as `audio`, and the quiescence timer is
 *   re-armed only if a timer is currently active.
 * - `{ type: "error" }` is emitted as a `tts_stream_error`.
 * - Any other frame type is ignored.
 *
 * @param raw Frame payload: a string, or a value whose `toString()` yields the JSON text.
 * @param emitter Event emitter exposing `emit(event, payload)`.
 * @param armQuiescence Re-arms the post-audio quiescence timer.
 * @param isActiveTimer Returns whether a done-timer is currently pending.
 */
function handleRimeMessage(raw, emitter, armQuiescence, isActiveTimer) {
  let msg;
  try {
    msg = JSON.parse(typeof raw === "string" ? raw : raw.toString());
  } catch {
    // Not JSON: nothing we can act on.
    return;
  }
  // `JSON.parse("null")` succeeds; reading `.type` off `null` would throw
  // inside the socket's message handler, so reject non-object values here.
  if (msg === null || typeof msg !== "object") return;
  if (msg.type === "chunk" && typeof msg.data === "string") {
    const pcm = base64ToPcm(msg.data);
    if (pcm.length > 0) {
      emitter.emit("audio", pcm);
      if (isActiveTimer()) armQuiescence();
    }
    return;
  }
  if (msg.type === "error") emitter.emit("error", makeTtsError("tts_stream_error", `Rime TTS: ${msg.message ?? "unknown error"}`));
}
854
+ /** Build a {@link TtsOpener} from resolved Rime descriptor options. */
855
+ function openRime(opts) {
856
+ return {
857
+ name: "rime",
858
+ async open(openOpts) {
859
+ const apiKey = openOpts.apiKey || process.env.RIME_API_KEY;
860
+ if (!apiKey) throw makeTtsError("tts_auth_failed", "Rime TTS: missing API key. Set RIME_API_KEY in the agent env.");
861
+ const sampleRate = assertSupportedSampleRate(openOpts.sampleRate);
862
+ const model = opts.model ?? "mistv2";
863
+ const lang = opts.language ?? "eng";
864
+ const voice = opts.voice ?? "cove";
865
+ const url = `wss://users-ws.rime.ai/ws2?speaker=${encodeURIComponent(voice)}&modelId=${encodeURIComponent(model)}&audioFormat=pcm&samplingRate=${sampleRate}&lang=${encodeURIComponent(lang)}`;
866
+ let ws;
867
+ try {
868
+ ws = new WsWebSocket(url, { headers: { Authorization: `Bearer ${apiKey}` } });
869
+ } catch (cause) {
870
+ throw makeTtsError("tts_connect_failed", `Rime TTS: failed to create WebSocket: ${cause instanceof Error ? cause.message : String(cause)}`);
871
+ }
872
+ await waitForOpen(ws);
873
+ const emitter = createNanoEvents();
874
+ let closed = false;
875
+ let doneEmitted = false;
876
+ /**
877
+ * After `flush()`, we arm a timer that fires `done`. Initial timeout is
878
+ * `FIRST_AUDIO_TIMEOUT_MS` to give Rime headroom on TTFB; the first
879
+ * chunk swaps it for a shorter `QUIESCENCE_MS` window that resets on
880
+ * each subsequent chunk. `cancel()` emits `done` synchronously.
881
+ */
882
+ let quiescenceTimer = null;
883
+ const clearQuiescence = () => {
884
+ if (quiescenceTimer !== null) {
885
+ clearTimeout(quiescenceTimer);
886
+ quiescenceTimer = null;
887
+ }
888
+ };
889
+ const emitDoneOnce = () => {
890
+ clearQuiescence();
891
+ if (doneEmitted || closed) return;
892
+ doneEmitted = true;
893
+ emitter.emit("done");
894
+ };
895
+ const armQuiescence = () => {
896
+ clearQuiescence();
897
+ quiescenceTimer = setTimeout(emitDoneOnce, QUIESCENCE_MS);
898
+ };
899
+ const armFirstAudioTimer = () => {
900
+ clearQuiescence();
901
+ quiescenceTimer = setTimeout(emitDoneOnce, FIRST_AUDIO_TIMEOUT_MS);
902
+ };
903
+ ws.on("message", (raw) => {
904
+ if (closed) return;
905
+ handleRimeMessage(raw, emitter, armQuiescence, () => quiescenceTimer !== null);
906
+ });
907
+ ws.on("error", (err) => {
908
+ if (closed) return;
909
+ emitter.emit("error", makeTtsError("tts_stream_error", `Rime TTS stream error: ${err?.message ?? String(err)}`));
910
+ });
911
+ ws.on("close", () => {
912
+ if (closed) return;
913
+ emitDoneOnce();
914
+ });
915
+ const close = async () => {
916
+ if (closed) return;
917
+ closed = true;
918
+ clearQuiescence();
919
+ try {
920
+ ws.close();
921
+ } catch {}
922
+ };
923
+ if (openOpts.signal.aborted) close();
924
+ else openOpts.signal.addEventListener("abort", () => void close(), { once: true });
1182
925
  return {
1183
926
  sendText(text) {
1184
927
  if (closed || text.length === 0) return;
1185
- rotateIfPending();
1186
- context.send({
1187
- ...baseRequest,
1188
- transcript: text,
1189
- continue: true
1190
- }).catch(ignoreRejection);
928
+ if (ws.readyState !== WsWebSocket.OPEN) return;
929
+ doneEmitted = false;
930
+ ws.send(JSON.stringify({ text }));
1191
931
  },
1192
932
  flush() {
1193
- if (closed || rotatePending) return;
1194
- context.send({
1195
- ...baseRequest,
1196
- transcript: "",
1197
- continue: false
1198
- }).catch(ignoreRejection);
1199
- rotatePending = true;
933
+ if (closed) return;
934
+ if (ws.readyState !== WsWebSocket.OPEN) return;
935
+ ws.send(JSON.stringify({ text: "." }));
936
+ armFirstAudioTimer();
1200
937
  },
1201
938
  cancel() {
1202
939
  if (closed) return;
1203
- if (!doneEmitted) context.cancel().catch(ignoreRejection);
940
+ if (ws.readyState === WsWebSocket.OPEN) ws.send(JSON.stringify({ operation: "clear" }));
1204
941
  emitDoneOnce();
1205
- rotatePending = true;
1206
942
  },
1207
943
  on(event, fn) {
1208
944
  return emitter.on(event, fn);
1209
945
  },
1210
946
  close,
1211
- _ws: ws,
1212
- _currentContextId: () => context.contextId
947
+ _ws: ws
1213
948
  };
1214
949
  }
1215
950
  };
@@ -1225,53 +960,785 @@ function openCartesia(opts) {
1225
960
  * resolvers here to turn each descriptor into its openable / callable
1226
961
  * host-side counterpart, importing the third-party SDK only at that point.
1227
962
  *
1228
- * The guest sandbox never imports these functions, which is how the agent
1229
- * bundle stays free of `@ai-sdk/anthropic` / `assemblyai` /
1230
- * `@cartesia/cartesia-js`.
963
+ * The guest sandbox never imports these functions, which is how the agent
964
+ * bundle stays free of `@ai-sdk/anthropic` / `assemblyai` /
965
+ * `@cartesia/cartesia-js`.
966
+ */
967
/**
 * Resolve a provider API key by environment-variable name.
 *
 * The agent env (`env`) wins when it holds a non-nullish value for `envVar`;
 * otherwise the host's `process.env` is consulted. When neither defines the
 * variable, `""` is returned and the caller decides whether that is fatal.
 */
function resolveApiKey(envVar, env) {
  const fromAgent = env[envVar];
  if (fromAgent !== null && fromAgent !== undefined) return fromAgent;
  const fromHost = process.env[envVar];
  if (fromHost !== null && fromHost !== undefined) return fromHost;
  return "";
}
975
/**
 * Turn an {@link SttProvider} descriptor into its host-side opener by
 * dispatching on `descriptor.kind`. Unrecognized kinds throw an `Error`
 * naming the supported kinds.
 */
function resolveStt(descriptor) {
  if (descriptor.kind === ASSEMBLYAI_KIND) return openAssemblyAI(descriptor.options);
  if (descriptor.kind === DEEPGRAM_KIND) return openDeepgram(descriptor.options);
  throw new Error(`Unknown STT provider kind: "${descriptor.kind}". Supported: ${ASSEMBLYAI_KIND}, ${DEEPGRAM_KIND}.`);
}
983
/**
 * Turn a {@link TtsProvider} descriptor into its host-side opener by
 * dispatching on `descriptor.kind`. Unrecognized kinds throw an `Error`
 * naming the supported kinds.
 */
function resolveTts(descriptor) {
  if (descriptor.kind === CARTESIA_KIND) return openCartesia(descriptor.options);
  if (descriptor.kind === RIME_KIND) return openRime(descriptor.options);
  throw new Error(`Unknown TTS provider kind: "${descriptor.kind}". Supported: ${CARTESIA_KIND}, ${RIME_KIND}.`);
}
991
/**
 * Turn an {@link LlmProvider} descriptor into a Vercel AI SDK
 * {@link LanguageModel}.
 *
 * Only the Anthropic kind is supported. Its API key is looked up under
 * `ANTHROPIC_API_KEY` via `resolveApiKey`; a missing key throws here, at
 * construction, rather than surfacing later on the first model call.
 *
 * @param descriptor Provider descriptor (`kind` plus `options.model`).
 * @param env Agent environment used for the API-key lookup.
 * @throws Error for an unknown provider kind or a missing API key.
 */
function resolveLlm(descriptor, env) {
  if (descriptor.kind !== ANTHROPIC_KIND) {
    throw new Error(`Unknown LLM provider kind: "${descriptor.kind}". Supported: ${ANTHROPIC_KIND}.`);
  }
  const options = descriptor.options;
  const apiKey = resolveApiKey("ANTHROPIC_API_KEY", env);
  if (!apiKey) throw new Error("Anthropic LLM: missing API key. Set ANTHROPIC_API_KEY in the agent env.");
  const anthropic = createAnthropic({
    apiKey,
    baseURL: "https://api.anthropic.com/v1"
  });
  return anthropic(options.model);
}
1013
+ //#endregion
1014
+ //#region host/runtime-config.ts
1015
+ /**
1016
+ * Runtime dependencies injected into the session pipeline.
1017
+ *
1018
+ * Defines the {@link Logger} interface, a default {@link consoleLogger},
1019
+ * and the {@link S2SConfig} for Speech-to-Speech endpoint configuration.
1020
+ */
1021
/**
 * Adapt a console-style function to the logger signature `(msg, ctx)`.
 * The context is forwarded as a second argument only when it is truthy, so
 * a missing context does not show up as a trailing `undefined` in the output.
 */
function consoleLog(fn) {
  return (msg, ctx) => {
    if (ctx) return fn(msg, ctx);
    return fn(msg);
  };
}
/** Default logger: each level forwards to the matching `console` method. */
const consoleLogger = {
  info: consoleLog(console.log),
  warn: consoleLog(console.warn),
  error: consoleLog(console.error),
  debug: consoleLog(console.debug)
};
1031
/**
 * Build a structured JSON log function for one severity level.
 *
 * Every call writes a single-line JSON object containing `timestamp`
 * (ISO 8601), `level`, and `msg`, with any caller-supplied context fields
 * merged in on top. `error` and `warn` entries are written to stderr; all
 * other levels go to stdout.
 */
function jsonLog(level) {
  const toStderr = level === "error" || level === "warn";
  return (msg, ctx) => {
    const entry = {
      timestamp: new Date().toISOString(),
      level,
      msg
    };
    if (ctx) Object.assign(entry, ctx);
    const stream = toStderr ? process.stderr : process.stdout;
    stream.write(`${JSON.stringify(entry)}\n`);
  };
}
/** Logger that emits one JSON line per entry, for production diagnostics. */
const jsonLogger = {
  info: jsonLog("info"),
  warn: jsonLog("warn"),
  error: jsonLog("error"),
  debug: jsonLog("debug")
};
1053
/**
 * Defaults for the Speech-to-Speech endpoint: the WebSocket URL plus the
 * input and output sample rates, taken from the shared STT/TTS defaults.
 */
const DEFAULT_S2S_CONFIG = {
  wssUrl: "wss://agents.assemblyai.com/v1/voice",
  inputSampleRate: DEFAULT_STT_SAMPLE_RATE,
  outputSampleRate: DEFAULT_TTS_SAMPLE_RATE
};
1059
+ //#endregion
1060
+ //#region host/session-core.ts
1061
/** A `reply_done` dispatch that takes at least this many ms is logged as slow. */
const REPLY_DONE_SLOW_THRESHOLD_MS = 50;
/**
 * Create the transport-agnostic session core.
 *
 * The core sits between a client sink and a transport: it forwards client
 * input (`onAudio`, `onCancel`, ...) to the transport, turns transport
 * callbacks (`onReplyStarted`, `onToolCall`, `onReplyDone`, ...) into client
 * events, keeps the conversation history, and runs tool calls.
 *
 * Tool results are queued per reply and delivered to the transport when that
 * reply's `onReplyDone` arrives. A result is queued only if the reply that
 * requested it is still the current one when the tool finishes; results for
 * a superseded or cancelled reply are dropped instead of leaking into a
 * later reply (the client is still told the call finished).
 *
 * @param opts Session wiring. Fields read here:
 *   - `id`, `agent`: identifiers used in log context.
 *   - `agentConfig.idleTimeoutMs`: idle timeout in ms (default 300000);
 *     `0`, a negative value, or a non-finite value disables it.
 *   - `agentConfig.maxSteps`: optional cap on tool calls per reply.
 *   - `client`: `event(e)`, `playAudioChunk(bytes)`, `playAudioDone()`.
 *   - `transport`: `start()`, `stop()`, `sendUserAudio(bytes)`,
 *     `cancelReply()`, `sendToolResult(callId, result)`.
 *   - `executeTool(name, args, sessionId, history)`: runs one tool call.
 *   - `logger` (default `consoleLogger`), `maxHistory` (default 200;
 *     values <= 0 disable trimming).
 * @returns The session object: lifecycle (`start`, `stop`), client-facing
 *   handlers, and transport-facing callbacks.
 */
function createSessionCore(opts) {
  const log = opts.logger ?? consoleLogger;
  const maxHistory = opts.maxHistory ?? 200;
  const idleMs = (() => {
    const raw = opts.agentConfig.idleTimeoutMs ?? 3e5;
    return raw === 0 || !Number.isFinite(raw) ? 0 : raw;
  })();
  /** Fresh per-reply bookkeeping; `replyId === null` means "no active reply". */
  const freshReply = (replyId) => ({
    currentReplyId: replyId,
    pendingTools: [],
    toolCallCount: 0
  });
  let reply = freshReply(null);
  let history = [];
  let turnPromise = null;
  let idleTimer = null;
  let stopped = false;
  function emit(event) {
    opts.client.event(event);
  }
  /** (Re)start the idle countdown; a no-op once stopped or when disabled. */
  function resetIdle() {
    if (stopped || idleMs <= 0) return;
    if (idleTimer) clearTimeout(idleTimer);
    idleTimer = setTimeout(() => {
      log.info("session idle timeout", { sid: opts.id });
      emit({ type: "idle_timeout" });
    }, idleMs);
  }
  /** Append to the history, keeping only the newest `maxHistory` entries. */
  function pushMessages(...msgs) {
    history.push(...msgs);
    if (maxHistory > 0 && history.length > maxHistory) history.splice(0, history.length - maxHistory);
  }
  function beginReply(replyId) {
    reply = freshReply(replyId);
    // Tool work chained for the previous reply no longer gates this one.
    turnPromise = null;
  }
  function cancelReply() {
    reply = freshReply(null);
  }
  /** Finish the active reply: signal end of audio, emit `reply_done`, clear the id. */
  function flushReply(startMs, hadTurnPromise) {
    const stepsUsed = reply.toolCallCount;
    if (stepsUsed > 0) log.info("Turn complete", {
      steps: stepsUsed,
      agent: opts.agent
    });
    opts.client.playAudioDone();
    emit({ type: "reply_done" });
    reply.currentReplyId = null;
    const durationMs = Date.now() - startMs;
    if (durationMs >= REPLY_DONE_SLOW_THRESHOLD_MS) log.warn("slow reply_done dispatch", {
      sid: opts.id,
      agent: opts.agent,
      durationMs,
      hadTurnPromise
    });
  }
  /**
   * Queue a finished tool result for delivery, but only while the reply that
   * issued the call is still current. Otherwise the result is stale (the
   * reply was superseded or cancelled while the tool ran) and is dropped.
   */
  function queueToolResult(ownerReplyId, callId, result) {
    if (reply.currentReplyId !== ownerReplyId) {
      log.debug("Dropping tool result for superseded reply", {
        sid: opts.id,
        callId
      });
      return;
    }
    reply.pendingTools.push({
      callId,
      result
    });
  }
  return {
    id: opts.id,
    async start() {
      resetIdle();
      await opts.transport.start();
    },
    async stop() {
      if (stopped) return;
      stopped = true;
      if (idleTimer) {
        clearTimeout(idleTimer);
        idleTimer = null;
      }
      // Let in-flight tool calls settle before tearing the transport down.
      if (turnPromise !== null) await turnPromise;
      await opts.transport.stop();
    },
    onAudio(bytes) {
      resetIdle();
      opts.transport.sendUserAudio(bytes);
    },
    onAudioReady() {},
    onCancel() {
      opts.transport.cancelReply();
      emit({ type: "cancelled" });
    },
    onReset() {
      cancelReply();
      history = [];
      emit({ type: "reset" });
    },
    onHistory(messages) {
      pushMessages(...messages);
    },
    onReplyStarted(replyId) {
      beginReply(replyId);
    },
    onReplyDone() {
      const startMs = Date.now();
      const doneReplyId = reply.currentReplyId;
      if (doneReplyId === null) {
        log.debug("Dropping duplicate reply.done (no active reply)");
        return;
      }
      const hadTurnPromise = turnPromise !== null;
      const sendPending = () => {
        // The reply this `done` belongs to is no longer current. Its own
        // results were never queued (see queueToolResult), and whatever is
        // queued now belongs to the newer reply, so leave that queue alone.
        if (reply.currentReplyId !== doneReplyId) return;
        if (reply.pendingTools.length > 0) {
          for (const tool of reply.pendingTools) opts.transport.sendToolResult(tool.callId, tool.result);
          reply.pendingTools = [];
        } else flushReply(startMs, hadTurnPromise);
      };
      // Wait for tool calls issued during this reply before answering.
      if (hadTurnPromise) turnPromise?.then(sendPending);
      else sendPending();
    },
    onCancelled() {
      cancelReply();
      emit({ type: "cancelled" });
    },
    onAudioChunk(bytes) {
      opts.client.playAudioChunk(bytes);
    },
    onAudioDone() {
      opts.client.playAudioDone();
    },
    onUserTranscript(text) {
      emit({
        type: "user_transcript",
        text
      });
      pushMessages({
        role: "user",
        content: text
      });
    },
    onAgentTranscript(text, interrupted) {
      emit({
        type: "agent_transcript",
        text
      });
      // Interrupted agent speech is shown to the client but not kept in history.
      if (!interrupted) pushMessages({
        role: "assistant",
        content: text
      });
    },
    onToolCall(callId, name, args) {
      emit({
        type: "tool_call",
        toolCallId: callId,
        toolName: name,
        args
      });
      const ownerReplyId = reply.currentReplyId;
      if (ownerReplyId === null) {
        log.warn("tool_call with no active reply", {
          sid: opts.id,
          name
        });
        return;
      }
      reply.toolCallCount++;
      const maxSteps = opts.agentConfig.maxSteps;
      if (maxSteps !== void 0 && reply.toolCallCount > maxSteps) {
        log.info("maxSteps exceeded; refusing tool call", {
          toolCallCount: reply.toolCallCount,
          maxSteps
        });
        // The transport receives the refusal; the client only sees an empty result.
        reply.pendingTools.push({
          callId,
          result: JSON.stringify({ error: "Maximum tool steps reached. Please respond to the user now." })
        });
        emit({
          type: "tool_call_done",
          toolCallId: callId,
          result: "{}"
        });
        return;
      }
      const p = (async () => {
        try {
          const result = await opts.executeTool(name, args, opts.id, history);
          queueToolResult(ownerReplyId, callId, result);
          emit({
            type: "tool_call_done",
            toolCallId: callId,
            result
          });
        } catch (err) {
          const message = err instanceof Error ? err.message : String(err);
          queueToolResult(ownerReplyId, callId, JSON.stringify({ error: message }));
          emit({
            type: "tool_call_done",
            toolCallId: callId,
            result: message
          });
        }
      })();
      // Serialize completion behind any earlier tool calls of this reply.
      turnPromise = (turnPromise ?? Promise.resolve()).then(() => p);
    },
    onError(code, message) {
      emit({
        type: "error",
        code,
        message
      });
    },
    onSpeechStarted() {
      emit({ type: "speech_started" });
    },
    onSpeechStopped() {
      emit({ type: "speech_stopped" });
    }
  };
}
1286
+ //#endregion
1287
+ //#region host/tool-executor.ts
1288
+ /**
1289
+ * Tool execution — validates arguments and invokes tool handlers.
1290
+ *
1291
+ * {@link executeToolCall} is the single entry point used by both the
1292
+ * direct (self-hosted) runtime and the platform sandbox sidecar.
1293
+ */
1294
/** Return a promise that resolves (with no value) after a zero-delay timer fires. */
const yieldTick = () => new Promise((resume) => {
  setTimeout(resume, 0);
});
1295
/**
 * Assemble the context object handed to a tool's `execute` function.
 *
 * Missing optional fields get defaults: `state` -> `{}`, `messages` -> `[]`,
 * `sessionId` -> `""`. `kv` is exposed through a getter that throws
 * `"KV not available"` when no store was supplied, so only tools that
 * actually touch KV fail. `send` forwards to `opts.send` if it is set at the
 * time of the call and is otherwise a no-op.
 */
function buildToolContext(opts) {
  const kvStore = opts.kv;
  return {
    env: opts.env,
    state: opts.state ?? {},
    get kv() {
      if (kvStore) return kvStore;
      throw new Error("KV not available");
    },
    messages: opts.messages ?? [],
    sessionId: opts.sessionId ?? "",
    send(event, data) {
      opts.send?.(event, data);
    }
  };
}
1311
/**
 * Validate the arguments for one tool call, run the tool, and return its
 * result as a string.
 *
 * @param name Tool name, used in error and log messages.
 * @param args Raw arguments; validated with `tool.parameters.safeParse`
 *   (or `EMPTY_PARAMS` when the tool declares no parameters).
 * @param options Execution options: `tool` (with `execute` and optional
 *   `parameters`), an optional `logger`, plus the fields consumed by
 *   `buildToolContext`.
 * @returns Always a string: the tool's string result as-is, the JSON
 *   serialization of any other result, `"null"` for `null`/`undefined` or a
 *   result JSON cannot represent, or a `toolError(...)` payload when
 *   validation fails, the tool throws, or it exceeds
 *   `TOOL_EXECUTION_TIMEOUT_MS`.
 */
async function executeToolCall(name, args, options) {
  const { tool } = options;
  const parsed = (tool.parameters ?? EMPTY_PARAMS).safeParse(args);
  if (!parsed.success) {
    const issues = (parsed.error?.issues ?? []).map((i) => `${i.path.map(String).join(".")}: ${i.message}`).join(", ");
    return toolError(`Invalid arguments for tool "${name}": ${issues}`);
  }
  try {
    const ctx = buildToolContext(options);
    // Yield to the event loop on either side of the tool so other queued
    // work is not starved by back-to-back tool calls.
    await yieldTick();
    const result = await pTimeout(Promise.resolve(tool.execute(parsed.data, ctx)), {
      milliseconds: TOOL_EXECUTION_TIMEOUT_MS,
      message: `Tool "${name}" timed out after ${TOOL_EXECUTION_TIMEOUT_MS}ms`
    });
    await yieldTick();
    if (result == null) return "null";
    if (typeof result === "string") return result;
    // JSON.stringify yields `undefined` for values JSON cannot represent
    // (functions, symbols); keep the string contract in that case.
    return JSON.stringify(result) ?? "null";
  } catch (err) {
    const log = options.logger;
    if (log) log.warn("Tool execution failed", {
      tool: name,
      error: errorDetail(err)
    });
    else console.warn(`[tool-executor] Tool execution failed: ${name}`, err);
    return toolError(errorMessage(err));
  }
}
1335
+ //#endregion
1336
+ //#region host/to-vercel-tools.ts
1337
+ /**
1338
+ * Converts agent {@link ToolSchema}[] to Vercel AI SDK tools with `execute`
1339
+ * delegation to the agent's {@link ExecuteTool} function.
1340
+ *
1341
+ * The pipeline orchestrator passes the output to `streamText({ tools })`.
1342
+ * Each produced tool's `execute` closure calls
1343
+ * `ctx.executeTool(name, args, sessionId, messages(), { signal, toolCallId })`,
1344
+ * so the existing agent tool infrastructure (argument validation, KV, hooks,
1345
+ * timeout) remains the single source of truth for tool behavior.
1346
+ *
1347
+ * Per-call `options.abortSignal` (forwarded by `streamText` when the
1348
+ * outer turn is aborted, e.g. barge-in) takes precedence over the
1349
+ * bag-level `ctx.signal` so individual invocations respect streamText
1350
+ * aborts.
1231
1351
  */
1232
1352
  /**
1233
- * Look up a provider API key: agent env first (set via `aai secret put` or
1234
- * `.env`), then the host's `process.env` as a fallback for self-hosted mode.
1235
- * Returns `""` if neither has it — the caller decides whether that's fatal.
1353
+ * Convert an array of {@link ToolSchema} to a Vercel AI SDK `ToolSet`
1354
+ * (record keyed by tool name).
1355
+ *
1356
+ * Uses the v6 `tool()` helper with `inputSchema: jsonSchema(...)` wrapping
1357
+ * the agent's JSON Schema `parameters`. Execution is delegated to
1358
+ * `ctx.executeTool` so validation, KV, timeouts, and hooks keep working.
1236
1359
  */
1237
- function resolveApiKey(envVar, env) {
1238
- return env[envVar] ?? process.env[envVar] ?? "";
1360
+ function toVercelTools(schemas, ctx) {
1361
+ const out = {};
1362
+ for (const schema of schemas) out[schema.name] = tool({
1363
+ description: schema.description,
1364
+ inputSchema: jsonSchema(schema.parameters),
1365
+ execute: async (args, options) => {
1366
+ const input = args ?? {};
1367
+ const signal = options.abortSignal ?? ctx.signal;
1368
+ const opts = {};
1369
+ if (signal !== void 0) opts.signal = signal;
1370
+ if (options.toolCallId !== void 0) opts.toolCallId = options.toolCallId;
1371
+ return ctx.executeTool(schema.name, input, ctx.sessionId, ctx.messages().slice(), opts);
1372
+ }
1373
+ });
1374
+ return out;
1239
1375
  }
1240
- /** Resolve an {@link SttProvider} descriptor into a host-side opener. */
1241
- function resolveStt(descriptor) {
1242
- switch (descriptor.kind) {
1243
- case ASSEMBLYAI_KIND: return openAssemblyAI(descriptor.options);
1244
- default: throw new Error(`Unknown STT provider kind: "${descriptor.kind}". Supported: ${ASSEMBLYAI_KIND}.`);
1376
+ //#endregion
1377
+ //#region host/transports/pipeline-transport.ts
1378
+ function toModelMessage(m) {
1379
+ if (m.role === "user") return {
1380
+ role: "user",
1381
+ content: m.content
1382
+ };
1383
+ return {
1384
+ role: "assistant",
1385
+ content: m.content
1386
+ };
1387
+ }
1388
+ /** Create a pipeline-mode Transport (STT → LLM → TTS). */
1389
+ function createPipelineTransport(opts) {
1390
+ const log = opts.logger ?? consoleLogger;
1391
+ const sttSampleRate = opts.sttSampleRate ?? 16e3;
1392
+ const ttsSampleRate = opts.ttsSampleRate ?? 24e3;
1393
+ const maxSteps = opts.maxSteps ?? 5;
1394
+ const toolChoice = opts.toolChoice ?? "auto";
1395
+ const toolSchemas = opts.toolSchemas ?? [];
1396
+ const executeTool = opts.executeTool ?? (async () => {
1397
+ throw new Error("No executeTool provided");
1398
+ });
1399
+ const { callbacks, sessionConfig } = opts;
1400
+ const systemPrompt = sessionConfig.systemPrompt;
1401
+ const sessionAbort = new AbortController();
1402
+ let audioReady = false;
1403
+ let terminated = false;
1404
+ let sttSession = null;
1405
+ let ttsSession = null;
1406
+ let turnController = null;
1407
+ let nextReplyId = 0;
1408
+ const conversationMessages = sessionConfig.history ? [...sessionConfig.history] : [];
1409
+ let turnPromise = null;
1410
+ const sttSubs = [];
1411
+ const ttsSubs = [];
1412
+ function pushMessages(...msgs) {
1413
+ conversationMessages.push(...msgs);
1414
+ if (conversationMessages.length > 200) conversationMessages.splice(0, conversationMessages.length - 200);
1415
+ }
1416
+ function chainTurn(p) {
1417
+ turnPromise = (turnPromise ?? Promise.resolve()).then(() => p);
1418
+ }
1419
+ function emitError(code, message) {
1420
+ callbacks.onError(code, message);
1421
+ }
1422
+ /**
1423
+ * Tear down after an unrecoverable provider error. Aborts the in-flight
1424
+ * turn, cancels TTS, signals providers to close. Idempotent.
1425
+ */
1426
+ function terminate() {
1427
+ if (terminated) return;
1428
+ terminated = true;
1429
+ if (turnController !== null) {
1430
+ turnController.abort();
1431
+ turnController = null;
1432
+ }
1433
+ ttsSession?.cancel();
1434
+ callbacks.onCancelled();
1435
+ sessionAbort.abort();
1436
+ }
1437
+ function onSttPartial(_text) {
1438
+ if (terminated) return;
1439
+ if (turnController === null) return;
1440
+ log.info("Pipeline barge-in", { sid: opts.sid });
1441
+ turnController.abort();
1442
+ turnController = null;
1443
+ ttsSession?.cancel();
1444
+ callbacks.onCancelled();
1445
+ }
1446
+ function onSttFinal(text) {
1447
+ if (terminated) return;
1448
+ const trimmed = text.trim();
1449
+ if (trimmed.length === 0) return;
1450
+ if (turnController !== null) {
1451
+ log.info("Pipeline replacing in-flight turn", { sid: opts.sid });
1452
+ turnController.abort();
1453
+ turnController = null;
1454
+ ttsSession?.cancel();
1455
+ callbacks.onCancelled();
1456
+ }
1457
+ callbacks.onUserTranscript(text);
1458
+ chainTurn(runTurn(trimmed).catch((err) => {
1459
+ log.error("Pipeline turn crashed", {
1460
+ error: errorMessage(err),
1461
+ sid: opts.sid
1462
+ });
1463
+ }));
1464
+ }
1465
+ function onSttError(err) {
1466
+ if (terminated) return;
1467
+ log.error("STT error", {
1468
+ code: err.code,
1469
+ message: err.message,
1470
+ sid: opts.sid
1471
+ });
1472
+ emitError("stt", err.message);
1473
+ terminate();
1474
+ }
1475
+ function onTtsError(err) {
1476
+ if (terminated) return;
1477
+ log.error("TTS error", {
1478
+ code: err.code,
1479
+ message: err.message,
1480
+ sid: opts.sid
1481
+ });
1482
+ emitError("tts", err.message);
1483
+ terminate();
1484
+ }
1485
+ async function consumeLlmStream(ctl, messages, tools, onDelta) {
1486
+ try {
1487
+ const result = streamText({
1488
+ model: opts.llm,
1489
+ system: systemPrompt,
1490
+ messages,
1491
+ tools,
1492
+ toolChoice,
1493
+ stopWhen: stepCountIs(maxSteps),
1494
+ abortSignal: ctl.signal
1495
+ });
1496
+ for await (const part of result.fullStream) {
1497
+ if (ctl.signal.aborted) break;
1498
+ handleStreamPart(part, ctl, onDelta);
1499
+ }
1500
+ } catch (err) {
1501
+ if (!ctl.signal.aborted) {
1502
+ const msg = errorMessage(err);
1503
+ log.error("LLM streamText failed", {
1504
+ error: msg,
1505
+ sid: opts.sid
1506
+ });
1507
+ emitError("llm", msg);
1508
+ }
1509
+ }
1510
+ }
1511
+ function handleStreamPart(part, _ctl, onDelta) {
1512
+ switch (part.type) {
1513
+ case "text-delta": {
1514
+ const delta = part.text ?? "";
1515
+ if (delta.length === 0) return;
1516
+ onDelta(delta);
1517
+ ttsSession?.sendText(delta);
1518
+ return;
1519
+ }
1520
+ case "tool-call": {
1521
+ const input = part.input ?? {};
1522
+ callbacks.onToolCall(part.toolCallId ?? "", part.toolName ?? "", input);
1523
+ return;
1524
+ }
1525
+ case "error": {
1526
+ const msg = errorMessage(part.error);
1527
+ log.error("LLM stream error", {
1528
+ message: msg,
1529
+ sid: opts.sid
1530
+ });
1531
+ emitError("llm", msg);
1532
+ return;
1533
+ }
1534
+ default: return;
1535
+ }
1536
+ }
1537
+ /**
1538
+ * Flush TTS and wait for drain. Resolves on:
1539
+ * - TTS emits `done`
1540
+ * - `signal` aborts (barge-in / provider error / session stop)
1541
+ * - PIPELINE_FLUSH_TIMEOUT_MS elapses
1542
+ * Resolves immediately if no TTS session.
1543
+ */
1544
+ function flushTtsAndWait(signal) {
1545
+ const tts = ttsSession;
1546
+ if (!tts) return Promise.resolve();
1547
+ return new Promise((resolve) => {
1548
+ let off = null;
1549
+ let timer = null;
1550
+ const cleanup = () => {
1551
+ if (off) {
1552
+ off();
1553
+ off = null;
1554
+ }
1555
+ if (timer) {
1556
+ clearTimeout(timer);
1557
+ timer = null;
1558
+ }
1559
+ signal.removeEventListener("abort", onAbort);
1560
+ };
1561
+ const finish = () => {
1562
+ cleanup();
1563
+ resolve();
1564
+ };
1565
+ const onAbort = () => finish();
1566
+ if (signal.aborted) {
1567
+ resolve();
1568
+ return;
1569
+ }
1570
+ signal.addEventListener("abort", onAbort, { once: true });
1571
+ off = tts.on("done", finish);
1572
+ timer = setTimeout(() => {
1573
+ log.warn("TTS flush timeout", {
1574
+ sid: opts.sid,
1575
+ timeoutMs: PIPELINE_FLUSH_TIMEOUT_MS
1576
+ });
1577
+ finish();
1578
+ }, PIPELINE_FLUSH_TIMEOUT_MS);
1579
+ tts.flush();
1580
+ });
1581
+ }
1582
+ async function runTurn(userText) {
1583
+ const replyId = `pipeline-${++nextReplyId}`;
1584
+ callbacks.onReplyStarted(replyId);
1585
+ pushMessages({
1586
+ role: "user",
1587
+ content: userText
1588
+ });
1589
+ const ctl = new AbortController();
1590
+ turnController = ctl;
1591
+ const tools = toVercelTools(toolSchemas, {
1592
+ executeTool,
1593
+ sessionId: opts.sid,
1594
+ messages: () => conversationMessages,
1595
+ signal: ctl.signal
1596
+ });
1597
+ const messages = conversationMessages.map(toModelMessage);
1598
+ let accumulated = "";
1599
+ await consumeLlmStream(ctl, messages, tools, (delta) => {
1600
+ accumulated += delta;
1601
+ });
1602
+ if (ctl.signal.aborted) {
1603
+ if (turnController === ctl) turnController = null;
1604
+ return;
1605
+ }
1606
+ if (accumulated.length > 0) {
1607
+ callbacks.onAgentTranscript(accumulated, false);
1608
+ pushMessages({
1609
+ role: "assistant",
1610
+ content: accumulated
1611
+ });
1612
+ }
1613
+ await flushTtsAndWait(ctl.signal);
1614
+ if (ctl.signal.aborted) {
1615
+ if (turnController === ctl) turnController = null;
1616
+ return;
1617
+ }
1618
+ callbacks.onReplyDone();
1619
+ if (turnController === ctl) turnController = null;
1620
+ }
1621
+ async function runGreeting(text) {
1622
+ const replyId = `pipeline-greeting-${++nextReplyId}`;
1623
+ callbacks.onReplyStarted(replyId);
1624
+ const ctl = new AbortController();
1625
+ turnController = ctl;
1626
+ callbacks.onAgentTranscript(text, false);
1627
+ pushMessages({
1628
+ role: "assistant",
1629
+ content: text
1630
+ });
1631
+ ttsSession?.sendText(text);
1632
+ await flushTtsAndWait(ctl.signal);
1633
+ if (ctl.signal.aborted) {
1634
+ if (turnController === ctl) turnController = null;
1635
+ return;
1636
+ }
1637
+ callbacks.onReplyDone();
1638
+ if (turnController === ctl) turnController = null;
1639
+ }
1640
+ function reportOpenRejection(which, reason) {
1641
+ const msg = errorMessage(reason);
1642
+ log.error(`${which === "stt" ? "STT" : "TTS"} open failed`, {
1643
+ error: msg,
1644
+ sid: opts.sid
1645
+ });
1646
+ emitError(which, msg);
1647
+ }
1648
+ async function adoptStt(session, teardown) {
1649
+ if (teardown) {
1650
+ await session.close().catch(() => void 0);
1651
+ return;
1652
+ }
1653
+ sttSession = session;
1654
+ sttSubs.push(session.on("partial", onSttPartial));
1655
+ sttSubs.push(session.on("final", onSttFinal));
1656
+ sttSubs.push(session.on("error", onSttError));
1657
+ }
1658
+ async function adoptTts(session, teardown) {
1659
+ if (teardown) {
1660
+ await session.close().catch(() => void 0);
1661
+ return;
1662
+ }
1663
+ ttsSession = session;
1664
+ ttsSubs.push(session.on("audio", (pcm) => {
1665
+ callbacks.onAudioChunk(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
1666
+ }));
1667
+ ttsSubs.push(session.on("error", onTtsError));
1668
+ }
1669
+ async function openProviders() {
1670
+ const [sttResult, ttsResult] = await Promise.allSettled([opts.stt.open({
1671
+ sampleRate: sttSampleRate,
1672
+ apiKey: opts.providerKeys.stt,
1673
+ sttPrompt: opts.sttPrompt,
1674
+ signal: sessionAbort.signal
1675
+ }), opts.tts.open({
1676
+ sampleRate: ttsSampleRate,
1677
+ apiKey: opts.providerKeys.tts,
1678
+ signal: sessionAbort.signal
1679
+ })]);
1680
+ if (sttResult.status === "rejected") reportOpenRejection("stt", sttResult.reason);
1681
+ if (ttsResult.status === "rejected") reportOpenRejection("tts", ttsResult.reason);
1682
+ const aborted = sessionAbort.signal.aborted;
1683
+ const sttFailed = sttResult.status === "rejected";
1684
+ const ttsFailed = ttsResult.status === "rejected";
1685
+ const teardown = aborted || sttFailed || ttsFailed;
1686
+ if (sttResult.status === "fulfilled") await adoptStt(sttResult.value, teardown);
1687
+ if (ttsResult.status === "fulfilled") await adoptTts(ttsResult.value, teardown);
1688
+ if (!aborted && (sttFailed || ttsFailed)) terminate();
1245
1689
  }
1246
- }
1247
- /** Resolve a {@link TtsProvider} descriptor into a host-side opener. */
1248
- function resolveTts(descriptor) {
1249
- switch (descriptor.kind) {
1250
- case CARTESIA_KIND: return openCartesia(descriptor.options);
1251
- default: throw new Error(`Unknown TTS provider kind: "${descriptor.kind}". Supported: ${CARTESIA_KIND}.`);
1690
+ function onAudioReady() {
1691
+ if (audioReady || terminated) return;
1692
+ audioReady = true;
1693
+ if (opts.skipGreeting) return;
1694
+ const greeting = sessionConfig.greeting;
1695
+ if (!greeting) return;
1696
+ chainTurn(runGreeting(greeting).catch((err) => {
1697
+ log.error("Pipeline greeting failed", {
1698
+ error: errorMessage(err),
1699
+ sid: opts.sid
1700
+ });
1701
+ }));
1252
1702
  }
1253
- }
1254
- /**
1255
- * Resolve an {@link LlmProvider} descriptor into a Vercel AI SDK
1256
- * {@link LanguageModel}.
1257
- *
1258
- * The API key is pulled from the agent's env (e.g. `ANTHROPIC_API_KEY`).
1259
- * Missing keys throw here — the pipeline session would fail on first
1260
- * `streamText` call otherwise, and the error is clearer at construction.
1261
- */
1262
- function resolveLlm(descriptor, env) {
1263
- switch (descriptor.kind) {
1264
- case ANTHROPIC_KIND: {
1265
- const options = descriptor.options;
1266
- const apiKey = resolveApiKey("ANTHROPIC_API_KEY", env);
1267
- if (!apiKey) throw new Error("Anthropic LLM: missing API key. Set ANTHROPIC_API_KEY in the agent env.");
1268
- return createAnthropic({
1269
- apiKey,
1270
- baseURL: "https://api.anthropic.com/v1"
1271
- })(options.model);
1703
+ return {
1704
+ async start() {
1705
+ await openProviders();
1706
+ callbacks.onSessionReady?.(opts.sid);
1707
+ onAudioReady();
1708
+ },
1709
+ async stop() {
1710
+ if (sessionAbort.signal.aborted) return;
1711
+ sessionAbort.abort();
1712
+ turnController?.abort();
1713
+ for (const off of sttSubs) off();
1714
+ for (const off of ttsSubs) off();
1715
+ sttSubs.length = 0;
1716
+ ttsSubs.length = 0;
1717
+ if (turnPromise !== null) await turnPromise;
1718
+ await sttSession?.close().catch(() => {});
1719
+ await ttsSession?.close().catch(() => {});
1720
+ },
1721
+ sendUserAudio(bytes) {
1722
+ if (terminated || !audioReady) return;
1723
+ const offset = bytes.byteOffset;
1724
+ const length = bytes.byteLength;
1725
+ let pcm;
1726
+ if (offset % 2 === 0 && length % 2 === 0) pcm = new Int16Array(bytes.buffer, offset, length / 2);
1727
+ else {
1728
+ const copy = new Uint8Array(length - length % 2);
1729
+ copy.set(bytes.subarray(0, copy.byteLength));
1730
+ pcm = new Int16Array(copy.buffer);
1731
+ }
1732
+ sttSession?.sendAudio(pcm);
1733
+ },
1734
+ sendToolResult(_callId, _result) {},
1735
+ cancelReply() {
1736
+ if (terminated) return;
1737
+ turnController?.abort();
1738
+ turnController = null;
1739
+ ttsSession?.cancel();
1272
1740
  }
1273
- default: throw new Error(`Unknown LLM provider kind: "${descriptor.kind}". Supported: ${ANTHROPIC_KIND}.`);
1274
- }
1741
+ };
1275
1742
  }
1276
1743
  //#endregion
1277
1744
  //#region host/s2s.ts
@@ -1326,72 +1793,59 @@ function parseS2sMessage(obj) {
1326
1793
  const result = S2sMessageSchema.safeParse(obj);
1327
1794
  return result.success ? result.data : void 0;
1328
1795
  }
1329
- function dispatchS2sMessage(emitter, msg, state, dispatchCtx) {
1796
+ function dispatchS2sMessage(callbacks, msg, state, ctx) {
1330
1797
  switch (msg.type) {
1331
1798
  case "session.ready":
1332
- emitter.emit("ready", { sessionId: msg.session_id });
1799
+ callbacks.onSessionReady(msg.session_id);
1333
1800
  break;
1334
1801
  case "session.updated": break;
1335
1802
  case "input.speech.started":
1336
1803
  if (!state.speechActive) {
1337
1804
  state.speechActive = true;
1338
- emitter.emit("event", { type: "speech_started" });
1805
+ callbacks.onSpeechStarted();
1339
1806
  }
1340
1807
  break;
1341
1808
  case "input.speech.stopped":
1342
1809
  if (state.speechActive) {
1343
1810
  state.speechActive = false;
1344
- emitter.emit("event", { type: "speech_stopped" });
1811
+ callbacks.onSpeechStopped();
1345
1812
  }
1346
1813
  break;
1347
1814
  case "transcript.user":
1348
- emitter.emit("event", {
1349
- type: "user_transcript",
1350
- text: msg.text
1351
- });
1815
+ callbacks.onUserTranscript(msg.text);
1352
1816
  break;
1353
1817
  case "reply.started":
1354
- emitter.emit("replyStarted", { replyId: msg.reply_id });
1818
+ callbacks.onReplyStarted(msg.reply_id);
1355
1819
  break;
1356
1820
  case "transcript.agent":
1357
- emitter.emit("event", {
1358
- type: "agent_transcript",
1359
- text: msg.text,
1360
- _interrupted: msg.interrupted
1361
- });
1821
+ callbacks.onAgentTranscript(msg.text, msg.interrupted);
1362
1822
  break;
1363
1823
  case "tool.call":
1364
- emitter.emit("event", {
1365
- type: "tool_call",
1366
- toolCallId: msg.call_id,
1367
- toolName: msg.name,
1368
- args: msg.args
1369
- });
1824
+ callbacks.onToolCall(msg.call_id, msg.name, msg.args);
1370
1825
  break;
1371
1826
  case "reply.done":
1372
- dispatchCtx.log.info("S2S << reply.done", {
1373
- ...dispatchCtx.sid !== void 0 ? { sid: dispatchCtx.sid } : {},
1827
+ ctx.log.info("S2S << reply.done", {
1828
+ ...ctx.sid !== void 0 ? { sid: ctx.sid } : {},
1374
1829
  status: msg.status ?? "completed"
1375
1830
  });
1376
- if (msg.status === "interrupted") emitter.emit("event", { type: "cancelled" });
1377
- else emitter.emit("event", { type: "reply_done" });
1831
+ if (msg.status === "interrupted") callbacks.onCancelled();
1832
+ else callbacks.onReplyDone();
1378
1833
  break;
1379
1834
  case "session.error":
1380
- if (msg.code === "session_not_found" || msg.code === "session_forbidden") emitter.emit("sessionExpired");
1381
- else emitter.emit("error", new Error(msg.message));
1835
+ if (msg.code === "session_not_found" || msg.code === "session_forbidden") callbacks.onSessionExpired();
1836
+ else callbacks.onError(new Error(msg.message));
1382
1837
  break;
1383
1838
  case "error":
1384
- emitter.emit("error", new Error(msg.message));
1839
+ callbacks.onError(new Error(msg.message));
1385
1840
  break;
1386
1841
  default: break;
1387
1842
  }
1388
1843
  }
1389
1844
  function connectS2s(opts) {
1390
- const { apiKey, config, createWebSocket, logger: log = consoleLogger, sid } = opts;
1845
+ const { apiKey, config, createWebSocket, callbacks, logger: log = consoleLogger, sid } = opts;
1391
1846
  return new Promise((resolve, reject) => {
1392
1847
  log.info("S2S connecting", { url: config.wssUrl });
1393
1848
  const ws = createWebSocket(config.wssUrl, { headers: { Authorization: `Bearer ${apiKey}` } });
1394
- const emitter = createNanoEvents();
1395
1849
  const dispatchState = { speechActive: false };
1396
1850
  const dispatchCtx = sid !== void 0 ? {
1397
1851
  log,
@@ -1409,7 +1863,6 @@ function connectS2s(opts) {
1409
1863
  ws.send(json);
1410
1864
  }
1411
1865
  const handle = {
1412
- on: emitter.on.bind(emitter),
1413
1866
  sendAudio(audio) {
1414
1867
  if (ws.readyState !== 1) {
1415
1868
  log.debug("S2S sendAudio dropped: socket not open");
@@ -1422,16 +1875,15 @@ function connectS2s(opts) {
1422
1875
  ws.send(jsonFrame);
1423
1876
  },
1424
1877
  sendToolResult(callId, result) {
1425
- const msg = {
1426
- type: "tool.result",
1427
- call_id: callId,
1428
- result
1429
- };
1430
1878
  log.info("S2S >> tool.result", {
1431
1879
  call_id: callId,
1432
1880
  resultLength: result.length
1433
1881
  });
1434
- send(msg);
1882
+ send({
1883
+ type: "tool.result",
1884
+ call_id: callId,
1885
+ result
1886
+ });
1435
1887
  },
1436
1888
  updateSession(sessionConfig) {
1437
1889
  const { systemPrompt, ...rest } = sessionConfig;
@@ -1468,8 +1920,7 @@ function connectS2s(opts) {
1468
1920
  }
1469
1921
  function handleAudioFastPath(obj) {
1470
1922
  if (obj.type === "reply.audio" && typeof obj.data === "string") {
1471
- const audioBytes = base64ToUint8(obj.data);
1472
- emitter.emit("audio", { audio: audioBytes });
1923
+ callbacks.onAudio(base64ToUint8(obj.data));
1473
1924
  return true;
1474
1925
  }
1475
1926
  return false;
@@ -1479,7 +1930,7 @@ function connectS2s(opts) {
1479
1930
  if (obj.type === "reply.done") return;
1480
1931
  log.info(`S2S << ${obj.type}`);
1481
1932
  }
1482
- function handleS2sMessage(ev) {
1933
+ ws.addEventListener("message", (ev) => {
1483
1934
  const raw = tryParseJson(ev.data);
1484
1935
  if (raw === void 0) return;
1485
1936
  if (typeof raw !== "object" || raw === null || Array.isArray(raw)) {
@@ -1494,9 +1945,8 @@ function connectS2s(opts) {
1494
1945
  log.warn(`S2S << unrecognised message type: ${obj.type ?? JSON.stringify(raw).slice(0, 200)}`);
1495
1946
  return;
1496
1947
  }
1497
- dispatchS2sMessage(emitter, parsed, dispatchState, dispatchCtx);
1498
- }
1499
- ws.addEventListener("message", handleS2sMessage);
1948
+ dispatchS2sMessage(callbacks, parsed, dispatchState, dispatchCtx);
1949
+ });
1500
1950
  ws.addEventListener("close", (ev) => {
1501
1951
  const code = ev.code ?? 0;
1502
1952
  const reason = ev.reason ?? "";
@@ -1505,394 +1955,102 @@ function connectS2s(opts) {
1505
1955
  reason
1506
1956
  });
1507
1957
  if (!opened) reject(/* @__PURE__ */ new Error(`WebSocket closed before open (code: ${code})`));
1508
- emitter.emit("close", code, reason);
1958
+ callbacks.onClose(code, reason);
1509
1959
  });
1510
1960
  ws.addEventListener("error", (ev) => {
1511
1961
  const message = typeof ev.message === "string" ? ev.message : "WebSocket error";
1512
1962
  const errObj = new Error(message);
1513
1963
  log.error("S2S WebSocket error", { error: errObj.message });
1514
1964
  if (!opened) reject(errObj);
1515
- else emitter.emit("error", errObj);
1965
+ else callbacks.onError(errObj);
1516
1966
  });
1517
1967
  });
1518
1968
  }
1519
1969
  //#endregion
1520
- //#region host/session.ts
1521
- /** @internal Not part of the public API. Exposed for testing only. */
1970
+ //#region host/transports/s2s-transport.ts
1971
+ /** @internal Exposed for testing allows spying on connectS2s in unit tests. */
1522
1972
  const _internals = { connectS2s };
1523
- /**
1524
- * Create an idle timer that closes the S2S connection after inactivity.
1525
- * Convention: `timeoutMs <= 0` disables the timer entirely (returns a no-op).
1526
- * This allows agents to opt out of idle timeout via `idleTimeoutMs: 0` in their config.
1527
- */
1528
- function createIdleTimer(opts) {
1529
- if (opts.timeoutMs <= 0) return {
1530
- reset() {},
1531
- clear() {}
1532
- };
1533
- let timer = null;
1534
- return {
1535
- reset() {
1536
- if (timer !== null) clearTimeout(timer);
1537
- timer = setTimeout(() => {
1538
- opts.log.info("S2S idle timeout", {
1539
- timeoutMs: opts.timeoutMs,
1540
- agent: opts.agent
1541
- });
1542
- opts.client.event({ type: "idle_timeout" });
1543
- opts.ctx.s2s?.close();
1544
- }, opts.timeoutMs);
1545
- },
1546
- clear() {
1547
- if (timer !== null) {
1548
- clearTimeout(timer);
1549
- timer = null;
1973
+ function createS2sTransport(opts) {
1974
+ const log = opts.logger ?? consoleLogger;
1975
+ const createWs = opts.createWebSocket ?? defaultCreateS2sWebSocket;
1976
+ let handle = null;
1977
+ let currentReplyId = null;
1978
+ async function start() {
1979
+ handle = await _internals.connectS2s({
1980
+ apiKey: opts.apiKey,
1981
+ config: opts.s2sConfig,
1982
+ createWebSocket: createWs,
1983
+ logger: log,
1984
+ sid: opts.sid,
1985
+ callbacks: {
1986
+ onSessionReady: (providerSessionId) => opts.callbacks.onSessionReady?.(providerSessionId),
1987
+ onReplyStarted: (replyId) => {
1988
+ currentReplyId = replyId;
1989
+ opts.callbacks.onReplyStarted(replyId);
1990
+ },
1991
+ onReplyDone: () => {
1992
+ currentReplyId = null;
1993
+ opts.callbacks.onReplyDone();
1994
+ },
1995
+ onCancelled: () => {
1996
+ currentReplyId = null;
1997
+ opts.callbacks.onCancelled();
1998
+ },
1999
+ onAudio: (bytes) => opts.callbacks.onAudioChunk(bytes),
2000
+ onUserTranscript: opts.callbacks.onUserTranscript,
2001
+ onAgentTranscript: opts.callbacks.onAgentTranscript,
2002
+ onToolCall: opts.callbacks.onToolCall,
2003
+ onSpeechStarted: opts.callbacks.onSpeechStarted,
2004
+ onSpeechStopped: opts.callbacks.onSpeechStopped,
2005
+ onSessionExpired: () => {
2006
+ log.info("S2S session expired", { sid: opts.sid });
2007
+ handle?.close();
2008
+ },
2009
+ onError: (err) => opts.callbacks.onError("internal", err.message),
2010
+ onClose: (code, reason) => {
2011
+ if (currentReplyId !== null) {
2012
+ log.warn("S2S closed with active reply", {
2013
+ sid: opts.sid,
2014
+ agent: opts.agent,
2015
+ activeReplyId: currentReplyId,
2016
+ code,
2017
+ reason
2018
+ });
2019
+ opts.callbacks.onError("connection", `S2S closed mid-reply (code=${code})`);
2020
+ } else log.info("S2S closed", {
2021
+ code,
2022
+ reason
2023
+ });
2024
+ }
1550
2025
  }
1551
- }
1552
- };
1553
- }
1554
- /**
1555
- * Complete a tool call by truncating the result, emitting a `tool_call_done` event,
1556
- * and accumulating the result in `ctx.reply.pendingTools` — but only if the reply that
1557
- * initiated this call is still active.
1558
- */
1559
- function finishToolCall(ctx, callId, result, replyId) {
1560
- const truncatedResult = result.length > 4e3 ? result.slice(0, MAX_TOOL_RESULT_CHARS) : result;
1561
- ctx.client.event({
1562
- type: "tool_call_done",
1563
- toolCallId: callId,
1564
- result: truncatedResult
1565
- });
1566
- if (replyId !== null && replyId === ctx.reply.currentReplyId) {
1567
- ctx.reply.pendingTools.push({
1568
- callId,
1569
- result
1570
- });
1571
- if (ctx.maxHistory > 0 && ctx.reply.pendingTools.length > ctx.maxHistory) ctx.reply.pendingTools.shift();
1572
- }
1573
- }
1574
- async function handleToolCall(ctx, event) {
1575
- const { toolCallId: callId, toolName: name, args: parsedArgs } = event;
1576
- const replyId = ctx.reply.currentReplyId;
1577
- ctx.client.event(event);
1578
- const refused = ctx.consumeToolCallStep(name, replyId);
1579
- if (refused !== null) {
1580
- finishToolCall(ctx, callId, refused, replyId);
1581
- return;
1582
- }
1583
- ctx.log.info("S2S tool call", {
1584
- tool: name,
1585
- callId,
1586
- args: parsedArgs,
1587
- agent: ctx.agent
1588
- });
1589
- let result;
1590
- try {
1591
- result = await ctx.executeTool(name, parsedArgs, ctx.id, ctx.conversationMessages);
1592
- } catch (err) {
1593
- const msg = errorMessage(err);
1594
- ctx.log.error("Tool execution failed", {
1595
- tool: name,
1596
- error: errorDetail(err)
1597
2026
  });
1598
- result = toolError(msg);
1599
- }
1600
- ctx.log.info("S2S tool result", {
1601
- tool: name,
1602
- callId,
1603
- resultLength: result.length
1604
- });
1605
- finishToolCall(ctx, callId, result, replyId);
1606
- }
1607
- function handleUserTranscript(ctx, text) {
1608
- ctx.log.info("S2S user transcript", { text });
1609
- ctx.client.event({
1610
- type: "user_transcript",
1611
- text
1612
- });
1613
- ctx.pushMessages({
1614
- role: "user",
1615
- content: text
1616
- });
1617
- }
1618
- function handleAgentTranscript(ctx, text, interrupted) {
1619
- ctx.client.event({
1620
- type: "agent_transcript",
1621
- text
1622
- });
1623
- if (!interrupted) ctx.pushMessages({
1624
- role: "assistant",
1625
- content: text
1626
- });
1627
- }
1628
- function handleReplyCancelled(ctx) {
1629
- ctx.log.info("S2S reply interrupted (barge-in)");
1630
- ctx.cancelReply();
1631
- ctx.client.event({ type: "cancelled" });
1632
- }
1633
- /**
1634
- * Warn when the entry-to-emit time for a reply_done dispatch exceeds this.
1635
- * Tool-less sessions should be sub-millisecond; sessions with pending tools
1636
- * will legitimately spend time awaiting ctx.turnPromise. We log both (with
1637
- * `hadTurnPromise`) so event-loop starvation is distinguishable from
1638
- * genuine tool-call latency.
1639
- */
1640
- const REPLY_DONE_SLOW_THRESHOLD_MS = 50;
1641
- function handleReplyDone(ctx) {
1642
- const startMs = Date.now();
1643
- const doneReplyId = ctx.reply.currentReplyId;
1644
- if (doneReplyId === null) {
1645
- ctx.log.debug("Dropping duplicate reply.done (no active reply)");
1646
- return;
2027
+ handle.updateSession(opts.sessionConfig);
1647
2028
  }
1648
- const hadTurnPromise = ctx.turnPromise !== null;
1649
- const sendPending = () => {
1650
- if (ctx.reply.currentReplyId !== doneReplyId) {
1651
- ctx.reply.pendingTools = [];
1652
- return;
1653
- }
1654
- if (ctx.reply.pendingTools.length > 0) {
1655
- for (const tool of ctx.reply.pendingTools) ctx.s2s?.sendToolResult(tool.callId, tool.result);
1656
- ctx.reply.pendingTools = [];
1657
- } else {
1658
- const stepsUsed = ctx.reply.toolCallCount;
1659
- if (stepsUsed > 0) ctx.log.info("Turn complete", {
1660
- steps: stepsUsed,
1661
- agent: ctx.agent
1662
- });
1663
- ctx.client.playAudioDone();
1664
- ctx.client.event({ type: "reply_done" });
1665
- ctx.reply.currentReplyId = null;
1666
- const durationMs = Date.now() - startMs;
1667
- if (durationMs >= REPLY_DONE_SLOW_THRESHOLD_MS) ctx.log.warn("slow reply_done dispatch", {
1668
- sid: ctx.id,
1669
- agent: ctx.agent,
1670
- durationMs,
1671
- hadTurnPromise
1672
- });
1673
- }
1674
- };
1675
- if (hadTurnPromise) ctx.turnPromise?.then(sendPending);
1676
- else sendPending();
1677
- }
1678
- function setupListeners(ctx, handle) {
1679
- handle.on("ready", ({ sessionId }) => ctx.log.info("S2S session ready", { sessionId }));
1680
- handle.on("replyStarted", ({ replyId }) => {
1681
- ctx.beginReply(replyId);
1682
- });
1683
- handle.on("sessionExpired", () => {
1684
- ctx.log.info("S2S session expired");
1685
- handle.close();
1686
- });
1687
- handle.on("audio", ({ audio }) => ctx.client.playAudioChunk(audio));
1688
- handle.on("error", (err) => {
1689
- ctx.log.error("S2S error", { message: err.message });
1690
- ctx.client.event({
1691
- type: "error",
1692
- code: "internal",
1693
- message: err.message
1694
- });
1695
- handle.close();
1696
- });
1697
- handle.on("close", (code, reason) => {
1698
- const activeReplyId = ctx.reply.currentReplyId;
1699
- if (activeReplyId !== null) ctx.log.warn("S2S closed with active reply", {
1700
- sid: ctx.id,
1701
- agent: ctx.agent,
1702
- activeReplyId,
1703
- code,
1704
- reason
1705
- });
1706
- else ctx.log.info("S2S closed", {
1707
- code,
1708
- reason
1709
- });
1710
- ctx.s2s = null;
1711
- ctx.cancelReply();
1712
- });
1713
- handle.on("event", (event) => {
1714
- switch (event.type) {
1715
- case "user_transcript":
1716
- handleUserTranscript(ctx, event.text);
1717
- break;
1718
- case "agent_transcript":
1719
- handleAgentTranscript(ctx, event.text, event._interrupted ?? false);
1720
- break;
1721
- case "tool_call": {
1722
- const p = handleToolCall(ctx, event).catch((err) => {
1723
- ctx.log.error("Tool call handler failed", { err: errorMessage(err) });
1724
- });
1725
- ctx.chainTurn(p);
1726
- break;
1727
- }
1728
- case "reply_done":
1729
- handleReplyDone(ctx);
1730
- break;
1731
- case "cancelled":
1732
- handleReplyCancelled(ctx);
1733
- break;
1734
- default: ctx.client.event(event);
1735
- }
1736
- });
1737
- }
1738
- function createS2sSession(opts) {
1739
- const { id, agent, client, toolSchemas, apiKey, s2sConfig, executeTool, createWebSocket = defaultCreateS2sWebSocket, logger: log = consoleLogger } = opts;
1740
- const agentConfig = opts.skipGreeting ? {
1741
- ...opts.agentConfig,
1742
- greeting: ""
1743
- } : opts.agentConfig;
1744
- const systemPrompt = buildSystemPrompt(agentConfig, {
1745
- hasTools: toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0,
1746
- voice: true,
1747
- toolGuidance: opts.toolGuidance
1748
- });
1749
- const s2sTools = toolSchemas.map((ts) => ({
1750
- type: "function",
1751
- name: ts.name,
1752
- description: ts.description,
1753
- parameters: ts.parameters
1754
- }));
1755
- const sessionAbort = new AbortController();
1756
- const ctx = buildCtx({
1757
- id,
1758
- agent,
1759
- client,
1760
- agentConfig,
1761
- executeTool,
1762
- log,
1763
- maxHistory: opts.maxHistory
1764
- });
1765
- const rawTimeout = agentConfig.idleTimeoutMs ?? 3e5;
1766
- const idle = createIdleTimer({
1767
- timeoutMs: rawTimeout === 0 || !Number.isFinite(rawTimeout) ? 0 : rawTimeout,
1768
- agent,
1769
- log,
1770
- client,
1771
- ctx
1772
- });
1773
- let connectGeneration = 0;
1774
- const sessionUpdatePayload = {
1775
- systemPrompt,
1776
- tools: s2sTools,
1777
- ...agentConfig.greeting ? { greeting: agentConfig.greeting } : {}
1778
- };
1779
- async function connectAndSetup() {
1780
- const generation = ++connectGeneration;
1781
- try {
1782
- const handle = await _internals.connectS2s({
1783
- apiKey,
1784
- config: s2sConfig,
1785
- createWebSocket,
1786
- logger: log,
1787
- sid: id
1788
- });
1789
- if (sessionAbort.signal.aborted || generation !== connectGeneration) {
1790
- handle.close();
1791
- return;
1792
- }
1793
- setupListeners(ctx, handle);
1794
- handle.updateSession(sessionUpdatePayload);
1795
- ctx.s2s = handle;
1796
- idle.reset();
1797
- } catch (err) {
1798
- const msg = errorMessage(err);
1799
- log.error("S2S connect failed", { error: errorDetail(err) });
1800
- client.event({
1801
- type: "error",
1802
- code: "internal",
1803
- message: msg
1804
- });
1805
- }
2029
+ async function stop() {
2030
+ handle?.close();
2031
+ handle = null;
1806
2032
  }
1807
2033
  return {
1808
- async start() {
1809
- await connectAndSetup();
1810
- },
1811
- async stop() {
1812
- if (sessionAbort.signal.aborted) return;
1813
- sessionAbort.abort();
1814
- idle.clear();
1815
- if (ctx.turnPromise !== null) await ctx.turnPromise;
1816
- ctx.s2s?.close();
1817
- },
1818
- onAudio(data) {
1819
- idle.reset();
1820
- ctx.s2s?.sendAudio(data);
1821
- },
1822
- onAudioReady() {},
1823
- onCancel() {
1824
- client.event({ type: "cancelled" });
1825
- },
1826
- onReset() {
1827
- ctx.cancelReply();
1828
- ctx.conversationMessages = [];
1829
- ctx.reply.toolCallCount = 0;
1830
- ctx.turnPromise = null;
1831
- idle.clear();
1832
- ctx.s2s?.close();
1833
- client.event({ type: "reset" });
1834
- connectAndSetup().catch((err) => log.error("S2S reset reconnect failed", { error: errorMessage(err) }));
2034
+ start,
2035
+ stop,
2036
+ sendUserAudio(bytes) {
2037
+ handle?.sendAudio(bytes);
1835
2038
  },
1836
- onHistory(incoming) {
1837
- ctx.pushMessages(...incoming.map((m) => ({
1838
- role: m.role,
1839
- content: m.content
1840
- })));
2039
+ sendToolResult(callId, result) {
2040
+ handle?.sendToolResult(callId, result);
1841
2041
  },
1842
- waitForTurn() {
1843
- return ctx.turnPromise ?? Promise.resolve();
1844
- }
1845
- };
1846
- }
1847
- //#endregion
1848
- //#region host/tool-executor.ts
1849
- /**
1850
- * Tool execution — validates arguments and invokes tool handlers.
1851
- *
1852
- * {@link executeToolCall} is the single entry point used by both the
1853
- * direct (self-hosted) runtime and the platform sandbox sidecar.
1854
- */
1855
- const yieldTick = () => new Promise((r) => setTimeout(r, 0));
1856
- function buildToolContext(opts) {
1857
- const { env, state, kv, messages, sessionId } = opts;
1858
- return {
1859
- env,
1860
- state: state ?? {},
1861
- get kv() {
1862
- if (!kv) throw new Error("KV not available");
1863
- return kv;
2042
+ cancelReply() {
2043
+ currentReplyId = null;
1864
2044
  },
1865
- messages: messages ?? [],
1866
- sessionId: sessionId ?? "",
1867
- send(event, data) {
1868
- opts.send?.(event, data);
2045
+ updateSession(config) {
2046
+ handle?.updateSession({
2047
+ systemPrompt: config.systemPrompt,
2048
+ tools: config.tools ?? [],
2049
+ ...config.greeting !== void 0 ? { greeting: config.greeting } : {}
2050
+ });
1869
2051
  }
1870
2052
  };
1871
2053
  }
1872
- async function executeToolCall(name, args, options) {
1873
- const { tool } = options;
1874
- const parsed = (tool.parameters ?? EMPTY_PARAMS).safeParse(args);
1875
- if (!parsed.success) return toolError(`Invalid arguments for tool "${name}": ${(parsed.error?.issues ?? []).map((i) => `${i.path.map(String).join(".")}: ${i.message}`).join(", ")}`);
1876
- try {
1877
- const ctx = buildToolContext(options);
1878
- await yieldTick();
1879
- const result = await pTimeout(Promise.resolve(tool.execute(parsed.data, ctx)), {
1880
- milliseconds: TOOL_EXECUTION_TIMEOUT_MS,
1881
- message: `Tool "${name}" timed out after ${TOOL_EXECUTION_TIMEOUT_MS}ms`
1882
- });
1883
- await yieldTick();
1884
- if (result == null) return "null";
1885
- return typeof result === "string" ? result : JSON.stringify(result);
1886
- } catch (err) {
1887
- const log = options.logger;
1888
- if (log) log.warn("Tool execution failed", {
1889
- tool: name,
1890
- error: errorDetail(err)
1891
- });
1892
- else console.warn(`[tool-executor] Tool execution failed: ${name}`, err);
1893
- return toolError(errorMessage(err));
1894
- }
1895
- }
1896
2054
  //#endregion
1897
2055
  //#region host/unstorage-kv.ts
1898
2056
  /**
@@ -1944,20 +2102,20 @@ function createUnstorageKv(options) {
1944
2102
  *
1945
2103
  * Audio validation is handled at the host transport layer (see server.ts).
1946
2104
  */
2105
+ const AUDIO_DONE_FRAME = JSON.stringify({ type: "audio_done" });
1947
2106
  /**
1948
2107
  * Creates a {@link ClientSink} backed by a plain WebSocket.
1949
2108
  *
1950
- * Text events are sent as JSON text frames; audio chunks are sent as
1951
- * binary frames (zero-copy).
2109
+ * Session events are sent as JSON text frames; audio chunks are sent as raw
2110
+ * PCM16 binary frames.
1952
2111
  */
1953
2112
  function createClientSink(ws, log) {
1954
- /** Send data over ws, silently dropping if the socket is not open. */
1955
2113
  function safeSend(data) {
1956
2114
  try {
1957
2115
  if (ws.readyState !== 1) return;
1958
2116
  ws.send(data);
1959
2117
  } catch (err) {
1960
- log.debug?.("safeSend: socket closed between readyState check and send", { error: errorMessage(err) });
2118
+ log.debug?.("safeSend: socket closed between readyState check and send", { error: err instanceof Error ? err.message : String(err) });
1961
2119
  }
1962
2120
  }
1963
2121
  return {
@@ -1971,7 +2129,7 @@ function createClientSink(ws, log) {
1971
2129
  safeSend(chunk);
1972
2130
  },
1973
2131
  playAudioDone() {
1974
- safeSend(JSON.stringify({ type: "audio_done" }));
2132
+ safeSend(AUDIO_DONE_FRAME);
1975
2133
  }
1976
2134
  };
1977
2135
  }
@@ -1980,35 +2138,32 @@ function handleBinaryAudio(data, session) {
1980
2138
  session.onAudio(data);
1981
2139
  return true;
1982
2140
  }
1983
- if (data instanceof ArrayBuffer) {
1984
- session.onAudio(new Uint8Array(data));
1985
- return true;
1986
- }
1987
2141
  return false;
1988
2142
  }
1989
- function handleTextMessage(data, session, log, ctx, sid) {
1990
- if (typeof data !== "string") return;
1991
- let json;
2143
+ function handleTextMessage(data, session, log, sid) {
2144
+ if (typeof data !== "string") {
2145
+ log.warn("ws: non-string, non-binary frame received; dropping", { sid });
2146
+ return;
2147
+ }
2148
+ let parsed;
1992
2149
  try {
1993
- json = JSON.parse(data);
2150
+ parsed = JSON.parse(data);
1994
2151
  } catch {
1995
- log.warn("Invalid JSON from client", {
1996
- ...ctx,
1997
- sid
2152
+ log.warn("ws: invalid JSON; dropping", {
2153
+ sid,
2154
+ data: data.slice(0, 200)
1998
2155
  });
1999
2156
  return;
2000
2157
  }
2001
- const parsed = lenientParse(ClientMessageSchema, json);
2002
- if (!parsed.ok) {
2003
- if (parsed.malformed) log.warn("Invalid client message", {
2004
- ...ctx,
2158
+ const result = lenientParse(ClientMessageSchema, parsed);
2159
+ if (!result.ok) {
2160
+ if (result.malformed) log.warn("ws: malformed client message", {
2005
2161
  sid,
2006
- error: parsed.error
2162
+ error: result.error
2007
2163
  });
2008
2164
  return;
2009
2165
  }
2010
- const msg = parsed.data;
2011
- switch (msg.type) {
2166
+ switch (result.data.type) {
2012
2167
  case "audio_ready":
2013
2168
  session.onAudioReady();
2014
2169
  break;
@@ -2019,19 +2174,19 @@ function handleTextMessage(data, session, log, ctx, sid) {
2019
2174
  session.onReset();
2020
2175
  break;
2021
2176
  case "history":
2022
- session.onHistory(msg.messages);
2177
+ session.onHistory(result.data.messages);
2023
2178
  break;
2024
2179
  default: break;
2025
2180
  }
2026
2181
  }
2027
2182
  /**
2028
- * Attaches session lifecycle handlers to a native WebSocket using
2029
- * plain JSON text frames and binary audio frames.
2183
+ * Attaches session lifecycle handlers to a native WebSocket using JSON text
2184
+ * frames for control messages and raw PCM16 binary frames for audio.
2030
2185
  *
2031
2186
  * Connection flow:
2032
- * 1. WebSocket opens → server sends `{ type: "config", ...ReadyConfig }`
2033
- * 2. Client sets up audio → sends `{ type: "audio_ready" }`
2034
- * 3. If reconnecting → client sends `{ type: "history", messages: [...] }`
2187
+ * 1. WebSocket opens → server sends JSON CONFIG frame with sampleRate, ttsSampleRate, sessionId
2188
+ * 2. Client sets up audio → sends JSON AUDIO_READY frame
2189
+ * 3. If reconnecting → client sends JSON HISTORY frame with prior messages
2035
2190
  */
2036
2191
  function wireSessionSocket(ws, opts) {
2037
2192
  const { sessions, logger: log = consoleLogger } = opts;
@@ -2041,7 +2196,7 @@ function wireSessionSocket(ws, opts) {
2041
2196
  let session = null;
2042
2197
  /** Set to true once session.start() resolves. Messages arriving before
2043
2198
  * this flag is set are buffered and replayed once the session is ready,
2044
- * preventing audio/text from being dispatched to a half-initialized session. */
2199
+ * preventing audio/frames from being dispatched to a half-initialized session. */
2045
2200
  let sessionReady = false;
2046
2201
  let messageBuffer = [];
2047
2202
  function drainBuffer() {
@@ -2049,9 +2204,8 @@ function wireSessionSocket(ws, opts) {
2049
2204
  const buf = messageBuffer;
2050
2205
  messageBuffer = null;
2051
2206
  for (const event of buf) {
2052
- const { data } = event;
2053
- if (handleBinaryAudio(data, session)) continue;
2054
- handleTextMessage(data, session, log, ctx, sid);
2207
+ if (handleBinaryAudio(event.data, session)) continue;
2208
+ handleTextMessage(event.data, session, log, sid);
2055
2209
  }
2056
2210
  }
2057
2211
  function onOpen() {
@@ -2066,7 +2220,9 @@ function wireSessionSocket(ws, opts) {
2066
2220
  opts.onSinkCreated?.(sessionId, client);
2067
2221
  ws.send(JSON.stringify({
2068
2222
  type: "config",
2069
- ...opts.readyConfig,
2223
+ audioFormat: opts.readyConfig.audioFormat,
2224
+ sampleRate: opts.readyConfig.sampleRate,
2225
+ ttsSampleRate: opts.readyConfig.ttsSampleRate,
2070
2226
  sessionId
2071
2227
  }));
2072
2228
  const timeoutMs = opts.sessionStartTimeoutMs ?? 1e4;
@@ -2099,9 +2255,8 @@ function wireSessionSocket(ws, opts) {
2099
2255
  if (messageBuffer && messageBuffer.length < 100) messageBuffer.push(event);
2100
2256
  return;
2101
2257
  }
2102
- const { data } = event;
2103
- if (handleBinaryAudio(data, session)) return;
2104
- handleTextMessage(data, session, log, ctx, sid);
2258
+ if (handleBinaryAudio(event.data, session)) return;
2259
+ handleTextMessage(event.data, session, log, sid);
2105
2260
  });
2106
2261
  ws.addEventListener("close", () => {
2107
2262
  log.info("Session disconnected", {
@@ -2132,6 +2287,30 @@ function wireSessionSocket(ws, opts) {
2132
2287
  //#endregion
2133
2288
  //#region host/runtime.ts
2134
2289
  /**
2290
+ * Resolve the API key env-var for the configured STT provider.
2291
+ *
2292
+ * Each STT provider uses its own env var (e.g. `ASSEMBLYAI_API_KEY`,
2293
+ * `DEEPGRAM_API_KEY`). We read the kind from the descriptor if it is one;
2294
+ * pre-resolved openers have no kind field so we fall back to AssemblyAI for
2295
+ * backward compatibility (openers supply their own key at open-time anyway).
2296
+ */
2297
+ function resolveSttApiKey(stt, env) {
2298
+ if ((stt != null && "kind" in stt && typeof stt.kind === "string" ? stt.kind : void 0) === "deepgram") return resolveApiKey("DEEPGRAM_API_KEY", env);
2299
+ return resolveApiKey("ASSEMBLYAI_API_KEY", env);
2300
+ }
2301
+ /**
2302
+ * Resolve the API key env-var for the configured TTS provider.
2303
+ *
2304
+ * Each TTS provider uses its own env var (e.g. `CARTESIA_API_KEY`,
2305
+ * `RIME_API_KEY`). We read the kind from the descriptor if it is one;
2306
+ * pre-resolved openers have no kind field so we fall back to Cartesia for
2307
+ * backward compatibility (openers supply their own key at open-time anyway).
2308
+ */
2309
+ function resolveTtsApiKey(tts, env) {
2310
+ if ((tts != null && "kind" in tts && typeof tts.kind === "string" ? tts.kind : void 0) === "rime") return resolveApiKey("RIME_API_KEY", env);
2311
+ return resolveApiKey("CARTESIA_API_KEY", env);
2312
+ }
2313
+ /**
2135
2314
  * Distinguish a descriptor (`{ kind, options }`) from an already-resolved
2136
2315
  * opener / `LanguageModel`. The production path always passes descriptors;
2137
2316
  * openers are a test escape hatch (fakes in `_pipeline-test-fakes.ts`).
@@ -2236,40 +2415,86 @@ function createRuntime(opts) {
2236
2415
  } : null;
2237
2416
  function createSession(sessionOpts) {
2238
2417
  sinkMap.set(sessionOpts.id, sessionOpts.client);
2239
- if (pipelineProviders) return createPipelineSession({
2240
- id: sessionOpts.id,
2418
+ const isPipeline = Boolean(pipelineProviders);
2419
+ const systemPrompt = buildSystemPrompt(agentConfig, {
2420
+ hasTools: toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0,
2421
+ voice: true,
2422
+ toolGuidance
2423
+ });
2424
+ let core = null;
2425
+ function bindCore() {
2426
+ if (!core) throw new Error("SessionCore not yet created");
2427
+ return core;
2428
+ }
2429
+ const callbacks = {
2430
+ onReplyStarted: (replyId) => bindCore().onReplyStarted(replyId),
2431
+ onReplyDone: () => bindCore().onReplyDone(),
2432
+ onCancelled: () => bindCore().onCancelled(),
2433
+ onAudioChunk: (bytes) => bindCore().onAudioChunk(bytes),
2434
+ onAudioDone: () => bindCore().onAudioDone(),
2435
+ onUserTranscript: (text) => bindCore().onUserTranscript(text),
2436
+ onAgentTranscript: (text, interrupted) => bindCore().onAgentTranscript(text, interrupted),
2437
+ onToolCall: isPipeline ? (id, name, args) => sessionOpts.client.event({
2438
+ type: "tool_call",
2439
+ toolCallId: id,
2440
+ toolName: name,
2441
+ args
2442
+ }) : (id, name, args) => bindCore().onToolCall(id, name, args),
2443
+ onError: (code, message) => bindCore().onError(code, message),
2444
+ onSpeechStarted: () => bindCore().onSpeechStarted(),
2445
+ onSpeechStopped: () => bindCore().onSpeechStopped()
2446
+ };
2447
+ let transport;
2448
+ if (pipelineProviders) transport = createPipelineTransport({
2449
+ sid: sessionOpts.id,
2241
2450
  agent: sessionOpts.agent,
2242
- client: sessionOpts.client,
2243
- agentConfig,
2244
- toolSchemas,
2245
- toolGuidance,
2246
- executeTool,
2247
2451
  stt: pipelineProviders.stt,
2248
2452
  llm: pipelineProviders.llm,
2249
2453
  tts: pipelineProviders.tts,
2250
- sttApiKey: resolveApiKey("ASSEMBLYAI_API_KEY", env),
2251
- ttsApiKey: resolveApiKey("CARTESIA_API_KEY", env),
2454
+ callbacks,
2455
+ sessionConfig: {
2456
+ systemPrompt,
2457
+ greeting: agentConfig.greeting,
2458
+ tools: toolSchemas
2459
+ },
2460
+ toolSchemas,
2461
+ executeTool,
2462
+ providerKeys: {
2463
+ stt: resolveSttApiKey(opts.stt, env),
2464
+ tts: resolveTtsApiKey(opts.tts, env)
2465
+ },
2252
2466
  sttSampleRate: s2sConfig.inputSampleRate,
2253
2467
  ttsSampleRate: s2sConfig.outputSampleRate,
2468
+ maxSteps: agentConfig.maxSteps,
2469
+ toolChoice: agentConfig.toolChoice,
2254
2470
  skipGreeting: sessionOpts.skipGreeting ?? false,
2255
2471
  logger
2256
2472
  });
2257
- const apiKey = env.ASSEMBLYAI_API_KEY ?? "";
2258
- return createS2sSession({
2473
+ else transport = createS2sTransport({
2474
+ apiKey: env.ASSEMBLYAI_API_KEY ?? "",
2475
+ s2sConfig,
2476
+ sessionConfig: {
2477
+ systemPrompt,
2478
+ tools: toolSchemas,
2479
+ ...agentConfig.greeting !== void 0 ? { greeting: agentConfig.greeting } : {}
2480
+ },
2481
+ toolSchemas,
2482
+ callbacks,
2483
+ sid: sessionOpts.id,
2484
+ agent: sessionOpts.agent,
2485
+ ...createWebSocket ? { createWebSocket } : {},
2486
+ logger
2487
+ });
2488
+ core = createSessionCore({
2259
2489
  id: sessionOpts.id,
2260
2490
  agent: sessionOpts.agent,
2261
2491
  client: sessionOpts.client,
2262
2492
  agentConfig,
2263
- toolSchemas,
2264
- toolGuidance,
2265
- apiKey,
2266
- s2sConfig,
2267
2493
  executeTool,
2268
- ...createWebSocket ? { createWebSocket } : {},
2269
- skipGreeting: sessionOpts.skipGreeting ?? false,
2270
- logger,
2271
- ...sessionOpts.resumeFrom ? { resumeFrom: sessionOpts.resumeFrom } : {}
2494
+ transport,
2495
+ logger
2272
2496
  });
2497
+ return core;
2273
2498
  }
2274
2499
  function startSession(ws, startOpts) {
2275
2500
  const resumeFrom = startOpts?.resumeFrom;
@@ -2454,4 +2679,4 @@ function createServer(options) {
2454
2679
  };
2455
2680
  }
2456
2681
  //#endregion
2457
- export { DEFAULT_S2S_CONFIG, _buildBaseCtx, _internals, buildCtx, consoleLogger, createRuntime, createS2sSession, createServer, createUnstorageKv, executeInIsolate, executeToolCall, jsonLogger, resolveAllBuiltins, wireSessionSocket };
2682
+ export { DEFAULT_S2S_CONFIG, _internals, consoleLogger, createPipelineTransport, createRuntime, createS2sTransport, createServer, createSessionCore, createUnstorageKv, executeInIsolate, executeToolCall, jsonLogger, resolveAllBuiltins, wireSessionSocket };