@alexkroman1/aai 1.4.5 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. package/.turbo/turbo-build.log +9 -9
  2. package/CHANGELOG.md +13 -0
  3. package/dist/assemblyai-C969QGi4.js +35 -0
  4. package/dist/cartesia-BfQPOQ7Y.js +37 -0
  5. package/dist/host/_pipeline-test-fakes.d.ts +3 -1
  6. package/dist/host/providers/stt/deepgram.d.ts +28 -0
  7. package/dist/host/providers/tts/cartesia.d.ts +1 -1
  8. package/dist/host/providers/tts/rime.d.ts +44 -0
  9. package/dist/host/runtime-barrel.d.ts +4 -2
  10. package/dist/host/runtime-barrel.js +1432 -1208
  11. package/dist/host/runtime.d.ts +2 -2
  12. package/dist/host/s2s.d.ts +16 -16
  13. package/dist/host/session-core.d.ts +37 -0
  14. package/dist/host/transports/pipeline-transport.d.ts +48 -0
  15. package/dist/host/transports/s2s-transport.d.ts +19 -0
  16. package/dist/host/transports/types.d.ts +45 -0
  17. package/dist/host/ws-handler.d.ts +14 -10
  18. package/dist/sdk/protocol.d.ts +6 -5
  19. package/dist/sdk/providers/llm-barrel.js +1 -1
  20. package/dist/sdk/providers/stt/deepgram.d.ts +35 -0
  21. package/dist/sdk/providers/stt-barrel.d.ts +1 -0
  22. package/dist/sdk/providers/stt-barrel.js +2 -2
  23. package/dist/sdk/providers/tts/cartesia.d.ts +12 -4
  24. package/dist/sdk/providers/tts/rime.d.ts +42 -0
  25. package/dist/sdk/providers/tts-barrel.d.ts +1 -0
  26. package/dist/sdk/providers/tts-barrel.js +2 -2
  27. package/host/_pipeline-test-fakes.ts +6 -3
  28. package/host/_test-utils.ts +209 -128
  29. package/host/cleanup.test.ts +25 -298
  30. package/host/integration/pipeline-reference.integration.test.ts +30 -35
  31. package/host/providers/resolve.ts +10 -2
  32. package/host/providers/stt/deepgram.test.ts +229 -0
  33. package/host/providers/stt/deepgram.ts +172 -0
  34. package/host/providers/tts/cartesia.ts +7 -3
  35. package/host/providers/tts/rime.test.ts +251 -0
  36. package/host/providers/tts/rime.ts +322 -0
  37. package/host/runtime-barrel.ts +4 -2
  38. package/host/runtime.test.ts +13 -46
  39. package/host/runtime.ts +131 -23
  40. package/host/s2s.test.ts +122 -131
  41. package/host/s2s.ts +44 -52
  42. package/host/session-core.test.ts +257 -0
  43. package/host/session-core.ts +262 -0
  44. package/host/transports/pipeline-transport.test.ts +651 -0
  45. package/host/transports/pipeline-transport.ts +532 -0
  46. package/host/{fixture-replay.test.ts → transports/s2s-transport-fixtures.test.ts} +76 -106
  47. package/host/transports/s2s-transport.test.ts +56 -0
  48. package/host/transports/s2s-transport.ts +116 -0
  49. package/host/transports/types.test.ts +22 -0
  50. package/host/transports/types.ts +51 -0
  51. package/host/ws-handler.test.ts +324 -242
  52. package/host/ws-handler.ts +56 -59
  53. package/package.json +2 -1
  54. package/sdk/__snapshots__/exports.test.ts.snap +3 -3
  55. package/sdk/protocol-compat.test.ts +8 -0
  56. package/sdk/protocol.ts +6 -5
  57. package/sdk/providers/stt/deepgram.ts +43 -0
  58. package/sdk/providers/stt-barrel.ts +2 -0
  59. package/sdk/providers/tts/cartesia.ts +15 -5
  60. package/sdk/providers/tts/rime.ts +52 -0
  61. package/sdk/providers/tts-barrel.ts +2 -0
  62. package/dist/assemblyai-Cxg9eobY.js +0 -18
  63. package/dist/cartesia-DwDk2tEu.js +0 -10
  64. package/dist/host/pipeline-session-ctx.d.ts +0 -24
  65. package/dist/host/pipeline-session.d.ts +0 -52
  66. package/dist/host/session-ctx.d.ts +0 -73
  67. package/dist/host/session.d.ts +0 -62
  68. package/host/pipeline-session-ctx.test.ts +0 -31
  69. package/host/pipeline-session-ctx.ts +0 -36
  70. package/host/pipeline-session.test.ts +0 -672
  71. package/host/pipeline-session.ts +0 -533
  72. package/host/s2s-fixtures.test.ts +0 -237
  73. package/host/session-ctx.test.ts +0 -387
  74. package/host/session-ctx.ts +0 -134
  75. package/host/session-fixture-replay.test.ts +0 -128
  76. package/host/session.test.ts +0 -634
  77. package/host/session.ts +0 -412
  78. /package/dist/{anthropic-BrUCPKUc.js → anthropic-CcLZygAr.js} +0 -0
@@ -1,23 +1,24 @@
1
1
  import { r as DEFAULT_SYSTEM_PROMPT } from "../types-KUgezM6u.js";
2
- import { _ as TOOL_EXECUTION_TIMEOUT_MS, a as DEFAULT_SHUTDOWN_TIMEOUT_MS, c as FETCH_TIMEOUT_MS, d as MAX_PAGE_CHARS, f as MAX_TOOL_RESULT_CHARS, g as RUN_CODE_TIMEOUT_MS, h as PIPELINE_FLUSH_TIMEOUT_MS, l as MAX_HTML_BYTES, m as MAX_WS_PAYLOAD_BYTES, o as DEFAULT_STT_SAMPLE_RATE, p as MAX_VALUE_SIZE, s as DEFAULT_TTS_SAMPLE_RATE, t as AGENT_CSP } from "../constants-C2nirZUI.js";
2
+ import { _ as TOOL_EXECUTION_TIMEOUT_MS, a as DEFAULT_SHUTDOWN_TIMEOUT_MS, c as FETCH_TIMEOUT_MS, d as MAX_PAGE_CHARS, g as RUN_CODE_TIMEOUT_MS, h as PIPELINE_FLUSH_TIMEOUT_MS, l as MAX_HTML_BYTES, m as MAX_WS_PAYLOAD_BYTES, o as DEFAULT_STT_SAMPLE_RATE, p as MAX_VALUE_SIZE, s as DEFAULT_TTS_SAMPLE_RATE, t as AGENT_CSP } from "../constants-C2nirZUI.js";
3
3
  import { i as toolError, n as errorDetail, r as errorMessage, t as parseWsUpgradeParams } from "../ws-upgrade-BeOQ7fXL.js";
4
4
  import { ClientMessageSchema, buildReadyConfig, lenientParse } from "../sdk/protocol.js";
5
5
  import { a as toAgentConfig, c as makeSttError, i as agentToolsToSchemas, l as makeTtsError, n as EMPTY_PARAMS, s as assertProviderTriple } from "../_internal-types-3p3OJZPb.js";
6
- import { t as ANTHROPIC_KIND } from "../anthropic-BrUCPKUc.js";
7
- import { t as ASSEMBLYAI_KIND } from "../assemblyai-Cxg9eobY.js";
8
- import { t as CARTESIA_KIND } from "../cartesia-DwDk2tEu.js";
6
+ import { r as DEEPGRAM_KIND, t as ASSEMBLYAI_KIND } from "../assemblyai-C969QGi4.js";
7
+ import { a as RIME_KIND, n as CARTESIA_KIND } from "../cartesia-BfQPOQ7Y.js";
8
+ import { t as ANTHROPIC_KIND } from "../anthropic-CcLZygAr.js";
9
9
  import { z } from "zod";
10
10
  import { convert } from "html-to-text";
11
11
  import vm from "node:vm";
12
12
  import pTimeout from "p-timeout";
13
13
  import { createStorage, prefixStorage } from "unstorage";
14
- import { jsonSchema, stepCountIs, streamText, tool } from "ai";
15
14
  import { createAnthropic } from "@ai-sdk/anthropic";
16
15
  import { AssemblyAI } from "assemblyai";
17
16
  import { createNanoEvents } from "nanoevents";
17
+ import { DeepgramClient } from "@deepgram/sdk";
18
18
  import { randomUUID } from "node:crypto";
19
19
  import { Cartesia } from "@cartesia/cartesia-js";
20
20
  import WsWebSocket, { WebSocketServer } from "ws";
21
+ import { jsonSchema, stepCountIs, streamText, tool } from "ai";
21
22
  import fs from "node:fs";
22
23
  import http from "node:http";
23
24
  import path from "node:path";
@@ -378,712 +379,236 @@ function buildSystemPrompt(config, opts) {
378
379
  return DEFAULT_SYSTEM_PROMPT + `\n\nToday's date is ${getFormattedDate()}.` + agentInstructions + toolPreamble + guidance + (opts.voice ? VOICE_RULES : "");
379
380
  }
380
381
  //#endregion
381
- //#region host/session-ctx.ts
382
- function _buildBaseCtx(opts) {
383
- const { agentConfig, log } = opts;
384
- const maxHistory = opts.maxHistory ?? 200;
385
- const ctx = {
386
- ...opts,
387
- reply: {
388
- pendingTools: [],
389
- toolCallCount: 0,
390
- currentReplyId: null
391
- },
392
- turnPromise: null,
393
- conversationMessages: [],
394
- maxHistory,
395
- consumeToolCallStep(_name, replyId) {
396
- if (replyId === null || replyId !== ctx.reply.currentReplyId) return toolError("Reply was interrupted. Discarding stale tool call.");
397
- const maxSteps = agentConfig.maxSteps;
398
- ctx.reply.toolCallCount++;
399
- if (maxSteps !== void 0 && ctx.reply.toolCallCount > maxSteps) {
400
- log.info("maxSteps exceeded, refusing tool call", {
401
- toolCallCount: ctx.reply.toolCallCount,
402
- maxSteps
403
- });
404
- return toolError("Maximum tool steps reached. Please respond to the user now.");
382
+ //#region host/providers/stt/assemblyai.ts
383
+ /**
384
+ * AssemblyAI Universal-Streaming STT opener (host-only).
385
+ *
386
+ * The user-facing descriptor factory (`assemblyAI(...)`) lives in
387
+ * `sdk/providers/stt/assemblyai.ts`. This module is the host-side
388
+ * counterpart: it takes the descriptor options + an API key and
389
+ * returns an {@link SttOpener} that the pipeline session drives.
390
+ *
391
+ * Default model: `"u3pro-rt"` (Universal-3 Pro Real-Time). The adapter
392
+ * maps that to the SDK's `"u3-rt-pro"` `speechModel` value; any other
393
+ * string is forwarded verbatim.
394
+ */
395
+ /** Translate the descriptor's model alias to the SDK's `speechModel` value. */
396
+ function resolveSpeechModel(model) {
397
+ if (model === "u3pro-rt") return "u3-rt-pro";
398
+ return model;
399
+ }
400
+ /** Build an {@link SttOpener} from resolved AssemblyAI descriptor options. */
401
+ function openAssemblyAI(opts = {}) {
402
+ return {
403
+ name: "assemblyai",
404
+ async open(openOpts) {
405
+ const apiKey = openOpts.apiKey || process.env.ASSEMBLYAI_API_KEY;
406
+ if (!apiKey) throw makeSttError("stt_auth_failed", "AssemblyAI STT: missing API key. Set ASSEMBLYAI_API_KEY in the agent env.");
407
+ const client = new AssemblyAI({ apiKey });
408
+ const speechModel = resolveSpeechModel(opts.model ?? "u3pro-rt");
409
+ const transcriber = client.streaming.transcriber({
410
+ sampleRate: openOpts.sampleRate,
411
+ speechModel,
412
+ ...openOpts.sttPrompt ? { prompt: openOpts.sttPrompt } : {}
413
+ });
414
+ const emitter = createNanoEvents();
415
+ let closed = false;
416
+ transcriber.on("turn", (event) => {
417
+ if (closed) return;
418
+ const text = event.transcript ?? "";
419
+ if (event.end_of_turn) {
420
+ if (text.length > 0) emitter.emit("final", text);
421
+ } else if (text.length > 0) emitter.emit("partial", text);
422
+ });
423
+ transcriber.on("error", (err) => {
424
+ if (closed) return;
425
+ emitter.emit("error", makeSttError("stt_stream_error", err?.message ?? String(err)));
426
+ });
427
+ transcriber.on("close", (code) => {
428
+ if (closed) return;
429
+ if (code !== 1e3) emitter.emit("error", makeSttError("stt_stream_error", `socket closed ${code}`));
430
+ });
431
+ try {
432
+ await transcriber.connect();
433
+ } catch (cause) {
434
+ throw makeSttError("stt_connect_failed", `AssemblyAI STT: connect failed: ${cause instanceof Error ? cause.message : String(cause)}`);
405
435
  }
406
- return null;
407
- },
408
- pushMessages(...msgs) {
409
- ctx.conversationMessages.push(...msgs);
410
- if (maxHistory > 0 && ctx.conversationMessages.length > maxHistory) ctx.conversationMessages.splice(0, ctx.conversationMessages.length - maxHistory);
411
- },
412
- beginReply(replyId) {
413
- ctx.reply = {
414
- pendingTools: [],
415
- toolCallCount: 0,
416
- currentReplyId: replyId
436
+ const close = async () => {
437
+ if (closed) return;
438
+ closed = true;
439
+ try {
440
+ await transcriber.close();
441
+ } catch {}
417
442
  };
418
- ctx.turnPromise = null;
419
- },
420
- cancelReply() {
421
- ctx.reply = {
422
- pendingTools: [],
423
- toolCallCount: 0,
424
- currentReplyId: null
443
+ if (openOpts.signal.aborted) close();
444
+ else openOpts.signal.addEventListener("abort", () => void close(), { once: true });
445
+ return {
446
+ sendAudio(pcm) {
447
+ if (closed) return;
448
+ const copy = new Uint8Array(pcm.byteLength);
449
+ copy.set(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
450
+ transcriber.sendAudio(copy.buffer);
451
+ },
452
+ on(event, fn) {
453
+ return emitter.on(event, fn);
454
+ },
455
+ close,
456
+ _transcriber: transcriber
425
457
  };
426
- },
427
- chainTurn(p) {
428
- ctx.turnPromise = (ctx.turnPromise ?? Promise.resolve()).then(() => p);
429
458
  }
430
459
  };
431
- return ctx;
432
- }
433
- function buildCtx(opts) {
434
- const base = _buildBaseCtx(opts);
435
- base.s2s = null;
436
- return base;
437
- }
438
- //#endregion
439
- //#region host/pipeline-session-ctx.ts
440
- function buildPipelineCtx(opts) {
441
- const base = _buildBaseCtx(opts);
442
- base.stt = null;
443
- base.tts = null;
444
- return base;
445
460
  }
446
461
  //#endregion
447
- //#region host/runtime-config.ts
462
+ //#region host/providers/stt/deepgram.ts
448
463
  /**
449
- * Runtime dependencies injected into the session pipeline.
464
+ * Deepgram Nova streaming STT opener (host-only).
450
465
  *
451
- * Defines the {@link Logger} interface, a default {@link consoleLogger},
452
- * and the {@link S2SConfig} for Speech-to-Speech endpoint configuration.
466
+ * The user-facing descriptor factory (`deepgram(...)`) lives in
467
+ * `sdk/providers/stt/deepgram.ts`. This module is the host-side
468
+ * counterpart: it takes the descriptor options + an API key and
469
+ * returns an {@link SttOpener} that the pipeline session drives.
470
+ *
471
+ * Default model: `"nova-3"`. Any string is forwarded verbatim to the SDK.
472
+ *
473
+ * This adapter targets the Deepgram SDK v5 (`@deepgram/sdk@^5`). The v5
474
+ * streaming API is:
475
+ * `client.listen.v1.connect(args)` → `Promise<V1Socket>`
476
+ * followed by:
477
+ * `socket.connect()` + `socket.waitForOpen()` to establish the connection.
453
478
  */
454
- function consoleLog(fn) {
455
- return (msg, ctx) => ctx ? fn(msg, ctx) : fn(msg);
456
- }
457
- /** Default console-backed logger. */
458
- const consoleLogger = {
459
- info: consoleLog(console.log),
460
- warn: consoleLog(console.warn),
461
- error: consoleLog(console.error),
462
- debug: consoleLog(console.debug)
463
- };
464
479
  /**
465
- * Structured JSON logger for production diagnostics. Each log entry is a
466
- * single-line JSON object with `timestamp`, `level`, `msg`, and any
467
- * caller-provided context fields.
480
+ * Handle an incoming Deepgram transcript message, emitting `partial` or
481
+ * `final` events on the emitter. Empty transcripts are silently dropped.
468
482
  */
469
- function jsonLog(level) {
470
- return (msg, ctx) => {
471
- const entry = {
472
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
473
- level,
474
- msg
475
- };
476
- if (ctx) Object.assign(entry, ctx);
477
- (level === "error" || level === "warn" ? process.stderr : process.stdout).write(`${JSON.stringify(entry)}\n`);
483
+ function handleMessage(data, closed, emitter) {
484
+ if (closed) return;
485
+ if (data.type !== "Results") return;
486
+ const result = data;
487
+ const text = result.channel?.alternatives?.[0]?.transcript ?? "";
488
+ if (result.is_final) {
489
+ if (text.length > 0) emitter.emit("final", text);
490
+ } else if (text.length > 0) emitter.emit("partial", text);
491
+ }
492
+ /** Wire Deepgram socket events onto the nanoevents emitter. */
493
+ function wireSocketEvents(connection, emitter, getIsClosed) {
494
+ connection.on("message", (data) => handleMessage(data, getIsClosed(), emitter));
495
+ connection.on("error", (err) => {
496
+ if (getIsClosed()) return;
497
+ emitter.emit("error", makeSttError("stt_stream_error", err?.message ?? String(err)));
498
+ });
499
+ connection.on("close", (event) => {
500
+ if (getIsClosed()) return;
501
+ const code = event?.code;
502
+ if (code !== void 0 && code !== 1e3) emitter.emit("error", makeSttError("stt_stream_error", `socket closed ${code}`));
503
+ });
504
+ }
505
+ /** Wire the AbortSignal to the close function. */
506
+ function wireAbortSignal(signal, close) {
507
+ if (signal.aborted) close();
508
+ else signal.addEventListener("abort", () => void close(), { once: true });
509
+ }
510
+ /** Build an {@link SttOpener} from resolved Deepgram descriptor options. */
511
+ function openDeepgram(opts = {}) {
512
+ return {
513
+ name: "deepgram",
514
+ async open(openOpts) {
515
+ const apiKey = openOpts.apiKey || process.env.DEEPGRAM_API_KEY;
516
+ if (!apiKey) throw makeSttError("stt_auth_failed", "Deepgram STT: missing API key. Set DEEPGRAM_API_KEY in the agent env.");
517
+ const client = new DeepgramClient({ apiKey });
518
+ let connection;
519
+ try {
520
+ connection = await client.listen.v1.connect({
521
+ model: opts.model ?? "nova-3",
522
+ language: opts.language ?? "en",
523
+ encoding: "linear16",
524
+ sample_rate: openOpts.sampleRate,
525
+ channels: 1,
526
+ interim_results: "true",
527
+ smart_format: "true",
528
+ endpointing: 300,
529
+ utterance_end_ms: "1000",
530
+ Authorization: apiKey
531
+ });
532
+ } catch (cause) {
533
+ throw makeSttError("stt_connect_failed", `Deepgram STT: connect failed: ${cause instanceof Error ? cause.message : String(cause)}`);
534
+ }
535
+ const emitter = createNanoEvents();
536
+ let closed = false;
537
+ wireSocketEvents(connection, emitter, () => closed);
538
+ connection.connect();
539
+ try {
540
+ await connection.waitForOpen();
541
+ } catch (cause) {
542
+ throw makeSttError("stt_connect_failed", `Deepgram STT: WebSocket open failed: ${cause instanceof Error ? cause.message : String(cause)}`);
543
+ }
544
+ const close = async () => {
545
+ if (closed) return;
546
+ closed = true;
547
+ try {
548
+ connection.close();
549
+ } catch {}
550
+ };
551
+ wireAbortSignal(openOpts.signal, close);
552
+ return {
553
+ sendAudio(pcm) {
554
+ if (closed) return;
555
+ connection.sendMedia(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
556
+ },
557
+ on(event, fn) {
558
+ return emitter.on(event, fn);
559
+ },
560
+ close,
561
+ _connection: connection
562
+ };
563
+ }
478
564
  };
479
565
  }
480
- const jsonLogger = {
481
- info: jsonLog("info"),
482
- warn: jsonLog("warn"),
483
- error: jsonLog("error"),
484
- debug: jsonLog("debug")
485
- };
486
- /** Default S2S endpoint configuration. */
487
- const DEFAULT_S2S_CONFIG = {
488
- wssUrl: "wss://agents.assemblyai.com/v1/voice",
489
- inputSampleRate: DEFAULT_STT_SAMPLE_RATE,
490
- outputSampleRate: DEFAULT_TTS_SAMPLE_RATE
491
- };
492
566
  //#endregion
493
- //#region host/to-vercel-tools.ts
567
+ //#region host/providers/tts/cartesia.ts
494
568
  /**
495
- * Converts agent {@link ToolSchema}[] to Vercel AI SDK tools with `execute`
496
- * delegation to the agent's {@link ExecuteTool} function.
569
+ * Cartesia TTS opener (host-only).
497
570
  *
498
- * The pipeline orchestrator passes the output to `streamText({ tools })`.
499
- * Each produced tool's `execute` closure calls
500
- * `ctx.executeTool(name, args, sessionId, messages(), { signal, toolCallId })`,
501
- * so the existing agent tool infrastructure (argument validation, KV, hooks,
502
- * timeout) remains the single source of truth for tool behavior.
571
+ * The user-facing descriptor factory (`cartesia(...)`) lives in
572
+ * `sdk/providers/tts/cartesia.ts`. This module is the host-side
573
+ * counterpart: it takes the descriptor options + an API key and
574
+ * returns a {@link TtsOpener} that the pipeline session drives.
503
575
  *
504
- * Per-call `options.abortSignal` (forwarded by `streamText` when the
505
- * outer turn is aborted, e.g. barge-in) takes precedence over the
506
- * bag-level `ctx.signal` so individual invocations respect streamText
507
- * aborts.
508
- */
509
- /**
510
- * Convert an array of {@link ToolSchema} to a Vercel AI SDK `ToolSet`
511
- * (record keyed by tool name).
576
+ * Wraps `@cartesia/cartesia-js`'s `TTSWS` / `TTSWSContext` and normalizes it
577
+ * onto the {@link TtsEvents} contract consumed by the pipeline orchestrator.
512
578
  *
513
- * Uses the v6 `tool()` helper with `inputSchema: jsonSchema(...)` wrapping
514
- * the agent's JSON Schema `parameters`. Execution is delegated to
515
- * `ctx.executeTool` so validation, KV, timeouts, and hooks keep working.
579
+ * **Per-turn context lifecycle.** Each `sendText(...)` within the same turn
580
+ * appends to the same Cartesia context. On `flush()` or `cancel()`, a new
581
+ * context is minted for the next turn so concurrent `cancel({ contextId })`
582
+ * only targets the in-flight turn, never the one that follows.
583
+ *
584
+ * **Audio format.** The adapter requests `raw` / `pcm_s16le` at the
585
+ * negotiated `sampleRate` so it can forward chunks as `Int16Array` with no
586
+ * conversion.
516
587
  */
517
- function toVercelTools(schemas, ctx) {
518
- const out = {};
519
- for (const schema of schemas) out[schema.name] = tool({
520
- description: schema.description,
521
- inputSchema: jsonSchema(schema.parameters),
522
- execute: async (args, options) => {
523
- const input = args ?? {};
524
- const signal = options.abortSignal ?? ctx.signal;
525
- const opts = {};
526
- if (signal !== void 0) opts.signal = signal;
527
- if (options.toolCallId !== void 0) opts.toolCallId = options.toolCallId;
528
- return ctx.executeTool(schema.name, input, ctx.sessionId, ctx.messages().slice(), opts);
529
- }
530
- });
531
- return out;
588
+ /** PCM16 sample rates supported by Cartesia's `raw` output format. */
589
+ const CARTESIA_PCM16_RATES = [
590
+ 8e3,
591
+ 16e3,
592
+ 22050,
593
+ 24e3,
594
+ 44100,
595
+ 48e3
596
+ ];
597
+ function assertSupportedSampleRate$1(rate) {
598
+ if (CARTESIA_PCM16_RATES.includes(rate)) return rate;
599
+ throw makeTtsError("tts_connect_failed", `Cartesia TTS: unsupported sample rate ${rate}. Supported: ${CARTESIA_PCM16_RATES.join(", ")}.`);
532
600
  }
533
- //#endregion
534
- //#region host/pipeline-session.ts
535
- function toModelMessage(m) {
536
- if (m.role === "user") return {
537
- role: "user",
538
- content: m.content
539
- };
540
- if (m.role === "assistant") return {
541
- role: "assistant",
542
- content: m.content
543
- };
544
- return {
545
- role: "assistant",
546
- content: m.content
547
- };
548
- }
549
- function emitError(client, code, message) {
550
- client.event({
551
- type: "error",
552
- code,
553
- message
554
- });
555
- }
556
- function handleStreamPart(part, deps) {
557
- switch (part.type) {
558
- case "text-delta": {
559
- const delta = part.text ?? "";
560
- if (delta.length === 0) return;
561
- deps.onTextDelta(delta);
562
- deps.tts?.sendText(delta);
563
- return;
564
- }
565
- case "tool-call": {
566
- const input = part.input ?? {};
567
- deps.client.event({
568
- type: "tool_call",
569
- toolCallId: part.toolCallId ?? "",
570
- toolName: part.toolName ?? "",
571
- args: input
572
- });
573
- return;
574
- }
575
- case "tool-result": {
576
- const output = part.output;
577
- const resultString = typeof output === "string" ? output : JSON.stringify(output);
578
- deps.client.event({
579
- type: "tool_call_done",
580
- toolCallId: part.toolCallId ?? "",
581
- result: resultString
582
- });
583
- return;
584
- }
585
- case "error": {
586
- const msg = errorMessage(part.error);
587
- deps.log.error("LLM stream error", {
588
- message: msg,
589
- sessionId: deps.sessionId
590
- });
591
- emitError(deps.client, "llm", msg);
592
- return;
593
- }
594
- default: return;
595
- }
596
- }
597
- /** Create a pluggable-provider voice session. */
598
- function createPipelineSession(opts) {
599
- const log = opts.logger ?? consoleLogger;
600
- const sttSampleRate = opts.sttSampleRate ?? 16e3;
601
- const ttsSampleRate = opts.ttsSampleRate ?? 24e3;
602
- const { client, agentConfig, toolSchemas, executeTool } = opts;
603
- const systemPrompt = buildSystemPrompt(agentConfig, {
604
- hasTools: toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0,
605
- voice: true,
606
- toolGuidance: opts.toolGuidance
607
- });
608
- const ctx = buildPipelineCtx({
609
- id: opts.id,
610
- agent: opts.agent,
611
- client,
612
- agentConfig,
613
- executeTool,
614
- log,
615
- maxHistory: opts.maxHistory
616
- });
617
- const sessionAbort = new AbortController();
618
- let audioReady = false;
619
- let terminated = false;
620
- let turnController = null;
621
- let nextReplyId = 0;
622
- const sttSubs = [];
623
- const ttsSubs = [];
624
- /**
625
- * Tear down the session after an unrecoverable provider error. Aborts the
626
- * in-flight turn, cancels TTS, signals providers to close via sessionAbort,
627
- * and flips `terminated` so future STT events and audio frames become
628
- * no-ops. Idempotent.
629
- */
630
- function terminate() {
631
- if (terminated) return;
632
- terminated = true;
633
- if (turnController !== null) {
634
- turnController.abort();
635
- turnController = null;
636
- }
637
- ctx.tts?.cancel();
638
- ctx.cancelReply();
639
- sessionAbort.abort();
640
- }
641
- function onSttPartial(_text) {
642
- if (terminated) return;
643
- if (turnController === null) return;
644
- log.info("Pipeline barge-in", { sessionId: opts.id });
645
- turnController.abort();
646
- turnController = null;
647
- ctx.tts?.cancel();
648
- ctx.cancelReply();
649
- client.event({ type: "cancelled" });
650
- }
651
- function onSttFinal(text) {
652
- if (terminated) return;
653
- const trimmed = text.trim();
654
- if (trimmed.length === 0) return;
655
- if (turnController !== null) {
656
- log.info("Pipeline replacing in-flight turn", { sessionId: opts.id });
657
- turnController.abort();
658
- turnController = null;
659
- ctx.tts?.cancel();
660
- ctx.cancelReply();
661
- client.event({ type: "cancelled" });
662
- }
663
- client.event({
664
- type: "user_transcript",
665
- text
666
- });
667
- const turn = runTurn(trimmed).catch((err) => {
668
- log.error("Pipeline turn crashed", {
669
- error: errorMessage(err),
670
- sessionId: opts.id
671
- });
672
- });
673
- ctx.chainTurn(turn);
674
- }
675
- function onSttError(err) {
676
- if (terminated) return;
677
- log.error("STT error", {
678
- code: err.code,
679
- message: err.message,
680
- sessionId: opts.id
681
- });
682
- emitError(client, "stt", err.message);
683
- terminate();
684
- }
685
- function onTtsError(err) {
686
- if (terminated) return;
687
- log.error("TTS error", {
688
- code: err.code,
689
- message: err.message,
690
- sessionId: opts.id
691
- });
692
- emitError(client, "tts", err.message);
693
- terminate();
694
- }
695
- async function consumeLlmStream(ctl, messages, tools, onDelta) {
696
- const deps = {
697
- client,
698
- tts: ctx.tts,
699
- log,
700
- sessionId: opts.id,
701
- onTextDelta: onDelta
702
- };
703
- try {
704
- const maxSteps = agentConfig.maxSteps ?? 5;
705
- const result = streamText({
706
- model: opts.llm,
707
- system: systemPrompt,
708
- messages,
709
- tools,
710
- stopWhen: stepCountIs(maxSteps),
711
- abortSignal: ctl.signal
712
- });
713
- for await (const part of result.fullStream) {
714
- if (ctl.signal.aborted) break;
715
- handleStreamPart(part, deps);
716
- }
717
- } catch (err) {
718
- if (!ctl.signal.aborted) {
719
- const msg = errorMessage(err);
720
- log.error("LLM streamText failed", {
721
- error: msg,
722
- sessionId: opts.id
723
- });
724
- emitError(client, "llm", msg);
725
- }
726
- }
727
- }
728
- /**
729
- * Flush TTS and wait for drain. Resolves on any of:
730
- * - TTS emits `done`
731
- * - `signal` aborts (barge-in, provider error, session stop)
732
- * - `PIPELINE_FLUSH_TIMEOUT_MS` elapses
733
- * Resolves immediately if no TTS session.
734
- */
735
- function flushTtsAndWait(signal) {
736
- const tts = ctx.tts;
737
- if (!tts) return Promise.resolve();
738
- return new Promise((resolve) => {
739
- let off = null;
740
- let timer = null;
741
- const cleanup = () => {
742
- if (off) {
743
- off();
744
- off = null;
745
- }
746
- if (timer) {
747
- clearTimeout(timer);
748
- timer = null;
749
- }
750
- signal.removeEventListener("abort", onAbort);
751
- };
752
- const finish = () => {
753
- cleanup();
754
- resolve();
755
- };
756
- const onAbort = () => finish();
757
- if (signal.aborted) {
758
- resolve();
759
- return;
760
- }
761
- signal.addEventListener("abort", onAbort, { once: true });
762
- off = tts.on("done", finish);
763
- timer = setTimeout(() => {
764
- log.warn("TTS flush timeout", {
765
- sessionId: opts.id,
766
- timeoutMs: PIPELINE_FLUSH_TIMEOUT_MS
767
- });
768
- finish();
769
- }, PIPELINE_FLUSH_TIMEOUT_MS);
770
- tts.flush();
771
- });
772
- }
773
- async function runTurn(userText) {
774
- const replyId = `pipeline-${++nextReplyId}`;
775
- ctx.beginReply(replyId);
776
- ctx.pushMessages({
777
- role: "user",
778
- content: userText
779
- });
780
- const ctl = new AbortController();
781
- turnController = ctl;
782
- const tools = toVercelTools(toolSchemas, {
783
- executeTool,
784
- sessionId: opts.id,
785
- messages: () => ctx.conversationMessages,
786
- signal: ctl.signal
787
- });
788
- const messages = ctx.conversationMessages.map(toModelMessage);
789
- let accumulated = "";
790
- await consumeLlmStream(ctl, messages, tools, (delta) => {
791
- accumulated += delta;
792
- });
793
- if (ctl.signal.aborted) {
794
- if (turnController === ctl) turnController = null;
795
- return;
796
- }
797
- if (accumulated.length > 0) {
798
- client.event({
799
- type: "agent_transcript",
800
- text: accumulated
801
- });
802
- ctx.pushMessages({
803
- role: "assistant",
804
- content: accumulated
805
- });
806
- }
807
- await flushTtsAndWait(ctl.signal);
808
- if (ctl.signal.aborted) {
809
- if (turnController === ctl) turnController = null;
810
- return;
811
- }
812
- client.playAudioDone();
813
- client.event({ type: "reply_done" });
814
- if (turnController === ctl) turnController = null;
815
- }
816
- async function runGreeting(text) {
817
- const replyId = `pipeline-greeting-${++nextReplyId}`;
818
- ctx.beginReply(replyId);
819
- const ctl = new AbortController();
820
- turnController = ctl;
821
- client.event({
822
- type: "agent_transcript",
823
- text
824
- });
825
- ctx.pushMessages({
826
- role: "assistant",
827
- content: text
828
- });
829
- ctx.tts?.sendText(text);
830
- await flushTtsAndWait(ctl.signal);
831
- if (ctl.signal.aborted) {
832
- if (turnController === ctl) turnController = null;
833
- return;
834
- }
835
- client.playAudioDone();
836
- client.event({ type: "reply_done" });
837
- if (turnController === ctl) turnController = null;
838
- }
839
- function reportOpenRejection(which, reason) {
840
- const msg = errorMessage(reason);
841
- log.error(`${which === "stt" ? "STT" : "TTS"} open failed`, {
842
- error: msg,
843
- sessionId: opts.id
844
- });
845
- emitError(client, which, msg);
846
- }
847
- async function adoptStt(sttSession, teardown) {
848
- if (teardown) {
849
- await sttSession.close().catch(() => void 0);
850
- return;
851
- }
852
- ctx.stt = sttSession;
853
- sttSubs.push(sttSession.on("partial", onSttPartial));
854
- sttSubs.push(sttSession.on("final", onSttFinal));
855
- sttSubs.push(sttSession.on("error", onSttError));
856
- }
857
- async function adoptTts(ttsSession, teardown) {
858
- if (teardown) {
859
- await ttsSession.close().catch(() => void 0);
860
- return;
861
- }
862
- ctx.tts = ttsSession;
863
- ttsSubs.push(ttsSession.on("audio", (pcm) => {
864
- client.playAudioChunk(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
865
- }));
866
- ttsSubs.push(ttsSession.on("error", onTtsError));
867
- }
868
- async function openProviders() {
869
- const [sttResult, ttsResult] = await Promise.allSettled([opts.stt.open({
870
- sampleRate: sttSampleRate,
871
- apiKey: opts.sttApiKey,
872
- sttPrompt: agentConfig.sttPrompt,
873
- signal: sessionAbort.signal
874
- }), opts.tts.open({
875
- sampleRate: ttsSampleRate,
876
- apiKey: opts.ttsApiKey,
877
- signal: sessionAbort.signal
878
- })]);
879
- if (sttResult.status === "rejected") reportOpenRejection("stt", sttResult.reason);
880
- if (ttsResult.status === "rejected") reportOpenRejection("tts", ttsResult.reason);
881
- const aborted = sessionAbort.signal.aborted;
882
- const sttFailed = sttResult.status === "rejected";
883
- const ttsFailed = ttsResult.status === "rejected";
884
- const teardown = aborted || sttFailed || ttsFailed;
885
- if (sttResult.status === "fulfilled") await adoptStt(sttResult.value, teardown);
886
- if (ttsResult.status === "fulfilled") await adoptTts(ttsResult.value, teardown);
887
- if (!aborted && (sttFailed || ttsFailed)) terminate();
888
- }
889
- return {
890
- async start() {
891
- await openProviders();
892
- },
893
- async stop() {
894
- if (sessionAbort.signal.aborted) return;
895
- sessionAbort.abort();
896
- turnController?.abort();
897
- for (const off of sttSubs) off();
898
- for (const off of ttsSubs) off();
899
- sttSubs.length = 0;
900
- ttsSubs.length = 0;
901
- if (ctx.turnPromise !== null) await ctx.turnPromise;
902
- await ctx.stt?.close().catch(() => {});
903
- await ctx.tts?.close().catch(() => {});
904
- },
905
- onAudio(data) {
906
- if (terminated || !audioReady) return;
907
- const offset = data.byteOffset;
908
- const length = data.byteLength;
909
- let pcm;
910
- if (offset % 2 === 0 && length % 2 === 0) pcm = new Int16Array(data.buffer, offset, length / 2);
911
- else {
912
- const copy = new Uint8Array(length - length % 2);
913
- copy.set(data.subarray(0, copy.byteLength));
914
- pcm = new Int16Array(copy.buffer);
915
- }
916
- ctx.stt?.sendAudio(pcm);
917
- },
918
- onAudioReady() {
919
- if (audioReady || terminated) return;
920
- audioReady = true;
921
- if (opts.skipGreeting) return;
922
- const greeting = agentConfig.greeting;
923
- if (!greeting) return;
924
- const turn = runGreeting(greeting).catch((err) => {
925
- log.error("Pipeline greeting failed", {
926
- error: errorMessage(err),
927
- sessionId: opts.id
928
- });
929
- });
930
- ctx.chainTurn(turn);
931
- },
932
- onCancel() {
933
- if (terminated) return;
934
- turnController?.abort();
935
- turnController = null;
936
- ctx.tts?.cancel();
937
- ctx.cancelReply();
938
- client.event({ type: "cancelled" });
939
- },
940
- onReset() {
941
- if (terminated) return;
942
- turnController?.abort();
943
- turnController = null;
944
- ctx.tts?.cancel();
945
- ctx.cancelReply();
946
- ctx.conversationMessages = [];
947
- ctx.turnPromise = null;
948
- client.event({ type: "reset" });
949
- },
950
- onHistory(incoming) {
951
- if (terminated) return;
952
- ctx.pushMessages(...incoming.map((m) => ({
953
- role: m.role,
954
- content: m.content
955
- })));
956
- },
957
- waitForTurn() {
958
- return ctx.turnPromise ?? Promise.resolve();
959
- }
960
- };
961
- }
962
- //#endregion
963
- //#region host/providers/stt/assemblyai.ts
964
- /**
965
- * AssemblyAI Universal-Streaming STT opener (host-only).
966
- *
967
- * The user-facing descriptor factory (`assemblyAI(...)`) lives in
968
- * `sdk/providers/stt/assemblyai.ts`. This module is the host-side
969
- * counterpart: it takes the descriptor options + an API key and
970
- * returns an {@link SttOpener} that the pipeline session drives.
971
- *
972
- * Default model: `"u3pro-rt"` (Universal-3 Pro Real-Time). The adapter
973
- * maps that to the SDK's `"u3-rt-pro"` `speechModel` value; any other
974
- * string is forwarded verbatim.
975
- */
976
- /** Translate the descriptor's model alias to the SDK's `speechModel` value. */
977
- function resolveSpeechModel(model) {
978
- if (model === "u3pro-rt") return "u3-rt-pro";
979
- return model;
980
- }
981
- /** Build an {@link SttOpener} from resolved AssemblyAI descriptor options. */
982
- function openAssemblyAI(opts = {}) {
983
- return {
984
- name: "assemblyai",
985
- async open(openOpts) {
986
- const apiKey = openOpts.apiKey || process.env.ASSEMBLYAI_API_KEY;
987
- if (!apiKey) throw makeSttError("stt_auth_failed", "AssemblyAI STT: missing API key. Set ASSEMBLYAI_API_KEY in the agent env.");
988
- const client = new AssemblyAI({ apiKey });
989
- const speechModel = resolveSpeechModel(opts.model ?? "u3pro-rt");
990
- const transcriber = client.streaming.transcriber({
991
- sampleRate: openOpts.sampleRate,
992
- speechModel,
993
- ...openOpts.sttPrompt ? { prompt: openOpts.sttPrompt } : {}
994
- });
995
- const emitter = createNanoEvents();
996
- let closed = false;
997
- transcriber.on("turn", (event) => {
998
- if (closed) return;
999
- const text = event.transcript ?? "";
1000
- if (event.end_of_turn) {
1001
- if (text.length > 0) emitter.emit("final", text);
1002
- } else if (text.length > 0) emitter.emit("partial", text);
1003
- });
1004
- transcriber.on("error", (err) => {
1005
- if (closed) return;
1006
- emitter.emit("error", makeSttError("stt_stream_error", err?.message ?? String(err)));
1007
- });
1008
- transcriber.on("close", (code) => {
1009
- if (closed) return;
1010
- if (code !== 1e3) emitter.emit("error", makeSttError("stt_stream_error", `socket closed ${code}`));
1011
- });
1012
- try {
1013
- await transcriber.connect();
1014
- } catch (cause) {
1015
- throw makeSttError("stt_connect_failed", `AssemblyAI STT: connect failed: ${cause instanceof Error ? cause.message : String(cause)}`);
1016
- }
1017
- const close = async () => {
1018
- if (closed) return;
1019
- closed = true;
1020
- try {
1021
- await transcriber.close();
1022
- } catch {}
1023
- };
1024
- if (openOpts.signal.aborted) close();
1025
- else openOpts.signal.addEventListener("abort", () => void close(), { once: true });
1026
- return {
1027
- sendAudio(pcm) {
1028
- if (closed) return;
1029
- const copy = new Uint8Array(pcm.byteLength);
1030
- copy.set(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
1031
- transcriber.sendAudio(copy.buffer);
1032
- },
1033
- on(event, fn) {
1034
- return emitter.on(event, fn);
1035
- },
1036
- close,
1037
- _transcriber: transcriber
1038
- };
1039
- }
1040
- };
1041
- }
1042
- //#endregion
1043
- //#region host/providers/tts/cartesia.ts
1044
- /**
1045
- * Cartesia TTS opener (host-only).
1046
- *
1047
- * The user-facing descriptor factory (`cartesia(...)`) lives in
1048
- * `sdk/providers/tts/cartesia.ts`. This module is the host-side
1049
- * counterpart: it takes the descriptor options + an API key and
1050
- * returns a {@link TtsOpener} that the pipeline session drives.
1051
- *
1052
- * Wraps `@cartesia/cartesia-js`'s `TTSWS` / `TTSWSContext` and normalizes it
1053
- * onto the {@link TtsEvents} contract consumed by the pipeline orchestrator.
1054
- *
1055
- * **Per-turn context lifecycle.** Each `sendText(...)` within the same turn
1056
- * appends to the same Cartesia context. On `flush()` or `cancel()`, a new
1057
- * context is minted for the next turn — so concurrent `cancel({ contextId })`
1058
- * only targets the in-flight turn, never the one that follows.
1059
- *
1060
- * **Audio format.** The adapter requests `raw` / `pcm_s16le` at the
1061
- * negotiated `sampleRate` so it can forward chunks as `Int16Array` with no
1062
- * conversion.
1063
- */
1064
- /** PCM16 sample rates supported by Cartesia's `raw` output format. */
1065
- const CARTESIA_PCM16_RATES = [
1066
- 8e3,
1067
- 16e3,
1068
- 22050,
1069
- 24e3,
1070
- 44100,
1071
- 48e3
1072
- ];
1073
- function assertSupportedSampleRate(rate) {
1074
- if (CARTESIA_PCM16_RATES.includes(rate)) return rate;
1075
- throw makeTtsError("tts_connect_failed", `Cartesia TTS: unsupported sample rate ${rate}. Supported: ${CARTESIA_PCM16_RATES.join(", ")}.`);
1076
- }
1077
- /** Build a {@link TtsOpener} from resolved Cartesia descriptor options. */
1078
- function openCartesia(opts) {
601
+ /** Build a {@link TtsOpener} from resolved Cartesia descriptor options. */
602
+ function openCartesia(opts) {
1079
603
  return {
1080
604
  name: "cartesia",
1081
605
  async open(openOpts) {
1082
606
  const apiKey = openOpts.apiKey || process.env.CARTESIA_API_KEY;
1083
607
  if (!apiKey) throw makeTtsError("tts_auth_failed", "Cartesia TTS: missing API key. Set CARTESIA_API_KEY in the agent env.");
1084
- const sampleRate = assertSupportedSampleRate(openOpts.sampleRate);
608
+ const sampleRate = assertSupportedSampleRate$1(openOpts.sampleRate);
1085
609
  const model = opts.model ?? "sonic-2";
1086
610
  const language = opts.language ?? "en";
611
+ const voice = opts.voice ?? "f786b574-daa5-4673-aa0c-cbe3e8534c02";
1087
612
  const client = new Cartesia({ apiKey });
1088
613
  let ws;
1089
614
  try {
@@ -1098,7 +623,7 @@ function openCartesia(opts) {
1098
623
  model_id: model,
1099
624
  voice: {
1100
625
  mode: "id",
1101
- id: opts.voice
626
+ id: voice
1102
627
  },
1103
628
  output_format: {
1104
629
  container: "raw",
@@ -1169,7 +694,7 @@ function openCartesia(opts) {
1169
694
  model_id: model,
1170
695
  voice: {
1171
696
  mode: "id",
1172
- id: opts.voice
697
+ id: voice
1173
698
  },
1174
699
  output_format: {
1175
700
  container: "raw",
@@ -1178,38 +703,247 @@ function openCartesia(opts) {
1178
703
  },
1179
704
  language
1180
705
  };
1181
- const ignoreRejection = (_err) => {};
706
+ const ignoreRejection = (_err) => {};
707
+ return {
708
+ sendText(text) {
709
+ if (closed || text.length === 0) return;
710
+ rotateIfPending();
711
+ context.send({
712
+ ...baseRequest,
713
+ transcript: text,
714
+ continue: true
715
+ }).catch(ignoreRejection);
716
+ },
717
+ flush() {
718
+ if (closed || rotatePending) return;
719
+ context.send({
720
+ ...baseRequest,
721
+ transcript: "",
722
+ continue: false
723
+ }).catch(ignoreRejection);
724
+ rotatePending = true;
725
+ },
726
+ cancel() {
727
+ if (closed) return;
728
+ if (!doneEmitted) context.cancel().catch(ignoreRejection);
729
+ emitDoneOnce();
730
+ rotatePending = true;
731
+ },
732
+ on(event, fn) {
733
+ return emitter.on(event, fn);
734
+ },
735
+ close,
736
+ _ws: ws,
737
+ _currentContextId: () => context.contextId
738
+ };
739
+ }
740
+ };
741
+ }
742
+ //#endregion
743
+ //#region host/providers/tts/rime.ts
744
+ /**
745
+ * Rime TTS opener (host-only).
746
+ *
747
+ * The user-facing descriptor factory (`rime(...)`) lives in
748
+ * `sdk/providers/tts/rime.ts`. This module is the host-side
749
+ * counterpart: it takes the descriptor options + an API key and
750
+ * returns a {@link TtsOpener} that the pipeline session drives.
751
+ *
752
+ * **Protocol.** Connects to Rime's `ws2` JSON WebSocket endpoint
753
+ * (`wss://users-ws.rime.ai/ws2`). Client-to-server messages are JSON:
754
+ * - `{ "text": "..." }` — append text to the synthesis buffer
755
+ * - `{ "operation": "clear" }` — drop buffered text (barge-in)
756
+ * - `{ "operation": "eos" }` — drain buffer, close connection (NOT used
757
+ * during a session: it would tear down the WS, forcing reconnect per
758
+ * turn). We force end-of-turn synthesis with a trailing `"."` instead.
759
+ * The server responds with JSON frames:
760
+ * - `{ type: "chunk", data: <base64 PCM16 LE>, contextId: string | null }`
761
+ * - `{ type: "timestamps", ... }` (ignored)
762
+ * - `{ type: "error", message: string }` (surfaced as `tts_stream_error`)
763
+ *
764
+ * **Single long-lived connection per session.** Rime buffers text until it
765
+ * sees terminal punctuation (`.`, `?`, `!`), so we use one WebSocket per
766
+ * `open()` call and reuse it across turns. `clear` resets the buffer
767
+ * between cancellations.
768
+ *
769
+ * **Done detection.** After `flush()` sends a trailing `"."` to force the
770
+ * server to synthesize any half-buffered text, we arm a quiescence timer
771
+ * that fires 500 ms after the last received audio chunk. When it fires,
772
+ * `done` is emitted.
773
+ *
774
+ * **Audio format.** The URL requests `audioFormat=pcm` at the negotiated
775
+ * `sampleRate`, which returns raw PCM16 little-endian. We decode the base64
776
+ * payload and construct a zero-copy `Int16Array` view over the decoded bytes.
777
+ */
778
/** PCM16 sample rates (in Hz) that may be requested from the Rime `ws2` endpoint. */
const RIME_PCM16_RATES = [8000, 16000, 22050, 24000, 44100, 48000];

/**
 * Validate a negotiated sample rate against {@link RIME_PCM16_RATES}.
 *
 * @param {number} rate - Sample rate in Hz requested for the session.
 * @returns {number} The same `rate`, once it is known to be in the list.
 * @throws The `tts_connect_failed` error produced by `makeTtsError` when
 *   `rate` is not one of the listed values.
 */
function assertSupportedSampleRate(rate) {
  const supported = RIME_PCM16_RATES.includes(rate);
  if (!supported) {
    throw makeTtsError("tts_connect_failed", `Rime TTS: unsupported sample rate ${rate}. Supported: ${RIME_PCM16_RATES.join(", ")}.`);
  }
  return rate;
}
791
/**
 * Decode a base64 string of PCM16 audio into an `Int16Array`.
 *
 * `Buffer.from(base64, "base64")` yields a Node.js Buffer (a `Uint8Array`
 * subclass). That Buffer may be a slice of a larger shared allocation, so
 * its `byteOffset` is not necessarily 0; the view is therefore always built
 * from the Buffer's own `byteOffset`. When that offset is 2-byte aligned the
 * result is a zero-copy view; otherwise the bytes are copied into a fresh,
 * aligned buffer, because an `Int16Array` cannot start on an odd offset.
 *
 * A trailing odd byte (an incomplete sample) is dropped. Samples are read in
 * the platform's native byte order.
 *
 * @param {string} data - Base64-encoded PCM16 payload.
 * @returns {Int16Array} Decoded samples; empty when fewer than 2 bytes decode.
 */
function base64ToPcm(data) {
  const bytes = Buffer.from(data, "base64");
  const sampleCount = Math.floor(bytes.byteLength / 2);
  if (sampleCount === 0) return new Int16Array(0);
  if (bytes.byteOffset % Int16Array.BYTES_PER_ELEMENT === 0) {
    return new Int16Array(bytes.buffer, bytes.byteOffset, sampleCount);
  }
  // Misaligned slice: copy into a standalone buffer so the typed-array
  // constructor does not throw a RangeError.
  const aligned = new Uint8Array(sampleCount * 2);
  aligned.set(bytes.subarray(0, aligned.byteLength));
  return new Int16Array(aligned.buffer);
}
805
/** Quiet period (ms) after the most recent audio chunk before `done` is emitted. */
const QUIESCENCE_MS = 500;
/**
 * Grace period (ms) granted after `flush()` for the FIRST audio chunk to
 * arrive before giving up and emitting `done`. Greetings and short replies
 * take this path: `flush()` runs immediately after `sendText()`, so the
 * time to first audio byte can exceed the 500 ms quiescence window. Once
 * the first chunk arrives, the shorter quiescence timeout takes over.
 */
const FIRST_AUDIO_TIMEOUT_MS = 5000;
815
/**
 * Resolve once the WebSocket emits `open`; reject if `error` fires first.
 *
 * Whichever listener runs removes the other, so no handler installed here
 * is left on the socket after the promise settles.
 *
 * @param ws - Event-emitter style socket exposing `once` and `removeListener`.
 * @returns {Promise<void>} Settles on the first `open` or `error` event.
 */
function waitForOpen(ws) {
  return new Promise((resolve, reject) => {
    function handleOpen() {
      ws.removeListener("error", handleError);
      resolve();
    }
    function handleError(err) {
      ws.removeListener("open", handleOpen);
      const failure = makeTtsError("tts_connect_failed", `Rime TTS: connect failed: ${err?.message ?? String(err)}`);
      reject(failure);
    }
    ws.once("open", handleOpen);
    ws.once("error", handleError);
  });
}
830
/**
 * Handle one incoming WebSocket message frame.
 *
 * Kept as a top-level function so `open()` stays small; the session state
 * it needs is passed in through the callbacks.
 *
 * Frames that are not valid JSON, or whose JSON value is not an object
 * (for example `null` or a bare number), are ignored so that a malformed
 * frame cannot throw from inside the socket's message listener.
 *
 * @param raw - Frame payload: a string, or a value whose `toString()` yields the JSON text.
 * @param emitter - Receives `"audio"` (with an `Int16Array`) and `"error"` events.
 * @param {() => void} armQuiescence - Re-arms the quiescence timer.
 * @param {() => boolean} isActiveTimer - Whether a done-timer is currently armed.
 */
function handleRimeMessage(raw, emitter, armQuiescence, isActiveTimer) {
  let msg;
  try {
    msg = JSON.parse(typeof raw === "string" ? raw : raw.toString());
  } catch {
    return;
  }
  // JSON.parse can legitimately return null or a primitive; reading `.type`
  // on null would throw.
  if (msg === null || typeof msg !== "object") return;
  if (msg.type === "chunk" && typeof msg.data === "string") {
    const pcm = base64ToPcm(msg.data);
    if (pcm.length > 0) {
      emitter.emit("audio", pcm);
      // Only extend the timer when one is already armed (i.e. after flush()).
      if (isActiveTimer()) armQuiescence();
    }
    return;
  }
  if (msg.type === "error") emitter.emit("error", makeTtsError("tts_stream_error", `Rime TTS: ${msg.message ?? "unknown error"}`));
}
853
+ /** Build a {@link TtsOpener} from resolved Rime descriptor options. */
854
+ function openRime(opts) {
855
+ return {
856
+ name: "rime",
857
+ async open(openOpts) {
858
+ const apiKey = openOpts.apiKey || process.env.RIME_API_KEY;
859
+ if (!apiKey) throw makeTtsError("tts_auth_failed", "Rime TTS: missing API key. Set RIME_API_KEY in the agent env.");
860
+ const sampleRate = assertSupportedSampleRate(openOpts.sampleRate);
861
+ const model = opts.model ?? "mistv2";
862
+ const lang = opts.language ?? "eng";
863
+ const voice = opts.voice ?? "cove";
864
+ const url = `wss://users-ws.rime.ai/ws2?speaker=${encodeURIComponent(voice)}&modelId=${encodeURIComponent(model)}&audioFormat=pcm&samplingRate=${sampleRate}&lang=${encodeURIComponent(lang)}`;
865
+ let ws;
866
+ try {
867
+ ws = new WsWebSocket(url, { headers: { Authorization: `Bearer ${apiKey}` } });
868
+ } catch (cause) {
869
+ throw makeTtsError("tts_connect_failed", `Rime TTS: failed to create WebSocket: ${cause instanceof Error ? cause.message : String(cause)}`);
870
+ }
871
+ await waitForOpen(ws);
872
+ const emitter = createNanoEvents();
873
+ let closed = false;
874
+ let doneEmitted = false;
875
+ /**
876
+ * After `flush()`, we arm a timer that fires `done`. Initial timeout is
877
+ * `FIRST_AUDIO_TIMEOUT_MS` to give Rime headroom on TTFB; the first
878
+ * chunk swaps it for a shorter `QUIESCENCE_MS` window that resets on
879
+ * each subsequent chunk. `cancel()` emits `done` synchronously.
880
+ */
881
+ let quiescenceTimer = null;
882
+ const clearQuiescence = () => {
883
+ if (quiescenceTimer !== null) {
884
+ clearTimeout(quiescenceTimer);
885
+ quiescenceTimer = null;
886
+ }
887
+ };
888
+ const emitDoneOnce = () => {
889
+ clearQuiescence();
890
+ if (doneEmitted || closed) return;
891
+ doneEmitted = true;
892
+ emitter.emit("done");
893
+ };
894
+ const armQuiescence = () => {
895
+ clearQuiescence();
896
+ quiescenceTimer = setTimeout(emitDoneOnce, QUIESCENCE_MS);
897
+ };
898
+ const armFirstAudioTimer = () => {
899
+ clearQuiescence();
900
+ quiescenceTimer = setTimeout(emitDoneOnce, FIRST_AUDIO_TIMEOUT_MS);
901
+ };
902
+ ws.on("message", (raw) => {
903
+ if (closed) return;
904
+ handleRimeMessage(raw, emitter, armQuiescence, () => quiescenceTimer !== null);
905
+ });
906
+ ws.on("error", (err) => {
907
+ if (closed) return;
908
+ emitter.emit("error", makeTtsError("tts_stream_error", `Rime TTS stream error: ${err?.message ?? String(err)}`));
909
+ });
910
+ ws.on("close", () => {
911
+ if (closed) return;
912
+ emitDoneOnce();
913
+ });
914
+ const close = async () => {
915
+ if (closed) return;
916
+ closed = true;
917
+ clearQuiescence();
918
+ try {
919
+ ws.close();
920
+ } catch {}
921
+ };
922
+ if (openOpts.signal.aborted) close();
923
+ else openOpts.signal.addEventListener("abort", () => void close(), { once: true });
1182
924
  return {
1183
925
  sendText(text) {
1184
926
  if (closed || text.length === 0) return;
1185
- rotateIfPending();
1186
- context.send({
1187
- ...baseRequest,
1188
- transcript: text,
1189
- continue: true
1190
- }).catch(ignoreRejection);
927
+ if (ws.readyState !== WsWebSocket.OPEN) return;
928
+ doneEmitted = false;
929
+ ws.send(JSON.stringify({ text }));
1191
930
  },
1192
931
  flush() {
1193
- if (closed || rotatePending) return;
1194
- context.send({
1195
- ...baseRequest,
1196
- transcript: "",
1197
- continue: false
1198
- }).catch(ignoreRejection);
1199
- rotatePending = true;
932
+ if (closed) return;
933
+ if (ws.readyState !== WsWebSocket.OPEN) return;
934
+ ws.send(JSON.stringify({ text: "." }));
935
+ armFirstAudioTimer();
1200
936
  },
1201
937
  cancel() {
1202
938
  if (closed) return;
1203
- if (!doneEmitted) context.cancel().catch(ignoreRejection);
939
+ if (ws.readyState === WsWebSocket.OPEN) ws.send(JSON.stringify({ operation: "clear" }));
1204
940
  emitDoneOnce();
1205
- rotatePending = true;
1206
941
  },
1207
942
  on(event, fn) {
1208
943
  return emitter.on(event, fn);
1209
944
  },
1210
945
  close,
1211
- _ws: ws,
1212
- _currentContextId: () => context.contextId
946
+ _ws: ws
1213
947
  };
1214
948
  }
1215
949
  };
@@ -1225,53 +959,785 @@ function openCartesia(opts) {
1225
959
  * resolvers here to turn each descriptor into its openable / callable
1226
960
  * host-side counterpart, importing the third-party SDK only at that point.
1227
961
  *
1228
- * The guest sandbox never imports these functions, which is how the agent
1229
- * bundle stays free of `@ai-sdk/anthropic` / `assemblyai` /
1230
- * `@cartesia/cartesia-js`.
962
+ * The guest sandbox never imports these functions, which is how the agent
963
+ * bundle stays free of `@ai-sdk/anthropic` / `assemblyai` /
964
+ * `@cartesia/cartesia-js`.
965
+ */
966
/**
 * Look up a provider API key: agent env first (set via `aai secret put` or
 * `.env`), then the host's `process.env` as a fallback for self-hosted mode.
 *
 * An empty-string value counts as "not set" at both levels, so a blank
 * entry in the agent env does not mask a real key on the host. This matches
 * callers, which treat an empty key as missing.
 *
 * @param {string} envVar - Name of the environment variable to read.
 * @param {Record<string, string | undefined>} [env] - Agent env; may be omitted.
 * @returns {string} The key, or `""` if neither source has it. The caller
 *   decides whether that is fatal.
 */
function resolveApiKey(envVar, env = {}) {
  return env[envVar] || process.env[envVar] || "";
}
974
/**
 * Turn an {@link SttProvider} descriptor into its host-side opener.
 *
 * @param descriptor - Object carrying a `kind` tag and provider `options`.
 * @returns The opener built by the factory that matches `descriptor.kind`.
 * @throws {Error} When `descriptor.kind` is not a known STT provider.
 */
function resolveStt(descriptor) {
  const { kind } = descriptor;
  if (kind === ASSEMBLYAI_KIND) return openAssemblyAI(descriptor.options);
  if (kind === DEEPGRAM_KIND) return openDeepgram(descriptor.options);
  throw new Error(`Unknown STT provider kind: "${descriptor.kind}". Supported: ${ASSEMBLYAI_KIND}, ${DEEPGRAM_KIND}.`);
}
982
/**
 * Turn a {@link TtsProvider} descriptor into its host-side opener.
 *
 * @param descriptor - Object carrying a `kind` tag and provider `options`.
 * @returns The opener built by the factory that matches `descriptor.kind`.
 * @throws {Error} When `descriptor.kind` is not a known TTS provider.
 */
function resolveTts(descriptor) {
  const { kind } = descriptor;
  if (kind === CARTESIA_KIND) return openCartesia(descriptor.options);
  if (kind === RIME_KIND) return openRime(descriptor.options);
  throw new Error(`Unknown TTS provider kind: "${descriptor.kind}". Supported: ${CARTESIA_KIND}, ${RIME_KIND}.`);
}
990
/**
 * Turn an {@link LlmProvider} descriptor into a Vercel AI SDK
 * {@link LanguageModel}.
 *
 * The API key is looked up through `resolveApiKey` (for Anthropic:
 * `ANTHROPIC_API_KEY`). A missing key throws here rather than on the first
 * `streamText` call, where the failure would be harder to read.
 *
 * @param descriptor - Object carrying a `kind` tag and provider `options`.
 * @param env - Agent environment passed through to `resolveApiKey`.
 * @returns The model created for `descriptor.options.model`.
 * @throws {Error} On an unknown `kind` or a missing API key.
 */
function resolveLlm(descriptor, env) {
  if (descriptor.kind !== ANTHROPIC_KIND) {
    throw new Error(`Unknown LLM provider kind: "${descriptor.kind}". Supported: ${ANTHROPIC_KIND}.`);
  }
  const options = descriptor.options;
  const apiKey = resolveApiKey("ANTHROPIC_API_KEY", env);
  if (!apiKey) throw new Error("Anthropic LLM: missing API key. Set ANTHROPIC_API_KEY in the agent env.");
  const provider = createAnthropic({
    apiKey,
    baseURL: "https://api.anthropic.com/v1"
  });
  return provider(options.model);
}
1012
+ //#endregion
1013
+ //#region host/runtime-config.ts
1014
+ /**
1015
+ * Runtime dependencies injected into the session pipeline.
1016
+ *
1017
+ * Defines the {@link Logger} interface, a default {@link consoleLogger},
1018
+ * and the {@link S2SConfig} for Speech-to-Speech endpoint configuration.
1019
+ */
1020
/**
 * Adapt a console-style function to the `(msg, ctx?)` logger signature.
 * The context argument is forwarded only when it is truthy, so a bare
 * message is passed on its own with no trailing `undefined`.
 */
function consoleLog(fn) {
  return (msg, ctx) => {
    if (ctx) return fn(msg, ctx);
    return fn(msg);
  };
}
1023
/** Default logger: each level is routed to the matching `console` method. */
const consoleLogger = Object.fromEntries([
  ["info", console.log],
  ["warn", console.warn],
  ["error", console.error],
  ["debug", console.debug]
].map(([level, sink]) => [level, consoleLog(sink)]));
1030
/**
 * Structured JSON logger for production diagnostics. Each log entry is a
 * single-line JSON object with `timestamp`, `level`, `msg`, and any
 * caller-provided context fields (context keys are merged last, so they
 * override the built-in fields on a name clash).
 *
 * `error` and `warn` entries go to `process.stderr`; everything else goes
 * to `process.stdout`.
 *
 * Logging never throws because of the context: if the entry cannot be
 * serialized (for example a cyclic object or a BigInt), a reduced entry
 * without the context is written instead, with a `logError` field that
 * describes the serialization failure.
 *
 * @param {string} level - Level name stamped on every entry.
 * @returns {(msg: unknown, ctx?: object) => void} The logging function.
 */
function jsonLog(level) {
  return (msg, ctx) => {
    const base = {
      timestamp: (/* @__PURE__ */ new Date()).toISOString(),
      level,
      msg
    };
    const entry = ctx ? Object.assign({ ...base }, ctx) : base;
    let line;
    try {
      line = JSON.stringify(entry);
    } catch (err) {
      line = JSON.stringify({
        ...base,
        msg: String(msg),
        logError: `unserializable log context: ${err instanceof Error ? err.message : String(err)}`
      });
    }
    // Resolve the stream at call time, as the original did.
    const sink = level === "error" || level === "warn" ? process.stderr : process.stdout;
    sink.write(`${line}\n`);
  };
}
1046
/** Logger whose four levels each emit single-line JSON entries via `jsonLog`. */
const jsonLogger = Object.fromEntries(
  ["info", "warn", "error", "debug"].map((level) => [level, jsonLog(level)])
);
1052
+ /** Default S2S endpoint configuration: the `wss://agents.assemblyai.com/v1/voice` WebSocket URL, with input/output sample rates taken from the default STT and TTS sample-rate constants. */
1053
+ const DEFAULT_S2S_CONFIG = {
1054
+ wssUrl: "wss://agents.assemblyai.com/v1/voice",
1055
+ inputSampleRate: DEFAULT_STT_SAMPLE_RATE,
1056
+ outputSampleRate: DEFAULT_TTS_SAMPLE_RATE
1057
+ };
1058
+ //#endregion
1059
+ //#region host/session-core.ts
1060
/** A `reply_done` dispatch taking at least this many milliseconds is logged as slow. */
const REPLY_DONE_SLOW_THRESHOLD_MS = 50;

/**
 * Create the session core: the state machine that sits between a client
 * sink (`opts.client`) and a transport (`opts.transport`).
 *
 * It tracks the active reply (id, pending tool results, tool-call count),
 * the conversation history (capped at `opts.maxHistory`, default 200), a
 * promise chain of in-flight tool executions, and an idle timer.
 *
 * Tool results are bound to the reply that issued the tool call. A tool
 * that finishes after its reply was cancelled or superseded still emits
 * `tool_call_done` to the client, but its result is not queued on the
 * newer reply and so is never sent to the transport under that reply.
 *
 * @param opts - Session options. Fields read here: `id`, `agent`,
 *   `agentConfig` (`idleTimeoutMs`, `maxSteps`), `client`, `transport`,
 *   `executeTool`, and optionally `logger` and `maxHistory`.
 * @returns An object exposing `id`, lifecycle methods (`start`, `stop`),
 *   client-side handlers (`onAudio`, `onCancel`, `onReset`, `onHistory`, ...)
 *   and transport-side callbacks (`onReplyStarted`, `onReplyDone`,
 *   `onToolCall`, ...).
 */
function createSessionCore(opts) {
  const log = opts.logger ?? consoleLogger;
  const maxHistory = opts.maxHistory ?? 200;
  // 0 or a non-finite value disables the idle timer; the default is 300000 ms.
  const idleMs = (() => {
    const raw = opts.agentConfig.idleTimeoutMs ?? 3e5;
    return raw === 0 || !Number.isFinite(raw) ? 0 : raw;
  })();
  let reply = {
    currentReplyId: null,
    pendingTools: [],
    toolCallCount: 0
  };
  let history = [];
  let turnPromise = null;
  let idleTimer = null;
  let stopped = false;

  function emit(event) {
    opts.client.event(event);
  }

  // (Re)start the idle countdown; a no-op once stopped or when disabled.
  function resetIdle() {
    if (stopped || idleMs <= 0) return;
    if (idleTimer) clearTimeout(idleTimer);
    idleTimer = setTimeout(() => {
      log.info("session idle timeout", { sid: opts.id });
      emit({ type: "idle_timeout" });
    }, idleMs);
  }

  // Append to history, dropping the oldest entries beyond `maxHistory`.
  function pushMessages(...msgs) {
    history.push(...msgs);
    if (maxHistory > 0 && history.length > maxHistory) history.splice(0, history.length - maxHistory);
  }

  function beginReply(replyId) {
    reply = {
      currentReplyId: replyId,
      pendingTools: [],
      toolCallCount: 0
    };
    turnPromise = null;
  }

  function cancelReply() {
    reply = {
      currentReplyId: null,
      pendingTools: [],
      toolCallCount: 0
    };
  }

  // Finish a reply that has no tool results left to send.
  function flushReply(startMs, hadTurnPromise) {
    const stepsUsed = reply.toolCallCount;
    if (stepsUsed > 0) log.info("Turn complete", {
      steps: stepsUsed,
      agent: opts.agent
    });
    opts.client.playAudioDone();
    emit({ type: "reply_done" });
    reply.currentReplyId = null;
    const durationMs = Date.now() - startMs;
    if (durationMs >= REPLY_DONE_SLOW_THRESHOLD_MS) log.warn("slow reply_done dispatch", {
      sid: opts.id,
      agent: opts.agent,
      durationMs,
      hadTurnPromise
    });
  }

  return {
    id: opts.id,
    async start() {
      resetIdle();
      await opts.transport.start();
    },
    async stop() {
      if (stopped) return;
      stopped = true;
      if (idleTimer) {
        clearTimeout(idleTimer);
        idleTimer = null;
      }
      // Let in-flight tool executions settle before closing the transport.
      if (turnPromise !== null) await turnPromise;
      await opts.transport.stop();
    },
    onAudio(bytes) {
      resetIdle();
      opts.transport.sendUserAudio(bytes);
    },
    onAudioReady() {},
    onCancel() {
      opts.transport.cancelReply();
      emit({ type: "cancelled" });
    },
    onReset() {
      cancelReply();
      history = [];
      emit({ type: "reset" });
    },
    onHistory(messages) {
      pushMessages(...messages);
    },
    onReplyStarted(replyId) {
      beginReply(replyId);
    },
    onReplyDone() {
      const startMs = Date.now();
      const doneReplyId = reply.currentReplyId;
      if (doneReplyId === null) {
        log.debug("Dropping duplicate reply.done (no active reply)");
        return;
      }
      const hadTurnPromise = turnPromise !== null;
      const sendPending = () => {
        // The reply changed while tools were running: discard, don't send.
        if (reply.currentReplyId !== doneReplyId) {
          reply.pendingTools = [];
          return;
        }
        if (reply.pendingTools.length > 0) {
          for (const tool of reply.pendingTools) opts.transport.sendToolResult(tool.callId, tool.result);
          reply.pendingTools = [];
        } else flushReply(startMs, hadTurnPromise);
      };
      if (hadTurnPromise) turnPromise?.then(sendPending);
      else sendPending();
    },
    onCancelled() {
      cancelReply();
      emit({ type: "cancelled" });
    },
    onAudioChunk(bytes) {
      opts.client.playAudioChunk(bytes);
    },
    onAudioDone() {
      opts.client.playAudioDone();
    },
    onUserTranscript(text) {
      emit({
        type: "user_transcript",
        text
      });
      pushMessages({
        role: "user",
        content: text
      });
    },
    onAgentTranscript(text, interrupted) {
      emit({
        type: "agent_transcript",
        text
      });
      // Interrupted agent speech is shown to the client but kept out of history.
      if (!interrupted) pushMessages({
        role: "assistant",
        content: text
      });
    },
    onToolCall(callId, name, args) {
      emit({
        type: "tool_call",
        toolCallId: callId,
        toolName: name,
        args
      });
      if (reply.currentReplyId === null) {
        log.warn("tool_call with no active reply", {
          sid: opts.id,
          name
        });
        return;
      }
      // Pin the reply that issued this call. `reply` is reassigned by
      // beginReply()/cancelReply(), so a late result must not land on a newer reply.
      const owningReply = reply;
      owningReply.toolCallCount++;
      const maxSteps = opts.agentConfig.maxSteps;
      if (maxSteps !== void 0 && owningReply.toolCallCount > maxSteps) {
        log.info("maxSteps exceeded; refusing tool call", {
          toolCallCount: owningReply.toolCallCount,
          maxSteps
        });
        owningReply.pendingTools.push({
          callId,
          result: JSON.stringify({ error: "Maximum tool steps reached. Please respond to the user now." })
        });
        emit({
          type: "tool_call_done",
          toolCallId: callId,
          result: "{}"
        });
        return;
      }
      const p = (async () => {
        try {
          const result = await opts.executeTool(name, args, opts.id, history);
          owningReply.pendingTools.push({
            callId,
            result
          });
          emit({
            type: "tool_call_done",
            toolCallId: callId,
            result
          });
        } catch (err) {
          const message = err instanceof Error ? err.message : String(err);
          owningReply.pendingTools.push({
            callId,
            result: JSON.stringify({ error: message })
          });
          emit({
            type: "tool_call_done",
            toolCallId: callId,
            result: message
          });
        }
      })();
      // Chain so that onReplyDone()/stop() can await every outstanding tool.
      turnPromise = (turnPromise ?? Promise.resolve()).then(() => p);
    },
    onError(code, message) {
      emit({
        type: "error",
        code,
        message
      });
    },
    onSpeechStarted() {
      emit({ type: "speech_started" });
    },
    onSpeechStopped() {
      emit({ type: "speech_stopped" });
    }
  };
}
1285
+ //#endregion
1286
+ //#region host/tool-executor.ts
1287
+ /**
1288
+ * Tool execution — validates arguments and invokes tool handlers.
1289
+ *
1290
+ * {@link executeToolCall} is the single entry point used by both the
1291
+ * direct (self-hosted) runtime and the platform sandbox sidecar.
1292
+ */
1293
/** Return a promise that resolves from a zero-delay timer callback. */
const yieldTick = () => new Promise((resolve) => {
  setTimeout(resolve, 0);
});
1294
/**
 * Assemble the context object handed to a tool's `execute` function.
 *
 * Missing optional fields fall back to neutral defaults: `state` to `{}`,
 * `messages` to `[]`, `sessionId` to `""`. Reading `kv` when none was
 * supplied throws, and `send` is a no-op when `opts.send` is absent.
 *
 * @param opts - Source options (`env`, `state`, `kv`, `messages`, `sessionId`, `send`).
 * @returns The tool context.
 */
function buildToolContext(opts) {
  const store = opts.kv;
  const toolState = opts.state ?? {};
  const transcript = opts.messages ?? [];
  const sid = opts.sessionId ?? "";
  return {
    env: opts.env,
    state: toolState,
    get kv() {
      if (store) return store;
      throw new Error("KV not available");
    },
    messages: transcript,
    sessionId: sid,
    send(event, data) {
      opts.send?.(event, data);
    }
  };
}
1310
/**
 * Validate `args` against the tool's parameter schema, run the tool under
 * a timeout, and return its result as a string.
 *
 * Nothing is thrown to the caller: validation failures and execution
 * errors are both reported through the value returned by `toolError`.
 *
 * @param {string} name - Tool name, used in error and log messages.
 * @param args - Raw arguments to validate.
 * @param options - Carries `tool` plus the fields read by `buildToolContext`
 *   and an optional `logger`.
 * @returns {Promise<string>} `"null"` for a null/undefined result, the result
 *   itself when it is already a string, otherwise its JSON serialization.
 */
async function executeToolCall(name, args, options) {
  const { tool } = options;
  const validation = (tool.parameters ?? EMPTY_PARAMS).safeParse(args);
  if (!validation.success) {
    const issues = validation.error?.issues ?? [];
    const details = issues.map((i) => `${i.path.map(String).join(".")}: ${i.message}`).join(", ");
    return toolError(`Invalid arguments for tool "${name}": ${details}`);
  }
  try {
    const toolCtx = buildToolContext(options);
    // Yield to the event loop before and after running the tool.
    await yieldTick();
    const pending = Promise.resolve(tool.execute(validation.data, toolCtx));
    const outcome = await pTimeout(pending, {
      milliseconds: TOOL_EXECUTION_TIMEOUT_MS,
      message: `Tool "${name}" timed out after ${TOOL_EXECUTION_TIMEOUT_MS}ms`
    });
    await yieldTick();
    if (outcome == null) return "null";
    if (typeof outcome === "string") return outcome;
    return JSON.stringify(outcome);
  } catch (err) {
    const logger = options.logger;
    if (logger) {
      logger.warn("Tool execution failed", {
        tool: name,
        error: errorDetail(err)
      });
    } else {
      console.warn(`[tool-executor] Tool execution failed: ${name}`, err);
    }
    return toolError(errorMessage(err));
  }
}
1334
+ //#endregion
1335
+ //#region host/to-vercel-tools.ts
1336
+ /**
1337
+ * Converts agent {@link ToolSchema}[] to Vercel AI SDK tools with `execute`
1338
+ * delegation to the agent's {@link ExecuteTool} function.
1339
+ *
1340
+ * The pipeline orchestrator passes the output to `streamText({ tools })`.
1341
+ * Each produced tool's `execute` closure calls
1342
+ * `ctx.executeTool(name, args, sessionId, messages(), { signal, toolCallId })`,
1343
+ * so the existing agent tool infrastructure (argument validation, KV, hooks,
1344
+ * timeout) remains the single source of truth for tool behavior.
1345
+ *
1346
+ * Per-call `options.abortSignal` (forwarded by `streamText` when the
1347
+ * outer turn is aborted, e.g. barge-in) takes precedence over the
1348
+ * bag-level `ctx.signal` so individual invocations respect streamText
1349
+ * aborts.
1231
1350
  */
1232
1351
  /**
1233
- * Look up a provider API key: agent env first (set via `aai secret put` or
1234
- * `.env`), then the host's `process.env` as a fallback for self-hosted mode.
1235
- * Returns `""` if neither has it — the caller decides whether that's fatal.
1352
+ * Convert an array of {@link ToolSchema} to a Vercel AI SDK `ToolSet`
1353
+ * (record keyed by tool name).
1354
+ *
1355
+ * Uses the v6 `tool()` helper with `inputSchema: jsonSchema(...)` wrapping
1356
+ * the agent's JSON Schema `parameters`. Execution is delegated to
1357
+ * `ctx.executeTool` so validation, KV, timeouts, and hooks keep working.
1236
1358
  */
1237
- function resolveApiKey(envVar, env) {
1238
- return env[envVar] ?? process.env[envVar] ?? "";
1359
function toVercelTools(schemas, ctx) {
  const toolSet = {};
  for (const schema of schemas) {
    // Every invocation is forwarded to the agent's own executor, so this
    // wrapper only has to assemble the per-call options bag.
    const execute = async (args, options) => {
      const callOpts = {};
      // A per-call abort signal wins over the bag-level one on `ctx`.
      const signal = options.abortSignal ?? ctx.signal;
      if (signal !== undefined) callOpts.signal = signal;
      if (options.toolCallId !== undefined) callOpts.toolCallId = options.toolCallId;
      // The executor receives a copy of the message list, not the live array.
      return ctx.executeTool(schema.name, args ?? {}, ctx.sessionId, ctx.messages().slice(), callOpts);
    };
    toolSet[schema.name] = tool({
      description: schema.description,
      inputSchema: jsonSchema(schema.parameters),
      execute
    });
  }
  return toolSet;
}
1240
- /** Resolve an {@link SttProvider} descriptor into a host-side opener. */
1241
- function resolveStt(descriptor) {
1242
- switch (descriptor.kind) {
1243
- case ASSEMBLYAI_KIND: return openAssemblyAI(descriptor.options);
1244
- default: throw new Error(`Unknown STT provider kind: "${descriptor.kind}". Supported: ${ASSEMBLYAI_KIND}.`);
1375
+ //#endregion
1376
+ //#region host/transports/pipeline-transport.ts
1377
/**
 * Map a stored conversation entry to the `{ role, content }` shape passed to
 * the LLM. Any role other than "user" is emitted as "assistant"; properties
 * other than `content` are dropped.
 */
function toModelMessage(m) {
  const role = m.role === "user" ? "user" : "assistant";
  return { role, content: m.content };
}
1387
+ /** Create a pipeline-mode Transport (STT → LLM → TTS). */
1388
+ function createPipelineTransport(opts) {
1389
+ const log = opts.logger ?? consoleLogger;
1390
+ const sttSampleRate = opts.sttSampleRate ?? 16e3;
1391
+ const ttsSampleRate = opts.ttsSampleRate ?? 24e3;
1392
+ const maxSteps = opts.maxSteps ?? 5;
1393
+ const toolChoice = opts.toolChoice ?? "auto";
1394
+ const toolSchemas = opts.toolSchemas ?? [];
1395
+ const executeTool = opts.executeTool ?? (async () => {
1396
+ throw new Error("No executeTool provided");
1397
+ });
1398
+ const { callbacks, sessionConfig } = opts;
1399
+ const systemPrompt = sessionConfig.systemPrompt;
1400
+ const sessionAbort = new AbortController();
1401
+ let audioReady = false;
1402
+ let terminated = false;
1403
+ let sttSession = null;
1404
+ let ttsSession = null;
1405
+ let turnController = null;
1406
+ let nextReplyId = 0;
1407
+ const conversationMessages = sessionConfig.history ? [...sessionConfig.history] : [];
1408
+ let turnPromise = null;
1409
+ const sttSubs = [];
1410
+ const ttsSubs = [];
1411
/**
 * Append messages to the conversation history, then drop the oldest entries
 * so that at most 200 remain.
 */
function pushMessages(...msgs) {
  conversationMessages.push(...msgs);
  const overflow = conversationMessages.length - 200;
  if (overflow > 0) conversationMessages.splice(0, overflow);
}
1415
/**
 * Extend `turnPromise` so that it settles only after both the previously
 * chained work and `p` have settled. `p` itself is already running when it is
 * passed in; the chain is only used for awaiting.
 */
function chainTurn(p) {
  const previous = turnPromise ?? Promise.resolve();
  turnPromise = previous.then(() => p);
}
1418
/** Forward an error code and message to the transport's `onError` callback. */
function emitError(code, message) {
  callbacks.onError(code, message);
}
1421
/**
 * Shut the transport down after a provider error it cannot recover from.
 * Aborts the turn in flight (if any), cancels TTS, reports a cancellation and
 * aborts the session-level signal. Only the first call has any effect.
 */
function terminate() {
  if (!terminated) {
    terminated = true;
    turnController?.abort();
    turnController = null;
    if (ttsSession != null) ttsSession.cancel();
    callbacks.onCancelled();
    sessionAbort.abort();
  }
}
1436
/**
 * Handle a partial transcript. The text itself is ignored; a partial arriving
 * while a turn is in flight is treated as barge-in, so the turn is aborted,
 * TTS is cancelled and a cancellation is reported.
 */
function onSttPartial(_text) {
  const idle = turnController === null;
  if (terminated || idle) return;
  log.info("Pipeline barge-in", { sid: opts.sid });
  turnController.abort();
  turnController = null;
  if (ttsSession != null) ttsSession.cancel();
  callbacks.onCancelled();
}
1445
/**
 * Handle a final transcript: ignore whitespace-only text, replace any turn
 * still in flight, report the transcript, and start a new turn.
 */
function onSttFinal(text) {
  if (terminated) return;
  const utterance = text.trim();
  if (utterance === "") return;
  if (turnController !== null) {
    log.info("Pipeline replacing in-flight turn", { sid: opts.sid });
    turnController.abort();
    turnController = null;
    if (ttsSession != null) ttsSession.cancel();
    callbacks.onCancelled();
  }
  // The callback receives the untrimmed text; the turn runs on the trimmed one.
  callbacks.onUserTranscript(text);
  const turn = runTurn(utterance).catch((err) => {
    const detail = {
      error: errorMessage(err),
      sid: opts.sid
    };
    log.error("Pipeline turn crashed", detail);
  });
  chainTurn(turn);
}
1464
/** Log an STT provider error, report it under the "stt" code, then terminate. */
function onSttError(err) {
  if (terminated) return;
  const { code, message } = err;
  log.error("STT error", { code, message, sid: opts.sid });
  emitError("stt", message);
  terminate();
}
1474
/** Log a TTS provider error, report it under the "tts" code, then terminate. */
function onTtsError(err) {
  if (terminated) return;
  const { code, message } = err;
  log.error("TTS error", { code, message, sid: opts.sid });
  emitError("tts", message);
  terminate();
}
1484
/**
 * Run one `streamText` call and feed every stream part to `handleStreamPart`
 * until the stream ends or `ctl` is aborted. A thrown failure is logged and
 * reported under the "llm" code unless the turn was aborted; this function
 * never rejects because of it.
 */
async function consumeLlmStream(ctl, messages, tools, onDelta) {
  try {
    const request = {
      model: opts.llm,
      system: systemPrompt,
      messages,
      tools,
      toolChoice,
      stopWhen: stepCountIs(maxSteps),
      abortSignal: ctl.signal
    };
    const { fullStream } = streamText(request);
    for await (const part of fullStream) {
      if (ctl.signal.aborted) break;
      handleStreamPart(part, ctl, onDelta);
    }
  } catch (err) {
    // Errors caused by our own abort are expected and stay silent.
    if (ctl.signal.aborted) return;
    const msg = errorMessage(err);
    log.error("LLM streamText failed", { error: msg, sid: opts.sid });
    emitError("llm", msg);
  }
}
1510
/**
 * React to a single part of the LLM stream:
 *  - "text-delta": pass non-empty text to `onDelta` and to TTS;
 *  - "tool-call": report the call through `callbacks.onToolCall`;
 *  - "error": log it and report it under the "llm" code.
 * Every other part type is ignored. `_ctl` is unused.
 */
function handleStreamPart(part, _ctl, onDelta) {
  const { type } = part;
  if (type === "text-delta") {
    const chunk = part.text ?? "";
    if (chunk.length === 0) return;
    onDelta(chunk);
    if (ttsSession != null) ttsSession.sendText(chunk);
    return;
  }
  if (type === "tool-call") {
    const toolInput = part.input ?? {};
    callbacks.onToolCall(part.toolCallId ?? "", part.toolName ?? "", toolInput);
    return;
  }
  if (type === "error") {
    const msg = errorMessage(part.error);
    log.error("LLM stream error", { message: msg, sid: opts.sid });
    emitError("llm", msg);
  }
}
1536
/**
 * Ask TTS to flush and wait until it has drained. The returned promise
 * resolves (never rejects) on whichever comes first:
 *   - TTS emits `done`;
 *   - `signal` aborts;
 *   - PIPELINE_FLUSH_TIMEOUT_MS elapses (a warning is logged).
 * With no TTS session, or an already-aborted signal, it resolves immediately
 * without flushing.
 */
function flushTtsAndWait(signal) {
  const tts = ttsSession;
  if (!tts) return Promise.resolve();
  return new Promise((resolve) => {
    if (signal.aborted) {
      resolve();
      return;
    }
    let unsubscribeDone = null;
    let timeoutHandle = null;
    // Release the listener, timer and abort hook, then resolve.
    function settle() {
      if (unsubscribeDone) {
        unsubscribeDone();
        unsubscribeDone = null;
      }
      if (timeoutHandle) {
        clearTimeout(timeoutHandle);
        timeoutHandle = null;
      }
      signal.removeEventListener("abort", handleAbort);
      resolve();
    }
    function handleAbort() {
      settle();
    }
    signal.addEventListener("abort", handleAbort, { once: true });
    unsubscribeDone = tts.on("done", settle);
    timeoutHandle = setTimeout(() => {
      log.warn("TTS flush timeout", {
        sid: opts.sid,
        timeoutMs: PIPELINE_FLUSH_TIMEOUT_MS
      });
      settle();
    }, PIPELINE_FLUSH_TIMEOUT_MS);
    // Flush last, once all listeners are in place.
    tts.flush();
  });
}
1581
/**
 * Run one user turn: record the user message, stream the LLM reply (text is
 * forwarded to TTS as it arrives), report and store the full reply text, wait
 * for TTS to drain, then signal that the reply is done. If the turn is aborted
 * at any point it returns without reporting a transcript or completion.
 */
async function runTurn(userText) {
  nextReplyId += 1;
  callbacks.onReplyStarted(`pipeline-${nextReplyId}`);
  pushMessages({ role: "user", content: userText });
  const ctl = new AbortController();
  turnController = ctl;
  // Clear the shared controller only if a newer turn has not replaced it.
  const release = () => {
    if (turnController === ctl) turnController = null;
  };
  const tools = toVercelTools(toolSchemas, {
    executeTool,
    sessionId: opts.sid,
    messages: () => conversationMessages,
    signal: ctl.signal
  });
  const messages = conversationMessages.map(toModelMessage);
  let replyText = "";
  const collect = (delta) => {
    replyText += delta;
  };
  await consumeLlmStream(ctl, messages, tools, collect);
  if (ctl.signal.aborted) {
    release();
    return;
  }
  if (replyText.length > 0) {
    callbacks.onAgentTranscript(replyText, false);
    pushMessages({ role: "assistant", content: replyText });
  }
  await flushTtsAndWait(ctl.signal);
  if (ctl.signal.aborted) {
    release();
    return;
  }
  callbacks.onReplyDone();
  release();
}
1620
/**
 * Speak the greeting as its own reply: report and store the text as an
 * assistant message, send it to TTS, wait for TTS to drain, then signal that
 * the reply is done unless the greeting was aborted in the meantime.
 */
async function runGreeting(text) {
  nextReplyId += 1;
  callbacks.onReplyStarted(`pipeline-greeting-${nextReplyId}`);
  const ctl = new AbortController();
  turnController = ctl;
  // Clear the shared controller only if a newer turn has not replaced it.
  const release = () => {
    if (turnController === ctl) turnController = null;
  };
  callbacks.onAgentTranscript(text, false);
  pushMessages({ role: "assistant", content: text });
  if (ttsSession != null) ttsSession.sendText(text);
  await flushTtsAndWait(ctl.signal);
  if (ctl.signal.aborted) {
    release();
    return;
  }
  callbacks.onReplyDone();
  release();
}
1639
/**
 * Log a failed provider open and report it through `emitError`, using `which`
 * as the error code. Any `which` other than "stt" is labelled "TTS" in the log.
 */
function reportOpenRejection(which, reason) {
  const msg = errorMessage(reason);
  const label = which === "stt" ? "STT" : "TTS";
  log.error(`${label} open failed`, { error: msg, sid: opts.sid });
  emitError(which, msg);
}
1647
/**
 * Take ownership of a freshly opened STT session. When `teardown` is set the
 * session is closed instead (close failures are ignored); otherwise it becomes
 * the active session and its partial/final/error events are subscribed.
 */
async function adoptStt(session, teardown) {
  if (teardown) {
    await session.close().catch(() => undefined);
    return;
  }
  sttSession = session;
  const handlers = [
    ["partial", onSttPartial],
    ["final", onSttFinal],
    ["error", onSttError]
  ];
  // Keep each unsubscribe function so the subscriptions can be released later.
  for (const [event, handler] of handlers) sttSubs.push(session.on(event, handler));
}
1657
/**
 * Take ownership of a freshly opened TTS session. When `teardown` is set the
 * session is closed instead (close failures are ignored); otherwise it becomes
 * the active session and its audio/error events are subscribed.
 */
async function adoptTts(session, teardown) {
  if (teardown) {
    await session.close().catch(() => undefined);
    return;
  }
  ttsSession = session;
  // Re-wrap the audio as a byte view over the same buffer region.
  const forwardAudio = (pcm) => {
    const bytes = new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength);
    callbacks.onAudioChunk(bytes);
  };
  ttsSubs.push(session.on("audio", forwardAudio));
  ttsSubs.push(session.on("error", onTtsError));
}
1668
/**
 * Open the STT and TTS providers concurrently and wait for both to settle.
 * Each failure is logged and reported. If either open failed, or the session
 * was aborted while opening, any session that did open is closed rather than
 * adopted. A failure without a session abort also terminates the transport.
 */
async function openProviders() {
  const sttOpen = opts.stt.open({
    sampleRate: sttSampleRate,
    apiKey: opts.providerKeys.stt,
    sttPrompt: opts.sttPrompt,
    signal: sessionAbort.signal
  });
  const ttsOpen = opts.tts.open({
    sampleRate: ttsSampleRate,
    apiKey: opts.providerKeys.tts,
    signal: sessionAbort.signal
  });
  const [sttResult, ttsResult] = await Promise.allSettled([sttOpen, ttsOpen]);
  const sttFailed = sttResult.status === "rejected";
  const ttsFailed = ttsResult.status === "rejected";
  if (sttFailed) reportOpenRejection("stt", sttResult.reason);
  if (ttsFailed) reportOpenRejection("tts", ttsResult.reason);
  // Read the abort state only after the failures have been reported.
  const aborted = sessionAbort.signal.aborted;
  const teardown = aborted || sttFailed || ttsFailed;
  if (!sttFailed) await adoptStt(sttResult.value, teardown);
  if (!ttsFailed) await adoptTts(ttsResult.value, teardown);
  if (teardown && !aborted) terminate();
}
1246
- }
1247
- /** Resolve a {@link TtsProvider} descriptor into a host-side opener. */
1248
- function resolveTts(descriptor) {
1249
- switch (descriptor.kind) {
1250
- case CARTESIA_KIND: return openCartesia(descriptor.options);
1251
- default: throw new Error(`Unknown TTS provider kind: "${descriptor.kind}". Supported: ${CARTESIA_KIND}.`);
1689
/**
 * Mark audio as ready (first call only, and never after termination) and, if
 * a greeting is configured and not skipped, start speaking it.
 */
function onAudioReady() {
  if (audioReady || terminated) return;
  audioReady = true;
  const greeting = opts.skipGreeting ? undefined : sessionConfig.greeting;
  if (!greeting) return;
  const greetingTurn = runGreeting(greeting).catch((err) => {
    const detail = {
      error: errorMessage(err),
      sid: opts.sid
    };
    log.error("Pipeline greeting failed", detail);
  });
  chainTurn(greetingTurn);
}
1253
- }
1254
- /**
1255
- * Resolve an {@link LlmProvider} descriptor into a Vercel AI SDK
1256
- * {@link LanguageModel}.
1257
- *
1258
- * The API key is pulled from the agent's env (e.g. `ANTHROPIC_API_KEY`).
1259
- * Missing keys throw here — the pipeline session would fail on first
1260
- * `streamText` call otherwise, and the error is clearer at construction.
1261
- */
1262
- function resolveLlm(descriptor, env) {
1263
- switch (descriptor.kind) {
1264
- case ANTHROPIC_KIND: {
1265
- const options = descriptor.options;
1266
- const apiKey = resolveApiKey("ANTHROPIC_API_KEY", env);
1267
- if (!apiKey) throw new Error("Anthropic LLM: missing API key. Set ANTHROPIC_API_KEY in the agent env.");
1268
- return createAnthropic({
1269
- apiKey,
1270
- baseURL: "https://api.anthropic.com/v1"
1271
- })(options.model);
1702
+ return {
1703
+ async start() {
1704
+ await openProviders();
1705
+ callbacks.onSessionReady?.(opts.sid);
1706
+ onAudioReady();
1707
+ },
1708
+ async stop() {
1709
+ if (sessionAbort.signal.aborted) return;
1710
+ sessionAbort.abort();
1711
+ turnController?.abort();
1712
+ for (const off of sttSubs) off();
1713
+ for (const off of ttsSubs) off();
1714
+ sttSubs.length = 0;
1715
+ ttsSubs.length = 0;
1716
+ if (turnPromise !== null) await turnPromise;
1717
+ await sttSession?.close().catch(() => {});
1718
+ await ttsSession?.close().catch(() => {});
1719
+ },
1720
+ sendUserAudio(bytes) {
1721
+ if (terminated || !audioReady) return;
1722
+ const offset = bytes.byteOffset;
1723
+ const length = bytes.byteLength;
1724
+ let pcm;
1725
+ if (offset % 2 === 0 && length % 2 === 0) pcm = new Int16Array(bytes.buffer, offset, length / 2);
1726
+ else {
1727
+ const copy = new Uint8Array(length - length % 2);
1728
+ copy.set(bytes.subarray(0, copy.byteLength));
1729
+ pcm = new Int16Array(copy.buffer);
1730
+ }
1731
+ sttSession?.sendAudio(pcm);
1732
+ },
1733
+ sendToolResult(_callId, _result) {},
1734
+ cancelReply() {
1735
+ if (terminated) return;
1736
+ turnController?.abort();
1737
+ turnController = null;
1738
+ ttsSession?.cancel();
1272
1739
  }
1273
- default: throw new Error(`Unknown LLM provider kind: "${descriptor.kind}". Supported: ${ANTHROPIC_KIND}.`);
1274
- }
1740
+ };
1275
1741
  }
1276
1742
  //#endregion
1277
1743
  //#region host/s2s.ts
@@ -1326,72 +1792,59 @@ function parseS2sMessage(obj) {
1326
1792
  const result = S2sMessageSchema.safeParse(obj);
1327
1793
  return result.success ? result.data : void 0;
1328
1794
  }
1329
/**
 * Route one parsed S2S message to the matching callback.
 *
 * `state.speechActive` de-duplicates speech start/stop notifications, so each
 * edge is reported once. `ctx.log` is used to log `reply.done`, with `ctx.sid`
 * included when it is defined. `session.updated` and unknown message types are
 * ignored.
 */
function dispatchS2sMessage(callbacks, msg, state, ctx) {
  const { type } = msg;
  if (type === "session.ready") {
    callbacks.onSessionReady(msg.session_id);
    return;
  }
  if (type === "input.speech.started") {
    if (state.speechActive) return;
    state.speechActive = true;
    callbacks.onSpeechStarted();
    return;
  }
  if (type === "input.speech.stopped") {
    if (!state.speechActive) return;
    state.speechActive = false;
    callbacks.onSpeechStopped();
    return;
  }
  if (type === "transcript.user") {
    callbacks.onUserTranscript(msg.text);
    return;
  }
  if (type === "reply.started") {
    callbacks.onReplyStarted(msg.reply_id);
    return;
  }
  if (type === "transcript.agent") {
    callbacks.onAgentTranscript(msg.text, msg.interrupted);
    return;
  }
  if (type === "tool.call") {
    callbacks.onToolCall(msg.call_id, msg.name, msg.args);
    return;
  }
  if (type === "reply.done") {
    const detail = ctx.sid !== undefined ? { sid: ctx.sid } : {};
    detail.status = msg.status ?? "completed";
    ctx.log.info("S2S << reply.done", detail);
    // An interrupted reply is reported as a cancellation, not a completion.
    if (msg.status === "interrupted") callbacks.onCancelled();
    else callbacks.onReplyDone();
    return;
  }
  if (type === "session.error") {
    const expired = msg.code === "session_not_found" || msg.code === "session_forbidden";
    if (expired) callbacks.onSessionExpired();
    else callbacks.onError(new Error(msg.message));
    return;
  }
  if (type === "error") {
    callbacks.onError(new Error(msg.message));
  }
}
1389
1843
  function connectS2s(opts) {
1390
- const { apiKey, config, createWebSocket, logger: log = consoleLogger, sid } = opts;
1844
+ const { apiKey, config, createWebSocket, callbacks, logger: log = consoleLogger, sid } = opts;
1391
1845
  return new Promise((resolve, reject) => {
1392
1846
  log.info("S2S connecting", { url: config.wssUrl });
1393
1847
  const ws = createWebSocket(config.wssUrl, { headers: { Authorization: `Bearer ${apiKey}` } });
1394
- const emitter = createNanoEvents();
1395
1848
  const dispatchState = { speechActive: false };
1396
1849
  const dispatchCtx = sid !== void 0 ? {
1397
1850
  log,
@@ -1409,7 +1862,6 @@ function connectS2s(opts) {
1409
1862
  ws.send(json);
1410
1863
  }
1411
1864
  const handle = {
1412
- on: emitter.on.bind(emitter),
1413
1865
  sendAudio(audio) {
1414
1866
  if (ws.readyState !== 1) {
1415
1867
  log.debug("S2S sendAudio dropped: socket not open");
@@ -1422,16 +1874,15 @@ function connectS2s(opts) {
1422
1874
  ws.send(jsonFrame);
1423
1875
  },
1424
1876
  sendToolResult(callId, result) {
1425
- const msg = {
1426
- type: "tool.result",
1427
- call_id: callId,
1428
- result
1429
- };
1430
1877
  log.info("S2S >> tool.result", {
1431
1878
  call_id: callId,
1432
1879
  resultLength: result.length
1433
1880
  });
1434
- send(msg);
1881
+ send({
1882
+ type: "tool.result",
1883
+ call_id: callId,
1884
+ result
1885
+ });
1435
1886
  },
1436
1887
  updateSession(sessionConfig) {
1437
1888
  const { systemPrompt, ...rest } = sessionConfig;
@@ -1468,8 +1919,7 @@ function connectS2s(opts) {
1468
1919
  }
1469
1920
  function handleAudioFastPath(obj) {
1470
1921
  if (obj.type === "reply.audio" && typeof obj.data === "string") {
1471
- const audioBytes = base64ToUint8(obj.data);
1472
- emitter.emit("audio", { audio: audioBytes });
1922
+ callbacks.onAudio(base64ToUint8(obj.data));
1473
1923
  return true;
1474
1924
  }
1475
1925
  return false;
@@ -1479,7 +1929,7 @@ function connectS2s(opts) {
1479
1929
  if (obj.type === "reply.done") return;
1480
1930
  log.info(`S2S << ${obj.type}`);
1481
1931
  }
1482
- function handleS2sMessage(ev) {
1932
+ ws.addEventListener("message", (ev) => {
1483
1933
  const raw = tryParseJson(ev.data);
1484
1934
  if (raw === void 0) return;
1485
1935
  if (typeof raw !== "object" || raw === null || Array.isArray(raw)) {
@@ -1494,9 +1944,8 @@ function connectS2s(opts) {
1494
1944
  log.warn(`S2S << unrecognised message type: ${obj.type ?? JSON.stringify(raw).slice(0, 200)}`);
1495
1945
  return;
1496
1946
  }
1497
- dispatchS2sMessage(emitter, parsed, dispatchState, dispatchCtx);
1498
- }
1499
- ws.addEventListener("message", handleS2sMessage);
1947
+ dispatchS2sMessage(callbacks, parsed, dispatchState, dispatchCtx);
1948
+ });
1500
1949
  ws.addEventListener("close", (ev) => {
1501
1950
  const code = ev.code ?? 0;
1502
1951
  const reason = ev.reason ?? "";
@@ -1505,394 +1954,102 @@ function connectS2s(opts) {
1505
1954
  reason
1506
1955
  });
1507
1956
  if (!opened) reject(/* @__PURE__ */ new Error(`WebSocket closed before open (code: ${code})`));
1508
- emitter.emit("close", code, reason);
1957
+ callbacks.onClose(code, reason);
1509
1958
  });
1510
1959
  ws.addEventListener("error", (ev) => {
1511
1960
  const message = typeof ev.message === "string" ? ev.message : "WebSocket error";
1512
1961
  const errObj = new Error(message);
1513
1962
  log.error("S2S WebSocket error", { error: errObj.message });
1514
1963
  if (!opened) reject(errObj);
1515
- else emitter.emit("error", errObj);
1964
+ else callbacks.onError(errObj);
1516
1965
  });
1517
1966
  });
1518
1967
  }
1519
1968
  //#endregion
1520
- //#region host/session.ts
1521
- /** @internal Not part of the public API. Exposed for testing only. */
1969
+ //#region host/transports/s2s-transport.ts
1970
+ /** @internal Exposed for testing; allows spying on connectS2s in unit tests. */
1522
1971
  const _internals = { connectS2s };
1523
- /**
1524
- * Create an idle timer that closes the S2S connection after inactivity.
1525
- * Convention: `timeoutMs <= 0` disables the timer entirely (returns a no-op).
1526
- * This allows agents to opt out of idle timeout via `idleTimeoutMs: 0` in their config.
1527
- */
1528
- function createIdleTimer(opts) {
1529
- if (opts.timeoutMs <= 0) return {
1530
- reset() {},
1531
- clear() {}
1532
- };
1533
- let timer = null;
1534
- return {
1535
- reset() {
1536
- if (timer !== null) clearTimeout(timer);
1537
- timer = setTimeout(() => {
1538
- opts.log.info("S2S idle timeout", {
1539
- timeoutMs: opts.timeoutMs,
1540
- agent: opts.agent
1541
- });
1542
- opts.client.event({ type: "idle_timeout" });
1543
- opts.ctx.s2s?.close();
1544
- }, opts.timeoutMs);
1545
- },
1546
- clear() {
1547
- if (timer !== null) {
1548
- clearTimeout(timer);
1549
- timer = null;
1972
+ function createS2sTransport(opts) {
1973
+ const log = opts.logger ?? consoleLogger;
1974
+ const createWs = opts.createWebSocket ?? defaultCreateS2sWebSocket;
1975
+ let handle = null;
1976
+ let currentReplyId = null;
1977
+ async function start() {
1978
+ handle = await _internals.connectS2s({
1979
+ apiKey: opts.apiKey,
1980
+ config: opts.s2sConfig,
1981
+ createWebSocket: createWs,
1982
+ logger: log,
1983
+ sid: opts.sid,
1984
+ callbacks: {
1985
+ onSessionReady: (providerSessionId) => opts.callbacks.onSessionReady?.(providerSessionId),
1986
+ onReplyStarted: (replyId) => {
1987
+ currentReplyId = replyId;
1988
+ opts.callbacks.onReplyStarted(replyId);
1989
+ },
1990
+ onReplyDone: () => {
1991
+ currentReplyId = null;
1992
+ opts.callbacks.onReplyDone();
1993
+ },
1994
+ onCancelled: () => {
1995
+ currentReplyId = null;
1996
+ opts.callbacks.onCancelled();
1997
+ },
1998
+ onAudio: (bytes) => opts.callbacks.onAudioChunk(bytes),
1999
+ onUserTranscript: opts.callbacks.onUserTranscript,
2000
+ onAgentTranscript: opts.callbacks.onAgentTranscript,
2001
+ onToolCall: opts.callbacks.onToolCall,
2002
+ onSpeechStarted: opts.callbacks.onSpeechStarted,
2003
+ onSpeechStopped: opts.callbacks.onSpeechStopped,
2004
+ onSessionExpired: () => {
2005
+ log.info("S2S session expired", { sid: opts.sid });
2006
+ handle?.close();
2007
+ },
2008
+ onError: (err) => opts.callbacks.onError("internal", err.message),
2009
+ onClose: (code, reason) => {
2010
+ if (currentReplyId !== null) {
2011
+ log.warn("S2S closed with active reply", {
2012
+ sid: opts.sid,
2013
+ agent: opts.agent,
2014
+ activeReplyId: currentReplyId,
2015
+ code,
2016
+ reason
2017
+ });
2018
+ opts.callbacks.onError("connection", `S2S closed mid-reply (code=${code})`);
2019
+ } else log.info("S2S closed", {
2020
+ code,
2021
+ reason
2022
+ });
2023
+ }
1550
2024
  }
1551
- }
1552
- };
1553
- }
1554
- /**
1555
- * Complete a tool call by truncating the result, emitting a `tool_call_done` event,
1556
- * and accumulating the result in `ctx.reply.pendingTools` — but only if the reply that
1557
- * initiated this call is still active.
1558
- */
1559
- function finishToolCall(ctx, callId, result, replyId) {
1560
- const truncatedResult = result.length > 4e3 ? result.slice(0, MAX_TOOL_RESULT_CHARS) : result;
1561
- ctx.client.event({
1562
- type: "tool_call_done",
1563
- toolCallId: callId,
1564
- result: truncatedResult
1565
- });
1566
- if (replyId !== null && replyId === ctx.reply.currentReplyId) {
1567
- ctx.reply.pendingTools.push({
1568
- callId,
1569
- result
1570
- });
1571
- if (ctx.maxHistory > 0 && ctx.reply.pendingTools.length > ctx.maxHistory) ctx.reply.pendingTools.shift();
1572
- }
1573
- }
1574
- async function handleToolCall(ctx, event) {
1575
- const { toolCallId: callId, toolName: name, args: parsedArgs } = event;
1576
- const replyId = ctx.reply.currentReplyId;
1577
- ctx.client.event(event);
1578
- const refused = ctx.consumeToolCallStep(name, replyId);
1579
- if (refused !== null) {
1580
- finishToolCall(ctx, callId, refused, replyId);
1581
- return;
1582
- }
1583
- ctx.log.info("S2S tool call", {
1584
- tool: name,
1585
- callId,
1586
- args: parsedArgs,
1587
- agent: ctx.agent
1588
- });
1589
- let result;
1590
- try {
1591
- result = await ctx.executeTool(name, parsedArgs, ctx.id, ctx.conversationMessages);
1592
- } catch (err) {
1593
- const msg = errorMessage(err);
1594
- ctx.log.error("Tool execution failed", {
1595
- tool: name,
1596
- error: errorDetail(err)
1597
2025
  });
1598
- result = toolError(msg);
1599
- }
1600
- ctx.log.info("S2S tool result", {
1601
- tool: name,
1602
- callId,
1603
- resultLength: result.length
1604
- });
1605
- finishToolCall(ctx, callId, result, replyId);
1606
- }
1607
- function handleUserTranscript(ctx, text) {
1608
- ctx.log.info("S2S user transcript", { text });
1609
- ctx.client.event({
1610
- type: "user_transcript",
1611
- text
1612
- });
1613
- ctx.pushMessages({
1614
- role: "user",
1615
- content: text
1616
- });
1617
- }
1618
- function handleAgentTranscript(ctx, text, interrupted) {
1619
- ctx.client.event({
1620
- type: "agent_transcript",
1621
- text
1622
- });
1623
- if (!interrupted) ctx.pushMessages({
1624
- role: "assistant",
1625
- content: text
1626
- });
1627
- }
1628
- function handleReplyCancelled(ctx) {
1629
- ctx.log.info("S2S reply interrupted (barge-in)");
1630
- ctx.cancelReply();
1631
- ctx.client.event({ type: "cancelled" });
1632
- }
1633
- /**
1634
- * Warn when the entry-to-emit time for a reply_done dispatch exceeds this.
1635
- * Tool-less sessions should be sub-millisecond; sessions with pending tools
1636
- * will legitimately spend time awaiting ctx.turnPromise. We log both (with
1637
- * `hadTurnPromise`) so event-loop starvation is distinguishable from
1638
- * genuine tool-call latency.
1639
- */
1640
- const REPLY_DONE_SLOW_THRESHOLD_MS = 50;
1641
- function handleReplyDone(ctx) {
1642
- const startMs = Date.now();
1643
- const doneReplyId = ctx.reply.currentReplyId;
1644
- if (doneReplyId === null) {
1645
- ctx.log.debug("Dropping duplicate reply.done (no active reply)");
1646
- return;
2026
+ handle.updateSession(opts.sessionConfig);
1647
2027
  }
1648
- const hadTurnPromise = ctx.turnPromise !== null;
1649
- const sendPending = () => {
1650
- if (ctx.reply.currentReplyId !== doneReplyId) {
1651
- ctx.reply.pendingTools = [];
1652
- return;
1653
- }
1654
- if (ctx.reply.pendingTools.length > 0) {
1655
- for (const tool of ctx.reply.pendingTools) ctx.s2s?.sendToolResult(tool.callId, tool.result);
1656
- ctx.reply.pendingTools = [];
1657
- } else {
1658
- const stepsUsed = ctx.reply.toolCallCount;
1659
- if (stepsUsed > 0) ctx.log.info("Turn complete", {
1660
- steps: stepsUsed,
1661
- agent: ctx.agent
1662
- });
1663
- ctx.client.playAudioDone();
1664
- ctx.client.event({ type: "reply_done" });
1665
- ctx.reply.currentReplyId = null;
1666
- const durationMs = Date.now() - startMs;
1667
- if (durationMs >= REPLY_DONE_SLOW_THRESHOLD_MS) ctx.log.warn("slow reply_done dispatch", {
1668
- sid: ctx.id,
1669
- agent: ctx.agent,
1670
- durationMs,
1671
- hadTurnPromise
1672
- });
1673
- }
1674
- };
1675
- if (hadTurnPromise) ctx.turnPromise?.then(sendPending);
1676
- else sendPending();
1677
- }
1678
- function setupListeners(ctx, handle) {
1679
- handle.on("ready", ({ sessionId }) => ctx.log.info("S2S session ready", { sessionId }));
1680
- handle.on("replyStarted", ({ replyId }) => {
1681
- ctx.beginReply(replyId);
1682
- });
1683
- handle.on("sessionExpired", () => {
1684
- ctx.log.info("S2S session expired");
1685
- handle.close();
1686
- });
1687
- handle.on("audio", ({ audio }) => ctx.client.playAudioChunk(audio));
1688
- handle.on("error", (err) => {
1689
- ctx.log.error("S2S error", { message: err.message });
1690
- ctx.client.event({
1691
- type: "error",
1692
- code: "internal",
1693
- message: err.message
1694
- });
1695
- handle.close();
1696
- });
1697
- handle.on("close", (code, reason) => {
1698
- const activeReplyId = ctx.reply.currentReplyId;
1699
- if (activeReplyId !== null) ctx.log.warn("S2S closed with active reply", {
1700
- sid: ctx.id,
1701
- agent: ctx.agent,
1702
- activeReplyId,
1703
- code,
1704
- reason
1705
- });
1706
- else ctx.log.info("S2S closed", {
1707
- code,
1708
- reason
1709
- });
1710
- ctx.s2s = null;
1711
- ctx.cancelReply();
1712
- });
1713
- handle.on("event", (event) => {
1714
- switch (event.type) {
1715
- case "user_transcript":
1716
- handleUserTranscript(ctx, event.text);
1717
- break;
1718
- case "agent_transcript":
1719
- handleAgentTranscript(ctx, event.text, event._interrupted ?? false);
1720
- break;
1721
- case "tool_call": {
1722
- const p = handleToolCall(ctx, event).catch((err) => {
1723
- ctx.log.error("Tool call handler failed", { err: errorMessage(err) });
1724
- });
1725
- ctx.chainTurn(p);
1726
- break;
1727
- }
1728
- case "reply_done":
1729
- handleReplyDone(ctx);
1730
- break;
1731
- case "cancelled":
1732
- handleReplyCancelled(ctx);
1733
- break;
1734
- default: ctx.client.event(event);
1735
- }
1736
- });
1737
- }
1738
- function createS2sSession(opts) {
1739
- const { id, agent, client, toolSchemas, apiKey, s2sConfig, executeTool, createWebSocket = defaultCreateS2sWebSocket, logger: log = consoleLogger } = opts;
1740
- const agentConfig = opts.skipGreeting ? {
1741
- ...opts.agentConfig,
1742
- greeting: ""
1743
- } : opts.agentConfig;
1744
- const systemPrompt = buildSystemPrompt(agentConfig, {
1745
- hasTools: toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0,
1746
- voice: true,
1747
- toolGuidance: opts.toolGuidance
1748
- });
1749
- const s2sTools = toolSchemas.map((ts) => ({
1750
- type: "function",
1751
- name: ts.name,
1752
- description: ts.description,
1753
- parameters: ts.parameters
1754
- }));
1755
- const sessionAbort = new AbortController();
1756
- const ctx = buildCtx({
1757
- id,
1758
- agent,
1759
- client,
1760
- agentConfig,
1761
- executeTool,
1762
- log,
1763
- maxHistory: opts.maxHistory
1764
- });
1765
- const rawTimeout = agentConfig.idleTimeoutMs ?? 3e5;
1766
- const idle = createIdleTimer({
1767
- timeoutMs: rawTimeout === 0 || !Number.isFinite(rawTimeout) ? 0 : rawTimeout,
1768
- agent,
1769
- log,
1770
- client,
1771
- ctx
1772
- });
1773
- let connectGeneration = 0;
1774
- const sessionUpdatePayload = {
1775
- systemPrompt,
1776
- tools: s2sTools,
1777
- ...agentConfig.greeting ? { greeting: agentConfig.greeting } : {}
1778
- };
1779
- async function connectAndSetup() {
1780
- const generation = ++connectGeneration;
1781
- try {
1782
- const handle = await _internals.connectS2s({
1783
- apiKey,
1784
- config: s2sConfig,
1785
- createWebSocket,
1786
- logger: log,
1787
- sid: id
1788
- });
1789
- if (sessionAbort.signal.aborted || generation !== connectGeneration) {
1790
- handle.close();
1791
- return;
1792
- }
1793
- setupListeners(ctx, handle);
1794
- handle.updateSession(sessionUpdatePayload);
1795
- ctx.s2s = handle;
1796
- idle.reset();
1797
- } catch (err) {
1798
- const msg = errorMessage(err);
1799
- log.error("S2S connect failed", { error: errorDetail(err) });
1800
- client.event({
1801
- type: "error",
1802
- code: "internal",
1803
- message: msg
1804
- });
1805
- }
2028
+ async function stop() {
2029
+ handle?.close();
2030
+ handle = null;
1806
2031
  }
1807
2032
  return {
1808
- async start() {
1809
- await connectAndSetup();
1810
- },
1811
- async stop() {
1812
- if (sessionAbort.signal.aborted) return;
1813
- sessionAbort.abort();
1814
- idle.clear();
1815
- if (ctx.turnPromise !== null) await ctx.turnPromise;
1816
- ctx.s2s?.close();
1817
- },
1818
- onAudio(data) {
1819
- idle.reset();
1820
- ctx.s2s?.sendAudio(data);
1821
- },
1822
- onAudioReady() {},
1823
- onCancel() {
1824
- client.event({ type: "cancelled" });
1825
- },
1826
- onReset() {
1827
- ctx.cancelReply();
1828
- ctx.conversationMessages = [];
1829
- ctx.reply.toolCallCount = 0;
1830
- ctx.turnPromise = null;
1831
- idle.clear();
1832
- ctx.s2s?.close();
1833
- client.event({ type: "reset" });
1834
- connectAndSetup().catch((err) => log.error("S2S reset reconnect failed", { error: errorMessage(err) }));
2033
+ start,
2034
+ stop,
2035
+ sendUserAudio(bytes) {
2036
+ handle?.sendAudio(bytes);
1835
2037
  },
1836
- onHistory(incoming) {
1837
- ctx.pushMessages(...incoming.map((m) => ({
1838
- role: m.role,
1839
- content: m.content
1840
- })));
2038
+ sendToolResult(callId, result) {
2039
+ handle?.sendToolResult(callId, result);
1841
2040
  },
1842
- waitForTurn() {
1843
- return ctx.turnPromise ?? Promise.resolve();
1844
- }
1845
- };
1846
- }
1847
- //#endregion
1848
- //#region host/tool-executor.ts
1849
- /**
1850
- * Tool execution — validates arguments and invokes tool handlers.
1851
- *
1852
- * {@link executeToolCall} is the single entry point used by both the
1853
- * direct (self-hosted) runtime and the platform sandbox sidecar.
1854
- */
1855
- const yieldTick = () => new Promise((r) => setTimeout(r, 0));
1856
- function buildToolContext(opts) {
1857
- const { env, state, kv, messages, sessionId } = opts;
1858
- return {
1859
- env,
1860
- state: state ?? {},
1861
- get kv() {
1862
- if (!kv) throw new Error("KV not available");
1863
- return kv;
2041
+ cancelReply() {
2042
+ currentReplyId = null;
1864
2043
  },
1865
- messages: messages ?? [],
1866
- sessionId: sessionId ?? "",
1867
- send(event, data) {
1868
- opts.send?.(event, data);
2044
+ updateSession(config) {
2045
+ handle?.updateSession({
2046
+ systemPrompt: config.systemPrompt,
2047
+ tools: config.tools ?? [],
2048
+ ...config.greeting !== void 0 ? { greeting: config.greeting } : {}
2049
+ });
1869
2050
  }
1870
2051
  };
1871
2052
  }
1872
- async function executeToolCall(name, args, options) {
1873
- const { tool } = options;
1874
- const parsed = (tool.parameters ?? EMPTY_PARAMS).safeParse(args);
1875
- if (!parsed.success) return toolError(`Invalid arguments for tool "${name}": ${(parsed.error?.issues ?? []).map((i) => `${i.path.map(String).join(".")}: ${i.message}`).join(", ")}`);
1876
- try {
1877
- const ctx = buildToolContext(options);
1878
- await yieldTick();
1879
- const result = await pTimeout(Promise.resolve(tool.execute(parsed.data, ctx)), {
1880
- milliseconds: TOOL_EXECUTION_TIMEOUT_MS,
1881
- message: `Tool "${name}" timed out after ${TOOL_EXECUTION_TIMEOUT_MS}ms`
1882
- });
1883
- await yieldTick();
1884
- if (result == null) return "null";
1885
- return typeof result === "string" ? result : JSON.stringify(result);
1886
- } catch (err) {
1887
- const log = options.logger;
1888
- if (log) log.warn("Tool execution failed", {
1889
- tool: name,
1890
- error: errorDetail(err)
1891
- });
1892
- else console.warn(`[tool-executor] Tool execution failed: ${name}`, err);
1893
- return toolError(errorMessage(err));
1894
- }
1895
- }
1896
2053
  //#endregion
1897
2054
  //#region host/unstorage-kv.ts
1898
2055
  /**
@@ -1944,20 +2101,20 @@ function createUnstorageKv(options) {
1944
2101
  *
1945
2102
  * Audio validation is handled at the host transport layer (see server.ts).
1946
2103
  */
2104
+ const AUDIO_DONE_FRAME = JSON.stringify({ type: "audio_done" });
1947
2105
  /**
1948
2106
  * Creates a {@link ClientSink} backed by a plain WebSocket.
1949
2107
  *
1950
- * Text events are sent as JSON text frames; audio chunks are sent as
1951
- * binary frames (zero-copy).
2108
+ * Session events are sent as JSON text frames; audio chunks are sent as raw
2109
+ * PCM16 binary frames.
1952
2110
  */
1953
2111
  function createClientSink(ws, log) {
1954
- /** Send data over ws, silently dropping if the socket is not open. */
1955
2112
  function safeSend(data) {
1956
2113
  try {
1957
2114
  if (ws.readyState !== 1) return;
1958
2115
  ws.send(data);
1959
2116
  } catch (err) {
1960
- log.debug?.("safeSend: socket closed between readyState check and send", { error: errorMessage(err) });
2117
+ log.debug?.("safeSend: socket closed between readyState check and send", { error: err instanceof Error ? err.message : String(err) });
1961
2118
  }
1962
2119
  }
1963
2120
  return {
@@ -1971,7 +2128,7 @@ function createClientSink(ws, log) {
1971
2128
  safeSend(chunk);
1972
2129
  },
1973
2130
  playAudioDone() {
1974
- safeSend(JSON.stringify({ type: "audio_done" }));
2131
+ safeSend(AUDIO_DONE_FRAME);
1975
2132
  }
1976
2133
  };
1977
2134
  }
@@ -1980,35 +2137,32 @@ function handleBinaryAudio(data, session) {
1980
2137
  session.onAudio(data);
1981
2138
  return true;
1982
2139
  }
1983
- if (data instanceof ArrayBuffer) {
1984
- session.onAudio(new Uint8Array(data));
1985
- return true;
1986
- }
1987
2140
  return false;
1988
2141
  }
1989
- function handleTextMessage(data, session, log, ctx, sid) {
1990
- if (typeof data !== "string") return;
1991
- let json;
2142
+ function handleTextMessage(data, session, log, sid) {
2143
+ if (typeof data !== "string") {
2144
+ log.warn("ws: non-string, non-binary frame received; dropping", { sid });
2145
+ return;
2146
+ }
2147
+ let parsed;
1992
2148
  try {
1993
- json = JSON.parse(data);
2149
+ parsed = JSON.parse(data);
1994
2150
  } catch {
1995
- log.warn("Invalid JSON from client", {
1996
- ...ctx,
1997
- sid
2151
+ log.warn("ws: invalid JSON; dropping", {
2152
+ sid,
2153
+ data: data.slice(0, 200)
1998
2154
  });
1999
2155
  return;
2000
2156
  }
2001
- const parsed = lenientParse(ClientMessageSchema, json);
2002
- if (!parsed.ok) {
2003
- if (parsed.malformed) log.warn("Invalid client message", {
2004
- ...ctx,
2157
+ const result = lenientParse(ClientMessageSchema, parsed);
2158
+ if (!result.ok) {
2159
+ if (result.malformed) log.warn("ws: malformed client message", {
2005
2160
  sid,
2006
- error: parsed.error
2161
+ error: result.error
2007
2162
  });
2008
2163
  return;
2009
2164
  }
2010
- const msg = parsed.data;
2011
- switch (msg.type) {
2165
+ switch (result.data.type) {
2012
2166
  case "audio_ready":
2013
2167
  session.onAudioReady();
2014
2168
  break;
@@ -2019,19 +2173,19 @@ function handleTextMessage(data, session, log, ctx, sid) {
2019
2173
  session.onReset();
2020
2174
  break;
2021
2175
  case "history":
2022
- session.onHistory(msg.messages);
2176
+ session.onHistory(result.data.messages);
2023
2177
  break;
2024
2178
  default: break;
2025
2179
  }
2026
2180
  }
2027
2181
  /**
2028
- * Attaches session lifecycle handlers to a native WebSocket using
2029
- * plain JSON text frames and binary audio frames.
2182
+ * Attaches session lifecycle handlers to a native WebSocket using JSON text
2183
+ * frames for control messages and raw PCM16 binary frames for audio.
2030
2184
  *
2031
2185
  * Connection flow:
2032
- * 1. WebSocket opens → server sends `{ type: "config", ...ReadyConfig }`
2033
- * 2. Client sets up audio → sends `{ type: "audio_ready" }`
2034
- * 3. If reconnecting → client sends `{ type: "history", messages: [...] }`
2186
+ * 1. WebSocket opens → server sends JSON CONFIG frame with sampleRate, ttsSampleRate, sessionId
2187
+ * 2. Client sets up audio → sends JSON AUDIO_READY frame
2188
+ * 3. If reconnecting → client sends JSON HISTORY frame with prior messages
2035
2189
  */
2036
2190
  function wireSessionSocket(ws, opts) {
2037
2191
  const { sessions, logger: log = consoleLogger } = opts;
@@ -2041,7 +2195,7 @@ function wireSessionSocket(ws, opts) {
2041
2195
  let session = null;
2042
2196
  /** Set to true once session.start() resolves. Messages arriving before
2043
2197
  * this flag is set are buffered and replayed once the session is ready,
2044
- * preventing audio/text from being dispatched to a half-initialized session. */
2198
+ * preventing audio/frames from being dispatched to a half-initialized session. */
2045
2199
  let sessionReady = false;
2046
2200
  let messageBuffer = [];
2047
2201
  function drainBuffer() {
@@ -2049,9 +2203,8 @@ function wireSessionSocket(ws, opts) {
2049
2203
  const buf = messageBuffer;
2050
2204
  messageBuffer = null;
2051
2205
  for (const event of buf) {
2052
- const { data } = event;
2053
- if (handleBinaryAudio(data, session)) continue;
2054
- handleTextMessage(data, session, log, ctx, sid);
2206
+ if (handleBinaryAudio(event.data, session)) continue;
2207
+ handleTextMessage(event.data, session, log, sid);
2055
2208
  }
2056
2209
  }
2057
2210
  function onOpen() {
@@ -2066,7 +2219,9 @@ function wireSessionSocket(ws, opts) {
2066
2219
  opts.onSinkCreated?.(sessionId, client);
2067
2220
  ws.send(JSON.stringify({
2068
2221
  type: "config",
2069
- ...opts.readyConfig,
2222
+ audioFormat: opts.readyConfig.audioFormat,
2223
+ sampleRate: opts.readyConfig.sampleRate,
2224
+ ttsSampleRate: opts.readyConfig.ttsSampleRate,
2070
2225
  sessionId
2071
2226
  }));
2072
2227
  const timeoutMs = opts.sessionStartTimeoutMs ?? 1e4;
@@ -2099,9 +2254,8 @@ function wireSessionSocket(ws, opts) {
2099
2254
  if (messageBuffer && messageBuffer.length < 100) messageBuffer.push(event);
2100
2255
  return;
2101
2256
  }
2102
- const { data } = event;
2103
- if (handleBinaryAudio(data, session)) return;
2104
- handleTextMessage(data, session, log, ctx, sid);
2257
+ if (handleBinaryAudio(event.data, session)) return;
2258
+ handleTextMessage(event.data, session, log, sid);
2105
2259
  });
2106
2260
  ws.addEventListener("close", () => {
2107
2261
  log.info("Session disconnected", {
@@ -2132,6 +2286,30 @@ function wireSessionSocket(ws, opts) {
2132
2286
  //#endregion
2133
2287
  //#region host/runtime.ts
2134
2288
  /**
2289
+ * Resolve the API key env-var for the configured STT provider.
2290
+ *
2291
+ * Each STT provider uses its own env var (e.g. `ASSEMBLYAI_API_KEY`,
2292
+ * `DEEPGRAM_API_KEY`). We read the kind from the descriptor if it is one;
2293
+ * pre-resolved openers have no kind field so we fall back to AssemblyAI for
2294
+ * backward compatibility (openers supply their own key at open-time anyway).
2295
+ */
2296
+ function resolveSttApiKey(stt, env) {
2297
+ if ((stt != null && "kind" in stt && typeof stt.kind === "string" ? stt.kind : void 0) === "deepgram") return resolveApiKey("DEEPGRAM_API_KEY", env);
2298
+ return resolveApiKey("ASSEMBLYAI_API_KEY", env);
2299
+ }
2300
+ /**
2301
+ * Resolve the API key env-var for the configured TTS provider.
2302
+ *
2303
+ * Each TTS provider uses its own env var (e.g. `CARTESIA_API_KEY`,
2304
+ * `RIME_API_KEY`). We read the kind from the descriptor if it is one;
2305
+ * pre-resolved openers have no kind field so we fall back to Cartesia for
2306
+ * backward compatibility (openers supply their own key at open-time anyway).
2307
+ */
2308
+ function resolveTtsApiKey(tts, env) {
2309
+ if ((tts != null && "kind" in tts && typeof tts.kind === "string" ? tts.kind : void 0) === "rime") return resolveApiKey("RIME_API_KEY", env);
2310
+ return resolveApiKey("CARTESIA_API_KEY", env);
2311
+ }
2312
+ /**
2135
2313
  * Distinguish a descriptor (`{ kind, options }`) from an already-resolved
2136
2314
  * opener / `LanguageModel`. The production path always passes descriptors;
2137
2315
  * openers are a test escape hatch (fakes in `_pipeline-test-fakes.ts`).
@@ -2236,40 +2414,86 @@ function createRuntime(opts) {
2236
2414
  } : null;
2237
2415
  function createSession(sessionOpts) {
2238
2416
  sinkMap.set(sessionOpts.id, sessionOpts.client);
2239
- if (pipelineProviders) return createPipelineSession({
2240
- id: sessionOpts.id,
2417
+ const isPipeline = Boolean(pipelineProviders);
2418
+ const systemPrompt = buildSystemPrompt(agentConfig, {
2419
+ hasTools: toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0,
2420
+ voice: true,
2421
+ toolGuidance
2422
+ });
2423
+ let core = null;
2424
+ function bindCore() {
2425
+ if (!core) throw new Error("SessionCore not yet created");
2426
+ return core;
2427
+ }
2428
+ const callbacks = {
2429
+ onReplyStarted: (replyId) => bindCore().onReplyStarted(replyId),
2430
+ onReplyDone: () => bindCore().onReplyDone(),
2431
+ onCancelled: () => bindCore().onCancelled(),
2432
+ onAudioChunk: (bytes) => bindCore().onAudioChunk(bytes),
2433
+ onAudioDone: () => bindCore().onAudioDone(),
2434
+ onUserTranscript: (text) => bindCore().onUserTranscript(text),
2435
+ onAgentTranscript: (text, interrupted) => bindCore().onAgentTranscript(text, interrupted),
2436
+ onToolCall: isPipeline ? (id, name, args) => sessionOpts.client.event({
2437
+ type: "tool_call",
2438
+ toolCallId: id,
2439
+ toolName: name,
2440
+ args
2441
+ }) : (id, name, args) => bindCore().onToolCall(id, name, args),
2442
+ onError: (code, message) => bindCore().onError(code, message),
2443
+ onSpeechStarted: () => bindCore().onSpeechStarted(),
2444
+ onSpeechStopped: () => bindCore().onSpeechStopped()
2445
+ };
2446
+ let transport;
2447
+ if (pipelineProviders) transport = createPipelineTransport({
2448
+ sid: sessionOpts.id,
2241
2449
  agent: sessionOpts.agent,
2242
- client: sessionOpts.client,
2243
- agentConfig,
2244
- toolSchemas,
2245
- toolGuidance,
2246
- executeTool,
2247
2450
  stt: pipelineProviders.stt,
2248
2451
  llm: pipelineProviders.llm,
2249
2452
  tts: pipelineProviders.tts,
2250
- sttApiKey: resolveApiKey("ASSEMBLYAI_API_KEY", env),
2251
- ttsApiKey: resolveApiKey("CARTESIA_API_KEY", env),
2453
+ callbacks,
2454
+ sessionConfig: {
2455
+ systemPrompt,
2456
+ greeting: agentConfig.greeting,
2457
+ tools: toolSchemas
2458
+ },
2459
+ toolSchemas,
2460
+ executeTool,
2461
+ providerKeys: {
2462
+ stt: resolveSttApiKey(opts.stt, env),
2463
+ tts: resolveTtsApiKey(opts.tts, env)
2464
+ },
2252
2465
  sttSampleRate: s2sConfig.inputSampleRate,
2253
2466
  ttsSampleRate: s2sConfig.outputSampleRate,
2467
+ maxSteps: agentConfig.maxSteps,
2468
+ toolChoice: agentConfig.toolChoice,
2254
2469
  skipGreeting: sessionOpts.skipGreeting ?? false,
2255
2470
  logger
2256
2471
  });
2257
- const apiKey = env.ASSEMBLYAI_API_KEY ?? "";
2258
- return createS2sSession({
2472
+ else transport = createS2sTransport({
2473
+ apiKey: env.ASSEMBLYAI_API_KEY ?? "",
2474
+ s2sConfig,
2475
+ sessionConfig: {
2476
+ systemPrompt,
2477
+ tools: toolSchemas,
2478
+ ...agentConfig.greeting !== void 0 ? { greeting: agentConfig.greeting } : {}
2479
+ },
2480
+ toolSchemas,
2481
+ callbacks,
2482
+ sid: sessionOpts.id,
2483
+ agent: sessionOpts.agent,
2484
+ ...createWebSocket ? { createWebSocket } : {},
2485
+ logger
2486
+ });
2487
+ core = createSessionCore({
2259
2488
  id: sessionOpts.id,
2260
2489
  agent: sessionOpts.agent,
2261
2490
  client: sessionOpts.client,
2262
2491
  agentConfig,
2263
- toolSchemas,
2264
- toolGuidance,
2265
- apiKey,
2266
- s2sConfig,
2267
2492
  executeTool,
2268
- ...createWebSocket ? { createWebSocket } : {},
2269
- skipGreeting: sessionOpts.skipGreeting ?? false,
2270
- logger,
2271
- ...sessionOpts.resumeFrom ? { resumeFrom: sessionOpts.resumeFrom } : {}
2493
+ transport,
2494
+ logger
2272
2495
  });
2496
+ return core;
2273
2497
  }
2274
2498
  function startSession(ws, startOpts) {
2275
2499
  const resumeFrom = startOpts?.resumeFrom;
@@ -2454,4 +2678,4 @@ function createServer(options) {
2454
2678
  };
2455
2679
  }
2456
2680
  //#endregion
2457
- export { DEFAULT_S2S_CONFIG, _buildBaseCtx, _internals, buildCtx, consoleLogger, createRuntime, createS2sSession, createServer, createUnstorageKv, executeInIsolate, executeToolCall, jsonLogger, resolveAllBuiltins, wireSessionSocket };
2681
+ export { DEFAULT_S2S_CONFIG, _internals, consoleLogger, createPipelineTransport, createRuntime, createS2sTransport, createServer, createSessionCore, createUnstorageKv, executeInIsolate, executeToolCall, jsonLogger, resolveAllBuiltins, wireSessionSocket };