@alexkroman1/aai 1.2.3 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/.turbo/turbo-build.log +14 -12
  2. package/CHANGELOG.md +14 -0
  3. package/dist/host/_pipeline-test-fakes.d.ts +107 -0
  4. package/dist/host/pipeline-session-ctx.d.ts +24 -0
  5. package/dist/host/pipeline-session.d.ts +48 -0
  6. package/dist/host/providers/llm.d.ts +2 -0
  7. package/dist/host/providers/stt/assemblyai.d.ts +31 -0
  8. package/dist/host/providers/stt-barrel.d.ts +8 -0
  9. package/dist/host/providers/stt-barrel.js +92 -0
  10. package/dist/host/providers/stt.d.ts +2 -0
  11. package/dist/host/providers/tts/cartesia.d.ts +39 -0
  12. package/dist/host/providers/tts-barrel.d.ts +8 -0
  13. package/dist/host/providers/tts-barrel.js +182 -0
  14. package/dist/host/providers/tts.d.ts +2 -0
  15. package/dist/host/runtime-barrel.js +498 -80
  16. package/dist/host/runtime.d.ts +17 -0
  17. package/dist/host/s2s.d.ts +5 -0
  18. package/dist/host/session-ctx.d.ts +22 -4
  19. package/dist/host/to-vercel-tools.d.ts +44 -0
  20. package/dist/index.js +5 -0
  21. package/dist/sdk/_internal-types.d.ts +15 -1
  22. package/dist/sdk/define.d.ts +21 -0
  23. package/dist/sdk/manifest.d.ts +22 -0
  24. package/dist/sdk/protocol.d.ts +3 -3
  25. package/dist/sdk/providers.d.ts +70 -0
  26. package/dist/sdk/types.d.ts +16 -0
  27. package/exports-no-dev-deps.test.ts +39 -14
  28. package/host/_pipeline-test-fakes.ts +323 -0
  29. package/host/_test-utils.ts +1 -0
  30. package/host/integration/fixtures/README.md +49 -0
  31. package/host/integration/pipeline-reference.integration.test.ts +124 -0
  32. package/host/pipeline-session-ctx.test.ts +31 -0
  33. package/host/pipeline-session-ctx.ts +36 -0
  34. package/host/pipeline-session.test.ts +337 -0
  35. package/host/pipeline-session.ts +405 -0
  36. package/host/providers/llm.ts +3 -0
  37. package/host/providers/providers.test-d.ts +31 -0
  38. package/host/providers/stt/assemblyai.test.ts +100 -0
  39. package/host/providers/stt/assemblyai.ts +154 -0
  40. package/host/providers/stt/fixtures/assemblyai/basic-turn.json +30 -0
  41. package/host/providers/stt-barrel.ts +13 -0
  42. package/host/providers/stt.ts +3 -0
  43. package/host/providers/tts/cartesia.test.ts +210 -0
  44. package/host/providers/tts/cartesia.ts +251 -0
  45. package/host/providers/tts-barrel.ts +13 -0
  46. package/host/providers/tts.ts +3 -0
  47. package/host/runtime.test.ts +81 -1
  48. package/host/runtime.ts +61 -0
  49. package/host/s2s.test.ts +19 -0
  50. package/host/s2s.ts +10 -0
  51. package/host/session-ctx.ts +35 -8
  52. package/host/to-vercel-tools.test.ts +153 -0
  53. package/host/to-vercel-tools.ts +70 -0
  54. package/package.json +15 -1
  55. package/sdk/__snapshots__/exports.test.ts.snap +1 -0
  56. package/sdk/_internal-types.ts +16 -0
  57. package/sdk/define.test-d.ts +21 -0
  58. package/sdk/define.test.ts +33 -0
  59. package/sdk/define.ts +21 -0
  60. package/sdk/manifest.test-d.ts +14 -0
  61. package/sdk/manifest.test.ts +51 -0
  62. package/sdk/manifest.ts +39 -0
  63. package/sdk/providers.ts +90 -0
  64. package/sdk/types.ts +16 -0
  65. package/vitest.config.ts +1 -0
@@ -8,6 +8,7 @@ import { convert } from "html-to-text";
8
8
  import vm from "node:vm";
9
9
  import pTimeout from "p-timeout";
10
10
  import { createStorage, prefixStorage } from "unstorage";
11
+ import { jsonSchema, stepCountIs, streamText, tool } from "ai";
11
12
  import { createNanoEvents } from "nanoevents";
12
13
  import WsWebSocket, { WebSocketServer } from "ws";
13
14
  import fs from "node:fs";
@@ -339,6 +340,103 @@ function resolveAllBuiltins(names, opts) {
339
340
  };
340
341
  }
341
342
  //#endregion
343
+ //#region sdk/system-prompt.ts
344
+ function getFormattedDate() {
345
+ return (/* @__PURE__ */ new Date()).toLocaleDateString("en-US", {
346
+ weekday: "long",
347
+ year: "numeric",
348
+ month: "long",
349
+ day: "numeric"
350
+ });
351
+ }
352
+ const VOICE_RULES = "\n\nCRITICAL OUTPUT RULES — you MUST follow these for EVERY response:\nYour response will be spoken aloud by a TTS system and displayed as plain text.\n- NEVER use markdown: no **, no *, no _, no #, no `, no [](), no ---\n- NEVER use bullet points (-, *, •) or numbered lists (1., 2.)\n- NEVER use code blocks or inline code\n- NEVER mention tools, search, APIs, or technical failures to the user. If a tool returns no results, just answer naturally without explaining why.\n- Write exactly as you would say it out loud to a friend\n- Use short conversational sentences. To list things, say \"First,\" \"Next,\" \"Finally,\"\n- Keep responses concise — 1 to 3 sentences max";
353
+ /**
354
+ * Build the system prompt sent to the LLM from the agent configuration.
355
+ *
356
+ * Assembles the default system prompt, today's date, agent-specific instructions,
357
+ * and optional sections for tool usage preamble and voice output rules.
358
+ *
359
+ * @param config - The serializable agent configuration (name, systemPrompt, etc.).
360
+ * @param opts.hasTools - When `true`, appends a preamble instructing the LLM to
361
+ * speak a brief phrase before each tool call to fill silence.
362
+ * @param opts.voice - When `true`, appends strict voice-specific output rules
363
+ * (no markdown, no bullet points, conversational tone, concise responses).
364
+ * @returns The assembled system prompt string.
365
+ */
366
+ function buildSystemPrompt(config, opts) {
367
+ const { hasTools } = opts;
368
+ const agentInstructions = config.systemPrompt && config.systemPrompt !== DEFAULT_SYSTEM_PROMPT ? `\n\nAgent-Specific Instructions:\n${config.systemPrompt}` : "";
369
+ const toolPreamble = hasTools ? "\n\nWhen you decide to use a tool, ALWAYS say a brief natural phrase BEFORE the tool call (e.g. \"Let me look that up\" or \"One moment while I check\"). This fills silence while the tool executes. Keep preambles to one short sentence." : "";
370
+ const guidance = opts.toolGuidance && opts.toolGuidance.length > 0 ? `\n\nBuilt-in Tool Usage:\n${opts.toolGuidance.join("\n")}` : "";
371
+ return DEFAULT_SYSTEM_PROMPT + `\n\nToday's date is ${getFormattedDate()}.` + agentInstructions + toolPreamble + guidance + (opts.voice ? VOICE_RULES : "");
372
+ }
373
+ //#endregion
374
+ //#region host/session-ctx.ts
375
+ function _buildBaseCtx(opts) {
376
+ const { agentConfig, log } = opts;
377
+ const maxHistory = opts.maxHistory ?? 200;
378
+ const ctx = {
379
+ ...opts,
380
+ reply: {
381
+ pendingTools: [],
382
+ toolCallCount: 0,
383
+ currentReplyId: null
384
+ },
385
+ turnPromise: null,
386
+ conversationMessages: [],
387
+ maxHistory,
388
+ consumeToolCallStep(_name, replyId) {
389
+ if (replyId === null || replyId !== ctx.reply.currentReplyId) return toolError("Reply was interrupted. Discarding stale tool call.");
390
+ const maxSteps = agentConfig.maxSteps;
391
+ ctx.reply.toolCallCount++;
392
+ if (maxSteps !== void 0 && ctx.reply.toolCallCount > maxSteps) {
393
+ log.info("maxSteps exceeded, refusing tool call", {
394
+ toolCallCount: ctx.reply.toolCallCount,
395
+ maxSteps
396
+ });
397
+ return toolError("Maximum tool steps reached. Please respond to the user now.");
398
+ }
399
+ return null;
400
+ },
401
+ pushMessages(...msgs) {
402
+ ctx.conversationMessages.push(...msgs);
403
+ if (maxHistory > 0 && ctx.conversationMessages.length > maxHistory) ctx.conversationMessages.splice(0, ctx.conversationMessages.length - maxHistory);
404
+ },
405
+ beginReply(replyId) {
406
+ ctx.reply = {
407
+ pendingTools: [],
408
+ toolCallCount: 0,
409
+ currentReplyId: replyId
410
+ };
411
+ ctx.turnPromise = null;
412
+ },
413
+ cancelReply() {
414
+ ctx.reply = {
415
+ pendingTools: [],
416
+ toolCallCount: 0,
417
+ currentReplyId: null
418
+ };
419
+ },
420
+ chainTurn(p) {
421
+ ctx.turnPromise = (ctx.turnPromise ?? Promise.resolve()).then(() => p);
422
+ }
423
+ };
424
+ return ctx;
425
+ }
426
+ function buildCtx(opts) {
427
+ const base = _buildBaseCtx(opts);
428
+ base.s2s = null;
429
+ return base;
430
+ }
431
+ //#endregion
432
+ //#region host/pipeline-session-ctx.ts
433
+ function buildPipelineCtx(opts) {
434
+ const base = _buildBaseCtx(opts);
435
+ base.stt = null;
436
+ base.tts = null;
437
+ return base;
438
+ }
439
+ //#endregion
342
440
  //#region host/runtime-config.ts
343
441
  /**
344
442
  * Runtime dependencies injected into the session pipeline.
@@ -385,35 +483,371 @@ const DEFAULT_S2S_CONFIG = {
385
483
  outputSampleRate: DEFAULT_TTS_SAMPLE_RATE
386
484
  };
387
485
  //#endregion
388
- //#region sdk/system-prompt.ts
389
- function getFormattedDate() {
390
- return (/* @__PURE__ */ new Date()).toLocaleDateString("en-US", {
391
- weekday: "long",
392
- year: "numeric",
393
- month: "long",
394
- day: "numeric"
395
- });
396
- }
397
- const VOICE_RULES = "\n\nCRITICAL OUTPUT RULES — you MUST follow these for EVERY response:\nYour response will be spoken aloud by a TTS system and displayed as plain text.\n- NEVER use markdown: no **, no *, no _, no #, no `, no [](), no ---\n- NEVER use bullet points (-, *, •) or numbered lists (1., 2.)\n- NEVER use code blocks or inline code\n- NEVER mention tools, search, APIs, or technical failures to the user. If a tool returns no results, just answer naturally without explaining why.\n- Write exactly as you would say it out loud to a friend\n- Use short conversational sentences. To list things, say \"First,\" \"Next,\" \"Finally,\"\n- Keep responses concise — 1 to 3 sentences max";
486
+ //#region host/to-vercel-tools.ts
398
487
  /**
399
- * Build the system prompt sent to the LLM from the agent configuration.
488
+ * Converts agent {@link ToolSchema}[] to Vercel AI SDK tools with `execute`
489
+ * delegation to the agent's {@link ExecuteTool} function.
400
490
  *
401
- * Assembles the default system prompt, today's date, agent-specific instructions,
402
- * and optional sections for tool usage preamble and voice output rules.
491
+ * The pipeline orchestrator passes the output to `streamText({ tools })`.
492
+ * Each produced tool's `execute` closure calls
493
+ * `ctx.executeTool(name, args, sessionId, messages(), { signal, toolCallId })`,
494
+ * so the existing agent tool infrastructure (argument validation, KV, hooks,
495
+ * timeout) remains the single source of truth for tool behavior.
403
496
  *
404
- * @param config - The serializable agent configuration (name, systemPrompt, etc.).
405
- * @param opts.hasTools - When `true`, appends a preamble instructing the LLM to
406
- * speak a brief phrase before each tool call to fill silence.
407
- * @param opts.voice - When `true`, appends strict voice-specific output rules
408
- * (no markdown, no bullet points, conversational tone, concise responses).
409
- * @returns The assembled system prompt string.
497
+ * Per-call `options.abortSignal` (forwarded by `streamText` when the
498
+ * outer turn is aborted, e.g. barge-in) takes precedence over the
499
+ * bag-level `ctx.signal` so individual invocations respect streamText
500
+ * aborts.
410
501
  */
411
- function buildSystemPrompt(config, opts) {
412
- const { hasTools } = opts;
413
- const agentInstructions = config.systemPrompt && config.systemPrompt !== DEFAULT_SYSTEM_PROMPT ? `\n\nAgent-Specific Instructions:\n${config.systemPrompt}` : "";
414
- const toolPreamble = hasTools ? "\n\nWhen you decide to use a tool, ALWAYS say a brief natural phrase BEFORE the tool call (e.g. \"Let me look that up\" or \"One moment while I check\"). This fills silence while the tool executes. Keep preambles to one short sentence." : "";
415
- const guidance = opts.toolGuidance && opts.toolGuidance.length > 0 ? `\n\nBuilt-in Tool Usage:\n${opts.toolGuidance.join("\n")}` : "";
416
- return DEFAULT_SYSTEM_PROMPT + `\n\nToday's date is ${getFormattedDate()}.` + agentInstructions + toolPreamble + guidance + (opts.voice ? VOICE_RULES : "");
502
+ /**
503
+ * Convert an array of {@link ToolSchema} to a Vercel AI SDK `ToolSet`
504
+ * (record keyed by tool name).
505
+ *
506
+ * Uses the v6 `tool()` helper with `inputSchema: jsonSchema(...)` wrapping
507
+ * the agent's JSON Schema `parameters`. Execution is delegated to
508
+ * `ctx.executeTool` so validation, KV, timeouts, and hooks keep working.
509
+ */
510
+ function toVercelTools(schemas, ctx) {
511
+ const out = {};
512
+ for (const schema of schemas) out[schema.name] = tool({
513
+ description: schema.description,
514
+ inputSchema: jsonSchema(schema.parameters),
515
+ execute: async (args, options) => {
516
+ const input = args ?? {};
517
+ const signal = options.abortSignal ?? ctx.signal;
518
+ const opts = {};
519
+ if (signal !== void 0) opts.signal = signal;
520
+ if (options.toolCallId !== void 0) opts.toolCallId = options.toolCallId;
521
+ return ctx.executeTool(schema.name, input, ctx.sessionId, ctx.messages(), opts);
522
+ }
523
+ });
524
+ return out;
525
+ }
526
+ //#endregion
527
+ //#region host/pipeline-session.ts
528
+ function toModelMessage(m) {
529
+ if (m.role === "user") return {
530
+ role: "user",
531
+ content: m.content
532
+ };
533
+ if (m.role === "assistant") return {
534
+ role: "assistant",
535
+ content: m.content
536
+ };
537
+ return {
538
+ role: "assistant",
539
+ content: m.content
540
+ };
541
+ }
542
+ function emitError(client, code, message) {
543
+ client.event({
544
+ type: "error",
545
+ code,
546
+ message
547
+ });
548
+ }
549
+ function handleStreamPart(part, deps) {
550
+ switch (part.type) {
551
+ case "text-delta": {
552
+ const delta = part.text ?? "";
553
+ if (delta.length === 0) return;
554
+ deps.onTextDelta(delta);
555
+ deps.tts?.sendText(delta);
556
+ deps.client.event({
557
+ type: "agent_transcript",
558
+ text: delta
559
+ });
560
+ return;
561
+ }
562
+ case "tool-call": {
563
+ const input = part.input ?? {};
564
+ deps.client.event({
565
+ type: "tool_call",
566
+ toolCallId: part.toolCallId ?? "",
567
+ toolName: part.toolName ?? "",
568
+ args: input
569
+ });
570
+ return;
571
+ }
572
+ case "tool-result": {
573
+ const output = part.output;
574
+ const resultString = typeof output === "string" ? output : JSON.stringify(output);
575
+ deps.client.event({
576
+ type: "tool_call_done",
577
+ toolCallId: part.toolCallId ?? "",
578
+ result: resultString
579
+ });
580
+ return;
581
+ }
582
+ case "error": {
583
+ const msg = errorMessage(part.error);
584
+ deps.log.error("LLM stream error", {
585
+ message: msg,
586
+ sessionId: deps.sessionId
587
+ });
588
+ emitError(deps.client, "llm", msg);
589
+ return;
590
+ }
591
+ default: return;
592
+ }
593
+ }
594
+ /** Create a pluggable-provider voice session. */
595
+ function createPipelineSession(opts) {
596
+ const log = opts.logger ?? consoleLogger;
597
+ const sampleRate = opts.sampleRate ?? 16e3;
598
+ const { client, agentConfig, toolSchemas, executeTool } = opts;
599
+ const systemPrompt = buildSystemPrompt(agentConfig, {
600
+ hasTools: toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0,
601
+ voice: true,
602
+ toolGuidance: opts.toolGuidance
603
+ });
604
+ const ctx = buildPipelineCtx({
605
+ id: opts.id,
606
+ agent: opts.agent,
607
+ client,
608
+ agentConfig,
609
+ executeTool,
610
+ log,
611
+ maxHistory: opts.maxHistory
612
+ });
613
+ const sessionAbort = new AbortController();
614
+ let audioReady = false;
615
+ let turnController = null;
616
+ let nextReplyId = 0;
617
+ const sttSubs = [];
618
+ const ttsSubs = [];
619
+ function onSttPartial(_text) {
620
+ if (turnController === null) return;
621
+ log.info("Pipeline barge-in", { sessionId: opts.id });
622
+ turnController.abort();
623
+ turnController = null;
624
+ ctx.tts?.cancel();
625
+ ctx.cancelReply();
626
+ client.event({ type: "cancelled" });
627
+ }
628
+ function onSttFinal(text) {
629
+ const trimmed = text.trim();
630
+ if (trimmed.length === 0) return;
631
+ client.event({
632
+ type: "user_transcript",
633
+ text
634
+ });
635
+ const turn = runTurn(trimmed).catch((err) => {
636
+ log.error("Pipeline turn crashed", {
637
+ error: errorMessage(err),
638
+ sessionId: opts.id
639
+ });
640
+ });
641
+ ctx.chainTurn(turn);
642
+ }
643
+ function onSttError(err) {
644
+ log.error("STT error", {
645
+ code: err.code,
646
+ message: err.message,
647
+ sessionId: opts.id
648
+ });
649
+ emitError(client, "stt", err.message);
650
+ }
651
+ function onTtsError(err) {
652
+ log.error("TTS error", {
653
+ code: err.code,
654
+ message: err.message,
655
+ sessionId: opts.id
656
+ });
657
+ emitError(client, "tts", err.message);
658
+ }
659
+ async function consumeLlmStream(ctl, messages, tools, onDelta) {
660
+ const deps = {
661
+ client,
662
+ tts: ctx.tts,
663
+ log,
664
+ sessionId: opts.id,
665
+ onTextDelta: onDelta
666
+ };
667
+ try {
668
+ const maxSteps = agentConfig.maxSteps ?? 5;
669
+ const result = streamText({
670
+ model: opts.llm,
671
+ system: systemPrompt,
672
+ messages,
673
+ tools,
674
+ stopWhen: stepCountIs(maxSteps),
675
+ abortSignal: ctl.signal
676
+ });
677
+ for await (const part of result.fullStream) {
678
+ if (ctl.signal.aborted) break;
679
+ handleStreamPart(part, deps);
680
+ }
681
+ } catch (err) {
682
+ if (!ctl.signal.aborted) {
683
+ const msg = errorMessage(err);
684
+ log.error("LLM streamText failed", {
685
+ error: msg,
686
+ sessionId: opts.id
687
+ });
688
+ emitError(client, "llm", msg);
689
+ }
690
+ }
691
+ }
692
+ function flushTtsAndWait() {
693
+ const tts = ctx.tts;
694
+ if (!tts) return Promise.resolve();
695
+ return new Promise((resolve) => {
696
+ const off = tts.on("done", () => {
697
+ off();
698
+ resolve();
699
+ });
700
+ tts.flush();
701
+ });
702
+ }
703
+ async function runTurn(userText) {
704
+ const replyId = `pipeline-${++nextReplyId}`;
705
+ ctx.beginReply(replyId);
706
+ ctx.pushMessages({
707
+ role: "user",
708
+ content: userText
709
+ });
710
+ const ctl = new AbortController();
711
+ turnController = ctl;
712
+ const tools = toVercelTools(toolSchemas, {
713
+ executeTool,
714
+ sessionId: opts.id,
715
+ messages: () => ctx.conversationMessages,
716
+ signal: ctl.signal
717
+ });
718
+ const messages = ctx.conversationMessages.map(toModelMessage);
719
+ let accumulated = "";
720
+ await consumeLlmStream(ctl, messages, tools, (delta) => {
721
+ accumulated += delta;
722
+ });
723
+ if (ctl.signal.aborted) {
724
+ if (turnController === ctl) turnController = null;
725
+ return;
726
+ }
727
+ await flushTtsAndWait();
728
+ if (ctl.signal.aborted) {
729
+ if (turnController === ctl) turnController = null;
730
+ return;
731
+ }
732
+ if (accumulated.length > 0) ctx.pushMessages({
733
+ role: "assistant",
734
+ content: accumulated
735
+ });
736
+ client.playAudioDone();
737
+ client.event({ type: "reply_done" });
738
+ if (turnController === ctl) turnController = null;
739
+ }
740
+ async function openProviders() {
741
+ const [sttResult, ttsResult] = await Promise.allSettled([opts.stt.open({
742
+ sampleRate,
743
+ apiKey: opts.sttApiKey,
744
+ sttPrompt: agentConfig.sttPrompt,
745
+ signal: sessionAbort.signal
746
+ }), opts.tts.open({
747
+ sampleRate,
748
+ apiKey: opts.ttsApiKey,
749
+ signal: sessionAbort.signal
750
+ })]);
751
+ if (sttResult.status === "rejected") {
752
+ const msg = errorMessage(sttResult.reason);
753
+ log.error("STT open failed", {
754
+ error: msg,
755
+ sessionId: opts.id
756
+ });
757
+ emitError(client, "stt", msg);
758
+ }
759
+ if (ttsResult.status === "rejected") {
760
+ const msg = errorMessage(ttsResult.reason);
761
+ log.error("TTS open failed", {
762
+ error: msg,
763
+ sessionId: opts.id
764
+ });
765
+ emitError(client, "tts", msg);
766
+ }
767
+ const aborted = sessionAbort.signal.aborted;
768
+ const sttFailed = sttResult.status === "rejected";
769
+ const ttsFailed = ttsResult.status === "rejected";
770
+ const teardown = aborted || sttFailed || ttsFailed;
771
+ if (sttResult.status === "fulfilled") {
772
+ const sttSession = sttResult.value;
773
+ if (teardown) await sttSession.close().catch(() => void 0);
774
+ else {
775
+ ctx.stt = sttSession;
776
+ sttSubs.push(sttSession.on("partial", onSttPartial));
777
+ sttSubs.push(sttSession.on("final", onSttFinal));
778
+ sttSubs.push(sttSession.on("error", onSttError));
779
+ }
780
+ }
781
+ if (ttsResult.status === "fulfilled") {
782
+ const ttsSession = ttsResult.value;
783
+ if (teardown) await ttsSession.close().catch(() => void 0);
784
+ else {
785
+ ctx.tts = ttsSession;
786
+ ttsSubs.push(ttsSession.on("audio", (pcm) => {
787
+ client.playAudioChunk(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
788
+ }));
789
+ ttsSubs.push(ttsSession.on("error", onTtsError));
790
+ }
791
+ }
792
+ }
793
+ return {
794
+ async start() {
795
+ await openProviders();
796
+ },
797
+ async stop() {
798
+ if (sessionAbort.signal.aborted) return;
799
+ sessionAbort.abort();
800
+ turnController?.abort();
801
+ for (const off of sttSubs) off();
802
+ for (const off of ttsSubs) off();
803
+ sttSubs.length = 0;
804
+ ttsSubs.length = 0;
805
+ if (ctx.turnPromise !== null) await ctx.turnPromise;
806
+ await ctx.stt?.close().catch(() => {});
807
+ await ctx.tts?.close().catch(() => {});
808
+ },
809
+ onAudio(data) {
810
+ if (!audioReady) return;
811
+ const offset = data.byteOffset;
812
+ const length = data.byteLength;
813
+ let pcm;
814
+ if (offset % 2 === 0 && length % 2 === 0) pcm = new Int16Array(data.buffer, offset, length / 2);
815
+ else {
816
+ const copy = new Uint8Array(length - length % 2);
817
+ copy.set(data.subarray(0, copy.byteLength));
818
+ pcm = new Int16Array(copy.buffer);
819
+ }
820
+ ctx.stt?.sendAudio(pcm);
821
+ },
822
+ onAudioReady() {
823
+ audioReady = true;
824
+ },
825
+ onCancel() {
826
+ turnController?.abort();
827
+ turnController = null;
828
+ ctx.tts?.cancel();
829
+ ctx.cancelReply();
830
+ client.event({ type: "cancelled" });
831
+ },
832
+ onReset() {
833
+ turnController?.abort();
834
+ turnController = null;
835
+ ctx.tts?.cancel();
836
+ ctx.cancelReply();
837
+ ctx.conversationMessages = [];
838
+ ctx.turnPromise = null;
839
+ client.event({ type: "reset" });
840
+ },
841
+ onHistory(incoming) {
842
+ ctx.pushMessages(...incoming.map((m) => ({
843
+ role: m.role,
844
+ content: m.content
845
+ })));
846
+ },
847
+ waitForTurn() {
848
+ return ctx.turnPromise ?? Promise.resolve();
849
+ }
850
+ };
417
851
  }
418
852
  //#endregion
419
853
  //#region host/s2s.ts
@@ -544,6 +978,10 @@ function connectS2s(opts) {
544
978
  }
545
979
  ws.send(`{"type":"input.audio","audio":"${uint8ToBase64(audio)}"}`);
546
980
  },
981
+ sendAudioRaw(jsonFrame) {
982
+ if (ws.readyState !== 1) return;
983
+ ws.send(jsonFrame);
984
+ },
547
985
  sendToolResult(callId, result) {
548
986
  const msg = {
549
987
  type: "tool.result",
@@ -639,60 +1077,6 @@ function connectS2s(opts) {
639
1077
  });
640
1078
  }
641
1079
  //#endregion
642
- //#region host/session-ctx.ts
643
- function buildCtx(opts) {
644
- const { agentConfig, log } = opts;
645
- const maxHistory = opts.maxHistory ?? 200;
646
- const ctx = {
647
- ...opts,
648
- s2s: null,
649
- reply: {
650
- pendingTools: [],
651
- toolCallCount: 0,
652
- currentReplyId: null
653
- },
654
- turnPromise: null,
655
- conversationMessages: [],
656
- maxHistory,
657
- consumeToolCallStep(_name, replyId) {
658
- if (replyId === null || replyId !== ctx.reply.currentReplyId) return toolError("Reply was interrupted. Discarding stale tool call.");
659
- const maxSteps = agentConfig.maxSteps;
660
- ctx.reply.toolCallCount++;
661
- if (maxSteps !== void 0 && ctx.reply.toolCallCount > maxSteps) {
662
- log.info("maxSteps exceeded, refusing tool call", {
663
- toolCallCount: ctx.reply.toolCallCount,
664
- maxSteps
665
- });
666
- return toolError("Maximum tool steps reached. Please respond to the user now.");
667
- }
668
- return null;
669
- },
670
- pushMessages(...msgs) {
671
- ctx.conversationMessages.push(...msgs);
672
- if (maxHistory > 0 && ctx.conversationMessages.length > maxHistory) ctx.conversationMessages.splice(0, ctx.conversationMessages.length - maxHistory);
673
- },
674
- beginReply(replyId) {
675
- ctx.reply = {
676
- pendingTools: [],
677
- toolCallCount: 0,
678
- currentReplyId: replyId
679
- };
680
- ctx.turnPromise = null;
681
- },
682
- cancelReply() {
683
- ctx.reply = {
684
- pendingTools: [],
685
- toolCallCount: 0,
686
- currentReplyId: null
687
- };
688
- },
689
- chainTurn(p) {
690
- ctx.turnPromise = (ctx.turnPromise ?? Promise.resolve()).then(() => p);
691
- }
692
- };
693
- return ctx;
694
- }
695
- //#endregion
696
1080
  //#region host/session.ts
697
1081
  /** @internal Not part of the public API. Exposed for testing only. */
698
1082
  const _internals = { connectS2s };
@@ -1288,6 +1672,17 @@ function createLocalKv() {
1288
1672
  return createUnstorageKv({ storage: createStorage() });
1289
1673
  }
1290
1674
  /**
1675
+ * Resolve an API key host-side for pipeline providers.
1676
+ *
1677
+ * Checks the agent's declared env first, then the host process env as a
1678
+ * fallback. Returns `""` when absent — pipeline providers surface a clear
1679
+ * `MissingCredentialsError` via their `open()` that the orchestrator
1680
+ * converts to a `session.error` wire event.
1681
+ */
1682
+ function resolveApiKey(envVar, env) {
1683
+ return env[envVar] ?? process.env[envVar] ?? "";
1684
+ }
1685
+ /**
1291
1686
  * Create an agent runtime — the execution engine for a voice agent.
1292
1687
  *
1293
1688
  * Merges built-in and custom tool definitions, builds tool schemas for the
@@ -1301,6 +1696,9 @@ function createLocalKv() {
1301
1696
  */
1302
1697
  function createRuntime(opts) {
1303
1698
  const { agent, env, kv = createLocalKv(), createWebSocket, logger = consoleLogger, s2sConfig = DEFAULT_S2S_CONFIG, sessionStartTimeoutMs, shutdownTimeoutMs = DEFAULT_SHUTDOWN_TIMEOUT_MS } = opts;
1699
+ const providerCount = (opts.stt != null ? 1 : 0) + (opts.llm != null ? 1 : 0) + (opts.tts != null ? 1 : 0);
1700
+ if (providerCount !== 0 && providerCount !== 3) throw new Error("stt, llm, and tts must be set together");
1701
+ const mode = providerCount === 3 ? "pipeline" : "s2s";
1304
1702
  const agentConfig = toAgentConfig(agent);
1305
1703
  const sessions = /* @__PURE__ */ new Map();
1306
1704
  const sinkMap = /* @__PURE__ */ new Map();
@@ -1365,6 +1763,26 @@ function createRuntime(opts) {
1365
1763
  }
1366
1764
  function createSession(sessionOpts) {
1367
1765
  sinkMap.set(sessionOpts.id, sessionOpts.client);
1766
+ if (mode === "pipeline") {
1767
+ const stt = opts.stt;
1768
+ const llm = opts.llm;
1769
+ const tts = opts.tts;
1770
+ return createPipelineSession({
1771
+ id: sessionOpts.id,
1772
+ agent: sessionOpts.agent,
1773
+ client: sessionOpts.client,
1774
+ agentConfig,
1775
+ toolSchemas,
1776
+ toolGuidance,
1777
+ executeTool,
1778
+ stt,
1779
+ llm,
1780
+ tts,
1781
+ sttApiKey: resolveApiKey("ASSEMBLYAI_API_KEY", env),
1782
+ ttsApiKey: resolveApiKey("CARTESIA_API_KEY", env),
1783
+ logger
1784
+ });
1785
+ }
1368
1786
  const apiKey = env.ASSEMBLYAI_API_KEY ?? "";
1369
1787
  return createS2sSession({
1370
1788
  id: sessionOpts.id,
@@ -1565,4 +1983,4 @@ function createServer(options) {
1565
1983
  };
1566
1984
  }
1567
1985
  //#endregion
1568
- export { DEFAULT_S2S_CONFIG, _internals, buildCtx, consoleLogger, createRuntime, createS2sSession, createServer, createUnstorageKv, executeInIsolate, executeToolCall, jsonLogger, resolveAllBuiltins, wireSessionSocket };
1986
+ export { DEFAULT_S2S_CONFIG, _buildBaseCtx, _internals, buildCtx, consoleLogger, createRuntime, createS2sSession, createServer, createUnstorageKv, executeInIsolate, executeToolCall, jsonLogger, resolveAllBuiltins, wireSessionSocket };
@@ -9,6 +9,7 @@ import { type ToolSchema } from "../sdk/_internal-types.ts";
9
9
  import type { Kv } from "../sdk/kv.ts";
10
10
  import type { ClientSink } from "../sdk/protocol.ts";
11
11
  import { type ReadyConfig } from "../sdk/protocol.ts";
12
+ import type { LlmProvider, SttProvider, TtsProvider } from "../sdk/providers.ts";
12
13
  import type { AgentDef } from "../sdk/types.ts";
13
14
  import type { Logger, S2SConfig } from "./runtime-config.ts";
14
15
  import type { CreateS2sWebSocket } from "./s2s.ts";
@@ -89,6 +90,22 @@ export type RuntimeOptions = {
89
90
  * their own fetch wrapper.
90
91
  */
91
92
  fetch?: typeof globalThis.fetch | undefined;
93
+ /**
94
+ * Pluggable STT provider. Must be set together with `llm` and `tts` to
95
+ * route sessions through the pipeline path; leave all three unset for
96
+ * the default AssemblyAI Streaming Speech-to-Speech (S2S) path.
97
+ */
98
+ stt?: SttProvider | undefined;
99
+ /**
100
+ * Pluggable LLM provider (Vercel AI SDK `LanguageModel`). Must be set
101
+ * together with `stt` and `tts` to route sessions through the pipeline path.
102
+ */
103
+ llm?: LlmProvider | undefined;
104
+ /**
105
+ * Pluggable TTS provider. Must be set together with `stt` and `llm` to
106
+ * route sessions through the pipeline path.
107
+ */
108
+ tts?: TtsProvider | undefined;
92
109
  };
93
110
  /**
94
111
  * The agent runtime returned by {@link createRuntime}.
@@ -62,6 +62,11 @@ export type S2sEvents = {
62
62
  export type S2sHandle = {
63
63
  on<K extends keyof S2sEvents>(event: K, cb: S2sEvents[K]): Unsubscribe;
64
64
  sendAudio(audio: Uint8Array): void;
65
+ /**
66
+ * Send a pre-encoded audio wire frame. For perf-critical callers (load tests)
67
+ * that batch-encode up front. Skips logging; caller owns wire format.
68
+ */
69
+ sendAudioRaw(jsonFrame: string): void;
65
70
  sendToolResult(callId: string, result: string): void;
66
71
  updateSession(config: S2sSessionConfig): void;
67
72
  resumeSession(sessionId: string): void;