npm - @alexkroman1/aai - Versions diffs - 1.4.0 → 1.4.2 - Mend

@alexkroman1/aai 1.4.0 → 1.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

package/.turbo/turbo-build.log +4 -4
package/CHANGELOG.md +12 -0
package/dist/host/pipeline-session.d.ts +6 -2
package/dist/host/runtime-barrel.js +69 -14
package/host/pipeline-session.test.ts +112 -12
package/host/pipeline-session.ts +53 -9
package/host/providers/resolve.ts +4 -1
package/host/providers/tts/cartesia.test.ts +16 -6
package/host/providers/tts/cartesia.ts +25 -11
package/host/runtime.ts +3 -0
package/package.json +1 -1

package/.turbo/turbo-build.log CHANGED Viewed

@@ -1,5 +1,5 @@
-> @alexkroman1/aai@1.4.0 build /home/runner/work/agent/agent/packages/aai
+> @alexkroman1/aai@1.4.2 build /home/runner/work/agent/agent/packages/aai
 > tsdown && tsc -p tsconfig.build.json
 [34mℹ[39m [34mtsdown v0.21.7[39m powered by [38;2;255;126;23mrolldown v1.0.0-rc.12[39m
@@ -8,7 +8,7 @@
 [34mℹ[39m target: [34mnode22[39m
 [34mℹ[39m tsconfig: [34mtsconfig.json[39m
 [34mℹ[39m Build start
-[34mℹ[39m [2mdist/[22m[1mhost/runtime-barrel.js[22m       [2m74.18 kB[22m [2m│ gzip: 22.09 kB[22m
+[34mℹ[39m [2mdist/[22m[1mhost/runtime-barrel.js[22m       [2m75.92 kB[22m [2m│ gzip: 22.48 kB[22m
 [34mℹ[39m [2mdist/[22m[1msdk/protocol.js[22m              [2m 4.75 kB[22m [2m│ gzip:  1.76 kB[22m
 [34mℹ[39m [2mdist/[22m[1mindex.js[22m                     [2m 2.88 kB[22m [2m│ gzip:  1.24 kB[22m
 [34mℹ[39m [2mdist/[22m[1msdk/manifest-barrel.js[22m       [2m 0.36 kB[22m [2m│ gzip:  0.20 kB[22m
@@ -22,5 +22,5 @@
 [34mℹ[39m [2mdist/[22massemblyai-Cxg9eobY.js       [2m 0.53 kB[22m [2m│ gzip:  0.35 kB[22m
 [34mℹ[39m [2mdist/[22manthropic-BrUCPKUc.js        [2m 0.23 kB[22m [2m│ gzip:  0.18 kB[22m
 [34mℹ[39m [2mdist/[22mcartesia-DwDk2tEu.js         [2m 0.22 kB[22m [2m│ gzip:  0.17 kB[22m
-[34mℹ[39m 14 files, total: 99.00 kB
-[32m✔[39m Build complete in [32m39ms[39m
+[34mℹ[39m 14 files, total: 100.74 kB
+[32m✔[39m Build complete in [32m48ms[39m

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,17 @@
 # @alexkroman1/aai
+## 1.4.2
+### Patch Changes
+- f877a6f: Fix pipeline mode: play greeting, emit a single agent_transcript per turn, open TTS at the client's playback sample rate, and stop the Cartesia adapter from eagerly rotating its context (which was silently dropping in-flight audio chunks).
+## 1.4.1
+### Patch Changes
+- 63de397: Pass explicit baseURL to createAnthropic so the SDK's loadOptionalSetting returns before reading process.env['ANTHROPIC_BASE_URL']. The Deno platform server runs without --allow-env, and the missing baseURL caused pipeline-mode sessions to crash on first use.
 ## 1.4.0
 ## 1.3.2

package/dist/host/pipeline-session.d.ts CHANGED Viewed

@@ -37,8 +37,12 @@ export interface PipelineSessionOptions {
     sttApiKey: string;
     /** TTS API key. */
     ttsApiKey: string;
-    /** Audio sample rate (PCM16, Hz). Defaults to {@link DEFAULT_STT_SAMPLE_RATE}. */
-    sampleRate?: number | undefined;
+    /** STT audio sample rate (PCM16, Hz). Defaults to {@link DEFAULT_STT_SAMPLE_RATE}. */
+    sttSampleRate?: number | undefined;
+    /** TTS audio sample rate (PCM16, Hz). Must match the client's playback AudioContext rate. Defaults to {@link DEFAULT_TTS_SAMPLE_RATE}. */
+    ttsSampleRate?: number | undefined;
+    /** Skip the initial greeting audio on connect (used for session resume). */
+    skipGreeting?: boolean | undefined;
     /** Logger. Defaults to the console logger. */
     logger?: Logger | undefined;
     /** Sliding-window conversation history size. */

package/dist/host/runtime-barrel.js CHANGED Viewed

@@ -560,10 +560,6 @@ function handleStreamPart(part, deps) {
 			if (delta.length === 0) return;
 			deps.onTextDelta(delta);
 			deps.tts?.sendText(delta);
-			deps.client.event({
-				type: "agent_transcript",
-				text: delta
-			});
 			return;
 		}
 		case "tool-call": {
@@ -601,7 +597,8 @@ function handleStreamPart(part, deps) {
 /** Create a pluggable-provider voice session. */
 function createPipelineSession(opts) {
 	const log = opts.logger ?? consoleLogger;
-	const sampleRate = opts.sampleRate ?? 16e3;
+	const sttSampleRate = opts.sttSampleRate ?? 16e3;
+	const ttsSampleRate = opts.ttsSampleRate ?? 24e3;
 	const { client, agentConfig, toolSchemas, executeTool } = opts;
 	const systemPrompt = buildSystemPrompt(agentConfig, {
 		hasTools: toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0,
@@ -797,15 +794,44 @@ function createPipelineSession(opts) {
 			if (turnController === ctl) turnController = null;
 			return;
 		}
+		if (accumulated.length > 0) {
+			client.event({
+				type: "agent_transcript",
+				text: accumulated
+			});
+			ctx.pushMessages({
+				role: "assistant",
+				content: accumulated
+			});
+		}
 		await flushTtsAndWait(ctl.signal);
 		if (ctl.signal.aborted) {
 			if (turnController === ctl) turnController = null;
 			return;
 		}
-		if (accumulated.length > 0) ctx.pushMessages({
+		client.playAudioDone();
+		client.event({ type: "reply_done" });
+		if (turnController === ctl) turnController = null;
+	}
+	async function runGreeting(text) {
+		const replyId = `pipeline-greeting-${++nextReplyId}`;
+		ctx.beginReply(replyId);
+		const ctl = new AbortController();
+		turnController = ctl;
+		client.event({
+			type: "agent_transcript",
+			text
+		});
+		ctx.pushMessages({
 			role: "assistant",
-			content: accumulated
+			content: text
 		});
+		ctx.tts?.sendText(text);
+		await flushTtsAndWait(ctl.signal);
+		if (ctl.signal.aborted) {
+			if (turnController === ctl) turnController = null;
+			return;
+		}
 		client.playAudioDone();
 		client.event({ type: "reply_done" });
 		if (turnController === ctl) turnController = null;
@@ -841,12 +867,12 @@ function createPipelineSession(opts) {
 	}
 	async function openProviders() {
 		const [sttResult, ttsResult] = await Promise.allSettled([opts.stt.open({
-			sampleRate,
+			sampleRate: sttSampleRate,
 			apiKey: opts.sttApiKey,
 			sttPrompt: agentConfig.sttPrompt,
 			signal: sessionAbort.signal
 		}), opts.tts.open({
-			sampleRate,
+			sampleRate: ttsSampleRate,
 			apiKey: opts.ttsApiKey,
 			signal: sessionAbort.signal
 		})]);
@@ -890,7 +916,18 @@ function createPipelineSession(opts) {
 			ctx.stt?.sendAudio(pcm);
 		},
 		onAudioReady() {
+			if (audioReady || terminated) return;
 			audioReady = true;
+			if (opts.skipGreeting) return;
+			const greeting = agentConfig.greeting;
+			if (!greeting) return;
+			const turn = runGreeting(greeting).catch((err) => {
+				log.error("Pipeline greeting failed", {
+					error: errorMessage(err),
+					sessionId: opts.id
+				});
+			});
+			ctx.chainTurn(turn);
 		},
 		onCancel() {
 			if (terminated) return;
@@ -1076,9 +1113,21 @@ function openCartesia(opts) {
 			* Reset whenever a fresh context is minted (i.e. at turn boundaries).
 			*/
 			let doneEmitted = false;
+			/**
+			* After `flush()` or `cancel()`, the current context is done accepting
+			* input. We defer minting a fresh one until the next `sendText()` so
+			* that late audio chunks + Cartesia's real `done` event (both tagged
+			* with the flushed context's id) still pass the filter below. Rotating
+			* eagerly would silently drop all audio still in flight.
+			*/
+			let rotatePending = false;
 			const rotateContext = () => {
 				context = mintContext();
 				doneEmitted = false;
+				rotatePending = false;
+			};
+			const rotateIfPending = () => {
+				if (rotatePending) rotateContext();
 			};
 			const emitDoneOnce = () => {
 				if (doneEmitted || closed) return;
@@ -1133,6 +1182,7 @@ function openCartesia(opts) {
 			return {
 				sendText(text) {
 					if (closed || text.length === 0) return;
+					rotateIfPending();
 					context.send({
 						...baseRequest,
 						transcript: text,
@@ -1140,20 +1190,19 @@ function openCartesia(opts) {
 					}).catch(ignoreRejection);
 				},
 				flush() {
-					if (closed) return;
+					if (closed || rotatePending) return;
 					context.send({
 						...baseRequest,
 						transcript: "",
 						continue: false
 					}).catch(ignoreRejection);
-					queueMicrotask(emitDoneOnce);
-					rotateContext();
+					rotatePending = true;
 				},
 				cancel() {
 					if (closed) return;
 					context.cancel().catch(ignoreRejection);
 					emitDoneOnce();
-					rotateContext();
+					rotatePending = true;
 				},
 				on(event, fn) {
 					return emitter.on(event, fn);
@@ -1216,7 +1265,10 @@ function resolveLlm(descriptor, env) {
 			const options = descriptor.options;
 			const apiKey = resolveApiKey("ANTHROPIC_API_KEY", env);
 			if (!apiKey) throw new Error("Anthropic LLM: missing API key. Set ANTHROPIC_API_KEY in the agent env.");
-			return createAnthropic({ apiKey })(options.model);
+			return createAnthropic({
+				apiKey,
+				baseURL: "https://api.anthropic.com/v1"
+			})(options.model);
 		}
 		default: throw new Error(`Unknown LLM provider kind: "${descriptor.kind}". Supported: ${ANTHROPIC_KIND}.`);
 	}
@@ -2150,6 +2202,9 @@ function createRuntime(opts) {
 			tts: pipelineProviders.tts,
 			sttApiKey: resolveApiKey("ASSEMBLYAI_API_KEY", env),
 			ttsApiKey: resolveApiKey("CARTESIA_API_KEY", env),
+			sttSampleRate: s2sConfig.inputSampleRate,
+			ttsSampleRate: s2sConfig.outputSampleRate,
+			skipGreeting: sessionOpts.skipGreeting ?? false,
 			logger
 		});
 		const apiKey = env.ASSEMBLYAI_API_KEY ?? "";

package/host/pipeline-session.test.ts CHANGED Viewed

@@ -43,7 +43,8 @@ function makeOpts(overrides: Partial<PipelineSessionOptions> = {}): {
     tts,
     sttApiKey: "stt-key",
     ttsApiKey: "tts-key",
-    sampleRate: 16_000,
+    sttSampleRate: 16_000,
+    ttsSampleRate: 24_000,
     logger: silentLogger,
     ...overrides,
   };
@@ -81,20 +82,112 @@ describe("createPipelineSession — happy path", () => {
     expect(ttsSession.textChunks).toEqual(["Hello", " there"]);
     expect(ttsSession.flush).toHaveBeenCalledTimes(1);
-    // Verify wire events in order
+    // Verify wire events in order — the pipeline emits a single
+    // `agent_transcript` with the full accumulated reply (not one per
+    // delta) so the UI renders one assistant message per turn.
     const types = eventTypes(client.events);
-    expect(types).toEqual([
-      "user_transcript",
-      "agent_transcript", // "Hello"
-      "agent_transcript", // " there"
-      "reply_done",
-    ]);
+    expect(types).toEqual(["user_transcript", "agent_transcript", "reply_done"]);
     // user_transcript text matches
     expect(client.events[0]).toMatchObject({
       type: "user_transcript",
       text: "Hello there, how are you?",
     });
+    expect(client.events[1]).toMatchObject({
+      type: "agent_transcript",
+      text: "Hello there",
+    });
+    await session.stop();
+  });
+});
+describe("createPipelineSession — greeting", () => {
+  test("onAudioReady sends greeting to TTS and emits agent_transcript + reply_done", async () => {
+    const { opts, tts, client } = makeOpts({
+      agentConfig: {
+        name: "pipeline-agent",
+        systemPrompt: DEFAULT_SYSTEM_PROMPT,
+        greeting: "Hi! I'm pipeline mode.",
+      },
+    });
+    const session = createPipelineSession(opts);
+    await session.start();
+    const ttsSession = tts.last();
+    if (!ttsSession) throw new Error("TTS didn't open");
+    session.onAudioReady();
+    await session.waitForTurn();
+    expect(ttsSession.textChunks).toEqual(["Hi! I'm pipeline mode."]);
+    expect(ttsSession.flush).toHaveBeenCalledTimes(1);
+    const types = eventTypes(client.events);
+    expect(types).toEqual(["agent_transcript", "reply_done"]);
+    expect(client.events[0]).toMatchObject({
+      type: "agent_transcript",
+      text: "Hi! I'm pipeline mode.",
+    });
+    await session.stop();
+  });
+  test("skipGreeting=true suppresses the greeting turn", async () => {
+    const { opts, tts, client } = makeOpts({
+      agentConfig: {
+        name: "pipeline-agent",
+        systemPrompt: DEFAULT_SYSTEM_PROMPT,
+        greeting: "Hello there.",
+      },
+      skipGreeting: true,
+    });
+    const session = createPipelineSession(opts);
+    await session.start();
+    const ttsSession = tts.last();
+    if (!ttsSession) throw new Error("TTS didn't open");
+    session.onAudioReady();
+    await session.waitForTurn();
+    expect(ttsSession.sendText).not.toHaveBeenCalled();
+    expect(ttsSession.flush).not.toHaveBeenCalled();
+    expect(client.events).toEqual([]);
+    await session.stop();
+  });
+  test("empty greeting is a no-op", async () => {
+    const { opts, tts, client } = makeOpts();
+    // CONFIG already has greeting: ""
+    const session = createPipelineSession(opts);
+    await session.start();
+    const ttsSession = tts.last();
+    if (!ttsSession) throw new Error("TTS didn't open");
+    session.onAudioReady();
+    await session.waitForTurn();
+    expect(ttsSession.sendText).not.toHaveBeenCalled();
+    expect(client.events).toEqual([]);
+    await session.stop();
+  });
+  test("passes sttSampleRate / ttsSampleRate through to providers", async () => {
+    const { opts, stt, tts } = makeOpts({
+      sttSampleRate: 16_000,
+      ttsSampleRate: 24_000,
+    });
+    const session = createPipelineSession(opts);
+    await session.start();
+    expect(stt.last()?.opts.sampleRate).toBe(16_000);
+    expect(tts.last()?.opts.sampleRate).toBe(24_000);
     await session.stop();
   });
@@ -163,8 +256,10 @@ describe("createPipelineSession — barge-in", () => {
     // TTS.cancel must have been called exactly once.
     expect(ttsSession.cancel).toHaveBeenCalledTimes(1);
-    // Wire events: user_transcript, some agent_transcript(s), then cancelled.
-    // No reply_done — barge-in short-circuits the drain.
+    // Wire events: user_transcript then cancelled. No agent_transcript
+    // (the pipeline only emits it after the LLM stream finishes cleanly)
+    // and no reply_done — barge-in short-circuits both the stream and
+    // the drain.
     const types = eventTypes(client.events);
     expect(types).toContain("user_transcript");
     expect(types).toContain("cancelled");
@@ -212,12 +307,17 @@ describe("createPipelineSession — tool calls", () => {
     const types = eventTypes(client.events);
     expect(types).toEqual([
       "user_transcript",
-      "agent_transcript", // "Let me check"
       "tool_call",
       "tool_call_done",
-      "agent_transcript", // " — it's sunny."
+      "agent_transcript", // combined: "Let me check — it's sunny."
       "reply_done",
     ]);
+    expect(client.events.find((e) => (e as ClientEvent).type === "agent_transcript")).toMatchObject(
+      {
+        type: "agent_transcript",
+        text: "Let me check — it's sunny.",
+      },
+    );
     const toolCall = client.events.find((e) => (e as ClientEvent).type === "tool_call");
     expect(toolCall).toMatchObject({

package/host/pipeline-session.ts CHANGED Viewed

@@ -10,7 +10,11 @@
 import type { LanguageModel, ModelMessage } from "ai";
 import { stepCountIs, streamText } from "ai";
 import type { AgentConfig, ExecuteTool, ToolSchema } from "../sdk/_internal-types.ts";
-import { DEFAULT_STT_SAMPLE_RATE, PIPELINE_FLUSH_TIMEOUT_MS } from "../sdk/constants.ts";
+import {
+  DEFAULT_STT_SAMPLE_RATE,
+  DEFAULT_TTS_SAMPLE_RATE,
+  PIPELINE_FLUSH_TIMEOUT_MS,
+} from "../sdk/constants.ts";
 import type { ClientSink, SessionErrorCode } from "../sdk/protocol.ts";
 import type {
   SttError,
@@ -55,8 +59,12 @@ export interface PipelineSessionOptions {
   sttApiKey: string;
   /** TTS API key. */
   ttsApiKey: string;
-  /** Audio sample rate (PCM16, Hz). Defaults to {@link DEFAULT_STT_SAMPLE_RATE}. */
-  sampleRate?: number | undefined;
+  /** STT audio sample rate (PCM16, Hz). Defaults to {@link DEFAULT_STT_SAMPLE_RATE}. */
+  sttSampleRate?: number | undefined;
+  /** TTS audio sample rate (PCM16, Hz). Must match the client's playback AudioContext rate. Defaults to {@link DEFAULT_TTS_SAMPLE_RATE}. */
+  ttsSampleRate?: number | undefined;
+  /** Skip the initial greeting audio on connect (used for session resume). */
+  skipGreeting?: boolean | undefined;
   /** Logger. Defaults to the console logger. */
   logger?: Logger | undefined;
   /** Sliding-window conversation history size. */
@@ -99,7 +107,6 @@ function handleStreamPart(
       if (delta.length === 0) return;
       deps.onTextDelta(delta);
       deps.tts?.sendText(delta);
-      deps.client.event({ type: "agent_transcript", text: delta });
       return;
     }
     case "tool-call": {
@@ -136,7 +143,8 @@ function handleStreamPart(
 /** Create a pluggable-provider voice session. */
 export function createPipelineSession(opts: PipelineSessionOptions): Session {
   const log = opts.logger ?? consoleLogger;
-  const sampleRate = opts.sampleRate ?? DEFAULT_STT_SAMPLE_RATE;
+  const sttSampleRate = opts.sttSampleRate ?? DEFAULT_STT_SAMPLE_RATE;
+  const ttsSampleRate = opts.ttsSampleRate ?? DEFAULT_TTS_SAMPLE_RATE;
   const { client, agentConfig, toolSchemas, executeTool } = opts;
   const hasTools = toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0;
@@ -342,6 +350,14 @@ export function createPipelineSession(opts: PipelineSessionOptions): Session {
       return;
     }
+    // Emit the complete transcript once the LLM finishes streaming, so the
+    // UI renders a single assistant message (vs. one per delta) and the user
+    // sees the text while TTS drains the synthesized audio.
+    if (accumulated.length > 0) {
+      client.event({ type: "agent_transcript", text: accumulated });
+      ctx.pushMessages({ role: "assistant", content: accumulated });
+    }
     await flushTtsAndWait(ctl.signal);
     if (ctl.signal.aborted) {
@@ -349,9 +365,29 @@ export function createPipelineSession(opts: PipelineSessionOptions): Session {
       return;
     }
-    if (accumulated.length > 0) {
-      ctx.pushMessages({ role: "assistant", content: accumulated });
+    client.playAudioDone();
+    client.event({ type: "reply_done" });
+    if (turnController === ctl) turnController = null;
+  }
+  async function runGreeting(text: string): Promise<void> {
+    const replyId = `pipeline-greeting-${++nextReplyId}`;
+    ctx.beginReply(replyId);
+    const ctl = new AbortController();
+    turnController = ctl;
+    client.event({ type: "agent_transcript", text });
+    ctx.pushMessages({ role: "assistant", content: text });
+    ctx.tts?.sendText(text);
+    await flushTtsAndWait(ctl.signal);
+    if (ctl.signal.aborted) {
+      if (turnController === ctl) turnController = null;
+      return;
     }
     client.playAudioDone();
     client.event({ type: "reply_done" });
     if (turnController === ctl) turnController = null;
@@ -394,13 +430,13 @@ export function createPipelineSession(opts: PipelineSessionOptions): Session {
   async function openProviders(): Promise<void> {
     const [sttResult, ttsResult] = await Promise.allSettled([
       opts.stt.open({
-        sampleRate,
+        sampleRate: sttSampleRate,
         apiKey: opts.sttApiKey,
         sttPrompt: agentConfig.sttPrompt,
         signal: sessionAbort.signal,
       }),
       opts.tts.open({
-        sampleRate,
+        sampleRate: ttsSampleRate,
         apiKey: opts.ttsApiKey,
         signal: sessionAbort.signal,
       }),
@@ -458,7 +494,15 @@ export function createPipelineSession(opts: PipelineSessionOptions): Session {
       ctx.stt?.sendAudio(pcm);
     },
     onAudioReady(): void {
+      if (audioReady || terminated) return;
       audioReady = true;
+      if (opts.skipGreeting) return;
+      const greeting = agentConfig.greeting;
+      if (!greeting) return;
+      const turn = runGreeting(greeting).catch((err: unknown) => {
+        log.error("Pipeline greeting failed", { error: errorMessage(err), sessionId: opts.id });
+      });
+      ctx.chainTurn(turn);
     },
     onCancel(): void {
       if (terminated) return;

package/host/providers/resolve.ts CHANGED Viewed

@@ -77,7 +77,10 @@ export function resolveLlm(descriptor: LlmProvider, env: Record<string, string>)
       if (!apiKey) {
         throw new Error("Anthropic LLM: missing API key. Set ANTHROPIC_API_KEY in the agent env.");
       }
-      return createAnthropic({ apiKey })(options.model);
+      // Pass baseURL explicitly so the SDK's loadOptionalSetting returns
+      // before reading process.env["ANTHROPIC_BASE_URL"]. Without this,
+      // the Deno platform server needs --allow-env to start a session.
+      return createAnthropic({ apiKey, baseURL: "https://api.anthropic.com/v1" })(options.model);
     }
     default:
       throw new Error(

package/host/providers/tts/cartesia.test.ts CHANGED Viewed

@@ -151,12 +151,15 @@ describe("cartesia TTS adapter", () => {
       },
     ]);
-    // After flush(), the adapter has rotated to a new context.
-    const turn2 = session._currentContextId();
-    expect(turn2).not.toBe(turn1);
+    // Rotation is deferred until the next sendText so Cartesia's late
+    // audio chunks + real `done` event (both tagged with turn1's id) still
+    // pass the context-id filter.
+    expect(session._currentContextId()).toBe(turn1);
-    // Subsequent sendText targets the new context.
+    // Subsequent sendText rotates to a fresh context.
     session.sendText("next");
+    const turn2 = session._currentContextId();
+    expect(turn2).not.toBe(turn1);
     await flush();
     expect(sends.filter((s) => s.contextId === turn2)).toEqual([
       {
@@ -201,8 +204,15 @@ describe("cartesia TTS adapter", () => {
       { kind: "cancel", contextId: turn1 },
     ]);
-    // Cancelling rotates the context so the next turn is unambiguous.
-    expect(session._currentContextId()).not.toBe(turn1);
+    // Rotation is deferred until the next sendText — cancel() halts the
+    // old context on Cartesia's side, so late events for turn1 can safely
+    // keep passing the filter until the next turn actually begins.
+    expect(session._currentContextId()).toBe(turn1);
+    // A subsequent sendText mints a fresh context for turn2.
+    session.sendText("again");
+    const turn2 = session._currentContextId();
+    expect(turn2).not.toBe(turn1);
     controller.abort();
     await session.close();

package/host/providers/tts/cartesia.ts CHANGED Viewed

@@ -107,9 +107,21 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
        * Reset whenever a fresh context is minted (i.e. at turn boundaries).
        */
       let doneEmitted = false;
+      /**
+       * After `flush()` or `cancel()`, the current context is done accepting
+       * input. We defer minting a fresh one until the next `sendText()` so
+       * that late audio chunks + Cartesia's real `done` event (both tagged
+       * with the flushed context's id) still pass the filter below. Rotating
+       * eagerly would silently drop all audio still in flight.
+       */
+      let rotatePending = false;
       const rotateContext = () => {
         context = mintContext();
         doneEmitted = false;
+        rotatePending = false;
+      };
+      const rotateIfPending = () => {
+        if (rotatePending) rotateContext();
       };
       const emitDoneOnce = () => {
         if (doneEmitted || closed) return;
@@ -179,33 +191,35 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
       const session: CartesiaSession = {
         sendText(text: string) {
           if (closed || text.length === 0) return;
+          // First sendText after a flush/cancel starts a fresh context so
+          // we don't append to one that's already been finalized.
+          rotateIfPending();
           void context
             .send({ ...baseRequest, transcript: text, continue: true })
             .catch(ignoreRejection);
         },
         flush() {
-          if (closed) return;
+          if (closed || rotatePending) return;
           // Empty transcript with `continue: false` is the canonical
-          // end-of-turn signal. Cartesia replies with a `done` tagged
-          // by context_id, driving `emitDoneOnce`. The microtask
-          // fallback guards against a dropped server event wedging
-          // the orchestrator's state machine.
-          // TODO: drop the microtask fallback once we've verified
-          // Cartesia always emits `done` for cleanly-flushed contexts.
+          // end-of-turn signal. Cartesia finishes synthesizing whatever
+          // is queued and then emits a `done` tagged with the same
+          // context_id — at that point `emitDoneOnce` fires for real.
+          // Defer rotation so the filter below still accepts in-flight
+          // audio chunks and the real `done` event.
           void context
             .send({ ...baseRequest, transcript: "", continue: false })
             .catch(ignoreRejection);
-          queueMicrotask(emitDoneOnce);
-          rotateContext();
+          rotatePending = true;
         },
         cancel() {
           if (closed) return;
           void context.cancel().catch(ignoreRejection);
           // Emit synchronously: barge-in advances the orchestrator's
           // state machine on `done`, and delaying it would audibly
-          // stall subsequent turns.
+          // stall subsequent turns. Cartesia stops producing audio
+          // after cancel, so dropping any late chunks is fine.
           emitDoneOnce();
-          rotateContext();
+          rotatePending = true;
         },
         on(event, fn) {
           return emitter.on(event, fn);

package/host/runtime.ts CHANGED Viewed

@@ -326,6 +326,9 @@ export function createRuntime(opts: RuntimeOptions): Runtime {
         tts: pipelineProviders.tts,
         sttApiKey: resolveApiKey("ASSEMBLYAI_API_KEY", env),
         ttsApiKey: resolveApiKey("CARTESIA_API_KEY", env),
+        sttSampleRate: s2sConfig.inputSampleRate,
+        ttsSampleRate: s2sConfig.outputSampleRate,
+        skipGreeting: sessionOpts.skipGreeting ?? false,
         logger,
       });
     }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@alexkroman1/aai",
-  "version": "1.4.0",
+  "version": "1.4.2",
   "type": "module",
   "exports": {
     ".": {