@alexkroman1/aai 1.4.1 → 1.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
 
2
- > @alexkroman1/aai@1.4.1 build /home/runner/work/agent/agent/packages/aai
2
+ > @alexkroman1/aai@1.4.3 build /home/runner/work/agent/agent/packages/aai
3
3
  > tsdown && tsc -p tsconfig.build.json
4
4
 
5
5
  ℹ tsdown v0.21.7 powered by rolldown v1.0.0-rc.12
@@ -8,7 +8,7 @@
8
8
  ℹ target: node22
9
9
  ℹ tsconfig: tsconfig.json
10
10
  ℹ Build start
11
- ℹ dist/host/runtime-barrel.js 74.24 kB │ gzip: 22.11 kB
11
+ ℹ dist/host/runtime-barrel.js 75.94 kB │ gzip: 22.51 kB
12
12
  ℹ dist/sdk/protocol.js  4.75 kB │ gzip: 1.76 kB
13
13
  ℹ dist/index.js  2.88 kB │ gzip: 1.24 kB
14
14
  ℹ dist/sdk/manifest-barrel.js  0.36 kB │ gzip: 0.20 kB
@@ -22,5 +22,5 @@
22
22
  ℹ dist/assemblyai-Cxg9eobY.js  0.53 kB │ gzip: 0.35 kB
23
23
  ℹ dist/anthropic-BrUCPKUc.js  0.23 kB │ gzip: 0.18 kB
24
24
  ℹ dist/cartesia-DwDk2tEu.js  0.22 kB │ gzip: 0.17 kB
25
- ℹ 14 files, total: 99.06 kB
25
+ ℹ 14 files, total: 100.76 kB
26
26
  ✔ Build complete in 45ms
package/CHANGELOG.md CHANGED
@@ -1,5 +1,17 @@
1
1
  # @alexkroman1/aai
2
2
 
3
+ ## 1.4.3
4
+
5
+ ### Patch Changes
6
+
7
+ - 62d5a99: Fix pipeline mode: play greeting, emit a single agent_transcript per turn, open TTS at the client's playback sample rate, stop the Cartesia adapter from eagerly rotating its context (which was silently dropping in-flight audio chunks), and skip the wire `context.cancel()` when the context is already final on Cartesia's side (avoids a benign 400 that was killing the session).
8
+
9
+ ## 1.4.2
10
+
11
+ ### Patch Changes
12
+
13
+ - f877a6f: Fix pipeline mode: play greeting, emit a single agent_transcript per turn, open TTS at the client's playback sample rate, and stop the Cartesia adapter from eagerly rotating its context (which was silently dropping in-flight audio chunks).
14
+
3
15
  ## 1.4.1
4
16
 
5
17
  ### Patch Changes
@@ -37,8 +37,12 @@ export interface PipelineSessionOptions {
37
37
  sttApiKey: string;
38
38
  /** TTS API key. */
39
39
  ttsApiKey: string;
40
- /** Audio sample rate (PCM16, Hz). Defaults to {@link DEFAULT_STT_SAMPLE_RATE}. */
41
- sampleRate?: number | undefined;
40
+ /** STT audio sample rate (PCM16, Hz). Defaults to {@link DEFAULT_STT_SAMPLE_RATE}. */
41
+ sttSampleRate?: number | undefined;
42
+ /** TTS audio sample rate (PCM16, Hz). Must match the client's playback AudioContext rate. Defaults to {@link DEFAULT_TTS_SAMPLE_RATE}. */
43
+ ttsSampleRate?: number | undefined;
44
+ /** Skip the initial greeting audio on connect (used for session resume). */
45
+ skipGreeting?: boolean | undefined;
42
46
  /** Logger. Defaults to the console logger. */
43
47
  logger?: Logger | undefined;
44
48
  /** Sliding-window conversation history size. */
@@ -560,10 +560,6 @@ function handleStreamPart(part, deps) {
560
560
  if (delta.length === 0) return;
561
561
  deps.onTextDelta(delta);
562
562
  deps.tts?.sendText(delta);
563
- deps.client.event({
564
- type: "agent_transcript",
565
- text: delta
566
- });
567
563
  return;
568
564
  }
569
565
  case "tool-call": {
@@ -601,7 +597,8 @@ function handleStreamPart(part, deps) {
601
597
  /** Create a pluggable-provider voice session. */
602
598
  function createPipelineSession(opts) {
603
599
  const log = opts.logger ?? consoleLogger;
604
- const sampleRate = opts.sampleRate ?? 16e3;
600
+ const sttSampleRate = opts.sttSampleRate ?? 16e3;
601
+ const ttsSampleRate = opts.ttsSampleRate ?? 24e3;
605
602
  const { client, agentConfig, toolSchemas, executeTool } = opts;
606
603
  const systemPrompt = buildSystemPrompt(agentConfig, {
607
604
  hasTools: toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0,
@@ -797,15 +794,44 @@ function createPipelineSession(opts) {
797
794
  if (turnController === ctl) turnController = null;
798
795
  return;
799
796
  }
797
+ if (accumulated.length > 0) {
798
+ client.event({
799
+ type: "agent_transcript",
800
+ text: accumulated
801
+ });
802
+ ctx.pushMessages({
803
+ role: "assistant",
804
+ content: accumulated
805
+ });
806
+ }
800
807
  await flushTtsAndWait(ctl.signal);
801
808
  if (ctl.signal.aborted) {
802
809
  if (turnController === ctl) turnController = null;
803
810
  return;
804
811
  }
805
- if (accumulated.length > 0) ctx.pushMessages({
812
+ client.playAudioDone();
813
+ client.event({ type: "reply_done" });
814
+ if (turnController === ctl) turnController = null;
815
+ }
816
+ async function runGreeting(text) {
817
+ const replyId = `pipeline-greeting-${++nextReplyId}`;
818
+ ctx.beginReply(replyId);
819
+ const ctl = new AbortController();
820
+ turnController = ctl;
821
+ client.event({
822
+ type: "agent_transcript",
823
+ text
824
+ });
825
+ ctx.pushMessages({
806
826
  role: "assistant",
807
- content: accumulated
827
+ content: text
808
828
  });
829
+ ctx.tts?.sendText(text);
830
+ await flushTtsAndWait(ctl.signal);
831
+ if (ctl.signal.aborted) {
832
+ if (turnController === ctl) turnController = null;
833
+ return;
834
+ }
809
835
  client.playAudioDone();
810
836
  client.event({ type: "reply_done" });
811
837
  if (turnController === ctl) turnController = null;
@@ -841,12 +867,12 @@ function createPipelineSession(opts) {
841
867
  }
842
868
  async function openProviders() {
843
869
  const [sttResult, ttsResult] = await Promise.allSettled([opts.stt.open({
844
- sampleRate,
870
+ sampleRate: sttSampleRate,
845
871
  apiKey: opts.sttApiKey,
846
872
  sttPrompt: agentConfig.sttPrompt,
847
873
  signal: sessionAbort.signal
848
874
  }), opts.tts.open({
849
- sampleRate,
875
+ sampleRate: ttsSampleRate,
850
876
  apiKey: opts.ttsApiKey,
851
877
  signal: sessionAbort.signal
852
878
  })]);
@@ -890,7 +916,18 @@ function createPipelineSession(opts) {
890
916
  ctx.stt?.sendAudio(pcm);
891
917
  },
892
918
  onAudioReady() {
919
+ if (audioReady || terminated) return;
893
920
  audioReady = true;
921
+ if (opts.skipGreeting) return;
922
+ const greeting = agentConfig.greeting;
923
+ if (!greeting) return;
924
+ const turn = runGreeting(greeting).catch((err) => {
925
+ log.error("Pipeline greeting failed", {
926
+ error: errorMessage(err),
927
+ sessionId: opts.id
928
+ });
929
+ });
930
+ ctx.chainTurn(turn);
894
931
  },
895
932
  onCancel() {
896
933
  if (terminated) return;
@@ -1076,9 +1113,21 @@ function openCartesia(opts) {
1076
1113
  * Reset whenever a fresh context is minted (i.e. at turn boundaries).
1077
1114
  */
1078
1115
  let doneEmitted = false;
1116
+ /**
1117
+ * After `flush()` or `cancel()`, the current context is done accepting
1118
+ * input. We defer minting a fresh one until the next `sendText()` so
1119
+ * that late audio chunks + Cartesia's real `done` event (both tagged
1120
+ * with the flushed context's id) still pass the filter below. Rotating
1121
+ * eagerly would silently drop all audio still in flight.
1122
+ */
1123
+ let rotatePending = false;
1079
1124
  const rotateContext = () => {
1080
1125
  context = mintContext();
1081
1126
  doneEmitted = false;
1127
+ rotatePending = false;
1128
+ };
1129
+ const rotateIfPending = () => {
1130
+ if (rotatePending) rotateContext();
1082
1131
  };
1083
1132
  const emitDoneOnce = () => {
1084
1133
  if (doneEmitted || closed) return;
@@ -1133,6 +1182,7 @@ function openCartesia(opts) {
1133
1182
  return {
1134
1183
  sendText(text) {
1135
1184
  if (closed || text.length === 0) return;
1185
+ rotateIfPending();
1136
1186
  context.send({
1137
1187
  ...baseRequest,
1138
1188
  transcript: text,
@@ -1140,20 +1190,19 @@ function openCartesia(opts) {
1140
1190
  }).catch(ignoreRejection);
1141
1191
  },
1142
1192
  flush() {
1143
- if (closed) return;
1193
+ if (closed || rotatePending) return;
1144
1194
  context.send({
1145
1195
  ...baseRequest,
1146
1196
  transcript: "",
1147
1197
  continue: false
1148
1198
  }).catch(ignoreRejection);
1149
- queueMicrotask(emitDoneOnce);
1150
- rotateContext();
1199
+ rotatePending = true;
1151
1200
  },
1152
1201
  cancel() {
1153
1202
  if (closed) return;
1154
- context.cancel().catch(ignoreRejection);
1203
+ if (!doneEmitted) context.cancel().catch(ignoreRejection);
1155
1204
  emitDoneOnce();
1156
- rotateContext();
1205
+ rotatePending = true;
1157
1206
  },
1158
1207
  on(event, fn) {
1159
1208
  return emitter.on(event, fn);
@@ -2153,6 +2202,9 @@ function createRuntime(opts) {
2153
2202
  tts: pipelineProviders.tts,
2154
2203
  sttApiKey: resolveApiKey("ASSEMBLYAI_API_KEY", env),
2155
2204
  ttsApiKey: resolveApiKey("CARTESIA_API_KEY", env),
2205
+ sttSampleRate: s2sConfig.inputSampleRate,
2206
+ ttsSampleRate: s2sConfig.outputSampleRate,
2207
+ skipGreeting: sessionOpts.skipGreeting ?? false,
2156
2208
  logger
2157
2209
  });
2158
2210
  const apiKey = env.ASSEMBLYAI_API_KEY ?? "";
@@ -43,7 +43,8 @@ function makeOpts(overrides: Partial<PipelineSessionOptions> = {}): {
43
43
  tts,
44
44
  sttApiKey: "stt-key",
45
45
  ttsApiKey: "tts-key",
46
- sampleRate: 16_000,
46
+ sttSampleRate: 16_000,
47
+ ttsSampleRate: 24_000,
47
48
  logger: silentLogger,
48
49
  ...overrides,
49
50
  };
@@ -81,20 +82,112 @@ describe("createPipelineSession — happy path", () => {
81
82
  expect(ttsSession.textChunks).toEqual(["Hello", " there"]);
82
83
  expect(ttsSession.flush).toHaveBeenCalledTimes(1);
83
84
 
84
- // Verify wire events in order
85
+ // Verify wire events in order — the pipeline emits a single
86
+ // `agent_transcript` with the full accumulated reply (not one per
87
+ // delta) so the UI renders one assistant message per turn.
85
88
  const types = eventTypes(client.events);
86
- expect(types).toEqual([
87
- "user_transcript",
88
- "agent_transcript", // "Hello"
89
- "agent_transcript", // " there"
90
- "reply_done",
91
- ]);
89
+ expect(types).toEqual(["user_transcript", "agent_transcript", "reply_done"]);
92
90
 
93
91
  // user_transcript text matches
94
92
  expect(client.events[0]).toMatchObject({
95
93
  type: "user_transcript",
96
94
  text: "Hello there, how are you?",
97
95
  });
96
+ expect(client.events[1]).toMatchObject({
97
+ type: "agent_transcript",
98
+ text: "Hello there",
99
+ });
100
+
101
+ await session.stop();
102
+ });
103
+ });
104
+
105
+ describe("createPipelineSession — greeting", () => {
106
+ test("onAudioReady sends greeting to TTS and emits agent_transcript + reply_done", async () => {
107
+ const { opts, tts, client } = makeOpts({
108
+ agentConfig: {
109
+ name: "pipeline-agent",
110
+ systemPrompt: DEFAULT_SYSTEM_PROMPT,
111
+ greeting: "Hi! I'm pipeline mode.",
112
+ },
113
+ });
114
+
115
+ const session = createPipelineSession(opts);
116
+ await session.start();
117
+
118
+ const ttsSession = tts.last();
119
+ if (!ttsSession) throw new Error("TTS didn't open");
120
+
121
+ session.onAudioReady();
122
+ await session.waitForTurn();
123
+
124
+ expect(ttsSession.textChunks).toEqual(["Hi! I'm pipeline mode."]);
125
+ expect(ttsSession.flush).toHaveBeenCalledTimes(1);
126
+
127
+ const types = eventTypes(client.events);
128
+ expect(types).toEqual(["agent_transcript", "reply_done"]);
129
+ expect(client.events[0]).toMatchObject({
130
+ type: "agent_transcript",
131
+ text: "Hi! I'm pipeline mode.",
132
+ });
133
+
134
+ await session.stop();
135
+ });
136
+
137
+ test("skipGreeting=true suppresses the greeting turn", async () => {
138
+ const { opts, tts, client } = makeOpts({
139
+ agentConfig: {
140
+ name: "pipeline-agent",
141
+ systemPrompt: DEFAULT_SYSTEM_PROMPT,
142
+ greeting: "Hello there.",
143
+ },
144
+ skipGreeting: true,
145
+ });
146
+
147
+ const session = createPipelineSession(opts);
148
+ await session.start();
149
+
150
+ const ttsSession = tts.last();
151
+ if (!ttsSession) throw new Error("TTS didn't open");
152
+
153
+ session.onAudioReady();
154
+ await session.waitForTurn();
155
+
156
+ expect(ttsSession.sendText).not.toHaveBeenCalled();
157
+ expect(ttsSession.flush).not.toHaveBeenCalled();
158
+ expect(client.events).toEqual([]);
159
+
160
+ await session.stop();
161
+ });
162
+
163
+ test("empty greeting is a no-op", async () => {
164
+ const { opts, tts, client } = makeOpts();
165
+ // CONFIG already has greeting: ""
166
+ const session = createPipelineSession(opts);
167
+ await session.start();
168
+
169
+ const ttsSession = tts.last();
170
+ if (!ttsSession) throw new Error("TTS didn't open");
171
+
172
+ session.onAudioReady();
173
+ await session.waitForTurn();
174
+
175
+ expect(ttsSession.sendText).not.toHaveBeenCalled();
176
+ expect(client.events).toEqual([]);
177
+
178
+ await session.stop();
179
+ });
180
+
181
+ test("passes sttSampleRate / ttsSampleRate through to providers", async () => {
182
+ const { opts, stt, tts } = makeOpts({
183
+ sttSampleRate: 16_000,
184
+ ttsSampleRate: 24_000,
185
+ });
186
+ const session = createPipelineSession(opts);
187
+ await session.start();
188
+
189
+ expect(stt.last()?.opts.sampleRate).toBe(16_000);
190
+ expect(tts.last()?.opts.sampleRate).toBe(24_000);
98
191
 
99
192
  await session.stop();
100
193
  });
@@ -163,8 +256,10 @@ describe("createPipelineSession — barge-in", () => {
163
256
 
164
257
  // TTS.cancel must have been called exactly once.
165
258
  expect(ttsSession.cancel).toHaveBeenCalledTimes(1);
166
- // Wire events: user_transcript, some agent_transcript(s), then cancelled.
167
- // No reply_done — barge-in short-circuits the drain.
259
+ // Wire events: user_transcript then cancelled. No agent_transcript
260
+ // (the pipeline only emits it after the LLM stream finishes cleanly)
261
+ // and no reply_done — barge-in short-circuits both the stream and
262
+ // the drain.
168
263
  const types = eventTypes(client.events);
169
264
  expect(types).toContain("user_transcript");
170
265
  expect(types).toContain("cancelled");
@@ -212,12 +307,17 @@ describe("createPipelineSession — tool calls", () => {
212
307
  const types = eventTypes(client.events);
213
308
  expect(types).toEqual([
214
309
  "user_transcript",
215
- "agent_transcript", // "Let me check"
216
310
  "tool_call",
217
311
  "tool_call_done",
218
- "agent_transcript", // " — it's sunny."
312
+ "agent_transcript", // combined: "Let me check — it's sunny."
219
313
  "reply_done",
220
314
  ]);
315
+ expect(client.events.find((e) => (e as ClientEvent).type === "agent_transcript")).toMatchObject(
316
+ {
317
+ type: "agent_transcript",
318
+ text: "Let me check — it's sunny.",
319
+ },
320
+ );
221
321
 
222
322
  const toolCall = client.events.find((e) => (e as ClientEvent).type === "tool_call");
223
323
  expect(toolCall).toMatchObject({
@@ -10,7 +10,11 @@
10
10
  import type { LanguageModel, ModelMessage } from "ai";
11
11
  import { stepCountIs, streamText } from "ai";
12
12
  import type { AgentConfig, ExecuteTool, ToolSchema } from "../sdk/_internal-types.ts";
13
- import { DEFAULT_STT_SAMPLE_RATE, PIPELINE_FLUSH_TIMEOUT_MS } from "../sdk/constants.ts";
13
+ import {
14
+ DEFAULT_STT_SAMPLE_RATE,
15
+ DEFAULT_TTS_SAMPLE_RATE,
16
+ PIPELINE_FLUSH_TIMEOUT_MS,
17
+ } from "../sdk/constants.ts";
14
18
  import type { ClientSink, SessionErrorCode } from "../sdk/protocol.ts";
15
19
  import type {
16
20
  SttError,
@@ -55,8 +59,12 @@ export interface PipelineSessionOptions {
55
59
  sttApiKey: string;
56
60
  /** TTS API key. */
57
61
  ttsApiKey: string;
58
- /** Audio sample rate (PCM16, Hz). Defaults to {@link DEFAULT_STT_SAMPLE_RATE}. */
59
- sampleRate?: number | undefined;
62
+ /** STT audio sample rate (PCM16, Hz). Defaults to {@link DEFAULT_STT_SAMPLE_RATE}. */
63
+ sttSampleRate?: number | undefined;
64
+ /** TTS audio sample rate (PCM16, Hz). Must match the client's playback AudioContext rate. Defaults to {@link DEFAULT_TTS_SAMPLE_RATE}. */
65
+ ttsSampleRate?: number | undefined;
66
+ /** Skip the initial greeting audio on connect (used for session resume). */
67
+ skipGreeting?: boolean | undefined;
60
68
  /** Logger. Defaults to the console logger. */
61
69
  logger?: Logger | undefined;
62
70
  /** Sliding-window conversation history size. */
@@ -99,7 +107,6 @@ function handleStreamPart(
99
107
  if (delta.length === 0) return;
100
108
  deps.onTextDelta(delta);
101
109
  deps.tts?.sendText(delta);
102
- deps.client.event({ type: "agent_transcript", text: delta });
103
110
  return;
104
111
  }
105
112
  case "tool-call": {
@@ -136,7 +143,8 @@ function handleStreamPart(
136
143
  /** Create a pluggable-provider voice session. */
137
144
  export function createPipelineSession(opts: PipelineSessionOptions): Session {
138
145
  const log = opts.logger ?? consoleLogger;
139
- const sampleRate = opts.sampleRate ?? DEFAULT_STT_SAMPLE_RATE;
146
+ const sttSampleRate = opts.sttSampleRate ?? DEFAULT_STT_SAMPLE_RATE;
147
+ const ttsSampleRate = opts.ttsSampleRate ?? DEFAULT_TTS_SAMPLE_RATE;
140
148
  const { client, agentConfig, toolSchemas, executeTool } = opts;
141
149
 
142
150
  const hasTools = toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0;
@@ -342,6 +350,14 @@ export function createPipelineSession(opts: PipelineSessionOptions): Session {
342
350
  return;
343
351
  }
344
352
 
353
+ // Emit the complete transcript once the LLM finishes streaming, so the
354
+ // UI renders a single assistant message (vs. one per delta) and the user
355
+ // sees the text while TTS drains the synthesized audio.
356
+ if (accumulated.length > 0) {
357
+ client.event({ type: "agent_transcript", text: accumulated });
358
+ ctx.pushMessages({ role: "assistant", content: accumulated });
359
+ }
360
+
345
361
  await flushTtsAndWait(ctl.signal);
346
362
 
347
363
  if (ctl.signal.aborted) {
@@ -349,9 +365,29 @@ export function createPipelineSession(opts: PipelineSessionOptions): Session {
349
365
  return;
350
366
  }
351
367
 
352
- if (accumulated.length > 0) {
353
- ctx.pushMessages({ role: "assistant", content: accumulated });
368
+ client.playAudioDone();
369
+ client.event({ type: "reply_done" });
370
+ if (turnController === ctl) turnController = null;
371
+ }
372
+
373
+ async function runGreeting(text: string): Promise<void> {
374
+ const replyId = `pipeline-greeting-${++nextReplyId}`;
375
+ ctx.beginReply(replyId);
376
+
377
+ const ctl = new AbortController();
378
+ turnController = ctl;
379
+
380
+ client.event({ type: "agent_transcript", text });
381
+ ctx.pushMessages({ role: "assistant", content: text });
382
+ ctx.tts?.sendText(text);
383
+
384
+ await flushTtsAndWait(ctl.signal);
385
+
386
+ if (ctl.signal.aborted) {
387
+ if (turnController === ctl) turnController = null;
388
+ return;
354
389
  }
390
+
355
391
  client.playAudioDone();
356
392
  client.event({ type: "reply_done" });
357
393
  if (turnController === ctl) turnController = null;
@@ -394,13 +430,13 @@ export function createPipelineSession(opts: PipelineSessionOptions): Session {
394
430
  async function openProviders(): Promise<void> {
395
431
  const [sttResult, ttsResult] = await Promise.allSettled([
396
432
  opts.stt.open({
397
- sampleRate,
433
+ sampleRate: sttSampleRate,
398
434
  apiKey: opts.sttApiKey,
399
435
  sttPrompt: agentConfig.sttPrompt,
400
436
  signal: sessionAbort.signal,
401
437
  }),
402
438
  opts.tts.open({
403
- sampleRate,
439
+ sampleRate: ttsSampleRate,
404
440
  apiKey: opts.ttsApiKey,
405
441
  signal: sessionAbort.signal,
406
442
  }),
@@ -458,7 +494,15 @@ export function createPipelineSession(opts: PipelineSessionOptions): Session {
458
494
  ctx.stt?.sendAudio(pcm);
459
495
  },
460
496
  onAudioReady(): void {
497
+ if (audioReady || terminated) return;
461
498
  audioReady = true;
499
+ if (opts.skipGreeting) return;
500
+ const greeting = agentConfig.greeting;
501
+ if (!greeting) return;
502
+ const turn = runGreeting(greeting).catch((err: unknown) => {
503
+ log.error("Pipeline greeting failed", { error: errorMessage(err), sessionId: opts.id });
504
+ });
505
+ ctx.chainTurn(turn);
462
506
  },
463
507
  onCancel(): void {
464
508
  if (terminated) return;
@@ -151,12 +151,15 @@ describe("cartesia TTS adapter", () => {
151
151
  },
152
152
  ]);
153
153
 
154
- // After flush(), the adapter has rotated to a new context.
155
- const turn2 = session._currentContextId();
156
- expect(turn2).not.toBe(turn1);
154
+ // Rotation is deferred until the next sendText so Cartesia's late
155
+ // audio chunks + real `done` event (both tagged with turn1's id) still
156
+ // pass the context-id filter.
157
+ expect(session._currentContextId()).toBe(turn1);
157
158
 
158
- // Subsequent sendText targets the new context.
159
+ // Subsequent sendText rotates to a fresh context.
159
160
  session.sendText("next");
161
+ const turn2 = session._currentContextId();
162
+ expect(turn2).not.toBe(turn1);
160
163
  await flush();
161
164
  expect(sends.filter((s) => s.contextId === turn2)).toEqual([
162
165
  {
@@ -201,8 +204,56 @@ describe("cartesia TTS adapter", () => {
201
204
  { kind: "cancel", contextId: turn1 },
202
205
  ]);
203
206
 
204
- // Cancelling rotates the context so the next turn is unambiguous.
205
- expect(session._currentContextId()).not.toBe(turn1);
207
+ // Rotation is deferred until the next sendText — cancel() halts the
208
+ // old context on Cartesia's side, so late events for turn1 can safely
209
+ // keep passing the filter until the next turn actually begins.
210
+ expect(session._currentContextId()).toBe(turn1);
211
+
212
+ // A subsequent sendText mints a fresh context for turn2.
213
+ session.sendText("again");
214
+ const turn2 = session._currentContextId();
215
+ expect(turn2).not.toBe(turn1);
216
+
217
+ controller.abort();
218
+ await session.close();
219
+ });
220
+
221
+ test("cancel() after done is a no-op on the wire (avoids Cartesia's 'context ID does not exist' 400)", async () => {
222
+ const { session, controller } = await openSession();
223
+ const turn1 = session._currentContextId();
224
+
225
+ session.sendText("hello");
226
+ session.flush();
227
+ await flush();
228
+
229
+ // Cartesia finishes synthesizing and emits `done` for the flushed context.
230
+ const ws = session._ws as unknown as { _fire(event: string, payload: unknown): void };
231
+ ws._fire("done", { context_id: turn1 });
232
+
233
+ // A late cancel (e.g. client `cancel` event after the turn completed
234
+ // normally) must not re-send `context.cancel()` — doing so would trip
235
+ // Cartesia's 400 and kill the session via onTtsError → terminate.
236
+ session.cancel();
237
+ await flush();
238
+
239
+ const cancels = sends.filter((s) => s.kind === "cancel");
240
+ expect(cancels).toEqual([]);
241
+
242
+ controller.abort();
243
+ await session.close();
244
+ });
245
+
246
+ test("double cancel() only sends one wire cancel", async () => {
247
+ const { session, controller } = await openSession();
248
+ const turn1 = session._currentContextId();
249
+
250
+ session.sendText("hello");
251
+ session.cancel();
252
+ session.cancel();
253
+ await flush();
254
+
255
+ const cancels = sends.filter((s) => s.kind === "cancel");
256
+ expect(cancels).toEqual([{ kind: "cancel", contextId: turn1 }]);
206
257
 
207
258
  controller.abort();
208
259
  await session.close();
@@ -107,9 +107,21 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
107
107
  * Reset whenever a fresh context is minted (i.e. at turn boundaries).
108
108
  */
109
109
  let doneEmitted = false;
110
+ /**
111
+ * After `flush()` or `cancel()`, the current context is done accepting
112
+ * input. We defer minting a fresh one until the next `sendText()` so
113
+ * that late audio chunks + Cartesia's real `done` event (both tagged
114
+ * with the flushed context's id) still pass the filter below. Rotating
115
+ * eagerly would silently drop all audio still in flight.
116
+ */
117
+ let rotatePending = false;
110
118
  const rotateContext = () => {
111
119
  context = mintContext();
112
120
  doneEmitted = false;
121
+ rotatePending = false;
122
+ };
123
+ const rotateIfPending = () => {
124
+ if (rotatePending) rotateContext();
113
125
  };
114
126
  const emitDoneOnce = () => {
115
127
  if (doneEmitted || closed) return;
@@ -179,33 +191,43 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
179
191
  const session: CartesiaSession = {
180
192
  sendText(text: string) {
181
193
  if (closed || text.length === 0) return;
194
+ // First sendText after a flush/cancel starts a fresh context so
195
+ // we don't append to one that's already been finalized.
196
+ rotateIfPending();
182
197
  void context
183
198
  .send({ ...baseRequest, transcript: text, continue: true })
184
199
  .catch(ignoreRejection);
185
200
  },
186
201
  flush() {
187
- if (closed) return;
202
+ if (closed || rotatePending) return;
188
203
  // Empty transcript with `continue: false` is the canonical
189
- // end-of-turn signal. Cartesia replies with a `done` tagged
190
- // by context_id, driving `emitDoneOnce`. The microtask
191
- // fallback guards against a dropped server event wedging
192
- // the orchestrator's state machine.
193
- // TODO: drop the microtask fallback once we've verified
194
- // Cartesia always emits `done` for cleanly-flushed contexts.
204
+ // end-of-turn signal. Cartesia finishes synthesizing whatever
205
+ // is queued and then emits a `done` tagged with the same
206
+ // context_id — at that point `emitDoneOnce` fires for real.
207
+ // Defer rotation so the filter below still accepts in-flight
208
+ // audio chunks and the real `done` event.
195
209
  void context
196
210
  .send({ ...baseRequest, transcript: "", continue: false })
197
211
  .catch(ignoreRejection);
198
- queueMicrotask(emitDoneOnce);
199
- rotateContext();
212
+ rotatePending = true;
200
213
  },
201
214
  cancel() {
202
215
  if (closed) return;
203
- void context.cancel().catch(ignoreRejection);
216
+ // Skip the wire cancel if the context is already final on
217
+ // Cartesia's side (natural `done` after flush, or a prior
218
+ // cancel). Cartesia responds to cancel on a retired context
219
+ // with a 400 "context ID does not exist", which our error
220
+ // listener surfaces as `tts_stream_error` and the pipeline
221
+ // treats as fatal — killing the session for a benign race.
222
+ if (!doneEmitted) {
223
+ void context.cancel().catch(ignoreRejection);
224
+ }
204
225
  // Emit synchronously: barge-in advances the orchestrator's
205
226
  // state machine on `done`, and delaying it would audibly
206
- // stall subsequent turns.
227
+ // stall subsequent turns. Cartesia stops producing audio
228
+ // after cancel, so dropping any late chunks is fine.
207
229
  emitDoneOnce();
208
- rotateContext();
230
+ rotatePending = true;
209
231
  },
210
232
  on(event, fn) {
211
233
  return emitter.on(event, fn);
package/host/runtime.ts CHANGED
@@ -326,6 +326,9 @@ export function createRuntime(opts: RuntimeOptions): Runtime {
326
326
  tts: pipelineProviders.tts,
327
327
  sttApiKey: resolveApiKey("ASSEMBLYAI_API_KEY", env),
328
328
  ttsApiKey: resolveApiKey("CARTESIA_API_KEY", env),
329
+ sttSampleRate: s2sConfig.inputSampleRate,
330
+ ttsSampleRate: s2sConfig.outputSampleRate,
331
+ skipGreeting: sessionOpts.skipGreeting ?? false,
329
332
  logger,
330
333
  });
331
334
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@alexkroman1/aai",
3
- "version": "1.4.1",
3
+ "version": "1.4.3",
4
4
  "type": "module",
5
5
  "exports": {
6
6
  ".": {