@alexkroman1/aai 1.4.2 → 1.4.4

This diff shows the contents of publicly available package versions that have been released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
@@ -1,5 +1,5 @@
1
1
 
2
- > @alexkroman1/aai@1.4.2 build /home/runner/work/agent/agent/packages/aai
2
+ > @alexkroman1/aai@1.4.4 build /home/runner/work/agent/agent/packages/aai
3
3
  > tsdown && tsc -p tsconfig.build.json
4
4
 
5
5
  ℹ tsdown v0.21.7 powered by rolldown v1.0.0-rc.12
@@ -8,7 +8,7 @@
8
8
  ℹ target: node22
9
9
  ℹ tsconfig: tsconfig.json
10
10
  ℹ Build start
11
- ℹ dist/host/runtime-barrel.js 75.92 kB │ gzip: 22.48 kB
11
+ ℹ dist/host/runtime-barrel.js 76.29 kB │ gzip: 22.68 kB
12
12
  ℹ dist/sdk/protocol.js  4.75 kB │ gzip: 1.76 kB
13
13
  ℹ dist/index.js  2.88 kB │ gzip: 1.24 kB
14
14
  ℹ dist/sdk/manifest-barrel.js  0.36 kB │ gzip: 0.20 kB
@@ -22,5 +22,5 @@
22
22
  ℹ dist/assemblyai-Cxg9eobY.js  0.53 kB │ gzip: 0.35 kB
23
23
  ℹ dist/anthropic-BrUCPKUc.js  0.23 kB │ gzip: 0.18 kB
24
24
  ℹ dist/cartesia-DwDk2tEu.js  0.22 kB │ gzip: 0.17 kB
25
- ℹ 14 files, total: 100.74 kB
26
- ✔ Build complete in 48ms
25
+ ℹ 14 files, total: 101.11 kB
26
+ ✔ Build complete in 41ms
package/CHANGELOG.md CHANGED
@@ -1,5 +1,17 @@
1
1
  # @alexkroman1/aai
2
2
 
3
+ ## 1.4.4
4
+
5
+ ### Patch Changes
6
+
7
+ - 74341a4: fix(aai): deduplicate repeated S2S `reply.done` and `input.speech.stopped` events to prevent client-side cascades in the voice session wire protocol
8
+
9
+ ## 1.4.3
10
+
11
+ ### Patch Changes
12
+
13
+ - 62d5a99: Fix pipeline mode: play greeting, emit a single agent_transcript per turn, open TTS at the client's playback sample rate, stop the Cartesia adapter from eagerly rotating its context (which was silently dropping in-flight audio chunks), and skip the wire `context.cancel()` when the context is already final on Cartesia's side (avoids a benign 400 that was killing the session).
14
+
3
15
  ## 1.4.2
4
16
 
5
17
  ### Patch Changes
@@ -1200,7 +1200,7 @@ function openCartesia(opts) {
1200
1200
  },
1201
1201
  cancel() {
1202
1202
  if (closed) return;
1203
- context.cancel().catch(ignoreRejection);
1203
+ if (!doneEmitted) context.cancel().catch(ignoreRejection);
1204
1204
  emitDoneOnce();
1205
1205
  rotatePending = true;
1206
1206
  },
@@ -1326,17 +1326,23 @@ function parseS2sMessage(obj) {
1326
1326
  const result = S2sMessageSchema.safeParse(obj);
1327
1327
  return result.success ? result.data : void 0;
1328
1328
  }
1329
- function dispatchS2sMessage(emitter, msg) {
1329
+ function dispatchS2sMessage(emitter, msg, state) {
1330
1330
  switch (msg.type) {
1331
1331
  case "session.ready":
1332
1332
  emitter.emit("ready", { sessionId: msg.session_id });
1333
1333
  break;
1334
1334
  case "session.updated": break;
1335
1335
  case "input.speech.started":
1336
- emitter.emit("event", { type: "speech_started" });
1336
+ if (!state.speechActive) {
1337
+ state.speechActive = true;
1338
+ emitter.emit("event", { type: "speech_started" });
1339
+ }
1337
1340
  break;
1338
1341
  case "input.speech.stopped":
1339
- emitter.emit("event", { type: "speech_stopped" });
1342
+ if (state.speechActive) {
1343
+ state.speechActive = false;
1344
+ emitter.emit("event", { type: "speech_stopped" });
1345
+ }
1340
1346
  break;
1341
1347
  case "transcript.user":
1342
1348
  emitter.emit("event", {
@@ -1382,6 +1388,7 @@ function connectS2s(opts) {
1382
1388
  log.info("S2S connecting", { url: config.wssUrl });
1383
1389
  const ws = createWebSocket(config.wssUrl, { headers: { Authorization: `Bearer ${apiKey}` } });
1384
1390
  const emitter = createNanoEvents();
1391
+ const dispatchState = { speechActive: false };
1385
1392
  let opened = false;
1386
1393
  function send(msg) {
1387
1394
  if (ws.readyState !== 1) {
@@ -1478,7 +1485,7 @@ function connectS2s(opts) {
1478
1485
  log.warn(`S2S << unrecognised message type: ${obj.type ?? JSON.stringify(raw).slice(0, 200)}`);
1479
1486
  return;
1480
1487
  }
1481
- dispatchS2sMessage(emitter, parsed);
1488
+ dispatchS2sMessage(emitter, parsed, dispatchState);
1482
1489
  }
1483
1490
  ws.addEventListener("message", handleS2sMessage);
1484
1491
  ws.addEventListener("close", (ev) => {
@@ -1616,6 +1623,10 @@ function handleReplyCancelled(ctx) {
1616
1623
  }
1617
1624
  function handleReplyDone(ctx) {
1618
1625
  const doneReplyId = ctx.reply.currentReplyId;
1626
+ if (doneReplyId === null) {
1627
+ ctx.log.debug("Dropping duplicate reply.done (no active reply)");
1628
+ return;
1629
+ }
1619
1630
  const sendPending = () => {
1620
1631
  if (ctx.reply.currentReplyId !== doneReplyId) {
1621
1632
  ctx.reply.pendingTools = [];
@@ -1632,6 +1643,7 @@ function handleReplyDone(ctx) {
1632
1643
  });
1633
1644
  ctx.client.playAudioDone();
1634
1645
  ctx.client.event({ type: "reply_done" });
1646
+ ctx.reply.currentReplyId = null;
1635
1647
  }
1636
1648
  };
1637
1649
  if (ctx.turnPromise !== null) ctx.turnPromise.then(sendPending);
@@ -217,4 +217,45 @@ describe("cartesia TTS adapter", () => {
217
217
  controller.abort();
218
218
  await session.close();
219
219
  });
220
+
221
+ test("cancel() after done is a no-op on the wire (avoids Cartesia's 'context ID does not exist' 400)", async () => {
222
+ const { session, controller } = await openSession();
223
+ const turn1 = session._currentContextId();
224
+
225
+ session.sendText("hello");
226
+ session.flush();
227
+ await flush();
228
+
229
+ // Cartesia finishes synthesizing and emits `done` for the flushed context.
230
+ const ws = session._ws as unknown as { _fire(event: string, payload: unknown): void };
231
+ ws._fire("done", { context_id: turn1 });
232
+
233
+ // A late cancel (e.g. client `cancel` event after the turn completed
234
+ // normally) must not re-send `context.cancel()` — doing so would trip
235
+ // Cartesia's 400 and kill the session via onTtsError → terminate.
236
+ session.cancel();
237
+ await flush();
238
+
239
+ const cancels = sends.filter((s) => s.kind === "cancel");
240
+ expect(cancels).toEqual([]);
241
+
242
+ controller.abort();
243
+ await session.close();
244
+ });
245
+
246
+ test("double cancel() only sends one wire cancel", async () => {
247
+ const { session, controller } = await openSession();
248
+ const turn1 = session._currentContextId();
249
+
250
+ session.sendText("hello");
251
+ session.cancel();
252
+ session.cancel();
253
+ await flush();
254
+
255
+ const cancels = sends.filter((s) => s.kind === "cancel");
256
+ expect(cancels).toEqual([{ kind: "cancel", contextId: turn1 }]);
257
+
258
+ controller.abort();
259
+ await session.close();
260
+ });
220
261
  });
@@ -213,7 +213,15 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
213
213
  },
214
214
  cancel() {
215
215
  if (closed) return;
216
- void context.cancel().catch(ignoreRejection);
216
+ // Skip the wire cancel if the context is already final on
217
+ // Cartesia's side (natural `done` after flush, or a prior
218
+ // cancel). Cartesia responds to cancel on a retired context
219
+ // with a 400 "context ID does not exist", which our error
220
+ // listener surfaces as `tts_stream_error` and the pipeline
221
+ // treats as fatal — killing the session for a benign race.
222
+ if (!doneEmitted) {
223
+ void context.cancel().catch(ignoreRejection);
224
+ }
217
225
  // Emit synchronously: barge-in advances the orchestrator's
218
226
  // state machine on `done`, and delaying it would audibly
219
227
  // stall subsequent turns. Cartesia stops producing audio
package/host/s2s.test.ts CHANGED
@@ -219,10 +219,25 @@ describe("connectS2s", () => {
219
219
  const handler = vi.fn();
220
220
  handle.on("event", handler);
221
221
 
222
+ // Prime VAD state — speech_stopped is only forwarded after a speech_started.
223
+ raw.emit("message", Buffer.from(JSON.stringify({ type: "input.speech.started" })));
222
224
  raw.emit("message", Buffer.from(JSON.stringify({ type: "input.speech.stopped" })));
223
225
 
224
- expect(handler).toHaveBeenCalledOnce();
225
- expect(handler.mock.calls[0]?.[0]).toEqual({ type: "speech_stopped" });
226
+ expect(handler).toHaveBeenCalledTimes(2);
227
+ expect(handler.mock.calls[0]?.[0]).toEqual({ type: "speech_started" });
228
+ expect(handler.mock.calls[1]?.[0]).toEqual({ type: "speech_stopped" });
229
+ });
230
+
231
+ test("duplicate input.speech.stopped is suppressed", async () => {
232
+ const { raw, handle } = await setupHandle();
233
+ const handler = vi.fn();
234
+ handle.on("event", handler);
235
+
236
+ raw.emit("message", Buffer.from(JSON.stringify({ type: "input.speech.started" })));
237
+ raw.emit("message", Buffer.from(JSON.stringify({ type: "input.speech.stopped" })));
238
+ raw.emit("message", Buffer.from(JSON.stringify({ type: "input.speech.stopped" })));
239
+
240
+ expect(handler.mock.calls.filter((c) => c[0].type === "speech_stopped")).toHaveLength(1);
226
241
  });
227
242
 
228
243
  test("transcript.user dispatches 'event' with user_transcript", async () => {
package/host/s2s.ts CHANGED
@@ -79,7 +79,18 @@ function parseS2sMessage(obj: Record<string, unknown>): S2sServerMessage | undef
79
79
  */
80
80
  export type S2sEvent = ClientEvent & { _interrupted?: boolean };
81
81
 
82
- function dispatchS2sMessage(emitter: Emitter<S2sEvents>, msg: S2sServerMessage): void {
82
+ /**
83
+ * Per-connection dispatch state. Used to dedup events that the upstream S2S
84
+ * service may emit more than once for a single logical turn (e.g. repeated
85
+ * `input.speech.stopped` after the VAD flips).
86
+ */
87
+ type DispatchState = { speechActive: boolean };
88
+
89
+ function dispatchS2sMessage(
90
+ emitter: Emitter<S2sEvents>,
91
+ msg: S2sServerMessage,
92
+ state: DispatchState,
93
+ ): void {
83
94
  switch (msg.type) {
84
95
  case "session.ready":
85
96
  emitter.emit("ready", { sessionId: msg.session_id });
@@ -87,10 +98,16 @@ function dispatchS2sMessage(emitter: Emitter<S2sEvents>, msg: S2sServerMessage):
87
98
  case "session.updated":
88
99
  break;
89
100
  case "input.speech.started":
90
- emitter.emit("event", { type: "speech_started" });
101
+ if (!state.speechActive) {
102
+ state.speechActive = true;
103
+ emitter.emit("event", { type: "speech_started" });
104
+ }
91
105
  break;
92
106
  case "input.speech.stopped":
93
- emitter.emit("event", { type: "speech_stopped" });
107
+ if (state.speechActive) {
108
+ state.speechActive = false;
109
+ emitter.emit("event", { type: "speech_stopped" });
110
+ }
94
111
  break;
95
112
  case "transcript.user":
96
113
  emitter.emit("event", { type: "user_transcript", text: msg.text });
@@ -188,6 +205,7 @@ export function connectS2s(opts: ConnectS2sOptions): Promise<S2sHandle> {
188
205
  });
189
206
 
190
207
  const emitter = createNanoEvents<S2sEvents>();
208
+ const dispatchState: DispatchState = { speechActive: false };
191
209
  let opened = false;
192
210
 
193
211
  function send(msg: { type: string; [key: string]: unknown }): void {
@@ -291,7 +309,7 @@ export function connectS2s(opts: ConnectS2sOptions): Promise<S2sHandle> {
291
309
  );
292
310
  return;
293
311
  }
294
- dispatchS2sMessage(emitter, parsed);
312
+ dispatchS2sMessage(emitter, parsed, dispatchState);
295
313
  }
296
314
 
297
315
  ws.addEventListener("message", handleS2sMessage);
@@ -168,12 +168,29 @@ describe("createS2sSession", () => {
168
168
  const { session, client, mockHandle } = setup();
169
169
  await session.start();
170
170
 
171
+ mockHandle._fire("replyStarted", { replyId: "r1" });
171
172
  mockHandle._fire("event", { type: "reply_done" });
172
173
 
173
174
  expect(client.audioDoneCount).toBe(1);
174
175
  expect(client.events).toContainEvent("reply_done");
175
176
  });
176
177
 
178
+ test("duplicate reply_done is suppressed after reply completes", async () => {
179
+ const { session, client, mockHandle } = setup();
180
+ await session.start();
181
+
182
+ mockHandle._fire("replyStarted", { replyId: "r1" });
183
+ mockHandle._fire("event", { type: "reply_done" });
184
+ mockHandle._fire("event", { type: "reply_done" });
185
+
186
+ const replyDones = client.events.filter(
187
+ (e): e is { type: string } =>
188
+ typeof e === "object" && e !== null && "type" in e && e.type === "reply_done",
189
+ );
190
+ expect(replyDones).toHaveLength(1);
191
+ expect(client.audioDoneCount).toBe(1);
192
+ });
193
+
177
194
  test("cancelled event emits cancelled", async () => {
178
195
  const { session, client, mockHandle } = setup();
179
196
  await session.start();
package/host/session.ts CHANGED
@@ -181,6 +181,12 @@ function handleReplyCancelled(ctx: S2sSessionCtx): void {
181
181
 
182
182
  function handleReplyDone(ctx: S2sSessionCtx): void {
183
183
  const doneReplyId = ctx.reply.currentReplyId;
184
+ // Dedup duplicate reply.done events from the S2S service: once the reply
185
+ // has been fully dispatched (or was never started), currentReplyId is null.
186
+ if (doneReplyId === null) {
187
+ ctx.log.debug("Dropping duplicate reply.done (no active reply)");
188
+ return;
189
+ }
184
190
  const sendPending = () => {
185
191
  if (ctx.reply.currentReplyId !== doneReplyId) {
186
192
  ctx.reply.pendingTools = [];
@@ -196,6 +202,8 @@ function handleReplyDone(ctx: S2sSessionCtx): void {
196
202
  }
197
203
  ctx.client.playAudioDone();
198
204
  ctx.client.event({ type: "reply_done" });
205
+ // Mark reply as finished so any repeated reply.done is dropped above.
206
+ ctx.reply.currentReplyId = null;
199
207
  }
200
208
  };
201
209
  if (ctx.turnPromise !== null) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@alexkroman1/aai",
3
- "version": "1.4.2",
3
+ "version": "1.4.4",
4
4
  "type": "module",
5
5
  "exports": {
6
6
  ".": {