@alexkroman1/aai 1.4.2 → 1.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +4 -4
- package/CHANGELOG.md +12 -0
- package/dist/host/runtime-barrel.js +17 -5
- package/host/providers/tts/cartesia.test.ts +41 -0
- package/host/providers/tts/cartesia.ts +9 -1
- package/host/s2s.test.ts +17 -2
- package/host/s2s.ts +22 -4
- package/host/session.test.ts +17 -0
- package/host/session.ts +8 -0
- package/package.json +1 -1
package/.turbo/turbo-build.log
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
|
|
2
|
-
> @alexkroman1/aai@1.4.2 build /home/runner/work/agent/agent/packages/aai
|
|
2
|
+
> @alexkroman1/aai@1.4.4 build /home/runner/work/agent/agent/packages/aai
|
|
3
3
|
> tsdown && tsc -p tsconfig.build.json
|
|
4
4
|
|
|
5
5
|
[34mℹ[39m [34mtsdown v0.21.7[39m powered by [38;2;255;126;23mrolldown v1.0.0-rc.12[39m
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
[34mℹ[39m target: [34mnode22[39m
|
|
9
9
|
[34mℹ[39m tsconfig: [34mtsconfig.json[39m
|
|
10
10
|
[34mℹ[39m Build start
|
|
11
|
-
[34mℹ[39m [2mdist/[22m[1mhost/runtime-barrel.js[22m [
|
|
11
|
+
[34mℹ[39m [2mdist/[22m[1mhost/runtime-barrel.js[22m [2m76.29 kB[22m [2m│ gzip: 22.68 kB[22m
|
|
12
12
|
[34mℹ[39m [2mdist/[22m[1msdk/protocol.js[22m [2m 4.75 kB[22m [2m│ gzip: 1.76 kB[22m
|
|
13
13
|
[34mℹ[39m [2mdist/[22m[1mindex.js[22m [2m 2.88 kB[22m [2m│ gzip: 1.24 kB[22m
|
|
14
14
|
[34mℹ[39m [2mdist/[22m[1msdk/manifest-barrel.js[22m [2m 0.36 kB[22m [2m│ gzip: 0.20 kB[22m
|
|
@@ -22,5 +22,5 @@
|
|
|
22
22
|
[34mℹ[39m [2mdist/[22massemblyai-Cxg9eobY.js [2m 0.53 kB[22m [2m│ gzip: 0.35 kB[22m
|
|
23
23
|
[34mℹ[39m [2mdist/[22manthropic-BrUCPKUc.js [2m 0.23 kB[22m [2m│ gzip: 0.18 kB[22m
|
|
24
24
|
[34mℹ[39m [2mdist/[22mcartesia-DwDk2tEu.js [2m 0.22 kB[22m [2m│ gzip: 0.17 kB[22m
|
|
25
|
-
[34mℹ[39m 14 files, total:
|
|
26
|
-
[32m✔[39m Build complete in [
|
|
25
|
+
[34mℹ[39m 14 files, total: 101.11 kB
|
|
26
|
+
[32m✔[39m Build complete in [32m41ms[39m
|
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,17 @@
|
|
|
1
1
|
# @alexkroman1/aai
|
|
2
2
|
|
|
3
|
+
## 1.4.4
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- 74341a4: fix(aai): dedup duplicate S2S reply.done and speech.stopped events to prevent client-side cascades in the voice session wire protocol
|
|
8
|
+
|
|
9
|
+
## 1.4.3
|
|
10
|
+
|
|
11
|
+
### Patch Changes
|
|
12
|
+
|
|
13
|
+
- 62d5a99: Fix pipeline mode: play greeting, emit a single agent_transcript per turn, open TTS at the client's playback sample rate, stop the Cartesia adapter from eagerly rotating its context (which was silently dropping in-flight audio chunks), and skip the wire `context.cancel()` when the context is already final on Cartesia's side (avoids a benign 400 that was killing the session).
|
|
14
|
+
|
|
3
15
|
## 1.4.2
|
|
4
16
|
|
|
5
17
|
### Patch Changes
|
|
package/dist/host/runtime-barrel.js
CHANGED
|
@@ -1200,7 +1200,7 @@ function openCartesia(opts) {
|
|
|
1200
1200
|
},
|
|
1201
1201
|
cancel() {
|
|
1202
1202
|
if (closed) return;
|
|
1203
|
-
context.cancel().catch(ignoreRejection);
|
|
1203
|
+
if (!doneEmitted) context.cancel().catch(ignoreRejection);
|
|
1204
1204
|
emitDoneOnce();
|
|
1205
1205
|
rotatePending = true;
|
|
1206
1206
|
},
|
|
@@ -1326,17 +1326,23 @@ function parseS2sMessage(obj) {
|
|
|
1326
1326
|
const result = S2sMessageSchema.safeParse(obj);
|
|
1327
1327
|
return result.success ? result.data : void 0;
|
|
1328
1328
|
}
|
|
1329
|
-
function dispatchS2sMessage(emitter, msg) {
|
|
1329
|
+
function dispatchS2sMessage(emitter, msg, state) {
|
|
1330
1330
|
switch (msg.type) {
|
|
1331
1331
|
case "session.ready":
|
|
1332
1332
|
emitter.emit("ready", { sessionId: msg.session_id });
|
|
1333
1333
|
break;
|
|
1334
1334
|
case "session.updated": break;
|
|
1335
1335
|
case "input.speech.started":
|
|
1336
|
-
|
|
1336
|
+
if (!state.speechActive) {
|
|
1337
|
+
state.speechActive = true;
|
|
1338
|
+
emitter.emit("event", { type: "speech_started" });
|
|
1339
|
+
}
|
|
1337
1340
|
break;
|
|
1338
1341
|
case "input.speech.stopped":
|
|
1339
|
-
|
|
1342
|
+
if (state.speechActive) {
|
|
1343
|
+
state.speechActive = false;
|
|
1344
|
+
emitter.emit("event", { type: "speech_stopped" });
|
|
1345
|
+
}
|
|
1340
1346
|
break;
|
|
1341
1347
|
case "transcript.user":
|
|
1342
1348
|
emitter.emit("event", {
|
|
@@ -1382,6 +1388,7 @@ function connectS2s(opts) {
|
|
|
1382
1388
|
log.info("S2S connecting", { url: config.wssUrl });
|
|
1383
1389
|
const ws = createWebSocket(config.wssUrl, { headers: { Authorization: `Bearer ${apiKey}` } });
|
|
1384
1390
|
const emitter = createNanoEvents();
|
|
1391
|
+
const dispatchState = { speechActive: false };
|
|
1385
1392
|
let opened = false;
|
|
1386
1393
|
function send(msg) {
|
|
1387
1394
|
if (ws.readyState !== 1) {
|
|
@@ -1478,7 +1485,7 @@ function connectS2s(opts) {
|
|
|
1478
1485
|
log.warn(`S2S << unrecognised message type: ${obj.type ?? JSON.stringify(raw).slice(0, 200)}`);
|
|
1479
1486
|
return;
|
|
1480
1487
|
}
|
|
1481
|
-
dispatchS2sMessage(emitter, parsed);
|
|
1488
|
+
dispatchS2sMessage(emitter, parsed, dispatchState);
|
|
1482
1489
|
}
|
|
1483
1490
|
ws.addEventListener("message", handleS2sMessage);
|
|
1484
1491
|
ws.addEventListener("close", (ev) => {
|
|
@@ -1616,6 +1623,10 @@ function handleReplyCancelled(ctx) {
|
|
|
1616
1623
|
}
|
|
1617
1624
|
function handleReplyDone(ctx) {
|
|
1618
1625
|
const doneReplyId = ctx.reply.currentReplyId;
|
|
1626
|
+
if (doneReplyId === null) {
|
|
1627
|
+
ctx.log.debug("Dropping duplicate reply.done (no active reply)");
|
|
1628
|
+
return;
|
|
1629
|
+
}
|
|
1619
1630
|
const sendPending = () => {
|
|
1620
1631
|
if (ctx.reply.currentReplyId !== doneReplyId) {
|
|
1621
1632
|
ctx.reply.pendingTools = [];
|
|
@@ -1632,6 +1643,7 @@ function handleReplyDone(ctx) {
|
|
|
1632
1643
|
});
|
|
1633
1644
|
ctx.client.playAudioDone();
|
|
1634
1645
|
ctx.client.event({ type: "reply_done" });
|
|
1646
|
+
ctx.reply.currentReplyId = null;
|
|
1635
1647
|
}
|
|
1636
1648
|
};
|
|
1637
1649
|
if (ctx.turnPromise !== null) ctx.turnPromise.then(sendPending);
|
|
package/host/providers/tts/cartesia.test.ts
CHANGED
|
@@ -217,4 +217,45 @@ describe("cartesia TTS adapter", () => {
|
|
|
217
217
|
controller.abort();
|
|
218
218
|
await session.close();
|
|
219
219
|
});
|
|
220
|
+
|
|
221
|
+
test("cancel() after done is a no-op on the wire (avoids Cartesia's 'context ID does not exist' 400)", async () => {
|
|
222
|
+
const { session, controller } = await openSession();
|
|
223
|
+
const turn1 = session._currentContextId();
|
|
224
|
+
|
|
225
|
+
session.sendText("hello");
|
|
226
|
+
session.flush();
|
|
227
|
+
await flush();
|
|
228
|
+
|
|
229
|
+
// Cartesia finishes synthesizing and emits `done` for the flushed context.
|
|
230
|
+
const ws = session._ws as unknown as { _fire(event: string, payload: unknown): void };
|
|
231
|
+
ws._fire("done", { context_id: turn1 });
|
|
232
|
+
|
|
233
|
+
// A late cancel (e.g. client `cancel` event after the turn completed
|
|
234
|
+
// normally) must not re-send `context.cancel()` — doing so would trip
|
|
235
|
+
// Cartesia's 400 and kill the session via onTtsError → terminate.
|
|
236
|
+
session.cancel();
|
|
237
|
+
await flush();
|
|
238
|
+
|
|
239
|
+
const cancels = sends.filter((s) => s.kind === "cancel");
|
|
240
|
+
expect(cancels).toEqual([]);
|
|
241
|
+
|
|
242
|
+
controller.abort();
|
|
243
|
+
await session.close();
|
|
244
|
+
});
|
|
245
|
+
|
|
246
|
+
test("double cancel() only sends one wire cancel", async () => {
|
|
247
|
+
const { session, controller } = await openSession();
|
|
248
|
+
const turn1 = session._currentContextId();
|
|
249
|
+
|
|
250
|
+
session.sendText("hello");
|
|
251
|
+
session.cancel();
|
|
252
|
+
session.cancel();
|
|
253
|
+
await flush();
|
|
254
|
+
|
|
255
|
+
const cancels = sends.filter((s) => s.kind === "cancel");
|
|
256
|
+
expect(cancels).toEqual([{ kind: "cancel", contextId: turn1 }]);
|
|
257
|
+
|
|
258
|
+
controller.abort();
|
|
259
|
+
await session.close();
|
|
260
|
+
});
|
|
220
261
|
});
|
|
package/host/providers/tts/cartesia.ts
CHANGED
|
@@ -213,7 +213,15 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
|
|
|
213
213
|
},
|
|
214
214
|
cancel() {
|
|
215
215
|
if (closed) return;
|
|
216
|
-
|
|
216
|
+
// Skip the wire cancel if the context is already final on
|
|
217
|
+
// Cartesia's side (natural `done` after flush, or a prior
|
|
218
|
+
// cancel). Cartesia responds to cancel on a retired context
|
|
219
|
+
// with a 400 "context ID does not exist", which our error
|
|
220
|
+
// listener surfaces as `tts_stream_error` and the pipeline
|
|
221
|
+
// treats as fatal — killing the session for a benign race.
|
|
222
|
+
if (!doneEmitted) {
|
|
223
|
+
void context.cancel().catch(ignoreRejection);
|
|
224
|
+
}
|
|
217
225
|
// Emit synchronously: barge-in advances the orchestrator's
|
|
218
226
|
// state machine on `done`, and delaying it would audibly
|
|
219
227
|
// stall subsequent turns. Cartesia stops producing audio
|
package/host/s2s.test.ts
CHANGED
|
@@ -219,10 +219,25 @@ describe("connectS2s", () => {
|
|
|
219
219
|
const handler = vi.fn();
|
|
220
220
|
handle.on("event", handler);
|
|
221
221
|
|
|
222
|
+
// Prime VAD state — speech_stopped is only forwarded after a speech_started.
|
|
223
|
+
raw.emit("message", Buffer.from(JSON.stringify({ type: "input.speech.started" })));
|
|
222
224
|
raw.emit("message", Buffer.from(JSON.stringify({ type: "input.speech.stopped" })));
|
|
223
225
|
|
|
224
|
-
expect(handler).
|
|
225
|
-
expect(handler.mock.calls[0]?.[0]).toEqual({ type: "speech_stopped" });
|
|
226
|
+
expect(handler).toHaveBeenCalledTimes(2);
|
|
227
|
+
expect(handler.mock.calls[0]?.[0]).toEqual({ type: "speech_started" });
|
|
228
|
+
expect(handler.mock.calls[1]?.[0]).toEqual({ type: "speech_stopped" });
|
|
229
|
+
});
|
|
230
|
+
|
|
231
|
+
test("duplicate input.speech.stopped is suppressed", async () => {
|
|
232
|
+
const { raw, handle } = await setupHandle();
|
|
233
|
+
const handler = vi.fn();
|
|
234
|
+
handle.on("event", handler);
|
|
235
|
+
|
|
236
|
+
raw.emit("message", Buffer.from(JSON.stringify({ type: "input.speech.started" })));
|
|
237
|
+
raw.emit("message", Buffer.from(JSON.stringify({ type: "input.speech.stopped" })));
|
|
238
|
+
raw.emit("message", Buffer.from(JSON.stringify({ type: "input.speech.stopped" })));
|
|
239
|
+
|
|
240
|
+
expect(handler.mock.calls.filter((c) => c[0].type === "speech_stopped")).toHaveLength(1);
|
|
226
241
|
});
|
|
227
242
|
|
|
228
243
|
test("transcript.user dispatches 'event' with user_transcript", async () => {
|
package/host/s2s.ts
CHANGED
|
@@ -79,7 +79,18 @@ function parseS2sMessage(obj: Record<string, unknown>): S2sServerMessage | undef
|
|
|
79
79
|
*/
|
|
80
80
|
export type S2sEvent = ClientEvent & { _interrupted?: boolean };
|
|
81
81
|
|
|
82
|
-
|
|
82
|
+
/**
|
|
83
|
+
* Per-connection dispatch state. Used to dedup events that the upstream S2S
|
|
84
|
+
* service may emit more than once for a single logical turn (e.g. repeated
|
|
85
|
+
* `input.speech.stopped` after the VAD flips).
|
|
86
|
+
*/
|
|
87
|
+
type DispatchState = { speechActive: boolean };
|
|
88
|
+
|
|
89
|
+
function dispatchS2sMessage(
|
|
90
|
+
emitter: Emitter<S2sEvents>,
|
|
91
|
+
msg: S2sServerMessage,
|
|
92
|
+
state: DispatchState,
|
|
93
|
+
): void {
|
|
83
94
|
switch (msg.type) {
|
|
84
95
|
case "session.ready":
|
|
85
96
|
emitter.emit("ready", { sessionId: msg.session_id });
|
|
@@ -87,10 +98,16 @@ function dispatchS2sMessage(emitter: Emitter<S2sEvents>, msg: S2sServerMessage):
|
|
|
87
98
|
case "session.updated":
|
|
88
99
|
break;
|
|
89
100
|
case "input.speech.started":
|
|
90
|
-
|
|
101
|
+
if (!state.speechActive) {
|
|
102
|
+
state.speechActive = true;
|
|
103
|
+
emitter.emit("event", { type: "speech_started" });
|
|
104
|
+
}
|
|
91
105
|
break;
|
|
92
106
|
case "input.speech.stopped":
|
|
93
|
-
|
|
107
|
+
if (state.speechActive) {
|
|
108
|
+
state.speechActive = false;
|
|
109
|
+
emitter.emit("event", { type: "speech_stopped" });
|
|
110
|
+
}
|
|
94
111
|
break;
|
|
95
112
|
case "transcript.user":
|
|
96
113
|
emitter.emit("event", { type: "user_transcript", text: msg.text });
|
|
@@ -188,6 +205,7 @@ export function connectS2s(opts: ConnectS2sOptions): Promise<S2sHandle> {
|
|
|
188
205
|
});
|
|
189
206
|
|
|
190
207
|
const emitter = createNanoEvents<S2sEvents>();
|
|
208
|
+
const dispatchState: DispatchState = { speechActive: false };
|
|
191
209
|
let opened = false;
|
|
192
210
|
|
|
193
211
|
function send(msg: { type: string; [key: string]: unknown }): void {
|
|
@@ -291,7 +309,7 @@ export function connectS2s(opts: ConnectS2sOptions): Promise<S2sHandle> {
|
|
|
291
309
|
);
|
|
292
310
|
return;
|
|
293
311
|
}
|
|
294
|
-
dispatchS2sMessage(emitter, parsed);
|
|
312
|
+
dispatchS2sMessage(emitter, parsed, dispatchState);
|
|
295
313
|
}
|
|
296
314
|
|
|
297
315
|
ws.addEventListener("message", handleS2sMessage);
|
package/host/session.test.ts
CHANGED
|
@@ -168,12 +168,29 @@ describe("createS2sSession", () => {
|
|
|
168
168
|
const { session, client, mockHandle } = setup();
|
|
169
169
|
await session.start();
|
|
170
170
|
|
|
171
|
+
mockHandle._fire("replyStarted", { replyId: "r1" });
|
|
171
172
|
mockHandle._fire("event", { type: "reply_done" });
|
|
172
173
|
|
|
173
174
|
expect(client.audioDoneCount).toBe(1);
|
|
174
175
|
expect(client.events).toContainEvent("reply_done");
|
|
175
176
|
});
|
|
176
177
|
|
|
178
|
+
test("duplicate reply_done is suppressed after reply completes", async () => {
|
|
179
|
+
const { session, client, mockHandle } = setup();
|
|
180
|
+
await session.start();
|
|
181
|
+
|
|
182
|
+
mockHandle._fire("replyStarted", { replyId: "r1" });
|
|
183
|
+
mockHandle._fire("event", { type: "reply_done" });
|
|
184
|
+
mockHandle._fire("event", { type: "reply_done" });
|
|
185
|
+
|
|
186
|
+
const replyDones = client.events.filter(
|
|
187
|
+
(e): e is { type: string } =>
|
|
188
|
+
typeof e === "object" && e !== null && "type" in e && e.type === "reply_done",
|
|
189
|
+
);
|
|
190
|
+
expect(replyDones).toHaveLength(1);
|
|
191
|
+
expect(client.audioDoneCount).toBe(1);
|
|
192
|
+
});
|
|
193
|
+
|
|
177
194
|
test("cancelled event emits cancelled", async () => {
|
|
178
195
|
const { session, client, mockHandle } = setup();
|
|
179
196
|
await session.start();
|
package/host/session.ts
CHANGED
|
@@ -181,6 +181,12 @@ function handleReplyCancelled(ctx: S2sSessionCtx): void {
|
|
|
181
181
|
|
|
182
182
|
function handleReplyDone(ctx: S2sSessionCtx): void {
|
|
183
183
|
const doneReplyId = ctx.reply.currentReplyId;
|
|
184
|
+
// Dedup duplicate reply.done events from the S2S service: once the reply
|
|
185
|
+
// has been fully dispatched (or was never started), currentReplyId is null.
|
|
186
|
+
if (doneReplyId === null) {
|
|
187
|
+
ctx.log.debug("Dropping duplicate reply.done (no active reply)");
|
|
188
|
+
return;
|
|
189
|
+
}
|
|
184
190
|
const sendPending = () => {
|
|
185
191
|
if (ctx.reply.currentReplyId !== doneReplyId) {
|
|
186
192
|
ctx.reply.pendingTools = [];
|
|
@@ -196,6 +202,8 @@ function handleReplyDone(ctx: S2sSessionCtx): void {
|
|
|
196
202
|
}
|
|
197
203
|
ctx.client.playAudioDone();
|
|
198
204
|
ctx.client.event({ type: "reply_done" });
|
|
205
|
+
// Mark reply as finished so any repeated reply.done is dropped above.
|
|
206
|
+
ctx.reply.currentReplyId = null;
|
|
199
207
|
}
|
|
200
208
|
};
|
|
201
209
|
if (ctx.turnPromise !== null) {
|