@alexkroman1/aai 1.4.1 → 1.4.3
This diff compares package versions that have been publicly released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +3 -3
- package/CHANGELOG.md +12 -0
- package/dist/host/pipeline-session.d.ts +6 -2
- package/dist/host/runtime-barrel.js +66 -14
- package/host/pipeline-session.test.ts +112 -12
- package/host/pipeline-session.ts +53 -9
- package/host/providers/tts/cartesia.test.ts +57 -6
- package/host/providers/tts/cartesia.ts +34 -12
- package/host/runtime.ts +3 -0
- package/package.json +1 -1
package/.turbo/turbo-build.log
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
|
|
2
|
-
> @alexkroman1/aai@1.4.
|
|
2
|
+
> @alexkroman1/aai@1.4.3 build /home/runner/work/agent/agent/packages/aai
|
|
3
3
|
> tsdown && tsc -p tsconfig.build.json
|
|
4
4
|
|
|
5
5
|
[34mℹ[39m [34mtsdown v0.21.7[39m powered by [38;2;255;126;23mrolldown v1.0.0-rc.12[39m
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
[34mℹ[39m target: [34mnode22[39m
|
|
9
9
|
[34mℹ[39m tsconfig: [34mtsconfig.json[39m
|
|
10
10
|
[34mℹ[39m Build start
|
|
11
|
-
[34mℹ[39m [2mdist/[22m[1mhost/runtime-barrel.js[22m [
|
|
11
|
+
[34mℹ[39m [2mdist/[22m[1mhost/runtime-barrel.js[22m [2m75.94 kB[22m [2m│ gzip: 22.51 kB[22m
|
|
12
12
|
[34mℹ[39m [2mdist/[22m[1msdk/protocol.js[22m [2m 4.75 kB[22m [2m│ gzip: 1.76 kB[22m
|
|
13
13
|
[34mℹ[39m [2mdist/[22m[1mindex.js[22m [2m 2.88 kB[22m [2m│ gzip: 1.24 kB[22m
|
|
14
14
|
[34mℹ[39m [2mdist/[22m[1msdk/manifest-barrel.js[22m [2m 0.36 kB[22m [2m│ gzip: 0.20 kB[22m
|
|
@@ -22,5 +22,5 @@
|
|
|
22
22
|
[34mℹ[39m [2mdist/[22massemblyai-Cxg9eobY.js [2m 0.53 kB[22m [2m│ gzip: 0.35 kB[22m
|
|
23
23
|
[34mℹ[39m [2mdist/[22manthropic-BrUCPKUc.js [2m 0.23 kB[22m [2m│ gzip: 0.18 kB[22m
|
|
24
24
|
[34mℹ[39m [2mdist/[22mcartesia-DwDk2tEu.js [2m 0.22 kB[22m [2m│ gzip: 0.17 kB[22m
|
|
25
|
-
[34mℹ[39m 14 files, total:
|
|
25
|
+
[34mℹ[39m 14 files, total: 100.76 kB
|
|
26
26
|
[32m✔[39m Build complete in [32m45ms[39m
|
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,17 @@
|
|
|
1
1
|
# @alexkroman1/aai
|
|
2
2
|
|
|
3
|
+
## 1.4.3
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- 62d5a99: Fix pipeline mode: play greeting, emit a single agent_transcript per turn, open TTS at the client's playback sample rate, stop the Cartesia adapter from eagerly rotating its context (which was silently dropping in-flight audio chunks), and skip the wire `context.cancel()` when the context is already final on Cartesia's side (avoids a benign 400 that was killing the session).
|
|
8
|
+
|
|
9
|
+
## 1.4.2
|
|
10
|
+
|
|
11
|
+
### Patch Changes
|
|
12
|
+
|
|
13
|
+
- f877a6f: Fix pipeline mode: play greeting, emit a single agent_transcript per turn, open TTS at the client's playback sample rate, and stop the Cartesia adapter from eagerly rotating its context (which was silently dropping in-flight audio chunks).
|
|
14
|
+
|
|
3
15
|
## 1.4.1
|
|
4
16
|
|
|
5
17
|
### Patch Changes
|
|
@@ -37,8 +37,12 @@ export interface PipelineSessionOptions {
|
|
|
37
37
|
sttApiKey: string;
|
|
38
38
|
/** TTS API key. */
|
|
39
39
|
ttsApiKey: string;
|
|
40
|
-
/**
|
|
41
|
-
|
|
40
|
+
/** STT audio sample rate (PCM16, Hz). Defaults to {@link DEFAULT_STT_SAMPLE_RATE}. */
|
|
41
|
+
sttSampleRate?: number | undefined;
|
|
42
|
+
/** TTS audio sample rate (PCM16, Hz). Must match the client's playback AudioContext rate. Defaults to {@link DEFAULT_TTS_SAMPLE_RATE}. */
|
|
43
|
+
ttsSampleRate?: number | undefined;
|
|
44
|
+
/** Skip the initial greeting audio on connect (used for session resume). */
|
|
45
|
+
skipGreeting?: boolean | undefined;
|
|
42
46
|
/** Logger. Defaults to the console logger. */
|
|
43
47
|
logger?: Logger | undefined;
|
|
44
48
|
/** Sliding-window conversation history size. */
|
|
@@ -560,10 +560,6 @@ function handleStreamPart(part, deps) {
|
|
|
560
560
|
if (delta.length === 0) return;
|
|
561
561
|
deps.onTextDelta(delta);
|
|
562
562
|
deps.tts?.sendText(delta);
|
|
563
|
-
deps.client.event({
|
|
564
|
-
type: "agent_transcript",
|
|
565
|
-
text: delta
|
|
566
|
-
});
|
|
567
563
|
return;
|
|
568
564
|
}
|
|
569
565
|
case "tool-call": {
|
|
@@ -601,7 +597,8 @@ function handleStreamPart(part, deps) {
|
|
|
601
597
|
/** Create a pluggable-provider voice session. */
|
|
602
598
|
function createPipelineSession(opts) {
|
|
603
599
|
const log = opts.logger ?? consoleLogger;
|
|
604
|
-
const
|
|
600
|
+
const sttSampleRate = opts.sttSampleRate ?? 16e3;
|
|
601
|
+
const ttsSampleRate = opts.ttsSampleRate ?? 24e3;
|
|
605
602
|
const { client, agentConfig, toolSchemas, executeTool } = opts;
|
|
606
603
|
const systemPrompt = buildSystemPrompt(agentConfig, {
|
|
607
604
|
hasTools: toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0,
|
|
@@ -797,15 +794,44 @@ function createPipelineSession(opts) {
|
|
|
797
794
|
if (turnController === ctl) turnController = null;
|
|
798
795
|
return;
|
|
799
796
|
}
|
|
797
|
+
if (accumulated.length > 0) {
|
|
798
|
+
client.event({
|
|
799
|
+
type: "agent_transcript",
|
|
800
|
+
text: accumulated
|
|
801
|
+
});
|
|
802
|
+
ctx.pushMessages({
|
|
803
|
+
role: "assistant",
|
|
804
|
+
content: accumulated
|
|
805
|
+
});
|
|
806
|
+
}
|
|
800
807
|
await flushTtsAndWait(ctl.signal);
|
|
801
808
|
if (ctl.signal.aborted) {
|
|
802
809
|
if (turnController === ctl) turnController = null;
|
|
803
810
|
return;
|
|
804
811
|
}
|
|
805
|
-
|
|
812
|
+
client.playAudioDone();
|
|
813
|
+
client.event({ type: "reply_done" });
|
|
814
|
+
if (turnController === ctl) turnController = null;
|
|
815
|
+
}
|
|
816
|
+
async function runGreeting(text) {
|
|
817
|
+
const replyId = `pipeline-greeting-${++nextReplyId}`;
|
|
818
|
+
ctx.beginReply(replyId);
|
|
819
|
+
const ctl = new AbortController();
|
|
820
|
+
turnController = ctl;
|
|
821
|
+
client.event({
|
|
822
|
+
type: "agent_transcript",
|
|
823
|
+
text
|
|
824
|
+
});
|
|
825
|
+
ctx.pushMessages({
|
|
806
826
|
role: "assistant",
|
|
807
|
-
content:
|
|
827
|
+
content: text
|
|
808
828
|
});
|
|
829
|
+
ctx.tts?.sendText(text);
|
|
830
|
+
await flushTtsAndWait(ctl.signal);
|
|
831
|
+
if (ctl.signal.aborted) {
|
|
832
|
+
if (turnController === ctl) turnController = null;
|
|
833
|
+
return;
|
|
834
|
+
}
|
|
809
835
|
client.playAudioDone();
|
|
810
836
|
client.event({ type: "reply_done" });
|
|
811
837
|
if (turnController === ctl) turnController = null;
|
|
@@ -841,12 +867,12 @@ function createPipelineSession(opts) {
|
|
|
841
867
|
}
|
|
842
868
|
async function openProviders() {
|
|
843
869
|
const [sttResult, ttsResult] = await Promise.allSettled([opts.stt.open({
|
|
844
|
-
sampleRate,
|
|
870
|
+
sampleRate: sttSampleRate,
|
|
845
871
|
apiKey: opts.sttApiKey,
|
|
846
872
|
sttPrompt: agentConfig.sttPrompt,
|
|
847
873
|
signal: sessionAbort.signal
|
|
848
874
|
}), opts.tts.open({
|
|
849
|
-
sampleRate,
|
|
875
|
+
sampleRate: ttsSampleRate,
|
|
850
876
|
apiKey: opts.ttsApiKey,
|
|
851
877
|
signal: sessionAbort.signal
|
|
852
878
|
})]);
|
|
@@ -890,7 +916,18 @@ function createPipelineSession(opts) {
|
|
|
890
916
|
ctx.stt?.sendAudio(pcm);
|
|
891
917
|
},
|
|
892
918
|
onAudioReady() {
|
|
919
|
+
if (audioReady || terminated) return;
|
|
893
920
|
audioReady = true;
|
|
921
|
+
if (opts.skipGreeting) return;
|
|
922
|
+
const greeting = agentConfig.greeting;
|
|
923
|
+
if (!greeting) return;
|
|
924
|
+
const turn = runGreeting(greeting).catch((err) => {
|
|
925
|
+
log.error("Pipeline greeting failed", {
|
|
926
|
+
error: errorMessage(err),
|
|
927
|
+
sessionId: opts.id
|
|
928
|
+
});
|
|
929
|
+
});
|
|
930
|
+
ctx.chainTurn(turn);
|
|
894
931
|
},
|
|
895
932
|
onCancel() {
|
|
896
933
|
if (terminated) return;
|
|
@@ -1076,9 +1113,21 @@ function openCartesia(opts) {
|
|
|
1076
1113
|
* Reset whenever a fresh context is minted (i.e. at turn boundaries).
|
|
1077
1114
|
*/
|
|
1078
1115
|
let doneEmitted = false;
|
|
1116
|
+
/**
|
|
1117
|
+
* After `flush()` or `cancel()`, the current context is done accepting
|
|
1118
|
+
* input. We defer minting a fresh one until the next `sendText()` so
|
|
1119
|
+
* that late audio chunks + Cartesia's real `done` event (both tagged
|
|
1120
|
+
* with the flushed context's id) still pass the filter below. Rotating
|
|
1121
|
+
* eagerly would silently drop all audio still in flight.
|
|
1122
|
+
*/
|
|
1123
|
+
let rotatePending = false;
|
|
1079
1124
|
const rotateContext = () => {
|
|
1080
1125
|
context = mintContext();
|
|
1081
1126
|
doneEmitted = false;
|
|
1127
|
+
rotatePending = false;
|
|
1128
|
+
};
|
|
1129
|
+
const rotateIfPending = () => {
|
|
1130
|
+
if (rotatePending) rotateContext();
|
|
1082
1131
|
};
|
|
1083
1132
|
const emitDoneOnce = () => {
|
|
1084
1133
|
if (doneEmitted || closed) return;
|
|
@@ -1133,6 +1182,7 @@ function openCartesia(opts) {
|
|
|
1133
1182
|
return {
|
|
1134
1183
|
sendText(text) {
|
|
1135
1184
|
if (closed || text.length === 0) return;
|
|
1185
|
+
rotateIfPending();
|
|
1136
1186
|
context.send({
|
|
1137
1187
|
...baseRequest,
|
|
1138
1188
|
transcript: text,
|
|
@@ -1140,20 +1190,19 @@ function openCartesia(opts) {
|
|
|
1140
1190
|
}).catch(ignoreRejection);
|
|
1141
1191
|
},
|
|
1142
1192
|
flush() {
|
|
1143
|
-
if (closed) return;
|
|
1193
|
+
if (closed || rotatePending) return;
|
|
1144
1194
|
context.send({
|
|
1145
1195
|
...baseRequest,
|
|
1146
1196
|
transcript: "",
|
|
1147
1197
|
continue: false
|
|
1148
1198
|
}).catch(ignoreRejection);
|
|
1149
|
-
|
|
1150
|
-
rotateContext();
|
|
1199
|
+
rotatePending = true;
|
|
1151
1200
|
},
|
|
1152
1201
|
cancel() {
|
|
1153
1202
|
if (closed) return;
|
|
1154
|
-
context.cancel().catch(ignoreRejection);
|
|
1203
|
+
if (!doneEmitted) context.cancel().catch(ignoreRejection);
|
|
1155
1204
|
emitDoneOnce();
|
|
1156
|
-
|
|
1205
|
+
rotatePending = true;
|
|
1157
1206
|
},
|
|
1158
1207
|
on(event, fn) {
|
|
1159
1208
|
return emitter.on(event, fn);
|
|
@@ -2153,6 +2202,9 @@ function createRuntime(opts) {
|
|
|
2153
2202
|
tts: pipelineProviders.tts,
|
|
2154
2203
|
sttApiKey: resolveApiKey("ASSEMBLYAI_API_KEY", env),
|
|
2155
2204
|
ttsApiKey: resolveApiKey("CARTESIA_API_KEY", env),
|
|
2205
|
+
sttSampleRate: s2sConfig.inputSampleRate,
|
|
2206
|
+
ttsSampleRate: s2sConfig.outputSampleRate,
|
|
2207
|
+
skipGreeting: sessionOpts.skipGreeting ?? false,
|
|
2156
2208
|
logger
|
|
2157
2209
|
});
|
|
2158
2210
|
const apiKey = env.ASSEMBLYAI_API_KEY ?? "";
|
|
@@ -43,7 +43,8 @@ function makeOpts(overrides: Partial<PipelineSessionOptions> = {}): {
|
|
|
43
43
|
tts,
|
|
44
44
|
sttApiKey: "stt-key",
|
|
45
45
|
ttsApiKey: "tts-key",
|
|
46
|
-
|
|
46
|
+
sttSampleRate: 16_000,
|
|
47
|
+
ttsSampleRate: 24_000,
|
|
47
48
|
logger: silentLogger,
|
|
48
49
|
...overrides,
|
|
49
50
|
};
|
|
@@ -81,20 +82,112 @@ describe("createPipelineSession — happy path", () => {
|
|
|
81
82
|
expect(ttsSession.textChunks).toEqual(["Hello", " there"]);
|
|
82
83
|
expect(ttsSession.flush).toHaveBeenCalledTimes(1);
|
|
83
84
|
|
|
84
|
-
// Verify wire events in order
|
|
85
|
+
// Verify wire events in order — the pipeline emits a single
|
|
86
|
+
// `agent_transcript` with the full accumulated reply (not one per
|
|
87
|
+
// delta) so the UI renders one assistant message per turn.
|
|
85
88
|
const types = eventTypes(client.events);
|
|
86
|
-
expect(types).toEqual([
|
|
87
|
-
"user_transcript",
|
|
88
|
-
"agent_transcript", // "Hello"
|
|
89
|
-
"agent_transcript", // " there"
|
|
90
|
-
"reply_done",
|
|
91
|
-
]);
|
|
89
|
+
expect(types).toEqual(["user_transcript", "agent_transcript", "reply_done"]);
|
|
92
90
|
|
|
93
91
|
// user_transcript text matches
|
|
94
92
|
expect(client.events[0]).toMatchObject({
|
|
95
93
|
type: "user_transcript",
|
|
96
94
|
text: "Hello there, how are you?",
|
|
97
95
|
});
|
|
96
|
+
expect(client.events[1]).toMatchObject({
|
|
97
|
+
type: "agent_transcript",
|
|
98
|
+
text: "Hello there",
|
|
99
|
+
});
|
|
100
|
+
|
|
101
|
+
await session.stop();
|
|
102
|
+
});
|
|
103
|
+
});
|
|
104
|
+
|
|
105
|
+
describe("createPipelineSession — greeting", () => {
|
|
106
|
+
test("onAudioReady sends greeting to TTS and emits agent_transcript + reply_done", async () => {
|
|
107
|
+
const { opts, tts, client } = makeOpts({
|
|
108
|
+
agentConfig: {
|
|
109
|
+
name: "pipeline-agent",
|
|
110
|
+
systemPrompt: DEFAULT_SYSTEM_PROMPT,
|
|
111
|
+
greeting: "Hi! I'm pipeline mode.",
|
|
112
|
+
},
|
|
113
|
+
});
|
|
114
|
+
|
|
115
|
+
const session = createPipelineSession(opts);
|
|
116
|
+
await session.start();
|
|
117
|
+
|
|
118
|
+
const ttsSession = tts.last();
|
|
119
|
+
if (!ttsSession) throw new Error("TTS didn't open");
|
|
120
|
+
|
|
121
|
+
session.onAudioReady();
|
|
122
|
+
await session.waitForTurn();
|
|
123
|
+
|
|
124
|
+
expect(ttsSession.textChunks).toEqual(["Hi! I'm pipeline mode."]);
|
|
125
|
+
expect(ttsSession.flush).toHaveBeenCalledTimes(1);
|
|
126
|
+
|
|
127
|
+
const types = eventTypes(client.events);
|
|
128
|
+
expect(types).toEqual(["agent_transcript", "reply_done"]);
|
|
129
|
+
expect(client.events[0]).toMatchObject({
|
|
130
|
+
type: "agent_transcript",
|
|
131
|
+
text: "Hi! I'm pipeline mode.",
|
|
132
|
+
});
|
|
133
|
+
|
|
134
|
+
await session.stop();
|
|
135
|
+
});
|
|
136
|
+
|
|
137
|
+
test("skipGreeting=true suppresses the greeting turn", async () => {
|
|
138
|
+
const { opts, tts, client } = makeOpts({
|
|
139
|
+
agentConfig: {
|
|
140
|
+
name: "pipeline-agent",
|
|
141
|
+
systemPrompt: DEFAULT_SYSTEM_PROMPT,
|
|
142
|
+
greeting: "Hello there.",
|
|
143
|
+
},
|
|
144
|
+
skipGreeting: true,
|
|
145
|
+
});
|
|
146
|
+
|
|
147
|
+
const session = createPipelineSession(opts);
|
|
148
|
+
await session.start();
|
|
149
|
+
|
|
150
|
+
const ttsSession = tts.last();
|
|
151
|
+
if (!ttsSession) throw new Error("TTS didn't open");
|
|
152
|
+
|
|
153
|
+
session.onAudioReady();
|
|
154
|
+
await session.waitForTurn();
|
|
155
|
+
|
|
156
|
+
expect(ttsSession.sendText).not.toHaveBeenCalled();
|
|
157
|
+
expect(ttsSession.flush).not.toHaveBeenCalled();
|
|
158
|
+
expect(client.events).toEqual([]);
|
|
159
|
+
|
|
160
|
+
await session.stop();
|
|
161
|
+
});
|
|
162
|
+
|
|
163
|
+
test("empty greeting is a no-op", async () => {
|
|
164
|
+
const { opts, tts, client } = makeOpts();
|
|
165
|
+
// CONFIG already has greeting: ""
|
|
166
|
+
const session = createPipelineSession(opts);
|
|
167
|
+
await session.start();
|
|
168
|
+
|
|
169
|
+
const ttsSession = tts.last();
|
|
170
|
+
if (!ttsSession) throw new Error("TTS didn't open");
|
|
171
|
+
|
|
172
|
+
session.onAudioReady();
|
|
173
|
+
await session.waitForTurn();
|
|
174
|
+
|
|
175
|
+
expect(ttsSession.sendText).not.toHaveBeenCalled();
|
|
176
|
+
expect(client.events).toEqual([]);
|
|
177
|
+
|
|
178
|
+
await session.stop();
|
|
179
|
+
});
|
|
180
|
+
|
|
181
|
+
test("passes sttSampleRate / ttsSampleRate through to providers", async () => {
|
|
182
|
+
const { opts, stt, tts } = makeOpts({
|
|
183
|
+
sttSampleRate: 16_000,
|
|
184
|
+
ttsSampleRate: 24_000,
|
|
185
|
+
});
|
|
186
|
+
const session = createPipelineSession(opts);
|
|
187
|
+
await session.start();
|
|
188
|
+
|
|
189
|
+
expect(stt.last()?.opts.sampleRate).toBe(16_000);
|
|
190
|
+
expect(tts.last()?.opts.sampleRate).toBe(24_000);
|
|
98
191
|
|
|
99
192
|
await session.stop();
|
|
100
193
|
});
|
|
@@ -163,8 +256,10 @@ describe("createPipelineSession — barge-in", () => {
|
|
|
163
256
|
|
|
164
257
|
// TTS.cancel must have been called exactly once.
|
|
165
258
|
expect(ttsSession.cancel).toHaveBeenCalledTimes(1);
|
|
166
|
-
// Wire events: user_transcript
|
|
167
|
-
//
|
|
259
|
+
// Wire events: user_transcript then cancelled. No agent_transcript
|
|
260
|
+
// (the pipeline only emits it after the LLM stream finishes cleanly)
|
|
261
|
+
// and no reply_done — barge-in short-circuits both the stream and
|
|
262
|
+
// the drain.
|
|
168
263
|
const types = eventTypes(client.events);
|
|
169
264
|
expect(types).toContain("user_transcript");
|
|
170
265
|
expect(types).toContain("cancelled");
|
|
@@ -212,12 +307,17 @@ describe("createPipelineSession — tool calls", () => {
|
|
|
212
307
|
const types = eventTypes(client.events);
|
|
213
308
|
expect(types).toEqual([
|
|
214
309
|
"user_transcript",
|
|
215
|
-
"agent_transcript", // "Let me check"
|
|
216
310
|
"tool_call",
|
|
217
311
|
"tool_call_done",
|
|
218
|
-
"agent_transcript", // " — it's sunny."
|
|
312
|
+
"agent_transcript", // combined: "Let me check — it's sunny."
|
|
219
313
|
"reply_done",
|
|
220
314
|
]);
|
|
315
|
+
expect(client.events.find((e) => (e as ClientEvent).type === "agent_transcript")).toMatchObject(
|
|
316
|
+
{
|
|
317
|
+
type: "agent_transcript",
|
|
318
|
+
text: "Let me check — it's sunny.",
|
|
319
|
+
},
|
|
320
|
+
);
|
|
221
321
|
|
|
222
322
|
const toolCall = client.events.find((e) => (e as ClientEvent).type === "tool_call");
|
|
223
323
|
expect(toolCall).toMatchObject({
|
package/host/pipeline-session.ts
CHANGED
|
@@ -10,7 +10,11 @@
|
|
|
10
10
|
import type { LanguageModel, ModelMessage } from "ai";
|
|
11
11
|
import { stepCountIs, streamText } from "ai";
|
|
12
12
|
import type { AgentConfig, ExecuteTool, ToolSchema } from "../sdk/_internal-types.ts";
|
|
13
|
-
import {
|
|
13
|
+
import {
|
|
14
|
+
DEFAULT_STT_SAMPLE_RATE,
|
|
15
|
+
DEFAULT_TTS_SAMPLE_RATE,
|
|
16
|
+
PIPELINE_FLUSH_TIMEOUT_MS,
|
|
17
|
+
} from "../sdk/constants.ts";
|
|
14
18
|
import type { ClientSink, SessionErrorCode } from "../sdk/protocol.ts";
|
|
15
19
|
import type {
|
|
16
20
|
SttError,
|
|
@@ -55,8 +59,12 @@ export interface PipelineSessionOptions {
|
|
|
55
59
|
sttApiKey: string;
|
|
56
60
|
/** TTS API key. */
|
|
57
61
|
ttsApiKey: string;
|
|
58
|
-
/**
|
|
59
|
-
|
|
62
|
+
/** STT audio sample rate (PCM16, Hz). Defaults to {@link DEFAULT_STT_SAMPLE_RATE}. */
|
|
63
|
+
sttSampleRate?: number | undefined;
|
|
64
|
+
/** TTS audio sample rate (PCM16, Hz). Must match the client's playback AudioContext rate. Defaults to {@link DEFAULT_TTS_SAMPLE_RATE}. */
|
|
65
|
+
ttsSampleRate?: number | undefined;
|
|
66
|
+
/** Skip the initial greeting audio on connect (used for session resume). */
|
|
67
|
+
skipGreeting?: boolean | undefined;
|
|
60
68
|
/** Logger. Defaults to the console logger. */
|
|
61
69
|
logger?: Logger | undefined;
|
|
62
70
|
/** Sliding-window conversation history size. */
|
|
@@ -99,7 +107,6 @@ function handleStreamPart(
|
|
|
99
107
|
if (delta.length === 0) return;
|
|
100
108
|
deps.onTextDelta(delta);
|
|
101
109
|
deps.tts?.sendText(delta);
|
|
102
|
-
deps.client.event({ type: "agent_transcript", text: delta });
|
|
103
110
|
return;
|
|
104
111
|
}
|
|
105
112
|
case "tool-call": {
|
|
@@ -136,7 +143,8 @@ function handleStreamPart(
|
|
|
136
143
|
/** Create a pluggable-provider voice session. */
|
|
137
144
|
export function createPipelineSession(opts: PipelineSessionOptions): Session {
|
|
138
145
|
const log = opts.logger ?? consoleLogger;
|
|
139
|
-
const
|
|
146
|
+
const sttSampleRate = opts.sttSampleRate ?? DEFAULT_STT_SAMPLE_RATE;
|
|
147
|
+
const ttsSampleRate = opts.ttsSampleRate ?? DEFAULT_TTS_SAMPLE_RATE;
|
|
140
148
|
const { client, agentConfig, toolSchemas, executeTool } = opts;
|
|
141
149
|
|
|
142
150
|
const hasTools = toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0;
|
|
@@ -342,6 +350,14 @@ export function createPipelineSession(opts: PipelineSessionOptions): Session {
|
|
|
342
350
|
return;
|
|
343
351
|
}
|
|
344
352
|
|
|
353
|
+
// Emit the complete transcript once the LLM finishes streaming, so the
|
|
354
|
+
// UI renders a single assistant message (vs. one per delta) and the user
|
|
355
|
+
// sees the text while TTS drains the synthesized audio.
|
|
356
|
+
if (accumulated.length > 0) {
|
|
357
|
+
client.event({ type: "agent_transcript", text: accumulated });
|
|
358
|
+
ctx.pushMessages({ role: "assistant", content: accumulated });
|
|
359
|
+
}
|
|
360
|
+
|
|
345
361
|
await flushTtsAndWait(ctl.signal);
|
|
346
362
|
|
|
347
363
|
if (ctl.signal.aborted) {
|
|
@@ -349,9 +365,29 @@ export function createPipelineSession(opts: PipelineSessionOptions): Session {
|
|
|
349
365
|
return;
|
|
350
366
|
}
|
|
351
367
|
|
|
352
|
-
|
|
353
|
-
|
|
368
|
+
client.playAudioDone();
|
|
369
|
+
client.event({ type: "reply_done" });
|
|
370
|
+
if (turnController === ctl) turnController = null;
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
async function runGreeting(text: string): Promise<void> {
|
|
374
|
+
const replyId = `pipeline-greeting-${++nextReplyId}`;
|
|
375
|
+
ctx.beginReply(replyId);
|
|
376
|
+
|
|
377
|
+
const ctl = new AbortController();
|
|
378
|
+
turnController = ctl;
|
|
379
|
+
|
|
380
|
+
client.event({ type: "agent_transcript", text });
|
|
381
|
+
ctx.pushMessages({ role: "assistant", content: text });
|
|
382
|
+
ctx.tts?.sendText(text);
|
|
383
|
+
|
|
384
|
+
await flushTtsAndWait(ctl.signal);
|
|
385
|
+
|
|
386
|
+
if (ctl.signal.aborted) {
|
|
387
|
+
if (turnController === ctl) turnController = null;
|
|
388
|
+
return;
|
|
354
389
|
}
|
|
390
|
+
|
|
355
391
|
client.playAudioDone();
|
|
356
392
|
client.event({ type: "reply_done" });
|
|
357
393
|
if (turnController === ctl) turnController = null;
|
|
@@ -394,13 +430,13 @@ export function createPipelineSession(opts: PipelineSessionOptions): Session {
|
|
|
394
430
|
async function openProviders(): Promise<void> {
|
|
395
431
|
const [sttResult, ttsResult] = await Promise.allSettled([
|
|
396
432
|
opts.stt.open({
|
|
397
|
-
sampleRate,
|
|
433
|
+
sampleRate: sttSampleRate,
|
|
398
434
|
apiKey: opts.sttApiKey,
|
|
399
435
|
sttPrompt: agentConfig.sttPrompt,
|
|
400
436
|
signal: sessionAbort.signal,
|
|
401
437
|
}),
|
|
402
438
|
opts.tts.open({
|
|
403
|
-
sampleRate,
|
|
439
|
+
sampleRate: ttsSampleRate,
|
|
404
440
|
apiKey: opts.ttsApiKey,
|
|
405
441
|
signal: sessionAbort.signal,
|
|
406
442
|
}),
|
|
@@ -458,7 +494,15 @@ export function createPipelineSession(opts: PipelineSessionOptions): Session {
|
|
|
458
494
|
ctx.stt?.sendAudio(pcm);
|
|
459
495
|
},
|
|
460
496
|
onAudioReady(): void {
|
|
497
|
+
if (audioReady || terminated) return;
|
|
461
498
|
audioReady = true;
|
|
499
|
+
if (opts.skipGreeting) return;
|
|
500
|
+
const greeting = agentConfig.greeting;
|
|
501
|
+
if (!greeting) return;
|
|
502
|
+
const turn = runGreeting(greeting).catch((err: unknown) => {
|
|
503
|
+
log.error("Pipeline greeting failed", { error: errorMessage(err), sessionId: opts.id });
|
|
504
|
+
});
|
|
505
|
+
ctx.chainTurn(turn);
|
|
462
506
|
},
|
|
463
507
|
onCancel(): void {
|
|
464
508
|
if (terminated) return;
|
|
@@ -151,12 +151,15 @@ describe("cartesia TTS adapter", () => {
|
|
|
151
151
|
},
|
|
152
152
|
]);
|
|
153
153
|
|
|
154
|
-
//
|
|
155
|
-
|
|
156
|
-
|
|
154
|
+
// Rotation is deferred until the next sendText so Cartesia's late
|
|
155
|
+
// audio chunks + real `done` event (both tagged with turn1's id) still
|
|
156
|
+
// pass the context-id filter.
|
|
157
|
+
expect(session._currentContextId()).toBe(turn1);
|
|
157
158
|
|
|
158
|
-
// Subsequent sendText
|
|
159
|
+
// Subsequent sendText rotates to a fresh context.
|
|
159
160
|
session.sendText("next");
|
|
161
|
+
const turn2 = session._currentContextId();
|
|
162
|
+
expect(turn2).not.toBe(turn1);
|
|
160
163
|
await flush();
|
|
161
164
|
expect(sends.filter((s) => s.contextId === turn2)).toEqual([
|
|
162
165
|
{
|
|
@@ -201,8 +204,56 @@ describe("cartesia TTS adapter", () => {
|
|
|
201
204
|
{ kind: "cancel", contextId: turn1 },
|
|
202
205
|
]);
|
|
203
206
|
|
|
204
|
-
//
|
|
205
|
-
|
|
207
|
+
// Rotation is deferred until the next sendText — cancel() halts the
|
|
208
|
+
// old context on Cartesia's side, so late events for turn1 can safely
|
|
209
|
+
// keep passing the filter until the next turn actually begins.
|
|
210
|
+
expect(session._currentContextId()).toBe(turn1);
|
|
211
|
+
|
|
212
|
+
// A subsequent sendText mints a fresh context for turn2.
|
|
213
|
+
session.sendText("again");
|
|
214
|
+
const turn2 = session._currentContextId();
|
|
215
|
+
expect(turn2).not.toBe(turn1);
|
|
216
|
+
|
|
217
|
+
controller.abort();
|
|
218
|
+
await session.close();
|
|
219
|
+
});
|
|
220
|
+
|
|
221
|
+
test("cancel() after done is a no-op on the wire (avoids Cartesia's 'context ID does not exist' 400)", async () => {
|
|
222
|
+
const { session, controller } = await openSession();
|
|
223
|
+
const turn1 = session._currentContextId();
|
|
224
|
+
|
|
225
|
+
session.sendText("hello");
|
|
226
|
+
session.flush();
|
|
227
|
+
await flush();
|
|
228
|
+
|
|
229
|
+
// Cartesia finishes synthesizing and emits `done` for the flushed context.
|
|
230
|
+
const ws = session._ws as unknown as { _fire(event: string, payload: unknown): void };
|
|
231
|
+
ws._fire("done", { context_id: turn1 });
|
|
232
|
+
|
|
233
|
+
// A late cancel (e.g. client `cancel` event after the turn completed
|
|
234
|
+
// normally) must not re-send `context.cancel()` — doing so would trip
|
|
235
|
+
// Cartesia's 400 and kill the session via onTtsError → terminate.
|
|
236
|
+
session.cancel();
|
|
237
|
+
await flush();
|
|
238
|
+
|
|
239
|
+
const cancels = sends.filter((s) => s.kind === "cancel");
|
|
240
|
+
expect(cancels).toEqual([]);
|
|
241
|
+
|
|
242
|
+
controller.abort();
|
|
243
|
+
await session.close();
|
|
244
|
+
});
|
|
245
|
+
|
|
246
|
+
test("double cancel() only sends one wire cancel", async () => {
|
|
247
|
+
const { session, controller } = await openSession();
|
|
248
|
+
const turn1 = session._currentContextId();
|
|
249
|
+
|
|
250
|
+
session.sendText("hello");
|
|
251
|
+
session.cancel();
|
|
252
|
+
session.cancel();
|
|
253
|
+
await flush();
|
|
254
|
+
|
|
255
|
+
const cancels = sends.filter((s) => s.kind === "cancel");
|
|
256
|
+
expect(cancels).toEqual([{ kind: "cancel", contextId: turn1 }]);
|
|
206
257
|
|
|
207
258
|
controller.abort();
|
|
208
259
|
await session.close();
|
|
@@ -107,9 +107,21 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
|
|
|
107
107
|
* Reset whenever a fresh context is minted (i.e. at turn boundaries).
|
|
108
108
|
*/
|
|
109
109
|
let doneEmitted = false;
|
|
110
|
+
/**
|
|
111
|
+
* After `flush()` or `cancel()`, the current context is done accepting
|
|
112
|
+
* input. We defer minting a fresh one until the next `sendText()` so
|
|
113
|
+
* that late audio chunks + Cartesia's real `done` event (both tagged
|
|
114
|
+
* with the flushed context's id) still pass the filter below. Rotating
|
|
115
|
+
* eagerly would silently drop all audio still in flight.
|
|
116
|
+
*/
|
|
117
|
+
let rotatePending = false;
|
|
110
118
|
const rotateContext = () => {
|
|
111
119
|
context = mintContext();
|
|
112
120
|
doneEmitted = false;
|
|
121
|
+
rotatePending = false;
|
|
122
|
+
};
|
|
123
|
+
const rotateIfPending = () => {
|
|
124
|
+
if (rotatePending) rotateContext();
|
|
113
125
|
};
|
|
114
126
|
const emitDoneOnce = () => {
|
|
115
127
|
if (doneEmitted || closed) return;
|
|
@@ -179,33 +191,43 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
|
|
|
179
191
|
const session: CartesiaSession = {
|
|
180
192
|
sendText(text: string) {
|
|
181
193
|
if (closed || text.length === 0) return;
|
|
194
|
+
// First sendText after a flush/cancel starts a fresh context so
|
|
195
|
+
// we don't append to one that's already been finalized.
|
|
196
|
+
rotateIfPending();
|
|
182
197
|
void context
|
|
183
198
|
.send({ ...baseRequest, transcript: text, continue: true })
|
|
184
199
|
.catch(ignoreRejection);
|
|
185
200
|
},
|
|
186
201
|
flush() {
|
|
187
|
-
if (closed) return;
|
|
202
|
+
if (closed || rotatePending) return;
|
|
188
203
|
// Empty transcript with `continue: false` is the canonical
|
|
189
|
-
// end-of-turn signal. Cartesia
|
|
190
|
-
//
|
|
191
|
-
//
|
|
192
|
-
// the
|
|
193
|
-
//
|
|
194
|
-
// Cartesia always emits `done` for cleanly-flushed contexts.
|
|
204
|
+
// end-of-turn signal. Cartesia finishes synthesizing whatever
|
|
205
|
+
// is queued and then emits a `done` tagged with the same
|
|
206
|
+
// context_id — at that point `emitDoneOnce` fires for real.
|
|
207
|
+
// Defer rotation so the filter below still accepts in-flight
|
|
208
|
+
// audio chunks and the real `done` event.
|
|
195
209
|
void context
|
|
196
210
|
.send({ ...baseRequest, transcript: "", continue: false })
|
|
197
211
|
.catch(ignoreRejection);
|
|
198
|
-
|
|
199
|
-
rotateContext();
|
|
212
|
+
rotatePending = true;
|
|
200
213
|
},
|
|
201
214
|
cancel() {
|
|
202
215
|
if (closed) return;
|
|
203
|
-
|
|
216
|
+
// Skip the wire cancel if the context is already final on
|
|
217
|
+
// Cartesia's side (natural `done` after flush, or a prior
|
|
218
|
+
// cancel). Cartesia responds to cancel on a retired context
|
|
219
|
+
// with a 400 "context ID does not exist", which our error
|
|
220
|
+
// listener surfaces as `tts_stream_error` and the pipeline
|
|
221
|
+
// treats as fatal — killing the session for a benign race.
|
|
222
|
+
if (!doneEmitted) {
|
|
223
|
+
void context.cancel().catch(ignoreRejection);
|
|
224
|
+
}
|
|
204
225
|
// Emit synchronously: barge-in advances the orchestrator's
|
|
205
226
|
// state machine on `done`, and delaying it would audibly
|
|
206
|
-
// stall subsequent turns.
|
|
227
|
+
// stall subsequent turns. Cartesia stops producing audio
|
|
228
|
+
// after cancel, so dropping any late chunks is fine.
|
|
207
229
|
emitDoneOnce();
|
|
208
|
-
|
|
230
|
+
rotatePending = true;
|
|
209
231
|
},
|
|
210
232
|
on(event, fn) {
|
|
211
233
|
return emitter.on(event, fn);
|
package/host/runtime.ts
CHANGED
|
@@ -326,6 +326,9 @@ export function createRuntime(opts: RuntimeOptions): Runtime {
|
|
|
326
326
|
tts: pipelineProviders.tts,
|
|
327
327
|
sttApiKey: resolveApiKey("ASSEMBLYAI_API_KEY", env),
|
|
328
328
|
ttsApiKey: resolveApiKey("CARTESIA_API_KEY", env),
|
|
329
|
+
sttSampleRate: s2sConfig.inputSampleRate,
|
|
330
|
+
ttsSampleRate: s2sConfig.outputSampleRate,
|
|
331
|
+
skipGreeting: sessionOpts.skipGreeting ?? false,
|
|
329
332
|
logger,
|
|
330
333
|
});
|
|
331
334
|
}
|