@alexkroman1/aai 1.4.0 → 1.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +4 -4
- package/CHANGELOG.md +12 -0
- package/dist/host/pipeline-session.d.ts +6 -2
- package/dist/host/runtime-barrel.js +69 -14
- package/host/pipeline-session.test.ts +112 -12
- package/host/pipeline-session.ts +53 -9
- package/host/providers/resolve.ts +4 -1
- package/host/providers/tts/cartesia.test.ts +16 -6
- package/host/providers/tts/cartesia.ts +25 -11
- package/host/runtime.ts +3 -0
- package/package.json +1 -1
package/.turbo/turbo-build.log
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
|
|
2
|
-
> @alexkroman1/aai@1.4.0 build /home/runner/work/agent/agent/packages/aai
|
|
2
|
+
> @alexkroman1/aai@1.4.2 build /home/runner/work/agent/agent/packages/aai
|
|
3
3
|
> tsdown && tsc -p tsconfig.build.json
|
|
4
4
|
|
|
5
5
|
[34mℹ[39m [34mtsdown v0.21.7[39m powered by [38;2;255;126;23mrolldown v1.0.0-rc.12[39m
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
[34mℹ[39m target: [34mnode22[39m
|
|
9
9
|
[34mℹ[39m tsconfig: [34mtsconfig.json[39m
|
|
10
10
|
[34mℹ[39m Build start
|
|
11
|
-
[34mℹ[39m [2mdist/[22m[1mhost/runtime-barrel.js[22m [
|
|
11
|
+
[34mℹ[39m [2mdist/[22m[1mhost/runtime-barrel.js[22m [2m75.92 kB[22m [2m│ gzip: 22.48 kB[22m
|
|
12
12
|
[34mℹ[39m [2mdist/[22m[1msdk/protocol.js[22m [2m 4.75 kB[22m [2m│ gzip: 1.76 kB[22m
|
|
13
13
|
[34mℹ[39m [2mdist/[22m[1mindex.js[22m [2m 2.88 kB[22m [2m│ gzip: 1.24 kB[22m
|
|
14
14
|
[34mℹ[39m [2mdist/[22m[1msdk/manifest-barrel.js[22m [2m 0.36 kB[22m [2m│ gzip: 0.20 kB[22m
|
|
@@ -22,5 +22,5 @@
|
|
|
22
22
|
[34mℹ[39m [2mdist/[22massemblyai-Cxg9eobY.js [2m 0.53 kB[22m [2m│ gzip: 0.35 kB[22m
|
|
23
23
|
[34mℹ[39m [2mdist/[22manthropic-BrUCPKUc.js [2m 0.23 kB[22m [2m│ gzip: 0.18 kB[22m
|
|
24
24
|
[34mℹ[39m [2mdist/[22mcartesia-DwDk2tEu.js [2m 0.22 kB[22m [2m│ gzip: 0.17 kB[22m
|
|
25
|
-
[34mℹ[39m 14 files, total:
|
|
26
|
-
[32m✔[39m Build complete in [
|
|
25
|
+
[34mℹ[39m 14 files, total: 100.74 kB
|
|
26
|
+
[32m✔[39m Build complete in [32m48ms[39m
|
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,17 @@
|
|
|
1
1
|
# @alexkroman1/aai
|
|
2
2
|
|
|
3
|
+
## 1.4.2
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- f877a6f: Fix pipeline mode: play greeting, emit a single agent_transcript per turn, open TTS at the client's playback sample rate, and stop the Cartesia adapter from eagerly rotating its context (which was silently dropping in-flight audio chunks).
|
|
8
|
+
|
|
9
|
+
## 1.4.1
|
|
10
|
+
|
|
11
|
+
### Patch Changes
|
|
12
|
+
|
|
13
|
+
- 63de397: Pass explicit baseURL to createAnthropic so the SDK's loadOptionalSetting returns before reading process.env['ANTHROPIC_BASE_URL']. The Deno platform server runs without --allow-env, and the missing baseURL caused pipeline-mode sessions to crash on first use.
|
|
14
|
+
|
|
3
15
|
## 1.4.0
|
|
4
16
|
|
|
5
17
|
## 1.3.2
|
package/dist/host/pipeline-session.d.ts
CHANGED
|
@@ -37,8 +37,12 @@ export interface PipelineSessionOptions {
|
|
|
37
37
|
sttApiKey: string;
|
|
38
38
|
/** TTS API key. */
|
|
39
39
|
ttsApiKey: string;
|
|
40
|
-
/**
|
|
41
|
-
|
|
40
|
+
/** STT audio sample rate (PCM16, Hz). Defaults to {@link DEFAULT_STT_SAMPLE_RATE}. */
|
|
41
|
+
sttSampleRate?: number | undefined;
|
|
42
|
+
/** TTS audio sample rate (PCM16, Hz). Must match the client's playback AudioContext rate. Defaults to {@link DEFAULT_TTS_SAMPLE_RATE}. */
|
|
43
|
+
ttsSampleRate?: number | undefined;
|
|
44
|
+
/** Skip the initial greeting audio on connect (used for session resume). */
|
|
45
|
+
skipGreeting?: boolean | undefined;
|
|
42
46
|
/** Logger. Defaults to the console logger. */
|
|
43
47
|
logger?: Logger | undefined;
|
|
44
48
|
/** Sliding-window conversation history size. */
|
package/dist/host/runtime-barrel.js
CHANGED
|
@@ -560,10 +560,6 @@ function handleStreamPart(part, deps) {
|
|
|
560
560
|
if (delta.length === 0) return;
|
|
561
561
|
deps.onTextDelta(delta);
|
|
562
562
|
deps.tts?.sendText(delta);
|
|
563
|
-
deps.client.event({
|
|
564
|
-
type: "agent_transcript",
|
|
565
|
-
text: delta
|
|
566
|
-
});
|
|
567
563
|
return;
|
|
568
564
|
}
|
|
569
565
|
case "tool-call": {
|
|
@@ -601,7 +597,8 @@ function handleStreamPart(part, deps) {
|
|
|
601
597
|
/** Create a pluggable-provider voice session. */
|
|
602
598
|
function createPipelineSession(opts) {
|
|
603
599
|
const log = opts.logger ?? consoleLogger;
|
|
604
|
-
const
|
|
600
|
+
const sttSampleRate = opts.sttSampleRate ?? 16e3;
|
|
601
|
+
const ttsSampleRate = opts.ttsSampleRate ?? 24e3;
|
|
605
602
|
const { client, agentConfig, toolSchemas, executeTool } = opts;
|
|
606
603
|
const systemPrompt = buildSystemPrompt(agentConfig, {
|
|
607
604
|
hasTools: toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0,
|
|
@@ -797,15 +794,44 @@ function createPipelineSession(opts) {
|
|
|
797
794
|
if (turnController === ctl) turnController = null;
|
|
798
795
|
return;
|
|
799
796
|
}
|
|
797
|
+
if (accumulated.length > 0) {
|
|
798
|
+
client.event({
|
|
799
|
+
type: "agent_transcript",
|
|
800
|
+
text: accumulated
|
|
801
|
+
});
|
|
802
|
+
ctx.pushMessages({
|
|
803
|
+
role: "assistant",
|
|
804
|
+
content: accumulated
|
|
805
|
+
});
|
|
806
|
+
}
|
|
800
807
|
await flushTtsAndWait(ctl.signal);
|
|
801
808
|
if (ctl.signal.aborted) {
|
|
802
809
|
if (turnController === ctl) turnController = null;
|
|
803
810
|
return;
|
|
804
811
|
}
|
|
805
|
-
|
|
812
|
+
client.playAudioDone();
|
|
813
|
+
client.event({ type: "reply_done" });
|
|
814
|
+
if (turnController === ctl) turnController = null;
|
|
815
|
+
}
|
|
816
|
+
async function runGreeting(text) {
|
|
817
|
+
const replyId = `pipeline-greeting-${++nextReplyId}`;
|
|
818
|
+
ctx.beginReply(replyId);
|
|
819
|
+
const ctl = new AbortController();
|
|
820
|
+
turnController = ctl;
|
|
821
|
+
client.event({
|
|
822
|
+
type: "agent_transcript",
|
|
823
|
+
text
|
|
824
|
+
});
|
|
825
|
+
ctx.pushMessages({
|
|
806
826
|
role: "assistant",
|
|
807
|
-
content:
|
|
827
|
+
content: text
|
|
808
828
|
});
|
|
829
|
+
ctx.tts?.sendText(text);
|
|
830
|
+
await flushTtsAndWait(ctl.signal);
|
|
831
|
+
if (ctl.signal.aborted) {
|
|
832
|
+
if (turnController === ctl) turnController = null;
|
|
833
|
+
return;
|
|
834
|
+
}
|
|
809
835
|
client.playAudioDone();
|
|
810
836
|
client.event({ type: "reply_done" });
|
|
811
837
|
if (turnController === ctl) turnController = null;
|
|
@@ -841,12 +867,12 @@ function createPipelineSession(opts) {
|
|
|
841
867
|
}
|
|
842
868
|
async function openProviders() {
|
|
843
869
|
const [sttResult, ttsResult] = await Promise.allSettled([opts.stt.open({
|
|
844
|
-
sampleRate,
|
|
870
|
+
sampleRate: sttSampleRate,
|
|
845
871
|
apiKey: opts.sttApiKey,
|
|
846
872
|
sttPrompt: agentConfig.sttPrompt,
|
|
847
873
|
signal: sessionAbort.signal
|
|
848
874
|
}), opts.tts.open({
|
|
849
|
-
sampleRate,
|
|
875
|
+
sampleRate: ttsSampleRate,
|
|
850
876
|
apiKey: opts.ttsApiKey,
|
|
851
877
|
signal: sessionAbort.signal
|
|
852
878
|
})]);
|
|
@@ -890,7 +916,18 @@ function createPipelineSession(opts) {
|
|
|
890
916
|
ctx.stt?.sendAudio(pcm);
|
|
891
917
|
},
|
|
892
918
|
onAudioReady() {
|
|
919
|
+
if (audioReady || terminated) return;
|
|
893
920
|
audioReady = true;
|
|
921
|
+
if (opts.skipGreeting) return;
|
|
922
|
+
const greeting = agentConfig.greeting;
|
|
923
|
+
if (!greeting) return;
|
|
924
|
+
const turn = runGreeting(greeting).catch((err) => {
|
|
925
|
+
log.error("Pipeline greeting failed", {
|
|
926
|
+
error: errorMessage(err),
|
|
927
|
+
sessionId: opts.id
|
|
928
|
+
});
|
|
929
|
+
});
|
|
930
|
+
ctx.chainTurn(turn);
|
|
894
931
|
},
|
|
895
932
|
onCancel() {
|
|
896
933
|
if (terminated) return;
|
|
@@ -1076,9 +1113,21 @@ function openCartesia(opts) {
|
|
|
1076
1113
|
* Reset whenever a fresh context is minted (i.e. at turn boundaries).
|
|
1077
1114
|
*/
|
|
1078
1115
|
let doneEmitted = false;
|
|
1116
|
+
/**
|
|
1117
|
+
* After `flush()` or `cancel()`, the current context is done accepting
|
|
1118
|
+
* input. We defer minting a fresh one until the next `sendText()` so
|
|
1119
|
+
* that late audio chunks + Cartesia's real `done` event (both tagged
|
|
1120
|
+
* with the flushed context's id) still pass the filter below. Rotating
|
|
1121
|
+
* eagerly would silently drop all audio still in flight.
|
|
1122
|
+
*/
|
|
1123
|
+
let rotatePending = false;
|
|
1079
1124
|
const rotateContext = () => {
|
|
1080
1125
|
context = mintContext();
|
|
1081
1126
|
doneEmitted = false;
|
|
1127
|
+
rotatePending = false;
|
|
1128
|
+
};
|
|
1129
|
+
const rotateIfPending = () => {
|
|
1130
|
+
if (rotatePending) rotateContext();
|
|
1082
1131
|
};
|
|
1083
1132
|
const emitDoneOnce = () => {
|
|
1084
1133
|
if (doneEmitted || closed) return;
|
|
@@ -1133,6 +1182,7 @@ function openCartesia(opts) {
|
|
|
1133
1182
|
return {
|
|
1134
1183
|
sendText(text) {
|
|
1135
1184
|
if (closed || text.length === 0) return;
|
|
1185
|
+
rotateIfPending();
|
|
1136
1186
|
context.send({
|
|
1137
1187
|
...baseRequest,
|
|
1138
1188
|
transcript: text,
|
|
@@ -1140,20 +1190,19 @@ function openCartesia(opts) {
|
|
|
1140
1190
|
}).catch(ignoreRejection);
|
|
1141
1191
|
},
|
|
1142
1192
|
flush() {
|
|
1143
|
-
if (closed) return;
|
|
1193
|
+
if (closed || rotatePending) return;
|
|
1144
1194
|
context.send({
|
|
1145
1195
|
...baseRequest,
|
|
1146
1196
|
transcript: "",
|
|
1147
1197
|
continue: false
|
|
1148
1198
|
}).catch(ignoreRejection);
|
|
1149
|
-
|
|
1150
|
-
rotateContext();
|
|
1199
|
+
rotatePending = true;
|
|
1151
1200
|
},
|
|
1152
1201
|
cancel() {
|
|
1153
1202
|
if (closed) return;
|
|
1154
1203
|
context.cancel().catch(ignoreRejection);
|
|
1155
1204
|
emitDoneOnce();
|
|
1156
|
-
|
|
1205
|
+
rotatePending = true;
|
|
1157
1206
|
},
|
|
1158
1207
|
on(event, fn) {
|
|
1159
1208
|
return emitter.on(event, fn);
|
|
@@ -1216,7 +1265,10 @@ function resolveLlm(descriptor, env) {
|
|
|
1216
1265
|
const options = descriptor.options;
|
|
1217
1266
|
const apiKey = resolveApiKey("ANTHROPIC_API_KEY", env);
|
|
1218
1267
|
if (!apiKey) throw new Error("Anthropic LLM: missing API key. Set ANTHROPIC_API_KEY in the agent env.");
|
|
1219
|
-
return createAnthropic({
|
|
1268
|
+
return createAnthropic({
|
|
1269
|
+
apiKey,
|
|
1270
|
+
baseURL: "https://api.anthropic.com/v1"
|
|
1271
|
+
})(options.model);
|
|
1220
1272
|
}
|
|
1221
1273
|
default: throw new Error(`Unknown LLM provider kind: "${descriptor.kind}". Supported: ${ANTHROPIC_KIND}.`);
|
|
1222
1274
|
}
|
|
@@ -2150,6 +2202,9 @@ function createRuntime(opts) {
|
|
|
2150
2202
|
tts: pipelineProviders.tts,
|
|
2151
2203
|
sttApiKey: resolveApiKey("ASSEMBLYAI_API_KEY", env),
|
|
2152
2204
|
ttsApiKey: resolveApiKey("CARTESIA_API_KEY", env),
|
|
2205
|
+
sttSampleRate: s2sConfig.inputSampleRate,
|
|
2206
|
+
ttsSampleRate: s2sConfig.outputSampleRate,
|
|
2207
|
+
skipGreeting: sessionOpts.skipGreeting ?? false,
|
|
2153
2208
|
logger
|
|
2154
2209
|
});
|
|
2155
2210
|
const apiKey = env.ASSEMBLYAI_API_KEY ?? "";
|
package/host/pipeline-session.test.ts
CHANGED
|
@@ -43,7 +43,8 @@ function makeOpts(overrides: Partial<PipelineSessionOptions> = {}): {
|
|
|
43
43
|
tts,
|
|
44
44
|
sttApiKey: "stt-key",
|
|
45
45
|
ttsApiKey: "tts-key",
|
|
46
|
-
|
|
46
|
+
sttSampleRate: 16_000,
|
|
47
|
+
ttsSampleRate: 24_000,
|
|
47
48
|
logger: silentLogger,
|
|
48
49
|
...overrides,
|
|
49
50
|
};
|
|
@@ -81,20 +82,112 @@ describe("createPipelineSession — happy path", () => {
|
|
|
81
82
|
expect(ttsSession.textChunks).toEqual(["Hello", " there"]);
|
|
82
83
|
expect(ttsSession.flush).toHaveBeenCalledTimes(1);
|
|
83
84
|
|
|
84
|
-
// Verify wire events in order
|
|
85
|
+
// Verify wire events in order — the pipeline emits a single
|
|
86
|
+
// `agent_transcript` with the full accumulated reply (not one per
|
|
87
|
+
// delta) so the UI renders one assistant message per turn.
|
|
85
88
|
const types = eventTypes(client.events);
|
|
86
|
-
expect(types).toEqual([
|
|
87
|
-
"user_transcript",
|
|
88
|
-
"agent_transcript", // "Hello"
|
|
89
|
-
"agent_transcript", // " there"
|
|
90
|
-
"reply_done",
|
|
91
|
-
]);
|
|
89
|
+
expect(types).toEqual(["user_transcript", "agent_transcript", "reply_done"]);
|
|
92
90
|
|
|
93
91
|
// user_transcript text matches
|
|
94
92
|
expect(client.events[0]).toMatchObject({
|
|
95
93
|
type: "user_transcript",
|
|
96
94
|
text: "Hello there, how are you?",
|
|
97
95
|
});
|
|
96
|
+
expect(client.events[1]).toMatchObject({
|
|
97
|
+
type: "agent_transcript",
|
|
98
|
+
text: "Hello there",
|
|
99
|
+
});
|
|
100
|
+
|
|
101
|
+
await session.stop();
|
|
102
|
+
});
|
|
103
|
+
});
|
|
104
|
+
|
|
105
|
+
describe("createPipelineSession — greeting", () => {
|
|
106
|
+
test("onAudioReady sends greeting to TTS and emits agent_transcript + reply_done", async () => {
|
|
107
|
+
const { opts, tts, client } = makeOpts({
|
|
108
|
+
agentConfig: {
|
|
109
|
+
name: "pipeline-agent",
|
|
110
|
+
systemPrompt: DEFAULT_SYSTEM_PROMPT,
|
|
111
|
+
greeting: "Hi! I'm pipeline mode.",
|
|
112
|
+
},
|
|
113
|
+
});
|
|
114
|
+
|
|
115
|
+
const session = createPipelineSession(opts);
|
|
116
|
+
await session.start();
|
|
117
|
+
|
|
118
|
+
const ttsSession = tts.last();
|
|
119
|
+
if (!ttsSession) throw new Error("TTS didn't open");
|
|
120
|
+
|
|
121
|
+
session.onAudioReady();
|
|
122
|
+
await session.waitForTurn();
|
|
123
|
+
|
|
124
|
+
expect(ttsSession.textChunks).toEqual(["Hi! I'm pipeline mode."]);
|
|
125
|
+
expect(ttsSession.flush).toHaveBeenCalledTimes(1);
|
|
126
|
+
|
|
127
|
+
const types = eventTypes(client.events);
|
|
128
|
+
expect(types).toEqual(["agent_transcript", "reply_done"]);
|
|
129
|
+
expect(client.events[0]).toMatchObject({
|
|
130
|
+
type: "agent_transcript",
|
|
131
|
+
text: "Hi! I'm pipeline mode.",
|
|
132
|
+
});
|
|
133
|
+
|
|
134
|
+
await session.stop();
|
|
135
|
+
});
|
|
136
|
+
|
|
137
|
+
test("skipGreeting=true suppresses the greeting turn", async () => {
|
|
138
|
+
const { opts, tts, client } = makeOpts({
|
|
139
|
+
agentConfig: {
|
|
140
|
+
name: "pipeline-agent",
|
|
141
|
+
systemPrompt: DEFAULT_SYSTEM_PROMPT,
|
|
142
|
+
greeting: "Hello there.",
|
|
143
|
+
},
|
|
144
|
+
skipGreeting: true,
|
|
145
|
+
});
|
|
146
|
+
|
|
147
|
+
const session = createPipelineSession(opts);
|
|
148
|
+
await session.start();
|
|
149
|
+
|
|
150
|
+
const ttsSession = tts.last();
|
|
151
|
+
if (!ttsSession) throw new Error("TTS didn't open");
|
|
152
|
+
|
|
153
|
+
session.onAudioReady();
|
|
154
|
+
await session.waitForTurn();
|
|
155
|
+
|
|
156
|
+
expect(ttsSession.sendText).not.toHaveBeenCalled();
|
|
157
|
+
expect(ttsSession.flush).not.toHaveBeenCalled();
|
|
158
|
+
expect(client.events).toEqual([]);
|
|
159
|
+
|
|
160
|
+
await session.stop();
|
|
161
|
+
});
|
|
162
|
+
|
|
163
|
+
test("empty greeting is a no-op", async () => {
|
|
164
|
+
const { opts, tts, client } = makeOpts();
|
|
165
|
+
// CONFIG already has greeting: ""
|
|
166
|
+
const session = createPipelineSession(opts);
|
|
167
|
+
await session.start();
|
|
168
|
+
|
|
169
|
+
const ttsSession = tts.last();
|
|
170
|
+
if (!ttsSession) throw new Error("TTS didn't open");
|
|
171
|
+
|
|
172
|
+
session.onAudioReady();
|
|
173
|
+
await session.waitForTurn();
|
|
174
|
+
|
|
175
|
+
expect(ttsSession.sendText).not.toHaveBeenCalled();
|
|
176
|
+
expect(client.events).toEqual([]);
|
|
177
|
+
|
|
178
|
+
await session.stop();
|
|
179
|
+
});
|
|
180
|
+
|
|
181
|
+
test("passes sttSampleRate / ttsSampleRate through to providers", async () => {
|
|
182
|
+
const { opts, stt, tts } = makeOpts({
|
|
183
|
+
sttSampleRate: 16_000,
|
|
184
|
+
ttsSampleRate: 24_000,
|
|
185
|
+
});
|
|
186
|
+
const session = createPipelineSession(opts);
|
|
187
|
+
await session.start();
|
|
188
|
+
|
|
189
|
+
expect(stt.last()?.opts.sampleRate).toBe(16_000);
|
|
190
|
+
expect(tts.last()?.opts.sampleRate).toBe(24_000);
|
|
98
191
|
|
|
99
192
|
await session.stop();
|
|
100
193
|
});
|
|
@@ -163,8 +256,10 @@ describe("createPipelineSession — barge-in", () => {
|
|
|
163
256
|
|
|
164
257
|
// TTS.cancel must have been called exactly once.
|
|
165
258
|
expect(ttsSession.cancel).toHaveBeenCalledTimes(1);
|
|
166
|
-
// Wire events: user_transcript
|
|
167
|
-
//
|
|
259
|
+
// Wire events: user_transcript then cancelled. No agent_transcript
|
|
260
|
+
// (the pipeline only emits it after the LLM stream finishes cleanly)
|
|
261
|
+
// and no reply_done — barge-in short-circuits both the stream and
|
|
262
|
+
// the drain.
|
|
168
263
|
const types = eventTypes(client.events);
|
|
169
264
|
expect(types).toContain("user_transcript");
|
|
170
265
|
expect(types).toContain("cancelled");
|
|
@@ -212,12 +307,17 @@ describe("createPipelineSession — tool calls", () => {
|
|
|
212
307
|
const types = eventTypes(client.events);
|
|
213
308
|
expect(types).toEqual([
|
|
214
309
|
"user_transcript",
|
|
215
|
-
"agent_transcript", // "Let me check"
|
|
216
310
|
"tool_call",
|
|
217
311
|
"tool_call_done",
|
|
218
|
-
"agent_transcript", // " — it's sunny."
|
|
312
|
+
"agent_transcript", // combined: "Let me check — it's sunny."
|
|
219
313
|
"reply_done",
|
|
220
314
|
]);
|
|
315
|
+
expect(client.events.find((e) => (e as ClientEvent).type === "agent_transcript")).toMatchObject(
|
|
316
|
+
{
|
|
317
|
+
type: "agent_transcript",
|
|
318
|
+
text: "Let me check — it's sunny.",
|
|
319
|
+
},
|
|
320
|
+
);
|
|
221
321
|
|
|
222
322
|
const toolCall = client.events.find((e) => (e as ClientEvent).type === "tool_call");
|
|
223
323
|
expect(toolCall).toMatchObject({
|
package/host/pipeline-session.ts
CHANGED
|
@@ -10,7 +10,11 @@
|
|
|
10
10
|
import type { LanguageModel, ModelMessage } from "ai";
|
|
11
11
|
import { stepCountIs, streamText } from "ai";
|
|
12
12
|
import type { AgentConfig, ExecuteTool, ToolSchema } from "../sdk/_internal-types.ts";
|
|
13
|
-
import {
|
|
13
|
+
import {
|
|
14
|
+
DEFAULT_STT_SAMPLE_RATE,
|
|
15
|
+
DEFAULT_TTS_SAMPLE_RATE,
|
|
16
|
+
PIPELINE_FLUSH_TIMEOUT_MS,
|
|
17
|
+
} from "../sdk/constants.ts";
|
|
14
18
|
import type { ClientSink, SessionErrorCode } from "../sdk/protocol.ts";
|
|
15
19
|
import type {
|
|
16
20
|
SttError,
|
|
@@ -55,8 +59,12 @@ export interface PipelineSessionOptions {
|
|
|
55
59
|
sttApiKey: string;
|
|
56
60
|
/** TTS API key. */
|
|
57
61
|
ttsApiKey: string;
|
|
58
|
-
/**
|
|
59
|
-
|
|
62
|
+
/** STT audio sample rate (PCM16, Hz). Defaults to {@link DEFAULT_STT_SAMPLE_RATE}. */
|
|
63
|
+
sttSampleRate?: number | undefined;
|
|
64
|
+
/** TTS audio sample rate (PCM16, Hz). Must match the client's playback AudioContext rate. Defaults to {@link DEFAULT_TTS_SAMPLE_RATE}. */
|
|
65
|
+
ttsSampleRate?: number | undefined;
|
|
66
|
+
/** Skip the initial greeting audio on connect (used for session resume). */
|
|
67
|
+
skipGreeting?: boolean | undefined;
|
|
60
68
|
/** Logger. Defaults to the console logger. */
|
|
61
69
|
logger?: Logger | undefined;
|
|
62
70
|
/** Sliding-window conversation history size. */
|
|
@@ -99,7 +107,6 @@ function handleStreamPart(
|
|
|
99
107
|
if (delta.length === 0) return;
|
|
100
108
|
deps.onTextDelta(delta);
|
|
101
109
|
deps.tts?.sendText(delta);
|
|
102
|
-
deps.client.event({ type: "agent_transcript", text: delta });
|
|
103
110
|
return;
|
|
104
111
|
}
|
|
105
112
|
case "tool-call": {
|
|
@@ -136,7 +143,8 @@ function handleStreamPart(
|
|
|
136
143
|
/** Create a pluggable-provider voice session. */
|
|
137
144
|
export function createPipelineSession(opts: PipelineSessionOptions): Session {
|
|
138
145
|
const log = opts.logger ?? consoleLogger;
|
|
139
|
-
const
|
|
146
|
+
const sttSampleRate = opts.sttSampleRate ?? DEFAULT_STT_SAMPLE_RATE;
|
|
147
|
+
const ttsSampleRate = opts.ttsSampleRate ?? DEFAULT_TTS_SAMPLE_RATE;
|
|
140
148
|
const { client, agentConfig, toolSchemas, executeTool } = opts;
|
|
141
149
|
|
|
142
150
|
const hasTools = toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0;
|
|
@@ -342,6 +350,14 @@ export function createPipelineSession(opts: PipelineSessionOptions): Session {
|
|
|
342
350
|
return;
|
|
343
351
|
}
|
|
344
352
|
|
|
353
|
+
// Emit the complete transcript once the LLM finishes streaming, so the
|
|
354
|
+
// UI renders a single assistant message (vs. one per delta) and the user
|
|
355
|
+
// sees the text while TTS drains the synthesized audio.
|
|
356
|
+
if (accumulated.length > 0) {
|
|
357
|
+
client.event({ type: "agent_transcript", text: accumulated });
|
|
358
|
+
ctx.pushMessages({ role: "assistant", content: accumulated });
|
|
359
|
+
}
|
|
360
|
+
|
|
345
361
|
await flushTtsAndWait(ctl.signal);
|
|
346
362
|
|
|
347
363
|
if (ctl.signal.aborted) {
|
|
@@ -349,9 +365,29 @@ export function createPipelineSession(opts: PipelineSessionOptions): Session {
|
|
|
349
365
|
return;
|
|
350
366
|
}
|
|
351
367
|
|
|
352
|
-
|
|
353
|
-
|
|
368
|
+
client.playAudioDone();
|
|
369
|
+
client.event({ type: "reply_done" });
|
|
370
|
+
if (turnController === ctl) turnController = null;
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
async function runGreeting(text: string): Promise<void> {
|
|
374
|
+
const replyId = `pipeline-greeting-${++nextReplyId}`;
|
|
375
|
+
ctx.beginReply(replyId);
|
|
376
|
+
|
|
377
|
+
const ctl = new AbortController();
|
|
378
|
+
turnController = ctl;
|
|
379
|
+
|
|
380
|
+
client.event({ type: "agent_transcript", text });
|
|
381
|
+
ctx.pushMessages({ role: "assistant", content: text });
|
|
382
|
+
ctx.tts?.sendText(text);
|
|
383
|
+
|
|
384
|
+
await flushTtsAndWait(ctl.signal);
|
|
385
|
+
|
|
386
|
+
if (ctl.signal.aborted) {
|
|
387
|
+
if (turnController === ctl) turnController = null;
|
|
388
|
+
return;
|
|
354
389
|
}
|
|
390
|
+
|
|
355
391
|
client.playAudioDone();
|
|
356
392
|
client.event({ type: "reply_done" });
|
|
357
393
|
if (turnController === ctl) turnController = null;
|
|
@@ -394,13 +430,13 @@ export function createPipelineSession(opts: PipelineSessionOptions): Session {
|
|
|
394
430
|
async function openProviders(): Promise<void> {
|
|
395
431
|
const [sttResult, ttsResult] = await Promise.allSettled([
|
|
396
432
|
opts.stt.open({
|
|
397
|
-
sampleRate,
|
|
433
|
+
sampleRate: sttSampleRate,
|
|
398
434
|
apiKey: opts.sttApiKey,
|
|
399
435
|
sttPrompt: agentConfig.sttPrompt,
|
|
400
436
|
signal: sessionAbort.signal,
|
|
401
437
|
}),
|
|
402
438
|
opts.tts.open({
|
|
403
|
-
sampleRate,
|
|
439
|
+
sampleRate: ttsSampleRate,
|
|
404
440
|
apiKey: opts.ttsApiKey,
|
|
405
441
|
signal: sessionAbort.signal,
|
|
406
442
|
}),
|
|
@@ -458,7 +494,15 @@ export function createPipelineSession(opts: PipelineSessionOptions): Session {
|
|
|
458
494
|
ctx.stt?.sendAudio(pcm);
|
|
459
495
|
},
|
|
460
496
|
onAudioReady(): void {
|
|
497
|
+
if (audioReady || terminated) return;
|
|
461
498
|
audioReady = true;
|
|
499
|
+
if (opts.skipGreeting) return;
|
|
500
|
+
const greeting = agentConfig.greeting;
|
|
501
|
+
if (!greeting) return;
|
|
502
|
+
const turn = runGreeting(greeting).catch((err: unknown) => {
|
|
503
|
+
log.error("Pipeline greeting failed", { error: errorMessage(err), sessionId: opts.id });
|
|
504
|
+
});
|
|
505
|
+
ctx.chainTurn(turn);
|
|
462
506
|
},
|
|
463
507
|
onCancel(): void {
|
|
464
508
|
if (terminated) return;
|
package/host/providers/resolve.ts
CHANGED
|
@@ -77,7 +77,10 @@ export function resolveLlm(descriptor: LlmProvider, env: Record<string, string>)
|
|
|
77
77
|
if (!apiKey) {
|
|
78
78
|
throw new Error("Anthropic LLM: missing API key. Set ANTHROPIC_API_KEY in the agent env.");
|
|
79
79
|
}
|
|
80
|
-
|
|
80
|
+
// Pass baseURL explicitly so the SDK's loadOptionalSetting returns
|
|
81
|
+
// before reading process.env["ANTHROPIC_BASE_URL"]. Without this,
|
|
82
|
+
// the Deno platform server needs --allow-env to start a session.
|
|
83
|
+
return createAnthropic({ apiKey, baseURL: "https://api.anthropic.com/v1" })(options.model);
|
|
81
84
|
}
|
|
82
85
|
default:
|
|
83
86
|
throw new Error(
|
package/host/providers/tts/cartesia.test.ts
CHANGED
|
@@ -151,12 +151,15 @@ describe("cartesia TTS adapter", () => {
|
|
|
151
151
|
},
|
|
152
152
|
]);
|
|
153
153
|
|
|
154
|
-
//
|
|
155
|
-
|
|
156
|
-
|
|
154
|
+
// Rotation is deferred until the next sendText so Cartesia's late
|
|
155
|
+
// audio chunks + real `done` event (both tagged with turn1's id) still
|
|
156
|
+
// pass the context-id filter.
|
|
157
|
+
expect(session._currentContextId()).toBe(turn1);
|
|
157
158
|
|
|
158
|
-
// Subsequent sendText
|
|
159
|
+
// Subsequent sendText rotates to a fresh context.
|
|
159
160
|
session.sendText("next");
|
|
161
|
+
const turn2 = session._currentContextId();
|
|
162
|
+
expect(turn2).not.toBe(turn1);
|
|
160
163
|
await flush();
|
|
161
164
|
expect(sends.filter((s) => s.contextId === turn2)).toEqual([
|
|
162
165
|
{
|
|
@@ -201,8 +204,15 @@ describe("cartesia TTS adapter", () => {
|
|
|
201
204
|
{ kind: "cancel", contextId: turn1 },
|
|
202
205
|
]);
|
|
203
206
|
|
|
204
|
-
//
|
|
205
|
-
|
|
207
|
+
// Rotation is deferred until the next sendText — cancel() halts the
|
|
208
|
+
// old context on Cartesia's side, so late events for turn1 can safely
|
|
209
|
+
// keep passing the filter until the next turn actually begins.
|
|
210
|
+
expect(session._currentContextId()).toBe(turn1);
|
|
211
|
+
|
|
212
|
+
// A subsequent sendText mints a fresh context for turn2.
|
|
213
|
+
session.sendText("again");
|
|
214
|
+
const turn2 = session._currentContextId();
|
|
215
|
+
expect(turn2).not.toBe(turn1);
|
|
206
216
|
|
|
207
217
|
controller.abort();
|
|
208
218
|
await session.close();
|
package/host/providers/tts/cartesia.ts
CHANGED
|
@@ -107,9 +107,21 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
|
|
|
107
107
|
* Reset whenever a fresh context is minted (i.e. at turn boundaries).
|
|
108
108
|
*/
|
|
109
109
|
let doneEmitted = false;
|
|
110
|
+
/**
|
|
111
|
+
* After `flush()` or `cancel()`, the current context is done accepting
|
|
112
|
+
* input. We defer minting a fresh one until the next `sendText()` so
|
|
113
|
+
* that late audio chunks + Cartesia's real `done` event (both tagged
|
|
114
|
+
* with the flushed context's id) still pass the filter below. Rotating
|
|
115
|
+
* eagerly would silently drop all audio still in flight.
|
|
116
|
+
*/
|
|
117
|
+
let rotatePending = false;
|
|
110
118
|
const rotateContext = () => {
|
|
111
119
|
context = mintContext();
|
|
112
120
|
doneEmitted = false;
|
|
121
|
+
rotatePending = false;
|
|
122
|
+
};
|
|
123
|
+
const rotateIfPending = () => {
|
|
124
|
+
if (rotatePending) rotateContext();
|
|
113
125
|
};
|
|
114
126
|
const emitDoneOnce = () => {
|
|
115
127
|
if (doneEmitted || closed) return;
|
|
@@ -179,33 +191,35 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
|
|
|
179
191
|
const session: CartesiaSession = {
|
|
180
192
|
sendText(text: string) {
|
|
181
193
|
if (closed || text.length === 0) return;
|
|
194
|
+
// First sendText after a flush/cancel starts a fresh context so
|
|
195
|
+
// we don't append to one that's already been finalized.
|
|
196
|
+
rotateIfPending();
|
|
182
197
|
void context
|
|
183
198
|
.send({ ...baseRequest, transcript: text, continue: true })
|
|
184
199
|
.catch(ignoreRejection);
|
|
185
200
|
},
|
|
186
201
|
flush() {
|
|
187
|
-
if (closed) return;
|
|
202
|
+
if (closed || rotatePending) return;
|
|
188
203
|
// Empty transcript with `continue: false` is the canonical
|
|
189
|
-
// end-of-turn signal. Cartesia
|
|
190
|
-
//
|
|
191
|
-
//
|
|
192
|
-
// the
|
|
193
|
-
//
|
|
194
|
-
// Cartesia always emits `done` for cleanly-flushed contexts.
|
|
204
|
+
// end-of-turn signal. Cartesia finishes synthesizing whatever
|
|
205
|
+
// is queued and then emits a `done` tagged with the same
|
|
206
|
+
// context_id — at that point `emitDoneOnce` fires for real.
|
|
207
|
+
// Defer rotation so the filter below still accepts in-flight
|
|
208
|
+
// audio chunks and the real `done` event.
|
|
195
209
|
void context
|
|
196
210
|
.send({ ...baseRequest, transcript: "", continue: false })
|
|
197
211
|
.catch(ignoreRejection);
|
|
198
|
-
|
|
199
|
-
rotateContext();
|
|
212
|
+
rotatePending = true;
|
|
200
213
|
},
|
|
201
214
|
cancel() {
|
|
202
215
|
if (closed) return;
|
|
203
216
|
void context.cancel().catch(ignoreRejection);
|
|
204
217
|
// Emit synchronously: barge-in advances the orchestrator's
|
|
205
218
|
// state machine on `done`, and delaying it would audibly
|
|
206
|
-
// stall subsequent turns.
|
|
219
|
+
// stall subsequent turns. Cartesia stops producing audio
|
|
220
|
+
// after cancel, so dropping any late chunks is fine.
|
|
207
221
|
emitDoneOnce();
|
|
208
|
-
|
|
222
|
+
rotatePending = true;
|
|
209
223
|
},
|
|
210
224
|
on(event, fn) {
|
|
211
225
|
return emitter.on(event, fn);
|
package/host/runtime.ts
CHANGED
|
@@ -326,6 +326,9 @@ export function createRuntime(opts: RuntimeOptions): Runtime {
|
|
|
326
326
|
tts: pipelineProviders.tts,
|
|
327
327
|
sttApiKey: resolveApiKey("ASSEMBLYAI_API_KEY", env),
|
|
328
328
|
ttsApiKey: resolveApiKey("CARTESIA_API_KEY", env),
|
|
329
|
+
sttSampleRate: s2sConfig.inputSampleRate,
|
|
330
|
+
ttsSampleRate: s2sConfig.outputSampleRate,
|
|
331
|
+
skipGreeting: sessionOpts.skipGreeting ?? false,
|
|
329
332
|
logger,
|
|
330
333
|
});
|
|
331
334
|
}
|