@alexkroman1/aai 1.4.5 → 1.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +10 -10
- package/CHANGELOG.md +19 -0
- package/dist/{_internal-types-3p3OJZPb.js → _internal-types-DFL07G3f.js} +2 -0
- package/dist/assemblyai-C969QGi4.js +35 -0
- package/dist/cartesia-BfQPOQ7Y.js +37 -0
- package/dist/host/_pipeline-test-fakes.d.ts +3 -1
- package/dist/host/providers/stt/deepgram.d.ts +28 -0
- package/dist/host/providers/tts/cartesia.d.ts +1 -1
- package/dist/host/providers/tts/rime.d.ts +44 -0
- package/dist/host/runtime-barrel.d.ts +4 -2
- package/dist/host/runtime-barrel.js +1434 -1209
- package/dist/host/runtime.d.ts +2 -2
- package/dist/host/s2s.d.ts +16 -16
- package/dist/host/session-core.d.ts +37 -0
- package/dist/host/transports/pipeline-transport.d.ts +48 -0
- package/dist/host/transports/s2s-transport.d.ts +19 -0
- package/dist/host/transports/types.d.ts +45 -0
- package/dist/host/ws-handler.d.ts +14 -10
- package/dist/sdk/_internal-types.d.ts +2 -0
- package/dist/sdk/manifest-barrel.js +1 -1
- package/dist/sdk/protocol.d.ts +6 -5
- package/dist/sdk/providers/llm-barrel.js +1 -1
- package/dist/sdk/providers/stt/deepgram.d.ts +35 -0
- package/dist/sdk/providers/stt-barrel.d.ts +1 -0
- package/dist/sdk/providers/stt-barrel.js +2 -2
- package/dist/sdk/providers/tts/cartesia.d.ts +12 -4
- package/dist/sdk/providers/tts/rime.d.ts +42 -0
- package/dist/sdk/providers/tts-barrel.d.ts +1 -0
- package/dist/sdk/providers/tts-barrel.js +2 -2
- package/host/_pipeline-test-fakes.ts +6 -3
- package/host/_test-utils.ts +209 -128
- package/host/builtin-tools.ts +1 -0
- package/host/cleanup.test.ts +25 -298
- package/host/integration/pipeline-reference.integration.test.ts +30 -35
- package/host/providers/resolve.ts +10 -2
- package/host/providers/stt/deepgram.test.ts +229 -0
- package/host/providers/stt/deepgram.ts +172 -0
- package/host/providers/tts/cartesia.ts +7 -3
- package/host/providers/tts/rime.test.ts +251 -0
- package/host/providers/tts/rime.ts +322 -0
- package/host/runtime-barrel.ts +4 -2
- package/host/runtime.test.ts +16 -47
- package/host/runtime.ts +131 -23
- package/host/s2s.test.ts +122 -131
- package/host/s2s.ts +44 -52
- package/host/session-core.test.ts +257 -0
- package/host/session-core.ts +262 -0
- package/host/to-vercel-tools.test.ts +9 -1
- package/host/transports/pipeline-transport.test.ts +653 -0
- package/host/transports/pipeline-transport.ts +532 -0
- package/host/{fixture-replay.test.ts → transports/s2s-transport-fixtures.test.ts} +76 -106
- package/host/transports/s2s-transport.test.ts +56 -0
- package/host/transports/s2s-transport.ts +116 -0
- package/host/transports/types.test.ts +22 -0
- package/host/transports/types.ts +51 -0
- package/host/ws-handler.test.ts +324 -242
- package/host/ws-handler.ts +56 -59
- package/package.json +2 -1
- package/sdk/__snapshots__/exports.test.ts.snap +3 -3
- package/sdk/__snapshots__/schema-shapes.test.ts.snap +1 -0
- package/sdk/_internal-types.ts +3 -0
- package/sdk/protocol-compat.test.ts +8 -0
- package/sdk/protocol.ts +6 -5
- package/sdk/providers/stt/deepgram.ts +43 -0
- package/sdk/providers/stt-barrel.ts +2 -0
- package/sdk/providers/tts/cartesia.ts +15 -5
- package/sdk/providers/tts/rime.ts +52 -0
- package/sdk/providers/tts-barrel.ts +2 -0
- package/sdk/schema-alignment.test.ts +18 -6
- package/dist/assemblyai-Cxg9eobY.js +0 -18
- package/dist/cartesia-DwDk2tEu.js +0 -10
- package/dist/host/pipeline-session-ctx.d.ts +0 -24
- package/dist/host/pipeline-session.d.ts +0 -52
- package/dist/host/session-ctx.d.ts +0 -73
- package/dist/host/session.d.ts +0 -62
- package/host/pipeline-session-ctx.test.ts +0 -31
- package/host/pipeline-session-ctx.ts +0 -36
- package/host/pipeline-session.test.ts +0 -672
- package/host/pipeline-session.ts +0 -533
- package/host/s2s-fixtures.test.ts +0 -237
- package/host/session-ctx.test.ts +0 -387
- package/host/session-ctx.ts +0 -134
- package/host/session-fixture-replay.test.ts +0 -128
- package/host/session.test.ts +0 -634
- package/host/session.ts +0 -412
- /package/dist/{anthropic-BrUCPKUc.js → anthropic-CcLZygAr.js} +0 -0
|
@@ -1,23 +1,24 @@
|
|
|
1
1
|
import { r as DEFAULT_SYSTEM_PROMPT } from "../types-KUgezM6u.js";
|
|
2
|
-
import { _ as TOOL_EXECUTION_TIMEOUT_MS, a as DEFAULT_SHUTDOWN_TIMEOUT_MS, c as FETCH_TIMEOUT_MS, d as MAX_PAGE_CHARS,
|
|
2
|
+
import { _ as TOOL_EXECUTION_TIMEOUT_MS, a as DEFAULT_SHUTDOWN_TIMEOUT_MS, c as FETCH_TIMEOUT_MS, d as MAX_PAGE_CHARS, g as RUN_CODE_TIMEOUT_MS, h as PIPELINE_FLUSH_TIMEOUT_MS, l as MAX_HTML_BYTES, m as MAX_WS_PAYLOAD_BYTES, o as DEFAULT_STT_SAMPLE_RATE, p as MAX_VALUE_SIZE, s as DEFAULT_TTS_SAMPLE_RATE, t as AGENT_CSP } from "../constants-C2nirZUI.js";
|
|
3
3
|
import { i as toolError, n as errorDetail, r as errorMessage, t as parseWsUpgradeParams } from "../ws-upgrade-BeOQ7fXL.js";
|
|
4
4
|
import { ClientMessageSchema, buildReadyConfig, lenientParse } from "../sdk/protocol.js";
|
|
5
|
-
import { a as toAgentConfig, c as makeSttError, i as agentToolsToSchemas, l as makeTtsError, n as EMPTY_PARAMS, s as assertProviderTriple } from "../_internal-types-
|
|
6
|
-
import { t as
|
|
7
|
-
import {
|
|
8
|
-
import { t as
|
|
5
|
+
import { a as toAgentConfig, c as makeSttError, i as agentToolsToSchemas, l as makeTtsError, n as EMPTY_PARAMS, s as assertProviderTriple } from "../_internal-types-DFL07G3f.js";
|
|
6
|
+
import { r as DEEPGRAM_KIND, t as ASSEMBLYAI_KIND } from "../assemblyai-C969QGi4.js";
|
|
7
|
+
import { a as RIME_KIND, n as CARTESIA_KIND } from "../cartesia-BfQPOQ7Y.js";
|
|
8
|
+
import { t as ANTHROPIC_KIND } from "../anthropic-CcLZygAr.js";
|
|
9
9
|
import { z } from "zod";
|
|
10
10
|
import { convert } from "html-to-text";
|
|
11
11
|
import vm from "node:vm";
|
|
12
12
|
import pTimeout from "p-timeout";
|
|
13
13
|
import { createStorage, prefixStorage } from "unstorage";
|
|
14
|
-
import { jsonSchema, stepCountIs, streamText, tool } from "ai";
|
|
15
14
|
import { createAnthropic } from "@ai-sdk/anthropic";
|
|
16
15
|
import { AssemblyAI } from "assemblyai";
|
|
17
16
|
import { createNanoEvents } from "nanoevents";
|
|
17
|
+
import { DeepgramClient } from "@deepgram/sdk";
|
|
18
18
|
import { randomUUID } from "node:crypto";
|
|
19
19
|
import { Cartesia } from "@cartesia/cartesia-js";
|
|
20
20
|
import WsWebSocket, { WebSocketServer } from "ws";
|
|
21
|
+
import { jsonSchema, stepCountIs, streamText, tool } from "ai";
|
|
21
22
|
import fs from "node:fs";
|
|
22
23
|
import http from "node:http";
|
|
23
24
|
import path from "node:path";
|
|
@@ -333,6 +334,7 @@ function resolveAllBuiltins(names, opts) {
|
|
|
333
334
|
for (const name of names) for (const [toolName, def] of resolveBuiltin(name, opts)) {
|
|
334
335
|
defs[toolName] = def;
|
|
335
336
|
schemas.push({
|
|
337
|
+
type: "function",
|
|
336
338
|
name: toolName,
|
|
337
339
|
description: def.description,
|
|
338
340
|
parameters: z.toJSONSchema(def.parameters ?? EMPTY_PARAMS)
|
|
@@ -378,712 +380,236 @@ function buildSystemPrompt(config, opts) {
|
|
|
378
380
|
return DEFAULT_SYSTEM_PROMPT + `\n\nToday's date is ${getFormattedDate()}.` + agentInstructions + toolPreamble + guidance + (opts.voice ? VOICE_RULES : "");
|
|
379
381
|
}
|
|
380
382
|
//#endregion
|
|
381
|
-
//#region host/
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
383
|
+
//#region host/providers/stt/assemblyai.ts
|
|
384
|
+
/**
|
|
385
|
+
* AssemblyAI Universal-Streaming STT opener (host-only).
|
|
386
|
+
*
|
|
387
|
+
* The user-facing descriptor factory (`assemblyAI(...)`) lives in
|
|
388
|
+
* `sdk/providers/stt/assemblyai.ts`. This module is the host-side
|
|
389
|
+
* counterpart: it takes the descriptor options + an API key and
|
|
390
|
+
* returns an {@link SttOpener} that the pipeline session drives.
|
|
391
|
+
*
|
|
392
|
+
* Default model: `"u3pro-rt"` (Universal-3 Pro Real-Time). The adapter
|
|
393
|
+
* maps that to the SDK's `"u3-rt-pro"` `speechModel` value; any other
|
|
394
|
+
* string is forwarded verbatim.
|
|
395
|
+
*/
|
|
396
|
+
/** Translate the descriptor's model alias to the SDK's `speechModel` value. */
|
|
397
|
+
function resolveSpeechModel(model) {
|
|
398
|
+
if (model === "u3pro-rt") return "u3-rt-pro";
|
|
399
|
+
return model;
|
|
400
|
+
}
|
|
401
|
+
/** Build an {@link SttOpener} from resolved AssemblyAI descriptor options. */
|
|
402
|
+
function openAssemblyAI(opts = {}) {
|
|
403
|
+
return {
|
|
404
|
+
name: "assemblyai",
|
|
405
|
+
async open(openOpts) {
|
|
406
|
+
const apiKey = openOpts.apiKey || process.env.ASSEMBLYAI_API_KEY;
|
|
407
|
+
if (!apiKey) throw makeSttError("stt_auth_failed", "AssemblyAI STT: missing API key. Set ASSEMBLYAI_API_KEY in the agent env.");
|
|
408
|
+
const client = new AssemblyAI({ apiKey });
|
|
409
|
+
const speechModel = resolveSpeechModel(opts.model ?? "u3pro-rt");
|
|
410
|
+
const transcriber = client.streaming.transcriber({
|
|
411
|
+
sampleRate: openOpts.sampleRate,
|
|
412
|
+
speechModel,
|
|
413
|
+
...openOpts.sttPrompt ? { prompt: openOpts.sttPrompt } : {}
|
|
414
|
+
});
|
|
415
|
+
const emitter = createNanoEvents();
|
|
416
|
+
let closed = false;
|
|
417
|
+
transcriber.on("turn", (event) => {
|
|
418
|
+
if (closed) return;
|
|
419
|
+
const text = event.transcript ?? "";
|
|
420
|
+
if (event.end_of_turn) {
|
|
421
|
+
if (text.length > 0) emitter.emit("final", text);
|
|
422
|
+
} else if (text.length > 0) emitter.emit("partial", text);
|
|
423
|
+
});
|
|
424
|
+
transcriber.on("error", (err) => {
|
|
425
|
+
if (closed) return;
|
|
426
|
+
emitter.emit("error", makeSttError("stt_stream_error", err?.message ?? String(err)));
|
|
427
|
+
});
|
|
428
|
+
transcriber.on("close", (code) => {
|
|
429
|
+
if (closed) return;
|
|
430
|
+
if (code !== 1e3) emitter.emit("error", makeSttError("stt_stream_error", `socket closed ${code}`));
|
|
431
|
+
});
|
|
432
|
+
try {
|
|
433
|
+
await transcriber.connect();
|
|
434
|
+
} catch (cause) {
|
|
435
|
+
throw makeSttError("stt_connect_failed", `AssemblyAI STT: connect failed: ${cause instanceof Error ? cause.message : String(cause)}`);
|
|
405
436
|
}
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
beginReply(replyId) {
|
|
413
|
-
ctx.reply = {
|
|
414
|
-
pendingTools: [],
|
|
415
|
-
toolCallCount: 0,
|
|
416
|
-
currentReplyId: replyId
|
|
437
|
+
const close = async () => {
|
|
438
|
+
if (closed) return;
|
|
439
|
+
closed = true;
|
|
440
|
+
try {
|
|
441
|
+
await transcriber.close();
|
|
442
|
+
} catch {}
|
|
417
443
|
};
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
444
|
+
if (openOpts.signal.aborted) close();
|
|
445
|
+
else openOpts.signal.addEventListener("abort", () => void close(), { once: true });
|
|
446
|
+
return {
|
|
447
|
+
sendAudio(pcm) {
|
|
448
|
+
if (closed) return;
|
|
449
|
+
const copy = new Uint8Array(pcm.byteLength);
|
|
450
|
+
copy.set(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
|
|
451
|
+
transcriber.sendAudio(copy.buffer);
|
|
452
|
+
},
|
|
453
|
+
on(event, fn) {
|
|
454
|
+
return emitter.on(event, fn);
|
|
455
|
+
},
|
|
456
|
+
close,
|
|
457
|
+
_transcriber: transcriber
|
|
425
458
|
};
|
|
426
|
-
},
|
|
427
|
-
chainTurn(p) {
|
|
428
|
-
ctx.turnPromise = (ctx.turnPromise ?? Promise.resolve()).then(() => p);
|
|
429
459
|
}
|
|
430
460
|
};
|
|
431
|
-
return ctx;
|
|
432
|
-
}
|
|
433
|
-
function buildCtx(opts) {
|
|
434
|
-
const base = _buildBaseCtx(opts);
|
|
435
|
-
base.s2s = null;
|
|
436
|
-
return base;
|
|
437
|
-
}
|
|
438
|
-
//#endregion
|
|
439
|
-
//#region host/pipeline-session-ctx.ts
|
|
440
|
-
function buildPipelineCtx(opts) {
|
|
441
|
-
const base = _buildBaseCtx(opts);
|
|
442
|
-
base.stt = null;
|
|
443
|
-
base.tts = null;
|
|
444
|
-
return base;
|
|
445
461
|
}
|
|
446
462
|
//#endregion
|
|
447
|
-
//#region host/
|
|
463
|
+
//#region host/providers/stt/deepgram.ts
|
|
448
464
|
/**
|
|
449
|
-
*
|
|
465
|
+
* Deepgram Nova streaming STT opener (host-only).
|
|
450
466
|
*
|
|
451
|
-
*
|
|
452
|
-
*
|
|
467
|
+
* The user-facing descriptor factory (`deepgram(...)`) lives in
|
|
468
|
+
* `sdk/providers/stt/deepgram.ts`. This module is the host-side
|
|
469
|
+
* counterpart: it takes the descriptor options + an API key and
|
|
470
|
+
* returns an {@link SttOpener} that the pipeline session drives.
|
|
471
|
+
*
|
|
472
|
+
* Default model: `"nova-3"`. Any string is forwarded verbatim to the SDK.
|
|
473
|
+
*
|
|
474
|
+
* This adapter targets the Deepgram SDK v5 (`@deepgram/sdk@^5`). The v5
|
|
475
|
+
* streaming API is:
|
|
476
|
+
* `client.listen.v1.connect(args)` → `Promise<V1Socket>`
|
|
477
|
+
* followed by:
|
|
478
|
+
* `socket.connect()` + `socket.waitForOpen()` to establish the connection.
|
|
453
479
|
*/
|
|
454
|
-
function consoleLog(fn) {
|
|
455
|
-
return (msg, ctx) => ctx ? fn(msg, ctx) : fn(msg);
|
|
456
|
-
}
|
|
457
|
-
/** Default console-backed logger. */
|
|
458
|
-
const consoleLogger = {
|
|
459
|
-
info: consoleLog(console.log),
|
|
460
|
-
warn: consoleLog(console.warn),
|
|
461
|
-
error: consoleLog(console.error),
|
|
462
|
-
debug: consoleLog(console.debug)
|
|
463
|
-
};
|
|
464
480
|
/**
|
|
465
|
-
*
|
|
466
|
-
*
|
|
467
|
-
* caller-provided context fields.
|
|
481
|
+
* Handle an incoming Deepgram transcript message, emitting `partial` or
|
|
482
|
+
* `final` events on the emitter. Empty transcripts are silently dropped.
|
|
468
483
|
*/
|
|
469
|
-
function
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
484
|
+
function handleMessage(data, closed, emitter) {
|
|
485
|
+
if (closed) return;
|
|
486
|
+
if (data.type !== "Results") return;
|
|
487
|
+
const result = data;
|
|
488
|
+
const text = result.channel?.alternatives?.[0]?.transcript ?? "";
|
|
489
|
+
if (result.is_final) {
|
|
490
|
+
if (text.length > 0) emitter.emit("final", text);
|
|
491
|
+
} else if (text.length > 0) emitter.emit("partial", text);
|
|
492
|
+
}
|
|
493
|
+
/** Wire Deepgram socket events onto the nanoevents emitter. */
|
|
494
|
+
function wireSocketEvents(connection, emitter, getIsClosed) {
|
|
495
|
+
connection.on("message", (data) => handleMessage(data, getIsClosed(), emitter));
|
|
496
|
+
connection.on("error", (err) => {
|
|
497
|
+
if (getIsClosed()) return;
|
|
498
|
+
emitter.emit("error", makeSttError("stt_stream_error", err?.message ?? String(err)));
|
|
499
|
+
});
|
|
500
|
+
connection.on("close", (event) => {
|
|
501
|
+
if (getIsClosed()) return;
|
|
502
|
+
const code = event?.code;
|
|
503
|
+
if (code !== void 0 && code !== 1e3) emitter.emit("error", makeSttError("stt_stream_error", `socket closed ${code}`));
|
|
504
|
+
});
|
|
505
|
+
}
|
|
506
|
+
/** Wire the AbortSignal to the close function. */
|
|
507
|
+
function wireAbortSignal(signal, close) {
|
|
508
|
+
if (signal.aborted) close();
|
|
509
|
+
else signal.addEventListener("abort", () => void close(), { once: true });
|
|
510
|
+
}
|
|
511
|
+
/** Build an {@link SttOpener} from resolved Deepgram descriptor options. */
|
|
512
|
+
function openDeepgram(opts = {}) {
|
|
513
|
+
return {
|
|
514
|
+
name: "deepgram",
|
|
515
|
+
async open(openOpts) {
|
|
516
|
+
const apiKey = openOpts.apiKey || process.env.DEEPGRAM_API_KEY;
|
|
517
|
+
if (!apiKey) throw makeSttError("stt_auth_failed", "Deepgram STT: missing API key. Set DEEPGRAM_API_KEY in the agent env.");
|
|
518
|
+
const client = new DeepgramClient({ apiKey });
|
|
519
|
+
let connection;
|
|
520
|
+
try {
|
|
521
|
+
connection = await client.listen.v1.connect({
|
|
522
|
+
model: opts.model ?? "nova-3",
|
|
523
|
+
language: opts.language ?? "en",
|
|
524
|
+
encoding: "linear16",
|
|
525
|
+
sample_rate: openOpts.sampleRate,
|
|
526
|
+
channels: 1,
|
|
527
|
+
interim_results: "true",
|
|
528
|
+
smart_format: "true",
|
|
529
|
+
endpointing: 300,
|
|
530
|
+
utterance_end_ms: "1000",
|
|
531
|
+
Authorization: apiKey
|
|
532
|
+
});
|
|
533
|
+
} catch (cause) {
|
|
534
|
+
throw makeSttError("stt_connect_failed", `Deepgram STT: connect failed: ${cause instanceof Error ? cause.message : String(cause)}`);
|
|
535
|
+
}
|
|
536
|
+
const emitter = createNanoEvents();
|
|
537
|
+
let closed = false;
|
|
538
|
+
wireSocketEvents(connection, emitter, () => closed);
|
|
539
|
+
connection.connect();
|
|
540
|
+
try {
|
|
541
|
+
await connection.waitForOpen();
|
|
542
|
+
} catch (cause) {
|
|
543
|
+
throw makeSttError("stt_connect_failed", `Deepgram STT: WebSocket open failed: ${cause instanceof Error ? cause.message : String(cause)}`);
|
|
544
|
+
}
|
|
545
|
+
const close = async () => {
|
|
546
|
+
if (closed) return;
|
|
547
|
+
closed = true;
|
|
548
|
+
try {
|
|
549
|
+
connection.close();
|
|
550
|
+
} catch {}
|
|
551
|
+
};
|
|
552
|
+
wireAbortSignal(openOpts.signal, close);
|
|
553
|
+
return {
|
|
554
|
+
sendAudio(pcm) {
|
|
555
|
+
if (closed) return;
|
|
556
|
+
connection.sendMedia(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
|
|
557
|
+
},
|
|
558
|
+
on(event, fn) {
|
|
559
|
+
return emitter.on(event, fn);
|
|
560
|
+
},
|
|
561
|
+
close,
|
|
562
|
+
_connection: connection
|
|
563
|
+
};
|
|
564
|
+
}
|
|
478
565
|
};
|
|
479
566
|
}
|
|
480
|
-
const jsonLogger = {
|
|
481
|
-
info: jsonLog("info"),
|
|
482
|
-
warn: jsonLog("warn"),
|
|
483
|
-
error: jsonLog("error"),
|
|
484
|
-
debug: jsonLog("debug")
|
|
485
|
-
};
|
|
486
|
-
/** Default S2S endpoint configuration. */
|
|
487
|
-
const DEFAULT_S2S_CONFIG = {
|
|
488
|
-
wssUrl: "wss://agents.assemblyai.com/v1/voice",
|
|
489
|
-
inputSampleRate: DEFAULT_STT_SAMPLE_RATE,
|
|
490
|
-
outputSampleRate: DEFAULT_TTS_SAMPLE_RATE
|
|
491
|
-
};
|
|
492
567
|
//#endregion
|
|
493
|
-
//#region host/
|
|
568
|
+
//#region host/providers/tts/cartesia.ts
|
|
494
569
|
/**
|
|
495
|
-
*
|
|
496
|
-
* delegation to the agent's {@link ExecuteTool} function.
|
|
570
|
+
* Cartesia TTS opener (host-only).
|
|
497
571
|
*
|
|
498
|
-
* The
|
|
499
|
-
*
|
|
500
|
-
*
|
|
501
|
-
*
|
|
502
|
-
* timeout) remains the single source of truth for tool behavior.
|
|
572
|
+
* The user-facing descriptor factory (`cartesia(...)`) lives in
|
|
573
|
+
* `sdk/providers/tts/cartesia.ts`. This module is the host-side
|
|
574
|
+
* counterpart: it takes the descriptor options + an API key and
|
|
575
|
+
* returns a {@link TtsOpener} that the pipeline session drives.
|
|
503
576
|
*
|
|
504
|
-
*
|
|
505
|
-
*
|
|
506
|
-
* bag-level `ctx.signal` so individual invocations respect streamText
|
|
507
|
-
* aborts.
|
|
508
|
-
*/
|
|
509
|
-
/**
|
|
510
|
-
* Convert an array of {@link ToolSchema} to a Vercel AI SDK `ToolSet`
|
|
511
|
-
* (record keyed by tool name).
|
|
577
|
+
* Wraps `@cartesia/cartesia-js`'s `TTSWS` / `TTSWSContext` and normalizes it
|
|
578
|
+
* onto the {@link TtsEvents} contract consumed by the pipeline orchestrator.
|
|
512
579
|
*
|
|
513
|
-
*
|
|
514
|
-
* the
|
|
515
|
-
*
|
|
580
|
+
* **Per-turn context lifecycle.** Each `sendText(...)` within the same turn
|
|
581
|
+
* appends to the same Cartesia context. On `flush()` or `cancel()`, a new
|
|
582
|
+
* context is minted for the next turn — so concurrent `cancel({ contextId })`
|
|
583
|
+
* only targets the in-flight turn, never the one that follows.
|
|
584
|
+
*
|
|
585
|
+
* **Audio format.** The adapter requests `raw` / `pcm_s16le` at the
|
|
586
|
+
* negotiated `sampleRate` so it can forward chunks as `Int16Array` with no
|
|
587
|
+
* conversion.
|
|
516
588
|
*/
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
}
|
|
530
|
-
});
|
|
531
|
-
return out;
|
|
589
|
+
/** PCM16 sample rates supported by Cartesia's `raw` output format. */
|
|
590
|
+
const CARTESIA_PCM16_RATES = [
|
|
591
|
+
8e3,
|
|
592
|
+
16e3,
|
|
593
|
+
22050,
|
|
594
|
+
24e3,
|
|
595
|
+
44100,
|
|
596
|
+
48e3
|
|
597
|
+
];
|
|
598
|
+
function assertSupportedSampleRate$1(rate) {
|
|
599
|
+
if (CARTESIA_PCM16_RATES.includes(rate)) return rate;
|
|
600
|
+
throw makeTtsError("tts_connect_failed", `Cartesia TTS: unsupported sample rate ${rate}. Supported: ${CARTESIA_PCM16_RATES.join(", ")}.`);
|
|
532
601
|
}
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
function toModelMessage(m) {
|
|
536
|
-
if (m.role === "user") return {
|
|
537
|
-
role: "user",
|
|
538
|
-
content: m.content
|
|
539
|
-
};
|
|
540
|
-
if (m.role === "assistant") return {
|
|
541
|
-
role: "assistant",
|
|
542
|
-
content: m.content
|
|
543
|
-
};
|
|
544
|
-
return {
|
|
545
|
-
role: "assistant",
|
|
546
|
-
content: m.content
|
|
547
|
-
};
|
|
548
|
-
}
|
|
549
|
-
function emitError(client, code, message) {
|
|
550
|
-
client.event({
|
|
551
|
-
type: "error",
|
|
552
|
-
code,
|
|
553
|
-
message
|
|
554
|
-
});
|
|
555
|
-
}
|
|
556
|
-
function handleStreamPart(part, deps) {
|
|
557
|
-
switch (part.type) {
|
|
558
|
-
case "text-delta": {
|
|
559
|
-
const delta = part.text ?? "";
|
|
560
|
-
if (delta.length === 0) return;
|
|
561
|
-
deps.onTextDelta(delta);
|
|
562
|
-
deps.tts?.sendText(delta);
|
|
563
|
-
return;
|
|
564
|
-
}
|
|
565
|
-
case "tool-call": {
|
|
566
|
-
const input = part.input ?? {};
|
|
567
|
-
deps.client.event({
|
|
568
|
-
type: "tool_call",
|
|
569
|
-
toolCallId: part.toolCallId ?? "",
|
|
570
|
-
toolName: part.toolName ?? "",
|
|
571
|
-
args: input
|
|
572
|
-
});
|
|
573
|
-
return;
|
|
574
|
-
}
|
|
575
|
-
case "tool-result": {
|
|
576
|
-
const output = part.output;
|
|
577
|
-
const resultString = typeof output === "string" ? output : JSON.stringify(output);
|
|
578
|
-
deps.client.event({
|
|
579
|
-
type: "tool_call_done",
|
|
580
|
-
toolCallId: part.toolCallId ?? "",
|
|
581
|
-
result: resultString
|
|
582
|
-
});
|
|
583
|
-
return;
|
|
584
|
-
}
|
|
585
|
-
case "error": {
|
|
586
|
-
const msg = errorMessage(part.error);
|
|
587
|
-
deps.log.error("LLM stream error", {
|
|
588
|
-
message: msg,
|
|
589
|
-
sessionId: deps.sessionId
|
|
590
|
-
});
|
|
591
|
-
emitError(deps.client, "llm", msg);
|
|
592
|
-
return;
|
|
593
|
-
}
|
|
594
|
-
default: return;
|
|
595
|
-
}
|
|
596
|
-
}
|
|
597
|
-
/** Create a pluggable-provider voice session. */
|
|
598
|
-
function createPipelineSession(opts) {
|
|
599
|
-
const log = opts.logger ?? consoleLogger;
|
|
600
|
-
const sttSampleRate = opts.sttSampleRate ?? 16e3;
|
|
601
|
-
const ttsSampleRate = opts.ttsSampleRate ?? 24e3;
|
|
602
|
-
const { client, agentConfig, toolSchemas, executeTool } = opts;
|
|
603
|
-
const systemPrompt = buildSystemPrompt(agentConfig, {
|
|
604
|
-
hasTools: toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0,
|
|
605
|
-
voice: true,
|
|
606
|
-
toolGuidance: opts.toolGuidance
|
|
607
|
-
});
|
|
608
|
-
const ctx = buildPipelineCtx({
|
|
609
|
-
id: opts.id,
|
|
610
|
-
agent: opts.agent,
|
|
611
|
-
client,
|
|
612
|
-
agentConfig,
|
|
613
|
-
executeTool,
|
|
614
|
-
log,
|
|
615
|
-
maxHistory: opts.maxHistory
|
|
616
|
-
});
|
|
617
|
-
const sessionAbort = new AbortController();
|
|
618
|
-
let audioReady = false;
|
|
619
|
-
let terminated = false;
|
|
620
|
-
let turnController = null;
|
|
621
|
-
let nextReplyId = 0;
|
|
622
|
-
const sttSubs = [];
|
|
623
|
-
const ttsSubs = [];
|
|
624
|
-
/**
|
|
625
|
-
* Tear down the session after an unrecoverable provider error. Aborts the
|
|
626
|
-
* in-flight turn, cancels TTS, signals providers to close via sessionAbort,
|
|
627
|
-
* and flips `terminated` so future STT events and audio frames become
|
|
628
|
-
* no-ops. Idempotent.
|
|
629
|
-
*/
|
|
630
|
-
function terminate() {
|
|
631
|
-
if (terminated) return;
|
|
632
|
-
terminated = true;
|
|
633
|
-
if (turnController !== null) {
|
|
634
|
-
turnController.abort();
|
|
635
|
-
turnController = null;
|
|
636
|
-
}
|
|
637
|
-
ctx.tts?.cancel();
|
|
638
|
-
ctx.cancelReply();
|
|
639
|
-
sessionAbort.abort();
|
|
640
|
-
}
|
|
641
|
-
function onSttPartial(_text) {
|
|
642
|
-
if (terminated) return;
|
|
643
|
-
if (turnController === null) return;
|
|
644
|
-
log.info("Pipeline barge-in", { sessionId: opts.id });
|
|
645
|
-
turnController.abort();
|
|
646
|
-
turnController = null;
|
|
647
|
-
ctx.tts?.cancel();
|
|
648
|
-
ctx.cancelReply();
|
|
649
|
-
client.event({ type: "cancelled" });
|
|
650
|
-
}
|
|
651
|
-
function onSttFinal(text) {
|
|
652
|
-
if (terminated) return;
|
|
653
|
-
const trimmed = text.trim();
|
|
654
|
-
if (trimmed.length === 0) return;
|
|
655
|
-
if (turnController !== null) {
|
|
656
|
-
log.info("Pipeline replacing in-flight turn", { sessionId: opts.id });
|
|
657
|
-
turnController.abort();
|
|
658
|
-
turnController = null;
|
|
659
|
-
ctx.tts?.cancel();
|
|
660
|
-
ctx.cancelReply();
|
|
661
|
-
client.event({ type: "cancelled" });
|
|
662
|
-
}
|
|
663
|
-
client.event({
|
|
664
|
-
type: "user_transcript",
|
|
665
|
-
text
|
|
666
|
-
});
|
|
667
|
-
const turn = runTurn(trimmed).catch((err) => {
|
|
668
|
-
log.error("Pipeline turn crashed", {
|
|
669
|
-
error: errorMessage(err),
|
|
670
|
-
sessionId: opts.id
|
|
671
|
-
});
|
|
672
|
-
});
|
|
673
|
-
ctx.chainTurn(turn);
|
|
674
|
-
}
|
|
675
|
-
function onSttError(err) {
|
|
676
|
-
if (terminated) return;
|
|
677
|
-
log.error("STT error", {
|
|
678
|
-
code: err.code,
|
|
679
|
-
message: err.message,
|
|
680
|
-
sessionId: opts.id
|
|
681
|
-
});
|
|
682
|
-
emitError(client, "stt", err.message);
|
|
683
|
-
terminate();
|
|
684
|
-
}
|
|
685
|
-
function onTtsError(err) {
|
|
686
|
-
if (terminated) return;
|
|
687
|
-
log.error("TTS error", {
|
|
688
|
-
code: err.code,
|
|
689
|
-
message: err.message,
|
|
690
|
-
sessionId: opts.id
|
|
691
|
-
});
|
|
692
|
-
emitError(client, "tts", err.message);
|
|
693
|
-
terminate();
|
|
694
|
-
}
|
|
695
|
-
async function consumeLlmStream(ctl, messages, tools, onDelta) {
|
|
696
|
-
const deps = {
|
|
697
|
-
client,
|
|
698
|
-
tts: ctx.tts,
|
|
699
|
-
log,
|
|
700
|
-
sessionId: opts.id,
|
|
701
|
-
onTextDelta: onDelta
|
|
702
|
-
};
|
|
703
|
-
try {
|
|
704
|
-
const maxSteps = agentConfig.maxSteps ?? 5;
|
|
705
|
-
const result = streamText({
|
|
706
|
-
model: opts.llm,
|
|
707
|
-
system: systemPrompt,
|
|
708
|
-
messages,
|
|
709
|
-
tools,
|
|
710
|
-
stopWhen: stepCountIs(maxSteps),
|
|
711
|
-
abortSignal: ctl.signal
|
|
712
|
-
});
|
|
713
|
-
for await (const part of result.fullStream) {
|
|
714
|
-
if (ctl.signal.aborted) break;
|
|
715
|
-
handleStreamPart(part, deps);
|
|
716
|
-
}
|
|
717
|
-
} catch (err) {
|
|
718
|
-
if (!ctl.signal.aborted) {
|
|
719
|
-
const msg = errorMessage(err);
|
|
720
|
-
log.error("LLM streamText failed", {
|
|
721
|
-
error: msg,
|
|
722
|
-
sessionId: opts.id
|
|
723
|
-
});
|
|
724
|
-
emitError(client, "llm", msg);
|
|
725
|
-
}
|
|
726
|
-
}
|
|
727
|
-
}
|
|
728
|
-
/**
|
|
729
|
-
* Flush TTS and wait for drain. Resolves on any of:
|
|
730
|
-
* - TTS emits `done`
|
|
731
|
-
* - `signal` aborts (barge-in, provider error, session stop)
|
|
732
|
-
* - `PIPELINE_FLUSH_TIMEOUT_MS` elapses
|
|
733
|
-
* Resolves immediately if no TTS session.
|
|
734
|
-
*/
|
|
735
|
-
function flushTtsAndWait(signal) {
|
|
736
|
-
const tts = ctx.tts;
|
|
737
|
-
if (!tts) return Promise.resolve();
|
|
738
|
-
return new Promise((resolve) => {
|
|
739
|
-
let off = null;
|
|
740
|
-
let timer = null;
|
|
741
|
-
const cleanup = () => {
|
|
742
|
-
if (off) {
|
|
743
|
-
off();
|
|
744
|
-
off = null;
|
|
745
|
-
}
|
|
746
|
-
if (timer) {
|
|
747
|
-
clearTimeout(timer);
|
|
748
|
-
timer = null;
|
|
749
|
-
}
|
|
750
|
-
signal.removeEventListener("abort", onAbort);
|
|
751
|
-
};
|
|
752
|
-
const finish = () => {
|
|
753
|
-
cleanup();
|
|
754
|
-
resolve();
|
|
755
|
-
};
|
|
756
|
-
const onAbort = () => finish();
|
|
757
|
-
if (signal.aborted) {
|
|
758
|
-
resolve();
|
|
759
|
-
return;
|
|
760
|
-
}
|
|
761
|
-
signal.addEventListener("abort", onAbort, { once: true });
|
|
762
|
-
off = tts.on("done", finish);
|
|
763
|
-
timer = setTimeout(() => {
|
|
764
|
-
log.warn("TTS flush timeout", {
|
|
765
|
-
sessionId: opts.id,
|
|
766
|
-
timeoutMs: PIPELINE_FLUSH_TIMEOUT_MS
|
|
767
|
-
});
|
|
768
|
-
finish();
|
|
769
|
-
}, PIPELINE_FLUSH_TIMEOUT_MS);
|
|
770
|
-
tts.flush();
|
|
771
|
-
});
|
|
772
|
-
}
|
|
773
|
-
async function runTurn(userText) {
|
|
774
|
-
const replyId = `pipeline-${++nextReplyId}`;
|
|
775
|
-
ctx.beginReply(replyId);
|
|
776
|
-
ctx.pushMessages({
|
|
777
|
-
role: "user",
|
|
778
|
-
content: userText
|
|
779
|
-
});
|
|
780
|
-
const ctl = new AbortController();
|
|
781
|
-
turnController = ctl;
|
|
782
|
-
const tools = toVercelTools(toolSchemas, {
|
|
783
|
-
executeTool,
|
|
784
|
-
sessionId: opts.id,
|
|
785
|
-
messages: () => ctx.conversationMessages,
|
|
786
|
-
signal: ctl.signal
|
|
787
|
-
});
|
|
788
|
-
const messages = ctx.conversationMessages.map(toModelMessage);
|
|
789
|
-
let accumulated = "";
|
|
790
|
-
await consumeLlmStream(ctl, messages, tools, (delta) => {
|
|
791
|
-
accumulated += delta;
|
|
792
|
-
});
|
|
793
|
-
if (ctl.signal.aborted) {
|
|
794
|
-
if (turnController === ctl) turnController = null;
|
|
795
|
-
return;
|
|
796
|
-
}
|
|
797
|
-
if (accumulated.length > 0) {
|
|
798
|
-
client.event({
|
|
799
|
-
type: "agent_transcript",
|
|
800
|
-
text: accumulated
|
|
801
|
-
});
|
|
802
|
-
ctx.pushMessages({
|
|
803
|
-
role: "assistant",
|
|
804
|
-
content: accumulated
|
|
805
|
-
});
|
|
806
|
-
}
|
|
807
|
-
await flushTtsAndWait(ctl.signal);
|
|
808
|
-
if (ctl.signal.aborted) {
|
|
809
|
-
if (turnController === ctl) turnController = null;
|
|
810
|
-
return;
|
|
811
|
-
}
|
|
812
|
-
client.playAudioDone();
|
|
813
|
-
client.event({ type: "reply_done" });
|
|
814
|
-
if (turnController === ctl) turnController = null;
|
|
815
|
-
}
|
|
816
|
-
async function runGreeting(text) {
|
|
817
|
-
const replyId = `pipeline-greeting-${++nextReplyId}`;
|
|
818
|
-
ctx.beginReply(replyId);
|
|
819
|
-
const ctl = new AbortController();
|
|
820
|
-
turnController = ctl;
|
|
821
|
-
client.event({
|
|
822
|
-
type: "agent_transcript",
|
|
823
|
-
text
|
|
824
|
-
});
|
|
825
|
-
ctx.pushMessages({
|
|
826
|
-
role: "assistant",
|
|
827
|
-
content: text
|
|
828
|
-
});
|
|
829
|
-
ctx.tts?.sendText(text);
|
|
830
|
-
await flushTtsAndWait(ctl.signal);
|
|
831
|
-
if (ctl.signal.aborted) {
|
|
832
|
-
if (turnController === ctl) turnController = null;
|
|
833
|
-
return;
|
|
834
|
-
}
|
|
835
|
-
client.playAudioDone();
|
|
836
|
-
client.event({ type: "reply_done" });
|
|
837
|
-
if (turnController === ctl) turnController = null;
|
|
838
|
-
}
|
|
839
|
-
function reportOpenRejection(which, reason) {
|
|
840
|
-
const msg = errorMessage(reason);
|
|
841
|
-
log.error(`${which === "stt" ? "STT" : "TTS"} open failed`, {
|
|
842
|
-
error: msg,
|
|
843
|
-
sessionId: opts.id
|
|
844
|
-
});
|
|
845
|
-
emitError(client, which, msg);
|
|
846
|
-
}
|
|
847
|
-
async function adoptStt(sttSession, teardown) {
|
|
848
|
-
if (teardown) {
|
|
849
|
-
await sttSession.close().catch(() => void 0);
|
|
850
|
-
return;
|
|
851
|
-
}
|
|
852
|
-
ctx.stt = sttSession;
|
|
853
|
-
sttSubs.push(sttSession.on("partial", onSttPartial));
|
|
854
|
-
sttSubs.push(sttSession.on("final", onSttFinal));
|
|
855
|
-
sttSubs.push(sttSession.on("error", onSttError));
|
|
856
|
-
}
|
|
857
|
-
async function adoptTts(ttsSession, teardown) {
|
|
858
|
-
if (teardown) {
|
|
859
|
-
await ttsSession.close().catch(() => void 0);
|
|
860
|
-
return;
|
|
861
|
-
}
|
|
862
|
-
ctx.tts = ttsSession;
|
|
863
|
-
ttsSubs.push(ttsSession.on("audio", (pcm) => {
|
|
864
|
-
client.playAudioChunk(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
|
|
865
|
-
}));
|
|
866
|
-
ttsSubs.push(ttsSession.on("error", onTtsError));
|
|
867
|
-
}
|
|
868
|
-
async function openProviders() {
|
|
869
|
-
const [sttResult, ttsResult] = await Promise.allSettled([opts.stt.open({
|
|
870
|
-
sampleRate: sttSampleRate,
|
|
871
|
-
apiKey: opts.sttApiKey,
|
|
872
|
-
sttPrompt: agentConfig.sttPrompt,
|
|
873
|
-
signal: sessionAbort.signal
|
|
874
|
-
}), opts.tts.open({
|
|
875
|
-
sampleRate: ttsSampleRate,
|
|
876
|
-
apiKey: opts.ttsApiKey,
|
|
877
|
-
signal: sessionAbort.signal
|
|
878
|
-
})]);
|
|
879
|
-
if (sttResult.status === "rejected") reportOpenRejection("stt", sttResult.reason);
|
|
880
|
-
if (ttsResult.status === "rejected") reportOpenRejection("tts", ttsResult.reason);
|
|
881
|
-
const aborted = sessionAbort.signal.aborted;
|
|
882
|
-
const sttFailed = sttResult.status === "rejected";
|
|
883
|
-
const ttsFailed = ttsResult.status === "rejected";
|
|
884
|
-
const teardown = aborted || sttFailed || ttsFailed;
|
|
885
|
-
if (sttResult.status === "fulfilled") await adoptStt(sttResult.value, teardown);
|
|
886
|
-
if (ttsResult.status === "fulfilled") await adoptTts(ttsResult.value, teardown);
|
|
887
|
-
if (!aborted && (sttFailed || ttsFailed)) terminate();
|
|
888
|
-
}
|
|
889
|
-
return {
|
|
890
|
-
async start() {
|
|
891
|
-
await openProviders();
|
|
892
|
-
},
|
|
893
|
-
async stop() {
|
|
894
|
-
if (sessionAbort.signal.aborted) return;
|
|
895
|
-
sessionAbort.abort();
|
|
896
|
-
turnController?.abort();
|
|
897
|
-
for (const off of sttSubs) off();
|
|
898
|
-
for (const off of ttsSubs) off();
|
|
899
|
-
sttSubs.length = 0;
|
|
900
|
-
ttsSubs.length = 0;
|
|
901
|
-
if (ctx.turnPromise !== null) await ctx.turnPromise;
|
|
902
|
-
await ctx.stt?.close().catch(() => {});
|
|
903
|
-
await ctx.tts?.close().catch(() => {});
|
|
904
|
-
},
|
|
905
|
-
onAudio(data) {
|
|
906
|
-
if (terminated || !audioReady) return;
|
|
907
|
-
const offset = data.byteOffset;
|
|
908
|
-
const length = data.byteLength;
|
|
909
|
-
let pcm;
|
|
910
|
-
if (offset % 2 === 0 && length % 2 === 0) pcm = new Int16Array(data.buffer, offset, length / 2);
|
|
911
|
-
else {
|
|
912
|
-
const copy = new Uint8Array(length - length % 2);
|
|
913
|
-
copy.set(data.subarray(0, copy.byteLength));
|
|
914
|
-
pcm = new Int16Array(copy.buffer);
|
|
915
|
-
}
|
|
916
|
-
ctx.stt?.sendAudio(pcm);
|
|
917
|
-
},
|
|
918
|
-
onAudioReady() {
|
|
919
|
-
if (audioReady || terminated) return;
|
|
920
|
-
audioReady = true;
|
|
921
|
-
if (opts.skipGreeting) return;
|
|
922
|
-
const greeting = agentConfig.greeting;
|
|
923
|
-
if (!greeting) return;
|
|
924
|
-
const turn = runGreeting(greeting).catch((err) => {
|
|
925
|
-
log.error("Pipeline greeting failed", {
|
|
926
|
-
error: errorMessage(err),
|
|
927
|
-
sessionId: opts.id
|
|
928
|
-
});
|
|
929
|
-
});
|
|
930
|
-
ctx.chainTurn(turn);
|
|
931
|
-
},
|
|
932
|
-
onCancel() {
|
|
933
|
-
if (terminated) return;
|
|
934
|
-
turnController?.abort();
|
|
935
|
-
turnController = null;
|
|
936
|
-
ctx.tts?.cancel();
|
|
937
|
-
ctx.cancelReply();
|
|
938
|
-
client.event({ type: "cancelled" });
|
|
939
|
-
},
|
|
940
|
-
onReset() {
|
|
941
|
-
if (terminated) return;
|
|
942
|
-
turnController?.abort();
|
|
943
|
-
turnController = null;
|
|
944
|
-
ctx.tts?.cancel();
|
|
945
|
-
ctx.cancelReply();
|
|
946
|
-
ctx.conversationMessages = [];
|
|
947
|
-
ctx.turnPromise = null;
|
|
948
|
-
client.event({ type: "reset" });
|
|
949
|
-
},
|
|
950
|
-
onHistory(incoming) {
|
|
951
|
-
if (terminated) return;
|
|
952
|
-
ctx.pushMessages(...incoming.map((m) => ({
|
|
953
|
-
role: m.role,
|
|
954
|
-
content: m.content
|
|
955
|
-
})));
|
|
956
|
-
},
|
|
957
|
-
waitForTurn() {
|
|
958
|
-
return ctx.turnPromise ?? Promise.resolve();
|
|
959
|
-
}
|
|
960
|
-
};
|
|
961
|
-
}
|
|
962
|
-
//#endregion
|
|
963
|
-
//#region host/providers/stt/assemblyai.ts
|
|
964
|
-
/**
|
|
965
|
-
* AssemblyAI Universal-Streaming STT opener (host-only).
|
|
966
|
-
*
|
|
967
|
-
* The user-facing descriptor factory (`assemblyAI(...)`) lives in
|
|
968
|
-
* `sdk/providers/stt/assemblyai.ts`. This module is the host-side
|
|
969
|
-
* counterpart: it takes the descriptor options + an API key and
|
|
970
|
-
* returns an {@link SttOpener} that the pipeline session drives.
|
|
971
|
-
*
|
|
972
|
-
* Default model: `"u3pro-rt"` (Universal-3 Pro Real-Time). The adapter
|
|
973
|
-
* maps that to the SDK's `"u3-rt-pro"` `speechModel` value; any other
|
|
974
|
-
* string is forwarded verbatim.
|
|
975
|
-
*/
|
|
976
|
-
/**
 * Map the descriptor-level model alias onto the `speechModel` value passed to
 * the AssemblyAI SDK. Only `"u3pro-rt"` is rewritten (to `"u3-rt-pro"`);
 * every other value is returned unchanged.
 */
function resolveSpeechModel(model) {
  return model === "u3pro-rt" ? "u3-rt-pro" : model;
}
|
|
981
|
-
/**
 * Create an {@link SttOpener} for AssemblyAI from resolved descriptor options.
 *
 * `open()` resolves the API key (`openOpts.apiKey`, then
 * `process.env.ASSEMBLYAI_API_KEY`), builds a streaming transcriber, wires its
 * `turn` / `error` / `close` events onto a nanoevents emitter, connects, and
 * returns a session exposing `sendAudio`, `on` and `close`.
 *
 * Throws an `stt_auth_failed` error when no key is available and an
 * `stt_connect_failed` error when `connect()` rejects.
 */
function openAssemblyAI(opts = {}) {
  return {
    name: "assemblyai",
    async open(openOpts) {
      const apiKey = openOpts.apiKey || process.env.ASSEMBLYAI_API_KEY;
      if (!apiKey) {
        throw makeSttError("stt_auth_failed", "AssemblyAI STT: missing API key. Set ASSEMBLYAI_API_KEY in the agent env.");
      }

      const client = new AssemblyAI({ apiKey });
      const speechModel = resolveSpeechModel(opts.model ?? "u3pro-rt");
      const transcriberOptions = {
        sampleRate: openOpts.sampleRate,
        speechModel
      };
      // Only forward a prompt when one is configured.
      if (openOpts.sttPrompt) transcriberOptions.prompt = openOpts.sttPrompt;
      const transcriber = client.streaming.transcriber(transcriberOptions);

      const emitter = createNanoEvents();
      let closed = false;

      transcriber.on("turn", (event) => {
        if (closed) return;
        const text = event.transcript ?? "";
        // Empty transcripts are never surfaced, whether partial or final.
        if (text.length === 0) return;
        emitter.emit(event.end_of_turn ? "final" : "partial", text);
      });
      transcriber.on("error", (err) => {
        if (closed) return;
        const detail = err?.message ?? String(err);
        emitter.emit("error", makeSttError("stt_stream_error", detail));
      });
      transcriber.on("close", (code) => {
        // Close code 1000 is treated as a clean shutdown; anything else is reported.
        if (closed || code === 1e3) return;
        emitter.emit("error", makeSttError("stt_stream_error", `socket closed ${code}`));
      });

      try {
        await transcriber.connect();
      } catch (cause) {
        const detail = cause instanceof Error ? cause.message : String(cause);
        throw makeSttError("stt_connect_failed", `AssemblyAI STT: connect failed: ${detail}`);
      }

      // Idempotent; failures while closing the transcriber are ignored on purpose.
      const close = async () => {
        if (closed) return;
        closed = true;
        try {
          await transcriber.close();
        } catch {}
      };

      // The signal may already have fired while we were awaiting connect().
      if (openOpts.signal.aborted) {
        close();
      } else {
        openOpts.signal.addEventListener("abort", () => void close(), { once: true });
      }

      return {
        sendAudio(pcm) {
          if (closed) return;
          // Hand the SDK a standalone ArrayBuffer holding exactly this chunk's bytes.
          const bytes = new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength).slice();
          transcriber.sendAudio(bytes.buffer);
        },
        on(event, fn) {
          return emitter.on(event, fn);
        },
        close,
        _transcriber: transcriber
      };
    }
  };
}
|
|
1042
|
-
//#endregion
|
|
1043
|
-
//#region host/providers/tts/cartesia.ts
|
|
1044
|
-
/**
|
|
1045
|
-
* Cartesia TTS opener (host-only).
|
|
1046
|
-
*
|
|
1047
|
-
* The user-facing descriptor factory (`cartesia(...)`) lives in
|
|
1048
|
-
* `sdk/providers/tts/cartesia.ts`. This module is the host-side
|
|
1049
|
-
* counterpart: it takes the descriptor options + an API key and
|
|
1050
|
-
* returns a {@link TtsOpener} that the pipeline session drives.
|
|
1051
|
-
*
|
|
1052
|
-
* Wraps `@cartesia/cartesia-js`'s `TTSWS` / `TTSWSContext` and normalizes it
|
|
1053
|
-
* onto the {@link TtsEvents} contract consumed by the pipeline orchestrator.
|
|
1054
|
-
*
|
|
1055
|
-
* **Per-turn context lifecycle.** Each `sendText(...)` within the same turn
|
|
1056
|
-
* appends to the same Cartesia context. On `flush()` or `cancel()`, a new
|
|
1057
|
-
* context is minted for the next turn — so concurrent `cancel({ contextId })`
|
|
1058
|
-
* only targets the in-flight turn, never the one that follows.
|
|
1059
|
-
*
|
|
1060
|
-
* **Audio format.** The adapter requests `raw` / `pcm_s16le` at the
|
|
1061
|
-
* negotiated `sampleRate` so it can forward chunks as `Int16Array` with no
|
|
1062
|
-
* conversion.
|
|
1063
|
-
*/
|
|
1064
|
-
/** Sample rates (Hz) at which Cartesia's `raw` output format can emit PCM16. */
const CARTESIA_PCM16_RATES = [8000, 16000, 22050, 24000, 44100, 48000];
/**
 * Check a sample rate against {@link CARTESIA_PCM16_RATES}.
 *
 * Returns the rate unchanged when it is supported; otherwise throws a
 * `tts_connect_failed` error listing the supported rates.
 */
function assertSupportedSampleRate(rate) {
  if (!CARTESIA_PCM16_RATES.includes(rate)) {
    const supported = CARTESIA_PCM16_RATES.join(", ");
    throw makeTtsError("tts_connect_failed", `Cartesia TTS: unsupported sample rate ${rate}. Supported: ${supported}.`);
  }
  return rate;
}
|
|
1077
|
-
/** Build a {@link TtsOpener} from resolved Cartesia descriptor options. */
|
|
1078
|
-
function openCartesia(opts) {
|
|
602
|
+
/** Build a {@link TtsOpener} from resolved Cartesia descriptor options. */
|
|
603
|
+
function openCartesia(opts) {
|
|
1079
604
|
return {
|
|
1080
605
|
name: "cartesia",
|
|
1081
606
|
async open(openOpts) {
|
|
1082
607
|
const apiKey = openOpts.apiKey || process.env.CARTESIA_API_KEY;
|
|
1083
608
|
if (!apiKey) throw makeTtsError("tts_auth_failed", "Cartesia TTS: missing API key. Set CARTESIA_API_KEY in the agent env.");
|
|
1084
|
-
const sampleRate = assertSupportedSampleRate(openOpts.sampleRate);
|
|
609
|
+
const sampleRate = assertSupportedSampleRate$1(openOpts.sampleRate);
|
|
1085
610
|
const model = opts.model ?? "sonic-2";
|
|
1086
611
|
const language = opts.language ?? "en";
|
|
612
|
+
const voice = opts.voice ?? "f786b574-daa5-4673-aa0c-cbe3e8534c02";
|
|
1087
613
|
const client = new Cartesia({ apiKey });
|
|
1088
614
|
let ws;
|
|
1089
615
|
try {
|
|
@@ -1098,7 +624,7 @@ function openCartesia(opts) {
|
|
|
1098
624
|
model_id: model,
|
|
1099
625
|
voice: {
|
|
1100
626
|
mode: "id",
|
|
1101
|
-
id:
|
|
627
|
+
id: voice
|
|
1102
628
|
},
|
|
1103
629
|
output_format: {
|
|
1104
630
|
container: "raw",
|
|
@@ -1169,7 +695,7 @@ function openCartesia(opts) {
|
|
|
1169
695
|
model_id: model,
|
|
1170
696
|
voice: {
|
|
1171
697
|
mode: "id",
|
|
1172
|
-
id:
|
|
698
|
+
id: voice
|
|
1173
699
|
},
|
|
1174
700
|
output_format: {
|
|
1175
701
|
container: "raw",
|
|
@@ -1178,38 +704,247 @@ function openCartesia(opts) {
|
|
|
1178
704
|
},
|
|
1179
705
|
language
|
|
1180
706
|
};
|
|
1181
|
-
const ignoreRejection = (_err) => {};
|
|
707
|
+
const ignoreRejection = (_err) => {};
|
|
708
|
+
return {
|
|
709
|
+
sendText(text) {
|
|
710
|
+
if (closed || text.length === 0) return;
|
|
711
|
+
rotateIfPending();
|
|
712
|
+
context.send({
|
|
713
|
+
...baseRequest,
|
|
714
|
+
transcript: text,
|
|
715
|
+
continue: true
|
|
716
|
+
}).catch(ignoreRejection);
|
|
717
|
+
},
|
|
718
|
+
flush() {
|
|
719
|
+
if (closed || rotatePending) return;
|
|
720
|
+
context.send({
|
|
721
|
+
...baseRequest,
|
|
722
|
+
transcript: "",
|
|
723
|
+
continue: false
|
|
724
|
+
}).catch(ignoreRejection);
|
|
725
|
+
rotatePending = true;
|
|
726
|
+
},
|
|
727
|
+
cancel() {
|
|
728
|
+
if (closed) return;
|
|
729
|
+
if (!doneEmitted) context.cancel().catch(ignoreRejection);
|
|
730
|
+
emitDoneOnce();
|
|
731
|
+
rotatePending = true;
|
|
732
|
+
},
|
|
733
|
+
on(event, fn) {
|
|
734
|
+
return emitter.on(event, fn);
|
|
735
|
+
},
|
|
736
|
+
close,
|
|
737
|
+
_ws: ws,
|
|
738
|
+
_currentContextId: () => context.contextId
|
|
739
|
+
};
|
|
740
|
+
}
|
|
741
|
+
};
|
|
742
|
+
}
|
|
743
|
+
//#endregion
|
|
744
|
+
//#region host/providers/tts/rime.ts
|
|
745
|
+
/**
|
|
746
|
+
* Rime TTS opener (host-only).
|
|
747
|
+
*
|
|
748
|
+
* The user-facing descriptor factory (`rime(...)`) lives in
|
|
749
|
+
* `sdk/providers/tts/rime.ts`. This module is the host-side
|
|
750
|
+
* counterpart: it takes the descriptor options + an API key and
|
|
751
|
+
* returns a {@link TtsOpener} that the pipeline session drives.
|
|
752
|
+
*
|
|
753
|
+
* **Protocol.** Connects to Rime's `ws2` JSON WebSocket endpoint
|
|
754
|
+
* (`wss://users-ws.rime.ai/ws2`). Client-to-server messages are JSON:
|
|
755
|
+
* - `{ "text": "..." }` — append text to the synthesis buffer
|
|
756
|
+
* - `{ "operation": "clear" }` — drop buffered text (barge-in)
|
|
757
|
+
* - `{ "operation": "eos" }` — drain buffer, close connection (NOT used
|
|
758
|
+
* during a session: it would tear down the WS, forcing reconnect per
|
|
759
|
+
* turn). We force end-of-turn synthesis with a trailing `"."` instead.
|
|
760
|
+
* The server responds with JSON frames:
|
|
761
|
+
* - `{ type: "chunk", data: <base64 PCM16 LE>, contextId: string | null }`
|
|
762
|
+
* - `{ type: "timestamps", ... }` (ignored)
|
|
763
|
+
* - `{ type: "error", message: string }` (surfaced as `tts_stream_error`)
|
|
764
|
+
*
|
|
765
|
+
* **Single long-lived connection per session.** Rime buffers text until it
|
|
766
|
+
* sees terminal punctuation (`.`, `?`, `!`), so we use one WebSocket per
|
|
767
|
+
* `open()` call and reuse it across turns. `clear` resets the buffer
|
|
768
|
+
* between cancellations.
|
|
769
|
+
*
|
|
770
|
+
* **Done detection.** After `flush()` sends a trailing `"."` to force the
|
|
771
|
+
* server to synthesize any half-buffered text, we arm a quiescence timer
|
|
772
|
+
* that fires 500 ms after the last received audio chunk. When it fires,
|
|
773
|
+
* `done` is emitted.
|
|
774
|
+
*
|
|
775
|
+
* **Audio format.** The URL requests `audioFormat=pcm` at the negotiated
|
|
776
|
+
* `sampleRate`, which returns raw PCM16 little-endian. We decode the base64
|
|
777
|
+
* payload and construct a zero-copy `Int16Array` view over the decoded bytes.
|
|
778
|
+
*/
|
|
779
|
+
/** Sample rates (Hz) this adapter will request from the Rime `ws2` endpoint for PCM16 output. */
const RIME_PCM16_RATES = [8000, 16000, 22050, 24000, 44100, 48000];
/**
 * Check a sample rate against {@link RIME_PCM16_RATES}.
 *
 * Returns the rate unchanged when it is supported; otherwise throws a
 * `tts_connect_failed` error listing the supported rates.
 */
function assertSupportedSampleRate(rate) {
  if (!RIME_PCM16_RATES.includes(rate)) {
    const supported = RIME_PCM16_RATES.join(", ");
    throw makeTtsError("tts_connect_failed", `Rime TTS: unsupported sample rate ${rate}. Supported: ${supported}.`);
  }
  return rate;
}
|
|
792
|
+
/**
 * Decode a base64 PCM16 little-endian payload from Rime into an `Int16Array`.
 *
 * `Buffer.from(base64, "base64")` yields a Node.js Buffer (a `Uint8Array`
 * subclass). Small Buffers may be carved out of Node's shared allocation
 * pool, so `byteOffset` is NOT guaranteed to be 0 and `bytes.buffer` may be
 * larger than the payload; the view is therefore always built from the
 * Buffer's own `byteOffset` and an explicit sample count.
 *
 * In the normal case the result is a zero-copy view over the decoded bytes.
 * If the Buffer ever starts on an odd byte offset (which an `Int16Array`
 * view cannot represent), the bytes are copied into a fresh buffer instead
 * of letting the constructor throw a `RangeError`.
 *
 * @param {string} data base64 text taken from a `chunk` frame
 * @returns {Int16Array} decoded samples; a trailing odd byte is dropped, and
 *   payloads shorter than two bytes produce an empty array
 */
function base64ToPcm(data) {
  const bytes = Buffer.from(data, "base64");
  const sampleCount = Math.floor(bytes.byteLength / 2);
  if (sampleCount === 0) return new Int16Array(0);
  if (bytes.byteOffset % 2 !== 0) {
    // Misaligned start: re-home the samples in their own 0-offset buffer.
    const aligned = new Uint8Array(sampleCount * 2);
    aligned.set(bytes.subarray(0, aligned.byteLength));
    return new Int16Array(aligned.buffer);
  }
  return new Int16Array(bytes.buffer, bytes.byteOffset, sampleCount);
}
|
|
806
|
+
/**
 * Quiescence window in milliseconds: once audio is flowing, `done` is emitted
 * when this much time passes after the most recent audio chunk.
 */
const QUIESCENCE_MS = 500;
|
|
808
|
+
/**
 * Upper bound in milliseconds on the wait for the FIRST audio chunk after
 * `flush()` before giving up and emitting `done`.
 *
 * Greetings and short replies call `flush()` immediately after `sendText()`,
 * so time-to-first-byte can exceed the 500 ms quiescence window. Once the
 * first chunk arrives, the shorter quiescence timeout takes over.
 */
const FIRST_AUDIO_TIMEOUT_MS = 5000;
|
|
816
|
+
/**
 * Resolve once `ws` emits `open`, or reject with a `tts_connect_failed` error
 * on the first `error` event. Whichever listener fires removes the other, so
 * nothing registered here stays attached to the socket afterwards.
 */
function waitForOpen(ws) {
  return new Promise((resolve, reject) => {
    function handleOpen() {
      ws.removeListener("error", handleError);
      resolve();
    }
    function handleError(err) {
      ws.removeListener("open", handleOpen);
      const detail = err?.message ?? String(err);
      reject(makeTtsError("tts_connect_failed", `Rime TTS: connect failed: ${detail}`));
    }
    ws.once("open", handleOpen);
    ws.once("error", handleError);
  });
}
|
|
831
|
+
/**
 * Handle one incoming WebSocket message frame from Rime.
 *
 * Kept as a top-level function so `open()` stays small; the session state it
 * needs is reached through the callbacks passed in.
 *
 * Behavior:
 * - Frames that are not valid JSON, or whose JSON is not an object (for
 *   example `null`, a number or a string), are ignored.
 * - `{ type: "chunk", data: <string> }` is decoded with `base64ToPcm`;
 *   non-empty audio is emitted as `audio`, and the quiescence timer is
 *   re-armed only if a timer is currently active.
 * - `{ type: "error" }` is emitted as a `tts_stream_error`.
 * - Every other frame type is ignored.
 *
 * @param raw frame payload; non-strings are converted with `toString()`
 * @param emitter object whose `emit(event, payload)` receives `audio` / `error`
 * @param armQuiescence callback that (re)starts the quiescence timer
 * @param isActiveTimer callback reporting whether a timer is currently armed
 */
function handleRimeMessage(raw, emitter, armQuiescence, isActiveTimer) {
  let msg;
  try {
    msg = JSON.parse(typeof raw === "string" ? raw : raw.toString());
  } catch {
    return;
  }
  // JSON.parse happily returns null or primitives; reading `.type` on null
  // would throw inside the socket's message handler.
  if (msg === null || typeof msg !== "object") return;
  if (msg.type === "chunk" && typeof msg.data === "string") {
    const pcm = base64ToPcm(msg.data);
    if (pcm.length > 0) {
      emitter.emit("audio", pcm);
      if (isActiveTimer()) armQuiescence();
    }
    return;
  }
  if (msg.type === "error") {
    emitter.emit("error", makeTtsError("tts_stream_error", `Rime TTS: ${msg.message ?? "unknown error"}`));
  }
}
|
|
854
|
+
/** Build a {@link TtsOpener} from resolved Rime descriptor options. */
|
|
855
|
+
function openRime(opts) {
|
|
856
|
+
return {
|
|
857
|
+
name: "rime",
|
|
858
|
+
async open(openOpts) {
|
|
859
|
+
const apiKey = openOpts.apiKey || process.env.RIME_API_KEY;
|
|
860
|
+
if (!apiKey) throw makeTtsError("tts_auth_failed", "Rime TTS: missing API key. Set RIME_API_KEY in the agent env.");
|
|
861
|
+
const sampleRate = assertSupportedSampleRate(openOpts.sampleRate);
|
|
862
|
+
const model = opts.model ?? "mistv2";
|
|
863
|
+
const lang = opts.language ?? "eng";
|
|
864
|
+
const voice = opts.voice ?? "cove";
|
|
865
|
+
const url = `wss://users-ws.rime.ai/ws2?speaker=${encodeURIComponent(voice)}&modelId=${encodeURIComponent(model)}&audioFormat=pcm&samplingRate=${sampleRate}&lang=${encodeURIComponent(lang)}`;
|
|
866
|
+
let ws;
|
|
867
|
+
try {
|
|
868
|
+
ws = new WsWebSocket(url, { headers: { Authorization: `Bearer ${apiKey}` } });
|
|
869
|
+
} catch (cause) {
|
|
870
|
+
throw makeTtsError("tts_connect_failed", `Rime TTS: failed to create WebSocket: ${cause instanceof Error ? cause.message : String(cause)}`);
|
|
871
|
+
}
|
|
872
|
+
await waitForOpen(ws);
|
|
873
|
+
const emitter = createNanoEvents();
|
|
874
|
+
let closed = false;
|
|
875
|
+
let doneEmitted = false;
|
|
876
|
+
/**
|
|
877
|
+
* After `flush()`, we arm a timer that fires `done`. Initial timeout is
|
|
878
|
+
* `FIRST_AUDIO_TIMEOUT_MS` to give Rime headroom on TTFB; the first
|
|
879
|
+
* chunk swaps it for a shorter `QUIESCENCE_MS` window that resets on
|
|
880
|
+
* each subsequent chunk. `cancel()` emits `done` synchronously.
|
|
881
|
+
*/
|
|
882
|
+
let quiescenceTimer = null;
|
|
883
|
+
const clearQuiescence = () => {
|
|
884
|
+
if (quiescenceTimer !== null) {
|
|
885
|
+
clearTimeout(quiescenceTimer);
|
|
886
|
+
quiescenceTimer = null;
|
|
887
|
+
}
|
|
888
|
+
};
|
|
889
|
+
const emitDoneOnce = () => {
|
|
890
|
+
clearQuiescence();
|
|
891
|
+
if (doneEmitted || closed) return;
|
|
892
|
+
doneEmitted = true;
|
|
893
|
+
emitter.emit("done");
|
|
894
|
+
};
|
|
895
|
+
const armQuiescence = () => {
|
|
896
|
+
clearQuiescence();
|
|
897
|
+
quiescenceTimer = setTimeout(emitDoneOnce, QUIESCENCE_MS);
|
|
898
|
+
};
|
|
899
|
+
const armFirstAudioTimer = () => {
|
|
900
|
+
clearQuiescence();
|
|
901
|
+
quiescenceTimer = setTimeout(emitDoneOnce, FIRST_AUDIO_TIMEOUT_MS);
|
|
902
|
+
};
|
|
903
|
+
ws.on("message", (raw) => {
|
|
904
|
+
if (closed) return;
|
|
905
|
+
handleRimeMessage(raw, emitter, armQuiescence, () => quiescenceTimer !== null);
|
|
906
|
+
});
|
|
907
|
+
ws.on("error", (err) => {
|
|
908
|
+
if (closed) return;
|
|
909
|
+
emitter.emit("error", makeTtsError("tts_stream_error", `Rime TTS stream error: ${err?.message ?? String(err)}`));
|
|
910
|
+
});
|
|
911
|
+
ws.on("close", () => {
|
|
912
|
+
if (closed) return;
|
|
913
|
+
emitDoneOnce();
|
|
914
|
+
});
|
|
915
|
+
const close = async () => {
|
|
916
|
+
if (closed) return;
|
|
917
|
+
closed = true;
|
|
918
|
+
clearQuiescence();
|
|
919
|
+
try {
|
|
920
|
+
ws.close();
|
|
921
|
+
} catch {}
|
|
922
|
+
};
|
|
923
|
+
if (openOpts.signal.aborted) close();
|
|
924
|
+
else openOpts.signal.addEventListener("abort", () => void close(), { once: true });
|
|
1182
925
|
return {
|
|
1183
926
|
sendText(text) {
|
|
1184
927
|
if (closed || text.length === 0) return;
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
transcript: text,
|
|
1189
|
-
continue: true
|
|
1190
|
-
}).catch(ignoreRejection);
|
|
928
|
+
if (ws.readyState !== WsWebSocket.OPEN) return;
|
|
929
|
+
doneEmitted = false;
|
|
930
|
+
ws.send(JSON.stringify({ text }));
|
|
1191
931
|
},
|
|
1192
932
|
flush() {
|
|
1193
|
-
if (closed
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
continue: false
|
|
1198
|
-
}).catch(ignoreRejection);
|
|
1199
|
-
rotatePending = true;
|
|
933
|
+
if (closed) return;
|
|
934
|
+
if (ws.readyState !== WsWebSocket.OPEN) return;
|
|
935
|
+
ws.send(JSON.stringify({ text: "." }));
|
|
936
|
+
armFirstAudioTimer();
|
|
1200
937
|
},
|
|
1201
938
|
cancel() {
|
|
1202
939
|
if (closed) return;
|
|
1203
|
-
if (
|
|
940
|
+
if (ws.readyState === WsWebSocket.OPEN) ws.send(JSON.stringify({ operation: "clear" }));
|
|
1204
941
|
emitDoneOnce();
|
|
1205
|
-
rotatePending = true;
|
|
1206
942
|
},
|
|
1207
943
|
on(event, fn) {
|
|
1208
944
|
return emitter.on(event, fn);
|
|
1209
945
|
},
|
|
1210
946
|
close,
|
|
1211
|
-
_ws: ws
|
|
1212
|
-
_currentContextId: () => context.contextId
|
|
947
|
+
_ws: ws
|
|
1213
948
|
};
|
|
1214
949
|
}
|
|
1215
950
|
};
|
|
@@ -1225,53 +960,785 @@ function openCartesia(opts) {
|
|
|
1225
960
|
* resolvers here to turn each descriptor into its openable / callable
|
|
1226
961
|
* host-side counterpart, importing the third-party SDK only at that point.
|
|
1227
962
|
*
|
|
1228
|
-
* The guest sandbox never imports these functions, which is how the agent
|
|
1229
|
-
* bundle stays free of `@ai-sdk/anthropic` / `assemblyai` /
|
|
1230
|
-
* `@cartesia/cartesia-js`.
|
|
963
|
+
* The guest sandbox never imports these functions, which is how the agent
|
|
964
|
+
* bundle stays free of `@ai-sdk/anthropic` / `assemblyai` /
|
|
965
|
+
* `@cartesia/cartesia-js`.
|
|
966
|
+
*/
|
|
967
|
+
/**
 * Look up a provider API key: agent env first (set via `aai secret put` or
 * `.env`), then the host's `process.env` as a fallback for self-hosted mode.
 *
 * An empty string counts as "not set" at both levels, so a blank entry in the
 * agent env (e.g. `KEY=` in `.env`) does not hide a real key on the host.
 * This matches the provider openers, which use `||` for the same fallback.
 *
 * @param {string} envVar name of the environment variable to read
 * @param {Record<string, string | undefined>} env the agent's environment
 * @returns {string} the key, or `""` if neither source has one; the caller
 *   decides whether that is fatal
 */
function resolveApiKey(envVar, env) {
  return env[envVar] || process.env[envVar] || "";
}
|
|
975
|
+
/**
 * Turn an {@link SttProvider} descriptor into its host-side opener by
 * dispatching on `descriptor.kind`. Throws for any kind other than the
 * AssemblyAI and Deepgram kinds.
 */
function resolveStt(descriptor) {
  const kind = descriptor.kind;
  if (kind === ASSEMBLYAI_KIND) return openAssemblyAI(descriptor.options);
  if (kind === DEEPGRAM_KIND) return openDeepgram(descriptor.options);
  throw new Error(`Unknown STT provider kind: "${descriptor.kind}". Supported: ${ASSEMBLYAI_KIND}, ${DEEPGRAM_KIND}.`);
}
|
|
983
|
+
/**
 * Turn a {@link TtsProvider} descriptor into its host-side opener by
 * dispatching on `descriptor.kind`. Throws for any kind other than the
 * Cartesia and Rime kinds.
 */
function resolveTts(descriptor) {
  const kind = descriptor.kind;
  if (kind === CARTESIA_KIND) return openCartesia(descriptor.options);
  if (kind === RIME_KIND) return openRime(descriptor.options);
  throw new Error(`Unknown TTS provider kind: "${descriptor.kind}". Supported: ${CARTESIA_KIND}, ${RIME_KIND}.`);
}
|
|
991
|
+
/**
 * Turn an {@link LlmProvider} descriptor into a Vercel AI SDK
 * {@link LanguageModel}.
 *
 * Only the Anthropic kind is supported. The API key comes from
 * `resolveApiKey("ANTHROPIC_API_KEY", env)`; a missing key throws here, at
 * construction, rather than on the first `streamText` call where the failure
 * would be harder to diagnose.
 */
function resolveLlm(descriptor, env) {
  if (descriptor.kind !== ANTHROPIC_KIND) {
    throw new Error(`Unknown LLM provider kind: "${descriptor.kind}". Supported: ${ANTHROPIC_KIND}.`);
  }
  const options = descriptor.options;
  const apiKey = resolveApiKey("ANTHROPIC_API_KEY", env);
  if (!apiKey) {
    throw new Error("Anthropic LLM: missing API key. Set ANTHROPIC_API_KEY in the agent env.");
  }
  const anthropic = createAnthropic({
    apiKey,
    baseURL: "https://api.anthropic.com/v1"
  });
  return anthropic(options.model);
}
|
|
1013
|
+
//#endregion
|
|
1014
|
+
//#region host/runtime-config.ts
|
|
1015
|
+
/**
|
|
1016
|
+
* Runtime dependencies injected into the session pipeline.
|
|
1017
|
+
*
|
|
1018
|
+
* Defines the {@link Logger} interface, a default {@link consoleLogger},
|
|
1019
|
+
* and the {@link S2SConfig} for Speech-to-Speech endpoint configuration.
|
|
1020
|
+
*/
|
|
1021
|
+
/**
 * Adapt a console-style function to the logger's `(msg, ctx)` signature.
 * The context is passed through only when it is truthy, so a call without
 * context does not hand `fn` a trailing `undefined`.
 */
function consoleLog(fn) {
  return (msg, ctx) => {
    if (ctx) return fn(msg, ctx);
    return fn(msg);
  };
}
|
|
1024
|
+
/**
 * Default logger backed by the global `console`: each level is wrapped with
 * `consoleLog` around the matching console method (`info` uses `console.log`).
 */
const consoleLogger = Object.fromEntries([
  ["info", console.log],
  ["warn", console.warn],
  ["error", console.error],
  ["debug", console.debug]
].map(([level, sink]) => [level, consoleLog(sink)]));
|
|
1031
|
+
/**
 * Build one level of the structured JSON logger used for production
 * diagnostics. Each call writes a single-line JSON object containing
 * `timestamp`, `level`, `msg`, and any caller-provided context fields
 * (context fields are merged last, so they win on a name clash).
 *
 * `error` and `warn` entries go to `process.stderr`; all other levels go to
 * `process.stdout`.
 *
 * Logging never throws because of the context: if the entry cannot be
 * serialized (circular references, BigInt values, ...), a context-free line
 * is written instead, carrying a `logError` field that describes the failure.
 *
 * @param {string} level the level name stamped on every entry
 * @returns {(msg: string, ctx?: object) => void} the log function for that level
 */
function jsonLog(level) {
  return (msg, ctx) => {
    const timestamp = (/* @__PURE__ */ new Date()).toISOString();
    const entry = {
      timestamp,
      level,
      msg
    };
    if (ctx) Object.assign(entry, ctx);
    let line;
    try {
      line = JSON.stringify(entry);
    } catch (err) {
      // Keep the message even when the context is unserializable.
      line = JSON.stringify({
        timestamp,
        level,
        msg: String(msg),
        logError: `unserializable log context: ${err instanceof Error ? err.message : String(err)}`
      });
    }
    const stream = level === "error" || level === "warn" ? process.stderr : process.stdout;
    stream.write(`${line}\n`);
  };
}
|
|
1047
|
+
/** Logger whose `info`, `warn`, `error` and `debug` methods are each built by `jsonLog` for that level. */
const jsonLogger = Object.fromEntries(
  ["info", "warn", "error", "debug"].map((level) => [level, jsonLog(level)])
);
|
|
1053
|
+
/**
 * Default Speech-to-Speech endpoint configuration: the WebSocket URL plus
 * the default input (STT) and output (TTS) sample rates.
 */
const DEFAULT_S2S_CONFIG = {
  wssUrl: "wss://agents.assemblyai.com/v1/voice",
  inputSampleRate: DEFAULT_STT_SAMPLE_RATE,
  outputSampleRate: DEFAULT_TTS_SAMPLE_RATE
};
|
|
1059
|
+
//#endregion
|
|
1060
|
+
//#region host/session-core.ts
|
|
1061
|
+
/**
 * Threshold in milliseconds for `reply_done` handling: when the elapsed time
 * measured from the start of `onReplyDone` reaches this value, a
 * "slow reply_done dispatch" warning is logged.
 */
const REPLY_DONE_SLOW_THRESHOLD_MS = 50;
|
|
1062
|
+
function createSessionCore(opts) {
|
|
1063
|
+
const log = opts.logger ?? consoleLogger;
|
|
1064
|
+
const maxHistory = opts.maxHistory ?? 200;
|
|
1065
|
+
const idleMs = (() => {
|
|
1066
|
+
const raw = opts.agentConfig.idleTimeoutMs ?? 3e5;
|
|
1067
|
+
return raw === 0 || !Number.isFinite(raw) ? 0 : raw;
|
|
1068
|
+
})();
|
|
1069
|
+
let reply = {
|
|
1070
|
+
currentReplyId: null,
|
|
1071
|
+
pendingTools: [],
|
|
1072
|
+
toolCallCount: 0
|
|
1073
|
+
};
|
|
1074
|
+
let history = [];
|
|
1075
|
+
let turnPromise = null;
|
|
1076
|
+
let idleTimer = null;
|
|
1077
|
+
let stopped = false;
|
|
1078
|
+
function emit(event) {
|
|
1079
|
+
opts.client.event(event);
|
|
1080
|
+
}
|
|
1081
|
+
function resetIdle() {
|
|
1082
|
+
if (stopped || idleMs <= 0) return;
|
|
1083
|
+
if (idleTimer) clearTimeout(idleTimer);
|
|
1084
|
+
idleTimer = setTimeout(() => {
|
|
1085
|
+
log.info("session idle timeout", { sid: opts.id });
|
|
1086
|
+
emit({ type: "idle_timeout" });
|
|
1087
|
+
}, idleMs);
|
|
1088
|
+
}
|
|
1089
|
+
function pushMessages(...msgs) {
|
|
1090
|
+
history.push(...msgs);
|
|
1091
|
+
if (maxHistory > 0 && history.length > maxHistory) history.splice(0, history.length - maxHistory);
|
|
1092
|
+
}
|
|
1093
|
+
function beginReply(replyId) {
|
|
1094
|
+
reply = {
|
|
1095
|
+
currentReplyId: replyId,
|
|
1096
|
+
pendingTools: [],
|
|
1097
|
+
toolCallCount: 0
|
|
1098
|
+
};
|
|
1099
|
+
turnPromise = null;
|
|
1100
|
+
}
|
|
1101
|
+
function cancelReply() {
|
|
1102
|
+
reply = {
|
|
1103
|
+
currentReplyId: null,
|
|
1104
|
+
pendingTools: [],
|
|
1105
|
+
toolCallCount: 0
|
|
1106
|
+
};
|
|
1107
|
+
}
|
|
1108
|
+
function flushReply(startMs, hadTurnPromise) {
|
|
1109
|
+
const stepsUsed = reply.toolCallCount;
|
|
1110
|
+
if (stepsUsed > 0) log.info("Turn complete", {
|
|
1111
|
+
steps: stepsUsed,
|
|
1112
|
+
agent: opts.agent
|
|
1113
|
+
});
|
|
1114
|
+
opts.client.playAudioDone();
|
|
1115
|
+
emit({ type: "reply_done" });
|
|
1116
|
+
reply.currentReplyId = null;
|
|
1117
|
+
const durationMs = Date.now() - startMs;
|
|
1118
|
+
if (durationMs >= REPLY_DONE_SLOW_THRESHOLD_MS) log.warn("slow reply_done dispatch", {
|
|
1119
|
+
sid: opts.id,
|
|
1120
|
+
agent: opts.agent,
|
|
1121
|
+
durationMs,
|
|
1122
|
+
hadTurnPromise
|
|
1123
|
+
});
|
|
1124
|
+
}
|
|
1125
|
+
return {
|
|
1126
|
+
id: opts.id,
|
|
1127
|
+
async start() {
|
|
1128
|
+
resetIdle();
|
|
1129
|
+
await opts.transport.start();
|
|
1130
|
+
},
|
|
1131
|
+
async stop() {
|
|
1132
|
+
if (stopped) return;
|
|
1133
|
+
stopped = true;
|
|
1134
|
+
if (idleTimer) {
|
|
1135
|
+
clearTimeout(idleTimer);
|
|
1136
|
+
idleTimer = null;
|
|
1137
|
+
}
|
|
1138
|
+
if (turnPromise !== null) await turnPromise;
|
|
1139
|
+
await opts.transport.stop();
|
|
1140
|
+
},
|
|
1141
|
+
onAudio(bytes) {
|
|
1142
|
+
resetIdle();
|
|
1143
|
+
opts.transport.sendUserAudio(bytes);
|
|
1144
|
+
},
|
|
1145
|
+
onAudioReady() {},
|
|
1146
|
+
onCancel() {
|
|
1147
|
+
opts.transport.cancelReply();
|
|
1148
|
+
emit({ type: "cancelled" });
|
|
1149
|
+
},
|
|
1150
|
+
onReset() {
|
|
1151
|
+
cancelReply();
|
|
1152
|
+
history = [];
|
|
1153
|
+
emit({ type: "reset" });
|
|
1154
|
+
},
|
|
1155
|
+
onHistory(messages) {
|
|
1156
|
+
pushMessages(...messages);
|
|
1157
|
+
},
|
|
1158
|
+
onReplyStarted(replyId) {
|
|
1159
|
+
beginReply(replyId);
|
|
1160
|
+
},
|
|
1161
|
+
onReplyDone() {
|
|
1162
|
+
const startMs = Date.now();
|
|
1163
|
+
const doneReplyId = reply.currentReplyId;
|
|
1164
|
+
if (doneReplyId === null) {
|
|
1165
|
+
log.debug("Dropping duplicate reply.done (no active reply)");
|
|
1166
|
+
return;
|
|
1167
|
+
}
|
|
1168
|
+
const hadTurnPromise = turnPromise !== null;
|
|
1169
|
+
const sendPending = () => {
|
|
1170
|
+
if (reply.currentReplyId !== doneReplyId) {
|
|
1171
|
+
reply.pendingTools = [];
|
|
1172
|
+
return;
|
|
1173
|
+
}
|
|
1174
|
+
if (reply.pendingTools.length > 0) {
|
|
1175
|
+
for (const tool of reply.pendingTools) opts.transport.sendToolResult(tool.callId, tool.result);
|
|
1176
|
+
reply.pendingTools = [];
|
|
1177
|
+
} else flushReply(startMs, hadTurnPromise);
|
|
1178
|
+
};
|
|
1179
|
+
if (hadTurnPromise) turnPromise?.then(sendPending);
|
|
1180
|
+
else sendPending();
|
|
1181
|
+
},
|
|
1182
|
+
onCancelled() {
|
|
1183
|
+
cancelReply();
|
|
1184
|
+
emit({ type: "cancelled" });
|
|
1185
|
+
},
|
|
1186
|
+
onAudioChunk(bytes) {
|
|
1187
|
+
opts.client.playAudioChunk(bytes);
|
|
1188
|
+
},
|
|
1189
|
+
onAudioDone() {
|
|
1190
|
+
opts.client.playAudioDone();
|
|
1191
|
+
},
|
|
1192
|
+
onUserTranscript(text) {
|
|
1193
|
+
emit({
|
|
1194
|
+
type: "user_transcript",
|
|
1195
|
+
text
|
|
1196
|
+
});
|
|
1197
|
+
pushMessages({
|
|
1198
|
+
role: "user",
|
|
1199
|
+
content: text
|
|
1200
|
+
});
|
|
1201
|
+
},
|
|
1202
|
+
onAgentTranscript(text, interrupted) {
|
|
1203
|
+
emit({
|
|
1204
|
+
type: "agent_transcript",
|
|
1205
|
+
text
|
|
1206
|
+
});
|
|
1207
|
+
if (!interrupted) pushMessages({
|
|
1208
|
+
role: "assistant",
|
|
1209
|
+
content: text
|
|
1210
|
+
});
|
|
1211
|
+
},
|
|
1212
|
+
onToolCall(callId, name, args) {
|
|
1213
|
+
emit({
|
|
1214
|
+
type: "tool_call",
|
|
1215
|
+
toolCallId: callId,
|
|
1216
|
+
toolName: name,
|
|
1217
|
+
args
|
|
1218
|
+
});
|
|
1219
|
+
if (reply.currentReplyId === null) {
|
|
1220
|
+
log.warn("tool_call with no active reply", {
|
|
1221
|
+
sid: opts.id,
|
|
1222
|
+
name
|
|
1223
|
+
});
|
|
1224
|
+
return;
|
|
1225
|
+
}
|
|
1226
|
+
reply.toolCallCount++;
|
|
1227
|
+
const maxSteps = opts.agentConfig.maxSteps;
|
|
1228
|
+
if (maxSteps !== void 0 && reply.toolCallCount > maxSteps) {
|
|
1229
|
+
log.info("maxSteps exceeded; refusing tool call", {
|
|
1230
|
+
toolCallCount: reply.toolCallCount,
|
|
1231
|
+
maxSteps
|
|
1232
|
+
});
|
|
1233
|
+
reply.pendingTools.push({
|
|
1234
|
+
callId,
|
|
1235
|
+
result: JSON.stringify({ error: "Maximum tool steps reached. Please respond to the user now." })
|
|
1236
|
+
});
|
|
1237
|
+
emit({
|
|
1238
|
+
type: "tool_call_done",
|
|
1239
|
+
toolCallId: callId,
|
|
1240
|
+
result: "{}"
|
|
1241
|
+
});
|
|
1242
|
+
return;
|
|
1243
|
+
}
|
|
1244
|
+
const p = (async () => {
|
|
1245
|
+
try {
|
|
1246
|
+
const result = await opts.executeTool(name, args, opts.id, history);
|
|
1247
|
+
reply.pendingTools.push({
|
|
1248
|
+
callId,
|
|
1249
|
+
result
|
|
1250
|
+
});
|
|
1251
|
+
emit({
|
|
1252
|
+
type: "tool_call_done",
|
|
1253
|
+
toolCallId: callId,
|
|
1254
|
+
result
|
|
1255
|
+
});
|
|
1256
|
+
} catch (err) {
|
|
1257
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
1258
|
+
reply.pendingTools.push({
|
|
1259
|
+
callId,
|
|
1260
|
+
result: JSON.stringify({ error: message })
|
|
1261
|
+
});
|
|
1262
|
+
emit({
|
|
1263
|
+
type: "tool_call_done",
|
|
1264
|
+
toolCallId: callId,
|
|
1265
|
+
result: message
|
|
1266
|
+
});
|
|
1267
|
+
}
|
|
1268
|
+
})();
|
|
1269
|
+
turnPromise = (turnPromise ?? Promise.resolve()).then(() => p);
|
|
1270
|
+
},
|
|
1271
|
+
onError(code, message) {
|
|
1272
|
+
emit({
|
|
1273
|
+
type: "error",
|
|
1274
|
+
code,
|
|
1275
|
+
message
|
|
1276
|
+
});
|
|
1277
|
+
},
|
|
1278
|
+
onSpeechStarted() {
|
|
1279
|
+
emit({ type: "speech_started" });
|
|
1280
|
+
},
|
|
1281
|
+
onSpeechStopped() {
|
|
1282
|
+
emit({ type: "speech_stopped" });
|
|
1283
|
+
}
|
|
1284
|
+
};
|
|
1285
|
+
}
|
|
1286
|
+
//#endregion
|
|
1287
|
+
//#region host/tool-executor.ts
|
|
1288
|
+
/**
|
|
1289
|
+
* Tool execution — validates arguments and invokes tool handlers.
|
|
1290
|
+
*
|
|
1291
|
+
* {@link executeToolCall} is the single entry point used by both the
|
|
1292
|
+
* direct (self-hosted) runtime and the platform sandbox sidecar.
|
|
1293
|
+
*/
|
|
1294
|
+
/** Return a promise that resolves (with no value) once a zero-delay timer fires. */
const yieldTick = () => new Promise((resolve) => {
  setTimeout(resolve, 0);
});
|
|
1295
|
+
/**
 * Assemble the context object passed as the second argument to a tool's
 * `execute` handler.
 *
 * @param opts - Source options; reads `env`, `state`, `kv`, `messages`,
 *   `sessionId` and (lazily, on each call) `send`.
 * @returns An object exposing `env`, `state` (defaults to `{}`), a `kv`
 *   getter that throws `Error("KV not available")` when no store was given,
 *   `messages` (defaults to `[]`), `sessionId` (defaults to `""`) and a
 *   `send(event, data)` method that forwards to `opts.send` when present.
 */
function buildToolContext(opts) {
  // Snapshot the fields up front; only `send` is looked up at call time.
  const environment = opts.env;
  const initialState = opts.state;
  const store = opts.kv;
  const history = opts.messages;
  const sid = opts.sessionId;
  const context = {
    env: environment,
    state: initialState ?? {},
    get kv() {
      if (store) return store;
      throw new Error("KV not available");
    },
    messages: history ?? [],
    sessionId: sid ?? "",
    send(event, data) {
      opts.send?.(event, data);
    }
  };
  return context;
}
|
|
1311
|
+
/**
 * Validate `args` against the tool's parameter schema, run the tool handler
 * under a timeout, and return the outcome as a string.
 *
 * @param name - Tool name, used in error and log messages.
 * @param args - Raw arguments to validate with `tool.parameters.safeParse`
 *   (or `EMPTY_PARAMS` when the tool declares no parameters).
 * @param options - Must carry `tool`; is also passed to `buildToolContext`.
 *   An optional `logger` receives execution failures, otherwise
 *   `console.warn` is used.
 * @returns The handler's result: strings pass through unchanged, `null` /
 *   `undefined` become `"null"`, everything else is `JSON.stringify`-ed.
 *   Validation failures and thrown/timed-out handlers return the value
 *   produced by `toolError(...)` instead of rejecting.
 */
async function executeToolCall(name, args, options) {
  const { tool } = options;
  const parsed = (tool.parameters ?? EMPTY_PARAMS).safeParse(args);
  if (!parsed.success) {
    const issues = (parsed.error?.issues ?? [])
      .map((i) => `${i.path.map(String).join(".")}: ${i.message}`)
      .join(", ");
    return toolError(`Invalid arguments for tool "${name}": ${issues}`);
  }
  try {
    const ctx = buildToolContext(options);
    await yieldTick();
    const result = await pTimeout(Promise.resolve(tool.execute(parsed.data, ctx)), {
      milliseconds: TOOL_EXECUTION_TIMEOUT_MS,
      message: `Tool "${name}" timed out after ${TOOL_EXECUTION_TIMEOUT_MS}ms`
    });
    await yieldTick();
    if (result == null) return "null";
    if (typeof result === "string") return result;
    // JSON.stringify yields `undefined` (not a string) for values such as
    // functions and symbols; fall back to "null" so the return value is
    // always a string, matching the nullish case above.
    return JSON.stringify(result) ?? "null";
  } catch (err) {
    const log = options.logger;
    if (log) log.warn("Tool execution failed", {
      tool: name,
      error: errorDetail(err)
    });
    else console.warn(`[tool-executor] Tool execution failed: ${name}`, err);
    return toolError(errorMessage(err));
  }
}
|
|
1335
|
+
//#endregion
|
|
1336
|
+
//#region host/to-vercel-tools.ts
|
|
1337
|
+
/**
|
|
1338
|
+
* Converts agent {@link ToolSchema}[] to Vercel AI SDK tools with `execute`
|
|
1339
|
+
* delegation to the agent's {@link ExecuteTool} function.
|
|
1340
|
+
*
|
|
1341
|
+
* The pipeline orchestrator passes the output to `streamText({ tools })`.
|
|
1342
|
+
* Each produced tool's `execute` closure calls
|
|
1343
|
+
* `ctx.executeTool(name, args, sessionId, messages(), { signal, toolCallId })`,
|
|
1344
|
+
* so the existing agent tool infrastructure (argument validation, KV, hooks,
|
|
1345
|
+
* timeout) remains the single source of truth for tool behavior.
|
|
1346
|
+
*
|
|
1347
|
+
* Per-call `options.abortSignal` (forwarded by `streamText` when the
|
|
1348
|
+
* outer turn is aborted, e.g. barge-in) takes precedence over the
|
|
1349
|
+
* bag-level `ctx.signal` so individual invocations respect streamText
|
|
1350
|
+
* aborts.
|
|
1231
1351
|
*/
|
|
1232
1352
|
/**
 * Turn a list of tool schemas into a record keyed by tool name, where each
 * entry is produced by `tool({ description, inputSchema, execute })`.
 *
 * `inputSchema` wraps the schema's `parameters` with `jsonSchema(...)`.
 * `execute` delegates to `ctx.executeTool(name, args, sessionId, messages, opts)`:
 * - `args` defaults to `{}` when nullish;
 * - `messages` is a shallow copy of `ctx.messages()` taken at call time;
 * - `opts.signal` is the per-call `options.abortSignal`, falling back to
 *   `ctx.signal`, and is omitted when both are undefined;
 * - `opts.toolCallId` is forwarded only when defined.
 *
 * @param schemas - Iterable of `{ name, description, parameters }`.
 * @param ctx - `{ executeTool, sessionId, messages, signal? }`.
 * @returns Record mapping each schema name to its tool definition.
 */
function toVercelTools(schemas, ctx) {
  const toolSet = {};
  for (const schema of schemas) {
    const execute = async (args, options) => {
      const input = args ?? {};
      const extras = {};
      const signal = options.abortSignal ?? ctx.signal;
      if (signal !== undefined) extras.signal = signal;
      if (options.toolCallId !== undefined) extras.toolCallId = options.toolCallId;
      return ctx.executeTool(schema.name, input, ctx.sessionId, ctx.messages().slice(), extras);
    };
    toolSet[schema.name] = tool({
      description: schema.description,
      inputSchema: jsonSchema(schema.parameters),
      execute
    });
  }
  return toolSet;
}
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1376
|
+
//#endregion
|
|
1377
|
+
//#region host/transports/pipeline-transport.ts
|
|
1378
|
+
/**
 * Map a conversation message to the `{ role, content }` shape passed to the
 * LLM. Any role other than `"user"` is sent as `"assistant"`.
 */
function toModelMessage(m) {
  const role = m.role === "user" ? "user" : "assistant";
  return {
    role,
    content: m.content
  };
}
|
|
1388
|
+
/**
 * Create a pipeline-mode Transport (STT → LLM → TTS).
 *
 * Final STT transcripts start an LLM turn via `streamText`; text deltas are
 * forwarded to the TTS session, and TTS audio is surfaced through
 * `callbacks.onAudioChunk`. A partial or final transcript arriving while a
 * turn is in flight aborts that turn (barge-in).
 *
 * Defaults when not supplied in `opts`: STT sample rate 16000, TTS sample
 * rate 24000, `maxSteps` 5, `toolChoice` "auto", no tools, `consoleLogger`.
 * Conversation history is seeded from `sessionConfig.history` and capped at
 * the 200 most recent messages.
 *
 * @param opts - Requires `sid`, `stt`, `tts`, `llm`, `providerKeys`,
 *   `sessionConfig` and `callbacks`.
 * @returns `{ start, stop, sendUserAudio, sendToolResult, cancelReply }`.
 */
function createPipelineTransport(opts) {
  const log = opts.logger ?? consoleLogger;
  const sttSampleRate = opts.sttSampleRate ?? 16e3;
  const ttsSampleRate = opts.ttsSampleRate ?? 24e3;
  const maxSteps = opts.maxSteps ?? 5;
  const toolChoice = opts.toolChoice ?? "auto";
  const toolSchemas = opts.toolSchemas ?? [];
  const executeTool = opts.executeTool ?? (async () => {
    throw new Error("No executeTool provided");
  });
  const { callbacks, sessionConfig } = opts;
  const systemPrompt = sessionConfig.systemPrompt;
  const sessionAbort = new AbortController();
  let audioReady = false;
  let terminated = false;
  // Tracks stop() idempotency separately from sessionAbort, which
  // terminate() also aborts.
  let stopped = false;
  let sttSession = null;
  let ttsSession = null;
  let turnController = null;
  let nextReplyId = 0;
  const conversationMessages = sessionConfig.history ? [...sessionConfig.history] : [];
  let turnPromise = null;
  const sttSubs = [];
  const ttsSubs = [];

  /** Append to the conversation history, keeping only the newest 200 entries. */
  function pushMessages(...msgs) {
    conversationMessages.push(...msgs);
    if (conversationMessages.length > 200) conversationMessages.splice(0, conversationMessages.length - 200);
  }

  /** Add `p` to the chain that stop() awaits. `p` is already running. */
  function chainTurn(p) {
    turnPromise = (turnPromise ?? Promise.resolve()).then(() => p);
  }

  function emitError(code, message) {
    callbacks.onError(code, message);
  }

  /**
   * Abort the in-flight turn, cancel TTS and notify `onCancelled`.
   * Callers must have checked `turnController !== null`.
   */
  function interruptActiveTurn() {
    turnController.abort();
    turnController = null;
    ttsSession?.cancel();
    callbacks.onCancelled();
  }

  /**
   * Tear down after an unrecoverable provider error. Aborts the in-flight
   * turn, cancels TTS, signals providers to close. Idempotent.
   */
  function terminate() {
    if (terminated) return;
    terminated = true;
    if (turnController !== null) {
      turnController.abort();
      turnController = null;
    }
    // Unlike a barge-in, TTS is cancelled and onCancelled fires even when
    // no turn is active.
    ttsSession?.cancel();
    callbacks.onCancelled();
    sessionAbort.abort();
  }

  /** Any partial transcript during an active turn counts as a barge-in. */
  function onSttPartial(_text) {
    if (terminated) return;
    if (turnController === null) return;
    log.info("Pipeline barge-in", { sid: opts.sid });
    interruptActiveTurn();
  }

  /** A non-blank final transcript replaces any in-flight turn and starts a new one. */
  function onSttFinal(text) {
    if (terminated) return;
    const trimmed = text.trim();
    if (trimmed.length === 0) return;
    if (turnController !== null) {
      log.info("Pipeline replacing in-flight turn", { sid: opts.sid });
      interruptActiveTurn();
    }
    callbacks.onUserTranscript(text);
    chainTurn(runTurn(trimmed).catch((err) => {
      log.error("Pipeline turn crashed", {
        error: errorMessage(err),
        sid: opts.sid
      });
    }));
  }

  function onSttError(err) {
    if (terminated) return;
    log.error("STT error", {
      code: err.code,
      message: err.message,
      sid: opts.sid
    });
    emitError("stt", err.message);
    terminate();
  }

  function onTtsError(err) {
    if (terminated) return;
    log.error("TTS error", {
      code: err.code,
      message: err.message,
      sid: opts.sid
    });
    emitError("tts", err.message);
    terminate();
  }

  /**
   * Run `streamText` and feed each stream part to `handleStreamPart` until
   * the stream ends or `ctl` is aborted. Failures are reported as "llm"
   * errors unless the turn was aborted; this function never rejects.
   */
  async function consumeLlmStream(ctl, messages, tools, onDelta) {
    try {
      const result = streamText({
        model: opts.llm,
        system: systemPrompt,
        messages,
        tools,
        toolChoice,
        stopWhen: stepCountIs(maxSteps),
        abortSignal: ctl.signal
      });
      for await (const part of result.fullStream) {
        if (ctl.signal.aborted) break;
        handleStreamPart(part, ctl, onDelta);
      }
    } catch (err) {
      if (!ctl.signal.aborted) {
        const msg = errorMessage(err);
        log.error("LLM streamText failed", {
          error: msg,
          sid: opts.sid
        });
        emitError("llm", msg);
      }
    }
  }

  /** Route one stream part: text goes to TTS, tool calls and errors to callbacks. */
  function handleStreamPart(part, _ctl, onDelta) {
    switch (part.type) {
      case "text-delta": {
        const delta = part.text ?? "";
        if (delta.length === 0) return;
        onDelta(delta);
        ttsSession?.sendText(delta);
        return;
      }
      case "tool-call": {
        const input = part.input ?? {};
        callbacks.onToolCall(part.toolCallId ?? "", part.toolName ?? "", input);
        return;
      }
      case "error": {
        const msg = errorMessage(part.error);
        log.error("LLM stream error", {
          message: msg,
          sid: opts.sid
        });
        emitError("llm", msg);
        return;
      }
      default: return;
    }
  }

  /**
   * Flush TTS and wait for drain. Resolves on:
   *  - TTS emits `done`
   *  - `signal` aborts (barge-in / provider error / session stop)
   *  - PIPELINE_FLUSH_TIMEOUT_MS elapses
   * Resolves immediately if no TTS session.
   */
  function flushTtsAndWait(signal) {
    const tts = ttsSession;
    if (!tts) return Promise.resolve();
    return new Promise((resolve) => {
      let off = null;
      let timer = null;
      const cleanup = () => {
        if (off) {
          off();
          off = null;
        }
        if (timer) {
          clearTimeout(timer);
          timer = null;
        }
        signal.removeEventListener("abort", onAbort);
      };
      const finish = () => {
        cleanup();
        resolve();
      };
      const onAbort = () => finish();
      if (signal.aborted) {
        resolve();
        return;
      }
      signal.addEventListener("abort", onAbort, { once: true });
      off = tts.on("done", finish);
      timer = setTimeout(() => {
        log.warn("TTS flush timeout", {
          sid: opts.sid,
          timeoutMs: PIPELINE_FLUSH_TIMEOUT_MS
        });
        finish();
      }, PIPELINE_FLUSH_TIMEOUT_MS);
      tts.flush();
    });
  }

  /** Run one user turn: LLM stream → transcript → TTS drain → onReplyDone. */
  async function runTurn(userText) {
    const replyId = `pipeline-${++nextReplyId}`;
    callbacks.onReplyStarted(replyId);
    pushMessages({
      role: "user",
      content: userText
    });
    const ctl = new AbortController();
    turnController = ctl;
    const tools = toVercelTools(toolSchemas, {
      executeTool,
      sessionId: opts.sid,
      messages: () => conversationMessages,
      signal: ctl.signal
    });
    const messages = conversationMessages.map(toModelMessage);
    let accumulated = "";
    await consumeLlmStream(ctl, messages, tools, (delta) => {
      accumulated += delta;
    });
    // Only clear turnController if it still belongs to this turn; a newer
    // turn may have replaced it while we were awaiting.
    if (ctl.signal.aborted) {
      if (turnController === ctl) turnController = null;
      return;
    }
    if (accumulated.length > 0) {
      callbacks.onAgentTranscript(accumulated, false);
      pushMessages({
        role: "assistant",
        content: accumulated
      });
    }
    await flushTtsAndWait(ctl.signal);
    if (ctl.signal.aborted) {
      if (turnController === ctl) turnController = null;
      return;
    }
    callbacks.onReplyDone();
    if (turnController === ctl) turnController = null;
  }

  /** Speak the configured greeting as an interruptible reply (no LLM call). */
  async function runGreeting(text) {
    const replyId = `pipeline-greeting-${++nextReplyId}`;
    callbacks.onReplyStarted(replyId);
    const ctl = new AbortController();
    turnController = ctl;
    callbacks.onAgentTranscript(text, false);
    pushMessages({
      role: "assistant",
      content: text
    });
    ttsSession?.sendText(text);
    await flushTtsAndWait(ctl.signal);
    if (ctl.signal.aborted) {
      if (turnController === ctl) turnController = null;
      return;
    }
    callbacks.onReplyDone();
    if (turnController === ctl) turnController = null;
  }

  function reportOpenRejection(which, reason) {
    const msg = errorMessage(reason);
    log.error(`${which === "stt" ? "STT" : "TTS"} open failed`, {
      error: msg,
      sid: opts.sid
    });
    emitError(which, msg);
  }

  /** Keep the opened STT session and subscribe to it, or close it when tearing down. */
  async function adoptStt(session, teardown) {
    if (teardown) {
      await session.close().catch(() => void 0);
      return;
    }
    sttSession = session;
    sttSubs.push(session.on("partial", onSttPartial));
    sttSubs.push(session.on("final", onSttFinal));
    sttSubs.push(session.on("error", onSttError));
  }

  /** Keep the opened TTS session and subscribe to it, or close it when tearing down. */
  async function adoptTts(session, teardown) {
    if (teardown) {
      await session.close().catch(() => void 0);
      return;
    }
    ttsSession = session;
    ttsSubs.push(session.on("audio", (pcm) => {
      // Re-view the PCM samples as bytes without copying.
      callbacks.onAudioChunk(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
    }));
    ttsSubs.push(session.on("error", onTtsError));
  }

  /**
   * Open STT and TTS concurrently. If either open fails, or the session was
   * aborted meanwhile, any session that did open is closed rather than
   * adopted; open failures additionally terminate the transport.
   */
  async function openProviders() {
    const [sttResult, ttsResult] = await Promise.allSettled([opts.stt.open({
      sampleRate: sttSampleRate,
      apiKey: opts.providerKeys.stt,
      sttPrompt: opts.sttPrompt,
      signal: sessionAbort.signal
    }), opts.tts.open({
      sampleRate: ttsSampleRate,
      apiKey: opts.providerKeys.tts,
      signal: sessionAbort.signal
    })]);
    if (sttResult.status === "rejected") reportOpenRejection("stt", sttResult.reason);
    if (ttsResult.status === "rejected") reportOpenRejection("tts", ttsResult.reason);
    const aborted = sessionAbort.signal.aborted;
    const sttFailed = sttResult.status === "rejected";
    const ttsFailed = ttsResult.status === "rejected";
    const teardown = aborted || sttFailed || ttsFailed;
    if (sttResult.status === "fulfilled") await adoptStt(sttResult.value, teardown);
    if (ttsResult.status === "fulfilled") await adoptTts(ttsResult.value, teardown);
    if (!aborted && (sttFailed || ttsFailed)) terminate();
  }

  /** Mark audio as ready (once) and speak the greeting unless skipped or absent. */
  function onAudioReady() {
    if (audioReady || terminated) return;
    audioReady = true;
    if (opts.skipGreeting) return;
    const greeting = sessionConfig.greeting;
    if (!greeting) return;
    chainTurn(runGreeting(greeting).catch((err) => {
      log.error("Pipeline greeting failed", {
        error: errorMessage(err),
        sid: opts.sid
      });
    }));
  }

  return {
    async start() {
      await openProviders();
      callbacks.onSessionReady?.(opts.sid);
      onAudioReady();
    },
    async stop() {
      // Idempotency is keyed on `stopped`, not on sessionAbort: terminate()
      // aborts sessionAbort too, and checking that signal here used to skip
      // unsubscribing and closing the sessions after a provider error.
      if (stopped) return;
      stopped = true;
      sessionAbort.abort();
      turnController?.abort();
      for (const off of sttSubs) off();
      for (const off of ttsSubs) off();
      sttSubs.length = 0;
      ttsSubs.length = 0;
      if (turnPromise !== null) await turnPromise;
      await sttSession?.close().catch(() => {});
      await ttsSession?.close().catch(() => {});
    },
    sendUserAudio(bytes) {
      if (terminated || !audioReady) return;
      const offset = bytes.byteOffset;
      const length = bytes.byteLength;
      let pcm;
      // Int16Array views need a 2-byte-aligned offset and an even length;
      // otherwise copy into a fresh buffer, dropping a trailing odd byte.
      if (offset % 2 === 0 && length % 2 === 0) pcm = new Int16Array(bytes.buffer, offset, length / 2);
      else {
        const copy = new Uint8Array(length - length % 2);
        copy.set(bytes.subarray(0, copy.byteLength));
        pcm = new Int16Array(copy.buffer);
      }
      sttSession?.sendAudio(pcm);
    },
    // No-op in pipeline mode.
    sendToolResult(_callId, _result) {},
    cancelReply() {
      if (terminated) return;
      turnController?.abort();
      turnController = null;
      ttsSession?.cancel();
    }
  };
}
|
|
1276
1743
|
//#endregion
|
|
1277
1744
|
//#region host/s2s.ts
|
|
@@ -1326,72 +1793,59 @@ function parseS2sMessage(obj) {
|
|
|
1326
1793
|
const result = S2sMessageSchema.safeParse(obj);
|
|
1327
1794
|
return result.success ? result.data : void 0;
|
|
1328
1795
|
}
|
|
1329
|
-
function dispatchS2sMessage(
|
|
1796
|
+
/**
 * Route one parsed S2S message to the matching callback.
 *
 * `state.speechActive` de-duplicates speech start/stop notifications:
 * `onSpeechStarted` fires only on the inactive → active edge and
 * `onSpeechStopped` only on the active → inactive edge. `reply.done` is
 * logged through `ctx.log.info` (including `ctx.sid` when defined) and maps
 * to `onCancelled` when its status is "interrupted", otherwise `onReplyDone`.
 * `session.error` with code `session_not_found` / `session_forbidden` maps to
 * `onSessionExpired`; other errors are wrapped in `Error` for `onError`.
 * `session.updated` and unknown types are ignored.
 */
function dispatchS2sMessage(callbacks, msg, state, ctx) {
  const kind = msg.type;
  if (kind === "session.ready") {
    callbacks.onSessionReady(msg.session_id);
    return;
  }
  if (kind === "input.speech.started") {
    if (state.speechActive) return;
    state.speechActive = true;
    callbacks.onSpeechStarted();
    return;
  }
  if (kind === "input.speech.stopped") {
    if (!state.speechActive) return;
    state.speechActive = false;
    callbacks.onSpeechStopped();
    return;
  }
  if (kind === "transcript.user") {
    callbacks.onUserTranscript(msg.text);
    return;
  }
  if (kind === "reply.started") {
    callbacks.onReplyStarted(msg.reply_id);
    return;
  }
  if (kind === "transcript.agent") {
    callbacks.onAgentTranscript(msg.text, msg.interrupted);
    return;
  }
  if (kind === "tool.call") {
    callbacks.onToolCall(msg.call_id, msg.name, msg.args);
    return;
  }
  if (kind === "reply.done") {
    const status = msg.status ?? "completed";
    const fields = ctx.sid !== undefined ? { sid: ctx.sid, status } : { status };
    ctx.log.info("S2S << reply.done", fields);
    if (msg.status === "interrupted") {
      callbacks.onCancelled();
    } else {
      callbacks.onReplyDone();
    }
    return;
  }
  if (kind === "session.error") {
    const expired = msg.code === "session_not_found" || msg.code === "session_forbidden";
    if (expired) {
      callbacks.onSessionExpired();
    } else {
      callbacks.onError(new Error(msg.message));
    }
    return;
  }
  if (kind === "error") {
    callbacks.onError(new Error(msg.message));
  }
  // "session.updated" and unrecognised types: nothing to do.
}
|
|
1389
1844
|
function connectS2s(opts) {
|
|
1390
|
-
const { apiKey, config, createWebSocket, logger: log = consoleLogger, sid } = opts;
|
|
1845
|
+
const { apiKey, config, createWebSocket, callbacks, logger: log = consoleLogger, sid } = opts;
|
|
1391
1846
|
return new Promise((resolve, reject) => {
|
|
1392
1847
|
log.info("S2S connecting", { url: config.wssUrl });
|
|
1393
1848
|
const ws = createWebSocket(config.wssUrl, { headers: { Authorization: `Bearer ${apiKey}` } });
|
|
1394
|
-
const emitter = createNanoEvents();
|
|
1395
1849
|
const dispatchState = { speechActive: false };
|
|
1396
1850
|
const dispatchCtx = sid !== void 0 ? {
|
|
1397
1851
|
log,
|
|
@@ -1409,7 +1863,6 @@ function connectS2s(opts) {
|
|
|
1409
1863
|
ws.send(json);
|
|
1410
1864
|
}
|
|
1411
1865
|
const handle = {
|
|
1412
|
-
on: emitter.on.bind(emitter),
|
|
1413
1866
|
sendAudio(audio) {
|
|
1414
1867
|
if (ws.readyState !== 1) {
|
|
1415
1868
|
log.debug("S2S sendAudio dropped: socket not open");
|
|
@@ -1422,16 +1875,15 @@ function connectS2s(opts) {
|
|
|
1422
1875
|
ws.send(jsonFrame);
|
|
1423
1876
|
},
|
|
1424
1877
|
sendToolResult(callId, result) {
|
|
1425
|
-
const msg = {
|
|
1426
|
-
type: "tool.result",
|
|
1427
|
-
call_id: callId,
|
|
1428
|
-
result
|
|
1429
|
-
};
|
|
1430
1878
|
log.info("S2S >> tool.result", {
|
|
1431
1879
|
call_id: callId,
|
|
1432
1880
|
resultLength: result.length
|
|
1433
1881
|
});
|
|
1434
|
-
send(
|
|
1882
|
+
send({
|
|
1883
|
+
type: "tool.result",
|
|
1884
|
+
call_id: callId,
|
|
1885
|
+
result
|
|
1886
|
+
});
|
|
1435
1887
|
},
|
|
1436
1888
|
updateSession(sessionConfig) {
|
|
1437
1889
|
const { systemPrompt, ...rest } = sessionConfig;
|
|
@@ -1468,8 +1920,7 @@ function connectS2s(opts) {
|
|
|
1468
1920
|
}
|
|
1469
1921
|
function handleAudioFastPath(obj) {
|
|
1470
1922
|
if (obj.type === "reply.audio" && typeof obj.data === "string") {
|
|
1471
|
-
|
|
1472
|
-
emitter.emit("audio", { audio: audioBytes });
|
|
1923
|
+
callbacks.onAudio(base64ToUint8(obj.data));
|
|
1473
1924
|
return true;
|
|
1474
1925
|
}
|
|
1475
1926
|
return false;
|
|
@@ -1479,7 +1930,7 @@ function connectS2s(opts) {
|
|
|
1479
1930
|
if (obj.type === "reply.done") return;
|
|
1480
1931
|
log.info(`S2S << ${obj.type}`);
|
|
1481
1932
|
}
|
|
1482
|
-
|
|
1933
|
+
ws.addEventListener("message", (ev) => {
|
|
1483
1934
|
const raw = tryParseJson(ev.data);
|
|
1484
1935
|
if (raw === void 0) return;
|
|
1485
1936
|
if (typeof raw !== "object" || raw === null || Array.isArray(raw)) {
|
|
@@ -1494,9 +1945,8 @@ function connectS2s(opts) {
|
|
|
1494
1945
|
log.warn(`S2S << unrecognised message type: ${obj.type ?? JSON.stringify(raw).slice(0, 200)}`);
|
|
1495
1946
|
return;
|
|
1496
1947
|
}
|
|
1497
|
-
dispatchS2sMessage(
|
|
1498
|
-
}
|
|
1499
|
-
ws.addEventListener("message", handleS2sMessage);
|
|
1948
|
+
dispatchS2sMessage(callbacks, parsed, dispatchState, dispatchCtx);
|
|
1949
|
+
});
|
|
1500
1950
|
ws.addEventListener("close", (ev) => {
|
|
1501
1951
|
const code = ev.code ?? 0;
|
|
1502
1952
|
const reason = ev.reason ?? "";
|
|
@@ -1505,394 +1955,102 @@ function connectS2s(opts) {
|
|
|
1505
1955
|
reason
|
|
1506
1956
|
});
|
|
1507
1957
|
if (!opened) reject(/* @__PURE__ */ new Error(`WebSocket closed before open (code: ${code})`));
|
|
1508
|
-
|
|
1958
|
+
callbacks.onClose(code, reason);
|
|
1509
1959
|
});
|
|
1510
1960
|
ws.addEventListener("error", (ev) => {
|
|
1511
1961
|
const message = typeof ev.message === "string" ? ev.message : "WebSocket error";
|
|
1512
1962
|
const errObj = new Error(message);
|
|
1513
1963
|
log.error("S2S WebSocket error", { error: errObj.message });
|
|
1514
1964
|
if (!opened) reject(errObj);
|
|
1515
|
-
else
|
|
1965
|
+
else callbacks.onError(errObj);
|
|
1516
1966
|
});
|
|
1517
1967
|
});
|
|
1518
1968
|
}
|
|
1519
1969
|
//#endregion
|
|
1520
|
-
//#region host/
|
|
1521
|
-
/** @internal
|
|
1970
|
+
//#region host/transports/s2s-transport.ts
|
|
1971
|
+
/** @internal Exposed for testing — allows spying on connectS2s in unit tests. */
|
|
1522
1972
|
const _internals = { connectS2s };
|
|
1523
|
-
|
|
1524
|
-
|
|
1525
|
-
|
|
1526
|
-
|
|
1527
|
-
|
|
1528
|
-
function
|
|
1529
|
-
|
|
1530
|
-
|
|
1531
|
-
|
|
1532
|
-
|
|
1533
|
-
|
|
1534
|
-
|
|
1535
|
-
|
|
1536
|
-
|
|
1537
|
-
|
|
1538
|
-
|
|
1539
|
-
|
|
1540
|
-
|
|
1541
|
-
|
|
1542
|
-
|
|
1543
|
-
|
|
1544
|
-
|
|
1545
|
-
|
|
1546
|
-
|
|
1547
|
-
|
|
1548
|
-
|
|
1549
|
-
|
|
1973
|
+
function createS2sTransport(opts) {
|
|
1974
|
+
const log = opts.logger ?? consoleLogger;
|
|
1975
|
+
const createWs = opts.createWebSocket ?? defaultCreateS2sWebSocket;
|
|
1976
|
+
let handle = null;
|
|
1977
|
+
let currentReplyId = null;
|
|
1978
|
+
async function start() {
|
|
1979
|
+
handle = await _internals.connectS2s({
|
|
1980
|
+
apiKey: opts.apiKey,
|
|
1981
|
+
config: opts.s2sConfig,
|
|
1982
|
+
createWebSocket: createWs,
|
|
1983
|
+
logger: log,
|
|
1984
|
+
sid: opts.sid,
|
|
1985
|
+
callbacks: {
|
|
1986
|
+
onSessionReady: (providerSessionId) => opts.callbacks.onSessionReady?.(providerSessionId),
|
|
1987
|
+
onReplyStarted: (replyId) => {
|
|
1988
|
+
currentReplyId = replyId;
|
|
1989
|
+
opts.callbacks.onReplyStarted(replyId);
|
|
1990
|
+
},
|
|
1991
|
+
onReplyDone: () => {
|
|
1992
|
+
currentReplyId = null;
|
|
1993
|
+
opts.callbacks.onReplyDone();
|
|
1994
|
+
},
|
|
1995
|
+
onCancelled: () => {
|
|
1996
|
+
currentReplyId = null;
|
|
1997
|
+
opts.callbacks.onCancelled();
|
|
1998
|
+
},
|
|
1999
|
+
onAudio: (bytes) => opts.callbacks.onAudioChunk(bytes),
|
|
2000
|
+
onUserTranscript: opts.callbacks.onUserTranscript,
|
|
2001
|
+
onAgentTranscript: opts.callbacks.onAgentTranscript,
|
|
2002
|
+
onToolCall: opts.callbacks.onToolCall,
|
|
2003
|
+
onSpeechStarted: opts.callbacks.onSpeechStarted,
|
|
2004
|
+
onSpeechStopped: opts.callbacks.onSpeechStopped,
|
|
2005
|
+
onSessionExpired: () => {
|
|
2006
|
+
log.info("S2S session expired", { sid: opts.sid });
|
|
2007
|
+
handle?.close();
|
|
2008
|
+
},
|
|
2009
|
+
onError: (err) => opts.callbacks.onError("internal", err.message),
|
|
2010
|
+
onClose: (code, reason) => {
|
|
2011
|
+
if (currentReplyId !== null) {
|
|
2012
|
+
log.warn("S2S closed with active reply", {
|
|
2013
|
+
sid: opts.sid,
|
|
2014
|
+
agent: opts.agent,
|
|
2015
|
+
activeReplyId: currentReplyId,
|
|
2016
|
+
code,
|
|
2017
|
+
reason
|
|
2018
|
+
});
|
|
2019
|
+
opts.callbacks.onError("connection", `S2S closed mid-reply (code=${code})`);
|
|
2020
|
+
} else log.info("S2S closed", {
|
|
2021
|
+
code,
|
|
2022
|
+
reason
|
|
2023
|
+
});
|
|
2024
|
+
}
|
|
1550
2025
|
}
|
|
1551
|
-
}
|
|
1552
|
-
};
|
|
1553
|
-
}
|
|
1554
|
-
/**
|
|
1555
|
-
* Complete a tool call by truncating the result, emitting a `tool_call_done` event,
|
|
1556
|
-
* and accumulating the result in `ctx.reply.pendingTools` — but only if the reply that
|
|
1557
|
-
* initiated this call is still active.
|
|
1558
|
-
*/
|
|
1559
|
-
function finishToolCall(ctx, callId, result, replyId) {
|
|
1560
|
-
const truncatedResult = result.length > 4e3 ? result.slice(0, MAX_TOOL_RESULT_CHARS) : result;
|
|
1561
|
-
ctx.client.event({
|
|
1562
|
-
type: "tool_call_done",
|
|
1563
|
-
toolCallId: callId,
|
|
1564
|
-
result: truncatedResult
|
|
1565
|
-
});
|
|
1566
|
-
if (replyId !== null && replyId === ctx.reply.currentReplyId) {
|
|
1567
|
-
ctx.reply.pendingTools.push({
|
|
1568
|
-
callId,
|
|
1569
|
-
result
|
|
1570
|
-
});
|
|
1571
|
-
if (ctx.maxHistory > 0 && ctx.reply.pendingTools.length > ctx.maxHistory) ctx.reply.pendingTools.shift();
|
|
1572
|
-
}
|
|
1573
|
-
}
|
|
1574
|
-
async function handleToolCall(ctx, event) {
|
|
1575
|
-
const { toolCallId: callId, toolName: name, args: parsedArgs } = event;
|
|
1576
|
-
const replyId = ctx.reply.currentReplyId;
|
|
1577
|
-
ctx.client.event(event);
|
|
1578
|
-
const refused = ctx.consumeToolCallStep(name, replyId);
|
|
1579
|
-
if (refused !== null) {
|
|
1580
|
-
finishToolCall(ctx, callId, refused, replyId);
|
|
1581
|
-
return;
|
|
1582
|
-
}
|
|
1583
|
-
ctx.log.info("S2S tool call", {
|
|
1584
|
-
tool: name,
|
|
1585
|
-
callId,
|
|
1586
|
-
args: parsedArgs,
|
|
1587
|
-
agent: ctx.agent
|
|
1588
|
-
});
|
|
1589
|
-
let result;
|
|
1590
|
-
try {
|
|
1591
|
-
result = await ctx.executeTool(name, parsedArgs, ctx.id, ctx.conversationMessages);
|
|
1592
|
-
} catch (err) {
|
|
1593
|
-
const msg = errorMessage(err);
|
|
1594
|
-
ctx.log.error("Tool execution failed", {
|
|
1595
|
-
tool: name,
|
|
1596
|
-
error: errorDetail(err)
|
|
1597
2026
|
});
|
|
1598
|
-
|
|
1599
|
-
}
|
|
1600
|
-
ctx.log.info("S2S tool result", {
|
|
1601
|
-
tool: name,
|
|
1602
|
-
callId,
|
|
1603
|
-
resultLength: result.length
|
|
1604
|
-
});
|
|
1605
|
-
finishToolCall(ctx, callId, result, replyId);
|
|
1606
|
-
}
|
|
1607
|
-
function handleUserTranscript(ctx, text) {
|
|
1608
|
-
ctx.log.info("S2S user transcript", { text });
|
|
1609
|
-
ctx.client.event({
|
|
1610
|
-
type: "user_transcript",
|
|
1611
|
-
text
|
|
1612
|
-
});
|
|
1613
|
-
ctx.pushMessages({
|
|
1614
|
-
role: "user",
|
|
1615
|
-
content: text
|
|
1616
|
-
});
|
|
1617
|
-
}
|
|
1618
|
-
function handleAgentTranscript(ctx, text, interrupted) {
|
|
1619
|
-
ctx.client.event({
|
|
1620
|
-
type: "agent_transcript",
|
|
1621
|
-
text
|
|
1622
|
-
});
|
|
1623
|
-
if (!interrupted) ctx.pushMessages({
|
|
1624
|
-
role: "assistant",
|
|
1625
|
-
content: text
|
|
1626
|
-
});
|
|
1627
|
-
}
|
|
1628
|
-
function handleReplyCancelled(ctx) {
|
|
1629
|
-
ctx.log.info("S2S reply interrupted (barge-in)");
|
|
1630
|
-
ctx.cancelReply();
|
|
1631
|
-
ctx.client.event({ type: "cancelled" });
|
|
1632
|
-
}
|
|
1633
|
-
/**
|
|
1634
|
-
* Warn when the entry-to-emit time for a reply_done dispatch exceeds this.
|
|
1635
|
-
* Tool-less sessions should be sub-millisecond; sessions with pending tools
|
|
1636
|
-
* will legitimately spend time awaiting ctx.turnPromise. We log both (with
|
|
1637
|
-
* `hadTurnPromise`) so event-loop starvation is distinguishable from
|
|
1638
|
-
* genuine tool-call latency.
|
|
1639
|
-
*/
|
|
1640
|
-
const REPLY_DONE_SLOW_THRESHOLD_MS = 50;
|
|
1641
|
-
function handleReplyDone(ctx) {
|
|
1642
|
-
const startMs = Date.now();
|
|
1643
|
-
const doneReplyId = ctx.reply.currentReplyId;
|
|
1644
|
-
if (doneReplyId === null) {
|
|
1645
|
-
ctx.log.debug("Dropping duplicate reply.done (no active reply)");
|
|
1646
|
-
return;
|
|
2027
|
+
handle.updateSession(opts.sessionConfig);
|
|
1647
2028
|
}
|
|
1648
|
-
|
|
1649
|
-
|
|
1650
|
-
|
|
1651
|
-
ctx.reply.pendingTools = [];
|
|
1652
|
-
return;
|
|
1653
|
-
}
|
|
1654
|
-
if (ctx.reply.pendingTools.length > 0) {
|
|
1655
|
-
for (const tool of ctx.reply.pendingTools) ctx.s2s?.sendToolResult(tool.callId, tool.result);
|
|
1656
|
-
ctx.reply.pendingTools = [];
|
|
1657
|
-
} else {
|
|
1658
|
-
const stepsUsed = ctx.reply.toolCallCount;
|
|
1659
|
-
if (stepsUsed > 0) ctx.log.info("Turn complete", {
|
|
1660
|
-
steps: stepsUsed,
|
|
1661
|
-
agent: ctx.agent
|
|
1662
|
-
});
|
|
1663
|
-
ctx.client.playAudioDone();
|
|
1664
|
-
ctx.client.event({ type: "reply_done" });
|
|
1665
|
-
ctx.reply.currentReplyId = null;
|
|
1666
|
-
const durationMs = Date.now() - startMs;
|
|
1667
|
-
if (durationMs >= REPLY_DONE_SLOW_THRESHOLD_MS) ctx.log.warn("slow reply_done dispatch", {
|
|
1668
|
-
sid: ctx.id,
|
|
1669
|
-
agent: ctx.agent,
|
|
1670
|
-
durationMs,
|
|
1671
|
-
hadTurnPromise
|
|
1672
|
-
});
|
|
1673
|
-
}
|
|
1674
|
-
};
|
|
1675
|
-
if (hadTurnPromise) ctx.turnPromise?.then(sendPending);
|
|
1676
|
-
else sendPending();
|
|
1677
|
-
}
|
|
1678
|
-
function setupListeners(ctx, handle) {
|
|
1679
|
-
handle.on("ready", ({ sessionId }) => ctx.log.info("S2S session ready", { sessionId }));
|
|
1680
|
-
handle.on("replyStarted", ({ replyId }) => {
|
|
1681
|
-
ctx.beginReply(replyId);
|
|
1682
|
-
});
|
|
1683
|
-
handle.on("sessionExpired", () => {
|
|
1684
|
-
ctx.log.info("S2S session expired");
|
|
1685
|
-
handle.close();
|
|
1686
|
-
});
|
|
1687
|
-
handle.on("audio", ({ audio }) => ctx.client.playAudioChunk(audio));
|
|
1688
|
-
handle.on("error", (err) => {
|
|
1689
|
-
ctx.log.error("S2S error", { message: err.message });
|
|
1690
|
-
ctx.client.event({
|
|
1691
|
-
type: "error",
|
|
1692
|
-
code: "internal",
|
|
1693
|
-
message: err.message
|
|
1694
|
-
});
|
|
1695
|
-
handle.close();
|
|
1696
|
-
});
|
|
1697
|
-
handle.on("close", (code, reason) => {
|
|
1698
|
-
const activeReplyId = ctx.reply.currentReplyId;
|
|
1699
|
-
if (activeReplyId !== null) ctx.log.warn("S2S closed with active reply", {
|
|
1700
|
-
sid: ctx.id,
|
|
1701
|
-
agent: ctx.agent,
|
|
1702
|
-
activeReplyId,
|
|
1703
|
-
code,
|
|
1704
|
-
reason
|
|
1705
|
-
});
|
|
1706
|
-
else ctx.log.info("S2S closed", {
|
|
1707
|
-
code,
|
|
1708
|
-
reason
|
|
1709
|
-
});
|
|
1710
|
-
ctx.s2s = null;
|
|
1711
|
-
ctx.cancelReply();
|
|
1712
|
-
});
|
|
1713
|
-
handle.on("event", (event) => {
|
|
1714
|
-
switch (event.type) {
|
|
1715
|
-
case "user_transcript":
|
|
1716
|
-
handleUserTranscript(ctx, event.text);
|
|
1717
|
-
break;
|
|
1718
|
-
case "agent_transcript":
|
|
1719
|
-
handleAgentTranscript(ctx, event.text, event._interrupted ?? false);
|
|
1720
|
-
break;
|
|
1721
|
-
case "tool_call": {
|
|
1722
|
-
const p = handleToolCall(ctx, event).catch((err) => {
|
|
1723
|
-
ctx.log.error("Tool call handler failed", { err: errorMessage(err) });
|
|
1724
|
-
});
|
|
1725
|
-
ctx.chainTurn(p);
|
|
1726
|
-
break;
|
|
1727
|
-
}
|
|
1728
|
-
case "reply_done":
|
|
1729
|
-
handleReplyDone(ctx);
|
|
1730
|
-
break;
|
|
1731
|
-
case "cancelled":
|
|
1732
|
-
handleReplyCancelled(ctx);
|
|
1733
|
-
break;
|
|
1734
|
-
default: ctx.client.event(event);
|
|
1735
|
-
}
|
|
1736
|
-
});
|
|
1737
|
-
}
|
|
1738
|
-
function createS2sSession(opts) {
|
|
1739
|
-
const { id, agent, client, toolSchemas, apiKey, s2sConfig, executeTool, createWebSocket = defaultCreateS2sWebSocket, logger: log = consoleLogger } = opts;
|
|
1740
|
-
const agentConfig = opts.skipGreeting ? {
|
|
1741
|
-
...opts.agentConfig,
|
|
1742
|
-
greeting: ""
|
|
1743
|
-
} : opts.agentConfig;
|
|
1744
|
-
const systemPrompt = buildSystemPrompt(agentConfig, {
|
|
1745
|
-
hasTools: toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0,
|
|
1746
|
-
voice: true,
|
|
1747
|
-
toolGuidance: opts.toolGuidance
|
|
1748
|
-
});
|
|
1749
|
-
const s2sTools = toolSchemas.map((ts) => ({
|
|
1750
|
-
type: "function",
|
|
1751
|
-
name: ts.name,
|
|
1752
|
-
description: ts.description,
|
|
1753
|
-
parameters: ts.parameters
|
|
1754
|
-
}));
|
|
1755
|
-
const sessionAbort = new AbortController();
|
|
1756
|
-
const ctx = buildCtx({
|
|
1757
|
-
id,
|
|
1758
|
-
agent,
|
|
1759
|
-
client,
|
|
1760
|
-
agentConfig,
|
|
1761
|
-
executeTool,
|
|
1762
|
-
log,
|
|
1763
|
-
maxHistory: opts.maxHistory
|
|
1764
|
-
});
|
|
1765
|
-
const rawTimeout = agentConfig.idleTimeoutMs ?? 3e5;
|
|
1766
|
-
const idle = createIdleTimer({
|
|
1767
|
-
timeoutMs: rawTimeout === 0 || !Number.isFinite(rawTimeout) ? 0 : rawTimeout,
|
|
1768
|
-
agent,
|
|
1769
|
-
log,
|
|
1770
|
-
client,
|
|
1771
|
-
ctx
|
|
1772
|
-
});
|
|
1773
|
-
let connectGeneration = 0;
|
|
1774
|
-
const sessionUpdatePayload = {
|
|
1775
|
-
systemPrompt,
|
|
1776
|
-
tools: s2sTools,
|
|
1777
|
-
...agentConfig.greeting ? { greeting: agentConfig.greeting } : {}
|
|
1778
|
-
};
|
|
1779
|
-
async function connectAndSetup() {
|
|
1780
|
-
const generation = ++connectGeneration;
|
|
1781
|
-
try {
|
|
1782
|
-
const handle = await _internals.connectS2s({
|
|
1783
|
-
apiKey,
|
|
1784
|
-
config: s2sConfig,
|
|
1785
|
-
createWebSocket,
|
|
1786
|
-
logger: log,
|
|
1787
|
-
sid: id
|
|
1788
|
-
});
|
|
1789
|
-
if (sessionAbort.signal.aborted || generation !== connectGeneration) {
|
|
1790
|
-
handle.close();
|
|
1791
|
-
return;
|
|
1792
|
-
}
|
|
1793
|
-
setupListeners(ctx, handle);
|
|
1794
|
-
handle.updateSession(sessionUpdatePayload);
|
|
1795
|
-
ctx.s2s = handle;
|
|
1796
|
-
idle.reset();
|
|
1797
|
-
} catch (err) {
|
|
1798
|
-
const msg = errorMessage(err);
|
|
1799
|
-
log.error("S2S connect failed", { error: errorDetail(err) });
|
|
1800
|
-
client.event({
|
|
1801
|
-
type: "error",
|
|
1802
|
-
code: "internal",
|
|
1803
|
-
message: msg
|
|
1804
|
-
});
|
|
1805
|
-
}
|
|
2029
|
+
async function stop() {
|
|
2030
|
+
handle?.close();
|
|
2031
|
+
handle = null;
|
|
1806
2032
|
}
|
|
1807
2033
|
return {
|
|
1808
|
-
|
|
1809
|
-
|
|
1810
|
-
|
|
1811
|
-
|
|
1812
|
-
if (sessionAbort.signal.aborted) return;
|
|
1813
|
-
sessionAbort.abort();
|
|
1814
|
-
idle.clear();
|
|
1815
|
-
if (ctx.turnPromise !== null) await ctx.turnPromise;
|
|
1816
|
-
ctx.s2s?.close();
|
|
1817
|
-
},
|
|
1818
|
-
onAudio(data) {
|
|
1819
|
-
idle.reset();
|
|
1820
|
-
ctx.s2s?.sendAudio(data);
|
|
1821
|
-
},
|
|
1822
|
-
onAudioReady() {},
|
|
1823
|
-
onCancel() {
|
|
1824
|
-
client.event({ type: "cancelled" });
|
|
1825
|
-
},
|
|
1826
|
-
onReset() {
|
|
1827
|
-
ctx.cancelReply();
|
|
1828
|
-
ctx.conversationMessages = [];
|
|
1829
|
-
ctx.reply.toolCallCount = 0;
|
|
1830
|
-
ctx.turnPromise = null;
|
|
1831
|
-
idle.clear();
|
|
1832
|
-
ctx.s2s?.close();
|
|
1833
|
-
client.event({ type: "reset" });
|
|
1834
|
-
connectAndSetup().catch((err) => log.error("S2S reset reconnect failed", { error: errorMessage(err) }));
|
|
2034
|
+
start,
|
|
2035
|
+
stop,
|
|
2036
|
+
sendUserAudio(bytes) {
|
|
2037
|
+
handle?.sendAudio(bytes);
|
|
1835
2038
|
},
|
|
1836
|
-
|
|
1837
|
-
|
|
1838
|
-
role: m.role,
|
|
1839
|
-
content: m.content
|
|
1840
|
-
})));
|
|
2039
|
+
sendToolResult(callId, result) {
|
|
2040
|
+
handle?.sendToolResult(callId, result);
|
|
1841
2041
|
},
|
|
1842
|
-
|
|
1843
|
-
|
|
1844
|
-
}
|
|
1845
|
-
};
|
|
1846
|
-
}
|
|
1847
|
-
//#endregion
|
|
1848
|
-
//#region host/tool-executor.ts
|
|
1849
|
-
/**
|
|
1850
|
-
* Tool execution — validates arguments and invokes tool handlers.
|
|
1851
|
-
*
|
|
1852
|
-
* {@link executeToolCall} is the single entry point used by both the
|
|
1853
|
-
* direct (self-hosted) runtime and the platform sandbox sidecar.
|
|
1854
|
-
*/
|
|
1855
|
-
const yieldTick = () => new Promise((r) => setTimeout(r, 0));
|
|
1856
|
-
function buildToolContext(opts) {
|
|
1857
|
-
const { env, state, kv, messages, sessionId } = opts;
|
|
1858
|
-
return {
|
|
1859
|
-
env,
|
|
1860
|
-
state: state ?? {},
|
|
1861
|
-
get kv() {
|
|
1862
|
-
if (!kv) throw new Error("KV not available");
|
|
1863
|
-
return kv;
|
|
2042
|
+
cancelReply() {
|
|
2043
|
+
currentReplyId = null;
|
|
1864
2044
|
},
|
|
1865
|
-
|
|
1866
|
-
|
|
1867
|
-
|
|
1868
|
-
|
|
2045
|
+
updateSession(config) {
|
|
2046
|
+
handle?.updateSession({
|
|
2047
|
+
systemPrompt: config.systemPrompt,
|
|
2048
|
+
tools: config.tools ?? [],
|
|
2049
|
+
...config.greeting !== void 0 ? { greeting: config.greeting } : {}
|
|
2050
|
+
});
|
|
1869
2051
|
}
|
|
1870
2052
|
};
|
|
1871
2053
|
}
|
|
1872
|
-
async function executeToolCall(name, args, options) {
|
|
1873
|
-
const { tool } = options;
|
|
1874
|
-
const parsed = (tool.parameters ?? EMPTY_PARAMS).safeParse(args);
|
|
1875
|
-
if (!parsed.success) return toolError(`Invalid arguments for tool "${name}": ${(parsed.error?.issues ?? []).map((i) => `${i.path.map(String).join(".")}: ${i.message}`).join(", ")}`);
|
|
1876
|
-
try {
|
|
1877
|
-
const ctx = buildToolContext(options);
|
|
1878
|
-
await yieldTick();
|
|
1879
|
-
const result = await pTimeout(Promise.resolve(tool.execute(parsed.data, ctx)), {
|
|
1880
|
-
milliseconds: TOOL_EXECUTION_TIMEOUT_MS,
|
|
1881
|
-
message: `Tool "${name}" timed out after ${TOOL_EXECUTION_TIMEOUT_MS}ms`
|
|
1882
|
-
});
|
|
1883
|
-
await yieldTick();
|
|
1884
|
-
if (result == null) return "null";
|
|
1885
|
-
return typeof result === "string" ? result : JSON.stringify(result);
|
|
1886
|
-
} catch (err) {
|
|
1887
|
-
const log = options.logger;
|
|
1888
|
-
if (log) log.warn("Tool execution failed", {
|
|
1889
|
-
tool: name,
|
|
1890
|
-
error: errorDetail(err)
|
|
1891
|
-
});
|
|
1892
|
-
else console.warn(`[tool-executor] Tool execution failed: ${name}`, err);
|
|
1893
|
-
return toolError(errorMessage(err));
|
|
1894
|
-
}
|
|
1895
|
-
}
|
|
1896
2054
|
//#endregion
|
|
1897
2055
|
//#region host/unstorage-kv.ts
|
|
1898
2056
|
/**
|
|
@@ -1944,20 +2102,20 @@ function createUnstorageKv(options) {
|
|
|
1944
2102
|
*
|
|
1945
2103
|
* Audio validation is handled at the host transport layer (see server.ts).
|
|
1946
2104
|
*/
|
|
2105
|
+
const AUDIO_DONE_FRAME = JSON.stringify({ type: "audio_done" });
|
|
1947
2106
|
/**
|
|
1948
2107
|
* Creates a {@link ClientSink} backed by a plain WebSocket.
|
|
1949
2108
|
*
|
|
1950
|
-
*
|
|
1951
|
-
* binary frames
|
|
2109
|
+
* Session events are sent as JSON text frames; audio chunks are sent as raw
|
|
2110
|
+
* PCM16 binary frames.
|
|
1952
2111
|
*/
|
|
1953
2112
|
function createClientSink(ws, log) {
|
|
1954
|
-
/** Send data over ws, silently dropping if the socket is not open. */
|
|
1955
2113
|
function safeSend(data) {
|
|
1956
2114
|
try {
|
|
1957
2115
|
if (ws.readyState !== 1) return;
|
|
1958
2116
|
ws.send(data);
|
|
1959
2117
|
} catch (err) {
|
|
1960
|
-
log.debug?.("safeSend: socket closed between readyState check and send", { error:
|
|
2118
|
+
log.debug?.("safeSend: socket closed between readyState check and send", { error: err instanceof Error ? err.message : String(err) });
|
|
1961
2119
|
}
|
|
1962
2120
|
}
|
|
1963
2121
|
return {
|
|
@@ -1971,7 +2129,7 @@ function createClientSink(ws, log) {
|
|
|
1971
2129
|
safeSend(chunk);
|
|
1972
2130
|
},
|
|
1973
2131
|
playAudioDone() {
|
|
1974
|
-
safeSend(
|
|
2132
|
+
safeSend(AUDIO_DONE_FRAME);
|
|
1975
2133
|
}
|
|
1976
2134
|
};
|
|
1977
2135
|
}
|
|
@@ -1980,35 +2138,32 @@ function handleBinaryAudio(data, session) {
|
|
|
1980
2138
|
session.onAudio(data);
|
|
1981
2139
|
return true;
|
|
1982
2140
|
}
|
|
1983
|
-
if (data instanceof ArrayBuffer) {
|
|
1984
|
-
session.onAudio(new Uint8Array(data));
|
|
1985
|
-
return true;
|
|
1986
|
-
}
|
|
1987
2141
|
return false;
|
|
1988
2142
|
}
|
|
1989
|
-
function handleTextMessage(data, session, log,
|
|
1990
|
-
if (typeof data !== "string")
|
|
1991
|
-
|
|
2143
|
+
function handleTextMessage(data, session, log, sid) {
|
|
2144
|
+
if (typeof data !== "string") {
|
|
2145
|
+
log.warn("ws: non-string, non-binary frame received; dropping", { sid });
|
|
2146
|
+
return;
|
|
2147
|
+
}
|
|
2148
|
+
let parsed;
|
|
1992
2149
|
try {
|
|
1993
|
-
|
|
2150
|
+
parsed = JSON.parse(data);
|
|
1994
2151
|
} catch {
|
|
1995
|
-
log.warn("
|
|
1996
|
-
|
|
1997
|
-
|
|
2152
|
+
log.warn("ws: invalid JSON; dropping", {
|
|
2153
|
+
sid,
|
|
2154
|
+
data: data.slice(0, 200)
|
|
1998
2155
|
});
|
|
1999
2156
|
return;
|
|
2000
2157
|
}
|
|
2001
|
-
const
|
|
2002
|
-
if (!
|
|
2003
|
-
if (
|
|
2004
|
-
...ctx,
|
|
2158
|
+
const result = lenientParse(ClientMessageSchema, parsed);
|
|
2159
|
+
if (!result.ok) {
|
|
2160
|
+
if (result.malformed) log.warn("ws: malformed client message", {
|
|
2005
2161
|
sid,
|
|
2006
|
-
error:
|
|
2162
|
+
error: result.error
|
|
2007
2163
|
});
|
|
2008
2164
|
return;
|
|
2009
2165
|
}
|
|
2010
|
-
|
|
2011
|
-
switch (msg.type) {
|
|
2166
|
+
switch (result.data.type) {
|
|
2012
2167
|
case "audio_ready":
|
|
2013
2168
|
session.onAudioReady();
|
|
2014
2169
|
break;
|
|
@@ -2019,19 +2174,19 @@ function handleTextMessage(data, session, log, ctx, sid) {
|
|
|
2019
2174
|
session.onReset();
|
|
2020
2175
|
break;
|
|
2021
2176
|
case "history":
|
|
2022
|
-
session.onHistory(
|
|
2177
|
+
session.onHistory(result.data.messages);
|
|
2023
2178
|
break;
|
|
2024
2179
|
default: break;
|
|
2025
2180
|
}
|
|
2026
2181
|
}
|
|
2027
2182
|
/**
|
|
2028
|
-
* Attaches session lifecycle handlers to a native WebSocket using
|
|
2029
|
-
*
|
|
2183
|
+
* Attaches session lifecycle handlers to a native WebSocket using JSON text
|
|
2184
|
+
* frames for control messages and raw PCM16 binary frames for audio.
|
|
2030
2185
|
*
|
|
2031
2186
|
* Connection flow:
|
|
2032
|
-
* 1. WebSocket opens → server sends
|
|
2033
|
-
* 2. Client sets up audio → sends
|
|
2034
|
-
* 3. If reconnecting → client sends
|
|
2187
|
+
* 1. WebSocket opens → server sends JSON CONFIG frame with sampleRate, ttsSampleRate, sessionId
|
|
2188
|
+
* 2. Client sets up audio → sends JSON AUDIO_READY frame
|
|
2189
|
+
* 3. If reconnecting → client sends JSON HISTORY frame with prior messages
|
|
2035
2190
|
*/
|
|
2036
2191
|
function wireSessionSocket(ws, opts) {
|
|
2037
2192
|
const { sessions, logger: log = consoleLogger } = opts;
|
|
@@ -2041,7 +2196,7 @@ function wireSessionSocket(ws, opts) {
|
|
|
2041
2196
|
let session = null;
|
|
2042
2197
|
/** Set to true once session.start() resolves. Messages arriving before
|
|
2043
2198
|
* this flag is set are buffered and replayed once the session is ready,
|
|
2044
|
-
* preventing audio/
|
|
2199
|
+
* preventing audio/frames from being dispatched to a half-initialized session. */
|
|
2045
2200
|
let sessionReady = false;
|
|
2046
2201
|
let messageBuffer = [];
|
|
2047
2202
|
function drainBuffer() {
|
|
@@ -2049,9 +2204,8 @@ function wireSessionSocket(ws, opts) {
|
|
|
2049
2204
|
const buf = messageBuffer;
|
|
2050
2205
|
messageBuffer = null;
|
|
2051
2206
|
for (const event of buf) {
|
|
2052
|
-
|
|
2053
|
-
|
|
2054
|
-
handleTextMessage(data, session, log, ctx, sid);
|
|
2207
|
+
if (handleBinaryAudio(event.data, session)) continue;
|
|
2208
|
+
handleTextMessage(event.data, session, log, sid);
|
|
2055
2209
|
}
|
|
2056
2210
|
}
|
|
2057
2211
|
function onOpen() {
|
|
@@ -2066,7 +2220,9 @@ function wireSessionSocket(ws, opts) {
|
|
|
2066
2220
|
opts.onSinkCreated?.(sessionId, client);
|
|
2067
2221
|
ws.send(JSON.stringify({
|
|
2068
2222
|
type: "config",
|
|
2069
|
-
|
|
2223
|
+
audioFormat: opts.readyConfig.audioFormat,
|
|
2224
|
+
sampleRate: opts.readyConfig.sampleRate,
|
|
2225
|
+
ttsSampleRate: opts.readyConfig.ttsSampleRate,
|
|
2070
2226
|
sessionId
|
|
2071
2227
|
}));
|
|
2072
2228
|
const timeoutMs = opts.sessionStartTimeoutMs ?? 1e4;
|
|
@@ -2099,9 +2255,8 @@ function wireSessionSocket(ws, opts) {
|
|
|
2099
2255
|
if (messageBuffer && messageBuffer.length < 100) messageBuffer.push(event);
|
|
2100
2256
|
return;
|
|
2101
2257
|
}
|
|
2102
|
-
|
|
2103
|
-
|
|
2104
|
-
handleTextMessage(data, session, log, ctx, sid);
|
|
2258
|
+
if (handleBinaryAudio(event.data, session)) return;
|
|
2259
|
+
handleTextMessage(event.data, session, log, sid);
|
|
2105
2260
|
});
|
|
2106
2261
|
ws.addEventListener("close", () => {
|
|
2107
2262
|
log.info("Session disconnected", {
|
|
@@ -2132,6 +2287,30 @@ function wireSessionSocket(ws, opts) {
|
|
|
2132
2287
|
//#endregion
|
|
2133
2288
|
//#region host/runtime.ts
|
|
2134
2289
|
/**
|
|
2290
|
+
* Resolve the API key env-var for the configured STT provider.
|
|
2291
|
+
*
|
|
2292
|
+
* Each STT provider uses its own env var (e.g. `ASSEMBLYAI_API_KEY`,
|
|
2293
|
+
* `DEEPGRAM_API_KEY`). We read the kind from the descriptor if it is one;
|
|
2294
|
+
* pre-resolved openers have no kind field so we fall back to AssemblyAI for
|
|
2295
|
+
* backward compatibility (openers supply their own key at open-time anyway).
|
|
2296
|
+
*/
|
|
2297
|
+
function resolveSttApiKey(stt, env) {
|
|
2298
|
+
if ((stt != null && "kind" in stt && typeof stt.kind === "string" ? stt.kind : void 0) === "deepgram") return resolveApiKey("DEEPGRAM_API_KEY", env);
|
|
2299
|
+
return resolveApiKey("ASSEMBLYAI_API_KEY", env);
|
|
2300
|
+
}
|
|
2301
|
+
/**
|
|
2302
|
+
* Resolve the API key env-var for the configured TTS provider.
|
|
2303
|
+
*
|
|
2304
|
+
* Each TTS provider uses its own env var (e.g. `CARTESIA_API_KEY`,
|
|
2305
|
+
* `RIME_API_KEY`). We read the kind from the descriptor if it is one;
|
|
2306
|
+
* pre-resolved openers have no kind field so we fall back to Cartesia for
|
|
2307
|
+
* backward compatibility (openers supply their own key at open-time anyway).
|
|
2308
|
+
*/
|
|
2309
|
+
function resolveTtsApiKey(tts, env) {
|
|
2310
|
+
if ((tts != null && "kind" in tts && typeof tts.kind === "string" ? tts.kind : void 0) === "rime") return resolveApiKey("RIME_API_KEY", env);
|
|
2311
|
+
return resolveApiKey("CARTESIA_API_KEY", env);
|
|
2312
|
+
}
|
|
2313
|
+
/**
|
|
2135
2314
|
* Distinguish a descriptor (`{ kind, options }`) from an already-resolved
|
|
2136
2315
|
* opener / `LanguageModel`. The production path always passes descriptors;
|
|
2137
2316
|
* openers are a test escape hatch (fakes in `_pipeline-test-fakes.ts`).
|
|
@@ -2236,40 +2415,86 @@ function createRuntime(opts) {
|
|
|
2236
2415
|
} : null;
|
|
2237
2416
|
function createSession(sessionOpts) {
|
|
2238
2417
|
sinkMap.set(sessionOpts.id, sessionOpts.client);
|
|
2239
|
-
|
|
2240
|
-
|
|
2418
|
+
const isPipeline = Boolean(pipelineProviders);
|
|
2419
|
+
const systemPrompt = buildSystemPrompt(agentConfig, {
|
|
2420
|
+
hasTools: toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0,
|
|
2421
|
+
voice: true,
|
|
2422
|
+
toolGuidance
|
|
2423
|
+
});
|
|
2424
|
+
let core = null;
|
|
2425
|
+
function bindCore() {
|
|
2426
|
+
if (!core) throw new Error("SessionCore not yet created");
|
|
2427
|
+
return core;
|
|
2428
|
+
}
|
|
2429
|
+
const callbacks = {
|
|
2430
|
+
onReplyStarted: (replyId) => bindCore().onReplyStarted(replyId),
|
|
2431
|
+
onReplyDone: () => bindCore().onReplyDone(),
|
|
2432
|
+
onCancelled: () => bindCore().onCancelled(),
|
|
2433
|
+
onAudioChunk: (bytes) => bindCore().onAudioChunk(bytes),
|
|
2434
|
+
onAudioDone: () => bindCore().onAudioDone(),
|
|
2435
|
+
onUserTranscript: (text) => bindCore().onUserTranscript(text),
|
|
2436
|
+
onAgentTranscript: (text, interrupted) => bindCore().onAgentTranscript(text, interrupted),
|
|
2437
|
+
onToolCall: isPipeline ? (id, name, args) => sessionOpts.client.event({
|
|
2438
|
+
type: "tool_call",
|
|
2439
|
+
toolCallId: id,
|
|
2440
|
+
toolName: name,
|
|
2441
|
+
args
|
|
2442
|
+
}) : (id, name, args) => bindCore().onToolCall(id, name, args),
|
|
2443
|
+
onError: (code, message) => bindCore().onError(code, message),
|
|
2444
|
+
onSpeechStarted: () => bindCore().onSpeechStarted(),
|
|
2445
|
+
onSpeechStopped: () => bindCore().onSpeechStopped()
|
|
2446
|
+
};
|
|
2447
|
+
let transport;
|
|
2448
|
+
if (pipelineProviders) transport = createPipelineTransport({
|
|
2449
|
+
sid: sessionOpts.id,
|
|
2241
2450
|
agent: sessionOpts.agent,
|
|
2242
|
-
client: sessionOpts.client,
|
|
2243
|
-
agentConfig,
|
|
2244
|
-
toolSchemas,
|
|
2245
|
-
toolGuidance,
|
|
2246
|
-
executeTool,
|
|
2247
2451
|
stt: pipelineProviders.stt,
|
|
2248
2452
|
llm: pipelineProviders.llm,
|
|
2249
2453
|
tts: pipelineProviders.tts,
|
|
2250
|
-
|
|
2251
|
-
|
|
2454
|
+
callbacks,
|
|
2455
|
+
sessionConfig: {
|
|
2456
|
+
systemPrompt,
|
|
2457
|
+
greeting: agentConfig.greeting,
|
|
2458
|
+
tools: toolSchemas
|
|
2459
|
+
},
|
|
2460
|
+
toolSchemas,
|
|
2461
|
+
executeTool,
|
|
2462
|
+
providerKeys: {
|
|
2463
|
+
stt: resolveSttApiKey(opts.stt, env),
|
|
2464
|
+
tts: resolveTtsApiKey(opts.tts, env)
|
|
2465
|
+
},
|
|
2252
2466
|
sttSampleRate: s2sConfig.inputSampleRate,
|
|
2253
2467
|
ttsSampleRate: s2sConfig.outputSampleRate,
|
|
2468
|
+
maxSteps: agentConfig.maxSteps,
|
|
2469
|
+
toolChoice: agentConfig.toolChoice,
|
|
2254
2470
|
skipGreeting: sessionOpts.skipGreeting ?? false,
|
|
2255
2471
|
logger
|
|
2256
2472
|
});
|
|
2257
|
-
|
|
2258
|
-
|
|
2473
|
+
else transport = createS2sTransport({
|
|
2474
|
+
apiKey: env.ASSEMBLYAI_API_KEY ?? "",
|
|
2475
|
+
s2sConfig,
|
|
2476
|
+
sessionConfig: {
|
|
2477
|
+
systemPrompt,
|
|
2478
|
+
tools: toolSchemas,
|
|
2479
|
+
...agentConfig.greeting !== void 0 ? { greeting: agentConfig.greeting } : {}
|
|
2480
|
+
},
|
|
2481
|
+
toolSchemas,
|
|
2482
|
+
callbacks,
|
|
2483
|
+
sid: sessionOpts.id,
|
|
2484
|
+
agent: sessionOpts.agent,
|
|
2485
|
+
...createWebSocket ? { createWebSocket } : {},
|
|
2486
|
+
logger
|
|
2487
|
+
});
|
|
2488
|
+
core = createSessionCore({
|
|
2259
2489
|
id: sessionOpts.id,
|
|
2260
2490
|
agent: sessionOpts.agent,
|
|
2261
2491
|
client: sessionOpts.client,
|
|
2262
2492
|
agentConfig,
|
|
2263
|
-
toolSchemas,
|
|
2264
|
-
toolGuidance,
|
|
2265
|
-
apiKey,
|
|
2266
|
-
s2sConfig,
|
|
2267
2493
|
executeTool,
|
|
2268
|
-
|
|
2269
|
-
|
|
2270
|
-
logger,
|
|
2271
|
-
...sessionOpts.resumeFrom ? { resumeFrom: sessionOpts.resumeFrom } : {}
|
|
2494
|
+
transport,
|
|
2495
|
+
logger
|
|
2272
2496
|
});
|
|
2497
|
+
return core;
|
|
2273
2498
|
}
|
|
2274
2499
|
function startSession(ws, startOpts) {
|
|
2275
2500
|
const resumeFrom = startOpts?.resumeFrom;
|
|
@@ -2454,4 +2679,4 @@ function createServer(options) {
|
|
|
2454
2679
|
};
|
|
2455
2680
|
}
|
|
2456
2681
|
//#endregion
|
|
2457
|
-
export { DEFAULT_S2S_CONFIG,
|
|
2682
|
+
export { DEFAULT_S2S_CONFIG, _internals, consoleLogger, createPipelineTransport, createRuntime, createS2sTransport, createServer, createSessionCore, createUnstorageKv, executeInIsolate, executeToolCall, jsonLogger, resolveAllBuiltins, wireSessionSocket };
|