@alexkroman1/aai 1.4.5 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +9 -9
- package/CHANGELOG.md +13 -0
- package/dist/assemblyai-C969QGi4.js +35 -0
- package/dist/cartesia-BfQPOQ7Y.js +37 -0
- package/dist/host/_pipeline-test-fakes.d.ts +3 -1
- package/dist/host/providers/stt/deepgram.d.ts +28 -0
- package/dist/host/providers/tts/cartesia.d.ts +1 -1
- package/dist/host/providers/tts/rime.d.ts +44 -0
- package/dist/host/runtime-barrel.d.ts +4 -2
- package/dist/host/runtime-barrel.js +1432 -1208
- package/dist/host/runtime.d.ts +2 -2
- package/dist/host/s2s.d.ts +16 -16
- package/dist/host/session-core.d.ts +37 -0
- package/dist/host/transports/pipeline-transport.d.ts +48 -0
- package/dist/host/transports/s2s-transport.d.ts +19 -0
- package/dist/host/transports/types.d.ts +45 -0
- package/dist/host/ws-handler.d.ts +14 -10
- package/dist/sdk/protocol.d.ts +6 -5
- package/dist/sdk/providers/llm-barrel.js +1 -1
- package/dist/sdk/providers/stt/deepgram.d.ts +35 -0
- package/dist/sdk/providers/stt-barrel.d.ts +1 -0
- package/dist/sdk/providers/stt-barrel.js +2 -2
- package/dist/sdk/providers/tts/cartesia.d.ts +12 -4
- package/dist/sdk/providers/tts/rime.d.ts +42 -0
- package/dist/sdk/providers/tts-barrel.d.ts +1 -0
- package/dist/sdk/providers/tts-barrel.js +2 -2
- package/host/_pipeline-test-fakes.ts +6 -3
- package/host/_test-utils.ts +209 -128
- package/host/cleanup.test.ts +25 -298
- package/host/integration/pipeline-reference.integration.test.ts +30 -35
- package/host/providers/resolve.ts +10 -2
- package/host/providers/stt/deepgram.test.ts +229 -0
- package/host/providers/stt/deepgram.ts +172 -0
- package/host/providers/tts/cartesia.ts +7 -3
- package/host/providers/tts/rime.test.ts +251 -0
- package/host/providers/tts/rime.ts +322 -0
- package/host/runtime-barrel.ts +4 -2
- package/host/runtime.test.ts +13 -46
- package/host/runtime.ts +131 -23
- package/host/s2s.test.ts +122 -131
- package/host/s2s.ts +44 -52
- package/host/session-core.test.ts +257 -0
- package/host/session-core.ts +262 -0
- package/host/transports/pipeline-transport.test.ts +651 -0
- package/host/transports/pipeline-transport.ts +532 -0
- package/host/{fixture-replay.test.ts → transports/s2s-transport-fixtures.test.ts} +76 -106
- package/host/transports/s2s-transport.test.ts +56 -0
- package/host/transports/s2s-transport.ts +116 -0
- package/host/transports/types.test.ts +22 -0
- package/host/transports/types.ts +51 -0
- package/host/ws-handler.test.ts +324 -242
- package/host/ws-handler.ts +56 -59
- package/package.json +2 -1
- package/sdk/__snapshots__/exports.test.ts.snap +3 -3
- package/sdk/protocol-compat.test.ts +8 -0
- package/sdk/protocol.ts +6 -5
- package/sdk/providers/stt/deepgram.ts +43 -0
- package/sdk/providers/stt-barrel.ts +2 -0
- package/sdk/providers/tts/cartesia.ts +15 -5
- package/sdk/providers/tts/rime.ts +52 -0
- package/sdk/providers/tts-barrel.ts +2 -0
- package/dist/assemblyai-Cxg9eobY.js +0 -18
- package/dist/cartesia-DwDk2tEu.js +0 -10
- package/dist/host/pipeline-session-ctx.d.ts +0 -24
- package/dist/host/pipeline-session.d.ts +0 -52
- package/dist/host/session-ctx.d.ts +0 -73
- package/dist/host/session.d.ts +0 -62
- package/host/pipeline-session-ctx.test.ts +0 -31
- package/host/pipeline-session-ctx.ts +0 -36
- package/host/pipeline-session.test.ts +0 -672
- package/host/pipeline-session.ts +0 -533
- package/host/s2s-fixtures.test.ts +0 -237
- package/host/session-ctx.test.ts +0 -387
- package/host/session-ctx.ts +0 -134
- package/host/session-fixture-replay.test.ts +0 -128
- package/host/session.test.ts +0 -634
- package/host/session.ts +0 -412
- /package/dist/{anthropic-BrUCPKUc.js → anthropic-CcLZygAr.js} +0 -0
|
@@ -1,23 +1,24 @@
|
|
|
1
1
|
import { r as DEFAULT_SYSTEM_PROMPT } from "../types-KUgezM6u.js";
|
|
2
|
-
import { _ as TOOL_EXECUTION_TIMEOUT_MS, a as DEFAULT_SHUTDOWN_TIMEOUT_MS, c as FETCH_TIMEOUT_MS, d as MAX_PAGE_CHARS,
|
|
2
|
+
import { _ as TOOL_EXECUTION_TIMEOUT_MS, a as DEFAULT_SHUTDOWN_TIMEOUT_MS, c as FETCH_TIMEOUT_MS, d as MAX_PAGE_CHARS, g as RUN_CODE_TIMEOUT_MS, h as PIPELINE_FLUSH_TIMEOUT_MS, l as MAX_HTML_BYTES, m as MAX_WS_PAYLOAD_BYTES, o as DEFAULT_STT_SAMPLE_RATE, p as MAX_VALUE_SIZE, s as DEFAULT_TTS_SAMPLE_RATE, t as AGENT_CSP } from "../constants-C2nirZUI.js";
|
|
3
3
|
import { i as toolError, n as errorDetail, r as errorMessage, t as parseWsUpgradeParams } from "../ws-upgrade-BeOQ7fXL.js";
|
|
4
4
|
import { ClientMessageSchema, buildReadyConfig, lenientParse } from "../sdk/protocol.js";
|
|
5
5
|
import { a as toAgentConfig, c as makeSttError, i as agentToolsToSchemas, l as makeTtsError, n as EMPTY_PARAMS, s as assertProviderTriple } from "../_internal-types-3p3OJZPb.js";
|
|
6
|
-
import { t as
|
|
7
|
-
import {
|
|
8
|
-
import { t as
|
|
6
|
+
import { r as DEEPGRAM_KIND, t as ASSEMBLYAI_KIND } from "../assemblyai-C969QGi4.js";
|
|
7
|
+
import { a as RIME_KIND, n as CARTESIA_KIND } from "../cartesia-BfQPOQ7Y.js";
|
|
8
|
+
import { t as ANTHROPIC_KIND } from "../anthropic-CcLZygAr.js";
|
|
9
9
|
import { z } from "zod";
|
|
10
10
|
import { convert } from "html-to-text";
|
|
11
11
|
import vm from "node:vm";
|
|
12
12
|
import pTimeout from "p-timeout";
|
|
13
13
|
import { createStorage, prefixStorage } from "unstorage";
|
|
14
|
-
import { jsonSchema, stepCountIs, streamText, tool } from "ai";
|
|
15
14
|
import { createAnthropic } from "@ai-sdk/anthropic";
|
|
16
15
|
import { AssemblyAI } from "assemblyai";
|
|
17
16
|
import { createNanoEvents } from "nanoevents";
|
|
17
|
+
import { DeepgramClient } from "@deepgram/sdk";
|
|
18
18
|
import { randomUUID } from "node:crypto";
|
|
19
19
|
import { Cartesia } from "@cartesia/cartesia-js";
|
|
20
20
|
import WsWebSocket, { WebSocketServer } from "ws";
|
|
21
|
+
import { jsonSchema, stepCountIs, streamText, tool } from "ai";
|
|
21
22
|
import fs from "node:fs";
|
|
22
23
|
import http from "node:http";
|
|
23
24
|
import path from "node:path";
|
|
@@ -378,712 +379,236 @@ function buildSystemPrompt(config, opts) {
|
|
|
378
379
|
return DEFAULT_SYSTEM_PROMPT + `\n\nToday's date is ${getFormattedDate()}.` + agentInstructions + toolPreamble + guidance + (opts.voice ? VOICE_RULES : "");
|
|
379
380
|
}
|
|
380
381
|
//#endregion
|
|
381
|
-
//#region host/
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
382
|
+
//#region host/providers/stt/assemblyai.ts
|
|
383
|
+
/**
|
|
384
|
+
* AssemblyAI Universal-Streaming STT opener (host-only).
|
|
385
|
+
*
|
|
386
|
+
* The user-facing descriptor factory (`assemblyAI(...)`) lives in
|
|
387
|
+
* `sdk/providers/stt/assemblyai.ts`. This module is the host-side
|
|
388
|
+
* counterpart: it takes the descriptor options + an API key and
|
|
389
|
+
* returns an {@link SttOpener} that the pipeline session drives.
|
|
390
|
+
*
|
|
391
|
+
* Default model: `"u3pro-rt"` (Universal-3 Pro Real-Time). The adapter
|
|
392
|
+
* maps that to the SDK's `"u3-rt-pro"` `speechModel` value; any other
|
|
393
|
+
* string is forwarded verbatim.
|
|
394
|
+
*/
|
|
395
|
+
/** Translate the descriptor's model alias to the SDK's `speechModel` value. */
|
|
396
|
+
function resolveSpeechModel(model) {
|
|
397
|
+
if (model === "u3pro-rt") return "u3-rt-pro";
|
|
398
|
+
return model;
|
|
399
|
+
}
|
|
400
|
+
/** Build an {@link SttOpener} from resolved AssemblyAI descriptor options. */
|
|
401
|
+
function openAssemblyAI(opts = {}) {
|
|
402
|
+
return {
|
|
403
|
+
name: "assemblyai",
|
|
404
|
+
async open(openOpts) {
|
|
405
|
+
const apiKey = openOpts.apiKey || process.env.ASSEMBLYAI_API_KEY;
|
|
406
|
+
if (!apiKey) throw makeSttError("stt_auth_failed", "AssemblyAI STT: missing API key. Set ASSEMBLYAI_API_KEY in the agent env.");
|
|
407
|
+
const client = new AssemblyAI({ apiKey });
|
|
408
|
+
const speechModel = resolveSpeechModel(opts.model ?? "u3pro-rt");
|
|
409
|
+
const transcriber = client.streaming.transcriber({
|
|
410
|
+
sampleRate: openOpts.sampleRate,
|
|
411
|
+
speechModel,
|
|
412
|
+
...openOpts.sttPrompt ? { prompt: openOpts.sttPrompt } : {}
|
|
413
|
+
});
|
|
414
|
+
const emitter = createNanoEvents();
|
|
415
|
+
let closed = false;
|
|
416
|
+
transcriber.on("turn", (event) => {
|
|
417
|
+
if (closed) return;
|
|
418
|
+
const text = event.transcript ?? "";
|
|
419
|
+
if (event.end_of_turn) {
|
|
420
|
+
if (text.length > 0) emitter.emit("final", text);
|
|
421
|
+
} else if (text.length > 0) emitter.emit("partial", text);
|
|
422
|
+
});
|
|
423
|
+
transcriber.on("error", (err) => {
|
|
424
|
+
if (closed) return;
|
|
425
|
+
emitter.emit("error", makeSttError("stt_stream_error", err?.message ?? String(err)));
|
|
426
|
+
});
|
|
427
|
+
transcriber.on("close", (code) => {
|
|
428
|
+
if (closed) return;
|
|
429
|
+
if (code !== 1e3) emitter.emit("error", makeSttError("stt_stream_error", `socket closed ${code}`));
|
|
430
|
+
});
|
|
431
|
+
try {
|
|
432
|
+
await transcriber.connect();
|
|
433
|
+
} catch (cause) {
|
|
434
|
+
throw makeSttError("stt_connect_failed", `AssemblyAI STT: connect failed: ${cause instanceof Error ? cause.message : String(cause)}`);
|
|
405
435
|
}
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
beginReply(replyId) {
|
|
413
|
-
ctx.reply = {
|
|
414
|
-
pendingTools: [],
|
|
415
|
-
toolCallCount: 0,
|
|
416
|
-
currentReplyId: replyId
|
|
436
|
+
const close = async () => {
|
|
437
|
+
if (closed) return;
|
|
438
|
+
closed = true;
|
|
439
|
+
try {
|
|
440
|
+
await transcriber.close();
|
|
441
|
+
} catch {}
|
|
417
442
|
};
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
443
|
+
if (openOpts.signal.aborted) close();
|
|
444
|
+
else openOpts.signal.addEventListener("abort", () => void close(), { once: true });
|
|
445
|
+
return {
|
|
446
|
+
sendAudio(pcm) {
|
|
447
|
+
if (closed) return;
|
|
448
|
+
const copy = new Uint8Array(pcm.byteLength);
|
|
449
|
+
copy.set(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
|
|
450
|
+
transcriber.sendAudio(copy.buffer);
|
|
451
|
+
},
|
|
452
|
+
on(event, fn) {
|
|
453
|
+
return emitter.on(event, fn);
|
|
454
|
+
},
|
|
455
|
+
close,
|
|
456
|
+
_transcriber: transcriber
|
|
425
457
|
};
|
|
426
|
-
},
|
|
427
|
-
chainTurn(p) {
|
|
428
|
-
ctx.turnPromise = (ctx.turnPromise ?? Promise.resolve()).then(() => p);
|
|
429
458
|
}
|
|
430
459
|
};
|
|
431
|
-
return ctx;
|
|
432
|
-
}
|
|
433
|
-
function buildCtx(opts) {
|
|
434
|
-
const base = _buildBaseCtx(opts);
|
|
435
|
-
base.s2s = null;
|
|
436
|
-
return base;
|
|
437
|
-
}
|
|
438
|
-
//#endregion
|
|
439
|
-
//#region host/pipeline-session-ctx.ts
|
|
440
|
-
function buildPipelineCtx(opts) {
|
|
441
|
-
const base = _buildBaseCtx(opts);
|
|
442
|
-
base.stt = null;
|
|
443
|
-
base.tts = null;
|
|
444
|
-
return base;
|
|
445
460
|
}
|
|
446
461
|
//#endregion
|
|
447
|
-
//#region host/
|
|
462
|
+
//#region host/providers/stt/deepgram.ts
|
|
448
463
|
/**
|
|
449
|
-
*
|
|
464
|
+
* Deepgram Nova streaming STT opener (host-only).
|
|
450
465
|
*
|
|
451
|
-
*
|
|
452
|
-
*
|
|
466
|
+
* The user-facing descriptor factory (`deepgram(...)`) lives in
|
|
467
|
+
* `sdk/providers/stt/deepgram.ts`. This module is the host-side
|
|
468
|
+
* counterpart: it takes the descriptor options + an API key and
|
|
469
|
+
* returns an {@link SttOpener} that the pipeline session drives.
|
|
470
|
+
*
|
|
471
|
+
* Default model: `"nova-3"`. Any string is forwarded verbatim to the SDK.
|
|
472
|
+
*
|
|
473
|
+
* This adapter targets the Deepgram SDK v5 (`@deepgram/sdk@^5`). The v5
|
|
474
|
+
* streaming API is:
|
|
475
|
+
* `client.listen.v1.connect(args)` → `Promise<V1Socket>`
|
|
476
|
+
* followed by:
|
|
477
|
+
* `socket.connect()` + `socket.waitForOpen()` to establish the connection.
|
|
453
478
|
*/
|
|
454
|
-
function consoleLog(fn) {
|
|
455
|
-
return (msg, ctx) => ctx ? fn(msg, ctx) : fn(msg);
|
|
456
|
-
}
|
|
457
|
-
/** Default console-backed logger. */
|
|
458
|
-
const consoleLogger = {
|
|
459
|
-
info: consoleLog(console.log),
|
|
460
|
-
warn: consoleLog(console.warn),
|
|
461
|
-
error: consoleLog(console.error),
|
|
462
|
-
debug: consoleLog(console.debug)
|
|
463
|
-
};
|
|
464
479
|
/**
|
|
465
|
-
*
|
|
466
|
-
*
|
|
467
|
-
* caller-provided context fields.
|
|
480
|
+
* Handle an incoming Deepgram transcript message, emitting `partial` or
|
|
481
|
+
* `final` events on the emitter. Empty transcripts are silently dropped.
|
|
468
482
|
*/
|
|
469
|
-
function
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
483
|
+
function handleMessage(data, closed, emitter) {
|
|
484
|
+
if (closed) return;
|
|
485
|
+
if (data.type !== "Results") return;
|
|
486
|
+
const result = data;
|
|
487
|
+
const text = result.channel?.alternatives?.[0]?.transcript ?? "";
|
|
488
|
+
if (result.is_final) {
|
|
489
|
+
if (text.length > 0) emitter.emit("final", text);
|
|
490
|
+
} else if (text.length > 0) emitter.emit("partial", text);
|
|
491
|
+
}
|
|
492
|
+
/** Wire Deepgram socket events onto the nanoevents emitter. */
|
|
493
|
+
function wireSocketEvents(connection, emitter, getIsClosed) {
|
|
494
|
+
connection.on("message", (data) => handleMessage(data, getIsClosed(), emitter));
|
|
495
|
+
connection.on("error", (err) => {
|
|
496
|
+
if (getIsClosed()) return;
|
|
497
|
+
emitter.emit("error", makeSttError("stt_stream_error", err?.message ?? String(err)));
|
|
498
|
+
});
|
|
499
|
+
connection.on("close", (event) => {
|
|
500
|
+
if (getIsClosed()) return;
|
|
501
|
+
const code = event?.code;
|
|
502
|
+
if (code !== void 0 && code !== 1e3) emitter.emit("error", makeSttError("stt_stream_error", `socket closed ${code}`));
|
|
503
|
+
});
|
|
504
|
+
}
|
|
505
|
+
/** Wire the AbortSignal to the close function. */
|
|
506
|
+
function wireAbortSignal(signal, close) {
|
|
507
|
+
if (signal.aborted) close();
|
|
508
|
+
else signal.addEventListener("abort", () => void close(), { once: true });
|
|
509
|
+
}
|
|
510
|
+
/** Build an {@link SttOpener} from resolved Deepgram descriptor options. */
|
|
511
|
+
function openDeepgram(opts = {}) {
|
|
512
|
+
return {
|
|
513
|
+
name: "deepgram",
|
|
514
|
+
async open(openOpts) {
|
|
515
|
+
const apiKey = openOpts.apiKey || process.env.DEEPGRAM_API_KEY;
|
|
516
|
+
if (!apiKey) throw makeSttError("stt_auth_failed", "Deepgram STT: missing API key. Set DEEPGRAM_API_KEY in the agent env.");
|
|
517
|
+
const client = new DeepgramClient({ apiKey });
|
|
518
|
+
let connection;
|
|
519
|
+
try {
|
|
520
|
+
connection = await client.listen.v1.connect({
|
|
521
|
+
model: opts.model ?? "nova-3",
|
|
522
|
+
language: opts.language ?? "en",
|
|
523
|
+
encoding: "linear16",
|
|
524
|
+
sample_rate: openOpts.sampleRate,
|
|
525
|
+
channels: 1,
|
|
526
|
+
interim_results: "true",
|
|
527
|
+
smart_format: "true",
|
|
528
|
+
endpointing: 300,
|
|
529
|
+
utterance_end_ms: "1000",
|
|
530
|
+
Authorization: apiKey
|
|
531
|
+
});
|
|
532
|
+
} catch (cause) {
|
|
533
|
+
throw makeSttError("stt_connect_failed", `Deepgram STT: connect failed: ${cause instanceof Error ? cause.message : String(cause)}`);
|
|
534
|
+
}
|
|
535
|
+
const emitter = createNanoEvents();
|
|
536
|
+
let closed = false;
|
|
537
|
+
wireSocketEvents(connection, emitter, () => closed);
|
|
538
|
+
connection.connect();
|
|
539
|
+
try {
|
|
540
|
+
await connection.waitForOpen();
|
|
541
|
+
} catch (cause) {
|
|
542
|
+
throw makeSttError("stt_connect_failed", `Deepgram STT: WebSocket open failed: ${cause instanceof Error ? cause.message : String(cause)}`);
|
|
543
|
+
}
|
|
544
|
+
const close = async () => {
|
|
545
|
+
if (closed) return;
|
|
546
|
+
closed = true;
|
|
547
|
+
try {
|
|
548
|
+
connection.close();
|
|
549
|
+
} catch {}
|
|
550
|
+
};
|
|
551
|
+
wireAbortSignal(openOpts.signal, close);
|
|
552
|
+
return {
|
|
553
|
+
sendAudio(pcm) {
|
|
554
|
+
if (closed) return;
|
|
555
|
+
connection.sendMedia(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
|
|
556
|
+
},
|
|
557
|
+
on(event, fn) {
|
|
558
|
+
return emitter.on(event, fn);
|
|
559
|
+
},
|
|
560
|
+
close,
|
|
561
|
+
_connection: connection
|
|
562
|
+
};
|
|
563
|
+
}
|
|
478
564
|
};
|
|
479
565
|
}
|
|
480
|
-
const jsonLogger = {
|
|
481
|
-
info: jsonLog("info"),
|
|
482
|
-
warn: jsonLog("warn"),
|
|
483
|
-
error: jsonLog("error"),
|
|
484
|
-
debug: jsonLog("debug")
|
|
485
|
-
};
|
|
486
|
-
/** Default S2S endpoint configuration. */
|
|
487
|
-
const DEFAULT_S2S_CONFIG = {
|
|
488
|
-
wssUrl: "wss://agents.assemblyai.com/v1/voice",
|
|
489
|
-
inputSampleRate: DEFAULT_STT_SAMPLE_RATE,
|
|
490
|
-
outputSampleRate: DEFAULT_TTS_SAMPLE_RATE
|
|
491
|
-
};
|
|
492
566
|
//#endregion
|
|
493
|
-
//#region host/
|
|
567
|
+
//#region host/providers/tts/cartesia.ts
|
|
494
568
|
/**
|
|
495
|
-
*
|
|
496
|
-
* delegation to the agent's {@link ExecuteTool} function.
|
|
569
|
+
* Cartesia TTS opener (host-only).
|
|
497
570
|
*
|
|
498
|
-
* The
|
|
499
|
-
*
|
|
500
|
-
*
|
|
501
|
-
*
|
|
502
|
-
* timeout) remains the single source of truth for tool behavior.
|
|
571
|
+
* The user-facing descriptor factory (`cartesia(...)`) lives in
|
|
572
|
+
* `sdk/providers/tts/cartesia.ts`. This module is the host-side
|
|
573
|
+
* counterpart: it takes the descriptor options + an API key and
|
|
574
|
+
* returns a {@link TtsOpener} that the pipeline session drives.
|
|
503
575
|
*
|
|
504
|
-
*
|
|
505
|
-
*
|
|
506
|
-
* bag-level `ctx.signal` so individual invocations respect streamText
|
|
507
|
-
* aborts.
|
|
508
|
-
*/
|
|
509
|
-
/**
|
|
510
|
-
* Convert an array of {@link ToolSchema} to a Vercel AI SDK `ToolSet`
|
|
511
|
-
* (record keyed by tool name).
|
|
576
|
+
* Wraps `@cartesia/cartesia-js`'s `TTSWS` / `TTSWSContext` and normalizes it
|
|
577
|
+
* onto the {@link TtsEvents} contract consumed by the pipeline orchestrator.
|
|
512
578
|
*
|
|
513
|
-
*
|
|
514
|
-
* the
|
|
515
|
-
*
|
|
579
|
+
* **Per-turn context lifecycle.** Each `sendText(...)` within the same turn
|
|
580
|
+
* appends to the same Cartesia context. On `flush()` or `cancel()`, a new
|
|
581
|
+
* context is minted for the next turn — so concurrent `cancel({ contextId })`
|
|
582
|
+
* only targets the in-flight turn, never the one that follows.
|
|
583
|
+
*
|
|
584
|
+
* **Audio format.** The adapter requests `raw` / `pcm_s16le` at the
|
|
585
|
+
* negotiated `sampleRate` so it can forward chunks as `Int16Array` with no
|
|
586
|
+
* conversion.
|
|
516
587
|
*/
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
}
|
|
530
|
-
});
|
|
531
|
-
return out;
|
|
588
|
+
/** PCM16 sample rates supported by Cartesia's `raw` output format. */
|
|
589
|
+
const CARTESIA_PCM16_RATES = [
|
|
590
|
+
8e3,
|
|
591
|
+
16e3,
|
|
592
|
+
22050,
|
|
593
|
+
24e3,
|
|
594
|
+
44100,
|
|
595
|
+
48e3
|
|
596
|
+
];
|
|
597
|
+
function assertSupportedSampleRate$1(rate) {
|
|
598
|
+
if (CARTESIA_PCM16_RATES.includes(rate)) return rate;
|
|
599
|
+
throw makeTtsError("tts_connect_failed", `Cartesia TTS: unsupported sample rate ${rate}. Supported: ${CARTESIA_PCM16_RATES.join(", ")}.`);
|
|
532
600
|
}
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
function toModelMessage(m) {
|
|
536
|
-
if (m.role === "user") return {
|
|
537
|
-
role: "user",
|
|
538
|
-
content: m.content
|
|
539
|
-
};
|
|
540
|
-
if (m.role === "assistant") return {
|
|
541
|
-
role: "assistant",
|
|
542
|
-
content: m.content
|
|
543
|
-
};
|
|
544
|
-
return {
|
|
545
|
-
role: "assistant",
|
|
546
|
-
content: m.content
|
|
547
|
-
};
|
|
548
|
-
}
|
|
549
|
-
function emitError(client, code, message) {
|
|
550
|
-
client.event({
|
|
551
|
-
type: "error",
|
|
552
|
-
code,
|
|
553
|
-
message
|
|
554
|
-
});
|
|
555
|
-
}
|
|
556
|
-
function handleStreamPart(part, deps) {
|
|
557
|
-
switch (part.type) {
|
|
558
|
-
case "text-delta": {
|
|
559
|
-
const delta = part.text ?? "";
|
|
560
|
-
if (delta.length === 0) return;
|
|
561
|
-
deps.onTextDelta(delta);
|
|
562
|
-
deps.tts?.sendText(delta);
|
|
563
|
-
return;
|
|
564
|
-
}
|
|
565
|
-
case "tool-call": {
|
|
566
|
-
const input = part.input ?? {};
|
|
567
|
-
deps.client.event({
|
|
568
|
-
type: "tool_call",
|
|
569
|
-
toolCallId: part.toolCallId ?? "",
|
|
570
|
-
toolName: part.toolName ?? "",
|
|
571
|
-
args: input
|
|
572
|
-
});
|
|
573
|
-
return;
|
|
574
|
-
}
|
|
575
|
-
case "tool-result": {
|
|
576
|
-
const output = part.output;
|
|
577
|
-
const resultString = typeof output === "string" ? output : JSON.stringify(output);
|
|
578
|
-
deps.client.event({
|
|
579
|
-
type: "tool_call_done",
|
|
580
|
-
toolCallId: part.toolCallId ?? "",
|
|
581
|
-
result: resultString
|
|
582
|
-
});
|
|
583
|
-
return;
|
|
584
|
-
}
|
|
585
|
-
case "error": {
|
|
586
|
-
const msg = errorMessage(part.error);
|
|
587
|
-
deps.log.error("LLM stream error", {
|
|
588
|
-
message: msg,
|
|
589
|
-
sessionId: deps.sessionId
|
|
590
|
-
});
|
|
591
|
-
emitError(deps.client, "llm", msg);
|
|
592
|
-
return;
|
|
593
|
-
}
|
|
594
|
-
default: return;
|
|
595
|
-
}
|
|
596
|
-
}
|
|
597
|
-
/** Create a pluggable-provider voice session. */
|
|
598
|
-
function createPipelineSession(opts) {
|
|
599
|
-
const log = opts.logger ?? consoleLogger;
|
|
600
|
-
const sttSampleRate = opts.sttSampleRate ?? 16e3;
|
|
601
|
-
const ttsSampleRate = opts.ttsSampleRate ?? 24e3;
|
|
602
|
-
const { client, agentConfig, toolSchemas, executeTool } = opts;
|
|
603
|
-
const systemPrompt = buildSystemPrompt(agentConfig, {
|
|
604
|
-
hasTools: toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0,
|
|
605
|
-
voice: true,
|
|
606
|
-
toolGuidance: opts.toolGuidance
|
|
607
|
-
});
|
|
608
|
-
const ctx = buildPipelineCtx({
|
|
609
|
-
id: opts.id,
|
|
610
|
-
agent: opts.agent,
|
|
611
|
-
client,
|
|
612
|
-
agentConfig,
|
|
613
|
-
executeTool,
|
|
614
|
-
log,
|
|
615
|
-
maxHistory: opts.maxHistory
|
|
616
|
-
});
|
|
617
|
-
const sessionAbort = new AbortController();
|
|
618
|
-
let audioReady = false;
|
|
619
|
-
let terminated = false;
|
|
620
|
-
let turnController = null;
|
|
621
|
-
let nextReplyId = 0;
|
|
622
|
-
const sttSubs = [];
|
|
623
|
-
const ttsSubs = [];
|
|
624
|
-
/**
|
|
625
|
-
* Tear down the session after an unrecoverable provider error. Aborts the
|
|
626
|
-
* in-flight turn, cancels TTS, signals providers to close via sessionAbort,
|
|
627
|
-
* and flips `terminated` so future STT events and audio frames become
|
|
628
|
-
* no-ops. Idempotent.
|
|
629
|
-
*/
|
|
630
|
-
function terminate() {
|
|
631
|
-
if (terminated) return;
|
|
632
|
-
terminated = true;
|
|
633
|
-
if (turnController !== null) {
|
|
634
|
-
turnController.abort();
|
|
635
|
-
turnController = null;
|
|
636
|
-
}
|
|
637
|
-
ctx.tts?.cancel();
|
|
638
|
-
ctx.cancelReply();
|
|
639
|
-
sessionAbort.abort();
|
|
640
|
-
}
|
|
641
|
-
function onSttPartial(_text) {
|
|
642
|
-
if (terminated) return;
|
|
643
|
-
if (turnController === null) return;
|
|
644
|
-
log.info("Pipeline barge-in", { sessionId: opts.id });
|
|
645
|
-
turnController.abort();
|
|
646
|
-
turnController = null;
|
|
647
|
-
ctx.tts?.cancel();
|
|
648
|
-
ctx.cancelReply();
|
|
649
|
-
client.event({ type: "cancelled" });
|
|
650
|
-
}
|
|
651
|
-
function onSttFinal(text) {
|
|
652
|
-
if (terminated) return;
|
|
653
|
-
const trimmed = text.trim();
|
|
654
|
-
if (trimmed.length === 0) return;
|
|
655
|
-
if (turnController !== null) {
|
|
656
|
-
log.info("Pipeline replacing in-flight turn", { sessionId: opts.id });
|
|
657
|
-
turnController.abort();
|
|
658
|
-
turnController = null;
|
|
659
|
-
ctx.tts?.cancel();
|
|
660
|
-
ctx.cancelReply();
|
|
661
|
-
client.event({ type: "cancelled" });
|
|
662
|
-
}
|
|
663
|
-
client.event({
|
|
664
|
-
type: "user_transcript",
|
|
665
|
-
text
|
|
666
|
-
});
|
|
667
|
-
const turn = runTurn(trimmed).catch((err) => {
|
|
668
|
-
log.error("Pipeline turn crashed", {
|
|
669
|
-
error: errorMessage(err),
|
|
670
|
-
sessionId: opts.id
|
|
671
|
-
});
|
|
672
|
-
});
|
|
673
|
-
ctx.chainTurn(turn);
|
|
674
|
-
}
|
|
675
|
-
function onSttError(err) {
|
|
676
|
-
if (terminated) return;
|
|
677
|
-
log.error("STT error", {
|
|
678
|
-
code: err.code,
|
|
679
|
-
message: err.message,
|
|
680
|
-
sessionId: opts.id
|
|
681
|
-
});
|
|
682
|
-
emitError(client, "stt", err.message);
|
|
683
|
-
terminate();
|
|
684
|
-
}
|
|
685
|
-
function onTtsError(err) {
|
|
686
|
-
if (terminated) return;
|
|
687
|
-
log.error("TTS error", {
|
|
688
|
-
code: err.code,
|
|
689
|
-
message: err.message,
|
|
690
|
-
sessionId: opts.id
|
|
691
|
-
});
|
|
692
|
-
emitError(client, "tts", err.message);
|
|
693
|
-
terminate();
|
|
694
|
-
}
|
|
695
|
-
async function consumeLlmStream(ctl, messages, tools, onDelta) {
|
|
696
|
-
const deps = {
|
|
697
|
-
client,
|
|
698
|
-
tts: ctx.tts,
|
|
699
|
-
log,
|
|
700
|
-
sessionId: opts.id,
|
|
701
|
-
onTextDelta: onDelta
|
|
702
|
-
};
|
|
703
|
-
try {
|
|
704
|
-
const maxSteps = agentConfig.maxSteps ?? 5;
|
|
705
|
-
const result = streamText({
|
|
706
|
-
model: opts.llm,
|
|
707
|
-
system: systemPrompt,
|
|
708
|
-
messages,
|
|
709
|
-
tools,
|
|
710
|
-
stopWhen: stepCountIs(maxSteps),
|
|
711
|
-
abortSignal: ctl.signal
|
|
712
|
-
});
|
|
713
|
-
for await (const part of result.fullStream) {
|
|
714
|
-
if (ctl.signal.aborted) break;
|
|
715
|
-
handleStreamPart(part, deps);
|
|
716
|
-
}
|
|
717
|
-
} catch (err) {
|
|
718
|
-
if (!ctl.signal.aborted) {
|
|
719
|
-
const msg = errorMessage(err);
|
|
720
|
-
log.error("LLM streamText failed", {
|
|
721
|
-
error: msg,
|
|
722
|
-
sessionId: opts.id
|
|
723
|
-
});
|
|
724
|
-
emitError(client, "llm", msg);
|
|
725
|
-
}
|
|
726
|
-
}
|
|
727
|
-
}
|
|
728
|
-
/**
|
|
729
|
-
* Flush TTS and wait for drain. Resolves on any of:
|
|
730
|
-
* - TTS emits `done`
|
|
731
|
-
* - `signal` aborts (barge-in, provider error, session stop)
|
|
732
|
-
* - `PIPELINE_FLUSH_TIMEOUT_MS` elapses
|
|
733
|
-
* Resolves immediately if no TTS session.
|
|
734
|
-
*/
|
|
735
|
-
function flushTtsAndWait(signal) {
|
|
736
|
-
const tts = ctx.tts;
|
|
737
|
-
if (!tts) return Promise.resolve();
|
|
738
|
-
return new Promise((resolve) => {
|
|
739
|
-
let off = null;
|
|
740
|
-
let timer = null;
|
|
741
|
-
const cleanup = () => {
|
|
742
|
-
if (off) {
|
|
743
|
-
off();
|
|
744
|
-
off = null;
|
|
745
|
-
}
|
|
746
|
-
if (timer) {
|
|
747
|
-
clearTimeout(timer);
|
|
748
|
-
timer = null;
|
|
749
|
-
}
|
|
750
|
-
signal.removeEventListener("abort", onAbort);
|
|
751
|
-
};
|
|
752
|
-
const finish = () => {
|
|
753
|
-
cleanup();
|
|
754
|
-
resolve();
|
|
755
|
-
};
|
|
756
|
-
const onAbort = () => finish();
|
|
757
|
-
if (signal.aborted) {
|
|
758
|
-
resolve();
|
|
759
|
-
return;
|
|
760
|
-
}
|
|
761
|
-
signal.addEventListener("abort", onAbort, { once: true });
|
|
762
|
-
off = tts.on("done", finish);
|
|
763
|
-
timer = setTimeout(() => {
|
|
764
|
-
log.warn("TTS flush timeout", {
|
|
765
|
-
sessionId: opts.id,
|
|
766
|
-
timeoutMs: PIPELINE_FLUSH_TIMEOUT_MS
|
|
767
|
-
});
|
|
768
|
-
finish();
|
|
769
|
-
}, PIPELINE_FLUSH_TIMEOUT_MS);
|
|
770
|
-
tts.flush();
|
|
771
|
-
});
|
|
772
|
-
}
|
|
773
|
-
async function runTurn(userText) {
|
|
774
|
-
const replyId = `pipeline-${++nextReplyId}`;
|
|
775
|
-
ctx.beginReply(replyId);
|
|
776
|
-
ctx.pushMessages({
|
|
777
|
-
role: "user",
|
|
778
|
-
content: userText
|
|
779
|
-
});
|
|
780
|
-
const ctl = new AbortController();
|
|
781
|
-
turnController = ctl;
|
|
782
|
-
const tools = toVercelTools(toolSchemas, {
|
|
783
|
-
executeTool,
|
|
784
|
-
sessionId: opts.id,
|
|
785
|
-
messages: () => ctx.conversationMessages,
|
|
786
|
-
signal: ctl.signal
|
|
787
|
-
});
|
|
788
|
-
const messages = ctx.conversationMessages.map(toModelMessage);
|
|
789
|
-
let accumulated = "";
|
|
790
|
-
await consumeLlmStream(ctl, messages, tools, (delta) => {
|
|
791
|
-
accumulated += delta;
|
|
792
|
-
});
|
|
793
|
-
if (ctl.signal.aborted) {
|
|
794
|
-
if (turnController === ctl) turnController = null;
|
|
795
|
-
return;
|
|
796
|
-
}
|
|
797
|
-
if (accumulated.length > 0) {
|
|
798
|
-
client.event({
|
|
799
|
-
type: "agent_transcript",
|
|
800
|
-
text: accumulated
|
|
801
|
-
});
|
|
802
|
-
ctx.pushMessages({
|
|
803
|
-
role: "assistant",
|
|
804
|
-
content: accumulated
|
|
805
|
-
});
|
|
806
|
-
}
|
|
807
|
-
await flushTtsAndWait(ctl.signal);
|
|
808
|
-
if (ctl.signal.aborted) {
|
|
809
|
-
if (turnController === ctl) turnController = null;
|
|
810
|
-
return;
|
|
811
|
-
}
|
|
812
|
-
client.playAudioDone();
|
|
813
|
-
client.event({ type: "reply_done" });
|
|
814
|
-
if (turnController === ctl) turnController = null;
|
|
815
|
-
}
|
|
816
|
-
async function runGreeting(text) {
|
|
817
|
-
const replyId = `pipeline-greeting-${++nextReplyId}`;
|
|
818
|
-
ctx.beginReply(replyId);
|
|
819
|
-
const ctl = new AbortController();
|
|
820
|
-
turnController = ctl;
|
|
821
|
-
client.event({
|
|
822
|
-
type: "agent_transcript",
|
|
823
|
-
text
|
|
824
|
-
});
|
|
825
|
-
ctx.pushMessages({
|
|
826
|
-
role: "assistant",
|
|
827
|
-
content: text
|
|
828
|
-
});
|
|
829
|
-
ctx.tts?.sendText(text);
|
|
830
|
-
await flushTtsAndWait(ctl.signal);
|
|
831
|
-
if (ctl.signal.aborted) {
|
|
832
|
-
if (turnController === ctl) turnController = null;
|
|
833
|
-
return;
|
|
834
|
-
}
|
|
835
|
-
client.playAudioDone();
|
|
836
|
-
client.event({ type: "reply_done" });
|
|
837
|
-
if (turnController === ctl) turnController = null;
|
|
838
|
-
}
|
|
839
|
-
function reportOpenRejection(which, reason) {
|
|
840
|
-
const msg = errorMessage(reason);
|
|
841
|
-
log.error(`${which === "stt" ? "STT" : "TTS"} open failed`, {
|
|
842
|
-
error: msg,
|
|
843
|
-
sessionId: opts.id
|
|
844
|
-
});
|
|
845
|
-
emitError(client, which, msg);
|
|
846
|
-
}
|
|
847
|
-
async function adoptStt(sttSession, teardown) {
|
|
848
|
-
if (teardown) {
|
|
849
|
-
await sttSession.close().catch(() => void 0);
|
|
850
|
-
return;
|
|
851
|
-
}
|
|
852
|
-
ctx.stt = sttSession;
|
|
853
|
-
sttSubs.push(sttSession.on("partial", onSttPartial));
|
|
854
|
-
sttSubs.push(sttSession.on("final", onSttFinal));
|
|
855
|
-
sttSubs.push(sttSession.on("error", onSttError));
|
|
856
|
-
}
|
|
857
|
-
async function adoptTts(ttsSession, teardown) {
|
|
858
|
-
if (teardown) {
|
|
859
|
-
await ttsSession.close().catch(() => void 0);
|
|
860
|
-
return;
|
|
861
|
-
}
|
|
862
|
-
ctx.tts = ttsSession;
|
|
863
|
-
ttsSubs.push(ttsSession.on("audio", (pcm) => {
|
|
864
|
-
client.playAudioChunk(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
|
|
865
|
-
}));
|
|
866
|
-
ttsSubs.push(ttsSession.on("error", onTtsError));
|
|
867
|
-
}
|
|
868
|
-
async function openProviders() {
|
|
869
|
-
const [sttResult, ttsResult] = await Promise.allSettled([opts.stt.open({
|
|
870
|
-
sampleRate: sttSampleRate,
|
|
871
|
-
apiKey: opts.sttApiKey,
|
|
872
|
-
sttPrompt: agentConfig.sttPrompt,
|
|
873
|
-
signal: sessionAbort.signal
|
|
874
|
-
}), opts.tts.open({
|
|
875
|
-
sampleRate: ttsSampleRate,
|
|
876
|
-
apiKey: opts.ttsApiKey,
|
|
877
|
-
signal: sessionAbort.signal
|
|
878
|
-
})]);
|
|
879
|
-
if (sttResult.status === "rejected") reportOpenRejection("stt", sttResult.reason);
|
|
880
|
-
if (ttsResult.status === "rejected") reportOpenRejection("tts", ttsResult.reason);
|
|
881
|
-
const aborted = sessionAbort.signal.aborted;
|
|
882
|
-
const sttFailed = sttResult.status === "rejected";
|
|
883
|
-
const ttsFailed = ttsResult.status === "rejected";
|
|
884
|
-
const teardown = aborted || sttFailed || ttsFailed;
|
|
885
|
-
if (sttResult.status === "fulfilled") await adoptStt(sttResult.value, teardown);
|
|
886
|
-
if (ttsResult.status === "fulfilled") await adoptTts(ttsResult.value, teardown);
|
|
887
|
-
if (!aborted && (sttFailed || ttsFailed)) terminate();
|
|
888
|
-
}
|
|
889
|
-
return {
|
|
890
|
-
async start() {
|
|
891
|
-
await openProviders();
|
|
892
|
-
},
|
|
893
|
-
async stop() {
|
|
894
|
-
if (sessionAbort.signal.aborted) return;
|
|
895
|
-
sessionAbort.abort();
|
|
896
|
-
turnController?.abort();
|
|
897
|
-
for (const off of sttSubs) off();
|
|
898
|
-
for (const off of ttsSubs) off();
|
|
899
|
-
sttSubs.length = 0;
|
|
900
|
-
ttsSubs.length = 0;
|
|
901
|
-
if (ctx.turnPromise !== null) await ctx.turnPromise;
|
|
902
|
-
await ctx.stt?.close().catch(() => {});
|
|
903
|
-
await ctx.tts?.close().catch(() => {});
|
|
904
|
-
},
|
|
905
|
-
onAudio(data) {
|
|
906
|
-
if (terminated || !audioReady) return;
|
|
907
|
-
const offset = data.byteOffset;
|
|
908
|
-
const length = data.byteLength;
|
|
909
|
-
let pcm;
|
|
910
|
-
if (offset % 2 === 0 && length % 2 === 0) pcm = new Int16Array(data.buffer, offset, length / 2);
|
|
911
|
-
else {
|
|
912
|
-
const copy = new Uint8Array(length - length % 2);
|
|
913
|
-
copy.set(data.subarray(0, copy.byteLength));
|
|
914
|
-
pcm = new Int16Array(copy.buffer);
|
|
915
|
-
}
|
|
916
|
-
ctx.stt?.sendAudio(pcm);
|
|
917
|
-
},
|
|
918
|
-
onAudioReady() {
|
|
919
|
-
if (audioReady || terminated) return;
|
|
920
|
-
audioReady = true;
|
|
921
|
-
if (opts.skipGreeting) return;
|
|
922
|
-
const greeting = agentConfig.greeting;
|
|
923
|
-
if (!greeting) return;
|
|
924
|
-
const turn = runGreeting(greeting).catch((err) => {
|
|
925
|
-
log.error("Pipeline greeting failed", {
|
|
926
|
-
error: errorMessage(err),
|
|
927
|
-
sessionId: opts.id
|
|
928
|
-
});
|
|
929
|
-
});
|
|
930
|
-
ctx.chainTurn(turn);
|
|
931
|
-
},
|
|
932
|
-
onCancel() {
|
|
933
|
-
if (terminated) return;
|
|
934
|
-
turnController?.abort();
|
|
935
|
-
turnController = null;
|
|
936
|
-
ctx.tts?.cancel();
|
|
937
|
-
ctx.cancelReply();
|
|
938
|
-
client.event({ type: "cancelled" });
|
|
939
|
-
},
|
|
940
|
-
onReset() {
|
|
941
|
-
if (terminated) return;
|
|
942
|
-
turnController?.abort();
|
|
943
|
-
turnController = null;
|
|
944
|
-
ctx.tts?.cancel();
|
|
945
|
-
ctx.cancelReply();
|
|
946
|
-
ctx.conversationMessages = [];
|
|
947
|
-
ctx.turnPromise = null;
|
|
948
|
-
client.event({ type: "reset" });
|
|
949
|
-
},
|
|
950
|
-
onHistory(incoming) {
|
|
951
|
-
if (terminated) return;
|
|
952
|
-
ctx.pushMessages(...incoming.map((m) => ({
|
|
953
|
-
role: m.role,
|
|
954
|
-
content: m.content
|
|
955
|
-
})));
|
|
956
|
-
},
|
|
957
|
-
waitForTurn() {
|
|
958
|
-
return ctx.turnPromise ?? Promise.resolve();
|
|
959
|
-
}
|
|
960
|
-
};
|
|
961
|
-
}
|
|
962
|
-
//#endregion
|
|
963
|
-
//#region host/providers/stt/assemblyai.ts
|
|
964
|
-
/**
|
|
965
|
-
* AssemblyAI Universal-Streaming STT opener (host-only).
|
|
966
|
-
*
|
|
967
|
-
* The user-facing descriptor factory (`assemblyAI(...)`) lives in
|
|
968
|
-
* `sdk/providers/stt/assemblyai.ts`. This module is the host-side
|
|
969
|
-
* counterpart: it takes the descriptor options + an API key and
|
|
970
|
-
* returns an {@link SttOpener} that the pipeline session drives.
|
|
971
|
-
*
|
|
972
|
-
* Default model: `"u3pro-rt"` (Universal-3 Pro Real-Time). The adapter
|
|
973
|
-
* maps that to the SDK's `"u3-rt-pro"` `speechModel` value; any other
|
|
974
|
-
* string is forwarded verbatim.
|
|
975
|
-
*/
|
|
976
|
-
/**
 * Translate the descriptor's model alias to the SDK's `speechModel` value.
 *
 * Only the alias `"u3pro-rt"` is rewritten (to `"u3-rt-pro"`); every other
 * value is returned unchanged.
 */
function resolveSpeechModel(model) {
	return model === "u3pro-rt" ? "u3-rt-pro" : model;
}
|
|
981
|
-
/**
 * Build an {@link SttOpener} from resolved AssemblyAI descriptor options.
 *
 * `open()` resolves the API key (explicit `openOpts.apiKey` first, then
 * `process.env.ASSEMBLYAI_API_KEY`), connects a streaming transcriber and
 * returns a session object exposing `sendAudio`, `on`, `close` and the raw
 * `_transcriber`.
 *
 * Throws an `stt_auth_failed` error when no key is available and an
 * `stt_connect_failed` error when `transcriber.connect()` rejects.
 */
function openAssemblyAI(opts = {}) {
	async function open(openOpts) {
		const apiKey = openOpts.apiKey || process.env.ASSEMBLYAI_API_KEY;
		if (!apiKey) {
			throw makeSttError("stt_auth_failed", "AssemblyAI STT: missing API key. Set ASSEMBLYAI_API_KEY in the agent env.");
		}
		const sdk = new AssemblyAI({ apiKey });
		const speechModel = resolveSpeechModel(opts.model ?? "u3pro-rt");
		const transcriberParams = {
			sampleRate: openOpts.sampleRate,
			speechModel
		};
		// The prompt key is only present when a non-empty prompt was configured.
		if (openOpts.sttPrompt) transcriberParams.prompt = openOpts.sttPrompt;
		const transcriber = sdk.streaming.transcriber(transcriberParams);
		const emitter = createNanoEvents();
		let closed = false;

		transcriber.on("turn", (event) => {
			if (closed) return;
			const text = event.transcript ?? "";
			// Empty transcripts are never forwarded, final or partial.
			if (text.length === 0) return;
			emitter.emit(event.end_of_turn ? "final" : "partial", text);
		});
		transcriber.on("error", (err) => {
			if (closed) return;
			emitter.emit("error", makeSttError("stt_stream_error", err?.message ?? String(err)));
		});
		transcriber.on("close", (code) => {
			if (closed) return;
			// 1000 is treated as a normal closure; any other code is reported.
			if (code === 1e3) return;
			emitter.emit("error", makeSttError("stt_stream_error", `socket closed ${code}`));
		});

		try {
			await transcriber.connect();
		} catch (cause) {
			throw makeSttError("stt_connect_failed", `AssemblyAI STT: connect failed: ${cause instanceof Error ? cause.message : String(cause)}`);
		}

		// Idempotent: the first call flips `closed`, later calls return at once.
		const close = async () => {
			if (closed) return;
			closed = true;
			try {
				await transcriber.close();
			} catch {}
		};

		if (openOpts.signal.aborted) {
			void close();
		} else {
			openOpts.signal.addEventListener("abort", () => void close(), { once: true });
		}

		return {
			sendAudio(pcm) {
				if (closed) return;
				// Hand the SDK a private copy so later writes by the caller cannot alter it.
				const bytes = new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength).slice();
				transcriber.sendAudio(bytes.buffer);
			},
			on(event, fn) {
				return emitter.on(event, fn);
			},
			close,
			_transcriber: transcriber
		};
	}

	return {
		name: "assemblyai",
		open
	};
}
|
|
1042
|
-
//#endregion
|
|
1043
|
-
//#region host/providers/tts/cartesia.ts
|
|
1044
|
-
/**
|
|
1045
|
-
* Cartesia TTS opener (host-only).
|
|
1046
|
-
*
|
|
1047
|
-
* The user-facing descriptor factory (`cartesia(...)`) lives in
|
|
1048
|
-
* `sdk/providers/tts/cartesia.ts`. This module is the host-side
|
|
1049
|
-
* counterpart: it takes the descriptor options + an API key and
|
|
1050
|
-
* returns a {@link TtsOpener} that the pipeline session drives.
|
|
1051
|
-
*
|
|
1052
|
-
* Wraps `@cartesia/cartesia-js`'s `TTSWS` / `TTSWSContext` and normalizes it
|
|
1053
|
-
* onto the {@link TtsEvents} contract consumed by the pipeline orchestrator.
|
|
1054
|
-
*
|
|
1055
|
-
* **Per-turn context lifecycle.** Each `sendText(...)` within the same turn
|
|
1056
|
-
* appends to the same Cartesia context. On `flush()` or `cancel()`, a new
|
|
1057
|
-
* context is minted for the next turn — so concurrent `cancel({ contextId })`
|
|
1058
|
-
* only targets the in-flight turn, never the one that follows.
|
|
1059
|
-
*
|
|
1060
|
-
* **Audio format.** The adapter requests `raw` / `pcm_s16le` at the
|
|
1061
|
-
* negotiated `sampleRate` so it can forward chunks as `Int16Array` with no
|
|
1062
|
-
* conversion.
|
|
1063
|
-
*/
|
|
1064
|
-
/** PCM16 sample rates supported by Cartesia's `raw` output format. */
const CARTESIA_PCM16_RATES = [8000, 16000, 22050, 24000, 44100, 48000];
/**
 * Return `rate` unchanged when it is one of {@link CARTESIA_PCM16_RATES};
 * otherwise throw a `tts_connect_failed` error listing the accepted rates.
 */
function assertSupportedSampleRate(rate) {
	if (!CARTESIA_PCM16_RATES.includes(rate)) {
		throw makeTtsError("tts_connect_failed", `Cartesia TTS: unsupported sample rate ${rate}. Supported: ${CARTESIA_PCM16_RATES.join(", ")}.`);
	}
	return rate;
}
|
|
1077
|
-
/** Build a {@link TtsOpener} from resolved Cartesia descriptor options. */
|
|
1078
|
-
function openCartesia(opts) {
|
|
601
|
+
/** Build a {@link TtsOpener} from resolved Cartesia descriptor options. */
|
|
602
|
+
function openCartesia(opts) {
|
|
1079
603
|
return {
|
|
1080
604
|
name: "cartesia",
|
|
1081
605
|
async open(openOpts) {
|
|
1082
606
|
const apiKey = openOpts.apiKey || process.env.CARTESIA_API_KEY;
|
|
1083
607
|
if (!apiKey) throw makeTtsError("tts_auth_failed", "Cartesia TTS: missing API key. Set CARTESIA_API_KEY in the agent env.");
|
|
1084
|
-
const sampleRate = assertSupportedSampleRate(openOpts.sampleRate);
|
|
608
|
+
const sampleRate = assertSupportedSampleRate$1(openOpts.sampleRate);
|
|
1085
609
|
const model = opts.model ?? "sonic-2";
|
|
1086
610
|
const language = opts.language ?? "en";
|
|
611
|
+
const voice = opts.voice ?? "f786b574-daa5-4673-aa0c-cbe3e8534c02";
|
|
1087
612
|
const client = new Cartesia({ apiKey });
|
|
1088
613
|
let ws;
|
|
1089
614
|
try {
|
|
@@ -1098,7 +623,7 @@ function openCartesia(opts) {
|
|
|
1098
623
|
model_id: model,
|
|
1099
624
|
voice: {
|
|
1100
625
|
mode: "id",
|
|
1101
|
-
id:
|
|
626
|
+
id: voice
|
|
1102
627
|
},
|
|
1103
628
|
output_format: {
|
|
1104
629
|
container: "raw",
|
|
@@ -1169,7 +694,7 @@ function openCartesia(opts) {
|
|
|
1169
694
|
model_id: model,
|
|
1170
695
|
voice: {
|
|
1171
696
|
mode: "id",
|
|
1172
|
-
id:
|
|
697
|
+
id: voice
|
|
1173
698
|
},
|
|
1174
699
|
output_format: {
|
|
1175
700
|
container: "raw",
|
|
@@ -1178,38 +703,247 @@ function openCartesia(opts) {
|
|
|
1178
703
|
},
|
|
1179
704
|
language
|
|
1180
705
|
};
|
|
1181
|
-
const ignoreRejection = (_err) => {};
|
|
706
|
+
const ignoreRejection = (_err) => {};
|
|
707
|
+
return {
|
|
708
|
+
sendText(text) {
|
|
709
|
+
if (closed || text.length === 0) return;
|
|
710
|
+
rotateIfPending();
|
|
711
|
+
context.send({
|
|
712
|
+
...baseRequest,
|
|
713
|
+
transcript: text,
|
|
714
|
+
continue: true
|
|
715
|
+
}).catch(ignoreRejection);
|
|
716
|
+
},
|
|
717
|
+
flush() {
|
|
718
|
+
if (closed || rotatePending) return;
|
|
719
|
+
context.send({
|
|
720
|
+
...baseRequest,
|
|
721
|
+
transcript: "",
|
|
722
|
+
continue: false
|
|
723
|
+
}).catch(ignoreRejection);
|
|
724
|
+
rotatePending = true;
|
|
725
|
+
},
|
|
726
|
+
cancel() {
|
|
727
|
+
if (closed) return;
|
|
728
|
+
if (!doneEmitted) context.cancel().catch(ignoreRejection);
|
|
729
|
+
emitDoneOnce();
|
|
730
|
+
rotatePending = true;
|
|
731
|
+
},
|
|
732
|
+
on(event, fn) {
|
|
733
|
+
return emitter.on(event, fn);
|
|
734
|
+
},
|
|
735
|
+
close,
|
|
736
|
+
_ws: ws,
|
|
737
|
+
_currentContextId: () => context.contextId
|
|
738
|
+
};
|
|
739
|
+
}
|
|
740
|
+
};
|
|
741
|
+
}
|
|
742
|
+
//#endregion
|
|
743
|
+
//#region host/providers/tts/rime.ts
|
|
744
|
+
/**
|
|
745
|
+
* Rime TTS opener (host-only).
|
|
746
|
+
*
|
|
747
|
+
* The user-facing descriptor factory (`rime(...)`) lives in
|
|
748
|
+
* `sdk/providers/tts/rime.ts`. This module is the host-side
|
|
749
|
+
* counterpart: it takes the descriptor options + an API key and
|
|
750
|
+
* returns a {@link TtsOpener} that the pipeline session drives.
|
|
751
|
+
*
|
|
752
|
+
* **Protocol.** Connects to Rime's `ws2` JSON WebSocket endpoint
|
|
753
|
+
* (`wss://users-ws.rime.ai/ws2`). Client-to-server messages are JSON:
|
|
754
|
+
* - `{ "text": "..." }` — append text to the synthesis buffer
|
|
755
|
+
* - `{ "operation": "clear" }` — drop buffered text (barge-in)
|
|
756
|
+
* - `{ "operation": "eos" }` — drain buffer, close connection (NOT used
|
|
757
|
+
* during a session: it would tear down the WS, forcing reconnect per
|
|
758
|
+
* turn). We force end-of-turn synthesis with a trailing `"."` instead.
|
|
759
|
+
* The server responds with JSON frames:
|
|
760
|
+
* - `{ type: "chunk", data: <base64 PCM16 LE>, contextId: string | null }`
|
|
761
|
+
* - `{ type: "timestamps", ... }` (ignored)
|
|
762
|
+
* - `{ type: "error", message: string }` (surfaced as `tts_stream_error`)
|
|
763
|
+
*
|
|
764
|
+
* **Single long-lived connection per session.** Rime buffers text until it
|
|
765
|
+
* sees terminal punctuation (`.`, `?`, `!`), so we use one WebSocket per
|
|
766
|
+
* `open()` call and reuse it across turns. `clear` resets the buffer
|
|
767
|
+
* between cancellations.
|
|
768
|
+
*
|
|
769
|
+
* **Done detection.** After `flush()` sends a trailing `"."` to force the
|
|
770
|
+
* server to synthesize any half-buffered text, we arm a quiescence timer
|
|
771
|
+
* that fires 500 ms after the last received audio chunk. When it fires,
|
|
772
|
+
* `done` is emitted.
|
|
773
|
+
*
|
|
774
|
+
* **Audio format.** The URL requests `audioFormat=pcm` at the negotiated
|
|
775
|
+
* `sampleRate`, which returns raw PCM16 little-endian. We decode the base64
|
|
776
|
+
* payload and construct a zero-copy `Int16Array` view over the decoded bytes.
|
|
777
|
+
*/
|
|
778
|
+
/** PCM16 sample rates accepted by the Rime `ws2` endpoint. */
const RIME_PCM16_RATES = [8000, 16000, 22050, 24000, 44100, 48000];
/**
 * Return `rate` unchanged when it is one of {@link RIME_PCM16_RATES};
 * otherwise throw a `tts_connect_failed` error listing the accepted rates.
 */
function assertSupportedSampleRate(rate) {
	if (!RIME_PCM16_RATES.includes(rate)) {
		throw makeTtsError("tts_connect_failed", `Rime TTS: unsupported sample rate ${rate}. Supported: ${RIME_PCM16_RATES.join(", ")}.`);
	}
	return rate;
}
|
|
791
|
+
/**
 * Decode a base64 string from Rime into an `Int16Array` of PCM16 samples.
 *
 * Rime's `ws2` endpoint returns base64-encoded PCM16 LE in each chunk.
 * `Buffer.from(base64, "base64")` yields a Node.js Buffer (a Uint8Array
 * subclass). Small Buffers may be slices of a shared allocation pool, so
 * `byteOffset` is not guaranteed to be 0, and a malformed payload can decode
 * to an odd number of bytes.
 *
 * - A trailing odd byte is dropped (it cannot form a 16-bit sample).
 * - When the decoded bytes start on a 2-byte boundary the result is a
 *   zero-copy view over them.
 * - Otherwise the bytes are copied into a fresh buffer first, because the
 *   `Int16Array` constructor throws a RangeError for an odd byte offset.
 *
 * @param {string} data base64-encoded PCM16 little-endian audio
 * @returns {Int16Array} decoded samples (empty when fewer than 2 bytes decode)
 */
function base64ToPcm(data) {
	const bytes = Buffer.from(data, "base64");
	const evenLen = bytes.byteLength - (bytes.byteLength % 2);
	if (evenLen === 0) return new Int16Array(0);
	if (bytes.byteOffset % 2 === 0) {
		return new Int16Array(bytes.buffer, bytes.byteOffset, evenLen / 2);
	}
	// Misaligned slice: realign by copying into a buffer that starts at offset 0.
	const aligned = new Uint8Array(evenLen);
	aligned.set(bytes.subarray(0, evenLen));
	return new Int16Array(aligned.buffer);
}
|
|
805
|
+
/** Quiescence timeout in ms — how long to wait after the last audio chunk before emitting `done`. */
|
|
806
|
+
const QUIESCENCE_MS = 500;
|
|
807
|
+
/**
|
|
808
|
+
* After `flush()`, how long to wait for the FIRST audio chunk before
|
|
809
|
+
* giving up and emitting `done`. Greeting and short replies hit this
|
|
810
|
+
* path: `flush()` runs immediately after `sendText()`, so audio TTFB
|
|
811
|
+
* exceeds the 500 ms quiescence window. Once the first chunk arrives,
|
|
812
|
+
* we transition to the shorter quiescence timeout.
|
|
813
|
+
*/
|
|
814
|
+
const FIRST_AUDIO_TIMEOUT_MS = 5e3;
|
|
815
|
+
/**
 * Wait for the WebSocket `open` event; reject on first `error`.
 *
 * Whichever of the two events fires first removes the listener for the
 * other, so neither handler stays attached after the promise settles.
 * A rejection carries a `tts_connect_failed` error.
 */
function waitForOpen(ws) {
	return new Promise((resolve, reject) => {
		function handleOpen() {
			ws.removeListener("error", handleError);
			resolve();
		}
		function handleError(err) {
			ws.removeListener("open", handleOpen);
			reject(makeTtsError("tts_connect_failed", `Rime TTS: connect failed: ${err?.message ?? String(err)}`));
		}
		ws.once("open", handleOpen);
		ws.once("error", handleError);
	});
}
|
|
830
|
+
/**
 * Handle one incoming WebSocket message frame.
 *
 * Extracted into a top-level function to keep `open()` under the cognitive
 * complexity limit; session state is reached through the callbacks passed in.
 *
 * Frames that are not valid JSON, or whose JSON value is not an object
 * (for example `null` or a bare number), are ignored. Without that check a
 * `null` payload would throw a TypeError on `msg.type` inside the socket's
 * `message` listener.
 *
 * @param raw            frame payload; non-strings are converted with `toString()`
 * @param emitter        receives `"audio"` (Int16Array) and `"error"` events
 * @param armQuiescence  called after a non-empty audio chunk while a timer is active
 * @param isActiveTimer  reports whether a done-detection timer is currently armed
 */
function handleRimeMessage(raw, emitter, armQuiescence, isActiveTimer) {
	let msg;
	try {
		msg = JSON.parse(typeof raw === "string" ? raw : raw.toString());
	} catch {
		return;
	}
	// JSON.parse happily returns null / numbers / strings; only objects carry a `type`.
	if (msg === null || typeof msg !== "object") return;
	if (msg.type === "chunk" && typeof msg.data === "string") {
		const pcm = base64ToPcm(msg.data);
		if (pcm.length > 0) {
			emitter.emit("audio", pcm);
			// Only re-arm when a timer is already running (i.e. after flush()).
			if (isActiveTimer()) armQuiescence();
		}
		return;
	}
	if (msg.type === "error") emitter.emit("error", makeTtsError("tts_stream_error", `Rime TTS: ${msg.message ?? "unknown error"}`));
}
|
|
853
|
+
/** Build a {@link TtsOpener} from resolved Rime descriptor options. */
|
|
854
|
+
function openRime(opts) {
|
|
855
|
+
return {
|
|
856
|
+
name: "rime",
|
|
857
|
+
async open(openOpts) {
|
|
858
|
+
const apiKey = openOpts.apiKey || process.env.RIME_API_KEY;
|
|
859
|
+
if (!apiKey) throw makeTtsError("tts_auth_failed", "Rime TTS: missing API key. Set RIME_API_KEY in the agent env.");
|
|
860
|
+
const sampleRate = assertSupportedSampleRate(openOpts.sampleRate);
|
|
861
|
+
const model = opts.model ?? "mistv2";
|
|
862
|
+
const lang = opts.language ?? "eng";
|
|
863
|
+
const voice = opts.voice ?? "cove";
|
|
864
|
+
const url = `wss://users-ws.rime.ai/ws2?speaker=${encodeURIComponent(voice)}&modelId=${encodeURIComponent(model)}&audioFormat=pcm&samplingRate=${sampleRate}&lang=${encodeURIComponent(lang)}`;
|
|
865
|
+
let ws;
|
|
866
|
+
try {
|
|
867
|
+
ws = new WsWebSocket(url, { headers: { Authorization: `Bearer ${apiKey}` } });
|
|
868
|
+
} catch (cause) {
|
|
869
|
+
throw makeTtsError("tts_connect_failed", `Rime TTS: failed to create WebSocket: ${cause instanceof Error ? cause.message : String(cause)}`);
|
|
870
|
+
}
|
|
871
|
+
await waitForOpen(ws);
|
|
872
|
+
const emitter = createNanoEvents();
|
|
873
|
+
let closed = false;
|
|
874
|
+
let doneEmitted = false;
|
|
875
|
+
/**
|
|
876
|
+
* After `flush()`, we arm a timer that fires `done`. Initial timeout is
|
|
877
|
+
* `FIRST_AUDIO_TIMEOUT_MS` to give Rime headroom on TTFB; the first
|
|
878
|
+
* chunk swaps it for a shorter `QUIESCENCE_MS` window that resets on
|
|
879
|
+
* each subsequent chunk. `cancel()` emits `done` synchronously.
|
|
880
|
+
*/
|
|
881
|
+
let quiescenceTimer = null;
|
|
882
|
+
const clearQuiescence = () => {
|
|
883
|
+
if (quiescenceTimer !== null) {
|
|
884
|
+
clearTimeout(quiescenceTimer);
|
|
885
|
+
quiescenceTimer = null;
|
|
886
|
+
}
|
|
887
|
+
};
|
|
888
|
+
const emitDoneOnce = () => {
|
|
889
|
+
clearQuiescence();
|
|
890
|
+
if (doneEmitted || closed) return;
|
|
891
|
+
doneEmitted = true;
|
|
892
|
+
emitter.emit("done");
|
|
893
|
+
};
|
|
894
|
+
const armQuiescence = () => {
|
|
895
|
+
clearQuiescence();
|
|
896
|
+
quiescenceTimer = setTimeout(emitDoneOnce, QUIESCENCE_MS);
|
|
897
|
+
};
|
|
898
|
+
const armFirstAudioTimer = () => {
|
|
899
|
+
clearQuiescence();
|
|
900
|
+
quiescenceTimer = setTimeout(emitDoneOnce, FIRST_AUDIO_TIMEOUT_MS);
|
|
901
|
+
};
|
|
902
|
+
ws.on("message", (raw) => {
|
|
903
|
+
if (closed) return;
|
|
904
|
+
handleRimeMessage(raw, emitter, armQuiescence, () => quiescenceTimer !== null);
|
|
905
|
+
});
|
|
906
|
+
ws.on("error", (err) => {
|
|
907
|
+
if (closed) return;
|
|
908
|
+
emitter.emit("error", makeTtsError("tts_stream_error", `Rime TTS stream error: ${err?.message ?? String(err)}`));
|
|
909
|
+
});
|
|
910
|
+
ws.on("close", () => {
|
|
911
|
+
if (closed) return;
|
|
912
|
+
emitDoneOnce();
|
|
913
|
+
});
|
|
914
|
+
const close = async () => {
|
|
915
|
+
if (closed) return;
|
|
916
|
+
closed = true;
|
|
917
|
+
clearQuiescence();
|
|
918
|
+
try {
|
|
919
|
+
ws.close();
|
|
920
|
+
} catch {}
|
|
921
|
+
};
|
|
922
|
+
if (openOpts.signal.aborted) close();
|
|
923
|
+
else openOpts.signal.addEventListener("abort", () => void close(), { once: true });
|
|
1182
924
|
return {
|
|
1183
925
|
sendText(text) {
|
|
1184
926
|
if (closed || text.length === 0) return;
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
transcript: text,
|
|
1189
|
-
continue: true
|
|
1190
|
-
}).catch(ignoreRejection);
|
|
927
|
+
if (ws.readyState !== WsWebSocket.OPEN) return;
|
|
928
|
+
doneEmitted = false;
|
|
929
|
+
ws.send(JSON.stringify({ text }));
|
|
1191
930
|
},
|
|
1192
931
|
flush() {
|
|
1193
|
-
if (closed
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
continue: false
|
|
1198
|
-
}).catch(ignoreRejection);
|
|
1199
|
-
rotatePending = true;
|
|
932
|
+
if (closed) return;
|
|
933
|
+
if (ws.readyState !== WsWebSocket.OPEN) return;
|
|
934
|
+
ws.send(JSON.stringify({ text: "." }));
|
|
935
|
+
armFirstAudioTimer();
|
|
1200
936
|
},
|
|
1201
937
|
cancel() {
|
|
1202
938
|
if (closed) return;
|
|
1203
|
-
if (
|
|
939
|
+
if (ws.readyState === WsWebSocket.OPEN) ws.send(JSON.stringify({ operation: "clear" }));
|
|
1204
940
|
emitDoneOnce();
|
|
1205
|
-
rotatePending = true;
|
|
1206
941
|
},
|
|
1207
942
|
on(event, fn) {
|
|
1208
943
|
return emitter.on(event, fn);
|
|
1209
944
|
},
|
|
1210
945
|
close,
|
|
1211
|
-
_ws: ws
|
|
1212
|
-
_currentContextId: () => context.contextId
|
|
946
|
+
_ws: ws
|
|
1213
947
|
};
|
|
1214
948
|
}
|
|
1215
949
|
};
|
|
@@ -1225,53 +959,785 @@ function openCartesia(opts) {
|
|
|
1225
959
|
* resolvers here to turn each descriptor into its openable / callable
|
|
1226
960
|
* host-side counterpart, importing the third-party SDK only at that point.
|
|
1227
961
|
*
|
|
1228
|
-
* The guest sandbox never imports these functions, which is how the agent
|
|
1229
|
-
* bundle stays free of `@ai-sdk/anthropic` / `assemblyai` /
|
|
1230
|
-
* `@cartesia/cartesia-js`.
|
|
962
|
+
* The guest sandbox never imports these functions, which is how the agent
|
|
963
|
+
* bundle stays free of `@ai-sdk/anthropic` / `assemblyai` /
|
|
964
|
+
* `@cartesia/cartesia-js`.
|
|
965
|
+
*/
|
|
966
|
+
/**
 * Look up a provider API key: agent env first (set via `aai secret put` or
 * `.env`), then the host's `process.env` as a fallback for self-hosted mode.
 *
 * Only `null`/`undefined` fall through to the next source, so an agent env
 * entry that is an empty string is returned as-is. Returns `""` if neither
 * source defines the variable — the caller decides whether that's fatal.
 */
function resolveApiKey(envVar, env) {
	const fromAgent = env[envVar];
	if (fromAgent !== null && fromAgent !== undefined) return fromAgent;
	const fromHost = process.env[envVar];
	if (fromHost !== null && fromHost !== undefined) return fromHost;
	return "";
}
|
|
974
|
+
/**
 * Resolve an {@link SttProvider} descriptor into a host-side opener.
 *
 * Dispatches on `descriptor.kind` and passes `descriptor.options` to the
 * matching opener factory; an unrecognised kind throws an `Error` naming the
 * supported kinds.
 */
function resolveStt(descriptor) {
	const { kind } = descriptor;
	if (kind === ASSEMBLYAI_KIND) return openAssemblyAI(descriptor.options);
	if (kind === DEEPGRAM_KIND) return openDeepgram(descriptor.options);
	throw new Error(`Unknown STT provider kind: "${kind}". Supported: ${ASSEMBLYAI_KIND}, ${DEEPGRAM_KIND}.`);
}
|
|
982
|
+
/**
 * Resolve a {@link TtsProvider} descriptor into a host-side opener.
 *
 * Dispatches on `descriptor.kind` and passes `descriptor.options` to the
 * matching opener factory; an unrecognised kind throws an `Error` naming the
 * supported kinds.
 */
function resolveTts(descriptor) {
	const { kind } = descriptor;
	if (kind === CARTESIA_KIND) return openCartesia(descriptor.options);
	if (kind === RIME_KIND) return openRime(descriptor.options);
	throw new Error(`Unknown TTS provider kind: "${kind}". Supported: ${CARTESIA_KIND}, ${RIME_KIND}.`);
}
|
|
990
|
+
/**
 * Resolve an {@link LlmProvider} descriptor into a Vercel AI SDK
 * {@link LanguageModel}.
 *
 * The API key is looked up with `resolveApiKey("ANTHROPIC_API_KEY", env)`.
 * A missing key throws here — the pipeline session would fail on first
 * `streamText` call otherwise, and the error is clearer at construction.
 * An unrecognised `descriptor.kind` also throws.
 */
function resolveLlm(descriptor, env) {
	if (descriptor.kind !== ANTHROPIC_KIND) {
		throw new Error(`Unknown LLM provider kind: "${descriptor.kind}". Supported: ${ANTHROPIC_KIND}.`);
	}
	const options = descriptor.options;
	const apiKey = resolveApiKey("ANTHROPIC_API_KEY", env);
	if (!apiKey) {
		throw new Error("Anthropic LLM: missing API key. Set ANTHROPIC_API_KEY in the agent env.");
	}
	const anthropic = createAnthropic({
		apiKey,
		baseURL: "https://api.anthropic.com/v1"
	});
	return anthropic(options.model);
}
|
|
1012
|
+
//#endregion
|
|
1013
|
+
//#region host/runtime-config.ts
|
|
1014
|
+
/**
|
|
1015
|
+
* Runtime dependencies injected into the session pipeline.
|
|
1016
|
+
*
|
|
1017
|
+
* Defines the {@link Logger} interface, a default {@link consoleLogger},
|
|
1018
|
+
* and the {@link S2SConfig} for Speech-to-Speech endpoint configuration.
|
|
1019
|
+
*/
|
|
1020
|
+
/**
 * Adapt a console method to the logger signature `(msg, ctx?)`.
 *
 * A falsy `ctx` is omitted from the call, so the console method receives
 * `msg` alone rather than `msg` plus a trailing `undefined`.
 */
function consoleLog(fn) {
	return (msg, ctx) => {
		if (ctx) return fn(msg, ctx);
		return fn(msg);
	};
}
/** Default console-backed logger. */
const consoleLogger = {
	info: consoleLog(console.log),
	warn: consoleLog(console.warn),
	error: consoleLog(console.error),
	debug: consoleLog(console.debug)
};
|
|
1030
|
+
/**
 * Structured JSON logger for production diagnostics. Each log entry is a
 * single-line JSON object with `timestamp`, `level`, `msg`, and any
 * caller-provided context fields.
 *
 * Context fields are merged in after the base fields, so a context key with
 * the same name overrides it. `error` and `warn` entries are written to
 * stderr; every other level goes to stdout.
 */
function jsonLog(level) {
	const toStderr = level === "error" || level === "warn";
	return (msg, ctx) => {
		const entry = {
			timestamp: new Date().toISOString(),
			level,
			msg
		};
		if (ctx) Object.assign(entry, ctx);
		const stream = toStderr ? process.stderr : process.stdout;
		stream.write(`${JSON.stringify(entry)}\n`);
	};
}
/** JSON-lines logger built from {@link jsonLog}, one writer per level. */
const jsonLogger = {
	info: jsonLog("info"),
	warn: jsonLog("warn"),
	error: jsonLog("error"),
	debug: jsonLog("debug")
};
|
|
1052
|
+
/**
 * Default S2S endpoint configuration: the voice WebSocket URL plus the
 * input/output sample rates, taken from the shared STT/TTS defaults.
 */
const DEFAULT_S2S_CONFIG = {
	wssUrl: "wss://agents.assemblyai.com/v1/voice",
	inputSampleRate: DEFAULT_STT_SAMPLE_RATE,
	outputSampleRate: DEFAULT_TTS_SAMPLE_RATE
};
|
|
1058
|
+
//#endregion
|
|
1059
|
+
//#region host/session-core.ts
|
|
1060
|
+
/** Dispatch duration (ms) at or above which `reply_done` handling is logged as slow. */
const REPLY_DONE_SLOW_THRESHOLD_MS = 50;
/**
 * Create the session core: the object that sits between a client sink
 * (`opts.client`) and a transport (`opts.transport`) and tracks per-reply
 * state, conversation history, tool execution and the idle timer.
 *
 * Options read by this function:
 * - `id`, `agent`             identifiers used in log context
 * - `agentConfig`             `idleTimeoutMs` (default 300 000; `0` or a
 *                             non-finite value disables the timer) and `maxSteps`
 * - `client`                  `event`, `playAudioChunk`, `playAudioDone`
 * - `transport`               `start`, `stop`, `sendUserAudio`, `cancelReply`,
 *                             `sendToolResult`
 * - `executeTool`             `(name, args, sessionId, history) => Promise<result>`
 * - `logger`                  optional; defaults to `consoleLogger`
 * - `maxHistory`              optional history cap (default 200; `0` or less = unbounded)
 *
 * @returns the session object (lifecycle methods plus `on*` callbacks).
 */
function createSessionCore(opts) {
	const log = opts.logger ?? consoleLogger;
	const maxHistory = opts.maxHistory ?? 200;
	const idleMs = (() => {
		const raw = opts.agentConfig.idleTimeoutMs ?? 3e5;
		return raw === 0 || !Number.isFinite(raw) ? 0 : raw;
	})();
	let reply = {
		currentReplyId: null,
		pendingTools: [],
		toolCallCount: 0
	};
	let history = [];
	// Chain of in-flight tool executions for the current reply, or null.
	let turnPromise = null;
	let idleTimer = null;
	let stopped = false;
	function emit(event) {
		opts.client.event(event);
	}
	/** (Re)start the idle countdown; a no-op once stopped or when disabled. */
	function resetIdle() {
		if (stopped || idleMs <= 0) return;
		if (idleTimer) clearTimeout(idleTimer);
		idleTimer = setTimeout(() => {
			log.info("session idle timeout", { sid: opts.id });
			emit({ type: "idle_timeout" });
		}, idleMs);
	}
	/** Append to history, dropping the oldest entries beyond `maxHistory`. */
	function pushMessages(...msgs) {
		history.push(...msgs);
		if (maxHistory > 0 && history.length > maxHistory) history.splice(0, history.length - maxHistory);
	}
	/** Start tracking a new reply and forget the previous tool chain. */
	function beginReply(replyId) {
		reply = {
			currentReplyId: replyId,
			pendingTools: [],
			toolCallCount: 0
		};
		turnPromise = null;
	}
	/** Drop all state for the current reply (no reply is active afterwards). */
	function cancelReply() {
		reply = {
			currentReplyId: null,
			pendingTools: [],
			toolCallCount: 0
		};
	}
	/** Tell the client the reply is complete and log slow dispatches. */
	function flushReply(startMs, hadTurnPromise) {
		const stepsUsed = reply.toolCallCount;
		if (stepsUsed > 0) log.info("Turn complete", {
			steps: stepsUsed,
			agent: opts.agent
		});
		opts.client.playAudioDone();
		emit({ type: "reply_done" });
		reply.currentReplyId = null;
		const durationMs = Date.now() - startMs;
		if (durationMs >= REPLY_DONE_SLOW_THRESHOLD_MS) log.warn("slow reply_done dispatch", {
			sid: opts.id,
			agent: opts.agent,
			durationMs,
			hadTurnPromise
		});
	}
	return {
		id: opts.id,
		async start() {
			resetIdle();
			await opts.transport.start();
		},
		async stop() {
			if (stopped) return;
			stopped = true;
			if (idleTimer) {
				clearTimeout(idleTimer);
				idleTimer = null;
			}
			try {
				if (turnPromise !== null) await turnPromise;
			} finally {
				// Always release the transport, even if the pending turn rejected.
				await opts.transport.stop();
			}
		},
		onAudio(bytes) {
			resetIdle();
			opts.transport.sendUserAudio(bytes);
		},
		onAudioReady() {},
		onCancel() {
			opts.transport.cancelReply();
			emit({ type: "cancelled" });
		},
		onReset() {
			cancelReply();
			history = [];
			emit({ type: "reset" });
		},
		onHistory(messages) {
			pushMessages(...messages);
		},
		onReplyStarted(replyId) {
			beginReply(replyId);
		},
		onReplyDone() {
			const startMs = Date.now();
			const doneReplyId = reply.currentReplyId;
			if (doneReplyId === null) {
				log.debug("Dropping duplicate reply.done (no active reply)");
				return;
			}
			const hadTurnPromise = turnPromise !== null;
			const sendPending = () => {
				// The reply changed while tools were running: discard their results.
				if (reply.currentReplyId !== doneReplyId) {
					reply.pendingTools = [];
					return;
				}
				if (reply.pendingTools.length > 0) {
					for (const tool of reply.pendingTools) opts.transport.sendToolResult(tool.callId, tool.result);
					reply.pendingTools = [];
				} else flushReply(startMs, hadTurnPromise);
			};
			if (hadTurnPromise) {
				// Nothing awaits this chain, so failures must be handled here;
				// otherwise they surface as unhandled promise rejections.
				turnPromise.then(sendPending).catch((err) => {
					log.error("reply_done dispatch failed", {
						sid: opts.id,
						error: err instanceof Error ? err.message : String(err)
					});
				});
			} else sendPending();
		},
		onCancelled() {
			cancelReply();
			emit({ type: "cancelled" });
		},
		onAudioChunk(bytes) {
			opts.client.playAudioChunk(bytes);
		},
		onAudioDone() {
			opts.client.playAudioDone();
		},
		onUserTranscript(text) {
			emit({
				type: "user_transcript",
				text
			});
			pushMessages({
				role: "user",
				content: text
			});
		},
		onAgentTranscript(text, interrupted) {
			emit({
				type: "agent_transcript",
				text
			});
			// Interrupted agent turns are shown to the client but not kept in history.
			if (!interrupted) pushMessages({
				role: "assistant",
				content: text
			});
		},
		onToolCall(callId, name, args) {
			emit({
				type: "tool_call",
				toolCallId: callId,
				toolName: name,
				args
			});
			if (reply.currentReplyId === null) {
				log.warn("tool_call with no active reply", {
					sid: opts.id,
					name
				});
				return;
			}
			reply.toolCallCount++;
			const maxSteps = opts.agentConfig.maxSteps;
			if (maxSteps !== void 0 && reply.toolCallCount > maxSteps) {
				// Over budget: the tool is not executed; an error result is queued
				// for the transport and the client is sent an empty result.
				log.info("maxSteps exceeded; refusing tool call", {
					toolCallCount: reply.toolCallCount,
					maxSteps
				});
				reply.pendingTools.push({
					callId,
					result: JSON.stringify({ error: "Maximum tool steps reached. Please respond to the user now." })
				});
				emit({
					type: "tool_call_done",
					toolCallId: callId,
					result: "{}"
				});
				return;
			}
			// `reply` is read when the tool settles, so the result is queued on
			// whichever reply object is current at that moment.
			const p = (async () => {
				try {
					const result = await opts.executeTool(name, args, opts.id, history);
					reply.pendingTools.push({
						callId,
						result
					});
					emit({
						type: "tool_call_done",
						toolCallId: callId,
						result
					});
				} catch (err) {
					const message = err instanceof Error ? err.message : String(err);
					reply.pendingTools.push({
						callId,
						result: JSON.stringify({ error: message })
					});
					emit({
						type: "tool_call_done",
						toolCallId: callId,
						result: message
					});
				}
			})();
			// Serialise completion behind earlier tool calls of the same reply.
			turnPromise = (turnPromise ?? Promise.resolve()).then(() => p);
		},
		onError(code, message) {
			emit({
				type: "error",
				code,
				message
			});
		},
		onSpeechStarted() {
			emit({ type: "speech_started" });
		},
		onSpeechStopped() {
			emit({ type: "speech_stopped" });
		}
	};
}
|
|
1285
|
+
//#endregion
|
|
1286
|
+
//#region host/tool-executor.ts
|
|
1287
|
+
/**
|
|
1288
|
+
* Tool execution — validates arguments and invokes tool handlers.
|
|
1289
|
+
*
|
|
1290
|
+
* {@link executeToolCall} is the single entry point used by both the
|
|
1291
|
+
* direct (self-hosted) runtime and the platform sandbox sidecar.
|
|
1292
|
+
*/
|
|
1293
|
+
/** Return a promise that resolves (with no value) from a zero-delay `setTimeout`. */
const yieldTick = () =>
  new Promise((resolve) => {
    setTimeout(resolve, 0);
  });
|
|
1294
|
+
/**
 * Build the context object passed to a tool handler.
 *
 * @param opts - Source options; reads `env`, `state`, `kv`, `messages`,
 *   `sessionId` and (lazily, on each call) `send`.
 * @returns An object exposing `env`, `state` (defaults to `{}`), a `kv`
 *   getter that throws `Error("KV not available")` when no store was given,
 *   `messages` (defaults to `[]`), `sessionId` (defaults to `""`) and a
 *   `send(event, data)` method that forwards to `opts.send` when present.
 */
function buildToolContext(opts) {
  const { env, state, kv: store, messages: history, sessionId: sid } = opts;
  return {
    env,
    state: state ?? {},
    get kv() {
      if (store) return store;
      throw new Error("KV not available");
    },
    messages: history ?? [],
    sessionId: sid ?? "",
    send(event, data) {
      // Looked up at call time, so a missing `send` is simply a no-op.
      opts.send?.(event, data);
    }
  };
}
|
|
1310
|
+
/**
 * Validate `args` against the tool's parameter schema and run the tool.
 *
 * @param name - Tool name, used in error and log messages.
 * @param args - Raw arguments; parsed with `tool.parameters` (or
 *   `EMPTY_PARAMS` when the tool declares none).
 * @param options - Must carry `tool`; is also passed to `buildToolContext`
 *   and may carry `logger`.
 * @returns A string: the tool's string result as-is, the JSON encoding of any
 *   other result, `"null"` when the result is nullish or has no JSON
 *   representation, or a `toolError(...)` payload on validation failure,
 *   timeout, or a thrown error. Never rejects for tool failures.
 */
async function executeToolCall(name, args, options) {
  const { tool } = options;
  const parsed = (tool.parameters ?? EMPTY_PARAMS).safeParse(args);
  if (!parsed.success) {
    const issues = (parsed.error?.issues ?? [])
      .map((i) => `${i.path.map(String).join(".")}: ${i.message}`)
      .join(", ");
    return toolError(`Invalid arguments for tool "${name}": ${issues}`);
  }
  try {
    const ctx = buildToolContext(options);
    await yieldTick();
    const result = await pTimeout(Promise.resolve(tool.execute(parsed.data, ctx)), {
      milliseconds: TOOL_EXECUTION_TIMEOUT_MS,
      message: `Tool "${name}" timed out after ${TOOL_EXECUTION_TIMEOUT_MS}ms`
    });
    await yieldTick();
    if (result == null) return "null";
    if (typeof result === "string") return result;
    // JSON.stringify yields `undefined` (not a string) for functions and
    // symbols; fall back to "null" so the return value is always a string.
    return JSON.stringify(result) ?? "null";
  } catch (err) {
    const log = options.logger;
    if (log) log.warn("Tool execution failed", {
      tool: name,
      error: errorDetail(err)
    });
    else console.warn(`[tool-executor] Tool execution failed: ${name}`, err);
    return toolError(errorMessage(err));
  }
}
|
|
1334
|
+
//#endregion
|
|
1335
|
+
//#region host/to-vercel-tools.ts
|
|
1336
|
+
/**
|
|
1337
|
+
* Converts agent {@link ToolSchema}[] to Vercel AI SDK tools with `execute`
|
|
1338
|
+
* delegation to the agent's {@link ExecuteTool} function.
|
|
1339
|
+
*
|
|
1340
|
+
* The pipeline orchestrator passes the output to `streamText({ tools })`.
|
|
1341
|
+
* Each produced tool's `execute` closure calls
|
|
1342
|
+
* `ctx.executeTool(name, args, sessionId, messages(), { signal, toolCallId })`,
|
|
1343
|
+
* so the existing agent tool infrastructure (argument validation, KV, hooks,
|
|
1344
|
+
* timeout) remains the single source of truth for tool behavior.
|
|
1345
|
+
*
|
|
1346
|
+
* Per-call `options.abortSignal` (forwarded by `streamText` when the
|
|
1347
|
+
* outer turn is aborted, e.g. barge-in) takes precedence over the
|
|
1348
|
+
* bag-level `ctx.signal` so individual invocations respect streamText
|
|
1349
|
+
* aborts.
|
|
1231
1350
|
*/
|
|
1232
1351
|
/**
 * Convert an array of {@link ToolSchema} to a Vercel AI SDK `ToolSet`
 * (record keyed by tool name).
 *
 * Each entry is built with `tool()` and `inputSchema: jsonSchema(...)` around
 * the schema's `parameters`. Its `execute` delegates to `ctx.executeTool`,
 * passing the tool name, the arguments (`{}` when nullish), `ctx.sessionId`,
 * a copy of `ctx.messages()`, and an options bag holding `signal` and
 * `toolCallId` only when they are defined.
 */
function toVercelTools(schemas, ctx) {
  const toolSet = {};
  for (const schema of schemas) {
    const execute = async (args, options) => {
      const input = args ?? {};
      // The per-call abort signal wins over the bag-level one.
      const signal = options.abortSignal ?? ctx.signal;
      const callOpts = {};
      if (signal !== undefined) callOpts.signal = signal;
      if (options.toolCallId !== undefined) callOpts.toolCallId = options.toolCallId;
      return ctx.executeTool(schema.name, input, ctx.sessionId, ctx.messages().slice(), callOpts);
    };
    toolSet[schema.name] = tool({
      description: schema.description,
      inputSchema: jsonSchema(schema.parameters),
      execute
    });
  }
  return toolSet;
}
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1375
|
+
//#endregion
|
|
1376
|
+
//#region host/transports/pipeline-transport.ts
|
|
1377
|
+
/**
 * Map a conversation message to a `{ role, content }` pair. Any role other
 * than `"user"` is emitted as `"assistant"`; `content` is passed through.
 */
function toModelMessage(m) {
  const role = m.role === "user" ? "user" : "assistant";
  return { role, content: m.content };
}
|
|
1387
|
+
/** Create a pipeline-mode Transport (STT → LLM → TTS). */
|
|
1388
|
+
function createPipelineTransport(opts) {
|
|
1389
|
+
const log = opts.logger ?? consoleLogger;
|
|
1390
|
+
const sttSampleRate = opts.sttSampleRate ?? 16e3;
|
|
1391
|
+
const ttsSampleRate = opts.ttsSampleRate ?? 24e3;
|
|
1392
|
+
const maxSteps = opts.maxSteps ?? 5;
|
|
1393
|
+
const toolChoice = opts.toolChoice ?? "auto";
|
|
1394
|
+
const toolSchemas = opts.toolSchemas ?? [];
|
|
1395
|
+
const executeTool = opts.executeTool ?? (async () => {
|
|
1396
|
+
throw new Error("No executeTool provided");
|
|
1397
|
+
});
|
|
1398
|
+
const { callbacks, sessionConfig } = opts;
|
|
1399
|
+
const systemPrompt = sessionConfig.systemPrompt;
|
|
1400
|
+
const sessionAbort = new AbortController();
|
|
1401
|
+
let audioReady = false;
|
|
1402
|
+
let terminated = false;
|
|
1403
|
+
let sttSession = null;
|
|
1404
|
+
let ttsSession = null;
|
|
1405
|
+
let turnController = null;
|
|
1406
|
+
let nextReplyId = 0;
|
|
1407
|
+
const conversationMessages = sessionConfig.history ? [...sessionConfig.history] : [];
|
|
1408
|
+
let turnPromise = null;
|
|
1409
|
+
const sttSubs = [];
|
|
1410
|
+
const ttsSubs = [];
|
|
1411
|
+
/** Append messages to `conversationMessages`, then drop the oldest entries so at most 200 remain. */
function pushMessages(...msgs) {
  conversationMessages.push(...msgs);
  const overflow = conversationMessages.length - 200;
  if (overflow > 0) conversationMessages.splice(0, overflow);
}
|
|
1415
|
+
/**
 * Extend `turnPromise` so it settles with `p` after the previous chain has
 * fulfilled. `p` itself is already running when it is passed in.
 */
function chainTurn(p) {
  const previous = turnPromise ?? Promise.resolve();
  turnPromise = previous.then(() => p);
}
|
|
1418
|
+
/** Forward an error `code` and `message` to `callbacks.onError`. */
function emitError(code, message) {
  const { onError } = callbacks;
  onError.call(callbacks, code, message);
}
|
|
1421
|
+
/**
 * Tear down after an unrecoverable provider error: abort the in-flight turn
 * (if any), cancel TTS, notify `callbacks.onCancelled`, then abort the
 * session-level signal. Idempotent — the `terminated` flag makes every call
 * after the first a no-op.
 */
function terminate() {
  if (terminated) return;
  terminated = true;
  const inFlight = turnController;
  if (inFlight !== null) {
    inFlight.abort();
    turnController = null;
  }
  ttsSession?.cancel();
  callbacks.onCancelled();
  sessionAbort.abort();
}
|
|
1436
|
+
/**
 * STT partial-transcript handler. When a turn is in flight, treats the
 * partial as a barge-in: aborts the turn, cancels TTS and notifies
 * `callbacks.onCancelled`. The partial text itself is not used.
 */
function onSttPartial(_text) {
  const idle = turnController === null;
  if (terminated || idle) return;
  log.info("Pipeline barge-in", { sid: opts.sid });
  turnController.abort();
  turnController = null;
  ttsSession?.cancel();
  callbacks.onCancelled();
}
|
|
1445
|
+
/**
 * STT final-transcript handler. Ignores whitespace-only text. Otherwise it
 * replaces any in-flight turn (abort, TTS cancel, `onCancelled`), reports the
 * original `text` via `callbacks.onUserTranscript`, and starts a new turn
 * with the trimmed text, chaining it onto `turnPromise`.
 */
function onSttFinal(text) {
  if (terminated) return;
  const utterance = text.trim();
  if (utterance === "") return;
  if (turnController !== null) {
    log.info("Pipeline replacing in-flight turn", { sid: opts.sid });
    turnController.abort();
    turnController = null;
    ttsSession?.cancel();
    callbacks.onCancelled();
  }
  callbacks.onUserTranscript(text);
  // A crashed turn is logged rather than propagated down the chain.
  const turn = runTurn(utterance).catch((err) => {
    log.error("Pipeline turn crashed", {
      error: errorMessage(err),
      sid: opts.sid
    });
  });
  chainTurn(turn);
}
|
|
1464
|
+
/** STT error handler: log it, report it via `emitError("stt", ...)`, then terminate the transport. */
function onSttError(err) {
  if (terminated) return;
  const details = {
    code: err.code,
    message: err.message,
    sid: opts.sid
  };
  log.error("STT error", details);
  emitError("stt", err.message);
  terminate();
}
|
|
1474
|
+
/** TTS error handler: log it, report it via `emitError("tts", ...)`, then terminate the transport. */
function onTtsError(err) {
  if (terminated) return;
  const details = {
    code: err.code,
    message: err.message,
    sid: opts.sid
  };
  log.error("TTS error", details);
  emitError("tts", err.message);
  terminate();
}
|
|
1484
|
+
/**
 * Start `streamText` for one turn and feed every part of its `fullStream`
 * to `handleStreamPart` until the stream ends or `ctl` is aborted.
 *
 * Never rejects: a failure while the turn is not aborted is logged and
 * reported through `emitError("llm", ...)`; a failure after an abort is
 * dropped silently.
 */
async function consumeLlmStream(ctl, messages, tools, onDelta) {
  const { signal } = ctl;
  try {
    const { fullStream } = streamText({
      model: opts.llm,
      system: systemPrompt,
      messages,
      tools,
      toolChoice,
      stopWhen: stepCountIs(maxSteps),
      abortSignal: signal
    });
    for await (const part of fullStream) {
      if (signal.aborted) break;
      handleStreamPart(part, ctl, onDelta);
    }
  } catch (err) {
    if (signal.aborted) return;
    const msg = errorMessage(err);
    log.error("LLM streamText failed", {
      error: msg,
      sid: opts.sid
    });
    emitError("llm", msg);
  }
}
|
|
1510
|
+
/**
 * Route one part of the LLM stream.
 *
 * - `text-delta`: non-empty text goes to `onDelta` and to TTS.
 * - `tool-call`: reported via `callbacks.onToolCall` (missing id/name become
 *   `""`, missing input becomes `{}`).
 * - `error`: logged and reported via `emitError("llm", ...)`.
 * - anything else: ignored.
 */
function handleStreamPart(part, _ctl, onDelta) {
  const kind = part.type;
  if (kind === "text-delta") {
    const chunk = part.text ?? "";
    if (chunk.length > 0) {
      onDelta(chunk);
      ttsSession?.sendText(chunk);
    }
    return;
  }
  if (kind === "tool-call") {
    const toolInput = part.input ?? {};
    callbacks.onToolCall(part.toolCallId ?? "", part.toolName ?? "", toolInput);
    return;
  }
  if (kind === "error") {
    const msg = errorMessage(part.error);
    log.error("LLM stream error", {
      message: msg,
      sid: opts.sid
    });
    emitError("llm", msg);
  }
}
|
|
1536
|
+
/**
 * Flush TTS and wait for drain. Resolves on:
 *  - TTS emits `done`
 *  - `signal` aborts (barge-in / provider error / session stop)
 *  - PIPELINE_FLUSH_TIMEOUT_MS elapses
 * Resolves immediately if there is no TTS session or `signal` is already
 * aborted.
 *
 * Rejects only if subscribing to or flushing the TTS session throws
 * synchronously; in that case the timer and listeners are released first, so
 * no late "TTS flush timeout" warning fires for a wait that never started.
 */
function flushTtsAndWait(signal) {
  const tts = ttsSession;
  if (!tts) return Promise.resolve();
  if (signal.aborted) return Promise.resolve();
  return new Promise((resolve, reject) => {
    let off = null;
    let timer = null;
    const cleanup = () => {
      if (off) {
        off();
        off = null;
      }
      if (timer) {
        clearTimeout(timer);
        timer = null;
      }
      signal.removeEventListener("abort", onAbort);
    };
    const finish = () => {
      cleanup();
      resolve();
    };
    const onAbort = () => finish();
    try {
      signal.addEventListener("abort", onAbort, { once: true });
      off = tts.on("done", finish);
      timer = setTimeout(() => {
        log.warn("TTS flush timeout", {
          sid: opts.sid,
          timeoutMs: PIPELINE_FLUSH_TIMEOUT_MS
        });
        finish();
      }, PIPELINE_FLUSH_TIMEOUT_MS);
      tts.flush();
    } catch (err) {
      // Release everything registered so far before surfacing the failure.
      cleanup();
      reject(err);
    }
  });
}
|
|
1581
|
+
/**
 * Run one user turn: announce the reply, record the user message, stream the
 * LLM response (text deltas are accumulated here), then flush TTS and signal
 * `callbacks.onReplyDone`.
 *
 * If the turn's controller is aborted, the function returns without emitting
 * the agent transcript (when aborted during streaming) or `onReplyDone`.
 *
 * `turnController` is released in a `finally` block, so an exception thrown
 * by a callback or helper cannot leave it pointing at this finished turn. A
 * stale controller would make the next STT partial look like a barge-in.
 *
 * @param userText - Trimmed user utterance to append to the conversation.
 */
async function runTurn(userText) {
  const replyId = `pipeline-${++nextReplyId}`;
  callbacks.onReplyStarted(replyId);
  pushMessages({
    role: "user",
    content: userText
  });
  const ctl = new AbortController();
  turnController = ctl;
  try {
    const tools = toVercelTools(toolSchemas, {
      executeTool,
      sessionId: opts.sid,
      messages: () => conversationMessages,
      signal: ctl.signal
    });
    const messages = conversationMessages.map(toModelMessage);
    let accumulated = "";
    await consumeLlmStream(ctl, messages, tools, (delta) => {
      accumulated += delta;
    });
    if (ctl.signal.aborted) return;
    if (accumulated.length > 0) {
      callbacks.onAgentTranscript(accumulated, false);
      pushMessages({
        role: "assistant",
        content: accumulated
      });
    }
    await flushTtsAndWait(ctl.signal);
    if (ctl.signal.aborted) return;
    callbacks.onReplyDone();
  } finally {
    // Only clear the slot if a newer turn has not already replaced it.
    if (turnController === ctl) turnController = null;
  }
}
|
|
1620
|
+
/**
 * Speak the configured greeting as its own reply: announce the reply, emit
 * and record the greeting as an assistant message, send it to TTS, wait for
 * the flush, then signal `callbacks.onReplyDone` unless the greeting was
 * aborted meanwhile.
 *
 * `turnController` is released in a `finally` block, so an exception thrown
 * by a callback or by TTS cannot leave it pointing at this finished greeting.
 *
 * @param text - Greeting text to speak.
 */
async function runGreeting(text) {
  const replyId = `pipeline-greeting-${++nextReplyId}`;
  callbacks.onReplyStarted(replyId);
  const ctl = new AbortController();
  turnController = ctl;
  try {
    callbacks.onAgentTranscript(text, false);
    pushMessages({
      role: "assistant",
      content: text
    });
    ttsSession?.sendText(text);
    await flushTtsAndWait(ctl.signal);
    if (ctl.signal.aborted) return;
    callbacks.onReplyDone();
  } finally {
    // Only clear the slot if a newer turn has not already replaced it.
    if (turnController === ctl) turnController = null;
  }
}
|
|
1639
|
+
/**
 * Log and report a provider that failed to open.
 *
 * @param which - `"stt"` selects the "STT" log label; any other value is
 *   labelled "TTS". Also used as the error code passed to `emitError`.
 * @param reason - The rejection reason, converted with `errorMessage`.
 */
function reportOpenRejection(which, reason) {
  const msg = errorMessage(reason);
  const label = which === "stt" ? "STT" : "TTS";
  log.error(`${label} open failed`, {
    error: msg,
    sid: opts.sid
  });
  emitError(which, msg);
}
|
|
1647
|
+
/**
 * Take ownership of an opened STT session. When `teardown` is set the
 * session is closed instead (close errors are ignored). Otherwise it becomes
 * `sttSession`, and its `partial`, `final` and `error` subscriptions are
 * recorded in `sttSubs`.
 */
async function adoptStt(session, teardown) {
  if (teardown) {
    await session.close().catch(() => void 0);
    return;
  }
  sttSession = session;
  const handlers = [
    ["partial", onSttPartial],
    ["final", onSttFinal],
    ["error", onSttError]
  ];
  for (const [event, handler] of handlers) sttSubs.push(session.on(event, handler));
}
|
|
1657
|
+
/**
 * Take ownership of an opened TTS session. When `teardown` is set the
 * session is closed instead (close errors are ignored). Otherwise it becomes
 * `ttsSession`; `audio` events are forwarded to `callbacks.onAudioChunk` as
 * a `Uint8Array` view over the same bytes, and `error` events go to
 * `onTtsError`. Both subscriptions are recorded in `ttsSubs`.
 */
async function adoptTts(session, teardown) {
  if (teardown) {
    await session.close().catch(() => void 0);
    return;
  }
  ttsSession = session;
  const forwardAudio = (pcm) => {
    // Re-view the PCM bytes without copying.
    const bytes = new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength);
    callbacks.onAudioChunk(bytes);
  };
  ttsSubs.push(session.on("audio", forwardAudio));
  ttsSubs.push(session.on("error", onTtsError));
}
|
|
1668
|
+
/**
 * Open the STT and TTS providers concurrently and adopt the results.
 *
 * Each failed open is reported via `reportOpenRejection`. If either open
 * failed, or the session signal was aborted meanwhile, any session that did
 * open is closed rather than adopted. A failure without an abort also
 * terminates the transport.
 */
async function openProviders() {
  const { signal } = sessionAbort;
  const sttOpen = opts.stt.open({
    sampleRate: sttSampleRate,
    apiKey: opts.providerKeys.stt,
    sttPrompt: opts.sttPrompt,
    signal
  });
  const ttsOpen = opts.tts.open({
    sampleRate: ttsSampleRate,
    apiKey: opts.providerKeys.tts,
    signal
  });
  const [sttResult, ttsResult] = await Promise.allSettled([sttOpen, ttsOpen]);
  const sttFailed = sttResult.status === "rejected";
  const ttsFailed = ttsResult.status === "rejected";
  if (sttFailed) reportOpenRejection("stt", sttResult.reason);
  if (ttsFailed) reportOpenRejection("tts", ttsResult.reason);
  // Read the abort flag only after the failures have been reported.
  const aborted = signal.aborted;
  const teardown = aborted || sttFailed || ttsFailed;
  if (!sttFailed) await adoptStt(sttResult.value, teardown);
  if (!ttsFailed) await adoptTts(ttsResult.value, teardown);
  if (!aborted && (sttFailed || ttsFailed)) terminate();
}
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
|
|
1689
|
+
/**
 * Mark audio as ready (once, and never after termination). Unless
 * `opts.skipGreeting` is set or no greeting is configured, start the greeting
 * and chain it onto `turnPromise`; a greeting failure is logged, not thrown.
 */
function onAudioReady() {
  if (terminated || audioReady) return;
  audioReady = true;
  if (opts.skipGreeting || !sessionConfig.greeting) return;
  const greetingRun = runGreeting(sessionConfig.greeting).catch((err) => {
    log.error("Pipeline greeting failed", {
      error: errorMessage(err),
      sid: opts.sid
    });
  });
  chainTurn(greetingRun);
}
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
if (
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1702
|
+
return {
|
|
1703
|
+
async start() {
|
|
1704
|
+
await openProviders();
|
|
1705
|
+
callbacks.onSessionReady?.(opts.sid);
|
|
1706
|
+
onAudioReady();
|
|
1707
|
+
},
|
|
1708
|
+
async stop() {
|
|
1709
|
+
if (sessionAbort.signal.aborted) return;
|
|
1710
|
+
sessionAbort.abort();
|
|
1711
|
+
turnController?.abort();
|
|
1712
|
+
for (const off of sttSubs) off();
|
|
1713
|
+
for (const off of ttsSubs) off();
|
|
1714
|
+
sttSubs.length = 0;
|
|
1715
|
+
ttsSubs.length = 0;
|
|
1716
|
+
if (turnPromise !== null) await turnPromise;
|
|
1717
|
+
await sttSession?.close().catch(() => {});
|
|
1718
|
+
await ttsSession?.close().catch(() => {});
|
|
1719
|
+
},
|
|
1720
|
+
sendUserAudio(bytes) {
|
|
1721
|
+
if (terminated || !audioReady) return;
|
|
1722
|
+
const offset = bytes.byteOffset;
|
|
1723
|
+
const length = bytes.byteLength;
|
|
1724
|
+
let pcm;
|
|
1725
|
+
if (offset % 2 === 0 && length % 2 === 0) pcm = new Int16Array(bytes.buffer, offset, length / 2);
|
|
1726
|
+
else {
|
|
1727
|
+
const copy = new Uint8Array(length - length % 2);
|
|
1728
|
+
copy.set(bytes.subarray(0, copy.byteLength));
|
|
1729
|
+
pcm = new Int16Array(copy.buffer);
|
|
1730
|
+
}
|
|
1731
|
+
sttSession?.sendAudio(pcm);
|
|
1732
|
+
},
|
|
1733
|
+
sendToolResult(_callId, _result) {},
|
|
1734
|
+
cancelReply() {
|
|
1735
|
+
if (terminated) return;
|
|
1736
|
+
turnController?.abort();
|
|
1737
|
+
turnController = null;
|
|
1738
|
+
ttsSession?.cancel();
|
|
1272
1739
|
}
|
|
1273
|
-
|
|
1274
|
-
}
|
|
1740
|
+
};
|
|
1275
1741
|
}
|
|
1276
1742
|
//#endregion
|
|
1277
1743
|
//#region host/s2s.ts
|
|
@@ -1326,72 +1792,59 @@ function parseS2sMessage(obj) {
|
|
|
1326
1792
|
const result = S2sMessageSchema.safeParse(obj);
|
|
1327
1793
|
return result.success ? result.data : void 0;
|
|
1328
1794
|
}
|
|
1329
|
-
function dispatchS2sMessage(
|
|
1795
|
+
/**
 * Translate one parsed S2S message into the matching `callbacks` call.
 *
 * @param callbacks - Receiver of the translated events.
 * @param msg - Parsed message; routed on `msg.type`. Unknown types and
 *   `session.updated` are ignored.
 * @param state - Mutable dispatch state; `speechActive` de-duplicates
 *   repeated speech started/stopped messages.
 * @param ctx - Carries `log` and, optionally, `sid` for the `reply.done` log
 *   line.
 */
function dispatchS2sMessage(callbacks, msg, state, ctx) {
  const kind = msg.type;
  if (kind === "session.ready") {
    callbacks.onSessionReady(msg.session_id);
    return;
  }
  if (kind === "input.speech.started") {
    if (state.speechActive) return;
    state.speechActive = true;
    callbacks.onSpeechStarted();
    return;
  }
  if (kind === "input.speech.stopped") {
    if (!state.speechActive) return;
    state.speechActive = false;
    callbacks.onSpeechStopped();
    return;
  }
  if (kind === "transcript.user") {
    callbacks.onUserTranscript(msg.text);
    return;
  }
  if (kind === "reply.started") {
    callbacks.onReplyStarted(msg.reply_id);
    return;
  }
  if (kind === "transcript.agent") {
    callbacks.onAgentTranscript(msg.text, msg.interrupted);
    return;
  }
  if (kind === "tool.call") {
    callbacks.onToolCall(msg.call_id, msg.name, msg.args);
    return;
  }
  if (kind === "reply.done") {
    const sidField = ctx.sid !== undefined ? { sid: ctx.sid } : {};
    ctx.log.info("S2S << reply.done", {
      ...sidField,
      status: msg.status ?? "completed"
    });
    // An interrupted reply is surfaced as a cancellation, not a completion.
    if (msg.status === "interrupted") callbacks.onCancelled();
    else callbacks.onReplyDone();
    return;
  }
  if (kind === "session.error") {
    const expired = msg.code === "session_not_found" || msg.code === "session_forbidden";
    if (expired) callbacks.onSessionExpired();
    else callbacks.onError(new Error(msg.message));
    return;
  }
  if (kind === "error") {
    callbacks.onError(new Error(msg.message));
  }
}
|
|
1389
1843
|
function connectS2s(opts) {
|
|
1390
|
-
const { apiKey, config, createWebSocket, logger: log = consoleLogger, sid } = opts;
|
|
1844
|
+
const { apiKey, config, createWebSocket, callbacks, logger: log = consoleLogger, sid } = opts;
|
|
1391
1845
|
return new Promise((resolve, reject) => {
|
|
1392
1846
|
log.info("S2S connecting", { url: config.wssUrl });
|
|
1393
1847
|
const ws = createWebSocket(config.wssUrl, { headers: { Authorization: `Bearer ${apiKey}` } });
|
|
1394
|
-
const emitter = createNanoEvents();
|
|
1395
1848
|
const dispatchState = { speechActive: false };
|
|
1396
1849
|
const dispatchCtx = sid !== void 0 ? {
|
|
1397
1850
|
log,
|
|
@@ -1409,7 +1862,6 @@ function connectS2s(opts) {
|
|
|
1409
1862
|
ws.send(json);
|
|
1410
1863
|
}
|
|
1411
1864
|
const handle = {
|
|
1412
|
-
on: emitter.on.bind(emitter),
|
|
1413
1865
|
sendAudio(audio) {
|
|
1414
1866
|
if (ws.readyState !== 1) {
|
|
1415
1867
|
log.debug("S2S sendAudio dropped: socket not open");
|
|
@@ -1422,16 +1874,15 @@ function connectS2s(opts) {
|
|
|
1422
1874
|
ws.send(jsonFrame);
|
|
1423
1875
|
},
|
|
1424
1876
|
sendToolResult(callId, result) {
|
|
1425
|
-
const msg = {
|
|
1426
|
-
type: "tool.result",
|
|
1427
|
-
call_id: callId,
|
|
1428
|
-
result
|
|
1429
|
-
};
|
|
1430
1877
|
log.info("S2S >> tool.result", {
|
|
1431
1878
|
call_id: callId,
|
|
1432
1879
|
resultLength: result.length
|
|
1433
1880
|
});
|
|
1434
|
-
send(
|
|
1881
|
+
send({
|
|
1882
|
+
type: "tool.result",
|
|
1883
|
+
call_id: callId,
|
|
1884
|
+
result
|
|
1885
|
+
});
|
|
1435
1886
|
},
|
|
1436
1887
|
updateSession(sessionConfig) {
|
|
1437
1888
|
const { systemPrompt, ...rest } = sessionConfig;
|
|
@@ -1468,8 +1919,7 @@ function connectS2s(opts) {
|
|
|
1468
1919
|
}
|
|
1469
1920
|
function handleAudioFastPath(obj) {
|
|
1470
1921
|
if (obj.type === "reply.audio" && typeof obj.data === "string") {
|
|
1471
|
-
|
|
1472
|
-
emitter.emit("audio", { audio: audioBytes });
|
|
1922
|
+
callbacks.onAudio(base64ToUint8(obj.data));
|
|
1473
1923
|
return true;
|
|
1474
1924
|
}
|
|
1475
1925
|
return false;
|
|
@@ -1479,7 +1929,7 @@ function connectS2s(opts) {
|
|
|
1479
1929
|
if (obj.type === "reply.done") return;
|
|
1480
1930
|
log.info(`S2S << ${obj.type}`);
|
|
1481
1931
|
}
|
|
1482
|
-
|
|
1932
|
+
ws.addEventListener("message", (ev) => {
|
|
1483
1933
|
const raw = tryParseJson(ev.data);
|
|
1484
1934
|
if (raw === void 0) return;
|
|
1485
1935
|
if (typeof raw !== "object" || raw === null || Array.isArray(raw)) {
|
|
@@ -1494,9 +1944,8 @@ function connectS2s(opts) {
|
|
|
1494
1944
|
log.warn(`S2S << unrecognised message type: ${obj.type ?? JSON.stringify(raw).slice(0, 200)}`);
|
|
1495
1945
|
return;
|
|
1496
1946
|
}
|
|
1497
|
-
dispatchS2sMessage(
|
|
1498
|
-
}
|
|
1499
|
-
ws.addEventListener("message", handleS2sMessage);
|
|
1947
|
+
dispatchS2sMessage(callbacks, parsed, dispatchState, dispatchCtx);
|
|
1948
|
+
});
|
|
1500
1949
|
ws.addEventListener("close", (ev) => {
|
|
1501
1950
|
const code = ev.code ?? 0;
|
|
1502
1951
|
const reason = ev.reason ?? "";
|
|
@@ -1505,394 +1954,102 @@ function connectS2s(opts) {
|
|
|
1505
1954
|
reason
|
|
1506
1955
|
});
|
|
1507
1956
|
if (!opened) reject(/* @__PURE__ */ new Error(`WebSocket closed before open (code: ${code})`));
|
|
1508
|
-
|
|
1957
|
+
callbacks.onClose(code, reason);
|
|
1509
1958
|
});
|
|
1510
1959
|
ws.addEventListener("error", (ev) => {
|
|
1511
1960
|
const message = typeof ev.message === "string" ? ev.message : "WebSocket error";
|
|
1512
1961
|
const errObj = new Error(message);
|
|
1513
1962
|
log.error("S2S WebSocket error", { error: errObj.message });
|
|
1514
1963
|
if (!opened) reject(errObj);
|
|
1515
|
-
else
|
|
1964
|
+
else callbacks.onError(errObj);
|
|
1516
1965
|
});
|
|
1517
1966
|
});
|
|
1518
1967
|
}
|
|
1519
1968
|
//#endregion
|
|
1520
|
-
//#region host/
|
|
1521
|
-
/** @internal
|
|
1969
|
+
//#region host/transports/s2s-transport.ts
|
|
1970
|
+
/** @internal Exposed for testing — allows spying on connectS2s in unit tests. */
|
|
1522
1971
|
const _internals = { connectS2s };
|
|
1523
|
-
|
|
1524
|
-
|
|
1525
|
-
|
|
1526
|
-
|
|
1527
|
-
|
|
1528
|
-
function
|
|
1529
|
-
|
|
1530
|
-
|
|
1531
|
-
|
|
1532
|
-
|
|
1533
|
-
|
|
1534
|
-
|
|
1535
|
-
|
|
1536
|
-
|
|
1537
|
-
|
|
1538
|
-
|
|
1539
|
-
|
|
1540
|
-
|
|
1541
|
-
|
|
1542
|
-
|
|
1543
|
-
|
|
1544
|
-
|
|
1545
|
-
|
|
1546
|
-
|
|
1547
|
-
|
|
1548
|
-
|
|
1549
|
-
|
|
1972
|
+
function createS2sTransport(opts) {
|
|
1973
|
+
const log = opts.logger ?? consoleLogger;
|
|
1974
|
+
const createWs = opts.createWebSocket ?? defaultCreateS2sWebSocket;
|
|
1975
|
+
let handle = null;
|
|
1976
|
+
let currentReplyId = null;
|
|
1977
|
+
async function start() {
|
|
1978
|
+
handle = await _internals.connectS2s({
|
|
1979
|
+
apiKey: opts.apiKey,
|
|
1980
|
+
config: opts.s2sConfig,
|
|
1981
|
+
createWebSocket: createWs,
|
|
1982
|
+
logger: log,
|
|
1983
|
+
sid: opts.sid,
|
|
1984
|
+
callbacks: {
|
|
1985
|
+
onSessionReady: (providerSessionId) => opts.callbacks.onSessionReady?.(providerSessionId),
|
|
1986
|
+
onReplyStarted: (replyId) => {
|
|
1987
|
+
currentReplyId = replyId;
|
|
1988
|
+
opts.callbacks.onReplyStarted(replyId);
|
|
1989
|
+
},
|
|
1990
|
+
onReplyDone: () => {
|
|
1991
|
+
currentReplyId = null;
|
|
1992
|
+
opts.callbacks.onReplyDone();
|
|
1993
|
+
},
|
|
1994
|
+
onCancelled: () => {
|
|
1995
|
+
currentReplyId = null;
|
|
1996
|
+
opts.callbacks.onCancelled();
|
|
1997
|
+
},
|
|
1998
|
+
onAudio: (bytes) => opts.callbacks.onAudioChunk(bytes),
|
|
1999
|
+
onUserTranscript: opts.callbacks.onUserTranscript,
|
|
2000
|
+
onAgentTranscript: opts.callbacks.onAgentTranscript,
|
|
2001
|
+
onToolCall: opts.callbacks.onToolCall,
|
|
2002
|
+
onSpeechStarted: opts.callbacks.onSpeechStarted,
|
|
2003
|
+
onSpeechStopped: opts.callbacks.onSpeechStopped,
|
|
2004
|
+
onSessionExpired: () => {
|
|
2005
|
+
log.info("S2S session expired", { sid: opts.sid });
|
|
2006
|
+
handle?.close();
|
|
2007
|
+
},
|
|
2008
|
+
onError: (err) => opts.callbacks.onError("internal", err.message),
|
|
2009
|
+
onClose: (code, reason) => {
|
|
2010
|
+
if (currentReplyId !== null) {
|
|
2011
|
+
log.warn("S2S closed with active reply", {
|
|
2012
|
+
sid: opts.sid,
|
|
2013
|
+
agent: opts.agent,
|
|
2014
|
+
activeReplyId: currentReplyId,
|
|
2015
|
+
code,
|
|
2016
|
+
reason
|
|
2017
|
+
});
|
|
2018
|
+
opts.callbacks.onError("connection", `S2S closed mid-reply (code=${code})`);
|
|
2019
|
+
} else log.info("S2S closed", {
|
|
2020
|
+
code,
|
|
2021
|
+
reason
|
|
2022
|
+
});
|
|
2023
|
+
}
|
|
1550
2024
|
}
|
|
1551
|
-
}
|
|
1552
|
-
};
|
|
1553
|
-
}
|
|
1554
|
-
/**
|
|
1555
|
-
* Complete a tool call by truncating the result, emitting a `tool_call_done` event,
|
|
1556
|
-
* and accumulating the result in `ctx.reply.pendingTools` — but only if the reply that
|
|
1557
|
-
* initiated this call is still active.
|
|
1558
|
-
*/
|
|
1559
|
-
function finishToolCall(ctx, callId, result, replyId) {
|
|
1560
|
-
const truncatedResult = result.length > 4e3 ? result.slice(0, MAX_TOOL_RESULT_CHARS) : result;
|
|
1561
|
-
ctx.client.event({
|
|
1562
|
-
type: "tool_call_done",
|
|
1563
|
-
toolCallId: callId,
|
|
1564
|
-
result: truncatedResult
|
|
1565
|
-
});
|
|
1566
|
-
if (replyId !== null && replyId === ctx.reply.currentReplyId) {
|
|
1567
|
-
ctx.reply.pendingTools.push({
|
|
1568
|
-
callId,
|
|
1569
|
-
result
|
|
1570
|
-
});
|
|
1571
|
-
if (ctx.maxHistory > 0 && ctx.reply.pendingTools.length > ctx.maxHistory) ctx.reply.pendingTools.shift();
|
|
1572
|
-
}
|
|
1573
|
-
}
|
|
1574
|
-
async function handleToolCall(ctx, event) {
|
|
1575
|
-
const { toolCallId: callId, toolName: name, args: parsedArgs } = event;
|
|
1576
|
-
const replyId = ctx.reply.currentReplyId;
|
|
1577
|
-
ctx.client.event(event);
|
|
1578
|
-
const refused = ctx.consumeToolCallStep(name, replyId);
|
|
1579
|
-
if (refused !== null) {
|
|
1580
|
-
finishToolCall(ctx, callId, refused, replyId);
|
|
1581
|
-
return;
|
|
1582
|
-
}
|
|
1583
|
-
ctx.log.info("S2S tool call", {
|
|
1584
|
-
tool: name,
|
|
1585
|
-
callId,
|
|
1586
|
-
args: parsedArgs,
|
|
1587
|
-
agent: ctx.agent
|
|
1588
|
-
});
|
|
1589
|
-
let result;
|
|
1590
|
-
try {
|
|
1591
|
-
result = await ctx.executeTool(name, parsedArgs, ctx.id, ctx.conversationMessages);
|
|
1592
|
-
} catch (err) {
|
|
1593
|
-
const msg = errorMessage(err);
|
|
1594
|
-
ctx.log.error("Tool execution failed", {
|
|
1595
|
-
tool: name,
|
|
1596
|
-
error: errorDetail(err)
|
|
1597
2025
|
});
|
|
1598
|
-
|
|
1599
|
-
}
|
|
1600
|
-
ctx.log.info("S2S tool result", {
|
|
1601
|
-
tool: name,
|
|
1602
|
-
callId,
|
|
1603
|
-
resultLength: result.length
|
|
1604
|
-
});
|
|
1605
|
-
finishToolCall(ctx, callId, result, replyId);
|
|
1606
|
-
}
|
|
1607
|
-
function handleUserTranscript(ctx, text) {
|
|
1608
|
-
ctx.log.info("S2S user transcript", { text });
|
|
1609
|
-
ctx.client.event({
|
|
1610
|
-
type: "user_transcript",
|
|
1611
|
-
text
|
|
1612
|
-
});
|
|
1613
|
-
ctx.pushMessages({
|
|
1614
|
-
role: "user",
|
|
1615
|
-
content: text
|
|
1616
|
-
});
|
|
1617
|
-
}
|
|
1618
|
-
function handleAgentTranscript(ctx, text, interrupted) {
|
|
1619
|
-
ctx.client.event({
|
|
1620
|
-
type: "agent_transcript",
|
|
1621
|
-
text
|
|
1622
|
-
});
|
|
1623
|
-
if (!interrupted) ctx.pushMessages({
|
|
1624
|
-
role: "assistant",
|
|
1625
|
-
content: text
|
|
1626
|
-
});
|
|
1627
|
-
}
|
|
1628
|
-
function handleReplyCancelled(ctx) {
|
|
1629
|
-
ctx.log.info("S2S reply interrupted (barge-in)");
|
|
1630
|
-
ctx.cancelReply();
|
|
1631
|
-
ctx.client.event({ type: "cancelled" });
|
|
1632
|
-
}
|
|
1633
|
-
/**
|
|
1634
|
-
* Warn when the entry-to-emit time for a reply_done dispatch exceeds this.
|
|
1635
|
-
* Tool-less sessions should be sub-millisecond; sessions with pending tools
|
|
1636
|
-
* will legitimately spend time awaiting ctx.turnPromise. We log both (with
|
|
1637
|
-
* `hadTurnPromise`) so event-loop starvation is distinguishable from
|
|
1638
|
-
* genuine tool-call latency.
|
|
1639
|
-
*/
|
|
1640
|
-
const REPLY_DONE_SLOW_THRESHOLD_MS = 50;
|
|
1641
|
-
function handleReplyDone(ctx) {
|
|
1642
|
-
const startMs = Date.now();
|
|
1643
|
-
const doneReplyId = ctx.reply.currentReplyId;
|
|
1644
|
-
if (doneReplyId === null) {
|
|
1645
|
-
ctx.log.debug("Dropping duplicate reply.done (no active reply)");
|
|
1646
|
-
return;
|
|
2026
|
+
handle.updateSession(opts.sessionConfig);
|
|
1647
2027
|
}
|
|
1648
|
-
|
|
1649
|
-
|
|
1650
|
-
|
|
1651
|
-
ctx.reply.pendingTools = [];
|
|
1652
|
-
return;
|
|
1653
|
-
}
|
|
1654
|
-
if (ctx.reply.pendingTools.length > 0) {
|
|
1655
|
-
for (const tool of ctx.reply.pendingTools) ctx.s2s?.sendToolResult(tool.callId, tool.result);
|
|
1656
|
-
ctx.reply.pendingTools = [];
|
|
1657
|
-
} else {
|
|
1658
|
-
const stepsUsed = ctx.reply.toolCallCount;
|
|
1659
|
-
if (stepsUsed > 0) ctx.log.info("Turn complete", {
|
|
1660
|
-
steps: stepsUsed,
|
|
1661
|
-
agent: ctx.agent
|
|
1662
|
-
});
|
|
1663
|
-
ctx.client.playAudioDone();
|
|
1664
|
-
ctx.client.event({ type: "reply_done" });
|
|
1665
|
-
ctx.reply.currentReplyId = null;
|
|
1666
|
-
const durationMs = Date.now() - startMs;
|
|
1667
|
-
if (durationMs >= REPLY_DONE_SLOW_THRESHOLD_MS) ctx.log.warn("slow reply_done dispatch", {
|
|
1668
|
-
sid: ctx.id,
|
|
1669
|
-
agent: ctx.agent,
|
|
1670
|
-
durationMs,
|
|
1671
|
-
hadTurnPromise
|
|
1672
|
-
});
|
|
1673
|
-
}
|
|
1674
|
-
};
|
|
1675
|
-
if (hadTurnPromise) ctx.turnPromise?.then(sendPending);
|
|
1676
|
-
else sendPending();
|
|
1677
|
-
}
|
|
1678
|
-
function setupListeners(ctx, handle) {
|
|
1679
|
-
handle.on("ready", ({ sessionId }) => ctx.log.info("S2S session ready", { sessionId }));
|
|
1680
|
-
handle.on("replyStarted", ({ replyId }) => {
|
|
1681
|
-
ctx.beginReply(replyId);
|
|
1682
|
-
});
|
|
1683
|
-
handle.on("sessionExpired", () => {
|
|
1684
|
-
ctx.log.info("S2S session expired");
|
|
1685
|
-
handle.close();
|
|
1686
|
-
});
|
|
1687
|
-
handle.on("audio", ({ audio }) => ctx.client.playAudioChunk(audio));
|
|
1688
|
-
handle.on("error", (err) => {
|
|
1689
|
-
ctx.log.error("S2S error", { message: err.message });
|
|
1690
|
-
ctx.client.event({
|
|
1691
|
-
type: "error",
|
|
1692
|
-
code: "internal",
|
|
1693
|
-
message: err.message
|
|
1694
|
-
});
|
|
1695
|
-
handle.close();
|
|
1696
|
-
});
|
|
1697
|
-
handle.on("close", (code, reason) => {
|
|
1698
|
-
const activeReplyId = ctx.reply.currentReplyId;
|
|
1699
|
-
if (activeReplyId !== null) ctx.log.warn("S2S closed with active reply", {
|
|
1700
|
-
sid: ctx.id,
|
|
1701
|
-
agent: ctx.agent,
|
|
1702
|
-
activeReplyId,
|
|
1703
|
-
code,
|
|
1704
|
-
reason
|
|
1705
|
-
});
|
|
1706
|
-
else ctx.log.info("S2S closed", {
|
|
1707
|
-
code,
|
|
1708
|
-
reason
|
|
1709
|
-
});
|
|
1710
|
-
ctx.s2s = null;
|
|
1711
|
-
ctx.cancelReply();
|
|
1712
|
-
});
|
|
1713
|
-
handle.on("event", (event) => {
|
|
1714
|
-
switch (event.type) {
|
|
1715
|
-
case "user_transcript":
|
|
1716
|
-
handleUserTranscript(ctx, event.text);
|
|
1717
|
-
break;
|
|
1718
|
-
case "agent_transcript":
|
|
1719
|
-
handleAgentTranscript(ctx, event.text, event._interrupted ?? false);
|
|
1720
|
-
break;
|
|
1721
|
-
case "tool_call": {
|
|
1722
|
-
const p = handleToolCall(ctx, event).catch((err) => {
|
|
1723
|
-
ctx.log.error("Tool call handler failed", { err: errorMessage(err) });
|
|
1724
|
-
});
|
|
1725
|
-
ctx.chainTurn(p);
|
|
1726
|
-
break;
|
|
1727
|
-
}
|
|
1728
|
-
case "reply_done":
|
|
1729
|
-
handleReplyDone(ctx);
|
|
1730
|
-
break;
|
|
1731
|
-
case "cancelled":
|
|
1732
|
-
handleReplyCancelled(ctx);
|
|
1733
|
-
break;
|
|
1734
|
-
default: ctx.client.event(event);
|
|
1735
|
-
}
|
|
1736
|
-
});
|
|
1737
|
-
}
|
|
1738
|
-
function createS2sSession(opts) {
|
|
1739
|
-
const { id, agent, client, toolSchemas, apiKey, s2sConfig, executeTool, createWebSocket = defaultCreateS2sWebSocket, logger: log = consoleLogger } = opts;
|
|
1740
|
-
const agentConfig = opts.skipGreeting ? {
|
|
1741
|
-
...opts.agentConfig,
|
|
1742
|
-
greeting: ""
|
|
1743
|
-
} : opts.agentConfig;
|
|
1744
|
-
const systemPrompt = buildSystemPrompt(agentConfig, {
|
|
1745
|
-
hasTools: toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0,
|
|
1746
|
-
voice: true,
|
|
1747
|
-
toolGuidance: opts.toolGuidance
|
|
1748
|
-
});
|
|
1749
|
-
const s2sTools = toolSchemas.map((ts) => ({
|
|
1750
|
-
type: "function",
|
|
1751
|
-
name: ts.name,
|
|
1752
|
-
description: ts.description,
|
|
1753
|
-
parameters: ts.parameters
|
|
1754
|
-
}));
|
|
1755
|
-
const sessionAbort = new AbortController();
|
|
1756
|
-
const ctx = buildCtx({
|
|
1757
|
-
id,
|
|
1758
|
-
agent,
|
|
1759
|
-
client,
|
|
1760
|
-
agentConfig,
|
|
1761
|
-
executeTool,
|
|
1762
|
-
log,
|
|
1763
|
-
maxHistory: opts.maxHistory
|
|
1764
|
-
});
|
|
1765
|
-
const rawTimeout = agentConfig.idleTimeoutMs ?? 3e5;
|
|
1766
|
-
const idle = createIdleTimer({
|
|
1767
|
-
timeoutMs: rawTimeout === 0 || !Number.isFinite(rawTimeout) ? 0 : rawTimeout,
|
|
1768
|
-
agent,
|
|
1769
|
-
log,
|
|
1770
|
-
client,
|
|
1771
|
-
ctx
|
|
1772
|
-
});
|
|
1773
|
-
let connectGeneration = 0;
|
|
1774
|
-
const sessionUpdatePayload = {
|
|
1775
|
-
systemPrompt,
|
|
1776
|
-
tools: s2sTools,
|
|
1777
|
-
...agentConfig.greeting ? { greeting: agentConfig.greeting } : {}
|
|
1778
|
-
};
|
|
1779
|
-
async function connectAndSetup() {
|
|
1780
|
-
const generation = ++connectGeneration;
|
|
1781
|
-
try {
|
|
1782
|
-
const handle = await _internals.connectS2s({
|
|
1783
|
-
apiKey,
|
|
1784
|
-
config: s2sConfig,
|
|
1785
|
-
createWebSocket,
|
|
1786
|
-
logger: log,
|
|
1787
|
-
sid: id
|
|
1788
|
-
});
|
|
1789
|
-
if (sessionAbort.signal.aborted || generation !== connectGeneration) {
|
|
1790
|
-
handle.close();
|
|
1791
|
-
return;
|
|
1792
|
-
}
|
|
1793
|
-
setupListeners(ctx, handle);
|
|
1794
|
-
handle.updateSession(sessionUpdatePayload);
|
|
1795
|
-
ctx.s2s = handle;
|
|
1796
|
-
idle.reset();
|
|
1797
|
-
} catch (err) {
|
|
1798
|
-
const msg = errorMessage(err);
|
|
1799
|
-
log.error("S2S connect failed", { error: errorDetail(err) });
|
|
1800
|
-
client.event({
|
|
1801
|
-
type: "error",
|
|
1802
|
-
code: "internal",
|
|
1803
|
-
message: msg
|
|
1804
|
-
});
|
|
1805
|
-
}
|
|
2028
|
+
async function stop() {
|
|
2029
|
+
handle?.close();
|
|
2030
|
+
handle = null;
|
|
1806
2031
|
}
|
|
1807
2032
|
return {
|
|
1808
|
-
|
|
1809
|
-
|
|
1810
|
-
|
|
1811
|
-
|
|
1812
|
-
if (sessionAbort.signal.aborted) return;
|
|
1813
|
-
sessionAbort.abort();
|
|
1814
|
-
idle.clear();
|
|
1815
|
-
if (ctx.turnPromise !== null) await ctx.turnPromise;
|
|
1816
|
-
ctx.s2s?.close();
|
|
1817
|
-
},
|
|
1818
|
-
onAudio(data) {
|
|
1819
|
-
idle.reset();
|
|
1820
|
-
ctx.s2s?.sendAudio(data);
|
|
1821
|
-
},
|
|
1822
|
-
onAudioReady() {},
|
|
1823
|
-
onCancel() {
|
|
1824
|
-
client.event({ type: "cancelled" });
|
|
1825
|
-
},
|
|
1826
|
-
onReset() {
|
|
1827
|
-
ctx.cancelReply();
|
|
1828
|
-
ctx.conversationMessages = [];
|
|
1829
|
-
ctx.reply.toolCallCount = 0;
|
|
1830
|
-
ctx.turnPromise = null;
|
|
1831
|
-
idle.clear();
|
|
1832
|
-
ctx.s2s?.close();
|
|
1833
|
-
client.event({ type: "reset" });
|
|
1834
|
-
connectAndSetup().catch((err) => log.error("S2S reset reconnect failed", { error: errorMessage(err) }));
|
|
2033
|
+
start,
|
|
2034
|
+
stop,
|
|
2035
|
+
sendUserAudio(bytes) {
|
|
2036
|
+
handle?.sendAudio(bytes);
|
|
1835
2037
|
},
|
|
1836
|
-
|
|
1837
|
-
|
|
1838
|
-
role: m.role,
|
|
1839
|
-
content: m.content
|
|
1840
|
-
})));
|
|
2038
|
+
sendToolResult(callId, result) {
|
|
2039
|
+
handle?.sendToolResult(callId, result);
|
|
1841
2040
|
},
|
|
1842
|
-
|
|
1843
|
-
|
|
1844
|
-
}
|
|
1845
|
-
};
|
|
1846
|
-
}
|
|
1847
|
-
//#endregion
|
|
1848
|
-
//#region host/tool-executor.ts
|
|
1849
|
-
/**
|
|
1850
|
-
* Tool execution — validates arguments and invokes tool handlers.
|
|
1851
|
-
*
|
|
1852
|
-
* {@link executeToolCall} is the single entry point used by both the
|
|
1853
|
-
* direct (self-hosted) runtime and the platform sandbox sidecar.
|
|
1854
|
-
*/
|
|
1855
|
-
const yieldTick = () => new Promise((r) => setTimeout(r, 0));
|
|
1856
|
-
function buildToolContext(opts) {
|
|
1857
|
-
const { env, state, kv, messages, sessionId } = opts;
|
|
1858
|
-
return {
|
|
1859
|
-
env,
|
|
1860
|
-
state: state ?? {},
|
|
1861
|
-
get kv() {
|
|
1862
|
-
if (!kv) throw new Error("KV not available");
|
|
1863
|
-
return kv;
|
|
2041
|
+
cancelReply() {
|
|
2042
|
+
currentReplyId = null;
|
|
1864
2043
|
},
|
|
1865
|
-
|
|
1866
|
-
|
|
1867
|
-
|
|
1868
|
-
|
|
2044
|
+
updateSession(config) {
|
|
2045
|
+
handle?.updateSession({
|
|
2046
|
+
systemPrompt: config.systemPrompt,
|
|
2047
|
+
tools: config.tools ?? [],
|
|
2048
|
+
...config.greeting !== void 0 ? { greeting: config.greeting } : {}
|
|
2049
|
+
});
|
|
1869
2050
|
}
|
|
1870
2051
|
};
|
|
1871
2052
|
}
|
|
1872
|
-
async function executeToolCall(name, args, options) {
|
|
1873
|
-
const { tool } = options;
|
|
1874
|
-
const parsed = (tool.parameters ?? EMPTY_PARAMS).safeParse(args);
|
|
1875
|
-
if (!parsed.success) return toolError(`Invalid arguments for tool "${name}": ${(parsed.error?.issues ?? []).map((i) => `${i.path.map(String).join(".")}: ${i.message}`).join(", ")}`);
|
|
1876
|
-
try {
|
|
1877
|
-
const ctx = buildToolContext(options);
|
|
1878
|
-
await yieldTick();
|
|
1879
|
-
const result = await pTimeout(Promise.resolve(tool.execute(parsed.data, ctx)), {
|
|
1880
|
-
milliseconds: TOOL_EXECUTION_TIMEOUT_MS,
|
|
1881
|
-
message: `Tool "${name}" timed out after ${TOOL_EXECUTION_TIMEOUT_MS}ms`
|
|
1882
|
-
});
|
|
1883
|
-
await yieldTick();
|
|
1884
|
-
if (result == null) return "null";
|
|
1885
|
-
return typeof result === "string" ? result : JSON.stringify(result);
|
|
1886
|
-
} catch (err) {
|
|
1887
|
-
const log = options.logger;
|
|
1888
|
-
if (log) log.warn("Tool execution failed", {
|
|
1889
|
-
tool: name,
|
|
1890
|
-
error: errorDetail(err)
|
|
1891
|
-
});
|
|
1892
|
-
else console.warn(`[tool-executor] Tool execution failed: ${name}`, err);
|
|
1893
|
-
return toolError(errorMessage(err));
|
|
1894
|
-
}
|
|
1895
|
-
}
|
|
1896
2053
|
//#endregion
|
|
1897
2054
|
//#region host/unstorage-kv.ts
|
|
1898
2055
|
/**
|
|
@@ -1944,20 +2101,20 @@ function createUnstorageKv(options) {
|
|
|
1944
2101
|
*
|
|
1945
2102
|
* Audio validation is handled at the host transport layer (see server.ts).
|
|
1946
2103
|
*/
|
|
2104
|
+
const AUDIO_DONE_FRAME = JSON.stringify({ type: "audio_done" });
|
|
1947
2105
|
/**
|
|
1948
2106
|
* Creates a {@link ClientSink} backed by a plain WebSocket.
|
|
1949
2107
|
*
|
|
1950
|
-
*
|
|
1951
|
-
* binary frames
|
|
2108
|
+
* Session events are sent as JSON text frames; audio chunks are sent as raw
|
|
2109
|
+
* PCM16 binary frames.
|
|
1952
2110
|
*/
|
|
1953
2111
|
function createClientSink(ws, log) {
|
|
1954
|
-
/** Send data over ws, silently dropping if the socket is not open. */
|
|
1955
2112
|
function safeSend(data) {
|
|
1956
2113
|
try {
|
|
1957
2114
|
if (ws.readyState !== 1) return;
|
|
1958
2115
|
ws.send(data);
|
|
1959
2116
|
} catch (err) {
|
|
1960
|
-
log.debug?.("safeSend: socket closed between readyState check and send", { error:
|
|
2117
|
+
log.debug?.("safeSend: socket closed between readyState check and send", { error: err instanceof Error ? err.message : String(err) });
|
|
1961
2118
|
}
|
|
1962
2119
|
}
|
|
1963
2120
|
return {
|
|
@@ -1971,7 +2128,7 @@ function createClientSink(ws, log) {
|
|
|
1971
2128
|
safeSend(chunk);
|
|
1972
2129
|
},
|
|
1973
2130
|
playAudioDone() {
|
|
1974
|
-
safeSend(
|
|
2131
|
+
safeSend(AUDIO_DONE_FRAME);
|
|
1975
2132
|
}
|
|
1976
2133
|
};
|
|
1977
2134
|
}
|
|
@@ -1980,35 +2137,32 @@ function handleBinaryAudio(data, session) {
|
|
|
1980
2137
|
session.onAudio(data);
|
|
1981
2138
|
return true;
|
|
1982
2139
|
}
|
|
1983
|
-
if (data instanceof ArrayBuffer) {
|
|
1984
|
-
session.onAudio(new Uint8Array(data));
|
|
1985
|
-
return true;
|
|
1986
|
-
}
|
|
1987
2140
|
return false;
|
|
1988
2141
|
}
|
|
1989
|
-
function handleTextMessage(data, session, log,
|
|
1990
|
-
if (typeof data !== "string")
|
|
1991
|
-
|
|
2142
|
+
function handleTextMessage(data, session, log, sid) {
|
|
2143
|
+
if (typeof data !== "string") {
|
|
2144
|
+
log.warn("ws: non-string, non-binary frame received; dropping", { sid });
|
|
2145
|
+
return;
|
|
2146
|
+
}
|
|
2147
|
+
let parsed;
|
|
1992
2148
|
try {
|
|
1993
|
-
|
|
2149
|
+
parsed = JSON.parse(data);
|
|
1994
2150
|
} catch {
|
|
1995
|
-
log.warn("
|
|
1996
|
-
|
|
1997
|
-
|
|
2151
|
+
log.warn("ws: invalid JSON; dropping", {
|
|
2152
|
+
sid,
|
|
2153
|
+
data: data.slice(0, 200)
|
|
1998
2154
|
});
|
|
1999
2155
|
return;
|
|
2000
2156
|
}
|
|
2001
|
-
const
|
|
2002
|
-
if (!
|
|
2003
|
-
if (
|
|
2004
|
-
...ctx,
|
|
2157
|
+
const result = lenientParse(ClientMessageSchema, parsed);
|
|
2158
|
+
if (!result.ok) {
|
|
2159
|
+
if (result.malformed) log.warn("ws: malformed client message", {
|
|
2005
2160
|
sid,
|
|
2006
|
-
error:
|
|
2161
|
+
error: result.error
|
|
2007
2162
|
});
|
|
2008
2163
|
return;
|
|
2009
2164
|
}
|
|
2010
|
-
|
|
2011
|
-
switch (msg.type) {
|
|
2165
|
+
switch (result.data.type) {
|
|
2012
2166
|
case "audio_ready":
|
|
2013
2167
|
session.onAudioReady();
|
|
2014
2168
|
break;
|
|
@@ -2019,19 +2173,19 @@ function handleTextMessage(data, session, log, ctx, sid) {
|
|
|
2019
2173
|
session.onReset();
|
|
2020
2174
|
break;
|
|
2021
2175
|
case "history":
|
|
2022
|
-
session.onHistory(
|
|
2176
|
+
session.onHistory(result.data.messages);
|
|
2023
2177
|
break;
|
|
2024
2178
|
default: break;
|
|
2025
2179
|
}
|
|
2026
2180
|
}
|
|
2027
2181
|
/**
|
|
2028
|
-
* Attaches session lifecycle handlers to a native WebSocket using
|
|
2029
|
-
*
|
|
2182
|
+
* Attaches session lifecycle handlers to a native WebSocket using JSON text
|
|
2183
|
+
* frames for control messages and raw PCM16 binary frames for audio.
|
|
2030
2184
|
*
|
|
2031
2185
|
* Connection flow:
|
|
2032
|
-
* 1. WebSocket opens → server sends
|
|
2033
|
-
* 2. Client sets up audio → sends
|
|
2034
|
-
* 3. If reconnecting → client sends
|
|
2186
|
+
* 1. WebSocket opens → server sends JSON CONFIG frame with sampleRate, ttsSampleRate, sessionId
|
|
2187
|
+
* 2. Client sets up audio → sends JSON AUDIO_READY frame
|
|
2188
|
+
* 3. If reconnecting → client sends JSON HISTORY frame with prior messages
|
|
2035
2189
|
*/
|
|
2036
2190
|
function wireSessionSocket(ws, opts) {
|
|
2037
2191
|
const { sessions, logger: log = consoleLogger } = opts;
|
|
@@ -2041,7 +2195,7 @@ function wireSessionSocket(ws, opts) {
|
|
|
2041
2195
|
let session = null;
|
|
2042
2196
|
/** Set to true once session.start() resolves. Messages arriving before
|
|
2043
2197
|
* this flag is set are buffered and replayed once the session is ready,
|
|
2044
|
-
* preventing audio/
|
|
2198
|
+
* preventing audio/frames from being dispatched to a half-initialized session. */
|
|
2045
2199
|
let sessionReady = false;
|
|
2046
2200
|
let messageBuffer = [];
|
|
2047
2201
|
function drainBuffer() {
|
|
@@ -2049,9 +2203,8 @@ function wireSessionSocket(ws, opts) {
|
|
|
2049
2203
|
const buf = messageBuffer;
|
|
2050
2204
|
messageBuffer = null;
|
|
2051
2205
|
for (const event of buf) {
|
|
2052
|
-
|
|
2053
|
-
|
|
2054
|
-
handleTextMessage(data, session, log, ctx, sid);
|
|
2206
|
+
if (handleBinaryAudio(event.data, session)) continue;
|
|
2207
|
+
handleTextMessage(event.data, session, log, sid);
|
|
2055
2208
|
}
|
|
2056
2209
|
}
|
|
2057
2210
|
function onOpen() {
|
|
@@ -2066,7 +2219,9 @@ function wireSessionSocket(ws, opts) {
|
|
|
2066
2219
|
opts.onSinkCreated?.(sessionId, client);
|
|
2067
2220
|
ws.send(JSON.stringify({
|
|
2068
2221
|
type: "config",
|
|
2069
|
-
|
|
2222
|
+
audioFormat: opts.readyConfig.audioFormat,
|
|
2223
|
+
sampleRate: opts.readyConfig.sampleRate,
|
|
2224
|
+
ttsSampleRate: opts.readyConfig.ttsSampleRate,
|
|
2070
2225
|
sessionId
|
|
2071
2226
|
}));
|
|
2072
2227
|
const timeoutMs = opts.sessionStartTimeoutMs ?? 1e4;
|
|
@@ -2099,9 +2254,8 @@ function wireSessionSocket(ws, opts) {
|
|
|
2099
2254
|
if (messageBuffer && messageBuffer.length < 100) messageBuffer.push(event);
|
|
2100
2255
|
return;
|
|
2101
2256
|
}
|
|
2102
|
-
|
|
2103
|
-
|
|
2104
|
-
handleTextMessage(data, session, log, ctx, sid);
|
|
2257
|
+
if (handleBinaryAudio(event.data, session)) return;
|
|
2258
|
+
handleTextMessage(event.data, session, log, sid);
|
|
2105
2259
|
});
|
|
2106
2260
|
ws.addEventListener("close", () => {
|
|
2107
2261
|
log.info("Session disconnected", {
|
|
@@ -2132,6 +2286,30 @@ function wireSessionSocket(ws, opts) {
|
|
|
2132
2286
|
//#endregion
|
|
2133
2287
|
//#region host/runtime.ts
|
|
2134
2288
|
/**
|
|
2289
|
+
* Resolve the API key env-var for the configured STT provider.
|
|
2290
|
+
*
|
|
2291
|
+
* Each STT provider uses its own env var (e.g. `ASSEMBLYAI_API_KEY`,
|
|
2292
|
+
* `DEEPGRAM_API_KEY`). We read the kind from the descriptor if it is one;
|
|
2293
|
+
* pre-resolved openers have no kind field so we fall back to AssemblyAI for
|
|
2294
|
+
* backward compatibility (openers supply their own key at open-time anyway).
|
|
2295
|
+
*/
|
|
2296
|
+
function resolveSttApiKey(stt, env) {
|
|
2297
|
+
if ((stt != null && "kind" in stt && typeof stt.kind === "string" ? stt.kind : void 0) === "deepgram") return resolveApiKey("DEEPGRAM_API_KEY", env);
|
|
2298
|
+
return resolveApiKey("ASSEMBLYAI_API_KEY", env);
|
|
2299
|
+
}
|
|
2300
|
+
/**
|
|
2301
|
+
* Resolve the API key env-var for the configured TTS provider.
|
|
2302
|
+
*
|
|
2303
|
+
* Each TTS provider uses its own env var (e.g. `CARTESIA_API_KEY`,
|
|
2304
|
+
* `RIME_API_KEY`). We read the kind from the descriptor if it is one;
|
|
2305
|
+
* pre-resolved openers have no kind field so we fall back to Cartesia for
|
|
2306
|
+
* backward compatibility (openers supply their own key at open-time anyway).
|
|
2307
|
+
*/
|
|
2308
|
+
function resolveTtsApiKey(tts, env) {
|
|
2309
|
+
if ((tts != null && "kind" in tts && typeof tts.kind === "string" ? tts.kind : void 0) === "rime") return resolveApiKey("RIME_API_KEY", env);
|
|
2310
|
+
return resolveApiKey("CARTESIA_API_KEY", env);
|
|
2311
|
+
}
|
|
2312
|
+
/**
|
|
2135
2313
|
* Distinguish a descriptor (`{ kind, options }`) from an already-resolved
|
|
2136
2314
|
* opener / `LanguageModel`. The production path always passes descriptors;
|
|
2137
2315
|
* openers are a test escape hatch (fakes in `_pipeline-test-fakes.ts`).
|
|
@@ -2236,40 +2414,86 @@ function createRuntime(opts) {
|
|
|
2236
2414
|
} : null;
|
|
2237
2415
|
function createSession(sessionOpts) {
|
|
2238
2416
|
sinkMap.set(sessionOpts.id, sessionOpts.client);
|
|
2239
|
-
|
|
2240
|
-
|
|
2417
|
+
const isPipeline = Boolean(pipelineProviders);
|
|
2418
|
+
const systemPrompt = buildSystemPrompt(agentConfig, {
|
|
2419
|
+
hasTools: toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0,
|
|
2420
|
+
voice: true,
|
|
2421
|
+
toolGuidance
|
|
2422
|
+
});
|
|
2423
|
+
let core = null;
|
|
2424
|
+
function bindCore() {
|
|
2425
|
+
if (!core) throw new Error("SessionCore not yet created");
|
|
2426
|
+
return core;
|
|
2427
|
+
}
|
|
2428
|
+
const callbacks = {
|
|
2429
|
+
onReplyStarted: (replyId) => bindCore().onReplyStarted(replyId),
|
|
2430
|
+
onReplyDone: () => bindCore().onReplyDone(),
|
|
2431
|
+
onCancelled: () => bindCore().onCancelled(),
|
|
2432
|
+
onAudioChunk: (bytes) => bindCore().onAudioChunk(bytes),
|
|
2433
|
+
onAudioDone: () => bindCore().onAudioDone(),
|
|
2434
|
+
onUserTranscript: (text) => bindCore().onUserTranscript(text),
|
|
2435
|
+
onAgentTranscript: (text, interrupted) => bindCore().onAgentTranscript(text, interrupted),
|
|
2436
|
+
onToolCall: isPipeline ? (id, name, args) => sessionOpts.client.event({
|
|
2437
|
+
type: "tool_call",
|
|
2438
|
+
toolCallId: id,
|
|
2439
|
+
toolName: name,
|
|
2440
|
+
args
|
|
2441
|
+
}) : (id, name, args) => bindCore().onToolCall(id, name, args),
|
|
2442
|
+
onError: (code, message) => bindCore().onError(code, message),
|
|
2443
|
+
onSpeechStarted: () => bindCore().onSpeechStarted(),
|
|
2444
|
+
onSpeechStopped: () => bindCore().onSpeechStopped()
|
|
2445
|
+
};
|
|
2446
|
+
let transport;
|
|
2447
|
+
if (pipelineProviders) transport = createPipelineTransport({
|
|
2448
|
+
sid: sessionOpts.id,
|
|
2241
2449
|
agent: sessionOpts.agent,
|
|
2242
|
-
client: sessionOpts.client,
|
|
2243
|
-
agentConfig,
|
|
2244
|
-
toolSchemas,
|
|
2245
|
-
toolGuidance,
|
|
2246
|
-
executeTool,
|
|
2247
2450
|
stt: pipelineProviders.stt,
|
|
2248
2451
|
llm: pipelineProviders.llm,
|
|
2249
2452
|
tts: pipelineProviders.tts,
|
|
2250
|
-
|
|
2251
|
-
|
|
2453
|
+
callbacks,
|
|
2454
|
+
sessionConfig: {
|
|
2455
|
+
systemPrompt,
|
|
2456
|
+
greeting: agentConfig.greeting,
|
|
2457
|
+
tools: toolSchemas
|
|
2458
|
+
},
|
|
2459
|
+
toolSchemas,
|
|
2460
|
+
executeTool,
|
|
2461
|
+
providerKeys: {
|
|
2462
|
+
stt: resolveSttApiKey(opts.stt, env),
|
|
2463
|
+
tts: resolveTtsApiKey(opts.tts, env)
|
|
2464
|
+
},
|
|
2252
2465
|
sttSampleRate: s2sConfig.inputSampleRate,
|
|
2253
2466
|
ttsSampleRate: s2sConfig.outputSampleRate,
|
|
2467
|
+
maxSteps: agentConfig.maxSteps,
|
|
2468
|
+
toolChoice: agentConfig.toolChoice,
|
|
2254
2469
|
skipGreeting: sessionOpts.skipGreeting ?? false,
|
|
2255
2470
|
logger
|
|
2256
2471
|
});
|
|
2257
|
-
|
|
2258
|
-
|
|
2472
|
+
else transport = createS2sTransport({
|
|
2473
|
+
apiKey: env.ASSEMBLYAI_API_KEY ?? "",
|
|
2474
|
+
s2sConfig,
|
|
2475
|
+
sessionConfig: {
|
|
2476
|
+
systemPrompt,
|
|
2477
|
+
tools: toolSchemas,
|
|
2478
|
+
...agentConfig.greeting !== void 0 ? { greeting: agentConfig.greeting } : {}
|
|
2479
|
+
},
|
|
2480
|
+
toolSchemas,
|
|
2481
|
+
callbacks,
|
|
2482
|
+
sid: sessionOpts.id,
|
|
2483
|
+
agent: sessionOpts.agent,
|
|
2484
|
+
...createWebSocket ? { createWebSocket } : {},
|
|
2485
|
+
logger
|
|
2486
|
+
});
|
|
2487
|
+
core = createSessionCore({
|
|
2259
2488
|
id: sessionOpts.id,
|
|
2260
2489
|
agent: sessionOpts.agent,
|
|
2261
2490
|
client: sessionOpts.client,
|
|
2262
2491
|
agentConfig,
|
|
2263
|
-
toolSchemas,
|
|
2264
|
-
toolGuidance,
|
|
2265
|
-
apiKey,
|
|
2266
|
-
s2sConfig,
|
|
2267
2492
|
executeTool,
|
|
2268
|
-
|
|
2269
|
-
|
|
2270
|
-
logger,
|
|
2271
|
-
...sessionOpts.resumeFrom ? { resumeFrom: sessionOpts.resumeFrom } : {}
|
|
2493
|
+
transport,
|
|
2494
|
+
logger
|
|
2272
2495
|
});
|
|
2496
|
+
return core;
|
|
2273
2497
|
}
|
|
2274
2498
|
function startSession(ws, startOpts) {
|
|
2275
2499
|
const resumeFrom = startOpts?.resumeFrom;
|
|
@@ -2454,4 +2678,4 @@ function createServer(options) {
|
|
|
2454
2678
|
};
|
|
2455
2679
|
}
|
|
2456
2680
|
//#endregion
|
|
2457
|
-
export { DEFAULT_S2S_CONFIG,
|
|
2681
|
+
export { DEFAULT_S2S_CONFIG, _internals, consoleLogger, createPipelineTransport, createRuntime, createS2sTransport, createServer, createSessionCore, createUnstorageKv, executeInIsolate, executeToolCall, jsonLogger, resolveAllBuiltins, wireSessionSocket };
|