@alexkroman1/aai 1.7.1 → 1.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +11 -9
- package/CHANGELOG.md +23 -0
- package/dist/{_internal-types-CrnTi9Ew.js → _internal-types-CfOAbK6V.js} +22 -35
- package/dist/constants-y68COEGj.js +29 -0
- package/dist/host/_base64.d.ts +2 -0
- package/dist/host/_mock-ws.d.ts +0 -61
- package/dist/host/_pipeline-test-fakes.d.ts +7 -4
- package/dist/host/_run-code.d.ts +0 -25
- package/dist/host/_runtime-conformance.d.ts +3 -34
- package/dist/host/memory-vector.d.ts +0 -11
- package/dist/host/providers/resolve-kv.d.ts +0 -7
- package/dist/host/providers/resolve-vector.d.ts +0 -8
- package/dist/host/providers/stt/assemblyai.d.ts +0 -14
- package/dist/host/providers/stt/deepgram.d.ts +2 -14
- package/dist/host/providers/stt/soniox.d.ts +0 -22
- package/dist/host/providers/tts/rime.d.ts +10 -31
- package/dist/host/runtime-barrel.js +670 -630
- package/dist/host/runtime-config.d.ts +9 -6
- package/dist/host/runtime.d.ts +3 -0
- package/dist/host/to-vercel-tools.d.ts +3 -33
- package/dist/host/transports/openai-realtime-transport.d.ts +45 -0
- package/dist/host/unstorage-kv.d.ts +0 -26
- package/dist/index.js +3 -3
- package/dist/openai-realtime-cjPAHMMx.js +10 -0
- package/dist/sdk/_internal-types.d.ts +6 -55
- package/dist/sdk/allowed-hosts.d.ts +4 -3
- package/dist/sdk/constants.d.ts +4 -29
- package/dist/sdk/define.d.ts +7 -4
- package/dist/sdk/kv.d.ts +13 -37
- package/dist/sdk/manifest-barrel.js +1 -1
- package/dist/sdk/manifest.d.ts +8 -2
- package/dist/sdk/protocol.js +1 -1
- package/dist/sdk/providers/s2s/openai-realtime.d.ts +17 -0
- package/dist/sdk/providers/s2s-barrel.d.ts +9 -0
- package/dist/sdk/providers/s2s-barrel.js +2 -0
- package/dist/sdk/providers/tts/rime.d.ts +1 -1
- package/dist/sdk/providers.d.ts +6 -2
- package/dist/sdk/types.d.ts +7 -1
- package/dist/{types-KUgezM6u.js → types-DOWVZhb9.js} +1 -7
- package/dist/{ws-upgrade-BeOQ7fXL.js → ws-upgrade-CG8-by1n.js} +2 -3
- package/host/_base64.ts +9 -0
- package/host/_mock-ws.ts +0 -65
- package/host/_pipeline-test-fakes.ts +19 -31
- package/host/_run-code.ts +10 -53
- package/host/_runtime-conformance.ts +3 -44
- package/host/_test-utils.ts +20 -42
- package/host/builtin-tools.test.ts +127 -222
- package/host/builtin-tools.ts +6 -10
- package/host/cleanup.test.ts +30 -73
- package/host/integration/pipeline-reference.integration.test.ts +12 -17
- package/host/integration.test.ts +0 -7
- package/host/memory-vector.test.ts +3 -1
- package/host/memory-vector.ts +16 -21
- package/host/pinecone-vector.test.ts +14 -17
- package/host/pinecone-vector.ts +10 -19
- package/host/providers/providers.test-d.ts +5 -3
- package/host/providers/resolve-kv.ts +23 -41
- package/host/providers/resolve-vector.ts +3 -12
- package/host/providers/resolve.test.ts +15 -28
- package/host/providers/resolve.ts +24 -24
- package/host/providers/stt/assemblyai.test.ts +2 -14
- package/host/providers/stt/assemblyai.ts +12 -35
- package/host/providers/stt/deepgram.test.ts +23 -83
- package/host/providers/stt/deepgram.ts +15 -40
- package/host/providers/stt/elevenlabs.test.ts +26 -38
- package/host/providers/stt/elevenlabs.ts +10 -9
- package/host/providers/stt/soniox.test.ts +35 -85
- package/host/providers/stt/soniox.ts +8 -53
- package/host/providers/tts/cartesia.test.ts +19 -58
- package/host/providers/tts/cartesia.ts +36 -66
- package/host/providers/tts/rime.test.ts +12 -38
- package/host/providers/tts/rime.ts +23 -86
- package/host/runtime-config.test.ts +9 -9
- package/host/runtime-config.ts +16 -22
- package/host/runtime.test.ts +111 -73
- package/host/runtime.ts +139 -86
- package/host/s2s.test.ts +92 -191
- package/host/s2s.ts +55 -49
- package/host/server-shutdown.test.ts +9 -30
- package/host/server.test.ts +2 -13
- package/host/server.ts +85 -100
- package/host/session-core.test.ts +15 -30
- package/host/session-core.ts +10 -13
- package/host/session-prompt.test.ts +1 -5
- package/host/to-vercel-tools.test.ts +53 -72
- package/host/to-vercel-tools.ts +9 -39
- package/host/tool-executor.test.ts +25 -51
- package/host/tool-executor.ts +18 -12
- package/host/transports/openai-realtime-transport.test.ts +439 -0
- package/host/transports/openai-realtime-transport.ts +371 -0
- package/host/transports/pipeline-transport.test.ts +125 -298
- package/host/transports/pipeline-transport.ts +20 -68
- package/host/transports/s2s-transport-fixtures.test.ts +31 -92
- package/host/transports/s2s-transport.test.ts +65 -134
- package/host/transports/s2s-transport.ts +15 -43
- package/host/transports/types.test.ts +4 -8
- package/host/unstorage-kv.test.ts +3 -2
- package/host/unstorage-kv.ts +5 -35
- package/host/ws-handler.test.ts +72 -176
- package/host/ws-handler.ts +6 -12
- package/package.json +6 -1
- package/sdk/__snapshots__/exports.test.ts.snap +7 -0
- package/sdk/__snapshots__/schema-shapes.test.ts.snap +1 -0
- package/sdk/_internal-types.test.ts +6 -9
- package/sdk/_internal-types.ts +16 -57
- package/sdk/_test-matchers.ts +25 -15
- package/sdk/allowed-hosts.test.ts +50 -114
- package/sdk/allowed-hosts.ts +8 -14
- package/sdk/constants.ts +5 -52
- package/sdk/define.test.ts +7 -6
- package/sdk/define.ts +7 -3
- package/sdk/exports.test.ts +6 -1
- package/sdk/kv.ts +13 -37
- package/sdk/manifest.test-d.ts +5 -0
- package/sdk/manifest.test.ts +61 -9
- package/sdk/manifest.ts +11 -11
- package/sdk/protocol-compat.test.ts +66 -98
- package/sdk/protocol-snapshot.test.ts +2 -16
- package/sdk/protocol.test.ts +13 -22
- package/sdk/providers/s2s/openai-realtime.ts +36 -0
- package/sdk/providers/s2s-barrel.ts +12 -0
- package/sdk/providers/tts/rime.ts +1 -1
- package/sdk/providers.ts +24 -5
- package/sdk/schema-alignment.test.ts +25 -73
- package/sdk/schema-shapes.test.ts +1 -29
- package/sdk/system-prompt.test.ts +0 -1
- package/sdk/system-prompt.ts +17 -19
- package/sdk/types-inference.test.ts +10 -36
- package/sdk/types.ts +7 -0
- package/sdk/ws-upgrade.test.ts +24 -23
- package/sdk/ws-upgrade.ts +2 -3
- package/tsdown.config.ts +8 -11
- package/dist/constants-C2nirZUI.js +0 -54
|
@@ -1,13 +1,14 @@
|
|
|
1
|
-
import { r as DEFAULT_SYSTEM_PROMPT } from "../types-
|
|
2
|
-
import { _ as TOOL_EXECUTION_TIMEOUT_MS, a as DEFAULT_SHUTDOWN_TIMEOUT_MS, c as FETCH_TIMEOUT_MS, d as MAX_PAGE_CHARS, g as RUN_CODE_TIMEOUT_MS, h as PIPELINE_FLUSH_TIMEOUT_MS, l as MAX_HTML_BYTES, m as MAX_WS_PAYLOAD_BYTES, o as DEFAULT_STT_SAMPLE_RATE, p as MAX_VALUE_SIZE, s as DEFAULT_TTS_SAMPLE_RATE, t as AGENT_CSP } from "../constants-
|
|
3
|
-
import { i as toolError, n as errorDetail, r as errorMessage, t as parseWsUpgradeParams } from "../ws-upgrade-
|
|
1
|
+
import { r as DEFAULT_SYSTEM_PROMPT } from "../types-DOWVZhb9.js";
|
|
2
|
+
import { _ as TOOL_EXECUTION_TIMEOUT_MS, a as DEFAULT_SHUTDOWN_TIMEOUT_MS, c as FETCH_TIMEOUT_MS, d as MAX_PAGE_CHARS, g as RUN_CODE_TIMEOUT_MS, h as PIPELINE_FLUSH_TIMEOUT_MS, l as MAX_HTML_BYTES, m as MAX_WS_PAYLOAD_BYTES, o as DEFAULT_STT_SAMPLE_RATE, p as MAX_VALUE_SIZE, s as DEFAULT_TTS_SAMPLE_RATE, t as AGENT_CSP } from "../constants-y68COEGj.js";
|
|
3
|
+
import { i as toolError, n as errorDetail, r as errorMessage, t as parseWsUpgradeParams } from "../ws-upgrade-CG8-by1n.js";
|
|
4
4
|
import { ClientMessageSchema, VectorRequestSchema, buildReadyConfig, lenientParse } from "../sdk/protocol.js";
|
|
5
|
-
import { a as toAgentConfig, c as makeSttError, i as agentToolsToSchemas, l as makeTtsError, n as EMPTY_PARAMS, s as assertProviderTriple } from "../_internal-types-
|
|
5
|
+
import { a as toAgentConfig, c as makeSttError, i as agentToolsToSchemas, l as makeTtsError, n as EMPTY_PARAMS, s as assertProviderTriple } from "../_internal-types-CfOAbK6V.js";
|
|
6
6
|
import { a as MISTRAL_KIND, d as ANTHROPIC_KIND, l as GOOGLE_KIND, r as OPENAI_KIND, s as GROQ_KIND } from "../xai-BDI61Y2M.js";
|
|
7
7
|
import { a as DEEPGRAM_KIND, r as ELEVENLABS_KIND, s as ASSEMBLYAI_KIND, t as SONIOX_KIND } from "../soniox-BQdL0mB5.js";
|
|
8
8
|
import { a as CARTESIA_KIND, n as RIME_KIND } from "../rime-58p9mDR8.js";
|
|
9
9
|
import { a as MEMORY_KV_KIND, r as REDIS_KV_KIND } from "../s3-BtCMvCod.js";
|
|
10
10
|
import { r as IN_MEMORY_VECTOR_KIND, t as PINECONE_VECTOR_KIND } from "../pinecone-CeJ69aRs.js";
|
|
11
|
+
import "../openai-realtime-cjPAHMMx.js";
|
|
11
12
|
import { createRequire } from "node:module";
|
|
12
13
|
import { z } from "zod";
|
|
13
14
|
import { convert } from "html-to-text";
|
|
@@ -35,20 +36,12 @@ import path from "node:path";
|
|
|
35
36
|
import escapeHtml from "escape-html";
|
|
36
37
|
import { lookup } from "mime-types";
|
|
37
38
|
//#region host/_run-code.ts
|
|
38
|
-
/**
|
|
39
|
-
* run_code built-in tool — executes user JavaScript in a fresh `node:vm`
|
|
40
|
-
* context with no network, filesystem, or process access.
|
|
41
|
-
*/
|
|
42
39
|
const SKIPPED_CLASS_KEYS = new Set([
|
|
43
40
|
"constructor",
|
|
44
41
|
"prototype",
|
|
45
42
|
"length",
|
|
46
43
|
"name"
|
|
47
44
|
]);
|
|
48
|
-
/**
|
|
49
|
-
* Copy static members from a class constructor to a wrapper function,
|
|
50
|
-
* skipping built-in keys that must not be forwarded.
|
|
51
|
-
*/
|
|
52
45
|
function copyStaticMembers(src, dst) {
|
|
53
46
|
for (const key of Object.getOwnPropertyNames(src)) {
|
|
54
47
|
if (SKIPPED_CLASS_KEYS.has(key)) continue;
|
|
@@ -59,16 +52,10 @@ function copyStaticMembers(src, dst) {
|
|
|
59
52
|
}
|
|
60
53
|
}
|
|
61
54
|
/**
|
|
62
|
-
*
|
|
63
|
-
*
|
|
64
|
-
*
|
|
65
|
-
*
|
|
66
|
-
*
|
|
67
|
-
* For class constructors: additionally copies static methods and neutralizes
|
|
68
|
-
* `prototype.constructor` so instances created via `new` also cannot escape.
|
|
69
|
-
*
|
|
70
|
-
* This prevents sandbox code from reaching the host `Function` constructor
|
|
71
|
-
* via patterns like `fn.constructor.constructor('return process')()`.
|
|
55
|
+
* Prevents sandbox code from reaching the host `Function` constructor via
|
|
56
|
+
* `fn.constructor.constructor('return process')()`. For class constructors
|
|
57
|
+
* we also copy static members and neuter `prototype.constructor` so
|
|
58
|
+
* instances created via `new` cannot escape either.
|
|
72
59
|
*/
|
|
73
60
|
function neutralizeConstructor(fn) {
|
|
74
61
|
const hasPrototype = typeof fn.prototype === "object" && fn.prototype !== null;
|
|
@@ -92,19 +79,6 @@ function neutralizeConstructor(fn) {
|
|
|
92
79
|
return Wrapper;
|
|
93
80
|
}
|
|
94
81
|
const runCodeParams = z.object({ code: z.string().describe("JavaScript code to execute. Use console.log() for output.") });
|
|
95
|
-
/**
|
|
96
|
-
* Execute JavaScript code inside a fresh `node:vm` context.
|
|
97
|
-
*
|
|
98
|
-
* Each invocation creates a disposable VM context with:
|
|
99
|
-
* - No filesystem access (`node:fs` and other built-ins unavailable)
|
|
100
|
-
* - No network access (`fetch`, `http` unavailable)
|
|
101
|
-
* - No child process spawning
|
|
102
|
-
* - No environment variable access (`process` unavailable)
|
|
103
|
-
* - Execution timeout (default 5 s)
|
|
104
|
-
*
|
|
105
|
-
* The context is discarded after execution, so no state leaks between
|
|
106
|
-
* invocations or across sessions.
|
|
107
|
-
*/
|
|
108
82
|
function createRunCode() {
|
|
109
83
|
return {
|
|
110
84
|
guidance: "You MUST use the run_code tool for ANY question involving math, counting, calculations, data processing, or code. NEVER do mental math or recite code verbally. run_code executes JavaScript (not Python). Always write JavaScript.",
|
|
@@ -115,14 +89,6 @@ function createRunCode() {
|
|
|
115
89
|
}
|
|
116
90
|
};
|
|
117
91
|
}
|
|
118
|
-
/**
|
|
119
|
-
* Execute user code in a fresh `node:vm` context.
|
|
120
|
-
*
|
|
121
|
-
* @remarks
|
|
122
|
-
* The VM context only exposes standard ECMAScript globals and a console
|
|
123
|
-
* object that captures output. Node.js APIs (`process`, `require`,
|
|
124
|
-
* `import()`) are not available inside the sandbox.
|
|
125
|
-
*/
|
|
126
92
|
async function executeInIsolate(code) {
|
|
127
93
|
const output = [];
|
|
128
94
|
const capture = (...args) => output.push(args.map(String).join(" "));
|
|
@@ -258,12 +224,11 @@ function createVisitWebpage(fetchFn = globalThis.fetch) {
|
|
|
258
224
|
error: `Failed to fetch: ${resp.status} ${resp.statusText}`,
|
|
259
225
|
url
|
|
260
226
|
};
|
|
261
|
-
const
|
|
262
|
-
const text = htmlToText(htmlContent.length > 2e5 ? htmlContent.slice(0, MAX_HTML_BYTES) : htmlContent);
|
|
227
|
+
const text = htmlToText((await resp.text()).slice(0, MAX_HTML_BYTES));
|
|
263
228
|
const truncated = text.length > MAX_PAGE_CHARS;
|
|
264
229
|
return {
|
|
265
230
|
url,
|
|
266
|
-
content:
|
|
231
|
+
content: text.slice(0, MAX_PAGE_CHARS),
|
|
267
232
|
...truncated ? {
|
|
268
233
|
truncated: true,
|
|
269
234
|
totalChars: text.length
|
|
@@ -323,7 +288,6 @@ function createFetchJson(fetchFn = globalThis.fetch) {
|
|
|
323
288
|
}
|
|
324
289
|
};
|
|
325
290
|
}
|
|
326
|
-
/** Resolve a builtin name to an array of [toolName, ToolDef] pairs. */
|
|
327
291
|
function resolveBuiltin(name, opts) {
|
|
328
292
|
switch (name) {
|
|
329
293
|
case "web_search": return [["web_search", createWebSearch(opts?.fetch)]];
|
|
@@ -349,8 +313,7 @@ function resolveAllBuiltins(names, opts) {
|
|
|
349
313
|
description: def.description,
|
|
350
314
|
parameters: z.toJSONSchema(def.parameters ?? EMPTY_PARAMS)
|
|
351
315
|
});
|
|
352
|
-
|
|
353
|
-
if (g) guidance.push(g);
|
|
316
|
+
if (def.guidance) guidance.push(def.guidance);
|
|
354
317
|
}
|
|
355
318
|
return {
|
|
356
319
|
defs,
|
|
@@ -360,16 +323,7 @@ function resolveAllBuiltins(names, opts) {
|
|
|
360
323
|
}
|
|
361
324
|
//#endregion
|
|
362
325
|
//#region host/memory-vector.ts
|
|
363
|
-
|
|
364
|
-
* In-memory Vector implementation.
|
|
365
|
-
*
|
|
366
|
-
* INTENTIONALLY BAD QUALITY. Pseudo-embedding hashes the text into a
|
|
367
|
-
* 64-dim Float32Array of values in [-1, ~0.99], then L2-normalizes
|
|
368
|
-
* the result. Because both stored and probe vectors are unit-length,
|
|
369
|
-
* cosine similarity reduces to a plain dot product — that's what
|
|
370
|
-
* `cosine()` computes. Used only for `aai dev` and tests — the goal
|
|
371
|
-
* is proving tool wiring, not retrieval ranking.
|
|
372
|
-
*/
|
|
326
|
+
const DIM = 64;
|
|
373
327
|
const stores = /* @__PURE__ */ new Map();
|
|
374
328
|
function getStore(ns) {
|
|
375
329
|
let store = stores.get(ns);
|
|
@@ -379,13 +333,14 @@ function getStore(ns) {
|
|
|
379
333
|
}
|
|
380
334
|
return store;
|
|
381
335
|
}
|
|
382
|
-
const DIM = 64;
|
|
383
336
|
function pseudoEmbed(text) {
|
|
384
337
|
const out = new Float32Array(DIM);
|
|
385
338
|
const h1 = createHash("sha256").update(text).digest();
|
|
386
339
|
const h2 = createHash("sha256").update(h1).digest();
|
|
387
|
-
for (let i = 0; i < 32; i++)
|
|
388
|
-
|
|
340
|
+
for (let i = 0; i < 32; i++) {
|
|
341
|
+
out[i] = (h1[i] - 128) / 128;
|
|
342
|
+
out[i + 32] = (h2[i] - 128) / 128;
|
|
343
|
+
}
|
|
389
344
|
let norm = 0;
|
|
390
345
|
for (let i = 0; i < DIM; i++) norm += out[i] * out[i];
|
|
391
346
|
norm = Math.sqrt(norm) || 1;
|
|
@@ -421,12 +376,13 @@ function createMemoryVector(opts) {
|
|
|
421
376
|
const scored = [];
|
|
422
377
|
for (const [id, rec] of getStore(ns)) {
|
|
423
378
|
if (filter && !matches(rec.metadata, filter)) continue;
|
|
424
|
-
|
|
379
|
+
const match = {
|
|
425
380
|
id,
|
|
426
381
|
score: cosine(probe, rec.vec),
|
|
427
|
-
text: rec.text
|
|
428
|
-
|
|
429
|
-
|
|
382
|
+
text: rec.text
|
|
383
|
+
};
|
|
384
|
+
if (rec.metadata !== void 0) match.metadata = rec.metadata;
|
|
385
|
+
scored.push(match);
|
|
430
386
|
}
|
|
431
387
|
scored.sort((a, b) => b.score - a.score);
|
|
432
388
|
return scored.slice(0, topK);
|
|
@@ -440,24 +396,9 @@ function createMemoryVector(opts) {
|
|
|
440
396
|
}
|
|
441
397
|
//#endregion
|
|
442
398
|
//#region host/providers/stt/assemblyai.ts
|
|
443
|
-
/**
|
|
444
|
-
* AssemblyAI Universal-Streaming STT opener (host-only).
|
|
445
|
-
*
|
|
446
|
-
* The user-facing descriptor factory (`assemblyAI(...)`) lives in
|
|
447
|
-
* `sdk/providers/stt/assemblyai.ts`. This module is the host-side
|
|
448
|
-
* counterpart: it takes the descriptor options + an API key and
|
|
449
|
-
* returns an {@link SttOpener} that the pipeline session drives.
|
|
450
|
-
*
|
|
451
|
-
* Default model: `"u3pro-rt"` (Universal-3 Pro Real-Time). The adapter
|
|
452
|
-
* maps that to the SDK's `"u3-rt-pro"` `speechModel` value; any other
|
|
453
|
-
* string is forwarded verbatim.
|
|
454
|
-
*/
|
|
455
|
-
/** Translate the descriptor's model alias to the SDK's `speechModel` value. */
|
|
456
399
|
function resolveSpeechModel(model) {
|
|
457
|
-
|
|
458
|
-
return model;
|
|
400
|
+
return model === "u3pro-rt" ? "u3-rt-pro" : model;
|
|
459
401
|
}
|
|
460
|
-
/** Build an {@link SttOpener} from resolved AssemblyAI descriptor options. */
|
|
461
402
|
function openAssemblyAI(opts = {}) {
|
|
462
403
|
return {
|
|
463
404
|
name: "assemblyai",
|
|
@@ -476,17 +417,16 @@ function openAssemblyAI(opts = {}) {
|
|
|
476
417
|
transcriber.on("turn", (event) => {
|
|
477
418
|
if (closed) return;
|
|
478
419
|
const text = event.transcript ?? "";
|
|
479
|
-
if (
|
|
480
|
-
|
|
481
|
-
} else if (text.length > 0) emitter.emit("partial", text);
|
|
420
|
+
if (text.length === 0) return;
|
|
421
|
+
emitter.emit(event.end_of_turn ? "final" : "partial", text);
|
|
482
422
|
});
|
|
483
423
|
transcriber.on("error", (err) => {
|
|
484
424
|
if (closed) return;
|
|
485
425
|
emitter.emit("error", makeSttError("stt_stream_error", err?.message ?? String(err)));
|
|
486
426
|
});
|
|
487
427
|
transcriber.on("close", (code) => {
|
|
488
|
-
if (closed) return;
|
|
489
|
-
|
|
428
|
+
if (closed || code === 1e3) return;
|
|
429
|
+
emitter.emit("error", makeSttError("stt_stream_error", `socket closed ${code}`));
|
|
490
430
|
});
|
|
491
431
|
try {
|
|
492
432
|
await transcriber.connect();
|
|
@@ -505,8 +445,7 @@ function openAssemblyAI(opts = {}) {
|
|
|
505
445
|
return {
|
|
506
446
|
sendAudio(pcm) {
|
|
507
447
|
if (closed) return;
|
|
508
|
-
const copy = new Uint8Array(pcm.byteLength);
|
|
509
|
-
copy.set(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
|
|
448
|
+
const copy = new Uint8Array(pcm.buffer.slice(pcm.byteOffset, pcm.byteOffset + pcm.byteLength));
|
|
510
449
|
transcriber.sendAudio(copy.buffer);
|
|
511
450
|
},
|
|
512
451
|
on(event, fn) {
|
|
@@ -523,33 +462,18 @@ function openAssemblyAI(opts = {}) {
|
|
|
523
462
|
/**
|
|
524
463
|
* Deepgram Nova streaming STT opener (host-only).
|
|
525
464
|
*
|
|
526
|
-
*
|
|
527
|
-
* `
|
|
528
|
-
* counterpart: it takes the descriptor options + an API key and
|
|
529
|
-
* returns an {@link SttOpener} that the pipeline session drives.
|
|
530
|
-
*
|
|
531
|
-
* Default model: `"nova-3"`. Any string is forwarded verbatim to the SDK.
|
|
532
|
-
*
|
|
533
|
-
* This adapter targets the Deepgram SDK v5 (`@deepgram/sdk@^5`). The v5
|
|
534
|
-
* streaming API is:
|
|
535
|
-
* `client.listen.v1.connect(args)` → `Promise<V1Socket>`
|
|
536
|
-
* followed by:
|
|
537
|
-
* `socket.connect()` + `socket.waitForOpen()` to establish the connection.
|
|
538
|
-
*/
|
|
539
|
-
/**
|
|
540
|
-
* Handle an incoming Deepgram transcript message, emitting `partial` or
|
|
541
|
-
* `final` events on the emitter. Empty transcripts are silently dropped.
|
|
465
|
+
* Targets Deepgram SDK v5: `client.listen.v1.connect(args)` returns a
|
|
466
|
+
* socket; `socket.connect()` + `socket.waitForOpen()` establish it.
|
|
542
467
|
*/
|
|
468
|
+
function errMsg(cause) {
|
|
469
|
+
return cause instanceof Error ? cause.message : String(cause);
|
|
470
|
+
}
|
|
543
471
|
function handleMessage(data, closed, emitter) {
|
|
544
|
-
if (closed) return;
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
if (text.length > 0) emitter.emit("final", text);
|
|
550
|
-
} else if (text.length > 0) emitter.emit("partial", text);
|
|
551
|
-
}
|
|
552
|
-
/** Wire Deepgram socket events onto the nanoevents emitter. */
|
|
472
|
+
if (closed || data.type !== "Results") return;
|
|
473
|
+
const text = data.channel?.alternatives?.[0]?.transcript ?? "";
|
|
474
|
+
if (text.length === 0) return;
|
|
475
|
+
emitter.emit(data.is_final ? "final" : "partial", text);
|
|
476
|
+
}
|
|
553
477
|
function wireSocketEvents(connection, emitter, getIsClosed) {
|
|
554
478
|
connection.on("message", (data) => handleMessage(data, getIsClosed(), emitter));
|
|
555
479
|
connection.on("error", (err) => {
|
|
@@ -562,12 +486,13 @@ function wireSocketEvents(connection, emitter, getIsClosed) {
|
|
|
562
486
|
if (code !== void 0 && code !== 1e3) emitter.emit("error", makeSttError("stt_stream_error", `socket closed ${code}`));
|
|
563
487
|
});
|
|
564
488
|
}
|
|
565
|
-
/** Wire the AbortSignal to the close function. */
|
|
566
489
|
function wireAbortSignal(signal, close) {
|
|
567
|
-
if (signal.aborted)
|
|
568
|
-
|
|
490
|
+
if (signal.aborted) {
|
|
491
|
+
close();
|
|
492
|
+
return;
|
|
493
|
+
}
|
|
494
|
+
signal.addEventListener("abort", () => void close(), { once: true });
|
|
569
495
|
}
|
|
570
|
-
/** Build an {@link SttOpener} from resolved Deepgram descriptor options. */
|
|
571
496
|
function openDeepgram(opts = {}) {
|
|
572
497
|
return {
|
|
573
498
|
name: "deepgram",
|
|
@@ -590,7 +515,7 @@ function openDeepgram(opts = {}) {
|
|
|
590
515
|
Authorization: apiKey
|
|
591
516
|
});
|
|
592
517
|
} catch (cause) {
|
|
593
|
-
throw makeSttError("stt_connect_failed", `Deepgram STT: connect failed: ${
|
|
518
|
+
throw makeSttError("stt_connect_failed", `Deepgram STT: connect failed: ${errMsg(cause)}`);
|
|
594
519
|
}
|
|
595
520
|
const emitter = createNanoEvents();
|
|
596
521
|
let closed = false;
|
|
@@ -599,7 +524,7 @@ function openDeepgram(opts = {}) {
|
|
|
599
524
|
try {
|
|
600
525
|
await connection.waitForOpen();
|
|
601
526
|
} catch (cause) {
|
|
602
|
-
throw makeSttError("stt_connect_failed", `Deepgram STT: WebSocket open failed: ${
|
|
527
|
+
throw makeSttError("stt_connect_failed", `Deepgram STT: WebSocket open failed: ${errMsg(cause)}`);
|
|
603
528
|
}
|
|
604
529
|
const close = async () => {
|
|
605
530
|
if (closed) return;
|
|
@@ -671,15 +596,15 @@ function openElevenLabs(opts = {}) {
|
|
|
671
596
|
}
|
|
672
597
|
const emitter = createNanoEvents();
|
|
673
598
|
let closed = false;
|
|
674
|
-
|
|
599
|
+
function emitTranscript(event, text) {
|
|
675
600
|
if (closed) return;
|
|
676
|
-
|
|
677
|
-
|
|
601
|
+
if (text && text.length > 0) emitter.emit(event, text);
|
|
602
|
+
}
|
|
603
|
+
connection.on(RealtimeEvents.PARTIAL_TRANSCRIPT, (msg) => {
|
|
604
|
+
emitTranscript("partial", msg.text);
|
|
678
605
|
});
|
|
679
606
|
connection.on(RealtimeEvents.COMMITTED_TRANSCRIPT, (msg) => {
|
|
680
|
-
|
|
681
|
-
const text = msg.text ?? "";
|
|
682
|
-
if (text.length > 0) emitter.emit("final", text);
|
|
607
|
+
emitTranscript("final", msg.text);
|
|
683
608
|
});
|
|
684
609
|
connection.on(RealtimeEvents.ERROR, (payload) => {
|
|
685
610
|
if (closed) return;
|
|
@@ -690,13 +615,13 @@ function openElevenLabs(opts = {}) {
|
|
|
690
615
|
if (closed) return;
|
|
691
616
|
emitter.emit("error", makeSttError("stt_auth_failed", msg.error));
|
|
692
617
|
});
|
|
693
|
-
|
|
618
|
+
async function close() {
|
|
694
619
|
if (closed) return;
|
|
695
620
|
closed = true;
|
|
696
621
|
try {
|
|
697
622
|
connection.close();
|
|
698
623
|
} catch {}
|
|
699
|
-
}
|
|
624
|
+
}
|
|
700
625
|
if (openOpts.signal.aborted) close();
|
|
701
626
|
else openOpts.signal.addEventListener("abort", () => void close(), { once: true });
|
|
702
627
|
return {
|
|
@@ -715,32 +640,7 @@ function openElevenLabs(opts = {}) {
|
|
|
715
640
|
}
|
|
716
641
|
//#endregion
|
|
717
642
|
//#region host/providers/stt/soniox.ts
|
|
718
|
-
/**
|
|
719
|
-
* Soniox real-time STT opener (host-only).
|
|
720
|
-
*
|
|
721
|
-
* The user-facing descriptor factory (`soniox(...)`) lives in
|
|
722
|
-
* `sdk/providers/stt/soniox.ts`. This module is the host-side
|
|
723
|
-
* counterpart: it takes the descriptor options + an API key and
|
|
724
|
-
* returns an {@link SttOpener} that the pipeline session drives.
|
|
725
|
-
*
|
|
726
|
-
* Soniox's published JS client (`@soniox/speech-to-text-web`) is
|
|
727
|
-
* browser-only — it depends on `MediaRecorder` and `getUserMedia`. For
|
|
728
|
-
* server-side use we talk to the WebSocket directly:
|
|
729
|
-
* `wss://stt-rt.soniox.com/transcribe-websocket`
|
|
730
|
-
*
|
|
731
|
-
* Wire format:
|
|
732
|
-
* - First text frame: JSON config with api_key, model, audio_format,
|
|
733
|
-
* sample_rate, num_channels (and optional language hints).
|
|
734
|
-
* - Subsequent binary frames: 16-bit signed little-endian PCM audio.
|
|
735
|
-
* - Server replies: JSON `{ tokens: [{ text, is_final }] }` messages.
|
|
736
|
-
* Final tokens accumulate; non-final tokens are a rolling preview.
|
|
737
|
-
* - On error: `{ error_code, error_message }`.
|
|
738
|
-
*/
|
|
739
643
|
const SONIOX_WS_URL = "wss://stt-rt.soniox.com/transcribe-websocket";
|
|
740
|
-
/**
|
|
741
|
-
* Walk a batch of Soniox tokens, sending finals into `appendFinal` and
|
|
742
|
-
* returning the concatenated non-finals as a rolling preview string.
|
|
743
|
-
*/
|
|
744
644
|
function consumeTokens(tokens, appendFinal) {
|
|
745
645
|
let nonFinal = "";
|
|
746
646
|
for (const tok of tokens) {
|
|
@@ -751,7 +651,6 @@ function consumeTokens(tokens, appendFinal) {
|
|
|
751
651
|
}
|
|
752
652
|
return nonFinal;
|
|
753
653
|
}
|
|
754
|
-
/** Resolve once the WebSocket opens; reject on the first error. */
|
|
755
654
|
function waitForOpen$1(ws) {
|
|
756
655
|
return new Promise((resolve, reject) => {
|
|
757
656
|
const onOpen = () => {
|
|
@@ -766,7 +665,6 @@ function waitForOpen$1(ws) {
|
|
|
766
665
|
ws.once("error", onErr);
|
|
767
666
|
});
|
|
768
667
|
}
|
|
769
|
-
/** Build the initial JSON config frame for a Soniox session. */
|
|
770
668
|
function buildConfigFrame(apiKey, opts, sampleRate) {
|
|
771
669
|
const config = {
|
|
772
670
|
api_key: apiKey,
|
|
@@ -778,7 +676,6 @@ function buildConfigFrame(apiKey, opts, sampleRate) {
|
|
|
778
676
|
if (opts.languageHints && opts.languageHints.length > 0) config.language_hints = [...opts.languageHints];
|
|
779
677
|
return config;
|
|
780
678
|
}
|
|
781
|
-
/** Parse a Soniox text frame into a {@link SonioxResponse}; returns null on garbage. */
|
|
782
679
|
function parseFrame(raw) {
|
|
783
680
|
try {
|
|
784
681
|
return JSON.parse(raw.toString());
|
|
@@ -786,12 +683,6 @@ function parseFrame(raw) {
|
|
|
786
683
|
return null;
|
|
787
684
|
}
|
|
788
685
|
}
|
|
789
|
-
/**
|
|
790
|
-
* Handle one server response. Emits `error`, `final`, and `partial` events
|
|
791
|
-
* onto `emitter` based on the token batch and the running `finalBuf`. The
|
|
792
|
-
* caller owns `finalBuf` so it survives across messages and can be flushed
|
|
793
|
-
* on close.
|
|
794
|
-
*/
|
|
795
686
|
function handleResponse(res, emitter, finalBuf) {
|
|
796
687
|
if (res.error_code !== void 0) {
|
|
797
688
|
emitter.emit("error", makeSttError("stt_stream_error", `Soniox error ${res.error_code}: ${res.error_message ?? "unknown"}`));
|
|
@@ -807,7 +698,6 @@ function handleResponse(res, emitter, finalBuf) {
|
|
|
807
698
|
}
|
|
808
699
|
if (nonFinal.length > 0) emitter.emit("partial", nonFinal);
|
|
809
700
|
}
|
|
810
|
-
/** Build an {@link SttOpener} from resolved Soniox descriptor options. */
|
|
811
701
|
function openSoniox(opts = {}) {
|
|
812
702
|
return {
|
|
813
703
|
name: "soniox",
|
|
@@ -918,8 +808,7 @@ function openCartesia(opts) {
|
|
|
918
808
|
}
|
|
919
809
|
const emitter = createNanoEvents();
|
|
920
810
|
let closed = false;
|
|
921
|
-
|
|
922
|
-
const mintContext = () => ws.context({
|
|
811
|
+
const audioConfig = {
|
|
923
812
|
model_id: model,
|
|
924
813
|
voice: {
|
|
925
814
|
mode: "id",
|
|
@@ -929,39 +818,32 @@ function openCartesia(opts) {
|
|
|
929
818
|
container: "raw",
|
|
930
819
|
encoding: "pcm_s16le",
|
|
931
820
|
sample_rate: sampleRate
|
|
932
|
-
}
|
|
821
|
+
}
|
|
822
|
+
};
|
|
823
|
+
const baseRequest = {
|
|
824
|
+
...audioConfig,
|
|
825
|
+
language
|
|
826
|
+
};
|
|
827
|
+
const mintContext = () => ws.context({
|
|
828
|
+
...audioConfig,
|
|
933
829
|
contextId: randomUUID()
|
|
934
830
|
});
|
|
935
831
|
let context = mintContext();
|
|
936
|
-
/**
|
|
937
|
-
* `doneEmitted` guards against emitting `done` more than once per turn.
|
|
938
|
-
* Reset whenever a fresh context is minted (i.e. at turn boundaries).
|
|
939
|
-
*/
|
|
940
832
|
let doneEmitted = false;
|
|
941
|
-
/**
|
|
942
|
-
* After `flush()` or `cancel()`, the current context is done accepting
|
|
943
|
-
* input. We defer minting a fresh one until the next `sendText()` so
|
|
944
|
-
* that late audio chunks + Cartesia's real `done` event (both tagged
|
|
945
|
-
* with the flushed context's id) still pass the filter below. Rotating
|
|
946
|
-
* eagerly would silently drop all audio still in flight.
|
|
947
|
-
*/
|
|
948
833
|
let rotatePending = false;
|
|
949
|
-
const
|
|
834
|
+
const rotateIfPending = () => {
|
|
835
|
+
if (!rotatePending) return;
|
|
950
836
|
context = mintContext();
|
|
951
837
|
doneEmitted = false;
|
|
952
838
|
rotatePending = false;
|
|
953
839
|
};
|
|
954
|
-
const rotateIfPending = () => {
|
|
955
|
-
if (rotatePending) rotateContext();
|
|
956
|
-
};
|
|
957
840
|
const emitDoneOnce = () => {
|
|
958
841
|
if (doneEmitted || closed) return;
|
|
959
842
|
doneEmitted = true;
|
|
960
843
|
emitter.emit("done");
|
|
961
844
|
};
|
|
962
845
|
ws.on("chunk", (event) => {
|
|
963
|
-
if (closed) return;
|
|
964
|
-
if (event.context_id !== context.contextId) return;
|
|
846
|
+
if (closed || event.context_id !== context.contextId) return;
|
|
965
847
|
const buf = event.audio;
|
|
966
848
|
if (!buf || buf.byteLength === 0) return;
|
|
967
849
|
const evenBytes = buf.byteLength - buf.byteLength % 2;
|
|
@@ -970,8 +852,7 @@ function openCartesia(opts) {
|
|
|
970
852
|
emitter.emit("audio", pcm);
|
|
971
853
|
});
|
|
972
854
|
ws.on("done", (event) => {
|
|
973
|
-
if (closed) return;
|
|
974
|
-
if (event.context_id !== context.contextId) return;
|
|
855
|
+
if (closed || event.context_id !== context.contextId) return;
|
|
975
856
|
emitDoneOnce();
|
|
976
857
|
});
|
|
977
858
|
ws.on("error", (err) => {
|
|
@@ -990,19 +871,6 @@ function openCartesia(opts) {
|
|
|
990
871
|
};
|
|
991
872
|
if (openOpts.signal.aborted) close();
|
|
992
873
|
else openOpts.signal.addEventListener("abort", () => void close(), { once: true });
|
|
993
|
-
const baseRequest = {
|
|
994
|
-
model_id: model,
|
|
995
|
-
voice: {
|
|
996
|
-
mode: "id",
|
|
997
|
-
id: voice
|
|
998
|
-
},
|
|
999
|
-
output_format: {
|
|
1000
|
-
container: "raw",
|
|
1001
|
-
encoding: "pcm_s16le",
|
|
1002
|
-
sample_rate: sampleRate
|
|
1003
|
-
},
|
|
1004
|
-
language
|
|
1005
|
-
};
|
|
1006
874
|
const ignoreRejection = (_err) => {};
|
|
1007
875
|
return {
|
|
1008
876
|
sendText(text) {
|
|
@@ -1044,38 +912,18 @@ function openCartesia(opts) {
|
|
|
1044
912
|
/**
|
|
1045
913
|
* Rime TTS opener (host-only).
|
|
1046
914
|
*
|
|
1047
|
-
*
|
|
1048
|
-
*
|
|
1049
|
-
*
|
|
1050
|
-
*
|
|
1051
|
-
*
|
|
1052
|
-
*
|
|
1053
|
-
* (`wss://users-ws.rime.ai/ws2`). Client-to-server messages are JSON:
|
|
1054
|
-
* - `{ "text": "..." }` — append text to the synthesis buffer
|
|
1055
|
-
* - `{ "operation": "clear" }` — drop buffered text (barge-in)
|
|
1056
|
-
* - `{ "operation": "eos" }` — drain buffer, close connection (NOT used
|
|
1057
|
-
* during a session: it would tear down the WS, forcing reconnect per
|
|
1058
|
-
* turn). We force end-of-turn synthesis with a trailing `"."` instead.
|
|
1059
|
-
* The server responds with JSON frames:
|
|
1060
|
-
* - `{ type: "chunk", data: <base64 PCM16 LE>, contextId: string | null }`
|
|
1061
|
-
* - `{ type: "timestamps", ... }` (ignored)
|
|
1062
|
-
* - `{ type: "error", message: string }` (surfaced as `tts_stream_error`)
|
|
1063
|
-
*
|
|
1064
|
-
* **Single long-lived connection per session.** Rime buffers text until it
|
|
1065
|
-
* sees terminal punctuation (`.`, `?`, `!`), so we use one WebSocket per
|
|
1066
|
-
* `open()` call and reuse it across turns. `clear` resets the buffer
|
|
1067
|
-
* between cancellations.
|
|
915
|
+
* Connects to Rime's `ws2` JSON WebSocket endpoint with one long-lived
|
|
916
|
+
* connection per session. Client → server: `{ text }` appends to the
|
|
917
|
+
* synthesis buffer, `{ operation: "clear" }` drops it (barge-in). We never
|
|
918
|
+
* send `eos` since it tears down the WS — `flush()` instead sends a
|
|
919
|
+
* trailing `"."` to force synthesis of any text buffered behind missing
|
|
920
|
+
* terminal punctuation while keeping the connection reusable.
|
|
1068
921
|
*
|
|
1069
|
-
*
|
|
1070
|
-
*
|
|
1071
|
-
*
|
|
1072
|
-
*
|
|
1073
|
-
*
|
|
1074
|
-
* **Audio format.** The URL requests `audioFormat=pcm` at the negotiated
|
|
1075
|
-
* `sampleRate`, which returns raw PCM16 little-endian. We decode the base64
|
|
1076
|
-
* payload and construct a zero-copy `Int16Array` view over the decoded bytes.
|
|
922
|
+
* Server → client: `{ type: "chunk", data: <base64 PCM16 LE> }` carries
|
|
923
|
+
* audio; `timestamps` is ignored; `error` surfaces as `tts_stream_error`.
|
|
924
|
+
* The `audioFormat=pcm` query param at the negotiated `sampleRate` returns
|
|
925
|
+
* raw PCM16 LE that we view as a zero-copy `Int16Array`.
|
|
1077
926
|
*/
|
|
1078
|
-
/** PCM16 sample rates accepted by the Rime `ws2` endpoint. */
|
|
1079
927
|
const RIME_PCM16_RATES = [
|
|
1080
928
|
8e3,
|
|
1081
929
|
16e3,
|
|
@@ -1088,31 +936,14 @@ function assertSupportedSampleRate(rate) {
|
|
|
1088
936
|
if (RIME_PCM16_RATES.includes(rate)) return rate;
|
|
1089
937
|
throw makeTtsError("tts_connect_failed", `Rime TTS: unsupported sample rate ${rate}. Supported: ${RIME_PCM16_RATES.join(", ")}.`);
|
|
1090
938
|
}
|
|
1091
|
-
/**
|
|
1092
|
-
* Decode a base64 string from Rime into a zero-copy `Int16Array`.
|
|
1093
|
-
*
|
|
1094
|
-
* Rime's `ws2` endpoint returns base64-encoded PCM16 LE in each chunk.
|
|
1095
|
-
* `Buffer.from(base64, "base64")` gives us a Node.js Buffer (which is a
|
|
1096
|
-
* Uint8Array subclass) with `byteOffset === 0`. PCM16 bytes always come in
|
|
1097
|
-
* pairs so the length is guaranteed to be even.
|
|
1098
|
-
*/
|
|
1099
939
|
function base64ToPcm(data) {
|
|
1100
940
|
const bytes = Buffer.from(data, "base64");
|
|
1101
941
|
const evenLen = bytes.byteLength - bytes.byteLength % 2;
|
|
1102
942
|
if (evenLen === 0) return new Int16Array(0);
|
|
1103
943
|
return new Int16Array(bytes.buffer, bytes.byteOffset, evenLen / 2);
|
|
1104
944
|
}
|
|
1105
|
-
/** Quiescence timeout in ms — how long to wait after the last audio chunk before emitting `done`. */
|
|
1106
945
|
const QUIESCENCE_MS = 500;
|
|
1107
|
-
/**
|
|
1108
|
-
* After `flush()`, how long to wait for the FIRST audio chunk before
|
|
1109
|
-
* giving up and emitting `done`. Greeting and short replies hit this
|
|
1110
|
-
* path: `flush()` runs immediately after `sendText()`, so audio TTFB
|
|
1111
|
-
* exceeds the 500 ms quiescence window. Once the first chunk arrives,
|
|
1112
|
-
* we transition to the shorter quiescence timeout.
|
|
1113
|
-
*/
|
|
1114
946
|
const FIRST_AUDIO_TIMEOUT_MS = 5e3;
|
|
1115
|
-
/** Wait for the WebSocket `open` event; reject on first `error`. */
|
|
1116
947
|
function waitForOpen(ws) {
|
|
1117
948
|
return new Promise((resolve, reject) => {
|
|
1118
949
|
const onOpen = () => {
|
|
@@ -1127,12 +958,6 @@ function waitForOpen(ws) {
|
|
|
1127
958
|
ws.once("error", onError);
|
|
1128
959
|
});
|
|
1129
960
|
}
|
|
1130
|
-
/**
|
|
1131
|
-
* Handle one incoming WebSocket message frame.
|
|
1132
|
-
*
|
|
1133
|
-
* Extracted into a top-level function to keep `open()` under the cognitive
|
|
1134
|
-
* complexity limit while retaining full access to the session state via refs.
|
|
1135
|
-
*/
|
|
1136
961
|
function handleRimeMessage(raw, emitter, armQuiescence, isActiveTimer) {
|
|
1137
962
|
let msg;
|
|
1138
963
|
try {
|
|
@@ -1150,7 +975,6 @@ function handleRimeMessage(raw, emitter, armQuiescence, isActiveTimer) {
|
|
|
1150
975
|
}
|
|
1151
976
|
if (msg.type === "error") emitter.emit("error", makeTtsError("tts_stream_error", `Rime TTS: ${msg.message ?? "unknown error"}`));
|
|
1152
977
|
}
|
|
1153
|
-
/** Build a {@link TtsOpener} from resolved Rime descriptor options. */
|
|
1154
978
|
function openRime(opts) {
|
|
1155
979
|
return {
|
|
1156
980
|
name: "rime",
|
|
@@ -1172,12 +996,6 @@ function openRime(opts) {
|
|
|
1172
996
|
const emitter = createNanoEvents();
|
|
1173
997
|
let closed = false;
|
|
1174
998
|
let doneEmitted = false;
|
|
1175
|
-
/**
|
|
1176
|
-
* After `flush()`, we arm a timer that fires `done`. Initial timeout is
|
|
1177
|
-
* `FIRST_AUDIO_TIMEOUT_MS` to give Rime headroom on TTFB; the first
|
|
1178
|
-
* chunk swaps it for a shorter `QUIESCENCE_MS` window that resets on
|
|
1179
|
-
* each subsequent chunk. `cancel()` emits `done` synchronously.
|
|
1180
|
-
*/
|
|
1181
999
|
let quiescenceTimer = null;
|
|
1182
1000
|
const clearQuiescence = () => {
|
|
1183
1001
|
if (quiescenceTimer !== null) {
|
|
@@ -1271,21 +1089,24 @@ function openRime(opts) {
|
|
|
1271
1089
|
function resolveApiKey(envVar, env) {
|
|
1272
1090
|
return env[envVar] ?? process.env[envVar] ?? "";
|
|
1273
1091
|
}
|
|
1092
|
+
function options(descriptor) {
|
|
1093
|
+
return descriptor.options;
|
|
1094
|
+
}
|
|
1274
1095
|
/** Resolve an {@link SttProvider} descriptor into a host-side opener. */
|
|
1275
1096
|
function resolveStt(descriptor) {
|
|
1276
1097
|
switch (descriptor.kind) {
|
|
1277
|
-
case ASSEMBLYAI_KIND: return openAssemblyAI(descriptor
|
|
1278
|
-
case DEEPGRAM_KIND: return openDeepgram(descriptor
|
|
1279
|
-
case ELEVENLABS_KIND: return openElevenLabs(descriptor
|
|
1280
|
-
case SONIOX_KIND: return openSoniox(descriptor
|
|
1098
|
+
case ASSEMBLYAI_KIND: return openAssemblyAI(options(descriptor));
|
|
1099
|
+
case DEEPGRAM_KIND: return openDeepgram(options(descriptor));
|
|
1100
|
+
case ELEVENLABS_KIND: return openElevenLabs(options(descriptor));
|
|
1101
|
+
case SONIOX_KIND: return openSoniox(options(descriptor));
|
|
1281
1102
|
default: throw new Error(`Unknown STT provider kind: "${descriptor.kind}". Supported: ${ASSEMBLYAI_KIND}, ${DEEPGRAM_KIND}, ${ELEVENLABS_KIND}, ${SONIOX_KIND}.`);
|
|
1282
1103
|
}
|
|
1283
1104
|
}
|
|
1284
1105
|
/** Resolve a {@link TtsProvider} descriptor into a host-side opener. */
|
|
1285
1106
|
function resolveTts(descriptor) {
|
|
1286
1107
|
switch (descriptor.kind) {
|
|
1287
|
-
case CARTESIA_KIND: return openCartesia(descriptor
|
|
1288
|
-
case RIME_KIND: return openRime(descriptor
|
|
1108
|
+
case CARTESIA_KIND: return openCartesia(options(descriptor));
|
|
1109
|
+
case RIME_KIND: return openRime(options(descriptor));
|
|
1289
1110
|
default: throw new Error(`Unknown TTS provider kind: "${descriptor.kind}". Supported: ${CARTESIA_KIND}, ${RIME_KIND}.`);
|
|
1290
1111
|
}
|
|
1291
1112
|
}
|
|
@@ -1302,12 +1123,12 @@ function resolveLlm(descriptor, env) {
|
|
|
1302
1123
|
case ANTHROPIC_KIND: return createAnthropic({
|
|
1303
1124
|
apiKey: requireKey(env, "ANTHROPIC_API_KEY", "Anthropic"),
|
|
1304
1125
|
baseURL: "https://api.anthropic.com/v1"
|
|
1305
|
-
})(descriptor.
|
|
1306
|
-
case OPENAI_KIND: return createOpenAI({ apiKey: requireKey(env, "OPENAI_API_KEY", "OpenAI") })(descriptor.
|
|
1307
|
-
case GOOGLE_KIND: return createGoogleGenerativeAI({ apiKey: requireKey(env, "GOOGLE_GENERATIVE_AI_API_KEY", "Google") })(descriptor.
|
|
1308
|
-
case MISTRAL_KIND: return createMistral({ apiKey: requireKey(env, "MISTRAL_API_KEY", "Mistral") })(descriptor.
|
|
1309
|
-
case "xai": return createXai({ apiKey: requireKey(env, "XAI_API_KEY", "xAI") })(descriptor.
|
|
1310
|
-
case GROQ_KIND: return createGroq({ apiKey: requireKey(env, "GROQ_API_KEY", "Groq") })(descriptor.
|
|
1126
|
+
})(options(descriptor).model);
|
|
1127
|
+
case OPENAI_KIND: return createOpenAI({ apiKey: requireKey(env, "OPENAI_API_KEY", "OpenAI") })(options(descriptor).model);
|
|
1128
|
+
case GOOGLE_KIND: return createGoogleGenerativeAI({ apiKey: requireKey(env, "GOOGLE_GENERATIVE_AI_API_KEY", "Google") })(options(descriptor).model);
|
|
1129
|
+
case MISTRAL_KIND: return createMistral({ apiKey: requireKey(env, "MISTRAL_API_KEY", "Mistral") })(options(descriptor).model);
|
|
1130
|
+
case "xai": return createXai({ apiKey: requireKey(env, "XAI_API_KEY", "xAI") })(options(descriptor).model);
|
|
1131
|
+
case GROQ_KIND: return createGroq({ apiKey: requireKey(env, "GROQ_API_KEY", "Groq") })(options(descriptor).model);
|
|
1311
1132
|
default: throw new Error(`Unknown LLM provider kind: "${descriptor.kind}". Supported: ${ANTHROPIC_KIND}, ${OPENAI_KIND}, ${GOOGLE_KIND}, ${MISTRAL_KIND}, xai, ${GROQ_KIND}.`);
|
|
1312
1133
|
}
|
|
1313
1134
|
}
|
|
@@ -1321,8 +1142,9 @@ function loadProviderPackage(name, label) {
|
|
|
1321
1142
|
try {
|
|
1322
1143
|
return requireFromHere(name);
|
|
1323
1144
|
} catch (err) {
|
|
1324
|
-
|
|
1325
|
-
throw err;
|
|
1145
|
+
const code = err?.code;
|
|
1146
|
+
if (!(err instanceof Error && (code === "MODULE_NOT_FOUND" || code === "ERR_MODULE_NOT_FOUND") && err.message.includes(name))) throw err;
|
|
1147
|
+
throw new Error(`${label}: package \`${name}\` is not installed. Run \`pnpm add ${name}\`.`, { cause: err });
|
|
1326
1148
|
}
|
|
1327
1149
|
}
|
|
1328
1150
|
function requireKey(env, name, label) {
|
|
@@ -1334,67 +1156,42 @@ function requireKey(env, name, label) {
|
|
|
1334
1156
|
//#region host/pinecone-vector.ts
|
|
1335
1157
|
function createPineconeVector(opts) {
|
|
1336
1158
|
const { Pinecone } = loadProviderPackage("@pinecone-database/pinecone", "Pinecone Vector");
|
|
1337
|
-
const
|
|
1338
|
-
const ns = () => client.index(opts.index).namespace(opts.namespace);
|
|
1159
|
+
const ns = new Pinecone({ apiKey: opts.apiKey }).index(opts.index).namespace(opts.namespace);
|
|
1339
1160
|
return {
|
|
1340
1161
|
async upsert(id, text, metadata) {
|
|
1341
|
-
|
|
1162
|
+
await ns.upsertRecords([{
|
|
1342
1163
|
_id: id,
|
|
1343
1164
|
text,
|
|
1344
1165
|
...metadata ?? {}
|
|
1345
|
-
};
|
|
1346
|
-
await ns().upsertRecords([record]);
|
|
1166
|
+
}]);
|
|
1347
1167
|
},
|
|
1348
1168
|
async query(text, queryOpts) {
|
|
1349
|
-
const topK = queryOpts
|
|
1350
|
-
|
|
1169
|
+
const { topK = 5, filter } = queryOpts ?? {};
|
|
1170
|
+
return (await ns.searchRecords({
|
|
1351
1171
|
query: {
|
|
1352
1172
|
inputs: { text },
|
|
1353
1173
|
topK,
|
|
1354
|
-
...
|
|
1174
|
+
...filter !== void 0 ? { filter } : {}
|
|
1355
1175
|
},
|
|
1356
1176
|
fields: ["*"]
|
|
1357
|
-
}
|
|
1358
|
-
return (await ns().searchRecords(req)).result.hits.map((hit) => {
|
|
1177
|
+
})).result.hits.map((hit) => {
|
|
1359
1178
|
const { text: hitText, ...rest } = hit.fields;
|
|
1360
|
-
const
|
|
1361
|
-
return {
|
|
1179
|
+
const match = {
|
|
1362
1180
|
id: hit._id,
|
|
1363
1181
|
score: hit._score,
|
|
1364
|
-
text: typeof hitText === "string" ? hitText : ""
|
|
1365
|
-
...metadata !== void 0 ? { metadata } : {}
|
|
1182
|
+
text: typeof hitText === "string" ? hitText : ""
|
|
1366
1183
|
};
|
|
1184
|
+
if (Object.keys(rest).length > 0) match.metadata = rest;
|
|
1185
|
+
return match;
|
|
1367
1186
|
});
|
|
1368
1187
|
},
|
|
1369
1188
|
async delete(ids) {
|
|
1370
|
-
|
|
1371
|
-
await ns().deleteMany(list);
|
|
1189
|
+
await ns.deleteMany(Array.isArray(ids) ? ids : [ids]);
|
|
1372
1190
|
}
|
|
1373
1191
|
};
|
|
1374
1192
|
}
|
|
1375
1193
|
//#endregion
|
|
1376
1194
|
//#region host/unstorage-kv.ts
|
|
1377
|
-
/**
|
|
1378
|
-
* Key-value store backed by unstorage.
|
|
1379
|
-
*
|
|
1380
|
-
* Works with any unstorage driver (memory, fs, S3/R2, etc.).
|
|
1381
|
-
*/
|
|
1382
|
-
/**
|
|
1383
|
-
* Create a KV store backed by any unstorage driver.
|
|
1384
|
-
*
|
|
1385
|
-
* @param options - See {@link UnstorageKvOptions}.
|
|
1386
|
-
* @returns A {@link Kv} instance.
|
|
1387
|
-
*
|
|
1388
|
-
* @example
|
|
1389
|
-
* ```ts
|
|
1390
|
-
* import { createStorage } from "unstorage";
|
|
1391
|
-
* import { createUnstorageKv } from "@alexkroman1/aai/unstorage-kv";
|
|
1392
|
-
*
|
|
1393
|
-
* const kv = createUnstorageKv({ storage: createStorage() });
|
|
1394
|
-
* await kv.set("greeting", "hello");
|
|
1395
|
-
* const value = await kv.get<string>("greeting"); // "hello"
|
|
1396
|
-
* ```
|
|
1397
|
-
*/
|
|
1398
1195
|
function createUnstorageKv(options) {
|
|
1399
1196
|
const store = options.prefix ? prefixStorage(options.storage, options.prefix) : options.storage;
|
|
1400
1197
|
return {
|
|
@@ -1403,9 +1200,9 @@ function createUnstorageKv(options) {
|
|
|
1403
1200
|
},
|
|
1404
1201
|
async set(key, value, setOptions) {
|
|
1405
1202
|
if (JSON.stringify(value).length > 65536) throw new Error(`Value exceeds max size of ${MAX_VALUE_SIZE} bytes`);
|
|
1406
|
-
const
|
|
1407
|
-
|
|
1408
|
-
|
|
1203
|
+
const expireIn = setOptions?.expireIn;
|
|
1204
|
+
const ttlOption = expireIn && expireIn > 0 ? { ttl: Math.ceil(expireIn / 1e3) } : void 0;
|
|
1205
|
+
await store.setItem(key, value, ttlOption);
|
|
1409
1206
|
},
|
|
1410
1207
|
async delete(keys) {
|
|
1411
1208
|
const keyArray = Array.isArray(keys) ? keys : [keys];
|
|
@@ -1418,36 +1215,15 @@ function createUnstorageKv(options) {
|
|
|
1418
1215
|
}
|
|
1419
1216
|
//#endregion
|
|
1420
1217
|
//#region host/providers/resolve-kv.ts
|
|
1421
|
-
/**
|
|
1422
|
-
* Descriptor → concrete `Kv` resolver. Mirror of `resolveLlm` /
|
|
1423
|
-
* `resolveVector`. Always wraps the produced unstorage Storage in
|
|
1424
|
-
* `createUnstorageKv` with the provided per-tenant prefix so namespace
|
|
1425
|
-
* isolation is enforced regardless of backend choice.
|
|
1426
|
-
*/
|
|
1427
|
-
/**
|
|
1428
|
-
* Load a CJS unstorage driver factory. The CJS variants use
|
|
1429
|
-
* `module.exports = defineDriver(...)` so the require result is the
|
|
1430
|
-
* factory itself (not an object with `.default`).
|
|
1431
|
-
*
|
|
1432
|
-
* Delegates to loadProviderPackage (lazy-load via createRequire so the
|
|
1433
|
-
* driver is a true optional peer dep).
|
|
1434
|
-
*/
|
|
1435
1218
|
function loadDriver(modulePath, label) {
|
|
1436
1219
|
return loadProviderPackage(modulePath, `${label} KV: driver`);
|
|
1437
1220
|
}
|
|
1438
|
-
/**
|
|
1439
|
-
* Build a lazy unstorage Driver that defers loading the real driver
|
|
1440
|
-
* factory until the first I/O operation. This is necessary for drivers
|
|
1441
|
-
* whose peer dependencies (e.g. `ioredis`) may not be installed on the
|
|
1442
|
-
* host at startup — the missing package will only surface when the agent
|
|
1443
|
-
* actually performs KV operations, not at session creation time.
|
|
1444
|
-
*/
|
|
1445
1221
|
function makeLazyDriver(modulePath, label, opts) {
|
|
1446
1222
|
let resolved = null;
|
|
1447
|
-
|
|
1223
|
+
function get() {
|
|
1448
1224
|
if (!resolved) resolved = loadDriver(modulePath, label)(opts);
|
|
1449
1225
|
return resolved;
|
|
1450
|
-
}
|
|
1226
|
+
}
|
|
1451
1227
|
return {
|
|
1452
1228
|
name: label.toLowerCase(),
|
|
1453
1229
|
hasItem: (key, txOpts) => get().hasItem(key, txOpts),
|
|
@@ -1458,10 +1234,9 @@ function makeLazyDriver(modulePath, label, opts) {
|
|
|
1458
1234
|
removeItem: (key, txOpts) => get().removeItem?.(key, txOpts),
|
|
1459
1235
|
getKeys: (base, txOpts) => get().getKeys(base, txOpts),
|
|
1460
1236
|
clear: (base, txOpts) => get().clear?.(base, txOpts),
|
|
1461
|
-
dispose: () => resolved
|
|
1237
|
+
dispose: () => resolved?.dispose?.()
|
|
1462
1238
|
};
|
|
1463
1239
|
}
|
|
1464
|
-
/** Resolve a {@link KvProvider} descriptor into a {@link Kv}. */
|
|
1465
1240
|
function resolveKv(descriptor, env, prefix) {
|
|
1466
1241
|
switch (descriptor.kind) {
|
|
1467
1242
|
case MEMORY_KV_KIND: return createUnstorageKv({
|
|
@@ -1508,24 +1283,16 @@ function resolveKv(descriptor, env, prefix) {
|
|
|
1508
1283
|
}
|
|
1509
1284
|
//#endregion
|
|
1510
1285
|
//#region host/providers/resolve-vector.ts
|
|
1511
|
-
/**
|
|
1512
|
-
* Descriptor → concrete `Vector` resolver. Mirror of `resolveLlm`.
|
|
1513
|
-
*
|
|
1514
|
-
* Pulls API keys from the agent env so descriptors stay
|
|
1515
|
-
* secret-free. Lazy-loads provider SDKs via `createRequire` so
|
|
1516
|
-
* unused providers never enter the bundle.
|
|
1517
|
-
*/
|
|
1518
|
-
/** Resolve a {@link VectorProvider} descriptor into a {@link Vector}. */
|
|
1519
1286
|
function resolveVector(descriptor, env, namespace) {
|
|
1520
1287
|
switch (descriptor.kind) {
|
|
1521
1288
|
case IN_MEMORY_VECTOR_KIND: return createMemoryVector({ namespace });
|
|
1522
1289
|
case PINECONE_VECTOR_KIND: {
|
|
1523
1290
|
const apiKey = resolveApiKey("PINECONE_API_KEY", env);
|
|
1524
1291
|
if (!apiKey) throw new Error("Pinecone Vector: missing API key. Set PINECONE_API_KEY in the agent env.");
|
|
1525
|
-
const
|
|
1292
|
+
const { index } = descriptor.options;
|
|
1526
1293
|
return createPineconeVector({
|
|
1527
1294
|
apiKey,
|
|
1528
|
-
index
|
|
1295
|
+
index,
|
|
1529
1296
|
namespace
|
|
1530
1297
|
});
|
|
1531
1298
|
}
|
|
@@ -1534,14 +1301,13 @@ function resolveVector(descriptor, env, namespace) {
|
|
|
1534
1301
|
}
|
|
1535
1302
|
//#endregion
|
|
1536
1303
|
//#region sdk/system-prompt.ts
|
|
1537
|
-
|
|
1538
|
-
|
|
1539
|
-
|
|
1540
|
-
|
|
1541
|
-
|
|
1542
|
-
|
|
1543
|
-
|
|
1544
|
-
}
|
|
1304
|
+
const DATE_FORMAT_OPTIONS = {
|
|
1305
|
+
weekday: "long",
|
|
1306
|
+
year: "numeric",
|
|
1307
|
+
month: "long",
|
|
1308
|
+
day: "numeric"
|
|
1309
|
+
};
|
|
1310
|
+
const TOOL_PREAMBLE = "\n\nWhen you decide to use a tool, ALWAYS say a brief natural phrase BEFORE the tool call (e.g. \"Let me look that up\" or \"One moment while I check\"). This fills silence while the tool executes. Keep preambles to one short sentence.";
|
|
1545
1311
|
const VOICE_RULES = "\n\nCRITICAL OUTPUT RULES — you MUST follow these for EVERY response:\nYour response will be spoken aloud by a TTS system and displayed as plain text.\n- NEVER use markdown: no **, no *, no _, no #, no `, no [](), no ---\n- NEVER use bullet points (-, *, •) or numbered lists (1., 2.)\n- NEVER use code blocks or inline code\n- NEVER mention tools, search, APIs, or technical failures to the user. If a tool returns no results, just answer naturally without explaining why.\n- Write exactly as you would say it out loud to a friend\n- Use short conversational sentences. To list things, say \"First,\" \"Next,\" \"Finally,\"\n- Keep responses concise — 1 to 3 sentences max";
|
|
1546
1312
|
/**
|
|
1547
1313
|
* Build the system prompt sent to the LLM from the agent configuration.
|
|
@@ -1557,11 +1323,10 @@ const VOICE_RULES = "\n\nCRITICAL OUTPUT RULES — you MUST follow these for EVE
|
|
|
1557
1323
|
* @returns The assembled system prompt string.
|
|
1558
1324
|
*/
|
|
1559
1325
|
function buildSystemPrompt(config, opts) {
|
|
1560
|
-
const { hasTools } = opts;
|
|
1561
1326
|
const agentInstructions = config.systemPrompt && config.systemPrompt !== DEFAULT_SYSTEM_PROMPT ? `\n\nAgent-Specific Instructions:\n${config.systemPrompt}` : "";
|
|
1562
|
-
const toolPreamble = hasTools ?
|
|
1327
|
+
const toolPreamble = opts.hasTools ? TOOL_PREAMBLE : "";
|
|
1563
1328
|
const guidance = opts.toolGuidance && opts.toolGuidance.length > 0 ? `\n\nBuilt-in Tool Usage:\n${opts.toolGuidance.join("\n")}` : "";
|
|
1564
|
-
return DEFAULT_SYSTEM_PROMPT + `\n\nToday's date is ${
|
|
1329
|
+
return DEFAULT_SYSTEM_PROMPT + `\n\nToday's date is ${(/* @__PURE__ */ new Date()).toLocaleDateString("en-US", DATE_FORMAT_OPTIONS)}.` + agentInstructions + toolPreamble + guidance + (opts.voice ? VOICE_RULES : "");
|
|
1565
1330
|
}
|
|
1566
1331
|
//#endregion
|
|
1567
1332
|
//#region host/runtime-config.ts
|
|
@@ -1581,22 +1346,23 @@ const consoleLogger = {
|
|
|
1581
1346
|
error: consoleLog(console.error),
|
|
1582
1347
|
debug: consoleLog(console.debug)
|
|
1583
1348
|
};
|
|
1584
|
-
/**
|
|
1585
|
-
* Structured JSON logger for production diagnostics. Each log entry is a
|
|
1586
|
-
* single-line JSON object with `timestamp`, `level`, `msg`, and any
|
|
1587
|
-
* caller-provided context fields.
|
|
1588
|
-
*/
|
|
1589
1349
|
function jsonLog(level) {
|
|
1350
|
+
const out = level === "error" || level === "warn" ? process.stderr : process.stdout;
|
|
1590
1351
|
return (msg, ctx) => {
|
|
1591
1352
|
const entry = {
|
|
1592
1353
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1593
1354
|
level,
|
|
1594
|
-
msg
|
|
1355
|
+
msg,
|
|
1356
|
+
...ctx
|
|
1595
1357
|
};
|
|
1596
|
-
|
|
1597
|
-
(level === "error" || level === "warn" ? process.stderr : process.stdout).write(`${JSON.stringify(entry)}\n`);
|
|
1358
|
+
out.write(`${JSON.stringify(entry)}\n`);
|
|
1598
1359
|
};
|
|
1599
1360
|
}
|
|
1361
|
+
/**
|
|
1362
|
+
* Structured JSON logger for production diagnostics. Each log entry is a
|
|
1363
|
+
* single-line JSON object with `timestamp`, `level`, `msg`, and any
|
|
1364
|
+
* caller-provided context fields.
|
|
1365
|
+
*/
|
|
1600
1366
|
const jsonLogger = {
|
|
1601
1367
|
info: jsonLog("info"),
|
|
1602
1368
|
warn: jsonLog("warn"),
|
|
@@ -1615,15 +1381,16 @@ const REPLY_DONE_SLOW_THRESHOLD_MS = 50;
|
|
|
1615
1381
|
function createSessionCore(opts) {
|
|
1616
1382
|
const log = opts.logger ?? consoleLogger;
|
|
1617
1383
|
const maxHistory = opts.maxHistory ?? 200;
|
|
1618
|
-
const
|
|
1619
|
-
|
|
1620
|
-
|
|
1621
|
-
|
|
1622
|
-
|
|
1623
|
-
|
|
1624
|
-
|
|
1625
|
-
|
|
1626
|
-
}
|
|
1384
|
+
const rawIdleMs = opts.agentConfig.idleTimeoutMs ?? 3e5;
|
|
1385
|
+
const idleMs = rawIdleMs === 0 || !Number.isFinite(rawIdleMs) ? 0 : rawIdleMs;
|
|
1386
|
+
function emptyReply() {
|
|
1387
|
+
return {
|
|
1388
|
+
currentReplyId: null,
|
|
1389
|
+
pendingTools: [],
|
|
1390
|
+
toolCallCount: 0
|
|
1391
|
+
};
|
|
1392
|
+
}
|
|
1393
|
+
let reply = emptyReply();
|
|
1627
1394
|
let history = [];
|
|
1628
1395
|
let turnPromise = null;
|
|
1629
1396
|
let idleTimer = null;
|
|
@@ -1645,18 +1412,13 @@ function createSessionCore(opts) {
|
|
|
1645
1412
|
}
|
|
1646
1413
|
function beginReply(replyId) {
|
|
1647
1414
|
reply = {
|
|
1648
|
-
|
|
1649
|
-
|
|
1650
|
-
toolCallCount: 0
|
|
1415
|
+
...emptyReply(),
|
|
1416
|
+
currentReplyId: replyId
|
|
1651
1417
|
};
|
|
1652
1418
|
turnPromise = null;
|
|
1653
1419
|
}
|
|
1654
1420
|
function cancelReply() {
|
|
1655
|
-
reply =
|
|
1656
|
-
currentReplyId: null,
|
|
1657
|
-
pendingTools: [],
|
|
1658
|
-
toolCallCount: 0
|
|
1659
|
-
};
|
|
1421
|
+
reply = emptyReply();
|
|
1660
1422
|
}
|
|
1661
1423
|
function flushReply(startMs, hadTurnPromise) {
|
|
1662
1424
|
const stepsUsed = reply.toolCallCount;
|
|
@@ -1846,7 +1608,7 @@ function createSessionCore(opts) {
|
|
|
1846
1608
|
*/
|
|
1847
1609
|
const yieldTick = () => new Promise((r) => setTimeout(r, 0));
|
|
1848
1610
|
function buildToolContext(opts) {
|
|
1849
|
-
const { env, state, kv, vector, messages, sessionId } = opts;
|
|
1611
|
+
const { env, state, kv, vector, messages, sessionId, send } = opts;
|
|
1850
1612
|
return {
|
|
1851
1613
|
env,
|
|
1852
1614
|
state: state ?? {},
|
|
@@ -1861,14 +1623,21 @@ function buildToolContext(opts) {
|
|
|
1861
1623
|
messages: messages ?? [],
|
|
1862
1624
|
sessionId: sessionId ?? "",
|
|
1863
1625
|
send(event, data) {
|
|
1864
|
-
|
|
1626
|
+
send?.(event, data);
|
|
1865
1627
|
}
|
|
1866
1628
|
};
|
|
1867
1629
|
}
|
|
1630
|
+
function formatZodIssues(error) {
|
|
1631
|
+
return (error?.issues ?? []).map((i) => `${i.path.map(String).join(".")}: ${i.message}`).join(", ");
|
|
1632
|
+
}
|
|
1633
|
+
function stringifyResult(result) {
|
|
1634
|
+
if (result == null) return "null";
|
|
1635
|
+
return typeof result === "string" ? result : JSON.stringify(result);
|
|
1636
|
+
}
|
|
1868
1637
|
async function executeToolCall(name, args, options) {
|
|
1869
|
-
const { tool } = options;
|
|
1638
|
+
const { tool, logger } = options;
|
|
1870
1639
|
const parsed = (tool.parameters ?? EMPTY_PARAMS).safeParse(args);
|
|
1871
|
-
if (!parsed.success) return toolError(`Invalid arguments for tool "${name}": ${(parsed.error
|
|
1640
|
+
if (!parsed.success) return toolError(`Invalid arguments for tool "${name}": ${formatZodIssues(parsed.error)}`);
|
|
1872
1641
|
try {
|
|
1873
1642
|
const ctx = buildToolContext(options);
|
|
1874
1643
|
await yieldTick();
|
|
@@ -1877,11 +1646,9 @@ async function executeToolCall(name, args, options) {
|
|
|
1877
1646
|
message: `Tool "${name}" timed out after ${TOOL_EXECUTION_TIMEOUT_MS}ms`
|
|
1878
1647
|
});
|
|
1879
1648
|
await yieldTick();
|
|
1880
|
-
|
|
1881
|
-
return typeof result === "string" ? result : JSON.stringify(result);
|
|
1649
|
+
return stringifyResult(result);
|
|
1882
1650
|
} catch (err) {
|
|
1883
|
-
|
|
1884
|
-
if (log) log.warn("Tool execution failed", {
|
|
1651
|
+
if (logger) logger.warn("Tool execution failed", {
|
|
1885
1652
|
tool: name,
|
|
1886
1653
|
error: errorDetail(err)
|
|
1887
1654
|
});
|
|
@@ -1890,29 +1657,317 @@ async function executeToolCall(name, args, options) {
|
|
|
1890
1657
|
}
|
|
1891
1658
|
}
|
|
1892
1659
|
//#endregion
|
|
1660
|
+
//#region host/_base64.ts
|
|
1661
|
+
function uint8ToBase64(bytes) {
|
|
1662
|
+
return Buffer.from(bytes).toString("base64");
|
|
1663
|
+
}
|
|
1664
|
+
function base64ToUint8(base64) {
|
|
1665
|
+
return new Uint8Array(Buffer.from(base64, "base64"));
|
|
1666
|
+
}
|
|
1667
|
+
//#endregion
|
|
1668
|
+
//#region host/transports/openai-realtime-transport.ts
|
|
1669
|
+
const DEFAULT_MODEL = "gpt-realtime-2";
|
|
1670
|
+
const DEFAULT_VOICE = "alloy";
|
|
1671
|
+
const DEFAULT_URL = "wss://api.openai.com/v1/realtime";
|
|
1672
|
+
const defaultCreateOpenaiRealtimeWebSocket = (url, opts) => new WsWebSocket(url, { headers: opts.headers });
|
|
1673
|
+
function createOpenaiRealtimeTransport(opts) {
|
|
1674
|
+
const log = opts.logger ?? consoleLogger;
|
|
1675
|
+
const createWs = opts.createWebSocket ?? defaultCreateOpenaiRealtimeWebSocket;
|
|
1676
|
+
const model = opts.options.model ?? DEFAULT_MODEL;
|
|
1677
|
+
const voice = opts.options.voice ?? DEFAULT_VOICE;
|
|
1678
|
+
const baseUrl = opts.options.url ?? DEFAULT_URL;
|
|
1679
|
+
let ws = null;
|
|
1680
|
+
let closing = false;
|
|
1681
|
+
const agentTranscriptBuffers = /* @__PURE__ */ new Map();
|
|
1682
|
+
const toolBuffers = /* @__PURE__ */ new Map();
|
|
1683
|
+
let currentResponseId = null;
|
|
1684
|
+
let responseCreateQueued = false;
|
|
1685
|
+
function send(payload) {
|
|
1686
|
+
if (!ws || ws.readyState !== 1) {
|
|
1687
|
+
log.debug("OpenAI Realtime send dropped: socket not open", { type: payload.type });
|
|
1688
|
+
return;
|
|
1689
|
+
}
|
|
1690
|
+
ws.send(JSON.stringify(payload));
|
|
1691
|
+
}
|
|
1692
|
+
function sendGreeting() {
|
|
1693
|
+
if (opts.skipGreeting) return;
|
|
1694
|
+
const greeting = opts.sessionConfig.greeting;
|
|
1695
|
+
if (!greeting) return;
|
|
1696
|
+
send({
|
|
1697
|
+
type: "response.create",
|
|
1698
|
+
response: { instructions: `Say exactly: ${JSON.stringify(greeting)}` }
|
|
1699
|
+
});
|
|
1700
|
+
}
|
|
1701
|
+
function sendSessionUpdate() {
|
|
1702
|
+
send({
|
|
1703
|
+
type: "session.update",
|
|
1704
|
+
session: {
|
|
1705
|
+
type: "realtime",
|
|
1706
|
+
output_modalities: ["audio"],
|
|
1707
|
+
instructions: opts.sessionConfig.systemPrompt,
|
|
1708
|
+
audio: {
|
|
1709
|
+
input: {
|
|
1710
|
+
format: {
|
|
1711
|
+
type: "audio/pcm",
|
|
1712
|
+
rate: 24e3
|
|
1713
|
+
},
|
|
1714
|
+
turn_detection: { type: "server_vad" },
|
|
1715
|
+
transcription: { model: "whisper-1" }
|
|
1716
|
+
},
|
|
1717
|
+
output: {
|
|
1718
|
+
format: {
|
|
1719
|
+
type: "audio/pcm",
|
|
1720
|
+
rate: 24e3
|
|
1721
|
+
},
|
|
1722
|
+
voice
|
|
1723
|
+
}
|
|
1724
|
+
},
|
|
1725
|
+
tools: opts.toolSchemas,
|
|
1726
|
+
tool_choice: opts.toolChoice
|
|
1727
|
+
}
|
|
1728
|
+
});
|
|
1729
|
+
}
|
|
1730
|
+
async function start() {
|
|
1731
|
+
const url = `${baseUrl}?model=${encodeURIComponent(model)}`;
|
|
1732
|
+
log.info("OpenAI Realtime connecting", { url });
|
|
1733
|
+
return new Promise((resolve, reject) => {
|
|
1734
|
+
const sock = createWs(url, { headers: { Authorization: `Bearer ${opts.apiKey}` } });
|
|
1735
|
+
ws = sock;
|
|
1736
|
+
let opened = false;
|
|
1737
|
+
sock.addEventListener("open", () => {
|
|
1738
|
+
opened = true;
|
|
1739
|
+
sendSessionUpdate();
|
|
1740
|
+
sendGreeting();
|
|
1741
|
+
resolve();
|
|
1742
|
+
});
|
|
1743
|
+
sock.addEventListener("message", (ev) => handleMessage(ev.data));
|
|
1744
|
+
sock.addEventListener("close", (ev) => handleClose(ev.code ?? 0, ev.reason ?? ""));
|
|
1745
|
+
sock.addEventListener("error", (ev) => {
|
|
1746
|
+
const msg = typeof ev.message === "string" ? ev.message : "WebSocket error";
|
|
1747
|
+
if (!opened) {
|
|
1748
|
+
reject(new Error(msg));
|
|
1749
|
+
return;
|
|
1750
|
+
}
|
|
1751
|
+
if (closing) {
|
|
1752
|
+
log.info("OpenAI Realtime error during close", { error: msg });
|
|
1753
|
+
return;
|
|
1754
|
+
}
|
|
1755
|
+
opts.callbacks.onError("internal", msg);
|
|
1756
|
+
});
|
|
1757
|
+
});
|
|
1758
|
+
}
|
|
1759
|
+
function asString(v) {
|
|
1760
|
+
return typeof v === "string" ? v : "";
|
|
1761
|
+
}
|
|
1762
|
+
function handleAudioDelta(obj) {
|
|
1763
|
+
if (typeof obj.delta === "string") opts.callbacks.onAudioChunk(base64ToUint8(obj.delta));
|
|
1764
|
+
}
|
|
1765
|
+
function handleUserTranscript(obj) {
|
|
1766
|
+
if (typeof obj.transcript === "string") opts.callbacks.onUserTranscript(obj.transcript);
|
|
1767
|
+
}
|
|
1768
|
+
function handleResponseCreated(obj) {
|
|
1769
|
+
const resp = obj.response;
|
|
1770
|
+
const id = asString(resp?.id);
|
|
1771
|
+
currentResponseId = id;
|
|
1772
|
+
opts.callbacks.onReplyStarted(id);
|
|
1773
|
+
}
|
|
1774
|
+
function handleAgentTranscriptDelta(obj) {
|
|
1775
|
+
const id = asString(obj.item_id);
|
|
1776
|
+
const delta = asString(obj.delta);
|
|
1777
|
+
agentTranscriptBuffers.set(id, (agentTranscriptBuffers.get(id) ?? "") + delta);
|
|
1778
|
+
}
|
|
1779
|
+
function handleAgentTranscriptDone(obj) {
|
|
1780
|
+
const id = asString(obj.item_id);
|
|
1781
|
+
const text = agentTranscriptBuffers.get(id) ?? "";
|
|
1782
|
+
agentTranscriptBuffers.delete(id);
|
|
1783
|
+
if (text) opts.callbacks.onAgentTranscript(text, false);
|
|
1784
|
+
}
|
|
1785
|
+
function clearTurnBuffers() {
|
|
1786
|
+
agentTranscriptBuffers.clear();
|
|
1787
|
+
toolBuffers.clear();
|
|
1788
|
+
}
|
|
1789
|
+
function handleResponseDone() {
|
|
1790
|
+
currentResponseId = null;
|
|
1791
|
+
clearTurnBuffers();
|
|
1792
|
+
opts.callbacks.onReplyDone();
|
|
1793
|
+
}
|
|
1794
|
+
/**
 * Handle an `error` event from the server.
 *
 * Logs the raw error payload, clears the per-turn buffers, and reports the
 * failure through `onError` with code `"internal"`. The reported message is
 * `error.message` when that is a string, otherwise a generic fallback.
 *
 * @param {object} obj - Parsed server event.
 */
function handleErrorEvent(obj) {
  const { error } = obj;
  const detail = error?.message;
  const message = typeof detail === "string" ? detail : "OpenAI Realtime error";
  log.warn("OpenAI Realtime error event", { error });
  clearTurnBuffers();
  opts.callbacks.onError("internal", message);
}
|
|
1801
|
+
/**
 * Handle a `response.output_item.added` event.
 *
 * Every item is logged. Only `function_call` items that carry an `id`
 * start a tool buffer, seeded with the item's `call_id` and `name`
 * (`""` when absent) and an empty argument string.
 *
 * @param {object} obj - Parsed server event.
 */
function handleOutputItemAdded(obj) {
  const { item } = obj;
  log.info("OpenAI Realtime output_item.added", {
    itemType: item?.type,
    name: item?.name,
    callId: item?.call_id
  });
  const isFunctionCall = item?.type === "function_call";
  if (!isFunctionCall || !item.id) return;
  const pending = {
    callId: item.call_id ?? "",
    name: item.name ?? "",
    argsBuffer: ""
  };
  toolBuffers.set(item.id, pending);
}
|
|
1815
|
+
/**
 * Append a streamed argument fragment to the tool buffer of its item.
 * Fragments for items without a buffer are dropped.
 *
 * @param {object} obj - Parsed server event.
 */
function handleFunctionCallArgsDelta(obj) {
  const itemId = asString(obj.item_id);
  const fragment = asString(obj.delta);
  const pending = toolBuffers.get(itemId);
  if (!pending) return;
  pending.argsBuffer += fragment;
}
|
|
1821
|
+
/**
 * Parse the JSON argument string of a tool call into a plain object.
 *
 * Tool arguments must be a JSON object. Every other outcome yields `{}`:
 * an empty string silently, and both malformed JSON and JSON that parses
 * to a non-object (array, scalar, `null`) with a warning, so that
 * discarded arguments leave a trace in the log.
 *
 * @param {string} argsStr - Raw JSON text of the arguments.
 * @param {string} name - Tool name, used for log context only.
 * @param {string} callId - Tool call id, used for log context only.
 * @returns {object} The parsed arguments, or `{}` when unusable.
 */
function parseToolArgs(argsStr, name, callId) {
  if (!argsStr) return {};
  let parsed;
  try {
    parsed = JSON.parse(argsStr);
  } catch {
    log.warn("OpenAI Realtime: invalid tool args JSON", {
      name,
      callId
    });
    return {};
  }
  if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) return parsed;
  // Valid JSON of the wrong shape was previously discarded without any log.
  log.warn("OpenAI Realtime: tool args JSON is not an object", {
    name,
    callId
  });
  return {};
}
|
|
1834
|
+
/**
 * Complete a tool call once its arguments have finished streaming.
 *
 * The item's tool buffer is removed. The call id, name and argument text
 * each come from the event when it carries a non-empty string, otherwise
 * from the buffer, otherwise `""`. The call is logged, its arguments are
 * parsed with `parseToolArgs`, and the result goes to `onToolCall`.
 *
 * @param {object} obj - Parsed server event.
 */
function handleFunctionCallArgsDone(obj) {
  const itemId = asString(obj.item_id);
  const pending = toolBuffers.get(itemId);
  toolBuffers.delete(itemId);
  // Event field wins; the buffered value is only a fallback.
  const preferEvent = (fromEvent, fromBuffer) => asString(fromEvent) || (fromBuffer ?? "");
  const callId = preferEvent(obj.call_id, pending?.callId);
  const name = preferEvent(obj.name, pending?.name);
  const argsStr = preferEvent(obj.arguments, pending?.argsBuffer);
  log.info("OpenAI Realtime tool call", {
    name,
    callId,
    args: argsStr
  });
  opts.callbacks.onToolCall(callId, name, parseToolArgs(argsStr, name, callId));
}
|
|
1849
|
+
/**
 * Parse one raw WebSocket frame and route it to the matching handler.
 *
 * Frames that are not valid JSON are logged and dropped. JSON that is not
 * an object is dropped silently. Several event types are accepted under
 * two names (with and without the `output_` prefix) and share a handler.
 * Unknown types are logged at debug level.
 *
 * @param {unknown} data - Frame payload; stringified before parsing.
 */
function handleMessage(data) {
  let event;
  try {
    event = JSON.parse(String(data));
  } catch {
    log.warn("OpenAI Realtime: invalid JSON");
    return;
  }
  // JSON scalars and `null` carry no event type.
  if (event === null || typeof event !== "object") return;
  const { type } = event;
  switch (type) {
    // Audio output
    case "response.output_audio.delta":
    case "response.audio.delta":
      handleAudioDelta(event);
      break;
    case "response.output_audio.done":
    case "response.audio.done":
      opts.callbacks.onAudioDone();
      break;
    // Voice activity on the input side
    case "input_audio_buffer.speech_started":
      opts.callbacks.onSpeechStarted();
      break;
    case "input_audio_buffer.speech_stopped":
      opts.callbacks.onSpeechStopped();
      break;
    case "conversation.item.input_audio_transcription.completed":
      handleUserTranscript(event);
      break;
    // Response lifecycle and agent transcript
    case "response.created":
      handleResponseCreated(event);
      break;
    case "response.output_audio_transcript.delta":
    case "response.audio_transcript.delta":
      handleAgentTranscriptDelta(event);
      break;
    case "response.output_audio_transcript.done":
    case "response.audio_transcript.done":
      handleAgentTranscriptDone(event);
      break;
    case "response.done":
      handleResponseDone();
      break;
    // Tool calls
    case "response.output_item.added":
      handleOutputItemAdded(event);
      break;
    case "response.function_call_arguments.delta":
      handleFunctionCallArgsDelta(event);
      break;
    case "response.function_call_arguments.done":
      handleFunctionCallArgsDone(event);
      break;
    case "error":
      handleErrorEvent(event);
      break;
    default:
      log.debug("OpenAI Realtime: unhandled event", { type });
      break;
  }
}
|
|
1908
|
+
/**
 * React to the socket closing.
 *
 * A close while `closing` is set is only logged at info level. Any other
 * close is logged as a warning and reported through `onError` with code
 * `"connection"`.
 *
 * @param {number} code - Close code reported for the socket.
 * @param {string} reason - Close reason reported for the socket.
 */
function handleClose(code, reason) {
  const details = {
    code,
    reason
  };
  if (!closing) {
    log.warn("OpenAI Realtime closed unexpectedly", details);
    opts.callbacks.onError("connection", `OpenAI Realtime closed (code=${code})`);
    return;
  }
  log.info("OpenAI Realtime closed", details);
}
|
|
1922
|
+
/**
 * Shut the transport down deliberately.
 *
 * Sets `closing` before closing the socket, then drops the socket
 * reference. A call with no socket still sets `closing`.
 *
 * @returns {Promise<void>} Resolves once the close has been requested.
 */
async function stop() {
  closing = true;
  if (ws !== null && ws !== void 0) ws.close();
  ws = null;
}
|
|
1927
|
+
return {
|
|
1928
|
+
start,
|
|
1929
|
+
stop,
|
|
1930
|
+
sendUserAudio(bytes) {
|
|
1931
|
+
if (!ws || ws.readyState !== 1) return;
|
|
1932
|
+
ws.send(`{"type":"input_audio_buffer.append","audio":"${uint8ToBase64(bytes)}"}`);
|
|
1933
|
+
},
|
|
1934
|
+
sendToolResult(callId, result) {
|
|
1935
|
+
log.info("OpenAI Realtime sendToolResult", {
|
|
1936
|
+
callId,
|
|
1937
|
+
resultLen: result.length,
|
|
1938
|
+
preview: result.slice(0, 200)
|
|
1939
|
+
});
|
|
1940
|
+
send({
|
|
1941
|
+
type: "conversation.item.create",
|
|
1942
|
+
item: {
|
|
1943
|
+
type: "function_call_output",
|
|
1944
|
+
call_id: callId,
|
|
1945
|
+
output: result
|
|
1946
|
+
}
|
|
1947
|
+
});
|
|
1948
|
+
if (!responseCreateQueued) {
|
|
1949
|
+
responseCreateQueued = true;
|
|
1950
|
+
queueMicrotask(() => {
|
|
1951
|
+
responseCreateQueued = false;
|
|
1952
|
+
send({ type: "response.create" });
|
|
1953
|
+
});
|
|
1954
|
+
}
|
|
1955
|
+
},
|
|
1956
|
+
cancelReply() {
|
|
1957
|
+
if (currentResponseId === null) return;
|
|
1958
|
+
send({ type: "response.cancel" });
|
|
1959
|
+
currentResponseId = null;
|
|
1960
|
+
clearTurnBuffers();
|
|
1961
|
+
opts.callbacks.onCancelled();
|
|
1962
|
+
}
|
|
1963
|
+
};
|
|
1964
|
+
}
|
|
1965
|
+
//#endregion
|
|
1893
1966
|
//#region host/to-vercel-tools.ts
|
|
1894
1967
|
/**
|
|
1895
|
-
* Converts agent {@link ToolSchema}[] to Vercel AI SDK tools
|
|
1896
|
-
*
|
|
1897
|
-
*
|
|
1898
|
-
* The pipeline orchestrator passes the output to `streamText({ tools })`.
|
|
1899
|
-
* Each produced tool's `execute` closure calls
|
|
1900
|
-
* `ctx.executeTool(name, args, sessionId, messages(), { signal, toolCallId })`,
|
|
1901
|
-
* so the existing agent tool infrastructure (argument validation, KV, hooks,
|
|
1902
|
-
* timeout) remains the single source of truth for tool behavior.
|
|
1903
|
-
*
|
|
1904
|
-
* Per-call `options.abortSignal` (forwarded by `streamText` when the
|
|
1905
|
-
* outer turn is aborted, e.g. barge-in) takes precedence over the
|
|
1906
|
-
* bag-level `ctx.signal` so individual invocations respect streamText
|
|
1907
|
-
* aborts.
|
|
1908
|
-
*/
|
|
1909
|
-
/**
|
|
1910
|
-
* Convert an array of {@link ToolSchema} to a Vercel AI SDK `ToolSet`
|
|
1911
|
-
* (record keyed by tool name).
|
|
1912
|
-
*
|
|
1913
|
-
* Uses the v6 `tool()` helper with `inputSchema: jsonSchema(...)` wrapping
|
|
1914
|
-
* the agent's JSON Schema `parameters`. Execution is delegated to
|
|
1915
|
-
* `ctx.executeTool` so validation, KV, timeouts, and hooks keep working.
|
|
1968
|
+
* Converts agent {@link ToolSchema}[] to Vercel AI SDK tools, delegating
|
|
1969
|
+
* `execute` to the agent's {@link ExecuteTool} so validation, KV, hooks,
|
|
1970
|
+
* and timeouts remain the single source of truth for tool behavior.
|
|
1916
1971
|
*/
|
|
1917
1972
|
function toVercelTools(schemas, ctx) {
|
|
1918
1973
|
const out = {};
|
|
@@ -1925,7 +1980,8 @@ function toVercelTools(schemas, ctx) {
|
|
|
1925
1980
|
const opts = {};
|
|
1926
1981
|
if (signal !== void 0) opts.signal = signal;
|
|
1927
1982
|
if (options.toolCallId !== void 0) opts.toolCallId = options.toolCallId;
|
|
1928
|
-
|
|
1983
|
+
const history = ctx.messages().slice();
|
|
1984
|
+
return ctx.executeTool(schema.name, input, ctx.sessionId, history, opts);
|
|
1929
1985
|
}
|
|
1930
1986
|
});
|
|
1931
1987
|
return out;
|
|
@@ -1976,10 +2032,6 @@ function createPipelineTransport(opts) {
|
|
|
1976
2032
|
function emitError(code, message) {
|
|
1977
2033
|
callbacks.onError(code, message);
|
|
1978
2034
|
}
|
|
1979
|
-
/**
|
|
1980
|
-
* Tear down after an unrecoverable provider error. Aborts the in-flight
|
|
1981
|
-
* turn, cancels TTS, signals providers to close. Idempotent.
|
|
1982
|
-
*/
|
|
1983
2035
|
function terminate() {
|
|
1984
2036
|
if (terminated) return;
|
|
1985
2037
|
terminated = true;
|
|
@@ -2115,16 +2167,10 @@ function createPipelineTransport(opts) {
|
|
|
2115
2167
|
}
|
|
2116
2168
|
};
|
|
2117
2169
|
}
|
|
2118
|
-
/**
|
|
2119
|
-
* Flush TTS and wait for drain. Resolves on:
|
|
2120
|
-
* - TTS emits `done`
|
|
2121
|
-
* - `signal` aborts (barge-in / provider error / session stop)
|
|
2122
|
-
* - PIPELINE_FLUSH_TIMEOUT_MS elapses
|
|
2123
|
-
* Resolves immediately if no TTS session.
|
|
2124
|
-
*/
|
|
2125
2170
|
function flushTtsAndWait(signal) {
|
|
2126
2171
|
const tts = ttsSession;
|
|
2127
2172
|
if (!tts) return Promise.resolve();
|
|
2173
|
+
if (signal.aborted) return Promise.resolve();
|
|
2128
2174
|
return new Promise((resolve) => {
|
|
2129
2175
|
let off = null;
|
|
2130
2176
|
let timer = null;
|
|
@@ -2144,10 +2190,6 @@ function createPipelineTransport(opts) {
|
|
|
2144
2190
|
resolve();
|
|
2145
2191
|
};
|
|
2146
2192
|
const onAbort = () => finish();
|
|
2147
|
-
if (signal.aborted) {
|
|
2148
|
-
resolve();
|
|
2149
|
-
return;
|
|
2150
|
-
}
|
|
2151
2193
|
signal.addEventListener("abort", onAbort, { once: true });
|
|
2152
2194
|
off = tts.on("done", finish);
|
|
2153
2195
|
timer = setTimeout(() => {
|
|
@@ -2301,8 +2343,7 @@ function createPipelineTransport(opts) {
|
|
|
2301
2343
|
},
|
|
2302
2344
|
sendUserAudio(bytes) {
|
|
2303
2345
|
if (terminated || !audioReady) return;
|
|
2304
|
-
const offset = bytes
|
|
2305
|
-
const length = bytes.byteLength;
|
|
2346
|
+
const { byteOffset: offset, byteLength: length } = bytes;
|
|
2306
2347
|
let pcm;
|
|
2307
2348
|
if (offset % 2 === 0 && length % 2 === 0) pcm = new Int16Array(bytes.buffer, offset, length / 2);
|
|
2308
2349
|
else {
|
|
@@ -2323,15 +2364,16 @@ function createPipelineTransport(opts) {
|
|
|
2323
2364
|
}
|
|
2324
2365
|
//#endregion
|
|
2325
2366
|
//#region host/s2s.ts
|
|
2326
|
-
const uint8ToBase64 = (bytes) => Buffer.from(bytes).toString("base64");
|
|
2327
|
-
const base64ToUint8 = (base64) => new Uint8Array(Buffer.from(base64, "base64"));
|
|
2328
2367
|
const defaultCreateS2sWebSocket = (url, opts) => new WsWebSocket(url, { headers: opts.headers });
|
|
2329
2368
|
const S2sMessageSchema = z.discriminatedUnion("type", [
|
|
2330
2369
|
z.object({
|
|
2331
2370
|
type: z.literal("session.ready"),
|
|
2332
2371
|
session_id: z.string()
|
|
2333
2372
|
}).passthrough(),
|
|
2334
|
-
z.object({
|
|
2373
|
+
z.object({
|
|
2374
|
+
type: z.literal("session.updated"),
|
|
2375
|
+
config: z.object({ id: z.string().optional() }).passthrough().optional()
|
|
2376
|
+
}).passthrough(),
|
|
2335
2377
|
z.object({ type: z.literal("input.speech.started") }),
|
|
2336
2378
|
z.object({ type: z.literal("input.speech.stopped") }),
|
|
2337
2379
|
z.object({
|
|
@@ -2374,12 +2416,17 @@ function parseS2sMessage(obj) {
|
|
|
2374
2416
|
const result = S2sMessageSchema.safeParse(obj);
|
|
2375
2417
|
return result.success ? result.data : void 0;
|
|
2376
2418
|
}
|
|
2419
|
+
/**
 * Build the log fields identifying a session.
 *
 * @param {object} ctx - Dispatch context; only `sid` is read.
 * @returns {object} `{ sid }` when `ctx.sid` is defined, otherwise `{}`.
 */
function sidFields(ctx) {
  if (ctx.sid === void 0) return {};
  return { sid: ctx.sid };
}
|
|
2377
2422
|
function dispatchS2sMessage(callbacks, msg, state, ctx) {
|
|
2378
2423
|
switch (msg.type) {
|
|
2379
2424
|
case "session.ready":
|
|
2380
2425
|
callbacks.onSessionReady(msg.session_id);
|
|
2381
2426
|
break;
|
|
2382
|
-
case "session.updated":
|
|
2427
|
+
case "session.updated":
|
|
2428
|
+
if (msg.config?.id !== void 0) callbacks.onSessionReady(msg.config.id);
|
|
2429
|
+
break;
|
|
2383
2430
|
case "input.speech.started":
|
|
2384
2431
|
if (!state.speechActive) {
|
|
2385
2432
|
state.speechActive = true;
|
|
@@ -2406,13 +2453,18 @@ function dispatchS2sMessage(callbacks, msg, state, ctx) {
|
|
|
2406
2453
|
break;
|
|
2407
2454
|
case "reply.done":
|
|
2408
2455
|
ctx.log.info("S2S << reply.done", {
|
|
2409
|
-
...ctx
|
|
2456
|
+
...sidFields(ctx),
|
|
2410
2457
|
status: msg.status ?? "completed"
|
|
2411
2458
|
});
|
|
2412
2459
|
if (msg.status === "interrupted") callbacks.onCancelled();
|
|
2413
2460
|
else callbacks.onReplyDone();
|
|
2414
2461
|
break;
|
|
2415
2462
|
case "session.error":
|
|
2463
|
+
ctx.log.warn("S2S << session.error", {
|
|
2464
|
+
...sidFields(ctx),
|
|
2465
|
+
code: msg.code,
|
|
2466
|
+
message: msg.message
|
|
2467
|
+
});
|
|
2416
2468
|
if (msg.code === "session_not_found" || msg.code === "session_forbidden") callbacks.onSessionExpired();
|
|
2417
2469
|
else callbacks.onError(new Error(msg.message));
|
|
2418
2470
|
break;
|
|
@@ -2439,8 +2491,8 @@ function connectS2s(opts) {
|
|
|
2439
2491
|
return;
|
|
2440
2492
|
}
|
|
2441
2493
|
const json = JSON.stringify(msg);
|
|
2442
|
-
if (msg.type
|
|
2443
|
-
else log.info(`S2S >> ${msg.type}`);
|
|
2494
|
+
if (msg.type === "session.update") log.info(`S2S >> ${msg.type}`, { payload: json });
|
|
2495
|
+
else if (msg.type !== "input.audio") log.info(`S2S >> ${msg.type}`);
|
|
2444
2496
|
ws.send(json);
|
|
2445
2497
|
}
|
|
2446
2498
|
const handle = {
|
|
@@ -2489,35 +2541,28 @@ function connectS2s(opts) {
|
|
|
2489
2541
|
log.info("S2S WebSocket open");
|
|
2490
2542
|
resolve(handle);
|
|
2491
2543
|
});
|
|
2492
|
-
function
|
|
2544
|
+
/**
 * Log the type of an incoming S2S message at info level.
 *
 * `reply.audio`, `input.audio`, `reply.done` and `session.error` are
 * skipped here.
 *
 * @param {unknown} type - The message's `type` field.
 */
function logIncoming(type) {
  switch (type) {
    case "reply.audio":
    case "input.audio":
    case "reply.done":
    case "session.error":
      return;
    default:
      log.info(`S2S << ${type}`);
  }
}
|
|
2548
|
+
ws.addEventListener("message", (ev) => {
|
|
2549
|
+
let raw;
|
|
2493
2550
|
try {
|
|
2494
|
-
|
|
2551
|
+
raw = JSON.parse(String(ev.data));
|
|
2495
2552
|
} catch {
|
|
2496
|
-
log.warn("S2S << invalid JSON", { data: String(data).slice(0, 200) });
|
|
2497
|
-
|
|
2498
|
-
}
|
|
2499
|
-
function handleAudioFastPath(obj) {
|
|
2500
|
-
if (obj.type === "reply.audio" && typeof obj.data === "string") {
|
|
2501
|
-
callbacks.onAudio(base64ToUint8(obj.data));
|
|
2502
|
-
return true;
|
|
2553
|
+
log.warn("S2S << invalid JSON", { data: String(ev.data).slice(0, 200) });
|
|
2554
|
+
return;
|
|
2503
2555
|
}
|
|
2504
|
-
return false;
|
|
2505
|
-
}
|
|
2506
|
-
function logIncoming(obj) {
|
|
2507
|
-
if (obj.type === "reply.audio" || obj.type === "input.audio") return;
|
|
2508
|
-
if (obj.type === "reply.done") return;
|
|
2509
|
-
log.info(`S2S << ${obj.type}`);
|
|
2510
|
-
}
|
|
2511
|
-
ws.addEventListener("message", (ev) => {
|
|
2512
|
-
const raw = tryParseJson(ev.data);
|
|
2513
|
-
if (raw === void 0) return;
|
|
2514
2556
|
if (typeof raw !== "object" || raw === null || Array.isArray(raw)) {
|
|
2515
2557
|
log.warn("S2S << non-object JSON message", { type: typeof raw });
|
|
2516
2558
|
return;
|
|
2517
2559
|
}
|
|
2518
2560
|
const obj = raw;
|
|
2519
|
-
logIncoming(obj);
|
|
2520
|
-
if (
|
|
2561
|
+
logIncoming(obj.type);
|
|
2562
|
+
if (obj.type === "reply.audio" && typeof obj.data === "string") {
|
|
2563
|
+
callbacks.onAudio(base64ToUint8(obj.data));
|
|
2564
|
+
return;
|
|
2565
|
+
}
|
|
2521
2566
|
const parsed = parseS2sMessage(obj);
|
|
2522
2567
|
if (!parsed) {
|
|
2523
2568
|
log.warn(`S2S << unrecognised message type: ${obj.type ?? JSON.stringify(raw).slice(0, 200)}`);
|
|
@@ -2550,9 +2595,9 @@ function connectS2s(opts) {
|
|
|
2550
2595
|
const _internals = { connectS2s };
|
|
2551
2596
|
/**
|
|
2552
2597
|
* Close codes worth attempting `session.resume` on. These are network/server
|
|
2553
|
-
* blips, not protocol or auth violations.
|
|
2554
|
-
*
|
|
2555
|
-
*
|
|
2598
|
+
* blips, not protocol or auth violations. AssemblyAI keeps the session
|
|
2599
|
+
* available for 30 s after disconnect; reconnect runs immediately on close,
|
|
2600
|
+
* so the resume request reliably lands inside that window.
|
|
2556
2601
|
*/
|
|
2557
2602
|
const TRANSIENT_CLOSE_CODES = new Set([
|
|
2558
2603
|
1005,
|
|
@@ -2560,48 +2605,29 @@ const TRANSIENT_CLOSE_CODES = new Set([
|
|
|
2560
2605
|
1011,
|
|
2561
2606
|
3005
|
|
2562
2607
|
]);
|
|
2563
|
-
/**
|
|
2564
|
-
* AssemblyAI keeps the session alive for 30 s after disconnect; we leave a
|
|
2565
|
-
* little headroom so the resume request still fits inside that window after
|
|
2566
|
-
* the new WebSocket finishes opening.
|
|
2567
|
-
*/
|
|
2568
|
-
const RESUME_WINDOW_MS = 25e3;
|
|
2569
2608
|
function createS2sTransport(opts) {
|
|
2570
2609
|
const log = opts.logger ?? consoleLogger;
|
|
2571
2610
|
const createWs = opts.createWebSocket ?? defaultCreateS2sWebSocket;
|
|
2572
2611
|
let handle = null;
|
|
2573
2612
|
let currentReplyId = null;
|
|
2574
|
-
/** Most recent `session.ready` ID — present once the upstream session is established. */
|
|
2575
2613
|
let providerSessionId = null;
|
|
2576
|
-
/** When the current session became ready; bounds the resume window. */
|
|
2577
|
-
let sessionReadyAt = 0;
|
|
2578
|
-
/** Set by `stop()` so a deliberate close doesn't trigger a reconnect. */
|
|
2579
2614
|
let closing = false;
|
|
2580
|
-
/**
|
|
2581
|
-
* True while a `session.resume` round-trip is in flight (between sending
|
|
2582
|
-
* resume and the next `session.ready`). Used to distinguish a resume failure
|
|
2583
|
-
* (close before ready) from a normal close.
|
|
2584
|
-
*/
|
|
2585
2615
|
let reconnecting = false;
|
|
2586
|
-
/**
|
|
2587
|
-
* Set when a reconnect attempt is kicked off, cleared once the resumed
|
|
2588
|
-
* session's `session.ready` arrives. Prevents back-to-back reconnect loops
|
|
2589
|
-
* when the freshly-resumed socket also drops before fully recovering.
|
|
2590
|
-
*/
|
|
2591
|
-
let reconnectInFlight = false;
|
|
2592
2616
|
function buildCallbacks() {
|
|
2593
2617
|
return {
|
|
2594
2618
|
onSessionReady: (id) => {
|
|
2619
|
+
const isFirstReady = providerSessionId === null;
|
|
2595
2620
|
providerSessionId = id;
|
|
2596
|
-
sessionReadyAt = Date.now();
|
|
2597
2621
|
if (reconnecting) {
|
|
2598
2622
|
reconnecting = false;
|
|
2599
|
-
reconnectInFlight = false;
|
|
2600
2623
|
log.info("S2S resumed", {
|
|
2601
2624
|
sid: opts.sid,
|
|
2602
2625
|
sessionId: id
|
|
2603
2626
|
});
|
|
2604
|
-
}
|
|
2627
|
+
} else if (isFirstReady) log.info("S2S session ready", {
|
|
2628
|
+
sid: opts.sid,
|
|
2629
|
+
sessionId: id
|
|
2630
|
+
});
|
|
2605
2631
|
opts.callbacks.onSessionReady?.(id);
|
|
2606
2632
|
},
|
|
2607
2633
|
onReplyStarted: (replyId) => {
|
|
@@ -2625,7 +2651,6 @@ function createS2sTransport(opts) {
|
|
|
2625
2651
|
onSessionExpired: () => {
|
|
2626
2652
|
if (reconnecting) {
|
|
2627
2653
|
reconnecting = false;
|
|
2628
|
-
reconnectInFlight = false;
|
|
2629
2654
|
log.warn("S2S resume rejected: session expired", { sid: opts.sid });
|
|
2630
2655
|
opts.callbacks.onError("connection", "S2S resume failed: session expired");
|
|
2631
2656
|
return;
|
|
@@ -2638,15 +2663,11 @@ function createS2sTransport(opts) {
|
|
|
2638
2663
|
};
|
|
2639
2664
|
}
|
|
2640
2665
|
function canResumeAfter(code) {
|
|
2641
|
-
|
|
2642
|
-
if (providerSessionId === null) return false;
|
|
2643
|
-
if (reconnectInFlight) return false;
|
|
2644
|
-
return sessionReadyAt > 0 && Date.now() - sessionReadyAt < RESUME_WINDOW_MS;
|
|
2666
|
+
return TRANSIENT_CLOSE_CODES.has(code) && providerSessionId !== null && !reconnecting;
|
|
2645
2667
|
}
|
|
2646
2668
|
function emitFatalClose(code, reason, wasReconnecting) {
|
|
2647
2669
|
if (wasReconnecting) {
|
|
2648
2670
|
reconnecting = false;
|
|
2649
|
-
reconnectInFlight = false;
|
|
2650
2671
|
opts.callbacks.onError("connection", `S2S resume failed (code=${code})`);
|
|
2651
2672
|
return;
|
|
2652
2673
|
}
|
|
@@ -2667,7 +2688,6 @@ function createS2sTransport(opts) {
|
|
|
2667
2688
|
});
|
|
2668
2689
|
}
|
|
2669
2690
|
function startResume(prevId, code, reason) {
|
|
2670
|
-
reconnectInFlight = true;
|
|
2671
2691
|
reconnecting = true;
|
|
2672
2692
|
log.warn("S2S unexpected close — attempting resume", {
|
|
2673
2693
|
sid: opts.sid,
|
|
@@ -2682,7 +2702,6 @@ function createS2sTransport(opts) {
|
|
|
2682
2702
|
}
|
|
2683
2703
|
resume(prevId).catch((err) => {
|
|
2684
2704
|
reconnecting = false;
|
|
2685
|
-
reconnectInFlight = false;
|
|
2686
2705
|
const msg = err instanceof Error ? err.message : String(err);
|
|
2687
2706
|
log.warn("S2S resume failed", {
|
|
2688
2707
|
sid: opts.sid,
|
|
@@ -2700,12 +2719,11 @@ function createS2sTransport(opts) {
|
|
|
2700
2719
|
return;
|
|
2701
2720
|
}
|
|
2702
2721
|
const wasReconnecting = reconnecting;
|
|
2703
|
-
|
|
2722
|
+
const prevId = providerSessionId;
|
|
2723
|
+
if (!canResumeAfter(code) || prevId === null) {
|
|
2704
2724
|
emitFatalClose(code, reason, wasReconnecting);
|
|
2705
2725
|
return;
|
|
2706
2726
|
}
|
|
2707
|
-
const prevId = providerSessionId;
|
|
2708
|
-
if (prevId === null) return;
|
|
2709
2727
|
startResume(prevId, code, reason);
|
|
2710
2728
|
}
|
|
2711
2729
|
async function resume(prevSessionId) {
|
|
@@ -2714,7 +2732,7 @@ function createS2sTransport(opts) {
|
|
|
2714
2732
|
config: opts.s2sConfig,
|
|
2715
2733
|
createWebSocket: createWs,
|
|
2716
2734
|
logger: log,
|
|
2717
|
-
|
|
2735
|
+
sid: opts.sid,
|
|
2718
2736
|
callbacks: buildCallbacks()
|
|
2719
2737
|
});
|
|
2720
2738
|
if (closing) {
|
|
@@ -2799,14 +2817,11 @@ function createClientSink(ws, log) {
|
|
|
2799
2817
|
}
|
|
2800
2818
|
};
|
|
2801
2819
|
}
|
|
2802
|
-
function
|
|
2820
|
+
function dispatchMessage(data, session, log, sid) {
|
|
2803
2821
|
if (data instanceof Uint8Array) {
|
|
2804
2822
|
session.onAudio(data);
|
|
2805
|
-
return
|
|
2823
|
+
return;
|
|
2806
2824
|
}
|
|
2807
|
-
return false;
|
|
2808
|
-
}
|
|
2809
|
-
function handleTextMessage(data, session, log, sid) {
|
|
2810
2825
|
if (typeof data !== "string") {
|
|
2811
2826
|
log.warn("ws: non-string, non-binary frame received; dropping", { sid });
|
|
2812
2827
|
return;
|
|
@@ -2869,10 +2884,7 @@ function wireSessionSocket(ws, opts) {
|
|
|
2869
2884
|
if (!(session && messageBuffer)) return;
|
|
2870
2885
|
const buf = messageBuffer;
|
|
2871
2886
|
messageBuffer = null;
|
|
2872
|
-
for (const event of buf)
|
|
2873
|
-
if (handleBinaryAudio(event.data, session)) continue;
|
|
2874
|
-
handleTextMessage(event.data, session, log, sid);
|
|
2875
|
-
}
|
|
2887
|
+
for (const event of buf) dispatchMessage(event.data, session, log, sid);
|
|
2876
2888
|
}
|
|
2877
2889
|
function onOpen() {
|
|
2878
2890
|
opts.onOpen?.();
|
|
@@ -2921,8 +2933,7 @@ function wireSessionSocket(ws, opts) {
|
|
|
2921
2933
|
if (messageBuffer && messageBuffer.length < 100) messageBuffer.push(event);
|
|
2922
2934
|
return;
|
|
2923
2935
|
}
|
|
2924
|
-
|
|
2925
|
-
handleTextMessage(event.data, session, log, sid);
|
|
2936
|
+
dispatchMessage(event.data, session, log, sid);
|
|
2926
2937
|
});
|
|
2927
2938
|
ws.addEventListener("close", () => {
|
|
2928
2939
|
log.info("Session disconnected", {
|
|
@@ -2953,27 +2964,19 @@ function wireSessionSocket(ws, opts) {
|
|
|
2953
2964
|
//#endregion
|
|
2954
2965
|
//#region host/runtime.ts
|
|
2955
2966
|
/**
|
|
2956
|
-
*
|
|
2957
|
-
*
|
|
2958
|
-
* Each STT provider uses its own env var (e.g. `ASSEMBLYAI_API_KEY`,
|
|
2959
|
-
* `DEEPGRAM_API_KEY`). We read the kind from the descriptor if it is one;
|
|
2960
|
-
* pre-resolved openers have no kind field so we fall back to AssemblyAI for
|
|
2961
|
-
* backward compatibility (openers supply their own key at open-time anyway).
|
|
2967
|
+
* Read the descriptor `kind` if present. Pre-resolved openers (test escape
|
|
2968
|
+
* hatch) have no `kind` field, so callers fall back to a default env var.
|
|
2962
2969
|
*/
|
|
2970
|
+
function descriptorKind(value) {
  // A `kind` that is absent or not a string yields undefined.
  const candidate = value?.kind;
  if (typeof candidate === "string") return candidate;
  return void 0;
}
|
|
2963
2974
|
function resolveSttApiKey(stt, env) {
|
|
2964
|
-
if ((stt
|
|
2975
|
+
if (descriptorKind(stt) === "deepgram") return resolveApiKey("DEEPGRAM_API_KEY", env);
|
|
2965
2976
|
return resolveApiKey("ASSEMBLYAI_API_KEY", env);
|
|
2966
2977
|
}
|
|
2967
|
-
/**
|
|
2968
|
-
* Resolve the API key env-var for the configured TTS provider.
|
|
2969
|
-
*
|
|
2970
|
-
* Each TTS provider uses its own env var (e.g. `CARTESIA_API_KEY`,
|
|
2971
|
-
* `RIME_API_KEY`). We read the kind from the descriptor if it is one;
|
|
2972
|
-
* pre-resolved openers have no kind field so we fall back to Cartesia for
|
|
2973
|
-
* backward compatibility (openers supply their own key at open-time anyway).
|
|
2974
|
-
*/
|
|
2975
2978
|
function resolveTtsApiKey(tts, env) {
|
|
2976
|
-
if ((tts
|
|
2979
|
+
if (descriptorKind(tts) === "rime") return resolveApiKey("RIME_API_KEY", env);
|
|
2977
2980
|
return resolveApiKey("CARTESIA_API_KEY", env);
|
|
2978
2981
|
}
|
|
2979
2982
|
/**
|
|
@@ -3014,7 +3017,7 @@ function createLocalVector(slug) {
|
|
|
3014
3017
|
* @public
|
|
3015
3018
|
*/
|
|
3016
3019
|
function createRuntime(opts) {
|
|
3017
|
-
const { agent, env, kv = createLocalKv(), vector, createWebSocket, logger = consoleLogger, s2sConfig = DEFAULT_S2S_CONFIG, sessionStartTimeoutMs, shutdownTimeoutMs = DEFAULT_SHUTDOWN_TIMEOUT_MS } = opts;
|
|
3020
|
+
const { agent, env, kv = createLocalKv(), vector, createWebSocket, createOpenaiRealtimeWebSocket, logger = consoleLogger, s2sConfig = DEFAULT_S2S_CONFIG, sessionStartTimeoutMs, shutdownTimeoutMs = DEFAULT_SHUTDOWN_TIMEOUT_MS } = opts;
|
|
3018
3021
|
const mode = assertProviderTriple(opts.stt, opts.llm, opts.tts);
|
|
3019
3022
|
const slug = agent.name ?? "local";
|
|
3020
3023
|
const resolvedKv = agent.kv ? resolveKv(agent.kv, env, "") : kv;
|
|
@@ -3083,49 +3086,20 @@ function createRuntime(opts) {
|
|
|
3083
3086
|
});
|
|
3084
3087
|
};
|
|
3085
3088
|
}
|
|
3086
|
-
|
|
3089
|
+
let pipelineProviders = null;
|
|
3090
|
+
if (mode === "pipeline" && opts.stt && opts.llm && opts.tts) pipelineProviders = {
|
|
3087
3091
|
stt: resolveSttIfDescriptor(opts.stt),
|
|
3088
3092
|
llm: resolveLlmIfDescriptor(opts.llm, env),
|
|
3089
3093
|
tts: resolveTtsIfDescriptor(opts.tts)
|
|
3090
|
-
}
|
|
3091
|
-
function
|
|
3092
|
-
|
|
3093
|
-
|
|
3094
|
-
const systemPrompt = buildSystemPrompt(agentConfig, {
|
|
3095
|
-
hasTools: toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0,
|
|
3096
|
-
voice: true,
|
|
3097
|
-
toolGuidance
|
|
3098
|
-
});
|
|
3099
|
-
let core = null;
|
|
3100
|
-
function bindCore() {
|
|
3101
|
-
if (!core) throw new Error("SessionCore not yet created");
|
|
3102
|
-
return core;
|
|
3103
|
-
}
|
|
3104
|
-
const callbacks = {
|
|
3105
|
-
onReplyStarted: (replyId) => bindCore().onReplyStarted(replyId),
|
|
3106
|
-
onReplyDone: () => bindCore().onReplyDone(),
|
|
3107
|
-
onCancelled: () => bindCore().onCancelled(),
|
|
3108
|
-
onAudioChunk: (bytes) => bindCore().onAudioChunk(bytes),
|
|
3109
|
-
onAudioDone: () => bindCore().onAudioDone(),
|
|
3110
|
-
onUserTranscript: (text) => bindCore().onUserTranscript(text),
|
|
3111
|
-
onAgentTranscript: (text, interrupted) => bindCore().onAgentTranscript(text, interrupted),
|
|
3112
|
-
onToolCall: isPipeline ? (id, name, args) => sessionOpts.client.event({
|
|
3113
|
-
type: "tool_call",
|
|
3114
|
-
toolCallId: id,
|
|
3115
|
-
toolName: name,
|
|
3116
|
-
args
|
|
3117
|
-
}) : (id, name, args) => bindCore().onToolCall(id, name, args),
|
|
3118
|
-
onError: (code, message) => bindCore().onError(code, message),
|
|
3119
|
-
onSpeechStarted: () => bindCore().onSpeechStarted(),
|
|
3120
|
-
onSpeechStopped: () => bindCore().onSpeechStopped()
|
|
3121
|
-
};
|
|
3122
|
-
let transport;
|
|
3123
|
-
if (pipelineProviders) transport = createPipelineTransport({
|
|
3094
|
+
};
|
|
3095
|
+
function buildPipelineTransport(args) {
|
|
3096
|
+
const { sessionOpts, systemPrompt, callbacks, providers } = args;
|
|
3097
|
+
return createPipelineTransport({
|
|
3124
3098
|
sid: sessionOpts.id,
|
|
3125
3099
|
agent: sessionOpts.agent,
|
|
3126
|
-
stt:
|
|
3127
|
-
llm:
|
|
3128
|
-
tts:
|
|
3100
|
+
stt: providers.stt,
|
|
3101
|
+
llm: providers.llm,
|
|
3102
|
+
tts: providers.tts,
|
|
3129
3103
|
callbacks,
|
|
3130
3104
|
sessionConfig: {
|
|
3131
3105
|
systemPrompt,
|
|
@@ -3145,7 +3119,30 @@ function createRuntime(opts) {
|
|
|
3145
3119
|
skipGreeting: sessionOpts.skipGreeting ?? false,
|
|
3146
3120
|
logger
|
|
3147
3121
|
});
|
|
3148
|
-
|
|
3122
|
+
}
|
|
3123
|
+
/**
 * Create the OpenAI Realtime transport for one session.
 *
 * The API key is resolved from `OPENAI_API_KEY`. Provider options come
 * from `agent.s2s.options` (default `{}`). The greeting and the WebSocket
 * factory override are passed on only when they are set.
 *
 * @param {object} args - `{ sessionOpts, systemPrompt, callbacks }`.
 * @returns The value returned by `createOpenaiRealtimeTransport`.
 */
function buildOpenaiRealtimeTransport(args) {
  const { sessionOpts, systemPrompt, callbacks } = args;
  const apiKey = resolveApiKey("OPENAI_API_KEY", env);
  const greetingField = agentConfig.greeting !== void 0 ? { greeting: agentConfig.greeting } : {};
  const sessionConfig = {
    systemPrompt,
    ...greetingField,
    tools: toolSchemas
  };
  const webSocketField = createOpenaiRealtimeWebSocket ? { createWebSocket: createOpenaiRealtimeWebSocket } : {};
  return createOpenaiRealtimeTransport({
    apiKey,
    options: agent.s2s?.options ?? {},
    sessionConfig,
    toolSchemas,
    toolChoice: agentConfig.toolChoice ?? "auto",
    callbacks,
    sid: sessionOpts.id,
    agent: sessionOpts.agent,
    skipGreeting: sessionOpts.skipGreeting ?? false,
    ...webSocketField,
    logger
  });
}
|
|
3143
|
+
function buildAssemblyS2sTransport(args) {
|
|
3144
|
+
const { sessionOpts, systemPrompt, callbacks } = args;
|
|
3145
|
+
return createS2sTransport({
|
|
3149
3146
|
apiKey: env.ASSEMBLYAI_API_KEY ?? "",
|
|
3150
3147
|
s2sConfig,
|
|
3151
3148
|
sessionConfig: {
|
|
@@ -3160,6 +3157,54 @@ function createRuntime(opts) {
|
|
|
3160
3157
|
...createWebSocket ? { createWebSocket } : {},
|
|
3161
3158
|
logger
|
|
3162
3159
|
});
|
|
3160
|
+
}
|
|
3161
|
+
/**
 * Pick and build the transport for a session.
 *
 * Order of precedence: resolved pipeline providers, then an `agent.s2s`
 * descriptor (only kind `"openai-realtime"` is accepted), then the
 * AssemblyAI S2S transport.
 *
 * @param {object} args - `{ sessionOpts, systemPrompt, callbacks }`.
 * @throws {Error} When `agent.s2s` is set with any other kind.
 */
function buildTransport(args) {
  if (pipelineProviders) {
    return buildPipelineTransport({
      ...args,
      providers: pipelineProviders
    });
  }
  if (agent.s2s === void 0) return buildAssemblyS2sTransport(args);
  const kind = descriptorKind(agent.s2s);
  if (kind !== "openai-realtime") throw new Error(`Unknown s2s provider kind: ${kind ?? "<missing>"}`);
  return buildOpenaiRealtimeTransport(args);
}
|
|
3173
|
+
function createSession(sessionOpts) {
|
|
3174
|
+
sinkMap.set(sessionOpts.id, sessionOpts.client);
|
|
3175
|
+
const isPipeline = Boolean(pipelineProviders);
|
|
3176
|
+
const systemPrompt = buildSystemPrompt(agentConfig, {
|
|
3177
|
+
hasTools: toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0,
|
|
3178
|
+
voice: true,
|
|
3179
|
+
toolGuidance
|
|
3180
|
+
});
|
|
3181
|
+
let core = null;
|
|
3182
|
+
function bindCore() {
|
|
3183
|
+
if (!core) throw new Error("SessionCore not yet created");
|
|
3184
|
+
return core;
|
|
3185
|
+
}
|
|
3186
|
+
const transport = buildTransport({
|
|
3187
|
+
sessionOpts,
|
|
3188
|
+
systemPrompt,
|
|
3189
|
+
callbacks: {
|
|
3190
|
+
onReplyStarted: (replyId) => bindCore().onReplyStarted(replyId),
|
|
3191
|
+
onReplyDone: () => bindCore().onReplyDone(),
|
|
3192
|
+
onCancelled: () => bindCore().onCancelled(),
|
|
3193
|
+
onAudioChunk: (bytes) => bindCore().onAudioChunk(bytes),
|
|
3194
|
+
onAudioDone: () => bindCore().onAudioDone(),
|
|
3195
|
+
onUserTranscript: (text) => bindCore().onUserTranscript(text),
|
|
3196
|
+
onAgentTranscript: (text, interrupted) => bindCore().onAgentTranscript(text, interrupted),
|
|
3197
|
+
onToolCall: isPipeline ? (id, name, args) => sessionOpts.client.event({
|
|
3198
|
+
type: "tool_call",
|
|
3199
|
+
toolCallId: id,
|
|
3200
|
+
toolName: name,
|
|
3201
|
+
args
|
|
3202
|
+
}) : (id, name, args) => bindCore().onToolCall(id, name, args),
|
|
3203
|
+
onError: (code, message) => bindCore().onError(code, message),
|
|
3204
|
+
onSpeechStarted: () => bindCore().onSpeechStarted(),
|
|
3205
|
+
onSpeechStopped: () => bindCore().onSpeechStopped()
|
|
3206
|
+
}
|
|
3207
|
+
});
|
|
3163
3208
|
core = createSessionCore({
|
|
3164
3209
|
id: sessionOpts.id,
|
|
3165
3210
|
agent: sessionOpts.agent,
|
|
@@ -3228,6 +3273,11 @@ function createRuntime(opts) {
|
|
|
3228
3273
|
* **Internal module** — used by `aai-cli` dev server. Not a public API.
|
|
3229
3274
|
* Import via `aai/host`.
|
|
3230
3275
|
*/
|
|
3276
|
+
const JSON_HEADERS = { "Content-Type": "application/json" };
|
|
3277
|
+
function sendJson(res, status, body) {
|
|
3278
|
+
res.writeHead(status, JSON_HEADERS);
|
|
3279
|
+
res.end(JSON.stringify(body));
|
|
3280
|
+
}
|
|
3231
3281
|
async function serveStatic(dir, req, res) {
|
|
3232
3282
|
const url = req.url?.split("?")[0] ?? "/";
|
|
3233
3283
|
const filePath = path.join(dir, url === "/" ? "index.html" : url);
|
|
@@ -3247,66 +3297,59 @@ async function serveStatic(dir, req, res) {
|
|
|
3247
3297
|
return false;
|
|
3248
3298
|
}
|
|
3249
3299
|
}
|
|
3250
|
-
function
|
|
3300
|
+
async function readBody(req) {
|
|
3251
3301
|
let body = "";
|
|
3252
|
-
|
|
3253
|
-
|
|
3254
|
-
|
|
3255
|
-
|
|
3256
|
-
|
|
3257
|
-
|
|
3258
|
-
|
|
3259
|
-
|
|
3260
|
-
|
|
3261
|
-
res.end(JSON.stringify({ error: parsed.error.message }));
|
|
3262
|
-
return;
|
|
3263
|
-
}
|
|
3264
|
-
const op = parsed.data;
|
|
3265
|
-
let result;
|
|
3266
|
-
switch (op.op) {
|
|
3267
|
-
case "upsert":
|
|
3268
|
-
await vector.upsert(op.id, op.text, op.metadata);
|
|
3269
|
-
result = "OK";
|
|
3270
|
-
break;
|
|
3271
|
-
case "query":
|
|
3272
|
-
result = await vector.query(op.text, {
|
|
3273
|
-
...op.topK !== void 0 ? { topK: op.topK } : {},
|
|
3274
|
-
...op.filter !== void 0 ? { filter: op.filter } : {}
|
|
3275
|
-
});
|
|
3276
|
-
break;
|
|
3277
|
-
case "delete":
|
|
3278
|
-
await vector.delete(op.ids);
|
|
3279
|
-
result = "OK";
|
|
3280
|
-
break;
|
|
3281
|
-
default: break;
|
|
3282
|
-
}
|
|
3283
|
-
res.statusCode = 200;
|
|
3284
|
-
res.end(JSON.stringify({ result }));
|
|
3285
|
-
} catch (err) {
|
|
3286
|
-
res.statusCode = 500;
|
|
3287
|
-
res.end(JSON.stringify({ error: err instanceof Error ? err.message : String(err) }));
|
|
3302
|
+
for await (const chunk of req) body += chunk;
|
|
3303
|
+
return body;
|
|
3304
|
+
}
|
|
3305
|
+
async function handleVectorPost(vector, req, res) {
|
|
3306
|
+
try {
|
|
3307
|
+
const parsed = VectorRequestSchema.safeParse(JSON.parse(await readBody(req)));
|
|
3308
|
+
if (!parsed.success) {
|
|
3309
|
+
sendJson(res, 400, { error: parsed.error.message });
|
|
3310
|
+
return;
|
|
3288
3311
|
}
|
|
3289
|
-
|
|
3312
|
+
const op = parsed.data;
|
|
3313
|
+
let result;
|
|
3314
|
+
switch (op.op) {
|
|
3315
|
+
case "upsert":
|
|
3316
|
+
await vector.upsert(op.id, op.text, op.metadata);
|
|
3317
|
+
result = "OK";
|
|
3318
|
+
break;
|
|
3319
|
+
case "query":
|
|
3320
|
+
result = await vector.query(op.text, {
|
|
3321
|
+
...op.topK !== void 0 ? { topK: op.topK } : {},
|
|
3322
|
+
...op.filter !== void 0 ? { filter: op.filter } : {}
|
|
3323
|
+
});
|
|
3324
|
+
break;
|
|
3325
|
+
case "delete":
|
|
3326
|
+
await vector.delete(op.ids);
|
|
3327
|
+
result = "OK";
|
|
3328
|
+
break;
|
|
3329
|
+
default: return op;
|
|
3330
|
+
}
|
|
3331
|
+
sendJson(res, 200, { result });
|
|
3332
|
+
} catch (err) {
|
|
3333
|
+
sendJson(res, 500, { error: err instanceof Error ? err.message : String(err) });
|
|
3334
|
+
}
|
|
3290
3335
|
}
|
|
3291
|
-
function handleKvGet(kv, req, res) {
|
|
3336
|
+
async function handleKvGet(kv, req, res) {
|
|
3292
3337
|
const key = new URL(req.url ?? "/", "http://localhost").searchParams.get("key");
|
|
3293
3338
|
if (!key) {
|
|
3294
|
-
res
|
|
3295
|
-
res.end(JSON.stringify({ error: "Missing key query parameter" }));
|
|
3339
|
+
sendJson(res, 400, { error: "Missing key query parameter" });
|
|
3296
3340
|
return;
|
|
3297
3341
|
}
|
|
3298
|
-
|
|
3342
|
+
try {
|
|
3343
|
+
const value = await kv.get(key);
|
|
3299
3344
|
if (value === null) {
|
|
3300
|
-
res.writeHead(404,
|
|
3345
|
+
res.writeHead(404, JSON_HEADERS);
|
|
3301
3346
|
res.end("null");
|
|
3302
|
-
|
|
3303
|
-
res.writeHead(200, { "Content-Type": "application/json" });
|
|
3304
|
-
res.end(JSON.stringify(value));
|
|
3347
|
+
return;
|
|
3305
3348
|
}
|
|
3306
|
-
|
|
3307
|
-
|
|
3308
|
-
res
|
|
3309
|
-
}
|
|
3349
|
+
sendJson(res, 200, value);
|
|
3350
|
+
} catch {
|
|
3351
|
+
sendJson(res, 500, { error: "KV error" });
|
|
3352
|
+
}
|
|
3310
3353
|
}
|
|
3311
3354
|
/**
|
|
3312
3355
|
* Create an HTTP + WebSocket server for an agent.
|
|
@@ -3317,8 +3360,17 @@ function createServer(options) {
|
|
|
3317
3360
|
const { runtime, clientHtml, clientDir, logger = consoleLogger, kv, vector } = options;
|
|
3318
3361
|
const name = options.name ?? "agent";
|
|
3319
3362
|
if (clientHtml && clientDir) throw new Error("clientHtml and clientDir are mutually exclusive");
|
|
3320
|
-
const
|
|
3321
|
-
|
|
3363
|
+
const defaultHtml = clientHtml ?? `<!DOCTYPE html><html><body><h1>${escapeHtml(name)}</h1><p>Agent server running.</p></body></html>`;
|
|
3364
|
+
async function handleRequest(req, res, url, method) {
|
|
3365
|
+
if (clientDir && await serveStatic(clientDir, req, res)) return;
|
|
3366
|
+
if (method === "GET" && url === "/") {
|
|
3367
|
+
res.writeHead(200, { "Content-Type": "text/html" });
|
|
3368
|
+
res.end(defaultHtml);
|
|
3369
|
+
return;
|
|
3370
|
+
}
|
|
3371
|
+
logger.error(`${method} ${url} 404`);
|
|
3372
|
+
sendJson(res, 404, { error: "Not found" });
|
|
3373
|
+
}
|
|
3322
3374
|
const httpServer = http.createServer((req, res) => {
|
|
3323
3375
|
const url = req.url?.split("?")[0] ?? "/";
|
|
3324
3376
|
const method = req.method ?? "GET";
|
|
@@ -3326,11 +3378,10 @@ function createServer(options) {
|
|
|
3326
3378
|
res.setHeader("X-Content-Type-Options", "nosniff");
|
|
3327
3379
|
res.setHeader("X-Frame-Options", "SAMEORIGIN");
|
|
3328
3380
|
if (method === "GET" && url === "/health") {
|
|
3329
|
-
res
|
|
3330
|
-
res.end(JSON.stringify({
|
|
3381
|
+
sendJson(res, 200, {
|
|
3331
3382
|
status: "ok",
|
|
3332
3383
|
name
|
|
3333
|
-
})
|
|
3384
|
+
});
|
|
3334
3385
|
return;
|
|
3335
3386
|
}
|
|
3336
3387
|
if (kv && method === "GET" && url === "/kv") {
|
|
@@ -3343,17 +3394,6 @@ function createServer(options) {
|
|
|
3343
3394
|
}
|
|
3344
3395
|
handleRequest(req, res, url, method);
|
|
3345
3396
|
});
|
|
3346
|
-
async function handleRequest(req, res, url, method) {
|
|
3347
|
-
if (clientDir && await serveStatic(clientDir, req, res)) return;
|
|
3348
|
-
if (method === "GET" && url === "/") {
|
|
3349
|
-
res.writeHead(200, { "Content-Type": "text/html" });
|
|
3350
|
-
res.end(defaultHtml);
|
|
3351
|
-
return;
|
|
3352
|
-
}
|
|
3353
|
-
logger.error(`${method} ${url} 404`);
|
|
3354
|
-
res.writeHead(404, { "Content-Type": "application/json" });
|
|
3355
|
-
res.end(JSON.stringify({ error: "Not found" }));
|
|
3356
|
-
}
|
|
3357
3397
|
const wss = new WebSocketServer({
|
|
3358
3398
|
noServer: true,
|
|
3359
3399
|
maxPayload: MAX_WS_PAYLOAD_BYTES
|