@alexkroman1/aai 1.2.3 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +14 -12
- package/CHANGELOG.md +20 -0
- package/dist/{constants-VTFoymJ-.js → constants-BL3nvg4I.js} +8 -1
- package/dist/host/_pipeline-test-fakes.d.ts +117 -0
- package/dist/host/pipeline-session-ctx.d.ts +24 -0
- package/dist/host/pipeline-session.d.ts +48 -0
- package/dist/host/providers/llm.d.ts +2 -0
- package/dist/host/providers/stt/assemblyai.d.ts +31 -0
- package/dist/host/providers/stt-barrel.d.ts +8 -0
- package/dist/host/providers/stt-barrel.js +92 -0
- package/dist/host/providers/stt.d.ts +2 -0
- package/dist/host/providers/tts/cartesia.d.ts +39 -0
- package/dist/host/providers/tts-barrel.d.ts +8 -0
- package/dist/host/providers/tts-barrel.js +182 -0
- package/dist/host/providers/tts.d.ts +2 -0
- package/dist/host/runtime-barrel.js +565 -81
- package/dist/host/runtime.d.ts +17 -0
- package/dist/host/s2s.d.ts +5 -0
- package/dist/host/session-ctx.d.ts +22 -4
- package/dist/host/to-vercel-tools.d.ts +45 -0
- package/dist/index.js +7 -2
- package/dist/sdk/_internal-types.d.ts +15 -1
- package/dist/sdk/constants.d.ts +7 -0
- package/dist/sdk/define.d.ts +21 -0
- package/dist/sdk/manifest.d.ts +22 -0
- package/dist/sdk/protocol.d.ts +3 -3
- package/dist/sdk/protocol.js +1 -1
- package/dist/sdk/providers.d.ts +70 -0
- package/dist/sdk/types.d.ts +16 -0
- package/exports-no-dev-deps.test.ts +39 -14
- package/host/_pipeline-test-fakes.ts +357 -0
- package/host/_test-utils.ts +1 -0
- package/host/integration/fixtures/README.md +49 -0
- package/host/integration/pipeline-reference.integration.test.ts +124 -0
- package/host/pipeline-session-ctx.test.ts +31 -0
- package/host/pipeline-session-ctx.ts +36 -0
- package/host/pipeline-session.test.ts +572 -0
- package/host/pipeline-session.ts +489 -0
- package/host/providers/llm.ts +3 -0
- package/host/providers/providers.test-d.ts +31 -0
- package/host/providers/stt/assemblyai.test.ts +100 -0
- package/host/providers/stt/assemblyai.ts +154 -0
- package/host/providers/stt/fixtures/assemblyai/basic-turn.json +30 -0
- package/host/providers/stt-barrel.ts +13 -0
- package/host/providers/stt.ts +3 -0
- package/host/providers/tts/cartesia.test.ts +210 -0
- package/host/providers/tts/cartesia.ts +251 -0
- package/host/providers/tts-barrel.ts +13 -0
- package/host/providers/tts.ts +3 -0
- package/host/runtime.test.ts +81 -1
- package/host/runtime.ts +61 -0
- package/host/s2s.test.ts +19 -0
- package/host/s2s.ts +10 -0
- package/host/session-ctx.ts +35 -8
- package/host/to-vercel-tools.test.ts +187 -0
- package/host/to-vercel-tools.ts +74 -0
- package/package.json +15 -1
- package/sdk/__snapshots__/exports.test.ts.snap +2 -0
- package/sdk/_internal-types.ts +16 -0
- package/sdk/constants.ts +8 -0
- package/sdk/define.test-d.ts +21 -0
- package/sdk/define.test.ts +33 -0
- package/sdk/define.ts +21 -0
- package/sdk/manifest.test-d.ts +14 -0
- package/sdk/manifest.test.ts +51 -0
- package/sdk/manifest.ts +39 -0
- package/sdk/providers.ts +90 -0
- package/sdk/types.ts +16 -0
- package/vitest.config.ts +1 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { a as DEFAULT_SHUTDOWN_TIMEOUT_MS, c as FETCH_TIMEOUT_MS, d as MAX_PAGE_CHARS, f as MAX_TOOL_RESULT_CHARS, g as
|
|
1
|
+
import { _ as TOOL_EXECUTION_TIMEOUT_MS, a as DEFAULT_SHUTDOWN_TIMEOUT_MS, c as FETCH_TIMEOUT_MS, d as MAX_PAGE_CHARS, f as MAX_TOOL_RESULT_CHARS, g as RUN_CODE_TIMEOUT_MS, h as PIPELINE_FLUSH_TIMEOUT_MS, l as MAX_HTML_BYTES, m as MAX_WS_PAYLOAD_BYTES, o as DEFAULT_STT_SAMPLE_RATE, p as MAX_VALUE_SIZE, s as DEFAULT_TTS_SAMPLE_RATE, t as AGENT_CSP } from "../constants-BL3nvg4I.js";
|
|
2
2
|
import { r as DEFAULT_SYSTEM_PROMPT } from "../types-Cfx_4QDK.js";
|
|
3
3
|
import { i as toolError, n as errorDetail, r as errorMessage, t as parseWsUpgradeParams } from "../ws-upgrade-BeOQ7fXL.js";
|
|
4
4
|
import { ClientMessageSchema, buildReadyConfig, lenientParse } from "../sdk/protocol.js";
|
|
@@ -8,6 +8,7 @@ import { convert } from "html-to-text";
|
|
|
8
8
|
import vm from "node:vm";
|
|
9
9
|
import pTimeout from "p-timeout";
|
|
10
10
|
import { createStorage, prefixStorage } from "unstorage";
|
|
11
|
+
import { jsonSchema, stepCountIs, streamText, tool } from "ai";
|
|
11
12
|
import { createNanoEvents } from "nanoevents";
|
|
12
13
|
import WsWebSocket, { WebSocketServer } from "ws";
|
|
13
14
|
import fs from "node:fs";
|
|
@@ -339,6 +340,103 @@ function resolveAllBuiltins(names, opts) {
|
|
|
339
340
|
};
|
|
340
341
|
}
|
|
341
342
|
//#endregion
|
|
343
|
+
//#region sdk/system-prompt.ts
|
|
344
|
+
/**
 * Format the current date for the system prompt in US English, spelling out
 * the weekday and the month (for example "Monday, January 5, 2026").
 *
 * @returns {string} Today's date as rendered by the "en-US" locale.
 */
function getFormattedDate() {
	const formatter = new Intl.DateTimeFormat("en-US", {
		weekday: "long",
		year: "numeric",
		month: "long",
		day: "numeric"
	});
	return formatter.format(/* @__PURE__ */ new Date());
}
|
|
352
|
+
/**
 * Output-format rules for spoken replies. `buildSystemPrompt` appends this
 * text to the prompt only when `opts.voice` is truthy. The leading "\n\n"
 * separates it from whatever section precedes it.
 */
const VOICE_RULES = "\n\nCRITICAL OUTPUT RULES — you MUST follow these for EVERY response:\nYour response will be spoken aloud by a TTS system and displayed as plain text.\n- NEVER use markdown: no **, no *, no _, no #, no `, no [](), no ---\n- NEVER use bullet points (-, *, •) or numbered lists (1., 2.)\n- NEVER use code blocks or inline code\n- NEVER mention tools, search, APIs, or technical failures to the user. If a tool returns no results, just answer naturally without explaining why.\n- Write exactly as you would say it out loud to a friend\n- Use short conversational sentences. To list things, say \"First,\" \"Next,\" \"Finally,\"\n- Keep responses concise — 1 to 3 sentences max";
|
|
353
|
+
/**
 * Build the system prompt sent to the LLM from the agent configuration.
 *
 * Sections are concatenated in a fixed order: the default system prompt,
 * today's date, agent-specific instructions, the tool-call preamble, built-in
 * tool guidance, and the voice output rules. Every section after the date is
 * optional and carries its own leading blank line.
 *
 * @param config - The serializable agent configuration (name, systemPrompt, etc.).
 * @param opts - Optional switches; when omitted only the default prompt, the
 *   date and any agent-specific instructions are emitted.
 * @param opts.hasTools - When truthy, appends a preamble instructing the LLM to
 *   speak a brief phrase before each tool call to fill silence.
 * @param opts.toolGuidance - Optional array of guidance lines; when non-empty
 *   they are joined with newlines under a "Built-in Tool Usage:" heading.
 * @param opts.voice - When truthy, appends strict voice-specific output rules
 *   (no markdown, no bullet points, conversational tone, concise responses).
 * @returns The assembled system prompt string.
 */
function buildSystemPrompt(config, opts = {}) {
	const { hasTools, toolGuidance, voice } = opts;
	const sections = [DEFAULT_SYSTEM_PROMPT, `\n\nToday's date is ${getFormattedDate()}.`];
	// A prompt equal to the default is already the first section; repeating it
	// under "Agent-Specific Instructions" would only duplicate text.
	if (config.systemPrompt && config.systemPrompt !== DEFAULT_SYSTEM_PROMPT) {
		sections.push(`\n\nAgent-Specific Instructions:\n${config.systemPrompt}`);
	}
	if (hasTools) {
		sections.push("\n\nWhen you decide to use a tool, ALWAYS say a brief natural phrase BEFORE the tool call (e.g. \"Let me look that up\" or \"One moment while I check\"). This fills silence while the tool executes. Keep preambles to one short sentence.");
	}
	if (toolGuidance && toolGuidance.length > 0) {
		sections.push(`\n\nBuilt-in Tool Usage:\n${toolGuidance.join("\n")}`);
	}
	if (voice) sections.push(VOICE_RULES);
	return sections.join("");
}
|
|
373
|
+
//#endregion
|
|
374
|
+
//#region host/session-ctx.ts
|
|
375
|
+
/**
 * Build the session context shared by `buildCtx` and `buildPipelineCtx`.
 *
 * The returned object carries every property of `opts` plus per-reply
 * bookkeeping (`reply`), the chained turn promise, the conversation history
 * and the helpers that mutate them.
 *
 * @param opts - Session options; `agentConfig` and `log` are read here, and
 *   `maxHistory` caps the stored history (defaults to 200; a value that is not
 *   greater than 0 disables trimming).
 * @returns The mutable context object.
 */
function _buildBaseCtx(opts) {
	const { agentConfig, log } = opts;
	const historyLimit = opts.maxHistory ?? 200;
	// Fresh per-reply state; `activeReplyId` is null while no reply is in flight.
	const newReplyState = (activeReplyId) => ({
		pendingTools: [],
		toolCallCount: 0,
		currentReplyId: activeReplyId
	});
	const ctx = {
		...opts,
		reply: newReplyState(null),
		turnPromise: null,
		conversationMessages: [],
		maxHistory: historyLimit,
		// Returns a tool error when the call must be refused, or null when it may run.
		consumeToolCallStep(_name, replyId) {
			const isCurrentReply = replyId !== null && replyId === ctx.reply.currentReplyId;
			if (!isCurrentReply) return toolError("Reply was interrupted. Discarding stale tool call.");
			const stepLimit = agentConfig.maxSteps;
			// The step is counted before the limit check, so the refused call is counted too.
			ctx.reply.toolCallCount += 1;
			const overLimit = stepLimit !== void 0 && ctx.reply.toolCallCount > stepLimit;
			if (!overLimit) return null;
			log.info("maxSteps exceeded, refusing tool call", {
				toolCallCount: ctx.reply.toolCallCount,
				maxSteps: stepLimit
			});
			return toolError("Maximum tool steps reached. Please respond to the user now.");
		},
		// Append messages, then drop the oldest entries beyond the history limit.
		pushMessages(...msgs) {
			const history = ctx.conversationMessages;
			history.push(...msgs);
			const overflow = history.length - historyLimit;
			if (historyLimit > 0 && overflow > 0) history.splice(0, overflow);
		},
		// Start tracking a new reply; also forgets the previous turn chain.
		beginReply(replyId) {
			ctx.reply = newReplyState(replyId);
			ctx.turnPromise = null;
		},
		// Reset reply tracking so later tool calls for the old reply id are refused.
		cancelReply() {
			ctx.reply = newReplyState(null);
		},
		// Queue `p` behind whatever turn promise is currently stored.
		chainTurn(p) {
			const previous = ctx.turnPromise ?? Promise.resolve();
			ctx.turnPromise = previous.then(() => p);
		}
	};
	return ctx;
}
|
|
426
|
+
/**
 * Build the context for an S2S session: the shared base context with an
 * `s2s` slot that starts out as null.
 */
function buildCtx(opts) {
	return Object.assign(_buildBaseCtx(opts), { s2s: null });
}
|
|
431
|
+
//#endregion
|
|
432
|
+
//#region host/pipeline-session-ctx.ts
|
|
433
|
+
/**
 * Build the context for a pipeline session: the shared base context with
 * `stt` and `tts` slots that start out as null.
 */
function buildPipelineCtx(opts) {
	return Object.assign(_buildBaseCtx(opts), {
		stt: null,
		tts: null
	});
}
|
|
439
|
+
//#endregion
|
|
342
440
|
//#region host/runtime-config.ts
|
|
343
441
|
/**
|
|
344
442
|
* Runtime dependencies injected into the session pipeline.
|
|
@@ -385,35 +483,437 @@ const DEFAULT_S2S_CONFIG = {
|
|
|
385
483
|
outputSampleRate: DEFAULT_TTS_SAMPLE_RATE
|
|
386
484
|
};
|
|
387
485
|
//#endregion
|
|
388
|
-
//#region
|
|
389
|
-
function getFormattedDate() {
|
|
390
|
-
return (/* @__PURE__ */ new Date()).toLocaleDateString("en-US", {
|
|
391
|
-
weekday: "long",
|
|
392
|
-
year: "numeric",
|
|
393
|
-
month: "long",
|
|
394
|
-
day: "numeric"
|
|
395
|
-
});
|
|
396
|
-
}
|
|
397
|
-
const VOICE_RULES = "\n\nCRITICAL OUTPUT RULES — you MUST follow these for EVERY response:\nYour response will be spoken aloud by a TTS system and displayed as plain text.\n- NEVER use markdown: no **, no *, no _, no #, no `, no [](), no ---\n- NEVER use bullet points (-, *, •) or numbered lists (1., 2.)\n- NEVER use code blocks or inline code\n- NEVER mention tools, search, APIs, or technical failures to the user. If a tool returns no results, just answer naturally without explaining why.\n- Write exactly as you would say it out loud to a friend\n- Use short conversational sentences. To list things, say \"First,\" \"Next,\" \"Finally,\"\n- Keep responses concise — 1 to 3 sentences max";
|
|
486
|
+
//#region host/to-vercel-tools.ts
|
|
398
487
|
/**
|
|
399
|
-
*
|
|
488
|
+
* Converts agent {@link ToolSchema}[] to Vercel AI SDK tools with `execute`
|
|
489
|
+
* delegation to the agent's {@link ExecuteTool} function.
|
|
400
490
|
*
|
|
401
|
-
*
|
|
402
|
-
*
|
|
491
|
+
* The pipeline orchestrator passes the output to `streamText({ tools })`.
|
|
492
|
+
* Each produced tool's `execute` closure calls
|
|
493
|
+
* `ctx.executeTool(name, args, sessionId, messages(), { signal, toolCallId })`,
|
|
494
|
+
* so the existing agent tool infrastructure (argument validation, KV, hooks,
|
|
495
|
+
* timeout) remains the single source of truth for tool behavior.
|
|
403
496
|
*
|
|
404
|
-
*
|
|
405
|
-
*
|
|
406
|
-
*
|
|
407
|
-
*
|
|
408
|
-
* (no markdown, no bullet points, conversational tone, concise responses).
|
|
409
|
-
* @returns The assembled system prompt string.
|
|
497
|
+
* Per-call `options.abortSignal` (forwarded by `streamText` when the
|
|
498
|
+
* outer turn is aborted, e.g. barge-in) takes precedence over the
|
|
499
|
+
* bag-level `ctx.signal` so individual invocations respect streamText
|
|
500
|
+
* aborts.
|
|
410
501
|
*/
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
502
|
+
/**
 * Convert an array of {@link ToolSchema} to a Vercel AI SDK `ToolSet`
 * (record keyed by tool name).
 *
 * Uses the v6 `tool()` helper with `inputSchema: jsonSchema(...)` wrapping
 * the agent's JSON Schema `parameters`. Execution is delegated to
 * `ctx.executeTool` so validation, KV, timeouts, and hooks keep working.
 *
 * The record is assembled with `Object.fromEntries`, which defines each key
 * as an own property. A plain `out[name] = …` assignment would instead hit
 * the `__proto__` setter for a tool with that name and silently drop it.
 *
 * @param schemas - Tool schemas; each provides `name`, `description` and `parameters`.
 * @param ctx - Execution bag: `executeTool`, `sessionId`, a `messages()` getter
 *   and an optional fallback `signal`.
 * @returns A record mapping each tool name to its AI SDK tool. When two
 *   schemas share a name, the later one wins.
 */
function toVercelTools(schemas, ctx) {
	const entries = Array.from(schemas, (schema) => [schema.name, tool({
		description: schema.description,
		inputSchema: jsonSchema(schema.parameters),
		execute: async (args, options) => {
			const callOpts = {};
			// The per-call abort signal takes precedence over the bag-level one.
			const signal = options?.abortSignal ?? ctx.signal;
			if (signal !== void 0) callOpts.signal = signal;
			if (options?.toolCallId !== void 0) callOpts.toolCallId = options.toolCallId;
			// Hand the tool a snapshot so later history edits do not affect it.
			const history = ctx.messages().slice();
			return ctx.executeTool(schema.name, args ?? {}, ctx.sessionId, history, callOpts);
		}
	})]);
	return Object.fromEntries(entries);
}
|
|
526
|
+
//#endregion
|
|
527
|
+
//#region host/pipeline-session.ts
|
|
528
|
+
/**
 * Convert a stored conversation message into the `{ role, content }` shape
 * handed to `streamText`.
 *
 * Only the `user` role is preserved as-is. Every other role (including
 * `assistant` itself) is emitted as `assistant`. Any extra fields on the
 * input are dropped.
 *
 * @param m - Conversation message with `role` and `content`.
 * @returns A new message object containing only `role` and `content`.
 */
function toModelMessage(m) {
	return {
		role: m.role === "user" ? "user" : "assistant",
		content: m.content
	};
}
|
|
542
|
+
/**
 * Send an `error` event to the client sink.
 *
 * @param client - Sink exposing `event(payload)`.
 * @param code - Error category placed in the event's `code` field.
 * @param message - Text placed in the event's `message` field.
 */
function emitError(client, code, message) {
	const errorEvent = { type: "error", code, message };
	client.event(errorEvent);
}
|
|
549
|
+
/**
 * Render a tool output as the string carried by a `tool_call_done` event.
 *
 * Strings pass through untouched. Other values are JSON-encoded. Values
 * `JSON.stringify` cannot represent (it returns `undefined`) become "", and
 * values it rejects by throwing (circular structures, BigInt) fall back to
 * `String(output)` so one odd tool result cannot break stream handling.
 */
function stringifyToolOutput(output) {
	if (typeof output === "string") return output;
	try {
		return JSON.stringify(output) ?? "";
	} catch {
		return String(output);
	}
}
/**
 * Translate one part of the LLM stream into client events and TTS input.
 *
 * - `text-delta`: non-empty text goes to `deps.onTextDelta`, to
 *   `deps.tts.sendText` (when a TTS session exists) and to the client as an
 *   `agent_transcript` event.
 * - `tool-call`: emitted as `tool_call`; missing id/name default to "" and
 *   missing input to `{}`.
 * - `tool-result`: emitted as `tool_call_done` with a string `result`.
 * - `error`: logged and reported to the client with code "llm".
 * - anything else is ignored.
 *
 * @param part - Stream part; `part.type` selects the handling above.
 * @param deps - `{ client, tts, log, sessionId, onTextDelta }`.
 */
function handleStreamPart(part, deps) {
	switch (part.type) {
		case "text-delta": {
			const delta = part.text ?? "";
			if (delta.length === 0) return;
			deps.onTextDelta(delta);
			deps.tts?.sendText(delta);
			deps.client.event({
				type: "agent_transcript",
				text: delta
			});
			return;
		}
		case "tool-call": {
			deps.client.event({
				type: "tool_call",
				toolCallId: part.toolCallId ?? "",
				toolName: part.toolName ?? "",
				args: part.input ?? {}
			});
			return;
		}
		case "tool-result": {
			deps.client.event({
				type: "tool_call_done",
				toolCallId: part.toolCallId ?? "",
				result: stringifyToolOutput(part.output)
			});
			return;
		}
		case "error": {
			const msg = errorMessage(part.error);
			deps.log.error("LLM stream error", {
				message: msg,
				sessionId: deps.sessionId
			});
			emitError(deps.client, "llm", msg);
			return;
		}
		default: return;
	}
}
|
|
594
|
+
/**
 * Create a pluggable-provider voice session.
 *
 * Wires an STT provider, an LLM (driven through `streamText`) and a TTS
 * provider into one conversational loop. A final transcript starts a turn,
 * stream parts from the LLM are dispatched by `handleStreamPart`, and audio
 * emitted by TTS is played through the client sink. A partial transcript that
 * arrives while a turn is in flight aborts that turn (barge-in).
 *
 * @param opts - Session options. Reads `id`, `agent`, `client`, `agentConfig`,
 *   `toolSchemas`, `toolGuidance`, `executeTool`, `stt`, `llm`, `tts`,
 *   `sttApiKey`, `ttsApiKey`, plus the optional `logger` (defaults to
 *   `consoleLogger`), `sampleRate` (defaults to 16000) and `maxHistory`.
 * @returns The session handle: `start`, `stop`, `onAudio`, `onAudioReady`,
 *   `onCancel`, `onReset`, `onHistory` and `waitForTurn`.
 */
function createPipelineSession(opts) {
	const log = opts.logger ?? consoleLogger;
	const sampleRate = opts.sampleRate ?? 16e3;
	const { client, agentConfig, toolSchemas, executeTool } = opts;
	const systemPrompt = buildSystemPrompt(agentConfig, {
		hasTools: toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0,
		voice: true,
		toolGuidance: opts.toolGuidance
	});
	const ctx = buildPipelineCtx({
		id: opts.id,
		agent: opts.agent,
		client,
		agentConfig,
		executeTool,
		log,
		maxHistory: opts.maxHistory
	});
	const sessionAbort = new AbortController();
	let audioReady = false;
	let terminated = false;
	let turnController = null;
	let nextReplyId = 0;
	const sttSubs = [];
	const ttsSubs = [];
	/**
	 * Abort the in-flight turn (if any), cancel pending TTS output and reset
	 * the reply bookkeeping. Shared by every interruption path.
	 */
	function interruptReply() {
		turnController?.abort();
		turnController = null;
		ctx.tts?.cancel();
		ctx.cancelReply();
	}
	/** Clear `turnController` unless a newer turn has already replaced `ctl`. */
	function releaseTurn(ctl) {
		if (turnController === ctl) turnController = null;
	}
	/**
	 * Tear down the session after an unrecoverable provider error. Aborts the
	 * in-flight turn, cancels TTS, signals providers to close via sessionAbort,
	 * and flips `terminated` so future STT events and audio frames become
	 * no-ops. Idempotent.
	 */
	function terminate() {
		if (terminated) return;
		terminated = true;
		interruptReply();
		sessionAbort.abort();
	}
	/** Partial transcript: only matters as a barge-in signal while a turn is running. */
	function onSttPartial(_text) {
		if (terminated || turnController === null) return;
		log.info("Pipeline barge-in", { sessionId: opts.id });
		interruptReply();
		client.event({ type: "cancelled" });
	}
	/** Final transcript: replace any in-flight turn and start a new one. */
	function onSttFinal(text) {
		if (terminated) return;
		const trimmed = text.trim();
		if (trimmed === "") return;
		if (turnController !== null) {
			log.info("Pipeline replacing in-flight turn", { sessionId: opts.id });
			interruptReply();
			client.event({ type: "cancelled" });
		}
		// The client receives the transcript as delivered; the turn runs on the trimmed text.
		client.event({
			type: "user_transcript",
			text
		});
		const turn = runTurn(trimmed).catch((err) => {
			log.error("Pipeline turn crashed", {
				error: errorMessage(err),
				sessionId: opts.id
			});
		});
		ctx.chainTurn(turn);
	}
	/** Log a provider error, report it to the client and terminate the session. */
	function failProvider(which, err) {
		if (terminated) return;
		log.error(which === "stt" ? "STT error" : "TTS error", {
			code: err.code,
			message: err.message,
			sessionId: opts.id
		});
		emitError(client, which, err.message);
		terminate();
	}
	function onSttError(err) {
		failProvider("stt", err);
	}
	function onTtsError(err) {
		failProvider("tts", err);
	}
	/**
	 * Run `streamText` for one turn and dispatch every stream part until the
	 * stream ends or `ctl` is aborted. Failures that are not caused by an
	 * abort are logged and reported to the client with code "llm".
	 */
	async function consumeLlmStream(ctl, messages, tools, onDelta) {
		const deps = {
			client,
			tts: ctx.tts,
			log,
			sessionId: opts.id,
			onTextDelta: onDelta
		};
		try {
			const stepLimit = agentConfig.maxSteps ?? 5;
			const stream = streamText({
				model: opts.llm,
				system: systemPrompt,
				messages,
				tools,
				stopWhen: stepCountIs(stepLimit),
				abortSignal: ctl.signal
			});
			for await (const part of stream.fullStream) {
				if (ctl.signal.aborted) break;
				handleStreamPart(part, deps);
			}
		} catch (err) {
			if (ctl.signal.aborted) return;
			const msg = errorMessage(err);
			log.error("LLM streamText failed", {
				error: msg,
				sessionId: opts.id
			});
			emitError(client, "llm", msg);
		}
	}
	/**
	 * Flush TTS and wait for drain. Resolves on any of:
	 *   - TTS emits `done`
	 *   - `signal` aborts (barge-in, provider error, session stop)
	 *   - `PIPELINE_FLUSH_TIMEOUT_MS` elapses
	 * Resolves immediately if no TTS session or if `signal` is already aborted.
	 */
	function flushTtsAndWait(signal) {
		const tts = ctx.tts;
		if (!tts) return Promise.resolve();
		if (signal.aborted) return Promise.resolve();
		return new Promise((resolve) => {
			let unsubscribeDone = null;
			let timeoutHandle = null;
			const settle = () => {
				// Release every wake-up source so only the first trigger has any effect.
				if (unsubscribeDone) {
					unsubscribeDone();
					unsubscribeDone = null;
				}
				if (timeoutHandle) {
					clearTimeout(timeoutHandle);
					timeoutHandle = null;
				}
				signal.removeEventListener("abort", onAbort);
				resolve();
			};
			const onAbort = () => settle();
			signal.addEventListener("abort", onAbort, { once: true });
			unsubscribeDone = tts.on("done", settle);
			timeoutHandle = setTimeout(() => {
				log.warn("TTS flush timeout", {
					sessionId: opts.id,
					timeoutMs: PIPELINE_FLUSH_TIMEOUT_MS
				});
				settle();
			}, PIPELINE_FLUSH_TIMEOUT_MS);
			tts.flush();
		});
	}
	/**
	 * Execute one conversational turn: record the user message, stream the LLM
	 * reply, wait for TTS to drain, then store the reply and signal completion.
	 * Returns early, without emitting `reply_done`, when the turn was aborted.
	 */
	async function runTurn(userText) {
		nextReplyId += 1;
		ctx.beginReply(`pipeline-${nextReplyId}`);
		ctx.pushMessages({
			role: "user",
			content: userText
		});
		const ctl = new AbortController();
		turnController = ctl;
		const tools = toVercelTools(toolSchemas, {
			executeTool,
			sessionId: opts.id,
			messages: () => ctx.conversationMessages,
			signal: ctl.signal
		});
		const messages = ctx.conversationMessages.map(toModelMessage);
		const replyChunks = [];
		await consumeLlmStream(ctl, messages, tools, (delta) => {
			replyChunks.push(delta);
		});
		if (!ctl.signal.aborted) await flushTtsAndWait(ctl.signal);
		if (ctl.signal.aborted) {
			releaseTurn(ctl);
			return;
		}
		const replyText = replyChunks.join("");
		if (replyText.length > 0) ctx.pushMessages({
			role: "assistant",
			content: replyText
		});
		client.playAudioDone();
		client.event({ type: "reply_done" });
		releaseTurn(ctl);
	}
	/** Log a failed provider `open()` and report it to the client under the provider's code. */
	function reportOpenRejection(which, reason) {
		const msg = errorMessage(reason);
		const label = which === "stt" ? "STT" : "TTS";
		log.error(`${label} open failed`, {
			error: msg,
			sessionId: opts.id
		});
		emitError(client, which, msg);
	}
	/** Keep an opened STT session and subscribe to its events, or close it when tearing down. */
	async function adoptStt(sttSession, teardown) {
		if (teardown) {
			await sttSession.close().catch(() => void 0);
			return;
		}
		ctx.stt = sttSession;
		const handlers = [
			["partial", onSttPartial],
			["final", onSttFinal],
			["error", onSttError]
		];
		for (const [eventName, handler] of handlers) sttSubs.push(sttSession.on(eventName, handler));
	}
	/** Keep an opened TTS session and subscribe to its events, or close it when tearing down. */
	async function adoptTts(ttsSession, teardown) {
		if (teardown) {
			await ttsSession.close().catch(() => void 0);
			return;
		}
		ctx.tts = ttsSession;
		const forwardAudio = (pcm) => {
			// Re-view the same bytes as Uint8Array for the client sink (no copy).
			client.playAudioChunk(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
		};
		ttsSubs.push(ttsSession.on("audio", forwardAudio));
		ttsSubs.push(ttsSession.on("error", onTtsError));
	}
	/**
	 * Open both providers concurrently. If either open fails, or the session
	 * was aborted meanwhile, any provider that did open is closed again instead
	 * of being adopted.
	 */
	async function openProviders() {
		const { signal } = sessionAbort;
		const [sttResult, ttsResult] = await Promise.allSettled([opts.stt.open({
			sampleRate,
			apiKey: opts.sttApiKey,
			sttPrompt: agentConfig.sttPrompt,
			signal
		}), opts.tts.open({
			sampleRate,
			apiKey: opts.ttsApiKey,
			signal
		})]);
		const sttFailed = sttResult.status === "rejected";
		const ttsFailed = ttsResult.status === "rejected";
		if (sttFailed) reportOpenRejection("stt", sttResult.reason);
		if (ttsFailed) reportOpenRejection("tts", ttsResult.reason);
		const aborted = signal.aborted;
		const teardown = aborted || sttFailed || ttsFailed;
		if (!sttFailed) await adoptStt(sttResult.value, teardown);
		if (!ttsFailed) await adoptTts(ttsResult.value, teardown);
		if (!aborted && (sttFailed || ttsFailed)) terminate();
	}
	/**
	 * View incoming bytes as 16-bit samples. Aligned, even-length input is
	 * viewed in place; otherwise the bytes are copied to a fresh buffer and a
	 * trailing odd byte is dropped.
	 */
	function toInt16Samples(data) {
		const { byteOffset: offset, byteLength: length } = data;
		const viewableInPlace = offset % 2 === 0 && length % 2 === 0;
		if (viewableInPlace) return new Int16Array(data.buffer, offset, length / 2);
		const copy = new Uint8Array(length - length % 2);
		copy.set(data.subarray(0, copy.byteLength));
		return new Int16Array(copy.buffer);
	}
	return {
		async start() {
			await openProviders();
		},
		async stop() {
			if (sessionAbort.signal.aborted) return;
			sessionAbort.abort();
			turnController?.abort();
			for (const subs of [sttSubs, ttsSubs]) {
				for (const off of subs) off();
			}
			sttSubs.length = 0;
			ttsSubs.length = 0;
			// Let the aborted turn unwind before closing the providers it uses.
			if (ctx.turnPromise !== null) await ctx.turnPromise;
			await ctx.stt?.close().catch(() => {});
			await ctx.tts?.close().catch(() => {});
		},
		onAudio(data) {
			if (terminated || !audioReady) return;
			const pcm = toInt16Samples(data);
			ctx.stt?.sendAudio(pcm);
		},
		onAudioReady() {
			audioReady = true;
		},
		onCancel() {
			if (terminated) return;
			interruptReply();
			client.event({ type: "cancelled" });
		},
		onReset() {
			if (terminated) return;
			interruptReply();
			ctx.conversationMessages = [];
			ctx.turnPromise = null;
			client.event({ type: "reset" });
		},
		onHistory(incoming) {
			if (terminated) return;
			const restored = incoming.map((m) => ({
				role: m.role,
				content: m.content
			}));
			ctx.pushMessages(...restored);
		},
		waitForTurn() {
			return ctx.turnPromise ?? Promise.resolve();
		}
	};
}
|
|
418
918
|
//#endregion
|
|
419
919
|
//#region host/s2s.ts
|
|
@@ -544,6 +1044,10 @@ function connectS2s(opts) {
|
|
|
544
1044
|
}
|
|
545
1045
|
ws.send(`{"type":"input.audio","audio":"${uint8ToBase64(audio)}"}`);
|
|
546
1046
|
},
|
|
1047
|
+
sendAudioRaw(jsonFrame) {
|
|
1048
|
+
if (ws.readyState !== 1) return;
|
|
1049
|
+
ws.send(jsonFrame);
|
|
1050
|
+
},
|
|
547
1051
|
sendToolResult(callId, result) {
|
|
548
1052
|
const msg = {
|
|
549
1053
|
type: "tool.result",
|
|
@@ -639,60 +1143,6 @@ function connectS2s(opts) {
|
|
|
639
1143
|
});
|
|
640
1144
|
}
|
|
641
1145
|
//#endregion
|
|
642
|
-
//#region host/session-ctx.ts
|
|
643
|
-
function buildCtx(opts) {
|
|
644
|
-
const { agentConfig, log } = opts;
|
|
645
|
-
const maxHistory = opts.maxHistory ?? 200;
|
|
646
|
-
const ctx = {
|
|
647
|
-
...opts,
|
|
648
|
-
s2s: null,
|
|
649
|
-
reply: {
|
|
650
|
-
pendingTools: [],
|
|
651
|
-
toolCallCount: 0,
|
|
652
|
-
currentReplyId: null
|
|
653
|
-
},
|
|
654
|
-
turnPromise: null,
|
|
655
|
-
conversationMessages: [],
|
|
656
|
-
maxHistory,
|
|
657
|
-
consumeToolCallStep(_name, replyId) {
|
|
658
|
-
if (replyId === null || replyId !== ctx.reply.currentReplyId) return toolError("Reply was interrupted. Discarding stale tool call.");
|
|
659
|
-
const maxSteps = agentConfig.maxSteps;
|
|
660
|
-
ctx.reply.toolCallCount++;
|
|
661
|
-
if (maxSteps !== void 0 && ctx.reply.toolCallCount > maxSteps) {
|
|
662
|
-
log.info("maxSteps exceeded, refusing tool call", {
|
|
663
|
-
toolCallCount: ctx.reply.toolCallCount,
|
|
664
|
-
maxSteps
|
|
665
|
-
});
|
|
666
|
-
return toolError("Maximum tool steps reached. Please respond to the user now.");
|
|
667
|
-
}
|
|
668
|
-
return null;
|
|
669
|
-
},
|
|
670
|
-
pushMessages(...msgs) {
|
|
671
|
-
ctx.conversationMessages.push(...msgs);
|
|
672
|
-
if (maxHistory > 0 && ctx.conversationMessages.length > maxHistory) ctx.conversationMessages.splice(0, ctx.conversationMessages.length - maxHistory);
|
|
673
|
-
},
|
|
674
|
-
beginReply(replyId) {
|
|
675
|
-
ctx.reply = {
|
|
676
|
-
pendingTools: [],
|
|
677
|
-
toolCallCount: 0,
|
|
678
|
-
currentReplyId: replyId
|
|
679
|
-
};
|
|
680
|
-
ctx.turnPromise = null;
|
|
681
|
-
},
|
|
682
|
-
cancelReply() {
|
|
683
|
-
ctx.reply = {
|
|
684
|
-
pendingTools: [],
|
|
685
|
-
toolCallCount: 0,
|
|
686
|
-
currentReplyId: null
|
|
687
|
-
};
|
|
688
|
-
},
|
|
689
|
-
chainTurn(p) {
|
|
690
|
-
ctx.turnPromise = (ctx.turnPromise ?? Promise.resolve()).then(() => p);
|
|
691
|
-
}
|
|
692
|
-
};
|
|
693
|
-
return ctx;
|
|
694
|
-
}
|
|
695
|
-
//#endregion
|
|
696
1146
|
//#region host/session.ts
|
|
697
1147
|
/** @internal Not part of the public API. Exposed for testing only. */
|
|
698
1148
|
const _internals = { connectS2s };
|
|
@@ -1288,6 +1738,17 @@ function createLocalKv() {
|
|
|
1288
1738
|
return createUnstorageKv({ storage: createStorage() });
|
|
1289
1739
|
}
|
|
1290
1740
|
/**
 * Resolve an API key host-side for pipeline providers.
 *
 * Checks the agent's declared env first, then the host process env as a
 * fallback. A missing or empty-string value counts as absent at either
 * level, so a blank entry in the agent env does not block the host fallback.
 * Returns `""` when absent — pipeline providers surface a clear
 * `MissingCredentialsError` via their `open()` that the orchestrator
 * converts to a `session.error` wire event.
 *
 * @param envVar - Name of the environment variable to look up.
 * @param env - The agent's declared environment; may be omitted.
 * @returns The first non-empty value found, or `""`.
 */
function resolveApiKey(envVar, env) {
	for (const source of [env, process.env]) {
		const value = source?.[envVar];
		if (value != null && value !== "") return value;
	}
	return "";
}
|
|
1751
|
+
/**
|
|
1291
1752
|
* Create an agent runtime — the execution engine for a voice agent.
|
|
1292
1753
|
*
|
|
1293
1754
|
* Merges built-in and custom tool definitions, builds tool schemas for the
|
|
@@ -1301,6 +1762,9 @@ function createLocalKv() {
|
|
|
1301
1762
|
*/
|
|
1302
1763
|
function createRuntime(opts) {
|
|
1303
1764
|
const { agent, env, kv = createLocalKv(), createWebSocket, logger = consoleLogger, s2sConfig = DEFAULT_S2S_CONFIG, sessionStartTimeoutMs, shutdownTimeoutMs = DEFAULT_SHUTDOWN_TIMEOUT_MS } = opts;
|
|
1765
|
+
const providerCount = (opts.stt != null ? 1 : 0) + (opts.llm != null ? 1 : 0) + (opts.tts != null ? 1 : 0);
|
|
1766
|
+
if (providerCount !== 0 && providerCount !== 3) throw new Error("stt, llm, and tts must be set together");
|
|
1767
|
+
const mode = providerCount === 3 ? "pipeline" : "s2s";
|
|
1304
1768
|
const agentConfig = toAgentConfig(agent);
|
|
1305
1769
|
const sessions = /* @__PURE__ */ new Map();
|
|
1306
1770
|
const sinkMap = /* @__PURE__ */ new Map();
|
|
@@ -1365,6 +1829,26 @@ function createRuntime(opts) {
|
|
|
1365
1829
|
}
|
|
1366
1830
|
function createSession(sessionOpts) {
|
|
1367
1831
|
sinkMap.set(sessionOpts.id, sessionOpts.client);
|
|
1832
|
+
if (mode === "pipeline") {
|
|
1833
|
+
const stt = opts.stt;
|
|
1834
|
+
const llm = opts.llm;
|
|
1835
|
+
const tts = opts.tts;
|
|
1836
|
+
return createPipelineSession({
|
|
1837
|
+
id: sessionOpts.id,
|
|
1838
|
+
agent: sessionOpts.agent,
|
|
1839
|
+
client: sessionOpts.client,
|
|
1840
|
+
agentConfig,
|
|
1841
|
+
toolSchemas,
|
|
1842
|
+
toolGuidance,
|
|
1843
|
+
executeTool,
|
|
1844
|
+
stt,
|
|
1845
|
+
llm,
|
|
1846
|
+
tts,
|
|
1847
|
+
sttApiKey: resolveApiKey("ASSEMBLYAI_API_KEY", env),
|
|
1848
|
+
ttsApiKey: resolveApiKey("CARTESIA_API_KEY", env),
|
|
1849
|
+
logger
|
|
1850
|
+
});
|
|
1851
|
+
}
|
|
1368
1852
|
const apiKey = env.ASSEMBLYAI_API_KEY ?? "";
|
|
1369
1853
|
return createS2sSession({
|
|
1370
1854
|
id: sessionOpts.id,
|
|
@@ -1565,4 +2049,4 @@ function createServer(options) {
|
|
|
1565
2049
|
};
|
|
1566
2050
|
}
|
|
1567
2051
|
//#endregion
|
|
1568
|
-
export { DEFAULT_S2S_CONFIG, _internals, buildCtx, consoleLogger, createRuntime, createS2sSession, createServer, createUnstorageKv, executeInIsolate, executeToolCall, jsonLogger, resolveAllBuiltins, wireSessionSocket };
|
|
2052
|
+
export { DEFAULT_S2S_CONFIG, _buildBaseCtx, _internals, buildCtx, consoleLogger, createRuntime, createS2sSession, createServer, createUnstorageKv, executeInIsolate, executeToolCall, jsonLogger, resolveAllBuiltins, wireSessionSocket };
|