@alexkroman1/aai 1.3.2 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/.turbo/turbo-build.log +18 -14
  2. package/CHANGELOG.md +2 -0
  3. package/dist/_internal-types-3p3OJZPb.js +145 -0
  4. package/dist/anthropic-BrUCPKUc.js +10 -0
  5. package/dist/assemblyai-Cxg9eobY.js +18 -0
  6. package/dist/cartesia-DwDk2tEu.js +10 -0
  7. package/dist/host/_pipeline-test-fakes.d.ts +5 -5
  8. package/dist/host/pipeline-session.d.ts +5 -5
  9. package/dist/host/providers/resolve.d.ts +34 -0
  10. package/dist/host/providers/stt/assemblyai.d.ts +9 -18
  11. package/dist/host/providers/tts/cartesia.d.ts +11 -18
  12. package/dist/host/runtime-barrel.js +345 -42
  13. package/dist/host/runtime.d.ts +13 -9
  14. package/dist/index.js +2 -91
  15. package/dist/sdk/_internal-types.d.ts +27 -1
  16. package/dist/sdk/manifest-barrel.d.ts +2 -0
  17. package/dist/sdk/manifest-barrel.js +2 -2
  18. package/dist/sdk/manifest.d.ts +13 -2
  19. package/dist/sdk/protocol.d.ts +3 -3
  20. package/dist/sdk/protocol.js +1 -1
  21. package/dist/sdk/providers/llm/anthropic.d.ts +23 -0
  22. package/dist/sdk/providers/llm-barrel.d.ts +9 -0
  23. package/dist/sdk/providers/llm-barrel.js +2 -0
  24. package/dist/sdk/providers/stt/assemblyai.d.ts +30 -0
  25. package/dist/sdk/providers/stt-barrel.d.ts +9 -0
  26. package/dist/sdk/providers/stt-barrel.js +2 -0
  27. package/dist/sdk/providers/tts/cartesia.d.ts +23 -0
  28. package/dist/sdk/providers/tts-barrel.d.ts +9 -0
  29. package/dist/sdk/providers/tts-barrel.js +2 -0
  30. package/dist/sdk/providers.d.ts +59 -11
  31. package/dist/types-KUgezM6u.js +128 -0
  32. package/host/_pipeline-test-fakes.ts +6 -6
  33. package/host/integration/pipeline-reference.integration.test.ts +4 -4
  34. package/host/pipeline-session.ts +6 -6
  35. package/host/providers/providers.test-d.ts +19 -10
  36. package/host/providers/resolve.ts +87 -0
  37. package/host/providers/stt/assemblyai.test.ts +2 -2
  38. package/host/providers/stt/assemblyai.ts +25 -47
  39. package/host/providers/tts/cartesia.test.ts +2 -2
  40. package/host/providers/tts/cartesia.ts +43 -73
  41. package/host/runtime.ts +66 -39
  42. package/package.json +13 -7
  43. package/sdk/__snapshots__/exports.test.ts.snap +2 -0
  44. package/sdk/__snapshots__/schema-shapes.test.ts.snap +4 -0
  45. package/sdk/_internal-types.ts +28 -1
  46. package/sdk/define.test.ts +12 -10
  47. package/sdk/manifest-barrel.ts +2 -0
  48. package/sdk/manifest.test.ts +6 -3
  49. package/sdk/manifest.ts +26 -18
  50. package/sdk/providers/llm/anthropic.ts +31 -0
  51. package/sdk/providers/llm-barrel.ts +12 -0
  52. package/sdk/providers/stt/assemblyai.ts +38 -0
  53. package/sdk/providers/stt-barrel.ts +12 -0
  54. package/sdk/providers/tts/cartesia.ts +31 -0
  55. package/sdk/providers/tts-barrel.ts +12 -0
  56. package/sdk/providers.ts +81 -17
  57. package/dist/_internal-types-CoDTiBd1.js +0 -61
  58. package/dist/host/providers/llm.d.ts +0 -2
  59. package/dist/host/providers/stt-barrel.d.ts +0 -8
  60. package/dist/host/providers/stt-barrel.js +0 -92
  61. package/dist/host/providers/stt.d.ts +0 -2
  62. package/dist/host/providers/tts-barrel.d.ts +0 -8
  63. package/dist/host/providers/tts-barrel.js +0 -182
  64. package/dist/host/providers/tts.d.ts +0 -2
  65. package/dist/types-Cfx_4QDK.js +0 -39
  66. package/host/providers/llm.ts +0 -3
  67. package/host/providers/stt-barrel.ts +0 -13
  68. package/host/providers/stt.ts +0 -3
  69. package/host/providers/tts-barrel.ts +0 -13
  70. package/host/providers/tts.ts +0 -3
  71. /package/dist/{constants-BL3nvg4I.js → constants-C2nirZUI.js} +0 -0
@@ -1,15 +1,22 @@
1
- import { _ as TOOL_EXECUTION_TIMEOUT_MS, a as DEFAULT_SHUTDOWN_TIMEOUT_MS, c as FETCH_TIMEOUT_MS, d as MAX_PAGE_CHARS, f as MAX_TOOL_RESULT_CHARS, g as RUN_CODE_TIMEOUT_MS, h as PIPELINE_FLUSH_TIMEOUT_MS, l as MAX_HTML_BYTES, m as MAX_WS_PAYLOAD_BYTES, o as DEFAULT_STT_SAMPLE_RATE, p as MAX_VALUE_SIZE, s as DEFAULT_TTS_SAMPLE_RATE, t as AGENT_CSP } from "../constants-BL3nvg4I.js";
2
- import { r as DEFAULT_SYSTEM_PROMPT } from "../types-Cfx_4QDK.js";
1
+ import { r as DEFAULT_SYSTEM_PROMPT } from "../types-KUgezM6u.js";
2
+ import { _ as TOOL_EXECUTION_TIMEOUT_MS, a as DEFAULT_SHUTDOWN_TIMEOUT_MS, c as FETCH_TIMEOUT_MS, d as MAX_PAGE_CHARS, f as MAX_TOOL_RESULT_CHARS, g as RUN_CODE_TIMEOUT_MS, h as PIPELINE_FLUSH_TIMEOUT_MS, l as MAX_HTML_BYTES, m as MAX_WS_PAYLOAD_BYTES, o as DEFAULT_STT_SAMPLE_RATE, p as MAX_VALUE_SIZE, s as DEFAULT_TTS_SAMPLE_RATE, t as AGENT_CSP } from "../constants-C2nirZUI.js";
3
3
  import { i as toolError, n as errorDetail, r as errorMessage, t as parseWsUpgradeParams } from "../ws-upgrade-BeOQ7fXL.js";
4
4
  import { ClientMessageSchema, buildReadyConfig, lenientParse } from "../sdk/protocol.js";
5
- import { a as toAgentConfig, i as agentToolsToSchemas, n as EMPTY_PARAMS } from "../_internal-types-CoDTiBd1.js";
5
+ import { a as toAgentConfig, c as makeSttError, i as agentToolsToSchemas, l as makeTtsError, n as EMPTY_PARAMS, s as assertProviderTriple } from "../_internal-types-3p3OJZPb.js";
6
+ import { t as ANTHROPIC_KIND } from "../anthropic-BrUCPKUc.js";
7
+ import { t as ASSEMBLYAI_KIND } from "../assemblyai-Cxg9eobY.js";
8
+ import { t as CARTESIA_KIND } from "../cartesia-DwDk2tEu.js";
6
9
  import { z } from "zod";
7
10
  import { convert } from "html-to-text";
8
11
  import vm from "node:vm";
9
12
  import pTimeout from "p-timeout";
10
13
  import { createStorage, prefixStorage } from "unstorage";
11
14
  import { jsonSchema, stepCountIs, streamText, tool } from "ai";
15
+ import { createAnthropic } from "@ai-sdk/anthropic";
16
+ import { AssemblyAI } from "assemblyai";
12
17
  import { createNanoEvents } from "nanoevents";
18
+ import { randomUUID } from "node:crypto";
19
+ import { Cartesia } from "@cartesia/cartesia-js";
13
20
  import WsWebSocket, { WebSocketServer } from "ws";
14
21
  import fs from "node:fs";
15
22
  import http from "node:http";
@@ -916,6 +923,305 @@ function createPipelineSession(opts) {
916
923
  };
917
924
  }
918
925
  //#endregion
926
+ //#region host/providers/stt/assemblyai.ts
927
+ /**
928
+ * AssemblyAI Universal-Streaming STT opener (host-only).
929
+ *
930
+ * The user-facing descriptor factory (`assemblyAI(...)`) lives in
931
+ * `sdk/providers/stt/assemblyai.ts`. This module is the host-side
932
+ * counterpart: it takes the descriptor options + an API key and
933
+ * returns an {@link SttOpener} that the pipeline session drives.
934
+ *
935
+ * Default model: `"u3pro-rt"` (Universal-3 Pro Real-Time). The adapter
936
+ * maps that to the SDK's `"u3-rt-pro"` `speechModel` value; any other
937
+ * string is forwarded verbatim.
938
+ */
939
+ /** Translate the descriptor's model alias to the SDK's `speechModel` value. */
940
+ function resolveSpeechModel(model) {
941
+ if (model === "u3pro-rt") return "u3-rt-pro";
942
+ return model;
943
+ }
944
+ /** Build an {@link SttOpener} from resolved AssemblyAI descriptor options. */
945
+ function openAssemblyAI(opts = {}) {
946
+ return {
947
+ name: "assemblyai",
948
+ async open(openOpts) {
949
+ const apiKey = openOpts.apiKey || process.env.ASSEMBLYAI_API_KEY;
950
+ if (!apiKey) throw makeSttError("stt_auth_failed", "AssemblyAI STT: missing API key. Set ASSEMBLYAI_API_KEY in the agent env.");
951
+ const client = new AssemblyAI({ apiKey });
952
+ const speechModel = resolveSpeechModel(opts.model ?? "u3pro-rt");
953
+ const transcriber = client.streaming.transcriber({
954
+ sampleRate: openOpts.sampleRate,
955
+ speechModel,
956
+ ...openOpts.sttPrompt ? { prompt: openOpts.sttPrompt } : {}
957
+ });
958
+ const emitter = createNanoEvents();
959
+ let closed = false;
960
+ transcriber.on("turn", (event) => {
961
+ if (closed) return;
962
+ const text = event.transcript ?? "";
963
+ if (event.end_of_turn) {
964
+ if (text.length > 0) emitter.emit("final", text);
965
+ } else if (text.length > 0) emitter.emit("partial", text);
966
+ });
967
+ transcriber.on("error", (err) => {
968
+ if (closed) return;
969
+ emitter.emit("error", makeSttError("stt_stream_error", err?.message ?? String(err)));
970
+ });
971
+ transcriber.on("close", (code) => {
972
+ if (closed) return;
973
+ if (code !== 1e3) emitter.emit("error", makeSttError("stt_stream_error", `socket closed ${code}`));
974
+ });
975
+ try {
976
+ await transcriber.connect();
977
+ } catch (cause) {
978
+ throw makeSttError("stt_connect_failed", `AssemblyAI STT: connect failed: ${cause instanceof Error ? cause.message : String(cause)}`);
979
+ }
980
+ const close = async () => {
981
+ if (closed) return;
982
+ closed = true;
983
+ try {
984
+ await transcriber.close();
985
+ } catch {}
986
+ };
987
+ if (openOpts.signal.aborted) close();
988
+ else openOpts.signal.addEventListener("abort", () => void close(), { once: true });
989
+ return {
990
+ sendAudio(pcm) {
991
+ if (closed) return;
992
+ const copy = new Uint8Array(pcm.byteLength);
993
+ copy.set(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
994
+ transcriber.sendAudio(copy.buffer);
995
+ },
996
+ on(event, fn) {
997
+ return emitter.on(event, fn);
998
+ },
999
+ close,
1000
+ _transcriber: transcriber
1001
+ };
1002
+ }
1003
+ };
1004
+ }
1005
+ //#endregion
1006
+ //#region host/providers/tts/cartesia.ts
1007
+ /**
1008
+ * Cartesia TTS opener (host-only).
1009
+ *
1010
+ * The user-facing descriptor factory (`cartesia(...)`) lives in
1011
+ * `sdk/providers/tts/cartesia.ts`. This module is the host-side
1012
+ * counterpart: it takes the descriptor options + an API key and
1013
+ * returns a {@link TtsOpener} that the pipeline session drives.
1014
+ *
1015
+ * Wraps `@cartesia/cartesia-js`'s `TTSWS` / `TTSWSContext` and normalizes it
1016
+ * onto the {@link TtsEvents} contract consumed by the pipeline orchestrator.
1017
+ *
1018
+ * **Per-turn context lifecycle.** Each `sendText(...)` within the same turn
1019
+ * appends to the same Cartesia context. On `flush()` or `cancel()`, a new
1020
+ * context is minted for the next turn — so concurrent `cancel({ contextId })`
1021
+ * only targets the in-flight turn, never the one that follows.
1022
+ *
1023
+ * **Audio format.** The adapter requests `raw` / `pcm_s16le` at the
1024
+ * negotiated `sampleRate` so it can forward chunks as `Int16Array` with no
1025
+ * conversion.
1026
+ */
1027
+ /** PCM16 sample rates supported by Cartesia's `raw` output format. */
1028
+ const CARTESIA_PCM16_RATES = [
1029
+ 8e3,
1030
+ 16e3,
1031
+ 22050,
1032
+ 24e3,
1033
+ 44100,
1034
+ 48e3
1035
+ ];
1036
+ function assertSupportedSampleRate(rate) {
1037
+ if (CARTESIA_PCM16_RATES.includes(rate)) return rate;
1038
+ throw makeTtsError("tts_connect_failed", `Cartesia TTS: unsupported sample rate ${rate}. Supported: ${CARTESIA_PCM16_RATES.join(", ")}.`);
1039
+ }
1040
+ /** Build a {@link TtsOpener} from resolved Cartesia descriptor options. */
1041
+ function openCartesia(opts) {
1042
+ return {
1043
+ name: "cartesia",
1044
+ async open(openOpts) {
1045
+ const apiKey = openOpts.apiKey || process.env.CARTESIA_API_KEY;
1046
+ if (!apiKey) throw makeTtsError("tts_auth_failed", "Cartesia TTS: missing API key. Set CARTESIA_API_KEY in the agent env.");
1047
+ const sampleRate = assertSupportedSampleRate(openOpts.sampleRate);
1048
+ const model = opts.model ?? "sonic-2";
1049
+ const language = opts.language ?? "en";
1050
+ const client = new Cartesia({ apiKey });
1051
+ let ws;
1052
+ try {
1053
+ ws = await client.tts.websocket();
1054
+ } catch (cause) {
1055
+ throw makeTtsError("tts_connect_failed", `Cartesia TTS: connect failed: ${cause instanceof Error ? cause.message : String(cause)}`);
1056
+ }
1057
+ const emitter = createNanoEvents();
1058
+ let closed = false;
1059
+ /** Mint a fresh context bound to the shared TTSWS connection. */
1060
+ const mintContext = () => ws.context({
1061
+ model_id: model,
1062
+ voice: {
1063
+ mode: "id",
1064
+ id: opts.voice
1065
+ },
1066
+ output_format: {
1067
+ container: "raw",
1068
+ encoding: "pcm_s16le",
1069
+ sample_rate: sampleRate
1070
+ },
1071
+ contextId: randomUUID()
1072
+ });
1073
+ let context = mintContext();
1074
+ /**
1075
+ * `doneEmitted` guards against emitting `done` more than once per turn.
1076
+ * Reset whenever a fresh context is minted (i.e. at turn boundaries).
1077
+ */
1078
+ let doneEmitted = false;
1079
+ const rotateContext = () => {
1080
+ context = mintContext();
1081
+ doneEmitted = false;
1082
+ };
1083
+ const emitDoneOnce = () => {
1084
+ if (doneEmitted || closed) return;
1085
+ doneEmitted = true;
1086
+ emitter.emit("done");
1087
+ };
1088
+ ws.on("chunk", (event) => {
1089
+ if (closed) return;
1090
+ if (event.context_id !== context.contextId) return;
1091
+ const buf = event.audio;
1092
+ if (!buf || buf.byteLength === 0) return;
1093
+ const evenBytes = buf.byteLength - buf.byteLength % 2;
1094
+ if (evenBytes === 0) return;
1095
+ const pcm = new Int16Array(buf.buffer.slice(buf.byteOffset, buf.byteOffset + evenBytes));
1096
+ emitter.emit("audio", pcm);
1097
+ });
1098
+ ws.on("done", (event) => {
1099
+ if (closed) return;
1100
+ if (event.context_id !== context.contextId) return;
1101
+ emitDoneOnce();
1102
+ });
1103
+ ws.on("error", (err) => {
1104
+ if (closed) return;
1105
+ emitter.emit("error", makeTtsError("tts_stream_error", err?.message ?? String(err)));
1106
+ });
1107
+ const close = async () => {
1108
+ if (closed) return;
1109
+ closed = true;
1110
+ try {
1111
+ ws.close({
1112
+ code: 1e3,
1113
+ reason: "client close"
1114
+ });
1115
+ } catch {}
1116
+ };
1117
+ if (openOpts.signal.aborted) close();
1118
+ else openOpts.signal.addEventListener("abort", () => void close(), { once: true });
1119
+ const baseRequest = {
1120
+ model_id: model,
1121
+ voice: {
1122
+ mode: "id",
1123
+ id: opts.voice
1124
+ },
1125
+ output_format: {
1126
+ container: "raw",
1127
+ encoding: "pcm_s16le",
1128
+ sample_rate: sampleRate
1129
+ },
1130
+ language
1131
+ };
1132
+ const ignoreRejection = (_err) => {};
1133
+ return {
1134
+ sendText(text) {
1135
+ if (closed || text.length === 0) return;
1136
+ context.send({
1137
+ ...baseRequest,
1138
+ transcript: text,
1139
+ continue: true
1140
+ }).catch(ignoreRejection);
1141
+ },
1142
+ flush() {
1143
+ if (closed) return;
1144
+ context.send({
1145
+ ...baseRequest,
1146
+ transcript: "",
1147
+ continue: false
1148
+ }).catch(ignoreRejection);
1149
+ queueMicrotask(emitDoneOnce);
1150
+ rotateContext();
1151
+ },
1152
+ cancel() {
1153
+ if (closed) return;
1154
+ context.cancel().catch(ignoreRejection);
1155
+ emitDoneOnce();
1156
+ rotateContext();
1157
+ },
1158
+ on(event, fn) {
1159
+ return emitter.on(event, fn);
1160
+ },
1161
+ close,
1162
+ _ws: ws,
1163
+ _currentContextId: () => context.contextId
1164
+ };
1165
+ }
1166
+ };
1167
+ }
1168
+ //#endregion
1169
+ //#region host/providers/resolve.ts
1170
+ /**
1171
+ * Descriptor → concrete-provider resolution (host-only).
1172
+ *
1173
+ * User code (and the server, after extracting config from a bundled agent)
1174
+ * holds `SttProvider` / `LlmProvider` / `TtsProvider` **descriptors** —
1175
+ * plain `{ kind, options }` data. At session start the runtime calls the
1176
+ * resolvers here to turn each descriptor into its openable / callable
1177
+ * host-side counterpart, importing the third-party SDK only at that point.
1178
+ *
1179
+ * The guest sandbox never imports these functions, which is how the agent
1180
+ * bundle stays free of `@ai-sdk/anthropic` / `assemblyai` /
1181
+ * `@cartesia/cartesia-js`.
1182
+ */
1183
+ /**
1184
+ * Look up a provider API key: agent env first (set via `aai secret put` or
1185
+ * `.env`), then the host's `process.env` as a fallback for self-hosted mode.
1186
+ * Returns `""` if neither has it — the caller decides whether that's fatal.
1187
+ */
1188
+ function resolveApiKey(envVar, env) {
1189
+ return env[envVar] ?? process.env[envVar] ?? "";
1190
+ }
1191
+ /** Resolve an {@link SttProvider} descriptor into a host-side opener. */
1192
+ function resolveStt(descriptor) {
1193
+ switch (descriptor.kind) {
1194
+ case ASSEMBLYAI_KIND: return openAssemblyAI(descriptor.options);
1195
+ default: throw new Error(`Unknown STT provider kind: "${descriptor.kind}". Supported: ${ASSEMBLYAI_KIND}.`);
1196
+ }
1197
+ }
1198
+ /** Resolve a {@link TtsProvider} descriptor into a host-side opener. */
1199
+ function resolveTts(descriptor) {
1200
+ switch (descriptor.kind) {
1201
+ case CARTESIA_KIND: return openCartesia(descriptor.options);
1202
+ default: throw new Error(`Unknown TTS provider kind: "${descriptor.kind}". Supported: ${CARTESIA_KIND}.`);
1203
+ }
1204
+ }
1205
+ /**
1206
+ * Resolve an {@link LlmProvider} descriptor into a Vercel AI SDK
1207
+ * {@link LanguageModel}.
1208
+ *
1209
+ * The API key is pulled from the agent's env (e.g. `ANTHROPIC_API_KEY`).
1210
+ * Missing keys throw here — the pipeline session would fail on first
1211
+ * `streamText` call otherwise, and the error is clearer at construction.
1212
+ */
1213
+ function resolveLlm(descriptor, env) {
1214
+ switch (descriptor.kind) {
1215
+ case ANTHROPIC_KIND: {
1216
+ const options = descriptor.options;
1217
+ const apiKey = resolveApiKey("ANTHROPIC_API_KEY", env);
1218
+ if (!apiKey) throw new Error("Anthropic LLM: missing API key. Set ANTHROPIC_API_KEY in the agent env.");
1219
+ return createAnthropic({ apiKey })(options.model);
1220
+ }
1221
+ default: throw new Error(`Unknown LLM provider kind: "${descriptor.kind}". Supported: ${ANTHROPIC_KIND}.`);
1222
+ }
1223
+ }
1224
+ //#endregion
919
1225
  //#region host/s2s.ts
920
1226
  const uint8ToBase64 = (bytes) => Buffer.from(bytes).toString("base64");
921
1227
  const base64ToUint8 = (base64) => new Uint8Array(Buffer.from(base64, "base64"));
@@ -1727,28 +2033,27 @@ function wireSessionSocket(ws, opts) {
1727
2033
  //#endregion
1728
2034
  //#region host/runtime.ts
1729
2035
  /**
1730
- * Agent runtime — the execution engine for voice agents.
1731
- *
1732
- * {@link createRuntime} builds the single execution engine used by both
1733
- * self-hosted servers and the platform sandbox. It wires up tool execution,
1734
- * lifecycle hooks, and session management.
2036
+ * Distinguish a descriptor (`{ kind, options }`) from an already-resolved
2037
+ * opener / `LanguageModel`. The production path always passes descriptors;
2038
+ * openers are a test escape hatch (fakes in `_pipeline-test-fakes.ts`).
2039
+ * STT/TTS openers are identified by the `open` method, `LanguageModel` by
2040
+ * its `specificationVersion` field — both absent on descriptors.
1735
2041
  */
2042
+ function resolveSttIfDescriptor(value) {
2043
+ return "open" in value ? value : resolveStt(value);
2044
+ }
2045
+ function resolveTtsIfDescriptor(value) {
2046
+ return "open" in value ? value : resolveTts(value);
2047
+ }
2048
+ function resolveLlmIfDescriptor(value, env) {
2049
+ if (typeof value === "string") return value;
2050
+ return "specificationVersion" in value ? value : resolveLlm(value, env);
2051
+ }
1736
2052
  /** Create an in-memory KV store (default for self-hosted). */
1737
2053
  function createLocalKv() {
1738
2054
  return createUnstorageKv({ storage: createStorage() });
1739
2055
  }
1740
2056
  /**
1741
- * Resolve an API key host-side for pipeline providers.
1742
- *
1743
- * Checks the agent's declared env first, then the host process env as a
1744
- * fallback. Returns `""` when absent — pipeline providers surface a clear
1745
- * `MissingCredentialsError` via their `open()` that the orchestrator
1746
- * converts to a `session.error` wire event.
1747
- */
1748
- function resolveApiKey(envVar, env) {
1749
- return env[envVar] ?? process.env[envVar] ?? "";
1750
- }
1751
- /**
1752
2057
  * Create an agent runtime — the execution engine for a voice agent.
1753
2058
  *
1754
2059
  * Merges built-in and custom tool definitions, builds tool schemas for the
@@ -1762,9 +2067,7 @@ function resolveApiKey(envVar, env) {
1762
2067
  */
1763
2068
  function createRuntime(opts) {
1764
2069
  const { agent, env, kv = createLocalKv(), createWebSocket, logger = consoleLogger, s2sConfig = DEFAULT_S2S_CONFIG, sessionStartTimeoutMs, shutdownTimeoutMs = DEFAULT_SHUTDOWN_TIMEOUT_MS } = opts;
1765
- const providerCount = (opts.stt != null ? 1 : 0) + (opts.llm != null ? 1 : 0) + (opts.tts != null ? 1 : 0);
1766
- if (providerCount !== 0 && providerCount !== 3) throw new Error("stt, llm, and tts must be set together");
1767
- const mode = providerCount === 3 ? "pipeline" : "s2s";
2070
+ const mode = assertProviderTriple(opts.stt, opts.llm, opts.tts);
1768
2071
  const agentConfig = toAgentConfig(agent);
1769
2072
  const sessions = /* @__PURE__ */ new Map();
1770
2073
  const sinkMap = /* @__PURE__ */ new Map();
@@ -1827,28 +2130,28 @@ function createRuntime(opts) {
1827
2130
  });
1828
2131
  };
1829
2132
  }
2133
+ const pipelineProviders = mode === "pipeline" ? {
2134
+ stt: resolveSttIfDescriptor(opts.stt),
2135
+ llm: resolveLlmIfDescriptor(opts.llm, env),
2136
+ tts: resolveTtsIfDescriptor(opts.tts)
2137
+ } : null;
1830
2138
  function createSession(sessionOpts) {
1831
2139
  sinkMap.set(sessionOpts.id, sessionOpts.client);
1832
- if (mode === "pipeline") {
1833
- const stt = opts.stt;
1834
- const llm = opts.llm;
1835
- const tts = opts.tts;
1836
- return createPipelineSession({
1837
- id: sessionOpts.id,
1838
- agent: sessionOpts.agent,
1839
- client: sessionOpts.client,
1840
- agentConfig,
1841
- toolSchemas,
1842
- toolGuidance,
1843
- executeTool,
1844
- stt,
1845
- llm,
1846
- tts,
1847
- sttApiKey: resolveApiKey("ASSEMBLYAI_API_KEY", env),
1848
- ttsApiKey: resolveApiKey("CARTESIA_API_KEY", env),
1849
- logger
1850
- });
1851
- }
2140
+ if (pipelineProviders) return createPipelineSession({
2141
+ id: sessionOpts.id,
2142
+ agent: sessionOpts.agent,
2143
+ client: sessionOpts.client,
2144
+ agentConfig,
2145
+ toolSchemas,
2146
+ toolGuidance,
2147
+ executeTool,
2148
+ stt: pipelineProviders.stt,
2149
+ llm: pipelineProviders.llm,
2150
+ tts: pipelineProviders.tts,
2151
+ sttApiKey: resolveApiKey("ASSEMBLYAI_API_KEY", env),
2152
+ ttsApiKey: resolveApiKey("CARTESIA_API_KEY", env),
2153
+ logger
2154
+ });
1852
2155
  const apiKey = env.ASSEMBLYAI_API_KEY ?? "";
1853
2156
  return createS2sSession({
1854
2157
  id: sessionOpts.id,
@@ -5,11 +5,12 @@
5
5
  * self-hosted servers and the platform sandbox. It wires up tool execution,
6
6
  * lifecycle hooks, and session management.
7
7
  */
8
+ import type { LanguageModel } from "ai";
8
9
  import { type ToolSchema } from "../sdk/_internal-types.ts";
9
10
  import type { Kv } from "../sdk/kv.ts";
10
11
  import type { ClientSink } from "../sdk/protocol.ts";
11
12
  import { type ReadyConfig } from "../sdk/protocol.ts";
12
- import type { LlmProvider, SttProvider, TtsProvider } from "../sdk/providers.ts";
13
+ import { type LlmProvider, type SttOpener, type SttProvider, type TtsOpener, type TtsProvider } from "../sdk/providers.ts";
13
14
  import type { AgentDef } from "../sdk/types.ts";
14
15
  import type { Logger, S2SConfig } from "./runtime-config.ts";
15
16
  import type { CreateS2sWebSocket } from "./s2s.ts";
@@ -91,21 +92,24 @@ export type RuntimeOptions = {
91
92
  */
92
93
  fetch?: typeof globalThis.fetch | undefined;
93
94
  /**
94
- * Pluggable STT provider. Must be set together with `llm` and `tts` to
95
+ * STT provider. Accepts either a descriptor ({@link SttProvider},
96
+ * the normal production path) or a pre-resolved {@link SttOpener}
97
+ * (test escape hatch). Must be set together with `llm` and `tts` to
95
98
  * route sessions through the pipeline path; leave all three unset for
96
99
  * the default AssemblyAI Streaming Speech-to-Speech (S2S) path.
97
100
  */
98
- stt?: SttProvider | undefined;
101
+ stt?: SttProvider | SttOpener | undefined;
99
102
  /**
100
- * Pluggable LLM provider (Vercel AI SDK `LanguageModel`). Must be set
101
- * together with `stt` and `tts` to route sessions through the pipeline path.
103
+ * LLM provider. Accepts either a descriptor ({@link LlmProvider},
104
+ * produced by factories like `anthropic(...)`) or a concrete Vercel AI
105
+ * SDK `LanguageModel` (self-hosted / test escape hatch).
102
106
  */
103
- llm?: LlmProvider | undefined;
107
+ llm?: LlmProvider | LanguageModel | undefined;
104
108
  /**
105
- * Pluggable TTS provider. Must be set together with `stt` and `llm` to
106
- * route sessions through the pipeline path.
109
+ * TTS provider. Accepts either a descriptor ({@link TtsProvider})
110
+ * or a pre-resolved {@link TtsOpener}.
107
111
  */
108
- tts?: TtsProvider | undefined;
112
+ tts?: TtsProvider | TtsOpener | undefined;
109
113
  };
110
114
  /**
111
115
  * The agent runtime returned by {@link createRuntime}.
package/dist/index.js CHANGED
@@ -1,95 +1,6 @@
1
- import { _ as TOOL_EXECUTION_TIMEOUT_MS, a as DEFAULT_SHUTDOWN_TIMEOUT_MS, c as FETCH_TIMEOUT_MS, d as MAX_PAGE_CHARS, f as MAX_TOOL_RESULT_CHARS, g as RUN_CODE_TIMEOUT_MS, h as PIPELINE_FLUSH_TIMEOUT_MS, i as DEFAULT_SESSION_START_TIMEOUT_MS, l as MAX_HTML_BYTES, m as MAX_WS_PAYLOAD_BYTES, n as DEFAULT_IDLE_TIMEOUT_MS, o as DEFAULT_STT_SAMPLE_RATE, p as MAX_VALUE_SIZE, r as DEFAULT_MAX_HISTORY, s as DEFAULT_TTS_SAMPLE_RATE, t as AGENT_CSP, u as MAX_MESSAGE_BUFFER_SIZE, v as WS_OPEN } from "./constants-BL3nvg4I.js";
2
- import { i as ToolChoiceSchema, n as DEFAULT_GREETING, r as DEFAULT_SYSTEM_PROMPT, t as BuiltinToolSchema } from "./types-Cfx_4QDK.js";
1
+ import { a as matchesAllowedHost, i as ToolChoiceSchema, n as DEFAULT_GREETING, o as validateAllowedHostPattern, r as DEFAULT_SYSTEM_PROMPT, t as BuiltinToolSchema } from "./types-KUgezM6u.js";
2
+ import { _ as TOOL_EXECUTION_TIMEOUT_MS, a as DEFAULT_SHUTDOWN_TIMEOUT_MS, c as FETCH_TIMEOUT_MS, d as MAX_PAGE_CHARS, f as MAX_TOOL_RESULT_CHARS, g as RUN_CODE_TIMEOUT_MS, h as PIPELINE_FLUSH_TIMEOUT_MS, i as DEFAULT_SESSION_START_TIMEOUT_MS, l as MAX_HTML_BYTES, m as MAX_WS_PAYLOAD_BYTES, n as DEFAULT_IDLE_TIMEOUT_MS, o as DEFAULT_STT_SAMPLE_RATE, p as MAX_VALUE_SIZE, r as DEFAULT_MAX_HISTORY, s as DEFAULT_TTS_SAMPLE_RATE, t as AGENT_CSP, u as MAX_MESSAGE_BUFFER_SIZE, v as WS_OPEN } from "./constants-C2nirZUI.js";
3
3
  import { i as toolError, n as errorDetail, r as errorMessage, t as parseWsUpgradeParams } from "./ws-upgrade-BeOQ7fXL.js";
4
- //#region sdk/allowed-hosts.ts
5
- /**
6
- * Allowlist matching for outbound host validation.
7
- *
8
- * Used at deploy time (manifest validation) and at runtime (SSRF enforcement)
9
- * to restrict which external hosts an agent is permitted to contact.
10
- *
11
- * Lives in sdk/ because it has zero Node.js dependencies and can run in any
12
- * environment (browser, Deno, Node.js sandboxes).
13
- */
14
- /** Private/special-use TLDs that must never appear in allowedHosts patterns. */
15
- const BLOCKED_TLDS = [
16
- "local",
17
- "internal",
18
- "localhost"
19
- ];
20
- /**
21
- * Regex that matches an IPv4 address (four decimal octets separated by dots).
22
- * Anchored so partial matches like "192.168.1.1.example.com" don't trigger it.
23
- */
24
- const IPV4_RE = /^(\d{1,3}\.){3}\d{1,3}$/;
25
- function fail(reason) {
26
- return {
27
- valid: false,
28
- reason
29
- };
30
- }
31
- function checkStructural(pattern) {
32
- if (pattern === "") return fail("Pattern must not be empty.");
33
- if (pattern.includes("://")) return fail("Pattern must not include a protocol (e.g. remove 'https://').");
34
- if (pattern.includes("/")) return fail("Pattern must not include a path component (remove '/').");
35
- if (pattern.includes("?")) return fail("Pattern must not include a query string (remove '?').");
36
- if (pattern.startsWith("[") || pattern.includes("::")) return fail("IP address literals are not allowed in allowedHosts patterns.");
37
- if (pattern.includes(":")) return fail("Pattern must not include a port number (e.g. remove ':8080').");
38
- return null;
39
- }
40
- function checkWildcard(pattern) {
41
- if (!pattern.includes("*")) return null;
42
- if (pattern === "*" || pattern === "**") return fail("Bare wildcard '*' is not allowed. Use '*.example.com' to allow all subdomains.");
43
- if (pattern.indexOf("*") !== 0 || pattern[1] !== ".") return fail("Wildcard '*' may only appear as the leading segment (e.g. '*.example.com').");
44
- if (pattern.lastIndexOf("*") !== 0) return fail("Only a single leading wildcard segment is supported.");
45
- return null;
46
- }
47
- function checkHostPart(hostPart) {
48
- if (IPV4_RE.test(hostPart)) return fail("IP address literals are not allowed in allowedHosts patterns.");
49
- const tld = hostPart.split(".").at(-1)?.toLowerCase() ?? "";
50
- if (BLOCKED_TLDS.includes(tld)) return fail(`Patterns ending in '.${tld}' are not allowed (private/special-use TLD).`);
51
- return null;
52
- }
53
- /**
54
- * Validate a single `allowedHosts` pattern at deploy time.
55
- *
56
- * Returns `{ valid: true }` for acceptable patterns or
57
- * `{ valid: false; reason: string }` with a human-readable rejection reason.
58
- */
59
- function validateAllowedHostPattern(pattern) {
60
- const structural = checkStructural(pattern);
61
- if (structural !== null) return structural;
62
- const wildcard = checkWildcard(pattern);
63
- if (wildcard !== null) return wildcard;
64
- const hostCheck = checkHostPart(pattern.startsWith("*.") ? pattern.slice(2) : pattern);
65
- if (hostCheck !== null) return hostCheck;
66
- return { valid: true };
67
- }
68
- /**
69
- * Test whether `hostname` matches any pattern in `patterns`.
70
- *
71
- * - Exact match is case-insensitive; trailing dots on the hostname are stripped.
72
- * - Wildcard pattern `*.example.com` matches any hostname ending with
73
- * `.example.com` (one or more labels), but does NOT match `example.com` itself.
74
- * - A port suffix on `hostname` (e.g. `api.example.com:8080`) is stripped before
75
- * matching.
76
- * - Returns `false` when `patterns` is empty.
77
- */
78
- function matchesAllowedHost(hostname, patterns) {
79
- if (patterns.length === 0) return false;
80
- const portIndex = hostname.lastIndexOf(":");
81
- let host = portIndex !== -1 && !hostname.includes("[") ? hostname.slice(0, portIndex) : hostname;
82
- host = host.toLowerCase().replace(/\.$/, "");
83
- for (const pattern of patterns) {
84
- const p = pattern.toLowerCase();
85
- if (p.startsWith("*.")) {
86
- const suffix = p.slice(1);
87
- if (host.endsWith(suffix) && host.length > suffix.length) return true;
88
- } else if (host === p) return true;
89
- }
90
- return false;
91
- }
92
- //#endregion
93
4
  //#region sdk/define.ts
94
5
  /**
95
6
  * Define a tool with typed parameters and execute function.
@@ -5,6 +5,7 @@
5
5
  */
6
6
  import type { JSONSchema7 } from "json-schema";
7
7
  import { z } from "zod";
8
+ import { type LlmProvider, type SttProvider, type TtsProvider } from "./providers.ts";
8
9
  import type { Message } from "./types.ts";
9
10
  import { type ToolDef } from "./types.ts";
10
11
  /**
@@ -51,6 +52,22 @@ export declare const AgentConfigSchema: z.ZodObject<{
51
52
  run_code: "run_code";
52
53
  }>>>>;
53
54
  idleTimeoutMs: z.ZodOptional<z.ZodNumber>;
55
+ stt: z.ZodOptional<z.ZodObject<{
56
+ kind: z.ZodString;
57
+ options: z.ZodRecord<z.ZodString, z.ZodUnknown>;
58
+ }, z.core.$strip>>;
59
+ llm: z.ZodOptional<z.ZodObject<{
60
+ kind: z.ZodString;
61
+ options: z.ZodRecord<z.ZodString, z.ZodUnknown>;
62
+ }, z.core.$strip>>;
63
+ tts: z.ZodOptional<z.ZodObject<{
64
+ kind: z.ZodString;
65
+ options: z.ZodRecord<z.ZodString, z.ZodUnknown>;
66
+ }, z.core.$strip>>;
67
+ mode: z.ZodOptional<z.ZodEnum<{
68
+ s2s: "s2s";
69
+ pipeline: "pipeline";
70
+ }>>;
54
71
  }, z.core.$strip>;
55
72
  /** Serializable agent configuration — derived from {@link AgentConfigSchema}. */
56
73
  export type AgentConfig = z.infer<typeof AgentConfigSchema>;
@@ -68,8 +85,17 @@ export interface AgentConfigSource {
68
85
  toolChoice?: AgentConfig["toolChoice"] | undefined;
69
86
  builtinTools?: Readonly<AgentConfig["builtinTools"]> | undefined;
70
87
  idleTimeoutMs?: number | undefined;
88
+ stt?: SttProvider | undefined;
89
+ llm?: LlmProvider | undefined;
90
+ tts?: TtsProvider | undefined;
71
91
  }
72
- /** Extract the serializable {@link AgentConfig} subset from a source object. */
92
+ /**
93
+ * Extract the serializable {@link AgentConfig} subset from a source object.
94
+ *
95
+ * The `stt`, `llm`, and `tts` descriptors must be supplied all together or
96
+ * not at all — enforced here so the server can trust the config.
97
+ * `mode` is derived from their presence.
98
+ */
73
99
  export declare function toAgentConfig(src: AgentConfigSource): AgentConfig;
74
100
  /**
75
101
  * Zod schema for serialized tool definitions sent over the wire.
@@ -4,3 +4,5 @@
4
4
  * Used by aai-cli (bundler) and aai-server (rpc-schemas).
5
5
  */
6
6
  export { type AgentConfig, AgentConfigSchema, type AgentConfigSource, agentToolsToSchemas, EMPTY_PARAMS, type ExecuteTool, type ToolSchema, ToolSchemaSchema, toAgentConfig, } from "./_internal-types.ts";
7
+ export { ProviderDescriptorSchema } from "./manifest.ts";
8
+ export { assertProviderTriple, type SessionMode } from "./providers.ts";
@@ -1,2 +1,2 @@
1
- import { a as toAgentConfig, i as agentToolsToSchemas, n as EMPTY_PARAMS, r as ToolSchemaSchema, t as AgentConfigSchema } from "../_internal-types-CoDTiBd1.js";
2
- export { AgentConfigSchema, EMPTY_PARAMS, ToolSchemaSchema, agentToolsToSchemas, toAgentConfig };
1
+ import { a as toAgentConfig, i as agentToolsToSchemas, n as EMPTY_PARAMS, o as ProviderDescriptorSchema, r as ToolSchemaSchema, s as assertProviderTriple, t as AgentConfigSchema } from "../_internal-types-3p3OJZPb.js";
2
+ export { AgentConfigSchema, EMPTY_PARAMS, ProviderDescriptorSchema, ToolSchemaSchema, agentToolsToSchemas, assertProviderTriple, toAgentConfig };
@@ -4,7 +4,8 @@
4
4
  * Flows from build → host → sdk. Validated via Zod at the boundary,
5
5
  * then used as a plain typed object throughout the runtime.
6
6
  */
7
- import type { LlmProvider, SttProvider, TtsProvider } from "./providers.ts";
7
+ import { z } from "zod";
8
+ import { type LlmProvider, type SessionMode, type SttProvider, type TtsProvider } from "./providers.ts";
8
9
  /**
9
10
  * Tool definition as it appears in the serialized manifest JSON.
10
11
  *
@@ -60,8 +61,18 @@ export type Manifest = {
60
61
  * - `"s2s"` (default): AssemblyAI Streaming Speech-to-Speech path (no stt/llm/tts set).
61
62
  * - `"pipeline"`: pluggable STT → LLM → TTS path (stt + llm + tts all set).
62
63
  */
63
- mode: "s2s" | "pipeline";
64
+ mode: SessionMode;
64
65
  };
66
+ /**
67
+ * Provider descriptor — a `{ kind, options }` pair produced by factories
68
+ * like `assemblyAI(...)` / `anthropic(...)` / `cartesia(...)`. Kept
69
+ * deliberately generic at the schema layer: kind-specific validation lives
70
+ * in the host-side resolver, which knows what each adapter expects.
71
+ */
72
+ export declare const ProviderDescriptorSchema: z.ZodObject<{
73
+ kind: z.ZodString;
74
+ options: z.ZodRecord<z.ZodString, z.ZodUnknown>;
75
+ }, z.core.$strip>;
65
76
  /**
66
77
  * Parse and normalize a raw agent manifest, applying defaults for all
67
78
  * optional fields. Input is typically the JSON from a bundled agent.ts.