@alexkroman1/aai 1.2.3 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/.turbo/turbo-build.log +14 -12
  2. package/CHANGELOG.md +14 -0
  3. package/dist/host/_pipeline-test-fakes.d.ts +107 -0
  4. package/dist/host/pipeline-session-ctx.d.ts +24 -0
  5. package/dist/host/pipeline-session.d.ts +48 -0
  6. package/dist/host/providers/llm.d.ts +2 -0
  7. package/dist/host/providers/stt/assemblyai.d.ts +31 -0
  8. package/dist/host/providers/stt-barrel.d.ts +8 -0
  9. package/dist/host/providers/stt-barrel.js +92 -0
  10. package/dist/host/providers/stt.d.ts +2 -0
  11. package/dist/host/providers/tts/cartesia.d.ts +39 -0
  12. package/dist/host/providers/tts-barrel.d.ts +8 -0
  13. package/dist/host/providers/tts-barrel.js +182 -0
  14. package/dist/host/providers/tts.d.ts +2 -0
  15. package/dist/host/runtime-barrel.js +498 -80
  16. package/dist/host/runtime.d.ts +17 -0
  17. package/dist/host/s2s.d.ts +5 -0
  18. package/dist/host/session-ctx.d.ts +22 -4
  19. package/dist/host/to-vercel-tools.d.ts +44 -0
  20. package/dist/index.js +5 -0
  21. package/dist/sdk/_internal-types.d.ts +15 -1
  22. package/dist/sdk/define.d.ts +21 -0
  23. package/dist/sdk/manifest.d.ts +22 -0
  24. package/dist/sdk/protocol.d.ts +3 -3
  25. package/dist/sdk/providers.d.ts +70 -0
  26. package/dist/sdk/types.d.ts +16 -0
  27. package/exports-no-dev-deps.test.ts +39 -14
  28. package/host/_pipeline-test-fakes.ts +323 -0
  29. package/host/_test-utils.ts +1 -0
  30. package/host/integration/fixtures/README.md +49 -0
  31. package/host/integration/pipeline-reference.integration.test.ts +124 -0
  32. package/host/pipeline-session-ctx.test.ts +31 -0
  33. package/host/pipeline-session-ctx.ts +36 -0
  34. package/host/pipeline-session.test.ts +337 -0
  35. package/host/pipeline-session.ts +405 -0
  36. package/host/providers/llm.ts +3 -0
  37. package/host/providers/providers.test-d.ts +31 -0
  38. package/host/providers/stt/assemblyai.test.ts +100 -0
  39. package/host/providers/stt/assemblyai.ts +154 -0
  40. package/host/providers/stt/fixtures/assemblyai/basic-turn.json +30 -0
  41. package/host/providers/stt-barrel.ts +13 -0
  42. package/host/providers/stt.ts +3 -0
  43. package/host/providers/tts/cartesia.test.ts +210 -0
  44. package/host/providers/tts/cartesia.ts +251 -0
  45. package/host/providers/tts-barrel.ts +13 -0
  46. package/host/providers/tts.ts +3 -0
  47. package/host/runtime.test.ts +81 -1
  48. package/host/runtime.ts +61 -0
  49. package/host/s2s.test.ts +19 -0
  50. package/host/s2s.ts +10 -0
  51. package/host/session-ctx.ts +35 -8
  52. package/host/to-vercel-tools.test.ts +153 -0
  53. package/host/to-vercel-tools.ts +70 -0
  54. package/package.json +15 -1
  55. package/sdk/__snapshots__/exports.test.ts.snap +1 -0
  56. package/sdk/_internal-types.ts +16 -0
  57. package/sdk/define.test-d.ts +21 -0
  58. package/sdk/define.test.ts +33 -0
  59. package/sdk/define.ts +21 -0
  60. package/sdk/manifest.test-d.ts +14 -0
  61. package/sdk/manifest.test.ts +51 -0
  62. package/sdk/manifest.ts +39 -0
  63. package/sdk/providers.ts +90 -0
  64. package/sdk/types.ts +16 -0
  65. package/vitest.config.ts +1 -0
@@ -0,0 +1,405 @@
1
+ // Copyright 2025 the AAI authors. MIT license.
2
+ /**
3
+ * Pipeline session — pluggable STT → LLM → TTS orchestrator.
4
+ *
5
+ * Alternative to the S2S session (see `session.ts`) that drives three
6
+ * independent providers. A new partial STT event while the agent is replying
7
+ * triggers barge-in (aborts the LLM stream and cancels TTS).
8
+ */
9
+
10
+ import type { LanguageModel, ModelMessage } from "ai";
11
+ import { stepCountIs, streamText } from "ai";
12
+ import type { AgentConfig, ExecuteTool, ToolSchema } from "../sdk/_internal-types.ts";
13
+ import { DEFAULT_STT_SAMPLE_RATE } from "../sdk/constants.ts";
14
+ import type { ClientSink, SessionErrorCode } from "../sdk/protocol.ts";
15
+ import type {
16
+ SttError,
17
+ SttProvider,
18
+ TtsError,
19
+ TtsProvider,
20
+ TtsSession,
21
+ Unsubscribe,
22
+ } from "../sdk/providers.ts";
23
+ import { buildSystemPrompt } from "../sdk/system-prompt.ts";
24
+ import type { Message } from "../sdk/types.ts";
25
+ import { errorMessage } from "../sdk/utils.ts";
26
+ import { buildPipelineCtx, type PipelineSessionCtx } from "./pipeline-session-ctx.ts";
27
+ import { consoleLogger, type Logger } from "./runtime-config.ts";
28
+ import type { Session } from "./session.ts";
29
+ import { toVercelTools } from "./to-vercel-tools.ts";
30
+
31
/**
 * Options accepted by {@link createPipelineSession}.
 *
 * Bundles the session identity, the client sink, the agent definition and the
 * three pluggable providers (STT, LLM, TTS) with their credentials.
 */
export interface PipelineSessionOptions {
  /** Unique identifier of this session; attached to every log entry. */
  id: string;
  /** Slug of the agent this session runs. */
  agent: string;
  /** Destination for wire events and synthesized audio sent back to the browser client. */
  client: ClientSink;
  /** Serializable agent configuration (name, system prompt, `maxSteps`, ...). */
  agentConfig: AgentConfig;
  /** JSON Schema definitions describing the agent's tools. */
  toolSchemas: readonly ToolSchema[];
  /** Optional natural-language tool guidance, forwarded to the system-prompt builder. */
  toolGuidance?: readonly string[] | undefined;
  /** Callback that executes a tool by name. */
  executeTool: ExecuteTool;
  /** Speech-to-text provider (injected via the manifest in pipeline mode). */
  stt: SttProvider;
  /** Language model, expressed as a Vercel AI SDK `LanguageModel`. */
  llm: LanguageModel;
  /** Text-to-speech provider (injected via the manifest in pipeline mode). */
  tts: TtsProvider;
  /** API key handed to the STT provider when it is opened. */
  sttApiKey: string;
  /** API key handed to the TTS provider when it is opened. */
  ttsApiKey: string;
  /**
   * PCM16 sample rate in Hz, passed to both the STT and the TTS provider.
   * Defaults to {@link DEFAULT_STT_SAMPLE_RATE}.
   */
  sampleRate?: number | undefined;
  /** Logger to use; the console logger is used when omitted. */
  logger?: Logger | undefined;
  /** Sliding-window size for the conversation history. */
  maxHistory?: number | undefined;
}
64
+
65
/**
 * Convert a stored conversation {@link Message} into a Vercel AI SDK
 * {@link ModelMessage}.
 *
 * `"user"` messages keep their role; every other role is forwarded as an
 * `"assistant"` message carrying the same content.
 */
function toModelMessage(m: Message): ModelMessage {
  const { content } = m;
  return m.role === "user" ? { role: "user", content } : { role: "assistant", content };
}
70
+
71
/**
 * Send a wire-level `error` event to the client.
 *
 * @param client - Sink that receives the event.
 * @param code - Error category reported to the client.
 * @param message - Human-readable description of the failure.
 */
function emitError(client: ClientSink, code: SessionErrorCode, message: string): void {
  const errorEvent = { type: "error" as const, code, message };
  client.event(errorEvent);
}
74
+
75
/** Collaborators that {@link handleStreamPart} needs to process one LLM stream part. */
type StreamPartHandlerDeps = {
  /** Sink that receives transcript, tool and error events. */
  client: ClientSink;
  /** Active TTS session that text deltas are spoken through, or `null` when none is open. */
  tts: TtsSession | null;
  /** Logger used to report stream errors. */
  log: Logger;
  /** Session identifier attached to log entries. */
  sessionId: string;
  /** Invoked with every non-empty text delta before it is spoken or forwarded. */
  onTextDelta: (delta: string) => void;
};
82
+
83
/**
 * Serialize a tool's output for the `tool_call_done` wire event.
 *
 * Strings are passed through untouched. Everything else is JSON-encoded.
 * `JSON.stringify` yields `undefined` (not a string) for `undefined`,
 * functions and symbols, so those are reported as `"null"`; values it cannot
 * encode at all (circular structures, BigInt) fall back to `String(output)`.
 */
function stringifyToolOutput(output: unknown): string {
  if (typeof output === "string") return output;
  try {
    const encoded: string | undefined = JSON.stringify(output);
    return encoded ?? "null";
  } catch {
    // Never let an unserializable tool result abort the LLM stream.
    return String(output);
  }
}

/**
 * Translate a single part of the LLM's full stream into client events.
 *
 * - `text-delta`: reports the delta via `deps.onTextDelta`, sends it to TTS
 *   (when a session is present) and emits an `agent_transcript` event.
 *   Empty deltas are ignored.
 * - `tool-call`: emits a `tool_call` event (missing ids/names become `""`,
 *   a missing input becomes `{}`).
 * - `tool-result`: emits a `tool_call_done` event whose `result` is always a
 *   string.
 * - `error`: logs the error and emits an `llm` error event.
 * - any other part type is ignored.
 *
 * @param part - Stream part; only the fields listed in the parameter type are read.
 * @param deps - Collaborators used to emit events, speak text and log.
 */
function handleStreamPart(
  part: {
    readonly type: string;
    readonly text?: string;
    readonly input?: unknown;
    readonly output?: unknown;
    readonly toolCallId?: string;
    readonly toolName?: string;
    readonly error?: unknown;
  },
  deps: StreamPartHandlerDeps,
): void {
  switch (part.type) {
    case "text-delta": {
      const delta = part.text ?? "";
      if (delta.length === 0) return;
      deps.onTextDelta(delta);
      deps.tts?.sendText(delta);
      deps.client.event({ type: "agent_transcript", text: delta });
      return;
    }
    case "tool-call": {
      const input = (part.input ?? {}) as Readonly<Record<string, unknown>>;
      deps.client.event({
        type: "tool_call",
        toolCallId: part.toolCallId ?? "",
        toolName: part.toolName ?? "",
        args: input,
      });
      return;
    }
    case "tool-result": {
      deps.client.event({
        type: "tool_call_done",
        toolCallId: part.toolCallId ?? "",
        result: stringifyToolOutput(part.output),
      });
      return;
    }
    case "error": {
      const msg = errorMessage(part.error);
      deps.log.error("LLM stream error", { message: msg, sessionId: deps.sessionId });
      emitError(deps.client, "llm", msg);
      return;
    }
    default:
      return;
  }
}
134
+
135
/**
 * Create a pluggable-provider voice session (STT → LLM → TTS).
 *
 * The returned {@link Session} opens the STT and TTS providers on `start()`,
 * forwards client audio to STT, runs one LLM turn per final transcript and
 * streams the reply through TTS back to the client. A partial transcript that
 * arrives while a turn is in flight triggers barge-in: the LLM stream is
 * aborted, TTS is cancelled and a `cancelled` event is emitted.
 *
 * Provider failures during `start()` are reported to the client as `stt` /
 * `tts` error events; `start()` itself does not reject for them.
 *
 * NOTE(review): `onSttFinal` starts `runTurn` immediately rather than after the
 * previous turn has settled, so two finals without an intervening partial could
 * overlap — confirm whether `ctx.chainTurn` is expected to serialize them.
 *
 * @param opts - Session identity, providers, credentials and agent definition.
 * @returns The session handle driven by the host runtime.
 */
export function createPipelineSession(opts: PipelineSessionOptions): Session {
  const log = opts.logger ?? consoleLogger;
  const sampleRate = opts.sampleRate ?? DEFAULT_STT_SAMPLE_RATE;
  const { client, agentConfig, toolSchemas, executeTool } = opts;

  const hasTools = toolSchemas.length > 0 || (agentConfig.builtinTools?.length ?? 0) > 0;
  const systemPrompt = buildSystemPrompt(agentConfig, {
    hasTools,
    voice: true,
    toolGuidance: opts.toolGuidance,
  });

  const ctx: PipelineSessionCtx = buildPipelineCtx({
    id: opts.id,
    agent: opts.agent,
    client,
    agentConfig,
    executeTool,
    log,
    maxHistory: opts.maxHistory,
  });

  // Aborted exactly once, by stop(); also handed to both providers' open().
  const sessionAbort = new AbortController();
  let audioReady = false;
  // Controller of the turn currently in flight, or null when idle.
  let turnController: AbortController | null = null;
  let nextReplyId = 0;
  const sttSubs: Unsubscribe[] = [];
  const ttsSubs: Unsubscribe[] = [];

  /** Abort the in-flight turn (if any), cancel TTS and drop the pending reply. */
  function abortActiveTurn(): void {
    turnController?.abort();
    turnController = null;
    ctx.tts?.cancel();
    ctx.cancelReply();
  }

  /** Barge-in: a partial transcript while the agent is replying cancels the reply. */
  function onSttPartial(_text: string): void {
    if (turnController === null) return;
    log.info("Pipeline barge-in", { sessionId: opts.id });
    abortActiveTurn();
    client.event({ type: "cancelled" });
  }

  /** A final transcript starts a new turn; whitespace-only transcripts are ignored. */
  function onSttFinal(text: string): void {
    const trimmed = text.trim();
    if (trimmed.length === 0) return;
    client.event({ type: "user_transcript", text });
    const turn = runTurn(trimmed).catch((err: unknown) => {
      log.error("Pipeline turn crashed", { error: errorMessage(err), sessionId: opts.id });
    });
    ctx.chainTurn(turn);
  }

  function onSttError(err: SttError): void {
    log.error("STT error", { code: err.code, message: err.message, sessionId: opts.id });
    emitError(client, "stt", err.message);
  }

  function onTtsError(err: TtsError): void {
    log.error("TTS error", { code: err.code, message: err.message, sessionId: opts.id });
    emitError(client, "tts", err.message);
  }

  /**
   * Run the LLM for one turn and dispatch every stream part.
   * Never rejects: failures are logged and reported as `llm` errors unless the
   * turn was aborted, in which case they are dropped silently.
   */
  async function consumeLlmStream(
    ctl: AbortController,
    messages: ModelMessage[],
    tools: ReturnType<typeof toVercelTools>,
    onDelta: (delta: string) => void,
  ): Promise<void> {
    const deps: StreamPartHandlerDeps = {
      client,
      tts: ctx.tts,
      log,
      sessionId: opts.id,
      onTextDelta: onDelta,
    };
    try {
      // Vercel AI SDK v6 defaults to a single step — without `stopWhen`, the
      // stream terminates after the first tool result and the agent can't
      // follow up on its own tool calls.
      const maxSteps = agentConfig.maxSteps ?? 5;
      const result = streamText({
        model: opts.llm,
        system: systemPrompt,
        messages,
        tools,
        stopWhen: stepCountIs(maxSteps),
        abortSignal: ctl.signal,
      });
      for await (const part of result.fullStream) {
        if (ctl.signal.aborted) break;
        handleStreamPart(part, deps);
      }
    } catch (err: unknown) {
      if (!ctl.signal.aborted) {
        const msg = errorMessage(err);
        log.error("LLM streamText failed", { error: msg, sessionId: opts.id });
        emitError(client, "llm", msg);
      }
    }
  }

  /**
   * Flush TTS and wait until it reports `done` — or until the turn is aborted,
   * whichever comes first. Observing the abort signal keeps a cancelled or
   * stopped turn from waiting on a `done` event that may never arrive.
   */
  function flushTtsAndWait(signal: AbortSignal): Promise<void> {
    const tts = ctx.tts;
    if (!tts || signal.aborted) return Promise.resolve();
    return new Promise<void>((resolve, reject) => {
      let off: Unsubscribe = () => undefined;
      const cleanup = (): void => {
        off();
        signal.removeEventListener("abort", settle);
      };
      const settle = (): void => {
        cleanup();
        resolve();
      };
      off = tts.on("done", settle);
      signal.addEventListener("abort", settle, { once: true });
      try {
        tts.flush();
      } catch (err: unknown) {
        // Don't leave listeners behind when flush itself fails.
        cleanup();
        reject(err);
      }
    });
  }

  /** Execute one user turn: LLM stream, TTS flush, history update, `reply_done`. */
  async function runTurn(userText: string): Promise<void> {
    const replyId = `pipeline-${++nextReplyId}`;
    ctx.beginReply(replyId);
    ctx.pushMessages({ role: "user", content: userText });

    const ctl = new AbortController();
    turnController = ctl;

    try {
      const tools = toVercelTools(toolSchemas, {
        executeTool,
        sessionId: opts.id,
        messages: () => ctx.conversationMessages,
        signal: ctl.signal,
      });

      const messages: ModelMessage[] = ctx.conversationMessages.map(toModelMessage);
      let accumulated = "";
      await consumeLlmStream(ctl, messages, tools, (delta) => {
        accumulated += delta;
      });
      if (ctl.signal.aborted) return;

      await flushTtsAndWait(ctl.signal);
      if (ctl.signal.aborted) return;

      if (accumulated.length > 0) {
        ctx.pushMessages({ role: "assistant", content: accumulated });
      }
      client.playAudioDone();
      client.event({ type: "reply_done" });
    } finally {
      // Always release the slot (even on a crash) so a later partial transcript
      // is not mistaken for barge-in on a turn that no longer exists. A newer
      // turn's controller is left untouched.
      if (turnController === ctl) turnController = null;
    }
  }

  /**
   * Open STT and TTS in parallel. If either fails (or the session was stopped
   * meanwhile) whatever did open is closed again and nothing is wired up.
   */
  async function openProviders(): Promise<void> {
    const [sttResult, ttsResult] = await Promise.allSettled([
      opts.stt.open({
        sampleRate,
        apiKey: opts.sttApiKey,
        sttPrompt: agentConfig.sttPrompt,
        signal: sessionAbort.signal,
      }),
      opts.tts.open({
        sampleRate,
        apiKey: opts.ttsApiKey,
        signal: sessionAbort.signal,
      }),
    ]);

    if (sttResult.status === "rejected") {
      const msg = errorMessage(sttResult.reason);
      log.error("STT open failed", { error: msg, sessionId: opts.id });
      emitError(client, "stt", msg);
    }
    if (ttsResult.status === "rejected") {
      const msg = errorMessage(ttsResult.reason);
      log.error("TTS open failed", { error: msg, sessionId: opts.id });
      emitError(client, "tts", msg);
    }

    const aborted = sessionAbort.signal.aborted;
    const sttFailed = sttResult.status === "rejected";
    const ttsFailed = ttsResult.status === "rejected";
    const teardown = aborted || sttFailed || ttsFailed;

    if (sttResult.status === "fulfilled") {
      const sttSession = sttResult.value;
      if (teardown) {
        await sttSession.close().catch(() => undefined);
      } else {
        ctx.stt = sttSession;
        sttSubs.push(sttSession.on("partial", onSttPartial));
        sttSubs.push(sttSession.on("final", onSttFinal));
        sttSubs.push(sttSession.on("error", onSttError));
      }
    }
    if (ttsResult.status === "fulfilled") {
      const ttsSession = ttsResult.value;
      if (teardown) {
        await ttsSession.close().catch(() => undefined);
      } else {
        ctx.tts = ttsSession;
        ttsSubs.push(
          ttsSession.on("audio", (pcm) => {
            client.playAudioChunk(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
          }),
        );
        ttsSubs.push(ttsSession.on("error", onTtsError));
      }
    }
  }

  return {
    async start(): Promise<void> {
      await openProviders();
    },
    /** Idempotent shutdown: abort everything, wait for the turn, close providers. */
    async stop(): Promise<void> {
      if (sessionAbort.signal.aborted) return;
      sessionAbort.abort();
      turnController?.abort();
      for (const off of sttSubs) off();
      for (const off of ttsSubs) off();
      sttSubs.length = 0;
      ttsSubs.length = 0;
      if (ctx.turnPromise !== null) await ctx.turnPromise;
      await ctx.stt?.close().catch(() => {
        /* already closed */
      });
      await ctx.tts?.close().catch(() => {
        /* already closed */
      });
    },
    /** Forward a chunk of client PCM16 audio to STT. */
    onAudio(data: Uint8Array): void {
      // Drop audio until the client signalled readiness, and after stop().
      if (!audioReady || sessionAbort.signal.aborted) return;
      const offset = data.byteOffset;
      const length = data.byteLength;
      let pcm: Int16Array;
      if (offset % 2 === 0 && length % 2 === 0) {
        // Aligned: view the bytes in place.
        pcm = new Int16Array(data.buffer, offset, length / 2);
      } else {
        // Misaligned or odd-sized: copy into a fresh buffer, dropping a trailing odd byte.
        const copy = new Uint8Array(length - (length % 2));
        copy.set(data.subarray(0, copy.byteLength));
        pcm = new Int16Array(copy.buffer);
      }
      ctx.stt?.sendAudio(pcm);
    },
    onAudioReady(): void {
      audioReady = true;
    },
    onCancel(): void {
      abortActiveTurn();
      client.event({ type: "cancelled" });
    },
    onReset(): void {
      abortActiveTurn();
      ctx.conversationMessages = [];
      ctx.turnPromise = null;
      client.event({ type: "reset" });
    },
    onHistory(incoming: readonly { role: "user" | "assistant"; content: string }[]): void {
      ctx.pushMessages(...incoming.map((m) => ({ role: m.role, content: m.content })));
    },
    waitForTurn(): Promise<void> {
      return ctx.turnPromise ?? Promise.resolve();
    },
  };
}
@@ -0,0 +1,3 @@
1
+ // Copyright 2025 the AAI authors. MIT license.
2
+ /** LLM provider type — re-exported from sdk/ for host-side consumption. */
3
+ export type * from "../../sdk/providers.ts";
@@ -0,0 +1,31 @@
1
+ // Copyright 2025 the AAI authors. MIT license.
2
+ import type { LanguageModel } from "ai";
3
+ import { expectTypeOf, test } from "vitest";
4
+ import type { LlmProvider } from "./llm.ts";
5
+ import type { SttEvents, SttProvider, SttSession, Unsubscribe } from "./stt.ts";
6
+ import type { TtsEvents, TtsSession } from "./tts.ts";
7
+
8
// Compile-time contract checks for the provider types re-exported by the host.

test("SttProvider.open returns Promise<SttSession>", () => {
  type Open = SttProvider["open"];
  expectTypeOf<Open>().returns.toEqualTypeOf<Promise<SttSession>>();
});

test("SttEvents.partial takes a string", () => {
  type PartialHandler = SttEvents["partial"];
  expectTypeOf<PartialHandler>().parameters.toEqualTypeOf<[string]>();
});

test("TtsSession.cancel is synchronous", () => {
  type Cancel = TtsSession["cancel"];
  expectTypeOf<Cancel>().returns.toEqualTypeOf<void>();
});

test("TtsEvents.audio takes Int16Array", () => {
  type AudioHandler = TtsEvents["audio"];
  expectTypeOf<AudioHandler>().parameters.toEqualTypeOf<[Int16Array]>();
});

test("LlmProvider is Vercel AI SDK's LanguageModel", () => {
  expectTypeOf<LlmProvider>().toEqualTypeOf<LanguageModel>();
});

test("Stt/Tts on() returns Unsubscribe", () => {
  type SttOn = SttSession["on"];
  type TtsOn = TtsSession["on"];
  expectTypeOf<SttOn>().returns.toEqualTypeOf<Unsubscribe>();
  expectTypeOf<TtsOn>().returns.toEqualTypeOf<Unsubscribe>();
});
@@ -0,0 +1,100 @@
1
+ // Copyright 2025 the AAI authors. MIT license.
2
+ /** Fixture-replay unit test for the AssemblyAI STT adapter. */
3
+
4
+ import { readFile } from "node:fs/promises";
5
+ import { dirname, join } from "node:path";
6
+ import { fileURLToPath } from "node:url";
7
+ import type { TurnEvent } from "assemblyai";
8
+ import { describe, expect, test, vi } from "vitest";
9
+ import { flush } from "../../_test-utils.ts";
10
+ import { type AssemblyAISession, assemblyAI } from "./assemblyai.ts";
11
+
12
+ const here = dirname(fileURLToPath(import.meta.url));
13
+
14
+ // ---------------------------------------------------------------------------
15
+ // Mock the `assemblyai` SDK so no real sockets are opened.
16
+ //
17
+ // Each fake `StreamingTranscriber` keeps its own listener map and exposes
18
+ // `_fire(event, payload)` for tests to inject events. The adapter's
19
+ // `open()` returns an `AssemblyAISession` with a `_transcriber` pointer,
20
+ // which in the test is the fake — giving the test a handle to `_fire`.
21
+ // ---------------------------------------------------------------------------
22
+
23
/**
 * Shape of the stand-in transcriber produced by the `assemblyai` mock: the
 * subset of SDK methods the adapter calls, plus `_fire` for injecting events.
 */
interface FakeTranscriber {
  /** Register a listener for the named event. */
  on(ev: string, fn: (...args: unknown[]) => void): void;
  /** Pretend to connect. */
  connect(): Promise<void>;
  /** Pretend to close. */
  close(): Promise<void>;
  /** Accept (and discard) an audio chunk. */
  sendAudio(_data: ArrayBufferLike): void;
  /** Test hook: invoke every listener registered for `ev` with `args`. */
  _fire(ev: string, ...args: unknown[]): void;
}
30
+
31
vi.mock("assemblyai", () => {
  type Listener = (...args: unknown[]) => void;

  /** Build a transcriber stand-in with its own private listener registry. */
  function makeFakeTranscriber(): FakeTranscriber {
    const registry = new Map<string, Listener[]>();

    const fire = (ev: string, ...args: unknown[]): void => {
      for (const listener of registry.get(ev) ?? []) listener(...args);
    };

    return {
      on(ev, fn) {
        const bucket = registry.get(ev);
        if (bucket === undefined) {
          registry.set(ev, [fn]);
        } else {
          bucket.push(fn);
        }
      },
      async connect() {
        // Connecting announces the session to any registered "open" listeners.
        fire("open", { type: "Begin", id: "mock-sess", expires_at: 0 });
      },
      async close() {
        /* no-op */
      },
      sendAudio(_data: ArrayBufferLike) {
        /* no-op */
      },
      _fire: fire,
    };
  }

  // Minimal replacement for the SDK client: only `streaming.transcriber` exists.
  class AssemblyAI {
    streaming = {
      transcriber: (_params: unknown): FakeTranscriber => makeFakeTranscriber(),
    };
  }

  return { AssemblyAI };
});
62
+
63
describe("assemblyAI STT adapter — fixture replay", () => {
  test("maps turn events onto partial/final SttEvents", async () => {
    // Load the recorded message sequence.
    const fixturePath = join(here, "fixtures/assemblyai/basic-turn.json");
    const fixture = JSON.parse(await readFile(fixturePath, "utf8")) as Record<string, unknown>[];

    // Open a session against the mocked SDK.
    const { signal } = new AbortController();
    const opened = await assemblyAI({ model: "u3pro-rt", apiKey: "k" }).open({
      sampleRate: 16_000,
      apiKey: "k",
      signal,
    });
    const session = opened as AssemblyAISession;

    // Record everything the adapter emits.
    const partials: string[] = [];
    const finals: string[] = [];
    const errors: string[] = [];
    session.on("partial", (text) => {
      partials.push(text);
    });
    session.on("final", (text) => {
      finals.push(text);
    });
    session.on("error", (err) => {
      errors.push(err.message);
    });

    // Only "Turn" messages are replayed: the fixture's "Begin" message is
    // consumed inside `connect()` by the real SDK, so it has no listener here.
    const fake = session._transcriber as unknown as FakeTranscriber;
    const turnMessages = fixture.filter((msg) => msg.type === "Turn");
    for (const msg of turnMessages) {
      fake._fire("turn", msg as TurnEvent);
    }

    await flush();

    expect(partials).toEqual(["what", "what's the"]);
    expect(finals).toEqual(["what's the weather?"]);
    expect(errors).toEqual([]);

    await session.close();
  });
});
@@ -0,0 +1,154 @@
1
+ // Copyright 2025 the AAI authors. MIT license.
2
+ /**
3
+ * AssemblyAI Universal-Streaming STT adapter.
4
+ *
5
+ * Wraps the `assemblyai` Node SDK's {@link StreamingTranscriber} and
6
+ * normalizes its event surface onto the {@link SttProvider} /
7
+ * {@link SttEvents} contract consumed by the pipeline orchestrator.
8
+ *
9
+ * Default model: `"u3pro-rt"` (Universal-3 Pro Real-Time). The adapter
10
+ * maps that to the SDK's `"u3-rt-pro"` `speechModel` value; any other
11
+ * string is forwarded verbatim.
12
+ */
13
+
14
+ import { AssemblyAI, type StreamingTranscriber } from "assemblyai";
15
+ import { createNanoEvents, type Emitter } from "nanoevents";
16
+ import type {
17
+ SttError,
18
+ SttEvents,
19
+ SttOpenOptions,
20
+ SttProvider,
21
+ SttSession,
22
+ } from "../../../sdk/providers.ts";
23
+
24
/** Factory options for the {@link assemblyAI} STT adapter. */
export interface AssemblyAIOptions {
  /**
   * Streaming speech model. When omitted, `"u3pro-rt"` (Universal-3 Pro
   * Real-Time) is used. Any other string is handed to the SDK as-is.
   */
  model?: "u3pro-rt" | string;
  /**
   * AssemblyAI API key. When absent, `SttOpenOptions.apiKey` is used, and
   * after that `process.env.ASSEMBLYAI_API_KEY`.
   */
  apiKey?: string;
}
36
+
37
/**
 * Internal: the {@link SttSession} returned by the adapter, extended with a
 * handle to the raw SDK transcriber for tests.
 */
export interface AssemblyAISession extends SttSession {
  /** @internal Test-only: the underlying SDK transcriber, used for fixture replay. */
  readonly _transcriber: StreamingTranscriber;
}
42
+
43
/**
 * Map the adapter's model alias onto the SDK's `speechModel` value.
 *
 * The adapter's public alias is `"u3pro-rt"`, while the SDK spells the same
 * model `"u3-rt-pro"`. Every other string is returned unchanged.
 */
function resolveSpeechModel(model: string): string {
  return model === "u3pro-rt" ? "u3-rt-pro" : model;
}
49
+
50
/** Build an {@link SttError} with code `"stt_stream_error"` and the given message. */
function makeError(message: string): SttError {
  const code: SttError["code"] = "stt_stream_error";
  return Object.assign(new Error(message), { code }) as SttError;
}
55
+
56
/**
 * Create an AssemblyAI streaming {@link SttProvider}.
 *
 * Each `open()` call creates one SDK streaming transcriber, connects it and
 * returns a session that:
 * - emits `partial` for non-empty turn transcripts that are not end-of-turn,
 * - emits `final` for non-empty end-of-turn transcripts,
 * - emits `error` for SDK errors, abnormal socket closes (code other than
 *   1000) and failed audio sends,
 * - closes itself when `openOpts.signal` is aborted.
 *
 * @param opts - Model alias and optional API key.
 * @returns A provider named `"assemblyai"`.
 * @throws From `open()`: an error with code `stt_auth_failed` when no API key
 *   is available, or `stt_connect_failed` when the connection cannot be made.
 */
export function assemblyAI(opts: AssemblyAIOptions = {}): SttProvider {
  return {
    name: "assemblyai",
    async open(openOpts: SttOpenOptions): Promise<SttSession> {
      // `||` rather than `??`: an empty-string key is never usable, so it must
      // not shadow the next fallback in the chain.
      const apiKey = opts.apiKey || openOpts.apiKey || process.env.ASSEMBLYAI_API_KEY;
      if (!apiKey) {
        const err = new Error(
          "AssemblyAI STT adapter: missing API key. Provide via the factory option, SttOpenOptions, or the ASSEMBLYAI_API_KEY environment variable.",
        ) as SttError & { code: SttError["code"] };
        (err as { code: SttError["code"] }).code = "stt_auth_failed";
        throw err;
      }

      const client = new AssemblyAI({ apiKey });
      const speechModel = resolveSpeechModel(opts.model ?? "u3pro-rt");
      const transcriber = client.streaming.transcriber({
        sampleRate: openOpts.sampleRate,
        // The SDK types `speechModel` as a string-literal union; the adapter
        // accepts `string` as an escape hatch, so cast at the boundary.
        speechModel: speechModel as never,
        ...(openOpts.sttPrompt ? { prompt: openOpts.sttPrompt } : {}),
      });

      const emitter: Emitter<SttEvents> = createNanoEvents<SttEvents>();
      // Set by close(): the caller tore the session down, so suppress all events.
      let closed = false;
      // Set once the socket is known to be unusable (SDK reported close, or a
      // send failed), so sendAudio stops writing to it.
      let socketDead = false;

      transcriber.on("turn", (event) => {
        if (closed) return;
        const text = event.transcript ?? "";
        if (event.end_of_turn) {
          if (text.length > 0) emitter.emit("final", text);
        } else if (text.length > 0) {
          emitter.emit("partial", text);
        }
      });

      transcriber.on("error", (err) => {
        if (closed) return;
        emitter.emit("error", makeError(err?.message ?? String(err)));
      });

      transcriber.on("close", (code) => {
        socketDead = true;
        if (closed) return;
        // 1000 = normal closure.
        if (code !== 1000) {
          emitter.emit("error", makeError(`socket closed ${code}`));
        }
      });

      try {
        await transcriber.connect();
      } catch (cause) {
        // No session is returned on this path; ignore any late SDK events.
        closed = true;
        const err = new Error(
          `AssemblyAI STT: connect failed: ${cause instanceof Error ? cause.message : String(cause)}`,
        ) as SttError & { code: SttError["code"] };
        (err as { code: SttError["code"] }).code = "stt_connect_failed";
        throw err;
      }

      const close = async (): Promise<void> => {
        if (closed) return;
        closed = true;
        // Detach from the session-level signal so it does not retain this session.
        openOpts.signal.removeEventListener("abort", onAbort);
        try {
          await transcriber.close();
        } catch {
          // Swallow: the caller has already decided to tear down.
        }
      };
      const onAbort = (): void => {
        void close();
      };

      // Wire session-level abort to close the SDK socket.
      if (openOpts.signal.aborted) {
        void close();
      } else {
        openOpts.signal.addEventListener("abort", onAbort, { once: true });
      }

      const session: AssemblyAISession = {
        sendAudio(pcm: Int16Array) {
          if (closed || socketDead) return;
          // The SDK's sendAudio accepts ArrayBufferLike. Forward a detached
          // copy of the PCM view's window so the consumer sees only this
          // chunk's bytes.
          const copy = new Uint8Array(pcm.byteLength);
          copy.set(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
          try {
            transcriber.sendAudio(copy.buffer);
          } catch (cause) {
            // Report once through the event channel instead of throwing into
            // the caller's audio path, then stop sending.
            socketDead = true;
            emitter.emit(
              "error",
              makeError(
                `AssemblyAI STT: send failed: ${cause instanceof Error ? cause.message : String(cause)}`,
              ),
            );
          }
        },
        on(event, fn) {
          return emitter.on(event, fn);
        },
        close,
        _transcriber: transcriber,
      };

      return session;
    },
  };
}