@alexkroman1/aai 1.2.3 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/.turbo/turbo-build.log +14 -12
  2. package/CHANGELOG.md +20 -0
  3. package/dist/{constants-VTFoymJ-.js → constants-BL3nvg4I.js} +8 -1
  4. package/dist/host/_pipeline-test-fakes.d.ts +117 -0
  5. package/dist/host/pipeline-session-ctx.d.ts +24 -0
  6. package/dist/host/pipeline-session.d.ts +48 -0
  7. package/dist/host/providers/llm.d.ts +2 -0
  8. package/dist/host/providers/stt/assemblyai.d.ts +31 -0
  9. package/dist/host/providers/stt-barrel.d.ts +8 -0
  10. package/dist/host/providers/stt-barrel.js +92 -0
  11. package/dist/host/providers/stt.d.ts +2 -0
  12. package/dist/host/providers/tts/cartesia.d.ts +39 -0
  13. package/dist/host/providers/tts-barrel.d.ts +8 -0
  14. package/dist/host/providers/tts-barrel.js +182 -0
  15. package/dist/host/providers/tts.d.ts +2 -0
  16. package/dist/host/runtime-barrel.js +565 -81
  17. package/dist/host/runtime.d.ts +17 -0
  18. package/dist/host/s2s.d.ts +5 -0
  19. package/dist/host/session-ctx.d.ts +22 -4
  20. package/dist/host/to-vercel-tools.d.ts +45 -0
  21. package/dist/index.js +7 -2
  22. package/dist/sdk/_internal-types.d.ts +15 -1
  23. package/dist/sdk/constants.d.ts +7 -0
  24. package/dist/sdk/define.d.ts +21 -0
  25. package/dist/sdk/manifest.d.ts +22 -0
  26. package/dist/sdk/protocol.d.ts +3 -3
  27. package/dist/sdk/protocol.js +1 -1
  28. package/dist/sdk/providers.d.ts +70 -0
  29. package/dist/sdk/types.d.ts +16 -0
  30. package/exports-no-dev-deps.test.ts +39 -14
  31. package/host/_pipeline-test-fakes.ts +357 -0
  32. package/host/_test-utils.ts +1 -0
  33. package/host/integration/fixtures/README.md +49 -0
  34. package/host/integration/pipeline-reference.integration.test.ts +124 -0
  35. package/host/pipeline-session-ctx.test.ts +31 -0
  36. package/host/pipeline-session-ctx.ts +36 -0
  37. package/host/pipeline-session.test.ts +572 -0
  38. package/host/pipeline-session.ts +489 -0
  39. package/host/providers/llm.ts +3 -0
  40. package/host/providers/providers.test-d.ts +31 -0
  41. package/host/providers/stt/assemblyai.test.ts +100 -0
  42. package/host/providers/stt/assemblyai.ts +154 -0
  43. package/host/providers/stt/fixtures/assemblyai/basic-turn.json +30 -0
  44. package/host/providers/stt-barrel.ts +13 -0
  45. package/host/providers/stt.ts +3 -0
  46. package/host/providers/tts/cartesia.test.ts +210 -0
  47. package/host/providers/tts/cartesia.ts +251 -0
  48. package/host/providers/tts-barrel.ts +13 -0
  49. package/host/providers/tts.ts +3 -0
  50. package/host/runtime.test.ts +81 -1
  51. package/host/runtime.ts +61 -0
  52. package/host/s2s.test.ts +19 -0
  53. package/host/s2s.ts +10 -0
  54. package/host/session-ctx.ts +35 -8
  55. package/host/to-vercel-tools.test.ts +187 -0
  56. package/host/to-vercel-tools.ts +74 -0
  57. package/package.json +15 -1
  58. package/sdk/__snapshots__/exports.test.ts.snap +2 -0
  59. package/sdk/_internal-types.ts +16 -0
  60. package/sdk/constants.ts +8 -0
  61. package/sdk/define.test-d.ts +21 -0
  62. package/sdk/define.test.ts +33 -0
  63. package/sdk/define.ts +21 -0
  64. package/sdk/manifest.test-d.ts +14 -0
  65. package/sdk/manifest.test.ts +51 -0
  66. package/sdk/manifest.ts +39 -0
  67. package/sdk/providers.ts +90 -0
  68. package/sdk/types.ts +16 -0
  69. package/vitest.config.ts +1 -0
@@ -0,0 +1,154 @@
1
+ // Copyright 2025 the AAI authors. MIT license.
2
+ /**
3
+ * AssemblyAI Universal-Streaming STT adapter.
4
+ *
5
+ * Wraps the `assemblyai` Node SDK's {@link StreamingTranscriber} and
6
+ * normalizes its event surface onto the {@link SttProvider} /
7
+ * {@link SttEvents} contract consumed by the pipeline orchestrator.
8
+ *
9
+ * Default model: `"u3pro-rt"` (Universal-3 Pro Real-Time). The adapter
10
+ * maps that to the SDK's `"u3-rt-pro"` `speechModel` value; any other
11
+ * string is forwarded verbatim.
12
+ */
13
+
14
+ import { AssemblyAI, type StreamingTranscriber } from "assemblyai";
15
+ import { createNanoEvents, type Emitter } from "nanoevents";
16
+ import type {
17
+ SttError,
18
+ SttEvents,
19
+ SttOpenOptions,
20
+ SttProvider,
21
+ SttSession,
22
+ } from "../../../sdk/providers.ts";
23
+
24
+ export interface AssemblyAIOptions {
25
+ /**
26
+ * Streaming speech model. Defaults to `"u3pro-rt"` (Universal-3 Pro
27
+ * Real-Time). Arbitrary strings are forwarded to the SDK unchanged.
28
+ */
29
+ model?: "u3pro-rt" | string;
30
+ /**
31
+ * AssemblyAI API key. Falls back to `SttOpenOptions.apiKey`, then
32
+ * `process.env.ASSEMBLYAI_API_KEY`.
33
+ */
34
+ apiKey?: string;
35
+ }
36
+
37
+ /** Internal: SttSession with a test-only handle to the raw SDK transcriber. */
38
+ export interface AssemblyAISession extends SttSession {
39
+ /** @internal Test-only: exposes the underlying SDK transcriber for fixture replay. */
40
+ readonly _transcriber: StreamingTranscriber;
41
+ }
42
+
43
/**
 * Map the adapter-facing model name onto the `speechModel` string handed to
 * the AssemblyAI SDK.
 *
 * @param model - Model name as supplied to the adapter.
 * @returns `"u3-rt-pro"` for the alias `"u3pro-rt"`; any other value unchanged.
 */
function resolveSpeechModel(model: string): string {
  // The adapter's public spelling is "u3pro-rt"; the SDK spells it "u3-rt-pro".
  return model === "u3pro-rt" ? "u3-rt-pro" : model;
}
49
+
50
/**
 * Build an {@link SttError} tagged with the `"stt_stream_error"` code.
 *
 * @param message - Human-readable description of the failure.
 * @returns An `Error` instance whose `code` is `"stt_stream_error"`.
 */
function makeError(message: string): SttError {
  const failure = new Error(message);
  // `Error` has no `code` field of its own; attach it, then narrow to the
  // provider contract type.
  return Object.assign(failure, { code: "stt_stream_error" }) as SttError;
}
55
+
56
/** Build an {@link SttError} carrying the given error `code`. */
function sttFailure(code: SttError["code"], message: string): SttError {
  return Object.assign(new Error(message), { code }) as SttError;
}

/**
 * Create an AssemblyAI-backed {@link SttProvider}.
 *
 * `open()` resolves the API key (factory option, then `SttOpenOptions.apiKey`,
 * then `process.env.ASSEMBLYAI_API_KEY`), builds an SDK streaming transcriber,
 * connects it, and returns a session that re-emits SDK events:
 *
 * - `turn` with a non-empty transcript → `final` when `end_of_turn` is set,
 *   otherwise `partial`;
 * - `error` → `error` (code `"stt_stream_error"`);
 * - `close` with a code other than 1000 → `error` (code `"stt_stream_error"`).
 *
 * Nothing is emitted once the session has been closed.
 *
 * @param opts - Optional model name and API key.
 * @returns A provider named `"assemblyai"`.
 * @throws {SttError} From `open()`: `"stt_auth_failed"` when no API key is
 *   available, `"stt_connect_failed"` when `transcriber.connect()` rejects.
 */
export function assemblyAI(opts: AssemblyAIOptions = {}): SttProvider {
  return {
    name: "assemblyai",
    async open(openOpts: SttOpenOptions): Promise<SttSession> {
      const apiKey = opts.apiKey ?? openOpts.apiKey ?? process.env.ASSEMBLYAI_API_KEY;
      if (!apiKey) {
        throw sttFailure(
          "stt_auth_failed",
          "AssemblyAI STT adapter: missing API key. Provide via the factory option, SttOpenOptions, or the ASSEMBLYAI_API_KEY environment variable.",
        );
      }

      const client = new AssemblyAI({ apiKey });
      const speechModel = resolveSpeechModel(opts.model ?? "u3pro-rt");
      const transcriber = client.streaming.transcriber({
        sampleRate: openOpts.sampleRate,
        // The SDK types `speechModel` as a string-literal union; the adapter
        // accepts `string` as an escape hatch, so cast at the boundary.
        speechModel: speechModel as never,
        // Only include `prompt` when the caller supplied a non-empty one.
        ...(openOpts.sttPrompt ? { prompt: openOpts.sttPrompt } : {}),
      });

      const emitter: Emitter<SttEvents> = createNanoEvents<SttEvents>();
      let closed = false;

      transcriber.on("turn", (event) => {
        if (closed) return;
        const text = event.transcript ?? "";
        // Empty transcripts are never forwarded, final or partial.
        if (text.length === 0) return;
        emitter.emit(event.end_of_turn ? "final" : "partial", text);
      });

      transcriber.on("error", (err) => {
        if (closed) return;
        emitter.emit("error", makeError(err?.message ?? String(err)));
      });

      transcriber.on("close", (code) => {
        if (closed) return;
        // 1000 = normal closure.
        if (code !== 1000) {
          emitter.emit("error", makeError(`socket closed ${code}`));
        }
      });

      try {
        await transcriber.connect();
      } catch (cause) {
        throw sttFailure(
          "stt_connect_failed",
          `AssemblyAI STT: connect failed: ${cause instanceof Error ? cause.message : String(cause)}`,
        );
      }

      // Named so `close()` can detach it again; otherwise the listener (and
      // everything it closes over) stays attached to the signal after an
      // explicit close.
      const onAbort = (): void => {
        void close();
      };

      const close = async (): Promise<void> => {
        if (closed) return;
        closed = true;
        openOpts.signal.removeEventListener("abort", onAbort);
        try {
          await transcriber.close();
        } catch {
          // Swallow: the caller has already decided to tear down.
        }
      };

      // Wire session-level abort to close the SDK socket. The signal may have
      // been aborted while `connect()` was pending, so check before listening.
      if (openOpts.signal.aborted) {
        void close();
      } else {
        openOpts.signal.addEventListener("abort", onAbort, { once: true });
      }

      const session: AssemblyAISession = {
        sendAudio(pcm: Int16Array) {
          if (closed) return;
          // Forward a standalone copy of exactly this view's bytes; `pcm` may
          // be a window onto a larger buffer.
          const copy = new Uint8Array(pcm.byteLength);
          copy.set(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
          transcriber.sendAudio(copy.buffer);
        },
        on(event, fn) {
          return emitter.on(event, fn);
        },
        close,
        _transcriber: transcriber,
      };

      return session;
    },
  };
}
@@ -0,0 +1,30 @@
1
+ [
2
+ { "type": "Begin", "id": "sess-x", "expires_at": 0 },
3
+ {
4
+ "type": "Turn",
5
+ "turn_order": 1,
6
+ "turn_is_formatted": false,
7
+ "end_of_turn": false,
8
+ "transcript": "what",
9
+ "end_of_turn_confidence": 0.1,
10
+ "words": []
11
+ },
12
+ {
13
+ "type": "Turn",
14
+ "turn_order": 1,
15
+ "turn_is_formatted": false,
16
+ "end_of_turn": false,
17
+ "transcript": "what's the",
18
+ "end_of_turn_confidence": 0.3,
19
+ "words": []
20
+ },
21
+ {
22
+ "type": "Turn",
23
+ "turn_order": 1,
24
+ "turn_is_formatted": true,
25
+ "end_of_turn": true,
26
+ "transcript": "what's the weather?",
27
+ "end_of_turn_confidence": 0.95,
28
+ "words": []
29
+ }
30
+ ]
@@ -0,0 +1,13 @@
1
+ // Copyright 2025 the AAI authors. MIT license.
2
+ /**
3
+ * `@alexkroman1/aai/stt` subpath barrel. Re-exports the STT provider
4
+ * contract types (via `stt.ts` → `sdk/providers.ts`) alongside the
5
+ * concrete AssemblyAI adapter factory. Task 9 owns wiring this file
6
+ * into `package.json` exports.
7
+ */
8
+
9
+ // biome-ignore lint/performance/noReExportAll: subpath barrel
10
+ export * from "./stt/assemblyai.ts";
11
+ // Type-only re-export — no biome suppression needed; `export type *` is
12
+ // excluded from the `noReExportAll` rule.
13
+ export type * from "./stt.ts";
@@ -0,0 +1,3 @@
1
+ // Copyright 2025 the AAI authors. MIT license.
2
+ /** STT provider interface — re-exported from sdk/ for host-side consumption. */
3
+ export type * from "../../sdk/providers.ts";
@@ -0,0 +1,210 @@
1
+ // Copyright 2025 the AAI authors. MIT license.
2
+ /** Unit test for the Cartesia TTS adapter. Mocks `@cartesia/cartesia-js`. */
3
+
4
+ import { beforeEach, describe, expect, test, vi } from "vitest";
5
+ import { flush } from "../../_test-utils.ts";
6
+ import { type CartesiaSession, cartesia } from "./cartesia.ts";
7
+
8
+ // Recorded interactions on the fake `TTSWSContext` — one entry per method call.
9
+ interface RecordedSend {
10
+ kind: "send" | "cancel";
11
+ contextId: string;
12
+ transcript?: string | undefined;
13
+ continue?: boolean | undefined;
14
+ language?: string | undefined;
15
+ model_id?: string | undefined;
16
+ }
17
+
18
+ const sends: RecordedSend[] = [];
19
+
20
+ /** Minimal shape of the request the adapter sends to Cartesia. */
21
+ interface FakeGenerationRequest {
22
+ transcript: string;
23
+ continue: boolean;
24
+ language?: string;
25
+ model_id?: string;
26
+ }
27
+
28
+ /**
29
+ * Fake `TTSWSContext`. Mirrors the fields the adapter touches:
30
+ * `contextId`, `send`, `cancel`.
31
+ */
32
+ interface FakeContext {
33
+ contextId: string;
34
+ send(req: FakeGenerationRequest): Promise<void>;
35
+ cancel(): Promise<void>;
36
+ }
37
+
38
+ /** Fake `TTSWS`. EventEmitter-ish with a `_fire` test hook. */
39
+ interface FakeTTSWS {
40
+ contexts: FakeContext[];
41
+ context(opts: { contextId: string }): FakeContext;
42
+ on(event: string, fn: (...args: unknown[]) => void): FakeTTSWS;
43
+ close(props?: { code: number; reason: string }): void;
44
+ _fire(event: string, payload: unknown): void;
45
+ }
46
+
47
+ vi.mock("@cartesia/cartesia-js", () => {
48
+ const makeWs = (): FakeTTSWS => {
49
+ const listeners = new Map<string, Array<(...args: unknown[]) => void>>();
50
+ const ws: FakeTTSWS = {
51
+ contexts: [],
52
+ context(opts) {
53
+ const ctx: FakeContext = {
54
+ contextId: opts.contextId,
55
+ async send(req) {
56
+ sends.push({
57
+ kind: "send",
58
+ contextId: ctx.contextId,
59
+ transcript: req.transcript,
60
+ continue: req.continue,
61
+ language: req.language,
62
+ model_id: req.model_id,
63
+ });
64
+ },
65
+ async cancel() {
66
+ sends.push({ kind: "cancel", contextId: ctx.contextId });
67
+ },
68
+ };
69
+ ws.contexts.push(ctx);
70
+ return ctx;
71
+ },
72
+ on(event, fn) {
73
+ const arr = listeners.get(event) ?? [];
74
+ arr.push(fn);
75
+ listeners.set(event, arr);
76
+ return ws;
77
+ },
78
+ close(_props) {
79
+ /* no-op */
80
+ },
81
+ _fire(event, payload) {
82
+ for (const fn of listeners.get(event) ?? []) fn(payload);
83
+ },
84
+ };
85
+ return ws;
86
+ };
87
+ return {
88
+ Cartesia: class {
89
+ tts = {
90
+ websocket: async () => makeWs(),
91
+ };
92
+ },
93
+ };
94
+ });
95
+
96
+ beforeEach(() => {
97
+ sends.length = 0;
98
+ });
99
+
100
+ async function openSession(): Promise<{
101
+ session: CartesiaSession;
102
+ controller: AbortController;
103
+ }> {
104
+ const provider = cartesia({ voice: "voice-id", apiKey: "k" });
105
+ const controller = new AbortController();
106
+ const session = (await provider.open({
107
+ sampleRate: 16_000,
108
+ apiKey: "k",
109
+ signal: controller.signal,
110
+ })) as CartesiaSession;
111
+ return { session, controller };
112
+ }
113
+
114
+ describe("cartesia TTS adapter", () => {
115
+ test("sendText deltas share one contextId; flush ends the turn; next turn uses a fresh contextId", async () => {
116
+ const { session, controller } = await openSession();
117
+ const turn1 = session._currentContextId();
118
+
119
+ session.sendText("hello");
120
+ session.sendText(" world");
121
+ session.flush();
122
+ await flush();
123
+
124
+ // All three sends for turn 1 carry the same contextId — two deltas with
125
+ // continue: true, then an empty-transcript send with continue: false.
126
+ const turn1Sends = sends.filter((s) => s.contextId === turn1);
127
+ expect(turn1Sends).toEqual([
128
+ {
129
+ kind: "send",
130
+ contextId: turn1,
131
+ transcript: "hello",
132
+ continue: true,
133
+ language: "en",
134
+ model_id: "sonic-2",
135
+ },
136
+ {
137
+ kind: "send",
138
+ contextId: turn1,
139
+ transcript: " world",
140
+ continue: true,
141
+ language: "en",
142
+ model_id: "sonic-2",
143
+ },
144
+ {
145
+ kind: "send",
146
+ contextId: turn1,
147
+ transcript: "",
148
+ continue: false,
149
+ language: "en",
150
+ model_id: "sonic-2",
151
+ },
152
+ ]);
153
+
154
+ // After flush(), the adapter has rotated to a new context.
155
+ const turn2 = session._currentContextId();
156
+ expect(turn2).not.toBe(turn1);
157
+
158
+ // Subsequent sendText targets the new context.
159
+ session.sendText("next");
160
+ await flush();
161
+ expect(sends.filter((s) => s.contextId === turn2)).toEqual([
162
+ {
163
+ kind: "send",
164
+ contextId: turn2,
165
+ transcript: "next",
166
+ continue: true,
167
+ language: "en",
168
+ model_id: "sonic-2",
169
+ },
170
+ ]);
171
+
172
+ controller.abort();
173
+ await session.close();
174
+ });
175
+
176
+ test("cancel() calls ws.cancelContext(contextId) and emits `done` synchronously", async () => {
177
+ const { session, controller } = await openSession();
178
+ const turn1 = session._currentContextId();
179
+
180
+ const doneEvents: number[] = [];
181
+ session.on("done", () => doneEvents.push(Date.now()));
182
+
183
+ session.sendText("hello");
184
+ // cancel() must emit `done` synchronously — the orchestrator advances
185
+ // state on `done`, and barge-in response cannot be microtask-deferred.
186
+ session.cancel();
187
+ expect(doneEvents.length).toBe(1);
188
+
189
+ await flush();
190
+
191
+ // We expect: send("hello", continue:true) on turn1, then cancel(turn1).
192
+ expect(sends).toEqual([
193
+ {
194
+ kind: "send",
195
+ contextId: turn1,
196
+ transcript: "hello",
197
+ continue: true,
198
+ language: "en",
199
+ model_id: "sonic-2",
200
+ },
201
+ { kind: "cancel", contextId: turn1 },
202
+ ]);
203
+
204
+ // Cancelling rotates the context so the next turn is unambiguous.
205
+ expect(session._currentContextId()).not.toBe(turn1);
206
+
207
+ controller.abort();
208
+ await session.close();
209
+ });
210
+ });
@@ -0,0 +1,251 @@
1
+ // Copyright 2025 the AAI authors. MIT license.
2
+ /**
3
+ * Cartesia TTS adapter — streaming WebSocket with per-turn `context_id`.
4
+ *
5
+ * Wraps `@cartesia/cartesia-js`'s `TTSWS` / `TTSWSContext` and normalizes it
6
+ * onto the {@link TtsProvider} / {@link TtsEvents} contract consumed by the
7
+ * pipeline orchestrator.
8
+ *
9
+ * **Per-turn context lifecycle.** Each `sendText(...)` within the same turn
10
+ * appends to the same Cartesia context. On `flush()` or `cancel()`, a new
11
+ * context is minted for the next turn — so concurrent `cancel({ contextId })`
12
+ * only targets the in-flight turn, never the one that follows.
13
+ *
14
+ * **Audio format.** The adapter requests `raw` / `pcm_s16le` at the
15
+ * negotiated `sampleRate` so it can forward chunks as `Int16Array` with no
16
+ * conversion.
17
+ */
18
+
19
+ import { randomUUID } from "node:crypto";
20
+ import { Cartesia } from "@cartesia/cartesia-js";
21
+ import type { TTSWS, TTSWSContext } from "@cartesia/cartesia-js/resources/tts";
22
+ import { createNanoEvents, type Emitter } from "nanoevents";
23
+ import type {
24
+ TtsError,
25
+ TtsEvents,
26
+ TtsOpenOptions,
27
+ TtsProvider,
28
+ TtsSession,
29
+ } from "../../../sdk/providers.ts";
30
+
31
+ export interface CartesiaOptions {
32
+ /** Cartesia voice ID. Required. */
33
+ voice: string;
34
+ /** Model ID. Defaults to `"sonic-2"`. */
35
+ model?: string;
36
+ /**
37
+ * Cartesia API key. Falls back to `TtsOpenOptions.apiKey`, then
38
+ * `process.env.CARTESIA_API_KEY`.
39
+ */
40
+ apiKey?: string;
41
+ /** Spoken language hint. Defaults to `"en"`. */
42
+ language?: string;
43
+ }
44
+
45
+ /** Internal: TtsSession with a test-only handle to the raw SDK socket. */
46
+ export interface CartesiaSession extends TtsSession {
47
+ /** @internal Test-only: exposes the underlying SDK WebSocket wrapper. */
48
+ readonly _ws: TTSWS;
49
+ /** @internal Test-only: id of the currently-active context. */
50
+ readonly _currentContextId: () => string;
51
+ }
52
+
53
+ function makeError(message: string): TtsError {
54
+ const err = new Error(message) as TtsError & { code: TtsError["code"] };
55
+ (err as { code: TtsError["code"] }).code = "tts_stream_error";
56
+ return err;
57
+ }
58
+
59
+ /** PCM16 sample rates supported by Cartesia's `raw` output format. */
60
+ const CARTESIA_PCM16_RATES = [
61
+ 8000, 16_000, 22_050, 24_000, 44_100, 48_000,
62
+ ] as const satisfies readonly number[];
63
+ type CartesiaSampleRate = (typeof CARTESIA_PCM16_RATES)[number];
64
+
65
+ function assertSupportedSampleRate(rate: number): CartesiaSampleRate {
66
+ if ((CARTESIA_PCM16_RATES as readonly number[]).includes(rate)) {
67
+ return rate as CartesiaSampleRate;
68
+ }
69
+ const err = new Error(
70
+ `Cartesia TTS adapter: unsupported sample rate ${rate}. Supported: ${CARTESIA_PCM16_RATES.join(", ")}.`,
71
+ ) as TtsError & { code: TtsError["code"] };
72
+ (err as { code: TtsError["code"] }).code = "tts_connect_failed";
73
+ throw err;
74
+ }
75
+
76
+ export function cartesia(opts: CartesiaOptions): TtsProvider {
77
+ return {
78
+ name: "cartesia",
79
+ async open(openOpts: TtsOpenOptions): Promise<TtsSession> {
80
+ const apiKey = opts.apiKey ?? openOpts.apiKey ?? process.env.CARTESIA_API_KEY;
81
+ if (!apiKey) {
82
+ const err = new Error(
83
+ "Cartesia TTS adapter: missing API key. Provide via the factory option, TtsOpenOptions, or the CARTESIA_API_KEY environment variable.",
84
+ ) as TtsError & { code: TtsError["code"] };
85
+ (err as { code: TtsError["code"] }).code = "tts_auth_failed";
86
+ throw err;
87
+ }
88
+
89
+ const sampleRate = assertSupportedSampleRate(openOpts.sampleRate);
90
+ const model = opts.model ?? "sonic-2";
91
+ const language = opts.language ?? "en";
92
+
93
+ const client = new Cartesia({ apiKey });
94
+ let ws: TTSWS;
95
+ try {
96
+ ws = await client.tts.websocket();
97
+ } catch (cause) {
98
+ const err = new Error(
99
+ `Cartesia TTS: connect failed: ${cause instanceof Error ? cause.message : String(cause)}`,
100
+ ) as TtsError & { code: TtsError["code"] };
101
+ (err as { code: TtsError["code"] }).code = "tts_connect_failed";
102
+ throw err;
103
+ }
104
+
105
+ const emitter: Emitter<TtsEvents> = createNanoEvents<TtsEvents>();
106
+ let closed = false;
107
+
108
+ /** Mint a fresh context bound to the shared TTSWS connection. */
109
+ const mintContext = (): TTSWSContext =>
110
+ ws.context({
111
+ model_id: model,
112
+ voice: { mode: "id", id: opts.voice },
113
+ output_format: {
114
+ container: "raw",
115
+ encoding: "pcm_s16le",
116
+ sample_rate: sampleRate,
117
+ },
118
+ contextId: randomUUID(),
119
+ });
120
+
121
+ let context = mintContext();
122
+ /**
123
+ * `doneEmitted` guards against emitting `done` more than once per turn.
124
+ * Reset whenever a fresh context is minted (i.e. at turn boundaries).
125
+ */
126
+ let doneEmitted = false;
127
+ const rotateContext = () => {
128
+ context = mintContext();
129
+ doneEmitted = false;
130
+ };
131
+ const emitDoneOnce = () => {
132
+ if (doneEmitted || closed) return;
133
+ doneEmitted = true;
134
+ emitter.emit("done");
135
+ };
136
+
137
+ // Route SDK events onto the adapter's event surface, filtering by the
138
+ // currently-active `context_id`. The TTSWS EventEmitter fires globally
139
+ // across all contexts on the socket; we only care about the active one.
140
+ ws.on("chunk", (event) => {
141
+ if (closed) return;
142
+ if (event.context_id !== context.contextId) return;
143
+ // SDK decodes base64 → Buffer on receipt (`event.audio`). Forward as
144
+ // Int16Array over the same byte window.
145
+ const buf = event.audio;
146
+ if (!buf || buf.byteLength === 0) return;
147
+ // Cartesia sends PCM16 little-endian with even byte counts. Be defensive.
148
+ const evenBytes = buf.byteLength - (buf.byteLength % 2);
149
+ if (evenBytes === 0) return;
150
+ const pcm = new Int16Array(buf.buffer.slice(buf.byteOffset, buf.byteOffset + evenBytes));
151
+ emitter.emit("audio", pcm);
152
+ });
153
+
154
+ ws.on("done", (event) => {
155
+ if (closed) return;
156
+ if (event.context_id !== context.contextId) return;
157
+ emitDoneOnce();
158
+ });
159
+
160
+ ws.on("error", (err) => {
161
+ if (closed) return;
162
+ emitter.emit("error", makeError(err?.message ?? String(err)));
163
+ });
164
+
165
+ const close = async (): Promise<void> => {
166
+ if (closed) return;
167
+ closed = true;
168
+ try {
169
+ ws.close({ code: 1000, reason: "client close" });
170
+ } catch {
171
+ // Swallow: caller has already decided to tear down.
172
+ }
173
+ };
174
+
175
+ // Session-level abort → close the SDK socket.
176
+ if (openOpts.signal.aborted) {
177
+ void close();
178
+ } else {
179
+ openOpts.signal.addEventListener("abort", () => void close(), {
180
+ once: true,
181
+ });
182
+ }
183
+
184
+ /** Static part of each generation request; only `transcript` and
185
+ * `continue` vary per send. Pinned here so `language` threads through. */
186
+ const baseRequest = {
187
+ model_id: model,
188
+ voice: { mode: "id" as const, id: opts.voice },
189
+ output_format: {
190
+ container: "raw" as const,
191
+ encoding: "pcm_s16le" as const,
192
+ sample_rate: sampleRate,
193
+ },
194
+ language,
195
+ };
196
+
197
+ /**
198
+ * Swallow rejections from async SDK calls — the global `error`
199
+ * listener on `ws` emits a normalized {@link TtsError}, so there's
200
+ * nothing useful for the caller to do with per-send failures.
201
+ */
202
+ const ignoreRejection = (_err: unknown): void => {
203
+ // intentionally empty
204
+ };
205
+
206
+ const session: CartesiaSession = {
207
+ sendText(text: string) {
208
+ if (closed || text.length === 0) return;
209
+ // Send a delta with `continue: true`, sharing the same
210
+ // context_id across all deltas of this turn.
211
+ void context
212
+ .send({ ...baseRequest, transcript: text, continue: true })
213
+ .catch(ignoreRejection);
214
+ },
215
+ flush() {
216
+ if (closed) return;
217
+ // Send an empty transcript with `continue: false` — the canonical
218
+ // end-of-turn signal. The server replies with a `done` event
219
+ // tagged with this context's id, which drives `emitDoneOnce`. We
220
+ // also microtask-emit `done` as a fallback so the orchestrator's
221
+ // state machine can't wedge if the server event is dropped.
222
+ // TODO: drop the microtask fallback once we've verified Cartesia
223
+ // always emits a `done` for cleanly-flushed contexts. See
224
+ // 2026-04-22-pluggable-providers-design.md → "Note on flush() timing".
225
+ void context
226
+ .send({ ...baseRequest, transcript: "", continue: false })
227
+ .catch(ignoreRejection);
228
+ queueMicrotask(emitDoneOnce);
229
+ rotateContext();
230
+ },
231
+ cancel() {
232
+ if (closed) return;
233
+ // `cancel()` calls ws.cancelContext(contextId) under the hood.
234
+ void context.cancel().catch(ignoreRejection);
235
+ // Emit `done` synchronously — the orchestrator's state machine
236
+ // advances on `done`, and barge-in must not be delayed.
237
+ emitDoneOnce();
238
+ rotateContext();
239
+ },
240
+ on(event, fn) {
241
+ return emitter.on(event, fn);
242
+ },
243
+ close,
244
+ _ws: ws,
245
+ _currentContextId: () => context.contextId,
246
+ };
247
+
248
+ return session;
249
+ },
250
+ };
251
+ }
@@ -0,0 +1,13 @@
1
+ // Copyright 2025 the AAI authors. MIT license.
2
+ /**
3
+ * `@alexkroman1/aai/tts` subpath barrel. Re-exports the TTS provider
4
+ * contract types (via `tts.ts` → `sdk/providers.ts`) alongside the
5
+ * concrete Cartesia adapter factory. Task 9 owns wiring this file
6
+ * into `package.json` exports.
7
+ */
8
+
9
+ // biome-ignore lint/performance/noReExportAll: subpath barrel
10
+ export * from "./tts/cartesia.ts";
11
+ // Type-only re-export — no biome suppression needed; `export type *` is
12
+ // excluded from the `noReExportAll` rule.
13
+ export type * from "./tts.ts";
@@ -0,0 +1,3 @@
1
+ // Copyright 2025 the AAI authors. MIT license.
2
+ /** TTS provider interface — re-exported from sdk/ for host-side consumption. */
3
+ export type * from "../../sdk/providers.ts";