@alexkroman1/aai 1.2.3 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. package/.turbo/turbo-build.log +14 -12
  2. package/CHANGELOG.md +14 -0
  3. package/dist/host/_pipeline-test-fakes.d.ts +107 -0
  4. package/dist/host/pipeline-session-ctx.d.ts +24 -0
  5. package/dist/host/pipeline-session.d.ts +48 -0
  6. package/dist/host/providers/llm.d.ts +2 -0
  7. package/dist/host/providers/stt/assemblyai.d.ts +31 -0
  8. package/dist/host/providers/stt-barrel.d.ts +8 -0
  9. package/dist/host/providers/stt-barrel.js +92 -0
  10. package/dist/host/providers/stt.d.ts +2 -0
  11. package/dist/host/providers/tts/cartesia.d.ts +39 -0
  12. package/dist/host/providers/tts-barrel.d.ts +8 -0
  13. package/dist/host/providers/tts-barrel.js +182 -0
  14. package/dist/host/providers/tts.d.ts +2 -0
  15. package/dist/host/runtime-barrel.js +498 -80
  16. package/dist/host/runtime.d.ts +17 -0
  17. package/dist/host/s2s.d.ts +5 -0
  18. package/dist/host/session-ctx.d.ts +22 -4
  19. package/dist/host/to-vercel-tools.d.ts +44 -0
  20. package/dist/index.js +5 -0
  21. package/dist/sdk/_internal-types.d.ts +15 -1
  22. package/dist/sdk/define.d.ts +21 -0
  23. package/dist/sdk/manifest.d.ts +22 -0
  24. package/dist/sdk/protocol.d.ts +3 -3
  25. package/dist/sdk/providers.d.ts +70 -0
  26. package/dist/sdk/types.d.ts +16 -0
  27. package/exports-no-dev-deps.test.ts +39 -14
  28. package/host/_pipeline-test-fakes.ts +323 -0
  29. package/host/_test-utils.ts +1 -0
  30. package/host/integration/fixtures/README.md +49 -0
  31. package/host/integration/pipeline-reference.integration.test.ts +124 -0
  32. package/host/pipeline-session-ctx.test.ts +31 -0
  33. package/host/pipeline-session-ctx.ts +36 -0
  34. package/host/pipeline-session.test.ts +337 -0
  35. package/host/pipeline-session.ts +405 -0
  36. package/host/providers/llm.ts +3 -0
  37. package/host/providers/providers.test-d.ts +31 -0
  38. package/host/providers/stt/assemblyai.test.ts +100 -0
  39. package/host/providers/stt/assemblyai.ts +154 -0
  40. package/host/providers/stt/fixtures/assemblyai/basic-turn.json +30 -0
  41. package/host/providers/stt-barrel.ts +13 -0
  42. package/host/providers/stt.ts +3 -0
  43. package/host/providers/tts/cartesia.test.ts +210 -0
  44. package/host/providers/tts/cartesia.ts +251 -0
  45. package/host/providers/tts-barrel.ts +13 -0
  46. package/host/providers/tts.ts +3 -0
  47. package/host/runtime.test.ts +81 -1
  48. package/host/runtime.ts +61 -0
  49. package/host/s2s.test.ts +19 -0
  50. package/host/s2s.ts +10 -0
  51. package/host/session-ctx.ts +35 -8
  52. package/host/to-vercel-tools.test.ts +153 -0
  53. package/host/to-vercel-tools.ts +70 -0
  54. package/package.json +15 -1
  55. package/sdk/__snapshots__/exports.test.ts.snap +1 -0
  56. package/sdk/_internal-types.ts +16 -0
  57. package/sdk/define.test-d.ts +21 -0
  58. package/sdk/define.test.ts +33 -0
  59. package/sdk/define.ts +21 -0
  60. package/sdk/manifest.test-d.ts +14 -0
  61. package/sdk/manifest.test.ts +51 -0
  62. package/sdk/manifest.ts +39 -0
  63. package/sdk/providers.ts +90 -0
  64. package/sdk/types.ts +16 -0
  65. package/vitest.config.ts +1 -0
package/.turbo/turbo-build.log CHANGED
@@ -1,20 +1,22 @@
1
1
 
2
- > @alexkroman1/aai@1.2.3 build /home/runner/work/agent/agent/packages/aai
2
+ > @alexkroman1/aai@1.3.0 build /home/runner/work/agent/agent/packages/aai
3
3
  > tsdown && tsc -p tsconfig.build.json
4
4
 
5
5
  ℹ tsdown v0.21.7 powered by rolldown v1.0.0-rc.12
6
6
  ℹ config file: /home/runner/work/agent/agent/packages/aai/tsdown.config.ts
7
- ℹ entry: index.ts, sdk/protocol.ts, host/runtime-barrel.ts, sdk/manifest-barrel.ts
7
+ ℹ entry: index.ts, sdk/protocol.ts, host/runtime-barrel.ts, sdk/manifest-barrel.ts, host/providers/stt-barrel.ts, host/providers/tts-barrel.ts
8
8
  ℹ target: node22
9
9
  ℹ tsconfig: tsconfig.json
10
10
  ℹ Build start
11
- ℹ dist/host/runtime-barrel.js 50.02 kB │ gzip: 15.70 kB
12
- ℹ dist/index.js  6.38 kB │ gzip: 2.49 kB
13
- ℹ dist/sdk/protocol.js  4.75 kB │ gzip: 1.76 kB
14
- ℹ dist/sdk/manifest-barrel.js  0.26 kB │ gzip: 0.17 kB
15
- ℹ dist/constants-VTFoymJ-.js  2.75 kB │ gzip: 1.23 kB
16
- ℹ dist/_internal-types-CoDTiBd1.js  2.33 kB │ gzip: 0.99 kB
17
- ℹ dist/types-Cfx_4QDK.js  1.74 kB │ gzip: 0.93 kB
18
- ℹ dist/ws-upgrade-BeOQ7fXL.js  1.14 kB │ gzip: 0.54 kB
19
- ℹ 8 files, total: 69.36 kB
20
- ✔ Build complete in 47ms
11
+ ℹ dist/host/runtime-barrel.js 61.50 kB │ gzip: 18.59 kB
12
+ ℹ dist/index.js  6.62 kB │ gzip: 2.63 kB
13
+ ℹ dist/host/providers/tts-barrel.js  5.52 kB │ gzip: 2.12 kB
14
+ ℹ dist/sdk/protocol.js  4.75 kB │ gzip: 1.76 kB
15
+ ℹ dist/host/providers/stt-barrel.js  3.08 kB │ gzip: 1.26 kB
16
+ ℹ dist/sdk/manifest-barrel.js  0.26 kB │ gzip: 0.17 kB
17
+ ℹ dist/constants-VTFoymJ-.js  2.75 kB │ gzip: 1.23 kB
18
+ ℹ dist/_internal-types-CoDTiBd1.js  2.33 kB │ gzip: 0.99 kB
19
+ ℹ dist/types-Cfx_4QDK.js  1.74 kB │ gzip: 0.93 kB
20
+ ℹ dist/ws-upgrade-BeOQ7fXL.js  1.14 kB │ gzip: 0.54 kB
21
+ ℹ 10 files, total: 89.68 kB
22
+ ✔ Build complete in 46ms
package/CHANGELOG.md CHANGED
@@ -1,5 +1,19 @@
1
1
  # @alexkroman1/aai
2
2
 
3
+ ## 1.3.0
4
+
5
+ ### Minor Changes
6
+
7
+ - f1a9764: Internal: manifests now classify session mode (`s2s` | `pipeline`) at parse time, and expose optional `stt`, `llm`, and `tts` fields on the `Manifest` type. Groundwork for upcoming pluggable provider support — no user-visible behavior change yet.
8
+
9
+ ### Patch Changes
10
+
11
+ - c95212a: Fix runtime crash when loading the host runtime without the provider SDKs installed. `ai`, `assemblyai`, and `@cartesia/cartesia-js` are now regular dependencies instead of optional peer dependencies — the runtime eagerly imports `pipeline-session.ts`, so they were already required at module load even for S2S-mode agents. Optional peer deps described a design the code didn't enforce; now the metadata matches behavior.
12
+ - f1a9764: Fix PipelineSession: thread agentConfig.maxSteps into streamText via stopWhen: stepCountIs(n). Vercel AI SDK v6 defaults to a single step, so multi-step tool use would silently terminate after the first tool-result.
13
+ - f1a9764: agent() helper accepts stt/llm/tts fields directly, removing the need for the spread workaround in pipeline-mode agents
14
+ - 0231114: Simplify pipeline-session state management and parallelize provider open. Removes redundant PipelineState variable (equivalent to turnController != null), opens STT+TTS concurrently via Promise.allSettled (halves session-start latency), and cleans up either session if one open fails or the session aborts mid-open.
15
+ - 8a79282: Add sendAudioRaw to S2sHandle for batch-encoded audio frames
16
+
3
17
  ## 1.2.3
4
18
 
5
19
  ### Patch Changes
package/dist/host/_pipeline-test-fakes.d.ts ADDED
@@ -0,0 +1,107 @@
1
+ /**
2
+ * In-memory fake providers + fake `LanguageModel` for pipeline-session tests.
3
+ *
4
+ * These fakes do not touch the network. Each `createFake*Provider()` returns a
5
+ * provider whose `open()` records the most recently opened session so tests
6
+ * can reach into it via `.last()` and drive events (partial/final transcripts,
7
+ * TTS chunks) or observe calls (`sendText`, `flush`, `cancel`).
8
+ *
9
+ * The fake `LanguageModel` implements the minimum of {@link LanguageModelV3}
10
+ * required by `streamText` — `doStream()` returns a `ReadableStream` of
11
+ * {@link LanguageModelV3StreamPart}s produced from a scripted sequence.
12
+ *
13
+ * @internal Not part of the public API.
14
+ */
15
+ import type { LanguageModel } from "ai";
16
+ import { type Emitter } from "nanoevents";
17
+ import { vi } from "vitest";
18
+ import type { SttEvents, SttOpenOptions, SttProvider, SttSession, TtsEvents, TtsOpenOptions, TtsProvider, TtsSession } from "../sdk/providers.ts";
19
+ export type FakeSttSession = SttSession & {
20
+ readonly emitter: Emitter<SttEvents>;
21
+ readonly opts: SttOpenOptions;
22
+ readonly audioFrames: Int16Array[];
23
+ readonly closed: {
24
+ value: boolean;
25
+ };
26
+ firePartial(text: string): void;
27
+ fireFinal(text: string): void;
28
+ fireError(code: "stt_stream_error" | "stt_connect_failed" | "stt_auth_failed", message: string): void;
29
+ };
30
+ export type FakeSttProvider = SttProvider & {
31
+ /** The most recently opened session, or undefined if `open()` hasn't been called. */
32
+ last(): FakeSttSession | undefined;
33
+ readonly sessions: FakeSttSession[];
34
+ };
35
+ export declare function createFakeSttProvider(): FakeSttProvider;
36
+ export type FakeTtsSession = TtsSession & {
37
+ readonly emitter: Emitter<TtsEvents>;
38
+ readonly opts: TtsOpenOptions;
39
+ readonly textChunks: string[];
40
+ readonly closed: {
41
+ value: boolean;
42
+ };
43
+ readonly sendText: ReturnType<typeof vi.fn<(text: string) => void>>;
44
+ readonly flush: ReturnType<typeof vi.fn<() => void>>;
45
+ readonly cancel: ReturnType<typeof vi.fn<() => void>>;
46
+ fireAudio(pcm: Int16Array): void;
47
+ fireError(code: "tts_stream_error" | "tts_connect_failed" | "tts_auth_failed", message: string): void;
48
+ };
49
+ export type FakeTtsProvider = TtsProvider & {
50
+ /** The most recently opened session, or undefined if `open()` hasn't been called. */
51
+ last(): FakeTtsSession | undefined;
52
+ readonly sessions: FakeTtsSession[];
53
+ };
54
+ /**
55
+ * Fake TTS provider. By default, `flush()` synchronously emits a single `done`
56
+ * event so tests don't have to script the drain separately. Pass
57
+ * `{ autoDoneOnFlush: false }` to drive `done` manually.
58
+ */
59
+ export declare function createFakeTtsProvider(options?: {
60
+ autoDoneOnFlush?: boolean;
61
+ }): FakeTtsProvider;
62
+ /**
63
+ * A scripted stream part. `text` yields a `text-delta` in the LLM provider's
64
+ * raw wire format; `tool-call` / `tool-result` emit the corresponding parts
65
+ * (v3 provider spec: `toolCallId`, `toolName`, `input` as JSON string for
66
+ * calls, `result` as JSON-serialisable value for results).
67
+ */
68
+ export type ScriptedPart = {
69
+ type: "text";
70
+ text: string;
71
+ } | {
72
+ type: "tool-call";
73
+ toolCallId: string;
74
+ toolName: string;
75
+ input: string;
76
+ } | {
77
+ type: "tool-result";
78
+ toolCallId: string;
79
+ toolName: string;
80
+ result: unknown;
81
+ } | {
82
+ type: "error";
83
+ error: unknown;
84
+ };
85
+ /**
86
+ * Create a fake {@link LanguageModel} that yields a scripted sequence of
87
+ * parts when `streamText` drives `doStream()`. The fake ignores the prompt
88
+ * and tools — it simply replays the script.
89
+ *
90
+ * Pass `{ delayMs: N }` to space out parts with `setTimeout(N)` so that
91
+ * barge-in tests can abort mid-stream deterministically.
92
+ *
93
+ * Pass `{ steps: ScriptedPart[][] }` (instead of `script`) for multi-step
94
+ * scenarios: each call to `doStream()` consumes the next step's parts.
95
+ * This is how `streamText` drives multi-turn tool loops under `stopWhen`.
96
+ *
97
+ * The returned value is cast to the `LanguageModel` union because we
98
+ * implement the provider shape structurally rather than importing the
99
+ * full `@ai-sdk/provider` types into the aai package.
100
+ */
101
+ export declare function createFakeLanguageModel(options: {
102
+ script: ScriptedPart[];
103
+ delayMs?: number;
104
+ } | {
105
+ steps: ScriptedPart[][];
106
+ delayMs?: number;
107
+ }): LanguageModel;
package/dist/host/pipeline-session-ctx.d.ts ADDED
@@ -0,0 +1,24 @@
1
+ /** Pipeline session context — base ctx + STT/TTS session slots. */
2
+ import type { AgentConfig, ExecuteTool } from "../sdk/_internal-types.ts";
3
+ import type { ClientSink } from "../sdk/protocol.ts";
4
+ import type { SttSession, TtsSession } from "../sdk/providers.ts";
5
+ import type { Logger } from "./runtime-config.ts";
6
+ import { type BaseSessionCtx } from "./session-ctx.ts";
7
+ /**
8
+ * Pipeline session context — {@link BaseSessionCtx} plus STT/TTS provider
9
+ * session handles. Replaces the S2S `s2s` field with decoupled `stt` + `tts`
10
+ * slots so the pipeline orchestrator can drive independent providers.
11
+ */
12
+ export type PipelineSessionCtx = BaseSessionCtx & {
13
+ stt: SttSession | null;
14
+ tts: TtsSession | null;
15
+ };
16
+ export declare function buildPipelineCtx(opts: {
17
+ id: string;
18
+ agent: string;
19
+ client: ClientSink;
20
+ agentConfig: AgentConfig;
21
+ executeTool: ExecuteTool;
22
+ log: Logger;
23
+ maxHistory?: number | undefined;
24
+ }): PipelineSessionCtx;
package/dist/host/pipeline-session.d.ts ADDED
@@ -0,0 +1,48 @@
1
+ /**
2
+ * Pipeline session — pluggable STT → LLM → TTS orchestrator.
3
+ *
4
+ * Alternative to the S2S session (see `session.ts`) that drives three
5
+ * independent providers. A new partial STT event while the agent is replying
6
+ * triggers barge-in (aborts the LLM stream and cancels TTS).
7
+ */
8
+ import type { LanguageModel } from "ai";
9
+ import type { AgentConfig, ExecuteTool, ToolSchema } from "../sdk/_internal-types.ts";
10
+ import type { ClientSink } from "../sdk/protocol.ts";
11
+ import type { SttProvider, TtsProvider } from "../sdk/providers.ts";
12
+ import { type Logger } from "./runtime-config.ts";
13
+ import type { Session } from "./session.ts";
14
+ /** Configuration options for {@link createPipelineSession}. */
15
+ export interface PipelineSessionOptions {
16
+ /** Unique session identifier. */
17
+ id: string;
18
+ /** Agent slug. */
19
+ agent: string;
20
+ /** Sink for wire events + audio back to the browser client. */
21
+ client: ClientSink;
22
+ /** Serializable agent config (name, system prompt, maxSteps, etc.). */
23
+ agentConfig: AgentConfig;
24
+ /** JSON Schema definitions for the agent's tools. */
25
+ toolSchemas: readonly ToolSchema[];
26
+ /** Optional natural-language guidance appended to the system prompt. */
27
+ toolGuidance?: readonly string[] | undefined;
28
+ /** Function to invoke tools by name. */
29
+ executeTool: ExecuteTool;
30
+ /** STT provider (injected via manifest in pipeline mode). */
31
+ stt: SttProvider;
32
+ /** LLM provider (Vercel AI SDK `LanguageModel`). */
33
+ llm: LanguageModel;
34
+ /** TTS provider (injected via manifest in pipeline mode). */
35
+ tts: TtsProvider;
36
+ /** STT API key. */
37
+ sttApiKey: string;
38
+ /** TTS API key. */
39
+ ttsApiKey: string;
40
+ /** Audio sample rate (PCM16, Hz). Defaults to {@link DEFAULT_STT_SAMPLE_RATE}. */
41
+ sampleRate?: number | undefined;
42
+ /** Logger. Defaults to the console logger. */
43
+ logger?: Logger | undefined;
44
+ /** Sliding-window conversation history size. */
45
+ maxHistory?: number | undefined;
46
+ }
47
+ /** Create a pluggable-provider voice session. */
48
+ export declare function createPipelineSession(opts: PipelineSessionOptions): Session;
package/dist/host/providers/llm.d.ts ADDED
@@ -0,0 +1,2 @@
1
+ /** LLM provider type — re-exported from sdk/ for host-side consumption. */
2
+ export type * from "../../sdk/providers.ts";
package/dist/host/providers/stt/assemblyai.d.ts ADDED
@@ -0,0 +1,31 @@
1
+ /**
2
+ * AssemblyAI Universal-Streaming STT adapter.
3
+ *
4
+ * Wraps the `assemblyai` Node SDK's {@link StreamingTranscriber} and
5
+ * normalizes its event surface onto the {@link SttProvider} /
6
+ * {@link SttEvents} contract consumed by the pipeline orchestrator.
7
+ *
8
+ * Default model: `"u3pro-rt"` (Universal-3 Pro Real-Time). The adapter
9
+ * maps that to the SDK's `"u3-rt-pro"` `speechModel` value; any other
10
+ * string is forwarded verbatim.
11
+ */
12
+ import { type StreamingTranscriber } from "assemblyai";
13
+ import type { SttProvider, SttSession } from "../../../sdk/providers.ts";
14
+ export interface AssemblyAIOptions {
15
+ /**
16
+ * Streaming speech model. Defaults to `"u3pro-rt"` (Universal-3 Pro
17
+ * Real-Time). Arbitrary strings are forwarded to the SDK unchanged.
18
+ */
19
+ model?: "u3pro-rt" | string;
20
+ /**
21
+ * AssemblyAI API key. Falls back to `SttOpenOptions.apiKey`, then
22
+ * `process.env.ASSEMBLYAI_API_KEY`.
23
+ */
24
+ apiKey?: string;
25
+ }
26
+ /** Internal: SttSession with a test-only handle to the raw SDK transcriber. */
27
+ export interface AssemblyAISession extends SttSession {
28
+ /** @internal Test-only: exposes the underlying SDK transcriber for fixture replay. */
29
+ readonly _transcriber: StreamingTranscriber;
30
+ }
31
+ export declare function assemblyAI(opts?: AssemblyAIOptions): SttProvider;
package/dist/host/providers/stt-barrel.d.ts ADDED
@@ -0,0 +1,8 @@
1
+ /**
2
+ * `@alexkroman1/aai/stt` subpath barrel. Re-exports the STT provider
3
+ * contract types (via `stt.ts` → `sdk/providers.ts`) alongside the
4
+ * concrete AssemblyAI adapter factory. Task 9 owns wiring this file
5
+ * into `package.json` exports.
6
+ */
7
+ export * from "./stt/assemblyai.ts";
8
+ export type * from "./stt.ts";
package/dist/host/providers/stt-barrel.js ADDED
@@ -0,0 +1,92 @@
1
+ import { createNanoEvents } from "nanoevents";
2
+ import { AssemblyAI } from "assemblyai";
3
+ //#region host/providers/stt/assemblyai.ts
4
+ /**
5
+ * AssemblyAI Universal-Streaming STT adapter.
6
+ *
7
+ * Wraps the `assemblyai` Node SDK's {@link StreamingTranscriber} and
8
+ * normalizes its event surface onto the {@link SttProvider} /
9
+ * {@link SttEvents} contract consumed by the pipeline orchestrator.
10
+ *
11
+ * Default model: `"u3pro-rt"` (Universal-3 Pro Real-Time). The adapter
12
+ * maps that to the SDK's `"u3-rt-pro"` `speechModel` value; any other
13
+ * string is forwarded verbatim.
14
+ */
15
+ /** Translate the adapter's model alias to the SDK's `speechModel` value. */
16
+ function resolveSpeechModel(model) {
17
+ if (model === "u3pro-rt") return "u3-rt-pro";
18
+ return model;
19
+ }
20
+ function makeError(message) {
21
+ const err = new Error(message);
22
+ err.code = "stt_stream_error";
23
+ return err;
24
+ }
25
+ function assemblyAI(opts = {}) {
26
+ return {
27
+ name: "assemblyai",
28
+ async open(openOpts) {
29
+ const apiKey = opts.apiKey ?? openOpts.apiKey ?? process.env.ASSEMBLYAI_API_KEY;
30
+ if (!apiKey) {
31
+ const err = /* @__PURE__ */ new Error("AssemblyAI STT adapter: missing API key. Provide via the factory option, SttOpenOptions, or the ASSEMBLYAI_API_KEY environment variable.");
32
+ err.code = "stt_auth_failed";
33
+ throw err;
34
+ }
35
+ const client = new AssemblyAI({ apiKey });
36
+ const speechModel = resolveSpeechModel(opts.model ?? "u3pro-rt");
37
+ const transcriber = client.streaming.transcriber({
38
+ sampleRate: openOpts.sampleRate,
39
+ speechModel,
40
+ ...openOpts.sttPrompt ? { prompt: openOpts.sttPrompt } : {}
41
+ });
42
+ const emitter = createNanoEvents();
43
+ let closed = false;
44
+ transcriber.on("turn", (event) => {
45
+ if (closed) return;
46
+ const text = event.transcript ?? "";
47
+ if (event.end_of_turn) {
48
+ if (text.length > 0) emitter.emit("final", text);
49
+ } else if (text.length > 0) emitter.emit("partial", text);
50
+ });
51
+ transcriber.on("error", (err) => {
52
+ if (closed) return;
53
+ emitter.emit("error", makeError(err?.message ?? String(err)));
54
+ });
55
+ transcriber.on("close", (code) => {
56
+ if (closed) return;
57
+ if (code !== 1e3) emitter.emit("error", makeError(`socket closed ${code}`));
58
+ });
59
+ try {
60
+ await transcriber.connect();
61
+ } catch (cause) {
62
+ const err = /* @__PURE__ */ new Error(`AssemblyAI STT: connect failed: ${cause instanceof Error ? cause.message : String(cause)}`);
63
+ err.code = "stt_connect_failed";
64
+ throw err;
65
+ }
66
+ const close = async () => {
67
+ if (closed) return;
68
+ closed = true;
69
+ try {
70
+ await transcriber.close();
71
+ } catch {}
72
+ };
73
+ if (openOpts.signal.aborted) close();
74
+ else openOpts.signal.addEventListener("abort", () => void close(), { once: true });
75
+ return {
76
+ sendAudio(pcm) {
77
+ if (closed) return;
78
+ const copy = new Uint8Array(pcm.byteLength);
79
+ copy.set(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
80
+ transcriber.sendAudio(copy.buffer);
81
+ },
82
+ on(event, fn) {
83
+ return emitter.on(event, fn);
84
+ },
85
+ close,
86
+ _transcriber: transcriber
87
+ };
88
+ }
89
+ };
90
+ }
91
+ //#endregion
92
+ export { assemblyAI };
package/dist/host/providers/stt.d.ts ADDED
@@ -0,0 +1,2 @@
1
+ /** STT provider interface — re-exported from sdk/ for host-side consumption. */
2
+ export type * from "../../sdk/providers.ts";
package/dist/host/providers/tts/cartesia.d.ts ADDED
@@ -0,0 +1,39 @@
1
+ /**
2
+ * Cartesia TTS adapter — streaming WebSocket with per-turn `context_id`.
3
+ *
4
+ * Wraps `@cartesia/cartesia-js`'s `TTSWS` / `TTSWSContext` and normalizes it
5
+ * onto the {@link TtsProvider} / {@link TtsEvents} contract consumed by the
6
+ * pipeline orchestrator.
7
+ *
8
+ * **Per-turn context lifecycle.** Each `sendText(...)` within the same turn
9
+ * appends to the same Cartesia context. On `flush()` or `cancel()`, a new
10
+ * context is minted for the next turn — so concurrent `cancel({ contextId })`
11
+ * only targets the in-flight turn, never the one that follows.
12
+ *
13
+ * **Audio format.** The adapter requests `raw` / `pcm_s16le` at the
14
+ * negotiated `sampleRate` so it can forward chunks as `Int16Array` with no
15
+ * conversion.
16
+ */
17
+ import type { TTSWS } from "@cartesia/cartesia-js/resources/tts";
18
+ import type { TtsProvider, TtsSession } from "../../../sdk/providers.ts";
19
+ export interface CartesiaOptions {
20
+ /** Cartesia voice ID. Required. */
21
+ voice: string;
22
+ /** Model ID. Defaults to `"sonic-2"`. */
23
+ model?: string;
24
+ /**
25
+ * Cartesia API key. Falls back to `TtsOpenOptions.apiKey`, then
26
+ * `process.env.CARTESIA_API_KEY`.
27
+ */
28
+ apiKey?: string;
29
+ /** Spoken language hint. Defaults to `"en"`. */
30
+ language?: string;
31
+ }
32
+ /** Internal: TtsSession with a test-only handle to the raw SDK socket. */
33
+ export interface CartesiaSession extends TtsSession {
34
+ /** @internal Test-only: exposes the underlying SDK WebSocket wrapper. */
35
+ readonly _ws: TTSWS;
36
+ /** @internal Test-only: id of the currently-active context. */
37
+ readonly _currentContextId: () => string;
38
+ }
39
+ export declare function cartesia(opts: CartesiaOptions): TtsProvider;
package/dist/host/providers/tts-barrel.d.ts ADDED
@@ -0,0 +1,8 @@
1
+ /**
2
+ * `@alexkroman1/aai/tts` subpath barrel. Re-exports the TTS provider
3
+ * contract types (via `tts.ts` → `sdk/providers.ts`) alongside the
4
+ * concrete Cartesia adapter factory. Task 9 owns wiring this file
5
+ * into `package.json` exports.
6
+ */
7
+ export * from "./tts/cartesia.ts";
8
+ export type * from "./tts.ts";
package/dist/host/providers/tts-barrel.js ADDED
@@ -0,0 +1,182 @@
1
+ import { createNanoEvents } from "nanoevents";
2
+ import { randomUUID } from "node:crypto";
3
+ import { Cartesia } from "@cartesia/cartesia-js";
4
+ //#region host/providers/tts/cartesia.ts
5
+ /**
6
+ * Cartesia TTS adapter — streaming WebSocket with per-turn `context_id`.
7
+ *
8
+ * Wraps `@cartesia/cartesia-js`'s `TTSWS` / `TTSWSContext` and normalizes it
9
+ * onto the {@link TtsProvider} / {@link TtsEvents} contract consumed by the
10
+ * pipeline orchestrator.
11
+ *
12
+ * **Per-turn context lifecycle.** Each `sendText(...)` within the same turn
13
+ * appends to the same Cartesia context. On `flush()` or `cancel()`, a new
14
+ * context is minted for the next turn — so concurrent `cancel({ contextId })`
15
+ * only targets the in-flight turn, never the one that follows.
16
+ *
17
+ * **Audio format.** The adapter requests `raw` / `pcm_s16le` at the
18
+ * negotiated `sampleRate` so it can forward chunks as `Int16Array` with no
19
+ * conversion.
20
+ */
21
+ function makeError(message) {
22
+ const err = new Error(message);
23
+ err.code = "tts_stream_error";
24
+ return err;
25
+ }
26
+ /** PCM16 sample rates supported by Cartesia's `raw` output format. */
27
+ const CARTESIA_PCM16_RATES = [
28
+ 8e3,
29
+ 16e3,
30
+ 22050,
31
+ 24e3,
32
+ 44100,
33
+ 48e3
34
+ ];
35
+ function assertSupportedSampleRate(rate) {
36
+ if (CARTESIA_PCM16_RATES.includes(rate)) return rate;
37
+ const err = /* @__PURE__ */ new Error(`Cartesia TTS adapter: unsupported sample rate ${rate}. Supported: ${CARTESIA_PCM16_RATES.join(", ")}.`);
38
+ err.code = "tts_connect_failed";
39
+ throw err;
40
+ }
41
+ function cartesia(opts) {
42
+ return {
43
+ name: "cartesia",
44
+ async open(openOpts) {
45
+ const apiKey = opts.apiKey ?? openOpts.apiKey ?? process.env.CARTESIA_API_KEY;
46
+ if (!apiKey) {
47
+ const err = /* @__PURE__ */ new Error("Cartesia TTS adapter: missing API key. Provide via the factory option, TtsOpenOptions, or the CARTESIA_API_KEY environment variable.");
48
+ err.code = "tts_auth_failed";
49
+ throw err;
50
+ }
51
+ const sampleRate = assertSupportedSampleRate(openOpts.sampleRate);
52
+ const model = opts.model ?? "sonic-2";
53
+ const language = opts.language ?? "en";
54
+ const client = new Cartesia({ apiKey });
55
+ let ws;
56
+ try {
57
+ ws = await client.tts.websocket();
58
+ } catch (cause) {
59
+ const err = /* @__PURE__ */ new Error(`Cartesia TTS: connect failed: ${cause instanceof Error ? cause.message : String(cause)}`);
60
+ err.code = "tts_connect_failed";
61
+ throw err;
62
+ }
63
+ const emitter = createNanoEvents();
64
+ let closed = false;
65
+ /** Mint a fresh context bound to the shared TTSWS connection. */
66
+ const mintContext = () => ws.context({
67
+ model_id: model,
68
+ voice: {
69
+ mode: "id",
70
+ id: opts.voice
71
+ },
72
+ output_format: {
73
+ container: "raw",
74
+ encoding: "pcm_s16le",
75
+ sample_rate: sampleRate
76
+ },
77
+ contextId: randomUUID()
78
+ });
79
+ let context = mintContext();
80
+ /**
81
+ * `doneEmitted` guards against emitting `done` more than once per turn.
82
+ * Reset whenever a fresh context is minted (i.e. at turn boundaries).
83
+ */
84
+ let doneEmitted = false;
85
+ const rotateContext = () => {
86
+ context = mintContext();
87
+ doneEmitted = false;
88
+ };
89
+ const emitDoneOnce = () => {
90
+ if (doneEmitted || closed) return;
91
+ doneEmitted = true;
92
+ emitter.emit("done");
93
+ };
94
+ ws.on("chunk", (event) => {
95
+ if (closed) return;
96
+ if (event.context_id !== context.contextId) return;
97
+ const buf = event.audio;
98
+ if (!buf || buf.byteLength === 0) return;
99
+ const evenBytes = buf.byteLength - buf.byteLength % 2;
100
+ if (evenBytes === 0) return;
101
+ const pcm = new Int16Array(buf.buffer.slice(buf.byteOffset, buf.byteOffset + evenBytes));
102
+ emitter.emit("audio", pcm);
103
+ });
104
+ ws.on("done", (event) => {
105
+ if (closed) return;
106
+ if (event.context_id !== context.contextId) return;
107
+ emitDoneOnce();
108
+ });
109
+ ws.on("error", (err) => {
110
+ if (closed) return;
111
+ emitter.emit("error", makeError(err?.message ?? String(err)));
112
+ });
113
+ const close = async () => {
114
+ if (closed) return;
115
+ closed = true;
116
+ try {
117
+ ws.close({
118
+ code: 1e3,
119
+ reason: "client close"
120
+ });
121
+ } catch {}
122
+ };
123
+ if (openOpts.signal.aborted) close();
124
+ else openOpts.signal.addEventListener("abort", () => void close(), { once: true });
125
+ /** Static part of each generation request; only `transcript` and
126
+ * `continue` vary per send. Pinned here so `language` threads through. */
127
+ const baseRequest = {
128
+ model_id: model,
129
+ voice: {
130
+ mode: "id",
131
+ id: opts.voice
132
+ },
133
+ output_format: {
134
+ container: "raw",
135
+ encoding: "pcm_s16le",
136
+ sample_rate: sampleRate
137
+ },
138
+ language
139
+ };
140
+ /**
141
+ * Swallow rejections from async SDK calls — the global `error`
142
+ * listener on `ws` emits a normalized {@link TtsError}, so there's
143
+ * nothing useful for the caller to do with per-send failures.
144
+ */
145
+ const ignoreRejection = (_err) => {};
146
+ return {
147
+ sendText(text) {
148
+ if (closed || text.length === 0) return;
149
+ context.send({
150
+ ...baseRequest,
151
+ transcript: text,
152
+ continue: true
153
+ }).catch(ignoreRejection);
154
+ },
155
+ flush() {
156
+ if (closed) return;
157
+ context.send({
158
+ ...baseRequest,
159
+ transcript: "",
160
+ continue: false
161
+ }).catch(ignoreRejection);
162
+ queueMicrotask(emitDoneOnce);
163
+ rotateContext();
164
+ },
165
+ cancel() {
166
+ if (closed) return;
167
+ context.cancel().catch(ignoreRejection);
168
+ emitDoneOnce();
169
+ rotateContext();
170
+ },
171
+ on(event, fn) {
172
+ return emitter.on(event, fn);
173
+ },
174
+ close,
175
+ _ws: ws,
176
+ _currentContextId: () => context.contextId
177
+ };
178
+ }
179
+ };
180
+ }
181
+ //#endregion
182
+ export { cartesia };
package/dist/host/providers/tts.d.ts ADDED
@@ -0,0 +1,2 @@
1
+ /** TTS provider interface — re-exported from sdk/ for host-side consumption. */
2
+ export type * from "../../sdk/providers.ts";