@alexkroman1/aai 1.3.2 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +18 -14
- package/CHANGELOG.md +8 -0
- package/dist/_internal-types-3p3OJZPb.js +145 -0
- package/dist/anthropic-BrUCPKUc.js +10 -0
- package/dist/assemblyai-Cxg9eobY.js +18 -0
- package/dist/cartesia-DwDk2tEu.js +10 -0
- package/dist/host/_pipeline-test-fakes.d.ts +5 -5
- package/dist/host/pipeline-session.d.ts +5 -5
- package/dist/host/providers/resolve.d.ts +34 -0
- package/dist/host/providers/stt/assemblyai.d.ts +9 -18
- package/dist/host/providers/tts/cartesia.d.ts +11 -18
- package/dist/host/runtime-barrel.js +348 -42
- package/dist/host/runtime.d.ts +13 -9
- package/dist/index.js +2 -91
- package/dist/sdk/_internal-types.d.ts +27 -1
- package/dist/sdk/manifest-barrel.d.ts +2 -0
- package/dist/sdk/manifest-barrel.js +2 -2
- package/dist/sdk/manifest.d.ts +13 -2
- package/dist/sdk/protocol.d.ts +3 -3
- package/dist/sdk/protocol.js +1 -1
- package/dist/sdk/providers/llm/anthropic.d.ts +23 -0
- package/dist/sdk/providers/llm-barrel.d.ts +9 -0
- package/dist/sdk/providers/llm-barrel.js +2 -0
- package/dist/sdk/providers/stt/assemblyai.d.ts +30 -0
- package/dist/sdk/providers/stt-barrel.d.ts +9 -0
- package/dist/sdk/providers/stt-barrel.js +2 -0
- package/dist/sdk/providers/tts/cartesia.d.ts +23 -0
- package/dist/sdk/providers/tts-barrel.d.ts +9 -0
- package/dist/sdk/providers/tts-barrel.js +2 -0
- package/dist/sdk/providers.d.ts +59 -11
- package/dist/types-KUgezM6u.js +128 -0
- package/host/_pipeline-test-fakes.ts +6 -6
- package/host/integration/pipeline-reference.integration.test.ts +4 -4
- package/host/pipeline-session.ts +6 -6
- package/host/providers/providers.test-d.ts +19 -10
- package/host/providers/resolve.ts +90 -0
- package/host/providers/stt/assemblyai.test.ts +2 -2
- package/host/providers/stt/assemblyai.ts +25 -47
- package/host/providers/tts/cartesia.test.ts +2 -2
- package/host/providers/tts/cartesia.ts +43 -73
- package/host/runtime.ts +66 -39
- package/package.json +13 -7
- package/sdk/__snapshots__/exports.test.ts.snap +2 -0
- package/sdk/__snapshots__/schema-shapes.test.ts.snap +4 -0
- package/sdk/_internal-types.ts +28 -1
- package/sdk/define.test.ts +12 -10
- package/sdk/manifest-barrel.ts +2 -0
- package/sdk/manifest.test.ts +6 -3
- package/sdk/manifest.ts +26 -18
- package/sdk/providers/llm/anthropic.ts +31 -0
- package/sdk/providers/llm-barrel.ts +12 -0
- package/sdk/providers/stt/assemblyai.ts +38 -0
- package/sdk/providers/stt-barrel.ts +12 -0
- package/sdk/providers/tts/cartesia.ts +31 -0
- package/sdk/providers/tts-barrel.ts +12 -0
- package/sdk/providers.ts +81 -17
- package/dist/_internal-types-CoDTiBd1.js +0 -61
- package/dist/host/providers/llm.d.ts +0 -2
- package/dist/host/providers/stt-barrel.d.ts +0 -8
- package/dist/host/providers/stt-barrel.js +0 -92
- package/dist/host/providers/stt.d.ts +0 -2
- package/dist/host/providers/tts-barrel.d.ts +0 -8
- package/dist/host/providers/tts-barrel.js +0 -182
- package/dist/host/providers/tts.d.ts +0 -2
- package/dist/types-Cfx_4QDK.js +0 -39
- package/host/providers/llm.ts +0 -3
- package/host/providers/stt-barrel.ts +0 -13
- package/host/providers/stt.ts +0 -3
- package/host/providers/tts-barrel.ts +0 -13
- package/host/providers/tts.ts +0 -3
- /package/dist/{constants-BL3nvg4I.js → constants-C2nirZUI.js} +0 -0
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
// Copyright 2025 the AAI authors. MIT license.
|
|
2
|
+
/**
|
|
3
|
+
* Descriptor → concrete-provider resolution (host-only).
|
|
4
|
+
*
|
|
5
|
+
* User code (and the server, after extracting config from a bundled agent)
|
|
6
|
+
* holds `SttProvider` / `LlmProvider` / `TtsProvider` **descriptors** —
|
|
7
|
+
* plain `{ kind, options }` data. At session start the runtime calls the
|
|
8
|
+
* resolvers here to turn each descriptor into its openable / callable
|
|
9
|
+
* host-side counterpart, importing the third-party SDK only at that point.
|
|
10
|
+
*
|
|
11
|
+
* The guest sandbox never imports these functions, which is how the agent
|
|
12
|
+
* bundle stays free of `@ai-sdk/anthropic` / `assemblyai` /
|
|
13
|
+
* `@cartesia/cartesia-js`.
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { createAnthropic } from "@ai-sdk/anthropic";
|
|
17
|
+
import type { LanguageModel } from "ai";
|
|
18
|
+
import { ANTHROPIC_KIND, type AnthropicOptions } from "../../sdk/providers/llm/anthropic.ts";
|
|
19
|
+
import { ASSEMBLYAI_KIND, type AssemblyAIOptions } from "../../sdk/providers/stt/assemblyai.ts";
|
|
20
|
+
import { CARTESIA_KIND, type CartesiaOptions } from "../../sdk/providers/tts/cartesia.ts";
|
|
21
|
+
import type {
|
|
22
|
+
LlmProvider,
|
|
23
|
+
SttOpener,
|
|
24
|
+
SttProvider,
|
|
25
|
+
TtsOpener,
|
|
26
|
+
TtsProvider,
|
|
27
|
+
} from "../../sdk/providers.ts";
|
|
28
|
+
import { openAssemblyAI } from "./stt/assemblyai.ts";
|
|
29
|
+
import { openCartesia } from "./tts/cartesia.ts";
|
|
30
|
+
|
|
31
|
+
/**
|
|
32
|
+
* Look up a provider API key: agent env first (set via `aai secret put` or
|
|
33
|
+
* `.env`), then the host's `process.env` as a fallback for self-hosted mode.
|
|
34
|
+
* Returns `""` if neither has it — the caller decides whether that's fatal.
|
|
35
|
+
*/
|
|
36
|
+
export function resolveApiKey(envVar: string, env: Record<string, string>): string {
|
|
37
|
+
return env[envVar] ?? process.env[envVar] ?? "";
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
/** Resolve an {@link SttProvider} descriptor into a host-side opener. */
|
|
41
|
+
export function resolveStt(descriptor: SttProvider): SttOpener {
|
|
42
|
+
switch (descriptor.kind) {
|
|
43
|
+
case ASSEMBLYAI_KIND:
|
|
44
|
+
return openAssemblyAI(descriptor.options as unknown as AssemblyAIOptions);
|
|
45
|
+
default:
|
|
46
|
+
throw new Error(
|
|
47
|
+
`Unknown STT provider kind: "${descriptor.kind}". Supported: ${ASSEMBLYAI_KIND}.`,
|
|
48
|
+
);
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
/** Resolve a {@link TtsProvider} descriptor into a host-side opener. */
|
|
53
|
+
export function resolveTts(descriptor: TtsProvider): TtsOpener {
|
|
54
|
+
switch (descriptor.kind) {
|
|
55
|
+
case CARTESIA_KIND:
|
|
56
|
+
return openCartesia(descriptor.options as unknown as CartesiaOptions);
|
|
57
|
+
default:
|
|
58
|
+
throw new Error(
|
|
59
|
+
`Unknown TTS provider kind: "${descriptor.kind}". Supported: ${CARTESIA_KIND}.`,
|
|
60
|
+
);
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
/**
|
|
65
|
+
* Resolve an {@link LlmProvider} descriptor into a Vercel AI SDK
|
|
66
|
+
* {@link LanguageModel}.
|
|
67
|
+
*
|
|
68
|
+
* The API key is pulled from the agent's env (e.g. `ANTHROPIC_API_KEY`).
|
|
69
|
+
* Missing keys throw here — the pipeline session would fail on first
|
|
70
|
+
* `streamText` call otherwise, and the error is clearer at construction.
|
|
71
|
+
*/
|
|
72
|
+
export function resolveLlm(descriptor: LlmProvider, env: Record<string, string>): LanguageModel {
|
|
73
|
+
switch (descriptor.kind) {
|
|
74
|
+
case ANTHROPIC_KIND: {
|
|
75
|
+
const options = descriptor.options as unknown as AnthropicOptions;
|
|
76
|
+
const apiKey = resolveApiKey("ANTHROPIC_API_KEY", env);
|
|
77
|
+
if (!apiKey) {
|
|
78
|
+
throw new Error("Anthropic LLM: missing API key. Set ANTHROPIC_API_KEY in the agent env.");
|
|
79
|
+
}
|
|
80
|
+
// Pass baseURL explicitly so the SDK's loadOptionalSetting returns
|
|
81
|
+
// before reading process.env["ANTHROPIC_BASE_URL"]. Without this,
|
|
82
|
+
// the Deno platform server needs --allow-env to start a session.
|
|
83
|
+
return createAnthropic({ apiKey, baseURL: "https://api.anthropic.com/v1" })(options.model);
|
|
84
|
+
}
|
|
85
|
+
default:
|
|
86
|
+
throw new Error(
|
|
87
|
+
`Unknown LLM provider kind: "${descriptor.kind}". Supported: ${ANTHROPIC_KIND}.`,
|
|
88
|
+
);
|
|
89
|
+
}
|
|
90
|
+
}
|
|
@@ -7,7 +7,7 @@ import { fileURLToPath } from "node:url";
|
|
|
7
7
|
import type { TurnEvent } from "assemblyai";
|
|
8
8
|
import { describe, expect, test, vi } from "vitest";
|
|
9
9
|
import { flush } from "../../_test-utils.ts";
|
|
10
|
-
import { type AssemblyAISession,
|
|
10
|
+
import { type AssemblyAISession, openAssemblyAI } from "./assemblyai.ts";
|
|
11
11
|
|
|
12
12
|
const here = dirname(fileURLToPath(import.meta.url));
|
|
13
13
|
|
|
@@ -66,7 +66,7 @@ describe("assemblyAI STT adapter — fixture replay", () => {
|
|
|
66
66
|
await readFile(join(here, "fixtures/assemblyai/basic-turn.json"), "utf8"),
|
|
67
67
|
) as Record<string, unknown>[];
|
|
68
68
|
|
|
69
|
-
const provider =
|
|
69
|
+
const provider = openAssemblyAI({ model: "u3pro-rt" });
|
|
70
70
|
const controller = new AbortController();
|
|
71
71
|
const session = (await provider.open({
|
|
72
72
|
sampleRate: 16_000,
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
// Copyright 2025 the AAI authors. MIT license.
|
|
2
2
|
/**
|
|
3
|
-
* AssemblyAI Universal-Streaming STT
|
|
3
|
+
* AssemblyAI Universal-Streaming STT opener (host-only).
|
|
4
4
|
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
5
|
+
* The user-facing descriptor factory (`assemblyAI(...)`) lives in
|
|
6
|
+
* `sdk/providers/stt/assemblyai.ts`. This module is the host-side
|
|
7
|
+
* counterpart: it takes the descriptor options + an API key and
|
|
8
|
+
* returns an {@link SttOpener} that the pipeline session drives.
|
|
8
9
|
*
|
|
9
10
|
* Default model: `"u3pro-rt"` (Universal-3 Pro Real-Time). The adapter
|
|
10
11
|
* maps that to the SDK's `"u3-rt-pro"` `speechModel` value; any other
|
|
@@ -13,57 +14,39 @@
|
|
|
13
14
|
|
|
14
15
|
import { AssemblyAI, type StreamingTranscriber } from "assemblyai";
|
|
15
16
|
import { createNanoEvents, type Emitter } from "nanoevents";
|
|
16
|
-
import type {
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
17
|
+
import type { AssemblyAIOptions } from "../../../sdk/providers/stt/assemblyai.ts";
|
|
18
|
+
import {
|
|
19
|
+
makeSttError,
|
|
20
|
+
type SttEvents,
|
|
21
|
+
type SttOpener,
|
|
22
|
+
type SttOpenOptions,
|
|
23
|
+
type SttSession,
|
|
22
24
|
} from "../../../sdk/providers.ts";
|
|
23
25
|
|
|
24
|
-
export interface AssemblyAIOptions {
|
|
25
|
-
/**
|
|
26
|
-
* Streaming speech model. Defaults to `"u3pro-rt"` (Universal-3 Pro
|
|
27
|
-
* Real-Time). Arbitrary strings are forwarded to the SDK unchanged.
|
|
28
|
-
*/
|
|
29
|
-
model?: "u3pro-rt" | string;
|
|
30
|
-
/**
|
|
31
|
-
* AssemblyAI API key. Falls back to `SttOpenOptions.apiKey`, then
|
|
32
|
-
* `process.env.ASSEMBLYAI_API_KEY`.
|
|
33
|
-
*/
|
|
34
|
-
apiKey?: string;
|
|
35
|
-
}
|
|
36
|
-
|
|
37
26
|
/** Internal: SttSession with a test-only handle to the raw SDK transcriber. */
|
|
38
27
|
export interface AssemblyAISession extends SttSession {
|
|
39
28
|
/** @internal Test-only: exposes the underlying SDK transcriber for fixture replay. */
|
|
40
29
|
readonly _transcriber: StreamingTranscriber;
|
|
41
30
|
}
|
|
42
31
|
|
|
43
|
-
/** Translate the
|
|
32
|
+
/** Translate the descriptor's model alias to the SDK's `speechModel` value. */
|
|
44
33
|
function resolveSpeechModel(model: string): string {
|
|
45
34
|
// Plan's public name is "u3pro-rt"; the SDK's enum uses "u3-rt-pro".
|
|
46
35
|
if (model === "u3pro-rt") return "u3-rt-pro";
|
|
47
36
|
return model;
|
|
48
37
|
}
|
|
49
38
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
(err as { code: SttError["code"] }).code = "stt_stream_error";
|
|
53
|
-
return err;
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
export function assemblyAI(opts: AssemblyAIOptions = {}): SttProvider {
|
|
39
|
+
/** Build an {@link SttOpener} from resolved AssemblyAI descriptor options. */
|
|
40
|
+
export function openAssemblyAI(opts: AssemblyAIOptions = {}): SttOpener {
|
|
57
41
|
return {
|
|
58
42
|
name: "assemblyai",
|
|
59
43
|
async open(openOpts: SttOpenOptions): Promise<SttSession> {
|
|
60
|
-
const apiKey =
|
|
44
|
+
const apiKey = openOpts.apiKey || process.env.ASSEMBLYAI_API_KEY;
|
|
61
45
|
if (!apiKey) {
|
|
62
|
-
|
|
63
|
-
"
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
throw err;
|
|
46
|
+
throw makeSttError(
|
|
47
|
+
"stt_auth_failed",
|
|
48
|
+
"AssemblyAI STT: missing API key. Set ASSEMBLYAI_API_KEY in the agent env.",
|
|
49
|
+
);
|
|
67
50
|
}
|
|
68
51
|
|
|
69
52
|
const client = new AssemblyAI({ apiKey });
|
|
@@ -91,25 +74,24 @@ export function assemblyAI(opts: AssemblyAIOptions = {}): SttProvider {
|
|
|
91
74
|
|
|
92
75
|
transcriber.on("error", (err) => {
|
|
93
76
|
if (closed) return;
|
|
94
|
-
emitter.emit("error",
|
|
77
|
+
emitter.emit("error", makeSttError("stt_stream_error", err?.message ?? String(err)));
|
|
95
78
|
});
|
|
96
79
|
|
|
97
80
|
transcriber.on("close", (code) => {
|
|
98
81
|
if (closed) return;
|
|
99
82
|
// 1000 = normal closure.
|
|
100
83
|
if (code !== 1000) {
|
|
101
|
-
emitter.emit("error",
|
|
84
|
+
emitter.emit("error", makeSttError("stt_stream_error", `socket closed ${code}`));
|
|
102
85
|
}
|
|
103
86
|
});
|
|
104
87
|
|
|
105
88
|
try {
|
|
106
89
|
await transcriber.connect();
|
|
107
90
|
} catch (cause) {
|
|
108
|
-
|
|
91
|
+
throw makeSttError(
|
|
92
|
+
"stt_connect_failed",
|
|
109
93
|
`AssemblyAI STT: connect failed: ${cause instanceof Error ? cause.message : String(cause)}`,
|
|
110
|
-
)
|
|
111
|
-
(err as { code: SttError["code"] }).code = "stt_connect_failed";
|
|
112
|
-
throw err;
|
|
94
|
+
);
|
|
113
95
|
}
|
|
114
96
|
|
|
115
97
|
const close = async (): Promise<void> => {
|
|
@@ -122,7 +104,6 @@ export function assemblyAI(opts: AssemblyAIOptions = {}): SttProvider {
|
|
|
122
104
|
}
|
|
123
105
|
};
|
|
124
106
|
|
|
125
|
-
// Wire session-level abort to close the SDK socket.
|
|
126
107
|
if (openOpts.signal.aborted) {
|
|
127
108
|
void close();
|
|
128
109
|
} else {
|
|
@@ -134,9 +115,6 @@ export function assemblyAI(opts: AssemblyAIOptions = {}): SttProvider {
|
|
|
134
115
|
const session: AssemblyAISession = {
|
|
135
116
|
sendAudio(pcm: Int16Array) {
|
|
136
117
|
if (closed) return;
|
|
137
|
-
// The SDK's sendAudio accepts ArrayBufferLike. Forward a detached
|
|
138
|
-
// copy of the PCM view's window so the consumer sees only this
|
|
139
|
-
// chunk's bytes.
|
|
140
118
|
const copy = new Uint8Array(pcm.byteLength);
|
|
141
119
|
copy.set(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
|
|
142
120
|
transcriber.sendAudio(copy.buffer);
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
|
|
4
4
|
import { beforeEach, describe, expect, test, vi } from "vitest";
|
|
5
5
|
import { flush } from "../../_test-utils.ts";
|
|
6
|
-
import { type CartesiaSession,
|
|
6
|
+
import { type CartesiaSession, openCartesia } from "./cartesia.ts";
|
|
7
7
|
|
|
8
8
|
// Recorded interactions on the fake `TTSWSContext` — one entry per method call.
|
|
9
9
|
interface RecordedSend {
|
|
@@ -101,7 +101,7 @@ async function openSession(): Promise<{
|
|
|
101
101
|
session: CartesiaSession;
|
|
102
102
|
controller: AbortController;
|
|
103
103
|
}> {
|
|
104
|
-
const provider =
|
|
104
|
+
const provider = openCartesia({ voice: "voice-id" });
|
|
105
105
|
const controller = new AbortController();
|
|
106
106
|
const session = (await provider.open({
|
|
107
107
|
sampleRate: 16_000,
|
|
@@ -1,10 +1,14 @@
|
|
|
1
1
|
// Copyright 2025 the AAI authors. MIT license.
|
|
2
2
|
/**
|
|
3
|
-
* Cartesia TTS
|
|
3
|
+
* Cartesia TTS opener (host-only).
|
|
4
|
+
*
|
|
5
|
+
* The user-facing descriptor factory (`cartesia(...)`) lives in
|
|
6
|
+
* `sdk/providers/tts/cartesia.ts`. This module is the host-side
|
|
7
|
+
* counterpart: it takes the descriptor options + an API key and
|
|
8
|
+
* returns a {@link TtsOpener} that the pipeline session drives.
|
|
4
9
|
*
|
|
5
10
|
* Wraps `@cartesia/cartesia-js`'s `TTSWS` / `TTSWSContext` and normalizes it
|
|
6
|
-
* onto the {@link
|
|
7
|
-
* pipeline orchestrator.
|
|
11
|
+
* onto the {@link TtsEvents} contract consumed by the pipeline orchestrator.
|
|
8
12
|
*
|
|
9
13
|
* **Per-turn context lifecycle.** Each `sendText(...)` within the same turn
|
|
10
14
|
* appends to the same Cartesia context. On `flush()` or `cancel()`, a new
|
|
@@ -20,28 +24,15 @@ import { randomUUID } from "node:crypto";
|
|
|
20
24
|
import { Cartesia } from "@cartesia/cartesia-js";
|
|
21
25
|
import type { TTSWS, TTSWSContext } from "@cartesia/cartesia-js/resources/tts";
|
|
22
26
|
import { createNanoEvents, type Emitter } from "nanoevents";
|
|
23
|
-
import type {
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
27
|
+
import type { CartesiaOptions } from "../../../sdk/providers/tts/cartesia.ts";
|
|
28
|
+
import {
|
|
29
|
+
makeTtsError,
|
|
30
|
+
type TtsEvents,
|
|
31
|
+
type TtsOpener,
|
|
32
|
+
type TtsOpenOptions,
|
|
33
|
+
type TtsSession,
|
|
29
34
|
} from "../../../sdk/providers.ts";
|
|
30
35
|
|
|
31
|
-
export interface CartesiaOptions {
|
|
32
|
-
/** Cartesia voice ID. Required. */
|
|
33
|
-
voice: string;
|
|
34
|
-
/** Model ID. Defaults to `"sonic-2"`. */
|
|
35
|
-
model?: string;
|
|
36
|
-
/**
|
|
37
|
-
* Cartesia API key. Falls back to `TtsOpenOptions.apiKey`, then
|
|
38
|
-
* `process.env.CARTESIA_API_KEY`.
|
|
39
|
-
*/
|
|
40
|
-
apiKey?: string;
|
|
41
|
-
/** Spoken language hint. Defaults to `"en"`. */
|
|
42
|
-
language?: string;
|
|
43
|
-
}
|
|
44
|
-
|
|
45
36
|
/** Internal: TtsSession with a test-only handle to the raw SDK socket. */
|
|
46
37
|
export interface CartesiaSession extends TtsSession {
|
|
47
38
|
/** @internal Test-only: exposes the underlying SDK WebSocket wrapper. */
|
|
@@ -50,12 +41,6 @@ export interface CartesiaSession extends TtsSession {
|
|
|
50
41
|
readonly _currentContextId: () => string;
|
|
51
42
|
}
|
|
52
43
|
|
|
53
|
-
function makeError(message: string): TtsError {
|
|
54
|
-
const err = new Error(message) as TtsError & { code: TtsError["code"] };
|
|
55
|
-
(err as { code: TtsError["code"] }).code = "tts_stream_error";
|
|
56
|
-
return err;
|
|
57
|
-
}
|
|
58
|
-
|
|
59
44
|
/** PCM16 sample rates supported by Cartesia's `raw` output format. */
|
|
60
45
|
const CARTESIA_PCM16_RATES = [
|
|
61
46
|
8000, 16_000, 22_050, 24_000, 44_100, 48_000,
|
|
@@ -66,24 +51,23 @@ function assertSupportedSampleRate(rate: number): CartesiaSampleRate {
|
|
|
66
51
|
if ((CARTESIA_PCM16_RATES as readonly number[]).includes(rate)) {
|
|
67
52
|
return rate as CartesiaSampleRate;
|
|
68
53
|
}
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
throw err;
|
|
54
|
+
throw makeTtsError(
|
|
55
|
+
"tts_connect_failed",
|
|
56
|
+
`Cartesia TTS: unsupported sample rate ${rate}. Supported: ${CARTESIA_PCM16_RATES.join(", ")}.`,
|
|
57
|
+
);
|
|
74
58
|
}
|
|
75
59
|
|
|
76
|
-
|
|
60
|
+
/** Build a {@link TtsOpener} from resolved Cartesia descriptor options. */
|
|
61
|
+
export function openCartesia(opts: CartesiaOptions): TtsOpener {
|
|
77
62
|
return {
|
|
78
63
|
name: "cartesia",
|
|
79
64
|
async open(openOpts: TtsOpenOptions): Promise<TtsSession> {
|
|
80
|
-
const apiKey =
|
|
65
|
+
const apiKey = openOpts.apiKey || process.env.CARTESIA_API_KEY;
|
|
81
66
|
if (!apiKey) {
|
|
82
|
-
|
|
83
|
-
"
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
throw err;
|
|
67
|
+
throw makeTtsError(
|
|
68
|
+
"tts_auth_failed",
|
|
69
|
+
"Cartesia TTS: missing API key. Set CARTESIA_API_KEY in the agent env.",
|
|
70
|
+
);
|
|
87
71
|
}
|
|
88
72
|
|
|
89
73
|
const sampleRate = assertSupportedSampleRate(openOpts.sampleRate);
|
|
@@ -95,11 +79,10 @@ export function cartesia(opts: CartesiaOptions): TtsProvider {
|
|
|
95
79
|
try {
|
|
96
80
|
ws = await client.tts.websocket();
|
|
97
81
|
} catch (cause) {
|
|
98
|
-
|
|
82
|
+
throw makeTtsError(
|
|
83
|
+
"tts_connect_failed",
|
|
99
84
|
`Cartesia TTS: connect failed: ${cause instanceof Error ? cause.message : String(cause)}`,
|
|
100
|
-
)
|
|
101
|
-
(err as { code: TtsError["code"] }).code = "tts_connect_failed";
|
|
102
|
-
throw err;
|
|
85
|
+
);
|
|
103
86
|
}
|
|
104
87
|
|
|
105
88
|
const emitter: Emitter<TtsEvents> = createNanoEvents<TtsEvents>();
|
|
@@ -134,17 +117,15 @@ export function cartesia(opts: CartesiaOptions): TtsProvider {
|
|
|
134
117
|
emitter.emit("done");
|
|
135
118
|
};
|
|
136
119
|
|
|
137
|
-
//
|
|
138
|
-
// currently-active
|
|
139
|
-
// across all contexts on the socket; we only care about the active one.
|
|
120
|
+
// TTSWS fires events globally across all contexts on the shared
|
|
121
|
+
// socket; filter by the currently-active context_id.
|
|
140
122
|
ws.on("chunk", (event) => {
|
|
141
123
|
if (closed) return;
|
|
142
124
|
if (event.context_id !== context.contextId) return;
|
|
143
|
-
// SDK decodes base64 → Buffer on receipt (`event.audio`). Forward as
|
|
144
|
-
// Int16Array over the same byte window.
|
|
145
125
|
const buf = event.audio;
|
|
146
126
|
if (!buf || buf.byteLength === 0) return;
|
|
147
|
-
// Cartesia sends PCM16
|
|
127
|
+
// Cartesia sends PCM16 LE; be defensive about odd byte counts
|
|
128
|
+
// so `new Int16Array` never throws on a misaligned length.
|
|
148
129
|
const evenBytes = buf.byteLength - (buf.byteLength % 2);
|
|
149
130
|
if (evenBytes === 0) return;
|
|
150
131
|
const pcm = new Int16Array(buf.buffer.slice(buf.byteOffset, buf.byteOffset + evenBytes));
|
|
@@ -159,7 +140,7 @@ export function cartesia(opts: CartesiaOptions): TtsProvider {
|
|
|
159
140
|
|
|
160
141
|
ws.on("error", (err) => {
|
|
161
142
|
if (closed) return;
|
|
162
|
-
emitter.emit("error",
|
|
143
|
+
emitter.emit("error", makeTtsError("tts_stream_error", err?.message ?? String(err)));
|
|
163
144
|
});
|
|
164
145
|
|
|
165
146
|
const close = async (): Promise<void> => {
|
|
@@ -172,7 +153,6 @@ export function cartesia(opts: CartesiaOptions): TtsProvider {
|
|
|
172
153
|
}
|
|
173
154
|
};
|
|
174
155
|
|
|
175
|
-
// Session-level abort → close the SDK socket.
|
|
176
156
|
if (openOpts.signal.aborted) {
|
|
177
157
|
void close();
|
|
178
158
|
} else {
|
|
@@ -181,8 +161,6 @@ export function cartesia(opts: CartesiaOptions): TtsProvider {
|
|
|
181
161
|
});
|
|
182
162
|
}
|
|
183
163
|
|
|
184
|
-
/** Static part of each generation request; only `transcript` and
|
|
185
|
-
* `continue` vary per send. Pinned here so `language` threads through. */
|
|
186
164
|
const baseRequest = {
|
|
187
165
|
model_id: model,
|
|
188
166
|
voice: { mode: "id" as const, id: opts.voice },
|
|
@@ -194,11 +172,6 @@ export function cartesia(opts: CartesiaOptions): TtsProvider {
|
|
|
194
172
|
language,
|
|
195
173
|
};
|
|
196
174
|
|
|
197
|
-
/**
|
|
198
|
-
* Swallow rejections from async SDK calls — the global `error`
|
|
199
|
-
* listener on `ws` emits a normalized {@link TtsError}, so there's
|
|
200
|
-
* nothing useful for the caller to do with per-send failures.
|
|
201
|
-
*/
|
|
202
175
|
const ignoreRejection = (_err: unknown): void => {
|
|
203
176
|
// intentionally empty
|
|
204
177
|
};
|
|
@@ -206,22 +179,19 @@ export function cartesia(opts: CartesiaOptions): TtsProvider {
|
|
|
206
179
|
const session: CartesiaSession = {
|
|
207
180
|
sendText(text: string) {
|
|
208
181
|
if (closed || text.length === 0) return;
|
|
209
|
-
// Send a delta with `continue: true`, sharing the same
|
|
210
|
-
// context_id across all deltas of this turn.
|
|
211
182
|
void context
|
|
212
183
|
.send({ ...baseRequest, transcript: text, continue: true })
|
|
213
184
|
.catch(ignoreRejection);
|
|
214
185
|
},
|
|
215
186
|
flush() {
|
|
216
187
|
if (closed) return;
|
|
217
|
-
//
|
|
218
|
-
// end-of-turn signal.
|
|
219
|
-
//
|
|
220
|
-
//
|
|
221
|
-
//
|
|
222
|
-
// TODO: drop the microtask fallback once we've verified
|
|
223
|
-
// always emits
|
|
224
|
-
// 2026-04-22-pluggable-providers-design.md → "Note on flush() timing".
|
|
188
|
+
// Empty transcript with `continue: false` is the canonical
|
|
189
|
+
// end-of-turn signal. Cartesia replies with a `done` tagged
|
|
190
|
+
// by context_id, driving `emitDoneOnce`. The microtask
|
|
191
|
+
// fallback guards against a dropped server event wedging
|
|
192
|
+
// the orchestrator's state machine.
|
|
193
|
+
// TODO: drop the microtask fallback once we've verified
|
|
194
|
+
// Cartesia always emits `done` for cleanly-flushed contexts.
|
|
225
195
|
void context
|
|
226
196
|
.send({ ...baseRequest, transcript: "", continue: false })
|
|
227
197
|
.catch(ignoreRejection);
|
|
@@ -230,10 +200,10 @@ export function cartesia(opts: CartesiaOptions): TtsProvider {
|
|
|
230
200
|
},
|
|
231
201
|
cancel() {
|
|
232
202
|
if (closed) return;
|
|
233
|
-
// `cancel()` calls ws.cancelContext(contextId) under the hood.
|
|
234
203
|
void context.cancel().catch(ignoreRejection);
|
|
235
|
-
// Emit
|
|
236
|
-
//
|
|
204
|
+
// Emit synchronously: barge-in advances the orchestrator's
|
|
205
|
+
// state machine on `done`, and delaying it would audibly
|
|
206
|
+
// stall subsequent turns.
|
|
237
207
|
emitDoneOnce();
|
|
238
208
|
rotateContext();
|
|
239
209
|
},
|
package/host/runtime.ts
CHANGED
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
* lifecycle hooks, and session management.
|
|
8
8
|
*/
|
|
9
9
|
|
|
10
|
+
import type { LanguageModel } from "ai";
|
|
10
11
|
import pTimeout from "p-timeout";
|
|
11
12
|
import { createStorage } from "unstorage";
|
|
12
13
|
import { agentToolsToSchemas, type ToolSchema, toAgentConfig } from "../sdk/_internal-types.ts";
|
|
@@ -14,11 +15,19 @@ import { DEFAULT_SHUTDOWN_TIMEOUT_MS } from "../sdk/constants.ts";
|
|
|
14
15
|
import type { Kv } from "../sdk/kv.ts";
|
|
15
16
|
import type { ClientSink } from "../sdk/protocol.ts";
|
|
16
17
|
import { buildReadyConfig, type ReadyConfig } from "../sdk/protocol.ts";
|
|
17
|
-
import
|
|
18
|
+
import {
|
|
19
|
+
assertProviderTriple,
|
|
20
|
+
type LlmProvider,
|
|
21
|
+
type SttOpener,
|
|
22
|
+
type SttProvider,
|
|
23
|
+
type TtsOpener,
|
|
24
|
+
type TtsProvider,
|
|
25
|
+
} from "../sdk/providers.ts";
|
|
18
26
|
import type { AgentDef } from "../sdk/types.ts";
|
|
19
27
|
import { toolError } from "../sdk/utils.ts";
|
|
20
28
|
import { resolveAllBuiltins } from "./builtin-tools.ts";
|
|
21
29
|
import { createPipelineSession } from "./pipeline-session.ts";
|
|
30
|
+
import { resolveApiKey, resolveLlm, resolveStt, resolveTts } from "./providers/resolve.ts";
|
|
22
31
|
import type { Logger, S2SConfig } from "./runtime-config.ts";
|
|
23
32
|
import { consoleLogger, DEFAULT_S2S_CONFIG } from "./runtime-config.ts";
|
|
24
33
|
import type { CreateS2sWebSocket } from "./s2s.ts";
|
|
@@ -55,23 +64,36 @@ export type AgentRuntime = {
|
|
|
55
64
|
|
|
56
65
|
// ─── Runtime implementation ──────────────────────────────────────────────────
|
|
57
66
|
|
|
67
|
+
/**
|
|
68
|
+
* Distinguish a descriptor (`{ kind, options }`) from an already-resolved
|
|
69
|
+
* opener / `LanguageModel`. The production path always passes descriptors;
|
|
70
|
+
* openers are a test escape hatch (fakes in `_pipeline-test-fakes.ts`).
|
|
71
|
+
* STT/TTS openers are identified by the `open` method, `LanguageModel` by
|
|
72
|
+
* its `specificationVersion` field — both absent on descriptors.
|
|
73
|
+
*/
|
|
74
|
+
function resolveSttIfDescriptor(value: SttProvider | SttOpener): SttOpener {
|
|
75
|
+
return "open" in value ? value : resolveStt(value);
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
function resolveTtsIfDescriptor(value: TtsProvider | TtsOpener): TtsOpener {
|
|
79
|
+
return "open" in value ? value : resolveTts(value);
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
function resolveLlmIfDescriptor(
|
|
83
|
+
value: LlmProvider | LanguageModel,
|
|
84
|
+
env: Record<string, string>,
|
|
85
|
+
): LanguageModel {
|
|
86
|
+
// LanguageModel can be a string (model-id shortcut) or an object with
|
|
87
|
+
// `specificationVersion`; descriptors are plain `{ kind, options }` objects.
|
|
88
|
+
if (typeof value === "string") return value;
|
|
89
|
+
return "specificationVersion" in value ? value : resolveLlm(value, env);
|
|
90
|
+
}
|
|
91
|
+
|
|
58
92
|
/** Create an in-memory KV store (default for self-hosted). */
|
|
59
93
|
function createLocalKv(): Kv {
|
|
60
94
|
return createUnstorageKv({ storage: createStorage() });
|
|
61
95
|
}
|
|
62
96
|
|
|
63
|
-
/**
|
|
64
|
-
* Resolve an API key host-side for pipeline providers.
|
|
65
|
-
*
|
|
66
|
-
* Checks the agent's declared env first, then the host process env as a
|
|
67
|
-
* fallback. Returns `""` when absent — pipeline providers surface a clear
|
|
68
|
-
* `MissingCredentialsError` via their `open()` that the orchestrator
|
|
69
|
-
* converts to a `session.error` wire event.
|
|
70
|
-
*/
|
|
71
|
-
function resolveApiKey(envVar: string, env: Record<string, string>): string {
|
|
72
|
-
return env[envVar] ?? process.env[envVar] ?? "";
|
|
73
|
-
}
|
|
74
|
-
|
|
75
97
|
/**
|
|
76
98
|
* Configuration for {@link createRuntime}.
|
|
77
99
|
*
|
|
@@ -126,21 +148,24 @@ export type RuntimeOptions = {
|
|
|
126
148
|
*/
|
|
127
149
|
fetch?: typeof globalThis.fetch | undefined;
|
|
128
150
|
/**
|
|
129
|
-
*
|
|
151
|
+
* STT provider. Accepts either a descriptor ({@link SttProvider},
|
|
152
|
+
* the normal production path) or a pre-resolved {@link SttOpener}
|
|
153
|
+
* (test escape hatch). Must be set together with `llm` and `tts` to
|
|
130
154
|
* route sessions through the pipeline path; leave all three unset for
|
|
131
155
|
* the default AssemblyAI Streaming Speech-to-Speech (S2S) path.
|
|
132
156
|
*/
|
|
133
|
-
stt?: SttProvider | undefined;
|
|
157
|
+
stt?: SttProvider | SttOpener | undefined;
|
|
134
158
|
/**
|
|
135
|
-
*
|
|
136
|
-
*
|
|
159
|
+
* LLM provider. Accepts either a descriptor ({@link LlmProvider},
|
|
160
|
+
* produced by factories like `anthropic(...)`) or a concrete Vercel AI
|
|
161
|
+
* SDK `LanguageModel` (self-hosted / test escape hatch).
|
|
137
162
|
*/
|
|
138
|
-
llm?: LlmProvider | undefined;
|
|
163
|
+
llm?: LlmProvider | LanguageModel | undefined;
|
|
139
164
|
/**
|
|
140
|
-
*
|
|
141
|
-
*
|
|
165
|
+
* TTS provider. Accepts either a descriptor ({@link TtsProvider})
|
|
166
|
+
* or a pre-resolved {@link TtsOpener}.
|
|
142
167
|
*/
|
|
143
|
-
tts?: TtsProvider | undefined;
|
|
168
|
+
tts?: TtsProvider | TtsOpener | undefined;
|
|
144
169
|
};
|
|
145
170
|
|
|
146
171
|
/**
|
|
@@ -190,14 +215,7 @@ export function createRuntime(opts: RuntimeOptions): Runtime {
|
|
|
190
215
|
sessionStartTimeoutMs,
|
|
191
216
|
shutdownTimeoutMs = DEFAULT_SHUTDOWN_TIMEOUT_MS,
|
|
192
217
|
} = opts;
|
|
193
|
-
|
|
194
|
-
// none set ⇒ s2s. Anything in-between is a configuration error.
|
|
195
|
-
const providerCount =
|
|
196
|
-
(opts.stt != null ? 1 : 0) + (opts.llm != null ? 1 : 0) + (opts.tts != null ? 1 : 0);
|
|
197
|
-
if (providerCount !== 0 && providerCount !== 3) {
|
|
198
|
-
throw new Error("stt, llm, and tts must be set together");
|
|
199
|
-
}
|
|
200
|
-
const mode: "s2s" | "pipeline" = providerCount === 3 ? "pipeline" : "s2s";
|
|
218
|
+
const mode = assertProviderTriple(opts.stt, opts.llm, opts.tts);
|
|
201
219
|
const agentConfig = toAgentConfig(agent);
|
|
202
220
|
const sessions = new Map<string, Session>();
|
|
203
221
|
const sinkMap = new Map<string, ClientSink>();
|
|
@@ -271,6 +289,21 @@ export function createRuntime(opts: RuntimeOptions): Runtime {
|
|
|
271
289
|
};
|
|
272
290
|
}
|
|
273
291
|
|
|
292
|
+
// Resolve pipeline providers once per runtime (not per session). Each
|
|
293
|
+
// session reuses the same opener / LanguageModel — the opener's `open()`
|
|
294
|
+
// mints the per-session stream inside.
|
|
295
|
+
const pipelineProviders =
|
|
296
|
+
mode === "pipeline"
|
|
297
|
+
? {
|
|
298
|
+
// biome-ignore lint/style/noNonNullAssertion: mode === "pipeline" ⇒ all three set
|
|
299
|
+
stt: resolveSttIfDescriptor(opts.stt!),
|
|
300
|
+
// biome-ignore lint/style/noNonNullAssertion: mode === "pipeline" ⇒ all three set
|
|
301
|
+
llm: resolveLlmIfDescriptor(opts.llm!, env),
|
|
302
|
+
// biome-ignore lint/style/noNonNullAssertion: mode === "pipeline" ⇒ all three set
|
|
303
|
+
tts: resolveTtsIfDescriptor(opts.tts!),
|
|
304
|
+
}
|
|
305
|
+
: null;
|
|
306
|
+
|
|
274
307
|
function createSession(sessionOpts: {
|
|
275
308
|
id: string;
|
|
276
309
|
agent: string;
|
|
@@ -279,13 +312,7 @@ export function createRuntime(opts: RuntimeOptions): Runtime {
|
|
|
279
312
|
resumeFrom?: string;
|
|
280
313
|
}): Session {
|
|
281
314
|
sinkMap.set(sessionOpts.id, sessionOpts.client);
|
|
282
|
-
if (
|
|
283
|
-
// biome-ignore lint/style/noNonNullAssertion: providerCount === 3 ⇒ all set
|
|
284
|
-
const stt = opts.stt!;
|
|
285
|
-
// biome-ignore lint/style/noNonNullAssertion: providerCount === 3 ⇒ all set
|
|
286
|
-
const llm = opts.llm!;
|
|
287
|
-
// biome-ignore lint/style/noNonNullAssertion: providerCount === 3 ⇒ all set
|
|
288
|
-
const tts = opts.tts!;
|
|
315
|
+
if (pipelineProviders) {
|
|
289
316
|
return createPipelineSession({
|
|
290
317
|
id: sessionOpts.id,
|
|
291
318
|
agent: sessionOpts.agent,
|
|
@@ -294,9 +321,9 @@ export function createRuntime(opts: RuntimeOptions): Runtime {
|
|
|
294
321
|
toolSchemas,
|
|
295
322
|
toolGuidance,
|
|
296
323
|
executeTool,
|
|
297
|
-
stt,
|
|
298
|
-
llm,
|
|
299
|
-
tts,
|
|
324
|
+
stt: pipelineProviders.stt,
|
|
325
|
+
llm: pipelineProviders.llm,
|
|
326
|
+
tts: pipelineProviders.tts,
|
|
300
327
|
sttApiKey: resolveApiKey("ASSEMBLYAI_API_KEY", env),
|
|
301
328
|
ttsApiKey: resolveApiKey("CARTESIA_API_KEY", env),
|
|
302
329
|
logger,
|