@alexkroman1/aai 1.4.5 → 1.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +10 -10
- package/CHANGELOG.md +19 -0
- package/dist/{_internal-types-3p3OJZPb.js → _internal-types-DFL07G3f.js} +2 -0
- package/dist/assemblyai-C969QGi4.js +35 -0
- package/dist/cartesia-BfQPOQ7Y.js +37 -0
- package/dist/host/_pipeline-test-fakes.d.ts +3 -1
- package/dist/host/providers/stt/deepgram.d.ts +28 -0
- package/dist/host/providers/tts/cartesia.d.ts +1 -1
- package/dist/host/providers/tts/rime.d.ts +44 -0
- package/dist/host/runtime-barrel.d.ts +4 -2
- package/dist/host/runtime-barrel.js +1434 -1209
- package/dist/host/runtime.d.ts +2 -2
- package/dist/host/s2s.d.ts +16 -16
- package/dist/host/session-core.d.ts +37 -0
- package/dist/host/transports/pipeline-transport.d.ts +48 -0
- package/dist/host/transports/s2s-transport.d.ts +19 -0
- package/dist/host/transports/types.d.ts +45 -0
- package/dist/host/ws-handler.d.ts +14 -10
- package/dist/sdk/_internal-types.d.ts +2 -0
- package/dist/sdk/manifest-barrel.js +1 -1
- package/dist/sdk/protocol.d.ts +6 -5
- package/dist/sdk/providers/llm-barrel.js +1 -1
- package/dist/sdk/providers/stt/deepgram.d.ts +35 -0
- package/dist/sdk/providers/stt-barrel.d.ts +1 -0
- package/dist/sdk/providers/stt-barrel.js +2 -2
- package/dist/sdk/providers/tts/cartesia.d.ts +12 -4
- package/dist/sdk/providers/tts/rime.d.ts +42 -0
- package/dist/sdk/providers/tts-barrel.d.ts +1 -0
- package/dist/sdk/providers/tts-barrel.js +2 -2
- package/host/_pipeline-test-fakes.ts +6 -3
- package/host/_test-utils.ts +209 -128
- package/host/builtin-tools.ts +1 -0
- package/host/cleanup.test.ts +25 -298
- package/host/integration/pipeline-reference.integration.test.ts +30 -35
- package/host/providers/resolve.ts +10 -2
- package/host/providers/stt/deepgram.test.ts +229 -0
- package/host/providers/stt/deepgram.ts +172 -0
- package/host/providers/tts/cartesia.ts +7 -3
- package/host/providers/tts/rime.test.ts +251 -0
- package/host/providers/tts/rime.ts +322 -0
- package/host/runtime-barrel.ts +4 -2
- package/host/runtime.test.ts +16 -47
- package/host/runtime.ts +131 -23
- package/host/s2s.test.ts +122 -131
- package/host/s2s.ts +44 -52
- package/host/session-core.test.ts +257 -0
- package/host/session-core.ts +262 -0
- package/host/to-vercel-tools.test.ts +9 -1
- package/host/transports/pipeline-transport.test.ts +653 -0
- package/host/transports/pipeline-transport.ts +532 -0
- package/host/{fixture-replay.test.ts → transports/s2s-transport-fixtures.test.ts} +76 -106
- package/host/transports/s2s-transport.test.ts +56 -0
- package/host/transports/s2s-transport.ts +116 -0
- package/host/transports/types.test.ts +22 -0
- package/host/transports/types.ts +51 -0
- package/host/ws-handler.test.ts +324 -242
- package/host/ws-handler.ts +56 -59
- package/package.json +2 -1
- package/sdk/__snapshots__/exports.test.ts.snap +3 -3
- package/sdk/__snapshots__/schema-shapes.test.ts.snap +1 -0
- package/sdk/_internal-types.ts +3 -0
- package/sdk/protocol-compat.test.ts +8 -0
- package/sdk/protocol.ts +6 -5
- package/sdk/providers/stt/deepgram.ts +43 -0
- package/sdk/providers/stt-barrel.ts +2 -0
- package/sdk/providers/tts/cartesia.ts +15 -5
- package/sdk/providers/tts/rime.ts +52 -0
- package/sdk/providers/tts-barrel.ts +2 -0
- package/sdk/schema-alignment.test.ts +18 -6
- package/dist/assemblyai-Cxg9eobY.js +0 -18
- package/dist/cartesia-DwDk2tEu.js +0 -10
- package/dist/host/pipeline-session-ctx.d.ts +0 -24
- package/dist/host/pipeline-session.d.ts +0 -52
- package/dist/host/session-ctx.d.ts +0 -73
- package/dist/host/session.d.ts +0 -62
- package/host/pipeline-session-ctx.test.ts +0 -31
- package/host/pipeline-session-ctx.ts +0 -36
- package/host/pipeline-session.test.ts +0 -672
- package/host/pipeline-session.ts +0 -533
- package/host/s2s-fixtures.test.ts +0 -237
- package/host/session-ctx.test.ts +0 -387
- package/host/session-ctx.ts +0 -134
- package/host/session-fixture-replay.test.ts +0 -128
- package/host/session.test.ts +0 -634
- package/host/session.ts +0 -412
- /package/dist/{anthropic-BrUCPKUc.js → anthropic-CcLZygAr.js} +0 -0
package/host/ws-handler.ts
CHANGED
|
@@ -11,12 +11,11 @@ import {
|
|
|
11
11
|
MAX_MESSAGE_BUFFER_SIZE,
|
|
12
12
|
WS_OPEN,
|
|
13
13
|
} from "../sdk/constants.ts";
|
|
14
|
-
import
|
|
15
|
-
import {
|
|
16
|
-
import { errorDetail, errorMessage } from "../sdk/utils.ts";
|
|
14
|
+
import { ClientMessageSchema, type ClientSink, lenientParse } from "../sdk/protocol.ts";
|
|
15
|
+
import { errorDetail } from "../sdk/utils.ts";
|
|
17
16
|
import type { Logger } from "./runtime-config.ts";
|
|
18
17
|
import { consoleLogger } from "./runtime-config.ts";
|
|
19
|
-
import type {
|
|
18
|
+
import type { SessionCore } from "./session-core.ts";
|
|
20
19
|
|
|
21
20
|
/**
|
|
22
21
|
* Minimal WebSocket interface accepted by {@link wireSessionSocket}.
|
|
@@ -34,11 +33,11 @@ export type SessionWebSocket = {
|
|
|
34
33
|
/** Options for wiring a WebSocket to a session. */
|
|
35
34
|
export type WsSessionOptions = {
|
|
36
35
|
/** Map of active sessions (session is added on open, removed on close). */
|
|
37
|
-
sessions: Map<string,
|
|
36
|
+
sessions: Map<string, SessionCore>;
|
|
38
37
|
/** Factory function to create a session for a given ID and client sink. */
|
|
39
|
-
createSession: (sessionId: string, client: ClientSink) =>
|
|
38
|
+
createSession: (sessionId: string, client: ClientSink) => SessionCore;
|
|
40
39
|
/** Protocol config sent to the client immediately on connect. */
|
|
41
|
-
readyConfig:
|
|
40
|
+
readyConfig: { audioFormat: "pcm16"; sampleRate: number; ttsSampleRate: number };
|
|
42
41
|
/** Additional key-value pairs included in log messages. */
|
|
43
42
|
logContext?: Record<string, string>;
|
|
44
43
|
/** Callback invoked when the WebSocket connection opens. */
|
|
@@ -57,25 +56,27 @@ export type WsSessionOptions = {
|
|
|
57
56
|
resumeFrom?: string;
|
|
58
57
|
};
|
|
59
58
|
|
|
59
|
+
const AUDIO_DONE_FRAME = JSON.stringify({
|
|
60
|
+
type: "audio_done",
|
|
61
|
+
} satisfies { type: "audio_done" });
|
|
62
|
+
|
|
60
63
|
/**
|
|
61
64
|
* Creates a {@link ClientSink} backed by a plain WebSocket.
|
|
62
65
|
*
|
|
63
|
-
*
|
|
64
|
-
* binary frames
|
|
66
|
+
* Session events are sent as JSON text frames; audio chunks are sent as raw
|
|
67
|
+
* PCM16 binary frames.
|
|
65
68
|
*/
|
|
66
69
|
function createClientSink(ws: SessionWebSocket, log: Logger): ClientSink {
|
|
67
|
-
|
|
68
|
-
function safeSend(data: string | ArrayBuffer | Uint8Array): void {
|
|
70
|
+
function safeSend(data: string | Uint8Array): void {
|
|
69
71
|
try {
|
|
70
72
|
if (ws.readyState !== WS_OPEN) return;
|
|
71
73
|
ws.send(data);
|
|
72
74
|
} catch (err) {
|
|
73
75
|
log.debug?.("safeSend: socket closed between readyState check and send", {
|
|
74
|
-
error:
|
|
76
|
+
error: err instanceof Error ? err.message : String(err),
|
|
75
77
|
});
|
|
76
78
|
}
|
|
77
79
|
}
|
|
78
|
-
|
|
79
80
|
return {
|
|
80
81
|
get open() {
|
|
81
82
|
return ws.readyState === WS_OPEN;
|
|
@@ -87,48 +88,40 @@ function createClientSink(ws: SessionWebSocket, log: Logger): ClientSink {
|
|
|
87
88
|
safeSend(chunk);
|
|
88
89
|
},
|
|
89
90
|
playAudioDone() {
|
|
90
|
-
safeSend(
|
|
91
|
+
safeSend(AUDIO_DONE_FRAME);
|
|
91
92
|
},
|
|
92
93
|
};
|
|
93
94
|
}
|
|
94
95
|
|
|
95
|
-
function handleBinaryAudio(data: unknown, session:
|
|
96
|
-
// Buffer extends Uint8Array in Node, so this catches Buffer too.
|
|
96
|
+
function handleBinaryAudio(data: unknown, session: SessionCore): boolean {
|
|
97
97
|
if (data instanceof Uint8Array) {
|
|
98
98
|
session.onAudio(data);
|
|
99
99
|
return true;
|
|
100
100
|
}
|
|
101
|
-
if (data instanceof ArrayBuffer) {
|
|
102
|
-
session.onAudio(new Uint8Array(data));
|
|
103
|
-
return true;
|
|
104
|
-
}
|
|
105
101
|
return false;
|
|
106
102
|
}
|
|
107
103
|
|
|
108
|
-
function handleTextMessage(
|
|
109
|
-
data
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
): void {
|
|
115
|
-
if (typeof data !== "string") return;
|
|
116
|
-
let json: unknown;
|
|
104
|
+
function handleTextMessage(data: unknown, session: SessionCore, log: Logger, sid: string): void {
|
|
105
|
+
if (typeof data !== "string") {
|
|
106
|
+
log.warn("ws: non-string, non-binary frame received; dropping", { sid });
|
|
107
|
+
return;
|
|
108
|
+
}
|
|
109
|
+
let parsed: unknown;
|
|
117
110
|
try {
|
|
118
|
-
|
|
111
|
+
parsed = JSON.parse(data);
|
|
119
112
|
} catch {
|
|
120
|
-
log.warn("
|
|
113
|
+
log.warn("ws: invalid JSON; dropping", { sid, data: data.slice(0, 200) });
|
|
121
114
|
return;
|
|
122
115
|
}
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
116
|
+
const result = lenientParse(ClientMessageSchema, parsed);
|
|
117
|
+
if (!result.ok) {
|
|
118
|
+
if (result.malformed) {
|
|
119
|
+
log.warn("ws: malformed client message", { sid, error: result.error });
|
|
120
|
+
}
|
|
121
|
+
// else: unrecognised type — silently drop (rolling-upgrade tolerance)
|
|
127
122
|
return;
|
|
128
123
|
}
|
|
129
|
-
|
|
130
|
-
const msg: ClientMessage = parsed.data;
|
|
131
|
-
switch (msg.type) {
|
|
124
|
+
switch (result.data.type) {
|
|
132
125
|
case "audio_ready":
|
|
133
126
|
session.onAudioReady();
|
|
134
127
|
break;
|
|
@@ -139,7 +132,7 @@ function handleTextMessage(
|
|
|
139
132
|
session.onReset();
|
|
140
133
|
break;
|
|
141
134
|
case "history":
|
|
142
|
-
session.onHistory(
|
|
135
|
+
session.onHistory(result.data.messages);
|
|
143
136
|
break;
|
|
144
137
|
default:
|
|
145
138
|
break;
|
|
@@ -147,13 +140,13 @@ function handleTextMessage(
|
|
|
147
140
|
}
|
|
148
141
|
|
|
149
142
|
/**
|
|
150
|
-
* Attaches session lifecycle handlers to a native WebSocket using
|
|
151
|
-
*
|
|
143
|
+
* Attaches session lifecycle handlers to a native WebSocket using JSON text
|
|
144
|
+
* frames for control messages and raw PCM16 binary frames for audio.
|
|
152
145
|
*
|
|
153
146
|
* Connection flow:
|
|
154
|
-
* 1. WebSocket opens → server sends
|
|
155
|
-
* 2. Client sets up audio → sends
|
|
156
|
-
* 3. If reconnecting → client sends
|
|
147
|
+
* 1. WebSocket opens → server sends JSON CONFIG frame with sampleRate, ttsSampleRate, sessionId
|
|
148
|
+
* 2. Client sets up audio → sends JSON AUDIO_READY frame
|
|
149
|
+
* 3. If reconnecting → client sends JSON HISTORY frame with prior messages
|
|
157
150
|
*/
|
|
158
151
|
export function wireSessionSocket(ws: SessionWebSocket, opts: WsSessionOptions): void {
|
|
159
152
|
const { sessions, logger: log = consoleLogger } = opts;
|
|
@@ -161,10 +154,10 @@ export function wireSessionSocket(ws: SessionWebSocket, opts: WsSessionOptions):
|
|
|
161
154
|
const sid = sessionId.slice(0, 8);
|
|
162
155
|
const ctx = opts.logContext ?? {};
|
|
163
156
|
|
|
164
|
-
let session:
|
|
157
|
+
let session: SessionCore | null = null;
|
|
165
158
|
/** Set to true once session.start() resolves. Messages arriving before
|
|
166
159
|
* this flag is set are buffered and replayed once the session is ready,
|
|
167
|
-
* preventing audio/
|
|
160
|
+
* preventing audio/frames from being dispatched to a half-initialized session. */
|
|
168
161
|
let sessionReady = false;
|
|
169
162
|
let messageBuffer: { data: unknown }[] | null = [];
|
|
170
163
|
|
|
@@ -173,9 +166,8 @@ export function wireSessionSocket(ws: SessionWebSocket, opts: WsSessionOptions):
|
|
|
173
166
|
const buf = messageBuffer;
|
|
174
167
|
messageBuffer = null;
|
|
175
168
|
for (const event of buf) {
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
handleTextMessage(data, session, log, ctx, sid);
|
|
169
|
+
if (handleBinaryAudio(event.data, session)) continue;
|
|
170
|
+
handleTextMessage(event.data, session, log, sid);
|
|
179
171
|
}
|
|
180
172
|
}
|
|
181
173
|
|
|
@@ -188,9 +180,17 @@ export function wireSessionSocket(ws: SessionWebSocket, opts: WsSessionOptions):
|
|
|
188
180
|
sessions.set(sessionId, session);
|
|
189
181
|
opts.onSinkCreated?.(sessionId, client);
|
|
190
182
|
|
|
191
|
-
// Send config immediately — zero RTT. Include sessionId so the
|
|
192
|
-
// can reconnect with ?sessionId=<id> to resume a persisted session.
|
|
193
|
-
ws.send(
|
|
183
|
+
// Send config immediately — zero RTT. Include sessionId so the
|
|
184
|
+
// client can reconnect with ?sessionId=<id> to resume a persisted session.
|
|
185
|
+
ws.send(
|
|
186
|
+
JSON.stringify({
|
|
187
|
+
type: "config",
|
|
188
|
+
audioFormat: opts.readyConfig.audioFormat,
|
|
189
|
+
sampleRate: opts.readyConfig.sampleRate,
|
|
190
|
+
ttsSampleRate: opts.readyConfig.ttsSampleRate,
|
|
191
|
+
sessionId,
|
|
192
|
+
}),
|
|
193
|
+
);
|
|
194
194
|
|
|
195
195
|
const timeoutMs = opts.sessionStartTimeoutMs ?? DEFAULT_SESSION_START_TIMEOUT_MS;
|
|
196
196
|
const startWithTimeout = pTimeout(session.start(), {
|
|
@@ -222,17 +222,14 @@ export function wireSessionSocket(ws: SessionWebSocket, opts: WsSessionOptions):
|
|
|
222
222
|
ws.addEventListener("message", (event) => {
|
|
223
223
|
if (!session) return;
|
|
224
224
|
// Buffer messages until session.start() completes to avoid dispatching
|
|
225
|
-
// to a session whose
|
|
225
|
+
// to a session whose transport connection isn't established yet.
|
|
226
226
|
if (!sessionReady) {
|
|
227
|
-
if (messageBuffer && messageBuffer.length < MAX_MESSAGE_BUFFER_SIZE)
|
|
227
|
+
if (messageBuffer && messageBuffer.length < MAX_MESSAGE_BUFFER_SIZE)
|
|
228
228
|
messageBuffer.push(event);
|
|
229
|
-
}
|
|
230
229
|
return;
|
|
231
230
|
}
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
if (handleBinaryAudio(data, session)) return;
|
|
235
|
-
handleTextMessage(data, session, log, ctx, sid);
|
|
231
|
+
if (handleBinaryAudio(event.data, session)) return;
|
|
232
|
+
handleTextMessage(event.data, session, log, sid);
|
|
236
233
|
});
|
|
237
234
|
|
|
238
235
|
ws.addEventListener("close", () => {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@alexkroman1/aai",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.5.1",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"exports": {
|
|
6
6
|
".": {
|
|
@@ -42,6 +42,7 @@
|
|
|
42
42
|
"dependencies": {
|
|
43
43
|
"@ai-sdk/anthropic": "^3.0.0",
|
|
44
44
|
"@cartesia/cartesia-js": "^3.0.0",
|
|
45
|
+
"@deepgram/sdk": "^5.0.0",
|
|
45
46
|
"ai": "^6.0.161",
|
|
46
47
|
"assemblyai": "^4.30.0",
|
|
47
48
|
"escape-html": "^1.0.3",
|
|
@@ -66,13 +66,13 @@ exports[`export surface stability > @alexkroman1/aai/protocol export 1`] = `
|
|
|
66
66
|
exports[`export surface stability > @alexkroman1/aai/runtime export 1`] = `
|
|
67
67
|
[
|
|
68
68
|
"DEFAULT_S2S_CONFIG",
|
|
69
|
-
"_buildBaseCtx",
|
|
70
69
|
"_internals",
|
|
71
|
-
"buildCtx",
|
|
72
70
|
"consoleLogger",
|
|
71
|
+
"createPipelineTransport",
|
|
73
72
|
"createRuntime",
|
|
74
|
-
"
|
|
73
|
+
"createS2sTransport",
|
|
75
74
|
"createServer",
|
|
75
|
+
"createSessionCore",
|
|
76
76
|
"createUnstorageKv",
|
|
77
77
|
"executeInIsolate",
|
|
78
78
|
"executeToolCall",
|
package/sdk/_internal-types.ts
CHANGED
|
@@ -128,6 +128,7 @@ export function toAgentConfig(src: AgentConfigSource): AgentConfig {
|
|
|
128
128
|
* etc.) — the Vercel AI SDK wraps it via `jsonSchema()`.
|
|
129
129
|
*/
|
|
130
130
|
export const ToolSchemaSchema = z.object({
|
|
131
|
+
type: z.literal("function"),
|
|
131
132
|
name: z.string().min(1),
|
|
132
133
|
description: z.string().min(1),
|
|
133
134
|
parameters: z.record(z.string(), z.unknown()),
|
|
@@ -135,6 +136,7 @@ export const ToolSchemaSchema = z.object({
|
|
|
135
136
|
|
|
136
137
|
/** Serialized tool schema — derived from {@link ToolSchemaSchema}. */
|
|
137
138
|
export type ToolSchema = {
|
|
139
|
+
type: "function";
|
|
138
140
|
name: string;
|
|
139
141
|
description: string;
|
|
140
142
|
parameters: JSONSchema7;
|
|
@@ -151,6 +153,7 @@ export const EMPTY_PARAMS = z.object({});
|
|
|
151
153
|
*/
|
|
152
154
|
export function agentToolsToSchemas(tools: Readonly<Record<string, ToolDef>>): ToolSchema[] {
|
|
153
155
|
return Object.entries(tools).map(([name, def]) => ({
|
|
156
|
+
type: "function",
|
|
154
157
|
name,
|
|
155
158
|
description: def.description,
|
|
156
159
|
parameters: z.toJSONSchema(def.parameters ?? EMPTY_PARAMS) as JSONSchema7,
|
|
@@ -24,8 +24,16 @@ import {
|
|
|
24
24
|
// ── Load fixtures ─────────────────────────────────────────────────────────
|
|
25
25
|
|
|
26
26
|
const FIXTURE_DIR = join(import.meta.dirname, "compat-fixtures");
|
|
27
|
+
// Only load compat fixtures that have the expected schema-compat structure
|
|
28
|
+
// (ServerMessage, ClientMessage, KvRequest, constants). Wire-format fixtures
|
|
29
|
+
// like wire-v1.json use a different shape and are tested by wire.test.ts.
|
|
27
30
|
const fixtureFiles = readdirSync(FIXTURE_DIR)
|
|
28
31
|
.filter((f) => f.endsWith(".json"))
|
|
32
|
+
.filter((f) => {
|
|
33
|
+
const raw = readFileSync(join(FIXTURE_DIR, f), "utf-8");
|
|
34
|
+
const parsed = JSON.parse(raw) as Record<string, unknown>;
|
|
35
|
+
return "ServerMessage" in parsed && "ClientMessage" in parsed;
|
|
36
|
+
})
|
|
29
37
|
.sort();
|
|
30
38
|
|
|
31
39
|
type Fixture = {
|
package/sdk/protocol.ts
CHANGED
|
@@ -141,16 +141,17 @@ export type ClientEvent = z.infer<typeof ClientEventSchema>;
|
|
|
141
141
|
/**
|
|
142
142
|
* Typed interface for pushing session events to a connected client.
|
|
143
143
|
*
|
|
144
|
-
*
|
|
144
|
+
* Events (`event`, `playAudioDone`) send JSON text frames. Audio chunks
|
|
145
|
+
* (`playAudioChunk`) send raw PCM16 binary frames.
|
|
145
146
|
*/
|
|
146
147
|
export interface ClientSink {
|
|
147
|
-
/**
|
|
148
|
+
/** True when the underlying connection is open and will accept calls. */
|
|
148
149
|
readonly open: boolean;
|
|
149
|
-
/** Push a session event to the client. */
|
|
150
|
+
/** Push a session event (JSON text frame) to the client. */
|
|
150
151
|
event(e: ClientEvent): void;
|
|
151
|
-
/** Send a single
|
|
152
|
+
/** Send a single PCM16 audio chunk (raw binary frame) to the client. */
|
|
152
153
|
playAudioChunk(chunk: Uint8Array): void;
|
|
153
|
-
/** Signal that TTS audio is complete. */
|
|
154
|
+
/** Signal that TTS audio is complete (JSON text frame). */
|
|
154
155
|
playAudioDone(): void;
|
|
155
156
|
}
|
|
156
157
|
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
// Copyright 2026 the AAI authors. MIT license.
|
|
2
|
+
/**
|
|
3
|
+
* Deepgram Nova streaming STT factory — returns a pure descriptor.
|
|
4
|
+
*
|
|
5
|
+
* The descriptor flows through the bundle → server → runtime pipeline
|
|
6
|
+
* without importing the `@deepgram/sdk` package. The host-side resolver in
|
|
7
|
+
* `host/providers/resolve.ts` turns it into an openable {@link SttOpener}
|
|
8
|
+
* during `createRuntime`.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import type { SttProvider } from "../../providers.ts";
|
|
12
|
+
|
|
13
|
+
/** Kind tag recognised by the host-side resolver. */
|
|
14
|
+
export const DEEPGRAM_KIND = "deepgram" as const;
|
|
15
|
+
|
|
16
|
+
export interface DeepgramOptions {
|
|
17
|
+
/**
|
|
18
|
+
* Streaming speech model. Defaults to `"nova-3"`. Any string is forwarded
|
|
19
|
+
* to the SDK unchanged, which allows opt-in to future models.
|
|
20
|
+
*/
|
|
21
|
+
model?: "nova-3" | "nova-2" | string;
|
|
22
|
+
/**
|
|
23
|
+
* BCP-47 language code for transcription. Defaults to `"en"`.
|
|
24
|
+
* Examples: `"en"`, `"es"`, `"fr"`, `"de"`.
|
|
25
|
+
*/
|
|
26
|
+
language?: string;
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
export type DeepgramProvider = SttProvider & {
|
|
30
|
+
readonly kind: typeof DEEPGRAM_KIND;
|
|
31
|
+
readonly options: DeepgramOptions;
|
|
32
|
+
};
|
|
33
|
+
|
|
34
|
+
/**
|
|
35
|
+
* Build a Deepgram STT descriptor.
|
|
36
|
+
*
|
|
37
|
+
* The API key is resolved host-side from the agent's env
|
|
38
|
+
* (`DEEPGRAM_API_KEY`); there is no factory-time key parameter, so the
|
|
39
|
+
* descriptor stays free of secrets and safe to serialize.
|
|
40
|
+
*/
|
|
41
|
+
export function deepgram(opts: DeepgramOptions = {}): DeepgramProvider {
|
|
42
|
+
return { kind: DEEPGRAM_KIND, options: { ...opts } };
|
|
43
|
+
}
|
|
@@ -10,3 +10,5 @@
|
|
|
10
10
|
export type { SttError, SttEvents, SttOpenOptions, SttProvider, SttSession } from "../providers.ts";
|
|
11
11
|
// biome-ignore lint/performance/noReExportAll: subpath barrel
|
|
12
12
|
export * from "./stt/assemblyai.ts";
|
|
13
|
+
// biome-ignore lint/performance/noReExportAll: subpath barrel
|
|
14
|
+
export * from "./stt/deepgram.ts";
|
|
@@ -12,9 +12,16 @@ import type { TtsProvider } from "../../providers.ts";
|
|
|
12
12
|
|
|
13
13
|
export const CARTESIA_KIND = "cartesia" as const;
|
|
14
14
|
|
|
15
|
+
/**
|
|
16
|
+
* Default voice used when callers invoke `cartesia()` with no `voice`. This
|
|
17
|
+
* is the same voice the example templates ship with, so a bare `cartesia()`
|
|
18
|
+
* works out of the box for new agents.
|
|
19
|
+
*/
|
|
20
|
+
export const CARTESIA_DEFAULT_VOICE = "f786b574-daa5-4673-aa0c-cbe3e8534c02";
|
|
21
|
+
|
|
15
22
|
export interface CartesiaOptions {
|
|
16
|
-
/** Cartesia voice ID.
|
|
17
|
-
voice
|
|
23
|
+
/** Cartesia voice ID. Defaults to {@link CARTESIA_DEFAULT_VOICE}. */
|
|
24
|
+
voice?: string;
|
|
18
25
|
/** Model ID. Defaults to `"sonic-2"`. */
|
|
19
26
|
model?: string;
|
|
20
27
|
/** Spoken language hint. Defaults to `"en"`. */
|
|
@@ -23,9 +30,12 @@ export interface CartesiaOptions {
|
|
|
23
30
|
|
|
24
31
|
export type CartesiaProvider = TtsProvider & {
|
|
25
32
|
readonly kind: typeof CARTESIA_KIND;
|
|
26
|
-
readonly options: CartesiaOptions;
|
|
33
|
+
readonly options: CartesiaOptions & { voice: string };
|
|
27
34
|
};
|
|
28
35
|
|
|
29
|
-
export function cartesia(opts: CartesiaOptions): CartesiaProvider {
|
|
30
|
-
return {
|
|
36
|
+
export function cartesia(opts: CartesiaOptions = {}): CartesiaProvider {
|
|
37
|
+
return {
|
|
38
|
+
kind: CARTESIA_KIND,
|
|
39
|
+
options: { ...opts, voice: opts.voice ?? CARTESIA_DEFAULT_VOICE },
|
|
40
|
+
};
|
|
31
41
|
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
// Copyright 2026 the AAI authors. MIT license.
|
|
2
|
+
/**
|
|
3
|
+
* Rime TTS factory — returns a pure descriptor.
|
|
4
|
+
*
|
|
5
|
+
* See `sdk/providers/stt/assemblyai.ts` for the descriptor/opener split;
|
|
6
|
+
* the host-side resolver in `host/providers/resolve.ts` turns this into an
|
|
7
|
+
* openable {@link TtsOpener} during `createRuntime` using the
|
|
8
|
+
* `RIME_API_KEY` from the agent's env.
|
|
9
|
+
*
|
|
10
|
+
* Language codes follow ISO 639-3 (three-letter): `"eng"`, `"fra"`, etc.
|
|
11
|
+
* This differs from many APIs that use ISO 639-1 two-letter codes like `"en"`.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import type { TtsProvider } from "../../providers.ts";
|
|
15
|
+
|
|
16
|
+
export const RIME_KIND = "rime" as const;
|
|
17
|
+
|
|
18
|
+
/**
|
|
19
|
+
* Default Rime speaker used when callers invoke `rime()` with no `voice`.
|
|
20
|
+
* `cove` is a `mistv2` speaker, matching the default model below — so a
|
|
21
|
+
* bare `rime()` works out of the box for new agents.
|
|
22
|
+
*/
|
|
23
|
+
export const RIME_DEFAULT_VOICE = "cove";
|
|
24
|
+
|
|
25
|
+
export interface RimeOptions {
|
|
26
|
+
/** Rime speaker ID. Defaults to {@link RIME_DEFAULT_VOICE}. */
|
|
27
|
+
voice?: string;
|
|
28
|
+
/**
|
|
29
|
+
* Rime model ID. Defaults to `"mistv2"` (Rime's most compatible model).
|
|
30
|
+
* Common values: `"mistv2"`, `"arcana"`.
|
|
31
|
+
*/
|
|
32
|
+
model?: "mistv2" | "arcana" | string;
|
|
33
|
+
/**
|
|
34
|
+
* Spoken language. Uses ISO 639-3 (three-letter codes).
|
|
35
|
+
* Defaults to `"eng"` (English).
|
|
36
|
+
*
|
|
37
|
+
* Note: Rime uses 3-letter codes — use `"eng"` not `"en"`.
|
|
38
|
+
*/
|
|
39
|
+
language?: string;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
export type RimeProvider = TtsProvider & {
|
|
43
|
+
readonly kind: typeof RIME_KIND;
|
|
44
|
+
readonly options: RimeOptions & { voice: string };
|
|
45
|
+
};
|
|
46
|
+
|
|
47
|
+
export function rime(opts: RimeOptions = {}): RimeProvider {
|
|
48
|
+
return {
|
|
49
|
+
kind: RIME_KIND,
|
|
50
|
+
options: { ...opts, voice: opts.voice ?? RIME_DEFAULT_VOICE },
|
|
51
|
+
};
|
|
52
|
+
}
|
|
@@ -10,3 +10,5 @@
|
|
|
10
10
|
export type { TtsError, TtsEvents, TtsOpenOptions, TtsProvider, TtsSession } from "../providers.ts";
|
|
11
11
|
// biome-ignore lint/performance/noReExportAll: subpath barrel
|
|
12
12
|
export * from "./tts/cartesia.ts";
|
|
13
|
+
// biome-ignore lint/performance/noReExportAll: subpath barrel
|
|
14
|
+
export * from "./tts/rime.ts";
|
|
@@ -65,6 +65,7 @@ describe("AgentConfigSchema", () => {
|
|
|
65
65
|
describe("ToolSchemaSchema", () => {
|
|
66
66
|
test("accepts valid tool schema", () => {
|
|
67
67
|
const valid = {
|
|
68
|
+
type: "function" as const,
|
|
68
69
|
name: "get_weather",
|
|
69
70
|
description: "Get weather",
|
|
70
71
|
parameters: { type: "object", properties: { city: { type: "string" } } },
|
|
@@ -73,15 +74,25 @@ describe("ToolSchemaSchema", () => {
|
|
|
73
74
|
});
|
|
74
75
|
|
|
75
76
|
test("rejects empty name", () => {
|
|
76
|
-
expect(
|
|
77
|
-
|
|
78
|
-
|
|
77
|
+
expect(
|
|
78
|
+
ToolSchemaSchema.safeParse({
|
|
79
|
+
type: "function",
|
|
80
|
+
name: "",
|
|
81
|
+
description: "d",
|
|
82
|
+
parameters: {},
|
|
83
|
+
}).success,
|
|
84
|
+
).toBe(false);
|
|
79
85
|
});
|
|
80
86
|
|
|
81
87
|
test("rejects empty description", () => {
|
|
82
|
-
expect(
|
|
83
|
-
|
|
84
|
-
|
|
88
|
+
expect(
|
|
89
|
+
ToolSchemaSchema.safeParse({
|
|
90
|
+
type: "function",
|
|
91
|
+
name: "n",
|
|
92
|
+
description: "",
|
|
93
|
+
parameters: {},
|
|
94
|
+
}).success,
|
|
95
|
+
).toBe(false);
|
|
85
96
|
});
|
|
86
97
|
|
|
87
98
|
test("ToolSchema is assignable from schema inference", () => {
|
|
@@ -89,6 +100,7 @@ describe("ToolSchemaSchema", () => {
|
|
|
89
100
|
// the runtime schema's Record<string, unknown>. Verify the direction:
|
|
90
101
|
// a parsed result should be assignable to ToolSchema (narrow → wide).
|
|
91
102
|
const parsed = ToolSchemaSchema.parse({
|
|
103
|
+
type: "function",
|
|
92
104
|
name: "test",
|
|
93
105
|
description: "test",
|
|
94
106
|
parameters: { type: "object" },
|
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
//#region sdk/providers/stt/assemblyai.ts
|
|
2
|
-
/** Kind tag recognised by the host-side resolver. */
|
|
3
|
-
const ASSEMBLYAI_KIND = "assemblyai";
|
|
4
|
-
/**
|
|
5
|
-
* Build an AssemblyAI STT descriptor.
|
|
6
|
-
*
|
|
7
|
-
* The API key is resolved host-side from the agent's env
|
|
8
|
-
* (`ASSEMBLYAI_API_KEY`); there is no factory-time key parameter, so the
|
|
9
|
-
* descriptor stays free of secrets and safe to serialize.
|
|
10
|
-
*/
|
|
11
|
-
function assemblyAI(opts = {}) {
|
|
12
|
-
return {
|
|
13
|
-
kind: ASSEMBLYAI_KIND,
|
|
14
|
-
options: { ...opts }
|
|
15
|
-
};
|
|
16
|
-
}
|
|
17
|
-
//#endregion
|
|
18
|
-
export { assemblyAI as n, ASSEMBLYAI_KIND as t };
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
/** Pipeline session context — base ctx + STT/TTS session slots. */
|
|
2
|
-
import type { AgentConfig, ExecuteTool } from "../sdk/_internal-types.ts";
|
|
3
|
-
import type { ClientSink } from "../sdk/protocol.ts";
|
|
4
|
-
import type { SttSession, TtsSession } from "../sdk/providers.ts";
|
|
5
|
-
import type { Logger } from "./runtime-config.ts";
|
|
6
|
-
import { type BaseSessionCtx } from "./session-ctx.ts";
|
|
7
|
-
/**
|
|
8
|
-
* Pipeline session context — {@link BaseSessionCtx} plus STT/TTS provider
|
|
9
|
-
* session handles. Replaces the S2S `s2s` field with decoupled `stt` + `tts`
|
|
10
|
-
* slots so the pipeline orchestrator can drive independent providers.
|
|
11
|
-
*/
|
|
12
|
-
export type PipelineSessionCtx = BaseSessionCtx & {
|
|
13
|
-
stt: SttSession | null;
|
|
14
|
-
tts: TtsSession | null;
|
|
15
|
-
};
|
|
16
|
-
export declare function buildPipelineCtx(opts: {
|
|
17
|
-
id: string;
|
|
18
|
-
agent: string;
|
|
19
|
-
client: ClientSink;
|
|
20
|
-
agentConfig: AgentConfig;
|
|
21
|
-
executeTool: ExecuteTool;
|
|
22
|
-
log: Logger;
|
|
23
|
-
maxHistory?: number | undefined;
|
|
24
|
-
}): PipelineSessionCtx;
|
|
@@ -1,52 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Pipeline session — pluggable STT → LLM → TTS orchestrator.
|
|
3
|
-
*
|
|
4
|
-
* Alternative to the S2S session (see `session.ts`) that drives three
|
|
5
|
-
* independent providers. A new partial STT event while the agent is replying
|
|
6
|
-
* triggers barge-in (aborts the LLM stream and cancels TTS).
|
|
7
|
-
*/
|
|
8
|
-
import type { LanguageModel } from "ai";
|
|
9
|
-
import type { AgentConfig, ExecuteTool, ToolSchema } from "../sdk/_internal-types.ts";
|
|
10
|
-
import type { ClientSink } from "../sdk/protocol.ts";
|
|
11
|
-
import type { SttOpener, TtsOpener } from "../sdk/providers.ts";
|
|
12
|
-
import { type Logger } from "./runtime-config.ts";
|
|
13
|
-
import type { Session } from "./session.ts";
|
|
14
|
-
/** Configuration options for {@link createPipelineSession}. */
|
|
15
|
-
export interface PipelineSessionOptions {
|
|
16
|
-
/** Unique session identifier. */
|
|
17
|
-
id: string;
|
|
18
|
-
/** Agent slug. */
|
|
19
|
-
agent: string;
|
|
20
|
-
/** Sink for wire events + audio back to the browser client. */
|
|
21
|
-
client: ClientSink;
|
|
22
|
-
/** Serializable agent config (name, system prompt, maxSteps, etc.). */
|
|
23
|
-
agentConfig: AgentConfig;
|
|
24
|
-
/** JSON Schema definitions for the agent's tools. */
|
|
25
|
-
toolSchemas: readonly ToolSchema[];
|
|
26
|
-
/** Optional natural-language guidance appended to the system prompt. */
|
|
27
|
-
toolGuidance?: readonly string[] | undefined;
|
|
28
|
-
/** Function to invoke tools by name. */
|
|
29
|
-
executeTool: ExecuteTool;
|
|
30
|
-
/** STT opener (resolved from an {@link SttProvider} descriptor). */
|
|
31
|
-
stt: SttOpener;
|
|
32
|
-
/** LLM provider (Vercel AI SDK `LanguageModel`). */
|
|
33
|
-
llm: LanguageModel;
|
|
34
|
-
/** TTS opener (resolved from a {@link TtsProvider} descriptor). */
|
|
35
|
-
tts: TtsOpener;
|
|
36
|
-
/** STT API key. */
|
|
37
|
-
sttApiKey: string;
|
|
38
|
-
/** TTS API key. */
|
|
39
|
-
ttsApiKey: string;
|
|
40
|
-
/** STT audio sample rate (PCM16, Hz). Defaults to {@link DEFAULT_STT_SAMPLE_RATE}. */
|
|
41
|
-
sttSampleRate?: number | undefined;
|
|
42
|
-
/** TTS audio sample rate (PCM16, Hz). Must match the client's playback AudioContext rate. Defaults to {@link DEFAULT_TTS_SAMPLE_RATE}. */
|
|
43
|
-
ttsSampleRate?: number | undefined;
|
|
44
|
-
/** Skip the initial greeting audio on connect (used for session resume). */
|
|
45
|
-
skipGreeting?: boolean | undefined;
|
|
46
|
-
/** Logger. Defaults to the console logger. */
|
|
47
|
-
logger?: Logger | undefined;
|
|
48
|
-
/** Sliding-window conversation history size. */
|
|
49
|
-
maxHistory?: number | undefined;
|
|
50
|
-
}
|
|
51
|
-
/** Create a pluggable-provider voice session. */
|
|
52
|
-
export declare function createPipelineSession(opts: PipelineSessionOptions): Session;
|