@alexkroman1/aai 1.7.1 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +11 -9
- package/CHANGELOG.md +10 -0
- package/dist/{_internal-types-CrnTi9Ew.js → _internal-types-CfOAbK6V.js} +22 -35
- package/dist/constants-y68COEGj.js +29 -0
- package/dist/host/_base64.d.ts +2 -0
- package/dist/host/_mock-ws.d.ts +0 -61
- package/dist/host/_pipeline-test-fakes.d.ts +7 -4
- package/dist/host/_run-code.d.ts +0 -25
- package/dist/host/_runtime-conformance.d.ts +3 -34
- package/dist/host/memory-vector.d.ts +0 -11
- package/dist/host/providers/resolve-kv.d.ts +0 -7
- package/dist/host/providers/resolve-vector.d.ts +0 -8
- package/dist/host/providers/stt/assemblyai.d.ts +0 -14
- package/dist/host/providers/stt/deepgram.d.ts +2 -14
- package/dist/host/providers/stt/soniox.d.ts +0 -22
- package/dist/host/providers/tts/rime.d.ts +10 -31
- package/dist/host/runtime-barrel.js +619 -630
- package/dist/host/runtime-config.d.ts +9 -6
- package/dist/host/runtime.d.ts +3 -0
- package/dist/host/to-vercel-tools.d.ts +3 -33
- package/dist/host/transports/openai-realtime-transport.d.ts +43 -0
- package/dist/host/unstorage-kv.d.ts +0 -26
- package/dist/index.js +3 -3
- package/dist/openai-realtime-cjPAHMMx.js +10 -0
- package/dist/sdk/_internal-types.d.ts +6 -55
- package/dist/sdk/allowed-hosts.d.ts +4 -3
- package/dist/sdk/constants.d.ts +4 -29
- package/dist/sdk/define.d.ts +7 -4
- package/dist/sdk/kv.d.ts +13 -37
- package/dist/sdk/manifest-barrel.js +1 -1
- package/dist/sdk/manifest.d.ts +8 -2
- package/dist/sdk/protocol.js +1 -1
- package/dist/sdk/providers/s2s/openai-realtime.d.ts +17 -0
- package/dist/sdk/providers/s2s-barrel.d.ts +9 -0
- package/dist/sdk/providers/s2s-barrel.js +2 -0
- package/dist/sdk/providers/tts/rime.d.ts +1 -1
- package/dist/sdk/providers.d.ts +6 -2
- package/dist/sdk/types.d.ts +7 -1
- package/dist/{types-KUgezM6u.js → types-DOWVZhb9.js} +1 -7
- package/dist/{ws-upgrade-BeOQ7fXL.js → ws-upgrade-CG8-by1n.js} +2 -3
- package/host/_base64.ts +9 -0
- package/host/_mock-ws.ts +0 -65
- package/host/_pipeline-test-fakes.ts +19 -31
- package/host/_run-code.ts +10 -53
- package/host/_runtime-conformance.ts +3 -44
- package/host/_test-utils.ts +20 -42
- package/host/builtin-tools.test.ts +127 -222
- package/host/builtin-tools.ts +6 -10
- package/host/cleanup.test.ts +30 -73
- package/host/integration/pipeline-reference.integration.test.ts +12 -17
- package/host/integration.test.ts +0 -7
- package/host/memory-vector.test.ts +3 -1
- package/host/memory-vector.ts +16 -21
- package/host/pinecone-vector.test.ts +14 -17
- package/host/pinecone-vector.ts +10 -19
- package/host/providers/providers.test-d.ts +5 -3
- package/host/providers/resolve-kv.ts +23 -41
- package/host/providers/resolve-vector.ts +3 -12
- package/host/providers/resolve.test.ts +15 -28
- package/host/providers/resolve.ts +24 -24
- package/host/providers/stt/assemblyai.test.ts +2 -14
- package/host/providers/stt/assemblyai.ts +12 -35
- package/host/providers/stt/deepgram.test.ts +23 -83
- package/host/providers/stt/deepgram.ts +15 -40
- package/host/providers/stt/elevenlabs.test.ts +26 -38
- package/host/providers/stt/elevenlabs.ts +10 -9
- package/host/providers/stt/soniox.test.ts +35 -85
- package/host/providers/stt/soniox.ts +8 -53
- package/host/providers/tts/cartesia.test.ts +19 -58
- package/host/providers/tts/cartesia.ts +36 -66
- package/host/providers/tts/rime.test.ts +12 -38
- package/host/providers/tts/rime.ts +23 -86
- package/host/runtime-config.test.ts +9 -9
- package/host/runtime-config.ts +16 -22
- package/host/runtime.test.ts +111 -73
- package/host/runtime.ts +138 -86
- package/host/s2s.test.ts +92 -191
- package/host/s2s.ts +55 -49
- package/host/server-shutdown.test.ts +9 -30
- package/host/server.test.ts +2 -13
- package/host/server.ts +85 -100
- package/host/session-core.test.ts +15 -30
- package/host/session-core.ts +10 -13
- package/host/session-prompt.test.ts +1 -5
- package/host/to-vercel-tools.test.ts +53 -72
- package/host/to-vercel-tools.ts +9 -39
- package/host/tool-executor.test.ts +25 -51
- package/host/tool-executor.ts +18 -12
- package/host/transports/openai-realtime-transport.test.ts +371 -0
- package/host/transports/openai-realtime-transport.ts +319 -0
- package/host/transports/pipeline-transport.test.ts +125 -298
- package/host/transports/pipeline-transport.ts +20 -68
- package/host/transports/s2s-transport-fixtures.test.ts +31 -92
- package/host/transports/s2s-transport.test.ts +65 -134
- package/host/transports/s2s-transport.ts +15 -43
- package/host/transports/types.test.ts +4 -8
- package/host/unstorage-kv.test.ts +3 -2
- package/host/unstorage-kv.ts +5 -35
- package/host/ws-handler.test.ts +72 -176
- package/host/ws-handler.ts +6 -12
- package/package.json +6 -1
- package/sdk/__snapshots__/exports.test.ts.snap +7 -0
- package/sdk/__snapshots__/schema-shapes.test.ts.snap +1 -0
- package/sdk/_internal-types.test.ts +6 -9
- package/sdk/_internal-types.ts +16 -57
- package/sdk/_test-matchers.ts +25 -15
- package/sdk/allowed-hosts.test.ts +50 -114
- package/sdk/allowed-hosts.ts +8 -14
- package/sdk/constants.ts +5 -52
- package/sdk/define.test.ts +7 -6
- package/sdk/define.ts +7 -3
- package/sdk/exports.test.ts +6 -1
- package/sdk/kv.ts +13 -37
- package/sdk/manifest.test-d.ts +5 -0
- package/sdk/manifest.test.ts +61 -9
- package/sdk/manifest.ts +11 -11
- package/sdk/protocol-compat.test.ts +66 -98
- package/sdk/protocol-snapshot.test.ts +2 -16
- package/sdk/protocol.test.ts +13 -22
- package/sdk/providers/s2s/openai-realtime.ts +36 -0
- package/sdk/providers/s2s-barrel.ts +12 -0
- package/sdk/providers/tts/rime.ts +1 -1
- package/sdk/providers.ts +24 -5
- package/sdk/schema-alignment.test.ts +25 -73
- package/sdk/schema-shapes.test.ts +1 -29
- package/sdk/system-prompt.test.ts +0 -1
- package/sdk/system-prompt.ts +17 -19
- package/sdk/types-inference.test.ts +10 -36
- package/sdk/types.ts +7 -0
- package/sdk/ws-upgrade.test.ts +24 -23
- package/sdk/ws-upgrade.ts +2 -3
- package/tsdown.config.ts +8 -11
- package/dist/constants-C2nirZUI.js +0 -54
|
@@ -85,16 +85,17 @@ export function openElevenLabs(opts: ElevenLabsOptions = {}): SttOpener {
|
|
|
85
85
|
const emitter: Emitter<SttEvents> = createNanoEvents<SttEvents>();
|
|
86
86
|
let closed = false;
|
|
87
87
|
|
|
88
|
-
|
|
88
|
+
function emitTranscript(event: "partial" | "final", text: string | undefined) {
|
|
89
89
|
if (closed) return;
|
|
90
|
-
|
|
91
|
-
|
|
90
|
+
if (text && text.length > 0) emitter.emit(event, text);
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
connection.on(RealtimeEvents.PARTIAL_TRANSCRIPT, (msg) => {
|
|
94
|
+
emitTranscript("partial", msg.text);
|
|
92
95
|
});
|
|
93
96
|
|
|
94
97
|
connection.on(RealtimeEvents.COMMITTED_TRANSCRIPT, (msg) => {
|
|
95
|
-
|
|
96
|
-
const text = msg.text ?? "";
|
|
97
|
-
if (text.length > 0) emitter.emit("final", text);
|
|
98
|
+
emitTranscript("final", msg.text);
|
|
98
99
|
});
|
|
99
100
|
|
|
100
101
|
connection.on(RealtimeEvents.ERROR, (payload) => {
|
|
@@ -111,15 +112,15 @@ export function openElevenLabs(opts: ElevenLabsOptions = {}): SttOpener {
|
|
|
111
112
|
emitter.emit("error", makeSttError("stt_auth_failed", msg.error));
|
|
112
113
|
});
|
|
113
114
|
|
|
114
|
-
|
|
115
|
+
async function close(): Promise<void> {
|
|
115
116
|
if (closed) return;
|
|
116
117
|
closed = true;
|
|
117
118
|
try {
|
|
118
119
|
connection.close();
|
|
119
120
|
} catch {
|
|
120
|
-
//
|
|
121
|
+
// Already tearing down — ignore close errors.
|
|
121
122
|
}
|
|
122
|
-
}
|
|
123
|
+
}
|
|
123
124
|
|
|
124
125
|
if (openOpts.signal.aborted) {
|
|
125
126
|
void close();
|
|
@@ -5,20 +5,6 @@ import { describe, expect, test, vi } from "vitest";
|
|
|
5
5
|
import { flush } from "../../_test-utils.ts";
|
|
6
6
|
import { openSoniox } from "./soniox.ts";
|
|
7
7
|
|
|
8
|
-
// ---------------------------------------------------------------------------
|
|
9
|
-
// Mock the `ws` package. Each FakeWS:
|
|
10
|
-
// - extends EventEmitter for `on`/`off`/`once` semantics that match the
|
|
11
|
-
// real `ws.WebSocket` API
|
|
12
|
-
// - exposes `readyState` initialised to OPEN once "open" fires
|
|
13
|
-
// - records sent frames so tests can assert on them
|
|
14
|
-
// - exposes `_fire(ev, data)` so tests inject incoming server frames
|
|
15
|
-
//
|
|
16
|
-
// Vitest hoists `vi.mock` to module top, so the factory can't reference
|
|
17
|
-
// outer top-level declarations. `vi.hoisted` runs even earlier and lets
|
|
18
|
-
// us share `FakeWS` + the `latest` capture between the factory and the
|
|
19
|
-
// test bodies.
|
|
20
|
-
// ---------------------------------------------------------------------------
|
|
21
|
-
|
|
22
8
|
interface FakeWSInstance {
|
|
23
9
|
readyState: number;
|
|
24
10
|
sent: Array<string | Uint8Array>;
|
|
@@ -32,6 +18,7 @@ interface FakeWSInstance {
|
|
|
32
18
|
|
|
33
19
|
type Listener = (...args: unknown[]) => void;
|
|
34
20
|
|
|
21
|
+
// `vi.mock` is hoisted above top-level decls, so share state via `vi.hoisted`.
|
|
35
22
|
const { latest, FakeWS } = vi.hoisted(() => {
|
|
36
23
|
const latestRef: { ws: FakeWSInstance | undefined } = { ws: undefined };
|
|
37
24
|
class FakeWSImpl implements FakeWSInstance {
|
|
@@ -87,37 +74,37 @@ const { latest, FakeWS } = vi.hoisted(() => {
|
|
|
87
74
|
|
|
88
75
|
vi.mock("ws", () => ({ default: FakeWS, WebSocket: FakeWS }));
|
|
89
76
|
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
77
|
+
interface OpenSessionOpts {
|
|
78
|
+
apiKey?: string;
|
|
79
|
+
languageHints?: string[];
|
|
80
|
+
model?: string;
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
async function openSession(opts: OpenSessionOpts = {}): Promise<{
|
|
93
84
|
session: import("../../../sdk/providers.ts").SttSession;
|
|
94
85
|
ws: FakeWSInstance;
|
|
95
86
|
controller: AbortController;
|
|
96
87
|
}> {
|
|
97
88
|
latest.ws = undefined;
|
|
98
|
-
const
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
...(opts.languageHints ? { languageHints: opts.languageHints } : {}),
|
|
103
|
-
}
|
|
104
|
-
: {},
|
|
105
|
-
);
|
|
89
|
+
const openerOpts: { model?: string; languageHints?: string[] } = {};
|
|
90
|
+
if (opts.model) openerOpts.model = opts.model;
|
|
91
|
+
if (opts.languageHints) openerOpts.languageHints = opts.languageHints;
|
|
92
|
+
const opener = openSoniox(openerOpts);
|
|
106
93
|
const controller = new AbortController();
|
|
107
94
|
const session = await opener.open({
|
|
108
95
|
sampleRate: 16_000,
|
|
109
96
|
apiKey: opts.apiKey ?? "test-key",
|
|
110
97
|
signal: controller.signal,
|
|
111
98
|
});
|
|
112
|
-
|
|
113
|
-
// adapter's `await waitForOpen(ws)` already drained it, so `latest.ws`
|
|
114
|
-
// is fully wired by now. Capture into a local const so TS narrows the
|
|
115
|
-
// type — direct property access on a mutable ref keeps the union.
|
|
116
|
-
const ws: FakeWSInstance | undefined = latest.ws;
|
|
99
|
+
const ws = latest.ws;
|
|
117
100
|
if (!ws) throw new Error("no fake ws captured");
|
|
118
101
|
return { session, ws, controller };
|
|
119
102
|
}
|
|
120
103
|
|
|
104
|
+
function frame(payload: unknown): Buffer {
|
|
105
|
+
return Buffer.from(JSON.stringify(payload));
|
|
106
|
+
}
|
|
107
|
+
|
|
121
108
|
describe("Soniox real-time STT adapter", () => {
|
|
122
109
|
test("openSoniox() returns an opener with name 'soniox'", () => {
|
|
123
110
|
expect(openSoniox({}).name).toBe("soniox");
|
|
@@ -166,14 +153,12 @@ describe("Soniox real-time STT adapter", () => {
|
|
|
166
153
|
|
|
167
154
|
ws._fire(
|
|
168
155
|
"message",
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
}),
|
|
176
|
-
),
|
|
156
|
+
frame({
|
|
157
|
+
tokens: [
|
|
158
|
+
{ text: "hel", is_final: false },
|
|
159
|
+
{ text: "lo", is_final: false },
|
|
160
|
+
],
|
|
161
|
+
}),
|
|
177
162
|
);
|
|
178
163
|
|
|
179
164
|
await flush();
|
|
@@ -188,31 +173,19 @@ describe("Soniox real-time STT adapter", () => {
|
|
|
188
173
|
session.on("final", (t) => finals.push(t));
|
|
189
174
|
session.on("partial", (t) => partials.push(t));
|
|
190
175
|
|
|
191
|
-
// Frame 1: only final tokens — buffered, NOT yet emitted.
|
|
192
176
|
ws._fire(
|
|
193
177
|
"message",
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
}),
|
|
201
|
-
),
|
|
178
|
+
frame({
|
|
179
|
+
tokens: [
|
|
180
|
+
{ text: "hello", is_final: true },
|
|
181
|
+
{ text: " world", is_final: true },
|
|
182
|
+
],
|
|
183
|
+
}),
|
|
202
184
|
);
|
|
203
185
|
await flush();
|
|
204
186
|
expect(finals).toEqual([]);
|
|
205
187
|
|
|
206
|
-
|
|
207
|
-
// and emits the new partial.
|
|
208
|
-
ws._fire(
|
|
209
|
-
"message",
|
|
210
|
-
Buffer.from(
|
|
211
|
-
JSON.stringify({
|
|
212
|
-
tokens: [{ text: "how", is_final: false }],
|
|
213
|
-
}),
|
|
214
|
-
),
|
|
215
|
-
);
|
|
188
|
+
ws._fire("message", frame({ tokens: [{ text: "how", is_final: false }] }));
|
|
216
189
|
await flush();
|
|
217
190
|
expect(finals).toEqual(["hello world"]);
|
|
218
191
|
expect(partials).toEqual(["how"]);
|
|
@@ -224,15 +197,7 @@ describe("Soniox real-time STT adapter", () => {
|
|
|
224
197
|
const finals: string[] = [];
|
|
225
198
|
session.on("final", (t) => finals.push(t));
|
|
226
199
|
|
|
227
|
-
ws._fire(
|
|
228
|
-
"message",
|
|
229
|
-
Buffer.from(
|
|
230
|
-
JSON.stringify({
|
|
231
|
-
tokens: [{ text: "bye", is_final: true }],
|
|
232
|
-
finished: true,
|
|
233
|
-
}),
|
|
234
|
-
),
|
|
235
|
-
);
|
|
200
|
+
ws._fire("message", frame({ tokens: [{ text: "bye", is_final: true }], finished: true }));
|
|
236
201
|
|
|
237
202
|
await flush();
|
|
238
203
|
expect(finals).toEqual(["bye"]);
|
|
@@ -244,16 +209,9 @@ describe("Soniox real-time STT adapter", () => {
|
|
|
244
209
|
const finals: string[] = [];
|
|
245
210
|
session.on("final", (t) => finals.push(t));
|
|
246
211
|
|
|
247
|
-
ws._fire(
|
|
248
|
-
"message",
|
|
249
|
-
Buffer.from(
|
|
250
|
-
JSON.stringify({
|
|
251
|
-
tokens: [{ text: "trailing", is_final: true }],
|
|
252
|
-
}),
|
|
253
|
-
),
|
|
254
|
-
);
|
|
212
|
+
ws._fire("message", frame({ tokens: [{ text: "trailing", is_final: true }] }));
|
|
255
213
|
await flush();
|
|
256
|
-
expect(finals).toEqual([]);
|
|
214
|
+
expect(finals).toEqual([]);
|
|
257
215
|
|
|
258
216
|
await session.close();
|
|
259
217
|
expect(finals).toEqual(["trailing"]);
|
|
@@ -264,10 +222,7 @@ describe("Soniox real-time STT adapter", () => {
|
|
|
264
222
|
const errors: { code: string; message: string }[] = [];
|
|
265
223
|
session.on("error", (e) => errors.push({ code: e.code, message: e.message }));
|
|
266
224
|
|
|
267
|
-
ws._fire(
|
|
268
|
-
"message",
|
|
269
|
-
Buffer.from(JSON.stringify({ error_code: 503, error_message: "service unavailable" })),
|
|
270
|
-
);
|
|
225
|
+
ws._fire("message", frame({ error_code: 503, error_message: "service unavailable" }));
|
|
271
226
|
|
|
272
227
|
await flush();
|
|
273
228
|
expect(errors).toHaveLength(1);
|
|
@@ -325,12 +280,7 @@ describe("Soniox real-time STT adapter", () => {
|
|
|
325
280
|
await session.close();
|
|
326
281
|
await session.close();
|
|
327
282
|
|
|
328
|
-
ws._fire(
|
|
329
|
-
"message",
|
|
330
|
-
Buffer.from(
|
|
331
|
-
JSON.stringify({ tokens: [{ text: "ignored", is_final: true }], finished: true }),
|
|
332
|
-
),
|
|
333
|
-
);
|
|
283
|
+
ws._fire("message", frame({ tokens: [{ text: "ignored", is_final: true }], finished: true }));
|
|
334
284
|
|
|
335
285
|
await flush();
|
|
336
286
|
expect(finals).toEqual([]);
|
|
@@ -1,25 +1,4 @@
|
|
|
1
1
|
// Copyright 2026 the AAI authors. MIT license.
|
|
2
|
-
/**
|
|
3
|
-
* Soniox real-time STT opener (host-only).
|
|
4
|
-
*
|
|
5
|
-
* The user-facing descriptor factory (`soniox(...)`) lives in
|
|
6
|
-
* `sdk/providers/stt/soniox.ts`. This module is the host-side
|
|
7
|
-
* counterpart: it takes the descriptor options + an API key and
|
|
8
|
-
* returns an {@link SttOpener} that the pipeline session drives.
|
|
9
|
-
*
|
|
10
|
-
* Soniox's published JS client (`@soniox/speech-to-text-web`) is
|
|
11
|
-
* browser-only — it depends on `MediaRecorder` and `getUserMedia`. For
|
|
12
|
-
* server-side use we talk to the WebSocket directly:
|
|
13
|
-
* `wss://stt-rt.soniox.com/transcribe-websocket`
|
|
14
|
-
*
|
|
15
|
-
* Wire format:
|
|
16
|
-
* - First text frame: JSON config with api_key, model, audio_format,
|
|
17
|
-
* sample_rate, num_channels (and optional language hints).
|
|
18
|
-
* - Subsequent binary frames: 16-bit signed little-endian PCM audio.
|
|
19
|
-
* - Server replies: JSON `{ tokens: [{ text, is_final }] }` messages.
|
|
20
|
-
* Final tokens accumulate; non-final tokens are a rolling preview.
|
|
21
|
-
* - On error: `{ error_code, error_message }`.
|
|
22
|
-
*/
|
|
23
2
|
|
|
24
3
|
import { createNanoEvents, type Emitter } from "nanoevents";
|
|
25
4
|
import WebSocket from "ws";
|
|
@@ -32,9 +11,10 @@ import {
|
|
|
32
11
|
type SttSession,
|
|
33
12
|
} from "../../../sdk/providers.ts";
|
|
34
13
|
|
|
14
|
+
// `@soniox/speech-to-text-web` is browser-only (MediaRecorder/getUserMedia),
|
|
15
|
+
// so we speak the WebSocket protocol directly.
|
|
35
16
|
const SONIOX_WS_URL = "wss://stt-rt.soniox.com/transcribe-websocket";
|
|
36
17
|
|
|
37
|
-
/** Soniox token shape from the wire protocol. */
|
|
38
18
|
interface SonioxToken {
|
|
39
19
|
text?: string;
|
|
40
20
|
is_final?: boolean;
|
|
@@ -47,10 +27,6 @@ interface SonioxResponse {
|
|
|
47
27
|
error_message?: string;
|
|
48
28
|
}
|
|
49
29
|
|
|
50
|
-
/**
|
|
51
|
-
* Walk a batch of Soniox tokens, sending finals into `appendFinal` and
|
|
52
|
-
* returning the concatenated non-finals as a rolling preview string.
|
|
53
|
-
*/
|
|
54
30
|
function consumeTokens(tokens: SonioxToken[], appendFinal: (text: string) => void): string {
|
|
55
31
|
let nonFinal = "";
|
|
56
32
|
for (const tok of tokens) {
|
|
@@ -65,14 +41,13 @@ function consumeTokens(tokens: SonioxToken[], appendFinal: (text: string) => voi
|
|
|
65
41
|
return nonFinal;
|
|
66
42
|
}
|
|
67
43
|
|
|
68
|
-
/** Resolve once the WebSocket opens; reject on the first error. */
|
|
69
44
|
function waitForOpen(ws: WebSocket): Promise<void> {
|
|
70
45
|
return new Promise((resolve, reject) => {
|
|
71
|
-
const onOpen = () => {
|
|
46
|
+
const onOpen = (): void => {
|
|
72
47
|
ws.off("error", onErr);
|
|
73
48
|
resolve();
|
|
74
49
|
};
|
|
75
|
-
const onErr = (err: Error) => {
|
|
50
|
+
const onErr = (err: Error): void => {
|
|
76
51
|
ws.off("open", onOpen);
|
|
77
52
|
reject(err);
|
|
78
53
|
};
|
|
@@ -81,7 +56,6 @@ function waitForOpen(ws: WebSocket): Promise<void> {
|
|
|
81
56
|
});
|
|
82
57
|
}
|
|
83
58
|
|
|
84
|
-
/** Build the initial JSON config frame for a Soniox session. */
|
|
85
59
|
function buildConfigFrame(
|
|
86
60
|
apiKey: string,
|
|
87
61
|
opts: SonioxOptions,
|
|
@@ -100,7 +74,6 @@ function buildConfigFrame(
|
|
|
100
74
|
return config;
|
|
101
75
|
}
|
|
102
76
|
|
|
103
|
-
/** Parse a Soniox text frame into a {@link SonioxResponse}; returns null on garbage. */
|
|
104
77
|
function parseFrame(raw: WebSocket.RawData): SonioxResponse | null {
|
|
105
78
|
try {
|
|
106
79
|
return JSON.parse(raw.toString()) as SonioxResponse;
|
|
@@ -109,12 +82,6 @@ function parseFrame(raw: WebSocket.RawData): SonioxResponse | null {
|
|
|
109
82
|
}
|
|
110
83
|
}
|
|
111
84
|
|
|
112
|
-
/**
|
|
113
|
-
* Handle one server response. Emits `error`, `final`, and `partial` events
|
|
114
|
-
* onto `emitter` based on the token batch and the running `finalBuf`. The
|
|
115
|
-
* caller owns `finalBuf` so it survives across messages and can be flushed
|
|
116
|
-
* on close.
|
|
117
|
-
*/
|
|
118
85
|
function handleResponse(
|
|
119
86
|
res: SonioxResponse,
|
|
120
87
|
emitter: Emitter<SttEvents>,
|
|
@@ -134,10 +101,8 @@ function handleResponse(
|
|
|
134
101
|
const nonFinal = consumeTokens(res.tokens, (text) => {
|
|
135
102
|
finalBuf.value += text;
|
|
136
103
|
});
|
|
137
|
-
//
|
|
138
|
-
//
|
|
139
|
-
// tokens into a single `final` event, matching what downstream pipeline
|
|
140
|
-
// session code expects.
|
|
104
|
+
// Batch contiguous finals into one `final` event by flushing only when
|
|
105
|
+
// a new non-final preview starts (or the session finishes).
|
|
141
106
|
if (finalBuf.value.length > 0 && (nonFinal.length > 0 || res.finished)) {
|
|
142
107
|
emitter.emit("final", finalBuf.value);
|
|
143
108
|
finalBuf.value = "";
|
|
@@ -147,7 +112,6 @@ function handleResponse(
|
|
|
147
112
|
}
|
|
148
113
|
}
|
|
149
114
|
|
|
150
|
-
/** Build an {@link SttOpener} from resolved Soniox descriptor options. */
|
|
151
115
|
export function openSoniox(opts: SonioxOptions = {}): SttOpener {
|
|
152
116
|
return {
|
|
153
117
|
name: "soniox",
|
|
@@ -163,11 +127,6 @@ export function openSoniox(opts: SonioxOptions = {}): SttOpener {
|
|
|
163
127
|
const ws = new WebSocket(SONIOX_WS_URL);
|
|
164
128
|
const emitter: Emitter<SttEvents> = createNanoEvents<SttEvents>();
|
|
165
129
|
let closed = false;
|
|
166
|
-
// Soniox emits final tokens once and non-final tokens repeatedly. We
|
|
167
|
-
// accumulate finals into a buffer flushed on each non-final boundary
|
|
168
|
-
// and forward non-finals as the rolling partial. Mirrors how the
|
|
169
|
-
// existing AssemblyAI/Deepgram openers map provider-specific token
|
|
170
|
-
// streams onto the SttEvents `partial`/`final` contract.
|
|
171
130
|
const finalBuf = { value: "" };
|
|
172
131
|
|
|
173
132
|
try {
|
|
@@ -179,7 +138,6 @@ export function openSoniox(opts: SonioxOptions = {}): SttOpener {
|
|
|
179
138
|
);
|
|
180
139
|
}
|
|
181
140
|
|
|
182
|
-
// Initial config frame (text). Sent first; audio binary frames follow.
|
|
183
141
|
ws.send(JSON.stringify(buildConfigFrame(apiKey, opts, openOpts.sampleRate)));
|
|
184
142
|
|
|
185
143
|
ws.on("message", (raw: WebSocket.RawData) => {
|
|
@@ -195,7 +153,6 @@ export function openSoniox(opts: SonioxOptions = {}): SttOpener {
|
|
|
195
153
|
|
|
196
154
|
ws.on("close", (code: number) => {
|
|
197
155
|
if (closed) return;
|
|
198
|
-
// 1000 = normal closure.
|
|
199
156
|
if (code !== 1000) {
|
|
200
157
|
emitter.emit("error", makeSttError("stt_stream_error", `socket closed ${code}`));
|
|
201
158
|
}
|
|
@@ -204,7 +161,6 @@ export function openSoniox(opts: SonioxOptions = {}): SttOpener {
|
|
|
204
161
|
const close = async (): Promise<void> => {
|
|
205
162
|
if (closed) return;
|
|
206
163
|
closed = true;
|
|
207
|
-
// Flush any trailing final tokens that arrived right before close.
|
|
208
164
|
if (finalBuf.value.length > 0) {
|
|
209
165
|
emitter.emit("final", finalBuf.value);
|
|
210
166
|
finalBuf.value = "";
|
|
@@ -212,7 +168,7 @@ export function openSoniox(opts: SonioxOptions = {}): SttOpener {
|
|
|
212
168
|
try {
|
|
213
169
|
ws.close();
|
|
214
170
|
} catch {
|
|
215
|
-
//
|
|
171
|
+
// Caller is tearing down; ws.close errors are not actionable.
|
|
216
172
|
}
|
|
217
173
|
};
|
|
218
174
|
|
|
@@ -225,8 +181,7 @@ export function openSoniox(opts: SonioxOptions = {}): SttOpener {
|
|
|
225
181
|
return {
|
|
226
182
|
sendAudio(pcm: Int16Array) {
|
|
227
183
|
if (closed || ws.readyState !== WebSocket.OPEN) return;
|
|
228
|
-
//
|
|
229
|
-
// hand it to the OS as a binary frame.
|
|
184
|
+
// Pass the underlying buffer to avoid a copy.
|
|
230
185
|
ws.send(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength), { binary: true });
|
|
231
186
|
},
|
|
232
187
|
on(event, fn) {
|
|
@@ -1,11 +1,9 @@
|
|
|
1
1
|
// Copyright 2025 the AAI authors. MIT license.
|
|
2
|
-
/** Unit test for the Cartesia TTS adapter. Mocks `@cartesia/cartesia-js`. */
|
|
3
2
|
|
|
4
3
|
import { beforeEach, describe, expect, test, vi } from "vitest";
|
|
5
4
|
import { flush } from "../../_test-utils.ts";
|
|
6
5
|
import { type CartesiaSession, openCartesia } from "./cartesia.ts";
|
|
7
6
|
|
|
8
|
-
// Recorded interactions on the fake `TTSWSContext` — one entry per method call.
|
|
9
7
|
interface RecordedSend {
|
|
10
8
|
kind: "send" | "cancel";
|
|
11
9
|
contextId: string;
|
|
@@ -17,7 +15,6 @@ interface RecordedSend {
|
|
|
17
15
|
|
|
18
16
|
const sends: RecordedSend[] = [];
|
|
19
17
|
|
|
20
|
-
/** Minimal shape of the request the adapter sends to Cartesia. */
|
|
21
18
|
interface FakeGenerationRequest {
|
|
22
19
|
transcript: string;
|
|
23
20
|
continue: boolean;
|
|
@@ -25,17 +22,12 @@ interface FakeGenerationRequest {
|
|
|
25
22
|
model_id?: string;
|
|
26
23
|
}
|
|
27
24
|
|
|
28
|
-
/**
|
|
29
|
-
* Fake `TTSWSContext`. Mirrors the fields the adapter touches:
|
|
30
|
-
* `contextId`, `send`, `cancel`.
|
|
31
|
-
*/
|
|
32
25
|
interface FakeContext {
|
|
33
26
|
contextId: string;
|
|
34
27
|
send(req: FakeGenerationRequest): Promise<void>;
|
|
35
28
|
cancel(): Promise<void>;
|
|
36
29
|
}
|
|
37
30
|
|
|
38
|
-
/** Fake `TTSWS`. EventEmitter-ish with a `_fire` test hook. */
|
|
39
31
|
interface FakeTTSWS {
|
|
40
32
|
contexts: FakeContext[];
|
|
41
33
|
context(opts: { contextId: string }): FakeContext;
|
|
@@ -45,7 +37,7 @@ interface FakeTTSWS {
|
|
|
45
37
|
}
|
|
46
38
|
|
|
47
39
|
vi.mock("@cartesia/cartesia-js", () => {
|
|
48
|
-
|
|
40
|
+
function makeWs(): FakeTTSWS {
|
|
49
41
|
const listeners = new Map<string, Array<(...args: unknown[]) => void>>();
|
|
50
42
|
const ws: FakeTTSWS = {
|
|
51
43
|
contexts: [],
|
|
@@ -75,7 +67,7 @@ vi.mock("@cartesia/cartesia-js", () => {
|
|
|
75
67
|
listeners.set(event, arr);
|
|
76
68
|
return ws;
|
|
77
69
|
},
|
|
78
|
-
close(
|
|
70
|
+
close() {
|
|
79
71
|
/* no-op */
|
|
80
72
|
},
|
|
81
73
|
_fire(event, payload) {
|
|
@@ -83,7 +75,7 @@ vi.mock("@cartesia/cartesia-js", () => {
|
|
|
83
75
|
},
|
|
84
76
|
};
|
|
85
77
|
return ws;
|
|
86
|
-
}
|
|
78
|
+
}
|
|
87
79
|
return {
|
|
88
80
|
Cartesia: class {
|
|
89
81
|
tts = {
|
|
@@ -93,6 +85,17 @@ vi.mock("@cartesia/cartesia-js", () => {
|
|
|
93
85
|
};
|
|
94
86
|
});
|
|
95
87
|
|
|
88
|
+
function expectedSend(contextId: string, transcript: string, cont: boolean): RecordedSend {
|
|
89
|
+
return {
|
|
90
|
+
kind: "send",
|
|
91
|
+
contextId,
|
|
92
|
+
transcript,
|
|
93
|
+
continue: cont,
|
|
94
|
+
language: "en",
|
|
95
|
+
model_id: "sonic-2",
|
|
96
|
+
};
|
|
97
|
+
}
|
|
98
|
+
|
|
96
99
|
beforeEach(() => {
|
|
97
100
|
sends.length = 0;
|
|
98
101
|
});
|
|
@@ -121,34 +124,11 @@ describe("cartesia TTS adapter", () => {
|
|
|
121
124
|
session.flush();
|
|
122
125
|
await flush();
|
|
123
126
|
|
|
124
|
-
// All three sends for turn 1 carry the same contextId — two deltas with
|
|
125
|
-
// continue: true, then an empty-transcript send with continue: false.
|
|
126
127
|
const turn1Sends = sends.filter((s) => s.contextId === turn1);
|
|
127
128
|
expect(turn1Sends).toEqual([
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
transcript: "hello",
|
|
132
|
-
continue: true,
|
|
133
|
-
language: "en",
|
|
134
|
-
model_id: "sonic-2",
|
|
135
|
-
},
|
|
136
|
-
{
|
|
137
|
-
kind: "send",
|
|
138
|
-
contextId: turn1,
|
|
139
|
-
transcript: " world",
|
|
140
|
-
continue: true,
|
|
141
|
-
language: "en",
|
|
142
|
-
model_id: "sonic-2",
|
|
143
|
-
},
|
|
144
|
-
{
|
|
145
|
-
kind: "send",
|
|
146
|
-
contextId: turn1,
|
|
147
|
-
transcript: "",
|
|
148
|
-
continue: false,
|
|
149
|
-
language: "en",
|
|
150
|
-
model_id: "sonic-2",
|
|
151
|
-
},
|
|
129
|
+
expectedSend(turn1, "hello", true),
|
|
130
|
+
expectedSend(turn1, " world", true),
|
|
131
|
+
expectedSend(turn1, "", false),
|
|
152
132
|
]);
|
|
153
133
|
|
|
154
134
|
// Rotation is deferred until the next sendText so Cartesia's late
|
|
@@ -156,21 +136,11 @@ describe("cartesia TTS adapter", () => {
|
|
|
156
136
|
// pass the context-id filter.
|
|
157
137
|
expect(session._currentContextId()).toBe(turn1);
|
|
158
138
|
|
|
159
|
-
// Subsequent sendText rotates to a fresh context.
|
|
160
139
|
session.sendText("next");
|
|
161
140
|
const turn2 = session._currentContextId();
|
|
162
141
|
expect(turn2).not.toBe(turn1);
|
|
163
142
|
await flush();
|
|
164
|
-
expect(sends.filter((s) => s.contextId === turn2)).toEqual([
|
|
165
|
-
{
|
|
166
|
-
kind: "send",
|
|
167
|
-
contextId: turn2,
|
|
168
|
-
transcript: "next",
|
|
169
|
-
continue: true,
|
|
170
|
-
language: "en",
|
|
171
|
-
model_id: "sonic-2",
|
|
172
|
-
},
|
|
173
|
-
]);
|
|
143
|
+
expect(sends.filter((s) => s.contextId === turn2)).toEqual([expectedSend(turn2, "next", true)]);
|
|
174
144
|
|
|
175
145
|
controller.abort();
|
|
176
146
|
await session.close();
|
|
@@ -191,16 +161,8 @@ describe("cartesia TTS adapter", () => {
|
|
|
191
161
|
|
|
192
162
|
await flush();
|
|
193
163
|
|
|
194
|
-
// We expect: send("hello", continue:true) on turn1, then cancel(turn1).
|
|
195
164
|
expect(sends).toEqual([
|
|
196
|
-
|
|
197
|
-
kind: "send",
|
|
198
|
-
contextId: turn1,
|
|
199
|
-
transcript: "hello",
|
|
200
|
-
continue: true,
|
|
201
|
-
language: "en",
|
|
202
|
-
model_id: "sonic-2",
|
|
203
|
-
},
|
|
165
|
+
expectedSend(turn1, "hello", true),
|
|
204
166
|
{ kind: "cancel", contextId: turn1 },
|
|
205
167
|
]);
|
|
206
168
|
|
|
@@ -209,7 +171,6 @@ describe("cartesia TTS adapter", () => {
|
|
|
209
171
|
// keep passing the filter until the next turn actually begins.
|
|
210
172
|
expect(session._currentContextId()).toBe(turn1);
|
|
211
173
|
|
|
212
|
-
// A subsequent sendText mints a fresh context for turn2.
|
|
213
174
|
session.sendText("again");
|
|
214
175
|
const turn2 = session._currentContextId();
|
|
215
176
|
expect(turn2).not.toBe(turn1);
|