@alexkroman1/aai 1.7.1 → 1.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +11 -9
- package/CHANGELOG.md +23 -0
- package/dist/{_internal-types-CrnTi9Ew.js → _internal-types-CfOAbK6V.js} +22 -35
- package/dist/constants-y68COEGj.js +29 -0
- package/dist/host/_base64.d.ts +2 -0
- package/dist/host/_mock-ws.d.ts +0 -61
- package/dist/host/_pipeline-test-fakes.d.ts +7 -4
- package/dist/host/_run-code.d.ts +0 -25
- package/dist/host/_runtime-conformance.d.ts +3 -34
- package/dist/host/memory-vector.d.ts +0 -11
- package/dist/host/providers/resolve-kv.d.ts +0 -7
- package/dist/host/providers/resolve-vector.d.ts +0 -8
- package/dist/host/providers/stt/assemblyai.d.ts +0 -14
- package/dist/host/providers/stt/deepgram.d.ts +2 -14
- package/dist/host/providers/stt/soniox.d.ts +0 -22
- package/dist/host/providers/tts/rime.d.ts +10 -31
- package/dist/host/runtime-barrel.js +670 -630
- package/dist/host/runtime-config.d.ts +9 -6
- package/dist/host/runtime.d.ts +3 -0
- package/dist/host/to-vercel-tools.d.ts +3 -33
- package/dist/host/transports/openai-realtime-transport.d.ts +45 -0
- package/dist/host/unstorage-kv.d.ts +0 -26
- package/dist/index.js +3 -3
- package/dist/openai-realtime-cjPAHMMx.js +10 -0
- package/dist/sdk/_internal-types.d.ts +6 -55
- package/dist/sdk/allowed-hosts.d.ts +4 -3
- package/dist/sdk/constants.d.ts +4 -29
- package/dist/sdk/define.d.ts +7 -4
- package/dist/sdk/kv.d.ts +13 -37
- package/dist/sdk/manifest-barrel.js +1 -1
- package/dist/sdk/manifest.d.ts +8 -2
- package/dist/sdk/protocol.js +1 -1
- package/dist/sdk/providers/s2s/openai-realtime.d.ts +17 -0
- package/dist/sdk/providers/s2s-barrel.d.ts +9 -0
- package/dist/sdk/providers/s2s-barrel.js +2 -0
- package/dist/sdk/providers/tts/rime.d.ts +1 -1
- package/dist/sdk/providers.d.ts +6 -2
- package/dist/sdk/types.d.ts +7 -1
- package/dist/{types-KUgezM6u.js → types-DOWVZhb9.js} +1 -7
- package/dist/{ws-upgrade-BeOQ7fXL.js → ws-upgrade-CG8-by1n.js} +2 -3
- package/host/_base64.ts +9 -0
- package/host/_mock-ws.ts +0 -65
- package/host/_pipeline-test-fakes.ts +19 -31
- package/host/_run-code.ts +10 -53
- package/host/_runtime-conformance.ts +3 -44
- package/host/_test-utils.ts +20 -42
- package/host/builtin-tools.test.ts +127 -222
- package/host/builtin-tools.ts +6 -10
- package/host/cleanup.test.ts +30 -73
- package/host/integration/pipeline-reference.integration.test.ts +12 -17
- package/host/integration.test.ts +0 -7
- package/host/memory-vector.test.ts +3 -1
- package/host/memory-vector.ts +16 -21
- package/host/pinecone-vector.test.ts +14 -17
- package/host/pinecone-vector.ts +10 -19
- package/host/providers/providers.test-d.ts +5 -3
- package/host/providers/resolve-kv.ts +23 -41
- package/host/providers/resolve-vector.ts +3 -12
- package/host/providers/resolve.test.ts +15 -28
- package/host/providers/resolve.ts +24 -24
- package/host/providers/stt/assemblyai.test.ts +2 -14
- package/host/providers/stt/assemblyai.ts +12 -35
- package/host/providers/stt/deepgram.test.ts +23 -83
- package/host/providers/stt/deepgram.ts +15 -40
- package/host/providers/stt/elevenlabs.test.ts +26 -38
- package/host/providers/stt/elevenlabs.ts +10 -9
- package/host/providers/stt/soniox.test.ts +35 -85
- package/host/providers/stt/soniox.ts +8 -53
- package/host/providers/tts/cartesia.test.ts +19 -58
- package/host/providers/tts/cartesia.ts +36 -66
- package/host/providers/tts/rime.test.ts +12 -38
- package/host/providers/tts/rime.ts +23 -86
- package/host/runtime-config.test.ts +9 -9
- package/host/runtime-config.ts +16 -22
- package/host/runtime.test.ts +111 -73
- package/host/runtime.ts +139 -86
- package/host/s2s.test.ts +92 -191
- package/host/s2s.ts +55 -49
- package/host/server-shutdown.test.ts +9 -30
- package/host/server.test.ts +2 -13
- package/host/server.ts +85 -100
- package/host/session-core.test.ts +15 -30
- package/host/session-core.ts +10 -13
- package/host/session-prompt.test.ts +1 -5
- package/host/to-vercel-tools.test.ts +53 -72
- package/host/to-vercel-tools.ts +9 -39
- package/host/tool-executor.test.ts +25 -51
- package/host/tool-executor.ts +18 -12
- package/host/transports/openai-realtime-transport.test.ts +439 -0
- package/host/transports/openai-realtime-transport.ts +371 -0
- package/host/transports/pipeline-transport.test.ts +125 -298
- package/host/transports/pipeline-transport.ts +20 -68
- package/host/transports/s2s-transport-fixtures.test.ts +31 -92
- package/host/transports/s2s-transport.test.ts +65 -134
- package/host/transports/s2s-transport.ts +15 -43
- package/host/transports/types.test.ts +4 -8
- package/host/unstorage-kv.test.ts +3 -2
- package/host/unstorage-kv.ts +5 -35
- package/host/ws-handler.test.ts +72 -176
- package/host/ws-handler.ts +6 -12
- package/package.json +6 -1
- package/sdk/__snapshots__/exports.test.ts.snap +7 -0
- package/sdk/__snapshots__/schema-shapes.test.ts.snap +1 -0
- package/sdk/_internal-types.test.ts +6 -9
- package/sdk/_internal-types.ts +16 -57
- package/sdk/_test-matchers.ts +25 -15
- package/sdk/allowed-hosts.test.ts +50 -114
- package/sdk/allowed-hosts.ts +8 -14
- package/sdk/constants.ts +5 -52
- package/sdk/define.test.ts +7 -6
- package/sdk/define.ts +7 -3
- package/sdk/exports.test.ts +6 -1
- package/sdk/kv.ts +13 -37
- package/sdk/manifest.test-d.ts +5 -0
- package/sdk/manifest.test.ts +61 -9
- package/sdk/manifest.ts +11 -11
- package/sdk/protocol-compat.test.ts +66 -98
- package/sdk/protocol-snapshot.test.ts +2 -16
- package/sdk/protocol.test.ts +13 -22
- package/sdk/providers/s2s/openai-realtime.ts +36 -0
- package/sdk/providers/s2s-barrel.ts +12 -0
- package/sdk/providers/tts/rime.ts +1 -1
- package/sdk/providers.ts +24 -5
- package/sdk/schema-alignment.test.ts +25 -73
- package/sdk/schema-shapes.test.ts +1 -29
- package/sdk/system-prompt.test.ts +0 -1
- package/sdk/system-prompt.ts +17 -19
- package/sdk/types-inference.test.ts +10 -36
- package/sdk/types.ts +7 -0
- package/sdk/ws-upgrade.test.ts +24 -23
- package/sdk/ws-upgrade.ts +2 -3
- package/tsdown.config.ts +8 -11
- package/dist/constants-C2nirZUI.js +0 -54
|
@@ -92,41 +92,32 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
|
|
|
92
92
|
const emitter: Emitter<TtsEvents> = createNanoEvents<TtsEvents>();
|
|
93
93
|
let closed = false;
|
|
94
94
|
|
|
95
|
-
|
|
95
|
+
const audioConfig = {
|
|
96
|
+
model_id: model,
|
|
97
|
+
voice: { mode: "id" as const, id: voice },
|
|
98
|
+
output_format: {
|
|
99
|
+
container: "raw" as const,
|
|
100
|
+
encoding: "pcm_s16le" as const,
|
|
101
|
+
sample_rate: sampleRate,
|
|
102
|
+
},
|
|
103
|
+
};
|
|
104
|
+
const baseRequest = { ...audioConfig, language };
|
|
105
|
+
|
|
96
106
|
const mintContext = (): TTSWSContext =>
|
|
97
|
-
ws.context({
|
|
98
|
-
model_id: model,
|
|
99
|
-
voice: { mode: "id", id: voice },
|
|
100
|
-
output_format: {
|
|
101
|
-
container: "raw",
|
|
102
|
-
encoding: "pcm_s16le",
|
|
103
|
-
sample_rate: sampleRate,
|
|
104
|
-
},
|
|
105
|
-
contextId: randomUUID(),
|
|
106
|
-
});
|
|
107
|
+
ws.context({ ...audioConfig, contextId: randomUUID() });
|
|
107
108
|
|
|
108
109
|
let context = mintContext();
|
|
109
|
-
/**
|
|
110
|
-
* `doneEmitted` guards against emitting `done` more than once per turn.
|
|
111
|
-
* Reset whenever a fresh context is minted (i.e. at turn boundaries).
|
|
112
|
-
*/
|
|
113
110
|
let doneEmitted = false;
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
* that late audio chunks + Cartesia's real `done` event (both tagged
|
|
118
|
-
* with the flushed context's id) still pass the filter below. Rotating
|
|
119
|
-
* eagerly would silently drop all audio still in flight.
|
|
120
|
-
*/
|
|
111
|
+
// Defer minting after flush/cancel until next sendText so late audio
|
|
112
|
+
// chunks + Cartesia's real `done` (tagged with the flushed context's id)
|
|
113
|
+
// still pass the filter. Rotating eagerly would drop in-flight audio.
|
|
121
114
|
let rotatePending = false;
|
|
122
|
-
const
|
|
115
|
+
const rotateIfPending = () => {
|
|
116
|
+
if (!rotatePending) return;
|
|
123
117
|
context = mintContext();
|
|
124
118
|
doneEmitted = false;
|
|
125
119
|
rotatePending = false;
|
|
126
120
|
};
|
|
127
|
-
const rotateIfPending = () => {
|
|
128
|
-
if (rotatePending) rotateContext();
|
|
129
|
-
};
|
|
130
121
|
const emitDoneOnce = () => {
|
|
131
122
|
if (doneEmitted || closed) return;
|
|
132
123
|
doneEmitted = true;
|
|
@@ -136,12 +127,11 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
|
|
|
136
127
|
// TTSWS fires events globally across all contexts on the shared
|
|
137
128
|
// socket; filter by the currently-active context_id.
|
|
138
129
|
ws.on("chunk", (event) => {
|
|
139
|
-
if (closed) return;
|
|
140
|
-
if (event.context_id !== context.contextId) return;
|
|
130
|
+
if (closed || event.context_id !== context.contextId) return;
|
|
141
131
|
const buf = event.audio;
|
|
142
132
|
if (!buf || buf.byteLength === 0) return;
|
|
143
|
-
//
|
|
144
|
-
//
|
|
133
|
+
// Defensive: trim odd byte counts so `new Int16Array` never throws
|
|
134
|
+
// on a misaligned length.
|
|
145
135
|
const evenBytes = buf.byteLength - (buf.byteLength % 2);
|
|
146
136
|
if (evenBytes === 0) return;
|
|
147
137
|
const pcm = new Int16Array(buf.buffer.slice(buf.byteOffset, buf.byteOffset + evenBytes));
|
|
@@ -149,8 +139,7 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
|
|
|
149
139
|
});
|
|
150
140
|
|
|
151
141
|
ws.on("done", (event) => {
|
|
152
|
-
if (closed) return;
|
|
153
|
-
if (event.context_id !== context.contextId) return;
|
|
142
|
+
if (closed || event.context_id !== context.contextId) return;
|
|
154
143
|
emitDoneOnce();
|
|
155
144
|
});
|
|
156
145
|
|
|
@@ -165,38 +154,25 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
|
|
|
165
154
|
try {
|
|
166
155
|
ws.close({ code: 1000, reason: "client close" });
|
|
167
156
|
} catch {
|
|
168
|
-
//
|
|
157
|
+
// Caller has already decided to tear down.
|
|
169
158
|
}
|
|
170
159
|
};
|
|
171
160
|
|
|
172
161
|
if (openOpts.signal.aborted) {
|
|
173
162
|
void close();
|
|
174
163
|
} else {
|
|
175
|
-
openOpts.signal.addEventListener("abort", () => void close(), {
|
|
176
|
-
once: true,
|
|
177
|
-
});
|
|
164
|
+
openOpts.signal.addEventListener("abort", () => void close(), { once: true });
|
|
178
165
|
}
|
|
179
166
|
|
|
180
|
-
const baseRequest = {
|
|
181
|
-
model_id: model,
|
|
182
|
-
voice: { mode: "id" as const, id: voice },
|
|
183
|
-
output_format: {
|
|
184
|
-
container: "raw" as const,
|
|
185
|
-
encoding: "pcm_s16le" as const,
|
|
186
|
-
sample_rate: sampleRate,
|
|
187
|
-
},
|
|
188
|
-
language,
|
|
189
|
-
};
|
|
190
|
-
|
|
191
167
|
const ignoreRejection = (_err: unknown): void => {
|
|
192
|
-
|
|
168
|
+
/* no-op */
|
|
193
169
|
};
|
|
194
170
|
|
|
195
171
|
const session: CartesiaSession = {
|
|
196
172
|
sendText(text: string) {
|
|
197
173
|
if (closed || text.length === 0) return;
|
|
198
|
-
// First sendText after
|
|
199
|
-
//
|
|
174
|
+
// First sendText after flush/cancel starts a fresh context so we
|
|
175
|
+
// don't append to one that's already been finalized.
|
|
200
176
|
rotateIfPending();
|
|
201
177
|
void context
|
|
202
178
|
.send({ ...baseRequest, transcript: text, continue: true })
|
|
@@ -204,12 +180,10 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
|
|
|
204
180
|
},
|
|
205
181
|
flush() {
|
|
206
182
|
if (closed || rotatePending) return;
|
|
207
|
-
// Empty transcript
|
|
208
|
-
//
|
|
209
|
-
//
|
|
210
|
-
//
|
|
211
|
-
// Defer rotation so the filter below still accepts in-flight
|
|
212
|
-
// audio chunks and the real `done` event.
|
|
183
|
+
// Empty transcript + `continue: false` is the canonical end-of-turn
|
|
184
|
+
// signal. Cartesia finishes synthesizing what's queued and emits
|
|
185
|
+
// `done` tagged with the same context_id; rotation is deferred so
|
|
186
|
+
// in-flight audio chunks and the real `done` still pass the filter.
|
|
213
187
|
void context
|
|
214
188
|
.send({ ...baseRequest, transcript: "", continue: false })
|
|
215
189
|
.catch(ignoreRejection);
|
|
@@ -218,18 +192,15 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
|
|
|
218
192
|
cancel() {
|
|
219
193
|
if (closed) return;
|
|
220
194
|
// Skip the wire cancel if the context is already final on
|
|
221
|
-
// Cartesia's side
|
|
222
|
-
//
|
|
223
|
-
//
|
|
224
|
-
// listener surfaces as `tts_stream_error` and the pipeline
|
|
225
|
-
// treats as fatal — killing the session for a benign race.
|
|
195
|
+
// Cartesia's side: cancelling a retired context returns a 400
|
|
196
|
+
// ("context ID does not exist") which surfaces as a fatal
|
|
197
|
+
// tts_stream_error for a benign race.
|
|
226
198
|
if (!doneEmitted) {
|
|
227
199
|
void context.cancel().catch(ignoreRejection);
|
|
228
200
|
}
|
|
229
|
-
// Emit synchronously: barge-in advances the orchestrator
|
|
230
|
-
//
|
|
231
|
-
//
|
|
232
|
-
// after cancel, so dropping any late chunks is fine.
|
|
201
|
+
// Emit synchronously: barge-in advances the orchestrator on `done`;
|
|
202
|
+
// delaying would audibly stall subsequent turns. Cartesia stops
|
|
203
|
+
// producing audio after cancel, so dropping late chunks is fine.
|
|
233
204
|
emitDoneOnce();
|
|
234
205
|
rotatePending = true;
|
|
235
206
|
},
|
|
@@ -240,7 +211,6 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
|
|
|
240
211
|
_ws: ws,
|
|
241
212
|
_currentContextId: () => context.contextId,
|
|
242
213
|
};
|
|
243
|
-
|
|
244
214
|
return session;
|
|
245
215
|
},
|
|
246
216
|
};
|
|
@@ -1,13 +1,8 @@
|
|
|
1
1
|
// Copyright 2026 the AAI authors. MIT license.
|
|
2
|
-
/** Unit test for the Rime TTS adapter. Mocks the `ws` package. */
|
|
3
2
|
|
|
4
3
|
import { afterEach, beforeEach, describe, expect, test, vi } from "vitest";
|
|
5
4
|
import { openRime, type RimeSession } from "./rime.ts";
|
|
6
5
|
|
|
7
|
-
// ──────────────────────────────────────────────────────────────────────────────
|
|
8
|
-
// Fake WebSocket — hoisted so `vi.mock` factory can reference it
|
|
9
|
-
// ──────────────────────────────────────────────────────────────────────────────
|
|
10
|
-
|
|
11
6
|
type WsEvent = "open" | "message" | "error" | "close";
|
|
12
7
|
type WsListener = (...args: unknown[]) => void;
|
|
13
8
|
|
|
@@ -15,19 +10,17 @@ const { FakeWebSocket } = vi.hoisted(() => {
|
|
|
15
10
|
class FakeWebSocket {
|
|
16
11
|
static OPEN = 1;
|
|
17
12
|
static CLOSED = 3;
|
|
13
|
+
static instances: FakeWebSocket[] = [];
|
|
18
14
|
|
|
19
15
|
readyState = FakeWebSocket.OPEN;
|
|
20
16
|
sent: string[] = [];
|
|
21
|
-
private readonly listeners = new Map<string, WsListener[]>();
|
|
22
|
-
|
|
23
|
-
static instances: FakeWebSocket[] = [];
|
|
24
|
-
|
|
25
17
|
readonly url: string;
|
|
18
|
+
private readonly listeners = new Map<string, WsListener[]>();
|
|
26
19
|
|
|
27
20
|
constructor(url: string, _opts?: unknown) {
|
|
28
21
|
this.url = url;
|
|
29
22
|
FakeWebSocket.instances.push(this);
|
|
30
|
-
//
|
|
23
|
+
// Real `ws` fires "open" asynchronously; match that timing.
|
|
31
24
|
queueMicrotask(() => this._fire("open"));
|
|
32
25
|
}
|
|
33
26
|
|
|
@@ -39,17 +32,13 @@ const { FakeWebSocket } = vi.hoisted(() => {
|
|
|
39
32
|
|
|
40
33
|
once(event: string, fn: WsListener) {
|
|
41
34
|
const wrapper = (...args: unknown[]) => {
|
|
42
|
-
this.
|
|
35
|
+
this.removeListener(event, wrapper);
|
|
43
36
|
fn(...args);
|
|
44
37
|
};
|
|
45
38
|
this.on(event, wrapper);
|
|
46
39
|
}
|
|
47
40
|
|
|
48
41
|
removeListener(event: string, fn: WsListener) {
|
|
49
|
-
this.off(event, fn);
|
|
50
|
-
}
|
|
51
|
-
|
|
52
|
-
private off(event: string, fn: WsListener) {
|
|
53
42
|
const arr = this.listeners.get(event) ?? [];
|
|
54
43
|
this.listeners.set(
|
|
55
44
|
event,
|
|
@@ -66,12 +55,10 @@ const { FakeWebSocket } = vi.hoisted(() => {
|
|
|
66
55
|
this._fire("close");
|
|
67
56
|
}
|
|
68
57
|
|
|
69
|
-
/** Test helper: fire an event on this socket. */
|
|
70
58
|
_fire(event: WsEvent, ...args: unknown[]) {
|
|
71
59
|
for (const fn of this.listeners.get(event) ?? []) fn(...args);
|
|
72
60
|
}
|
|
73
61
|
|
|
74
|
-
/** Test helper: simulate a JSON message from the server. */
|
|
75
62
|
_msg(payload: unknown) {
|
|
76
63
|
this._fire("message", JSON.stringify(payload));
|
|
77
64
|
}
|
|
@@ -85,10 +72,6 @@ vi.mock("ws", () => ({
|
|
|
85
72
|
WebSocket: FakeWebSocket,
|
|
86
73
|
}));
|
|
87
74
|
|
|
88
|
-
// ──────────────────────────────────────────────────────────────────────────────
|
|
89
|
-
// Helpers
|
|
90
|
-
// ──────────────────────────────────────────────────────────────────────────────
|
|
91
|
-
|
|
92
75
|
beforeEach(() => {
|
|
93
76
|
FakeWebSocket.instances.length = 0;
|
|
94
77
|
vi.useFakeTimers();
|
|
@@ -112,7 +95,7 @@ async function openSession(apiKey = "test-key"): Promise<{
|
|
|
112
95
|
signal: controller.signal,
|
|
113
96
|
}) as Promise<RimeSession>;
|
|
114
97
|
|
|
115
|
-
// Let the microtask that fires
|
|
98
|
+
// Let the queued microtask that fires "open" run.
|
|
116
99
|
await Promise.resolve();
|
|
117
100
|
|
|
118
101
|
const session = await openPromise;
|
|
@@ -121,10 +104,6 @@ async function openSession(apiKey = "test-key"): Promise<{
|
|
|
121
104
|
return { session, ws, controller };
|
|
122
105
|
}
|
|
123
106
|
|
|
124
|
-
// ──────────────────────────────────────────────────────────────────────────────
|
|
125
|
-
// Tests
|
|
126
|
-
// ──────────────────────────────────────────────────────────────────────────────
|
|
127
|
-
|
|
128
107
|
describe("rime TTS adapter", () => {
|
|
129
108
|
test("openRime returns an opener with name 'rime'", () => {
|
|
130
109
|
const opener = openRime({ voice: "cove" });
|
|
@@ -150,18 +129,15 @@ describe("rime TTS adapter", () => {
|
|
|
150
129
|
const audioEvents: Int16Array[] = [];
|
|
151
130
|
session.on("audio", (pcm) => audioEvents.push(pcm));
|
|
152
131
|
|
|
153
|
-
// Encode 4 PCM16 samples (8 bytes) as base64.
|
|
154
132
|
const samples = new Int16Array([100, 200, 300, 400]);
|
|
155
133
|
const base64 = Buffer.from(samples.buffer).toString("base64");
|
|
156
134
|
|
|
157
135
|
ws._msg({ type: "chunk", data: base64, contextId: null });
|
|
158
136
|
|
|
159
137
|
expect(audioEvents.length).toBe(1);
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
// biome-ignore lint/style/noNonNullAssertion: length was asserted to be 1 on the line above
|
|
164
|
-
const pcm = firstChunk!;
|
|
138
|
+
// biome-ignore lint/style/noNonNullAssertion: length was asserted to be 1 above
|
|
139
|
+
const pcm = audioEvents[0]!;
|
|
140
|
+
expect(pcm).toBeInstanceOf(Int16Array);
|
|
165
141
|
expect(pcm.length).toBe(4);
|
|
166
142
|
expect(pcm[0]).toBe(100);
|
|
167
143
|
expect(pcm[3]).toBe(400);
|
|
@@ -188,11 +164,11 @@ describe("rime TTS adapter", () => {
|
|
|
188
164
|
// closing the WS (which `eos` would do).
|
|
189
165
|
expect(ws.sent).toContain(JSON.stringify({ text: "." }));
|
|
190
166
|
|
|
191
|
-
// First-audio timer is
|
|
167
|
+
// First-audio timer is 5s — short window must not fire `done` yet.
|
|
192
168
|
vi.advanceTimersByTime(500);
|
|
193
169
|
expect(doneEvents.length).toBe(0);
|
|
194
170
|
|
|
195
|
-
// First chunk arrives
|
|
171
|
+
// First chunk arrives, switching to the short quiescence window.
|
|
196
172
|
const samples = new Int16Array([100, 200, 300, 400]);
|
|
197
173
|
ws._msg({
|
|
198
174
|
type: "chunk",
|
|
@@ -215,7 +191,7 @@ describe("rime TTS adapter", () => {
|
|
|
215
191
|
session.sendText("Hi there");
|
|
216
192
|
session.flush();
|
|
217
193
|
|
|
218
|
-
// No chunk arrives —
|
|
194
|
+
// No chunk arrives — must wait the full FIRST_AUDIO_TIMEOUT_MS (5s).
|
|
219
195
|
vi.advanceTimersByTime(4999);
|
|
220
196
|
expect(doneEvents.length).toBe(0);
|
|
221
197
|
vi.advanceTimersByTime(1);
|
|
@@ -229,11 +205,10 @@ describe("rime TTS adapter", () => {
|
|
|
229
205
|
session.on("done", () => doneEvents.push(Date.now()));
|
|
230
206
|
|
|
231
207
|
session.sendText("Hello");
|
|
232
|
-
//
|
|
208
|
+
// Barge-in cannot be deferred — `done` must fire synchronously.
|
|
233
209
|
session.cancel();
|
|
234
210
|
|
|
235
211
|
expect(ws.sent).toContain(JSON.stringify({ operation: "clear" }));
|
|
236
|
-
// done was emitted synchronously (before any await / timer).
|
|
237
212
|
expect(doneEvents.length).toBe(1);
|
|
238
213
|
});
|
|
239
214
|
|
|
@@ -245,7 +220,6 @@ describe("rime TTS adapter", () => {
|
|
|
245
220
|
await session.close();
|
|
246
221
|
expect(ws.readyState).toBe(FakeWebSocket.CLOSED);
|
|
247
222
|
|
|
248
|
-
// Second close should not throw.
|
|
249
223
|
await expect(session.close()).resolves.toBeUndefined();
|
|
250
224
|
});
|
|
251
225
|
});
|
|
@@ -2,36 +2,17 @@
|
|
|
2
2
|
/**
|
|
3
3
|
* Rime TTS opener (host-only).
|
|
4
4
|
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
5
|
+
* Connects to Rime's `ws2` JSON WebSocket endpoint with one long-lived
|
|
6
|
+
* connection per session. Client → server: `{ text }` appends to the
|
|
7
|
+
* synthesis buffer, `{ operation: "clear" }` drops it (barge-in). We never
|
|
8
|
+
* send `eos` since it tears down the WS — `flush()` instead sends a
|
|
9
|
+
* trailing `"."` to force synthesis of any text buffered behind missing
|
|
10
|
+
* terminal punctuation while keeping the connection reusable.
|
|
9
11
|
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
13
|
-
*
|
|
14
|
-
* - `{ "operation": "eos" }` — drain buffer, close connection (NOT used
|
|
15
|
-
* during a session: it would tear down the WS, forcing reconnect per
|
|
16
|
-
* turn). We force end-of-turn synthesis with a trailing `"."` instead.
|
|
17
|
-
* The server responds with JSON frames:
|
|
18
|
-
* - `{ type: "chunk", data: <base64 PCM16 LE>, contextId: string | null }`
|
|
19
|
-
* - `{ type: "timestamps", ... }` (ignored)
|
|
20
|
-
* - `{ type: "error", message: string }` (surfaced as `tts_stream_error`)
|
|
21
|
-
*
|
|
22
|
-
* **Single long-lived connection per session.** Rime buffers text until it
|
|
23
|
-
* sees terminal punctuation (`.`, `?`, `!`), so we use one WebSocket per
|
|
24
|
-
* `open()` call and reuse it across turns. `clear` resets the buffer
|
|
25
|
-
* between cancellations.
|
|
26
|
-
*
|
|
27
|
-
* **Done detection.** After `flush()` sends a trailing `"."` to force the
|
|
28
|
-
* server to synthesize any half-buffered text, we arm a quiescence timer
|
|
29
|
-
* that fires 500 ms after the last received audio chunk. When it fires,
|
|
30
|
-
* `done` is emitted.
|
|
31
|
-
*
|
|
32
|
-
* **Audio format.** The URL requests `audioFormat=pcm` at the negotiated
|
|
33
|
-
* `sampleRate`, which returns raw PCM16 little-endian. We decode the base64
|
|
34
|
-
* payload and construct a zero-copy `Int16Array` view over the decoded bytes.
|
|
12
|
+
* Server → client: `{ type: "chunk", data: <base64 PCM16 LE> }` carries
|
|
13
|
+
* audio; `timestamps` is ignored; `error` surfaces as `tts_stream_error`.
|
|
14
|
+
* The `audioFormat=pcm` query param at the negotiated `sampleRate` returns
|
|
15
|
+
* raw PCM16 LE that we view as a zero-copy `Int16Array`.
|
|
35
16
|
*/
|
|
36
17
|
|
|
37
18
|
import { createNanoEvents, type Emitter } from "nanoevents";
|
|
@@ -45,13 +26,11 @@ import {
|
|
|
45
26
|
type TtsSession,
|
|
46
27
|
} from "../../../sdk/providers.ts";
|
|
47
28
|
|
|
48
|
-
/** Internal: TtsSession with a test-only handle to the raw WebSocket. */
|
|
49
29
|
export interface RimeSession extends TtsSession {
|
|
50
30
|
/** @internal Test-only: exposes the underlying raw WebSocket. */
|
|
51
31
|
readonly _ws: WebSocket;
|
|
52
32
|
}
|
|
53
33
|
|
|
54
|
-
/** PCM16 sample rates accepted by the Rime `ws2` endpoint. */
|
|
55
34
|
const RIME_PCM16_RATES = [
|
|
56
35
|
8000, 16_000, 22_050, 24_000, 44_100, 48_000,
|
|
57
36
|
] as const satisfies readonly number[];
|
|
@@ -64,51 +43,28 @@ function assertSupportedSampleRate(rate: number): number {
|
|
|
64
43
|
);
|
|
65
44
|
}
|
|
66
45
|
|
|
67
|
-
/**
|
|
68
|
-
* Decode a base64 string from Rime into a zero-copy `Int16Array`.
|
|
69
|
-
*
|
|
70
|
-
* Rime's `ws2` endpoint returns base64-encoded PCM16 LE in each chunk.
|
|
71
|
-
* `Buffer.from(base64, "base64")` gives us a Node.js Buffer (which is a
|
|
72
|
-
* Uint8Array subclass) with `byteOffset === 0`. PCM16 bytes always come in
|
|
73
|
-
* pairs so the length is guaranteed to be even.
|
|
74
|
-
*/
|
|
75
46
|
function base64ToPcm(data: string): Int16Array {
|
|
76
47
|
const bytes = Buffer.from(data, "base64");
|
|
77
|
-
// Defensive: drop a trailing odd byte rather than
|
|
48
|
+
// Defensive: drop a trailing odd byte rather than throw on misalignment.
|
|
78
49
|
const evenLen = bytes.byteLength - (bytes.byteLength % 2);
|
|
79
50
|
if (evenLen === 0) return new Int16Array(0);
|
|
80
51
|
return new Int16Array(bytes.buffer, bytes.byteOffset, evenLen / 2);
|
|
81
52
|
}
|
|
82
53
|
|
|
83
|
-
/**
|
|
84
|
-
* Shape of JSON messages received from Rime's `ws2` endpoint.
|
|
85
|
-
*
|
|
86
|
-
* Only `chunk` messages carry audio; `timestamps` messages are informational
|
|
87
|
-
* and can be ignored for the current use case.
|
|
88
|
-
*/
|
|
89
54
|
interface RimeMessage {
|
|
90
55
|
type: "chunk" | "timestamps" | "error" | string;
|
|
91
|
-
/** Base64-encoded PCM16 LE audio. Present on `type === "chunk"`. */
|
|
92
56
|
data?: string;
|
|
93
|
-
/** Context discriminator for the in-flight utterance. May be null. */
|
|
94
57
|
contextId?: string | null;
|
|
95
|
-
/** Error description. Present on `type === "error"`. */
|
|
96
58
|
message?: string;
|
|
97
59
|
}
|
|
98
60
|
|
|
99
|
-
/** Quiescence timeout in ms — how long to wait after the last audio chunk before emitting `done`. */
|
|
100
61
|
const QUIESCENCE_MS = 500;
|
|
101
62
|
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
* path: `flush()` runs immediately after `sendText()`, so audio TTFB
|
|
106
|
-
* exceeds the 500 ms quiescence window. Once the first chunk arrives,
|
|
107
|
-
* we transition to the shorter quiescence timeout.
|
|
108
|
-
*/
|
|
63
|
+
// Greetings and short replies emit `flush()` immediately after `sendText()`,
|
|
64
|
+
// so audio TTFB easily exceeds QUIESCENCE_MS. Wait longer for the FIRST
|
|
65
|
+
// chunk; subsequent chunks revert to the shorter quiescence window.
|
|
109
66
|
const FIRST_AUDIO_TIMEOUT_MS = 5000;
|
|
110
67
|
|
|
111
|
-
/** Wait for the WebSocket `open` event; reject on first `error`. */
|
|
112
68
|
function waitForOpen(ws: WebSocket): Promise<void> {
|
|
113
69
|
return new Promise<void>((resolve, reject) => {
|
|
114
70
|
const onOpen = () => {
|
|
@@ -129,12 +85,8 @@ function waitForOpen(ws: WebSocket): Promise<void> {
|
|
|
129
85
|
});
|
|
130
86
|
}
|
|
131
87
|
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
*
|
|
135
|
-
* Extracted into a top-level function to keep `open()` under the cognitive
|
|
136
|
-
* complexity limit while retaining full access to the session state via refs.
|
|
137
|
-
*/
|
|
88
|
+
// Extracted to a top-level function to keep `open()` under the cognitive
|
|
89
|
+
// complexity limit; session state is threaded through via the ref callbacks.
|
|
138
90
|
function handleRimeMessage(
|
|
139
91
|
raw: WebSocket.Data,
|
|
140
92
|
emitter: Emitter<TtsEvents>,
|
|
@@ -145,7 +97,6 @@ function handleRimeMessage(
|
|
|
145
97
|
try {
|
|
146
98
|
msg = JSON.parse(typeof raw === "string" ? raw : raw.toString()) as RimeMessage;
|
|
147
99
|
} catch {
|
|
148
|
-
// Unparseable frame — ignore.
|
|
149
100
|
return;
|
|
150
101
|
}
|
|
151
102
|
|
|
@@ -153,9 +104,8 @@ function handleRimeMessage(
|
|
|
153
104
|
const pcm = base64ToPcm(msg.data);
|
|
154
105
|
if (pcm.length > 0) {
|
|
155
106
|
emitter.emit("audio", pcm);
|
|
156
|
-
//
|
|
157
|
-
//
|
|
158
|
-
// quiescence window — so `done` fires only after audio stops.
|
|
107
|
+
// Each chunk resets the quiescence window so `done` fires only after
|
|
108
|
+
// audio stops — applies to both the first-audio and post-chunk timers.
|
|
159
109
|
if (isActiveTimer()) armQuiescence();
|
|
160
110
|
}
|
|
161
111
|
return;
|
|
@@ -166,10 +116,8 @@ function handleRimeMessage(
|
|
|
166
116
|
makeTtsError("tts_stream_error", `Rime TTS: ${msg.message ?? "unknown error"}`),
|
|
167
117
|
);
|
|
168
118
|
}
|
|
169
|
-
// Ignore `type === "timestamps"` and unknown message types.
|
|
170
119
|
}
|
|
171
120
|
|
|
172
|
-
/** Build a {@link TtsOpener} from resolved Rime descriptor options. */
|
|
173
121
|
export function openRime(opts: RimeOptions): TtsOpener {
|
|
174
122
|
return {
|
|
175
123
|
name: "rime",
|
|
@@ -187,7 +135,6 @@ export function openRime(opts: RimeOptions): TtsOpener {
|
|
|
187
135
|
const lang = opts.language ?? "eng";
|
|
188
136
|
const voice = opts.voice ?? RIME_DEFAULT_VOICE;
|
|
189
137
|
|
|
190
|
-
// Construct the ws2 URL with query parameters.
|
|
191
138
|
const url = `wss://users-ws.rime.ai/ws2?speaker=${encodeURIComponent(voice)}&modelId=${encodeURIComponent(model)}&audioFormat=pcm&samplingRate=${sampleRate}&lang=${encodeURIComponent(lang)}`;
|
|
192
139
|
|
|
193
140
|
let ws: WebSocket;
|
|
@@ -205,12 +152,6 @@ export function openRime(opts: RimeOptions): TtsOpener {
|
|
|
205
152
|
const emitter: Emitter<TtsEvents> = createNanoEvents<TtsEvents>();
|
|
206
153
|
let closed = false;
|
|
207
154
|
let doneEmitted = false;
|
|
208
|
-
/**
|
|
209
|
-
* After `flush()`, we arm a timer that fires `done`. Initial timeout is
|
|
210
|
-
* `FIRST_AUDIO_TIMEOUT_MS` to give Rime headroom on TTFB; the first
|
|
211
|
-
* chunk swaps it for a shorter `QUIESCENCE_MS` window that resets on
|
|
212
|
-
* each subsequent chunk. `cancel()` emits `done` synchronously.
|
|
213
|
-
*/
|
|
214
155
|
let quiescenceTimer: ReturnType<typeof setTimeout> | null = null;
|
|
215
156
|
|
|
216
157
|
const clearQuiescence = () => {
|
|
@@ -264,7 +205,7 @@ export function openRime(opts: RimeOptions): TtsOpener {
|
|
|
264
205
|
try {
|
|
265
206
|
ws.close();
|
|
266
207
|
} catch {
|
|
267
|
-
//
|
|
208
|
+
// Caller has already decided to tear down.
|
|
268
209
|
}
|
|
269
210
|
};
|
|
270
211
|
|
|
@@ -278,7 +219,6 @@ export function openRime(opts: RimeOptions): TtsOpener {
|
|
|
278
219
|
sendText(text: string) {
|
|
279
220
|
if (closed || text.length === 0) return;
|
|
280
221
|
if (ws.readyState !== WebSocket.OPEN) return;
|
|
281
|
-
// Reset done state at the start of a new turn.
|
|
282
222
|
doneEmitted = false;
|
|
283
223
|
ws.send(JSON.stringify({ text }));
|
|
284
224
|
},
|
|
@@ -286,19 +226,16 @@ export function openRime(opts: RimeOptions): TtsOpener {
|
|
|
286
226
|
flush() {
|
|
287
227
|
if (closed) return;
|
|
288
228
|
if (ws.readyState !== WebSocket.OPEN) return;
|
|
289
|
-
// Force synthesis of any text buffered behind
|
|
290
|
-
// punctuation:
|
|
291
|
-
// operation would close
|
|
292
|
-
// every turn
|
|
293
|
-
// first-audio timer until the initial chunk arrives; the chunk
|
|
294
|
-
// handler swaps it for short quiescence on each subsequent chunk.
|
|
229
|
+
// Force synthesis of any text buffered behind missing terminal
|
|
230
|
+
// punctuation: a trailing `"."` keeps the WS reusable, whereas
|
|
231
|
+
// the JSON `eos` operation would close it and require a
|
|
232
|
+
// reconnect every turn.
|
|
295
233
|
ws.send(JSON.stringify({ text: "." }));
|
|
296
234
|
armFirstAudioTimer();
|
|
297
235
|
},
|
|
298
236
|
|
|
299
237
|
cancel() {
|
|
300
238
|
if (closed) return;
|
|
301
|
-
// Drop Rime's server-side buffer for barge-in.
|
|
302
239
|
if (ws.readyState === WebSocket.OPEN) {
|
|
303
240
|
ws.send(JSON.stringify({ operation: "clear" }));
|
|
304
241
|
}
|
|
@@ -3,13 +3,19 @@
|
|
|
3
3
|
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
|
4
4
|
import { jsonLogger } from "./runtime-config.ts";
|
|
5
5
|
|
|
6
|
-
/** Parse the JSON line at `index` from `chunks`, failing if missing. */
|
|
7
6
|
function parseEntry(chunks: string[], index: number): Record<string, unknown> {
|
|
8
7
|
const raw = chunks[index];
|
|
9
8
|
if (raw === undefined) throw new Error(`No chunk at index ${index}`);
|
|
10
9
|
return JSON.parse(raw) as Record<string, unknown>;
|
|
11
10
|
}
|
|
12
11
|
|
|
12
|
+
function spyWriteInto(stream: NodeJS.WriteStream, sink: string[]): void {
|
|
13
|
+
vi.spyOn(stream, "write").mockImplementation((chunk: string | Uint8Array) => {
|
|
14
|
+
sink.push(String(chunk));
|
|
15
|
+
return true;
|
|
16
|
+
});
|
|
17
|
+
}
|
|
18
|
+
|
|
13
19
|
describe("jsonLogger", () => {
|
|
14
20
|
let stdoutChunks: string[];
|
|
15
21
|
let stderrChunks: string[];
|
|
@@ -17,14 +23,8 @@ describe("jsonLogger", () => {
|
|
|
17
23
|
beforeEach(() => {
|
|
18
24
|
stdoutChunks = [];
|
|
19
25
|
stderrChunks = [];
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
return true;
|
|
23
|
-
});
|
|
24
|
-
vi.spyOn(process.stderr, "write").mockImplementation((chunk: string | Uint8Array) => {
|
|
25
|
-
stderrChunks.push(String(chunk));
|
|
26
|
-
return true;
|
|
27
|
-
});
|
|
26
|
+
spyWriteInto(process.stdout, stdoutChunks);
|
|
27
|
+
spyWriteInto(process.stderr, stderrChunks);
|
|
28
28
|
});
|
|
29
29
|
|
|
30
30
|
afterEach(() => {
|