@alexkroman1/aai 1.7.1 → 1.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133) hide show
  1. package/.turbo/turbo-build.log +11 -9
  2. package/CHANGELOG.md +23 -0
  3. package/dist/{_internal-types-CrnTi9Ew.js → _internal-types-CfOAbK6V.js} +22 -35
  4. package/dist/constants-y68COEGj.js +29 -0
  5. package/dist/host/_base64.d.ts +2 -0
  6. package/dist/host/_mock-ws.d.ts +0 -61
  7. package/dist/host/_pipeline-test-fakes.d.ts +7 -4
  8. package/dist/host/_run-code.d.ts +0 -25
  9. package/dist/host/_runtime-conformance.d.ts +3 -34
  10. package/dist/host/memory-vector.d.ts +0 -11
  11. package/dist/host/providers/resolve-kv.d.ts +0 -7
  12. package/dist/host/providers/resolve-vector.d.ts +0 -8
  13. package/dist/host/providers/stt/assemblyai.d.ts +0 -14
  14. package/dist/host/providers/stt/deepgram.d.ts +2 -14
  15. package/dist/host/providers/stt/soniox.d.ts +0 -22
  16. package/dist/host/providers/tts/rime.d.ts +10 -31
  17. package/dist/host/runtime-barrel.js +670 -630
  18. package/dist/host/runtime-config.d.ts +9 -6
  19. package/dist/host/runtime.d.ts +3 -0
  20. package/dist/host/to-vercel-tools.d.ts +3 -33
  21. package/dist/host/transports/openai-realtime-transport.d.ts +45 -0
  22. package/dist/host/unstorage-kv.d.ts +0 -26
  23. package/dist/index.js +3 -3
  24. package/dist/openai-realtime-cjPAHMMx.js +10 -0
  25. package/dist/sdk/_internal-types.d.ts +6 -55
  26. package/dist/sdk/allowed-hosts.d.ts +4 -3
  27. package/dist/sdk/constants.d.ts +4 -29
  28. package/dist/sdk/define.d.ts +7 -4
  29. package/dist/sdk/kv.d.ts +13 -37
  30. package/dist/sdk/manifest-barrel.js +1 -1
  31. package/dist/sdk/manifest.d.ts +8 -2
  32. package/dist/sdk/protocol.js +1 -1
  33. package/dist/sdk/providers/s2s/openai-realtime.d.ts +17 -0
  34. package/dist/sdk/providers/s2s-barrel.d.ts +9 -0
  35. package/dist/sdk/providers/s2s-barrel.js +2 -0
  36. package/dist/sdk/providers/tts/rime.d.ts +1 -1
  37. package/dist/sdk/providers.d.ts +6 -2
  38. package/dist/sdk/types.d.ts +7 -1
  39. package/dist/{types-KUgezM6u.js → types-DOWVZhb9.js} +1 -7
  40. package/dist/{ws-upgrade-BeOQ7fXL.js → ws-upgrade-CG8-by1n.js} +2 -3
  41. package/host/_base64.ts +9 -0
  42. package/host/_mock-ws.ts +0 -65
  43. package/host/_pipeline-test-fakes.ts +19 -31
  44. package/host/_run-code.ts +10 -53
  45. package/host/_runtime-conformance.ts +3 -44
  46. package/host/_test-utils.ts +20 -42
  47. package/host/builtin-tools.test.ts +127 -222
  48. package/host/builtin-tools.ts +6 -10
  49. package/host/cleanup.test.ts +30 -73
  50. package/host/integration/pipeline-reference.integration.test.ts +12 -17
  51. package/host/integration.test.ts +0 -7
  52. package/host/memory-vector.test.ts +3 -1
  53. package/host/memory-vector.ts +16 -21
  54. package/host/pinecone-vector.test.ts +14 -17
  55. package/host/pinecone-vector.ts +10 -19
  56. package/host/providers/providers.test-d.ts +5 -3
  57. package/host/providers/resolve-kv.ts +23 -41
  58. package/host/providers/resolve-vector.ts +3 -12
  59. package/host/providers/resolve.test.ts +15 -28
  60. package/host/providers/resolve.ts +24 -24
  61. package/host/providers/stt/assemblyai.test.ts +2 -14
  62. package/host/providers/stt/assemblyai.ts +12 -35
  63. package/host/providers/stt/deepgram.test.ts +23 -83
  64. package/host/providers/stt/deepgram.ts +15 -40
  65. package/host/providers/stt/elevenlabs.test.ts +26 -38
  66. package/host/providers/stt/elevenlabs.ts +10 -9
  67. package/host/providers/stt/soniox.test.ts +35 -85
  68. package/host/providers/stt/soniox.ts +8 -53
  69. package/host/providers/tts/cartesia.test.ts +19 -58
  70. package/host/providers/tts/cartesia.ts +36 -66
  71. package/host/providers/tts/rime.test.ts +12 -38
  72. package/host/providers/tts/rime.ts +23 -86
  73. package/host/runtime-config.test.ts +9 -9
  74. package/host/runtime-config.ts +16 -22
  75. package/host/runtime.test.ts +111 -73
  76. package/host/runtime.ts +139 -86
  77. package/host/s2s.test.ts +92 -191
  78. package/host/s2s.ts +55 -49
  79. package/host/server-shutdown.test.ts +9 -30
  80. package/host/server.test.ts +2 -13
  81. package/host/server.ts +85 -100
  82. package/host/session-core.test.ts +15 -30
  83. package/host/session-core.ts +10 -13
  84. package/host/session-prompt.test.ts +1 -5
  85. package/host/to-vercel-tools.test.ts +53 -72
  86. package/host/to-vercel-tools.ts +9 -39
  87. package/host/tool-executor.test.ts +25 -51
  88. package/host/tool-executor.ts +18 -12
  89. package/host/transports/openai-realtime-transport.test.ts +439 -0
  90. package/host/transports/openai-realtime-transport.ts +371 -0
  91. package/host/transports/pipeline-transport.test.ts +125 -298
  92. package/host/transports/pipeline-transport.ts +20 -68
  93. package/host/transports/s2s-transport-fixtures.test.ts +31 -92
  94. package/host/transports/s2s-transport.test.ts +65 -134
  95. package/host/transports/s2s-transport.ts +15 -43
  96. package/host/transports/types.test.ts +4 -8
  97. package/host/unstorage-kv.test.ts +3 -2
  98. package/host/unstorage-kv.ts +5 -35
  99. package/host/ws-handler.test.ts +72 -176
  100. package/host/ws-handler.ts +6 -12
  101. package/package.json +6 -1
  102. package/sdk/__snapshots__/exports.test.ts.snap +7 -0
  103. package/sdk/__snapshots__/schema-shapes.test.ts.snap +1 -0
  104. package/sdk/_internal-types.test.ts +6 -9
  105. package/sdk/_internal-types.ts +16 -57
  106. package/sdk/_test-matchers.ts +25 -15
  107. package/sdk/allowed-hosts.test.ts +50 -114
  108. package/sdk/allowed-hosts.ts +8 -14
  109. package/sdk/constants.ts +5 -52
  110. package/sdk/define.test.ts +7 -6
  111. package/sdk/define.ts +7 -3
  112. package/sdk/exports.test.ts +6 -1
  113. package/sdk/kv.ts +13 -37
  114. package/sdk/manifest.test-d.ts +5 -0
  115. package/sdk/manifest.test.ts +61 -9
  116. package/sdk/manifest.ts +11 -11
  117. package/sdk/protocol-compat.test.ts +66 -98
  118. package/sdk/protocol-snapshot.test.ts +2 -16
  119. package/sdk/protocol.test.ts +13 -22
  120. package/sdk/providers/s2s/openai-realtime.ts +36 -0
  121. package/sdk/providers/s2s-barrel.ts +12 -0
  122. package/sdk/providers/tts/rime.ts +1 -1
  123. package/sdk/providers.ts +24 -5
  124. package/sdk/schema-alignment.test.ts +25 -73
  125. package/sdk/schema-shapes.test.ts +1 -29
  126. package/sdk/system-prompt.test.ts +0 -1
  127. package/sdk/system-prompt.ts +17 -19
  128. package/sdk/types-inference.test.ts +10 -36
  129. package/sdk/types.ts +7 -0
  130. package/sdk/ws-upgrade.test.ts +24 -23
  131. package/sdk/ws-upgrade.ts +2 -3
  132. package/tsdown.config.ts +8 -11
  133. package/dist/constants-C2nirZUI.js +0 -54
@@ -92,41 +92,32 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
92
92
  const emitter: Emitter<TtsEvents> = createNanoEvents<TtsEvents>();
93
93
  let closed = false;
94
94
 
95
- /** Mint a fresh context bound to the shared TTSWS connection. */
95
+ const audioConfig = {
96
+ model_id: model,
97
+ voice: { mode: "id" as const, id: voice },
98
+ output_format: {
99
+ container: "raw" as const,
100
+ encoding: "pcm_s16le" as const,
101
+ sample_rate: sampleRate,
102
+ },
103
+ };
104
+ const baseRequest = { ...audioConfig, language };
105
+
96
106
  const mintContext = (): TTSWSContext =>
97
- ws.context({
98
- model_id: model,
99
- voice: { mode: "id", id: voice },
100
- output_format: {
101
- container: "raw",
102
- encoding: "pcm_s16le",
103
- sample_rate: sampleRate,
104
- },
105
- contextId: randomUUID(),
106
- });
107
+ ws.context({ ...audioConfig, contextId: randomUUID() });
107
108
 
108
109
  let context = mintContext();
109
- /**
110
- * `doneEmitted` guards against emitting `done` more than once per turn.
111
- * Reset whenever a fresh context is minted (i.e. at turn boundaries).
112
- */
113
110
  let doneEmitted = false;
114
- /**
115
- * After `flush()` or `cancel()`, the current context is done accepting
116
- * input. We defer minting a fresh one until the next `sendText()` so
117
- * that late audio chunks + Cartesia's real `done` event (both tagged
118
- * with the flushed context's id) still pass the filter below. Rotating
119
- * eagerly would silently drop all audio still in flight.
120
- */
111
+ // Defer minting after flush/cancel until next sendText so late audio
112
+ // chunks + Cartesia's real `done` (tagged with the flushed context's id)
113
+ // still pass the filter. Rotating eagerly would drop in-flight audio.
121
114
  let rotatePending = false;
122
- const rotateContext = () => {
115
+ const rotateIfPending = () => {
116
+ if (!rotatePending) return;
123
117
  context = mintContext();
124
118
  doneEmitted = false;
125
119
  rotatePending = false;
126
120
  };
127
- const rotateIfPending = () => {
128
- if (rotatePending) rotateContext();
129
- };
130
121
  const emitDoneOnce = () => {
131
122
  if (doneEmitted || closed) return;
132
123
  doneEmitted = true;
@@ -136,12 +127,11 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
136
127
  // TTSWS fires events globally across all contexts on the shared
137
128
  // socket; filter by the currently-active context_id.
138
129
  ws.on("chunk", (event) => {
139
- if (closed) return;
140
- if (event.context_id !== context.contextId) return;
130
+ if (closed || event.context_id !== context.contextId) return;
141
131
  const buf = event.audio;
142
132
  if (!buf || buf.byteLength === 0) return;
143
- // Cartesia sends PCM16 LE; be defensive about odd byte counts
144
- // so `new Int16Array` never throws on a misaligned length.
133
+ // Defensive: trim odd byte counts so `new Int16Array` never throws
134
+ // on a misaligned length.
145
135
  const evenBytes = buf.byteLength - (buf.byteLength % 2);
146
136
  if (evenBytes === 0) return;
147
137
  const pcm = new Int16Array(buf.buffer.slice(buf.byteOffset, buf.byteOffset + evenBytes));
@@ -149,8 +139,7 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
149
139
  });
150
140
 
151
141
  ws.on("done", (event) => {
152
- if (closed) return;
153
- if (event.context_id !== context.contextId) return;
142
+ if (closed || event.context_id !== context.contextId) return;
154
143
  emitDoneOnce();
155
144
  });
156
145
 
@@ -165,38 +154,25 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
165
154
  try {
166
155
  ws.close({ code: 1000, reason: "client close" });
167
156
  } catch {
168
- // Swallow: caller has already decided to tear down.
157
+ // Caller has already decided to tear down.
169
158
  }
170
159
  };
171
160
 
172
161
  if (openOpts.signal.aborted) {
173
162
  void close();
174
163
  } else {
175
- openOpts.signal.addEventListener("abort", () => void close(), {
176
- once: true,
177
- });
164
+ openOpts.signal.addEventListener("abort", () => void close(), { once: true });
178
165
  }
179
166
 
180
- const baseRequest = {
181
- model_id: model,
182
- voice: { mode: "id" as const, id: voice },
183
- output_format: {
184
- container: "raw" as const,
185
- encoding: "pcm_s16le" as const,
186
- sample_rate: sampleRate,
187
- },
188
- language,
189
- };
190
-
191
167
  const ignoreRejection = (_err: unknown): void => {
192
- // intentionally empty
168
+ /* no-op */
193
169
  };
194
170
 
195
171
  const session: CartesiaSession = {
196
172
  sendText(text: string) {
197
173
  if (closed || text.length === 0) return;
198
- // First sendText after a flush/cancel starts a fresh context so
199
- // we don't append to one that's already been finalized.
174
+ // First sendText after flush/cancel starts a fresh context so we
175
+ // don't append to one that's already been finalized.
200
176
  rotateIfPending();
201
177
  void context
202
178
  .send({ ...baseRequest, transcript: text, continue: true })
@@ -204,12 +180,10 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
204
180
  },
205
181
  flush() {
206
182
  if (closed || rotatePending) return;
207
- // Empty transcript with `continue: false` is the canonical
208
- // end-of-turn signal. Cartesia finishes synthesizing whatever
209
- // is queued and then emits a `done` tagged with the same
210
- // context_id — at that point `emitDoneOnce` fires for real.
211
- // Defer rotation so the filter below still accepts in-flight
212
- // audio chunks and the real `done` event.
183
+ // Empty transcript + `continue: false` is the canonical end-of-turn
184
+ // signal. Cartesia finishes synthesizing what's queued and emits
185
+ // `done` tagged with the same context_id; rotation is deferred so
186
+ // in-flight audio chunks and the real `done` still pass the filter.
213
187
  void context
214
188
  .send({ ...baseRequest, transcript: "", continue: false })
215
189
  .catch(ignoreRejection);
@@ -218,18 +192,15 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
218
192
  cancel() {
219
193
  if (closed) return;
220
194
  // Skip the wire cancel if the context is already final on
221
- // Cartesia's side (natural `done` after flush, or a prior
222
- // cancel). Cartesia responds to cancel on a retired context
223
- // with a 400 "context ID does not exist", which our error
224
- // listener surfaces as `tts_stream_error` and the pipeline
225
- // treats as fatal — killing the session for a benign race.
195
+ // Cartesia's side: cancelling a retired context returns a 400
196
+ // ("context ID does not exist") which surfaces as a fatal
197
+ // tts_stream_error for a benign race.
226
198
  if (!doneEmitted) {
227
199
  void context.cancel().catch(ignoreRejection);
228
200
  }
229
- // Emit synchronously: barge-in advances the orchestrator's
230
- // state machine on `done`, and delaying it would audibly
231
- // stall subsequent turns. Cartesia stops producing audio
232
- // after cancel, so dropping any late chunks is fine.
201
+ // Emit synchronously: barge-in advances the orchestrator on `done`;
202
+ // delaying would audibly stall subsequent turns. Cartesia stops
203
+ // producing audio after cancel, so dropping late chunks is fine.
233
204
  emitDoneOnce();
234
205
  rotatePending = true;
235
206
  },
@@ -240,7 +211,6 @@ export function openCartesia(opts: CartesiaOptions): TtsOpener {
240
211
  _ws: ws,
241
212
  _currentContextId: () => context.contextId,
242
213
  };
243
-
244
214
  return session;
245
215
  },
246
216
  };
@@ -1,13 +1,8 @@
1
1
  // Copyright 2026 the AAI authors. MIT license.
2
- /** Unit test for the Rime TTS adapter. Mocks the `ws` package. */
3
2
 
4
3
  import { afterEach, beforeEach, describe, expect, test, vi } from "vitest";
5
4
  import { openRime, type RimeSession } from "./rime.ts";
6
5
 
7
- // ──────────────────────────────────────────────────────────────────────────────
8
- // Fake WebSocket — hoisted so `vi.mock` factory can reference it
9
- // ──────────────────────────────────────────────────────────────────────────────
10
-
11
6
  type WsEvent = "open" | "message" | "error" | "close";
12
7
  type WsListener = (...args: unknown[]) => void;
13
8
 
@@ -15,19 +10,17 @@ const { FakeWebSocket } = vi.hoisted(() => {
15
10
  class FakeWebSocket {
16
11
  static OPEN = 1;
17
12
  static CLOSED = 3;
13
+ static instances: FakeWebSocket[] = [];
18
14
 
19
15
  readyState = FakeWebSocket.OPEN;
20
16
  sent: string[] = [];
21
- private readonly listeners = new Map<string, WsListener[]>();
22
-
23
- static instances: FakeWebSocket[] = [];
24
-
25
17
  readonly url: string;
18
+ private readonly listeners = new Map<string, WsListener[]>();
26
19
 
27
20
  constructor(url: string, _opts?: unknown) {
28
21
  this.url = url;
29
22
  FakeWebSocket.instances.push(this);
30
- // Simulate async open on next microtask (matches real ws behaviour).
23
+ // Real `ws` fires "open" asynchronously; match that timing.
31
24
  queueMicrotask(() => this._fire("open"));
32
25
  }
33
26
 
@@ -39,17 +32,13 @@ const { FakeWebSocket } = vi.hoisted(() => {
39
32
 
40
33
  once(event: string, fn: WsListener) {
41
34
  const wrapper = (...args: unknown[]) => {
42
- this.off(event, wrapper);
35
+ this.removeListener(event, wrapper);
43
36
  fn(...args);
44
37
  };
45
38
  this.on(event, wrapper);
46
39
  }
47
40
 
48
41
  removeListener(event: string, fn: WsListener) {
49
- this.off(event, fn);
50
- }
51
-
52
- private off(event: string, fn: WsListener) {
53
42
  const arr = this.listeners.get(event) ?? [];
54
43
  this.listeners.set(
55
44
  event,
@@ -66,12 +55,10 @@ const { FakeWebSocket } = vi.hoisted(() => {
66
55
  this._fire("close");
67
56
  }
68
57
 
69
- /** Test helper: fire an event on this socket. */
70
58
  _fire(event: WsEvent, ...args: unknown[]) {
71
59
  for (const fn of this.listeners.get(event) ?? []) fn(...args);
72
60
  }
73
61
 
74
- /** Test helper: simulate a JSON message from the server. */
75
62
  _msg(payload: unknown) {
76
63
  this._fire("message", JSON.stringify(payload));
77
64
  }
@@ -85,10 +72,6 @@ vi.mock("ws", () => ({
85
72
  WebSocket: FakeWebSocket,
86
73
  }));
87
74
 
88
- // ──────────────────────────────────────────────────────────────────────────────
89
- // Helpers
90
- // ──────────────────────────────────────────────────────────────────────────────
91
-
92
75
  beforeEach(() => {
93
76
  FakeWebSocket.instances.length = 0;
94
77
  vi.useFakeTimers();
@@ -112,7 +95,7 @@ async function openSession(apiKey = "test-key"): Promise<{
112
95
  signal: controller.signal,
113
96
  }) as Promise<RimeSession>;
114
97
 
115
- // Let the microtask that fires FakeWebSocket "open" run.
98
+ // Let the queued microtask that fires "open" run.
116
99
  await Promise.resolve();
117
100
 
118
101
  const session = await openPromise;
@@ -121,10 +104,6 @@ async function openSession(apiKey = "test-key"): Promise<{
121
104
  return { session, ws, controller };
122
105
  }
123
106
 
124
- // ──────────────────────────────────────────────────────────────────────────────
125
- // Tests
126
- // ──────────────────────────────────────────────────────────────────────────────
127
-
128
107
  describe("rime TTS adapter", () => {
129
108
  test("openRime returns an opener with name 'rime'", () => {
130
109
  const opener = openRime({ voice: "cove" });
@@ -150,18 +129,15 @@ describe("rime TTS adapter", () => {
150
129
  const audioEvents: Int16Array[] = [];
151
130
  session.on("audio", (pcm) => audioEvents.push(pcm));
152
131
 
153
- // Encode 4 PCM16 samples (8 bytes) as base64.
154
132
  const samples = new Int16Array([100, 200, 300, 400]);
155
133
  const base64 = Buffer.from(samples.buffer).toString("base64");
156
134
 
157
135
  ws._msg({ type: "chunk", data: base64, contextId: null });
158
136
 
159
137
  expect(audioEvents.length).toBe(1);
160
- const firstChunk = audioEvents[0];
161
- expect(firstChunk).toBeInstanceOf(Int16Array);
162
- // Each sample pair decodes correctly.
163
- // biome-ignore lint/style/noNonNullAssertion: length was asserted to be 1 on the line above
164
- const pcm = firstChunk!;
138
+ // biome-ignore lint/style/noNonNullAssertion: length was asserted to be 1 above
139
+ const pcm = audioEvents[0]!;
140
+ expect(pcm).toBeInstanceOf(Int16Array);
165
141
  expect(pcm.length).toBe(4);
166
142
  expect(pcm[0]).toBe(100);
167
143
  expect(pcm[3]).toBe(400);
@@ -188,11 +164,11 @@ describe("rime TTS adapter", () => {
188
164
  // closing the WS (which `eos` would do).
189
165
  expect(ws.sent).toContain(JSON.stringify({ text: "." }));
190
166
 
191
- // First-audio timer is 5 s — short window must not fire `done` yet.
167
+ // First-audio timer is 5s — short window must not fire `done` yet.
192
168
  vi.advanceTimersByTime(500);
193
169
  expect(doneEvents.length).toBe(0);
194
170
 
195
- // First chunk arrives — switch to short quiescence window.
171
+ // First chunk arrives, switching to the short quiescence window.
196
172
  const samples = new Int16Array([100, 200, 300, 400]);
197
173
  ws._msg({
198
174
  type: "chunk",
@@ -215,7 +191,7 @@ describe("rime TTS adapter", () => {
215
191
  session.sendText("Hi there");
216
192
  session.flush();
217
193
 
218
- // No chunk arrives — done must wait the full FIRST_AUDIO_TIMEOUT_MS (5 s).
194
+ // No chunk arrives — must wait the full FIRST_AUDIO_TIMEOUT_MS (5s).
219
195
  vi.advanceTimersByTime(4999);
220
196
  expect(doneEvents.length).toBe(0);
221
197
  vi.advanceTimersByTime(1);
@@ -229,11 +205,10 @@ describe("rime TTS adapter", () => {
229
205
  session.on("done", () => doneEvents.push(Date.now()));
230
206
 
231
207
  session.sendText("Hello");
232
- // cancel() must emit `done` synchronously — barge-in cannot be deferred.
208
+ // Barge-in cannot be deferred — `done` must fire synchronously.
233
209
  session.cancel();
234
210
 
235
211
  expect(ws.sent).toContain(JSON.stringify({ operation: "clear" }));
236
- // done was emitted synchronously (before any await / timer).
237
212
  expect(doneEvents.length).toBe(1);
238
213
  });
239
214
 
@@ -245,7 +220,6 @@ describe("rime TTS adapter", () => {
245
220
  await session.close();
246
221
  expect(ws.readyState).toBe(FakeWebSocket.CLOSED);
247
222
 
248
- // Second close should not throw.
249
223
  await expect(session.close()).resolves.toBeUndefined();
250
224
  });
251
225
  });
@@ -2,36 +2,17 @@
2
2
  /**
3
3
  * Rime TTS opener (host-only).
4
4
  *
5
- * The user-facing descriptor factory (`rime(...)`) lives in
6
- * `sdk/providers/tts/rime.ts`. This module is the host-side
7
- * counterpart: it takes the descriptor options + an API key and
8
- * returns a {@link TtsOpener} that the pipeline session drives.
5
+ * Connects to Rime's `ws2` JSON WebSocket endpoint with one long-lived
6
+ * connection per session. Client → server: `{ text }` appends to the
7
+ * synthesis buffer, `{ operation: "clear" }` drops it (barge-in). We never
8
+ * send `eos` since it tears down the WS — `flush()` instead sends a
9
+ * trailing `"."` to force synthesis of any text buffered behind missing
10
+ * terminal punctuation while keeping the connection reusable.
9
11
  *
10
- * **Protocol.** Connects to Rime's `ws2` JSON WebSocket endpoint
11
- * (`wss://users-ws.rime.ai/ws2`). Client-to-server messages are JSON:
12
- * - `{ "text": "..." }` — append text to the synthesis buffer
13
- * - `{ "operation": "clear" }` — drop buffered text (barge-in)
14
- * - `{ "operation": "eos" }` — drain buffer, close connection (NOT used
15
- * during a session: it would tear down the WS, forcing reconnect per
16
- * turn). We force end-of-turn synthesis with a trailing `"."` instead.
17
- * The server responds with JSON frames:
18
- * - `{ type: "chunk", data: <base64 PCM16 LE>, contextId: string | null }`
19
- * - `{ type: "timestamps", ... }` (ignored)
20
- * - `{ type: "error", message: string }` (surfaced as `tts_stream_error`)
21
- *
22
- * **Single long-lived connection per session.** Rime buffers text until it
23
- * sees terminal punctuation (`.`, `?`, `!`), so we use one WebSocket per
24
- * `open()` call and reuse it across turns. `clear` resets the buffer
25
- * between cancellations.
26
- *
27
- * **Done detection.** After `flush()` sends a trailing `"."` to force the
28
- * server to synthesize any half-buffered text, we arm a quiescence timer
29
- * that fires 500 ms after the last received audio chunk. When it fires,
30
- * `done` is emitted.
31
- *
32
- * **Audio format.** The URL requests `audioFormat=pcm` at the negotiated
33
- * `sampleRate`, which returns raw PCM16 little-endian. We decode the base64
34
- * payload and construct a zero-copy `Int16Array` view over the decoded bytes.
12
+ * Server → client: `{ type: "chunk", data: <base64 PCM16 LE> }` carries
13
+ * audio; `timestamps` is ignored; `error` surfaces as `tts_stream_error`.
14
+ * The `audioFormat=pcm` query param at the negotiated `sampleRate` returns
15
+ * raw PCM16 LE that we view as a zero-copy `Int16Array`.
35
16
  */
36
17
 
37
18
  import { createNanoEvents, type Emitter } from "nanoevents";
@@ -45,13 +26,11 @@ import {
45
26
  type TtsSession,
46
27
  } from "../../../sdk/providers.ts";
47
28
 
48
- /** Internal: TtsSession with a test-only handle to the raw WebSocket. */
49
29
  export interface RimeSession extends TtsSession {
50
30
  /** @internal Test-only: exposes the underlying raw WebSocket. */
51
31
  readonly _ws: WebSocket;
52
32
  }
53
33
 
54
- /** PCM16 sample rates accepted by the Rime `ws2` endpoint. */
55
34
  const RIME_PCM16_RATES = [
56
35
  8000, 16_000, 22_050, 24_000, 44_100, 48_000,
57
36
  ] as const satisfies readonly number[];
@@ -64,51 +43,28 @@ function assertSupportedSampleRate(rate: number): number {
64
43
  );
65
44
  }
66
45
 
67
- /**
68
- * Decode a base64 string from Rime into a zero-copy `Int16Array`.
69
- *
70
- * Rime's `ws2` endpoint returns base64-encoded PCM16 LE in each chunk.
71
- * `Buffer.from(base64, "base64")` gives us a Node.js Buffer (which is a
72
- * Uint8Array subclass) with `byteOffset === 0`. PCM16 bytes always come in
73
- * pairs so the length is guaranteed to be even.
74
- */
75
46
  function base64ToPcm(data: string): Int16Array {
76
47
  const bytes = Buffer.from(data, "base64");
77
- // Defensive: drop a trailing odd byte rather than throwing on misalignment.
48
+ // Defensive: drop a trailing odd byte rather than throw on misalignment.
78
49
  const evenLen = bytes.byteLength - (bytes.byteLength % 2);
79
50
  if (evenLen === 0) return new Int16Array(0);
80
51
  return new Int16Array(bytes.buffer, bytes.byteOffset, evenLen / 2);
81
52
  }
82
53
 
83
- /**
84
- * Shape of JSON messages received from Rime's `ws2` endpoint.
85
- *
86
- * Only `chunk` messages carry audio; `timestamps` messages are informational
87
- * and can be ignored for the current use case.
88
- */
89
54
  interface RimeMessage {
90
55
  type: "chunk" | "timestamps" | "error" | string;
91
- /** Base64-encoded PCM16 LE audio. Present on `type === "chunk"`. */
92
56
  data?: string;
93
- /** Context discriminator for the in-flight utterance. May be null. */
94
57
  contextId?: string | null;
95
- /** Error description. Present on `type === "error"`. */
96
58
  message?: string;
97
59
  }
98
60
 
99
- /** Quiescence timeout in ms — how long to wait after the last audio chunk before emitting `done`. */
100
61
  const QUIESCENCE_MS = 500;
101
62
 
102
- /**
103
- * After `flush()`, how long to wait for the FIRST audio chunk before
104
- * giving up and emitting `done`. Greeting and short replies hit this
105
- * path: `flush()` runs immediately after `sendText()`, so audio TTFB
106
- * exceeds the 500 ms quiescence window. Once the first chunk arrives,
107
- * we transition to the shorter quiescence timeout.
108
- */
63
+ // Greetings and short replies emit `flush()` immediately after `sendText()`,
64
+ // so audio TTFB easily exceeds QUIESCENCE_MS. Wait longer for the FIRST
65
+ // chunk; subsequent chunks revert to the shorter quiescence window.
109
66
  const FIRST_AUDIO_TIMEOUT_MS = 5000;
110
67
 
111
- /** Wait for the WebSocket `open` event; reject on first `error`. */
112
68
  function waitForOpen(ws: WebSocket): Promise<void> {
113
69
  return new Promise<void>((resolve, reject) => {
114
70
  const onOpen = () => {
@@ -129,12 +85,8 @@ function waitForOpen(ws: WebSocket): Promise<void> {
129
85
  });
130
86
  }
131
87
 
132
- /**
133
- * Handle one incoming WebSocket message frame.
134
- *
135
- * Extracted into a top-level function to keep `open()` under the cognitive
136
- * complexity limit while retaining full access to the session state via refs.
137
- */
88
+ // Extracted to a top-level function to keep `open()` under the cognitive
89
+ // complexity limit; session state is threaded through via the ref callbacks.
138
90
  function handleRimeMessage(
139
91
  raw: WebSocket.Data,
140
92
  emitter: Emitter<TtsEvents>,
@@ -145,7 +97,6 @@ function handleRimeMessage(
145
97
  try {
146
98
  msg = JSON.parse(typeof raw === "string" ? raw : raw.toString()) as RimeMessage;
147
99
  } catch {
148
- // Unparseable frame — ignore.
149
100
  return;
150
101
  }
151
102
 
@@ -153,9 +104,8 @@ function handleRimeMessage(
153
104
  const pcm = base64ToPcm(msg.data);
154
105
  if (pcm.length > 0) {
155
106
  emitter.emit("audio", pcm);
156
- // While we're waiting on a flush (long timer for first audio, or
157
- // short timer between chunks), each chunk resets to the short
158
- // quiescence window — so `done` fires only after audio stops.
107
+ // Each chunk resets the quiescence window so `done` fires only after
108
+ // audio stops — applies to both the first-audio and post-chunk timers.
159
109
  if (isActiveTimer()) armQuiescence();
160
110
  }
161
111
  return;
@@ -166,10 +116,8 @@ function handleRimeMessage(
166
116
  makeTtsError("tts_stream_error", `Rime TTS: ${msg.message ?? "unknown error"}`),
167
117
  );
168
118
  }
169
- // Ignore `type === "timestamps"` and unknown message types.
170
119
  }
171
120
 
172
- /** Build a {@link TtsOpener} from resolved Rime descriptor options. */
173
121
  export function openRime(opts: RimeOptions): TtsOpener {
174
122
  return {
175
123
  name: "rime",
@@ -187,7 +135,6 @@ export function openRime(opts: RimeOptions): TtsOpener {
187
135
  const lang = opts.language ?? "eng";
188
136
  const voice = opts.voice ?? RIME_DEFAULT_VOICE;
189
137
 
190
- // Construct the ws2 URL with query parameters.
191
138
  const url = `wss://users-ws.rime.ai/ws2?speaker=${encodeURIComponent(voice)}&modelId=${encodeURIComponent(model)}&audioFormat=pcm&samplingRate=${sampleRate}&lang=${encodeURIComponent(lang)}`;
192
139
 
193
140
  let ws: WebSocket;
@@ -205,12 +152,6 @@ export function openRime(opts: RimeOptions): TtsOpener {
205
152
  const emitter: Emitter<TtsEvents> = createNanoEvents<TtsEvents>();
206
153
  let closed = false;
207
154
  let doneEmitted = false;
208
- /**
209
- * After `flush()`, we arm a timer that fires `done`. Initial timeout is
210
- * `FIRST_AUDIO_TIMEOUT_MS` to give Rime headroom on TTFB; the first
211
- * chunk swaps it for a shorter `QUIESCENCE_MS` window that resets on
212
- * each subsequent chunk. `cancel()` emits `done` synchronously.
213
- */
214
155
  let quiescenceTimer: ReturnType<typeof setTimeout> | null = null;
215
156
 
216
157
  const clearQuiescence = () => {
@@ -264,7 +205,7 @@ export function openRime(opts: RimeOptions): TtsOpener {
264
205
  try {
265
206
  ws.close();
266
207
  } catch {
267
- // Swallow: caller has already decided to tear down.
208
+ // Caller has already decided to tear down.
268
209
  }
269
210
  };
270
211
 
@@ -278,7 +219,6 @@ export function openRime(opts: RimeOptions): TtsOpener {
278
219
  sendText(text: string) {
279
220
  if (closed || text.length === 0) return;
280
221
  if (ws.readyState !== WebSocket.OPEN) return;
281
- // Reset done state at the start of a new turn.
282
222
  doneEmitted = false;
283
223
  ws.send(JSON.stringify({ text }));
284
224
  },
@@ -286,19 +226,16 @@ export function openRime(opts: RimeOptions): TtsOpener {
286
226
  flush() {
287
227
  if (closed) return;
288
228
  if (ws.readyState !== WebSocket.OPEN) return;
289
- // Force synthesis of any text buffered behind a missing terminal
290
- // punctuation: append a trailing `"."`. Sending the JSON `eos`
291
- // operation would close the connection, requiring a reconnect on
292
- // every turn — `"."` keeps the WS reusable. Use the longer
293
- // first-audio timer until the initial chunk arrives; the chunk
294
- // handler swaps it for short quiescence on each subsequent chunk.
229
+ // Force synthesis of any text buffered behind missing terminal
230
+ // punctuation: a trailing `"."` keeps the WS reusable, whereas
231
+ // the JSON `eos` operation would close it and require a
232
+ // reconnect every turn.
295
233
  ws.send(JSON.stringify({ text: "." }));
296
234
  armFirstAudioTimer();
297
235
  },
298
236
 
299
237
  cancel() {
300
238
  if (closed) return;
301
- // Drop Rime's server-side buffer for barge-in.
302
239
  if (ws.readyState === WebSocket.OPEN) {
303
240
  ws.send(JSON.stringify({ operation: "clear" }));
304
241
  }
@@ -3,13 +3,19 @@
3
3
  import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
4
4
  import { jsonLogger } from "./runtime-config.ts";
5
5
 
6
- /** Parse the JSON line at `index` from `chunks`, failing if missing. */
7
6
  function parseEntry(chunks: string[], index: number): Record<string, unknown> {
8
7
  const raw = chunks[index];
9
8
  if (raw === undefined) throw new Error(`No chunk at index ${index}`);
10
9
  return JSON.parse(raw) as Record<string, unknown>;
11
10
  }
12
11
 
12
+ function spyWriteInto(stream: NodeJS.WriteStream, sink: string[]): void {
13
+ vi.spyOn(stream, "write").mockImplementation((chunk: string | Uint8Array) => {
14
+ sink.push(String(chunk));
15
+ return true;
16
+ });
17
+ }
18
+
13
19
  describe("jsonLogger", () => {
14
20
  let stdoutChunks: string[];
15
21
  let stderrChunks: string[];
@@ -17,14 +23,8 @@ describe("jsonLogger", () => {
17
23
  beforeEach(() => {
18
24
  stdoutChunks = [];
19
25
  stderrChunks = [];
20
- vi.spyOn(process.stdout, "write").mockImplementation((chunk: string | Uint8Array) => {
21
- stdoutChunks.push(String(chunk));
22
- return true;
23
- });
24
- vi.spyOn(process.stderr, "write").mockImplementation((chunk: string | Uint8Array) => {
25
- stderrChunks.push(String(chunk));
26
- return true;
27
- });
26
+ spyWriteInto(process.stdout, stdoutChunks);
27
+ spyWriteInto(process.stderr, stderrChunks);
28
28
  });
29
29
 
30
30
  afterEach(() => {