getpatter 0.6.1 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-CL2U3YET.mjs +1429 -0
- package/dist/{chunk-TEW3NAZJ.mjs → chunk-LE63CSOB.mjs} +371 -1486
- package/dist/{chunk-RV7APPYE.mjs → chunk-R2T4JABZ.mjs} +13 -0
- package/dist/cli.js +48 -23
- package/dist/dashboard/ui.html +8 -8
- package/dist/index.d.mts +452 -186
- package/dist/index.d.ts +452 -186
- package/dist/index.js +1485 -979
- package/dist/index.mjs +973 -790
- package/dist/openai-realtime-2-CNFARP25.mjs +8 -0
- package/dist/{silero-vad-NSEXI4XS.mjs → silero-vad-LNDFGIY7.mjs} +1 -1
- package/dist/{test-mode-WEKKNBLD.mjs → test-mode-RS57BDM6.mjs} +2 -1
- package/package.json +1 -1
- package/src/dashboard/ui.html +8 -8
|
@@ -0,0 +1,1429 @@
|
|
|
1
|
+
import {
|
|
2
|
+
getLogger
|
|
3
|
+
} from "./chunk-MVOQFAEO.mjs";
|
|
4
|
+
import {
|
|
5
|
+
init_esm_shims
|
|
6
|
+
} from "./chunk-N565J3CF.mjs";
|
|
7
|
+
|
|
8
|
+
// src/providers/openai-realtime-2.ts
|
|
9
|
+
init_esm_shims();
|
|
10
|
+
import WebSocket2 from "ws";
|
|
11
|
+
|
|
12
|
+
// src/providers/openai-realtime.ts
|
|
13
|
+
init_esm_shims();
|
|
14
|
+
import WebSocket from "ws";
|
|
15
|
+
/** Audio encodings accepted for the session's input and output audio format. */
var OpenAIRealtimeAudioFormat = {
  G711_ULAW: "g711_ulaw",
  G711_ALAW: "g711_alaw",
  PCM16: "pcm16",
};

/** Model identifiers that can be placed in the Realtime endpoint's `model` query parameter. */
var OpenAIRealtimeModel = {
  GPT_REALTIME: "gpt-realtime",
  GPT_REALTIME_2: "gpt-realtime-2",
  GPT_REALTIME_MINI: "gpt-realtime-mini",
  GPT_4O_REALTIME_PREVIEW: "gpt-4o-realtime-preview",
  GPT_4O_MINI_REALTIME_PREVIEW: "gpt-4o-mini-realtime-preview",
};

/** Voice names usable as the session `voice`. */
var OpenAIVoice = {
  ALLOY: "alloy",
  ASH: "ash",
  BALLAD: "ballad",
  CORAL: "coral",
  ECHO: "echo",
  FABLE: "fable",
  NOVA: "nova",
  ONYX: "onyx",
  SAGE: "sage",
  SHIMMER: "shimmer",
  VERSE: "verse",
};

/** Models usable for `input_audio_transcription.model`. */
var OpenAITranscriptionModel = {
  WHISPER_1: "whisper-1",
  GPT_4O_TRANSCRIBE: "gpt-4o-transcribe",
  GPT_4O_MINI_TRANSCRIBE: "gpt-4o-mini-transcribe",
  GPT_REALTIME_WHISPER: "gpt-realtime-whisper",
};

/** Values usable for `turn_detection.type`. */
var OpenAIRealtimeVADType = {
  SERVER_VAD: "server_vad",
  SEMANTIC_VAD: "semantic_vad",
};
|
|
50
|
+
var OpenAIRealtimeAdapter = class {
  /**
   * Adapter around an OpenAI Realtime WebSocket session.
   *
   * @param apiKey Sent as the `Authorization: Bearer …` header on every socket.
   * @param model URL-encoded into the `model` query parameter of the endpoint.
   * @param voice Session `voice`.
   * @param instructions Session instructions; an empty value falls back to a
   *   built-in default prompt in {@link buildSessionConfig}.
   * @param tools Optional list of tool definitions; `name`, `description`,
   *   `parameters` and `strict` are read from each entry.
   * @param audioFormat Used for both `input_audio_format` and
   *   `output_audio_format`, and for playback-duration estimation.
   * @param options Optional overrides read by {@link buildSessionConfig}:
   *   `vadType`, `silenceDurationMs`, `inputAudioTranscriptionModel`,
   *   `temperature`, `maxResponseOutputTokens`, `modalities`, `toolChoice`,
   *   `reasoningEffort`.
   */
  constructor(apiKey, model = OpenAIRealtimeModel.GPT_REALTIME_MINI, voice = OpenAIVoice.ALLOY, instructions = "", tools, audioFormat = OpenAIRealtimeAudioFormat.G711_ULAW, options = {}) {
    this.apiKey = apiKey;
    this.model = model;
    this.voice = voice;
    this.instructions = instructions;
    this.tools = tools;
    this.audioFormat = audioFormat;
    this.options = options;
  }
  apiKey;
  model;
  voice;
  instructions;
  tools;
  audioFormat;
  // Fields exposed `protected` (not `private`) so a subclass can implement
  // alternate transports — e.g. `OpenAIRealtime2Adapter` overrides
  // `connect()` to speak the GA Realtime API while reusing the rest of
  // the runtime (audio dispatch, barge-in, heartbeat).
  ws = null;
  eventCallbacks = /* @__PURE__ */ new Set();
  messageListenerAttached = false;
  heartbeat = null;
  // Track the in-flight assistant item id so we can truncate cleanly on
  // barge-in (see ``cancelResponse``) — matches the Python adapter.
  currentResponseItemId = null;
  currentResponseAudioMs = 0;
  // Wall-clock timestamp (Date.now()) of the first ``response.audio.delta``
  // received since the current response item started. ``cancelResponse``
  // uses this to bound ``audio_end_ms`` to what the caller could plausibly
  // have heard — generated audio frequently arrives 5-10x real-time, so
  // ``audio_end_ms`` driven purely by the per-chunk byte counter overshoots
  // reality and leaves phantom assistant text on the conversation. The
  // wall-clock cap corresponds to the maximum playback that real-time TTS
  // could have produced, which is what the user actually heard.
  currentResponseFirstAudioAt = null;
  options;
  /**
   * Build the production session.update body. Mirrors the body sent
   * inside `connect()` so warmup can apply identical configuration to
   * the upstream session and prime it without billing.
   *
   * @returns A plain object suitable for the `session` field of a
   *   `session.update` event. Optional fields are only present when the
   *   corresponding option was supplied.
   */
  buildSessionConfig() {
    const config = {
      input_audio_format: this.audioFormat,
      output_audio_format: this.audioFormat,
      voice: this.voice,
      instructions: this.instructions || "You are a helpful voice assistant. Be concise.",
      turn_detection: {
        type: this.options.vadType ?? OpenAIRealtimeVADType.SERVER_VAD,
        threshold: 0.5,
        prefix_padding_ms: 300,
        silence_duration_ms: this.options.silenceDurationMs ?? 300
      },
      input_audio_transcription: {
        model: this.options.inputAudioTranscriptionModel ?? OpenAITranscriptionModel.WHISPER_1
      }
    };
    if (this.options.temperature !== void 0) config.temperature = this.options.temperature;
    if (this.options.maxResponseOutputTokens !== void 0) {
      config.max_response_output_tokens = this.options.maxResponseOutputTokens;
    }
    if (this.options.modalities !== void 0) config.modalities = this.options.modalities;
    if (this.options.toolChoice !== void 0) config.tool_choice = this.options.toolChoice;
    if (this.options.reasoningEffort !== void 0) {
      config.reasoning = { effort: this.options.reasoningEffort };
    }
    if (this.tools?.length) {
      config.tools = this.tools.map((t) => {
        const def = {
          type: "function",
          name: t.name,
          description: t.description,
          parameters: t.parameters
        };
        // `strict` is only forwarded when explicitly enabled.
        if (t.strict === true) {
          def.strict = true;
        }
        return def;
      });
    }
    return config;
  }
  /**
   * Pre-call WebSocket warmup for the OpenAI Realtime endpoint.
   *
   * The canonical session-only warm step on the Realtime API: open the
   * WS, wait for `session.created`, send a single `session.update`
   * containing the same fields that the production `connect()` path
   * applies (`input_audio_format`, `output_audio_format`, `voice`,
   * `instructions`, `turn_detection`, `input_audio_transcription`,
   * plus any opt-in fields populated on the adapter), wait for the
   * matching `session.updated` ack, then close cleanly. This primes
   * the per-session state on the OpenAI side — DNS + TLS + auth
   * handshake + initial config exchange — without ever invoking the
   * model.
   *
   * Earlier revisions sent `response.create` with
   * `{"response": {"generate": false}}` to prime the inference path.
   * That field is NOT in the OpenAI Realtime API schema; the server
   * either ignores it (and bills tokens for a real model response) or
   * rejects the request with `invalid_request_error`. Both behaviours
   * are billing-unsafe or a no-op beyond TLS warm. The
   * `session.update` flow is documented and side-effect-free.
   *
   * Billing safety: `session.update` only mutates session
   * configuration. It does NOT invoke the model, does NOT consume any
   * audio buffer, and does NOT trigger token generation, so no
   * per-token cost is accrued. Best-effort: failures are logged at
   * debug level and never raised.
   */
  async warmup() {
    const url = `wss://api.openai.com/v1/realtime?model=${encodeURIComponent(this.model)}`;
    let ws = null;
    try {
      // Phase 1: open the socket (5 s budget).
      ws = await new Promise((resolve, reject) => {
        const sock = new WebSocket(url, {
          headers: {
            Authorization: `Bearer ${this.apiKey}`
          }
        });
        const timer = setTimeout(() => {
          try {
            sock.close();
          } catch {
            // Best-effort close; the timeout rejection below is what matters.
          }
          reject(new Error("OpenAI Realtime warmup connect timeout"));
        }, 5e3);
        sock.once("open", () => {
          clearTimeout(timer);
          resolve(sock);
        });
        sock.once("error", (err) => {
          clearTimeout(timer);
          reject(err);
        });
      });
      // Phase 2: wait up to 2 s for `session.created`.
      const sessionCreated = await new Promise((resolve) => {
        const timer = setTimeout(() => resolve(false), 2e3);
        const onMsg = (raw) => {
          try {
            const data = JSON.parse(raw.toString());
            if (data.type === "session.created") {
              clearTimeout(timer);
              ws.off("message", onMsg);
              resolve(true);
            }
          } catch {
            // Unparseable frames are ignored during warmup.
          }
        };
        ws.on("message", onMsg);
      });
      if (!sessionCreated) return;
      try {
        ws.send(JSON.stringify({ type: "session.update", session: this.buildSessionConfig() }));
      } catch {
        return;
      }
      // Phase 3: wait up to 1.5 s for the `session.updated` ack; a timeout
      // simply ends the warmup.
      await new Promise((resolve) => {
        const timer = setTimeout(() => resolve(), 1500);
        const onMsg = (raw) => {
          try {
            const data = JSON.parse(raw.toString());
            if (data.type === "session.updated") {
              clearTimeout(timer);
              ws.off("message", onMsg);
              resolve();
            }
          } catch {
            // Unparseable frames are ignored during warmup.
          }
        };
        ws.on("message", onMsg);
      });
    } catch (err) {
      getLogger().debug(`OpenAI Realtime warmup failed (best-effort): ${String(err)}`);
    } finally {
      if (ws) {
        try {
          ws.close();
        } catch {
          // Nothing useful to do if the warmup socket refuses to close.
        }
      }
    }
  }
  /**
   * Open the Realtime WebSocket and apply the session configuration.
   *
   * Resolves once `session.updated` is received, then arms the heartbeat
   * and the persistent message listener. Rejects on socket error, on a
   * failed `session.update` send, or after 15 s without `session.updated`;
   * in every failure case the socket is closed.
   */
  async connect() {
    const url = `wss://api.openai.com/v1/realtime?model=${encodeURIComponent(this.model)}`;
    this.ws = new WebSocket(url, {
      headers: {
        Authorization: `Bearer ${this.apiKey}`
      }
    });
    await new Promise((resolve, reject) => {
      let sessionCreated = false;
      let settled = false;
      const ws = this.ws;
      const onSetupMessage = (raw) => {
        let msg;
        try {
          msg = JSON.parse(raw.toString());
        } catch (e) {
          getLogger().warn(`OpenAI Realtime: failed to parse message: ${String(e)}`);
          return;
        }
        if (msg.type === "session.created" && !sessionCreated) {
          sessionCreated = true;
          try {
            ws.send(JSON.stringify({ type: "session.update", session: this.buildSessionConfig() }));
          } catch (err) {
            // A throw inside an event listener would escape as an uncaught
            // exception; surface it as a rejected connect() instead.
            fail(err instanceof Error ? err : new Error(String(err)));
          }
        } else if (msg.type === "session.updated") {
          cleanup();
          resolve();
        }
      };
      const fail = (err) => {
        if (settled) return;
        cleanup();
        // cleanup() removed our 'error' listener. Closing a socket that is
        // still CONNECTING makes `ws` emit 'error', and an 'error' event
        // with no listener throws — so keep a sink attached to the dead socket.
        ws.on("error", () => {
        });
        try {
          ws.close();
        } catch {
          // The rejection below already reports the failure.
        }
        reject(err);
      };
      const onSetupError = (err) => {
        fail(err);
      };
      const cleanup = () => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        ws.off("message", onSetupMessage);
        ws.off("error", onSetupError);
      };
      const timer = setTimeout(() => {
        fail(new Error("OpenAI Realtime connect timeout"));
      }, 15e3);
      ws.on("message", onSetupMessage);
      ws.on("error", onSetupError);
    });
    this.armHeartbeatAndListener();
  }
  /**
   * Adopt a pre-opened, already-`session.updated` Realtime WebSocket
   * produced by the prewarm pipeline (see `Patter.parkProviderConnections`).
   * Skips the fresh `new WebSocket()` + `session.created` /
   * `session.update` round-trip — saves ~250-450 ms on first turn.
   *
   * Caller MUST verify `ws.readyState === OPEN` before calling and MUST
   * have already received `session.updated` on the parked socket. If
   * the parked WS died between park and adopt, fall back to `connect()`.
   */
  adoptWebSocket(ws) {
    this.ws = ws;
    this.armHeartbeatAndListener();
  }
  /**
   * Start the 20 s ping heartbeat and attach the persistent message
   * listener to the current socket.
   *
   * Any heartbeat armed by an earlier call is cleared first; otherwise the
   * previous interval handle would be overwritten and could never be
   * cleared by `close()`.
   */
  armHeartbeatAndListener() {
    if (this.heartbeat) {
      clearInterval(this.heartbeat);
    }
    this.heartbeat = setInterval(() => {
      try {
        this.ws?.ping();
      } catch {
        // Ping failures are ignored here.
      }
    }, 2e4);
    this.ensureMessageListener();
  }
  /**
   * Open a fresh Realtime WS, exchange `session.created` /
   * `session.update` / `session.updated` (so the upstream session is
   * fully primed), and return the OPEN socket WITHOUT arming the
   * heartbeat / message listener. Used by the prewarm pipeline to park
   * a Realtime connection during ringing; the live consumer adopts it
   * via {@link adoptWebSocket}.
   *
   * Bounded by 8 s. Throws on timeout / handshake failure — callers
   * (the prewarm pipeline) treat any error as a cache miss and the
   * call falls through to the cold `connect()` path. On failure the
   * half-open socket is closed here, since it is never handed to the
   * caller.
   *
   * Billing safety: `session.update` does not invoke the model. No
   * tokens are billed.
   */
  async openParkedConnection() {
    const url = `wss://api.openai.com/v1/realtime?model=${encodeURIComponent(this.model)}`;
    const ws = new WebSocket(url, {
      headers: {
        Authorization: `Bearer ${this.apiKey}`
      }
    });
    // Sink for 'error' events that arrive while no functional listener is
    // attached (after a failed setup, or while the socket sits parked). An
    // 'error' event without any listener would be thrown as an uncaught
    // exception.
    const swallowError = () => {
    };
    await new Promise((resolve, reject) => {
      let sessionCreated = false;
      let settled = false;
      const onMessage = (raw) => {
        let msg;
        try {
          msg = JSON.parse(raw.toString());
        } catch {
          return;
        }
        if (msg.type === "session.created" && !sessionCreated) {
          sessionCreated = true;
          try {
            ws.send(JSON.stringify({ type: "session.update", session: this.buildSessionConfig() }));
          } catch (err) {
            fail(err instanceof Error ? err : new Error(String(err)));
          }
        } else if (msg.type === "session.updated") {
          cleanup();
          // Parked sockets have no consumer yet; keep the sink attached so
          // a failure while parked is not an uncaught 'error' event.
          ws.on("error", swallowError);
          resolve();
        }
      };
      const fail = (err) => {
        if (settled) return;
        cleanup();
        ws.on("error", swallowError);
        try {
          ws.close();
        } catch {
          // The rejection below already reports the failure.
        }
        reject(err);
      };
      const onError = (err) => {
        fail(err);
      };
      const cleanup = () => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        ws.off("message", onMessage);
        ws.off("error", onError);
      };
      const timer = setTimeout(() => {
        fail(new Error("OpenAI Realtime park connect timeout"));
      }, 8e3);
      ws.on("message", onMessage);
      ws.on("error", onError);
    });
    return ws;
  }
  /** Append a base64-encoded audio chunk to the realtime input buffer. */
  sendAudio(mulawAudio) {
    // Audio is silently dropped while the socket is missing or not OPEN.
    if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;
    this.ws.send(JSON.stringify({ type: "input_audio_buffer.append", audio: mulawAudio.toString("base64") }));
  }
  /**
   * Register a listener for parsed realtime events.
   *
   * Previously every call attached a new ``ws.on('message')`` handler,
   * which leaked listeners across retries and multi-consumer hooks. We now
   * route all traffic through a single persistent handler that fans out to
   * a Set of callbacks. Use {@link offEvent} to remove one.
   */
  onEvent(callback) {
    this.eventCallbacks.add(callback);
    this.ensureMessageListener();
  }
  /** Remove a previously registered {@link onEvent} callback. */
  offEvent(callback) {
    this.eventCallbacks.delete(callback);
  }
  /**
   * Attach the single message / close / error listener set to the current
   * socket, once. Incoming events are translated to `(type, payload)` pairs
   * and fanned out to every registered callback; callback failures (sync
   * return of a rejected promise) are logged, not propagated.
   */
  ensureMessageListener() {
    if (this.messageListenerAttached || !this.ws) return;
    this.messageListenerAttached = true;
    const ws = this.ws;
    const dispatch = (type, payload) => {
      for (const cb of this.eventCallbacks) {
        void Promise.resolve(cb(type, payload)).catch(
          (err) => getLogger().error("onEvent callback error:", err)
        );
      }
    };
    ws.on("message", (raw) => {
      let data;
      try {
        data = JSON.parse(raw.toString());
      } catch (e) {
        getLogger().warn(`OpenAI Realtime: failed to parse event message: ${String(e)}`);
        return;
      }
      const t = data.type;
      if (t === "response.audio.delta") {
        const buf = Buffer.from(data.delta ?? "", "base64");
        // Accumulate generated-audio duration and remember when the first
        // chunk of this response arrived (both feed `cancelResponse`).
        this.currentResponseAudioMs += estimateAudioMs(buf, this.audioFormat);
        if (this.currentResponseFirstAudioAt === null) {
          this.currentResponseFirstAudioAt = Date.now();
        }
        dispatch("audio", buf);
      } else if (t === "response.audio_transcript.delta") {
        dispatch("transcript_output", data.delta);
      } else if (t === "response.content_part.added" || t === "response.output_item.added") {
        const itemId = data.item?.id ?? data.item_id ?? null;
        if (itemId) {
          // A new item starts: reset the per-item playback counters.
          this.currentResponseItemId = itemId;
          this.currentResponseAudioMs = 0;
          this.currentResponseFirstAudioAt = null;
        }
      } else if (t === "input_audio_buffer.speech_started") {
        dispatch("speech_started", null);
      } else if (t === "input_audio_buffer.speech_stopped") {
        dispatch("speech_stopped", null);
      } else if (t === "conversation.item.input_audio_transcription.completed") {
        dispatch("transcript_input", data.transcript);
      } else if (t === "response.function_call_arguments.done") {
        dispatch("function_call", { call_id: data.call_id, name: data.name, arguments: data.arguments });
      } else if (t === "response.done") {
        this.currentResponseItemId = null;
        this.currentResponseAudioMs = 0;
        this.currentResponseFirstAudioAt = null;
        dispatch("response_done", data.response ?? null);
      } else if (t === "error") {
        dispatch("error", data.error);
      }
    });
    ws.on("close", (code, reason) => {
      // Only a close code other than 1000 is reported as an error.
      if (code !== 1e3) {
        dispatch("error", {
          type: "connection_closed",
          code,
          reason: reason?.toString() ?? ""
        });
      }
    });
    ws.on("error", (err) => {
      dispatch("error", { type: "socket_error", message: err?.message ?? String(err) });
    });
  }
  /** Truncate the in-flight assistant turn and cancel the active response.
   *
   * ``audio_end_ms`` MUST reflect what the caller actually heard, not what
   * the server generated. OpenAI streams audio at 5-10x real-time, so the
   * byte-derived counter overstates playback whenever the consumer cleared
   * its playout buffer (e.g. ``send_clear``) before the audio reached the
   * speaker. We bound the truncate point by wall-clock time since the first
   * chunk of this response — that's the physical maximum a 1x real-time
   * playback could have produced. Without this cap, OpenAI keeps the full
   * generated assistant text on the transcript, and the model replays /
   * resumes from it on the next turn — manifesting as re-greetings and
   * mid-sentence fragments after a barge-in storm.
   *
   * No-op when there is no socket or no tracked in-flight response item.
   */
  cancelResponse() {
    if (!this.ws) return;
    if (!this.currentResponseItemId) {
      return;
    }
    let audioEndMs = this.currentResponseAudioMs;
    if (this.currentResponseFirstAudioAt !== null) {
      const elapsedMs = Date.now() - this.currentResponseFirstAudioAt;
      audioEndMs = Math.min(audioEndMs, Math.max(elapsedMs, 0));
    }
    try {
      this.ws.send(JSON.stringify({
        type: "conversation.item.truncate",
        item_id: this.currentResponseItemId,
        content_index: 0,
        audio_end_ms: audioEndMs
      }));
    } catch (err) {
      getLogger().debug?.(`conversation.item.truncate failed: ${String(err)}`);
    }
    this.ws.send(JSON.stringify({ type: "response.cancel" }));
    this.currentResponseItemId = null;
    this.currentResponseAudioMs = 0;
    this.currentResponseFirstAudioAt = null;
  }
  /** Inject a user text turn and request a new response. */
  async sendText(text) {
    this.ws?.send(JSON.stringify({
      type: "conversation.item.create",
      item: { type: "message", role: "user", content: [{ type: "input_text", text }] }
    }));
    this.ws?.send(JSON.stringify({ type: "response.create" }));
  }
  /**
   * Trigger `response.create` with no new user item.
   *
   * Used by the Realtime stream-handler to drive a response after the
   * client-side hallucination filter accepts an
   * `input_audio_transcription.completed` event. The server VAD config
   * sets `create_response: false` so OpenAI no longer auto-creates a
   * response on every `input_audio_buffer.committed`; Patter is now
   * responsible for triggering it explicitly when a real user turn lands.
   */
  async requestResponse() {
    this.ws?.send(JSON.stringify({ type: "response.create" }));
  }
  /**
   * Make the AI speak ``text`` as its opening line.
   *
   * Triggers ``response.create`` with explicit ``instructions`` that force
   * the model to render ``text`` verbatim as its first audio utterance.
   * This is the correct semantics for ``Agent.firstMessage`` per its
   * docstring ("What the AI says when the callee answers").
   *
   * Without this, ``sendText(firstMessage)`` would inject ``text`` as
   * ``role: user`` and the AI would *reply* to its own greeting, producing
   * role-confused openings (e.g. a receptionist agent responding "I'd like
   * to schedule a haircut" because it took its own first_message as a
   * customer cue).
   */
  async sendFirstMessage(text) {
    this.ws?.send(JSON.stringify({
      type: "response.create",
      response: {
        modalities: ["audio", "text"],
        instructions: `Say exactly the following sentence as your first turn and nothing else: "${text}"`
      }
    }));
  }
  /** Submit a tool/function-call result and request the next response. */
  async sendFunctionResult(callId, result) {
    this.ws?.send(JSON.stringify({
      type: "conversation.item.create",
      item: { type: "function_call_output", call_id: callId, output: result }
    }));
    this.ws?.send(JSON.stringify({ type: "response.create" }));
  }
  /** Stop the heartbeat, drop listeners, and close the Realtime WebSocket. */
  close() {
    if (this.heartbeat) {
      clearInterval(this.heartbeat);
      this.heartbeat = null;
    }
    this.eventCallbacks.clear();
    // Allow a later connect()/adoptWebSocket() to attach a fresh listener.
    this.messageListenerAttached = false;
    this.ws?.close();
    this.ws = null;
  }
};
|
|
568
|
+
/**
 * Estimate the playback duration, in whole milliseconds, of one audio chunk.
 *
 * G.711 (µ-law / A-law) chunks are counted at 8 bytes per millisecond and
 * PCM16 chunks at 48 bytes per millisecond (consistent with 8 kHz 1-byte
 * samples and 24 kHz 2-byte samples respectively). The result is floored
 * per chunk. Empty chunks and unrecognised formats yield 0.
 */
function estimateAudioMs(chunk, format) {
  const byteCount = chunk.length;
  if (byteCount === 0) {
    return 0;
  }
  switch (format) {
    case OpenAIRealtimeAudioFormat.G711_ULAW:
    case OpenAIRealtimeAudioFormat.G711_ALAW:
      return Math.floor(byteCount / 8);
    case OpenAIRealtimeAudioFormat.PCM16:
      return Math.floor(byteCount / 48);
    default:
      return 0;
  }
}
|
|
577
|
+
|
|
578
|
+
// src/audio/transcoding.ts
|
|
579
|
+
init_esm_shims();
|
|
580
|
+
var MULAW_TO_PCM16_TABLE = (() => {
|
|
581
|
+
const table = new Int16Array(256);
|
|
582
|
+
for (let i = 0; i < 256; i++) {
|
|
583
|
+
const mu = ~i & 255;
|
|
584
|
+
const sign = mu & 128 ? -1 : 1;
|
|
585
|
+
const exponent = mu >> 4 & 7;
|
|
586
|
+
const mantissa = mu & 15;
|
|
587
|
+
const magnitude = (mantissa << 1 | 33) << exponent + 2;
|
|
588
|
+
table[i] = sign * (magnitude - 132);
|
|
589
|
+
}
|
|
590
|
+
return table;
|
|
591
|
+
})();
|
|
592
|
+
// Lookup table: 16-bit sample (indexed by its unsigned two's-complement
// bit pattern, 0-65535) -> µ-law byte.
var PCM16_TO_MULAW_TABLE = (() => {
  const BIAS = 132;
  const CLIP = 32635;
  const lookup = new Uint8Array(65536);
  for (let word = 0; word < lookup.length; word += 1) {
    // Reinterpret the index as a signed 16-bit sample.
    const signed = word < 32768 ? word : word - 65536;
    const signBit = signed < 0 ? 128 : 0;
    // Clip the magnitude, then add the bias; the result is in 132..32767.
    const biased = Math.min(Math.abs(signed), CLIP) + BIAS;
    // The highest set bit lies in positions 7..14, giving exponent 0..7.
    const exponent = 24 - Math.clz32(biased);
    const mantissa = (biased >> (exponent + 3)) & 15;
    lookup[word] = ~(signBit | (exponent << 4) | mantissa) & 255;
  }
  return lookup;
})();
|
|
613
|
+
/**
 * Expand µ-law bytes into PCM16 little-endian samples via the decode table.
 *
 * @param mulawData Byte sequence of µ-law samples (one byte per sample).
 * @returns A new Buffer twice as long as the input, one Int16LE per input byte.
 */
function mulawToPcm16(mulawData) {
  const total = mulawData.length;
  const pcm = Buffer.alloc(total * 2);
  for (let src = 0, dst = 0; src < total; src += 1, dst += 2) {
    pcm.writeInt16LE(MULAW_TO_PCM16_TABLE[mulawData[src]], dst);
  }
  return pcm;
}
|
|
620
|
+
/**
 * Compress PCM16 little-endian samples into µ-law bytes via the encode table.
 *
 * @param pcmData Buffer of Int16LE samples; a trailing odd byte is ignored.
 * @returns A new Buffer with one µ-law byte per complete input sample.
 */
function pcm16ToMulaw(pcmData) {
  const samples = pcmData.length >>> 1;
  const encoded = Buffer.alloc(samples);
  for (let n = 0, offset = 0; n < samples; n += 1, offset += 2) {
    // The table is indexed by the sample's unsigned 16-bit pattern.
    encoded[n] = PCM16_TO_MULAW_TABLE[pcmData.readUInt16LE(offset)];
  }
  return encoded;
}
|
|
629
|
+
/**
 * Byte-alignment helper for streamed 16-bit PCM: holds back a trailing odd
 * byte from one chunk and prepends it to the next, so every buffer handed
 * downstream has an even length.
 */
var PcmCarry = class {
  // The single carried byte (as a 1-byte Buffer), or null when aligned.
  pending = null;
  /**
   * Prepend any carried odd byte, return the even-length prefix, and stash
   * any new trailing odd byte for the next call.
   *
   * Returns a zero-length buffer when no complete sample is yet available.
   * The returned buffer may share memory with `chunk`.
   */
  push(chunk) {
    const combined = this.pending !== null ? Buffer.concat([this.pending, chunk]) : chunk;
    this.pending = null;
    const alignedLen = combined.length & ~1;
    if (alignedLen < combined.length) {
      // Copy the odd byte instead of keeping a view: a view would alias the
      // caller's chunk (so reusing that buffer corrupts the carried byte)
      // and would keep the whole chunk's memory alive until the next push.
      this.pending = Buffer.from(combined.subarray(alignedLen));
    }
    return combined.subarray(0, alignedLen);
  }
  /**
   * Return any pending byte as a 1-byte buffer (rare in practice — only if
   * the entire stream had an odd byte count), then reset internal state.
   * Returns a zero-length buffer when nothing is pending.
   */
  flush() {
    if (this.pending === null) return Buffer.alloc(0);
    const out = this.pending;
    this.pending = null;
    return out;
  }
  /** Reset carry state without flushing. */
  reset() {
    this.pending = null;
  }
};
|
|
661
|
+
var StatefulResampler = class {
|
|
662
|
+
srcRate;
|
|
663
|
+
dstRate;
|
|
664
|
+
// 16k→8k: 5-tap FIR state.
|
|
665
|
+
// Extended sample buffer carries the 2 history samples that precede the
|
|
666
|
+
// current chunk AND any "pending" input sample that did not yet generate
|
|
667
|
+
// output (i.e. the odd sample when the chunk had an odd sample count).
|
|
668
|
+
// `firPhase` = 0 means the next output is at input position 0 of the
|
|
669
|
+
// current chunk; 1 means it starts at input position 1 (because the
|
|
670
|
+
// previous chunk ended on an even-output boundary).
|
|
671
|
+
firHistory = new Int16Array(2);
|
|
672
|
+
// [s_{-2}, s_{-1}]
|
|
673
|
+
firHistoryValid = false;
|
|
674
|
+
// Pending sample carried from odd-count chunks (not the byte carry —
|
|
675
|
+
// this is a complete Int16 sample that becomes the first input for the
|
|
676
|
+
// next call).
|
|
677
|
+
firPendingSample = null;
|
|
678
|
+
// 8k→16k: last input sample deferred across chunk boundaries.
|
|
679
|
+
upsampleLast = 0;
|
|
680
|
+
upsampleHasHistory = false;
|
|
681
|
+
// 24k→16k: fractional phase and last input sample across chunks.
|
|
682
|
+
resample24Last = 0;
|
|
683
|
+
resample24Phase = 0;
|
|
684
|
+
resample24HasHistory = false;
|
|
685
|
+
// Odd-byte alignment carry.
|
|
686
|
+
carry = new PcmCarry();
|
|
687
|
+
constructor(opts) {
|
|
688
|
+
this.srcRate = opts.srcRate;
|
|
689
|
+
this.dstRate = opts.dstRate;
|
|
690
|
+
if (opts.channels !== void 0 && opts.channels !== 1) {
|
|
691
|
+
throw new Error("StatefulResampler: only mono (channels=1) is supported");
|
|
692
|
+
}
|
|
693
|
+
const key = `${this.srcRate}->${this.dstRate}`;
|
|
694
|
+
if (key !== "16000->8000" && key !== "8000->16000" && key !== "24000->16000" && key !== "24000->8000") {
|
|
695
|
+
throw new Error(
|
|
696
|
+
`StatefulResampler: unsupported conversion ${key}. Supported: 16000->8000, 8000->16000, 24000->16000, 24000->8000`
|
|
697
|
+
);
|
|
698
|
+
}
|
|
699
|
+
}
|
|
700
|
+
/**
|
|
701
|
+
* Process a chunk of PCM16-LE samples.
|
|
702
|
+
*
|
|
703
|
+
* Handles odd-byte inputs via an internal carry buffer. Returns an even-byte-
|
|
704
|
+
* aligned output buffer; may return a zero-length buffer if not enough
|
|
705
|
+
* aligned input is available yet.
|
|
706
|
+
*/
|
|
707
|
+
process(pcm) {
|
|
708
|
+
const aligned = this.carry.push(pcm);
|
|
709
|
+
if (aligned.length === 0) return Buffer.alloc(0);
|
|
710
|
+
if (this.srcRate === 16e3 && this.dstRate === 8e3) {
|
|
711
|
+
return this._downsample16kTo8k(aligned);
|
|
712
|
+
}
|
|
713
|
+
if (this.srcRate === 8e3 && this.dstRate === 16e3) {
|
|
714
|
+
return this._upsample8kTo16k(aligned);
|
|
715
|
+
}
|
|
716
|
+
if (this.srcRate === 24e3 && this.dstRate === 8e3) {
|
|
717
|
+
return this._resample24kTo8k(aligned);
|
|
718
|
+
}
|
|
719
|
+
return this._resample24kTo16k(aligned);
|
|
720
|
+
}
|
|
721
|
+
/**
|
|
722
|
+
* Flush internal state and return any remaining output samples.
|
|
723
|
+
*
|
|
724
|
+
* For 8k→16k: the deferred last sample is emitted duplicated (matching
|
|
725
|
+
* the stateless helper's end-of-stream behaviour).
|
|
726
|
+
* For 16k→8k: any pending odd sample is processed with edge-replication.
|
|
727
|
+
* Resets all state after flushing.
|
|
728
|
+
*/
|
|
729
|
+
flush() {
|
|
730
|
+
this.carry.flush();
|
|
731
|
+
if (this.srcRate === 16e3 && this.dstRate === 8e3 && this.firPendingSample !== null) {
|
|
732
|
+
const s = this.firPendingSample;
|
|
733
|
+
const tmp = Buffer.alloc(4);
|
|
734
|
+
tmp.writeInt16LE(s, 0);
|
|
735
|
+
tmp.writeInt16LE(s, 2);
|
|
736
|
+
const out = this._downsample16kTo8k(tmp);
|
|
737
|
+
this.firPendingSample = null;
|
|
738
|
+
return out;
|
|
739
|
+
}
|
|
740
|
+
if (this.srcRate === 8e3 && this.dstRate === 16e3 && this.upsampleHasHistory) {
|
|
741
|
+
const out = Buffer.alloc(4);
|
|
742
|
+
out.writeInt16LE(this.upsampleLast, 0);
|
|
743
|
+
out.writeInt16LE(this.upsampleLast, 2);
|
|
744
|
+
this.upsampleHasHistory = false;
|
|
745
|
+
this.upsampleLast = 0;
|
|
746
|
+
return out;
|
|
747
|
+
}
|
|
748
|
+
return Buffer.alloc(0);
|
|
749
|
+
}
|
|
750
|
+
/** Reset all carried state (e.g. at call boundaries). */
|
|
751
|
+
reset() {
|
|
752
|
+
this.firHistory = new Int16Array(2);
|
|
753
|
+
this.firHistoryValid = false;
|
|
754
|
+
this.firPendingSample = null;
|
|
755
|
+
this.upsampleLast = 0;
|
|
756
|
+
this.upsampleHasHistory = false;
|
|
757
|
+
this.resample24Last = 0;
|
|
758
|
+
this.resample24Phase = 0;
|
|
759
|
+
this.resample24HasHistory = false;
|
|
760
|
+
this.carry.reset();
|
|
761
|
+
}
|
|
762
|
+
// ---------------------------------------------------------------------------
|
|
763
|
+
// Private: 16 kHz → 8 kHz
|
|
764
|
+
// ---------------------------------------------------------------------------
|
|
765
|
+
/**
|
|
766
|
+
* 2:1 decimation with a 5-tap binomial FIR anti-alias filter.
|
|
767
|
+
*
|
|
768
|
+
* FIR coefficients: [1, 4, 6, 4, 1] / 16 (cutoff ~Fs/4 = 4 kHz).
|
|
769
|
+
*
|
|
770
|
+
* Cross-chunk state:
|
|
771
|
+
* - `firHistory[0]` = s_{-2}, `firHistory[1]` = s_{-1} relative to the
|
|
772
|
+
*   virtual stream (both slots are zero-seeded on the very first call).
|
|
773
|
+
* - `firPendingSample` = a lone input sample carried from a chunk whose
|
|
774
|
+
* sample count was odd; it will become the first input of the next chunk.
|
|
775
|
+
*
|
|
776
|
+
* Decimation: outputs are at even positions (0, 2, 4 …) in the virtual
|
|
777
|
+
* extended stream, so every 2 input samples yield 1 output. An odd-sample-
|
|
778
|
+
* count chunk leaves 1 sample in `firPendingSample`; the next chunk
|
|
779
|
+
* prepends it so the output cadence is unbroken.
|
|
780
|
+
*/
|
|
781
|
+
_downsample16kTo8k(buf) {
|
|
782
|
+
const newSampleCount = buf.length >> 1;
|
|
783
|
+
const hasPending = this.firPendingSample !== null;
|
|
784
|
+
const totalInput = newSampleCount + (hasPending ? 1 : 0);
|
|
785
|
+
const input = new Int16Array(totalInput);
|
|
786
|
+
if (hasPending) {
|
|
787
|
+
input[0] = this.firPendingSample;
|
|
788
|
+
for (let j = 0; j < newSampleCount; j++) input[j + 1] = buf.readInt16LE(j * 2);
|
|
789
|
+
} else {
|
|
790
|
+
for (let j = 0; j < newSampleCount; j++) input[j] = buf.readInt16LE(j * 2);
|
|
791
|
+
}
|
|
792
|
+
this.firPendingSample = null;
|
|
793
|
+
if (totalInput === 0) return Buffer.alloc(0);
|
|
794
|
+
if (!this.firHistoryValid) {
|
|
795
|
+
this.firHistory[0] = 0;
|
|
796
|
+
this.firHistory[1] = 0;
|
|
797
|
+
this.firHistoryValid = true;
|
|
798
|
+
}
|
|
799
|
+
const extended = new Int16Array(totalInput + 2);
|
|
800
|
+
extended[0] = this.firHistory[0];
|
|
801
|
+
extended[1] = this.firHistory[1];
|
|
802
|
+
for (let j = 0; j < totalInput; j++) extended[j + 2] = input[j];
|
|
803
|
+
const outSamples = totalInput >> 1;
|
|
804
|
+
const out = Buffer.alloc(outSamples * 2);
|
|
805
|
+
for (let i = 0; i < outSamples; i++) {
|
|
806
|
+
const c = 2 + i * 2;
|
|
807
|
+
const sM2 = extended[c - 2];
|
|
808
|
+
const sM1 = extended[c - 1];
|
|
809
|
+
const s0 = extended[c];
|
|
810
|
+
const sP1 = c + 1 < extended.length ? extended[c + 1] : extended[extended.length - 1];
|
|
811
|
+
const sP2 = c + 2 < extended.length ? extended[c + 2] : extended[extended.length - 1];
|
|
812
|
+
const filtered = sM2 + 4 * sM1 + 6 * s0 + 4 * sP1 + sP2 + 8 >> 4;
|
|
813
|
+
out.writeInt16LE(Math.max(-32768, Math.min(32767, filtered)), i * 2);
|
|
814
|
+
}
|
|
815
|
+
if (totalInput % 2 === 1) {
|
|
816
|
+
this.firPendingSample = input[totalInput - 1];
|
|
817
|
+
}
|
|
818
|
+
if (totalInput >= 2) {
|
|
819
|
+
this.firHistory[0] = input[totalInput - 2];
|
|
820
|
+
this.firHistory[1] = input[totalInput - 1];
|
|
821
|
+
} else {
|
|
822
|
+
this.firHistory[0] = this.firHistory[1];
|
|
823
|
+
this.firHistory[1] = input[0];
|
|
824
|
+
}
|
|
825
|
+
return out;
|
|
826
|
+
}
|
|
827
|
+
// ---------------------------------------------------------------------------
|
|
828
|
+
// Private: 8 kHz → 16 kHz
|
|
829
|
+
// ---------------------------------------------------------------------------
|
|
830
|
+
/**
|
|
831
|
+
* 1:2 linear-interpolation upsampler.
|
|
832
|
+
*
|
|
833
|
+
* For the first chunk (no history): emits 2*(N-1) samples and defers the
|
|
834
|
+
* last sample. For subsequent chunks (with history): emits the deferred
|
|
835
|
+
* sample + its interpolated midpoint THEN 2*(N-1) samples from the new
|
|
836
|
+
* chunk, deferring the new last sample. Total across K chunks + flush =
|
|
837
|
+
* 2*total_input_samples (correct output length).
|
|
838
|
+
*
|
|
839
|
+
* Call flush() after the final chunk to emit the last deferred sample
|
|
840
|
+
* pair (self-duplicate at end of stream).
|
|
841
|
+
*/
|
|
842
|
+
_upsample8kTo16k(buf) {
|
|
843
|
+
const sampleCount = buf.length >> 1;
|
|
844
|
+
if (sampleCount === 0) return Buffer.alloc(0);
|
|
845
|
+
const outArr = [];
|
|
846
|
+
if (this.upsampleHasHistory) {
|
|
847
|
+
const next = buf.readInt16LE(0);
|
|
848
|
+
outArr.push(this.upsampleLast);
|
|
849
|
+
outArr.push(Math.round((this.upsampleLast + next) / 2));
|
|
850
|
+
}
|
|
851
|
+
for (let i = 0; i < sampleCount - 1; i++) {
|
|
852
|
+
const s0 = buf.readInt16LE(i * 2);
|
|
853
|
+
const s1 = buf.readInt16LE((i + 1) * 2);
|
|
854
|
+
outArr.push(s0);
|
|
855
|
+
outArr.push(Math.round((s0 + s1) / 2));
|
|
856
|
+
}
|
|
857
|
+
this.upsampleLast = buf.readInt16LE((sampleCount - 1) * 2);
|
|
858
|
+
this.upsampleHasHistory = true;
|
|
859
|
+
const outBuf = Buffer.alloc(outArr.length * 2);
|
|
860
|
+
for (let j = 0; j < outArr.length; j++) outBuf.writeInt16LE(outArr[j], j * 2);
|
|
861
|
+
return outBuf;
|
|
862
|
+
}
|
|
863
|
+
// ---------------------------------------------------------------------------
|
|
864
|
+
// Private: 24 kHz → 16 kHz / 8 kHz
|
|
865
|
+
// ---------------------------------------------------------------------------
|
|
866
|
+
/**
|
|
867
|
+
* 3:2 linear-interpolation decimator (ratio srcRate/dstRate = 1.5).
|
|
868
|
+
*
|
|
869
|
+
* `resample24Phase` tracks the fractional input position of the next output
|
|
870
|
+
* sample relative to the START of the next chunk. Negative phase means the
|
|
871
|
+
* next output straddles the previous/current chunk boundary; those are
|
|
872
|
+
* handled using `resample24Last`.
|
|
873
|
+
*/
|
|
874
|
+
_resample24kTo16k(buf) {
|
|
875
|
+
return this._resample24kStep(buf, 24e3 / 16e3);
|
|
876
|
+
}
|
|
877
|
+
/** 3:1 decimation — collapses the 24k→16k→8k chain into a single step. */
|
|
878
|
+
_resample24kTo8k(buf) {
|
|
879
|
+
return this._resample24kStep(buf, 24e3 / 8e3);
|
|
880
|
+
}
|
|
881
|
+
/** Shared phase-stepping resampler used by 24→16 (step 1.5) and 24→8 (step 3). */
|
|
882
|
+
_resample24kStep(buf, step) {
|
|
883
|
+
const sampleCount = buf.length >> 1;
|
|
884
|
+
if (sampleCount === 0) return Buffer.alloc(0);
|
|
885
|
+
const outArr = [];
|
|
886
|
+
let phase = this.resample24Phase;
|
|
887
|
+
while (true) {
|
|
888
|
+
const idx = Math.floor(phase);
|
|
889
|
+
if (idx >= sampleCount) break;
|
|
890
|
+
const frac = phase - idx;
|
|
891
|
+
let s0;
|
|
892
|
+
let s1;
|
|
893
|
+
if (idx < 0) {
|
|
894
|
+
s0 = this.resample24HasHistory ? this.resample24Last : 0;
|
|
895
|
+
s1 = buf.readInt16LE(0);
|
|
896
|
+
} else {
|
|
897
|
+
s0 = buf.readInt16LE(idx * 2);
|
|
898
|
+
s1 = idx + 1 < sampleCount ? buf.readInt16LE((idx + 1) * 2) : s0;
|
|
899
|
+
}
|
|
900
|
+
const interp = Math.round(s0 + (s1 - s0) * frac);
|
|
901
|
+
outArr.push(Math.max(-32768, Math.min(32767, interp)));
|
|
902
|
+
phase += step;
|
|
903
|
+
}
|
|
904
|
+
this.resample24Last = buf.readInt16LE((sampleCount - 1) * 2);
|
|
905
|
+
this.resample24HasHistory = true;
|
|
906
|
+
this.resample24Phase = phase - sampleCount;
|
|
907
|
+
const outBuf = Buffer.alloc(outArr.length * 2);
|
|
908
|
+
for (let j = 0; j < outArr.length; j++) outBuf.writeInt16LE(outArr[j], j * 2);
|
|
909
|
+
return outBuf;
|
|
910
|
+
}
|
|
911
|
+
};
|
|
912
|
+
/** Build a stateful resampler configured for 16 kHz -> 8 kHz. */
function createResampler16kTo8k() {
  const rates = { srcRate: 16e3, dstRate: 8e3 };
  return new StatefulResampler(rates);
}
|
|
915
|
+
/** Build a stateful resampler configured for 8 kHz -> 16 kHz. */
function createResampler8kTo16k() {
  const rates = { srcRate: 8e3, dstRate: 16e3 };
  return new StatefulResampler(rates);
}
|
|
918
|
+
/** Build a stateful resampler configured for 24 kHz -> 16 kHz. */
function createResampler24kTo16k() {
  const rates = { srcRate: 24e3, dstRate: 16e3 };
  return new StatefulResampler(rates);
}
|
|
921
|
+
/** Build a stateful resampler configured for 24 kHz -> 8 kHz. */
function createResampler24kTo8k() {
  const rates = { srcRate: 24e3, dstRate: 8e3 };
  return new StatefulResampler(rates);
}
|
|
924
|
+
// One-shot latches: each deprecated stateless helper below logs its
// deprecation warning only on its first call, then sets its flag to true.
var _warnedResample8kTo16k = false;
var _warnedResample16kTo8k = false;
var _warnedResample24kTo16k = false;
|
|
927
|
+
/**
 * Deprecated one-shot 8 kHz -> 16 kHz conversion of a PCM buffer.
 *
 * Logs a deprecation warning on the first call only, then runs the input
 * through a throwaway stateful resampler (process + flush).
 *
 * @param {Buffer} pcm8k input audio
 * @returns {Buffer} processed output followed by the flushed tail, if any
 */
function resample8kTo16k(pcm8k) {
  if (!_warnedResample8kTo16k) {
    _warnedResample8kTo16k = true;
    const logger = getLogger();
    logger.warn(
      "[patter] resample8kTo16k() is deprecated. Use createResampler8kTo16k() (StatefulResampler) to eliminate chunk-boundary discontinuities."
    );
  }
  if (pcm8k.length === 0) {
    return Buffer.alloc(0);
  }
  const resampler = createResampler8kTo16k();
  const body = resampler.process(pcm8k);
  const remainder = resampler.flush();
  if (remainder.length > 0) {
    return Buffer.concat([body, remainder]);
  }
  return body;
}
|
|
940
|
+
/**
 * Deprecated one-shot 16 kHz -> 8 kHz conversion of a PCM buffer.
 *
 * Logs a deprecation warning on the first call only, then runs the input
 * through a throwaway stateful resampler (process + flush).
 *
 * @param {Buffer} pcm16k input audio
 * @returns {Buffer} processed output followed by the flushed tail, if any
 */
function resample16kTo8k(pcm16k) {
  if (!_warnedResample16kTo8k) {
    _warnedResample16kTo8k = true;
    const logger = getLogger();
    logger.warn(
      "[patter] resample16kTo8k() is deprecated. Use createResampler16kTo8k() (StatefulResampler) to eliminate chunk-boundary discontinuities."
    );
  }
  if (pcm16k.length === 0) {
    return Buffer.alloc(0);
  }
  const resampler = createResampler16kTo8k();
  const body = resampler.process(pcm16k);
  const remainder = resampler.flush();
  if (remainder.length > 0) {
    return Buffer.concat([body, remainder]);
  }
  return body;
}
|
|
953
|
+
/**
 * Deprecated one-shot 24 kHz -> 16 kHz conversion of a PCM16-LE buffer.
 *
 * Plain linear interpolation at a fixed step of 1.5 input samples per
 * output sample. Logs a deprecation warning on the first call only.
 *
 * @param {Buffer} pcm24k PCM16-LE input; a trailing odd byte is ignored
 * @returns {Buffer} PCM16-LE output holding floor(2/3 * inputSamples) samples
 */
function resample24kTo16k(pcm24k) {
  if (!_warnedResample24kTo16k) {
    _warnedResample24kTo16k = true;
    const logger = getLogger();
    logger.warn(
      "[patter] resample24kTo16k() is deprecated. Use createResampler24kTo16k() (StatefulResampler) or OpenAITTS.resampleStreaming for anti-aliased resampling."
    );
  }
  if (pcm24k.length === 0) {
    return Buffer.alloc(0);
  }

  const sampleCount = Math.floor(pcm24k.length / 2);
  const outSamples = Math.floor(sampleCount * 2 / 3);
  const out = Buffer.alloc(outSamples * 2);
  const sampleAt = (index) => pcm24k.readInt16LE(index * 2);

  let cursor = 0;
  for (let i = 0; i < outSamples; i++) {
    const pos = i * 1.5;
    const idx = Math.floor(pos);
    const left = sampleAt(idx);
    // No successor at the very last input sample: reuse it.
    const right = idx + 1 < sampleCount ? sampleAt(idx + 1) : left;
    const interp = Math.round(left + (right - left) * (pos - idx));
    cursor = out.writeInt16LE(Math.max(-32768, Math.min(32767, interp)), cursor);
  }
  return out;
}
|
|
975
|
+
|
|
976
|
+
// src/providers/openai-realtime-2.ts
|
|
977
|
+
// Maps GA Realtime server event names (keys) to the v1 event names (values)
// that the adapter below rewrites them to before handing messages to the
// registered "message" handlers.
var GA_TO_V1_EVENT_NAMES = {
  "response.output_audio.delta": "response.audio.delta",
  "response.output_audio.done": "response.audio.done",
  "response.output_audio_transcript.delta": "response.audio_transcript.delta",
  "response.output_audio_transcript.done": "response.audio_transcript.done"
};
|
|
983
|
+
/**
 * Realtime adapter for the GA OpenAI Realtime endpoint.
 *
 * Talks the GA wire shape (nested `audio.{input,output}`, `output_modalities`,
 * `session.type === "realtime"`, PCM-16 at 24 kHz) and presents the v1 event
 * names and 8 kHz mulaw audio frames to the "message" handlers registered on
 * the socket.
 */
var OpenAIRealtime2Adapter = class extends OpenAIRealtimeAdapter {
  /**
   * Two-stage outbound resampler for 24 kHz -> 8 kHz, created lazily on the
   * first audio delta so the state belongs to this adapter instance.
   *
   * The chain 24k -> 16k -> 8k is used instead of the direct 24k -> 8k
   * variant of StatefulResampler: the direct path is a 3:1 decimation with
   * linear interpolation only (no anti-alias filter), so energy above 4 kHz
   * aliases into the audible band. The 16k -> 8k stage applies a 5-tap FIR
   * anti-alias filter before decimating.
   */
  outboundResampler24To16 = null;
  outboundResampler16To8 = null;
  /**
   * Last (gain-adjusted) 8 kHz input sample, carried across chunk boundaries
   * for the 3x linear upsample in `transcodeInboundMulaw8ToPcm24`, so the
   * first output of each chunk interpolates from the real preceding sample.
   */
  inbound8kCarry = null;

  /** Build the GA-shape `session` payload used in `session.update`. */
  buildGASessionConfig() {
    const opts = this.options;
    const fmt = { type: "audio/pcm", rate: 24e3 };
    const config = {
      type: "realtime",
      output_modalities: opts.modalities ?? ["audio"],
      audio: {
        input: {
          format: fmt,
          transcription: {
            model: opts.inputAudioTranscriptionModel ?? OpenAITranscriptionModel.WHISPER_1
          },
          // VAD threshold raised back to the OpenAI default (0.5) on
          // 2026-05-22. The earlier 0.1 tuning made the server VAD trigger
          // on the carrier-loopback echo of the agent's OWN outbound audio
          // in PSTN no-AEC scenarios, which produced a runaway loop where
          // the first message was repeatedly cut and re-generated.
          turn_detection: {
            type: opts.vadType ?? OpenAIRealtimeVADType.SERVER_VAD,
            threshold: 0.5,
            prefix_padding_ms: 300,
            silence_duration_ms: opts.silenceDurationMs ?? 500,
            // Defer ``response.create`` to the application: a committed
            // segment can turn out to be a Whisper hallucination on
            // silence/echo, and auto-creating a response would generate a
            // phantom turn. Patter triggers ``response.create`` explicitly
            // AFTER validating the transcript against the hallucination
            // filter. ``interrupt_response: false`` makes server VAD leave
            // in-flight responses alone; barge-in is gated client-side.
            create_response: false,
            interrupt_response: false
          }
        },
        output: {
          format: fmt,
          voice: this.voice
        }
      },
      instructions: this.instructions || "You are a helpful voice assistant. Be concise."
    };
    if (opts.temperature !== void 0) config.temperature = opts.temperature;
    if (opts.maxResponseOutputTokens !== void 0) {
      config.max_output_tokens = opts.maxResponseOutputTokens;
    }
    if (opts.toolChoice !== void 0) config.tool_choice = opts.toolChoice;
    if (opts.reasoningEffort !== void 0) {
      config.reasoning = { effort: opts.reasoningEffort };
    }
    if (this.tools?.length) {
      config.tools = this.tools.map((t) => {
        const def = {
          type: "function",
          name: t.name,
          description: t.description,
          parameters: t.parameters
        };
        if (t.strict === true) def.strict = true;
        return def;
      });
    }
    return config;
  }

  /**
   * Translate one raw server frame from GA naming to v1 naming.
   *
   * @param {Buffer|string} raw frame as delivered by the socket
   * @returns {Buffer[]|null} `null` when the frame needs no translation
   *   (dispatch `raw` unchanged); otherwise the frames to dispatch instead.
   *   The array is empty when an audio delta produced no output bytes yet.
   * @throws when `raw` is not valid JSON (callers treat that as "no
   *   translation")
   */
  _translateGAMessage(raw) {
    const text = typeof raw === "string" ? raw : raw.toString();
    const parsed = JSON.parse(text);
    const gaType = parsed.type;
    // Own-property check: `in` would also match prototype keys such as
    // "constructor" or "toString".
    if (!gaType || !Object.prototype.hasOwnProperty.call(GA_TO_V1_EVENT_NAMES, gaType)) {
      return null;
    }
    const v1Type = GA_TO_V1_EVENT_NAMES[gaType];
    if (gaType === "response.output_audio.delta" && typeof parsed.delta === "string") {
      // PCM-16 24 kHz -> mulaw 8 kHz, re-framed into chunks of at most
      // 160 bytes (20 ms at 8 kHz, one byte per sample).
      const FRAME_BYTES = 160;
      const mulaw = this.transcodeOutboundPcm24ToMulaw8Buffer(parsed.delta);
      const frames = [];
      for (let off = 0; off < mulaw.length; off += FRAME_BYTES) {
        const slice = mulaw.subarray(off, Math.min(off + FRAME_BYTES, mulaw.length));
        const frame = { ...parsed, type: v1Type, delta: slice.toString("base64") };
        frames.push(Buffer.from(JSON.stringify(frame)));
      }
      return frames;
    }
    parsed.type = v1Type;
    return [Buffer.from(JSON.stringify(parsed))];
  }

  /**
   * Patch `ws.on` so every "message" handler registered afterwards receives
   * v1-named events (and mulaw audio) instead of GA frames. Shared by
   * `connect` and `adoptWebSocket` so both paths behave identically.
   *
   * @param {object} ws socket to patch
   * @returns {Function} the original, unpatched `on` bound to `ws`; use it
   *   for listeners that must see raw frames and be removable via `ws.off`
   */
  _installGAEventShim(ws) {
    const originalOn = ws.on.bind(ws);
    ws.on = (event, handler) => {
      if (event !== "message") return originalOn(event, handler);
      const wrapped = (raw, ...rest) => {
        let frames = null;
        try {
          frames = this._translateGAMessage(raw);
        } catch {
          // Not JSON / not translatable: deliver the frame untouched.
          frames = null;
        }
        // Dispatch happens outside the try so an exception thrown by the
        // handler is not swallowed and the handler is not re-run with the
        // raw frame.
        if (frames === null) {
          handler(raw, ...rest);
          return;
        }
        for (const frame of frames) handler(frame, ...rest);
      };
      return originalOn(event, wrapped);
    };
    return originalOn;
  }

  /**
   * Run the `session.created` -> `session.update` -> `session.updated`
   * exchange on `ws`.
   *
   * @param {object} ws socket the handshake runs on
   * @param {Function} listen unpatched `on` for `ws`; listeners are added
   *   through it so `ws.off` can remove the exact same function references
   * @param {object} settings
   * @param {number} settings.timeoutMs handshake deadline in milliseconds
   * @param {string} settings.errorPrefix prefix of the rejection message for
   *   a server `error` event
   * @param {string} settings.timeoutMessage rejection message on timeout
   * @param {boolean} settings.warnOnParseFailure log unparseable frames
   * @returns {Promise<void>} resolves on `session.updated`; on any failure
   *   the socket is closed and the promise rejects
   */
  _runGAHandshake(ws, listen, { timeoutMs, errorPrefix, timeoutMessage, warnOnParseFailure }) {
    return new Promise((resolve, reject) => {
      let sessionCreated = false;
      let settled = false;
      let timer = null;
      // Returns true only for the call that actually settles the handshake.
      const cleanup = () => {
        if (settled) return false;
        settled = true;
        clearTimeout(timer);
        ws.off("message", onMessage);
        ws.off("error", onError);
        return true;
      };
      const fail = (err) => {
        if (!cleanup()) return;
        // NOTE(review): closing a socket that is still connecting can
        // surface as an "error" event; keep a listener attached so it cannot
        // become an unhandled "error" - confirm against the ws docs.
        listen("error", () => {});
        try {
          ws.close();
        } catch {
          // Best effort: the socket is being discarded either way.
        }
        reject(err);
      };
      const onMessage = (raw) => {
        if (settled) return;
        let msg;
        try {
          msg = JSON.parse(raw.toString());
        } catch (e) {
          if (warnOnParseFailure) {
            getLogger().warn(`OpenAI Realtime 2: failed to parse message: ${String(e)}`);
          }
          return;
        }
        const type = msg?.type;
        if (type === "session.created" && !sessionCreated) {
          sessionCreated = true;
          try {
            ws.send(JSON.stringify({ type: "session.update", session: this.buildGASessionConfig() }));
          } catch (err) {
            fail(err instanceof Error ? err : new Error(String(err)));
          }
        } else if (type === "session.updated") {
          if (cleanup()) resolve();
        } else if (type === "error") {
          fail(new Error(`${errorPrefix}: ${msg.error?.message ?? JSON.stringify(msg)}`));
        }
      };
      const onError = (err) => fail(err);
      timer = setTimeout(() => fail(new Error(timeoutMessage)), timeoutMs);
      listen("message", onMessage);
      listen("error", onError);
    });
  }

  /**
   * Open the Realtime WebSocket against the GA endpoint and apply the GA
   * session configuration. Header `OpenAI-Beta: realtime=v1` is OMITTED
   * (the GA endpoint rejects it).
   *
   * The setup listeners are registered through the UNPATCHED `on`. When they
   * went through the shim, the registered function was a wrapper, so
   * `ws.off("message", onSetupMessage)` removed nothing and the setup
   * listener stayed attached; a later server `error` event then closed the
   * live socket.
   *
   * @returns {Promise<void>} resolves once the session is configured
   * @throws on socket error, server `error` event, or after 15 s
   */
  async connect() {
    const url = `wss://api.openai.com/v1/realtime?model=${encodeURIComponent(this.model)}`;
    const ws = new WebSocket2(url, {
      headers: { Authorization: `Bearer ${this.apiKey}` }
    });
    this.ws = ws;
    const rawOn = this._installGAEventShim(ws);
    await this._runGAHandshake(ws, rawOn, {
      timeoutMs: 15e3,
      errorPrefix: "OpenAI Realtime 2 setup error",
      timeoutMessage: "OpenAI Realtime 2 connect timeout",
      warnOnParseFailure: true
    });
    this.armHeartbeatAndListener();
  }

  /**
   * GA-API variant of {@link OpenAIRealtimeAdapter.openParkedConnection}.
   * Opens a fresh Realtime WS against the GA endpoint, completes the
   * `session.created` -> `session.update` -> `session.updated` exchange and
   * returns the OPEN socket WITHOUT taking it on `this.ws` or arming the
   * heartbeat / message listener.
   *
   * Bounded by 8 s. Throws on timeout / handshake failure / GA-side
   * rejection; in those cases the socket is now closed before throwing
   * (the caller never receives it, so nothing else could close it).
   *
   * While parked, a `session.update` is re-sent every 3 s as a keepalive;
   * the interval handle is stored on the socket as `_parkedKeepalive` and is
   * cleared by {@link adoptWebSocket}.
   *
   * @returns {Promise<object>} the primed socket
   */
  async openParkedConnection() {
    const url = `wss://api.openai.com/v1/realtime?model=${encodeURIComponent(this.model)}`;
    const ws = new WebSocket2(url, {
      headers: { Authorization: `Bearer ${this.apiKey}` }
    });
    await this._runGAHandshake(ws, ws.on.bind(ws), {
      timeoutMs: 8e3,
      errorPrefix: "OpenAI Realtime 2 parked-setup error",
      timeoutMessage: "OpenAI Realtime 2 park connect timeout",
      warnOnParseFailure: false
    });
    const keepalive = setInterval(() => {
      if (ws.readyState !== ws.OPEN) {
        clearInterval(keepalive);
        return;
      }
      try {
        ws.send(JSON.stringify({ type: "session.update", session: this.buildGASessionConfig() }));
      } catch {
        // The socket can no longer send; stop the keepalive.
        clearInterval(keepalive);
      }
    }, 3e3);
    ws._parkedKeepalive = keepalive;
    return ws;
  }

  /**
   * GA-API variant of {@link OpenAIRealtimeAdapter.adoptWebSocket}. Takes
   * over a socket produced by {@link openParkedConnection}, stops its parked
   * keepalive, installs the same GA -> v1 shim as {@link connect} (event
   * renaming AND PCM24 -> mulaw transcoding of audio deltas) and arms the
   * heartbeat + message listener.
   *
   * Caller MUST verify `ws.readyState === OPEN` before calling. If the
   * parked WS died between park and adopt, fall back to {@link connect}.
   *
   * @param {object} ws primed socket to adopt
   */
  adoptWebSocket(ws) {
    if (ws._parkedKeepalive) {
      clearInterval(ws._parkedKeepalive);
      delete ws._parkedKeepalive;
    }
    this.ws = ws;
    this._installGAEventShim(ws);
    this.armHeartbeatAndListener();
  }

  /**
   * Override of the parent `sendAudio`: transcodes inbound carrier audio
   * (mulaw 8 kHz) into PCM-16 24 kHz before sending
   * `input_audio_buffer.append`. Does nothing when the socket is not open.
   *
   * @param {Buffer} mulawAudio mulaw 8 kHz audio
   */
  sendAudio(mulawAudio) {
    if (!this.ws || this.ws.readyState !== this.ws.OPEN) return;
    const pcm24k = this.transcodeInboundMulaw8ToPcm24(mulawAudio);
    this.ws.send(JSON.stringify({
      type: "input_audio_buffer.append",
      audio: pcm24k.toString("base64")
    }));
  }

  /**
   * mulaw 8 kHz Buffer -> PCM-16-LE 24 kHz Buffer.
   *
   * Direct 3x linear-interpolation upsample with a one-sample carry across
   * chunk boundaries. Every consecutive pair `(s_a, s_b)` of 8 kHz samples
   * yields three 24 kHz samples:
   *
   *   out_0 = s_a
   *   out_1 = 2/3 * s_a + 1/3 * s_b
   *   out_2 = 1/3 * s_a + 2/3 * s_b
   *
   * Samples are multiplied by 2 and clamped to the int16 range first. The
   * last sample of each chunk is kept as the carry so the next chunk starts
   * with the pair `(carry, firstNewSample)`. The very first chunk has no
   * carry and therefore yields 3 fewer output samples.
   *
   * @param {Buffer} mulaw mulaw 8 kHz audio
   * @returns {Buffer} PCM-16-LE 24 kHz audio (may be empty)
   */
  transcodeInboundMulaw8ToPcm24(mulaw) {
    const pcm8 = mulawToPcm16(mulaw);
    const samples8 = pcm8.length / 2;
    if (samples8 === 0) return Buffer.alloc(0);
    const GAIN = 2;
    const inputs = [];
    if (this.inbound8kCarry !== null) inputs.push(this.inbound8kCarry);
    for (let i = 0; i < samples8; i++) {
      const raw = pcm8.readInt16LE(i * 2) * GAIN;
      inputs.push(Math.max(-32768, Math.min(32767, raw)));
    }
    this.inbound8kCarry = inputs[inputs.length - 1];
    const numPairs = inputs.length - 1;
    if (numPairs <= 0) return Buffer.alloc(0);
    // Every byte is written below, so the uninitialised allocation is safe.
    const out = Buffer.allocUnsafe(numPairs * 3 * 2);
    for (let i = 0; i < numPairs; i++) {
      const s0 = inputs[i];
      const s1 = inputs[i + 1];
      out.writeInt16LE(s0, i * 6);
      out.writeInt16LE(Math.round((s0 * 2 + s1) / 3), i * 6 + 2);
      out.writeInt16LE(Math.round((s0 + s1 * 2) / 3), i * 6 + 4);
    }
    return out;
  }

  /**
   * Base64 PCM-16-LE 24 kHz -> mulaw 8 kHz Buffer. Used by the WS
   * translation shim on each `response.output_audio.delta`. The two
   * stateful resamplers are created lazily and reused across all deltas so
   * their state carries across chunk boundaries.
   *
   * @param {string} deltaB64 base64-encoded PCM-16-LE 24 kHz audio
   * @returns {Buffer} mulaw 8 kHz audio; empty when the resamplers have not
   *   produced any output for this delta
   */
  transcodeOutboundPcm24ToMulaw8Buffer(deltaB64) {
    if (!this.outboundResampler24To16) {
      this.outboundResampler24To16 = new StatefulResampler({ srcRate: 24e3, dstRate: 16e3 });
      this.outboundResampler16To8 = new StatefulResampler({ srcRate: 16e3, dstRate: 8e3 });
    }
    const pcm24 = Buffer.from(deltaB64, "base64");
    const pcm16 = this.outboundResampler24To16.process(pcm24);
    const pcm8 = this.outboundResampler16To8.process(pcm16);
    if (pcm8.length === 0) return Buffer.alloc(0);
    return pcm16ToMulaw(pcm8);
  }

  /**
   * GA-API variant of {@link OpenAIRealtimeAdapter.sendFirstMessage}. Two
   * differences from the v1 path:
   *
   * 1. The v1 implementation sends `response.modalities`, which the GA
   *    endpoint rejects with `Unknown parameter: 'response.modalities'`.
   *    `output_modalities` matches the GA `session.update` shape.
   *
   * 2. The GA `response.create` does NOT inherit `audio.output.voice` from
   *    the session, so the configured voice is re-injected here to keep the
   *    first-message voice consistent with the rest of the call.
   *
   * @param {string} text sentence the model is instructed to say verbatim
   */
  async sendFirstMessage(text) {
    const responseBody = {
      output_modalities: ["audio"],
      audio: { output: { voice: this.voice } },
      instructions: `Say exactly the following sentence as your first turn and nothing else: "${text}"`
    };
    if (this.options.reasoningEffort !== void 0) {
      responseBody.reasoning = { effort: this.options.reasoningEffort };
    }
    this.ws?.send(JSON.stringify({ type: "response.create", response: responseBody }));
  }
};
|
|
1409
|
+
|
|
1410
|
+
export {
|
|
1411
|
+
OpenAIRealtimeAudioFormat,
|
|
1412
|
+
OpenAIRealtimeModel,
|
|
1413
|
+
OpenAIVoice,
|
|
1414
|
+
OpenAITranscriptionModel,
|
|
1415
|
+
OpenAIRealtimeVADType,
|
|
1416
|
+
OpenAIRealtimeAdapter,
|
|
1417
|
+
mulawToPcm16,
|
|
1418
|
+
pcm16ToMulaw,
|
|
1419
|
+
PcmCarry,
|
|
1420
|
+
StatefulResampler,
|
|
1421
|
+
createResampler16kTo8k,
|
|
1422
|
+
createResampler8kTo16k,
|
|
1423
|
+
createResampler24kTo16k,
|
|
1424
|
+
createResampler24kTo8k,
|
|
1425
|
+
resample8kTo16k,
|
|
1426
|
+
resample16kTo8k,
|
|
1427
|
+
resample24kTo16k,
|
|
1428
|
+
OpenAIRealtime2Adapter
|
|
1429
|
+
};
|