getpatter 0.6.1 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1429 @@
1
+ import {
2
+ getLogger
3
+ } from "./chunk-MVOQFAEO.mjs";
4
+ import {
5
+ init_esm_shims
6
+ } from "./chunk-N565J3CF.mjs";
7
+
8
+ // src/providers/openai-realtime-2.ts
9
+ init_esm_shims();
10
+ import WebSocket2 from "ws";
11
+
12
+ // src/providers/openai-realtime.ts
13
+ init_esm_shims();
14
+ import WebSocket from "ws";
15
/**
 * Wire identifiers for the audio encodings this module passes to the
 * Realtime session (`input_audio_format` / `output_audio_format`).
 */
var OpenAIRealtimeAudioFormat = {
  // G.711 companded formats.
  G711_ULAW: "g711_ulaw",
  G711_ALAW: "g711_alaw",
  // Linear 16-bit PCM.
  PCM16: "pcm16",
};
20
/**
 * Model identifiers accepted by the adapter; the chosen value is placed in
 * the `model` query parameter of the Realtime WebSocket URL.
 */
var OpenAIRealtimeModel = {
  GPT_REALTIME: "gpt-realtime",
  GPT_REALTIME_2: "gpt-realtime-2",
  GPT_REALTIME_MINI: "gpt-realtime-mini",
  // Preview-named models.
  GPT_4O_REALTIME_PREVIEW: "gpt-4o-realtime-preview",
  GPT_4O_MINI_REALTIME_PREVIEW: "gpt-4o-mini-realtime-preview",
};
27
/** Voice names that can be sent as the session `voice` field. */
var OpenAIVoice = {
  ALLOY: "alloy",
  ASH: "ash",
  BALLAD: "ballad",
  CORAL: "coral",
  ECHO: "echo",
  FABLE: "fable",
  NOVA: "nova",
  ONYX: "onyx",
  SAGE: "sage",
  SHIMMER: "shimmer",
  VERSE: "verse",
};
40
/**
 * Model identifiers usable for `input_audio_transcription.model` in the
 * session configuration.
 */
var OpenAITranscriptionModel = {
  WHISPER_1: "whisper-1",
  GPT_4O_TRANSCRIBE: "gpt-4o-transcribe",
  GPT_4O_MINI_TRANSCRIBE: "gpt-4o-mini-transcribe",
  GPT_REALTIME_WHISPER: "gpt-realtime-whisper",
};
46
/** Values accepted for `turn_detection.type` in the session configuration. */
var OpenAIRealtimeVADType = {
  SERVER_VAD: "server_vad",
  SEMANTIC_VAD: "semantic_vad",
};
50
+ var OpenAIRealtimeAdapter = class {
51
+ constructor(apiKey, model = OpenAIRealtimeModel.GPT_REALTIME_MINI, voice = OpenAIVoice.ALLOY, instructions = "", tools, audioFormat = OpenAIRealtimeAudioFormat.G711_ULAW, options = {}) {
52
+ this.apiKey = apiKey;
53
+ this.model = model;
54
+ this.voice = voice;
55
+ this.instructions = instructions;
56
+ this.tools = tools;
57
+ this.audioFormat = audioFormat;
58
+ this.options = options;
59
+ }
60
+ apiKey;
61
+ model;
62
+ voice;
63
+ instructions;
64
+ tools;
65
+ audioFormat;
66
+ // Fields exposed `protected` (not `private`) so a subclass can implement
67
+ // alternate transports — e.g. `OpenAIRealtime2Adapter` overrides
68
+ // `connect()` to speak the GA Realtime API while reusing the rest of
69
+ // the runtime (audio dispatch, barge-in, heartbeat).
70
+ ws = null;
71
+ eventCallbacks = /* @__PURE__ */ new Set();
72
+ messageListenerAttached = false;
73
+ heartbeat = null;
74
+ // Track the in-flight assistant item id so we can truncate cleanly on
75
+ // barge-in (see ``cancelResponse``) — matches the Python adapter.
76
+ currentResponseItemId = null;
77
+ currentResponseAudioMs = 0;
78
+ // Wall-clock timestamp (Date.now()) of the first ``response.audio.delta``
79
+ // received since the current response item started. ``cancelResponse``
80
+ // uses this to bound ``audio_end_ms`` to what the caller could plausibly
81
+ // have heard — generated audio frequently arrives 5-10x real-time, so
82
+ // ``audio_end_ms`` driven purely by the per-chunk byte counter overshoots
83
+ // reality and leaves phantom assistant text on the conversation. The
84
+ // wall-clock cap corresponds to the maximum playback that real-time TTS
85
+ // could have produced, which is what the user actually heard.
86
+ currentResponseFirstAudioAt = null;
87
+ options;
88
+ /**
89
+ * Build the production session.update body. Mirrors the body sent
90
+ * inside `connect()` so warmup can apply identical configuration to
91
+ * the upstream session and prime it without billing.
92
+ */
93
+ buildSessionConfig() {
94
+ const config = {
95
+ input_audio_format: this.audioFormat,
96
+ output_audio_format: this.audioFormat,
97
+ voice: this.voice,
98
+ instructions: this.instructions || "You are a helpful voice assistant. Be concise.",
99
+ turn_detection: {
100
+ type: this.options.vadType ?? OpenAIRealtimeVADType.SERVER_VAD,
101
+ threshold: 0.5,
102
+ prefix_padding_ms: 300,
103
+ silence_duration_ms: this.options.silenceDurationMs ?? 300
104
+ },
105
+ input_audio_transcription: {
106
+ model: this.options.inputAudioTranscriptionModel ?? OpenAITranscriptionModel.WHISPER_1
107
+ }
108
+ };
109
+ if (this.options.temperature !== void 0) config.temperature = this.options.temperature;
110
+ if (this.options.maxResponseOutputTokens !== void 0) {
111
+ config.max_response_output_tokens = this.options.maxResponseOutputTokens;
112
+ }
113
+ if (this.options.modalities !== void 0) config.modalities = this.options.modalities;
114
+ if (this.options.toolChoice !== void 0) config.tool_choice = this.options.toolChoice;
115
+ if (this.options.reasoningEffort !== void 0) {
116
+ config.reasoning = { effort: this.options.reasoningEffort };
117
+ }
118
+ if (this.tools?.length) {
119
+ config.tools = this.tools.map((t) => {
120
+ const def = {
121
+ type: "function",
122
+ name: t.name,
123
+ description: t.description,
124
+ parameters: t.parameters
125
+ };
126
+ if (t.strict === true) {
127
+ def.strict = true;
128
+ }
129
+ return def;
130
+ });
131
+ }
132
+ return config;
133
+ }
134
+ /**
135
+ * Pre-call WebSocket warmup for the OpenAI Realtime endpoint.
136
+ *
137
+ * The canonical session-only warm step on the Realtime API: open the
138
+ * WS, wait for `session.created`, send a single `session.update`
139
+ * containing the same fields that the production `connect()` path
140
+ * applies (`input_audio_format`, `output_audio_format`, `voice`,
141
+ * `instructions`, `turn_detection`, `input_audio_transcription`,
142
+ * plus any opt-in fields populated on the adapter), wait for the
143
+ * matching `session.updated` ack, then close cleanly. This primes
144
+ * the per-session state on the OpenAI side — DNS + TLS + auth
145
+ * handshake + initial config exchange — without ever invoking the
146
+ * model.
147
+ *
148
+ * Earlier revisions sent `response.create` with
149
+ * `{"response": {"generate": false}}` to prime the inference path.
150
+ * That field is NOT in the OpenAI Realtime API schema; the server
151
+ * either ignores it (and bills tokens for a real model response) or
152
+ * rejects the request with `invalid_request_error`. Both behaviours
153
+ * are billing-unsafe or a no-op beyond TLS warm. The
154
+ * `session.update` flow is documented and side-effect-free.
155
+ *
156
+ * Billing safety: `session.update` only mutates session
157
+ * configuration. It does NOT invoke the model, does NOT consume any
158
+ * audio buffer, and does NOT trigger token generation, so no
159
+ * per-token cost is accrued. Best-effort: failures are logged at
160
+ * debug level and never raised.
161
+ */
162
+ async warmup() {
163
+ const url = `wss://api.openai.com/v1/realtime?model=${encodeURIComponent(this.model)}`;
164
+ let ws = null;
165
+ try {
166
+ ws = await new Promise((resolve, reject) => {
167
+ const sock = new WebSocket(url, {
168
+ headers: {
169
+ Authorization: `Bearer ${this.apiKey}`
170
+ }
171
+ });
172
+ const timer = setTimeout(() => {
173
+ try {
174
+ sock.close();
175
+ } catch {
176
+ }
177
+ reject(new Error("OpenAI Realtime warmup connect timeout"));
178
+ }, 5e3);
179
+ sock.once("open", () => {
180
+ clearTimeout(timer);
181
+ resolve(sock);
182
+ });
183
+ sock.once("error", (err) => {
184
+ clearTimeout(timer);
185
+ reject(err);
186
+ });
187
+ });
188
+ const sessionCreated = await new Promise((resolve) => {
189
+ const timer = setTimeout(() => resolve(false), 2e3);
190
+ const onMsg = (raw) => {
191
+ try {
192
+ const data = JSON.parse(raw.toString());
193
+ if (data.type === "session.created") {
194
+ clearTimeout(timer);
195
+ ws.off("message", onMsg);
196
+ resolve(true);
197
+ }
198
+ } catch {
199
+ }
200
+ };
201
+ ws.on("message", onMsg);
202
+ });
203
+ if (!sessionCreated) return;
204
+ try {
205
+ ws.send(JSON.stringify({ type: "session.update", session: this.buildSessionConfig() }));
206
+ } catch {
207
+ return;
208
+ }
209
+ await new Promise((resolve) => {
210
+ const timer = setTimeout(() => resolve(), 1500);
211
+ const onMsg = (raw) => {
212
+ try {
213
+ const data = JSON.parse(raw.toString());
214
+ if (data.type === "session.updated") {
215
+ clearTimeout(timer);
216
+ ws.off("message", onMsg);
217
+ resolve();
218
+ }
219
+ } catch {
220
+ }
221
+ };
222
+ ws.on("message", onMsg);
223
+ });
224
+ } catch (err) {
225
+ getLogger().debug(`OpenAI Realtime warmup failed (best-effort): ${String(err)}`);
226
+ } finally {
227
+ if (ws) {
228
+ try {
229
+ ws.close();
230
+ } catch {
231
+ }
232
+ }
233
+ }
234
+ }
235
+ /** Open the Realtime WebSocket and apply the session configuration. */
236
+ async connect() {
237
+ const url = `wss://api.openai.com/v1/realtime?model=${encodeURIComponent(this.model)}`;
238
+ this.ws = new WebSocket(url, {
239
+ headers: {
240
+ Authorization: `Bearer ${this.apiKey}`
241
+ }
242
+ });
243
+ await new Promise((resolve, reject) => {
244
+ let sessionCreated = false;
245
+ let settled = false;
246
+ const ws = this.ws;
247
+ const onSetupMessage = (raw) => {
248
+ let msg;
249
+ try {
250
+ msg = JSON.parse(raw.toString());
251
+ } catch (e) {
252
+ getLogger().warn(`OpenAI Realtime: failed to parse message: ${String(e)}`);
253
+ return;
254
+ }
255
+ if (msg.type === "session.created" && !sessionCreated) {
256
+ sessionCreated = true;
257
+ ws.send(JSON.stringify({ type: "session.update", session: this.buildSessionConfig() }));
258
+ } else if (msg.type === "session.updated") {
259
+ cleanup();
260
+ resolve();
261
+ }
262
+ };
263
+ const onSetupError = (err) => {
264
+ cleanup();
265
+ try {
266
+ ws.close();
267
+ } catch {
268
+ }
269
+ reject(err);
270
+ };
271
+ const cleanup = () => {
272
+ if (settled) return;
273
+ settled = true;
274
+ clearTimeout(timer);
275
+ ws.off("message", onSetupMessage);
276
+ ws.off("error", onSetupError);
277
+ };
278
+ const timer = setTimeout(() => {
279
+ cleanup();
280
+ try {
281
+ ws.close();
282
+ } catch {
283
+ }
284
+ reject(new Error("OpenAI Realtime connect timeout"));
285
+ }, 15e3);
286
+ ws.on("message", onSetupMessage);
287
+ ws.on("error", onSetupError);
288
+ });
289
+ this.armHeartbeatAndListener();
290
+ }
291
+ /**
292
+ * Adopt a pre-opened, already-`session.updated` Realtime WebSocket
293
+ * produced by the prewarm pipeline (see `Patter.parkProviderConnections`).
294
+ * Skips the fresh `new WebSocket()` + `session.created` /
295
+ * `session.update` round-trip — saves ~250-450 ms on first turn.
296
+ *
297
+ * Caller MUST verify `ws.readyState === OPEN` before calling and MUST
298
+ * have already received `session.updated` on the parked socket. If
299
+ * the parked WS died between park and adopt, fall back to `connect()`.
300
+ */
301
+ adoptWebSocket(ws) {
302
+ this.ws = ws;
303
+ this.armHeartbeatAndListener();
304
+ }
305
+ armHeartbeatAndListener() {
306
+ this.heartbeat = setInterval(() => {
307
+ try {
308
+ this.ws?.ping();
309
+ } catch {
310
+ }
311
+ }, 2e4);
312
+ this.ensureMessageListener();
313
+ }
314
+ /**
315
+ * Open a fresh Realtime WS, exchange `session.created` /
316
+ * `session.update` / `session.updated` (so the upstream session is
317
+ * fully primed), and return the OPEN socket WITHOUT arming the
318
+ * heartbeat / message listener. Used by the prewarm pipeline to park
319
+ * a Realtime connection during ringing; the live consumer adopts it
320
+ * via {@link adoptWebSocket}.
321
+ *
322
+ * Bounded by 8 s. Throws on timeout / handshake failure — callers
323
+ * (the prewarm pipeline) treat any error as a cache miss and the
324
+ * call falls through to the cold `connect()` path.
325
+ *
326
+ * Billing safety: `session.update` does not invoke the model. No
327
+ * tokens are billed.
328
+ */
329
+ async openParkedConnection() {
330
+ const url = `wss://api.openai.com/v1/realtime?model=${encodeURIComponent(this.model)}`;
331
+ const ws = new WebSocket(url, {
332
+ headers: {
333
+ Authorization: `Bearer ${this.apiKey}`
334
+ }
335
+ });
336
+ await new Promise((resolve, reject) => {
337
+ let sessionCreated = false;
338
+ let settled = false;
339
+ const onMessage = (raw) => {
340
+ let msg;
341
+ try {
342
+ msg = JSON.parse(raw.toString());
343
+ } catch {
344
+ return;
345
+ }
346
+ if (msg.type === "session.created" && !sessionCreated) {
347
+ sessionCreated = true;
348
+ try {
349
+ ws.send(JSON.stringify({ type: "session.update", session: this.buildSessionConfig() }));
350
+ } catch (err) {
351
+ cleanup();
352
+ reject(err instanceof Error ? err : new Error(String(err)));
353
+ }
354
+ } else if (msg.type === "session.updated") {
355
+ cleanup();
356
+ resolve();
357
+ }
358
+ };
359
+ const onError = (err) => {
360
+ cleanup();
361
+ reject(err);
362
+ };
363
+ const cleanup = () => {
364
+ if (settled) return;
365
+ settled = true;
366
+ clearTimeout(timer);
367
+ ws.off("message", onMessage);
368
+ ws.off("error", onError);
369
+ };
370
+ const timer = setTimeout(() => {
371
+ cleanup();
372
+ reject(new Error("OpenAI Realtime park connect timeout"));
373
+ }, 8e3);
374
+ ws.on("message", onMessage);
375
+ ws.on("error", onError);
376
+ });
377
+ return ws;
378
+ }
379
+ /** Append a base64-encoded audio chunk to the realtime input buffer. */
380
+ sendAudio(mulawAudio) {
381
+ if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;
382
+ this.ws.send(JSON.stringify({ type: "input_audio_buffer.append", audio: mulawAudio.toString("base64") }));
383
+ }
384
+ /**
385
+ * Register a listener for parsed realtime events.
386
+ *
387
+ * Previously every call attached a new ``ws.on('message')`` handler,
388
+ * which leaked listeners across retries and multi-consumer hooks. We now
389
+ * route all traffic through a single persistent handler that fans out to
390
+ * a Set of callbacks. Use {@link offEvent} to remove one.
391
+ */
392
+ onEvent(callback) {
393
+ this.eventCallbacks.add(callback);
394
+ this.ensureMessageListener();
395
+ }
396
+ /** Remove a previously registered {@link onEvent} callback. */
397
+ offEvent(callback) {
398
+ this.eventCallbacks.delete(callback);
399
+ }
400
+ ensureMessageListener() {
401
+ if (this.messageListenerAttached || !this.ws) return;
402
+ this.messageListenerAttached = true;
403
+ const ws = this.ws;
404
+ const dispatch = (type, payload) => {
405
+ for (const cb of this.eventCallbacks) {
406
+ void Promise.resolve(cb(type, payload)).catch(
407
+ (err) => getLogger().error("onEvent callback error:", err)
408
+ );
409
+ }
410
+ };
411
+ ws.on("message", (raw) => {
412
+ let data;
413
+ try {
414
+ data = JSON.parse(raw.toString());
415
+ } catch (e) {
416
+ getLogger().warn(`OpenAI Realtime: failed to parse event message: ${String(e)}`);
417
+ return;
418
+ }
419
+ const t = data.type;
420
+ if (t === "response.audio.delta") {
421
+ const buf = Buffer.from(data.delta ?? "", "base64");
422
+ this.currentResponseAudioMs += estimateAudioMs(buf, this.audioFormat);
423
+ if (this.currentResponseFirstAudioAt === null) {
424
+ this.currentResponseFirstAudioAt = Date.now();
425
+ }
426
+ dispatch("audio", buf);
427
+ } else if (t === "response.audio_transcript.delta") {
428
+ dispatch("transcript_output", data.delta);
429
+ } else if (t === "response.content_part.added" || t === "response.output_item.added") {
430
+ const itemId = data.item?.id ?? data.item_id ?? null;
431
+ if (itemId) {
432
+ this.currentResponseItemId = itemId;
433
+ this.currentResponseAudioMs = 0;
434
+ this.currentResponseFirstAudioAt = null;
435
+ }
436
+ } else if (t === "input_audio_buffer.speech_started") {
437
+ dispatch("speech_started", null);
438
+ } else if (t === "input_audio_buffer.speech_stopped") {
439
+ dispatch("speech_stopped", null);
440
+ } else if (t === "conversation.item.input_audio_transcription.completed") {
441
+ dispatch("transcript_input", data.transcript);
442
+ } else if (t === "response.function_call_arguments.done") {
443
+ dispatch("function_call", { call_id: data.call_id, name: data.name, arguments: data.arguments });
444
+ } else if (t === "response.done") {
445
+ this.currentResponseItemId = null;
446
+ this.currentResponseAudioMs = 0;
447
+ this.currentResponseFirstAudioAt = null;
448
+ dispatch("response_done", data.response ?? null);
449
+ } else if (t === "error") {
450
+ dispatch("error", data.error);
451
+ }
452
+ });
453
+ ws.on("close", (code, reason) => {
454
+ if (code !== 1e3) {
455
+ dispatch("error", {
456
+ type: "connection_closed",
457
+ code,
458
+ reason: reason?.toString() ?? ""
459
+ });
460
+ }
461
+ });
462
+ ws.on("error", (err) => {
463
+ dispatch("error", { type: "socket_error", message: err?.message ?? String(err) });
464
+ });
465
+ }
466
+ /** Truncate the in-flight assistant turn and cancel the active response.
467
+ *
468
+ * ``audio_end_ms`` MUST reflect what the caller actually heard, not what
469
+ * the server generated. OpenAI streams audio at 5-10x real-time, so the
470
+ * byte-derived counter overstates playback whenever the consumer cleared
471
+ * its playout buffer (e.g. ``send_clear``) before the audio reached the
472
+ * speaker. We bound the truncate point by wall-clock time since the first
473
+ * chunk of this response — that's the physical maximum a 1x real-time
474
+ * playback could have produced. Without this cap, OpenAI keeps the full
475
+ * generated assistant text on the transcript, and the model replays /
476
+ * resumes from it on the next turn — manifesting as re-greetings and
477
+ * mid-sentence fragments after a barge-in storm.
478
+ */
479
+ cancelResponse() {
480
+ if (!this.ws) return;
481
+ if (!this.currentResponseItemId) {
482
+ return;
483
+ }
484
+ let audioEndMs = this.currentResponseAudioMs;
485
+ if (this.currentResponseFirstAudioAt !== null) {
486
+ const elapsedMs = Date.now() - this.currentResponseFirstAudioAt;
487
+ audioEndMs = Math.min(audioEndMs, Math.max(elapsedMs, 0));
488
+ }
489
+ try {
490
+ this.ws.send(JSON.stringify({
491
+ type: "conversation.item.truncate",
492
+ item_id: this.currentResponseItemId,
493
+ content_index: 0,
494
+ audio_end_ms: audioEndMs
495
+ }));
496
+ } catch (err) {
497
+ getLogger().debug?.(`conversation.item.truncate failed: ${String(err)}`);
498
+ }
499
+ this.ws.send(JSON.stringify({ type: "response.cancel" }));
500
+ this.currentResponseItemId = null;
501
+ this.currentResponseAudioMs = 0;
502
+ this.currentResponseFirstAudioAt = null;
503
+ }
504
+ /** Inject a user text turn and request a new response. */
505
+ async sendText(text) {
506
+ this.ws?.send(JSON.stringify({
507
+ type: "conversation.item.create",
508
+ item: { type: "message", role: "user", content: [{ type: "input_text", text }] }
509
+ }));
510
+ this.ws?.send(JSON.stringify({ type: "response.create" }));
511
+ }
512
+ /**
513
+ * Trigger `response.create` with no new user item.
514
+ *
515
+ * Used by the Realtime stream-handler to drive a response after the
516
+ * client-side hallucination filter accepts an
517
+ * `input_audio_transcription.completed` event. The server VAD config
518
+ * sets `create_response: false` so OpenAI no longer auto-creates a
519
+ * response on every `input_audio_buffer.committed`; Patter is now
520
+ * responsible for triggering it explicitly when a real user turn lands.
521
+ */
522
+ async requestResponse() {
523
+ this.ws?.send(JSON.stringify({ type: "response.create" }));
524
+ }
525
+ /**
526
+ * Make the AI speak ``text`` as its opening line.
527
+ *
528
+ * Triggers ``response.create`` with explicit ``instructions`` that force
529
+ * the model to render ``text`` verbatim as its first audio utterance.
530
+ * This is the correct semantics for ``Agent.firstMessage`` per its
531
+ * docstring ("What the AI says when the callee answers").
532
+ *
533
+ * Without this, ``sendText(firstMessage)`` would inject ``text`` as
534
+ * ``role: user`` and the AI would *reply* to its own greeting, producing
535
+ * role-confused openings (e.g. a receptionist agent responding "I'd like
536
+ * to schedule a haircut" because it took its own first_message as a
537
+ * customer cue).
538
+ */
539
+ async sendFirstMessage(text) {
540
+ this.ws?.send(JSON.stringify({
541
+ type: "response.create",
542
+ response: {
543
+ modalities: ["audio", "text"],
544
+ instructions: `Say exactly the following sentence as your first turn and nothing else: "${text}"`
545
+ }
546
+ }));
547
+ }
548
+ /** Submit a tool/function-call result and request the next response. */
549
+ async sendFunctionResult(callId, result) {
550
+ this.ws?.send(JSON.stringify({
551
+ type: "conversation.item.create",
552
+ item: { type: "function_call_output", call_id: callId, output: result }
553
+ }));
554
+ this.ws?.send(JSON.stringify({ type: "response.create" }));
555
+ }
556
+ /** Stop the heartbeat, drop listeners, and close the Realtime WebSocket. */
557
+ close() {
558
+ if (this.heartbeat) {
559
+ clearInterval(this.heartbeat);
560
+ this.heartbeat = null;
561
+ }
562
+ this.eventCallbacks.clear();
563
+ this.messageListenerAttached = false;
564
+ this.ws?.close();
565
+ this.ws = null;
566
+ }
567
+ };
568
/**
 * Estimate the playback duration of an audio chunk from its byte length.
 *
 * G.711 formats are counted at 8 bytes per millisecond and PCM16 at 48
 * bytes per millisecond (which would correspond to 24 kHz 16-bit mono —
 * NOTE(review): confirm against the configured sample rate).
 *
 * @param chunk  Raw audio bytes; only `length` is read.
 * @param format One of the `OpenAIRealtimeAudioFormat` values.
 * @returns Whole milliseconds (rounded down); 0 for an empty chunk or an
 *          unrecognised format.
 */
function estimateAudioMs(chunk, format) {
  const byteCount = chunk.length;
  if (byteCount === 0) return 0;
  switch (format) {
    case OpenAIRealtimeAudioFormat.G711_ULAW:
    case OpenAIRealtimeAudioFormat.G711_ALAW:
      return Math.floor(byteCount / 8);
    case OpenAIRealtimeAudioFormat.PCM16:
      return Math.floor(byteCount / 48);
    default:
      return 0;
  }
}
577
+
578
+ // src/audio/transcoding.ts
579
+ init_esm_shims();
580
/**
 * 256-entry lookup table mapping a mu-law byte to its signed 16-bit linear
 * value. Built once at module load.
 */
var MULAW_TO_PCM16_TABLE = (() => {
  const BIAS = 132;
  return Int16Array.from({ length: 256 }, (_unused, code) => {
    // Mu-law bytes are stored bit-inverted.
    const bits = ~code & 0xff;
    const exponent = (bits >> 4) & 0x07;
    const mantissa = bits & 0x0f;
    const magnitude = (((mantissa << 1) | 33) << (exponent + 2)) - BIAS;
    // Bit 7 of the un-inverted byte carries the sign.
    return (bits & 0x80) !== 0 ? -magnitude : magnitude;
  });
})();
592
/**
 * 65536-entry lookup table mapping a 16-bit sample (indexed by its
 * unsigned two's-complement bit pattern) to its mu-law byte. Built once at
 * module load.
 */
var PCM16_TO_MULAW_TABLE = (() => {
  const BIAS = 132;
  const CLIP = 32635;
  const lut = new Uint8Array(65536);
  lut.forEach((_unused, pattern) => {
    // Reinterpret the index as a signed 16-bit value.
    const signed = pattern >= 32768 ? pattern - 65536 : pattern;
    const signBit = signed < 0 ? 0x80 : 0;
    const biased = Math.min(Math.abs(signed), CLIP) + BIAS;
    // `biased` lies in [132, 32767], so its top set bit is bit 7..14;
    // the segment number is that bit position minus 7.
    const exponent = 24 - Math.clz32(biased);
    const mantissa = (biased >> (exponent + 3)) & 0x0f;
    // The encoded byte is stored bit-inverted.
    lut[pattern] = ~(signBit | (exponent << 4) | mantissa) & 0xff;
  });
  return lut;
})();
613
/**
 * Expand mu-law bytes into little-endian 16-bit PCM.
 *
 * @param mulawData Byte sequence of mu-law samples (one byte per sample).
 * @returns A new Buffer twice as long as the input.
 */
function mulawToPcm16(mulawData) {
  const sampleCount = mulawData.length;
  const pcm = Buffer.alloc(sampleCount * 2);
  let offset = 0;
  for (let idx = 0; idx < sampleCount; idx += 1) {
    pcm.writeInt16LE(MULAW_TO_PCM16_TABLE[mulawData[idx]], offset);
    offset += 2;
  }
  return pcm;
}
620
/**
 * Compress little-endian 16-bit PCM into mu-law bytes.
 *
 * @param pcmData Buffer of PCM16-LE samples; a trailing odd byte is ignored.
 * @returns A new Buffer with one byte per complete input sample.
 */
function pcm16ToMulaw(pcmData) {
  const encoded = Buffer.alloc(Math.floor(pcmData.length / 2));
  for (let idx = 0; idx < encoded.length; idx += 1) {
    // Reading the sample as unsigned yields the table index directly.
    encoded[idx] = PCM16_TO_MULAW_TABLE[pcmData.readUInt16LE(idx * 2)];
  }
  return encoded;
}
629
/**
 * Keeps a PCM16 byte stream sample-aligned across chunk boundaries by
 * holding back a trailing odd byte until the next chunk arrives.
 */
var PcmCarry = class {
  // The held-back byte as a 1-byte Buffer, or null when the stream is aligned.
  pending = null;
  /**
   * Prepend any carried odd byte, return the even-length prefix, and stash
   * any new trailing odd byte for the next call.
   *
   * The returned prefix may share memory with `chunk`. The stashed byte is
   * copied, so it stays correct if the caller reuses or mutates `chunk`
   * before the next call and does not keep the whole chunk alive.
   *
   * @param chunk Incoming bytes (Buffer).
   * @returns The even-length prefix; zero-length when no complete sample is
   *          available yet.
   */
  push(chunk) {
    const combined = this.pending !== null ? Buffer.concat([this.pending, chunk]) : chunk;
    this.pending = null;
    const alignedLen = combined.length & ~1;
    if (alignedLen < combined.length) {
      // `Buffer.from(buffer)` copies; `subarray` alone would only be a view.
      this.pending = Buffer.from(combined.subarray(alignedLen));
    }
    return combined.subarray(0, alignedLen);
  }
  /**
   * Return the pending byte as a 1-byte buffer (empty buffer when there is
   * none) and clear it.
   */
  flush() {
    if (this.pending === null) return Buffer.alloc(0);
    const out = this.pending;
    this.pending = null;
    return out;
  }
  /** Discard any pending byte without returning it. */
  reset() {
    this.pending = null;
  }
};
661
+ var StatefulResampler = class {
662
+ srcRate;
663
+ dstRate;
664
+ // 16k→8k: 5-tap FIR state.
665
+ // Extended sample buffer carries the 2 history samples that precede the
666
+ // current chunk AND any "pending" input sample that did not yet generate
667
+ // output (i.e. the odd sample when the chunk had an odd sample count).
668
+ // `firPhase` = 0 means the next output is at input position 0 of the
669
+ // current chunk; 1 means it starts at input position 1 (because the
670
+ // previous chunk ended on an even-output boundary).
671
+ firHistory = new Int16Array(2);
672
+ // [s_{-2}, s_{-1}]
673
+ firHistoryValid = false;
674
+ // Pending sample carried from odd-count chunks (not the byte carry —
675
+ // this is a complete Int16 sample that becomes the first input for the
676
+ // next call).
677
+ firPendingSample = null;
678
+ // 8k→16k: last input sample deferred across chunk boundaries.
679
+ upsampleLast = 0;
680
+ upsampleHasHistory = false;
681
+ // 24k→16k: fractional phase and last input sample across chunks.
682
+ resample24Last = 0;
683
+ resample24Phase = 0;
684
+ resample24HasHistory = false;
685
+ // Odd-byte alignment carry.
686
+ carry = new PcmCarry();
687
+ constructor(opts) {
688
+ this.srcRate = opts.srcRate;
689
+ this.dstRate = opts.dstRate;
690
+ if (opts.channels !== void 0 && opts.channels !== 1) {
691
+ throw new Error("StatefulResampler: only mono (channels=1) is supported");
692
+ }
693
+ const key = `${this.srcRate}->${this.dstRate}`;
694
+ if (key !== "16000->8000" && key !== "8000->16000" && key !== "24000->16000" && key !== "24000->8000") {
695
+ throw new Error(
696
+ `StatefulResampler: unsupported conversion ${key}. Supported: 16000->8000, 8000->16000, 24000->16000, 24000->8000`
697
+ );
698
+ }
699
+ }
700
+ /**
701
+ * Process a chunk of PCM16-LE samples.
702
+ *
703
+ * Handles odd-byte inputs via an internal carry buffer. Returns an even-byte-
704
+ * aligned output buffer; may return a zero-length buffer if not enough
705
+ * aligned input is available yet.
706
+ */
707
+ process(pcm) {
708
+ const aligned = this.carry.push(pcm);
709
+ if (aligned.length === 0) return Buffer.alloc(0);
710
+ if (this.srcRate === 16e3 && this.dstRate === 8e3) {
711
+ return this._downsample16kTo8k(aligned);
712
+ }
713
+ if (this.srcRate === 8e3 && this.dstRate === 16e3) {
714
+ return this._upsample8kTo16k(aligned);
715
+ }
716
+ if (this.srcRate === 24e3 && this.dstRate === 8e3) {
717
+ return this._resample24kTo8k(aligned);
718
+ }
719
+ return this._resample24kTo16k(aligned);
720
+ }
721
+ /**
722
+ * Flush internal state and return any remaining output samples.
723
+ *
724
+ * For 8k→16k: the deferred last sample is emitted duplicated (matching
725
+ * the stateless helper's end-of-stream behaviour).
726
+ * For 16k→8k: any pending odd sample is processed with edge-replication.
727
+ * Resets all state after flushing.
728
+ */
729
+ flush() {
730
+ this.carry.flush();
731
+ if (this.srcRate === 16e3 && this.dstRate === 8e3 && this.firPendingSample !== null) {
732
+ const s = this.firPendingSample;
733
+ const tmp = Buffer.alloc(4);
734
+ tmp.writeInt16LE(s, 0);
735
+ tmp.writeInt16LE(s, 2);
736
+ const out = this._downsample16kTo8k(tmp);
737
+ this.firPendingSample = null;
738
+ return out;
739
+ }
740
+ if (this.srcRate === 8e3 && this.dstRate === 16e3 && this.upsampleHasHistory) {
741
+ const out = Buffer.alloc(4);
742
+ out.writeInt16LE(this.upsampleLast, 0);
743
+ out.writeInt16LE(this.upsampleLast, 2);
744
+ this.upsampleHasHistory = false;
745
+ this.upsampleLast = 0;
746
+ return out;
747
+ }
748
+ return Buffer.alloc(0);
749
+ }
750
+ /** Reset all carried state (e.g. at call boundaries). */
751
+ reset() {
752
+ this.firHistory = new Int16Array(2);
753
+ this.firHistoryValid = false;
754
+ this.firPendingSample = null;
755
+ this.upsampleLast = 0;
756
+ this.upsampleHasHistory = false;
757
+ this.resample24Last = 0;
758
+ this.resample24Phase = 0;
759
+ this.resample24HasHistory = false;
760
+ this.carry.reset();
761
+ }
762
+ // ---------------------------------------------------------------------------
763
+ // Private: 16 kHz → 8 kHz
764
+ // ---------------------------------------------------------------------------
765
+ /**
766
+ * 2:1 decimation with a 5-tap binomial FIR anti-alias filter.
767
+ *
768
+ * FIR coefficients: [1, 4, 6, 4, 1] / 16 (cutoff ~Fs/4 = 4 kHz).
769
+ *
770
+ * Cross-chunk state:
771
+ * - `firHistory[0]` = s_{-2}, `firHistory[1]` = s_{-1} relative to the
772
+ * virtual stream (seeded to first-sample on the very first call).
773
+ * - `firPendingSample` = a lone input sample carried from a chunk whose
774
+ * sample count was odd; it will become the first input of the next chunk.
775
+ *
776
+ * Decimation: outputs are at even positions (0, 2, 4 …) in the virtual
777
+ * extended stream, so every 2 input samples yield 1 output. An odd-sample-
778
+ * count chunk leaves 1 sample in `firPendingSample`; the next chunk
779
+ * prepends it so the output cadence is unbroken.
780
+ */
781
+ _downsample16kTo8k(buf) {
782
+ const newSampleCount = buf.length >> 1;
783
+ const hasPending = this.firPendingSample !== null;
784
+ const totalInput = newSampleCount + (hasPending ? 1 : 0);
785
+ const input = new Int16Array(totalInput);
786
+ if (hasPending) {
787
+ input[0] = this.firPendingSample;
788
+ for (let j = 0; j < newSampleCount; j++) input[j + 1] = buf.readInt16LE(j * 2);
789
+ } else {
790
+ for (let j = 0; j < newSampleCount; j++) input[j] = buf.readInt16LE(j * 2);
791
+ }
792
+ this.firPendingSample = null;
793
+ if (totalInput === 0) return Buffer.alloc(0);
794
+ if (!this.firHistoryValid) {
795
+ this.firHistory[0] = 0;
796
+ this.firHistory[1] = 0;
797
+ this.firHistoryValid = true;
798
+ }
799
+ const extended = new Int16Array(totalInput + 2);
800
+ extended[0] = this.firHistory[0];
801
+ extended[1] = this.firHistory[1];
802
+ for (let j = 0; j < totalInput; j++) extended[j + 2] = input[j];
803
+ const outSamples = totalInput >> 1;
804
+ const out = Buffer.alloc(outSamples * 2);
805
+ for (let i = 0; i < outSamples; i++) {
806
+ const c = 2 + i * 2;
807
+ const sM2 = extended[c - 2];
808
+ const sM1 = extended[c - 1];
809
+ const s0 = extended[c];
810
+ const sP1 = c + 1 < extended.length ? extended[c + 1] : extended[extended.length - 1];
811
+ const sP2 = c + 2 < extended.length ? extended[c + 2] : extended[extended.length - 1];
812
+ const filtered = sM2 + 4 * sM1 + 6 * s0 + 4 * sP1 + sP2 + 8 >> 4;
813
+ out.writeInt16LE(Math.max(-32768, Math.min(32767, filtered)), i * 2);
814
+ }
815
+ if (totalInput % 2 === 1) {
816
+ this.firPendingSample = input[totalInput - 1];
817
+ }
818
+ if (totalInput >= 2) {
819
+ this.firHistory[0] = input[totalInput - 2];
820
+ this.firHistory[1] = input[totalInput - 1];
821
+ } else {
822
+ this.firHistory[0] = this.firHistory[1];
823
+ this.firHistory[1] = input[0];
824
+ }
825
+ return out;
826
+ }
827
+ // ---------------------------------------------------------------------------
828
+ // Private: 8 kHz → 16 kHz
829
+ // ---------------------------------------------------------------------------
830
+ /**
831
+ * 1:2 linear-interpolation upsampler.
832
+ *
833
+ * For the first chunk (no history): emits 2*(N-1) samples and defers the
834
+ * last sample. For subsequent chunks (with history): emits the deferred
835
+ * sample + its interpolated midpoint THEN 2*(N-1) samples from the new
836
+ * chunk, deferring the new last sample. Total across K chunks + flush =
837
+ * 2*total_input_samples (correct output length).
838
+ *
839
+ * Call flush() after the final chunk to emit the last deferred sample
840
+ * pair (self-duplicate at end of stream).
841
+ */
842
+ _upsample8kTo16k(buf) {
843
+ const sampleCount = buf.length >> 1;
844
+ if (sampleCount === 0) return Buffer.alloc(0);
845
+ const outArr = [];
846
+ if (this.upsampleHasHistory) {
847
+ const next = buf.readInt16LE(0);
848
+ outArr.push(this.upsampleLast);
849
+ outArr.push(Math.round((this.upsampleLast + next) / 2));
850
+ }
851
+ for (let i = 0; i < sampleCount - 1; i++) {
852
+ const s0 = buf.readInt16LE(i * 2);
853
+ const s1 = buf.readInt16LE((i + 1) * 2);
854
+ outArr.push(s0);
855
+ outArr.push(Math.round((s0 + s1) / 2));
856
+ }
857
+ this.upsampleLast = buf.readInt16LE((sampleCount - 1) * 2);
858
+ this.upsampleHasHistory = true;
859
+ const outBuf = Buffer.alloc(outArr.length * 2);
860
+ for (let j = 0; j < outArr.length; j++) outBuf.writeInt16LE(outArr[j], j * 2);
861
+ return outBuf;
862
+ }
863
+ // ---------------------------------------------------------------------------
864
+ // Private: 24 kHz → 16 kHz / 8 kHz
865
+ // ---------------------------------------------------------------------------
866
+ /**
867
+ * 3:2 linear-interpolation decimator (ratio srcRate/dstRate = 1.5).
868
+ *
869
+ * `resample24Phase` tracks the fractional input position of the next output
870
+ * sample relative to the START of the next chunk. Negative phase means the
871
+ * next output straddles the previous/current chunk boundary; those are
872
+ * handled using `resample24Last`.
873
+ */
874
+ _resample24kTo16k(buf) {
875
+ return this._resample24kStep(buf, 24e3 / 16e3);
876
+ }
877
+ /** 3:1 decimation — collapses the 24k→16k→8k chain into a single step. */
878
+ _resample24kTo8k(buf) {
879
+ return this._resample24kStep(buf, 24e3 / 8e3);
880
+ }
881
+ /** Shared phase-stepping resampler used by 24→16 (step 1.5) and 24→8 (step 3). */
882
+ _resample24kStep(buf, step) {
883
+ const sampleCount = buf.length >> 1;
884
+ if (sampleCount === 0) return Buffer.alloc(0);
885
+ const outArr = [];
886
+ let phase = this.resample24Phase;
887
+ while (true) {
888
+ const idx = Math.floor(phase);
889
+ if (idx >= sampleCount) break;
890
+ const frac = phase - idx;
891
+ let s0;
892
+ let s1;
893
+ if (idx < 0) {
894
+ s0 = this.resample24HasHistory ? this.resample24Last : 0;
895
+ s1 = buf.readInt16LE(0);
896
+ } else {
897
+ s0 = buf.readInt16LE(idx * 2);
898
+ s1 = idx + 1 < sampleCount ? buf.readInt16LE((idx + 1) * 2) : s0;
899
+ }
900
+ const interp = Math.round(s0 + (s1 - s0) * frac);
901
+ outArr.push(Math.max(-32768, Math.min(32767, interp)));
902
+ phase += step;
903
+ }
904
+ this.resample24Last = buf.readInt16LE((sampleCount - 1) * 2);
905
+ this.resample24HasHistory = true;
906
+ this.resample24Phase = phase - sampleCount;
907
+ const outBuf = Buffer.alloc(outArr.length * 2);
908
+ for (let j = 0; j < outArr.length; j++) outBuf.writeInt16LE(outArr[j], j * 2);
909
+ return outBuf;
910
+ }
911
+ };
912
/** Creates a fresh `StatefulResampler` configured for 16 kHz → 8 kHz. */
function createResampler16kTo8k() {
  const rates = { srcRate: 16e3, dstRate: 8e3 };
  return new StatefulResampler(rates);
}
915
/** Creates a fresh `StatefulResampler` configured for 8 kHz → 16 kHz. */
function createResampler8kTo16k() {
  const rates = { srcRate: 8e3, dstRate: 16e3 };
  return new StatefulResampler(rates);
}
918
/** Creates a fresh `StatefulResampler` configured for 24 kHz → 16 kHz. */
function createResampler24kTo16k() {
  const rates = { srcRate: 24e3, dstRate: 16e3 };
  return new StatefulResampler(rates);
}
921
/** Creates a fresh `StatefulResampler` configured for 24 kHz → 8 kHz. */
function createResampler24kTo8k() {
  const rates = { srcRate: 24e3, dstRate: 8e3 };
  return new StatefulResampler(rates);
}
924
// Warn-once guards: each deprecated stateless resample helper flips its
// flag on first use so the deprecation warning is logged a single time.
var _warnedResample8kTo16k = false,
  _warnedResample16kTo8k = false,
  _warnedResample24kTo16k = false;
927
/**
 * Deprecated one-shot 8 kHz → 16 kHz conversion.
 *
 * Logs a deprecation warning on the first call, then runs the whole input
 * through a throwaway resampler from `createResampler8kTo16k()` and appends
 * whatever `flush()` returns.
 *
 * @param pcm8k Input audio buffer.
 * @returns The resampled audio; an empty buffer for empty input.
 */
function resample8kTo16k(pcm8k) {
  if (!_warnedResample8kTo16k) {
    _warnedResample8kTo16k = true;
    const logger = getLogger();
    logger.warn(
      "[patter] resample8kTo16k() is deprecated. Use createResampler8kTo16k() (StatefulResampler) to eliminate chunk-boundary discontinuities."
    );
  }
  if (pcm8k.length === 0) {
    return Buffer.alloc(0);
  }
  const resampler = createResampler8kTo16k();
  const body = resampler.process(pcm8k);
  const remainder = resampler.flush();
  if (remainder.length === 0) {
    return body;
  }
  return Buffer.concat([body, remainder]);
}
940
/**
 * Deprecated one-shot 16 kHz → 8 kHz conversion.
 *
 * Logs a deprecation warning on the first call, then runs the whole input
 * through a throwaway resampler from `createResampler16kTo8k()` and appends
 * whatever `flush()` returns.
 *
 * @param pcm16k Input audio buffer.
 * @returns The resampled audio; an empty buffer for empty input.
 */
function resample16kTo8k(pcm16k) {
  if (!_warnedResample16kTo8k) {
    _warnedResample16kTo8k = true;
    const logger = getLogger();
    logger.warn(
      "[patter] resample16kTo8k() is deprecated. Use createResampler16kTo8k() (StatefulResampler) to eliminate chunk-boundary discontinuities."
    );
  }
  if (pcm16k.length === 0) {
    return Buffer.alloc(0);
  }
  const resampler = createResampler16kTo8k();
  const body = resampler.process(pcm16k);
  const remainder = resampler.flush();
  if (remainder.length === 0) {
    return body;
  }
  return Buffer.concat([body, remainder]);
}
953
/**
 * Deprecated one-shot 24 kHz → 16 kHz conversion of a PCM-16-LE buffer.
 *
 * Logs a deprecation warning on the first call. The output holds
 * floor(2/3 · inputSamples) samples; output sample `i` is read at input
 * position 1.5·i and linearly interpolated between the two neighbouring
 * input samples, then rounded and clamped to the int16 range.
 *
 * @param pcm24k PCM-16-LE input; a trailing odd byte is ignored.
 * @returns PCM-16-LE output; an empty buffer for empty input.
 */
function resample24kTo16k(pcm24k) {
  if (!_warnedResample24kTo16k) {
    _warnedResample24kTo16k = true;
    const logger = getLogger();
    logger.warn(
      "[patter] resample24kTo16k() is deprecated. Use createResampler24kTo16k() (StatefulResampler) or OpenAITTS.resampleStreaming for anti-aliased resampling."
    );
  }
  if (pcm24k.length === 0) {
    return Buffer.alloc(0);
  }
  const inputSamples = Math.floor(pcm24k.length / 2);
  const outputSamples = Math.floor(inputSamples * 2 / 3);
  const result = Buffer.alloc(outputSamples * 2);
  let writeAt = 0;
  for (let n = 0; n < outputSamples; n++) {
    const position = n * 1.5;
    const base = Math.floor(position);
    const weight = position - base;
    const left = pcm24k.readInt16LE(base * 2);
    let right = left;
    if (base + 1 < inputSamples) {
      right = pcm24k.readInt16LE((base + 1) * 2);
    }
    const mixed = Math.round(left + (right - left) * weight);
    const clamped = Math.max(-32768, Math.min(32767, mixed));
    writeAt = result.writeInt16LE(clamped, writeAt);
  }
  return result;
}
975
+
976
+ // src/providers/openai-realtime-2.ts
977
/**
 * Maps GA Realtime server event names to the v1 (beta) names that the
 * base adapter's message listener understands.
 *
 * The table is built on a null prototype so that membership tests with the
 * `in` operator only ever match the keys listed here. With a plain object
 * literal, a server-supplied event type such as "constructor" or
 * "toString" would pass an `in` check and resolve to an inherited
 * function instead of an event name.
 */
var GA_TO_V1_EVENT_NAMES = Object.assign(Object.create(null), {
  "response.output_audio.delta": "response.audio.delta",
  "response.output_audio.done": "response.audio.done",
  "response.output_audio_transcript.delta": "response.audio_transcript.delta",
  "response.output_audio_transcript.done": "response.audio_transcript.done"
});
983
/**
 * Realtime adapter for the GA (non-beta) OpenAI Realtime endpoint.
 *
 * Differences from the base adapter that are implemented here:
 *  - `session.update` uses the GA shape (nested `audio.{input,output}`,
 *    `output_modalities`, `type: "realtime"`), with PCM-16 24 kHz in both
 *    directions.
 *  - Inbound carrier audio (mulaw 8 kHz) is transcoded to PCM-16 24 kHz
 *    before `input_audio_buffer.append`.
 *  - Incoming GA events listed in `GA_TO_V1_EVENT_NAMES` are renamed to
 *    their v1 names, and `response.output_audio.delta` payloads are
 *    transcoded to mulaw 8 kHz and re-framed, before they reach listeners
 *    registered through `ws.on("message", …)`.
 */
var OpenAIRealtime2Adapter = class extends OpenAIRealtimeAdapter {
  /** Two-stage outbound resampler for 24 kHz → 8 kHz. Created lazily on
   *  the first audio frame so each Realtime session has its own state.
   *
   *  We chain `24k → 16k → 8k` instead of using the direct `24k → 8k`
   *  variant of {@link StatefulResampler}: the direct path is a 3:1
   *  decimation with linear interpolation only — no anti-alias filter
   *  — so any energy above 4 kHz in the source aliases down into the
   *  audible band and is heard as raspy/scratchy artefacts on speech.
   *  `gpt-realtime-2` outputs voice with significant content above
   *  4 kHz. The second stage (16k → 8k) uses a 5-tap FIR anti-alias
   *  filter which removes the offending band before decimation, and
   *  empirically (see commit message) the chain produces audibly
   *  cleaner output. The 24k → 16k step is still pure linear-interp
   *  but the inputs to it stay below the Nyquist of the 16 kHz stage,
   *  so it doesn't introduce new artefacts.
   */
  outboundResampler24To16 = null;
  outboundResampler16To8 = null;
  /** Last 8 kHz input sample carried across chunk boundaries for the
   *  direct 3× linear upsample (see `transcodeInboundMulaw8ToPcm24`).
   *  The carry guarantees the very first output of each chunk
   *  interpolates from the *real* preceding sample, not from the chunk's
   *  own first sample replicated — without it every 20 ms Twilio frame
   *  boundary becomes a small DC step that the GA server VAD interprets
   *  as constant low-energy noise, which never crosses the speech
   *  threshold. */
  inbound8kCarry = null;

  /**
   * Builds the GA-shape `session` object sent with `session.update`.
   *
   * Optional fields (`temperature`, `max_output_tokens`, `tool_choice`,
   * `reasoning`, `tools`) are only included when the corresponding option
   * or tool list is set.
   *
   * @returns The session configuration object.
   */
  buildGASessionConfig() {
    const opts = this.options;
    const fmt = { type: "audio/pcm", rate: 24e3 };
    const config = {
      type: "realtime",
      output_modalities: opts.modalities ?? ["audio"],
      audio: {
        input: {
          format: fmt,
          transcription: {
            model: opts.inputAudioTranscriptionModel ?? OpenAITranscriptionModel.WHISPER_1
          },
          // VAD threshold raised back to the OpenAI default (0.5) on
          // 2026-05-22. The earlier 0.1 tuning (motivated by the
          // upsampled telephony-band loss in high frequencies) made the
          // server VAD trigger on the carrier-loopback echo of the
          // agent's OWN outbound audio in PSTN no-AEC scenarios.
          // Combined with the default ``turn_detection.create_response:
          // true``, every phantom ``speech_started`` ended a turn early
          // and auto-created a new response that the agent immediately
          // spoke over, leading to a runaway loop where the first
          // message was repeatedly cut and re-generated.
          turn_detection: {
            type: opts.vadType ?? OpenAIRealtimeVADType.SERVER_VAD,
            threshold: 0.5,
            prefix_padding_ms: 300,
            silence_duration_ms: opts.silenceDurationMs ?? 500,
            // Defer ``response.create`` to the application: when OpenAI's
            // server VAD commits an ``input_audio_buffer.committed`` segment
            // that turns out to be a Whisper hallucination on silence/echo,
            // auto-creating a response would generate a phantom turn (the
            // model reads the hallucinated text as user input). Patter
            // triggers ``response.create`` explicitly in the Realtime
            // stream-handler AFTER validating ``transcript_input`` against
            // the hallucination filter. Pair with ``interrupt_response:
            // false`` so server VAD also leaves in-flight responses alone —
            // barge-in is gated client-side.
            create_response: false,
            interrupt_response: false
          }
        },
        output: {
          format: fmt,
          voice: this.voice
        }
      },
      instructions: this.instructions || "You are a helpful voice assistant. Be concise."
    };
    if (opts.temperature !== void 0) config.temperature = opts.temperature;
    if (opts.maxResponseOutputTokens !== void 0) {
      config.max_output_tokens = opts.maxResponseOutputTokens;
    }
    if (opts.toolChoice !== void 0) config.tool_choice = opts.toolChoice;
    if (opts.reasoningEffort !== void 0) {
      config.reasoning = { effort: opts.reasoningEffort };
    }
    if (this.tools?.length) {
      config.tools = this.tools.map((t) => {
        const def = {
          type: "function",
          name: t.name,
          description: t.description,
          parameters: t.parameters
        };
        if (t.strict === true) def.strict = true;
        return def;
      });
    }
    return config;
  }

  /**
   * Best-effort close of a socket whose setup failed.
   *
   * A no-op `error` listener is attached first: the setup listeners have
   * already been removed at this point, and an `error` event emitted with
   * no listener attached is thrown by the event emitter.
   * NOTE(review): `ws` is expected to emit such a late `error` when
   * `close()` is called on a socket that is still connecting — confirm
   * against the `ws` version in use.
   */
  closeQuietly(ws) {
    try {
      ws.on("error", () => {
      });
      ws.close();
    } catch {
    }
  }

  /**
   * Runs the GA setup exchange on `ws`: waits for `session.created`, sends
   * the GA-shape `session.update`, and resolves on `session.updated`.
   *
   * The setup listeners are always removed when the promise settles. On
   * any failure (socket error, server `error` event, send failure or
   * timeout) the socket is closed before the promise rejects.
   *
   * Must be called BEFORE the translation shim replaces `ws.on`; otherwise
   * the listeners would be registered as wrappers that `ws.off` cannot
   * find.
   *
   * @param ws   Socket to prime.
   * @param spec `timeoutMs`, `timeoutMessage`, `errorPrefix` (prepended to
   *             the server error text) and `warnOnParseFailure`.
   * @returns Promise that resolves once the session is configured.
   */
  awaitGASessionReady(ws, spec) {
    return new Promise((resolve, reject) => {
      let sessionCreated = false;
      let settled = false;
      let timer;
      const settle = (failure) => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        ws.off("message", onMessage);
        ws.off("error", onError);
        if (failure === void 0) {
          resolve();
          return;
        }
        this.closeQuietly(ws);
        reject(failure);
      };
      const onMessage = (raw) => {
        let msg;
        try {
          msg = JSON.parse(raw.toString());
        } catch (e) {
          if (spec.warnOnParseFailure) {
            getLogger().warn(`OpenAI Realtime 2: failed to parse message: ${String(e)}`);
          }
          return;
        }
        const type = msg?.type;
        if (type === "session.created" && !sessionCreated) {
          sessionCreated = true;
          try {
            ws.send(JSON.stringify({ type: "session.update", session: this.buildGASessionConfig() }));
          } catch (err) {
            // A throw here would otherwise escape from an event callback.
            settle(err instanceof Error ? err : new Error(String(err)));
          }
        } else if (type === "session.updated") {
          settle();
        } else if (type === "error") {
          settle(new Error(`${spec.errorPrefix}: ${msg.error?.message ?? JSON.stringify(msg)}`));
        }
      };
      const onError = (err) => settle(err);
      timer = setTimeout(() => settle(new Error(spec.timeoutMessage)), spec.timeoutMs);
      ws.on("message", onMessage);
      ws.on("error", onError);
    });
  }

  /**
   * Converts one raw GA message into the payloads listeners should see.
   *
   * @param raw Message as delivered by the socket (string or Buffer).
   * @returns `null` when the message must be passed through untouched
   *          (not JSON, not a translated event type, or translation
   *          failed); otherwise an array of JSON Buffers to deliver in
   *          order. The array is empty when an audio delta produced no
   *          output samples.
   */
  translateGAMessage(raw) {
    try {
      const parsed = JSON.parse(typeof raw === "string" ? raw : raw.toString());
      const gaType = parsed?.type;
      if (typeof gaType !== "string" || !Object.prototype.hasOwnProperty.call(GA_TO_V1_EVENT_NAMES, gaType)) {
        return null;
      }
      const v1Type = GA_TO_V1_EVENT_NAMES[gaType];
      if (gaType === "response.output_audio.delta" && typeof parsed.delta === "string") {
        const mulaw = this.transcodeOutboundPcm24ToMulaw8Buffer(parsed.delta);
        // 160 mulaw bytes = 20 ms at 8 kHz, one carrier frame.
        const FRAME_BYTES = 160;
        const frames = [];
        for (let off = 0; off < mulaw.length; off += FRAME_BYTES) {
          const slice = mulaw.subarray(off, Math.min(off + FRAME_BYTES, mulaw.length));
          const frame = { ...parsed, type: v1Type, delta: slice.toString("base64") };
          frames.push(Buffer.from(JSON.stringify(frame)));
        }
        return frames;
      }
      parsed.type = v1Type;
      return [Buffer.from(JSON.stringify(parsed))];
    } catch {
      return null;
    }
  }

  /**
   * Replaces `ws.on` so that every `message` listener registered afterwards
   * receives v1-named events with mulaw audio (see `translateGAMessage`).
   * Listeners for other events are registered unchanged.
   *
   * Only translation is guarded: an exception thrown by the listener
   * itself propagates, and the listener is never invoked a second time
   * with the untranslated message.
   */
  installGATranslationShim(ws) {
    const originalOn = ws.on.bind(ws);
    ws.on = (event, handler) => {
      if (event !== "message") return originalOn(event, handler);
      return originalOn(event, (raw, ...rest) => {
        const payloads = this.translateGAMessage(raw);
        if (payloads === null) {
          handler(raw, ...rest);
          return;
        }
        for (const payload of payloads) handler(payload, ...rest);
      });
    };
  }

  /**
   * Open the Realtime WebSocket against the GA endpoint and apply the GA
   * session configuration. Header `OpenAI-Beta: realtime=v1` is OMITTED
   * (the GA endpoint rejects it). Wire shape uses nested `audio.{input,
   * output}` + `output_modalities` + `session.type === "realtime"`.
   *
   * The translation shim is installed only after the setup exchange has
   * finished, so the setup listeners are removed cleanly and never see
   * (or re-transcode) session traffic.
   *
   * Bounded by 15 s. Rejects on timeout, socket error or a server `error`
   * event during setup; the socket is closed in each of those cases.
   */
  async connect() {
    const url = `wss://api.openai.com/v1/realtime?model=${encodeURIComponent(this.model)}`;
    this.ws = new WebSocket2(url, {
      headers: { Authorization: `Bearer ${this.apiKey}` }
    });
    const ws = this.ws;
    await this.awaitGASessionReady(ws, {
      timeoutMs: 15e3,
      timeoutMessage: "OpenAI Realtime 2 connect timeout",
      errorPrefix: "OpenAI Realtime 2 setup error",
      warnOnParseFailure: true
    });
    this.installGATranslationShim(ws);
    this.armHeartbeatAndListener();
  }

  /**
   * GA-API variant of {@link OpenAIRealtimeAdapter.openParkedConnection}.
   * Opens a fresh Realtime WS against the GA endpoint, exchanges
   * `session.created` → GA-shape `session.update` → `session.updated`
   * so the upstream session is fully primed, and returns the OPEN
   * socket WITHOUT taking it on `this.ws` or arming the heartbeat /
   * message listener.
   *
   * Used by `Patter.parkProviderConnections` during the carrier
   * ringing window so the per-call `StreamHandler` can adopt the
   * primed socket at carrier `start` — eliminating the TCP + TLS +
   * HTTP-101 + `session.update` ack round-trip from the critical path.
   * Saves ~300-600 ms of first-audible-word latency.
   *
   * Bounded by 8 s. Throws on timeout / handshake failure / GA-side
   * rejection, closing the socket first so a failed park does not leave
   * a connection behind. Callers treat any error as a cache miss and
   * fall through to the cold {@link connect} path.
   *
   * While parked, the socket re-sends `session.update` every 3 s; the
   * interval handle is stored on the socket as `_parkedKeepalive` and
   * stops itself once the socket is no longer OPEN or a send fails.
   *
   * Billing safety: confirmed by OpenAI's Managing Realtime Costs
   * guide — `session.update` does NOT invoke the model and bills no
   * tokens. An idle parked socket costs $0.
   */
  async openParkedConnection() {
    const url = `wss://api.openai.com/v1/realtime?model=${encodeURIComponent(this.model)}`;
    const ws = new WebSocket2(url, {
      headers: { Authorization: `Bearer ${this.apiKey}` }
    });
    await this.awaitGASessionReady(ws, {
      timeoutMs: 8e3,
      timeoutMessage: "OpenAI Realtime 2 park connect timeout",
      errorPrefix: "OpenAI Realtime 2 parked-setup error",
      warnOnParseFailure: false
    });
    const keepalive = setInterval(() => {
      if (ws.readyState !== ws.OPEN) {
        clearInterval(keepalive);
        return;
      }
      try {
        ws.send(JSON.stringify({ type: "session.update", session: this.buildGASessionConfig() }));
      } catch {
        clearInterval(keepalive);
      }
    }, 3e3);
    ws._parkedKeepalive = keepalive;
    return ws;
  }

  /**
   * GA-API variant of {@link OpenAIRealtimeAdapter.adoptWebSocket}. Takes
   * over a WS that {@link openParkedConnection} produced (already through
   * `session.created` + `session.update` + `session.updated`), stops its
   * parked keepalive, installs the same GA translation shim that
   * {@link connect} uses (event renaming AND PCM-24k → mulaw-8k audio
   * transcoding) and arms the heartbeat + message listener. Skips the
   * cold-connect path — saves ~300-600 ms on first audible word.
   *
   * Caller MUST verify `ws.readyState === OPEN` before calling. If the
   * parked WS died between park and adopt, fall back to {@link connect}.
   */
  adoptWebSocket(ws) {
    if (ws._parkedKeepalive) {
      clearInterval(ws._parkedKeepalive);
      delete ws._parkedKeepalive;
    }
    this.ws = ws;
    this.installGATranslationShim(ws);
    this.armHeartbeatAndListener();
  }

  /**
   * Override the parent `sendAudio` to transcode inbound carrier audio
   * (mulaw 8 kHz from Twilio/Telnyx) into PCM-16 24 kHz before sending
   * `input_audio_buffer.append`. The GA server's audio engine ignores
   * mulaw frames (commit returns "buffer only has 0.00ms of audio") even
   * though it accepts `audio/pcmu` at the protocol level.
   *
   * Does nothing when there is no socket or it is not OPEN.
   */
  sendAudio(mulawAudio) {
    if (!this.ws || this.ws.readyState !== this.ws.OPEN) return;
    const pcm24k = this.transcodeInboundMulaw8ToPcm24(mulawAudio);
    this.ws.send(JSON.stringify({
      type: "input_audio_buffer.append",
      audio: pcm24k.toString("base64")
    }));
  }

  /**
   * mulaw 8 kHz Buffer → PCM-16-LE 24 kHz Buffer.
   *
   * Each decoded sample is first multiplied by a fixed gain of 2 and
   * clamped to the int16 range. The result is then upsampled 3× by linear
   * interpolation with a one-sample carry across chunk boundaries. For
   * every consecutive pair of 8 kHz samples `(s_a, s_b)` we emit three
   * 24 kHz samples:
   *
   *   out_0 = s_a
   *   out_1 = 2/3·s_a + 1/3·s_b
   *   out_2 = 1/3·s_a + 2/3·s_b
   *
   * The carry stores the last 8 kHz sample of the chunk so the next
   * chunk can start by pairing `(carry, firstNewSample)` — that's what
   * keeps the output rate exact (each input sample → 3 output samples)
   * and eliminates the chunk-boundary DC step that confused the GA
   * server VAD. The first chunk has no carry and loses 3 samples at
   * the leading edge (375 µs of audio); that's well below any audible
   * artefact and well below the GA VAD's 300 ms prefix-padding window.
   */
  transcodeInboundMulaw8ToPcm24(mulaw) {
    const pcm8 = mulawToPcm16(mulaw);
    const samples8 = pcm8.length / 2;
    if (samples8 === 0) return Buffer.alloc(0);
    const GAIN = 2;
    const inputs = [];
    if (this.inbound8kCarry !== null) inputs.push(this.inbound8kCarry);
    for (let i = 0; i < samples8; i++) {
      const raw = pcm8.readInt16LE(i * 2) * GAIN;
      inputs.push(Math.max(-32768, Math.min(32767, raw)));
    }
    this.inbound8kCarry = inputs[inputs.length - 1];
    const numPairs = inputs.length - 1;
    if (numPairs <= 0) return Buffer.alloc(0);
    // Every byte is written below, so the buffer need not be zero-filled.
    const out = Buffer.allocUnsafe(numPairs * 3 * 2);
    for (let i = 0; i < numPairs; i++) {
      const s0 = inputs[i];
      const s1 = inputs[i + 1];
      out.writeInt16LE(s0, i * 6);
      out.writeInt16LE(Math.round((s0 * 2 + s1) / 3), i * 6 + 2);
      out.writeInt16LE(Math.round((s0 + s1 * 2) / 3), i * 6 + 4);
    }
    return out;
  }

  /**
   * Base64 PCM-16-LE 24 kHz → mulaw 8 kHz Buffer. Used by the WS
   * translation shim on each `response.output_audio.delta`. The two
   * stateful resamplers (24k → 16k, then 16k → 8k) are created lazily and
   * reused across all deltas in this session so their state carries
   * across chunk boundaries — without that, every chunk boundary
   * produces a click.
   *
   * @returns The mulaw bytes; empty when this delta yielded no 8 kHz
   *          samples.
   */
  transcodeOutboundPcm24ToMulaw8Buffer(deltaB64) {
    if (!this.outboundResampler24To16) {
      this.outboundResampler24To16 = new StatefulResampler({ srcRate: 24e3, dstRate: 16e3 });
      this.outboundResampler16To8 = new StatefulResampler({ srcRate: 16e3, dstRate: 8e3 });
    }
    const pcm24 = Buffer.from(deltaB64, "base64");
    const pcm16 = this.outboundResampler24To16.process(pcm24);
    const pcm8 = this.outboundResampler16To8.process(pcm16);
    if (pcm8.length === 0) return Buffer.alloc(0);
    return pcm16ToMulaw(pcm8);
  }

  /**
   * GA-API variant of {@link OpenAIRealtimeAdapter.sendFirstMessage}. Two
   * differences from the v1 path:
   *
   *   1. The v1 implementation sends `response.modalities` which the GA
   *      endpoint rejects with `Unknown parameter: 'response.modalities'`.
   *      Use `output_modalities` to match the GA `session.update` shape.
   *
   *   2. The GA `response.create` does NOT inherit `audio.output.voice`
   *      from the session — it falls back to the server-side default
   *      (`marin`, female) when the field is omitted on the response
   *      itself. Session-level `voice: "alloy"` only affects subsequent
   *      server-VAD-triggered responses, NOT this explicit
   *      `response.create`. We re-inject the configured voice here so the
   *      first-message voice matches the rest of the call.
   *
   * The configured `reasoningEffort`, when set, is forwarded as well.
   * Nothing is sent when there is no socket.
   *
   * @param text Sentence the model is instructed to say verbatim.
   */
  async sendFirstMessage(text) {
    const responseBody = {
      output_modalities: ["audio"],
      audio: { output: { voice: this.voice } },
      instructions: `Say exactly the following sentence as your first turn and nothing else: "${text}"`
    };
    if (this.options.reasoningEffort !== void 0) {
      responseBody.reasoning = { effort: this.options.reasoningEffort };
    }
    this.ws?.send(JSON.stringify({ type: "response.create", response: responseBody }));
  }
};
1409
+
1410
+ export {
1411
+ OpenAIRealtimeAudioFormat,
1412
+ OpenAIRealtimeModel,
1413
+ OpenAIVoice,
1414
+ OpenAITranscriptionModel,
1415
+ OpenAIRealtimeVADType,
1416
+ OpenAIRealtimeAdapter,
1417
+ mulawToPcm16,
1418
+ pcm16ToMulaw,
1419
+ PcmCarry,
1420
+ StatefulResampler,
1421
+ createResampler16kTo8k,
1422
+ createResampler8kTo16k,
1423
+ createResampler24kTo16k,
1424
+ createResampler24kTo8k,
1425
+ resample8kTo16k,
1426
+ resample16kTo8k,
1427
+ resample24kTo16k,
1428
+ OpenAIRealtime2Adapter
1429
+ };