getpatter 0.6.1 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-CL2U3YET.mjs +1429 -0
- package/dist/{chunk-TEW3NAZJ.mjs → chunk-LE63CSOB.mjs} +371 -1486
- package/dist/{chunk-RV7APPYE.mjs → chunk-R2T4JABZ.mjs} +13 -0
- package/dist/cli.js +48 -23
- package/dist/dashboard/ui.html +8 -8
- package/dist/index.d.mts +452 -186
- package/dist/index.d.ts +452 -186
- package/dist/index.js +1485 -979
- package/dist/index.mjs +973 -790
- package/dist/openai-realtime-2-CNFARP25.mjs +8 -0
- package/dist/{silero-vad-NSEXI4XS.mjs → silero-vad-LNDFGIY7.mjs} +1 -1
- package/dist/{test-mode-WEKKNBLD.mjs → test-mode-RS57BDM6.mjs} +2 -1
- package/package.json +1 -1
- package/src/dashboard/ui.html +8 -8
|
@@ -1,3 +1,11 @@
|
|
|
1
|
+
import {
|
|
2
|
+
OpenAIRealtime2Adapter,
|
|
3
|
+
OpenAIRealtimeAdapter,
|
|
4
|
+
createResampler16kTo8k,
|
|
5
|
+
createResampler8kTo16k,
|
|
6
|
+
mulawToPcm16,
|
|
7
|
+
pcm16ToMulaw
|
|
8
|
+
} from "./chunk-CL2U3YET.mjs";
|
|
1
9
|
import {
|
|
2
10
|
getLogger
|
|
3
11
|
} from "./chunk-MVOQFAEO.mjs";
|
|
@@ -21,1259 +29,9 @@ import express from "express";
|
|
|
21
29
|
import { createServer } from "http";
|
|
22
30
|
import { WebSocketServer } from "ws";
|
|
23
31
|
|
|
24
|
-
// src/providers/openai-realtime.ts
|
|
25
|
-
init_esm_shims();
|
|
26
|
-
import WebSocket from "ws";
|
|
27
|
-
/**
 * Wire-format identifiers accepted for `input_audio_format` /
 * `output_audio_format` in the Realtime session config.
 * Frozen so a consumer cannot accidentally mutate the shared table.
 */
var OpenAIRealtimeAudioFormat = Object.freeze({
  G711_ULAW: "g711_ulaw",
  G711_ALAW: "g711_alaw",
  PCM16: "pcm16"
});

/** Realtime model identifiers (used as the `model` query parameter). */
var OpenAIRealtimeModel = Object.freeze({
  GPT_REALTIME: "gpt-realtime",
  GPT_REALTIME_2: "gpt-realtime-2",
  GPT_REALTIME_MINI: "gpt-realtime-mini",
  GPT_4O_REALTIME_PREVIEW: "gpt-4o-realtime-preview",
  GPT_4O_MINI_REALTIME_PREVIEW: "gpt-4o-mini-realtime-preview"
});

/** Voice identifiers sent as the session `voice` field. */
var OpenAIVoice = Object.freeze({
  ALLOY: "alloy",
  ASH: "ash",
  BALLAD: "ballad",
  CORAL: "coral",
  ECHO: "echo",
  FABLE: "fable",
  NOVA: "nova",
  ONYX: "onyx",
  SAGE: "sage",
  SHIMMER: "shimmer",
  VERSE: "verse"
});

/** Model identifiers for `input_audio_transcription.model`. */
var OpenAITranscriptionModel = Object.freeze({
  WHISPER_1: "whisper-1",
  GPT_4O_TRANSCRIBE: "gpt-4o-transcribe",
  GPT_4O_MINI_TRANSCRIBE: "gpt-4o-mini-transcribe",
  GPT_REALTIME_WHISPER: "gpt-realtime-whisper"
});

/** Values for `turn_detection.type` in the session config. */
var OpenAIRealtimeVADType = Object.freeze({
  SERVER_VAD: "server_vad",
  SEMANTIC_VAD: "semantic_vad"
});
|
|
62
|
-
/**
 * Client adapter for the OpenAI Realtime WebSocket endpoint (beta
 * `realtime=v1` header). Owns one WebSocket, applies the session
 * configuration, and fans parsed server events out to registered callbacks.
 */
var OpenAIRealtimeAdapter = class {
  /**
   * @param {string} apiKey - Bearer token sent in the `Authorization` header.
   * @param {string} [model] - Realtime model id (URL-encoded into the query string).
   * @param {string} [voice] - Voice id for the session.
   * @param {string} [instructions] - System instructions; an empty string falls back to a default prompt.
   * @param {Array<object>} [tools] - Function tool definitions (`name`, `description`, `parameters`, optional `strict`).
   * @param {string} [audioFormat] - Format used for both input and output audio.
   * @param {object} [options] - Optional overrides read by {@link buildSessionConfig}.
   */
  constructor(apiKey, model = OpenAIRealtimeModel.GPT_REALTIME_MINI, voice = OpenAIVoice.ALLOY, instructions = "", tools, audioFormat = OpenAIRealtimeAudioFormat.G711_ULAW, options = {}) {
    this.apiKey = apiKey;
    this.model = model;
    this.voice = voice;
    this.instructions = instructions;
    this.tools = tools;
    this.audioFormat = audioFormat;
    this.options = options;
  }
  apiKey;
  model;
  voice;
  instructions;
  tools;
  audioFormat;
  // Fields exposed `protected` (not `private`) so a subclass can implement
  // alternate transports — e.g. `OpenAIRealtime2Adapter` overrides
  // `connect()` to speak the GA Realtime API while reusing the rest of
  // the runtime (audio dispatch, barge-in, heartbeat).
  ws = null;
  eventCallbacks = /* @__PURE__ */ new Set();
  messageListenerAttached = false;
  heartbeat = null;
  // Track the in-flight assistant item id so we can truncate cleanly on
  // barge-in (see ``cancelResponse``) — matches the Python adapter.
  currentResponseItemId = null;
  currentResponseAudioMs = 0;
  // Wall-clock timestamp (Date.now()) of the first ``response.audio.delta``
  // received since the current response item started. ``cancelResponse``
  // uses this to bound ``audio_end_ms`` to what the caller could plausibly
  // have heard — generated audio frequently arrives 5-10x real-time, so
  // ``audio_end_ms`` driven purely by the per-chunk byte counter overshoots
  // reality and leaves phantom assistant text on the conversation. The
  // wall-clock cap corresponds to the maximum playback that real-time TTS
  // could have produced, which is what the user actually heard.
  currentResponseFirstAudioAt = null;
  options;
  /**
   * Build the production session.update body. Mirrors the body sent
   * inside `connect()` so warmup can apply identical configuration to
   * the upstream session and prime it without billing.
   *
   * @returns {object} The `session` payload for a `session.update` event.
   */
  buildSessionConfig() {
    const config = {
      input_audio_format: this.audioFormat,
      output_audio_format: this.audioFormat,
      voice: this.voice,
      // `||` (not `??`) on purpose: an empty instruction string also falls back.
      instructions: this.instructions || "You are a helpful voice assistant. Be concise.",
      turn_detection: {
        type: this.options.vadType ?? OpenAIRealtimeVADType.SERVER_VAD,
        threshold: 0.5,
        prefix_padding_ms: 300,
        silence_duration_ms: this.options.silenceDurationMs ?? 300
      },
      input_audio_transcription: {
        model: this.options.inputAudioTranscriptionModel ?? OpenAITranscriptionModel.WHISPER_1
      }
    };
    // Opt-in fields are only emitted when the caller set them explicitly.
    if (this.options.temperature !== void 0) config.temperature = this.options.temperature;
    if (this.options.maxResponseOutputTokens !== void 0) {
      config.max_response_output_tokens = this.options.maxResponseOutputTokens;
    }
    if (this.options.modalities !== void 0) config.modalities = this.options.modalities;
    if (this.options.toolChoice !== void 0) config.tool_choice = this.options.toolChoice;
    if (this.options.reasoningEffort !== void 0) {
      config.reasoning = { effort: this.options.reasoningEffort };
    }
    if (this.tools?.length) {
      config.tools = this.tools.map((t) => {
        const def = {
          type: "function",
          name: t.name,
          description: t.description,
          parameters: t.parameters
        };
        if (t.strict === true) {
          def.strict = true;
        }
        return def;
      });
    }
    return config;
  }
  /**
   * Pre-call WebSocket warmup for the OpenAI Realtime endpoint.
   *
   * The canonical session-only warm step on the Realtime API: open the
   * WS, wait for `session.created`, send a single `session.update`
   * containing the same fields that the production `connect()` path
   * applies (`input_audio_format`, `output_audio_format`, `voice`,
   * `instructions`, `turn_detection`, `input_audio_transcription`,
   * plus any opt-in fields populated on the adapter), wait for the
   * matching `session.updated` ack, then close cleanly. This primes
   * the per-session state on the OpenAI side — DNS + TLS + auth
   * handshake + initial config exchange — without ever invoking the
   * model.
   *
   * Earlier revisions sent `response.create` with
   * `{"response": {"generate": false}}` to prime the inference path.
   * That field is NOT in the OpenAI Realtime API schema; the server
   * either ignores it (and bills tokens for a real model response) or
   * rejects the request with `invalid_request_error`. Both behaviours
   * are billing-unsafe or a no-op beyond TLS warm. The
   * `session.update` flow is documented and side-effect-free.
   *
   * Billing safety: `session.update` only mutates session
   * configuration. It does NOT invoke the model, does NOT consume any
   * audio buffer, and does NOT trigger token generation, so no
   * per-token cost is accrued. Best-effort: failures are logged at
   * debug level and never raised.
   *
   * @returns {Promise<void>}
   */
  async warmup() {
    const url = `wss://api.openai.com/v1/realtime?model=${encodeURIComponent(this.model)}`;
    let ws = null;
    try {
      // Phase 1: open the socket (5 s budget).
      ws = await new Promise((resolve, reject) => {
        const sock = new WebSocket(url, {
          headers: {
            Authorization: `Bearer ${this.apiKey}`,
            "OpenAI-Beta": "realtime=v1"
          }
        });
        const timer = setTimeout(() => {
          try {
            sock.close();
          } catch {
            // Best-effort close; the timeout rejection below is what matters.
          }
          reject(new Error("OpenAI Realtime warmup connect timeout"));
        }, 5e3);
        sock.once("open", () => {
          clearTimeout(timer);
          resolve(sock);
        });
        sock.once("error", (err) => {
          clearTimeout(timer);
          reject(err);
        });
      });
      // Phase 2: wait for `session.created` (2 s budget); give up quietly otherwise.
      const sessionCreated = await new Promise((resolve) => {
        const timer = setTimeout(() => resolve(false), 2e3);
        const onMsg = (raw) => {
          try {
            const data = JSON.parse(raw.toString());
            if (data.type === "session.created") {
              clearTimeout(timer);
              ws.off("message", onMsg);
              resolve(true);
            }
          } catch {
            // Ignore unparsable frames during warmup.
          }
        };
        ws.on("message", onMsg);
      });
      if (!sessionCreated) return;
      try {
        ws.send(JSON.stringify({ type: "session.update", session: this.buildSessionConfig() }));
      } catch {
        return;
      }
      // Phase 3: wait for the `session.updated` ack (1.5 s budget).
      await new Promise((resolve) => {
        const timer = setTimeout(() => resolve(), 1500);
        const onMsg = (raw) => {
          try {
            const data = JSON.parse(raw.toString());
            if (data.type === "session.updated") {
              clearTimeout(timer);
              ws.off("message", onMsg);
              resolve();
            }
          } catch {
            // Ignore unparsable frames during warmup.
          }
        };
        ws.on("message", onMsg);
      });
    } catch (err) {
      getLogger().debug(`OpenAI Realtime warmup failed (best-effort): ${String(err)}`);
    } finally {
      if (ws) {
        try {
          ws.close();
        } catch {
          // Best-effort close.
        }
      }
    }
  }
  /**
   * Open the Realtime WebSocket and apply the session configuration.
   *
   * Resolves once `session.updated` is received. Rejects on socket error,
   * on the socket closing before the ack, if sending `session.update`
   * throws, or after a 15 s timeout. On every failure path the socket is
   * closed.
   *
   * @returns {Promise<void>}
   */
  async connect() {
    const url = `wss://api.openai.com/v1/realtime?model=${encodeURIComponent(this.model)}`;
    this.ws = new WebSocket(url, {
      headers: {
        Authorization: `Bearer ${this.apiKey}`,
        "OpenAI-Beta": "realtime=v1"
      }
    });
    await new Promise((resolve, reject) => {
      let sessionCreated = false;
      let settled = false;
      const ws = this.ws;
      const cleanup = () => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        ws.off("message", onSetupMessage);
        ws.off("error", onSetupError);
        ws.off("close", onSetupClose);
      };
      const fail = (err) => {
        if (settled) return;
        cleanup();
        // Closing a socket that is still CONNECTING makes `ws` emit `error`;
        // with the setup listener removed that would be an unhandled
        // `error` event, so keep a no-op listener on the dying socket.
        ws.on("error", () => {
        });
        try {
          ws.close();
        } catch {
          // Best-effort close; the rejection below carries the real error.
        }
        reject(err);
      };
      const onSetupMessage = (raw) => {
        let msg;
        try {
          msg = JSON.parse(raw.toString());
        } catch (e) {
          getLogger().warn(`OpenAI Realtime: failed to parse message: ${String(e)}`);
          return;
        }
        if (msg.type === "session.created" && !sessionCreated) {
          sessionCreated = true;
          try {
            ws.send(JSON.stringify({ type: "session.update", session: this.buildSessionConfig() }));
          } catch (err) {
            // A throw inside an event handler would otherwise be uncaught.
            fail(err instanceof Error ? err : new Error(String(err)));
          }
        } else if (msg.type === "session.updated") {
          cleanup();
          resolve();
        }
      };
      const onSetupError = (err) => fail(err);
      // Fail fast instead of waiting out the timeout when the peer hangs up.
      const onSetupClose = () => fail(new Error("OpenAI Realtime connection closed before session.updated"));
      const timer = setTimeout(() => fail(new Error("OpenAI Realtime connect timeout")), 15e3);
      ws.on("message", onSetupMessage);
      ws.on("error", onSetupError);
      ws.on("close", onSetupClose);
    });
    this.armHeartbeatAndListener();
  }
  /**
   * Adopt a pre-opened, already-`session.updated` Realtime WebSocket
   * produced by the prewarm pipeline (see `Patter.parkProviderConnections`).
   * Skips the fresh `new WebSocket()` + `session.created` /
   * `session.update` round-trip — saves ~250-450 ms on first turn.
   *
   * Caller MUST verify `ws.readyState === OPEN` before calling and MUST
   * have already received `session.updated` on the parked socket. If
   * the parked WS died between park and adopt, fall back to `connect()`.
   *
   * @param {object} ws - The parked WebSocket to take ownership of.
   */
  adoptWebSocket(ws) {
    this.ws = ws;
    this.armHeartbeatAndListener();
  }
  /**
   * Start the 20 s ping heartbeat and make sure the shared message
   * listener is attached. Any previously armed heartbeat is cleared first
   * so repeated `connect()` / `adoptWebSocket()` calls do not leak intervals.
   */
  armHeartbeatAndListener() {
    if (this.heartbeat) {
      clearInterval(this.heartbeat);
    }
    this.heartbeat = setInterval(() => {
      try {
        this.ws?.ping();
      } catch {
        // Ping is best-effort; a dead socket surfaces through `close`/`error`.
      }
    }, 2e4);
    this.ensureMessageListener();
  }
  /**
   * Open a fresh Realtime WS, exchange `session.created` /
   * `session.update` / `session.updated` (so the upstream session is
   * fully primed), and return the OPEN socket WITHOUT arming the
   * heartbeat / message listener. Used by the prewarm pipeline to park
   * a Realtime connection during ringing; the live consumer adopts it
   * via {@link adoptWebSocket}.
   *
   * Bounded by 8 s. Throws on timeout / handshake failure — callers
   * (the prewarm pipeline) treat any error as a cache miss and the
   * call falls through to the cold `connect()` path. On failure the
   * half-open socket is closed rather than leaked.
   *
   * Billing safety: `session.update` does not invoke the model. No
   * tokens are billed.
   *
   * @returns {Promise<object>} The primed WebSocket.
   */
  async openParkedConnection() {
    const url = `wss://api.openai.com/v1/realtime?model=${encodeURIComponent(this.model)}`;
    const ws = new WebSocket(url, {
      headers: {
        Authorization: `Bearer ${this.apiKey}`,
        "OpenAI-Beta": "realtime=v1"
      }
    });
    await new Promise((resolve, reject) => {
      let sessionCreated = false;
      let settled = false;
      const cleanup = () => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        ws.off("message", onMessage);
        ws.off("error", onError);
        ws.off("close", onClose);
      };
      const fail = (err) => {
        if (settled) return;
        cleanup();
        // Keep a no-op `error` listener: closing a CONNECTING socket emits
        // `error`, which would otherwise be unhandled after cleanup().
        ws.on("error", () => {
        });
        try {
          ws.close();
        } catch {
          // Best-effort close; the rejection below carries the real error.
        }
        reject(err);
      };
      const onMessage = (raw) => {
        let msg;
        try {
          msg = JSON.parse(raw.toString());
        } catch {
          return;
        }
        if (msg.type === "session.created" && !sessionCreated) {
          sessionCreated = true;
          try {
            ws.send(JSON.stringify({ type: "session.update", session: this.buildSessionConfig() }));
          } catch (err) {
            fail(err instanceof Error ? err : new Error(String(err)));
          }
        } else if (msg.type === "session.updated") {
          cleanup();
          resolve();
        }
      };
      const onError = (err) => fail(err);
      const onClose = () => fail(new Error("OpenAI Realtime park connection closed before session.updated"));
      const timer = setTimeout(() => fail(new Error("OpenAI Realtime park connect timeout")), 8e3);
      ws.on("message", onMessage);
      ws.on("error", onError);
      ws.on("close", onClose);
    });
    return ws;
  }
  /**
   * Append a base64-encoded audio chunk to the realtime input buffer.
   * Silently drops the chunk when the socket is missing or not OPEN.
   *
   * @param {Buffer} mulawAudio - Raw audio bytes in the session's input format.
   */
  sendAudio(mulawAudio) {
    if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;
    this.ws.send(JSON.stringify({ type: "input_audio_buffer.append", audio: mulawAudio.toString("base64") }));
  }
  /**
   * Register a listener for parsed realtime events.
   *
   * Previously every call attached a new ``ws.on('message')`` handler,
   * which leaked listeners across retries and multi-consumer hooks. We now
   * route all traffic through a single persistent handler that fans out to
   * a Set of callbacks. Use {@link offEvent} to remove one.
   *
   * @param {(type: string, payload: unknown) => unknown} callback
   */
  onEvent(callback) {
    this.eventCallbacks.add(callback);
    this.ensureMessageListener();
  }
  /** Remove a previously registered {@link onEvent} callback. */
  offEvent(callback) {
    this.eventCallbacks.delete(callback);
  }
  /**
   * Attach the single `message` / `close` / `error` listener set to the
   * current socket (no-op when already attached or when there is no socket).
   * Server events are translated to adapter-level events and fanned out to
   * every registered callback.
   */
  ensureMessageListener() {
    if (this.messageListenerAttached || !this.ws) return;
    this.messageListenerAttached = true;
    const ws = this.ws;
    const dispatch = (type, payload) => {
      for (const cb of this.eventCallbacks) {
        try {
          // Async rejections are logged via `.catch`; the surrounding
          // try/catch covers callbacks that throw synchronously, so one bad
          // consumer cannot starve the others or crash the socket handler.
          void Promise.resolve(cb(type, payload)).catch(
            (err) => getLogger().error("onEvent callback error:", err)
          );
        } catch (err) {
          getLogger().error("onEvent callback error:", err);
        }
      }
    };
    ws.on("message", (raw) => {
      let data;
      try {
        data = JSON.parse(raw.toString());
      } catch (e) {
        getLogger().warn(`OpenAI Realtime: failed to parse event message: ${String(e)}`);
        return;
      }
      const t = data.type;
      if (t === "response.audio.delta") {
        const buf = Buffer.from(data.delta ?? "", "base64");
        this.currentResponseAudioMs += estimateAudioMs(buf, this.audioFormat);
        if (this.currentResponseFirstAudioAt === null) {
          this.currentResponseFirstAudioAt = Date.now();
        }
        dispatch("audio", buf);
      } else if (t === "response.audio_transcript.delta") {
        dispatch("transcript_output", data.delta);
      } else if (t === "response.content_part.added" || t === "response.output_item.added") {
        // A new assistant item starts: restart the per-item audio bookkeeping.
        const itemId = data.item?.id ?? data.item_id ?? null;
        if (itemId) {
          this.currentResponseItemId = itemId;
          this.currentResponseAudioMs = 0;
          this.currentResponseFirstAudioAt = null;
        }
      } else if (t === "input_audio_buffer.speech_started") {
        dispatch("speech_started", null);
      } else if (t === "input_audio_buffer.speech_stopped") {
        dispatch("speech_stopped", null);
      } else if (t === "conversation.item.input_audio_transcription.completed") {
        dispatch("transcript_input", data.transcript);
      } else if (t === "response.function_call_arguments.done") {
        dispatch("function_call", { call_id: data.call_id, name: data.name, arguments: data.arguments });
      } else if (t === "response.done") {
        this.currentResponseItemId = null;
        this.currentResponseAudioMs = 0;
        this.currentResponseFirstAudioAt = null;
        dispatch("response_done", data.response ?? null);
      } else if (t === "error") {
        dispatch("error", data.error);
      }
    });
    ws.on("close", (code, reason) => {
      // 1000 is a normal closure; anything else is surfaced as an error event.
      if (code !== 1e3) {
        dispatch("error", {
          type: "connection_closed",
          code,
          reason: reason?.toString() ?? ""
        });
      }
    });
    ws.on("error", (err) => {
      dispatch("error", { type: "socket_error", message: err?.message ?? String(err) });
    });
  }
  /** Truncate the in-flight assistant turn and cancel the active response.
   *
   * ``audio_end_ms`` MUST reflect what the caller actually heard, not what
   * the server generated. OpenAI streams audio at 5-10x real-time, so the
   * byte-derived counter overstates playback whenever the consumer cleared
   * its playout buffer (e.g. ``send_clear``) before the audio reached the
   * speaker. We bound the truncate point by wall-clock time since the first
   * chunk of this response — that's the physical maximum a 1x real-time
   * playback could have produced. Without this cap, OpenAI keeps the full
   * generated assistant text on the transcript, and the model replays /
   * resumes from it on the next turn — manifesting as re-greetings and
   * mid-sentence fragments after a barge-in storm.
   *
   * Send failures are logged at debug level and never thrown; the
   * per-response bookkeeping is always reset.
   */
  cancelResponse() {
    if (!this.ws) return;
    if (this.currentResponseItemId) {
      let audioEndMs = this.currentResponseAudioMs;
      if (this.currentResponseFirstAudioAt !== null) {
        const elapsedMs = Date.now() - this.currentResponseFirstAudioAt;
        audioEndMs = Math.min(audioEndMs, Math.max(elapsedMs, 0));
      }
      try {
        this.ws.send(JSON.stringify({
          type: "conversation.item.truncate",
          item_id: this.currentResponseItemId,
          content_index: 0,
          audio_end_ms: audioEndMs
        }));
      } catch (err) {
        getLogger().debug?.(`conversation.item.truncate failed: ${String(err)}`);
      }
    }
    try {
      this.ws.send(JSON.stringify({ type: "response.cancel" }));
    } catch (err) {
      // Same best-effort policy as the truncate above: a socket that is not
      // open must not turn a barge-in into an exception.
      getLogger().debug?.(`response.cancel failed: ${String(err)}`);
    }
    this.currentResponseItemId = null;
    this.currentResponseAudioMs = 0;
    this.currentResponseFirstAudioAt = null;
  }
  /**
   * Inject a user text turn and request a new response.
   *
   * @param {string} text - Text added to the conversation with `role: "user"`.
   */
  async sendText(text) {
    this.ws?.send(JSON.stringify({
      type: "conversation.item.create",
      item: { type: "message", role: "user", content: [{ type: "input_text", text }] }
    }));
    this.ws?.send(JSON.stringify({ type: "response.create" }));
  }
  /**
   * Make the AI speak ``text`` as its opening line.
   *
   * Triggers ``response.create`` with explicit ``instructions`` that force
   * the model to render ``text`` verbatim as its first audio utterance.
   * This is the correct semantics for ``Agent.firstMessage`` per its
   * docstring ("What the AI says when the callee answers").
   *
   * Without this, ``sendText(firstMessage)`` would inject ``text`` as
   * ``role: user`` and the AI would *reply* to its own greeting, producing
   * role-confused openings (e.g. a receptionist agent responding "I'd like
   * to schedule a haircut" because it took its own first_message as a
   * customer cue).
   *
   * @param {string} text - The sentence the model is instructed to say.
   */
  async sendFirstMessage(text) {
    this.ws?.send(JSON.stringify({
      type: "response.create",
      response: {
        modalities: ["audio", "text"],
        instructions: `Say exactly the following sentence as your first turn and nothing else: "${text}"`
      }
    }));
  }
  /**
   * Submit a tool/function-call result and request the next response.
   *
   * @param {string} callId - The `call_id` from the originating function call.
   * @param {string} result - Output forwarded as the `function_call_output`.
   */
  async sendFunctionResult(callId, result) {
    this.ws?.send(JSON.stringify({
      type: "conversation.item.create",
      item: { type: "function_call_output", call_id: callId, output: result }
    }));
    this.ws?.send(JSON.stringify({ type: "response.create" }));
  }
  /** Stop the heartbeat, drop listeners, and close the Realtime WebSocket. */
  close() {
    if (this.heartbeat) {
      clearInterval(this.heartbeat);
      this.heartbeat = null;
    }
    this.eventCallbacks.clear();
    this.messageListenerAttached = false;
    this.ws?.close();
    this.ws = null;
  }
};
|
|
569
|
-
/**
 * Estimate the playback duration of an audio chunk, in whole milliseconds.
 *
 * G.711 (u-law / a-law) is counted at 8 bytes per millisecond and PCM16 at
 * 48 bytes per millisecond (presumably 8 kHz x 1 byte and 24 kHz x 2 bytes —
 * confirm against the session's sample rates). Unknown formats and empty
 * chunks yield 0.
 *
 * @param {Buffer} chunk - Raw audio bytes.
 * @param {string} format - One of the `OpenAIRealtimeAudioFormat` values.
 * @returns {number} Duration rounded down to an integer number of ms.
 */
function estimateAudioMs(chunk, format) {
  const byteCount = chunk.length;
  if (byteCount === 0) {
    return 0;
  }
  switch (format) {
    case OpenAIRealtimeAudioFormat.G711_ULAW:
    case OpenAIRealtimeAudioFormat.G711_ALAW:
      return Math.floor(byteCount / 8);
    case OpenAIRealtimeAudioFormat.PCM16:
      return Math.floor(byteCount / 48);
    default:
      return 0;
  }
}
|
|
578
|
-
|
|
579
|
-
// src/providers/openai-realtime-2.ts
|
|
580
|
-
init_esm_shims();
|
|
581
|
-
import WebSocket2 from "ws";
|
|
582
|
-
|
|
583
|
-
// src/audio/transcoding.ts
|
|
584
|
-
init_esm_shims();
|
|
585
|
-
/**
 * 256-entry lookup table mapping each mu-law byte to its signed 16-bit
 * linear PCM value. Built once at module load.
 */
var MULAW_TO_PCM16_TABLE = (() => {
  const BIAS = 132;
  // Decode a single mu-law code word (bits are stored inverted).
  const decode = (code) => {
    const inverted = ~code & 0xff;
    const exponent = (inverted >> 4) & 0x07;
    const mantissa = inverted & 0x0f;
    const magnitude = (((mantissa << 1) | 33) << (exponent + 2)) - BIAS;
    return (inverted & 0x80) !== 0 ? -magnitude : magnitude;
  };
  return Int16Array.from({ length: 256 }, (_unused, code) => decode(code));
})();
|
|
597
|
-
/**
 * 65536-entry lookup table mapping a 16-bit PCM sample (indexed by its
 * unsigned two's-complement bit pattern) to the mu-law byte that encodes it.
 * Built once at module load.
 */
var PCM16_TO_MULAW_TABLE = (() => {
  const BIAS = 132;
  const CLIP = 32635;
  const TOP_SEGMENT_BIT = 16384;
  // Encode one signed sample: clip, bias, locate the segment, pack, invert.
  const encode = (signedSample) => {
    const signBit = signedSample < 0 ? 128 : 0;
    const biased = Math.min(Math.abs(signedSample), CLIP) + BIAS;
    let exponent = 7;
    let probe = TOP_SEGMENT_BIT;
    while (probe > 0 && (biased & probe) === 0) {
      exponent -= 1;
      probe >>= 1;
    }
    const mantissa = (biased >> (exponent + 3)) & 15;
    return ~(signBit | (exponent << 4) | mantissa) & 255;
  };
  const lookup = new Uint8Array(65536);
  for (let pattern = 0; pattern < lookup.length; pattern += 1) {
    // Patterns 0x8000..0xFFFF are the negative half of the int16 range.
    const signedSample = pattern >= 32768 ? pattern - 65536 : pattern;
    lookup[pattern] = encode(signedSample);
  }
  return lookup;
})();
|
|
618
|
-
/**
 * Expand mu-law bytes into little-endian signed 16-bit PCM via
 * `MULAW_TO_PCM16_TABLE`.
 *
 * @param {Buffer} mulawData - One mu-law code word per byte.
 * @returns {Buffer} A new buffer, two bytes per input byte.
 */
function mulawToPcm16(mulawData) {
  const sampleCount = mulawData.length;
  const pcm = Buffer.alloc(sampleCount * 2);
  let writeOffset = 0;
  for (let index = 0; index < sampleCount; index += 1) {
    // writeInt16LE returns the offset just past the bytes it wrote.
    writeOffset = pcm.writeInt16LE(MULAW_TO_PCM16_TABLE[mulawData[index]], writeOffset);
  }
  return pcm;
}
|
|
625
|
-
/**
 * Compress little-endian signed 16-bit PCM into mu-law bytes via
 * `PCM16_TO_MULAW_TABLE`. A trailing odd byte, if any, is ignored.
 *
 * @param {Buffer} pcmData - PCM16-LE samples.
 * @returns {Buffer} A new buffer with one mu-law byte per complete sample.
 */
function pcm16ToMulaw(pcmData) {
  const encoded = Buffer.alloc(Math.floor(pcmData.length / 2));
  for (let index = 0; index < encoded.length; index += 1) {
    // The table is indexed by the sample's unsigned 16-bit pattern, which is
    // exactly what an unsigned read of the same two bytes yields.
    const pattern = pcmData.readUInt16LE(index * 2);
    encoded[index] = PCM16_TO_MULAW_TABLE[pattern];
  }
  return encoded;
}
|
|
634
|
-
/**
 * Byte-alignment helper for PCM16 streams: guarantees that every buffer
 * handed downstream has an even length by carrying a dangling odd byte
 * over to the next chunk.
 */
var PcmCarry = class {
  // The carried odd byte (a private 1-byte copy), or null when aligned.
  pending = null;
  /**
   * Prepend any carried odd byte, return the even-length prefix, and stash
   * any new trailing odd byte for the next call.
   *
   * Returns a zero-length buffer when no complete sample is yet available.
   *
   * The returned prefix may be a view over `chunk`; the carried byte is
   * always copied so that the caller reusing or mutating `chunk` afterwards
   * cannot corrupt the next call, and so a 1-byte carry does not keep the
   * whole source buffer alive.
   *
   * @param {Buffer} chunk - Next slice of the byte stream.
   * @returns {Buffer} Even-length data ready for 16-bit sample processing.
   */
  push(chunk) {
    const combined = this.pending !== null ? Buffer.concat([this.pending, chunk]) : chunk;
    this.pending = null;
    const alignedLen = combined.length & ~1;
    if (alignedLen < combined.length) {
      this.pending = Buffer.from(combined.subarray(alignedLen));
    }
    return combined.subarray(0, alignedLen);
  }
  /**
   * Return any pending byte as a 1-byte buffer (rare in practice — only if
   * the entire stream had an odd byte count), then reset internal state.
   *
   * @returns {Buffer} The carried byte, or an empty buffer.
   */
  flush() {
    if (this.pending === null) return Buffer.alloc(0);
    const out = this.pending;
    this.pending = null;
    return out;
  }
  /** Reset carry state without flushing. */
  reset() {
    this.pending = null;
  }
};
|
|
666
|
-
var StatefulResampler = class {
|
|
667
|
-
srcRate;
|
|
668
|
-
dstRate;
|
|
669
|
-
// 16k→8k: 5-tap FIR state.
|
|
670
|
-
// Extended sample buffer carries the 2 history samples that precede the
|
|
671
|
-
// current chunk AND any "pending" input sample that did not yet generate
|
|
672
|
-
// output (i.e. the odd sample when the chunk had an odd sample count).
|
|
673
|
-
// `firPhase` = 0 means the next output is at input position 0 of the
|
|
674
|
-
// current chunk; 1 means it starts at input position 1 (because the
|
|
675
|
-
// previous chunk ended on an even-output boundary).
|
|
676
|
-
firHistory = new Int16Array(2);
|
|
677
|
-
// [s_{-2}, s_{-1}]
|
|
678
|
-
firHistoryValid = false;
|
|
679
|
-
// Pending sample carried from odd-count chunks (not the byte carry —
|
|
680
|
-
// this is a complete Int16 sample that becomes the first input for the
|
|
681
|
-
// next call).
|
|
682
|
-
firPendingSample = null;
|
|
683
|
-
// 8k→16k: last input sample deferred across chunk boundaries.
|
|
684
|
-
upsampleLast = 0;
|
|
685
|
-
upsampleHasHistory = false;
|
|
686
|
-
// 24k→16k: fractional phase and last input sample across chunks.
|
|
687
|
-
resample24Last = 0;
|
|
688
|
-
resample24Phase = 0;
|
|
689
|
-
resample24HasHistory = false;
|
|
690
|
-
// Odd-byte alignment carry.
|
|
691
|
-
carry = new PcmCarry();
|
|
692
|
-
constructor(opts) {
|
|
693
|
-
this.srcRate = opts.srcRate;
|
|
694
|
-
this.dstRate = opts.dstRate;
|
|
695
|
-
if (opts.channels !== void 0 && opts.channels !== 1) {
|
|
696
|
-
throw new Error("StatefulResampler: only mono (channels=1) is supported");
|
|
697
|
-
}
|
|
698
|
-
const key = `${this.srcRate}->${this.dstRate}`;
|
|
699
|
-
if (key !== "16000->8000" && key !== "8000->16000" && key !== "24000->16000" && key !== "24000->8000") {
|
|
700
|
-
throw new Error(
|
|
701
|
-
`StatefulResampler: unsupported conversion ${key}. Supported: 16000->8000, 8000->16000, 24000->16000, 24000->8000`
|
|
702
|
-
);
|
|
703
|
-
}
|
|
704
|
-
}
|
|
705
|
-
/**
|
|
706
|
-
* Process a chunk of PCM16-LE samples.
|
|
707
|
-
*
|
|
708
|
-
* Handles odd-byte inputs via an internal carry buffer. Returns an even-byte-
|
|
709
|
-
* aligned output buffer; may return a zero-length buffer if not enough
|
|
710
|
-
* aligned input is available yet.
|
|
711
|
-
*/
|
|
712
|
-
process(pcm) {
|
|
713
|
-
const aligned = this.carry.push(pcm);
|
|
714
|
-
if (aligned.length === 0) return Buffer.alloc(0);
|
|
715
|
-
if (this.srcRate === 16e3 && this.dstRate === 8e3) {
|
|
716
|
-
return this._downsample16kTo8k(aligned);
|
|
717
|
-
}
|
|
718
|
-
if (this.srcRate === 8e3 && this.dstRate === 16e3) {
|
|
719
|
-
return this._upsample8kTo16k(aligned);
|
|
720
|
-
}
|
|
721
|
-
if (this.srcRate === 24e3 && this.dstRate === 8e3) {
|
|
722
|
-
return this._resample24kTo8k(aligned);
|
|
723
|
-
}
|
|
724
|
-
return this._resample24kTo16k(aligned);
|
|
725
|
-
}
|
|
726
|
-
/**
|
|
727
|
-
* Flush internal state and return any remaining output samples.
|
|
728
|
-
*
|
|
729
|
-
* For 8k→16k: the deferred last sample is emitted duplicated (matching
|
|
730
|
-
* the stateless helper's end-of-stream behaviour).
|
|
731
|
-
* For 16k→8k: any pending odd sample is processed with edge-replication.
|
|
732
|
-
* Resets all state after flushing.
|
|
733
|
-
*/
|
|
734
|
-
flush() {
|
|
735
|
-
this.carry.flush();
|
|
736
|
-
if (this.srcRate === 16e3 && this.dstRate === 8e3 && this.firPendingSample !== null) {
|
|
737
|
-
const s = this.firPendingSample;
|
|
738
|
-
const tmp = Buffer.alloc(4);
|
|
739
|
-
tmp.writeInt16LE(s, 0);
|
|
740
|
-
tmp.writeInt16LE(s, 2);
|
|
741
|
-
const out = this._downsample16kTo8k(tmp);
|
|
742
|
-
this.firPendingSample = null;
|
|
743
|
-
return out;
|
|
744
|
-
}
|
|
745
|
-
if (this.srcRate === 8e3 && this.dstRate === 16e3 && this.upsampleHasHistory) {
|
|
746
|
-
const out = Buffer.alloc(4);
|
|
747
|
-
out.writeInt16LE(this.upsampleLast, 0);
|
|
748
|
-
out.writeInt16LE(this.upsampleLast, 2);
|
|
749
|
-
this.upsampleHasHistory = false;
|
|
750
|
-
this.upsampleLast = 0;
|
|
751
|
-
return out;
|
|
752
|
-
}
|
|
753
|
-
return Buffer.alloc(0);
|
|
754
|
-
}
|
|
755
|
-
/** Reset all carried state (e.g. at call boundaries). */
|
|
756
|
-
reset() {
|
|
757
|
-
this.firHistory = new Int16Array(2);
|
|
758
|
-
this.firHistoryValid = false;
|
|
759
|
-
this.firPendingSample = null;
|
|
760
|
-
this.upsampleLast = 0;
|
|
761
|
-
this.upsampleHasHistory = false;
|
|
762
|
-
this.resample24Last = 0;
|
|
763
|
-
this.resample24Phase = 0;
|
|
764
|
-
this.resample24HasHistory = false;
|
|
765
|
-
this.carry.reset();
|
|
766
|
-
}
|
|
767
|
-
// ---------------------------------------------------------------------------
|
|
768
|
-
// Private: 16 kHz → 8 kHz
|
|
769
|
-
// ---------------------------------------------------------------------------
|
|
770
|
-
/**
|
|
771
|
-
* 2:1 decimation with a 5-tap binomial FIR anti-alias filter.
|
|
772
|
-
*
|
|
773
|
-
* FIR coefficients: [1, 4, 6, 4, 1] / 16 (cutoff ~Fs/4 = 4 kHz).
|
|
774
|
-
*
|
|
775
|
-
* Cross-chunk state:
|
|
776
|
-
* - `firHistory[0]` = s_{-2}, `firHistory[1]` = s_{-1} relative to the
|
|
777
|
-
* virtual stream (seeded to first-sample on the very first call).
|
|
778
|
-
* - `firPendingSample` = a lone input sample carried from a chunk whose
|
|
779
|
-
* sample count was odd; it will become the first input of the next chunk.
|
|
780
|
-
*
|
|
781
|
-
* Decimation: outputs are at even positions (0, 2, 4 …) in the virtual
|
|
782
|
-
* extended stream, so every 2 input samples yield 1 output. An odd-sample-
|
|
783
|
-
* count chunk leaves 1 sample in `firPendingSample`; the next chunk
|
|
784
|
-
* prepends it so the output cadence is unbroken.
|
|
785
|
-
*/
|
|
786
|
-
_downsample16kTo8k(buf) {
|
|
787
|
-
const newSampleCount = buf.length >> 1;
|
|
788
|
-
const hasPending = this.firPendingSample !== null;
|
|
789
|
-
const totalInput = newSampleCount + (hasPending ? 1 : 0);
|
|
790
|
-
const input = new Int16Array(totalInput);
|
|
791
|
-
if (hasPending) {
|
|
792
|
-
input[0] = this.firPendingSample;
|
|
793
|
-
for (let j = 0; j < newSampleCount; j++) input[j + 1] = buf.readInt16LE(j * 2);
|
|
794
|
-
} else {
|
|
795
|
-
for (let j = 0; j < newSampleCount; j++) input[j] = buf.readInt16LE(j * 2);
|
|
796
|
-
}
|
|
797
|
-
this.firPendingSample = null;
|
|
798
|
-
if (totalInput === 0) return Buffer.alloc(0);
|
|
799
|
-
if (!this.firHistoryValid) {
|
|
800
|
-
this.firHistory[0] = 0;
|
|
801
|
-
this.firHistory[1] = 0;
|
|
802
|
-
this.firHistoryValid = true;
|
|
803
|
-
}
|
|
804
|
-
const extended = new Int16Array(totalInput + 2);
|
|
805
|
-
extended[0] = this.firHistory[0];
|
|
806
|
-
extended[1] = this.firHistory[1];
|
|
807
|
-
for (let j = 0; j < totalInput; j++) extended[j + 2] = input[j];
|
|
808
|
-
const outSamples = totalInput >> 1;
|
|
809
|
-
const out = Buffer.alloc(outSamples * 2);
|
|
810
|
-
for (let i = 0; i < outSamples; i++) {
|
|
811
|
-
const c = 2 + i * 2;
|
|
812
|
-
const sM2 = extended[c - 2];
|
|
813
|
-
const sM1 = extended[c - 1];
|
|
814
|
-
const s0 = extended[c];
|
|
815
|
-
const sP1 = c + 1 < extended.length ? extended[c + 1] : extended[extended.length - 1];
|
|
816
|
-
const sP2 = c + 2 < extended.length ? extended[c + 2] : extended[extended.length - 1];
|
|
817
|
-
const filtered = sM2 + 4 * sM1 + 6 * s0 + 4 * sP1 + sP2 + 8 >> 4;
|
|
818
|
-
out.writeInt16LE(Math.max(-32768, Math.min(32767, filtered)), i * 2);
|
|
819
|
-
}
|
|
820
|
-
if (totalInput % 2 === 1) {
|
|
821
|
-
this.firPendingSample = input[totalInput - 1];
|
|
822
|
-
}
|
|
823
|
-
if (totalInput >= 2) {
|
|
824
|
-
this.firHistory[0] = input[totalInput - 2];
|
|
825
|
-
this.firHistory[1] = input[totalInput - 1];
|
|
826
|
-
} else {
|
|
827
|
-
this.firHistory[0] = this.firHistory[1];
|
|
828
|
-
this.firHistory[1] = input[0];
|
|
829
|
-
}
|
|
830
|
-
return out;
|
|
831
|
-
}
|
|
832
|
-
// ---------------------------------------------------------------------------
|
|
833
|
-
// Private: 8 kHz → 16 kHz
|
|
834
|
-
// ---------------------------------------------------------------------------
|
|
835
|
-
/**
|
|
836
|
-
* 1:2 linear-interpolation upsampler.
|
|
837
|
-
*
|
|
838
|
-
* For the first chunk (no history): emits 2*(N-1) samples and defers the
|
|
839
|
-
* last sample. For subsequent chunks (with history): emits the deferred
|
|
840
|
-
* sample + its interpolated midpoint THEN 2*(N-1) samples from the new
|
|
841
|
-
* chunk, deferring the new last sample. Total across K chunks + flush =
|
|
842
|
-
* 2*total_input_samples (correct output length).
|
|
843
|
-
*
|
|
844
|
-
* Call flush() after the final chunk to emit the last deferred sample
|
|
845
|
-
* pair (self-duplicate at end of stream).
|
|
846
|
-
*/
|
|
847
|
-
_upsample8kTo16k(buf) {
|
|
848
|
-
const sampleCount = buf.length >> 1;
|
|
849
|
-
if (sampleCount === 0) return Buffer.alloc(0);
|
|
850
|
-
const outArr = [];
|
|
851
|
-
if (this.upsampleHasHistory) {
|
|
852
|
-
const next = buf.readInt16LE(0);
|
|
853
|
-
outArr.push(this.upsampleLast);
|
|
854
|
-
outArr.push(Math.round((this.upsampleLast + next) / 2));
|
|
855
|
-
}
|
|
856
|
-
for (let i = 0; i < sampleCount - 1; i++) {
|
|
857
|
-
const s0 = buf.readInt16LE(i * 2);
|
|
858
|
-
const s1 = buf.readInt16LE((i + 1) * 2);
|
|
859
|
-
outArr.push(s0);
|
|
860
|
-
outArr.push(Math.round((s0 + s1) / 2));
|
|
861
|
-
}
|
|
862
|
-
this.upsampleLast = buf.readInt16LE((sampleCount - 1) * 2);
|
|
863
|
-
this.upsampleHasHistory = true;
|
|
864
|
-
const outBuf = Buffer.alloc(outArr.length * 2);
|
|
865
|
-
for (let j = 0; j < outArr.length; j++) outBuf.writeInt16LE(outArr[j], j * 2);
|
|
866
|
-
return outBuf;
|
|
867
|
-
}
|
|
868
|
-
// ---------------------------------------------------------------------------
|
|
869
|
-
// Private: 24 kHz → 16 kHz / 8 kHz
|
|
870
|
-
// ---------------------------------------------------------------------------
|
|
871
|
-
/**
|
|
872
|
-
* 3:2 linear-interpolation decimator (ratio srcRate/dstRate = 1.5).
|
|
873
|
-
*
|
|
874
|
-
* `resample24Phase` tracks the fractional input position of the next output
|
|
875
|
-
* sample relative to the START of the next chunk. Negative phase means the
|
|
876
|
-
* next output straddles the previous/current chunk boundary; those are
|
|
877
|
-
* handled using `resample24Last`.
|
|
878
|
-
*/
|
|
879
|
-
_resample24kTo16k(buf) {
|
|
880
|
-
return this._resample24kStep(buf, 24e3 / 16e3);
|
|
881
|
-
}
|
|
882
|
-
/** 3:1 decimation — collapses the 24k→16k→8k chain into a single step. */
|
|
883
|
-
_resample24kTo8k(buf) {
|
|
884
|
-
return this._resample24kStep(buf, 24e3 / 8e3);
|
|
885
|
-
}
|
|
886
|
-
/** Shared phase-stepping resampler used by 24→16 (step 1.5) and 24→8 (step 3). */
|
|
887
|
-
_resample24kStep(buf, step) {
|
|
888
|
-
const sampleCount = buf.length >> 1;
|
|
889
|
-
if (sampleCount === 0) return Buffer.alloc(0);
|
|
890
|
-
const outArr = [];
|
|
891
|
-
let phase = this.resample24Phase;
|
|
892
|
-
while (true) {
|
|
893
|
-
const idx = Math.floor(phase);
|
|
894
|
-
if (idx >= sampleCount) break;
|
|
895
|
-
const frac = phase - idx;
|
|
896
|
-
let s0;
|
|
897
|
-
let s1;
|
|
898
|
-
if (idx < 0) {
|
|
899
|
-
s0 = this.resample24HasHistory ? this.resample24Last : 0;
|
|
900
|
-
s1 = buf.readInt16LE(0);
|
|
901
|
-
} else {
|
|
902
|
-
s0 = buf.readInt16LE(idx * 2);
|
|
903
|
-
s1 = idx + 1 < sampleCount ? buf.readInt16LE((idx + 1) * 2) : s0;
|
|
904
|
-
}
|
|
905
|
-
const interp = Math.round(s0 + (s1 - s0) * frac);
|
|
906
|
-
outArr.push(Math.max(-32768, Math.min(32767, interp)));
|
|
907
|
-
phase += step;
|
|
908
|
-
}
|
|
909
|
-
this.resample24Last = buf.readInt16LE((sampleCount - 1) * 2);
|
|
910
|
-
this.resample24HasHistory = true;
|
|
911
|
-
this.resample24Phase = phase - sampleCount;
|
|
912
|
-
const outBuf = Buffer.alloc(outArr.length * 2);
|
|
913
|
-
for (let j = 0; j < outArr.length; j++) outBuf.writeInt16LE(outArr[j], j * 2);
|
|
914
|
-
return outBuf;
|
|
915
|
-
}
|
|
916
|
-
};
|
|
917
|
-
/** @returns {StatefulResampler} A fresh 16 kHz → 8 kHz resampler. */
function createResampler16kTo8k() {
  const rates = { srcRate: 16000, dstRate: 8000 };
  return new StatefulResampler(rates);
}

/** @returns {StatefulResampler} A fresh 8 kHz → 16 kHz resampler. */
function createResampler8kTo16k() {
  const rates = { srcRate: 8000, dstRate: 16000 };
  return new StatefulResampler(rates);
}

/** @returns {StatefulResampler} A fresh 24 kHz → 16 kHz resampler. */
function createResampler24kTo16k() {
  const rates = { srcRate: 24000, dstRate: 16000 };
  return new StatefulResampler(rates);
}

/** @returns {StatefulResampler} A fresh 24 kHz → 8 kHz resampler. */
function createResampler24kTo8k() {
  const rates = { srcRate: 24000, dstRate: 8000 };
  return new StatefulResampler(rates);
}
|
|
929
|
-
// One-shot flags so each deprecated helper logs its warning only once.
var _warnedResample8kTo16k = false;
var _warnedResample16kTo8k = false;
var _warnedResample24kTo16k = false;

/**
 * @deprecated Stateless 8 kHz → 16 kHz helper. Runs the whole buffer
 * through a throw-away resampler from `createResampler8kTo16k()` and
 * appends its flushed tail.
 * @param {Buffer} pcm8k PCM16-LE at 8 kHz.
 * @returns {Buffer} PCM16-LE at 16 kHz.
 */
function resample8kTo16k(pcm8k) {
  if (_warnedResample8kTo16k === false) {
    _warnedResample8kTo16k = true;
    getLogger().warn(
      "[patter] resample8kTo16k() is deprecated. Use createResampler8kTo16k() (StatefulResampler) to eliminate chunk-boundary discontinuities."
    );
  }
  if (pcm8k.length === 0) return Buffer.alloc(0);

  const resampler = createResampler8kTo16k();
  const body = resampler.process(pcm8k);
  const remainder = resampler.flush();
  if (remainder.length > 0) return Buffer.concat([body, remainder]);
  return body;
}

/**
 * @deprecated Stateless 16 kHz → 8 kHz helper. Runs the whole buffer
 * through a throw-away resampler from `createResampler16kTo8k()` and
 * appends its flushed tail.
 * @param {Buffer} pcm16k PCM16-LE at 16 kHz.
 * @returns {Buffer} PCM16-LE at 8 kHz.
 */
function resample16kTo8k(pcm16k) {
  if (_warnedResample16kTo8k === false) {
    _warnedResample16kTo8k = true;
    getLogger().warn(
      "[patter] resample16kTo8k() is deprecated. Use createResampler16kTo8k() (StatefulResampler) to eliminate chunk-boundary discontinuities."
    );
  }
  if (pcm16k.length === 0) return Buffer.alloc(0);

  const resampler = createResampler16kTo8k();
  const body = resampler.process(pcm16k);
  const remainder = resampler.flush();
  if (remainder.length > 0) return Buffer.concat([body, remainder]);
  return body;
}

/**
 * @deprecated Stateless 24 kHz → 16 kHz helper using plain linear
 * interpolation: output sample i is read at input position i · 1.5, and
 * floor(2·N/3) samples are produced for N whole input samples.
 * @param {Buffer} pcm24k PCM16-LE at 24 kHz.
 * @returns {Buffer} PCM16-LE at 16 kHz.
 */
function resample24kTo16k(pcm24k) {
  if (_warnedResample24kTo16k === false) {
    _warnedResample24kTo16k = true;
    getLogger().warn(
      "[patter] resample24kTo16k() is deprecated. Use createResampler24kTo16k() (StatefulResampler) or OpenAITTS.resampleStreaming for anti-aliased resampling."
    );
  }
  if (pcm24k.length === 0) return Buffer.alloc(0);

  const inCount = Math.floor(pcm24k.length / 2);
  const outCount = Math.floor((inCount * 2) / 3);
  const result = Buffer.alloc(outCount * 2);
  for (let k = 0; k < outCount; k++) {
    const position = k * 1.5;
    const base = Math.floor(position);
    const weight = position - base;
    const left = pcm24k.readInt16LE(base * 2);
    // Past the final sample there is no right neighbour: repeat the left one.
    const right = base + 1 < inCount ? pcm24k.readInt16LE((base + 1) * 2) : left;
    const value = Math.round(left + (right - left) * weight);
    result.writeInt16LE(Math.max(-32768, Math.min(32767, value)), k * 2);
  }
  return result;
}
|
|
980
|
-
|
|
981
|
-
// src/providers/openai-realtime-2.ts
|
|
982
|
-
/**
 * Maps GA Realtime server event names to the v1 (beta) names they are
 * rewritten to before being handed to message handlers.
 *
 * The table has a null prototype and is frozen so that membership tests
 * such as `type in GA_TO_V1_EVENT_NAMES` can never match inherited keys
 * (`"toString"`, `"constructor"`, …) coming from a server-supplied event
 * type, and so the table cannot be mutated at runtime.
 */
var GA_TO_V1_EVENT_NAMES = Object.freeze(
  Object.assign(Object.create(null), {
    "response.output_audio.delta": "response.audio.delta",
    "response.output_audio.done": "response.audio.done",
    "response.output_audio_transcript.delta": "response.audio_transcript.delta",
    "response.output_audio_transcript.done": "response.audio_transcript.done"
  })
);
|
|
988
|
-
/**
 * Realtime adapter for the GA endpoint. Differences from the parent
 * adapter that are visible in this class:
 *   - `session.update` uses the GA shape (`buildGASessionConfig`);
 *   - GA event names are rewritten to their v1 names before reaching
 *     message handlers (see `connect`);
 *   - audio is exchanged with the server as PCM16 24 kHz and transcoded
 *     to/from mulaw 8 kHz locally.
 */
var OpenAIRealtime2Adapter = class extends OpenAIRealtimeAdapter {
  /** Two-stage outbound resampler for 24 kHz → 8 kHz. Created lazily on
   *  the first audio frame so each Realtime session has its own state.
   *
   *  We chain `24k → 16k → 8k` instead of using the direct `24k → 8k`
   *  variant of {@link StatefulResampler}: the direct path is a 3:1
   *  decimation with linear interpolation only — no anti-alias filter —
   *  so energy above 4 kHz in the source aliases down into the audible
   *  band. The second stage (16k → 8k) uses a 5-tap FIR anti-alias filter
   *  before decimation.
   */
  outboundResampler24To16 = null;
  outboundResampler16To8 = null;
  /** Last (gain-adjusted) 8 kHz input sample carried across chunk
   *  boundaries for the direct 3× linear upsample (see
   *  `transcodeInboundMulaw8ToPcm24`), so the first output of each chunk
   *  interpolates from the real preceding sample. */
  inbound8kCarry = null;

  /**
   * Build the GA-shape `session.update` payload from `this.options`,
   * `this.voice`, `this.instructions` and `this.tools`.
   *
   * @returns {object} Value for the `session` field of `session.update`.
   */
  buildGASessionConfig() {
    const opts = this.options;
    const fmt = { type: "audio/pcm", rate: 24e3 };
    const config = {
      type: "realtime",
      output_modalities: opts.modalities ?? ["audio"],
      audio: {
        input: {
          format: fmt,
          transcription: {
            model: opts.inputAudioTranscriptionModel ?? OpenAITranscriptionModel.WHISPER_1
          },
          // Lower threshold (0.1 vs the 0.5 default) because the inbound
          // audio is telephony-band (8 kHz) linearly upsampled to 24 kHz —
          // the upper 4-12 kHz band is interpolation, not real harmonics.
          // A more permissive threshold is intended to recover reliable
          // speech detection on phone-band input.
          turn_detection: {
            type: opts.vadType ?? OpenAIRealtimeVADType.SERVER_VAD,
            threshold: 0.1,
            prefix_padding_ms: 300,
            silence_duration_ms: opts.silenceDurationMs ?? 500
          }
        },
        output: {
          format: fmt,
          voice: this.voice
        }
      },
      instructions: this.instructions || "You are a helpful voice assistant. Be concise."
    };
    if (opts.temperature !== void 0) config.temperature = opts.temperature;
    if (opts.maxResponseOutputTokens !== void 0) {
      config.max_output_tokens = opts.maxResponseOutputTokens;
    }
    if (opts.toolChoice !== void 0) config.tool_choice = opts.toolChoice;
    if (opts.reasoningEffort !== void 0) {
      config.reasoning = { effort: opts.reasoningEffort };
    }
    if (this.tools?.length) {
      config.tools = this.tools.map((t) => {
        const def = {
          type: "function",
          name: t.name,
          description: t.description,
          parameters: t.parameters
        };
        if (t.strict === true) def.strict = true;
        return def;
      });
    }
    return config;
  }

  /**
   * Open the Realtime WebSocket against the GA endpoint and apply the GA
   * session configuration. Header `OpenAI-Beta: realtime=v1` is OMITTED
   * (the GA endpoint rejects it).
   *
   * `ws.on("message", …)` is replaced by a shim that rewrites GA event
   * names to their v1 equivalents and transcodes outbound audio deltas to
   * 160-byte mulaw frames, so handlers registered afterwards see v1-shaped
   * events.
   *
   * Resolves once `session.updated` arrives; rejects on a server `error`
   * message, a socket error, or after 15 s.
   *
   * @returns {Promise<void>}
   */
  async connect() {
    const url = `wss://api.openai.com/v1/realtime?model=${encodeURIComponent(this.model)}`;
    this.ws = new WebSocket2(url, {
      headers: { Authorization: `Bearer ${this.apiKey}` }
    });
    const wsRef = this.ws;
    const originalOn = wsRef.on.bind(this.ws);
    const hasOwn = Object.prototype.hasOwnProperty;
    wsRef.on = (event, handler) => {
      if (event !== "message") return originalOn(event, handler);
      const wrapped = (raw, ...rest) => {
        // Only parsing/transcoding is guarded. The handler itself is always
        // invoked outside the try so that an exception thrown by the handler
        // is neither swallowed nor followed by a second delivery of the raw
        // event.
        let parsed;
        try {
          parsed = JSON.parse(typeof raw === "string" ? raw : raw.toString());
        } catch {
          handler(raw, ...rest); // not JSON: pass through untouched
          return;
        }
        const t = parsed !== null && typeof parsed === "object" ? parsed.type : void 0;
        // Own-property check: `in` would also match Object.prototype keys.
        if (typeof t !== "string" || !hasOwn.call(GA_TO_V1_EVENT_NAMES, t)) {
          handler(raw, ...rest);
          return;
        }
        const newType = GA_TO_V1_EVENT_NAMES[t];
        if (t === "response.output_audio.delta" && typeof parsed.delta === "string") {
          let mulaw;
          try {
            mulaw = this.transcodeOutboundPcm24ToMulaw8Buffer(parsed.delta);
          } catch (e) {
            // Best effort: keep the previous fallback of forwarding the raw
            // event, but no longer silently.
            getLogger().warn(`OpenAI Realtime 2: failed to transcode audio delta: ${String(e)}`);
            handler(raw, ...rest);
            return;
          }
          if (mulaw.length === 0) return;
          const FRAME_BYTES = 160; // 20 ms of mulaw at 8 kHz
          for (let off = 0; off < mulaw.length; off += FRAME_BYTES) {
            const slice = mulaw.subarray(off, Math.min(off + FRAME_BYTES, mulaw.length));
            const frame = { ...parsed, type: newType, delta: slice.toString("base64") };
            handler(Buffer.from(JSON.stringify(frame)), ...rest);
          }
          return;
        }
        parsed.type = newType;
        handler(Buffer.from(JSON.stringify(parsed)), ...rest);
      };
      return originalOn(event, wrapped);
    };
    await new Promise((resolve, reject) => {
      let sessionCreated = false;
      let settled = false;
      const ws = this.ws;
      const onSetupMessage = (raw) => {
        if (settled) return;
        let msg;
        try {
          msg = JSON.parse(raw.toString());
        } catch (e) {
          getLogger().warn(`OpenAI Realtime 2: failed to parse message: ${String(e)}`);
          return;
        }
        if (msg.type === "session.created" && !sessionCreated) {
          sessionCreated = true;
          ws.send(JSON.stringify({ type: "session.update", session: this.buildGASessionConfig() }));
        } else if (msg.type === "session.updated") {
          cleanup();
          resolve();
        } else if (msg.type === "error") {
          cleanup();
          try {
            ws.close();
          } catch {
            // closing is best-effort; the rejection below reports the failure
          }
          reject(new Error(`OpenAI Realtime 2 setup error: ${msg.error?.message ?? JSON.stringify(msg)}`));
        }
      };
      const onSetupError = (err) => {
        cleanup();
        try {
          ws.close();
        } catch {
          // closing is best-effort; the rejection below reports the failure
        }
        reject(err);
      };
      const cleanup = () => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        ws.off("message", onSetupMessage);
        ws.off("error", onSetupError);
      };
      const timer = setTimeout(() => {
        cleanup();
        try {
          ws.close();
        } catch {
          // closing is best-effort; the rejection below reports the failure
        }
        reject(new Error("OpenAI Realtime 2 connect timeout"));
      }, 15e3);
      // Register through the ORIGINAL `on`: the shim above would register a
      // wrapper instead of `onSetupMessage`, so `ws.off("message",
      // onSetupMessage)` could never remove it and the setup listener would
      // stay attached for the whole call — closing the socket on any later
      // server `error` event. None of the setup events are renamed, so no
      // translation is needed here.
      originalOn("message", onSetupMessage);
      originalOn("error", onSetupError);
    });
    this.armHeartbeatAndListener();
  }

  /**
   * Override the parent `sendAudio` to transcode inbound carrier audio
   * (mulaw 8 kHz) into PCM-16 24 kHz before sending
   * `input_audio_buffer.append`. No-op when the socket is not open.
   *
   * @param {Buffer} mulawAudio mulaw 8 kHz bytes.
   */
  sendAudio(mulawAudio) {
    if (!this.ws || this.ws.readyState !== this.ws.OPEN) return;
    const pcm24k = this.transcodeInboundMulaw8ToPcm24(mulawAudio);
    this.ws.send(JSON.stringify({
      type: "input_audio_buffer.append",
      audio: pcm24k.toString("base64")
    }));
  }

  /**
   * mulaw 8 kHz Buffer → PCM-16-LE 24 kHz Buffer.
   *
   * Each decoded sample is multiplied by 2 and clamped to the int16 range,
   * then a direct 3× linear-interpolation upsample is applied with a
   * one-sample carry across chunk boundaries. For every consecutive pair
   * of 8 kHz samples `(s_a, s_b)` three 24 kHz samples are emitted:
   *
   *   out_0 = s_a
   *   out_1 = 2/3·s_a + 1/3·s_b
   *   out_2 = 1/3·s_a + 2/3·s_b
   *
   * The carry stores the last 8 kHz sample of the chunk so the next chunk
   * starts by pairing `(carry, firstNewSample)`. The first chunk has no
   * carry and therefore produces 3 output samples fewer.
   *
   * @param {Buffer} mulaw mulaw 8 kHz bytes.
   * @returns {Buffer} PCM16-LE at 24 kHz (possibly zero-length).
   */
  transcodeInboundMulaw8ToPcm24(mulaw) {
    const pcm8 = mulawToPcm16(mulaw);
    const samples8 = pcm8.length / 2;
    if (samples8 === 0) return Buffer.alloc(0);
    const GAIN = 2;
    const inputs = [];
    if (this.inbound8kCarry !== null) inputs.push(this.inbound8kCarry);
    for (let i = 0; i < samples8; i++) {
      const raw = pcm8.readInt16LE(i * 2) * GAIN;
      inputs.push(Math.max(-32768, Math.min(32767, raw)));
    }
    this.inbound8kCarry = inputs[inputs.length - 1];
    const numPairs = inputs.length - 1;
    if (numPairs <= 0) return Buffer.alloc(0);
    // allocUnsafe is fine: every byte is written by the loop below.
    const out = Buffer.allocUnsafe(numPairs * 3 * 2);
    for (let i = 0; i < numPairs; i++) {
      const s0 = inputs[i];
      const s1 = inputs[i + 1];
      out.writeInt16LE(s0, i * 6);
      out.writeInt16LE(Math.round((s0 * 2 + s1) / 3), i * 6 + 2);
      out.writeInt16LE(Math.round((s0 + s1 * 2) / 3), i * 6 + 4);
    }
    return out;
  }

  /**
   * Base64 PCM-16-LE 24 kHz → mulaw 8 kHz Buffer. Used by the WS
   * translation shim on each `response.output_audio.delta`. The stateful
   * resamplers are created lazily and reused across all deltas of this
   * session so their state carries across chunk boundaries.
   *
   * @param {string} deltaB64 Base64-encoded PCM16-LE at 24 kHz.
   * @returns {Buffer} mulaw 8 kHz bytes (possibly zero-length).
   */
  transcodeOutboundPcm24ToMulaw8Buffer(deltaB64) {
    if (!this.outboundResampler24To16) {
      this.outboundResampler24To16 = new StatefulResampler({ srcRate: 24e3, dstRate: 16e3 });
      this.outboundResampler16To8 = new StatefulResampler({ srcRate: 16e3, dstRate: 8e3 });
    }
    const pcm24 = Buffer.from(deltaB64, "base64");
    const pcm16 = this.outboundResampler24To16.process(pcm24);
    const pcm8 = this.outboundResampler16To8.process(pcm16);
    if (pcm8.length === 0) return Buffer.alloc(0);
    return pcm16ToMulaw(pcm8);
  }

  /**
   * GA-API variant of {@link OpenAIRealtimeAdapter.sendFirstMessage}. Two
   * differences from the v1 path:
   *
   * 1. The v1 implementation sends `response.modalities`, which the GA
   *    endpoint rejects with `Unknown parameter: 'response.modalities'`.
   *    `output_modalities` is used to match the GA `session.update` shape.
   *
   * 2. The GA `response.create` does NOT inherit `audio.output.voice`
   *    from the session, so the configured voice is re-injected here to
   *    keep the first-message voice consistent with the rest of the call.
   *
   * Does nothing when no socket exists.
   *
   * @param {string} text Sentence the assistant should speak first.
   * @returns {Promise<void>}
   */
  async sendFirstMessage(text) {
    this.ws?.send(JSON.stringify({
      type: "response.create",
      response: {
        output_modalities: ["audio"],
        audio: { output: { voice: this.voice } },
        reasoning: { effort: "minimal" },
        instructions: `Say exactly the following sentence as your first turn and nothing else: "${text}"`
      }
    }));
  }
};
|
|
1273
|
-
|
|
1274
32
|
// src/providers/elevenlabs-convai.ts
|
|
1275
33
|
init_esm_shims();
|
|
1276
|
-
import
|
|
34
|
+
import WebSocket from "ws";
|
|
1277
35
|
var ELEVENLABS_CONVAI_URL = "wss://api.elevenlabs.io/v1/convai/conversation";
|
|
1278
36
|
var ELEVENLABS_SIGNED_URL = "https://api.elevenlabs.io/v1/convai/conversation/get-signed-url";
|
|
1279
37
|
var AGENT_SILENCE_MS = 500;
|
|
@@ -1395,8 +153,8 @@ var ElevenLabsConvAIAdapter = class _ElevenLabsConvAIAdapter {
|
|
|
1395
153
|
wsUrl = this.agentId ? `${ELEVENLABS_CONVAI_URL}?agent_id=${encodeURIComponent(this.agentId)}` : ELEVENLABS_CONVAI_URL;
|
|
1396
154
|
wsOptions = { headers: { "xi-api-key": this.apiKey } };
|
|
1397
155
|
}
|
|
1398
|
-
this.ws = new
|
|
1399
|
-
await new Promise((
|
|
156
|
+
this.ws = new WebSocket(wsUrl, wsOptions);
|
|
157
|
+
await new Promise((resolve2, reject) => {
|
|
1400
158
|
const timeout = setTimeout(
|
|
1401
159
|
() => reject(new Error("ElevenLabs ConvAI connect timeout")),
|
|
1402
160
|
15e3
|
|
@@ -1420,7 +178,7 @@ var ElevenLabsConvAIAdapter = class _ElevenLabsConvAIAdapter {
|
|
|
1420
178
|
conversation_config_override: override
|
|
1421
179
|
};
|
|
1422
180
|
this.ws.send(JSON.stringify(config));
|
|
1423
|
-
|
|
181
|
+
resolve2();
|
|
1424
182
|
});
|
|
1425
183
|
this.ws.once("error", (err) => {
|
|
1426
184
|
clearTimeout(timeout);
|
|
@@ -1457,7 +215,7 @@ var ElevenLabsConvAIAdapter = class _ElevenLabsConvAIAdapter {
|
|
|
1457
215
|
}
|
|
1458
216
|
respondToPing(eventId, delayMs) {
|
|
1459
217
|
const send = () => {
|
|
1460
|
-
if (!this.ws || this.ws.readyState !==
|
|
218
|
+
if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;
|
|
1461
219
|
try {
|
|
1462
220
|
this.ws.send(JSON.stringify({ type: "pong", event_id: eventId }));
|
|
1463
221
|
} catch (err) {
|
|
@@ -1554,7 +312,7 @@ var ElevenLabsConvAIAdapter = class _ElevenLabsConvAIAdapter {
|
|
|
1554
312
|
}
|
|
1555
313
|
/** Send a caller-side audio chunk to ConvAI as a base64 `user_audio_chunk`. */
|
|
1556
314
|
sendAudio(audioBytes) {
|
|
1557
|
-
if (!this.ws || this.ws.readyState !==
|
|
315
|
+
if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;
|
|
1558
316
|
this.ws.send(
|
|
1559
317
|
JSON.stringify({
|
|
1560
318
|
user_audio_chunk: audioBytes.toString("base64")
|
|
@@ -1577,20 +335,20 @@ var ElevenLabsConvAIAdapter = class _ElevenLabsConvAIAdapter {
|
|
|
1577
335
|
return;
|
|
1578
336
|
}
|
|
1579
337
|
const ws = this.ws;
|
|
1580
|
-
this.closePromise = new Promise((
|
|
1581
|
-
if (ws.readyState ===
|
|
1582
|
-
|
|
338
|
+
this.closePromise = new Promise((resolve2) => {
|
|
339
|
+
if (ws.readyState === WebSocket.CLOSED || ws.readyState === WebSocket.CLOSING) {
|
|
340
|
+
resolve2();
|
|
1583
341
|
return;
|
|
1584
342
|
}
|
|
1585
343
|
const done = () => {
|
|
1586
|
-
|
|
344
|
+
resolve2();
|
|
1587
345
|
};
|
|
1588
346
|
ws.once("close", done);
|
|
1589
347
|
ws.once("error", done);
|
|
1590
348
|
try {
|
|
1591
349
|
ws.close();
|
|
1592
350
|
} catch {
|
|
1593
|
-
|
|
351
|
+
resolve2();
|
|
1594
352
|
}
|
|
1595
353
|
});
|
|
1596
354
|
try {
|
|
@@ -1614,6 +372,8 @@ async function createTTS(agent) {
|
|
|
1614
372
|
|
|
1615
373
|
// src/pricing.ts
|
|
1616
374
|
init_esm_shims();
|
|
375
|
+
var PRICING_VERSION = "2026.3";
|
|
376
|
+
var PRICING_LAST_UPDATED = "2026-05-08";
|
|
1617
377
|
var PricingUnit = {
|
|
1618
378
|
MINUTE: "minute",
|
|
1619
379
|
THOUSAND_CHARS: "1k_chars",
|
|
@@ -2063,8 +823,28 @@ function calculateTelephonyCost(provider2, durationSeconds, pricing) {
|
|
|
2063
823
|
// src/dashboard/store.ts
|
|
2064
824
|
init_esm_shims();
|
|
2065
825
|
import { EventEmitter } from "events";
|
|
826
|
+
import * as fs2 from "fs";
|
|
827
|
+
import * as path2 from "path";
|
|
828
|
+
|
|
829
|
+
// src/version.ts
|
|
830
|
+
init_esm_shims();
|
|
2066
831
|
import * as fs from "fs";
|
|
2067
832
|
import * as path from "path";
|
|
833
|
+
/**
 * Read the `version` field from the `package.json` located one directory
 * above `__dirname`.
 *
 * @returns {string} The version when it is a non-empty string; otherwise
 *   (including any read/parse failure) the empty string.
 */
function readVersion() {
  let version = "";
  try {
    const manifestPath = path.resolve(__dirname, "..", "package.json");
    const manifest = JSON.parse(fs.readFileSync(manifestPath, "utf8"));
    const candidate = manifest.version;
    if (typeof candidate === "string" && candidate.length > 0) {
      version = candidate;
    }
  } catch {
    // Best effort: a missing or unreadable manifest yields "".
  }
  return version;
}
// Resolved once at module load.
var VERSION = readVersion();
|
|
843
|
+
|
|
844
|
+
// src/dashboard/store.ts
|
|
845
|
+
/**
 * Accessor for the module-level `VERSION` value.
 *
 * @returns {string} The current value of `VERSION`.
 */
function sdkVersion() {
  const current = VERSION;
  return current;
}
|
|
2068
848
|
var MetricsStore = class extends EventEmitter {
|
|
2069
849
|
maxCalls;
|
|
2070
850
|
calls = [];
|
|
@@ -2347,15 +1127,15 @@ var MetricsStore = class extends EventEmitter {
|
|
|
2347
1127
|
persistDeletedIds() {
|
|
2348
1128
|
if (this.deletedIdsPath === null) return;
|
|
2349
1129
|
try {
|
|
2350
|
-
const dir =
|
|
2351
|
-
|
|
1130
|
+
const dir = path2.dirname(this.deletedIdsPath);
|
|
1131
|
+
fs2.mkdirSync(dir, { recursive: true });
|
|
2352
1132
|
const tmp = this.deletedIdsPath + ".tmp";
|
|
2353
1133
|
const payload = {
|
|
2354
1134
|
version: 1,
|
|
2355
1135
|
deleted_call_ids: Array.from(this.deletedCallIds).sort()
|
|
2356
1136
|
};
|
|
2357
|
-
|
|
2358
|
-
|
|
1137
|
+
fs2.writeFileSync(tmp, JSON.stringify(payload, null, 2), "utf8");
|
|
1138
|
+
fs2.renameSync(tmp, this.deletedIdsPath);
|
|
2359
1139
|
} catch (err) {
|
|
2360
1140
|
getLogger().debug(
|
|
2361
1141
|
`MetricsStore.persistDeletedIds: ${String(err)}`
|
|
@@ -2388,7 +1168,8 @@ var MetricsStore = class extends EventEmitter {
|
|
|
2388
1168
|
avg_duration: 0,
|
|
2389
1169
|
avg_latency_ms: 0,
|
|
2390
1170
|
cost_breakdown: { stt: 0, tts: 0, llm: 0, telephony: 0 },
|
|
2391
|
-
active_calls: this.activeCalls.size
|
|
1171
|
+
active_calls: this.activeCalls.size,
|
|
1172
|
+
sdk_version: sdkVersion()
|
|
2392
1173
|
};
|
|
2393
1174
|
}
|
|
2394
1175
|
let totalCost = 0;
|
|
@@ -2427,7 +1208,8 @@ var MetricsStore = class extends EventEmitter {
|
|
|
2427
1208
|
llm: Math.round(costLlm * 1e6) / 1e6,
|
|
2428
1209
|
telephony: Math.round(costTel * 1e6) / 1e6
|
|
2429
1210
|
},
|
|
2430
|
-
active_calls: this.activeCalls.size
|
|
1211
|
+
active_calls: this.activeCalls.size,
|
|
1212
|
+
sdk_version: sdkVersion()
|
|
2431
1213
|
};
|
|
2432
1214
|
}
|
|
2433
1215
|
/**
|
|
@@ -2463,11 +1245,11 @@ var MetricsStore = class extends EventEmitter {
|
|
|
2463
1245
|
*/
|
|
2464
1246
|
hydrate(logRoot) {
|
|
2465
1247
|
if (!logRoot) return 0;
|
|
2466
|
-
const deletedIdsPath =
|
|
1248
|
+
const deletedIdsPath = path2.join(logRoot, ".deleted_call_ids.json");
|
|
2467
1249
|
this.deletedIdsPath = deletedIdsPath;
|
|
2468
|
-
if (
|
|
1250
|
+
if (fs2.existsSync(deletedIdsPath)) {
|
|
2469
1251
|
try {
|
|
2470
|
-
const raw =
|
|
1252
|
+
const raw = fs2.readFileSync(deletedIdsPath, "utf8");
|
|
2471
1253
|
const payload = JSON.parse(raw);
|
|
2472
1254
|
const arr = Array.isArray(payload.deleted_call_ids) ? payload.deleted_call_ids : [];
|
|
2473
1255
|
for (const cid of arr) {
|
|
@@ -2481,19 +1263,19 @@ var MetricsStore = class extends EventEmitter {
|
|
|
2481
1263
|
);
|
|
2482
1264
|
}
|
|
2483
1265
|
}
|
|
2484
|
-
const callsRoot =
|
|
2485
|
-
if (!
|
|
1266
|
+
const callsRoot = path2.join(logRoot, "calls");
|
|
1267
|
+
if (!fs2.existsSync(callsRoot)) return 0;
|
|
2486
1268
|
const collected = [];
|
|
2487
1269
|
const seen = new Set(this.calls.map((c) => c.call_id));
|
|
2488
1270
|
const walk = (dir, depth) => {
|
|
2489
1271
|
let entries;
|
|
2490
1272
|
try {
|
|
2491
|
-
entries =
|
|
1273
|
+
entries = fs2.readdirSync(dir, { withFileTypes: true });
|
|
2492
1274
|
} catch {
|
|
2493
1275
|
return;
|
|
2494
1276
|
}
|
|
2495
1277
|
for (const entry of entries) {
|
|
2496
|
-
const childPath =
|
|
1278
|
+
const childPath = path2.join(dir, entry.name);
|
|
2497
1279
|
if (depth < 3) {
|
|
2498
1280
|
if (entry.isDirectory() && /^\d+$/.test(entry.name)) {
|
|
2499
1281
|
walk(childPath, depth + 1);
|
|
@@ -2501,10 +1283,10 @@ var MetricsStore = class extends EventEmitter {
|
|
|
2501
1283
|
continue;
|
|
2502
1284
|
}
|
|
2503
1285
|
if (!entry.isDirectory()) continue;
|
|
2504
|
-
const metadataPath =
|
|
2505
|
-
if (!
|
|
1286
|
+
const metadataPath = path2.join(childPath, "metadata.json");
|
|
1287
|
+
if (!fs2.existsSync(metadataPath)) continue;
|
|
2506
1288
|
try {
|
|
2507
|
-
const raw =
|
|
1289
|
+
const raw = fs2.readFileSync(metadataPath, "utf8");
|
|
2508
1290
|
const meta = JSON.parse(raw);
|
|
2509
1291
|
const callId = meta.call_id || entry.name;
|
|
2510
1292
|
if (!callId || seen.has(callId)) continue;
|
|
@@ -2517,7 +1299,7 @@ var MetricsStore = class extends EventEmitter {
|
|
|
2517
1299
|
}
|
|
2518
1300
|
if (!record.transcript || record.transcript.length === 0) {
|
|
2519
1301
|
const fromJsonl = loadTranscriptJsonl(
|
|
2520
|
-
|
|
1302
|
+
path2.join(childPath, "transcript.jsonl")
|
|
2521
1303
|
);
|
|
2522
1304
|
if (fromJsonl.length > 0) record.transcript = fromJsonl;
|
|
2523
1305
|
}
|
|
@@ -2596,8 +1378,8 @@ function metadataToCallRecord(callId, meta) {
|
|
|
2596
1378
|
}
|
|
2597
1379
|
function loadTranscriptJsonl(filePath) {
|
|
2598
1380
|
try {
|
|
2599
|
-
if (!
|
|
2600
|
-
const raw =
|
|
1381
|
+
if (!fs2.existsSync(filePath)) return [];
|
|
1382
|
+
const raw = fs2.readFileSync(filePath, "utf8");
|
|
2601
1383
|
const lines = raw.split("\n").filter((l) => l.trim().length > 0);
|
|
2602
1384
|
const out = [];
|
|
2603
1385
|
for (const line of lines) {
|
|
@@ -2731,7 +1513,7 @@ function csvEscape(value) {
|
|
|
2731
1513
|
|
|
2732
1514
|
// src/dashboard/ui.ts
|
|
2733
1515
|
init_esm_shims();
|
|
2734
|
-
import { readFileSync as
|
|
1516
|
+
import { readFileSync as readFileSync3 } from "fs";
|
|
2735
1517
|
import { join as join2, dirname as dirname2 } from "path";
|
|
2736
1518
|
var FALLBACK_HTML = `<!doctype html>
|
|
2737
1519
|
<html><head><meta charset="utf-8"><title>Patter dashboard</title></head>
|
|
@@ -2748,9 +1530,9 @@ function loadDashboardHtml() {
|
|
|
2748
1530
|
join2(here, "dashboard", "ui.html"),
|
|
2749
1531
|
join2(here, "..", "dashboard", "ui.html")
|
|
2750
1532
|
];
|
|
2751
|
-
for (const
|
|
1533
|
+
for (const path4 of candidates) {
|
|
2752
1534
|
try {
|
|
2753
|
-
return
|
|
1535
|
+
return readFileSync3(path4, "utf8");
|
|
2754
1536
|
} catch {
|
|
2755
1537
|
}
|
|
2756
1538
|
}
|
|
@@ -3047,8 +1829,8 @@ var RemoteMessageHandler = class {
|
|
|
3047
1829
|
"WebSocket URL uses unencrypted ws:// \u2014 call transcripts and phone numbers will be sent in plaintext. Use wss:// in production."
|
|
3048
1830
|
);
|
|
3049
1831
|
}
|
|
3050
|
-
const { WebSocket:
|
|
3051
|
-
const ws = new
|
|
1832
|
+
const { WebSocket: WebSocket3 } = await import("ws");
|
|
1833
|
+
const ws = new WebSocket3(url);
|
|
3052
1834
|
const chunks = [];
|
|
3053
1835
|
let done = false;
|
|
3054
1836
|
let error = null;
|
|
@@ -3102,10 +1884,10 @@ var RemoteMessageHandler = class {
|
|
|
3102
1884
|
}
|
|
3103
1885
|
});
|
|
3104
1886
|
try {
|
|
3105
|
-
await new Promise((
|
|
1887
|
+
await new Promise((resolve2, reject) => {
|
|
3106
1888
|
ws.on("open", () => {
|
|
3107
1889
|
ws.send(JSON.stringify(data));
|
|
3108
|
-
|
|
1890
|
+
resolve2();
|
|
3109
1891
|
});
|
|
3110
1892
|
ws.on("error", (err) => {
|
|
3111
1893
|
reject(err);
|
|
@@ -3115,11 +1897,11 @@ var RemoteMessageHandler = class {
|
|
|
3115
1897
|
yield chunks.shift();
|
|
3116
1898
|
}
|
|
3117
1899
|
while (!done && !error) {
|
|
3118
|
-
const text = await new Promise((
|
|
1900
|
+
const text = await new Promise((resolve2) => {
|
|
3119
1901
|
if (chunks.length > 0) {
|
|
3120
|
-
|
|
1902
|
+
resolve2(chunks.shift());
|
|
3121
1903
|
} else {
|
|
3122
|
-
resolveNext =
|
|
1904
|
+
resolveNext = resolve2;
|
|
3123
1905
|
}
|
|
3124
1906
|
});
|
|
3125
1907
|
if (text === null) break;
|
|
@@ -3146,7 +1928,7 @@ init_esm_shims();
|
|
|
3146
1928
|
|
|
3147
1929
|
// src/providers/deepgram-stt.ts
|
|
3148
1930
|
init_esm_shims();
|
|
3149
|
-
import
|
|
1931
|
+
import WebSocket2 from "ws";
|
|
3150
1932
|
|
|
3151
1933
|
// src/errors.ts
|
|
3152
1934
|
init_esm_shims();
|
|
@@ -3327,8 +2109,8 @@ var DeepgramSTT = class _DeepgramSTT {
|
|
|
3327
2109
|
const url = `${DEEPGRAM_WS_URL}?${params.toString()}`;
|
|
3328
2110
|
let ws = null;
|
|
3329
2111
|
try {
|
|
3330
|
-
ws = await new Promise((
|
|
3331
|
-
const sock = new
|
|
2112
|
+
ws = await new Promise((resolve2, reject) => {
|
|
2113
|
+
const sock = new WebSocket2(url, {
|
|
3332
2114
|
headers: { Authorization: `Token ${this.apiKey}` }
|
|
3333
2115
|
});
|
|
3334
2116
|
const timer = setTimeout(() => {
|
|
@@ -3340,7 +2122,7 @@ var DeepgramSTT = class _DeepgramSTT {
|
|
|
3340
2122
|
}, 5e3);
|
|
3341
2123
|
sock.once("open", () => {
|
|
3342
2124
|
clearTimeout(timer);
|
|
3343
|
-
|
|
2125
|
+
resolve2(sock);
|
|
3344
2126
|
});
|
|
3345
2127
|
sock.once("error", (err) => {
|
|
3346
2128
|
clearTimeout(timer);
|
|
@@ -3367,11 +2149,11 @@ var DeepgramSTT = class _DeepgramSTT {
|
|
|
3367
2149
|
}
|
|
3368
2150
|
async openSocket() {
|
|
3369
2151
|
const url = this.buildUrl();
|
|
3370
|
-
const ws = new
|
|
2152
|
+
const ws = new WebSocket2(url, {
|
|
3371
2153
|
headers: { Authorization: `Token ${this.apiKey}` }
|
|
3372
2154
|
});
|
|
3373
2155
|
this.ws = ws;
|
|
3374
|
-
await new Promise((
|
|
2156
|
+
await new Promise((resolve2, reject) => {
|
|
3375
2157
|
let settled = false;
|
|
3376
2158
|
const settle = (fn) => {
|
|
3377
2159
|
if (settled) return;
|
|
@@ -3383,7 +2165,7 @@ var DeepgramSTT = class _DeepgramSTT {
|
|
|
3383
2165
|
() => settle(() => reject(new PatterConnectionError("Deepgram connect timeout"))),
|
|
3384
2166
|
1e4
|
|
3385
2167
|
);
|
|
3386
|
-
ws.once("open", () => settle(
|
|
2168
|
+
ws.once("open", () => settle(resolve2));
|
|
3387
2169
|
ws.once("error", (err) => settle(() => reject(err)));
|
|
3388
2170
|
ws.once("unexpected-response", (_req, res) => {
|
|
3389
2171
|
const status = res?.statusCode ?? 0;
|
|
@@ -3404,7 +2186,7 @@ var DeepgramSTT = class _DeepgramSTT {
|
|
|
3404
2186
|
ws.on("close", (code, reason) => this.handleClose(code, reason.toString()));
|
|
3405
2187
|
ws.on("error", (err) => this.handleError(err));
|
|
3406
2188
|
this.keepaliveTimer = setInterval(() => {
|
|
3407
|
-
if (this.ws && this.ws.readyState ===
|
|
2189
|
+
if (this.ws && this.ws.readyState === WebSocket2.OPEN) {
|
|
3408
2190
|
try {
|
|
3409
2191
|
this.ws.send(JSON.stringify({ type: "KeepAlive" }));
|
|
3410
2192
|
} catch {
|
|
@@ -3523,7 +2305,7 @@ var DeepgramSTT = class _DeepgramSTT {
|
|
|
3523
2305
|
}
|
|
3524
2306
|
/** Send a binary audio chunk to Deepgram for transcription. */
|
|
3525
2307
|
sendAudio(audio) {
|
|
3526
|
-
if (!this.ws || this.ws.readyState !==
|
|
2308
|
+
if (!this.ws || this.ws.readyState !== WebSocket2.OPEN) {
|
|
3527
2309
|
this.audioDroppedCount++;
|
|
3528
2310
|
if (this.audioDroppedCount === 1 || this.audioDroppedCount % 50 === 0) {
|
|
3529
2311
|
getLogger().info(
|
|
@@ -3572,7 +2354,7 @@ var DeepgramSTT = class _DeepgramSTT {
|
|
|
3572
2354
|
*/
|
|
3573
2355
|
finalize() {
|
|
3574
2356
|
const ws = this.ws;
|
|
3575
|
-
if (!ws || ws.readyState !==
|
|
2357
|
+
if (!ws || ws.readyState !== WebSocket2.OPEN) {
|
|
3576
2358
|
getLogger().info(
|
|
3577
2359
|
`[DIAG] DeepgramSTT.finalize SKIPPED (ws state=${ws?.readyState ?? "null"})`
|
|
3578
2360
|
);
|
|
@@ -3593,7 +2375,7 @@ var DeepgramSTT = class _DeepgramSTT {
|
|
|
3593
2375
|
if (!ws) return;
|
|
3594
2376
|
this.ws = null;
|
|
3595
2377
|
const sendSafe = (payload) => {
|
|
3596
|
-
if (ws.readyState ===
|
|
2378
|
+
if (ws.readyState === WebSocket2.OPEN) {
|
|
3597
2379
|
try {
|
|
3598
2380
|
ws.send(payload);
|
|
3599
2381
|
} catch {
|
|
@@ -3607,7 +2389,7 @@ var DeepgramSTT = class _DeepgramSTT {
|
|
|
3607
2389
|
} catch {
|
|
3608
2390
|
}
|
|
3609
2391
|
};
|
|
3610
|
-
if (ws.readyState !==
|
|
2392
|
+
if (ws.readyState !== WebSocket2.OPEN) {
|
|
3611
2393
|
finishClose();
|
|
3612
2394
|
return;
|
|
3613
2395
|
}
|
|
@@ -3676,6 +2458,21 @@ var CallMetricsAccumulator = class {
|
|
|
3676
2458
|
_bargeinStoppedAt = null;
|
|
3677
2459
|
_turnUserText = "";
|
|
3678
2460
|
_turnSttAudioSeconds = 0;
|
|
2461
|
+
/**
|
|
2462
|
+
* Guard against the recordTurnInterrupted / recordTurnComplete race.
|
|
2463
|
+
*
|
|
2464
|
+
* A VAD-path barge-in fires ``recordTurnInterrupted`` synchronously
|
|
2465
|
+
* inside ``handleAudioAsync`` while the in-flight pipeline LLM stream
|
|
2466
|
+
* keeps unwinding on its own task. When the LLM stream eventually
|
|
2467
|
+
* exits, the existing pipeline path falls through to
|
|
2468
|
+
* ``recordTurnComplete``, which would push a second turn for the same
|
|
2469
|
+
* logical exchange (this time carrying ``user_text=''`` because the
|
|
2470
|
+
* field was already reset). ``_turnAlreadyClosed`` is flipped by
|
|
2471
|
+
* ``recordTurnInterrupted`` and read by ``recordTurnComplete`` so the
|
|
2472
|
+
* late ``recordTurnComplete`` becomes a no-op until the next
|
|
2473
|
+
* ``startTurn`` re-arms the accumulator.
|
|
2474
|
+
*/
|
|
2475
|
+
_turnAlreadyClosed = false;
|
|
3679
2476
|
// Cumulative usage counters
|
|
3680
2477
|
_totalSttAudioSeconds = 0;
|
|
3681
2478
|
_totalTtsCharacters = 0;
|
|
@@ -3773,6 +2570,7 @@ var CallMetricsAccumulator = class {
|
|
|
3773
2570
|
this._bargeinStoppedAt = null;
|
|
3774
2571
|
this._turnUserText = "";
|
|
3775
2572
|
this._turnSttAudioSeconds = 0;
|
|
2573
|
+
this._turnAlreadyClosed = false;
|
|
3776
2574
|
this._vadStoppedAt = null;
|
|
3777
2575
|
this._sttFinalAt = null;
|
|
3778
2576
|
this._turnCommittedAt = null;
|
|
@@ -3929,8 +2727,18 @@ var CallMetricsAccumulator = class {
|
|
|
3929
2727
|
recordTtsStopped(ts) {
|
|
3930
2728
|
this._bargeinStoppedAt = ts ?? hrTimeMs();
|
|
3931
2729
|
}
|
|
3932
|
-
/**
|
|
2730
|
+
/**
|
|
2731
|
+
* Close the current turn cleanly and append a `TurnMetrics` record.
|
|
2732
|
+
*
|
|
2733
|
+
* Returns ``null`` when ``recordTurnInterrupted`` has already closed
|
|
2734
|
+
* the current turn — this protects against the VAD-barge-in /
|
|
2735
|
+
* pipeline-LLM race where both paths try to finalise the same logical
|
|
2736
|
+
* turn and the second would otherwise push a phantom entry with
|
|
2737
|
+
* ``user_text=''``. The caller treats ``null`` as "nothing to emit";
|
|
2738
|
+
* ``emitTurnMetrics`` is already null-safe.
|
|
2739
|
+
*/
|
|
3933
2740
|
recordTurnComplete(agentText) {
|
|
2741
|
+
if (this._turnAlreadyClosed) return null;
|
|
3934
2742
|
const latency = this._computeTurnLatency();
|
|
3935
2743
|
const turn = {
|
|
3936
2744
|
turn_index: this._turns.length,
|
|
@@ -3943,13 +2751,23 @@ var CallMetricsAccumulator = class {
|
|
|
3943
2751
|
};
|
|
3944
2752
|
this._turns.push(turn);
|
|
3945
2753
|
this._resetTurnState();
|
|
2754
|
+
this._turnAlreadyClosed = true;
|
|
3946
2755
|
this._eventBus?.emit("turn_ended", { callId: this.callId, turn });
|
|
3947
2756
|
this._eventBus?.emit("metrics_collected", { callId: this.callId, turn });
|
|
3948
2757
|
return turn;
|
|
3949
2758
|
}
|
|
3950
|
-
/**
|
|
2759
|
+
/**
|
|
2760
|
+
* Close the current turn as interrupted (barge-in) and return the
|
|
2761
|
+
* recorded metrics. Returns ``null`` when no turn is open, OR when
|
|
2762
|
+
* ``recordTurnComplete`` has already finalised the current turn —
|
|
2763
|
+
* bidirectional parity with the guard at the top of
|
|
2764
|
+
* ``recordTurnComplete``. Prevents an out-of-order interruption (e.g.
|
|
2765
|
+
* a future refactor that reorders the bargein + LLM-unwind paths)
|
|
2766
|
+
* from overwriting a turn that the complete path already emitted.
|
|
2767
|
+
*/
|
|
3951
2768
|
recordTurnInterrupted() {
|
|
3952
2769
|
if (this._turnStart === null) return null;
|
|
2770
|
+
if (this._turnAlreadyClosed) return null;
|
|
3953
2771
|
const latency = this._computeTurnLatency();
|
|
3954
2772
|
const turn = {
|
|
3955
2773
|
turn_index: this._turns.length,
|
|
@@ -3964,6 +2782,7 @@ var CallMetricsAccumulator = class {
|
|
|
3964
2782
|
this._eventBus?.emit("turn_ended", { callId: this.callId, turn });
|
|
3965
2783
|
this._eventBus?.emit("metrics_collected", { callId: this.callId, turn });
|
|
3966
2784
|
this._resetTurnState();
|
|
2785
|
+
this._turnAlreadyClosed = true;
|
|
3967
2786
|
this._turnCommittedMono = null;
|
|
3968
2787
|
this._endpointSignalAt = null;
|
|
3969
2788
|
return turn;
|
|
@@ -5252,7 +4071,23 @@ var HALLUCINATIONS = /* @__PURE__ */ new Set([
|
|
|
5252
4071
|
".",
|
|
5253
4072
|
"bye",
|
|
5254
4073
|
"right",
|
|
5255
|
-
"cool"
|
|
4074
|
+
"cool",
|
|
4075
|
+
// Whisper YouTube-caption hallucinations
|
|
4076
|
+
"thank you for watching",
|
|
4077
|
+
"thanks for watching",
|
|
4078
|
+
"thank you for watching!",
|
|
4079
|
+
"thanks for watching!",
|
|
4080
|
+
"thank you so much for watching",
|
|
4081
|
+
"thanks for listening",
|
|
4082
|
+
"please subscribe",
|
|
4083
|
+
"subscribe",
|
|
4084
|
+
"music",
|
|
4085
|
+
"[music]",
|
|
4086
|
+
"\u266A",
|
|
4087
|
+
"[no audio]",
|
|
4088
|
+
"[silence]",
|
|
4089
|
+
"[blank_audio]",
|
|
4090
|
+
"(silence)"
|
|
5256
4091
|
]);
|
|
5257
4092
|
var StreamHandler = class _StreamHandler {
|
|
5258
4093
|
deps;
|
|
@@ -5378,13 +4213,17 @@ var StreamHandler = class _StreamHandler {
|
|
|
5378
4213
|
* Same as the AEC variant but for deployments where AEC is OFF
|
|
5379
4214
|
* (default on PSTN — Twilio/Telnyx). Without an adaptive filter to
|
|
5380
4215
|
* converge, the only justification for a gate is anti-flicker on
|
|
5381
|
-
* micro-events (cough, click). 100
|
|
5382
|
-
*
|
|
5383
|
-
* the
|
|
5384
|
-
*
|
|
5385
|
-
*
|
|
5386
|
-
|
|
5387
|
-
|
|
4216
|
+
* micro-events (cough, click). Raised 100 → 500 ms on 2026-05-19
|
|
4217
|
+
* after the 0.6.2 acceptance run showed a phantom VAD speech_start
|
|
4218
|
+
* firing on the very first inbound frame (~500 ms into the call,
|
|
4219
|
+
* which is past a 100 ms gate). The phantom barge-in cancelled the
|
|
4220
|
+
* prewarmed firstMessage, the user heard a clipped, scratchy
|
|
4221
|
+
* audio fragment, and the SDK left ``_turnAlreadyClosed=true`` so
|
|
4222
|
+
* subsequent ``recordTurnComplete`` calls were no-ops. 500 ms
|
|
4223
|
+
* filters those phantoms while still letting a real interruption
|
|
4224
|
+
* land within half a second of agent onset.
|
|
4225
|
+
*/
|
|
4226
|
+
static MIN_AGENT_SPEAKING_MS_BEFORE_BARGE_IN_NO_AEC = 500;
|
|
5388
4227
|
/** Handle for the pending grace-period timer, so it can be cleared on cleanup. */
|
|
5389
4228
|
graceTimer = null;
|
|
5390
4229
|
/**
|
|
@@ -5424,30 +4263,12 @@ var StreamHandler = class _StreamHandler {
|
|
|
5424
4263
|
* coexist without name collisions even when firstMessage finishes while
|
|
5425
4264
|
* a Realtime turn is still streaming.
|
|
5426
4265
|
*/
|
|
5427
|
-
firstMessageMarkCounter
|
|
5428
|
-
|
|
5429
|
-
|
|
5430
|
-
|
|
5431
|
-
|
|
5432
|
-
|
|
5433
|
-
* — vs. ~2-5 s with the previous burst-send code, which was the
|
|
5434
|
-
* root cause of "firstMessage non interrompibile". Higher values
|
|
5435
|
-
* smooth playback under jittery RTT (each mark echo adds ~150-250 ms
|
|
5436
|
-
* RTT on PSTN) at the cost of longer barge-in latency; lower values
|
|
5437
|
-
* risk under-buffering. 3 hit the smallest barge-in cap without
|
|
5438
|
-
* audible gaps in 2026-05 acceptance.
|
|
5439
|
-
*/
|
|
5440
|
-
static FIRST_MESSAGE_MARK_WINDOW = 3;
|
|
5441
|
-
/**
|
|
5442
|
-
* Per-chunk soft timeout (ms) while awaiting a mark echo. Twilio's
|
|
5443
|
-
* mark echoes typically arrive within 100-250 ms of audio playback.
|
|
5444
|
-
* Capping at 500 ms guards against carriers (or test doubles) that
|
|
5445
|
-
* never echo — without it a stalled echo would deadlock the loop and
|
|
5446
|
-
* the agent would freeze mid-utterance. On timeout we drop the
|
|
5447
|
-
* waiter from the queue and continue: playout may glitch by one
|
|
5448
|
-
* chunk but the call stays alive.
|
|
5449
|
-
*/
|
|
5450
|
-
static MARK_AWAIT_TIMEOUT_MS = 500;
|
|
4266
|
+
// firstMessageMarkCounter / FIRST_MESSAGE_MARK_WINDOW /
|
|
4267
|
+
// MARK_AWAIT_TIMEOUT_MS were retired with the move to the "trust
|
|
4268
|
+
// Twilio's FIFO" model (sendPacedFirstMessageBytes no longer emits marks).
|
|
4269
|
+
// Marks are still consumed via ``onMark`` for any adapter that wants
|
|
4270
|
+
// to round-trip one, but the firstMessage path no longer back-pressures
|
|
4271
|
+
// on them.
|
|
5451
4272
|
/**
|
|
5452
4273
|
* Minimum drain window (ms) between a ``cancelSpeaking`` and the next
|
|
5453
4274
|
* ``beginSpeaking``. 150 ms covers a typical PSTN jitter buffer drain
|
|
@@ -5512,6 +4333,14 @@ var StreamHandler = class _StreamHandler {
|
|
|
5512
4333
|
} catch {
|
|
5513
4334
|
}
|
|
5514
4335
|
}
|
|
4336
|
+
const ttsCancelable = this.tts;
|
|
4337
|
+
if (typeof ttsCancelable?.cancelActiveStream === "function") {
|
|
4338
|
+
try {
|
|
4339
|
+
ttsCancelable.cancelActiveStream();
|
|
4340
|
+
} catch (err) {
|
|
4341
|
+
getLogger().debug(`TTS cancelActiveStream raised: ${String(err)}`);
|
|
4342
|
+
}
|
|
4343
|
+
}
|
|
5515
4344
|
}
|
|
5516
4345
|
/**
|
|
5517
4346
|
* Resolve every entry in ``pendingMarks`` and empty the queue. Idempotent
|
|
@@ -5528,56 +4357,19 @@ var StreamHandler = class _StreamHandler {
|
|
|
5528
4357
|
}
|
|
5529
4358
|
this.pendingMarks.length = 0;
|
|
5530
4359
|
}
|
|
4360
|
+
// Mark-based back-pressure (sendMarkAwaitable / waitForMarkWindow)
|
|
4361
|
+
// was removed when sendPacedFirstMessageBytes switched to the
|
|
4362
|
+
// "trust Twilio's FIFO" model — see that method's doc comment for
|
|
4363
|
+
// rationale. ``pendingMarks`` and ``onMark`` are still kept so an
|
|
4364
|
+
// adapter that wants to round-trip a mark for some other purpose can
|
|
4365
|
+
// still do so without breaking the firstMessage path.
|
|
5531
4366
|
/**
|
|
5532
|
-
*
|
|
5533
|
-
*
|
|
5534
|
-
*
|
|
5535
|
-
*
|
|
5536
|
-
*
|
|
5537
|
-
|
|
5538
|
-
sendMarkAwaitable() {
|
|
5539
|
-
if (this.deps.bridge.telephonyProvider !== "twilio") return null;
|
|
5540
|
-
this.firstMessageMarkCounter += 1;
|
|
5541
|
-
const markName = `fm_${this.firstMessageMarkCounter}`;
|
|
5542
|
-
let resolve;
|
|
5543
|
-
const promise = new Promise((r) => {
|
|
5544
|
-
resolve = r;
|
|
5545
|
-
});
|
|
5546
|
-
this.pendingMarks.push({ name: markName, resolve, promise });
|
|
5547
|
-
try {
|
|
5548
|
-
this.deps.bridge.sendMark(this.ws, markName, this.streamSid);
|
|
5549
|
-
} catch (err) {
|
|
5550
|
-
getLogger().debug(`sendMark failed (${markName}): ${String(err)}`);
|
|
5551
|
-
const idx = this.pendingMarks.findIndex((m) => m.name === markName);
|
|
5552
|
-
if (idx >= 0) this.pendingMarks.splice(idx, 1);
|
|
5553
|
-
return Promise.resolve();
|
|
5554
|
-
}
|
|
5555
|
-
return promise;
|
|
5556
|
-
}
|
|
5557
|
-
/**
|
|
5558
|
-
* If the in-flight mark queue is at or above ``FIRST_MESSAGE_MARK_WINDOW``
|
|
5559
|
-
* entries, wait for the oldest entry to clear (mark echoed, agent
|
|
5560
|
-
* cancelled, or per-mark timeout). Repeats until the queue depth is
|
|
5561
|
-
* within the window — under high RTT the carrier may have several
|
|
5562
|
-
* marks queued and we want every loop iteration to be naturally back-
|
|
5563
|
-
* pressured by playback.
|
|
5564
|
-
*/
|
|
5565
|
-
async waitForMarkWindow() {
|
|
5566
|
-
while (this.isSpeaking && this.pendingMarks.length >= _StreamHandler.FIRST_MESSAGE_MARK_WINDOW) {
|
|
5567
|
-
const oldest = this.pendingMarks[0];
|
|
5568
|
-
const timeout = new Promise(
|
|
5569
|
-
(resolve) => setTimeout(resolve, _StreamHandler.MARK_AWAIT_TIMEOUT_MS)
|
|
5570
|
-
);
|
|
5571
|
-
await Promise.race([oldest.promise, timeout]);
|
|
5572
|
-
if (this.pendingMarks[0] === oldest) {
|
|
5573
|
-
this.pendingMarks.shift();
|
|
5574
|
-
}
|
|
5575
|
-
}
|
|
5576
|
-
}
|
|
5577
|
-
/**
|
|
5578
|
-
* Bytes-per-millisecond for a 16 kHz PCM16 mono stream. Used by the
|
|
5579
|
-
* non-Twilio firstMessage pacing path to translate chunk size into a
|
|
5580
|
-
* playout-duration sleep. 16000 samples/sec × 2 bytes = 32 bytes/ms.
|
|
4367
|
+
* Bytes-per-millisecond for a 16 kHz PCM16 mono stream. Used by
|
|
4368
|
+
* ``sendPacedFirstMessageBytes`` to translate chunk size into a
|
|
4369
|
+
* playout-duration sleep so we never deliver faster than the carrier
|
|
4370
|
+
* can decode + play out (which manifested as severe crackling on the
|
|
4371
|
+
* HTTP-TTS path with client-side resampling). 16000 samples/sec × 2
|
|
4372
|
+
* bytes/sample = 32 bytes/ms.
|
|
5581
4373
|
*/
|
|
5582
4374
|
static PCM16_16K_BYTES_PER_MS = 32;
|
|
5583
4375
|
/** Cancel and clear the pending grace timer, if any. */
|
|
@@ -6015,7 +4807,7 @@ var StreamHandler = class _StreamHandler {
|
|
|
6015
4807
|
if (activeVad && !this.vadDisabled) {
|
|
6016
4808
|
try {
|
|
6017
4809
|
const vadPromise = activeVad.processFrame(pcm16k, 16e3);
|
|
6018
|
-
const timeoutPromise = new Promise((
|
|
4810
|
+
const timeoutPromise = new Promise((resolve2) => setTimeout(() => resolve2(null), 25));
|
|
6019
4811
|
const evt = await Promise.race([vadPromise, timeoutPromise]);
|
|
6020
4812
|
if (evt) {
|
|
6021
4813
|
getLogger().info(
|
|
@@ -6151,9 +4943,21 @@ var StreamHandler = class _StreamHandler {
|
|
|
6151
4943
|
/** Handle call stop / stream end. */
|
|
6152
4944
|
/** Handle a carrier-emitted `stop` event signalling the call has ended. */
|
|
6153
4945
|
async handleStop() {
|
|
4946
|
+
if (this.llmAbort !== null) {
|
|
4947
|
+
try {
|
|
4948
|
+
this.llmAbort.abort();
|
|
4949
|
+
} catch {
|
|
4950
|
+
}
|
|
4951
|
+
}
|
|
4952
|
+
const ttsCancelable = this.tts;
|
|
4953
|
+
if (typeof ttsCancelable?.cancelActiveStream === "function") {
|
|
4954
|
+
try {
|
|
4955
|
+
ttsCancelable.cancelActiveStream();
|
|
4956
|
+
} catch {
|
|
4957
|
+
}
|
|
4958
|
+
}
|
|
6154
4959
|
this.clearPendingBargeIn();
|
|
6155
4960
|
this.drainPendingMarks();
|
|
6156
|
-
this.firstMessageMarkCounter = 0;
|
|
6157
4961
|
this.clearGraceTimer();
|
|
6158
4962
|
this.flushResamplers();
|
|
6159
4963
|
await this.closeSttOnce();
|
|
@@ -6166,9 +4970,21 @@ var StreamHandler = class _StreamHandler {
|
|
|
6166
4970
|
/** Handle WebSocket close event. */
|
|
6167
4971
|
/** Tear down adapter, STT/TTS, and per-call state when the carrier WebSocket closes. */
|
|
6168
4972
|
async handleWsClose() {
|
|
4973
|
+
if (this.llmAbort !== null) {
|
|
4974
|
+
try {
|
|
4975
|
+
this.llmAbort.abort();
|
|
4976
|
+
} catch {
|
|
4977
|
+
}
|
|
4978
|
+
}
|
|
4979
|
+
const ttsCancelable = this.tts;
|
|
4980
|
+
if (typeof ttsCancelable?.cancelActiveStream === "function") {
|
|
4981
|
+
try {
|
|
4982
|
+
ttsCancelable.cancelActiveStream();
|
|
4983
|
+
} catch {
|
|
4984
|
+
}
|
|
4985
|
+
}
|
|
6169
4986
|
this.clearPendingBargeIn();
|
|
6170
4987
|
this.drainPendingMarks();
|
|
6171
|
-
this.firstMessageMarkCounter = 0;
|
|
6172
4988
|
this.clearGraceTimer();
|
|
6173
4989
|
this.flushResamplers();
|
|
6174
4990
|
await this.closeSttOnce();
|
|
@@ -6207,13 +5023,39 @@ var StreamHandler = class _StreamHandler {
|
|
|
6207
5023
|
* Maintains a 1-byte carry across calls so unaligned HTTP chunks from
|
|
6208
5024
|
* streaming TTS providers never byte-swap the PCM16 samples downstream.
|
|
6209
5025
|
*/
|
|
6210
|
-
encodePipelineAudio(
|
|
6211
|
-
|
|
5026
|
+
encodePipelineAudio(audioChunk) {
|
|
5027
|
+
if (this.ttsOutputFormatNativeForCarrier === true) {
|
|
5028
|
+
return audioChunk.toString("base64");
|
|
5029
|
+
}
|
|
5030
|
+
const aligned = this.alignPcm16(audioChunk);
|
|
6212
5031
|
if (aligned.length === 0) return "";
|
|
6213
5032
|
const pcm8k = this.outboundResampler.process(aligned);
|
|
6214
5033
|
const mulaw = pcm16ToMulaw(pcm8k);
|
|
6215
5034
|
return mulaw.toString("base64");
|
|
6216
5035
|
}
|
|
5036
|
+
/**
|
|
5037
|
+
* Cached result of ``isTtsOutputFormatNativeForCarrier()`` — settled
|
|
5038
|
+
* once at ``initPipeline`` time after ``setTelephonyCarrier`` has run
|
|
5039
|
+
* on the TTS adapter. Stable for the call lifetime: changes to the
|
|
5040
|
+
* adapter's output format mid-call would NOT flip this. ``true`` means
|
|
5041
|
+
* ``encodePipelineAudio`` can take the bypass path.
|
|
5042
|
+
*/
|
|
5043
|
+
ttsOutputFormatNativeForCarrier = false;
|
|
5044
|
+
/**
|
|
5045
|
+
* Probe whether the TTS adapter is configured to emit bytes already in
|
|
5046
|
+
* the carrier's wire codec. Currently: Twilio expects ``ulaw_8000``,
|
|
5047
|
+
* Telnyx expects ``pcm_16000`` (no client transcode in either case if
|
|
5048
|
+
* matched). Anything else takes the resample-and-encode path.
|
|
5049
|
+
*/
|
|
5050
|
+
isTtsOutputFormatNativeForCarrier() {
|
|
5051
|
+
if (!this.tts) return false;
|
|
5052
|
+
const fmt = this.tts.outputFormat;
|
|
5053
|
+
if (typeof fmt !== "string") return false;
|
|
5054
|
+
const carrier = this.deps.bridge.telephonyProvider;
|
|
5055
|
+
if (carrier === "twilio") return fmt === "ulaw_8000";
|
|
5056
|
+
if (carrier === "telnyx") return fmt === "pcm_16000";
|
|
5057
|
+
return false;
|
|
5058
|
+
}
|
|
6217
5059
|
/**
|
|
6218
5060
|
* Prepend any carry byte from the previous chunk, return the even-length
|
|
6219
5061
|
* portion, and stash the final odd byte (if any) for the next call.
|
|
@@ -6224,17 +5066,11 @@ var StreamHandler = class _StreamHandler {
|
|
|
6224
5066
|
this.ttsByteCarry = alignedLen < combined.length ? combined.subarray(alignedLen) : null;
|
|
6225
5067
|
return combined.subarray(0, alignedLen);
|
|
6226
5068
|
}
|
|
6227
|
-
/**
|
|
6228
|
-
* 40 ms @ 16 kHz mono PCM16 = 1280 bytes. Sized to mirror the smallest
|
|
6229
|
-
* live-TTS chunk boundary so cancel granularity (mark/clear bookkeeping)
|
|
6230
|
-
* is identical regardless of whether the firstMessage came from the
|
|
6231
|
-
* prewarm cache or a live ``tts.synthesizeStream`` stream.
|
|
6232
|
-
*/
|
|
6233
|
-
static PREWARM_CHUNK_BYTES = 1280;
|
|
6234
5069
|
/**
|
|
6235
5070
|
* Stream a cached firstMessage buffer in pacing-friendly chunks.
|
|
6236
5071
|
*
|
|
6237
|
-
* Splits ``prewarmBytes`` into
|
|
5072
|
+
* Splits ``prewarmBytes`` into 20 ms slices (matching Twilio's PSTN
|
|
5073
|
+
* frame quantum) and
|
|
6238
5074
|
* forwards each through ``deps.bridge.sendAudio`` exactly like the
|
|
6239
5075
|
* live TTS path does — preserving Twilio mark/clear granularity. A
|
|
6240
5076
|
* single multi-second sendAudio call would push the whole intro into
|
|
@@ -6250,7 +5086,7 @@ var StreamHandler = class _StreamHandler {
|
|
|
6250
5086
|
return this.sendPacedFirstMessageBytes(prewarmBytes);
|
|
6251
5087
|
}
|
|
6252
5088
|
/**
|
|
6253
|
-
* Iterate ``bytes``
|
|
5089
|
+
* Iterate ``bytes`` in 20 ms slices (Twilio PSTN frame quantum) and
|
|
6254
5090
|
* forward each via ``deps.bridge.sendAudio`` with mark-gated pacing
|
|
6255
5091
|
* (Twilio) or playout-time-based pacing (Telnyx). Caps the carrier-
|
|
6256
5092
|
* side buffer at ``FIRST_MESSAGE_MARK_WINDOW`` chunks so a barge-in's
|
|
@@ -6267,30 +5103,20 @@ var StreamHandler = class _StreamHandler {
|
|
|
6267
5103
|
*/
|
|
6268
5104
|
async sendPacedFirstMessageBytes(bytes) {
|
|
6269
5105
|
if (this.pendingMarks.length > 0) this.drainPendingMarks();
|
|
6270
|
-
this.firstMessageMarkCounter = 0;
|
|
6271
5106
|
let firstChunkSent = false;
|
|
6272
|
-
|
|
6273
|
-
|
|
6274
|
-
|
|
6275
|
-
|
|
5107
|
+
const PSTN_FRAME_MS = 20;
|
|
5108
|
+
const bytesPerMs = this.ttsOutputFormatNativeForCarrier ? 8 : _StreamHandler.PCM16_16K_BYTES_PER_MS;
|
|
5109
|
+
const sliceBytes = bytesPerMs * PSTN_FRAME_MS;
|
|
5110
|
+
for (let i = 0; i < bytes.length; i += sliceBytes) {
|
|
6276
5111
|
if (!this.isSpeaking) break;
|
|
6277
|
-
const chunk = bytes.subarray(i, i +
|
|
5112
|
+
const chunk = bytes.subarray(i, i + sliceBytes);
|
|
6278
5113
|
if (!firstChunkSent) firstChunkSent = true;
|
|
6279
|
-
if (this.aec
|
|
5114
|
+
if (this.aec && !this.ttsOutputFormatNativeForCarrier) {
|
|
5115
|
+
this.aec.pushFarEnd(chunk);
|
|
5116
|
+
}
|
|
6280
5117
|
const encoded = this.encodePipelineAudio(chunk);
|
|
6281
5118
|
this.deps.bridge.sendAudio(this.ws, encoded, this.streamSid);
|
|
6282
5119
|
this.markFirstAudioSent();
|
|
6283
|
-
const markPromise = this.sendMarkAwaitable();
|
|
6284
|
-
if (!initialFillComplete && this.pendingMarks.length >= _StreamHandler.FIRST_MESSAGE_MARK_WINDOW) {
|
|
6285
|
-
initialFillComplete = true;
|
|
6286
|
-
}
|
|
6287
|
-
if (markPromise === null || initialFillComplete) {
|
|
6288
|
-
const playoutMs = Math.max(
|
|
6289
|
-
1,
|
|
6290
|
-
Math.floor(chunk.length / _StreamHandler.PCM16_16K_BYTES_PER_MS)
|
|
6291
|
-
);
|
|
6292
|
-
await new Promise((resolve) => setTimeout(resolve, playoutMs));
|
|
6293
|
-
}
|
|
6294
5120
|
}
|
|
6295
5121
|
return firstChunkSent;
|
|
6296
5122
|
}
|
|
@@ -6310,6 +5136,12 @@ var StreamHandler = class _StreamHandler {
|
|
|
6310
5136
|
getLogger().debug(`TTS setTelephonyCarrier failed (${label}): ${String(e)}`);
|
|
6311
5137
|
}
|
|
6312
5138
|
}
|
|
5139
|
+
this.ttsOutputFormatNativeForCarrier = this.isTtsOutputFormatNativeForCarrier();
|
|
5140
|
+
if (this.ttsOutputFormatNativeForCarrier) {
|
|
5141
|
+
getLogger().debug(
|
|
5142
|
+
`TTS outputFormat matches ${this.deps.bridge.telephonyProvider} wire codec \u2014 bypassing client-side transcode`
|
|
5143
|
+
);
|
|
5144
|
+
}
|
|
6313
5145
|
}
|
|
6314
5146
|
if (!this.stt) {
|
|
6315
5147
|
getLogger().debug(`Pipeline mode (${label}): no STT configured`);
|
|
@@ -6319,7 +5151,7 @@ var StreamHandler = class _StreamHandler {
|
|
|
6319
5151
|
}
|
|
6320
5152
|
if (!this.deps.agent.vad) {
|
|
6321
5153
|
try {
|
|
6322
|
-
const { SileroVAD } = await import("./silero-vad-
|
|
5154
|
+
const { SileroVAD } = await import("./silero-vad-LNDFGIY7.mjs");
|
|
6323
5155
|
this.autoVad = await SileroVAD.forPhoneCall();
|
|
6324
5156
|
getLogger().info(
|
|
6325
5157
|
`auto-VAD enabled (SileroVAD, phone preset). Pass agent.vad=\u2026 to override.`
|
|
@@ -7012,16 +5844,49 @@ var StreamHandler = class _StreamHandler {
|
|
|
7012
5844
|
async initRealtimeAdapter(resolvedPrompt) {
|
|
7013
5845
|
const label = this.deps.bridge.label;
|
|
7014
5846
|
this.adapter = this.deps.buildAIAdapter(resolvedPrompt);
|
|
7015
|
-
|
|
7016
|
-
|
|
7017
|
-
getLogger().debug(`AI adapter connected (${label})`);
|
|
7018
|
-
} catch (e) {
|
|
7019
|
-
getLogger().error(`AI adapter connect FAILED (${label}):`, e);
|
|
5847
|
+
let parked;
|
|
5848
|
+
if (typeof this.deps.popPrewarmedConnections === "function") {
|
|
7020
5849
|
try {
|
|
7021
|
-
|
|
7022
|
-
} catch {
|
|
5850
|
+
parked = this.deps.popPrewarmedConnections(this.callId);
|
|
5851
|
+
} catch (err) {
|
|
5852
|
+
getLogger().debug(`popPrewarmedConnections raised: ${String(err)}`);
|
|
5853
|
+
}
|
|
5854
|
+
}
|
|
5855
|
+
const parkedRealtimeWs = parked?.openaiRealtime;
|
|
5856
|
+
let adoptOk = false;
|
|
5857
|
+
if (parkedRealtimeWs !== void 0) {
|
|
5858
|
+
const adapterAny = this.adapter;
|
|
5859
|
+
const wsAlive = parkedRealtimeWs.readyState === 1;
|
|
5860
|
+
if (typeof adapterAny?.adoptWebSocket === "function" && wsAlive) {
|
|
5861
|
+
try {
|
|
5862
|
+
adapterAny.adoptWebSocket(parkedRealtimeWs);
|
|
5863
|
+
getLogger().info(
|
|
5864
|
+
`[CONNECT] callId=${this.callId} provider=openai_realtime source=adopted ms=0`
|
|
5865
|
+
);
|
|
5866
|
+
adoptOk = true;
|
|
5867
|
+
} catch (err) {
|
|
5868
|
+
getLogger().debug(`Realtime adoptWebSocket failed: ${String(err)}; falling back`);
|
|
5869
|
+
}
|
|
5870
|
+
}
|
|
5871
|
+
if (!adoptOk) {
|
|
5872
|
+
try {
|
|
5873
|
+
parkedRealtimeWs.close();
|
|
5874
|
+
} catch {
|
|
5875
|
+
}
|
|
5876
|
+
}
|
|
5877
|
+
}
|
|
5878
|
+
if (!adoptOk) {
|
|
5879
|
+
try {
|
|
5880
|
+
await this.adapter.connect();
|
|
5881
|
+
getLogger().debug(`AI adapter connected (${label})`);
|
|
5882
|
+
} catch (e) {
|
|
5883
|
+
getLogger().error(`AI adapter connect FAILED (${label}):`, e);
|
|
5884
|
+
try {
|
|
5885
|
+
await this.deps.bridge.endCall(this.callId, this.ws);
|
|
5886
|
+
} catch {
|
|
5887
|
+
}
|
|
5888
|
+
return;
|
|
7023
5889
|
}
|
|
7024
|
-
return;
|
|
7025
5890
|
}
|
|
7026
5891
|
if (this.deps.agent.firstMessage) {
|
|
7027
5892
|
this.metricsAcc.startTurn();
|
|
@@ -7141,8 +6006,21 @@ var StreamHandler = class _StreamHandler {
|
|
|
7141
6006
|
await this.emitUserSpeechEnded();
|
|
7142
6007
|
}
|
|
7143
6008
|
async onAdapterTranscriptInput(inputText) {
|
|
6009
|
+
const stripped = inputText.trim().toLowerCase();
|
|
6010
|
+
if (HALLUCINATIONS.has(stripped) || stripped === "") {
|
|
6011
|
+
getLogger().debug(
|
|
6012
|
+
`Realtime transcript_input dropped (likely Whisper hallucination on silence/echo): ${sanitizeLogValue(inputText.slice(0, 60))}`
|
|
6013
|
+
);
|
|
6014
|
+
this.userTranscriptPending = false;
|
|
6015
|
+
return;
|
|
6016
|
+
}
|
|
7144
6017
|
getLogger().debug(`User (${this.deps.bridge.label}): ${sanitizeLogValue(inputText)}`);
|
|
7145
6018
|
this.history.push({ role: "user", text: inputText, timestamp: Date.now() });
|
|
6019
|
+
if (this.adapter instanceof OpenAIRealtimeAdapter) {
|
|
6020
|
+
void this.adapter.requestResponse().catch(
|
|
6021
|
+
(err) => getLogger().debug(`Realtime requestResponse failed: ${String(err)}`)
|
|
6022
|
+
);
|
|
6023
|
+
}
|
|
7146
6024
|
if (!this.metricsAcc.turnActive) {
|
|
7147
6025
|
this.metricsAcc.startTurn();
|
|
7148
6026
|
this.currentAgentText = "";
|
|
@@ -7294,6 +6172,18 @@ var StreamHandler = class _StreamHandler {
|
|
|
7294
6172
|
await this.flushAssistantTurn(text);
|
|
7295
6173
|
}
|
|
7296
6174
|
async onAdapterSpeechInterrupt() {
|
|
6175
|
+
if (this.adapter instanceof OpenAIRealtimeAdapter) {
|
|
6176
|
+
const startedAt = this.adapter.currentResponseFirstAudioAt;
|
|
6177
|
+
if (startedAt !== null) {
|
|
6178
|
+
const elapsedMs = Date.now() - startedAt;
|
|
6179
|
+
if (elapsedMs < _StreamHandler.MIN_AGENT_SPEAKING_MS_BEFORE_BARGE_IN_NO_AEC) {
|
|
6180
|
+
getLogger().info(
|
|
6181
|
+
`Realtime barge-in suppressed (response < gate, ${elapsedMs}ms)`
|
|
6182
|
+
);
|
|
6183
|
+
return;
|
|
6184
|
+
}
|
|
6185
|
+
}
|
|
6186
|
+
}
|
|
7297
6187
|
this.deps.bridge.sendClear(this.ws, this.streamSid);
|
|
7298
6188
|
if (this.adapter instanceof OpenAIRealtimeAdapter) this.adapter.cancelResponse();
|
|
7299
6189
|
this.metricsAcc.recordTurnInterrupted();
|
|
@@ -7529,31 +6419,31 @@ async function queryDeepgramCost(metricsAcc, deepgramKey, deepgramRequestId) {
|
|
|
7529
6419
|
// src/services/call-log.ts
|
|
7530
6420
|
init_esm_shims();
|
|
7531
6421
|
import * as crypto3 from "crypto";
|
|
7532
|
-
import * as
|
|
6422
|
+
import * as fs3 from "fs";
|
|
7533
6423
|
import { promises as fsp } from "fs";
|
|
7534
6424
|
import * as os from "os";
|
|
7535
|
-
import * as
|
|
6425
|
+
import * as path3 from "path";
|
|
7536
6426
|
var SCHEMA_VERSION = "1.0";
|
|
7537
6427
|
var DEFAULT_RETENTION_DAYS = 30;
|
|
7538
6428
|
function xdgDataHome() {
|
|
7539
|
-
return process.env.XDG_DATA_HOME ||
|
|
6429
|
+
return process.env.XDG_DATA_HOME || path3.join(os.homedir(), ".local", "share");
|
|
7540
6430
|
}
|
|
7541
6431
|
function platformDefaultRoot() {
|
|
7542
6432
|
if (process.platform === "darwin") {
|
|
7543
|
-
return
|
|
6433
|
+
return path3.join(os.homedir(), "Library", "Application Support", "patter");
|
|
7544
6434
|
}
|
|
7545
6435
|
if (process.platform === "win32") {
|
|
7546
6436
|
const localAppData = process.env.LOCALAPPDATA;
|
|
7547
|
-
if (localAppData) return
|
|
7548
|
-
return
|
|
6437
|
+
if (localAppData) return path3.join(localAppData, "patter");
|
|
6438
|
+
return path3.join(os.homedir(), "AppData", "Local", "patter");
|
|
7549
6439
|
}
|
|
7550
|
-
return
|
|
6440
|
+
return path3.join(xdgDataHome(), "patter");
|
|
7551
6441
|
}
|
|
7552
6442
|
function resolveLogRoot(explicit) {
|
|
7553
6443
|
const value = explicit ?? process.env.PATTER_LOG_DIR;
|
|
7554
6444
|
if (!value) return null;
|
|
7555
6445
|
if (value.trim().toLowerCase() === "auto") return platformDefaultRoot();
|
|
7556
|
-
if (value.startsWith("~")) return
|
|
6446
|
+
if (value.startsWith("~")) return path3.join(os.homedir(), value.slice(1));
|
|
7557
6447
|
return value;
|
|
7558
6448
|
}
|
|
7559
6449
|
function retentionDays() {
|
|
@@ -7564,9 +6454,9 @@ function retentionDays() {
|
|
|
7564
6454
|
return Math.max(0, parsed);
|
|
7565
6455
|
}
|
|
7566
6456
|
function redactMode() {
|
|
7567
|
-
const raw = (process.env.PATTER_LOG_REDACT_PHONE || "
|
|
6457
|
+
const raw = (process.env.PATTER_LOG_REDACT_PHONE || "full").trim().toLowerCase();
|
|
7568
6458
|
if (raw === "full" || raw === "mask" || raw === "hash_only") return raw;
|
|
7569
|
-
return "
|
|
6459
|
+
return "full";
|
|
7570
6460
|
}
|
|
7571
6461
|
function redactPhone(raw) {
|
|
7572
6462
|
if (!raw) return "";
|
|
@@ -7582,9 +6472,9 @@ function utcIso(tsSeconds) {
|
|
|
7582
6472
|
return new Date(ms).toISOString();
|
|
7583
6473
|
}
|
|
7584
6474
|
async function atomicWriteJson(filePath, payload) {
|
|
7585
|
-
const dir =
|
|
6475
|
+
const dir = path3.dirname(filePath);
|
|
7586
6476
|
await fsp.mkdir(dir, { recursive: true });
|
|
7587
|
-
const tmp =
|
|
6477
|
+
const tmp = path3.join(dir, `.tmp.${process.pid}.${crypto3.randomBytes(4).toString("hex")}.json`);
|
|
7588
6478
|
try {
|
|
7589
6479
|
const handle = await fsp.open(tmp, "w");
|
|
7590
6480
|
try {
|
|
@@ -7603,7 +6493,7 @@ async function atomicWriteJson(filePath, payload) {
|
|
|
7603
6493
|
}
|
|
7604
6494
|
}
|
|
7605
6495
|
async function appendJsonl(filePath, record) {
|
|
7606
|
-
await fsp.mkdir(
|
|
6496
|
+
await fsp.mkdir(path3.dirname(filePath), { recursive: true });
|
|
7607
6497
|
await fsp.appendFile(filePath, JSON.stringify(record) + "\n", { encoding: "utf8" });
|
|
7608
6498
|
}
|
|
7609
6499
|
var CallLogger = class {
|
|
@@ -7613,9 +6503,9 @@ var CallLogger = class {
|
|
|
7613
6503
|
this.root = null;
|
|
7614
6504
|
return;
|
|
7615
6505
|
}
|
|
7616
|
-
const resolved = root.startsWith("~") ?
|
|
6506
|
+
const resolved = root.startsWith("~") ? path3.join(os.homedir(), root.slice(1)) : root;
|
|
7617
6507
|
try {
|
|
7618
|
-
|
|
6508
|
+
fs3.mkdirSync(resolved, { recursive: true });
|
|
7619
6509
|
this.root = resolved;
|
|
7620
6510
|
getLogger().info(`Call logs: ${resolved}`);
|
|
7621
6511
|
} catch (err) {
|
|
@@ -7637,7 +6527,7 @@ var CallLogger = class {
|
|
|
7637
6527
|
const month = String(dt.getUTCMonth() + 1).padStart(2, "0");
|
|
7638
6528
|
const day = String(dt.getUTCDate()).padStart(2, "0");
|
|
7639
6529
|
const safeId = sanitizeLogValue(callId, 64).replace(/\//g, "_") || "unknown";
|
|
7640
|
-
return
|
|
6530
|
+
return path3.join(this.root, "calls", year, month, day, safeId);
|
|
7641
6531
|
}
|
|
7642
6532
|
/** Write the initial `metadata.json` for a new call. */
|
|
7643
6533
|
async logCallStart(callId, input = {}) {
|
|
@@ -7655,6 +6545,7 @@ var CallLogger = class {
|
|
|
7655
6545
|
status: "in_progress",
|
|
7656
6546
|
caller: redactPhone(input.caller ?? ""),
|
|
7657
6547
|
callee: redactPhone(input.callee ?? ""),
|
|
6548
|
+
direction: input.direction || "inbound",
|
|
7658
6549
|
telephony_provider: input.telephonyProvider ?? "",
|
|
7659
6550
|
provider_mode: input.providerMode ?? "",
|
|
7660
6551
|
agent: input.agent ?? {},
|
|
@@ -7664,7 +6555,7 @@ var CallLogger = class {
|
|
|
7664
6555
|
error: null
|
|
7665
6556
|
};
|
|
7666
6557
|
try {
|
|
7667
|
-
await atomicWriteJson(
|
|
6558
|
+
await atomicWriteJson(path3.join(dir, "metadata.json"), metadata);
|
|
7668
6559
|
} catch (err) {
|
|
7669
6560
|
getLogger().warn(`call_log write failed (${sanitizeLogValue(callId)}): ${sanitizeLogValue(String(err))}`);
|
|
7670
6561
|
}
|
|
@@ -7683,7 +6574,7 @@ var CallLogger = class {
|
|
|
7683
6574
|
...turn
|
|
7684
6575
|
};
|
|
7685
6576
|
try {
|
|
7686
|
-
await appendJsonl(
|
|
6577
|
+
await appendJsonl(path3.join(dir, "transcript.jsonl"), record);
|
|
7687
6578
|
} catch (err) {
|
|
7688
6579
|
getLogger().warn(
|
|
7689
6580
|
`call_log turn write failed (${sanitizeLogValue(callId)}): ${sanitizeLogValue(String(err))}`
|
|
@@ -7702,7 +6593,7 @@ var CallLogger = class {
|
|
|
7702
6593
|
data: payload
|
|
7703
6594
|
};
|
|
7704
6595
|
try {
|
|
7705
|
-
await appendJsonl(
|
|
6596
|
+
await appendJsonl(path3.join(dir, "events.jsonl"), record);
|
|
7706
6597
|
} catch (err) {
|
|
7707
6598
|
getLogger().warn(
|
|
7708
6599
|
`call_log event write failed (${sanitizeLogValue(callId)}): ${sanitizeLogValue(String(err))}`
|
|
@@ -7714,7 +6605,7 @@ var CallLogger = class {
|
|
|
7714
6605
|
if (!this.enabled) return;
|
|
7715
6606
|
const dir = this.callDir(callId);
|
|
7716
6607
|
if (dir === null) return;
|
|
7717
|
-
const metadataPath =
|
|
6608
|
+
const metadataPath = path3.join(dir, "metadata.json");
|
|
7718
6609
|
let existing = {};
|
|
7719
6610
|
try {
|
|
7720
6611
|
existing = JSON.parse(await fsp.readFile(metadataPath, "utf8"));
|
|
@@ -7749,20 +6640,20 @@ var CallLogger = class {
|
|
|
7749
6640
|
const days = retentionDays();
|
|
7750
6641
|
if (days === 0) return;
|
|
7751
6642
|
const cutoff = Date.now() / 1e3 - days * 86400;
|
|
7752
|
-
const callsRoot =
|
|
7753
|
-
if (!
|
|
6643
|
+
const callsRoot = path3.join(this.root, "calls");
|
|
6644
|
+
if (!fs3.existsSync(callsRoot)) return;
|
|
7754
6645
|
try {
|
|
7755
|
-
for (const yearName of
|
|
6646
|
+
for (const yearName of fs3.readdirSync(callsRoot)) {
|
|
7756
6647
|
if (!/^\d+$/.test(yearName)) continue;
|
|
7757
|
-
const yearDir =
|
|
7758
|
-
if (!
|
|
7759
|
-
for (const monthName of
|
|
6648
|
+
const yearDir = path3.join(callsRoot, yearName);
|
|
6649
|
+
if (!fs3.statSync(yearDir).isDirectory()) continue;
|
|
6650
|
+
for (const monthName of fs3.readdirSync(yearDir)) {
|
|
7760
6651
|
if (!/^\d+$/.test(monthName)) continue;
|
|
7761
|
-
const monthDir =
|
|
7762
|
-
if (!
|
|
7763
|
-
for (const dayName of
|
|
6652
|
+
const monthDir = path3.join(yearDir, monthName);
|
|
6653
|
+
if (!fs3.statSync(monthDir).isDirectory()) continue;
|
|
6654
|
+
for (const dayName of fs3.readdirSync(monthDir)) {
|
|
7764
6655
|
if (!/^\d+$/.test(dayName)) continue;
|
|
7765
|
-
const dayDir =
|
|
6656
|
+
const dayDir = path3.join(monthDir, dayName);
|
|
7766
6657
|
const y = Number.parseInt(yearName, 10);
|
|
7767
6658
|
const m = Number.parseInt(monthName, 10);
|
|
7768
6659
|
const d = Number.parseInt(dayName, 10);
|
|
@@ -7772,12 +6663,12 @@ var CallLogger = class {
|
|
|
7772
6663
|
}
|
|
7773
6664
|
}
|
|
7774
6665
|
try {
|
|
7775
|
-
if (
|
|
6666
|
+
if (fs3.readdirSync(monthDir).length === 0) fs3.rmdirSync(monthDir);
|
|
7776
6667
|
} catch {
|
|
7777
6668
|
}
|
|
7778
6669
|
}
|
|
7779
6670
|
try {
|
|
7780
|
-
if (
|
|
6671
|
+
if (fs3.readdirSync(yearDir).length === 0) fs3.rmdirSync(yearDir);
|
|
7781
6672
|
} catch {
|
|
7782
6673
|
}
|
|
7783
6674
|
}
|
|
@@ -7788,19 +6679,19 @@ var CallLogger = class {
|
|
|
7788
6679
|
};
|
|
7789
6680
|
function rmTree(target) {
|
|
7790
6681
|
try {
|
|
7791
|
-
for (const child of
|
|
7792
|
-
const childPath =
|
|
7793
|
-
const stat =
|
|
6682
|
+
for (const child of fs3.readdirSync(target)) {
|
|
6683
|
+
const childPath = path3.join(target, child);
|
|
6684
|
+
const stat = fs3.lstatSync(childPath);
|
|
7794
6685
|
if (stat.isDirectory()) {
|
|
7795
6686
|
rmTree(childPath);
|
|
7796
6687
|
} else {
|
|
7797
6688
|
try {
|
|
7798
|
-
|
|
6689
|
+
fs3.unlinkSync(childPath);
|
|
7799
6690
|
} catch {
|
|
7800
6691
|
}
|
|
7801
6692
|
}
|
|
7802
6693
|
}
|
|
7803
|
-
|
|
6694
|
+
fs3.rmdirSync(target);
|
|
7804
6695
|
} catch {
|
|
7805
6696
|
}
|
|
7806
6697
|
}
|
|
@@ -8097,7 +6988,7 @@ var TELNYX_DTMF_ALLOWED = new Set("0123456789*#ABCDabcdwW");
|
|
|
8097
6988
|
var TELNYX_DTMF_DURATION_MS = 250;
|
|
8098
6989
|
async function sleep(ms) {
|
|
8099
6990
|
if (ms <= 0) return;
|
|
8100
|
-
await new Promise((
|
|
6991
|
+
await new Promise((resolve2) => setTimeout(resolve2, ms));
|
|
8101
6992
|
}
|
|
8102
6993
|
var TelnyxBridge = class {
|
|
8103
6994
|
constructor(config) {
|
|
@@ -8700,7 +7591,7 @@ var EmbeddedServer = class {
|
|
|
8700
7591
|
this.handleTwilioStream(ws, url);
|
|
8701
7592
|
}
|
|
8702
7593
|
});
|
|
8703
|
-
await new Promise((
|
|
7594
|
+
await new Promise((resolve2) => {
|
|
8704
7595
|
const bindHost = process.env.PATTER_BIND_HOST ?? "127.0.0.1";
|
|
8705
7596
|
this.server.listen(port, bindHost, () => {
|
|
8706
7597
|
getLogger().info(`Server on port ${port}`);
|
|
@@ -8722,7 +7613,7 @@ var EmbeddedServer = class {
|
|
|
8722
7613
|
}
|
|
8723
7614
|
console.log("\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n");
|
|
8724
7615
|
}
|
|
8725
|
-
|
|
7616
|
+
resolve2();
|
|
8726
7617
|
});
|
|
8727
7618
|
});
|
|
8728
7619
|
}
|
|
@@ -8765,7 +7656,7 @@ var EmbeddedServer = class {
|
|
|
8765
7656
|
`Telnyx voicemail speak failed: ${speakResp.status} ${(await speakResp.text()).slice(0, 200)}`
|
|
8766
7657
|
);
|
|
8767
7658
|
}
|
|
8768
|
-
await new Promise((
|
|
7659
|
+
await new Promise((resolve2) => setTimeout(resolve2, estimatedMs));
|
|
8769
7660
|
await fetch(`https://api.telnyx.com/v2/calls/${encoded}/actions/hangup`, {
|
|
8770
7661
|
method: "POST",
|
|
8771
7662
|
headers,
|
|
@@ -8836,9 +7727,11 @@ var EmbeddedServer = class {
|
|
|
8836
7727
|
const active = callId ? store.getActive(callId) : void 0;
|
|
8837
7728
|
const resolvedCaller = dataCaller || active?.caller || "";
|
|
8838
7729
|
const resolvedCallee = dataCallee || active?.callee || "";
|
|
7730
|
+
const resolvedDirection = (typeof data.direction === "string" ? data.direction : "") || active?.direction || "inbound";
|
|
8839
7731
|
void logger.logCallStart(callId, {
|
|
8840
7732
|
caller: resolvedCaller,
|
|
8841
7733
|
callee: resolvedCallee,
|
|
7734
|
+
direction: resolvedDirection,
|
|
8842
7735
|
telephonyProvider: bridge.telephonyProvider,
|
|
8843
7736
|
providerMode: agent.provider ?? "",
|
|
8844
7737
|
agent: agentSnapshot()
|
|
@@ -8996,8 +7889,8 @@ var EmbeddedServer = class {
|
|
|
8996
7889
|
*/
|
|
8997
7890
|
async stop() {
|
|
8998
7891
|
if (!this.server) return;
|
|
8999
|
-
const httpClosePromise = new Promise((
|
|
9000
|
-
this.server.close(() =>
|
|
7892
|
+
const httpClosePromise = new Promise((resolve2) => {
|
|
7893
|
+
this.server.close(() => resolve2());
|
|
9001
7894
|
});
|
|
9002
7895
|
const isTelnyx = this.config.telephonyProvider === "telnyx";
|
|
9003
7896
|
for (const [ws, callId] of this.activeCallIds) {
|
|
@@ -9017,15 +7910,15 @@ var EmbeddedServer = class {
|
|
|
9017
7910
|
if (this.activeConnections.size > 0) {
|
|
9018
7911
|
getLogger().info(`Waiting for ${this.activeConnections.size} active connection(s) to close...`);
|
|
9019
7912
|
await Promise.race([
|
|
9020
|
-
new Promise((
|
|
7913
|
+
new Promise((resolve2) => {
|
|
9021
7914
|
const checkInterval = setInterval(() => {
|
|
9022
7915
|
if (this.activeConnections.size === 0) {
|
|
9023
7916
|
clearInterval(checkInterval);
|
|
9024
|
-
|
|
7917
|
+
resolve2();
|
|
9025
7918
|
}
|
|
9026
7919
|
}, 100);
|
|
9027
7920
|
}),
|
|
9028
|
-
new Promise((
|
|
7921
|
+
new Promise((resolve2) => setTimeout(resolve2, GRACEFUL_SHUTDOWN_TIMEOUT_MS))
|
|
9029
7922
|
]);
|
|
9030
7923
|
}
|
|
9031
7924
|
if (this.activeConnections.size > 0) {
|
|
@@ -9782,7 +8675,7 @@ var TestSession = class {
|
|
|
9782
8675
|
input: process.stdin,
|
|
9783
8676
|
output: process.stdout
|
|
9784
8677
|
});
|
|
9785
|
-
const askQuestion = (prompt) => new Promise((
|
|
8678
|
+
const askQuestion = (prompt) => new Promise((resolve2) => rl.question(prompt, resolve2));
|
|
9786
8679
|
try {
|
|
9787
8680
|
while (!ended) {
|
|
9788
8681
|
let userInput;
|
|
@@ -9881,26 +8774,17 @@ export {
|
|
|
9881
8774
|
AuthenticationError,
|
|
9882
8775
|
ProvisionError,
|
|
9883
8776
|
RateLimitError,
|
|
9884
|
-
OpenAIRealtimeAdapter,
|
|
9885
|
-
mulawToPcm16,
|
|
9886
|
-
pcm16ToMulaw,
|
|
9887
|
-
PcmCarry,
|
|
9888
|
-
StatefulResampler,
|
|
9889
|
-
createResampler16kTo8k,
|
|
9890
|
-
createResampler8kTo16k,
|
|
9891
|
-
createResampler24kTo16k,
|
|
9892
|
-
createResampler24kTo8k,
|
|
9893
|
-
resample8kTo16k,
|
|
9894
|
-
resample16kTo8k,
|
|
9895
|
-
resample24kTo16k,
|
|
9896
|
-
OpenAIRealtime2Adapter,
|
|
9897
8777
|
ElevenLabsConvAIAdapter,
|
|
8778
|
+
PRICING_VERSION,
|
|
8779
|
+
PRICING_LAST_UPDATED,
|
|
8780
|
+
PricingUnit,
|
|
9898
8781
|
DEFAULT_PRICING,
|
|
9899
8782
|
mergePricing,
|
|
9900
8783
|
calculateSttCost,
|
|
9901
8784
|
calculateTtsCost,
|
|
9902
8785
|
calculateRealtimeCost,
|
|
9903
8786
|
calculateTelephonyCost,
|
|
8787
|
+
VERSION,
|
|
9904
8788
|
MetricsStore,
|
|
9905
8789
|
makeAuthMiddleware,
|
|
9906
8790
|
callsToCsv,
|
|
@@ -9910,6 +8794,7 @@ export {
|
|
|
9910
8794
|
RemoteMessageHandler,
|
|
9911
8795
|
isRemoteUrl,
|
|
9912
8796
|
isWebSocketUrl,
|
|
8797
|
+
DeepgramModel,
|
|
9913
8798
|
DeepgramSTT,
|
|
9914
8799
|
CallMetricsAccumulator,
|
|
9915
8800
|
SPAN_CALL,
|