getpatter 0.4.4 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -247,242 +247,13 @@ var ElevenLabsConvAIAdapter = class {
247
247
  }
248
248
  };
249
249
 
250
- // src/providers/deepgram-stt.ts
251
- import WebSocket3 from "ws";
252
- var DEEPGRAM_WS_URL = "wss://api.deepgram.com/v1/listen";
253
- var DeepgramSTT = class _DeepgramSTT {
254
- ws = null;
255
- callbacks = [];
256
- /** Request ID from Deepgram — used to query actual cost post-call. */
257
- requestId = "";
258
- apiKey;
259
- language;
260
- model;
261
- encoding;
262
- sampleRate;
263
- endpointingMs;
264
- utteranceEndMs;
265
- smartFormat;
266
- interimResults;
267
- vadEvents;
268
- constructor(apiKey, languageOrOptions, model, encoding, sampleRate, options) {
269
- this.apiKey = apiKey;
270
- const opts = typeof languageOrOptions === "object" && languageOrOptions !== null ? languageOrOptions : options ?? {};
271
- this.language = (typeof languageOrOptions === "string" ? languageOrOptions : opts.language) ?? "en";
272
- this.model = model ?? opts.model ?? "nova-3";
273
- this.encoding = encoding ?? opts.encoding ?? "linear16";
274
- this.sampleRate = sampleRate ?? opts.sampleRate ?? 16e3;
275
- this.endpointingMs = opts.endpointingMs ?? 150;
276
- this.utteranceEndMs = opts.utteranceEndMs === null ? null : opts.utteranceEndMs ?? 1e3;
277
- this.smartFormat = opts.smartFormat ?? true;
278
- this.interimResults = opts.interimResults ?? true;
279
- this.vadEvents = opts.vadEvents ?? true;
280
- }
281
- /** Factory for Twilio calls — mulaw 8 kHz. Forwards tuning options through. */
282
- static forTwilio(apiKey, language = "en", model = "nova-3", options = {}) {
283
- return new _DeepgramSTT(apiKey, language, model, "mulaw", 8e3, options);
284
- }
285
- async connect() {
286
- const params = new URLSearchParams({
287
- model: this.model,
288
- language: this.language,
289
- encoding: this.encoding,
290
- sample_rate: String(this.sampleRate),
291
- channels: "1",
292
- interim_results: this.interimResults ? "true" : "false",
293
- endpointing: String(this.endpointingMs),
294
- smart_format: this.smartFormat ? "true" : "false",
295
- vad_events: this.vadEvents ? "true" : "false",
296
- no_delay: "true"
297
- });
298
- if (this.utteranceEndMs !== null) {
299
- params.set("utterance_end_ms", String(Math.max(this.utteranceEndMs, 1e3)));
300
- }
301
- const url = `${DEEPGRAM_WS_URL}?${params.toString()}`;
302
- this.ws = new WebSocket3(url, {
303
- headers: { Authorization: `Token ${this.apiKey}` }
304
- });
305
- await new Promise((resolve, reject) => {
306
- const timer = setTimeout(() => reject(new Error("Deepgram connect timeout")), 1e4);
307
- this.ws.once("open", () => {
308
- clearTimeout(timer);
309
- resolve();
310
- });
311
- this.ws.once("error", (err) => {
312
- clearTimeout(timer);
313
- reject(err);
314
- });
315
- });
316
- this.ws.on("message", (raw) => {
317
- let data;
318
- try {
319
- data = JSON.parse(raw.toString());
320
- } catch {
321
- return;
322
- }
323
- if (data.type === "Metadata" && data.request_id) {
324
- this.requestId = data.request_id;
325
- return;
326
- }
327
- if (data.type !== "Results") return;
328
- const alternatives = data.channel?.alternatives ?? [];
329
- if (!alternatives.length) return;
330
- const best = alternatives[0];
331
- const text = (best.transcript ?? "").trim();
332
- if (!text) return;
333
- const transcript = {
334
- text,
335
- isFinal: Boolean(data.is_final) || Boolean(data.speech_final),
336
- confidence: best.confidence ?? 0
337
- };
338
- for (const cb of this.callbacks) {
339
- cb(transcript);
340
- }
341
- });
342
- }
343
- sendAudio(audio) {
344
- if (!this.ws || this.ws.readyState !== WebSocket3.OPEN) return;
345
- this.ws.send(audio);
346
- }
347
- onTranscript(callback) {
348
- if (this.callbacks.length >= 10) {
349
- getLogger().warn("DeepgramSTT: maximum of 10 onTranscript callbacks reached; replacing the last callback.");
350
- this.callbacks[this.callbacks.length - 1] = callback;
351
- return;
352
- }
353
- this.callbacks.push(callback);
354
- }
355
- close() {
356
- if (this.ws) {
357
- try {
358
- this.ws.send(JSON.stringify({ type: "CloseStream" }));
359
- } catch {
360
- }
361
- this.ws.close();
362
- this.ws = null;
363
- }
364
- }
365
- };
366
-
367
- // src/providers/whisper-stt.ts
368
- var OPENAI_TRANSCRIPTION_URL = "https://api.openai.com/v1/audio/transcriptions";
369
- var DEFAULT_BUFFER_SIZE = 16e3 * 2;
370
- function wrapPcmInWav(pcm, sampleRate = 16e3, channels = 1, bitsPerSample = 16) {
371
- const dataSize = pcm.length;
372
- const header = Buffer.alloc(44);
373
- header.write("RIFF", 0);
374
- header.writeUInt32LE(36 + dataSize, 4);
375
- header.write("WAVE", 8);
376
- header.write("fmt ", 12);
377
- header.writeUInt32LE(16, 16);
378
- header.writeUInt16LE(1, 20);
379
- header.writeUInt16LE(channels, 22);
380
- header.writeUInt32LE(sampleRate, 24);
381
- header.writeUInt32LE(sampleRate * channels * (bitsPerSample / 8), 28);
382
- header.writeUInt16LE(channels * (bitsPerSample / 8), 32);
383
- header.writeUInt16LE(bitsPerSample, 34);
384
- header.write("data", 36);
385
- header.writeUInt32LE(dataSize, 40);
386
- return Buffer.concat([header, pcm]);
250
+ // src/provider-factory.ts
251
+ async function createSTT(agent) {
252
+ return agent.stt ?? null;
253
+ }
254
+ async function createTTS(agent) {
255
+ return agent.tts ?? null;
387
256
  }
388
- var WhisperSTT = class _WhisperSTT {
389
- apiKey;
390
- model;
391
- language;
392
- bufferSize;
393
- buffer = Buffer.alloc(0);
394
- callbacks = [];
395
- running = false;
396
- pendingTranscriptions = [];
397
- constructor(apiKey, model = "whisper-1", language, bufferSize = DEFAULT_BUFFER_SIZE) {
398
- this.apiKey = apiKey;
399
- this.model = model;
400
- this.language = language;
401
- this.bufferSize = bufferSize;
402
- }
403
- /** Factory for Twilio calls — mulaw 8 kHz is transcoded upstream, so we still receive PCM 16-bit. */
404
- static forTwilio(apiKey, language = "en", model = "whisper-1") {
405
- return new _WhisperSTT(apiKey, model, language);
406
- }
407
- async connect() {
408
- this.running = true;
409
- this.buffer = Buffer.alloc(0);
410
- }
411
- sendAudio(audio) {
412
- if (!this.running) return;
413
- this.buffer = Buffer.concat([this.buffer, audio]);
414
- if (this.buffer.length >= this.bufferSize) {
415
- const pcm = this.buffer;
416
- this.buffer = Buffer.alloc(0);
417
- this.trackTranscription(this.transcribeBuffer(pcm));
418
- }
419
- }
420
- trackTranscription(promise) {
421
- const wrapped = promise.finally(() => {
422
- const idx = this.pendingTranscriptions.indexOf(wrapped);
423
- if (idx !== -1) this.pendingTranscriptions.splice(idx, 1);
424
- });
425
- this.pendingTranscriptions.push(wrapped);
426
- }
427
- onTranscript(callback) {
428
- if (this.callbacks.length >= 10) {
429
- getLogger().warn("WhisperSTT: maximum of 10 onTranscript callbacks reached; replacing the last callback.");
430
- this.callbacks[this.callbacks.length - 1] = callback;
431
- return;
432
- }
433
- this.callbacks.push(callback);
434
- }
435
- async close() {
436
- this.running = false;
437
- if (this.buffer.length >= this.bufferSize / 4) {
438
- const pcm = this.buffer;
439
- this.buffer = Buffer.alloc(0);
440
- this.trackTranscription(this.transcribeBuffer(pcm));
441
- } else {
442
- this.buffer = Buffer.alloc(0);
443
- }
444
- await Promise.allSettled(this.pendingTranscriptions);
445
- this.callbacks = [];
446
- }
447
- // ------------------------------------------------------------------
448
- // Private
449
- // ------------------------------------------------------------------
450
- async transcribeBuffer(pcm) {
451
- const wav = wrapPcmInWav(pcm);
452
- const formData = new FormData();
453
- formData.append("file", new Blob([wav.buffer.slice(wav.byteOffset, wav.byteOffset + wav.byteLength)], { type: "audio/wav" }), "audio.wav");
454
- formData.append("model", this.model);
455
- if (this.language) {
456
- formData.append("language", this.language);
457
- }
458
- try {
459
- const resp = await fetch(OPENAI_TRANSCRIPTION_URL, {
460
- method: "POST",
461
- headers: { Authorization: `Bearer ${this.apiKey}` },
462
- body: formData,
463
- signal: AbortSignal.timeout(15e3)
464
- });
465
- if (!resp.ok) {
466
- const body = await resp.text();
467
- getLogger().error(`WhisperSTT transcription error: ${resp.status} ${body}`);
468
- return;
469
- }
470
- const json = await resp.json();
471
- const text = (json.text ?? "").trim();
472
- if (!text) return;
473
- const transcript = {
474
- text,
475
- isFinal: true,
476
- confidence: 1
477
- };
478
- for (const cb of this.callbacks) {
479
- cb(transcript);
480
- }
481
- } catch (err) {
482
- getLogger().error(`WhisperSTT transcription error: ${String(err)}`);
483
- }
484
- }
485
- };
486
257
 
487
258
  // src/pricing.ts
488
259
  var DEFAULT_PRICING = {
@@ -1855,248 +1626,120 @@ function isWebSocketUrl(url) {
1855
1626
  return url.startsWith("ws://") || url.startsWith("wss://");
1856
1627
  }
1857
1628
 
1858
- // src/providers/elevenlabs-tts.ts
1859
- var ELEVENLABS_BASE_URL = "https://api.elevenlabs.io/v1";
1860
- var ELEVENLABS_VOICE_ID_BY_NAME = {
1861
- rachel: "21m00Tcm4TlvDq8ikWAM",
1862
- drew: "29vD33N1CtxCmqQRPOHJ",
1863
- clyde: "2EiwWnXFnvU5JabPnv8n",
1864
- paul: "5Q0t7uMcjvnagumLfvZi",
1865
- domi: "AZnzlk1XvdvUeBnXmlld",
1866
- dave: "CYw3kZ02Hs0563khs1Fj",
1867
- fin: "D38z5RcWu1voky8WS1ja",
1868
- bella: "EXAVITQu4vr4xnSDxMaL",
1869
- antoni: "ErXwobaYiN019PkySvjV",
1870
- thomas: "GBv7mTt0atIp3Br8iCZE",
1871
- charlie: "IKne3meq5aSn9XLyUdCD",
1872
- george: "JBFqnCBsd6RMkjVDRZzb",
1873
- emily: "LcfcDJNUP1GQjkzn1xUU",
1874
- elli: "MF3mGyEYCl7XYWbV9V6O",
1875
- callum: "N2lVS1w4EtoT3dr4eOWO",
1876
- patrick: "ODq5zmih8GrVes37Dizd",
1877
- harry: "SOYHLrjzK2X1ezoPC6cr",
1878
- liam: "TX3LPaxmHKxFdv7VOQHJ",
1879
- dorothy: "ThT5KcBeYPX3keUQqHPh",
1880
- josh: "TxGEqnHWrfWFTfGW9XjX",
1881
- arnold: "VR6AewLTigWG4xSOukaG",
1882
- charlotte: "XB0fDUnXU5powFXDhCwa",
1883
- matilda: "XrExE9yKIg1WjnnlVkGX",
1884
- matthew: "Yko7PKHZNXotIFUBG7I9",
1885
- james: "ZQe5CZNOzWyzPSCn5a3c",
1886
- joseph: "Zlb1dXrM653N07WRdFW3",
1887
- jeremy: "bVMeCyTHy58xNoL34h3p",
1888
- michael: "flq6f7yk4E4fJM5XTYuZ",
1889
- ethan: "g5CIjZEefAph4nQFvHAz",
1890
- gigi: "jBpfuIE2acCO8z3wKNLl",
1891
- freya: "jsCqWAovK2LkecY7zXl4",
1892
- brian: "nPczCjzI2devNBz1zQrb",
1893
- grace: "oWAxZDx7w5VEj9dCyTzz",
1894
- daniel: "onwK4e9ZLuTAKqWW03F9",
1895
- lily: "pFZP5JQG7iQjIQuC4Bku",
1896
- serena: "pMsXgVXv3BLzUgSXRplE",
1897
- adam: "pNInz6obpgDQGcFmaJgB",
1898
- nicole: "piTKgcLEGmPE4e6mEKli",
1899
- bill: "pqHfZKP75CvOlQylNhV4",
1900
- jessie: "t0jbNlBVZ17f02VDIeMI",
1901
- ryan: "wViXBPUzp2ZZixB1xQuM",
1902
- sam: "yoZ06aMxZJJ28mfd3POQ",
1903
- glinda: "z9fAnlkpzviPz146aGWa",
1904
- giovanni: "zcAOhNBS3c14rBihAFp1",
1905
- mimi: "zrHiDhphv9ZnVXBqCLjz",
1906
- alloy: "21m00Tcm4TlvDq8ikWAM"
1907
- };
1908
- var VOICE_ID_PATTERN = /^[A-Za-z0-9]{20}$/;
1909
- function resolveVoiceId(voice) {
1910
- if (!voice) return voice;
1911
- if (VOICE_ID_PATTERN.test(voice)) return voice;
1912
- return ELEVENLABS_VOICE_ID_BY_NAME[voice.toLowerCase()] ?? voice;
1913
- }
1914
- var ElevenLabsTTS = class {
1915
- constructor(apiKey, voiceId = "21m00Tcm4TlvDq8ikWAM", modelId = "eleven_turbo_v2_5", outputFormat = "pcm_16000") {
1629
+ // src/providers/deepgram-stt.ts
1630
+ import WebSocket3 from "ws";
1631
+ var DEEPGRAM_WS_URL = "wss://api.deepgram.com/v1/listen";
1632
+ var DeepgramSTT = class _DeepgramSTT {
1633
+ ws = null;
1634
+ callbacks = [];
1635
+ /** Request ID from Deepgram — used to query actual cost post-call. */
1636
+ requestId = "";
1637
+ apiKey;
1638
+ language;
1639
+ model;
1640
+ encoding;
1641
+ sampleRate;
1642
+ endpointingMs;
1643
+ utteranceEndMs;
1644
+ smartFormat;
1645
+ interimResults;
1646
+ vadEvents;
1647
+ constructor(apiKey, languageOrOptions, model, encoding, sampleRate, options) {
1916
1648
  this.apiKey = apiKey;
1917
- this.modelId = modelId;
1918
- this.outputFormat = outputFormat;
1919
- this.voiceId = resolveVoiceId(voiceId);
1649
+ const opts = typeof languageOrOptions === "object" && languageOrOptions !== null ? languageOrOptions : options ?? {};
1650
+ this.language = (typeof languageOrOptions === "string" ? languageOrOptions : opts.language) ?? "en";
1651
+ this.model = model ?? opts.model ?? "nova-3";
1652
+ this.encoding = encoding ?? opts.encoding ?? "linear16";
1653
+ this.sampleRate = sampleRate ?? opts.sampleRate ?? 16e3;
1654
+ this.endpointingMs = opts.endpointingMs ?? 150;
1655
+ this.utteranceEndMs = opts.utteranceEndMs === null ? null : opts.utteranceEndMs ?? 1e3;
1656
+ this.smartFormat = opts.smartFormat ?? true;
1657
+ this.interimResults = opts.interimResults ?? true;
1658
+ this.vadEvents = opts.vadEvents ?? true;
1920
1659
  }
1921
- voiceId;
1922
- /**
1923
- * Synthesise text to speech and return the full audio as a single Buffer.
1924
- *
1925
- * For large chunks (or when latency matters) call `synthesizeStream` instead.
1926
- */
1927
- async synthesize(text) {
1928
- const chunks = [];
1929
- for await (const chunk of this.synthesizeStream(text)) {
1930
- chunks.push(chunk);
1931
- }
1932
- return Buffer.concat(chunks);
1660
+ /** Factory for Twilio calls — mulaw 8 kHz. Forwards tuning options through. */
1661
+ static forTwilio(apiKey, language = "en", model = "nova-3", options = {}) {
1662
+ return new _DeepgramSTT(apiKey, language, model, "mulaw", 8e3, options);
1933
1663
  }
1934
- /**
1935
- * Synthesise text and yield audio chunks as they arrive (streaming).
1936
- *
1937
- * The yielded buffers are raw PCM at 16 kHz (or whatever `outputFormat` is
1938
- * configured to).
1939
- */
1940
- async *synthesizeStream(text) {
1941
- const url = `${ELEVENLABS_BASE_URL}/text-to-speech/${encodeURIComponent(this.voiceId)}/stream?output_format=${encodeURIComponent(this.outputFormat)}`;
1942
- const response = await fetch(url, {
1943
- method: "POST",
1944
- headers: {
1945
- "xi-api-key": this.apiKey,
1946
- "Content-Type": "application/json"
1947
- },
1948
- body: JSON.stringify({ text, model_id: this.modelId }),
1949
- signal: AbortSignal.timeout(3e4)
1664
+ async connect() {
1665
+ const params = new URLSearchParams({
1666
+ model: this.model,
1667
+ language: this.language,
1668
+ encoding: this.encoding,
1669
+ sample_rate: String(this.sampleRate),
1670
+ channels: "1",
1671
+ interim_results: this.interimResults ? "true" : "false",
1672
+ endpointing: String(this.endpointingMs),
1673
+ smart_format: this.smartFormat ? "true" : "false",
1674
+ vad_events: this.vadEvents ? "true" : "false",
1675
+ no_delay: "true"
1950
1676
  });
1951
- if (!response.ok) {
1952
- const body = await response.text();
1953
- throw new Error(`ElevenLabs TTS error ${response.status}: ${body}`);
1954
- }
1955
- if (!response.body) {
1956
- throw new Error("ElevenLabs TTS: no response body");
1677
+ if (this.utteranceEndMs !== null) {
1678
+ params.set("utterance_end_ms", String(Math.max(this.utteranceEndMs, 1e3)));
1957
1679
  }
1958
- const reader = response.body.getReader();
1959
- try {
1960
- while (true) {
1961
- const { done, value } = await reader.read();
1962
- if (done) break;
1963
- if (value && value.length > 0) {
1964
- yield Buffer.from(value);
1965
- }
1966
- }
1967
- } finally {
1968
- if (typeof reader.cancel === "function") await reader.cancel().catch(() => {
1680
+ const url = `${DEEPGRAM_WS_URL}?${params.toString()}`;
1681
+ this.ws = new WebSocket3(url, {
1682
+ headers: { Authorization: `Token ${this.apiKey}` }
1683
+ });
1684
+ await new Promise((resolve, reject) => {
1685
+ const timer = setTimeout(() => reject(new Error("Deepgram connect timeout")), 1e4);
1686
+ this.ws.once("open", () => {
1687
+ clearTimeout(timer);
1688
+ resolve();
1689
+ });
1690
+ this.ws.once("error", (err) => {
1691
+ clearTimeout(timer);
1692
+ reject(err);
1969
1693
  });
1970
- reader.releaseLock();
1971
- }
1972
- }
1973
- };
1974
-
1975
- // src/providers/openai-tts.ts
1976
- var OPENAI_TTS_URL = "https://api.openai.com/v1/audio/speech";
1977
- var OpenAITTS = class _OpenAITTS {
1978
- constructor(apiKey, voice = "alloy", model = "tts-1") {
1979
- this.apiKey = apiKey;
1980
- this.voice = voice;
1981
- this.model = model;
1982
- }
1983
- /**
1984
- * Synthesise text to speech and return the full audio as a single Buffer.
1985
- *
1986
- * For large chunks (or when latency matters) call `synthesizeStream` instead.
1987
- */
1988
- async synthesize(text) {
1989
- const chunks = [];
1990
- for await (const chunk of this.synthesizeStream(text)) {
1991
- chunks.push(chunk);
1992
- }
1993
- return Buffer.concat(chunks);
1994
- }
1995
- /**
1996
- * Synthesise text and yield audio chunks as they arrive (streaming).
1997
- *
1998
- * OpenAI returns 24 kHz PCM16; each chunk is resampled to 16 kHz before
1999
- * yielding so the output is ready for telephony pipelines.
2000
- *
2001
- * The resampler carries state (buffered samples + odd trailing byte)
2002
- * between chunks — without that state cross-chunk sample alignment drifts
2003
- * and the caller hears pops / dropped audio (BUG #23, mirror of the
2004
- * Python `audioop.ratecv` fix).
2005
- */
2006
- async *synthesizeStream(text) {
2007
- const response = await fetch(OPENAI_TTS_URL, {
2008
- method: "POST",
2009
- headers: {
2010
- "Authorization": `Bearer ${this.apiKey}`,
2011
- "Content-Type": "application/json"
2012
- },
2013
- body: JSON.stringify({
2014
- model: this.model,
2015
- input: text,
2016
- voice: this.voice,
2017
- response_format: "pcm"
2018
- }),
2019
- signal: AbortSignal.timeout(3e4)
2020
1694
  });
2021
- if (!response.ok) {
2022
- const body = await response.text();
2023
- throw new Error(`OpenAI TTS error ${response.status}: ${body}`);
2024
- }
2025
- if (!response.body) {
2026
- throw new Error("OpenAI TTS: no response body");
2027
- }
2028
- const ctx = { carryByte: null, leftover: [] };
2029
- const reader = response.body.getReader();
2030
- try {
2031
- while (true) {
2032
- const { done, value } = await reader.read();
2033
- if (done) break;
2034
- if (value && value.length > 0) {
2035
- const out = _OpenAITTS.resampleStreaming(Buffer.from(value), ctx);
2036
- if (out.length > 0) yield out;
2037
- }
1695
+ this.ws.on("message", (raw) => {
1696
+ let data;
1697
+ try {
1698
+ data = JSON.parse(raw.toString());
1699
+ } catch {
1700
+ return;
2038
1701
  }
2039
- if (ctx.leftover.length > 0) {
2040
- const tail = Buffer.alloc(ctx.leftover.length * 2);
2041
- for (let i = 0; i < ctx.leftover.length; i++) {
2042
- tail.writeInt16LE(ctx.leftover[i], i * 2);
2043
- }
2044
- yield tail;
1702
+ if (data.type === "Metadata" && data.request_id) {
1703
+ this.requestId = data.request_id;
1704
+ return;
2045
1705
  }
2046
- } finally {
2047
- if (typeof reader.cancel === "function") await reader.cancel().catch(() => {
2048
- });
2049
- reader.releaseLock();
2050
- }
1706
+ if (data.type !== "Results") return;
1707
+ const alternatives = data.channel?.alternatives ?? [];
1708
+ if (!alternatives.length) return;
1709
+ const best = alternatives[0];
1710
+ const text = (best.transcript ?? "").trim();
1711
+ if (!text) return;
1712
+ const transcript = {
1713
+ text,
1714
+ isFinal: Boolean(data.is_final) || Boolean(data.speech_final),
1715
+ confidence: best.confidence ?? 0
1716
+ };
1717
+ for (const cb of this.callbacks) {
1718
+ cb(transcript);
1719
+ }
1720
+ });
2051
1721
  }
2052
- /**
2053
- * Streaming 24 kHz → 16 kHz resampler (PCM16-LE). Maintains cross-chunk
2054
- * state so the 3:2 pattern doesn't reset at every network read.
2055
- */
2056
- static resampleStreaming(audio, ctx) {
2057
- let buf;
2058
- if (ctx.carryByte !== null) {
2059
- buf = Buffer.concat([Buffer.from([ctx.carryByte]), audio]);
2060
- ctx.carryByte = null;
2061
- } else {
2062
- buf = audio;
2063
- }
2064
- if (buf.length % 2 === 1) {
2065
- ctx.carryByte = buf[buf.length - 1];
2066
- buf = buf.subarray(0, buf.length - 1);
2067
- }
2068
- if (buf.length === 0 && ctx.leftover.length === 0) {
2069
- return Buffer.alloc(0);
2070
- }
2071
- const sampleCount = buf.length / 2;
2072
- const samples = ctx.leftover.slice();
2073
- for (let i2 = 0; i2 < sampleCount; i2++) {
2074
- samples.push(buf.readInt16LE(i2 * 2));
2075
- }
2076
- const out = [];
2077
- let i = 0;
2078
- while (i + 2 < samples.length) {
2079
- out.push(samples[i]);
2080
- out.push(Math.trunc((samples[i + 1] + samples[i + 2]) / 2));
2081
- i += 3;
2082
- }
2083
- ctx.leftover = samples.slice(i);
2084
- const buffer = Buffer.alloc(out.length * 2);
2085
- for (let j = 0; j < out.length; j++) {
2086
- buffer.writeInt16LE(out[j], j * 2);
1722
+ sendAudio(audio) {
1723
+ if (!this.ws || this.ws.readyState !== WebSocket3.OPEN) return;
1724
+ this.ws.send(audio);
1725
+ }
1726
+ onTranscript(callback) {
1727
+ if (this.callbacks.length >= 10) {
1728
+ getLogger().warn("DeepgramSTT: maximum of 10 onTranscript callbacks reached; replacing the last callback.");
1729
+ this.callbacks[this.callbacks.length - 1] = callback;
1730
+ return;
2087
1731
  }
2088
- return buffer;
1732
+ this.callbacks.push(callback);
2089
1733
  }
2090
- /** @deprecated use {@link resampleStreaming} with persistent state. */
2091
- static resample24kTo16k(audio) {
2092
- const ctx = { carryByte: null, leftover: [] };
2093
- const out = _OpenAITTS.resampleStreaming(audio, ctx);
2094
- if (ctx.leftover.length === 0) return out;
2095
- const tail = Buffer.alloc(ctx.leftover.length * 2);
2096
- for (let i = 0; i < ctx.leftover.length; i++) {
2097
- tail.writeInt16LE(ctx.leftover[i], i * 2);
1734
+ close() {
1735
+ if (this.ws) {
1736
+ try {
1737
+ this.ws.send(JSON.stringify({ type: "CloseStream" }));
1738
+ } catch {
1739
+ }
1740
+ this.ws.close();
1741
+ this.ws = null;
2098
1742
  }
2099
- return Buffer.concat([out, tail]);
2100
1743
  }
2101
1744
  };
2102
1745
 
@@ -2697,8 +2340,8 @@ var StreamHandler = class {
2697
2340
  this.caller = caller;
2698
2341
  this.callee = callee;
2699
2342
  this.history = createHistoryManager(200);
2700
- const sttProviderName = deps.agent.stt?.provider || (deps.agent.deepgramKey ? "deepgram" : void 0);
2701
- const ttsProviderName = deps.agent.tts?.provider === "elevenlabs" ? "elevenlabs" : deps.agent.tts?.provider === "openai" ? "openai_tts" : deps.agent.elevenlabsKey ? "elevenlabs" : void 0;
2343
+ const sttProviderName = deps.agent.stt ? deps.agent.stt.constructor?.name ?? "custom" : void 0;
2344
+ const ttsProviderName = deps.agent.tts ? deps.agent.tts.constructor?.name ?? "custom" : void 0;
2702
2345
  const providerMode = deps.agent.provider ?? "openai_realtime";
2703
2346
  this.metricsAcc = new CallMetricsAccumulator({
2704
2347
  callId: "",
@@ -2888,17 +2531,8 @@ var StreamHandler = class {
2888
2531
  // ---------------------------------------------------------------------------
2889
2532
  async initPipeline(resolvedPrompt) {
2890
2533
  const label = this.deps.bridge.label;
2891
- this.stt = this.deps.bridge.createStt(this.deps.agent);
2892
- if (this.deps.agent.tts) {
2893
- if (this.deps.agent.tts.provider === "elevenlabs") {
2894
- this.tts = new ElevenLabsTTS(this.deps.agent.tts.apiKey, this.deps.agent.tts.voice ?? "21m00Tcm4TlvDq8ikWAM");
2895
- }
2896
- if (this.deps.agent.tts.provider === "openai") {
2897
- this.tts = new OpenAITTS(this.deps.agent.tts.apiKey, this.deps.agent.tts.voice ?? "alloy");
2898
- }
2899
- } else if (this.deps.agent.elevenlabsKey) {
2900
- this.tts = new ElevenLabsTTS(this.deps.agent.elevenlabsKey, this.deps.agent.voice || "rachel");
2901
- }
2534
+ this.stt = await this.deps.bridge.createStt(this.deps.agent);
2535
+ this.tts = await createTTS(this.deps.agent);
2902
2536
  if (!this.stt) {
2903
2537
  getLogger().info(`Pipeline mode (${label}): no STT configured`);
2904
2538
  }
@@ -3416,10 +3050,11 @@ var StreamHandler = class {
3416
3050
  this.maxDurationTimer = null;
3417
3051
  }
3418
3052
  await this.deps.bridge.queryTelephonyCost(this.metricsAcc, this.callId);
3419
- const deepgramKey = this.deps.agent.deepgramKey;
3420
- const deepgramRequestId = this.stt?.requestId;
3421
- if (deepgramKey && deepgramRequestId) {
3422
- await queryDeepgramCost(this.metricsAcc, deepgramKey, deepgramRequestId);
3053
+ if (this.stt instanceof DeepgramSTT && this.stt.requestId) {
3054
+ const dgKey = this.stt.apiKey;
3055
+ if (dgKey) {
3056
+ await queryDeepgramCost(this.metricsAcc, dgKey, this.stt.requestId);
3057
+ }
3423
3058
  }
3424
3059
  const finalMetrics = this.metricsAcc.endCall();
3425
3060
  const callEndData = {
@@ -3576,11 +3211,16 @@ function resolveVariables(template, variables) {
3576
3211
  return result;
3577
3212
  }
3578
3213
  function buildAIAdapter(config, agent, resolvedPrompt) {
3214
+ const engine = agent.engine;
3579
3215
  if (agent.provider === "elevenlabs_convai") {
3580
- const key = agent.elevenlabsKey ?? "";
3216
+ if (!engine || engine.kind !== "elevenlabs_convai") {
3217
+ throw new Error(
3218
+ "ElevenLabs ConvAI mode requires `agent.engine = new ElevenLabsConvAI({...})`."
3219
+ );
3220
+ }
3581
3221
  return new ElevenLabsConvAIAdapter(
3582
- key,
3583
- agent.elevenlabsAgentId ?? "",
3222
+ engine.apiKey,
3223
+ engine.agentId,
3584
3224
  agent.voice ?? "21m00Tcm4TlvDq8ikWAM",
3585
3225
  "eleven_turbo_v2_5",
3586
3226
  agent.language ?? "en",
@@ -3593,33 +3233,15 @@ function buildAIAdapter(config, agent, resolvedPrompt) {
3593
3233
  parameters: t.parameters
3594
3234
  })) ?? [];
3595
3235
  const tools = [...agentTools, TRANSFER_CALL_TOOL, END_CALL_TOOL];
3236
+ const openaiKey = engine && engine.kind === "openai_realtime" ? engine.apiKey : config.openaiKey ?? "";
3596
3237
  return new OpenAIRealtimeAdapter(
3597
- config.openaiKey ?? "",
3238
+ openaiKey,
3598
3239
  agent.model,
3599
3240
  agent.voice,
3600
3241
  resolvedPrompt ?? agent.systemPrompt,
3601
3242
  tools
3602
3243
  );
3603
3244
  }
3604
- function extractDeepgramOptions(options) {
3605
- if (!options) return {};
3606
- const get = (snake, camel) => options[snake] ?? options[camel];
3607
- const out = {};
3608
- const model = get("model", "model");
3609
- if (typeof model === "string") out.model = model;
3610
- const endpointing = get("endpointing_ms", "endpointingMs");
3611
- if (typeof endpointing === "number") out.endpointingMs = endpointing;
3612
- const utteranceEnd = get("utterance_end_ms", "utteranceEndMs");
3613
- if (utteranceEnd === null) out.utteranceEndMs = null;
3614
- else if (typeof utteranceEnd === "number") out.utteranceEndMs = utteranceEnd;
3615
- const smart = get("smart_format", "smartFormat");
3616
- if (typeof smart === "boolean") out.smartFormat = smart;
3617
- const interim = get("interim_results", "interimResults");
3618
- if (typeof interim === "boolean") out.interimResults = interim;
3619
- const vad = get("vad_events", "vadEvents");
3620
- if (typeof vad === "boolean") out.vadEvents = vad;
3621
- return out;
3622
- }
3623
3245
  var TwilioBridge = class {
3624
3246
  constructor(config) {
3625
3247
  this.config = config;
@@ -3671,24 +3293,7 @@ var TwilioBridge = class {
3671
3293
  }
3672
3294
  }
3673
3295
  createStt(agent) {
3674
- const isPipeline = agent.provider === "pipeline";
3675
- if (agent.stt) {
3676
- if (agent.stt.provider === "deepgram") {
3677
- const dgOptions = extractDeepgramOptions(agent.stt.options);
3678
- if (isPipeline) {
3679
- return new DeepgramSTT(agent.stt.apiKey, agent.stt.language ?? "en", dgOptions.model, "linear16", 16e3, dgOptions);
3680
- }
3681
- return DeepgramSTT.forTwilio(agent.stt.apiKey, agent.stt.language ?? "en", dgOptions.model, dgOptions);
3682
- } else if (agent.stt.provider === "whisper") {
3683
- return isPipeline ? new WhisperSTT(agent.stt.apiKey, "whisper-1", agent.stt.language ?? "en") : WhisperSTT.forTwilio(agent.stt.apiKey, agent.stt.language ?? "en");
3684
- }
3685
- } else if (agent.deepgramKey) {
3686
- if (isPipeline) {
3687
- return new DeepgramSTT(agent.deepgramKey, agent.language ?? "en", "nova-3", "linear16", 16e3);
3688
- }
3689
- return DeepgramSTT.forTwilio(agent.deepgramKey, agent.language ?? "en");
3690
- }
3691
- return null;
3296
+ return createSTT(agent);
3692
3297
  }
3693
3298
  async queryTelephonyCost(metricsAcc, callId) {
3694
3299
  if (this.config.twilioSid && this.config.twilioToken && callId) {
@@ -3835,24 +3440,7 @@ var TelnyxBridge = class {
3835
3440
  ws.close();
3836
3441
  }
3837
3442
  createStt(agent) {
3838
- if (agent.stt) {
3839
- if (agent.stt.provider === "deepgram") {
3840
- const dgOptions = extractDeepgramOptions(agent.stt.options);
3841
- return new DeepgramSTT(
3842
- agent.stt.apiKey,
3843
- agent.stt.language ?? "en",
3844
- dgOptions.model ?? "nova-3",
3845
- "linear16",
3846
- 16e3,
3847
- dgOptions
3848
- );
3849
- } else if (agent.stt.provider === "whisper") {
3850
- return new WhisperSTT(agent.stt.apiKey, "whisper-1", agent.stt.language ?? "en");
3851
- }
3852
- } else if (agent.deepgramKey) {
3853
- return new DeepgramSTT(agent.deepgramKey, agent.language ?? "en", "nova-3", "linear16", 16e3);
3854
- }
3855
- return null;
3443
+ return createSTT(agent);
3856
3444
  }
3857
3445
  async queryTelephonyCost(metricsAcc, callId) {
3858
3446
  if (this.config.telnyxKey && callId) {
@@ -4790,8 +4378,6 @@ var TestSession = class {
4790
4378
  export {
4791
4379
  OpenAIRealtimeAdapter,
4792
4380
  ElevenLabsConvAIAdapter,
4793
- DeepgramSTT,
4794
- WhisperSTT,
4795
4381
  DEFAULT_PRICING,
4796
4382
  mergePricing,
4797
4383
  calculateSttCost,
@@ -4807,8 +4393,7 @@ export {
4807
4393
  RemoteMessageHandler,
4808
4394
  isRemoteUrl,
4809
4395
  isWebSocketUrl,
4810
- ElevenLabsTTS,
4811
- OpenAITTS,
4396
+ DeepgramSTT,
4812
4397
  CallMetricsAccumulator,
4813
4398
  mulawToPcm16,
4814
4399
  pcm16ToMulaw,