getpatter 0.4.4 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -305,254 +305,16 @@ var init_elevenlabs_convai = __esm({
305
305
  }
306
306
  });
307
307
 
308
- // src/providers/deepgram-stt.ts
309
- var import_ws4, DEEPGRAM_WS_URL, DeepgramSTT;
310
- var init_deepgram_stt = __esm({
311
- "src/providers/deepgram-stt.ts"() {
312
- "use strict";
313
- import_ws4 = __toESM(require("ws"));
314
- init_logger();
315
- DEEPGRAM_WS_URL = "wss://api.deepgram.com/v1/listen";
316
- DeepgramSTT = class _DeepgramSTT {
317
- ws = null;
318
- callbacks = [];
319
- /** Request ID from Deepgram — used to query actual cost post-call. */
320
- requestId = "";
321
- apiKey;
322
- language;
323
- model;
324
- encoding;
325
- sampleRate;
326
- endpointingMs;
327
- utteranceEndMs;
328
- smartFormat;
329
- interimResults;
330
- vadEvents;
331
- constructor(apiKey, languageOrOptions, model, encoding, sampleRate, options) {
332
- this.apiKey = apiKey;
333
- const opts = typeof languageOrOptions === "object" && languageOrOptions !== null ? languageOrOptions : options ?? {};
334
- this.language = (typeof languageOrOptions === "string" ? languageOrOptions : opts.language) ?? "en";
335
- this.model = model ?? opts.model ?? "nova-3";
336
- this.encoding = encoding ?? opts.encoding ?? "linear16";
337
- this.sampleRate = sampleRate ?? opts.sampleRate ?? 16e3;
338
- this.endpointingMs = opts.endpointingMs ?? 150;
339
- this.utteranceEndMs = opts.utteranceEndMs === null ? null : opts.utteranceEndMs ?? 1e3;
340
- this.smartFormat = opts.smartFormat ?? true;
341
- this.interimResults = opts.interimResults ?? true;
342
- this.vadEvents = opts.vadEvents ?? true;
343
- }
344
- /** Factory for Twilio calls — mulaw 8 kHz. Forwards tuning options through. */
345
- static forTwilio(apiKey, language = "en", model = "nova-3", options = {}) {
346
- return new _DeepgramSTT(apiKey, language, model, "mulaw", 8e3, options);
347
- }
348
- async connect() {
349
- const params = new URLSearchParams({
350
- model: this.model,
351
- language: this.language,
352
- encoding: this.encoding,
353
- sample_rate: String(this.sampleRate),
354
- channels: "1",
355
- interim_results: this.interimResults ? "true" : "false",
356
- endpointing: String(this.endpointingMs),
357
- smart_format: this.smartFormat ? "true" : "false",
358
- vad_events: this.vadEvents ? "true" : "false",
359
- no_delay: "true"
360
- });
361
- if (this.utteranceEndMs !== null) {
362
- params.set("utterance_end_ms", String(Math.max(this.utteranceEndMs, 1e3)));
363
- }
364
- const url = `${DEEPGRAM_WS_URL}?${params.toString()}`;
365
- this.ws = new import_ws4.default(url, {
366
- headers: { Authorization: `Token ${this.apiKey}` }
367
- });
368
- await new Promise((resolve, reject) => {
369
- const timer = setTimeout(() => reject(new Error("Deepgram connect timeout")), 1e4);
370
- this.ws.once("open", () => {
371
- clearTimeout(timer);
372
- resolve();
373
- });
374
- this.ws.once("error", (err) => {
375
- clearTimeout(timer);
376
- reject(err);
377
- });
378
- });
379
- this.ws.on("message", (raw) => {
380
- let data;
381
- try {
382
- data = JSON.parse(raw.toString());
383
- } catch {
384
- return;
385
- }
386
- if (data.type === "Metadata" && data.request_id) {
387
- this.requestId = data.request_id;
388
- return;
389
- }
390
- if (data.type !== "Results") return;
391
- const alternatives = data.channel?.alternatives ?? [];
392
- if (!alternatives.length) return;
393
- const best = alternatives[0];
394
- const text = (best.transcript ?? "").trim();
395
- if (!text) return;
396
- const transcript = {
397
- text,
398
- isFinal: Boolean(data.is_final) || Boolean(data.speech_final),
399
- confidence: best.confidence ?? 0
400
- };
401
- for (const cb of this.callbacks) {
402
- cb(transcript);
403
- }
404
- });
405
- }
406
- sendAudio(audio) {
407
- if (!this.ws || this.ws.readyState !== import_ws4.default.OPEN) return;
408
- this.ws.send(audio);
409
- }
410
- onTranscript(callback) {
411
- if (this.callbacks.length >= 10) {
412
- getLogger().warn("DeepgramSTT: maximum of 10 onTranscript callbacks reached; replacing the last callback.");
413
- this.callbacks[this.callbacks.length - 1] = callback;
414
- return;
415
- }
416
- this.callbacks.push(callback);
417
- }
418
- close() {
419
- if (this.ws) {
420
- try {
421
- this.ws.send(JSON.stringify({ type: "CloseStream" }));
422
- } catch {
423
- }
424
- this.ws.close();
425
- this.ws = null;
426
- }
427
- }
428
- };
429
- }
430
- });
431
-
432
- // src/providers/whisper-stt.ts
433
- function wrapPcmInWav(pcm, sampleRate = 16e3, channels = 1, bitsPerSample = 16) {
434
- const dataSize = pcm.length;
435
- const header = Buffer.alloc(44);
436
- header.write("RIFF", 0);
437
- header.writeUInt32LE(36 + dataSize, 4);
438
- header.write("WAVE", 8);
439
- header.write("fmt ", 12);
440
- header.writeUInt32LE(16, 16);
441
- header.writeUInt16LE(1, 20);
442
- header.writeUInt16LE(channels, 22);
443
- header.writeUInt32LE(sampleRate, 24);
444
- header.writeUInt32LE(sampleRate * channels * (bitsPerSample / 8), 28);
445
- header.writeUInt16LE(channels * (bitsPerSample / 8), 32);
446
- header.writeUInt16LE(bitsPerSample, 34);
447
- header.write("data", 36);
448
- header.writeUInt32LE(dataSize, 40);
449
- return Buffer.concat([header, pcm]);
308
+ // src/provider-factory.ts
309
+ async function createSTT(agent) {
310
+ return agent.stt ?? null;
450
311
  }
451
- var OPENAI_TRANSCRIPTION_URL, DEFAULT_BUFFER_SIZE, WhisperSTT;
452
- var init_whisper_stt = __esm({
453
- "src/providers/whisper-stt.ts"() {
312
+ async function createTTS(agent) {
313
+ return agent.tts ?? null;
314
+ }
315
+ var init_provider_factory = __esm({
316
+ "src/provider-factory.ts"() {
454
317
  "use strict";
455
- init_logger();
456
- OPENAI_TRANSCRIPTION_URL = "https://api.openai.com/v1/audio/transcriptions";
457
- DEFAULT_BUFFER_SIZE = 16e3 * 2;
458
- WhisperSTT = class _WhisperSTT {
459
- apiKey;
460
- model;
461
- language;
462
- bufferSize;
463
- buffer = Buffer.alloc(0);
464
- callbacks = [];
465
- running = false;
466
- pendingTranscriptions = [];
467
- constructor(apiKey, model = "whisper-1", language, bufferSize = DEFAULT_BUFFER_SIZE) {
468
- this.apiKey = apiKey;
469
- this.model = model;
470
- this.language = language;
471
- this.bufferSize = bufferSize;
472
- }
473
- /** Factory for Twilio calls — mulaw 8 kHz is transcoded upstream, so we still receive PCM 16-bit. */
474
- static forTwilio(apiKey, language = "en", model = "whisper-1") {
475
- return new _WhisperSTT(apiKey, model, language);
476
- }
477
- async connect() {
478
- this.running = true;
479
- this.buffer = Buffer.alloc(0);
480
- }
481
- sendAudio(audio) {
482
- if (!this.running) return;
483
- this.buffer = Buffer.concat([this.buffer, audio]);
484
- if (this.buffer.length >= this.bufferSize) {
485
- const pcm = this.buffer;
486
- this.buffer = Buffer.alloc(0);
487
- this.trackTranscription(this.transcribeBuffer(pcm));
488
- }
489
- }
490
- trackTranscription(promise) {
491
- const wrapped = promise.finally(() => {
492
- const idx = this.pendingTranscriptions.indexOf(wrapped);
493
- if (idx !== -1) this.pendingTranscriptions.splice(idx, 1);
494
- });
495
- this.pendingTranscriptions.push(wrapped);
496
- }
497
- onTranscript(callback) {
498
- if (this.callbacks.length >= 10) {
499
- getLogger().warn("WhisperSTT: maximum of 10 onTranscript callbacks reached; replacing the last callback.");
500
- this.callbacks[this.callbacks.length - 1] = callback;
501
- return;
502
- }
503
- this.callbacks.push(callback);
504
- }
505
- async close() {
506
- this.running = false;
507
- if (this.buffer.length >= this.bufferSize / 4) {
508
- const pcm = this.buffer;
509
- this.buffer = Buffer.alloc(0);
510
- this.trackTranscription(this.transcribeBuffer(pcm));
511
- } else {
512
- this.buffer = Buffer.alloc(0);
513
- }
514
- await Promise.allSettled(this.pendingTranscriptions);
515
- this.callbacks = [];
516
- }
517
- // ------------------------------------------------------------------
518
- // Private
519
- // ------------------------------------------------------------------
520
- async transcribeBuffer(pcm) {
521
- const wav = wrapPcmInWav(pcm);
522
- const formData = new FormData();
523
- formData.append("file", new Blob([wav.buffer.slice(wav.byteOffset, wav.byteOffset + wav.byteLength)], { type: "audio/wav" }), "audio.wav");
524
- formData.append("model", this.model);
525
- if (this.language) {
526
- formData.append("language", this.language);
527
- }
528
- try {
529
- const resp = await fetch(OPENAI_TRANSCRIPTION_URL, {
530
- method: "POST",
531
- headers: { Authorization: `Bearer ${this.apiKey}` },
532
- body: formData,
533
- signal: AbortSignal.timeout(15e3)
534
- });
535
- if (!resp.ok) {
536
- const body = await resp.text();
537
- getLogger().error(`WhisperSTT transcription error: ${resp.status} ${body}`);
538
- return;
539
- }
540
- const json = await resp.json();
541
- const text = (json.text ?? "").trim();
542
- if (!text) return;
543
- const transcript = {
544
- text,
545
- isFinal: true,
546
- confidence: 1
547
- };
548
- for (const cb of this.callbacks) {
549
- cb(transcript);
550
- }
551
- } catch (err) {
552
- getLogger().error(`WhisperSTT transcription error: ${String(err)}`);
553
- }
554
- }
555
- };
556
318
  }
557
319
  });
558
320
 
@@ -1972,258 +1734,125 @@ var init_remote_message = __esm({
1972
1734
  }
1973
1735
  });
1974
1736
 
1975
- // src/providers/elevenlabs-tts.ts
1976
- function resolveVoiceId(voice) {
1977
- if (!voice) return voice;
1978
- if (VOICE_ID_PATTERN.test(voice)) return voice;
1979
- return ELEVENLABS_VOICE_ID_BY_NAME[voice.toLowerCase()] ?? voice;
1980
- }
1981
- var ELEVENLABS_BASE_URL, ELEVENLABS_VOICE_ID_BY_NAME, VOICE_ID_PATTERN, ElevenLabsTTS;
1982
- var init_elevenlabs_tts = __esm({
1983
- "src/providers/elevenlabs-tts.ts"() {
1737
+ // src/providers/deepgram-stt.ts
1738
+ var import_ws4, DEEPGRAM_WS_URL, DeepgramSTT;
1739
+ var init_deepgram_stt = __esm({
1740
+ "src/providers/deepgram-stt.ts"() {
1984
1741
  "use strict";
1985
- ELEVENLABS_BASE_URL = "https://api.elevenlabs.io/v1";
1986
- ELEVENLABS_VOICE_ID_BY_NAME = {
1987
- rachel: "21m00Tcm4TlvDq8ikWAM",
1988
- drew: "29vD33N1CtxCmqQRPOHJ",
1989
- clyde: "2EiwWnXFnvU5JabPnv8n",
1990
- paul: "5Q0t7uMcjvnagumLfvZi",
1991
- domi: "AZnzlk1XvdvUeBnXmlld",
1992
- dave: "CYw3kZ02Hs0563khs1Fj",
1993
- fin: "D38z5RcWu1voky8WS1ja",
1994
- bella: "EXAVITQu4vr4xnSDxMaL",
1995
- antoni: "ErXwobaYiN019PkySvjV",
1996
- thomas: "GBv7mTt0atIp3Br8iCZE",
1997
- charlie: "IKne3meq5aSn9XLyUdCD",
1998
- george: "JBFqnCBsd6RMkjVDRZzb",
1999
- emily: "LcfcDJNUP1GQjkzn1xUU",
2000
- elli: "MF3mGyEYCl7XYWbV9V6O",
2001
- callum: "N2lVS1w4EtoT3dr4eOWO",
2002
- patrick: "ODq5zmih8GrVes37Dizd",
2003
- harry: "SOYHLrjzK2X1ezoPC6cr",
2004
- liam: "TX3LPaxmHKxFdv7VOQHJ",
2005
- dorothy: "ThT5KcBeYPX3keUQqHPh",
2006
- josh: "TxGEqnHWrfWFTfGW9XjX",
2007
- arnold: "VR6AewLTigWG4xSOukaG",
2008
- charlotte: "XB0fDUnXU5powFXDhCwa",
2009
- matilda: "XrExE9yKIg1WjnnlVkGX",
2010
- matthew: "Yko7PKHZNXotIFUBG7I9",
2011
- james: "ZQe5CZNOzWyzPSCn5a3c",
2012
- joseph: "Zlb1dXrM653N07WRdFW3",
2013
- jeremy: "bVMeCyTHy58xNoL34h3p",
2014
- michael: "flq6f7yk4E4fJM5XTYuZ",
2015
- ethan: "g5CIjZEefAph4nQFvHAz",
2016
- gigi: "jBpfuIE2acCO8z3wKNLl",
2017
- freya: "jsCqWAovK2LkecY7zXl4",
2018
- brian: "nPczCjzI2devNBz1zQrb",
2019
- grace: "oWAxZDx7w5VEj9dCyTzz",
2020
- daniel: "onwK4e9ZLuTAKqWW03F9",
2021
- lily: "pFZP5JQG7iQjIQuC4Bku",
2022
- serena: "pMsXgVXv3BLzUgSXRplE",
2023
- adam: "pNInz6obpgDQGcFmaJgB",
2024
- nicole: "piTKgcLEGmPE4e6mEKli",
2025
- bill: "pqHfZKP75CvOlQylNhV4",
2026
- jessie: "t0jbNlBVZ17f02VDIeMI",
2027
- ryan: "wViXBPUzp2ZZixB1xQuM",
2028
- sam: "yoZ06aMxZJJ28mfd3POQ",
2029
- glinda: "z9fAnlkpzviPz146aGWa",
2030
- giovanni: "zcAOhNBS3c14rBihAFp1",
2031
- mimi: "zrHiDhphv9ZnVXBqCLjz",
2032
- alloy: "21m00Tcm4TlvDq8ikWAM"
2033
- };
2034
- VOICE_ID_PATTERN = /^[A-Za-z0-9]{20}$/;
2035
- ElevenLabsTTS = class {
2036
- constructor(apiKey, voiceId = "21m00Tcm4TlvDq8ikWAM", modelId = "eleven_turbo_v2_5", outputFormat = "pcm_16000") {
1742
+ import_ws4 = __toESM(require("ws"));
1743
+ init_logger();
1744
+ DEEPGRAM_WS_URL = "wss://api.deepgram.com/v1/listen";
1745
+ DeepgramSTT = class _DeepgramSTT {
1746
+ ws = null;
1747
+ callbacks = [];
1748
+ /** Request ID from Deepgram — used to query actual cost post-call. */
1749
+ requestId = "";
1750
+ apiKey;
1751
+ language;
1752
+ model;
1753
+ encoding;
1754
+ sampleRate;
1755
+ endpointingMs;
1756
+ utteranceEndMs;
1757
+ smartFormat;
1758
+ interimResults;
1759
+ vadEvents;
1760
+ constructor(apiKey, languageOrOptions, model, encoding, sampleRate, options) {
2037
1761
  this.apiKey = apiKey;
2038
- this.modelId = modelId;
2039
- this.outputFormat = outputFormat;
2040
- this.voiceId = resolveVoiceId(voiceId);
1762
+ const opts = typeof languageOrOptions === "object" && languageOrOptions !== null ? languageOrOptions : options ?? {};
1763
+ this.language = (typeof languageOrOptions === "string" ? languageOrOptions : opts.language) ?? "en";
1764
+ this.model = model ?? opts.model ?? "nova-3";
1765
+ this.encoding = encoding ?? opts.encoding ?? "linear16";
1766
+ this.sampleRate = sampleRate ?? opts.sampleRate ?? 16e3;
1767
+ this.endpointingMs = opts.endpointingMs ?? 150;
1768
+ this.utteranceEndMs = opts.utteranceEndMs === null ? null : opts.utteranceEndMs ?? 1e3;
1769
+ this.smartFormat = opts.smartFormat ?? true;
1770
+ this.interimResults = opts.interimResults ?? true;
1771
+ this.vadEvents = opts.vadEvents ?? true;
2041
1772
  }
2042
- voiceId;
2043
- /**
2044
- * Synthesise text to speech and return the full audio as a single Buffer.
2045
- *
2046
- * For large chunks (or when latency matters) call `synthesizeStream` instead.
2047
- */
2048
- async synthesize(text) {
2049
- const chunks = [];
2050
- for await (const chunk of this.synthesizeStream(text)) {
2051
- chunks.push(chunk);
2052
- }
2053
- return Buffer.concat(chunks);
1773
+ /** Factory for Twilio calls — mulaw 8 kHz. Forwards tuning options through. */
1774
+ static forTwilio(apiKey, language = "en", model = "nova-3", options = {}) {
1775
+ return new _DeepgramSTT(apiKey, language, model, "mulaw", 8e3, options);
2054
1776
  }
2055
- /**
2056
- * Synthesise text and yield audio chunks as they arrive (streaming).
2057
- *
2058
- * The yielded buffers are raw PCM at 16 kHz (or whatever `outputFormat` is
2059
- * configured to).
2060
- */
2061
- async *synthesizeStream(text) {
2062
- const url = `${ELEVENLABS_BASE_URL}/text-to-speech/${encodeURIComponent(this.voiceId)}/stream?output_format=${encodeURIComponent(this.outputFormat)}`;
2063
- const response = await fetch(url, {
2064
- method: "POST",
2065
- headers: {
2066
- "xi-api-key": this.apiKey,
2067
- "Content-Type": "application/json"
2068
- },
2069
- body: JSON.stringify({ text, model_id: this.modelId }),
2070
- signal: AbortSignal.timeout(3e4)
1777
+ async connect() {
1778
+ const params = new URLSearchParams({
1779
+ model: this.model,
1780
+ language: this.language,
1781
+ encoding: this.encoding,
1782
+ sample_rate: String(this.sampleRate),
1783
+ channels: "1",
1784
+ interim_results: this.interimResults ? "true" : "false",
1785
+ endpointing: String(this.endpointingMs),
1786
+ smart_format: this.smartFormat ? "true" : "false",
1787
+ vad_events: this.vadEvents ? "true" : "false",
1788
+ no_delay: "true"
2071
1789
  });
2072
- if (!response.ok) {
2073
- const body = await response.text();
2074
- throw new Error(`ElevenLabs TTS error ${response.status}: ${body}`);
2075
- }
2076
- if (!response.body) {
2077
- throw new Error("ElevenLabs TTS: no response body");
1790
+ if (this.utteranceEndMs !== null) {
1791
+ params.set("utterance_end_ms", String(Math.max(this.utteranceEndMs, 1e3)));
2078
1792
  }
2079
- const reader = response.body.getReader();
2080
- try {
2081
- while (true) {
2082
- const { done, value } = await reader.read();
2083
- if (done) break;
2084
- if (value && value.length > 0) {
2085
- yield Buffer.from(value);
2086
- }
2087
- }
2088
- } finally {
2089
- if (typeof reader.cancel === "function") await reader.cancel().catch(() => {
1793
+ const url = `${DEEPGRAM_WS_URL}?${params.toString()}`;
1794
+ this.ws = new import_ws4.default(url, {
1795
+ headers: { Authorization: `Token ${this.apiKey}` }
1796
+ });
1797
+ await new Promise((resolve, reject) => {
1798
+ const timer = setTimeout(() => reject(new Error("Deepgram connect timeout")), 1e4);
1799
+ this.ws.once("open", () => {
1800
+ clearTimeout(timer);
1801
+ resolve();
1802
+ });
1803
+ this.ws.once("error", (err) => {
1804
+ clearTimeout(timer);
1805
+ reject(err);
2090
1806
  });
2091
- reader.releaseLock();
2092
- }
2093
- }
2094
- };
2095
- }
2096
- });
2097
-
2098
- // src/providers/openai-tts.ts
2099
- var OPENAI_TTS_URL, OpenAITTS;
2100
- var init_openai_tts = __esm({
2101
- "src/providers/openai-tts.ts"() {
2102
- "use strict";
2103
- OPENAI_TTS_URL = "https://api.openai.com/v1/audio/speech";
2104
- OpenAITTS = class _OpenAITTS {
2105
- constructor(apiKey, voice = "alloy", model = "tts-1") {
2106
- this.apiKey = apiKey;
2107
- this.voice = voice;
2108
- this.model = model;
2109
- }
2110
- /**
2111
- * Synthesise text to speech and return the full audio as a single Buffer.
2112
- *
2113
- * For large chunks (or when latency matters) call `synthesizeStream` instead.
2114
- */
2115
- async synthesize(text) {
2116
- const chunks = [];
2117
- for await (const chunk of this.synthesizeStream(text)) {
2118
- chunks.push(chunk);
2119
- }
2120
- return Buffer.concat(chunks);
2121
- }
2122
- /**
2123
- * Synthesise text and yield audio chunks as they arrive (streaming).
2124
- *
2125
- * OpenAI returns 24 kHz PCM16; each chunk is resampled to 16 kHz before
2126
- * yielding so the output is ready for telephony pipelines.
2127
- *
2128
- * The resampler carries state (buffered samples + odd trailing byte)
2129
- * between chunks — without that state cross-chunk sample alignment drifts
2130
- * and the caller hears pops / dropped audio (BUG #23, mirror of the
2131
- * Python `audioop.ratecv` fix).
2132
- */
2133
- async *synthesizeStream(text) {
2134
- const response = await fetch(OPENAI_TTS_URL, {
2135
- method: "POST",
2136
- headers: {
2137
- "Authorization": `Bearer ${this.apiKey}`,
2138
- "Content-Type": "application/json"
2139
- },
2140
- body: JSON.stringify({
2141
- model: this.model,
2142
- input: text,
2143
- voice: this.voice,
2144
- response_format: "pcm"
2145
- }),
2146
- signal: AbortSignal.timeout(3e4)
2147
1807
  });
2148
- if (!response.ok) {
2149
- const body = await response.text();
2150
- throw new Error(`OpenAI TTS error ${response.status}: ${body}`);
2151
- }
2152
- if (!response.body) {
2153
- throw new Error("OpenAI TTS: no response body");
2154
- }
2155
- const ctx = { carryByte: null, leftover: [] };
2156
- const reader = response.body.getReader();
2157
- try {
2158
- while (true) {
2159
- const { done, value } = await reader.read();
2160
- if (done) break;
2161
- if (value && value.length > 0) {
2162
- const out = _OpenAITTS.resampleStreaming(Buffer.from(value), ctx);
2163
- if (out.length > 0) yield out;
2164
- }
1808
+ this.ws.on("message", (raw) => {
1809
+ let data;
1810
+ try {
1811
+ data = JSON.parse(raw.toString());
1812
+ } catch {
1813
+ return;
2165
1814
  }
2166
- if (ctx.leftover.length > 0) {
2167
- const tail = Buffer.alloc(ctx.leftover.length * 2);
2168
- for (let i = 0; i < ctx.leftover.length; i++) {
2169
- tail.writeInt16LE(ctx.leftover[i], i * 2);
2170
- }
2171
- yield tail;
1815
+ if (data.type === "Metadata" && data.request_id) {
1816
+ this.requestId = data.request_id;
1817
+ return;
2172
1818
  }
2173
- } finally {
2174
- if (typeof reader.cancel === "function") await reader.cancel().catch(() => {
2175
- });
2176
- reader.releaseLock();
1819
+ if (data.type !== "Results") return;
1820
+ const alternatives = data.channel?.alternatives ?? [];
1821
+ if (!alternatives.length) return;
1822
+ const best = alternatives[0];
1823
+ const text = (best.transcript ?? "").trim();
1824
+ if (!text) return;
1825
+ const transcript = {
1826
+ text,
1827
+ isFinal: Boolean(data.is_final) || Boolean(data.speech_final),
1828
+ confidence: best.confidence ?? 0
1829
+ };
1830
+ for (const cb of this.callbacks) {
1831
+ cb(transcript);
1832
+ }
1833
+ });
1834
+ }
1835
+ sendAudio(audio) {
1836
+ if (!this.ws || this.ws.readyState !== import_ws4.default.OPEN) return;
1837
+ this.ws.send(audio);
1838
+ }
1839
+ onTranscript(callback) {
1840
+ if (this.callbacks.length >= 10) {
1841
+ getLogger().warn("DeepgramSTT: maximum of 10 onTranscript callbacks reached; replacing the last callback.");
1842
+ this.callbacks[this.callbacks.length - 1] = callback;
1843
+ return;
2177
1844
  }
1845
+ this.callbacks.push(callback);
2178
1846
  }
2179
- /**
2180
- * Streaming 24 kHz → 16 kHz resampler (PCM16-LE). Maintains cross-chunk
2181
- * state so the 3:2 pattern doesn't reset at every network read.
2182
- */
2183
- static resampleStreaming(audio, ctx) {
2184
- let buf;
2185
- if (ctx.carryByte !== null) {
2186
- buf = Buffer.concat([Buffer.from([ctx.carryByte]), audio]);
2187
- ctx.carryByte = null;
2188
- } else {
2189
- buf = audio;
2190
- }
2191
- if (buf.length % 2 === 1) {
2192
- ctx.carryByte = buf[buf.length - 1];
2193
- buf = buf.subarray(0, buf.length - 1);
2194
- }
2195
- if (buf.length === 0 && ctx.leftover.length === 0) {
2196
- return Buffer.alloc(0);
2197
- }
2198
- const sampleCount = buf.length / 2;
2199
- const samples = ctx.leftover.slice();
2200
- for (let i2 = 0; i2 < sampleCount; i2++) {
2201
- samples.push(buf.readInt16LE(i2 * 2));
2202
- }
2203
- const out = [];
2204
- let i = 0;
2205
- while (i + 2 < samples.length) {
2206
- out.push(samples[i]);
2207
- out.push(Math.trunc((samples[i + 1] + samples[i + 2]) / 2));
2208
- i += 3;
2209
- }
2210
- ctx.leftover = samples.slice(i);
2211
- const buffer = Buffer.alloc(out.length * 2);
2212
- for (let j = 0; j < out.length; j++) {
2213
- buffer.writeInt16LE(out[j], j * 2);
2214
- }
2215
- return buffer;
2216
- }
2217
- /** @deprecated use {@link resampleStreaming} with persistent state. */
2218
- static resample24kTo16k(audio) {
2219
- const ctx = { carryByte: null, leftover: [] };
2220
- const out = _OpenAITTS.resampleStreaming(audio, ctx);
2221
- if (ctx.leftover.length === 0) return out;
2222
- const tail = Buffer.alloc(ctx.leftover.length * 2);
2223
- for (let i = 0; i < ctx.leftover.length; i++) {
2224
- tail.writeInt16LE(ctx.leftover[i], i * 2);
1847
+ close() {
1848
+ if (this.ws) {
1849
+ try {
1850
+ this.ws.send(JSON.stringify({ type: "CloseStream" }));
1851
+ } catch {
1852
+ }
1853
+ this.ws.close();
1854
+ this.ws = null;
2225
1855
  }
2226
- return Buffer.concat([out, tail]);
2227
1856
  }
2228
1857
  };
2229
1858
  }
@@ -3133,8 +2762,8 @@ var init_stream_handler = __esm({
3133
2762
  "use strict";
3134
2763
  init_openai_realtime();
3135
2764
  init_elevenlabs_convai();
3136
- init_elevenlabs_tts();
3137
- init_openai_tts();
2765
+ init_deepgram_stt();
2766
+ init_provider_factory();
3138
2767
  init_metrics();
3139
2768
  init_transcoding();
3140
2769
  init_llm_loop();
@@ -3176,8 +2805,8 @@ var init_stream_handler = __esm({
3176
2805
  this.caller = caller;
3177
2806
  this.callee = callee;
3178
2807
  this.history = createHistoryManager(200);
3179
- const sttProviderName = deps.agent.stt?.provider || (deps.agent.deepgramKey ? "deepgram" : void 0);
3180
- const ttsProviderName = deps.agent.tts?.provider === "elevenlabs" ? "elevenlabs" : deps.agent.tts?.provider === "openai" ? "openai_tts" : deps.agent.elevenlabsKey ? "elevenlabs" : void 0;
2808
+ const sttProviderName = deps.agent.stt ? deps.agent.stt.constructor?.name ?? "custom" : void 0;
2809
+ const ttsProviderName = deps.agent.tts ? deps.agent.tts.constructor?.name ?? "custom" : void 0;
3181
2810
  const providerMode = deps.agent.provider ?? "openai_realtime";
3182
2811
  this.metricsAcc = new CallMetricsAccumulator({
3183
2812
  callId: "",
@@ -3367,17 +2996,8 @@ var init_stream_handler = __esm({
3367
2996
  // ---------------------------------------------------------------------------
3368
2997
  async initPipeline(resolvedPrompt) {
3369
2998
  const label = this.deps.bridge.label;
3370
- this.stt = this.deps.bridge.createStt(this.deps.agent);
3371
- if (this.deps.agent.tts) {
3372
- if (this.deps.agent.tts.provider === "elevenlabs") {
3373
- this.tts = new ElevenLabsTTS(this.deps.agent.tts.apiKey, this.deps.agent.tts.voice ?? "21m00Tcm4TlvDq8ikWAM");
3374
- }
3375
- if (this.deps.agent.tts.provider === "openai") {
3376
- this.tts = new OpenAITTS(this.deps.agent.tts.apiKey, this.deps.agent.tts.voice ?? "alloy");
3377
- }
3378
- } else if (this.deps.agent.elevenlabsKey) {
3379
- this.tts = new ElevenLabsTTS(this.deps.agent.elevenlabsKey, this.deps.agent.voice || "rachel");
3380
- }
2999
+ this.stt = await this.deps.bridge.createStt(this.deps.agent);
3000
+ this.tts = await createTTS(this.deps.agent);
3381
3001
  if (!this.stt) {
3382
3002
  getLogger().info(`Pipeline mode (${label}): no STT configured`);
3383
3003
  }
@@ -3895,10 +3515,11 @@ var init_stream_handler = __esm({
3895
3515
  this.maxDurationTimer = null;
3896
3516
  }
3897
3517
  await this.deps.bridge.queryTelephonyCost(this.metricsAcc, this.callId);
3898
- const deepgramKey = this.deps.agent.deepgramKey;
3899
- const deepgramRequestId = this.stt?.requestId;
3900
- if (deepgramKey && deepgramRequestId) {
3901
- await queryDeepgramCost(this.metricsAcc, deepgramKey, deepgramRequestId);
3518
+ if (this.stt instanceof DeepgramSTT && this.stt.requestId) {
3519
+ const dgKey = this.stt.apiKey;
3520
+ if (dgKey) {
3521
+ await queryDeepgramCost(this.metricsAcc, dgKey, this.stt.requestId);
3522
+ }
3902
3523
  }
3903
3524
  const finalMetrics = this.metricsAcc.endCall();
3904
3525
  const callEndData = {
@@ -4000,11 +3621,16 @@ function resolveVariables(template, variables) {
4000
3621
  return result;
4001
3622
  }
4002
3623
  function buildAIAdapter(config, agent, resolvedPrompt) {
3624
+ const engine = agent.engine;
4003
3625
  if (agent.provider === "elevenlabs_convai") {
4004
- const key = agent.elevenlabsKey ?? "";
3626
+ if (!engine || engine.kind !== "elevenlabs_convai") {
3627
+ throw new Error(
3628
+ "ElevenLabs ConvAI mode requires `agent.engine = new ElevenLabsConvAI({...})`."
3629
+ );
3630
+ }
4005
3631
  return new ElevenLabsConvAIAdapter(
4006
- key,
4007
- agent.elevenlabsAgentId ?? "",
3632
+ engine.apiKey,
3633
+ engine.agentId,
4008
3634
  agent.voice ?? "21m00Tcm4TlvDq8ikWAM",
4009
3635
  "eleven_turbo_v2_5",
4010
3636
  agent.language ?? "en",
@@ -4017,33 +3643,15 @@ function buildAIAdapter(config, agent, resolvedPrompt) {
4017
3643
  parameters: t.parameters
4018
3644
  })) ?? [];
4019
3645
  const tools = [...agentTools, TRANSFER_CALL_TOOL, END_CALL_TOOL];
3646
+ const openaiKey = engine && engine.kind === "openai_realtime" ? engine.apiKey : config.openaiKey ?? "";
4020
3647
  return new OpenAIRealtimeAdapter(
4021
- config.openaiKey ?? "",
3648
+ openaiKey,
4022
3649
  agent.model,
4023
3650
  agent.voice,
4024
3651
  resolvedPrompt ?? agent.systemPrompt,
4025
3652
  tools
4026
3653
  );
4027
3654
  }
4028
- function extractDeepgramOptions(options) {
4029
- if (!options) return {};
4030
- const get = (snake, camel) => options[snake] ?? options[camel];
4031
- const out = {};
4032
- const model = get("model", "model");
4033
- if (typeof model === "string") out.model = model;
4034
- const endpointing = get("endpointing_ms", "endpointingMs");
4035
- if (typeof endpointing === "number") out.endpointingMs = endpointing;
4036
- const utteranceEnd = get("utterance_end_ms", "utteranceEndMs");
4037
- if (utteranceEnd === null) out.utteranceEndMs = null;
4038
- else if (typeof utteranceEnd === "number") out.utteranceEndMs = utteranceEnd;
4039
- const smart = get("smart_format", "smartFormat");
4040
- if (typeof smart === "boolean") out.smartFormat = smart;
4041
- const interim = get("interim_results", "interimResults");
4042
- if (typeof interim === "boolean") out.interimResults = interim;
4043
- const vad = get("vad_events", "vadEvents");
4044
- if (typeof vad === "boolean") out.vadEvents = vad;
4045
- return out;
4046
- }
4047
3655
  function isValidTelnyxTransferTarget(target) {
4048
3656
  if (typeof target !== "string" || !target) return false;
4049
3657
  if (/^\+[1-9]\d{6,14}$/.test(target)) return true;
@@ -4063,8 +3671,7 @@ var init_server = __esm({
4063
3671
  import_ws5 = require("ws");
4064
3672
  init_openai_realtime();
4065
3673
  init_elevenlabs_convai();
4066
- init_deepgram_stt();
4067
- init_whisper_stt();
3674
+ init_provider_factory();
4068
3675
  init_pricing();
4069
3676
  init_store();
4070
3677
  init_routes();
@@ -4149,24 +3756,7 @@ var init_server = __esm({
4149
3756
  }
4150
3757
  }
4151
3758
  createStt(agent) {
4152
- const isPipeline = agent.provider === "pipeline";
4153
- if (agent.stt) {
4154
- if (agent.stt.provider === "deepgram") {
4155
- const dgOptions = extractDeepgramOptions(agent.stt.options);
4156
- if (isPipeline) {
4157
- return new DeepgramSTT(agent.stt.apiKey, agent.stt.language ?? "en", dgOptions.model, "linear16", 16e3, dgOptions);
4158
- }
4159
- return DeepgramSTT.forTwilio(agent.stt.apiKey, agent.stt.language ?? "en", dgOptions.model, dgOptions);
4160
- } else if (agent.stt.provider === "whisper") {
4161
- return isPipeline ? new WhisperSTT(agent.stt.apiKey, "whisper-1", agent.stt.language ?? "en") : WhisperSTT.forTwilio(agent.stt.apiKey, agent.stt.language ?? "en");
4162
- }
4163
- } else if (agent.deepgramKey) {
4164
- if (isPipeline) {
4165
- return new DeepgramSTT(agent.deepgramKey, agent.language ?? "en", "nova-3", "linear16", 16e3);
4166
- }
4167
- return DeepgramSTT.forTwilio(agent.deepgramKey, agent.language ?? "en");
4168
- }
4169
- return null;
3759
+ return createSTT(agent);
4170
3760
  }
4171
3761
  async queryTelephonyCost(metricsAcc, callId) {
4172
3762
  if (this.config.twilioSid && this.config.twilioToken && callId) {
@@ -4304,24 +3894,7 @@ var init_server = __esm({
4304
3894
  ws.close();
4305
3895
  }
4306
3896
  createStt(agent) {
4307
- if (agent.stt) {
4308
- if (agent.stt.provider === "deepgram") {
4309
- const dgOptions = extractDeepgramOptions(agent.stt.options);
4310
- return new DeepgramSTT(
4311
- agent.stt.apiKey,
4312
- agent.stt.language ?? "en",
4313
- dgOptions.model ?? "nova-3",
4314
- "linear16",
4315
- 16e3,
4316
- dgOptions
4317
- );
4318
- } else if (agent.stt.provider === "whisper") {
4319
- return new WhisperSTT(agent.stt.apiKey, "whisper-1", agent.stt.language ?? "en");
4320
- }
4321
- } else if (agent.deepgramKey) {
4322
- return new DeepgramSTT(agent.deepgramKey, agent.language ?? "en", "nova-3", "linear16", 16e3);
4323
- }
4324
- return null;
3897
+ return createSTT(agent);
4325
3898
  }
4326
3899
  async queryTelephonyCost(metricsAcc, callId) {
4327
3900
  if (this.config.telnyxKey && callId) {
@@ -5760,6 +5333,94 @@ var init_tunnel = __esm({
5760
5333
  }
5761
5334
  });
5762
5335
 
5336
+ // src/carrier-config.ts
5337
+ var carrier_config_exports = {};
5338
+ __export(carrier_config_exports, {
5339
+ autoConfigureCarrier: () => autoConfigureCarrier,
5340
+ configureTelnyxNumber: () => configureTelnyxNumber,
5341
+ configureTwilioNumber: () => configureTwilioNumber
5342
+ });
5343
+ async function configureTwilioNumber(accountSid, authToken, phoneNumber, voiceUrl) {
5344
+ const auth = `Basic ${Buffer.from(`${accountSid}:${authToken}`).toString("base64")}`;
5345
+ const listUrl = `${TWILIO_API_BASE}/Accounts/${accountSid}/IncomingPhoneNumbers.json?PhoneNumber=${encodeURIComponent(phoneNumber)}`;
5346
+ const listResp = await fetch(listUrl, {
5347
+ method: "GET",
5348
+ headers: { Authorization: auth }
5349
+ });
5350
+ if (!listResp.ok) {
5351
+ throw new Error(
5352
+ `Twilio IncomingPhoneNumbers.list failed: ${listResp.status} ${await listResp.text()}`
5353
+ );
5354
+ }
5355
+ const body = await listResp.json();
5356
+ const match = body.incoming_phone_numbers?.[0];
5357
+ if (!match) {
5358
+ throw new Error(`Twilio number ${phoneNumber} not found on account ${accountSid}`);
5359
+ }
5360
+ const updateUrl = `${TWILIO_API_BASE}/Accounts/${accountSid}/IncomingPhoneNumbers/${match.sid}.json`;
5361
+ const form = new URLSearchParams({ VoiceUrl: voiceUrl, VoiceMethod: "POST" });
5362
+ const updateResp = await fetch(updateUrl, {
5363
+ method: "POST",
5364
+ headers: {
5365
+ Authorization: auth,
5366
+ "Content-Type": "application/x-www-form-urlencoded"
5367
+ },
5368
+ body: form.toString()
5369
+ });
5370
+ if (!updateResp.ok) {
5371
+ throw new Error(
5372
+ `Twilio IncomingPhoneNumbers.update failed: ${updateResp.status} ${await updateResp.text()}`
5373
+ );
5374
+ }
5375
+ }
5376
+ async function configureTelnyxNumber(apiKey, connectionId, phoneNumber) {
5377
+ const resp = await fetch(`${TELNYX_API_BASE}/phone_numbers/${encodeURIComponent(phoneNumber)}`, {
5378
+ method: "PATCH",
5379
+ headers: {
5380
+ Authorization: `Bearer ${apiKey}`,
5381
+ "Content-Type": "application/json"
5382
+ },
5383
+ body: JSON.stringify({ connection_id: connectionId })
5384
+ });
5385
+ if (!resp.ok) {
5386
+ throw new Error(
5387
+ `Telnyx PATCH /phone_numbers/${phoneNumber} failed: ${resp.status} ${await resp.text()}`
5388
+ );
5389
+ }
5390
+ }
5391
+ async function autoConfigureCarrier(params) {
5392
+ const log2 = getLogger();
5393
+ const provider = params.telephonyProvider ?? (params.twilioSid ? "twilio" : "telnyx");
5394
+ if (provider === "twilio" && params.twilioSid && params.twilioToken) {
5395
+ const voiceUrl = `https://${params.webhookHost}/webhooks/twilio/voice`;
5396
+ try {
5397
+ await configureTwilioNumber(params.twilioSid, params.twilioToken, params.phoneNumber, voiceUrl);
5398
+ log2.info("Twilio webhook set to %s", voiceUrl);
5399
+ } catch (err) {
5400
+ log2.warn("Could not auto-configure Twilio webhook: %s", err instanceof Error ? err.message : String(err));
5401
+ log2.info("Set webhook manually to: %s", voiceUrl);
5402
+ }
5403
+ return;
5404
+ }
5405
+ if (provider === "telnyx" && params.telnyxKey && params.telnyxConnectionId) {
5406
+ try {
5407
+ await configureTelnyxNumber(params.telnyxKey, params.telnyxConnectionId, params.phoneNumber);
5408
+ log2.info("Telnyx number %s associated with connection %s", params.phoneNumber, params.telnyxConnectionId);
5409
+ } catch (err) {
5410
+ log2.warn("Could not auto-configure Telnyx number: %s", err instanceof Error ? err.message : String(err));
5411
+ }
5412
+ }
5413
+ }
5414
+ var TWILIO_API_BASE, TELNYX_API_BASE;
5415
+ var init_carrier_config = __esm({
5416
+ "src/carrier-config.ts"() {
5417
+ "use strict";
5418
+ init_logger();
5419
+ TWILIO_API_BASE = "https://api.twilio.com/2010-04-01";
5420
+ TELNYX_API_BASE = "https://api.telnyx.com/v2";
5421
+ }
5422
+ });
5423
+
5763
5424
  // src/test-mode.ts
5764
5425
  var test_mode_exports = {};
5765
5426
  __export(test_mode_exports, {
@@ -6874,31 +6535,35 @@ var require_node_cron = __commonJS({
6874
6535
  var index_exports = {};
6875
6536
  __export(index_exports, {
6876
6537
  AllProvidersFailedError: () => AllProvidersFailedError,
6877
- AssemblyAISTT: () => AssemblyAISTT,
6538
+ AssemblyAISTT: () => STT5,
6878
6539
  AuthenticationError: () => AuthenticationError,
6879
6540
  BackgroundAudioPlayer: () => BackgroundAudioPlayer,
6880
6541
  BuiltinAudioClip: () => BuiltinAudioClip,
6881
6542
  CallMetricsAccumulator: () => CallMetricsAccumulator,
6882
- CartesiaSTT: () => CartesiaSTT,
6883
- CartesiaTTS: () => CartesiaTTS,
6543
+ CartesiaSTT: () => STT3,
6544
+ CartesiaTTS: () => TTS3,
6884
6545
  ChatContext: () => ChatContext,
6546
+ CloudflareTunnel: () => CloudflareTunnel,
6885
6547
  DEFAULT_MIN_SENTENCE_LEN: () => DEFAULT_MIN_SENTENCE_LEN,
6886
6548
  DEFAULT_PRICING: () => DEFAULT_PRICING,
6887
6549
  DTMF_EVENTS: () => DTMF_EVENTS,
6888
- DeepgramSTT: () => DeepgramSTT,
6550
+ DeepgramSTT: () => STT,
6551
+ ElevenLabsConvAI: () => ConvAI,
6889
6552
  ElevenLabsConvAIAdapter: () => ElevenLabsConvAIAdapter,
6890
- ElevenLabsTTS: () => ElevenLabsTTS,
6553
+ ElevenLabsTTS: () => TTS,
6891
6554
  FallbackLLMProvider: () => FallbackLLMProvider,
6892
6555
  GEMINI_DEFAULT_INPUT_SR: () => GEMINI_DEFAULT_INPUT_SR,
6893
6556
  GEMINI_DEFAULT_OUTPUT_SR: () => GEMINI_DEFAULT_OUTPUT_SR,
6894
6557
  GeminiLiveAdapter: () => GeminiLiveAdapter,
6558
+ Guardrail: () => Guardrail,
6895
6559
  IVRActivity: () => IVRActivity,
6896
6560
  LLMLoop: () => LLMLoop,
6897
- LMNTTTS: () => LMNTTTS,
6561
+ LMNTTTS: () => TTS5,
6898
6562
  MetricsStore: () => MetricsStore,
6899
6563
  OpenAILLMProvider: () => OpenAILLMProvider,
6564
+ OpenAIRealtime: () => Realtime,
6900
6565
  OpenAIRealtimeAdapter: () => OpenAIRealtimeAdapter,
6901
- OpenAITTS: () => OpenAITTS,
6566
+ OpenAITTS: () => TTS2,
6902
6567
  PartialStreamError: () => PartialStreamError,
6903
6568
  Patter: () => Patter,
6904
6569
  PatterConnectionError: () => PatterConnectionError,
@@ -6906,15 +6571,19 @@ __export(index_exports, {
6906
6571
  PipelineHookExecutor: () => PipelineHookExecutor,
6907
6572
  ProvisionError: () => ProvisionError,
6908
6573
  RemoteMessageHandler: () => RemoteMessageHandler,
6909
- RimeTTS: () => RimeTTS,
6574
+ RimeTTS: () => TTS4,
6910
6575
  SentenceChunker: () => SentenceChunker,
6911
- SonioxSTT: () => SonioxSTT,
6576
+ SonioxSTT: () => STT4,
6577
+ StaticTunnel: () => Static,
6578
+ Telnyx: () => Carrier2,
6912
6579
  TestSession: () => TestSession,
6913
6580
  TfidfLoopDetector: () => TfidfLoopDetector,
6581
+ Tool: () => Tool,
6582
+ Twilio: () => Carrier,
6914
6583
  ULTRAVOX_DEFAULT_API_BASE: () => ULTRAVOX_DEFAULT_API_BASE,
6915
6584
  ULTRAVOX_DEFAULT_SR: () => ULTRAVOX_DEFAULT_SR,
6916
6585
  UltravoxRealtimeAdapter: () => UltravoxRealtimeAdapter,
6917
- WhisperSTT: () => WhisperSTT,
6586
+ WhisperSTT: () => STT2,
6918
6587
  builtinClipPath: () => builtinClipPath,
6919
6588
  calculateRealtimeCost: () => calculateRealtimeCost,
6920
6589
  calculateSttCost: () => calculateSttCost,
@@ -6930,6 +6599,7 @@ __export(index_exports, {
6930
6599
  filterMarkdown: () => filterMarkdown,
6931
6600
  formatDtmf: () => formatDtmf,
6932
6601
  getLogger: () => getLogger,
6602
+ guardrail: () => guardrail,
6933
6603
  isRemoteUrl: () => isRemoteUrl,
6934
6604
  isWebSocketUrl: () => isWebSocketUrl,
6935
6605
  makeAuthMiddleware: () => makeAuthMiddleware,
@@ -6951,6 +6621,7 @@ __export(index_exports, {
6951
6621
  selectSoundFromList: () => selectSoundFromList,
6952
6622
  setLogger: () => setLogger,
6953
6623
  startTunnel: () => startTunnel,
6624
+ tool: () => tool,
6954
6625
  whisper: () => whisper
6955
6626
  });
6956
6627
  module.exports = __toCommonJS(index_exports);
@@ -7096,77 +6767,69 @@ var PatterConnection = class {
7096
6767
  }
7097
6768
  };
7098
6769
 
7099
- // src/providers.ts
7100
- var STTConfigImpl = class {
7101
- provider;
6770
+ // src/client.ts
6771
+ init_server();
6772
+
6773
+ // src/engines/openai.ts
6774
+ var Realtime = class {
6775
+ kind = "openai_realtime";
7102
6776
  apiKey;
7103
- language;
7104
- options;
7105
- constructor(provider, apiKey, language = "en", options) {
7106
- this.provider = provider;
7107
- this.apiKey = apiKey;
7108
- this.language = language;
7109
- if (options) this.options = options;
7110
- }
7111
- toDict() {
7112
- const out = {
7113
- provider: this.provider,
7114
- api_key: this.apiKey,
7115
- language: this.language
7116
- };
7117
- if (this.options) out.options = { ...this.options };
7118
- return out;
6777
+ model;
6778
+ voice;
6779
+ constructor(opts = {}) {
6780
+ const key = opts.apiKey ?? process.env.OPENAI_API_KEY;
6781
+ if (!key) {
6782
+ throw new Error(
6783
+ "OpenAI Realtime requires an apiKey. Pass { apiKey: 'sk-...' } or set OPENAI_API_KEY in the environment."
6784
+ );
6785
+ }
6786
+ this.apiKey = key;
6787
+ this.model = opts.model ?? "gpt-4o-mini-realtime-preview";
6788
+ this.voice = opts.voice ?? "alloy";
7119
6789
  }
7120
6790
  };
7121
- var TTSConfigImpl = class {
7122
- provider;
6791
+
6792
+ // src/engines/elevenlabs.ts
6793
+ var ConvAI = class {
6794
+ kind = "elevenlabs_convai";
7123
6795
  apiKey;
6796
+ agentId;
7124
6797
  voice;
7125
- constructor(provider, apiKey, voice = "alloy") {
7126
- this.provider = provider;
7127
- this.apiKey = apiKey;
7128
- this.voice = voice;
6798
+ constructor(opts = {}) {
6799
+ const key = opts.apiKey ?? process.env.ELEVENLABS_API_KEY;
6800
+ const agent = opts.agentId ?? process.env.ELEVENLABS_AGENT_ID;
6801
+ if (!key) {
6802
+ throw new Error(
6803
+ "ElevenLabs ConvAI requires an apiKey. Pass { apiKey: '...' } or set ELEVENLABS_API_KEY in the environment."
6804
+ );
6805
+ }
6806
+ if (!agent) {
6807
+ throw new Error(
6808
+ "ElevenLabs ConvAI requires an agentId. Pass { agentId: 'agent_...' } or set ELEVENLABS_AGENT_ID in the environment."
6809
+ );
6810
+ }
6811
+ this.apiKey = key;
6812
+ this.agentId = agent;
6813
+ this.voice = opts.voice;
7129
6814
  }
7130
- toDict() {
7131
- return { provider: this.provider, api_key: this.apiKey, voice: this.voice };
6815
+ };
6816
+
6817
+ // src/tunnels/index.ts
6818
+ var CloudflareTunnel = class {
6819
+ kind = "cloudflare";
6820
+ };
6821
+ var Static = class {
6822
+ kind = "static";
6823
+ hostname;
6824
+ constructor(opts) {
6825
+ if (!opts.hostname) {
6826
+ throw new Error("Static tunnel requires a non-empty hostname.");
6827
+ }
6828
+ this.hostname = opts.hostname;
7132
6829
  }
7133
6830
  };
7134
- function deepgram(opts) {
7135
- const options = {
7136
- model: opts.model ?? "nova-3",
7137
- endpointing_ms: opts.endpointingMs ?? 150,
7138
- utterance_end_ms: opts.utteranceEndMs === null ? null : opts.utteranceEndMs ?? 1e3,
7139
- smart_format: opts.smartFormat ?? true,
7140
- interim_results: opts.interimResults ?? true
7141
- };
7142
- if (opts.vadEvents !== void 0) options.vad_events = opts.vadEvents;
7143
- return new STTConfigImpl("deepgram", opts.apiKey, opts.language ?? "en", options);
7144
- }
7145
- function whisper(opts) {
7146
- return new STTConfigImpl("whisper", opts.apiKey, opts.language ?? "en");
7147
- }
7148
- function elevenlabs(opts) {
7149
- return new TTSConfigImpl("elevenlabs", opts.apiKey, opts.voice ?? "rachel");
7150
- }
7151
- function openaiTts(opts) {
7152
- return new TTSConfigImpl("openai", opts.apiKey, opts.voice ?? "alloy");
7153
- }
7154
- function cartesia(opts) {
7155
- return new TTSConfigImpl(
7156
- "cartesia",
7157
- opts.apiKey,
7158
- opts.voice ?? "f786b574-daa5-4673-aa0c-cbe3e8534c02"
7159
- );
7160
- }
7161
- function rime(opts) {
7162
- return new TTSConfigImpl("rime", opts.apiKey, opts.voice ?? "astra");
7163
- }
7164
- function lmnt(opts) {
7165
- return new TTSConfigImpl("lmnt", opts.apiKey, opts.voice ?? "leah");
7166
- }
7167
6831
 
7168
6832
  // src/client.ts
7169
- init_server();
7170
6833
  var DEFAULT_BACKEND_URL2 = "wss://api.getpatter.com";
7171
6834
  var DEFAULT_REST_URL = "https://api.getpatter.com";
7172
6835
  function sttConfigToDict(cfg) {
@@ -7197,21 +6860,39 @@ var Patter = class {
7197
6860
  embeddedServer = null;
7198
6861
  tunnelHandle = null;
7199
6862
  constructor(options) {
7200
- const isLocal = "mode" in options && options.mode === "local" || !("apiKey" in options) && ("twilioSid" in options && options.twilioSid || "telnyxKey" in options && options.telnyxKey);
6863
+ const hasCarrier = "carrier" in options && options.carrier !== void 0;
6864
+ const isLocal = "mode" in options && options.mode === "local" || hasCarrier;
7201
6865
  if (isLocal) {
7202
6866
  const local = options;
7203
6867
  if (!local.phoneNumber) {
7204
6868
  throw new Error("Local mode requires phoneNumber");
7205
6869
  }
7206
- if (!local.twilioSid && !local.telnyxKey) {
7207
- throw new Error("Local mode requires twilioSid or telnyxKey");
6870
+ if (!local.carrier) {
6871
+ throw new Error(
6872
+ "Local mode requires a `carrier` instance. Pass `carrier: new Twilio({...})` or `carrier: new Telnyx({...})`."
6873
+ );
7208
6874
  }
7209
- if (local.twilioSid && !local.twilioToken) {
7210
- throw new Error("twilioToken is required when using twilioSid");
6875
+ const carrier = local.carrier;
6876
+ const tunnel = local.tunnel;
6877
+ let tunnelWebhookUrl;
6878
+ if (tunnel instanceof Static) {
6879
+ if (local.webhookUrl) {
6880
+ throw new Error(
6881
+ "Cannot use both `tunnel: new StaticTunnel(...)` and `webhookUrl`. Pick one."
6882
+ );
6883
+ }
6884
+ tunnelWebhookUrl = tunnel.hostname;
7211
6885
  }
7212
6886
  this.mode = "local";
7213
- const normalizedLocal = local.webhookUrl ? { ...local, webhookUrl: local.webhookUrl.replace(/^https?:\/\//, "").replace(/\/$/, "") } : local;
7214
- this.localConfig = normalizedLocal;
6887
+ const rawWebhook = tunnelWebhookUrl ?? local.webhookUrl;
6888
+ const normalizedWebhook = rawWebhook ? rawWebhook.replace(/^https?:\/\//, "").replace(/\/$/, "") : void 0;
6889
+ this.localConfig = {
6890
+ carrier,
6891
+ phoneNumber: local.phoneNumber,
6892
+ webhookUrl: normalizedWebhook,
6893
+ tunnel: local.tunnel,
6894
+ openaiKey: local.openaiKey
6895
+ };
7215
6896
  this.apiKey = "";
7216
6897
  this.backendUrl = DEFAULT_BACKEND_URL2;
7217
6898
  this.restUrl = DEFAULT_REST_URL;
@@ -7228,25 +6909,55 @@ var Patter = class {
7228
6909
  }
7229
6910
  // === Local mode ===
7230
6911
  agent(opts) {
7231
- if (opts.provider) {
6912
+ let working = { ...opts };
6913
+ if (opts.engine) {
6914
+ if (opts.provider) {
6915
+ throw new Error(
6916
+ "Cannot pass both `engine:` and `provider:`. Use one (engine is preferred)."
6917
+ );
6918
+ }
6919
+ const engine = opts.engine;
6920
+ if (engine instanceof Realtime) {
6921
+ working = {
6922
+ ...working,
6923
+ provider: "openai_realtime",
6924
+ model: working.model ?? engine.model,
6925
+ voice: working.voice ?? engine.voice
6926
+ };
6927
+ if (this.localConfig && !this.localConfig.openaiKey) {
6928
+ this.localConfig = { ...this.localConfig, openaiKey: engine.apiKey };
6929
+ }
6930
+ } else if (engine instanceof ConvAI) {
6931
+ working = {
6932
+ ...working,
6933
+ provider: "elevenlabs_convai",
6934
+ voice: working.voice ?? engine.voice
6935
+ };
6936
+ } else {
6937
+ throw new Error(
6938
+ "Unknown engine. Expected OpenAIRealtime or ElevenLabsConvAI instance."
6939
+ );
6940
+ }
6941
+ }
6942
+ if (working.provider) {
7232
6943
  const valid = ["openai_realtime", "elevenlabs_convai", "pipeline"];
7233
- if (!valid.includes(opts.provider)) {
7234
- throw new Error(`provider must be one of: ${valid.join(", ")}. Got: '${opts.provider}'`);
6944
+ if (!valid.includes(working.provider)) {
6945
+ throw new Error(`provider must be one of: ${valid.join(", ")}. Got: '${working.provider}'`);
7235
6946
  }
7236
6947
  }
7237
- if (opts.tools) {
7238
- if (!Array.isArray(opts.tools)) {
6948
+ if (working.tools) {
6949
+ if (!Array.isArray(working.tools)) {
7239
6950
  throw new TypeError("tools must be an array");
7240
6951
  }
7241
- opts.tools.forEach((tool, i) => {
7242
- if (!tool.name) throw new Error(`tools[${i}] missing required 'name' field`);
7243
- if (!tool.webhookUrl && !tool.handler) throw new Error(`tools[${i}] requires either 'webhookUrl' or 'handler'`);
6952
+ working.tools.forEach((tool2, i) => {
6953
+ if (!tool2.name) throw new Error(`tools[${i}] missing required 'name' field`);
6954
+ if (!tool2.webhookUrl && !tool2.handler) throw new Error(`tools[${i}] requires either 'webhookUrl' or 'handler'`);
7244
6955
  });
7245
6956
  }
7246
- if (opts.variables !== void 0 && (typeof opts.variables !== "object" || Array.isArray(opts.variables))) {
6957
+ if (working.variables !== void 0 && (typeof working.variables !== "object" || Array.isArray(working.variables))) {
7247
6958
  throw new TypeError("variables must be an object");
7248
6959
  }
7249
- return { ...opts };
6960
+ return working;
7250
6961
  }
7251
6962
  async serve(opts) {
7252
6963
  if (this.mode !== "local" || !this.localConfig) {
@@ -7269,10 +6980,14 @@ var Patter = class {
7269
6980
  }
7270
6981
  let webhookUrl = this.localConfig.webhookUrl ?? "";
7271
6982
  const port = opts.port ?? 8e3;
7272
- if (opts.tunnel && webhookUrl) {
6983
+ const ctorTunnel = this.localConfig.tunnel;
6984
+ const wantsCloudflaredFromServe = opts.tunnel === true;
6985
+ const wantsCloudflaredFromCtor = ctorTunnel === true || ctorTunnel instanceof CloudflareTunnel;
6986
+ const wantsCloudflared = wantsCloudflaredFromServe || wantsCloudflaredFromCtor;
6987
+ if (wantsCloudflared && webhookUrl) {
7273
6988
  throw new Error("Cannot use both tunnel: true and webhookUrl. Pick one.");
7274
6989
  }
7275
- if (opts.tunnel) {
6990
+ if (wantsCloudflared) {
7276
6991
  const { startTunnel: startTunnel2 } = await Promise.resolve().then(() => (init_tunnel(), tunnel_exports));
7277
6992
  this.tunnelHandle = await startTunnel2(port);
7278
6993
  webhookUrl = this.tunnelHandle.hostname;
@@ -7282,17 +6997,29 @@ var Patter = class {
7282
6997
  "No webhookUrl configured. Either:\n - Pass webhookUrl in the Patter constructor\n - Use tunnel: true in serve() to auto-create a tunnel"
7283
6998
  );
7284
6999
  }
7000
+ const carrier = this.localConfig.carrier;
7001
+ const telephonyProvider = carrier.kind === "twilio" ? "twilio" : "telnyx";
7002
+ const { autoConfigureCarrier: autoConfigureCarrier2 } = await Promise.resolve().then(() => (init_carrier_config(), carrier_config_exports));
7003
+ await autoConfigureCarrier2({
7004
+ telephonyProvider,
7005
+ twilioSid: carrier.kind === "twilio" ? carrier.accountSid : void 0,
7006
+ twilioToken: carrier.kind === "twilio" ? carrier.authToken : void 0,
7007
+ telnyxKey: carrier.kind === "telnyx" ? carrier.apiKey : void 0,
7008
+ telnyxConnectionId: carrier.kind === "telnyx" ? carrier.connectionId : void 0,
7009
+ phoneNumber: this.localConfig.phoneNumber,
7010
+ webhookHost: webhookUrl
7011
+ });
7285
7012
  this.embeddedServer = new EmbeddedServer(
7286
7013
  {
7287
- twilioSid: this.localConfig.twilioSid,
7288
- twilioToken: this.localConfig.twilioToken,
7014
+ twilioSid: carrier.kind === "twilio" ? carrier.accountSid : void 0,
7015
+ twilioToken: carrier.kind === "twilio" ? carrier.authToken : void 0,
7289
7016
  openaiKey: this.localConfig.openaiKey,
7290
7017
  phoneNumber: this.localConfig.phoneNumber,
7291
7018
  webhookUrl,
7292
- telephonyProvider: this.localConfig.telephonyProvider,
7293
- telnyxKey: this.localConfig.telnyxKey,
7294
- telnyxConnectionId: this.localConfig.telnyxConnectionId,
7295
- telnyxPublicKey: this.localConfig.telnyxPublicKey
7019
+ telephonyProvider,
7020
+ telnyxKey: carrier.kind === "telnyx" ? carrier.apiKey : void 0,
7021
+ telnyxConnectionId: carrier.kind === "telnyx" ? carrier.connectionId : void 0,
7022
+ telnyxPublicKey: carrier.kind === "telnyx" ? carrier.publicKey : void 0
7296
7023
  },
7297
7024
  opts.agent,
7298
7025
  opts.onCallStart,
@@ -7353,10 +7080,10 @@ var Patter = class {
7353
7080
  if (!this.localConfig) {
7354
7081
  throw new Error("local config missing");
7355
7082
  }
7356
- const { phoneNumber, webhookUrl, telephonyProvider } = this.localConfig;
7357
- if (telephonyProvider === "telnyx") {
7358
- const telnyxKey = this.localConfig.telnyxKey ?? "";
7359
- const connectionId = this.localConfig.telnyxConnectionId ?? "";
7083
+ const { phoneNumber, webhookUrl, carrier } = this.localConfig;
7084
+ if (carrier.kind === "telnyx") {
7085
+ const telnyxKey = carrier.apiKey;
7086
+ const connectionId = carrier.connectionId;
7360
7087
  const streamUrl = `wss://${webhookUrl}/ws/stream/${encodeURIComponent(localOpts.to)}?caller=${encodeURIComponent(phoneNumber)}&callee=${encodeURIComponent(localOpts.to)}`;
7361
7088
  const telnyxPayload = {
7362
7089
  connection_id: connectionId,
@@ -7396,8 +7123,8 @@ var Patter = class {
7396
7123
  }
7397
7124
  return;
7398
7125
  }
7399
- const twilioSid = this.localConfig.twilioSid ?? "";
7400
- const twilioToken = this.localConfig.twilioToken ?? "";
7126
+ const twilioSid = carrier.accountSid;
7127
+ const twilioToken = carrier.authToken;
7401
7128
  const statusCallbackUrl = `https://${webhookUrl}/webhooks/twilio/status`;
7402
7129
  const url = `https://api.twilio.com/2010-04-01/Accounts/${twilioSid}/Calls.json`;
7403
7130
  const params = new URLSearchParams({
@@ -7529,65 +7256,6 @@ var Patter = class {
7529
7256
  const data = await response.json();
7530
7257
  return data.map((c) => ({ id: c.id, direction: c.direction, caller: c.caller, callee: c.callee, startedAt: c.started_at, endedAt: c.ended_at, durationSeconds: c.duration_seconds, status: c.status, transcript: c.transcript }));
7531
7258
  }
7532
- // Provider helpers — mirror the Python classmethod factories so callers can
7533
- // write ``Patter.deepgram({ apiKey })`` without importing the top-level.
7534
- static deepgram = deepgram;
7535
- static whisper = whisper;
7536
- static elevenlabs = elevenlabs;
7537
- static openaiTts = openaiTts;
7538
- static cartesia = cartesia;
7539
- static rime = rime;
7540
- static lmnt = lmnt;
7541
- static guardrail(opts) {
7542
- return {
7543
- name: opts.name,
7544
- blockedTerms: opts.blockedTerms,
7545
- check: opts.check,
7546
- replacement: opts.replacement ?? "I'm sorry, I can't respond to that."
7547
- };
7548
- }
7549
- /**
7550
- * Create a tool definition for use with `agent({ tools: [...] })`.
7551
- *
7552
- * Either `handler` (a function) or `webhookUrl` must be provided.
7553
- *
7554
- * @param opts.name - Tool name (visible to the LLM).
7555
- * @param opts.description - What the tool does (visible to the LLM).
7556
- * @param opts.parameters - JSON Schema for tool arguments.
7557
- * @param opts.handler - Async function called in-process when the LLM invokes the tool.
7558
- * @param opts.webhookUrl - URL to POST to when the LLM invokes the tool.
7559
- *
7560
- * @example
7561
- * ```ts
7562
- * phone.agent({
7563
- * systemPrompt: 'You are a pizza bot.',
7564
- * tools: [
7565
- * Patter.tool({
7566
- * name: 'check_menu',
7567
- * description: 'Check available menu items',
7568
- * handler: async (args) => JSON.stringify({ items: ['margherita'] }),
7569
- * }),
7570
- * ],
7571
- * });
7572
- * ```
7573
- */
7574
- static tool(opts) {
7575
- if (!opts.handler && !opts.webhookUrl) {
7576
- throw new Error("tool() requires either handler or webhookUrl");
7577
- }
7578
- const t = {
7579
- name: opts.name,
7580
- description: opts.description ?? "",
7581
- parameters: opts.parameters ?? { type: "object", properties: {} }
7582
- };
7583
- if (opts.handler) {
7584
- t.handler = opts.handler;
7585
- }
7586
- if (opts.webhookUrl) {
7587
- t.webhookUrl = opts.webhookUrl;
7588
- }
7589
- return t;
7590
- }
7591
7259
  // Internal
7592
7260
  async registerNumber(provider, providerKey, number, providerSecret, country = "US", stt, tts) {
7593
7261
  const credentials = { api_key: providerKey };
@@ -7682,8 +7350,64 @@ function filterForTTS(text) {
7682
7350
  return filterEmoji(filterMarkdown(text));
7683
7351
  }
7684
7352
 
7685
- // src/index.ts
7686
- init_pricing();
7353
+ // src/providers.ts
7354
+ var STTConfigImpl = class {
7355
+ provider;
7356
+ apiKey;
7357
+ language;
7358
+ options;
7359
+ constructor(provider, apiKey, language = "en", options) {
7360
+ this.provider = provider;
7361
+ this.apiKey = apiKey;
7362
+ this.language = language;
7363
+ if (options) this.options = options;
7364
+ }
7365
+ toDict() {
7366
+ const out = {
7367
+ provider: this.provider,
7368
+ api_key: this.apiKey,
7369
+ language: this.language
7370
+ };
7371
+ if (this.options) out.options = { ...this.options };
7372
+ return out;
7373
+ }
7374
+ };
7375
+ var TTSConfigImpl = class {
7376
+ provider;
7377
+ apiKey;
7378
+ voice;
7379
+ constructor(provider, apiKey, voice = "alloy") {
7380
+ this.provider = provider;
7381
+ this.apiKey = apiKey;
7382
+ this.voice = voice;
7383
+ }
7384
+ toDict() {
7385
+ return { provider: this.provider, api_key: this.apiKey, voice: this.voice };
7386
+ }
7387
+ };
7388
+ function deepgram(opts) {
7389
+ const options = {
7390
+ model: opts.model ?? "nova-3",
7391
+ endpointing_ms: opts.endpointingMs ?? 150,
7392
+ utterance_end_ms: opts.utteranceEndMs === null ? null : opts.utteranceEndMs ?? 1e3,
7393
+ smart_format: opts.smartFormat ?? true,
7394
+ interim_results: opts.interimResults ?? true
7395
+ };
7396
+ if (opts.vadEvents !== void 0) options.vad_events = opts.vadEvents;
7397
+ return new STTConfigImpl("deepgram", opts.apiKey, opts.language ?? "en", options);
7398
+ }
7399
+ function whisper(opts) {
7400
+ return new STTConfigImpl("whisper", opts.apiKey, opts.language ?? "en");
7401
+ }
7402
+ function elevenlabs(opts) {
7403
+ return new TTSConfigImpl("elevenlabs", opts.apiKey, opts.voice ?? "rachel");
7404
+ }
7405
+ function openaiTts(opts) {
7406
+ return new TTSConfigImpl("openai", opts.apiKey, opts.voice ?? "alloy");
7407
+ }
7408
+
7409
+ // src/index.ts
7410
+ init_pricing();
7687
7411
  init_metrics();
7688
7412
  init_store();
7689
7413
  init_auth();
@@ -8372,166 +8096,480 @@ function scheduleInterval(intervalOrOpts, callback) {
8372
8096
  };
8373
8097
  }
8374
8098
 
8375
- // src/index.ts
8099
+ // src/stt/deepgram.ts
8376
8100
  init_deepgram_stt();
8377
-
8378
- // src/providers/soniox-stt.ts
8379
- var import_ws7 = __toESM(require("ws"));
8380
- init_logger();
8381
- var SONIOX_WS_URL = "wss://stt-rt.soniox.com/transcribe-websocket";
8382
- var KEEPALIVE_MESSAGE = '{"type": "keepalive"}';
8383
- var END_TOKEN = "<end>";
8384
- var FINALIZED_TOKEN = "<fin>";
8385
- var KEEPALIVE_INTERVAL_MS = 5e3;
8386
- function isEndToken(token) {
8387
- return token.text === END_TOKEN || token.text === FINALIZED_TOKEN;
8388
- }
8389
- var TokenAccumulator = class {
8390
- text = "";
8391
- confSum = 0;
8392
- confCount = 0;
8393
- update(token) {
8394
- if (token.text) {
8395
- this.text += token.text;
8396
- }
8397
- if (typeof token.confidence === "number") {
8398
- this.confSum += token.confidence;
8399
- this.confCount += 1;
8101
+ var STT = class extends DeepgramSTT {
8102
+ constructor(opts = {}) {
8103
+ const key = opts.apiKey ?? process.env.DEEPGRAM_API_KEY;
8104
+ if (!key) {
8105
+ throw new Error(
8106
+ "Deepgram STT requires an apiKey. Pass { apiKey: 'dg_...' } or set DEEPGRAM_API_KEY in the environment."
8107
+ );
8400
8108
  }
8401
- }
8402
- get confidence() {
8403
- return this.confCount === 0 ? 0 : this.confSum / this.confCount;
8404
- }
8405
- reset() {
8406
- this.text = "";
8407
- this.confSum = 0;
8408
- this.confCount = 0;
8409
- }
8410
- get raw() {
8411
- return { sum: this.confSum, count: this.confCount };
8109
+ super(
8110
+ key,
8111
+ opts.language ?? "en",
8112
+ opts.model ?? "nova-3",
8113
+ opts.encoding ?? "linear16",
8114
+ opts.sampleRate ?? 16e3,
8115
+ {
8116
+ endpointingMs: opts.endpointingMs ?? 150,
8117
+ utteranceEndMs: opts.utteranceEndMs === null ? null : opts.utteranceEndMs ?? 1e3,
8118
+ smartFormat: opts.smartFormat ?? true,
8119
+ interimResults: opts.interimResults ?? true,
8120
+ ...opts.vadEvents !== void 0 ? { vadEvents: opts.vadEvents } : {}
8121
+ }
8122
+ );
8412
8123
  }
8413
8124
  };
8414
- var SonioxSTT = class _SonioxSTT {
8415
- ws = null;
8416
- callbacks = [];
8417
- final = new TokenAccumulator();
8418
- keepaliveTimer = null;
8125
+
8126
+ // src/providers/whisper-stt.ts
8127
+ init_logger();
8128
+ var OPENAI_TRANSCRIPTION_URL = "https://api.openai.com/v1/audio/transcriptions";
8129
+ var DEFAULT_BUFFER_SIZE = 16e3 * 2;
8130
/**
 * Prefix raw PCM bytes with a 44-byte RIFF/WAVE header.
 *
 * @param pcm Buffer of raw PCM audio; its byte length becomes the data size.
 * @param sampleRate Samples per second written to the header (default 16000).
 * @param channels Channel count written to the header (default 1).
 * @param bitsPerSample Bit depth written to the header (default 16).
 * @returns A new Buffer containing the header followed by `pcm`.
 */
function wrapPcmInWav(pcm, sampleRate = 16e3, channels = 1, bitsPerSample = 16) {
  const dataSize = pcm.length;
  const bytesPerSample = bitsPerSample / 8;
  const header = Buffer.alloc(44);
  // [Buffer method, value, byte offset] — applied in file order.
  const layout = [
    ["write", "RIFF", 0],
    ["writeUInt32LE", 36 + dataSize, 4], // RIFF chunk size: rest of header + data
    ["write", "WAVE", 8],
    ["write", "fmt ", 12],
    ["writeUInt32LE", 16, 16], // fmt sub-chunk size
    ["writeUInt16LE", 1, 20], // audio format 1 = PCM
    ["writeUInt16LE", channels, 22],
    ["writeUInt32LE", sampleRate, 24],
    ["writeUInt32LE", sampleRate * channels * bytesPerSample, 28], // byte rate
    ["writeUInt16LE", channels * bytesPerSample, 32], // block align
    ["writeUInt16LE", bitsPerSample, 34],
    ["write", "data", 36],
    ["writeUInt32LE", dataSize, 40]
  ];
  for (const [method, value, offset] of layout) {
    header[method](value, offset);
  }
  return Buffer.concat([header, pcm]);
}
8148
+ var WhisperSTT = class _WhisperSTT {
8419
8149
  apiKey;
8420
8150
  model;
8421
- languageHints;
8422
- languageHintsStrict;
8423
- sampleRate;
8424
- numChannels;
8425
- enableSpeakerDiarization;
8426
- enableLanguageIdentification;
8427
- maxEndpointDelayMs;
8428
- clientReferenceId;
8429
- baseUrl;
8430
- constructor(apiKey, options = {}) {
8431
- if (!apiKey) {
8432
- throw new Error("Soniox apiKey is required");
8433
- }
8434
- const maxEndpointDelayMs = options.maxEndpointDelayMs ?? 500;
8435
- if (maxEndpointDelayMs < 500 || maxEndpointDelayMs > 3e3) {
8436
- throw new Error("maxEndpointDelayMs must be between 500 and 3000");
8437
- }
8151
+ language;
8152
+ bufferSize;
8153
+ buffer = Buffer.alloc(0);
8154
+ callbacks = [];
8155
+ running = false;
8156
+ pendingTranscriptions = [];
8157
+ constructor(apiKey, model = "whisper-1", language, bufferSize = DEFAULT_BUFFER_SIZE) {
8438
8158
  this.apiKey = apiKey;
8439
- this.model = options.model ?? "stt-rt-v4";
8440
- this.languageHints = options.languageHints;
8441
- this.languageHintsStrict = options.languageHintsStrict ?? false;
8442
- this.sampleRate = options.sampleRate ?? 16e3;
8443
- this.numChannels = options.numChannels ?? 1;
8444
- this.enableSpeakerDiarization = options.enableSpeakerDiarization ?? false;
8445
- this.enableLanguageIdentification = options.enableLanguageIdentification ?? true;
8446
- this.maxEndpointDelayMs = maxEndpointDelayMs;
8447
- this.clientReferenceId = options.clientReferenceId;
8448
- this.baseUrl = options.baseUrl ?? SONIOX_WS_URL;
8449
- }
8450
- /** Factory for Twilio-style 8 kHz linear PCM. */
8451
- static forTwilio(apiKey, languageHints) {
8452
- return new _SonioxSTT(apiKey, { sampleRate: 8e3, languageHints });
8159
+ this.model = model;
8160
+ this.language = language;
8161
+ this.bufferSize = bufferSize;
8453
8162
  }
8454
- buildConfig() {
8455
- const config = {
8456
- api_key: this.apiKey,
8457
- model: this.model,
8458
- audio_format: "pcm_s16le",
8459
- num_channels: this.numChannels,
8460
- sample_rate: this.sampleRate,
8461
- enable_endpoint_detection: true,
8462
- enable_speaker_diarization: this.enableSpeakerDiarization,
8463
- enable_language_identification: this.enableLanguageIdentification,
8464
- max_endpoint_delay_ms: this.maxEndpointDelayMs
8465
- };
8466
- if (this.languageHints) {
8467
- config.language_hints = this.languageHints;
8468
- config.language_hints_strict = this.languageHintsStrict;
8469
- }
8470
- if (this.clientReferenceId) {
8471
- config.client_reference_id = this.clientReferenceId;
8472
- }
8473
- return config;
8163
+ /** Factory for Twilio calls — mulaw 8 kHz is transcoded upstream, so we still receive PCM 16-bit. */
8164
+ static forTwilio(apiKey, language = "en", model = "whisper-1") {
8165
+ return new _WhisperSTT(apiKey, model, language);
8474
8166
  }
8475
8167
  async connect() {
8476
- this.ws = new import_ws7.default(this.baseUrl);
8477
- await new Promise((resolve, reject) => {
8478
- const timer = setTimeout(() => reject(new Error("Soniox connect timeout")), 1e4);
8479
- this.ws.once("open", () => {
8480
- clearTimeout(timer);
8481
- resolve();
8482
- });
8483
- this.ws.once("error", (err) => {
8484
- clearTimeout(timer);
8485
- reject(err);
8486
- });
8487
- });
8488
- this.ws.send(JSON.stringify(this.buildConfig()));
8489
- this.ws.on("message", (raw) => this.handleMessage(raw.toString()));
8490
- this.ws.on("close", () => this.clearKeepalive());
8491
- this.ws.on("error", (err) => {
8492
- getLogger().error(`SonioxSTT WebSocket error: ${String(err)}`);
8168
+ this.running = true;
8169
+ this.buffer = Buffer.alloc(0);
8170
+ }
8171
+ sendAudio(audio) {
8172
+ if (!this.running) return;
8173
+ this.buffer = Buffer.concat([this.buffer, audio]);
8174
+ if (this.buffer.length >= this.bufferSize) {
8175
+ const pcm = this.buffer;
8176
+ this.buffer = Buffer.alloc(0);
8177
+ this.trackTranscription(this.transcribeBuffer(pcm));
8178
+ }
8179
+ }
8180
+ trackTranscription(promise) {
8181
+ const wrapped = promise.finally(() => {
8182
+ const idx = this.pendingTranscriptions.indexOf(wrapped);
8183
+ if (idx !== -1) this.pendingTranscriptions.splice(idx, 1);
8493
8184
  });
8494
- this.keepaliveTimer = setInterval(() => {
8495
- if (this.ws && this.ws.readyState === import_ws7.default.OPEN) {
8496
- try {
8497
- this.ws.send(KEEPALIVE_MESSAGE);
8498
- } catch {
8499
- }
8500
- }
8501
- }, KEEPALIVE_INTERVAL_MS);
8185
+ this.pendingTranscriptions.push(wrapped);
8502
8186
  }
8503
- clearKeepalive() {
8504
- if (this.keepaliveTimer) {
8505
- clearInterval(this.keepaliveTimer);
8506
- this.keepaliveTimer = null;
8187
+ onTranscript(callback) {
8188
+ if (this.callbacks.length >= 10) {
8189
+ getLogger().warn("WhisperSTT: maximum of 10 onTranscript callbacks reached; replacing the last callback.");
8190
+ this.callbacks[this.callbacks.length - 1] = callback;
8191
+ return;
8507
8192
  }
8193
+ this.callbacks.push(callback);
8508
8194
  }
8509
- handleMessage(raw) {
8510
- let content;
8195
+ async close() {
8196
+ this.running = false;
8197
+ if (this.buffer.length >= this.bufferSize / 4) {
8198
+ const pcm = this.buffer;
8199
+ this.buffer = Buffer.alloc(0);
8200
+ this.trackTranscription(this.transcribeBuffer(pcm));
8201
+ } else {
8202
+ this.buffer = Buffer.alloc(0);
8203
+ }
8204
+ await Promise.allSettled(this.pendingTranscriptions);
8205
+ this.callbacks = [];
8206
+ }
8207
+ // ------------------------------------------------------------------
8208
+ // Private
8209
+ // ------------------------------------------------------------------
8210
+ async transcribeBuffer(pcm) {
8211
+ const wav = wrapPcmInWav(pcm);
8212
+ const formData = new FormData();
8213
+ formData.append("file", new Blob([wav.buffer.slice(wav.byteOffset, wav.byteOffset + wav.byteLength)], { type: "audio/wav" }), "audio.wav");
8214
+ formData.append("model", this.model);
8215
+ if (this.language) {
8216
+ formData.append("language", this.language);
8217
+ }
8511
8218
  try {
8512
- content = JSON.parse(raw);
8513
- } catch {
8514
- return;
8219
+ const resp = await fetch(OPENAI_TRANSCRIPTION_URL, {
8220
+ method: "POST",
8221
+ headers: { Authorization: `Bearer ${this.apiKey}` },
8222
+ body: formData,
8223
+ signal: AbortSignal.timeout(15e3)
8224
+ });
8225
+ if (!resp.ok) {
8226
+ const body = await resp.text();
8227
+ getLogger().error(`WhisperSTT transcription error: ${resp.status} ${body}`);
8228
+ return;
8229
+ }
8230
+ const json = await resp.json();
8231
+ const text = (json.text ?? "").trim();
8232
+ if (!text) return;
8233
+ const transcript = {
8234
+ text,
8235
+ isFinal: true,
8236
+ confidence: 1
8237
+ };
8238
+ for (const cb of this.callbacks) {
8239
+ cb(transcript);
8240
+ }
8241
+ } catch (err) {
8242
+ getLogger().error(`WhisperSTT transcription error: ${String(err)}`);
8515
8243
  }
8516
- if (content.error_code || content.error_message) {
8517
- getLogger().error(
8518
- `SonioxSTT error ${String(content.error_code)}: ${String(content.error_message)}`
8244
+ }
8245
+ };
8246
+
8247
+ // src/stt/whisper.ts
8248
+ var STT2 = class extends WhisperSTT {
8249
+ constructor(opts = {}) {
8250
+ const key = opts.apiKey ?? process.env.OPENAI_API_KEY;
8251
+ if (!key) {
8252
+ throw new Error(
8253
+ "Whisper STT requires an apiKey. Pass { apiKey: 'sk-...' } or set OPENAI_API_KEY in the environment."
8519
8254
  );
8520
8255
  }
8521
- const tokens = content.tokens ?? [];
8522
- const nonFinal = new TokenAccumulator();
8523
- let emittedFinalThisMsg = false;
8524
- for (const token of tokens) {
8525
- if (token.is_final) {
8526
- if (isEndToken(token)) {
8527
- if (this.final.text) {
8528
- this.emit({
8529
- text: this.final.text.trim(),
8530
- isFinal: true,
8531
- confidence: this.final.confidence
8532
- });
8533
- this.final.reset();
8534
- emittedFinalThisMsg = true;
8256
+ super(key, opts.model ?? "whisper-1", opts.language, opts.bufferSize);
8257
+ }
8258
+ };
8259
+
8260
+ // src/providers/cartesia-stt.ts
8261
+ var import_ws7 = __toESM(require("ws"));
8262
+ init_logger();
8263
+ var DEFAULT_BASE_URL = "https://api.cartesia.ai";
8264
+ var API_VERSION = "2025-04-16";
8265
+ var USER_AGENT = "Patter/1.0 (integration=LiveKit-port; provider=Cartesia)";
8266
+ var KEEPALIVE_INTERVAL_MS = 3e4;
8267
+ var CONNECT_TIMEOUT_MS = 1e4;
8268
+ var MAX_CALLBACKS = 10;
8269
/**
 * Streaming speech-to-text client for Cartesia's WebSocket endpoint.
 *
 * Usage: register listeners with `onTranscript`, `await connect()`, push audio
 * with `sendAudio`, and call `close()` when finished.
 */
var CartesiaSTT = class {
  /**
   * @param apiKey Cartesia API key; must be non-empty.
   * @param options Optional settings read by `buildWsUrl`: `baseUrl`, `language`,
   *   `model`, `sampleRate`, `encoding`.
   * @throws {Error} When `apiKey` is falsy.
   */
  constructor(apiKey, options = {}) {
    this.apiKey = apiKey;
    this.options = options;
    if (!apiKey) {
      throw new Error("CartesiaSTT requires a non-empty apiKey");
    }
  }
  ws = null;
  callbacks = [];
  keepaliveTimer = null;
  /** Cartesia request id — set from the server transcript events. */
  requestId = "";
  /**
   * Build the WebSocket URL for the STT endpoint.
   *
   * `http(s)://` bases are mapped to `ws(s)://`; a base without a scheme is
   * assumed to be `wss://`.
   *
   * NOTE(review): the API key travels as a query parameter, so the returned
   * URL must not be logged.
   *
   * @returns Fully-qualified `/stt/websocket` URL including query parameters.
   */
  buildWsUrl() {
    const opts = this.options;
    const rawBase = opts.baseUrl ?? DEFAULT_BASE_URL;
    let base;
    if (rawBase.startsWith("http://")) {
      base = `ws://${rawBase.slice("http://".length)}`;
    } else if (rawBase.startsWith("https://")) {
      base = `wss://${rawBase.slice("https://".length)}`;
    } else if (rawBase.startsWith("ws://") || rawBase.startsWith("wss://")) {
      base = rawBase;
    } else {
      base = `wss://${rawBase}`;
    }
    const language = opts.language ?? "en";
    const params = new URLSearchParams({
      model: opts.model ?? "ink-whisper",
      sample_rate: String(opts.sampleRate ?? 16e3),
      encoding: opts.encoding ?? "pcm_s16le",
      cartesia_version: API_VERSION,
      api_key: this.apiKey,
      language
    });
    return `${base}/stt/websocket?${params.toString()}`;
  }
  /**
   * Open the WebSocket, wire up message/error/close handling and start the
   * keepalive ping.
   *
   * @returns Promise that resolves once the socket is open.
   * @throws Rejects with the socket error, or with a timeout error when the
   *   socket does not open within CONNECT_TIMEOUT_MS. On failure the socket is
   *   torn down instead of being left half-open.
   */
  async connect() {
    // A repeated connect() must not leak the previous keepalive interval.
    this.clearKeepalive();
    const url = this.buildWsUrl();
    const ws = new import_ws7.default(url, {
      headers: { "User-Agent": USER_AGENT }
    });
    this.ws = ws;
    try {
      await new Promise((resolve, reject) => {
        const timer = setTimeout(
          () => reject(new Error("Cartesia STT connect timeout")),
          CONNECT_TIMEOUT_MS
        );
        ws.once("open", () => {
          clearTimeout(timer);
          resolve();
        });
        ws.once("error", (err) => {
          clearTimeout(timer);
          reject(err);
        });
      });
    } catch (err) {
      // Tearing down a connecting socket can emit a further 'error'; absorb it
      // so it cannot surface as an unhandled 'error' event.
      ws.on("error", () => {
      });
      try {
        ws.terminate();
      } catch {
        // Best-effort teardown: the socket is already unusable.
      }
      if (this.ws === ws) this.ws = null;
      throw err;
    }
    ws.on("message", (raw) => {
      let event;
      try {
        event = JSON.parse(raw.toString());
      } catch {
        // Non-JSON frames are ignored.
        return;
      }
      this.handleEvent(event);
    });
    // The handshake listener above fires only once; later socket errors need a
    // permanent listener or they are thrown as unhandled 'error' events.
    ws.on("error", (err) => {
      getLogger().error(`CartesiaSTT WebSocket error: ${String(err)}`);
    });
    // Stop pinging when the server closes the socket (only if it is still the
    // active one, so an old socket cannot cancel a newer connection's timer).
    ws.on("close", () => {
      if (this.ws === ws) this.clearKeepalive();
    });
    this.keepaliveTimer = setInterval(() => {
      if (this.ws && this.ws.readyState === import_ws7.default.OPEN) {
        try {
          this.ws.ping();
        } catch {
          // Best-effort keepalive: a failed ping is not fatal.
        }
      }
    }, KEEPALIVE_INTERVAL_MS);
  }
  /** Stop the keepalive interval, if one is running. */
  clearKeepalive() {
    if (this.keepaliveTimer) {
      clearInterval(this.keepaliveTimer);
      this.keepaliveTimer = null;
    }
  }
  /**
   * Handle one parsed server event.
   *
   * "transcript" events update `requestId` and are emitted to listeners when
   * they carry text; "error" events are logged; anything else is ignored.
   *
   * @param event Parsed JSON message from the socket.
   */
  handleEvent(event) {
    const type = event.type;
    if (type === "transcript") {
      const text = (event.text ?? "").trim();
      const isFinal = Boolean(event.is_final);
      if (!text && !isFinal) return;
      if (event.request_id) {
        this.requestId = event.request_id;
      }
      // An empty final still records the request id above but is not emitted.
      if (!text) return;
      const confidence = Number(event.probability ?? 1);
      this.emit({ text, isFinal, confidence });
      return;
    }
    if (type === "error") {
      getLogger().error(`Cartesia STT error: ${event.message ?? "unknown"}`);
      return;
    }
  }
  /**
   * Deliver a transcript to every registered callback, in registration order.
   *
   * @param transcript Object of shape `{ text, isFinal, confidence }`.
   */
  emit(transcript) {
    for (const cb of this.callbacks) {
      cb(transcript);
    }
  }
  /**
   * Send an audio chunk; silently dropped when the socket is not open.
   *
   * @param audio Binary audio payload passed straight to `ws.send`.
   */
  sendAudio(audio) {
    if (!this.ws || this.ws.readyState !== import_ws7.default.OPEN) return;
    this.ws.send(audio);
  }
  /**
   * Register a transcript listener. At most MAX_CALLBACKS listeners are kept;
   * once the limit is reached the newest registration replaces the last one.
   *
   * @param callback Function invoked with each emitted transcript.
   */
  onTranscript(callback) {
    if (this.callbacks.length >= MAX_CALLBACKS) {
      getLogger().warn(
        "CartesiaSTT: maximum of 10 onTranscript callbacks reached; replacing the last callback."
      );
      this.callbacks[this.callbacks.length - 1] = callback;
      return;
    }
    this.callbacks.push(callback);
  }
  /**
   * Stop the keepalive, send a best-effort "finalize" message, and close the
   * socket. Safe to call when not connected.
   */
  close() {
    this.clearKeepalive();
    if (this.ws) {
      try {
        this.ws.send("finalize");
      } catch {
        // Best-effort: the socket may not be open yet or any more.
      }
      this.ws.close();
      this.ws = null;
    }
  }
};
8396
+
8397
+ // src/stt/cartesia.ts
8398
/**
 * Options-object wrapper around `CartesiaSTT`.
 *
 * The API key comes from `opts.apiKey` or, when that is nullish, from the
 * CARTESIA_API_KEY environment variable.
 */
var STT3 = class extends CartesiaSTT {
  /**
   * @param opts Optional `apiKey`, `model`, `language`, `encoding`, `sampleRate`, `baseUrl`.
   * @throws {Error} When no API key is available.
   */
  constructor(opts = {}) {
    const { apiKey, model, language, encoding, sampleRate, baseUrl } = opts;
    const resolvedKey = apiKey ?? process.env.CARTESIA_API_KEY;
    if (!resolvedKey) {
      throw new Error(
        "Cartesia STT requires an apiKey. Pass { apiKey: '...' } or set CARTESIA_API_KEY in the environment."
      );
    }
    super(resolvedKey, { model, language, encoding, sampleRate, baseUrl });
  }
};
8415
+
8416
+ // src/providers/soniox-stt.ts
8417
+ var import_ws8 = __toESM(require("ws"));
8418
+ init_logger();
8419
+ var SONIOX_WS_URL = "wss://stt-rt.soniox.com/transcribe-websocket";
8420
+ var KEEPALIVE_MESSAGE = '{"type": "keepalive"}';
8421
+ var END_TOKEN = "<end>";
8422
+ var FINALIZED_TOKEN = "<fin>";
8423
+ var KEEPALIVE_INTERVAL_MS2 = 5e3;
8424
/**
 * Tell whether a token is one of the two marker tokens (END_TOKEN or
 * FINALIZED_TOKEN) rather than transcript text.
 *
 * @param token Token object with a `text` property.
 * @returns `true` when `token.text` equals either marker.
 */
function isEndToken(token) {
  const markers = [END_TOKEN, FINALIZED_TOKEN];
  return markers.includes(token.text);
}
8427
/**
 * Accumulates token text and a running confidence average.
 */
var TokenAccumulator = class {
  text = "";
  confSum = 0;
  confCount = 0;
  /**
   * Fold one token into the accumulator.
   *
   * @param token Object with optional `text` and `confidence`; text is appended
   *   when truthy, confidence is counted only when it is of type number.
   */
  update(token) {
    const { text, confidence } = token;
    if (text) {
      this.text += text;
    }
    if (typeof confidence !== "number") {
      return;
    }
    this.confSum += confidence;
    this.confCount += 1;
  }
  /** Mean of the counted confidences, or 0 when none were counted. */
  get confidence() {
    if (this.confCount === 0) {
      return 0;
    }
    return this.confSum / this.confCount;
  }
  /** Clear the accumulated text and confidence statistics. */
  reset() {
    Object.assign(this, { text: "", confSum: 0, confCount: 0 });
  }
  /** Raw confidence statistics as `{ sum, count }`. */
  get raw() {
    const { confSum: sum, confCount: count } = this;
    return { sum, count };
  }
};
8452
+ var SonioxSTT = class _SonioxSTT {
8453
+ ws = null;
8454
+ callbacks = [];
8455
+ final = new TokenAccumulator();
8456
+ keepaliveTimer = null;
8457
+ apiKey;
8458
+ model;
8459
+ languageHints;
8460
+ languageHintsStrict;
8461
+ sampleRate;
8462
+ numChannels;
8463
+ enableSpeakerDiarization;
8464
+ enableLanguageIdentification;
8465
+ maxEndpointDelayMs;
8466
+ clientReferenceId;
8467
+ baseUrl;
8468
+ constructor(apiKey, options = {}) {
8469
+ if (!apiKey) {
8470
+ throw new Error("Soniox apiKey is required");
8471
+ }
8472
+ const maxEndpointDelayMs = options.maxEndpointDelayMs ?? 500;
8473
+ if (maxEndpointDelayMs < 500 || maxEndpointDelayMs > 3e3) {
8474
+ throw new Error("maxEndpointDelayMs must be between 500 and 3000");
8475
+ }
8476
+ this.apiKey = apiKey;
8477
+ this.model = options.model ?? "stt-rt-v4";
8478
+ this.languageHints = options.languageHints;
8479
+ this.languageHintsStrict = options.languageHintsStrict ?? false;
8480
+ this.sampleRate = options.sampleRate ?? 16e3;
8481
+ this.numChannels = options.numChannels ?? 1;
8482
+ this.enableSpeakerDiarization = options.enableSpeakerDiarization ?? false;
8483
+ this.enableLanguageIdentification = options.enableLanguageIdentification ?? true;
8484
+ this.maxEndpointDelayMs = maxEndpointDelayMs;
8485
+ this.clientReferenceId = options.clientReferenceId;
8486
+ this.baseUrl = options.baseUrl ?? SONIOX_WS_URL;
8487
+ }
8488
+ /** Factory for Twilio-style 8 kHz linear PCM. */
8489
+ static forTwilio(apiKey, languageHints) {
8490
+ return new _SonioxSTT(apiKey, { sampleRate: 8e3, languageHints });
8491
+ }
8492
+ buildConfig() {
8493
+ const config = {
8494
+ api_key: this.apiKey,
8495
+ model: this.model,
8496
+ audio_format: "pcm_s16le",
8497
+ num_channels: this.numChannels,
8498
+ sample_rate: this.sampleRate,
8499
+ enable_endpoint_detection: true,
8500
+ enable_speaker_diarization: this.enableSpeakerDiarization,
8501
+ enable_language_identification: this.enableLanguageIdentification,
8502
+ max_endpoint_delay_ms: this.maxEndpointDelayMs
8503
+ };
8504
+ if (this.languageHints) {
8505
+ config.language_hints = this.languageHints;
8506
+ config.language_hints_strict = this.languageHintsStrict;
8507
+ }
8508
+ if (this.clientReferenceId) {
8509
+ config.client_reference_id = this.clientReferenceId;
8510
+ }
8511
+ return config;
8512
+ }
8513
+ async connect() {
8514
+ this.ws = new import_ws8.default(this.baseUrl);
8515
+ await new Promise((resolve, reject) => {
8516
+ const timer = setTimeout(() => reject(new Error("Soniox connect timeout")), 1e4);
8517
+ this.ws.once("open", () => {
8518
+ clearTimeout(timer);
8519
+ resolve();
8520
+ });
8521
+ this.ws.once("error", (err) => {
8522
+ clearTimeout(timer);
8523
+ reject(err);
8524
+ });
8525
+ });
8526
+ this.ws.send(JSON.stringify(this.buildConfig()));
8527
+ this.ws.on("message", (raw) => this.handleMessage(raw.toString()));
8528
+ this.ws.on("close", () => this.clearKeepalive());
8529
+ this.ws.on("error", (err) => {
8530
+ getLogger().error(`SonioxSTT WebSocket error: ${String(err)}`);
8531
+ });
8532
+ this.keepaliveTimer = setInterval(() => {
8533
+ if (this.ws && this.ws.readyState === import_ws8.default.OPEN) {
8534
+ try {
8535
+ this.ws.send(KEEPALIVE_MESSAGE);
8536
+ } catch {
8537
+ }
8538
+ }
8539
+ }, KEEPALIVE_INTERVAL_MS2);
8540
+ }
8541
+ clearKeepalive() {
8542
+ if (this.keepaliveTimer) {
8543
+ clearInterval(this.keepaliveTimer);
8544
+ this.keepaliveTimer = null;
8545
+ }
8546
+ }
8547
+ handleMessage(raw) {
8548
+ let content;
8549
+ try {
8550
+ content = JSON.parse(raw);
8551
+ } catch {
8552
+ return;
8553
+ }
8554
+ if (content.error_code || content.error_message) {
8555
+ getLogger().error(
8556
+ `SonioxSTT error ${String(content.error_code)}: ${String(content.error_message)}`
8557
+ );
8558
+ }
8559
+ const tokens = content.tokens ?? [];
8560
+ const nonFinal = new TokenAccumulator();
8561
+ let emittedFinalThisMsg = false;
8562
+ for (const token of tokens) {
8563
+ if (token.is_final) {
8564
+ if (isEndToken(token)) {
8565
+ if (this.final.text) {
8566
+ this.emit({
8567
+ text: this.final.text.trim(),
8568
+ isFinal: true,
8569
+ confidence: this.final.confidence
8570
+ });
8571
+ this.final.reset();
8572
+ emittedFinalThisMsg = true;
8535
8573
  }
8536
8574
  } else {
8537
8575
  this.final.update(token);
@@ -8565,7 +8603,7 @@ var SonioxSTT = class _SonioxSTT {
8565
8603
  }
8566
8604
  }
8567
8605
  sendAudio(audio) {
8568
- if (!this.ws || this.ws.readyState !== import_ws7.default.OPEN) return;
8606
+ if (!this.ws || this.ws.readyState !== import_ws8.default.OPEN) return;
8569
8607
  if (audio.length === 0) return;
8570
8608
  this.ws.send(audio);
8571
8609
  }
@@ -8595,16 +8633,28 @@ var SonioxSTT = class _SonioxSTT {
8595
8633
  }
8596
8634
  };
8597
8635
 
8598
- // src/index.ts
8599
- init_whisper_stt();
8636
+ // src/stt/soniox.ts
8637
/**
 * Options-object wrapper around `SonioxSTT`.
 *
 * The API key comes from `opts.apiKey` or, when that is nullish, from the
 * SONIOX_API_KEY environment variable; every other option is forwarded as-is.
 */
var STT4 = class extends SonioxSTT {
  /**
   * @param opts Optional `apiKey` plus any options accepted by `SonioxSTT`.
   * @throws {Error} When no API key is available.
   */
  constructor(opts = {}) {
    // Split the key off so it is not forwarded inside the options object.
    const { apiKey, ...forwarded } = opts;
    const resolvedKey = apiKey ?? process.env.SONIOX_API_KEY;
    if (!resolvedKey) {
      throw new Error(
        "Soniox STT requires an apiKey. Pass { apiKey: '...' } or set SONIOX_API_KEY in the environment."
      );
    }
    super(resolvedKey, forwarded);
  }
};
8600
8650
 
8601
8651
  // src/providers/assemblyai-stt.ts
8602
- var import_ws8 = __toESM(require("ws"));
8652
+ var import_ws9 = __toESM(require("ws"));
8603
8653
  init_logger();
8604
- var DEFAULT_BASE_URL = "wss://streaming.assemblyai.com";
8654
+ var DEFAULT_BASE_URL2 = "wss://streaming.assemblyai.com";
8605
8655
  var DEFAULT_MIN_TURN_SILENCE_MS = 100;
8606
- var CONNECT_TIMEOUT_MS = 1e4;
8607
- var MAX_CALLBACKS = 10;
8656
+ var CONNECT_TIMEOUT_MS2 = 1e4;
8657
+ var MAX_CALLBACKS2 = 10;
8608
8658
  var AssemblyAISTT = class _AssemblyAISTT {
8609
8659
  constructor(apiKey, options = {}) {
8610
8660
  this.apiKey = apiKey;
@@ -8667,168 +8717,21 @@ var AssemblyAISTT = class _AssemblyAISTT {
8667
8717
  params.set(key, String(value));
8668
8718
  }
8669
8719
  }
8670
- const base = opts.baseUrl ?? DEFAULT_BASE_URL;
8720
+ const base = opts.baseUrl ?? DEFAULT_BASE_URL2;
8671
8721
  return `${base}/v3/ws?${params.toString()}`;
8672
8722
  }
8673
8723
  async connect() {
8674
- const url = this.buildUrl();
8675
- this.ws = new import_ws8.default(url, {
8676
- headers: {
8677
- Authorization: this.apiKey,
8678
- "Content-Type": "application/json",
8679
- "User-Agent": "Patter/1.0 (integration=LiveKit-port)"
8680
- }
8681
- });
8682
- await new Promise((resolve, reject) => {
8683
- const timer = setTimeout(
8684
- () => reject(new Error("AssemblyAI connect timeout")),
8685
- CONNECT_TIMEOUT_MS
8686
- );
8687
- this.ws.once("open", () => {
8688
- clearTimeout(timer);
8689
- resolve();
8690
- });
8691
- this.ws.once("error", (err) => {
8692
- clearTimeout(timer);
8693
- reject(err);
8694
- });
8695
- });
8696
- this.ws.on("message", (raw) => {
8697
- let event;
8698
- try {
8699
- event = JSON.parse(raw.toString());
8700
- } catch {
8701
- return;
8702
- }
8703
- this.handleEvent(event);
8704
- });
8705
- }
8706
- handleEvent(event) {
8707
- const type = event.type;
8708
- if (type === "Begin") {
8709
- this.sessionId = event.id ?? "";
8710
- this.expiresAt = event.expires_at ?? 0;
8711
- return;
8712
- }
8713
- if (type !== "Turn") {
8714
- return;
8715
- }
8716
- const endOfTurn = Boolean(event.end_of_turn);
8717
- const turnIsFormatted = Boolean(event.turn_is_formatted);
8718
- const words = event.words ?? [];
8719
- const transcriptText = (event.transcript ?? "").trim();
8720
- if (endOfTurn) {
8721
- if (this.options.formatTurns && !turnIsFormatted) return;
8722
- if (!transcriptText) return;
8723
- this.emit({
8724
- text: transcriptText,
8725
- isFinal: true,
8726
- confidence: averageConfidence(words)
8727
- });
8728
- return;
8729
- }
8730
- if (!words.length) return;
8731
- const interim = words.map((w) => (w.text ?? "").trim()).filter(Boolean).join(" ");
8732
- if (!interim) return;
8733
- this.emit({
8734
- text: interim,
8735
- isFinal: false,
8736
- confidence: averageConfidence(words)
8737
- });
8738
- }
8739
- emit(transcript) {
8740
- for (const cb of this.callbacks) {
8741
- cb(transcript);
8742
- }
8743
- }
8744
- sendAudio(audio) {
8745
- if (!this.ws || this.ws.readyState !== import_ws8.default.OPEN) return;
8746
- this.ws.send(audio);
8747
- }
8748
- onTranscript(callback) {
8749
- if (this.callbacks.length >= MAX_CALLBACKS) {
8750
- getLogger().warn(
8751
- "AssemblyAISTT: maximum of 10 onTranscript callbacks reached; replacing the last callback."
8752
- );
8753
- this.callbacks[this.callbacks.length - 1] = callback;
8754
- return;
8755
- }
8756
- this.callbacks.push(callback);
8757
- }
8758
- close() {
8759
- if (this.ws) {
8760
- try {
8761
- this.ws.send(JSON.stringify({ type: "Terminate" }));
8762
- } catch {
8763
- }
8764
- this.ws.close();
8765
- this.ws = null;
8766
- }
8767
- }
8768
- };
8769
- function averageConfidence(words) {
8770
- if (!words.length) return 0;
8771
- let total = 0;
8772
- for (const w of words) {
8773
- total += Number(w.confidence ?? 0);
8774
- }
8775
- return total / words.length;
8776
- }
8777
-
8778
- // src/providers/cartesia-stt.ts
8779
- var import_ws9 = __toESM(require("ws"));
8780
- init_logger();
8781
- var DEFAULT_BASE_URL2 = "https://api.cartesia.ai";
8782
- var API_VERSION = "2025-04-16";
8783
- var USER_AGENT = "Patter/1.0 (integration=LiveKit-port; provider=Cartesia)";
8784
- var KEEPALIVE_INTERVAL_MS2 = 3e4;
8785
- var CONNECT_TIMEOUT_MS2 = 1e4;
8786
- var MAX_CALLBACKS2 = 10;
8787
- var CartesiaSTT = class {
8788
- constructor(apiKey, options = {}) {
8789
- this.apiKey = apiKey;
8790
- this.options = options;
8791
- if (!apiKey) {
8792
- throw new Error("CartesiaSTT requires a non-empty apiKey");
8793
- }
8794
- }
8795
- ws = null;
8796
- callbacks = [];
8797
- keepaliveTimer = null;
8798
- /** Cartesia request id — set from the server transcript events. */
8799
- requestId = "";
8800
- buildWsUrl() {
8801
- const opts = this.options;
8802
- const rawBase = opts.baseUrl ?? DEFAULT_BASE_URL2;
8803
- let base;
8804
- if (rawBase.startsWith("http://")) {
8805
- base = `ws://${rawBase.slice("http://".length)}`;
8806
- } else if (rawBase.startsWith("https://")) {
8807
- base = `wss://${rawBase.slice("https://".length)}`;
8808
- } else if (rawBase.startsWith("ws://") || rawBase.startsWith("wss://")) {
8809
- base = rawBase;
8810
- } else {
8811
- base = `wss://${rawBase}`;
8812
- }
8813
- const language = opts.language ?? "en";
8814
- const params = new URLSearchParams({
8815
- model: opts.model ?? "ink-whisper",
8816
- sample_rate: String(opts.sampleRate ?? 16e3),
8817
- encoding: opts.encoding ?? "pcm_s16le",
8818
- cartesia_version: API_VERSION,
8819
- api_key: this.apiKey,
8820
- language
8821
- });
8822
- return `${base}/stt/websocket?${params.toString()}`;
8823
- }
8824
- async connect() {
8825
- const url = this.buildWsUrl();
8724
+ const url = this.buildUrl();
8826
8725
  this.ws = new import_ws9.default(url, {
8827
- headers: { "User-Agent": USER_AGENT }
8726
+ headers: {
8727
+ Authorization: this.apiKey,
8728
+ "Content-Type": "application/json",
8729
+ "User-Agent": "Patter/1.0 (integration=LiveKit-port)"
8730
+ }
8828
8731
  });
8829
8732
  await new Promise((resolve, reject) => {
8830
8733
  const timer = setTimeout(
8831
- () => reject(new Error("Cartesia STT connect timeout")),
8734
+ () => reject(new Error("AssemblyAI connect timeout")),
8832
8735
  CONNECT_TIMEOUT_MS2
8833
8736
  );
8834
8737
  this.ws.once("open", () => {
@@ -8849,33 +8752,39 @@ var CartesiaSTT = class {
8849
8752
  }
8850
8753
  this.handleEvent(event);
8851
8754
  });
8852
- this.keepaliveTimer = setInterval(() => {
8853
- if (this.ws && this.ws.readyState === import_ws9.default.OPEN) {
8854
- try {
8855
- this.ws.ping();
8856
- } catch {
8857
- }
8858
- }
8859
- }, KEEPALIVE_INTERVAL_MS2);
8860
8755
  }
8861
8756
  handleEvent(event) {
8862
8757
  const type = event.type;
8863
- if (type === "transcript") {
8864
- const text = (event.text ?? "").trim();
8865
- const isFinal = Boolean(event.is_final);
8866
- if (!text && !isFinal) return;
8867
- if (event.request_id) {
8868
- this.requestId = event.request_id;
8869
- }
8870
- if (!text) return;
8871
- const confidence = Number(event.probability ?? 1);
8872
- this.emit({ text, isFinal, confidence });
8758
+ if (type === "Begin") {
8759
+ this.sessionId = event.id ?? "";
8760
+ this.expiresAt = event.expires_at ?? 0;
8873
8761
  return;
8874
8762
  }
8875
- if (type === "error") {
8876
- getLogger().error(`Cartesia STT error: ${event.message ?? "unknown"}`);
8763
+ if (type !== "Turn") {
8764
+ return;
8765
+ }
8766
+ const endOfTurn = Boolean(event.end_of_turn);
8767
+ const turnIsFormatted = Boolean(event.turn_is_formatted);
8768
+ const words = event.words ?? [];
8769
+ const transcriptText = (event.transcript ?? "").trim();
8770
+ if (endOfTurn) {
8771
+ if (this.options.formatTurns && !turnIsFormatted) return;
8772
+ if (!transcriptText) return;
8773
+ this.emit({
8774
+ text: transcriptText,
8775
+ isFinal: true,
8776
+ confidence: averageConfidence(words)
8777
+ });
8877
8778
  return;
8878
8779
  }
8780
+ if (!words.length) return;
8781
+ const interim = words.map((w) => (w.text ?? "").trim()).filter(Boolean).join(" ");
8782
+ if (!interim) return;
8783
+ this.emit({
8784
+ text: interim,
8785
+ isFinal: false,
8786
+ confidence: averageConfidence(words)
8787
+ });
8879
8788
  }
8880
8789
  emit(transcript) {
8881
8790
  for (const cb of this.callbacks) {
@@ -8889,7 +8798,7 @@ var CartesiaSTT = class {
8889
8798
  onTranscript(callback) {
8890
8799
  if (this.callbacks.length >= MAX_CALLBACKS2) {
8891
8800
  getLogger().warn(
8892
- "CartesiaSTT: maximum of 10 onTranscript callbacks reached; replacing the last callback."
8801
+ "AssemblyAISTT: maximum of 10 onTranscript callbacks reached; replacing the last callback."
8893
8802
  );
8894
8803
  this.callbacks[this.callbacks.length - 1] = callback;
8895
8804
  return;
@@ -8897,13 +8806,9 @@ var CartesiaSTT = class {
8897
8806
  this.callbacks.push(callback);
8898
8807
  }
8899
8808
  close() {
8900
- if (this.keepaliveTimer) {
8901
- clearInterval(this.keepaliveTimer);
8902
- this.keepaliveTimer = null;
8903
- }
8904
8809
  if (this.ws) {
8905
8810
  try {
8906
- this.ws.send("finalize");
8811
+ this.ws.send(JSON.stringify({ type: "Terminate" }));
8907
8812
  } catch {
8908
8813
  }
8909
8814
  this.ws.close();
@@ -8911,10 +8816,305 @@ var CartesiaSTT = class {
8911
8816
  }
8912
8817
  }
8913
8818
  };
8819
/**
 * Compute the mean per-word confidence for a transcript.
 *
 * Words with a missing confidence count as 0. Words whose confidence cannot be
 * read as a finite number (e.g. a non-numeric string or NaN) also contribute 0
 * instead of turning the whole average into NaN.
 *
 * @param {Array<{confidence?: number}>} words - Word entries to average.
 * @returns {number} Mean confidence, or 0 when there are no words.
 */
function averageConfidence(words) {
  if (!Array.isArray(words) || words.length === 0) return 0;
  let total = 0;
  for (const word of words) {
    const confidence = Number(word?.confidence ?? 0);
    // A single NaN/Infinity would otherwise poison the sum for every word.
    if (Number.isFinite(confidence)) total += confidence;
  }
  return total / words.length;
}
8827
+
8828
+ // src/stt/assemblyai.ts
8829
/**
 * AssemblyAI STT wrapper that resolves the API key from the options object or,
 * failing that, from the ASSEMBLYAI_API_KEY environment variable.
 */
var STT5 = class extends AssemblyAISTT {
  /**
   * @param {object} [opts] - Options; `apiKey` is extracted and every other
   *   property is passed to the base class as its second argument.
   * @throws {Error} When no API key is supplied and the environment has none.
   */
  constructor(opts = {}) {
    const { apiKey, ...providerOptions } = opts;
    const resolvedKey = apiKey ?? process.env.ASSEMBLYAI_API_KEY;
    if (!resolvedKey) {
      throw new Error(
        "AssemblyAI STT requires an apiKey. Pass { apiKey: '...' } or set ASSEMBLYAI_API_KEY in the environment."
      );
    }
    super(resolvedKey, providerOptions);
  }
};
8914
8842
 
8915
- // src/index.ts
8916
- init_elevenlabs_tts();
8917
- init_openai_tts();
8843
+ // src/providers/elevenlabs-tts.ts
8844
/** Base URL used to build ElevenLabs REST endpoints. */
var ELEVENLABS_BASE_URL = "https://api.elevenlabs.io/v1";
/** Lower-case voice name -> ElevenLabs voice ID. */
var ELEVENLABS_VOICE_ID_BY_NAME = {
  rachel: "21m00Tcm4TlvDq8ikWAM",
  drew: "29vD33N1CtxCmqQRPOHJ",
  clyde: "2EiwWnXFnvU5JabPnv8n",
  paul: "5Q0t7uMcjvnagumLfvZi",
  domi: "AZnzlk1XvdvUeBnXmlld",
  dave: "CYw3kZ02Hs0563khs1Fj",
  fin: "D38z5RcWu1voky8WS1ja",
  bella: "EXAVITQu4vr4xnSDxMaL",
  antoni: "ErXwobaYiN019PkySvjV",
  thomas: "GBv7mTt0atIp3Br8iCZE",
  charlie: "IKne3meq5aSn9XLyUdCD",
  george: "JBFqnCBsd6RMkjVDRZzb",
  emily: "LcfcDJNUP1GQjkzn1xUU",
  elli: "MF3mGyEYCl7XYWbV9V6O",
  callum: "N2lVS1w4EtoT3dr4eOWO",
  patrick: "ODq5zmih8GrVes37Dizd",
  harry: "SOYHLrjzK2X1ezoPC6cr",
  liam: "TX3LPaxmHKxFdv7VOQHJ",
  dorothy: "ThT5KcBeYPX3keUQqHPh",
  josh: "TxGEqnHWrfWFTfGW9XjX",
  arnold: "VR6AewLTigWG4xSOukaG",
  charlotte: "XB0fDUnXU5powFXDhCwa",
  matilda: "XrExE9yKIg1WjnnlVkGX",
  matthew: "Yko7PKHZNXotIFUBG7I9",
  james: "ZQe5CZNOzWyzPSCn5a3c",
  joseph: "Zlb1dXrM653N07WRdFW3",
  jeremy: "bVMeCyTHy58xNoL34h3p",
  michael: "flq6f7yk4E4fJM5XTYuZ",
  ethan: "g5CIjZEefAph4nQFvHAz",
  gigi: "jBpfuIE2acCO8z3wKNLl",
  freya: "jsCqWAovK2LkecY7zXl4",
  brian: "nPczCjzI2devNBz1zQrb",
  grace: "oWAxZDx7w5VEj9dCyTzz",
  daniel: "onwK4e9ZLuTAKqWW03F9",
  lily: "pFZP5JQG7iQjIQuC4Bku",
  serena: "pMsXgVXv3BLzUgSXRplE",
  adam: "pNInz6obpgDQGcFmaJgB",
  nicole: "piTKgcLEGmPE4e6mEKli",
  bill: "pqHfZKP75CvOlQylNhV4",
  jessie: "t0jbNlBVZ17f02VDIeMI",
  ryan: "wViXBPUzp2ZZixB1xQuM",
  sam: "yoZ06aMxZJJ28mfd3POQ",
  glinda: "z9fAnlkpzviPz146aGWa",
  giovanni: "zcAOhNBS3c14rBihAFp1",
  mimi: "zrHiDhphv9ZnVXBqCLjz",
  alloy: "21m00Tcm4TlvDq8ikWAM"
};
/** Matches a string of exactly 20 ASCII alphanumerics, treated as a raw voice ID. */
var VOICE_ID_PATTERN = /^[A-Za-z0-9]{20}$/;
/**
 * Translate a voice name into an ElevenLabs voice ID.
 *
 * Falsy input and strings that already match {@link VOICE_ID_PATTERN} are
 * returned untouched. Otherwise the name is looked up case-insensitively in
 * {@link ELEVENLABS_VOICE_ID_BY_NAME}; unknown names are returned unchanged.
 *
 * @param {string} voice - Voice name or voice ID.
 * @returns {string} The resolved voice ID, or `voice` itself when no mapping exists.
 */
function resolveVoiceId(voice) {
  if (!voice) return voice;
  if (VOICE_ID_PATTERN.test(voice)) return voice;
  const lookupKey = voice.toLowerCase();
  // Own-property check only: a bare `table[key]` would resolve inherited
  // members such as "constructor" or "__proto__" and return a non-string.
  if (Object.hasOwn(ELEVENLABS_VOICE_ID_BY_NAME, lookupKey)) {
    return ELEVENLABS_VOICE_ID_BY_NAME[lookupKey];
  }
  return voice;
}
8899
/** Text-to-speech client for the ElevenLabs streaming HTTP endpoint. */
var ElevenLabsTTS = class {
  /** Voice ID after name resolution through `resolveVoiceId`. */
  voiceId;
  /**
   * @param {string} apiKey - Sent as the `xi-api-key` request header.
   * @param {string} [voiceId] - Voice name or ID; passed through `resolveVoiceId`.
   * @param {string} [modelId] - Sent as `model_id` in the request body.
   * @param {string} [outputFormat] - Sent as the `output_format` query parameter.
   */
  constructor(apiKey, voiceId = "21m00Tcm4TlvDq8ikWAM", modelId = "eleven_turbo_v2_5", outputFormat = "pcm_16000") {
    this.apiKey = apiKey;
    this.modelId = modelId;
    this.outputFormat = outputFormat;
    this.voiceId = resolveVoiceId(voiceId);
  }
  /**
   * Synthesise `text` and return all audio as one Buffer.
   *
   * Drains {@link synthesizeStream}; prefer the stream directly for long
   * inputs or when latency matters.
   *
   * @param {string} text - Text to speak.
   * @returns {Promise<Buffer>} Concatenation of every streamed chunk.
   */
  async synthesize(text) {
    const parts = [];
    for await (const part of this.synthesizeStream(text)) {
      parts.push(part);
    }
    return Buffer.concat(parts);
  }
  /**
   * Synthesise `text` and yield audio chunks as they arrive.
   *
   * Chunks are yielded exactly as received, in whatever encoding
   * `outputFormat` requests (raw PCM at 16 kHz with the default).
   *
   * @param {string} text - Text to speak.
   * @yields {Buffer} One non-empty chunk of audio bytes.
   * @throws {Error} On a non-OK HTTP status or a missing response body.
   */
  async *synthesizeStream(text) {
    const voicePath = encodeURIComponent(this.voiceId);
    const formatQuery = encodeURIComponent(this.outputFormat);
    const endpoint = `${ELEVENLABS_BASE_URL}/text-to-speech/${voicePath}/stream?output_format=${formatQuery}`;
    const response = await fetch(endpoint, {
      method: "POST",
      headers: {
        "xi-api-key": this.apiKey,
        "Content-Type": "application/json"
      },
      body: JSON.stringify({ text, model_id: this.modelId }),
      // The request is aborted after 30 seconds.
      signal: AbortSignal.timeout(3e4)
    });
    if (!response.ok) {
      const errorBody = await response.text();
      throw new Error(`ElevenLabs TTS error ${response.status}: ${errorBody}`);
    }
    if (!response.body) {
      throw new Error("ElevenLabs TTS: no response body");
    }
    const reader = response.body.getReader();
    try {
      for (let step = await reader.read(); !step.done; step = await reader.read()) {
        const { value } = step;
        if (value && value.length > 0) {
          yield Buffer.from(value);
        }
      }
    } finally {
      // Runs even when the consumer stops iterating early; cancel failures are ignored.
      if (typeof reader.cancel === "function") {
        await reader.cancel().catch(() => {
        });
      }
      reader.releaseLock();
    }
  }
};
8959
+
8960
+ // src/tts/elevenlabs.ts
8961
/**
 * ElevenLabs TTS wrapper taking a single options object, with the API key
 * falling back to the ELEVENLABS_API_KEY environment variable.
 */
var TTS = class extends ElevenLabsTTS {
  /**
   * @param {object} [opts] - Reads `apiKey`, `voiceId`, `modelId` and `outputFormat`.
   * @throws {Error} When no API key is supplied and the environment has none.
   */
  constructor(opts = {}) {
    const resolvedKey = opts.apiKey ?? process.env.ELEVENLABS_API_KEY;
    if (!resolvedKey) {
      throw new Error(
        "ElevenLabs TTS requires an apiKey. Pass { apiKey: '...' } or set ELEVENLABS_API_KEY in the environment."
      );
    }
    const voiceId = opts.voiceId ?? "21m00Tcm4TlvDq8ikWAM";
    const modelId = opts.modelId ?? "eleven_turbo_v2_5";
    const outputFormat = opts.outputFormat ?? "pcm_16000";
    super(resolvedKey, voiceId, modelId, outputFormat);
  }
};
8977
+
8978
+ // src/providers/openai-tts.ts
8979
/** Endpoint that receives the speech synthesis POST request. */
var OPENAI_TTS_URL = "https://api.openai.com/v1/audio/speech";
/** Text-to-speech client for OpenAI that resamples the PCM stream 3:2 (24 kHz → 16 kHz). */
var OpenAITTS = class _OpenAITTS {
  /**
   * @param {string} apiKey - Sent as a Bearer token.
   * @param {string} [voice] - Sent as `voice` in the request body.
   * @param {string} [model] - Sent as `model` in the request body.
   */
  constructor(apiKey, voice = "alloy", model = "tts-1") {
    this.apiKey = apiKey;
    this.voice = voice;
    this.model = model;
  }
  /**
   * Encode a list of 16-bit samples as little-endian PCM bytes.
   *
   * @param {number[]} samples - Signed 16-bit sample values.
   * @returns {Buffer} Two bytes per sample.
   */
  static #encodeSamples(samples) {
    const encoded = Buffer.alloc(samples.length * 2);
    samples.forEach((sample, index) => {
      encoded.writeInt16LE(sample, index * 2);
    });
    return encoded;
  }
  /**
   * Synthesise `text` and return all audio as one Buffer.
   *
   * Drains {@link synthesizeStream}; prefer the stream directly for long
   * inputs or when latency matters.
   *
   * @param {string} text - Text to speak.
   * @returns {Promise<Buffer>} Concatenation of every streamed chunk.
   */
  async synthesize(text) {
    const parts = [];
    for await (const part of this.synthesizeStream(text)) {
      parts.push(part);
    }
    return Buffer.concat(parts);
  }
  /**
   * Synthesise `text` and yield resampled audio chunks as they arrive.
   *
   * The request asks for `pcm`; every network chunk goes through
   * {@link resampleStreaming} with one shared state object so that sample
   * alignment survives chunk boundaries. Samples still buffered when the
   * stream ends are emitted unchanged as a final chunk.
   *
   * @param {string} text - Text to speak.
   * @yields {Buffer} One non-empty chunk of PCM16-LE audio.
   * @throws {Error} On a non-OK HTTP status or a missing response body.
   */
  async *synthesizeStream(text) {
    const response = await fetch(OPENAI_TTS_URL, {
      method: "POST",
      headers: {
        "Authorization": `Bearer ${this.apiKey}`,
        "Content-Type": "application/json"
      },
      body: JSON.stringify({
        model: this.model,
        input: text,
        voice: this.voice,
        response_format: "pcm"
      }),
      // The request is aborted after 30 seconds.
      signal: AbortSignal.timeout(3e4)
    });
    if (!response.ok) {
      const errorBody = await response.text();
      throw new Error(`OpenAI TTS error ${response.status}: ${errorBody}`);
    }
    if (!response.body) {
      throw new Error("OpenAI TTS: no response body");
    }
    const state = { carryByte: null, leftover: [] };
    const reader = response.body.getReader();
    try {
      for (let step = await reader.read(); !step.done; step = await reader.read()) {
        const { value } = step;
        if (value && value.length > 0) {
          const converted = _OpenAITTS.resampleStreaming(Buffer.from(value), state);
          if (converted.length > 0) yield converted;
        }
      }
      if (state.leftover.length > 0) {
        yield _OpenAITTS.#encodeSamples(state.leftover);
      }
    } finally {
      // Runs even when the consumer stops iterating early; cancel failures are ignored.
      if (typeof reader.cancel === "function") {
        await reader.cancel().catch(() => {
        });
      }
      reader.releaseLock();
    }
  }
  /**
   * Streaming 3:2 resampler for PCM16-LE (24 kHz → 16 kHz).
   *
   * Every complete group of three input samples produces two output samples:
   * the first sample as-is, then the truncated mean of the second and third.
   * `ctx` is mutated so the next call continues where this one stopped:
   * `carryByte` holds an unpaired trailing byte and `leftover` holds samples
   * that did not fill a group of three.
   *
   * @param {Buffer} audio - Next chunk of input bytes.
   * @param {{carryByte: (number|null), leftover: number[]}} ctx - Cross-chunk state.
   * @returns {Buffer} Resampled bytes; may be empty.
   */
  static resampleStreaming(audio, ctx) {
    let pcm = audio;
    if (ctx.carryByte !== null) {
      // Re-attach the dangling byte from the previous chunk.
      pcm = Buffer.concat([Buffer.from([ctx.carryByte]), audio]);
      ctx.carryByte = null;
    }
    if (pcm.length % 2 === 1) {
      ctx.carryByte = pcm[pcm.length - 1];
      pcm = pcm.subarray(0, pcm.length - 1);
    }
    if (pcm.length === 0 && ctx.leftover.length === 0) {
      return Buffer.alloc(0);
    }
    const pending = ctx.leftover.slice();
    for (let offset = 0; offset < pcm.length; offset += 2) {
      pending.push(pcm.readInt16LE(offset));
    }
    const groupCount = Math.floor(pending.length / 3);
    const resampled = Buffer.alloc(groupCount * 4);
    for (let group = 0; group < groupCount; group++) {
      const base = group * 3;
      const blended = Math.trunc((pending[base + 1] + pending[base + 2]) / 2);
      resampled.writeInt16LE(pending[base], group * 4);
      resampled.writeInt16LE(blended, group * 4 + 2);
    }
    ctx.leftover = pending.slice(groupCount * 3);
    return resampled;
  }
  /**
   * One-shot variant of {@link resampleStreaming}: resamples `audio` with
   * fresh state and appends any incomplete trailing group unchanged.
   *
   * @deprecated use {@link resampleStreaming} with persistent state.
   * @param {Buffer} audio - Complete PCM16-LE input.
   * @returns {Buffer} Resampled bytes.
   */
  static resample24kTo16k(audio) {
    const state = { carryByte: null, leftover: [] };
    const head = _OpenAITTS.resampleStreaming(audio, state);
    if (state.leftover.length === 0) return head;
    return Buffer.concat([head, _OpenAITTS.#encodeSamples(state.leftover)]);
  }
};
9105
+
9106
+ // src/tts/openai.ts
9107
/**
 * OpenAI TTS wrapper taking a single options object, with the API key
 * falling back to the OPENAI_API_KEY environment variable.
 */
var TTS2 = class extends OpenAITTS {
  /**
   * @param {object} [opts] - Reads `apiKey`, `voice` and `model`.
   * @throws {Error} When no API key is supplied and the environment has none.
   */
  constructor(opts = {}) {
    const resolvedKey = opts.apiKey ?? process.env.OPENAI_API_KEY;
    if (!resolvedKey) {
      throw new Error(
        "OpenAI TTS requires an apiKey. Pass { apiKey: 'sk-...' } or set OPENAI_API_KEY in the environment."
      );
    }
    const voice = opts.voice ?? "alloy";
    const model = opts.model ?? "tts-1";
    super(resolvedKey, voice, model);
  }
};
8918
9118
 
8919
9119
  // src/providers/cartesia-tts.ts
8920
9120
  var CARTESIA_BASE_URL = "https://api.cartesia.ai";
@@ -9014,6 +9214,21 @@ var CartesiaTTS = class {
9014
9214
  }
9015
9215
  };
9016
9216
 
9217
+ // src/tts/cartesia.ts
9218
/**
 * Cartesia TTS wrapper that resolves the API key from the options object or,
 * failing that, from the CARTESIA_API_KEY environment variable.
 */
var TTS3 = class extends CartesiaTTS {
  /**
   * @param {object} [opts] - Options; `apiKey` is extracted and every other
   *   property is passed to the base class as its second argument.
   * @throws {Error} When no API key is supplied and the environment has none.
   */
  constructor(opts = {}) {
    const { apiKey, ...providerOptions } = opts;
    const resolvedKey = apiKey ?? process.env.CARTESIA_API_KEY;
    if (!resolvedKey) {
      throw new Error(
        "Cartesia TTS requires an apiKey. Pass { apiKey: '...' } or set CARTESIA_API_KEY in the environment."
      );
    }
    super(resolvedKey, providerOptions);
  }
};
9231
+
9017
9232
  // src/providers/rime-tts.ts
9018
9233
  var RIME_BASE_URL = "https://users.rime.ai/v1/rime-tts";
9019
9234
  var ARCANA_MODEL_TIMEOUT_MS = 60 * 4 * 1e3;
@@ -9141,6 +9356,21 @@ var RimeTTS = class {
9141
9356
  }
9142
9357
  };
9143
9358
 
9359
+ // src/tts/rime.ts
9360
/**
 * Rime TTS wrapper that resolves the API key from the options object or,
 * failing that, from the RIME_API_KEY environment variable.
 */
var TTS4 = class extends RimeTTS {
  /**
   * @param {object} [opts] - Options; `apiKey` is extracted and every other
   *   property is passed to the base class as its second argument.
   * @throws {Error} When no API key is supplied and the environment has none.
   */
  constructor(opts = {}) {
    const { apiKey, ...providerOptions } = opts;
    const resolvedKey = apiKey ?? process.env.RIME_API_KEY;
    if (!resolvedKey) {
      throw new Error(
        "Rime TTS requires an apiKey. Pass { apiKey: '...' } or set RIME_API_KEY in the environment."
      );
    }
    super(resolvedKey, providerOptions);
  }
};
9373
+
9144
9374
  // src/providers/lmnt-tts.ts
9145
9375
  var LMNT_BASE_URL = "https://api.lmnt.com/v1/ai/speech/bytes";
9146
9376
  var LMNTTTS = class {
@@ -9219,6 +9449,119 @@ var LMNTTTS = class {
9219
9449
  }
9220
9450
  };
9221
9451
 
9452
+ // src/tts/lmnt.ts
9453
/**
 * LMNT TTS wrapper that resolves the API key from the options object or,
 * failing that, from the LMNT_API_KEY environment variable.
 */
var TTS5 = class extends LMNTTTS {
  /**
   * @param {object} [opts] - Options; `apiKey` is extracted and every other
   *   property is passed to the base class as its second argument.
   * @throws {Error} When no API key is supplied and the environment has none.
   */
  constructor(opts = {}) {
    const { apiKey, ...providerOptions } = opts;
    const resolvedKey = apiKey ?? process.env.LMNT_API_KEY;
    if (!resolvedKey) {
      throw new Error(
        "LMNT TTS requires an apiKey. Pass { apiKey: '...' } or set LMNT_API_KEY in the environment."
      );
    }
    super(resolvedKey, providerOptions);
  }
};
9466
+
9467
+ // src/carriers/twilio.ts
9468
/**
 * Twilio credentials holder. Each credential comes from the options object
 * or, when absent there, from the matching TWILIO_* environment variable.
 */
var Carrier = class {
  /** Carrier discriminator; always "twilio". */
  kind = "twilio";
  accountSid;
  authToken;
  /**
   * @param {object} [opts] - Reads `accountSid` and `authToken`.
   * @throws {Error} When the account SID or the auth token cannot be resolved.
   */
  constructor(opts = {}) {
    const resolvedSid = opts.accountSid ?? process.env.TWILIO_ACCOUNT_SID;
    if (!resolvedSid) {
      throw new Error(
        "Twilio carrier requires accountSid. Pass { accountSid: 'AC...' } or set TWILIO_ACCOUNT_SID in the environment."
      );
    }
    const resolvedToken = opts.authToken ?? process.env.TWILIO_AUTH_TOKEN;
    if (!resolvedToken) {
      throw new Error(
        "Twilio carrier requires authToken. Pass { authToken: '...' } or set TWILIO_AUTH_TOKEN in the environment."
      );
    }
    this.accountSid = resolvedSid;
    this.authToken = resolvedToken;
  }
};
9489
+
9490
+ // src/carriers/telnyx.ts
9491
/**
 * Telnyx credentials holder. Each value comes from the options object or,
 * when absent there, from the matching TELNYX_* environment variable.
 */
var Carrier2 = class {
  /** Carrier discriminator; always "telnyx". */
  kind = "telnyx";
  apiKey;
  connectionId;
  /** Not validated by the constructor; stays undefined when not configured. */
  publicKey;
  /**
   * @param {object} [opts] - Reads `apiKey`, `connectionId` and `publicKey`.
   * @throws {Error} When the API key or the connection ID cannot be resolved.
   */
  constructor(opts = {}) {
    const resolvedKey = opts.apiKey ?? process.env.TELNYX_API_KEY;
    if (!resolvedKey) {
      throw new Error(
        "Telnyx carrier requires apiKey. Pass { apiKey: '...' } or set TELNYX_API_KEY in the environment."
      );
    }
    const resolvedConnection = opts.connectionId ?? process.env.TELNYX_CONNECTION_ID;
    if (!resolvedConnection) {
      throw new Error(
        "Telnyx carrier requires connectionId. Pass { connectionId: '...' } or set TELNYX_CONNECTION_ID in the environment."
      );
    }
    this.apiKey = resolvedKey;
    this.connectionId = resolvedConnection;
    this.publicKey = opts.publicKey ?? process.env.TELNYX_PUBLIC_KEY;
  }
};
9515
+
9516
+ // src/public-api.ts
9517
/** Replacement text used when a guardrail is created without its own. */
var DEFAULT_GUARDRAIL_REPLACEMENT = "I'm sorry, I can't respond to that.";
/** Named guardrail definition: optional blocked terms, optional check function, replacement text. */
var Guardrail = class {
  name;
  blockedTerms;
  check;
  replacement;
  /**
   * @param {object} opts - Reads `name`, `blockedTerms`, `check` and `replacement`.
   * @throws {Error} When `name` is missing or empty.
   */
  constructor(opts) {
    const { name, blockedTerms, check, replacement } = opts;
    if (!name) {
      throw new Error("Guardrail requires a non-empty name.");
    }
    this.name = name;
    // Falsy values leave the corresponding field undefined.
    if (blockedTerms) this.blockedTerms = blockedTerms;
    if (check) this.check = check;
    this.replacement = replacement ?? DEFAULT_GUARDRAIL_REPLACEMENT;
  }
};
/**
 * Factory shorthand for `new Guardrail(opts)`.
 *
 * @param {object} opts - Forwarded to the {@link Guardrail} constructor.
 * @returns {Guardrail} The new guardrail.
 */
function guardrail(opts) {
  const created = new Guardrail(opts);
  return created;
}
9536
/** Named tool definition backed by exactly one of a handler function or a webhook URL. */
var Tool = class {
  name;
  description;
  parameters;
  handler;
  webhookUrl;
  /**
   * @param {object} opts - Reads `name`, `description`, `parameters`,
   *   `handler` and `webhookUrl`.
   * @throws {Error} When `name` is empty, or when neither or both of
   *   `handler` (a function) and `webhookUrl` (a non-empty string) are given.
   */
  constructor(opts) {
    if (!opts.name) {
      throw new Error("Tool requires a non-empty name.");
    }
    const handlerGiven = typeof opts.handler === "function";
    const webhookGiven = typeof opts.webhookUrl === "string" && opts.webhookUrl.length > 0;
    // Equal flags mean "neither" or "both"; exactly one backend is required.
    if (handlerGiven === webhookGiven) {
      throw new Error(
        handlerGiven ? "Tool accepts handler OR webhookUrl, not both." : "Tool requires either handler or webhookUrl."
      );
    }
    this.name = opts.name;
    this.description = opts.description ?? "";
    this.parameters = opts.parameters ?? { type: "object", properties: {} };
    if (handlerGiven) {
      this.handler = opts.handler;
    } else {
      this.webhookUrl = opts.webhookUrl;
    }
  }
};
/**
 * Factory shorthand for `new Tool(opts)`.
 *
 * @param {object} opts - Forwarded to the {@link Tool} constructor.
 * @returns {Tool} The new tool.
 */
function tool(opts) {
  const created = new Tool(opts);
  return created;
}
9564
+
9222
9565
  // src/index.ts
9223
9566
  init_transcoding();
9224
9567
  init_tunnel();
@@ -9843,21 +10186,25 @@ function isAudioConfig(value) {
9843
10186
  CartesiaSTT,
9844
10187
  CartesiaTTS,
9845
10188
  ChatContext,
10189
+ CloudflareTunnel,
9846
10190
  DEFAULT_MIN_SENTENCE_LEN,
9847
10191
  DEFAULT_PRICING,
9848
10192
  DTMF_EVENTS,
9849
10193
  DeepgramSTT,
10194
+ ElevenLabsConvAI,
9850
10195
  ElevenLabsConvAIAdapter,
9851
10196
  ElevenLabsTTS,
9852
10197
  FallbackLLMProvider,
9853
10198
  GEMINI_DEFAULT_INPUT_SR,
9854
10199
  GEMINI_DEFAULT_OUTPUT_SR,
9855
10200
  GeminiLiveAdapter,
10201
+ Guardrail,
9856
10202
  IVRActivity,
9857
10203
  LLMLoop,
9858
10204
  LMNTTTS,
9859
10205
  MetricsStore,
9860
10206
  OpenAILLMProvider,
10207
+ OpenAIRealtime,
9861
10208
  OpenAIRealtimeAdapter,
9862
10209
  OpenAITTS,
9863
10210
  PartialStreamError,
@@ -9870,8 +10217,12 @@ function isAudioConfig(value) {
9870
10217
  RimeTTS,
9871
10218
  SentenceChunker,
9872
10219
  SonioxSTT,
10220
+ StaticTunnel,
10221
+ Telnyx,
9873
10222
  TestSession,
9874
10223
  TfidfLoopDetector,
10224
+ Tool,
10225
+ Twilio,
9875
10226
  ULTRAVOX_DEFAULT_API_BASE,
9876
10227
  ULTRAVOX_DEFAULT_SR,
9877
10228
  UltravoxRealtimeAdapter,
@@ -9891,6 +10242,7 @@ function isAudioConfig(value) {
9891
10242
  filterMarkdown,
9892
10243
  formatDtmf,
9893
10244
  getLogger,
10245
+ guardrail,
9894
10246
  isRemoteUrl,
9895
10247
  isWebSocketUrl,
9896
10248
  makeAuthMiddleware,
@@ -9912,5 +10264,6 @@ function isAudioConfig(value) {
9912
10264
  selectSoundFromList,
9913
10265
  setLogger,
9914
10266
  startTunnel,
10267
+ tool,
9915
10268
  whisper
9916
10269
  });