getpatter 0.6.1 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -6,6 +6,7 @@ import {
6
6
  CallMetricsAccumulator,
7
7
  DEFAULT_MIN_SENTENCE_LEN,
8
8
  DEFAULT_PRICING,
9
+ DeepgramModel,
9
10
  DeepgramSTT,
10
11
  DefaultToolExecutor,
11
12
  ElevenLabsConvAIAdapter,
@@ -15,12 +16,12 @@ import {
15
16
  LLMLoop,
16
17
  MetricsStore,
17
18
  OpenAILLMProvider,
18
- OpenAIRealtime2Adapter,
19
- OpenAIRealtimeAdapter,
19
+ PRICING_LAST_UPDATED,
20
+ PRICING_VERSION,
20
21
  PatterConnectionError,
21
22
  PatterError,
22
- PcmCarry,
23
23
  PipelineHookExecutor,
24
+ PricingUnit,
24
25
  ProvisionError,
25
26
  RateLimitError,
26
27
  RemoteMessageHandler,
@@ -32,18 +33,14 @@ import {
32
33
  SPAN_TOOL,
33
34
  SPAN_TTS,
34
35
  SentenceChunker,
35
- StatefulResampler,
36
36
  TestSession,
37
+ VERSION,
37
38
  calculateRealtimeCost,
38
39
  calculateSttCost,
39
40
  calculateTelephonyCost,
40
41
  calculateTtsCost,
41
42
  callsToCsv,
42
43
  callsToJson,
43
- createResampler16kTo8k,
44
- createResampler24kTo16k,
45
- createResampler24kTo8k,
46
- createResampler8kTo16k,
47
44
  initTracing,
48
45
  isRemoteUrl,
49
46
  isTracingEnabled,
@@ -53,14 +50,29 @@ import {
53
50
  mergePricing,
54
51
  mountApi,
55
52
  mountDashboard,
53
+ resolveLogRoot,
54
+ startSpan
55
+ } from "./chunk-LE63CSOB.mjs";
56
+ import {
57
+ OpenAIRealtime2Adapter,
58
+ OpenAIRealtimeAdapter,
59
+ OpenAIRealtimeAudioFormat,
60
+ OpenAIRealtimeModel,
61
+ OpenAIRealtimeVADType,
62
+ OpenAITranscriptionModel,
63
+ OpenAIVoice,
64
+ PcmCarry,
65
+ StatefulResampler,
66
+ createResampler16kTo8k,
67
+ createResampler24kTo16k,
68
+ createResampler24kTo8k,
69
+ createResampler8kTo16k,
56
70
  mulawToPcm16,
57
71
  pcm16ToMulaw,
58
72
  resample16kTo8k,
59
73
  resample24kTo16k,
60
- resample8kTo16k,
61
- resolveLogRoot,
62
- startSpan
63
- } from "./chunk-TEW3NAZJ.mjs";
74
+ resample8kTo16k
75
+ } from "./chunk-CL2U3YET.mjs";
64
76
  import {
65
77
  MinWordsStrategy,
66
78
  evaluateStrategies,
@@ -75,7 +87,7 @@ import {
75
87
  } from "./chunk-6GR5MHHQ.mjs";
76
88
  import {
77
89
  SileroVAD
78
- } from "./chunk-RV7APPYE.mjs";
90
+ } from "./chunk-R2T4JABZ.mjs";
79
91
  import {
80
92
  __dirname,
81
93
  __require,
@@ -105,7 +117,7 @@ var Realtime = class {
105
117
  );
106
118
  }
107
119
  this.apiKey = key;
108
- this.model = opts.model ?? "gpt-4o-mini-realtime-preview";
120
+ this.model = opts.model ?? "gpt-realtime-mini";
109
121
  this.voice = opts.voice ?? "alloy";
110
122
  this.reasoningEffort = opts.reasoningEffort;
111
123
  this.inputAudioTranscriptionModel = opts.inputAudioTranscriptionModel;
@@ -557,7 +569,9 @@ function resolvePersistRoot(persist) {
557
569
  if (persist === false) return null;
558
570
  if (persist === true) return resolveLogRoot("auto");
559
571
  if (typeof persist === "string") return resolveLogRoot(persist);
560
- return resolveLogRoot();
572
+ const envRoot = resolveLogRoot();
573
+ if (envRoot !== null) return envRoot;
574
+ return resolveLogRoot("auto");
561
575
  }
562
576
  function closeParkedConnections(slot) {
563
577
  if (slot.stt) {
@@ -573,6 +587,11 @@ function closeParkedConnections(slot) {
573
587
  }
574
588
  }
575
589
  if (slot.openaiRealtime) {
590
+ const wsAny = slot.openaiRealtime;
591
+ if (wsAny._parkedKeepalive) {
592
+ clearInterval(wsAny._parkedKeepalive);
593
+ delete wsAny._parkedKeepalive;
594
+ }
576
595
  try {
577
596
  slot.openaiRealtime.close();
578
597
  } catch {
@@ -1014,7 +1033,7 @@ var Patter = class {
1014
1033
  }
1015
1034
  /** Run the agent in interactive terminal-test mode (no real telephony). */
1016
1035
  async test(opts) {
1017
- const { TestSession: TestSession2 } = await import("./test-mode-WEKKNBLD.mjs");
1036
+ const { TestSession: TestSession2 } = await import("./test-mode-RS57BDM6.mjs");
1018
1037
  const session = new TestSession2();
1019
1038
  await session.run({
1020
1039
  agent: opts.agent,
@@ -1144,7 +1163,9 @@ var Patter = class {
1144
1163
  const tts = agent.tts;
1145
1164
  const sttOpen = typeof stt?.openParkedConnection === "function" ? stt.openParkedConnection.bind(stt) : null;
1146
1165
  const ttsOpen = typeof tts?.openParkedConnection === "function" ? tts.openParkedConnection.bind(tts) : null;
1147
- if (!sttOpen && !ttsOpen) return;
1166
+ const providerStr = agent.provider ?? "";
1167
+ const wantsRealtimePark = providerStr === "openai_realtime" || providerStr === "openai_realtime_2";
1168
+ if (!sttOpen && !ttsOpen && !wantsRealtimePark) return;
1148
1169
  const slot = {};
1149
1170
  this.prewarmedConnections.set(callId, slot);
1150
1171
  const startedAt = Date.now();
@@ -1189,6 +1210,43 @@ var Patter = class {
1189
1210
  }
1190
1211
  })());
1191
1212
  }
1213
+ if (wantsRealtimePark) {
1214
+ tasks.push((async () => {
1215
+ const { OpenAIRealtime2Adapter: OpenAIRealtime2Adapter2 } = await import("./openai-realtime-2-CNFARP25.mjs");
1216
+ const apiKey = process.env.OPENAI_API_KEY ?? "";
1217
+ if (!apiKey) {
1218
+ getLogger().debug(`Park OpenAI Realtime skipped for ${callId}: no OPENAI_API_KEY`);
1219
+ return;
1220
+ }
1221
+ try {
1222
+ const tmpAdapter = new OpenAIRealtime2Adapter2(
1223
+ apiKey,
1224
+ agent.model ?? "gpt-realtime-mini",
1225
+ agent.voice ?? "alloy",
1226
+ agent.systemPrompt ?? "",
1227
+ [],
1228
+ // audioFormat — the GA adapter always emits audio/pcm@24000
1229
+ // internally regardless of this value, but it's a required
1230
+ // positional param. Default to g711_ulaw (Twilio wire format).
1231
+ void 0
1232
+ );
1233
+ const ws = await tmpAdapter.openParkedConnection();
1234
+ if (this.prewarmedConnections.get(callId) !== slot) {
1235
+ try {
1236
+ ws.close();
1237
+ } catch {
1238
+ }
1239
+ return;
1240
+ }
1241
+ slot.openaiRealtime = ws;
1242
+ getLogger().info(
1243
+ `[PREWARM] callId=${callId} provider=openai_realtime ms=${Date.now() - startedAt}`
1244
+ );
1245
+ } catch (err) {
1246
+ getLogger().debug(`Park OpenAI Realtime failed for ${callId}: ${String(err)}`);
1247
+ }
1248
+ })());
1249
+ }
1192
1250
  const task = (async () => {
1193
1251
  await Promise.allSettled(tasks);
1194
1252
  })();
@@ -1266,7 +1324,7 @@ var Patter = class {
1266
1324
  * with a warn when the cap is reached (the call still proceeds —
1267
1325
  * StreamHandler falls back to live TTS).
1268
1326
  */
1269
- spawnPrewarmFirstMessage(agent, callId, ringTimeout) {
1327
+ spawnPrewarmFirstMessage(agent, callId, ringTimeout, carrier) {
1270
1328
  if (!agent.prewarmFirstMessage) return;
1271
1329
  const providerMode = agent.provider ?? "openai_realtime";
1272
1330
  if (providerMode !== "pipeline") {
@@ -1279,6 +1337,18 @@ var Patter = class {
1279
1337
  const tts = agent.tts;
1280
1338
  if (!firstMessage || !tts) return;
1281
1339
  if (typeof tts.synthesizeStream !== "function") return;
1340
+ if (carrier) {
1341
+ const carrierAware = tts;
1342
+ if (typeof carrierAware.setTelephonyCarrier === "function") {
1343
+ try {
1344
+ carrierAware.setTelephonyCarrier(carrier);
1345
+ } catch (err) {
1346
+ getLogger().debug(
1347
+ `Prewarm TTS setTelephonyCarrier failed for ${callId}: ${String(err)}`
1348
+ );
1349
+ }
1350
+ }
1351
+ }
1282
1352
  const inFlight = this.prewarmAudio.size + this.prewarmTasks.size;
1283
1353
  if (inFlight >= PREWARM_CACHE_MAX) {
1284
1354
  getLogger().warn(
@@ -1391,16 +1461,25 @@ var Patter = class {
1391
1461
  telnyxCallId = body.data?.call_control_id;
1392
1462
  } catch {
1393
1463
  }
1394
- if (this.embeddedServer && telnyxCallId) {
1395
- this.embeddedServer.metricsStore.recordCallInitiated({
1464
+ if (telnyxCallId) {
1465
+ const initiatedPayload = {
1396
1466
  call_id: telnyxCallId,
1397
1467
  caller: phoneNumber,
1398
1468
  callee: options.to,
1399
- direction: "outbound"
1400
- });
1469
+ direction: "outbound",
1470
+ status: "initiated"
1471
+ };
1472
+ if (this.embeddedServer) {
1473
+ this.embeddedServer.metricsStore.recordCallInitiated(initiatedPayload);
1474
+ }
1475
+ try {
1476
+ const { notifyDashboard: notifyDashboard2 } = await import("./persistence-LVIAHESK.mjs");
1477
+ notifyDashboard2(initiatedPayload);
1478
+ } catch {
1479
+ }
1401
1480
  }
1402
1481
  if (telnyxCallId) {
1403
- this.spawnPrewarmFirstMessage(options.agent, telnyxCallId, effectiveRingTimeout);
1482
+ this.spawnPrewarmFirstMessage(options.agent, telnyxCallId, effectiveRingTimeout, "telnyx");
1404
1483
  if (options.agent.prewarm !== false) {
1405
1484
  this.parkProviderConnections(options.agent, telnyxCallId);
1406
1485
  }
@@ -1453,21 +1532,30 @@ var Patter = class {
1453
1532
  twilioNotificationsPath = body.subresource_uris?.notifications;
1454
1533
  } catch {
1455
1534
  }
1456
- if (this.embeddedServer && twilioCallSid) {
1457
- this.embeddedServer.metricsStore.recordCallInitiated({
1535
+ if (twilioCallSid) {
1536
+ const initiatedPayload = {
1458
1537
  call_id: twilioCallSid,
1459
1538
  caller: phoneNumber,
1460
1539
  callee: options.to,
1461
- direction: "outbound"
1462
- });
1463
- if (twilioNotificationsPath) {
1464
- getLogger().info(
1465
- `Outbound call ${twilioCallSid} placed. Twilio notifications: https://api.twilio.com${twilioNotificationsPath} (check here if the call drops with no audio).`
1466
- );
1540
+ direction: "outbound",
1541
+ status: "initiated"
1542
+ };
1543
+ if (this.embeddedServer) {
1544
+ this.embeddedServer.metricsStore.recordCallInitiated(initiatedPayload);
1545
+ if (twilioNotificationsPath) {
1546
+ getLogger().info(
1547
+ `Outbound call ${twilioCallSid} placed. Twilio notifications: https://api.twilio.com${twilioNotificationsPath} (check here if the call drops with no audio).`
1548
+ );
1549
+ }
1550
+ }
1551
+ try {
1552
+ const { notifyDashboard: notifyDashboard2 } = await import("./persistence-LVIAHESK.mjs");
1553
+ notifyDashboard2(initiatedPayload);
1554
+ } catch {
1467
1555
  }
1468
1556
  }
1469
1557
  if (twilioCallSid) {
1470
- this.spawnPrewarmFirstMessage(options.agent, twilioCallSid, effectiveRingTimeout);
1558
+ this.spawnPrewarmFirstMessage(options.agent, twilioCallSid, effectiveRingTimeout, "twilio");
1471
1559
  if (options.agent.prewarm !== false) {
1472
1560
  this.parkProviderConnections(options.agent, twilioCallSid);
1473
1561
  }
@@ -2764,109 +2852,694 @@ function scheduleInterval(intervalOrOpts, callback) {
2764
2852
  };
2765
2853
  }
2766
2854
 
2767
- // src/stt/deepgram.ts
2855
+ // src/providers/elevenlabs-tts.ts
2768
2856
  init_esm_shims();
2769
- var STT = class extends DeepgramSTT {
2770
- static providerKey = "deepgram";
2771
- constructor(opts = {}) {
2772
- const key = opts.apiKey ?? process.env.DEEPGRAM_API_KEY;
2773
- if (!key) {
2774
- throw new Error(
2775
- "Deepgram STT requires an apiKey. Pass { apiKey: 'dg_...' } or set DEEPGRAM_API_KEY in the environment."
2776
- );
2777
- }
2778
- super(
2779
- key,
2780
- opts.language ?? "en",
2781
- opts.model ?? "nova-3",
2782
- opts.encoding ?? "linear16",
2783
- opts.sampleRate ?? 16e3,
2784
- {
2785
- endpointingMs: opts.endpointingMs ?? 150,
2786
- utteranceEndMs: opts.utteranceEndMs === null ? null : opts.utteranceEndMs ?? 1e3,
2787
- smartFormat: opts.smartFormat ?? true,
2788
- interimResults: opts.interimResults ?? true,
2789
- ...opts.vadEvents !== void 0 ? { vadEvents: opts.vadEvents } : {}
2790
- }
2791
- );
2792
- }
2857
+ var ELEVENLABS_BASE_URL = "https://api.elevenlabs.io/v1";
2858
+ var ELEVENLABS_VOICE_ID_BY_NAME = {
2859
+ rachel: "21m00Tcm4TlvDq8ikWAM",
2860
+ drew: "29vD33N1CtxCmqQRPOHJ",
2861
+ clyde: "2EiwWnXFnvU5JabPnv8n",
2862
+ paul: "5Q0t7uMcjvnagumLfvZi",
2863
+ domi: "AZnzlk1XvdvUeBnXmlld",
2864
+ dave: "CYw3kZ02Hs0563khs1Fj",
2865
+ fin: "D38z5RcWu1voky8WS1ja",
2866
+ bella: "EXAVITQu4vr4xnSDxMaL",
2867
+ antoni: "ErXwobaYiN019PkySvjV",
2868
+ thomas: "GBv7mTt0atIp3Br8iCZE",
2869
+ charlie: "IKne3meq5aSn9XLyUdCD",
2870
+ george: "JBFqnCBsd6RMkjVDRZzb",
2871
+ emily: "LcfcDJNUP1GQjkzn1xUU",
2872
+ elli: "MF3mGyEYCl7XYWbV9V6O",
2873
+ callum: "N2lVS1w4EtoT3dr4eOWO",
2874
+ patrick: "ODq5zmih8GrVes37Dizd",
2875
+ harry: "SOYHLrjzK2X1ezoPC6cr",
2876
+ liam: "TX3LPaxmHKxFdv7VOQHJ",
2877
+ dorothy: "ThT5KcBeYPX3keUQqHPh",
2878
+ josh: "TxGEqnHWrfWFTfGW9XjX",
2879
+ arnold: "VR6AewLTigWG4xSOukaG",
2880
+ charlotte: "XB0fDUnXU5powFXDhCwa",
2881
+ matilda: "XrExE9yKIg1WjnnlVkGX",
2882
+ matthew: "Yko7PKHZNXotIFUBG7I9",
2883
+ james: "ZQe5CZNOzWyzPSCn5a3c",
2884
+ joseph: "Zlb1dXrM653N07WRdFW3",
2885
+ jeremy: "bVMeCyTHy58xNoL34h3p",
2886
+ michael: "flq6f7yk4E4fJM5XTYuZ",
2887
+ ethan: "g5CIjZEefAph4nQFvHAz",
2888
+ gigi: "jBpfuIE2acCO8z3wKNLl",
2889
+ freya: "jsCqWAovK2LkecY7zXl4",
2890
+ brian: "nPczCjzI2devNBz1zQrb",
2891
+ grace: "oWAxZDx7w5VEj9dCyTzz",
2892
+ daniel: "onwK4e9ZLuTAKqWW03F9",
2893
+ lily: "pFZP5JQG7iQjIQuC4Bku",
2894
+ serena: "pMsXgVXv3BLzUgSXRplE",
2895
+ adam: "pNInz6obpgDQGcFmaJgB",
2896
+ nicole: "piTKgcLEGmPE4e6mEKli",
2897
+ bill: "pqHfZKP75CvOlQylNhV4",
2898
+ jessie: "t0jbNlBVZ17f02VDIeMI",
2899
+ ryan: "wViXBPUzp2ZZixB1xQuM",
2900
+ sam: "yoZ06aMxZJJ28mfd3POQ",
2901
+ glinda: "z9fAnlkpzviPz146aGWa",
2902
+ giovanni: "zcAOhNBS3c14rBihAFp1",
2903
+ mimi: "zrHiDhphv9ZnVXBqCLjz",
2904
+ sarah: "EXAVITQu4vr4xnSDxMaL",
2905
+ alloy: "EXAVITQu4vr4xnSDxMaL"
2793
2906
  };
2794
-
2795
- // src/stt/whisper.ts
2796
- init_esm_shims();
2797
-
2798
- // src/providers/whisper-stt.ts
2799
- init_esm_shims();
2800
- var OPENAI_TRANSCRIPTION_URL = "https://api.openai.com/v1/audio/transcriptions";
2801
- var DEFAULT_BUFFER_SIZE = 16e3 * 2;
2802
- var ALLOWED_MODELS = /* @__PURE__ */ new Set(["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"]);
2803
- function wrapPcmInWav(pcm, sampleRate = 16e3, channels = 1, bitsPerSample = 16) {
2804
- const dataSize = pcm.length;
2805
- const header = Buffer.alloc(44);
2806
- header.write("RIFF", 0);
2807
- header.writeUInt32LE(36 + dataSize, 4);
2808
- header.write("WAVE", 8);
2809
- header.write("fmt ", 12);
2810
- header.writeUInt32LE(16, 16);
2811
- header.writeUInt16LE(1, 20);
2812
- header.writeUInt16LE(channels, 22);
2813
- header.writeUInt32LE(sampleRate, 24);
2814
- header.writeUInt32LE(sampleRate * channels * (bitsPerSample / 8), 28);
2815
- header.writeUInt16LE(channels * (bitsPerSample / 8), 32);
2816
- header.writeUInt16LE(bitsPerSample, 34);
2817
- header.write("data", 36);
2818
- header.writeUInt32LE(dataSize, 40);
2819
- return Buffer.concat([header, pcm]);
2907
+ var VOICE_ID_PATTERN = /^[A-Za-z0-9]{20}$/;
2908
/**
 * Resolve a caller-supplied ElevenLabs voice reference to the value that is
 * sent as the voice ID.
 *
 * Resolution order:
 *   1. Falsy input (`undefined`, `null`, `""`) is returned unchanged.
 *   2. Input matching `VOICE_ID_PATTERN` is returned unchanged.
 *   3. Otherwise the lower-cased input is looked up in
 *      `ELEVENLABS_VOICE_ID_BY_NAME`; an unknown name is returned unchanged.
 *
 * @param {string | undefined | null} voice Voice name or voice ID.
 * @returns {string | undefined | null} The mapped voice ID, or `voice` itself
 *   when no mapping applies.
 */
function resolveVoiceId(voice) {
  if (!voice) return voice;
  if (VOICE_ID_PATTERN.test(voice)) return voice;
  const lookupKey = voice.toLowerCase();
  // Own-property check: the table is a plain object literal, so a bare
  // `table[key]` lookup would also hit inherited members such as
  // "constructor" or "__proto__" and return a function/object instead of
  // passing the caller's string through.
  if (!Object.hasOwn(ELEVENLABS_VOICE_ID_BY_NAME, lookupKey)) return voice;
  return ELEVENLABS_VOICE_ID_BY_NAME[lookupKey] ?? voice;
}
2821
- var WhisperSTT = class _WhisperSTT {
2822
- /** Stable pricing/dashboard key — read by stream-handler/metrics. */
2823
- static providerKey = "whisper";
2913
+ var ElevenLabsModel = {
2914
+ V3: "eleven_v3",
2915
+ FLASH_V2_5: "eleven_flash_v2_5",
2916
+ TURBO_V2_5: "eleven_turbo_v2_5",
2917
+ MULTILINGUAL_V2: "eleven_multilingual_v2",
2918
+ MONOLINGUAL_V1: "eleven_monolingual_v1"
2919
+ };
2920
+ var ElevenLabsOutputFormat = {
2921
+ MP3_22050_32: "mp3_22050_32",
2922
+ MP3_44100_32: "mp3_44100_32",
2923
+ MP3_44100_64: "mp3_44100_64",
2924
+ MP3_44100_96: "mp3_44100_96",
2925
+ MP3_44100_128: "mp3_44100_128",
2926
+ MP3_44100_192: "mp3_44100_192",
2927
+ PCM_8000: "pcm_8000",
2928
+ PCM_16000: "pcm_16000",
2929
+ PCM_22050: "pcm_22050",
2930
+ PCM_24000: "pcm_24000",
2931
+ PCM_44100: "pcm_44100",
2932
+ ULAW_8000: "ulaw_8000"
2933
+ };
2934
/**
 * ElevenLabs text-to-speech client backed by the REST streaming endpoint
 * (`POST /text-to-speech/{voiceId}/stream`).
 *
 * Two constructor call shapes are accepted:
 *   - `new ElevenLabsTTS(apiKey, voiceId?, modelId?, outputFormat?)`
 *   - `new ElevenLabsTTS(apiKey, { voiceId, modelId, outputFormat,
 *      voiceSettings, languageCode, chunkSize })`
 */
var ElevenLabsTTS = class _ElevenLabsTTS {
  // Stable pricing/dashboard key — read by stream-handler / metrics via
  // ``(agent.tts.constructor as any).providerKey``. Without this the cost
  // calculator falls back to ``constructor.name`` ("ElevenLabsTTS") which
  // does NOT match the pricing table key "elevenlabs", silently zeroing
  // TTS cost for callers that construct the raw REST class directly
  // (exposed at top level as ``ElevenLabsRestTTS``).
  static providerKey = "elevenlabs";
  apiKey;
  voiceId;
  modelId;
  _outputFormat;
  _outputFormatExplicit;
  voiceSettings;
  languageCode;
  chunkSize;
  /**
   * Public view of the (possibly auto-flipped) wire format. Read by the
   * stream-handler to decide whether to skip the client-side resample +
   * mulaw encode when the bytes are already in the carrier's wire codec.
   */
  get outputFormat() {
    return this._outputFormat;
  }
  /**
   * @param apiKey ElevenLabs API key, sent as the `xi-api-key` header.
   * @param voiceIdOrOptions Voice name / voice ID string, or an options object.
   * @param modelId Model id (positional form only). Defaults to
   *   `ElevenLabsModel.FLASH_V2_5`.
   * @param outputFormat Output format (positional form only). Defaults to
   *   `ElevenLabsOutputFormat.PCM_16000`. Any supplied value — including
   *   `"pcm_16000"` itself — counts as an explicit choice and disables the
   *   auto-flip in {@link setTelephonyCarrier}.
   * @throws {RangeError} When `options.chunkSize` is not a finite number >= 1.
   */
  constructor(apiKey, voiceIdOrOptions = "21m00Tcm4TlvDq8ikWAM", modelId = ElevenLabsModel.FLASH_V2_5, outputFormat) {
    this.apiKey = apiKey;
    if (typeof voiceIdOrOptions === "object") {
      const o = voiceIdOrOptions;
      this.voiceId = resolveVoiceId(o.voiceId ?? "21m00Tcm4TlvDq8ikWAM");
      this.modelId = o.modelId ?? ElevenLabsModel.FLASH_V2_5;
      this._outputFormatExplicit = o.outputFormat !== void 0;
      this._outputFormat = o.outputFormat ?? ElevenLabsOutputFormat.PCM_16000;
      this.voiceSettings = o.voiceSettings;
      this.languageCode = o.languageCode;
      const requestedChunkSize = o.chunkSize ?? 4096;
      // `synthesizeStream` advances its slice offset by `chunkSize`; a value
      // of 0, a negative number, NaN or a non-number would loop forever or
      // drop audio, so reject it up front instead of hanging mid-call.
      if (!Number.isFinite(requestedChunkSize) || requestedChunkSize < 1) {
        throw new RangeError(
          `ElevenLabsTTS: chunkSize must be a finite number >= 1 (got ${String(requestedChunkSize)})`
        );
      }
      this.chunkSize = Math.floor(requestedChunkSize);
    } else {
      this.voiceId = resolveVoiceId(voiceIdOrOptions);
      this.modelId = modelId;
      // "Explicit" means the caller actually supplied the argument. Comparing
      // against the default value would misclassify an explicit "pcm_16000"
      // as implicit and let setTelephonyCarrier override the caller's choice.
      this._outputFormatExplicit = outputFormat !== void 0;
      this._outputFormat = this._outputFormatExplicit ? outputFormat : ElevenLabsOutputFormat.PCM_16000;
      this.voiceSettings = void 0;
      this.languageCode = void 0;
      this.chunkSize = 4096;
    }
  }
  /**
   * Hook called by ``StreamHandler.initPipeline`` to advise the carrier
   * wire format. When the user did NOT pass an explicit ``outputFormat``,
   * auto-flip to the carrier's native codec so the audio bytes ElevenLabs
   * returns are already in Twilio/Telnyx wire format — eliminating the
   * client-side 16 kHz → 8 kHz resample and PCM → μ-law encode. The
   * resample/encode chain was a source of audible artifacts on the
   * prewarmed firstMessage (see 0.6.2 acceptance notes — burst delivery
   * of resampled audio crackled on the carrier-side jitter buffer).
   *
   * No-op when the caller passed an explicit ``outputFormat`` (incl. via
   * the ``forTwilio`` / ``forTelnyx`` factories) — user wins.
   *
   * Parity with {@link ElevenLabsWebSocketTTS.setTelephonyCarrier}.
   *
   * @param carrier `"twilio"` selects `ulaw_8000`, `"telnyx"` selects
   *   `pcm_16000`; any other value leaves the format untouched.
   */
  setTelephonyCarrier(carrier) {
    if (this._outputFormatExplicit) return;
    if (carrier === "twilio") {
      this._outputFormat = ElevenLabsOutputFormat.ULAW_8000;
    } else if (carrier === "telnyx") {
      this._outputFormat = ElevenLabsOutputFormat.PCM_16000;
    }
  }
  /**
   * Construct an instance pre-configured for Twilio Media Streams.
   *
   * Sets `outputFormat='ulaw_8000'` so ElevenLabs emits μ-law @ 8 kHz
   * directly — the exact wire format Twilio's media stream uses — letting
   * the SDK skip the 16 kHz→8 kHz resample and PCM→μ-law conversion in
   * `TwilioAudioSender`. Saves ~30–80 ms first-byte and per-frame CPU,
   * and removes a potential aliasing source.
   *
   * `voiceSettings` defaults to a low-bandwidth-friendly profile
   * (speaker boost off, modest stability) which sounds cleaner at 8 kHz
   * μ-law than the studio default. Pass an explicit object to override.
   */
  static forTwilio(apiKey, options = {}) {
    const voiceSettings = options.voiceSettings ?? {
      // Speaker boost adds high-frequency emphasis that aliases ugly over an
      // 8 kHz μ-law line. Slightly higher stability tames the excursions
      // that compander quantization noise can amplify.
      stability: 0.6,
      similarity_boost: 0.75,
      use_speaker_boost: false
    };
    return new _ElevenLabsTTS(apiKey, {
      ...options,
      voiceSettings,
      outputFormat: ElevenLabsOutputFormat.ULAW_8000
    });
  }
  /**
   * Construct an instance pre-configured for Telnyx bidirectional media.
   *
   * Telnyx's default media-streaming codec is L16 PCM @ 16 kHz, which
   * matches our default Telnyx handler. We pick `pcm_16000` so the audio
   * flows end-to-end with zero resampling or transcoding.
   *
   * Trade-off: if your Telnyx profile is pinned to PCMU/8000 (μ-law),
   * construct `ElevenLabsTTS` directly with `outputFormat: 'ulaw_8000'`
   * — Telnyx supports that natively too.
   */
  static forTelnyx(apiKey, options = {}) {
    return new _ElevenLabsTTS(apiKey, {
      ...options,
      outputFormat: ElevenLabsOutputFormat.PCM_16000
    });
  }
  /**
   * Synthesise text to speech and return the full audio as a single Buffer.
   *
   * For large chunks (or when latency matters) call `synthesizeStream` instead.
   */
  async synthesize(text) {
    const chunks = [];
    for await (const chunk of this.synthesizeStream(text)) {
      chunks.push(chunk);
    }
    return Buffer.concat(chunks);
  }
  /**
   * Synthesise text and yield audio chunks as they arrive (streaming).
   *
   * The yielded buffers are raw PCM at 16 kHz (or whatever `outputFormat` is
   * configured to). `chunkSize` controls the maximum yield size — 512 is a
   * good choice for low-latency telephony.
   *
   * @throws {Error} On a non-2xx response (message carries the status and
   *   response text) or when the response has no body. The request is
   *   aborted after 30 s via `AbortSignal.timeout`.
   */
  async *synthesizeStream(text) {
    const url = `${ELEVENLABS_BASE_URL}/text-to-speech/${encodeURIComponent(this.voiceId)}/stream?output_format=${encodeURIComponent(this._outputFormat)}`;
    const body = {
      text,
      model_id: this.modelId
    };
    if (this.voiceSettings) body["voice_settings"] = this.voiceSettings;
    if (this.languageCode) body["language_code"] = this.languageCode;
    const response = await fetch(url, {
      method: "POST",
      headers: {
        "xi-api-key": this.apiKey,
        "Content-Type": "application/json"
      },
      body: JSON.stringify(body),
      signal: AbortSignal.timeout(3e4)
    });
    if (!response.ok) {
      const errBody = await response.text();
      throw new Error(`ElevenLabs TTS error ${response.status}: ${errBody}`);
    }
    if (!response.body) {
      throw new Error("ElevenLabs TTS: no response body");
    }
    const reader = response.body.getReader();
    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        if (!value || value.length === 0) continue;
        const buf = Buffer.from(value);
        // Re-slice each network read so no yielded chunk exceeds chunkSize.
        for (let offset = 0; offset < buf.length; offset += this.chunkSize) {
          yield buf.subarray(offset, Math.min(offset + this.chunkSize, buf.length));
        }
      }
    } finally {
      // Runs on normal completion, on error, and when the consumer stops
      // iterating early: cancel the stream, then release the reader lock.
      if (typeof reader.cancel === "function") await reader.cancel().catch(() => {
      });
      reader.releaseLock();
    }
  }
};
3108
+
3109
+ // src/providers/cartesia-tts.ts
3110
+ init_esm_shims();
3111
+ var CARTESIA_BASE_URL = "https://api.cartesia.ai";
3112
+ var CARTESIA_API_VERSION = "2025-04-16";
3113
+ var CARTESIA_DEFAULT_VOICE_ID = "f786b574-daa5-4673-aa0c-cbe3e8534c02";
3114
+ var CartesiaTTSModel = {
3115
+ SONIC_3: "sonic-3",
3116
+ SONIC_2: "sonic-2",
3117
+ SONIC: "sonic"
3118
+ };
3119
+ var CartesiaTTSContainer = {
3120
+ RAW: "raw",
3121
+ WAV: "wav",
3122
+ MP3: "mp3"
3123
+ };
3124
+ var CartesiaTTSEncoding = {
3125
+ PCM_S16LE: "pcm_s16le",
3126
+ PCM_F32LE: "pcm_f32le",
3127
+ PCM_MULAW: "pcm_mulaw",
3128
+ PCM_ALAW: "pcm_alaw"
3129
+ };
3130
+ var CartesiaTTSSampleRate = {
3131
+ HZ_8000: 8e3,
3132
+ HZ_16000: 16e3,
3133
+ HZ_22050: 22050,
3134
+ HZ_24000: 24e3,
3135
+ HZ_44100: 44100
3136
+ };
3137
+ var CartesiaTTSVoiceMode = {
3138
+ ID: "id",
3139
+ EMBEDDING: "embedding"
3140
+ };
3141
/**
 * Cartesia text-to-speech client backed by the HTTP `POST /tts/bytes`
 * endpoint. Always requests raw `pcm_s16le` audio at the configured
 * `sampleRate`.
 */
var CartesiaTTS = class _CartesiaTTS {
  /** Stable pricing/dashboard key — read by stream-handler/metrics. */
  static providerKey = "cartesia_tts";
  apiKey;
  model;
  voice;
  language;
  sampleRate;
  speed;
  emotion;
  volume;
  baseUrl;
  apiVersion;
  /**
   * @param apiKey Cartesia API key, sent as the `X-API-Key` header.
   * @param opts Optional overrides: `model`, `voice`, `language`,
   *   `sampleRate`, `speed`, `emotion` (string or array), `volume`,
   *   `baseUrl`, `apiVersion`. Unset fields fall back to the module defaults.
   */
  constructor(apiKey, opts = {}) {
    this.apiKey = apiKey;
    this.model = opts.model ?? CartesiaTTSModel.SONIC_3;
    this.voice = opts.voice ?? CARTESIA_DEFAULT_VOICE_ID;
    this.language = opts.language ?? "en";
    this.sampleRate = opts.sampleRate ?? CartesiaTTSSampleRate.HZ_16000;
    this.speed = opts.speed;
    // Normalise a single emotion string to a one-element array.
    this.emotion = typeof opts.emotion === "string" ? [opts.emotion] : opts.emotion;
    this.volume = opts.volume;
    this.baseUrl = opts.baseUrl ?? CARTESIA_BASE_URL;
    this.apiVersion = opts.apiVersion ?? CARTESIA_API_VERSION;
  }
  /**
   * Construct an instance pre-configured for Twilio Media Streams.
   *
   * Sets `sampleRate=8000` so Cartesia emits PCM_S16LE @ 8 kHz directly.
   * Twilio's media stream uses μ-law @ 8 kHz so the SDK still does the
   * PCM → μ-law transcode client-side, but the 16 kHz → 8 kHz resample
   * step is skipped. Saves ~10–30 ms first-byte plus per-frame CPU and
   * removes a potential aliasing source.
   */
  static forTwilio(apiKey, options = {}) {
    return new _CartesiaTTS(apiKey, {
      ...options,
      sampleRate: CartesiaTTSSampleRate.HZ_8000
    });
  }
  /**
   * Construct an instance pre-configured for Telnyx bidirectional media.
   *
   * Sets `sampleRate=16000` to match Telnyx's L16/16000 default codec —
   * audio flows end-to-end with zero resampling or transcoding. Same as
   * the bare-constructor default; exists for API symmetry with
   * {@link CartesiaTTS.forTwilio}.
   */
  static forTelnyx(apiKey, options = {}) {
    return new _CartesiaTTS(apiKey, {
      ...options,
      sampleRate: CartesiaTTSSampleRate.HZ_16000
    });
  }
  /**
   * Build the JSON payload for the Cartesia bytes endpoint.
   *
   * `generation_config` is attached only when at least one of `speed`,
   * `emotion` or `volume` is set. Only the first entry of `emotion` is sent.
   */
  buildPayload(text) {
    const payload = {
      model_id: this.model,
      voice: { mode: CartesiaTTSVoiceMode.ID, id: this.voice },
      transcript: text,
      output_format: {
        container: CartesiaTTSContainer.RAW,
        encoding: CartesiaTTSEncoding.PCM_S16LE,
        sample_rate: this.sampleRate
      },
      language: this.language
    };
    const generationConfig = {};
    if (this.speed !== void 0) generationConfig.speed = this.speed;
    if (this.emotion && this.emotion.length > 0)
      generationConfig.emotion = this.emotion[0];
    if (this.volume !== void 0) generationConfig.volume = this.volume;
    if (Object.keys(generationConfig).length > 0) {
      payload.generation_config = generationConfig;
    }
    return payload;
  }
  /**
   * Pre-call HTTP warmup for the Cartesia `/tts/bytes` endpoint.
   *
   * Issues a lightweight `GET <baseUrl>/voices` so DNS, TLS, and HTTP/2
   * are already up by the time the first `synthesizeStream()` POST
   * lands. Best-effort: 5 s timeout, all exceptions swallowed at
   * debug level.
   *
   * Billing safety: `GET /voices` is a free metadata read on
   * Cartesia's REST surface (per https://docs.cartesia.ai). It does
   * not consume synthesis credits. The actual synthesis is billed
   * only when `POST /tts/bytes` runs with a non-empty `transcript`.
   *
   * Note: Cartesia TTS uses the HTTP path (vs the WebSocket variant
   * Cartesia also exposes) — connection warmup is therefore HTTP-GET
   * based, not WebSocket pre-handshake. The latency win is smaller
   * (~50-150 ms vs the ~200-500 ms of a WS prewarm) but still real.
   */
  async warmup() {
    try {
      const response = await fetch(`${this.baseUrl}/voices`, {
        method: "GET",
        headers: {
          "X-API-Key": this.apiKey,
          "Cartesia-Version": this.apiVersion
        },
        signal: AbortSignal.timeout(5e3)
      });
      // Drain the body: an unconsumed fetch response keeps its connection
      // checked out, so the warmed socket could not be reused by the next
      // request. The payload is discarded; the 5 s abort signal still
      // bounds the read.
      await response.arrayBuffer();
    } catch (err) {
      getLogger().debug(`Cartesia TTS warmup failed (best-effort): ${String(err)}`);
    }
  }
  /** Synthesize text and return the concatenated audio buffer. */
  async synthesize(text) {
    const chunks = [];
    for await (const chunk of this.synthesizeStream(text)) {
      chunks.push(chunk);
    }
    return Buffer.concat(chunks);
  }
  /**
   * Synthesize text and yield raw PCM_S16LE chunks at the configured
   * `sampleRate` as they arrive from Cartesia.
   *
   * @throws {Error} On a non-2xx response (message carries the status and
   *   response text) or when the response has no body. The request is
   *   aborted after 30 s via `AbortSignal.timeout`.
   */
  async *synthesizeStream(text) {
    const response = await fetch(`${this.baseUrl}/tts/bytes`, {
      method: "POST",
      headers: {
        "X-API-Key": this.apiKey,
        "Cartesia-Version": this.apiVersion,
        "Content-Type": "application/json"
      },
      body: JSON.stringify(this.buildPayload(text)),
      signal: AbortSignal.timeout(3e4)
    });
    if (!response.ok) {
      const body = await response.text();
      throw new Error(`Cartesia TTS error ${response.status}: ${body}`);
    }
    if (!response.body) {
      throw new Error("Cartesia TTS: no response body");
    }
    const reader = response.body.getReader();
    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        if (value && value.length > 0) {
          yield Buffer.from(value);
        }
      }
    } finally {
      // Runs on completion, on error, and on early consumer exit: cancel the
      // stream, then release the reader lock.
      if (typeof reader.cancel === "function")
        await reader.cancel().catch(() => {
        });
      reader.releaseLock();
    }
  }
};
3297
+
3298
+ // src/providers/rime-tts.ts
3299
+ init_esm_shims();
3300
// Default endpoint used by RimeTTS when no `baseUrl` option is supplied.
var RIME_BASE_URL = "https://users.rime.ai/v1/rime-tts";

// Model identifiers sent as `modelId` in the request payload.
var RimeModel = { ARCANA: "arcana", MIST: "mist", MIST_V2: "mistv2" };

// MIME types usable as the `accept` header of a synthesis request.
var RimeAudioFormat = {
  PCM: "audio/pcm",
  MP3: "audio/mp3",
  WAV: "audio/wav",
  MULAW: "audio/mulaw"
};

// Whole-request timeouts in milliseconds: four minutes for Arcana,
// thirty seconds for every other model.
var ARCANA_MODEL_TIMEOUT_MS = 4 * 60 * 1e3;
var MIST_MODEL_TIMEOUT_MS = 3e4;
3314
/**
 * Report whether `model` belongs to the Mist family, i.e. its identifier
 * contains the `RimeModel.MIST` substring (so "mistv2" also matches).
 */
function isMistModel(model) {
  const mistFamilyTag = RimeModel.MIST;
  return model.includes(mistFamilyTag);
}
3317
/**
 * Pick the whole-request timeout (in milliseconds) for a model: the Arcana
 * timeout for exactly `RimeModel.ARCANA`, the Mist timeout for anything else.
 */
function timeoutForModel(model) {
  return model === RimeModel.ARCANA ? ARCANA_MODEL_TIMEOUT_MS : MIST_MODEL_TIMEOUT_MS;
}
3321
/**
 * Rime text-to-speech client that streams audio over a single HTTP POST.
 *
 * Construction only records configuration; no network traffic happens
 * until `synthesize` / `synthesizeStream` is called.
 *
 * Options (all optional):
 * - `model`: defaults to `RimeModel.ARCANA`.
 * - `speaker`: defaults to "cove" for Mist-family models, "astra" otherwise.
 * - `lang`: defaults to "eng".
 * - `sampleRate`: defaults to 16000.
 * - `repetitionPenalty`, `temperature`, `topP`, `maxTokens`: only sent for
 *   the Arcana model.
 * - `speedAlpha`, `pauseBetweenBrackets`, `phonemizeBetweenBrackets`: only
 *   sent for Mist-family models; `reduceLatency` only for `mistv2`.
 * - `baseUrl`: defaults to `RIME_BASE_URL`.
 */
var RimeTTS = class {
  /** Stable pricing/dashboard key — read by stream-handler/metrics. */
  static providerKey = "rime";
  apiKey;
  model;
  speaker;
  lang;
  sampleRate;
  repetitionPenalty;
  temperature;
  topP;
  maxTokens;
  speedAlpha;
  reduceLatency;
  pauseBetweenBrackets;
  phonemizeBetweenBrackets;
  baseUrl;
  totalTimeoutMs;
  constructor(apiKey, opts = {}) {
    this.apiKey = apiKey;
    this.model = opts.model ?? RimeModel.ARCANA;
    // The default voice depends on the model family.
    const defaultSpeaker = isMistModel(this.model) ? "cove" : "astra";
    this.speaker = opts.speaker ?? defaultSpeaker;
    this.lang = opts.lang ?? "eng";
    this.sampleRate = opts.sampleRate ?? 16e3;
    this.repetitionPenalty = opts.repetitionPenalty;
    this.temperature = opts.temperature;
    this.topP = opts.topP;
    this.maxTokens = opts.maxTokens;
    this.speedAlpha = opts.speedAlpha;
    this.reduceLatency = opts.reduceLatency;
    this.pauseBetweenBrackets = opts.pauseBetweenBrackets;
    this.phonemizeBetweenBrackets = opts.phonemizeBetweenBrackets;
    this.baseUrl = opts.baseUrl ?? RIME_BASE_URL;
    // Whole-request timeout, fixed at construction from the model.
    this.totalTimeoutMs = timeoutForModel(this.model);
  }
  /**
   * Build the JSON request payload for `text`.
   *
   * `speaker`, `text` and `modelId` are always present. Model-specific
   * fields are added only for the matching model family and only when the
   * corresponding option was set; a model that is neither Arcana nor
   * Mist-family gets just the three base fields.
   */
  buildPayload(text) {
    const payload = {
      speaker: this.speaker,
      text,
      modelId: this.model
    };
    if (this.model === RimeModel.ARCANA) {
      if (this.repetitionPenalty !== void 0)
        payload.repetition_penalty = this.repetitionPenalty;
      if (this.temperature !== void 0) payload.temperature = this.temperature;
      if (this.topP !== void 0) payload.top_p = this.topP;
      if (this.maxTokens !== void 0) payload.max_tokens = this.maxTokens;
      payload.lang = this.lang;
      payload.samplingRate = this.sampleRate;
    } else if (isMistModel(this.model)) {
      payload.lang = this.lang;
      payload.samplingRate = this.sampleRate;
      if (this.speedAlpha !== void 0) payload.speedAlpha = this.speedAlpha;
      if (this.model === RimeModel.MIST_V2 && this.reduceLatency !== void 0) {
        payload.reduceLatency = this.reduceLatency;
      }
      if (this.pauseBetweenBrackets !== void 0) {
        payload.pauseBetweenBrackets = this.pauseBetweenBrackets;
      }
      if (this.phonemizeBetweenBrackets !== void 0) {
        payload.phonemizeBetweenBrackets = this.phonemizeBetweenBrackets;
      }
    }
    return payload;
  }
  /** Synthesize text and return the concatenated audio buffer. */
  async synthesize(text) {
    const chunks = [];
    for await (const chunk of this.synthesizeStream(text)) {
      chunks.push(chunk);
    }
    return Buffer.concat(chunks);
  }
  /**
   * Synthesize text and yield raw PCM_S16LE chunks at the configured
   * `sampleRate` as they stream in.
   *
   * Throws when the HTTP status is not OK, when the response is not an
   * `audio/*` media type (the first 500 characters of the body are included
   * in the message), or when the response has no body. The request is
   * bounded by `totalTimeoutMs`.
   */
  async *synthesizeStream(text) {
    const response = await fetch(this.baseUrl, {
      method: "POST",
      headers: {
        accept: RimeAudioFormat.PCM,
        Authorization: `Bearer ${this.apiKey}`,
        "content-type": "application/json"
      },
      body: JSON.stringify(this.buildPayload(text)),
      signal: AbortSignal.timeout(this.totalTimeoutMs)
    });
    if (!response.ok) {
      const body = await response.text();
      throw new Error(`Rime TTS error ${response.status}: ${body}`);
    }
    // Media types are case-insensitive, so normalise before comparing;
    // otherwise a valid "Audio/PCM" response would be rejected as non-audio.
    const contentType = (response.headers.get("content-type") ?? "").toLowerCase();
    if (!contentType.startsWith("audio")) {
      const body = await response.text();
      throw new Error(`Rime returned non-audio response: ${body.slice(0, 500)}`);
    }
    if (!response.body) {
      throw new Error("Rime TTS: no response body");
    }
    const reader = response.body.getReader();
    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        // Skip empty reads so consumers never receive zero-length buffers.
        if (value && value.length > 0) {
          yield Buffer.from(value);
        }
      }
    } finally {
      // Runs on completion, on error, and when the consumer stops early:
      // cancel the body (ignoring cancel failures) and release the lock.
      if (typeof reader.cancel === "function")
        await reader.cancel().catch(() => {
        });
      reader.releaseLock();
    }
  }
};
3439
+
3440
+ // src/stt/deepgram.ts
3441
+ init_esm_shims();
3442
/**
 * Options-object front end for `DeepgramSTT`.
 *
 * Resolves the API key from `opts.apiKey` or the `DEEPGRAM_API_KEY`
 * environment variable (throwing when neither is set), fills in defaults
 * for every unset option, and forwards them positionally to the base class.
 */
var STT = class extends DeepgramSTT {
  static providerKey = "deepgram";
  constructor(opts = {}) {
    const apiKey = opts.apiKey ?? process.env.DEEPGRAM_API_KEY;
    if (!apiKey) {
      throw new Error(
        "Deepgram STT requires an apiKey. Pass { apiKey: 'dg_...' } or set DEEPGRAM_API_KEY in the environment."
      );
    }
    const language = opts.language ?? "en";
    const model = opts.model ?? "nova-3";
    const encoding = opts.encoding ?? "linear16";
    const sampleRate = opts.sampleRate ?? 16e3;
    const tuning = {
      endpointingMs: opts.endpointingMs ?? 150,
      // An explicit `null` is passed through unchanged; only an unset
      // value falls back to 1000.
      utteranceEndMs: opts.utteranceEndMs === void 0 ? 1e3 : opts.utteranceEndMs,
      smartFormat: opts.smartFormat ?? true,
      interimResults: opts.interimResults ?? true
    };
    // `vadEvents` is only forwarded when the caller set it.
    if (opts.vadEvents !== void 0) {
      tuning.vadEvents = opts.vadEvents;
    }
    super(apiKey, language, model, encoding, sampleRate, tuning);
  }
};
3467
+
3468
+ // src/stt/whisper.ts
3469
+ init_esm_shims();
3470
+
3471
+ // src/providers/whisper-stt.ts
3472
+ init_esm_shims();
3473
// OpenAI endpoint for audio transcription requests.
var OPENAI_TRANSCRIPTION_URL = "https://api.openai.com/v1/audio/transcriptions";

// Default number of buffered PCM bytes before a transcription request:
// 16000 * 2 bytes.
var DEFAULT_BUFFER_SIZE = 2 * 16e3;

// Model names the WhisperSTT constructor accepts.
var ALLOWED_MODELS = /* @__PURE__ */ new Set([
  "whisper-1",
  "gpt-4o-transcribe",
  "gpt-4o-mini-transcribe"
]);
3476
/**
 * Prefix raw PCM bytes with a 44-byte RIFF/WAVE header (format tag 1,
 * i.e. uncompressed PCM) and return header + data as a new Buffer.
 *
 * @param pcm           Raw PCM audio bytes.
 * @param sampleRate    Samples per second (default 16000).
 * @param channels      Channel count (default 1).
 * @param bitsPerSample Bits per sample (default 16).
 */
function wrapPcmInWav(pcm, sampleRate = 16e3, channels = 1, bitsPerSample = 16) {
  const payloadBytes = pcm.length;
  const bytesPerSample = bitsPerSample / 8;
  const wavHeader = Buffer.alloc(44);

  // RIFF chunk descriptor; the size field covers everything after it.
  wavHeader.write("RIFF", 0);
  wavHeader.writeUInt32LE(36 + payloadBytes, 4);
  wavHeader.write("WAVE", 8);

  // "fmt " sub-chunk: 16 bytes of PCM format description.
  wavHeader.write("fmt ", 12);
  wavHeader.writeUInt32LE(16, 16);
  wavHeader.writeUInt16LE(1, 20);
  wavHeader.writeUInt16LE(channels, 22);
  wavHeader.writeUInt32LE(sampleRate, 24);
  wavHeader.writeUInt32LE(sampleRate * channels * bytesPerSample, 28); // byte rate
  wavHeader.writeUInt16LE(channels * bytesPerSample, 32); // block align
  wavHeader.writeUInt16LE(bitsPerSample, 34);

  // "data" sub-chunk header, followed by the samples themselves.
  wavHeader.write("data", 36);
  wavHeader.writeUInt32LE(payloadBytes, 40);

  return Buffer.concat([wavHeader, pcm]);
}
3494
+ var WhisperSTT = class _WhisperSTT {
3495
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
3496
+ static providerKey = "whisper";
3497
+ apiKey;
3498
+ model;
3499
+ language;
3500
+ bufferSize;
3501
+ responseFormat;
3502
+ // Accumulate chunks in an array and concat once on flush — avoids the
3503
+ // per-``sendAudio`` O(n) ``Buffer.concat([buffer, chunk])`` that quickly
3504
+ // dominates CPU when the phone leg delivers 20 ms frames.
3505
+ chunks = [];
3506
+ bufferedBytes = 0;
3507
+ callbacks = /* @__PURE__ */ new Set();
3508
+ running = false;
3509
+ pendingTranscriptions = [];
3510
+ /**
3511
+ * @param apiKey OpenAI API key.
3512
+ * @param language ISO-639-1 language code (e.g. ``"en"``, ``"it"``). Optional.
3513
+ * @param model One of ``whisper-1``, ``gpt-4o-transcribe``, ``gpt-4o-mini-transcribe``.
3514
+ * @param bufferSize Bytes of PCM16 to buffer before each transcription request.
3515
+ * @param responseFormat ``"json"`` (default) or ``"verbose_json"``.
3516
+ *
3517
+ * Argument order matches the Python SDK's ``WhisperSTT(api_key, language, model, response_format)``
3518
+ * for cross-language parity. Pre-0.5.3 the TS positional order was
3519
+ * ``(apiKey, model, language, bufferSize, responseFormat)`` — callers using
3520
+ * the old order will need to swap ``language`` and ``model``.
3521
+ */
3522
+ constructor(apiKey, language, model = "whisper-1", bufferSize = DEFAULT_BUFFER_SIZE, responseFormat = "json") {
3523
+ if (!ALLOWED_MODELS.has(model)) {
3524
+ throw new Error(
3525
+ `WhisperSTT: unsupported model "${model}". Expected one of ${[...ALLOWED_MODELS].join(", ")}.`
3526
+ );
3527
+ }
3528
+ this.apiKey = apiKey;
3529
+ this.model = model;
3530
+ this.language = language;
3531
+ this.bufferSize = bufferSize;
3532
+ this.responseFormat = responseFormat;
3533
+ }
3534
+ /** Factory for Twilio calls — mulaw 8 kHz is transcoded upstream, so we still receive PCM 16-bit. */
3535
+ static forTwilio(apiKey, language = "en", model = "whisper-1") {
3536
+ return new _WhisperSTT(apiKey, language, model);
3537
+ }
3538
+ /** Reset the audio buffer and arm the adapter for incoming chunks. */
3539
+ async connect() {
3540
+ this.running = true;
3541
+ this.chunks = [];
3542
+ this.bufferedBytes = 0;
2870
3543
  }
2871
3544
  /** Buffer a PCM16 chunk; flushes to Whisper once `bufferSize` bytes are reached. */
2872
3545
  sendAudio(audio) {
@@ -4448,264 +5121,42 @@ var SpeechmaticsSTT = class {
4448
5121
  close() {
4449
5122
  this.running = false;
4450
5123
  const ws = this.ws;
4451
- if (!ws) return;
4452
- this.ws = null;
4453
- const sendSafe = (payload) => {
4454
- if (ws.readyState === WebSocket5.OPEN) {
4455
- try {
4456
- ws.send(payload);
4457
- } catch {
4458
- }
4459
- }
4460
- };
4461
- sendSafe(
4462
- JSON.stringify({ message: "EndOfStream", last_seq_no: this.lastSeqNo })
4463
- );
4464
- try {
4465
- ws.close();
4466
- } catch {
4467
- }
4468
- }
4469
- };
4470
-
4471
- // src/stt/speechmatics.ts
4472
- var STT7 = class extends SpeechmaticsSTT {
4473
- static providerKey = "speechmatics";
4474
- constructor(opts = {}) {
4475
- const key = opts.apiKey ?? process.env.SPEECHMATICS_API_KEY;
4476
- if (!key) {
4477
- throw new Error(
4478
- "Speechmatics STT requires an apiKey. Pass { apiKey: 'sm_...' } or set SPEECHMATICS_API_KEY in the environment."
4479
- );
4480
- }
4481
- super(key, opts);
4482
- }
4483
- };
4484
-
4485
- // src/tts/elevenlabs.ts
4486
- init_esm_shims();
4487
-
4488
- // src/providers/elevenlabs-tts.ts
4489
- init_esm_shims();
4490
- var ELEVENLABS_BASE_URL = "https://api.elevenlabs.io/v1";
4491
- var ELEVENLABS_VOICE_ID_BY_NAME = {
4492
- rachel: "21m00Tcm4TlvDq8ikWAM",
4493
- drew: "29vD33N1CtxCmqQRPOHJ",
4494
- clyde: "2EiwWnXFnvU5JabPnv8n",
4495
- paul: "5Q0t7uMcjvnagumLfvZi",
4496
- domi: "AZnzlk1XvdvUeBnXmlld",
4497
- dave: "CYw3kZ02Hs0563khs1Fj",
4498
- fin: "D38z5RcWu1voky8WS1ja",
4499
- bella: "EXAVITQu4vr4xnSDxMaL",
4500
- antoni: "ErXwobaYiN019PkySvjV",
4501
- thomas: "GBv7mTt0atIp3Br8iCZE",
4502
- charlie: "IKne3meq5aSn9XLyUdCD",
4503
- george: "JBFqnCBsd6RMkjVDRZzb",
4504
- emily: "LcfcDJNUP1GQjkzn1xUU",
4505
- elli: "MF3mGyEYCl7XYWbV9V6O",
4506
- callum: "N2lVS1w4EtoT3dr4eOWO",
4507
- patrick: "ODq5zmih8GrVes37Dizd",
4508
- harry: "SOYHLrjzK2X1ezoPC6cr",
4509
- liam: "TX3LPaxmHKxFdv7VOQHJ",
4510
- dorothy: "ThT5KcBeYPX3keUQqHPh",
4511
- josh: "TxGEqnHWrfWFTfGW9XjX",
4512
- arnold: "VR6AewLTigWG4xSOukaG",
4513
- charlotte: "XB0fDUnXU5powFXDhCwa",
4514
- matilda: "XrExE9yKIg1WjnnlVkGX",
4515
- matthew: "Yko7PKHZNXotIFUBG7I9",
4516
- james: "ZQe5CZNOzWyzPSCn5a3c",
4517
- joseph: "Zlb1dXrM653N07WRdFW3",
4518
- jeremy: "bVMeCyTHy58xNoL34h3p",
4519
- michael: "flq6f7yk4E4fJM5XTYuZ",
4520
- ethan: "g5CIjZEefAph4nQFvHAz",
4521
- gigi: "jBpfuIE2acCO8z3wKNLl",
4522
- freya: "jsCqWAovK2LkecY7zXl4",
4523
- brian: "nPczCjzI2devNBz1zQrb",
4524
- grace: "oWAxZDx7w5VEj9dCyTzz",
4525
- daniel: "onwK4e9ZLuTAKqWW03F9",
4526
- lily: "pFZP5JQG7iQjIQuC4Bku",
4527
- serena: "pMsXgVXv3BLzUgSXRplE",
4528
- adam: "pNInz6obpgDQGcFmaJgB",
4529
- nicole: "piTKgcLEGmPE4e6mEKli",
4530
- bill: "pqHfZKP75CvOlQylNhV4",
4531
- jessie: "t0jbNlBVZ17f02VDIeMI",
4532
- ryan: "wViXBPUzp2ZZixB1xQuM",
4533
- sam: "yoZ06aMxZJJ28mfd3POQ",
4534
- glinda: "z9fAnlkpzviPz146aGWa",
4535
- giovanni: "zcAOhNBS3c14rBihAFp1",
4536
- mimi: "zrHiDhphv9ZnVXBqCLjz",
4537
- sarah: "EXAVITQu4vr4xnSDxMaL",
4538
- alloy: "EXAVITQu4vr4xnSDxMaL"
4539
- };
4540
- var VOICE_ID_PATTERN = /^[A-Za-z0-9]{20}$/;
4541
- function resolveVoiceId(voice) {
4542
- if (!voice) return voice;
4543
- if (VOICE_ID_PATTERN.test(voice)) return voice;
4544
- return ELEVENLABS_VOICE_ID_BY_NAME[voice.toLowerCase()] ?? voice;
4545
- }
4546
- var ElevenLabsModel = {
4547
- V3: "eleven_v3",
4548
- FLASH_V2_5: "eleven_flash_v2_5",
4549
- TURBO_V2_5: "eleven_turbo_v2_5",
4550
- MULTILINGUAL_V2: "eleven_multilingual_v2",
4551
- MONOLINGUAL_V1: "eleven_monolingual_v1"
4552
- };
4553
- var ElevenLabsOutputFormat = {
4554
- MP3_22050_32: "mp3_22050_32",
4555
- MP3_44100_32: "mp3_44100_32",
4556
- MP3_44100_64: "mp3_44100_64",
4557
- MP3_44100_96: "mp3_44100_96",
4558
- MP3_44100_128: "mp3_44100_128",
4559
- MP3_44100_192: "mp3_44100_192",
4560
- PCM_8000: "pcm_8000",
4561
- PCM_16000: "pcm_16000",
4562
- PCM_22050: "pcm_22050",
4563
- PCM_24000: "pcm_24000",
4564
- PCM_44100: "pcm_44100",
4565
- ULAW_8000: "ulaw_8000"
4566
- };
4567
- var ElevenLabsTTS = class _ElevenLabsTTS {
4568
- // Stable pricing/dashboard key — read by stream-handler / metrics via
4569
- // ``(agent.tts.constructor as any).providerKey``. Without this the cost
4570
- // calculator falls back to ``constructor.name`` ("ElevenLabsTTS") which
4571
- // does NOT match the pricing table key "elevenlabs", silently zeroing
4572
- // TTS cost for callers that construct the raw REST class directly
4573
- // (exposed at top level as ``ElevenLabsRestTTS``).
4574
- static providerKey = "elevenlabs";
4575
- apiKey;
4576
- voiceId;
4577
- modelId;
4578
- outputFormat;
4579
- voiceSettings;
4580
- languageCode;
4581
- chunkSize;
4582
- constructor(apiKey, voiceIdOrOptions = "21m00Tcm4TlvDq8ikWAM", modelId = ElevenLabsModel.FLASH_V2_5, outputFormat = ElevenLabsOutputFormat.PCM_16000) {
4583
- this.apiKey = apiKey;
4584
- if (typeof voiceIdOrOptions === "object") {
4585
- const o = voiceIdOrOptions;
4586
- this.voiceId = resolveVoiceId(o.voiceId ?? "21m00Tcm4TlvDq8ikWAM");
4587
- this.modelId = o.modelId ?? ElevenLabsModel.FLASH_V2_5;
4588
- this.outputFormat = o.outputFormat ?? ElevenLabsOutputFormat.PCM_16000;
4589
- this.voiceSettings = o.voiceSettings;
4590
- this.languageCode = o.languageCode;
4591
- this.chunkSize = o.chunkSize ?? 4096;
4592
- } else {
4593
- this.voiceId = resolveVoiceId(voiceIdOrOptions);
4594
- this.modelId = modelId;
4595
- this.outputFormat = outputFormat;
4596
- this.voiceSettings = void 0;
4597
- this.languageCode = void 0;
4598
- this.chunkSize = 4096;
4599
- }
4600
- }
4601
- /**
4602
- * Construct an instance pre-configured for Twilio Media Streams.
4603
- *
4604
- * Sets `outputFormat='ulaw_8000'` so ElevenLabs emits μ-law @ 8 kHz
4605
- * directly — the exact wire format Twilio's media stream uses — letting
4606
- * the SDK skip the 16 kHz→8 kHz resample and PCM→μ-law conversion in
4607
- * `TwilioAudioSender`. Saves ~30–80 ms first-byte and per-frame CPU,
4608
- * and removes a potential aliasing source.
4609
- *
4610
- * `voiceSettings` defaults to a low-bandwidth-friendly profile
4611
- * (speaker boost off, modest stability) which sounds cleaner at 8 kHz
4612
- * μ-law than the studio default. Pass an explicit object to override.
4613
- */
4614
- static forTwilio(apiKey, options = {}) {
4615
- const voiceSettings = options.voiceSettings ?? {
4616
- // Speaker boost adds high-frequency emphasis that aliases ugly over an
4617
- // 8 kHz μ-law line. Slightly higher stability tames the excursions
4618
- // that compander quantization noise can amplify.
4619
- stability: 0.6,
4620
- similarity_boost: 0.75,
4621
- use_speaker_boost: false
4622
- };
4623
- return new _ElevenLabsTTS(apiKey, {
4624
- ...options,
4625
- voiceSettings,
4626
- outputFormat: ElevenLabsOutputFormat.ULAW_8000
4627
- });
4628
- }
4629
- /**
4630
- * Construct an instance pre-configured for Telnyx bidirectional media.
4631
- *
4632
- * Telnyx's default media-streaming codec is L16 PCM @ 16 kHz, which
4633
- * matches our default Telnyx handler. We pick `pcm_16000` so the audio
4634
- * flows end-to-end with zero resampling or transcoding.
4635
- *
4636
- * Trade-off: if your Telnyx profile is pinned to PCMU/8000 (μ-law),
4637
- * construct `ElevenLabsTTS` directly with `outputFormat: 'ulaw_8000'`
4638
- * — Telnyx supports that natively too.
4639
- */
4640
- static forTelnyx(apiKey, options = {}) {
4641
- return new _ElevenLabsTTS(apiKey, {
4642
- ...options,
4643
- outputFormat: ElevenLabsOutputFormat.PCM_16000
4644
- });
4645
- }
4646
- /**
4647
- * Synthesise text to speech and return the full audio as a single Buffer.
4648
- *
4649
- * For large chunks (or when latency matters) call `synthesizeStream` instead.
4650
- */
4651
- async synthesize(text) {
4652
- const chunks = [];
4653
- for await (const chunk of this.synthesizeStream(text)) {
4654
- chunks.push(chunk);
4655
- }
4656
- return Buffer.concat(chunks);
4657
- }
4658
- /**
4659
- * Synthesise text and yield audio chunks as they arrive (streaming).
4660
- *
4661
- * The yielded buffers are raw PCM at 16 kHz (or whatever `outputFormat` is
4662
- * configured to). `chunkSize` controls the maximum yield size — 512 is a
4663
- * good choice for low-latency telephony.
4664
- */
4665
- async *synthesizeStream(text) {
4666
- const url = `${ELEVENLABS_BASE_URL}/text-to-speech/${encodeURIComponent(this.voiceId)}/stream?output_format=${encodeURIComponent(this.outputFormat)}`;
4667
- const body = {
4668
- text,
4669
- model_id: this.modelId
4670
- };
4671
- if (this.voiceSettings) body["voice_settings"] = this.voiceSettings;
4672
- if (this.languageCode) body["language_code"] = this.languageCode;
4673
- const response = await fetch(url, {
4674
- method: "POST",
4675
- headers: {
4676
- "xi-api-key": this.apiKey,
4677
- "Content-Type": "application/json"
4678
- },
4679
- body: JSON.stringify(body),
4680
- signal: AbortSignal.timeout(3e4)
4681
- });
4682
- if (!response.ok) {
4683
- const errBody = await response.text();
4684
- throw new Error(`ElevenLabs TTS error ${response.status}: ${errBody}`);
4685
- }
4686
- if (!response.body) {
4687
- throw new Error("ElevenLabs TTS: no response body");
4688
- }
4689
- const reader = response.body.getReader();
4690
- try {
4691
- while (true) {
4692
- const { done, value } = await reader.read();
4693
- if (done) break;
4694
- if (!value || value.length === 0) continue;
4695
- const buf = Buffer.from(value);
4696
- for (let offset = 0; offset < buf.length; offset += this.chunkSize) {
4697
- yield buf.subarray(offset, Math.min(offset + this.chunkSize, buf.length));
5124
+ if (!ws) return;
5125
+ this.ws = null;
5126
+ const sendSafe = (payload) => {
5127
+ if (ws.readyState === WebSocket5.OPEN) {
5128
+ try {
5129
+ ws.send(payload);
5130
+ } catch {
4698
5131
  }
4699
5132
  }
4700
- } finally {
4701
- if (typeof reader.cancel === "function") await reader.cancel().catch(() => {
4702
- });
4703
- reader.releaseLock();
5133
+ };
5134
+ sendSafe(
5135
+ JSON.stringify({ message: "EndOfStream", last_seq_no: this.lastSeqNo })
5136
+ );
5137
+ try {
5138
+ ws.close();
5139
+ } catch {
5140
+ }
5141
+ }
5142
+ };
5143
+
5144
+ // src/stt/speechmatics.ts
5145
/**
 * Options-object front end for `SpeechmaticsSTT`.
 *
 * Takes the API key from `opts.apiKey` or the `SPEECHMATICS_API_KEY`
 * environment variable, throws when neither is set, and passes the key
 * together with the untouched options object to the base class.
 */
var STT7 = class extends SpeechmaticsSTT {
  static providerKey = "speechmatics";
  constructor(opts = {}) {
    const resolvedKey = opts.apiKey ?? process.env.SPEECHMATICS_API_KEY;
    if (!resolvedKey) {
      const hint = "Speechmatics STT requires an apiKey. Pass { apiKey: 'sm_...' } or set SPEECHMATICS_API_KEY in the environment.";
      throw new Error(hint);
    }
    super(resolvedKey, opts);
  }
};
4707
5157
 
4708
5158
  // src/tts/elevenlabs.ts
5159
+ init_esm_shims();
4709
5160
  function resolveApiKey(apiKey) {
4710
5161
  const key = apiKey ?? process.env.ELEVENLABS_API_KEY;
4711
5162
  if (!key) {
@@ -4721,7 +5172,7 @@ var TTS = class _TTS extends ElevenLabsTTS {
4721
5172
  super(resolveApiKey(opts.apiKey), {
4722
5173
  voiceId: opts.voiceId ?? "EXAVITQu4vr4xnSDxMaL",
4723
5174
  modelId: opts.modelId ?? "eleven_flash_v2_5",
4724
- outputFormat: opts.outputFormat ?? "pcm_16000",
5175
+ ...opts.outputFormat !== void 0 ? { outputFormat: opts.outputFormat } : {},
4725
5176
  languageCode: opts.languageCode,
4726
5177
  voiceSettings: opts.voiceSettings
4727
5178
  });
@@ -4792,6 +5243,20 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
4792
5243
  * changes.
4793
5244
  */
4794
5245
  adoptedConnection = null;
5246
+ /**
5247
+ * Active WS for the in-flight ``synthesizeStream`` call, if any. Set
5248
+ * when a stream starts, cleared in its ``finally`` block. The
5249
+ * stream-handler calls ``cancelActiveStream()`` from ``cancelSpeaking``
5250
+ * to unblock the generator's inner ``await Promise<frame>`` — without
5251
+ * it, a barge-in on the firstMessage live path leaves the for-await
5252
+ * stuck waiting for the next frame; ElevenLabs never sends
5253
+ * ``isFinal=true`` after the consumer breaks, the 30 s frame timeout
5254
+ * fires post-call, and meanwhile ``initPipeline`` never returns so
5255
+ * the STT ``onTranscript`` callback never registers and subsequent
5256
+ * user turns are silently dropped (root cause of the 2026-05-20
5257
+ * "first message OK, then no response" symptom).
5258
+ */
5259
+ activeStreamWs = null;
4795
5260
  /**
4796
5261
  * The wire format requested over the ElevenLabs WS. Initially set from
4797
5262
  * the constructor; ``setTelephonyCarrier`` may auto-flip it to the
@@ -4840,6 +5305,32 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
4840
5305
  if (!native) return;
4841
5306
  this._outputFormat = native;
4842
5307
  }
5308
+ /**
5309
+ * Force-close the WebSocket of any in-flight ``synthesizeStream`` call.
5310
+ * Called by the stream-handler from ``cancelSpeaking`` (barge-in) so
5311
+ * the generator's inner ``await Promise<frame>`` loop unblocks cleanly
5312
+ * via the ``onClose`` handler — instead of waiting up to 30 s for the
5313
+ * ``FRAME_TIMEOUT_MS`` watchdog to fire. No-op when no stream is in
5314
+ * flight or when the WS is already closing.
5315
+ *
5316
+ * Without this, a barge-in during the firstMessage live path left the
5317
+ * for-await stuck (ElevenLabs never sends ``isFinal=true`` after the
5318
+ * consumer breaks), ``initPipeline`` never returned, the STT
5319
+ * ``onTranscript`` callback never registered, and the entire remainder
5320
+ * of the call was silent for the user. Surfaced during the 2026-05-20
5321
+ * acceptance run.
5322
+ */
5323
+ cancelActiveStream() {
5324
+ const ws = this.activeStreamWs;
5325
+ if (!ws) return;
5326
+ this.activeStreamWs = null;
5327
+ try {
5328
+ if (ws.readyState === WebSocket6.OPEN || ws.readyState === WebSocket6.CONNECTING) {
5329
+ ws.close();
5330
+ }
5331
+ } catch {
5332
+ }
5333
+ }
4843
5334
  /** Pre-configured for Twilio Media Streams (`ulaw_8000`). */
4844
5335
  static forTwilio(opts) {
4845
5336
  return new _ElevenLabsWebSocketTTS({
@@ -4925,6 +5416,7 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
4925
5416
  headers: { "xi-api-key": this.apiKey }
4926
5417
  });
4927
5418
  }
5419
+ this.activeStreamWs = ws;
4928
5420
  const queue = [];
4929
5421
  let done = false;
4930
5422
  let pendingError = null;
@@ -5045,6 +5537,7 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
5045
5537
  }
5046
5538
  } finally {
5047
5539
  if (connectTimer) clearTimeout(connectTimer);
5540
+ if (this.activeStreamWs === ws) this.activeStreamWs = null;
5048
5541
  try {
5049
5542
  if (ws.readyState === WebSocket6.OPEN) {
5050
5543
  ws.send(JSON.stringify({ text: "" }));
@@ -5217,9 +5710,9 @@ function buildOpts(opts) {
5217
5710
  const out = {
5218
5711
  apiKey: resolveApiKey2(opts.apiKey),
5219
5712
  modelId: opts.modelId ?? "eleven_flash_v2_5",
5220
- outputFormat: opts.outputFormat ?? "pcm_16000",
5221
5713
  autoMode: opts.autoMode ?? true
5222
5714
  };
5715
+ if (opts.outputFormat !== void 0) out.outputFormat = opts.outputFormat;
5223
5716
  if (opts.voiceId !== void 0) out.voiceId = opts.voiceId;
5224
5717
  if (opts.voiceSettings !== void 0) out.voiceSettings = opts.voiceSettings;
5225
5718
  if (opts.languageCode !== void 0) out.languageCode = opts.languageCode;
@@ -5396,268 +5889,77 @@ var OpenAITTS = class _OpenAITTS {
5396
5889
  if (lpf) {
5397
5890
  y = lpfAlpha * x + (1 - lpfAlpha) * y;
5398
5891
  let s = Math.round(y);
5399
- if (s > 32767) s = 32767;
5400
- else if (s < -32768) s = -32768;
5401
- samples.push(s);
5402
- } else {
5403
- samples.push(x);
5404
- }
5405
- }
5406
- if (lpf) ctx.lpfPrev = y;
5407
- const out = [];
5408
- let i = 0;
5409
- if (direct8k) {
5410
- while (i + 2 < samples.length) {
5411
- out.push(samples[i]);
5412
- i += 3;
5413
- }
5414
- } else {
5415
- while (i + 2 < samples.length) {
5416
- out.push(samples[i]);
5417
- out.push(Math.round((samples[i + 1] + samples[i + 2]) / 2));
5418
- i += 3;
5419
- }
5420
- }
5421
- ctx.leftover = samples.slice(i);
5422
- const buffer = Buffer.alloc(out.length * 2);
5423
- for (let j = 0; j < out.length; j++) {
5424
- buffer.writeInt16LE(out[j], j * 2);
5425
- }
5426
- return buffer;
5427
- }
5428
- /** @deprecated use {@link resampleStreaming} with persistent state. */
5429
- static resample24kTo16k(audio) {
5430
- const ctx = {
5431
- carryByte: null,
5432
- leftover: [],
5433
- lpfPrev: 0,
5434
- lpfEnabled: false,
5435
- targetSampleRate: 16e3
5436
- };
5437
- const out = _OpenAITTS.resampleStreaming(audio, ctx);
5438
- if (ctx.leftover.length === 0) return out;
5439
- const tail = Buffer.alloc(ctx.leftover.length * 2);
5440
- for (let i = 0; i < ctx.leftover.length; i++) {
5441
- tail.writeInt16LE(ctx.leftover[i], i * 2);
5442
- }
5443
- return Buffer.concat([out, tail]);
5444
- }
5445
- };
5446
-
5447
- // src/tts/openai.ts
5448
- var TTS3 = class extends OpenAITTS {
5449
- static providerKey = "openai_tts";
5450
- constructor(opts = {}) {
5451
- const key = opts.apiKey ?? process.env.OPENAI_API_KEY;
5452
- if (!key) {
5453
- throw new Error(
5454
- "OpenAI TTS requires an apiKey. Pass { apiKey: 'sk-...' } or set OPENAI_API_KEY in the environment."
5455
- );
5456
- }
5457
- super(
5458
- key,
5459
- opts.voice ?? "alloy",
5460
- opts.model ?? "gpt-4o-mini-tts",
5461
- opts.instructions ?? null,
5462
- opts.speed ?? null,
5463
- opts.antiAlias ?? false
5464
- );
5465
- }
5466
- };
5467
-
5468
- // src/tts/cartesia.ts
5469
- init_esm_shims();
5470
-
5471
- // src/providers/cartesia-tts.ts
5472
- init_esm_shims();
5473
- var CARTESIA_BASE_URL = "https://api.cartesia.ai";
5474
- var CARTESIA_API_VERSION = "2025-04-16";
5475
- var CARTESIA_DEFAULT_VOICE_ID = "f786b574-daa5-4673-aa0c-cbe3e8534c02";
5476
- var CartesiaTTSModel = {
5477
- SONIC_3: "sonic-3",
5478
- SONIC_2: "sonic-2",
5479
- SONIC: "sonic"
5480
- };
5481
- var CartesiaTTSContainer = {
5482
- RAW: "raw",
5483
- WAV: "wav",
5484
- MP3: "mp3"
5485
- };
5486
- var CartesiaTTSEncoding = {
5487
- PCM_S16LE: "pcm_s16le",
5488
- PCM_F32LE: "pcm_f32le",
5489
- PCM_MULAW: "pcm_mulaw",
5490
- PCM_ALAW: "pcm_alaw"
5491
- };
5492
- var CartesiaTTSSampleRate = {
5493
- HZ_8000: 8e3,
5494
- HZ_16000: 16e3,
5495
- HZ_22050: 22050,
5496
- HZ_24000: 24e3,
5497
- HZ_44100: 44100
5498
- };
5499
- var CartesiaTTSVoiceMode = {
5500
- ID: "id",
5501
- EMBEDDING: "embedding"
5502
- };
5503
- var CartesiaTTS = class _CartesiaTTS {
5504
- /** Stable pricing/dashboard key — read by stream-handler/metrics. */
5505
- static providerKey = "cartesia_tts";
5506
- apiKey;
5507
- model;
5508
- voice;
5509
- language;
5510
- sampleRate;
5511
- speed;
5512
- emotion;
5513
- volume;
5514
- baseUrl;
5515
- apiVersion;
5516
- constructor(apiKey, opts = {}) {
5517
- this.apiKey = apiKey;
5518
- this.model = opts.model ?? CartesiaTTSModel.SONIC_3;
5519
- this.voice = opts.voice ?? CARTESIA_DEFAULT_VOICE_ID;
5520
- this.language = opts.language ?? "en";
5521
- this.sampleRate = opts.sampleRate ?? CartesiaTTSSampleRate.HZ_16000;
5522
- this.speed = opts.speed;
5523
- this.emotion = typeof opts.emotion === "string" ? [opts.emotion] : opts.emotion;
5524
- this.volume = opts.volume;
5525
- this.baseUrl = opts.baseUrl ?? CARTESIA_BASE_URL;
5526
- this.apiVersion = opts.apiVersion ?? CARTESIA_API_VERSION;
5527
- }
5528
- /**
5529
- * Construct an instance pre-configured for Twilio Media Streams.
5530
- *
5531
- * Sets `sampleRate=8000` so Cartesia emits PCM_S16LE @ 8 kHz directly.
5532
- * Twilio's media stream uses μ-law @ 8 kHz so the SDK still does the
5533
- * PCM → μ-law transcode client-side, but the 16 kHz → 8 kHz resample
5534
- * step is skipped. Saves ~10–30 ms first-byte plus per-frame CPU and
5535
- * removes a potential aliasing source.
5536
- */
5537
- static forTwilio(apiKey, options = {}) {
5538
- return new _CartesiaTTS(apiKey, {
5539
- ...options,
5540
- sampleRate: CartesiaTTSSampleRate.HZ_8000
5541
- });
5542
- }
5543
- /**
5544
- * Construct an instance pre-configured for Telnyx bidirectional media.
5545
- *
5546
- * Sets `sampleRate=16000` to match Telnyx's L16/16000 default codec —
5547
- * audio flows end-to-end with zero resampling or transcoding. Same as
5548
- * the bare-constructor default; exists for API symmetry with
5549
- * {@link CartesiaTTS.forTwilio}.
5550
- */
5551
- static forTelnyx(apiKey, options = {}) {
5552
- return new _CartesiaTTS(apiKey, {
5553
- ...options,
5554
- sampleRate: CartesiaTTSSampleRate.HZ_16000
5555
- });
5556
- }
5557
- /** Build the JSON payload for the Cartesia bytes endpoint. */
5558
- buildPayload(text) {
5559
- const payload = {
5560
- model_id: this.model,
5561
- voice: { mode: CartesiaTTSVoiceMode.ID, id: this.voice },
5562
- transcript: text,
5563
- output_format: {
5564
- container: CartesiaTTSContainer.RAW,
5565
- encoding: CartesiaTTSEncoding.PCM_S16LE,
5566
- sample_rate: this.sampleRate
5567
- },
5568
- language: this.language
5569
- };
5570
- const generationConfig = {};
5571
- if (this.speed !== void 0) generationConfig.speed = this.speed;
5572
- if (this.emotion && this.emotion.length > 0)
5573
- generationConfig.emotion = this.emotion[0];
5574
- if (this.volume !== void 0) generationConfig.volume = this.volume;
5575
- if (Object.keys(generationConfig).length > 0) {
5576
- payload.generation_config = generationConfig;
5577
- }
5578
- return payload;
5579
- }
5580
- /**
5581
- * Pre-call HTTP warmup for the Cartesia `/tts/bytes` endpoint.
5582
- *
5583
- * Issues a lightweight `GET <baseUrl>/voices` so DNS, TLS, and HTTP/2
5584
- * are already up by the time the first `synthesizeStream()` POST
5585
- * lands. Best-effort: 5 s timeout, all exceptions swallowed at
5586
- * debug level.
5587
- *
5588
- * Billing safety: `GET /voices` is a free metadata read on
5589
- * Cartesia's REST surface (per https://docs.cartesia.ai). It does
5590
- * not consume synthesis credits. The actual synthesis is billed
5591
- * only when `POST /tts/bytes` runs with a non-empty `transcript`.
5592
- *
5593
- * Note: Cartesia TTS uses the HTTP path (vs the WebSocket variant
5594
- * Cartesia also exposes) — connection warmup is therefore HTTP-GET
5595
- * based, not WebSocket pre-handshake. The latency win is smaller
5596
- * (~50-150 ms vs the ~200-500 ms of a WS prewarm) but still real.
5597
- */
5598
- async warmup() {
5599
- try {
5600
- await fetch(`${this.baseUrl}/voices`, {
5601
- method: "GET",
5602
- headers: {
5603
- "X-API-Key": this.apiKey,
5604
- "Cartesia-Version": this.apiVersion
5605
- },
5606
- signal: AbortSignal.timeout(5e3)
5607
- });
5608
- } catch (err) {
5609
- getLogger().debug(`Cartesia TTS warmup failed (best-effort): ${String(err)}`);
5892
+ if (s > 32767) s = 32767;
5893
+ else if (s < -32768) s = -32768;
5894
+ samples.push(s);
5895
+ } else {
5896
+ samples.push(x);
5897
+ }
5610
5898
  }
5611
- }
5612
- /** Synthesize text and return the concatenated audio buffer. */
5613
- async synthesize(text) {
5614
- const chunks = [];
5615
- for await (const chunk of this.synthesizeStream(text)) {
5616
- chunks.push(chunk);
5899
+ if (lpf) ctx.lpfPrev = y;
5900
+ const out = [];
5901
+ let i = 0;
5902
+ if (direct8k) {
5903
+ while (i + 2 < samples.length) {
5904
+ out.push(samples[i]);
5905
+ i += 3;
5906
+ }
5907
+ } else {
5908
+ while (i + 2 < samples.length) {
5909
+ out.push(samples[i]);
5910
+ out.push(Math.round((samples[i + 1] + samples[i + 2]) / 2));
5911
+ i += 3;
5912
+ }
5617
5913
  }
5618
- return Buffer.concat(chunks);
5619
- }
5620
- /**
5621
- * Synthesize text and yield raw PCM_S16LE chunks at the configured
5622
- * `sampleRate` as they arrive from Cartesia.
5623
- */
5624
- async *synthesizeStream(text) {
5625
- const response = await fetch(`${this.baseUrl}/tts/bytes`, {
5626
- method: "POST",
5627
- headers: {
5628
- "X-API-Key": this.apiKey,
5629
- "Cartesia-Version": this.apiVersion,
5630
- "Content-Type": "application/json"
5631
- },
5632
- body: JSON.stringify(this.buildPayload(text)),
5633
- signal: AbortSignal.timeout(3e4)
5634
- });
5635
- if (!response.ok) {
5636
- const body = await response.text();
5637
- throw new Error(`Cartesia TTS error ${response.status}: ${body}`);
5914
+ ctx.leftover = samples.slice(i);
5915
+ const buffer = Buffer.alloc(out.length * 2);
5916
+ for (let j = 0; j < out.length; j++) {
5917
+ buffer.writeInt16LE(out[j], j * 2);
5638
5918
  }
5639
- if (!response.body) {
5640
- throw new Error("Cartesia TTS: no response body");
5919
+ return buffer;
5920
+ }
5921
+ /** @deprecated use {@link resampleStreaming} with persistent state. */
5922
+ static resample24kTo16k(audio) {
5923
+ const ctx = {
5924
+ carryByte: null,
5925
+ leftover: [],
5926
+ lpfPrev: 0,
5927
+ lpfEnabled: false,
5928
+ targetSampleRate: 16e3
5929
+ };
5930
+ const out = _OpenAITTS.resampleStreaming(audio, ctx);
5931
+ if (ctx.leftover.length === 0) return out;
5932
+ const tail = Buffer.alloc(ctx.leftover.length * 2);
5933
+ for (let i = 0; i < ctx.leftover.length; i++) {
5934
+ tail.writeInt16LE(ctx.leftover[i], i * 2);
5641
5935
  }
5642
- const reader = response.body.getReader();
5643
- try {
5644
- while (true) {
5645
- const { done, value } = await reader.read();
5646
- if (done) break;
5647
- if (value && value.length > 0) {
5648
- yield Buffer.from(value);
5649
- }
5650
- }
5651
- } finally {
5652
- if (typeof reader.cancel === "function")
5653
- await reader.cancel().catch(() => {
5654
- });
5655
- reader.releaseLock();
5936
+ return Buffer.concat([out, tail]);
5937
+ }
5938
+ };
5939
+
5940
+ // src/tts/openai.ts
5941
/**
 * Options-object front end for `OpenAITTS`.
 *
 * Takes the API key from `opts.apiKey` or the `OPENAI_API_KEY` environment
 * variable (throwing when neither is set) and forwards the remaining
 * options positionally to the base class, substituting a default for each
 * one that is unset.
 */
var TTS3 = class extends OpenAITTS {
  static providerKey = "openai_tts";
  constructor(opts = {}) {
    const apiKey = opts.apiKey ?? process.env.OPENAI_API_KEY;
    if (!apiKey) {
      throw new Error(
        "OpenAI TTS requires an apiKey. Pass { apiKey: 'sk-...' } or set OPENAI_API_KEY in the environment."
      );
    }
    const voice = opts.voice ?? "alloy";
    const model = opts.model ?? "gpt-4o-mini-tts";
    // `instructions` and `speed` are passed as null when unset.
    const instructions = opts.instructions ?? null;
    const speed = opts.speed ?? null;
    const antiAlias = opts.antiAlias ?? false;
    super(apiKey, voice, model, instructions, speed, antiAlias);
  }
};
5659
5960
 
5660
5961
  // src/tts/cartesia.ts
5962
+ init_esm_shims();
5661
5963
  function resolveApiKey3(apiKey) {
5662
5964
  const key = apiKey ?? process.env.CARTESIA_API_KEY;
5663
5965
  if (!key) {
@@ -5687,150 +5989,6 @@ var TTS4 = class _TTS extends CartesiaTTS {
5687
5989
 
5688
5990
  // src/tts/rime.ts
5689
5991
  init_esm_shims();
5690
-
5691
- // src/providers/rime-tts.ts
5692
- init_esm_shims();
5693
- var RIME_BASE_URL = "https://users.rime.ai/v1/rime-tts";
5694
- var RimeModel = {
5695
- ARCANA: "arcana",
5696
- MIST: "mist",
5697
- MIST_V2: "mistv2"
5698
- };
5699
- var RimeAudioFormat = {
5700
- PCM: "audio/pcm",
5701
- MP3: "audio/mp3",
5702
- WAV: "audio/wav",
5703
- MULAW: "audio/mulaw"
5704
- };
5705
- var ARCANA_MODEL_TIMEOUT_MS = 60 * 4 * 1e3;
5706
- var MIST_MODEL_TIMEOUT_MS = 30 * 1e3;
5707
- function isMistModel(model) {
5708
- return model.includes(RimeModel.MIST);
5709
- }
5710
- function timeoutForModel(model) {
5711
- if (model === RimeModel.ARCANA) return ARCANA_MODEL_TIMEOUT_MS;
5712
- return MIST_MODEL_TIMEOUT_MS;
5713
- }
5714
- var RimeTTS = class {
5715
- /** Stable pricing/dashboard key — read by stream-handler/metrics. */
5716
- static providerKey = "rime";
5717
- apiKey;
5718
- model;
5719
- speaker;
5720
- lang;
5721
- sampleRate;
5722
- repetitionPenalty;
5723
- temperature;
5724
- topP;
5725
- maxTokens;
5726
- speedAlpha;
5727
- reduceLatency;
5728
- pauseBetweenBrackets;
5729
- phonemizeBetweenBrackets;
5730
- baseUrl;
5731
- totalTimeoutMs;
5732
- constructor(apiKey, opts = {}) {
5733
- this.apiKey = apiKey;
5734
- this.model = opts.model ?? RimeModel.ARCANA;
5735
- const defaultSpeaker = isMistModel(this.model) ? "cove" : "astra";
5736
- this.speaker = opts.speaker ?? defaultSpeaker;
5737
- this.lang = opts.lang ?? "eng";
5738
- this.sampleRate = opts.sampleRate ?? 16e3;
5739
- this.repetitionPenalty = opts.repetitionPenalty;
5740
- this.temperature = opts.temperature;
5741
- this.topP = opts.topP;
5742
- this.maxTokens = opts.maxTokens;
5743
- this.speedAlpha = opts.speedAlpha;
5744
- this.reduceLatency = opts.reduceLatency;
5745
- this.pauseBetweenBrackets = opts.pauseBetweenBrackets;
5746
- this.phonemizeBetweenBrackets = opts.phonemizeBetweenBrackets;
5747
- this.baseUrl = opts.baseUrl ?? RIME_BASE_URL;
5748
- this.totalTimeoutMs = timeoutForModel(this.model);
5749
- }
5750
- buildPayload(text) {
5751
- const payload = {
5752
- speaker: this.speaker,
5753
- text,
5754
- modelId: this.model
5755
- };
5756
- if (this.model === RimeModel.ARCANA) {
5757
- if (this.repetitionPenalty !== void 0)
5758
- payload.repetition_penalty = this.repetitionPenalty;
5759
- if (this.temperature !== void 0) payload.temperature = this.temperature;
5760
- if (this.topP !== void 0) payload.top_p = this.topP;
5761
- if (this.maxTokens !== void 0) payload.max_tokens = this.maxTokens;
5762
- payload.lang = this.lang;
5763
- payload.samplingRate = this.sampleRate;
5764
- } else if (isMistModel(this.model)) {
5765
- payload.lang = this.lang;
5766
- payload.samplingRate = this.sampleRate;
5767
- if (this.speedAlpha !== void 0) payload.speedAlpha = this.speedAlpha;
5768
- if (this.model === RimeModel.MIST_V2 && this.reduceLatency !== void 0) {
5769
- payload.reduceLatency = this.reduceLatency;
5770
- }
5771
- if (this.pauseBetweenBrackets !== void 0) {
5772
- payload.pauseBetweenBrackets = this.pauseBetweenBrackets;
5773
- }
5774
- if (this.phonemizeBetweenBrackets !== void 0) {
5775
- payload.phonemizeBetweenBrackets = this.phonemizeBetweenBrackets;
5776
- }
5777
- }
5778
- return payload;
5779
- }
5780
- /** Synthesize text and return the concatenated audio buffer. */
5781
- async synthesize(text) {
5782
- const chunks = [];
5783
- for await (const chunk of this.synthesizeStream(text)) {
5784
- chunks.push(chunk);
5785
- }
5786
- return Buffer.concat(chunks);
5787
- }
5788
- /**
5789
- * Synthesize text and yield raw PCM_S16LE chunks at the configured
5790
- * `sampleRate` as they stream in.
5791
- */
5792
- async *synthesizeStream(text) {
5793
- const response = await fetch(this.baseUrl, {
5794
- method: "POST",
5795
- headers: {
5796
- accept: RimeAudioFormat.PCM,
5797
- Authorization: `Bearer ${this.apiKey}`,
5798
- "content-type": "application/json"
5799
- },
5800
- body: JSON.stringify(this.buildPayload(text)),
5801
- signal: AbortSignal.timeout(this.totalTimeoutMs)
5802
- });
5803
- if (!response.ok) {
5804
- const body = await response.text();
5805
- throw new Error(`Rime TTS error ${response.status}: ${body}`);
5806
- }
5807
- const contentType = response.headers.get("content-type") ?? "";
5808
- if (!contentType.startsWith("audio")) {
5809
- const body = await response.text();
5810
- throw new Error(`Rime returned non-audio response: ${body.slice(0, 500)}`);
5811
- }
5812
- if (!response.body) {
5813
- throw new Error("Rime TTS: no response body");
5814
- }
5815
- const reader = response.body.getReader();
5816
- try {
5817
- while (true) {
5818
- const { done, value } = await reader.read();
5819
- if (done) break;
5820
- if (value && value.length > 0) {
5821
- yield Buffer.from(value);
5822
- }
5823
- }
5824
- } finally {
5825
- if (typeof reader.cancel === "function")
5826
- await reader.cancel().catch(() => {
5827
- });
5828
- reader.releaseLock();
5829
- }
5830
- }
5831
- };
5832
-
5833
- // src/tts/rime.ts
5834
5992
  var TTS5 = class extends RimeTTS {
5835
5993
  static providerKey = "rime";
5836
5994
  constructor(opts = {}) {
@@ -6469,12 +6627,6 @@ init_esm_shims();
6469
6627
 
6470
6628
  // src/providers/groq-llm.ts
6471
6629
  init_esm_shims();
6472
-
6473
- // src/version.ts
6474
- init_esm_shims();
6475
- var VERSION = "0.5.5";
6476
-
6477
- // src/providers/groq-llm.ts
6478
6630
  var GROQ_BASE_URL = "https://api.groq.com/openai/v1";
6479
6631
  var GroqModel = {
6480
6632
  LLAMA_3_3_70B_VERSATILE: "llama-3.3-70b-versatile",
@@ -8131,12 +8283,28 @@ var TwilioAdapter = class _TwilioAdapter {
8131
8283
  return { callSid: call.sid };
8132
8284
  }
8133
8285
  /**
8134
- * Build a minimal ``<Response><Connect><Stream url="..."/></Connect></Response>``
8135
- * TwiML document. Mirrors the Python adapter's ``generate_stream_twiml``.
8286
+ * Build a ``<Response><Connect><Stream url="...">`` TwiML document.
8287
+ *
8288
+ * ``parameters`` is forwarded as ``<Parameter name="..." value="..."/>``
8289
+ * children of ``<Stream>``. Twilio Media Streams strips query-string params
8290
+ * from the ``<Stream url=...>`` before the WS handshake, so
8291
+ * ``<Parameter>`` tags are the supported way to pre-populate
8292
+ * ``start.customParameters`` on the WS ``start`` frame. Used by the
8293
+ * inbound path to carry caller / callee through to the bridge.
8294
+ *
8295
+ * Mirrors the Python adapter's ``generate_stream_twiml``.
8136
8296
  */
8137
- static generateStreamTwiml(streamUrl) {
8138
- const escaped = streamUrl.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&apos;");
8139
- return `<?xml version="1.0" encoding="UTF-8"?><Response><Connect><Stream url="${escaped}"/></Connect></Response>`;
8297
+ static generateStreamTwiml(streamUrl, parameters) {
8298
+ const esc = (s) => s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&apos;");
8299
+ const escapedUrl = esc(streamUrl);
8300
+ let paramTags = "";
8301
+ if (parameters) {
8302
+ for (const [name, value] of Object.entries(parameters)) {
8303
+ if (value == null) continue;
8304
+ paramTags += `<Parameter name="${esc(name)}" value="${esc(String(value))}"/>`;
8305
+ }
8306
+ }
8307
+ return `<?xml version="1.0" encoding="UTF-8"?><Response><Connect><Stream url="${escapedUrl}">${paramTags}</Stream></Connect></Response>`;
8140
8308
  }
8141
8309
  /** Force-complete an in-progress call. */
8142
8310
  async endCall(callSid) {
@@ -8529,6 +8697,8 @@ export {
8529
8697
  CallMetricsAccumulator,
8530
8698
  STT4 as CartesiaSTT,
8531
8699
  TTS4 as CartesiaTTS,
8700
+ CartesiaTTSModel,
8701
+ CartesiaTTSVoiceMode,
8532
8702
  LLM4 as CerebrasLLM,
8533
8703
  ChatContext,
8534
8704
  CloudflareTunnel,
@@ -8536,10 +8706,13 @@ export {
8536
8706
  DEFAULT_PRICING,
8537
8707
  DTMF_EVENTS,
8538
8708
  DeepFilterNetFilter,
8709
+ DeepgramModel,
8539
8710
  STT as DeepgramSTT,
8540
8711
  DefaultToolExecutor,
8541
8712
  ConvAI as ElevenLabsConvAI,
8542
8713
  ElevenLabsConvAIAdapter,
8714
+ ElevenLabsModel,
8715
+ ElevenLabsOutputFormat,
8543
8716
  ElevenLabsTTS as ElevenLabsRestTTS,
8544
8717
  TTS as ElevenLabsTTS,
8545
8718
  TTS2 as ElevenLabsWebSocketTTS,
@@ -8568,8 +8741,15 @@ export {
8568
8741
  Realtime2 as OpenAIRealtime2,
8569
8742
  OpenAIRealtime2Adapter,
8570
8743
  OpenAIRealtimeAdapter,
8744
+ OpenAIRealtimeAudioFormat,
8745
+ OpenAIRealtimeModel,
8746
+ OpenAIRealtimeVADType,
8571
8747
  TTS3 as OpenAITTS,
8572
8748
  STT3 as OpenAITranscribeSTT,
8749
+ OpenAITranscriptionModel,
8750
+ OpenAIVoice,
8751
+ PRICING_LAST_UPDATED,
8752
+ PRICING_VERSION,
8573
8753
  PartialStreamError,
8574
8754
  Patter,
8575
8755
  PatterConnectionError,
@@ -8577,9 +8757,12 @@ export {
8577
8757
  PatterTool,
8578
8758
  PcmCarry,
8579
8759
  PipelineHookExecutor,
8760
+ PricingUnit,
8580
8761
  ProvisionError,
8581
8762
  RateLimitError,
8582
8763
  RemoteMessageHandler,
8764
+ RimeAudioFormat,
8765
+ RimeModel,
8583
8766
  TTS5 as RimeTTS,
8584
8767
  SPAN_BARGEIN,
8585
8768
  SPAN_CALL,