getpatter 0.6.0 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -6,6 +6,7 @@ import {
6
6
  CallMetricsAccumulator,
7
7
  DEFAULT_MIN_SENTENCE_LEN,
8
8
  DEFAULT_PRICING,
9
+ DeepgramModel,
9
10
  DeepgramSTT,
10
11
  DefaultToolExecutor,
11
12
  ElevenLabsConvAIAdapter,
@@ -15,11 +16,12 @@ import {
15
16
  LLMLoop,
16
17
  MetricsStore,
17
18
  OpenAILLMProvider,
18
- OpenAIRealtimeAdapter,
19
+ PRICING_LAST_UPDATED,
20
+ PRICING_VERSION,
19
21
  PatterConnectionError,
20
22
  PatterError,
21
- PcmCarry,
22
23
  PipelineHookExecutor,
24
+ PricingUnit,
23
25
  ProvisionError,
24
26
  RateLimitError,
25
27
  RemoteMessageHandler,
@@ -31,18 +33,14 @@ import {
31
33
  SPAN_TOOL,
32
34
  SPAN_TTS,
33
35
  SentenceChunker,
34
- StatefulResampler,
35
36
  TestSession,
37
+ VERSION,
36
38
  calculateRealtimeCost,
37
39
  calculateSttCost,
38
40
  calculateTelephonyCost,
39
41
  calculateTtsCost,
40
42
  callsToCsv,
41
43
  callsToJson,
42
- createResampler16kTo8k,
43
- createResampler24kTo16k,
44
- createResampler24kTo8k,
45
- createResampler8kTo16k,
46
44
  initTracing,
47
45
  isRemoteUrl,
48
46
  isTracingEnabled,
@@ -52,14 +50,34 @@ import {
52
50
  mergePricing,
53
51
  mountApi,
54
52
  mountDashboard,
53
+ resolveLogRoot,
54
+ startSpan
55
+ } from "./chunk-LE63CSOB.mjs";
56
+ import {
57
+ OpenAIRealtime2Adapter,
58
+ OpenAIRealtimeAdapter,
59
+ OpenAIRealtimeAudioFormat,
60
+ OpenAIRealtimeModel,
61
+ OpenAIRealtimeVADType,
62
+ OpenAITranscriptionModel,
63
+ OpenAIVoice,
64
+ PcmCarry,
65
+ StatefulResampler,
66
+ createResampler16kTo8k,
67
+ createResampler24kTo16k,
68
+ createResampler24kTo8k,
69
+ createResampler8kTo16k,
55
70
  mulawToPcm16,
56
71
  pcm16ToMulaw,
57
72
  resample16kTo8k,
58
73
  resample24kTo16k,
59
- resample8kTo16k,
60
- resolveLogRoot,
61
- startSpan
62
- } from "./chunk-JUQ5WQTQ.mjs";
74
+ resample8kTo16k
75
+ } from "./chunk-CL2U3YET.mjs";
76
+ import {
77
+ MinWordsStrategy,
78
+ evaluateStrategies,
79
+ resetStrategies
80
+ } from "./chunk-D4424JZR.mjs";
63
81
  import {
64
82
  getLogger,
65
83
  setLogger
@@ -69,7 +87,7 @@ import {
69
87
  } from "./chunk-6GR5MHHQ.mjs";
70
88
  import {
71
89
  SileroVAD
72
- } from "./chunk-X3364LSI.mjs";
90
+ } from "./chunk-R2T4JABZ.mjs";
73
91
  import {
74
92
  __dirname,
75
93
  __require,
@@ -99,7 +117,31 @@ var Realtime = class {
99
117
  );
100
118
  }
101
119
  this.apiKey = key;
102
- this.model = opts.model ?? "gpt-4o-mini-realtime-preview";
120
+ this.model = opts.model ?? "gpt-realtime-mini";
121
+ this.voice = opts.voice ?? "alloy";
122
+ this.reasoningEffort = opts.reasoningEffort;
123
+ this.inputAudioTranscriptionModel = opts.inputAudioTranscriptionModel;
124
+ }
125
+ };
126
+
127
+ // src/engines/openai-2.ts
128
+ init_esm_shims();
129
+ var Realtime2 = class {
130
+ kind = "openai_realtime_2";
131
+ apiKey;
132
+ model;
133
+ voice;
134
+ reasoningEffort;
135
+ inputAudioTranscriptionModel;
136
+ constructor(opts = {}) {
137
+ const key = opts.apiKey ?? process.env.OPENAI_API_KEY;
138
+ if (!key) {
139
+ throw new Error(
140
+ "OpenAI Realtime 2 requires an apiKey. Pass { apiKey: 'sk-...' } or set OPENAI_API_KEY in the environment."
141
+ );
142
+ }
143
+ this.apiKey = key;
144
+ this.model = opts.model ?? "gpt-realtime-2";
103
145
  this.voice = opts.voice ?? "alloy";
104
146
  this.reasoningEffort = opts.reasoningEffort;
105
147
  this.inputAudioTranscriptionModel = opts.inputAudioTranscriptionModel;
@@ -520,11 +562,41 @@ function filterUndef(obj) {
520
562
  }
521
563
 
522
564
  // src/client.ts
565
+ var PREWARM_CACHE_MAX = 200;
566
+ var PREWARM_TTL_GRACE_MS = 5e3;
567
+ var PARKED_CONN_TTL_MS = 3e4;
523
568
  function resolvePersistRoot(persist) {
524
569
  if (persist === false) return null;
525
570
  if (persist === true) return resolveLogRoot("auto");
526
571
  if (typeof persist === "string") return resolveLogRoot(persist);
527
- return resolveLogRoot();
572
+ const envRoot = resolveLogRoot();
573
+ if (envRoot !== null) return envRoot;
574
+ return resolveLogRoot("auto");
575
+ }
576
+ function closeParkedConnections(slot) {
577
+ if (slot.stt) {
578
+ try {
579
+ slot.stt.close();
580
+ } catch {
581
+ }
582
+ }
583
+ if (slot.tts) {
584
+ try {
585
+ slot.tts.ws.close();
586
+ } catch {
587
+ }
588
+ }
589
+ if (slot.openaiRealtime) {
590
+ const wsAny = slot.openaiRealtime;
591
+ if (wsAny._parkedKeepalive) {
592
+ clearInterval(wsAny._parkedKeepalive);
593
+ delete wsAny._parkedKeepalive;
594
+ }
595
+ try {
596
+ slot.openaiRealtime.close();
597
+ } catch {
598
+ }
599
+ }
528
600
  }
529
601
  var Patter = class {
530
602
  localConfig;
@@ -546,6 +618,65 @@ var Patter = class {
546
618
  * ``Cannot use both tunnel: true and webhookUrl``.
547
619
  */
548
620
  tunnelOwnsWebhookUrl = false;
621
+ /**
622
+ * Pre-rendered first-message TTS audio per outbound call_id. Populated
623
+ * by :meth:`call` when ``agent.prewarmFirstMessage`` is true; consumed
624
+ * by the StreamHandler firstMessage emit so the greeting streams
625
+ * instantly on ``start`` instead of paying the 200-700 ms TTS first-byte
626
+ * latency. See ``AgentOptions.prewarmFirstMessage``.
627
+ *
628
+ * Stores raw bytes in the TTS provider's native sample rate; the
629
+ * carrier-side audio sender resamples on emit.
630
+ */
631
+ prewarmAudio = /* @__PURE__ */ new Map();
632
+ /**
633
+ * Call IDs whose prewarm cache slot has already been consumed —
634
+ * either by ``popPrewarmAudio`` (cache hit OR miss on the firstMessage
635
+ * emit path) or by ``recordPrewarmWaste`` (call ended before pickup).
636
+ * The prewarm task checks this set BEFORE writing bytes so a slow
637
+ * synth that finishes after the consumer already polled doesn't
638
+ * orphan bytes in ``prewarmAudio``. See FIX #92 in the parity audit.
639
+ */
640
+ prewarmConsumed = /* @__PURE__ */ new Set();
641
+ /**
642
+ * Background tasks tracked so :meth:`disconnect` can wait on / drop any
643
+ * still-running prewarm-first-message synth before tearing down.
644
+ */
645
+ prewarmTasks = /* @__PURE__ */ new Set();
646
+ /**
647
+ * TTL eviction timers keyed by call_id so :meth:`disconnect` (and
648
+ * normal consumption / waste-record paths) can cancel any pending
649
+ * timer when the slot drains naturally. Without this, the timer
650
+ * would WARN spuriously after the cache was already emptied.
651
+ */
652
+ prewarmTtlTimers = /* @__PURE__ */ new Map();
653
+ /**
654
+ * Pre-opened, fully-handshaked provider WebSockets keyed by
655
+ * carrier-issued call_id. Populated by ``parkProviderConnections``
656
+ * during the carrier ringing window; consumed by the per-call
657
+ * StreamHandler at ``start`` via ``adoptWebSocket(...)`` so STT / TTS
658
+ * / Realtime audio can flow on the first turn without paying the
659
+ * 150-900 ms TLS + WS-upgrade + protocol-handshake round-trip again.
660
+ *
661
+ * Distinct from ``prewarmAudio`` (which holds pre-rendered TTS bytes
662
+ * for the first message); the two features are complementary and
663
+ * orthogonal — both can be active for the same call.
664
+ *
665
+ * Each slot may hold up to three parked connections (STT, TTS,
666
+ * Realtime). Drained by:
667
+ * - {@link popPrewarmedConnections} on the carrier ``start`` event
668
+ * (consumed normally — the handles transfer to the StreamHandler)
669
+ * - {@link recordPrewarmWaste} on call-termination paths (no-answer,
670
+ * busy, failed, canceled, AMD voicemail). Closes parked sockets.
671
+ * - {@link disconnect} on Patter teardown. Closes all parked sockets.
672
+ */
673
+ prewarmedConnections = /* @__PURE__ */ new Map();
674
+ /**
675
+ * TTL eviction handles keyed by call_id for connections that are never
676
+ * adopted (e.g. a carrier that swallows ``start``). Closes the parked
677
+ * sockets so they don't leak past the safety window.
678
+ */
679
+ prewarmedConnTimers = /* @__PURE__ */ new Map();
549
680
  /**
550
681
  * Speech-edge events for turn-taking instrumentation. Public surface: the
551
682
  * seven `on*` proxy accessors below plus the `conversationState` snapshot.
@@ -553,13 +684,15 @@ var Patter = class {
553
684
  * the previous behaviour.
554
685
  *
555
686
  * See `src/_speech-events.ts` for the full event taxonomy and the
556
- * industry-alignment table (LiveKit / Pipecat / OpenAI Realtime).
687
+ * OpenAI Realtime alignment table.
557
688
  */
558
689
  speechEvents = new SpeechEvents();
559
690
  // ---- Speech-edge event callback proxies ------------------------------
560
- // The seven `on*` properties below mirror the public APIs of LiveKit
561
- // Agents, Pipecat and OpenAI Realtime. They proxy to `speechEvents` so
562
- // the dispatcher remains the single source of truth (state + OTel).
691
+ // The seven `on*` properties below follow the canonical voice-agent
692
+ // metric set (user/agent state transitions, turn boundaries, TTFT, audio
693
+ // first-byte) and align with OpenAI Realtime where applicable. They
694
+ // proxy to `speechEvents` so the dispatcher remains the single source of
695
+ // truth (state + OTel).
563
696
  get onUserSpeechStarted() {
564
697
  return this.speechEvents.onUserSpeechStarted;
565
698
  }
@@ -604,8 +737,8 @@ var Patter = class {
604
737
  }
605
738
  /**
606
739
  * Snapshot of the current per-side state of the call.
607
- * Mirrors LiveKit's `user_state_changed` / `agent_state_changed`
608
- * payloads. Read-only and safe to call at any time.
740
+ * Returns the user_state / agent_state payload shape — read-only and
741
+ * safe to call at any time.
609
742
  */
610
743
  get conversationState() {
611
744
  return this.speechEvents.conversationState;
@@ -717,7 +850,7 @@ var Patter = class {
717
850
  );
718
851
  }
719
852
  const engine = opts.engine;
720
- if (engine instanceof Realtime) {
853
+ if (engine instanceof Realtime || engine instanceof Realtime2) {
721
854
  working = {
722
855
  ...working,
723
856
  provider: "openai_realtime",
@@ -735,7 +868,7 @@ var Patter = class {
735
868
  };
736
869
  } else {
737
870
  throw new Error(
738
- "Unknown engine. Expected OpenAIRealtime or ElevenLabsConvAI instance."
871
+ "Unknown engine. Expected OpenAIRealtime, OpenAIRealtime2, or ElevenLabsConvAI instance."
739
872
  );
740
873
  }
741
874
  } else if (!working.provider && (working.stt !== void 0 || working.tts !== void 0 || working.llm !== void 0)) {
@@ -795,6 +928,13 @@ var Patter = class {
795
928
  if (!opts.agent.systemPrompt && opts.agent.provider !== "pipeline") {
796
929
  throw new Error("agent.systemPrompt is required");
797
930
  }
931
+ if (opts.agent.echoCancellation) {
932
+ try {
933
+ await import("./aec-PJJMUM5E.mjs");
934
+ } catch (err) {
935
+ getLogger().debug(`AEC pre-import failed at serve(): ${String(err)}`);
936
+ }
937
+ }
798
938
  if (opts.port !== void 0) {
799
939
  if (typeof opts.port !== "number" || opts.port < 1 || opts.port > 65535) {
800
940
  throw new RangeError(`port must be between 1 and 65535, got ${opts.port}`);
@@ -876,6 +1016,9 @@ var Patter = class {
876
1016
  opts.dashboard ?? true,
877
1017
  opts.dashboardToken ?? ""
878
1018
  );
1019
+ this.embeddedServer.popPrewarmAudio = this.popPrewarmAudio;
1020
+ this.embeddedServer.popPrewarmedConnections = this.popPrewarmedConnections;
1021
+ this.embeddedServer.recordPrewarmWaste = this.recordPrewarmWaste;
879
1022
  try {
880
1023
  await this.embeddedServer.start(port);
881
1024
  if (this.tunnelHandle) {
@@ -890,7 +1033,7 @@ var Patter = class {
890
1033
  }
891
1034
  /** Run the agent in interactive terminal-test mode (no real telephony). */
892
1035
  async test(opts) {
893
- const { TestSession: TestSession2 } = await import("./test-mode-Y7YG5LFZ.mjs");
1036
+ const { TestSession: TestSession2 } = await import("./test-mode-RS57BDM6.mjs");
894
1037
  const session = new TestSession2();
895
1038
  await session.run({
896
1039
  agent: opts.agent,
@@ -900,6 +1043,376 @@ var Patter = class {
900
1043
  onCallEnd: opts.onCallEnd
901
1044
  });
902
1045
  }
1046
+ /**
1047
+ * Pop and return the pre-synthesised first-message audio for ``callId``.
1048
+ *
1049
+ * Returns ``undefined`` when ``agent.prewarmFirstMessage`` was not set
1050
+ * for the originating outbound call, or when the synth was still in
1051
+ * flight at the moment the carrier emitted ``start`` (cache miss — the
1052
+ * StreamHandler falls back to live TTS).
1053
+ *
1054
+ * Called by the per-call StreamHandler at the start of the firstMessage
1055
+ * emit. Returning bytes here lets the handler skip the live TTS
1056
+ * synthesis and stream the cached buffer directly.
1057
+ *
1058
+ * Marks ``callId`` as consumed regardless of cache hit/miss so a slow
1059
+ * synth task that finishes after this call drops its bytes instead of
1060
+ * orphaning them in ``prewarmAudio``. See FIX #92.
1061
+ */
1062
+ popPrewarmAudio = (callId) => {
1063
+ this.prewarmConsumed.add(callId);
1064
+ const ttl = this.prewarmTtlTimers.get(callId);
1065
+ if (ttl !== void 0) {
1066
+ clearTimeout(ttl);
1067
+ this.prewarmTtlTimers.delete(callId);
1068
+ }
1069
+ const buf = this.prewarmAudio.get(callId);
1070
+ if (buf !== void 0) this.prewarmAudio.delete(callId);
1071
+ return buf;
1072
+ };
1073
+ /**
1074
+ * Log a warning if a prewarmed greeting was paid for but never used.
1075
+ * The TTS bill for ``agent.firstMessage`` has already been incurred by
1076
+ * the background synth task, so the user should know — opt-in feature
1077
+ * with a known cost surface.
1078
+ *
1079
+ * Idempotent: the second call for the same ``callId`` is a no-op, so
1080
+ * the status callback firing first and ``endCall`` running afterwards
1081
+ * (or vice-versa) does not double-WARN. Public so the embedded
1082
+ * server's webhook handlers can invoke it on no-answer / busy /
1083
+ * failed / canceled / AMD-machine paths. See FIX #91.
1084
+ */
1085
+ recordPrewarmWaste = (callId) => {
1086
+ this.closePrewarmedConnections(callId);
1087
+ if (this.prewarmConsumed.has(callId)) {
1088
+ this.prewarmAudio.delete(callId);
1089
+ return;
1090
+ }
1091
+ this.prewarmConsumed.add(callId);
1092
+ const ttl = this.prewarmTtlTimers.get(callId);
1093
+ if (ttl !== void 0) {
1094
+ clearTimeout(ttl);
1095
+ this.prewarmTtlTimers.delete(callId);
1096
+ }
1097
+ const buf = this.prewarmAudio.get(callId);
1098
+ if (buf !== void 0) {
1099
+ this.prewarmAudio.delete(callId);
1100
+ getLogger().warn(
1101
+ `Prewarm wasted for call ${callId} \u2014 first-message TTS already paid (~${buf.byteLength} bytes synthesised) but call ended before pickup.`
1102
+ );
1103
+ }
1104
+ };
1105
+ /**
1106
+ * Pop and return the parked provider WebSockets for ``callId``, or
1107
+ * ``undefined`` when no parked connections exist.
1108
+ *
1109
+ * Wired into ``EmbeddedServer.popPrewarmedConnections`` so the
1110
+ * per-call ``StreamHandler`` can adopt the parked sockets at the
1111
+ * carrier ``start`` event instead of opening fresh ones — saving
1112
+ * ~150-900 ms of cold-start handshake on the first turn.
1113
+ */
1114
+ popPrewarmedConnections = (callId) => {
1115
+ const slot = this.prewarmedConnections.get(callId);
1116
+ if (slot === void 0) return void 0;
1117
+ this.prewarmedConnections.delete(callId);
1118
+ const ttl = this.prewarmedConnTimers.get(callId);
1119
+ if (ttl !== void 0) {
1120
+ clearTimeout(ttl);
1121
+ this.prewarmedConnTimers.delete(callId);
1122
+ }
1123
+ return slot;
1124
+ };
1125
+ /**
1126
+ * Close any parked provider WebSockets for ``callId``. Wired into
1127
+ * ``EmbeddedServer.closePrewarmedConnections`` so call-termination
1128
+ * paths (no-answer, busy, failed, canceled, AMD voicemail) drop the
1129
+ * sockets cleanly instead of leaving them to the upstream timeout.
1130
+ */
1131
+ closePrewarmedConnections = (callId) => {
1132
+ const slot = this.prewarmedConnections.get(callId);
1133
+ if (slot === void 0) return;
1134
+ this.prewarmedConnections.delete(callId);
1135
+ const ttl = this.prewarmedConnTimers.get(callId);
1136
+ if (ttl !== void 0) {
1137
+ clearTimeout(ttl);
1138
+ this.prewarmedConnTimers.delete(callId);
1139
+ }
1140
+ closeParkedConnections(slot);
1141
+ };
1142
+ /**
1143
+ * Open and park provider WebSockets in parallel with the carrier-side
1144
+ * ``initiateCall``. Unlike :meth:`spawnProviderWarmup` (which closes
1145
+ * the WS after a brief idle), the sockets opened here stay OPEN and
1146
+ * are handed off to the per-call ``StreamHandler`` on ``start``.
1147
+ *
1148
+ * This is the structural fix for first-turn cold-start: on Node's
1149
+ * ``ws`` package, opening + closing a WS does NOT warm TLS for the
1150
+ * next open — every fresh ``new WebSocket()`` re-pays the full
1151
+ * TCP + TLS + HTTP-101 round-trip. By keeping the WS open and
1152
+ * adopting it directly, the live first turn skips the handshake
1153
+ * entirely (saves ~150-900 ms depending on provider).
1154
+ *
1155
+ * Best-effort: each provider's parking task is wrapped in
1156
+ * ``Promise.allSettled`` so a slow or failing endpoint cannot block
1157
+ * the others. Providers without ``openParkedConnection`` contribute
1158
+ * nothing — the call falls through to the cold ``connect()`` path
1159
+ * for that provider.
1160
+ */
1161
+ parkProviderConnections(agent, callId) {
1162
+ const stt = agent.stt;
1163
+ const tts = agent.tts;
1164
+ const sttOpen = typeof stt?.openParkedConnection === "function" ? stt.openParkedConnection.bind(stt) : null;
1165
+ const ttsOpen = typeof tts?.openParkedConnection === "function" ? tts.openParkedConnection.bind(tts) : null;
1166
+ const providerStr = agent.provider ?? "";
1167
+ const wantsRealtimePark = providerStr === "openai_realtime" || providerStr === "openai_realtime_2";
1168
+ if (!sttOpen && !ttsOpen && !wantsRealtimePark) return;
1169
+ const slot = {};
1170
+ this.prewarmedConnections.set(callId, slot);
1171
+ const startedAt = Date.now();
1172
+ const tasks = [];
1173
+ if (sttOpen) {
1174
+ tasks.push((async () => {
1175
+ try {
1176
+ const ws = await sttOpen();
1177
+ if (this.prewarmedConnections.get(callId) !== slot) {
1178
+ try {
1179
+ ws.close();
1180
+ } catch {
1181
+ }
1182
+ return;
1183
+ }
1184
+ slot.stt = ws;
1185
+ getLogger().info(
1186
+ `[PREWARM] callId=${callId} provider=stt ms=${Date.now() - startedAt}`
1187
+ );
1188
+ } catch (err) {
1189
+ getLogger().debug(`Park STT failed for ${callId}: ${String(err)}`);
1190
+ }
1191
+ })());
1192
+ }
1193
+ if (ttsOpen) {
1194
+ tasks.push((async () => {
1195
+ try {
1196
+ const parked = await ttsOpen();
1197
+ if (this.prewarmedConnections.get(callId) !== slot) {
1198
+ try {
1199
+ parked.ws.close();
1200
+ } catch {
1201
+ }
1202
+ return;
1203
+ }
1204
+ slot.tts = parked;
1205
+ getLogger().info(
1206
+ `[PREWARM] callId=${callId} provider=tts ms=${Date.now() - startedAt}`
1207
+ );
1208
+ } catch (err) {
1209
+ getLogger().debug(`Park TTS failed for ${callId}: ${String(err)}`);
1210
+ }
1211
+ })());
1212
+ }
1213
+ if (wantsRealtimePark) {
1214
+ tasks.push((async () => {
1215
+ const { OpenAIRealtime2Adapter: OpenAIRealtime2Adapter2 } = await import("./openai-realtime-2-CNFARP25.mjs");
1216
+ const apiKey = process.env.OPENAI_API_KEY ?? "";
1217
+ if (!apiKey) {
1218
+ getLogger().debug(`Park OpenAI Realtime skipped for ${callId}: no OPENAI_API_KEY`);
1219
+ return;
1220
+ }
1221
+ try {
1222
+ const tmpAdapter = new OpenAIRealtime2Adapter2(
1223
+ apiKey,
1224
+ agent.model ?? "gpt-realtime-mini",
1225
+ agent.voice ?? "alloy",
1226
+ agent.systemPrompt ?? "",
1227
+ [],
1228
+ // audioFormat — the GA adapter always emits audio/pcm@24000
1229
+ // internally regardless of this value, but it's a required
1230
+ // positional param. Default to g711_ulaw (Twilio wire format).
1231
+ void 0
1232
+ );
1233
+ const ws = await tmpAdapter.openParkedConnection();
1234
+ if (this.prewarmedConnections.get(callId) !== slot) {
1235
+ try {
1236
+ ws.close();
1237
+ } catch {
1238
+ }
1239
+ return;
1240
+ }
1241
+ slot.openaiRealtime = ws;
1242
+ getLogger().info(
1243
+ `[PREWARM] callId=${callId} provider=openai_realtime ms=${Date.now() - startedAt}`
1244
+ );
1245
+ } catch (err) {
1246
+ getLogger().debug(`Park OpenAI Realtime failed for ${callId}: ${String(err)}`);
1247
+ }
1248
+ })());
1249
+ }
1250
+ const task = (async () => {
1251
+ await Promise.allSettled(tasks);
1252
+ })();
1253
+ this.prewarmTasks.add(task);
1254
+ void task.finally(() => {
1255
+ this.prewarmTasks.delete(task);
1256
+ if (!this.prewarmedConnections.has(callId)) return;
1257
+ const handle = setTimeout(() => {
1258
+ this.prewarmedConnTimers.delete(callId);
1259
+ const orphan = this.prewarmedConnections.get(callId);
1260
+ if (orphan === void 0) return;
1261
+ this.prewarmedConnections.delete(callId);
1262
+ closeParkedConnections(orphan);
1263
+ getLogger().warn(
1264
+ `[PREWARM] parked connections evicted by TTL for ${callId} \u2014 call never reached start (~${(PARKED_CONN_TTL_MS / 1e3).toFixed(0)}s).`
1265
+ );
1266
+ }, PARKED_CONN_TTL_MS);
1267
+ handle.unref?.();
1268
+ this.prewarmedConnTimers.set(callId, handle);
1269
+ });
1270
+ }
1271
+ /**
1272
+ * Spawn a fire-and-forget task that warms up STT / TTS / LLM in
1273
+ * parallel with the carrier-side ``initiateCall``.
1274
+ *
1275
+ * Best-effort: each provider's optional ``warmup()`` is wrapped in
1276
+ * ``Promise.allSettled`` so a slow or failing endpoint cannot block
1277
+ * the others. Providers without ``warmup`` contribute nothing.
1278
+ */
1279
+ spawnProviderWarmup(agent) {
1280
+ const targets = [];
1281
+ const collect = (provider, label) => {
1282
+ if (!provider || typeof provider !== "object") return;
1283
+ const fn = provider.warmup;
1284
+ if (typeof fn !== "function") return;
1285
+ targets.push({
1286
+ name: label,
1287
+ fn: fn.bind(provider)
1288
+ });
1289
+ };
1290
+ collect(agent.stt, "stt");
1291
+ collect(agent.tts, "tts");
1292
+ collect(agent.llm, "llm");
1293
+ if (targets.length === 0) return;
1294
+ const task = (async () => {
1295
+ const results = await Promise.allSettled(targets.map((t) => t.fn()));
1296
+ results.forEach((r, i) => {
1297
+ if (r.status === "rejected") {
1298
+ getLogger().debug(
1299
+ `Provider warmup failed (${targets[i].name}): ${String(r.reason)}`
1300
+ );
1301
+ }
1302
+ });
1303
+ })();
1304
+ this.prewarmTasks.add(task);
1305
+ void task.finally(() => this.prewarmTasks.delete(task));
1306
+ }
1307
+ /**
1308
+ * Pre-render ``agent.firstMessage`` to TTS bytes during the ringing
1309
+ * window and stash them in ``prewarmAudio.set(callId, buf)``.
1310
+ *
1311
+ * Skipped silently when ``agent.prewarmFirstMessage`` is false or
1312
+ * when ``agent.tts`` / ``agent.firstMessage`` is missing. The synth
1313
+ * is bounded by ``ringTimeout`` (default 25 s) so a never-answered
1314
+ * call doesn't tie up the TTS connection. On timeout / error the
1315
+ * cache is left empty and the StreamHandler falls back to live TTS.
1316
+ *
1317
+ * **Pipeline mode only.** Realtime / ConvAI provider modes never
1318
+ * consume the prewarm cache (the StreamHandler for those modes runs
1319
+ * its first-message emit through the provider's own audio path).
1320
+ * Spawning the prewarm in those modes pays the TTS bill for nothing
1321
+ * — refused with a warn.
1322
+ *
1323
+ * **Capped at ``PREWARM_CACHE_MAX`` concurrent entries.** Refused
1324
+ * with a warn when the cap is reached (the call still proceeds —
1325
+ * StreamHandler falls back to live TTS).
1326
+ */
1327
+ spawnPrewarmFirstMessage(agent, callId, ringTimeout, carrier) {
1328
+ if (!agent.prewarmFirstMessage) return;
1329
+ const providerMode = agent.provider ?? "openai_realtime";
1330
+ if (providerMode !== "pipeline") {
1331
+ getLogger().warn(
1332
+ `agent.prewarmFirstMessage=true is only supported in pipeline mode (provider=${providerMode}); skipping pre-synth to avoid wasted TTS spend.`
1333
+ );
1334
+ return;
1335
+ }
1336
+ const firstMessage = agent.firstMessage ?? "";
1337
+ const tts = agent.tts;
1338
+ if (!firstMessage || !tts) return;
1339
+ if (typeof tts.synthesizeStream !== "function") return;
1340
+ if (carrier) {
1341
+ const carrierAware = tts;
1342
+ if (typeof carrierAware.setTelephonyCarrier === "function") {
1343
+ try {
1344
+ carrierAware.setTelephonyCarrier(carrier);
1345
+ } catch (err) {
1346
+ getLogger().debug(
1347
+ `Prewarm TTS setTelephonyCarrier failed for ${callId}: ${String(err)}`
1348
+ );
1349
+ }
1350
+ }
1351
+ }
1352
+ const inFlight = this.prewarmAudio.size + this.prewarmTasks.size;
1353
+ if (inFlight >= PREWARM_CACHE_MAX) {
1354
+ getLogger().warn(
1355
+ `Prewarm cache full (${inFlight}/${PREWARM_CACHE_MAX} in-flight) \u2014 skipping pre-synth for call ${callId}; falling back to live TTS at pickup.`
1356
+ );
1357
+ return;
1358
+ }
1359
+ const timeoutMs = (typeof ringTimeout === "number" ? ringTimeout : 25) * 1e3;
1360
+ const task = (async () => {
1361
+ try {
1362
+ const accumulate = async () => {
1363
+ const chunks = [];
1364
+ for await (const chunk of tts.synthesizeStream(firstMessage)) {
1365
+ const u = chunk;
1366
+ if (Buffer.isBuffer(u)) chunks.push(u);
1367
+ else if (ArrayBuffer.isView(u))
1368
+ chunks.push(Buffer.from(u.buffer, u.byteOffset, u.byteLength));
1369
+ }
1370
+ return Buffer.concat(chunks);
1371
+ };
1372
+ const timer = new Promise(
1373
+ (_resolve, reject) => setTimeout(
1374
+ () => reject(new Error("prewarm-first-message timeout")),
1375
+ timeoutMs
1376
+ ).unref?.()
1377
+ );
1378
+ const buf = await Promise.race([accumulate(), timer]);
1379
+ if (buf.byteLength > 0) {
1380
+ if (this.prewarmConsumed.has(callId)) {
1381
+ getLogger().warn(
1382
+ `Prewarm orphaned for call ${callId} \u2014 synth completed (~${buf.byteLength} bytes) AFTER consumer polled; bytes dropped, TTS bill already paid.`
1383
+ );
1384
+ return;
1385
+ }
1386
+ this.prewarmAudio.set(callId, buf);
1387
+ getLogger().debug(
1388
+ `Prewarm first-message ready for call ${callId} (${buf.byteLength} bytes)`
1389
+ );
1390
+ }
1391
+ } catch (err) {
1392
+ getLogger().debug(
1393
+ `Prewarm first-message failed for call ${callId}: ${String(err)}`
1394
+ );
1395
+ }
1396
+ })();
1397
+ this.prewarmTasks.add(task);
1398
+ void task.finally(() => {
1399
+ this.prewarmTasks.delete(task);
1400
+ if (!this.prewarmAudio.has(callId)) return;
1401
+ const ttlMs = timeoutMs + PREWARM_TTL_GRACE_MS;
1402
+ const handle = setTimeout(() => {
1403
+ this.prewarmTtlTimers.delete(callId);
1404
+ const orphan = this.prewarmAudio.get(callId);
1405
+ if (orphan === void 0) return;
1406
+ this.prewarmAudio.delete(callId);
1407
+ this.prewarmConsumed.add(callId);
1408
+ getLogger().warn(
1409
+ `Prewarm bytes evicted by TTL \u2014 call ${callId} never consumed them (~${orphan.byteLength} bytes synthesised, ${(ttlMs / 1e3).toFixed(1)}s after ringTimeout).`
1410
+ );
1411
+ }, ttlMs);
1412
+ handle.unref?.();
1413
+ this.prewarmTtlTimers.set(callId, handle);
1414
+ });
1415
+ }
903
1416
  /** Place an outbound call via the configured carrier. */
904
1417
  async call(options) {
905
1418
  if (!options.to) {
@@ -914,6 +1427,9 @@ var Patter = class {
914
1427
  if (this.embeddedServer) {
915
1428
  this.embeddedServer.onMachineDetection = options.onMachineDetection;
916
1429
  }
1430
+ if (options.agent.prewarm !== false) {
1431
+ this.spawnProviderWarmup(options.agent);
1432
+ }
917
1433
  if (carrier.kind === "telnyx") {
918
1434
  const telnyxKey = carrier.apiKey;
919
1435
  const connectionId = carrier.connectionId;
@@ -939,21 +1455,35 @@ var Patter = class {
939
1455
  if (!response2.ok) {
940
1456
  throw new ProvisionError(`Failed to initiate Telnyx call: ${await response2.text()}`);
941
1457
  }
942
- if (this.embeddedServer) {
1458
+ let telnyxCallId;
1459
+ try {
1460
+ const body = await response2.clone().json();
1461
+ telnyxCallId = body.data?.call_control_id;
1462
+ } catch {
1463
+ }
1464
+ if (telnyxCallId) {
1465
+ const initiatedPayload = {
1466
+ call_id: telnyxCallId,
1467
+ caller: phoneNumber,
1468
+ callee: options.to,
1469
+ direction: "outbound",
1470
+ status: "initiated"
1471
+ };
1472
+ if (this.embeddedServer) {
1473
+ this.embeddedServer.metricsStore.recordCallInitiated(initiatedPayload);
1474
+ }
943
1475
  try {
944
- const body = await response2.clone().json();
945
- const callId = body.data?.call_control_id;
946
- if (callId) {
947
- this.embeddedServer.metricsStore.recordCallInitiated({
948
- call_id: callId,
949
- caller: phoneNumber,
950
- callee: options.to,
951
- direction: "outbound"
952
- });
953
- }
1476
+ const { notifyDashboard: notifyDashboard2 } = await import("./persistence-LVIAHESK.mjs");
1477
+ notifyDashboard2(initiatedPayload);
954
1478
  } catch {
955
1479
  }
956
1480
  }
1481
+ if (telnyxCallId) {
1482
+ this.spawnPrewarmFirstMessage(options.agent, telnyxCallId, effectiveRingTimeout, "telnyx");
1483
+ if (options.agent.prewarm !== false) {
1484
+ this.parkProviderConnections(options.agent, telnyxCallId);
1485
+ }
1486
+ }
957
1487
  return;
958
1488
  }
959
1489
  const twilioSid = carrier.accountSid;
@@ -994,34 +1524,77 @@ var Patter = class {
994
1524
  if (!response.ok) {
995
1525
  throw new ProvisionError(`Failed to initiate call: ${await response.text()}`);
996
1526
  }
997
- if (this.embeddedServer) {
998
- try {
999
- const body = await response.clone().json();
1000
- const callSid = body.sid;
1001
- if (callSid) {
1002
- this.embeddedServer.metricsStore.recordCallInitiated({
1003
- call_id: callSid,
1004
- caller: phoneNumber,
1005
- callee: options.to,
1006
- direction: "outbound"
1007
- });
1008
- const notificationsPath = body.subresource_uris?.notifications;
1009
- if (notificationsPath) {
1010
- getLogger().info(
1011
- `Outbound call ${callSid} placed. Twilio notifications: https://api.twilio.com${notificationsPath} (check here if the call drops with no audio).`
1012
- );
1013
- }
1527
+ let twilioCallSid;
1528
+ let twilioNotificationsPath;
1529
+ try {
1530
+ const body = await response.clone().json();
1531
+ twilioCallSid = body.sid;
1532
+ twilioNotificationsPath = body.subresource_uris?.notifications;
1533
+ } catch {
1534
+ }
1535
+ if (twilioCallSid) {
1536
+ const initiatedPayload = {
1537
+ call_id: twilioCallSid,
1538
+ caller: phoneNumber,
1539
+ callee: options.to,
1540
+ direction: "outbound",
1541
+ status: "initiated"
1542
+ };
1543
+ if (this.embeddedServer) {
1544
+ this.embeddedServer.metricsStore.recordCallInitiated(initiatedPayload);
1545
+ if (twilioNotificationsPath) {
1546
+ getLogger().info(
1547
+ `Outbound call ${twilioCallSid} placed. Twilio notifications: https://api.twilio.com${twilioNotificationsPath} (check here if the call drops with no audio).`
1548
+ );
1014
1549
  }
1550
+ }
1551
+ try {
1552
+ const { notifyDashboard: notifyDashboard2 } = await import("./persistence-LVIAHESK.mjs");
1553
+ notifyDashboard2(initiatedPayload);
1015
1554
  } catch {
1016
1555
  }
1017
1556
  }
1557
+ if (twilioCallSid) {
1558
+ this.spawnPrewarmFirstMessage(options.agent, twilioCallSid, effectiveRingTimeout, "twilio");
1559
+ if (options.agent.prewarm !== false) {
1560
+ this.parkProviderConnections(options.agent, twilioCallSid);
1561
+ }
1562
+ }
1018
1563
  }
1019
1564
  /**
1020
1565
  * Stop the embedded server and any running tunnel. Safe to call multiple
1021
1566
  * times. Leaves the instance reusable: a subsequent ``serve()`` works as
1022
1567
  * if the previous lifecycle never happened.
1568
+ *
1569
+ * Also clears any pending TTL eviction timers, awaits in-flight
1570
+ * prewarm-first-message synth tasks (best-effort, with a 1 s safety
1571
+ * timeout), and clears the prewarm cache. Without this a still-running
1572
+ * TTS WS keeps the user billed long after SDK teardown, and stale
1573
+ * entries leak across ``serve`` / ``disconnect`` cycles. See FIX #93.
1023
1574
  */
1024
1575
  async disconnect() {
1576
+ for (const handle of this.prewarmTtlTimers.values()) {
1577
+ clearTimeout(handle);
1578
+ }
1579
+ this.prewarmTtlTimers.clear();
1580
+ if (this.prewarmTasks.size > 0) {
1581
+ const drain = Promise.allSettled(Array.from(this.prewarmTasks));
1582
+ const timer = new Promise(
1583
+ (resolve) => setTimeout(resolve, 1e3).unref?.()
1584
+ );
1585
+ await Promise.race([drain, timer]);
1586
+ }
1587
+ this.prewarmTasks.clear();
1588
+ this.prewarmAudio.clear();
1589
+ this.prewarmConsumed.clear();
1590
+ for (const handle of this.prewarmedConnTimers.values()) {
1591
+ clearTimeout(handle);
1592
+ }
1593
+ this.prewarmedConnTimers.clear();
1594
+ for (const slot of this.prewarmedConnections.values()) {
1595
+ closeParkedConnections(slot);
1596
+ }
1597
+ this.prewarmedConnections.clear();
1025
1598
  if (this.tunnelHandle) {
1026
1599
  this.tunnelHandle.stop();
1027
1600
  this.tunnelHandle = null;
@@ -1072,6 +1645,7 @@ var Patter = class {
1072
1645
  if (!callSid) {
1073
1646
  throw new Error("callSid must be a non-empty string");
1074
1647
  }
1648
+ this.recordPrewarmWaste(callSid);
1075
1649
  const carrier = this.localConfig.carrier;
1076
1650
  if (carrier.kind === "twilio") {
1077
1651
  const auth = Buffer.from(`${carrier.accountSid}:${carrier.authToken}`).toString("base64");
@@ -1107,7 +1681,7 @@ var Patter = class {
1107
1681
  }
1108
1682
  };
1109
1683
  async function waitForTunnelPubliclyReachable(hostname, totalTimeoutMs = 6e4, graceMs = 5e3) {
1110
- const log = getLogger();
1684
+ const log2 = getLogger();
1111
1685
  const { Resolver } = await import("dns/promises");
1112
1686
  const resolver = new Resolver({ timeout: 1500, tries: 1 });
1113
1687
  resolver.setServers(["1.1.1.1", "8.8.8.8"]);
@@ -1119,7 +1693,7 @@ async function waitForTunnelPubliclyReachable(hostname, totalTimeoutMs = 6e4, gr
1119
1693
  try {
1120
1694
  const records = await resolver.resolve4(hostname);
1121
1695
  const first = records[0] ?? "<unknown>";
1122
- log.info(
1696
+ log2.info(
1123
1697
  "Tunnel DNS resolved \u2192 %s (attempt %d); waiting %d ms grace",
1124
1698
  first,
1125
1699
  attempt,
@@ -2278,48 +2852,633 @@ function scheduleInterval(intervalOrOpts, callback) {
2278
2852
  };
2279
2853
  }
2280
2854
 
2281
- // src/stt/deepgram.ts
2282
- init_esm_shims();
2283
- var STT = class extends DeepgramSTT {
2284
- static providerKey = "deepgram";
2285
- constructor(opts = {}) {
2286
- const key = opts.apiKey ?? process.env.DEEPGRAM_API_KEY;
2287
- if (!key) {
2288
- throw new Error(
2289
- "Deepgram STT requires an apiKey. Pass { apiKey: 'dg_...' } or set DEEPGRAM_API_KEY in the environment."
2290
- );
2291
- }
2292
- super(
2293
- key,
2294
- opts.language ?? "en",
2295
- opts.model ?? "nova-3",
2296
- opts.encoding ?? "linear16",
2297
- opts.sampleRate ?? 16e3,
2298
- {
2299
- endpointingMs: opts.endpointingMs ?? 150,
2300
- utteranceEndMs: opts.utteranceEndMs === null ? null : opts.utteranceEndMs ?? 1e3,
2301
- smartFormat: opts.smartFormat ?? true,
2302
- interimResults: opts.interimResults ?? true,
2303
- ...opts.vadEvents !== void 0 ? { vadEvents: opts.vadEvents } : {}
2304
- }
2305
- );
2306
- }
2307
- };
2308
-
2309
- // src/stt/whisper.ts
2310
- init_esm_shims();
2311
-
2312
- // src/providers/whisper-stt.ts
2855
+ // src/providers/elevenlabs-tts.ts
2313
2856
  init_esm_shims();
2314
- var OPENAI_TRANSCRIPTION_URL = "https://api.openai.com/v1/audio/transcriptions";
2315
- var DEFAULT_BUFFER_SIZE = 16e3 * 2;
2316
- var ALLOWED_MODELS = /* @__PURE__ */ new Set(["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"]);
2317
- function wrapPcmInWav(pcm, sampleRate = 16e3, channels = 1, bitsPerSample = 16) {
2318
- const dataSize = pcm.length;
2319
- const header = Buffer.alloc(44);
2320
- header.write("RIFF", 0);
2321
- header.writeUInt32LE(36 + dataSize, 4);
2322
- header.write("WAVE", 8);
2857
/** Root of the ElevenLabs REST API (v1). */
var ELEVENLABS_BASE_URL = "https://api.elevenlabs.io/v1";

/**
 * Lower-case friendly voice name → ElevenLabs voice ID.
 *
 * Note that `bella`, `sarah` and `alloy` all map to the same ID.
 */
var ELEVENLABS_VOICE_ID_BY_NAME = {
  rachel: "21m00Tcm4TlvDq8ikWAM",
  drew: "29vD33N1CtxCmqQRPOHJ",
  clyde: "2EiwWnXFnvU5JabPnv8n",
  paul: "5Q0t7uMcjvnagumLfvZi",
  domi: "AZnzlk1XvdvUeBnXmlld",
  dave: "CYw3kZ02Hs0563khs1Fj",
  fin: "D38z5RcWu1voky8WS1ja",
  bella: "EXAVITQu4vr4xnSDxMaL",
  antoni: "ErXwobaYiN019PkySvjV",
  thomas: "GBv7mTt0atIp3Br8iCZE",
  charlie: "IKne3meq5aSn9XLyUdCD",
  george: "JBFqnCBsd6RMkjVDRZzb",
  emily: "LcfcDJNUP1GQjkzn1xUU",
  elli: "MF3mGyEYCl7XYWbV9V6O",
  callum: "N2lVS1w4EtoT3dr4eOWO",
  patrick: "ODq5zmih8GrVes37Dizd",
  harry: "SOYHLrjzK2X1ezoPC6cr",
  liam: "TX3LPaxmHKxFdv7VOQHJ",
  dorothy: "ThT5KcBeYPX3keUQqHPh",
  josh: "TxGEqnHWrfWFTfGW9XjX",
  arnold: "VR6AewLTigWG4xSOukaG",
  charlotte: "XB0fDUnXU5powFXDhCwa",
  matilda: "XrExE9yKIg1WjnnlVkGX",
  matthew: "Yko7PKHZNXotIFUBG7I9",
  james: "ZQe5CZNOzWyzPSCn5a3c",
  joseph: "Zlb1dXrM653N07WRdFW3",
  jeremy: "bVMeCyTHy58xNoL34h3p",
  michael: "flq6f7yk4E4fJM5XTYuZ",
  ethan: "g5CIjZEefAph4nQFvHAz",
  gigi: "jBpfuIE2acCO8z3wKNLl",
  freya: "jsCqWAovK2LkecY7zXl4",
  brian: "nPczCjzI2devNBz1zQrb",
  grace: "oWAxZDx7w5VEj9dCyTzz",
  daniel: "onwK4e9ZLuTAKqWW03F9",
  lily: "pFZP5JQG7iQjIQuC4Bku",
  serena: "pMsXgVXv3BLzUgSXRplE",
  adam: "pNInz6obpgDQGcFmaJgB",
  nicole: "piTKgcLEGmPE4e6mEKli",
  bill: "pqHfZKP75CvOlQylNhV4",
  jessie: "t0jbNlBVZ17f02VDIeMI",
  ryan: "wViXBPUzp2ZZixB1xQuM",
  sam: "yoZ06aMxZJJ28mfd3POQ",
  glinda: "z9fAnlkpzviPz146aGWa",
  giovanni: "zcAOhNBS3c14rBihAFp1",
  mimi: "zrHiDhphv9ZnVXBqCLjz",
  sarah: "EXAVITQu4vr4xnSDxMaL",
  alloy: "EXAVITQu4vr4xnSDxMaL"
};

/** Shape of a raw ElevenLabs voice ID: exactly 20 ASCII alphanumerics. */
var VOICE_ID_PATTERN = /^[A-Za-z0-9]{20}$/;

/**
 * Turn a friendly voice name into an ElevenLabs voice ID.
 *
 * - Falsy input is returned untouched.
 * - Anything already shaped like a voice ID is returned untouched.
 * - Otherwise the name is looked up case-insensitively in
 *   `ELEVENLABS_VOICE_ID_BY_NAME`; unknown names are returned as given.
 *
 * The lookup is restricted to the map's OWN keys so that names such as
 * "constructor" or "toString" cannot resolve to members inherited from
 * `Object.prototype` (which would hand a function back as the voice ID).
 *
 * @param voice Friendly name or raw voice ID.
 * @returns The resolved voice ID, or `voice` itself when nothing matches.
 */
function resolveVoiceId(voice) {
  if (!voice || VOICE_ID_PATTERN.test(voice)) return voice;
  const alias = voice.toLowerCase();
  const isKnownAlias = Object.prototype.hasOwnProperty.call(ELEVENLABS_VOICE_ID_BY_NAME, alias);
  return isKnownAlias ? ELEVENLABS_VOICE_ID_BY_NAME[alias] : voice;
}

/** Model identifiers sent as `model_id` in the synthesis request. */
var ElevenLabsModel = {
  V3: "eleven_v3",
  FLASH_V2_5: "eleven_flash_v2_5",
  TURBO_V2_5: "eleven_turbo_v2_5",
  MULTILINGUAL_V2: "eleven_multilingual_v2",
  MONOLINGUAL_V1: "eleven_monolingual_v1"
};

/** Values accepted for the `output_format` query parameter. */
var ElevenLabsOutputFormat = {
  MP3_22050_32: "mp3_22050_32",
  MP3_44100_32: "mp3_44100_32",
  MP3_44100_64: "mp3_44100_64",
  MP3_44100_96: "mp3_44100_96",
  MP3_44100_128: "mp3_44100_128",
  MP3_44100_192: "mp3_44100_192",
  PCM_8000: "pcm_8000",
  PCM_16000: "pcm_16000",
  PCM_22050: "pcm_22050",
  PCM_24000: "pcm_24000",
  PCM_44100: "pcm_44100",
  ULAW_8000: "ulaw_8000"
};

/**
 * ElevenLabs text-to-speech over the streaming REST endpoint.
 *
 * Two constructor shapes are accepted:
 *   - `new ElevenLabsTTS(apiKey, voiceId?, modelId?, outputFormat?)`
 *   - `new ElevenLabsTTS(apiKey, { voiceId, modelId, outputFormat,
 *      voiceSettings, languageCode, chunkSize })`
 */
var ElevenLabsTTS = class _ElevenLabsTTS {
  // Stable pricing/dashboard key — read by stream-handler / metrics via
  // ``(agent.tts.constructor as any).providerKey``. Without it the cost
  // calculator falls back to ``constructor.name`` ("ElevenLabsTTS"), which
  // does NOT match the pricing table key "elevenlabs" and silently zeroes
  // TTS cost for callers that construct this raw REST class directly
  // (exposed at top level as ``ElevenLabsRestTTS``).
  static providerKey = "elevenlabs";
  apiKey;
  voiceId;
  modelId;
  _outputFormat;
  _outputFormatExplicit;
  voiceSettings;
  languageCode;
  chunkSize;

  /**
   * Current wire format, possibly auto-flipped by `setTelephonyCarrier`.
   * Read by the stream-handler to decide whether the client-side
   * resample + μ-law encode can be skipped.
   */
  get outputFormat() {
    return this._outputFormat;
  }

  /**
   * @param apiKey ElevenLabs API key, sent as the `xi-api-key` header.
   * @param voiceIdOrOptions Voice ID / friendly name, or an options object.
   * @param modelId Model for the positional form.
   * @param outputFormat Output format for the positional form. The default
   *   value (`pcm_16000`) is treated as "not explicitly chosen", so
   *   `setTelephonyCarrier` may still override it.
   * @throws RangeError when `chunkSize` is not a finite number >= 1.
   */
  constructor(apiKey, voiceIdOrOptions = "21m00Tcm4TlvDq8ikWAM", modelId = ElevenLabsModel.FLASH_V2_5, outputFormat = ElevenLabsOutputFormat.PCM_16000) {
    this.apiKey = apiKey;
    if (typeof voiceIdOrOptions === "object") {
      const o = voiceIdOrOptions;
      const chunkSize = o.chunkSize ?? 4096;
      // synthesizeStream advances its slice offset by chunkSize. Zero or a
      // negative step never reaches the end of the buffer (endless stream of
      // empty chunks); NaN / non-numbers corrupt the slicing. Fail fast.
      if (typeof chunkSize !== "number" || !Number.isFinite(chunkSize) || chunkSize < 1) {
        throw new RangeError(
          `ElevenLabsTTS: chunkSize must be a finite number >= 1 (got ${String(chunkSize)})`
        );
      }
      this.voiceId = resolveVoiceId(o.voiceId ?? "21m00Tcm4TlvDq8ikWAM");
      this.modelId = o.modelId ?? ElevenLabsModel.FLASH_V2_5;
      this._outputFormatExplicit = o.outputFormat !== void 0;
      this._outputFormat = o.outputFormat ?? ElevenLabsOutputFormat.PCM_16000;
      this.voiceSettings = o.voiceSettings;
      this.languageCode = o.languageCode;
      this.chunkSize = chunkSize;
    } else {
      this.voiceId = resolveVoiceId(voiceIdOrOptions);
      this.modelId = modelId;
      this._outputFormatExplicit = outputFormat !== ElevenLabsOutputFormat.PCM_16000;
      this._outputFormat = outputFormat;
      this.voiceSettings = void 0;
      this.languageCode = void 0;
      this.chunkSize = 4096;
    }
  }

  /**
   * Hook called by ``StreamHandler.initPipeline`` to advise the carrier
   * wire format. When no explicit ``outputFormat`` was supplied the format
   * is switched to the carrier's native codec (`ulaw_8000` for Twilio,
   * `pcm_16000` for Telnyx) so the bytes ElevenLabs returns need no
   * client-side resample or PCM → μ-law encode.
   *
   * No-op when the caller pinned an ``outputFormat`` (including through
   * the ``forTwilio`` / ``forTelnyx`` factories) — the user wins. Unknown
   * carriers leave the format untouched.
   *
   * Parity with {@link ElevenLabsWebSocketTTS.setTelephonyCarrier}.
   */
  setTelephonyCarrier(carrier) {
    if (this._outputFormatExplicit) return;
    switch (carrier) {
      case "twilio":
        this._outputFormat = ElevenLabsOutputFormat.ULAW_8000;
        break;
      case "telnyx":
        this._outputFormat = ElevenLabsOutputFormat.PCM_16000;
        break;
      default:
        break;
    }
  }

  /**
   * Build an instance pre-configured for Twilio Media Streams.
   *
   * Forces `outputFormat='ulaw_8000'` so ElevenLabs emits μ-law @ 8 kHz,
   * the format Twilio's media stream uses. When `voiceSettings` is not
   * supplied a telephony-oriented profile is used (speaker boost off,
   * stability 0.6, similarity boost 0.75).
   */
  static forTwilio(apiKey, options = {}) {
    // Speaker boost adds high-frequency emphasis that aliases badly over an
    // 8 kHz μ-law line; slightly higher stability tames excursions that
    // compander quantization noise can amplify.
    const telephonyProfile = {
      stability: 0.6,
      similarity_boost: 0.75,
      use_speaker_boost: false
    };
    return new _ElevenLabsTTS(apiKey, {
      ...options,
      voiceSettings: options.voiceSettings ?? telephonyProfile,
      outputFormat: ElevenLabsOutputFormat.ULAW_8000
    });
  }

  /**
   * Build an instance pre-configured for Telnyx bidirectional media.
   *
   * Forces `outputFormat='pcm_16000'`. If the Telnyx profile is pinned to
   * PCMU/8000 (μ-law), construct `ElevenLabsTTS` directly with
   * `outputFormat: 'ulaw_8000'` instead.
   */
  static forTelnyx(apiKey, options = {}) {
    return new _ElevenLabsTTS(apiKey, {
      ...options,
      outputFormat: ElevenLabsOutputFormat.PCM_16000
    });
  }

  /**
   * Synthesise `text` and return the whole audio as one Buffer.
   * Prefer `synthesizeStream` when latency matters.
   */
  async synthesize(text) {
    const parts = [];
    for await (const part of this.synthesizeStream(text)) {
      parts.push(part);
    }
    return Buffer.concat(parts);
  }

  /**
   * Synthesise `text` and yield audio as it arrives.
   *
   * Buffers are in the configured `outputFormat`; each yielded buffer is
   * at most `chunkSize` bytes. The request is aborted after 30 s.
   *
   * @throws Error on a non-2xx response (status and body included) or
   *   when the response carries no body.
   */
  async *synthesizeStream(text) {
    const voice = encodeURIComponent(this.voiceId);
    const format = encodeURIComponent(this._outputFormat);
    const url = `${ELEVENLABS_BASE_URL}/text-to-speech/${voice}/stream?output_format=${format}`;
    const body = {
      text,
      model_id: this.modelId
    };
    if (this.voiceSettings) body["voice_settings"] = this.voiceSettings;
    if (this.languageCode) body["language_code"] = this.languageCode;
    const response = await fetch(url, {
      method: "POST",
      headers: {
        "xi-api-key": this.apiKey,
        "Content-Type": "application/json"
      },
      body: JSON.stringify(body),
      signal: AbortSignal.timeout(3e4)
    });
    if (!response.ok) {
      const errBody = await response.text();
      throw new Error(`ElevenLabs TTS error ${response.status}: ${errBody}`);
    }
    if (!response.body) {
      throw new Error("ElevenLabs TTS: no response body");
    }
    const reader = response.body.getReader();
    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        if (!value || value.length === 0) continue;
        // Re-slice each network read into pieces of at most chunkSize bytes.
        const bytes = Buffer.from(value);
        for (let start = 0; start < bytes.length; start += this.chunkSize) {
          yield bytes.subarray(start, Math.min(start + this.chunkSize, bytes.length));
        }
      }
    } finally {
      // Runs on normal completion, on error, and when the consumer stops
      // iterating early: stop the download and release the stream lock.
      if (typeof reader.cancel === "function") await reader.cancel().catch(() => {
      });
      reader.releaseLock();
    }
  }
};
3108
+
3109
+ // src/providers/cartesia-tts.ts
3110
+ init_esm_shims();
3111
/** Default root of the Cartesia REST API. */
var CARTESIA_BASE_URL = "https://api.cartesia.ai";
/** Value sent in the `Cartesia-Version` header unless overridden. */
var CARTESIA_API_VERSION = "2025-04-16";
/** Voice used when the caller does not pick one. */
var CARTESIA_DEFAULT_VOICE_ID = "f786b574-daa5-4673-aa0c-cbe3e8534c02";

/** Model identifiers sent as `model_id`. */
var CartesiaTTSModel = {
  SONIC_3: "sonic-3",
  SONIC_2: "sonic-2",
  SONIC: "sonic"
};
/** Values for `output_format.container`. */
var CartesiaTTSContainer = {
  RAW: "raw",
  WAV: "wav",
  MP3: "mp3"
};
/** Values for `output_format.encoding`. */
var CartesiaTTSEncoding = {
  PCM_S16LE: "pcm_s16le",
  PCM_F32LE: "pcm_f32le",
  PCM_MULAW: "pcm_mulaw",
  PCM_ALAW: "pcm_alaw"
};
/** Sample rates (Hz) for `output_format.sample_rate`. */
var CartesiaTTSSampleRate = {
  HZ_8000: 8e3,
  HZ_16000: 16e3,
  HZ_22050: 22050,
  HZ_24000: 24e3,
  HZ_44100: 44100
};
/** Values for `voice.mode`. */
var CartesiaTTSVoiceMode = {
  ID: "id",
  EMBEDDING: "embedding"
};

/**
 * Cartesia text-to-speech over the HTTP `/tts/bytes` endpoint.
 * Always requests raw PCM_S16LE at the configured `sampleRate`.
 */
var CartesiaTTS = class _CartesiaTTS {
  /** Stable pricing/dashboard key — read by stream-handler/metrics. */
  static providerKey = "cartesia_tts";
  apiKey;
  model;
  voice;
  language;
  sampleRate;
  speed;
  emotion;
  volume;
  baseUrl;
  apiVersion;

  /**
   * @param apiKey Cartesia API key, sent as the `X-API-Key` header.
   * @param opts Optional overrides: `model`, `voice`, `language`,
   *   `sampleRate`, `speed`, `emotion` (string or array), `volume`,
   *   `baseUrl`, `apiVersion`.
   */
  constructor(apiKey, opts = {}) {
    this.apiKey = apiKey;
    this.model = opts.model ?? CartesiaTTSModel.SONIC_3;
    this.voice = opts.voice ?? CARTESIA_DEFAULT_VOICE_ID;
    this.language = opts.language ?? "en";
    this.sampleRate = opts.sampleRate ?? CartesiaTTSSampleRate.HZ_16000;
    this.speed = opts.speed;
    // A single emotion string is normalised to a one-element list.
    this.emotion = typeof opts.emotion === "string" ? [opts.emotion] : opts.emotion;
    this.volume = opts.volume;
    this.baseUrl = opts.baseUrl ?? CARTESIA_BASE_URL;
    this.apiVersion = opts.apiVersion ?? CARTESIA_API_VERSION;
  }

  /**
   * Build an instance pre-configured for Twilio Media Streams.
   *
   * Forces `sampleRate=8000` so Cartesia emits PCM_S16LE @ 8 kHz. Twilio
   * uses μ-law @ 8 kHz, so the PCM → μ-law transcode still happens
   * client-side, but the 16 kHz → 8 kHz resample is skipped.
   */
  static forTwilio(apiKey, options = {}) {
    return new _CartesiaTTS(apiKey, {
      ...options,
      sampleRate: CartesiaTTSSampleRate.HZ_8000
    });
  }

  /**
   * Build an instance pre-configured for Telnyx bidirectional media.
   *
   * Forces `sampleRate=16000`, which equals the bare-constructor default;
   * provided for API symmetry with {@link CartesiaTTS.forTwilio}.
   */
  static forTelnyx(apiKey, options = {}) {
    return new _CartesiaTTS(apiKey, {
      ...options,
      sampleRate: CartesiaTTSSampleRate.HZ_16000
    });
  }

  /**
   * Build the JSON payload for the Cartesia bytes endpoint.
   *
   * `generation_config` is attached only when at least one of `speed`,
   * `emotion`, `volume` is set; only the FIRST emotion is forwarded.
   */
  buildPayload(text) {
    const payload = {
      model_id: this.model,
      voice: { mode: CartesiaTTSVoiceMode.ID, id: this.voice },
      transcript: text,
      output_format: {
        container: CartesiaTTSContainer.RAW,
        encoding: CartesiaTTSEncoding.PCM_S16LE,
        sample_rate: this.sampleRate
      },
      language: this.language
    };
    const generationConfig = {};
    if (this.speed !== void 0) generationConfig.speed = this.speed;
    if (this.emotion && this.emotion.length > 0) {
      generationConfig.emotion = this.emotion[0];
    }
    if (this.volume !== void 0) generationConfig.volume = this.volume;
    if (Object.keys(generationConfig).length > 0) {
      payload.generation_config = generationConfig;
    }
    return payload;
  }

  /**
   * Pre-call HTTP warmup for the Cartesia `/tts/bytes` endpoint.
   *
   * Issues `GET <baseUrl>/voices` so connection setup has already
   * happened before the first `synthesizeStream()` POST. Best-effort:
   * 5 s timeout, every failure is swallowed and logged at debug level.
   *
   * The response body is read to completion and discarded. An unconsumed
   * fetch body keeps its connection checked out until garbage collection,
   * which would defeat the purpose of warming a reusable connection.
   *
   * Billing note (from the upstream comment, not verifiable here):
   * `GET /voices` is described as a free metadata read.
   */
  async warmup() {
    try {
      const response = await fetch(`${this.baseUrl}/voices`, {
        method: "GET",
        headers: {
          "X-API-Key": this.apiKey,
          "Cartesia-Version": this.apiVersion
        },
        signal: AbortSignal.timeout(5e3)
      });
      // Drain so the socket can return to the keep-alive pool. Guarded so a
      // minimal fetch stub without arrayBuffer() keeps working.
      if (response && typeof response.arrayBuffer === "function") {
        await response.arrayBuffer();
      }
    } catch (err) {
      getLogger().debug(`Cartesia TTS warmup failed (best-effort): ${String(err)}`);
    }
  }

  /** Synthesize text and return the concatenated audio buffer. */
  async synthesize(text) {
    const parts = [];
    for await (const part of this.synthesizeStream(text)) {
      parts.push(part);
    }
    return Buffer.concat(parts);
  }

  /**
   * Synthesize text and yield raw PCM_S16LE chunks at the configured
   * `sampleRate` as they arrive from Cartesia. Aborted after 30 s.
   *
   * @throws Error on a non-2xx response (status and body included) or
   *   when the response carries no body.
   */
  async *synthesizeStream(text) {
    const response = await fetch(`${this.baseUrl}/tts/bytes`, {
      method: "POST",
      headers: {
        "X-API-Key": this.apiKey,
        "Cartesia-Version": this.apiVersion,
        "Content-Type": "application/json"
      },
      body: JSON.stringify(this.buildPayload(text)),
      signal: AbortSignal.timeout(3e4)
    });
    if (!response.ok) {
      const body = await response.text();
      throw new Error(`Cartesia TTS error ${response.status}: ${body}`);
    }
    if (!response.body) {
      throw new Error("Cartesia TTS: no response body");
    }
    const reader = response.body.getReader();
    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        if (value && value.length > 0) {
          yield Buffer.from(value);
        }
      }
    } finally {
      // Also runs when the consumer abandons the generator early.
      if (typeof reader.cancel === "function")
        await reader.cancel().catch(() => {
        });
      reader.releaseLock();
    }
  }
};
3297
+
3298
+ // src/providers/rime-tts.ts
3299
+ init_esm_shims();
3300
/** Default Rime synthesis endpoint. */
var RIME_BASE_URL = "https://users.rime.ai/v1/rime-tts";
/** Model identifiers sent as `modelId`. */
var RimeModel = {
  ARCANA: "arcana",
  MIST: "mist",
  MIST_V2: "mistv2"
};
/** MIME types usable in the `accept` header. */
var RimeAudioFormat = {
  PCM: "audio/pcm",
  MP3: "audio/mp3",
  WAV: "audio/wav",
  MULAW: "audio/mulaw"
};
/** Request timeout for the Arcana model: 4 minutes. */
var ARCANA_MODEL_TIMEOUT_MS = 60 * 4 * 1e3;
/** Request timeout for every other model: 30 seconds. */
var MIST_MODEL_TIMEOUT_MS = 30 * 1e3;

/** True when the model name contains "mist" (covers `mist` and `mistv2`). */
function isMistModel(model) {
  return model.includes(RimeModel.MIST);
}

/** Pick the total request timeout (ms) for a model. */
function timeoutForModel(model) {
  return model === RimeModel.ARCANA ? ARCANA_MODEL_TIMEOUT_MS : MIST_MODEL_TIMEOUT_MS;
}

/** Rime text-to-speech over HTTP, requesting `audio/pcm`. */
var RimeTTS = class {
  /** Stable pricing/dashboard key — read by stream-handler/metrics. */
  static providerKey = "rime";
  apiKey;
  model;
  speaker;
  lang;
  sampleRate;
  repetitionPenalty;
  temperature;
  topP;
  maxTokens;
  speedAlpha;
  reduceLatency;
  pauseBetweenBrackets;
  phonemizeBetweenBrackets;
  baseUrl;
  totalTimeoutMs;

  /**
   * @param apiKey Rime API key, sent as a Bearer token.
   * @param opts Optional overrides. The default speaker depends on the
   *   model family: "cove" for mist models, "astra" otherwise.
   */
  constructor(apiKey, opts = {}) {
    this.apiKey = apiKey;
    this.model = opts.model ?? RimeModel.ARCANA;
    const fallbackSpeaker = isMistModel(this.model) ? "cove" : "astra";
    this.speaker = opts.speaker ?? fallbackSpeaker;
    this.lang = opts.lang ?? "eng";
    this.sampleRate = opts.sampleRate ?? 16e3;
    this.repetitionPenalty = opts.repetitionPenalty;
    this.temperature = opts.temperature;
    this.topP = opts.topP;
    this.maxTokens = opts.maxTokens;
    this.speedAlpha = opts.speedAlpha;
    this.reduceLatency = opts.reduceLatency;
    this.pauseBetweenBrackets = opts.pauseBetweenBrackets;
    this.phonemizeBetweenBrackets = opts.phonemizeBetweenBrackets;
    this.baseUrl = opts.baseUrl ?? RIME_BASE_URL;
    this.totalTimeoutMs = timeoutForModel(this.model);
  }

  /**
   * Build the JSON request body.
   *
   * Arcana and mist models accept different tuning fields; a field is
   * included only when it was configured. A model in neither family gets
   * just `speaker`, `text` and `modelId`.
   */
  buildPayload(text) {
    const payload = {
      speaker: this.speaker,
      text,
      modelId: this.model
    };
    // Copy an optional setting onto the payload only when it was provided.
    const putIfSet = (field, setting) => {
      if (setting !== void 0) payload[field] = setting;
    };
    if (this.model === RimeModel.ARCANA) {
      putIfSet("repetition_penalty", this.repetitionPenalty);
      putIfSet("temperature", this.temperature);
      putIfSet("top_p", this.topP);
      putIfSet("max_tokens", this.maxTokens);
      payload.lang = this.lang;
      payload.samplingRate = this.sampleRate;
      return payload;
    }
    if (isMistModel(this.model)) {
      payload.lang = this.lang;
      payload.samplingRate = this.sampleRate;
      putIfSet("speedAlpha", this.speedAlpha);
      // reduceLatency is only forwarded for mistv2.
      if (this.model === RimeModel.MIST_V2) {
        putIfSet("reduceLatency", this.reduceLatency);
      }
      putIfSet("pauseBetweenBrackets", this.pauseBetweenBrackets);
      putIfSet("phonemizeBetweenBrackets", this.phonemizeBetweenBrackets);
    }
    return payload;
  }

  /** Synthesize text and return the concatenated audio buffer. */
  async synthesize(text) {
    const pieces = [];
    for await (const piece of this.synthesizeStream(text)) {
      pieces.push(piece);
    }
    return Buffer.concat(pieces);
  }

  /**
   * Synthesize text and yield raw PCM_S16LE chunks at the configured
   * `sampleRate` as they stream in. The request is aborted after
   * `totalTimeoutMs`.
   *
   * @throws Error on a non-2xx response, on a response whose content type
   *   does not start with "audio", or when the response has no body.
   */
  async *synthesizeStream(text) {
    const requestHeaders = {
      accept: RimeAudioFormat.PCM,
      Authorization: `Bearer ${this.apiKey}`,
      "content-type": "application/json"
    };
    const response = await fetch(this.baseUrl, {
      method: "POST",
      headers: requestHeaders,
      body: JSON.stringify(this.buildPayload(text)),
      signal: AbortSignal.timeout(this.totalTimeoutMs)
    });
    if (!response.ok) {
      const body = await response.text();
      throw new Error(`Rime TTS error ${response.status}: ${body}`);
    }
    // A 2xx answer can still be a non-audio payload; surface it as an error.
    const contentType = response.headers.get("content-type") ?? "";
    if (!contentType.startsWith("audio")) {
      const body = await response.text();
      throw new Error(`Rime returned non-audio response: ${body.slice(0, 500)}`);
    }
    if (!response.body) {
      throw new Error("Rime TTS: no response body");
    }
    const reader = response.body.getReader();
    try {
      for (let step = await reader.read(); !step.done; step = await reader.read()) {
        const bytes = step.value;
        if (bytes && bytes.length > 0) {
          yield Buffer.from(bytes);
        }
      }
    } finally {
      // Also runs when the consumer abandons the generator early.
      if (typeof reader.cancel === "function")
        await reader.cancel().catch(() => {
        });
      reader.releaseLock();
    }
  }
};
3439
+
3440
+ // src/stt/deepgram.ts
3441
+ init_esm_shims();
3442
/**
 * Options-object wrapper around `DeepgramSTT`.
 *
 * Resolves the API key from `opts.apiKey` or the `DEEPGRAM_API_KEY`
 * environment variable and forwards everything else to the positional
 * base-class constructor with these defaults: language "en", model
 * "nova-3", encoding "linear16", sample rate 16000, endpointing 150 ms,
 * utterance end 1000 ms, smart formatting and interim results enabled.
 */
var STT = class extends DeepgramSTT {
  static providerKey = "deepgram";

  /**
   * @param opts Optional settings. Pass `utteranceEndMs: null` to forward
   *   `null` instead of the 1000 ms default. `vadEvents` is forwarded
   *   only when it was provided.
   * @throws Error when no API key is available.
   */
  constructor(opts = {}) {
    const key = opts.apiKey ?? process.env.DEEPGRAM_API_KEY;
    if (!key) {
      throw new Error(
        "Deepgram STT requires an apiKey. Pass { apiKey: 'dg_...' } or set DEEPGRAM_API_KEY in the environment."
      );
    }
    const language = opts.language ?? "en";
    const model = opts.model ?? "nova-3";
    const encoding = opts.encoding ?? "linear16";
    const sampleRate = opts.sampleRate ?? 16e3;
    const tuning = {
      endpointingMs: opts.endpointingMs ?? 150,
      // Only `undefined` falls back to the default; an explicit null is kept.
      utteranceEndMs: opts.utteranceEndMs === void 0 ? 1e3 : opts.utteranceEndMs,
      smartFormat: opts.smartFormat ?? true,
      interimResults: opts.interimResults ?? true
    };
    if (opts.vadEvents !== void 0) {
      tuning.vadEvents = opts.vadEvents;
    }
    super(key, language, model, encoding, sampleRate, tuning);
  }
};
3467
+
3468
+ // src/stt/whisper.ts
3469
+ init_esm_shims();
3470
+
3471
+ // src/providers/whisper-stt.ts
3472
+ init_esm_shims();
3473
+ var OPENAI_TRANSCRIPTION_URL = "https://api.openai.com/v1/audio/transcriptions";
3474
+ var DEFAULT_BUFFER_SIZE = 16e3 * 2;
3475
+ var ALLOWED_MODELS = /* @__PURE__ */ new Set(["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"]);
3476
+ function wrapPcmInWav(pcm, sampleRate = 16e3, channels = 1, bitsPerSample = 16) {
3477
+ const dataSize = pcm.length;
3478
+ const header = Buffer.alloc(44);
3479
+ header.write("RIFF", 0);
3480
+ header.writeUInt32LE(36 + dataSize, 4);
3481
+ header.write("WAVE", 8);
2323
3482
  header.write("fmt ", 12);
2324
3483
  header.writeUInt32LE(16, 16);
2325
3484
  header.writeUInt16LE(1, 20);
@@ -2333,6 +3492,8 @@ function wrapPcmInWav(pcm, sampleRate = 16e3, channels = 1, bitsPerSample = 16)
2333
3492
  return Buffer.concat([header, pcm]);
2334
3493
  }
2335
3494
  var WhisperSTT = class _WhisperSTT {
3495
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
3496
+ static providerKey = "whisper";
2336
3497
  apiKey;
2337
3498
  model;
2338
3499
  language;
@@ -2501,6 +3662,8 @@ init_esm_shims();
2501
3662
  var ALLOWED_MODELS2 = /* @__PURE__ */ new Set(["gpt-4o-transcribe", "gpt-4o-mini-transcribe"]);
2502
3663
  var DEFAULT_BUFFER_SIZE2 = 16e3 * 2;
2503
3664
  var OpenAITranscribeSTT = class extends WhisperSTT {
3665
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
3666
+ static providerKey = "openai_transcribe";
2504
3667
  /**
2505
3668
  * @param apiKey OpenAI API key.
2506
3669
  * @param language ISO-639-1 language code (e.g. ``"en"``, ``"it"``). Optional.
@@ -2576,6 +3739,8 @@ var CartesiaSTT = class {
2576
3739
  }
2577
3740
  apiKey;
2578
3741
  options;
3742
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
3743
+ static providerKey = "cartesia_stt";
2579
3744
  ws = null;
2580
3745
  callbacks = /* @__PURE__ */ new Set();
2581
3746
  keepaliveTimer = null;
@@ -2584,6 +3749,37 @@ var CartesiaSTT = class {
2584
3749
  * `null` until the first transcript event arrives (matches Python's `None`).
2585
3750
  */
2586
3751
  requestId = null;
3752
+ /**
3753
+ * Open a fresh WebSocket without arming any message / keepalive handlers
3754
+ * and without taking ownership on `this.ws`. Returns the OPEN socket so
3755
+ * the caller (the prewarm pipeline) can park it for later adoption via
3756
+ * `adoptWebSocket`. Bounded by `CONNECT_TIMEOUT_MS`.
3757
+ *
3758
+ * Billing safety: opening + parking the WS does not stream audio
3759
+ * (Cartesia STT bills on streamed audio seconds), so no charge is
3760
+ * incurred. Close the returned WS yourself if it is never adopted.
3761
+ */
3762
+ async openParkedConnection() {
3763
+ const url = this.buildWsUrl();
3764
+ const ws = new WebSocket2(url, {
3765
+ headers: { "User-Agent": USER_AGENT }
3766
+ });
3767
+ await new Promise((resolve, reject) => {
3768
+ const timer = setTimeout(
3769
+ () => reject(new Error("Cartesia STT park connect timeout")),
3770
+ CONNECT_TIMEOUT_MS
3771
+ );
3772
+ ws.once("open", () => {
3773
+ clearTimeout(timer);
3774
+ resolve();
3775
+ });
3776
+ ws.once("error", (err) => {
3777
+ clearTimeout(timer);
3778
+ reject(err);
3779
+ });
3780
+ });
3781
+ return ws;
3782
+ }
2587
3783
  buildWsUrl() {
2588
3784
  const opts = this.options;
2589
3785
  const rawBase = opts.baseUrl ?? DEFAULT_BASE_URL;
@@ -2608,6 +3804,57 @@ var CartesiaSTT = class {
2608
3804
  });
2609
3805
  return `${base}/stt/websocket?${params.toString()}`;
2610
3806
  }
3807
+ /**
3808
+ * Pre-call WebSocket warmup for the Cartesia STT `/stt/websocket` endpoint.
3809
+ *
3810
+ * Opens the WS (DNS + TLS + auth handshake), idles ~250 ms so the
3811
+ * Cartesia edge keeps session state warm, then closes. By the time
3812
+ * `connect()` is invoked at call-pickup the resolver and TLS session
3813
+ * are hot — net wire time saving of 200-500 ms.
3814
+ *
3815
+ * Billing safety: Cartesia STT bills on streamed audio seconds (per
3816
+ * https://docs.cartesia.ai/2025-04-16/api-reference/stt/stt). Opening
3817
+ * + closing the WebSocket without forwarding audio does not consume
3818
+ * billable seconds. Best-effort: failures logged at debug level.
3819
+ */
3820
+ async warmup() {
3821
+ const url = this.buildWsUrl();
3822
+ let ws = null;
3823
+ try {
3824
+ ws = await new Promise((resolve, reject) => {
3825
+ const sock = new WebSocket2(url, {
3826
+ headers: { "User-Agent": USER_AGENT }
3827
+ });
3828
+ const timer = setTimeout(() => {
3829
+ try {
3830
+ sock.close();
3831
+ } catch {
3832
+ }
3833
+ reject(new Error("Cartesia STT warmup connect timeout"));
3834
+ }, 5e3);
3835
+ sock.once("open", () => {
3836
+ clearTimeout(timer);
3837
+ resolve(sock);
3838
+ });
3839
+ sock.once("error", (err) => {
3840
+ clearTimeout(timer);
3841
+ reject(err);
3842
+ });
3843
+ });
3844
+ await new Promise((r) => setTimeout(r, 250));
3845
+ } catch (err) {
3846
+ getLogger().debug(
3847
+ `Cartesia STT warmup failed (best-effort): ${describeWarmupError(err)}`
3848
+ );
3849
+ } finally {
3850
+ if (ws) {
3851
+ try {
3852
+ ws.close();
3853
+ } catch {
3854
+ }
3855
+ }
3856
+ }
3857
+ }
2611
3858
  /** Open the streaming WebSocket and arm message + keepalive handlers. */
2612
3859
  async connect() {
2613
3860
  const url = this.buildWsUrl();
@@ -2628,6 +3875,24 @@ var CartesiaSTT = class {
2628
3875
  reject(err);
2629
3876
  });
2630
3877
  });
3878
+ this.armMessageAndKeepalive();
3879
+ }
3880
+ /**
3881
+ * Adopt a pre-opened, already-OPEN WebSocket produced by the prewarm
3882
+ * pipeline (see `Patter.parkProviderConnections`). Skips the fresh
3883
+ * `new WebSocket()` + handshake — the WS is already through DNS, TLS
3884
+ * and HTTP-101 so audio frames can flow on this turn instead of
3885
+ * paying ~150-400 ms of handshake.
3886
+ *
3887
+ * Caller MUST verify `ws.readyState === OPEN` before calling. If the
3888
+ * parked WS died between park and adopt, fall back to `connect()`.
3889
+ */
3890
+ adoptWebSocket(ws) {
3891
+ this.ws = ws;
3892
+ this.armMessageAndKeepalive();
3893
+ }
3894
+ armMessageAndKeepalive() {
3895
+ if (!this.ws) return;
2631
3896
  this.ws.on("message", (raw) => {
2632
3897
  let event;
2633
3898
  try {
@@ -2675,6 +3940,31 @@ var CartesiaSTT = class {
2675
3940
  if (!this.ws || this.ws.readyState !== WebSocket2.OPEN) return;
2676
3941
  this.ws.send(audio);
2677
3942
  }
3943
+ /**
3944
+ * Force Cartesia to finalise the in-flight utterance immediately.
3945
+ *
3946
+ * Sends a ``finalize`` text frame on the live WebSocket. Cartesia
3947
+ * replies with the final transcript followed by ``flush_done``,
3948
+ * bypassing its conservative internal silence heuristic (which can
3949
+ * wait 2-7 s on PSTN audio before naturally finalising). Wired
3950
+ * into ``StreamHandler`` on the VAD ``speech_end`` event so the
3951
+ * SDK's authoritative end-of-speech detection forces an immediate
3952
+ * STT finalisation — turning Cartesia's natural-pause endpointing
3953
+ * into a deterministic VAD-driven one, parity with the Deepgram
3954
+ * fast-path. No-op when the WS isn't open. Parity with Python
3955
+ * ``CartesiaSTT.finalize``.
3956
+ */
3957
+ async finalize() {
3958
+ if (!this.ws || this.ws.readyState !== WebSocket2.OPEN) return;
3959
+ await new Promise((resolve) => {
3960
+ this.ws.send(CartesiaSTTClientFrame.FINALIZE, (err) => {
3961
+ if (err) {
3962
+ getLogger().debug(`Cartesia finalize send failed: ${String(err)}`);
3963
+ }
3964
+ resolve();
3965
+ });
3966
+ });
3967
+ }
2678
3968
  /** Register a transcript listener. */
2679
3969
  onTranscript(callback) {
2680
3970
  this.callbacks.add(callback);
@@ -2748,6 +4038,17 @@ var CartesiaSTT = class {
2748
4038
  }
2749
4039
  }
2750
4040
  };
4041
/**
 * Reduce a warmup failure to a short label for the debug log.
 *
 * The error's free-form `message` is never echoed back; only a three-digit
 * HTTP status or an errno-style code is extracted from it (presumably so
 * request details cannot end up in logs — confirm with the authors).
 *
 * Resolution order:
 *   1. numeric `statusCode`                      -> `HTTP <status>`
 *   2. numeric `code` in the range 100-599       -> `HTTP <code>`
 *   3. message `Unexpected server response: NNN` -> `HTTP NNN`
 *      (the text the `ws` client is understood to use when the upgrade is
 *      answered with a non-101 status)
 *   4. constructor name (unless empty or "Object"), else `name`,
 *      suffixed with ` (<CODE>)` when a string `code` such as
 *      `ECONNREFUSED` is present
 *   5. `"null"` for null, otherwise `typeof err`
 *
 * @param err Whatever was thrown or rejected during warmup.
 * @returns A compact description of the failure.
 */
function describeWarmupError(err) {
  if (err === null) return "null";
  if (typeof err !== "object") return typeof err;
  const e = err;
  if (typeof e.statusCode === "number") return `HTTP ${e.statusCode}`;
  if (typeof e.code === "number" && e.code >= 100 && e.code < 600) return `HTTP ${e.code}`;
  if (typeof e.message === "string") {
    // Anchored match: only the status digits are taken from the message.
    const handshake = /^Unexpected server response: (\d{3})$/.exec(e.message);
    if (handshake) return `HTTP ${handshake[1]}`;
  }
  const ctor = e.constructor?.name;
  let label = "object";
  if (typeof ctor === "string" && ctor !== "" && ctor !== "Object") {
    label = ctor;
  } else if (typeof e.name === "string") {
    label = e.name;
  }
  // Node system errors expose a string code (ECONNREFUSED, ENOTFOUND, ...);
  // without it every network failure would read as a bare "Error".
  if (typeof e.code === "string" && /^[A-Z][A-Z0-9_]*$/.test(e.code)) {
    return `${label} (${e.code})`;
  }
  return label;
}
2751
4052
 
2752
4053
  // src/stt/cartesia.ts
2753
4054
  var STT4 = class extends CartesiaSTT {
@@ -2826,6 +4127,8 @@ var TokenAccumulator = class {
2826
4127
  }
2827
4128
  };
2828
4129
  var SonioxSTT = class _SonioxSTT {
4130
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
4131
+ static providerKey = "soniox";
2829
4132
  ws = null;
2830
4133
  callbacks = [];
2831
4134
  final = new TokenAccumulator();
@@ -3103,6 +4406,8 @@ var AssemblyAISTT = class _AssemblyAISTT {
3103
4406
  }
3104
4407
  apiKey;
3105
4408
  options;
4409
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
4410
+ static providerKey = "assemblyai";
3106
4411
  ws = null;
3107
4412
  callbacks = /* @__PURE__ */ new Set();
3108
4413
  closing = false;
@@ -3192,6 +4497,62 @@ var AssemblyAISTT = class _AssemblyAISTT {
3192
4497
  }
3193
4498
  return headers;
3194
4499
  }
4500
+ /**
4501
+ * Pre-call WebSocket warmup for the AssemblyAI v3 `/v3/ws` endpoint.
4502
+ *
4503
+ * Opens the WS (DNS + TLS + auth handshake), idles ~250 ms so the
4504
+ * AssemblyAI edge keeps the session state warm, then sends Terminate
4505
+ * and closes. By the time `connect()` is invoked at call-pickup the
4506
+ * resolver and TLS session are hot — net wire time saving of
4507
+ * 200-500 ms.
4508
+ *
4509
+ * Billing safety: AssemblyAI Universal Streaming bills on streamed
4510
+ * audio seconds (per https://www.assemblyai.com/pricing). Opening +
4511
+ * closing the WebSocket without forwarding any audio frames does
4512
+ * not consume billable seconds. Best-effort: failures logged at
4513
+ * debug level.
4514
+ */
4515
+ async warmup() {
4516
+ const url = this.buildUrl();
4517
+ const headers = this.buildHeaders();
4518
+ let ws = null;
4519
+ try {
4520
+ ws = await new Promise((resolve, reject) => {
4521
+ const sock = new WebSocket4(url, { headers });
4522
+ const timer = setTimeout(() => {
4523
+ try {
4524
+ sock.close();
4525
+ } catch {
4526
+ }
4527
+ reject(new Error("AssemblyAI STT warmup connect timeout"));
4528
+ }, 5e3);
4529
+ sock.once("open", () => {
4530
+ clearTimeout(timer);
4531
+ resolve(sock);
4532
+ });
4533
+ sock.once("error", (err) => {
4534
+ clearTimeout(timer);
4535
+ reject(err);
4536
+ });
4537
+ });
4538
+ await new Promise((r) => setTimeout(r, 250));
4539
+ try {
4540
+ ws.send(JSON.stringify({ type: AssemblyAIClientFrame.TERMINATE }));
4541
+ } catch {
4542
+ }
4543
+ } catch (err) {
4544
+ getLogger().debug(
4545
+ `AssemblyAI STT warmup failed (best-effort): ${describeWarmupError2(err)}`
4546
+ );
4547
+ } finally {
4548
+ if (ws) {
4549
+ try {
4550
+ ws.close();
4551
+ } catch {
4552
+ }
4553
+ }
4554
+ }
4555
+ }
3195
4556
  /** Open the streaming WebSocket and arm message handlers. */
3196
4557
  async connect() {
3197
4558
  this.closing = false;
@@ -3420,6 +4781,17 @@ function averageConfidence(words) {
3420
4781
  }
3421
4782
  return total / words.length;
3422
4783
  }
4784
/**
 * Reduce a warmup failure to a short label for the debug log.
 *
 * The error's free-form `message` is never echoed back; only a three-digit
 * HTTP status or an errno-style code is extracted from it (presumably so
 * request details cannot end up in logs — confirm with the authors).
 *
 * Resolution order:
 *   1. numeric `statusCode`                      -> `HTTP <status>`
 *   2. numeric `code` in the range 100-599       -> `HTTP <code>`
 *   3. message `Unexpected server response: NNN` -> `HTTP NNN`
 *      (the text the `ws` client is understood to use when the upgrade is
 *      answered with a non-101 status)
 *   4. constructor name (unless empty or "Object"), else `name`,
 *      suffixed with ` (<CODE>)` when a string `code` such as
 *      `ECONNREFUSED` is present
 *   5. `"null"` for null, otherwise `typeof err`
 *
 * @param err Whatever was thrown or rejected during warmup.
 * @returns A compact description of the failure.
 */
function describeWarmupError2(err) {
  if (err === null) return "null";
  if (typeof err !== "object") return typeof err;
  const e = err;
  if (typeof e.statusCode === "number") return `HTTP ${e.statusCode}`;
  if (typeof e.code === "number" && e.code >= 100 && e.code < 600) return `HTTP ${e.code}`;
  if (typeof e.message === "string") {
    // Anchored match: only the status digits are taken from the message.
    const handshake = /^Unexpected server response: (\d{3})$/.exec(e.message);
    if (handshake) return `HTTP ${handshake[1]}`;
  }
  const ctor = e.constructor?.name;
  let label = "object";
  if (typeof ctor === "string" && ctor !== "" && ctor !== "Object") {
    label = ctor;
  } else if (typeof e.name === "string") {
    label = e.name;
  }
  // Node system errors expose a string code (ECONNREFUSED, ENOTFOUND, ...);
  // without it every network failure would read as a bare "Error".
  if (typeof e.code === "string" && /^[A-Z][A-Z0-9_]*$/.test(e.code)) {
    return `${label} (${e.code})`;
  }
  return label;
}
3423
4795
 
3424
4796
  // src/stt/assemblyai.ts
3425
4797
  var STT6 = class extends AssemblyAISTT {
@@ -3476,6 +4848,8 @@ var SpeechmaticsServerMessage = {
3476
4848
  ERROR: "Error"
3477
4849
  };
3478
4850
  var SpeechmaticsSTT = class {
4851
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
4852
+ static providerKey = "speechmatics";
3479
4853
  ws = null;
3480
4854
  transcriptCallbacks = /* @__PURE__ */ new Set();
3481
4855
  errorCallbacks = /* @__PURE__ */ new Set();
@@ -3729,275 +5103,60 @@ var SpeechmaticsSTT = class {
3729
5103
  emitError(err) {
3730
5104
  for (const cb of this.errorCallbacks) {
3731
5105
  try {
3732
- cb(err);
3733
- } catch (cbErr) {
3734
- getLogger().error(`SpeechmaticsSTT error callback threw: ${String(cbErr)}`);
3735
- }
3736
- }
3737
- }
3738
- handleError(err) {
3739
- getLogger().error(`SpeechmaticsSTT WebSocket error: ${err.message}`);
3740
- this.emitError(err);
3741
- }
3742
- handleClose() {
3743
- if (!this.running) return;
3744
- this.running = false;
3745
- }
3746
- /** Send `EndOfStream` and close the WebSocket. Idempotent. */
3747
- close() {
3748
- this.running = false;
3749
- const ws = this.ws;
3750
- if (!ws) return;
3751
- this.ws = null;
3752
- const sendSafe = (payload) => {
3753
- if (ws.readyState === WebSocket5.OPEN) {
3754
- try {
3755
- ws.send(payload);
3756
- } catch {
3757
- }
3758
- }
3759
- };
3760
- sendSafe(
3761
- JSON.stringify({ message: "EndOfStream", last_seq_no: this.lastSeqNo })
3762
- );
3763
- try {
3764
- ws.close();
3765
- } catch {
3766
- }
3767
- }
3768
- };
3769
-
3770
- // src/stt/speechmatics.ts
3771
- var STT7 = class extends SpeechmaticsSTT {
3772
- static providerKey = "speechmatics";
3773
- constructor(opts = {}) {
3774
- const key = opts.apiKey ?? process.env.SPEECHMATICS_API_KEY;
3775
- if (!key) {
3776
- throw new Error(
3777
- "Speechmatics STT requires an apiKey. Pass { apiKey: 'sm_...' } or set SPEECHMATICS_API_KEY in the environment."
3778
- );
3779
- }
3780
- super(key, opts);
3781
- }
3782
- };
3783
-
3784
- // src/tts/elevenlabs.ts
3785
- init_esm_shims();
3786
-
3787
- // src/providers/elevenlabs-tts.ts
3788
- init_esm_shims();
3789
- var ELEVENLABS_BASE_URL = "https://api.elevenlabs.io/v1";
3790
- var ELEVENLABS_VOICE_ID_BY_NAME = {
3791
- rachel: "21m00Tcm4TlvDq8ikWAM",
3792
- drew: "29vD33N1CtxCmqQRPOHJ",
3793
- clyde: "2EiwWnXFnvU5JabPnv8n",
3794
- paul: "5Q0t7uMcjvnagumLfvZi",
3795
- domi: "AZnzlk1XvdvUeBnXmlld",
3796
- dave: "CYw3kZ02Hs0563khs1Fj",
3797
- fin: "D38z5RcWu1voky8WS1ja",
3798
- bella: "EXAVITQu4vr4xnSDxMaL",
3799
- antoni: "ErXwobaYiN019PkySvjV",
3800
- thomas: "GBv7mTt0atIp3Br8iCZE",
3801
- charlie: "IKne3meq5aSn9XLyUdCD",
3802
- george: "JBFqnCBsd6RMkjVDRZzb",
3803
- emily: "LcfcDJNUP1GQjkzn1xUU",
3804
- elli: "MF3mGyEYCl7XYWbV9V6O",
3805
- callum: "N2lVS1w4EtoT3dr4eOWO",
3806
- patrick: "ODq5zmih8GrVes37Dizd",
3807
- harry: "SOYHLrjzK2X1ezoPC6cr",
3808
- liam: "TX3LPaxmHKxFdv7VOQHJ",
3809
- dorothy: "ThT5KcBeYPX3keUQqHPh",
3810
- josh: "TxGEqnHWrfWFTfGW9XjX",
3811
- arnold: "VR6AewLTigWG4xSOukaG",
3812
- charlotte: "XB0fDUnXU5powFXDhCwa",
3813
- matilda: "XrExE9yKIg1WjnnlVkGX",
3814
- matthew: "Yko7PKHZNXotIFUBG7I9",
3815
- james: "ZQe5CZNOzWyzPSCn5a3c",
3816
- joseph: "Zlb1dXrM653N07WRdFW3",
3817
- jeremy: "bVMeCyTHy58xNoL34h3p",
3818
- michael: "flq6f7yk4E4fJM5XTYuZ",
3819
- ethan: "g5CIjZEefAph4nQFvHAz",
3820
- gigi: "jBpfuIE2acCO8z3wKNLl",
3821
- freya: "jsCqWAovK2LkecY7zXl4",
3822
- brian: "nPczCjzI2devNBz1zQrb",
3823
- grace: "oWAxZDx7w5VEj9dCyTzz",
3824
- daniel: "onwK4e9ZLuTAKqWW03F9",
3825
- lily: "pFZP5JQG7iQjIQuC4Bku",
3826
- serena: "pMsXgVXv3BLzUgSXRplE",
3827
- adam: "pNInz6obpgDQGcFmaJgB",
3828
- nicole: "piTKgcLEGmPE4e6mEKli",
3829
- bill: "pqHfZKP75CvOlQylNhV4",
3830
- jessie: "t0jbNlBVZ17f02VDIeMI",
3831
- ryan: "wViXBPUzp2ZZixB1xQuM",
3832
- sam: "yoZ06aMxZJJ28mfd3POQ",
3833
- glinda: "z9fAnlkpzviPz146aGWa",
3834
- giovanni: "zcAOhNBS3c14rBihAFp1",
3835
- mimi: "zrHiDhphv9ZnVXBqCLjz",
3836
- sarah: "EXAVITQu4vr4xnSDxMaL",
3837
- alloy: "EXAVITQu4vr4xnSDxMaL"
3838
- };
3839
- var VOICE_ID_PATTERN = /^[A-Za-z0-9]{20}$/;
3840
- function resolveVoiceId(voice) {
3841
- if (!voice) return voice;
3842
- if (VOICE_ID_PATTERN.test(voice)) return voice;
3843
- return ELEVENLABS_VOICE_ID_BY_NAME[voice.toLowerCase()] ?? voice;
3844
- }
3845
- var ElevenLabsModel = {
3846
- V3: "eleven_v3",
3847
- FLASH_V2_5: "eleven_flash_v2_5",
3848
- TURBO_V2_5: "eleven_turbo_v2_5",
3849
- MULTILINGUAL_V2: "eleven_multilingual_v2",
3850
- MONOLINGUAL_V1: "eleven_monolingual_v1"
3851
- };
3852
- var ElevenLabsOutputFormat = {
3853
- MP3_22050_32: "mp3_22050_32",
3854
- MP3_44100_32: "mp3_44100_32",
3855
- MP3_44100_64: "mp3_44100_64",
3856
- MP3_44100_96: "mp3_44100_96",
3857
- MP3_44100_128: "mp3_44100_128",
3858
- MP3_44100_192: "mp3_44100_192",
3859
- PCM_8000: "pcm_8000",
3860
- PCM_16000: "pcm_16000",
3861
- PCM_22050: "pcm_22050",
3862
- PCM_24000: "pcm_24000",
3863
- PCM_44100: "pcm_44100",
3864
- ULAW_8000: "ulaw_8000"
3865
- };
3866
- var ElevenLabsTTS = class _ElevenLabsTTS {
3867
- apiKey;
3868
- voiceId;
3869
- modelId;
3870
- outputFormat;
3871
- voiceSettings;
3872
- languageCode;
3873
- chunkSize;
3874
- constructor(apiKey, voiceIdOrOptions = "21m00Tcm4TlvDq8ikWAM", modelId = ElevenLabsModel.FLASH_V2_5, outputFormat = ElevenLabsOutputFormat.PCM_16000) {
3875
- this.apiKey = apiKey;
3876
- if (typeof voiceIdOrOptions === "object") {
3877
- const o = voiceIdOrOptions;
3878
- this.voiceId = resolveVoiceId(o.voiceId ?? "21m00Tcm4TlvDq8ikWAM");
3879
- this.modelId = o.modelId ?? ElevenLabsModel.FLASH_V2_5;
3880
- this.outputFormat = o.outputFormat ?? ElevenLabsOutputFormat.PCM_16000;
3881
- this.voiceSettings = o.voiceSettings;
3882
- this.languageCode = o.languageCode;
3883
- this.chunkSize = o.chunkSize ?? 4096;
3884
- } else {
3885
- this.voiceId = resolveVoiceId(voiceIdOrOptions);
3886
- this.modelId = modelId;
3887
- this.outputFormat = outputFormat;
3888
- this.voiceSettings = void 0;
3889
- this.languageCode = void 0;
3890
- this.chunkSize = 4096;
3891
- }
3892
- }
3893
- /**
3894
- * Construct an instance pre-configured for Twilio Media Streams.
3895
- *
3896
- * Sets `outputFormat='ulaw_8000'` so ElevenLabs emits μ-law @ 8 kHz
3897
- * directly — the exact wire format Twilio's media stream uses — letting
3898
- * the SDK skip the 16 kHz→8 kHz resample and PCM→μ-law conversion in
3899
- * `TwilioAudioSender`. Saves ~30–80 ms first-byte and per-frame CPU,
3900
- * and removes a potential aliasing source.
3901
- *
3902
- * `voiceSettings` defaults to a low-bandwidth-friendly profile
3903
- * (speaker boost off, modest stability) which sounds cleaner at 8 kHz
3904
- * μ-law than the studio default. Pass an explicit object to override.
3905
- */
3906
- static forTwilio(apiKey, options = {}) {
3907
- const voiceSettings = options.voiceSettings ?? {
3908
- // Speaker boost adds high-frequency emphasis that aliases ugly over an
3909
- // 8 kHz μ-law line. Slightly higher stability tames the excursions
3910
- // that compander quantization noise can amplify.
3911
- stability: 0.6,
3912
- similarity_boost: 0.75,
3913
- use_speaker_boost: false
3914
- };
3915
- return new _ElevenLabsTTS(apiKey, {
3916
- ...options,
3917
- voiceSettings,
3918
- outputFormat: ElevenLabsOutputFormat.ULAW_8000
3919
- });
3920
- }
3921
- /**
3922
- * Construct an instance pre-configured for Telnyx bidirectional media.
3923
- *
3924
- * Telnyx's default media-streaming codec is L16 PCM @ 16 kHz, which
3925
- * matches our default Telnyx handler. We pick `pcm_16000` so the audio
3926
- * flows end-to-end with zero resampling or transcoding.
3927
- *
3928
- * Trade-off: if your Telnyx profile is pinned to PCMU/8000 (μ-law),
3929
- * construct `ElevenLabsTTS` directly with `outputFormat: 'ulaw_8000'`
3930
- * — Telnyx supports that natively too.
3931
- */
3932
- static forTelnyx(apiKey, options = {}) {
3933
- return new _ElevenLabsTTS(apiKey, {
3934
- ...options,
3935
- outputFormat: ElevenLabsOutputFormat.PCM_16000
3936
- });
3937
- }
3938
- /**
3939
- * Synthesise text to speech and return the full audio as a single Buffer.
3940
- *
3941
- * For large chunks (or when latency matters) call `synthesizeStream` instead.
3942
- */
3943
- async synthesize(text) {
3944
- const chunks = [];
3945
- for await (const chunk of this.synthesizeStream(text)) {
3946
- chunks.push(chunk);
3947
- }
3948
- return Buffer.concat(chunks);
3949
- }
3950
- /**
3951
- * Synthesise text and yield audio chunks as they arrive (streaming).
3952
- *
3953
- * The yielded buffers are raw PCM at 16 kHz (or whatever `outputFormat` is
3954
- * configured to). `chunkSize` controls the maximum yield size — 512 is a
3955
- * good choice for low-latency telephony.
3956
- */
3957
- async *synthesizeStream(text) {
3958
- const url = `${ELEVENLABS_BASE_URL}/text-to-speech/${encodeURIComponent(this.voiceId)}/stream?output_format=${encodeURIComponent(this.outputFormat)}`;
3959
- const body = {
3960
- text,
3961
- model_id: this.modelId
3962
- };
3963
- if (this.voiceSettings) body["voice_settings"] = this.voiceSettings;
3964
- if (this.languageCode) body["language_code"] = this.languageCode;
3965
- const response = await fetch(url, {
3966
- method: "POST",
3967
- headers: {
3968
- "xi-api-key": this.apiKey,
3969
- "Content-Type": "application/json"
3970
- },
3971
- body: JSON.stringify(body),
3972
- signal: AbortSignal.timeout(3e4)
3973
- });
3974
- if (!response.ok) {
3975
- const errBody = await response.text();
3976
- throw new Error(`ElevenLabs TTS error ${response.status}: ${errBody}`);
3977
- }
3978
- if (!response.body) {
3979
- throw new Error("ElevenLabs TTS: no response body");
3980
- }
3981
- const reader = response.body.getReader();
3982
- try {
3983
- while (true) {
3984
- const { done, value } = await reader.read();
3985
- if (done) break;
3986
- if (!value || value.length === 0) continue;
3987
- const buf = Buffer.from(value);
3988
- for (let offset = 0; offset < buf.length; offset += this.chunkSize) {
3989
- yield buf.subarray(offset, Math.min(offset + this.chunkSize, buf.length));
5106
+ cb(err);
5107
+ } catch (cbErr) {
5108
+ getLogger().error(`SpeechmaticsSTT error callback threw: ${String(cbErr)}`);
5109
+ }
5110
+ }
5111
+ }
5112
+ handleError(err) {
5113
+ getLogger().error(`SpeechmaticsSTT WebSocket error: ${err.message}`);
5114
+ this.emitError(err);
5115
+ }
5116
+ handleClose() {
5117
+ if (!this.running) return;
5118
+ this.running = false;
5119
+ }
5120
+ /** Send `EndOfStream` and close the WebSocket. Idempotent. */
5121
+ close() {
5122
+ this.running = false;
5123
+ const ws = this.ws;
5124
+ if (!ws) return;
5125
+ this.ws = null;
5126
+ const sendSafe = (payload) => {
5127
+ if (ws.readyState === WebSocket5.OPEN) {
5128
+ try {
5129
+ ws.send(payload);
5130
+ } catch {
3990
5131
  }
3991
5132
  }
3992
- } finally {
3993
- if (typeof reader.cancel === "function") await reader.cancel().catch(() => {
3994
- });
3995
- reader.releaseLock();
5133
+ };
5134
+ sendSafe(
5135
+ JSON.stringify({ message: "EndOfStream", last_seq_no: this.lastSeqNo })
5136
+ );
5137
+ try {
5138
+ ws.close();
5139
+ } catch {
5140
+ }
5141
+ }
5142
+ };
5143
+
5144
+ // src/stt/speechmatics.ts
5145
/**
 * Public Speechmatics STT entry point: a thin subclass of `SpeechmaticsSTT`
 * that resolves the API key from the options or, failing that, from the
 * `SPEECHMATICS_API_KEY` environment variable before delegating to the
 * base constructor.
 */
var STT7 = class extends SpeechmaticsSTT {
  /** Provider identifier string exposed on the class. */
  static providerKey = "speechmatics";
  /**
   * @param opts Adapter options; `opts.apiKey` takes precedence over the
   *   environment variable. The whole object is forwarded to the base class.
   * @throws Error when no API key can be resolved from either source.
   */
  constructor(opts = {}) {
    const resolved = opts.apiKey ?? process.env.SPEECHMATICS_API_KEY;
    if (resolved) {
      super(resolved, opts);
    } else {
      throw new Error(
        "Speechmatics STT requires an apiKey. Pass { apiKey: 'sm_...' } or set SPEECHMATICS_API_KEY in the environment."
      );
    }
  }
};
3999
5157
 
4000
5158
  // src/tts/elevenlabs.ts
5159
+ init_esm_shims();
4001
5160
  function resolveApiKey(apiKey) {
4002
5161
  const key = apiKey ?? process.env.ELEVENLABS_API_KEY;
4003
5162
  if (!key) {
@@ -4013,7 +5172,7 @@ var TTS = class _TTS extends ElevenLabsTTS {
4013
5172
  super(resolveApiKey(opts.apiKey), {
4014
5173
  voiceId: opts.voiceId ?? "EXAVITQu4vr4xnSDxMaL",
4015
5174
  modelId: opts.modelId ?? "eleven_flash_v2_5",
4016
- outputFormat: opts.outputFormat ?? "pcm_16000",
5175
+ ...opts.outputFormat !== void 0 ? { outputFormat: opts.outputFormat } : {},
4017
5176
  languageCode: opts.languageCode,
4018
5177
  voiceSettings: opts.voiceSettings
4019
5178
  });
@@ -4052,7 +5211,7 @@ var ElevenLabsPlanError = class extends ElevenLabsTTSError {
4052
5211
  this.name = "ElevenLabsPlanError";
4053
5212
  }
4054
5213
  };
4055
- var PLAN_REQUIRED_MSG = "ElevenLabs WS streaming requires a Pro plan or higher (the WS endpoint returned `payment_required`). Either upgrade at https://elevenlabs.io/pricing, or use the HTTP `ElevenLabsTTS` class which works on all plans (drop-in API).";
5214
+ var PLAN_REQUIRED_MSG = "ElevenLabs WS streaming requires a Pro plan or higher (the WS endpoint returned `payment_required`). Either upgrade at https://elevenlabs.io/pricing, or use `ElevenLabsRestTTS` for HTTP REST instead which works on all plans (drop-in API).";
4056
5215
  function sanitiseLogStr(value, limit = 200) {
4057
5216
  return String(value).replace(/[\r\n\x00]/g, " ").slice(0, limit);
4058
5217
  }
@@ -4071,6 +5230,33 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
4071
5230
  inactivityTimeout;
4072
5231
  chunkLengthSchedule;
4073
5232
  chunkSize;
5233
+ /**
5234
+ * Single-slot adoption queue. The prewarm pipeline parks one WS per
5235
+ * outbound call here; the next `synthesizeStream` call consumes it
5236
+ * (skipping `new WebSocket()` and the BOS send) instead of opening
5237
+ * a fresh socket. The slot is consumed exactly once: if a second
5238
+ * `synthesizeStream` runs before the first, only the first benefits.
5239
+ *
5240
+ * We keep this on the adapter (not in a parameter) so the existing
5241
+ * `for await (const chunk of agent.tts.synthesizeStream(...))` call
5242
+ * site in `StreamHandler` continues to work without signature
5243
+ * changes.
5244
+ */
5245
+ adoptedConnection = null;
5246
+ /**
5247
+ * Active WS for the in-flight ``synthesizeStream`` call, if any. Set
5248
+ * when a stream starts, cleared in its ``finally`` block. The
5249
+ * stream-handler calls ``cancelActiveStream()`` from ``cancelSpeaking``
5250
+ * to unblock the generator's inner ``await Promise<frame>`` — without
5251
+ * it, a barge-in on the firstMessage live path leaves the for-await
5252
+ * stuck waiting for the next frame; ElevenLabs never sends
5253
+ * ``isFinal=true`` after the consumer breaks, the 30 s frame timeout
5254
+ * fires post-call, and meanwhile ``initPipeline`` never returns so
5255
+ * the STT ``onTranscript`` callback never registers and subsequent
5256
+ * user turns are silently dropped (root cause of the 2026-05-20
5257
+ * "first message OK, then no response" symptom).
5258
+ */
5259
+ activeStreamWs = null;
4074
5260
  /**
4075
5261
  * The wire format requested over the ElevenLabs WS. Initially set from
4076
5262
  * the constructor; ``setTelephonyCarrier`` may auto-flip it to the
@@ -4086,7 +5272,7 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
4086
5272
  constructor(opts) {
4087
5273
  if (opts.modelId === "eleven_v3") {
4088
5274
  throw new Error(
4089
- "eleven_v3 is not supported by the WebSocket stream-input endpoint \u2014 use the HTTP ElevenLabsTTS class instead."
5275
+ "eleven_v3 is not supported by the WebSocket stream-input endpoint \u2014 use `ElevenLabsRestTTS` for HTTP REST instead."
4090
5276
  );
4091
5277
  }
4092
5278
  this.apiKey = opts.apiKey;
@@ -4119,6 +5305,32 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
4119
5305
  if (!native) return;
4120
5306
  this._outputFormat = native;
4121
5307
  }
5308
+ /**
5309
+ * Force-close the WebSocket of any in-flight ``synthesizeStream`` call.
5310
+ * Called by the stream-handler from ``cancelSpeaking`` (barge-in) so
5311
+ * the generator's inner ``await Promise<frame>`` loop unblocks cleanly
5312
+ * via the ``onClose`` handler — instead of waiting up to 30 s for the
5313
+ * ``FRAME_TIMEOUT_MS`` watchdog to fire. No-op when no stream is in
5314
+ * flight or when the WS is already closing.
5315
+ *
5316
+ * Without this, a barge-in during the firstMessage live path left the
5317
+ * for-await stuck (ElevenLabs never sends ``isFinal=true`` after the
5318
+ * consumer breaks), ``initPipeline`` never returned, the STT
5319
+ * ``onTranscript`` callback never registered, and the entire remainder
5320
+ * of the call was silent for the user. Surfaced during the 2026-05-20
5321
+ * acceptance run.
5322
+ */
5323
+ cancelActiveStream() {
5324
+ const ws = this.activeStreamWs;
5325
+ if (!ws) return;
5326
+ this.activeStreamWs = null;
5327
+ try {
5328
+ if (ws.readyState === WebSocket6.OPEN || ws.readyState === WebSocket6.CONNECTING) {
5329
+ ws.close();
5330
+ }
5331
+ } catch {
5332
+ }
5333
+ }
4122
5334
  /** Pre-configured for Twilio Media Streams (`ulaw_8000`). */
4123
5335
  static forTwilio(opts) {
4124
5336
  return new _ElevenLabsWebSocketTTS({
@@ -4148,6 +5360,24 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
4148
5360
  if (this.languageCode) params.set("language_code", this.languageCode);
4149
5361
  return `${WS_BASE}/${encodeURIComponent(this.voiceId)}/stream-input?${params.toString()}`;
4150
5362
  }
5363
+ /**
5364
+ * Build the protocol-required BOS frame sent on every fresh WS.
5365
+ *
5366
+ * The single-space `{"text": " "}` keep-alive establishes the session
5367
+ * without committing any synthesis (no `flush: true`, no real text).
5368
+ * Production `synthesizeStream()` and `warmup()` share this exact
5369
+ * construction so the upstream worker chooses the same per-session
5370
+ * config in both cases — otherwise the warm session is on a different
5371
+ * worker than the live request, which defeats the warmup goal.
5372
+ */
5373
+ buildBosFrame() {
5374
+ const init = { text: " " };
5375
+ if (this.voiceSettings) init["voice_settings"] = this.voiceSettings;
5376
+ if (!this.autoMode && this.chunkLengthSchedule) {
5377
+ init["generation_config"] = { chunk_length_schedule: this.chunkLengthSchedule };
5378
+ }
5379
+ return init;
5380
+ }
4151
5381
  /**
4152
5382
  * Single-shot synthesis: open WS, send text, yield bytes, close.
4153
5383
  *
@@ -4166,9 +5396,27 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
4166
5396
  * after flush — auto_mode could otherwise truncate the tail audio).
4167
5397
  */
4168
5398
  async *synthesizeStream(text) {
4169
- const ws = new WebSocket6(this.buildUrl(), {
4170
- headers: { "xi-api-key": this.apiKey }
4171
- });
5399
+ let ws;
5400
+ let bosAlreadySent = false;
5401
+ let adopted = false;
5402
+ const parked = this.adoptedConnection;
5403
+ this.adoptedConnection = null;
5404
+ if (parked && parked.ws.readyState === WebSocket6.OPEN) {
5405
+ ws = parked.ws;
5406
+ bosAlreadySent = parked.bosSent;
5407
+ adopted = true;
5408
+ } else {
5409
+ if (parked) {
5410
+ try {
5411
+ parked.ws.close();
5412
+ } catch {
5413
+ }
5414
+ }
5415
+ ws = new WebSocket6(this.buildUrl(), {
5416
+ headers: { "xi-api-key": this.apiKey }
5417
+ });
5418
+ }
5419
+ this.activeStreamWs = ws;
4172
5420
  const queue = [];
4173
5421
  let done = false;
4174
5422
  let pendingError = null;
@@ -4238,28 +5486,27 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
4238
5486
  };
4239
5487
  ws.on("error", onError);
4240
5488
  try {
4241
- await new Promise((resolve, reject) => {
4242
- connectTimer = setTimeout(
4243
- () => reject(new Error("ElevenLabs WS connect timeout")),
4244
- CONNECT_TIMEOUT_MS4
4245
- );
4246
- ws.once("open", () => {
4247
- if (connectTimer) clearTimeout(connectTimer);
4248
- connectTimer = void 0;
4249
- resolve();
4250
- });
4251
- ws.once("error", (err) => {
4252
- if (connectTimer) clearTimeout(connectTimer);
4253
- connectTimer = void 0;
4254
- reject(err);
5489
+ if (!adopted) {
5490
+ await new Promise((resolve, reject) => {
5491
+ connectTimer = setTimeout(
5492
+ () => reject(new Error("ElevenLabs WS connect timeout")),
5493
+ CONNECT_TIMEOUT_MS4
5494
+ );
5495
+ ws.once("open", () => {
5496
+ if (connectTimer) clearTimeout(connectTimer);
5497
+ connectTimer = void 0;
5498
+ resolve();
5499
+ });
5500
+ ws.once("error", (err) => {
5501
+ if (connectTimer) clearTimeout(connectTimer);
5502
+ connectTimer = void 0;
5503
+ reject(err);
5504
+ });
4255
5505
  });
4256
- });
4257
- const init = { text: " " };
4258
- if (this.voiceSettings) init["voice_settings"] = this.voiceSettings;
4259
- if (!this.autoMode && this.chunkLengthSchedule) {
4260
- init["generation_config"] = { chunk_length_schedule: this.chunkLengthSchedule };
4261
5506
  }
4262
- ws.send(JSON.stringify(init));
5507
+ if (!bosAlreadySent) {
5508
+ ws.send(JSON.stringify(this.buildBosFrame()));
5509
+ }
4263
5510
  ws.send(JSON.stringify({ text: text + " ", flush: true }));
4264
5511
  ws.on("message", onMessage);
4265
5512
  ws.on("close", onClose);
@@ -4290,6 +5537,7 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
4290
5537
  }
4291
5538
  } finally {
4292
5539
  if (connectTimer) clearTimeout(connectTimer);
5540
+ if (this.activeStreamWs === ws) this.activeStreamWs = null;
4293
5541
  try {
4294
5542
  if (ws.readyState === WebSocket6.OPEN) {
4295
5543
  ws.send(JSON.stringify({ text: "" }));
@@ -4305,387 +5553,227 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
4305
5553
  ws.removeAllListeners();
4306
5554
  }
4307
5555
  }
4308
- /** No-op — connections are per-utterance and torn down inside synthesizeStream. */
4309
- async close() {
4310
- }
4311
- };
4312
- function looksLikeJson(buf) {
4313
- if (buf.length === 0) return false;
4314
- const b = buf[0];
4315
- return b === 123 || b === 91;
4316
- }
4317
-
4318
- // src/tts/elevenlabs-ws.ts
4319
- function resolveApiKey2(apiKey) {
4320
- const key = apiKey ?? process.env.ELEVENLABS_API_KEY;
4321
- if (!key) {
4322
- throw new Error(
4323
- "ElevenLabs WebSocket TTS requires an apiKey. Pass { apiKey: '...' } or set ELEVENLABS_API_KEY in the environment."
4324
- );
4325
- }
4326
- return key;
4327
- }
4328
- function buildOpts(opts) {
4329
- const out = {
4330
- apiKey: resolveApiKey2(opts.apiKey),
4331
- modelId: opts.modelId ?? "eleven_flash_v2_5",
4332
- outputFormat: opts.outputFormat ?? "pcm_16000",
4333
- autoMode: opts.autoMode ?? true
4334
- };
4335
- if (opts.voiceId !== void 0) out.voiceId = opts.voiceId;
4336
- if (opts.voiceSettings !== void 0) out.voiceSettings = opts.voiceSettings;
4337
- if (opts.languageCode !== void 0) out.languageCode = opts.languageCode;
4338
- if (opts.inactivityTimeout !== void 0) out.inactivityTimeout = opts.inactivityTimeout;
4339
- if (opts.chunkLengthSchedule !== void 0) out.chunkLengthSchedule = opts.chunkLengthSchedule;
4340
- return out;
4341
- }
4342
- var TTS2 = class _TTS extends ElevenLabsWebSocketTTS {
4343
- static providerKey = "elevenlabs_ws";
4344
- constructor(opts = {}) {
4345
- super(buildOpts(opts));
4346
- }
4347
- /** WebSocket TTS pre-configured for Twilio Media Streams (`ulaw_8000`). */
4348
- static forTwilio(opts = {}) {
4349
- return new _TTS({ ...opts, outputFormat: "ulaw_8000" });
4350
- }
4351
- /** WebSocket TTS pre-configured for Telnyx (`pcm_16000`). */
4352
- static forTelnyx(opts = {}) {
4353
- return new _TTS({ ...opts, outputFormat: "pcm_16000" });
4354
- }
4355
- };
4356
-
4357
- // src/tts/openai.ts
4358
- init_esm_shims();
4359
-
4360
- // src/providers/openai-tts.ts
4361
- init_esm_shims();
4362
- var OPENAI_TTS_URL = "https://api.openai.com/v1/audio/speech";
4363
- var INSTRUCTIONS_PREFIX = "gpt-4o-mini-tts";
4364
- var LPF_ALPHA = 0.78;
4365
- var LPF_ALPHA_8K = 0.45;
4366
- var OpenAITTS = class _OpenAITTS {
4367
- constructor(apiKey, voice = "alloy", model = "gpt-4o-mini-tts", instructions = null, speed = null, antiAlias = true, targetSampleRate = 16e3) {
4368
- this.apiKey = apiKey;
4369
- this.voice = voice;
4370
- this.model = model;
4371
- this.instructions = instructions;
4372
- this.speed = speed;
4373
- this.antiAlias = antiAlias;
4374
- this.targetSampleRate = targetSampleRate;
4375
- if (speed !== null && speed !== void 0 && (speed < 0.25 || speed > 4)) {
4376
- throw new Error("OpenAITTS: speed must be in [0.25, 4.0]");
4377
- }
4378
- if (targetSampleRate !== 8e3 && targetSampleRate !== 16e3) {
4379
- throw new Error("OpenAITTS: targetSampleRate must be 8000 or 16000");
4380
- }
4381
- }
4382
- apiKey;
4383
- voice;
4384
- model;
4385
- instructions;
4386
- speed;
4387
- antiAlias;
4388
- targetSampleRate;
4389
- /**
4390
- * Synthesise text to speech and return the full audio as a single Buffer.
4391
- *
4392
- * For large chunks (or when latency matters) call `synthesizeStream` instead.
4393
- */
4394
- async synthesize(text) {
4395
- const chunks = [];
4396
- for await (const chunk of this.synthesizeStream(text)) {
4397
- chunks.push(chunk);
4398
- }
4399
- return Buffer.concat(chunks);
4400
- }
4401
5556
  /**
4402
- * Synthesise text and yield audio chunks as they arrive (streaming).
5557
+ * Pre-call WebSocket warmup for the ElevenLabs `/stream-input` endpoint.
4403
5558
  *
4404
- * OpenAI returns 24 kHz PCM16; each chunk is lowpass-filtered then
4405
- * decimated 3:2 to 16 kHz before yielding so the output is ready for
4406
- * telephony pipelines.
5559
+ * Opens the WS (DNS + TLS + auth handshake), sends the EXACT same BOS
5560
+ * frame the production `synthesizeStream()` path sends — including
5561
+ * `voice_settings` and (when configured) `generation_config` — so
5562
+ * ElevenLabs instantiates the same per-session worker for both
5563
+ * warmup and the live request. If the BOS frames differ, the server
5564
+ * may route warmup and the real call to two different workers, and
5565
+ * the warmed worker is wasted. Idles ~250 ms, then closes. By the
5566
+ * time the first `synthesizeStream()` call lands during the call,
5567
+ * the connection pool has the upstream warm — net wire time saving
5568
+ * of 200-500 ms.
4407
5569
  *
4408
- * The resampler carries state (filter memory + buffered samples + odd
4409
- * trailing byte) between chunks so cross-chunk sample alignment and
4410
- * filter phase don't reset on every network read.
5570
+ * Billing safety: ElevenLabs bills on synthesised characters
5571
+ * delivered via `audio` frames (per https://elevenlabs.io/pricing).
5572
+ * The keepalive (single-space `text`, no `flush: true`, no real
5573
+ * transcript) is documented as the session-establishment frame and
5574
+ * does NOT generate synthesis. Closing without sending the actual
5575
+ * transcript does not consume billable characters. Best-effort:
5576
+ * failures logged at debug level.
4411
5577
  */
4412
- async *synthesizeStream(text) {
4413
- const body = {
4414
- model: this.model,
4415
- input: text,
4416
- voice: this.voice,
4417
- response_format: "pcm"
4418
- };
4419
- if (this.instructions !== null && this.model.startsWith(INSTRUCTIONS_PREFIX)) {
4420
- body.instructions = this.instructions;
4421
- }
4422
- if (this.speed !== null) {
4423
- body.speed = this.speed;
4424
- }
4425
- const response = await fetch(OPENAI_TTS_URL, {
4426
- method: "POST",
4427
- headers: {
4428
- "Authorization": `Bearer ${this.apiKey}`,
4429
- "Content-Type": "application/json"
4430
- },
4431
- body: JSON.stringify(body)
5578
+ async warmup() {
5579
+ const ws = new WebSocket6(this.buildUrl(), {
5580
+ headers: { "xi-api-key": this.apiKey }
4432
5581
  });
4433
- if (!response.ok) {
4434
- const errBody = await response.text();
4435
- throw new Error(`OpenAI TTS error ${response.status}: ${errBody}`);
4436
- }
4437
- if (!response.body) {
4438
- throw new Error("OpenAI TTS: no response body");
4439
- }
4440
- const ctx = {
4441
- carryByte: null,
4442
- leftover: [],
4443
- lpfPrev: 0,
4444
- lpfEnabled: this.antiAlias,
4445
- targetSampleRate: this.targetSampleRate
4446
- };
4447
- const reader = response.body.getReader();
4448
5582
  try {
4449
- while (true) {
4450
- const { done, value } = await reader.read();
4451
- if (done) break;
4452
- if (value && value.length > 0) {
4453
- const out = _OpenAITTS.resampleStreaming(Buffer.from(value), ctx);
4454
- if (out.length > 0) yield out;
4455
- }
4456
- }
4457
- if (ctx.leftover.length > 0) {
4458
- const tail = Buffer.alloc(ctx.leftover.length * 2);
4459
- for (let i = 0; i < ctx.leftover.length; i++) {
4460
- tail.writeInt16LE(ctx.leftover[i], i * 2);
4461
- }
4462
- yield tail;
5583
+ await new Promise((resolve, reject) => {
5584
+ const timer = setTimeout(
5585
+ () => reject(new Error("ElevenLabs WS TTS warmup connect timeout")),
5586
+ CONNECT_TIMEOUT_MS4
5587
+ );
5588
+ ws.once("open", () => {
5589
+ clearTimeout(timer);
5590
+ resolve();
5591
+ });
5592
+ ws.once("error", (err) => {
5593
+ clearTimeout(timer);
5594
+ reject(err);
5595
+ });
5596
+ });
5597
+ try {
5598
+ ws.send(JSON.stringify(this.buildBosFrame()));
5599
+ } catch {
4463
5600
  }
5601
+ await new Promise((r) => setTimeout(r, 250));
5602
+ } catch (err) {
5603
+ getLogger().debug(`ElevenLabs WS TTS warmup failed (best-effort): ${String(err)}`);
4464
5604
  } finally {
4465
- if (typeof reader.cancel === "function") await reader.cancel().catch(() => {
4466
- });
4467
- reader.releaseLock();
5605
+ try {
5606
+ if (ws.readyState === WebSocket6.OPEN || ws.readyState === WebSocket6.CONNECTING) {
5607
+ ws.close();
5608
+ }
5609
+ } catch {
5610
+ }
5611
+ ws.removeAllListeners();
4468
5612
  }
4469
5613
  }
4470
5614
  /**
4471
- * Streaming 24 kHz → {16, 8} kHz resampler (PCM16-LE). Applies a single-pole
4472
- * lowpass ahead of the decimation and carries filter + sample state across
4473
- * chunks so the cadence doesn't reset at every network read.
5615
+ * Open a fresh WS, send the EXACT BOS frame the live `synthesizeStream`
5616
+ * sends, and return the OPEN socket without closing it. Used by the
5617
+ * prewarm pipeline to park a TTS connection during the carrier ringing
5618
+ * window so the next `synthesizeStream` call can adopt it via
5619
+ * {@link adoptWebSocket} and skip ~400-900 ms of TLS + BOS round-trip.
4474
5620
  *
4475
- * Output rate is selected by ``ctx.targetSampleRate``:
4476
- * 16000 3:2 decimation (sample 0 + mid(1,2)) [default]
4477
- * 8000 3:1 decimation (sample 0 only) [fix #46]
5621
+ * Returns a parked-handle the caller stashes; the next
5622
+ * `synthesizeStream` will detect the adoption queue and skip its own
5623
+ * `new WebSocket()` + BOS send.
4478
5624
  *
4479
- * ``ctx.lpfEnabled`` controls whether the LPF is engaged — kept disabled
4480
- * for the legacy static helper so the bit-exact downsample-only tests
4481
- * remain valid; the real streaming path always engages it.
5625
+ * Billing safety: BOS is the documented session-establishment frame
5626
+ * (single space `text`, no `flush: true`) and does not generate
5627
+ * synthesis. ElevenLabs bills on `audio` frames received from the
5628
+ * server, not on BOS bytes sent by the client.
4482
5629
  */
4483
- static resampleStreaming(audio, ctx) {
4484
- let buf;
4485
- if (ctx.carryByte !== null) {
4486
- buf = Buffer.concat([Buffer.from([ctx.carryByte]), audio]);
4487
- ctx.carryByte = null;
4488
- } else {
4489
- buf = audio;
4490
- }
4491
- if (buf.length % 2 === 1) {
4492
- ctx.carryByte = buf[buf.length - 1];
4493
- buf = buf.subarray(0, buf.length - 1);
4494
- }
4495
- if (buf.length === 0 && ctx.leftover.length === 0) {
4496
- return Buffer.alloc(0);
5630
+ async openParkedConnection() {
5631
+ const ws = new WebSocket6(this.buildUrl(), {
5632
+ headers: { "xi-api-key": this.apiKey }
5633
+ });
5634
+ await new Promise((resolve, reject) => {
5635
+ const timer = setTimeout(
5636
+ () => reject(new Error("ElevenLabs WS park connect timeout")),
5637
+ CONNECT_TIMEOUT_MS4
5638
+ );
5639
+ ws.once("open", () => {
5640
+ clearTimeout(timer);
5641
+ resolve();
5642
+ });
5643
+ ws.once("error", (err) => {
5644
+ clearTimeout(timer);
5645
+ reject(err);
5646
+ });
5647
+ });
5648
+ let bosSent = false;
5649
+ try {
5650
+ ws.send(JSON.stringify(this.buildBosFrame()));
5651
+ bosSent = true;
5652
+ } catch {
4497
5653
  }
4498
- const direct8k = ctx.targetSampleRate === 8e3;
4499
- const lpfAlpha = direct8k ? LPF_ALPHA_8K : LPF_ALPHA;
4500
- const sampleCount = buf.length / 2;
4501
- const samples = ctx.leftover.slice();
4502
- const lpf = ctx.lpfEnabled !== false;
4503
- let y = ctx.lpfPrev;
4504
- for (let i2 = 0; i2 < sampleCount; i2++) {
4505
- const x = buf.readInt16LE(i2 * 2);
4506
- if (lpf) {
4507
- y = lpfAlpha * x + (1 - lpfAlpha) * y;
4508
- let s = Math.round(y);
4509
- if (s > 32767) s = 32767;
4510
- else if (s < -32768) s = -32768;
4511
- samples.push(s);
4512
- } else {
4513
- samples.push(x);
5654
+ return { ws, bosSent };
5655
+ }
5656
+ /**
5657
+ * Stash a parked WS handle so the next `synthesizeStream` call adopts
5658
+ * it instead of opening a fresh socket. Caller is responsible for
5659
+ * holding the handle alive until either the live request consumes it
5660
+ * or the call ends (in which case `discardAdoptedConnection()`
5661
+ * cleans it up).
5662
+ */
5663
+ adoptWebSocket(parked) {
5664
+ const prev = this.adoptedConnection;
5665
+ this.adoptedConnection = parked;
5666
+ if (prev && prev !== parked) {
5667
+ try {
5668
+ prev.ws.close();
5669
+ } catch {
4514
5670
  }
4515
5671
  }
4516
- if (lpf) ctx.lpfPrev = y;
4517
- const out = [];
4518
- let i = 0;
4519
- if (direct8k) {
4520
- while (i + 2 < samples.length) {
4521
- out.push(samples[i]);
4522
- i += 3;
4523
- }
4524
- } else {
4525
- while (i + 2 < samples.length) {
4526
- out.push(samples[i]);
4527
- out.push(Math.round((samples[i + 1] + samples[i + 2]) / 2));
4528
- i += 3;
5672
+ }
5673
+ /**
5674
+ * Drop and close any pending parked WS without consuming it. Used on
5675
+ * call-failure paths so a never-started call does not leak a TTS WS
5676
+ * that ElevenLabs will close after its inactivity timeout anyway.
5677
+ */
5678
+ discardAdoptedConnection() {
5679
+ const parked = this.adoptedConnection;
5680
+ this.adoptedConnection = null;
5681
+ if (parked) {
5682
+ try {
5683
+ parked.ws.close();
5684
+ } catch {
4529
5685
  }
4530
5686
  }
4531
- ctx.leftover = samples.slice(i);
4532
- const buffer = Buffer.alloc(out.length * 2);
4533
- for (let j = 0; j < out.length; j++) {
4534
- buffer.writeInt16LE(out[j], j * 2);
4535
- }
4536
- return buffer;
4537
5687
  }
4538
- /** @deprecated use {@link resampleStreaming} with persistent state. */
4539
- static resample24kTo16k(audio) {
4540
- const ctx = {
4541
- carryByte: null,
4542
- leftover: [],
4543
- lpfPrev: 0,
4544
- lpfEnabled: false,
4545
- targetSampleRate: 16e3
4546
- };
4547
- const out = _OpenAITTS.resampleStreaming(audio, ctx);
4548
- if (ctx.leftover.length === 0) return out;
4549
- const tail = Buffer.alloc(ctx.leftover.length * 2);
4550
- for (let i = 0; i < ctx.leftover.length; i++) {
4551
- tail.writeInt16LE(ctx.leftover[i], i * 2);
4552
- }
4553
- return Buffer.concat([out, tail]);
5688
+ /** No-op — connections are per-utterance and torn down inside synthesizeStream. */
5689
+ async close() {
5690
+ this.discardAdoptedConnection();
4554
5691
  }
4555
5692
  };
5693
+ function looksLikeJson(buf) {
5694
+ if (buf.length === 0) return false;
5695
+ const b = buf[0];
5696
+ return b === 123 || b === 91;
5697
+ }
4556
5698
 
4557
- // src/tts/openai.ts
4558
- var TTS3 = class extends OpenAITTS {
4559
- static providerKey = "openai_tts";
4560
- constructor(opts = {}) {
4561
- const key = opts.apiKey ?? process.env.OPENAI_API_KEY;
4562
- if (!key) {
4563
- throw new Error(
4564
- "OpenAI TTS requires an apiKey. Pass { apiKey: 'sk-...' } or set OPENAI_API_KEY in the environment."
4565
- );
4566
- }
4567
- super(
4568
- key,
4569
- opts.voice ?? "alloy",
4570
- opts.model ?? "gpt-4o-mini-tts",
4571
- opts.instructions ?? null,
4572
- opts.speed ?? null,
4573
- opts.antiAlias ?? false
5699
+ // src/tts/elevenlabs-ws.ts
5700
+ function resolveApiKey2(apiKey) {
5701
+ const key = apiKey ?? process.env.ELEVENLABS_API_KEY;
5702
+ if (!key) {
5703
+ throw new Error(
5704
+ "ElevenLabs WebSocket TTS requires an apiKey. Pass { apiKey: '...' } or set ELEVENLABS_API_KEY in the environment."
4574
5705
  );
4575
5706
  }
5707
+ return key;
5708
+ }
5709
+ function buildOpts(opts) {
5710
+ const out = {
5711
+ apiKey: resolveApiKey2(opts.apiKey),
5712
+ modelId: opts.modelId ?? "eleven_flash_v2_5",
5713
+ autoMode: opts.autoMode ?? true
5714
+ };
5715
+ if (opts.outputFormat !== void 0) out.outputFormat = opts.outputFormat;
5716
+ if (opts.voiceId !== void 0) out.voiceId = opts.voiceId;
5717
+ if (opts.voiceSettings !== void 0) out.voiceSettings = opts.voiceSettings;
5718
+ if (opts.languageCode !== void 0) out.languageCode = opts.languageCode;
5719
+ if (opts.inactivityTimeout !== void 0) out.inactivityTimeout = opts.inactivityTimeout;
5720
+ if (opts.chunkLengthSchedule !== void 0) out.chunkLengthSchedule = opts.chunkLengthSchedule;
5721
+ return out;
5722
+ }
5723
+ var TTS2 = class _TTS extends ElevenLabsWebSocketTTS {
5724
+ static providerKey = "elevenlabs_ws";
5725
+ constructor(opts = {}) {
5726
+ super(buildOpts(opts));
5727
+ }
5728
+ /** WebSocket TTS pre-configured for Twilio Media Streams (`ulaw_8000`). */
5729
+ static forTwilio(opts = {}) {
5730
+ return new _TTS({ ...opts, outputFormat: "ulaw_8000" });
5731
+ }
5732
+ /** WebSocket TTS pre-configured for Telnyx (`pcm_16000`). */
5733
+ static forTelnyx(opts = {}) {
5734
+ return new _TTS({ ...opts, outputFormat: "pcm_16000" });
5735
+ }
4576
5736
  };
4577
5737
 
4578
- // src/tts/cartesia.ts
5738
+ // src/tts/openai.ts
4579
5739
  init_esm_shims();
4580
5740
 
4581
- // src/providers/cartesia-tts.ts
5741
+ // src/providers/openai-tts.ts
4582
5742
  init_esm_shims();
4583
- var CARTESIA_BASE_URL = "https://api.cartesia.ai";
4584
- var CARTESIA_API_VERSION = "2025-04-16";
4585
- var CARTESIA_DEFAULT_VOICE_ID = "f786b574-daa5-4673-aa0c-cbe3e8534c02";
4586
- var CartesiaTTSModel = {
4587
- SONIC_3: "sonic-3",
4588
- SONIC_2: "sonic-2",
4589
- SONIC: "sonic"
4590
- };
4591
- var CartesiaTTSContainer = {
4592
- RAW: "raw",
4593
- WAV: "wav",
4594
- MP3: "mp3"
4595
- };
4596
- var CartesiaTTSEncoding = {
4597
- PCM_S16LE: "pcm_s16le",
4598
- PCM_F32LE: "pcm_f32le",
4599
- PCM_MULAW: "pcm_mulaw",
4600
- PCM_ALAW: "pcm_alaw"
4601
- };
4602
- var CartesiaTTSSampleRate = {
4603
- HZ_8000: 8e3,
4604
- HZ_16000: 16e3,
4605
- HZ_22050: 22050,
4606
- HZ_24000: 24e3,
4607
- HZ_44100: 44100
4608
- };
4609
- var CartesiaTTSVoiceMode = {
4610
- ID: "id",
4611
- EMBEDDING: "embedding"
4612
- };
4613
- var CartesiaTTS = class _CartesiaTTS {
4614
- apiKey;
4615
- model;
4616
- voice;
4617
- language;
4618
- sampleRate;
4619
- speed;
4620
- emotion;
4621
- volume;
4622
- baseUrl;
4623
- apiVersion;
4624
- constructor(apiKey, opts = {}) {
5743
+ var OPENAI_TTS_URL = "https://api.openai.com/v1/audio/speech";
5744
+ var INSTRUCTIONS_PREFIX = "gpt-4o-mini-tts";
5745
+ var LPF_ALPHA = 0.78;
5746
+ var LPF_ALPHA_8K = 0.45;
5747
+ var OpenAITTS = class _OpenAITTS {
5748
+ constructor(apiKey, voice = "alloy", model = "gpt-4o-mini-tts", instructions = null, speed = null, antiAlias = true, targetSampleRate = 16e3) {
4625
5749
  this.apiKey = apiKey;
4626
- this.model = opts.model ?? CartesiaTTSModel.SONIC_3;
4627
- this.voice = opts.voice ?? CARTESIA_DEFAULT_VOICE_ID;
4628
- this.language = opts.language ?? "en";
4629
- this.sampleRate = opts.sampleRate ?? CartesiaTTSSampleRate.HZ_16000;
4630
- this.speed = opts.speed;
4631
- this.emotion = typeof opts.emotion === "string" ? [opts.emotion] : opts.emotion;
4632
- this.volume = opts.volume;
4633
- this.baseUrl = opts.baseUrl ?? CARTESIA_BASE_URL;
4634
- this.apiVersion = opts.apiVersion ?? CARTESIA_API_VERSION;
4635
- }
4636
- /**
4637
- * Construct an instance pre-configured for Twilio Media Streams.
4638
- *
4639
- * Sets `sampleRate=8000` so Cartesia emits PCM_S16LE @ 8 kHz directly.
4640
- * Twilio's media stream uses μ-law @ 8 kHz so the SDK still does the
4641
- * PCM → μ-law transcode client-side, but the 16 kHz → 8 kHz resample
4642
- * step is skipped. Saves ~10–30 ms first-byte plus per-frame CPU and
4643
- * removes a potential aliasing source.
4644
- */
4645
- static forTwilio(apiKey, options = {}) {
4646
- return new _CartesiaTTS(apiKey, {
4647
- ...options,
4648
- sampleRate: CartesiaTTSSampleRate.HZ_8000
4649
- });
5750
+ this.voice = voice;
5751
+ this.model = model;
5752
+ this.instructions = instructions;
5753
+ this.speed = speed;
5754
+ this.antiAlias = antiAlias;
5755
+ this.targetSampleRate = targetSampleRate;
5756
+ if (speed !== null && speed !== void 0 && (speed < 0.25 || speed > 4)) {
5757
+ throw new Error("OpenAITTS: speed must be in [0.25, 4.0]");
5758
+ }
5759
+ if (targetSampleRate !== 8e3 && targetSampleRate !== 16e3) {
5760
+ throw new Error("OpenAITTS: targetSampleRate must be 8000 or 16000");
5761
+ }
4650
5762
  }
5763
+ apiKey;
5764
+ voice;
5765
+ model;
5766
+ instructions;
5767
+ speed;
5768
+ antiAlias;
5769
+ targetSampleRate;
5770
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
5771
+ static providerKey = "openai_tts";
4651
5772
  /**
4652
- * Construct an instance pre-configured for Telnyx bidirectional media.
5773
+ * Synthesise text to speech and return the full audio as a single Buffer.
4653
5774
  *
4654
- * Sets `sampleRate=16000` to match Telnyx's L16/16000 default codec —
4655
- * audio flows end-to-end with zero resampling or transcoding. Same as
4656
- * the bare-constructor default; exists for API symmetry with
4657
- * {@link CartesiaTTS.forTwilio}.
5775
+ * For large chunks (or when latency matters) call `synthesizeStream` instead.
4658
5776
  */
4659
- static forTelnyx(apiKey, options = {}) {
4660
- return new _CartesiaTTS(apiKey, {
4661
- ...options,
4662
- sampleRate: CartesiaTTSSampleRate.HZ_16000
4663
- });
4664
- }
4665
- /** Build the JSON payload for the Cartesia bytes endpoint. */
4666
- buildPayload(text) {
4667
- const payload = {
4668
- model_id: this.model,
4669
- voice: { mode: CartesiaTTSVoiceMode.ID, id: this.voice },
4670
- transcript: text,
4671
- output_format: {
4672
- container: CartesiaTTSContainer.RAW,
4673
- encoding: CartesiaTTSEncoding.PCM_S16LE,
4674
- sample_rate: this.sampleRate
4675
- },
4676
- language: this.language
4677
- };
4678
- const generationConfig = {};
4679
- if (this.speed !== void 0) generationConfig.speed = this.speed;
4680
- if (this.emotion && this.emotion.length > 0)
4681
- generationConfig.emotion = this.emotion[0];
4682
- if (this.volume !== void 0) generationConfig.volume = this.volume;
4683
- if (Object.keys(generationConfig).length > 0) {
4684
- payload.generation_config = generationConfig;
4685
- }
4686
- return payload;
4687
- }
4688
- /** Synthesize text and return the concatenated audio buffer. */
4689
5777
  async synthesize(text) {
4690
5778
  const chunks = [];
4691
5779
  for await (const chunk of this.synthesizeStream(text)) {
@@ -4694,217 +5782,213 @@ var CartesiaTTS = class _CartesiaTTS {
4694
5782
  return Buffer.concat(chunks);
4695
5783
  }
4696
5784
  /**
4697
- * Synthesize text and yield raw PCM_S16LE chunks at the configured
4698
- * `sampleRate` as they arrive from Cartesia.
5785
+ * Synthesise text and yield audio chunks as they arrive (streaming).
5786
+ *
5787
+ * OpenAI returns 24 kHz PCM16; each chunk is lowpass-filtered then
5788
+ * decimated 3:2 to 16 kHz before yielding so the output is ready for
5789
+ * telephony pipelines.
5790
+ *
5791
+ * The resampler carries state (filter memory + buffered samples + odd
5792
+ * trailing byte) between chunks so cross-chunk sample alignment and
5793
+ * filter phase don't reset on every network read.
4699
5794
  */
4700
5795
  async *synthesizeStream(text) {
4701
- const response = await fetch(`${this.baseUrl}/tts/bytes`, {
5796
+ const body = {
5797
+ model: this.model,
5798
+ input: text,
5799
+ voice: this.voice,
5800
+ response_format: "pcm"
5801
+ };
5802
+ if (this.instructions !== null && this.model.startsWith(INSTRUCTIONS_PREFIX)) {
5803
+ body.instructions = this.instructions;
5804
+ }
5805
+ if (this.speed !== null) {
5806
+ body.speed = this.speed;
5807
+ }
5808
+ const response = await fetch(OPENAI_TTS_URL, {
4702
5809
  method: "POST",
4703
5810
  headers: {
4704
- "X-API-Key": this.apiKey,
4705
- "Cartesia-Version": this.apiVersion,
5811
+ "Authorization": `Bearer ${this.apiKey}`,
4706
5812
  "Content-Type": "application/json"
4707
5813
  },
4708
- body: JSON.stringify(this.buildPayload(text)),
4709
- signal: AbortSignal.timeout(3e4)
5814
+ body: JSON.stringify(body)
4710
5815
  });
4711
5816
  if (!response.ok) {
4712
- const body = await response.text();
4713
- throw new Error(`Cartesia TTS error ${response.status}: ${body}`);
5817
+ const errBody = await response.text();
5818
+ throw new Error(`OpenAI TTS error ${response.status}: ${errBody}`);
4714
5819
  }
4715
5820
  if (!response.body) {
4716
- throw new Error("Cartesia TTS: no response body");
5821
+ throw new Error("OpenAI TTS: no response body");
4717
5822
  }
5823
+ const ctx = {
5824
+ carryByte: null,
5825
+ leftover: [],
5826
+ lpfPrev: 0,
5827
+ lpfEnabled: this.antiAlias,
5828
+ targetSampleRate: this.targetSampleRate
5829
+ };
4718
5830
  const reader = response.body.getReader();
4719
5831
  try {
4720
5832
  while (true) {
4721
5833
  const { done, value } = await reader.read();
4722
5834
  if (done) break;
4723
5835
  if (value && value.length > 0) {
4724
- yield Buffer.from(value);
5836
+ const out = _OpenAITTS.resampleStreaming(Buffer.from(value), ctx);
5837
+ if (out.length > 0) yield out;
5838
+ }
5839
+ }
5840
+ if (ctx.leftover.length > 0) {
5841
+ const tail = Buffer.alloc(ctx.leftover.length * 2);
5842
+ for (let i = 0; i < ctx.leftover.length; i++) {
5843
+ tail.writeInt16LE(ctx.leftover[i], i * 2);
4725
5844
  }
5845
+ yield tail;
4726
5846
  }
4727
5847
  } finally {
4728
- if (typeof reader.cancel === "function")
4729
- await reader.cancel().catch(() => {
4730
- });
5848
+ if (typeof reader.cancel === "function") await reader.cancel().catch(() => {
5849
+ });
4731
5850
  reader.releaseLock();
4732
5851
  }
4733
5852
  }
4734
- };
4735
-
4736
- // src/tts/cartesia.ts
4737
- function resolveApiKey3(apiKey) {
4738
- const key = apiKey ?? process.env.CARTESIA_API_KEY;
4739
- if (!key) {
4740
- throw new Error(
4741
- "Cartesia TTS requires an apiKey. Pass { apiKey: '...' } or set CARTESIA_API_KEY in the environment."
4742
- );
4743
- }
4744
- return key;
4745
- }
4746
- var TTS4 = class _TTS extends CartesiaTTS {
4747
- static providerKey = "cartesia_tts";
4748
- constructor(opts = {}) {
4749
- const key = resolveApiKey3(opts.apiKey);
4750
- const { apiKey: _ignored, ...rest } = opts;
4751
- void _ignored;
4752
- super(key, rest);
4753
- }
4754
- static forTwilio(arg1, arg2) {
4755
- const opts = typeof arg1 === "string" ? { apiKey: arg1, ...arg2 ?? {} } : arg1 ?? {};
4756
- return new _TTS({ ...opts, sampleRate: 8e3 });
4757
- }
4758
- static forTelnyx(arg1, arg2) {
4759
- const opts = typeof arg1 === "string" ? { apiKey: arg1, ...arg2 ?? {} } : arg1 ?? {};
4760
- return new _TTS({ ...opts, sampleRate: 16e3 });
4761
- }
4762
- };
4763
-
4764
- // src/tts/rime.ts
4765
- init_esm_shims();
4766
-
4767
- // src/providers/rime-tts.ts
4768
- init_esm_shims();
4769
- var RIME_BASE_URL = "https://users.rime.ai/v1/rime-tts";
4770
- var RimeModel = {
4771
- ARCANA: "arcana",
4772
- MIST: "mist",
4773
- MIST_V2: "mistv2"
4774
- };
4775
- var RimeAudioFormat = {
4776
- PCM: "audio/pcm",
4777
- MP3: "audio/mp3",
4778
- WAV: "audio/wav",
4779
- MULAW: "audio/mulaw"
4780
- };
4781
- var ARCANA_MODEL_TIMEOUT_MS = 60 * 4 * 1e3;
4782
- var MIST_MODEL_TIMEOUT_MS = 30 * 1e3;
4783
- function isMistModel(model) {
4784
- return model.includes(RimeModel.MIST);
4785
- }
4786
- function timeoutForModel(model) {
4787
- if (model === RimeModel.ARCANA) return ARCANA_MODEL_TIMEOUT_MS;
4788
- return MIST_MODEL_TIMEOUT_MS;
4789
- }
4790
- var RimeTTS = class {
4791
- apiKey;
4792
- model;
4793
- speaker;
4794
- lang;
4795
- sampleRate;
4796
- repetitionPenalty;
4797
- temperature;
4798
- topP;
4799
- maxTokens;
4800
- speedAlpha;
4801
- reduceLatency;
4802
- pauseBetweenBrackets;
4803
- phonemizeBetweenBrackets;
4804
- baseUrl;
4805
- totalTimeoutMs;
4806
- constructor(apiKey, opts = {}) {
4807
- this.apiKey = apiKey;
4808
- this.model = opts.model ?? RimeModel.ARCANA;
4809
- const defaultSpeaker = isMistModel(this.model) ? "cove" : "astra";
4810
- this.speaker = opts.speaker ?? defaultSpeaker;
4811
- this.lang = opts.lang ?? "eng";
4812
- this.sampleRate = opts.sampleRate ?? 16e3;
4813
- this.repetitionPenalty = opts.repetitionPenalty;
4814
- this.temperature = opts.temperature;
4815
- this.topP = opts.topP;
4816
- this.maxTokens = opts.maxTokens;
4817
- this.speedAlpha = opts.speedAlpha;
4818
- this.reduceLatency = opts.reduceLatency;
4819
- this.pauseBetweenBrackets = opts.pauseBetweenBrackets;
4820
- this.phonemizeBetweenBrackets = opts.phonemizeBetweenBrackets;
4821
- this.baseUrl = opts.baseUrl ?? RIME_BASE_URL;
4822
- this.totalTimeoutMs = timeoutForModel(this.model);
4823
- }
4824
- buildPayload(text) {
4825
- const payload = {
4826
- speaker: this.speaker,
4827
- text,
4828
- modelId: this.model
4829
- };
4830
- if (this.model === RimeModel.ARCANA) {
4831
- if (this.repetitionPenalty !== void 0)
4832
- payload.repetition_penalty = this.repetitionPenalty;
4833
- if (this.temperature !== void 0) payload.temperature = this.temperature;
4834
- if (this.topP !== void 0) payload.top_p = this.topP;
4835
- if (this.maxTokens !== void 0) payload.max_tokens = this.maxTokens;
4836
- payload.lang = this.lang;
4837
- payload.samplingRate = this.sampleRate;
4838
- } else if (isMistModel(this.model)) {
4839
- payload.lang = this.lang;
4840
- payload.samplingRate = this.sampleRate;
4841
- if (this.speedAlpha !== void 0) payload.speedAlpha = this.speedAlpha;
4842
- if (this.model === RimeModel.MIST_V2 && this.reduceLatency !== void 0) {
4843
- payload.reduceLatency = this.reduceLatency;
5853
+ /**
5854
+ * Streaming 24 kHz → {16, 8} kHz resampler (PCM16-LE). Applies a single-pole
5855
+ * lowpass ahead of the decimation and carries filter + sample state across
5856
+ * chunks so the cadence doesn't reset at every network read.
5857
+ *
5858
+ * Output rate is selected by ``ctx.targetSampleRate``:
5859
+ * 16000 3:2 decimation (sample 0 + mid(1,2)) [default]
5860
+ * 8000 3:1 decimation (sample 0 only) [fix #46]
5861
+ *
5862
+ * ``ctx.lpfEnabled`` controls whether the LPF is engaged — kept disabled
5863
+ * for the legacy static helper so the bit-exact downsample-only tests
5864
+ * remain valid; the real streaming path always engages it.
5865
+ */
5866
+ static resampleStreaming(audio, ctx) {
5867
+ let buf;
5868
+ if (ctx.carryByte !== null) {
5869
+ buf = Buffer.concat([Buffer.from([ctx.carryByte]), audio]);
5870
+ ctx.carryByte = null;
5871
+ } else {
5872
+ buf = audio;
5873
+ }
5874
+ if (buf.length % 2 === 1) {
5875
+ ctx.carryByte = buf[buf.length - 1];
5876
+ buf = buf.subarray(0, buf.length - 1);
5877
+ }
5878
+ if (buf.length === 0 && ctx.leftover.length === 0) {
5879
+ return Buffer.alloc(0);
5880
+ }
5881
+ const direct8k = ctx.targetSampleRate === 8e3;
5882
+ const lpfAlpha = direct8k ? LPF_ALPHA_8K : LPF_ALPHA;
5883
+ const sampleCount = buf.length / 2;
5884
+ const samples = ctx.leftover.slice();
5885
+ const lpf = ctx.lpfEnabled !== false;
5886
+ let y = ctx.lpfPrev;
5887
+ for (let i2 = 0; i2 < sampleCount; i2++) {
5888
+ const x = buf.readInt16LE(i2 * 2);
5889
+ if (lpf) {
5890
+ y = lpfAlpha * x + (1 - lpfAlpha) * y;
5891
+ let s = Math.round(y);
5892
+ if (s > 32767) s = 32767;
5893
+ else if (s < -32768) s = -32768;
5894
+ samples.push(s);
5895
+ } else {
5896
+ samples.push(x);
4844
5897
  }
4845
- if (this.pauseBetweenBrackets !== void 0) {
4846
- payload.pauseBetweenBrackets = this.pauseBetweenBrackets;
5898
+ }
5899
+ if (lpf) ctx.lpfPrev = y;
5900
+ const out = [];
5901
+ let i = 0;
5902
+ if (direct8k) {
5903
+ while (i + 2 < samples.length) {
5904
+ out.push(samples[i]);
5905
+ i += 3;
4847
5906
  }
4848
- if (this.phonemizeBetweenBrackets !== void 0) {
4849
- payload.phonemizeBetweenBrackets = this.phonemizeBetweenBrackets;
5907
+ } else {
5908
+ while (i + 2 < samples.length) {
5909
+ out.push(samples[i]);
5910
+ out.push(Math.round((samples[i + 1] + samples[i + 2]) / 2));
5911
+ i += 3;
4850
5912
  }
4851
5913
  }
4852
- return payload;
4853
- }
4854
- /** Synthesize text and return the concatenated audio buffer. */
4855
- async synthesize(text) {
4856
- const chunks = [];
4857
- for await (const chunk of this.synthesizeStream(text)) {
4858
- chunks.push(chunk);
5914
+ ctx.leftover = samples.slice(i);
5915
+ const buffer = Buffer.alloc(out.length * 2);
5916
+ for (let j = 0; j < out.length; j++) {
5917
+ buffer.writeInt16LE(out[j], j * 2);
4859
5918
  }
4860
- return Buffer.concat(chunks);
5919
+ return buffer;
4861
5920
  }
4862
- /**
4863
- * Synthesize text and yield raw PCM_S16LE chunks at the configured
4864
- * `sampleRate` as they stream in.
4865
- */
4866
- async *synthesizeStream(text) {
4867
- const response = await fetch(this.baseUrl, {
4868
- method: "POST",
4869
- headers: {
4870
- accept: RimeAudioFormat.PCM,
4871
- Authorization: `Bearer ${this.apiKey}`,
4872
- "content-type": "application/json"
4873
- },
4874
- body: JSON.stringify(this.buildPayload(text)),
4875
- signal: AbortSignal.timeout(this.totalTimeoutMs)
4876
- });
4877
- if (!response.ok) {
4878
- const body = await response.text();
4879
- throw new Error(`Rime TTS error ${response.status}: ${body}`);
4880
- }
4881
- const contentType = response.headers.get("content-type") ?? "";
4882
- if (!contentType.startsWith("audio")) {
4883
- const body = await response.text();
4884
- throw new Error(`Rime returned non-audio response: ${body.slice(0, 500)}`);
4885
- }
4886
- if (!response.body) {
4887
- throw new Error("Rime TTS: no response body");
5921
+ /** @deprecated use {@link resampleStreaming} with persistent state. */
5922
+ static resample24kTo16k(audio) {
5923
+ const ctx = {
5924
+ carryByte: null,
5925
+ leftover: [],
5926
+ lpfPrev: 0,
5927
+ lpfEnabled: false,
5928
+ targetSampleRate: 16e3
5929
+ };
5930
+ const out = _OpenAITTS.resampleStreaming(audio, ctx);
5931
+ if (ctx.leftover.length === 0) return out;
5932
+ const tail = Buffer.alloc(ctx.leftover.length * 2);
5933
+ for (let i = 0; i < ctx.leftover.length; i++) {
5934
+ tail.writeInt16LE(ctx.leftover[i], i * 2);
4888
5935
  }
4889
- const reader = response.body.getReader();
4890
- try {
4891
- while (true) {
4892
- const { done, value } = await reader.read();
4893
- if (done) break;
4894
- if (value && value.length > 0) {
4895
- yield Buffer.from(value);
4896
- }
4897
- }
4898
- } finally {
4899
- if (typeof reader.cancel === "function")
4900
- await reader.cancel().catch(() => {
4901
- });
4902
- reader.releaseLock();
5936
+ return Buffer.concat([out, tail]);
5937
+ }
5938
+ };
5939
+
5940
+ // src/tts/openai.ts
5941
+ var TTS3 = class extends OpenAITTS {
5942
+ static providerKey = "openai_tts";
5943
+ constructor(opts = {}) {
5944
+ const key = opts.apiKey ?? process.env.OPENAI_API_KEY;
5945
+ if (!key) {
5946
+ throw new Error(
5947
+ "OpenAI TTS requires an apiKey. Pass { apiKey: 'sk-...' } or set OPENAI_API_KEY in the environment."
5948
+ );
4903
5949
  }
5950
+ super(
5951
+ key,
5952
+ opts.voice ?? "alloy",
5953
+ opts.model ?? "gpt-4o-mini-tts",
5954
+ opts.instructions ?? null,
5955
+ opts.speed ?? null,
5956
+ opts.antiAlias ?? false
5957
+ );
5958
+ }
5959
+ };
5960
+
5961
+ // src/tts/cartesia.ts
5962
+ init_esm_shims();
5963
+ function resolveApiKey3(apiKey) {
5964
+ const key = apiKey ?? process.env.CARTESIA_API_KEY;
5965
+ if (!key) {
5966
+ throw new Error(
5967
+ "Cartesia TTS requires an apiKey. Pass { apiKey: '...' } or set CARTESIA_API_KEY in the environment."
5968
+ );
5969
+ }
5970
+ return key;
5971
+ }
5972
+ var TTS4 = class _TTS extends CartesiaTTS {
5973
+ static providerKey = "cartesia_tts";
5974
+ constructor(opts = {}) {
5975
+ const key = resolveApiKey3(opts.apiKey);
5976
+ const { apiKey: _ignored, ...rest } = opts;
5977
+ void _ignored;
5978
+ super(key, rest);
5979
+ }
5980
+ static forTwilio(arg1, arg2) {
5981
+ const opts = typeof arg1 === "string" ? { apiKey: arg1, ...arg2 ?? {} } : arg1 ?? {};
5982
+ return new _TTS({ ...opts, sampleRate: 8e3 });
5983
+ }
5984
+ static forTelnyx(arg1, arg2) {
5985
+ const opts = typeof arg1 === "string" ? { apiKey: arg1, ...arg2 ?? {} } : arg1 ?? {};
5986
+ return new _TTS({ ...opts, sampleRate: 16e3 });
4904
5987
  }
4905
5988
  };
4906
5989
 
4907
5990
  // src/tts/rime.ts
5991
+ init_esm_shims();
4908
5992
  var TTS5 = class extends RimeTTS {
4909
5993
  static providerKey = "rime";
4910
5994
  constructor(opts = {}) {
@@ -4943,6 +6027,8 @@ var LMNTSampleRate = {
4943
6027
  HZ_24000: 24e3
4944
6028
  };
4945
6029
  var LMNTTTS = class {
6030
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
6031
+ static providerKey = "lmnt";
4946
6032
  apiKey;
4947
6033
  model;
4948
6034
  voice;
@@ -5041,6 +6127,7 @@ init_esm_shims();
5041
6127
  // src/providers/inworld-tts.ts
5042
6128
  init_esm_shims();
5043
6129
  var INWORLD_BASE_URL = "https://api.inworld.ai/tts/v1/voice:stream";
6130
+ var INWORLD_VOICES_URL = "https://api.inworld.ai/tts/v1/voices";
5044
6131
  var InworldModel = {
5045
6132
  TTS_2: "inworld-tts-2",
5046
6133
  TTS_1_5_MAX: "inworld-tts-1.5-max",
@@ -5055,6 +6142,8 @@ var InworldAudioEncoding = {
5055
6142
  MP3: "MP3"
5056
6143
  };
5057
6144
  var InworldTTS = class {
6145
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
6146
+ static providerKey = "inworld";
5058
6147
  authToken;
5059
6148
  model;
5060
6149
  voice;
@@ -5099,6 +6188,45 @@ var InworldTTS = class {
5099
6188
  if (this.deliveryMode !== void 0) payload.deliveryMode = this.deliveryMode;
5100
6189
  return payload;
5101
6190
  }
6191
+ /**
6192
+ * Pre-call HTTP warmup for the Inworld TTS API.
6193
+ *
6194
+ * Issues a lightweight `GET /tts/v1/voices` against the API host so
6195
+ * DNS + TLS + HTTP/2 connection are already up by the time the first
6196
+ * `synthesizeStream()` POST lands. Best-effort: 5 s timeout, all
6197
+ * exceptions swallowed at debug level.
6198
+ *
6199
+ * Earlier revisions issued `HEAD` against the streaming endpoint
6200
+ * (`/tts/v1/voice:stream`). That endpoint is POST-only so HEAD
6201
+ * returns `405 Method Not Allowed` — the warmup still completed the
6202
+ * TLS handshake but spammed 405 errors into Inworld's audit logs and
6203
+ * into our own logs. Switching to a documented `GET /tts/v1/voices`
6204
+ * metadata read is a 2xx-clean equivalent.
6205
+ *
6206
+ * Billing safety: `GET /tts/v1/voices` is a free metadata endpoint
6207
+ * (per https://docs.inworld.ai/). It returns the voice catalogue
6208
+ * without invoking the synthesis pipeline. The actual synthesis is
6209
+ * billed only when `POST /tts/v1/voice:stream` runs with a non-empty
6210
+ * `text`.
6211
+ *
6212
+ * Note: Inworld TTS uses the HTTP NDJSON streaming path rather than
6213
+ * a persistent WebSocket — connection warmup is therefore HTTP-based,
6214
+ * not WebSocket pre-handshake. The latency win is smaller (~50-150 ms)
6215
+ * than the WS-based prewarms but still real on cold-start calls.
6216
+ */
6217
+ async warmup() {
6218
+ try {
6219
+ await fetch(INWORLD_VOICES_URL, {
6220
+ method: "GET",
6221
+ headers: {
6222
+ Authorization: `Basic ${this.authToken}`
6223
+ },
6224
+ signal: AbortSignal.timeout(5e3)
6225
+ });
6226
+ } catch (err) {
6227
+ getLogger().debug(`Inworld TTS warmup failed (best-effort): ${String(err)}`);
6228
+ }
6229
+ }
5102
6230
  /** Synthesize text and return the concatenated audio buffer. */
5103
6231
  async synthesize(text) {
5104
6232
  const chunks = [];
@@ -5238,6 +6366,8 @@ var DEFAULT_MODEL = AnthropicModel.CLAUDE_HAIKU_4_5_20251001;
5238
6366
  var DEFAULT_MAX_TOKENS = 1024;
5239
6367
  var PROMPT_CACHING_BETA = "prompt-caching-2024-07-31";
5240
6368
  var AnthropicLLMProvider = class {
6369
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
6370
+ static providerKey = "anthropic";
5241
6371
  apiKey;
5242
6372
  model;
5243
6373
  maxTokens;
@@ -5259,6 +6389,27 @@ var AnthropicLLMProvider = class {
5259
6389
  this.anthropicVersion = options.anthropicVersion ?? DEFAULT_ANTHROPIC_VERSION;
5260
6390
  this.promptCaching = options.promptCaching ?? true;
5261
6391
  }
6392
+ /**
6393
+ * Pre-call DNS / TLS warmup for the Anthropic Messages API.
6394
+ * Issues a lightweight ``GET https://api.anthropic.com/v1/models`` so
6395
+ * DNS, TLS and HTTP/2 are already up by the time the first ``messages``
6396
+ * call lands. Best-effort: 5 s timeout, exceptions swallowed at debug.
6397
+ */
6398
+ async warmup() {
6399
+ try {
6400
+ const modelsUrl = this.url.replace(/\/messages\/?$/, "/models");
6401
+ await fetch(modelsUrl, {
6402
+ method: "GET",
6403
+ headers: {
6404
+ "x-api-key": this.apiKey,
6405
+ "anthropic-version": this.anthropicVersion
6406
+ },
6407
+ signal: AbortSignal.timeout(5e3)
6408
+ });
6409
+ } catch (err) {
6410
+ getLogger().debug(`Anthropic LLM warmup failed (best-effort): ${String(err)}`);
6411
+ }
6412
+ }
5262
6413
  /** Stream Patter-format LLM chunks for the given OpenAI-style chat history. */
5263
6414
  async *stream(messages, tools, opts) {
5264
6415
  const { system, messages: anthropicMessages } = toAnthropicMessages(messages);
@@ -5476,12 +6627,6 @@ init_esm_shims();
5476
6627
 
5477
6628
  // src/providers/groq-llm.ts
5478
6629
  init_esm_shims();
5479
-
5480
- // src/version.ts
5481
- init_esm_shims();
5482
- var VERSION = "0.5.5";
5483
-
5484
- // src/providers/groq-llm.ts
5485
6630
  var GROQ_BASE_URL = "https://api.groq.com/openai/v1";
5486
6631
  var GroqModel = {
5487
6632
  LLAMA_3_3_70B_VERSATILE: "llama-3.3-70b-versatile",
@@ -5494,6 +6639,8 @@ var GroqModel = {
5494
6639
  };
5495
6640
  var DEFAULT_MODEL2 = GroqModel.LLAMA_3_3_70B_VERSATILE;
5496
6641
  var GroqLLMProvider = class {
6642
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
6643
+ static providerKey = "groq";
5497
6644
  apiKey;
5498
6645
  model;
5499
6646
  baseUrl;
@@ -5527,6 +6674,21 @@ var GroqLLMProvider = class {
5527
6674
  this.presencePenalty = options.presencePenalty;
5528
6675
  this.stop = options.stop;
5529
6676
  }
6677
+ /**
6678
+ * Pre-call DNS / TLS warmup for the Groq inference endpoint.
6679
+ * Best-effort: 5 s timeout, all exceptions swallowed at debug level.
6680
+ */
6681
+ async warmup() {
6682
+ try {
6683
+ await fetch(`${this.baseUrl}/models`, {
6684
+ method: "GET",
6685
+ headers: { Authorization: `Bearer ${this.apiKey}` },
6686
+ signal: AbortSignal.timeout(5e3)
6687
+ });
6688
+ } catch (err) {
6689
+ getLogger().debug(`Groq LLM warmup failed (best-effort): ${String(err)}`);
6690
+ }
6691
+ }
5530
6692
  /** Stream Patter-format LLM chunks from the Groq chat completions API. */
5531
6693
  async *stream(messages, tools, opts) {
5532
6694
  const body = {
@@ -5662,6 +6824,8 @@ var CerebrasModel = {
5662
6824
  var DEFAULT_MODEL3 = CerebrasModel.GPT_OSS_120B;
5663
6825
  var RETRY_BACKOFF_BASE_MS = 500;
5664
6826
  var CerebrasLLMProvider = class {
6827
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
6828
+ static providerKey = "cerebras";
5665
6829
  apiKey;
5666
6830
  model;
5667
6831
  baseUrl;
@@ -5697,6 +6861,21 @@ var CerebrasLLMProvider = class {
5697
6861
  this.presencePenalty = options.presencePenalty;
5698
6862
  this.stop = options.stop;
5699
6863
  }
6864
+ /**
6865
+ * Pre-call DNS / TLS warmup for the Cerebras inference endpoint.
6866
+ * Best-effort: 5 s timeout, all exceptions swallowed at debug level.
6867
+ */
6868
+ async warmup() {
6869
+ try {
6870
+ await fetch(`${this.baseUrl}/models`, {
6871
+ method: "GET",
6872
+ headers: { Authorization: `Bearer ${this.apiKey}` },
6873
+ signal: AbortSignal.timeout(5e3)
6874
+ });
6875
+ } catch (err) {
6876
+ getLogger().debug(`Cerebras LLM warmup failed (best-effort): ${String(err)}`);
6877
+ }
6878
+ }
5700
6879
  /** Stream Patter-format LLM chunks from the Cerebras chat completions API. */
5701
6880
  async *stream(messages, tools, opts) {
5702
6881
  const body = {
@@ -5859,6 +7038,8 @@ var GoogleModel = {
5859
7038
  var DEFAULT_MODEL4 = GoogleModel.GEMINI_2_5_FLASH;
5860
7039
  var DEFAULT_BASE_URL3 = "https://generativelanguage.googleapis.com/v1beta";
5861
7040
  var GoogleLLMProvider = class {
7041
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
7042
+ static providerKey = "google";
5862
7043
  apiKey;
5863
7044
  model;
5864
7045
  baseUrl;
@@ -5876,6 +7057,23 @@ var GoogleLLMProvider = class {
5876
7057
  this.temperature = options.temperature;
5877
7058
  this.maxOutputTokens = options.maxOutputTokens;
5878
7059
  }
7060
+ /**
7061
+ * Pre-call DNS / TLS warmup for the Gemini API.
7062
+ * Issues a lightweight ``GET ${baseUrl}/models?key=...`` so DNS, TLS
7063
+ * and HTTP/2 are already up by the time the first
7064
+ * ``streamGenerateContent`` call lands. Best-effort: 5 s timeout, all
7065
+ * exceptions swallowed at debug level.
7066
+ */
7067
+ async warmup() {
7068
+ try {
7069
+ await fetch(`${this.baseUrl}/models?key=${encodeURIComponent(this.apiKey)}`, {
7070
+ method: "GET",
7071
+ signal: AbortSignal.timeout(5e3)
7072
+ });
7073
+ } catch (err) {
7074
+ getLogger().debug(`Google LLM warmup failed (best-effort): ${String(err)}`);
7075
+ }
7076
+ }
5879
7077
  /** Stream Patter-format LLM chunks from the Gemini SSE endpoint. */
5880
7078
  async *stream(messages, tools, opts) {
5881
7079
  const { systemInstruction, contents } = toGeminiContents(messages);
@@ -6065,6 +7263,186 @@ var LLM5 = class extends GoogleLLMProvider {
6065
7263
  }
6066
7264
  };
6067
7265
 
7266
+ // src/providers/deepfilternet-filter.ts
7267
+ init_esm_shims();
7268
/**
 * Return the logger from `getLogger()`. The lookup happens on every call
 * rather than being cached at module load.
 */
function log() {
  const logger = getLogger();
  return logger;
}
7271
+ var DEEPFILTERNET_SR = 48e3;
7272
+ async function loadOnnxRuntime() {
7273
+ try {
7274
+ const specifier = "onnxruntime-node";
7275
+ const mod = await import(specifier);
7276
+ return mod;
7277
+ } catch {
7278
+ return null;
7279
+ }
7280
+ }
7281
/**
 * Decode signed 16-bit little-endian PCM bytes into floats in [-1, 1).
 *
 * Samples are read through a `DataView` with an explicit little-endian
 * flag. Unlike an `Int16Array` view, this works when the buffer view starts
 * at an odd byte offset (e.g. the result of `buf.subarray(1)`) and does not
 * depend on the host byte order, so it stays the inverse of the
 * `writeInt16LE` encoding used by `float32ToPcm16`.
 *
 * @param pcm Buffer / Uint8Array of PCM16 bytes. A trailing odd byte is
 *   ignored.
 * @returns One float per 16-bit sample, each equal to `sample / 32768`.
 */
function pcm16ToFloat32(pcm) {
  const sampleCount = Math.floor(pcm.byteLength / 2);
  const out = new Float32Array(sampleCount);
  const view = new DataView(pcm.buffer, pcm.byteOffset, pcm.byteLength);
  for (let i = 0; i < sampleCount; i += 1) {
    out[i] = view.getInt16(i * 2, true) / 32768;
  }
  return out;
}
7289
/**
 * Encode float samples as signed 16-bit little-endian PCM.
 *
 * Each sample is clamped to [-1, 1], scaled by 32767 and rounded with
 * `Math.round` before being written.
 *
 * @param samples Iterable of float samples with a `length` property.
 * @returns Buffer holding two bytes per input sample.
 */
function float32ToPcm16(samples) {
  const pcm = Buffer.alloc(samples.length * 2);
  let offset = 0;
  for (const sample of samples) {
    const bounded = Math.min(1, Math.max(-1, sample));
    pcm.writeInt16LE(Math.round(bounded * 32767), offset);
    offset += 2;
  }
  return pcm;
}
7297
+ var DeepFilterNetFilter = class {
7298
+ modelPath;
7299
+ silenceWarnings;
7300
+ session = null;
7301
+ ort = null;
7302
+ warned = false;
7303
+ closed = false;
7304
+ // Fix 5: stateful resamplers for src_sr↔48k conversions so chunk-boundary
7305
+ // samples are not discarded. Lazy-created and torn down on rate change.
7306
+ _resamplerSrcRate = null;
7307
+ _upsamplerInst = null;
7308
+ _downsamplerInst = null;
7309
+ constructor(options = {}) {
7310
+ this.modelPath = options.modelPath;
7311
+ this.silenceWarnings = options.silenceWarnings === true;
7312
+ }
7313
+ async ensureSession() {
7314
+ if (this.session !== null) {
7315
+ return this.session;
7316
+ }
7317
+ if (!this.modelPath) {
7318
+ if (!this.warned && !this.silenceWarnings) {
7319
+ log().warn(
7320
+ "DeepFilterNetFilter: no modelPath provided; audio will pass through unmodified. Provide a DeepFilterNet ONNX model to enable noise suppression."
7321
+ );
7322
+ this.warned = true;
7323
+ }
7324
+ return null;
7325
+ }
7326
+ if (this.ort === null) {
7327
+ this.ort = await loadOnnxRuntime();
7328
+ }
7329
+ if (this.ort === null) {
7330
+ if (!this.warned && !this.silenceWarnings) {
7331
+ log().warn(
7332
+ "DeepFilterNetFilter: onnxruntime-node is not installed; audio will pass through unmodified. Run `npm install onnxruntime-node` to enable noise suppression."
7333
+ );
7334
+ this.warned = true;
7335
+ }
7336
+ return null;
7337
+ }
7338
+ try {
7339
+ this.session = await this.ort.InferenceSession.create(this.modelPath);
7340
+ return this.session;
7341
+ } catch (error) {
7342
+ const message = error instanceof Error ? error.message : String(error);
7343
+ log().error(`DeepFilterNetFilter: failed to load model: ${message}`);
7344
+ this.warned = true;
7345
+ return null;
7346
+ }
7347
+ }
7348
+ /** Run noise suppression on a PCM16 chunk; pass-through when no model is loaded. */
7349
+ async process(pcmChunk, sampleRate) {
7350
+ if (this.closed) {
7351
+ throw new Error("DeepFilterNetFilter is closed");
7352
+ }
7353
+ if (pcmChunk.length === 0) {
7354
+ return pcmChunk;
7355
+ }
7356
+ const session = await this.ensureSession();
7357
+ if (session === null || this.ort === null) {
7358
+ return pcmChunk;
7359
+ }
7360
+ try {
7361
+ if (this._resamplerSrcRate !== sampleRate) {
7362
+ this._resamplerSrcRate = sampleRate;
7363
+ this._upsamplerInst = new StatefulResampler({ srcRate: sampleRate, dstRate: DEEPFILTERNET_SR });
7364
+ this._downsamplerInst = new StatefulResampler({ srcRate: DEEPFILTERNET_SR, dstRate: sampleRate });
7365
+ }
7366
+ const samples = pcm16ToFloat32(pcmChunk);
7367
+ const pcm16Up = this._upsamplerInst.process(float32ToPcm16(new Float32Array(samples)));
7368
+ const upsampled = pcm16ToFloat32(pcm16Up);
7369
+ const inputName = session.inputNames[0];
7370
+ const outputName = session.outputNames[0];
7371
+ const tensor = new this.ort.Tensor("float32", upsampled, [1, upsampled.length]);
7372
+ const feeds = { [inputName]: tensor };
7373
+ const results = await session.run(feeds);
7374
+ const output = results[outputName];
7375
+ if (!output || !output.data) {
7376
+ return pcmChunk;
7377
+ }
7378
+ const enhanced = output.data instanceof Float32Array ? output.data : new Float32Array(output.data);
7379
+ const pcm16Enhanced = float32ToPcm16(enhanced);
7380
+ const pcm16Restored = this._downsamplerInst.process(pcm16Enhanced);
7381
+ return pcm16Restored;
7382
+ } catch (error) {
7383
+ const message = error instanceof Error ? error.message : String(error);
7384
+ log().error(`DeepFilterNetFilter.process failed: ${message}`);
7385
+ return pcmChunk;
7386
+ }
7387
+ }
7388
+ /** Flush resamplers, release the ONNX session, and mark the filter closed. */
7389
+ async close() {
7390
+ try {
7391
+ this._upsamplerInst?.flush();
7392
+ } catch {
7393
+ }
7394
+ try {
7395
+ this._downsamplerInst?.flush();
7396
+ } catch {
7397
+ }
7398
+ this._upsamplerInst = null;
7399
+ this._downsamplerInst = null;
7400
+ if (this.session !== null && typeof this.session.release === "function") {
7401
+ try {
7402
+ await this.session.release();
7403
+ } catch (error) {
7404
+ const message = error instanceof Error ? error.message : String(error);
7405
+ log().warn(`DeepFilterNetFilter.close: release failed: ${message}`);
7406
+ }
7407
+ }
7408
+ this.session = null;
7409
+ this.closed = true;
7410
+ }
7411
+ };
7412
+
7413
+ // src/providers/krisp-filter.ts
7414
+ init_esm_shims();
7415
/** Named sample-rate constants; each value is the rate in Hz given by its key. */
var KrispSampleRate = {
  HZ_8000: 8000,
  HZ_16000: 16000,
  HZ_32000: 32000,
  HZ_44100: 44100,
  HZ_48000: 48000
};
/** Named frame-duration constants; each value is the duration in ms given by its key. */
var KrispFrameDuration = {
  MS_10: 10,
  MS_15: 15,
  MS_20: 20,
  MS_30: 30,
  MS_32: 32
};
7429
+ var NODE_SDK_UNAVAILABLE_MESSAGE = "Krisp VIVA Filter is not yet available for the Patter TypeScript SDK.\n\nAs of 2026-05, Krisp does not publish an official Node.js (server) SDK. The Patter TypeScript SDK ships only the AudioFilter interface scaffold (this file) for parity with the Python implementation, since Patter runs server-side on a real-time audio stream from the telephony carrier.\n\nAvailable paths today:\n 1. Use the Python SDK: `from getpatter.providers.krisp_filter import KrispVivaFilter` \u2014 fully implemented, requires `pip install getpatter[krisp]` + `KRISP_VIVA_SDK_LICENSE_KEY` + `KRISP_VIVA_FILTER_MODEL_PATH`.\n 2. Use DeepFilterNet on TS: `new DeepFilterNetFilter({ modelPath: '.../DeepFilterNet3.onnx' })` \u2014 community ONNX export, no license needed.\n\nBrowser/React Native (not applicable to Patter server-side, listed for completeness):\n - Browser WASM wrappers (various third-party packages) process local microphone capture, not server-received PCM/mulaw audio.\n - Mobile client wrappers (iOS/Android, various third-party packages) are likewise client-side only.\n\nTrack Node SDK status:\n - https://krisp.ai/developers/\n - Patter backlog: task #38 \"Krisp TS port decision\"\n";
7430
+ var KrispVivaFilter = class {
7431
+ static providerKey = "krisp_viva";
7432
+ constructor(_options = {}) {
7433
+ throw new Error(NODE_SDK_UNAVAILABLE_MESSAGE);
7434
+ }
7435
+ // The two methods below are unreachable at runtime (constructor throws)
7436
+ // but kept so the class structurally satisfies `AudioFilter`. When the
7437
+ // Node binding lands, replace constructor + these stubs with the real
7438
+ // implementation.
7439
+ async process(pcmChunk, _sampleRate) {
7440
+ return pcmChunk;
7441
+ }
7442
+ async close() {
7443
+ }
7444
+ };
7445
+
6068
7446
  // src/telephony/twilio.ts
6069
7447
  init_esm_shims();
6070
7448
  var Carrier = class {
@@ -6905,12 +8283,28 @@ var TwilioAdapter = class _TwilioAdapter {
6905
8283
  return { callSid: call.sid };
6906
8284
  }
6907
8285
  /**
6908
- * Build a minimal ``<Response><Connect><Stream url="..."/></Connect></Response>``
6909
- * TwiML document. Mirrors the Python adapter's ``generate_stream_twiml``.
8286
+ * Build a ``<Response><Connect><Stream url="...">`` TwiML document.
8287
+ *
8288
+ * ``parameters`` is forwarded as ``<Parameter name="..." value="..."/>``
8289
+ * children of ``<Stream>``. Twilio Media Streams strips query-string params
8290
+ * from the ``<Stream url=...>`` before the WS handshake, so
8291
+ * ``<Parameter>`` tags are the supported way to pre-populate
8292
+ * ``start.customParameters`` on the WS ``start`` frame. Used by the
8293
+ * inbound path to carry caller / callee through to the bridge.
8294
+ *
8295
+ * Mirrors the Python adapter's ``generate_stream_twiml``.
6910
8296
  */
6911
- static generateStreamTwiml(streamUrl) {
6912
- const escaped = streamUrl.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&apos;");
6913
- return `<?xml version="1.0" encoding="UTF-8"?><Response><Connect><Stream url="${escaped}"/></Connect></Response>`;
8297
+ static generateStreamTwiml(streamUrl, parameters) {
8298
+ const esc = (s) => s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&apos;");
8299
+ const escapedUrl = esc(streamUrl);
8300
+ let paramTags = "";
8301
+ if (parameters) {
8302
+ for (const [name, value] of Object.entries(parameters)) {
8303
+ if (value == null) continue;
8304
+ paramTags += `<Parameter name="${esc(name)}" value="${esc(String(value))}"/>`;
8305
+ }
8306
+ }
8307
+ return `<?xml version="1.0" encoding="UTF-8"?><Response><Connect><Stream url="${escapedUrl}">${paramTags}</Stream></Connect></Response>`;
6914
8308
  }
6915
8309
  /** Force-complete an in-progress call. */
6916
8310
  async endCall(callSid) {
@@ -7100,6 +8494,8 @@ var TelnyxSTT = class {
7100
8494
  transcriptionEngine;
7101
8495
  sampleRate;
7102
8496
  baseUrl;
8497
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
8498
+ static providerKey = "telnyx_stt";
7103
8499
  ws = null;
7104
8500
  callbacks = [];
7105
8501
  headerSent = false;
@@ -7204,6 +8600,8 @@ var TelnyxTTS = class {
7204
8600
  apiKey;
7205
8601
  voice;
7206
8602
  baseUrl;
8603
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
8604
+ static providerKey = "telnyx_tts";
7207
8605
  /** Collect every audio chunk into a single Buffer. */
7208
8606
  async synthesize(text) {
7209
8607
  const chunks = [];
@@ -7299,16 +8697,23 @@ export {
7299
8697
  CallMetricsAccumulator,
7300
8698
  STT4 as CartesiaSTT,
7301
8699
  TTS4 as CartesiaTTS,
8700
+ CartesiaTTSModel,
8701
+ CartesiaTTSVoiceMode,
7302
8702
  LLM4 as CerebrasLLM,
7303
8703
  ChatContext,
7304
8704
  CloudflareTunnel,
7305
8705
  DEFAULT_MIN_SENTENCE_LEN,
7306
8706
  DEFAULT_PRICING,
7307
8707
  DTMF_EVENTS,
8708
+ DeepFilterNetFilter,
8709
+ DeepgramModel,
7308
8710
  STT as DeepgramSTT,
7309
8711
  DefaultToolExecutor,
7310
8712
  ConvAI as ElevenLabsConvAI,
7311
8713
  ElevenLabsConvAIAdapter,
8714
+ ElevenLabsModel,
8715
+ ElevenLabsOutputFormat,
8716
+ ElevenLabsTTS as ElevenLabsRestTTS,
7312
8717
  TTS as ElevenLabsTTS,
7313
8718
  TTS2 as ElevenLabsWebSocketTTS,
7314
8719
  ErrorCode,
@@ -7322,16 +8727,29 @@ export {
7322
8727
  Guardrail,
7323
8728
  IVRActivity,
7324
8729
  TTS7 as InworldTTS,
8730
+ KrispFrameDuration,
8731
+ KrispSampleRate,
8732
+ KrispVivaFilter,
7325
8733
  LLMLoop,
7326
8734
  TTS6 as LMNTTTS,
7327
8735
  MetricsStore,
8736
+ MinWordsStrategy,
7328
8737
  Ngrok,
7329
8738
  LLM as OpenAILLM,
7330
8739
  OpenAILLMProvider,
7331
8740
  Realtime as OpenAIRealtime,
8741
+ Realtime2 as OpenAIRealtime2,
8742
+ OpenAIRealtime2Adapter,
7332
8743
  OpenAIRealtimeAdapter,
8744
+ OpenAIRealtimeAudioFormat,
8745
+ OpenAIRealtimeModel,
8746
+ OpenAIRealtimeVADType,
7333
8747
  TTS3 as OpenAITTS,
7334
8748
  STT3 as OpenAITranscribeSTT,
8749
+ OpenAITranscriptionModel,
8750
+ OpenAIVoice,
8751
+ PRICING_LAST_UPDATED,
8752
+ PRICING_VERSION,
7335
8753
  PartialStreamError,
7336
8754
  Patter,
7337
8755
  PatterConnectionError,
@@ -7339,9 +8757,12 @@ export {
7339
8757
  PatterTool,
7340
8758
  PcmCarry,
7341
8759
  PipelineHookExecutor,
8760
+ PricingUnit,
7342
8761
  ProvisionError,
7343
8762
  RateLimitError,
7344
8763
  RemoteMessageHandler,
8764
+ RimeAudioFormat,
8765
+ RimeModel,
7345
8766
  TTS5 as RimeTTS,
7346
8767
  SPAN_BARGEIN,
7347
8768
  SPAN_CALL,
@@ -7395,6 +8816,7 @@ export {
7395
8816
  deepgram,
7396
8817
  defineTool,
7397
8818
  elevenlabs,
8819
+ evaluateStrategies as evaluateBargeInStrategies,
7398
8820
  filterEmoji,
7399
8821
  filterForTTS,
7400
8822
  filterMarkdown,
@@ -7420,6 +8842,7 @@ export {
7420
8842
  resample24kTo16k,
7421
8843
  resample8kTo16k,
7422
8844
  resamplePcm,
8845
+ resetStrategies as resetBargeInStrategies,
7423
8846
  rime,
7424
8847
  scheduleCron,
7425
8848
  scheduleInterval,