getpatter 0.6.1 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -4,8 +4,10 @@ import {
4
4
  import {
5
5
  AuthenticationError,
6
6
  CallMetricsAccumulator,
7
+ Carrier,
7
8
  DEFAULT_MIN_SENTENCE_LEN,
8
9
  DEFAULT_PRICING,
10
+ DeepgramModel,
9
11
  DeepgramSTT,
10
12
  DefaultToolExecutor,
11
13
  ElevenLabsConvAIAdapter,
@@ -15,12 +17,13 @@ import {
15
17
  LLMLoop,
16
18
  MetricsStore,
17
19
  OpenAILLMProvider,
18
- OpenAIRealtime2Adapter,
19
- OpenAIRealtimeAdapter,
20
+ PRICING_LAST_UPDATED,
21
+ PRICING_VERSION,
20
22
  PatterConnectionError,
21
23
  PatterError,
22
- PcmCarry,
23
24
  PipelineHookExecutor,
25
+ PlivoAdapter,
26
+ PricingUnit,
24
27
  ProvisionError,
25
28
  RateLimitError,
26
29
  RemoteMessageHandler,
@@ -32,18 +35,14 @@ import {
32
35
  SPAN_TOOL,
33
36
  SPAN_TTS,
34
37
  SentenceChunker,
35
- StatefulResampler,
36
38
  TestSession,
39
+ VERSION,
37
40
  calculateRealtimeCost,
38
41
  calculateSttCost,
39
42
  calculateTelephonyCost,
40
43
  calculateTtsCost,
41
44
  callsToCsv,
42
45
  callsToJson,
43
- createResampler16kTo8k,
44
- createResampler24kTo16k,
45
- createResampler24kTo8k,
46
- createResampler8kTo16k,
47
46
  initTracing,
48
47
  isRemoteUrl,
49
48
  isTracingEnabled,
@@ -53,14 +52,29 @@ import {
53
52
  mergePricing,
54
53
  mountApi,
55
54
  mountDashboard,
55
+ resolveLogRoot,
56
+ startSpan
57
+ } from "./chunk-Z6W5XFWS.mjs";
58
+ import {
59
+ OpenAIRealtime2Adapter,
60
+ OpenAIRealtimeAdapter,
61
+ OpenAIRealtimeAudioFormat,
62
+ OpenAIRealtimeModel,
63
+ OpenAIRealtimeVADType,
64
+ OpenAITranscriptionModel,
65
+ OpenAIVoice,
66
+ PcmCarry,
67
+ StatefulResampler,
68
+ createResampler16kTo8k,
69
+ createResampler24kTo16k,
70
+ createResampler24kTo8k,
71
+ createResampler8kTo16k,
56
72
  mulawToPcm16,
57
73
  pcm16ToMulaw,
58
74
  resample16kTo8k,
59
75
  resample24kTo16k,
60
- resample8kTo16k,
61
- resolveLogRoot,
62
- startSpan
63
- } from "./chunk-TEW3NAZJ.mjs";
76
+ resample8kTo16k
77
+ } from "./chunk-CL2U3YET.mjs";
64
78
  import {
65
79
  MinWordsStrategy,
66
80
  evaluateStrategies,
@@ -75,7 +89,7 @@ import {
75
89
  } from "./chunk-6GR5MHHQ.mjs";
76
90
  import {
77
91
  SileroVAD
78
- } from "./chunk-RV7APPYE.mjs";
92
+ } from "./chunk-R2T4JABZ.mjs";
79
93
  import {
80
94
  __dirname,
81
95
  __require,
@@ -105,7 +119,7 @@ var Realtime = class {
105
119
  );
106
120
  }
107
121
  this.apiKey = key;
108
- this.model = opts.model ?? "gpt-4o-mini-realtime-preview";
122
+ this.model = opts.model ?? "gpt-realtime-mini";
109
123
  this.voice = opts.voice ?? "alloy";
110
124
  this.reasoningEffort = opts.reasoningEffort;
111
125
  this.inputAudioTranscriptionModel = opts.inputAudioTranscriptionModel;
@@ -557,7 +571,9 @@ function resolvePersistRoot(persist) {
557
571
  if (persist === false) return null;
558
572
  if (persist === true) return resolveLogRoot("auto");
559
573
  if (typeof persist === "string") return resolveLogRoot(persist);
560
- return resolveLogRoot();
574
+ const envRoot = resolveLogRoot();
575
+ if (envRoot !== null) return envRoot;
576
+ return resolveLogRoot("auto");
561
577
  }
562
578
  function closeParkedConnections(slot) {
563
579
  if (slot.stt) {
@@ -573,6 +589,11 @@ function closeParkedConnections(slot) {
573
589
  }
574
590
  }
575
591
  if (slot.openaiRealtime) {
592
+ const wsAny = slot.openaiRealtime;
593
+ if (wsAny._parkedKeepalive) {
594
+ clearInterval(wsAny._parkedKeepalive);
595
+ delete wsAny._parkedKeepalive;
596
+ }
576
597
  try {
577
598
  slot.openaiRealtime.close();
578
599
  } catch {
@@ -780,7 +801,7 @@ var Patter = class {
780
801
  }
781
802
  if (!options.carrier) {
782
803
  throw new Error(
783
- "Local mode requires a `carrier` instance. Pass `carrier: new Twilio({...})` or `carrier: new Telnyx({...})`."
804
+ "Local mode requires a `carrier` instance. Pass `carrier: new Twilio({...})`, `carrier: new Telnyx({...})` or `carrier: new Plivo({...})`."
784
805
  );
785
806
  }
786
807
  const carrier = options.carrier;
@@ -958,16 +979,18 @@ var Patter = class {
958
979
  throw err;
959
980
  }
960
981
  const carrier = this.localConfig.carrier;
961
- const telephonyProvider = carrier.kind === "twilio" ? "twilio" : "telnyx";
982
+ const telephonyProvider = carrier.kind;
962
983
  const wantsCarrierManagement = opts.manageWebhook !== false || wantsCloudflared;
963
984
  if (wantsCarrierManagement) {
964
- const { autoConfigureCarrier } = await import("./carrier-config-4ZKVYAWV.mjs");
985
+ const { autoConfigureCarrier } = await import("./carrier-config-3WDQXP5J.mjs");
965
986
  await autoConfigureCarrier({
966
987
  telephonyProvider,
967
988
  twilioSid: carrier.kind === "twilio" ? carrier.accountSid : void 0,
968
989
  twilioToken: carrier.kind === "twilio" ? carrier.authToken : void 0,
969
990
  telnyxKey: carrier.kind === "telnyx" ? carrier.apiKey : void 0,
970
991
  telnyxConnectionId: carrier.kind === "telnyx" ? carrier.connectionId : void 0,
992
+ plivoAuthId: carrier.kind === "plivo" ? carrier.authId : void 0,
993
+ plivoAuthToken: carrier.kind === "plivo" ? carrier.authToken : void 0,
971
994
  phoneNumber: this.localConfig.phoneNumber,
972
995
  webhookHost: webhookUrl
973
996
  });
@@ -983,6 +1006,8 @@ var Patter = class {
983
1006
  telnyxKey: carrier.kind === "telnyx" ? carrier.apiKey : void 0,
984
1007
  telnyxConnectionId: carrier.kind === "telnyx" ? carrier.connectionId : void 0,
985
1008
  telnyxPublicKey: carrier.kind === "telnyx" ? carrier.publicKey : void 0,
1009
+ plivoAuthId: carrier.kind === "plivo" ? carrier.authId : void 0,
1010
+ plivoAuthToken: carrier.kind === "plivo" ? carrier.authToken : void 0,
986
1011
  persistRoot: this.localConfig.persistRoot
987
1012
  },
988
1013
  opts.agent,
@@ -1014,7 +1039,7 @@ var Patter = class {
1014
1039
  }
1015
1040
  /** Run the agent in interactive terminal-test mode (no real telephony). */
1016
1041
  async test(opts) {
1017
- const { TestSession: TestSession2 } = await import("./test-mode-WEKKNBLD.mjs");
1042
+ const { TestSession: TestSession2 } = await import("./test-mode-MDBQ4ECE.mjs");
1018
1043
  const session = new TestSession2();
1019
1044
  await session.run({
1020
1045
  agent: opts.agent,
@@ -1144,7 +1169,9 @@ var Patter = class {
1144
1169
  const tts = agent.tts;
1145
1170
  const sttOpen = typeof stt?.openParkedConnection === "function" ? stt.openParkedConnection.bind(stt) : null;
1146
1171
  const ttsOpen = typeof tts?.openParkedConnection === "function" ? tts.openParkedConnection.bind(tts) : null;
1147
- if (!sttOpen && !ttsOpen) return;
1172
+ const providerStr = agent.provider ?? "";
1173
+ const wantsRealtimePark = providerStr === "openai_realtime" || providerStr === "openai_realtime_2";
1174
+ if (!sttOpen && !ttsOpen && !wantsRealtimePark) return;
1148
1175
  const slot = {};
1149
1176
  this.prewarmedConnections.set(callId, slot);
1150
1177
  const startedAt = Date.now();
@@ -1189,6 +1216,43 @@ var Patter = class {
1189
1216
  }
1190
1217
  })());
1191
1218
  }
1219
+ if (wantsRealtimePark) {
1220
+ tasks.push((async () => {
1221
+ const { OpenAIRealtime2Adapter: OpenAIRealtime2Adapter2 } = await import("./openai-realtime-2-CNFARP25.mjs");
1222
+ const apiKey = process.env.OPENAI_API_KEY ?? "";
1223
+ if (!apiKey) {
1224
+ getLogger().debug(`Park OpenAI Realtime skipped for ${callId}: no OPENAI_API_KEY`);
1225
+ return;
1226
+ }
1227
+ try {
1228
+ const tmpAdapter = new OpenAIRealtime2Adapter2(
1229
+ apiKey,
1230
+ agent.model ?? "gpt-realtime-mini",
1231
+ agent.voice ?? "alloy",
1232
+ agent.systemPrompt ?? "",
1233
+ [],
1234
+ // audioFormat — the GA adapter always emits audio/pcm@24000
1235
+ // internally regardless of this value, but it's a required
1236
+ // positional param. Default to g711_ulaw (Twilio wire format).
1237
+ void 0
1238
+ );
1239
+ const ws = await tmpAdapter.openParkedConnection();
1240
+ if (this.prewarmedConnections.get(callId) !== slot) {
1241
+ try {
1242
+ ws.close();
1243
+ } catch {
1244
+ }
1245
+ return;
1246
+ }
1247
+ slot.openaiRealtime = ws;
1248
+ getLogger().info(
1249
+ `[PREWARM] callId=${callId} provider=openai_realtime ms=${Date.now() - startedAt}`
1250
+ );
1251
+ } catch (err) {
1252
+ getLogger().debug(`Park OpenAI Realtime failed for ${callId}: ${String(err)}`);
1253
+ }
1254
+ })());
1255
+ }
1192
1256
  const task = (async () => {
1193
1257
  await Promise.allSettled(tasks);
1194
1258
  })();
@@ -1266,7 +1330,7 @@ var Patter = class {
1266
1330
  * with a warn when the cap is reached (the call still proceeds —
1267
1331
  * StreamHandler falls back to live TTS).
1268
1332
  */
1269
- spawnPrewarmFirstMessage(agent, callId, ringTimeout) {
1333
+ spawnPrewarmFirstMessage(agent, callId, ringTimeout, carrier) {
1270
1334
  if (!agent.prewarmFirstMessage) return;
1271
1335
  const providerMode = agent.provider ?? "openai_realtime";
1272
1336
  if (providerMode !== "pipeline") {
@@ -1279,6 +1343,18 @@ var Patter = class {
1279
1343
  const tts = agent.tts;
1280
1344
  if (!firstMessage || !tts) return;
1281
1345
  if (typeof tts.synthesizeStream !== "function") return;
1346
+ if (carrier) {
1347
+ const carrierAware = tts;
1348
+ if (typeof carrierAware.setTelephonyCarrier === "function") {
1349
+ try {
1350
+ carrierAware.setTelephonyCarrier(carrier);
1351
+ } catch (err) {
1352
+ getLogger().debug(
1353
+ `Prewarm TTS setTelephonyCarrier failed for ${callId}: ${String(err)}`
1354
+ );
1355
+ }
1356
+ }
1357
+ }
1282
1358
  const inFlight = this.prewarmAudio.size + this.prewarmTasks.size;
1283
1359
  if (inFlight >= PREWARM_CACHE_MAX) {
1284
1360
  getLogger().warn(
@@ -1343,7 +1419,15 @@ var Patter = class {
1343
1419
  this.prewarmTtlTimers.set(callId, handle);
1344
1420
  });
1345
1421
  }
1346
- /** Place an outbound call via the configured carrier. */
1422
+ /**
1423
+ * Place an outbound call via the configured carrier.
1424
+ *
1425
+ * With `wait: false` (default) this resolves to `void` the instant the
1426
+ * carrier accepts the dial (fire-and-forget). With `wait: true` it blocks
1427
+ * until the call reaches a terminal state and resolves to a
1428
+ * {@link CallResult} — see {@link LocalCallOptions.wait}. Mirrors Python's
1429
+ * `Patter.call(..., wait=False)`.
1430
+ */
1347
1431
  async call(options) {
1348
1432
  if (!options.to) {
1349
1433
  throw new Error("'to' phone number is required");
@@ -1351,7 +1435,13 @@ var Patter = class {
1351
1435
  if (!options.to.startsWith("+")) {
1352
1436
  throw new Error(`'to' must be in E.164 format (e.g., '+1234567890'). Got: '${options.to}'`);
1353
1437
  }
1438
+ if (options.wait && !this.embeddedServer) {
1439
+ throw new PatterConnectionError(
1440
+ "call({ wait: true }) requires an active server to receive the carrier completion webhooks. Call `await phone.serve(...)` first, or use `await using phone = new Patter(...)` (and serve inside the block) which keeps the server up for the duration of the block."
1441
+ );
1442
+ }
1354
1443
  const { phoneNumber, webhookUrl, carrier } = this.localConfig;
1444
+ let callId = "";
1355
1445
  const effectiveRingTimeout = options.ringTimeout === void 0 ? 25 : options.ringTimeout;
1356
1446
  const wantsAmd = options.machineDetection !== false || Boolean(options.voicemailMessage);
1357
1447
  if (this.embeddedServer) {
@@ -1391,20 +1481,92 @@ var Patter = class {
1391
1481
  telnyxCallId = body.data?.call_control_id;
1392
1482
  } catch {
1393
1483
  }
1394
- if (this.embeddedServer && telnyxCallId) {
1395
- this.embeddedServer.metricsStore.recordCallInitiated({
1484
+ if (telnyxCallId) {
1485
+ const initiatedPayload = {
1396
1486
  call_id: telnyxCallId,
1397
1487
  caller: phoneNumber,
1398
1488
  callee: options.to,
1399
- direction: "outbound"
1400
- });
1489
+ direction: "outbound",
1490
+ status: "initiated"
1491
+ };
1492
+ if (this.embeddedServer) {
1493
+ this.embeddedServer.metricsStore.recordCallInitiated(initiatedPayload);
1494
+ }
1495
+ try {
1496
+ const { notifyDashboard: notifyDashboard2 } = await import("./persistence-LVIAHESK.mjs");
1497
+ notifyDashboard2(initiatedPayload);
1498
+ } catch {
1499
+ }
1401
1500
  }
1402
1501
  if (telnyxCallId) {
1403
- this.spawnPrewarmFirstMessage(options.agent, telnyxCallId, effectiveRingTimeout);
1502
+ callId = telnyxCallId;
1503
+ this.spawnPrewarmFirstMessage(options.agent, telnyxCallId, effectiveRingTimeout, "telnyx");
1404
1504
  if (options.agent.prewarm !== false) {
1405
1505
  this.parkProviderConnections(options.agent, telnyxCallId);
1406
1506
  }
1407
1507
  }
1508
+ return this.maybeAwaitCompletion(options, callId, effectiveRingTimeout);
1509
+ }
1510
+ if (carrier.kind === "plivo") {
1511
+ const auth = `Basic ${Buffer.from(`${carrier.authId}:${carrier.authToken}`).toString("base64")}`;
1512
+ const plivoPayload = {
1513
+ from: phoneNumber,
1514
+ to: options.to,
1515
+ answer_url: `https://${webhookUrl}/webhooks/plivo/voice`,
1516
+ answer_method: "POST",
1517
+ // hangup_url is Plivo's StatusCallback analogue — without it the
1518
+ // /webhooks/plivo/status route never fires for outbound calls and
1519
+ // the dashboard misses no-answer / busy / failed.
1520
+ hangup_url: `https://${webhookUrl}/webhooks/plivo/status`,
1521
+ hangup_method: "POST"
1522
+ };
1523
+ if (effectiveRingTimeout !== null && effectiveRingTimeout !== void 0) {
1524
+ plivoPayload.ring_timeout = Math.max(1, Math.floor(effectiveRingTimeout));
1525
+ }
1526
+ if (wantsAmd) {
1527
+ plivoPayload.machine_detection = "true";
1528
+ plivoPayload.machine_detection_time = 5e3;
1529
+ plivoPayload.machine_detection_url = `https://${webhookUrl}/webhooks/plivo/amd`;
1530
+ plivoPayload.machine_detection_method = "POST";
1531
+ }
1532
+ if (options.voicemailMessage && this.embeddedServer) {
1533
+ this.embeddedServer.voicemailMessage = options.voicemailMessage;
1534
+ }
1535
+ const response2 = await fetch(`https://api.plivo.com/v1/Account/${carrier.authId}/Call/`, {
1536
+ method: "POST",
1537
+ headers: { "Content-Type": "application/json", Authorization: auth },
1538
+ body: JSON.stringify(plivoPayload)
1539
+ });
1540
+ if (!response2.ok) {
1541
+ throw new ProvisionError(`Failed to initiate Plivo call: ${await response2.text()}`);
1542
+ }
1543
+ let plivoCallId;
1544
+ try {
1545
+ const body = await response2.clone().json();
1546
+ plivoCallId = body.request_uuid;
1547
+ } catch {
1548
+ }
1549
+ if (plivoCallId) {
1550
+ const initiatedPayload = {
1551
+ call_id: plivoCallId,
1552
+ caller: phoneNumber,
1553
+ callee: options.to,
1554
+ direction: "outbound",
1555
+ status: "initiated"
1556
+ };
1557
+ if (this.embeddedServer) {
1558
+ this.embeddedServer.metricsStore.recordCallInitiated(initiatedPayload);
1559
+ }
1560
+ try {
1561
+ const { notifyDashboard: notifyDashboard2 } = await import("./persistence-LVIAHESK.mjs");
1562
+ notifyDashboard2(initiatedPayload);
1563
+ } catch {
1564
+ }
1565
+ this.spawnPrewarmFirstMessage(options.agent, plivoCallId, effectiveRingTimeout, "plivo");
1566
+ if (options.agent.prewarm !== false) {
1567
+ this.parkProviderConnections(options.agent, plivoCallId);
1568
+ }
1569
+ }
1408
1570
  return;
1409
1571
  }
1410
1572
  const twilioSid = carrier.accountSid;
@@ -1453,25 +1615,76 @@ var Patter = class {
1453
1615
  twilioNotificationsPath = body.subresource_uris?.notifications;
1454
1616
  } catch {
1455
1617
  }
1456
- if (this.embeddedServer && twilioCallSid) {
1457
- this.embeddedServer.metricsStore.recordCallInitiated({
1618
+ if (twilioCallSid) {
1619
+ const initiatedPayload = {
1458
1620
  call_id: twilioCallSid,
1459
1621
  caller: phoneNumber,
1460
1622
  callee: options.to,
1461
- direction: "outbound"
1462
- });
1463
- if (twilioNotificationsPath) {
1464
- getLogger().info(
1465
- `Outbound call ${twilioCallSid} placed. Twilio notifications: https://api.twilio.com${twilioNotificationsPath} (check here if the call drops with no audio).`
1466
- );
1623
+ direction: "outbound",
1624
+ status: "initiated"
1625
+ };
1626
+ if (this.embeddedServer) {
1627
+ this.embeddedServer.metricsStore.recordCallInitiated(initiatedPayload);
1628
+ if (twilioNotificationsPath) {
1629
+ getLogger().info(
1630
+ `Outbound call ${twilioCallSid} placed. Twilio notifications: https://api.twilio.com${twilioNotificationsPath} (check here if the call drops with no audio).`
1631
+ );
1632
+ }
1633
+ }
1634
+ try {
1635
+ const { notifyDashboard: notifyDashboard2 } = await import("./persistence-LVIAHESK.mjs");
1636
+ notifyDashboard2(initiatedPayload);
1637
+ } catch {
1467
1638
  }
1468
1639
  }
1469
1640
  if (twilioCallSid) {
1470
- this.spawnPrewarmFirstMessage(options.agent, twilioCallSid, effectiveRingTimeout);
1641
+ callId = twilioCallSid;
1642
+ this.spawnPrewarmFirstMessage(options.agent, twilioCallSid, effectiveRingTimeout, "twilio");
1471
1643
  if (options.agent.prewarm !== false) {
1472
1644
  this.parkProviderConnections(options.agent, twilioCallSid);
1473
1645
  }
1474
1646
  }
1647
+ return this.maybeAwaitCompletion(options, callId, effectiveRingTimeout);
1648
+ }
1649
+ /**
1650
+ * When `options.wait` is set, register a completion promise keyed by the
1651
+ * carrier-issued `callId` and await it (bounded by a backstop timeout).
1652
+ * Otherwise resolve to `void` immediately (fire-and-forget).
1653
+ *
1654
+ * The registration happens here — after the carrier accepted the dial and
1655
+ * issued the id — so the future correlates to the right call. The race
1656
+ * window between `initiateCall` returning and this registration is
1657
+ * harmless: the callee is still ringing, so no terminal signal can fire
1658
+ * before we register. Mirrors the Python `call(wait=True)` tail block.
1659
+ */
1660
+ async maybeAwaitCompletion(options, callId, ringTimeout) {
1661
+ if (!options.wait) return;
1662
+ const server = this.embeddedServer;
1663
+ if (!server || !callId) {
1664
+ throw new PatterConnectionError(
1665
+ "call({ wait: true }): no active server or carrier call id."
1666
+ );
1667
+ }
1668
+ const completion = server.registerCompletion(callId);
1669
+ const backstopMs = ((ringTimeout ?? 25) + 1800) * 1e3;
1670
+ let timer;
1671
+ const backstop = new Promise((_resolve, reject) => {
1672
+ timer = setTimeout(() => {
1673
+ server.deleteCompletion(callId);
1674
+ reject(
1675
+ new PatterConnectionError(
1676
+ `call({ wait: true }): no terminal signal for call ${callId} within ${(backstopMs / 1e3).toFixed(0)}s`,
1677
+ { code: ErrorCode.TIMEOUT }
1678
+ )
1679
+ );
1680
+ }, backstopMs);
1681
+ timer.unref?.();
1682
+ });
1683
+ try {
1684
+ return await Promise.race([completion, backstop]);
1685
+ } finally {
1686
+ if (timer) clearTimeout(timer);
1687
+ }
1475
1688
  }
1476
1689
  /**
1477
1690
  * Stop the embedded server and any running tunnel. Safe to call multiple
@@ -1512,6 +1725,11 @@ var Patter = class {
1512
1725
  this.tunnelHandle = null;
1513
1726
  }
1514
1727
  if (this.embeddedServer) {
1728
+ this.embeddedServer.failPendingCompletions(
1729
+ new PatterConnectionError(
1730
+ "Patter.disconnect() called while a call({ wait: true }) was still in flight."
1731
+ )
1732
+ );
1515
1733
  await this.embeddedServer.stop();
1516
1734
  this.embeddedServer = null;
1517
1735
  }
@@ -1535,6 +1753,30 @@ var Patter = class {
1535
1753
  this._ready.catch(() => {
1536
1754
  });
1537
1755
  }
1756
+ /**
1757
+ * Explicit-resource-management disposer so callers can write
1758
+ * ``await using phone = new Patter(...)`` and have {@link disconnect} run
1759
+ * automatically when the block exits — on the normal path AND when the
1760
+ * body throws. This guarantees the embedded server, any auto-started
1761
+ * tunnel, and in-flight prewarm/TTS work are torn down so a still-running
1762
+ * TTS WebSocket cannot keep the user billed after the block ends, and any
1763
+ * in-flight ``call({ wait: true })`` awaiter is failed rather than left
1764
+ * hanging. ``disconnect()`` is idempotent, so an explicit ``disconnect()``
1765
+ * inside the block is still safe. Mirrors Python's ``async with Patter(...)``.
1766
+ *
1767
+ * Note: this does NOT start the server (``serve()`` blocks until shutdown,
1768
+ * so it cannot run from a disposer) — call ``serve(...)`` inside the block:
1769
+ *
1770
+ * ```ts
1771
+ * await using phone = new Patter({ carrier: new Twilio(), phoneNumber: "+1555..." });
1772
+ * await phone.serve({ agent }); // inbound, or
1773
+ * const result = await phone.call({ to: "+1555...", agent, wait: true });
1774
+ * // disconnect() has run here — nothing left running.
1775
+ * ```
1776
+ */
1777
+ async [Symbol.asyncDispose]() {
1778
+ await this.disconnect();
1779
+ }
1538
1780
  /**
1539
1781
  * Terminate an active call on the configured carrier.
1540
1782
  *
@@ -1589,6 +1831,17 @@ var Patter = class {
1589
1831
  }
1590
1832
  return;
1591
1833
  }
1834
+ if (carrier.kind === "plivo") {
1835
+ const auth = Buffer.from(`${carrier.authId}:${carrier.authToken}`).toString("base64");
1836
+ const res = await fetch(
1837
+ `https://api.plivo.com/v1/Account/${carrier.authId}/Call/${encodeURIComponent(callSid)}/`,
1838
+ { method: "DELETE", headers: { Authorization: `Basic ${auth}` } }
1839
+ );
1840
+ if (!res.ok && res.status !== 404) {
1841
+ throw new Error(`Plivo hangup failed: ${res.status} ${await res.text()}`);
1842
+ }
1843
+ return;
1844
+ }
1592
1845
  throw new Error(`endCall() requires a configured carrier; got kind=${carrier.kind}`);
1593
1846
  }
1594
1847
  };
@@ -1986,7 +2239,6 @@ init_esm_shims();
1986
2239
 
1987
2240
  // src/integrations/patter-tool.ts
1988
2241
  init_esm_shims();
1989
- import { EventEmitter } from "events";
1990
2242
  var PARAMETERS_SCHEMA = {
1991
2243
  type: "object",
1992
2244
  properties: {
@@ -2013,7 +2265,7 @@ var PARAMETERS_SCHEMA = {
2013
2265
  };
2014
2266
  var DEFAULT_NAME = "make_phone_call";
2015
2267
  var DEFAULT_DESCRIPTION = "Place a real outbound phone call. Returns a JSON object with the full transcript, call status, duration in seconds, and cost. Use this when the user asks you to call someone, schedule appointments by phone, or otherwise reach a human via voice.";
2016
- var PatterTool = class _PatterTool {
2268
+ var PatterTool = class {
2017
2269
  name;
2018
2270
  description;
2019
2271
  phone;
@@ -2021,24 +2273,6 @@ var PatterTool = class _PatterTool {
2021
2273
  maxDurationSec;
2022
2274
  recording;
2023
2275
  started = false;
2024
- /** Resolver for the next `call_initiated` SSE event. Only set inside the
2025
- * dial mutex (`dialQueue`), so two parallel `execute()` calls never share
2026
- * it and never lose a dispatch. */
2027
- pendingDial = null;
2028
- /** Mutex that serializes the dial → call_id capture critical section.
2029
- * Each `execute()` chains a continuation onto this promise so the
2030
- * `pendingDial` slot is owned by exactly one caller at a time. */
2031
- dialQueue = Promise.resolve();
2032
- /** Captured SSE listener so `stop()` can detach it (prevents leaks when
2033
- * the underlying Patter instance outlives this tool). */
2034
- sseListener = null;
2035
- /** Captured Patter metrics store, for cleanup in `stop()`. */
2036
- metricsStoreRef = null;
2037
- /** call_id → pending promise machinery. */
2038
- pending = /* @__PURE__ */ new Map();
2039
- bus = new EventEmitter();
2040
- /** How long to wait for the `call_initiated` SSE before failing the dial. */
2041
- static DIAL_CAPTURE_TIMEOUT_MS = 1e4;
2042
2276
  constructor(opts) {
2043
2277
  if (!opts.phone) {
2044
2278
  throw new Error("PatterTool: `phone` (a Patter instance) is required.");
@@ -2082,7 +2316,15 @@ var PatterTool = class _PatterTool {
2082
2316
  };
2083
2317
  }
2084
2318
  // --- Lifecycle ----------------------------------------------------------
2085
- /** Start the underlying Patter server. Idempotent. */
2319
+ /**
2320
+ * Start the underlying Patter server. Idempotent.
2321
+ *
2322
+ * `execute()` relies on `Patter.call({ wait: true })`, which requires an
2323
+ * active server to receive the carrier completion webhooks — that's what
2324
+ * `serve()` provides here. No `onCallEnd` callback is wired: the SDK's own
2325
+ * per-callId completion registry resolves the result, so the user's
2326
+ * `onCallEnd` slot is left free.
2327
+ */
2086
2328
  async start() {
2087
2329
  if (this.started) return;
2088
2330
  if (!this.agent) {
@@ -2094,52 +2336,31 @@ var PatterTool = class _PatterTool {
2094
2336
  await this.phone.serve({
2095
2337
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
2096
2338
  agent: builtAgent,
2097
- recording: this.recording,
2098
- onCallEnd: this.onCallEndHandler.bind(this)
2339
+ recording: this.recording
2099
2340
  });
2100
- const store = this.phone.metricsStore;
2101
- if (!store) {
2102
- throw new Error(
2103
- "PatterTool.start: phone.metricsStore is null after serve() \u2014 is the dashboard disabled?"
2104
- );
2105
- }
2106
- const listener = (event) => {
2107
- if (event.type === "call_initiated" && this.pendingDial) {
2108
- const callId = event.data.call_id || "";
2109
- if (callId) {
2110
- const dispatch = this.pendingDial;
2111
- this.pendingDial = null;
2112
- dispatch(callId);
2113
- }
2114
- }
2115
- };
2116
- store.on("sse", listener);
2117
- this.sseListener = listener;
2118
- this.metricsStoreRef = store;
2119
2341
  this.started = true;
2120
2342
  }
2121
- /** Stop the underlying Patter server (and reject any pending calls). */
2343
+ /** Best-effort shutdown — tear the Patter server down via `disconnect()`. */
2122
2344
  async stop() {
2123
2345
  if (!this.started) return;
2124
- if (this.metricsStoreRef && this.sseListener) {
2125
- this.metricsStoreRef.off("sse", this.sseListener);
2126
- }
2127
- this.sseListener = null;
2128
- this.metricsStoreRef = null;
2129
- this.pendingDial = null;
2130
- for (const [, p] of this.pending) {
2131
- clearTimeout(p.timer);
2132
- p.reject(new Error("PatterTool: shutdown while call pending"));
2133
- }
2134
- this.pending.clear();
2135
- const stoppable = this.phone;
2136
- if (typeof stoppable.stop === "function") {
2137
- await stoppable.stop();
2346
+ const disconnectable = this.phone;
2347
+ if (typeof disconnectable.disconnect === "function") {
2348
+ try {
2349
+ await disconnectable.disconnect();
2350
+ } catch {
2351
+ }
2138
2352
  }
2139
2353
  this.started = false;
2140
2354
  }
2141
2355
  // --- Execution ----------------------------------------------------------
2142
- /** Place an outbound call and resolve once it ends with the transcript and metrics. */
2356
+ /**
2357
+ * Dial outbound, wait for the call to end, return a structured result.
2358
+ *
2359
+ * Thin wrapper over `Patter.call({ wait: true })`: the SDK now owns the
2360
+ * dial → callId → terminal-signal correlation, so this just bounds the wait
2361
+ * with `max_duration_sec` and maps the {@link CallResult} into the tool's
2362
+ * public envelope. Mirrors Python's `PatterTool.execute`.
2363
+ */
2143
2364
  async execute(args) {
2144
2365
  if (!this.started) await this.start();
2145
2366
  if (!args || typeof args.to !== "string" || !args.to.startsWith("+")) {
@@ -2155,55 +2376,32 @@ var PatterTool = class _PatterTool {
2155
2376
  ...args.goal !== void 0 ? { systemPrompt: args.goal } : {},
2156
2377
  ...args.first_message !== void 0 ? { firstMessage: args.first_message } : {}
2157
2378
  });
2158
- const callId = await this.acquireCallId(args.to, overrideAgent);
2159
- return new Promise((resolve, reject) => {
2160
- const timer = setTimeout(() => {
2161
- this.pending.delete(callId);
2162
- reject(new Error(`PatterTool.execute: call ${callId} exceeded ${timeoutSec}s timeout`));
2379
+ let timer;
2380
+ const timeout = new Promise((_resolve, reject) => {
2381
+ timer = setTimeout(() => {
2382
+ reject(
2383
+ new Error(
2384
+ `PatterTool.execute: call to ${args.to} exceeded ${timeoutSec}s timeout`
2385
+ )
2386
+ );
2163
2387
  }, timeoutSec * 1e3);
2164
- this.pending.set(callId, {
2165
- resolve,
2166
- reject,
2167
- timer,
2168
- startedAt: Date.now() / 1e3
2169
- });
2170
- });
2171
- }
2172
- /** Issue the outbound dial under the mutex and return its assigned call_id. */
2173
- async acquireCallId(to, agent) {
2174
- let release;
2175
- const slot = new Promise((r) => {
2176
- release = r;
2388
+ timer.unref?.();
2177
2389
  });
2178
- const previous = this.dialQueue;
2179
- this.dialQueue = previous.then(() => slot);
2180
- await previous;
2181
- let captureTimer = null;
2390
+ let result;
2182
2391
  try {
2183
- const callIdPromise = new Promise((resolve, reject) => {
2184
- this.pendingDial = resolve;
2185
- captureTimer = setTimeout(() => {
2186
- this.pendingDial = null;
2187
- reject(
2188
- new Error(
2189
- `PatterTool.execute: did not observe call_initiated within ${_PatterTool.DIAL_CAPTURE_TIMEOUT_MS}ms`
2190
- )
2191
- );
2192
- }, _PatterTool.DIAL_CAPTURE_TIMEOUT_MS);
2193
- });
2194
- await this.phone.call({
2195
- to,
2196
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
2197
- agent
2198
- });
2199
- const callId = await callIdPromise;
2200
- if (captureTimer) clearTimeout(captureTimer);
2201
- return callId;
2392
+ result = await Promise.race([
2393
+ this.phone.call({
2394
+ to: args.to,
2395
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
2396
+ agent: overrideAgent,
2397
+ wait: true
2398
+ }),
2399
+ timeout
2400
+ ]);
2202
2401
  } finally {
2203
- if (captureTimer) clearTimeout(captureTimer);
2204
- this.pendingDial = null;
2205
- release();
2402
+ if (timer) clearTimeout(timer);
2206
2403
  }
2404
+ return resultFromCallResult(result);
2207
2405
  }
2208
2406
  /**
2209
2407
  * Hermes-style handler: `(args, kwargs) => Promise<string>` returning a JSON
@@ -2221,32 +2419,32 @@ var PatterTool = class _PatterTool {
2221
2419
  }
2222
2420
  };
2223
2421
  }
2224
- // --- Internal: onCallEnd dispatcher -------------------------------------
2225
- async onCallEndHandler(data) {
2226
- const callId = data.call_id || "";
2227
- if (!callId) return;
2228
- const pending = this.pending.get(callId);
2229
- if (!pending) {
2230
- this.bus.emit("orphan_end", { call_id: callId, data });
2231
- return;
2232
- }
2233
- clearTimeout(pending.timer);
2234
- this.pending.delete(callId);
2235
- const metrics = data.metrics && typeof data.metrics === "object" ? data.metrics : null;
2236
- const cost = metrics && typeof metrics.cost === "object" && metrics.cost && typeof metrics.cost.total === "number" ? metrics.cost.total : void 0;
2237
- const duration = typeof metrics?.duration_seconds === "number" ? metrics?.duration_seconds : Math.max(0, Date.now() / 1e3 - pending.startedAt);
2238
- const transcript = Array.isArray(data.transcript) ? data.transcript : [];
2239
- const status = data.status || "completed";
2240
- pending.resolve({
2241
- call_id: callId,
2242
- status,
2243
- duration_seconds: duration,
2244
- cost_usd: cost,
2245
- transcript,
2246
- metrics
2247
- });
2248
- }
2249
2422
  };
2423
+ function resultFromCallResult(result) {
2424
+ if (!result) {
2425
+ return {
2426
+ call_id: "",
2427
+ status: "completed",
2428
+ outcome: "",
2429
+ duration_seconds: 0,
2430
+ cost_usd: void 0,
2431
+ transcript: [],
2432
+ metrics: null
2433
+ };
2434
+ }
2435
+ const costTotal = result.cost?.total;
2436
+ const costUsd = typeof costTotal === "number" ? costTotal : void 0;
2437
+ const metrics = result.metrics ? result.metrics : null;
2438
+ return {
2439
+ call_id: result.callId || "",
2440
+ status: result.status || "completed",
2441
+ outcome: result.outcome || "",
2442
+ duration_seconds: typeof result.durationSeconds === "number" ? result.durationSeconds : 0,
2443
+ cost_usd: costUsd,
2444
+ transcript: result.transcript ? [...result.transcript] : [],
2445
+ metrics
2446
+ };
2447
+ }
2250
2448
 
2251
2449
  // src/providers/gemini-live.ts
2252
2450
  init_esm_shims();
@@ -2764,54 +2962,642 @@ function scheduleInterval(intervalOrOpts, callback) {
2764
2962
  };
2765
2963
  }
2766
2964
 
2767
- // src/stt/deepgram.ts
2965
+ // src/providers/elevenlabs-tts.ts
2768
2966
  init_esm_shims();
2769
- var STT = class extends DeepgramSTT {
2770
- static providerKey = "deepgram";
2771
- constructor(opts = {}) {
2772
- const key = opts.apiKey ?? process.env.DEEPGRAM_API_KEY;
2773
- if (!key) {
2774
- throw new Error(
2775
- "Deepgram STT requires an apiKey. Pass { apiKey: 'dg_...' } or set DEEPGRAM_API_KEY in the environment."
2776
- );
2777
- }
2778
- super(
2779
- key,
2780
- opts.language ?? "en",
2781
- opts.model ?? "nova-3",
2782
- opts.encoding ?? "linear16",
2783
- opts.sampleRate ?? 16e3,
2784
- {
2785
- endpointingMs: opts.endpointingMs ?? 150,
2786
- utteranceEndMs: opts.utteranceEndMs === null ? null : opts.utteranceEndMs ?? 1e3,
2787
- smartFormat: opts.smartFormat ?? true,
2788
- interimResults: opts.interimResults ?? true,
2789
- ...opts.vadEvents !== void 0 ? { vadEvents: opts.vadEvents } : {}
2790
- }
2791
- );
2792
- }
2967
+ var ELEVENLABS_BASE_URL = "https://api.elevenlabs.io/v1";
2968
+ var ELEVENLABS_VOICE_ID_BY_NAME = {
2969
+ rachel: "21m00Tcm4TlvDq8ikWAM",
2970
+ drew: "29vD33N1CtxCmqQRPOHJ",
2971
+ clyde: "2EiwWnXFnvU5JabPnv8n",
2972
+ paul: "5Q0t7uMcjvnagumLfvZi",
2973
+ domi: "AZnzlk1XvdvUeBnXmlld",
2974
+ dave: "CYw3kZ02Hs0563khs1Fj",
2975
+ fin: "D38z5RcWu1voky8WS1ja",
2976
+ bella: "EXAVITQu4vr4xnSDxMaL",
2977
+ antoni: "ErXwobaYiN019PkySvjV",
2978
+ thomas: "GBv7mTt0atIp3Br8iCZE",
2979
+ charlie: "IKne3meq5aSn9XLyUdCD",
2980
+ george: "JBFqnCBsd6RMkjVDRZzb",
2981
+ emily: "LcfcDJNUP1GQjkzn1xUU",
2982
+ elli: "MF3mGyEYCl7XYWbV9V6O",
2983
+ callum: "N2lVS1w4EtoT3dr4eOWO",
2984
+ patrick: "ODq5zmih8GrVes37Dizd",
2985
+ harry: "SOYHLrjzK2X1ezoPC6cr",
2986
+ liam: "TX3LPaxmHKxFdv7VOQHJ",
2987
+ dorothy: "ThT5KcBeYPX3keUQqHPh",
2988
+ josh: "TxGEqnHWrfWFTfGW9XjX",
2989
+ arnold: "VR6AewLTigWG4xSOukaG",
2990
+ charlotte: "XB0fDUnXU5powFXDhCwa",
2991
+ matilda: "XrExE9yKIg1WjnnlVkGX",
2992
+ matthew: "Yko7PKHZNXotIFUBG7I9",
2993
+ james: "ZQe5CZNOzWyzPSCn5a3c",
2994
+ joseph: "Zlb1dXrM653N07WRdFW3",
2995
+ jeremy: "bVMeCyTHy58xNoL34h3p",
2996
+ michael: "flq6f7yk4E4fJM5XTYuZ",
2997
+ ethan: "g5CIjZEefAph4nQFvHAz",
2998
+ gigi: "jBpfuIE2acCO8z3wKNLl",
2999
+ freya: "jsCqWAovK2LkecY7zXl4",
3000
+ brian: "nPczCjzI2devNBz1zQrb",
3001
+ grace: "oWAxZDx7w5VEj9dCyTzz",
3002
+ daniel: "onwK4e9ZLuTAKqWW03F9",
3003
+ lily: "pFZP5JQG7iQjIQuC4Bku",
3004
+ serena: "pMsXgVXv3BLzUgSXRplE",
3005
+ adam: "pNInz6obpgDQGcFmaJgB",
3006
+ nicole: "piTKgcLEGmPE4e6mEKli",
3007
+ bill: "pqHfZKP75CvOlQylNhV4",
3008
+ jessie: "t0jbNlBVZ17f02VDIeMI",
3009
+ ryan: "wViXBPUzp2ZZixB1xQuM",
3010
+ sam: "yoZ06aMxZJJ28mfd3POQ",
3011
+ glinda: "z9fAnlkpzviPz146aGWa",
3012
+ giovanni: "zcAOhNBS3c14rBihAFp1",
3013
+ mimi: "zrHiDhphv9ZnVXBqCLjz",
3014
+ sarah: "EXAVITQu4vr4xnSDxMaL",
3015
+ alloy: "EXAVITQu4vr4xnSDxMaL"
2793
3016
  };
2794
-
2795
- // src/stt/whisper.ts
2796
- init_esm_shims();
2797
-
2798
- // src/providers/whisper-stt.ts
2799
- init_esm_shims();
2800
- var OPENAI_TRANSCRIPTION_URL = "https://api.openai.com/v1/audio/transcriptions";
2801
- var DEFAULT_BUFFER_SIZE = 16e3 * 2;
2802
- var ALLOWED_MODELS = /* @__PURE__ */ new Set(["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"]);
2803
- function wrapPcmInWav(pcm, sampleRate = 16e3, channels = 1, bitsPerSample = 16) {
2804
- const dataSize = pcm.length;
2805
- const header = Buffer.alloc(44);
2806
- header.write("RIFF", 0);
2807
- header.writeUInt32LE(36 + dataSize, 4);
2808
- header.write("WAVE", 8);
2809
- header.write("fmt ", 12);
2810
- header.writeUInt32LE(16, 16);
2811
- header.writeUInt16LE(1, 20);
2812
- header.writeUInt16LE(channels, 22);
2813
- header.writeUInt32LE(sampleRate, 24);
2814
- header.writeUInt32LE(sampleRate * channels * (bitsPerSample / 8), 28);
3017
+ var VOICE_ID_PATTERN = /^[A-Za-z0-9]{20}$/;
3018
+ var CARRIER_NATIVE_FORMAT = {
3019
+ twilio: "ulaw_8000",
3020
+ telnyx: "pcm_16000",
3021
+ // Plivo streams mulaw 8 kHz (we pin contentType in the answer XML).
3022
+ plivo: "ulaw_8000"
3023
+ };
3024
+ function resolveVoiceId(voice) {
3025
+ if (!voice) return voice;
3026
+ if (VOICE_ID_PATTERN.test(voice)) return voice;
3027
+ return ELEVENLABS_VOICE_ID_BY_NAME[voice.toLowerCase()] ?? voice;
3028
+ }
3029
+ var ElevenLabsModel = {
3030
+ V3: "eleven_v3",
3031
+ FLASH_V2_5: "eleven_flash_v2_5",
3032
+ TURBO_V2_5: "eleven_turbo_v2_5",
3033
+ MULTILINGUAL_V2: "eleven_multilingual_v2",
3034
+ MONOLINGUAL_V1: "eleven_monolingual_v1"
3035
+ };
3036
+ var ElevenLabsOutputFormat = {
3037
+ MP3_22050_32: "mp3_22050_32",
3038
+ MP3_44100_32: "mp3_44100_32",
3039
+ MP3_44100_64: "mp3_44100_64",
3040
+ MP3_44100_96: "mp3_44100_96",
3041
+ MP3_44100_128: "mp3_44100_128",
3042
+ MP3_44100_192: "mp3_44100_192",
3043
+ PCM_8000: "pcm_8000",
3044
+ PCM_16000: "pcm_16000",
3045
+ PCM_22050: "pcm_22050",
3046
+ PCM_24000: "pcm_24000",
3047
+ PCM_44100: "pcm_44100",
3048
+ ULAW_8000: "ulaw_8000"
3049
+ };
3050
+ var ElevenLabsTTS = class _ElevenLabsTTS {
3051
+ // Stable pricing/dashboard key — read by stream-handler / metrics via
3052
+ // ``(agent.tts.constructor as any).providerKey``. Without this the cost
3053
+ // calculator falls back to ``constructor.name`` ("ElevenLabsTTS") which
3054
+ // does NOT match the pricing table key "elevenlabs", silently zeroing
3055
+ // TTS cost for callers that construct the raw REST class directly
3056
+ // (exposed at top level as ``ElevenLabsRestTTS``).
3057
+ static providerKey = "elevenlabs";
3058
+ apiKey;
3059
+ voiceId;
3060
+ modelId;
3061
+ _outputFormat;
3062
+ _outputFormatExplicit;
3063
+ voiceSettings;
3064
+ languageCode;
3065
+ chunkSize;
3066
+ /**
3067
+ * Public view of the (possibly auto-flipped) wire format. Read by the
3068
+ * stream-handler to decide whether to skip the client-side resample +
3069
+ * mulaw encode when the bytes are already in the carrier's wire codec.
3070
+ */
3071
+ get outputFormat() {
3072
+ return this._outputFormat;
3073
+ }
3074
+ constructor(apiKey, voiceIdOrOptions = "21m00Tcm4TlvDq8ikWAM", modelId = ElevenLabsModel.FLASH_V2_5, outputFormat = ElevenLabsOutputFormat.PCM_16000) {
3075
+ this.apiKey = apiKey;
3076
+ if (typeof voiceIdOrOptions === "object") {
3077
+ const o = voiceIdOrOptions;
3078
+ this.voiceId = resolveVoiceId(o.voiceId ?? "21m00Tcm4TlvDq8ikWAM");
3079
+ this.modelId = o.modelId ?? ElevenLabsModel.FLASH_V2_5;
3080
+ this._outputFormatExplicit = o.outputFormat !== void 0;
3081
+ this._outputFormat = o.outputFormat ?? ElevenLabsOutputFormat.PCM_16000;
3082
+ this.voiceSettings = o.voiceSettings;
3083
+ this.languageCode = o.languageCode;
3084
+ this.chunkSize = o.chunkSize ?? 4096;
3085
+ } else {
3086
+ this.voiceId = resolveVoiceId(voiceIdOrOptions);
3087
+ this.modelId = modelId;
3088
+ this._outputFormatExplicit = outputFormat !== ElevenLabsOutputFormat.PCM_16000;
3089
+ this._outputFormat = outputFormat;
3090
+ this.voiceSettings = void 0;
3091
+ this.languageCode = void 0;
3092
+ this.chunkSize = 4096;
3093
+ }
3094
+ }
3095
+ /**
3096
+ * Hook called by ``StreamHandler.initPipeline`` to advise the carrier
3097
+ * wire format. When the user did NOT pass an explicit ``outputFormat``,
3098
+ * auto-flip to the carrier's native codec so the audio bytes ElevenLabs
3099
+ * returns are already in Twilio/Telnyx wire format — eliminating the
3100
+ * client-side 16 kHz → 8 kHz resample and PCM → μ-law encode. The
3101
+ * resample/encode chain was a source of audible artifacts on the
3102
+ * prewarmed firstMessage (see 0.6.2 acceptance notes — burst delivery
3103
+ * of resampled audio crackled on the carrier-side jitter buffer).
3104
+ *
3105
+ * No-op when the caller passed an explicit ``outputFormat`` (incl. via
3106
+ * the ``forTwilio`` / ``forTelnyx`` factories) — user wins.
3107
+ *
3108
+ * Parity with {@link ElevenLabsWebSocketTTS.setTelephonyCarrier}.
3109
+ */
3110
+ setTelephonyCarrier(carrier) {
3111
+ if (this._outputFormatExplicit) return;
3112
+ const native = CARRIER_NATIVE_FORMAT[carrier];
3113
+ if (native !== void 0) this._outputFormat = native;
3114
+ }
3115
+ /**
3116
+ * Construct an instance pre-configured for Twilio Media Streams.
3117
+ *
3118
+ * Sets `outputFormat='ulaw_8000'` so ElevenLabs emits μ-law @ 8 kHz
3119
+ * directly — the exact wire format Twilio's media stream uses — letting
3120
+ * the SDK skip the 16 kHz→8 kHz resample and PCM→μ-law conversion in
3121
+ * `TwilioAudioSender`. Saves ~30–80 ms first-byte and per-frame CPU,
3122
+ * and removes a potential aliasing source.
3123
+ *
3124
+ * `voiceSettings` defaults to a low-bandwidth-friendly profile
3125
+ * (speaker boost off, modest stability) which sounds cleaner at 8 kHz
3126
+ * μ-law than the studio default. Pass an explicit object to override.
3127
+ */
3128
+ static forTwilio(apiKey, options = {}) {
3129
+ const voiceSettings = options.voiceSettings ?? {
3130
+ // Speaker boost adds high-frequency emphasis that aliases ugly over an
3131
+ // 8 kHz μ-law line. Slightly higher stability tames the excursions
3132
+ // that compander quantization noise can amplify.
3133
+ stability: 0.6,
3134
+ similarity_boost: 0.75,
3135
+ use_speaker_boost: false
3136
+ };
3137
+ return new _ElevenLabsTTS(apiKey, {
3138
+ ...options,
3139
+ voiceSettings,
3140
+ outputFormat: ElevenLabsOutputFormat.ULAW_8000
3141
+ });
3142
+ }
3143
+ /**
3144
+ * Construct an instance pre-configured for Telnyx bidirectional media.
3145
+ *
3146
+ * Telnyx's default media-streaming codec is L16 PCM @ 16 kHz, which
3147
+ * matches our default Telnyx handler. We pick `pcm_16000` so the audio
3148
+ * flows end-to-end with zero resampling or transcoding.
3149
+ *
3150
+ * Trade-off: if your Telnyx profile is pinned to PCMU/8000 (μ-law),
3151
+ * construct `ElevenLabsTTS` directly with `outputFormat: 'ulaw_8000'`
3152
+ * — Telnyx supports that natively too.
3153
+ */
3154
+ static forTelnyx(apiKey, options = {}) {
3155
+ return new _ElevenLabsTTS(apiKey, {
3156
+ ...options,
3157
+ outputFormat: ElevenLabsOutputFormat.PCM_16000
3158
+ });
3159
+ }
3160
+ /**
3161
+ * Synthesise text to speech and return the full audio as a single Buffer.
3162
+ *
3163
+ * For large chunks (or when latency matters) call `synthesizeStream` instead.
3164
+ */
3165
+ async synthesize(text) {
3166
+ const chunks = [];
3167
+ for await (const chunk of this.synthesizeStream(text)) {
3168
+ chunks.push(chunk);
3169
+ }
3170
+ return Buffer.concat(chunks);
3171
+ }
3172
+ /**
3173
+ * Synthesise text and yield audio chunks as they arrive (streaming).
3174
+ *
3175
+ * The yielded buffers are raw PCM at 16 kHz (or whatever `outputFormat` is
3176
+ * configured to). `chunkSize` controls the maximum yield size — 512 is a
3177
+ * good choice for low-latency telephony.
3178
+ */
3179
+ async *synthesizeStream(text) {
3180
+ const url = `${ELEVENLABS_BASE_URL}/text-to-speech/${encodeURIComponent(this.voiceId)}/stream?output_format=${encodeURIComponent(this._outputFormat)}`;
3181
+ const body = {
3182
+ text,
3183
+ model_id: this.modelId
3184
+ };
3185
+ if (this.voiceSettings) body["voice_settings"] = this.voiceSettings;
3186
+ if (this.languageCode) body["language_code"] = this.languageCode;
3187
+ const response = await fetch(url, {
3188
+ method: "POST",
3189
+ headers: {
3190
+ "xi-api-key": this.apiKey,
3191
+ "Content-Type": "application/json"
3192
+ },
3193
+ body: JSON.stringify(body),
3194
+ signal: AbortSignal.timeout(3e4)
3195
+ });
3196
+ if (!response.ok) {
3197
+ const errBody = await response.text();
3198
+ throw new Error(`ElevenLabs TTS error ${response.status}: ${errBody}`);
3199
+ }
3200
+ if (!response.body) {
3201
+ throw new Error("ElevenLabs TTS: no response body");
3202
+ }
3203
+ const reader = response.body.getReader();
3204
+ try {
3205
+ while (true) {
3206
+ const { done, value } = await reader.read();
3207
+ if (done) break;
3208
+ if (!value || value.length === 0) continue;
3209
+ const buf = Buffer.from(value);
3210
+ for (let offset = 0; offset < buf.length; offset += this.chunkSize) {
3211
+ yield buf.subarray(offset, Math.min(offset + this.chunkSize, buf.length));
3212
+ }
3213
+ }
3214
+ } finally {
3215
+ if (typeof reader.cancel === "function") await reader.cancel().catch(() => {
3216
+ });
3217
+ reader.releaseLock();
3218
+ }
3219
+ }
3220
+ };
3221
+
3222
+ // src/providers/cartesia-tts.ts
3223
+ init_esm_shims();
3224
+ var CARTESIA_BASE_URL = "https://api.cartesia.ai";
3225
+ var CARTESIA_API_VERSION = "2025-04-16";
3226
+ var CARTESIA_DEFAULT_VOICE_ID = "f786b574-daa5-4673-aa0c-cbe3e8534c02";
3227
+ var CartesiaTTSModel = {
3228
+ SONIC_3: "sonic-3",
3229
+ SONIC_2: "sonic-2",
3230
+ SONIC: "sonic"
3231
+ };
3232
+ var CartesiaTTSContainer = {
3233
+ RAW: "raw",
3234
+ WAV: "wav",
3235
+ MP3: "mp3"
3236
+ };
3237
+ var CartesiaTTSEncoding = {
3238
+ PCM_S16LE: "pcm_s16le",
3239
+ PCM_F32LE: "pcm_f32le",
3240
+ PCM_MULAW: "pcm_mulaw",
3241
+ PCM_ALAW: "pcm_alaw"
3242
+ };
3243
+ var CartesiaTTSSampleRate = {
3244
+ HZ_8000: 8e3,
3245
+ HZ_16000: 16e3,
3246
+ HZ_22050: 22050,
3247
+ HZ_24000: 24e3,
3248
+ HZ_44100: 44100
3249
+ };
3250
+ var CartesiaTTSVoiceMode = {
3251
+ ID: "id",
3252
+ EMBEDDING: "embedding"
3253
+ };
3254
+ var CartesiaTTS = class _CartesiaTTS {
3255
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
3256
+ static providerKey = "cartesia_tts";
3257
+ apiKey;
3258
+ model;
3259
+ voice;
3260
+ language;
3261
+ sampleRate;
3262
+ speed;
3263
+ emotion;
3264
+ volume;
3265
+ baseUrl;
3266
+ apiVersion;
3267
+ constructor(apiKey, opts = {}) {
3268
+ this.apiKey = apiKey;
3269
+ this.model = opts.model ?? CartesiaTTSModel.SONIC_3;
3270
+ this.voice = opts.voice ?? CARTESIA_DEFAULT_VOICE_ID;
3271
+ this.language = opts.language ?? "en";
3272
+ this.sampleRate = opts.sampleRate ?? CartesiaTTSSampleRate.HZ_16000;
3273
+ this.speed = opts.speed;
3274
+ this.emotion = typeof opts.emotion === "string" ? [opts.emotion] : opts.emotion;
3275
+ this.volume = opts.volume;
3276
+ this.baseUrl = opts.baseUrl ?? CARTESIA_BASE_URL;
3277
+ this.apiVersion = opts.apiVersion ?? CARTESIA_API_VERSION;
3278
+ }
3279
+ /**
3280
+ * Construct an instance pre-configured for Twilio Media Streams.
3281
+ *
3282
+ * Sets `sampleRate=8000` so Cartesia emits PCM_S16LE @ 8 kHz directly.
3283
+ * Twilio's media stream uses μ-law @ 8 kHz so the SDK still does the
3284
+ * PCM → μ-law transcode client-side, but the 16 kHz → 8 kHz resample
3285
+ * step is skipped. Saves ~10–30 ms first-byte plus per-frame CPU and
3286
+ * removes a potential aliasing source.
3287
+ */
3288
+ static forTwilio(apiKey, options = {}) {
3289
+ return new _CartesiaTTS(apiKey, {
3290
+ ...options,
3291
+ sampleRate: CartesiaTTSSampleRate.HZ_8000
3292
+ });
3293
+ }
3294
+ /**
3295
+ * Construct an instance pre-configured for Telnyx bidirectional media.
3296
+ *
3297
+ * Sets `sampleRate=16000` to match Telnyx's L16/16000 default codec —
3298
+ * audio flows end-to-end with zero resampling or transcoding. Same as
3299
+ * the bare-constructor default; exists for API symmetry with
3300
+ * {@link CartesiaTTS.forTwilio}.
3301
+ */
3302
+ static forTelnyx(apiKey, options = {}) {
3303
+ return new _CartesiaTTS(apiKey, {
3304
+ ...options,
3305
+ sampleRate: CartesiaTTSSampleRate.HZ_16000
3306
+ });
3307
+ }
3308
+ /** Build the JSON payload for the Cartesia bytes endpoint. */
3309
+ buildPayload(text) {
3310
+ const payload = {
3311
+ model_id: this.model,
3312
+ voice: { mode: CartesiaTTSVoiceMode.ID, id: this.voice },
3313
+ transcript: text,
3314
+ output_format: {
3315
+ container: CartesiaTTSContainer.RAW,
3316
+ encoding: CartesiaTTSEncoding.PCM_S16LE,
3317
+ sample_rate: this.sampleRate
3318
+ },
3319
+ language: this.language
3320
+ };
3321
+ const generationConfig = {};
3322
+ if (this.speed !== void 0) generationConfig.speed = this.speed;
3323
+ if (this.emotion && this.emotion.length > 0)
3324
+ generationConfig.emotion = this.emotion[0];
3325
+ if (this.volume !== void 0) generationConfig.volume = this.volume;
3326
+ if (Object.keys(generationConfig).length > 0) {
3327
+ payload.generation_config = generationConfig;
3328
+ }
3329
+ return payload;
3330
+ }
3331
+ /**
3332
+ * Pre-call HTTP warmup for the Cartesia `/tts/bytes` endpoint.
3333
+ *
3334
+ * Issues a lightweight `GET <baseUrl>/voices` so DNS, TLS, and HTTP/2
3335
+ * are already up by the time the first `synthesizeStream()` POST
3336
+ * lands. Best-effort: 5 s timeout, all exceptions swallowed at
3337
+ * debug level.
3338
+ *
3339
+ * Billing safety: `GET /voices` is a free metadata read on
3340
+ * Cartesia's REST surface (per https://docs.cartesia.ai). It does
3341
+ * not consume synthesis credits. The actual synthesis is billed
3342
+ * only when `POST /tts/bytes` runs with a non-empty `transcript`.
3343
+ *
3344
+ * Note: Cartesia TTS uses the HTTP path (vs the WebSocket variant
3345
+ * Cartesia also exposes) — connection warmup is therefore HTTP-GET
3346
+ * based, not WebSocket pre-handshake. The latency win is smaller
3347
+ * (~50-150 ms vs the ~200-500 ms of a WS prewarm) but still real.
3348
+ */
3349
+ async warmup() {
3350
+ try {
3351
+ await fetch(`${this.baseUrl}/voices`, {
3352
+ method: "GET",
3353
+ headers: {
3354
+ "X-API-Key": this.apiKey,
3355
+ "Cartesia-Version": this.apiVersion
3356
+ },
3357
+ signal: AbortSignal.timeout(5e3)
3358
+ });
3359
+ } catch (err) {
3360
+ getLogger().debug(`Cartesia TTS warmup failed (best-effort): ${String(err)}`);
3361
+ }
3362
+ }
3363
+ /** Synthesize text and return the concatenated audio buffer. */
3364
+ async synthesize(text) {
3365
+ const chunks = [];
3366
+ for await (const chunk of this.synthesizeStream(text)) {
3367
+ chunks.push(chunk);
3368
+ }
3369
+ return Buffer.concat(chunks);
3370
+ }
3371
+ /**
3372
+ * Synthesize text and yield raw PCM_S16LE chunks at the configured
3373
+ * `sampleRate` as they arrive from Cartesia.
3374
+ */
3375
+ async *synthesizeStream(text) {
3376
+ const response = await fetch(`${this.baseUrl}/tts/bytes`, {
3377
+ method: "POST",
3378
+ headers: {
3379
+ "X-API-Key": this.apiKey,
3380
+ "Cartesia-Version": this.apiVersion,
3381
+ "Content-Type": "application/json"
3382
+ },
3383
+ body: JSON.stringify(this.buildPayload(text)),
3384
+ signal: AbortSignal.timeout(3e4)
3385
+ });
3386
+ if (!response.ok) {
3387
+ const body = await response.text();
3388
+ throw new Error(`Cartesia TTS error ${response.status}: ${body}`);
3389
+ }
3390
+ if (!response.body) {
3391
+ throw new Error("Cartesia TTS: no response body");
3392
+ }
3393
+ const reader = response.body.getReader();
3394
+ try {
3395
+ while (true) {
3396
+ const { done, value } = await reader.read();
3397
+ if (done) break;
3398
+ if (value && value.length > 0) {
3399
+ yield Buffer.from(value);
3400
+ }
3401
+ }
3402
+ } finally {
3403
+ if (typeof reader.cancel === "function")
3404
+ await reader.cancel().catch(() => {
3405
+ });
3406
+ reader.releaseLock();
3407
+ }
3408
+ }
3409
+ };
3410
+
3411
+ // src/providers/rime-tts.ts
3412
+ init_esm_shims();
3413
+ var RIME_BASE_URL = "https://users.rime.ai/v1/rime-tts";
3414
+ var RimeModel = {
3415
+ ARCANA: "arcana",
3416
+ MIST: "mist",
3417
+ MIST_V2: "mistv2"
3418
+ };
3419
+ var RimeAudioFormat = {
3420
+ PCM: "audio/pcm",
3421
+ MP3: "audio/mp3",
3422
+ WAV: "audio/wav",
3423
+ MULAW: "audio/mulaw"
3424
+ };
3425
+ var ARCANA_MODEL_TIMEOUT_MS = 60 * 4 * 1e3;
3426
+ var MIST_MODEL_TIMEOUT_MS = 30 * 1e3;
3427
+ function isMistModel(model) {
3428
+ return model.includes(RimeModel.MIST);
3429
+ }
3430
+ function timeoutForModel(model) {
3431
+ if (model === RimeModel.ARCANA) return ARCANA_MODEL_TIMEOUT_MS;
3432
+ return MIST_MODEL_TIMEOUT_MS;
3433
+ }
3434
+ var RimeTTS = class {
3435
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
3436
+ static providerKey = "rime";
3437
+ apiKey;
3438
+ model;
3439
+ speaker;
3440
+ lang;
3441
+ sampleRate;
3442
+ repetitionPenalty;
3443
+ temperature;
3444
+ topP;
3445
+ maxTokens;
3446
+ speedAlpha;
3447
+ reduceLatency;
3448
+ pauseBetweenBrackets;
3449
+ phonemizeBetweenBrackets;
3450
+ baseUrl;
3451
+ totalTimeoutMs;
3452
+ constructor(apiKey, opts = {}) {
3453
+ this.apiKey = apiKey;
3454
+ this.model = opts.model ?? RimeModel.ARCANA;
3455
+ const defaultSpeaker = isMistModel(this.model) ? "cove" : "astra";
3456
+ this.speaker = opts.speaker ?? defaultSpeaker;
3457
+ this.lang = opts.lang ?? "eng";
3458
+ this.sampleRate = opts.sampleRate ?? 16e3;
3459
+ this.repetitionPenalty = opts.repetitionPenalty;
3460
+ this.temperature = opts.temperature;
3461
+ this.topP = opts.topP;
3462
+ this.maxTokens = opts.maxTokens;
3463
+ this.speedAlpha = opts.speedAlpha;
3464
+ this.reduceLatency = opts.reduceLatency;
3465
+ this.pauseBetweenBrackets = opts.pauseBetweenBrackets;
3466
+ this.phonemizeBetweenBrackets = opts.phonemizeBetweenBrackets;
3467
+ this.baseUrl = opts.baseUrl ?? RIME_BASE_URL;
3468
+ this.totalTimeoutMs = timeoutForModel(this.model);
3469
+ }
3470
+ buildPayload(text) {
3471
+ const payload = {
3472
+ speaker: this.speaker,
3473
+ text,
3474
+ modelId: this.model
3475
+ };
3476
+ if (this.model === RimeModel.ARCANA) {
3477
+ if (this.repetitionPenalty !== void 0)
3478
+ payload.repetition_penalty = this.repetitionPenalty;
3479
+ if (this.temperature !== void 0) payload.temperature = this.temperature;
3480
+ if (this.topP !== void 0) payload.top_p = this.topP;
3481
+ if (this.maxTokens !== void 0) payload.max_tokens = this.maxTokens;
3482
+ payload.lang = this.lang;
3483
+ payload.samplingRate = this.sampleRate;
3484
+ } else if (isMistModel(this.model)) {
3485
+ payload.lang = this.lang;
3486
+ payload.samplingRate = this.sampleRate;
3487
+ if (this.speedAlpha !== void 0) payload.speedAlpha = this.speedAlpha;
3488
+ if (this.model === RimeModel.MIST_V2 && this.reduceLatency !== void 0) {
3489
+ payload.reduceLatency = this.reduceLatency;
3490
+ }
3491
+ if (this.pauseBetweenBrackets !== void 0) {
3492
+ payload.pauseBetweenBrackets = this.pauseBetweenBrackets;
3493
+ }
3494
+ if (this.phonemizeBetweenBrackets !== void 0) {
3495
+ payload.phonemizeBetweenBrackets = this.phonemizeBetweenBrackets;
3496
+ }
3497
+ }
3498
+ return payload;
3499
+ }
3500
+ /** Synthesize text and return the concatenated audio buffer. */
3501
+ async synthesize(text) {
3502
+ const chunks = [];
3503
+ for await (const chunk of this.synthesizeStream(text)) {
3504
+ chunks.push(chunk);
3505
+ }
3506
+ return Buffer.concat(chunks);
3507
+ }
3508
+ /**
3509
+ * Synthesize text and yield raw PCM_S16LE chunks at the configured
3510
+ * `sampleRate` as they stream in.
3511
+ */
3512
+ async *synthesizeStream(text) {
3513
+ const response = await fetch(this.baseUrl, {
3514
+ method: "POST",
3515
+ headers: {
3516
+ accept: RimeAudioFormat.PCM,
3517
+ Authorization: `Bearer ${this.apiKey}`,
3518
+ "content-type": "application/json"
3519
+ },
3520
+ body: JSON.stringify(this.buildPayload(text)),
3521
+ signal: AbortSignal.timeout(this.totalTimeoutMs)
3522
+ });
3523
+ if (!response.ok) {
3524
+ const body = await response.text();
3525
+ throw new Error(`Rime TTS error ${response.status}: ${body}`);
3526
+ }
3527
+ const contentType = response.headers.get("content-type") ?? "";
3528
+ if (!contentType.startsWith("audio")) {
3529
+ const body = await response.text();
3530
+ throw new Error(`Rime returned non-audio response: ${body.slice(0, 500)}`);
3531
+ }
3532
+ if (!response.body) {
3533
+ throw new Error("Rime TTS: no response body");
3534
+ }
3535
+ const reader = response.body.getReader();
3536
+ try {
3537
+ while (true) {
3538
+ const { done, value } = await reader.read();
3539
+ if (done) break;
3540
+ if (value && value.length > 0) {
3541
+ yield Buffer.from(value);
3542
+ }
3543
+ }
3544
+ } finally {
3545
+ if (typeof reader.cancel === "function")
3546
+ await reader.cancel().catch(() => {
3547
+ });
3548
+ reader.releaseLock();
3549
+ }
3550
+ }
3551
+ };
3552
+
3553
+ // src/stt/deepgram.ts
3554
+ init_esm_shims();
3555
+ var STT = class extends DeepgramSTT {
3556
+ static providerKey = "deepgram";
3557
+ constructor(opts = {}) {
3558
+ const key = opts.apiKey ?? process.env.DEEPGRAM_API_KEY;
3559
+ if (!key) {
3560
+ throw new Error(
3561
+ "Deepgram STT requires an apiKey. Pass { apiKey: 'dg_...' } or set DEEPGRAM_API_KEY in the environment."
3562
+ );
3563
+ }
3564
+ super(
3565
+ key,
3566
+ opts.language ?? "en",
3567
+ opts.model ?? "nova-3",
3568
+ opts.encoding ?? "linear16",
3569
+ opts.sampleRate ?? 16e3,
3570
+ {
3571
+ endpointingMs: opts.endpointingMs ?? 150,
3572
+ utteranceEndMs: opts.utteranceEndMs === null ? null : opts.utteranceEndMs ?? 1e3,
3573
+ smartFormat: opts.smartFormat ?? true,
3574
+ interimResults: opts.interimResults ?? true,
3575
+ ...opts.vadEvents !== void 0 ? { vadEvents: opts.vadEvents } : {}
3576
+ }
3577
+ );
3578
+ }
3579
+ };
3580
+
3581
+ // src/stt/whisper.ts
3582
+ init_esm_shims();
3583
+
3584
+ // src/providers/whisper-stt.ts
3585
+ init_esm_shims();
3586
+ var OPENAI_TRANSCRIPTION_URL = "https://api.openai.com/v1/audio/transcriptions";
3587
+ var DEFAULT_BUFFER_SIZE = 16e3 * 2;
3588
+ var ALLOWED_MODELS = /* @__PURE__ */ new Set(["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"]);
3589
+ function wrapPcmInWav(pcm, sampleRate = 16e3, channels = 1, bitsPerSample = 16) {
3590
+ const dataSize = pcm.length;
3591
+ const header = Buffer.alloc(44);
3592
+ header.write("RIFF", 0);
3593
+ header.writeUInt32LE(36 + dataSize, 4);
3594
+ header.write("WAVE", 8);
3595
+ header.write("fmt ", 12);
3596
+ header.writeUInt32LE(16, 16);
3597
+ header.writeUInt16LE(1, 20);
3598
+ header.writeUInt16LE(channels, 22);
3599
+ header.writeUInt32LE(sampleRate, 24);
3600
+ header.writeUInt32LE(sampleRate * channels * (bitsPerSample / 8), 28);
2815
3601
  header.writeUInt16LE(channels * (bitsPerSample / 8), 32);
2816
3602
  header.writeUInt16LE(bitsPerSample, 34);
2817
3603
  header.write("data", 36);
@@ -4448,264 +5234,42 @@ var SpeechmaticsSTT = class {
4448
5234
  close() {
4449
5235
  this.running = false;
4450
5236
  const ws = this.ws;
4451
- if (!ws) return;
4452
- this.ws = null;
4453
- const sendSafe = (payload) => {
4454
- if (ws.readyState === WebSocket5.OPEN) {
4455
- try {
4456
- ws.send(payload);
4457
- } catch {
4458
- }
4459
- }
4460
- };
4461
- sendSafe(
4462
- JSON.stringify({ message: "EndOfStream", last_seq_no: this.lastSeqNo })
4463
- );
4464
- try {
4465
- ws.close();
4466
- } catch {
4467
- }
4468
- }
4469
- };
4470
-
4471
- // src/stt/speechmatics.ts
4472
- var STT7 = class extends SpeechmaticsSTT {
4473
- static providerKey = "speechmatics";
4474
- constructor(opts = {}) {
4475
- const key = opts.apiKey ?? process.env.SPEECHMATICS_API_KEY;
4476
- if (!key) {
4477
- throw new Error(
4478
- "Speechmatics STT requires an apiKey. Pass { apiKey: 'sm_...' } or set SPEECHMATICS_API_KEY in the environment."
4479
- );
4480
- }
4481
- super(key, opts);
4482
- }
4483
- };
4484
-
4485
- // src/tts/elevenlabs.ts
4486
- init_esm_shims();
4487
-
4488
- // src/providers/elevenlabs-tts.ts
4489
- init_esm_shims();
4490
- var ELEVENLABS_BASE_URL = "https://api.elevenlabs.io/v1";
4491
- var ELEVENLABS_VOICE_ID_BY_NAME = {
4492
- rachel: "21m00Tcm4TlvDq8ikWAM",
4493
- drew: "29vD33N1CtxCmqQRPOHJ",
4494
- clyde: "2EiwWnXFnvU5JabPnv8n",
4495
- paul: "5Q0t7uMcjvnagumLfvZi",
4496
- domi: "AZnzlk1XvdvUeBnXmlld",
4497
- dave: "CYw3kZ02Hs0563khs1Fj",
4498
- fin: "D38z5RcWu1voky8WS1ja",
4499
- bella: "EXAVITQu4vr4xnSDxMaL",
4500
- antoni: "ErXwobaYiN019PkySvjV",
4501
- thomas: "GBv7mTt0atIp3Br8iCZE",
4502
- charlie: "IKne3meq5aSn9XLyUdCD",
4503
- george: "JBFqnCBsd6RMkjVDRZzb",
4504
- emily: "LcfcDJNUP1GQjkzn1xUU",
4505
- elli: "MF3mGyEYCl7XYWbV9V6O",
4506
- callum: "N2lVS1w4EtoT3dr4eOWO",
4507
- patrick: "ODq5zmih8GrVes37Dizd",
4508
- harry: "SOYHLrjzK2X1ezoPC6cr",
4509
- liam: "TX3LPaxmHKxFdv7VOQHJ",
4510
- dorothy: "ThT5KcBeYPX3keUQqHPh",
4511
- josh: "TxGEqnHWrfWFTfGW9XjX",
4512
- arnold: "VR6AewLTigWG4xSOukaG",
4513
- charlotte: "XB0fDUnXU5powFXDhCwa",
4514
- matilda: "XrExE9yKIg1WjnnlVkGX",
4515
- matthew: "Yko7PKHZNXotIFUBG7I9",
4516
- james: "ZQe5CZNOzWyzPSCn5a3c",
4517
- joseph: "Zlb1dXrM653N07WRdFW3",
4518
- jeremy: "bVMeCyTHy58xNoL34h3p",
4519
- michael: "flq6f7yk4E4fJM5XTYuZ",
4520
- ethan: "g5CIjZEefAph4nQFvHAz",
4521
- gigi: "jBpfuIE2acCO8z3wKNLl",
4522
- freya: "jsCqWAovK2LkecY7zXl4",
4523
- brian: "nPczCjzI2devNBz1zQrb",
4524
- grace: "oWAxZDx7w5VEj9dCyTzz",
4525
- daniel: "onwK4e9ZLuTAKqWW03F9",
4526
- lily: "pFZP5JQG7iQjIQuC4Bku",
4527
- serena: "pMsXgVXv3BLzUgSXRplE",
4528
- adam: "pNInz6obpgDQGcFmaJgB",
4529
- nicole: "piTKgcLEGmPE4e6mEKli",
4530
- bill: "pqHfZKP75CvOlQylNhV4",
4531
- jessie: "t0jbNlBVZ17f02VDIeMI",
4532
- ryan: "wViXBPUzp2ZZixB1xQuM",
4533
- sam: "yoZ06aMxZJJ28mfd3POQ",
4534
- glinda: "z9fAnlkpzviPz146aGWa",
4535
- giovanni: "zcAOhNBS3c14rBihAFp1",
4536
- mimi: "zrHiDhphv9ZnVXBqCLjz",
4537
- sarah: "EXAVITQu4vr4xnSDxMaL",
4538
- alloy: "EXAVITQu4vr4xnSDxMaL"
4539
- };
4540
- var VOICE_ID_PATTERN = /^[A-Za-z0-9]{20}$/;
4541
- function resolveVoiceId(voice) {
4542
- if (!voice) return voice;
4543
- if (VOICE_ID_PATTERN.test(voice)) return voice;
4544
- return ELEVENLABS_VOICE_ID_BY_NAME[voice.toLowerCase()] ?? voice;
4545
- }
4546
- var ElevenLabsModel = {
4547
- V3: "eleven_v3",
4548
- FLASH_V2_5: "eleven_flash_v2_5",
4549
- TURBO_V2_5: "eleven_turbo_v2_5",
4550
- MULTILINGUAL_V2: "eleven_multilingual_v2",
4551
- MONOLINGUAL_V1: "eleven_monolingual_v1"
4552
- };
4553
- var ElevenLabsOutputFormat = {
4554
- MP3_22050_32: "mp3_22050_32",
4555
- MP3_44100_32: "mp3_44100_32",
4556
- MP3_44100_64: "mp3_44100_64",
4557
- MP3_44100_96: "mp3_44100_96",
4558
- MP3_44100_128: "mp3_44100_128",
4559
- MP3_44100_192: "mp3_44100_192",
4560
- PCM_8000: "pcm_8000",
4561
- PCM_16000: "pcm_16000",
4562
- PCM_22050: "pcm_22050",
4563
- PCM_24000: "pcm_24000",
4564
- PCM_44100: "pcm_44100",
4565
- ULAW_8000: "ulaw_8000"
4566
- };
4567
- var ElevenLabsTTS = class _ElevenLabsTTS {
4568
- // Stable pricing/dashboard key — read by stream-handler / metrics via
4569
- // ``(agent.tts.constructor as any).providerKey``. Without this the cost
4570
- // calculator falls back to ``constructor.name`` ("ElevenLabsTTS") which
4571
- // does NOT match the pricing table key "elevenlabs", silently zeroing
4572
- // TTS cost for callers that construct the raw REST class directly
4573
- // (exposed at top level as ``ElevenLabsRestTTS``).
4574
- static providerKey = "elevenlabs";
4575
- apiKey;
4576
- voiceId;
4577
- modelId;
4578
- outputFormat;
4579
- voiceSettings;
4580
- languageCode;
4581
- chunkSize;
4582
- constructor(apiKey, voiceIdOrOptions = "21m00Tcm4TlvDq8ikWAM", modelId = ElevenLabsModel.FLASH_V2_5, outputFormat = ElevenLabsOutputFormat.PCM_16000) {
4583
- this.apiKey = apiKey;
4584
- if (typeof voiceIdOrOptions === "object") {
4585
- const o = voiceIdOrOptions;
4586
- this.voiceId = resolveVoiceId(o.voiceId ?? "21m00Tcm4TlvDq8ikWAM");
4587
- this.modelId = o.modelId ?? ElevenLabsModel.FLASH_V2_5;
4588
- this.outputFormat = o.outputFormat ?? ElevenLabsOutputFormat.PCM_16000;
4589
- this.voiceSettings = o.voiceSettings;
4590
- this.languageCode = o.languageCode;
4591
- this.chunkSize = o.chunkSize ?? 4096;
4592
- } else {
4593
- this.voiceId = resolveVoiceId(voiceIdOrOptions);
4594
- this.modelId = modelId;
4595
- this.outputFormat = outputFormat;
4596
- this.voiceSettings = void 0;
4597
- this.languageCode = void 0;
4598
- this.chunkSize = 4096;
4599
- }
4600
- }
4601
- /**
4602
- * Construct an instance pre-configured for Twilio Media Streams.
4603
- *
4604
- * Sets `outputFormat='ulaw_8000'` so ElevenLabs emits μ-law @ 8 kHz
4605
- * directly — the exact wire format Twilio's media stream uses — letting
4606
- * the SDK skip the 16 kHz→8 kHz resample and PCM→μ-law conversion in
4607
- * `TwilioAudioSender`. Saves ~30–80 ms first-byte and per-frame CPU,
4608
- * and removes a potential aliasing source.
4609
- *
4610
- * `voiceSettings` defaults to a low-bandwidth-friendly profile
4611
- * (speaker boost off, modest stability) which sounds cleaner at 8 kHz
4612
- * μ-law than the studio default. Pass an explicit object to override.
4613
- */
4614
- static forTwilio(apiKey, options = {}) {
4615
- const voiceSettings = options.voiceSettings ?? {
4616
- // Speaker boost adds high-frequency emphasis that aliases ugly over an
4617
- // 8 kHz μ-law line. Slightly higher stability tames the excursions
4618
- // that compander quantization noise can amplify.
4619
- stability: 0.6,
4620
- similarity_boost: 0.75,
4621
- use_speaker_boost: false
4622
- };
4623
- return new _ElevenLabsTTS(apiKey, {
4624
- ...options,
4625
- voiceSettings,
4626
- outputFormat: ElevenLabsOutputFormat.ULAW_8000
4627
- });
4628
- }
4629
- /**
4630
- * Construct an instance pre-configured for Telnyx bidirectional media.
4631
- *
4632
- * Telnyx's default media-streaming codec is L16 PCM @ 16 kHz, which
4633
- * matches our default Telnyx handler. We pick `pcm_16000` so the audio
4634
- * flows end-to-end with zero resampling or transcoding.
4635
- *
4636
- * Trade-off: if your Telnyx profile is pinned to PCMU/8000 (μ-law),
4637
- * construct `ElevenLabsTTS` directly with `outputFormat: 'ulaw_8000'`
4638
- * — Telnyx supports that natively too.
4639
- */
4640
- static forTelnyx(apiKey, options = {}) {
4641
- return new _ElevenLabsTTS(apiKey, {
4642
- ...options,
4643
- outputFormat: ElevenLabsOutputFormat.PCM_16000
4644
- });
4645
- }
4646
- /**
4647
- * Synthesise text to speech and return the full audio as a single Buffer.
4648
- *
4649
- * For large chunks (or when latency matters) call `synthesizeStream` instead.
4650
- */
4651
- async synthesize(text) {
4652
- const chunks = [];
4653
- for await (const chunk of this.synthesizeStream(text)) {
4654
- chunks.push(chunk);
4655
- }
4656
- return Buffer.concat(chunks);
4657
- }
4658
- /**
4659
- * Synthesise text and yield audio chunks as they arrive (streaming).
4660
- *
4661
- * The yielded buffers are raw PCM at 16 kHz (or whatever `outputFormat` is
4662
- * configured to). `chunkSize` controls the maximum yield size — 512 is a
4663
- * good choice for low-latency telephony.
4664
- */
4665
- async *synthesizeStream(text) {
4666
- const url = `${ELEVENLABS_BASE_URL}/text-to-speech/${encodeURIComponent(this.voiceId)}/stream?output_format=${encodeURIComponent(this.outputFormat)}`;
4667
- const body = {
4668
- text,
4669
- model_id: this.modelId
4670
- };
4671
- if (this.voiceSettings) body["voice_settings"] = this.voiceSettings;
4672
- if (this.languageCode) body["language_code"] = this.languageCode;
4673
- const response = await fetch(url, {
4674
- method: "POST",
4675
- headers: {
4676
- "xi-api-key": this.apiKey,
4677
- "Content-Type": "application/json"
4678
- },
4679
- body: JSON.stringify(body),
4680
- signal: AbortSignal.timeout(3e4)
4681
- });
4682
- if (!response.ok) {
4683
- const errBody = await response.text();
4684
- throw new Error(`ElevenLabs TTS error ${response.status}: ${errBody}`);
4685
- }
4686
- if (!response.body) {
4687
- throw new Error("ElevenLabs TTS: no response body");
4688
- }
4689
- const reader = response.body.getReader();
4690
- try {
4691
- while (true) {
4692
- const { done, value } = await reader.read();
4693
- if (done) break;
4694
- if (!value || value.length === 0) continue;
4695
- const buf = Buffer.from(value);
4696
- for (let offset = 0; offset < buf.length; offset += this.chunkSize) {
4697
- yield buf.subarray(offset, Math.min(offset + this.chunkSize, buf.length));
5237
+ if (!ws) return;
5238
+ this.ws = null;
5239
+ const sendSafe = (payload) => {
5240
+ if (ws.readyState === WebSocket5.OPEN) {
5241
+ try {
5242
+ ws.send(payload);
5243
+ } catch {
4698
5244
  }
4699
5245
  }
4700
- } finally {
4701
- if (typeof reader.cancel === "function") await reader.cancel().catch(() => {
4702
- });
4703
- reader.releaseLock();
5246
+ };
5247
+ sendSafe(
5248
+ JSON.stringify({ message: "EndOfStream", last_seq_no: this.lastSeqNo })
5249
+ );
5250
+ try {
5251
+ ws.close();
5252
+ } catch {
5253
+ }
5254
+ }
5255
+ };
5256
+
5257
+ // src/stt/speechmatics.ts
5258
+ var STT7 = class extends SpeechmaticsSTT {
5259
+ static providerKey = "speechmatics";
5260
+ constructor(opts = {}) {
5261
+ const key = opts.apiKey ?? process.env.SPEECHMATICS_API_KEY;
5262
+ if (!key) {
5263
+ throw new Error(
5264
+ "Speechmatics STT requires an apiKey. Pass { apiKey: 'sm_...' } or set SPEECHMATICS_API_KEY in the environment."
5265
+ );
4704
5266
  }
5267
+ super(key, opts);
4705
5268
  }
4706
5269
  };
4707
5270
 
4708
5271
  // src/tts/elevenlabs.ts
5272
+ init_esm_shims();
4709
5273
  function resolveApiKey(apiKey) {
4710
5274
  const key = apiKey ?? process.env.ELEVENLABS_API_KEY;
4711
5275
  if (!key) {
@@ -4721,7 +5285,7 @@ var TTS = class _TTS extends ElevenLabsTTS {
4721
5285
  super(resolveApiKey(opts.apiKey), {
4722
5286
  voiceId: opts.voiceId ?? "EXAVITQu4vr4xnSDxMaL",
4723
5287
  modelId: opts.modelId ?? "eleven_flash_v2_5",
4724
- outputFormat: opts.outputFormat ?? "pcm_16000",
5288
+ ...opts.outputFormat !== void 0 ? { outputFormat: opts.outputFormat } : {},
4725
5289
  languageCode: opts.languageCode,
4726
5290
  voiceSettings: opts.voiceSettings
4727
5291
  });
@@ -4764,9 +5328,11 @@ var PLAN_REQUIRED_MSG = "ElevenLabs WS streaming requires a Pro plan or higher (
4764
5328
  function sanitiseLogStr(value, limit = 200) {
4765
5329
  return String(value).replace(/[\r\n\x00]/g, " ").slice(0, limit);
4766
5330
  }
4767
- var CARRIER_NATIVE_FORMAT = {
5331
+ var CARRIER_NATIVE_FORMAT2 = {
4768
5332
  twilio: "ulaw_8000",
4769
- telnyx: "pcm_16000"
5333
+ telnyx: "pcm_16000",
5334
+ // Plivo streams mulaw 8 kHz (we pin contentType in the answer XML).
5335
+ plivo: "ulaw_8000"
4770
5336
  };
4771
5337
  var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
4772
5338
  static providerKey = "elevenlabs_ws";
@@ -4792,6 +5358,20 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
4792
5358
  * changes.
4793
5359
  */
4794
5360
  adoptedConnection = null;
5361
+ /**
5362
+ * Active WS for the in-flight ``synthesizeStream`` call, if any. Set
5363
+ * when a stream starts, cleared in its ``finally`` block. The
5364
+ * stream-handler calls ``cancelActiveStream()`` from ``cancelSpeaking``
5365
+ * to unblock the generator's inner ``await Promise<frame>`` — without
5366
+ * it, a barge-in on the firstMessage live path leaves the for-await
5367
+ * stuck waiting for the next frame; ElevenLabs never sends
5368
+ * ``isFinal=true`` after the consumer breaks, the 30 s frame timeout
5369
+ * fires post-call, and meanwhile ``initPipeline`` never returns so
5370
+ * the STT ``onTranscript`` callback never registers and subsequent
5371
+ * user turns are silently dropped (root cause of the 2026-05-20
5372
+ * "first message OK, then no response" symptom).
5373
+ */
5374
+ activeStreamWs = null;
4795
5375
  /**
4796
5376
  * The wire format requested over the ElevenLabs WS. Initially set from
4797
5377
  * the constructor; ``setTelephonyCarrier`` may auto-flip it to the
@@ -4836,10 +5416,36 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
4836
5416
  */
4837
5417
  setTelephonyCarrier(carrier) {
4838
5418
  if (this._outputFormatExplicit) return;
4839
- const native = CARRIER_NATIVE_FORMAT[carrier];
5419
+ const native = CARRIER_NATIVE_FORMAT2[carrier];
4840
5420
  if (!native) return;
4841
5421
  this._outputFormat = native;
4842
5422
  }
5423
+ /**
5424
+ * Force-close the WebSocket of any in-flight ``synthesizeStream`` call.
5425
+ * Called by the stream-handler from ``cancelSpeaking`` (barge-in) so
5426
+ * the generator's inner ``await Promise<frame>`` loop unblocks cleanly
5427
+ * via the ``onClose`` handler — instead of waiting up to 30 s for the
5428
+ * ``FRAME_TIMEOUT_MS`` watchdog to fire. No-op when no stream is in
5429
+ * flight or when the WS is already closing.
5430
+ *
5431
+ * Without this, a barge-in during the firstMessage live path left the
5432
+ * for-await stuck (ElevenLabs never sends ``isFinal=true`` after the
5433
+ * consumer breaks), ``initPipeline`` never returned, the STT
5434
+ * ``onTranscript`` callback never registered, and the entire remainder
5435
+ * of the call was silent for the user. Surfaced during the 2026-05-20
5436
+ * acceptance run.
5437
+ */
5438
+ cancelActiveStream() {
5439
+ const ws = this.activeStreamWs;
5440
+ if (!ws) return;
5441
+ this.activeStreamWs = null;
5442
+ try {
5443
+ if (ws.readyState === WebSocket6.OPEN || ws.readyState === WebSocket6.CONNECTING) {
5444
+ ws.close();
5445
+ }
5446
+ } catch {
5447
+ }
5448
+ }
4843
5449
  /** Pre-configured for Twilio Media Streams (`ulaw_8000`). */
4844
5450
  static forTwilio(opts) {
4845
5451
  return new _ElevenLabsWebSocketTTS({
@@ -4925,6 +5531,7 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
4925
5531
  headers: { "xi-api-key": this.apiKey }
4926
5532
  });
4927
5533
  }
5534
+ this.activeStreamWs = ws;
4928
5535
  const queue = [];
4929
5536
  let done = false;
4930
5537
  let pendingError = null;
@@ -5045,6 +5652,7 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
5045
5652
  }
5046
5653
  } finally {
5047
5654
  if (connectTimer) clearTimeout(connectTimer);
5655
+ if (this.activeStreamWs === ws) this.activeStreamWs = null;
5048
5656
  try {
5049
5657
  if (ws.readyState === WebSocket6.OPEN) {
5050
5658
  ws.send(JSON.stringify({ text: "" }));
@@ -5217,9 +5825,9 @@ function buildOpts(opts) {
5217
5825
  const out = {
5218
5826
  apiKey: resolveApiKey2(opts.apiKey),
5219
5827
  modelId: opts.modelId ?? "eleven_flash_v2_5",
5220
- outputFormat: opts.outputFormat ?? "pcm_16000",
5221
5828
  autoMode: opts.autoMode ?? true
5222
5829
  };
5830
+ if (opts.outputFormat !== void 0) out.outputFormat = opts.outputFormat;
5223
5831
  if (opts.voiceId !== void 0) out.voiceId = opts.voiceId;
5224
5832
  if (opts.voiceSettings !== void 0) out.voiceSettings = opts.voiceSettings;
5225
5833
  if (opts.languageCode !== void 0) out.languageCode = opts.languageCode;
@@ -5398,266 +6006,75 @@ var OpenAITTS = class _OpenAITTS {
5398
6006
  let s = Math.round(y);
5399
6007
  if (s > 32767) s = 32767;
5400
6008
  else if (s < -32768) s = -32768;
5401
- samples.push(s);
5402
- } else {
5403
- samples.push(x);
5404
- }
5405
- }
5406
- if (lpf) ctx.lpfPrev = y;
5407
- const out = [];
5408
- let i = 0;
5409
- if (direct8k) {
5410
- while (i + 2 < samples.length) {
5411
- out.push(samples[i]);
5412
- i += 3;
5413
- }
5414
- } else {
5415
- while (i + 2 < samples.length) {
5416
- out.push(samples[i]);
5417
- out.push(Math.round((samples[i + 1] + samples[i + 2]) / 2));
5418
- i += 3;
5419
- }
5420
- }
5421
- ctx.leftover = samples.slice(i);
5422
- const buffer = Buffer.alloc(out.length * 2);
5423
- for (let j = 0; j < out.length; j++) {
5424
- buffer.writeInt16LE(out[j], j * 2);
5425
- }
5426
- return buffer;
5427
- }
5428
- /** @deprecated use {@link resampleStreaming} with persistent state. */
5429
- static resample24kTo16k(audio) {
5430
- const ctx = {
5431
- carryByte: null,
5432
- leftover: [],
5433
- lpfPrev: 0,
5434
- lpfEnabled: false,
5435
- targetSampleRate: 16e3
5436
- };
5437
- const out = _OpenAITTS.resampleStreaming(audio, ctx);
5438
- if (ctx.leftover.length === 0) return out;
5439
- const tail = Buffer.alloc(ctx.leftover.length * 2);
5440
- for (let i = 0; i < ctx.leftover.length; i++) {
5441
- tail.writeInt16LE(ctx.leftover[i], i * 2);
5442
- }
5443
- return Buffer.concat([out, tail]);
5444
- }
5445
- };
5446
-
5447
- // src/tts/openai.ts
5448
- var TTS3 = class extends OpenAITTS {
5449
- static providerKey = "openai_tts";
5450
- constructor(opts = {}) {
5451
- const key = opts.apiKey ?? process.env.OPENAI_API_KEY;
5452
- if (!key) {
5453
- throw new Error(
5454
- "OpenAI TTS requires an apiKey. Pass { apiKey: 'sk-...' } or set OPENAI_API_KEY in the environment."
5455
- );
5456
- }
5457
- super(
5458
- key,
5459
- opts.voice ?? "alloy",
5460
- opts.model ?? "gpt-4o-mini-tts",
5461
- opts.instructions ?? null,
5462
- opts.speed ?? null,
5463
- opts.antiAlias ?? false
5464
- );
5465
- }
5466
- };
5467
-
5468
- // src/tts/cartesia.ts
5469
- init_esm_shims();
5470
-
5471
- // src/providers/cartesia-tts.ts
5472
- init_esm_shims();
5473
- var CARTESIA_BASE_URL = "https://api.cartesia.ai";
5474
- var CARTESIA_API_VERSION = "2025-04-16";
5475
- var CARTESIA_DEFAULT_VOICE_ID = "f786b574-daa5-4673-aa0c-cbe3e8534c02";
5476
- var CartesiaTTSModel = {
5477
- SONIC_3: "sonic-3",
5478
- SONIC_2: "sonic-2",
5479
- SONIC: "sonic"
5480
- };
5481
- var CartesiaTTSContainer = {
5482
- RAW: "raw",
5483
- WAV: "wav",
5484
- MP3: "mp3"
5485
- };
5486
- var CartesiaTTSEncoding = {
5487
- PCM_S16LE: "pcm_s16le",
5488
- PCM_F32LE: "pcm_f32le",
5489
- PCM_MULAW: "pcm_mulaw",
5490
- PCM_ALAW: "pcm_alaw"
5491
- };
5492
- var CartesiaTTSSampleRate = {
5493
- HZ_8000: 8e3,
5494
- HZ_16000: 16e3,
5495
- HZ_22050: 22050,
5496
- HZ_24000: 24e3,
5497
- HZ_44100: 44100
5498
- };
5499
- var CartesiaTTSVoiceMode = {
5500
- ID: "id",
5501
- EMBEDDING: "embedding"
5502
- };
5503
- var CartesiaTTS = class _CartesiaTTS {
5504
- /** Stable pricing/dashboard key — read by stream-handler/metrics. */
5505
- static providerKey = "cartesia_tts";
5506
- apiKey;
5507
- model;
5508
- voice;
5509
- language;
5510
- sampleRate;
5511
- speed;
5512
- emotion;
5513
- volume;
5514
- baseUrl;
5515
- apiVersion;
5516
- constructor(apiKey, opts = {}) {
5517
- this.apiKey = apiKey;
5518
- this.model = opts.model ?? CartesiaTTSModel.SONIC_3;
5519
- this.voice = opts.voice ?? CARTESIA_DEFAULT_VOICE_ID;
5520
- this.language = opts.language ?? "en";
5521
- this.sampleRate = opts.sampleRate ?? CartesiaTTSSampleRate.HZ_16000;
5522
- this.speed = opts.speed;
5523
- this.emotion = typeof opts.emotion === "string" ? [opts.emotion] : opts.emotion;
5524
- this.volume = opts.volume;
5525
- this.baseUrl = opts.baseUrl ?? CARTESIA_BASE_URL;
5526
- this.apiVersion = opts.apiVersion ?? CARTESIA_API_VERSION;
5527
- }
5528
- /**
5529
- * Construct an instance pre-configured for Twilio Media Streams.
5530
- *
5531
- * Sets `sampleRate=8000` so Cartesia emits PCM_S16LE @ 8 kHz directly.
5532
- * Twilio's media stream uses μ-law @ 8 kHz so the SDK still does the
5533
- * PCM → μ-law transcode client-side, but the 16 kHz → 8 kHz resample
5534
- * step is skipped. Saves ~10–30 ms first-byte plus per-frame CPU and
5535
- * removes a potential aliasing source.
5536
- */
5537
- static forTwilio(apiKey, options = {}) {
5538
- return new _CartesiaTTS(apiKey, {
5539
- ...options,
5540
- sampleRate: CartesiaTTSSampleRate.HZ_8000
5541
- });
5542
- }
5543
- /**
5544
- * Construct an instance pre-configured for Telnyx bidirectional media.
5545
- *
5546
- * Sets `sampleRate=16000` to match Telnyx's L16/16000 default codec —
5547
- * audio flows end-to-end with zero resampling or transcoding. Same as
5548
- * the bare-constructor default; exists for API symmetry with
5549
- * {@link CartesiaTTS.forTwilio}.
5550
- */
5551
- static forTelnyx(apiKey, options = {}) {
5552
- return new _CartesiaTTS(apiKey, {
5553
- ...options,
5554
- sampleRate: CartesiaTTSSampleRate.HZ_16000
5555
- });
5556
- }
5557
- /** Build the JSON payload for the Cartesia bytes endpoint. */
5558
- buildPayload(text) {
5559
- const payload = {
5560
- model_id: this.model,
5561
- voice: { mode: CartesiaTTSVoiceMode.ID, id: this.voice },
5562
- transcript: text,
5563
- output_format: {
5564
- container: CartesiaTTSContainer.RAW,
5565
- encoding: CartesiaTTSEncoding.PCM_S16LE,
5566
- sample_rate: this.sampleRate
5567
- },
5568
- language: this.language
5569
- };
5570
- const generationConfig = {};
5571
- if (this.speed !== void 0) generationConfig.speed = this.speed;
5572
- if (this.emotion && this.emotion.length > 0)
5573
- generationConfig.emotion = this.emotion[0];
5574
- if (this.volume !== void 0) generationConfig.volume = this.volume;
5575
- if (Object.keys(generationConfig).length > 0) {
5576
- payload.generation_config = generationConfig;
5577
- }
5578
- return payload;
5579
- }
5580
- /**
5581
- * Pre-call HTTP warmup for the Cartesia `/tts/bytes` endpoint.
5582
- *
5583
- * Issues a lightweight `GET <baseUrl>/voices` so DNS, TLS, and HTTP/2
5584
- * are already up by the time the first `synthesizeStream()` POST
5585
- * lands. Best-effort: 5 s timeout, all exceptions swallowed at
5586
- * debug level.
5587
- *
5588
- * Billing safety: `GET /voices` is a free metadata read on
5589
- * Cartesia's REST surface (per https://docs.cartesia.ai). It does
5590
- * not consume synthesis credits. The actual synthesis is billed
5591
- * only when `POST /tts/bytes` runs with a non-empty `transcript`.
5592
- *
5593
- * Note: Cartesia TTS uses the HTTP path (vs the WebSocket variant
5594
- * Cartesia also exposes) — connection warmup is therefore HTTP-GET
5595
- * based, not WebSocket pre-handshake. The latency win is smaller
5596
- * (~50-150 ms vs the ~200-500 ms of a WS prewarm) but still real.
5597
- */
5598
- async warmup() {
5599
- try {
5600
- await fetch(`${this.baseUrl}/voices`, {
5601
- method: "GET",
5602
- headers: {
5603
- "X-API-Key": this.apiKey,
5604
- "Cartesia-Version": this.apiVersion
5605
- },
5606
- signal: AbortSignal.timeout(5e3)
5607
- });
5608
- } catch (err) {
5609
- getLogger().debug(`Cartesia TTS warmup failed (best-effort): ${String(err)}`);
6009
+ samples.push(s);
6010
+ } else {
6011
+ samples.push(x);
6012
+ }
5610
6013
  }
5611
- }
5612
- /** Synthesize text and return the concatenated audio buffer. */
5613
- async synthesize(text) {
5614
- const chunks = [];
5615
- for await (const chunk of this.synthesizeStream(text)) {
5616
- chunks.push(chunk);
6014
+ if (lpf) ctx.lpfPrev = y;
6015
+ const out = [];
6016
+ let i = 0;
6017
+ if (direct8k) {
6018
+ while (i + 2 < samples.length) {
6019
+ out.push(samples[i]);
6020
+ i += 3;
6021
+ }
6022
+ } else {
6023
+ while (i + 2 < samples.length) {
6024
+ out.push(samples[i]);
6025
+ out.push(Math.round((samples[i + 1] + samples[i + 2]) / 2));
6026
+ i += 3;
6027
+ }
5617
6028
  }
5618
- return Buffer.concat(chunks);
5619
- }
5620
- /**
5621
- * Synthesize text and yield raw PCM_S16LE chunks at the configured
5622
- * `sampleRate` as they arrive from Cartesia.
5623
- */
5624
- async *synthesizeStream(text) {
5625
- const response = await fetch(`${this.baseUrl}/tts/bytes`, {
5626
- method: "POST",
5627
- headers: {
5628
- "X-API-Key": this.apiKey,
5629
- "Cartesia-Version": this.apiVersion,
5630
- "Content-Type": "application/json"
5631
- },
5632
- body: JSON.stringify(this.buildPayload(text)),
5633
- signal: AbortSignal.timeout(3e4)
5634
- });
5635
- if (!response.ok) {
5636
- const body = await response.text();
5637
- throw new Error(`Cartesia TTS error ${response.status}: ${body}`);
6029
+ ctx.leftover = samples.slice(i);
6030
+ const buffer = Buffer.alloc(out.length * 2);
6031
+ for (let j = 0; j < out.length; j++) {
6032
+ buffer.writeInt16LE(out[j], j * 2);
5638
6033
  }
5639
- if (!response.body) {
5640
- throw new Error("Cartesia TTS: no response body");
6034
+ return buffer;
6035
+ }
6036
+ /** @deprecated use {@link resampleStreaming} with persistent state. */
6037
+ static resample24kTo16k(audio) {
6038
+ const ctx = {
6039
+ carryByte: null,
6040
+ leftover: [],
6041
+ lpfPrev: 0,
6042
+ lpfEnabled: false,
6043
+ targetSampleRate: 16e3
6044
+ };
6045
+ const out = _OpenAITTS.resampleStreaming(audio, ctx);
6046
+ if (ctx.leftover.length === 0) return out;
6047
+ const tail = Buffer.alloc(ctx.leftover.length * 2);
6048
+ for (let i = 0; i < ctx.leftover.length; i++) {
6049
+ tail.writeInt16LE(ctx.leftover[i], i * 2);
5641
6050
  }
5642
- const reader = response.body.getReader();
5643
- try {
5644
- while (true) {
5645
- const { done, value } = await reader.read();
5646
- if (done) break;
5647
- if (value && value.length > 0) {
5648
- yield Buffer.from(value);
5649
- }
5650
- }
5651
- } finally {
5652
- if (typeof reader.cancel === "function")
5653
- await reader.cancel().catch(() => {
5654
- });
5655
- reader.releaseLock();
6051
+ return Buffer.concat([out, tail]);
6052
+ }
6053
+ };
6054
+
6055
+ // src/tts/openai.ts
6056
+ var TTS3 = class extends OpenAITTS {
6057
+ static providerKey = "openai_tts";
6058
+ constructor(opts = {}) {
6059
+ const key = opts.apiKey ?? process.env.OPENAI_API_KEY;
6060
+ if (!key) {
6061
+ throw new Error(
6062
+ "OpenAI TTS requires an apiKey. Pass { apiKey: 'sk-...' } or set OPENAI_API_KEY in the environment."
6063
+ );
5656
6064
  }
6065
+ super(
6066
+ key,
6067
+ opts.voice ?? "alloy",
6068
+ opts.model ?? "gpt-4o-mini-tts",
6069
+ opts.instructions ?? null,
6070
+ opts.speed ?? null,
6071
+ opts.antiAlias ?? false
6072
+ );
5657
6073
  }
5658
6074
  };
5659
6075
 
5660
6076
  // src/tts/cartesia.ts
6077
+ init_esm_shims();
5661
6078
  function resolveApiKey3(apiKey) {
5662
6079
  const key = apiKey ?? process.env.CARTESIA_API_KEY;
5663
6080
  if (!key) {
@@ -5687,150 +6104,6 @@ var TTS4 = class _TTS extends CartesiaTTS {
5687
6104
 
5688
6105
  // src/tts/rime.ts
5689
6106
  init_esm_shims();
5690
-
5691
- // src/providers/rime-tts.ts
5692
- init_esm_shims();
5693
- var RIME_BASE_URL = "https://users.rime.ai/v1/rime-tts";
5694
- var RimeModel = {
5695
- ARCANA: "arcana",
5696
- MIST: "mist",
5697
- MIST_V2: "mistv2"
5698
- };
5699
- var RimeAudioFormat = {
5700
- PCM: "audio/pcm",
5701
- MP3: "audio/mp3",
5702
- WAV: "audio/wav",
5703
- MULAW: "audio/mulaw"
5704
- };
5705
- var ARCANA_MODEL_TIMEOUT_MS = 60 * 4 * 1e3;
5706
- var MIST_MODEL_TIMEOUT_MS = 30 * 1e3;
5707
- function isMistModel(model) {
5708
- return model.includes(RimeModel.MIST);
5709
- }
5710
- function timeoutForModel(model) {
5711
- if (model === RimeModel.ARCANA) return ARCANA_MODEL_TIMEOUT_MS;
5712
- return MIST_MODEL_TIMEOUT_MS;
5713
- }
5714
- var RimeTTS = class {
5715
- /** Stable pricing/dashboard key — read by stream-handler/metrics. */
5716
- static providerKey = "rime";
5717
- apiKey;
5718
- model;
5719
- speaker;
5720
- lang;
5721
- sampleRate;
5722
- repetitionPenalty;
5723
- temperature;
5724
- topP;
5725
- maxTokens;
5726
- speedAlpha;
5727
- reduceLatency;
5728
- pauseBetweenBrackets;
5729
- phonemizeBetweenBrackets;
5730
- baseUrl;
5731
- totalTimeoutMs;
5732
- constructor(apiKey, opts = {}) {
5733
- this.apiKey = apiKey;
5734
- this.model = opts.model ?? RimeModel.ARCANA;
5735
- const defaultSpeaker = isMistModel(this.model) ? "cove" : "astra";
5736
- this.speaker = opts.speaker ?? defaultSpeaker;
5737
- this.lang = opts.lang ?? "eng";
5738
- this.sampleRate = opts.sampleRate ?? 16e3;
5739
- this.repetitionPenalty = opts.repetitionPenalty;
5740
- this.temperature = opts.temperature;
5741
- this.topP = opts.topP;
5742
- this.maxTokens = opts.maxTokens;
5743
- this.speedAlpha = opts.speedAlpha;
5744
- this.reduceLatency = opts.reduceLatency;
5745
- this.pauseBetweenBrackets = opts.pauseBetweenBrackets;
5746
- this.phonemizeBetweenBrackets = opts.phonemizeBetweenBrackets;
5747
- this.baseUrl = opts.baseUrl ?? RIME_BASE_URL;
5748
- this.totalTimeoutMs = timeoutForModel(this.model);
5749
- }
5750
- buildPayload(text) {
5751
- const payload = {
5752
- speaker: this.speaker,
5753
- text,
5754
- modelId: this.model
5755
- };
5756
- if (this.model === RimeModel.ARCANA) {
5757
- if (this.repetitionPenalty !== void 0)
5758
- payload.repetition_penalty = this.repetitionPenalty;
5759
- if (this.temperature !== void 0) payload.temperature = this.temperature;
5760
- if (this.topP !== void 0) payload.top_p = this.topP;
5761
- if (this.maxTokens !== void 0) payload.max_tokens = this.maxTokens;
5762
- payload.lang = this.lang;
5763
- payload.samplingRate = this.sampleRate;
5764
- } else if (isMistModel(this.model)) {
5765
- payload.lang = this.lang;
5766
- payload.samplingRate = this.sampleRate;
5767
- if (this.speedAlpha !== void 0) payload.speedAlpha = this.speedAlpha;
5768
- if (this.model === RimeModel.MIST_V2 && this.reduceLatency !== void 0) {
5769
- payload.reduceLatency = this.reduceLatency;
5770
- }
5771
- if (this.pauseBetweenBrackets !== void 0) {
5772
- payload.pauseBetweenBrackets = this.pauseBetweenBrackets;
5773
- }
5774
- if (this.phonemizeBetweenBrackets !== void 0) {
5775
- payload.phonemizeBetweenBrackets = this.phonemizeBetweenBrackets;
5776
- }
5777
- }
5778
- return payload;
5779
- }
5780
- /** Synthesize text and return the concatenated audio buffer. */
5781
- async synthesize(text) {
5782
- const chunks = [];
5783
- for await (const chunk of this.synthesizeStream(text)) {
5784
- chunks.push(chunk);
5785
- }
5786
- return Buffer.concat(chunks);
5787
- }
5788
- /**
5789
- * Synthesize text and yield raw PCM_S16LE chunks at the configured
5790
- * `sampleRate` as they stream in.
5791
- */
5792
- async *synthesizeStream(text) {
5793
- const response = await fetch(this.baseUrl, {
5794
- method: "POST",
5795
- headers: {
5796
- accept: RimeAudioFormat.PCM,
5797
- Authorization: `Bearer ${this.apiKey}`,
5798
- "content-type": "application/json"
5799
- },
5800
- body: JSON.stringify(this.buildPayload(text)),
5801
- signal: AbortSignal.timeout(this.totalTimeoutMs)
5802
- });
5803
- if (!response.ok) {
5804
- const body = await response.text();
5805
- throw new Error(`Rime TTS error ${response.status}: ${body}`);
5806
- }
5807
- const contentType = response.headers.get("content-type") ?? "";
5808
- if (!contentType.startsWith("audio")) {
5809
- const body = await response.text();
5810
- throw new Error(`Rime returned non-audio response: ${body.slice(0, 500)}`);
5811
- }
5812
- if (!response.body) {
5813
- throw new Error("Rime TTS: no response body");
5814
- }
5815
- const reader = response.body.getReader();
5816
- try {
5817
- while (true) {
5818
- const { done, value } = await reader.read();
5819
- if (done) break;
5820
- if (value && value.length > 0) {
5821
- yield Buffer.from(value);
5822
- }
5823
- }
5824
- } finally {
5825
- if (typeof reader.cancel === "function")
5826
- await reader.cancel().catch(() => {
5827
- });
5828
- reader.releaseLock();
5829
- }
5830
- }
5831
- };
5832
-
5833
- // src/tts/rime.ts
5834
6107
  var TTS5 = class extends RimeTTS {
5835
6108
  static providerKey = "rime";
5836
6109
  constructor(opts = {}) {
@@ -6469,12 +6742,6 @@ init_esm_shims();
6469
6742
 
6470
6743
  // src/providers/groq-llm.ts
6471
6744
  init_esm_shims();
6472
-
6473
- // src/version.ts
6474
- init_esm_shims();
6475
- var VERSION = "0.5.5";
6476
-
6477
- // src/providers/groq-llm.ts
6478
6745
  var GROQ_BASE_URL = "https://api.groq.com/openai/v1";
6479
6746
  var GroqModel = {
6480
6747
  LLAMA_3_3_70B_VERSATILE: "llama-3.3-70b-versatile",
@@ -7293,7 +7560,7 @@ var KrispVivaFilter = class {
7293
7560
 
7294
7561
  // src/telephony/twilio.ts
7295
7562
  init_esm_shims();
7296
- var Carrier = class {
7563
+ var Carrier2 = class {
7297
7564
  kind = "twilio";
7298
7565
  accountSid;
7299
7566
  authToken;
@@ -7317,7 +7584,7 @@ var Carrier = class {
7317
7584
 
7318
7585
  // src/telephony/telnyx.ts
7319
7586
  init_esm_shims();
7320
- var Carrier2 = class {
7587
+ var Carrier3 = class {
7321
7588
  kind = "telnyx";
7322
7589
  apiKey;
7323
7590
  connectionId;
@@ -8131,12 +8398,28 @@ var TwilioAdapter = class _TwilioAdapter {
8131
8398
  return { callSid: call.sid };
8132
8399
  }
8133
8400
  /**
8134
- * Build a minimal ``<Response><Connect><Stream url="..."/></Connect></Response>``
8135
- * TwiML document. Mirrors the Python adapter's ``generate_stream_twiml``.
8401
+ * Build a ``<Response><Connect><Stream url="...">`` TwiML document.
8402
+ *
8403
+ * ``parameters`` is forwarded as ``<Parameter name="..." value="..."/>``
8404
+ * children of ``<Stream>``. Twilio Media Streams strips query-string params
8405
+ * from the ``<Stream url=...>`` before the WS handshake, so
8406
+ * ``<Parameter>`` tags are the supported way to pre-populate
8407
+ * ``start.customParameters`` on the WS ``start`` frame. Used by the
8408
+ * inbound path to carry caller / callee through to the bridge.
8409
+ *
8410
+ * Mirrors the Python adapter's ``generate_stream_twiml``.
8136
8411
  */
8137
- static generateStreamTwiml(streamUrl) {
8138
- const escaped = streamUrl.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&apos;");
8139
- return `<?xml version="1.0" encoding="UTF-8"?><Response><Connect><Stream url="${escaped}"/></Connect></Response>`;
8412
+ static generateStreamTwiml(streamUrl, parameters) {
8413
+ const esc = (s) => s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&apos;");
8414
+ const escapedUrl = esc(streamUrl);
8415
+ let paramTags = "";
8416
+ if (parameters) {
8417
+ for (const [name, value] of Object.entries(parameters)) {
8418
+ if (value == null) continue;
8419
+ paramTags += `<Parameter name="${esc(name)}" value="${esc(String(value))}"/>`;
8420
+ }
8421
+ }
8422
+ return `<?xml version="1.0" encoding="UTF-8"?><Response><Connect><Stream url="${escapedUrl}">${paramTags}</Stream></Connect></Response>`;
8140
8423
  }
8141
8424
  /** Force-complete an in-progress call. */
8142
8425
  async endCall(callSid) {
@@ -8529,6 +8812,8 @@ export {
8529
8812
  CallMetricsAccumulator,
8530
8813
  STT4 as CartesiaSTT,
8531
8814
  TTS4 as CartesiaTTS,
8815
+ CartesiaTTSModel,
8816
+ CartesiaTTSVoiceMode,
8532
8817
  LLM4 as CerebrasLLM,
8533
8818
  ChatContext,
8534
8819
  CloudflareTunnel,
@@ -8536,10 +8821,13 @@ export {
8536
8821
  DEFAULT_PRICING,
8537
8822
  DTMF_EVENTS,
8538
8823
  DeepFilterNetFilter,
8824
+ DeepgramModel,
8539
8825
  STT as DeepgramSTT,
8540
8826
  DefaultToolExecutor,
8541
8827
  ConvAI as ElevenLabsConvAI,
8542
8828
  ElevenLabsConvAIAdapter,
8829
+ ElevenLabsModel,
8830
+ ElevenLabsOutputFormat,
8543
8831
  ElevenLabsTTS as ElevenLabsRestTTS,
8544
8832
  TTS as ElevenLabsTTS,
8545
8833
  TTS2 as ElevenLabsWebSocketTTS,
@@ -8568,8 +8856,15 @@ export {
8568
8856
  Realtime2 as OpenAIRealtime2,
8569
8857
  OpenAIRealtime2Adapter,
8570
8858
  OpenAIRealtimeAdapter,
8859
+ OpenAIRealtimeAudioFormat,
8860
+ OpenAIRealtimeModel,
8861
+ OpenAIRealtimeVADType,
8571
8862
  TTS3 as OpenAITTS,
8572
8863
  STT3 as OpenAITranscribeSTT,
8864
+ OpenAITranscriptionModel,
8865
+ OpenAIVoice,
8866
+ PRICING_LAST_UPDATED,
8867
+ PRICING_VERSION,
8573
8868
  PartialStreamError,
8574
8869
  Patter,
8575
8870
  PatterConnectionError,
@@ -8577,9 +8872,14 @@ export {
8577
8872
  PatterTool,
8578
8873
  PcmCarry,
8579
8874
  PipelineHookExecutor,
8875
+ Carrier as Plivo,
8876
+ PlivoAdapter,
8877
+ PricingUnit,
8580
8878
  ProvisionError,
8581
8879
  RateLimitError,
8582
8880
  RemoteMessageHandler,
8881
+ RimeAudioFormat,
8882
+ RimeModel,
8583
8883
  TTS5 as RimeTTS,
8584
8884
  SPAN_BARGEIN,
8585
8885
  SPAN_CALL,
@@ -8600,7 +8900,7 @@ export {
8600
8900
  TurnDetectionMode as SpeechmaticsTurnDetectionMode,
8601
8901
  StatefulResampler,
8602
8902
  Static as StaticTunnel,
8603
- Carrier2 as Telnyx,
8903
+ Carrier3 as Telnyx,
8604
8904
  TelnyxAdapter,
8605
8905
  TelnyxSTT,
8606
8906
  TelnyxSTTInputFormat,
@@ -8611,7 +8911,7 @@ export {
8611
8911
  TestSession,
8612
8912
  TfidfLoopDetector,
8613
8913
  Tool,
8614
- Carrier as Twilio,
8914
+ Carrier2 as Twilio,
8615
8915
  TwilioAdapter,
8616
8916
  ULTRAVOX_DEFAULT_API_BASE,
8617
8917
  ULTRAVOX_DEFAULT_SR,