getpatter 0.6.1 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-CL2U3YET.mjs +1429 -0
- package/dist/{chunk-TEW3NAZJ.mjs → chunk-LE63CSOB.mjs} +371 -1486
- package/dist/{chunk-RV7APPYE.mjs → chunk-R2T4JABZ.mjs} +13 -0
- package/dist/cli.js +48 -23
- package/dist/dashboard/ui.html +8 -8
- package/dist/index.d.mts +452 -186
- package/dist/index.d.ts +452 -186
- package/dist/index.js +1485 -979
- package/dist/index.mjs +973 -790
- package/dist/openai-realtime-2-CNFARP25.mjs +8 -0
- package/dist/{silero-vad-NSEXI4XS.mjs → silero-vad-LNDFGIY7.mjs} +1 -1
- package/dist/{test-mode-WEKKNBLD.mjs → test-mode-RS57BDM6.mjs} +2 -1
- package/package.json +1 -1
- package/src/dashboard/ui.html +8 -8
package/dist/index.mjs
CHANGED
|
@@ -6,6 +6,7 @@ import {
|
|
|
6
6
|
CallMetricsAccumulator,
|
|
7
7
|
DEFAULT_MIN_SENTENCE_LEN,
|
|
8
8
|
DEFAULT_PRICING,
|
|
9
|
+
DeepgramModel,
|
|
9
10
|
DeepgramSTT,
|
|
10
11
|
DefaultToolExecutor,
|
|
11
12
|
ElevenLabsConvAIAdapter,
|
|
@@ -15,12 +16,12 @@ import {
|
|
|
15
16
|
LLMLoop,
|
|
16
17
|
MetricsStore,
|
|
17
18
|
OpenAILLMProvider,
|
|
18
|
-
|
|
19
|
-
|
|
19
|
+
PRICING_LAST_UPDATED,
|
|
20
|
+
PRICING_VERSION,
|
|
20
21
|
PatterConnectionError,
|
|
21
22
|
PatterError,
|
|
22
|
-
PcmCarry,
|
|
23
23
|
PipelineHookExecutor,
|
|
24
|
+
PricingUnit,
|
|
24
25
|
ProvisionError,
|
|
25
26
|
RateLimitError,
|
|
26
27
|
RemoteMessageHandler,
|
|
@@ -32,18 +33,14 @@ import {
|
|
|
32
33
|
SPAN_TOOL,
|
|
33
34
|
SPAN_TTS,
|
|
34
35
|
SentenceChunker,
|
|
35
|
-
StatefulResampler,
|
|
36
36
|
TestSession,
|
|
37
|
+
VERSION,
|
|
37
38
|
calculateRealtimeCost,
|
|
38
39
|
calculateSttCost,
|
|
39
40
|
calculateTelephonyCost,
|
|
40
41
|
calculateTtsCost,
|
|
41
42
|
callsToCsv,
|
|
42
43
|
callsToJson,
|
|
43
|
-
createResampler16kTo8k,
|
|
44
|
-
createResampler24kTo16k,
|
|
45
|
-
createResampler24kTo8k,
|
|
46
|
-
createResampler8kTo16k,
|
|
47
44
|
initTracing,
|
|
48
45
|
isRemoteUrl,
|
|
49
46
|
isTracingEnabled,
|
|
@@ -53,14 +50,29 @@ import {
|
|
|
53
50
|
mergePricing,
|
|
54
51
|
mountApi,
|
|
55
52
|
mountDashboard,
|
|
53
|
+
resolveLogRoot,
|
|
54
|
+
startSpan
|
|
55
|
+
} from "./chunk-LE63CSOB.mjs";
|
|
56
|
+
import {
|
|
57
|
+
OpenAIRealtime2Adapter,
|
|
58
|
+
OpenAIRealtimeAdapter,
|
|
59
|
+
OpenAIRealtimeAudioFormat,
|
|
60
|
+
OpenAIRealtimeModel,
|
|
61
|
+
OpenAIRealtimeVADType,
|
|
62
|
+
OpenAITranscriptionModel,
|
|
63
|
+
OpenAIVoice,
|
|
64
|
+
PcmCarry,
|
|
65
|
+
StatefulResampler,
|
|
66
|
+
createResampler16kTo8k,
|
|
67
|
+
createResampler24kTo16k,
|
|
68
|
+
createResampler24kTo8k,
|
|
69
|
+
createResampler8kTo16k,
|
|
56
70
|
mulawToPcm16,
|
|
57
71
|
pcm16ToMulaw,
|
|
58
72
|
resample16kTo8k,
|
|
59
73
|
resample24kTo16k,
|
|
60
|
-
resample8kTo16k
|
|
61
|
-
|
|
62
|
-
startSpan
|
|
63
|
-
} from "./chunk-TEW3NAZJ.mjs";
|
|
74
|
+
resample8kTo16k
|
|
75
|
+
} from "./chunk-CL2U3YET.mjs";
|
|
64
76
|
import {
|
|
65
77
|
MinWordsStrategy,
|
|
66
78
|
evaluateStrategies,
|
|
@@ -75,7 +87,7 @@ import {
|
|
|
75
87
|
} from "./chunk-6GR5MHHQ.mjs";
|
|
76
88
|
import {
|
|
77
89
|
SileroVAD
|
|
78
|
-
} from "./chunk-
|
|
90
|
+
} from "./chunk-R2T4JABZ.mjs";
|
|
79
91
|
import {
|
|
80
92
|
__dirname,
|
|
81
93
|
__require,
|
|
@@ -105,7 +117,7 @@ var Realtime = class {
|
|
|
105
117
|
);
|
|
106
118
|
}
|
|
107
119
|
this.apiKey = key;
|
|
108
|
-
this.model = opts.model ?? "gpt-
|
|
120
|
+
this.model = opts.model ?? "gpt-realtime-mini";
|
|
109
121
|
this.voice = opts.voice ?? "alloy";
|
|
110
122
|
this.reasoningEffort = opts.reasoningEffort;
|
|
111
123
|
this.inputAudioTranscriptionModel = opts.inputAudioTranscriptionModel;
|
|
@@ -557,7 +569,9 @@ function resolvePersistRoot(persist) {
|
|
|
557
569
|
if (persist === false) return null;
|
|
558
570
|
if (persist === true) return resolveLogRoot("auto");
|
|
559
571
|
if (typeof persist === "string") return resolveLogRoot(persist);
|
|
560
|
-
|
|
572
|
+
const envRoot = resolveLogRoot();
|
|
573
|
+
if (envRoot !== null) return envRoot;
|
|
574
|
+
return resolveLogRoot("auto");
|
|
561
575
|
}
|
|
562
576
|
function closeParkedConnections(slot) {
|
|
563
577
|
if (slot.stt) {
|
|
@@ -573,6 +587,11 @@ function closeParkedConnections(slot) {
|
|
|
573
587
|
}
|
|
574
588
|
}
|
|
575
589
|
if (slot.openaiRealtime) {
|
|
590
|
+
const wsAny = slot.openaiRealtime;
|
|
591
|
+
if (wsAny._parkedKeepalive) {
|
|
592
|
+
clearInterval(wsAny._parkedKeepalive);
|
|
593
|
+
delete wsAny._parkedKeepalive;
|
|
594
|
+
}
|
|
576
595
|
try {
|
|
577
596
|
slot.openaiRealtime.close();
|
|
578
597
|
} catch {
|
|
@@ -1014,7 +1033,7 @@ var Patter = class {
|
|
|
1014
1033
|
}
|
|
1015
1034
|
/** Run the agent in interactive terminal-test mode (no real telephony). */
|
|
1016
1035
|
async test(opts) {
|
|
1017
|
-
const { TestSession: TestSession2 } = await import("./test-mode-
|
|
1036
|
+
const { TestSession: TestSession2 } = await import("./test-mode-RS57BDM6.mjs");
|
|
1018
1037
|
const session = new TestSession2();
|
|
1019
1038
|
await session.run({
|
|
1020
1039
|
agent: opts.agent,
|
|
@@ -1144,7 +1163,9 @@ var Patter = class {
|
|
|
1144
1163
|
const tts = agent.tts;
|
|
1145
1164
|
const sttOpen = typeof stt?.openParkedConnection === "function" ? stt.openParkedConnection.bind(stt) : null;
|
|
1146
1165
|
const ttsOpen = typeof tts?.openParkedConnection === "function" ? tts.openParkedConnection.bind(tts) : null;
|
|
1147
|
-
|
|
1166
|
+
const providerStr = agent.provider ?? "";
|
|
1167
|
+
const wantsRealtimePark = providerStr === "openai_realtime" || providerStr === "openai_realtime_2";
|
|
1168
|
+
if (!sttOpen && !ttsOpen && !wantsRealtimePark) return;
|
|
1148
1169
|
const slot = {};
|
|
1149
1170
|
this.prewarmedConnections.set(callId, slot);
|
|
1150
1171
|
const startedAt = Date.now();
|
|
@@ -1189,6 +1210,43 @@ var Patter = class {
|
|
|
1189
1210
|
}
|
|
1190
1211
|
})());
|
|
1191
1212
|
}
|
|
1213
|
+
if (wantsRealtimePark) {
|
|
1214
|
+
tasks.push((async () => {
|
|
1215
|
+
const { OpenAIRealtime2Adapter: OpenAIRealtime2Adapter2 } = await import("./openai-realtime-2-CNFARP25.mjs");
|
|
1216
|
+
const apiKey = process.env.OPENAI_API_KEY ?? "";
|
|
1217
|
+
if (!apiKey) {
|
|
1218
|
+
getLogger().debug(`Park OpenAI Realtime skipped for ${callId}: no OPENAI_API_KEY`);
|
|
1219
|
+
return;
|
|
1220
|
+
}
|
|
1221
|
+
try {
|
|
1222
|
+
const tmpAdapter = new OpenAIRealtime2Adapter2(
|
|
1223
|
+
apiKey,
|
|
1224
|
+
agent.model ?? "gpt-realtime-mini",
|
|
1225
|
+
agent.voice ?? "alloy",
|
|
1226
|
+
agent.systemPrompt ?? "",
|
|
1227
|
+
[],
|
|
1228
|
+
// audioFormat — the GA adapter always emits audio/pcm@24000
|
|
1229
|
+
// internally regardless of this value, but it's a required
|
|
1230
|
+
// positional param. Default to g711_ulaw (Twilio wire format).
|
|
1231
|
+
void 0
|
|
1232
|
+
);
|
|
1233
|
+
const ws = await tmpAdapter.openParkedConnection();
|
|
1234
|
+
if (this.prewarmedConnections.get(callId) !== slot) {
|
|
1235
|
+
try {
|
|
1236
|
+
ws.close();
|
|
1237
|
+
} catch {
|
|
1238
|
+
}
|
|
1239
|
+
return;
|
|
1240
|
+
}
|
|
1241
|
+
slot.openaiRealtime = ws;
|
|
1242
|
+
getLogger().info(
|
|
1243
|
+
`[PREWARM] callId=${callId} provider=openai_realtime ms=${Date.now() - startedAt}`
|
|
1244
|
+
);
|
|
1245
|
+
} catch (err) {
|
|
1246
|
+
getLogger().debug(`Park OpenAI Realtime failed for ${callId}: ${String(err)}`);
|
|
1247
|
+
}
|
|
1248
|
+
})());
|
|
1249
|
+
}
|
|
1192
1250
|
const task = (async () => {
|
|
1193
1251
|
await Promise.allSettled(tasks);
|
|
1194
1252
|
})();
|
|
@@ -1266,7 +1324,7 @@ var Patter = class {
|
|
|
1266
1324
|
* with a warn when the cap is reached (the call still proceeds —
|
|
1267
1325
|
* StreamHandler falls back to live TTS).
|
|
1268
1326
|
*/
|
|
1269
|
-
spawnPrewarmFirstMessage(agent, callId, ringTimeout) {
|
|
1327
|
+
spawnPrewarmFirstMessage(agent, callId, ringTimeout, carrier) {
|
|
1270
1328
|
if (!agent.prewarmFirstMessage) return;
|
|
1271
1329
|
const providerMode = agent.provider ?? "openai_realtime";
|
|
1272
1330
|
if (providerMode !== "pipeline") {
|
|
@@ -1279,6 +1337,18 @@ var Patter = class {
|
|
|
1279
1337
|
const tts = agent.tts;
|
|
1280
1338
|
if (!firstMessage || !tts) return;
|
|
1281
1339
|
if (typeof tts.synthesizeStream !== "function") return;
|
|
1340
|
+
if (carrier) {
|
|
1341
|
+
const carrierAware = tts;
|
|
1342
|
+
if (typeof carrierAware.setTelephonyCarrier === "function") {
|
|
1343
|
+
try {
|
|
1344
|
+
carrierAware.setTelephonyCarrier(carrier);
|
|
1345
|
+
} catch (err) {
|
|
1346
|
+
getLogger().debug(
|
|
1347
|
+
`Prewarm TTS setTelephonyCarrier failed for ${callId}: ${String(err)}`
|
|
1348
|
+
);
|
|
1349
|
+
}
|
|
1350
|
+
}
|
|
1351
|
+
}
|
|
1282
1352
|
const inFlight = this.prewarmAudio.size + this.prewarmTasks.size;
|
|
1283
1353
|
if (inFlight >= PREWARM_CACHE_MAX) {
|
|
1284
1354
|
getLogger().warn(
|
|
@@ -1391,16 +1461,25 @@ var Patter = class {
|
|
|
1391
1461
|
telnyxCallId = body.data?.call_control_id;
|
|
1392
1462
|
} catch {
|
|
1393
1463
|
}
|
|
1394
|
-
if (
|
|
1395
|
-
|
|
1464
|
+
if (telnyxCallId) {
|
|
1465
|
+
const initiatedPayload = {
|
|
1396
1466
|
call_id: telnyxCallId,
|
|
1397
1467
|
caller: phoneNumber,
|
|
1398
1468
|
callee: options.to,
|
|
1399
|
-
direction: "outbound"
|
|
1400
|
-
|
|
1469
|
+
direction: "outbound",
|
|
1470
|
+
status: "initiated"
|
|
1471
|
+
};
|
|
1472
|
+
if (this.embeddedServer) {
|
|
1473
|
+
this.embeddedServer.metricsStore.recordCallInitiated(initiatedPayload);
|
|
1474
|
+
}
|
|
1475
|
+
try {
|
|
1476
|
+
const { notifyDashboard: notifyDashboard2 } = await import("./persistence-LVIAHESK.mjs");
|
|
1477
|
+
notifyDashboard2(initiatedPayload);
|
|
1478
|
+
} catch {
|
|
1479
|
+
}
|
|
1401
1480
|
}
|
|
1402
1481
|
if (telnyxCallId) {
|
|
1403
|
-
this.spawnPrewarmFirstMessage(options.agent, telnyxCallId, effectiveRingTimeout);
|
|
1482
|
+
this.spawnPrewarmFirstMessage(options.agent, telnyxCallId, effectiveRingTimeout, "telnyx");
|
|
1404
1483
|
if (options.agent.prewarm !== false) {
|
|
1405
1484
|
this.parkProviderConnections(options.agent, telnyxCallId);
|
|
1406
1485
|
}
|
|
@@ -1453,21 +1532,30 @@ var Patter = class {
|
|
|
1453
1532
|
twilioNotificationsPath = body.subresource_uris?.notifications;
|
|
1454
1533
|
} catch {
|
|
1455
1534
|
}
|
|
1456
|
-
if (
|
|
1457
|
-
|
|
1535
|
+
if (twilioCallSid) {
|
|
1536
|
+
const initiatedPayload = {
|
|
1458
1537
|
call_id: twilioCallSid,
|
|
1459
1538
|
caller: phoneNumber,
|
|
1460
1539
|
callee: options.to,
|
|
1461
|
-
direction: "outbound"
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
)
|
|
1540
|
+
direction: "outbound",
|
|
1541
|
+
status: "initiated"
|
|
1542
|
+
};
|
|
1543
|
+
if (this.embeddedServer) {
|
|
1544
|
+
this.embeddedServer.metricsStore.recordCallInitiated(initiatedPayload);
|
|
1545
|
+
if (twilioNotificationsPath) {
|
|
1546
|
+
getLogger().info(
|
|
1547
|
+
`Outbound call ${twilioCallSid} placed. Twilio notifications: https://api.twilio.com${twilioNotificationsPath} (check here if the call drops with no audio).`
|
|
1548
|
+
);
|
|
1549
|
+
}
|
|
1550
|
+
}
|
|
1551
|
+
try {
|
|
1552
|
+
const { notifyDashboard: notifyDashboard2 } = await import("./persistence-LVIAHESK.mjs");
|
|
1553
|
+
notifyDashboard2(initiatedPayload);
|
|
1554
|
+
} catch {
|
|
1467
1555
|
}
|
|
1468
1556
|
}
|
|
1469
1557
|
if (twilioCallSid) {
|
|
1470
|
-
this.spawnPrewarmFirstMessage(options.agent, twilioCallSid, effectiveRingTimeout);
|
|
1558
|
+
this.spawnPrewarmFirstMessage(options.agent, twilioCallSid, effectiveRingTimeout, "twilio");
|
|
1471
1559
|
if (options.agent.prewarm !== false) {
|
|
1472
1560
|
this.parkProviderConnections(options.agent, twilioCallSid);
|
|
1473
1561
|
}
|
|
@@ -2764,109 +2852,694 @@ function scheduleInterval(intervalOrOpts, callback) {
|
|
|
2764
2852
|
};
|
|
2765
2853
|
}
|
|
2766
2854
|
|
|
2767
|
-
// src/
|
|
2855
|
+
// src/providers/elevenlabs-tts.ts
|
|
2768
2856
|
init_esm_shims();
|
|
2769
|
-
var
|
|
2770
|
-
|
|
2771
|
-
|
|
2772
|
-
|
|
2773
|
-
|
|
2774
|
-
|
|
2775
|
-
|
|
2776
|
-
|
|
2777
|
-
|
|
2778
|
-
|
|
2779
|
-
|
|
2780
|
-
|
|
2781
|
-
|
|
2782
|
-
|
|
2783
|
-
|
|
2784
|
-
|
|
2785
|
-
|
|
2786
|
-
|
|
2787
|
-
|
|
2788
|
-
|
|
2789
|
-
|
|
2790
|
-
|
|
2791
|
-
|
|
2792
|
-
|
|
2857
|
+
var ELEVENLABS_BASE_URL = "https://api.elevenlabs.io/v1";
|
|
2858
|
+
var ELEVENLABS_VOICE_ID_BY_NAME = {
|
|
2859
|
+
rachel: "21m00Tcm4TlvDq8ikWAM",
|
|
2860
|
+
drew: "29vD33N1CtxCmqQRPOHJ",
|
|
2861
|
+
clyde: "2EiwWnXFnvU5JabPnv8n",
|
|
2862
|
+
paul: "5Q0t7uMcjvnagumLfvZi",
|
|
2863
|
+
domi: "AZnzlk1XvdvUeBnXmlld",
|
|
2864
|
+
dave: "CYw3kZ02Hs0563khs1Fj",
|
|
2865
|
+
fin: "D38z5RcWu1voky8WS1ja",
|
|
2866
|
+
bella: "EXAVITQu4vr4xnSDxMaL",
|
|
2867
|
+
antoni: "ErXwobaYiN019PkySvjV",
|
|
2868
|
+
thomas: "GBv7mTt0atIp3Br8iCZE",
|
|
2869
|
+
charlie: "IKne3meq5aSn9XLyUdCD",
|
|
2870
|
+
george: "JBFqnCBsd6RMkjVDRZzb",
|
|
2871
|
+
emily: "LcfcDJNUP1GQjkzn1xUU",
|
|
2872
|
+
elli: "MF3mGyEYCl7XYWbV9V6O",
|
|
2873
|
+
callum: "N2lVS1w4EtoT3dr4eOWO",
|
|
2874
|
+
patrick: "ODq5zmih8GrVes37Dizd",
|
|
2875
|
+
harry: "SOYHLrjzK2X1ezoPC6cr",
|
|
2876
|
+
liam: "TX3LPaxmHKxFdv7VOQHJ",
|
|
2877
|
+
dorothy: "ThT5KcBeYPX3keUQqHPh",
|
|
2878
|
+
josh: "TxGEqnHWrfWFTfGW9XjX",
|
|
2879
|
+
arnold: "VR6AewLTigWG4xSOukaG",
|
|
2880
|
+
charlotte: "XB0fDUnXU5powFXDhCwa",
|
|
2881
|
+
matilda: "XrExE9yKIg1WjnnlVkGX",
|
|
2882
|
+
matthew: "Yko7PKHZNXotIFUBG7I9",
|
|
2883
|
+
james: "ZQe5CZNOzWyzPSCn5a3c",
|
|
2884
|
+
joseph: "Zlb1dXrM653N07WRdFW3",
|
|
2885
|
+
jeremy: "bVMeCyTHy58xNoL34h3p",
|
|
2886
|
+
michael: "flq6f7yk4E4fJM5XTYuZ",
|
|
2887
|
+
ethan: "g5CIjZEefAph4nQFvHAz",
|
|
2888
|
+
gigi: "jBpfuIE2acCO8z3wKNLl",
|
|
2889
|
+
freya: "jsCqWAovK2LkecY7zXl4",
|
|
2890
|
+
brian: "nPczCjzI2devNBz1zQrb",
|
|
2891
|
+
grace: "oWAxZDx7w5VEj9dCyTzz",
|
|
2892
|
+
daniel: "onwK4e9ZLuTAKqWW03F9",
|
|
2893
|
+
lily: "pFZP5JQG7iQjIQuC4Bku",
|
|
2894
|
+
serena: "pMsXgVXv3BLzUgSXRplE",
|
|
2895
|
+
adam: "pNInz6obpgDQGcFmaJgB",
|
|
2896
|
+
nicole: "piTKgcLEGmPE4e6mEKli",
|
|
2897
|
+
bill: "pqHfZKP75CvOlQylNhV4",
|
|
2898
|
+
jessie: "t0jbNlBVZ17f02VDIeMI",
|
|
2899
|
+
ryan: "wViXBPUzp2ZZixB1xQuM",
|
|
2900
|
+
sam: "yoZ06aMxZJJ28mfd3POQ",
|
|
2901
|
+
glinda: "z9fAnlkpzviPz146aGWa",
|
|
2902
|
+
giovanni: "zcAOhNBS3c14rBihAFp1",
|
|
2903
|
+
mimi: "zrHiDhphv9ZnVXBqCLjz",
|
|
2904
|
+
sarah: "EXAVITQu4vr4xnSDxMaL",
|
|
2905
|
+
alloy: "EXAVITQu4vr4xnSDxMaL"
|
|
2793
2906
|
};
|
|
2794
|
-
|
|
2795
|
-
|
|
2796
|
-
|
|
2797
|
-
|
|
2798
|
-
|
|
2799
|
-
init_esm_shims();
|
|
2800
|
-
var OPENAI_TRANSCRIPTION_URL = "https://api.openai.com/v1/audio/transcriptions";
|
|
2801
|
-
var DEFAULT_BUFFER_SIZE = 16e3 * 2;
|
|
2802
|
-
var ALLOWED_MODELS = /* @__PURE__ */ new Set(["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"]);
|
|
2803
|
-
function wrapPcmInWav(pcm, sampleRate = 16e3, channels = 1, bitsPerSample = 16) {
|
|
2804
|
-
const dataSize = pcm.length;
|
|
2805
|
-
const header = Buffer.alloc(44);
|
|
2806
|
-
header.write("RIFF", 0);
|
|
2807
|
-
header.writeUInt32LE(36 + dataSize, 4);
|
|
2808
|
-
header.write("WAVE", 8);
|
|
2809
|
-
header.write("fmt ", 12);
|
|
2810
|
-
header.writeUInt32LE(16, 16);
|
|
2811
|
-
header.writeUInt16LE(1, 20);
|
|
2812
|
-
header.writeUInt16LE(channels, 22);
|
|
2813
|
-
header.writeUInt32LE(sampleRate, 24);
|
|
2814
|
-
header.writeUInt32LE(sampleRate * channels * (bitsPerSample / 8), 28);
|
|
2815
|
-
header.writeUInt16LE(channels * (bitsPerSample / 8), 32);
|
|
2816
|
-
header.writeUInt16LE(bitsPerSample, 34);
|
|
2817
|
-
header.write("data", 36);
|
|
2818
|
-
header.writeUInt32LE(dataSize, 40);
|
|
2819
|
-
return Buffer.concat([header, pcm]);
|
|
2907
|
+
var VOICE_ID_PATTERN = /^[A-Za-z0-9]{20}$/;
|
|
2908
|
+
function resolveVoiceId(voice) {
|
|
2909
|
+
if (!voice) return voice;
|
|
2910
|
+
if (VOICE_ID_PATTERN.test(voice)) return voice;
|
|
2911
|
+
return ELEVENLABS_VOICE_ID_BY_NAME[voice.toLowerCase()] ?? voice;
|
|
2820
2912
|
}
|
|
2821
|
-
var
|
|
2822
|
-
|
|
2823
|
-
|
|
2913
|
+
var ElevenLabsModel = {
|
|
2914
|
+
V3: "eleven_v3",
|
|
2915
|
+
FLASH_V2_5: "eleven_flash_v2_5",
|
|
2916
|
+
TURBO_V2_5: "eleven_turbo_v2_5",
|
|
2917
|
+
MULTILINGUAL_V2: "eleven_multilingual_v2",
|
|
2918
|
+
MONOLINGUAL_V1: "eleven_monolingual_v1"
|
|
2919
|
+
};
|
|
2920
|
+
var ElevenLabsOutputFormat = {
|
|
2921
|
+
MP3_22050_32: "mp3_22050_32",
|
|
2922
|
+
MP3_44100_32: "mp3_44100_32",
|
|
2923
|
+
MP3_44100_64: "mp3_44100_64",
|
|
2924
|
+
MP3_44100_96: "mp3_44100_96",
|
|
2925
|
+
MP3_44100_128: "mp3_44100_128",
|
|
2926
|
+
MP3_44100_192: "mp3_44100_192",
|
|
2927
|
+
PCM_8000: "pcm_8000",
|
|
2928
|
+
PCM_16000: "pcm_16000",
|
|
2929
|
+
PCM_22050: "pcm_22050",
|
|
2930
|
+
PCM_24000: "pcm_24000",
|
|
2931
|
+
PCM_44100: "pcm_44100",
|
|
2932
|
+
ULAW_8000: "ulaw_8000"
|
|
2933
|
+
};
|
|
2934
|
+
var ElevenLabsTTS = class _ElevenLabsTTS {
|
|
2935
|
+
// Stable pricing/dashboard key — read by stream-handler / metrics via
|
|
2936
|
+
// ``(agent.tts.constructor as any).providerKey``. Without this the cost
|
|
2937
|
+
// calculator falls back to ``constructor.name`` ("ElevenLabsTTS") which
|
|
2938
|
+
// does NOT match the pricing table key "elevenlabs", silently zeroing
|
|
2939
|
+
// TTS cost for callers that construct the raw REST class directly
|
|
2940
|
+
// (exposed at top level as ``ElevenLabsRestTTS``).
|
|
2941
|
+
static providerKey = "elevenlabs";
|
|
2824
2942
|
apiKey;
|
|
2825
|
-
|
|
2826
|
-
|
|
2827
|
-
|
|
2828
|
-
|
|
2829
|
-
|
|
2830
|
-
|
|
2831
|
-
|
|
2832
|
-
chunks = [];
|
|
2833
|
-
bufferedBytes = 0;
|
|
2834
|
-
callbacks = /* @__PURE__ */ new Set();
|
|
2835
|
-
running = false;
|
|
2836
|
-
pendingTranscriptions = [];
|
|
2943
|
+
voiceId;
|
|
2944
|
+
modelId;
|
|
2945
|
+
_outputFormat;
|
|
2946
|
+
_outputFormatExplicit;
|
|
2947
|
+
voiceSettings;
|
|
2948
|
+
languageCode;
|
|
2949
|
+
chunkSize;
|
|
2837
2950
|
/**
|
|
2838
|
-
*
|
|
2839
|
-
*
|
|
2840
|
-
*
|
|
2841
|
-
* @param bufferSize Bytes of PCM16 to buffer before each transcription request.
|
|
2842
|
-
* @param responseFormat ``"json"`` (default) or ``"verbose_json"``.
|
|
2843
|
-
*
|
|
2844
|
-
* Argument order matches the Python SDK's ``WhisperSTT(api_key, language, model, response_format)``
|
|
2845
|
-
* for cross-language parity. Pre-0.5.3 the TS positional order was
|
|
2846
|
-
* ``(apiKey, model, language, bufferSize, responseFormat)`` — callers using
|
|
2847
|
-
* the old order will need to swap ``language`` and ``model``.
|
|
2951
|
+
* Public view of the (possibly auto-flipped) wire format. Read by the
|
|
2952
|
+
* stream-handler to decide whether to skip the client-side resample +
|
|
2953
|
+
* mulaw encode when the bytes are already in the carrier's wire codec.
|
|
2848
2954
|
*/
|
|
2849
|
-
|
|
2850
|
-
|
|
2851
|
-
|
|
2852
|
-
|
|
2853
|
-
|
|
2854
|
-
|
|
2855
|
-
|
|
2856
|
-
|
|
2857
|
-
|
|
2858
|
-
|
|
2859
|
-
|
|
2860
|
-
|
|
2861
|
-
|
|
2862
|
-
|
|
2863
|
-
|
|
2864
|
-
|
|
2865
|
-
|
|
2866
|
-
|
|
2867
|
-
|
|
2868
|
-
|
|
2869
|
-
|
|
2955
|
+
get outputFormat() {
|
|
2956
|
+
return this._outputFormat;
|
|
2957
|
+
}
|
|
2958
|
+
constructor(apiKey, voiceIdOrOptions = "21m00Tcm4TlvDq8ikWAM", modelId = ElevenLabsModel.FLASH_V2_5, outputFormat = ElevenLabsOutputFormat.PCM_16000) {
|
|
2959
|
+
this.apiKey = apiKey;
|
|
2960
|
+
if (typeof voiceIdOrOptions === "object") {
|
|
2961
|
+
const o = voiceIdOrOptions;
|
|
2962
|
+
this.voiceId = resolveVoiceId(o.voiceId ?? "21m00Tcm4TlvDq8ikWAM");
|
|
2963
|
+
this.modelId = o.modelId ?? ElevenLabsModel.FLASH_V2_5;
|
|
2964
|
+
this._outputFormatExplicit = o.outputFormat !== void 0;
|
|
2965
|
+
this._outputFormat = o.outputFormat ?? ElevenLabsOutputFormat.PCM_16000;
|
|
2966
|
+
this.voiceSettings = o.voiceSettings;
|
|
2967
|
+
this.languageCode = o.languageCode;
|
|
2968
|
+
this.chunkSize = o.chunkSize ?? 4096;
|
|
2969
|
+
} else {
|
|
2970
|
+
this.voiceId = resolveVoiceId(voiceIdOrOptions);
|
|
2971
|
+
this.modelId = modelId;
|
|
2972
|
+
this._outputFormatExplicit = outputFormat !== ElevenLabsOutputFormat.PCM_16000;
|
|
2973
|
+
this._outputFormat = outputFormat;
|
|
2974
|
+
this.voiceSettings = void 0;
|
|
2975
|
+
this.languageCode = void 0;
|
|
2976
|
+
this.chunkSize = 4096;
|
|
2977
|
+
}
|
|
2978
|
+
}
|
|
2979
|
+
/**
|
|
2980
|
+
* Hook called by ``StreamHandler.initPipeline`` to advise the carrier
|
|
2981
|
+
* wire format. When the user did NOT pass an explicit ``outputFormat``,
|
|
2982
|
+
* auto-flip to the carrier's native codec so the audio bytes ElevenLabs
|
|
2983
|
+
* returns are already in Twilio/Telnyx wire format — eliminating the
|
|
2984
|
+
* client-side 16 kHz → 8 kHz resample and PCM → μ-law encode. The
|
|
2985
|
+
* resample/encode chain was a source of audible artifacts on the
|
|
2986
|
+
* prewarmed firstMessage (see 0.6.2 acceptance notes — burst delivery
|
|
2987
|
+
* of resampled audio crackled on the carrier-side jitter buffer).
|
|
2988
|
+
*
|
|
2989
|
+
* No-op when the caller passed an explicit ``outputFormat`` (incl. via
|
|
2990
|
+
* the ``forTwilio`` / ``forTelnyx`` factories) — user wins.
|
|
2991
|
+
*
|
|
2992
|
+
* Parity with {@link ElevenLabsWebSocketTTS.setTelephonyCarrier}.
|
|
2993
|
+
*/
|
|
2994
|
+
setTelephonyCarrier(carrier) {
|
|
2995
|
+
if (this._outputFormatExplicit) return;
|
|
2996
|
+
if (carrier === "twilio") {
|
|
2997
|
+
this._outputFormat = ElevenLabsOutputFormat.ULAW_8000;
|
|
2998
|
+
} else if (carrier === "telnyx") {
|
|
2999
|
+
this._outputFormat = ElevenLabsOutputFormat.PCM_16000;
|
|
3000
|
+
}
|
|
3001
|
+
}
|
|
3002
|
+
/**
|
|
3003
|
+
* Construct an instance pre-configured for Twilio Media Streams.
|
|
3004
|
+
*
|
|
3005
|
+
* Sets `outputFormat='ulaw_8000'` so ElevenLabs emits μ-law @ 8 kHz
|
|
3006
|
+
* directly — the exact wire format Twilio's media stream uses — letting
|
|
3007
|
+
* the SDK skip the 16 kHz→8 kHz resample and PCM→μ-law conversion in
|
|
3008
|
+
* `TwilioAudioSender`. Saves ~30–80 ms first-byte and per-frame CPU,
|
|
3009
|
+
* and removes a potential aliasing source.
|
|
3010
|
+
*
|
|
3011
|
+
* `voiceSettings` defaults to a low-bandwidth-friendly profile
|
|
3012
|
+
* (speaker boost off, modest stability) which sounds cleaner at 8 kHz
|
|
3013
|
+
* μ-law than the studio default. Pass an explicit object to override.
|
|
3014
|
+
*/
|
|
3015
|
+
static forTwilio(apiKey, options = {}) {
|
|
3016
|
+
const voiceSettings = options.voiceSettings ?? {
|
|
3017
|
+
// Speaker boost adds high-frequency emphasis that aliases ugly over an
|
|
3018
|
+
// 8 kHz μ-law line. Slightly higher stability tames the excursions
|
|
3019
|
+
// that compander quantization noise can amplify.
|
|
3020
|
+
stability: 0.6,
|
|
3021
|
+
similarity_boost: 0.75,
|
|
3022
|
+
use_speaker_boost: false
|
|
3023
|
+
};
|
|
3024
|
+
return new _ElevenLabsTTS(apiKey, {
|
|
3025
|
+
...options,
|
|
3026
|
+
voiceSettings,
|
|
3027
|
+
outputFormat: ElevenLabsOutputFormat.ULAW_8000
|
|
3028
|
+
});
|
|
3029
|
+
}
|
|
3030
|
+
/**
|
|
3031
|
+
* Construct an instance pre-configured for Telnyx bidirectional media.
|
|
3032
|
+
*
|
|
3033
|
+
* Telnyx's default media-streaming codec is L16 PCM @ 16 kHz, which
|
|
3034
|
+
* matches our default Telnyx handler. We pick `pcm_16000` so the audio
|
|
3035
|
+
* flows end-to-end with zero resampling or transcoding.
|
|
3036
|
+
*
|
|
3037
|
+
* Trade-off: if your Telnyx profile is pinned to PCMU/8000 (μ-law),
|
|
3038
|
+
* construct `ElevenLabsTTS` directly with `outputFormat: 'ulaw_8000'`
|
|
3039
|
+
* — Telnyx supports that natively too.
|
|
3040
|
+
*/
|
|
3041
|
+
static forTelnyx(apiKey, options = {}) {
|
|
3042
|
+
return new _ElevenLabsTTS(apiKey, {
|
|
3043
|
+
...options,
|
|
3044
|
+
outputFormat: ElevenLabsOutputFormat.PCM_16000
|
|
3045
|
+
});
|
|
3046
|
+
}
|
|
3047
|
+
/**
|
|
3048
|
+
* Synthesise text to speech and return the full audio as a single Buffer.
|
|
3049
|
+
*
|
|
3050
|
+
* For large chunks (or when latency matters) call `synthesizeStream` instead.
|
|
3051
|
+
*/
|
|
3052
|
+
async synthesize(text) {
|
|
3053
|
+
const chunks = [];
|
|
3054
|
+
for await (const chunk of this.synthesizeStream(text)) {
|
|
3055
|
+
chunks.push(chunk);
|
|
3056
|
+
}
|
|
3057
|
+
return Buffer.concat(chunks);
|
|
3058
|
+
}
|
|
3059
|
+
/**
|
|
3060
|
+
* Synthesise text and yield audio chunks as they arrive (streaming).
|
|
3061
|
+
*
|
|
3062
|
+
* The yielded buffers are raw PCM at 16 kHz (or whatever `outputFormat` is
|
|
3063
|
+
* configured to). `chunkSize` controls the maximum yield size — 512 is a
|
|
3064
|
+
* good choice for low-latency telephony.
|
|
3065
|
+
*/
|
|
3066
|
+
async *synthesizeStream(text) {
|
|
3067
|
+
const url = `${ELEVENLABS_BASE_URL}/text-to-speech/${encodeURIComponent(this.voiceId)}/stream?output_format=${encodeURIComponent(this._outputFormat)}`;
|
|
3068
|
+
const body = {
|
|
3069
|
+
text,
|
|
3070
|
+
model_id: this.modelId
|
|
3071
|
+
};
|
|
3072
|
+
if (this.voiceSettings) body["voice_settings"] = this.voiceSettings;
|
|
3073
|
+
if (this.languageCode) body["language_code"] = this.languageCode;
|
|
3074
|
+
const response = await fetch(url, {
|
|
3075
|
+
method: "POST",
|
|
3076
|
+
headers: {
|
|
3077
|
+
"xi-api-key": this.apiKey,
|
|
3078
|
+
"Content-Type": "application/json"
|
|
3079
|
+
},
|
|
3080
|
+
body: JSON.stringify(body),
|
|
3081
|
+
signal: AbortSignal.timeout(3e4)
|
|
3082
|
+
});
|
|
3083
|
+
if (!response.ok) {
|
|
3084
|
+
const errBody = await response.text();
|
|
3085
|
+
throw new Error(`ElevenLabs TTS error ${response.status}: ${errBody}`);
|
|
3086
|
+
}
|
|
3087
|
+
if (!response.body) {
|
|
3088
|
+
throw new Error("ElevenLabs TTS: no response body");
|
|
3089
|
+
}
|
|
3090
|
+
const reader = response.body.getReader();
|
|
3091
|
+
try {
|
|
3092
|
+
while (true) {
|
|
3093
|
+
const { done, value } = await reader.read();
|
|
3094
|
+
if (done) break;
|
|
3095
|
+
if (!value || value.length === 0) continue;
|
|
3096
|
+
const buf = Buffer.from(value);
|
|
3097
|
+
for (let offset = 0; offset < buf.length; offset += this.chunkSize) {
|
|
3098
|
+
yield buf.subarray(offset, Math.min(offset + this.chunkSize, buf.length));
|
|
3099
|
+
}
|
|
3100
|
+
}
|
|
3101
|
+
} finally {
|
|
3102
|
+
if (typeof reader.cancel === "function") await reader.cancel().catch(() => {
|
|
3103
|
+
});
|
|
3104
|
+
reader.releaseLock();
|
|
3105
|
+
}
|
|
3106
|
+
}
|
|
3107
|
+
};
|
|
3108
|
+
|
|
3109
|
+
// src/providers/cartesia-tts.ts
|
|
3110
|
+
init_esm_shims();
|
|
3111
|
+
var CARTESIA_BASE_URL = "https://api.cartesia.ai";
|
|
3112
|
+
var CARTESIA_API_VERSION = "2025-04-16";
|
|
3113
|
+
var CARTESIA_DEFAULT_VOICE_ID = "f786b574-daa5-4673-aa0c-cbe3e8534c02";
|
|
3114
|
+
var CartesiaTTSModel = {
|
|
3115
|
+
SONIC_3: "sonic-3",
|
|
3116
|
+
SONIC_2: "sonic-2",
|
|
3117
|
+
SONIC: "sonic"
|
|
3118
|
+
};
|
|
3119
|
+
var CartesiaTTSContainer = {
|
|
3120
|
+
RAW: "raw",
|
|
3121
|
+
WAV: "wav",
|
|
3122
|
+
MP3: "mp3"
|
|
3123
|
+
};
|
|
3124
|
+
var CartesiaTTSEncoding = {
|
|
3125
|
+
PCM_S16LE: "pcm_s16le",
|
|
3126
|
+
PCM_F32LE: "pcm_f32le",
|
|
3127
|
+
PCM_MULAW: "pcm_mulaw",
|
|
3128
|
+
PCM_ALAW: "pcm_alaw"
|
|
3129
|
+
};
|
|
3130
|
+
var CartesiaTTSSampleRate = {
|
|
3131
|
+
HZ_8000: 8e3,
|
|
3132
|
+
HZ_16000: 16e3,
|
|
3133
|
+
HZ_22050: 22050,
|
|
3134
|
+
HZ_24000: 24e3,
|
|
3135
|
+
HZ_44100: 44100
|
|
3136
|
+
};
|
|
3137
|
+
var CartesiaTTSVoiceMode = {
|
|
3138
|
+
ID: "id",
|
|
3139
|
+
EMBEDDING: "embedding"
|
|
3140
|
+
};
|
|
3141
|
+
var CartesiaTTS = class _CartesiaTTS {
|
|
3142
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
3143
|
+
static providerKey = "cartesia_tts";
|
|
3144
|
+
apiKey;
|
|
3145
|
+
model;
|
|
3146
|
+
voice;
|
|
3147
|
+
language;
|
|
3148
|
+
sampleRate;
|
|
3149
|
+
speed;
|
|
3150
|
+
emotion;
|
|
3151
|
+
volume;
|
|
3152
|
+
baseUrl;
|
|
3153
|
+
apiVersion;
|
|
3154
|
+
constructor(apiKey, opts = {}) {
|
|
3155
|
+
this.apiKey = apiKey;
|
|
3156
|
+
this.model = opts.model ?? CartesiaTTSModel.SONIC_3;
|
|
3157
|
+
this.voice = opts.voice ?? CARTESIA_DEFAULT_VOICE_ID;
|
|
3158
|
+
this.language = opts.language ?? "en";
|
|
3159
|
+
this.sampleRate = opts.sampleRate ?? CartesiaTTSSampleRate.HZ_16000;
|
|
3160
|
+
this.speed = opts.speed;
|
|
3161
|
+
this.emotion = typeof opts.emotion === "string" ? [opts.emotion] : opts.emotion;
|
|
3162
|
+
this.volume = opts.volume;
|
|
3163
|
+
this.baseUrl = opts.baseUrl ?? CARTESIA_BASE_URL;
|
|
3164
|
+
this.apiVersion = opts.apiVersion ?? CARTESIA_API_VERSION;
|
|
3165
|
+
}
|
|
3166
|
+
/**
|
|
3167
|
+
* Construct an instance pre-configured for Twilio Media Streams.
|
|
3168
|
+
*
|
|
3169
|
+
* Sets `sampleRate=8000` so Cartesia emits PCM_S16LE @ 8 kHz directly.
|
|
3170
|
+
* Twilio's media stream uses μ-law @ 8 kHz so the SDK still does the
|
|
3171
|
+
* PCM → μ-law transcode client-side, but the 16 kHz → 8 kHz resample
|
|
3172
|
+
* step is skipped. Saves ~10–30 ms first-byte plus per-frame CPU and
|
|
3173
|
+
* removes a potential aliasing source.
|
|
3174
|
+
*/
|
|
3175
|
+
static forTwilio(apiKey, options = {}) {
|
|
3176
|
+
return new _CartesiaTTS(apiKey, {
|
|
3177
|
+
...options,
|
|
3178
|
+
sampleRate: CartesiaTTSSampleRate.HZ_8000
|
|
3179
|
+
});
|
|
3180
|
+
}
|
|
3181
|
+
/**
|
|
3182
|
+
* Construct an instance pre-configured for Telnyx bidirectional media.
|
|
3183
|
+
*
|
|
3184
|
+
* Sets `sampleRate=16000` to match Telnyx's L16/16000 default codec —
|
|
3185
|
+
* audio flows end-to-end with zero resampling or transcoding. Same as
|
|
3186
|
+
* the bare-constructor default; exists for API symmetry with
|
|
3187
|
+
* {@link CartesiaTTS.forTwilio}.
|
|
3188
|
+
*/
|
|
3189
|
+
static forTelnyx(apiKey, options = {}) {
|
|
3190
|
+
return new _CartesiaTTS(apiKey, {
|
|
3191
|
+
...options,
|
|
3192
|
+
sampleRate: CartesiaTTSSampleRate.HZ_16000
|
|
3193
|
+
});
|
|
3194
|
+
}
|
|
3195
|
+
/** Build the JSON payload for the Cartesia bytes endpoint. */
|
|
3196
|
+
buildPayload(text) {
|
|
3197
|
+
const payload = {
|
|
3198
|
+
model_id: this.model,
|
|
3199
|
+
voice: { mode: CartesiaTTSVoiceMode.ID, id: this.voice },
|
|
3200
|
+
transcript: text,
|
|
3201
|
+
output_format: {
|
|
3202
|
+
container: CartesiaTTSContainer.RAW,
|
|
3203
|
+
encoding: CartesiaTTSEncoding.PCM_S16LE,
|
|
3204
|
+
sample_rate: this.sampleRate
|
|
3205
|
+
},
|
|
3206
|
+
language: this.language
|
|
3207
|
+
};
|
|
3208
|
+
const generationConfig = {};
|
|
3209
|
+
if (this.speed !== void 0) generationConfig.speed = this.speed;
|
|
3210
|
+
if (this.emotion && this.emotion.length > 0)
|
|
3211
|
+
generationConfig.emotion = this.emotion[0];
|
|
3212
|
+
if (this.volume !== void 0) generationConfig.volume = this.volume;
|
|
3213
|
+
if (Object.keys(generationConfig).length > 0) {
|
|
3214
|
+
payload.generation_config = generationConfig;
|
|
3215
|
+
}
|
|
3216
|
+
return payload;
|
|
3217
|
+
}
|
|
3218
|
+
/**
|
|
3219
|
+
* Pre-call HTTP warmup for the Cartesia `/tts/bytes` endpoint.
|
|
3220
|
+
*
|
|
3221
|
+
* Issues a lightweight `GET <baseUrl>/voices` so DNS, TLS, and HTTP/2
|
|
3222
|
+
* are already up by the time the first `synthesizeStream()` POST
|
|
3223
|
+
* lands. Best-effort: 5 s timeout, all exceptions swallowed at
|
|
3224
|
+
* debug level.
|
|
3225
|
+
*
|
|
3226
|
+
* Billing safety: `GET /voices` is a free metadata read on
|
|
3227
|
+
* Cartesia's REST surface (per https://docs.cartesia.ai). It does
|
|
3228
|
+
* not consume synthesis credits. The actual synthesis is billed
|
|
3229
|
+
* only when `POST /tts/bytes` runs with a non-empty `transcript`.
|
|
3230
|
+
*
|
|
3231
|
+
* Note: Cartesia TTS uses the HTTP path (vs the WebSocket variant
|
|
3232
|
+
* Cartesia also exposes) — connection warmup is therefore HTTP-GET
|
|
3233
|
+
* based, not WebSocket pre-handshake. The latency win is smaller
|
|
3234
|
+
* (~50-150 ms vs the ~200-500 ms of a WS prewarm) but still real.
|
|
3235
|
+
*/
|
|
3236
|
+
async warmup() {
|
|
3237
|
+
try {
|
|
3238
|
+
await fetch(`${this.baseUrl}/voices`, {
|
|
3239
|
+
method: "GET",
|
|
3240
|
+
headers: {
|
|
3241
|
+
"X-API-Key": this.apiKey,
|
|
3242
|
+
"Cartesia-Version": this.apiVersion
|
|
3243
|
+
},
|
|
3244
|
+
signal: AbortSignal.timeout(5e3)
|
|
3245
|
+
});
|
|
3246
|
+
} catch (err) {
|
|
3247
|
+
getLogger().debug(`Cartesia TTS warmup failed (best-effort): ${String(err)}`);
|
|
3248
|
+
}
|
|
3249
|
+
}
|
|
3250
|
+
/** Synthesize text and return the concatenated audio buffer. */
|
|
3251
|
+
async synthesize(text) {
|
|
3252
|
+
const chunks = [];
|
|
3253
|
+
for await (const chunk of this.synthesizeStream(text)) {
|
|
3254
|
+
chunks.push(chunk);
|
|
3255
|
+
}
|
|
3256
|
+
return Buffer.concat(chunks);
|
|
3257
|
+
}
|
|
3258
|
+
/**
|
|
3259
|
+
* Synthesize text and yield raw PCM_S16LE chunks at the configured
|
|
3260
|
+
* `sampleRate` as they arrive from Cartesia.
|
|
3261
|
+
*/
|
|
3262
|
+
async *synthesizeStream(text) {
|
|
3263
|
+
const response = await fetch(`${this.baseUrl}/tts/bytes`, {
|
|
3264
|
+
method: "POST",
|
|
3265
|
+
headers: {
|
|
3266
|
+
"X-API-Key": this.apiKey,
|
|
3267
|
+
"Cartesia-Version": this.apiVersion,
|
|
3268
|
+
"Content-Type": "application/json"
|
|
3269
|
+
},
|
|
3270
|
+
body: JSON.stringify(this.buildPayload(text)),
|
|
3271
|
+
signal: AbortSignal.timeout(3e4)
|
|
3272
|
+
});
|
|
3273
|
+
if (!response.ok) {
|
|
3274
|
+
const body = await response.text();
|
|
3275
|
+
throw new Error(`Cartesia TTS error ${response.status}: ${body}`);
|
|
3276
|
+
}
|
|
3277
|
+
if (!response.body) {
|
|
3278
|
+
throw new Error("Cartesia TTS: no response body");
|
|
3279
|
+
}
|
|
3280
|
+
const reader = response.body.getReader();
|
|
3281
|
+
try {
|
|
3282
|
+
while (true) {
|
|
3283
|
+
const { done, value } = await reader.read();
|
|
3284
|
+
if (done) break;
|
|
3285
|
+
if (value && value.length > 0) {
|
|
3286
|
+
yield Buffer.from(value);
|
|
3287
|
+
}
|
|
3288
|
+
}
|
|
3289
|
+
} finally {
|
|
3290
|
+
if (typeof reader.cancel === "function")
|
|
3291
|
+
await reader.cancel().catch(() => {
|
|
3292
|
+
});
|
|
3293
|
+
reader.releaseLock();
|
|
3294
|
+
}
|
|
3295
|
+
}
|
|
3296
|
+
};
|
|
3297
|
+
|
|
3298
|
+
// src/providers/rime-tts.ts
|
|
3299
|
+
init_esm_shims();
|
|
3300
|
+
// Default endpoint used when the caller does not pass `baseUrl`.
var RIME_BASE_URL = "https://users.rime.ai/v1/rime-tts";

// Model identifiers sent as `modelId` in the request payload.
var RimeModel = {
  ARCANA: "arcana",
  MIST: "mist",
  MIST_V2: "mistv2"
};

// MIME types usable in the `accept` request header.
var RimeAudioFormat = {
  PCM: "audio/pcm",
  MP3: "audio/mp3",
  WAV: "audio/wav",
  MULAW: "audio/mulaw"
};

// Whole-request timeouts in milliseconds: 4 minutes for arcana, 30 seconds otherwise.
var ARCANA_MODEL_TIMEOUT_MS = 4 * 60 * 1e3;
var MIST_MODEL_TIMEOUT_MS = 30 * 1e3;
|
|
3314
|
+
/**
 * Report whether a model id belongs to the mist family.
 *
 * Matches by substring, so any id containing `RimeModel.MIST` qualifies.
 *
 * @param model Model id string.
 * @returns `true` when `model` contains `RimeModel.MIST`.
 */
function isMistModel(model) {
  const family = RimeModel.MIST;
  return model.includes(family);
}
|
|
3317
|
+
/**
 * Pick the whole-request timeout for a model.
 *
 * @param model Model id string.
 * @returns `ARCANA_MODEL_TIMEOUT_MS` for exactly `RimeModel.ARCANA`,
 *   otherwise `MIST_MODEL_TIMEOUT_MS` (including for unrecognised ids).
 */
function timeoutForModel(model) {
  return model === RimeModel.ARCANA ? ARCANA_MODEL_TIMEOUT_MS : MIST_MODEL_TIMEOUT_MS;
}
|
|
3321
|
+
var RimeTTS = class {
  /** Stable pricing/dashboard key — read by stream-handler/metrics. */
  static providerKey = "rime";
  apiKey;
  model;
  speaker;
  lang;
  sampleRate;
  repetitionPenalty;
  temperature;
  topP;
  maxTokens;
  speedAlpha;
  reduceLatency;
  pauseBetweenBrackets;
  phonemizeBetweenBrackets;
  baseUrl;
  totalTimeoutMs;
  /**
   * @param apiKey Rime API key, sent as a Bearer token.
   * @param opts Optional overrides. Defaults: `model` = `RimeModel.ARCANA`,
   *   `speaker` = "cove" for mist-family models and "astra" otherwise,
   *   `lang` = "eng", `sampleRate` = 16000, `baseUrl` = `RIME_BASE_URL`.
   *   All remaining tuning options are stored as given (possibly
   *   `undefined`). The request timeout is derived from the model.
   */
  constructor(apiKey, opts = {}) {
    const model = opts.model ?? RimeModel.ARCANA;
    const fallbackSpeaker = isMistModel(model) ? "cove" : "astra";
    this.apiKey = apiKey;
    this.model = model;
    this.speaker = opts.speaker ?? fallbackSpeaker;
    this.lang = opts.lang ?? "eng";
    this.sampleRate = opts.sampleRate ?? 16e3;
    this.repetitionPenalty = opts.repetitionPenalty;
    this.temperature = opts.temperature;
    this.topP = opts.topP;
    this.maxTokens = opts.maxTokens;
    this.speedAlpha = opts.speedAlpha;
    this.reduceLatency = opts.reduceLatency;
    this.pauseBetweenBrackets = opts.pauseBetweenBrackets;
    this.phonemizeBetweenBrackets = opts.phonemizeBetweenBrackets;
    this.baseUrl = opts.baseUrl ?? RIME_BASE_URL;
    this.totalTimeoutMs = timeoutForModel(model);
  }
  /**
   * Build the JSON request body for `text`.
   *
   * Always contains `speaker`, `text` and `modelId`. The arcana model adds
   * its sampling options (when set) plus `lang` / `samplingRate`; mist-family
   * models add `lang` / `samplingRate` plus their own options (when set),
   * with `reduceLatency` only for `RimeModel.MIST_V2`. Any other model id
   * gets the three base fields only.
   *
   * @param text Text to synthesize.
   * @returns Plain payload object.
   */
  buildPayload(text) {
    const payload = { speaker: this.speaker, text, modelId: this.model };
    const putIfSet = (key, value) => {
      if (value !== void 0) payload[key] = value;
    };
    if (this.model === RimeModel.ARCANA) {
      putIfSet("repetition_penalty", this.repetitionPenalty);
      putIfSet("temperature", this.temperature);
      putIfSet("top_p", this.topP);
      putIfSet("max_tokens", this.maxTokens);
      payload.lang = this.lang;
      payload.samplingRate = this.sampleRate;
      return payload;
    }
    if (!isMistModel(this.model)) return payload;
    payload.lang = this.lang;
    payload.samplingRate = this.sampleRate;
    putIfSet("speedAlpha", this.speedAlpha);
    if (this.model === RimeModel.MIST_V2) {
      putIfSet("reduceLatency", this.reduceLatency);
    }
    putIfSet("pauseBetweenBrackets", this.pauseBetweenBrackets);
    putIfSet("phonemizeBetweenBrackets", this.phonemizeBetweenBrackets);
    return payload;
  }
  /**
   * Synthesize `text` and return all audio as one contiguous buffer.
   *
   * @param text Text to synthesize.
   * @returns Promise resolving to the concatenated audio `Buffer`.
   */
  async synthesize(text) {
    const parts = [];
    const stream = this.synthesizeStream(text);
    for await (const part of stream) parts.push(part);
    return Buffer.concat(parts);
  }
  /**
   * Stream synthesized speech for `text`.
   *
   * POSTs the payload to `baseUrl` with `accept: audio/pcm` and yields each
   * non-empty chunk of the response body as a `Buffer`, in arrival order.
   * The request is aborted after `totalTimeoutMs`.
   *
   * @param text Text to synthesize.
   * @yields {Buffer} Audio bytes as they stream in.
   * @throws {Error} On a non-OK HTTP status, when the response content type
   *   does not start with "audio", or when the response has no body.
   */
  async *synthesizeStream(text) {
    const response = await fetch(this.baseUrl, {
      method: "POST",
      headers: {
        accept: RimeAudioFormat.PCM,
        Authorization: `Bearer ${this.apiKey}`,
        "content-type": "application/json"
      },
      body: JSON.stringify(this.buildPayload(text)),
      signal: AbortSignal.timeout(this.totalTimeoutMs)
    });
    if (!response.ok) {
      const detail = await response.text();
      throw new Error(`Rime TTS error ${response.status}: ${detail}`);
    }
    // A success status can still carry a non-audio body; surface its first
    // 500 characters instead of yielding it as audio.
    const mime = response.headers.get("content-type") ?? "";
    if (!mime.startsWith("audio")) {
      const detail = await response.text();
      throw new Error(`Rime returned non-audio response: ${detail.slice(0, 500)}`);
    }
    if (!response.body) throw new Error("Rime TTS: no response body");
    const reader = response.body.getReader();
    try {
      for (let step = await reader.read(); !step.done; step = await reader.read()) {
        const bytes = step.value;
        if (bytes && bytes.length > 0) yield Buffer.from(bytes);
      }
    } finally {
      // Runs on completion, error, or early consumer exit.
      if (typeof reader.cancel === "function") {
        await reader.cancel().catch(() => {
        });
      }
      reader.releaseLock();
    }
  }
};
|
|
3439
|
+
|
|
3440
|
+
// src/stt/deepgram.ts
|
|
3441
|
+
init_esm_shims();
|
|
3442
|
+
/**
 * Options-object front end for `DeepgramSTT`.
 *
 * Resolves the API key from `opts.apiKey` or the `DEEPGRAM_API_KEY`
 * environment variable, applies defaults, and forwards everything to the
 * positional `DeepgramSTT` constructor.
 */
var STT = class extends DeepgramSTT {
  static providerKey = "deepgram";
  /**
   * @param opts Optional settings. Defaults: `language` "en", `model`
   *   "nova-3", `encoding` "linear16", `sampleRate` 16000, `endpointingMs`
   *   150, `utteranceEndMs` 1000 (an explicit `null` is passed through
   *   unchanged), `smartFormat` true, `interimResults` true. `vadEvents` is
   *   forwarded only when it is not `undefined`.
   * @throws {Error} When no API key is supplied and none is in the environment.
   */
  constructor(opts = {}) {
    const apiKey = opts.apiKey ?? process.env.DEEPGRAM_API_KEY;
    if (!apiKey) {
      throw new Error(
        "Deepgram STT requires an apiKey. Pass { apiKey: 'dg_...' } or set DEEPGRAM_API_KEY in the environment."
      );
    }
    const tuning = {
      endpointingMs: opts.endpointingMs ?? 150,
      // Only `undefined` falls back to the default; `null` is a real value here.
      utteranceEndMs: opts.utteranceEndMs === void 0 ? 1e3 : opts.utteranceEndMs,
      smartFormat: opts.smartFormat ?? true,
      interimResults: opts.interimResults ?? true
    };
    if (opts.vadEvents !== void 0) tuning.vadEvents = opts.vadEvents;
    const language = opts.language ?? "en";
    const model = opts.model ?? "nova-3";
    const encoding = opts.encoding ?? "linear16";
    const sampleRate = opts.sampleRate ?? 16e3;
    super(apiKey, language, model, encoding, sampleRate, tuning);
  }
};
|
|
3467
|
+
|
|
3468
|
+
// src/stt/whisper.ts
|
|
3469
|
+
init_esm_shims();
|
|
3470
|
+
|
|
3471
|
+
// src/providers/whisper-stt.ts
|
|
3472
|
+
init_esm_shims();
|
|
3473
|
+
// OpenAI transcription endpoint.
var OPENAI_TRANSCRIPTION_URL = "https://api.openai.com/v1/audio/transcriptions";

// Default number of buffered PCM bytes per transcription request (32 000).
var DEFAULT_BUFFER_SIZE = 2 * 16e3;

// Model ids accepted by the WhisperSTT constructor.
var ALLOWED_MODELS = /* @__PURE__ */ new Set([
  "whisper-1",
  "gpt-4o-transcribe",
  "gpt-4o-mini-transcribe"
]);
|
|
3476
|
+
/**
 * Prefix raw PCM audio with a 44-byte RIFF/WAVE header.
 *
 * The header declares audio format 1 (PCM) and derives byte rate and block
 * alignment from the given parameters. The PCM bytes are appended unchanged.
 *
 * @param pcm Buffer of raw PCM samples.
 * @param sampleRate Samples per second (default 16000).
 * @param channels Channel count (default 1).
 * @param bitsPerSample Bits per sample (default 16).
 * @returns New Buffer: header followed by `pcm`.
 */
function wrapPcmInWav(pcm, sampleRate = 16e3, channels = 1, bitsPerSample = 16) {
  const payloadBytes = pcm.length;
  const wavHeader = Buffer.alloc(44);
  // Sequential writer: each helper writes at the cursor and advances it.
  let cursor = 0;
  const tag = (fourCc) => {
    cursor += wavHeader.write(fourCc, cursor);
  };
  const u32 = (value) => {
    cursor = wavHeader.writeUInt32LE(value, cursor);
  };
  const u16 = (value) => {
    cursor = wavHeader.writeUInt16LE(value, cursor);
  };
  tag("RIFF");
  u32(36 + payloadBytes); // RIFF chunk size: rest of header + data
  tag("WAVE");
  tag("fmt ");
  u32(16); // fmt chunk size
  u16(1); // audio format 1 = PCM
  u16(channels);
  u32(sampleRate);
  u32(sampleRate * channels * (bitsPerSample / 8)); // byte rate
  u16(channels * (bitsPerSample / 8)); // block align
  u16(bitsPerSample);
  tag("data");
  u32(payloadBytes);
  return Buffer.concat([wavHeader, pcm]);
}
|
|
3494
|
+
var WhisperSTT = class _WhisperSTT {
|
|
3495
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
3496
|
+
static providerKey = "whisper";
|
|
3497
|
+
apiKey;
|
|
3498
|
+
model;
|
|
3499
|
+
language;
|
|
3500
|
+
bufferSize;
|
|
3501
|
+
responseFormat;
|
|
3502
|
+
// Accumulate chunks in an array and concat once on flush — avoids the
|
|
3503
|
+
// per-``sendAudio`` O(n) ``Buffer.concat([buffer, chunk])`` that quickly
|
|
3504
|
+
// dominates CPU when the phone leg delivers 20 ms frames.
|
|
3505
|
+
chunks = [];
|
|
3506
|
+
bufferedBytes = 0;
|
|
3507
|
+
callbacks = /* @__PURE__ */ new Set();
|
|
3508
|
+
running = false;
|
|
3509
|
+
pendingTranscriptions = [];
|
|
3510
|
+
/**
|
|
3511
|
+
* @param apiKey OpenAI API key.
|
|
3512
|
+
* @param language ISO-639-1 language code (e.g. ``"en"``, ``"it"``). Optional.
|
|
3513
|
+
* @param model One of ``whisper-1``, ``gpt-4o-transcribe``, ``gpt-4o-mini-transcribe``.
|
|
3514
|
+
* @param bufferSize Bytes of PCM16 to buffer before each transcription request.
|
|
3515
|
+
* @param responseFormat ``"json"`` (default) or ``"verbose_json"``.
|
|
3516
|
+
*
|
|
3517
|
+
* Argument order matches the Python SDK's ``WhisperSTT(api_key, language, model, response_format)``
|
|
3518
|
+
* for cross-language parity. Pre-0.5.3 the TS positional order was
|
|
3519
|
+
* ``(apiKey, model, language, bufferSize, responseFormat)`` — callers using
|
|
3520
|
+
* the old order will need to swap ``language`` and ``model``.
|
|
3521
|
+
*/
|
|
3522
|
+
constructor(apiKey, language, model = "whisper-1", bufferSize = DEFAULT_BUFFER_SIZE, responseFormat = "json") {
|
|
3523
|
+
if (!ALLOWED_MODELS.has(model)) {
|
|
3524
|
+
throw new Error(
|
|
3525
|
+
`WhisperSTT: unsupported model "${model}". Expected one of ${[...ALLOWED_MODELS].join(", ")}.`
|
|
3526
|
+
);
|
|
3527
|
+
}
|
|
3528
|
+
this.apiKey = apiKey;
|
|
3529
|
+
this.model = model;
|
|
3530
|
+
this.language = language;
|
|
3531
|
+
this.bufferSize = bufferSize;
|
|
3532
|
+
this.responseFormat = responseFormat;
|
|
3533
|
+
}
|
|
3534
|
+
/** Factory for Twilio calls — mulaw 8 kHz is transcoded upstream, so we still receive PCM 16-bit. */
|
|
3535
|
+
static forTwilio(apiKey, language = "en", model = "whisper-1") {
|
|
3536
|
+
return new _WhisperSTT(apiKey, language, model);
|
|
3537
|
+
}
|
|
3538
|
+
/** Reset the audio buffer and arm the adapter for incoming chunks. */
|
|
3539
|
+
async connect() {
|
|
3540
|
+
this.running = true;
|
|
3541
|
+
this.chunks = [];
|
|
3542
|
+
this.bufferedBytes = 0;
|
|
2870
3543
|
}
|
|
2871
3544
|
/** Buffer a PCM16 chunk; flushes to Whisper once `bufferSize` bytes are reached. */
|
|
2872
3545
|
sendAudio(audio) {
|
|
@@ -4448,264 +5121,42 @@ var SpeechmaticsSTT = class {
|
|
|
4448
5121
|
close() {
|
|
4449
5122
|
this.running = false;
|
|
4450
5123
|
const ws = this.ws;
|
|
4451
|
-
if (!ws) return;
|
|
4452
|
-
this.ws = null;
|
|
4453
|
-
const sendSafe = (payload) => {
|
|
4454
|
-
if (ws.readyState === WebSocket5.OPEN) {
|
|
4455
|
-
try {
|
|
4456
|
-
ws.send(payload);
|
|
4457
|
-
} catch {
|
|
4458
|
-
}
|
|
4459
|
-
}
|
|
4460
|
-
};
|
|
4461
|
-
sendSafe(
|
|
4462
|
-
JSON.stringify({ message: "EndOfStream", last_seq_no: this.lastSeqNo })
|
|
4463
|
-
);
|
|
4464
|
-
try {
|
|
4465
|
-
ws.close();
|
|
4466
|
-
} catch {
|
|
4467
|
-
}
|
|
4468
|
-
}
|
|
4469
|
-
};
|
|
4470
|
-
|
|
4471
|
-
// src/stt/speechmatics.ts
|
|
4472
|
-
var STT7 = class extends SpeechmaticsSTT {
|
|
4473
|
-
static providerKey = "speechmatics";
|
|
4474
|
-
constructor(opts = {}) {
|
|
4475
|
-
const key = opts.apiKey ?? process.env.SPEECHMATICS_API_KEY;
|
|
4476
|
-
if (!key) {
|
|
4477
|
-
throw new Error(
|
|
4478
|
-
"Speechmatics STT requires an apiKey. Pass { apiKey: 'sm_...' } or set SPEECHMATICS_API_KEY in the environment."
|
|
4479
|
-
);
|
|
4480
|
-
}
|
|
4481
|
-
super(key, opts);
|
|
4482
|
-
}
|
|
4483
|
-
};
|
|
4484
|
-
|
|
4485
|
-
// src/tts/elevenlabs.ts
|
|
4486
|
-
init_esm_shims();
|
|
4487
|
-
|
|
4488
|
-
// src/providers/elevenlabs-tts.ts
|
|
4489
|
-
init_esm_shims();
|
|
4490
|
-
var ELEVENLABS_BASE_URL = "https://api.elevenlabs.io/v1";
|
|
4491
|
-
var ELEVENLABS_VOICE_ID_BY_NAME = {
|
|
4492
|
-
rachel: "21m00Tcm4TlvDq8ikWAM",
|
|
4493
|
-
drew: "29vD33N1CtxCmqQRPOHJ",
|
|
4494
|
-
clyde: "2EiwWnXFnvU5JabPnv8n",
|
|
4495
|
-
paul: "5Q0t7uMcjvnagumLfvZi",
|
|
4496
|
-
domi: "AZnzlk1XvdvUeBnXmlld",
|
|
4497
|
-
dave: "CYw3kZ02Hs0563khs1Fj",
|
|
4498
|
-
fin: "D38z5RcWu1voky8WS1ja",
|
|
4499
|
-
bella: "EXAVITQu4vr4xnSDxMaL",
|
|
4500
|
-
antoni: "ErXwobaYiN019PkySvjV",
|
|
4501
|
-
thomas: "GBv7mTt0atIp3Br8iCZE",
|
|
4502
|
-
charlie: "IKne3meq5aSn9XLyUdCD",
|
|
4503
|
-
george: "JBFqnCBsd6RMkjVDRZzb",
|
|
4504
|
-
emily: "LcfcDJNUP1GQjkzn1xUU",
|
|
4505
|
-
elli: "MF3mGyEYCl7XYWbV9V6O",
|
|
4506
|
-
callum: "N2lVS1w4EtoT3dr4eOWO",
|
|
4507
|
-
patrick: "ODq5zmih8GrVes37Dizd",
|
|
4508
|
-
harry: "SOYHLrjzK2X1ezoPC6cr",
|
|
4509
|
-
liam: "TX3LPaxmHKxFdv7VOQHJ",
|
|
4510
|
-
dorothy: "ThT5KcBeYPX3keUQqHPh",
|
|
4511
|
-
josh: "TxGEqnHWrfWFTfGW9XjX",
|
|
4512
|
-
arnold: "VR6AewLTigWG4xSOukaG",
|
|
4513
|
-
charlotte: "XB0fDUnXU5powFXDhCwa",
|
|
4514
|
-
matilda: "XrExE9yKIg1WjnnlVkGX",
|
|
4515
|
-
matthew: "Yko7PKHZNXotIFUBG7I9",
|
|
4516
|
-
james: "ZQe5CZNOzWyzPSCn5a3c",
|
|
4517
|
-
joseph: "Zlb1dXrM653N07WRdFW3",
|
|
4518
|
-
jeremy: "bVMeCyTHy58xNoL34h3p",
|
|
4519
|
-
michael: "flq6f7yk4E4fJM5XTYuZ",
|
|
4520
|
-
ethan: "g5CIjZEefAph4nQFvHAz",
|
|
4521
|
-
gigi: "jBpfuIE2acCO8z3wKNLl",
|
|
4522
|
-
freya: "jsCqWAovK2LkecY7zXl4",
|
|
4523
|
-
brian: "nPczCjzI2devNBz1zQrb",
|
|
4524
|
-
grace: "oWAxZDx7w5VEj9dCyTzz",
|
|
4525
|
-
daniel: "onwK4e9ZLuTAKqWW03F9",
|
|
4526
|
-
lily: "pFZP5JQG7iQjIQuC4Bku",
|
|
4527
|
-
serena: "pMsXgVXv3BLzUgSXRplE",
|
|
4528
|
-
adam: "pNInz6obpgDQGcFmaJgB",
|
|
4529
|
-
nicole: "piTKgcLEGmPE4e6mEKli",
|
|
4530
|
-
bill: "pqHfZKP75CvOlQylNhV4",
|
|
4531
|
-
jessie: "t0jbNlBVZ17f02VDIeMI",
|
|
4532
|
-
ryan: "wViXBPUzp2ZZixB1xQuM",
|
|
4533
|
-
sam: "yoZ06aMxZJJ28mfd3POQ",
|
|
4534
|
-
glinda: "z9fAnlkpzviPz146aGWa",
|
|
4535
|
-
giovanni: "zcAOhNBS3c14rBihAFp1",
|
|
4536
|
-
mimi: "zrHiDhphv9ZnVXBqCLjz",
|
|
4537
|
-
sarah: "EXAVITQu4vr4xnSDxMaL",
|
|
4538
|
-
alloy: "EXAVITQu4vr4xnSDxMaL"
|
|
4539
|
-
};
|
|
4540
|
-
var VOICE_ID_PATTERN = /^[A-Za-z0-9]{20}$/;
|
|
4541
|
-
function resolveVoiceId(voice) {
|
|
4542
|
-
if (!voice) return voice;
|
|
4543
|
-
if (VOICE_ID_PATTERN.test(voice)) return voice;
|
|
4544
|
-
return ELEVENLABS_VOICE_ID_BY_NAME[voice.toLowerCase()] ?? voice;
|
|
4545
|
-
}
|
|
4546
|
-
var ElevenLabsModel = {
|
|
4547
|
-
V3: "eleven_v3",
|
|
4548
|
-
FLASH_V2_5: "eleven_flash_v2_5",
|
|
4549
|
-
TURBO_V2_5: "eleven_turbo_v2_5",
|
|
4550
|
-
MULTILINGUAL_V2: "eleven_multilingual_v2",
|
|
4551
|
-
MONOLINGUAL_V1: "eleven_monolingual_v1"
|
|
4552
|
-
};
|
|
4553
|
-
var ElevenLabsOutputFormat = {
|
|
4554
|
-
MP3_22050_32: "mp3_22050_32",
|
|
4555
|
-
MP3_44100_32: "mp3_44100_32",
|
|
4556
|
-
MP3_44100_64: "mp3_44100_64",
|
|
4557
|
-
MP3_44100_96: "mp3_44100_96",
|
|
4558
|
-
MP3_44100_128: "mp3_44100_128",
|
|
4559
|
-
MP3_44100_192: "mp3_44100_192",
|
|
4560
|
-
PCM_8000: "pcm_8000",
|
|
4561
|
-
PCM_16000: "pcm_16000",
|
|
4562
|
-
PCM_22050: "pcm_22050",
|
|
4563
|
-
PCM_24000: "pcm_24000",
|
|
4564
|
-
PCM_44100: "pcm_44100",
|
|
4565
|
-
ULAW_8000: "ulaw_8000"
|
|
4566
|
-
};
|
|
4567
|
-
var ElevenLabsTTS = class _ElevenLabsTTS {
|
|
4568
|
-
// Stable pricing/dashboard key — read by stream-handler / metrics via
|
|
4569
|
-
// ``(agent.tts.constructor as any).providerKey``. Without this the cost
|
|
4570
|
-
// calculator falls back to ``constructor.name`` ("ElevenLabsTTS") which
|
|
4571
|
-
// does NOT match the pricing table key "elevenlabs", silently zeroing
|
|
4572
|
-
// TTS cost for callers that construct the raw REST class directly
|
|
4573
|
-
// (exposed at top level as ``ElevenLabsRestTTS``).
|
|
4574
|
-
static providerKey = "elevenlabs";
|
|
4575
|
-
apiKey;
|
|
4576
|
-
voiceId;
|
|
4577
|
-
modelId;
|
|
4578
|
-
outputFormat;
|
|
4579
|
-
voiceSettings;
|
|
4580
|
-
languageCode;
|
|
4581
|
-
chunkSize;
|
|
4582
|
-
constructor(apiKey, voiceIdOrOptions = "21m00Tcm4TlvDq8ikWAM", modelId = ElevenLabsModel.FLASH_V2_5, outputFormat = ElevenLabsOutputFormat.PCM_16000) {
|
|
4583
|
-
this.apiKey = apiKey;
|
|
4584
|
-
if (typeof voiceIdOrOptions === "object") {
|
|
4585
|
-
const o = voiceIdOrOptions;
|
|
4586
|
-
this.voiceId = resolveVoiceId(o.voiceId ?? "21m00Tcm4TlvDq8ikWAM");
|
|
4587
|
-
this.modelId = o.modelId ?? ElevenLabsModel.FLASH_V2_5;
|
|
4588
|
-
this.outputFormat = o.outputFormat ?? ElevenLabsOutputFormat.PCM_16000;
|
|
4589
|
-
this.voiceSettings = o.voiceSettings;
|
|
4590
|
-
this.languageCode = o.languageCode;
|
|
4591
|
-
this.chunkSize = o.chunkSize ?? 4096;
|
|
4592
|
-
} else {
|
|
4593
|
-
this.voiceId = resolveVoiceId(voiceIdOrOptions);
|
|
4594
|
-
this.modelId = modelId;
|
|
4595
|
-
this.outputFormat = outputFormat;
|
|
4596
|
-
this.voiceSettings = void 0;
|
|
4597
|
-
this.languageCode = void 0;
|
|
4598
|
-
this.chunkSize = 4096;
|
|
4599
|
-
}
|
|
4600
|
-
}
|
|
4601
|
-
/**
|
|
4602
|
-
* Construct an instance pre-configured for Twilio Media Streams.
|
|
4603
|
-
*
|
|
4604
|
-
* Sets `outputFormat='ulaw_8000'` so ElevenLabs emits μ-law @ 8 kHz
|
|
4605
|
-
* directly — the exact wire format Twilio's media stream uses — letting
|
|
4606
|
-
* the SDK skip the 16 kHz→8 kHz resample and PCM→μ-law conversion in
|
|
4607
|
-
* `TwilioAudioSender`. Saves ~30–80 ms first-byte and per-frame CPU,
|
|
4608
|
-
* and removes a potential aliasing source.
|
|
4609
|
-
*
|
|
4610
|
-
* `voiceSettings` defaults to a low-bandwidth-friendly profile
|
|
4611
|
-
* (speaker boost off, modest stability) which sounds cleaner at 8 kHz
|
|
4612
|
-
* μ-law than the studio default. Pass an explicit object to override.
|
|
4613
|
-
*/
|
|
4614
|
-
static forTwilio(apiKey, options = {}) {
|
|
4615
|
-
const voiceSettings = options.voiceSettings ?? {
|
|
4616
|
-
// Speaker boost adds high-frequency emphasis that aliases ugly over an
|
|
4617
|
-
// 8 kHz μ-law line. Slightly higher stability tames the excursions
|
|
4618
|
-
// that compander quantization noise can amplify.
|
|
4619
|
-
stability: 0.6,
|
|
4620
|
-
similarity_boost: 0.75,
|
|
4621
|
-
use_speaker_boost: false
|
|
4622
|
-
};
|
|
4623
|
-
return new _ElevenLabsTTS(apiKey, {
|
|
4624
|
-
...options,
|
|
4625
|
-
voiceSettings,
|
|
4626
|
-
outputFormat: ElevenLabsOutputFormat.ULAW_8000
|
|
4627
|
-
});
|
|
4628
|
-
}
|
|
4629
|
-
/**
|
|
4630
|
-
* Construct an instance pre-configured for Telnyx bidirectional media.
|
|
4631
|
-
*
|
|
4632
|
-
* Telnyx's default media-streaming codec is L16 PCM @ 16 kHz, which
|
|
4633
|
-
* matches our default Telnyx handler. We pick `pcm_16000` so the audio
|
|
4634
|
-
* flows end-to-end with zero resampling or transcoding.
|
|
4635
|
-
*
|
|
4636
|
-
* Trade-off: if your Telnyx profile is pinned to PCMU/8000 (μ-law),
|
|
4637
|
-
* construct `ElevenLabsTTS` directly with `outputFormat: 'ulaw_8000'`
|
|
4638
|
-
* — Telnyx supports that natively too.
|
|
4639
|
-
*/
|
|
4640
|
-
static forTelnyx(apiKey, options = {}) {
|
|
4641
|
-
return new _ElevenLabsTTS(apiKey, {
|
|
4642
|
-
...options,
|
|
4643
|
-
outputFormat: ElevenLabsOutputFormat.PCM_16000
|
|
4644
|
-
});
|
|
4645
|
-
}
|
|
4646
|
-
/**
|
|
4647
|
-
* Synthesise text to speech and return the full audio as a single Buffer.
|
|
4648
|
-
*
|
|
4649
|
-
* For large chunks (or when latency matters) call `synthesizeStream` instead.
|
|
4650
|
-
*/
|
|
4651
|
-
async synthesize(text) {
|
|
4652
|
-
const chunks = [];
|
|
4653
|
-
for await (const chunk of this.synthesizeStream(text)) {
|
|
4654
|
-
chunks.push(chunk);
|
|
4655
|
-
}
|
|
4656
|
-
return Buffer.concat(chunks);
|
|
4657
|
-
}
|
|
4658
|
-
/**
|
|
4659
|
-
* Synthesise text and yield audio chunks as they arrive (streaming).
|
|
4660
|
-
*
|
|
4661
|
-
* The yielded buffers are raw PCM at 16 kHz (or whatever `outputFormat` is
|
|
4662
|
-
* configured to). `chunkSize` controls the maximum yield size — 512 is a
|
|
4663
|
-
* good choice for low-latency telephony.
|
|
4664
|
-
*/
|
|
4665
|
-
async *synthesizeStream(text) {
|
|
4666
|
-
const url = `${ELEVENLABS_BASE_URL}/text-to-speech/${encodeURIComponent(this.voiceId)}/stream?output_format=${encodeURIComponent(this.outputFormat)}`;
|
|
4667
|
-
const body = {
|
|
4668
|
-
text,
|
|
4669
|
-
model_id: this.modelId
|
|
4670
|
-
};
|
|
4671
|
-
if (this.voiceSettings) body["voice_settings"] = this.voiceSettings;
|
|
4672
|
-
if (this.languageCode) body["language_code"] = this.languageCode;
|
|
4673
|
-
const response = await fetch(url, {
|
|
4674
|
-
method: "POST",
|
|
4675
|
-
headers: {
|
|
4676
|
-
"xi-api-key": this.apiKey,
|
|
4677
|
-
"Content-Type": "application/json"
|
|
4678
|
-
},
|
|
4679
|
-
body: JSON.stringify(body),
|
|
4680
|
-
signal: AbortSignal.timeout(3e4)
|
|
4681
|
-
});
|
|
4682
|
-
if (!response.ok) {
|
|
4683
|
-
const errBody = await response.text();
|
|
4684
|
-
throw new Error(`ElevenLabs TTS error ${response.status}: ${errBody}`);
|
|
4685
|
-
}
|
|
4686
|
-
if (!response.body) {
|
|
4687
|
-
throw new Error("ElevenLabs TTS: no response body");
|
|
4688
|
-
}
|
|
4689
|
-
const reader = response.body.getReader();
|
|
4690
|
-
try {
|
|
4691
|
-
while (true) {
|
|
4692
|
-
const { done, value } = await reader.read();
|
|
4693
|
-
if (done) break;
|
|
4694
|
-
if (!value || value.length === 0) continue;
|
|
4695
|
-
const buf = Buffer.from(value);
|
|
4696
|
-
for (let offset = 0; offset < buf.length; offset += this.chunkSize) {
|
|
4697
|
-
yield buf.subarray(offset, Math.min(offset + this.chunkSize, buf.length));
|
|
5124
|
+
if (!ws) return;
|
|
5125
|
+
this.ws = null;
|
|
5126
|
+
const sendSafe = (payload) => {
|
|
5127
|
+
if (ws.readyState === WebSocket5.OPEN) {
|
|
5128
|
+
try {
|
|
5129
|
+
ws.send(payload);
|
|
5130
|
+
} catch {
|
|
4698
5131
|
}
|
|
4699
5132
|
}
|
|
4700
|
-
}
|
|
4701
|
-
|
|
4702
|
-
})
|
|
4703
|
-
|
|
5133
|
+
};
|
|
5134
|
+
sendSafe(
|
|
5135
|
+
JSON.stringify({ message: "EndOfStream", last_seq_no: this.lastSeqNo })
|
|
5136
|
+
);
|
|
5137
|
+
try {
|
|
5138
|
+
ws.close();
|
|
5139
|
+
} catch {
|
|
5140
|
+
}
|
|
5141
|
+
}
|
|
5142
|
+
};
|
|
5143
|
+
|
|
5144
|
+
// src/stt/speechmatics.ts
|
|
5145
|
+
/**
 * Options-object front end for `SpeechmaticsSTT`.
 *
 * Resolves the API key from `opts.apiKey` or the `SPEECHMATICS_API_KEY`
 * environment variable, then hands the key and the untouched options to the
 * base class.
 */
var STT7 = class extends SpeechmaticsSTT {
  static providerKey = "speechmatics";
  /**
   * @param opts Optional settings; forwarded as-is to `SpeechmaticsSTT`.
   * @throws {Error} When no API key is supplied and none is in the environment.
   */
  constructor(opts = {}) {
    const apiKey = opts.apiKey ?? process.env.SPEECHMATICS_API_KEY;
    if (apiKey) {
      super(apiKey, opts);
      return;
    }
    throw new Error(
      "Speechmatics STT requires an apiKey. Pass { apiKey: 'sm_...' } or set SPEECHMATICS_API_KEY in the environment."
    );
  }
};
|
|
4707
5157
|
|
|
4708
5158
|
// src/tts/elevenlabs.ts
|
|
5159
|
+
init_esm_shims();
|
|
4709
5160
|
function resolveApiKey(apiKey) {
|
|
4710
5161
|
const key = apiKey ?? process.env.ELEVENLABS_API_KEY;
|
|
4711
5162
|
if (!key) {
|
|
@@ -4721,7 +5172,7 @@ var TTS = class _TTS extends ElevenLabsTTS {
|
|
|
4721
5172
|
super(resolveApiKey(opts.apiKey), {
|
|
4722
5173
|
voiceId: opts.voiceId ?? "EXAVITQu4vr4xnSDxMaL",
|
|
4723
5174
|
modelId: opts.modelId ?? "eleven_flash_v2_5",
|
|
4724
|
-
outputFormat: opts.outputFormat
|
|
5175
|
+
...opts.outputFormat !== void 0 ? { outputFormat: opts.outputFormat } : {},
|
|
4725
5176
|
languageCode: opts.languageCode,
|
|
4726
5177
|
voiceSettings: opts.voiceSettings
|
|
4727
5178
|
});
|
|
@@ -4792,6 +5243,20 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
|
|
|
4792
5243
|
* changes.
|
|
4793
5244
|
*/
|
|
4794
5245
|
adoptedConnection = null;
|
|
5246
|
+
/**
|
|
5247
|
+
* Active WS for the in-flight ``synthesizeStream`` call, if any. Set
|
|
5248
|
+
* when a stream starts, cleared in its ``finally`` block. The
|
|
5249
|
+
* stream-handler calls ``cancelActiveStream()`` from ``cancelSpeaking``
|
|
5250
|
+
* to unblock the generator's inner ``await Promise<frame>`` — without
|
|
5251
|
+
* it, a barge-in on the firstMessage live path leaves the for-await
|
|
5252
|
+
* stuck waiting for the next frame; ElevenLabs never sends
|
|
5253
|
+
* ``isFinal=true`` after the consumer breaks, the 30 s frame timeout
|
|
5254
|
+
* fires post-call, and meanwhile ``initPipeline`` never returns so
|
|
5255
|
+
* the STT ``onTranscript`` callback never registers and subsequent
|
|
5256
|
+
* user turns are silently dropped (root cause of the 2026-05-20
|
|
5257
|
+
* "first message OK, then no response" symptom).
|
|
5258
|
+
*/
|
|
5259
|
+
activeStreamWs = null;
|
|
4795
5260
|
/**
|
|
4796
5261
|
* The wire format requested over the ElevenLabs WS. Initially set from
|
|
4797
5262
|
* the constructor; ``setTelephonyCarrier`` may auto-flip it to the
|
|
@@ -4840,6 +5305,32 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
|
|
|
4840
5305
|
if (!native) return;
|
|
4841
5306
|
this._outputFormat = native;
|
|
4842
5307
|
}
|
|
5308
|
+
/**
|
|
5309
|
+
* Force-close the WebSocket of any in-flight ``synthesizeStream`` call.
|
|
5310
|
+
* Called by the stream-handler from ``cancelSpeaking`` (barge-in) so
|
|
5311
|
+
* the generator's inner ``await Promise<frame>`` loop unblocks cleanly
|
|
5312
|
+
* via the ``onClose`` handler — instead of waiting up to 30 s for the
|
|
5313
|
+
* ``FRAME_TIMEOUT_MS`` watchdog to fire. No-op when no stream is in
|
|
5314
|
+
* flight or when the WS is already closing.
|
|
5315
|
+
*
|
|
5316
|
+
* Without this, a barge-in during the firstMessage live path left the
|
|
5317
|
+
* for-await stuck (ElevenLabs never sends ``isFinal=true`` after the
|
|
5318
|
+
* consumer breaks), ``initPipeline`` never returned, the STT
|
|
5319
|
+
* ``onTranscript`` callback never registered, and the entire remainder
|
|
5320
|
+
* of the call was silent for the user. Surfaced during the 2026-05-20
|
|
5321
|
+
* acceptance run.
|
|
5322
|
+
*/
|
|
5323
|
+
cancelActiveStream() {
|
|
5324
|
+
const ws = this.activeStreamWs;
|
|
5325
|
+
if (!ws) return;
|
|
5326
|
+
this.activeStreamWs = null;
|
|
5327
|
+
try {
|
|
5328
|
+
if (ws.readyState === WebSocket6.OPEN || ws.readyState === WebSocket6.CONNECTING) {
|
|
5329
|
+
ws.close();
|
|
5330
|
+
}
|
|
5331
|
+
} catch {
|
|
5332
|
+
}
|
|
5333
|
+
}
|
|
4843
5334
|
/** Pre-configured for Twilio Media Streams (`ulaw_8000`). */
|
|
4844
5335
|
static forTwilio(opts) {
|
|
4845
5336
|
return new _ElevenLabsWebSocketTTS({
|
|
@@ -4925,6 +5416,7 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
|
|
|
4925
5416
|
headers: { "xi-api-key": this.apiKey }
|
|
4926
5417
|
});
|
|
4927
5418
|
}
|
|
5419
|
+
this.activeStreamWs = ws;
|
|
4928
5420
|
const queue = [];
|
|
4929
5421
|
let done = false;
|
|
4930
5422
|
let pendingError = null;
|
|
@@ -5045,6 +5537,7 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
|
|
|
5045
5537
|
}
|
|
5046
5538
|
} finally {
|
|
5047
5539
|
if (connectTimer) clearTimeout(connectTimer);
|
|
5540
|
+
if (this.activeStreamWs === ws) this.activeStreamWs = null;
|
|
5048
5541
|
try {
|
|
5049
5542
|
if (ws.readyState === WebSocket6.OPEN) {
|
|
5050
5543
|
ws.send(JSON.stringify({ text: "" }));
|
|
@@ -5217,9 +5710,9 @@ function buildOpts(opts) {
|
|
|
5217
5710
|
const out = {
|
|
5218
5711
|
apiKey: resolveApiKey2(opts.apiKey),
|
|
5219
5712
|
modelId: opts.modelId ?? "eleven_flash_v2_5",
|
|
5220
|
-
outputFormat: opts.outputFormat ?? "pcm_16000",
|
|
5221
5713
|
autoMode: opts.autoMode ?? true
|
|
5222
5714
|
};
|
|
5715
|
+
if (opts.outputFormat !== void 0) out.outputFormat = opts.outputFormat;
|
|
5223
5716
|
if (opts.voiceId !== void 0) out.voiceId = opts.voiceId;
|
|
5224
5717
|
if (opts.voiceSettings !== void 0) out.voiceSettings = opts.voiceSettings;
|
|
5225
5718
|
if (opts.languageCode !== void 0) out.languageCode = opts.languageCode;
|
|
@@ -5396,268 +5889,77 @@ var OpenAITTS = class _OpenAITTS {
|
|
|
5396
5889
|
if (lpf) {
|
|
5397
5890
|
y = lpfAlpha * x + (1 - lpfAlpha) * y;
|
|
5398
5891
|
let s = Math.round(y);
|
|
5399
|
-
if (s > 32767) s = 32767;
|
|
5400
|
-
else if (s < -32768) s = -32768;
|
|
5401
|
-
samples.push(s);
|
|
5402
|
-
} else {
|
|
5403
|
-
samples.push(x);
|
|
5404
|
-
}
|
|
5405
|
-
}
|
|
5406
|
-
if (lpf) ctx.lpfPrev = y;
|
|
5407
|
-
const out = [];
|
|
5408
|
-
let i = 0;
|
|
5409
|
-
if (direct8k) {
|
|
5410
|
-
while (i + 2 < samples.length) {
|
|
5411
|
-
out.push(samples[i]);
|
|
5412
|
-
i += 3;
|
|
5413
|
-
}
|
|
5414
|
-
} else {
|
|
5415
|
-
while (i + 2 < samples.length) {
|
|
5416
|
-
out.push(samples[i]);
|
|
5417
|
-
out.push(Math.round((samples[i + 1] + samples[i + 2]) / 2));
|
|
5418
|
-
i += 3;
|
|
5419
|
-
}
|
|
5420
|
-
}
|
|
5421
|
-
ctx.leftover = samples.slice(i);
|
|
5422
|
-
const buffer = Buffer.alloc(out.length * 2);
|
|
5423
|
-
for (let j = 0; j < out.length; j++) {
|
|
5424
|
-
buffer.writeInt16LE(out[j], j * 2);
|
|
5425
|
-
}
|
|
5426
|
-
return buffer;
|
|
5427
|
-
}
|
|
5428
|
-
/** @deprecated use {@link resampleStreaming} with persistent state. */
|
|
5429
|
-
static resample24kTo16k(audio) {
|
|
5430
|
-
const ctx = {
|
|
5431
|
-
carryByte: null,
|
|
5432
|
-
leftover: [],
|
|
5433
|
-
lpfPrev: 0,
|
|
5434
|
-
lpfEnabled: false,
|
|
5435
|
-
targetSampleRate: 16e3
|
|
5436
|
-
};
|
|
5437
|
-
const out = _OpenAITTS.resampleStreaming(audio, ctx);
|
|
5438
|
-
if (ctx.leftover.length === 0) return out;
|
|
5439
|
-
const tail = Buffer.alloc(ctx.leftover.length * 2);
|
|
5440
|
-
for (let i = 0; i < ctx.leftover.length; i++) {
|
|
5441
|
-
tail.writeInt16LE(ctx.leftover[i], i * 2);
|
|
5442
|
-
}
|
|
5443
|
-
return Buffer.concat([out, tail]);
|
|
5444
|
-
}
|
|
5445
|
-
};
|
|
5446
|
-
|
|
5447
|
-
// src/tts/openai.ts
|
|
5448
|
-
var TTS3 = class extends OpenAITTS {
|
|
5449
|
-
static providerKey = "openai_tts";
|
|
5450
|
-
constructor(opts = {}) {
|
|
5451
|
-
const key = opts.apiKey ?? process.env.OPENAI_API_KEY;
|
|
5452
|
-
if (!key) {
|
|
5453
|
-
throw new Error(
|
|
5454
|
-
"OpenAI TTS requires an apiKey. Pass { apiKey: 'sk-...' } or set OPENAI_API_KEY in the environment."
|
|
5455
|
-
);
|
|
5456
|
-
}
|
|
5457
|
-
super(
|
|
5458
|
-
key,
|
|
5459
|
-
opts.voice ?? "alloy",
|
|
5460
|
-
opts.model ?? "gpt-4o-mini-tts",
|
|
5461
|
-
opts.instructions ?? null,
|
|
5462
|
-
opts.speed ?? null,
|
|
5463
|
-
opts.antiAlias ?? false
|
|
5464
|
-
);
|
|
5465
|
-
}
|
|
5466
|
-
};
|
|
5467
|
-
|
|
5468
|
-
// src/tts/cartesia.ts
|
|
5469
|
-
init_esm_shims();
|
|
5470
|
-
|
|
5471
|
-
// src/providers/cartesia-tts.ts
|
|
5472
|
-
init_esm_shims();
|
|
5473
|
-
var CARTESIA_BASE_URL = "https://api.cartesia.ai";
|
|
5474
|
-
var CARTESIA_API_VERSION = "2025-04-16";
|
|
5475
|
-
var CARTESIA_DEFAULT_VOICE_ID = "f786b574-daa5-4673-aa0c-cbe3e8534c02";
|
|
5476
|
-
var CartesiaTTSModel = {
|
|
5477
|
-
SONIC_3: "sonic-3",
|
|
5478
|
-
SONIC_2: "sonic-2",
|
|
5479
|
-
SONIC: "sonic"
|
|
5480
|
-
};
|
|
5481
|
-
var CartesiaTTSContainer = {
|
|
5482
|
-
RAW: "raw",
|
|
5483
|
-
WAV: "wav",
|
|
5484
|
-
MP3: "mp3"
|
|
5485
|
-
};
|
|
5486
|
-
var CartesiaTTSEncoding = {
|
|
5487
|
-
PCM_S16LE: "pcm_s16le",
|
|
5488
|
-
PCM_F32LE: "pcm_f32le",
|
|
5489
|
-
PCM_MULAW: "pcm_mulaw",
|
|
5490
|
-
PCM_ALAW: "pcm_alaw"
|
|
5491
|
-
};
|
|
5492
|
-
var CartesiaTTSSampleRate = {
|
|
5493
|
-
HZ_8000: 8e3,
|
|
5494
|
-
HZ_16000: 16e3,
|
|
5495
|
-
HZ_22050: 22050,
|
|
5496
|
-
HZ_24000: 24e3,
|
|
5497
|
-
HZ_44100: 44100
|
|
5498
|
-
};
|
|
5499
|
-
var CartesiaTTSVoiceMode = {
|
|
5500
|
-
ID: "id",
|
|
5501
|
-
EMBEDDING: "embedding"
|
|
5502
|
-
};
|
|
5503
|
-
var CartesiaTTS = class _CartesiaTTS {
|
|
5504
|
-
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5505
|
-
static providerKey = "cartesia_tts";
|
|
5506
|
-
apiKey;
|
|
5507
|
-
model;
|
|
5508
|
-
voice;
|
|
5509
|
-
language;
|
|
5510
|
-
sampleRate;
|
|
5511
|
-
speed;
|
|
5512
|
-
emotion;
|
|
5513
|
-
volume;
|
|
5514
|
-
baseUrl;
|
|
5515
|
-
apiVersion;
|
|
5516
|
-
constructor(apiKey, opts = {}) {
|
|
5517
|
-
this.apiKey = apiKey;
|
|
5518
|
-
this.model = opts.model ?? CartesiaTTSModel.SONIC_3;
|
|
5519
|
-
this.voice = opts.voice ?? CARTESIA_DEFAULT_VOICE_ID;
|
|
5520
|
-
this.language = opts.language ?? "en";
|
|
5521
|
-
this.sampleRate = opts.sampleRate ?? CartesiaTTSSampleRate.HZ_16000;
|
|
5522
|
-
this.speed = opts.speed;
|
|
5523
|
-
this.emotion = typeof opts.emotion === "string" ? [opts.emotion] : opts.emotion;
|
|
5524
|
-
this.volume = opts.volume;
|
|
5525
|
-
this.baseUrl = opts.baseUrl ?? CARTESIA_BASE_URL;
|
|
5526
|
-
this.apiVersion = opts.apiVersion ?? CARTESIA_API_VERSION;
|
|
5527
|
-
}
|
|
5528
|
-
/**
|
|
5529
|
-
* Construct an instance pre-configured for Twilio Media Streams.
|
|
5530
|
-
*
|
|
5531
|
-
* Sets `sampleRate=8000` so Cartesia emits PCM_S16LE @ 8 kHz directly.
|
|
5532
|
-
* Twilio's media stream uses μ-law @ 8 kHz so the SDK still does the
|
|
5533
|
-
* PCM → μ-law transcode client-side, but the 16 kHz → 8 kHz resample
|
|
5534
|
-
* step is skipped. Saves ~10–30 ms first-byte plus per-frame CPU and
|
|
5535
|
-
* removes a potential aliasing source.
|
|
5536
|
-
*/
|
|
5537
|
-
static forTwilio(apiKey, options = {}) {
|
|
5538
|
-
return new _CartesiaTTS(apiKey, {
|
|
5539
|
-
...options,
|
|
5540
|
-
sampleRate: CartesiaTTSSampleRate.HZ_8000
|
|
5541
|
-
});
|
|
5542
|
-
}
|
|
5543
|
-
/**
|
|
5544
|
-
* Construct an instance pre-configured for Telnyx bidirectional media.
|
|
5545
|
-
*
|
|
5546
|
-
* Sets `sampleRate=16000` to match Telnyx's L16/16000 default codec —
|
|
5547
|
-
* audio flows end-to-end with zero resampling or transcoding. Same as
|
|
5548
|
-
* the bare-constructor default; exists for API symmetry with
|
|
5549
|
-
* {@link CartesiaTTS.forTwilio}.
|
|
5550
|
-
*/
|
|
5551
|
-
static forTelnyx(apiKey, options = {}) {
|
|
5552
|
-
return new _CartesiaTTS(apiKey, {
|
|
5553
|
-
...options,
|
|
5554
|
-
sampleRate: CartesiaTTSSampleRate.HZ_16000
|
|
5555
|
-
});
|
|
5556
|
-
}
|
|
5557
|
-
/** Build the JSON payload for the Cartesia bytes endpoint. */
|
|
5558
|
-
buildPayload(text) {
|
|
5559
|
-
const payload = {
|
|
5560
|
-
model_id: this.model,
|
|
5561
|
-
voice: { mode: CartesiaTTSVoiceMode.ID, id: this.voice },
|
|
5562
|
-
transcript: text,
|
|
5563
|
-
output_format: {
|
|
5564
|
-
container: CartesiaTTSContainer.RAW,
|
|
5565
|
-
encoding: CartesiaTTSEncoding.PCM_S16LE,
|
|
5566
|
-
sample_rate: this.sampleRate
|
|
5567
|
-
},
|
|
5568
|
-
language: this.language
|
|
5569
|
-
};
|
|
5570
|
-
const generationConfig = {};
|
|
5571
|
-
if (this.speed !== void 0) generationConfig.speed = this.speed;
|
|
5572
|
-
if (this.emotion && this.emotion.length > 0)
|
|
5573
|
-
generationConfig.emotion = this.emotion[0];
|
|
5574
|
-
if (this.volume !== void 0) generationConfig.volume = this.volume;
|
|
5575
|
-
if (Object.keys(generationConfig).length > 0) {
|
|
5576
|
-
payload.generation_config = generationConfig;
|
|
5577
|
-
}
|
|
5578
|
-
return payload;
|
|
5579
|
-
}
|
|
5580
|
-
/**
|
|
5581
|
-
* Pre-call HTTP warmup for the Cartesia `/tts/bytes` endpoint.
|
|
5582
|
-
*
|
|
5583
|
-
* Issues a lightweight `GET <baseUrl>/voices` so DNS, TLS, and HTTP/2
|
|
5584
|
-
* are already up by the time the first `synthesizeStream()` POST
|
|
5585
|
-
* lands. Best-effort: 5 s timeout, all exceptions swallowed at
|
|
5586
|
-
* debug level.
|
|
5587
|
-
*
|
|
5588
|
-
* Billing safety: `GET /voices` is a free metadata read on
|
|
5589
|
-
* Cartesia's REST surface (per https://docs.cartesia.ai). It does
|
|
5590
|
-
* not consume synthesis credits. The actual synthesis is billed
|
|
5591
|
-
* only when `POST /tts/bytes` runs with a non-empty `transcript`.
|
|
5592
|
-
*
|
|
5593
|
-
* Note: Cartesia TTS uses the HTTP path (vs the WebSocket variant
|
|
5594
|
-
* Cartesia also exposes) — connection warmup is therefore HTTP-GET
|
|
5595
|
-
* based, not WebSocket pre-handshake. The latency win is smaller
|
|
5596
|
-
* (~50-150 ms vs the ~200-500 ms of a WS prewarm) but still real.
|
|
5597
|
-
*/
|
|
5598
|
-
async warmup() {
|
|
5599
|
-
try {
|
|
5600
|
-
await fetch(`${this.baseUrl}/voices`, {
|
|
5601
|
-
method: "GET",
|
|
5602
|
-
headers: {
|
|
5603
|
-
"X-API-Key": this.apiKey,
|
|
5604
|
-
"Cartesia-Version": this.apiVersion
|
|
5605
|
-
},
|
|
5606
|
-
signal: AbortSignal.timeout(5e3)
|
|
5607
|
-
});
|
|
5608
|
-
} catch (err) {
|
|
5609
|
-
getLogger().debug(`Cartesia TTS warmup failed (best-effort): ${String(err)}`);
|
|
5892
|
+
if (s > 32767) s = 32767;
|
|
5893
|
+
else if (s < -32768) s = -32768;
|
|
5894
|
+
samples.push(s);
|
|
5895
|
+
} else {
|
|
5896
|
+
samples.push(x);
|
|
5897
|
+
}
|
|
5610
5898
|
}
|
|
5611
|
-
|
|
5612
|
-
|
|
5613
|
-
|
|
5614
|
-
|
|
5615
|
-
|
|
5616
|
-
|
|
5899
|
+
if (lpf) ctx.lpfPrev = y;
|
|
5900
|
+
const out = [];
|
|
5901
|
+
let i = 0;
|
|
5902
|
+
if (direct8k) {
|
|
5903
|
+
while (i + 2 < samples.length) {
|
|
5904
|
+
out.push(samples[i]);
|
|
5905
|
+
i += 3;
|
|
5906
|
+
}
|
|
5907
|
+
} else {
|
|
5908
|
+
while (i + 2 < samples.length) {
|
|
5909
|
+
out.push(samples[i]);
|
|
5910
|
+
out.push(Math.round((samples[i + 1] + samples[i + 2]) / 2));
|
|
5911
|
+
i += 3;
|
|
5912
|
+
}
|
|
5617
5913
|
}
|
|
5618
|
-
|
|
5619
|
-
|
|
5620
|
-
|
|
5621
|
-
|
|
5622
|
-
* `sampleRate` as they arrive from Cartesia.
|
|
5623
|
-
*/
|
|
5624
|
-
async *synthesizeStream(text) {
|
|
5625
|
-
const response = await fetch(`${this.baseUrl}/tts/bytes`, {
|
|
5626
|
-
method: "POST",
|
|
5627
|
-
headers: {
|
|
5628
|
-
"X-API-Key": this.apiKey,
|
|
5629
|
-
"Cartesia-Version": this.apiVersion,
|
|
5630
|
-
"Content-Type": "application/json"
|
|
5631
|
-
},
|
|
5632
|
-
body: JSON.stringify(this.buildPayload(text)),
|
|
5633
|
-
signal: AbortSignal.timeout(3e4)
|
|
5634
|
-
});
|
|
5635
|
-
if (!response.ok) {
|
|
5636
|
-
const body = await response.text();
|
|
5637
|
-
throw new Error(`Cartesia TTS error ${response.status}: ${body}`);
|
|
5914
|
+
ctx.leftover = samples.slice(i);
|
|
5915
|
+
const buffer = Buffer.alloc(out.length * 2);
|
|
5916
|
+
for (let j = 0; j < out.length; j++) {
|
|
5917
|
+
buffer.writeInt16LE(out[j], j * 2);
|
|
5638
5918
|
}
|
|
5639
|
-
|
|
5640
|
-
|
|
5919
|
+
return buffer;
|
|
5920
|
+
}
|
|
5921
|
+
/** @deprecated use {@link resampleStreaming} with persistent state. */
|
|
5922
|
+
static resample24kTo16k(audio) {
|
|
5923
|
+
const ctx = {
|
|
5924
|
+
carryByte: null,
|
|
5925
|
+
leftover: [],
|
|
5926
|
+
lpfPrev: 0,
|
|
5927
|
+
lpfEnabled: false,
|
|
5928
|
+
targetSampleRate: 16e3
|
|
5929
|
+
};
|
|
5930
|
+
const out = _OpenAITTS.resampleStreaming(audio, ctx);
|
|
5931
|
+
if (ctx.leftover.length === 0) return out;
|
|
5932
|
+
const tail = Buffer.alloc(ctx.leftover.length * 2);
|
|
5933
|
+
for (let i = 0; i < ctx.leftover.length; i++) {
|
|
5934
|
+
tail.writeInt16LE(ctx.leftover[i], i * 2);
|
|
5641
5935
|
}
|
|
5642
|
-
|
|
5643
|
-
|
|
5644
|
-
|
|
5645
|
-
|
|
5646
|
-
|
|
5647
|
-
|
|
5648
|
-
|
|
5649
|
-
|
|
5650
|
-
|
|
5651
|
-
|
|
5652
|
-
|
|
5653
|
-
|
|
5654
|
-
|
|
5655
|
-
reader.releaseLock();
|
|
5936
|
+
return Buffer.concat([out, tail]);
|
|
5937
|
+
}
|
|
5938
|
+
};
|
|
5939
|
+
|
|
5940
|
+
// src/tts/openai.ts
|
|
5941
|
+
var TTS3 = class extends OpenAITTS {
|
|
5942
|
+
static providerKey = "openai_tts";
|
|
5943
|
+
constructor(opts = {}) {
|
|
5944
|
+
const key = opts.apiKey ?? process.env.OPENAI_API_KEY;
|
|
5945
|
+
if (!key) {
|
|
5946
|
+
throw new Error(
|
|
5947
|
+
"OpenAI TTS requires an apiKey. Pass { apiKey: 'sk-...' } or set OPENAI_API_KEY in the environment."
|
|
5948
|
+
);
|
|
5656
5949
|
}
|
|
5950
|
+
super(
|
|
5951
|
+
key,
|
|
5952
|
+
opts.voice ?? "alloy",
|
|
5953
|
+
opts.model ?? "gpt-4o-mini-tts",
|
|
5954
|
+
opts.instructions ?? null,
|
|
5955
|
+
opts.speed ?? null,
|
|
5956
|
+
opts.antiAlias ?? false
|
|
5957
|
+
);
|
|
5657
5958
|
}
|
|
5658
5959
|
};
|
|
5659
5960
|
|
|
5660
5961
|
// src/tts/cartesia.ts
|
|
5962
|
+
init_esm_shims();
|
|
5661
5963
|
function resolveApiKey3(apiKey) {
|
|
5662
5964
|
const key = apiKey ?? process.env.CARTESIA_API_KEY;
|
|
5663
5965
|
if (!key) {
|
|
@@ -5687,150 +5989,6 @@ var TTS4 = class _TTS extends CartesiaTTS {
|
|
|
5687
5989
|
|
|
5688
5990
|
// src/tts/rime.ts
|
|
5689
5991
|
init_esm_shims();
|
|
5690
|
-
|
|
5691
|
-
// src/providers/rime-tts.ts
|
|
5692
|
-
init_esm_shims();
|
|
5693
|
-
var RIME_BASE_URL = "https://users.rime.ai/v1/rime-tts";
|
|
5694
|
-
var RimeModel = {
|
|
5695
|
-
ARCANA: "arcana",
|
|
5696
|
-
MIST: "mist",
|
|
5697
|
-
MIST_V2: "mistv2"
|
|
5698
|
-
};
|
|
5699
|
-
var RimeAudioFormat = {
|
|
5700
|
-
PCM: "audio/pcm",
|
|
5701
|
-
MP3: "audio/mp3",
|
|
5702
|
-
WAV: "audio/wav",
|
|
5703
|
-
MULAW: "audio/mulaw"
|
|
5704
|
-
};
|
|
5705
|
-
var ARCANA_MODEL_TIMEOUT_MS = 60 * 4 * 1e3;
|
|
5706
|
-
var MIST_MODEL_TIMEOUT_MS = 30 * 1e3;
|
|
5707
|
-
function isMistModel(model) {
|
|
5708
|
-
return model.includes(RimeModel.MIST);
|
|
5709
|
-
}
|
|
5710
|
-
function timeoutForModel(model) {
|
|
5711
|
-
if (model === RimeModel.ARCANA) return ARCANA_MODEL_TIMEOUT_MS;
|
|
5712
|
-
return MIST_MODEL_TIMEOUT_MS;
|
|
5713
|
-
}
|
|
5714
|
-
var RimeTTS = class {
|
|
5715
|
-
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5716
|
-
static providerKey = "rime";
|
|
5717
|
-
apiKey;
|
|
5718
|
-
model;
|
|
5719
|
-
speaker;
|
|
5720
|
-
lang;
|
|
5721
|
-
sampleRate;
|
|
5722
|
-
repetitionPenalty;
|
|
5723
|
-
temperature;
|
|
5724
|
-
topP;
|
|
5725
|
-
maxTokens;
|
|
5726
|
-
speedAlpha;
|
|
5727
|
-
reduceLatency;
|
|
5728
|
-
pauseBetweenBrackets;
|
|
5729
|
-
phonemizeBetweenBrackets;
|
|
5730
|
-
baseUrl;
|
|
5731
|
-
totalTimeoutMs;
|
|
5732
|
-
constructor(apiKey, opts = {}) {
|
|
5733
|
-
this.apiKey = apiKey;
|
|
5734
|
-
this.model = opts.model ?? RimeModel.ARCANA;
|
|
5735
|
-
const defaultSpeaker = isMistModel(this.model) ? "cove" : "astra";
|
|
5736
|
-
this.speaker = opts.speaker ?? defaultSpeaker;
|
|
5737
|
-
this.lang = opts.lang ?? "eng";
|
|
5738
|
-
this.sampleRate = opts.sampleRate ?? 16e3;
|
|
5739
|
-
this.repetitionPenalty = opts.repetitionPenalty;
|
|
5740
|
-
this.temperature = opts.temperature;
|
|
5741
|
-
this.topP = opts.topP;
|
|
5742
|
-
this.maxTokens = opts.maxTokens;
|
|
5743
|
-
this.speedAlpha = opts.speedAlpha;
|
|
5744
|
-
this.reduceLatency = opts.reduceLatency;
|
|
5745
|
-
this.pauseBetweenBrackets = opts.pauseBetweenBrackets;
|
|
5746
|
-
this.phonemizeBetweenBrackets = opts.phonemizeBetweenBrackets;
|
|
5747
|
-
this.baseUrl = opts.baseUrl ?? RIME_BASE_URL;
|
|
5748
|
-
this.totalTimeoutMs = timeoutForModel(this.model);
|
|
5749
|
-
}
|
|
5750
|
-
buildPayload(text) {
|
|
5751
|
-
const payload = {
|
|
5752
|
-
speaker: this.speaker,
|
|
5753
|
-
text,
|
|
5754
|
-
modelId: this.model
|
|
5755
|
-
};
|
|
5756
|
-
if (this.model === RimeModel.ARCANA) {
|
|
5757
|
-
if (this.repetitionPenalty !== void 0)
|
|
5758
|
-
payload.repetition_penalty = this.repetitionPenalty;
|
|
5759
|
-
if (this.temperature !== void 0) payload.temperature = this.temperature;
|
|
5760
|
-
if (this.topP !== void 0) payload.top_p = this.topP;
|
|
5761
|
-
if (this.maxTokens !== void 0) payload.max_tokens = this.maxTokens;
|
|
5762
|
-
payload.lang = this.lang;
|
|
5763
|
-
payload.samplingRate = this.sampleRate;
|
|
5764
|
-
} else if (isMistModel(this.model)) {
|
|
5765
|
-
payload.lang = this.lang;
|
|
5766
|
-
payload.samplingRate = this.sampleRate;
|
|
5767
|
-
if (this.speedAlpha !== void 0) payload.speedAlpha = this.speedAlpha;
|
|
5768
|
-
if (this.model === RimeModel.MIST_V2 && this.reduceLatency !== void 0) {
|
|
5769
|
-
payload.reduceLatency = this.reduceLatency;
|
|
5770
|
-
}
|
|
5771
|
-
if (this.pauseBetweenBrackets !== void 0) {
|
|
5772
|
-
payload.pauseBetweenBrackets = this.pauseBetweenBrackets;
|
|
5773
|
-
}
|
|
5774
|
-
if (this.phonemizeBetweenBrackets !== void 0) {
|
|
5775
|
-
payload.phonemizeBetweenBrackets = this.phonemizeBetweenBrackets;
|
|
5776
|
-
}
|
|
5777
|
-
}
|
|
5778
|
-
return payload;
|
|
5779
|
-
}
|
|
5780
|
-
/** Synthesize text and return the concatenated audio buffer. */
|
|
5781
|
-
async synthesize(text) {
|
|
5782
|
-
const chunks = [];
|
|
5783
|
-
for await (const chunk of this.synthesizeStream(text)) {
|
|
5784
|
-
chunks.push(chunk);
|
|
5785
|
-
}
|
|
5786
|
-
return Buffer.concat(chunks);
|
|
5787
|
-
}
|
|
5788
|
-
/**
|
|
5789
|
-
* Synthesize text and yield raw PCM_S16LE chunks at the configured
|
|
5790
|
-
* `sampleRate` as they stream in.
|
|
5791
|
-
*/
|
|
5792
|
-
async *synthesizeStream(text) {
|
|
5793
|
-
const response = await fetch(this.baseUrl, {
|
|
5794
|
-
method: "POST",
|
|
5795
|
-
headers: {
|
|
5796
|
-
accept: RimeAudioFormat.PCM,
|
|
5797
|
-
Authorization: `Bearer ${this.apiKey}`,
|
|
5798
|
-
"content-type": "application/json"
|
|
5799
|
-
},
|
|
5800
|
-
body: JSON.stringify(this.buildPayload(text)),
|
|
5801
|
-
signal: AbortSignal.timeout(this.totalTimeoutMs)
|
|
5802
|
-
});
|
|
5803
|
-
if (!response.ok) {
|
|
5804
|
-
const body = await response.text();
|
|
5805
|
-
throw new Error(`Rime TTS error ${response.status}: ${body}`);
|
|
5806
|
-
}
|
|
5807
|
-
const contentType = response.headers.get("content-type") ?? "";
|
|
5808
|
-
if (!contentType.startsWith("audio")) {
|
|
5809
|
-
const body = await response.text();
|
|
5810
|
-
throw new Error(`Rime returned non-audio response: ${body.slice(0, 500)}`);
|
|
5811
|
-
}
|
|
5812
|
-
if (!response.body) {
|
|
5813
|
-
throw new Error("Rime TTS: no response body");
|
|
5814
|
-
}
|
|
5815
|
-
const reader = response.body.getReader();
|
|
5816
|
-
try {
|
|
5817
|
-
while (true) {
|
|
5818
|
-
const { done, value } = await reader.read();
|
|
5819
|
-
if (done) break;
|
|
5820
|
-
if (value && value.length > 0) {
|
|
5821
|
-
yield Buffer.from(value);
|
|
5822
|
-
}
|
|
5823
|
-
}
|
|
5824
|
-
} finally {
|
|
5825
|
-
if (typeof reader.cancel === "function")
|
|
5826
|
-
await reader.cancel().catch(() => {
|
|
5827
|
-
});
|
|
5828
|
-
reader.releaseLock();
|
|
5829
|
-
}
|
|
5830
|
-
}
|
|
5831
|
-
};
|
|
5832
|
-
|
|
5833
|
-
// src/tts/rime.ts
|
|
5834
5992
|
var TTS5 = class extends RimeTTS {
|
|
5835
5993
|
static providerKey = "rime";
|
|
5836
5994
|
constructor(opts = {}) {
|
|
@@ -6469,12 +6627,6 @@ init_esm_shims();
|
|
|
6469
6627
|
|
|
6470
6628
|
// src/providers/groq-llm.ts
|
|
6471
6629
|
init_esm_shims();
|
|
6472
|
-
|
|
6473
|
-
// src/version.ts
|
|
6474
|
-
init_esm_shims();
|
|
6475
|
-
var VERSION = "0.5.5";
|
|
6476
|
-
|
|
6477
|
-
// src/providers/groq-llm.ts
|
|
6478
6630
|
var GROQ_BASE_URL = "https://api.groq.com/openai/v1";
|
|
6479
6631
|
var GroqModel = {
|
|
6480
6632
|
LLAMA_3_3_70B_VERSATILE: "llama-3.3-70b-versatile",
|
|
@@ -8131,12 +8283,28 @@ var TwilioAdapter = class _TwilioAdapter {
|
|
|
8131
8283
|
return { callSid: call.sid };
|
|
8132
8284
|
}
|
|
8133
8285
|
/**
|
|
8134
|
-
* Build a
|
|
8135
|
-
*
|
|
8286
|
+
* Build a ``<Response><Connect><Stream url="...">`` TwiML document.
|
|
8287
|
+
*
|
|
8288
|
+
* ``parameters`` is forwarded as ``<Parameter name="..." value="..."/>``
|
|
8289
|
+
* children of ``<Stream>``. Twilio Media Streams strips query-string params
|
|
8290
|
+
* from the ``<Stream url=...>`` before the WS handshake, so
|
|
8291
|
+
* ``<Parameter>`` tags are the supported way to pre-populate
|
|
8292
|
+
* ``start.customParameters`` on the WS ``start`` frame. Used by the
|
|
8293
|
+
* inbound path to carry caller / callee through to the bridge.
|
|
8294
|
+
*
|
|
8295
|
+
* Mirrors the Python adapter's ``generate_stream_twiml``.
|
|
8136
8296
|
*/
|
|
8137
|
-
static generateStreamTwiml(streamUrl) {
|
|
8138
|
-
const
|
|
8139
|
-
|
|
8297
|
+
static generateStreamTwiml(streamUrl, parameters) {
|
|
8298
|
+
const esc = (s) => s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&apos;");
|
|
8299
|
+
const escapedUrl = esc(streamUrl);
|
|
8300
|
+
let paramTags = "";
|
|
8301
|
+
if (parameters) {
|
|
8302
|
+
for (const [name, value] of Object.entries(parameters)) {
|
|
8303
|
+
if (value == null) continue;
|
|
8304
|
+
paramTags += `<Parameter name="${esc(name)}" value="${esc(String(value))}"/>`;
|
|
8305
|
+
}
|
|
8306
|
+
}
|
|
8307
|
+
return `<?xml version="1.0" encoding="UTF-8"?><Response><Connect><Stream url="${escapedUrl}">${paramTags}</Stream></Connect></Response>`;
|
|
8140
8308
|
}
|
|
8141
8309
|
/** Force-complete an in-progress call. */
|
|
8142
8310
|
async endCall(callSid) {
|
|
@@ -8529,6 +8697,8 @@ export {
|
|
|
8529
8697
|
CallMetricsAccumulator,
|
|
8530
8698
|
STT4 as CartesiaSTT,
|
|
8531
8699
|
TTS4 as CartesiaTTS,
|
|
8700
|
+
CartesiaTTSModel,
|
|
8701
|
+
CartesiaTTSVoiceMode,
|
|
8532
8702
|
LLM4 as CerebrasLLM,
|
|
8533
8703
|
ChatContext,
|
|
8534
8704
|
CloudflareTunnel,
|
|
@@ -8536,10 +8706,13 @@ export {
|
|
|
8536
8706
|
DEFAULT_PRICING,
|
|
8537
8707
|
DTMF_EVENTS,
|
|
8538
8708
|
DeepFilterNetFilter,
|
|
8709
|
+
DeepgramModel,
|
|
8539
8710
|
STT as DeepgramSTT,
|
|
8540
8711
|
DefaultToolExecutor,
|
|
8541
8712
|
ConvAI as ElevenLabsConvAI,
|
|
8542
8713
|
ElevenLabsConvAIAdapter,
|
|
8714
|
+
ElevenLabsModel,
|
|
8715
|
+
ElevenLabsOutputFormat,
|
|
8543
8716
|
ElevenLabsTTS as ElevenLabsRestTTS,
|
|
8544
8717
|
TTS as ElevenLabsTTS,
|
|
8545
8718
|
TTS2 as ElevenLabsWebSocketTTS,
|
|
@@ -8568,8 +8741,15 @@ export {
|
|
|
8568
8741
|
Realtime2 as OpenAIRealtime2,
|
|
8569
8742
|
OpenAIRealtime2Adapter,
|
|
8570
8743
|
OpenAIRealtimeAdapter,
|
|
8744
|
+
OpenAIRealtimeAudioFormat,
|
|
8745
|
+
OpenAIRealtimeModel,
|
|
8746
|
+
OpenAIRealtimeVADType,
|
|
8571
8747
|
TTS3 as OpenAITTS,
|
|
8572
8748
|
STT3 as OpenAITranscribeSTT,
|
|
8749
|
+
OpenAITranscriptionModel,
|
|
8750
|
+
OpenAIVoice,
|
|
8751
|
+
PRICING_LAST_UPDATED,
|
|
8752
|
+
PRICING_VERSION,
|
|
8573
8753
|
PartialStreamError,
|
|
8574
8754
|
Patter,
|
|
8575
8755
|
PatterConnectionError,
|
|
@@ -8577,9 +8757,12 @@ export {
|
|
|
8577
8757
|
PatterTool,
|
|
8578
8758
|
PcmCarry,
|
|
8579
8759
|
PipelineHookExecutor,
|
|
8760
|
+
PricingUnit,
|
|
8580
8761
|
ProvisionError,
|
|
8581
8762
|
RateLimitError,
|
|
8582
8763
|
RemoteMessageHandler,
|
|
8764
|
+
RimeAudioFormat,
|
|
8765
|
+
RimeModel,
|
|
8583
8766
|
TTS5 as RimeTTS,
|
|
8584
8767
|
SPAN_BARGEIN,
|
|
8585
8768
|
SPAN_CALL,
|