getpatter 0.6.1 → 0.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/{carrier-config-4ZKVYAWV.mjs → carrier-config-3WDQXP5J.mjs} +43 -1
- package/dist/chunk-CL2U3YET.mjs +1429 -0
- package/dist/{chunk-RV7APPYE.mjs → chunk-R2T4JABZ.mjs} +13 -0
- package/dist/{chunk-TEW3NAZJ.mjs → chunk-Z6W5XFWS.mjs} +1071 -1520
- package/dist/cli.js +48 -23
- package/dist/dashboard/ui.html +8 -8
- package/dist/index.d.mts +3912 -3428
- package/dist/index.d.ts +3912 -3428
- package/dist/index.js +2507 -1159
- package/dist/index.mjs +1175 -875
- package/dist/openai-realtime-2-CNFARP25.mjs +8 -0
- package/dist/{silero-vad-NSEXI4XS.mjs → silero-vad-LNDFGIY7.mjs} +1 -1
- package/dist/{test-mode-WEKKNBLD.mjs → test-mode-MDBQ4ECE.mjs} +2 -1
- package/package.json +1 -1
- package/src/dashboard/ui.html +8 -8
package/dist/index.mjs
CHANGED
|
@@ -4,8 +4,10 @@ import {
|
|
|
4
4
|
import {
|
|
5
5
|
AuthenticationError,
|
|
6
6
|
CallMetricsAccumulator,
|
|
7
|
+
Carrier,
|
|
7
8
|
DEFAULT_MIN_SENTENCE_LEN,
|
|
8
9
|
DEFAULT_PRICING,
|
|
10
|
+
DeepgramModel,
|
|
9
11
|
DeepgramSTT,
|
|
10
12
|
DefaultToolExecutor,
|
|
11
13
|
ElevenLabsConvAIAdapter,
|
|
@@ -15,12 +17,13 @@ import {
|
|
|
15
17
|
LLMLoop,
|
|
16
18
|
MetricsStore,
|
|
17
19
|
OpenAILLMProvider,
|
|
18
|
-
|
|
19
|
-
|
|
20
|
+
PRICING_LAST_UPDATED,
|
|
21
|
+
PRICING_VERSION,
|
|
20
22
|
PatterConnectionError,
|
|
21
23
|
PatterError,
|
|
22
|
-
PcmCarry,
|
|
23
24
|
PipelineHookExecutor,
|
|
25
|
+
PlivoAdapter,
|
|
26
|
+
PricingUnit,
|
|
24
27
|
ProvisionError,
|
|
25
28
|
RateLimitError,
|
|
26
29
|
RemoteMessageHandler,
|
|
@@ -32,18 +35,14 @@ import {
|
|
|
32
35
|
SPAN_TOOL,
|
|
33
36
|
SPAN_TTS,
|
|
34
37
|
SentenceChunker,
|
|
35
|
-
StatefulResampler,
|
|
36
38
|
TestSession,
|
|
39
|
+
VERSION,
|
|
37
40
|
calculateRealtimeCost,
|
|
38
41
|
calculateSttCost,
|
|
39
42
|
calculateTelephonyCost,
|
|
40
43
|
calculateTtsCost,
|
|
41
44
|
callsToCsv,
|
|
42
45
|
callsToJson,
|
|
43
|
-
createResampler16kTo8k,
|
|
44
|
-
createResampler24kTo16k,
|
|
45
|
-
createResampler24kTo8k,
|
|
46
|
-
createResampler8kTo16k,
|
|
47
46
|
initTracing,
|
|
48
47
|
isRemoteUrl,
|
|
49
48
|
isTracingEnabled,
|
|
@@ -53,14 +52,29 @@ import {
|
|
|
53
52
|
mergePricing,
|
|
54
53
|
mountApi,
|
|
55
54
|
mountDashboard,
|
|
55
|
+
resolveLogRoot,
|
|
56
|
+
startSpan
|
|
57
|
+
} from "./chunk-Z6W5XFWS.mjs";
|
|
58
|
+
import {
|
|
59
|
+
OpenAIRealtime2Adapter,
|
|
60
|
+
OpenAIRealtimeAdapter,
|
|
61
|
+
OpenAIRealtimeAudioFormat,
|
|
62
|
+
OpenAIRealtimeModel,
|
|
63
|
+
OpenAIRealtimeVADType,
|
|
64
|
+
OpenAITranscriptionModel,
|
|
65
|
+
OpenAIVoice,
|
|
66
|
+
PcmCarry,
|
|
67
|
+
StatefulResampler,
|
|
68
|
+
createResampler16kTo8k,
|
|
69
|
+
createResampler24kTo16k,
|
|
70
|
+
createResampler24kTo8k,
|
|
71
|
+
createResampler8kTo16k,
|
|
56
72
|
mulawToPcm16,
|
|
57
73
|
pcm16ToMulaw,
|
|
58
74
|
resample16kTo8k,
|
|
59
75
|
resample24kTo16k,
|
|
60
|
-
resample8kTo16k
|
|
61
|
-
|
|
62
|
-
startSpan
|
|
63
|
-
} from "./chunk-TEW3NAZJ.mjs";
|
|
76
|
+
resample8kTo16k
|
|
77
|
+
} from "./chunk-CL2U3YET.mjs";
|
|
64
78
|
import {
|
|
65
79
|
MinWordsStrategy,
|
|
66
80
|
evaluateStrategies,
|
|
@@ -75,7 +89,7 @@ import {
|
|
|
75
89
|
} from "./chunk-6GR5MHHQ.mjs";
|
|
76
90
|
import {
|
|
77
91
|
SileroVAD
|
|
78
|
-
} from "./chunk-RV7APPYE.mjs";
|
|
92
|
+
} from "./chunk-R2T4JABZ.mjs";
|
|
79
93
|
import {
|
|
80
94
|
__dirname,
|
|
81
95
|
__require,
|
|
@@ -105,7 +119,7 @@ var Realtime = class {
|
|
|
105
119
|
);
|
|
106
120
|
}
|
|
107
121
|
this.apiKey = key;
|
|
108
|
-
this.model = opts.model ?? "gpt-
|
|
122
|
+
this.model = opts.model ?? "gpt-realtime-mini";
|
|
109
123
|
this.voice = opts.voice ?? "alloy";
|
|
110
124
|
this.reasoningEffort = opts.reasoningEffort;
|
|
111
125
|
this.inputAudioTranscriptionModel = opts.inputAudioTranscriptionModel;
|
|
@@ -557,7 +571,9 @@ function resolvePersistRoot(persist) {
|
|
|
557
571
|
if (persist === false) return null;
|
|
558
572
|
if (persist === true) return resolveLogRoot("auto");
|
|
559
573
|
if (typeof persist === "string") return resolveLogRoot(persist);
|
|
560
|
-
|
|
574
|
+
const envRoot = resolveLogRoot();
|
|
575
|
+
if (envRoot !== null) return envRoot;
|
|
576
|
+
return resolveLogRoot("auto");
|
|
561
577
|
}
|
|
562
578
|
function closeParkedConnections(slot) {
|
|
563
579
|
if (slot.stt) {
|
|
@@ -573,6 +589,11 @@ function closeParkedConnections(slot) {
|
|
|
573
589
|
}
|
|
574
590
|
}
|
|
575
591
|
if (slot.openaiRealtime) {
|
|
592
|
+
const wsAny = slot.openaiRealtime;
|
|
593
|
+
if (wsAny._parkedKeepalive) {
|
|
594
|
+
clearInterval(wsAny._parkedKeepalive);
|
|
595
|
+
delete wsAny._parkedKeepalive;
|
|
596
|
+
}
|
|
576
597
|
try {
|
|
577
598
|
slot.openaiRealtime.close();
|
|
578
599
|
} catch {
|
|
@@ -780,7 +801,7 @@ var Patter = class {
|
|
|
780
801
|
}
|
|
781
802
|
if (!options.carrier) {
|
|
782
803
|
throw new Error(
|
|
783
|
-
"Local mode requires a `carrier` instance. Pass `carrier: new Twilio({...})` or `carrier: new
|
|
804
|
+
"Local mode requires a `carrier` instance. Pass `carrier: new Twilio({...})`, `carrier: new Telnyx({...})` or `carrier: new Plivo({...})`."
|
|
784
805
|
);
|
|
785
806
|
}
|
|
786
807
|
const carrier = options.carrier;
|
|
@@ -958,16 +979,18 @@ var Patter = class {
|
|
|
958
979
|
throw err;
|
|
959
980
|
}
|
|
960
981
|
const carrier = this.localConfig.carrier;
|
|
961
|
-
const telephonyProvider = carrier.kind
|
|
982
|
+
const telephonyProvider = carrier.kind;
|
|
962
983
|
const wantsCarrierManagement = opts.manageWebhook !== false || wantsCloudflared;
|
|
963
984
|
if (wantsCarrierManagement) {
|
|
964
|
-
const { autoConfigureCarrier } = await import("./carrier-config-4ZKVYAWV.mjs");
|
|
985
|
+
const { autoConfigureCarrier } = await import("./carrier-config-3WDQXP5J.mjs");
|
|
965
986
|
await autoConfigureCarrier({
|
|
966
987
|
telephonyProvider,
|
|
967
988
|
twilioSid: carrier.kind === "twilio" ? carrier.accountSid : void 0,
|
|
968
989
|
twilioToken: carrier.kind === "twilio" ? carrier.authToken : void 0,
|
|
969
990
|
telnyxKey: carrier.kind === "telnyx" ? carrier.apiKey : void 0,
|
|
970
991
|
telnyxConnectionId: carrier.kind === "telnyx" ? carrier.connectionId : void 0,
|
|
992
|
+
plivoAuthId: carrier.kind === "plivo" ? carrier.authId : void 0,
|
|
993
|
+
plivoAuthToken: carrier.kind === "plivo" ? carrier.authToken : void 0,
|
|
971
994
|
phoneNumber: this.localConfig.phoneNumber,
|
|
972
995
|
webhookHost: webhookUrl
|
|
973
996
|
});
|
|
@@ -983,6 +1006,8 @@ var Patter = class {
|
|
|
983
1006
|
telnyxKey: carrier.kind === "telnyx" ? carrier.apiKey : void 0,
|
|
984
1007
|
telnyxConnectionId: carrier.kind === "telnyx" ? carrier.connectionId : void 0,
|
|
985
1008
|
telnyxPublicKey: carrier.kind === "telnyx" ? carrier.publicKey : void 0,
|
|
1009
|
+
plivoAuthId: carrier.kind === "plivo" ? carrier.authId : void 0,
|
|
1010
|
+
plivoAuthToken: carrier.kind === "plivo" ? carrier.authToken : void 0,
|
|
986
1011
|
persistRoot: this.localConfig.persistRoot
|
|
987
1012
|
},
|
|
988
1013
|
opts.agent,
|
|
@@ -1014,7 +1039,7 @@ var Patter = class {
|
|
|
1014
1039
|
}
|
|
1015
1040
|
/** Run the agent in interactive terminal-test mode (no real telephony). */
|
|
1016
1041
|
async test(opts) {
|
|
1017
|
-
const { TestSession: TestSession2 } = await import("./test-mode-WEKKNBLD.mjs");
|
|
1042
|
+
const { TestSession: TestSession2 } = await import("./test-mode-MDBQ4ECE.mjs");
|
|
1018
1043
|
const session = new TestSession2();
|
|
1019
1044
|
await session.run({
|
|
1020
1045
|
agent: opts.agent,
|
|
@@ -1144,7 +1169,9 @@ var Patter = class {
|
|
|
1144
1169
|
const tts = agent.tts;
|
|
1145
1170
|
const sttOpen = typeof stt?.openParkedConnection === "function" ? stt.openParkedConnection.bind(stt) : null;
|
|
1146
1171
|
const ttsOpen = typeof tts?.openParkedConnection === "function" ? tts.openParkedConnection.bind(tts) : null;
|
|
1147
|
-
|
|
1172
|
+
const providerStr = agent.provider ?? "";
|
|
1173
|
+
const wantsRealtimePark = providerStr === "openai_realtime" || providerStr === "openai_realtime_2";
|
|
1174
|
+
if (!sttOpen && !ttsOpen && !wantsRealtimePark) return;
|
|
1148
1175
|
const slot = {};
|
|
1149
1176
|
this.prewarmedConnections.set(callId, slot);
|
|
1150
1177
|
const startedAt = Date.now();
|
|
@@ -1189,6 +1216,43 @@ var Patter = class {
|
|
|
1189
1216
|
}
|
|
1190
1217
|
})());
|
|
1191
1218
|
}
|
|
1219
|
+
if (wantsRealtimePark) {
|
|
1220
|
+
tasks.push((async () => {
|
|
1221
|
+
const { OpenAIRealtime2Adapter: OpenAIRealtime2Adapter2 } = await import("./openai-realtime-2-CNFARP25.mjs");
|
|
1222
|
+
const apiKey = process.env.OPENAI_API_KEY ?? "";
|
|
1223
|
+
if (!apiKey) {
|
|
1224
|
+
getLogger().debug(`Park OpenAI Realtime skipped for ${callId}: no OPENAI_API_KEY`);
|
|
1225
|
+
return;
|
|
1226
|
+
}
|
|
1227
|
+
try {
|
|
1228
|
+
const tmpAdapter = new OpenAIRealtime2Adapter2(
|
|
1229
|
+
apiKey,
|
|
1230
|
+
agent.model ?? "gpt-realtime-mini",
|
|
1231
|
+
agent.voice ?? "alloy",
|
|
1232
|
+
agent.systemPrompt ?? "",
|
|
1233
|
+
[],
|
|
1234
|
+
// audioFormat — the GA adapter always emits audio/pcm@24000
|
|
1235
|
+
// internally regardless of this value, but it's a required
|
|
1236
|
+
// positional param. Default to g711_ulaw (Twilio wire format).
|
|
1237
|
+
void 0
|
|
1238
|
+
);
|
|
1239
|
+
const ws = await tmpAdapter.openParkedConnection();
|
|
1240
|
+
if (this.prewarmedConnections.get(callId) !== slot) {
|
|
1241
|
+
try {
|
|
1242
|
+
ws.close();
|
|
1243
|
+
} catch {
|
|
1244
|
+
}
|
|
1245
|
+
return;
|
|
1246
|
+
}
|
|
1247
|
+
slot.openaiRealtime = ws;
|
|
1248
|
+
getLogger().info(
|
|
1249
|
+
`[PREWARM] callId=${callId} provider=openai_realtime ms=${Date.now() - startedAt}`
|
|
1250
|
+
);
|
|
1251
|
+
} catch (err) {
|
|
1252
|
+
getLogger().debug(`Park OpenAI Realtime failed for ${callId}: ${String(err)}`);
|
|
1253
|
+
}
|
|
1254
|
+
})());
|
|
1255
|
+
}
|
|
1192
1256
|
const task = (async () => {
|
|
1193
1257
|
await Promise.allSettled(tasks);
|
|
1194
1258
|
})();
|
|
@@ -1266,7 +1330,7 @@ var Patter = class {
|
|
|
1266
1330
|
* with a warn when the cap is reached (the call still proceeds —
|
|
1267
1331
|
* StreamHandler falls back to live TTS).
|
|
1268
1332
|
*/
|
|
1269
|
-
spawnPrewarmFirstMessage(agent, callId, ringTimeout) {
|
|
1333
|
+
spawnPrewarmFirstMessage(agent, callId, ringTimeout, carrier) {
|
|
1270
1334
|
if (!agent.prewarmFirstMessage) return;
|
|
1271
1335
|
const providerMode = agent.provider ?? "openai_realtime";
|
|
1272
1336
|
if (providerMode !== "pipeline") {
|
|
@@ -1279,6 +1343,18 @@ var Patter = class {
|
|
|
1279
1343
|
const tts = agent.tts;
|
|
1280
1344
|
if (!firstMessage || !tts) return;
|
|
1281
1345
|
if (typeof tts.synthesizeStream !== "function") return;
|
|
1346
|
+
if (carrier) {
|
|
1347
|
+
const carrierAware = tts;
|
|
1348
|
+
if (typeof carrierAware.setTelephonyCarrier === "function") {
|
|
1349
|
+
try {
|
|
1350
|
+
carrierAware.setTelephonyCarrier(carrier);
|
|
1351
|
+
} catch (err) {
|
|
1352
|
+
getLogger().debug(
|
|
1353
|
+
`Prewarm TTS setTelephonyCarrier failed for ${callId}: ${String(err)}`
|
|
1354
|
+
);
|
|
1355
|
+
}
|
|
1356
|
+
}
|
|
1357
|
+
}
|
|
1282
1358
|
const inFlight = this.prewarmAudio.size + this.prewarmTasks.size;
|
|
1283
1359
|
if (inFlight >= PREWARM_CACHE_MAX) {
|
|
1284
1360
|
getLogger().warn(
|
|
@@ -1343,7 +1419,15 @@ var Patter = class {
|
|
|
1343
1419
|
this.prewarmTtlTimers.set(callId, handle);
|
|
1344
1420
|
});
|
|
1345
1421
|
}
|
|
1346
|
-
/**
|
|
1422
|
+
/**
|
|
1423
|
+
* Place an outbound call via the configured carrier.
|
|
1424
|
+
*
|
|
1425
|
+
* With `wait: false` (default) this resolves to `void` the instant the
|
|
1426
|
+
* carrier accepts the dial (fire-and-forget). With `wait: true` it blocks
|
|
1427
|
+
* until the call reaches a terminal state and resolves to a
|
|
1428
|
+
* {@link CallResult} — see {@link LocalCallOptions.wait}. Mirrors Python's
|
|
1429
|
+
* `Patter.call(..., wait=False)`.
|
|
1430
|
+
*/
|
|
1347
1431
|
async call(options) {
|
|
1348
1432
|
if (!options.to) {
|
|
1349
1433
|
throw new Error("'to' phone number is required");
|
|
@@ -1351,7 +1435,13 @@ var Patter = class {
|
|
|
1351
1435
|
if (!options.to.startsWith("+")) {
|
|
1352
1436
|
throw new Error(`'to' must be in E.164 format (e.g., '+1234567890'). Got: '${options.to}'`);
|
|
1353
1437
|
}
|
|
1438
|
+
if (options.wait && !this.embeddedServer) {
|
|
1439
|
+
throw new PatterConnectionError(
|
|
1440
|
+
"call({ wait: true }) requires an active server to receive the carrier completion webhooks. Call `await phone.serve(...)` first, or use `await using phone = new Patter(...)` (and serve inside the block) which keeps the server up for the duration of the block."
|
|
1441
|
+
);
|
|
1442
|
+
}
|
|
1354
1443
|
const { phoneNumber, webhookUrl, carrier } = this.localConfig;
|
|
1444
|
+
let callId = "";
|
|
1355
1445
|
const effectiveRingTimeout = options.ringTimeout === void 0 ? 25 : options.ringTimeout;
|
|
1356
1446
|
const wantsAmd = options.machineDetection !== false || Boolean(options.voicemailMessage);
|
|
1357
1447
|
if (this.embeddedServer) {
|
|
@@ -1391,20 +1481,92 @@ var Patter = class {
|
|
|
1391
1481
|
telnyxCallId = body.data?.call_control_id;
|
|
1392
1482
|
} catch {
|
|
1393
1483
|
}
|
|
1394
|
-
if (
|
|
1395
|
-
|
|
1484
|
+
if (telnyxCallId) {
|
|
1485
|
+
const initiatedPayload = {
|
|
1396
1486
|
call_id: telnyxCallId,
|
|
1397
1487
|
caller: phoneNumber,
|
|
1398
1488
|
callee: options.to,
|
|
1399
|
-
direction: "outbound"
|
|
1400
|
-
|
|
1489
|
+
direction: "outbound",
|
|
1490
|
+
status: "initiated"
|
|
1491
|
+
};
|
|
1492
|
+
if (this.embeddedServer) {
|
|
1493
|
+
this.embeddedServer.metricsStore.recordCallInitiated(initiatedPayload);
|
|
1494
|
+
}
|
|
1495
|
+
try {
|
|
1496
|
+
const { notifyDashboard: notifyDashboard2 } = await import("./persistence-LVIAHESK.mjs");
|
|
1497
|
+
notifyDashboard2(initiatedPayload);
|
|
1498
|
+
} catch {
|
|
1499
|
+
}
|
|
1401
1500
|
}
|
|
1402
1501
|
if (telnyxCallId) {
|
|
1403
|
-
|
|
1502
|
+
callId = telnyxCallId;
|
|
1503
|
+
this.spawnPrewarmFirstMessage(options.agent, telnyxCallId, effectiveRingTimeout, "telnyx");
|
|
1404
1504
|
if (options.agent.prewarm !== false) {
|
|
1405
1505
|
this.parkProviderConnections(options.agent, telnyxCallId);
|
|
1406
1506
|
}
|
|
1407
1507
|
}
|
|
1508
|
+
return this.maybeAwaitCompletion(options, callId, effectiveRingTimeout);
|
|
1509
|
+
}
|
|
1510
|
+
if (carrier.kind === "plivo") {
|
|
1511
|
+
const auth = `Basic ${Buffer.from(`${carrier.authId}:${carrier.authToken}`).toString("base64")}`;
|
|
1512
|
+
const plivoPayload = {
|
|
1513
|
+
from: phoneNumber,
|
|
1514
|
+
to: options.to,
|
|
1515
|
+
answer_url: `https://${webhookUrl}/webhooks/plivo/voice`,
|
|
1516
|
+
answer_method: "POST",
|
|
1517
|
+
// hangup_url is Plivo's StatusCallback analogue — without it the
|
|
1518
|
+
// /webhooks/plivo/status route never fires for outbound calls and
|
|
1519
|
+
// the dashboard misses no-answer / busy / failed.
|
|
1520
|
+
hangup_url: `https://${webhookUrl}/webhooks/plivo/status`,
|
|
1521
|
+
hangup_method: "POST"
|
|
1522
|
+
};
|
|
1523
|
+
if (effectiveRingTimeout !== null && effectiveRingTimeout !== void 0) {
|
|
1524
|
+
plivoPayload.ring_timeout = Math.max(1, Math.floor(effectiveRingTimeout));
|
|
1525
|
+
}
|
|
1526
|
+
if (wantsAmd) {
|
|
1527
|
+
plivoPayload.machine_detection = "true";
|
|
1528
|
+
plivoPayload.machine_detection_time = 5e3;
|
|
1529
|
+
plivoPayload.machine_detection_url = `https://${webhookUrl}/webhooks/plivo/amd`;
|
|
1530
|
+
plivoPayload.machine_detection_method = "POST";
|
|
1531
|
+
}
|
|
1532
|
+
if (options.voicemailMessage && this.embeddedServer) {
|
|
1533
|
+
this.embeddedServer.voicemailMessage = options.voicemailMessage;
|
|
1534
|
+
}
|
|
1535
|
+
const response2 = await fetch(`https://api.plivo.com/v1/Account/${carrier.authId}/Call/`, {
|
|
1536
|
+
method: "POST",
|
|
1537
|
+
headers: { "Content-Type": "application/json", Authorization: auth },
|
|
1538
|
+
body: JSON.stringify(plivoPayload)
|
|
1539
|
+
});
|
|
1540
|
+
if (!response2.ok) {
|
|
1541
|
+
throw new ProvisionError(`Failed to initiate Plivo call: ${await response2.text()}`);
|
|
1542
|
+
}
|
|
1543
|
+
let plivoCallId;
|
|
1544
|
+
try {
|
|
1545
|
+
const body = await response2.clone().json();
|
|
1546
|
+
plivoCallId = body.request_uuid;
|
|
1547
|
+
} catch {
|
|
1548
|
+
}
|
|
1549
|
+
if (plivoCallId) {
|
|
1550
|
+
const initiatedPayload = {
|
|
1551
|
+
call_id: plivoCallId,
|
|
1552
|
+
caller: phoneNumber,
|
|
1553
|
+
callee: options.to,
|
|
1554
|
+
direction: "outbound",
|
|
1555
|
+
status: "initiated"
|
|
1556
|
+
};
|
|
1557
|
+
if (this.embeddedServer) {
|
|
1558
|
+
this.embeddedServer.metricsStore.recordCallInitiated(initiatedPayload);
|
|
1559
|
+
}
|
|
1560
|
+
try {
|
|
1561
|
+
const { notifyDashboard: notifyDashboard2 } = await import("./persistence-LVIAHESK.mjs");
|
|
1562
|
+
notifyDashboard2(initiatedPayload);
|
|
1563
|
+
} catch {
|
|
1564
|
+
}
|
|
1565
|
+
this.spawnPrewarmFirstMessage(options.agent, plivoCallId, effectiveRingTimeout, "plivo");
|
|
1566
|
+
if (options.agent.prewarm !== false) {
|
|
1567
|
+
this.parkProviderConnections(options.agent, plivoCallId);
|
|
1568
|
+
}
|
|
1569
|
+
}
|
|
1408
1570
|
return;
|
|
1409
1571
|
}
|
|
1410
1572
|
const twilioSid = carrier.accountSid;
|
|
@@ -1453,25 +1615,76 @@ var Patter = class {
|
|
|
1453
1615
|
twilioNotificationsPath = body.subresource_uris?.notifications;
|
|
1454
1616
|
} catch {
|
|
1455
1617
|
}
|
|
1456
|
-
if (
|
|
1457
|
-
|
|
1618
|
+
if (twilioCallSid) {
|
|
1619
|
+
const initiatedPayload = {
|
|
1458
1620
|
call_id: twilioCallSid,
|
|
1459
1621
|
caller: phoneNumber,
|
|
1460
1622
|
callee: options.to,
|
|
1461
|
-
direction: "outbound"
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
)
|
|
1623
|
+
direction: "outbound",
|
|
1624
|
+
status: "initiated"
|
|
1625
|
+
};
|
|
1626
|
+
if (this.embeddedServer) {
|
|
1627
|
+
this.embeddedServer.metricsStore.recordCallInitiated(initiatedPayload);
|
|
1628
|
+
if (twilioNotificationsPath) {
|
|
1629
|
+
getLogger().info(
|
|
1630
|
+
`Outbound call ${twilioCallSid} placed. Twilio notifications: https://api.twilio.com${twilioNotificationsPath} (check here if the call drops with no audio).`
|
|
1631
|
+
);
|
|
1632
|
+
}
|
|
1633
|
+
}
|
|
1634
|
+
try {
|
|
1635
|
+
const { notifyDashboard: notifyDashboard2 } = await import("./persistence-LVIAHESK.mjs");
|
|
1636
|
+
notifyDashboard2(initiatedPayload);
|
|
1637
|
+
} catch {
|
|
1467
1638
|
}
|
|
1468
1639
|
}
|
|
1469
1640
|
if (twilioCallSid) {
|
|
1470
|
-
|
|
1641
|
+
callId = twilioCallSid;
|
|
1642
|
+
this.spawnPrewarmFirstMessage(options.agent, twilioCallSid, effectiveRingTimeout, "twilio");
|
|
1471
1643
|
if (options.agent.prewarm !== false) {
|
|
1472
1644
|
this.parkProviderConnections(options.agent, twilioCallSid);
|
|
1473
1645
|
}
|
|
1474
1646
|
}
|
|
1647
|
+
return this.maybeAwaitCompletion(options, callId, effectiveRingTimeout);
|
|
1648
|
+
}
|
|
1649
|
+
/**
|
|
1650
|
+
* When `options.wait` is set, register a completion promise keyed by the
|
|
1651
|
+
* carrier-issued `callId` and await it (bounded by a backstop timeout).
|
|
1652
|
+
* Otherwise resolve to `void` immediately (fire-and-forget).
|
|
1653
|
+
*
|
|
1654
|
+
* The registration happens here — after the carrier accepted the dial and
|
|
1655
|
+
* issued the id — so the future correlates to the right call. The race
|
|
1656
|
+
* window between `initiateCall` returning and this registration is
|
|
1657
|
+
* harmless: the callee is still ringing, so no terminal signal can fire
|
|
1658
|
+
* before we register. Mirrors the Python `call(wait=True)` tail block.
|
|
1659
|
+
*/
|
|
1660
|
+
async maybeAwaitCompletion(options, callId, ringTimeout) {
|
|
1661
|
+
if (!options.wait) return;
|
|
1662
|
+
const server = this.embeddedServer;
|
|
1663
|
+
if (!server || !callId) {
|
|
1664
|
+
throw new PatterConnectionError(
|
|
1665
|
+
"call({ wait: true }): no active server or carrier call id."
|
|
1666
|
+
);
|
|
1667
|
+
}
|
|
1668
|
+
const completion = server.registerCompletion(callId);
|
|
1669
|
+
const backstopMs = ((ringTimeout ?? 25) + 1800) * 1e3;
|
|
1670
|
+
let timer;
|
|
1671
|
+
const backstop = new Promise((_resolve, reject) => {
|
|
1672
|
+
timer = setTimeout(() => {
|
|
1673
|
+
server.deleteCompletion(callId);
|
|
1674
|
+
reject(
|
|
1675
|
+
new PatterConnectionError(
|
|
1676
|
+
`call({ wait: true }): no terminal signal for call ${callId} within ${(backstopMs / 1e3).toFixed(0)}s`,
|
|
1677
|
+
{ code: ErrorCode.TIMEOUT }
|
|
1678
|
+
)
|
|
1679
|
+
);
|
|
1680
|
+
}, backstopMs);
|
|
1681
|
+
timer.unref?.();
|
|
1682
|
+
});
|
|
1683
|
+
try {
|
|
1684
|
+
return await Promise.race([completion, backstop]);
|
|
1685
|
+
} finally {
|
|
1686
|
+
if (timer) clearTimeout(timer);
|
|
1687
|
+
}
|
|
1475
1688
|
}
|
|
1476
1689
|
/**
|
|
1477
1690
|
* Stop the embedded server and any running tunnel. Safe to call multiple
|
|
@@ -1512,6 +1725,11 @@ var Patter = class {
|
|
|
1512
1725
|
this.tunnelHandle = null;
|
|
1513
1726
|
}
|
|
1514
1727
|
if (this.embeddedServer) {
|
|
1728
|
+
this.embeddedServer.failPendingCompletions(
|
|
1729
|
+
new PatterConnectionError(
|
|
1730
|
+
"Patter.disconnect() called while a call({ wait: true }) was still in flight."
|
|
1731
|
+
)
|
|
1732
|
+
);
|
|
1515
1733
|
await this.embeddedServer.stop();
|
|
1516
1734
|
this.embeddedServer = null;
|
|
1517
1735
|
}
|
|
@@ -1535,6 +1753,30 @@ var Patter = class {
|
|
|
1535
1753
|
this._ready.catch(() => {
|
|
1536
1754
|
});
|
|
1537
1755
|
}
|
|
1756
|
+
/**
|
|
1757
|
+
* Explicit-resource-management disposer so callers can write
|
|
1758
|
+
* ``await using phone = new Patter(...)`` and have {@link disconnect} run
|
|
1759
|
+
* automatically when the block exits — on the normal path AND when the
|
|
1760
|
+
* body throws. This guarantees the embedded server, any auto-started
|
|
1761
|
+
* tunnel, and in-flight prewarm/TTS work are torn down so a still-running
|
|
1762
|
+
* TTS WebSocket cannot keep the user billed after the block ends, and any
|
|
1763
|
+
* in-flight ``call({ wait: true })`` awaiter is failed rather than left
|
|
1764
|
+
* hanging. ``disconnect()`` is idempotent, so an explicit ``disconnect()``
|
|
1765
|
+
* inside the block is still safe. Mirrors Python's ``async with Patter(...)``.
|
|
1766
|
+
*
|
|
1767
|
+
* Note: this does NOT start the server (``serve()`` blocks until shutdown,
|
|
1768
|
+
* so it cannot run from a disposer) — call ``serve(...)`` inside the block:
|
|
1769
|
+
*
|
|
1770
|
+
* ```ts
|
|
1771
|
+
* await using phone = new Patter({ carrier: new Twilio(), phoneNumber: "+1555..." });
|
|
1772
|
+
* await phone.serve({ agent }); // inbound, or
|
|
1773
|
+
* const result = await phone.call({ to: "+1555...", agent, wait: true });
|
|
1774
|
+
* // disconnect() has run here — nothing left running.
|
|
1775
|
+
* ```
|
|
1776
|
+
*/
|
|
1777
|
+
async [Symbol.asyncDispose]() {
|
|
1778
|
+
await this.disconnect();
|
|
1779
|
+
}
|
|
1538
1780
|
/**
|
|
1539
1781
|
* Terminate an active call on the configured carrier.
|
|
1540
1782
|
*
|
|
@@ -1589,6 +1831,17 @@ var Patter = class {
|
|
|
1589
1831
|
}
|
|
1590
1832
|
return;
|
|
1591
1833
|
}
|
|
1834
|
+
if (carrier.kind === "plivo") {
|
|
1835
|
+
const auth = Buffer.from(`${carrier.authId}:${carrier.authToken}`).toString("base64");
|
|
1836
|
+
const res = await fetch(
|
|
1837
|
+
`https://api.plivo.com/v1/Account/${carrier.authId}/Call/${encodeURIComponent(callSid)}/`,
|
|
1838
|
+
{ method: "DELETE", headers: { Authorization: `Basic ${auth}` } }
|
|
1839
|
+
);
|
|
1840
|
+
if (!res.ok && res.status !== 404) {
|
|
1841
|
+
throw new Error(`Plivo hangup failed: ${res.status} ${await res.text()}`);
|
|
1842
|
+
}
|
|
1843
|
+
return;
|
|
1844
|
+
}
|
|
1592
1845
|
throw new Error(`endCall() requires a configured carrier; got kind=${carrier.kind}`);
|
|
1593
1846
|
}
|
|
1594
1847
|
};
|
|
@@ -1986,7 +2239,6 @@ init_esm_shims();
|
|
|
1986
2239
|
|
|
1987
2240
|
// src/integrations/patter-tool.ts
|
|
1988
2241
|
init_esm_shims();
|
|
1989
|
-
import { EventEmitter } from "events";
|
|
1990
2242
|
var PARAMETERS_SCHEMA = {
|
|
1991
2243
|
type: "object",
|
|
1992
2244
|
properties: {
|
|
@@ -2013,7 +2265,7 @@ var PARAMETERS_SCHEMA = {
|
|
|
2013
2265
|
};
|
|
2014
2266
|
var DEFAULT_NAME = "make_phone_call";
|
|
2015
2267
|
var DEFAULT_DESCRIPTION = "Place a real outbound phone call. Returns a JSON object with the full transcript, call status, duration in seconds, and cost. Use this when the user asks you to call someone, schedule appointments by phone, or otherwise reach a human via voice.";
|
|
2016
|
-
var PatterTool = class
|
|
2268
|
+
var PatterTool = class {
|
|
2017
2269
|
name;
|
|
2018
2270
|
description;
|
|
2019
2271
|
phone;
|
|
@@ -2021,24 +2273,6 @@ var PatterTool = class _PatterTool {
|
|
|
2021
2273
|
maxDurationSec;
|
|
2022
2274
|
recording;
|
|
2023
2275
|
started = false;
|
|
2024
|
-
/** Resolver for the next `call_initiated` SSE event. Only set inside the
|
|
2025
|
-
* dial mutex (`dialQueue`), so two parallel `execute()` calls never share
|
|
2026
|
-
* it and never lose a dispatch. */
|
|
2027
|
-
pendingDial = null;
|
|
2028
|
-
/** Mutex that serializes the dial → call_id capture critical section.
|
|
2029
|
-
* Each `execute()` chains a continuation onto this promise so the
|
|
2030
|
-
* `pendingDial` slot is owned by exactly one caller at a time. */
|
|
2031
|
-
dialQueue = Promise.resolve();
|
|
2032
|
-
/** Captured SSE listener so `stop()` can detach it (prevents leaks when
|
|
2033
|
-
* the underlying Patter instance outlives this tool). */
|
|
2034
|
-
sseListener = null;
|
|
2035
|
-
/** Captured Patter metrics store, for cleanup in `stop()`. */
|
|
2036
|
-
metricsStoreRef = null;
|
|
2037
|
-
/** call_id → pending promise machinery. */
|
|
2038
|
-
pending = /* @__PURE__ */ new Map();
|
|
2039
|
-
bus = new EventEmitter();
|
|
2040
|
-
/** How long to wait for the `call_initiated` SSE before failing the dial. */
|
|
2041
|
-
static DIAL_CAPTURE_TIMEOUT_MS = 1e4;
|
|
2042
2276
|
constructor(opts) {
|
|
2043
2277
|
if (!opts.phone) {
|
|
2044
2278
|
throw new Error("PatterTool: `phone` (a Patter instance) is required.");
|
|
@@ -2082,7 +2316,15 @@ var PatterTool = class _PatterTool {
|
|
|
2082
2316
|
};
|
|
2083
2317
|
}
|
|
2084
2318
|
// --- Lifecycle ----------------------------------------------------------
|
|
2085
|
-
/**
|
|
2319
|
+
/**
|
|
2320
|
+
* Start the underlying Patter server. Idempotent.
|
|
2321
|
+
*
|
|
2322
|
+
* `execute()` relies on `Patter.call({ wait: true })`, which requires an
|
|
2323
|
+
* active server to receive the carrier completion webhooks — that's what
|
|
2324
|
+
* `serve()` provides here. No `onCallEnd` callback is wired: the SDK's own
|
|
2325
|
+
* per-callId completion registry resolves the result, so the user's
|
|
2326
|
+
* `onCallEnd` slot is left free.
|
|
2327
|
+
*/
|
|
2086
2328
|
async start() {
|
|
2087
2329
|
if (this.started) return;
|
|
2088
2330
|
if (!this.agent) {
|
|
@@ -2094,52 +2336,31 @@ var PatterTool = class _PatterTool {
|
|
|
2094
2336
|
await this.phone.serve({
|
|
2095
2337
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
2096
2338
|
agent: builtAgent,
|
|
2097
|
-
recording: this.recording
|
|
2098
|
-
onCallEnd: this.onCallEndHandler.bind(this)
|
|
2339
|
+
recording: this.recording
|
|
2099
2340
|
});
|
|
2100
|
-
const store = this.phone.metricsStore;
|
|
2101
|
-
if (!store) {
|
|
2102
|
-
throw new Error(
|
|
2103
|
-
"PatterTool.start: phone.metricsStore is null after serve() \u2014 is the dashboard disabled?"
|
|
2104
|
-
);
|
|
2105
|
-
}
|
|
2106
|
-
const listener = (event) => {
|
|
2107
|
-
if (event.type === "call_initiated" && this.pendingDial) {
|
|
2108
|
-
const callId = event.data.call_id || "";
|
|
2109
|
-
if (callId) {
|
|
2110
|
-
const dispatch = this.pendingDial;
|
|
2111
|
-
this.pendingDial = null;
|
|
2112
|
-
dispatch(callId);
|
|
2113
|
-
}
|
|
2114
|
-
}
|
|
2115
|
-
};
|
|
2116
|
-
store.on("sse", listener);
|
|
2117
|
-
this.sseListener = listener;
|
|
2118
|
-
this.metricsStoreRef = store;
|
|
2119
2341
|
this.started = true;
|
|
2120
2342
|
}
|
|
2121
|
-
/**
|
|
2343
|
+
/** Best-effort shutdown — tear the Patter server down via `disconnect()`. */
|
|
2122
2344
|
async stop() {
|
|
2123
2345
|
if (!this.started) return;
|
|
2124
|
-
|
|
2125
|
-
|
|
2126
|
-
|
|
2127
|
-
|
|
2128
|
-
|
|
2129
|
-
|
|
2130
|
-
for (const [, p] of this.pending) {
|
|
2131
|
-
clearTimeout(p.timer);
|
|
2132
|
-
p.reject(new Error("PatterTool: shutdown while call pending"));
|
|
2133
|
-
}
|
|
2134
|
-
this.pending.clear();
|
|
2135
|
-
const stoppable = this.phone;
|
|
2136
|
-
if (typeof stoppable.stop === "function") {
|
|
2137
|
-
await stoppable.stop();
|
|
2346
|
+
const disconnectable = this.phone;
|
|
2347
|
+
if (typeof disconnectable.disconnect === "function") {
|
|
2348
|
+
try {
|
|
2349
|
+
await disconnectable.disconnect();
|
|
2350
|
+
} catch {
|
|
2351
|
+
}
|
|
2138
2352
|
}
|
|
2139
2353
|
this.started = false;
|
|
2140
2354
|
}
|
|
2141
2355
|
// --- Execution ----------------------------------------------------------
|
|
2142
|
-
/**
|
|
2356
|
+
/**
|
|
2357
|
+
* Dial outbound, wait for the call to end, return a structured result.
|
|
2358
|
+
*
|
|
2359
|
+
* Thin wrapper over `Patter.call({ wait: true })`: the SDK now owns the
|
|
2360
|
+
* dial → callId → terminal-signal correlation, so this just bounds the wait
|
|
2361
|
+
* with `max_duration_sec` and maps the {@link CallResult} into the tool's
|
|
2362
|
+
* public envelope. Mirrors Python's `PatterTool.execute`.
|
|
2363
|
+
*/
|
|
2143
2364
|
async execute(args) {
|
|
2144
2365
|
if (!this.started) await this.start();
|
|
2145
2366
|
if (!args || typeof args.to !== "string" || !args.to.startsWith("+")) {
|
|
@@ -2155,55 +2376,32 @@ var PatterTool = class _PatterTool {
|
|
|
2155
2376
|
...args.goal !== void 0 ? { systemPrompt: args.goal } : {},
|
|
2156
2377
|
...args.first_message !== void 0 ? { firstMessage: args.first_message } : {}
|
|
2157
2378
|
});
|
|
2158
|
-
|
|
2159
|
-
|
|
2160
|
-
|
|
2161
|
-
|
|
2162
|
-
|
|
2379
|
+
let timer;
|
|
2380
|
+
const timeout = new Promise((_resolve, reject) => {
|
|
2381
|
+
timer = setTimeout(() => {
|
|
2382
|
+
reject(
|
|
2383
|
+
new Error(
|
|
2384
|
+
`PatterTool.execute: call to ${args.to} exceeded ${timeoutSec}s timeout`
|
|
2385
|
+
)
|
|
2386
|
+
);
|
|
2163
2387
|
}, timeoutSec * 1e3);
|
|
2164
|
-
|
|
2165
|
-
resolve,
|
|
2166
|
-
reject,
|
|
2167
|
-
timer,
|
|
2168
|
-
startedAt: Date.now() / 1e3
|
|
2169
|
-
});
|
|
2170
|
-
});
|
|
2171
|
-
}
|
|
2172
|
-
/** Issue the outbound dial under the mutex and return its assigned call_id. */
|
|
2173
|
-
async acquireCallId(to, agent) {
|
|
2174
|
-
let release;
|
|
2175
|
-
const slot = new Promise((r) => {
|
|
2176
|
-
release = r;
|
|
2388
|
+
timer.unref?.();
|
|
2177
2389
|
});
|
|
2178
|
-
|
|
2179
|
-
this.dialQueue = previous.then(() => slot);
|
|
2180
|
-
await previous;
|
|
2181
|
-
let captureTimer = null;
|
|
2390
|
+
let result;
|
|
2182
2391
|
try {
|
|
2183
|
-
|
|
2184
|
-
this.
|
|
2185
|
-
|
|
2186
|
-
|
|
2187
|
-
|
|
2188
|
-
|
|
2189
|
-
|
|
2190
|
-
|
|
2191
|
-
|
|
2192
|
-
}, _PatterTool.DIAL_CAPTURE_TIMEOUT_MS);
|
|
2193
|
-
});
|
|
2194
|
-
await this.phone.call({
|
|
2195
|
-
to,
|
|
2196
|
-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
2197
|
-
agent
|
|
2198
|
-
});
|
|
2199
|
-
const callId = await callIdPromise;
|
|
2200
|
-
if (captureTimer) clearTimeout(captureTimer);
|
|
2201
|
-
return callId;
|
|
2392
|
+
result = await Promise.race([
|
|
2393
|
+
this.phone.call({
|
|
2394
|
+
to: args.to,
|
|
2395
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
2396
|
+
agent: overrideAgent,
|
|
2397
|
+
wait: true
|
|
2398
|
+
}),
|
|
2399
|
+
timeout
|
|
2400
|
+
]);
|
|
2202
2401
|
} finally {
|
|
2203
|
-
if (
|
|
2204
|
-
this.pendingDial = null;
|
|
2205
|
-
release();
|
|
2402
|
+
if (timer) clearTimeout(timer);
|
|
2206
2403
|
}
|
|
2404
|
+
return resultFromCallResult(result);
|
|
2207
2405
|
}
|
|
2208
2406
|
/**
|
|
2209
2407
|
* Hermes-style handler: `(args, kwargs) => Promise<string>` returning a JSON
|
|
@@ -2221,32 +2419,32 @@ var PatterTool = class _PatterTool {
|
|
|
2221
2419
|
}
|
|
2222
2420
|
};
|
|
2223
2421
|
}
|
|
2224
|
-
// --- Internal: onCallEnd dispatcher -------------------------------------
|
|
2225
|
-
async onCallEndHandler(data) {
|
|
2226
|
-
const callId = data.call_id || "";
|
|
2227
|
-
if (!callId) return;
|
|
2228
|
-
const pending = this.pending.get(callId);
|
|
2229
|
-
if (!pending) {
|
|
2230
|
-
this.bus.emit("orphan_end", { call_id: callId, data });
|
|
2231
|
-
return;
|
|
2232
|
-
}
|
|
2233
|
-
clearTimeout(pending.timer);
|
|
2234
|
-
this.pending.delete(callId);
|
|
2235
|
-
const metrics = data.metrics && typeof data.metrics === "object" ? data.metrics : null;
|
|
2236
|
-
const cost = metrics && typeof metrics.cost === "object" && metrics.cost && typeof metrics.cost.total === "number" ? metrics.cost.total : void 0;
|
|
2237
|
-
const duration = typeof metrics?.duration_seconds === "number" ? metrics?.duration_seconds : Math.max(0, Date.now() / 1e3 - pending.startedAt);
|
|
2238
|
-
const transcript = Array.isArray(data.transcript) ? data.transcript : [];
|
|
2239
|
-
const status = data.status || "completed";
|
|
2240
|
-
pending.resolve({
|
|
2241
|
-
call_id: callId,
|
|
2242
|
-
status,
|
|
2243
|
-
duration_seconds: duration,
|
|
2244
|
-
cost_usd: cost,
|
|
2245
|
-
transcript,
|
|
2246
|
-
metrics
|
|
2247
|
-
});
|
|
2248
|
-
}
|
|
2249
2422
|
};
|
|
2423
|
+
/**
 * Convert the camelCase call result returned by `phone.call({ wait: true })`
 * into the snake_case envelope that `PatterTool.execute` hands back.
 *
 * Every field is defaulted so the envelope always has the same shape:
 *  - a missing/falsy `result` yields an all-defaults envelope;
 *  - `duration_seconds` falls back to `0` and `cost_usd` to `undefined`
 *    unless the source value is a finite number (NaN / Infinity are rejected
 *    because they do not survive JSON serialisation as numbers);
 *  - `transcript` is shallow-copied only when it really is an array. A string
 *    or plain object would otherwise be split into characters or throw on
 *    spread, so anything else becomes `[]`.
 *
 * @param {object|null|undefined} result Call result with `callId`, `status`,
 *   `outcome`, `durationSeconds`, `cost.total`, `transcript` and `metrics`.
 * @returns {{call_id: string, status: string, outcome: string,
 *   duration_seconds: number, cost_usd: (number|undefined),
 *   transcript: Array, metrics: (object|null)}}
 */
function resultFromCallResult(result) {
  if (!result) {
    return {
      call_id: "",
      status: "completed",
      outcome: "",
      duration_seconds: 0,
      cost_usd: void 0,
      transcript: [],
      metrics: null
    };
  }
  const costTotal = result.cost?.total;
  const duration = result.durationSeconds;
  return {
    call_id: result.callId || "",
    status: result.status || "completed",
    outcome: result.outcome || "",
    // Number.isFinite is false for non-numbers, NaN and +/-Infinity alike.
    duration_seconds: Number.isFinite(duration) ? duration : 0,
    cost_usd: Number.isFinite(costTotal) ? costTotal : void 0,
    // Copy so later mutation of the envelope cannot touch the SDK's array.
    transcript: Array.isArray(result.transcript) ? [...result.transcript] : [],
    metrics: result.metrics ? result.metrics : null
  };
}
|
|
2250
2448
|
|
|
2251
2449
|
// src/providers/gemini-live.ts
|
|
2252
2450
|
init_esm_shims();
|
|
@@ -2764,54 +2962,642 @@ function scheduleInterval(intervalOrOpts, callback) {
|
|
|
2764
2962
|
};
|
|
2765
2963
|
}
|
|
2766
2964
|
|
|
2767
|
-
// src/
|
|
2965
|
+
// src/providers/elevenlabs-tts.ts
|
|
2768
2966
|
init_esm_shims();
|
|
2769
|
-
var
|
|
2770
|
-
|
|
2771
|
-
|
|
2772
|
-
|
|
2773
|
-
|
|
2774
|
-
|
|
2775
|
-
|
|
2776
|
-
|
|
2777
|
-
|
|
2778
|
-
|
|
2779
|
-
|
|
2780
|
-
|
|
2781
|
-
|
|
2782
|
-
|
|
2783
|
-
|
|
2784
|
-
|
|
2785
|
-
|
|
2786
|
-
|
|
2787
|
-
|
|
2788
|
-
|
|
2789
|
-
|
|
2790
|
-
|
|
2791
|
-
|
|
2792
|
-
|
|
2967
|
+
var ELEVENLABS_BASE_URL = "https://api.elevenlabs.io/v1";
|
|
2968
|
+
var ELEVENLABS_VOICE_ID_BY_NAME = {
|
|
2969
|
+
rachel: "21m00Tcm4TlvDq8ikWAM",
|
|
2970
|
+
drew: "29vD33N1CtxCmqQRPOHJ",
|
|
2971
|
+
clyde: "2EiwWnXFnvU5JabPnv8n",
|
|
2972
|
+
paul: "5Q0t7uMcjvnagumLfvZi",
|
|
2973
|
+
domi: "AZnzlk1XvdvUeBnXmlld",
|
|
2974
|
+
dave: "CYw3kZ02Hs0563khs1Fj",
|
|
2975
|
+
fin: "D38z5RcWu1voky8WS1ja",
|
|
2976
|
+
bella: "EXAVITQu4vr4xnSDxMaL",
|
|
2977
|
+
antoni: "ErXwobaYiN019PkySvjV",
|
|
2978
|
+
thomas: "GBv7mTt0atIp3Br8iCZE",
|
|
2979
|
+
charlie: "IKne3meq5aSn9XLyUdCD",
|
|
2980
|
+
george: "JBFqnCBsd6RMkjVDRZzb",
|
|
2981
|
+
emily: "LcfcDJNUP1GQjkzn1xUU",
|
|
2982
|
+
elli: "MF3mGyEYCl7XYWbV9V6O",
|
|
2983
|
+
callum: "N2lVS1w4EtoT3dr4eOWO",
|
|
2984
|
+
patrick: "ODq5zmih8GrVes37Dizd",
|
|
2985
|
+
harry: "SOYHLrjzK2X1ezoPC6cr",
|
|
2986
|
+
liam: "TX3LPaxmHKxFdv7VOQHJ",
|
|
2987
|
+
dorothy: "ThT5KcBeYPX3keUQqHPh",
|
|
2988
|
+
josh: "TxGEqnHWrfWFTfGW9XjX",
|
|
2989
|
+
arnold: "VR6AewLTigWG4xSOukaG",
|
|
2990
|
+
charlotte: "XB0fDUnXU5powFXDhCwa",
|
|
2991
|
+
matilda: "XrExE9yKIg1WjnnlVkGX",
|
|
2992
|
+
matthew: "Yko7PKHZNXotIFUBG7I9",
|
|
2993
|
+
james: "ZQe5CZNOzWyzPSCn5a3c",
|
|
2994
|
+
joseph: "Zlb1dXrM653N07WRdFW3",
|
|
2995
|
+
jeremy: "bVMeCyTHy58xNoL34h3p",
|
|
2996
|
+
michael: "flq6f7yk4E4fJM5XTYuZ",
|
|
2997
|
+
ethan: "g5CIjZEefAph4nQFvHAz",
|
|
2998
|
+
gigi: "jBpfuIE2acCO8z3wKNLl",
|
|
2999
|
+
freya: "jsCqWAovK2LkecY7zXl4",
|
|
3000
|
+
brian: "nPczCjzI2devNBz1zQrb",
|
|
3001
|
+
grace: "oWAxZDx7w5VEj9dCyTzz",
|
|
3002
|
+
daniel: "onwK4e9ZLuTAKqWW03F9",
|
|
3003
|
+
lily: "pFZP5JQG7iQjIQuC4Bku",
|
|
3004
|
+
serena: "pMsXgVXv3BLzUgSXRplE",
|
|
3005
|
+
adam: "pNInz6obpgDQGcFmaJgB",
|
|
3006
|
+
nicole: "piTKgcLEGmPE4e6mEKli",
|
|
3007
|
+
bill: "pqHfZKP75CvOlQylNhV4",
|
|
3008
|
+
jessie: "t0jbNlBVZ17f02VDIeMI",
|
|
3009
|
+
ryan: "wViXBPUzp2ZZixB1xQuM",
|
|
3010
|
+
sam: "yoZ06aMxZJJ28mfd3POQ",
|
|
3011
|
+
glinda: "z9fAnlkpzviPz146aGWa",
|
|
3012
|
+
giovanni: "zcAOhNBS3c14rBihAFp1",
|
|
3013
|
+
mimi: "zrHiDhphv9ZnVXBqCLjz",
|
|
3014
|
+
sarah: "EXAVITQu4vr4xnSDxMaL",
|
|
3015
|
+
alloy: "EXAVITQu4vr4xnSDxMaL"
|
|
2793
3016
|
};
|
|
2794
|
-
|
|
2795
|
-
|
|
2796
|
-
|
|
2797
|
-
|
|
2798
|
-
//
|
|
2799
|
-
|
|
2800
|
-
|
|
2801
|
-
|
|
2802
|
-
|
|
2803
|
-
|
|
2804
|
-
|
|
2805
|
-
|
|
2806
|
-
|
|
2807
|
-
|
|
2808
|
-
|
|
2809
|
-
|
|
2810
|
-
|
|
2811
|
-
|
|
2812
|
-
|
|
2813
|
-
|
|
2814
|
-
|
|
3017
|
+
var VOICE_ID_PATTERN = /^[A-Za-z0-9]{20}$/;
|
|
3018
|
+
var CARRIER_NATIVE_FORMAT = {
|
|
3019
|
+
twilio: "ulaw_8000",
|
|
3020
|
+
telnyx: "pcm_16000",
|
|
3021
|
+
// Plivo streams mulaw 8 kHz (we pin contentType in the answer XML).
|
|
3022
|
+
plivo: "ulaw_8000"
|
|
3023
|
+
};
|
|
3024
|
+
/**
 * Resolve a user-supplied voice reference to an ElevenLabs voice ID.
 *
 * Resolution order:
 *  1. falsy input is returned untouched;
 *  2. a value matching `VOICE_ID_PATTERN` is taken to be a voice ID already;
 *  3. otherwise the lower-cased value is looked up in
 *     `ELEVENLABS_VOICE_ID_BY_NAME`;
 *  4. unknown names are passed through unchanged.
 *
 * The lookup is restricted to the table's own properties. A plain
 * `table[key]` access would also find inherited members, so a voice called
 * "constructor" or "__proto__" would resolve to a function/object instead of
 * a string.
 *
 * @param {string} voice Voice name or voice ID.
 * @returns {string} The resolved voice ID, or `voice` itself when unmapped.
 */
function resolveVoiceId(voice) {
  if (!voice) return voice;
  if (VOICE_ID_PATTERN.test(voice)) return voice;
  const key = voice.toLowerCase();
  const mapped = Object.hasOwn(ELEVENLABS_VOICE_ID_BY_NAME, key) ? ELEVENLABS_VOICE_ID_BY_NAME[key] : void 0;
  return mapped ?? voice;
}
|
|
3029
|
+
var ElevenLabsModel = {
|
|
3030
|
+
V3: "eleven_v3",
|
|
3031
|
+
FLASH_V2_5: "eleven_flash_v2_5",
|
|
3032
|
+
TURBO_V2_5: "eleven_turbo_v2_5",
|
|
3033
|
+
MULTILINGUAL_V2: "eleven_multilingual_v2",
|
|
3034
|
+
MONOLINGUAL_V1: "eleven_monolingual_v1"
|
|
3035
|
+
};
|
|
3036
|
+
var ElevenLabsOutputFormat = {
|
|
3037
|
+
MP3_22050_32: "mp3_22050_32",
|
|
3038
|
+
MP3_44100_32: "mp3_44100_32",
|
|
3039
|
+
MP3_44100_64: "mp3_44100_64",
|
|
3040
|
+
MP3_44100_96: "mp3_44100_96",
|
|
3041
|
+
MP3_44100_128: "mp3_44100_128",
|
|
3042
|
+
MP3_44100_192: "mp3_44100_192",
|
|
3043
|
+
PCM_8000: "pcm_8000",
|
|
3044
|
+
PCM_16000: "pcm_16000",
|
|
3045
|
+
PCM_22050: "pcm_22050",
|
|
3046
|
+
PCM_24000: "pcm_24000",
|
|
3047
|
+
PCM_44100: "pcm_44100",
|
|
3048
|
+
ULAW_8000: "ulaw_8000"
|
|
3049
|
+
};
|
|
3050
|
+
/**
 * REST (HTTP streaming) client for ElevenLabs text-to-speech.
 *
 * Accepts either positional arguments `(apiKey, voiceId, modelId, outputFormat)`
 * or `(apiKey, optionsObject)`. Voice names are mapped to IDs through
 * `resolveVoiceId`.
 */
var ElevenLabsTTS = class _ElevenLabsTTS {
  // Stable pricing/dashboard key — read by stream-handler / metrics via
  // ``(agent.tts.constructor as any).providerKey``. Without this the cost
  // calculator falls back to ``constructor.name`` ("ElevenLabsTTS") which
  // does NOT match the pricing table key "elevenlabs", silently zeroing
  // TTS cost for callers that construct the raw REST class directly
  // (exposed at top level as ``ElevenLabsRestTTS``).
  static providerKey = "elevenlabs";
  apiKey;
  voiceId;
  modelId;
  _outputFormat;
  _outputFormatExplicit;
  voiceSettings;
  languageCode;
  chunkSize;
  /**
   * Public view of the (possibly auto-flipped) wire format. Read by the
   * stream-handler to decide whether to skip the client-side resample +
   * mulaw encode when the bytes are already in the carrier's wire codec.
   */
  get outputFormat() {
    return this._outputFormat;
  }
  /**
   * @param {string} apiKey ElevenLabs API key, sent as `xi-api-key`.
   * @param {string|object|null} [voiceIdOrOptions] Voice name/ID, or an
   *   options object (`voiceId`, `modelId`, `outputFormat`, `voiceSettings`,
   *   `languageCode`, `chunkSize`). `null` is treated like an omitted voice.
   * @param {string} [modelId] Positional form only.
   * @param {string} [outputFormat] Positional form only. Passing the default
   *   `pcm_16000` positionally is indistinguishable from omitting it, so it is
   *   not counted as an explicit choice.
   */
  constructor(apiKey, voiceIdOrOptions = "21m00Tcm4TlvDq8ikWAM", modelId = ElevenLabsModel.FLASH_V2_5, outputFormat = ElevenLabsOutputFormat.PCM_16000) {
    this.apiKey = apiKey;
    // `typeof null === "object"`, so null must be excluded explicitly or the
    // options branch would crash reading `null.voiceId`.
    if (voiceIdOrOptions !== null && typeof voiceIdOrOptions === "object") {
      const o = voiceIdOrOptions;
      this.voiceId = resolveVoiceId(o.voiceId ?? "21m00Tcm4TlvDq8ikWAM");
      this.modelId = o.modelId ?? ElevenLabsModel.FLASH_V2_5;
      this._outputFormatExplicit = o.outputFormat !== void 0;
      this._outputFormat = o.outputFormat ?? ElevenLabsOutputFormat.PCM_16000;
      this.voiceSettings = o.voiceSettings;
      this.languageCode = o.languageCode;
      this.chunkSize = o.chunkSize ?? 4096;
    } else {
      this.voiceId = resolveVoiceId(voiceIdOrOptions ?? "21m00Tcm4TlvDq8ikWAM");
      this.modelId = modelId;
      this._outputFormatExplicit = outputFormat !== ElevenLabsOutputFormat.PCM_16000;
      this._outputFormat = outputFormat;
      this.voiceSettings = void 0;
      this.languageCode = void 0;
      this.chunkSize = 4096;
    }
  }
  /**
   * Hook called by ``StreamHandler.initPipeline`` to advise the carrier
   * wire format. When the user did NOT pass an explicit ``outputFormat``,
   * switch to the carrier's entry in ``CARRIER_NATIVE_FORMAT`` (Twilio,
   * Telnyx and Plivo are mapped) so the audio ElevenLabs returns is already
   * in the carrier's wire format and no client-side resample / μ-law encode
   * is needed.
   *
   * No-op when the caller passed an explicit ``outputFormat`` (incl. via
   * the ``forTwilio`` / ``forTelnyx`` factories) — user wins. Unknown
   * carriers leave the format untouched.
   *
   * Parity with {@link ElevenLabsWebSocketTTS.setTelephonyCarrier}.
   */
  setTelephonyCarrier(carrier) {
    if (this._outputFormatExplicit) return;
    const native = CARRIER_NATIVE_FORMAT[carrier];
    if (native !== void 0) this._outputFormat = native;
  }
  /**
   * Construct an instance pre-configured for Twilio Media Streams.
   *
   * Sets `outputFormat='ulaw_8000'` so ElevenLabs emits μ-law @ 8 kHz
   * directly — the exact wire format Twilio's media stream uses — letting
   * the SDK skip the 16 kHz→8 kHz resample and PCM→μ-law conversion.
   *
   * `voiceSettings` defaults to a low-bandwidth-friendly profile
   * (speaker boost off, modest stability). Pass an explicit object to
   * override. Any `outputFormat` in `options` is overridden.
   */
  static forTwilio(apiKey, options = {}) {
    const voiceSettings = options.voiceSettings ?? {
      // Speaker boost adds high-frequency emphasis that aliases ugly over an
      // 8 kHz μ-law line. Slightly higher stability tames the excursions
      // that compander quantization noise can amplify.
      stability: 0.6,
      similarity_boost: 0.75,
      use_speaker_boost: false
    };
    return new _ElevenLabsTTS(apiKey, {
      ...options,
      voiceSettings,
      outputFormat: ElevenLabsOutputFormat.ULAW_8000
    });
  }
  /**
   * Construct an instance pre-configured for Telnyx bidirectional media.
   *
   * Pins `outputFormat` to `pcm_16000` (overriding any value in `options`)
   * and marks it explicit, so `setTelephonyCarrier` will not change it.
   * If your Telnyx profile is pinned to PCMU/8000 (μ-law), construct
   * `ElevenLabsTTS` directly with `outputFormat: 'ulaw_8000'` instead.
   */
  static forTelnyx(apiKey, options = {}) {
    return new _ElevenLabsTTS(apiKey, {
      ...options,
      outputFormat: ElevenLabsOutputFormat.PCM_16000
    });
  }
  /**
   * Synthesise text to speech and return the full audio as a single Buffer.
   *
   * For large chunks (or when latency matters) call `synthesizeStream` instead.
   */
  async synthesize(text) {
    const chunks = [];
    for await (const chunk of this.synthesizeStream(text)) {
      chunks.push(chunk);
    }
    return Buffer.concat(chunks);
  }
  /**
   * Synthesise text and yield audio chunks as they arrive (streaming).
   *
   * The yielded buffers are in whatever `outputFormat` is currently
   * configured. `chunkSize` is the maximum size of each yielded buffer;
   * a non-finite or non-positive `chunkSize` falls back to 4096, because
   * the slicing loop below would otherwise never advance.
   *
   * @throws {Error} On a non-2xx response or a response without a body.
   */
  async *synthesizeStream(text) {
    const url = `${ELEVENLABS_BASE_URL}/text-to-speech/${encodeURIComponent(this.voiceId)}/stream?output_format=${encodeURIComponent(this._outputFormat)}`;
    const body = {
      text,
      model_id: this.modelId
    };
    if (this.voiceSettings) body["voice_settings"] = this.voiceSettings;
    if (this.languageCode) body["language_code"] = this.languageCode;
    const response = await fetch(url, {
      method: "POST",
      headers: {
        "xi-api-key": this.apiKey,
        "Content-Type": "application/json"
      },
      body: JSON.stringify(body),
      signal: AbortSignal.timeout(3e4)
    });
    if (!response.ok) {
      const errBody = await response.text();
      throw new Error(`ElevenLabs TTS error ${response.status}: ${errBody}`);
    }
    if (!response.body) {
      throw new Error("ElevenLabs TTS: no response body");
    }
    // Guard against 0 / negative / NaN step sizes (chunkSize is a public,
    // mutable field), which would turn the loop below into an infinite one.
    const step = Number.isFinite(this.chunkSize) && this.chunkSize >= 1 ? Math.floor(this.chunkSize) : 4096;
    const reader = response.body.getReader();
    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        if (!value || value.length === 0) continue;
        const buf = Buffer.from(value);
        for (let offset = 0; offset < buf.length; offset += step) {
          yield buf.subarray(offset, Math.min(offset + step, buf.length));
        }
      }
    } finally {
      // Runs on normal completion and on early `return()` by the consumer;
      // cancel errors are irrelevant at this point and deliberately ignored.
      if (typeof reader.cancel === "function") await reader.cancel().catch(() => {
      });
      reader.releaseLock();
    }
  }
};
|
|
3221
|
+
|
|
3222
|
+
// src/providers/cartesia-tts.ts
|
|
3223
|
+
init_esm_shims();
|
|
3224
|
+
var CARTESIA_BASE_URL = "https://api.cartesia.ai";
|
|
3225
|
+
var CARTESIA_API_VERSION = "2025-04-16";
|
|
3226
|
+
var CARTESIA_DEFAULT_VOICE_ID = "f786b574-daa5-4673-aa0c-cbe3e8534c02";
|
|
3227
|
+
var CartesiaTTSModel = {
|
|
3228
|
+
SONIC_3: "sonic-3",
|
|
3229
|
+
SONIC_2: "sonic-2",
|
|
3230
|
+
SONIC: "sonic"
|
|
3231
|
+
};
|
|
3232
|
+
var CartesiaTTSContainer = {
|
|
3233
|
+
RAW: "raw",
|
|
3234
|
+
WAV: "wav",
|
|
3235
|
+
MP3: "mp3"
|
|
3236
|
+
};
|
|
3237
|
+
var CartesiaTTSEncoding = {
|
|
3238
|
+
PCM_S16LE: "pcm_s16le",
|
|
3239
|
+
PCM_F32LE: "pcm_f32le",
|
|
3240
|
+
PCM_MULAW: "pcm_mulaw",
|
|
3241
|
+
PCM_ALAW: "pcm_alaw"
|
|
3242
|
+
};
|
|
3243
|
+
var CartesiaTTSSampleRate = {
|
|
3244
|
+
HZ_8000: 8e3,
|
|
3245
|
+
HZ_16000: 16e3,
|
|
3246
|
+
HZ_22050: 22050,
|
|
3247
|
+
HZ_24000: 24e3,
|
|
3248
|
+
HZ_44100: 44100
|
|
3249
|
+
};
|
|
3250
|
+
var CartesiaTTSVoiceMode = {
|
|
3251
|
+
ID: "id",
|
|
3252
|
+
EMBEDDING: "embedding"
|
|
3253
|
+
};
|
|
3254
|
+
/**
 * HTTP client for Cartesia's `/tts/bytes` text-to-speech endpoint.
 *
 * Audio is always requested as a raw container with `pcm_s16le` encoding at
 * the configured `sampleRate`.
 */
var CartesiaTTS = class _CartesiaTTS {
  /** Stable pricing/dashboard key — read by stream-handler/metrics. */
  static providerKey = "cartesia_tts";
  apiKey;
  model;
  voice;
  language;
  sampleRate;
  speed;
  emotion;
  volume;
  baseUrl;
  apiVersion;
  /**
   * @param {string} apiKey Cartesia API key, sent as `X-API-Key`.
   * @param {object} [opts] Optional `model`, `voice`, `language`,
   *   `sampleRate`, `speed`, `emotion` (string or array), `volume`,
   *   `baseUrl`, `apiVersion`.
   */
  constructor(apiKey, opts = {}) {
    this.apiKey = apiKey;
    this.model = opts.model ?? CartesiaTTSModel.SONIC_3;
    this.voice = opts.voice ?? CARTESIA_DEFAULT_VOICE_ID;
    this.language = opts.language ?? "en";
    this.sampleRate = opts.sampleRate ?? CartesiaTTSSampleRate.HZ_16000;
    this.speed = opts.speed;
    // Normalise a single emotion string to a one-element array.
    this.emotion = typeof opts.emotion === "string" ? [opts.emotion] : opts.emotion;
    this.volume = opts.volume;
    // Endpoints are built as `${baseUrl}/path`; strip trailing slashes so a
    // base such as "https://proxy.example/" does not produce "//tts/bytes".
    this.baseUrl = String(opts.baseUrl ?? CARTESIA_BASE_URL).replace(/\/+$/, "");
    this.apiVersion = opts.apiVersion ?? CARTESIA_API_VERSION;
  }
  /**
   * Construct an instance pre-configured for Twilio Media Streams.
   *
   * Sets `sampleRate=8000` so Cartesia emits PCM_S16LE @ 8 kHz directly.
   * Twilio's media stream uses μ-law @ 8 kHz so the SDK still does the
   * PCM → μ-law transcode client-side, but the 16 kHz → 8 kHz resample
   * step is skipped. Any `sampleRate` in `options` is overridden.
   */
  static forTwilio(apiKey, options = {}) {
    return new _CartesiaTTS(apiKey, {
      ...options,
      sampleRate: CartesiaTTSSampleRate.HZ_8000
    });
  }
  /**
   * Construct an instance pre-configured for Telnyx bidirectional media.
   *
   * Sets `sampleRate=16000` (overriding any value in `options`). Same as
   * the bare-constructor default; exists for API symmetry with
   * {@link CartesiaTTS.forTwilio}.
   */
  static forTelnyx(apiKey, options = {}) {
    return new _CartesiaTTS(apiKey, {
      ...options,
      sampleRate: CartesiaTTSSampleRate.HZ_16000
    });
  }
  /**
   * Build the JSON payload for the Cartesia bytes endpoint.
   *
   * `generation_config` is attached only when at least one of `speed`,
   * `emotion` or `volume` is set; only the first configured emotion is sent.
   */
  buildPayload(text) {
    const payload = {
      model_id: this.model,
      voice: { mode: CartesiaTTSVoiceMode.ID, id: this.voice },
      transcript: text,
      output_format: {
        container: CartesiaTTSContainer.RAW,
        encoding: CartesiaTTSEncoding.PCM_S16LE,
        sample_rate: this.sampleRate
      },
      language: this.language
    };
    const generationConfig = {};
    if (this.speed !== void 0) generationConfig.speed = this.speed;
    if (this.emotion && this.emotion.length > 0) {
      generationConfig.emotion = this.emotion[0];
    }
    if (this.volume !== void 0) generationConfig.volume = this.volume;
    if (Object.keys(generationConfig).length > 0) {
      payload.generation_config = generationConfig;
    }
    return payload;
  }
  /**
   * Pre-call HTTP warmup for the Cartesia `/tts/bytes` endpoint.
   *
   * Issues a lightweight `GET <baseUrl>/voices` so DNS, TLS, and the HTTP
   * connection are already up by the time the first `synthesizeStream()`
   * POST lands. Best-effort: 5 s timeout, all exceptions swallowed at
   * debug level.
   *
   * The response body is read to completion and discarded: with Node's
   * fetch an unconsumed body can keep the connection from returning to the
   * pool, which would defeat the purpose of warming it.
   */
  async warmup() {
    try {
      const response = await fetch(`${this.baseUrl}/voices`, {
        method: "GET",
        headers: {
          "X-API-Key": this.apiKey,
          "Cartesia-Version": this.apiVersion
        },
        signal: AbortSignal.timeout(5e3)
      });
      await response.arrayBuffer();
    } catch (err) {
      getLogger().debug(`Cartesia TTS warmup failed (best-effort): ${String(err)}`);
    }
  }
  /** Synthesize text and return the concatenated audio buffer. */
  async synthesize(text) {
    const chunks = [];
    for await (const chunk of this.synthesizeStream(text)) {
      chunks.push(chunk);
    }
    return Buffer.concat(chunks);
  }
  /**
   * Synthesize text and yield raw PCM_S16LE chunks at the configured
   * `sampleRate` as they arrive from Cartesia.
   *
   * @throws {Error} On a non-2xx response or a response without a body.
   */
  async *synthesizeStream(text) {
    const response = await fetch(`${this.baseUrl}/tts/bytes`, {
      method: "POST",
      headers: {
        "X-API-Key": this.apiKey,
        "Cartesia-Version": this.apiVersion,
        "Content-Type": "application/json"
      },
      body: JSON.stringify(this.buildPayload(text)),
      signal: AbortSignal.timeout(3e4)
    });
    if (!response.ok) {
      const body = await response.text();
      throw new Error(`Cartesia TTS error ${response.status}: ${body}`);
    }
    if (!response.body) {
      throw new Error("Cartesia TTS: no response body");
    }
    const reader = response.body.getReader();
    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        if (value && value.length > 0) {
          yield Buffer.from(value);
        }
      }
    } finally {
      // Runs on completion and on early consumer exit; cancel errors are
      // deliberately ignored because nothing can be done about them here.
      if (typeof reader.cancel === "function") {
        await reader.cancel().catch(() => {
        });
      }
      reader.releaseLock();
    }
  }
};
|
|
3410
|
+
|
|
3411
|
+
// src/providers/rime-tts.ts
|
|
3412
|
+
init_esm_shims();
|
|
3413
|
+
var RIME_BASE_URL = "https://users.rime.ai/v1/rime-tts";
|
|
3414
|
+
var RimeModel = {
|
|
3415
|
+
ARCANA: "arcana",
|
|
3416
|
+
MIST: "mist",
|
|
3417
|
+
MIST_V2: "mistv2"
|
|
3418
|
+
};
|
|
3419
|
+
var RimeAudioFormat = {
|
|
3420
|
+
PCM: "audio/pcm",
|
|
3421
|
+
MP3: "audio/mp3",
|
|
3422
|
+
WAV: "audio/wav",
|
|
3423
|
+
MULAW: "audio/mulaw"
|
|
3424
|
+
};
|
|
3425
|
+
var ARCANA_MODEL_TIMEOUT_MS = 60 * 4 * 1e3;
|
|
3426
|
+
var MIST_MODEL_TIMEOUT_MS = 30 * 1e3;
|
|
3427
|
+
/**
 * Tell whether a Rime model identifier belongs to the Mist family, i.e.
 * whether it contains the `RimeModel.MIST` tag anywhere in its name.
 *
 * @param {string} model Rime model identifier.
 * @returns {boolean} `true` when the identifier contains the Mist tag.
 */
function isMistModel(model) {
  const mistTag = RimeModel.MIST;
  const tagPosition = model.indexOf(mistTag);
  return tagPosition !== -1;
}
|
|
3430
|
+
/**
 * Pick the total request timeout for a Rime model: the Arcana budget for
 * exactly `RimeModel.ARCANA`, the Mist budget for every other identifier.
 *
 * @param {string} model Rime model identifier.
 * @returns {number} Timeout in milliseconds.
 */
function timeoutForModel(model) {
  const isArcana = model === RimeModel.ARCANA;
  return isArcana ? ARCANA_MODEL_TIMEOUT_MS : MIST_MODEL_TIMEOUT_MS;
}
|
|
3434
|
+
/**
 * HTTP client for Rime text-to-speech.
 *
 * Requests `audio/pcm` and streams the response bytes back to the caller.
 * The request payload differs per model family (Arcana vs Mist); see
 * {@link RimeTTS#buildPayload}.
 */
var RimeTTS = class {
  /** Stable pricing/dashboard key — read by stream-handler/metrics. */
  static providerKey = "rime";
  apiKey;
  model;
  speaker;
  lang;
  sampleRate;
  repetitionPenalty;
  temperature;
  topP;
  maxTokens;
  speedAlpha;
  reduceLatency;
  pauseBetweenBrackets;
  phonemizeBetweenBrackets;
  baseUrl;
  totalTimeoutMs;
  /**
   * @param {string} apiKey Rime API key, sent as a Bearer token.
   * @param {object} [opts] Optional `model`, `speaker`, `lang`, `sampleRate`,
   *   sampling controls (`repetitionPenalty`, `temperature`, `topP`,
   *   `maxTokens`), Mist controls (`speedAlpha`, `reduceLatency`,
   *   `pauseBetweenBrackets`, `phonemizeBetweenBrackets`) and `baseUrl`.
   */
  constructor(apiKey, opts = {}) {
    this.apiKey = apiKey;
    this.model = opts.model ?? RimeModel.ARCANA;
    // The default speaker depends on the model family.
    const defaultSpeaker = isMistModel(this.model) ? "cove" : "astra";
    this.speaker = opts.speaker ?? defaultSpeaker;
    this.lang = opts.lang ?? "eng";
    this.sampleRate = opts.sampleRate ?? 16e3;
    this.repetitionPenalty = opts.repetitionPenalty;
    this.temperature = opts.temperature;
    this.topP = opts.topP;
    this.maxTokens = opts.maxTokens;
    this.speedAlpha = opts.speedAlpha;
    this.reduceLatency = opts.reduceLatency;
    this.pauseBetweenBrackets = opts.pauseBetweenBrackets;
    this.phonemizeBetweenBrackets = opts.phonemizeBetweenBrackets;
    this.baseUrl = opts.baseUrl ?? RIME_BASE_URL;
    // Fixed at construction time from the model chosen above.
    this.totalTimeoutMs = timeoutForModel(this.model);
  }
  /**
   * Build the JSON request body for `text`.
   *
   * Always contains `speaker`, `text` and `modelId`. Arcana adds the
   * sampling controls that are set plus `lang` / `samplingRate`; Mist models
   * add `lang` / `samplingRate` plus the Mist controls that are set
   * (`reduceLatency` only for `mistv2`). Any other model identifier gets
   * just the three base fields.
   */
  buildPayload(text) {
    const payload = {
      speaker: this.speaker,
      text,
      modelId: this.model
    };
    if (this.model === RimeModel.ARCANA) {
      if (this.repetitionPenalty !== void 0) {
        payload.repetition_penalty = this.repetitionPenalty;
      }
      if (this.temperature !== void 0) payload.temperature = this.temperature;
      if (this.topP !== void 0) payload.top_p = this.topP;
      if (this.maxTokens !== void 0) payload.max_tokens = this.maxTokens;
      payload.lang = this.lang;
      payload.samplingRate = this.sampleRate;
    } else if (isMistModel(this.model)) {
      payload.lang = this.lang;
      payload.samplingRate = this.sampleRate;
      if (this.speedAlpha !== void 0) payload.speedAlpha = this.speedAlpha;
      if (this.model === RimeModel.MIST_V2 && this.reduceLatency !== void 0) {
        payload.reduceLatency = this.reduceLatency;
      }
      if (this.pauseBetweenBrackets !== void 0) {
        payload.pauseBetweenBrackets = this.pauseBetweenBrackets;
      }
      if (this.phonemizeBetweenBrackets !== void 0) {
        payload.phonemizeBetweenBrackets = this.phonemizeBetweenBrackets;
      }
    }
    return payload;
  }
  /** Synthesize text and return the concatenated audio buffer. */
  async synthesize(text) {
    const chunks = [];
    for await (const chunk of this.synthesizeStream(text)) {
      chunks.push(chunk);
    }
    return Buffer.concat(chunks);
  }
  /**
   * Synthesize text and yield the audio bytes as they stream in
   * (`audio/pcm` is requested via the `accept` header).
   *
   * @throws {Error} On a non-2xx response, on a 2xx response whose
   *   `content-type` is not an audio type (the first 500 characters of the
   *   body are included in the message), or when the body is missing.
   */
  async *synthesizeStream(text) {
    const response = await fetch(this.baseUrl, {
      method: "POST",
      headers: {
        accept: RimeAudioFormat.PCM,
        Authorization: `Bearer ${this.apiKey}`,
        "content-type": "application/json"
      },
      body: JSON.stringify(this.buildPayload(text)),
      signal: AbortSignal.timeout(this.totalTimeoutMs)
    });
    if (!response.ok) {
      const body = await response.text();
      throw new Error(`Rime TTS error ${response.status}: ${body}`);
    }
    // Media types are case-insensitive, so compare in lower case; otherwise
    // a valid "Audio/PCM" response would be rejected as non-audio.
    const contentType = (response.headers.get("content-type") ?? "").trim().toLowerCase();
    if (!contentType.startsWith("audio")) {
      const body = await response.text();
      throw new Error(`Rime returned non-audio response: ${body.slice(0, 500)}`);
    }
    if (!response.body) {
      throw new Error("Rime TTS: no response body");
    }
    const reader = response.body.getReader();
    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        if (value && value.length > 0) {
          yield Buffer.from(value);
        }
      }
    } finally {
      // Runs on completion and on early consumer exit; cancel errors are
      // deliberately ignored because nothing can be done about them here.
      if (typeof reader.cancel === "function") {
        await reader.cancel().catch(() => {
        });
      }
      reader.releaseLock();
    }
  }
};
|
|
3552
|
+
|
|
3553
|
+
// src/stt/deepgram.ts
|
|
3554
|
+
init_esm_shims();
|
|
3555
|
+
/**
 * Options-object front end for `DeepgramSTT`.
 *
 * Resolves the API key from `opts.apiKey` or the `DEEPGRAM_API_KEY`
 * environment variable, fills in defaults for every tuning knob, and
 * forwards everything to the positional `DeepgramSTT` constructor.
 */
var STT = class extends DeepgramSTT {
  static providerKey = "deepgram";
  /**
   * @param {object} [opts] Optional `apiKey`, `language`, `model`,
   *   `encoding`, `sampleRate`, `endpointingMs`, `utteranceEndMs`,
   *   `smartFormat`, `interimResults`, `vadEvents`.
   * @throws {Error} When no API key is supplied and `DEEPGRAM_API_KEY` is unset.
   */
  constructor(opts = {}) {
    const apiKey = opts.apiKey ?? process.env.DEEPGRAM_API_KEY;
    if (!apiKey) {
      throw new Error(
        "Deepgram STT requires an apiKey. Pass { apiKey: 'dg_...' } or set DEEPGRAM_API_KEY in the environment."
      );
    }
    // An explicit `null` is forwarded as-is; only `undefined` gets the default.
    let utteranceEndMs = 1e3;
    if (opts.utteranceEndMs === null) {
      utteranceEndMs = null;
    } else if (opts.utteranceEndMs !== void 0) {
      utteranceEndMs = opts.utteranceEndMs;
    }
    const tuning = {
      endpointingMs: opts.endpointingMs ?? 150,
      utteranceEndMs,
      smartFormat: opts.smartFormat ?? true,
      interimResults: opts.interimResults ?? true
    };
    // `vadEvents` is forwarded only when the caller set it, so the base
    // class keeps its own default otherwise.
    if (opts.vadEvents !== void 0) {
      tuning.vadEvents = opts.vadEvents;
    }
    const language = opts.language ?? "en";
    const model = opts.model ?? "nova-3";
    const encoding = opts.encoding ?? "linear16";
    const sampleRate = opts.sampleRate ?? 16e3;
    super(apiKey, language, model, encoding, sampleRate, tuning);
  }
};
|
|
3580
|
+
|
|
3581
|
+
// src/stt/whisper.ts
|
|
3582
|
+
init_esm_shims();
|
|
3583
|
+
|
|
3584
|
+
// src/providers/whisper-stt.ts
|
|
3585
|
+
init_esm_shims();
|
|
3586
|
+
var OPENAI_TRANSCRIPTION_URL = "https://api.openai.com/v1/audio/transcriptions";
|
|
3587
|
+
var DEFAULT_BUFFER_SIZE = 16e3 * 2;
|
|
3588
|
+
var ALLOWED_MODELS = /* @__PURE__ */ new Set(["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"]);
|
|
3589
|
+
function wrapPcmInWav(pcm, sampleRate = 16e3, channels = 1, bitsPerSample = 16) {
|
|
3590
|
+
const dataSize = pcm.length;
|
|
3591
|
+
const header = Buffer.alloc(44);
|
|
3592
|
+
header.write("RIFF", 0);
|
|
3593
|
+
header.writeUInt32LE(36 + dataSize, 4);
|
|
3594
|
+
header.write("WAVE", 8);
|
|
3595
|
+
header.write("fmt ", 12);
|
|
3596
|
+
header.writeUInt32LE(16, 16);
|
|
3597
|
+
header.writeUInt16LE(1, 20);
|
|
3598
|
+
header.writeUInt16LE(channels, 22);
|
|
3599
|
+
header.writeUInt32LE(sampleRate, 24);
|
|
3600
|
+
header.writeUInt32LE(sampleRate * channels * (bitsPerSample / 8), 28);
|
|
2815
3601
|
header.writeUInt16LE(channels * (bitsPerSample / 8), 32);
|
|
2816
3602
|
header.writeUInt16LE(bitsPerSample, 34);
|
|
2817
3603
|
header.write("data", 36);
|
|
@@ -4448,264 +5234,42 @@ var SpeechmaticsSTT = class {
|
|
|
4448
5234
|
close() {
|
|
4449
5235
|
this.running = false;
|
|
4450
5236
|
const ws = this.ws;
|
|
4451
|
-
if (!ws) return;
|
|
4452
|
-
this.ws = null;
|
|
4453
|
-
const sendSafe = (payload) => {
|
|
4454
|
-
if (ws.readyState === WebSocket5.OPEN) {
|
|
4455
|
-
try {
|
|
4456
|
-
ws.send(payload);
|
|
4457
|
-
} catch {
|
|
4458
|
-
}
|
|
4459
|
-
}
|
|
4460
|
-
};
|
|
4461
|
-
sendSafe(
|
|
4462
|
-
JSON.stringify({ message: "EndOfStream", last_seq_no: this.lastSeqNo })
|
|
4463
|
-
);
|
|
4464
|
-
try {
|
|
4465
|
-
ws.close();
|
|
4466
|
-
} catch {
|
|
4467
|
-
}
|
|
4468
|
-
}
|
|
4469
|
-
};
|
|
4470
|
-
|
|
4471
|
-
// src/stt/speechmatics.ts
|
|
4472
|
-
var STT7 = class extends SpeechmaticsSTT {
|
|
4473
|
-
static providerKey = "speechmatics";
|
|
4474
|
-
constructor(opts = {}) {
|
|
4475
|
-
const key = opts.apiKey ?? process.env.SPEECHMATICS_API_KEY;
|
|
4476
|
-
if (!key) {
|
|
4477
|
-
throw new Error(
|
|
4478
|
-
"Speechmatics STT requires an apiKey. Pass { apiKey: 'sm_...' } or set SPEECHMATICS_API_KEY in the environment."
|
|
4479
|
-
);
|
|
4480
|
-
}
|
|
4481
|
-
super(key, opts);
|
|
4482
|
-
}
|
|
4483
|
-
};
|
|
4484
|
-
|
|
4485
|
-
// src/tts/elevenlabs.ts
|
|
4486
|
-
init_esm_shims();
|
|
4487
|
-
|
|
4488
|
-
// src/providers/elevenlabs-tts.ts
|
|
4489
|
-
init_esm_shims();
|
|
4490
|
-
var ELEVENLABS_BASE_URL = "https://api.elevenlabs.io/v1";
|
|
4491
|
-
var ELEVENLABS_VOICE_ID_BY_NAME = {
|
|
4492
|
-
rachel: "21m00Tcm4TlvDq8ikWAM",
|
|
4493
|
-
drew: "29vD33N1CtxCmqQRPOHJ",
|
|
4494
|
-
clyde: "2EiwWnXFnvU5JabPnv8n",
|
|
4495
|
-
paul: "5Q0t7uMcjvnagumLfvZi",
|
|
4496
|
-
domi: "AZnzlk1XvdvUeBnXmlld",
|
|
4497
|
-
dave: "CYw3kZ02Hs0563khs1Fj",
|
|
4498
|
-
fin: "D38z5RcWu1voky8WS1ja",
|
|
4499
|
-
bella: "EXAVITQu4vr4xnSDxMaL",
|
|
4500
|
-
antoni: "ErXwobaYiN019PkySvjV",
|
|
4501
|
-
thomas: "GBv7mTt0atIp3Br8iCZE",
|
|
4502
|
-
charlie: "IKne3meq5aSn9XLyUdCD",
|
|
4503
|
-
george: "JBFqnCBsd6RMkjVDRZzb",
|
|
4504
|
-
emily: "LcfcDJNUP1GQjkzn1xUU",
|
|
4505
|
-
elli: "MF3mGyEYCl7XYWbV9V6O",
|
|
4506
|
-
callum: "N2lVS1w4EtoT3dr4eOWO",
|
|
4507
|
-
patrick: "ODq5zmih8GrVes37Dizd",
|
|
4508
|
-
harry: "SOYHLrjzK2X1ezoPC6cr",
|
|
4509
|
-
liam: "TX3LPaxmHKxFdv7VOQHJ",
|
|
4510
|
-
dorothy: "ThT5KcBeYPX3keUQqHPh",
|
|
4511
|
-
josh: "TxGEqnHWrfWFTfGW9XjX",
|
|
4512
|
-
arnold: "VR6AewLTigWG4xSOukaG",
|
|
4513
|
-
charlotte: "XB0fDUnXU5powFXDhCwa",
|
|
4514
|
-
matilda: "XrExE9yKIg1WjnnlVkGX",
|
|
4515
|
-
matthew: "Yko7PKHZNXotIFUBG7I9",
|
|
4516
|
-
james: "ZQe5CZNOzWyzPSCn5a3c",
|
|
4517
|
-
joseph: "Zlb1dXrM653N07WRdFW3",
|
|
4518
|
-
jeremy: "bVMeCyTHy58xNoL34h3p",
|
|
4519
|
-
michael: "flq6f7yk4E4fJM5XTYuZ",
|
|
4520
|
-
ethan: "g5CIjZEefAph4nQFvHAz",
|
|
4521
|
-
gigi: "jBpfuIE2acCO8z3wKNLl",
|
|
4522
|
-
freya: "jsCqWAovK2LkecY7zXl4",
|
|
4523
|
-
brian: "nPczCjzI2devNBz1zQrb",
|
|
4524
|
-
grace: "oWAxZDx7w5VEj9dCyTzz",
|
|
4525
|
-
daniel: "onwK4e9ZLuTAKqWW03F9",
|
|
4526
|
-
lily: "pFZP5JQG7iQjIQuC4Bku",
|
|
4527
|
-
serena: "pMsXgVXv3BLzUgSXRplE",
|
|
4528
|
-
adam: "pNInz6obpgDQGcFmaJgB",
|
|
4529
|
-
nicole: "piTKgcLEGmPE4e6mEKli",
|
|
4530
|
-
bill: "pqHfZKP75CvOlQylNhV4",
|
|
4531
|
-
jessie: "t0jbNlBVZ17f02VDIeMI",
|
|
4532
|
-
ryan: "wViXBPUzp2ZZixB1xQuM",
|
|
4533
|
-
sam: "yoZ06aMxZJJ28mfd3POQ",
|
|
4534
|
-
glinda: "z9fAnlkpzviPz146aGWa",
|
|
4535
|
-
giovanni: "zcAOhNBS3c14rBihAFp1",
|
|
4536
|
-
mimi: "zrHiDhphv9ZnVXBqCLjz",
|
|
4537
|
-
sarah: "EXAVITQu4vr4xnSDxMaL",
|
|
4538
|
-
alloy: "EXAVITQu4vr4xnSDxMaL"
|
|
4539
|
-
};
|
|
4540
|
-
var VOICE_ID_PATTERN = /^[A-Za-z0-9]{20}$/;
|
|
4541
|
-
function resolveVoiceId(voice) {
|
|
4542
|
-
if (!voice) return voice;
|
|
4543
|
-
if (VOICE_ID_PATTERN.test(voice)) return voice;
|
|
4544
|
-
return ELEVENLABS_VOICE_ID_BY_NAME[voice.toLowerCase()] ?? voice;
|
|
4545
|
-
}
|
|
4546
|
-
var ElevenLabsModel = {
|
|
4547
|
-
V3: "eleven_v3",
|
|
4548
|
-
FLASH_V2_5: "eleven_flash_v2_5",
|
|
4549
|
-
TURBO_V2_5: "eleven_turbo_v2_5",
|
|
4550
|
-
MULTILINGUAL_V2: "eleven_multilingual_v2",
|
|
4551
|
-
MONOLINGUAL_V1: "eleven_monolingual_v1"
|
|
4552
|
-
};
|
|
4553
|
-
var ElevenLabsOutputFormat = {
|
|
4554
|
-
MP3_22050_32: "mp3_22050_32",
|
|
4555
|
-
MP3_44100_32: "mp3_44100_32",
|
|
4556
|
-
MP3_44100_64: "mp3_44100_64",
|
|
4557
|
-
MP3_44100_96: "mp3_44100_96",
|
|
4558
|
-
MP3_44100_128: "mp3_44100_128",
|
|
4559
|
-
MP3_44100_192: "mp3_44100_192",
|
|
4560
|
-
PCM_8000: "pcm_8000",
|
|
4561
|
-
PCM_16000: "pcm_16000",
|
|
4562
|
-
PCM_22050: "pcm_22050",
|
|
4563
|
-
PCM_24000: "pcm_24000",
|
|
4564
|
-
PCM_44100: "pcm_44100",
|
|
4565
|
-
ULAW_8000: "ulaw_8000"
|
|
4566
|
-
};
|
|
4567
|
-
var ElevenLabsTTS = class _ElevenLabsTTS {
|
|
4568
|
-
// Stable pricing/dashboard key — read by stream-handler / metrics via
|
|
4569
|
-
// ``(agent.tts.constructor as any).providerKey``. Without this the cost
|
|
4570
|
-
// calculator falls back to ``constructor.name`` ("ElevenLabsTTS") which
|
|
4571
|
-
// does NOT match the pricing table key "elevenlabs", silently zeroing
|
|
4572
|
-
// TTS cost for callers that construct the raw REST class directly
|
|
4573
|
-
// (exposed at top level as ``ElevenLabsRestTTS``).
|
|
4574
|
-
static providerKey = "elevenlabs";
|
|
4575
|
-
apiKey;
|
|
4576
|
-
voiceId;
|
|
4577
|
-
modelId;
|
|
4578
|
-
outputFormat;
|
|
4579
|
-
voiceSettings;
|
|
4580
|
-
languageCode;
|
|
4581
|
-
chunkSize;
|
|
4582
|
-
constructor(apiKey, voiceIdOrOptions = "21m00Tcm4TlvDq8ikWAM", modelId = ElevenLabsModel.FLASH_V2_5, outputFormat = ElevenLabsOutputFormat.PCM_16000) {
|
|
4583
|
-
this.apiKey = apiKey;
|
|
4584
|
-
if (typeof voiceIdOrOptions === "object") {
|
|
4585
|
-
const o = voiceIdOrOptions;
|
|
4586
|
-
this.voiceId = resolveVoiceId(o.voiceId ?? "21m00Tcm4TlvDq8ikWAM");
|
|
4587
|
-
this.modelId = o.modelId ?? ElevenLabsModel.FLASH_V2_5;
|
|
4588
|
-
this.outputFormat = o.outputFormat ?? ElevenLabsOutputFormat.PCM_16000;
|
|
4589
|
-
this.voiceSettings = o.voiceSettings;
|
|
4590
|
-
this.languageCode = o.languageCode;
|
|
4591
|
-
this.chunkSize = o.chunkSize ?? 4096;
|
|
4592
|
-
} else {
|
|
4593
|
-
this.voiceId = resolveVoiceId(voiceIdOrOptions);
|
|
4594
|
-
this.modelId = modelId;
|
|
4595
|
-
this.outputFormat = outputFormat;
|
|
4596
|
-
this.voiceSettings = void 0;
|
|
4597
|
-
this.languageCode = void 0;
|
|
4598
|
-
this.chunkSize = 4096;
|
|
4599
|
-
}
|
|
4600
|
-
}
|
|
4601
|
-
/**
|
|
4602
|
-
* Construct an instance pre-configured for Twilio Media Streams.
|
|
4603
|
-
*
|
|
4604
|
-
* Sets `outputFormat='ulaw_8000'` so ElevenLabs emits μ-law @ 8 kHz
|
|
4605
|
-
* directly — the exact wire format Twilio's media stream uses — letting
|
|
4606
|
-
* the SDK skip the 16 kHz→8 kHz resample and PCM→μ-law conversion in
|
|
4607
|
-
* `TwilioAudioSender`. Saves ~30–80 ms first-byte and per-frame CPU,
|
|
4608
|
-
* and removes a potential aliasing source.
|
|
4609
|
-
*
|
|
4610
|
-
* `voiceSettings` defaults to a low-bandwidth-friendly profile
|
|
4611
|
-
* (speaker boost off, modest stability) which sounds cleaner at 8 kHz
|
|
4612
|
-
* μ-law than the studio default. Pass an explicit object to override.
|
|
4613
|
-
*/
|
|
4614
|
-
static forTwilio(apiKey, options = {}) {
|
|
4615
|
-
const voiceSettings = options.voiceSettings ?? {
|
|
4616
|
-
// Speaker boost adds high-frequency emphasis that aliases ugly over an
|
|
4617
|
-
// 8 kHz μ-law line. Slightly higher stability tames the excursions
|
|
4618
|
-
// that compander quantization noise can amplify.
|
|
4619
|
-
stability: 0.6,
|
|
4620
|
-
similarity_boost: 0.75,
|
|
4621
|
-
use_speaker_boost: false
|
|
4622
|
-
};
|
|
4623
|
-
return new _ElevenLabsTTS(apiKey, {
|
|
4624
|
-
...options,
|
|
4625
|
-
voiceSettings,
|
|
4626
|
-
outputFormat: ElevenLabsOutputFormat.ULAW_8000
|
|
4627
|
-
});
|
|
4628
|
-
}
|
|
4629
|
-
/**
|
|
4630
|
-
* Construct an instance pre-configured for Telnyx bidirectional media.
|
|
4631
|
-
*
|
|
4632
|
-
* Telnyx's default media-streaming codec is L16 PCM @ 16 kHz, which
|
|
4633
|
-
* matches our default Telnyx handler. We pick `pcm_16000` so the audio
|
|
4634
|
-
* flows end-to-end with zero resampling or transcoding.
|
|
4635
|
-
*
|
|
4636
|
-
* Trade-off: if your Telnyx profile is pinned to PCMU/8000 (μ-law),
|
|
4637
|
-
* construct `ElevenLabsTTS` directly with `outputFormat: 'ulaw_8000'`
|
|
4638
|
-
* — Telnyx supports that natively too.
|
|
4639
|
-
*/
|
|
4640
|
-
static forTelnyx(apiKey, options = {}) {
|
|
4641
|
-
return new _ElevenLabsTTS(apiKey, {
|
|
4642
|
-
...options,
|
|
4643
|
-
outputFormat: ElevenLabsOutputFormat.PCM_16000
|
|
4644
|
-
});
|
|
4645
|
-
}
|
|
4646
|
-
/**
|
|
4647
|
-
* Synthesise text to speech and return the full audio as a single Buffer.
|
|
4648
|
-
*
|
|
4649
|
-
* For large chunks (or when latency matters) call `synthesizeStream` instead.
|
|
4650
|
-
*/
|
|
4651
|
-
async synthesize(text) {
|
|
4652
|
-
const chunks = [];
|
|
4653
|
-
for await (const chunk of this.synthesizeStream(text)) {
|
|
4654
|
-
chunks.push(chunk);
|
|
4655
|
-
}
|
|
4656
|
-
return Buffer.concat(chunks);
|
|
4657
|
-
}
|
|
4658
|
-
/**
|
|
4659
|
-
* Synthesise text and yield audio chunks as they arrive (streaming).
|
|
4660
|
-
*
|
|
4661
|
-
* The yielded buffers are raw PCM at 16 kHz (or whatever `outputFormat` is
|
|
4662
|
-
* configured to). `chunkSize` controls the maximum yield size — 512 is a
|
|
4663
|
-
* good choice for low-latency telephony.
|
|
4664
|
-
*/
|
|
4665
|
-
async *synthesizeStream(text) {
|
|
4666
|
-
const url = `${ELEVENLABS_BASE_URL}/text-to-speech/${encodeURIComponent(this.voiceId)}/stream?output_format=${encodeURIComponent(this.outputFormat)}`;
|
|
4667
|
-
const body = {
|
|
4668
|
-
text,
|
|
4669
|
-
model_id: this.modelId
|
|
4670
|
-
};
|
|
4671
|
-
if (this.voiceSettings) body["voice_settings"] = this.voiceSettings;
|
|
4672
|
-
if (this.languageCode) body["language_code"] = this.languageCode;
|
|
4673
|
-
const response = await fetch(url, {
|
|
4674
|
-
method: "POST",
|
|
4675
|
-
headers: {
|
|
4676
|
-
"xi-api-key": this.apiKey,
|
|
4677
|
-
"Content-Type": "application/json"
|
|
4678
|
-
},
|
|
4679
|
-
body: JSON.stringify(body),
|
|
4680
|
-
signal: AbortSignal.timeout(3e4)
|
|
4681
|
-
});
|
|
4682
|
-
if (!response.ok) {
|
|
4683
|
-
const errBody = await response.text();
|
|
4684
|
-
throw new Error(`ElevenLabs TTS error ${response.status}: ${errBody}`);
|
|
4685
|
-
}
|
|
4686
|
-
if (!response.body) {
|
|
4687
|
-
throw new Error("ElevenLabs TTS: no response body");
|
|
4688
|
-
}
|
|
4689
|
-
const reader = response.body.getReader();
|
|
4690
|
-
try {
|
|
4691
|
-
while (true) {
|
|
4692
|
-
const { done, value } = await reader.read();
|
|
4693
|
-
if (done) break;
|
|
4694
|
-
if (!value || value.length === 0) continue;
|
|
4695
|
-
const buf = Buffer.from(value);
|
|
4696
|
-
for (let offset = 0; offset < buf.length; offset += this.chunkSize) {
|
|
4697
|
-
yield buf.subarray(offset, Math.min(offset + this.chunkSize, buf.length));
|
|
5237
|
+
if (!ws) return;
|
|
5238
|
+
this.ws = null;
|
|
5239
|
+
const sendSafe = (payload) => {
|
|
5240
|
+
if (ws.readyState === WebSocket5.OPEN) {
|
|
5241
|
+
try {
|
|
5242
|
+
ws.send(payload);
|
|
5243
|
+
} catch {
|
|
4698
5244
|
}
|
|
4699
5245
|
}
|
|
4700
|
-
}
|
|
4701
|
-
|
|
4702
|
-
})
|
|
4703
|
-
|
|
5246
|
+
};
|
|
5247
|
+
sendSafe(
|
|
5248
|
+
JSON.stringify({ message: "EndOfStream", last_seq_no: this.lastSeqNo })
|
|
5249
|
+
);
|
|
5250
|
+
try {
|
|
5251
|
+
ws.close();
|
|
5252
|
+
} catch {
|
|
5253
|
+
}
|
|
5254
|
+
}
|
|
5255
|
+
};
|
|
5256
|
+
|
|
5257
|
+
// src/stt/speechmatics.ts
|
|
5258
|
+
var STT7 = class extends SpeechmaticsSTT {
|
|
5259
|
+
static providerKey = "speechmatics";
|
|
5260
|
+
constructor(opts = {}) {
|
|
5261
|
+
const key = opts.apiKey ?? process.env.SPEECHMATICS_API_KEY;
|
|
5262
|
+
if (!key) {
|
|
5263
|
+
throw new Error(
|
|
5264
|
+
"Speechmatics STT requires an apiKey. Pass { apiKey: 'sm_...' } or set SPEECHMATICS_API_KEY in the environment."
|
|
5265
|
+
);
|
|
4704
5266
|
}
|
|
5267
|
+
super(key, opts);
|
|
4705
5268
|
}
|
|
4706
5269
|
};
|
|
4707
5270
|
|
|
4708
5271
|
// src/tts/elevenlabs.ts
|
|
5272
|
+
init_esm_shims();
|
|
4709
5273
|
function resolveApiKey(apiKey) {
|
|
4710
5274
|
const key = apiKey ?? process.env.ELEVENLABS_API_KEY;
|
|
4711
5275
|
if (!key) {
|
|
@@ -4721,7 +5285,7 @@ var TTS = class _TTS extends ElevenLabsTTS {
|
|
|
4721
5285
|
super(resolveApiKey(opts.apiKey), {
|
|
4722
5286
|
voiceId: opts.voiceId ?? "EXAVITQu4vr4xnSDxMaL",
|
|
4723
5287
|
modelId: opts.modelId ?? "eleven_flash_v2_5",
|
|
4724
|
-
outputFormat: opts.outputFormat
|
|
5288
|
+
...opts.outputFormat !== void 0 ? { outputFormat: opts.outputFormat } : {},
|
|
4725
5289
|
languageCode: opts.languageCode,
|
|
4726
5290
|
voiceSettings: opts.voiceSettings
|
|
4727
5291
|
});
|
|
@@ -4764,9 +5328,11 @@ var PLAN_REQUIRED_MSG = "ElevenLabs WS streaming requires a Pro plan or higher (
|
|
|
4764
5328
|
function sanitiseLogStr(value, limit = 200) {
|
|
4765
5329
|
return String(value).replace(/[\r\n\x00]/g, " ").slice(0, limit);
|
|
4766
5330
|
}
|
|
4767
|
-
var
|
|
5331
|
+
var CARRIER_NATIVE_FORMAT2 = {
|
|
4768
5332
|
twilio: "ulaw_8000",
|
|
4769
|
-
telnyx: "pcm_16000"
|
|
5333
|
+
telnyx: "pcm_16000",
|
|
5334
|
+
// Plivo streams mulaw 8 kHz (we pin contentType in the answer XML).
|
|
5335
|
+
plivo: "ulaw_8000"
|
|
4770
5336
|
};
|
|
4771
5337
|
var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
|
|
4772
5338
|
static providerKey = "elevenlabs_ws";
|
|
@@ -4792,6 +5358,20 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
|
|
|
4792
5358
|
* changes.
|
|
4793
5359
|
*/
|
|
4794
5360
|
adoptedConnection = null;
|
|
5361
|
+
/**
|
|
5362
|
+
* Active WS for the in-flight ``synthesizeStream`` call, if any. Set
|
|
5363
|
+
* when a stream starts, cleared in its ``finally`` block. The
|
|
5364
|
+
* stream-handler calls ``cancelActiveStream()`` from ``cancelSpeaking``
|
|
5365
|
+
* to unblock the generator's inner ``await Promise<frame>`` — without
|
|
5366
|
+
* it, a barge-in on the firstMessage live path leaves the for-await
|
|
5367
|
+
* stuck waiting for the next frame; ElevenLabs never sends
|
|
5368
|
+
* ``isFinal=true`` after the consumer breaks, the 30 s frame timeout
|
|
5369
|
+
* fires post-call, and meanwhile ``initPipeline`` never returns so
|
|
5370
|
+
* the STT ``onTranscript`` callback never registers and subsequent
|
|
5371
|
+
* user turns are silently dropped (root cause of the 2026-05-20
|
|
5372
|
+
* "first message OK, then no response" symptom).
|
|
5373
|
+
*/
|
|
5374
|
+
activeStreamWs = null;
|
|
4795
5375
|
/**
|
|
4796
5376
|
* The wire format requested over the ElevenLabs WS. Initially set from
|
|
4797
5377
|
* the constructor; ``setTelephonyCarrier`` may auto-flip it to the
|
|
@@ -4836,10 +5416,36 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
|
|
|
4836
5416
|
*/
|
|
4837
5417
|
setTelephonyCarrier(carrier) {
|
|
4838
5418
|
if (this._outputFormatExplicit) return;
|
|
4839
|
-
const native =
|
|
5419
|
+
const native = CARRIER_NATIVE_FORMAT2[carrier];
|
|
4840
5420
|
if (!native) return;
|
|
4841
5421
|
this._outputFormat = native;
|
|
4842
5422
|
}
|
|
5423
|
+
/**
|
|
5424
|
+
* Force-close the WebSocket of any in-flight ``synthesizeStream`` call.
|
|
5425
|
+
* Called by the stream-handler from ``cancelSpeaking`` (barge-in) so
|
|
5426
|
+
* the generator's inner ``await Promise<frame>`` loop unblocks cleanly
|
|
5427
|
+
* via the ``onClose`` handler — instead of waiting up to 30 s for the
|
|
5428
|
+
* ``FRAME_TIMEOUT_MS`` watchdog to fire. No-op when no stream is in
|
|
5429
|
+
* flight or when the WS is already closing.
|
|
5430
|
+
*
|
|
5431
|
+
* Without this, a barge-in during the firstMessage live path left the
|
|
5432
|
+
* for-await stuck (ElevenLabs never sends ``isFinal=true`` after the
|
|
5433
|
+
* consumer breaks), ``initPipeline`` never returned, the STT
|
|
5434
|
+
* ``onTranscript`` callback never registered, and the entire remainder
|
|
5435
|
+
* of the call was silent for the user. Surfaced during the 2026-05-20
|
|
5436
|
+
* acceptance run.
|
|
5437
|
+
*/
|
|
5438
|
+
cancelActiveStream() {
|
|
5439
|
+
const ws = this.activeStreamWs;
|
|
5440
|
+
if (!ws) return;
|
|
5441
|
+
this.activeStreamWs = null;
|
|
5442
|
+
try {
|
|
5443
|
+
if (ws.readyState === WebSocket6.OPEN || ws.readyState === WebSocket6.CONNECTING) {
|
|
5444
|
+
ws.close();
|
|
5445
|
+
}
|
|
5446
|
+
} catch {
|
|
5447
|
+
}
|
|
5448
|
+
}
|
|
4843
5449
|
/** Pre-configured for Twilio Media Streams (`ulaw_8000`). */
|
|
4844
5450
|
static forTwilio(opts) {
|
|
4845
5451
|
return new _ElevenLabsWebSocketTTS({
|
|
@@ -4925,6 +5531,7 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
|
|
|
4925
5531
|
headers: { "xi-api-key": this.apiKey }
|
|
4926
5532
|
});
|
|
4927
5533
|
}
|
|
5534
|
+
this.activeStreamWs = ws;
|
|
4928
5535
|
const queue = [];
|
|
4929
5536
|
let done = false;
|
|
4930
5537
|
let pendingError = null;
|
|
@@ -5045,6 +5652,7 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
|
|
|
5045
5652
|
}
|
|
5046
5653
|
} finally {
|
|
5047
5654
|
if (connectTimer) clearTimeout(connectTimer);
|
|
5655
|
+
if (this.activeStreamWs === ws) this.activeStreamWs = null;
|
|
5048
5656
|
try {
|
|
5049
5657
|
if (ws.readyState === WebSocket6.OPEN) {
|
|
5050
5658
|
ws.send(JSON.stringify({ text: "" }));
|
|
@@ -5217,9 +5825,9 @@ function buildOpts(opts) {
|
|
|
5217
5825
|
const out = {
|
|
5218
5826
|
apiKey: resolveApiKey2(opts.apiKey),
|
|
5219
5827
|
modelId: opts.modelId ?? "eleven_flash_v2_5",
|
|
5220
|
-
outputFormat: opts.outputFormat ?? "pcm_16000",
|
|
5221
5828
|
autoMode: opts.autoMode ?? true
|
|
5222
5829
|
};
|
|
5830
|
+
if (opts.outputFormat !== void 0) out.outputFormat = opts.outputFormat;
|
|
5223
5831
|
if (opts.voiceId !== void 0) out.voiceId = opts.voiceId;
|
|
5224
5832
|
if (opts.voiceSettings !== void 0) out.voiceSettings = opts.voiceSettings;
|
|
5225
5833
|
if (opts.languageCode !== void 0) out.languageCode = opts.languageCode;
|
|
@@ -5398,266 +6006,75 @@ var OpenAITTS = class _OpenAITTS {
|
|
|
5398
6006
|
let s = Math.round(y);
|
|
5399
6007
|
if (s > 32767) s = 32767;
|
|
5400
6008
|
else if (s < -32768) s = -32768;
|
|
5401
|
-
samples.push(s);
|
|
5402
|
-
} else {
|
|
5403
|
-
samples.push(x);
|
|
5404
|
-
}
|
|
5405
|
-
}
|
|
5406
|
-
if (lpf) ctx.lpfPrev = y;
|
|
5407
|
-
const out = [];
|
|
5408
|
-
let i = 0;
|
|
5409
|
-
if (direct8k) {
|
|
5410
|
-
while (i + 2 < samples.length) {
|
|
5411
|
-
out.push(samples[i]);
|
|
5412
|
-
i += 3;
|
|
5413
|
-
}
|
|
5414
|
-
} else {
|
|
5415
|
-
while (i + 2 < samples.length) {
|
|
5416
|
-
out.push(samples[i]);
|
|
5417
|
-
out.push(Math.round((samples[i + 1] + samples[i + 2]) / 2));
|
|
5418
|
-
i += 3;
|
|
5419
|
-
}
|
|
5420
|
-
}
|
|
5421
|
-
ctx.leftover = samples.slice(i);
|
|
5422
|
-
const buffer = Buffer.alloc(out.length * 2);
|
|
5423
|
-
for (let j = 0; j < out.length; j++) {
|
|
5424
|
-
buffer.writeInt16LE(out[j], j * 2);
|
|
5425
|
-
}
|
|
5426
|
-
return buffer;
|
|
5427
|
-
}
|
|
5428
|
-
/** @deprecated use {@link resampleStreaming} with persistent state. */
|
|
5429
|
-
static resample24kTo16k(audio) {
|
|
5430
|
-
const ctx = {
|
|
5431
|
-
carryByte: null,
|
|
5432
|
-
leftover: [],
|
|
5433
|
-
lpfPrev: 0,
|
|
5434
|
-
lpfEnabled: false,
|
|
5435
|
-
targetSampleRate: 16e3
|
|
5436
|
-
};
|
|
5437
|
-
const out = _OpenAITTS.resampleStreaming(audio, ctx);
|
|
5438
|
-
if (ctx.leftover.length === 0) return out;
|
|
5439
|
-
const tail = Buffer.alloc(ctx.leftover.length * 2);
|
|
5440
|
-
for (let i = 0; i < ctx.leftover.length; i++) {
|
|
5441
|
-
tail.writeInt16LE(ctx.leftover[i], i * 2);
|
|
5442
|
-
}
|
|
5443
|
-
return Buffer.concat([out, tail]);
|
|
5444
|
-
}
|
|
5445
|
-
};
|
|
5446
|
-
|
|
5447
|
-
// src/tts/openai.ts
|
|
5448
|
-
var TTS3 = class extends OpenAITTS {
|
|
5449
|
-
static providerKey = "openai_tts";
|
|
5450
|
-
constructor(opts = {}) {
|
|
5451
|
-
const key = opts.apiKey ?? process.env.OPENAI_API_KEY;
|
|
5452
|
-
if (!key) {
|
|
5453
|
-
throw new Error(
|
|
5454
|
-
"OpenAI TTS requires an apiKey. Pass { apiKey: 'sk-...' } or set OPENAI_API_KEY in the environment."
|
|
5455
|
-
);
|
|
5456
|
-
}
|
|
5457
|
-
super(
|
|
5458
|
-
key,
|
|
5459
|
-
opts.voice ?? "alloy",
|
|
5460
|
-
opts.model ?? "gpt-4o-mini-tts",
|
|
5461
|
-
opts.instructions ?? null,
|
|
5462
|
-
opts.speed ?? null,
|
|
5463
|
-
opts.antiAlias ?? false
|
|
5464
|
-
);
|
|
5465
|
-
}
|
|
5466
|
-
};
|
|
5467
|
-
|
|
5468
|
-
// src/tts/cartesia.ts
|
|
5469
|
-
init_esm_shims();
|
|
5470
|
-
|
|
5471
|
-
// src/providers/cartesia-tts.ts
|
|
5472
|
-
init_esm_shims();
|
|
5473
|
-
var CARTESIA_BASE_URL = "https://api.cartesia.ai";
|
|
5474
|
-
var CARTESIA_API_VERSION = "2025-04-16";
|
|
5475
|
-
var CARTESIA_DEFAULT_VOICE_ID = "f786b574-daa5-4673-aa0c-cbe3e8534c02";
|
|
5476
|
-
var CartesiaTTSModel = {
|
|
5477
|
-
SONIC_3: "sonic-3",
|
|
5478
|
-
SONIC_2: "sonic-2",
|
|
5479
|
-
SONIC: "sonic"
|
|
5480
|
-
};
|
|
5481
|
-
var CartesiaTTSContainer = {
|
|
5482
|
-
RAW: "raw",
|
|
5483
|
-
WAV: "wav",
|
|
5484
|
-
MP3: "mp3"
|
|
5485
|
-
};
|
|
5486
|
-
var CartesiaTTSEncoding = {
|
|
5487
|
-
PCM_S16LE: "pcm_s16le",
|
|
5488
|
-
PCM_F32LE: "pcm_f32le",
|
|
5489
|
-
PCM_MULAW: "pcm_mulaw",
|
|
5490
|
-
PCM_ALAW: "pcm_alaw"
|
|
5491
|
-
};
|
|
5492
|
-
var CartesiaTTSSampleRate = {
|
|
5493
|
-
HZ_8000: 8e3,
|
|
5494
|
-
HZ_16000: 16e3,
|
|
5495
|
-
HZ_22050: 22050,
|
|
5496
|
-
HZ_24000: 24e3,
|
|
5497
|
-
HZ_44100: 44100
|
|
5498
|
-
};
|
|
5499
|
-
var CartesiaTTSVoiceMode = {
|
|
5500
|
-
ID: "id",
|
|
5501
|
-
EMBEDDING: "embedding"
|
|
5502
|
-
};
|
|
5503
|
-
var CartesiaTTS = class _CartesiaTTS {
|
|
5504
|
-
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5505
|
-
static providerKey = "cartesia_tts";
|
|
5506
|
-
apiKey;
|
|
5507
|
-
model;
|
|
5508
|
-
voice;
|
|
5509
|
-
language;
|
|
5510
|
-
sampleRate;
|
|
5511
|
-
speed;
|
|
5512
|
-
emotion;
|
|
5513
|
-
volume;
|
|
5514
|
-
baseUrl;
|
|
5515
|
-
apiVersion;
|
|
5516
|
-
constructor(apiKey, opts = {}) {
|
|
5517
|
-
this.apiKey = apiKey;
|
|
5518
|
-
this.model = opts.model ?? CartesiaTTSModel.SONIC_3;
|
|
5519
|
-
this.voice = opts.voice ?? CARTESIA_DEFAULT_VOICE_ID;
|
|
5520
|
-
this.language = opts.language ?? "en";
|
|
5521
|
-
this.sampleRate = opts.sampleRate ?? CartesiaTTSSampleRate.HZ_16000;
|
|
5522
|
-
this.speed = opts.speed;
|
|
5523
|
-
this.emotion = typeof opts.emotion === "string" ? [opts.emotion] : opts.emotion;
|
|
5524
|
-
this.volume = opts.volume;
|
|
5525
|
-
this.baseUrl = opts.baseUrl ?? CARTESIA_BASE_URL;
|
|
5526
|
-
this.apiVersion = opts.apiVersion ?? CARTESIA_API_VERSION;
|
|
5527
|
-
}
|
|
5528
|
-
/**
|
|
5529
|
-
* Construct an instance pre-configured for Twilio Media Streams.
|
|
5530
|
-
*
|
|
5531
|
-
* Sets `sampleRate=8000` so Cartesia emits PCM_S16LE @ 8 kHz directly.
|
|
5532
|
-
* Twilio's media stream uses μ-law @ 8 kHz so the SDK still does the
|
|
5533
|
-
* PCM → μ-law transcode client-side, but the 16 kHz → 8 kHz resample
|
|
5534
|
-
* step is skipped. Saves ~10–30 ms first-byte plus per-frame CPU and
|
|
5535
|
-
* removes a potential aliasing source.
|
|
5536
|
-
*/
|
|
5537
|
-
static forTwilio(apiKey, options = {}) {
|
|
5538
|
-
return new _CartesiaTTS(apiKey, {
|
|
5539
|
-
...options,
|
|
5540
|
-
sampleRate: CartesiaTTSSampleRate.HZ_8000
|
|
5541
|
-
});
|
|
5542
|
-
}
|
|
5543
|
-
/**
|
|
5544
|
-
* Construct an instance pre-configured for Telnyx bidirectional media.
|
|
5545
|
-
*
|
|
5546
|
-
* Sets `sampleRate=16000` to match Telnyx's L16/16000 default codec —
|
|
5547
|
-
* audio flows end-to-end with zero resampling or transcoding. Same as
|
|
5548
|
-
* the bare-constructor default; exists for API symmetry with
|
|
5549
|
-
* {@link CartesiaTTS.forTwilio}.
|
|
5550
|
-
*/
|
|
5551
|
-
static forTelnyx(apiKey, options = {}) {
|
|
5552
|
-
return new _CartesiaTTS(apiKey, {
|
|
5553
|
-
...options,
|
|
5554
|
-
sampleRate: CartesiaTTSSampleRate.HZ_16000
|
|
5555
|
-
});
|
|
5556
|
-
}
|
|
5557
|
-
/** Build the JSON payload for the Cartesia bytes endpoint. */
|
|
5558
|
-
buildPayload(text) {
|
|
5559
|
-
const payload = {
|
|
5560
|
-
model_id: this.model,
|
|
5561
|
-
voice: { mode: CartesiaTTSVoiceMode.ID, id: this.voice },
|
|
5562
|
-
transcript: text,
|
|
5563
|
-
output_format: {
|
|
5564
|
-
container: CartesiaTTSContainer.RAW,
|
|
5565
|
-
encoding: CartesiaTTSEncoding.PCM_S16LE,
|
|
5566
|
-
sample_rate: this.sampleRate
|
|
5567
|
-
},
|
|
5568
|
-
language: this.language
|
|
5569
|
-
};
|
|
5570
|
-
const generationConfig = {};
|
|
5571
|
-
if (this.speed !== void 0) generationConfig.speed = this.speed;
|
|
5572
|
-
if (this.emotion && this.emotion.length > 0)
|
|
5573
|
-
generationConfig.emotion = this.emotion[0];
|
|
5574
|
-
if (this.volume !== void 0) generationConfig.volume = this.volume;
|
|
5575
|
-
if (Object.keys(generationConfig).length > 0) {
|
|
5576
|
-
payload.generation_config = generationConfig;
|
|
5577
|
-
}
|
|
5578
|
-
return payload;
|
|
5579
|
-
}
|
|
5580
|
-
/**
|
|
5581
|
-
* Pre-call HTTP warmup for the Cartesia `/tts/bytes` endpoint.
|
|
5582
|
-
*
|
|
5583
|
-
* Issues a lightweight `GET <baseUrl>/voices` so DNS, TLS, and HTTP/2
|
|
5584
|
-
* are already up by the time the first `synthesizeStream()` POST
|
|
5585
|
-
* lands. Best-effort: 5 s timeout, all exceptions swallowed at
|
|
5586
|
-
* debug level.
|
|
5587
|
-
*
|
|
5588
|
-
* Billing safety: `GET /voices` is a free metadata read on
|
|
5589
|
-
* Cartesia's REST surface (per https://docs.cartesia.ai). It does
|
|
5590
|
-
* not consume synthesis credits. The actual synthesis is billed
|
|
5591
|
-
* only when `POST /tts/bytes` runs with a non-empty `transcript`.
|
|
5592
|
-
*
|
|
5593
|
-
* Note: Cartesia TTS uses the HTTP path (vs the WebSocket variant
|
|
5594
|
-
* Cartesia also exposes) — connection warmup is therefore HTTP-GET
|
|
5595
|
-
* based, not WebSocket pre-handshake. The latency win is smaller
|
|
5596
|
-
* (~50-150 ms vs the ~200-500 ms of a WS prewarm) but still real.
|
|
5597
|
-
*/
|
|
5598
|
-
async warmup() {
|
|
5599
|
-
try {
|
|
5600
|
-
await fetch(`${this.baseUrl}/voices`, {
|
|
5601
|
-
method: "GET",
|
|
5602
|
-
headers: {
|
|
5603
|
-
"X-API-Key": this.apiKey,
|
|
5604
|
-
"Cartesia-Version": this.apiVersion
|
|
5605
|
-
},
|
|
5606
|
-
signal: AbortSignal.timeout(5e3)
|
|
5607
|
-
});
|
|
5608
|
-
} catch (err) {
|
|
5609
|
-
getLogger().debug(`Cartesia TTS warmup failed (best-effort): ${String(err)}`);
|
|
6009
|
+
samples.push(s);
|
|
6010
|
+
} else {
|
|
6011
|
+
samples.push(x);
|
|
6012
|
+
}
|
|
5610
6013
|
}
|
|
5611
|
-
|
|
5612
|
-
|
|
5613
|
-
|
|
5614
|
-
|
|
5615
|
-
|
|
5616
|
-
|
|
6014
|
+
if (lpf) ctx.lpfPrev = y;
|
|
6015
|
+
const out = [];
|
|
6016
|
+
let i = 0;
|
|
6017
|
+
if (direct8k) {
|
|
6018
|
+
while (i + 2 < samples.length) {
|
|
6019
|
+
out.push(samples[i]);
|
|
6020
|
+
i += 3;
|
|
6021
|
+
}
|
|
6022
|
+
} else {
|
|
6023
|
+
while (i + 2 < samples.length) {
|
|
6024
|
+
out.push(samples[i]);
|
|
6025
|
+
out.push(Math.round((samples[i + 1] + samples[i + 2]) / 2));
|
|
6026
|
+
i += 3;
|
|
6027
|
+
}
|
|
5617
6028
|
}
|
|
5618
|
-
|
|
5619
|
-
|
|
5620
|
-
|
|
5621
|
-
|
|
5622
|
-
* `sampleRate` as they arrive from Cartesia.
|
|
5623
|
-
*/
|
|
5624
|
-
async *synthesizeStream(text) {
|
|
5625
|
-
const response = await fetch(`${this.baseUrl}/tts/bytes`, {
|
|
5626
|
-
method: "POST",
|
|
5627
|
-
headers: {
|
|
5628
|
-
"X-API-Key": this.apiKey,
|
|
5629
|
-
"Cartesia-Version": this.apiVersion,
|
|
5630
|
-
"Content-Type": "application/json"
|
|
5631
|
-
},
|
|
5632
|
-
body: JSON.stringify(this.buildPayload(text)),
|
|
5633
|
-
signal: AbortSignal.timeout(3e4)
|
|
5634
|
-
});
|
|
5635
|
-
if (!response.ok) {
|
|
5636
|
-
const body = await response.text();
|
|
5637
|
-
throw new Error(`Cartesia TTS error ${response.status}: ${body}`);
|
|
6029
|
+
ctx.leftover = samples.slice(i);
|
|
6030
|
+
const buffer = Buffer.alloc(out.length * 2);
|
|
6031
|
+
for (let j = 0; j < out.length; j++) {
|
|
6032
|
+
buffer.writeInt16LE(out[j], j * 2);
|
|
5638
6033
|
}
|
|
5639
|
-
|
|
5640
|
-
|
|
6034
|
+
return buffer;
|
|
6035
|
+
}
|
|
6036
|
+
/** @deprecated use {@link resampleStreaming} with persistent state. */
|
|
6037
|
+
static resample24kTo16k(audio) {
|
|
6038
|
+
const ctx = {
|
|
6039
|
+
carryByte: null,
|
|
6040
|
+
leftover: [],
|
|
6041
|
+
lpfPrev: 0,
|
|
6042
|
+
lpfEnabled: false,
|
|
6043
|
+
targetSampleRate: 16e3
|
|
6044
|
+
};
|
|
6045
|
+
const out = _OpenAITTS.resampleStreaming(audio, ctx);
|
|
6046
|
+
if (ctx.leftover.length === 0) return out;
|
|
6047
|
+
const tail = Buffer.alloc(ctx.leftover.length * 2);
|
|
6048
|
+
for (let i = 0; i < ctx.leftover.length; i++) {
|
|
6049
|
+
tail.writeInt16LE(ctx.leftover[i], i * 2);
|
|
5641
6050
|
}
|
|
5642
|
-
|
|
5643
|
-
|
|
5644
|
-
|
|
5645
|
-
|
|
5646
|
-
|
|
5647
|
-
|
|
5648
|
-
|
|
5649
|
-
|
|
5650
|
-
|
|
5651
|
-
|
|
5652
|
-
|
|
5653
|
-
|
|
5654
|
-
|
|
5655
|
-
reader.releaseLock();
|
|
6051
|
+
return Buffer.concat([out, tail]);
|
|
6052
|
+
}
|
|
6053
|
+
};
|
|
6054
|
+
|
|
6055
|
+
// src/tts/openai.ts
|
|
6056
|
+
var TTS3 = class extends OpenAITTS {
|
|
6057
|
+
static providerKey = "openai_tts";
|
|
6058
|
+
constructor(opts = {}) {
|
|
6059
|
+
const key = opts.apiKey ?? process.env.OPENAI_API_KEY;
|
|
6060
|
+
if (!key) {
|
|
6061
|
+
throw new Error(
|
|
6062
|
+
"OpenAI TTS requires an apiKey. Pass { apiKey: 'sk-...' } or set OPENAI_API_KEY in the environment."
|
|
6063
|
+
);
|
|
5656
6064
|
}
|
|
6065
|
+
super(
|
|
6066
|
+
key,
|
|
6067
|
+
opts.voice ?? "alloy",
|
|
6068
|
+
opts.model ?? "gpt-4o-mini-tts",
|
|
6069
|
+
opts.instructions ?? null,
|
|
6070
|
+
opts.speed ?? null,
|
|
6071
|
+
opts.antiAlias ?? false
|
|
6072
|
+
);
|
|
5657
6073
|
}
|
|
5658
6074
|
};
|
|
5659
6075
|
|
|
5660
6076
|
// src/tts/cartesia.ts
|
|
6077
|
+
init_esm_shims();
|
|
5661
6078
|
function resolveApiKey3(apiKey) {
|
|
5662
6079
|
const key = apiKey ?? process.env.CARTESIA_API_KEY;
|
|
5663
6080
|
if (!key) {
|
|
@@ -5687,150 +6104,6 @@ var TTS4 = class _TTS extends CartesiaTTS {
|
|
|
5687
6104
|
|
|
5688
6105
|
// src/tts/rime.ts
|
|
5689
6106
|
init_esm_shims();
|
|
5690
|
-
|
|
5691
|
-
// src/providers/rime-tts.ts
|
|
5692
|
-
init_esm_shims();
|
|
5693
|
-
var RIME_BASE_URL = "https://users.rime.ai/v1/rime-tts";
|
|
5694
|
-
var RimeModel = {
|
|
5695
|
-
ARCANA: "arcana",
|
|
5696
|
-
MIST: "mist",
|
|
5697
|
-
MIST_V2: "mistv2"
|
|
5698
|
-
};
|
|
5699
|
-
var RimeAudioFormat = {
|
|
5700
|
-
PCM: "audio/pcm",
|
|
5701
|
-
MP3: "audio/mp3",
|
|
5702
|
-
WAV: "audio/wav",
|
|
5703
|
-
MULAW: "audio/mulaw"
|
|
5704
|
-
};
|
|
5705
|
-
var ARCANA_MODEL_TIMEOUT_MS = 60 * 4 * 1e3;
|
|
5706
|
-
var MIST_MODEL_TIMEOUT_MS = 30 * 1e3;
|
|
5707
|
-
function isMistModel(model) {
|
|
5708
|
-
return model.includes(RimeModel.MIST);
|
|
5709
|
-
}
|
|
5710
|
-
function timeoutForModel(model) {
|
|
5711
|
-
if (model === RimeModel.ARCANA) return ARCANA_MODEL_TIMEOUT_MS;
|
|
5712
|
-
return MIST_MODEL_TIMEOUT_MS;
|
|
5713
|
-
}
|
|
5714
|
-
var RimeTTS = class {
|
|
5715
|
-
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5716
|
-
static providerKey = "rime";
|
|
5717
|
-
apiKey;
|
|
5718
|
-
model;
|
|
5719
|
-
speaker;
|
|
5720
|
-
lang;
|
|
5721
|
-
sampleRate;
|
|
5722
|
-
repetitionPenalty;
|
|
5723
|
-
temperature;
|
|
5724
|
-
topP;
|
|
5725
|
-
maxTokens;
|
|
5726
|
-
speedAlpha;
|
|
5727
|
-
reduceLatency;
|
|
5728
|
-
pauseBetweenBrackets;
|
|
5729
|
-
phonemizeBetweenBrackets;
|
|
5730
|
-
baseUrl;
|
|
5731
|
-
totalTimeoutMs;
|
|
5732
|
-
constructor(apiKey, opts = {}) {
|
|
5733
|
-
this.apiKey = apiKey;
|
|
5734
|
-
this.model = opts.model ?? RimeModel.ARCANA;
|
|
5735
|
-
const defaultSpeaker = isMistModel(this.model) ? "cove" : "astra";
|
|
5736
|
-
this.speaker = opts.speaker ?? defaultSpeaker;
|
|
5737
|
-
this.lang = opts.lang ?? "eng";
|
|
5738
|
-
this.sampleRate = opts.sampleRate ?? 16e3;
|
|
5739
|
-
this.repetitionPenalty = opts.repetitionPenalty;
|
|
5740
|
-
this.temperature = opts.temperature;
|
|
5741
|
-
this.topP = opts.topP;
|
|
5742
|
-
this.maxTokens = opts.maxTokens;
|
|
5743
|
-
this.speedAlpha = opts.speedAlpha;
|
|
5744
|
-
this.reduceLatency = opts.reduceLatency;
|
|
5745
|
-
this.pauseBetweenBrackets = opts.pauseBetweenBrackets;
|
|
5746
|
-
this.phonemizeBetweenBrackets = opts.phonemizeBetweenBrackets;
|
|
5747
|
-
this.baseUrl = opts.baseUrl ?? RIME_BASE_URL;
|
|
5748
|
-
this.totalTimeoutMs = timeoutForModel(this.model);
|
|
5749
|
-
}
|
|
5750
|
-
buildPayload(text) {
|
|
5751
|
-
const payload = {
|
|
5752
|
-
speaker: this.speaker,
|
|
5753
|
-
text,
|
|
5754
|
-
modelId: this.model
|
|
5755
|
-
};
|
|
5756
|
-
if (this.model === RimeModel.ARCANA) {
|
|
5757
|
-
if (this.repetitionPenalty !== void 0)
|
|
5758
|
-
payload.repetition_penalty = this.repetitionPenalty;
|
|
5759
|
-
if (this.temperature !== void 0) payload.temperature = this.temperature;
|
|
5760
|
-
if (this.topP !== void 0) payload.top_p = this.topP;
|
|
5761
|
-
if (this.maxTokens !== void 0) payload.max_tokens = this.maxTokens;
|
|
5762
|
-
payload.lang = this.lang;
|
|
5763
|
-
payload.samplingRate = this.sampleRate;
|
|
5764
|
-
} else if (isMistModel(this.model)) {
|
|
5765
|
-
payload.lang = this.lang;
|
|
5766
|
-
payload.samplingRate = this.sampleRate;
|
|
5767
|
-
if (this.speedAlpha !== void 0) payload.speedAlpha = this.speedAlpha;
|
|
5768
|
-
if (this.model === RimeModel.MIST_V2 && this.reduceLatency !== void 0) {
|
|
5769
|
-
payload.reduceLatency = this.reduceLatency;
|
|
5770
|
-
}
|
|
5771
|
-
if (this.pauseBetweenBrackets !== void 0) {
|
|
5772
|
-
payload.pauseBetweenBrackets = this.pauseBetweenBrackets;
|
|
5773
|
-
}
|
|
5774
|
-
if (this.phonemizeBetweenBrackets !== void 0) {
|
|
5775
|
-
payload.phonemizeBetweenBrackets = this.phonemizeBetweenBrackets;
|
|
5776
|
-
}
|
|
5777
|
-
}
|
|
5778
|
-
return payload;
|
|
5779
|
-
}
|
|
5780
|
-
/** Synthesize text and return the concatenated audio buffer. */
|
|
5781
|
-
async synthesize(text) {
|
|
5782
|
-
const chunks = [];
|
|
5783
|
-
for await (const chunk of this.synthesizeStream(text)) {
|
|
5784
|
-
chunks.push(chunk);
|
|
5785
|
-
}
|
|
5786
|
-
return Buffer.concat(chunks);
|
|
5787
|
-
}
|
|
5788
|
-
/**
|
|
5789
|
-
* Synthesize text and yield raw PCM_S16LE chunks at the configured
|
|
5790
|
-
* `sampleRate` as they stream in.
|
|
5791
|
-
*/
|
|
5792
|
-
async *synthesizeStream(text) {
|
|
5793
|
-
const response = await fetch(this.baseUrl, {
|
|
5794
|
-
method: "POST",
|
|
5795
|
-
headers: {
|
|
5796
|
-
accept: RimeAudioFormat.PCM,
|
|
5797
|
-
Authorization: `Bearer ${this.apiKey}`,
|
|
5798
|
-
"content-type": "application/json"
|
|
5799
|
-
},
|
|
5800
|
-
body: JSON.stringify(this.buildPayload(text)),
|
|
5801
|
-
signal: AbortSignal.timeout(this.totalTimeoutMs)
|
|
5802
|
-
});
|
|
5803
|
-
if (!response.ok) {
|
|
5804
|
-
const body = await response.text();
|
|
5805
|
-
throw new Error(`Rime TTS error ${response.status}: ${body}`);
|
|
5806
|
-
}
|
|
5807
|
-
const contentType = response.headers.get("content-type") ?? "";
|
|
5808
|
-
if (!contentType.startsWith("audio")) {
|
|
5809
|
-
const body = await response.text();
|
|
5810
|
-
throw new Error(`Rime returned non-audio response: ${body.slice(0, 500)}`);
|
|
5811
|
-
}
|
|
5812
|
-
if (!response.body) {
|
|
5813
|
-
throw new Error("Rime TTS: no response body");
|
|
5814
|
-
}
|
|
5815
|
-
const reader = response.body.getReader();
|
|
5816
|
-
try {
|
|
5817
|
-
while (true) {
|
|
5818
|
-
const { done, value } = await reader.read();
|
|
5819
|
-
if (done) break;
|
|
5820
|
-
if (value && value.length > 0) {
|
|
5821
|
-
yield Buffer.from(value);
|
|
5822
|
-
}
|
|
5823
|
-
}
|
|
5824
|
-
} finally {
|
|
5825
|
-
if (typeof reader.cancel === "function")
|
|
5826
|
-
await reader.cancel().catch(() => {
|
|
5827
|
-
});
|
|
5828
|
-
reader.releaseLock();
|
|
5829
|
-
}
|
|
5830
|
-
}
|
|
5831
|
-
};
|
|
5832
|
-
|
|
5833
|
-
// src/tts/rime.ts
|
|
5834
6107
|
var TTS5 = class extends RimeTTS {
|
|
5835
6108
|
static providerKey = "rime";
|
|
5836
6109
|
constructor(opts = {}) {
|
|
@@ -6469,12 +6742,6 @@ init_esm_shims();
|
|
|
6469
6742
|
|
|
6470
6743
|
// src/providers/groq-llm.ts
|
|
6471
6744
|
init_esm_shims();
|
|
6472
|
-
|
|
6473
|
-
// src/version.ts
|
|
6474
|
-
init_esm_shims();
|
|
6475
|
-
var VERSION = "0.5.5";
|
|
6476
|
-
|
|
6477
|
-
// src/providers/groq-llm.ts
|
|
6478
6745
|
var GROQ_BASE_URL = "https://api.groq.com/openai/v1";
|
|
6479
6746
|
var GroqModel = {
|
|
6480
6747
|
LLAMA_3_3_70B_VERSATILE: "llama-3.3-70b-versatile",
|
|
@@ -7293,7 +7560,7 @@ var KrispVivaFilter = class {
|
|
|
7293
7560
|
|
|
7294
7561
|
// src/telephony/twilio.ts
|
|
7295
7562
|
init_esm_shims();
|
|
7296
|
-
var
|
|
7563
|
+
var Carrier2 = class {
|
|
7297
7564
|
kind = "twilio";
|
|
7298
7565
|
accountSid;
|
|
7299
7566
|
authToken;
|
|
@@ -7317,7 +7584,7 @@ var Carrier = class {
|
|
|
7317
7584
|
|
|
7318
7585
|
// src/telephony/telnyx.ts
|
|
7319
7586
|
init_esm_shims();
|
|
7320
|
-
var
|
|
7587
|
+
var Carrier3 = class {
|
|
7321
7588
|
kind = "telnyx";
|
|
7322
7589
|
apiKey;
|
|
7323
7590
|
connectionId;
|
|
@@ -8131,12 +8398,28 @@ var TwilioAdapter = class _TwilioAdapter {
|
|
|
8131
8398
|
return { callSid: call.sid };
|
|
8132
8399
|
}
|
|
8133
8400
|
/**
|
|
8134
|
-
* Build a
|
|
8135
|
-
*
|
|
8401
|
+
* Build a ``<Response><Connect><Stream url="...">`` TwiML document.
|
|
8402
|
+
*
|
|
8403
|
+
* ``parameters`` is forwarded as ``<Parameter name="..." value="..."/>``
|
|
8404
|
+
* children of ``<Stream>``. Twilio Media Streams strips query-string params
|
|
8405
|
+
* from the ``<Stream url=...>`` before the WS handshake, so
|
|
8406
|
+
* ``<Parameter>`` tags are the supported way to pre-populate
|
|
8407
|
+
* ``start.customParameters`` on the WS ``start`` frame. Used by the
|
|
8408
|
+
* inbound path to carry caller / callee through to the bridge.
|
|
8409
|
+
*
|
|
8410
|
+
* Mirrors the Python adapter's ``generate_stream_twiml``.
|
|
8136
8411
|
*/
|
|
8137
|
-
static generateStreamTwiml(streamUrl) {
|
|
8138
|
-
const
|
|
8139
|
-
|
|
8412
|
+
static generateStreamTwiml(streamUrl, parameters) {
|
|
8413
|
+
const esc = (s) => s.replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">").replace(/"/g, """).replace(/'/g, "'");
|
|
8414
|
+
const escapedUrl = esc(streamUrl);
|
|
8415
|
+
let paramTags = "";
|
|
8416
|
+
if (parameters) {
|
|
8417
|
+
for (const [name, value] of Object.entries(parameters)) {
|
|
8418
|
+
if (value == null) continue;
|
|
8419
|
+
paramTags += `<Parameter name="${esc(name)}" value="${esc(String(value))}"/>`;
|
|
8420
|
+
}
|
|
8421
|
+
}
|
|
8422
|
+
return `<?xml version="1.0" encoding="UTF-8"?><Response><Connect><Stream url="${escapedUrl}">${paramTags}</Stream></Connect></Response>`;
|
|
8140
8423
|
}
|
|
8141
8424
|
/** Force-complete an in-progress call. */
|
|
8142
8425
|
async endCall(callSid) {
|
|
@@ -8529,6 +8812,8 @@ export {
|
|
|
8529
8812
|
CallMetricsAccumulator,
|
|
8530
8813
|
STT4 as CartesiaSTT,
|
|
8531
8814
|
TTS4 as CartesiaTTS,
|
|
8815
|
+
CartesiaTTSModel,
|
|
8816
|
+
CartesiaTTSVoiceMode,
|
|
8532
8817
|
LLM4 as CerebrasLLM,
|
|
8533
8818
|
ChatContext,
|
|
8534
8819
|
CloudflareTunnel,
|
|
@@ -8536,10 +8821,13 @@ export {
|
|
|
8536
8821
|
DEFAULT_PRICING,
|
|
8537
8822
|
DTMF_EVENTS,
|
|
8538
8823
|
DeepFilterNetFilter,
|
|
8824
|
+
DeepgramModel,
|
|
8539
8825
|
STT as DeepgramSTT,
|
|
8540
8826
|
DefaultToolExecutor,
|
|
8541
8827
|
ConvAI as ElevenLabsConvAI,
|
|
8542
8828
|
ElevenLabsConvAIAdapter,
|
|
8829
|
+
ElevenLabsModel,
|
|
8830
|
+
ElevenLabsOutputFormat,
|
|
8543
8831
|
ElevenLabsTTS as ElevenLabsRestTTS,
|
|
8544
8832
|
TTS as ElevenLabsTTS,
|
|
8545
8833
|
TTS2 as ElevenLabsWebSocketTTS,
|
|
@@ -8568,8 +8856,15 @@ export {
|
|
|
8568
8856
|
Realtime2 as OpenAIRealtime2,
|
|
8569
8857
|
OpenAIRealtime2Adapter,
|
|
8570
8858
|
OpenAIRealtimeAdapter,
|
|
8859
|
+
OpenAIRealtimeAudioFormat,
|
|
8860
|
+
OpenAIRealtimeModel,
|
|
8861
|
+
OpenAIRealtimeVADType,
|
|
8571
8862
|
TTS3 as OpenAITTS,
|
|
8572
8863
|
STT3 as OpenAITranscribeSTT,
|
|
8864
|
+
OpenAITranscriptionModel,
|
|
8865
|
+
OpenAIVoice,
|
|
8866
|
+
PRICING_LAST_UPDATED,
|
|
8867
|
+
PRICING_VERSION,
|
|
8573
8868
|
PartialStreamError,
|
|
8574
8869
|
Patter,
|
|
8575
8870
|
PatterConnectionError,
|
|
@@ -8577,9 +8872,14 @@ export {
|
|
|
8577
8872
|
PatterTool,
|
|
8578
8873
|
PcmCarry,
|
|
8579
8874
|
PipelineHookExecutor,
|
|
8875
|
+
Carrier as Plivo,
|
|
8876
|
+
PlivoAdapter,
|
|
8877
|
+
PricingUnit,
|
|
8580
8878
|
ProvisionError,
|
|
8581
8879
|
RateLimitError,
|
|
8582
8880
|
RemoteMessageHandler,
|
|
8881
|
+
RimeAudioFormat,
|
|
8882
|
+
RimeModel,
|
|
8583
8883
|
TTS5 as RimeTTS,
|
|
8584
8884
|
SPAN_BARGEIN,
|
|
8585
8885
|
SPAN_CALL,
|
|
@@ -8600,7 +8900,7 @@ export {
|
|
|
8600
8900
|
TurnDetectionMode as SpeechmaticsTurnDetectionMode,
|
|
8601
8901
|
StatefulResampler,
|
|
8602
8902
|
Static as StaticTunnel,
|
|
8603
|
-
|
|
8903
|
+
Carrier3 as Telnyx,
|
|
8604
8904
|
TelnyxAdapter,
|
|
8605
8905
|
TelnyxSTT,
|
|
8606
8906
|
TelnyxSTTInputFormat,
|
|
@@ -8611,7 +8911,7 @@ export {
|
|
|
8611
8911
|
TestSession,
|
|
8612
8912
|
TfidfLoopDetector,
|
|
8613
8913
|
Tool,
|
|
8614
|
-
|
|
8914
|
+
Carrier2 as Twilio,
|
|
8615
8915
|
TwilioAdapter,
|
|
8616
8916
|
ULTRAVOX_DEFAULT_API_BASE,
|
|
8617
8917
|
ULTRAVOX_DEFAULT_SR,
|