getpatter 0.6.0 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/barge-in-strategies-X6ARMGIQ.mjs +12 -0
- package/dist/chunk-CL2U3YET.mjs +1429 -0
- package/dist/chunk-D4424JZR.mjs +71 -0
- package/dist/{chunk-JUQ5WQTQ.mjs → chunk-LE63CSOB.mjs} +1424 -969
- package/dist/{chunk-X3364LSI.mjs → chunk-R2T4JABZ.mjs} +49 -2
- package/dist/cli.js +315 -37
- package/dist/dashboard/ui.html +13 -13
- package/dist/index.d.mts +2136 -709
- package/dist/index.d.ts +2136 -709
- package/dist/index.js +5674 -2233
- package/dist/index.mjs +2338 -915
- package/dist/openai-realtime-2-CNFARP25.mjs +8 -0
- package/dist/{silero-vad-YLCXT5GQ.mjs → silero-vad-LNDFGIY7.mjs} +1 -1
- package/dist/{test-mode-Y7YG5LFZ.mjs → test-mode-RS57BDM6.mjs} +2 -1
- package/package.json +1 -1
- package/src/dashboard/ui.html +13 -13
package/dist/index.mjs
CHANGED
|
@@ -6,6 +6,7 @@ import {
|
|
|
6
6
|
CallMetricsAccumulator,
|
|
7
7
|
DEFAULT_MIN_SENTENCE_LEN,
|
|
8
8
|
DEFAULT_PRICING,
|
|
9
|
+
DeepgramModel,
|
|
9
10
|
DeepgramSTT,
|
|
10
11
|
DefaultToolExecutor,
|
|
11
12
|
ElevenLabsConvAIAdapter,
|
|
@@ -15,11 +16,12 @@ import {
|
|
|
15
16
|
LLMLoop,
|
|
16
17
|
MetricsStore,
|
|
17
18
|
OpenAILLMProvider,
|
|
18
|
-
|
|
19
|
+
PRICING_LAST_UPDATED,
|
|
20
|
+
PRICING_VERSION,
|
|
19
21
|
PatterConnectionError,
|
|
20
22
|
PatterError,
|
|
21
|
-
PcmCarry,
|
|
22
23
|
PipelineHookExecutor,
|
|
24
|
+
PricingUnit,
|
|
23
25
|
ProvisionError,
|
|
24
26
|
RateLimitError,
|
|
25
27
|
RemoteMessageHandler,
|
|
@@ -31,18 +33,14 @@ import {
|
|
|
31
33
|
SPAN_TOOL,
|
|
32
34
|
SPAN_TTS,
|
|
33
35
|
SentenceChunker,
|
|
34
|
-
StatefulResampler,
|
|
35
36
|
TestSession,
|
|
37
|
+
VERSION,
|
|
36
38
|
calculateRealtimeCost,
|
|
37
39
|
calculateSttCost,
|
|
38
40
|
calculateTelephonyCost,
|
|
39
41
|
calculateTtsCost,
|
|
40
42
|
callsToCsv,
|
|
41
43
|
callsToJson,
|
|
42
|
-
createResampler16kTo8k,
|
|
43
|
-
createResampler24kTo16k,
|
|
44
|
-
createResampler24kTo8k,
|
|
45
|
-
createResampler8kTo16k,
|
|
46
44
|
initTracing,
|
|
47
45
|
isRemoteUrl,
|
|
48
46
|
isTracingEnabled,
|
|
@@ -52,14 +50,34 @@ import {
|
|
|
52
50
|
mergePricing,
|
|
53
51
|
mountApi,
|
|
54
52
|
mountDashboard,
|
|
53
|
+
resolveLogRoot,
|
|
54
|
+
startSpan
|
|
55
|
+
} from "./chunk-LE63CSOB.mjs";
|
|
56
|
+
import {
|
|
57
|
+
OpenAIRealtime2Adapter,
|
|
58
|
+
OpenAIRealtimeAdapter,
|
|
59
|
+
OpenAIRealtimeAudioFormat,
|
|
60
|
+
OpenAIRealtimeModel,
|
|
61
|
+
OpenAIRealtimeVADType,
|
|
62
|
+
OpenAITranscriptionModel,
|
|
63
|
+
OpenAIVoice,
|
|
64
|
+
PcmCarry,
|
|
65
|
+
StatefulResampler,
|
|
66
|
+
createResampler16kTo8k,
|
|
67
|
+
createResampler24kTo16k,
|
|
68
|
+
createResampler24kTo8k,
|
|
69
|
+
createResampler8kTo16k,
|
|
55
70
|
mulawToPcm16,
|
|
56
71
|
pcm16ToMulaw,
|
|
57
72
|
resample16kTo8k,
|
|
58
73
|
resample24kTo16k,
|
|
59
|
-
resample8kTo16k
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
74
|
+
resample8kTo16k
|
|
75
|
+
} from "./chunk-CL2U3YET.mjs";
|
|
76
|
+
import {
|
|
77
|
+
MinWordsStrategy,
|
|
78
|
+
evaluateStrategies,
|
|
79
|
+
resetStrategies
|
|
80
|
+
} from "./chunk-D4424JZR.mjs";
|
|
63
81
|
import {
|
|
64
82
|
getLogger,
|
|
65
83
|
setLogger
|
|
@@ -69,7 +87,7 @@ import {
|
|
|
69
87
|
} from "./chunk-6GR5MHHQ.mjs";
|
|
70
88
|
import {
|
|
71
89
|
SileroVAD
|
|
72
|
-
} from "./chunk-X3364LSI.mjs";
|
|
90
|
+
} from "./chunk-R2T4JABZ.mjs";
|
|
73
91
|
import {
|
|
74
92
|
__dirname,
|
|
75
93
|
__require,
|
|
@@ -99,7 +117,31 @@ var Realtime = class {
|
|
|
99
117
|
);
|
|
100
118
|
}
|
|
101
119
|
this.apiKey = key;
|
|
102
|
-
this.model = opts.model ?? "gpt-
|
|
120
|
+
this.model = opts.model ?? "gpt-realtime-mini";
|
|
121
|
+
this.voice = opts.voice ?? "alloy";
|
|
122
|
+
this.reasoningEffort = opts.reasoningEffort;
|
|
123
|
+
this.inputAudioTranscriptionModel = opts.inputAudioTranscriptionModel;
|
|
124
|
+
}
|
|
125
|
+
};
|
|
126
|
+
|
|
127
|
+
// src/engines/openai-2.ts
|
|
128
|
+
init_esm_shims();
|
|
129
|
+
var Realtime2 = class {
|
|
130
|
+
kind = "openai_realtime_2";
|
|
131
|
+
apiKey;
|
|
132
|
+
model;
|
|
133
|
+
voice;
|
|
134
|
+
reasoningEffort;
|
|
135
|
+
inputAudioTranscriptionModel;
|
|
136
|
+
constructor(opts = {}) {
|
|
137
|
+
const key = opts.apiKey ?? process.env.OPENAI_API_KEY;
|
|
138
|
+
if (!key) {
|
|
139
|
+
throw new Error(
|
|
140
|
+
"OpenAI Realtime 2 requires an apiKey. Pass { apiKey: 'sk-...' } or set OPENAI_API_KEY in the environment."
|
|
141
|
+
);
|
|
142
|
+
}
|
|
143
|
+
this.apiKey = key;
|
|
144
|
+
this.model = opts.model ?? "gpt-realtime-2";
|
|
103
145
|
this.voice = opts.voice ?? "alloy";
|
|
104
146
|
this.reasoningEffort = opts.reasoningEffort;
|
|
105
147
|
this.inputAudioTranscriptionModel = opts.inputAudioTranscriptionModel;
|
|
@@ -520,11 +562,41 @@ function filterUndef(obj) {
|
|
|
520
562
|
}
|
|
521
563
|
|
|
522
564
|
// src/client.ts
|
|
565
|
+
var PREWARM_CACHE_MAX = 200;
|
|
566
|
+
var PREWARM_TTL_GRACE_MS = 5e3;
|
|
567
|
+
var PARKED_CONN_TTL_MS = 3e4;
|
|
523
568
|
function resolvePersistRoot(persist) {
|
|
524
569
|
if (persist === false) return null;
|
|
525
570
|
if (persist === true) return resolveLogRoot("auto");
|
|
526
571
|
if (typeof persist === "string") return resolveLogRoot(persist);
|
|
527
|
-
|
|
572
|
+
const envRoot = resolveLogRoot();
|
|
573
|
+
if (envRoot !== null) return envRoot;
|
|
574
|
+
return resolveLogRoot("auto");
|
|
575
|
+
}
|
|
576
|
+
function closeParkedConnections(slot) {
|
|
577
|
+
if (slot.stt) {
|
|
578
|
+
try {
|
|
579
|
+
slot.stt.close();
|
|
580
|
+
} catch {
|
|
581
|
+
}
|
|
582
|
+
}
|
|
583
|
+
if (slot.tts) {
|
|
584
|
+
try {
|
|
585
|
+
slot.tts.ws.close();
|
|
586
|
+
} catch {
|
|
587
|
+
}
|
|
588
|
+
}
|
|
589
|
+
if (slot.openaiRealtime) {
|
|
590
|
+
const wsAny = slot.openaiRealtime;
|
|
591
|
+
if (wsAny._parkedKeepalive) {
|
|
592
|
+
clearInterval(wsAny._parkedKeepalive);
|
|
593
|
+
delete wsAny._parkedKeepalive;
|
|
594
|
+
}
|
|
595
|
+
try {
|
|
596
|
+
slot.openaiRealtime.close();
|
|
597
|
+
} catch {
|
|
598
|
+
}
|
|
599
|
+
}
|
|
528
600
|
}
|
|
529
601
|
var Patter = class {
|
|
530
602
|
localConfig;
|
|
@@ -546,6 +618,65 @@ var Patter = class {
|
|
|
546
618
|
* ``Cannot use both tunnel: true and webhookUrl``.
|
|
547
619
|
*/
|
|
548
620
|
tunnelOwnsWebhookUrl = false;
|
|
621
|
+
/**
|
|
622
|
+
* Pre-rendered first-message TTS audio per outbound call_id. Populated
|
|
623
|
+
* by :meth:`call` when ``agent.prewarmFirstMessage`` is true; consumed
|
|
624
|
+
* by the StreamHandler firstMessage emit so the greeting streams
|
|
625
|
+
* instantly on ``start`` instead of paying the 200-700 ms TTS first-byte
|
|
626
|
+
* latency. See ``AgentOptions.prewarmFirstMessage``.
|
|
627
|
+
*
|
|
628
|
+
* Stores raw bytes in the TTS provider's native sample rate; the
|
|
629
|
+
* carrier-side audio sender resamples on emit.
|
|
630
|
+
*/
|
|
631
|
+
prewarmAudio = /* @__PURE__ */ new Map();
|
|
632
|
+
/**
|
|
633
|
+
* Call IDs whose prewarm cache slot has already been consumed —
|
|
634
|
+
* either by ``popPrewarmAudio`` (cache hit OR miss on the firstMessage
|
|
635
|
+
* emit path) or by ``recordPrewarmWaste`` (call ended before pickup).
|
|
636
|
+
* The prewarm task checks this set BEFORE writing bytes so a slow
|
|
637
|
+
* synth that finishes after the consumer already polled doesn't
|
|
638
|
+
* orphan bytes in ``prewarmAudio``. See FIX #92 in the parity audit.
|
|
639
|
+
*/
|
|
640
|
+
prewarmConsumed = /* @__PURE__ */ new Set();
|
|
641
|
+
/**
|
|
642
|
+
* Background tasks tracked so :meth:`disconnect` can wait on / drop any
|
|
643
|
+
* still-running prewarm-first-message synth before tearing down.
|
|
644
|
+
*/
|
|
645
|
+
prewarmTasks = /* @__PURE__ */ new Set();
|
|
646
|
+
/**
|
|
647
|
+
* TTL eviction timers keyed by call_id so :meth:`disconnect` (and
|
|
648
|
+
* normal consumption / waste-record paths) can cancel any pending
|
|
649
|
+
* timer when the slot drains naturally. Without this, the timer
|
|
650
|
+
* would WARN spuriously after the cache was already emptied.
|
|
651
|
+
*/
|
|
652
|
+
prewarmTtlTimers = /* @__PURE__ */ new Map();
|
|
653
|
+
/**
|
|
654
|
+
* Pre-opened, fully-handshaked provider WebSockets keyed by
|
|
655
|
+
* carrier-issued call_id. Populated by ``parkProviderConnections``
|
|
656
|
+
* during the carrier ringing window; consumed by the per-call
|
|
657
|
+
* StreamHandler at ``start`` via ``adoptWebSocket(...)`` so STT / TTS
|
|
658
|
+
* / Realtime audio can flow on the first turn without paying the
|
|
659
|
+
* 150-900 ms TLS + WS-upgrade + protocol-handshake round-trip again.
|
|
660
|
+
*
|
|
661
|
+
* Distinct from ``prewarmAudio`` (which holds pre-rendered TTS bytes
|
|
662
|
+
* for the first message); the two features are complementary and
|
|
663
|
+
* orthogonal — both can be active for the same call.
|
|
664
|
+
*
|
|
665
|
+
* Each slot may hold up to three parked connections (STT, TTS,
|
|
666
|
+
* Realtime). Drained by:
|
|
667
|
+
* - {@link popPrewarmedConnections} on the carrier ``start`` event
|
|
668
|
+
* (consumed normally — the handles transfer to the StreamHandler)
|
|
669
|
+
* - {@link recordPrewarmWaste} on call-termination paths (no-answer,
|
|
670
|
+
* busy, failed, canceled, AMD voicemail). Closes parked sockets.
|
|
671
|
+
* - {@link disconnect} on Patter teardown. Closes all parked sockets.
|
|
672
|
+
*/
|
|
673
|
+
prewarmedConnections = /* @__PURE__ */ new Map();
|
|
674
|
+
/**
|
|
675
|
+
* TTL eviction handles keyed by call_id for connections that are never
|
|
676
|
+
* adopted (e.g. a carrier that swallows ``start``). Closes the parked
|
|
677
|
+
* sockets so they don't leak past the safety window.
|
|
678
|
+
*/
|
|
679
|
+
prewarmedConnTimers = /* @__PURE__ */ new Map();
|
|
549
680
|
/**
|
|
550
681
|
* Speech-edge events for turn-taking instrumentation. Public surface: the
|
|
551
682
|
* seven `on*` proxy accessors below plus the `conversationState` snapshot.
|
|
@@ -553,13 +684,15 @@ var Patter = class {
|
|
|
553
684
|
* the previous behaviour.
|
|
554
685
|
*
|
|
555
686
|
* See `src/_speech-events.ts` for the full event taxonomy and the
|
|
556
|
-
*
|
|
687
|
+
* OpenAI Realtime alignment table.
|
|
557
688
|
*/
|
|
558
689
|
speechEvents = new SpeechEvents();
|
|
559
690
|
// ---- Speech-edge event callback proxies ------------------------------
|
|
560
|
-
// The seven `on*` properties below
|
|
561
|
-
//
|
|
562
|
-
//
|
|
691
|
+
// The seven `on*` properties below follow the canonical voice-agent
|
|
692
|
+
// metric set (user/agent state transitions, turn boundaries, TTFT, audio
|
|
693
|
+
// first-byte) and align with OpenAI Realtime where applicable. They
|
|
694
|
+
// proxy to `speechEvents` so the dispatcher remains the single source of
|
|
695
|
+
// truth (state + OTel).
|
|
563
696
|
get onUserSpeechStarted() {
|
|
564
697
|
return this.speechEvents.onUserSpeechStarted;
|
|
565
698
|
}
|
|
@@ -604,8 +737,8 @@ var Patter = class {
|
|
|
604
737
|
}
|
|
605
738
|
/**
|
|
606
739
|
* Snapshot of the current per-side state of the call.
|
|
607
|
-
*
|
|
608
|
-
*
|
|
740
|
+
* Returns the user_state / agent_state payload shape — read-only and
|
|
741
|
+
* safe to call at any time.
|
|
609
742
|
*/
|
|
610
743
|
get conversationState() {
|
|
611
744
|
return this.speechEvents.conversationState;
|
|
@@ -717,7 +850,7 @@ var Patter = class {
|
|
|
717
850
|
);
|
|
718
851
|
}
|
|
719
852
|
const engine = opts.engine;
|
|
720
|
-
if (engine instanceof Realtime) {
|
|
853
|
+
if (engine instanceof Realtime || engine instanceof Realtime2) {
|
|
721
854
|
working = {
|
|
722
855
|
...working,
|
|
723
856
|
provider: "openai_realtime",
|
|
@@ -735,7 +868,7 @@ var Patter = class {
|
|
|
735
868
|
};
|
|
736
869
|
} else {
|
|
737
870
|
throw new Error(
|
|
738
|
-
"Unknown engine. Expected OpenAIRealtime or ElevenLabsConvAI instance."
|
|
871
|
+
"Unknown engine. Expected OpenAIRealtime, OpenAIRealtime2, or ElevenLabsConvAI instance."
|
|
739
872
|
);
|
|
740
873
|
}
|
|
741
874
|
} else if (!working.provider && (working.stt !== void 0 || working.tts !== void 0 || working.llm !== void 0)) {
|
|
@@ -795,6 +928,13 @@ var Patter = class {
|
|
|
795
928
|
if (!opts.agent.systemPrompt && opts.agent.provider !== "pipeline") {
|
|
796
929
|
throw new Error("agent.systemPrompt is required");
|
|
797
930
|
}
|
|
931
|
+
if (opts.agent.echoCancellation) {
|
|
932
|
+
try {
|
|
933
|
+
await import("./aec-PJJMUM5E.mjs");
|
|
934
|
+
} catch (err) {
|
|
935
|
+
getLogger().debug(`AEC pre-import failed at serve(): ${String(err)}`);
|
|
936
|
+
}
|
|
937
|
+
}
|
|
798
938
|
if (opts.port !== void 0) {
|
|
799
939
|
if (typeof opts.port !== "number" || opts.port < 1 || opts.port > 65535) {
|
|
800
940
|
throw new RangeError(`port must be between 1 and 65535, got ${opts.port}`);
|
|
@@ -876,6 +1016,9 @@ var Patter = class {
|
|
|
876
1016
|
opts.dashboard ?? true,
|
|
877
1017
|
opts.dashboardToken ?? ""
|
|
878
1018
|
);
|
|
1019
|
+
this.embeddedServer.popPrewarmAudio = this.popPrewarmAudio;
|
|
1020
|
+
this.embeddedServer.popPrewarmedConnections = this.popPrewarmedConnections;
|
|
1021
|
+
this.embeddedServer.recordPrewarmWaste = this.recordPrewarmWaste;
|
|
879
1022
|
try {
|
|
880
1023
|
await this.embeddedServer.start(port);
|
|
881
1024
|
if (this.tunnelHandle) {
|
|
@@ -890,7 +1033,7 @@ var Patter = class {
|
|
|
890
1033
|
}
|
|
891
1034
|
/** Run the agent in interactive terminal-test mode (no real telephony). */
|
|
892
1035
|
async test(opts) {
|
|
893
|
-
const { TestSession: TestSession2 } = await import("./test-mode-Y7YG5LFZ.mjs");
|
|
1036
|
+
const { TestSession: TestSession2 } = await import("./test-mode-RS57BDM6.mjs");
|
|
894
1037
|
const session = new TestSession2();
|
|
895
1038
|
await session.run({
|
|
896
1039
|
agent: opts.agent,
|
|
@@ -900,6 +1043,376 @@ var Patter = class {
|
|
|
900
1043
|
onCallEnd: opts.onCallEnd
|
|
901
1044
|
});
|
|
902
1045
|
}
|
|
1046
|
+
/**
|
|
1047
|
+
* Pop and return the pre-synthesised first-message audio for ``callId``.
|
|
1048
|
+
*
|
|
1049
|
+
* Returns ``undefined`` when ``agent.prewarmFirstMessage`` was not set
|
|
1050
|
+
* for the originating outbound call, or when the synth was still in
|
|
1051
|
+
* flight at the moment the carrier emitted ``start`` (cache miss — the
|
|
1052
|
+
* StreamHandler falls back to live TTS).
|
|
1053
|
+
*
|
|
1054
|
+
* Called by the per-call StreamHandler at the start of the firstMessage
|
|
1055
|
+
* emit. Returning bytes here lets the handler skip the live TTS
|
|
1056
|
+
* synthesis and stream the cached buffer directly.
|
|
1057
|
+
*
|
|
1058
|
+
* Marks ``callId`` as consumed regardless of cache hit/miss so a slow
|
|
1059
|
+
* synth task that finishes after this call drops its bytes instead of
|
|
1060
|
+
* orphaning them in ``prewarmAudio``. See FIX #92.
|
|
1061
|
+
*/
|
|
1062
|
+
popPrewarmAudio = (callId) => {
|
|
1063
|
+
this.prewarmConsumed.add(callId);
|
|
1064
|
+
const ttl = this.prewarmTtlTimers.get(callId);
|
|
1065
|
+
if (ttl !== void 0) {
|
|
1066
|
+
clearTimeout(ttl);
|
|
1067
|
+
this.prewarmTtlTimers.delete(callId);
|
|
1068
|
+
}
|
|
1069
|
+
const buf = this.prewarmAudio.get(callId);
|
|
1070
|
+
if (buf !== void 0) this.prewarmAudio.delete(callId);
|
|
1071
|
+
return buf;
|
|
1072
|
+
};
|
|
1073
|
+
/**
|
|
1074
|
+
* Log a warning if a prewarmed greeting was paid for but never used.
|
|
1075
|
+
* The TTS bill for ``agent.firstMessage`` has already been incurred by
|
|
1076
|
+
* the background synth task, so the user should know — opt-in feature
|
|
1077
|
+
* with a known cost surface.
|
|
1078
|
+
*
|
|
1079
|
+
* Idempotent: the second call for the same ``callId`` is a no-op, so
|
|
1080
|
+
* the status callback firing first and ``endCall`` running afterwards
|
|
1081
|
+
* (or vice-versa) does not double-WARN. Public so the embedded
|
|
1082
|
+
* server's webhook handlers can invoke it on no-answer / busy /
|
|
1083
|
+
* failed / canceled / AMD-machine paths. See FIX #91.
|
|
1084
|
+
*/
|
|
1085
|
+
recordPrewarmWaste = (callId) => {
|
|
1086
|
+
this.closePrewarmedConnections(callId);
|
|
1087
|
+
if (this.prewarmConsumed.has(callId)) {
|
|
1088
|
+
this.prewarmAudio.delete(callId);
|
|
1089
|
+
return;
|
|
1090
|
+
}
|
|
1091
|
+
this.prewarmConsumed.add(callId);
|
|
1092
|
+
const ttl = this.prewarmTtlTimers.get(callId);
|
|
1093
|
+
if (ttl !== void 0) {
|
|
1094
|
+
clearTimeout(ttl);
|
|
1095
|
+
this.prewarmTtlTimers.delete(callId);
|
|
1096
|
+
}
|
|
1097
|
+
const buf = this.prewarmAudio.get(callId);
|
|
1098
|
+
if (buf !== void 0) {
|
|
1099
|
+
this.prewarmAudio.delete(callId);
|
|
1100
|
+
getLogger().warn(
|
|
1101
|
+
`Prewarm wasted for call ${callId} \u2014 first-message TTS already paid (~${buf.byteLength} bytes synthesised) but call ended before pickup.`
|
|
1102
|
+
);
|
|
1103
|
+
}
|
|
1104
|
+
};
|
|
1105
|
+
/**
|
|
1106
|
+
* Pop and return the parked provider WebSockets for ``callId``, or
|
|
1107
|
+
* ``undefined`` when no parked connections exist.
|
|
1108
|
+
*
|
|
1109
|
+
* Wired into ``EmbeddedServer.popPrewarmedConnections`` so the
|
|
1110
|
+
* per-call ``StreamHandler`` can adopt the parked sockets at the
|
|
1111
|
+
* carrier ``start`` event instead of opening fresh ones — saving
|
|
1112
|
+
* ~150-900 ms of cold-start handshake on the first turn.
|
|
1113
|
+
*/
|
|
1114
|
+
popPrewarmedConnections = (callId) => {
|
|
1115
|
+
const slot = this.prewarmedConnections.get(callId);
|
|
1116
|
+
if (slot === void 0) return void 0;
|
|
1117
|
+
this.prewarmedConnections.delete(callId);
|
|
1118
|
+
const ttl = this.prewarmedConnTimers.get(callId);
|
|
1119
|
+
if (ttl !== void 0) {
|
|
1120
|
+
clearTimeout(ttl);
|
|
1121
|
+
this.prewarmedConnTimers.delete(callId);
|
|
1122
|
+
}
|
|
1123
|
+
return slot;
|
|
1124
|
+
};
|
|
1125
|
+
/**
|
|
1126
|
+
* Close any parked provider WebSockets for ``callId``. Wired into
|
|
1127
|
+
* ``EmbeddedServer.closePrewarmedConnections`` so call-termination
|
|
1128
|
+
* paths (no-answer, busy, failed, canceled, AMD voicemail) drop the
|
|
1129
|
+
* sockets cleanly instead of leaving them to the upstream timeout.
|
|
1130
|
+
*/
|
|
1131
|
+
closePrewarmedConnections = (callId) => {
|
|
1132
|
+
const slot = this.prewarmedConnections.get(callId);
|
|
1133
|
+
if (slot === void 0) return;
|
|
1134
|
+
this.prewarmedConnections.delete(callId);
|
|
1135
|
+
const ttl = this.prewarmedConnTimers.get(callId);
|
|
1136
|
+
if (ttl !== void 0) {
|
|
1137
|
+
clearTimeout(ttl);
|
|
1138
|
+
this.prewarmedConnTimers.delete(callId);
|
|
1139
|
+
}
|
|
1140
|
+
closeParkedConnections(slot);
|
|
1141
|
+
};
|
|
1142
|
+
/**
|
|
1143
|
+
* Open and park provider WebSockets in parallel with the carrier-side
|
|
1144
|
+
* ``initiateCall``. Unlike :meth:`spawnProviderWarmup` (which closes
|
|
1145
|
+
* the WS after a brief idle), the sockets opened here stay OPEN and
|
|
1146
|
+
* are handed off to the per-call ``StreamHandler`` on ``start``.
|
|
1147
|
+
*
|
|
1148
|
+
* This is the structural fix for first-turn cold-start: on Node's
|
|
1149
|
+
* ``ws`` package, opening + closing a WS does NOT warm TLS for the
|
|
1150
|
+
* next open — every fresh ``new WebSocket()`` re-pays the full
|
|
1151
|
+
* TCP + TLS + HTTP-101 round-trip. By keeping the WS open and
|
|
1152
|
+
* adopting it directly, the live first turn skips the handshake
|
|
1153
|
+
* entirely (saves ~150-900 ms depending on provider).
|
|
1154
|
+
*
|
|
1155
|
+
* Best-effort: each provider's parking task is wrapped in
|
|
1156
|
+
* ``Promise.allSettled`` so a slow or failing endpoint cannot block
|
|
1157
|
+
* the others. Providers without ``openParkedConnection`` contribute
|
|
1158
|
+
* nothing — the call falls through to the cold ``connect()`` path
|
|
1159
|
+
* for that provider.
|
|
1160
|
+
*/
|
|
1161
|
+
parkProviderConnections(agent, callId) {
|
|
1162
|
+
const stt = agent.stt;
|
|
1163
|
+
const tts = agent.tts;
|
|
1164
|
+
const sttOpen = typeof stt?.openParkedConnection === "function" ? stt.openParkedConnection.bind(stt) : null;
|
|
1165
|
+
const ttsOpen = typeof tts?.openParkedConnection === "function" ? tts.openParkedConnection.bind(tts) : null;
|
|
1166
|
+
const providerStr = agent.provider ?? "";
|
|
1167
|
+
const wantsRealtimePark = providerStr === "openai_realtime" || providerStr === "openai_realtime_2";
|
|
1168
|
+
if (!sttOpen && !ttsOpen && !wantsRealtimePark) return;
|
|
1169
|
+
const slot = {};
|
|
1170
|
+
this.prewarmedConnections.set(callId, slot);
|
|
1171
|
+
const startedAt = Date.now();
|
|
1172
|
+
const tasks = [];
|
|
1173
|
+
if (sttOpen) {
|
|
1174
|
+
tasks.push((async () => {
|
|
1175
|
+
try {
|
|
1176
|
+
const ws = await sttOpen();
|
|
1177
|
+
if (this.prewarmedConnections.get(callId) !== slot) {
|
|
1178
|
+
try {
|
|
1179
|
+
ws.close();
|
|
1180
|
+
} catch {
|
|
1181
|
+
}
|
|
1182
|
+
return;
|
|
1183
|
+
}
|
|
1184
|
+
slot.stt = ws;
|
|
1185
|
+
getLogger().info(
|
|
1186
|
+
`[PREWARM] callId=${callId} provider=stt ms=${Date.now() - startedAt}`
|
|
1187
|
+
);
|
|
1188
|
+
} catch (err) {
|
|
1189
|
+
getLogger().debug(`Park STT failed for ${callId}: ${String(err)}`);
|
|
1190
|
+
}
|
|
1191
|
+
})());
|
|
1192
|
+
}
|
|
1193
|
+
if (ttsOpen) {
|
|
1194
|
+
tasks.push((async () => {
|
|
1195
|
+
try {
|
|
1196
|
+
const parked = await ttsOpen();
|
|
1197
|
+
if (this.prewarmedConnections.get(callId) !== slot) {
|
|
1198
|
+
try {
|
|
1199
|
+
parked.ws.close();
|
|
1200
|
+
} catch {
|
|
1201
|
+
}
|
|
1202
|
+
return;
|
|
1203
|
+
}
|
|
1204
|
+
slot.tts = parked;
|
|
1205
|
+
getLogger().info(
|
|
1206
|
+
`[PREWARM] callId=${callId} provider=tts ms=${Date.now() - startedAt}`
|
|
1207
|
+
);
|
|
1208
|
+
} catch (err) {
|
|
1209
|
+
getLogger().debug(`Park TTS failed for ${callId}: ${String(err)}`);
|
|
1210
|
+
}
|
|
1211
|
+
})());
|
|
1212
|
+
}
|
|
1213
|
+
if (wantsRealtimePark) {
|
|
1214
|
+
tasks.push((async () => {
|
|
1215
|
+
const { OpenAIRealtime2Adapter: OpenAIRealtime2Adapter2 } = await import("./openai-realtime-2-CNFARP25.mjs");
|
|
1216
|
+
const apiKey = process.env.OPENAI_API_KEY ?? "";
|
|
1217
|
+
if (!apiKey) {
|
|
1218
|
+
getLogger().debug(`Park OpenAI Realtime skipped for ${callId}: no OPENAI_API_KEY`);
|
|
1219
|
+
return;
|
|
1220
|
+
}
|
|
1221
|
+
try {
|
|
1222
|
+
const tmpAdapter = new OpenAIRealtime2Adapter2(
|
|
1223
|
+
apiKey,
|
|
1224
|
+
agent.model ?? "gpt-realtime-mini",
|
|
1225
|
+
agent.voice ?? "alloy",
|
|
1226
|
+
agent.systemPrompt ?? "",
|
|
1227
|
+
[],
|
|
1228
|
+
// audioFormat — the GA adapter always emits audio/pcm@24000
|
|
1229
|
+
// internally regardless of this value, but it's a required
|
|
1230
|
+
// positional param. Default to g711_ulaw (Twilio wire format).
|
|
1231
|
+
void 0
|
|
1232
|
+
);
|
|
1233
|
+
const ws = await tmpAdapter.openParkedConnection();
|
|
1234
|
+
if (this.prewarmedConnections.get(callId) !== slot) {
|
|
1235
|
+
try {
|
|
1236
|
+
ws.close();
|
|
1237
|
+
} catch {
|
|
1238
|
+
}
|
|
1239
|
+
return;
|
|
1240
|
+
}
|
|
1241
|
+
slot.openaiRealtime = ws;
|
|
1242
|
+
getLogger().info(
|
|
1243
|
+
`[PREWARM] callId=${callId} provider=openai_realtime ms=${Date.now() - startedAt}`
|
|
1244
|
+
);
|
|
1245
|
+
} catch (err) {
|
|
1246
|
+
getLogger().debug(`Park OpenAI Realtime failed for ${callId}: ${String(err)}`);
|
|
1247
|
+
}
|
|
1248
|
+
})());
|
|
1249
|
+
}
|
|
1250
|
+
const task = (async () => {
|
|
1251
|
+
await Promise.allSettled(tasks);
|
|
1252
|
+
})();
|
|
1253
|
+
this.prewarmTasks.add(task);
|
|
1254
|
+
void task.finally(() => {
|
|
1255
|
+
this.prewarmTasks.delete(task);
|
|
1256
|
+
if (!this.prewarmedConnections.has(callId)) return;
|
|
1257
|
+
const handle = setTimeout(() => {
|
|
1258
|
+
this.prewarmedConnTimers.delete(callId);
|
|
1259
|
+
const orphan = this.prewarmedConnections.get(callId);
|
|
1260
|
+
if (orphan === void 0) return;
|
|
1261
|
+
this.prewarmedConnections.delete(callId);
|
|
1262
|
+
closeParkedConnections(orphan);
|
|
1263
|
+
getLogger().warn(
|
|
1264
|
+
`[PREWARM] parked connections evicted by TTL for ${callId} \u2014 call never reached start (~${(PARKED_CONN_TTL_MS / 1e3).toFixed(0)}s).`
|
|
1265
|
+
);
|
|
1266
|
+
}, PARKED_CONN_TTL_MS);
|
|
1267
|
+
handle.unref?.();
|
|
1268
|
+
this.prewarmedConnTimers.set(callId, handle);
|
|
1269
|
+
});
|
|
1270
|
+
}
|
|
1271
|
+
/**
|
|
1272
|
+
* Spawn a fire-and-forget task that warms up STT / TTS / LLM in
|
|
1273
|
+
* parallel with the carrier-side ``initiateCall``.
|
|
1274
|
+
*
|
|
1275
|
+
* Best-effort: each provider's optional ``warmup()`` is wrapped in
|
|
1276
|
+
* ``Promise.allSettled`` so a slow or failing endpoint cannot block
|
|
1277
|
+
* the others. Providers without ``warmup`` contribute nothing.
|
|
1278
|
+
*/
|
|
1279
|
+
spawnProviderWarmup(agent) {
|
|
1280
|
+
const targets = [];
|
|
1281
|
+
const collect = (provider, label) => {
|
|
1282
|
+
if (!provider || typeof provider !== "object") return;
|
|
1283
|
+
const fn = provider.warmup;
|
|
1284
|
+
if (typeof fn !== "function") return;
|
|
1285
|
+
targets.push({
|
|
1286
|
+
name: label,
|
|
1287
|
+
fn: fn.bind(provider)
|
|
1288
|
+
});
|
|
1289
|
+
};
|
|
1290
|
+
collect(agent.stt, "stt");
|
|
1291
|
+
collect(agent.tts, "tts");
|
|
1292
|
+
collect(agent.llm, "llm");
|
|
1293
|
+
if (targets.length === 0) return;
|
|
1294
|
+
const task = (async () => {
|
|
1295
|
+
const results = await Promise.allSettled(targets.map((t) => t.fn()));
|
|
1296
|
+
results.forEach((r, i) => {
|
|
1297
|
+
if (r.status === "rejected") {
|
|
1298
|
+
getLogger().debug(
|
|
1299
|
+
`Provider warmup failed (${targets[i].name}): ${String(r.reason)}`
|
|
1300
|
+
);
|
|
1301
|
+
}
|
|
1302
|
+
});
|
|
1303
|
+
})();
|
|
1304
|
+
this.prewarmTasks.add(task);
|
|
1305
|
+
void task.finally(() => this.prewarmTasks.delete(task));
|
|
1306
|
+
}
|
|
1307
|
+
/**
|
|
1308
|
+
* Pre-render ``agent.firstMessage`` to TTS bytes during the ringing
|
|
1309
|
+
* window and stash them in ``prewarmAudio.set(callId, buf)``.
|
|
1310
|
+
*
|
|
1311
|
+
* Skipped silently when ``agent.prewarmFirstMessage`` is false or
|
|
1312
|
+
* when ``agent.tts`` / ``agent.firstMessage`` is missing. The synth
|
|
1313
|
+
* is bounded by ``ringTimeout`` (default 25 s) so a never-answered
|
|
1314
|
+
* call doesn't tie up the TTS connection. On timeout / error the
|
|
1315
|
+
* cache is left empty and the StreamHandler falls back to live TTS.
|
|
1316
|
+
*
|
|
1317
|
+
* **Pipeline mode only.** Realtime / ConvAI provider modes never
|
|
1318
|
+
* consume the prewarm cache (the StreamHandler for those modes runs
|
|
1319
|
+
* its first-message emit through the provider's own audio path).
|
|
1320
|
+
* Spawning the prewarm in those modes pays the TTS bill for nothing
|
|
1321
|
+
* — refused with a warn.
|
|
1322
|
+
*
|
|
1323
|
+
* **Capped at ``PREWARM_CACHE_MAX`` concurrent entries.** Refused
|
|
1324
|
+
* with a warn when the cap is reached (the call still proceeds —
|
|
1325
|
+
* StreamHandler falls back to live TTS).
|
|
1326
|
+
*/
|
|
1327
|
+
spawnPrewarmFirstMessage(agent, callId, ringTimeout, carrier) {
|
|
1328
|
+
if (!agent.prewarmFirstMessage) return;
|
|
1329
|
+
const providerMode = agent.provider ?? "openai_realtime";
|
|
1330
|
+
if (providerMode !== "pipeline") {
|
|
1331
|
+
getLogger().warn(
|
|
1332
|
+
`agent.prewarmFirstMessage=true is only supported in pipeline mode (provider=${providerMode}); skipping pre-synth to avoid wasted TTS spend.`
|
|
1333
|
+
);
|
|
1334
|
+
return;
|
|
1335
|
+
}
|
|
1336
|
+
const firstMessage = agent.firstMessage ?? "";
|
|
1337
|
+
const tts = agent.tts;
|
|
1338
|
+
if (!firstMessage || !tts) return;
|
|
1339
|
+
if (typeof tts.synthesizeStream !== "function") return;
|
|
1340
|
+
if (carrier) {
|
|
1341
|
+
const carrierAware = tts;
|
|
1342
|
+
if (typeof carrierAware.setTelephonyCarrier === "function") {
|
|
1343
|
+
try {
|
|
1344
|
+
carrierAware.setTelephonyCarrier(carrier);
|
|
1345
|
+
} catch (err) {
|
|
1346
|
+
getLogger().debug(
|
|
1347
|
+
`Prewarm TTS setTelephonyCarrier failed for ${callId}: ${String(err)}`
|
|
1348
|
+
);
|
|
1349
|
+
}
|
|
1350
|
+
}
|
|
1351
|
+
}
|
|
1352
|
+
const inFlight = this.prewarmAudio.size + this.prewarmTasks.size;
|
|
1353
|
+
if (inFlight >= PREWARM_CACHE_MAX) {
|
|
1354
|
+
getLogger().warn(
|
|
1355
|
+
`Prewarm cache full (${inFlight}/${PREWARM_CACHE_MAX} in-flight) \u2014 skipping pre-synth for call ${callId}; falling back to live TTS at pickup.`
|
|
1356
|
+
);
|
|
1357
|
+
return;
|
|
1358
|
+
}
|
|
1359
|
+
const timeoutMs = (typeof ringTimeout === "number" ? ringTimeout : 25) * 1e3;
|
|
1360
|
+
const task = (async () => {
|
|
1361
|
+
try {
|
|
1362
|
+
const accumulate = async () => {
|
|
1363
|
+
const chunks = [];
|
|
1364
|
+
for await (const chunk of tts.synthesizeStream(firstMessage)) {
|
|
1365
|
+
const u = chunk;
|
|
1366
|
+
if (Buffer.isBuffer(u)) chunks.push(u);
|
|
1367
|
+
else if (ArrayBuffer.isView(u))
|
|
1368
|
+
chunks.push(Buffer.from(u.buffer, u.byteOffset, u.byteLength));
|
|
1369
|
+
}
|
|
1370
|
+
return Buffer.concat(chunks);
|
|
1371
|
+
};
|
|
1372
|
+
const timer = new Promise(
|
|
1373
|
+
(_resolve, reject) => setTimeout(
|
|
1374
|
+
() => reject(new Error("prewarm-first-message timeout")),
|
|
1375
|
+
timeoutMs
|
|
1376
|
+
).unref?.()
|
|
1377
|
+
);
|
|
1378
|
+
const buf = await Promise.race([accumulate(), timer]);
|
|
1379
|
+
if (buf.byteLength > 0) {
|
|
1380
|
+
if (this.prewarmConsumed.has(callId)) {
|
|
1381
|
+
getLogger().warn(
|
|
1382
|
+
`Prewarm orphaned for call ${callId} \u2014 synth completed (~${buf.byteLength} bytes) AFTER consumer polled; bytes dropped, TTS bill already paid.`
|
|
1383
|
+
);
|
|
1384
|
+
return;
|
|
1385
|
+
}
|
|
1386
|
+
this.prewarmAudio.set(callId, buf);
|
|
1387
|
+
getLogger().debug(
|
|
1388
|
+
`Prewarm first-message ready for call ${callId} (${buf.byteLength} bytes)`
|
|
1389
|
+
);
|
|
1390
|
+
}
|
|
1391
|
+
} catch (err) {
|
|
1392
|
+
getLogger().debug(
|
|
1393
|
+
`Prewarm first-message failed for call ${callId}: ${String(err)}`
|
|
1394
|
+
);
|
|
1395
|
+
}
|
|
1396
|
+
})();
|
|
1397
|
+
this.prewarmTasks.add(task);
|
|
1398
|
+
void task.finally(() => {
|
|
1399
|
+
this.prewarmTasks.delete(task);
|
|
1400
|
+
if (!this.prewarmAudio.has(callId)) return;
|
|
1401
|
+
const ttlMs = timeoutMs + PREWARM_TTL_GRACE_MS;
|
|
1402
|
+
const handle = setTimeout(() => {
|
|
1403
|
+
this.prewarmTtlTimers.delete(callId);
|
|
1404
|
+
const orphan = this.prewarmAudio.get(callId);
|
|
1405
|
+
if (orphan === void 0) return;
|
|
1406
|
+
this.prewarmAudio.delete(callId);
|
|
1407
|
+
this.prewarmConsumed.add(callId);
|
|
1408
|
+
getLogger().warn(
|
|
1409
|
+
`Prewarm bytes evicted by TTL \u2014 call ${callId} never consumed them (~${orphan.byteLength} bytes synthesised, ${(ttlMs / 1e3).toFixed(1)}s after ringTimeout).`
|
|
1410
|
+
);
|
|
1411
|
+
}, ttlMs);
|
|
1412
|
+
handle.unref?.();
|
|
1413
|
+
this.prewarmTtlTimers.set(callId, handle);
|
|
1414
|
+
});
|
|
1415
|
+
}
|
|
903
1416
|
/** Place an outbound call via the configured carrier. */
|
|
904
1417
|
async call(options) {
|
|
905
1418
|
if (!options.to) {
|
|
@@ -914,6 +1427,9 @@ var Patter = class {
|
|
|
914
1427
|
if (this.embeddedServer) {
|
|
915
1428
|
this.embeddedServer.onMachineDetection = options.onMachineDetection;
|
|
916
1429
|
}
|
|
1430
|
+
if (options.agent.prewarm !== false) {
|
|
1431
|
+
this.spawnProviderWarmup(options.agent);
|
|
1432
|
+
}
|
|
917
1433
|
if (carrier.kind === "telnyx") {
|
|
918
1434
|
const telnyxKey = carrier.apiKey;
|
|
919
1435
|
const connectionId = carrier.connectionId;
|
|
@@ -939,21 +1455,35 @@ var Patter = class {
|
|
|
939
1455
|
if (!response2.ok) {
|
|
940
1456
|
throw new ProvisionError(`Failed to initiate Telnyx call: ${await response2.text()}`);
|
|
941
1457
|
}
|
|
942
|
-
|
|
1458
|
+
let telnyxCallId;
|
|
1459
|
+
try {
|
|
1460
|
+
const body = await response2.clone().json();
|
|
1461
|
+
telnyxCallId = body.data?.call_control_id;
|
|
1462
|
+
} catch {
|
|
1463
|
+
}
|
|
1464
|
+
if (telnyxCallId) {
|
|
1465
|
+
const initiatedPayload = {
|
|
1466
|
+
call_id: telnyxCallId,
|
|
1467
|
+
caller: phoneNumber,
|
|
1468
|
+
callee: options.to,
|
|
1469
|
+
direction: "outbound",
|
|
1470
|
+
status: "initiated"
|
|
1471
|
+
};
|
|
1472
|
+
if (this.embeddedServer) {
|
|
1473
|
+
this.embeddedServer.metricsStore.recordCallInitiated(initiatedPayload);
|
|
1474
|
+
}
|
|
943
1475
|
try {
|
|
944
|
-
const
|
|
945
|
-
|
|
946
|
-
if (callId) {
|
|
947
|
-
this.embeddedServer.metricsStore.recordCallInitiated({
|
|
948
|
-
call_id: callId,
|
|
949
|
-
caller: phoneNumber,
|
|
950
|
-
callee: options.to,
|
|
951
|
-
direction: "outbound"
|
|
952
|
-
});
|
|
953
|
-
}
|
|
1476
|
+
const { notifyDashboard: notifyDashboard2 } = await import("./persistence-LVIAHESK.mjs");
|
|
1477
|
+
notifyDashboard2(initiatedPayload);
|
|
954
1478
|
} catch {
|
|
955
1479
|
}
|
|
956
1480
|
}
|
|
1481
|
+
if (telnyxCallId) {
|
|
1482
|
+
this.spawnPrewarmFirstMessage(options.agent, telnyxCallId, effectiveRingTimeout, "telnyx");
|
|
1483
|
+
if (options.agent.prewarm !== false) {
|
|
1484
|
+
this.parkProviderConnections(options.agent, telnyxCallId);
|
|
1485
|
+
}
|
|
1486
|
+
}
|
|
957
1487
|
return;
|
|
958
1488
|
}
|
|
959
1489
|
const twilioSid = carrier.accountSid;
|
|
@@ -994,34 +1524,77 @@ var Patter = class {
|
|
|
994
1524
|
if (!response.ok) {
|
|
995
1525
|
throw new ProvisionError(`Failed to initiate call: ${await response.text()}`);
|
|
996
1526
|
}
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1527
|
+
let twilioCallSid;
|
|
1528
|
+
let twilioNotificationsPath;
|
|
1529
|
+
try {
|
|
1530
|
+
const body = await response.clone().json();
|
|
1531
|
+
twilioCallSid = body.sid;
|
|
1532
|
+
twilioNotificationsPath = body.subresource_uris?.notifications;
|
|
1533
|
+
} catch {
|
|
1534
|
+
}
|
|
1535
|
+
if (twilioCallSid) {
|
|
1536
|
+
const initiatedPayload = {
|
|
1537
|
+
call_id: twilioCallSid,
|
|
1538
|
+
caller: phoneNumber,
|
|
1539
|
+
callee: options.to,
|
|
1540
|
+
direction: "outbound",
|
|
1541
|
+
status: "initiated"
|
|
1542
|
+
};
|
|
1543
|
+
if (this.embeddedServer) {
|
|
1544
|
+
this.embeddedServer.metricsStore.recordCallInitiated(initiatedPayload);
|
|
1545
|
+
if (twilioNotificationsPath) {
|
|
1546
|
+
getLogger().info(
|
|
1547
|
+
`Outbound call ${twilioCallSid} placed. Twilio notifications: https://api.twilio.com${twilioNotificationsPath} (check here if the call drops with no audio).`
|
|
1548
|
+
);
|
|
1014
1549
|
}
|
|
1550
|
+
}
|
|
1551
|
+
try {
|
|
1552
|
+
const { notifyDashboard: notifyDashboard2 } = await import("./persistence-LVIAHESK.mjs");
|
|
1553
|
+
notifyDashboard2(initiatedPayload);
|
|
1015
1554
|
} catch {
|
|
1016
1555
|
}
|
|
1017
1556
|
}
|
|
1557
|
+
if (twilioCallSid) {
|
|
1558
|
+
this.spawnPrewarmFirstMessage(options.agent, twilioCallSid, effectiveRingTimeout, "twilio");
|
|
1559
|
+
if (options.agent.prewarm !== false) {
|
|
1560
|
+
this.parkProviderConnections(options.agent, twilioCallSid);
|
|
1561
|
+
}
|
|
1562
|
+
}
|
|
1018
1563
|
}
|
|
1019
1564
|
/**
|
|
1020
1565
|
* Stop the embedded server and any running tunnel. Safe to call multiple
|
|
1021
1566
|
* times. Leaves the instance reusable: a subsequent ``serve()`` works as
|
|
1022
1567
|
* if the previous lifecycle never happened.
|
|
1568
|
+
*
|
|
1569
|
+
* Also clears any pending TTL eviction timers, awaits in-flight
|
|
1570
|
+
* prewarm-first-message synth tasks (best-effort, with a 1 s safety
|
|
1571
|
+
* timeout), and clears the prewarm cache. Without this a still-running
|
|
1572
|
+
* TTS WS keeps the user billed long after SDK teardown, and stale
|
|
1573
|
+
* entries leak across ``serve`` / ``disconnect`` cycles. See FIX #93.
|
|
1023
1574
|
*/
|
|
1024
1575
|
async disconnect() {
|
|
1576
|
+
for (const handle of this.prewarmTtlTimers.values()) {
|
|
1577
|
+
clearTimeout(handle);
|
|
1578
|
+
}
|
|
1579
|
+
this.prewarmTtlTimers.clear();
|
|
1580
|
+
if (this.prewarmTasks.size > 0) {
|
|
1581
|
+
const drain = Promise.allSettled(Array.from(this.prewarmTasks));
|
|
1582
|
+
const timer = new Promise(
|
|
1583
|
+
(resolve) => setTimeout(resolve, 1e3).unref?.()
|
|
1584
|
+
);
|
|
1585
|
+
await Promise.race([drain, timer]);
|
|
1586
|
+
}
|
|
1587
|
+
this.prewarmTasks.clear();
|
|
1588
|
+
this.prewarmAudio.clear();
|
|
1589
|
+
this.prewarmConsumed.clear();
|
|
1590
|
+
for (const handle of this.prewarmedConnTimers.values()) {
|
|
1591
|
+
clearTimeout(handle);
|
|
1592
|
+
}
|
|
1593
|
+
this.prewarmedConnTimers.clear();
|
|
1594
|
+
for (const slot of this.prewarmedConnections.values()) {
|
|
1595
|
+
closeParkedConnections(slot);
|
|
1596
|
+
}
|
|
1597
|
+
this.prewarmedConnections.clear();
|
|
1025
1598
|
if (this.tunnelHandle) {
|
|
1026
1599
|
this.tunnelHandle.stop();
|
|
1027
1600
|
this.tunnelHandle = null;
|
|
@@ -1072,6 +1645,7 @@ var Patter = class {
|
|
|
1072
1645
|
if (!callSid) {
|
|
1073
1646
|
throw new Error("callSid must be a non-empty string");
|
|
1074
1647
|
}
|
|
1648
|
+
this.recordPrewarmWaste(callSid);
|
|
1075
1649
|
const carrier = this.localConfig.carrier;
|
|
1076
1650
|
if (carrier.kind === "twilio") {
|
|
1077
1651
|
const auth = Buffer.from(`${carrier.accountSid}:${carrier.authToken}`).toString("base64");
|
|
@@ -1107,7 +1681,7 @@ var Patter = class {
|
|
|
1107
1681
|
}
|
|
1108
1682
|
};
|
|
1109
1683
|
async function waitForTunnelPubliclyReachable(hostname, totalTimeoutMs = 6e4, graceMs = 5e3) {
|
|
1110
|
-
const
|
|
1684
|
+
const log2 = getLogger();
|
|
1111
1685
|
const { Resolver } = await import("dns/promises");
|
|
1112
1686
|
const resolver = new Resolver({ timeout: 1500, tries: 1 });
|
|
1113
1687
|
resolver.setServers(["1.1.1.1", "8.8.8.8"]);
|
|
@@ -1119,7 +1693,7 @@ async function waitForTunnelPubliclyReachable(hostname, totalTimeoutMs = 6e4, gr
|
|
|
1119
1693
|
try {
|
|
1120
1694
|
const records = await resolver.resolve4(hostname);
|
|
1121
1695
|
const first = records[0] ?? "<unknown>";
|
|
1122
|
-
|
|
1696
|
+
log2.info(
|
|
1123
1697
|
"Tunnel DNS resolved \u2192 %s (attempt %d); waiting %d ms grace",
|
|
1124
1698
|
first,
|
|
1125
1699
|
attempt,
|
|
@@ -2278,48 +2852,633 @@ function scheduleInterval(intervalOrOpts, callback) {
|
|
|
2278
2852
|
};
|
|
2279
2853
|
}
|
|
2280
2854
|
|
|
2281
|
-
// src/
|
|
2282
|
-
init_esm_shims();
|
|
2283
|
-
var STT = class extends DeepgramSTT {
|
|
2284
|
-
static providerKey = "deepgram";
|
|
2285
|
-
constructor(opts = {}) {
|
|
2286
|
-
const key = opts.apiKey ?? process.env.DEEPGRAM_API_KEY;
|
|
2287
|
-
if (!key) {
|
|
2288
|
-
throw new Error(
|
|
2289
|
-
"Deepgram STT requires an apiKey. Pass { apiKey: 'dg_...' } or set DEEPGRAM_API_KEY in the environment."
|
|
2290
|
-
);
|
|
2291
|
-
}
|
|
2292
|
-
super(
|
|
2293
|
-
key,
|
|
2294
|
-
opts.language ?? "en",
|
|
2295
|
-
opts.model ?? "nova-3",
|
|
2296
|
-
opts.encoding ?? "linear16",
|
|
2297
|
-
opts.sampleRate ?? 16e3,
|
|
2298
|
-
{
|
|
2299
|
-
endpointingMs: opts.endpointingMs ?? 150,
|
|
2300
|
-
utteranceEndMs: opts.utteranceEndMs === null ? null : opts.utteranceEndMs ?? 1e3,
|
|
2301
|
-
smartFormat: opts.smartFormat ?? true,
|
|
2302
|
-
interimResults: opts.interimResults ?? true,
|
|
2303
|
-
...opts.vadEvents !== void 0 ? { vadEvents: opts.vadEvents } : {}
|
|
2304
|
-
}
|
|
2305
|
-
);
|
|
2306
|
-
}
|
|
2307
|
-
};
|
|
2308
|
-
|
|
2309
|
-
// src/stt/whisper.ts
|
|
2310
|
-
init_esm_shims();
|
|
2311
|
-
|
|
2312
|
-
// src/providers/whisper-stt.ts
|
|
2855
|
+
// src/providers/elevenlabs-tts.ts
|
|
2313
2856
|
init_esm_shims();
|
|
2314
|
-
var
|
|
2315
|
-
var
|
|
2316
|
-
|
|
2317
|
-
|
|
2318
|
-
|
|
2319
|
-
|
|
2320
|
-
|
|
2321
|
-
|
|
2322
|
-
|
|
2857
|
+
var ELEVENLABS_BASE_URL = "https://api.elevenlabs.io/v1";
|
|
2858
|
+
/**
 * Friendly ElevenLabs voice names (lower-case) mapped to their 20-character
 * voice IDs. `sarah` and `alloy` are aliases that resolve to the same ID as
 * `bella`.
 */
var ELEVENLABS_VOICE_ID_BY_NAME = {
  rachel: "21m00Tcm4TlvDq8ikWAM",
  drew: "29vD33N1CtxCmqQRPOHJ",
  clyde: "2EiwWnXFnvU5JabPnv8n",
  paul: "5Q0t7uMcjvnagumLfvZi",
  domi: "AZnzlk1XvdvUeBnXmlld",
  dave: "CYw3kZ02Hs0563khs1Fj",
  fin: "D38z5RcWu1voky8WS1ja",
  bella: "EXAVITQu4vr4xnSDxMaL",
  antoni: "ErXwobaYiN019PkySvjV",
  thomas: "GBv7mTt0atIp3Br8iCZE",
  charlie: "IKne3meq5aSn9XLyUdCD",
  george: "JBFqnCBsd6RMkjVDRZzb",
  emily: "LcfcDJNUP1GQjkzn1xUU",
  elli: "MF3mGyEYCl7XYWbV9V6O",
  callum: "N2lVS1w4EtoT3dr4eOWO",
  patrick: "ODq5zmih8GrVes37Dizd",
  harry: "SOYHLrjzK2X1ezoPC6cr",
  liam: "TX3LPaxmHKxFdv7VOQHJ",
  dorothy: "ThT5KcBeYPX3keUQqHPh",
  josh: "TxGEqnHWrfWFTfGW9XjX",
  arnold: "VR6AewLTigWG4xSOukaG",
  charlotte: "XB0fDUnXU5powFXDhCwa",
  matilda: "XrExE9yKIg1WjnnlVkGX",
  matthew: "Yko7PKHZNXotIFUBG7I9",
  james: "ZQe5CZNOzWyzPSCn5a3c",
  joseph: "Zlb1dXrM653N07WRdFW3",
  jeremy: "bVMeCyTHy58xNoL34h3p",
  michael: "flq6f7yk4E4fJM5XTYuZ",
  ethan: "g5CIjZEefAph4nQFvHAz",
  gigi: "jBpfuIE2acCO8z3wKNLl",
  freya: "jsCqWAovK2LkecY7zXl4",
  brian: "nPczCjzI2devNBz1zQrb",
  grace: "oWAxZDx7w5VEj9dCyTzz",
  daniel: "onwK4e9ZLuTAKqWW03F9",
  lily: "pFZP5JQG7iQjIQuC4Bku",
  serena: "pMsXgVXv3BLzUgSXRplE",
  adam: "pNInz6obpgDQGcFmaJgB",
  nicole: "piTKgcLEGmPE4e6mEKli",
  bill: "pqHfZKP75CvOlQylNhV4",
  jessie: "t0jbNlBVZ17f02VDIeMI",
  ryan: "wViXBPUzp2ZZixB1xQuM",
  sam: "yoZ06aMxZJJ28mfd3POQ",
  glinda: "z9fAnlkpzviPz146aGWa",
  giovanni: "zcAOhNBS3c14rBihAFp1",
  mimi: "zrHiDhphv9ZnVXBqCLjz",
  sarah: "EXAVITQu4vr4xnSDxMaL",
  alloy: "EXAVITQu4vr4xnSDxMaL"
};

/** Shape of a raw ElevenLabs voice ID: exactly 20 ASCII alphanumerics. */
var VOICE_ID_PATTERN = /^[A-Za-z0-9]{20}$/;

/**
 * Resolve a voice name or ID to an ElevenLabs voice ID.
 *
 * @param {string | undefined | null} voice - Friendly name (case-insensitive)
 *   or a raw voice ID.
 * @returns {string | undefined | null} The mapped voice ID for a known name;
 *   otherwise the input unchanged (falsy input, a value already shaped like a
 *   voice ID, or an unknown name).
 */
function resolveVoiceId(voice) {
  if (!voice) return voice;
  if (VOICE_ID_PATTERN.test(voice)) return voice;
  const key = voice.toLowerCase();
  // Own-property check: a plain bracket lookup would walk the prototype chain,
  // so a name such as "constructor" or "__proto__" would resolve to a
  // function/object instead of falling through to the original string.
  return Object.hasOwn(ELEVENLABS_VOICE_ID_BY_NAME, key) ? ELEVENLABS_VOICE_ID_BY_NAME[key] : voice;
}
|
|
2913
|
+
/**
 * ElevenLabs model identifiers accepted as `model_id` by the TTS endpoint.
 * Frozen so the shared table cannot be mutated at runtime.
 */
var ElevenLabsModel = Object.freeze({
  V3: "eleven_v3",
  FLASH_V2_5: "eleven_flash_v2_5",
  TURBO_V2_5: "eleven_turbo_v2_5",
  MULTILINGUAL_V2: "eleven_multilingual_v2",
  MONOLINGUAL_V1: "eleven_monolingual_v1"
});

/**
 * ElevenLabs `output_format` query values (codec + sample rate, and bitrate
 * for MP3). Frozen so the shared table cannot be mutated at runtime.
 */
var ElevenLabsOutputFormat = Object.freeze({
  MP3_22050_32: "mp3_22050_32",
  MP3_44100_32: "mp3_44100_32",
  MP3_44100_64: "mp3_44100_64",
  MP3_44100_96: "mp3_44100_96",
  MP3_44100_128: "mp3_44100_128",
  MP3_44100_192: "mp3_44100_192",
  PCM_8000: "pcm_8000",
  PCM_16000: "pcm_16000",
  PCM_22050: "pcm_22050",
  PCM_24000: "pcm_24000",
  PCM_44100: "pcm_44100",
  ULAW_8000: "ulaw_8000"
});
|
|
2934
|
+
/**
 * ElevenLabs text-to-speech over the REST streaming endpoint.
 *
 * Accepts either positional arguments `(apiKey, voiceId, modelId, outputFormat)`
 * or `(apiKey, options)` where `options` may carry `voiceId`, `modelId`,
 * `outputFormat`, `voiceSettings`, `languageCode` and `chunkSize`.
 */
var ElevenLabsTTS = class _ElevenLabsTTS {
  // Stable pricing/dashboard key — read by stream-handler / metrics via
  // ``(agent.tts.constructor as any).providerKey``. Without this the cost
  // calculator falls back to ``constructor.name`` ("ElevenLabsTTS") which
  // does NOT match the pricing table key "elevenlabs", silently zeroing
  // TTS cost for callers that construct the raw REST class directly
  // (exposed at top level as ``ElevenLabsRestTTS``).
  static providerKey = "elevenlabs";
  apiKey;
  voiceId;
  modelId;
  _outputFormat;
  _outputFormatExplicit;
  voiceSettings;
  languageCode;
  chunkSize;
  /**
   * Public view of the (possibly auto-flipped) wire format. Read by the
   * stream-handler to decide whether to skip the client-side resample +
   * mulaw encode when the bytes are already in the carrier's wire codec.
   */
  get outputFormat() {
    return this._outputFormat;
  }
  /**
   * @param {string} apiKey - ElevenLabs API key, sent as `xi-api-key`.
   * @param {string | object} [voiceIdOrOptions] - Voice name/ID, or an options
   *   object. `null` is treated like an omitted voice (default voice).
   * @param {string} [modelId] - Model ID (positional form only).
   * @param {string} [outputFormat] - Output format (positional form only).
   * @throws {RangeError} If `options.chunkSize` is not a number >= 1.
   */
  constructor(apiKey, voiceIdOrOptions = "21m00Tcm4TlvDq8ikWAM", modelId = ElevenLabsModel.FLASH_V2_5, outputFormat = ElevenLabsOutputFormat.PCM_16000) {
    this.apiKey = apiKey;
    // `typeof null === "object"`, so exclude null explicitly instead of
    // crashing on `o.voiceId` below.
    if (typeof voiceIdOrOptions === "object" && voiceIdOrOptions !== null) {
      const o = voiceIdOrOptions;
      const chunkSize = o.chunkSize ?? 4096;
      // synthesizeStream() advances its slice offset by `chunkSize`: zero or a
      // negative step never terminates, and NaN / fractions below 1 produce
      // empty slices. Reject those at configuration time.
      if (typeof chunkSize !== "number" || !(chunkSize >= 1)) {
        throw new RangeError(`ElevenLabs TTS: chunkSize must be a number >= 1, got ${String(chunkSize)}`);
      }
      this.voiceId = resolveVoiceId(o.voiceId ?? "21m00Tcm4TlvDq8ikWAM");
      this.modelId = o.modelId ?? ElevenLabsModel.FLASH_V2_5;
      this._outputFormatExplicit = o.outputFormat !== void 0;
      this._outputFormat = o.outputFormat ?? ElevenLabsOutputFormat.PCM_16000;
      this.voiceSettings = o.voiceSettings;
      this.languageCode = o.languageCode;
      this.chunkSize = chunkSize;
    } else {
      this.voiceId = resolveVoiceId(voiceIdOrOptions ?? "21m00Tcm4TlvDq8ikWAM");
      this.modelId = modelId;
      // The positional form cannot tell an omitted outputFormat from an
      // explicit "pcm_16000", so only a non-default value counts as explicit.
      this._outputFormatExplicit = outputFormat !== ElevenLabsOutputFormat.PCM_16000;
      this._outputFormat = outputFormat;
      this.voiceSettings = void 0;
      this.languageCode = void 0;
      this.chunkSize = 4096;
    }
  }
  /**
   * Hook called by ``StreamHandler.initPipeline`` to advise the carrier
   * wire format. When the user did NOT pass an explicit ``outputFormat``,
   * auto-flip to the carrier's native codec so the audio bytes ElevenLabs
   * returns are already in Twilio/Telnyx wire format — eliminating the
   * client-side 16 kHz → 8 kHz resample and PCM → μ-law encode. The
   * resample/encode chain was a source of audible artifacts on the
   * prewarmed firstMessage (see 0.6.2 acceptance notes — burst delivery
   * of resampled audio crackled on the carrier-side jitter buffer).
   *
   * No-op when the caller passed an explicit ``outputFormat`` (incl. via
   * the ``forTwilio`` / ``forTelnyx`` factories) — user wins.
   *
   * Parity with {@link ElevenLabsWebSocketTTS.setTelephonyCarrier}.
   *
   * @param {string} carrier - "twilio" or "telnyx"; other values are ignored.
   */
  setTelephonyCarrier(carrier) {
    if (this._outputFormatExplicit) return;
    if (carrier === "twilio") {
      this._outputFormat = ElevenLabsOutputFormat.ULAW_8000;
    } else if (carrier === "telnyx") {
      this._outputFormat = ElevenLabsOutputFormat.PCM_16000;
    }
  }
  /**
   * Construct an instance pre-configured for Twilio Media Streams.
   *
   * Sets `outputFormat='ulaw_8000'` so ElevenLabs emits μ-law @ 8 kHz
   * directly — the exact wire format Twilio's media stream uses — letting
   * the SDK skip the 16 kHz→8 kHz resample and PCM→μ-law conversion in
   * `TwilioAudioSender`. Saves ~30–80 ms first-byte and per-frame CPU,
   * and removes a potential aliasing source.
   *
   * `voiceSettings` defaults to a low-bandwidth-friendly profile
   * (speaker boost off, modest stability) which sounds cleaner at 8 kHz
   * μ-law than the studio default. Pass an explicit object to override.
   */
  static forTwilio(apiKey, options = {}) {
    const voiceSettings = options.voiceSettings ?? {
      // Speaker boost adds high-frequency emphasis that aliases ugly over an
      // 8 kHz μ-law line. Slightly higher stability tames the excursions
      // that compander quantization noise can amplify.
      stability: 0.6,
      similarity_boost: 0.75,
      use_speaker_boost: false
    };
    return new _ElevenLabsTTS(apiKey, {
      ...options,
      voiceSettings,
      outputFormat: ElevenLabsOutputFormat.ULAW_8000
    });
  }
  /**
   * Construct an instance pre-configured for Telnyx bidirectional media.
   *
   * Telnyx's default media-streaming codec is L16 PCM @ 16 kHz, which
   * matches our default Telnyx handler. We pick `pcm_16000` so the audio
   * flows end-to-end with zero resampling or transcoding.
   *
   * Trade-off: if your Telnyx profile is pinned to PCMU/8000 (μ-law),
   * construct `ElevenLabsTTS` directly with `outputFormat: 'ulaw_8000'`
   * — Telnyx supports that natively too.
   */
  static forTelnyx(apiKey, options = {}) {
    return new _ElevenLabsTTS(apiKey, {
      ...options,
      outputFormat: ElevenLabsOutputFormat.PCM_16000
    });
  }
  /**
   * Synthesise text to speech and return the full audio as a single Buffer.
   *
   * For large chunks (or when latency matters) call `synthesizeStream` instead.
   *
   * @param {string} text - Text to synthesise.
   * @returns {Promise<Buffer>} All streamed chunks concatenated.
   */
  async synthesize(text) {
    const chunks = [];
    for await (const chunk of this.synthesizeStream(text)) {
      chunks.push(chunk);
    }
    return Buffer.concat(chunks);
  }
  /**
   * Synthesise text and yield audio chunks as they arrive (streaming).
   *
   * The yielded buffers are raw PCM at 16 kHz (or whatever `outputFormat` is
   * configured to). `chunkSize` controls the maximum yield size — 512 is a
   * good choice for low-latency telephony.
   *
   * @param {string} text - Text to synthesise.
   * @yields {Buffer} Audio slices of at most `chunkSize` bytes.
   * @throws {Error} On a non-2xx response or a missing response body; the
   *   request is also aborted after 30 s.
   */
  async *synthesizeStream(text) {
    const url = `${ELEVENLABS_BASE_URL}/text-to-speech/${encodeURIComponent(this.voiceId)}/stream?output_format=${encodeURIComponent(this._outputFormat)}`;
    const body = {
      text,
      model_id: this.modelId
    };
    if (this.voiceSettings) body["voice_settings"] = this.voiceSettings;
    if (this.languageCode) body["language_code"] = this.languageCode;
    const response = await fetch(url, {
      method: "POST",
      headers: {
        "xi-api-key": this.apiKey,
        "Content-Type": "application/json"
      },
      body: JSON.stringify(body),
      signal: AbortSignal.timeout(3e4)
    });
    if (!response.ok) {
      const errBody = await response.text();
      throw new Error(`ElevenLabs TTS error ${response.status}: ${errBody}`);
    }
    if (!response.body) {
      throw new Error("ElevenLabs TTS: no response body");
    }
    const reader = response.body.getReader();
    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        if (!value || value.length === 0) continue;
        const buf = Buffer.from(value);
        // Re-slice each network chunk so no yielded buffer exceeds chunkSize.
        for (let offset = 0; offset < buf.length; offset += this.chunkSize) {
          yield buf.subarray(offset, Math.min(offset + this.chunkSize, buf.length));
        }
      }
    } finally {
      // Runs on normal completion, on error, and when the consumer stops
      // iterating early: cancel the download, then release the reader lock.
      if (typeof reader.cancel === "function") await reader.cancel().catch(() => {
      });
      reader.releaseLock();
    }
  }
};
|
|
3108
|
+
|
|
3109
|
+
// src/providers/cartesia-tts.ts
|
|
3110
|
+
init_esm_shims();
|
|
3111
|
+
/** Default Cartesia REST base URL. */
var CARTESIA_BASE_URL = "https://api.cartesia.ai";
/** Default value sent in the `Cartesia-Version` request header. */
var CARTESIA_API_VERSION = "2025-04-16";
/** Voice ID used when the caller does not pass `opts.voice`. */
var CARTESIA_DEFAULT_VOICE_ID = "f786b574-daa5-4673-aa0c-cbe3e8534c02";

// The tables below are shared enum-style constants; they are frozen so they
// cannot be mutated at runtime.

/** Cartesia model identifiers (`model_id`). */
var CartesiaTTSModel = Object.freeze({
  SONIC_3: "sonic-3",
  SONIC_2: "sonic-2",
  SONIC: "sonic"
});
/** `output_format.container` values. */
var CartesiaTTSContainer = Object.freeze({
  RAW: "raw",
  WAV: "wav",
  MP3: "mp3"
});
/** `output_format.encoding` values. */
var CartesiaTTSEncoding = Object.freeze({
  PCM_S16LE: "pcm_s16le",
  PCM_F32LE: "pcm_f32le",
  PCM_MULAW: "pcm_mulaw",
  PCM_ALAW: "pcm_alaw"
});
/** `output_format.sample_rate` values, in Hz. */
var CartesiaTTSSampleRate = Object.freeze({
  HZ_8000: 8e3,
  HZ_16000: 16e3,
  HZ_22050: 22050,
  HZ_24000: 24e3,
  HZ_44100: 44100
});
/** `voice.mode` values. */
var CartesiaTTSVoiceMode = Object.freeze({
  ID: "id",
  EMBEDDING: "embedding"
});
|
|
3141
|
+
/**
 * Cartesia text-to-speech over the HTTP `/tts/bytes` endpoint.
 *
 * Output is always requested as raw PCM_S16LE at the configured `sampleRate`.
 */
var CartesiaTTS = class _CartesiaTTS {
  /** Stable pricing/dashboard key — read by stream-handler/metrics. */
  static providerKey = "cartesia_tts";
  apiKey;
  model;
  voice;
  language;
  sampleRate;
  speed;
  emotion;
  volume;
  baseUrl;
  apiVersion;
  /**
   * @param {string} apiKey - Cartesia API key, sent as `X-API-Key`.
   * @param {object} [opts] - Optional `model`, `voice`, `language`,
   *   `sampleRate`, `speed`, `emotion` (string or array), `volume`, `baseUrl`
   *   and `apiVersion`. Unset values fall back to the module defaults.
   */
  constructor(apiKey, opts = {}) {
    this.apiKey = apiKey;
    this.model = opts.model ?? CartesiaTTSModel.SONIC_3;
    this.voice = opts.voice ?? CARTESIA_DEFAULT_VOICE_ID;
    this.language = opts.language ?? "en";
    this.sampleRate = opts.sampleRate ?? CartesiaTTSSampleRate.HZ_16000;
    this.speed = opts.speed;
    // Stored as an array; buildPayload() sends only the first entry.
    this.emotion = typeof opts.emotion === "string" ? [opts.emotion] : opts.emotion;
    this.volume = opts.volume;
    this.baseUrl = opts.baseUrl ?? CARTESIA_BASE_URL;
    this.apiVersion = opts.apiVersion ?? CARTESIA_API_VERSION;
  }
  /**
   * Construct an instance pre-configured for Twilio Media Streams.
   *
   * Sets `sampleRate=8000` so Cartesia emits PCM_S16LE @ 8 kHz directly.
   * Twilio's media stream uses μ-law @ 8 kHz so the SDK still does the
   * PCM → μ-law transcode client-side, but the 16 kHz → 8 kHz resample
   * step is skipped. Saves ~10–30 ms first-byte plus per-frame CPU and
   * removes a potential aliasing source.
   */
  static forTwilio(apiKey, options = {}) {
    return new _CartesiaTTS(apiKey, {
      ...options,
      sampleRate: CartesiaTTSSampleRate.HZ_8000
    });
  }
  /**
   * Construct an instance pre-configured for Telnyx bidirectional media.
   *
   * Sets `sampleRate=16000` to match Telnyx's L16/16000 default codec —
   * audio flows end-to-end with zero resampling or transcoding. Same as
   * the bare-constructor default; exists for API symmetry with
   * {@link CartesiaTTS.forTwilio}.
   */
  static forTelnyx(apiKey, options = {}) {
    return new _CartesiaTTS(apiKey, {
      ...options,
      sampleRate: CartesiaTTSSampleRate.HZ_16000
    });
  }
  /**
   * Build the JSON payload for the Cartesia bytes endpoint.
   *
   * @param {string} text - Transcript to synthesise.
   * @returns {object} Request body; `generation_config` is attached only when
   *   at least one of speed / emotion / volume is set.
   */
  buildPayload(text) {
    const payload = {
      model_id: this.model,
      voice: { mode: CartesiaTTSVoiceMode.ID, id: this.voice },
      transcript: text,
      output_format: {
        container: CartesiaTTSContainer.RAW,
        encoding: CartesiaTTSEncoding.PCM_S16LE,
        sample_rate: this.sampleRate
      },
      language: this.language
    };
    const generationConfig = {};
    if (this.speed !== void 0) generationConfig.speed = this.speed;
    if (this.emotion && this.emotion.length > 0)
      generationConfig.emotion = this.emotion[0];
    if (this.volume !== void 0) generationConfig.volume = this.volume;
    if (Object.keys(generationConfig).length > 0) {
      payload.generation_config = generationConfig;
    }
    return payload;
  }
  /**
   * Pre-call HTTP warmup for the Cartesia `/tts/bytes` endpoint.
   *
   * Issues a lightweight `GET <baseUrl>/voices` so DNS, TLS, and HTTP/2
   * are already up by the time the first `synthesizeStream()` POST
   * lands. Best-effort: 5 s timeout, all exceptions swallowed at
   * debug level.
   *
   * Billing safety: `GET /voices` is a free metadata read on
   * Cartesia's REST surface (per https://docs.cartesia.ai). It does
   * not consume synthesis credits. The actual synthesis is billed
   * only when `POST /tts/bytes` runs with a non-empty `transcript`.
   *
   * Note: Cartesia TTS uses the HTTP path (vs the WebSocket variant
   * Cartesia also exposes) — connection warmup is therefore HTTP-GET
   * based, not WebSocket pre-handshake. The latency win is smaller
   * (~50-150 ms vs the ~200-500 ms of a WS prewarm) but still real.
   */
  async warmup() {
    try {
      const response = await fetch(`${this.baseUrl}/voices`, {
        method: "GET",
        headers: {
          "X-API-Key": this.apiKey,
          "Cartesia-Version": this.apiVersion
        },
        signal: AbortSignal.timeout(5e3)
      });
      // Drain the body: Node's fetch keeps the underlying connection busy
      // until the response body is consumed or cancelled, so an unread body
      // would keep the warmed connection from being reused by the next POST.
      // Still covered by the 5 s abort signal and the catch below.
      await response.arrayBuffer();
    } catch (err) {
      getLogger().debug(`Cartesia TTS warmup failed (best-effort): ${String(err)}`);
    }
  }
  /**
   * Synthesize text and return the concatenated audio buffer.
   *
   * @param {string} text - Text to synthesise.
   * @returns {Promise<Buffer>} All streamed chunks concatenated.
   */
  async synthesize(text) {
    const chunks = [];
    for await (const chunk of this.synthesizeStream(text)) {
      chunks.push(chunk);
    }
    return Buffer.concat(chunks);
  }
  /**
   * Synthesize text and yield raw PCM_S16LE chunks at the configured
   * `sampleRate` as they arrive from Cartesia.
   *
   * @param {string} text - Text to synthesise.
   * @yields {Buffer} One buffer per non-empty network chunk.
   * @throws {Error} On a non-2xx response or a missing response body; the
   *   request is also aborted after 30 s.
   */
  async *synthesizeStream(text) {
    const response = await fetch(`${this.baseUrl}/tts/bytes`, {
      method: "POST",
      headers: {
        "X-API-Key": this.apiKey,
        "Cartesia-Version": this.apiVersion,
        "Content-Type": "application/json"
      },
      body: JSON.stringify(this.buildPayload(text)),
      signal: AbortSignal.timeout(3e4)
    });
    if (!response.ok) {
      const body = await response.text();
      throw new Error(`Cartesia TTS error ${response.status}: ${body}`);
    }
    if (!response.body) {
      throw new Error("Cartesia TTS: no response body");
    }
    const reader = response.body.getReader();
    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        if (value && value.length > 0) {
          yield Buffer.from(value);
        }
      }
    } finally {
      // Runs on completion, error, or early consumer exit: cancel the
      // download, then release the reader lock.
      if (typeof reader.cancel === "function")
        await reader.cancel().catch(() => {
        });
      reader.releaseLock();
    }
  }
};
|
|
3297
|
+
|
|
3298
|
+
// src/providers/rime-tts.ts
|
|
3299
|
+
init_esm_shims();
|
|
3300
|
+
/** Default Rime TTS endpoint. */
var RIME_BASE_URL = "https://users.rime.ai/v1/rime-tts";

// The tables below are shared enum-style constants; they are frozen so they
// cannot be mutated at runtime.

/** Rime model identifiers (`modelId`). */
var RimeModel = Object.freeze({
  ARCANA: "arcana",
  MIST: "mist",
  MIST_V2: "mistv2"
});
/** MIME types usable in the `accept` header to select the audio format. */
var RimeAudioFormat = Object.freeze({
  PCM: "audio/pcm",
  MP3: "audio/mp3",
  WAV: "audio/wav",
  MULAW: "audio/mulaw"
});

/** Total request timeout for the Arcana model: 4 minutes, in ms. */
var ARCANA_MODEL_TIMEOUT_MS = 60 * 4 * 1e3;
/** Total request timeout for every other model: 30 seconds, in ms. */
var MIST_MODEL_TIMEOUT_MS = 30 * 1e3;

/**
 * Whether `model` belongs to the Mist family.
 *
 * Substring match, so both "mist" and "mistv2" qualify.
 *
 * @param {string} model - Rime model identifier.
 * @returns {boolean}
 */
function isMistModel(model) {
  return model.includes(RimeModel.MIST);
}

/**
 * Total request timeout for a model.
 *
 * @param {string} model - Rime model identifier.
 * @returns {number} Arcana's timeout for exactly "arcana"; the Mist timeout
 *   for anything else (including unknown models).
 */
function timeoutForModel(model) {
  if (model === RimeModel.ARCANA) return ARCANA_MODEL_TIMEOUT_MS;
  return MIST_MODEL_TIMEOUT_MS;
}
|
|
3321
|
+
/**
 * Rime text-to-speech over HTTP.
 *
 * Requests `audio/pcm` and streams the response body back as Buffers. Which
 * tuning options are sent depends on the model family (Arcana vs Mist).
 */
var RimeTTS = class {
  /** Stable pricing/dashboard key — read by stream-handler/metrics. */
  static providerKey = "rime";
  apiKey;
  model;
  speaker;
  lang;
  sampleRate;
  repetitionPenalty;
  temperature;
  topP;
  maxTokens;
  speedAlpha;
  reduceLatency;
  pauseBetweenBrackets;
  phonemizeBetweenBrackets;
  baseUrl;
  totalTimeoutMs;
  /**
   * @param {string} apiKey - Rime API key, sent as a Bearer token.
   * @param {object} [opts] - Optional model, speaker, language, sample rate,
   *   model-specific tuning values and `baseUrl`.
   */
  constructor(apiKey, opts = {}) {
    const model = opts.model ?? RimeModel.ARCANA;
    // The default speaker depends on the model family.
    const fallbackSpeaker = isMistModel(model) ? "cove" : "astra";
    Object.assign(this, {
      apiKey,
      model,
      speaker: opts.speaker ?? fallbackSpeaker,
      lang: opts.lang ?? "eng",
      sampleRate: opts.sampleRate ?? 16e3,
      repetitionPenalty: opts.repetitionPenalty,
      temperature: opts.temperature,
      topP: opts.topP,
      maxTokens: opts.maxTokens,
      speedAlpha: opts.speedAlpha,
      reduceLatency: opts.reduceLatency,
      pauseBetweenBrackets: opts.pauseBetweenBrackets,
      phonemizeBetweenBrackets: opts.phonemizeBetweenBrackets,
      baseUrl: opts.baseUrl ?? RIME_BASE_URL,
      totalTimeoutMs: timeoutForModel(model)
    });
  }
  /**
   * Build the JSON request body for `text`.
   *
   * Arcana and Mist models each get their own set of optional fields; any
   * other model receives only speaker, text and modelId.
   */
  buildPayload(text) {
    const payload = { speaker: this.speaker, text, modelId: this.model };
    // Copies a value into the payload only when the caller actually set it.
    const setIfDefined = (key, value) => {
      if (value !== void 0) payload[key] = value;
    };
    if (this.model === RimeModel.ARCANA) {
      setIfDefined("repetition_penalty", this.repetitionPenalty);
      setIfDefined("temperature", this.temperature);
      setIfDefined("top_p", this.topP);
      setIfDefined("max_tokens", this.maxTokens);
      payload.lang = this.lang;
      payload.samplingRate = this.sampleRate;
      return payload;
    }
    if (!isMistModel(this.model)) {
      return payload;
    }
    payload.lang = this.lang;
    payload.samplingRate = this.sampleRate;
    setIfDefined("speedAlpha", this.speedAlpha);
    // reduceLatency is only forwarded for the mistv2 model.
    if (this.model === RimeModel.MIST_V2) {
      setIfDefined("reduceLatency", this.reduceLatency);
    }
    setIfDefined("pauseBetweenBrackets", this.pauseBetweenBrackets);
    setIfDefined("phonemizeBetweenBrackets", this.phonemizeBetweenBrackets);
    return payload;
  }
  /** Collect the whole stream for `text` into a single Buffer. */
  async synthesize(text) {
    const parts = [];
    for await (const part of this.synthesizeStream(text)) {
      parts.push(part);
    }
    return Buffer.concat(parts);
  }
  /**
   * Stream audio for `text`, yielding one Buffer per non-empty network chunk
   * (raw PCM_S16LE at the configured `sampleRate`).
   *
   * Throws on a non-2xx status, a non-audio content type, or a missing body.
   */
  async *synthesizeStream(text) {
    const headers = {
      accept: RimeAudioFormat.PCM,
      Authorization: `Bearer ${this.apiKey}`,
      "content-type": "application/json"
    };
    const response = await fetch(this.baseUrl, {
      method: "POST",
      headers,
      body: JSON.stringify(this.buildPayload(text)),
      signal: AbortSignal.timeout(this.totalTimeoutMs)
    });
    if (!response.ok) {
      const detail = await response.text();
      throw new Error(`Rime TTS error ${response.status}: ${detail}`);
    }
    const contentType = response.headers.get("content-type") ?? "";
    if (!contentType.startsWith("audio")) {
      const detail = await response.text();
      throw new Error(`Rime returned non-audio response: ${detail.slice(0, 500)}`);
    }
    if (!response.body) {
      throw new Error("Rime TTS: no response body");
    }
    const reader = response.body.getReader();
    try {
      for (; ; ) {
        const result = await reader.read();
        if (result.done) break;
        const bytes = result.value;
        if (bytes && bytes.length > 0) {
          yield Buffer.from(bytes);
        }
      }
    } finally {
      // Runs on completion, error, or early consumer exit.
      if (typeof reader.cancel === "function") {
        await reader.cancel().catch(() => {
        });
      }
      reader.releaseLock();
    }
  }
};
|
|
3439
|
+
|
|
3440
|
+
// src/stt/deepgram.ts
|
|
3441
|
+
init_esm_shims();
|
|
3442
|
+
var STT = class extends DeepgramSTT {
  static providerKey = "deepgram";
  /**
   * Options-object front end for `DeepgramSTT`.
   *
   * The API key comes from `opts.apiKey`, falling back to the
   * `DEEPGRAM_API_KEY` environment variable; construction fails when
   * neither is set. Every other option has a default applied here before
   * being forwarded to the base-class constructor.
   */
  constructor(opts = {}) {
    const apiKey = opts.apiKey ?? process.env.DEEPGRAM_API_KEY;
    if (!apiKey) {
      throw new Error(
        "Deepgram STT requires an apiKey. Pass { apiKey: 'dg_...' } or set DEEPGRAM_API_KEY in the environment."
      );
    }
    const language = opts.language ?? "en";
    const model = opts.model ?? "nova-3";
    const encoding = opts.encoding ?? "linear16";
    const sampleRate = opts.sampleRate ?? 16e3;
    const tuning = {
      endpointingMs: opts.endpointingMs ?? 150,
      // An explicit `null` is passed through unchanged; only an omitted
      // (undefined) value falls back to 1000.
      utteranceEndMs: opts.utteranceEndMs === void 0 ? 1e3 : opts.utteranceEndMs,
      smartFormat: opts.smartFormat ?? true,
      interimResults: opts.interimResults ?? true
    };
    // `vadEvents` is only included when the caller supplied it.
    if (opts.vadEvents !== void 0) {
      tuning.vadEvents = opts.vadEvents;
    }
    super(apiKey, language, model, encoding, sampleRate, tuning);
  }
};
|
|
3467
|
+
|
|
3468
|
+
// src/stt/whisper.ts
|
|
3469
|
+
init_esm_shims();
|
|
3470
|
+
|
|
3471
|
+
// src/providers/whisper-stt.ts
|
|
3472
|
+
init_esm_shims();
|
|
3473
|
+
var OPENAI_TRANSCRIPTION_URL = "https://api.openai.com/v1/audio/transcriptions";
|
|
3474
|
+
var DEFAULT_BUFFER_SIZE = 16e3 * 2;
|
|
3475
|
+
var ALLOWED_MODELS = /* @__PURE__ */ new Set(["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"]);
|
|
3476
|
+
function wrapPcmInWav(pcm, sampleRate = 16e3, channels = 1, bitsPerSample = 16) {
|
|
3477
|
+
const dataSize = pcm.length;
|
|
3478
|
+
const header = Buffer.alloc(44);
|
|
3479
|
+
header.write("RIFF", 0);
|
|
3480
|
+
header.writeUInt32LE(36 + dataSize, 4);
|
|
3481
|
+
header.write("WAVE", 8);
|
|
2323
3482
|
header.write("fmt ", 12);
|
|
2324
3483
|
header.writeUInt32LE(16, 16);
|
|
2325
3484
|
header.writeUInt16LE(1, 20);
|
|
@@ -2333,6 +3492,8 @@ function wrapPcmInWav(pcm, sampleRate = 16e3, channels = 1, bitsPerSample = 16)
|
|
|
2333
3492
|
return Buffer.concat([header, pcm]);
|
|
2334
3493
|
}
|
|
2335
3494
|
var WhisperSTT = class _WhisperSTT {
|
|
3495
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
3496
|
+
static providerKey = "whisper";
|
|
2336
3497
|
apiKey;
|
|
2337
3498
|
model;
|
|
2338
3499
|
language;
|
|
@@ -2501,6 +3662,8 @@ init_esm_shims();
|
|
|
2501
3662
|
var ALLOWED_MODELS2 = /* @__PURE__ */ new Set(["gpt-4o-transcribe", "gpt-4o-mini-transcribe"]);
|
|
2502
3663
|
var DEFAULT_BUFFER_SIZE2 = 16e3 * 2;
|
|
2503
3664
|
var OpenAITranscribeSTT = class extends WhisperSTT {
|
|
3665
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
3666
|
+
static providerKey = "openai_transcribe";
|
|
2504
3667
|
/**
|
|
2505
3668
|
* @param apiKey OpenAI API key.
|
|
2506
3669
|
* @param language ISO-639-1 language code (e.g. ``"en"``, ``"it"``). Optional.
|
|
@@ -2576,6 +3739,8 @@ var CartesiaSTT = class {
|
|
|
2576
3739
|
}
|
|
2577
3740
|
apiKey;
|
|
2578
3741
|
options;
|
|
3742
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
3743
|
+
static providerKey = "cartesia_stt";
|
|
2579
3744
|
ws = null;
|
|
2580
3745
|
callbacks = /* @__PURE__ */ new Set();
|
|
2581
3746
|
keepaliveTimer = null;
|
|
@@ -2584,6 +3749,37 @@ var CartesiaSTT = class {
|
|
|
2584
3749
|
* `null` until the first transcript event arrives (matches Python's `None`).
|
|
2585
3750
|
*/
|
|
2586
3751
|
requestId = null;
|
|
3752
|
+
/**
|
|
3753
|
+
* Open a fresh WebSocket without arming any message / keepalive handlers
|
|
3754
|
+
* and without taking ownership on `this.ws`. Returns the OPEN socket so
|
|
3755
|
+
* the caller (the prewarm pipeline) can park it for later adoption via
|
|
3756
|
+
* `adoptWebSocket`. Bounded by `CONNECT_TIMEOUT_MS`.
|
|
3757
|
+
*
|
|
3758
|
+
* Billing safety: opening + parking the WS does not stream audio
|
|
3759
|
+
* (Cartesia STT bills on streamed audio seconds), so no charge is
|
|
3760
|
+
* incurred. Close the returned WS yourself if it is never adopted.
|
|
3761
|
+
*/
|
|
3762
|
+
async openParkedConnection() {
|
|
3763
|
+
const url = this.buildWsUrl();
|
|
3764
|
+
const ws = new WebSocket2(url, {
|
|
3765
|
+
headers: { "User-Agent": USER_AGENT }
|
|
3766
|
+
});
|
|
3767
|
+
await new Promise((resolve, reject) => {
|
|
3768
|
+
const timer = setTimeout(
|
|
3769
|
+
() => reject(new Error("Cartesia STT park connect timeout")),
|
|
3770
|
+
CONNECT_TIMEOUT_MS
|
|
3771
|
+
);
|
|
3772
|
+
ws.once("open", () => {
|
|
3773
|
+
clearTimeout(timer);
|
|
3774
|
+
resolve();
|
|
3775
|
+
});
|
|
3776
|
+
ws.once("error", (err) => {
|
|
3777
|
+
clearTimeout(timer);
|
|
3778
|
+
reject(err);
|
|
3779
|
+
});
|
|
3780
|
+
});
|
|
3781
|
+
return ws;
|
|
3782
|
+
}
|
|
2587
3783
|
buildWsUrl() {
|
|
2588
3784
|
const opts = this.options;
|
|
2589
3785
|
const rawBase = opts.baseUrl ?? DEFAULT_BASE_URL;
|
|
@@ -2608,6 +3804,57 @@ var CartesiaSTT = class {
|
|
|
2608
3804
|
});
|
|
2609
3805
|
return `${base}/stt/websocket?${params.toString()}`;
|
|
2610
3806
|
}
|
|
3807
|
+
/**
|
|
3808
|
+
* Pre-call WebSocket warmup for the Cartesia STT `/stt/websocket` endpoint.
|
|
3809
|
+
*
|
|
3810
|
+
* Opens the WS (DNS + TLS + auth handshake), idles ~250 ms so the
|
|
3811
|
+
* Cartesia edge keeps session state warm, then closes. By the time
|
|
3812
|
+
* `connect()` is invoked at call-pickup the resolver and TLS session
|
|
3813
|
+
* are hot — net wire time saving of 200-500 ms.
|
|
3814
|
+
*
|
|
3815
|
+
* Billing safety: Cartesia STT bills on streamed audio seconds (per
|
|
3816
|
+
* https://docs.cartesia.ai/2025-04-16/api-reference/stt/stt). Opening
|
|
3817
|
+
* + closing the WebSocket without forwarding audio does not consume
|
|
3818
|
+
* billable seconds. Best-effort: failures logged at debug level.
|
|
3819
|
+
*/
|
|
3820
|
+
async warmup() {
|
|
3821
|
+
const url = this.buildWsUrl();
|
|
3822
|
+
let ws = null;
|
|
3823
|
+
try {
|
|
3824
|
+
ws = await new Promise((resolve, reject) => {
|
|
3825
|
+
const sock = new WebSocket2(url, {
|
|
3826
|
+
headers: { "User-Agent": USER_AGENT }
|
|
3827
|
+
});
|
|
3828
|
+
const timer = setTimeout(() => {
|
|
3829
|
+
try {
|
|
3830
|
+
sock.close();
|
|
3831
|
+
} catch {
|
|
3832
|
+
}
|
|
3833
|
+
reject(new Error("Cartesia STT warmup connect timeout"));
|
|
3834
|
+
}, 5e3);
|
|
3835
|
+
sock.once("open", () => {
|
|
3836
|
+
clearTimeout(timer);
|
|
3837
|
+
resolve(sock);
|
|
3838
|
+
});
|
|
3839
|
+
sock.once("error", (err) => {
|
|
3840
|
+
clearTimeout(timer);
|
|
3841
|
+
reject(err);
|
|
3842
|
+
});
|
|
3843
|
+
});
|
|
3844
|
+
await new Promise((r) => setTimeout(r, 250));
|
|
3845
|
+
} catch (err) {
|
|
3846
|
+
getLogger().debug(
|
|
3847
|
+
`Cartesia STT warmup failed (best-effort): ${describeWarmupError(err)}`
|
|
3848
|
+
);
|
|
3849
|
+
} finally {
|
|
3850
|
+
if (ws) {
|
|
3851
|
+
try {
|
|
3852
|
+
ws.close();
|
|
3853
|
+
} catch {
|
|
3854
|
+
}
|
|
3855
|
+
}
|
|
3856
|
+
}
|
|
3857
|
+
}
|
|
2611
3858
|
/** Open the streaming WebSocket and arm message + keepalive handlers. */
|
|
2612
3859
|
async connect() {
|
|
2613
3860
|
const url = this.buildWsUrl();
|
|
@@ -2628,6 +3875,24 @@ var CartesiaSTT = class {
|
|
|
2628
3875
|
reject(err);
|
|
2629
3876
|
});
|
|
2630
3877
|
});
|
|
3878
|
+
this.armMessageAndKeepalive();
|
|
3879
|
+
}
|
|
3880
|
+
/**
|
|
3881
|
+
* Adopt a pre-opened, already-OPEN WebSocket produced by the prewarm
|
|
3882
|
+
* pipeline (see `Patter.parkProviderConnections`). Skips the fresh
|
|
3883
|
+
* `new WebSocket()` + handshake — the WS is already through DNS, TLS
|
|
3884
|
+
* and HTTP-101 so audio frames can flow on this turn instead of
|
|
3885
|
+
* paying ~150-400 ms of handshake.
|
|
3886
|
+
*
|
|
3887
|
+
* Caller MUST verify `ws.readyState === OPEN` before calling. If the
|
|
3888
|
+
* parked WS died between park and adopt, fall back to `connect()`.
|
|
3889
|
+
*/
|
|
3890
|
+
adoptWebSocket(ws) {
|
|
3891
|
+
this.ws = ws;
|
|
3892
|
+
this.armMessageAndKeepalive();
|
|
3893
|
+
}
|
|
3894
|
+
armMessageAndKeepalive() {
|
|
3895
|
+
if (!this.ws) return;
|
|
2631
3896
|
this.ws.on("message", (raw) => {
|
|
2632
3897
|
let event;
|
|
2633
3898
|
try {
|
|
@@ -2675,6 +3940,31 @@ var CartesiaSTT = class {
|
|
|
2675
3940
|
if (!this.ws || this.ws.readyState !== WebSocket2.OPEN) return;
|
|
2676
3941
|
this.ws.send(audio);
|
|
2677
3942
|
}
|
|
3943
|
+
/**
|
|
3944
|
+
* Force Cartesia to finalise the in-flight utterance immediately.
|
|
3945
|
+
*
|
|
3946
|
+
* Sends a ``finalize`` text frame on the live WebSocket. Cartesia
|
|
3947
|
+
* replies with the final transcript followed by ``flush_done``,
|
|
3948
|
+
* bypassing its conservative internal silence heuristic (which can
|
|
3949
|
+
* wait 2-7 s on PSTN audio before naturally finalising). Wired
|
|
3950
|
+
* into ``StreamHandler`` on the VAD ``speech_end`` event so the
|
|
3951
|
+
* SDK's authoritative end-of-speech detection forces an immediate
|
|
3952
|
+
* STT finalisation — turning Cartesia's natural-pause endpointing
|
|
3953
|
+
* into a deterministic VAD-driven one, parity with the Deepgram
|
|
3954
|
+
* fast-path. No-op when the WS isn't open. Parity with Python
|
|
3955
|
+
* ``CartesiaSTT.finalize``.
|
|
3956
|
+
*/
|
|
3957
|
+
async finalize() {
|
|
3958
|
+
if (!this.ws || this.ws.readyState !== WebSocket2.OPEN) return;
|
|
3959
|
+
await new Promise((resolve) => {
|
|
3960
|
+
this.ws.send(CartesiaSTTClientFrame.FINALIZE, (err) => {
|
|
3961
|
+
if (err) {
|
|
3962
|
+
getLogger().debug(`Cartesia finalize send failed: ${String(err)}`);
|
|
3963
|
+
}
|
|
3964
|
+
resolve();
|
|
3965
|
+
});
|
|
3966
|
+
});
|
|
3967
|
+
}
|
|
2678
3968
|
/** Register a transcript listener. */
|
|
2679
3969
|
onTranscript(callback) {
|
|
2680
3970
|
this.callbacks.add(callback);
|
|
@@ -2748,6 +4038,17 @@ var CartesiaSTT = class {
|
|
|
2748
4038
|
}
|
|
2749
4039
|
}
|
|
2750
4040
|
};
|
|
4041
|
+
/**
 * Produce a short label for a warmup failure without echoing its message.
 *
 * Precedence for object values: numeric `statusCode`, then a numeric `code`
 * in the range 100-599 (both rendered as `HTTP <n>`), then the constructor
 * name unless it is plain "Object", then a string `name` property.
 * Everything else (including primitives and `null`) yields `typeof err`.
 */
function describeWarmupError(err) {
  if (err === null || typeof err !== "object") {
    return typeof err;
  }
  const status = err.statusCode;
  if (typeof status === "number") {
    return `HTTP ${status}`;
  }
  const code = err.code;
  const looksLikeHttpStatus = typeof code === "number" && code >= 100 && code < 600;
  if (looksLikeHttpStatus) {
    return `HTTP ${code}`;
  }
  const className = err.constructor?.name;
  if (typeof className === "string" && className !== "Object") {
    return className;
  }
  const label = err.name;
  return typeof label === "string" ? label : typeof err;
}
|
|
2751
4052
|
|
|
2752
4053
|
// src/stt/cartesia.ts
|
|
2753
4054
|
var STT4 = class extends CartesiaSTT {
|
|
@@ -2826,6 +4127,8 @@ var TokenAccumulator = class {
|
|
|
2826
4127
|
}
|
|
2827
4128
|
};
|
|
2828
4129
|
var SonioxSTT = class _SonioxSTT {
|
|
4130
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
4131
|
+
static providerKey = "soniox";
|
|
2829
4132
|
ws = null;
|
|
2830
4133
|
callbacks = [];
|
|
2831
4134
|
final = new TokenAccumulator();
|
|
@@ -3103,6 +4406,8 @@ var AssemblyAISTT = class _AssemblyAISTT {
|
|
|
3103
4406
|
}
|
|
3104
4407
|
apiKey;
|
|
3105
4408
|
options;
|
|
4409
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
4410
|
+
static providerKey = "assemblyai";
|
|
3106
4411
|
ws = null;
|
|
3107
4412
|
callbacks = /* @__PURE__ */ new Set();
|
|
3108
4413
|
closing = false;
|
|
@@ -3192,6 +4497,62 @@ var AssemblyAISTT = class _AssemblyAISTT {
|
|
|
3192
4497
|
}
|
|
3193
4498
|
return headers;
|
|
3194
4499
|
}
|
|
4500
|
+
/**
|
|
4501
|
+
* Pre-call WebSocket warmup for the AssemblyAI v3 `/v3/ws` endpoint.
|
|
4502
|
+
*
|
|
4503
|
+
* Opens the WS (DNS + TLS + auth handshake), idles ~250 ms so the
|
|
4504
|
+
* AssemblyAI edge keeps the session state warm, then sends Terminate
|
|
4505
|
+
* and closes. By the time `connect()` is invoked at call-pickup the
|
|
4506
|
+
* resolver and TLS session are hot — net wire time saving of
|
|
4507
|
+
* 200-500 ms.
|
|
4508
|
+
*
|
|
4509
|
+
* Billing safety: AssemblyAI Universal Streaming bills on streamed
|
|
4510
|
+
* audio seconds (per https://www.assemblyai.com/pricing). Opening +
|
|
4511
|
+
* closing the WebSocket without forwarding any audio frames does
|
|
4512
|
+
* not consume billable seconds. Best-effort: failures logged at
|
|
4513
|
+
* debug level.
|
|
4514
|
+
*/
|
|
4515
|
+
async warmup() {
|
|
4516
|
+
const url = this.buildUrl();
|
|
4517
|
+
const headers = this.buildHeaders();
|
|
4518
|
+
let ws = null;
|
|
4519
|
+
try {
|
|
4520
|
+
ws = await new Promise((resolve, reject) => {
|
|
4521
|
+
const sock = new WebSocket4(url, { headers });
|
|
4522
|
+
const timer = setTimeout(() => {
|
|
4523
|
+
try {
|
|
4524
|
+
sock.close();
|
|
4525
|
+
} catch {
|
|
4526
|
+
}
|
|
4527
|
+
reject(new Error("AssemblyAI STT warmup connect timeout"));
|
|
4528
|
+
}, 5e3);
|
|
4529
|
+
sock.once("open", () => {
|
|
4530
|
+
clearTimeout(timer);
|
|
4531
|
+
resolve(sock);
|
|
4532
|
+
});
|
|
4533
|
+
sock.once("error", (err) => {
|
|
4534
|
+
clearTimeout(timer);
|
|
4535
|
+
reject(err);
|
|
4536
|
+
});
|
|
4537
|
+
});
|
|
4538
|
+
await new Promise((r) => setTimeout(r, 250));
|
|
4539
|
+
try {
|
|
4540
|
+
ws.send(JSON.stringify({ type: AssemblyAIClientFrame.TERMINATE }));
|
|
4541
|
+
} catch {
|
|
4542
|
+
}
|
|
4543
|
+
} catch (err) {
|
|
4544
|
+
getLogger().debug(
|
|
4545
|
+
`AssemblyAI STT warmup failed (best-effort): ${describeWarmupError2(err)}`
|
|
4546
|
+
);
|
|
4547
|
+
} finally {
|
|
4548
|
+
if (ws) {
|
|
4549
|
+
try {
|
|
4550
|
+
ws.close();
|
|
4551
|
+
} catch {
|
|
4552
|
+
}
|
|
4553
|
+
}
|
|
4554
|
+
}
|
|
4555
|
+
}
|
|
3195
4556
|
/** Open the streaming WebSocket and arm message handlers. */
|
|
3196
4557
|
async connect() {
|
|
3197
4558
|
this.closing = false;
|
|
@@ -3420,6 +4781,17 @@ function averageConfidence(words) {
|
|
|
3420
4781
|
}
|
|
3421
4782
|
return total / words.length;
|
|
3422
4783
|
}
|
|
4784
|
+
/**
 * Summarise a warmup failure as a short label; the error message itself is
 * never included in the result.
 *
 * For non-null objects the first match wins: a numeric `statusCode`, a
 * numeric `code` within 100-599 (both formatted `HTTP <n>`), a constructor
 * name other than "Object", or a string `name`. Any other value falls back
 * to its `typeof`.
 */
function describeWarmupError2(err) {
  const isObject = typeof err === "object" && err !== null;
  if (isObject) {
    let httpStatus;
    if (typeof err.statusCode === "number") {
      httpStatus = err.statusCode;
    } else if (typeof err.code === "number" && err.code >= 100 && err.code < 600) {
      httpStatus = err.code;
    }
    if (httpStatus !== void 0) {
      return `HTTP ${httpStatus}`;
    }
    const typeName = err.constructor?.name;
    const hasSpecificType = typeof typeName === "string" && typeName !== "Object";
    if (hasSpecificType) {
      return typeName;
    }
    if (typeof err.name === "string") {
      return err.name;
    }
  }
  return typeof err;
}
|
|
3423
4795
|
|
|
3424
4796
|
// src/stt/assemblyai.ts
|
|
3425
4797
|
var STT6 = class extends AssemblyAISTT {
|
|
@@ -3476,6 +4848,8 @@ var SpeechmaticsServerMessage = {
|
|
|
3476
4848
|
ERROR: "Error"
|
|
3477
4849
|
};
|
|
3478
4850
|
var SpeechmaticsSTT = class {
|
|
4851
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
4852
|
+
static providerKey = "speechmatics";
|
|
3479
4853
|
ws = null;
|
|
3480
4854
|
transcriptCallbacks = /* @__PURE__ */ new Set();
|
|
3481
4855
|
errorCallbacks = /* @__PURE__ */ new Set();
|
|
@@ -3729,275 +5103,60 @@ var SpeechmaticsSTT = class {
|
|
|
3729
5103
|
emitError(err) {
|
|
3730
5104
|
for (const cb of this.errorCallbacks) {
|
|
3731
5105
|
try {
|
|
3732
|
-
cb(err);
|
|
3733
|
-
} catch (cbErr) {
|
|
3734
|
-
getLogger().error(`SpeechmaticsSTT error callback threw: ${String(cbErr)}`);
|
|
3735
|
-
}
|
|
3736
|
-
}
|
|
3737
|
-
}
|
|
3738
|
-
handleError(err) {
|
|
3739
|
-
getLogger().error(`SpeechmaticsSTT WebSocket error: ${err.message}`);
|
|
3740
|
-
this.emitError(err);
|
|
3741
|
-
}
|
|
3742
|
-
handleClose() {
|
|
3743
|
-
if (!this.running) return;
|
|
3744
|
-
this.running = false;
|
|
3745
|
-
}
|
|
3746
|
-
/** Send `EndOfStream` and close the WebSocket. Idempotent. */
|
|
3747
|
-
close() {
|
|
3748
|
-
this.running = false;
|
|
3749
|
-
const ws = this.ws;
|
|
3750
|
-
if (!ws) return;
|
|
3751
|
-
this.ws = null;
|
|
3752
|
-
const sendSafe = (payload) => {
|
|
3753
|
-
if (ws.readyState === WebSocket5.OPEN) {
|
|
3754
|
-
try {
|
|
3755
|
-
ws.send(payload);
|
|
3756
|
-
} catch {
|
|
3757
|
-
}
|
|
3758
|
-
}
|
|
3759
|
-
};
|
|
3760
|
-
sendSafe(
|
|
3761
|
-
JSON.stringify({ message: "EndOfStream", last_seq_no: this.lastSeqNo })
|
|
3762
|
-
);
|
|
3763
|
-
try {
|
|
3764
|
-
ws.close();
|
|
3765
|
-
} catch {
|
|
3766
|
-
}
|
|
3767
|
-
}
|
|
3768
|
-
};
|
|
3769
|
-
|
|
3770
|
-
// src/stt/speechmatics.ts
|
|
3771
|
-
var STT7 = class extends SpeechmaticsSTT {
|
|
3772
|
-
static providerKey = "speechmatics";
|
|
3773
|
-
constructor(opts = {}) {
|
|
3774
|
-
const key = opts.apiKey ?? process.env.SPEECHMATICS_API_KEY;
|
|
3775
|
-
if (!key) {
|
|
3776
|
-
throw new Error(
|
|
3777
|
-
"Speechmatics STT requires an apiKey. Pass { apiKey: 'sm_...' } or set SPEECHMATICS_API_KEY in the environment."
|
|
3778
|
-
);
|
|
3779
|
-
}
|
|
3780
|
-
super(key, opts);
|
|
3781
|
-
}
|
|
3782
|
-
};
|
|
3783
|
-
|
|
3784
|
-
// src/tts/elevenlabs.ts
|
|
3785
|
-
init_esm_shims();
|
|
3786
|
-
|
|
3787
|
-
// src/providers/elevenlabs-tts.ts
|
|
3788
|
-
init_esm_shims();
|
|
3789
|
-
var ELEVENLABS_BASE_URL = "https://api.elevenlabs.io/v1";
|
|
3790
|
-
var ELEVENLABS_VOICE_ID_BY_NAME = {
|
|
3791
|
-
rachel: "21m00Tcm4TlvDq8ikWAM",
|
|
3792
|
-
drew: "29vD33N1CtxCmqQRPOHJ",
|
|
3793
|
-
clyde: "2EiwWnXFnvU5JabPnv8n",
|
|
3794
|
-
paul: "5Q0t7uMcjvnagumLfvZi",
|
|
3795
|
-
domi: "AZnzlk1XvdvUeBnXmlld",
|
|
3796
|
-
dave: "CYw3kZ02Hs0563khs1Fj",
|
|
3797
|
-
fin: "D38z5RcWu1voky8WS1ja",
|
|
3798
|
-
bella: "EXAVITQu4vr4xnSDxMaL",
|
|
3799
|
-
antoni: "ErXwobaYiN019PkySvjV",
|
|
3800
|
-
thomas: "GBv7mTt0atIp3Br8iCZE",
|
|
3801
|
-
charlie: "IKne3meq5aSn9XLyUdCD",
|
|
3802
|
-
george: "JBFqnCBsd6RMkjVDRZzb",
|
|
3803
|
-
emily: "LcfcDJNUP1GQjkzn1xUU",
|
|
3804
|
-
elli: "MF3mGyEYCl7XYWbV9V6O",
|
|
3805
|
-
callum: "N2lVS1w4EtoT3dr4eOWO",
|
|
3806
|
-
patrick: "ODq5zmih8GrVes37Dizd",
|
|
3807
|
-
harry: "SOYHLrjzK2X1ezoPC6cr",
|
|
3808
|
-
liam: "TX3LPaxmHKxFdv7VOQHJ",
|
|
3809
|
-
dorothy: "ThT5KcBeYPX3keUQqHPh",
|
|
3810
|
-
josh: "TxGEqnHWrfWFTfGW9XjX",
|
|
3811
|
-
arnold: "VR6AewLTigWG4xSOukaG",
|
|
3812
|
-
charlotte: "XB0fDUnXU5powFXDhCwa",
|
|
3813
|
-
matilda: "XrExE9yKIg1WjnnlVkGX",
|
|
3814
|
-
matthew: "Yko7PKHZNXotIFUBG7I9",
|
|
3815
|
-
james: "ZQe5CZNOzWyzPSCn5a3c",
|
|
3816
|
-
joseph: "Zlb1dXrM653N07WRdFW3",
|
|
3817
|
-
jeremy: "bVMeCyTHy58xNoL34h3p",
|
|
3818
|
-
michael: "flq6f7yk4E4fJM5XTYuZ",
|
|
3819
|
-
ethan: "g5CIjZEefAph4nQFvHAz",
|
|
3820
|
-
gigi: "jBpfuIE2acCO8z3wKNLl",
|
|
3821
|
-
freya: "jsCqWAovK2LkecY7zXl4",
|
|
3822
|
-
brian: "nPczCjzI2devNBz1zQrb",
|
|
3823
|
-
grace: "oWAxZDx7w5VEj9dCyTzz",
|
|
3824
|
-
daniel: "onwK4e9ZLuTAKqWW03F9",
|
|
3825
|
-
lily: "pFZP5JQG7iQjIQuC4Bku",
|
|
3826
|
-
serena: "pMsXgVXv3BLzUgSXRplE",
|
|
3827
|
-
adam: "pNInz6obpgDQGcFmaJgB",
|
|
3828
|
-
nicole: "piTKgcLEGmPE4e6mEKli",
|
|
3829
|
-
bill: "pqHfZKP75CvOlQylNhV4",
|
|
3830
|
-
jessie: "t0jbNlBVZ17f02VDIeMI",
|
|
3831
|
-
ryan: "wViXBPUzp2ZZixB1xQuM",
|
|
3832
|
-
sam: "yoZ06aMxZJJ28mfd3POQ",
|
|
3833
|
-
glinda: "z9fAnlkpzviPz146aGWa",
|
|
3834
|
-
giovanni: "zcAOhNBS3c14rBihAFp1",
|
|
3835
|
-
mimi: "zrHiDhphv9ZnVXBqCLjz",
|
|
3836
|
-
sarah: "EXAVITQu4vr4xnSDxMaL",
|
|
3837
|
-
alloy: "EXAVITQu4vr4xnSDxMaL"
|
|
3838
|
-
};
|
|
3839
|
-
var VOICE_ID_PATTERN = /^[A-Za-z0-9]{20}$/;
|
|
3840
|
-
function resolveVoiceId(voice) {
|
|
3841
|
-
if (!voice) return voice;
|
|
3842
|
-
if (VOICE_ID_PATTERN.test(voice)) return voice;
|
|
3843
|
-
return ELEVENLABS_VOICE_ID_BY_NAME[voice.toLowerCase()] ?? voice;
|
|
3844
|
-
}
|
|
3845
|
-
var ElevenLabsModel = {
|
|
3846
|
-
V3: "eleven_v3",
|
|
3847
|
-
FLASH_V2_5: "eleven_flash_v2_5",
|
|
3848
|
-
TURBO_V2_5: "eleven_turbo_v2_5",
|
|
3849
|
-
MULTILINGUAL_V2: "eleven_multilingual_v2",
|
|
3850
|
-
MONOLINGUAL_V1: "eleven_monolingual_v1"
|
|
3851
|
-
};
|
|
3852
|
-
var ElevenLabsOutputFormat = {
|
|
3853
|
-
MP3_22050_32: "mp3_22050_32",
|
|
3854
|
-
MP3_44100_32: "mp3_44100_32",
|
|
3855
|
-
MP3_44100_64: "mp3_44100_64",
|
|
3856
|
-
MP3_44100_96: "mp3_44100_96",
|
|
3857
|
-
MP3_44100_128: "mp3_44100_128",
|
|
3858
|
-
MP3_44100_192: "mp3_44100_192",
|
|
3859
|
-
PCM_8000: "pcm_8000",
|
|
3860
|
-
PCM_16000: "pcm_16000",
|
|
3861
|
-
PCM_22050: "pcm_22050",
|
|
3862
|
-
PCM_24000: "pcm_24000",
|
|
3863
|
-
PCM_44100: "pcm_44100",
|
|
3864
|
-
ULAW_8000: "ulaw_8000"
|
|
3865
|
-
};
|
|
3866
|
-
var ElevenLabsTTS = class _ElevenLabsTTS {
|
|
3867
|
-
apiKey;
|
|
3868
|
-
voiceId;
|
|
3869
|
-
modelId;
|
|
3870
|
-
outputFormat;
|
|
3871
|
-
voiceSettings;
|
|
3872
|
-
languageCode;
|
|
3873
|
-
chunkSize;
|
|
3874
|
-
constructor(apiKey, voiceIdOrOptions = "21m00Tcm4TlvDq8ikWAM", modelId = ElevenLabsModel.FLASH_V2_5, outputFormat = ElevenLabsOutputFormat.PCM_16000) {
|
|
3875
|
-
this.apiKey = apiKey;
|
|
3876
|
-
if (typeof voiceIdOrOptions === "object") {
|
|
3877
|
-
const o = voiceIdOrOptions;
|
|
3878
|
-
this.voiceId = resolveVoiceId(o.voiceId ?? "21m00Tcm4TlvDq8ikWAM");
|
|
3879
|
-
this.modelId = o.modelId ?? ElevenLabsModel.FLASH_V2_5;
|
|
3880
|
-
this.outputFormat = o.outputFormat ?? ElevenLabsOutputFormat.PCM_16000;
|
|
3881
|
-
this.voiceSettings = o.voiceSettings;
|
|
3882
|
-
this.languageCode = o.languageCode;
|
|
3883
|
-
this.chunkSize = o.chunkSize ?? 4096;
|
|
3884
|
-
} else {
|
|
3885
|
-
this.voiceId = resolveVoiceId(voiceIdOrOptions);
|
|
3886
|
-
this.modelId = modelId;
|
|
3887
|
-
this.outputFormat = outputFormat;
|
|
3888
|
-
this.voiceSettings = void 0;
|
|
3889
|
-
this.languageCode = void 0;
|
|
3890
|
-
this.chunkSize = 4096;
|
|
3891
|
-
}
|
|
3892
|
-
}
|
|
3893
|
-
/**
|
|
3894
|
-
* Construct an instance pre-configured for Twilio Media Streams.
|
|
3895
|
-
*
|
|
3896
|
-
* Sets `outputFormat='ulaw_8000'` so ElevenLabs emits μ-law @ 8 kHz
|
|
3897
|
-
* directly — the exact wire format Twilio's media stream uses — letting
|
|
3898
|
-
* the SDK skip the 16 kHz→8 kHz resample and PCM→μ-law conversion in
|
|
3899
|
-
* `TwilioAudioSender`. Saves ~30–80 ms first-byte and per-frame CPU,
|
|
3900
|
-
* and removes a potential aliasing source.
|
|
3901
|
-
*
|
|
3902
|
-
* `voiceSettings` defaults to a low-bandwidth-friendly profile
|
|
3903
|
-
* (speaker boost off, modest stability) which sounds cleaner at 8 kHz
|
|
3904
|
-
* μ-law than the studio default. Pass an explicit object to override.
|
|
3905
|
-
*/
|
|
3906
|
-
static forTwilio(apiKey, options = {}) {
|
|
3907
|
-
const voiceSettings = options.voiceSettings ?? {
|
|
3908
|
-
// Speaker boost adds high-frequency emphasis that aliases ugly over an
|
|
3909
|
-
// 8 kHz μ-law line. Slightly higher stability tames the excursions
|
|
3910
|
-
// that compander quantization noise can amplify.
|
|
3911
|
-
stability: 0.6,
|
|
3912
|
-
similarity_boost: 0.75,
|
|
3913
|
-
use_speaker_boost: false
|
|
3914
|
-
};
|
|
3915
|
-
return new _ElevenLabsTTS(apiKey, {
|
|
3916
|
-
...options,
|
|
3917
|
-
voiceSettings,
|
|
3918
|
-
outputFormat: ElevenLabsOutputFormat.ULAW_8000
|
|
3919
|
-
});
|
|
3920
|
-
}
|
|
3921
|
-
/**
|
|
3922
|
-
* Construct an instance pre-configured for Telnyx bidirectional media.
|
|
3923
|
-
*
|
|
3924
|
-
* Telnyx's default media-streaming codec is L16 PCM @ 16 kHz, which
|
|
3925
|
-
* matches our default Telnyx handler. We pick `pcm_16000` so the audio
|
|
3926
|
-
* flows end-to-end with zero resampling or transcoding.
|
|
3927
|
-
*
|
|
3928
|
-
* Trade-off: if your Telnyx profile is pinned to PCMU/8000 (μ-law),
|
|
3929
|
-
* construct `ElevenLabsTTS` directly with `outputFormat: 'ulaw_8000'`
|
|
3930
|
-
* — Telnyx supports that natively too.
|
|
3931
|
-
*/
|
|
3932
|
-
static forTelnyx(apiKey, options = {}) {
|
|
3933
|
-
return new _ElevenLabsTTS(apiKey, {
|
|
3934
|
-
...options,
|
|
3935
|
-
outputFormat: ElevenLabsOutputFormat.PCM_16000
|
|
3936
|
-
});
|
|
3937
|
-
}
|
|
3938
|
-
/**
|
|
3939
|
-
* Synthesise text to speech and return the full audio as a single Buffer.
|
|
3940
|
-
*
|
|
3941
|
-
* For large chunks (or when latency matters) call `synthesizeStream` instead.
|
|
3942
|
-
*/
|
|
3943
|
-
async synthesize(text) {
|
|
3944
|
-
const chunks = [];
|
|
3945
|
-
for await (const chunk of this.synthesizeStream(text)) {
|
|
3946
|
-
chunks.push(chunk);
|
|
3947
|
-
}
|
|
3948
|
-
return Buffer.concat(chunks);
|
|
3949
|
-
}
|
|
3950
|
-
/**
|
|
3951
|
-
* Synthesise text and yield audio chunks as they arrive (streaming).
|
|
3952
|
-
*
|
|
3953
|
-
* The yielded buffers are raw PCM at 16 kHz (or whatever `outputFormat` is
|
|
3954
|
-
* configured to). `chunkSize` controls the maximum yield size — 512 is a
|
|
3955
|
-
* good choice for low-latency telephony.
|
|
3956
|
-
*/
|
|
3957
|
-
async *synthesizeStream(text) {
|
|
3958
|
-
const url = `${ELEVENLABS_BASE_URL}/text-to-speech/${encodeURIComponent(this.voiceId)}/stream?output_format=${encodeURIComponent(this.outputFormat)}`;
|
|
3959
|
-
const body = {
|
|
3960
|
-
text,
|
|
3961
|
-
model_id: this.modelId
|
|
3962
|
-
};
|
|
3963
|
-
if (this.voiceSettings) body["voice_settings"] = this.voiceSettings;
|
|
3964
|
-
if (this.languageCode) body["language_code"] = this.languageCode;
|
|
3965
|
-
const response = await fetch(url, {
|
|
3966
|
-
method: "POST",
|
|
3967
|
-
headers: {
|
|
3968
|
-
"xi-api-key": this.apiKey,
|
|
3969
|
-
"Content-Type": "application/json"
|
|
3970
|
-
},
|
|
3971
|
-
body: JSON.stringify(body),
|
|
3972
|
-
signal: AbortSignal.timeout(3e4)
|
|
3973
|
-
});
|
|
3974
|
-
if (!response.ok) {
|
|
3975
|
-
const errBody = await response.text();
|
|
3976
|
-
throw new Error(`ElevenLabs TTS error ${response.status}: ${errBody}`);
|
|
3977
|
-
}
|
|
3978
|
-
if (!response.body) {
|
|
3979
|
-
throw new Error("ElevenLabs TTS: no response body");
|
|
3980
|
-
}
|
|
3981
|
-
const reader = response.body.getReader();
|
|
3982
|
-
try {
|
|
3983
|
-
while (true) {
|
|
3984
|
-
const { done, value } = await reader.read();
|
|
3985
|
-
if (done) break;
|
|
3986
|
-
if (!value || value.length === 0) continue;
|
|
3987
|
-
const buf = Buffer.from(value);
|
|
3988
|
-
for (let offset = 0; offset < buf.length; offset += this.chunkSize) {
|
|
3989
|
-
yield buf.subarray(offset, Math.min(offset + this.chunkSize, buf.length));
|
|
5106
|
+
cb(err);
|
|
5107
|
+
} catch (cbErr) {
|
|
5108
|
+
getLogger().error(`SpeechmaticsSTT error callback threw: ${String(cbErr)}`);
|
|
5109
|
+
}
|
|
5110
|
+
}
|
|
5111
|
+
}
|
|
5112
|
+
handleError(err) {
|
|
5113
|
+
getLogger().error(`SpeechmaticsSTT WebSocket error: ${err.message}`);
|
|
5114
|
+
this.emitError(err);
|
|
5115
|
+
}
|
|
5116
|
+
handleClose() {
|
|
5117
|
+
if (!this.running) return;
|
|
5118
|
+
this.running = false;
|
|
5119
|
+
}
|
|
5120
|
+
/** Send `EndOfStream` and close the WebSocket. Idempotent. */
|
|
5121
|
+
close() {
|
|
5122
|
+
this.running = false;
|
|
5123
|
+
const ws = this.ws;
|
|
5124
|
+
if (!ws) return;
|
|
5125
|
+
this.ws = null;
|
|
5126
|
+
const sendSafe = (payload) => {
|
|
5127
|
+
if (ws.readyState === WebSocket5.OPEN) {
|
|
5128
|
+
try {
|
|
5129
|
+
ws.send(payload);
|
|
5130
|
+
} catch {
|
|
3990
5131
|
}
|
|
3991
5132
|
}
|
|
3992
|
-
}
|
|
3993
|
-
|
|
3994
|
-
})
|
|
3995
|
-
|
|
5133
|
+
};
|
|
5134
|
+
sendSafe(
|
|
5135
|
+
JSON.stringify({ message: "EndOfStream", last_seq_no: this.lastSeqNo })
|
|
5136
|
+
);
|
|
5137
|
+
try {
|
|
5138
|
+
ws.close();
|
|
5139
|
+
} catch {
|
|
5140
|
+
}
|
|
5141
|
+
}
|
|
5142
|
+
};
|
|
5143
|
+
|
|
5144
|
+
// src/stt/speechmatics.ts
|
|
5145
|
+
var STT7 = class extends SpeechmaticsSTT {
  static providerKey = "speechmatics";
  /**
   * Options-object front end for `SpeechmaticsSTT`.
   *
   * Uses `opts.apiKey` when given, otherwise the `SPEECHMATICS_API_KEY`
   * environment variable, and throws when neither yields a key. The full
   * `opts` object is forwarded to the base class alongside the key.
   */
  constructor(opts = {}) {
    const resolvedKey = opts.apiKey ?? process.env.SPEECHMATICS_API_KEY;
    const hasKey = Boolean(resolvedKey);
    if (!hasKey) {
      throw new Error(
        "Speechmatics STT requires an apiKey. Pass { apiKey: 'sm_...' } or set SPEECHMATICS_API_KEY in the environment."
      );
    }
    super(resolvedKey, opts);
  }
};
|
|
3999
5157
|
|
|
4000
5158
|
// src/tts/elevenlabs.ts
|
|
5159
|
+
init_esm_shims();
|
|
4001
5160
|
function resolveApiKey(apiKey) {
|
|
4002
5161
|
const key = apiKey ?? process.env.ELEVENLABS_API_KEY;
|
|
4003
5162
|
if (!key) {
|
|
@@ -4013,7 +5172,7 @@ var TTS = class _TTS extends ElevenLabsTTS {
|
|
|
4013
5172
|
super(resolveApiKey(opts.apiKey), {
|
|
4014
5173
|
voiceId: opts.voiceId ?? "EXAVITQu4vr4xnSDxMaL",
|
|
4015
5174
|
modelId: opts.modelId ?? "eleven_flash_v2_5",
|
|
4016
|
-
outputFormat: opts.outputFormat
|
|
5175
|
+
...opts.outputFormat !== void 0 ? { outputFormat: opts.outputFormat } : {},
|
|
4017
5176
|
languageCode: opts.languageCode,
|
|
4018
5177
|
voiceSettings: opts.voiceSettings
|
|
4019
5178
|
});
|
|
@@ -4052,7 +5211,7 @@ var ElevenLabsPlanError = class extends ElevenLabsTTSError {
|
|
|
4052
5211
|
this.name = "ElevenLabsPlanError";
|
|
4053
5212
|
}
|
|
4054
5213
|
};
|
|
4055
|
-
var PLAN_REQUIRED_MSG = "ElevenLabs WS streaming requires a Pro plan or higher (the WS endpoint returned `payment_required`). Either upgrade at https://elevenlabs.io/pricing, or use
|
|
5214
|
+
var PLAN_REQUIRED_MSG = "ElevenLabs WS streaming requires a Pro plan or higher (the WS endpoint returned `payment_required`). Either upgrade at https://elevenlabs.io/pricing, or use `ElevenLabsRestTTS` for HTTP REST instead which works on all plans (drop-in API).";
|
|
4056
5215
|
/**
 * Make a value safe to embed in a single log line.
 *
 * The value is stringified, every CR, LF and NUL character is replaced by a
 * space, and the result is cut with `slice(0, limit)` (default 200).
 */
function sanitiseLogStr(value, limit = 200) {
  const text = String(value);
  const singleLine = text.split(/[\r\n\x00]/).join(" ");
  return singleLine.slice(0, limit);
}
|
|
@@ -4071,6 +5230,33 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
|
|
|
4071
5230
|
inactivityTimeout;
|
|
4072
5231
|
chunkLengthSchedule;
|
|
4073
5232
|
chunkSize;
|
|
5233
|
+
/**
|
|
5234
|
+
* Single-slot adoption queue. The prewarm pipeline parks one WS per
|
|
5235
|
+
* outbound call here; the next `synthesizeStream` call consumes it
|
|
5236
|
+
* (skipping `new WebSocket()` and the BOS send) instead of opening
|
|
5237
|
+
* a fresh socket. The slot is consumed exactly once: if a second
|
|
5238
|
+
* `synthesizeStream` runs before the first, only the first benefits.
|
|
5239
|
+
*
|
|
5240
|
+
* We keep this on the adapter (not in a parameter) so the existing
|
|
5241
|
+
* `for await (const chunk of agent.tts.synthesizeStream(...))` call
|
|
5242
|
+
* site in `StreamHandler` continues to work without signature
|
|
5243
|
+
* changes.
|
|
5244
|
+
*/
|
|
5245
|
+
adoptedConnection = null;
|
|
5246
|
+
/**
|
|
5247
|
+
* Active WS for the in-flight ``synthesizeStream`` call, if any. Set
|
|
5248
|
+
* when a stream starts, cleared in its ``finally`` block. The
|
|
5249
|
+
* stream-handler calls ``cancelActiveStream()`` from ``cancelSpeaking``
|
|
5250
|
+
* to unblock the generator's inner ``await Promise<frame>`` — without
|
|
5251
|
+
* it, a barge-in on the firstMessage live path leaves the for-await
|
|
5252
|
+
* stuck waiting for the next frame; ElevenLabs never sends
|
|
5253
|
+
* ``isFinal=true`` after the consumer breaks, the 30 s frame timeout
|
|
5254
|
+
* fires post-call, and meanwhile ``initPipeline`` never returns so
|
|
5255
|
+
* the STT ``onTranscript`` callback never registers and subsequent
|
|
5256
|
+
* user turns are silently dropped (root cause of the 2026-05-20
|
|
5257
|
+
* "first message OK, then no response" symptom).
|
|
5258
|
+
*/
|
|
5259
|
+
activeStreamWs = null;
|
|
4074
5260
|
/**
|
|
4075
5261
|
* The wire format requested over the ElevenLabs WS. Initially set from
|
|
4076
5262
|
* the constructor; ``setTelephonyCarrier`` may auto-flip it to the
|
|
@@ -4086,7 +5272,7 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
|
|
|
4086
5272
|
constructor(opts) {
|
|
4087
5273
|
if (opts.modelId === "eleven_v3") {
|
|
4088
5274
|
throw new Error(
|
|
4089
|
-
"eleven_v3 is not supported by the WebSocket stream-input endpoint \u2014 use
|
|
5275
|
+
"eleven_v3 is not supported by the WebSocket stream-input endpoint \u2014 use `ElevenLabsRestTTS` for HTTP REST instead."
|
|
4090
5276
|
);
|
|
4091
5277
|
}
|
|
4092
5278
|
this.apiKey = opts.apiKey;
|
|
@@ -4119,6 +5305,32 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
|
|
|
4119
5305
|
if (!native) return;
|
|
4120
5306
|
this._outputFormat = native;
|
|
4121
5307
|
}
|
|
5308
|
+
/**
|
|
5309
|
+
* Force-close the WebSocket of any in-flight ``synthesizeStream`` call.
|
|
5310
|
+
* Called by the stream-handler from ``cancelSpeaking`` (barge-in) so
|
|
5311
|
+
* the generator's inner ``await Promise<frame>`` loop unblocks cleanly
|
|
5312
|
+
* via the ``onClose`` handler — instead of waiting up to 30 s for the
|
|
5313
|
+
* ``FRAME_TIMEOUT_MS`` watchdog to fire. No-op when no stream is in
|
|
5314
|
+
* flight or when the WS is already closing.
|
|
5315
|
+
*
|
|
5316
|
+
* Without this, a barge-in during the firstMessage live path left the
|
|
5317
|
+
* for-await stuck (ElevenLabs never sends ``isFinal=true`` after the
|
|
5318
|
+
* consumer breaks), ``initPipeline`` never returned, the STT
|
|
5319
|
+
* ``onTranscript`` callback never registered, and the entire remainder
|
|
5320
|
+
* of the call was silent for the user. Surfaced during the 2026-05-20
|
|
5321
|
+
* acceptance run.
|
|
5322
|
+
*/
|
|
5323
|
+
cancelActiveStream() {
|
|
5324
|
+
const ws = this.activeStreamWs;
|
|
5325
|
+
if (!ws) return;
|
|
5326
|
+
this.activeStreamWs = null;
|
|
5327
|
+
try {
|
|
5328
|
+
if (ws.readyState === WebSocket6.OPEN || ws.readyState === WebSocket6.CONNECTING) {
|
|
5329
|
+
ws.close();
|
|
5330
|
+
}
|
|
5331
|
+
} catch {
|
|
5332
|
+
}
|
|
5333
|
+
}
|
|
4122
5334
|
/** Pre-configured for Twilio Media Streams (`ulaw_8000`). */
|
|
4123
5335
|
static forTwilio(opts) {
|
|
4124
5336
|
return new _ElevenLabsWebSocketTTS({
|
|
@@ -4148,6 +5360,24 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
|
|
|
4148
5360
|
if (this.languageCode) params.set("language_code", this.languageCode);
|
|
4149
5361
|
return `${WS_BASE}/${encodeURIComponent(this.voiceId)}/stream-input?${params.toString()}`;
|
|
4150
5362
|
}
|
|
5363
|
+
/**
|
|
5364
|
+
* Build the protocol-required BOS frame sent on every fresh WS.
|
|
5365
|
+
*
|
|
5366
|
+
* The single-space `{"text": " "}` keep-alive establishes the session
|
|
5367
|
+
* without committing any synthesis (no `flush: true`, no real text).
|
|
5368
|
+
* Production `synthesizeStream()` and `warmup()` share this exact
|
|
5369
|
+
* construction so the upstream worker chooses the same per-session
|
|
5370
|
+
* config in both cases — otherwise the warm session is on a different
|
|
5371
|
+
* worker than the live request, which defeats the warmup goal.
|
|
5372
|
+
*/
|
|
5373
|
+
buildBosFrame() {
|
|
5374
|
+
const init = { text: " " };
|
|
5375
|
+
if (this.voiceSettings) init["voice_settings"] = this.voiceSettings;
|
|
5376
|
+
if (!this.autoMode && this.chunkLengthSchedule) {
|
|
5377
|
+
init["generation_config"] = { chunk_length_schedule: this.chunkLengthSchedule };
|
|
5378
|
+
}
|
|
5379
|
+
return init;
|
|
5380
|
+
}
|
|
4151
5381
|
/**
|
|
4152
5382
|
* Single-shot synthesis: open WS, send text, yield bytes, close.
|
|
4153
5383
|
*
|
|
@@ -4166,9 +5396,27 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
|
|
|
4166
5396
|
* after flush — auto_mode could otherwise truncate the tail audio).
|
|
4167
5397
|
*/
|
|
4168
5398
|
async *synthesizeStream(text) {
|
|
4169
|
-
|
|
4170
|
-
|
|
4171
|
-
|
|
5399
|
+
let ws;
|
|
5400
|
+
let bosAlreadySent = false;
|
|
5401
|
+
let adopted = false;
|
|
5402
|
+
const parked = this.adoptedConnection;
|
|
5403
|
+
this.adoptedConnection = null;
|
|
5404
|
+
if (parked && parked.ws.readyState === WebSocket6.OPEN) {
|
|
5405
|
+
ws = parked.ws;
|
|
5406
|
+
bosAlreadySent = parked.bosSent;
|
|
5407
|
+
adopted = true;
|
|
5408
|
+
} else {
|
|
5409
|
+
if (parked) {
|
|
5410
|
+
try {
|
|
5411
|
+
parked.ws.close();
|
|
5412
|
+
} catch {
|
|
5413
|
+
}
|
|
5414
|
+
}
|
|
5415
|
+
ws = new WebSocket6(this.buildUrl(), {
|
|
5416
|
+
headers: { "xi-api-key": this.apiKey }
|
|
5417
|
+
});
|
|
5418
|
+
}
|
|
5419
|
+
this.activeStreamWs = ws;
|
|
4172
5420
|
const queue = [];
|
|
4173
5421
|
let done = false;
|
|
4174
5422
|
let pendingError = null;
|
|
@@ -4238,28 +5486,27 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
|
|
|
4238
5486
|
};
|
|
4239
5487
|
ws.on("error", onError);
|
|
4240
5488
|
try {
|
|
4241
|
-
|
|
4242
|
-
|
|
4243
|
-
|
|
4244
|
-
|
|
4245
|
-
|
|
4246
|
-
|
|
4247
|
-
|
|
4248
|
-
|
|
4249
|
-
|
|
4250
|
-
|
|
4251
|
-
|
|
4252
|
-
|
|
4253
|
-
|
|
4254
|
-
|
|
5489
|
+
if (!adopted) {
|
|
5490
|
+
await new Promise((resolve, reject) => {
|
|
5491
|
+
connectTimer = setTimeout(
|
|
5492
|
+
() => reject(new Error("ElevenLabs WS connect timeout")),
|
|
5493
|
+
CONNECT_TIMEOUT_MS4
|
|
5494
|
+
);
|
|
5495
|
+
ws.once("open", () => {
|
|
5496
|
+
if (connectTimer) clearTimeout(connectTimer);
|
|
5497
|
+
connectTimer = void 0;
|
|
5498
|
+
resolve();
|
|
5499
|
+
});
|
|
5500
|
+
ws.once("error", (err) => {
|
|
5501
|
+
if (connectTimer) clearTimeout(connectTimer);
|
|
5502
|
+
connectTimer = void 0;
|
|
5503
|
+
reject(err);
|
|
5504
|
+
});
|
|
4255
5505
|
});
|
|
4256
|
-
});
|
|
4257
|
-
const init = { text: " " };
|
|
4258
|
-
if (this.voiceSettings) init["voice_settings"] = this.voiceSettings;
|
|
4259
|
-
if (!this.autoMode && this.chunkLengthSchedule) {
|
|
4260
|
-
init["generation_config"] = { chunk_length_schedule: this.chunkLengthSchedule };
|
|
4261
5506
|
}
|
|
4262
|
-
|
|
5507
|
+
if (!bosAlreadySent) {
|
|
5508
|
+
ws.send(JSON.stringify(this.buildBosFrame()));
|
|
5509
|
+
}
|
|
4263
5510
|
ws.send(JSON.stringify({ text: text + " ", flush: true }));
|
|
4264
5511
|
ws.on("message", onMessage);
|
|
4265
5512
|
ws.on("close", onClose);
|
|
@@ -4290,6 +5537,7 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
|
|
|
4290
5537
|
}
|
|
4291
5538
|
} finally {
|
|
4292
5539
|
if (connectTimer) clearTimeout(connectTimer);
|
|
5540
|
+
if (this.activeStreamWs === ws) this.activeStreamWs = null;
|
|
4293
5541
|
try {
|
|
4294
5542
|
if (ws.readyState === WebSocket6.OPEN) {
|
|
4295
5543
|
ws.send(JSON.stringify({ text: "" }));
|
|
@@ -4305,387 +5553,227 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
|
|
|
4305
5553
|
ws.removeAllListeners();
|
|
4306
5554
|
}
|
|
4307
5555
|
}
|
|
4308
|
-
/** No-op — connections are per-utterance and torn down inside synthesizeStream. */
|
|
4309
|
-
async close() {
|
|
4310
|
-
}
|
|
4311
|
-
};
|
|
4312
|
-
function looksLikeJson(buf) {
|
|
4313
|
-
if (buf.length === 0) return false;
|
|
4314
|
-
const b = buf[0];
|
|
4315
|
-
return b === 123 || b === 91;
|
|
4316
|
-
}
|
|
4317
|
-
|
|
4318
|
-
// src/tts/elevenlabs-ws.ts
|
|
4319
|
-
function resolveApiKey2(apiKey) {
|
|
4320
|
-
const key = apiKey ?? process.env.ELEVENLABS_API_KEY;
|
|
4321
|
-
if (!key) {
|
|
4322
|
-
throw new Error(
|
|
4323
|
-
"ElevenLabs WebSocket TTS requires an apiKey. Pass { apiKey: '...' } or set ELEVENLABS_API_KEY in the environment."
|
|
4324
|
-
);
|
|
4325
|
-
}
|
|
4326
|
-
return key;
|
|
4327
|
-
}
|
|
4328
|
-
function buildOpts(opts) {
|
|
4329
|
-
const out = {
|
|
4330
|
-
apiKey: resolveApiKey2(opts.apiKey),
|
|
4331
|
-
modelId: opts.modelId ?? "eleven_flash_v2_5",
|
|
4332
|
-
outputFormat: opts.outputFormat ?? "pcm_16000",
|
|
4333
|
-
autoMode: opts.autoMode ?? true
|
|
4334
|
-
};
|
|
4335
|
-
if (opts.voiceId !== void 0) out.voiceId = opts.voiceId;
|
|
4336
|
-
if (opts.voiceSettings !== void 0) out.voiceSettings = opts.voiceSettings;
|
|
4337
|
-
if (opts.languageCode !== void 0) out.languageCode = opts.languageCode;
|
|
4338
|
-
if (opts.inactivityTimeout !== void 0) out.inactivityTimeout = opts.inactivityTimeout;
|
|
4339
|
-
if (opts.chunkLengthSchedule !== void 0) out.chunkLengthSchedule = opts.chunkLengthSchedule;
|
|
4340
|
-
return out;
|
|
4341
|
-
}
|
|
4342
|
-
var TTS2 = class _TTS extends ElevenLabsWebSocketTTS {
|
|
4343
|
-
static providerKey = "elevenlabs_ws";
|
|
4344
|
-
constructor(opts = {}) {
|
|
4345
|
-
super(buildOpts(opts));
|
|
4346
|
-
}
|
|
4347
|
-
/** WebSocket TTS pre-configured for Twilio Media Streams (`ulaw_8000`). */
|
|
4348
|
-
static forTwilio(opts = {}) {
|
|
4349
|
-
return new _TTS({ ...opts, outputFormat: "ulaw_8000" });
|
|
4350
|
-
}
|
|
4351
|
-
/** WebSocket TTS pre-configured for Telnyx (`pcm_16000`). */
|
|
4352
|
-
static forTelnyx(opts = {}) {
|
|
4353
|
-
return new _TTS({ ...opts, outputFormat: "pcm_16000" });
|
|
4354
|
-
}
|
|
4355
|
-
};
|
|
4356
|
-
|
|
4357
|
-
// src/tts/openai.ts
|
|
4358
|
-
init_esm_shims();
|
|
4359
|
-
|
|
4360
|
-
// src/providers/openai-tts.ts
|
|
4361
|
-
init_esm_shims();
|
|
4362
|
-
var OPENAI_TTS_URL = "https://api.openai.com/v1/audio/speech";
|
|
4363
|
-
var INSTRUCTIONS_PREFIX = "gpt-4o-mini-tts";
|
|
4364
|
-
var LPF_ALPHA = 0.78;
|
|
4365
|
-
var LPF_ALPHA_8K = 0.45;
|
|
4366
|
-
var OpenAITTS = class _OpenAITTS {
|
|
4367
|
-
constructor(apiKey, voice = "alloy", model = "gpt-4o-mini-tts", instructions = null, speed = null, antiAlias = true, targetSampleRate = 16e3) {
|
|
4368
|
-
this.apiKey = apiKey;
|
|
4369
|
-
this.voice = voice;
|
|
4370
|
-
this.model = model;
|
|
4371
|
-
this.instructions = instructions;
|
|
4372
|
-
this.speed = speed;
|
|
4373
|
-
this.antiAlias = antiAlias;
|
|
4374
|
-
this.targetSampleRate = targetSampleRate;
|
|
4375
|
-
if (speed !== null && speed !== void 0 && (speed < 0.25 || speed > 4)) {
|
|
4376
|
-
throw new Error("OpenAITTS: speed must be in [0.25, 4.0]");
|
|
4377
|
-
}
|
|
4378
|
-
if (targetSampleRate !== 8e3 && targetSampleRate !== 16e3) {
|
|
4379
|
-
throw new Error("OpenAITTS: targetSampleRate must be 8000 or 16000");
|
|
4380
|
-
}
|
|
4381
|
-
}
|
|
4382
|
-
apiKey;
|
|
4383
|
-
voice;
|
|
4384
|
-
model;
|
|
4385
|
-
instructions;
|
|
4386
|
-
speed;
|
|
4387
|
-
antiAlias;
|
|
4388
|
-
targetSampleRate;
|
|
4389
|
-
/**
|
|
4390
|
-
* Synthesise text to speech and return the full audio as a single Buffer.
|
|
4391
|
-
*
|
|
4392
|
-
* For large chunks (or when latency matters) call `synthesizeStream` instead.
|
|
4393
|
-
*/
|
|
4394
|
-
async synthesize(text) {
|
|
4395
|
-
const chunks = [];
|
|
4396
|
-
for await (const chunk of this.synthesizeStream(text)) {
|
|
4397
|
-
chunks.push(chunk);
|
|
4398
|
-
}
|
|
4399
|
-
return Buffer.concat(chunks);
|
|
4400
|
-
}
|
|
4401
5556
|
/**
|
|
4402
|
-
*
|
|
5557
|
+
* Pre-call WebSocket warmup for the ElevenLabs `/stream-input` endpoint.
|
|
4403
5558
|
*
|
|
4404
|
-
*
|
|
4405
|
-
*
|
|
4406
|
-
*
|
|
5559
|
+
* Opens the WS (DNS + TLS + auth handshake), sends the EXACT same BOS
|
|
5560
|
+
* frame the production `synthesizeStream()` path sends — including
|
|
5561
|
+
* `voice_settings` and (when configured) `generation_config` — so
|
|
5562
|
+
* ElevenLabs instantiates the same per-session worker for both
|
|
5563
|
+
* warmup and the live request. If the BOS frames differ, the server
|
|
5564
|
+
* may route warmup and the real call to two different workers, and
|
|
5565
|
+
* the warmed worker is wasted. Idles ~250 ms, then closes. By the
|
|
5566
|
+
* time the first `synthesizeStream()` call lands during the call,
|
|
5567
|
+
* the connection pool has the upstream warm — net wire time saving
|
|
5568
|
+
* of 200-500 ms.
|
|
4407
5569
|
*
|
|
4408
|
-
*
|
|
4409
|
-
*
|
|
4410
|
-
*
|
|
5570
|
+
* Billing safety: ElevenLabs bills on synthesised characters
|
|
5571
|
+
* delivered via `audio` frames (per https://elevenlabs.io/pricing).
|
|
5572
|
+
* The keepalive (single-space `text`, no `flush: true`, no real
|
|
5573
|
+
* transcript) is documented as the session-establishment frame and
|
|
5574
|
+
* does NOT generate synthesis. Closing without sending the actual
|
|
5575
|
+
* transcript does not consume billable characters. Best-effort:
|
|
5576
|
+
* failures logged at debug level.
|
|
4411
5577
|
*/
|
|
4412
|
-
async
|
|
4413
|
-
const
|
|
4414
|
-
|
|
4415
|
-
input: text,
|
|
4416
|
-
voice: this.voice,
|
|
4417
|
-
response_format: "pcm"
|
|
4418
|
-
};
|
|
4419
|
-
if (this.instructions !== null && this.model.startsWith(INSTRUCTIONS_PREFIX)) {
|
|
4420
|
-
body.instructions = this.instructions;
|
|
4421
|
-
}
|
|
4422
|
-
if (this.speed !== null) {
|
|
4423
|
-
body.speed = this.speed;
|
|
4424
|
-
}
|
|
4425
|
-
const response = await fetch(OPENAI_TTS_URL, {
|
|
4426
|
-
method: "POST",
|
|
4427
|
-
headers: {
|
|
4428
|
-
"Authorization": `Bearer ${this.apiKey}`,
|
|
4429
|
-
"Content-Type": "application/json"
|
|
4430
|
-
},
|
|
4431
|
-
body: JSON.stringify(body)
|
|
5578
|
+
async warmup() {
|
|
5579
|
+
const ws = new WebSocket6(this.buildUrl(), {
|
|
5580
|
+
headers: { "xi-api-key": this.apiKey }
|
|
4432
5581
|
});
|
|
4433
|
-
if (!response.ok) {
|
|
4434
|
-
const errBody = await response.text();
|
|
4435
|
-
throw new Error(`OpenAI TTS error ${response.status}: ${errBody}`);
|
|
4436
|
-
}
|
|
4437
|
-
if (!response.body) {
|
|
4438
|
-
throw new Error("OpenAI TTS: no response body");
|
|
4439
|
-
}
|
|
4440
|
-
const ctx = {
|
|
4441
|
-
carryByte: null,
|
|
4442
|
-
leftover: [],
|
|
4443
|
-
lpfPrev: 0,
|
|
4444
|
-
lpfEnabled: this.antiAlias,
|
|
4445
|
-
targetSampleRate: this.targetSampleRate
|
|
4446
|
-
};
|
|
4447
|
-
const reader = response.body.getReader();
|
|
4448
5582
|
try {
|
|
4449
|
-
|
|
4450
|
-
const
|
|
4451
|
-
|
|
4452
|
-
|
|
4453
|
-
|
|
4454
|
-
|
|
4455
|
-
|
|
4456
|
-
|
|
4457
|
-
|
|
4458
|
-
|
|
4459
|
-
|
|
4460
|
-
|
|
4461
|
-
}
|
|
4462
|
-
|
|
5583
|
+
await new Promise((resolve, reject) => {
|
|
5584
|
+
const timer = setTimeout(
|
|
5585
|
+
() => reject(new Error("ElevenLabs WS TTS warmup connect timeout")),
|
|
5586
|
+
CONNECT_TIMEOUT_MS4
|
|
5587
|
+
);
|
|
5588
|
+
ws.once("open", () => {
|
|
5589
|
+
clearTimeout(timer);
|
|
5590
|
+
resolve();
|
|
5591
|
+
});
|
|
5592
|
+
ws.once("error", (err) => {
|
|
5593
|
+
clearTimeout(timer);
|
|
5594
|
+
reject(err);
|
|
5595
|
+
});
|
|
5596
|
+
});
|
|
5597
|
+
try {
|
|
5598
|
+
ws.send(JSON.stringify(this.buildBosFrame()));
|
|
5599
|
+
} catch {
|
|
4463
5600
|
}
|
|
5601
|
+
await new Promise((r) => setTimeout(r, 250));
|
|
5602
|
+
} catch (err) {
|
|
5603
|
+
getLogger().debug(`ElevenLabs WS TTS warmup failed (best-effort): ${String(err)}`);
|
|
4464
5604
|
} finally {
|
|
4465
|
-
|
|
4466
|
-
|
|
4467
|
-
|
|
5605
|
+
try {
|
|
5606
|
+
if (ws.readyState === WebSocket6.OPEN || ws.readyState === WebSocket6.CONNECTING) {
|
|
5607
|
+
ws.close();
|
|
5608
|
+
}
|
|
5609
|
+
} catch {
|
|
5610
|
+
}
|
|
5611
|
+
ws.removeAllListeners();
|
|
4468
5612
|
}
|
|
4469
5613
|
}
|
|
4470
5614
|
/**
|
|
4471
|
-
*
|
|
4472
|
-
*
|
|
4473
|
-
*
|
|
5615
|
+
* Open a fresh WS, send the EXACT BOS frame the live `synthesizeStream`
|
|
5616
|
+
* sends, and return the OPEN socket without closing it. Used by the
|
|
5617
|
+
* prewarm pipeline to park a TTS connection during the carrier ringing
|
|
5618
|
+
* window so the next `synthesizeStream` call can adopt it via
|
|
5619
|
+
* {@link adoptWebSocket} and skip ~400-900 ms of TLS + BOS round-trip.
|
|
4474
5620
|
*
|
|
4475
|
-
*
|
|
4476
|
-
*
|
|
4477
|
-
*
|
|
5621
|
+
* Returns a parked-handle the caller stashes; the next
|
|
5622
|
+
* `synthesizeStream` will detect the adoption queue and skip its own
|
|
5623
|
+
* `new WebSocket()` + BOS send.
|
|
4478
5624
|
*
|
|
4479
|
-
*
|
|
4480
|
-
*
|
|
4481
|
-
*
|
|
5625
|
+
* Billing safety: BOS is the documented session-establishment frame
|
|
5626
|
+
* (single space `text`, no `flush: true`) and does not generate
|
|
5627
|
+
* synthesis. ElevenLabs bills on `audio` frames received from the
|
|
5628
|
+
* server, not on BOS bytes sent by the client.
|
|
4482
5629
|
*/
|
|
4483
|
-
|
|
4484
|
-
|
|
4485
|
-
|
|
4486
|
-
|
|
4487
|
-
|
|
4488
|
-
|
|
4489
|
-
|
|
4490
|
-
|
|
4491
|
-
|
|
4492
|
-
|
|
4493
|
-
|
|
4494
|
-
|
|
4495
|
-
|
|
4496
|
-
|
|
5630
|
+
async openParkedConnection() {
|
|
5631
|
+
const ws = new WebSocket6(this.buildUrl(), {
|
|
5632
|
+
headers: { "xi-api-key": this.apiKey }
|
|
5633
|
+
});
|
|
5634
|
+
await new Promise((resolve, reject) => {
|
|
5635
|
+
const timer = setTimeout(
|
|
5636
|
+
() => reject(new Error("ElevenLabs WS park connect timeout")),
|
|
5637
|
+
CONNECT_TIMEOUT_MS4
|
|
5638
|
+
);
|
|
5639
|
+
ws.once("open", () => {
|
|
5640
|
+
clearTimeout(timer);
|
|
5641
|
+
resolve();
|
|
5642
|
+
});
|
|
5643
|
+
ws.once("error", (err) => {
|
|
5644
|
+
clearTimeout(timer);
|
|
5645
|
+
reject(err);
|
|
5646
|
+
});
|
|
5647
|
+
});
|
|
5648
|
+
let bosSent = false;
|
|
5649
|
+
try {
|
|
5650
|
+
ws.send(JSON.stringify(this.buildBosFrame()));
|
|
5651
|
+
bosSent = true;
|
|
5652
|
+
} catch {
|
|
4497
5653
|
}
|
|
4498
|
-
|
|
4499
|
-
|
|
4500
|
-
|
|
4501
|
-
|
|
4502
|
-
|
|
4503
|
-
|
|
4504
|
-
|
|
4505
|
-
|
|
4506
|
-
|
|
4507
|
-
|
|
4508
|
-
|
|
4509
|
-
|
|
4510
|
-
|
|
4511
|
-
|
|
4512
|
-
|
|
4513
|
-
|
|
5654
|
+
return { ws, bosSent };
|
|
5655
|
+
}
|
|
5656
|
+
/**
|
|
5657
|
+
* Stash a parked WS handle so the next `synthesizeStream` call adopts
|
|
5658
|
+
* it instead of opening a fresh socket. Caller is responsible for
|
|
5659
|
+
* holding the handle alive until either the live request consumes it
|
|
5660
|
+
* or the call ends (in which case `discardAdoptedConnection()`
|
|
5661
|
+
* cleans it up).
|
|
5662
|
+
*/
|
|
5663
|
+
adoptWebSocket(parked) {
|
|
5664
|
+
const prev = this.adoptedConnection;
|
|
5665
|
+
this.adoptedConnection = parked;
|
|
5666
|
+
if (prev && prev !== parked) {
|
|
5667
|
+
try {
|
|
5668
|
+
prev.ws.close();
|
|
5669
|
+
} catch {
|
|
4514
5670
|
}
|
|
4515
5671
|
}
|
|
4516
|
-
|
|
4517
|
-
|
|
4518
|
-
|
|
4519
|
-
|
|
4520
|
-
|
|
4521
|
-
|
|
4522
|
-
|
|
4523
|
-
|
|
4524
|
-
|
|
4525
|
-
|
|
4526
|
-
|
|
4527
|
-
|
|
4528
|
-
|
|
5672
|
+
}
|
|
5673
|
+
/**
|
|
5674
|
+
* Drop and close any pending parked WS without consuming it. Used on
|
|
5675
|
+
* call-failure paths so a never-started call does not leak a TTS WS
|
|
5676
|
+
* that ElevenLabs will close after its inactivity timeout anyway.
|
|
5677
|
+
*/
|
|
5678
|
+
discardAdoptedConnection() {
|
|
5679
|
+
const parked = this.adoptedConnection;
|
|
5680
|
+
this.adoptedConnection = null;
|
|
5681
|
+
if (parked) {
|
|
5682
|
+
try {
|
|
5683
|
+
parked.ws.close();
|
|
5684
|
+
} catch {
|
|
4529
5685
|
}
|
|
4530
5686
|
}
|
|
4531
|
-
ctx.leftover = samples.slice(i);
|
|
4532
|
-
const buffer = Buffer.alloc(out.length * 2);
|
|
4533
|
-
for (let j = 0; j < out.length; j++) {
|
|
4534
|
-
buffer.writeInt16LE(out[j], j * 2);
|
|
4535
|
-
}
|
|
4536
|
-
return buffer;
|
|
4537
5687
|
}
|
|
4538
|
-
/**
|
|
4539
|
-
|
|
4540
|
-
|
|
4541
|
-
carryByte: null,
|
|
4542
|
-
leftover: [],
|
|
4543
|
-
lpfPrev: 0,
|
|
4544
|
-
lpfEnabled: false,
|
|
4545
|
-
targetSampleRate: 16e3
|
|
4546
|
-
};
|
|
4547
|
-
const out = _OpenAITTS.resampleStreaming(audio, ctx);
|
|
4548
|
-
if (ctx.leftover.length === 0) return out;
|
|
4549
|
-
const tail = Buffer.alloc(ctx.leftover.length * 2);
|
|
4550
|
-
for (let i = 0; i < ctx.leftover.length; i++) {
|
|
4551
|
-
tail.writeInt16LE(ctx.leftover[i], i * 2);
|
|
4552
|
-
}
|
|
4553
|
-
return Buffer.concat([out, tail]);
|
|
5688
|
+
/** No-op — connections are per-utterance and torn down inside synthesizeStream. */
|
|
5689
|
+
async close() {
|
|
5690
|
+
this.discardAdoptedConnection();
|
|
4554
5691
|
}
|
|
4555
5692
|
};
|
|
5693
|
+
function looksLikeJson(buf) {
|
|
5694
|
+
if (buf.length === 0) return false;
|
|
5695
|
+
const b = buf[0];
|
|
5696
|
+
return b === 123 || b === 91;
|
|
5697
|
+
}
|
|
4556
5698
|
|
|
4557
|
-
// src/tts/
|
|
4558
|
-
|
|
4559
|
-
|
|
4560
|
-
|
|
4561
|
-
|
|
4562
|
-
|
|
4563
|
-
throw new Error(
|
|
4564
|
-
"OpenAI TTS requires an apiKey. Pass { apiKey: 'sk-...' } or set OPENAI_API_KEY in the environment."
|
|
4565
|
-
);
|
|
4566
|
-
}
|
|
4567
|
-
super(
|
|
4568
|
-
key,
|
|
4569
|
-
opts.voice ?? "alloy",
|
|
4570
|
-
opts.model ?? "gpt-4o-mini-tts",
|
|
4571
|
-
opts.instructions ?? null,
|
|
4572
|
-
opts.speed ?? null,
|
|
4573
|
-
opts.antiAlias ?? false
|
|
5699
|
+
// src/tts/elevenlabs-ws.ts
|
|
5700
|
+
function resolveApiKey2(apiKey) {
|
|
5701
|
+
const key = apiKey ?? process.env.ELEVENLABS_API_KEY;
|
|
5702
|
+
if (!key) {
|
|
5703
|
+
throw new Error(
|
|
5704
|
+
"ElevenLabs WebSocket TTS requires an apiKey. Pass { apiKey: '...' } or set ELEVENLABS_API_KEY in the environment."
|
|
4574
5705
|
);
|
|
4575
5706
|
}
|
|
5707
|
+
return key;
|
|
5708
|
+
}
|
|
5709
|
+
function buildOpts(opts) {
|
|
5710
|
+
const out = {
|
|
5711
|
+
apiKey: resolveApiKey2(opts.apiKey),
|
|
5712
|
+
modelId: opts.modelId ?? "eleven_flash_v2_5",
|
|
5713
|
+
autoMode: opts.autoMode ?? true
|
|
5714
|
+
};
|
|
5715
|
+
if (opts.outputFormat !== void 0) out.outputFormat = opts.outputFormat;
|
|
5716
|
+
if (opts.voiceId !== void 0) out.voiceId = opts.voiceId;
|
|
5717
|
+
if (opts.voiceSettings !== void 0) out.voiceSettings = opts.voiceSettings;
|
|
5718
|
+
if (opts.languageCode !== void 0) out.languageCode = opts.languageCode;
|
|
5719
|
+
if (opts.inactivityTimeout !== void 0) out.inactivityTimeout = opts.inactivityTimeout;
|
|
5720
|
+
if (opts.chunkLengthSchedule !== void 0) out.chunkLengthSchedule = opts.chunkLengthSchedule;
|
|
5721
|
+
return out;
|
|
5722
|
+
}
|
|
5723
|
+
var TTS2 = class _TTS extends ElevenLabsWebSocketTTS {
|
|
5724
|
+
static providerKey = "elevenlabs_ws";
|
|
5725
|
+
constructor(opts = {}) {
|
|
5726
|
+
super(buildOpts(opts));
|
|
5727
|
+
}
|
|
5728
|
+
/** WebSocket TTS pre-configured for Twilio Media Streams (`ulaw_8000`). */
|
|
5729
|
+
static forTwilio(opts = {}) {
|
|
5730
|
+
return new _TTS({ ...opts, outputFormat: "ulaw_8000" });
|
|
5731
|
+
}
|
|
5732
|
+
/** WebSocket TTS pre-configured for Telnyx (`pcm_16000`). */
|
|
5733
|
+
static forTelnyx(opts = {}) {
|
|
5734
|
+
return new _TTS({ ...opts, outputFormat: "pcm_16000" });
|
|
5735
|
+
}
|
|
4576
5736
|
};
|
|
4577
5737
|
|
|
4578
|
-
// src/tts/
|
|
5738
|
+
// src/tts/openai.ts
|
|
4579
5739
|
init_esm_shims();
|
|
4580
5740
|
|
|
4581
|
-
// src/providers/
|
|
5741
|
+
// src/providers/openai-tts.ts
|
|
4582
5742
|
init_esm_shims();
|
|
4583
|
-
var
|
|
4584
|
-
var
|
|
4585
|
-
var
|
|
4586
|
-
var
|
|
4587
|
-
|
|
4588
|
-
|
|
4589
|
-
SONIC: "sonic"
|
|
4590
|
-
};
|
|
4591
|
-
var CartesiaTTSContainer = {
|
|
4592
|
-
RAW: "raw",
|
|
4593
|
-
WAV: "wav",
|
|
4594
|
-
MP3: "mp3"
|
|
4595
|
-
};
|
|
4596
|
-
var CartesiaTTSEncoding = {
|
|
4597
|
-
PCM_S16LE: "pcm_s16le",
|
|
4598
|
-
PCM_F32LE: "pcm_f32le",
|
|
4599
|
-
PCM_MULAW: "pcm_mulaw",
|
|
4600
|
-
PCM_ALAW: "pcm_alaw"
|
|
4601
|
-
};
|
|
4602
|
-
var CartesiaTTSSampleRate = {
|
|
4603
|
-
HZ_8000: 8e3,
|
|
4604
|
-
HZ_16000: 16e3,
|
|
4605
|
-
HZ_22050: 22050,
|
|
4606
|
-
HZ_24000: 24e3,
|
|
4607
|
-
HZ_44100: 44100
|
|
4608
|
-
};
|
|
4609
|
-
var CartesiaTTSVoiceMode = {
|
|
4610
|
-
ID: "id",
|
|
4611
|
-
EMBEDDING: "embedding"
|
|
4612
|
-
};
|
|
4613
|
-
var CartesiaTTS = class _CartesiaTTS {
|
|
4614
|
-
apiKey;
|
|
4615
|
-
model;
|
|
4616
|
-
voice;
|
|
4617
|
-
language;
|
|
4618
|
-
sampleRate;
|
|
4619
|
-
speed;
|
|
4620
|
-
emotion;
|
|
4621
|
-
volume;
|
|
4622
|
-
baseUrl;
|
|
4623
|
-
apiVersion;
|
|
4624
|
-
constructor(apiKey, opts = {}) {
|
|
5743
|
+
var OPENAI_TTS_URL = "https://api.openai.com/v1/audio/speech";
|
|
5744
|
+
var INSTRUCTIONS_PREFIX = "gpt-4o-mini-tts";
|
|
5745
|
+
var LPF_ALPHA = 0.78;
|
|
5746
|
+
var LPF_ALPHA_8K = 0.45;
|
|
5747
|
+
var OpenAITTS = class _OpenAITTS {
|
|
5748
|
+
constructor(apiKey, voice = "alloy", model = "gpt-4o-mini-tts", instructions = null, speed = null, antiAlias = true, targetSampleRate = 16e3) {
|
|
4625
5749
|
this.apiKey = apiKey;
|
|
4626
|
-
this.
|
|
4627
|
-
this.
|
|
4628
|
-
this.
|
|
4629
|
-
this.
|
|
4630
|
-
this.
|
|
4631
|
-
this.
|
|
4632
|
-
|
|
4633
|
-
|
|
4634
|
-
|
|
4635
|
-
|
|
4636
|
-
|
|
4637
|
-
|
|
4638
|
-
*
|
|
4639
|
-
* Sets `sampleRate=8000` so Cartesia emits PCM_S16LE @ 8 kHz directly.
|
|
4640
|
-
* Twilio's media stream uses μ-law @ 8 kHz so the SDK still does the
|
|
4641
|
-
* PCM → μ-law transcode client-side, but the 16 kHz → 8 kHz resample
|
|
4642
|
-
* step is skipped. Saves ~10–30 ms first-byte plus per-frame CPU and
|
|
4643
|
-
* removes a potential aliasing source.
|
|
4644
|
-
*/
|
|
4645
|
-
static forTwilio(apiKey, options = {}) {
|
|
4646
|
-
return new _CartesiaTTS(apiKey, {
|
|
4647
|
-
...options,
|
|
4648
|
-
sampleRate: CartesiaTTSSampleRate.HZ_8000
|
|
4649
|
-
});
|
|
5750
|
+
this.voice = voice;
|
|
5751
|
+
this.model = model;
|
|
5752
|
+
this.instructions = instructions;
|
|
5753
|
+
this.speed = speed;
|
|
5754
|
+
this.antiAlias = antiAlias;
|
|
5755
|
+
this.targetSampleRate = targetSampleRate;
|
|
5756
|
+
if (speed !== null && speed !== void 0 && (speed < 0.25 || speed > 4)) {
|
|
5757
|
+
throw new Error("OpenAITTS: speed must be in [0.25, 4.0]");
|
|
5758
|
+
}
|
|
5759
|
+
if (targetSampleRate !== 8e3 && targetSampleRate !== 16e3) {
|
|
5760
|
+
throw new Error("OpenAITTS: targetSampleRate must be 8000 or 16000");
|
|
5761
|
+
}
|
|
4650
5762
|
}
|
|
5763
|
+
apiKey;
|
|
5764
|
+
voice;
|
|
5765
|
+
model;
|
|
5766
|
+
instructions;
|
|
5767
|
+
speed;
|
|
5768
|
+
antiAlias;
|
|
5769
|
+
targetSampleRate;
|
|
5770
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5771
|
+
static providerKey = "openai_tts";
|
|
4651
5772
|
/**
|
|
4652
|
-
*
|
|
5773
|
+
* Synthesise text to speech and return the full audio as a single Buffer.
|
|
4653
5774
|
*
|
|
4654
|
-
*
|
|
4655
|
-
* audio flows end-to-end with zero resampling or transcoding. Same as
|
|
4656
|
-
* the bare-constructor default; exists for API symmetry with
|
|
4657
|
-
* {@link CartesiaTTS.forTwilio}.
|
|
5775
|
+
* For large chunks (or when latency matters) call `synthesizeStream` instead.
|
|
4658
5776
|
*/
|
|
4659
|
-
static forTelnyx(apiKey, options = {}) {
|
|
4660
|
-
return new _CartesiaTTS(apiKey, {
|
|
4661
|
-
...options,
|
|
4662
|
-
sampleRate: CartesiaTTSSampleRate.HZ_16000
|
|
4663
|
-
});
|
|
4664
|
-
}
|
|
4665
|
-
/** Build the JSON payload for the Cartesia bytes endpoint. */
|
|
4666
|
-
buildPayload(text) {
|
|
4667
|
-
const payload = {
|
|
4668
|
-
model_id: this.model,
|
|
4669
|
-
voice: { mode: CartesiaTTSVoiceMode.ID, id: this.voice },
|
|
4670
|
-
transcript: text,
|
|
4671
|
-
output_format: {
|
|
4672
|
-
container: CartesiaTTSContainer.RAW,
|
|
4673
|
-
encoding: CartesiaTTSEncoding.PCM_S16LE,
|
|
4674
|
-
sample_rate: this.sampleRate
|
|
4675
|
-
},
|
|
4676
|
-
language: this.language
|
|
4677
|
-
};
|
|
4678
|
-
const generationConfig = {};
|
|
4679
|
-
if (this.speed !== void 0) generationConfig.speed = this.speed;
|
|
4680
|
-
if (this.emotion && this.emotion.length > 0)
|
|
4681
|
-
generationConfig.emotion = this.emotion[0];
|
|
4682
|
-
if (this.volume !== void 0) generationConfig.volume = this.volume;
|
|
4683
|
-
if (Object.keys(generationConfig).length > 0) {
|
|
4684
|
-
payload.generation_config = generationConfig;
|
|
4685
|
-
}
|
|
4686
|
-
return payload;
|
|
4687
|
-
}
|
|
4688
|
-
/** Synthesize text and return the concatenated audio buffer. */
|
|
4689
5777
|
async synthesize(text) {
|
|
4690
5778
|
const chunks = [];
|
|
4691
5779
|
for await (const chunk of this.synthesizeStream(text)) {
|
|
@@ -4694,217 +5782,213 @@ var CartesiaTTS = class _CartesiaTTS {
|
|
|
4694
5782
|
return Buffer.concat(chunks);
|
|
4695
5783
|
}
|
|
4696
5784
|
/**
|
|
4697
|
-
*
|
|
4698
|
-
*
|
|
5785
|
+
* Synthesise text and yield audio chunks as they arrive (streaming).
|
|
5786
|
+
*
|
|
5787
|
+
* OpenAI returns 24 kHz PCM16; each chunk is lowpass-filtered then
|
|
5788
|
+
* decimated 3:2 to 16 kHz before yielding so the output is ready for
|
|
5789
|
+
* telephony pipelines.
|
|
5790
|
+
*
|
|
5791
|
+
* The resampler carries state (filter memory + buffered samples + odd
|
|
5792
|
+
* trailing byte) between chunks so cross-chunk sample alignment and
|
|
5793
|
+
* filter phase don't reset on every network read.
|
|
4699
5794
|
*/
|
|
4700
5795
|
async *synthesizeStream(text) {
|
|
4701
|
-
const
|
|
5796
|
+
const body = {
|
|
5797
|
+
model: this.model,
|
|
5798
|
+
input: text,
|
|
5799
|
+
voice: this.voice,
|
|
5800
|
+
response_format: "pcm"
|
|
5801
|
+
};
|
|
5802
|
+
if (this.instructions !== null && this.model.startsWith(INSTRUCTIONS_PREFIX)) {
|
|
5803
|
+
body.instructions = this.instructions;
|
|
5804
|
+
}
|
|
5805
|
+
if (this.speed !== null) {
|
|
5806
|
+
body.speed = this.speed;
|
|
5807
|
+
}
|
|
5808
|
+
const response = await fetch(OPENAI_TTS_URL, {
|
|
4702
5809
|
method: "POST",
|
|
4703
5810
|
headers: {
|
|
4704
|
-
"
|
|
4705
|
-
"Cartesia-Version": this.apiVersion,
|
|
5811
|
+
"Authorization": `Bearer ${this.apiKey}`,
|
|
4706
5812
|
"Content-Type": "application/json"
|
|
4707
5813
|
},
|
|
4708
|
-
body: JSON.stringify(
|
|
4709
|
-
signal: AbortSignal.timeout(3e4)
|
|
5814
|
+
body: JSON.stringify(body)
|
|
4710
5815
|
});
|
|
4711
5816
|
if (!response.ok) {
|
|
4712
|
-
const
|
|
4713
|
-
throw new Error(`
|
|
5817
|
+
const errBody = await response.text();
|
|
5818
|
+
throw new Error(`OpenAI TTS error ${response.status}: ${errBody}`);
|
|
4714
5819
|
}
|
|
4715
5820
|
if (!response.body) {
|
|
4716
|
-
throw new Error("
|
|
5821
|
+
throw new Error("OpenAI TTS: no response body");
|
|
4717
5822
|
}
|
|
5823
|
+
const ctx = {
|
|
5824
|
+
carryByte: null,
|
|
5825
|
+
leftover: [],
|
|
5826
|
+
lpfPrev: 0,
|
|
5827
|
+
lpfEnabled: this.antiAlias,
|
|
5828
|
+
targetSampleRate: this.targetSampleRate
|
|
5829
|
+
};
|
|
4718
5830
|
const reader = response.body.getReader();
|
|
4719
5831
|
try {
|
|
4720
5832
|
while (true) {
|
|
4721
5833
|
const { done, value } = await reader.read();
|
|
4722
5834
|
if (done) break;
|
|
4723
5835
|
if (value && value.length > 0) {
|
|
4724
|
-
|
|
5836
|
+
const out = _OpenAITTS.resampleStreaming(Buffer.from(value), ctx);
|
|
5837
|
+
if (out.length > 0) yield out;
|
|
5838
|
+
}
|
|
5839
|
+
}
|
|
5840
|
+
if (ctx.leftover.length > 0) {
|
|
5841
|
+
const tail = Buffer.alloc(ctx.leftover.length * 2);
|
|
5842
|
+
for (let i = 0; i < ctx.leftover.length; i++) {
|
|
5843
|
+
tail.writeInt16LE(ctx.leftover[i], i * 2);
|
|
4725
5844
|
}
|
|
5845
|
+
yield tail;
|
|
4726
5846
|
}
|
|
4727
5847
|
} finally {
|
|
4728
|
-
if (typeof reader.cancel === "function")
|
|
4729
|
-
|
|
4730
|
-
});
|
|
5848
|
+
if (typeof reader.cancel === "function") await reader.cancel().catch(() => {
|
|
5849
|
+
});
|
|
4731
5850
|
reader.releaseLock();
|
|
4732
5851
|
}
|
|
4733
5852
|
}
|
|
4734
|
-
|
|
4735
|
-
|
|
4736
|
-
|
|
4737
|
-
|
|
4738
|
-
|
|
4739
|
-
|
|
4740
|
-
|
|
4741
|
-
|
|
4742
|
-
|
|
4743
|
-
|
|
4744
|
-
|
|
4745
|
-
|
|
4746
|
-
|
|
4747
|
-
static
|
|
4748
|
-
|
|
4749
|
-
|
|
4750
|
-
|
|
4751
|
-
|
|
4752
|
-
|
|
4753
|
-
|
|
4754
|
-
|
|
4755
|
-
|
|
4756
|
-
|
|
4757
|
-
|
|
4758
|
-
|
|
4759
|
-
|
|
4760
|
-
|
|
4761
|
-
|
|
4762
|
-
|
|
4763
|
-
|
|
4764
|
-
|
|
4765
|
-
|
|
4766
|
-
|
|
4767
|
-
|
|
4768
|
-
|
|
4769
|
-
|
|
4770
|
-
|
|
4771
|
-
|
|
4772
|
-
|
|
4773
|
-
|
|
4774
|
-
|
|
4775
|
-
|
|
4776
|
-
|
|
4777
|
-
|
|
4778
|
-
WAV: "audio/wav",
|
|
4779
|
-
MULAW: "audio/mulaw"
|
|
4780
|
-
};
|
|
4781
|
-
var ARCANA_MODEL_TIMEOUT_MS = 60 * 4 * 1e3;
|
|
4782
|
-
var MIST_MODEL_TIMEOUT_MS = 30 * 1e3;
|
|
4783
|
-
function isMistModel(model) {
|
|
4784
|
-
return model.includes(RimeModel.MIST);
|
|
4785
|
-
}
|
|
4786
|
-
function timeoutForModel(model) {
|
|
4787
|
-
if (model === RimeModel.ARCANA) return ARCANA_MODEL_TIMEOUT_MS;
|
|
4788
|
-
return MIST_MODEL_TIMEOUT_MS;
|
|
4789
|
-
}
|
|
4790
|
-
var RimeTTS = class {
|
|
4791
|
-
apiKey;
|
|
4792
|
-
model;
|
|
4793
|
-
speaker;
|
|
4794
|
-
lang;
|
|
4795
|
-
sampleRate;
|
|
4796
|
-
repetitionPenalty;
|
|
4797
|
-
temperature;
|
|
4798
|
-
topP;
|
|
4799
|
-
maxTokens;
|
|
4800
|
-
speedAlpha;
|
|
4801
|
-
reduceLatency;
|
|
4802
|
-
pauseBetweenBrackets;
|
|
4803
|
-
phonemizeBetweenBrackets;
|
|
4804
|
-
baseUrl;
|
|
4805
|
-
totalTimeoutMs;
|
|
4806
|
-
constructor(apiKey, opts = {}) {
|
|
4807
|
-
this.apiKey = apiKey;
|
|
4808
|
-
this.model = opts.model ?? RimeModel.ARCANA;
|
|
4809
|
-
const defaultSpeaker = isMistModel(this.model) ? "cove" : "astra";
|
|
4810
|
-
this.speaker = opts.speaker ?? defaultSpeaker;
|
|
4811
|
-
this.lang = opts.lang ?? "eng";
|
|
4812
|
-
this.sampleRate = opts.sampleRate ?? 16e3;
|
|
4813
|
-
this.repetitionPenalty = opts.repetitionPenalty;
|
|
4814
|
-
this.temperature = opts.temperature;
|
|
4815
|
-
this.topP = opts.topP;
|
|
4816
|
-
this.maxTokens = opts.maxTokens;
|
|
4817
|
-
this.speedAlpha = opts.speedAlpha;
|
|
4818
|
-
this.reduceLatency = opts.reduceLatency;
|
|
4819
|
-
this.pauseBetweenBrackets = opts.pauseBetweenBrackets;
|
|
4820
|
-
this.phonemizeBetweenBrackets = opts.phonemizeBetweenBrackets;
|
|
4821
|
-
this.baseUrl = opts.baseUrl ?? RIME_BASE_URL;
|
|
4822
|
-
this.totalTimeoutMs = timeoutForModel(this.model);
|
|
4823
|
-
}
|
|
4824
|
-
buildPayload(text) {
|
|
4825
|
-
const payload = {
|
|
4826
|
-
speaker: this.speaker,
|
|
4827
|
-
text,
|
|
4828
|
-
modelId: this.model
|
|
4829
|
-
};
|
|
4830
|
-
if (this.model === RimeModel.ARCANA) {
|
|
4831
|
-
if (this.repetitionPenalty !== void 0)
|
|
4832
|
-
payload.repetition_penalty = this.repetitionPenalty;
|
|
4833
|
-
if (this.temperature !== void 0) payload.temperature = this.temperature;
|
|
4834
|
-
if (this.topP !== void 0) payload.top_p = this.topP;
|
|
4835
|
-
if (this.maxTokens !== void 0) payload.max_tokens = this.maxTokens;
|
|
4836
|
-
payload.lang = this.lang;
|
|
4837
|
-
payload.samplingRate = this.sampleRate;
|
|
4838
|
-
} else if (isMistModel(this.model)) {
|
|
4839
|
-
payload.lang = this.lang;
|
|
4840
|
-
payload.samplingRate = this.sampleRate;
|
|
4841
|
-
if (this.speedAlpha !== void 0) payload.speedAlpha = this.speedAlpha;
|
|
4842
|
-
if (this.model === RimeModel.MIST_V2 && this.reduceLatency !== void 0) {
|
|
4843
|
-
payload.reduceLatency = this.reduceLatency;
|
|
5853
|
+
/**
|
|
5854
|
+
* Streaming 24 kHz → {16, 8} kHz resampler (PCM16-LE). Applies a single-pole
|
|
5855
|
+
* lowpass ahead of the decimation and carries filter + sample state across
|
|
5856
|
+
* chunks so the cadence doesn't reset at every network read.
|
|
5857
|
+
*
|
|
5858
|
+
* Output rate is selected by ``ctx.targetSampleRate``:
|
|
5859
|
+
* 16000 → 3:2 decimation (sample 0 + mid(1,2)) [default]
|
|
5860
|
+
* 8000 → 3:1 decimation (sample 0 only) [fix #46]
|
|
5861
|
+
*
|
|
5862
|
+
* ``ctx.lpfEnabled`` controls whether the LPF is engaged — kept disabled
|
|
5863
|
+
* for the legacy static helper so the bit-exact downsample-only tests
|
|
5864
|
+
* remain valid; the streaming path engages it when the instance's ``antiAlias`` option is enabled.
|
|
5865
|
+
*/
|
|
5866
|
+
static resampleStreaming(audio, ctx) {
|
|
5867
|
+
let buf;
|
|
5868
|
+
if (ctx.carryByte !== null) {
|
|
5869
|
+
buf = Buffer.concat([Buffer.from([ctx.carryByte]), audio]);
|
|
5870
|
+
ctx.carryByte = null;
|
|
5871
|
+
} else {
|
|
5872
|
+
buf = audio;
|
|
5873
|
+
}
|
|
5874
|
+
if (buf.length % 2 === 1) {
|
|
5875
|
+
ctx.carryByte = buf[buf.length - 1];
|
|
5876
|
+
buf = buf.subarray(0, buf.length - 1);
|
|
5877
|
+
}
|
|
5878
|
+
if (buf.length === 0 && ctx.leftover.length === 0) {
|
|
5879
|
+
return Buffer.alloc(0);
|
|
5880
|
+
}
|
|
5881
|
+
const direct8k = ctx.targetSampleRate === 8e3;
|
|
5882
|
+
const lpfAlpha = direct8k ? LPF_ALPHA_8K : LPF_ALPHA;
|
|
5883
|
+
const sampleCount = buf.length / 2;
|
|
5884
|
+
const samples = ctx.leftover.slice();
|
|
5885
|
+
const lpf = ctx.lpfEnabled !== false;
|
|
5886
|
+
let y = ctx.lpfPrev;
|
|
5887
|
+
for (let i2 = 0; i2 < sampleCount; i2++) {
|
|
5888
|
+
const x = buf.readInt16LE(i2 * 2);
|
|
5889
|
+
if (lpf) {
|
|
5890
|
+
y = lpfAlpha * x + (1 - lpfAlpha) * y;
|
|
5891
|
+
let s = Math.round(y);
|
|
5892
|
+
if (s > 32767) s = 32767;
|
|
5893
|
+
else if (s < -32768) s = -32768;
|
|
5894
|
+
samples.push(s);
|
|
5895
|
+
} else {
|
|
5896
|
+
samples.push(x);
|
|
4844
5897
|
}
|
|
4845
|
-
|
|
4846
|
-
|
|
5898
|
+
}
|
|
5899
|
+
if (lpf) ctx.lpfPrev = y;
|
|
5900
|
+
const out = [];
|
|
5901
|
+
let i = 0;
|
|
5902
|
+
if (direct8k) {
|
|
5903
|
+
while (i + 2 < samples.length) {
|
|
5904
|
+
out.push(samples[i]);
|
|
5905
|
+
i += 3;
|
|
4847
5906
|
}
|
|
4848
|
-
|
|
4849
|
-
|
|
5907
|
+
} else {
|
|
5908
|
+
while (i + 2 < samples.length) {
|
|
5909
|
+
out.push(samples[i]);
|
|
5910
|
+
out.push(Math.round((samples[i + 1] + samples[i + 2]) / 2));
|
|
5911
|
+
i += 3;
|
|
4850
5912
|
}
|
|
4851
5913
|
}
|
|
4852
|
-
|
|
4853
|
-
|
|
4854
|
-
|
|
4855
|
-
|
|
4856
|
-
const chunks = [];
|
|
4857
|
-
for await (const chunk of this.synthesizeStream(text)) {
|
|
4858
|
-
chunks.push(chunk);
|
|
5914
|
+
ctx.leftover = samples.slice(i);
|
|
5915
|
+
const buffer = Buffer.alloc(out.length * 2);
|
|
5916
|
+
for (let j = 0; j < out.length; j++) {
|
|
5917
|
+
buffer.writeInt16LE(out[j], j * 2);
|
|
4859
5918
|
}
|
|
4860
|
-
return
|
|
5919
|
+
return buffer;
|
|
4861
5920
|
}
|
|
4862
|
-
/**
|
|
4863
|
-
|
|
4864
|
-
|
|
4865
|
-
|
|
4866
|
-
|
|
4867
|
-
|
|
4868
|
-
|
|
4869
|
-
|
|
4870
|
-
|
|
4871
|
-
|
|
4872
|
-
|
|
4873
|
-
|
|
4874
|
-
|
|
4875
|
-
|
|
4876
|
-
});
|
|
4877
|
-
if (!response.ok) {
|
|
4878
|
-
const body = await response.text();
|
|
4879
|
-
throw new Error(`Rime TTS error ${response.status}: ${body}`);
|
|
4880
|
-
}
|
|
4881
|
-
const contentType = response.headers.get("content-type") ?? "";
|
|
4882
|
-
if (!contentType.startsWith("audio")) {
|
|
4883
|
-
const body = await response.text();
|
|
4884
|
-
throw new Error(`Rime returned non-audio response: ${body.slice(0, 500)}`);
|
|
4885
|
-
}
|
|
4886
|
-
if (!response.body) {
|
|
4887
|
-
throw new Error("Rime TTS: no response body");
|
|
5921
|
+
/** @deprecated use {@link resampleStreaming} with persistent state. */
|
|
5922
|
+
static resample24kTo16k(audio) {
|
|
5923
|
+
const ctx = {
|
|
5924
|
+
carryByte: null,
|
|
5925
|
+
leftover: [],
|
|
5926
|
+
lpfPrev: 0,
|
|
5927
|
+
lpfEnabled: false,
|
|
5928
|
+
targetSampleRate: 16e3
|
|
5929
|
+
};
|
|
5930
|
+
const out = _OpenAITTS.resampleStreaming(audio, ctx);
|
|
5931
|
+
if (ctx.leftover.length === 0) return out;
|
|
5932
|
+
const tail = Buffer.alloc(ctx.leftover.length * 2);
|
|
5933
|
+
for (let i = 0; i < ctx.leftover.length; i++) {
|
|
5934
|
+
tail.writeInt16LE(ctx.leftover[i], i * 2);
|
|
4888
5935
|
}
|
|
4889
|
-
|
|
4890
|
-
|
|
4891
|
-
|
|
4892
|
-
|
|
4893
|
-
|
|
4894
|
-
|
|
4895
|
-
|
|
4896
|
-
|
|
4897
|
-
|
|
4898
|
-
|
|
4899
|
-
|
|
4900
|
-
|
|
4901
|
-
|
|
4902
|
-
reader.releaseLock();
|
|
5936
|
+
return Buffer.concat([out, tail]);
|
|
5937
|
+
}
|
|
5938
|
+
};
|
|
5939
|
+
|
|
5940
|
+
// src/tts/openai.ts
|
|
5941
|
+
/**
 * Options-object wrapper around `OpenAITTS`.
 *
 * Resolves the API key from `opts.apiKey` or the `OPENAI_API_KEY` environment
 * variable and forwards the remaining options positionally, with defaults.
 */
var TTS3 = class extends OpenAITTS {
  static providerKey = "openai_tts";
  constructor(opts = {}) {
    const apiKey = opts.apiKey ?? process.env.OPENAI_API_KEY;
    if (!apiKey) {
      throw new Error(
        "OpenAI TTS requires an apiKey. Pass { apiKey: 'sk-...' } or set OPENAI_API_KEY in the environment."
      );
    }
    const voice = opts.voice ?? "alloy";
    const model = opts.model ?? "gpt-4o-mini-tts";
    const instructions = opts.instructions ?? null;
    const speed = opts.speed ?? null;
    const antiAlias = opts.antiAlias ?? false;
    super(apiKey, voice, model, instructions, speed, antiAlias);
  }
};
|
|
5960
|
+
|
|
5961
|
+
// src/tts/cartesia.ts
|
|
5962
|
+
init_esm_shims();
|
|
5963
|
+
/**
 * Pick the Cartesia API key: the explicit argument when it is not
 * null/undefined, otherwise the `CARTESIA_API_KEY` environment variable.
 *
 * @param {string | undefined | null} apiKey - Explicitly supplied key, if any.
 * @returns {string} The resolved, non-empty key.
 * @throws {Error} When the resolved value is missing or empty.
 */
function resolveApiKey3(apiKey) {
  const resolved = apiKey ?? process.env.CARTESIA_API_KEY;
  if (resolved) return resolved;
  throw new Error(
    "Cartesia TTS requires an apiKey. Pass { apiKey: '...' } or set CARTESIA_API_KEY in the environment."
  );
}
|
|
5972
|
+
/**
 * Options-object wrapper around `CartesiaTTS`.
 *
 * The API key is resolved through `resolveApiKey3`; every other option is
 * handed to the base constructor unchanged. `forTwilio` / `forTelnyx` accept
 * either `(apiKey, opts)` or a single options object and pin `sampleRate`
 * to 8000 and 16000 respectively.
 */
var TTS4 = class _TTS extends CartesiaTTS {
  static providerKey = "cartesia_tts";
  constructor(opts = {}) {
    const resolvedKey = resolveApiKey3(opts.apiKey);
    // Strip apiKey so it is not passed twice (positionally and inside options).
    const { apiKey: _dropped, ...baseOptions } = opts;
    void _dropped;
    super(resolvedKey, baseOptions);
  }
  static forTwilio(arg1, arg2) {
    let opts;
    if (typeof arg1 === "string") {
      opts = { apiKey: arg1, ...(arg2 ?? {}) };
    } else {
      opts = arg1 ?? {};
    }
    return new _TTS({ ...opts, sampleRate: 8e3 });
  }
  static forTelnyx(arg1, arg2) {
    let opts;
    if (typeof arg1 === "string") {
      opts = { apiKey: arg1, ...(arg2 ?? {}) };
    } else {
      opts = arg1 ?? {};
    }
    return new _TTS({ ...opts, sampleRate: 16e3 });
  }
};
|
|
4906
5989
|
|
|
4907
5990
|
// src/tts/rime.ts
|
|
5991
|
+
init_esm_shims();
|
|
4908
5992
|
var TTS5 = class extends RimeTTS {
|
|
4909
5993
|
static providerKey = "rime";
|
|
4910
5994
|
constructor(opts = {}) {
|
|
@@ -4943,6 +6027,8 @@ var LMNTSampleRate = {
|
|
|
4943
6027
|
HZ_24000: 24e3
|
|
4944
6028
|
};
|
|
4945
6029
|
var LMNTTTS = class {
|
|
6030
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
6031
|
+
static providerKey = "lmnt";
|
|
4946
6032
|
apiKey;
|
|
4947
6033
|
model;
|
|
4948
6034
|
voice;
|
|
@@ -5041,6 +6127,7 @@ init_esm_shims();
|
|
|
5041
6127
|
// src/providers/inworld-tts.ts
|
|
5042
6128
|
init_esm_shims();
|
|
5043
6129
|
var INWORLD_BASE_URL = "https://api.inworld.ai/tts/v1/voice:stream";
|
|
6130
|
+
var INWORLD_VOICES_URL = "https://api.inworld.ai/tts/v1/voices";
|
|
5044
6131
|
var InworldModel = {
|
|
5045
6132
|
TTS_2: "inworld-tts-2",
|
|
5046
6133
|
TTS_1_5_MAX: "inworld-tts-1.5-max",
|
|
@@ -5055,6 +6142,8 @@ var InworldAudioEncoding = {
|
|
|
5055
6142
|
MP3: "MP3"
|
|
5056
6143
|
};
|
|
5057
6144
|
var InworldTTS = class {
|
|
6145
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
6146
|
+
static providerKey = "inworld";
|
|
5058
6147
|
authToken;
|
|
5059
6148
|
model;
|
|
5060
6149
|
voice;
|
|
@@ -5099,6 +6188,45 @@ var InworldTTS = class {
|
|
|
5099
6188
|
if (this.deliveryMode !== void 0) payload.deliveryMode = this.deliveryMode;
|
|
5100
6189
|
return payload;
|
|
5101
6190
|
}
|
|
6191
|
+
/**
|
|
6192
|
+
* Pre-call HTTP warmup for the Inworld TTS API.
|
|
6193
|
+
*
|
|
6194
|
+
* Issues a lightweight `GET /tts/v1/voices` against the API host so
|
|
6195
|
+
* DNS + TLS + HTTP/2 connection are already up by the time the first
|
|
6196
|
+
* `synthesizeStream()` POST lands. Best-effort: 5 s timeout, all
|
|
6197
|
+
* exceptions swallowed at debug level.
|
|
6198
|
+
*
|
|
6199
|
+
* Earlier revisions issued `HEAD` against the streaming endpoint
|
|
6200
|
+
* (`/tts/v1/voice:stream`). That endpoint is POST-only so HEAD
|
|
6201
|
+
* returns `405 Method Not Allowed` — the warmup still completed the
|
|
6202
|
+
* TLS handshake but spammed 405 errors into Inworld's audit logs and
|
|
6203
|
+
* into our own logs. Switching to a documented `GET /tts/v1/voices`
|
|
6204
|
+
* metadata read is a 2xx-clean equivalent.
|
|
6205
|
+
*
|
|
6206
|
+
* Billing safety: `GET /tts/v1/voices` is a free metadata endpoint
|
|
6207
|
+
* (per https://docs.inworld.ai/). It returns the voice catalogue
|
|
6208
|
+
* without invoking the synthesis pipeline. The actual synthesis is
|
|
6209
|
+
* billed only when `POST /tts/v1/voice:stream` runs with a non-empty
|
|
6210
|
+
* `text`.
|
|
6211
|
+
*
|
|
6212
|
+
* Note: Inworld TTS uses the HTTP NDJSON streaming path rather than
|
|
6213
|
+
* a persistent WebSocket — connection warmup is therefore HTTP-based,
|
|
6214
|
+
* not WebSocket pre-handshake. The latency win is smaller (~50-150 ms)
|
|
6215
|
+
* than the WS-based prewarms but still real on cold-start calls.
|
|
6216
|
+
*/
|
|
6217
|
+
async warmup() {
|
|
6218
|
+
try {
|
|
6219
|
+
await fetch(INWORLD_VOICES_URL, {
|
|
6220
|
+
method: "GET",
|
|
6221
|
+
headers: {
|
|
6222
|
+
Authorization: `Basic ${this.authToken}`
|
|
6223
|
+
},
|
|
6224
|
+
signal: AbortSignal.timeout(5e3)
|
|
6225
|
+
});
|
|
6226
|
+
} catch (err) {
|
|
6227
|
+
getLogger().debug(`Inworld TTS warmup failed (best-effort): ${String(err)}`);
|
|
6228
|
+
}
|
|
6229
|
+
}
|
|
5102
6230
|
/** Synthesize text and return the concatenated audio buffer. */
|
|
5103
6231
|
async synthesize(text) {
|
|
5104
6232
|
const chunks = [];
|
|
@@ -5238,6 +6366,8 @@ var DEFAULT_MODEL = AnthropicModel.CLAUDE_HAIKU_4_5_20251001;
|
|
|
5238
6366
|
var DEFAULT_MAX_TOKENS = 1024;
|
|
5239
6367
|
var PROMPT_CACHING_BETA = "prompt-caching-2024-07-31";
|
|
5240
6368
|
var AnthropicLLMProvider = class {
|
|
6369
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
6370
|
+
static providerKey = "anthropic";
|
|
5241
6371
|
apiKey;
|
|
5242
6372
|
model;
|
|
5243
6373
|
maxTokens;
|
|
@@ -5259,6 +6389,27 @@ var AnthropicLLMProvider = class {
|
|
|
5259
6389
|
this.anthropicVersion = options.anthropicVersion ?? DEFAULT_ANTHROPIC_VERSION;
|
|
5260
6390
|
this.promptCaching = options.promptCaching ?? true;
|
|
5261
6391
|
}
|
|
6392
|
+
/**
|
|
6393
|
+
* Pre-call DNS / TLS warmup for the Anthropic Messages API.
|
|
6394
|
+
* Issues a lightweight ``GET https://api.anthropic.com/v1/models`` so
|
|
6395
|
+
* DNS, TLS and HTTP/2 are already up by the time the first ``messages``
|
|
6396
|
+
* call lands. Best-effort: 5 s timeout, exceptions swallowed at debug.
|
|
6397
|
+
*/
|
|
6398
|
+
async warmup() {
|
|
6399
|
+
try {
|
|
6400
|
+
const modelsUrl = this.url.replace(/\/messages\/?$/, "/models");
|
|
6401
|
+
await fetch(modelsUrl, {
|
|
6402
|
+
method: "GET",
|
|
6403
|
+
headers: {
|
|
6404
|
+
"x-api-key": this.apiKey,
|
|
6405
|
+
"anthropic-version": this.anthropicVersion
|
|
6406
|
+
},
|
|
6407
|
+
signal: AbortSignal.timeout(5e3)
|
|
6408
|
+
});
|
|
6409
|
+
} catch (err) {
|
|
6410
|
+
getLogger().debug(`Anthropic LLM warmup failed (best-effort): ${String(err)}`);
|
|
6411
|
+
}
|
|
6412
|
+
}
|
|
5262
6413
|
/** Stream Patter-format LLM chunks for the given OpenAI-style chat history. */
|
|
5263
6414
|
async *stream(messages, tools, opts) {
|
|
5264
6415
|
const { system, messages: anthropicMessages } = toAnthropicMessages(messages);
|
|
@@ -5476,12 +6627,6 @@ init_esm_shims();
|
|
|
5476
6627
|
|
|
5477
6628
|
// src/providers/groq-llm.ts
|
|
5478
6629
|
init_esm_shims();
|
|
5479
|
-
|
|
5480
|
-
// src/version.ts
|
|
5481
|
-
init_esm_shims();
|
|
5482
|
-
var VERSION = "0.5.5";
|
|
5483
|
-
|
|
5484
|
-
// src/providers/groq-llm.ts
|
|
5485
6630
|
var GROQ_BASE_URL = "https://api.groq.com/openai/v1";
|
|
5486
6631
|
var GroqModel = {
|
|
5487
6632
|
LLAMA_3_3_70B_VERSATILE: "llama-3.3-70b-versatile",
|
|
@@ -5494,6 +6639,8 @@ var GroqModel = {
|
|
|
5494
6639
|
};
|
|
5495
6640
|
var DEFAULT_MODEL2 = GroqModel.LLAMA_3_3_70B_VERSATILE;
|
|
5496
6641
|
var GroqLLMProvider = class {
|
|
6642
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
6643
|
+
static providerKey = "groq";
|
|
5497
6644
|
apiKey;
|
|
5498
6645
|
model;
|
|
5499
6646
|
baseUrl;
|
|
@@ -5527,6 +6674,21 @@ var GroqLLMProvider = class {
|
|
|
5527
6674
|
this.presencePenalty = options.presencePenalty;
|
|
5528
6675
|
this.stop = options.stop;
|
|
5529
6676
|
}
|
|
6677
|
+
/**
|
|
6678
|
+
* Pre-call DNS / TLS warmup for the Groq inference endpoint.
|
|
6679
|
+
* Best-effort: 5 s timeout, all exceptions swallowed at debug level.
|
|
6680
|
+
*/
|
|
6681
|
+
async warmup() {
|
|
6682
|
+
try {
|
|
6683
|
+
await fetch(`${this.baseUrl}/models`, {
|
|
6684
|
+
method: "GET",
|
|
6685
|
+
headers: { Authorization: `Bearer ${this.apiKey}` },
|
|
6686
|
+
signal: AbortSignal.timeout(5e3)
|
|
6687
|
+
});
|
|
6688
|
+
} catch (err) {
|
|
6689
|
+
getLogger().debug(`Groq LLM warmup failed (best-effort): ${String(err)}`);
|
|
6690
|
+
}
|
|
6691
|
+
}
|
|
5530
6692
|
/** Stream Patter-format LLM chunks from the Groq chat completions API. */
|
|
5531
6693
|
async *stream(messages, tools, opts) {
|
|
5532
6694
|
const body = {
|
|
@@ -5662,6 +6824,8 @@ var CerebrasModel = {
|
|
|
5662
6824
|
var DEFAULT_MODEL3 = CerebrasModel.GPT_OSS_120B;
|
|
5663
6825
|
var RETRY_BACKOFF_BASE_MS = 500;
|
|
5664
6826
|
var CerebrasLLMProvider = class {
|
|
6827
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
6828
|
+
static providerKey = "cerebras";
|
|
5665
6829
|
apiKey;
|
|
5666
6830
|
model;
|
|
5667
6831
|
baseUrl;
|
|
@@ -5697,6 +6861,21 @@ var CerebrasLLMProvider = class {
|
|
|
5697
6861
|
this.presencePenalty = options.presencePenalty;
|
|
5698
6862
|
this.stop = options.stop;
|
|
5699
6863
|
}
|
|
6864
|
+
/**
|
|
6865
|
+
* Pre-call DNS / TLS warmup for the Cerebras inference endpoint.
|
|
6866
|
+
* Best-effort: 5 s timeout, all exceptions swallowed at debug level.
|
|
6867
|
+
*/
|
|
6868
|
+
async warmup() {
|
|
6869
|
+
try {
|
|
6870
|
+
await fetch(`${this.baseUrl}/models`, {
|
|
6871
|
+
method: "GET",
|
|
6872
|
+
headers: { Authorization: `Bearer ${this.apiKey}` },
|
|
6873
|
+
signal: AbortSignal.timeout(5e3)
|
|
6874
|
+
});
|
|
6875
|
+
} catch (err) {
|
|
6876
|
+
getLogger().debug(`Cerebras LLM warmup failed (best-effort): ${String(err)}`);
|
|
6877
|
+
}
|
|
6878
|
+
}
|
|
5700
6879
|
/** Stream Patter-format LLM chunks from the Cerebras chat completions API. */
|
|
5701
6880
|
async *stream(messages, tools, opts) {
|
|
5702
6881
|
const body = {
|
|
@@ -5859,6 +7038,8 @@ var GoogleModel = {
|
|
|
5859
7038
|
var DEFAULT_MODEL4 = GoogleModel.GEMINI_2_5_FLASH;
|
|
5860
7039
|
var DEFAULT_BASE_URL3 = "https://generativelanguage.googleapis.com/v1beta";
|
|
5861
7040
|
var GoogleLLMProvider = class {
|
|
7041
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
7042
|
+
static providerKey = "google";
|
|
5862
7043
|
apiKey;
|
|
5863
7044
|
model;
|
|
5864
7045
|
baseUrl;
|
|
@@ -5876,6 +7057,23 @@ var GoogleLLMProvider = class {
|
|
|
5876
7057
|
this.temperature = options.temperature;
|
|
5877
7058
|
this.maxOutputTokens = options.maxOutputTokens;
|
|
5878
7059
|
}
|
|
7060
|
+
/**
|
|
7061
|
+
* Pre-call DNS / TLS warmup for the Gemini API.
|
|
7062
|
+
* Issues a lightweight ``GET ${baseUrl}/models?key=...`` so DNS, TLS
|
|
7063
|
+
* and HTTP/2 are already up by the time the first
|
|
7064
|
+
* ``streamGenerateContent`` call lands. Best-effort: 5 s timeout, all
|
|
7065
|
+
* exceptions swallowed at debug level.
|
|
7066
|
+
*/
|
|
7067
|
+
async warmup() {
|
|
7068
|
+
try {
|
|
7069
|
+
await fetch(`${this.baseUrl}/models?key=${encodeURIComponent(this.apiKey)}`, {
|
|
7070
|
+
method: "GET",
|
|
7071
|
+
signal: AbortSignal.timeout(5e3)
|
|
7072
|
+
});
|
|
7073
|
+
} catch (err) {
|
|
7074
|
+
getLogger().debug(`Google LLM warmup failed (best-effort): ${String(err)}`);
|
|
7075
|
+
}
|
|
7076
|
+
}
|
|
5879
7077
|
/** Stream Patter-format LLM chunks from the Gemini SSE endpoint. */
|
|
5880
7078
|
async *stream(messages, tools, opts) {
|
|
5881
7079
|
const { systemInstruction, contents } = toGeminiContents(messages);
|
|
@@ -6065,6 +7263,186 @@ var LLM5 = class extends GoogleLLMProvider {
|
|
|
6065
7263
|
}
|
|
6066
7264
|
};
|
|
6067
7265
|
|
|
7266
|
+
// src/providers/deepfilternet-filter.ts
|
|
7267
|
+
init_esm_shims();
|
|
7268
|
+
/** Fetch the logger at call time rather than capturing it at module load. */
function log() {
  const logger = getLogger();
  return logger;
}
|
|
7271
|
+
/** Sample rate (Hz) that audio is converted to before the model runs. */
var DEEPFILTERNET_SR = 48e3;

/**
 * Try to load the optional `onnxruntime-node` dependency.
 *
 * The specifier is held in a variable rather than written inline in the
 * `import()` call (presumably so bundlers do not resolve it statically).
 *
 * @returns {Promise<object | null>} The module namespace, or `null` when the
 *   import fails for any reason.
 */
async function loadOnnxRuntime() {
  const specifier = "onnxruntime-node";
  return import(specifier).then(
    (runtime) => runtime,
    () => null
  );
}
|
|
7281
|
+
/**
 * Decode little-endian PCM16 bytes into floats in [-1, 1).
 *
 * Reads through a `DataView` instead of an `Int16Array` view: an
 * `Int16Array` needs a 2-byte-aligned `byteOffset` (a sliced Buffer can
 * start on an odd offset and would throw `RangeError`) and decodes in host
 * byte order, whereas the matching encoder writes little-endian explicitly.
 *
 * @param {Buffer | Uint8Array} pcm - PCM16-LE bytes; a trailing odd byte is ignored.
 * @returns {Float32Array} One float per complete 16-bit sample (value / 32768).
 */
function pcm16ToFloat32(pcm) {
  const sampleCount = Math.floor(pcm.byteLength / 2);
  const out = new Float32Array(sampleCount);
  const view = new DataView(pcm.buffer, pcm.byteOffset, pcm.byteLength);
  for (let i = 0; i < sampleCount; i += 1) {
    out[i] = view.getInt16(i * 2, true) / 32768;
  }
  return out;
}
|
|
7289
|
+
/**
 * Encode float samples as little-endian PCM16.
 *
 * Each sample is clamped to [-1, 1], scaled by 32767 and rounded.
 *
 * @param {ArrayLike<number>} samples - Float samples.
 * @returns {Buffer} Two bytes per input sample.
 */
function float32ToPcm16(samples) {
  const total = samples.length;
  const pcm = Buffer.alloc(total * 2);
  let cursor = 0;
  for (let index = 0; index < total; index += 1) {
    const limited = Math.min(1, Math.max(-1, samples[index]));
    // writeInt16LE returns the offset just past the bytes it wrote.
    cursor = pcm.writeInt16LE(Math.round(limited * 32767), cursor);
  }
  return pcm;
}
|
|
7297
|
+
/**
 * Noise-suppression audio filter that runs a DeepFilterNet ONNX model.
 *
 * The filter degrades to pass-through (input returned unchanged) whenever
 * it cannot run: no `modelPath`, `onnxruntime-node` not loadable, the model
 * failing to load, or inference throwing.
 *
 * Options:
 *   - `modelPath`: path handed to `InferenceSession.create`.
 *   - `silenceWarnings`: when `true`, suppresses the one-time warnings for a
 *     missing model path or a missing runtime.
 */
var DeepFilterNetFilter = class {
  modelPath;
  silenceWarnings;
  session = null;
  ort = null;
  warned = false;
  closed = false;
  // Set once the runtime or the model has failed to load. Without it every
  // audio chunk would retry the import / model load and log the same error.
  _loadFailed = false;
  // Stateful resamplers for the sampleRate <-> 48 kHz conversions, so state
  // carries across chunk boundaries. Created lazily and rebuilt whenever the
  // caller's sample rate changes.
  _resamplerSrcRate = null;
  _upsamplerInst = null;
  _downsamplerInst = null;
  constructor(options = {}) {
    this.modelPath = options.modelPath;
    this.silenceWarnings = options.silenceWarnings === true;
  }
  /**
   * Lazily create the ONNX inference session.
   *
   * @returns {Promise<object | null>} The session, or `null` when the filter
   *   must pass audio through. A load failure is remembered and not retried.
   */
  async ensureSession() {
    if (this.session !== null) {
      return this.session;
    }
    if (this._loadFailed) {
      return null;
    }
    if (!this.modelPath) {
      if (!this.warned && !this.silenceWarnings) {
        log().warn(
          "DeepFilterNetFilter: no modelPath provided; audio will pass through unmodified. Provide a DeepFilterNet ONNX model to enable noise suppression."
        );
        this.warned = true;
      }
      return null;
    }
    if (this.ort === null) {
      this.ort = await loadOnnxRuntime();
    }
    if (this.ort === null) {
      this._loadFailed = true;
      if (!this.warned && !this.silenceWarnings) {
        log().warn(
          "DeepFilterNetFilter: onnxruntime-node is not installed; audio will pass through unmodified. Run `npm install onnxruntime-node` to enable noise suppression."
        );
        this.warned = true;
      }
      return null;
    }
    try {
      this.session = await this.ort.InferenceSession.create(this.modelPath);
      return this.session;
    } catch (error) {
      this._loadFailed = true;
      const message = error instanceof Error ? error.message : String(error);
      log().error(`DeepFilterNetFilter: failed to load model: ${message}`);
      this.warned = true;
      return null;
    }
  }
  /**
   * Run noise suppression on a PCM16 chunk; pass-through when no model is loaded.
   *
   * @param {Buffer} pcmChunk - PCM16 audio at `sampleRate`.
   * @param {number} sampleRate - Sample rate of `pcmChunk` in Hz.
   * @returns {Promise<Buffer>} Enhanced audio, or `pcmChunk` itself when the
   *   filter cannot run or inference fails.
   * @throws {Error} When called after `close()`.
   */
  async process(pcmChunk, sampleRate) {
    if (this.closed) {
      throw new Error("DeepFilterNetFilter is closed");
    }
    if (pcmChunk.length === 0) {
      return pcmChunk;
    }
    const session = await this.ensureSession();
    if (session === null || this.ort === null) {
      return pcmChunk;
    }
    try {
      if (this._resamplerSrcRate !== sampleRate) {
        this._resamplerSrcRate = sampleRate;
        this._upsamplerInst = new StatefulResampler({ srcRate: sampleRate, dstRate: DEEPFILTERNET_SR });
        this._downsamplerInst = new StatefulResampler({ srcRate: DEEPFILTERNET_SR, dstRate: sampleRate });
      }
      const samples = pcm16ToFloat32(pcmChunk);
      // `samples` is encoded directly; the extra Float32Array copy that used
      // to wrap it produced identical bytes.
      const pcm16Up = this._upsamplerInst.process(float32ToPcm16(samples));
      const upsampled = pcm16ToFloat32(pcm16Up);
      // The model is fed through its first declared input and read from its
      // first declared output, as one [1, N] float32 tensor.
      const inputName = session.inputNames[0];
      const outputName = session.outputNames[0];
      const tensor = new this.ort.Tensor("float32", upsampled, [1, upsampled.length]);
      const feeds = { [inputName]: tensor };
      const results = await session.run(feeds);
      const output = results[outputName];
      if (!output || !output.data) {
        return pcmChunk;
      }
      const enhanced = output.data instanceof Float32Array ? output.data : new Float32Array(output.data);
      const pcm16Enhanced = float32ToPcm16(enhanced);
      const pcm16Restored = this._downsamplerInst.process(pcm16Enhanced);
      return pcm16Restored;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      log().error(`DeepFilterNetFilter.process failed: ${message}`);
      return pcmChunk;
    }
  }
  /** Flush resamplers, release the ONNX session, and mark the filter closed. */
  async close() {
    // Flush errors are ignored: the resamplers are discarded right after.
    try {
      this._upsamplerInst?.flush();
    } catch {
    }
    try {
      this._downsamplerInst?.flush();
    } catch {
    }
    this._upsamplerInst = null;
    this._downsamplerInst = null;
    if (this.session !== null && typeof this.session.release === "function") {
      try {
        await this.session.release();
      } catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        log().warn(`DeepFilterNetFilter.close: release failed: ${message}`);
      }
    }
    this.session = null;
    this.closed = true;
  }
};
|
|
7412
|
+
|
|
7413
|
+
// src/providers/krisp-filter.ts
|
|
7414
|
+
init_esm_shims();
|
|
7415
|
+
/** Named sample-rate values, in Hz. */
var KrispSampleRate = {
  HZ_8000: 8000,
  HZ_16000: 16000,
  HZ_32000: 32000,
  HZ_44100: 44100,
  HZ_48000: 48000,
};

/** Named frame-duration values, in milliseconds. */
var KrispFrameDuration = {
  MS_10: 10,
  MS_15: 15,
  MS_20: 20,
  MS_30: 30,
  MS_32: 32,
};
|
|
7429
|
+
/** Error text thrown by the `KrispVivaFilter` constructor, one entry per line. */
var NODE_SDK_UNAVAILABLE_MESSAGE = [
  "Krisp VIVA Filter is not yet available for the Patter TypeScript SDK.",
  "",
  "As of 2026-05, Krisp does not publish an official Node.js (server) SDK. The Patter TypeScript SDK ships only the AudioFilter interface scaffold (this file) for parity with the Python implementation, since Patter runs server-side on a real-time audio stream from the telephony carrier.",
  "",
  "Available paths today:",
  " 1. Use the Python SDK: `from getpatter.providers.krisp_filter import KrispVivaFilter` \u2014 fully implemented, requires `pip install getpatter[krisp]` + `KRISP_VIVA_SDK_LICENSE_KEY` + `KRISP_VIVA_FILTER_MODEL_PATH`.",
  " 2. Use DeepFilterNet on TS: `new DeepFilterNetFilter({ modelPath: '.../DeepFilterNet3.onnx' })` \u2014 community ONNX export, no license needed.",
  "",
  "Browser/React Native (not applicable to Patter server-side, listed for completeness):",
  " - Browser WASM wrappers (various third-party packages) process local microphone capture, not server-received PCM/mulaw audio.",
  " - Mobile client wrappers (iOS/Android, various third-party packages) are likewise client-side only.",
  "",
  "Track Node SDK status:",
  " - https://krisp.ai/developers/",
  " - Patter backlog: task #38 \"Krisp TS port decision\"",
  ""
].join("\n");

/**
 * Placeholder for a Krisp VIVA filter. Construction always throws
 * `NODE_SDK_UNAVAILABLE_MESSAGE`; the methods exist only to give the class
 * the `process` / `close` shape.
 */
var KrispVivaFilter = class {
  static providerKey = "krisp_viva";
  constructor(_options = {}) {
    throw new Error(NODE_SDK_UNAVAILABLE_MESSAGE);
  }
  // Not reachable through an instance, because the constructor throws.
  /** Identity pass-through: resolves to the chunk it was given. */
  async process(pcmChunk, _sampleRate) {
    return pcmChunk;
  }
  /** No-op: there is nothing to release. */
  async close() {
  }
};
|
|
7445
|
+
|
|
6068
7446
|
// src/telephony/twilio.ts
|
|
6069
7447
|
init_esm_shims();
|
|
6070
7448
|
var Carrier = class {
|
|
@@ -6905,12 +8283,28 @@ var TwilioAdapter = class _TwilioAdapter {
|
|
|
6905
8283
|
return { callSid: call.sid };
|
|
6906
8284
|
}
|
|
6907
8285
|
/**
|
|
6908
|
-
* Build a
|
|
6909
|
-
*
|
|
8286
|
+
* Build a ``<Response><Connect><Stream url="...">`` TwiML document.
|
|
8287
|
+
*
|
|
8288
|
+
* ``parameters`` is forwarded as ``<Parameter name="..." value="..."/>``
|
|
8289
|
+
* children of ``<Stream>``. Twilio Media Streams strips query-string params
|
|
8290
|
+
* from the ``<Stream url=...>`` before the WS handshake, so
|
|
8291
|
+
* ``<Parameter>`` tags are the supported way to pre-populate
|
|
8292
|
+
* ``start.customParameters`` on the WS ``start`` frame. Used by the
|
|
8293
|
+
* inbound path to carry caller / callee through to the bridge.
|
|
8294
|
+
*
|
|
8295
|
+
* Mirrors the Python adapter's ``generate_stream_twiml``.
|
|
6910
8296
|
*/
|
|
6911
|
-
static generateStreamTwiml(streamUrl) {
|
|
6912
|
-
const
|
|
6913
|
-
|
|
8297
|
+
static generateStreamTwiml(streamUrl, parameters) {
|
|
8298
|
+
const esc = (s) => s.replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">").replace(/"/g, """).replace(/'/g, "'");
|
|
8299
|
+
const escapedUrl = esc(streamUrl);
|
|
8300
|
+
let paramTags = "";
|
|
8301
|
+
if (parameters) {
|
|
8302
|
+
for (const [name, value] of Object.entries(parameters)) {
|
|
8303
|
+
if (value == null) continue;
|
|
8304
|
+
paramTags += `<Parameter name="${esc(name)}" value="${esc(String(value))}"/>`;
|
|
8305
|
+
}
|
|
8306
|
+
}
|
|
8307
|
+
return `<?xml version="1.0" encoding="UTF-8"?><Response><Connect><Stream url="${escapedUrl}">${paramTags}</Stream></Connect></Response>`;
|
|
6914
8308
|
}
|
|
6915
8309
|
/** Force-complete an in-progress call. */
|
|
6916
8310
|
async endCall(callSid) {
|
|
@@ -7100,6 +8494,8 @@ var TelnyxSTT = class {
|
|
|
7100
8494
|
transcriptionEngine;
|
|
7101
8495
|
sampleRate;
|
|
7102
8496
|
baseUrl;
|
|
8497
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
8498
|
+
static providerKey = "telnyx_stt";
|
|
7103
8499
|
ws = null;
|
|
7104
8500
|
callbacks = [];
|
|
7105
8501
|
headerSent = false;
|
|
@@ -7204,6 +8600,8 @@ var TelnyxTTS = class {
|
|
|
7204
8600
|
apiKey;
|
|
7205
8601
|
voice;
|
|
7206
8602
|
baseUrl;
|
|
8603
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
8604
|
+
static providerKey = "telnyx_tts";
|
|
7207
8605
|
/** Collect every audio chunk into a single Buffer. */
|
|
7208
8606
|
async synthesize(text) {
|
|
7209
8607
|
const chunks = [];
|
|
@@ -7299,16 +8697,23 @@ export {
|
|
|
7299
8697
|
CallMetricsAccumulator,
|
|
7300
8698
|
STT4 as CartesiaSTT,
|
|
7301
8699
|
TTS4 as CartesiaTTS,
|
|
8700
|
+
CartesiaTTSModel,
|
|
8701
|
+
CartesiaTTSVoiceMode,
|
|
7302
8702
|
LLM4 as CerebrasLLM,
|
|
7303
8703
|
ChatContext,
|
|
7304
8704
|
CloudflareTunnel,
|
|
7305
8705
|
DEFAULT_MIN_SENTENCE_LEN,
|
|
7306
8706
|
DEFAULT_PRICING,
|
|
7307
8707
|
DTMF_EVENTS,
|
|
8708
|
+
DeepFilterNetFilter,
|
|
8709
|
+
DeepgramModel,
|
|
7308
8710
|
STT as DeepgramSTT,
|
|
7309
8711
|
DefaultToolExecutor,
|
|
7310
8712
|
ConvAI as ElevenLabsConvAI,
|
|
7311
8713
|
ElevenLabsConvAIAdapter,
|
|
8714
|
+
ElevenLabsModel,
|
|
8715
|
+
ElevenLabsOutputFormat,
|
|
8716
|
+
ElevenLabsTTS as ElevenLabsRestTTS,
|
|
7312
8717
|
TTS as ElevenLabsTTS,
|
|
7313
8718
|
TTS2 as ElevenLabsWebSocketTTS,
|
|
7314
8719
|
ErrorCode,
|
|
@@ -7322,16 +8727,29 @@ export {
|
|
|
7322
8727
|
Guardrail,
|
|
7323
8728
|
IVRActivity,
|
|
7324
8729
|
TTS7 as InworldTTS,
|
|
8730
|
+
KrispFrameDuration,
|
|
8731
|
+
KrispSampleRate,
|
|
8732
|
+
KrispVivaFilter,
|
|
7325
8733
|
LLMLoop,
|
|
7326
8734
|
TTS6 as LMNTTTS,
|
|
7327
8735
|
MetricsStore,
|
|
8736
|
+
MinWordsStrategy,
|
|
7328
8737
|
Ngrok,
|
|
7329
8738
|
LLM as OpenAILLM,
|
|
7330
8739
|
OpenAILLMProvider,
|
|
7331
8740
|
Realtime as OpenAIRealtime,
|
|
8741
|
+
Realtime2 as OpenAIRealtime2,
|
|
8742
|
+
OpenAIRealtime2Adapter,
|
|
7332
8743
|
OpenAIRealtimeAdapter,
|
|
8744
|
+
OpenAIRealtimeAudioFormat,
|
|
8745
|
+
OpenAIRealtimeModel,
|
|
8746
|
+
OpenAIRealtimeVADType,
|
|
7333
8747
|
TTS3 as OpenAITTS,
|
|
7334
8748
|
STT3 as OpenAITranscribeSTT,
|
|
8749
|
+
OpenAITranscriptionModel,
|
|
8750
|
+
OpenAIVoice,
|
|
8751
|
+
PRICING_LAST_UPDATED,
|
|
8752
|
+
PRICING_VERSION,
|
|
7335
8753
|
PartialStreamError,
|
|
7336
8754
|
Patter,
|
|
7337
8755
|
PatterConnectionError,
|
|
@@ -7339,9 +8757,12 @@ export {
|
|
|
7339
8757
|
PatterTool,
|
|
7340
8758
|
PcmCarry,
|
|
7341
8759
|
PipelineHookExecutor,
|
|
8760
|
+
PricingUnit,
|
|
7342
8761
|
ProvisionError,
|
|
7343
8762
|
RateLimitError,
|
|
7344
8763
|
RemoteMessageHandler,
|
|
8764
|
+
RimeAudioFormat,
|
|
8765
|
+
RimeModel,
|
|
7345
8766
|
TTS5 as RimeTTS,
|
|
7346
8767
|
SPAN_BARGEIN,
|
|
7347
8768
|
SPAN_CALL,
|
|
@@ -7395,6 +8816,7 @@ export {
|
|
|
7395
8816
|
deepgram,
|
|
7396
8817
|
defineTool,
|
|
7397
8818
|
elevenlabs,
|
|
8819
|
+
evaluateStrategies as evaluateBargeInStrategies,
|
|
7398
8820
|
filterEmoji,
|
|
7399
8821
|
filterForTTS,
|
|
7400
8822
|
filterMarkdown,
|
|
@@ -7420,6 +8842,7 @@ export {
|
|
|
7420
8842
|
resample24kTo16k,
|
|
7421
8843
|
resample8kTo16k,
|
|
7422
8844
|
resamplePcm,
|
|
8845
|
+
resetStrategies as resetBargeInStrategies,
|
|
7423
8846
|
rime,
|
|
7424
8847
|
scheduleCron,
|
|
7425
8848
|
scheduleInterval,
|