getpatter 0.6.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/barge-in-strategies-X6ARMGIQ.mjs +12 -0
- package/dist/chunk-D4424JZR.mjs +71 -0
- package/dist/{chunk-X3364LSI.mjs → chunk-RV7APPYE.mjs} +36 -2
- package/dist/{chunk-JUQ5WQTQ.mjs → chunk-TEW3NAZJ.mjs} +3244 -1674
- package/dist/cli.js +277 -24
- package/dist/dashboard/ui.html +13 -13
- package/dist/index.d.mts +1525 -364
- package/dist/index.d.ts +1525 -364
- package/dist/index.js +3921 -986
- package/dist/index.mjs +1310 -70
- package/dist/{silero-vad-YLCXT5GQ.mjs → silero-vad-NSEXI4XS.mjs} +1 -1
- package/dist/{test-mode-Y7YG5LFZ.mjs → test-mode-WEKKNBLD.mjs} +1 -1
- package/package.json +1 -1
- package/src/dashboard/ui.html +13 -13
package/dist/index.mjs
CHANGED
|
@@ -15,6 +15,7 @@ import {
|
|
|
15
15
|
LLMLoop,
|
|
16
16
|
MetricsStore,
|
|
17
17
|
OpenAILLMProvider,
|
|
18
|
+
OpenAIRealtime2Adapter,
|
|
18
19
|
OpenAIRealtimeAdapter,
|
|
19
20
|
PatterConnectionError,
|
|
20
21
|
PatterError,
|
|
@@ -59,7 +60,12 @@ import {
|
|
|
59
60
|
resample8kTo16k,
|
|
60
61
|
resolveLogRoot,
|
|
61
62
|
startSpan
|
|
62
|
-
} from "./chunk-JUQ5WQTQ.mjs";
|
|
63
|
+
} from "./chunk-TEW3NAZJ.mjs";
|
|
64
|
+
import {
|
|
65
|
+
MinWordsStrategy,
|
|
66
|
+
evaluateStrategies,
|
|
67
|
+
resetStrategies
|
|
68
|
+
} from "./chunk-D4424JZR.mjs";
|
|
63
69
|
import {
|
|
64
70
|
getLogger,
|
|
65
71
|
setLogger
|
|
@@ -69,7 +75,7 @@ import {
|
|
|
69
75
|
} from "./chunk-6GR5MHHQ.mjs";
|
|
70
76
|
import {
|
|
71
77
|
SileroVAD
|
|
72
|
-
} from "./chunk-X3364LSI.mjs";
|
|
78
|
+
} from "./chunk-RV7APPYE.mjs";
|
|
73
79
|
import {
|
|
74
80
|
__dirname,
|
|
75
81
|
__require,
|
|
@@ -106,6 +112,30 @@ var Realtime = class {
|
|
|
106
112
|
}
|
|
107
113
|
};
|
|
108
114
|
|
|
115
|
+
// src/engines/openai-2.ts
|
|
116
|
+
init_esm_shims();
|
|
117
|
+
/**
 * Engine marker for OpenAI Realtime 2. Holds configuration only; it opens
 * no connection itself.
 *
 * @param {object} [opts]
 * @param {string} [opts.apiKey] - OpenAI API key. When missing or empty,
 *   ``process.env.OPENAI_API_KEY`` is used instead.
 * @param {string} [opts.model] - Defaults to ``"gpt-realtime-2"``.
 * @param {string} [opts.voice] - Defaults to ``"alloy"``.
 * @param {*} [opts.reasoningEffort] - Stored as given (may be undefined).
 * @param {*} [opts.inputAudioTranscriptionModel] - Stored as given.
 * @throws {Error} When no non-empty key is available from either source.
 */
var Realtime2 = class {
  kind = "openai_realtime_2";
  apiKey;
  model;
  voice;
  reasoningEffort;
  inputAudioTranscriptionModel;
  constructor(opts = {}) {
    // An explicit `null` bypasses the default parameter; treat it as "no options".
    const options = opts ?? {};
    // `||` rather than `??`: an empty-string apiKey is never valid, so it must
    // fall through to the environment instead of triggering the error below.
    const key = options.apiKey || process.env.OPENAI_API_KEY;
    if (!key) {
      throw new Error(
        "OpenAI Realtime 2 requires an apiKey. Pass { apiKey: 'sk-...' } or set OPENAI_API_KEY in the environment."
      );
    }
    this.apiKey = key;
    this.model = options.model ?? "gpt-realtime-2";
    this.voice = options.voice ?? "alloy";
    this.reasoningEffort = options.reasoningEffort;
    this.inputAudioTranscriptionModel = options.inputAudioTranscriptionModel;
  }
};
|
|
138
|
+
|
|
109
139
|
// src/engines/elevenlabs.ts
|
|
110
140
|
init_esm_shims();
|
|
111
141
|
var ConvAI = class {
|
|
@@ -520,12 +550,35 @@ function filterUndef(obj) {
|
|
|
520
550
|
}
|
|
521
551
|
|
|
522
552
|
// src/client.ts
|
|
553
|
+
var PREWARM_CACHE_MAX = 200;
|
|
554
|
+
var PREWARM_TTL_GRACE_MS = 5e3;
|
|
555
|
+
var PARKED_CONN_TTL_MS = 3e4;
|
|
523
556
|
function resolvePersistRoot(persist) {
|
|
524
557
|
if (persist === false) return null;
|
|
525
558
|
if (persist === true) return resolveLogRoot("auto");
|
|
526
559
|
if (typeof persist === "string") return resolveLogRoot(persist);
|
|
527
560
|
return resolveLogRoot();
|
|
528
561
|
}
|
|
562
|
+
/**
 * Best-effort close of every connection held in a parked-connection slot.
 *
 * Closes, in order, ``slot.stt``, ``slot.tts.ws`` and ``slot.openaiRealtime``
 * when present. A failure to close one connection never prevents the
 * remaining ones from being attempted.
 *
 * @param {object} [slot] - Parked-connection slot; a missing slot is a no-op.
 * @returns {void}
 */
function closeParkedConnections(slot) {
  if (!slot) return;
  const parked = [slot.stt, slot.tts?.ws, slot.openaiRealtime];
  for (const connection of parked) {
    if (!connection) continue;
    try {
      connection.close();
    } catch {
      // Deliberately ignored: this runs on teardown paths, and one socket
      // failing to close must not stop the others from being closed.
    }
  }
}
|
|
529
582
|
var Patter = class {
|
|
530
583
|
localConfig;
|
|
531
584
|
embeddedServer = null;
|
|
@@ -546,6 +599,65 @@ var Patter = class {
|
|
|
546
599
|
* ``Cannot use both tunnel: true and webhookUrl``.
|
|
547
600
|
*/
|
|
548
601
|
tunnelOwnsWebhookUrl = false;
|
|
602
|
+
/**
|
|
603
|
+
* Pre-rendered first-message TTS audio per outbound call_id. Populated
|
|
604
|
+
* by :meth:`call` when ``agent.prewarmFirstMessage`` is true; consumed
|
|
605
|
+
* by the StreamHandler firstMessage emit so the greeting streams
|
|
606
|
+
* instantly on ``start`` instead of paying the 200-700 ms TTS first-byte
|
|
607
|
+
* latency. See ``AgentOptions.prewarmFirstMessage``.
|
|
608
|
+
*
|
|
609
|
+
* Stores raw bytes in the TTS provider's native sample rate; the
|
|
610
|
+
* carrier-side audio sender resamples on emit.
|
|
611
|
+
*/
|
|
612
|
+
prewarmAudio = /* @__PURE__ */ new Map();
|
|
613
|
+
/**
|
|
614
|
+
* Call IDs whose prewarm cache slot has already been consumed —
|
|
615
|
+
* either by ``popPrewarmAudio`` (cache hit OR miss on the firstMessage
|
|
616
|
+
* emit path) or by ``recordPrewarmWaste`` (call ended before pickup).
|
|
617
|
+
* The prewarm task checks this set BEFORE writing bytes so a slow
|
|
618
|
+
* synth that finishes after the consumer already polled doesn't
|
|
619
|
+
* orphan bytes in ``prewarmAudio``. See FIX #92 in the parity audit.
|
|
620
|
+
*/
|
|
621
|
+
prewarmConsumed = /* @__PURE__ */ new Set();
|
|
622
|
+
/**
|
|
623
|
+
* Background tasks tracked so :meth:`disconnect` can wait on / drop any
|
|
624
|
+
* still-running prewarm-first-message synth before tearing down.
|
|
625
|
+
*/
|
|
626
|
+
prewarmTasks = /* @__PURE__ */ new Set();
|
|
627
|
+
/**
|
|
628
|
+
* TTL eviction timers keyed by call_id so :meth:`disconnect` (and
|
|
629
|
+
* normal consumption / waste-record paths) can cancel any pending
|
|
630
|
+
* timer when the slot drains naturally. Without this, the timer
|
|
631
|
+
* would WARN spuriously after the cache was already emptied.
|
|
632
|
+
*/
|
|
633
|
+
prewarmTtlTimers = /* @__PURE__ */ new Map();
|
|
634
|
+
/**
|
|
635
|
+
* Pre-opened, fully-handshaked provider WebSockets keyed by
|
|
636
|
+
* carrier-issued call_id. Populated by ``parkProviderConnections``
|
|
637
|
+
* during the carrier ringing window; consumed by the per-call
|
|
638
|
+
* StreamHandler at ``start`` via ``adoptWebSocket(...)`` so STT / TTS
|
|
639
|
+
* / Realtime audio can flow on the first turn without paying the
|
|
640
|
+
* 150-900 ms TLS + WS-upgrade + protocol-handshake round-trip again.
|
|
641
|
+
*
|
|
642
|
+
* Distinct from ``prewarmAudio`` (which holds pre-rendered TTS bytes
|
|
643
|
+
* for the first message); the two features are complementary and
|
|
644
|
+
* orthogonal — both can be active for the same call.
|
|
645
|
+
*
|
|
646
|
+
* Each slot may hold up to three parked connections (STT, TTS,
|
|
647
|
+
* Realtime). Drained by:
|
|
648
|
+
* - {@link popPrewarmedConnections} on the carrier ``start`` event
|
|
649
|
+
* (consumed normally — the handles transfer to the StreamHandler)
|
|
650
|
+
* - {@link recordPrewarmWaste} on call-termination paths (no-answer,
|
|
651
|
+
* busy, failed, canceled, AMD voicemail). Closes parked sockets.
|
|
652
|
+
* - {@link disconnect} on Patter teardown. Closes all parked sockets.
|
|
653
|
+
*/
|
|
654
|
+
prewarmedConnections = /* @__PURE__ */ new Map();
|
|
655
|
+
/**
|
|
656
|
+
* TTL eviction handles keyed by call_id for connections that are never
|
|
657
|
+
* adopted (e.g. a carrier that swallows ``start``). Closes the parked
|
|
658
|
+
* sockets so they don't leak past the safety window.
|
|
659
|
+
*/
|
|
660
|
+
prewarmedConnTimers = /* @__PURE__ */ new Map();
|
|
549
661
|
/**
|
|
550
662
|
* Speech-edge events for turn-taking instrumentation. Public surface: the
|
|
551
663
|
* seven `on*` proxy accessors below plus the `conversationState` snapshot.
|
|
@@ -553,13 +665,15 @@ var Patter = class {
|
|
|
553
665
|
* the previous behaviour.
|
|
554
666
|
*
|
|
555
667
|
* See `src/_speech-events.ts` for the full event taxonomy and the
|
|
556
|
-
*
|
|
668
|
+
* OpenAI Realtime alignment table.
|
|
557
669
|
*/
|
|
558
670
|
speechEvents = new SpeechEvents();
|
|
559
671
|
// ---- Speech-edge event callback proxies ------------------------------
|
|
560
|
-
// The seven `on*` properties below
|
|
561
|
-
//
|
|
562
|
-
//
|
|
672
|
+
// The seven `on*` properties below follow the canonical voice-agent
|
|
673
|
+
// metric set (user/agent state transitions, turn boundaries, TTFT, audio
|
|
674
|
+
// first-byte) and align with OpenAI Realtime where applicable. They
|
|
675
|
+
// proxy to `speechEvents` so the dispatcher remains the single source of
|
|
676
|
+
// truth (state + OTel).
|
|
563
677
|
get onUserSpeechStarted() {
|
|
564
678
|
return this.speechEvents.onUserSpeechStarted;
|
|
565
679
|
}
|
|
@@ -604,8 +718,8 @@ var Patter = class {
|
|
|
604
718
|
}
|
|
605
719
|
/**
|
|
606
720
|
* Snapshot of the current per-side state of the call.
|
|
607
|
-
*
|
|
608
|
-
*
|
|
721
|
+
* Returns the user_state / agent_state payload shape — read-only and
|
|
722
|
+
* safe to call at any time.
|
|
609
723
|
*/
|
|
610
724
|
get conversationState() {
|
|
611
725
|
return this.speechEvents.conversationState;
|
|
@@ -717,7 +831,7 @@ var Patter = class {
|
|
|
717
831
|
);
|
|
718
832
|
}
|
|
719
833
|
const engine = opts.engine;
|
|
720
|
-
if (engine instanceof Realtime) {
|
|
834
|
+
if (engine instanceof Realtime || engine instanceof Realtime2) {
|
|
721
835
|
working = {
|
|
722
836
|
...working,
|
|
723
837
|
provider: "openai_realtime",
|
|
@@ -735,7 +849,7 @@ var Patter = class {
|
|
|
735
849
|
};
|
|
736
850
|
} else {
|
|
737
851
|
throw new Error(
|
|
738
|
-
"Unknown engine. Expected OpenAIRealtime or ElevenLabsConvAI instance."
|
|
852
|
+
"Unknown engine. Expected OpenAIRealtime, OpenAIRealtime2, or ElevenLabsConvAI instance."
|
|
739
853
|
);
|
|
740
854
|
}
|
|
741
855
|
} else if (!working.provider && (working.stt !== void 0 || working.tts !== void 0 || working.llm !== void 0)) {
|
|
@@ -795,6 +909,13 @@ var Patter = class {
|
|
|
795
909
|
if (!opts.agent.systemPrompt && opts.agent.provider !== "pipeline") {
|
|
796
910
|
throw new Error("agent.systemPrompt is required");
|
|
797
911
|
}
|
|
912
|
+
if (opts.agent.echoCancellation) {
|
|
913
|
+
try {
|
|
914
|
+
await import("./aec-PJJMUM5E.mjs");
|
|
915
|
+
} catch (err) {
|
|
916
|
+
getLogger().debug(`AEC pre-import failed at serve(): ${String(err)}`);
|
|
917
|
+
}
|
|
918
|
+
}
|
|
798
919
|
if (opts.port !== void 0) {
|
|
799
920
|
if (typeof opts.port !== "number" || opts.port < 1 || opts.port > 65535) {
|
|
800
921
|
throw new RangeError(`port must be between 1 and 65535, got ${opts.port}`);
|
|
@@ -876,6 +997,9 @@ var Patter = class {
|
|
|
876
997
|
opts.dashboard ?? true,
|
|
877
998
|
opts.dashboardToken ?? ""
|
|
878
999
|
);
|
|
1000
|
+
this.embeddedServer.popPrewarmAudio = this.popPrewarmAudio;
|
|
1001
|
+
this.embeddedServer.popPrewarmedConnections = this.popPrewarmedConnections;
|
|
1002
|
+
this.embeddedServer.recordPrewarmWaste = this.recordPrewarmWaste;
|
|
879
1003
|
try {
|
|
880
1004
|
await this.embeddedServer.start(port);
|
|
881
1005
|
if (this.tunnelHandle) {
|
|
@@ -890,7 +1014,7 @@ var Patter = class {
|
|
|
890
1014
|
}
|
|
891
1015
|
/** Run the agent in interactive terminal-test mode (no real telephony). */
|
|
892
1016
|
async test(opts) {
|
|
893
|
-
const { TestSession: TestSession2 } = await import("./test-mode-Y7YG5LFZ.mjs");
|
|
1017
|
+
const { TestSession: TestSession2 } = await import("./test-mode-WEKKNBLD.mjs");
|
|
894
1018
|
const session = new TestSession2();
|
|
895
1019
|
await session.run({
|
|
896
1020
|
agent: opts.agent,
|
|
@@ -900,6 +1024,325 @@ var Patter = class {
|
|
|
900
1024
|
onCallEnd: opts.onCallEnd
|
|
901
1025
|
});
|
|
902
1026
|
}
|
|
1027
|
+
/**
 * Take the pre-synthesised first-message audio for ``callId`` out of the
 * cache.
 *
 * The call id is marked as consumed on every invocation, hit or miss, and
 * any pending TTL eviction timer for it is cancelled. ``spawnPrewarmFirstMessage``
 * checks that marker before storing bytes, so a synth that finishes after
 * this call drops its output instead of leaving it in ``prewarmAudio``.
 *
 * NOTE(review): in the visible code ``prewarmConsumed`` is only emptied by
 * ``disconnect()`` — confirm that per-call growth is acceptable for
 * long-lived processes.
 *
 * @param {string} callId - Carrier-issued call identifier.
 * @returns The cached audio buffer, or ``undefined`` on a cache miss.
 */
popPrewarmAudio = (callId) => {
  this.prewarmConsumed.add(callId);

  const evictionTimer = this.prewarmTtlTimers.get(callId);
  if (evictionTimer !== undefined) {
    clearTimeout(evictionTimer);
    this.prewarmTtlTimers.delete(callId);
  }

  const cached = this.prewarmAudio.get(callId);
  if (cached === undefined) {
    return undefined;
  }
  this.prewarmAudio.delete(callId);
  return cached;
};
|
|
1054
|
+
/**
 * Release everything prewarmed for a call that ended without using it.
 *
 * Always closes any parked provider connections for ``callId`` first. If
 * the call id was already marked consumed, any cached audio is dropped
 * silently and the function returns, so a repeated invocation cannot warn
 * twice. Otherwise the id is marked consumed, its TTL eviction timer is
 * cancelled, and a warning is logged when synthesised bytes were actually
 * sitting in the cache.
 *
 * @param {string} callId - Carrier-issued call identifier.
 * @returns {void}
 */
recordPrewarmWaste = (callId) => {
  this.closePrewarmedConnections(callId);

  const alreadyHandled = this.prewarmConsumed.has(callId);
  if (alreadyHandled) {
    this.prewarmAudio.delete(callId);
    return;
  }
  this.prewarmConsumed.add(callId);

  const evictionTimer = this.prewarmTtlTimers.get(callId);
  if (evictionTimer !== undefined) {
    clearTimeout(evictionTimer);
    this.prewarmTtlTimers.delete(callId);
  }

  const wasted = this.prewarmAudio.get(callId);
  if (wasted === undefined) {
    return;
  }
  this.prewarmAudio.delete(callId);
  getLogger().warn(
    `Prewarm wasted for call ${callId} \u2014 first-message TTS already paid (~${wasted.byteLength} bytes synthesised) but call ended before pickup.`
  );
};
|
|
1086
|
+
/**
 * Remove and return the parked provider connections for ``callId``.
 *
 * Ownership of the returned slot passes to the caller: the entry is
 * deleted from ``prewarmedConnections`` and its TTL eviction timer, if
 * any, is cancelled so the sockets are not closed underneath the caller.
 *
 * @param {string} callId - Carrier-issued call identifier.
 * @returns The parked slot, or ``undefined`` when nothing is parked.
 */
popPrewarmedConnections = (callId) => {
  const parked = this.prewarmedConnections.get(callId);
  if (parked === undefined) {
    return undefined;
  }
  this.prewarmedConnections.delete(callId);

  const evictionTimer = this.prewarmedConnTimers.get(callId);
  if (evictionTimer !== undefined) {
    clearTimeout(evictionTimer);
    this.prewarmedConnTimers.delete(callId);
  }
  return parked;
};
|
|
1106
|
+
/**
 * Close and forget any parked provider connections for ``callId``.
 *
 * Detaches the slot (which also cancels its TTL eviction timer) and then
 * closes every socket it holds. Does nothing when no slot is parked.
 * Invoked by ``recordPrewarmWaste`` on call-termination paths.
 *
 * @param {string} callId - Carrier-issued call identifier.
 * @returns {void}
 */
closePrewarmedConnections = (callId) => {
  // Detaching is exactly the pop operation: remove the map entry and cancel
  // the TTL timer. Only the final close step is specific to this method.
  const detached = this.popPrewarmedConnections(callId);
  if (detached === undefined) {
    return;
  }
  closeParkedConnections(detached);
};
|
|
1123
|
+
/**
|
|
1124
|
+
* Open and park provider WebSockets in parallel with the carrier-side
|
|
1125
|
+
* ``initiateCall``. Unlike :meth:`spawnProviderWarmup` (which closes
|
|
1126
|
+
* the WS after a brief idle), the sockets opened here stay OPEN and
|
|
1127
|
+
* are handed off to the per-call ``StreamHandler`` on ``start``.
|
|
1128
|
+
*
|
|
1129
|
+
* This is the structural fix for first-turn cold-start: on Node's
|
|
1130
|
+
* ``ws`` package, opening + closing a WS does NOT warm TLS for the
|
|
1131
|
+
* next open — every fresh ``new WebSocket()`` re-pays the full
|
|
1132
|
+
* TCP + TLS + HTTP-101 round-trip. By keeping the WS open and
|
|
1133
|
+
* adopting it directly, the live first turn skips the handshake
|
|
1134
|
+
* entirely (saves ~150-900 ms depending on provider).
|
|
1135
|
+
*
|
|
1136
|
+
* Best-effort: each provider's parking task is wrapped in
|
|
1137
|
+
* ``Promise.allSettled`` so a slow or failing endpoint cannot block
|
|
1138
|
+
* the others. Providers without ``openParkedConnection`` contribute
|
|
1139
|
+
* nothing — the call falls through to the cold ``connect()`` path
|
|
1140
|
+
* for that provider.
|
|
1141
|
+
*/
|
|
1142
|
+
parkProviderConnections(agent, callId) {
|
|
1143
|
+
const stt = agent.stt;
|
|
1144
|
+
const tts = agent.tts;
|
|
1145
|
+
const sttOpen = typeof stt?.openParkedConnection === "function" ? stt.openParkedConnection.bind(stt) : null;
|
|
1146
|
+
const ttsOpen = typeof tts?.openParkedConnection === "function" ? tts.openParkedConnection.bind(tts) : null;
|
|
1147
|
+
if (!sttOpen && !ttsOpen) return;
|
|
1148
|
+
const slot = {};
|
|
1149
|
+
this.prewarmedConnections.set(callId, slot);
|
|
1150
|
+
const startedAt = Date.now();
|
|
1151
|
+
const tasks = [];
|
|
1152
|
+
if (sttOpen) {
|
|
1153
|
+
tasks.push((async () => {
|
|
1154
|
+
try {
|
|
1155
|
+
const ws = await sttOpen();
|
|
1156
|
+
if (this.prewarmedConnections.get(callId) !== slot) {
|
|
1157
|
+
try {
|
|
1158
|
+
ws.close();
|
|
1159
|
+
} catch {
|
|
1160
|
+
}
|
|
1161
|
+
return;
|
|
1162
|
+
}
|
|
1163
|
+
slot.stt = ws;
|
|
1164
|
+
getLogger().info(
|
|
1165
|
+
`[PREWARM] callId=${callId} provider=stt ms=${Date.now() - startedAt}`
|
|
1166
|
+
);
|
|
1167
|
+
} catch (err) {
|
|
1168
|
+
getLogger().debug(`Park STT failed for ${callId}: ${String(err)}`);
|
|
1169
|
+
}
|
|
1170
|
+
})());
|
|
1171
|
+
}
|
|
1172
|
+
if (ttsOpen) {
|
|
1173
|
+
tasks.push((async () => {
|
|
1174
|
+
try {
|
|
1175
|
+
const parked = await ttsOpen();
|
|
1176
|
+
if (this.prewarmedConnections.get(callId) !== slot) {
|
|
1177
|
+
try {
|
|
1178
|
+
parked.ws.close();
|
|
1179
|
+
} catch {
|
|
1180
|
+
}
|
|
1181
|
+
return;
|
|
1182
|
+
}
|
|
1183
|
+
slot.tts = parked;
|
|
1184
|
+
getLogger().info(
|
|
1185
|
+
`[PREWARM] callId=${callId} provider=tts ms=${Date.now() - startedAt}`
|
|
1186
|
+
);
|
|
1187
|
+
} catch (err) {
|
|
1188
|
+
getLogger().debug(`Park TTS failed for ${callId}: ${String(err)}`);
|
|
1189
|
+
}
|
|
1190
|
+
})());
|
|
1191
|
+
}
|
|
1192
|
+
const task = (async () => {
|
|
1193
|
+
await Promise.allSettled(tasks);
|
|
1194
|
+
})();
|
|
1195
|
+
this.prewarmTasks.add(task);
|
|
1196
|
+
void task.finally(() => {
|
|
1197
|
+
this.prewarmTasks.delete(task);
|
|
1198
|
+
if (!this.prewarmedConnections.has(callId)) return;
|
|
1199
|
+
const handle = setTimeout(() => {
|
|
1200
|
+
this.prewarmedConnTimers.delete(callId);
|
|
1201
|
+
const orphan = this.prewarmedConnections.get(callId);
|
|
1202
|
+
if (orphan === void 0) return;
|
|
1203
|
+
this.prewarmedConnections.delete(callId);
|
|
1204
|
+
closeParkedConnections(orphan);
|
|
1205
|
+
getLogger().warn(
|
|
1206
|
+
`[PREWARM] parked connections evicted by TTL for ${callId} \u2014 call never reached start (~${(PARKED_CONN_TTL_MS / 1e3).toFixed(0)}s).`
|
|
1207
|
+
);
|
|
1208
|
+
}, PARKED_CONN_TTL_MS);
|
|
1209
|
+
handle.unref?.();
|
|
1210
|
+
this.prewarmedConnTimers.set(callId, handle);
|
|
1211
|
+
});
|
|
1212
|
+
}
|
|
1213
|
+
/**
|
|
1214
|
+
* Spawn a fire-and-forget task that warms up STT / TTS / LLM in
|
|
1215
|
+
* parallel with the carrier-side ``initiateCall``.
|
|
1216
|
+
*
|
|
1217
|
+
* Best-effort: each provider's optional ``warmup()`` is wrapped in
|
|
1218
|
+
* ``Promise.allSettled`` so a slow or failing endpoint cannot block
|
|
1219
|
+
* the others. Providers without ``warmup`` contribute nothing.
|
|
1220
|
+
*/
|
|
1221
|
+
spawnProviderWarmup(agent) {
|
|
1222
|
+
const targets = [];
|
|
1223
|
+
const collect = (provider, label) => {
|
|
1224
|
+
if (!provider || typeof provider !== "object") return;
|
|
1225
|
+
const fn = provider.warmup;
|
|
1226
|
+
if (typeof fn !== "function") return;
|
|
1227
|
+
targets.push({
|
|
1228
|
+
name: label,
|
|
1229
|
+
fn: fn.bind(provider)
|
|
1230
|
+
});
|
|
1231
|
+
};
|
|
1232
|
+
collect(agent.stt, "stt");
|
|
1233
|
+
collect(agent.tts, "tts");
|
|
1234
|
+
collect(agent.llm, "llm");
|
|
1235
|
+
if (targets.length === 0) return;
|
|
1236
|
+
const task = (async () => {
|
|
1237
|
+
const results = await Promise.allSettled(targets.map((t) => t.fn()));
|
|
1238
|
+
results.forEach((r, i) => {
|
|
1239
|
+
if (r.status === "rejected") {
|
|
1240
|
+
getLogger().debug(
|
|
1241
|
+
`Provider warmup failed (${targets[i].name}): ${String(r.reason)}`
|
|
1242
|
+
);
|
|
1243
|
+
}
|
|
1244
|
+
});
|
|
1245
|
+
})();
|
|
1246
|
+
this.prewarmTasks.add(task);
|
|
1247
|
+
void task.finally(() => this.prewarmTasks.delete(task));
|
|
1248
|
+
}
|
|
1249
|
+
/**
|
|
1250
|
+
* Pre-render ``agent.firstMessage`` to TTS bytes during the ringing
|
|
1251
|
+
* window and stash them in ``prewarmAudio.set(callId, buf)``.
|
|
1252
|
+
*
|
|
1253
|
+
* Skipped silently when ``agent.prewarmFirstMessage`` is false or
|
|
1254
|
+
* when ``agent.tts`` / ``agent.firstMessage`` is missing. The synth
|
|
1255
|
+
* is bounded by ``ringTimeout`` (default 25 s) so a never-answered
|
|
1256
|
+
* call doesn't tie up the TTS connection. On timeout / error the
|
|
1257
|
+
* cache is left empty and the StreamHandler falls back to live TTS.
|
|
1258
|
+
*
|
|
1259
|
+
* **Pipeline mode only.** Realtime / ConvAI provider modes never
|
|
1260
|
+
* consume the prewarm cache (the StreamHandler for those modes runs
|
|
1261
|
+
* its first-message emit through the provider's own audio path).
|
|
1262
|
+
* Spawning the prewarm in those modes pays the TTS bill for nothing
|
|
1263
|
+
* — refused with a warn.
|
|
1264
|
+
*
|
|
1265
|
+
* **Capped at ``PREWARM_CACHE_MAX`` concurrent entries.** Refused
|
|
1266
|
+
* with a warn when the cap is reached (the call still proceeds —
|
|
1267
|
+
* StreamHandler falls back to live TTS).
|
|
1268
|
+
*/
|
|
1269
|
+
spawnPrewarmFirstMessage(agent, callId, ringTimeout) {
|
|
1270
|
+
if (!agent.prewarmFirstMessage) return;
|
|
1271
|
+
const providerMode = agent.provider ?? "openai_realtime";
|
|
1272
|
+
if (providerMode !== "pipeline") {
|
|
1273
|
+
getLogger().warn(
|
|
1274
|
+
`agent.prewarmFirstMessage=true is only supported in pipeline mode (provider=${providerMode}); skipping pre-synth to avoid wasted TTS spend.`
|
|
1275
|
+
);
|
|
1276
|
+
return;
|
|
1277
|
+
}
|
|
1278
|
+
const firstMessage = agent.firstMessage ?? "";
|
|
1279
|
+
const tts = agent.tts;
|
|
1280
|
+
if (!firstMessage || !tts) return;
|
|
1281
|
+
if (typeof tts.synthesizeStream !== "function") return;
|
|
1282
|
+
const inFlight = this.prewarmAudio.size + this.prewarmTasks.size;
|
|
1283
|
+
if (inFlight >= PREWARM_CACHE_MAX) {
|
|
1284
|
+
getLogger().warn(
|
|
1285
|
+
`Prewarm cache full (${inFlight}/${PREWARM_CACHE_MAX} in-flight) \u2014 skipping pre-synth for call ${callId}; falling back to live TTS at pickup.`
|
|
1286
|
+
);
|
|
1287
|
+
return;
|
|
1288
|
+
}
|
|
1289
|
+
const timeoutMs = (typeof ringTimeout === "number" ? ringTimeout : 25) * 1e3;
|
|
1290
|
+
const task = (async () => {
|
|
1291
|
+
try {
|
|
1292
|
+
const accumulate = async () => {
|
|
1293
|
+
const chunks = [];
|
|
1294
|
+
for await (const chunk of tts.synthesizeStream(firstMessage)) {
|
|
1295
|
+
const u = chunk;
|
|
1296
|
+
if (Buffer.isBuffer(u)) chunks.push(u);
|
|
1297
|
+
else if (ArrayBuffer.isView(u))
|
|
1298
|
+
chunks.push(Buffer.from(u.buffer, u.byteOffset, u.byteLength));
|
|
1299
|
+
}
|
|
1300
|
+
return Buffer.concat(chunks);
|
|
1301
|
+
};
|
|
1302
|
+
const timer = new Promise(
|
|
1303
|
+
(_resolve, reject) => setTimeout(
|
|
1304
|
+
() => reject(new Error("prewarm-first-message timeout")),
|
|
1305
|
+
timeoutMs
|
|
1306
|
+
).unref?.()
|
|
1307
|
+
);
|
|
1308
|
+
const buf = await Promise.race([accumulate(), timer]);
|
|
1309
|
+
if (buf.byteLength > 0) {
|
|
1310
|
+
if (this.prewarmConsumed.has(callId)) {
|
|
1311
|
+
getLogger().warn(
|
|
1312
|
+
`Prewarm orphaned for call ${callId} \u2014 synth completed (~${buf.byteLength} bytes) AFTER consumer polled; bytes dropped, TTS bill already paid.`
|
|
1313
|
+
);
|
|
1314
|
+
return;
|
|
1315
|
+
}
|
|
1316
|
+
this.prewarmAudio.set(callId, buf);
|
|
1317
|
+
getLogger().debug(
|
|
1318
|
+
`Prewarm first-message ready for call ${callId} (${buf.byteLength} bytes)`
|
|
1319
|
+
);
|
|
1320
|
+
}
|
|
1321
|
+
} catch (err) {
|
|
1322
|
+
getLogger().debug(
|
|
1323
|
+
`Prewarm first-message failed for call ${callId}: ${String(err)}`
|
|
1324
|
+
);
|
|
1325
|
+
}
|
|
1326
|
+
})();
|
|
1327
|
+
this.prewarmTasks.add(task);
|
|
1328
|
+
void task.finally(() => {
|
|
1329
|
+
this.prewarmTasks.delete(task);
|
|
1330
|
+
if (!this.prewarmAudio.has(callId)) return;
|
|
1331
|
+
const ttlMs = timeoutMs + PREWARM_TTL_GRACE_MS;
|
|
1332
|
+
const handle = setTimeout(() => {
|
|
1333
|
+
this.prewarmTtlTimers.delete(callId);
|
|
1334
|
+
const orphan = this.prewarmAudio.get(callId);
|
|
1335
|
+
if (orphan === void 0) return;
|
|
1336
|
+
this.prewarmAudio.delete(callId);
|
|
1337
|
+
this.prewarmConsumed.add(callId);
|
|
1338
|
+
getLogger().warn(
|
|
1339
|
+
`Prewarm bytes evicted by TTL \u2014 call ${callId} never consumed them (~${orphan.byteLength} bytes synthesised, ${(ttlMs / 1e3).toFixed(1)}s after ringTimeout).`
|
|
1340
|
+
);
|
|
1341
|
+
}, ttlMs);
|
|
1342
|
+
handle.unref?.();
|
|
1343
|
+
this.prewarmTtlTimers.set(callId, handle);
|
|
1344
|
+
});
|
|
1345
|
+
}
|
|
903
1346
|
/** Place an outbound call via the configured carrier. */
|
|
904
1347
|
async call(options) {
|
|
905
1348
|
if (!options.to) {
|
|
@@ -914,6 +1357,9 @@ var Patter = class {
|
|
|
914
1357
|
if (this.embeddedServer) {
|
|
915
1358
|
this.embeddedServer.onMachineDetection = options.onMachineDetection;
|
|
916
1359
|
}
|
|
1360
|
+
if (options.agent.prewarm !== false) {
|
|
1361
|
+
this.spawnProviderWarmup(options.agent);
|
|
1362
|
+
}
|
|
917
1363
|
if (carrier.kind === "telnyx") {
|
|
918
1364
|
const telnyxKey = carrier.apiKey;
|
|
919
1365
|
const connectionId = carrier.connectionId;
|
|
@@ -939,19 +1385,24 @@ var Patter = class {
|
|
|
939
1385
|
if (!response2.ok) {
|
|
940
1386
|
throw new ProvisionError(`Failed to initiate Telnyx call: ${await response2.text()}`);
|
|
941
1387
|
}
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
}
|
|
1388
|
+
let telnyxCallId;
|
|
1389
|
+
try {
|
|
1390
|
+
const body = await response2.clone().json();
|
|
1391
|
+
telnyxCallId = body.data?.call_control_id;
|
|
1392
|
+
} catch {
|
|
1393
|
+
}
|
|
1394
|
+
if (this.embeddedServer && telnyxCallId) {
|
|
1395
|
+
this.embeddedServer.metricsStore.recordCallInitiated({
|
|
1396
|
+
call_id: telnyxCallId,
|
|
1397
|
+
caller: phoneNumber,
|
|
1398
|
+
callee: options.to,
|
|
1399
|
+
direction: "outbound"
|
|
1400
|
+
});
|
|
1401
|
+
}
|
|
1402
|
+
if (telnyxCallId) {
|
|
1403
|
+
this.spawnPrewarmFirstMessage(options.agent, telnyxCallId, effectiveRingTimeout);
|
|
1404
|
+
if (options.agent.prewarm !== false) {
|
|
1405
|
+
this.parkProviderConnections(options.agent, telnyxCallId);
|
|
955
1406
|
}
|
|
956
1407
|
}
|
|
957
1408
|
return;
|
|
@@ -994,25 +1445,31 @@ var Patter = class {
|
|
|
994
1445
|
if (!response.ok) {
|
|
995
1446
|
throw new ProvisionError(`Failed to initiate call: ${await response.text()}`);
|
|
996
1447
|
}
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
|
|
1448
|
+
let twilioCallSid;
|
|
1449
|
+
let twilioNotificationsPath;
|
|
1450
|
+
try {
|
|
1451
|
+
const body = await response.clone().json();
|
|
1452
|
+
twilioCallSid = body.sid;
|
|
1453
|
+
twilioNotificationsPath = body.subresource_uris?.notifications;
|
|
1454
|
+
} catch {
|
|
1455
|
+
}
|
|
1456
|
+
if (this.embeddedServer && twilioCallSid) {
|
|
1457
|
+
this.embeddedServer.metricsStore.recordCallInitiated({
|
|
1458
|
+
call_id: twilioCallSid,
|
|
1459
|
+
caller: phoneNumber,
|
|
1460
|
+
callee: options.to,
|
|
1461
|
+
direction: "outbound"
|
|
1462
|
+
});
|
|
1463
|
+
if (twilioNotificationsPath) {
|
|
1464
|
+
getLogger().info(
|
|
1465
|
+
`Outbound call ${twilioCallSid} placed. Twilio notifications: https://api.twilio.com${twilioNotificationsPath} (check here if the call drops with no audio).`
|
|
1466
|
+
);
|
|
1467
|
+
}
|
|
1468
|
+
}
|
|
1469
|
+
if (twilioCallSid) {
|
|
1470
|
+
this.spawnPrewarmFirstMessage(options.agent, twilioCallSid, effectiveRingTimeout);
|
|
1471
|
+
if (options.agent.prewarm !== false) {
|
|
1472
|
+
this.parkProviderConnections(options.agent, twilioCallSid);
|
|
1016
1473
|
}
|
|
1017
1474
|
}
|
|
1018
1475
|
}
|
|
@@ -1020,8 +1477,36 @@ var Patter = class {
|
|
|
1020
1477
|
* Stop the embedded server and any running tunnel. Safe to call multiple
|
|
1021
1478
|
* times. Leaves the instance reusable: a subsequent ``serve()`` works as
|
|
1022
1479
|
* if the previous lifecycle never happened.
|
|
1480
|
+
*
|
|
1481
|
+
* Also clears any pending TTL eviction timers, awaits in-flight
|
|
1482
|
+
* prewarm-first-message synth tasks (best-effort, with a 1 s safety
|
|
1483
|
+
* timeout), and clears the prewarm cache. Without this a still-running
|
|
1484
|
+
* TTS WS keeps the user billed long after SDK teardown, and stale
|
|
1485
|
+
* entries leak across ``serve`` / ``disconnect`` cycles. See FIX #93.
|
|
1023
1486
|
*/
|
|
1024
1487
|
async disconnect() {
|
|
1488
|
+
for (const handle of this.prewarmTtlTimers.values()) {
|
|
1489
|
+
clearTimeout(handle);
|
|
1490
|
+
}
|
|
1491
|
+
this.prewarmTtlTimers.clear();
|
|
1492
|
+
if (this.prewarmTasks.size > 0) {
|
|
1493
|
+
const drain = Promise.allSettled(Array.from(this.prewarmTasks));
|
|
1494
|
+
const timer = new Promise(
|
|
1495
|
+
(resolve) => setTimeout(resolve, 1e3).unref?.()
|
|
1496
|
+
);
|
|
1497
|
+
await Promise.race([drain, timer]);
|
|
1498
|
+
}
|
|
1499
|
+
this.prewarmTasks.clear();
|
|
1500
|
+
this.prewarmAudio.clear();
|
|
1501
|
+
this.prewarmConsumed.clear();
|
|
1502
|
+
for (const handle of this.prewarmedConnTimers.values()) {
|
|
1503
|
+
clearTimeout(handle);
|
|
1504
|
+
}
|
|
1505
|
+
this.prewarmedConnTimers.clear();
|
|
1506
|
+
for (const slot of this.prewarmedConnections.values()) {
|
|
1507
|
+
closeParkedConnections(slot);
|
|
1508
|
+
}
|
|
1509
|
+
this.prewarmedConnections.clear();
|
|
1025
1510
|
if (this.tunnelHandle) {
|
|
1026
1511
|
this.tunnelHandle.stop();
|
|
1027
1512
|
this.tunnelHandle = null;
|
|
@@ -1072,6 +1557,7 @@ var Patter = class {
|
|
|
1072
1557
|
if (!callSid) {
|
|
1073
1558
|
throw new Error("callSid must be a non-empty string");
|
|
1074
1559
|
}
|
|
1560
|
+
this.recordPrewarmWaste(callSid);
|
|
1075
1561
|
const carrier = this.localConfig.carrier;
|
|
1076
1562
|
if (carrier.kind === "twilio") {
|
|
1077
1563
|
const auth = Buffer.from(`${carrier.accountSid}:${carrier.authToken}`).toString("base64");
|
|
@@ -1107,7 +1593,7 @@ var Patter = class {
|
|
|
1107
1593
|
}
|
|
1108
1594
|
};
|
|
1109
1595
|
async function waitForTunnelPubliclyReachable(hostname, totalTimeoutMs = 6e4, graceMs = 5e3) {
|
|
1110
|
-
const
|
|
1596
|
+
const log2 = getLogger();
|
|
1111
1597
|
const { Resolver } = await import("dns/promises");
|
|
1112
1598
|
const resolver = new Resolver({ timeout: 1500, tries: 1 });
|
|
1113
1599
|
resolver.setServers(["1.1.1.1", "8.8.8.8"]);
|
|
@@ -1119,7 +1605,7 @@ async function waitForTunnelPubliclyReachable(hostname, totalTimeoutMs = 6e4, gr
|
|
|
1119
1605
|
try {
|
|
1120
1606
|
const records = await resolver.resolve4(hostname);
|
|
1121
1607
|
const first = records[0] ?? "<unknown>";
|
|
1122
|
-
|
|
1608
|
+
log2.info(
|
|
1123
1609
|
"Tunnel DNS resolved \u2192 %s (attempt %d); waiting %d ms grace",
|
|
1124
1610
|
first,
|
|
1125
1611
|
attempt,
|
|
@@ -2333,6 +2819,8 @@ function wrapPcmInWav(pcm, sampleRate = 16e3, channels = 1, bitsPerSample = 16)
|
|
|
2333
2819
|
return Buffer.concat([header, pcm]);
|
|
2334
2820
|
}
|
|
2335
2821
|
var WhisperSTT = class _WhisperSTT {
|
|
2822
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
2823
|
+
static providerKey = "whisper";
|
|
2336
2824
|
apiKey;
|
|
2337
2825
|
model;
|
|
2338
2826
|
language;
|
|
@@ -2501,6 +2989,8 @@ init_esm_shims();
|
|
|
2501
2989
|
var ALLOWED_MODELS2 = /* @__PURE__ */ new Set(["gpt-4o-transcribe", "gpt-4o-mini-transcribe"]);
|
|
2502
2990
|
var DEFAULT_BUFFER_SIZE2 = 16e3 * 2;
|
|
2503
2991
|
var OpenAITranscribeSTT = class extends WhisperSTT {
|
|
2992
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
2993
|
+
static providerKey = "openai_transcribe";
|
|
2504
2994
|
/**
|
|
2505
2995
|
* @param apiKey OpenAI API key.
|
|
2506
2996
|
* @param language ISO-639-1 language code (e.g. ``"en"``, ``"it"``). Optional.
|
|
@@ -2576,6 +3066,8 @@ var CartesiaSTT = class {
|
|
|
2576
3066
|
}
|
|
2577
3067
|
apiKey;
|
|
2578
3068
|
options;
|
|
3069
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
3070
|
+
static providerKey = "cartesia_stt";
|
|
2579
3071
|
ws = null;
|
|
2580
3072
|
callbacks = /* @__PURE__ */ new Set();
|
|
2581
3073
|
keepaliveTimer = null;
|
|
@@ -2584,6 +3076,37 @@ var CartesiaSTT = class {
|
|
|
2584
3076
|
* `null` until the first transcript event arrives (matches Python's `None`).
|
|
2585
3077
|
*/
|
|
2586
3078
|
requestId = null;
|
|
3079
|
+
/**
|
|
3080
|
+
* Open a fresh WebSocket without arming any message / keepalive handlers
|
|
3081
|
+
* and without taking ownership on `this.ws`. Returns the OPEN socket so
|
|
3082
|
+
* the caller (the prewarm pipeline) can park it for later adoption via
|
|
3083
|
+
* `adoptWebSocket`. Bounded by `CONNECT_TIMEOUT_MS`.
|
|
3084
|
+
*
|
|
3085
|
+
* Billing safety: opening + parking the WS does not stream audio
|
|
3086
|
+
* (Cartesia STT bills on streamed audio seconds), so no charge is
|
|
3087
|
+
* incurred. Close the returned WS yourself if it is never adopted.
|
|
3088
|
+
*/
|
|
3089
|
+
async openParkedConnection() {
|
|
3090
|
+
const url = this.buildWsUrl();
|
|
3091
|
+
const ws = new WebSocket2(url, {
|
|
3092
|
+
headers: { "User-Agent": USER_AGENT }
|
|
3093
|
+
});
|
|
3094
|
+
await new Promise((resolve, reject) => {
|
|
3095
|
+
const timer = setTimeout(
|
|
3096
|
+
() => reject(new Error("Cartesia STT park connect timeout")),
|
|
3097
|
+
CONNECT_TIMEOUT_MS
|
|
3098
|
+
);
|
|
3099
|
+
ws.once("open", () => {
|
|
3100
|
+
clearTimeout(timer);
|
|
3101
|
+
resolve();
|
|
3102
|
+
});
|
|
3103
|
+
ws.once("error", (err) => {
|
|
3104
|
+
clearTimeout(timer);
|
|
3105
|
+
reject(err);
|
|
3106
|
+
});
|
|
3107
|
+
});
|
|
3108
|
+
return ws;
|
|
3109
|
+
}
|
|
2587
3110
|
buildWsUrl() {
|
|
2588
3111
|
const opts = this.options;
|
|
2589
3112
|
const rawBase = opts.baseUrl ?? DEFAULT_BASE_URL;
|
|
@@ -2608,6 +3131,57 @@ var CartesiaSTT = class {
|
|
|
2608
3131
|
});
|
|
2609
3132
|
return `${base}/stt/websocket?${params.toString()}`;
|
|
2610
3133
|
}
|
|
3134
|
+
/**
|
|
3135
|
+
* Pre-call WebSocket warmup for the Cartesia STT `/stt/websocket` endpoint.
|
|
3136
|
+
*
|
|
3137
|
+
* Opens the WS (DNS + TLS + auth handshake), idles ~250 ms so the
|
|
3138
|
+
* Cartesia edge keeps session state warm, then closes. By the time
|
|
3139
|
+
* `connect()` is invoked at call-pickup the resolver and TLS session
|
|
3140
|
+
* are hot — net wire time saving of 200-500 ms.
|
|
3141
|
+
*
|
|
3142
|
+
* Billing safety: Cartesia STT bills on streamed audio seconds (per
|
|
3143
|
+
* https://docs.cartesia.ai/2025-04-16/api-reference/stt/stt). Opening
|
|
3144
|
+
* + closing the WebSocket without forwarding audio does not consume
|
|
3145
|
+
* billable seconds. Best-effort: failures logged at debug level.
|
|
3146
|
+
*/
|
|
3147
|
+
async warmup() {
|
|
3148
|
+
const url = this.buildWsUrl();
|
|
3149
|
+
let ws = null;
|
|
3150
|
+
try {
|
|
3151
|
+
ws = await new Promise((resolve, reject) => {
|
|
3152
|
+
const sock = new WebSocket2(url, {
|
|
3153
|
+
headers: { "User-Agent": USER_AGENT }
|
|
3154
|
+
});
|
|
3155
|
+
const timer = setTimeout(() => {
|
|
3156
|
+
try {
|
|
3157
|
+
sock.close();
|
|
3158
|
+
} catch {
|
|
3159
|
+
}
|
|
3160
|
+
reject(new Error("Cartesia STT warmup connect timeout"));
|
|
3161
|
+
}, 5e3);
|
|
3162
|
+
sock.once("open", () => {
|
|
3163
|
+
clearTimeout(timer);
|
|
3164
|
+
resolve(sock);
|
|
3165
|
+
});
|
|
3166
|
+
sock.once("error", (err) => {
|
|
3167
|
+
clearTimeout(timer);
|
|
3168
|
+
reject(err);
|
|
3169
|
+
});
|
|
3170
|
+
});
|
|
3171
|
+
await new Promise((r) => setTimeout(r, 250));
|
|
3172
|
+
} catch (err) {
|
|
3173
|
+
getLogger().debug(
|
|
3174
|
+
`Cartesia STT warmup failed (best-effort): ${describeWarmupError(err)}`
|
|
3175
|
+
);
|
|
3176
|
+
} finally {
|
|
3177
|
+
if (ws) {
|
|
3178
|
+
try {
|
|
3179
|
+
ws.close();
|
|
3180
|
+
} catch {
|
|
3181
|
+
}
|
|
3182
|
+
}
|
|
3183
|
+
}
|
|
3184
|
+
}
|
|
2611
3185
|
/** Open the streaming WebSocket and arm message + keepalive handlers. */
|
|
2612
3186
|
async connect() {
|
|
2613
3187
|
const url = this.buildWsUrl();
|
|
@@ -2628,6 +3202,24 @@ var CartesiaSTT = class {
|
|
|
2628
3202
|
reject(err);
|
|
2629
3203
|
});
|
|
2630
3204
|
});
|
|
3205
|
+
this.armMessageAndKeepalive();
|
|
3206
|
+
}
|
|
3207
|
+
/**
|
|
3208
|
+
* Adopt a pre-opened, already-OPEN WebSocket produced by the prewarm
|
|
3209
|
+
* pipeline (see `Patter.parkProviderConnections`). Skips the fresh
|
|
3210
|
+
* `new WebSocket()` + handshake — the WS is already through DNS, TLS
|
|
3211
|
+
* and HTTP-101 so audio frames can flow on this turn instead of
|
|
3212
|
+
* paying ~150-400 ms of handshake.
|
|
3213
|
+
*
|
|
3214
|
+
* Caller MUST verify `ws.readyState === OPEN` before calling. If the
|
|
3215
|
+
* parked WS died between park and adopt, fall back to `connect()`.
|
|
3216
|
+
*/
|
|
3217
|
+
adoptWebSocket(ws) {
|
|
3218
|
+
this.ws = ws;
|
|
3219
|
+
this.armMessageAndKeepalive();
|
|
3220
|
+
}
|
|
3221
|
+
armMessageAndKeepalive() {
|
|
3222
|
+
if (!this.ws) return;
|
|
2631
3223
|
this.ws.on("message", (raw) => {
|
|
2632
3224
|
let event;
|
|
2633
3225
|
try {
|
|
@@ -2675,6 +3267,31 @@ var CartesiaSTT = class {
|
|
|
2675
3267
|
if (!this.ws || this.ws.readyState !== WebSocket2.OPEN) return;
|
|
2676
3268
|
this.ws.send(audio);
|
|
2677
3269
|
}
|
|
3270
|
+
/**
|
|
3271
|
+
* Force Cartesia to finalise the in-flight utterance immediately.
|
|
3272
|
+
*
|
|
3273
|
+
* Sends a ``finalize`` text frame on the live WebSocket. Cartesia
|
|
3274
|
+
* replies with the final transcript followed by ``flush_done``,
|
|
3275
|
+
* bypassing its conservative internal silence heuristic (which can
|
|
3276
|
+
* wait 2-7 s on PSTN audio before naturally finalising). Wired
|
|
3277
|
+
* into ``StreamHandler`` on the VAD ``speech_end`` event so the
|
|
3278
|
+
* SDK's authoritative end-of-speech detection forces an immediate
|
|
3279
|
+
* STT finalisation — turning Cartesia's natural-pause endpointing
|
|
3280
|
+
* into a deterministic VAD-driven one, parity with the Deepgram
|
|
3281
|
+
* fast-path. No-op when the WS isn't open. Parity with Python
|
|
3282
|
+
* ``CartesiaSTT.finalize``.
|
|
3283
|
+
*/
|
|
3284
|
+
async finalize() {
|
|
3285
|
+
if (!this.ws || this.ws.readyState !== WebSocket2.OPEN) return;
|
|
3286
|
+
await new Promise((resolve) => {
|
|
3287
|
+
this.ws.send(CartesiaSTTClientFrame.FINALIZE, (err) => {
|
|
3288
|
+
if (err) {
|
|
3289
|
+
getLogger().debug(`Cartesia finalize send failed: ${String(err)}`);
|
|
3290
|
+
}
|
|
3291
|
+
resolve();
|
|
3292
|
+
});
|
|
3293
|
+
});
|
|
3294
|
+
}
|
|
2678
3295
|
/** Register a transcript listener. */
|
|
2679
3296
|
onTranscript(callback) {
|
|
2680
3297
|
this.callbacks.add(callback);
|
|
@@ -2748,6 +3365,17 @@ var CartesiaSTT = class {
|
|
|
2748
3365
|
}
|
|
2749
3366
|
}
|
|
2750
3367
|
};
|
|
3368
|
+
function describeWarmupError(err) {
|
|
3369
|
+
if (typeof err === "object" && err !== null) {
|
|
3370
|
+
const e = err;
|
|
3371
|
+
if (typeof e.statusCode === "number") return `HTTP ${e.statusCode}`;
|
|
3372
|
+
if (typeof e.code === "number" && e.code >= 100 && e.code < 600) return `HTTP ${e.code}`;
|
|
3373
|
+
const ctor = e.constructor?.name;
|
|
3374
|
+
if (typeof ctor === "string" && ctor !== "Object") return ctor;
|
|
3375
|
+
if (typeof e.name === "string") return e.name;
|
|
3376
|
+
}
|
|
3377
|
+
return typeof err;
|
|
3378
|
+
}
|
|
2751
3379
|
|
|
2752
3380
|
// src/stt/cartesia.ts
|
|
2753
3381
|
var STT4 = class extends CartesiaSTT {
|
|
@@ -2826,6 +3454,8 @@ var TokenAccumulator = class {
|
|
|
2826
3454
|
}
|
|
2827
3455
|
};
|
|
2828
3456
|
var SonioxSTT = class _SonioxSTT {
|
|
3457
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
3458
|
+
static providerKey = "soniox";
|
|
2829
3459
|
ws = null;
|
|
2830
3460
|
callbacks = [];
|
|
2831
3461
|
final = new TokenAccumulator();
|
|
@@ -3103,6 +3733,8 @@ var AssemblyAISTT = class _AssemblyAISTT {
|
|
|
3103
3733
|
}
|
|
3104
3734
|
apiKey;
|
|
3105
3735
|
options;
|
|
3736
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
3737
|
+
static providerKey = "assemblyai";
|
|
3106
3738
|
ws = null;
|
|
3107
3739
|
callbacks = /* @__PURE__ */ new Set();
|
|
3108
3740
|
closing = false;
|
|
@@ -3192,6 +3824,62 @@ var AssemblyAISTT = class _AssemblyAISTT {
|
|
|
3192
3824
|
}
|
|
3193
3825
|
return headers;
|
|
3194
3826
|
}
|
|
3827
|
+
/**
|
|
3828
|
+
* Pre-call WebSocket warmup for the AssemblyAI v3 `/v3/ws` endpoint.
|
|
3829
|
+
*
|
|
3830
|
+
* Opens the WS (DNS + TLS + auth handshake), idles ~250 ms so the
|
|
3831
|
+
* AssemblyAI edge keeps the session state warm, then sends Terminate
|
|
3832
|
+
* and closes. By the time `connect()` is invoked at call-pickup the
|
|
3833
|
+
* resolver and TLS session are hot — net wire time saving of
|
|
3834
|
+
* 200-500 ms.
|
|
3835
|
+
*
|
|
3836
|
+
* Billing safety: AssemblyAI Universal Streaming bills on streamed
|
|
3837
|
+
* audio seconds (per https://www.assemblyai.com/pricing). Opening +
|
|
3838
|
+
* closing the WebSocket without forwarding any audio frames does
|
|
3839
|
+
* not consume billable seconds. Best-effort: failures logged at
|
|
3840
|
+
* debug level.
|
|
3841
|
+
*/
|
|
3842
|
+
async warmup() {
|
|
3843
|
+
const url = this.buildUrl();
|
|
3844
|
+
const headers = this.buildHeaders();
|
|
3845
|
+
let ws = null;
|
|
3846
|
+
try {
|
|
3847
|
+
ws = await new Promise((resolve, reject) => {
|
|
3848
|
+
const sock = new WebSocket4(url, { headers });
|
|
3849
|
+
const timer = setTimeout(() => {
|
|
3850
|
+
try {
|
|
3851
|
+
sock.close();
|
|
3852
|
+
} catch {
|
|
3853
|
+
}
|
|
3854
|
+
reject(new Error("AssemblyAI STT warmup connect timeout"));
|
|
3855
|
+
}, 5e3);
|
|
3856
|
+
sock.once("open", () => {
|
|
3857
|
+
clearTimeout(timer);
|
|
3858
|
+
resolve(sock);
|
|
3859
|
+
});
|
|
3860
|
+
sock.once("error", (err) => {
|
|
3861
|
+
clearTimeout(timer);
|
|
3862
|
+
reject(err);
|
|
3863
|
+
});
|
|
3864
|
+
});
|
|
3865
|
+
await new Promise((r) => setTimeout(r, 250));
|
|
3866
|
+
try {
|
|
3867
|
+
ws.send(JSON.stringify({ type: AssemblyAIClientFrame.TERMINATE }));
|
|
3868
|
+
} catch {
|
|
3869
|
+
}
|
|
3870
|
+
} catch (err) {
|
|
3871
|
+
getLogger().debug(
|
|
3872
|
+
`AssemblyAI STT warmup failed (best-effort): ${describeWarmupError2(err)}`
|
|
3873
|
+
);
|
|
3874
|
+
} finally {
|
|
3875
|
+
if (ws) {
|
|
3876
|
+
try {
|
|
3877
|
+
ws.close();
|
|
3878
|
+
} catch {
|
|
3879
|
+
}
|
|
3880
|
+
}
|
|
3881
|
+
}
|
|
3882
|
+
}
|
|
3195
3883
|
/** Open the streaming WebSocket and arm message handlers. */
|
|
3196
3884
|
async connect() {
|
|
3197
3885
|
this.closing = false;
|
|
@@ -3420,6 +4108,17 @@ function averageConfidence(words) {
|
|
|
3420
4108
|
}
|
|
3421
4109
|
return total / words.length;
|
|
3422
4110
|
}
|
|
4111
|
+
function describeWarmupError2(err) {
|
|
4112
|
+
if (typeof err === "object" && err !== null) {
|
|
4113
|
+
const e = err;
|
|
4114
|
+
if (typeof e.statusCode === "number") return `HTTP ${e.statusCode}`;
|
|
4115
|
+
if (typeof e.code === "number" && e.code >= 100 && e.code < 600) return `HTTP ${e.code}`;
|
|
4116
|
+
const ctor = e.constructor?.name;
|
|
4117
|
+
if (typeof ctor === "string" && ctor !== "Object") return ctor;
|
|
4118
|
+
if (typeof e.name === "string") return e.name;
|
|
4119
|
+
}
|
|
4120
|
+
return typeof err;
|
|
4121
|
+
}
|
|
3423
4122
|
|
|
3424
4123
|
// src/stt/assemblyai.ts
|
|
3425
4124
|
var STT6 = class extends AssemblyAISTT {
|
|
@@ -3476,6 +4175,8 @@ var SpeechmaticsServerMessage = {
|
|
|
3476
4175
|
ERROR: "Error"
|
|
3477
4176
|
};
|
|
3478
4177
|
var SpeechmaticsSTT = class {
|
|
4178
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
4179
|
+
static providerKey = "speechmatics";
|
|
3479
4180
|
ws = null;
|
|
3480
4181
|
transcriptCallbacks = /* @__PURE__ */ new Set();
|
|
3481
4182
|
errorCallbacks = /* @__PURE__ */ new Set();
|
|
@@ -3864,6 +4565,13 @@ var ElevenLabsOutputFormat = {
|
|
|
3864
4565
|
ULAW_8000: "ulaw_8000"
|
|
3865
4566
|
};
|
|
3866
4567
|
var ElevenLabsTTS = class _ElevenLabsTTS {
|
|
4568
|
+
// Stable pricing/dashboard key — read by stream-handler / metrics via
|
|
4569
|
+
// ``(agent.tts.constructor as any).providerKey``. Without this the cost
|
|
4570
|
+
// calculator falls back to ``constructor.name`` ("ElevenLabsTTS") which
|
|
4571
|
+
// does NOT match the pricing table key "elevenlabs", silently zeroing
|
|
4572
|
+
// TTS cost for callers that construct the raw REST class directly
|
|
4573
|
+
// (exposed at top level as ``ElevenLabsRestTTS``).
|
|
4574
|
+
static providerKey = "elevenlabs";
|
|
3867
4575
|
apiKey;
|
|
3868
4576
|
voiceId;
|
|
3869
4577
|
modelId;
|
|
@@ -4052,7 +4760,7 @@ var ElevenLabsPlanError = class extends ElevenLabsTTSError {
|
|
|
4052
4760
|
this.name = "ElevenLabsPlanError";
|
|
4053
4761
|
}
|
|
4054
4762
|
};
|
|
4055
|
-
var PLAN_REQUIRED_MSG = "ElevenLabs WS streaming requires a Pro plan or higher (the WS endpoint returned `payment_required`). Either upgrade at https://elevenlabs.io/pricing, or use
|
|
4763
|
+
var PLAN_REQUIRED_MSG = "ElevenLabs WS streaming requires a Pro plan or higher (the WS endpoint returned `payment_required`). Either upgrade at https://elevenlabs.io/pricing, or use `ElevenLabsRestTTS` for HTTP REST instead which works on all plans (drop-in API).";
|
|
4056
4764
|
function sanitiseLogStr(value, limit = 200) {
|
|
4057
4765
|
return String(value).replace(/[\r\n\x00]/g, " ").slice(0, limit);
|
|
4058
4766
|
}
|
|
@@ -4071,6 +4779,19 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
|
|
|
4071
4779
|
inactivityTimeout;
|
|
4072
4780
|
chunkLengthSchedule;
|
|
4073
4781
|
chunkSize;
|
|
4782
|
+
/**
|
|
4783
|
+
* Single-slot adoption queue. The prewarm pipeline parks one WS per
|
|
4784
|
+
* outbound call here; the next `synthesizeStream` call consumes it
|
|
4785
|
+
* (skipping `new WebSocket()` and the BOS send) instead of opening
|
|
4786
|
+
* a fresh socket. The slot is consumed exactly once: if a second
|
|
4787
|
+
* `synthesizeStream` runs before the first, only the first benefits.
|
|
4788
|
+
*
|
|
4789
|
+
* We keep this on the adapter (not in a parameter) so the existing
|
|
4790
|
+
* `for await (const chunk of agent.tts.synthesizeStream(...))` call
|
|
4791
|
+
* site in `StreamHandler` continues to work without signature
|
|
4792
|
+
* changes.
|
|
4793
|
+
*/
|
|
4794
|
+
adoptedConnection = null;
|
|
4074
4795
|
/**
|
|
4075
4796
|
* The wire format requested over the ElevenLabs WS. Initially set from
|
|
4076
4797
|
* the constructor; ``setTelephonyCarrier`` may auto-flip it to the
|
|
@@ -4086,7 +4807,7 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
|
|
|
4086
4807
|
constructor(opts) {
|
|
4087
4808
|
if (opts.modelId === "eleven_v3") {
|
|
4088
4809
|
throw new Error(
|
|
4089
|
-
"eleven_v3 is not supported by the WebSocket stream-input endpoint \u2014 use
|
|
4810
|
+
"eleven_v3 is not supported by the WebSocket stream-input endpoint \u2014 use `ElevenLabsRestTTS` for HTTP REST instead."
|
|
4090
4811
|
);
|
|
4091
4812
|
}
|
|
4092
4813
|
this.apiKey = opts.apiKey;
|
|
@@ -4148,6 +4869,24 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
|
|
|
4148
4869
|
if (this.languageCode) params.set("language_code", this.languageCode);
|
|
4149
4870
|
return `${WS_BASE}/${encodeURIComponent(this.voiceId)}/stream-input?${params.toString()}`;
|
|
4150
4871
|
}
|
|
4872
|
+
/**
|
|
4873
|
+
* Build the protocol-required BOS frame sent on every fresh WS.
|
|
4874
|
+
*
|
|
4875
|
+
* The single-space `{"text": " "}` keep-alive establishes the session
|
|
4876
|
+
* without committing any synthesis (no `flush: true`, no real text).
|
|
4877
|
+
* Production `synthesizeStream()` and `warmup()` share this exact
|
|
4878
|
+
* construction so the upstream worker chooses the same per-session
|
|
4879
|
+
* config in both cases — otherwise the warm session is on a different
|
|
4880
|
+
* worker than the live request, which defeats the warmup goal.
|
|
4881
|
+
*/
|
|
4882
|
+
buildBosFrame() {
|
|
4883
|
+
const init = { text: " " };
|
|
4884
|
+
if (this.voiceSettings) init["voice_settings"] = this.voiceSettings;
|
|
4885
|
+
if (!this.autoMode && this.chunkLengthSchedule) {
|
|
4886
|
+
init["generation_config"] = { chunk_length_schedule: this.chunkLengthSchedule };
|
|
4887
|
+
}
|
|
4888
|
+
return init;
|
|
4889
|
+
}
|
|
4151
4890
|
/**
|
|
4152
4891
|
* Single-shot synthesis: open WS, send text, yield bytes, close.
|
|
4153
4892
|
*
|
|
@@ -4166,9 +4905,26 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
|
|
|
4166
4905
|
* after flush — auto_mode could otherwise truncate the tail audio).
|
|
4167
4906
|
*/
|
|
4168
4907
|
async *synthesizeStream(text) {
|
|
4169
|
-
|
|
4170
|
-
|
|
4171
|
-
|
|
4908
|
+
let ws;
|
|
4909
|
+
let bosAlreadySent = false;
|
|
4910
|
+
let adopted = false;
|
|
4911
|
+
const parked = this.adoptedConnection;
|
|
4912
|
+
this.adoptedConnection = null;
|
|
4913
|
+
if (parked && parked.ws.readyState === WebSocket6.OPEN) {
|
|
4914
|
+
ws = parked.ws;
|
|
4915
|
+
bosAlreadySent = parked.bosSent;
|
|
4916
|
+
adopted = true;
|
|
4917
|
+
} else {
|
|
4918
|
+
if (parked) {
|
|
4919
|
+
try {
|
|
4920
|
+
parked.ws.close();
|
|
4921
|
+
} catch {
|
|
4922
|
+
}
|
|
4923
|
+
}
|
|
4924
|
+
ws = new WebSocket6(this.buildUrl(), {
|
|
4925
|
+
headers: { "xi-api-key": this.apiKey }
|
|
4926
|
+
});
|
|
4927
|
+
}
|
|
4172
4928
|
const queue = [];
|
|
4173
4929
|
let done = false;
|
|
4174
4930
|
let pendingError = null;
|
|
@@ -4238,28 +4994,27 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
|
|
|
4238
4994
|
};
|
|
4239
4995
|
ws.on("error", onError);
|
|
4240
4996
|
try {
|
|
4241
|
-
|
|
4242
|
-
|
|
4243
|
-
|
|
4244
|
-
|
|
4245
|
-
|
|
4246
|
-
|
|
4247
|
-
|
|
4248
|
-
|
|
4249
|
-
|
|
4250
|
-
|
|
4251
|
-
|
|
4252
|
-
|
|
4253
|
-
|
|
4254
|
-
|
|
4997
|
+
if (!adopted) {
|
|
4998
|
+
await new Promise((resolve, reject) => {
|
|
4999
|
+
connectTimer = setTimeout(
|
|
5000
|
+
() => reject(new Error("ElevenLabs WS connect timeout")),
|
|
5001
|
+
CONNECT_TIMEOUT_MS4
|
|
5002
|
+
);
|
|
5003
|
+
ws.once("open", () => {
|
|
5004
|
+
if (connectTimer) clearTimeout(connectTimer);
|
|
5005
|
+
connectTimer = void 0;
|
|
5006
|
+
resolve();
|
|
5007
|
+
});
|
|
5008
|
+
ws.once("error", (err) => {
|
|
5009
|
+
if (connectTimer) clearTimeout(connectTimer);
|
|
5010
|
+
connectTimer = void 0;
|
|
5011
|
+
reject(err);
|
|
5012
|
+
});
|
|
4255
5013
|
});
|
|
4256
|
-
});
|
|
4257
|
-
const init = { text: " " };
|
|
4258
|
-
if (this.voiceSettings) init["voice_settings"] = this.voiceSettings;
|
|
4259
|
-
if (!this.autoMode && this.chunkLengthSchedule) {
|
|
4260
|
-
init["generation_config"] = { chunk_length_schedule: this.chunkLengthSchedule };
|
|
4261
5014
|
}
|
|
4262
|
-
|
|
5015
|
+
if (!bosAlreadySent) {
|
|
5016
|
+
ws.send(JSON.stringify(this.buildBosFrame()));
|
|
5017
|
+
}
|
|
4263
5018
|
ws.send(JSON.stringify({ text: text + " ", flush: true }));
|
|
4264
5019
|
ws.on("message", onMessage);
|
|
4265
5020
|
ws.on("close", onClose);
|
|
@@ -4305,8 +5060,141 @@ var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
|
|
|
4305
5060
|
ws.removeAllListeners();
|
|
4306
5061
|
}
|
|
4307
5062
|
}
|
|
5063
|
+
/**
|
|
5064
|
+
* Pre-call WebSocket warmup for the ElevenLabs `/stream-input` endpoint.
|
|
5065
|
+
*
|
|
5066
|
+
* Opens the WS (DNS + TLS + auth handshake), sends the EXACT same BOS
|
|
5067
|
+
* frame the production `synthesizeStream()` path sends — including
|
|
5068
|
+
* `voice_settings` and (when configured) `generation_config` — so
|
|
5069
|
+
* ElevenLabs instantiates the same per-session worker for both
|
|
5070
|
+
* warmup and the live request. If the BOS frames differ, the server
|
|
5071
|
+
* may route warmup and the real call to two different workers, and
|
|
5072
|
+
* the warmed worker is wasted. Idles ~250 ms, then closes. By the
|
|
5073
|
+
* time the first `synthesizeStream()` call lands during the call,
|
|
5074
|
+
* the connection pool has the upstream warm — net wire time saving
|
|
5075
|
+
* of 200-500 ms.
|
|
5076
|
+
*
|
|
5077
|
+
* Billing safety: ElevenLabs bills on synthesised characters
|
|
5078
|
+
* delivered via `audio` frames (per https://elevenlabs.io/pricing).
|
|
5079
|
+
* The keepalive (single-space `text`, no `flush: true`, no real
|
|
5080
|
+
* transcript) is documented as the session-establishment frame and
|
|
5081
|
+
* does NOT generate synthesis. Closing without sending the actual
|
|
5082
|
+
* transcript does not consume billable characters. Best-effort:
|
|
5083
|
+
* failures logged at debug level.
|
|
5084
|
+
*/
|
|
5085
|
+
async warmup() {
|
|
5086
|
+
const ws = new WebSocket6(this.buildUrl(), {
|
|
5087
|
+
headers: { "xi-api-key": this.apiKey }
|
|
5088
|
+
});
|
|
5089
|
+
try {
|
|
5090
|
+
await new Promise((resolve, reject) => {
|
|
5091
|
+
const timer = setTimeout(
|
|
5092
|
+
() => reject(new Error("ElevenLabs WS TTS warmup connect timeout")),
|
|
5093
|
+
CONNECT_TIMEOUT_MS4
|
|
5094
|
+
);
|
|
5095
|
+
ws.once("open", () => {
|
|
5096
|
+
clearTimeout(timer);
|
|
5097
|
+
resolve();
|
|
5098
|
+
});
|
|
5099
|
+
ws.once("error", (err) => {
|
|
5100
|
+
clearTimeout(timer);
|
|
5101
|
+
reject(err);
|
|
5102
|
+
});
|
|
5103
|
+
});
|
|
5104
|
+
try {
|
|
5105
|
+
ws.send(JSON.stringify(this.buildBosFrame()));
|
|
5106
|
+
} catch {
|
|
5107
|
+
}
|
|
5108
|
+
await new Promise((r) => setTimeout(r, 250));
|
|
5109
|
+
} catch (err) {
|
|
5110
|
+
getLogger().debug(`ElevenLabs WS TTS warmup failed (best-effort): ${String(err)}`);
|
|
5111
|
+
} finally {
|
|
5112
|
+
try {
|
|
5113
|
+
if (ws.readyState === WebSocket6.OPEN || ws.readyState === WebSocket6.CONNECTING) {
|
|
5114
|
+
ws.close();
|
|
5115
|
+
}
|
|
5116
|
+
} catch {
|
|
5117
|
+
}
|
|
5118
|
+
ws.removeAllListeners();
|
|
5119
|
+
}
|
|
5120
|
+
}
|
|
5121
|
+
/**
|
|
5122
|
+
* Open a fresh WS, send the EXACT BOS frame the live `synthesizeStream`
|
|
5123
|
+
* sends, and return the OPEN socket without closing it. Used by the
|
|
5124
|
+
* prewarm pipeline to park a TTS connection during the carrier ringing
|
|
5125
|
+
* window so the next `synthesizeStream` call can adopt it via
|
|
5126
|
+
* {@link adoptWebSocket} and skip ~400-900 ms of TLS + BOS round-trip.
|
|
5127
|
+
*
|
|
5128
|
+
* Returns a parked-handle the caller stashes; the next
|
|
5129
|
+
* `synthesizeStream` will detect the adoption queue and skip its own
|
|
5130
|
+
* `new WebSocket()` + BOS send.
|
|
5131
|
+
*
|
|
5132
|
+
* Billing safety: BOS is the documented session-establishment frame
|
|
5133
|
+
* (single space `text`, no `flush: true`) and does not generate
|
|
5134
|
+
* synthesis. ElevenLabs bills on `audio` frames received from the
|
|
5135
|
+
* server, not on BOS bytes sent by the client.
|
|
5136
|
+
*/
|
|
5137
|
+
async openParkedConnection() {
|
|
5138
|
+
const ws = new WebSocket6(this.buildUrl(), {
|
|
5139
|
+
headers: { "xi-api-key": this.apiKey }
|
|
5140
|
+
});
|
|
5141
|
+
await new Promise((resolve, reject) => {
|
|
5142
|
+
const timer = setTimeout(
|
|
5143
|
+
() => reject(new Error("ElevenLabs WS park connect timeout")),
|
|
5144
|
+
CONNECT_TIMEOUT_MS4
|
|
5145
|
+
);
|
|
5146
|
+
ws.once("open", () => {
|
|
5147
|
+
clearTimeout(timer);
|
|
5148
|
+
resolve();
|
|
5149
|
+
});
|
|
5150
|
+
ws.once("error", (err) => {
|
|
5151
|
+
clearTimeout(timer);
|
|
5152
|
+
reject(err);
|
|
5153
|
+
});
|
|
5154
|
+
});
|
|
5155
|
+
let bosSent = false;
|
|
5156
|
+
try {
|
|
5157
|
+
ws.send(JSON.stringify(this.buildBosFrame()));
|
|
5158
|
+
bosSent = true;
|
|
5159
|
+
} catch {
|
|
5160
|
+
}
|
|
5161
|
+
return { ws, bosSent };
|
|
5162
|
+
}
|
|
5163
|
+
/**
|
|
5164
|
+
* Stash a parked WS handle so the next `synthesizeStream` call adopts
|
|
5165
|
+
* it instead of opening a fresh socket. Caller is responsible for
|
|
5166
|
+
* holding the handle alive until either the live request consumes it
|
|
5167
|
+
* or the call ends (in which case `discardAdoptedConnection()`
|
|
5168
|
+
* cleans it up).
|
|
5169
|
+
*/
|
|
5170
|
+
adoptWebSocket(parked) {
|
|
5171
|
+
const prev = this.adoptedConnection;
|
|
5172
|
+
this.adoptedConnection = parked;
|
|
5173
|
+
if (prev && prev !== parked) {
|
|
5174
|
+
try {
|
|
5175
|
+
prev.ws.close();
|
|
5176
|
+
} catch {
|
|
5177
|
+
}
|
|
5178
|
+
}
|
|
5179
|
+
}
|
|
5180
|
+
/**
|
|
5181
|
+
* Drop and close any pending parked WS without consuming it. Used on
|
|
5182
|
+
* call-failure paths so a never-started call does not leak a TTS WS
|
|
5183
|
+
* that ElevenLabs will close after its inactivity timeout anyway.
|
|
5184
|
+
*/
|
|
5185
|
+
discardAdoptedConnection() {
|
|
5186
|
+
const parked = this.adoptedConnection;
|
|
5187
|
+
this.adoptedConnection = null;
|
|
5188
|
+
if (parked) {
|
|
5189
|
+
try {
|
|
5190
|
+
parked.ws.close();
|
|
5191
|
+
} catch {
|
|
5192
|
+
}
|
|
5193
|
+
}
|
|
5194
|
+
}
|
|
4308
5195
|
/** No-op — connections are per-utterance and torn down inside synthesizeStream. */
|
|
4309
5196
|
async close() {
|
|
5197
|
+
this.discardAdoptedConnection();
|
|
4310
5198
|
}
|
|
4311
5199
|
};
|
|
4312
5200
|
function looksLikeJson(buf) {
|
|
@@ -4386,6 +5274,8 @@ var OpenAITTS = class _OpenAITTS {
|
|
|
4386
5274
|
speed;
|
|
4387
5275
|
antiAlias;
|
|
4388
5276
|
targetSampleRate;
|
|
5277
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5278
|
+
static providerKey = "openai_tts";
|
|
4389
5279
|
/**
|
|
4390
5280
|
* Synthesise text to speech and return the full audio as a single Buffer.
|
|
4391
5281
|
*
|
|
@@ -4611,6 +5501,8 @@ var CartesiaTTSVoiceMode = {
|
|
|
4611
5501
|
EMBEDDING: "embedding"
|
|
4612
5502
|
};
|
|
4613
5503
|
var CartesiaTTS = class _CartesiaTTS {
|
|
5504
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5505
|
+
static providerKey = "cartesia_tts";
|
|
4614
5506
|
apiKey;
|
|
4615
5507
|
model;
|
|
4616
5508
|
voice;
|
|
@@ -4685,6 +5577,38 @@ var CartesiaTTS = class _CartesiaTTS {
|
|
|
4685
5577
|
}
|
|
4686
5578
|
return payload;
|
|
4687
5579
|
}
|
|
5580
|
+
/**
|
|
5581
|
+
* Pre-call HTTP warmup for the Cartesia `/tts/bytes` endpoint.
|
|
5582
|
+
*
|
|
5583
|
+
* Issues a lightweight `GET <baseUrl>/voices` so DNS, TLS, and HTTP/2
|
|
5584
|
+
* are already up by the time the first `synthesizeStream()` POST
|
|
5585
|
+
* lands. Best-effort: 5 s timeout, all exceptions swallowed at
|
|
5586
|
+
* debug level.
|
|
5587
|
+
*
|
|
5588
|
+
* Billing safety: `GET /voices` is a free metadata read on
|
|
5589
|
+
* Cartesia's REST surface (per https://docs.cartesia.ai). It does
|
|
5590
|
+
* not consume synthesis credits. The actual synthesis is billed
|
|
5591
|
+
* only when `POST /tts/bytes` runs with a non-empty `transcript`.
|
|
5592
|
+
*
|
|
5593
|
+
* Note: Cartesia TTS uses the HTTP path (vs the WebSocket variant
|
|
5594
|
+
* Cartesia also exposes) — connection warmup is therefore HTTP-GET
|
|
5595
|
+
* based, not WebSocket pre-handshake. The latency win is smaller
|
|
5596
|
+
* (~50-150 ms vs the ~200-500 ms of a WS prewarm) but still real.
|
|
5597
|
+
*/
|
|
5598
|
+
async warmup() {
|
|
5599
|
+
try {
|
|
5600
|
+
await fetch(`${this.baseUrl}/voices`, {
|
|
5601
|
+
method: "GET",
|
|
5602
|
+
headers: {
|
|
5603
|
+
"X-API-Key": this.apiKey,
|
|
5604
|
+
"Cartesia-Version": this.apiVersion
|
|
5605
|
+
},
|
|
5606
|
+
signal: AbortSignal.timeout(5e3)
|
|
5607
|
+
});
|
|
5608
|
+
} catch (err) {
|
|
5609
|
+
getLogger().debug(`Cartesia TTS warmup failed (best-effort): ${String(err)}`);
|
|
5610
|
+
}
|
|
5611
|
+
}
|
|
4688
5612
|
/** Synthesize text and return the concatenated audio buffer. */
|
|
4689
5613
|
async synthesize(text) {
|
|
4690
5614
|
const chunks = [];
|
|
@@ -4788,6 +5712,8 @@ function timeoutForModel(model) {
|
|
|
4788
5712
|
return MIST_MODEL_TIMEOUT_MS;
|
|
4789
5713
|
}
|
|
4790
5714
|
var RimeTTS = class {
|
|
5715
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5716
|
+
static providerKey = "rime";
|
|
4791
5717
|
apiKey;
|
|
4792
5718
|
model;
|
|
4793
5719
|
speaker;
|
|
@@ -4943,6 +5869,8 @@ var LMNTSampleRate = {
|
|
|
4943
5869
|
HZ_24000: 24e3
|
|
4944
5870
|
};
|
|
4945
5871
|
var LMNTTTS = class {
|
|
5872
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5873
|
+
static providerKey = "lmnt";
|
|
4946
5874
|
apiKey;
|
|
4947
5875
|
model;
|
|
4948
5876
|
voice;
|
|
@@ -5041,6 +5969,7 @@ init_esm_shims();
|
|
|
5041
5969
|
// src/providers/inworld-tts.ts
|
|
5042
5970
|
init_esm_shims();
|
|
5043
5971
|
var INWORLD_BASE_URL = "https://api.inworld.ai/tts/v1/voice:stream";
|
|
5972
|
+
var INWORLD_VOICES_URL = "https://api.inworld.ai/tts/v1/voices";
|
|
5044
5973
|
var InworldModel = {
|
|
5045
5974
|
TTS_2: "inworld-tts-2",
|
|
5046
5975
|
TTS_1_5_MAX: "inworld-tts-1.5-max",
|
|
@@ -5055,6 +5984,8 @@ var InworldAudioEncoding = {
|
|
|
5055
5984
|
MP3: "MP3"
|
|
5056
5985
|
};
|
|
5057
5986
|
var InworldTTS = class {
|
|
5987
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5988
|
+
static providerKey = "inworld";
|
|
5058
5989
|
authToken;
|
|
5059
5990
|
model;
|
|
5060
5991
|
voice;
|
|
@@ -5099,6 +6030,45 @@ var InworldTTS = class {
|
|
|
5099
6030
|
if (this.deliveryMode !== void 0) payload.deliveryMode = this.deliveryMode;
|
|
5100
6031
|
return payload;
|
|
5101
6032
|
}
|
|
6033
|
+
/**
|
|
6034
|
+
* Pre-call HTTP warmup for the Inworld TTS API.
|
|
6035
|
+
*
|
|
6036
|
+
* Issues a lightweight `GET /tts/v1/voices` against the API host so
|
|
6037
|
+
* DNS + TLS + HTTP/2 connection are already up by the time the first
|
|
6038
|
+
* `synthesizeStream()` POST lands. Best-effort: 5 s timeout, all
|
|
6039
|
+
* exceptions swallowed at debug level.
|
|
6040
|
+
*
|
|
6041
|
+
* Earlier revisions issued `HEAD` against the streaming endpoint
|
|
6042
|
+
* (`/tts/v1/voice:stream`). That endpoint is POST-only so HEAD
|
|
6043
|
+
* returns `405 Method Not Allowed` — the warmup still completed the
|
|
6044
|
+
* TLS handshake but spammed 405 errors into Inworld's audit logs and
|
|
6045
|
+
* into our own logs. Switching to a documented `GET /tts/v1/voices`
|
|
6046
|
+
* metadata read is a 2xx-clean equivalent.
|
|
6047
|
+
*
|
|
6048
|
+
* Billing safety: `GET /tts/v1/voices` is a free metadata endpoint
|
|
6049
|
+
* (per https://docs.inworld.ai/). It returns the voice catalogue
|
|
6050
|
+
* without invoking the synthesis pipeline. The actual synthesis is
|
|
6051
|
+
* billed only when `POST /tts/v1/voice:stream` runs with a non-empty
|
|
6052
|
+
* `text`.
|
|
6053
|
+
*
|
|
6054
|
+
* Note: Inworld TTS uses the HTTP NDJSON streaming path rather than
|
|
6055
|
+
* a persistent WebSocket — connection warmup is therefore HTTP-based,
|
|
6056
|
+
* not WebSocket pre-handshake. The latency win is smaller (~50-150 ms)
|
|
6057
|
+
* than the WS-based prewarms but still real on cold-start calls.
|
|
6058
|
+
*/
|
|
6059
|
+
async warmup() {
|
|
6060
|
+
try {
|
|
6061
|
+
await fetch(INWORLD_VOICES_URL, {
|
|
6062
|
+
method: "GET",
|
|
6063
|
+
headers: {
|
|
6064
|
+
Authorization: `Basic ${this.authToken}`
|
|
6065
|
+
},
|
|
6066
|
+
signal: AbortSignal.timeout(5e3)
|
|
6067
|
+
});
|
|
6068
|
+
} catch (err) {
|
|
6069
|
+
getLogger().debug(`Inworld TTS warmup failed (best-effort): ${String(err)}`);
|
|
6070
|
+
}
|
|
6071
|
+
}
|
|
5102
6072
|
/** Synthesize text and return the concatenated audio buffer. */
|
|
5103
6073
|
async synthesize(text) {
|
|
5104
6074
|
const chunks = [];
|
|
@@ -5238,6 +6208,8 @@ var DEFAULT_MODEL = AnthropicModel.CLAUDE_HAIKU_4_5_20251001;
|
|
|
5238
6208
|
var DEFAULT_MAX_TOKENS = 1024;
|
|
5239
6209
|
var PROMPT_CACHING_BETA = "prompt-caching-2024-07-31";
|
|
5240
6210
|
var AnthropicLLMProvider = class {
|
|
6211
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
6212
|
+
static providerKey = "anthropic";
|
|
5241
6213
|
apiKey;
|
|
5242
6214
|
model;
|
|
5243
6215
|
maxTokens;
|
|
@@ -5259,6 +6231,27 @@ var AnthropicLLMProvider = class {
|
|
|
5259
6231
|
this.anthropicVersion = options.anthropicVersion ?? DEFAULT_ANTHROPIC_VERSION;
|
|
5260
6232
|
this.promptCaching = options.promptCaching ?? true;
|
|
5261
6233
|
}
|
|
6234
|
+
/**
|
|
6235
|
+
* Pre-call DNS / TLS warmup for the Anthropic Messages API.
|
|
6236
|
+
* Issues a lightweight ``GET https://api.anthropic.com/v1/models`` so
|
|
6237
|
+
* DNS, TLS and HTTP/2 are already up by the time the first ``messages``
|
|
6238
|
+
* call lands. Best-effort: 5 s timeout, exceptions swallowed at debug.
|
|
6239
|
+
*/
|
|
6240
|
+
async warmup() {
|
|
6241
|
+
try {
|
|
6242
|
+
const modelsUrl = this.url.replace(/\/messages\/?$/, "/models");
|
|
6243
|
+
await fetch(modelsUrl, {
|
|
6244
|
+
method: "GET",
|
|
6245
|
+
headers: {
|
|
6246
|
+
"x-api-key": this.apiKey,
|
|
6247
|
+
"anthropic-version": this.anthropicVersion
|
|
6248
|
+
},
|
|
6249
|
+
signal: AbortSignal.timeout(5e3)
|
|
6250
|
+
});
|
|
6251
|
+
} catch (err) {
|
|
6252
|
+
getLogger().debug(`Anthropic LLM warmup failed (best-effort): ${String(err)}`);
|
|
6253
|
+
}
|
|
6254
|
+
}
|
|
5262
6255
|
/** Stream Patter-format LLM chunks for the given OpenAI-style chat history. */
|
|
5263
6256
|
async *stream(messages, tools, opts) {
|
|
5264
6257
|
const { system, messages: anthropicMessages } = toAnthropicMessages(messages);
|
|
@@ -5494,6 +6487,8 @@ var GroqModel = {
|
|
|
5494
6487
|
};
|
|
5495
6488
|
var DEFAULT_MODEL2 = GroqModel.LLAMA_3_3_70B_VERSATILE;
|
|
5496
6489
|
var GroqLLMProvider = class {
|
|
6490
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
6491
|
+
static providerKey = "groq";
|
|
5497
6492
|
apiKey;
|
|
5498
6493
|
model;
|
|
5499
6494
|
baseUrl;
|
|
@@ -5527,6 +6522,21 @@ var GroqLLMProvider = class {
|
|
|
5527
6522
|
this.presencePenalty = options.presencePenalty;
|
|
5528
6523
|
this.stop = options.stop;
|
|
5529
6524
|
}
|
|
6525
|
+
/**
|
|
6526
|
+
* Pre-call DNS / TLS warmup for the Groq inference endpoint.
|
|
6527
|
+
* Best-effort: 5 s timeout, all exceptions swallowed at debug level.
|
|
6528
|
+
*/
|
|
6529
|
+
async warmup() {
|
|
6530
|
+
try {
|
|
6531
|
+
await fetch(`${this.baseUrl}/models`, {
|
|
6532
|
+
method: "GET",
|
|
6533
|
+
headers: { Authorization: `Bearer ${this.apiKey}` },
|
|
6534
|
+
signal: AbortSignal.timeout(5e3)
|
|
6535
|
+
});
|
|
6536
|
+
} catch (err) {
|
|
6537
|
+
getLogger().debug(`Groq LLM warmup failed (best-effort): ${String(err)}`);
|
|
6538
|
+
}
|
|
6539
|
+
}
|
|
5530
6540
|
/** Stream Patter-format LLM chunks from the Groq chat completions API. */
|
|
5531
6541
|
async *stream(messages, tools, opts) {
|
|
5532
6542
|
const body = {
|
|
@@ -5662,6 +6672,8 @@ var CerebrasModel = {
|
|
|
5662
6672
|
var DEFAULT_MODEL3 = CerebrasModel.GPT_OSS_120B;
|
|
5663
6673
|
var RETRY_BACKOFF_BASE_MS = 500;
|
|
5664
6674
|
var CerebrasLLMProvider = class {
|
|
6675
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
6676
|
+
static providerKey = "cerebras";
|
|
5665
6677
|
apiKey;
|
|
5666
6678
|
model;
|
|
5667
6679
|
baseUrl;
|
|
@@ -5697,6 +6709,21 @@ var CerebrasLLMProvider = class {
|
|
|
5697
6709
|
this.presencePenalty = options.presencePenalty;
|
|
5698
6710
|
this.stop = options.stop;
|
|
5699
6711
|
}
|
|
6712
|
+
/**
|
|
6713
|
+
* Pre-call DNS / TLS warmup for the Cerebras inference endpoint.
|
|
6714
|
+
* Best-effort: 5 s timeout, all exceptions swallowed at debug level.
|
|
6715
|
+
*/
|
|
6716
|
+
async warmup() {
|
|
6717
|
+
try {
|
|
6718
|
+
await fetch(`${this.baseUrl}/models`, {
|
|
6719
|
+
method: "GET",
|
|
6720
|
+
headers: { Authorization: `Bearer ${this.apiKey}` },
|
|
6721
|
+
signal: AbortSignal.timeout(5e3)
|
|
6722
|
+
});
|
|
6723
|
+
} catch (err) {
|
|
6724
|
+
getLogger().debug(`Cerebras LLM warmup failed (best-effort): ${String(err)}`);
|
|
6725
|
+
}
|
|
6726
|
+
}
|
|
5700
6727
|
/** Stream Patter-format LLM chunks from the Cerebras chat completions API. */
|
|
5701
6728
|
async *stream(messages, tools, opts) {
|
|
5702
6729
|
const body = {
|
|
@@ -5859,6 +6886,8 @@ var GoogleModel = {
|
|
|
5859
6886
|
var DEFAULT_MODEL4 = GoogleModel.GEMINI_2_5_FLASH;
|
|
5860
6887
|
var DEFAULT_BASE_URL3 = "https://generativelanguage.googleapis.com/v1beta";
|
|
5861
6888
|
var GoogleLLMProvider = class {
|
|
6889
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
6890
|
+
static providerKey = "google";
|
|
5862
6891
|
apiKey;
|
|
5863
6892
|
model;
|
|
5864
6893
|
baseUrl;
|
|
@@ -5876,6 +6905,23 @@ var GoogleLLMProvider = class {
|
|
|
5876
6905
|
this.temperature = options.temperature;
|
|
5877
6906
|
this.maxOutputTokens = options.maxOutputTokens;
|
|
5878
6907
|
}
|
|
6908
|
+
/**
|
|
6909
|
+
* Pre-call DNS / TLS warmup for the Gemini API.
|
|
6910
|
+
* Issues a lightweight ``GET ${baseUrl}/models?key=...`` so DNS, TLS
|
|
6911
|
+
* and HTTP/2 are already up by the time the first
|
|
6912
|
+
* ``streamGenerateContent`` call lands. Best-effort: 5 s timeout, all
|
|
6913
|
+
* exceptions swallowed at debug level.
|
|
6914
|
+
*/
|
|
6915
|
+
async warmup() {
|
|
6916
|
+
try {
|
|
6917
|
+
await fetch(`${this.baseUrl}/models?key=${encodeURIComponent(this.apiKey)}`, {
|
|
6918
|
+
method: "GET",
|
|
6919
|
+
signal: AbortSignal.timeout(5e3)
|
|
6920
|
+
});
|
|
6921
|
+
} catch (err) {
|
|
6922
|
+
getLogger().debug(`Google LLM warmup failed (best-effort): ${String(err)}`);
|
|
6923
|
+
}
|
|
6924
|
+
}
|
|
5879
6925
|
/** Stream Patter-format LLM chunks from the Gemini SSE endpoint. */
|
|
5880
6926
|
async *stream(messages, tools, opts) {
|
|
5881
6927
|
const { systemInstruction, contents } = toGeminiContents(messages);
|
|
@@ -6065,6 +7111,186 @@ var LLM5 = class extends GoogleLLMProvider {
|
|
|
6065
7111
|
}
|
|
6066
7112
|
};
|
|
6067
7113
|
|
|
7114
|
+
// src/providers/deepfilternet-filter.ts
|
|
7115
|
+
init_esm_shims();
|
|
7116
|
+
/** Look up the logger at call time by delegating to `getLogger()`. */
function log() {
  const activeLogger = getLogger();
  return activeLogger;
}
|
|
7119
|
+
var DEEPFILTERNET_SR = 48e3;
|
|
7120
|
+
/**
 * Try to load the optional `onnxruntime-node` package.
 *
 * @returns {Promise<object|null>} The imported module namespace, or `null`
 *   when the dynamic import rejects for any reason.
 */
async function loadOnnxRuntime() {
  // The specifier is held in a variable rather than written inline —
  // presumably so bundlers do not try to resolve the optional dependency.
  const moduleName = "onnxruntime-node";
  return import(moduleName).catch(() => null);
}
|
|
7129
|
+
/**
 * Decode signed 16-bit little-endian PCM into normalised float samples.
 *
 * @param {Buffer|Uint8Array} pcm - Raw PCM16 bytes. A trailing odd byte is ignored.
 * @returns {Float32Array} One value per sample, scaled by 1/32768 into [-1, 1).
 */
function pcm16ToFloat32(pcm) {
  const sampleCount = Math.floor(pcm.byteLength / 2);
  // DataView has no alignment requirement, unlike Int16Array, which throws a
  // RangeError when byteOffset is odd (common for pooled or sliced Buffers).
  // Reading explicitly as little-endian also matches what float32ToPcm16
  // writes, independent of host byte order.
  const view = new DataView(pcm.buffer, pcm.byteOffset, sampleCount * 2);
  const out = new Float32Array(sampleCount);
  for (let i = 0; i < sampleCount; i += 1) {
    out[i] = view.getInt16(i * 2, true) / 32768;
  }
  return out;
}
|
|
7137
|
+
/**
 * Encode float samples as signed 16-bit little-endian PCM.
 *
 * @param {Float32Array|number[]} samples - Samples; values outside [-1, 1] are clamped.
 * @returns {Buffer} Two bytes per sample, each `round(sample * 32767)`.
 */
function float32ToPcm16(samples) {
  const total = samples.length;
  const pcm = Buffer.alloc(total * 2);
  let writeAt = 0;
  for (let idx = 0; idx < total; idx += 1) {
    const bounded = Math.min(1, Math.max(-1, samples[idx]));
    pcm.writeInt16LE(Math.round(bounded * 32767), writeAt);
    writeAt += 2;
  }
  return pcm;
}
|
|
7145
|
+
/**
 * Noise-suppression audio filter that runs an ONNX model via the optional
 * `onnxruntime-node` package.
 *
 * Incoming PCM16 chunks are resampled to DEEPFILTERNET_SR before inference
 * and back to the caller's sample rate afterwards. Whenever the filter cannot
 * run (no `modelPath`, runtime not loadable, model fails to load, inference
 * throws) `process()` returns the input chunk unmodified.
 */
var DeepFilterNetFilter = class {
  // Value handed to `ort.InferenceSession.create`; falsy disables filtering.
  modelPath;
  // When true, the one-time "pass-through" warnings are not logged.
  silenceWarnings;
  // Lazily created inference session (see ensureSession).
  session = null;
  // Lazily imported onnxruntime module, or null while not loaded.
  ort = null;
  // Set once a warning/error about pass-through mode has been reported.
  warned = false;
  closed = false;
  // Set when the runtime import or the model load failed. Without this flag
  // every audio chunk would retry the dynamic import / InferenceSession.create
  // and log the same error again.
  loadFailed = false;
  // Stateful resamplers for src_sr<->48k conversions so chunk-boundary
  // samples are not discarded. Lazy-created and rebuilt on rate change.
  _resamplerSrcRate = null;
  _upsamplerInst = null;
  _downsamplerInst = null;
  /**
   * @param {{ modelPath?: string, silenceWarnings?: boolean }} [options]
   */
  constructor(options = {}) {
    this.modelPath = options.modelPath;
    this.silenceWarnings = options.silenceWarnings === true;
  }
  /**
   * Return the inference session, creating it on first use.
   *
   * @returns {Promise<object|null>} The session, or `null` when filtering is
   *   unavailable. A failed runtime import or model load is remembered and
   *   not retried on later calls.
   */
  async ensureSession() {
    if (this.session !== null) {
      return this.session;
    }
    if (this.loadFailed) {
      return null;
    }
    if (!this.modelPath) {
      if (!this.warned && !this.silenceWarnings) {
        log().warn(
          "DeepFilterNetFilter: no modelPath provided; audio will pass through unmodified. Provide a DeepFilterNet ONNX model to enable noise suppression."
        );
        this.warned = true;
      }
      return null;
    }
    if (this.ort === null) {
      this.ort = await loadOnnxRuntime();
    }
    if (this.ort === null) {
      this.loadFailed = true;
      if (!this.warned && !this.silenceWarnings) {
        log().warn(
          "DeepFilterNetFilter: onnxruntime-node is not installed; audio will pass through unmodified. Run `npm install onnxruntime-node` to enable noise suppression."
        );
        this.warned = true;
      }
      return null;
    }
    try {
      this.session = await this.ort.InferenceSession.create(this.modelPath);
      return this.session;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      log().error(`DeepFilterNetFilter: failed to load model: ${message}`);
      this.loadFailed = true;
      this.warned = true;
      return null;
    }
  }
  /**
   * Run noise suppression on a PCM16 chunk; pass-through when no model is loaded.
   *
   * @param {Buffer} pcmChunk - PCM16 audio at `sampleRate`.
   * @param {number} sampleRate - Sample rate of `pcmChunk` in Hz.
   * @returns {Promise<Buffer>} Filtered audio at `sampleRate`, or `pcmChunk`
   *   itself when filtering is unavailable or inference fails.
   * @throws {Error} If the filter has been closed.
   */
  async process(pcmChunk, sampleRate) {
    if (this.closed) {
      throw new Error("DeepFilterNetFilter is closed");
    }
    if (pcmChunk.length === 0) {
      return pcmChunk;
    }
    const session = await this.ensureSession();
    if (session === null || this.ort === null) {
      return pcmChunk;
    }
    try {
      // (Re)build both resamplers whenever the caller's sample rate changes.
      if (this._resamplerSrcRate !== sampleRate) {
        this._resamplerSrcRate = sampleRate;
        this._upsamplerInst = new StatefulResampler({ srcRate: sampleRate, dstRate: DEEPFILTERNET_SR });
        this._downsamplerInst = new StatefulResampler({ srcRate: DEEPFILTERNET_SR, dstRate: sampleRate });
      }
      // The float round-trip is kept from the original implementation: the
      // resampler receives a freshly allocated buffer rather than the
      // caller's chunk.
      const samples = pcm16ToFloat32(pcmChunk);
      const pcm16Up = this._upsamplerInst.process(float32ToPcm16(samples));
      const upsampled = pcm16ToFloat32(pcm16Up);
      // Only the model's first declared input and output are used.
      const inputName = session.inputNames[0];
      const outputName = session.outputNames[0];
      const tensor = new this.ort.Tensor("float32", upsampled, [1, upsampled.length]);
      const feeds = { [inputName]: tensor };
      const results = await session.run(feeds);
      const output = results[outputName];
      if (!output || !output.data) {
        return pcmChunk;
      }
      const enhanced = output.data instanceof Float32Array ? output.data : new Float32Array(output.data);
      const pcm16Enhanced = float32ToPcm16(enhanced);
      const pcm16Restored = this._downsamplerInst.process(pcm16Enhanced);
      return pcm16Restored;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      log().error(`DeepFilterNetFilter.process failed: ${message}`);
      return pcmChunk;
    }
  }
  /** Flush resamplers, release the ONNX session, and mark the filter closed. */
  async close() {
    // Flush errors are deliberately ignored: the resamplers are being discarded.
    try {
      this._upsamplerInst?.flush();
    } catch {
    }
    try {
      this._downsamplerInst?.flush();
    } catch {
    }
    this._upsamplerInst = null;
    this._downsamplerInst = null;
    if (this.session !== null && typeof this.session.release === "function") {
      try {
        await this.session.release();
      } catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        log().warn(`DeepFilterNetFilter.close: release failed: ${message}`);
      }
    }
    this.session = null;
    this.closed = true;
  }
};
|
|
7260
|
+
|
|
7261
|
+
// src/providers/krisp-filter.ts
|
|
7262
|
+
init_esm_shims();
|
|
7263
|
+
/** Named sample-rate constants; values are in Hz, as the key names indicate. */
var KrispSampleRate = {
  HZ_8000: 8000,
  HZ_16000: 16000,
  HZ_32000: 32000,
  HZ_44100: 44100,
  HZ_48000: 48000
};

/** Named frame-duration constants; values are in milliseconds, as the key names indicate. */
var KrispFrameDuration = {
  MS_10: 10,
  MS_15: 15,
  MS_20: 20,
  MS_30: 30,
  MS_32: 32
};
|
|
7277
|
+
// Error text explaining that the Krisp filter is unavailable in the
// TypeScript SDK. Written one output line per entry; joined with "\n", with a
// final empty entry supplying the trailing newline.
var NODE_SDK_UNAVAILABLE_MESSAGE = [
  "Krisp VIVA Filter is not yet available for the Patter TypeScript SDK.",
  "",
  "As of 2026-05, Krisp does not publish an official Node.js (server) SDK. The Patter TypeScript SDK ships only the AudioFilter interface scaffold (this file) for parity with the Python implementation, since Patter runs server-side on a real-time audio stream from the telephony carrier.",
  "",
  "Available paths today:",
  " 1. Use the Python SDK: `from getpatter.providers.krisp_filter import KrispVivaFilter` \u2014 fully implemented, requires `pip install getpatter[krisp]` + `KRISP_VIVA_SDK_LICENSE_KEY` + `KRISP_VIVA_FILTER_MODEL_PATH`.",
  " 2. Use DeepFilterNet on TS: `new DeepFilterNetFilter({ modelPath: '.../DeepFilterNet3.onnx' })` \u2014 community ONNX export, no license needed.",
  "",
  "Browser/React Native (not applicable to Patter server-side, listed for completeness):",
  " - Browser WASM wrappers (various third-party packages) process local microphone capture, not server-received PCM/mulaw audio.",
  " - Mobile client wrappers (iOS/Android, various third-party packages) are likewise client-side only.",
  "",
  "Track Node SDK status:",
  " - https://krisp.ai/developers/",
  " - Patter backlog: task #38 \"Krisp TS port decision\"",
  ""
].join("\n");
|
|
7278
|
+
/**
 * Placeholder for a Krisp-based audio filter. Construction always fails with
 * NODE_SDK_UNAVAILABLE_MESSAGE, so no instance can ever be obtained.
 */
var KrispVivaFilter = class {
  // Provider identifier — presumably the same pricing/dashboard key
  // convention the other provider classes use; confirm against the consumer.
  static providerKey = "krisp_viva";
  /**
   * @param {object} [_options] - Accepted for signature compatibility; unused.
   * @throws {Error} Always.
   */
  constructor(_options = {}) {
    const unavailable = new Error(NODE_SDK_UNAVAILABLE_MESSAGE);
    throw unavailable;
  }
  // Neither method below can run on an instance, because the constructor
  // throws. They exist so the class keeps the `process`/`close` method shape
  // that audio filters expose, and are the spots to fill in once a real
  // implementation is possible.
  /** Identity pass-through: resolves to the chunk it was given. */
  async process(pcmChunk, _sampleRate) {
    const untouched = pcmChunk;
    return untouched;
  }
  /** No resources to release. */
  async close() {
    return undefined;
  }
};
|
|
7293
|
+
|
|
6068
7294
|
// src/telephony/twilio.ts
|
|
6069
7295
|
init_esm_shims();
|
|
6070
7296
|
var Carrier = class {
|
|
@@ -7100,6 +8326,8 @@ var TelnyxSTT = class {
|
|
|
7100
8326
|
transcriptionEngine;
|
|
7101
8327
|
sampleRate;
|
|
7102
8328
|
baseUrl;
|
|
8329
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
8330
|
+
static providerKey = "telnyx_stt";
|
|
7103
8331
|
ws = null;
|
|
7104
8332
|
callbacks = [];
|
|
7105
8333
|
headerSent = false;
|
|
@@ -7204,6 +8432,8 @@ var TelnyxTTS = class {
|
|
|
7204
8432
|
apiKey;
|
|
7205
8433
|
voice;
|
|
7206
8434
|
baseUrl;
|
|
8435
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
8436
|
+
static providerKey = "telnyx_tts";
|
|
7207
8437
|
/** Collect every audio chunk into a single Buffer. */
|
|
7208
8438
|
async synthesize(text) {
|
|
7209
8439
|
const chunks = [];
|
|
@@ -7305,10 +8535,12 @@ export {
|
|
|
7305
8535
|
DEFAULT_MIN_SENTENCE_LEN,
|
|
7306
8536
|
DEFAULT_PRICING,
|
|
7307
8537
|
DTMF_EVENTS,
|
|
8538
|
+
DeepFilterNetFilter,
|
|
7308
8539
|
STT as DeepgramSTT,
|
|
7309
8540
|
DefaultToolExecutor,
|
|
7310
8541
|
ConvAI as ElevenLabsConvAI,
|
|
7311
8542
|
ElevenLabsConvAIAdapter,
|
|
8543
|
+
ElevenLabsTTS as ElevenLabsRestTTS,
|
|
7312
8544
|
TTS as ElevenLabsTTS,
|
|
7313
8545
|
TTS2 as ElevenLabsWebSocketTTS,
|
|
7314
8546
|
ErrorCode,
|
|
@@ -7322,13 +8554,19 @@ export {
|
|
|
7322
8554
|
Guardrail,
|
|
7323
8555
|
IVRActivity,
|
|
7324
8556
|
TTS7 as InworldTTS,
|
|
8557
|
+
KrispFrameDuration,
|
|
8558
|
+
KrispSampleRate,
|
|
8559
|
+
KrispVivaFilter,
|
|
7325
8560
|
LLMLoop,
|
|
7326
8561
|
TTS6 as LMNTTTS,
|
|
7327
8562
|
MetricsStore,
|
|
8563
|
+
MinWordsStrategy,
|
|
7328
8564
|
Ngrok,
|
|
7329
8565
|
LLM as OpenAILLM,
|
|
7330
8566
|
OpenAILLMProvider,
|
|
7331
8567
|
Realtime as OpenAIRealtime,
|
|
8568
|
+
Realtime2 as OpenAIRealtime2,
|
|
8569
|
+
OpenAIRealtime2Adapter,
|
|
7332
8570
|
OpenAIRealtimeAdapter,
|
|
7333
8571
|
TTS3 as OpenAITTS,
|
|
7334
8572
|
STT3 as OpenAITranscribeSTT,
|
|
@@ -7395,6 +8633,7 @@ export {
|
|
|
7395
8633
|
deepgram,
|
|
7396
8634
|
defineTool,
|
|
7397
8635
|
elevenlabs,
|
|
8636
|
+
evaluateStrategies as evaluateBargeInStrategies,
|
|
7398
8637
|
filterEmoji,
|
|
7399
8638
|
filterForTTS,
|
|
7400
8639
|
filterMarkdown,
|
|
@@ -7420,6 +8659,7 @@ export {
|
|
|
7420
8659
|
resample24kTo16k,
|
|
7421
8660
|
resample8kTo16k,
|
|
7422
8661
|
resamplePcm,
|
|
8662
|
+
resetStrategies as resetBargeInStrategies,
|
|
7423
8663
|
rime,
|
|
7424
8664
|
scheduleCron,
|
|
7425
8665
|
scheduleInterval,
|