getpatter 0.6.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/barge-in-strategies-X6ARMGIQ.mjs +12 -0
- package/dist/chunk-D4424JZR.mjs +71 -0
- package/dist/{chunk-X3364LSI.mjs → chunk-RV7APPYE.mjs} +36 -2
- package/dist/{chunk-JUQ5WQTQ.mjs → chunk-TEW3NAZJ.mjs} +3244 -1674
- package/dist/cli.js +277 -24
- package/dist/dashboard/ui.html +13 -13
- package/dist/index.d.mts +1525 -364
- package/dist/index.d.ts +1525 -364
- package/dist/index.js +3921 -986
- package/dist/index.mjs +1310 -70
- package/dist/{silero-vad-YLCXT5GQ.mjs → silero-vad-NSEXI4XS.mjs} +1 -1
- package/dist/{test-mode-Y7YG5LFZ.mjs → test-mode-WEKKNBLD.mjs} +1 -1
- package/package.json +1 -1
- package/src/dashboard/ui.html +13 -13
package/dist/index.d.ts
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import * as WebSocket from 'ws';
|
|
2
|
+
import WebSocket__default from 'ws';
|
|
1
3
|
import { EventEmitter } from 'events';
|
|
2
4
|
import { Request, Response, NextFunction, Express } from 'express';
|
|
3
5
|
|
|
@@ -104,6 +106,61 @@ declare class Realtime {
|
|
|
104
106
|
constructor(opts?: RealtimeOptions);
|
|
105
107
|
}
|
|
106
108
|
|
|
109
|
+
/**
|
|
110
|
+
* OpenAI Realtime 2 engine — marker class for Patter client dispatch.
|
|
111
|
+
*
|
|
112
|
+
* Wraps `gpt-realtime-2` (GA Realtime API). Separate marker from
|
|
113
|
+
* {@link Realtime} because the GA endpoint speaks a
|
|
114
|
+
* different `session.update` wire shape; the client dispatches to
|
|
115
|
+
* `OpenAIRealtime2Adapter` when this marker is passed.
|
|
116
|
+
*/
|
|
117
|
+
/** Constructor options for the OpenAI `Realtime2` engine marker. */
|
|
118
|
+
interface Realtime2Options {
|
|
119
|
+
/** API key. Falls back to OPENAI_API_KEY env var when omitted. */
|
|
120
|
+
apiKey?: string;
|
|
121
|
+
/** GA Realtime model. Defaults to `gpt-realtime-2`. */
|
|
122
|
+
model?: string;
|
|
123
|
+
/** Voice preset. Defaults to `alloy`. */
|
|
124
|
+
voice?: string;
|
|
125
|
+
/**
|
|
126
|
+
* Reasoning-effort tier. When omitted the field is not sent and the
|
|
127
|
+
* server default applies. OpenAI recommends `"low"` for production
|
|
128
|
+
* voice flows — higher tiers add measurable per-turn latency.
|
|
129
|
+
*/
|
|
130
|
+
reasoningEffort?: 'minimal' | 'low' | 'medium' | 'high';
|
|
131
|
+
/**
|
|
132
|
+
* Override for `audio.input.transcription.model`. Omit to keep the
|
|
133
|
+
* adapter default (`whisper-1`). Use `"gpt-realtime-whisper"` for
|
|
134
|
+
* low-latency transcript partials.
|
|
135
|
+
*/
|
|
136
|
+
inputAudioTranscriptionModel?: string;
|
|
137
|
+
}
|
|
138
|
+
/**
|
|
139
|
+
* OpenAI Realtime 2 engine marker — selects `gpt-realtime-2` on the GA
|
|
140
|
+
* Realtime API.
|
|
141
|
+
*
|
|
142
|
+
* @example
|
|
143
|
+
* ```ts
|
|
144
|
+
* import { Patter, Twilio, OpenAIRealtime2 } from "getpatter";
|
|
145
|
+
*
|
|
146
|
+
* const phone = new Patter({ carrier: new Twilio(), phoneNumber: "+1..." });
|
|
147
|
+
* const agent = phone.agent({
|
|
148
|
+
* engine: new OpenAIRealtime2({ reasoningEffort: "low" }),
|
|
149
|
+
* systemPrompt: "You are a friendly receptionist.",
|
|
150
|
+
* firstMessage: "Hello! How can I help?",
|
|
151
|
+
* });
|
|
152
|
+
* ```
|
|
153
|
+
*/
|
|
154
|
+
declare class Realtime2 {
|
|
155
|
+
readonly kind: "openai_realtime_2";
|
|
156
|
+
readonly apiKey: string;
|
|
157
|
+
readonly model: string;
|
|
158
|
+
readonly voice: string;
|
|
159
|
+
readonly reasoningEffort?: 'minimal' | 'low' | 'medium' | 'high';
|
|
160
|
+
readonly inputAudioTranscriptionModel?: string;
|
|
161
|
+
constructor(opts?: Realtime2Options);
|
|
162
|
+
}
|
|
163
|
+
|
|
107
164
|
/** ElevenLabs ConvAI engine — marker class for Patter client dispatch. */
|
|
108
165
|
/** Constructor options for the ElevenLabs `ConvAI` engine marker. */
|
|
109
166
|
interface ConvAIOptions {
|
|
@@ -273,71 +330,6 @@ declare class Tool implements ToolDefinition {
|
|
|
273
330
|
/** Factory helper mirroring Python's `tool(...)` function. */
|
|
274
331
|
declare function tool(opts: ToolOptions): Tool;
|
|
275
332
|
|
|
276
|
-
/**
|
|
277
|
-
* Shared STT / TTS adapter dispatch.
|
|
278
|
-
*
|
|
279
|
-
* In v0.5.0+ callers always pass pre-instantiated adapters (``agent.stt`` /
|
|
280
|
-
* ``agent.tts`` are ``STTAdapter`` / ``TTSAdapter`` instances), so these
|
|
281
|
-
* helpers are thin pass-throughs that return the instance or null. Kept as
|
|
282
|
-
* functions so the Twilio/Telnyx bridges have a single dispatch point.
|
|
283
|
-
*/
|
|
284
|
-
|
|
285
|
-
/** Per-word timings / metadata (Deepgram-shaped). Optional on every adapter. */
|
|
286
|
-
interface STTWord {
|
|
287
|
-
readonly word?: string;
|
|
288
|
-
readonly start?: number;
|
|
289
|
-
readonly end?: number;
|
|
290
|
-
readonly confidence?: number;
|
|
291
|
-
readonly punctuated_word?: string;
|
|
292
|
-
readonly speaker?: number;
|
|
293
|
-
}
|
|
294
|
-
/**
|
|
295
|
-
* Facade transcript shape — widened to surface richer provider fields
|
|
296
|
-
* (Deepgram emits all of them) without forcing adapters that only know
|
|
297
|
-
* ``text``/``isFinal`` to change. All non-text fields are optional.
|
|
298
|
-
*/
|
|
299
|
-
interface STTTranscript {
|
|
300
|
-
text: string;
|
|
301
|
-
isFinal?: boolean;
|
|
302
|
-
/** Overall transcript confidence in [0, 1]. */
|
|
303
|
-
confidence?: number;
|
|
304
|
-
/** Provider-side end-of-utterance hint (faster than ``isFinal``). */
|
|
305
|
-
speechFinal?: boolean;
|
|
306
|
-
/** True when the result was produced in response to a Finalize command. */
|
|
307
|
-
fromFinalize?: boolean;
|
|
308
|
-
/** Provider request id (Deepgram populates this from the Metadata frame). */
|
|
309
|
-
requestId?: string;
|
|
310
|
-
/** Per-word timings / metadata when the provider emits them. */
|
|
311
|
-
words?: ReadonlyArray<STTWord>;
|
|
312
|
-
/** Which provider event this transcript represents (e.g. ``Results``). */
|
|
313
|
-
eventType?: string;
|
|
314
|
-
}
|
|
315
|
-
/** Callback invoked by an `STTAdapter` for each (partial or final) transcript event. */
|
|
316
|
-
type STTTranscriptCallback = (t: STTTranscript) => Promise<void> | void;
|
|
317
|
-
/** Shape shared by every STT adapter in the SDK. */
|
|
318
|
-
interface STTAdapter {
|
|
319
|
-
connect(): Promise<void>;
|
|
320
|
-
sendAudio(pcm: Buffer): void | Promise<void>;
|
|
321
|
-
onTranscript(cb: STTTranscriptCallback): void;
|
|
322
|
-
close(): void | Promise<void>;
|
|
323
|
-
/**
|
|
324
|
-
* Optional: ask the provider to immediately finalise the in-flight
|
|
325
|
-
* utterance (rather than waiting for its own endpoint timer). Called by
|
|
326
|
-
* ``StreamHandler`` whenever the SDK's VAD signals ``speech_end``, and
|
|
327
|
-
* after a barge-in cancel — both moments where waiting for the
|
|
328
|
-
* provider's endpoint heuristic stalls the next turn.
|
|
329
|
-
*
|
|
330
|
-
* Implementations that do not support utterance-level finalisation
|
|
331
|
-
* (e.g. one-shot transcribers like Whisper) should omit this method
|
|
332
|
-
* entirely; the stream handler does an optional-chained call.
|
|
333
|
-
*/
|
|
334
|
-
finalize?(): void | Promise<void>;
|
|
335
|
-
}
|
|
336
|
-
/** Shape shared by every TTS adapter in the SDK. */
|
|
337
|
-
interface TTSAdapter {
|
|
338
|
-
synthesizeStream(text: string): AsyncIterable<Buffer>;
|
|
339
|
-
}
|
|
340
|
-
|
|
341
333
|
/**
|
|
342
334
|
* Pipeline hook executor for pipeline mode.
|
|
343
335
|
*
|
|
@@ -616,6 +608,22 @@ interface LLMStreamOptions {
|
|
|
616
608
|
}
|
|
617
609
|
interface LLMProvider {
|
|
618
610
|
stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
|
|
611
|
+
/**
|
|
612
|
+
* Optional best-effort pre-call DNS / TLS / HTTP-keepalive warmup.
|
|
613
|
+
*
|
|
614
|
+
* Called once per outbound call from ``Patter.call`` when the agent has
|
|
615
|
+
* ``prewarm: true`` (the default). Concrete providers (OpenAI,
|
|
616
|
+
* Anthropic, Google, Cerebras, Groq) override this to issue a
|
|
617
|
+
* lightweight HTTPS GET to their inference endpoint so by the time the
|
|
618
|
+
* first ``stream()`` call lands, the connection pool already has a
|
|
619
|
+
* warm socket. Failures are logged at debug level and never abort the
|
|
620
|
+
* call — pure latency optimisation.
|
|
621
|
+
*
|
|
622
|
+
* Optional on the interface (``warmup?: ...``) so providers without a
|
|
623
|
+
* warmup hook still satisfy the type. Detected via runtime
|
|
624
|
+
* ``typeof provider.warmup === 'function'`` in the client.
|
|
625
|
+
*/
|
|
626
|
+
warmup?(): Promise<void>;
|
|
619
627
|
}
|
|
620
628
|
/** Optional sampling kwargs forwarded into the OpenAI Chat Completions body. */
|
|
621
629
|
interface OpenAILLMSamplingOptions {
|
|
@@ -642,6 +650,8 @@ interface OpenAILLMSamplingOptions {
|
|
|
642
650
|
}
|
|
643
651
|
/** LLM provider backed by OpenAI Chat Completions (streaming). */
|
|
644
652
|
declare class OpenAILLMProvider implements LLMProvider {
|
|
653
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
654
|
+
static readonly providerKey = "openai";
|
|
645
655
|
private readonly apiKey;
|
|
646
656
|
readonly model: string;
|
|
647
657
|
private readonly temperature?;
|
|
@@ -655,6 +665,23 @@ declare class OpenAILLMProvider implements LLMProvider {
|
|
|
655
665
|
private readonly presencePenalty?;
|
|
656
666
|
private readonly stop?;
|
|
657
667
|
constructor(apiKey: string, model: string, sampling?: OpenAILLMSamplingOptions);
|
|
668
|
+
/** Subclasses (Cerebras, Groq) override this with their own host. */
|
|
669
|
+
protected get baseUrl(): string;
|
|
670
|
+
/**
|
|
671
|
+
* Pre-call DNS / TLS / HTTP-keepalive warmup.
|
|
672
|
+
*
|
|
673
|
+
* Issues a lightweight ``GET ${baseUrl}/models`` so DNS, TLS and HTTP/2
|
|
674
|
+
* are already up by the time the first ``chat.completions`` call lands.
|
|
675
|
+
* Best-effort: 5 s timeout, all exceptions swallowed at debug level.
|
|
676
|
+
*
|
|
677
|
+
* Note: an HTTPS GET warms DNS + TLS + connection pool but does NOT
|
|
678
|
+
* warm the inference path itself; for true inference warmup a real
|
|
679
|
+
* low-token request is needed, left as a follow-up. STT / TTS providers ship concrete
|
|
680
|
+
* WebSocket-based prewarms (Cartesia / Deepgram / AssemblyAI for STT;
|
|
681
|
+
* ElevenLabs WS for TTS) which save 200-500 ms each — those dominate
|
|
682
|
+
* the cold-start latency budget.
|
|
683
|
+
*/
|
|
684
|
+
warmup(): Promise<void>;
|
|
658
685
|
/** Stream OpenAI Chat Completions chunks for the given messages/tools. */
|
|
659
686
|
stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
|
|
660
687
|
}
|
|
@@ -669,6 +696,8 @@ declare class LLMLoop {
|
|
|
669
696
|
private eventBus?;
|
|
670
697
|
private readonly _providerName;
|
|
671
698
|
private readonly _modelName;
|
|
699
|
+
private _usageMissingCount;
|
|
700
|
+
private _loggedUsageFallback;
|
|
672
701
|
private onToolCall?;
|
|
673
702
|
constructor(apiKey: string, model: string, systemPrompt: string, tools?: ToolDefinition[] | null, llmProvider?: LLMProvider, disablePhonePreamble?: boolean);
|
|
674
703
|
/**
|
|
@@ -706,6 +735,87 @@ declare class LLMLoop {
|
|
|
706
735
|
private buildMessages;
|
|
707
736
|
}
|
|
708
737
|
|
|
738
|
+
/**
|
|
739
|
+
* Barge-in confirmation strategies.
|
|
740
|
+
*
|
|
741
|
+
* When a caller starts speaking while the agent's TTS is in flight, the SDK
|
|
742
|
+
* has to decide whether the speech is a real interruption or just a brief
|
|
743
|
+
* backchannel ("uh-huh", "okay") / room noise / cough. The default
|
|
744
|
+
* behaviour is to treat any VAD speech_start as a confirmed barge-in and
|
|
745
|
+
* cancel the agent immediately. That is fine for clean inputs but
|
|
746
|
+
* produces frequent false positives on PSTN: the agent gets cut
|
|
747
|
+
* mid-sentence by background chatter, breath, or filler words and never
|
|
748
|
+
* recovers the conversational thread.
|
|
749
|
+
*
|
|
750
|
+
* Each ``BargeInStrategy`` is consulted on every STT transcript while a
|
|
751
|
+
* barge-in is *pending* (VAD fired, but the agent has not yet been
|
|
752
|
+
* cancelled). The first strategy that returns ``true`` confirms the
|
|
753
|
+
* barge-in; if none do within the configured timeout the pending state
|
|
754
|
+
* is dropped and the agent resumes streaming TTS as if nothing happened.
|
|
755
|
+
* With an empty ``bargeInStrategies`` array the SDK falls back to the
|
|
756
|
+
* legacy "interrupt immediately on VAD" path, so adding strategies is
|
|
757
|
+
* a strict opt-in.
|
|
758
|
+
*/
|
|
759
|
+
interface EvaluateContext {
|
|
760
|
+
/** Latest STT output text (interim or final). */
|
|
761
|
+
readonly transcript: string;
|
|
762
|
+
/** ``true`` for interim partials, ``false`` for finals. */
|
|
763
|
+
readonly isInterim: boolean;
|
|
764
|
+
/** Whether the agent's TTS is currently in flight. */
|
|
765
|
+
readonly agentSpeaking: boolean;
|
|
766
|
+
}
|
|
767
|
+
/**
|
|
768
|
+
* Decides whether a pending barge-in should be confirmed.
|
|
769
|
+
*
|
|
770
|
+
* Implementations must be safe to call any number of times
|
|
771
|
+
* per turn. ``reset`` is invoked when the agent finishes speaking
|
|
772
|
+
* naturally and when a pending barge-in times out without
|
|
773
|
+
* confirmation.
|
|
774
|
+
*/
|
|
775
|
+
interface BargeInStrategy {
|
|
776
|
+
evaluate(ctx: EvaluateContext): Promise<boolean> | boolean;
|
|
777
|
+
reset?(): Promise<void> | void;
|
|
778
|
+
}
|
|
779
|
+
interface MinWordsStrategyOptions {
|
|
780
|
+
/**
|
|
781
|
+
* Minimum word count required while the agent is speaking. Reasonable
|
|
782
|
+
* values are 2-5; 3 is a good starting point for production phone
|
|
783
|
+
* agents. Must be ``>= 1``.
|
|
784
|
+
*/
|
|
785
|
+
readonly minWords: number;
|
|
786
|
+
/**
|
|
787
|
+
* When ``true`` (default), interim STT partials are evaluated as soon
|
|
788
|
+
* as they arrive. Set to ``false`` to wait for finals only — slower
|
|
789
|
+
* but free of partial-word noise on jittery STT providers.
|
|
790
|
+
*/
|
|
791
|
+
readonly useInterim?: boolean;
|
|
792
|
+
}
|
|
793
|
+
/**
|
|
794
|
+
* Confirm barge-in only after the caller has spoken ``minWords`` words.
|
|
795
|
+
*
|
|
796
|
+
* Filters short backchannels, single-word utterances, and stray
|
|
797
|
+
* transcription fragments that VAD picked up but were not real
|
|
798
|
+
* interruptions. While the agent is silent the strategy permits any
|
|
799
|
+
* speech to count (one word is enough), so the first user turn is not
|
|
800
|
+
* delayed.
|
|
801
|
+
*/
|
|
802
|
+
declare class MinWordsStrategy implements BargeInStrategy {
|
|
803
|
+
private readonly minWords;
|
|
804
|
+
private readonly useInterim;
|
|
805
|
+
constructor(options: MinWordsStrategyOptions);
|
|
806
|
+
evaluate(ctx: EvaluateContext): boolean;
|
|
807
|
+
reset(): Promise<void>;
|
|
808
|
+
}
|
|
809
|
+
/**
|
|
810
|
+
* Short-circuit-OR composition: first strategy that confirms wins.
|
|
811
|
+
* Returns ``false`` for an empty array so callers can use the empty
|
|
812
|
+
* default to mean "no opt-in confirmation, fall back to legacy
|
|
813
|
+
* interrupt-on-VAD".
|
|
814
|
+
*/
|
|
815
|
+
declare function evaluateStrategies(strategies: readonly BargeInStrategy[], ctx: EvaluateContext): Promise<boolean>;
|
|
816
|
+
/** Call ``reset()`` on every strategy, swallowing per-strategy errors. */
|
|
817
|
+
declare function resetStrategies(strategies: readonly BargeInStrategy[]): Promise<void>;
|
|
818
|
+
|
|
709
819
|
/**
|
|
710
820
|
* Public type definitions for the Patter SDK — agent options, pipeline hooks,
|
|
711
821
|
* provider config envelopes, and serve/call request/response shapes.
|
|
@@ -967,6 +1077,15 @@ interface VADEvent {
|
|
|
967
1077
|
interface VADProvider {
|
|
968
1078
|
processFrame(pcmChunk: Buffer, sampleRate: number): Promise<VADEvent | null>;
|
|
969
1079
|
close(): Promise<void>;
|
|
1080
|
+
/**
|
|
1081
|
+
* Optional: reset all per-utterance state so the next ``processFrame``
|
|
1082
|
+
* starts from a clean SILENCE state. Useful between agent turns to
|
|
1083
|
+
* prevent a "stuck SPEECH" condition where PSTN echo / loopback kept the
|
|
1084
|
+
* detector's internal probability above the deactivation threshold for
|
|
1085
|
+
* the full agent turn, leaving the VAD unable to emit ``speech_start``
|
|
1086
|
+
* on the next user utterance (one-shot barge-in bug).
|
|
1087
|
+
*/
|
|
1088
|
+
reset?(): Promise<void> | void;
|
|
970
1089
|
}
|
|
971
1090
|
/** Pre-STT audio filter — noise cancellation, gain, EQ. */
|
|
972
1091
|
interface AudioFilter {
|
|
@@ -1062,7 +1181,7 @@ interface AgentOptions {
|
|
|
1062
1181
|
* matching mode (``openai_realtime`` or ``elevenlabs_convai``). When absent,
|
|
1063
1182
|
* pipeline mode is selected if ``stt`` and ``tts`` are provided.
|
|
1064
1183
|
*/
|
|
1065
|
-
engine?: Realtime | ConvAI;
|
|
1184
|
+
engine?: Realtime | Realtime2 | ConvAI;
|
|
1066
1185
|
/**
|
|
1067
1186
|
* Provider mode. Normally derived from ``engine`` / ``stt`` + ``tts``. Pass
|
|
1068
1187
|
* ``'pipeline'`` explicitly when building a pipeline-mode agent without
|
|
@@ -1103,6 +1222,59 @@ interface AgentOptions {
|
|
|
1103
1222
|
* Default: 300.
|
|
1104
1223
|
*/
|
|
1105
1224
|
bargeInThresholdMs?: number;
|
|
1225
|
+
/**
|
|
1226
|
+
* Opt-in barge-in confirmation strategies (pipeline mode). With the
|
|
1227
|
+
* default empty array the SDK falls back to the legacy
|
|
1228
|
+
* "interrupt immediately on VAD speech_start" behaviour. When at
|
|
1229
|
+
* least one strategy is provided, a VAD speech_start during TTS
|
|
1230
|
+
* marks the barge-in as *pending* — the agent's TTS continues
|
|
1231
|
+
* streaming naturally and its in-flight LLM stream is preserved —
|
|
1232
|
+
* and the strategies are consulted on every STT transcript. The first strategy that
|
|
1233
|
+
* returns ``true`` confirms the barge-in (cancels TTS, flushes the
|
|
1234
|
+
* inbound ring buffer); if none confirm within
|
|
1235
|
+
* ``bargeInConfirmMs`` the pending state is dropped and TTS resumes.
|
|
1236
|
+
*
|
|
1237
|
+
* See ``getpatter`` exports ``BargeInStrategy`` /
|
|
1238
|
+
* ``MinWordsStrategy`` for the protocol and a reference
|
|
1239
|
+
* implementation.
|
|
1240
|
+
*/
|
|
1241
|
+
bargeInStrategies?: readonly BargeInStrategy[];
|
|
1242
|
+
/**
|
|
1243
|
+
* Maximum time (ms) to wait for at least one strategy to confirm a
|
|
1244
|
+
* pending barge-in before discarding the pending state and resuming
|
|
1245
|
+
* TTS. Only consulted when ``bargeInStrategies`` is non-empty.
|
|
1246
|
+
* Default: 1500.
|
|
1247
|
+
*/
|
|
1248
|
+
bargeInConfirmMs?: number;
|
|
1249
|
+
/**
|
|
1250
|
+
* When ``true`` (default), ``Patter.call`` warms up the STT, TTS, and
|
|
1251
|
+
* LLM provider connections in parallel with the carrier-side
|
|
1252
|
+
* ``initiateCall`` request so DNS, TLS, and HTTP/2 handshakes are
|
|
1253
|
+
* already complete by the time the callee answers. Adapters expose a
|
|
1254
|
+
* ``warmup()`` method returning ``Promise<void>`` (default no-op) —
|
|
1255
|
+
* providers can override it to open a persistent connection ahead
|
|
1256
|
+
* of the WebSocket bridge. Best-effort: warmup failures are logged
|
|
1257
|
+
* at debug level and never abort the call. Default: ``true``.
|
|
1258
|
+
*/
|
|
1259
|
+
prewarm?: boolean;
|
|
1260
|
+
/**
|
|
1261
|
+
* When ``true`` (default ``false``), ``Patter.call`` also pre-renders
|
|
1262
|
+
* ``firstMessage`` to TTS audio bytes during the ringing window and
|
|
1263
|
+
* streams the cached buffer immediately when the carrier emits
|
|
1264
|
+
* ``start``. Eliminates the 200-700 ms TTS first-byte latency on the
|
|
1265
|
+
* greeting at the cost of paying the TTS bill even if the call is
|
|
1266
|
+
* never answered (logged at warn level when the call
|
|
1267
|
+
* fails). Off by default to preserve the prior cost surface; opt-in
|
|
1268
|
+
* for production outbound where every millisecond of greeting
|
|
1269
|
+
* latency hurts conversion. Default: ``false``.
|
|
1270
|
+
*
|
|
1271
|
+
* **Pipeline mode only.** Realtime / ConvAI provider modes never
|
|
1272
|
+
* consume the prewarm cache (the StreamHandler for those modes runs
|
|
1273
|
+
* its first-message emit through the provider's own audio path), so
|
|
1274
|
+
* ``Patter.call`` refuses to spawn the prewarm task and emits a warn
|
|
1275
|
+
* when ``provider !== 'pipeline'``.
|
|
1276
|
+
*/
|
|
1277
|
+
prewarmFirstMessage?: boolean;
|
|
1106
1278
|
/**
|
|
1107
1279
|
* When true, the sentence chunker emits the first clause of each response
|
|
1108
1280
|
* on a soft punctuation boundary (",", em-dash, en-dash) once ~40 chars
|
|
@@ -1194,36 +1366,449 @@ interface LocalCallOptions {
|
|
|
1194
1366
|
to: string;
|
|
1195
1367
|
agent: AgentOptions;
|
|
1196
1368
|
/**
|
|
1197
|
-
* Enable answering-machine detection. **Defaults to ``true``** — the SDK
|
|
1198
|
-
* asks Twilio (``MachineDetection=DetectMessageEnd`` + Async AMD) or
|
|
1199
|
-
* Telnyx (``answering_machine_detection=greeting_end``) to classify
|
|
1200
|
-
* whoever picks up. Async AMD on Twilio adds ~0 answer-latency on human
|
|
1201
|
-
* pickups (the call connects immediately and the result arrives via
|
|
1202
|
-
* webhook 2-5 s later), so ON-by-default is safe. Pass ``false`` to
|
|
1203
|
-
* disable when you want to skip per-call AMD billing or you already
|
|
1204
|
-
* know the destination is a human.
|
|
1369
|
+
* Enable answering-machine detection. **Defaults to ``true``** — the SDK
|
|
1370
|
+
* asks Twilio (``MachineDetection=DetectMessageEnd`` + Async AMD) or
|
|
1371
|
+
* Telnyx (``answering_machine_detection=greeting_end``) to classify
|
|
1372
|
+
* whoever picks up. Async AMD on Twilio adds ~0 answer-latency on human
|
|
1373
|
+
* pickups (the call connects immediately and the result arrives via
|
|
1374
|
+
* webhook 2-5 s later), so ON-by-default is safe. Pass ``false`` to
|
|
1375
|
+
* disable when you want to skip per-call AMD billing or you already
|
|
1376
|
+
* know the destination is a human.
|
|
1377
|
+
*/
|
|
1378
|
+
machineDetection?: boolean;
|
|
1379
|
+
/**
|
|
1380
|
+
* Called once when the carrier finishes the AMD check. Fires for both
|
|
1381
|
+
* ``human`` and ``machine`` outcomes. Combine with ``voicemailMessage``
|
|
1382
|
+
* to get both the legacy voicemail-drop AND a result callback (the SDK
|
|
1383
|
+
* fires the callback after the drop is queued). Acceptance tests use
|
|
1384
|
+
* this to mark a run INVALID when ``classification !== 'human'``.
|
|
1385
|
+
*/
|
|
1386
|
+
onMachineDetection?: (result: MachineDetectionResult) => void | Promise<void>;
|
|
1387
|
+
/** If set, spoken as a voicemail message when AMD detects a machine. Implicitly enables ``machineDetection``. */
|
|
1388
|
+
voicemailMessage?: string;
|
|
1389
|
+
/** Dynamic variables merged into agent.variables before call. Override agent-level variables. */
|
|
1390
|
+
variables?: Record<string, string>;
|
|
1391
|
+
/**
|
|
1392
|
+
* Ring timeout in seconds. Forwarded to Twilio as `Timeout` and to Telnyx
|
|
1393
|
+
* as `timeout_secs`. Defaults to **25 s** — the production-recommended
|
|
1394
|
+
* value that limits phantom calls. Pass `60` for legacy carrier-default
|
|
1395
|
+
* parity, or `null` to omit the parameter entirely (carrier picks its
|
|
1396
|
+
* own default).
|
|
1397
|
+
*/
|
|
1398
|
+
ringTimeout?: number | null;
|
|
1399
|
+
}
|
|
1400
|
+
|
|
1401
|
+
/**
|
|
1402
|
+
* Shared STT / TTS adapter dispatch.
|
|
1403
|
+
*
|
|
1404
|
+
* In v0.5.0+ callers always pass pre-instantiated adapters (``agent.stt`` /
|
|
1405
|
+
* ``agent.tts`` are ``STTAdapter`` / ``TTSAdapter`` instances), so these
|
|
1406
|
+
* helpers are thin pass-throughs that return the instance or null. Kept as
|
|
1407
|
+
* functions so the Twilio/Telnyx bridges have a single dispatch point.
|
|
1408
|
+
*/
|
|
1409
|
+
|
|
1410
|
+
/** Per-word timings / metadata (Deepgram-shaped). Optional on every adapter. */
|
|
1411
|
+
interface STTWord {
|
|
1412
|
+
readonly word?: string;
|
|
1413
|
+
readonly start?: number;
|
|
1414
|
+
readonly end?: number;
|
|
1415
|
+
readonly confidence?: number;
|
|
1416
|
+
readonly punctuated_word?: string;
|
|
1417
|
+
readonly speaker?: number;
|
|
1418
|
+
}
|
|
1419
|
+
/**
|
|
1420
|
+
* Facade transcript shape — widened to surface richer provider fields
|
|
1421
|
+
* (Deepgram emits all of them) without forcing adapters that only know
|
|
1422
|
+
* ``text``/``isFinal`` to change. All non-text fields are optional.
|
|
1423
|
+
*/
|
|
1424
|
+
interface STTTranscript {
|
|
1425
|
+
text: string;
|
|
1426
|
+
isFinal?: boolean;
|
|
1427
|
+
/** Overall transcript confidence in [0, 1]. */
|
|
1428
|
+
confidence?: number;
|
|
1429
|
+
/** Provider-side end-of-utterance hint (faster than ``isFinal``). */
|
|
1430
|
+
speechFinal?: boolean;
|
|
1431
|
+
/** True when the result was produced in response to a Finalize command. */
|
|
1432
|
+
fromFinalize?: boolean;
|
|
1433
|
+
/** Provider request id (Deepgram populates this from the Metadata frame). */
|
|
1434
|
+
requestId?: string;
|
|
1435
|
+
/** Per-word timings / metadata when the provider emits them. */
|
|
1436
|
+
words?: ReadonlyArray<STTWord>;
|
|
1437
|
+
/** Which provider event this transcript represents (e.g. ``Results``). */
|
|
1438
|
+
eventType?: string;
|
|
1439
|
+
}
|
|
1440
|
+
/** Callback invoked by an `STTAdapter` for each (partial or final) transcript event. */
|
|
1441
|
+
type STTTranscriptCallback = (t: STTTranscript) => Promise<void> | void;
|
|
1442
|
+
/** Shape shared by every STT adapter in the SDK. */
|
|
1443
|
+
interface STTAdapter {
|
|
1444
|
+
connect(): Promise<void>;
|
|
1445
|
+
sendAudio(pcm: Buffer): void | Promise<void>;
|
|
1446
|
+
onTranscript(cb: STTTranscriptCallback): void;
|
|
1447
|
+
close(): void | Promise<void>;
|
|
1448
|
+
/**
|
|
1449
|
+
* Optional: ask the provider to immediately finalise the in-flight
|
|
1450
|
+
* utterance (rather than waiting for its own endpoint timer). Called by
|
|
1451
|
+
* ``StreamHandler`` whenever the SDK's VAD signals ``speech_end``, and
|
|
1452
|
+
* after a barge-in cancel — both moments where waiting for the
|
|
1453
|
+
* provider's endpoint heuristic stalls the next turn.
|
|
1454
|
+
*
|
|
1455
|
+
* Implementations that do not support utterance-level finalisation
|
|
1456
|
+
* (e.g. one-shot transcribers like Whisper) should omit this method
|
|
1457
|
+
* entirely; the stream handler does an optional-chained call.
|
|
1458
|
+
*/
|
|
1459
|
+
finalize?(): void | Promise<void>;
|
|
1460
|
+
/**
|
|
1461
|
+
* Optional best-effort pre-call DNS / TLS / HTTP-keepalive warmup.
|
|
1462
|
+
* Default behaviour is a no-op — providers that benefit (e.g.
|
|
1463
|
+
* those using WebSockets with a slow handshake) can override. Failures
|
|
1464
|
+
* must never abort the call.
|
|
1465
|
+
*/
|
|
1466
|
+
warmup?(): Promise<void>;
|
|
1467
|
+
}
|
|
1468
|
+
/** Shape shared by every TTS adapter in the SDK. */
|
|
1469
|
+
interface TTSAdapter {
|
|
1470
|
+
synthesizeStream(text: string): AsyncIterable<Buffer>;
|
|
1471
|
+
/**
|
|
1472
|
+
* Optional best-effort pre-call DNS / TLS / HTTP-keepalive warmup.
|
|
1473
|
+
* Default behaviour is a no-op. Failures must never abort the call.
|
|
1474
|
+
*/
|
|
1475
|
+
warmup?(): Promise<void>;
|
|
1476
|
+
}
|
|
1477
|
+
|
|
1478
|
+
/**
|
|
1479
|
+
* Known stable ElevenLabs voice models (from the official ElevenLabs API
|
|
1480
|
+
* reference). Exposed as a typed `as const` object so callers can pass
|
|
1481
|
+
* `ElevenLabsModel.FLASH_V2_5` and get autocomplete / static checking; the
|
|
1482
|
+
* public `modelId` option also accepts an arbitrary `string` so users can
|
|
1483
|
+
* pass forward-compat IDs we haven't enumerated yet.
|
|
1484
|
+
*
|
|
1485
|
+
* - `V3` — newest, highest quality (slower TTFT than Flash).
|
|
1486
|
+
* - `FLASH_V2_5` — current default, fastest (~75 ms TTFT).
|
|
1487
|
+
* - `TURBO_V2_5` — balanced quality/speed.
|
|
1488
|
+
* - `MULTILINGUAL_V2` — best multilingual support.
|
|
1489
|
+
* - `MONOLINGUAL_V1` — legacy English-only.
|
|
1490
|
+
*/
|
|
1491
|
+
declare const ElevenLabsModel: {
|
|
1492
|
+
readonly V3: "eleven_v3";
|
|
1493
|
+
readonly FLASH_V2_5: "eleven_flash_v2_5";
|
|
1494
|
+
readonly TURBO_V2_5: "eleven_turbo_v2_5";
|
|
1495
|
+
readonly MULTILINGUAL_V2: "eleven_multilingual_v2";
|
|
1496
|
+
readonly MONOLINGUAL_V1: "eleven_monolingual_v1";
|
|
1497
|
+
};
|
|
1498
|
+
/** Union of {@link ElevenLabsModel} string values. */
|
|
1499
|
+
type ElevenLabsModel = (typeof ElevenLabsModel)[keyof typeof ElevenLabsModel];
|
|
1500
|
+
declare const ElevenLabsOutputFormat: {
|
|
1501
|
+
readonly MP3_22050_32: "mp3_22050_32";
|
|
1502
|
+
readonly MP3_44100_32: "mp3_44100_32";
|
|
1503
|
+
readonly MP3_44100_64: "mp3_44100_64";
|
|
1504
|
+
readonly MP3_44100_96: "mp3_44100_96";
|
|
1505
|
+
readonly MP3_44100_128: "mp3_44100_128";
|
|
1506
|
+
readonly MP3_44100_192: "mp3_44100_192";
|
|
1507
|
+
readonly PCM_8000: "pcm_8000";
|
|
1508
|
+
readonly PCM_16000: "pcm_16000";
|
|
1509
|
+
readonly PCM_22050: "pcm_22050";
|
|
1510
|
+
readonly PCM_24000: "pcm_24000";
|
|
1511
|
+
readonly PCM_44100: "pcm_44100";
|
|
1512
|
+
readonly ULAW_8000: "ulaw_8000";
|
|
1513
|
+
};
|
|
1514
|
+
/** Union of {@link ElevenLabsOutputFormat} string values. */
|
|
1515
|
+
type ElevenLabsOutputFormat = (typeof ElevenLabsOutputFormat)[keyof typeof ElevenLabsOutputFormat];
|
|
1516
|
+
/** ElevenLabs voice tuning knobs forwarded as `voice_settings` in the request. */
|
|
1517
|
+
interface ElevenLabsVoiceSettings {
|
|
1518
|
+
stability?: number;
|
|
1519
|
+
similarity_boost?: number;
|
|
1520
|
+
style?: number;
|
|
1521
|
+
use_speaker_boost?: boolean;
|
|
1522
|
+
}
|
|
1523
|
+
/** Constructor options for {@link ElevenLabsTTS}. */
|
|
1524
|
+
interface ElevenLabsTTSOptions$1 {
|
|
1525
|
+
voiceId?: string;
|
|
1526
|
+
/**
|
|
1527
|
+
* ElevenLabs voice model ID. The default ``eleven_flash_v2_5`` has the
|
|
1528
|
+
* lowest TTFT (~75 ms). Pass ``eleven_v3`` for highest quality, or any
|
|
1529
|
+
* arbitrary string for forward-compat with future models.
|
|
1530
|
+
*/
|
|
1531
|
+
modelId?: ElevenLabsModel | string;
|
|
1532
|
+
outputFormat?: ElevenLabsOutputFormat;
|
|
1533
|
+
voiceSettings?: ElevenLabsVoiceSettings;
|
|
1534
|
+
languageCode?: string;
|
|
1535
|
+
chunkSize?: number;
|
|
1536
|
+
}
|
|
1537
|
+
/**
|
|
1538
|
+
* ElevenLabs streaming TTS adapter.
|
|
1539
|
+
*
|
|
1540
|
+
* Supported `modelId` values are autocompleted via {@link ElevenLabsModel}.
|
|
1541
|
+
* Default is `eleven_flash_v2_5` (lowest TTFT, ~75 ms).
|
|
1542
|
+
*
|
|
1543
|
+
* **Telephony optimization** — the constructor default
|
|
1544
|
+
* `outputFormat='pcm_16000'` is correct for web playback, dashboard
|
|
1545
|
+
* previews, and 16 kHz pipelines. For real phone calls, use the
|
|
1546
|
+
* carrier-specific factories instead:
|
|
1547
|
+
*
|
|
1548
|
+
* - {@link ElevenLabsTTS.forTwilio} emits `ulaw_8000` natively. Twilio's
|
|
1549
|
+
* media-stream WebSocket expects μ-law @ 8 kHz, so the SDK normally
|
|
1550
|
+
* resamples 16 kHz → 8 kHz and PCM → μ-law before sending. Asking
|
|
1551
|
+
* ElevenLabs to produce μ-law directly skips that step (saves
|
|
1552
|
+
* ~30–80 ms first-byte plus per-frame CPU and avoids any resampling
|
|
1553
|
+
* aliasing).
|
|
1554
|
+
* - {@link ElevenLabsTTS.forTelnyx} emits `pcm_16000`. Telnyx negotiates
|
|
1555
|
+
* L16/16000 on its bidirectional media WebSocket, so 16 kHz PCM is
|
|
1556
|
+
* already the format used end-to-end and no transcoding happens.
|
|
1557
|
+
* ElevenLabs *also* supports `ulaw_8000` if your Telnyx profile is
|
|
1558
|
+
* pinned to PCMU/8000 — pass `outputFormat: 'ulaw_8000'` explicitly
|
|
1559
|
+
* in that case.
|
|
1560
|
+
*/
|
|
1561
|
+
declare class ElevenLabsTTS {
|
|
1562
|
+
static readonly providerKey = "elevenlabs";
|
|
1563
|
+
private readonly apiKey;
|
|
1564
|
+
private readonly voiceId;
|
|
1565
|
+
private readonly modelId;
|
|
1566
|
+
private readonly outputFormat;
|
|
1567
|
+
private readonly voiceSettings;
|
|
1568
|
+
private readonly languageCode;
|
|
1569
|
+
private readonly chunkSize;
|
|
1570
|
+
constructor(apiKey: string, voiceId?: string, modelId?: string, outputFormat?: ElevenLabsOutputFormat | string);
|
|
1571
|
+
constructor(apiKey: string, options: ElevenLabsTTSOptions$1);
|
|
1572
|
+
/**
|
|
1573
|
+
* Construct an instance pre-configured for Twilio Media Streams.
|
|
1574
|
+
*
|
|
1575
|
+
* Sets `outputFormat='ulaw_8000'` so ElevenLabs emits μ-law @ 8 kHz
|
|
1576
|
+
* directly — the exact wire format Twilio's media stream uses — letting
|
|
1577
|
+
* the SDK skip the 16 kHz→8 kHz resample and PCM→μ-law conversion in
|
|
1578
|
+
* `TwilioAudioSender`. Saves ~30–80 ms first-byte and per-frame CPU,
|
|
1579
|
+
* and removes a potential aliasing source.
|
|
1580
|
+
*
|
|
1581
|
+
* `voiceSettings` defaults to a low-bandwidth-friendly profile
|
|
1582
|
+
* (speaker boost off, modest stability) which sounds cleaner at 8 kHz
|
|
1583
|
+
* μ-law than the studio default. Pass an explicit object to override.
|
|
1584
|
+
*/
|
|
1585
|
+
static forTwilio(apiKey: string, options?: Omit<ElevenLabsTTSOptions$1, 'outputFormat'>): ElevenLabsTTS;
|
|
1586
|
+
/**
|
|
1587
|
+
* Construct an instance pre-configured for Telnyx bidirectional media.
|
|
1588
|
+
*
|
|
1589
|
+
* Telnyx's default media-streaming codec is L16 PCM @ 16 kHz, which
|
|
1590
|
+
* matches our default Telnyx handler. We pick `pcm_16000` so the audio
|
|
1591
|
+
* flows end-to-end with zero resampling or transcoding.
|
|
1592
|
+
*
|
|
1593
|
+
* Trade-off: if your Telnyx profile is pinned to PCMU/8000 (μ-law),
|
|
1594
|
+
* construct `ElevenLabsTTS` directly with `outputFormat: 'ulaw_8000'`
|
|
1595
|
+
* — Telnyx supports that natively too.
|
|
1596
|
+
*/
|
|
1597
|
+
static forTelnyx(apiKey: string, options?: Omit<ElevenLabsTTSOptions$1, 'outputFormat'>): ElevenLabsTTS;
|
|
1598
|
+
/**
|
|
1599
|
+
* Synthesise text to speech and return the full audio as a single Buffer.
|
|
1600
|
+
*
|
|
1601
|
+
* For large chunks (or when latency matters) call `synthesizeStream` instead.
|
|
1602
|
+
*/
|
|
1603
|
+
synthesize(text: string): Promise<Buffer>;
|
|
1604
|
+
/**
|
|
1605
|
+
* Synthesise text and yield audio chunks as they arrive (streaming).
|
|
1606
|
+
*
|
|
1607
|
+
* The yielded buffers are raw PCM at 16 kHz (or whatever `outputFormat` is
|
|
1608
|
+
* configured to). `chunkSize` controls the maximum yield size — 512 is a
|
|
1609
|
+
* good choice for low-latency telephony.
|
|
1610
|
+
*/
|
|
1611
|
+
synthesizeStream(text: string): AsyncGenerator<Buffer>;
|
|
1612
|
+
}
|
|
1613
|
+
|
|
1614
|
+
/**
|
|
1615
|
+
* WebSocket-based ElevenLabs TTS provider — opt-in low-latency variant.
|
|
1616
|
+
*
|
|
1617
|
+
* Targets the ElevenLabs streaming-input WebSocket endpoint
|
|
1618
|
+
* (`/v1/text-to-speech/{voice_id}/stream-input`) instead of the HTTP
|
|
1619
|
+
* `/stream` endpoint used by `ElevenLabsTTS`. Saves the HTTP request setup
|
|
1620
|
+
* time per utterance (~50 ms) and avoids the HTTP cold-start TLS handshake
|
|
1621
|
+
* when calls are bursty.
|
|
1622
|
+
*
|
|
1623
|
+
* API matches `ElevenLabsTTS` (`synthesizeStream(text)` returns an
|
|
1624
|
+
* `AsyncGenerator<Buffer>`) so it can be passed anywhere a TTSAdapter is
|
|
1625
|
+
* expected.
|
|
1626
|
+
*
|
|
1627
|
+
* Behaviour notes
|
|
1628
|
+
* - WebSocket is opened **per-utterance** (matches HTTP semantics). A
|
|
1629
|
+
* future revision may pool a WS across utterances of the same call
|
|
1630
|
+
* session — see roadmap Phase 5b.
|
|
1631
|
+
* - `auto_mode=true` is enabled by default. Pass `autoMode: false` to
|
|
1632
|
+
* send a custom `chunk_length_schedule`.
|
|
1633
|
+
* - `outputFormat` is exposed as a query parameter so `ulaw_8000` (Twilio
|
|
1634
|
+
* native) and `pcm_16000` (Telnyx native) work without resampling.
|
|
1635
|
+
* - `eleven_v3` is **not** supported — the WS endpoint rejects it.
|
|
1636
|
+
* - `optimize_streaming_latency` is officially deprecated and is not
|
|
1637
|
+
* exposed.
|
|
1638
|
+
*/
|
|
1639
|
+
|
|
1640
|
+
/** Constructor options for {@link ElevenLabsWebSocketTTS}. */
|
|
1641
|
+
interface ElevenLabsWebSocketTTSOptions {
|
|
1642
|
+
apiKey: string;
|
|
1643
|
+
voiceId?: string;
|
|
1644
|
+
modelId?: ElevenLabsModel | string;
|
|
1645
|
+
outputFormat?: string;
|
|
1646
|
+
voiceSettings?: Record<string, unknown>;
|
|
1647
|
+
languageCode?: string;
|
|
1648
|
+
/** Let the server pick chunk timing. Default true. */
|
|
1649
|
+
autoMode?: boolean;
|
|
1650
|
+
/** WS keep-alive timeout in seconds (5–180). Default 60. */
|
|
1651
|
+
inactivityTimeout?: number;
|
|
1652
|
+
/**
|
|
1653
|
+
* Manual chunk schedule, only used when ``autoMode: false``. Each value
|
|
1654
|
+
* must be 5–500. ElevenLabs default is ``[120, 160, 250, 290]``.
|
|
1655
|
+
*/
|
|
1656
|
+
chunkLengthSchedule?: number[];
|
|
1657
|
+
/** Outgoing audio re-chunk size in bytes. Default 4096. */
|
|
1658
|
+
chunkSize?: number;
|
|
1659
|
+
}
|
|
1660
|
+
/**
|
|
1661
|
+
* Parked WS handle returned by {@link ElevenLabsWebSocketTTS.openParkedConnection}.
|
|
1662
|
+
*
|
|
1663
|
+
* `bosSent` records whether the BOS frame (`{"text": " ", ...}`) has
|
|
1664
|
+
* already been written to the wire. The prewarm pipeline always sends
|
|
1665
|
+
* the BOS so the upstream worker is selected on the parked connection;
|
|
1666
|
+
* `synthesizeStream` adopts the WS and SKIPS its own BOS send to avoid
|
|
1667
|
+
* a protocol error.
|
|
1668
|
+
*/
|
|
1669
|
+
interface ElevenLabsParkedWS {
|
|
1670
|
+
ws: WebSocket__default;
|
|
1671
|
+
bosSent: boolean;
|
|
1672
|
+
}
|
|
1673
|
+
/** WebSocket-based ElevenLabs TTS adapter — opt-in low-latency variant. */
|
|
1674
|
+
declare class ElevenLabsWebSocketTTS implements TTSAdapter {
|
|
1675
|
+
static readonly providerKey = "elevenlabs_ws";
|
|
1676
|
+
readonly apiKey: string;
|
|
1677
|
+
readonly voiceId: string;
|
|
1678
|
+
readonly modelId: string;
|
|
1679
|
+
readonly voiceSettings?: Record<string, unknown>;
|
|
1680
|
+
readonly languageCode?: string;
|
|
1681
|
+
readonly autoMode: boolean;
|
|
1682
|
+
readonly inactivityTimeout: number;
|
|
1683
|
+
readonly chunkLengthSchedule?: number[];
|
|
1684
|
+
readonly chunkSize: number;
|
|
1685
|
+
/**
|
|
1686
|
+
* Single-slot adoption queue. The prewarm pipeline parks one WS per
|
|
1687
|
+
* outbound call here; the next `synthesizeStream` call consumes it
|
|
1688
|
+
* (skipping `new WebSocket()` and the BOS send) instead of opening
|
|
1689
|
+
* a fresh socket. The slot is consumed exactly once: if a second
|
|
1690
|
+
* `synthesizeStream` starts before the first completes, only the first benefits.
|
|
1691
|
+
*
|
|
1692
|
+
* We keep this on the adapter (not in a parameter) so the existing
|
|
1693
|
+
* `for await (const chunk of agent.tts.synthesizeStream(...))` call
|
|
1694
|
+
* site in `StreamHandler` continues to work without signature
|
|
1695
|
+
* changes.
|
|
1696
|
+
*/
|
|
1697
|
+
private adoptedConnection;
|
|
1698
|
+
/**
|
|
1699
|
+
* The wire format requested over the ElevenLabs WS. Initially set from
|
|
1700
|
+
* the constructor; ``setTelephonyCarrier`` may auto-flip it to the
|
|
1701
|
+
* carrier's native codec when the caller did NOT pass ``outputFormat``
|
|
1702
|
+
* explicitly.
|
|
1703
|
+
*/
|
|
1704
|
+
private _outputFormat;
|
|
1705
|
+
private readonly _outputFormatExplicit;
|
|
1706
|
+
/** Public read-only view of the (possibly auto-flipped) wire format. */
|
|
1707
|
+
get outputFormat(): string;
|
|
1708
|
+
constructor(opts: ElevenLabsWebSocketTTSOptions);
|
|
1709
|
+
/**
|
|
1710
|
+
* Hook called by ``StreamHandler`` to advise the carrier wire format.
|
|
1711
|
+
*
|
|
1712
|
+
* When the user did NOT pass an explicit ``outputFormat`` in the
|
|
1713
|
+
* constructor options, this flips the format to the carrier's native
|
|
1714
|
+
* wire codec — saving a client-side transcode step. Calling with an
|
|
1715
|
+
* unknown carrier (``""`` / ``"custom"``) is a no-op.
|
|
1716
|
+
*
|
|
1717
|
+
* When ``outputFormat`` was explicitly passed (incl. via the
|
|
1718
|
+
* ``forTwilio`` / ``forTelnyx`` factories), this method is a no-op —
|
|
1719
|
+
* the user's choice always wins.
|
|
1720
|
+
*/
|
|
1721
|
+
setTelephonyCarrier(carrier: string): void;
|
|
1722
|
+
/** Pre-configured for Twilio Media Streams (`ulaw_8000`). */
|
|
1723
|
+
static forTwilio(opts: Omit<ElevenLabsWebSocketTTSOptions, 'outputFormat'>): ElevenLabsWebSocketTTS;
|
|
1724
|
+
/** Pre-configured for Telnyx (`pcm_16000`). */
|
|
1725
|
+
static forTelnyx(opts: Omit<ElevenLabsWebSocketTTSOptions, 'outputFormat'>): ElevenLabsWebSocketTTS;
|
|
1726
|
+
private buildUrl;
|
|
1727
|
+
/**
|
|
1728
|
+
* Build the protocol-required BOS frame sent on every fresh WS.
|
|
1729
|
+
*
|
|
1730
|
+
* The single-space `{"text": " "}` keep-alive establishes the session
|
|
1731
|
+
* without committing any synthesis (no `flush: true`, no real text).
|
|
1732
|
+
* Production `synthesizeStream()` and `warmup()` share this exact
|
|
1733
|
+
* construction so the upstream worker chooses the same per-session
|
|
1734
|
+
* config in both cases — otherwise the warm session is on a different
|
|
1735
|
+
* worker than the live request, which defeats the warmup goal.
|
|
1736
|
+
*/
|
|
1737
|
+
private buildBosFrame;
|
|
1738
|
+
/**
|
|
1739
|
+
* Single-shot synthesis: open WS, send text, yield bytes, close.
|
|
1740
|
+
*
|
|
1741
|
+
* Resilience contract:
|
|
1742
|
+
* - Connection bounded by ``CONNECT_TIMEOUT_MS`` (5s, was 15s).
|
|
1743
|
+
* - Each idle wait bounded by ``FRAME_TIMEOUT_MS`` (30s) so a stalled
|
|
1744
|
+
* server cannot keep the generator alive indefinitely.
|
|
1745
|
+
* - Permanent error handler attached BEFORE the open await — prevents
|
|
1746
|
+
* ``uncaughtException`` if an error fires after the once-listener
|
|
1747
|
+
* resolves.
|
|
1748
|
+
* - All event listeners removed in ``finally`` (no closure leak past
|
|
1749
|
+
* socket close).
|
|
1750
|
+
* - Server-reported ``error`` raises ``ElevenLabsTTSError``.
|
|
1751
|
+
* - Per-frame audio payload capped at ``MAX_AUDIO_B64_BYTES``.
|
|
1752
|
+
* - Best-effort EOS ``{"text":""}`` sent in finally (not immediately
|
|
1753
|
+
* after flush — auto_mode could otherwise truncate the tail audio).
|
|
1754
|
+
*/
|
|
1755
|
+
synthesizeStream(text: string): AsyncGenerator<Buffer>;
|
|
1756
|
+
/**
|
|
1757
|
+
* Pre-call WebSocket warmup for the ElevenLabs `/stream-input` endpoint.
|
|
1758
|
+
*
|
|
1759
|
+
* Opens the WS (DNS + TLS + auth handshake), sends the EXACT same BOS
|
|
1760
|
+
* frame the production `synthesizeStream()` path sends — including
|
|
1761
|
+
* `voice_settings` and (when configured) `generation_config` — so
|
|
1762
|
+
* ElevenLabs instantiates the same per-session worker for both
|
|
1763
|
+
* warmup and the live request. If the BOS frames differ, the server
|
|
1764
|
+
* may route warmup and the real call to two different workers, and
|
|
1765
|
+
* the warmed worker is wasted. Idles ~250 ms, then closes. By the
|
|
1766
|
+
* time the first `synthesizeStream()` call lands during the call,
|
|
1767
|
+
* the connection pool has the upstream warm — net wire time saving
|
|
1768
|
+
* of 200-500 ms.
|
|
1769
|
+
*
|
|
1770
|
+
* Billing safety: ElevenLabs bills on synthesised characters
|
|
1771
|
+
* delivered via `audio` frames (per https://elevenlabs.io/pricing).
|
|
1772
|
+
* The keepalive (single-space `text`, no `flush: true`, no real
|
|
1773
|
+
* transcript) is documented as the session-establishment frame and
|
|
1774
|
+
* does NOT generate synthesis. Closing without sending the actual
|
|
1775
|
+
* transcript does not consume billable characters. Best-effort:
|
|
1776
|
+
* failures logged at debug level.
|
|
1777
|
+
*/
|
|
1778
|
+
warmup(): Promise<void>;
|
|
1779
|
+
/**
|
|
1780
|
+
* Open a fresh WS, send the EXACT BOS frame the live `synthesizeStream`
|
|
1781
|
+
* sends, and return the OPEN socket without closing it. Used by the
|
|
1782
|
+
* prewarm pipeline to park a TTS connection during the carrier ringing
|
|
1783
|
+
* window so the next `synthesizeStream` call can adopt it via
|
|
1784
|
+
* {@link adoptWebSocket} and skip ~400-900 ms of TLS + BOS round-trip.
|
|
1785
|
+
*
|
|
1786
|
+
* Returns a parked-handle the caller stashes; the next
|
|
1787
|
+
* `synthesizeStream` will detect the adoption queue and skip its own
|
|
1788
|
+
* `new WebSocket()` + BOS send.
|
|
1789
|
+
*
|
|
1790
|
+
* Billing safety: BOS is the documented session-establishment frame
|
|
1791
|
+
* (single space `text`, no `flush: true`) and does not generate
|
|
1792
|
+
* synthesis. ElevenLabs bills on `audio` frames received from the
|
|
1793
|
+
* server, not on BOS bytes sent by the client.
|
|
1205
1794
|
*/
|
|
1206
|
-
|
|
1795
|
+
openParkedConnection(): Promise<ElevenLabsParkedWS>;
|
|
1207
1796
|
/**
|
|
1208
|
-
*
|
|
1209
|
-
*
|
|
1210
|
-
*
|
|
1211
|
-
*
|
|
1212
|
-
*
|
|
1797
|
+
* Stash a parked WS handle so the next `synthesizeStream` call adopts
|
|
1798
|
+
* it instead of opening a fresh socket. Caller is responsible for
|
|
1799
|
+
* holding the handle alive until either the live request consumes it
|
|
1800
|
+
* or the call ends (in which case `discardAdoptedConnection()`
|
|
1801
|
+
* cleans it up).
|
|
1213
1802
|
*/
|
|
1214
|
-
|
|
1215
|
-
/** If set, spoken as a voicemail message when AMD detects a machine. Implicitly enables ``machineDetection``. */
|
|
1216
|
-
voicemailMessage?: string;
|
|
1217
|
-
/** Dynamic variables merged into agent.variables before call. Override agent-level variables. */
|
|
1218
|
-
variables?: Record<string, string>;
|
|
1803
|
+
adoptWebSocket(parked: ElevenLabsParkedWS): void;
|
|
1219
1804
|
/**
|
|
1220
|
-
*
|
|
1221
|
-
*
|
|
1222
|
-
*
|
|
1223
|
-
* parity, or `null` to omit the parameter entirely (carrier picks its
|
|
1224
|
-
* own default).
|
|
1805
|
+
* Drop and close any pending parked WS without consuming it. Used on
|
|
1806
|
+
* call-failure paths so a never-started call does not leak a TTS WS
|
|
1807
|
+
* that would otherwise linger until ElevenLabs' inactivity timeout closes it.
|
|
1225
1808
|
*/
|
|
1226
|
-
|
|
1809
|
+
discardAdoptedConnection(): void;
|
|
1810
|
+
/** No-op — connections are per-utterance and torn down inside synthesizeStream. */
|
|
1811
|
+
close(): Promise<void>;
|
|
1227
1812
|
}
|
|
1228
1813
|
|
|
1229
1814
|
/**
|
|
@@ -1272,6 +1857,19 @@ declare class MetricsStore extends EventEmitter {
|
|
|
1272
1857
|
private readonly maxCalls;
|
|
1273
1858
|
private calls;
|
|
1274
1859
|
private activeCalls;
|
|
1860
|
+
/**
|
|
1861
|
+
* User-driven soft delete: call_ids the operator removed from the
|
|
1862
|
+
* dashboard view. The on-disk artefacts written by ``CallLogger``
|
|
1863
|
+
* (``metadata.json``, ``transcript.jsonl``) are intentionally NOT
|
|
1864
|
+
* touched — they serve as the durable backup. All read paths
|
|
1865
|
+
* (``getCalls`` / ``getCall`` / ``getAggregates`` / ``getCallsInRange``
|
|
1866
|
+
* / ``hydrate``) filter against this set so the call is invisible
|
|
1867
|
+
* to the UI and excluded from rolling metrics. Populated from
|
|
1868
|
+
* ``<logRoot>/.deleted_call_ids.json`` on hydrate so deletions
|
|
1869
|
+
* survive a process restart. Parity with Python.
|
|
1870
|
+
*/
|
|
1871
|
+
private deletedCallIds;
|
|
1872
|
+
private deletedIdsPath;
|
|
1275
1873
|
/**
|
|
1276
1874
|
* Accepts either a numeric ``maxCalls`` (legacy positional — matches the
|
|
1277
1875
|
* original TS API) or an options object ``{ maxCalls }`` to align with the
|
|
@@ -1300,19 +1898,66 @@ declare class MetricsStore extends EventEmitter {
|
|
|
1300
1898
|
recordTurn(data: Record<string, unknown>): void;
|
|
1301
1899
|
/** Move a call from active to completed and persist its final metrics. */
|
|
1302
1900
|
recordCallEnd(data: Record<string, unknown>, metrics?: Record<string, unknown> | null): void;
|
|
1303
|
-
/**
|
|
1901
|
+
/**
|
|
1902
|
+
* Return a window of completed calls in newest-first order.
|
|
1903
|
+
*
|
|
1904
|
+
* Soft-deleted call_ids (see ``deleteCalls``) are filtered out so the
|
|
1905
|
+
* dashboard never re-shows a row the user removed. The on-disk
|
|
1906
|
+
* artefacts are intentionally preserved as a backup.
|
|
1907
|
+
*/
|
|
1304
1908
|
getCalls(limit?: number, offset?: number): CallRecord[];
|
|
1305
|
-
/**
|
|
1909
|
+
/**
|
|
1910
|
+
* Look up a completed call by id (newest match wins).
|
|
1911
|
+
*
|
|
1912
|
+
* Soft-deleted call_ids resolve to ``null`` so the SPA's detail pane
|
|
1913
|
+
* cannot render a row the user removed.
|
|
1914
|
+
*/
|
|
1306
1915
|
getCall(callId: string): CallRecord | null;
|
|
1916
|
+
/**
|
|
1917
|
+
* Soft-delete one or more calls from the dashboard view.
|
|
1918
|
+
*
|
|
1919
|
+
* Adds each ``call_id`` to an in-memory set. Subsequent reads via
|
|
1920
|
+
* ``getCalls`` / ``getCall`` / ``getAggregates`` / ``getCallsInRange``
|
|
1921
|
+
* exclude the deleted ids, so rolling metrics (avg latency, total
|
|
1922
|
+
* spend) are recomputed without them. The on-disk
|
|
1923
|
+
* ``metadata.json`` / ``transcript.jsonl`` files written by
|
|
1924
|
+
* ``CallLogger`` are NOT touched — they serve as a durable backup
|
|
1925
|
+
* the operator can audit outside the dashboard.
|
|
1926
|
+
*
|
|
1927
|
+
* Active calls are never deletable. A call_id that is currently
|
|
1928
|
+
* in ``activeCalls`` is silently skipped so a mid-call delete
|
|
1929
|
+
* from the UI cannot orphan the live transcript pane.
|
|
1930
|
+
*
|
|
1931
|
+
* Persisted to ``<logRoot>/.deleted_call_ids.json`` (best-effort)
|
|
1932
|
+
* when ``hydrate()`` has been called with a log root. Parity with
|
|
1933
|
+
* Python ``delete_calls``.
|
|
1934
|
+
*
|
|
1935
|
+
* @returns The list of call_ids actually accepted as deleted.
|
|
1936
|
+
*/
|
|
1937
|
+
deleteCalls(callIds: readonly string[]): string[];
|
|
1938
|
+
/** Whether ``callId`` was soft-deleted from the dashboard. */
|
|
1939
|
+
isDeleted(callId: string): boolean;
|
|
1940
|
+
/** Snapshot of soft-deleted call_ids (sorted). */
|
|
1941
|
+
getDeletedCallIds(): string[];
|
|
1942
|
+
/** Atomically persist the deleted-ids set to disk. Best-effort. */
|
|
1943
|
+
private persistDeletedIds;
|
|
1307
1944
|
/** Look up an active call by id (returns undefined if not active or unknown). */
|
|
1308
1945
|
getActive(callId: string): CallRecord | undefined;
|
|
1309
1946
|
/** Return all currently active (not yet ended) calls. */
|
|
1310
1947
|
getActiveCalls(): CallRecord[];
|
|
1311
|
-
/**
|
|
1948
|
+
/**
|
|
1949
|
+
* Compute summary statistics across the buffered call history.
|
|
1950
|
+
*
|
|
1951
|
+
* Soft-deleted calls are excluded so rolling metrics (avg latency,
|
|
1952
|
+
* total spend) match exactly what the operator sees in the call list.
|
|
1953
|
+
*/
|
|
1312
1954
|
getAggregates(): Record<string, unknown>;
|
|
1313
|
-
/**
|
|
1955
|
+
/**
|
|
1956
|
+
* Return calls whose `started_at` falls within `[fromTs, toTs]` (Unix
|
|
1957
|
+
* seconds). Soft-deleted calls are filtered out.
|
|
1958
|
+
*/
|
|
1314
1959
|
getCallsInRange(fromTs?: number, toTs?: number): CallRecord[];
|
|
1315
|
-
/** Number of completed calls currently in the ring buffer. */
|
|
1960
|
+
/** Number of completed (non-deleted) calls currently in the ring buffer. */
|
|
1316
1961
|
get callCount(): number;
|
|
1317
1962
|
/**
|
|
1318
1963
|
* Rebuild the in-memory call list from `metadata.json` files written by
|
|
@@ -1455,6 +2100,19 @@ declare class SpeechEvents {
|
|
|
1455
2100
|
private dispatch;
|
|
1456
2101
|
}
|
|
1457
2102
|
|
|
2103
|
+
/** Parked provider WebSockets ready for adoption by a per-call StreamHandler. */
|
|
2104
|
+
interface ParkedProviderConnections {
|
|
2105
|
+
/** Pre-opened STT WS (Cartesia today; other adapters may add support later). */
|
|
2106
|
+
stt?: WebSocket.WebSocket;
|
|
2107
|
+
/**
|
|
2108
|
+
* Pre-opened TTS WS handle (ElevenLabs WS today). The `bosSent` flag
|
|
2109
|
+
* lets the live `synthesizeStream` skip its own BOS send when the
|
|
2110
|
+
* prewarm pipeline already wrote it.
|
|
2111
|
+
*/
|
|
2112
|
+
tts?: ElevenLabsParkedWS;
|
|
2113
|
+
/** Pre-opened OpenAI Realtime WS (already through `session.updated`). */
|
|
2114
|
+
openaiRealtime?: WebSocket.WebSocket;
|
|
2115
|
+
}
|
|
1458
2116
|
/** Top-level SDK entry point — wraps a carrier + embedded server + agent loop. */
|
|
1459
2117
|
declare class Patter {
|
|
1460
2118
|
private localConfig;
|
|
@@ -1476,6 +2134,65 @@ declare class Patter {
|
|
|
1476
2134
|
* ``Cannot use both tunnel: true and webhookUrl``.
|
|
1477
2135
|
*/
|
|
1478
2136
|
private tunnelOwnsWebhookUrl;
|
|
2137
|
+
/**
|
|
2138
|
+
* Pre-rendered first-message TTS audio per outbound call_id. Populated
|
|
2139
|
+
* by {@link call} when ``agent.prewarmFirstMessage`` is true; consumed
|
|
2140
|
+
* by the StreamHandler firstMessage emit so the greeting streams
|
|
2141
|
+
* instantly on ``start`` instead of paying the 200-700 ms TTS first-byte
|
|
2142
|
+
* latency. See ``AgentOptions.prewarmFirstMessage``.
|
|
2143
|
+
*
|
|
2144
|
+
* Stores raw bytes in the TTS provider's native sample rate; the
|
|
2145
|
+
* carrier-side audio sender resamples on emit.
|
|
2146
|
+
*/
|
|
2147
|
+
private prewarmAudio;
|
|
2148
|
+
/**
|
|
2149
|
+
* Call IDs whose prewarm cache slot has already been consumed —
|
|
2150
|
+
* either by ``popPrewarmAudio`` (cache hit OR miss on the firstMessage
|
|
2151
|
+
* emit path) or by ``recordPrewarmWaste`` (call ended before pickup).
|
|
2152
|
+
* The prewarm task checks this set BEFORE writing bytes so a slow
|
|
2153
|
+
* synth that finishes after the consumer already polled doesn't
|
|
2154
|
+
* orphan bytes in ``prewarmAudio``. See FIX #92 in the parity audit.
|
|
2155
|
+
*/
|
|
2156
|
+
private prewarmConsumed;
|
|
2157
|
+
/**
|
|
2158
|
+
* Background tasks tracked so {@link disconnect} can wait on / drop any
|
|
2159
|
+
* still-running prewarm-first-message synth before tearing down.
|
|
2160
|
+
*/
|
|
2161
|
+
private prewarmTasks;
|
|
2162
|
+
/**
|
|
2163
|
+
* TTL eviction timers keyed by call_id so {@link disconnect} (and
|
|
2164
|
+
* normal consumption / waste-record paths) can cancel any pending
|
|
2165
|
+
* timer when the slot drains naturally. Without this, the timer
|
|
2166
|
+
* would WARN spuriously after the cache was already emptied.
|
|
2167
|
+
*/
|
|
2168
|
+
private prewarmTtlTimers;
|
|
2169
|
+
/**
|
|
2170
|
+
* Pre-opened, fully-handshaked provider WebSockets keyed by
|
|
2171
|
+
* carrier-issued call_id. Populated by ``parkProviderConnections``
|
|
2172
|
+
* during the carrier ringing window; consumed by the per-call
|
|
2173
|
+
* StreamHandler at ``start`` via ``adoptWebSocket(...)`` so STT / TTS
|
|
2174
|
+
* / Realtime audio can flow on the first turn without paying the
|
|
2175
|
+
* 150-900 ms TLS + WS-upgrade + protocol-handshake round-trip again.
|
|
2176
|
+
*
|
|
2177
|
+
* Distinct from ``prewarmAudio`` (which holds pre-rendered TTS bytes
|
|
2178
|
+
* for the first message); the two features are complementary and
|
|
2179
|
+
* orthogonal — both can be active for the same call.
|
|
2180
|
+
*
|
|
2181
|
+
* Each slot may hold up to three parked connections (STT, TTS,
|
|
2182
|
+
* Realtime). Drained by:
|
|
2183
|
+
* - {@link popPrewarmedConnections} on the carrier ``start`` event
|
|
2184
|
+
* (consumed normally — the handles transfer to the StreamHandler)
|
|
2185
|
+
* - {@link recordPrewarmWaste} on call-termination paths (no-answer,
|
|
2186
|
+
* busy, failed, canceled, AMD voicemail). Closes parked sockets.
|
|
2187
|
+
* - {@link disconnect} on Patter teardown. Closes all parked sockets.
|
|
2188
|
+
*/
|
|
2189
|
+
private prewarmedConnections;
|
|
2190
|
+
/**
|
|
2191
|
+
* TTL eviction handles keyed by call_id for connections that are never
|
|
2192
|
+
* adopted (e.g. a carrier that swallows ``start``). Closes the parked
|
|
2193
|
+
* sockets so they don't leak past the safety window.
|
|
2194
|
+
*/
|
|
2195
|
+
private prewarmedConnTimers;
|
|
1479
2196
|
/**
|
|
1480
2197
|
* Speech-edge events for turn-taking instrumentation. Public surface: the
|
|
1481
2198
|
* seven `on*` proxy accessors below plus the `conversationState` snapshot.
|
|
@@ -1483,7 +2200,7 @@ declare class Patter {
|
|
|
1483
2200
|
* the previous behaviour.
|
|
1484
2201
|
*
|
|
1485
2202
|
* See `src/_speech-events.ts` for the full event taxonomy and the
|
|
1486
|
-
*
|
|
2203
|
+
* OpenAI Realtime alignment table.
|
|
1487
2204
|
*/
|
|
1488
2205
|
readonly speechEvents: SpeechEvents;
|
|
1489
2206
|
get onUserSpeechStarted(): SpeechEventCallback | null;
|
|
@@ -1502,8 +2219,8 @@ declare class Patter {
|
|
|
1502
2219
|
set onAudioOut(cb: SpeechEventCallback | null);
|
|
1503
2220
|
/**
|
|
1504
2221
|
* Snapshot of the current per-side state of the call.
|
|
1505
|
-
*
|
|
1506
|
-
*
|
|
2222
|
+
* Returns the user_state / agent_state payload shape — read-only and
|
|
2223
|
+
* safe to call at any time.
|
|
1507
2224
|
*/
|
|
1508
2225
|
get conversationState(): ConversationStateSnapshot;
|
|
1509
2226
|
/**
|
|
@@ -1553,12 +2270,115 @@ declare class Patter {
|
|
|
1553
2270
|
private _serveImpl;
|
|
1554
2271
|
/** Run the agent in interactive terminal-test mode (no real telephony). */
|
|
1555
2272
|
test(opts: ServeOptions): Promise<void>;
|
|
2273
|
+
/**
|
|
2274
|
+
* Pop and return the pre-synthesised first-message audio for ``callId``.
|
|
2275
|
+
*
|
|
2276
|
+
* Returns ``undefined`` when ``agent.prewarmFirstMessage`` was not set
|
|
2277
|
+
* for the originating outbound call, or when the synth was still in
|
|
2278
|
+
* flight at the moment the carrier emitted ``start`` (cache miss — the
|
|
2279
|
+
* StreamHandler falls back to live TTS).
|
|
2280
|
+
*
|
|
2281
|
+
* Called by the per-call StreamHandler at the start of the firstMessage
|
|
2282
|
+
* emit. Returning bytes here lets the handler skip the live TTS
|
|
2283
|
+
* synthesis and stream the cached buffer directly.
|
|
2284
|
+
*
|
|
2285
|
+
* Marks ``callId`` as consumed regardless of cache hit/miss so a slow
|
|
2286
|
+
* synth task that finishes after this call drops its bytes instead of
|
|
2287
|
+
* orphaning them in ``prewarmAudio``. See FIX #92.
|
|
2288
|
+
*/
|
|
2289
|
+
popPrewarmAudio: (callId: string) => Buffer | undefined;
|
|
2290
|
+
/**
|
|
2291
|
+
* Log a warning if a prewarmed greeting was paid for but never used.
|
|
2292
|
+
* The TTS bill for ``agent.firstMessage`` has already been incurred by
|
|
2293
|
+
* the background synth task, so the user should know — opt-in feature
|
|
2294
|
+
* with a known cost surface.
|
|
2295
|
+
*
|
|
2296
|
+
* Idempotent: the second call for the same ``callId`` is a no-op, so
|
|
2297
|
+
* the status callback firing first and ``endCall`` running afterwards
|
|
2298
|
+
* (or vice-versa) does not double-WARN. Public so the embedded
|
|
2299
|
+
* server's webhook handlers can invoke it on no-answer / busy /
|
|
2300
|
+
* failed / canceled / AMD-machine paths. See FIX #91.
|
|
2301
|
+
*/
|
|
2302
|
+
recordPrewarmWaste: (callId: string) => void;
|
|
2303
|
+
/**
|
|
2304
|
+
* Pop and return the parked provider WebSockets for ``callId``, or
|
|
2305
|
+
* ``undefined`` when no parked connections exist.
|
|
2306
|
+
*
|
|
2307
|
+
* Wired into ``EmbeddedServer.popPrewarmedConnections`` so the
|
|
2308
|
+
* per-call ``StreamHandler`` can adopt the parked sockets at the
|
|
2309
|
+
* carrier ``start`` event instead of opening fresh ones — saving
|
|
2310
|
+
* ~150-900 ms of cold-start handshake on the first turn.
|
|
2311
|
+
*/
|
|
2312
|
+
popPrewarmedConnections: (callId: string) => ParkedProviderConnections | undefined;
|
|
2313
|
+
/**
|
|
2314
|
+
* Close any parked provider WebSockets for ``callId``. Wired into
|
|
2315
|
+
* ``EmbeddedServer.closePrewarmedConnections`` so call-termination
|
|
2316
|
+
* paths (no-answer, busy, failed, canceled, AMD voicemail) drop the
|
|
2317
|
+
* sockets cleanly instead of leaving them to the upstream timeout.
|
|
2318
|
+
*/
|
|
2319
|
+
closePrewarmedConnections: (callId: string) => void;
|
|
2320
|
+
/**
|
|
2321
|
+
* Open and park provider WebSockets in parallel with the carrier-side
|
|
2322
|
+
* ``initiateCall``. Unlike {@link spawnProviderWarmup} (which closes
|
|
2323
|
+
* the WS after a brief idle), the sockets opened here stay OPEN and
|
|
2324
|
+
* are handed off to the per-call ``StreamHandler`` on ``start``.
|
|
2325
|
+
*
|
|
2326
|
+
* This is the structural fix for first-turn cold-start: on Node's
|
|
2327
|
+
* ``ws`` package, opening + closing a WS does NOT warm TLS for the
|
|
2328
|
+
* next open — every fresh ``new WebSocket()`` re-pays the full
|
|
2329
|
+
* TCP + TLS + HTTP-101 round-trip. By keeping the WS open and
|
|
2330
|
+
* adopting it directly, the live first turn skips the handshake
|
|
2331
|
+
* entirely (saves ~150-900 ms depending on provider).
|
|
2332
|
+
*
|
|
2333
|
+
* Best-effort: each provider's parking task is wrapped in
|
|
2334
|
+
* ``Promise.allSettled`` so a slow or failing endpoint cannot block
|
|
2335
|
+
* the others. Providers without ``openParkedConnection`` contribute
|
|
2336
|
+
* nothing — the call falls through to the cold ``connect()`` path
|
|
2337
|
+
* for that provider.
|
|
2338
|
+
*/
|
|
2339
|
+
private parkProviderConnections;
|
|
2340
|
+
/**
|
|
2341
|
+
* Spawn a fire-and-forget task that warms up STT / TTS / LLM in
|
|
2342
|
+
* parallel with the carrier-side ``initiateCall``.
|
|
2343
|
+
*
|
|
2344
|
+
* Best-effort: each provider's optional ``warmup()`` is wrapped in
|
|
2345
|
+
* ``Promise.allSettled`` so a slow or failing endpoint cannot block
|
|
2346
|
+
* the others. Providers without ``warmup`` contribute nothing.
|
|
2347
|
+
*/
|
|
2348
|
+
private spawnProviderWarmup;
|
|
2349
|
+
/**
|
|
2350
|
+
* Pre-render ``agent.firstMessage`` to TTS bytes during the ringing
|
|
2351
|
+
* window and stash them in ``prewarmAudio.set(callId, buf)``.
|
|
2352
|
+
*
|
|
2353
|
+
* Skipped silently when ``agent.prewarmFirstMessage`` is false or
|
|
2354
|
+
* when ``agent.tts`` / ``agent.firstMessage`` is missing. The synth
|
|
2355
|
+
* is bounded by ``ringTimeout`` (default 25 s) so a never-answered
|
|
2356
|
+
* call doesn't tie up the TTS connection. On timeout / error the
|
|
2357
|
+
* cache is left empty and the StreamHandler falls back to live TTS.
|
|
2358
|
+
*
|
|
2359
|
+
* **Pipeline mode only.** Realtime / ConvAI provider modes never
|
|
2360
|
+
* consume the prewarm cache (the StreamHandler for those modes runs
|
|
2361
|
+
* its first-message emit through the provider's own audio path).
|
|
2362
|
+
* Spawning the prewarm in those modes pays the TTS bill for nothing
|
|
2363
|
+
* — refused with a warn.
|
|
2364
|
+
*
|
|
2365
|
+
* **Capped at ``PREWARM_CACHE_MAX`` concurrent entries.** Refused
|
|
2366
|
+
* with a warn when the cap is reached (the call still proceeds —
|
|
2367
|
+
* StreamHandler falls back to live TTS).
|
|
2368
|
+
*/
|
|
2369
|
+
private spawnPrewarmFirstMessage;
|
|
1556
2370
|
/** Place an outbound call via the configured carrier. */
|
|
1557
2371
|
call(options: LocalCallOptions): Promise<void>;
|
|
1558
2372
|
/**
|
|
1559
2373
|
* Stop the embedded server and any running tunnel. Safe to call multiple
|
|
1560
2374
|
* times. Leaves the instance reusable: a subsequent ``serve()`` works as
|
|
1561
2375
|
* if the previous lifecycle never happened.
|
|
2376
|
+
*
|
|
2377
|
+
* Also clears any pending TTL eviction timers, awaits in-flight
|
|
2378
|
+
* prewarm-first-message synth tasks (best-effort, with a 1 s safety
|
|
2379
|
+
* timeout), and clears the prewarm cache. Without this a still-running
|
|
2380
|
+
* TTS WS keeps the user billed long after SDK teardown, and stale
|
|
2381
|
+
* entries leak across ``serve`` / ``disconnect`` cycles. See FIX #93.
|
|
1562
2382
|
*/
|
|
1563
2383
|
disconnect(): Promise<void>;
|
|
1564
2384
|
/**
|
|
@@ -2075,7 +2895,22 @@ declare function calculateTelephonyCost(provider: string, durationSeconds: numbe
|
|
|
2075
2895
|
|
|
2076
2896
|
/** Per-turn latency breakdown across the STT/LLM/TTS pipeline. */
|
|
2077
2897
|
interface LatencyBreakdown {
|
|
2898
|
+
/**
|
|
2899
|
+
* STT finalization time: end-of-speech (VAD stop or STT speech_final) →
|
|
2900
|
+
* final transcript delivery. This is the engineering metric — pure STT
|
|
2901
|
+
* processing latency, independent of how long the user spoke. Industry
|
|
2902
|
+
* benchmarks (Picovoice, Deepgram, Gladia, Speechmatics) all report this
|
|
2903
|
+
* number as "STT latency". Falls back to turn_start when the endpoint
|
|
2904
|
+
* signal is unavailable (degraded provider, batch STT, etc.).
|
|
2905
|
+
*/
|
|
2078
2906
|
stt_ms: number;
|
|
2907
|
+
/**
|
|
2908
|
+
* Duration of the user's utterance (turn_start → end-of-speech). Useful
|
|
2909
|
+
* to distinguish "user spoke for 4s" from "STT took 4s to finalize" —
|
|
2910
|
+
* they used to be conflated in stt_ms before 0.6.1. Optional — undefined
|
|
2911
|
+
* when the endpoint signal is unavailable.
|
|
2912
|
+
*/
|
|
2913
|
+
user_speech_duration_ms?: number;
|
|
2079
2914
|
/**
|
|
2080
2915
|
* Backwards-compatible LLM bucket. With the split below, this now reflects
|
|
2081
2916
|
* the user-perceived first-token latency (TTFT) when streaming is available
|
|
@@ -2164,6 +2999,12 @@ interface CallMetrics {
|
|
|
2164
2999
|
tts_provider: string;
|
|
2165
3000
|
llm_provider: string;
|
|
2166
3001
|
telephony_provider: string;
|
|
3002
|
+
/** Model identifiers per provider (e.g. "ink-whisper", "eleven_flash_v2_5",
|
|
3003
|
+
* "gpt-oss-120b"). Surfaced on the dashboard cost breakdown so operators
|
|
3004
|
+
* can attribute per-call spend to a specific model. */
|
|
3005
|
+
stt_model?: string;
|
|
3006
|
+
tts_model?: string;
|
|
3007
|
+
llm_model?: string;
|
|
2167
3008
|
}
|
|
2168
3009
|
/** Programmatic control surface for a live call (transfer, hangup, DTMF). */
|
|
2169
3010
|
interface CallControl {
|
|
@@ -2236,6 +3077,7 @@ declare class CallMetricsAccumulator {
|
|
|
2236
3077
|
private _actualTelephonyCost;
|
|
2237
3078
|
private _actualSttCost;
|
|
2238
3079
|
private _totalLlmCost;
|
|
3080
|
+
private _llmModel;
|
|
2239
3081
|
private _eventBus;
|
|
2240
3082
|
/** Timestamp (hrTimeMs) when VAD emitted speech_end. */
|
|
2241
3083
|
private _vadStoppedAt;
|
|
@@ -2250,6 +3092,21 @@ declare class CallMetricsAccumulator {
|
|
|
2250
3092
|
private _overlapStartedAt;
|
|
2251
3093
|
private _reportOnlyInitialTtfb;
|
|
2252
3094
|
private _initialTtfbEmitted;
|
|
3095
|
+
/**
|
|
3096
|
+
* Last barge-in detection timestamp (hrTimeMs). Used by
|
|
3097
|
+
* ``_computeTurnLatency`` to gate endpoint_ms / stt_ms emission on turns
|
|
3098
|
+
* that started immediately after a barge-in — those turns have unreliable
|
|
3099
|
+
* VAD/STT anchors and would otherwise pollute the p95 distribution with
|
|
3100
|
+
* synthetic 6+ second spikes.
|
|
3101
|
+
*/
|
|
3102
|
+
private _lastBargeinAt;
|
|
3103
|
+
/**
|
|
3104
|
+
* Count of turns where ``recordSttComplete`` fired but no legitimate VAD
|
|
3105
|
+
* ``speech_end`` had stamped ``_endpointSignalAt``. Exposed via metrics so
|
|
3106
|
+
* we can spot environments where PSTN packet loss is dropping VAD stops
|
|
3107
|
+
* (the common cause of missing endpoint signals).
|
|
3108
|
+
*/
|
|
3109
|
+
private _endpointSignalMissingCount;
|
|
2253
3110
|
constructor(opts: {
|
|
2254
3111
|
callId: string;
|
|
2255
3112
|
providerMode: string;
|
|
@@ -2285,6 +3142,31 @@ declare class CallMetricsAccumulator {
|
|
|
2285
3142
|
* on the first audio byte rather than just before recordSttComplete().
|
|
2286
3143
|
*/
|
|
2287
3144
|
startTurnIfIdle(): void;
|
|
3145
|
+
/**
|
|
3146
|
+
* Anchor the current turn at a legitimate VAD ``speech_start`` event.
|
|
3147
|
+
*
|
|
3148
|
+
* Industry-standard pattern: every VAD ``speech_start`` that fires while the agent
|
|
3149
|
+
* is NOT in the suppressed warmup window re-anchors the turn timer to
|
|
3150
|
+
* the wall-clock moment the user actually started speaking. Re-anchors:
|
|
3151
|
+
*
|
|
3152
|
+
* * ``_turnStart`` — fixes the case where a phantom ``speech_start``
|
|
3153
|
+
* during agent TTS or a partial transcript from the previous user
|
|
3154
|
+
* attempt already stamped the field. Without this, the legitimate
|
|
3155
|
+
* user-speech ``speech_start`` no-op'd and ``user_speech_duration_ms``
|
|
3156
|
+
* inflated from ~1 s to 5-7 s (the original "I waited 7 seconds"
|
|
3157
|
+
* dashboard symptom).
|
|
3158
|
+
* * ``_endpointSignalAt``, ``_vadStoppedAt``, ``_sttFinalAt`` — any
|
|
3159
|
+
* stale anchor from a rejected barge-in / dropped final transcript
|
|
3160
|
+
* on the same uncommitted turn is cleared, so the next
|
|
3161
|
+
* ``recordVadStop`` / ``recordSttFinalTimestamp`` stamps fresh.
|
|
3162
|
+
* * ``_sttComplete``, ``_llmFirstToken``, ``_initialTtfbEmitted`` — same
|
|
3163
|
+
* rationale for the downstream pipeline timestamps.
|
|
3164
|
+
*
|
|
3165
|
+
* No-op once the turn is committed (``_turnCommittedMono`` set): a
|
|
3166
|
+
* VAD ``speech_start`` after commit belongs to the NEXT turn's
|
|
3167
|
+
* barge-in path, handled by ``recordTurnInterrupted`` instead.
|
|
3168
|
+
*/
|
|
3169
|
+
anchorUserSpeechStart(): void;
|
|
2288
3170
|
/** Stamp end-of-STT, capture the user's transcript, and accrue billed STT seconds. */
|
|
2289
3171
|
recordSttComplete(text: string, audioSeconds?: number): void;
|
|
2290
3172
|
/** Record the timestamp of the first LLM token (TTFT). No-op after first call. */
|
|
@@ -2419,6 +3301,13 @@ declare class CallMetricsAccumulator {
|
|
|
2419
3301
|
endCall(): CallMetrics;
|
|
2420
3302
|
/** Return the cost breakdown for the call so far without ending it. */
|
|
2421
3303
|
getCostSoFar(): CostBreakdown;
|
|
3304
|
+
/**
|
|
3305
|
+
* Number of turns where recordSttComplete fired without a prior legitimate
|
|
3306
|
+
* VAD speech_end. Surfaced for diagnostics — a non-zero value points at
|
|
3307
|
+
* dropped VAD stops (commonly PSTN packet loss), which is why we stopped
|
|
3308
|
+
* faking _endpointSignalAt from _sttComplete in 0.6.x.
|
|
3309
|
+
*/
|
|
3310
|
+
get endpointSignalMissingCount(): number;
|
|
2422
3311
|
private _resetTurnState;
|
|
2423
3312
|
private _computeTurnLatency;
|
|
2424
3313
|
private _computeCost;
|
|
@@ -2442,6 +3331,7 @@ declare class CallMetricsAccumulator {
|
|
|
2442
3331
|
* {@link OpenAIRealtimeAdapter}. Audio negotiation defaults to
|
|
2443
3332
|
* `g711_ulaw` so traffic flows through Twilio/Telnyx without transcoding.
|
|
2444
3333
|
*/
|
|
3334
|
+
|
|
2445
3335
|
/**
|
|
2446
3336
|
* Supported OpenAI Realtime wire audio formats. See
|
|
2447
3337
|
* https://platform.openai.com/docs/guides/realtime for the full list.
|
|
@@ -2483,28 +3373,96 @@ interface OpenAIRealtimeOptions {
|
|
|
2483
3373
|
}
|
|
2484
3374
|
/** Realtime WebSocket adapter for OpenAI's `gpt-realtime` family. */
|
|
2485
3375
|
declare class OpenAIRealtimeAdapter {
|
|
2486
|
-
|
|
2487
|
-
|
|
2488
|
-
|
|
2489
|
-
|
|
2490
|
-
|
|
2491
|
-
|
|
2492
|
-
|
|
3376
|
+
protected readonly apiKey: string;
|
|
3377
|
+
protected readonly model: string;
|
|
3378
|
+
protected readonly voice: string;
|
|
3379
|
+
protected readonly instructions: string;
|
|
3380
|
+
protected readonly tools?: Array<{
|
|
3381
|
+
name: string;
|
|
3382
|
+
description: string;
|
|
3383
|
+
parameters: Record<string, unknown>;
|
|
3384
|
+
strict?: boolean;
|
|
3385
|
+
}> | undefined;
|
|
3386
|
+
protected readonly audioFormat: OpenAIRealtimeAudioFormat;
|
|
3387
|
+
protected ws: WebSocket__default | null;
|
|
2493
3388
|
private readonly eventCallbacks;
|
|
2494
3389
|
private messageListenerAttached;
|
|
2495
3390
|
private heartbeat;
|
|
2496
3391
|
private currentResponseItemId;
|
|
2497
3392
|
private currentResponseAudioMs;
|
|
2498
3393
|
private currentResponseFirstAudioAt;
|
|
2499
|
-
|
|
3394
|
+
protected readonly options: OpenAIRealtimeOptions;
|
|
2500
3395
|
constructor(apiKey: string, model?: string, voice?: string, instructions?: string, tools?: Array<{
|
|
2501
3396
|
name: string;
|
|
2502
3397
|
description: string;
|
|
2503
3398
|
parameters: Record<string, unknown>;
|
|
2504
3399
|
strict?: boolean;
|
|
2505
3400
|
}> | undefined, audioFormat?: OpenAIRealtimeAudioFormat, options?: OpenAIRealtimeOptions);
|
|
3401
|
+
/**
|
|
3402
|
+
* Build the production session.update body. Mirrors the body sent
|
|
3403
|
+
* inside `connect()` so warmup can apply identical configuration to
|
|
3404
|
+
* the upstream session and prime it without billing.
|
|
3405
|
+
*/
|
|
3406
|
+
private buildSessionConfig;
|
|
3407
|
+
/**
|
|
3408
|
+
* Pre-call WebSocket warmup for the OpenAI Realtime endpoint.
|
|
3409
|
+
*
|
|
3410
|
+
* The canonical session-only warm step on the Realtime API: open the
|
|
3411
|
+
* WS, wait for `session.created`, send a single `session.update`
|
|
3412
|
+
* containing the same fields that the production `connect()` path
|
|
3413
|
+
* applies (`input_audio_format`, `output_audio_format`, `voice`,
|
|
3414
|
+
* `instructions`, `turn_detection`, `input_audio_transcription`,
|
|
3415
|
+
* plus any opt-in fields populated on the adapter), wait for the
|
|
3416
|
+
* matching `session.updated` ack, then close cleanly. This primes
|
|
3417
|
+
* the per-session state on the OpenAI side — DNS + TLS + auth
|
|
3418
|
+
* handshake + initial config exchange — without ever invoking the
|
|
3419
|
+
* model.
|
|
3420
|
+
*
|
|
3421
|
+
* Earlier revisions sent `response.create` with
|
|
3422
|
+
* `{"response": {"generate": false}}` to prime the inference path.
|
|
3423
|
+
* That field is NOT in the OpenAI Realtime API schema; the server
|
|
3424
|
+
* either ignores it (and bills tokens for a real model response) or
|
|
3425
|
+
* rejects the request with `invalid_request_error`. Both behaviours
|
|
3426
|
+
* are undesirable: billing-unsafe in the first case, a no-op beyond the TLS warm-up in the second. The
|
|
3427
|
+
* `session.update` flow is documented and side-effect-free.
|
|
3428
|
+
*
|
|
3429
|
+
* Billing safety: `session.update` only mutates session
|
|
3430
|
+
* configuration. It does NOT invoke the model, does NOT consume any
|
|
3431
|
+
* audio buffer, and does NOT trigger token generation, so no
|
|
3432
|
+
* per-token cost is accrued. Best-effort: failures are logged at
|
|
3433
|
+
* debug level and never raised.
|
|
3434
|
+
*/
|
|
3435
|
+
warmup(): Promise<void>;
|
|
2506
3436
|
/** Open the Realtime WebSocket and apply the session configuration. */
|
|
2507
3437
|
connect(): Promise<void>;
|
|
3438
|
+
/**
|
|
3439
|
+
* Adopt a pre-opened, already-`session.updated` Realtime WebSocket
|
|
3440
|
+
* produced by the prewarm pipeline (see `Patter.parkProviderConnections`).
|
|
3441
|
+
* Skips the fresh `new WebSocket()` + `session.created` /
|
|
3442
|
+
* `session.update` round-trip — saves ~250-450 ms on first turn.
|
|
3443
|
+
*
|
|
3444
|
+
* Caller MUST verify `ws.readyState === OPEN` before calling and MUST
|
|
3445
|
+
* have already received `session.updated` on the parked socket. If
|
|
3446
|
+
* the parked WS died between park and adopt, fall back to `connect()`.
|
|
3447
|
+
*/
|
|
3448
|
+
adoptWebSocket(ws: WebSocket__default): void;
|
|
3449
|
+
protected armHeartbeatAndListener(): void;
|
|
3450
|
+
/**
|
|
3451
|
+
* Open a fresh Realtime WS, exchange `session.created` /
|
|
3452
|
+
* `session.update` / `session.updated` (so the upstream session is
|
|
3453
|
+
* fully primed), and return the OPEN socket WITHOUT arming the
|
|
3454
|
+
* heartbeat / message listener. Used by the prewarm pipeline to park
|
|
3455
|
+
* a Realtime connection during ringing; the live consumer adopts it
|
|
3456
|
+
* via {@link adoptWebSocket}.
|
|
3457
|
+
*
|
|
3458
|
+
* Bounded by 8 s. Throws on timeout / handshake failure — callers
|
|
3459
|
+
* (the prewarm pipeline) treat any error as a cache miss and the
|
|
3460
|
+
* call falls through to the cold `connect()` path.
|
|
3461
|
+
*
|
|
3462
|
+
* Billing safety: `session.update` does not invoke the model. No
|
|
3463
|
+
* tokens are billed.
|
|
3464
|
+
*/
|
|
3465
|
+
openParkedConnection(): Promise<WebSocket__default>;
|
|
2508
3466
|
/** Append a base64-encoded audio chunk to the realtime input buffer. */
|
|
2509
3467
|
sendAudio(mulawAudio: Buffer): void;
|
|
2510
3468
|
/**
|
|
@@ -2518,7 +3476,7 @@ declare class OpenAIRealtimeAdapter {
|
|
|
2518
3476
|
onEvent(callback: RealtimeEventCallback): void;
|
|
2519
3477
|
/** Remove a previously registered {@link onEvent} callback. */
|
|
2520
3478
|
offEvent(callback: RealtimeEventCallback): void;
|
|
2521
|
-
|
|
3479
|
+
protected ensureMessageListener(): void;
|
|
2522
3480
|
/** Truncate the in-flight assistant turn and cancel the active response.
|
|
2523
3481
|
*
|
|
2524
3482
|
* ``audio_end_ms`` MUST reflect what the caller actually heard, not what
|
|
@@ -2684,11 +3642,6 @@ declare function isRemoteUrl(onMessage: unknown): onMessage is string;
|
|
|
2684
3642
|
/** Check if a URL is a WebSocket URL. */
|
|
2685
3643
|
declare function isWebSocketUrl(url: string): boolean;
|
|
2686
3644
|
|
|
2687
|
-
/**
|
|
2688
|
-
* Embedded HTTP/WebSocket server — wires Express webhooks for the configured
|
|
2689
|
-
* carrier (Twilio or Telnyx) into the per-call `StreamHandler` and dashboard.
|
|
2690
|
-
*/
|
|
2691
|
-
|
|
2692
3645
|
/** Resolved configuration consumed by `EmbeddedServer` (carrier credentials, webhook URL, etc.). */
|
|
2693
3646
|
interface LocalConfig {
|
|
2694
3647
|
twilioSid?: string;
|
|
@@ -3322,6 +4275,8 @@ interface SonioxSTTOptions$1 {
|
|
|
3322
4275
|
}
|
|
3323
4276
|
/** Streaming STT adapter for Soniox's real-time WebSocket API. */
|
|
3324
4277
|
declare class SonioxSTT {
|
|
4278
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
4279
|
+
static readonly providerKey = "soniox";
|
|
3325
4280
|
private ws;
|
|
3326
4281
|
private callbacks;
|
|
3327
4282
|
private final;
|
|
@@ -3430,6 +4385,8 @@ interface AssemblyAISTTOptions$1 {
|
|
|
3430
4385
|
declare class AssemblyAISTT {
|
|
3431
4386
|
private readonly apiKey;
|
|
3432
4387
|
private readonly options;
|
|
4388
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
4389
|
+
static readonly providerKey = "assemblyai";
|
|
3433
4390
|
private ws;
|
|
3434
4391
|
private readonly callbacks;
|
|
3435
4392
|
private closing;
|
|
@@ -3460,6 +4417,22 @@ declare class AssemblyAISTT {
|
|
|
3460
4417
|
static forTwilio(apiKey: string, model?: AssemblyAIModel): AssemblyAISTT;
|
|
3461
4418
|
private buildUrl;
|
|
3462
4419
|
private buildHeaders;
|
|
4420
|
+
/**
|
|
4421
|
+
* Pre-call WebSocket warmup for the AssemblyAI v3 `/v3/ws` endpoint.
|
|
4422
|
+
*
|
|
4423
|
+
* Opens the WS (DNS + TLS + auth handshake), idles ~250 ms so the
|
|
4424
|
+
* AssemblyAI edge keeps the session state warm, then sends Terminate
|
|
4425
|
+
* and closes. By the time `connect()` is invoked at call-pickup the
|
|
4426
|
+
* resolver and TLS session are hot — net wire time saving of
|
|
4427
|
+
* 200-500 ms.
|
|
4428
|
+
*
|
|
4429
|
+
* Billing safety: AssemblyAI Universal Streaming bills on streamed
|
|
4430
|
+
* audio seconds (per https://www.assemblyai.com/pricing). Opening +
|
|
4431
|
+
* closing the WebSocket without forwarding any audio frames does
|
|
4432
|
+
* not consume billable seconds. Best-effort: failures logged at
|
|
4433
|
+
* debug level.
|
|
4434
|
+
*/
|
|
4435
|
+
warmup(): Promise<void>;
|
|
3463
4436
|
/** Open the streaming WebSocket and arm message handlers. */
|
|
3464
4437
|
connect(): Promise<void>;
|
|
3465
4438
|
private awaitOpen;
|
|
@@ -3500,6 +4473,7 @@ declare class AssemblyAISTT {
|
|
|
3500
4473
|
* Implements a `DeepgramSTT`-shaped provider using Cartesia's streaming
|
|
3501
4474
|
* WebSocket API. Pure `ws` transport — does NOT depend on the vendor SDK.
|
|
3502
4475
|
*/
|
|
4476
|
+
|
|
3503
4477
|
/** Patter-normalised transcript event emitted by {@link CartesiaSTT}. */
|
|
3504
4478
|
interface Transcript$4 {
|
|
3505
4479
|
readonly text: string;
|
|
@@ -3546,6 +4520,8 @@ interface CartesiaSTTOptions$1 {
|
|
|
3546
4520
|
declare class CartesiaSTT {
|
|
3547
4521
|
private readonly apiKey;
|
|
3548
4522
|
private readonly options;
|
|
4523
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
4524
|
+
static readonly providerKey = "cartesia_stt";
|
|
3549
4525
|
private ws;
|
|
3550
4526
|
private callbacks;
|
|
3551
4527
|
private keepaliveTimer;
|
|
@@ -3555,13 +4531,65 @@ declare class CartesiaSTT {
|
|
|
3555
4531
|
*/
|
|
3556
4532
|
requestId: string | null;
|
|
3557
4533
|
constructor(apiKey: string, options?: CartesiaSTTOptions$1);
|
|
4534
|
+
/**
|
|
4535
|
+
* Open a fresh WebSocket without arming any message / keepalive handlers
|
|
4536
|
+
* and without taking ownership on `this.ws`. Returns the OPEN socket so
|
|
4537
|
+
* the caller (the prewarm pipeline) can park it for later adoption via
|
|
4538
|
+
* `adoptWebSocket`. Bounded by `CONNECT_TIMEOUT_MS`.
|
|
4539
|
+
*
|
|
4540
|
+
* Billing safety: opening + parking the WS does not stream audio
|
|
4541
|
+
* (Cartesia STT bills on streamed audio seconds), so no charge is
|
|
4542
|
+
* incurred. Close the returned WS yourself if it is never adopted.
|
|
4543
|
+
*/
|
|
4544
|
+
openParkedConnection(): Promise<WebSocket__default>;
|
|
3558
4545
|
private buildWsUrl;
|
|
4546
|
+
/**
|
|
4547
|
+
* Pre-call WebSocket warmup for the Cartesia STT `/stt/websocket` endpoint.
|
|
4548
|
+
*
|
|
4549
|
+
* Opens the WS (DNS + TLS + auth handshake), idles ~250 ms so the
|
|
4550
|
+
* Cartesia edge keeps session state warm, then closes. By the time
|
|
4551
|
+
* `connect()` is invoked at call-pickup the resolver and TLS session
|
|
4552
|
+
* are hot — net wire time saving of 200-500 ms.
|
|
4553
|
+
*
|
|
4554
|
+
* Billing safety: Cartesia STT bills on streamed audio seconds (per
|
|
4555
|
+
* https://docs.cartesia.ai/2025-04-16/api-reference/stt/stt). Opening
|
|
4556
|
+
* + closing the WebSocket without forwarding audio does not consume
|
|
4557
|
+
* billable seconds. Best-effort: failures logged at debug level.
|
|
4558
|
+
*/
|
|
4559
|
+
warmup(): Promise<void>;
|
|
3559
4560
|
/** Open the streaming WebSocket and arm message + keepalive handlers. */
|
|
3560
4561
|
connect(): Promise<void>;
|
|
4562
|
+
/**
|
|
4563
|
+
* Adopt a pre-opened, already-OPEN WebSocket produced by the prewarm
|
|
4564
|
+
* pipeline (see `Patter.parkProviderConnections`). Skips the fresh
|
|
4565
|
+
* `new WebSocket()` + handshake — the WS is already through DNS, TLS
|
|
4566
|
+
* and HTTP-101 so audio frames can flow on this turn instead of
|
|
4567
|
+
* paying ~150-400 ms of handshake.
|
|
4568
|
+
*
|
|
4569
|
+
* Caller MUST verify `ws.readyState === OPEN` before calling. If the
|
|
4570
|
+
* parked WS died between park and adopt, fall back to `connect()`.
|
|
4571
|
+
*/
|
|
4572
|
+
adoptWebSocket(ws: WebSocket__default): void;
|
|
4573
|
+
private armMessageAndKeepalive;
|
|
3561
4574
|
private handleEvent;
|
|
3562
4575
|
private emit;
|
|
3563
4576
|
/** Send a binary PCM16-LE audio chunk to Cartesia for transcription. */
|
|
3564
4577
|
sendAudio(audio: Buffer): void;
|
|
4578
|
+
/**
|
|
4579
|
+
* Force Cartesia to finalise the in-flight utterance immediately.
|
|
4580
|
+
*
|
|
4581
|
+
* Sends a ``finalize`` text frame on the live WebSocket. Cartesia
|
|
4582
|
+
* replies with the final transcript followed by ``flush_done``,
|
|
4583
|
+
* bypassing its conservative internal silence heuristic (which can
|
|
4584
|
+
* wait 2-7 s on PSTN audio before naturally finalising). Wired
|
|
4585
|
+
* into ``StreamHandler`` on the VAD ``speech_end`` event so the
|
|
4586
|
+
* SDK's authoritative end-of-speech detection forces an immediate
|
|
4587
|
+
* STT finalisation — turning Cartesia's natural-pause endpointing
|
|
4588
|
+
* into a deterministic VAD-driven one, parity with the Deepgram
|
|
4589
|
+
* fast-path. No-op when the WS isn't open. Parity with Python
|
|
4590
|
+
* ``CartesiaSTT.finalize``.
|
|
4591
|
+
*/
|
|
4592
|
+
finalize(): Promise<void>;
|
|
3565
4593
|
/** Register a transcript listener. */
|
|
3566
4594
|
onTranscript(callback: TranscriptCallback$4): void;
|
|
3567
4595
|
/** Remove a previously registered transcript callback. */
|
|
@@ -3624,6 +4652,8 @@ interface LMNTTTSOptions$1 {
|
|
|
3624
4652
|
}
|
|
3625
4653
|
/** LMNT TTS adapter backed by the `/v1/ai/speech/bytes` HTTP streaming endpoint. */
|
|
3626
4654
|
declare class LMNTTTS {
|
|
4655
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
4656
|
+
static readonly providerKey = "lmnt";
|
|
3627
4657
|
private readonly apiKey;
|
|
3628
4658
|
private readonly model;
|
|
3629
4659
|
private readonly voice;
|
|
@@ -3717,6 +4747,8 @@ interface DeepgramSTTOptions$1 {
|
|
|
3717
4747
|
}
|
|
3718
4748
|
/** Streaming STT adapter for Deepgram's `/v1/listen` WebSocket API. */
|
|
3719
4749
|
declare class DeepgramSTT {
|
|
4750
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
4751
|
+
static readonly providerKey = "deepgram";
|
|
3720
4752
|
private ws;
|
|
3721
4753
|
private readonly transcriptCallbacks;
|
|
3722
4754
|
private readonly errorCallbacks;
|
|
@@ -3749,6 +4781,22 @@ declare class DeepgramSTT {
|
|
|
3749
4781
|
/** Factory for Twilio calls — mulaw 8 kHz. Forwards tuning options through. */
|
|
3750
4782
|
static forTwilio(apiKey: string, language?: string, model?: string, options?: DeepgramSTTOptions$1): DeepgramSTT;
|
|
3751
4783
|
private buildUrl;
|
|
4784
|
+
/**
|
|
4785
|
+
* Pre-call WebSocket warmup for the Deepgram `/v1/listen` endpoint.
|
|
4786
|
+
*
|
|
4787
|
+
* Opens the WS (full DNS + TLS + auth handshake), idles ~250 ms so the
|
|
4788
|
+
* provider edge keeps the session warm in its routing table, then
|
|
4789
|
+
* closes cleanly. By the time `connect()` is invoked at call-pickup
|
|
4790
|
+
* the DNS resolver is hot, the TCP+TLS session is in the connection
|
|
4791
|
+
* pool, and recent WS auth is still warm at Deepgram's edge — net
|
|
4792
|
+
* wire time saving of 200-500 ms vs a cold WS open.
|
|
4793
|
+
*
|
|
4794
|
+
* Billing safety: Deepgram bills on streamed audio seconds (per
|
|
4795
|
+
* https://deepgram.com/pricing). Opening + closing the WebSocket
|
|
4796
|
+
* without sending any audio frames does not consume billable seconds.
|
|
4797
|
+
* Best-effort: any failure is logged at debug level and never raised.
|
|
4798
|
+
*/
|
|
4799
|
+
warmup(): Promise<void>;
|
|
3752
4800
|
/** Open the streaming WebSocket and arm message + keepalive handlers. */
|
|
3753
4801
|
connect(): Promise<void>;
|
|
3754
4802
|
private openSocket;
|
|
@@ -3825,6 +4873,8 @@ type TranscriptCallback$2 = (transcript: Transcript$2) => void;
|
|
|
3825
4873
|
type WhisperResponseFormat = 'json' | 'verbose_json';
|
|
3826
4874
|
/** Buffered STT adapter for OpenAI's Whisper transcription HTTP API. */
|
|
3827
4875
|
declare class WhisperSTT {
|
|
4876
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
4877
|
+
static readonly providerKey: string;
|
|
3828
4878
|
private readonly apiKey;
|
|
3829
4879
|
private readonly model;
|
|
3830
4880
|
private readonly language;
|
|
@@ -3913,6 +4963,8 @@ declare class STT$5 extends WhisperSTT {
|
|
|
3913
4963
|
|
|
3914
4964
|
/** STT adapter restricted to OpenAI's GPT-4o Transcribe model family. */
|
|
3915
4965
|
declare class OpenAITranscribeSTT extends WhisperSTT {
|
|
4966
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
4967
|
+
static readonly providerKey: string;
|
|
3916
4968
|
/**
|
|
3917
4969
|
* @param apiKey OpenAI API key.
|
|
3918
4970
|
* @param language ISO-639-1 language code (e.g. ``"en"``, ``"it"``). Optional.
|
|
@@ -4172,6 +5224,8 @@ interface SpeechmaticsSTTOptions$1 {
|
|
|
4172
5224
|
* ```
|
|
4173
5225
|
*/
|
|
4174
5226
|
declare class SpeechmaticsSTT {
|
|
5227
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5228
|
+
static readonly providerKey = "speechmatics";
|
|
4175
5229
|
private ws;
|
|
4176
5230
|
private readonly transcriptCallbacks;
|
|
4177
5231
|
private readonly errorCallbacks;
|
|
@@ -4231,147 +5285,12 @@ type SpeechmaticsSTTOptions = SpeechmaticsSTTOptions$1 & {
|
|
|
4231
5285
|
* ```ts
|
|
4232
5286
|
* import * as speechmatics from "getpatter/stt/speechmatics";
|
|
4233
5287
|
* const stt = new speechmatics.STT(); // reads SPEECHMATICS_API_KEY
|
|
4234
|
-
* const stt = new speechmatics.STT({ apiKey: "sm_...", language: "en" });
|
|
4235
|
-
* ```
|
|
4236
|
-
*/
|
|
4237
|
-
declare class STT extends SpeechmaticsSTT {
|
|
4238
|
-
static readonly providerKey = "speechmatics";
|
|
4239
|
-
constructor(opts?: SpeechmaticsSTTOptions);
|
|
4240
|
-
}
|
|
4241
|
-
|
|
4242
|
-
/**
|
|
4243
|
-
* Known stable ElevenLabs voice models (from the official ElevenLabs API
|
|
4244
|
-
* reference). Exposed as a typed `as const` object so callers can pass
|
|
4245
|
-
* `ElevenLabsModel.FLASH_V2_5` and get autocomplete / static checking; the
|
|
4246
|
-
* public `modelId` option also accepts an arbitrary `string` so users can
|
|
4247
|
-
* pass forward-compat IDs we haven't enumerated yet.
|
|
4248
|
-
*
|
|
4249
|
-
* - `V3` — newest, highest quality (slower TTFT than Flash).
|
|
4250
|
-
* - `FLASH_V2_5` — current default, fastest (~75 ms TTFT).
|
|
4251
|
-
* - `TURBO_V2_5` — balanced quality/speed.
|
|
4252
|
-
* - `MULTILINGUAL_V2` — best multilingual support.
|
|
4253
|
-
* - `MONOLINGUAL_V1` — legacy English-only.
|
|
4254
|
-
*/
|
|
4255
|
-
declare const ElevenLabsModel: {
|
|
4256
|
-
readonly V3: "eleven_v3";
|
|
4257
|
-
readonly FLASH_V2_5: "eleven_flash_v2_5";
|
|
4258
|
-
readonly TURBO_V2_5: "eleven_turbo_v2_5";
|
|
4259
|
-
readonly MULTILINGUAL_V2: "eleven_multilingual_v2";
|
|
4260
|
-
readonly MONOLINGUAL_V1: "eleven_monolingual_v1";
|
|
4261
|
-
};
|
|
4262
|
-
/** Union of {@link ElevenLabsModel} string values. */
|
|
4263
|
-
type ElevenLabsModel = (typeof ElevenLabsModel)[keyof typeof ElevenLabsModel];
|
|
4264
|
-
declare const ElevenLabsOutputFormat: {
|
|
4265
|
-
readonly MP3_22050_32: "mp3_22050_32";
|
|
4266
|
-
readonly MP3_44100_32: "mp3_44100_32";
|
|
4267
|
-
readonly MP3_44100_64: "mp3_44100_64";
|
|
4268
|
-
readonly MP3_44100_96: "mp3_44100_96";
|
|
4269
|
-
readonly MP3_44100_128: "mp3_44100_128";
|
|
4270
|
-
readonly MP3_44100_192: "mp3_44100_192";
|
|
4271
|
-
readonly PCM_8000: "pcm_8000";
|
|
4272
|
-
readonly PCM_16000: "pcm_16000";
|
|
4273
|
-
readonly PCM_22050: "pcm_22050";
|
|
4274
|
-
readonly PCM_24000: "pcm_24000";
|
|
4275
|
-
readonly PCM_44100: "pcm_44100";
|
|
4276
|
-
readonly ULAW_8000: "ulaw_8000";
|
|
4277
|
-
};
|
|
4278
|
-
/** Union of {@link ElevenLabsOutputFormat} string values. */
|
|
4279
|
-
type ElevenLabsOutputFormat = (typeof ElevenLabsOutputFormat)[keyof typeof ElevenLabsOutputFormat];
|
|
4280
|
-
/** ElevenLabs voice tuning knobs forwarded as `voice_settings` in the request. */
|
|
4281
|
-
interface ElevenLabsVoiceSettings {
|
|
4282
|
-
stability?: number;
|
|
4283
|
-
similarity_boost?: number;
|
|
4284
|
-
style?: number;
|
|
4285
|
-
use_speaker_boost?: boolean;
|
|
4286
|
-
}
|
|
4287
|
-
/** Constructor options for {@link ElevenLabsTTS}. */
|
|
4288
|
-
interface ElevenLabsTTSOptions$1 {
|
|
4289
|
-
voiceId?: string;
|
|
4290
|
-
/**
|
|
4291
|
-
* ElevenLabs voice model ID. The default ``eleven_flash_v2_5`` has the
|
|
4292
|
-
* lowest TTFT (~75 ms). Pass ``eleven_v3`` for highest quality, or any
|
|
4293
|
-
* arbitrary string for forward-compat with future models.
|
|
4294
|
-
*/
|
|
4295
|
-
modelId?: ElevenLabsModel | string;
|
|
4296
|
-
outputFormat?: ElevenLabsOutputFormat;
|
|
4297
|
-
voiceSettings?: ElevenLabsVoiceSettings;
|
|
4298
|
-
languageCode?: string;
|
|
4299
|
-
chunkSize?: number;
|
|
4300
|
-
}
|
|
4301
|
-
/**
|
|
4302
|
-
* ElevenLabs streaming TTS adapter.
|
|
4303
|
-
*
|
|
4304
|
-
* Supported `modelId` values are autocompleted via {@link ElevenLabsModel}.
|
|
4305
|
-
* Default is `eleven_flash_v2_5` (lowest TTFT, ~75 ms).
|
|
4306
|
-
*
|
|
4307
|
-
* **Telephony optimization** — the constructor default
|
|
4308
|
-
* `outputFormat='pcm_16000'` is correct for web playback, dashboard
|
|
4309
|
-
* previews, and 16 kHz pipelines. For real phone calls, use the
|
|
4310
|
-
* carrier-specific factories instead:
|
|
4311
|
-
*
|
|
4312
|
-
* - {@link ElevenLabsTTS.forTwilio} emits `ulaw_8000` natively. Twilio's
|
|
4313
|
-
* media-stream WebSocket expects μ-law @ 8 kHz, so the SDK normally
|
|
4314
|
-
* resamples 16 kHz → 8 kHz and PCM → μ-law before sending. Asking
|
|
4315
|
-
* ElevenLabs to produce μ-law directly skips that step (saves
|
|
4316
|
-
* ~30–80 ms first-byte plus per-frame CPU and avoids any resampling
|
|
4317
|
-
* aliasing).
|
|
4318
|
-
* - {@link ElevenLabsTTS.forTelnyx} emits `pcm_16000`. Telnyx negotiates
|
|
4319
|
-
* L16/16000 on its bidirectional media WebSocket, so 16 kHz PCM is
|
|
4320
|
-
* already the format used end-to-end and no transcoding happens.
|
|
4321
|
-
* ElevenLabs *also* supports `ulaw_8000` if your Telnyx profile is
|
|
4322
|
-
* pinned to PCMU/8000 — pass `outputFormat: 'ulaw_8000'` explicitly
|
|
4323
|
-
* in that case.
|
|
5288
|
+
* const stt = new speechmatics.STT({ apiKey: "sm_...", language: "en" });
|
|
5289
|
+
* ```
|
|
4324
5290
|
*/
|
|
4325
|
-
declare class
|
|
4326
|
-
|
|
4327
|
-
|
|
4328
|
-
private readonly modelId;
|
|
4329
|
-
private readonly outputFormat;
|
|
4330
|
-
private readonly voiceSettings;
|
|
4331
|
-
private readonly languageCode;
|
|
4332
|
-
private readonly chunkSize;
|
|
4333
|
-
constructor(apiKey: string, voiceId?: string, modelId?: string, outputFormat?: ElevenLabsOutputFormat | string);
|
|
4334
|
-
constructor(apiKey: string, options: ElevenLabsTTSOptions$1);
|
|
4335
|
-
/**
|
|
4336
|
-
* Construct an instance pre-configured for Twilio Media Streams.
|
|
4337
|
-
*
|
|
4338
|
-
* Sets `outputFormat='ulaw_8000'` so ElevenLabs emits μ-law @ 8 kHz
|
|
4339
|
-
* directly — the exact wire format Twilio's media stream uses — letting
|
|
4340
|
-
* the SDK skip the 16 kHz→8 kHz resample and PCM→μ-law conversion in
|
|
4341
|
-
* `TwilioAudioSender`. Saves ~30–80 ms first-byte and per-frame CPU,
|
|
4342
|
-
* and removes a potential aliasing source.
|
|
4343
|
-
*
|
|
4344
|
-
* `voiceSettings` defaults to a low-bandwidth-friendly profile
|
|
4345
|
-
* (speaker boost off, modest stability) which sounds cleaner at 8 kHz
|
|
4346
|
-
* μ-law than the studio default. Pass an explicit object to override.
|
|
4347
|
-
*/
|
|
4348
|
-
static forTwilio(apiKey: string, options?: Omit<ElevenLabsTTSOptions$1, 'outputFormat'>): ElevenLabsTTS;
|
|
4349
|
-
/**
|
|
4350
|
-
* Construct an instance pre-configured for Telnyx bidirectional media.
|
|
4351
|
-
*
|
|
4352
|
-
* Telnyx's default media-streaming codec is L16 PCM @ 16 kHz, which
|
|
4353
|
-
* matches our default Telnyx handler. We pick `pcm_16000` so the audio
|
|
4354
|
-
* flows end-to-end with zero resampling or transcoding.
|
|
4355
|
-
*
|
|
4356
|
-
* Trade-off: if your Telnyx profile is pinned to PCMU/8000 (μ-law),
|
|
4357
|
-
* construct `ElevenLabsTTS` directly with `outputFormat: 'ulaw_8000'`
|
|
4358
|
-
* — Telnyx supports that natively too.
|
|
4359
|
-
*/
|
|
4360
|
-
static forTelnyx(apiKey: string, options?: Omit<ElevenLabsTTSOptions$1, 'outputFormat'>): ElevenLabsTTS;
|
|
4361
|
-
/**
|
|
4362
|
-
* Synthesise text to speech and return the full audio as a single Buffer.
|
|
4363
|
-
*
|
|
4364
|
-
* For large chunks (or when latency matters) call `synthesizeStream` instead.
|
|
4365
|
-
*/
|
|
4366
|
-
synthesize(text: string): Promise<Buffer>;
|
|
4367
|
-
/**
|
|
4368
|
-
* Synthesise text and yield audio chunks as they arrive (streaming).
|
|
4369
|
-
*
|
|
4370
|
-
* The yielded buffers are raw PCM at 16 kHz (or whatever `outputFormat` is
|
|
4371
|
-
* configured to). `chunkSize` controls the maximum yield size — 512 is a
|
|
4372
|
-
* good choice for low-latency telephony.
|
|
4373
|
-
*/
|
|
4374
|
-
synthesizeStream(text: string): AsyncGenerator<Buffer>;
|
|
5291
|
+
declare class STT extends SpeechmaticsSTT {
|
|
5292
|
+
static readonly providerKey = "speechmatics";
|
|
5293
|
+
constructor(opts?: SpeechmaticsSTTOptions);
|
|
4375
5294
|
}
|
|
4376
5295
|
|
|
4377
5296
|
/** ElevenLabs TTS for Patter pipeline mode. */
|
|
@@ -4424,115 +5343,6 @@ declare class TTS$6 extends ElevenLabsTTS {
|
|
|
4424
5343
|
static forTelnyx(apiKey: string, options?: Omit<ElevenLabsTTSOptions, "outputFormat">): TTS$6;
|
|
4425
5344
|
}
|
|
4426
5345
|
|
|
4427
|
-
/**
|
|
4428
|
-
* WebSocket-based ElevenLabs TTS provider — opt-in low-latency variant.
|
|
4429
|
-
*
|
|
4430
|
-
* Targets the ElevenLabs streaming-input WebSocket endpoint
|
|
4431
|
-
* (`/v1/text-to-speech/{voice_id}/stream-input`) instead of the HTTP
|
|
4432
|
-
* `/stream` endpoint used by `ElevenLabsTTS`. Saves the HTTP request setup
|
|
4433
|
-
* time per utterance (~50 ms) and avoids the HTTP cold-start TLS handshake
|
|
4434
|
-
* when calls are bursty.
|
|
4435
|
-
*
|
|
4436
|
-
* API matches `ElevenLabsTTS` (`synthesizeStream(text)` returns an
|
|
4437
|
-
* `AsyncGenerator<Buffer>`) so it can be passed anywhere a TTSAdapter is
|
|
4438
|
-
* expected.
|
|
4439
|
-
*
|
|
4440
|
-
* Behaviour notes
|
|
4441
|
-
* - WebSocket is opened **per-utterance** (matches HTTP semantics). A
|
|
4442
|
-
* future revision may pool a WS across utterances of the same call
|
|
4443
|
-
* session — see roadmap Phase 5b.
|
|
4444
|
-
* - `auto_mode=true` is enabled by default. Pass `autoMode: false` to
|
|
4445
|
-
* send a custom `chunk_length_schedule`.
|
|
4446
|
-
* - `outputFormat` is exposed as a query parameter so `ulaw_8000` (Twilio
|
|
4447
|
-
* native) and `pcm_16000` (Telnyx native) work without resampling.
|
|
4448
|
-
* - `eleven_v3` is **not** supported — the WS endpoint rejects it.
|
|
4449
|
-
* - `optimize_streaming_latency` is officially deprecated and is not
|
|
4450
|
-
* exposed.
|
|
4451
|
-
*/
|
|
4452
|
-
|
|
4453
|
-
/** Constructor options for {@link ElevenLabsWebSocketTTS}. */
|
|
4454
|
-
interface ElevenLabsWebSocketTTSOptions {
|
|
4455
|
-
apiKey: string;
|
|
4456
|
-
voiceId?: string;
|
|
4457
|
-
modelId?: ElevenLabsModel | string;
|
|
4458
|
-
outputFormat?: string;
|
|
4459
|
-
voiceSettings?: Record<string, unknown>;
|
|
4460
|
-
languageCode?: string;
|
|
4461
|
-
/** Let the server pick chunk timing. Default true. */
|
|
4462
|
-
autoMode?: boolean;
|
|
4463
|
-
/** WS keep-alive timeout in seconds (5–180). Default 60. */
|
|
4464
|
-
inactivityTimeout?: number;
|
|
4465
|
-
/**
|
|
4466
|
-
* Manual chunk schedule, only used when ``autoMode: false``. Each value
|
|
4467
|
-
* must be 5–500. ElevenLabs default is ``[120, 160, 250, 290]``.
|
|
4468
|
-
*/
|
|
4469
|
-
chunkLengthSchedule?: number[];
|
|
4470
|
-
/** Outgoing audio re-chunk size in bytes. Default 4096. */
|
|
4471
|
-
chunkSize?: number;
|
|
4472
|
-
}
|
|
4473
|
-
/** WebSocket-based ElevenLabs TTS adapter — opt-in low-latency variant. */
|
|
4474
|
-
declare class ElevenLabsWebSocketTTS implements TTSAdapter {
|
|
4475
|
-
static readonly providerKey = "elevenlabs_ws";
|
|
4476
|
-
readonly apiKey: string;
|
|
4477
|
-
readonly voiceId: string;
|
|
4478
|
-
readonly modelId: string;
|
|
4479
|
-
readonly voiceSettings?: Record<string, unknown>;
|
|
4480
|
-
readonly languageCode?: string;
|
|
4481
|
-
readonly autoMode: boolean;
|
|
4482
|
-
readonly inactivityTimeout: number;
|
|
4483
|
-
readonly chunkLengthSchedule?: number[];
|
|
4484
|
-
readonly chunkSize: number;
|
|
4485
|
-
/**
|
|
4486
|
-
* The wire format requested over the ElevenLabs WS. Initially set from
|
|
4487
|
-
* the constructor; ``setTelephonyCarrier`` may auto-flip it to the
|
|
4488
|
-
* carrier's native codec when the caller did NOT pass ``outputFormat``
|
|
4489
|
-
* explicitly.
|
|
4490
|
-
*/
|
|
4491
|
-
private _outputFormat;
|
|
4492
|
-
private readonly _outputFormatExplicit;
|
|
4493
|
-
/** Public read-only view of the (possibly auto-flipped) wire format. */
|
|
4494
|
-
get outputFormat(): string;
|
|
4495
|
-
constructor(opts: ElevenLabsWebSocketTTSOptions);
|
|
4496
|
-
/**
|
|
4497
|
-
* Hook called by ``StreamHandler`` to advise the carrier wire format.
|
|
4498
|
-
*
|
|
4499
|
-
* When the user did NOT pass an explicit ``outputFormat`` in the
|
|
4500
|
-
* constructor options, this flips the format to the carrier's native
|
|
4501
|
-
* wire codec — saving a client-side transcode step. Calling with an
|
|
4502
|
-
* unknown carrier (``""`` / ``"custom"``) is a no-op.
|
|
4503
|
-
*
|
|
4504
|
-
* When ``outputFormat`` was explicitly passed (incl. via the
|
|
4505
|
-
* ``forTwilio`` / ``forTelnyx`` factories), this method is a no-op —
|
|
4506
|
-
* the user's choice always wins.
|
|
4507
|
-
*/
|
|
4508
|
-
setTelephonyCarrier(carrier: string): void;
|
|
4509
|
-
/** Pre-configured for Twilio Media Streams (`ulaw_8000`). */
|
|
4510
|
-
static forTwilio(opts: Omit<ElevenLabsWebSocketTTSOptions, 'outputFormat'>): ElevenLabsWebSocketTTS;
|
|
4511
|
-
/** Pre-configured for Telnyx (`pcm_16000`). */
|
|
4512
|
-
static forTelnyx(opts: Omit<ElevenLabsWebSocketTTSOptions, 'outputFormat'>): ElevenLabsWebSocketTTS;
|
|
4513
|
-
private buildUrl;
|
|
4514
|
-
/**
|
|
4515
|
-
* Single-shot synthesis: open WS, send text, yield bytes, close.
|
|
4516
|
-
*
|
|
4517
|
-
* Resilience contract:
|
|
4518
|
-
* - Connection bounded by ``CONNECT_TIMEOUT_MS`` (5s, was 15s).
|
|
4519
|
-
* - Each idle wait bounded by ``FRAME_TIMEOUT_MS`` (30s) so a stalled
|
|
4520
|
-
* server cannot keep the generator alive indefinitely.
|
|
4521
|
-
* - Permanent error handler attached BEFORE the open await — prevents
|
|
4522
|
-
* ``uncaughtException`` if an error fires after the once-listener
|
|
4523
|
-
* resolves.
|
|
4524
|
-
* - All event listeners removed in ``finally`` (no closure leak past
|
|
4525
|
-
* socket close).
|
|
4526
|
-
* - Server-reported ``error`` raises ``ElevenLabsTTSError``.
|
|
4527
|
-
* - Per-frame audio payload capped at ``MAX_AUDIO_B64_BYTES``.
|
|
4528
|
-
* - Best-effort EOS ``{"text":""}`` sent in finally (not immediately
|
|
4529
|
-
* after flush — auto_mode could otherwise truncate the tail audio).
|
|
4530
|
-
*/
|
|
4531
|
-
synthesizeStream(text: string): AsyncGenerator<Buffer>;
|
|
4532
|
-
/** No-op — connections are per-utterance and torn down inside synthesizeStream. */
|
|
4533
|
-
close(): Promise<void>;
|
|
4534
|
-
}
|
|
4535
|
-
|
|
4536
5346
|
/** ElevenLabs WebSocket TTS for Patter pipeline mode (opt-in low-latency). */
|
|
4537
5347
|
|
|
4538
5348
|
/** Constructor options for the ElevenLabs WebSocket `TTS` adapter. */
|
|
@@ -4595,6 +5405,8 @@ declare class OpenAITTS {
|
|
|
4595
5405
|
private readonly speed;
|
|
4596
5406
|
private readonly antiAlias;
|
|
4597
5407
|
private readonly targetSampleRate;
|
|
5408
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5409
|
+
static readonly providerKey = "openai_tts";
|
|
4598
5410
|
constructor(apiKey: string, voice?: string, model?: string, instructions?: string | null, speed?: number | null, antiAlias?: boolean, targetSampleRate?: number);
|
|
4599
5411
|
/**
|
|
4600
5412
|
* Synthesise text to speech and return the full audio as a single Buffer.
|
|
@@ -4736,6 +5548,8 @@ interface CartesiaTTSOptions$1 {
|
|
|
4736
5548
|
}
|
|
4737
5549
|
/** Cartesia TTS provider backed by the HTTP `/tts/bytes` streaming endpoint. */
|
|
4738
5550
|
declare class CartesiaTTS {
|
|
5551
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5552
|
+
static readonly providerKey = "cartesia_tts";
|
|
4739
5553
|
private readonly apiKey;
|
|
4740
5554
|
private readonly model;
|
|
4741
5555
|
private readonly voice;
|
|
@@ -4768,6 +5582,25 @@ declare class CartesiaTTS {
|
|
|
4768
5582
|
static forTelnyx(apiKey: string, options?: Omit<CartesiaTTSOptions$1, 'sampleRate'>): CartesiaTTS;
|
|
4769
5583
|
/** Build the JSON payload for the Cartesia bytes endpoint. */
|
|
4770
5584
|
private buildPayload;
|
|
5585
|
+
/**
|
|
5586
|
+
* Pre-call HTTP warmup for the Cartesia `/tts/bytes` endpoint.
|
|
5587
|
+
*
|
|
5588
|
+
* Issues a lightweight `GET <baseUrl>/voices` so DNS, TLS, and HTTP/2
|
|
5589
|
+
* are already up by the time the first `synthesizeStream()` POST
|
|
5590
|
+
* lands. Best-effort: 5 s timeout, all exceptions swallowed at
|
|
5591
|
+
* debug level.
|
|
5592
|
+
*
|
|
5593
|
+
* Billing safety: `GET /voices` is a free metadata read on
|
|
5594
|
+
* Cartesia's REST surface (per https://docs.cartesia.ai). It does
|
|
5595
|
+
* not consume synthesis credits. The actual synthesis is billed
|
|
5596
|
+
* only when `POST /tts/bytes` runs with a non-empty `transcript`.
|
|
5597
|
+
*
|
|
5598
|
+
* Note: Cartesia TTS uses the HTTP path (vs the WebSocket variant
|
|
5599
|
+
* Cartesia also exposes) — connection warmup is therefore HTTP-GET
|
|
5600
|
+
* based, not WebSocket pre-handshake. The latency win is smaller
|
|
5601
|
+
* (~50-150 ms vs the ~200-500 ms of a WS prewarm) but still real.
|
|
5602
|
+
*/
|
|
5603
|
+
warmup(): Promise<void>;
|
|
4771
5604
|
/** Synthesize text and return the concatenated audio buffer. */
|
|
4772
5605
|
synthesize(text: string): Promise<Buffer>;
|
|
4773
5606
|
/**
|
|
@@ -4843,6 +5676,8 @@ interface RimeTTSOptions$1 {
|
|
|
4843
5676
|
}
|
|
4844
5677
|
/** Rime TTS adapter for the `users.rime.ai/v1/rime-tts` HTTP streaming endpoint. */
|
|
4845
5678
|
declare class RimeTTS {
|
|
5679
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5680
|
+
static readonly providerKey = "rime";
|
|
4846
5681
|
private readonly apiKey;
|
|
4847
5682
|
private readonly model;
|
|
4848
5683
|
private readonly speaker;
|
|
@@ -5001,6 +5836,8 @@ interface InworldTTSOptions$1 {
|
|
|
5001
5836
|
* before calling the constructor.
|
|
5002
5837
|
*/
|
|
5003
5838
|
declare class InworldTTS {
|
|
5839
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5840
|
+
static readonly providerKey = "inworld";
|
|
5004
5841
|
private readonly authToken;
|
|
5005
5842
|
private readonly model;
|
|
5006
5843
|
private readonly voice;
|
|
@@ -5014,6 +5851,33 @@ declare class InworldTTS {
|
|
|
5014
5851
|
private readonly baseUrl;
|
|
5015
5852
|
constructor(authToken: string, opts?: InworldTTSOptions$1);
|
|
5016
5853
|
private buildPayload;
|
|
5854
|
+
/**
|
|
5855
|
+
* Pre-call HTTP warmup for the Inworld TTS API.
|
|
5856
|
+
*
|
|
5857
|
+
* Issues a lightweight `GET /tts/v1/voices` against the API host so
|
|
5858
|
+
* DNS + TLS + HTTP/2 connection are already up by the time the first
|
|
5859
|
+
* `synthesizeStream()` POST lands. Best-effort: 5 s timeout, all
|
|
5860
|
+
* exceptions swallowed at debug level.
|
|
5861
|
+
*
|
|
5862
|
+
* Earlier revisions issued `HEAD` against the streaming endpoint
|
|
5863
|
+
* (`/tts/v1/voice:stream`). That endpoint is POST-only so HEAD
|
|
5864
|
+
* returns `405 Method Not Allowed` — the warmup still completed the
|
|
5865
|
+
* TLS handshake but spammed 405 errors into Inworld's audit logs and
|
|
5866
|
+
* into our own logs. Switching to a documented `GET /tts/v1/voices`
|
|
5867
|
+
* metadata read is a 2xx-clean equivalent.
|
|
5868
|
+
*
|
|
5869
|
+
* Billing safety: `GET /tts/v1/voices` is a free metadata endpoint
|
|
5870
|
+
* (per https://docs.inworld.ai/). It returns the voice catalogue
|
|
5871
|
+
* without invoking the synthesis pipeline. The actual synthesis is
|
|
5872
|
+
* billed only when `POST /tts/v1/voice:stream` runs with a non-empty
|
|
5873
|
+
* `text`.
|
|
5874
|
+
*
|
|
5875
|
+
* Note: Inworld TTS uses the HTTP NDJSON streaming path rather than
|
|
5876
|
+
* a persistent WebSocket — connection warmup is therefore HTTP-based,
|
|
5877
|
+
* not WebSocket pre-handshake. The latency win is smaller (~50-150 ms)
|
|
5878
|
+
* than the WS-based prewarms but still real on cold-start calls.
|
|
5879
|
+
*/
|
|
5880
|
+
warmup(): Promise<void>;
|
|
5017
5881
|
/** Synthesize text and return the concatenated audio buffer. */
|
|
5018
5882
|
synthesize(text: string): Promise<Buffer>;
|
|
5019
5883
|
/**
|
|
@@ -5143,6 +6007,8 @@ interface AnthropicLLMOptions$1 {
|
|
|
5143
6007
|
}
|
|
5144
6008
|
/** LLM provider backed by Anthropic's Messages API (streaming). */
|
|
5145
6009
|
declare class AnthropicLLMProvider implements LLMProvider {
|
|
6010
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
6011
|
+
static readonly providerKey = "anthropic";
|
|
5146
6012
|
private readonly apiKey;
|
|
5147
6013
|
private readonly model;
|
|
5148
6014
|
private readonly maxTokens;
|
|
@@ -5151,6 +6017,13 @@ declare class AnthropicLLMProvider implements LLMProvider {
|
|
|
5151
6017
|
private readonly anthropicVersion;
|
|
5152
6018
|
private readonly promptCaching;
|
|
5153
6019
|
constructor(options: AnthropicLLMOptions$1);
|
|
6020
|
+
/**
|
|
6021
|
+
* Pre-call DNS / TLS warmup for the Anthropic Messages API.
|
|
6022
|
+
* Issues a lightweight ``GET https://api.anthropic.com/v1/models`` so
|
|
6023
|
+
* DNS, TLS and HTTP/2 are already up by the time the first ``messages``
|
|
6024
|
+
* call lands. Best-effort: 5 s timeout, exceptions swallowed at debug.
|
|
6025
|
+
*/
|
|
6026
|
+
warmup(): Promise<void>;
|
|
5154
6027
|
/** Stream Patter-format LLM chunks for the given OpenAI-style chat history. */
|
|
5155
6028
|
stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
|
|
5156
6029
|
}
|
|
@@ -5238,6 +6111,8 @@ interface GroqLLMOptions$1 {
|
|
|
5238
6111
|
}
|
|
5239
6112
|
/** LLM provider backed by Groq's OpenAI-compatible Chat Completions API. */
|
|
5240
6113
|
declare class GroqLLMProvider implements LLMProvider {
|
|
6114
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
6115
|
+
static readonly providerKey = "groq";
|
|
5241
6116
|
private readonly apiKey;
|
|
5242
6117
|
readonly model: string;
|
|
5243
6118
|
private readonly baseUrl;
|
|
@@ -5252,6 +6127,11 @@ declare class GroqLLMProvider implements LLMProvider {
|
|
|
5252
6127
|
private readonly presencePenalty?;
|
|
5253
6128
|
private readonly stop?;
|
|
5254
6129
|
constructor(options: GroqLLMOptions$1);
|
|
6130
|
+
/**
|
|
6131
|
+
* Pre-call DNS / TLS warmup for the Groq inference endpoint.
|
|
6132
|
+
* Best-effort: 5 s timeout, all exceptions swallowed at debug level.
|
|
6133
|
+
*/
|
|
6134
|
+
warmup(): Promise<void>;
|
|
5255
6135
|
/** Stream Patter-format LLM chunks from the Groq chat completions API. */
|
|
5256
6136
|
stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
|
|
5257
6137
|
}
|
|
@@ -5371,6 +6251,8 @@ interface CerebrasLLMOptions$1 {
|
|
|
5371
6251
|
* - zai-glm-4.7
|
|
5372
6252
|
*/
|
|
5373
6253
|
declare class CerebrasLLMProvider implements LLMProvider {
|
|
6254
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
6255
|
+
static readonly providerKey = "cerebras";
|
|
5374
6256
|
private readonly apiKey;
|
|
5375
6257
|
readonly model: string;
|
|
5376
6258
|
private readonly baseUrl;
|
|
@@ -5386,6 +6268,11 @@ declare class CerebrasLLMProvider implements LLMProvider {
|
|
|
5386
6268
|
private readonly presencePenalty?;
|
|
5387
6269
|
private readonly stop?;
|
|
5388
6270
|
constructor(options: CerebrasLLMOptions$1);
|
|
6271
|
+
/**
|
|
6272
|
+
* Pre-call DNS / TLS warmup for the Cerebras inference endpoint.
|
|
6273
|
+
* Best-effort: 5 s timeout, all exceptions swallowed at debug level.
|
|
6274
|
+
*/
|
|
6275
|
+
warmup(): Promise<void>;
|
|
5389
6276
|
/** Stream Patter-format LLM chunks from the Cerebras chat completions API. */
|
|
5390
6277
|
stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
|
|
5391
6278
|
}
|
|
@@ -5468,12 +6355,22 @@ interface GoogleLLMOptions$1 {
|
|
|
5468
6355
|
}
|
|
5469
6356
|
/** LLM provider backed by Google Gemini (Developer API, streaming SSE). */
|
|
5470
6357
|
declare class GoogleLLMProvider implements LLMProvider {
|
|
6358
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
6359
|
+
static readonly providerKey = "google";
|
|
5471
6360
|
private readonly apiKey;
|
|
5472
6361
|
readonly model: string;
|
|
5473
6362
|
private readonly baseUrl;
|
|
5474
6363
|
private readonly temperature?;
|
|
5475
6364
|
private readonly maxOutputTokens?;
|
|
5476
6365
|
constructor(options: GoogleLLMOptions$1);
|
|
6366
|
+
/**
|
|
6367
|
+
* Pre-call DNS / TLS warmup for the Gemini API.
|
|
6368
|
+
* Issues a lightweight ``GET ${baseUrl}/models?key=...`` so DNS, TLS
|
|
6369
|
+
* and HTTP/2 are already up by the time the first
|
|
6370
|
+
* ``streamGenerateContent`` call lands. Best-effort: 5 s timeout, all
|
|
6371
|
+
* exceptions swallowed at debug level.
|
|
6372
|
+
*/
|
|
6373
|
+
warmup(): Promise<void>;
|
|
5477
6374
|
/** Stream Patter-format LLM chunks from the Gemini SSE endpoint. */
|
|
5478
6375
|
stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
|
|
5479
6376
|
}
|
|
@@ -5597,7 +6494,10 @@ declare class SileroVAD implements VADProvider {
|
|
|
5597
6494
|
* - `activationThreshold = 0.5` — upstream `threshold`
|
|
5598
6495
|
* - `deactivationThreshold = 0.35` — upstream `neg_threshold = threshold - 0.15`
|
|
5599
6496
|
* - `minSpeechDuration = 0.25` — upstream `min_speech_duration_ms = 250`
|
|
5600
|
-
* - `minSilenceDuration = 0.
|
|
6497
|
+
* - `minSilenceDuration = 0.4` — telephony default (was 0.1, bumped after
|
|
6498
|
+
* round 10f found speech_end firing on inter-sentence pauses < 250 ms,
|
|
6499
|
+
* causing double-talk dispatch). 400 ms matches the industry telephony
|
|
6500
|
+
* default and the inter_utterance_gap_ms debounce in stream-handler.ts.
|
|
5601
6501
|
* - `prefixPaddingDuration = 0.03` — upstream `speech_pad_ms = 30`
|
|
5602
6502
|
*
|
|
5603
6503
|
* Override any field by passing `options`. Deployments that experience
|
|
@@ -5639,6 +6539,263 @@ declare class SileroVAD implements VADProvider {
|
|
|
5639
6539
|
private advanceState;
|
|
5640
6540
|
/** Mark the VAD as closed; subsequent processFrame calls throw. */
|
|
5641
6541
|
close(): Promise<void>;
|
|
6542
|
+
/**
|
|
6543
|
+
* Reset all per-utterance state so the next ``processFrame`` starts from
|
|
6544
|
+
* a clean SILENCE state.
|
|
6545
|
+
*
|
|
6546
|
+
* Called by the stream handler between agent turns to prevent a "stuck
|
|
6547
|
+
* SPEECH" condition where PSTN echo / loopback kept the detector's
|
|
6548
|
+
* probability above ``deactivationThreshold`` for the entire agent turn.
|
|
6549
|
+
* Without this reset the next user utterance would never trigger a
|
|
6550
|
+
* SILENCE→SPEECH transition and barge-in would feel "one-shot" (works
|
|
6551
|
+
* once, then never again until the call ends).
|
|
6552
|
+
*
|
|
6553
|
+
* Safe to call any time including on a closed instance (no-op).
|
|
6554
|
+
*/
|
|
6555
|
+
reset(): void;
|
|
6556
|
+
}
|
|
6557
|
+
|
|
6558
|
+
/** Options accepted by {@link DeepFilterNetFilter}. */
|
|
6559
|
+
interface DeepFilterNetOptions {
|
|
6560
|
+
/** Absolute path to a DeepFilterNet ONNX model. If omitted, the filter
|
|
6561
|
+
* logs a warning and becomes a pass-through. */
|
|
6562
|
+
modelPath?: string;
|
|
6563
|
+
/** When true, disable the pass-through warning (used by tests). */
|
|
6564
|
+
silenceWarnings?: boolean;
|
|
6565
|
+
}
|
|
6566
|
+
/** OSS noise-suppression filter backed by a DeepFilterNet ONNX model. */
|
|
6567
|
+
declare class DeepFilterNetFilter implements AudioFilter {
|
|
6568
|
+
private readonly modelPath;
|
|
6569
|
+
private readonly silenceWarnings;
|
|
6570
|
+
private session;
|
|
6571
|
+
private ort;
|
|
6572
|
+
private warned;
|
|
6573
|
+
private closed;
|
|
6574
|
+
private _resamplerSrcRate;
|
|
6575
|
+
private _upsamplerInst;
|
|
6576
|
+
private _downsamplerInst;
|
|
6577
|
+
constructor(options?: DeepFilterNetOptions);
|
|
6578
|
+
private ensureSession;
|
|
6579
|
+
/** Run noise suppression on a PCM16 chunk; pass-through when no model is loaded. */
|
|
6580
|
+
process(pcmChunk: Buffer, sampleRate: number): Promise<Buffer>;
|
|
6581
|
+
/** Flush resamplers, release the ONNX session, and mark the filter closed. */
|
|
6582
|
+
close(): Promise<void>;
|
|
6583
|
+
}
|
|
6584
|
+
|
|
6585
|
+
/**
|
|
6586
|
+
* Krisp VIVA noise-reduction AudioFilter — TypeScript scaffold.
|
|
6587
|
+
*
|
|
6588
|
+
* Mirrors the API of the Python `getpatter.providers.krisp_filter.KrispVivaFilter`
|
|
6589
|
+
* for SDK parity. As of 2026-05 Krisp does not publish an official Node.js
|
|
6590
|
+
* (server) SDK; third-party browser/RN wrappers exist but cannot process
|
|
6591
|
+
* server-received PCM/mulaw audio. This class throws at construction time
|
|
6592
|
+
* and points the caller at the available paths (Python SDK or DeepFilterNet
|
|
6593
|
+
* on TS).
|
|
6594
|
+
*
|
|
6595
|
+
* When Krisp publishes an official Node binding — or a community NAPI/WASM
|
|
6596
|
+
* wrapper becomes available — the import below and `process()` body will
|
|
6597
|
+
* fill in. The class signature is intentionally compatible with the Python
|
|
6598
|
+
* one so callers do not need to migrate code: `camelCase` ↔ `snake_case`,
|
|
6599
|
+
* `modelPath` ↔ `model_path`, etc.
|
|
6600
|
+
*
|
|
6601
|
+
* Krisp VIVA is a proprietary SDK and requires a commercial license plus a
|
|
6602
|
+
* `.kef` model file provided by the user. Patter ships only the
|
|
6603
|
+
* AudioFilter interface scaffold — never the SDK or model.
|
|
6604
|
+
*
|
|
6605
|
+
* @see https://krisp.ai/developers/
|
|
6606
|
+
*/
|
|
6607
|
+
|
|
6608
|
+
/** Krisp-supported sample rates (parity with Python `KrispSampleRate`). */
|
|
6609
|
+
declare const KrispSampleRate: {
|
|
6610
|
+
readonly HZ_8000: 8000;
|
|
6611
|
+
readonly HZ_16000: 16000;
|
|
6612
|
+
readonly HZ_32000: 32000;
|
|
6613
|
+
readonly HZ_44100: 44100;
|
|
6614
|
+
readonly HZ_48000: 48000;
|
|
6615
|
+
};
|
|
6616
|
+
type KrispSampleRate = (typeof KrispSampleRate)[keyof typeof KrispSampleRate];
|
|
6617
|
+
/** Krisp-supported frame durations in ms (parity with Python `KrispFrameDuration`). */
|
|
6618
|
+
declare const KrispFrameDuration: {
|
|
6619
|
+
readonly MS_10: 10;
|
|
6620
|
+
readonly MS_15: 15;
|
|
6621
|
+
readonly MS_20: 20;
|
|
6622
|
+
readonly MS_30: 30;
|
|
6623
|
+
readonly MS_32: 32;
|
|
6624
|
+
};
|
|
6625
|
+
type KrispFrameDuration = (typeof KrispFrameDuration)[keyof typeof KrispFrameDuration];
|
|
6626
|
+
/** Options accepted by {@link KrispVivaFilter}. */
|
|
6627
|
+
interface KrispVivaFilterOptions {
|
|
6628
|
+
/**
|
|
6629
|
+
* Path to the Krisp `.kef` model file. If omitted, falls back to the
|
|
6630
|
+
* `KRISP_VIVA_FILTER_MODEL_PATH` environment variable.
|
|
6631
|
+
*/
|
|
6632
|
+
readonly modelPath?: string;
|
|
6633
|
+
/** Noise-suppression strength in `[0, 100]`. Defaults to `100`. */
|
|
6634
|
+
readonly noiseSuppressionLevel?: number;
|
|
6635
|
+
/** Frame duration in ms. One of `10, 15, 20, 30, 32`. Defaults to `10`. */
|
|
6636
|
+
readonly frameDurationMs?: KrispFrameDuration | number;
|
|
6637
|
+
/** Initial sample rate in Hz. Defaults to `16000`. Re-created lazily if it changes mid-call. */
|
|
6638
|
+
readonly sampleRate?: KrispSampleRate | number;
|
|
6639
|
+
}
|
|
6640
|
+
/**
|
|
6641
|
+
* Krisp VIVA noise-reduction filter — TypeScript scaffold (NOT YET IMPLEMENTED).
|
|
6642
|
+
*
|
|
6643
|
+
* Construction throws with a guidance message because Krisp does not ship a
|
|
6644
|
+
* Node.js SDK. The class exists for API parity with the Python
|
|
6645
|
+
* `KrispVivaFilter` so that user code does not need to be rewritten when a
|
|
6646
|
+
* Node binding lands.
|
|
6647
|
+
*
|
|
6648
|
+
* For TS users today, use {@link DeepFilterNetFilter} from
|
|
6649
|
+
* `./deepfilternet-filter` instead — same `AudioFilter` interface, no
|
|
6650
|
+
* license required.
|
|
6651
|
+
*
|
|
6652
|
+
* @example
|
|
6653
|
+
* ```ts
|
|
6654
|
+
* // FUTURE — when Krisp publishes a Node SDK:
|
|
6655
|
+
* import { KrispVivaFilter } from 'getpatter/providers/krisp-filter';
|
|
6656
|
+
* const filter = new KrispVivaFilter({ modelPath: '/path/to/model.kef' });
|
|
6657
|
+
* const agent = phone.agent({ audioFilter: filter, ... });
|
|
6658
|
+
* ```
|
|
6659
|
+
*/
|
|
6660
|
+
declare class KrispVivaFilter implements AudioFilter {
|
|
6661
|
+
static readonly providerKey = "krisp_viva";
|
|
6662
|
+
constructor(_options?: KrispVivaFilterOptions);
|
|
6663
|
+
process(pcmChunk: Buffer, _sampleRate: number): Promise<Buffer>;
|
|
6664
|
+
close(): Promise<void>;
|
|
6665
|
+
}
|
|
6666
|
+
|
|
6667
|
+
/**
|
|
6668
|
+
* OpenAI Realtime adapter for the GA Realtime API (`gpt-realtime-2`).
|
|
6669
|
+
*
|
|
6670
|
+
* `gpt-realtime-2` is served from the same `wss://api.openai.com/v1/realtime`
|
|
6671
|
+
* endpoint as the v1-beta family, but the GA endpoint:
|
|
6672
|
+
* - REJECTS the legacy `OpenAI-Beta: realtime=v1` header (returns
|
|
6673
|
+
* `invalid_model` with message "Model X is only available on the GA API").
|
|
6674
|
+
* - REQUIRES `session.type === "realtime"` at the root of `session.update`.
|
|
6675
|
+
* - Uses `output_modalities` (was `modalities`).
|
|
6676
|
+
* - Nests audio config under `audio.{input,output}` with MIME `type`
|
|
6677
|
+
* strings (`audio/pcmu`, `audio/pcma`, `audio/pcm`) instead of the v1
|
|
6678
|
+
* enum strings (`g711_ulaw`, `g711_alaw`, `pcm16`) and moves `voice`
|
|
6679
|
+
* under `audio.output.voice`, `transcription` + `turn_detection`
|
|
6680
|
+
* under `audio.input`.
|
|
6681
|
+
*
|
|
6682
|
+
* Everything ELSE (event names, audio delta dispatch, barge-in / truncate
|
|
6683
|
+
* semantics, heartbeat, tool calling) is API-compatible with the v1 family,
|
|
6684
|
+
* so this adapter subclasses {@link OpenAIRealtimeAdapter} and overrides
|
|
6685
|
+
* only `connect()`. The runtime behaviour (`sendAudio`, `cancelResponse`,
|
|
6686
|
+
* `sendText`, `sendFirstMessage`, …) is inherited unchanged.
|
|
6687
|
+
*/
|
|
6688
|
+
|
|
6689
|
+
/**
|
|
6690
|
+
* Realtime WebSocket adapter speaking OpenAI's GA Realtime API.
|
|
6691
|
+
*
|
|
6692
|
+
* Note on audio transport: the GA endpoint accepts only PCM-16-LE with
|
|
6693
|
+
* `rate >= 24000` for both `session.audio.input.format` and
|
|
6694
|
+
* `session.audio.output.format`. The `audio/pcmu` MIME type appears to be
|
|
6695
|
+
* accepted at the protocol level but the server's audio engine does not
|
|
6696
|
+
* actually decode mulaw 8 kHz frames — they're silently dropped, the input
|
|
6697
|
+
* buffer stays empty, `input_audio_buffer.commit` returns
|
|
6698
|
+
* "buffer only has 0.00ms of audio", and the call ends up muted. Until
|
|
6699
|
+
* OpenAI documents native g711_ulaw on the GA endpoint we transcode on
|
|
6700
|
+
* both directions on the Patter side:
|
|
6701
|
+
* - inbound (Twilio/Telnyx → model): mulaw 8 kHz → PCM 24 kHz
|
|
6702
|
+
* - outbound (model → Twilio/Telnyx): PCM 24 kHz → mulaw 8 kHz
|
|
6703
|
+
*
|
|
6704
|
+
* The outbound path needs a stateful resampler instance because the
|
|
6705
|
+
* 24 kHz → 8 kHz decimator carries phase between chunks; sharing a single
|
|
6706
|
+
* instance across the call eliminates the boundary clicks a stateless
|
|
6707
|
+
* helper would produce on every audio delta.
|
|
6708
|
+
*/
|
|
6709
|
+
declare class OpenAIRealtime2Adapter extends OpenAIRealtimeAdapter {
|
|
6710
|
+
/** Two-stage outbound resampler for 24 kHz → 8 kHz. Created lazily on
|
|
6711
|
+
* the first audio frame so each Realtime session has its own state.
|
|
6712
|
+
*
|
|
6713
|
+
* We chain `24k → 16k → 8k` instead of using the direct `24k → 8k`
|
|
6714
|
+
* variant of {@link StatefulResampler}: the direct path is a 3:1
|
|
6715
|
+
* decimation with linear interpolation only — no anti-alias filter
|
|
6716
|
+
* — so any energy above 4 kHz in the source aliases down into the
|
|
6717
|
+
* audible band and is heard as raspy/scratchy artefacts on speech.
|
|
6718
|
+
* `gpt-realtime-2` outputs voice with significant content above
|
|
6719
|
+
* 4 kHz. The second stage (16k → 8k) uses a 5-tap FIR anti-alias
|
|
6720
|
+
* filter which removes the offending band before decimation, and
|
|
6721
|
+
* empirically (see commit message) the chain produces audibly
|
|
6722
|
+
* cleaner output. The 24k → 16k step is still pure linear-interp
|
|
6723
|
+
* but the inputs to it stay below the Nyquist of the 16 kHz stage,
|
|
6724
|
+
* so it doesn't introduce new artefacts.
|
|
6725
|
+
*/
|
|
6726
|
+
private outboundResampler24To16;
|
|
6727
|
+
private outboundResampler16To8;
|
|
6728
|
+
/** Last 8 kHz input sample carried across chunk boundaries for the
|
|
6729
|
+
* direct 3× linear upsample (see `transcodeInboundMulaw8ToPcm24`).
|
|
6730
|
+
* The carry guarantees the very first output of each chunk
|
|
6731
|
+
* interpolates from the *real* preceding sample, not from the chunk's
|
|
6732
|
+
* own first sample replicated — without it every 20 ms Twilio frame
|
|
6733
|
+
* boundary becomes a small DC step that the GA server VAD interprets
|
|
6734
|
+
* as constant low-energy noise, which never crosses the speech
|
|
6735
|
+
* threshold. */
|
|
6736
|
+
private inbound8kCarry;
|
|
6737
|
+
/** GA-shape `session.update` payload. See module-level docstring. */
|
|
6738
|
+
private buildGASessionConfig;
|
|
6739
|
+
/**
|
|
6740
|
+
* Open the Realtime WebSocket against the GA endpoint and apply the GA
|
|
6741
|
+
* session configuration. Header `OpenAI-Beta: realtime=v1` is OMITTED
|
|
6742
|
+
* (the GA endpoint rejects it). Wire shape uses nested `audio.{input,
|
|
6743
|
+
* output}` + `output_modalities` + `session.type === "realtime"`.
|
|
6744
|
+
*/
|
|
6745
|
+
connect(): Promise<void>;
|
|
6746
|
+
/**
|
|
6747
|
+
* GA-API variant of {@link OpenAIRealtimeAdapter.sendFirstMessage}. Two
|
|
6748
|
+
* differences from the v1 path:
|
|
6749
|
+
*
|
|
6750
|
+
* 1. The v1 implementation sends `response.modalities` which the GA
|
|
6751
|
+
* endpoint rejects with `Unknown parameter: 'response.modalities'`.
|
|
6752
|
+
* Use `output_modalities` to match the GA `session.update` shape.
|
|
6753
|
+
*
|
|
6754
|
+
* 2. The GA `response.create` does NOT inherit `audio.output.voice`
|
|
6755
|
+
* from the session — it falls back to the server-side default
|
|
6756
|
+
* (`marin`, female) when the field is omitted on the response
|
|
6757
|
+
* itself. Session-level `voice: "alloy"` only affects subsequent
|
|
6758
|
+
* server-VAD-triggered responses, NOT this explicit
|
|
6759
|
+
* `response.create`. We re-inject the configured voice here so the
|
|
6760
|
+
* first-message voice matches the rest of the call.
|
|
6761
|
+
*/
|
|
6762
|
+
/**
|
|
6763
|
+
* Override the parent `sendAudio` to transcode inbound carrier audio
|
|
6764
|
+
* (mulaw 8 kHz from Twilio/Telnyx) into PCM-16 24 kHz before sending
|
|
6765
|
+
* `input_audio_buffer.append`. The GA server's audio engine ignores
|
|
6766
|
+
* mulaw frames (commit returns "buffer only has 0.00ms of audio") even
|
|
6767
|
+
* though it accepts `audio/pcmu` at the protocol level.
|
|
6768
|
+
*/
|
|
6769
|
+
sendAudio(mulawAudio: Buffer): void;
|
|
6770
|
+
/**
|
|
6771
|
+
* mulaw 8 kHz Buffer → PCM-16-LE 24 kHz Buffer.
|
|
6772
|
+
*
|
|
6773
|
+
* Direct 3× linear-interpolation upsample with a one-sample carry
|
|
6774
|
+
* across chunk boundaries. For every consecutive pair of 8 kHz
|
|
6775
|
+
* samples `(s_a, s_b)` we emit three 24 kHz samples:
|
|
6776
|
+
*
|
|
6777
|
+
* out_0 = s_a
|
|
6778
|
+
* out_1 = 2/3·s_a + 1/3·s_b
|
|
6779
|
+
* out_2 = 1/3·s_a + 2/3·s_b
|
|
6780
|
+
*
|
|
6781
|
+
* The carry stores the last 8 kHz sample of the chunk so the next
|
|
6782
|
+
* chunk can start by pairing `(carry, firstNewSample)` — that's what
|
|
6783
|
+
* keeps the output rate exact (each input sample → 3 output samples)
|
|
6784
|
+
* and eliminates the chunk-boundary DC step that confused the GA
|
|
6785
|
+
* server VAD. The first chunk has no carry and loses 3 samples at
|
|
6786
|
+
* the leading edge (125 µs of audio at 24 kHz); that's well below any audible
|
|
6787
|
+
* artefact and well below the GA VAD's 300 ms prefix-padding window.
|
|
6788
|
+
*/
|
|
6789
|
+
private transcodeInboundMulaw8ToPcm24;
|
|
6790
|
+
/**
|
|
6791
|
+
* Base64 PCM-16-LE 24 kHz → Base64 mulaw 8 kHz. Used by the WS
|
|
6792
|
+
* translation shim on each `response.output_audio.delta`. The stateful
|
|
6793
|
+
* resampler is created lazily and reused across all deltas in this
|
|
6794
|
+
* session so the 24k → 16k → 8k chain's resampler state carries across chunk
|
|
6795
|
+
* boundaries — without that, every chunk boundary produces a click.
|
|
6796
|
+
*/
|
|
6797
|
+
private transcodeOutboundPcm24ToMulaw8Buffer;
|
|
6798
|
+
sendFirstMessage(text: string): Promise<void>;
|
|
5642
6799
|
}
|
|
5643
6800
|
|
|
5644
6801
|
/**
|
|
@@ -6379,6 +7536,8 @@ declare class TelnyxSTT {
|
|
|
6379
7536
|
private readonly transcriptionEngine;
|
|
6380
7537
|
private readonly sampleRate;
|
|
6381
7538
|
private readonly baseUrl;
|
|
7539
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
7540
|
+
static readonly providerKey = "telnyx_stt";
|
|
6382
7541
|
private ws;
|
|
6383
7542
|
private callbacks;
|
|
6384
7543
|
private headerSent;
|
|
@@ -6425,6 +7584,8 @@ declare class TelnyxTTS {
|
|
|
6425
7584
|
private readonly apiKey;
|
|
6426
7585
|
private readonly voice;
|
|
6427
7586
|
private readonly baseUrl;
|
|
7587
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
7588
|
+
static readonly providerKey = "telnyx_tts";
|
|
6428
7589
|
constructor(apiKey: string, voice?: string, baseUrl?: string);
|
|
6429
7590
|
/** Collect every audio chunk into a single Buffer. */
|
|
6430
7591
|
synthesize(text: string): Promise<Buffer>;
|
|
@@ -6504,4 +7665,4 @@ interface CallEvent {
|
|
|
6504
7665
|
readonly direction?: string;
|
|
6505
7666
|
}
|
|
6506
7667
|
|
|
6507
|
-
export { type AgentOptions, type AgentState, AllProvidersFailedError, type AnthropicConversion, LLM$3 as AnthropicLLM, type AnthropicLLMOptions, type AnthropicMessage, AssemblyAIEncoding, AssemblyAIModel, STT$1 as AssemblyAISTT, type AssemblyAISTTOptions, type AudioConfig, type AudioSource, AuthenticationError, type BackgroundAudioOptions, BackgroundAudioPlayer, BuiltinAudioClip, type BuiltinAudioClipName, type BuiltinPcmSource, type CallControl, type CallEvent, type CallEventHandler, type CallMetrics, CallMetricsAccumulator, type CallRecord, type CartesiaEncoding, STT$3 as CartesiaSTT, type CartesiaSTTOptions, TTS$3 as CartesiaTTS, type CartesiaTTSOptions, LLM$1 as CerebrasLLM, type CerebrasLLMOptions, ChatContext, type ChatMessage, type ChatRole, CloudflareTunnel, type ConversationStateSnapshot, type CostBreakdown, DEFAULT_MIN_SENTENCE_LEN, DEFAULT_PRICING, DTMF_EVENTS, STT$6 as DeepgramSTT, type DeepgramSTTOptions, DefaultToolExecutor, type DefaultToolExecutorOptions, type DefineToolInput, type DtmfEvent, ConvAI as ElevenLabsConvAI, ElevenLabsConvAIAdapter, type ConvAIOptions as ElevenLabsConvAIOptions, TTS$6 as ElevenLabsTTS, type ElevenLabsTTSOptions, type ElevenLabsWebSocketOptions, TTS$5 as ElevenLabsWebSocketTTS, type EouTrigger, ErrorCode, EventBus, FallbackLLMProvider, type FallbackLLMProviderOptions, type FilePcmSource, GEMINI_DEFAULT_INPUT_SR, GEMINI_DEFAULT_OUTPUT_SR, GeminiLiveAdapter, type GeminiLiveEventHandler, LLM as GoogleLLM, type GoogleLLMOptions, LLM$2 as GroqLLM, type GroqLLMOptions, Guardrail$1 as Guardrail, type GuardrailOptions, type HookContext, IVRActivity, type IVRActivityOptions, type IVRToolDefinition, type IncomingMessage, type InitTracingOptions, TTS as InworldTTS, type InworldTTSOptions, type JobCallback, type LLMChunk, LLMLoop, type LLMProvider, LMNTAudioFormat, LMNTModel, LMNTSampleRate, TTS$1 as LMNTTTS, type LMNTTTSOptions, type LatencyBreakdown, type LocalCallOptions, type LocalConfig, type LocalOptions, type Logger, type 
LoopCallback, type MessageHandler, MetricsStore, Ngrok, LLM$4 as OpenAILLM, type OpenAILLMOptions, OpenAILLMProvider, type OpenAIMessage, Realtime as OpenAIRealtime, OpenAIRealtimeAdapter, type RealtimeOptions as OpenAIRealtimeOptions, TTS$4 as OpenAITTS, type OpenAITTSOptions, STT$4 as OpenAITranscribeSTT, type OpenAITranscribeSTTOptions, type ParamSpec, PartialStreamError, Patter, PatterConnectionError, PatterError, type PatterEventType, PatterTool, type PatterToolExecuteArgs, type PatterToolOptions, type PatterToolResult, PcmCarry, PipelineHookExecutor, type PipelineHooks, type PipelineMessageHandler, type ProviderPricing, ProvisionError, RateLimitError, type RawPcmSource, type RealtimeConfig, RemoteMessageHandler, TTS$2 as RimeTTS, type RimeTTSOptions, SPAN_BARGEIN, SPAN_CALL, SPAN_ENDPOINT, SPAN_LLM, SPAN_STT, SPAN_TOOL, SPAN_TTS, type SSEEvent, type STTConfig, type ScheduleHandle, SentenceChunker, type ServeOptions, type SilenceCallback, type SileroSampleRate, SileroVAD, type SileroVADOptions, STT$2 as SonioxSTT, type SonioxSTTOptions$1 as SonioxSTTOptions, type Span, type SpeechEventCallback, SpeechEvents, SpeechmaticsAudioEncoding, SpeechmaticsOperatingPoint, STT as SpeechmaticsSTT, type SpeechmaticsSTTOptions, SpeechmaticsSampleRate, SpeechmaticsServerMessage, TurnDetectionMode as SpeechmaticsTurnDetectionMode, StatefulResampler, type StatefulResamplerOptions, Static as StaticTunnel, type TTSConfig, Carrier as Telnyx, TelnyxAdapter, type TelnyxCarrierOptions, type ConfigureNumberOptions as TelnyxConfigureNumberOptions, type EndCallOptions as TelnyxEndCallOptions, type InitiateCallOptions as TelnyxInitiateCallOptions, type InitiateCallResult as TelnyxInitiateCallResult, type ProvisionNumberOptions as TelnyxProvisionNumberOptions, type ProvisionNumberResult as TelnyxProvisionNumberResult, TelnyxSTT, TelnyxSTTInputFormat, TelnyxSTTSampleRate, type Transcript as TelnyxSTTTranscript, TelnyxTTS, TelnyxTTSSampleRate, TelnyxTTSVoice, type 
TelnyxTranscriptionEngine, TestSession, TfidfLoopDetector, type TfidfLoopDetectorOptions, Tool, type ToolDefinition, type ToolExecutor, type ToolHandler, type ToolOptions, type TunnelHandle, type TurnMetrics, Carrier$1 as Twilio, TwilioAdapter, type TwilioAdapterOptions, type TwilioCarrierOptions, type ConfigureNumberOptions$1 as TwilioConfigureNumberOptions, type InitiateCallOptions$1 as TwilioInitiateCallOptions, type InitiateCallResult$1 as TwilioInitiateCallResult, type ProvisionNumberOptions$1 as TwilioProvisionNumberOptions, type ProvisionNumberResult$1 as TwilioProvisionNumberResult, ULTRAVOX_DEFAULT_API_BASE, ULTRAVOX_DEFAULT_SR, type UltravoxEventHandler, UltravoxRealtimeAdapter, type UserState, STT$5 as WhisperSTT, type WhisperSTTOptions, assemblyai, builtinClipPath, calculateRealtimeCost, calculateSttCost, calculateTelephonyCost, calculateTtsCost, callsToCsv, callsToJson, cartesia, createResampler16kTo8k, createResampler24kTo16k, createResampler24kTo8k, createResampler8kTo16k, deepgram, defineTool, elevenlabs, filterEmoji, filterForTTS, filterMarkdown, formatDtmf, geminiLive, getLogger, guardrail, initTracing, isRemoteUrl, isTracingEnabled, isWebSocketUrl, lmnt, makeAuthMiddleware, mergePricing, mixPcm, mountApi, mountDashboard, mulawToPcm16, notifyDashboard, openaiTts, pcm16ToMulaw, resample16kTo8k, resample24kTo16k, resample8kTo16k, resamplePcm, rime, scheduleCron, scheduleInterval, scheduleOnce, selectSoundFromList, setLogger, soniox, speechmatics, startSpan, startTunnel, tool, ultravox, whisper };
|
|
7668
|
+
export { type AgentOptions, type AgentState, AllProvidersFailedError, type AnthropicConversion, LLM$3 as AnthropicLLM, type AnthropicLLMOptions, type AnthropicMessage, AssemblyAIEncoding, AssemblyAIModel, STT$1 as AssemblyAISTT, type AssemblyAISTTOptions, type AudioConfig, type AudioSource, AuthenticationError, type BackgroundAudioOptions, BackgroundAudioPlayer, type EvaluateContext as BargeInEvaluateContext, type BargeInStrategy, BuiltinAudioClip, type BuiltinAudioClipName, type BuiltinPcmSource, type CallControl, type CallEvent, type CallEventHandler, type CallMetrics, CallMetricsAccumulator, type CallRecord, type CartesiaEncoding, STT$3 as CartesiaSTT, type CartesiaSTTOptions, TTS$3 as CartesiaTTS, type CartesiaTTSOptions, LLM$1 as CerebrasLLM, type CerebrasLLMOptions, ChatContext, type ChatMessage, type ChatRole, CloudflareTunnel, type ConversationStateSnapshot, type CostBreakdown, DEFAULT_MIN_SENTENCE_LEN, DEFAULT_PRICING, DTMF_EVENTS, DeepFilterNetFilter, type DeepFilterNetOptions, STT$6 as DeepgramSTT, type DeepgramSTTOptions, DefaultToolExecutor, type DefaultToolExecutorOptions, type DefineToolInput, type DtmfEvent, ConvAI as ElevenLabsConvAI, ElevenLabsConvAIAdapter, type ConvAIOptions as ElevenLabsConvAIOptions, ElevenLabsTTS as ElevenLabsRestTTS, TTS$6 as ElevenLabsTTS, type ElevenLabsTTSOptions, type ElevenLabsWebSocketOptions, TTS$5 as ElevenLabsWebSocketTTS, type EouTrigger, ErrorCode, EventBus, FallbackLLMProvider, type FallbackLLMProviderOptions, type FilePcmSource, GEMINI_DEFAULT_INPUT_SR, GEMINI_DEFAULT_OUTPUT_SR, GeminiLiveAdapter, type GeminiLiveEventHandler, LLM as GoogleLLM, type GoogleLLMOptions, LLM$2 as GroqLLM, type GroqLLMOptions, Guardrail$1 as Guardrail, type GuardrailOptions, type HookContext, IVRActivity, type IVRActivityOptions, type IVRToolDefinition, type IncomingMessage, type InitTracingOptions, TTS as InworldTTS, type InworldTTSOptions, type JobCallback, KrispFrameDuration, KrispSampleRate, KrispVivaFilter, type 
KrispVivaFilterOptions, type LLMChunk, LLMLoop, type LLMProvider, LMNTAudioFormat, LMNTModel, LMNTSampleRate, TTS$1 as LMNTTTS, type LMNTTTSOptions, type LatencyBreakdown, type LocalCallOptions, type LocalConfig, type LocalOptions, type Logger, type LoopCallback, type MessageHandler, MetricsStore, MinWordsStrategy, type MinWordsStrategyOptions, Ngrok, LLM$4 as OpenAILLM, type OpenAILLMOptions, OpenAILLMProvider, type OpenAIMessage, Realtime as OpenAIRealtime, Realtime2 as OpenAIRealtime2, OpenAIRealtime2Adapter, type Realtime2Options as OpenAIRealtime2Options, OpenAIRealtimeAdapter, type RealtimeOptions as OpenAIRealtimeOptions, TTS$4 as OpenAITTS, type OpenAITTSOptions, STT$4 as OpenAITranscribeSTT, type OpenAITranscribeSTTOptions, type ParamSpec, PartialStreamError, Patter, PatterConnectionError, PatterError, type PatterEventType, PatterTool, type PatterToolExecuteArgs, type PatterToolOptions, type PatterToolResult, PcmCarry, PipelineHookExecutor, type PipelineHooks, type PipelineMessageHandler, type ProviderPricing, ProvisionError, RateLimitError, type RawPcmSource, type RealtimeConfig, RemoteMessageHandler, TTS$2 as RimeTTS, type RimeTTSOptions, SPAN_BARGEIN, SPAN_CALL, SPAN_ENDPOINT, SPAN_LLM, SPAN_STT, SPAN_TOOL, SPAN_TTS, type SSEEvent, type STTConfig, type ScheduleHandle, SentenceChunker, type ServeOptions, type SilenceCallback, type SileroSampleRate, SileroVAD, type SileroVADOptions, STT$2 as SonioxSTT, type SonioxSTTOptions$1 as SonioxSTTOptions, type Span, type SpeechEventCallback, SpeechEvents, SpeechmaticsAudioEncoding, SpeechmaticsOperatingPoint, STT as SpeechmaticsSTT, type SpeechmaticsSTTOptions, SpeechmaticsSampleRate, SpeechmaticsServerMessage, TurnDetectionMode as SpeechmaticsTurnDetectionMode, StatefulResampler, type StatefulResamplerOptions, Static as StaticTunnel, type TTSConfig, Carrier as Telnyx, TelnyxAdapter, type TelnyxCarrierOptions, type ConfigureNumberOptions as TelnyxConfigureNumberOptions, type EndCallOptions as TelnyxEndCallOptions, 
type InitiateCallOptions as TelnyxInitiateCallOptions, type InitiateCallResult as TelnyxInitiateCallResult, type ProvisionNumberOptions as TelnyxProvisionNumberOptions, type ProvisionNumberResult as TelnyxProvisionNumberResult, TelnyxSTT, TelnyxSTTInputFormat, TelnyxSTTSampleRate, type Transcript as TelnyxSTTTranscript, TelnyxTTS, TelnyxTTSSampleRate, TelnyxTTSVoice, type TelnyxTranscriptionEngine, TestSession, TfidfLoopDetector, type TfidfLoopDetectorOptions, Tool, type ToolDefinition, type ToolExecutor, type ToolHandler, type ToolOptions, type TunnelHandle, type TurnMetrics, Carrier$1 as Twilio, TwilioAdapter, type TwilioAdapterOptions, type TwilioCarrierOptions, type ConfigureNumberOptions$1 as TwilioConfigureNumberOptions, type InitiateCallOptions$1 as TwilioInitiateCallOptions, type InitiateCallResult$1 as TwilioInitiateCallResult, type ProvisionNumberOptions$1 as TwilioProvisionNumberOptions, type ProvisionNumberResult$1 as TwilioProvisionNumberResult, ULTRAVOX_DEFAULT_API_BASE, ULTRAVOX_DEFAULT_SR, type UltravoxEventHandler, UltravoxRealtimeAdapter, type UserState, STT$5 as WhisperSTT, type WhisperSTTOptions, assemblyai, builtinClipPath, calculateRealtimeCost, calculateSttCost, calculateTelephonyCost, calculateTtsCost, callsToCsv, callsToJson, cartesia, createResampler16kTo8k, createResampler24kTo16k, createResampler24kTo8k, createResampler8kTo16k, deepgram, defineTool, elevenlabs, evaluateStrategies as evaluateBargeInStrategies, filterEmoji, filterForTTS, filterMarkdown, formatDtmf, geminiLive, getLogger, guardrail, initTracing, isRemoteUrl, isTracingEnabled, isWebSocketUrl, lmnt, makeAuthMiddleware, mergePricing, mixPcm, mountApi, mountDashboard, mulawToPcm16, notifyDashboard, openaiTts, pcm16ToMulaw, resample16kTo8k, resample24kTo16k, resample8kTo16k, resamplePcm, resetStrategies as resetBargeInStrategies, rime, scheduleCron, scheduleInterval, scheduleOnce, selectSoundFromList, setLogger, soniox, speechmatics, startSpan, startTunnel, tool, ultravox, 
whisper };
|