getpatter 0.6.0 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/barge-in-strategies-X6ARMGIQ.mjs +12 -0
- package/dist/chunk-CL2U3YET.mjs +1429 -0
- package/dist/chunk-D4424JZR.mjs +71 -0
- package/dist/{chunk-JUQ5WQTQ.mjs → chunk-LE63CSOB.mjs} +1424 -969
- package/dist/{chunk-X3364LSI.mjs → chunk-R2T4JABZ.mjs} +49 -2
- package/dist/cli.js +315 -37
- package/dist/dashboard/ui.html +13 -13
- package/dist/index.d.mts +2136 -709
- package/dist/index.d.ts +2136 -709
- package/dist/index.js +5674 -2233
- package/dist/index.mjs +2338 -915
- package/dist/openai-realtime-2-CNFARP25.mjs +8 -0
- package/dist/{silero-vad-YLCXT5GQ.mjs → silero-vad-LNDFGIY7.mjs} +1 -1
- package/dist/{test-mode-Y7YG5LFZ.mjs → test-mode-RS57BDM6.mjs} +2 -1
- package/package.json +1 -1
- package/src/dashboard/ui.html +13 -13
package/dist/index.d.mts
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import * as WebSocket from 'ws';
|
|
2
|
+
import WebSocket__default from 'ws';
|
|
1
3
|
import { EventEmitter } from 'events';
|
|
2
4
|
import { Request, Response, NextFunction, Express } from 'express';
|
|
3
5
|
|
|
@@ -59,7 +61,11 @@ declare class Carrier {
|
|
|
59
61
|
interface RealtimeOptions {
|
|
60
62
|
/** API key. Falls back to OPENAI_API_KEY env var when omitted. */
|
|
61
63
|
apiKey?: string;
|
|
62
|
-
/**
|
|
64
|
+
/**
|
|
65
|
+
* Realtime model. Defaults to ``gpt-realtime-mini`` (bumped from the
|
|
66
|
+
* deprecated ``gpt-4o-mini-realtime-preview`` on 2026-05-25 for
|
|
67
|
+
* parity with the Python SDK and the GA Realtime API surface).
|
|
68
|
+
*/
|
|
63
69
|
model?: string;
|
|
64
70
|
/** Voice preset. Defaults to alloy. */
|
|
65
71
|
voice?: string;
|
|
@@ -104,6 +110,61 @@ declare class Realtime {
|
|
|
104
110
|
constructor(opts?: RealtimeOptions);
|
|
105
111
|
}
|
|
106
112
|
|
|
113
|
+
/**
|
|
114
|
+
* OpenAI Realtime 2 engine — marker class for Patter client dispatch.
|
|
115
|
+
*
|
|
116
|
+
* Wraps `gpt-realtime-2` (GA Realtime API). Separate marker from
|
|
117
|
+
* {@link import('./openai').Realtime} because the GA endpoint speaks a
|
|
118
|
+
* different `session.update` wire shape; the client dispatches to
|
|
119
|
+
* `OpenAIRealtime2Adapter` when this marker is passed.
|
|
120
|
+
*/
|
|
121
|
+
/** Constructor options for the OpenAI `Realtime2` engine marker. */
|
|
122
|
+
interface Realtime2Options {
|
|
123
|
+
/** API key. Falls back to OPENAI_API_KEY env var when omitted. */
|
|
124
|
+
apiKey?: string;
|
|
125
|
+
/** GA Realtime model. Defaults to `gpt-realtime-2`. */
|
|
126
|
+
model?: string;
|
|
127
|
+
/** Voice preset. Defaults to alloy. */
|
|
128
|
+
voice?: string;
|
|
129
|
+
/**
|
|
130
|
+
* Reasoning-effort tier. When omitted the field is not sent and the
|
|
131
|
+
* server default applies. OpenAI recommends `"low"` for production
|
|
132
|
+
* voice flows — higher tiers add measurable per-turn latency.
|
|
133
|
+
*/
|
|
134
|
+
reasoningEffort?: 'minimal' | 'low' | 'medium' | 'high';
|
|
135
|
+
/**
|
|
136
|
+
* Override for `audio.input.transcription.model`. Omit to keep the
|
|
137
|
+
* adapter default (`whisper-1`). Use `"gpt-realtime-whisper"` for
|
|
138
|
+
* low-latency transcript partials.
|
|
139
|
+
*/
|
|
140
|
+
inputAudioTranscriptionModel?: string;
|
|
141
|
+
}
|
|
142
|
+
/**
|
|
143
|
+
* OpenAI Realtime 2 engine marker — selects `gpt-realtime-2` on the GA
|
|
144
|
+
* Realtime API.
|
|
145
|
+
*
|
|
146
|
+
* @example
|
|
147
|
+
* ```ts
|
|
148
|
+
* import { Patter, Twilio, OpenAIRealtime2 } from "getpatter";
|
|
149
|
+
*
|
|
150
|
+
* const phone = new Patter({ carrier: new Twilio(), phoneNumber: "+1..." });
|
|
151
|
+
* const agent = phone.agent({
|
|
152
|
+
* engine: new OpenAIRealtime2({ reasoningEffort: "low" }),
|
|
153
|
+
* systemPrompt: "You are a friendly receptionist.",
|
|
154
|
+
* firstMessage: "Hello! How can I help?",
|
|
155
|
+
* });
|
|
156
|
+
* ```
|
|
157
|
+
*/
|
|
158
|
+
declare class Realtime2 {
|
|
159
|
+
readonly kind: "openai_realtime_2";
|
|
160
|
+
readonly apiKey: string;
|
|
161
|
+
readonly model: string;
|
|
162
|
+
readonly voice: string;
|
|
163
|
+
readonly reasoningEffort?: 'minimal' | 'low' | 'medium' | 'high';
|
|
164
|
+
readonly inputAudioTranscriptionModel?: string;
|
|
165
|
+
constructor(opts?: Realtime2Options);
|
|
166
|
+
}
|
|
167
|
+
|
|
107
168
|
/** ElevenLabs ConvAI engine — marker class for Patter client dispatch. */
|
|
108
169
|
/** Constructor options for the ElevenLabs `ConvAI` engine marker. */
|
|
109
170
|
interface ConvAIOptions {
|
|
@@ -273,71 +334,6 @@ declare class Tool implements ToolDefinition {
|
|
|
273
334
|
/** Factory helper mirroring Python's `tool(...)` function. */
|
|
274
335
|
declare function tool(opts: ToolOptions): Tool;
|
|
275
336
|
|
|
276
|
-
/**
|
|
277
|
-
* Shared STT / TTS adapter dispatch.
|
|
278
|
-
*
|
|
279
|
-
* In v0.5.0+ callers always pass pre-instantiated adapters (``agent.stt`` /
|
|
280
|
-
* ``agent.tts`` are ``STTAdapter`` / ``TTSAdapter`` instances), so these
|
|
281
|
-
* helpers are thin pass-throughs that return the instance or null. Kept as
|
|
282
|
-
* functions so the Twilio/Telnyx bridges have a single dispatch point.
|
|
283
|
-
*/
|
|
284
|
-
|
|
285
|
-
/** Per-word timings / metadata (Deepgram-shaped). Optional on every adapter. */
|
|
286
|
-
interface STTWord {
|
|
287
|
-
readonly word?: string;
|
|
288
|
-
readonly start?: number;
|
|
289
|
-
readonly end?: number;
|
|
290
|
-
readonly confidence?: number;
|
|
291
|
-
readonly punctuated_word?: string;
|
|
292
|
-
readonly speaker?: number;
|
|
293
|
-
}
|
|
294
|
-
/**
|
|
295
|
-
* Facade transcript shape — widened to surface richer provider fields
|
|
296
|
-
* (Deepgram emits all of them) without forcing adapters that only know
|
|
297
|
-
* ``text``/``isFinal`` to change. All non-text fields are optional.
|
|
298
|
-
*/
|
|
299
|
-
interface STTTranscript {
|
|
300
|
-
text: string;
|
|
301
|
-
isFinal?: boolean;
|
|
302
|
-
/** Overall transcript confidence in [0, 1]. */
|
|
303
|
-
confidence?: number;
|
|
304
|
-
/** Provider-side end-of-utterance hint (faster than ``isFinal``). */
|
|
305
|
-
speechFinal?: boolean;
|
|
306
|
-
/** True when the result was produced in response to a Finalize command. */
|
|
307
|
-
fromFinalize?: boolean;
|
|
308
|
-
/** Provider request id (Deepgram populates this from the Metadata frame). */
|
|
309
|
-
requestId?: string;
|
|
310
|
-
/** Per-word timings / metadata when the provider emits them. */
|
|
311
|
-
words?: ReadonlyArray<STTWord>;
|
|
312
|
-
/** Which provider event this transcript represents (e.g. ``Results``). */
|
|
313
|
-
eventType?: string;
|
|
314
|
-
}
|
|
315
|
-
/** Callback invoked by an `STTAdapter` for each (partial or final) transcript event. */
|
|
316
|
-
type STTTranscriptCallback = (t: STTTranscript) => Promise<void> | void;
|
|
317
|
-
/** Shape shared by every STT adapter in the SDK. */
|
|
318
|
-
interface STTAdapter {
|
|
319
|
-
connect(): Promise<void>;
|
|
320
|
-
sendAudio(pcm: Buffer): void | Promise<void>;
|
|
321
|
-
onTranscript(cb: STTTranscriptCallback): void;
|
|
322
|
-
close(): void | Promise<void>;
|
|
323
|
-
/**
|
|
324
|
-
* Optional: ask the provider to immediately finalise the in-flight
|
|
325
|
-
* utterance (rather than waiting for its own endpoint timer). Called by
|
|
326
|
-
* ``StreamHandler`` whenever the SDK's VAD signals ``speech_end``, and
|
|
327
|
-
* after a barge-in cancel — both moments where waiting for the
|
|
328
|
-
* provider's endpoint heuristic stalls the next turn.
|
|
329
|
-
*
|
|
330
|
-
* Implementations that do not support utterance-level finalisation
|
|
331
|
-
* (e.g. one-shot transcribers like Whisper) should omit this method
|
|
332
|
-
* entirely; the stream handler does an optional-chained call.
|
|
333
|
-
*/
|
|
334
|
-
finalize?(): void | Promise<void>;
|
|
335
|
-
}
|
|
336
|
-
/** Shape shared by every TTS adapter in the SDK. */
|
|
337
|
-
interface TTSAdapter {
|
|
338
|
-
synthesizeStream(text: string): AsyncIterable<Buffer>;
|
|
339
|
-
}
|
|
340
|
-
|
|
341
337
|
/**
|
|
342
338
|
* Pipeline hook executor for pipeline mode.
|
|
343
339
|
*
|
|
@@ -616,6 +612,22 @@ interface LLMStreamOptions {
|
|
|
616
612
|
}
|
|
617
613
|
interface LLMProvider {
|
|
618
614
|
stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
|
|
615
|
+
/**
|
|
616
|
+
* Optional best-effort pre-call DNS / TLS / HTTP-keepalive warmup.
|
|
617
|
+
*
|
|
618
|
+
* Called once per outbound call from ``Patter.call`` when the agent has
|
|
619
|
+
* ``prewarm: true`` (the default). Concrete providers (OpenAI,
|
|
620
|
+
* Anthropic, Google, Cerebras, Groq) override this to issue a
|
|
621
|
+
* lightweight HTTPS GET to their inference endpoint so by the time the
|
|
622
|
+
* first ``stream()`` call lands, the connection pool already has a
|
|
623
|
+
* warm socket. Failures are logged at debug level and never abort the
|
|
624
|
+
* call — pure latency optimisation.
|
|
625
|
+
*
|
|
626
|
+
* Optional on the interface (``warmup?: ...``) so providers without a
|
|
627
|
+
* warmup hook still satisfy the type. Detected via runtime
|
|
628
|
+
* ``typeof provider.warmup === 'function'`` in the client.
|
|
629
|
+
*/
|
|
630
|
+
warmup?(): Promise<void>;
|
|
619
631
|
}
|
|
620
632
|
/** Optional sampling kwargs forwarded into the OpenAI Chat Completions body. */
|
|
621
633
|
interface OpenAILLMSamplingOptions {
|
|
@@ -642,6 +654,8 @@ interface OpenAILLMSamplingOptions {
|
|
|
642
654
|
}
|
|
643
655
|
/** LLM provider backed by OpenAI Chat Completions (streaming). */
|
|
644
656
|
declare class OpenAILLMProvider implements LLMProvider {
|
|
657
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
658
|
+
static readonly providerKey = "openai";
|
|
645
659
|
private readonly apiKey;
|
|
646
660
|
readonly model: string;
|
|
647
661
|
private readonly temperature?;
|
|
@@ -655,6 +669,23 @@ declare class OpenAILLMProvider implements LLMProvider {
|
|
|
655
669
|
private readonly presencePenalty?;
|
|
656
670
|
private readonly stop?;
|
|
657
671
|
constructor(apiKey: string, model: string, sampling?: OpenAILLMSamplingOptions);
|
|
672
|
+
/** Subclasses (Cerebras, Groq) override this with their own host. */
|
|
673
|
+
protected get baseUrl(): string;
|
|
674
|
+
/**
|
|
675
|
+
* Pre-call DNS / TLS / HTTP-keepalive warmup.
|
|
676
|
+
*
|
|
677
|
+
* Issues a lightweight ``GET ${baseUrl}/models`` so DNS, TLS and HTTP/2
|
|
678
|
+
* are already up by the time the first ``chat.completions`` call lands.
|
|
679
|
+
* Best-effort: 5 s timeout, all exceptions swallowed at debug level.
|
|
680
|
+
*
|
|
681
|
+
* Note: an HTTPS GET warms DNS + TLS + connection pool but does NOT
|
|
682
|
+
* warm the inference path itself; for true inference warmup a real
|
|
683
|
+
* low-token request is needed, left as a follow-up. STT / TTS providers ship concrete
|
|
684
|
+
* WebSocket-based prewarms (Cartesia / Deepgram / AssemblyAI for STT;
|
|
685
|
+
* ElevenLabs WS for TTS) which save 200-500 ms each — those dominate
|
|
686
|
+
* the cold-start latency budget.
|
|
687
|
+
*/
|
|
688
|
+
warmup(): Promise<void>;
|
|
658
689
|
/** Stream OpenAI Chat Completions chunks for the given messages/tools. */
|
|
659
690
|
stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
|
|
660
691
|
}
|
|
@@ -669,6 +700,8 @@ declare class LLMLoop {
|
|
|
669
700
|
private eventBus?;
|
|
670
701
|
private readonly _providerName;
|
|
671
702
|
private readonly _modelName;
|
|
703
|
+
private _usageMissingCount;
|
|
704
|
+
private _loggedUsageFallback;
|
|
672
705
|
private onToolCall?;
|
|
673
706
|
constructor(apiKey: string, model: string, systemPrompt: string, tools?: ToolDefinition[] | null, llmProvider?: LLMProvider, disablePhonePreamble?: boolean);
|
|
674
707
|
/**
|
|
@@ -706,6 +739,87 @@ declare class LLMLoop {
|
|
|
706
739
|
private buildMessages;
|
|
707
740
|
}
|
|
708
741
|
|
|
742
|
+
/**
|
|
743
|
+
* Barge-in confirmation strategies.
|
|
744
|
+
*
|
|
745
|
+
* When a caller starts speaking while the agent's TTS is in flight, the SDK
|
|
746
|
+
* has to decide whether the speech is a real interruption or just a brief
|
|
747
|
+
* backchannel ("uh-huh", "okay") / room noise / cough. The default
|
|
748
|
+
* behaviour is to treat any VAD speech_start as a confirmed barge-in and
|
|
749
|
+
* cancel the agent immediately. That is fine for clean inputs but
|
|
750
|
+
* produces frequent false positives on PSTN: the agent gets cut
|
|
751
|
+
* mid-sentence by background chatter, breath, or filler words and never
|
|
752
|
+
* recovers the conversational thread.
|
|
753
|
+
*
|
|
754
|
+
* Each ``BargeInStrategy`` is consulted on every STT transcript while a
|
|
755
|
+
* barge-in is *pending* (VAD fired, but the agent has not yet been
|
|
756
|
+
* cancelled). The first strategy that returns ``true`` confirms the
|
|
757
|
+
* barge-in; if none do within the configured timeout the pending state
|
|
758
|
+
* is dropped and the agent resumes streaming TTS as if nothing happened.
|
|
759
|
+
* With an empty ``bargeInStrategies`` array the SDK falls back to the
|
|
760
|
+
* legacy "interrupt immediately on VAD" path, so adding strategies is
|
|
761
|
+
* a strict opt-in.
|
|
762
|
+
*/
|
|
763
|
+
interface EvaluateContext {
|
|
764
|
+
/** Latest STT output text (interim or final). */
|
|
765
|
+
readonly transcript: string;
|
|
766
|
+
/** ``true`` for interim partials, ``false`` for finals. */
|
|
767
|
+
readonly isInterim: boolean;
|
|
768
|
+
/** Whether the agent's TTS is currently in flight. */
|
|
769
|
+
readonly agentSpeaking: boolean;
|
|
770
|
+
}
|
|
771
|
+
/**
|
|
772
|
+
* Decides whether a pending barge-in should be confirmed.
|
|
773
|
+
*
|
|
774
|
+
* Implementations must be safe to call any number of times
|
|
775
|
+
* per turn. ``reset`` is invoked when the agent finishes speaking
|
|
776
|
+
* naturally and when a pending barge-in times out without
|
|
777
|
+
* confirmation.
|
|
778
|
+
*/
|
|
779
|
+
interface BargeInStrategy {
|
|
780
|
+
evaluate(ctx: EvaluateContext): Promise<boolean> | boolean;
|
|
781
|
+
reset?(): Promise<void> | void;
|
|
782
|
+
}
|
|
783
|
+
interface MinWordsStrategyOptions {
|
|
784
|
+
/**
|
|
785
|
+
* Minimum word count required while the agent is speaking. Reasonable
|
|
786
|
+
* values are 2-5; 3 is a good starting point for production phone
|
|
787
|
+
* agents. Must be ``>= 1``.
|
|
788
|
+
*/
|
|
789
|
+
readonly minWords: number;
|
|
790
|
+
/**
|
|
791
|
+
* When ``true`` (default), interim STT partials are evaluated as soon
|
|
792
|
+
* as they arrive. Set to ``false`` to wait for finals only — slower
|
|
793
|
+
* but free of partial-word noise on jittery STT providers.
|
|
794
|
+
*/
|
|
795
|
+
readonly useInterim?: boolean;
|
|
796
|
+
}
|
|
797
|
+
/**
|
|
798
|
+
* Confirm barge-in only after the caller has spoken ``minWords`` words.
|
|
799
|
+
*
|
|
800
|
+
* Filters short backchannels, single-word utterances, and stray
|
|
801
|
+
* transcription fragments that VAD picked up but were not real
|
|
802
|
+
* interruptions. While the agent is silent the strategy permits any
|
|
803
|
+
* speech to count (one word is enough), so the first user turn is not
|
|
804
|
+
* delayed.
|
|
805
|
+
*/
|
|
806
|
+
declare class MinWordsStrategy implements BargeInStrategy {
|
|
807
|
+
private readonly minWords;
|
|
808
|
+
private readonly useInterim;
|
|
809
|
+
constructor(options: MinWordsStrategyOptions);
|
|
810
|
+
evaluate(ctx: EvaluateContext): boolean;
|
|
811
|
+
reset(): Promise<void>;
|
|
812
|
+
}
|
|
813
|
+
/**
|
|
814
|
+
* Short-circuit-OR composition: first strategy that confirms wins.
|
|
815
|
+
* Returns ``false`` for an empty array so callers can use the empty
|
|
816
|
+
* default to mean "no opt-in confirmation, fall back to legacy
|
|
817
|
+
* interrupt-on-VAD".
|
|
818
|
+
*/
|
|
819
|
+
declare function evaluateStrategies(strategies: readonly BargeInStrategy[], ctx: EvaluateContext): Promise<boolean>;
|
|
820
|
+
/** Call ``reset()`` on every strategy, swallowing per-strategy errors. */
|
|
821
|
+
declare function resetStrategies(strategies: readonly BargeInStrategy[]): Promise<void>;
|
|
822
|
+
|
|
709
823
|
/**
|
|
710
824
|
* Public type definitions for the Patter SDK — agent options, pipeline hooks,
|
|
711
825
|
* provider config envelopes, and serve/call request/response shapes.
|
|
@@ -967,6 +1081,15 @@ interface VADEvent {
|
|
|
967
1081
|
interface VADProvider {
|
|
968
1082
|
processFrame(pcmChunk: Buffer, sampleRate: number): Promise<VADEvent | null>;
|
|
969
1083
|
close(): Promise<void>;
|
|
1084
|
+
/**
|
|
1085
|
+
* Optional: reset all per-utterance state so the next ``processFrame``
|
|
1086
|
+
* starts from a clean SILENCE state. Useful between agent turns to
|
|
1087
|
+
* prevent a "stuck SPEECH" condition where PSTN echo / loopback kept the
|
|
1088
|
+
* detector's internal probability above the deactivation threshold for
|
|
1089
|
+
* the full agent turn, leaving the VAD unable to emit ``speech_start``
|
|
1090
|
+
* on the next user utterance (one-shot barge-in bug).
|
|
1091
|
+
*/
|
|
1092
|
+
reset?(): Promise<void> | void;
|
|
970
1093
|
}
|
|
971
1094
|
/** Pre-STT audio filter — noise cancellation, gain, EQ. */
|
|
972
1095
|
interface AudioFilter {
|
|
@@ -1062,7 +1185,7 @@ interface AgentOptions {
|
|
|
1062
1185
|
* matching mode (``openai_realtime`` or ``elevenlabs_convai``). When absent,
|
|
1063
1186
|
* pipeline mode is selected if ``stt`` and ``tts`` are provided.
|
|
1064
1187
|
*/
|
|
1065
|
-
engine?: Realtime | ConvAI;
|
|
1188
|
+
engine?: Realtime | Realtime2 | ConvAI;
|
|
1066
1189
|
/**
|
|
1067
1190
|
* Provider mode. Normally derived from ``engine`` / ``stt`` + ``tts``. Pass
|
|
1068
1191
|
* ``'pipeline'`` explicitly when building a pipeline-mode agent without
|
|
@@ -1103,6 +1226,60 @@ interface AgentOptions {
|
|
|
1103
1226
|
* Default: 300.
|
|
1104
1227
|
*/
|
|
1105
1228
|
bargeInThresholdMs?: number;
|
|
1229
|
+
/**
|
|
1230
|
+
* Opt-in barge-in confirmation strategies (pipeline mode). With the
|
|
1231
|
+
* default empty array the SDK falls back to the legacy
|
|
1232
|
+
* "interrupt immediately on VAD speech_start" behaviour. When at
|
|
1233
|
+
* least one strategy is provided, a VAD speech_start during TTS
|
|
1234
|
+
* marks the barge-in as *pending* — the agent's TTS continues
|
|
1235
|
+
* streaming naturally and its in-flight LLM stream is preserved —
|
|
1236
|
+
* and the strategies are consulted on every STT transcript. The first strategy that
|
|
1237
|
+
* returns ``true`` confirms the barge-in (cancels TTS, flushes the
|
|
1238
|
+
* inbound ring buffer); if none confirm within
|
|
1239
|
+
* ``bargeInConfirmMs`` the pending state is dropped and TTS resumes.
|
|
1240
|
+
*
|
|
1241
|
+
* See ``getpatter`` exports ``BargeInStrategy`` /
|
|
1242
|
+
* ``MinWordsStrategy`` for the protocol and a reference
|
|
1243
|
+
* implementation.
|
|
1244
|
+
*/
|
|
1245
|
+
bargeInStrategies?: readonly BargeInStrategy[];
|
|
1246
|
+
/**
|
|
1247
|
+
* Maximum time (ms) to wait for at least one strategy to confirm a
|
|
1248
|
+
* pending barge-in before discarding the pending state and resuming
|
|
1249
|
+
* TTS. Only consulted when ``bargeInStrategies`` is non-empty.
|
|
1250
|
+
* Default: 1500.
|
|
1251
|
+
*/
|
|
1252
|
+
bargeInConfirmMs?: number;
|
|
1253
|
+
/**
|
|
1254
|
+
* When ``true`` (default), ``Patter.call`` warms up the STT, TTS, and
|
|
1255
|
+
* LLM provider connections in parallel with the carrier-side
|
|
1256
|
+
* ``initiateCall`` request so DNS, TLS, and HTTP/2 handshakes are
|
|
1257
|
+
* already complete by the time the callee answers. Adapters expose a
|
|
1258
|
+
* ``warmup()`` method returning ``Promise<void>`` (default no-op) —
|
|
1259
|
+
* providers can override it to open a persistent connection ahead
|
|
1260
|
+
* of the WebSocket bridge. Best-effort: warmup failures are logged
|
|
1261
|
+
* at debug level and never abort the call. Default: ``true``.
|
|
1262
|
+
*/
|
|
1263
|
+
prewarm?: boolean;
|
|
1264
|
+
/**
|
|
1265
|
+
* When ``true`` (default since 0.6.2 in pipeline mode), ``Patter.call``
|
|
1266
|
+
* pre-renders ``firstMessage`` to TTS audio bytes during the ringing
|
|
1267
|
+
* window and streams the cached buffer immediately when the carrier
|
|
1268
|
+
* emits ``start``. Eliminates the 200-700 ms TTS first-byte latency
|
|
1269
|
+
* on the greeting that dominated first-turn ``p95`` on every pipeline
|
|
1270
|
+
* acceptance run. The trade-off is paying the TTS bill even if the
|
|
1271
|
+
* call is never answered (silently logged at warn level when the call
|
|
1272
|
+
* fails) — typically $0.001-$0.005 per ringing call depending on TTS
|
|
1273
|
+
* provider. Opt out by passing ``prewarmFirstMessage: false`` (e.g.
|
|
1274
|
+
* for very high-volume outbound where un-answered TTS spend matters).
|
|
1275
|
+
*
|
|
1276
|
+
* **Pipeline mode only.** Realtime / ConvAI provider modes never
|
|
1277
|
+
* consume the prewarm cache (the StreamHandler for those modes runs
|
|
1278
|
+
* its first-message emit through the provider's own audio path), so
|
|
1279
|
+
* ``Patter.call`` refuses to spawn the prewarm task and emits a warn
|
|
1280
|
+
* when ``provider !== 'pipeline'``.
|
|
1281
|
+
*/
|
|
1282
|
+
prewarmFirstMessage?: boolean;
|
|
1106
1283
|
/**
|
|
1107
1284
|
* When true, the sentence chunker emits the first clause of each response
|
|
1108
1285
|
* on a soft punctuation boundary (",", em-dash, en-dash) once ~40 chars
|
|
@@ -1227,92 +1404,618 @@ interface LocalCallOptions {
|
|
|
1227
1404
|
}
|
|
1228
1405
|
|
|
1229
1406
|
/**
|
|
1230
|
-
*
|
|
1231
|
-
*
|
|
1232
|
-
* Keeps the last `maxCalls` completed calls and tracks active calls.
|
|
1233
|
-
* Supports SSE event subscribers for real-time updates.
|
|
1407
|
+
* Shared STT / TTS adapter dispatch.
|
|
1234
1408
|
*
|
|
1235
|
-
*
|
|
1236
|
-
*
|
|
1237
|
-
*
|
|
1238
|
-
*
|
|
1239
|
-
* the JSONL/JSON files, the store is just a cache on top).
|
|
1409
|
+
* In v0.5.0+ callers always pass pre-instantiated adapters (``agent.stt`` /
|
|
1410
|
+
* ``agent.tts`` are ``STTAdapter`` / ``TTSAdapter`` instances), so these
|
|
1411
|
+
* helpers are thin pass-throughs that return the instance or null. Kept as
|
|
1412
|
+
* functions so the Twilio/Telnyx bridges have a single dispatch point.
|
|
1240
1413
|
*/
|
|
1241
1414
|
|
|
1242
|
-
/**
|
|
1243
|
-
interface
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
/**
|
|
1251
|
-
* Current lifecycle state: ``initiated`` (pre-registered), ``ringing``,
|
|
1252
|
-
* ``in-progress``, ``completed``, ``no-answer``, ``busy``, ``failed``,
|
|
1253
|
-
* ``canceled``, or ``webhook_error``.
|
|
1254
|
-
*/
|
|
1255
|
-
status?: string;
|
|
1256
|
-
transcript?: Array<{
|
|
1257
|
-
role: string;
|
|
1258
|
-
text: string;
|
|
1259
|
-
timestamp: number;
|
|
1260
|
-
}>;
|
|
1261
|
-
turns?: unknown[];
|
|
1262
|
-
metrics?: Record<string, unknown> | null;
|
|
1263
|
-
[key: string]: unknown;
|
|
1415
|
+
/** Per-word timings / metadata (Deepgram-shaped). Optional on every adapter. */
|
|
1416
|
+
interface STTWord {
|
|
1417
|
+
readonly word?: string;
|
|
1418
|
+
readonly start?: number;
|
|
1419
|
+
readonly end?: number;
|
|
1420
|
+
readonly confidence?: number;
|
|
1421
|
+
readonly punctuated_word?: string;
|
|
1422
|
+
readonly speaker?: number;
|
|
1264
1423
|
}
|
|
1265
|
-
/**
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1424
|
+
/**
|
|
1425
|
+
* Facade transcript shape — widened to surface richer provider fields
|
|
1426
|
+
* (Deepgram emits all of them) without forcing adapters that only know
|
|
1427
|
+
* ``text``/``isFinal`` to change. All non-text fields are optional.
|
|
1428
|
+
*/
|
|
1429
|
+
interface STTTranscript {
|
|
1430
|
+
text: string;
|
|
1431
|
+
isFinal?: boolean;
|
|
1432
|
+
/** Overall transcript confidence in [0, 1]. */
|
|
1433
|
+
confidence?: number;
|
|
1434
|
+
/** Provider-side end-of-utterance hint (faster than ``isFinal``). */
|
|
1435
|
+
speechFinal?: boolean;
|
|
1436
|
+
/** True when the result was produced in response to a Finalize command. */
|
|
1437
|
+
fromFinalize?: boolean;
|
|
1438
|
+
/** Provider request id (Deepgram populates this from the Metadata frame). */
|
|
1439
|
+
requestId?: string;
|
|
1440
|
+
/** Per-word timings / metadata when the provider emits them. */
|
|
1441
|
+
words?: ReadonlyArray<STTWord>;
|
|
1442
|
+
/** Which provider event this transcript represents (e.g. ``Results``). */
|
|
1443
|
+
eventType?: string;
|
|
1269
1444
|
}
|
|
1270
|
-
/**
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1445
|
+
/** Callback invoked by an `STTAdapter` for each (partial or final) transcript event. */
|
|
1446
|
+
type STTTranscriptCallback = (t: STTTranscript) => Promise<void> | void;
|
|
1447
|
+
/** Shape shared by every STT adapter in the SDK. */
|
|
1448
|
+
interface STTAdapter {
|
|
1449
|
+
connect(): Promise<void>;
|
|
1450
|
+
sendAudio(pcm: Buffer): void | Promise<void>;
|
|
1451
|
+
onTranscript(cb: STTTranscriptCallback): void;
|
|
1452
|
+
close(): void | Promise<void>;
|
|
1275
1453
|
/**
|
|
1276
|
-
*
|
|
1277
|
-
*
|
|
1278
|
-
*
|
|
1279
|
-
*
|
|
1454
|
+
* Optional: ask the provider to immediately finalise the in-flight
|
|
1455
|
+
* utterance (rather than waiting for its own endpoint timer). Called by
|
|
1456
|
+
* ``StreamHandler`` whenever the SDK's VAD signals ``speech_end``, and
|
|
1457
|
+
* after a barge-in cancel — both moments where waiting for the
|
|
1458
|
+
* provider's endpoint heuristic stalls the next turn.
|
|
1459
|
+
*
|
|
1460
|
+
* Implementations that do not support utterance-level finalisation
|
|
1461
|
+
* (e.g. one-shot transcribers like Whisper) should omit this method
|
|
1462
|
+
* entirely; the stream handler does an optional-chained call.
|
|
1280
1463
|
*/
|
|
1281
|
-
|
|
1282
|
-
maxCalls?: number;
|
|
1283
|
-
});
|
|
1284
|
-
private publish;
|
|
1285
|
-
/** Mark a call as in-progress (creates the row if it does not yet exist). */
|
|
1286
|
-
recordCallStart(data: Record<string, unknown>): void;
|
|
1464
|
+
finalize?(): void | Promise<void>;
|
|
1287
1465
|
/**
|
|
1288
|
-
*
|
|
1289
|
-
*
|
|
1290
|
-
*
|
|
1466
|
+
* Optional best-effort pre-call DNS / TLS / HTTP-keepalive warmup.
|
|
1467
|
+
* Default behaviour is a no-op — providers that benefit (e.g.
|
|
1468
|
+
* provider WebSockets with a slow handshake) can override. Failures
|
|
1469
|
+
* must never abort the call.
|
|
1291
1470
|
*/
|
|
1292
|
-
|
|
1471
|
+
warmup?(): Promise<void>;
|
|
1472
|
+
}
|
|
1473
|
+
/** Shape shared by every TTS adapter in the SDK. */
|
|
1474
|
+
interface TTSAdapter {
|
|
1475
|
+
synthesizeStream(text: string): AsyncIterable<Buffer>;
|
|
1293
1476
|
/**
|
|
1294
|
-
*
|
|
1295
|
-
*
|
|
1296
|
-
|
|
1477
|
+
* Optional best-effort pre-call DNS / TLS / HTTP-keepalive warmup.
|
|
1478
|
+
* Default behaviour is a no-op. Failures must never abort the call.
|
|
1479
|
+
*/
|
|
1480
|
+
warmup?(): Promise<void>;
|
|
1481
|
+
}
|
|
1482
|
+
|
|
1483
|
+
/**
|
|
1484
|
+
* Known stable ElevenLabs voice models (from the official ElevenLabs API
|
|
1485
|
+
* reference). Exposed as a typed `as const` object so callers can pass
|
|
1486
|
+
* `ElevenLabsModel.FLASH_V2_5` and get autocomplete / static checking; the
|
|
1487
|
+
* public `modelId` option also accepts an arbitrary `string` so users can
|
|
1488
|
+
* pass forward-compat IDs we haven't enumerated yet.
|
|
1489
|
+
*
|
|
1490
|
+
* - `V3` — newest, highest quality (slower TTFT than Flash).
|
|
1491
|
+
* - `FLASH_V2_5` — current default, fastest (~75 ms TTFT).
|
|
1492
|
+
* - `TURBO_V2_5` — balanced quality/speed.
|
|
1493
|
+
* - `MULTILINGUAL_V2` — best multilingual support.
|
|
1494
|
+
* - `MONOLINGUAL_V1` — legacy English-only.
|
|
1495
|
+
*/
|
|
1496
|
+
declare const ElevenLabsModel: {
|
|
1497
|
+
readonly V3: "eleven_v3";
|
|
1498
|
+
readonly FLASH_V2_5: "eleven_flash_v2_5";
|
|
1499
|
+
readonly TURBO_V2_5: "eleven_turbo_v2_5";
|
|
1500
|
+
readonly MULTILINGUAL_V2: "eleven_multilingual_v2";
|
|
1501
|
+
readonly MONOLINGUAL_V1: "eleven_monolingual_v1";
|
|
1502
|
+
};
|
|
1503
|
+
/** Union of {@link ElevenLabsModel} string values. */
|
|
1504
|
+
type ElevenLabsModel = (typeof ElevenLabsModel)[keyof typeof ElevenLabsModel];
|
|
1505
|
+
declare const ElevenLabsOutputFormat: {
|
|
1506
|
+
readonly MP3_22050_32: "mp3_22050_32";
|
|
1507
|
+
readonly MP3_44100_32: "mp3_44100_32";
|
|
1508
|
+
readonly MP3_44100_64: "mp3_44100_64";
|
|
1509
|
+
readonly MP3_44100_96: "mp3_44100_96";
|
|
1510
|
+
readonly MP3_44100_128: "mp3_44100_128";
|
|
1511
|
+
readonly MP3_44100_192: "mp3_44100_192";
|
|
1512
|
+
readonly PCM_8000: "pcm_8000";
|
|
1513
|
+
readonly PCM_16000: "pcm_16000";
|
|
1514
|
+
readonly PCM_22050: "pcm_22050";
|
|
1515
|
+
readonly PCM_24000: "pcm_24000";
|
|
1516
|
+
readonly PCM_44100: "pcm_44100";
|
|
1517
|
+
readonly ULAW_8000: "ulaw_8000";
|
|
1518
|
+
};
|
|
1519
|
+
/** Union of {@link ElevenLabsOutputFormat} string values. */
|
|
1520
|
+
type ElevenLabsOutputFormat = (typeof ElevenLabsOutputFormat)[keyof typeof ElevenLabsOutputFormat];
|
|
1521
|
+
/** ElevenLabs voice tuning knobs forwarded as `voice_settings` in the request. */
|
|
1522
|
+
interface ElevenLabsVoiceSettings {
|
|
1523
|
+
stability?: number;
|
|
1524
|
+
similarity_boost?: number;
|
|
1525
|
+
style?: number;
|
|
1526
|
+
use_speaker_boost?: boolean;
|
|
1527
|
+
}
|
|
1528
|
+
/** Constructor options for {@link ElevenLabsTTS}. */
|
|
1529
|
+
interface ElevenLabsTTSOptions$1 {
|
|
1530
|
+
voiceId?: string;
|
|
1531
|
+
/**
|
|
1532
|
+
* ElevenLabs voice model ID. The default ``eleven_flash_v2_5`` has the
|
|
1533
|
+
* lowest TTFT (~75 ms). Pass ``eleven_v3`` for highest quality, or any
|
|
1534
|
+
* arbitrary string for forward-compat with future models.
|
|
1535
|
+
*/
|
|
1536
|
+
modelId?: ElevenLabsModel | string;
|
|
1537
|
+
outputFormat?: ElevenLabsOutputFormat;
|
|
1538
|
+
voiceSettings?: ElevenLabsVoiceSettings;
|
|
1539
|
+
languageCode?: string;
|
|
1540
|
+
chunkSize?: number;
|
|
1541
|
+
}
|
|
1542
|
+
/**
|
|
1543
|
+
* ElevenLabs streaming TTS adapter.
|
|
1544
|
+
*
|
|
1545
|
+
* Supported `modelId` values are autocompleted via {@link ElevenLabsModel}.
|
|
1546
|
+
* Default is `eleven_flash_v2_5` (lowest TTFT, ~75 ms).
|
|
1547
|
+
*
|
|
1548
|
+
* **Telephony optimization** — the constructor default
|
|
1549
|
+
* `outputFormat='pcm_16000'` is correct for web playback, dashboard
|
|
1550
|
+
* previews, and 16 kHz pipelines. For real phone calls, use the
|
|
1551
|
+
* carrier-specific factories instead:
|
|
1552
|
+
*
|
|
1553
|
+
* - {@link ElevenLabsTTS.forTwilio} emits `ulaw_8000` natively. Twilio's
|
|
1554
|
+
* media-stream WebSocket expects μ-law @ 8 kHz, so the SDK normally
|
|
1555
|
+
* resamples 16 kHz → 8 kHz and PCM → μ-law before sending. Asking
|
|
1556
|
+
* ElevenLabs to produce μ-law directly skips that step (saves
|
|
1557
|
+
* ~30–80 ms first-byte plus per-frame CPU and avoids any resampling
|
|
1558
|
+
* aliasing).
|
|
1559
|
+
* - {@link ElevenLabsTTS.forTelnyx} emits `pcm_16000`. Telnyx negotiates
|
|
1560
|
+
* L16/16000 on its bidirectional media WebSocket, so 16 kHz PCM is
|
|
1561
|
+
* already the format used end-to-end and no transcoding happens.
|
|
1562
|
+
* ElevenLabs *also* supports `ulaw_8000` if your Telnyx profile is
|
|
1563
|
+
* pinned to PCMU/8000 — pass `outputFormat: 'ulaw_8000'` explicitly
|
|
1564
|
+
* in that case.
|
|
1565
|
+
*/
|
|
1566
|
+
declare class ElevenLabsTTS {
|
|
1567
|
+
static readonly providerKey = "elevenlabs";
|
|
1568
|
+
private readonly apiKey;
|
|
1569
|
+
private readonly voiceId;
|
|
1570
|
+
private readonly modelId;
|
|
1571
|
+
private _outputFormat;
|
|
1572
|
+
private readonly _outputFormatExplicit;
|
|
1573
|
+
private readonly voiceSettings;
|
|
1574
|
+
private readonly languageCode;
|
|
1575
|
+
private readonly chunkSize;
|
|
1576
|
+
/**
|
|
1577
|
+
* Public view of the (possibly auto-flipped) wire format. Read by the
|
|
1578
|
+
* stream-handler to decide whether to skip the client-side resample +
|
|
1579
|
+
* mulaw encode when the bytes are already in the carrier's wire codec.
|
|
1580
|
+
*/
|
|
1581
|
+
get outputFormat(): ElevenLabsOutputFormat;
|
|
1582
|
+
constructor(apiKey: string, voiceId?: string, modelId?: string, outputFormat?: ElevenLabsOutputFormat | string);
|
|
1583
|
+
constructor(apiKey: string, options: ElevenLabsTTSOptions$1);
|
|
1584
|
+
/**
|
|
1585
|
+
* Hook called by ``StreamHandler.initPipeline`` to advise the carrier
|
|
1586
|
+
* wire format. When the user did NOT pass an explicit ``outputFormat``,
|
|
1587
|
+
* auto-flip to the carrier's native codec so the audio bytes ElevenLabs
|
|
1588
|
+
* returns are already in Twilio/Telnyx wire format — eliminating the
|
|
1589
|
+
* client-side 16 kHz → 8 kHz resample and PCM → μ-law encode. The
|
|
1590
|
+
* resample/encode chain was a source of audible artifacts on the
|
|
1591
|
+
* prewarmed firstMessage (see 0.6.2 acceptance notes — burst delivery
|
|
1592
|
+
* of resampled audio crackled on the carrier-side jitter buffer).
|
|
1593
|
+
*
|
|
1594
|
+
* No-op when the caller passed an explicit ``outputFormat`` (incl. via
|
|
1595
|
+
* the ``forTwilio`` / ``forTelnyx`` factories) — user wins.
|
|
1596
|
+
*
|
|
1597
|
+
* Parity with {@link ElevenLabsWebSocketTTS.setTelephonyCarrier}.
|
|
1598
|
+
*/
|
|
1599
|
+
setTelephonyCarrier(carrier: string): void;
|
|
1600
|
+
/**
|
|
1601
|
+
* Construct an instance pre-configured for Twilio Media Streams.
|
|
1602
|
+
*
|
|
1603
|
+
* Sets `outputFormat='ulaw_8000'` so ElevenLabs emits μ-law @ 8 kHz
|
|
1604
|
+
* directly — the exact wire format Twilio's media stream uses — letting
|
|
1605
|
+
* the SDK skip the 16 kHz→8 kHz resample and PCM→μ-law conversion in
|
|
1606
|
+
* `TwilioAudioSender`. Saves ~30–80 ms first-byte and per-frame CPU,
|
|
1607
|
+
* and removes a potential aliasing source.
|
|
1608
|
+
*
|
|
1609
|
+
* `voiceSettings` defaults to a low-bandwidth-friendly profile
|
|
1610
|
+
* (speaker boost off, modest stability) which sounds cleaner at 8 kHz
|
|
1611
|
+
* μ-law than the studio default. Pass an explicit object to override.
|
|
1612
|
+
*/
|
|
1613
|
+
static forTwilio(apiKey: string, options?: Omit<ElevenLabsTTSOptions$1, 'outputFormat'>): ElevenLabsTTS;
|
|
1614
|
+
/**
|
|
1615
|
+
* Construct an instance pre-configured for Telnyx bidirectional media.
|
|
1616
|
+
*
|
|
1617
|
+
* Telnyx's default media-streaming codec is L16 PCM @ 16 kHz, which
|
|
1618
|
+
* matches our default Telnyx handler. We pick `pcm_16000` so the audio
|
|
1619
|
+
* flows end-to-end with zero resampling or transcoding.
|
|
1620
|
+
*
|
|
1621
|
+
* Trade-off: if your Telnyx profile is pinned to PCMU/8000 (μ-law),
|
|
1622
|
+
* construct `ElevenLabsTTS` directly with `outputFormat: 'ulaw_8000'`
|
|
1623
|
+
* — Telnyx supports that natively too.
|
|
1624
|
+
*/
|
|
1625
|
+
static forTelnyx(apiKey: string, options?: Omit<ElevenLabsTTSOptions$1, 'outputFormat'>): ElevenLabsTTS;
|
|
1626
|
+
/**
|
|
1627
|
+
* Synthesise text to speech and return the full audio as a single Buffer.
|
|
1628
|
+
*
|
|
1629
|
+
* For large chunks (or when latency matters) call `synthesizeStream` instead.
|
|
1630
|
+
*/
|
|
1631
|
+
synthesize(text: string): Promise<Buffer>;
|
|
1632
|
+
/**
|
|
1633
|
+
* Synthesise text and yield audio chunks as they arrive (streaming).
|
|
1634
|
+
*
|
|
1635
|
+
* The yielded buffers are raw PCM at 16 kHz (or whatever `outputFormat` is
|
|
1636
|
+
* configured to). `chunkSize` controls the maximum yield size — 512 is a
|
|
1637
|
+
* good choice for low-latency telephony.
|
|
1638
|
+
*/
|
|
1639
|
+
synthesizeStream(text: string): AsyncGenerator<Buffer>;
|
|
1640
|
+
}
|
|
1641
|
+
|
|
1642
|
+
/**
|
|
1643
|
+
* WebSocket-based ElevenLabs TTS provider — opt-in low-latency variant.
|
|
1644
|
+
*
|
|
1645
|
+
* Targets the ElevenLabs streaming-input WebSocket endpoint
|
|
1646
|
+
* (`/v1/text-to-speech/{voice_id}/stream-input`) instead of the HTTP
|
|
1647
|
+
* `/stream` endpoint used by `ElevenLabsTTS`. Saves the HTTP request setup
|
|
1648
|
+
* time per utterance (~50 ms) and avoids the HTTP cold-start TLS handshake
|
|
1649
|
+
* when calls are bursty.
|
|
1650
|
+
*
|
|
1651
|
+
* API matches `ElevenLabsTTS` (`synthesizeStream(text)` returns an
|
|
1652
|
+
* `AsyncGenerator<Buffer>`) so it can be passed anywhere a TTSAdapter is
|
|
1653
|
+
* expected.
|
|
1654
|
+
*
|
|
1655
|
+
* Behaviour notes
|
|
1656
|
+
* - WebSocket is opened **per-utterance** (matches HTTP semantics). A
|
|
1657
|
+
* future revision may pool a WS across utterances of the same call
|
|
1658
|
+
* session — see roadmap Phase 5b.
|
|
1659
|
+
* - `auto_mode=true` is enabled by default. Pass `autoMode: false` to
|
|
1660
|
+
* send a custom `chunk_length_schedule`.
|
|
1661
|
+
* - `outputFormat` is exposed as a query parameter so `ulaw_8000` (Twilio
|
|
1662
|
+
* native) and `pcm_16000` (Telnyx native) work without resampling.
|
|
1663
|
+
* - `eleven_v3` is **not** supported — the WS endpoint rejects it.
|
|
1664
|
+
* - `optimize_streaming_latency` is officially deprecated and is not
|
|
1665
|
+
* exposed.
|
|
1666
|
+
*/
|
|
1667
|
+
|
|
1668
|
+
/** Constructor options for {@link ElevenLabsWebSocketTTS}. */
|
|
1669
|
+
interface ElevenLabsWebSocketTTSOptions {
|
|
1670
|
+
apiKey: string;
|
|
1671
|
+
voiceId?: string;
|
|
1672
|
+
modelId?: ElevenLabsModel | string;
|
|
1673
|
+
outputFormat?: string;
|
|
1674
|
+
voiceSettings?: Record<string, unknown>;
|
|
1675
|
+
languageCode?: string;
|
|
1676
|
+
/** Let the server pick chunk timing. Default true. */
|
|
1677
|
+
autoMode?: boolean;
|
|
1678
|
+
/** WS keep-alive timeout in seconds (5–180). Default 60. */
|
|
1679
|
+
inactivityTimeout?: number;
|
|
1680
|
+
/**
|
|
1681
|
+
* Manual chunk schedule, only used when ``autoMode: false``. Each value
|
|
1682
|
+
* must be 5–500. ElevenLabs default is ``[120, 160, 250, 290]``.
|
|
1683
|
+
*/
|
|
1684
|
+
chunkLengthSchedule?: number[];
|
|
1685
|
+
/** Outgoing audio re-chunk size in bytes. Default 4096. */
|
|
1686
|
+
chunkSize?: number;
|
|
1687
|
+
}
|
|
1688
|
+
/**
|
|
1689
|
+
* Parked WS handle returned by {@link ElevenLabsWebSocketTTS.openParkedConnection}.
|
|
1690
|
+
*
|
|
1691
|
+
* `bosSent` records whether the BOS frame (`{"text": " ", ...}`) has
|
|
1692
|
+
* already been written to the wire. The prewarm pipeline always sends
|
|
1693
|
+
* the BOS so the upstream worker is selected on the parked connection;
|
|
1694
|
+
* `synthesizeStream` adopts the WS and SKIPS its own BOS send to avoid
|
|
1695
|
+
* a protocol error.
|
|
1696
|
+
*/
|
|
1697
|
+
interface ElevenLabsParkedWS {
|
|
1698
|
+
ws: WebSocket__default;
|
|
1699
|
+
bosSent: boolean;
|
|
1700
|
+
}
|
|
1701
|
+
/** WebSocket-based ElevenLabs TTS adapter — opt-in low-latency variant. */
|
|
1702
|
+
declare class ElevenLabsWebSocketTTS implements TTSAdapter {
|
|
1703
|
+
static readonly providerKey = "elevenlabs_ws";
|
|
1704
|
+
readonly apiKey: string;
|
|
1705
|
+
readonly voiceId: string;
|
|
1706
|
+
readonly modelId: string;
|
|
1707
|
+
readonly voiceSettings?: Record<string, unknown>;
|
|
1708
|
+
readonly languageCode?: string;
|
|
1709
|
+
readonly autoMode: boolean;
|
|
1710
|
+
readonly inactivityTimeout: number;
|
|
1711
|
+
readonly chunkLengthSchedule?: number[];
|
|
1712
|
+
readonly chunkSize: number;
|
|
1713
|
+
/**
|
|
1714
|
+
* Single-slot adoption queue. The prewarm pipeline parks one WS per
|
|
1715
|
+
* outbound call here; the next `synthesizeStream` call consumes it
|
|
1716
|
+
* (skipping `new WebSocket()` and the BOS send) instead of opening
|
|
1717
|
+
* a fresh socket. The slot is consumed exactly once: if a second
|
|
1718
|
+
* `synthesizeStream` overlaps the first, only the first benefits.
|
|
1719
|
+
*
|
|
1720
|
+
* We keep this on the adapter (not in a parameter) so the existing
|
|
1721
|
+
* `for await (const chunk of agent.tts.synthesizeStream(...))` call
|
|
1722
|
+
* site in `StreamHandler` continues to work without signature
|
|
1723
|
+
* changes.
|
|
1724
|
+
*/
|
|
1725
|
+
private adoptedConnection;
|
|
1726
|
+
/**
|
|
1727
|
+
* Active WS for the in-flight ``synthesizeStream`` call, if any. Set
|
|
1728
|
+
* when a stream starts, cleared in its ``finally`` block. The
|
|
1729
|
+
* stream-handler calls ``cancelActiveStream()`` from ``cancelSpeaking``
|
|
1730
|
+
* to unblock the generator's inner ``await Promise<frame>`` — without
|
|
1731
|
+
* it, a barge-in on the firstMessage live path leaves the for-await
|
|
1732
|
+
* stuck waiting for the next frame; ElevenLabs never sends
|
|
1733
|
+
* ``isFinal=true`` after the consumer breaks, the 30 s frame timeout
|
|
1734
|
+
* fires post-call, and meanwhile ``initPipeline`` never returns so
|
|
1735
|
+
* the STT ``onTranscript`` callback never registers and subsequent
|
|
1736
|
+
* user turns are silently dropped (root cause of the 2026-05-20
|
|
1737
|
+
* "first message OK, then no response" symptom).
|
|
1738
|
+
*/
|
|
1739
|
+
private activeStreamWs;
|
|
1740
|
+
/**
|
|
1741
|
+
* The wire format requested over the ElevenLabs WS. Initially set from
|
|
1742
|
+
* the constructor; ``setTelephonyCarrier`` may auto-flip it to the
|
|
1743
|
+
* carrier's native codec when the caller did NOT pass ``outputFormat``
|
|
1744
|
+
* explicitly.
|
|
1745
|
+
*/
|
|
1746
|
+
private _outputFormat;
|
|
1747
|
+
private readonly _outputFormatExplicit;
|
|
1748
|
+
/** Public read-only view of the (possibly auto-flipped) wire format. */
|
|
1749
|
+
get outputFormat(): string;
|
|
1750
|
+
constructor(opts: ElevenLabsWebSocketTTSOptions);
|
|
1751
|
+
/**
|
|
1752
|
+
* Hook called by ``StreamHandler`` to advise the carrier wire format.
|
|
1753
|
+
*
|
|
1754
|
+
* When the user did NOT pass an explicit ``outputFormat`` in the
|
|
1755
|
+
* constructor options, this flips the format to the carrier's native
|
|
1756
|
+
* wire codec — saving a client-side transcode step. Calling with an
|
|
1757
|
+
* unknown carrier (``""`` / ``"custom"``) is a no-op.
|
|
1758
|
+
*
|
|
1759
|
+
* When ``outputFormat`` was explicitly passed (incl. via the
|
|
1760
|
+
* ``forTwilio`` / ``forTelnyx`` factories), this method is a no-op —
|
|
1761
|
+
* the user's choice always wins.
|
|
1762
|
+
*/
|
|
1763
|
+
setTelephonyCarrier(carrier: string): void;
|
|
1764
|
+
/**
|
|
1765
|
+
* Force-close the WebSocket of any in-flight ``synthesizeStream`` call.
|
|
1766
|
+
* Called by the stream-handler from ``cancelSpeaking`` (barge-in) so
|
|
1767
|
+
* the generator's inner ``await Promise<frame>`` loop unblocks cleanly
|
|
1768
|
+
* via the ``onClose`` handler — instead of waiting up to 30 s for the
|
|
1769
|
+
* ``FRAME_TIMEOUT_MS`` watchdog to fire. No-op when no stream is in
|
|
1770
|
+
* flight or when the WS is already closing.
|
|
1771
|
+
*
|
|
1772
|
+
* Without this, a barge-in during the firstMessage live path left the
|
|
1773
|
+
* for-await stuck (ElevenLabs never sends ``isFinal=true`` after the
|
|
1774
|
+
* consumer breaks), ``initPipeline`` never returned, the STT
|
|
1775
|
+
* ``onTranscript`` callback never registered, and the entire remainder
|
|
1776
|
+
* of the call was silent for the user. Surfaced during the 2026-05-20
|
|
1777
|
+
* acceptance run.
|
|
1778
|
+
*/
|
|
1779
|
+
cancelActiveStream(): void;
|
|
1780
|
+
/** Pre-configured for Twilio Media Streams (`ulaw_8000`). */
|
|
1781
|
+
static forTwilio(opts: Omit<ElevenLabsWebSocketTTSOptions, 'outputFormat'>): ElevenLabsWebSocketTTS;
|
|
1782
|
+
/** Pre-configured for Telnyx (`pcm_16000`). */
|
|
1783
|
+
static forTelnyx(opts: Omit<ElevenLabsWebSocketTTSOptions, 'outputFormat'>): ElevenLabsWebSocketTTS;
|
|
1784
|
+
private buildUrl;
|
|
1785
|
+
/**
|
|
1786
|
+
* Build the protocol-required BOS frame sent on every fresh WS.
|
|
1787
|
+
*
|
|
1788
|
+
* The single-space `{"text": " "}` keep-alive establishes the session
|
|
1789
|
+
* without committing any synthesis (no `flush: true`, no real text).
|
|
1790
|
+
* Production `synthesizeStream()` and `warmup()` share this exact
|
|
1791
|
+
* construction so the upstream worker chooses the same per-session
|
|
1792
|
+
* config in both cases — otherwise the warm session is on a different
|
|
1793
|
+
* worker than the live request, which defeats the warmup goal.
|
|
1794
|
+
*/
|
|
1795
|
+
private buildBosFrame;
|
|
1796
|
+
/**
|
|
1797
|
+
* Single-shot synthesis: open WS, send text, yield bytes, close.
|
|
1798
|
+
*
|
|
1799
|
+
* Resilience contract:
|
|
1800
|
+
* - Connection bounded by ``CONNECT_TIMEOUT_MS`` (5s, was 15s).
|
|
1801
|
+
* - Each idle wait bounded by ``FRAME_TIMEOUT_MS`` (30s) so a stalled
|
|
1802
|
+
* server cannot keep the generator alive indefinitely.
|
|
1803
|
+
* - Permanent error handler attached BEFORE the open await — prevents
|
|
1804
|
+
* ``uncaughtException`` if an error fires after the once-listener
|
|
1805
|
+
* resolves.
|
|
1806
|
+
* - All event listeners removed in ``finally`` (no closure leak past
|
|
1807
|
+
* socket close).
|
|
1808
|
+
* - Server-reported ``error`` raises ``ElevenLabsTTSError``.
|
|
1809
|
+
* - Per-frame audio payload capped at ``MAX_AUDIO_B64_BYTES``.
|
|
1810
|
+
* - Best-effort EOS ``{"text":""}`` sent in finally (not immediately
|
|
1811
|
+
* after flush — auto_mode could otherwise truncate the tail audio).
|
|
1812
|
+
*/
|
|
1813
|
+
synthesizeStream(text: string): AsyncGenerator<Buffer>;
|
|
1814
|
+
/**
|
|
1815
|
+
* Pre-call WebSocket warmup for the ElevenLabs `/stream-input` endpoint.
|
|
1816
|
+
*
|
|
1817
|
+
* Opens the WS (DNS + TLS + auth handshake), sends the EXACT same BOS
|
|
1818
|
+
* frame the production `synthesizeStream()` path sends — including
|
|
1819
|
+
* `voice_settings` and (when configured) `generation_config` — so
|
|
1820
|
+
* ElevenLabs instantiates the same per-session worker for both
|
|
1821
|
+
* warmup and the live request. If the BOS frames differ, the server
|
|
1822
|
+
* may route warmup and the real call to two different workers, and
|
|
1823
|
+
* the warmed worker is wasted. Idles ~250 ms, then closes. By the
|
|
1824
|
+
* time the first `synthesizeStream()` call lands during the call,
|
|
1825
|
+
* the connection pool has the upstream warm — net wire time saving
|
|
1826
|
+
* of 200-500 ms.
|
|
1827
|
+
*
|
|
1828
|
+
* Billing safety: ElevenLabs bills on synthesised characters
|
|
1829
|
+
* delivered via `audio` frames (per https://elevenlabs.io/pricing).
|
|
1830
|
+
* The keepalive (single-space `text`, no `flush: true`, no real
|
|
1831
|
+
* transcript) is documented as the session-establishment frame and
|
|
1832
|
+
* does NOT generate synthesis. Closing without sending the actual
|
|
1833
|
+
* transcript does not consume billable characters. Best-effort:
|
|
1834
|
+
* failures logged at debug level.
|
|
1835
|
+
*/
|
|
1836
|
+
warmup(): Promise<void>;
|
|
1837
|
+
/**
|
|
1838
|
+
* Open a fresh WS, send the EXACT BOS frame the live `synthesizeStream`
|
|
1839
|
+
* sends, and return the OPEN socket without closing it. Used by the
|
|
1840
|
+
* prewarm pipeline to park a TTS connection during the carrier ringing
|
|
1841
|
+
* window so the next `synthesizeStream` call can adopt it via
|
|
1842
|
+
* {@link adoptWebSocket} and skip ~400-900 ms of TLS + BOS round-trip.
|
|
1843
|
+
*
|
|
1844
|
+
* Returns a parked-handle the caller stashes; the next
|
|
1845
|
+
* `synthesizeStream` will detect the adoption queue and skip its own
|
|
1846
|
+
* `new WebSocket()` + BOS send.
|
|
1847
|
+
*
|
|
1848
|
+
* Billing safety: BOS is the documented session-establishment frame
|
|
1849
|
+
* (single space `text`, no `flush: true`) and does not generate
|
|
1850
|
+
* synthesis. ElevenLabs bills on `audio` frames received from the
|
|
1851
|
+
* server, not on BOS bytes sent by the client.
|
|
1852
|
+
*/
|
|
1853
|
+
openParkedConnection(): Promise<ElevenLabsParkedWS>;
|
|
1854
|
+
/**
|
|
1855
|
+
* Stash a parked WS handle so the next `synthesizeStream` call adopts
|
|
1856
|
+
* it instead of opening a fresh socket. Caller is responsible for
|
|
1857
|
+
* holding the handle alive until either the live request consumes it
|
|
1858
|
+
* or the call ends (in which case `discardAdoptedConnection()`
|
|
1859
|
+
* cleans it up).
|
|
1860
|
+
*/
|
|
1861
|
+
adoptWebSocket(parked: ElevenLabsParkedWS): void;
|
|
1862
|
+
/**
|
|
1863
|
+
* Drop and close any pending parked WS without consuming it. Used on
|
|
1864
|
+
* call-failure paths so a never-started call does not leak a TTS WS
|
|
1865
|
+
* that ElevenLabs will close after its inactivity timeout anyway.
|
|
1866
|
+
*/
|
|
1867
|
+
discardAdoptedConnection(): void;
|
|
1868
|
+
/** No-op — connections are per-utterance and torn down inside synthesizeStream. */
|
|
1869
|
+
close(): Promise<void>;
|
|
1870
|
+
}
|
|
1871
|
+
|
|
1872
|
+
/**
|
|
1873
|
+
* In-memory metrics store for the local dashboard.
|
|
1874
|
+
*
|
|
1875
|
+
* Keeps the last `maxCalls` completed calls and tracks active calls.
|
|
1876
|
+
* Supports SSE event subscribers for real-time updates.
|
|
1877
|
+
*
|
|
1878
|
+
* Optional disk hydration: when `CallLogger` writes per-call records under
|
|
1879
|
+
* `<root>/calls/YYYY/MM/DD/<call_id>/metadata.json`, calling
|
|
1880
|
+
* `hydrate(logRoot)` on a fresh store rebuilds the in-memory list from those
|
|
1881
|
+
* files so the dashboard survives process restarts (the persistence is in
|
|
1882
|
+
* the JSONL/JSON files, the store is just a cache on top).
|
|
1883
|
+
*/
|
|
1884
|
+
|
|
1885
|
+
/** Snapshot of a call as held by the dashboard store. */
|
|
1886
|
+
interface CallRecord {
|
|
1887
|
+
call_id: string;
|
|
1888
|
+
caller: string;
|
|
1889
|
+
callee: string;
|
|
1890
|
+
direction: string;
|
|
1891
|
+
started_at: number;
|
|
1892
|
+
ended_at?: number;
|
|
1893
|
+
/**
|
|
1894
|
+
* Current lifecycle state: ``initiated`` (pre-registered), ``ringing``,
|
|
1895
|
+
* ``in-progress``, ``completed``, ``no-answer``, ``busy``, ``failed``,
|
|
1896
|
+
* ``canceled``, or ``webhook_error``.
|
|
1897
|
+
*/
|
|
1898
|
+
status?: string;
|
|
1899
|
+
transcript?: Array<{
|
|
1900
|
+
role: string;
|
|
1901
|
+
text: string;
|
|
1902
|
+
timestamp: number;
|
|
1903
|
+
}>;
|
|
1904
|
+
turns?: unknown[];
|
|
1905
|
+
metrics?: Record<string, unknown> | null;
|
|
1906
|
+
[key: string]: unknown;
|
|
1907
|
+
}
|
|
1908
|
+
/** Server-Sent-Event payload broadcast by `MetricsStore` for live UI updates. */
|
|
1909
|
+
interface SSEEvent {
|
|
1910
|
+
type: string;
|
|
1911
|
+
data: Record<string, unknown>;
|
|
1912
|
+
}
|
|
1913
|
+
/** In-memory bounded ring buffer of recent calls plus active-call tracking. */
|
|
1914
|
+
declare class MetricsStore extends EventEmitter {
|
|
1915
|
+
private readonly maxCalls;
|
|
1916
|
+
private calls;
|
|
1917
|
+
private activeCalls;
|
|
1918
|
+
/**
|
|
1919
|
+
* User-driven soft delete: call_ids the operator removed from the
|
|
1920
|
+
* dashboard view. The on-disk artefacts written by ``CallLogger``
|
|
1921
|
+
* (``metadata.json``, ``transcript.jsonl``) are intentionally NOT
|
|
1922
|
+
* touched — they serve as the durable backup. All read paths
|
|
1923
|
+
* (``getCalls`` / ``getCall`` / ``getAggregates`` / ``getCallsInRange``
|
|
1924
|
+
* / ``hydrate``) filter against this set so the call is invisible
|
|
1925
|
+
* to the UI and excluded from rolling metrics. Populated from
|
|
1926
|
+
* ``<logRoot>/.deleted_call_ids.json`` on hydrate so deletions
|
|
1927
|
+
* survive a process restart. Parity with Python.
|
|
1928
|
+
*/
|
|
1929
|
+
private deletedCallIds;
|
|
1930
|
+
private deletedIdsPath;
|
|
1931
|
+
/**
|
|
1932
|
+
* Accepts either a numeric ``maxCalls`` (legacy positional — matches the
|
|
1933
|
+
* original TS API) or an options object ``{ maxCalls }`` to align with the
|
|
1934
|
+
* Python SDK's keyword-argument style. Plain literals also work:
|
|
1935
|
+
* ``new MetricsStore()`` / ``new MetricsStore(100)`` / ``new MetricsStore({ maxCalls: 100 })``.
|
|
1936
|
+
*/
|
|
1937
|
+
constructor(maxCallsOrOpts?: number | {
|
|
1938
|
+
maxCalls?: number;
|
|
1939
|
+
});
|
|
1940
|
+
private publish;
|
|
1941
|
+
/** Mark a call as in-progress (creates the row if it does not yet exist). */
|
|
1942
|
+
recordCallStart(data: Record<string, unknown>): void;
|
|
1943
|
+
/**
|
|
1944
|
+
* Pre-register an outbound call before any webhook fires. Lets the
|
|
1945
|
+
* dashboard surface attempts that never reach media (no-answer, busy,
|
|
1946
|
+
* carrier-rejected). Mirrors the Python ``record_call_initiated``.
|
|
1947
|
+
*/
|
|
1948
|
+
recordCallInitiated(data: Record<string, unknown>): void;
|
|
1949
|
+
/**
|
|
1950
|
+
* Update the status of an active or completed call. Terminal states
|
|
1951
|
+
* (completed, no-answer, busy, failed, canceled, webhook_error) move the
|
|
1952
|
+
* row from active to completed so the UI freezes the live duration timer.
|
|
1953
|
+
*/
|
|
1954
|
+
updateCallStatus(callId: string, status: string, extra?: Record<string, unknown>): void;
|
|
1955
|
+
/** Append a single conversation turn to an active call and broadcast it via SSE. */
|
|
1956
|
+
recordTurn(data: Record<string, unknown>): void;
|
|
1957
|
+
/** Move a call from active to completed and persist its final metrics. */
|
|
1958
|
+
recordCallEnd(data: Record<string, unknown>, metrics?: Record<string, unknown> | null): void;
|
|
1959
|
+
/**
|
|
1960
|
+
* Return a window of completed calls in newest-first order.
|
|
1961
|
+
*
|
|
1962
|
+
* Soft-deleted call_ids (see ``deleteCalls``) are filtered out so the
|
|
1963
|
+
* dashboard never re-shows a row the user removed. The on-disk
|
|
1964
|
+
* artefacts are intentionally preserved as a backup.
|
|
1965
|
+
*/
|
|
1966
|
+
getCalls(limit?: number, offset?: number): CallRecord[];
|
|
1967
|
+
/**
|
|
1968
|
+
* Look up a completed call by id (newest match wins).
|
|
1969
|
+
*
|
|
1970
|
+
* Soft-deleted call_ids resolve to ``null`` so the SPA's detail pane
|
|
1971
|
+
* cannot render a row the user removed.
|
|
1972
|
+
*/
|
|
1973
|
+
getCall(callId: string): CallRecord | null;
|
|
1974
|
+
/**
|
|
1975
|
+
* Soft-delete one or more calls from the dashboard view.
|
|
1976
|
+
*
|
|
1977
|
+
* Adds each ``call_id`` to an in-memory set. Subsequent reads via
|
|
1978
|
+
* ``getCalls`` / ``getCall`` / ``getAggregates`` / ``getCallsInRange``
|
|
1979
|
+
* exclude the deleted ids, so rolling metrics (avg latency, total
|
|
1980
|
+
* spend) are recomputed without them. The on-disk
|
|
1981
|
+
* ``metadata.json`` / ``transcript.jsonl`` files written by
|
|
1982
|
+
* ``CallLogger`` are NOT touched — they serve as a durable backup
|
|
1983
|
+
* the operator can audit outside the dashboard.
|
|
1984
|
+
*
|
|
1985
|
+
* Active calls are never deletable. A call_id that is currently
|
|
1986
|
+
* in ``activeCalls`` is silently skipped so a mid-call delete
|
|
1987
|
+
* from the UI cannot orphan the live transcript pane.
|
|
1988
|
+
*
|
|
1989
|
+
* Persisted to ``<logRoot>/.deleted_call_ids.json`` (best-effort)
|
|
1990
|
+
* when ``hydrate()`` has been called with a log root. Parity with
|
|
1991
|
+
* Python ``delete_calls``.
|
|
1992
|
+
*
|
|
1993
|
+
* @returns The list of call_ids actually accepted as deleted.
|
|
1994
|
+
*/
|
|
1995
|
+
deleteCalls(callIds: readonly string[]): string[];
|
|
1996
|
+
/** Whether ``callId`` was soft-deleted from the dashboard. */
|
|
1997
|
+
isDeleted(callId: string): boolean;
|
|
1998
|
+
/** Snapshot of soft-deleted call_ids (sorted). */
|
|
1999
|
+
getDeletedCallIds(): string[];
|
|
2000
|
+
/** Atomically persist the deleted-ids set to disk. Best-effort. */
|
|
2001
|
+
private persistDeletedIds;
|
|
2002
|
+
/** Look up an active call by id (returns undefined if not active or unknown). */
|
|
2003
|
+
getActive(callId: string): CallRecord | undefined;
|
|
2004
|
+
/** Return all currently active (not yet ended) calls. */
|
|
2005
|
+
getActiveCalls(): CallRecord[];
|
|
2006
|
+
/**
|
|
2007
|
+
* Compute summary statistics across the buffered call history.
|
|
2008
|
+
*
|
|
2009
|
+
* Soft-deleted calls are excluded so rolling metrics (avg latency,
|
|
2010
|
+
* total spend) match exactly what the operator sees in the call list.
|
|
2011
|
+
*/
|
|
2012
|
+
getAggregates(): Record<string, unknown>;
|
|
2013
|
+
/**
|
|
2014
|
+
* Return calls whose `started_at` falls within `[fromTs, toTs]` (Unix
|
|
2015
|
+
* seconds). Soft-deleted calls are filtered out.
|
|
1297
2016
|
*/
|
|
1298
|
-
updateCallStatus(callId: string, status: string, extra?: Record<string, unknown>): void;
|
|
1299
|
-
/** Append a single conversation turn to an active call and broadcast it via SSE. */
|
|
1300
|
-
recordTurn(data: Record<string, unknown>): void;
|
|
1301
|
-
/** Move a call from active to completed and persist its final metrics. */
|
|
1302
|
-
recordCallEnd(data: Record<string, unknown>, metrics?: Record<string, unknown> | null): void;
|
|
1303
|
-
/** Return a window of completed calls in newest-first order. */
|
|
1304
|
-
getCalls(limit?: number, offset?: number): CallRecord[];
|
|
1305
|
-
/** Look up a completed call by id (newest match wins). */
|
|
1306
|
-
getCall(callId: string): CallRecord | null;
|
|
1307
|
-
/** Look up an active call by id (returns undefined if not active or unknown). */
|
|
1308
|
-
getActive(callId: string): CallRecord | undefined;
|
|
1309
|
-
/** Return all currently active (not yet ended) calls. */
|
|
1310
|
-
getActiveCalls(): CallRecord[];
|
|
1311
|
-
/** Compute summary statistics across the buffered call history. */
|
|
1312
|
-
getAggregates(): Record<string, unknown>;
|
|
1313
|
-
/** Return calls whose `started_at` falls within `[fromTs, toTs]` (Unix seconds). */
|
|
1314
2017
|
getCallsInRange(fromTs?: number, toTs?: number): CallRecord[];
|
|
1315
|
-
/** Number of completed calls currently in the ring buffer. */
|
|
2018
|
+
/** Number of completed (non-deleted) calls currently in the ring buffer. */
|
|
1316
2019
|
get callCount(): number;
|
|
1317
2020
|
/**
|
|
1318
2021
|
* Rebuild the in-memory call list from `metadata.json` files written by
|
|
@@ -1455,6 +2158,19 @@ declare class SpeechEvents {
|
|
|
1455
2158
|
private dispatch;
|
|
1456
2159
|
}
|
|
1457
2160
|
|
|
2161
|
+
/** Parked provider WebSockets ready for adoption by a per-call StreamHandler. */
|
|
2162
|
+
interface ParkedProviderConnections {
|
|
2163
|
+
/** Pre-opened STT WS (Cartesia today; other adapters may add support later). */
|
|
2164
|
+
stt?: WebSocket.WebSocket;
|
|
2165
|
+
/**
|
|
2166
|
+
* Pre-opened TTS WS handle (ElevenLabs WS today). The `bosSent` flag
|
|
2167
|
+
* lets the live `synthesizeStream` skip its own BOS send when the
|
|
2168
|
+
* prewarm pipeline already wrote it.
|
|
2169
|
+
*/
|
|
2170
|
+
tts?: ElevenLabsParkedWS;
|
|
2171
|
+
/** Pre-opened OpenAI Realtime WS (already through `session.updated`). */
|
|
2172
|
+
openaiRealtime?: WebSocket.WebSocket;
|
|
2173
|
+
}
|
|
1458
2174
|
/** Top-level SDK entry point — wraps a carrier + embedded server + agent loop. */
|
|
1459
2175
|
declare class Patter {
|
|
1460
2176
|
private localConfig;
|
|
@@ -1476,6 +2192,65 @@ declare class Patter {
|
|
|
1476
2192
|
* ``Cannot use both tunnel: true and webhookUrl``.
|
|
1477
2193
|
*/
|
|
1478
2194
|
private tunnelOwnsWebhookUrl;
|
|
2195
|
+
/**
|
|
2196
|
+
* Pre-rendered first-message TTS audio per outbound call_id. Populated
|
|
2197
|
+
* by {@link call} when ``agent.prewarmFirstMessage`` is true; consumed
|
|
2198
|
+
* by the StreamHandler firstMessage emit so the greeting streams
|
|
2199
|
+
* instantly on ``start`` instead of paying the 200-700 ms TTS first-byte
|
|
2200
|
+
* latency. See ``AgentOptions.prewarmFirstMessage``.
|
|
2201
|
+
*
|
|
2202
|
+
* Stores raw bytes in the TTS provider's native sample rate; the
|
|
2203
|
+
* carrier-side audio sender resamples on emit.
|
|
2204
|
+
*/
|
|
2205
|
+
private prewarmAudio;
|
|
2206
|
+
/**
|
|
2207
|
+
* Call IDs whose prewarm cache slot has already been consumed —
|
|
2208
|
+
* either by ``popPrewarmAudio`` (cache hit OR miss on the firstMessage
|
|
2209
|
+
* emit path) or by ``recordPrewarmWaste`` (call ended before pickup).
|
|
2210
|
+
* The prewarm task checks this set BEFORE writing bytes so a slow
|
|
2211
|
+
* synth that finishes after the consumer already polled doesn't
|
|
2212
|
+
* orphan bytes in ``prewarmAudio``. See FIX #92 in the parity audit.
|
|
2213
|
+
*/
|
|
2214
|
+
private prewarmConsumed;
|
|
2215
|
+
/**
|
|
2216
|
+
* Background tasks tracked so {@link disconnect} can wait on / drop any
|
|
2217
|
+
* still-running prewarm-first-message synth before tearing down.
|
|
2218
|
+
*/
|
|
2219
|
+
private prewarmTasks;
|
|
2220
|
+
/**
|
|
2221
|
+
* TTL eviction timers keyed by call_id so {@link disconnect} (and
|
|
2222
|
+
* normal consumption / waste-record paths) can cancel any pending
|
|
2223
|
+
* timer when the slot drains naturally. Without this, the timer
|
|
2224
|
+
* would WARN spuriously after the cache was already emptied.
|
|
2225
|
+
*/
|
|
2226
|
+
private prewarmTtlTimers;
|
|
2227
|
+
/**
|
|
2228
|
+
* Pre-opened, fully-handshaked provider WebSockets keyed by
|
|
2229
|
+
* carrier-issued call_id. Populated by ``parkProviderConnections``
|
|
2230
|
+
* during the carrier ringing window; consumed by the per-call
|
|
2231
|
+
* StreamHandler at ``start`` via ``adoptWebSocket(...)`` so STT / TTS
|
|
2232
|
+
* / Realtime audio can flow on the first turn without paying the
|
|
2233
|
+
* 150-900 ms TLS + WS-upgrade + protocol-handshake round-trip again.
|
|
2234
|
+
*
|
|
2235
|
+
* Distinct from ``prewarmAudio`` (which holds pre-rendered TTS bytes
|
|
2236
|
+
* for the first message); the two features are complementary and
|
|
2237
|
+
* orthogonal — both can be active for the same call.
|
|
2238
|
+
*
|
|
2239
|
+
* Each slot may hold up to three parked connections (STT, TTS,
|
|
2240
|
+
* Realtime). Drained by:
|
|
2241
|
+
* - {@link popPrewarmedConnections} on the carrier ``start`` event
|
|
2242
|
+
* (consumed normally — the handles transfer to the StreamHandler)
|
|
2243
|
+
* - {@link recordPrewarmWaste} on call-termination paths (no-answer,
|
|
2244
|
+
* busy, failed, canceled, AMD voicemail). Closes parked sockets.
|
|
2245
|
+
* - {@link disconnect} on Patter teardown. Closes all parked sockets.
|
|
2246
|
+
*/
|
|
2247
|
+
private prewarmedConnections;
|
|
2248
|
+
/**
|
|
2249
|
+
* TTL eviction handles keyed by call_id for connections that are never
|
|
2250
|
+
* adopted (e.g. a carrier that swallows ``start``). Closes the parked
|
|
2251
|
+
* sockets so they don't leak past the safety window.
|
|
2252
|
+
*/
|
|
2253
|
+
private prewarmedConnTimers;
|
|
1479
2254
|
/**
|
|
1480
2255
|
* Speech-edge events for turn-taking instrumentation. Public surface: the
|
|
1481
2256
|
* seven `on*` proxy accessors below plus the `conversationState` snapshot.
|
|
@@ -1483,7 +2258,7 @@ declare class Patter {
|
|
|
1483
2258
|
* the previous behaviour.
|
|
1484
2259
|
*
|
|
1485
2260
|
* See `src/_speech-events.ts` for the full event taxonomy and the
|
|
1486
|
-
*
|
|
2261
|
+
* OpenAI Realtime alignment table.
|
|
1487
2262
|
*/
|
|
1488
2263
|
readonly speechEvents: SpeechEvents;
|
|
1489
2264
|
get onUserSpeechStarted(): SpeechEventCallback | null;
|
|
@@ -1502,8 +2277,8 @@ declare class Patter {
|
|
|
1502
2277
|
set onAudioOut(cb: SpeechEventCallback | null);
|
|
1503
2278
|
/**
|
|
1504
2279
|
* Snapshot of the current per-side state of the call.
|
|
1505
|
-
*
|
|
1506
|
-
*
|
|
2280
|
+
* Returns the user_state / agent_state payload shape — read-only and
|
|
2281
|
+
* safe to call at any time.
|
|
1507
2282
|
*/
|
|
1508
2283
|
get conversationState(): ConversationStateSnapshot;
|
|
1509
2284
|
/**
|
|
@@ -1553,12 +2328,115 @@ declare class Patter {
|
|
|
1553
2328
|
private _serveImpl;
|
|
1554
2329
|
/** Run the agent in interactive terminal-test mode (no real telephony). */
|
|
1555
2330
|
test(opts: ServeOptions): Promise<void>;
|
|
2331
|
+
/**
|
|
2332
|
+
* Pop and return the pre-synthesised first-message audio for ``callId``.
|
|
2333
|
+
*
|
|
2334
|
+
* Returns ``undefined`` when ``agent.prewarmFirstMessage`` was not set
|
|
2335
|
+
* for the originating outbound call, or when the synth was still in
|
|
2336
|
+
* flight at the moment the carrier emitted ``start`` (cache miss — the
|
|
2337
|
+
* StreamHandler falls back to live TTS).
|
|
2338
|
+
*
|
|
2339
|
+
* Called by the per-call StreamHandler at the start of the firstMessage
|
|
2340
|
+
* emit. Returning bytes here lets the handler skip the live TTS
|
|
2341
|
+
* synthesis and stream the cached buffer directly.
|
|
2342
|
+
*
|
|
2343
|
+
* Marks ``callId`` as consumed regardless of cache hit/miss so a slow
|
|
2344
|
+
* synth task that finishes after this call drops its bytes instead of
|
|
2345
|
+
* orphaning them in ``prewarmAudio``. See FIX #92.
|
|
2346
|
+
*/
|
|
2347
|
+
popPrewarmAudio: (callId: string) => Buffer | undefined;
|
|
2348
|
+
/**
|
|
2349
|
+
* Log a warning if a prewarmed greeting was paid for but never used.
|
|
2350
|
+
* The TTS bill for ``agent.firstMessage`` has already been incurred by
|
|
2351
|
+
* the background synth task, so the user should know — opt-in feature
|
|
2352
|
+
* with a known cost surface.
|
|
2353
|
+
*
|
|
2354
|
+
* Idempotent: the second call for the same ``callId`` is a no-op, so
|
|
2355
|
+
* the status callback firing first and ``endCall`` running afterwards
|
|
2356
|
+
* (or vice-versa) does not double-WARN. Public so the embedded
|
|
2357
|
+
* server's webhook handlers can invoke it on no-answer / busy /
|
|
2358
|
+
* failed / canceled / AMD-machine paths. See FIX #91.
|
|
2359
|
+
*/
|
|
2360
|
+
recordPrewarmWaste: (callId: string) => void;
|
|
2361
|
+
/**
|
|
2362
|
+
* Pop and return the parked provider WebSockets for ``callId``, or
|
|
2363
|
+
* ``undefined`` when no parked connections exist.
|
|
2364
|
+
*
|
|
2365
|
+
* Wired into ``EmbeddedServer.popPrewarmedConnections`` so the
|
|
2366
|
+
* per-call ``StreamHandler`` can adopt the parked sockets at the
|
|
2367
|
+
* carrier ``start`` event instead of opening fresh ones — saving
|
|
2368
|
+
* ~150-900 ms of cold-start handshake on the first turn.
|
|
2369
|
+
*/
|
|
2370
|
+
popPrewarmedConnections: (callId: string) => ParkedProviderConnections | undefined;
|
|
2371
|
+
/**
|
|
2372
|
+
* Close any parked provider WebSockets for ``callId``. Wired into
|
|
2373
|
+
* ``EmbeddedServer.closePrewarmedConnections`` so call-termination
|
|
2374
|
+
* paths (no-answer, busy, failed, canceled, AMD voicemail) drop the
|
|
2375
|
+
* sockets cleanly instead of leaving them to the upstream timeout.
|
|
2376
|
+
*/
|
|
2377
|
+
closePrewarmedConnections: (callId: string) => void;
|
|
2378
|
+
/**
|
|
2379
|
+
* Open and park provider WebSockets in parallel with the carrier-side
|
|
2380
|
+
* ``initiateCall``. Unlike {@link spawnProviderWarmup} (which closes
|
|
2381
|
+
* the WS after a brief idle), the sockets opened here stay OPEN and
|
|
2382
|
+
* are handed off to the per-call ``StreamHandler`` on ``start``.
|
|
2383
|
+
*
|
|
2384
|
+
* This is the structural fix for first-turn cold-start: on Node's
|
|
2385
|
+
* ``ws`` package, opening + closing a WS does NOT warm TLS for the
|
|
2386
|
+
* next open — every fresh ``new WebSocket()`` re-pays the full
|
|
2387
|
+
* TCP + TLS + HTTP-101 round-trip. By keeping the WS open and
|
|
2388
|
+
* adopting it directly, the live first turn skips the handshake
|
|
2389
|
+
* entirely (saves ~150-900 ms depending on provider).
|
|
2390
|
+
*
|
|
2391
|
+
* Best-effort: each provider's parking task is wrapped in
|
|
2392
|
+
* ``Promise.allSettled`` so a slow or failing endpoint cannot block
|
|
2393
|
+
* the others. Providers without ``openParkedConnection`` contribute
|
|
2394
|
+
* nothing — the call falls through to the cold ``connect()`` path
|
|
2395
|
+
* for that provider.
|
|
2396
|
+
*/
|
|
2397
|
+
private parkProviderConnections;
|
|
2398
|
+
/**
|
|
2399
|
+
* Spawn a fire-and-forget task that warms up STT / TTS / LLM in
|
|
2400
|
+
* parallel with the carrier-side ``initiateCall``.
|
|
2401
|
+
*
|
|
2402
|
+
* Best-effort: each provider's optional ``warmup()`` is wrapped in
|
|
2403
|
+
* ``Promise.allSettled`` so a slow or failing endpoint cannot block
|
|
2404
|
+
* the others. Providers without ``warmup`` contribute nothing.
|
|
2405
|
+
*/
|
|
2406
|
+
private spawnProviderWarmup;
|
|
2407
|
+
/**
|
|
2408
|
+
* Pre-render ``agent.firstMessage`` to TTS bytes during the ringing
|
|
2409
|
+
* window and stash them in ``prewarmAudio.set(callId, buf)``.
|
|
2410
|
+
*
|
|
2411
|
+
* Skipped silently when ``agent.prewarmFirstMessage`` is false or
|
|
2412
|
+
* when ``agent.tts`` / ``agent.firstMessage`` is missing. The synth
|
|
2413
|
+
* is bounded by ``ringTimeout`` (default 25 s) so a never-answered
|
|
2414
|
+
* call doesn't tie up the TTS connection. On timeout / error the
|
|
2415
|
+
* cache is left empty and the StreamHandler falls back to live TTS.
|
|
2416
|
+
*
|
|
2417
|
+
* **Pipeline mode only.** Realtime / ConvAI provider modes never
|
|
2418
|
+
* consume the prewarm cache (the StreamHandler for those modes runs
|
|
2419
|
+
* its first-message emit through the provider's own audio path).
|
|
2420
|
+
* Spawning the prewarm in those modes pays the TTS bill for nothing
|
|
2421
|
+
* — so the request is refused with a warning.
|
|
2422
|
+
*
|
|
2423
|
+
* **Capped at ``PREWARM_CACHE_MAX`` concurrent entries.** Refused
|
|
2424
|
+
* with a warning when the cap is reached (the call still proceeds —
|
|
2425
|
+
* StreamHandler falls back to live TTS).
|
|
2426
|
+
*/
|
|
2427
|
+
private spawnPrewarmFirstMessage;
|
|
1556
2428
|
/** Place an outbound call via the configured carrier. */
|
|
1557
2429
|
call(options: LocalCallOptions): Promise<void>;
|
|
1558
2430
|
/**
|
|
1559
2431
|
* Stop the embedded server and any running tunnel. Safe to call multiple
|
|
1560
2432
|
* times. Leaves the instance reusable: a subsequent ``serve()`` works as
|
|
1561
2433
|
* if the previous lifecycle never happened.
|
|
2434
|
+
*
|
|
2435
|
+
* Also clears any pending TTL eviction timers, awaits in-flight
|
|
2436
|
+
* prewarm-first-message synth tasks (best-effort, with a 1 s safety
|
|
2437
|
+
* timeout), and clears the prewarm cache. Without this a still-running
|
|
2438
|
+
* TTS WS keeps the user billed long after SDK teardown, and stale
|
|
2439
|
+
* entries leak across ``serve`` / ``disconnect`` cycles. See FIX #93.
|
|
1562
2440
|
*/
|
|
1563
2441
|
disconnect(): Promise<void>;
|
|
1564
2442
|
/**
|
|
@@ -1957,6 +2835,27 @@ declare function geminiLive(opts: {
|
|
|
1957
2835
|
voice?: string;
|
|
1958
2836
|
}): RealtimeConfig;
|
|
1959
2837
|
|
|
2838
|
+
/**
|
|
2839
|
+
* Default provider pricing and merge utilities.
|
|
2840
|
+
*
|
|
2841
|
+
* Pricing reflects public provider rates as of 2026. Each provider entry
|
|
2842
|
+
* carries provider-level defaults (the model Patter ships with by default)
|
|
2843
|
+
* plus an optional ``models`` map keyed by model identifier with per-model
|
|
2844
|
+
* overrides. Cost-calc functions take an optional ``model`` arg and
|
|
2845
|
+
* auto-resolve the rate via {@link resolveProviderRates} (longest-prefix
|
|
2846
|
+
* match for versioned model IDs). When the agent's adapter exposes
|
|
2847
|
+
* ``model`` and the metrics layer threads it through, the dashboard bills
|
|
2848
|
+
* with per-model accuracy out of the box — no manual override needed.
|
|
2849
|
+
*
|
|
2850
|
+
* User overrides via ``new Patter({ pricing: {...} })`` keep working as
|
|
2851
|
+
* before. To register a new model rate without touching the SDK source:
|
|
2852
|
+
*
|
|
2853
|
+
* new Patter({ pricing: { elevenlabs: { models: { my_custom: { price: 0.075 } } } } })
|
|
2854
|
+
*/
|
|
2855
|
+
/** Pricing table version identifier, updated in lockstep with the Python SDK. */
|
|
2856
|
+
declare const PRICING_VERSION = "2026.3";
|
|
2857
|
+
/** ISO date the pricing table was last refreshed against public provider rates. */
|
|
2858
|
+
declare const PRICING_LAST_UPDATED = "2026-05-08";
|
|
1960
2859
|
/**
|
|
1961
2860
|
* Billing units used by ``DEFAULT_PRICING`` entries. String values keep the
|
|
1962
2861
|
* pricing table JSON-serialisable and backwards-compatible with consumers
|
|
@@ -2075,7 +2974,22 @@ declare function calculateTelephonyCost(provider: string, durationSeconds: numbe
|
|
|
2075
2974
|
|
|
2076
2975
|
/** Per-turn latency breakdown across the STT/LLM/TTS pipeline. */
|
|
2077
2976
|
interface LatencyBreakdown {
|
|
2977
|
+
/**
|
|
2978
|
+
* STT finalization time: end-of-speech (VAD stop or STT speech_final) →
|
|
2979
|
+
* final transcript delivery. This is the engineering metric — pure STT
|
|
2980
|
+
* processing latency, independent of how long the user spoke. Industry
|
|
2981
|
+
* benchmarks (Picovoice, Deepgram, Gladia, Speechmatics) all report this
|
|
2982
|
+
* number as "STT latency". Falls back to turn_start when the endpoint
|
|
2983
|
+
* signal is unavailable (degraded provider, batch STT, etc.).
|
|
2984
|
+
*/
|
|
2078
2985
|
stt_ms: number;
|
|
2986
|
+
/**
|
|
2987
|
+
* Duration of the user's utterance (turn_start → end-of-speech). Useful
|
|
2988
|
+
* to distinguish "user spoke for 4s" from "STT took 4s to finalize" —
|
|
2989
|
+
* they used to be conflated in stt_ms before 0.6.1. Optional — undefined
|
|
2990
|
+
* when the endpoint signal is unavailable.
|
|
2991
|
+
*/
|
|
2992
|
+
user_speech_duration_ms?: number;
|
|
2079
2993
|
/**
|
|
2080
2994
|
* Backwards-compatible LLM bucket. With the split below, this now reflects
|
|
2081
2995
|
* the user-perceived first-token latency (TTFT) when streaming is available
|
|
@@ -2164,6 +3078,12 @@ interface CallMetrics {
|
|
|
2164
3078
|
tts_provider: string;
|
|
2165
3079
|
llm_provider: string;
|
|
2166
3080
|
telephony_provider: string;
|
|
3081
|
+
/** Model identifiers per provider (e.g. "ink-whisper", "eleven_flash_v2_5",
|
|
3082
|
+
* "gpt-oss-120b"). Surface on the dashboard cost breakdown so operators
|
|
3083
|
+
* can attribute per-call spend to a specific model. */
|
|
3084
|
+
stt_model?: string;
|
|
3085
|
+
tts_model?: string;
|
|
3086
|
+
llm_model?: string;
|
|
2167
3087
|
}
|
|
2168
3088
|
/** Programmatic control surface for a live call (transfer, hangup, DTMF). */
|
|
2169
3089
|
interface CallControl {
|
|
@@ -2226,6 +3146,21 @@ declare class CallMetricsAccumulator {
|
|
|
2226
3146
|
private _bargeinStoppedAt;
|
|
2227
3147
|
private _turnUserText;
|
|
2228
3148
|
private _turnSttAudioSeconds;
|
|
3149
|
+
/**
|
|
3150
|
+
* Guard against the recordTurnInterrupted / recordTurnComplete race.
|
|
3151
|
+
*
|
|
3152
|
+
* A VAD-path barge-in fires ``recordTurnInterrupted`` synchronously
|
|
3153
|
+
* inside ``handleAudioAsync`` while the in-flight pipeline LLM stream
|
|
3154
|
+
* keeps unwinding on its own task. When the LLM stream eventually
|
|
3155
|
+
* exits, the existing pipeline path falls through to
|
|
3156
|
+
* ``recordTurnComplete``, which would push a second turn for the same
|
|
3157
|
+
* logical exchange (this time carrying ``user_text=''`` because the
|
|
3158
|
+
* field was already reset). ``_turnAlreadyClosed`` is flipped by
|
|
3159
|
+
* ``recordTurnInterrupted`` and read by ``recordTurnComplete`` so the
|
|
3160
|
+
* late ``recordTurnComplete`` becomes a no-op until the next
|
|
3161
|
+
* ``startTurn`` re-arms the accumulator.
|
|
3162
|
+
*/
|
|
3163
|
+
private _turnAlreadyClosed;
|
|
2229
3164
|
private _totalSttAudioSeconds;
|
|
2230
3165
|
private _totalTtsCharacters;
|
|
2231
3166
|
private _totalRealtimeCost;
|
|
@@ -2236,6 +3171,7 @@ declare class CallMetricsAccumulator {
|
|
|
2236
3171
|
private _actualTelephonyCost;
|
|
2237
3172
|
private _actualSttCost;
|
|
2238
3173
|
private _totalLlmCost;
|
|
3174
|
+
private _llmModel;
|
|
2239
3175
|
private _eventBus;
|
|
2240
3176
|
/** Timestamp (hrTimeMs) when VAD emitted speech_end. */
|
|
2241
3177
|
private _vadStoppedAt;
|
|
@@ -2250,6 +3186,21 @@ declare class CallMetricsAccumulator {
|
|
|
2250
3186
|
private _overlapStartedAt;
|
|
2251
3187
|
private _reportOnlyInitialTtfb;
|
|
2252
3188
|
private _initialTtfbEmitted;
|
|
3189
|
+
/**
|
|
3190
|
+
* Last barge-in detection timestamp (hrTimeMs). Used by
|
|
3191
|
+
* ``_computeTurnLatency`` to gate endpoint_ms / stt_ms emission on turns
|
|
3192
|
+
* that started immediately after a barge-in — those turns have unreliable
|
|
3193
|
+
* VAD/STT anchors and would otherwise pollute the p95 distribution with
|
|
3194
|
+
* synthetic 6+ second spikes.
|
|
3195
|
+
*/
|
|
3196
|
+
private _lastBargeinAt;
|
|
3197
|
+
/**
|
|
3198
|
+
* Count of turns where ``recordSttComplete`` fired but no legitimate VAD
|
|
3199
|
+
* ``speech_end`` had stamped ``_endpointSignalAt``. Exposed via metrics so
|
|
3200
|
+
* we can spot environments where PSTN packet loss is dropping VAD stops
|
|
3201
|
+
* (the common cause of missing endpoint signals).
|
|
3202
|
+
*/
|
|
3203
|
+
private _endpointSignalMissingCount;
|
|
2253
3204
|
constructor(opts: {
|
|
2254
3205
|
callId: string;
|
|
2255
3206
|
providerMode: string;
|
|
@@ -2285,6 +3236,31 @@ declare class CallMetricsAccumulator {
|
|
|
2285
3236
|
* on the first audio byte rather than just before recordSttComplete().
|
|
2286
3237
|
*/
|
|
2287
3238
|
startTurnIfIdle(): void;
|
|
3239
|
+
/**
|
|
3240
|
+
* Anchor the current turn at a legitimate VAD ``speech_start`` event.
|
|
3241
|
+
*
|
|
3242
|
+
* Industry-standard pattern: every VAD ``speech_start`` that fires while the agent
|
|
3243
|
+
* is NOT in the suppressed warmup window re-anchors the turn timer to
|
|
3244
|
+
* the wall-clock moment the user actually started speaking. Re-anchors:
|
|
3245
|
+
*
|
|
3246
|
+
* * ``_turnStart`` — fixes the case where a phantom ``speech_start``
|
|
3247
|
+
* during agent TTS or a partial transcript from the previous user
|
|
3248
|
+
* attempt already stamped the field. Without this, the legitimate
|
|
3249
|
+
* user-speech ``speech_start`` no-op'd and ``user_speech_duration_ms``
|
|
3250
|
+
* inflated from ~1 s to 5-7 s (the original "I waited 7 seconds"
|
|
3251
|
+
* dashboard symptom).
|
|
3252
|
+
* * ``_endpointSignalAt``, ``_vadStoppedAt``, ``_sttFinalAt`` — any
|
|
3253
|
+
* stale anchor from a rejected barge-in / dropped final transcript
|
|
3254
|
+
* on the same uncommitted turn is cleared, so the next
|
|
3255
|
+
* ``recordVadStop`` / ``recordSttFinalTimestamp`` stamps fresh.
|
|
3256
|
+
* * ``_sttComplete``, ``_llmFirstToken``, ``_initialTtfbEmitted`` — same
|
|
3257
|
+
* rationale for the downstream pipeline timestamps.
|
|
3258
|
+
*
|
|
3259
|
+
* No-op once the turn is committed (``_turnCommittedMono`` set): a
|
|
3260
|
+
* VAD ``speech_start`` after commit belongs to the NEXT turn's
|
|
3261
|
+
* barge-in path, handled by ``recordTurnInterrupted`` instead.
|
|
3262
|
+
*/
|
|
3263
|
+
anchorUserSpeechStart(): void;
|
|
2288
3264
|
/** Stamp end-of-STT, capture the user's transcript, and accrue billed STT seconds. */
|
|
2289
3265
|
recordSttComplete(text: string, audioSeconds?: number): void;
|
|
2290
3266
|
/** Record the timestamp of the first LLM token (TTFT). No-op after first call. */
|
|
@@ -2320,9 +3296,26 @@ declare class CallMetricsAccumulator {
|
|
|
2320
3296
|
* to compute ``bargein_ms``.
|
|
2321
3297
|
*/
|
|
2322
3298
|
recordTtsStopped(ts?: number): void;
|
|
2323
|
-
/**
|
|
2324
|
-
|
|
2325
|
-
|
|
3299
|
+
/**
|
|
3300
|
+
* Close the current turn cleanly and append a `TurnMetrics` record.
|
|
3301
|
+
*
|
|
3302
|
+
* Returns ``null`` when ``recordTurnInterrupted`` has already closed
|
|
3303
|
+
* the current turn — this protects against the VAD-barge-in /
|
|
3304
|
+
* pipeline-LLM race where both paths try to finalise the same logical
|
|
3305
|
+
* turn and the second would otherwise push a phantom entry with
|
|
3306
|
+
* ``user_text=''``. The caller treats ``null`` as "nothing to emit";
|
|
3307
|
+
* ``emitTurnMetrics`` is already null-safe.
|
|
3308
|
+
*/
|
|
3309
|
+
recordTurnComplete(agentText: string): TurnMetrics | null;
|
|
3310
|
+
/**
|
|
3311
|
+
* Close the current turn as interrupted (barge-in) and return the
|
|
3312
|
+
* recorded metrics. Returns ``null`` when no turn is open, OR when
|
|
3313
|
+
* ``recordTurnComplete`` has already finalised the current turn —
|
|
3314
|
+
* bidirectional parity with the guard at the top of
|
|
3315
|
+
* ``recordTurnComplete``. Prevents an out-of-order interruption (e.g.
|
|
3316
|
+
* a future refactor that reorders the bargein + LLM-unwind paths)
|
|
3317
|
+
* from overwriting a turn that the complete path already emitted.
|
|
3318
|
+
*/
|
|
2326
3319
|
recordTurnInterrupted(): TurnMetrics | null;
|
|
2327
3320
|
/**
|
|
2328
3321
|
* Record the moment VAD emitted speech_end for the current utterance.
|
|
@@ -2419,6 +3412,13 @@ declare class CallMetricsAccumulator {
|
|
|
2419
3412
|
endCall(): CallMetrics;
|
|
2420
3413
|
/** Return the cost breakdown for the call so far without ending it. */
|
|
2421
3414
|
getCostSoFar(): CostBreakdown;
|
|
3415
|
+
/**
|
|
3416
|
+
* Number of turns where recordSttComplete fired without a prior legitimate
|
|
3417
|
+
* VAD speech_end. Surfaced for diagnostics — a non-zero value points at
|
|
3418
|
+
* dropped VAD stops (commonly PSTN packet loss), which is why we stopped
|
|
3419
|
+
* faking _endpointSignalAt from _sttComplete in 0.6.x.
|
|
3420
|
+
*/
|
|
3421
|
+
get endpointSignalMissingCount(): number;
|
|
2422
3422
|
private _resetTurnState;
|
|
2423
3423
|
private _computeTurnLatency;
|
|
2424
3424
|
private _computeCost;
|
|
@@ -2442,6 +3442,7 @@ declare class CallMetricsAccumulator {
|
|
|
2442
3442
|
* {@link OpenAIRealtimeAdapter}. Audio negotiation defaults to
|
|
2443
3443
|
* `g711_ulaw` so traffic flows through Twilio/Telnyx without transcoding.
|
|
2444
3444
|
*/
|
|
3445
|
+
|
|
2445
3446
|
/**
|
|
2446
3447
|
* Supported OpenAI Realtime wire audio formats. See
|
|
2447
3448
|
* https://platform.openai.com/docs/guides/realtime for the full list.
|
|
@@ -2456,6 +3457,67 @@ declare const OpenAIRealtimeAudioFormat: {
|
|
|
2456
3457
|
};
|
|
2457
3458
|
/** Union of {@link OpenAIRealtimeAudioFormat} string values. */
|
|
2458
3459
|
type OpenAIRealtimeAudioFormat = (typeof OpenAIRealtimeAudioFormat)[keyof typeof OpenAIRealtimeAudioFormat];
|
|
3460
|
+
/**
|
|
3461
|
+
* Known OpenAI Realtime API model identifiers.
|
|
3462
|
+
*
|
|
3463
|
+
* `GPT_REALTIME_2` is OpenAI's most-capable realtime voice model
|
|
3464
|
+
* (speech-to-speech with configurable reasoning effort, stronger
|
|
3465
|
+
* instruction following, 128K context). It accepts the same session
|
|
3466
|
+
* update wire format as the v1 `gpt-realtime` family but supports an
|
|
3467
|
+
* additional `reasoning.effort` field — see `reasoningEffort` on
|
|
3468
|
+
* {@link OpenAIRealtimeOptions}. Pricing differs from the mini default;
|
|
3469
|
+
* override `DEFAULT_PRICING.openai_realtime` with the values in
|
|
3470
|
+
* `DEFAULT_PRICING.openai_realtime_2` when selecting it.
|
|
3471
|
+
*/
|
|
3472
|
+
declare const OpenAIRealtimeModel: {
|
|
3473
|
+
readonly GPT_REALTIME: "gpt-realtime";
|
|
3474
|
+
readonly GPT_REALTIME_2: "gpt-realtime-2";
|
|
3475
|
+
readonly GPT_REALTIME_MINI: "gpt-realtime-mini";
|
|
3476
|
+
readonly GPT_4O_REALTIME_PREVIEW: "gpt-4o-realtime-preview";
|
|
3477
|
+
readonly GPT_4O_MINI_REALTIME_PREVIEW: "gpt-4o-mini-realtime-preview";
|
|
3478
|
+
};
|
|
3479
|
+
/** Union of {@link OpenAIRealtimeModel} string values. */
|
|
3480
|
+
type OpenAIRealtimeModel = (typeof OpenAIRealtimeModel)[keyof typeof OpenAIRealtimeModel];
|
|
3481
|
+
/** OpenAI Realtime / TTS voice identifiers. */
|
|
3482
|
+
declare const OpenAIVoice: {
|
|
3483
|
+
readonly ALLOY: "alloy";
|
|
3484
|
+
readonly ASH: "ash";
|
|
3485
|
+
readonly BALLAD: "ballad";
|
|
3486
|
+
readonly CORAL: "coral";
|
|
3487
|
+
readonly ECHO: "echo";
|
|
3488
|
+
readonly FABLE: "fable";
|
|
3489
|
+
readonly NOVA: "nova";
|
|
3490
|
+
readonly ONYX: "onyx";
|
|
3491
|
+
readonly SAGE: "sage";
|
|
3492
|
+
readonly SHIMMER: "shimmer";
|
|
3493
|
+
readonly VERSE: "verse";
|
|
3494
|
+
};
|
|
3495
|
+
/** Union of {@link OpenAIVoice} string values. */
|
|
3496
|
+
type OpenAIVoice = (typeof OpenAIVoice)[keyof typeof OpenAIVoice];
|
|
3497
|
+
/**
|
|
3498
|
+
* Models accepted by `input_audio_transcription` on Realtime sessions.
|
|
3499
|
+
*
|
|
3500
|
+
* `GPT_REALTIME_WHISPER` is OpenAI's streaming-optimised Whisper variant
|
|
3501
|
+
* designed for low-latency transcript deltas inside a Realtime session.
|
|
3502
|
+
* Billed per minute of audio (separate from the conversational model
|
|
3503
|
+
* tokens). Use it when you want faster partial transcripts than
|
|
3504
|
+
* `whisper-1` at lower cost than `gpt-4o-transcribe`.
|
|
3505
|
+
*/
|
|
3506
|
+
declare const OpenAITranscriptionModel: {
|
|
3507
|
+
readonly WHISPER_1: "whisper-1";
|
|
3508
|
+
readonly GPT_4O_TRANSCRIBE: "gpt-4o-transcribe";
|
|
3509
|
+
readonly GPT_4O_MINI_TRANSCRIBE: "gpt-4o-mini-transcribe";
|
|
3510
|
+
readonly GPT_REALTIME_WHISPER: "gpt-realtime-whisper";
|
|
3511
|
+
};
|
|
3512
|
+
/** Union of {@link OpenAITranscriptionModel} string values. */
|
|
3513
|
+
type OpenAITranscriptionModel = (typeof OpenAITranscriptionModel)[keyof typeof OpenAITranscriptionModel];
|
|
3514
|
+
/** Server-side voice-activity-detection modes. */
|
|
3515
|
+
declare const OpenAIRealtimeVADType: {
|
|
3516
|
+
readonly SERVER_VAD: "server_vad";
|
|
3517
|
+
readonly SEMANTIC_VAD: "semantic_vad";
|
|
3518
|
+
};
|
|
3519
|
+
/** Union of {@link OpenAIRealtimeVADType} string values. */
|
|
3520
|
+
type OpenAIRealtimeVADType = (typeof OpenAIRealtimeVADType)[keyof typeof OpenAIRealtimeVADType];
|
|
2459
3521
|
/** Callback signature for events emitted by {@link OpenAIRealtimeAdapter}. */
|
|
2460
3522
|
type RealtimeEventCallback = (type: string, data: unknown) => void | Promise<void>;
|
|
2461
3523
|
/** Constructor options for {@link OpenAIRealtimeAdapter}. */
|
|
@@ -2483,28 +3545,96 @@ interface OpenAIRealtimeOptions {
|
|
|
2483
3545
|
}
|
|
2484
3546
|
/** Realtime WebSocket adapter for OpenAI's `gpt-realtime` family. */
|
|
2485
3547
|
declare class OpenAIRealtimeAdapter {
|
|
2486
|
-
|
|
2487
|
-
|
|
2488
|
-
|
|
2489
|
-
|
|
2490
|
-
|
|
2491
|
-
|
|
2492
|
-
|
|
3548
|
+
protected readonly apiKey: string;
|
|
3549
|
+
protected readonly model: string;
|
|
3550
|
+
protected readonly voice: string;
|
|
3551
|
+
protected readonly instructions: string;
|
|
3552
|
+
protected readonly tools?: Array<{
|
|
3553
|
+
name: string;
|
|
3554
|
+
description: string;
|
|
3555
|
+
parameters: Record<string, unknown>;
|
|
3556
|
+
strict?: boolean;
|
|
3557
|
+
}> | undefined;
|
|
3558
|
+
protected readonly audioFormat: OpenAIRealtimeAudioFormat;
|
|
3559
|
+
protected ws: WebSocket__default | null;
|
|
2493
3560
|
private readonly eventCallbacks;
|
|
2494
3561
|
private messageListenerAttached;
|
|
2495
3562
|
private heartbeat;
|
|
2496
3563
|
private currentResponseItemId;
|
|
2497
3564
|
private currentResponseAudioMs;
|
|
2498
3565
|
private currentResponseFirstAudioAt;
|
|
2499
|
-
|
|
3566
|
+
protected readonly options: OpenAIRealtimeOptions;
|
|
2500
3567
|
constructor(apiKey: string, model?: string, voice?: string, instructions?: string, tools?: Array<{
|
|
2501
3568
|
name: string;
|
|
2502
3569
|
description: string;
|
|
2503
3570
|
parameters: Record<string, unknown>;
|
|
2504
3571
|
strict?: boolean;
|
|
2505
3572
|
}> | undefined, audioFormat?: OpenAIRealtimeAudioFormat, options?: OpenAIRealtimeOptions);
|
|
3573
|
+
/**
|
|
3574
|
+
* Build the production session.update body. Mirrors the body sent
|
|
3575
|
+
* inside `connect()` so warmup can apply identical configuration to
|
|
3576
|
+
* the upstream session and prime it without billing.
|
|
3577
|
+
*/
|
|
3578
|
+
private buildSessionConfig;
|
|
3579
|
+
/**
|
|
3580
|
+
* Pre-call WebSocket warmup for the OpenAI Realtime endpoint.
|
|
3581
|
+
*
|
|
3582
|
+
* The canonical session-only warm step on the Realtime API: open the
|
|
3583
|
+
* WS, wait for `session.created`, send a single `session.update`
|
|
3584
|
+
* containing the same fields that the production `connect()` path
|
|
3585
|
+
* applies (`input_audio_format`, `output_audio_format`, `voice`,
|
|
3586
|
+
* `instructions`, `turn_detection`, `input_audio_transcription`,
|
|
3587
|
+
* plus any opt-in fields populated on the adapter), wait for the
|
|
3588
|
+
* matching `session.updated` ack, then close cleanly. This primes
|
|
3589
|
+
* the per-session state on the OpenAI side — DNS + TLS + auth
|
|
3590
|
+
* handshake + initial config exchange — without ever invoking the
|
|
3591
|
+
* model.
|
|
3592
|
+
*
|
|
3593
|
+
* Earlier revisions sent `response.create` with
|
|
3594
|
+
* `{"response": {"generate": false}}` to prime the inference path.
|
|
3595
|
+
* That field is NOT in the OpenAI Realtime API schema; the server
|
|
3596
|
+
* either ignores it (and bills tokens for a real model response) or
|
|
3597
|
+
* rejects the request with `invalid_request_error`. Both behaviours
|
|
3598
|
+
* are, respectively, billing-unsafe and a no-op beyond TLS warm-up. The
|
|
3599
|
+
* `session.update` flow is documented and side-effect-free.
|
|
3600
|
+
*
|
|
3601
|
+
* Billing safety: `session.update` only mutates session
|
|
3602
|
+
* configuration. It does NOT invoke the model, does NOT consume any
|
|
3603
|
+
* audio buffer, and does NOT trigger token generation, so no
|
|
3604
|
+
* per-token cost is accrued. Best-effort: failures are logged at
|
|
3605
|
+
* debug level and never raised.
|
|
3606
|
+
*/
|
|
3607
|
+
warmup(): Promise<void>;
|
|
2506
3608
|
/** Open the Realtime WebSocket and apply the session configuration. */
|
|
2507
3609
|
connect(): Promise<void>;
|
|
3610
|
+
/**
|
|
3611
|
+
* Adopt a pre-opened, already-`session.updated` Realtime WebSocket
|
|
3612
|
+
* produced by the prewarm pipeline (see `Patter.parkProviderConnections`).
|
|
3613
|
+
* Skips the fresh `new WebSocket()` + `session.created` /
|
|
3614
|
+
* `session.update` round-trip — saves ~250-450 ms on first turn.
|
|
3615
|
+
*
|
|
3616
|
+
* Caller MUST verify `ws.readyState === OPEN` before calling and MUST
|
|
3617
|
+
* have already received `session.updated` on the parked socket. If
|
|
3618
|
+
* the parked WS died between park and adopt, fall back to `connect()`.
|
|
3619
|
+
*/
|
|
3620
|
+
adoptWebSocket(ws: WebSocket__default): void;
|
|
3621
|
+
protected armHeartbeatAndListener(): void;
|
|
3622
|
+
/**
|
|
3623
|
+
* Open a fresh Realtime WS, exchange `session.created` /
|
|
3624
|
+
* `session.update` / `session.updated` (so the upstream session is
|
|
3625
|
+
* fully primed), and return the OPEN socket WITHOUT arming the
|
|
3626
|
+
* heartbeat / message listener. Used by the prewarm pipeline to park
|
|
3627
|
+
* a Realtime connection during ringing; the live consumer adopts it
|
|
3628
|
+
* via {@link adoptWebSocket}.
|
|
3629
|
+
*
|
|
3630
|
+
* Bounded by 8 s. Throws on timeout / handshake failure — callers
|
|
3631
|
+
* (the prewarm pipeline) treat any error as a cache miss and the
|
|
3632
|
+
* call falls through to the cold `connect()` path.
|
|
3633
|
+
*
|
|
3634
|
+
* Billing safety: `session.update` does not invoke the model. No
|
|
3635
|
+
* tokens are billed.
|
|
3636
|
+
*/
|
|
3637
|
+
openParkedConnection(): Promise<WebSocket__default>;
|
|
2508
3638
|
/** Append a base64-encoded audio chunk to the realtime input buffer. */
|
|
2509
3639
|
sendAudio(mulawAudio: Buffer): void;
|
|
2510
3640
|
/**
|
|
@@ -2518,7 +3648,7 @@ declare class OpenAIRealtimeAdapter {
|
|
|
2518
3648
|
onEvent(callback: RealtimeEventCallback): void;
|
|
2519
3649
|
/** Remove a previously registered {@link onEvent} callback. */
|
|
2520
3650
|
offEvent(callback: RealtimeEventCallback): void;
|
|
2521
|
-
|
|
3651
|
+
protected ensureMessageListener(): void;
|
|
2522
3652
|
/** Truncate the in-flight assistant turn and cancel the active response.
|
|
2523
3653
|
*
|
|
2524
3654
|
* ``audio_end_ms`` MUST reflect what the caller actually heard, not what
|
|
@@ -2535,6 +3665,17 @@ declare class OpenAIRealtimeAdapter {
|
|
|
2535
3665
|
cancelResponse(): void;
|
|
2536
3666
|
/** Inject a user text turn and request a new response. */
|
|
2537
3667
|
sendText(text: string): Promise<void>;
|
|
3668
|
+
/**
|
|
3669
|
+
* Trigger `response.create` with no new user item.
|
|
3670
|
+
*
|
|
3671
|
+
* Used by the Realtime stream-handler to drive a response after the
|
|
3672
|
+
* client-side hallucination filter accepts an
|
|
3673
|
+
* `input_audio_transcription.completed` event. The server VAD config
|
|
3674
|
+
* sets `create_response: false` so OpenAI no longer auto-creates a
|
|
3675
|
+
* response on every `input_audio_buffer.committed`; Patter is now
|
|
3676
|
+
* responsible for triggering it explicitly when a real user turn lands.
|
|
3677
|
+
*/
|
|
3678
|
+
requestResponse(): Promise<void>;
|
|
2538
3679
|
/**
|
|
2539
3680
|
* Make the AI speak ``text`` as its opening line.
|
|
2540
3681
|
*
|
|
@@ -2684,11 +3825,6 @@ declare function isRemoteUrl(onMessage: unknown): onMessage is string;
|
|
|
2684
3825
|
/** Check if a URL is a WebSocket URL. */
|
|
2685
3826
|
declare function isWebSocketUrl(url: string): boolean;
|
|
2686
3827
|
|
|
2687
|
-
/**
|
|
2688
|
-
* Embedded HTTP/WebSocket server — wires Express webhooks for the configured
|
|
2689
|
-
* carrier (Twilio or Telnyx) into the per-call `StreamHandler` and dashboard.
|
|
2690
|
-
*/
|
|
2691
|
-
|
|
2692
3828
|
/** Resolved configuration consumed by `EmbeddedServer` (carrier credentials, webhook URL, etc.). */
|
|
2693
3829
|
interface LocalConfig {
|
|
2694
3830
|
twilioSid?: string;
|
|
@@ -3322,6 +4458,8 @@ interface SonioxSTTOptions$1 {
|
|
|
3322
4458
|
}
|
|
3323
4459
|
/** Streaming STT adapter for Soniox's real-time WebSocket API. */
|
|
3324
4460
|
declare class SonioxSTT {
|
|
4461
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
4462
|
+
static readonly providerKey = "soniox";
|
|
3325
4463
|
private ws;
|
|
3326
4464
|
private callbacks;
|
|
3327
4465
|
private final;
|
|
@@ -3430,6 +4568,8 @@ interface AssemblyAISTTOptions$1 {
|
|
|
3430
4568
|
declare class AssemblyAISTT {
|
|
3431
4569
|
private readonly apiKey;
|
|
3432
4570
|
private readonly options;
|
|
4571
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
4572
|
+
static readonly providerKey = "assemblyai";
|
|
3433
4573
|
private ws;
|
|
3434
4574
|
private readonly callbacks;
|
|
3435
4575
|
private closing;
|
|
@@ -3460,6 +4600,22 @@ declare class AssemblyAISTT {
|
|
|
3460
4600
|
static forTwilio(apiKey: string, model?: AssemblyAIModel): AssemblyAISTT;
|
|
3461
4601
|
private buildUrl;
|
|
3462
4602
|
private buildHeaders;
|
|
4603
|
+
/**
|
|
4604
|
+
* Pre-call WebSocket warmup for the AssemblyAI v3 `/v3/ws` endpoint.
|
|
4605
|
+
*
|
|
4606
|
+
* Opens the WS (DNS + TLS + auth handshake), idles ~250 ms so the
|
|
4607
|
+
* AssemblyAI edge keeps the session state warm, then sends Terminate
|
|
4608
|
+
* and closes. By the time `connect()` is invoked at call-pickup the
|
|
4609
|
+
* resolver and TLS session are hot — net wire time saving of
|
|
4610
|
+
* 200-500 ms.
|
|
4611
|
+
*
|
|
4612
|
+
* Billing safety: AssemblyAI Universal Streaming bills on streamed
|
|
4613
|
+
* audio seconds (per https://www.assemblyai.com/pricing). Opening +
|
|
4614
|
+
* closing the WebSocket without forwarding any audio frames does
|
|
4615
|
+
* not consume billable seconds. Best-effort: failures logged at
|
|
4616
|
+
* debug level.
|
|
4617
|
+
*/
|
|
4618
|
+
warmup(): Promise<void>;
|
|
3463
4619
|
/** Open the streaming WebSocket and arm message handlers. */
|
|
3464
4620
|
connect(): Promise<void>;
|
|
3465
4621
|
private awaitOpen;
|
|
@@ -3500,6 +4656,7 @@ declare class AssemblyAISTT {
|
|
|
3500
4656
|
* Implements a `DeepgramSTT`-shaped provider using Cartesia's streaming
|
|
3501
4657
|
* WebSocket API. Pure `ws` transport — does NOT depend on the vendor SDK.
|
|
3502
4658
|
*/
|
|
4659
|
+
|
|
3503
4660
|
/** Patter-normalised transcript event emitted by {@link CartesiaSTT}. */
|
|
3504
4661
|
interface Transcript$4 {
|
|
3505
4662
|
readonly text: string;
|
|
@@ -3546,6 +4703,8 @@ interface CartesiaSTTOptions$1 {
|
|
|
3546
4703
|
declare class CartesiaSTT {
|
|
3547
4704
|
private readonly apiKey;
|
|
3548
4705
|
private readonly options;
|
|
4706
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
4707
|
+
static readonly providerKey = "cartesia_stt";
|
|
3549
4708
|
private ws;
|
|
3550
4709
|
private callbacks;
|
|
3551
4710
|
private keepaliveTimer;
|
|
@@ -3555,13 +4714,65 @@ declare class CartesiaSTT {
|
|
|
3555
4714
|
*/
|
|
3556
4715
|
requestId: string | null;
|
|
3557
4716
|
constructor(apiKey: string, options?: CartesiaSTTOptions$1);
|
|
4717
|
+
/**
|
|
4718
|
+
* Open a fresh WebSocket without arming any message / keepalive handlers
|
|
4719
|
+
* and without taking ownership on `this.ws`. Returns the OPEN socket so
|
|
4720
|
+
* the caller (the prewarm pipeline) can park it for later adoption via
|
|
4721
|
+
* `adoptWebSocket`. Bounded by `CONNECT_TIMEOUT_MS`.
|
|
4722
|
+
*
|
|
4723
|
+
* Billing safety: opening + parking the WS does not stream audio
|
|
4724
|
+
* (Cartesia STT bills on streamed audio seconds), so no charge is
|
|
4725
|
+
* incurred. Close the returned WS yourself if it is never adopted.
|
|
4726
|
+
*/
|
|
4727
|
+
openParkedConnection(): Promise<WebSocket__default>;
|
|
3558
4728
|
private buildWsUrl;
|
|
4729
|
+
/**
|
|
4730
|
+
* Pre-call WebSocket warmup for the Cartesia STT `/stt/websocket` endpoint.
|
|
4731
|
+
*
|
|
4732
|
+
* Opens the WS (DNS + TLS + auth handshake), idles ~250 ms so the
|
|
4733
|
+
* Cartesia edge keeps session state warm, then closes. By the time
|
|
4734
|
+
* `connect()` is invoked at call-pickup the resolver and TLS session
|
|
4735
|
+
* are hot — net wire time saving of 200-500 ms.
|
|
4736
|
+
*
|
|
4737
|
+
* Billing safety: Cartesia STT bills on streamed audio seconds (per
|
|
4738
|
+
* https://docs.cartesia.ai/2025-04-16/api-reference/stt/stt). Opening
|
|
4739
|
+
* + closing the WebSocket without forwarding audio does not consume
|
|
4740
|
+
* billable seconds. Best-effort: failures logged at debug level.
|
|
4741
|
+
*/
|
|
4742
|
+
warmup(): Promise<void>;
|
|
3559
4743
|
/** Open the streaming WebSocket and arm message + keepalive handlers. */
|
|
3560
4744
|
connect(): Promise<void>;
|
|
4745
|
+
/**
|
|
4746
|
+
* Adopt a pre-opened, already-OPEN WebSocket produced by the prewarm
|
|
4747
|
+
* pipeline (see `Patter.parkProviderConnections`). Skips the fresh
|
|
4748
|
+
* `new WebSocket()` + handshake — the WS is already through DNS, TLS
|
|
4749
|
+
* and HTTP-101 so audio frames can flow on this turn instead of
|
|
4750
|
+
* paying ~150-400 ms of handshake.
|
|
4751
|
+
*
|
|
4752
|
+
* Caller MUST verify `ws.readyState === OPEN` before calling. If the
|
|
4753
|
+
* parked WS died between park and adopt, fall back to `connect()`.
|
|
4754
|
+
*/
|
|
4755
|
+
adoptWebSocket(ws: WebSocket__default): void;
|
|
4756
|
+
private armMessageAndKeepalive;
|
|
3561
4757
|
private handleEvent;
|
|
3562
4758
|
private emit;
|
|
3563
4759
|
/** Send a binary PCM16-LE audio chunk to Cartesia for transcription. */
|
|
3564
4760
|
sendAudio(audio: Buffer): void;
|
|
4761
|
+
/**
|
|
4762
|
+
* Force Cartesia to finalise the in-flight utterance immediately.
|
|
4763
|
+
*
|
|
4764
|
+
* Sends a ``finalize`` text frame on the live WebSocket. Cartesia
|
|
4765
|
+
* replies with the final transcript followed by ``flush_done``,
|
|
4766
|
+
* bypassing its conservative internal silence heuristic (which can
|
|
4767
|
+
* wait 2-7 s on PSTN audio before naturally finalising). Wired
|
|
4768
|
+
* into ``StreamHandler`` on the VAD ``speech_end`` event so the
|
|
4769
|
+
* SDK's authoritative end-of-speech detection forces an immediate
|
|
4770
|
+
* STT finalisation — turning Cartesia's natural-pause endpointing
|
|
4771
|
+
* into a deterministic VAD-driven one, parity with the Deepgram
|
|
4772
|
+
* fast-path. No-op when the WS isn't open. Parity with Python
|
|
4773
|
+
* ``CartesiaSTT.finalize``.
|
|
4774
|
+
*/
|
|
4775
|
+
finalize(): Promise<void>;
|
|
3565
4776
|
/** Register a transcript listener. */
|
|
3566
4777
|
onTranscript(callback: TranscriptCallback$4): void;
|
|
3567
4778
|
/** Remove a previously registered transcript callback. */
|
|
@@ -3624,6 +4835,8 @@ interface LMNTTTSOptions$1 {
|
|
|
3624
4835
|
}
|
|
3625
4836
|
/** LMNT TTS adapter backed by the `/v1/ai/speech/bytes` HTTP streaming endpoint. */
|
|
3626
4837
|
declare class LMNTTTS {
|
|
4838
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
4839
|
+
static readonly providerKey = "lmnt";
|
|
3627
4840
|
private readonly apiKey;
|
|
3628
4841
|
private readonly model;
|
|
3629
4842
|
private readonly voice;
|
|
@@ -3678,6 +4891,18 @@ interface Transcript$3 {
|
|
|
3678
4891
|
}
|
|
3679
4892
|
type TranscriptCallback$3 = (transcript: Transcript$3) => void;
|
|
3680
4893
|
type ErrorCallback$1 = (error: Error) => void;
|
|
4894
|
+
/** Known Deepgram STT models. */
|
|
4895
|
+
declare const DeepgramModel: {
|
|
4896
|
+
readonly NOVA_3: "nova-3";
|
|
4897
|
+
readonly NOVA_2: "nova-2";
|
|
4898
|
+
readonly NOVA_2_PHONECALL: "nova-2-phonecall";
|
|
4899
|
+
readonly NOVA_2_GENERAL: "nova-2-general";
|
|
4900
|
+
readonly NOVA_2_MEETING: "nova-2-meeting";
|
|
4901
|
+
readonly NOVA: "nova";
|
|
4902
|
+
readonly ENHANCED: "enhanced";
|
|
4903
|
+
readonly BASE: "base";
|
|
4904
|
+
};
|
|
4905
|
+
type DeepgramModel = (typeof DeepgramModel)[keyof typeof DeepgramModel];
|
|
3681
4906
|
/**
|
|
3682
4907
|
* Optional tuning knobs for Deepgram live transcription.
|
|
3683
4908
|
*
|
|
@@ -3698,92 +4923,306 @@ interface DeepgramSTTOptions$1 {
|
|
|
3698
4923
|
*/
|
|
3699
4924
|
readonly endpointingMs?: number;
|
|
3700
4925
|
/**
|
|
3701
|
-
* End-of-utterance silence window in milliseconds. Deepgram enforces a
|
|
3702
|
-
* hard minimum of 1000 ms. Set to ``null`` to disable. Default ``1000``.
|
|
4926
|
+
* End-of-utterance silence window in milliseconds. Deepgram enforces a
|
|
4927
|
+
* hard minimum of 1000 ms. Set to ``null`` to disable. Default ``1000``.
|
|
4928
|
+
*/
|
|
4929
|
+
readonly utteranceEndMs?: number | null;
|
|
4930
|
+
/**
|
|
4931
|
+
* Enable smart formatting (punctuation + numerals). Default ``false`` —
|
|
4932
|
+
* smart formatting adds roughly 50–150 ms to TTFT on each final transcript
|
|
4933
|
+
* and is rarely useful for telephony pipelines that pass the text straight
|
|
4934
|
+
* to an LLM. Set to ``true`` for use cases (dashboards, raw transcripts)
|
|
4935
|
+
* where the formatted text is surfaced directly to humans.
|
|
4936
|
+
*/
|
|
4937
|
+
readonly smartFormat?: boolean;
|
|
4938
|
+
/** Emit interim (non-final) transcripts. Default ``true``. */
|
|
4939
|
+
readonly interimResults?: boolean;
|
|
4940
|
+
/** Emit VAD events (``SpeechStarted`` / ``UtteranceEnd``). Default ``true``. */
|
|
4941
|
+
readonly vadEvents?: boolean;
|
|
4942
|
+
}
|
|
4943
|
+
/** Streaming STT adapter for Deepgram's `/v1/listen` WebSocket API. */
|
|
4944
|
+
declare class DeepgramSTT {
|
|
4945
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
4946
|
+
static readonly providerKey = "deepgram";
|
|
4947
|
+
private ws;
|
|
4948
|
+
private readonly transcriptCallbacks;
|
|
4949
|
+
private readonly errorCallbacks;
|
|
4950
|
+
private keepaliveTimer;
|
|
4951
|
+
private running;
|
|
4952
|
+
private reconnectAttempted;
|
|
4953
|
+
/** Request ID from Deepgram — used to query actual cost post-call. */
|
|
4954
|
+
requestId: string;
|
|
4955
|
+
private readonly apiKey;
|
|
4956
|
+
private readonly language;
|
|
4957
|
+
private readonly model;
|
|
4958
|
+
private readonly encoding;
|
|
4959
|
+
private readonly sampleRate;
|
|
4960
|
+
private readonly endpointingMs;
|
|
4961
|
+
private readonly utteranceEndMs;
|
|
4962
|
+
private readonly smartFormat;
|
|
4963
|
+
private readonly interimResults;
|
|
4964
|
+
private readonly vadEvents;
|
|
4965
|
+
/**
|
|
4966
|
+
* New ergonomic constructor accepting an options object (mirrors Python kwargs).
|
|
4967
|
+
*
|
|
4968
|
+
* Also accepts the legacy positional form
|
|
4969
|
+
* ``(apiKey, language?, model?, encoding?, sampleRate?)`` for backward
|
|
4970
|
+
* compatibility with code that predated BUG #13.
|
|
4971
|
+
*/
|
|
4972
|
+
constructor(apiKey: string, language?: string, model?: string, encoding?: string, sampleRate?: number, options?: DeepgramSTTOptions$1);
|
|
4973
|
+
constructor(apiKey: string, options: DeepgramSTTOptions$1 & {
|
|
4974
|
+
language?: string;
|
|
4975
|
+
});
|
|
4976
|
+
/** Factory for Twilio calls — mulaw 8 kHz. Forwards tuning options through. */
|
|
4977
|
+
static forTwilio(apiKey: string, language?: string, model?: string, options?: DeepgramSTTOptions$1): DeepgramSTT;
|
|
4978
|
+
private buildUrl;
|
|
4979
|
+
/**
|
|
4980
|
+
* Pre-call WebSocket warmup for the Deepgram `/v1/listen` endpoint.
|
|
4981
|
+
*
|
|
4982
|
+
* Opens the WS (full DNS + TLS + auth handshake), idles ~250 ms so the
|
|
4983
|
+
* provider edge keeps the session warm in its routing table, then
|
|
4984
|
+
* closes cleanly. By the time `connect()` is invoked at call-pickup
|
|
4985
|
+
* the DNS resolver is hot, the TCP+TLS session is in the connection
|
|
4986
|
+
* pool, and recent WS auth is still warm at Deepgram's edge — net
|
|
4987
|
+
* wire time saving of 200-500 ms vs a cold WS open.
|
|
4988
|
+
*
|
|
4989
|
+
* Billing safety: Deepgram bills on streamed audio seconds (per
|
|
4990
|
+
* https://deepgram.com/pricing). Opening + closing the WebSocket
|
|
4991
|
+
* without sending any audio frames does not consume billable seconds.
|
|
4992
|
+
* Best-effort: any failure is logged at debug level and never raised.
|
|
4993
|
+
*/
|
|
4994
|
+
warmup(): Promise<void>;
|
|
4995
|
+
/** Open the streaming WebSocket and arm message + keepalive handlers. */
|
|
4996
|
+
connect(): Promise<void>;
|
|
4997
|
+
private openSocket;
|
|
4998
|
+
private clearKeepalive;
|
|
4999
|
+
private handleMessage;
|
|
5000
|
+
private emitTranscript;
|
|
5001
|
+
private emitError;
|
|
5002
|
+
private handleError;
|
|
5003
|
+
private handleClose;
|
|
5004
|
+
/** Send a binary audio chunk to Deepgram for transcription. */
|
|
5005
|
+
sendAudio(audio: Buffer): void;
|
|
5006
|
+
private audioSentCount;
|
|
5007
|
+
private audioDroppedCount;
|
|
5008
|
+
/** Register a transcript listener. */
|
|
5009
|
+
onTranscript(callback: TranscriptCallback$3): void;
|
|
5010
|
+
/** Remove a previously registered transcript listener. */
|
|
5011
|
+
offTranscript(callback: TranscriptCallback$3): void;
|
|
5012
|
+
/** Register an error listener for socket / API failures. */
|
|
5013
|
+
onError(callback: ErrorCallback$1): void;
|
|
5014
|
+
/** Remove a previously registered error listener. */
|
|
5015
|
+
offError(callback: ErrorCallback$1): void;
|
|
5016
|
+
/**
|
|
5017
|
+
* Force Deepgram to immediately emit a final ``Results`` frame for the
|
|
5018
|
+
* in-flight utterance, rather than waiting for its own endpoint
|
|
5019
|
+
* heuristic (utterance_end_ms ~1 s + natural-pause endpointing).
|
|
5020
|
+
* Called by the SDK on VAD ``speech_end`` and after barge-in cancel —
|
|
5021
|
+
* both moments where the SDK already knows the user has stopped
|
|
5022
|
+
* speaking and waiting for Deepgram's own endpointing only adds
|
|
5023
|
+
* dead air.
|
|
5024
|
+
*
|
|
5025
|
+
* Idempotent: safe to call when the socket is closed/closing.
|
|
5026
|
+
*/
|
|
5027
|
+
finalize(): void;
|
|
5028
|
+
/** Send Finalize, briefly drain trailing transcripts, then close the socket. */
|
|
5029
|
+
close(): void;
|
|
5030
|
+
}
|
|
5031
|
+
|
|
5032
|
+
/**
|
|
5033
|
+
* Cartesia TTS provider — HTTP `/tts/bytes` endpoint.
|
|
5034
|
+
*
|
|
5035
|
+
* Cartesia also offers a WebSocket streaming mode with word timestamps;
|
|
5036
|
+
* this provider focuses on the chunked-bytes HTTP API which maps cleanly
|
|
5037
|
+
* onto Patter's `synthesize(text)` contract and keeps the provider
|
|
5038
|
+
* dependency-free (just `fetch`).
|
|
5039
|
+
*
|
|
5040
|
+
* Default model is `sonic-3` (GA snapshot `sonic-3-2026-01-12`) — Cartesia's
|
|
5041
|
+
* current GA model with a documented ~90 ms TTFB target. Voice IDs from the
|
|
5042
|
+
* sonic-2 generation (including the default Katie voice) remain compatible.
|
|
5043
|
+
*
|
|
5044
|
+
* **Telephony optimization** — the constructor default
|
|
5045
|
+
* `sampleRate=16000` is correct for web playback, dashboard previews, and
|
|
5046
|
+
* 16 kHz pipelines. For real phone calls, use the carrier-specific
|
|
5047
|
+
* factories instead:
|
|
5048
|
+
*
|
|
5049
|
+
* - {@link CartesiaTTS.forTwilio} requests `sampleRate=8000` natively from
|
|
5050
|
+
* Cartesia. Twilio's media-stream WebSocket expects μ-law @ 8 kHz, so
|
|
5051
|
+
* the SDK normally resamples 16 kHz → 8 kHz before doing the PCM →
|
|
5052
|
+
* μ-law transcode in `TwilioAudioSender`. Asking Cartesia for 8 kHz
|
|
5053
|
+
* PCM at the source skips the resample step (saves ~10–30 ms first-
|
|
5054
|
+
* byte plus per-frame CPU and removes a potential aliasing source).
|
|
5055
|
+
* The PCM → μ-law transcode still happens client-side.
|
|
5056
|
+
* - {@link CartesiaTTS.forTelnyx} requests `sampleRate=16000`. Telnyx
|
|
5057
|
+
* negotiates L16/16000 on its bidirectional media WebSocket, so
|
|
5058
|
+
* 16 kHz PCM is already the format used end-to-end and no
|
|
5059
|
+
* transcoding happens. This is the same as the bare-constructor
|
|
5060
|
+
* default and exists for API symmetry with the Twilio factory.
|
|
5061
|
+
*/
|
|
5062
|
+
/** Known Cartesia TTS models. */
|
|
5063
|
+
declare const CartesiaTTSModel: {
|
|
5064
|
+
readonly SONIC_3: "sonic-3";
|
|
5065
|
+
readonly SONIC_2: "sonic-2";
|
|
5066
|
+
readonly SONIC: "sonic";
|
|
5067
|
+
};
|
|
5068
|
+
type CartesiaTTSModel = (typeof CartesiaTTSModel)[keyof typeof CartesiaTTSModel];
|
|
5069
|
+
/** Common PCM sample rates accepted by the Cartesia bytes endpoint. */
|
|
5070
|
+
declare const CartesiaTTSSampleRate: {
|
|
5071
|
+
readonly HZ_8000: 8000;
|
|
5072
|
+
readonly HZ_16000: 16000;
|
|
5073
|
+
readonly HZ_22050: 22050;
|
|
5074
|
+
readonly HZ_24000: 24000;
|
|
5075
|
+
readonly HZ_44100: 44100;
|
|
5076
|
+
};
|
|
5077
|
+
type CartesiaTTSSampleRate = (typeof CartesiaTTSSampleRate)[keyof typeof CartesiaTTSSampleRate];
|
|
5078
|
+
/** Voice-selection mode passed in the Cartesia bytes payload. */
|
|
5079
|
+
declare const CartesiaTTSVoiceMode: {
|
|
5080
|
+
readonly ID: "id";
|
|
5081
|
+
readonly EMBEDDING: "embedding";
|
|
5082
|
+
};
|
|
5083
|
+
type CartesiaTTSVoiceMode = (typeof CartesiaTTSVoiceMode)[keyof typeof CartesiaTTSVoiceMode];
|
|
5084
|
+
/** Constructor options for {@link CartesiaTTS}. */
|
|
5085
|
+
interface CartesiaTTSOptions$1 {
|
|
5086
|
+
model?: CartesiaTTSModel | string;
|
|
5087
|
+
voice?: string;
|
|
5088
|
+
language?: string;
|
|
5089
|
+
sampleRate?: CartesiaTTSSampleRate | number;
|
|
5090
|
+
speed?: string | number;
|
|
5091
|
+
emotion?: string | string[];
|
|
5092
|
+
volume?: number;
|
|
5093
|
+
baseUrl?: string;
|
|
5094
|
+
apiVersion?: string;
|
|
5095
|
+
}
|
|
5096
|
+
/** Cartesia TTS provider backed by the HTTP `/tts/bytes` streaming endpoint. */
|
|
5097
|
+
declare class CartesiaTTS {
|
|
5098
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5099
|
+
static readonly providerKey = "cartesia_tts";
|
|
5100
|
+
private readonly apiKey;
|
|
5101
|
+
private readonly model;
|
|
5102
|
+
private readonly voice;
|
|
5103
|
+
private readonly language;
|
|
5104
|
+
private readonly sampleRate;
|
|
5105
|
+
private readonly speed?;
|
|
5106
|
+
private readonly emotion?;
|
|
5107
|
+
private readonly volume?;
|
|
5108
|
+
private readonly baseUrl;
|
|
5109
|
+
private readonly apiVersion;
|
|
5110
|
+
constructor(apiKey: string, opts?: CartesiaTTSOptions$1);
|
|
5111
|
+
/**
|
|
5112
|
+
* Construct an instance pre-configured for Twilio Media Streams.
|
|
5113
|
+
*
|
|
5114
|
+
* Sets `sampleRate=8000` so Cartesia emits PCM_S16LE @ 8 kHz directly.
|
|
5115
|
+
* Twilio's media stream uses μ-law @ 8 kHz so the SDK still does the
|
|
5116
|
+
* PCM → μ-law transcode client-side, but the 16 kHz → 8 kHz resample
|
|
5117
|
+
* step is skipped. Saves ~10–30 ms first-byte plus per-frame CPU and
|
|
5118
|
+
* removes a potential aliasing source.
|
|
5119
|
+
*/
|
|
5120
|
+
static forTwilio(apiKey: string, options?: Omit<CartesiaTTSOptions$1, 'sampleRate'>): CartesiaTTS;
|
|
5121
|
+
/**
|
|
5122
|
+
* Construct an instance pre-configured for Telnyx bidirectional media.
|
|
5123
|
+
*
|
|
5124
|
+
* Sets `sampleRate=16000` to match Telnyx's L16/16000 default codec —
|
|
5125
|
+
* audio flows end-to-end with zero resampling or transcoding. Same as
|
|
5126
|
+
* the bare-constructor default; exists for API symmetry with
|
|
5127
|
+
* {@link CartesiaTTS.forTwilio}.
|
|
5128
|
+
*/
|
|
5129
|
+
static forTelnyx(apiKey: string, options?: Omit<CartesiaTTSOptions$1, 'sampleRate'>): CartesiaTTS;
|
|
5130
|
+
/** Build the JSON payload for the Cartesia bytes endpoint. */
|
|
5131
|
+
private buildPayload;
|
|
5132
|
+
/**
|
|
5133
|
+
* Pre-call HTTP warmup for the Cartesia `/tts/bytes` endpoint.
|
|
5134
|
+
*
|
|
5135
|
+
* Issues a lightweight `GET <baseUrl>/voices` so DNS, TLS, and HTTP/2
|
|
5136
|
+
* are already up by the time the first `synthesizeStream()` POST
|
|
5137
|
+
* lands. Best-effort: 5 s timeout, all exceptions swallowed at
|
|
5138
|
+
* debug level.
|
|
5139
|
+
*
|
|
5140
|
+
* Billing safety: `GET /voices` is a free metadata read on
|
|
5141
|
+
* Cartesia's REST surface (per https://docs.cartesia.ai). It does
|
|
5142
|
+
* not consume synthesis credits. The actual synthesis is billed
|
|
5143
|
+
* only when `POST /tts/bytes` runs with a non-empty `transcript`.
|
|
5144
|
+
*
|
|
5145
|
+
* Note: Cartesia TTS uses the HTTP path (vs the WebSocket variant
|
|
5146
|
+
* Cartesia also exposes) — connection warmup is therefore HTTP-GET
|
|
5147
|
+
* based, not WebSocket pre-handshake. The latency win is smaller
|
|
5148
|
+
* (~50-150 ms vs the ~200-500 ms of a WS prewarm) but still real.
|
|
3703
5149
|
*/
|
|
3704
|
-
|
|
5150
|
+
warmup(): Promise<void>;
|
|
5151
|
+
/** Synthesize text and return the concatenated audio buffer. */
|
|
5152
|
+
synthesize(text: string): Promise<Buffer>;
|
|
3705
5153
|
/**
|
|
3706
|
-
*
|
|
3707
|
-
*
|
|
3708
|
-
* and is rarely useful for telephony pipelines that pass the text straight
|
|
3709
|
-
* to an LLM. Set to ``true`` for use cases (dashboards, raw transcripts)
|
|
3710
|
-
* where the formatted text is surfaced directly to humans.
|
|
5154
|
+
* Synthesize text and yield raw PCM_S16LE chunks at the configured
|
|
5155
|
+
* `sampleRate` as they arrive from Cartesia.
|
|
3711
5156
|
*/
|
|
3712
|
-
|
|
3713
|
-
/** Emit interim (non-final) transcripts. Default ``true``. */
|
|
3714
|
-
readonly interimResults?: boolean;
|
|
3715
|
-
/** Emit VAD events (``SpeechStarted`` / ``UtteranceEnd``). Default ``true``. */
|
|
3716
|
-
readonly vadEvents?: boolean;
|
|
5157
|
+
synthesizeStream(text: string): AsyncGenerator<Buffer>;
|
|
3717
5158
|
}
|
|
3718
|
-
|
|
3719
|
-
|
|
3720
|
-
|
|
3721
|
-
|
|
3722
|
-
|
|
3723
|
-
|
|
3724
|
-
|
|
3725
|
-
|
|
3726
|
-
|
|
3727
|
-
|
|
5159
|
+
|
|
5160
|
+
/**
|
|
5161
|
+
* Rime TTS provider — HTTP chunked endpoint.
|
|
5162
|
+
*
|
|
5163
|
+
* Supports both Arcana and Mist model families. The Arcana model can take
|
|
5164
|
+
* up to ~80% of the output audio's duration to synthesize, so its request
|
|
5165
|
+
* timeout is bumped to 4 minutes.
|
|
5166
|
+
*/
|
|
5167
|
+
/** Rime TTS model families. */
|
|
5168
|
+
declare const RimeModel: {
|
|
5169
|
+
readonly ARCANA: "arcana";
|
|
5170
|
+
readonly MIST: "mist";
|
|
5171
|
+
readonly MIST_V2: "mistv2";
|
|
5172
|
+
};
|
|
5173
|
+
type RimeModel = (typeof RimeModel)[keyof typeof RimeModel];
|
|
5174
|
+
/** Supported response Content-Type accept headers for Rime TTS. */
|
|
5175
|
+
declare const RimeAudioFormat: {
|
|
5176
|
+
readonly PCM: "audio/pcm";
|
|
5177
|
+
readonly MP3: "audio/mp3";
|
|
5178
|
+
readonly WAV: "audio/wav";
|
|
5179
|
+
readonly MULAW: "audio/mulaw";
|
|
5180
|
+
};
|
|
5181
|
+
type RimeAudioFormat = (typeof RimeAudioFormat)[keyof typeof RimeAudioFormat];
|
|
5182
|
+
/** Constructor options for {@link RimeTTS}. */
|
|
5183
|
+
interface RimeTTSOptions$1 {
|
|
5184
|
+
model?: string;
|
|
5185
|
+
speaker?: string;
|
|
5186
|
+
lang?: string;
|
|
5187
|
+
sampleRate?: number;
|
|
5188
|
+
repetitionPenalty?: number;
|
|
5189
|
+
temperature?: number;
|
|
5190
|
+
topP?: number;
|
|
5191
|
+
maxTokens?: number;
|
|
5192
|
+
speedAlpha?: number;
|
|
5193
|
+
reduceLatency?: boolean;
|
|
5194
|
+
pauseBetweenBrackets?: boolean;
|
|
5195
|
+
phonemizeBetweenBrackets?: boolean;
|
|
5196
|
+
baseUrl?: string;
|
|
5197
|
+
}
|
|
5198
|
+
/** Rime TTS adapter for the `users.rime.ai/v1/rime-tts` HTTP streaming endpoint. */
|
|
5199
|
+
declare class RimeTTS {
|
|
5200
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5201
|
+
static readonly providerKey = "rime";
|
|
3728
5202
|
private readonly apiKey;
|
|
3729
|
-
private readonly language;
|
|
3730
5203
|
private readonly model;
|
|
3731
|
-
private readonly
|
|
5204
|
+
private readonly speaker;
|
|
5205
|
+
private readonly lang;
|
|
3732
5206
|
private readonly sampleRate;
|
|
3733
|
-
private readonly
|
|
3734
|
-
private readonly
|
|
3735
|
-
private readonly
|
|
3736
|
-
private readonly
|
|
3737
|
-
private readonly
|
|
3738
|
-
|
|
3739
|
-
|
|
3740
|
-
|
|
3741
|
-
|
|
3742
|
-
|
|
3743
|
-
|
|
3744
|
-
|
|
3745
|
-
|
|
3746
|
-
|
|
3747
|
-
language?: string;
|
|
3748
|
-
});
|
|
3749
|
-
/** Factory for Twilio calls — mulaw 8 kHz. Forwards tuning options through. */
|
|
3750
|
-
static forTwilio(apiKey: string, language?: string, model?: string, options?: DeepgramSTTOptions$1): DeepgramSTT;
|
|
3751
|
-
private buildUrl;
|
|
3752
|
-
/** Open the streaming WebSocket and arm message + keepalive handlers. */
|
|
3753
|
-
connect(): Promise<void>;
|
|
3754
|
-
private openSocket;
|
|
3755
|
-
private clearKeepalive;
|
|
3756
|
-
private handleMessage;
|
|
3757
|
-
private emitTranscript;
|
|
3758
|
-
private emitError;
|
|
3759
|
-
private handleError;
|
|
3760
|
-
private handleClose;
|
|
3761
|
-
/** Send a binary audio chunk to Deepgram for transcription. */
|
|
3762
|
-
sendAudio(audio: Buffer): void;
|
|
3763
|
-
private audioSentCount;
|
|
3764
|
-
private audioDroppedCount;
|
|
3765
|
-
/** Register a transcript listener. */
|
|
3766
|
-
onTranscript(callback: TranscriptCallback$3): void;
|
|
3767
|
-
/** Remove a previously registered transcript listener. */
|
|
3768
|
-
offTranscript(callback: TranscriptCallback$3): void;
|
|
3769
|
-
/** Register an error listener for socket / API failures. */
|
|
3770
|
-
onError(callback: ErrorCallback$1): void;
|
|
3771
|
-
/** Remove a previously registered error listener. */
|
|
3772
|
-
offError(callback: ErrorCallback$1): void;
|
|
5207
|
+
private readonly repetitionPenalty?;
|
|
5208
|
+
private readonly temperature?;
|
|
5209
|
+
private readonly topP?;
|
|
5210
|
+
private readonly maxTokens?;
|
|
5211
|
+
private readonly speedAlpha?;
|
|
5212
|
+
private readonly reduceLatency?;
|
|
5213
|
+
private readonly pauseBetweenBrackets?;
|
|
5214
|
+
private readonly phonemizeBetweenBrackets?;
|
|
5215
|
+
private readonly baseUrl;
|
|
5216
|
+
private readonly totalTimeoutMs;
|
|
5217
|
+
constructor(apiKey: string, opts?: RimeTTSOptions$1);
|
|
5218
|
+
private buildPayload;
|
|
5219
|
+
/** Synthesize text and return the concatenated audio buffer. */
|
|
5220
|
+
synthesize(text: string): Promise<Buffer>;
|
|
3773
5221
|
/**
|
|
3774
|
-
*
|
|
3775
|
-
*
|
|
3776
|
-
* heuristic (utterance_end_ms ~1 s + natural-pause endpointing).
|
|
3777
|
-
* Called by the SDK on VAD ``speech_end`` and after barge-in cancel —
|
|
3778
|
-
* both moments where the SDK already knows the user has stopped
|
|
3779
|
-
* speaking and waiting for Deepgram's own endpointing only adds
|
|
3780
|
-
* dead air.
|
|
3781
|
-
*
|
|
3782
|
-
* Idempotent: safe to call when the socket is closed/closing.
|
|
5222
|
+
* Synthesize text and yield raw PCM_S16LE chunks at the configured
|
|
5223
|
+
* `sampleRate` as they stream in.
|
|
3783
5224
|
*/
|
|
3784
|
-
|
|
3785
|
-
/** Send Finalize, briefly drain trailing transcripts, then close the socket. */
|
|
3786
|
-
close(): void;
|
|
5225
|
+
synthesizeStream(text: string): AsyncGenerator<Buffer>;
|
|
3787
5226
|
}
|
|
3788
5227
|
|
|
3789
5228
|
/** Deepgram streaming STT for Patter pipeline mode. */
|
|
@@ -3825,6 +5264,8 @@ type TranscriptCallback$2 = (transcript: Transcript$2) => void;
|
|
|
3825
5264
|
type WhisperResponseFormat = 'json' | 'verbose_json';
|
|
3826
5265
|
/** Buffered STT adapter for OpenAI's Whisper transcription HTTP API. */
|
|
3827
5266
|
declare class WhisperSTT {
|
|
5267
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5268
|
+
static readonly providerKey: string;
|
|
3828
5269
|
private readonly apiKey;
|
|
3829
5270
|
private readonly model;
|
|
3830
5271
|
private readonly language;
|
|
@@ -3913,6 +5354,8 @@ declare class STT$5 extends WhisperSTT {
|
|
|
3913
5354
|
|
|
3914
5355
|
/** STT adapter restricted to OpenAI's GPT-4o Transcribe model family. */
|
|
3915
5356
|
declare class OpenAITranscribeSTT extends WhisperSTT {
|
|
5357
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5358
|
+
static readonly providerKey: string;
|
|
3916
5359
|
/**
|
|
3917
5360
|
* @param apiKey OpenAI API key.
|
|
3918
5361
|
* @param language ISO-639-1 language code (e.g. ``"en"``, ``"it"``). Optional.
|
|
@@ -4172,206 +5615,73 @@ interface SpeechmaticsSTTOptions$1 {
|
|
|
4172
5615
|
* ```
|
|
4173
5616
|
*/
|
|
4174
5617
|
declare class SpeechmaticsSTT {
|
|
5618
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5619
|
+
static readonly providerKey = "speechmatics";
|
|
4175
5620
|
private ws;
|
|
4176
5621
|
private readonly transcriptCallbacks;
|
|
4177
|
-
private readonly errorCallbacks;
|
|
4178
|
-
private running;
|
|
4179
|
-
/** Sequence number of the last audio chunk acknowledged via `AudioAdded`. */
|
|
4180
|
-
private lastSeqNo;
|
|
4181
|
-
private readonly apiKey;
|
|
4182
|
-
private readonly baseUrl;
|
|
4183
|
-
private readonly language;
|
|
4184
|
-
private readonly turnDetectionMode;
|
|
4185
|
-
private readonly sampleRate;
|
|
4186
|
-
private readonly enableDiarization;
|
|
4187
|
-
private readonly maxDelay;
|
|
4188
|
-
private readonly endOfUtteranceSilenceTrigger;
|
|
4189
|
-
private readonly endOfUtteranceMaxDelay;
|
|
4190
|
-
private readonly includePartials;
|
|
4191
|
-
private readonly additionalVocab;
|
|
4192
|
-
private readonly operatingPoint;
|
|
4193
|
-
private readonly domain;
|
|
4194
|
-
private readonly outputLocale;
|
|
4195
|
-
constructor(apiKey: string, options?: SpeechmaticsSTTOptions$1);
|
|
4196
|
-
/** Build the JSON `StartRecognition` payload sent on connect. */
|
|
4197
|
-
private buildStartRecognition;
|
|
4198
|
-
/** Open the streaming WebSocket and send the `StartRecognition` frame. */
|
|
4199
|
-
connect(): Promise<void>;
|
|
4200
|
-
/** Send a binary PCM16-LE audio chunk to Speechmatics for transcription. */
|
|
4201
|
-
sendAudio(audio: Buffer): void;
|
|
4202
|
-
/** Register a transcript listener. */
|
|
4203
|
-
onTranscript(callback: TranscriptCallback$1): void;
|
|
4204
|
-
/** Remove a previously registered transcript listener. */
|
|
4205
|
-
offTranscript(callback: TranscriptCallback$1): void;
|
|
4206
|
-
/** Register an error listener for socket / API failures. */
|
|
4207
|
-
onError(callback: ErrorCallback): void;
|
|
4208
|
-
/** Remove a previously registered error listener. */
|
|
4209
|
-
offError(callback: ErrorCallback): void;
|
|
4210
|
-
private handleMessage;
|
|
4211
|
-
/** Translate a Speechmatics transcript message into a Patter `Transcript`. */
|
|
4212
|
-
private toTranscript;
|
|
4213
|
-
private emitTranscript;
|
|
4214
|
-
private emitError;
|
|
4215
|
-
private handleError;
|
|
4216
|
-
private handleClose;
|
|
4217
|
-
/** Send `EndOfStream` and close the WebSocket. Idempotent. */
|
|
4218
|
-
close(): void;
|
|
4219
|
-
}
|
|
4220
|
-
|
|
4221
|
-
/** Speechmatics streaming STT for Patter pipeline mode. */
|
|
4222
|
-
|
|
4223
|
-
type SpeechmaticsSTTOptions = SpeechmaticsSTTOptions$1 & {
|
|
4224
|
-
/** API key. Falls back to SPEECHMATICS_API_KEY env var when omitted. */
|
|
4225
|
-
apiKey?: string;
|
|
4226
|
-
};
|
|
4227
|
-
/**
|
|
4228
|
-
* Speechmatics streaming STT.
|
|
4229
|
-
*
|
|
4230
|
-
* @example
|
|
4231
|
-
* ```ts
|
|
4232
|
-
* import * as speechmatics from "getpatter/stt/speechmatics";
|
|
4233
|
-
* const stt = new speechmatics.STT(); // reads SPEECHMATICS_API_KEY
|
|
4234
|
-
* const stt = new speechmatics.STT({ apiKey: "sm_...", language: "en" });
|
|
4235
|
-
* ```
|
|
4236
|
-
*/
|
|
4237
|
-
declare class STT extends SpeechmaticsSTT {
|
|
4238
|
-
static readonly providerKey = "speechmatics";
|
|
4239
|
-
constructor(opts?: SpeechmaticsSTTOptions);
|
|
4240
|
-
}
|
|
4241
|
-
|
|
4242
|
-
/**
|
|
4243
|
-
* Known stable ElevenLabs voice models (from the official ElevenLabs API
|
|
4244
|
-
* reference). Exposed as a typed `as const` object so callers can pass
|
|
4245
|
-
* `ElevenLabsModel.FLASH_V2_5` and get autocomplete / static checking; the
|
|
4246
|
-
* public `modelId` option also accepts an arbitrary `string` so users can
|
|
4247
|
-
* pass forward-compat IDs we haven't enumerated yet.
|
|
4248
|
-
*
|
|
4249
|
-
* - `V3` — newest, highest quality (slower TTFT than Flash).
|
|
4250
|
-
* - `FLASH_V2_5` — current default, fastest (~75 ms TTFT).
|
|
4251
|
-
* - `TURBO_V2_5` — balanced quality/speed.
|
|
4252
|
-
* - `MULTILINGUAL_V2` — best multilingual support.
|
|
4253
|
-
* - `MONOLINGUAL_V1` — legacy English-only.
|
|
4254
|
-
*/
|
|
4255
|
-
declare const ElevenLabsModel: {
|
|
4256
|
-
readonly V3: "eleven_v3";
|
|
4257
|
-
readonly FLASH_V2_5: "eleven_flash_v2_5";
|
|
4258
|
-
readonly TURBO_V2_5: "eleven_turbo_v2_5";
|
|
4259
|
-
readonly MULTILINGUAL_V2: "eleven_multilingual_v2";
|
|
4260
|
-
readonly MONOLINGUAL_V1: "eleven_monolingual_v1";
|
|
4261
|
-
};
|
|
4262
|
-
/** Union of {@link ElevenLabsModel} string values. */
|
|
4263
|
-
type ElevenLabsModel = (typeof ElevenLabsModel)[keyof typeof ElevenLabsModel];
|
|
4264
|
-
declare const ElevenLabsOutputFormat: {
|
|
4265
|
-
readonly MP3_22050_32: "mp3_22050_32";
|
|
4266
|
-
readonly MP3_44100_32: "mp3_44100_32";
|
|
4267
|
-
readonly MP3_44100_64: "mp3_44100_64";
|
|
4268
|
-
readonly MP3_44100_96: "mp3_44100_96";
|
|
4269
|
-
readonly MP3_44100_128: "mp3_44100_128";
|
|
4270
|
-
readonly MP3_44100_192: "mp3_44100_192";
|
|
4271
|
-
readonly PCM_8000: "pcm_8000";
|
|
4272
|
-
readonly PCM_16000: "pcm_16000";
|
|
4273
|
-
readonly PCM_22050: "pcm_22050";
|
|
4274
|
-
readonly PCM_24000: "pcm_24000";
|
|
4275
|
-
readonly PCM_44100: "pcm_44100";
|
|
4276
|
-
readonly ULAW_8000: "ulaw_8000";
|
|
4277
|
-
};
|
|
4278
|
-
/** Union of {@link ElevenLabsOutputFormat} string values. */
|
|
4279
|
-
type ElevenLabsOutputFormat = (typeof ElevenLabsOutputFormat)[keyof typeof ElevenLabsOutputFormat];
|
|
4280
|
-
/** ElevenLabs voice tuning knobs forwarded as `voice_settings` in the request. */
|
|
4281
|
-
interface ElevenLabsVoiceSettings {
|
|
4282
|
-
stability?: number;
|
|
4283
|
-
similarity_boost?: number;
|
|
4284
|
-
style?: number;
|
|
4285
|
-
use_speaker_boost?: boolean;
|
|
4286
|
-
}
|
|
4287
|
-
/** Constructor options for {@link ElevenLabsTTS}. */
|
|
4288
|
-
interface ElevenLabsTTSOptions$1 {
|
|
4289
|
-
voiceId?: string;
|
|
4290
|
-
/**
|
|
4291
|
-
* ElevenLabs voice model ID. The default ``eleven_flash_v2_5`` has the
|
|
4292
|
-
* lowest TTFT (~75 ms). Pass ``eleven_v3`` for highest quality, or any
|
|
4293
|
-
* arbitrary string for forward-compat with future models.
|
|
4294
|
-
*/
|
|
4295
|
-
modelId?: ElevenLabsModel | string;
|
|
4296
|
-
outputFormat?: ElevenLabsOutputFormat;
|
|
4297
|
-
voiceSettings?: ElevenLabsVoiceSettings;
|
|
4298
|
-
languageCode?: string;
|
|
4299
|
-
chunkSize?: number;
|
|
5622
|
+
private readonly errorCallbacks;
|
|
5623
|
+
private running;
|
|
5624
|
+
/** Sequence number of the last audio chunk acknowledged via `AudioAdded`. */
|
|
5625
|
+
private lastSeqNo;
|
|
5626
|
+
private readonly apiKey;
|
|
5627
|
+
private readonly baseUrl;
|
|
5628
|
+
private readonly language;
|
|
5629
|
+
private readonly turnDetectionMode;
|
|
5630
|
+
private readonly sampleRate;
|
|
5631
|
+
private readonly enableDiarization;
|
|
5632
|
+
private readonly maxDelay;
|
|
5633
|
+
private readonly endOfUtteranceSilenceTrigger;
|
|
5634
|
+
private readonly endOfUtteranceMaxDelay;
|
|
5635
|
+
private readonly includePartials;
|
|
5636
|
+
private readonly additionalVocab;
|
|
5637
|
+
private readonly operatingPoint;
|
|
5638
|
+
private readonly domain;
|
|
5639
|
+
private readonly outputLocale;
|
|
5640
|
+
constructor(apiKey: string, options?: SpeechmaticsSTTOptions$1);
|
|
5641
|
+
/** Build the JSON `StartRecognition` payload sent on connect. */
|
|
5642
|
+
private buildStartRecognition;
|
|
5643
|
+
/** Open the streaming WebSocket and send the `StartRecognition` frame. */
|
|
5644
|
+
connect(): Promise<void>;
|
|
5645
|
+
/** Send a binary PCM16-LE audio chunk to Speechmatics for transcription. */
|
|
5646
|
+
sendAudio(audio: Buffer): void;
|
|
5647
|
+
/** Register a transcript listener. */
|
|
5648
|
+
onTranscript(callback: TranscriptCallback$1): void;
|
|
5649
|
+
/** Remove a previously registered transcript listener. */
|
|
5650
|
+
offTranscript(callback: TranscriptCallback$1): void;
|
|
5651
|
+
/** Register an error listener for socket / API failures. */
|
|
5652
|
+
onError(callback: ErrorCallback): void;
|
|
5653
|
+
/** Remove a previously registered error listener. */
|
|
5654
|
+
offError(callback: ErrorCallback): void;
|
|
5655
|
+
private handleMessage;
|
|
5656
|
+
/** Translate a Speechmatics transcript message into a Patter `Transcript`. */
|
|
5657
|
+
private toTranscript;
|
|
5658
|
+
private emitTranscript;
|
|
5659
|
+
private emitError;
|
|
5660
|
+
private handleError;
|
|
5661
|
+
private handleClose;
|
|
5662
|
+
/** Send `EndOfStream` and close the WebSocket. Idempotent. */
|
|
5663
|
+
close(): void;
|
|
4300
5664
|
}
|
|
5665
|
+
|
|
5666
|
+
/** Speechmatics streaming STT for Patter pipeline mode. */
|
|
5667
|
+
|
|
5668
|
+
type SpeechmaticsSTTOptions = SpeechmaticsSTTOptions$1 & {
|
|
5669
|
+
/** API key. Falls back to SPEECHMATICS_API_KEY env var when omitted. */
|
|
5670
|
+
apiKey?: string;
|
|
5671
|
+
};
|
|
4301
5672
|
/**
|
|
4302
|
-
*
|
|
4303
|
-
*
|
|
4304
|
-
* Supported `modelId` values are autocompleted via {@link ElevenLabsModel}.
|
|
4305
|
-
* Default is `eleven_flash_v2_5` (lowest TTFT, ~75 ms).
|
|
4306
|
-
*
|
|
4307
|
-
* **Telephony optimization** — the constructor default
|
|
4308
|
-
* `outputFormat='pcm_16000'` is correct for web playback, dashboard
|
|
4309
|
-
* previews, and 16 kHz pipelines. For real phone calls, use the
|
|
4310
|
-
* carrier-specific factories instead:
|
|
5673
|
+
* Speechmatics streaming STT.
|
|
4311
5674
|
*
|
|
4312
|
-
*
|
|
4313
|
-
*
|
|
4314
|
-
*
|
|
4315
|
-
*
|
|
4316
|
-
*
|
|
4317
|
-
*
|
|
4318
|
-
* - {@link ElevenLabsTTS.forTelnyx} emits `pcm_16000`. Telnyx negotiates
|
|
4319
|
-
* L16/16000 on its bidirectional media WebSocket, so 16 kHz PCM is
|
|
4320
|
-
* already the format used end-to-end and no transcoding happens.
|
|
4321
|
-
* ElevenLabs *also* supports `ulaw_8000` if your Telnyx profile is
|
|
4322
|
-
* pinned to PCMU/8000 — pass `outputFormat: 'ulaw_8000'` explicitly
|
|
4323
|
-
* in that case.
|
|
5675
|
+
* @example
|
|
5676
|
+
* ```ts
|
|
5677
|
+
* import * as speechmatics from "getpatter/stt/speechmatics";
|
|
5678
|
+
* const stt = new speechmatics.STT(); // reads SPEECHMATICS_API_KEY
|
|
5679
|
+
* const stt = new speechmatics.STT({ apiKey: "sm_...", language: "en" });
|
|
5680
|
+
* ```
|
|
4324
5681
|
*/
|
|
4325
|
-
declare class
|
|
4326
|
-
|
|
4327
|
-
|
|
4328
|
-
private readonly modelId;
|
|
4329
|
-
private readonly outputFormat;
|
|
4330
|
-
private readonly voiceSettings;
|
|
4331
|
-
private readonly languageCode;
|
|
4332
|
-
private readonly chunkSize;
|
|
4333
|
-
constructor(apiKey: string, voiceId?: string, modelId?: string, outputFormat?: ElevenLabsOutputFormat | string);
|
|
4334
|
-
constructor(apiKey: string, options: ElevenLabsTTSOptions$1);
|
|
4335
|
-
/**
|
|
4336
|
-
* Construct an instance pre-configured for Twilio Media Streams.
|
|
4337
|
-
*
|
|
4338
|
-
* Sets `outputFormat='ulaw_8000'` so ElevenLabs emits μ-law @ 8 kHz
|
|
4339
|
-
* directly — the exact wire format Twilio's media stream uses — letting
|
|
4340
|
-
* the SDK skip the 16 kHz→8 kHz resample and PCM→μ-law conversion in
|
|
4341
|
-
* `TwilioAudioSender`. Saves ~30–80 ms first-byte and per-frame CPU,
|
|
4342
|
-
* and removes a potential aliasing source.
|
|
4343
|
-
*
|
|
4344
|
-
* `voiceSettings` defaults to a low-bandwidth-friendly profile
|
|
4345
|
-
* (speaker boost off, modest stability) which sounds cleaner at 8 kHz
|
|
4346
|
-
* μ-law than the studio default. Pass an explicit object to override.
|
|
4347
|
-
*/
|
|
4348
|
-
static forTwilio(apiKey: string, options?: Omit<ElevenLabsTTSOptions$1, 'outputFormat'>): ElevenLabsTTS;
|
|
4349
|
-
/**
|
|
4350
|
-
* Construct an instance pre-configured for Telnyx bidirectional media.
|
|
4351
|
-
*
|
|
4352
|
-
* Telnyx's default media-streaming codec is L16 PCM @ 16 kHz, which
|
|
4353
|
-
* matches our default Telnyx handler. We pick `pcm_16000` so the audio
|
|
4354
|
-
* flows end-to-end with zero resampling or transcoding.
|
|
4355
|
-
*
|
|
4356
|
-
* Trade-off: if your Telnyx profile is pinned to PCMU/8000 (μ-law),
|
|
4357
|
-
* construct `ElevenLabsTTS` directly with `outputFormat: 'ulaw_8000'`
|
|
4358
|
-
* — Telnyx supports that natively too.
|
|
4359
|
-
*/
|
|
4360
|
-
static forTelnyx(apiKey: string, options?: Omit<ElevenLabsTTSOptions$1, 'outputFormat'>): ElevenLabsTTS;
|
|
4361
|
-
/**
|
|
4362
|
-
* Synthesise text to speech and return the full audio as a single Buffer.
|
|
4363
|
-
*
|
|
4364
|
-
* For large chunks (or when latency matters) call `synthesizeStream` instead.
|
|
4365
|
-
*/
|
|
4366
|
-
synthesize(text: string): Promise<Buffer>;
|
|
4367
|
-
/**
|
|
4368
|
-
* Synthesise text and yield audio chunks as they arrive (streaming).
|
|
4369
|
-
*
|
|
4370
|
-
* The yielded buffers are raw PCM at 16 kHz (or whatever `outputFormat` is
|
|
4371
|
-
* configured to). `chunkSize` controls the maximum yield size — 512 is a
|
|
4372
|
-
* good choice for low-latency telephony.
|
|
4373
|
-
*/
|
|
4374
|
-
synthesizeStream(text: string): AsyncGenerator<Buffer>;
|
|
5682
|
+
declare class STT extends SpeechmaticsSTT {
|
|
5683
|
+
static readonly providerKey = "speechmatics";
|
|
5684
|
+
constructor(opts?: SpeechmaticsSTTOptions);
|
|
4375
5685
|
}
|
|
4376
5686
|
|
|
4377
5687
|
/** ElevenLabs TTS for Patter pipeline mode. */
|
|
@@ -4404,133 +5714,24 @@ type ElevenLabsCarrierOptions = Omit<ElevenLabsTTSOptions, "outputFormat">;
|
|
|
4404
5714
|
* @example
|
|
4405
5715
|
* ```ts
|
|
4406
5716
|
* import * as elevenlabs from "getpatter/tts/elevenlabs";
|
|
4407
|
-
* const tts = new elevenlabs.TTS(); // reads ELEVENLABS_API_KEY
|
|
4408
|
-
* const tts = new elevenlabs.TTS({ apiKey: "...", voiceId: "rachel" });
|
|
4409
|
-
* ```
|
|
4410
|
-
*
|
|
4411
|
-
* **Telephony optimization** — use {@link TTS.forTwilio} (μ-law @ 8 kHz,
|
|
4412
|
-
* native Twilio Media Streams format) or {@link TTS.forTelnyx} (PCM @
|
|
4413
|
-
* 16 kHz, native Telnyx default) on phone calls to skip the SDK-side
|
|
4414
|
-
* resampling / transcoding step.
|
|
4415
|
-
*/
|
|
4416
|
-
declare class TTS$6 extends ElevenLabsTTS {
|
|
4417
|
-
static readonly providerKey = "elevenlabs";
|
|
4418
|
-
constructor(opts?: ElevenLabsTTSOptions);
|
|
4419
|
-
/** Pipeline TTS pre-configured for Twilio Media Streams (`ulaw_8000`). */
|
|
4420
|
-
static forTwilio(opts?: ElevenLabsCarrierOptions): TTS$6;
|
|
4421
|
-
static forTwilio(apiKey: string, options?: Omit<ElevenLabsTTSOptions, "outputFormat">): TTS$6;
|
|
4422
|
-
/** Pipeline TTS pre-configured for Telnyx (`pcm_16000`). */
|
|
4423
|
-
static forTelnyx(opts?: ElevenLabsCarrierOptions): TTS$6;
|
|
4424
|
-
static forTelnyx(apiKey: string, options?: Omit<ElevenLabsTTSOptions, "outputFormat">): TTS$6;
|
|
4425
|
-
}
|
|
4426
|
-
|
|
4427
|
-
/**
|
|
4428
|
-
* WebSocket-based ElevenLabs TTS provider — opt-in low-latency variant.
|
|
4429
|
-
*
|
|
4430
|
-
* Targets the ElevenLabs streaming-input WebSocket endpoint
|
|
4431
|
-
* (`/v1/text-to-speech/{voice_id}/stream-input`) instead of the HTTP
|
|
4432
|
-
* `/stream` endpoint used by `ElevenLabsTTS`. Saves the HTTP request setup
|
|
4433
|
-
* time per utterance (~50 ms) and avoids the HTTP cold-start TLS handshake
|
|
4434
|
-
* when calls are bursty.
|
|
4435
|
-
*
|
|
4436
|
-
* API matches `ElevenLabsTTS` (`synthesizeStream(text)` returns an
|
|
4437
|
-
* `AsyncGenerator<Buffer>`) so it can be passed anywhere a TTSAdapter is
|
|
4438
|
-
* expected.
|
|
4439
|
-
*
|
|
4440
|
-
* Behaviour notes
|
|
4441
|
-
* - WebSocket is opened **per-utterance** (matches HTTP semantics). A
|
|
4442
|
-
* future revision may pool a WS across utterances of the same call
|
|
4443
|
-
* session — see roadmap Phase 5b.
|
|
4444
|
-
* - `auto_mode=true` is enabled by default. Pass `autoMode: false` to
|
|
4445
|
-
* send a custom `chunk_length_schedule`.
|
|
4446
|
-
* - `outputFormat` is exposed as a query parameter so `ulaw_8000` (Twilio
|
|
4447
|
-
* native) and `pcm_16000` (Telnyx native) work without resampling.
|
|
4448
|
-
* - `eleven_v3` is **not** supported — the WS endpoint rejects it.
|
|
4449
|
-
* - `optimize_streaming_latency` is officially deprecated and is not
|
|
4450
|
-
* exposed.
|
|
4451
|
-
*/
|
|
4452
|
-
|
|
4453
|
-
/** Constructor options for {@link ElevenLabsWebSocketTTS}. */
|
|
4454
|
-
interface ElevenLabsWebSocketTTSOptions {
|
|
4455
|
-
apiKey: string;
|
|
4456
|
-
voiceId?: string;
|
|
4457
|
-
modelId?: ElevenLabsModel | string;
|
|
4458
|
-
outputFormat?: string;
|
|
4459
|
-
voiceSettings?: Record<string, unknown>;
|
|
4460
|
-
languageCode?: string;
|
|
4461
|
-
/** Let the server pick chunk timing. Default true. */
|
|
4462
|
-
autoMode?: boolean;
|
|
4463
|
-
/** WS keep-alive timeout in seconds (5–180). Default 60. */
|
|
4464
|
-
inactivityTimeout?: number;
|
|
4465
|
-
/**
|
|
4466
|
-
* Manual chunk schedule, only used when ``autoMode: false``. Each value
|
|
4467
|
-
* must be 5–500. ElevenLabs default is ``[120, 160, 250, 290]``.
|
|
4468
|
-
*/
|
|
4469
|
-
chunkLengthSchedule?: number[];
|
|
4470
|
-
/** Outgoing audio re-chunk size in bytes. Default 4096. */
|
|
4471
|
-
chunkSize?: number;
|
|
4472
|
-
}
|
|
4473
|
-
/** WebSocket-based ElevenLabs TTS adapter — opt-in low-latency variant. */
|
|
4474
|
-
declare class ElevenLabsWebSocketTTS implements TTSAdapter {
|
|
4475
|
-
static readonly providerKey = "elevenlabs_ws";
|
|
4476
|
-
readonly apiKey: string;
|
|
4477
|
-
readonly voiceId: string;
|
|
4478
|
-
readonly modelId: string;
|
|
4479
|
-
readonly voiceSettings?: Record<string, unknown>;
|
|
4480
|
-
readonly languageCode?: string;
|
|
4481
|
-
readonly autoMode: boolean;
|
|
4482
|
-
readonly inactivityTimeout: number;
|
|
4483
|
-
readonly chunkLengthSchedule?: number[];
|
|
4484
|
-
readonly chunkSize: number;
|
|
4485
|
-
/**
|
|
4486
|
-
* The wire format requested over the ElevenLabs WS. Initially set from
|
|
4487
|
-
* the constructor; ``setTelephonyCarrier`` may auto-flip it to the
|
|
4488
|
-
* carrier's native codec when the caller did NOT pass ``outputFormat``
|
|
4489
|
-
* explicitly.
|
|
4490
|
-
*/
|
|
4491
|
-
private _outputFormat;
|
|
4492
|
-
private readonly _outputFormatExplicit;
|
|
4493
|
-
/** Public read-only view of the (possibly auto-flipped) wire format. */
|
|
4494
|
-
get outputFormat(): string;
|
|
4495
|
-
constructor(opts: ElevenLabsWebSocketTTSOptions);
|
|
4496
|
-
/**
|
|
4497
|
-
* Hook called by ``StreamHandler`` to advise the carrier wire format.
|
|
4498
|
-
*
|
|
4499
|
-
* When the user did NOT pass an explicit ``outputFormat`` in the
|
|
4500
|
-
* constructor options, this flips the format to the carrier's native
|
|
4501
|
-
* wire codec — saving a client-side transcode step. Calling with an
|
|
4502
|
-
* unknown carrier (``""`` / ``"custom"``) is a no-op.
|
|
4503
|
-
*
|
|
4504
|
-
* When ``outputFormat`` was explicitly passed (incl. via the
|
|
4505
|
-
* ``forTwilio`` / ``forTelnyx`` factories), this method is a no-op —
|
|
4506
|
-
* the user's choice always wins.
|
|
4507
|
-
*/
|
|
4508
|
-
setTelephonyCarrier(carrier: string): void;
|
|
4509
|
-
/** Pre-configured for Twilio Media Streams (`ulaw_8000`). */
|
|
4510
|
-
static forTwilio(opts: Omit<ElevenLabsWebSocketTTSOptions, 'outputFormat'>): ElevenLabsWebSocketTTS;
|
|
4511
|
-
/** Pre-configured for Telnyx (`pcm_16000`). */
|
|
4512
|
-
static forTelnyx(opts: Omit<ElevenLabsWebSocketTTSOptions, 'outputFormat'>): ElevenLabsWebSocketTTS;
|
|
4513
|
-
private buildUrl;
|
|
4514
|
-
/**
|
|
4515
|
-
* Single-shot synthesis: open WS, send text, yield bytes, close.
|
|
4516
|
-
*
|
|
4517
|
-
* Resilience contract:
|
|
4518
|
-
* - Connection bounded by ``CONNECT_TIMEOUT_MS`` (5s, was 15s).
|
|
4519
|
-
* - Each idle wait bounded by ``FRAME_TIMEOUT_MS`` (30s) so a stalled
|
|
4520
|
-
* server cannot keep the generator alive indefinitely.
|
|
4521
|
-
* - Permanent error handler attached BEFORE the open await — prevents
|
|
4522
|
-
* ``uncaughtException`` if an error fires after the once-listener
|
|
4523
|
-
* resolves.
|
|
4524
|
-
* - All event listeners removed in ``finally`` (no closure leak past
|
|
4525
|
-
* socket close).
|
|
4526
|
-
* - Server-reported ``error`` raises ``ElevenLabsTTSError``.
|
|
4527
|
-
* - Per-frame audio payload capped at ``MAX_AUDIO_B64_BYTES``.
|
|
4528
|
-
* - Best-effort EOS ``{"text":""}`` sent in finally (not immediately
|
|
4529
|
-
* after flush — auto_mode could otherwise truncate the tail audio).
|
|
4530
|
-
*/
|
|
4531
|
-
synthesizeStream(text: string): AsyncGenerator<Buffer>;
|
|
4532
|
-
/** No-op — connections are per-utterance and torn down inside synthesizeStream. */
|
|
4533
|
-
close(): Promise<void>;
|
|
5717
|
+
* const tts = new elevenlabs.TTS(); // reads ELEVENLABS_API_KEY
|
|
5718
|
+
* const tts = new elevenlabs.TTS({ apiKey: "...", voiceId: "rachel" });
|
|
5719
|
+
* ```
|
|
5720
|
+
*
|
|
5721
|
+
* **Telephony optimization** — use {@link TTS.forTwilio} (μ-law @ 8 kHz,
|
|
5722
|
+
* native Twilio Media Streams format) or {@link TTS.forTelnyx} (PCM @
|
|
5723
|
+
* 16 kHz, native Telnyx default) on phone calls to skip the SDK-side
|
|
5724
|
+
* resampling / transcoding step.
|
|
5725
|
+
*/
|
|
5726
|
+
declare class TTS$6 extends ElevenLabsTTS {
|
|
5727
|
+
static readonly providerKey = "elevenlabs";
|
|
5728
|
+
constructor(opts?: ElevenLabsTTSOptions);
|
|
5729
|
+
/** Pipeline TTS pre-configured for Twilio Media Streams (`ulaw_8000`). */
|
|
5730
|
+
static forTwilio(opts?: ElevenLabsCarrierOptions): TTS$6;
|
|
5731
|
+
static forTwilio(apiKey: string, options?: Omit<ElevenLabsTTSOptions, "outputFormat">): TTS$6;
|
|
5732
|
+
/** Pipeline TTS pre-configured for Telnyx (`pcm_16000`). */
|
|
5733
|
+
static forTelnyx(opts?: ElevenLabsCarrierOptions): TTS$6;
|
|
5734
|
+
static forTelnyx(apiKey: string, options?: Omit<ElevenLabsTTSOptions, "outputFormat">): TTS$6;
|
|
4534
5735
|
}
|
|
4535
5736
|
|
|
4536
5737
|
/** ElevenLabs WebSocket TTS for Patter pipeline mode (opt-in low-latency). */
|
|
@@ -4595,6 +5796,8 @@ declare class OpenAITTS {
|
|
|
4595
5796
|
private readonly speed;
|
|
4596
5797
|
private readonly antiAlias;
|
|
4597
5798
|
private readonly targetSampleRate;
|
|
5799
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5800
|
+
static readonly providerKey = "openai_tts";
|
|
4598
5801
|
constructor(apiKey: string, voice?: string, model?: string, instructions?: string | null, speed?: number | null, antiAlias?: boolean, targetSampleRate?: number);
|
|
4599
5802
|
/**
|
|
4600
5803
|
* Synthesise text to speech and return the full audio as a single Buffer.
|
|
@@ -4676,107 +5879,6 @@ declare class TTS$4 extends OpenAITTS {
|
|
|
4676
5879
|
constructor(opts?: OpenAITTSOptions);
|
|
4677
5880
|
}
|
|
4678
5881
|
|
|
4679
|
-
/**
|
|
4680
|
-
* Cartesia TTS provider — HTTP `/tts/bytes` endpoint.
|
|
4681
|
-
*
|
|
4682
|
-
* Cartesia also offers a WebSocket streaming mode with word timestamps;
|
|
4683
|
-
* this provider focuses on the chunked-bytes HTTP API which maps cleanly
|
|
4684
|
-
* onto Patter's `synthesize(text)` contract and keeps the provider
|
|
4685
|
-
* dependency-free (just `fetch`).
|
|
4686
|
-
*
|
|
4687
|
-
* Default model is `sonic-3` (GA snapshot `sonic-3-2026-01-12`) — Cartesia's
|
|
4688
|
-
* current GA model with a documented ~90 ms TTFB target. Voice IDs from the
|
|
4689
|
-
* sonic-2 generation (including the default Katie voice) remain compatible.
|
|
4690
|
-
*
|
|
4691
|
-
* **Telephony optimization** — the constructor default
|
|
4692
|
-
* `sampleRate=16000` is correct for web playback, dashboard previews, and
|
|
4693
|
-
* 16 kHz pipelines. For real phone calls, use the carrier-specific
|
|
4694
|
-
* factories instead:
|
|
4695
|
-
*
|
|
4696
|
-
* - {@link CartesiaTTS.forTwilio} requests `sampleRate=8000` natively from
|
|
4697
|
-
* Cartesia. Twilio's media-stream WebSocket expects μ-law @ 8 kHz, so
|
|
4698
|
-
* the SDK normally resamples 16 kHz → 8 kHz before doing the PCM →
|
|
4699
|
-
* μ-law transcode in `TwilioAudioSender`. Asking Cartesia for 8 kHz
|
|
4700
|
-
* PCM at the source skips the resample step (saves ~10–30 ms first-
|
|
4701
|
-
* byte plus per-frame CPU and removes a potential aliasing source).
|
|
4702
|
-
* The PCM → μ-law transcode still happens client-side.
|
|
4703
|
-
* - {@link CartesiaTTS.forTelnyx} requests `sampleRate=16000`. Telnyx
|
|
4704
|
-
* negotiates L16/16000 on its bidirectional media WebSocket, so
|
|
4705
|
-
* 16 kHz PCM is already the format used end-to-end and no
|
|
4706
|
-
* transcoding happens. This is the same as the bare-constructor
|
|
4707
|
-
* default and exists for API symmetry with the Twilio factory.
|
|
4708
|
-
*/
|
|
4709
|
-
/** Known Cartesia TTS models. */
|
|
4710
|
-
declare const CartesiaTTSModel: {
|
|
4711
|
-
readonly SONIC_3: "sonic-3";
|
|
4712
|
-
readonly SONIC_2: "sonic-2";
|
|
4713
|
-
readonly SONIC: "sonic";
|
|
4714
|
-
};
|
|
4715
|
-
type CartesiaTTSModel = (typeof CartesiaTTSModel)[keyof typeof CartesiaTTSModel];
|
|
4716
|
-
/** Common PCM sample rates accepted by the Cartesia bytes endpoint. */
|
|
4717
|
-
declare const CartesiaTTSSampleRate: {
|
|
4718
|
-
readonly HZ_8000: 8000;
|
|
4719
|
-
readonly HZ_16000: 16000;
|
|
4720
|
-
readonly HZ_22050: 22050;
|
|
4721
|
-
readonly HZ_24000: 24000;
|
|
4722
|
-
readonly HZ_44100: 44100;
|
|
4723
|
-
};
|
|
4724
|
-
type CartesiaTTSSampleRate = (typeof CartesiaTTSSampleRate)[keyof typeof CartesiaTTSSampleRate];
|
|
4725
|
-
/** Constructor options for {@link CartesiaTTS}. */
|
|
4726
|
-
interface CartesiaTTSOptions$1 {
|
|
4727
|
-
model?: CartesiaTTSModel | string;
|
|
4728
|
-
voice?: string;
|
|
4729
|
-
language?: string;
|
|
4730
|
-
sampleRate?: CartesiaTTSSampleRate | number;
|
|
4731
|
-
speed?: string | number;
|
|
4732
|
-
emotion?: string | string[];
|
|
4733
|
-
volume?: number;
|
|
4734
|
-
baseUrl?: string;
|
|
4735
|
-
apiVersion?: string;
|
|
4736
|
-
}
|
|
4737
|
-
/** Cartesia TTS provider backed by the HTTP `/tts/bytes` streaming endpoint. */
|
|
4738
|
-
declare class CartesiaTTS {
|
|
4739
|
-
private readonly apiKey;
|
|
4740
|
-
private readonly model;
|
|
4741
|
-
private readonly voice;
|
|
4742
|
-
private readonly language;
|
|
4743
|
-
private readonly sampleRate;
|
|
4744
|
-
private readonly speed?;
|
|
4745
|
-
private readonly emotion?;
|
|
4746
|
-
private readonly volume?;
|
|
4747
|
-
private readonly baseUrl;
|
|
4748
|
-
private readonly apiVersion;
|
|
4749
|
-
constructor(apiKey: string, opts?: CartesiaTTSOptions$1);
|
|
4750
|
-
/**
|
|
4751
|
-
* Construct an instance pre-configured for Twilio Media Streams.
|
|
4752
|
-
*
|
|
4753
|
-
* Sets `sampleRate=8000` so Cartesia emits PCM_S16LE @ 8 kHz directly.
|
|
4754
|
-
* Twilio's media stream uses μ-law @ 8 kHz so the SDK still does the
|
|
4755
|
-
* PCM → μ-law transcode client-side, but the 16 kHz → 8 kHz resample
|
|
4756
|
-
* step is skipped. Saves ~10–30 ms first-byte plus per-frame CPU and
|
|
4757
|
-
* removes a potential aliasing source.
|
|
4758
|
-
*/
|
|
4759
|
-
static forTwilio(apiKey: string, options?: Omit<CartesiaTTSOptions$1, 'sampleRate'>): CartesiaTTS;
|
|
4760
|
-
/**
|
|
4761
|
-
* Construct an instance pre-configured for Telnyx bidirectional media.
|
|
4762
|
-
*
|
|
4763
|
-
* Sets `sampleRate=16000` to match Telnyx's L16/16000 default codec —
|
|
4764
|
-
* audio flows end-to-end with zero resampling or transcoding. Same as
|
|
4765
|
-
* the bare-constructor default; exists for API symmetry with
|
|
4766
|
-
* {@link CartesiaTTS.forTwilio}.
|
|
4767
|
-
*/
|
|
4768
|
-
static forTelnyx(apiKey: string, options?: Omit<CartesiaTTSOptions$1, 'sampleRate'>): CartesiaTTS;
|
|
4769
|
-
/** Build the JSON payload for the Cartesia bytes endpoint. */
|
|
4770
|
-
private buildPayload;
|
|
4771
|
-
/** Synthesize text and return the concatenated audio buffer. */
|
|
4772
|
-
synthesize(text: string): Promise<Buffer>;
|
|
4773
|
-
/**
|
|
4774
|
-
* Synthesize text and yield raw PCM_S16LE chunks at the configured
|
|
4775
|
-
* `sampleRate` as they arrive from Cartesia.
|
|
4776
|
-
*/
|
|
4777
|
-
synthesizeStream(text: string): AsyncGenerator<Buffer>;
|
|
4778
|
-
}
|
|
4779
|
-
|
|
4780
5882
|
/** Cartesia TTS for Patter pipeline mode. */
|
|
4781
5883
|
|
|
4782
5884
|
/** Constructor options for the Cartesia `TTS` adapter. */
|
|
@@ -4825,50 +5927,6 @@ declare class TTS$3 extends CartesiaTTS {
|
|
|
4825
5927
|
static forTelnyx(apiKey: string, options?: Omit<CartesiaTTSOptions, "sampleRate">): TTS$3;
|
|
4826
5928
|
}
|
|
4827
5929
|
|
|
4828
|
-
/** Constructor options for {@link RimeTTS}. */
|
|
4829
|
-
interface RimeTTSOptions$1 {
|
|
4830
|
-
model?: string;
|
|
4831
|
-
speaker?: string;
|
|
4832
|
-
lang?: string;
|
|
4833
|
-
sampleRate?: number;
|
|
4834
|
-
repetitionPenalty?: number;
|
|
4835
|
-
temperature?: number;
|
|
4836
|
-
topP?: number;
|
|
4837
|
-
maxTokens?: number;
|
|
4838
|
-
speedAlpha?: number;
|
|
4839
|
-
reduceLatency?: boolean;
|
|
4840
|
-
pauseBetweenBrackets?: boolean;
|
|
4841
|
-
phonemizeBetweenBrackets?: boolean;
|
|
4842
|
-
baseUrl?: string;
|
|
4843
|
-
}
|
|
4844
|
-
/** Rime TTS adapter for the `users.rime.ai/v1/rime-tts` HTTP streaming endpoint. */
|
|
4845
|
-
declare class RimeTTS {
|
|
4846
|
-
private readonly apiKey;
|
|
4847
|
-
private readonly model;
|
|
4848
|
-
private readonly speaker;
|
|
4849
|
-
private readonly lang;
|
|
4850
|
-
private readonly sampleRate;
|
|
4851
|
-
private readonly repetitionPenalty?;
|
|
4852
|
-
private readonly temperature?;
|
|
4853
|
-
private readonly topP?;
|
|
4854
|
-
private readonly maxTokens?;
|
|
4855
|
-
private readonly speedAlpha?;
|
|
4856
|
-
private readonly reduceLatency?;
|
|
4857
|
-
private readonly pauseBetweenBrackets?;
|
|
4858
|
-
private readonly phonemizeBetweenBrackets?;
|
|
4859
|
-
private readonly baseUrl;
|
|
4860
|
-
private readonly totalTimeoutMs;
|
|
4861
|
-
constructor(apiKey: string, opts?: RimeTTSOptions$1);
|
|
4862
|
-
private buildPayload;
|
|
4863
|
-
/** Synthesize text and return the concatenated audio buffer. */
|
|
4864
|
-
synthesize(text: string): Promise<Buffer>;
|
|
4865
|
-
/**
|
|
4866
|
-
* Synthesize text and yield raw PCM_S16LE chunks at the configured
|
|
4867
|
-
* `sampleRate` as they stream in.
|
|
4868
|
-
*/
|
|
4869
|
-
synthesizeStream(text: string): AsyncGenerator<Buffer>;
|
|
4870
|
-
}
|
|
4871
|
-
|
|
4872
5930
|
/** Rime TTS for Patter pipeline mode. */
|
|
4873
5931
|
|
|
4874
5932
|
/** Constructor options for the Rime `TTS` adapter. */
|
|
@@ -5001,6 +6059,8 @@ interface InworldTTSOptions$1 {
|
|
|
5001
6059
|
* before calling the constructor.
|
|
5002
6060
|
*/
|
|
5003
6061
|
declare class InworldTTS {
|
|
6062
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
6063
|
+
static readonly providerKey = "inworld";
|
|
5004
6064
|
private readonly authToken;
|
|
5005
6065
|
private readonly model;
|
|
5006
6066
|
private readonly voice;
|
|
@@ -5014,6 +6074,33 @@ declare class InworldTTS {
|
|
|
5014
6074
|
private readonly baseUrl;
|
|
5015
6075
|
constructor(authToken: string, opts?: InworldTTSOptions$1);
|
|
5016
6076
|
private buildPayload;
|
|
6077
|
+
/**
|
|
6078
|
+
* Pre-call HTTP warmup for the Inworld TTS API.
|
|
6079
|
+
*
|
|
6080
|
+
* Issues a lightweight `GET /tts/v1/voices` against the API host so
|
|
6081
|
+
* DNS + TLS + HTTP/2 connection are already up by the time the first
|
|
6082
|
+
* `synthesizeStream()` POST lands. Best-effort: 5 s timeout, all
|
|
6083
|
+
* exceptions swallowed at debug level.
|
|
6084
|
+
*
|
|
6085
|
+
* Earlier revisions issued `HEAD` against the streaming endpoint
|
|
6086
|
+
* (`/tts/v1/voice:stream`). That endpoint is POST-only so HEAD
|
|
6087
|
+
* returns `405 Method Not Allowed` — the warmup still completed the
|
|
6088
|
+
* TLS handshake but spammed 405 errors into Inworld's audit logs and
|
|
6089
|
+
* into our own logs. Switching to a documented `GET /tts/v1/voices`
|
|
6090
|
+
* metadata read is a 2xx-clean equivalent.
|
|
6091
|
+
*
|
|
6092
|
+
* Billing safety: `GET /tts/v1/voices` is a free metadata endpoint
|
|
6093
|
+
* (per https://docs.inworld.ai/). It returns the voice catalogue
|
|
6094
|
+
* without invoking the synthesis pipeline. The actual synthesis is
|
|
6095
|
+
* billed only when `POST /tts/v1/voice:stream` runs with a non-empty
|
|
6096
|
+
* `text`.
|
|
6097
|
+
*
|
|
6098
|
+
* Note: Inworld TTS uses the HTTP NDJSON streaming path rather than
|
|
6099
|
+
* a persistent WebSocket — connection warmup is therefore HTTP-based,
|
|
6100
|
+
* not WebSocket pre-handshake. The latency win is smaller (~50-150 ms)
|
|
6101
|
+
* than the WS-based prewarms but still real on cold-start calls.
|
|
6102
|
+
*/
|
|
6103
|
+
warmup(): Promise<void>;
|
|
5017
6104
|
/** Synthesize text and return the concatenated audio buffer. */
|
|
5018
6105
|
synthesize(text: string): Promise<Buffer>;
|
|
5019
6106
|
/**
|
|
@@ -5143,6 +6230,8 @@ interface AnthropicLLMOptions$1 {
|
|
|
5143
6230
|
}
|
|
5144
6231
|
/** LLM provider backed by Anthropic's Messages API (streaming). */
|
|
5145
6232
|
declare class AnthropicLLMProvider implements LLMProvider {
|
|
6233
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
6234
|
+
static readonly providerKey = "anthropic";
|
|
5146
6235
|
private readonly apiKey;
|
|
5147
6236
|
private readonly model;
|
|
5148
6237
|
private readonly maxTokens;
|
|
@@ -5151,6 +6240,13 @@ declare class AnthropicLLMProvider implements LLMProvider {
|
|
|
5151
6240
|
private readonly anthropicVersion;
|
|
5152
6241
|
private readonly promptCaching;
|
|
5153
6242
|
constructor(options: AnthropicLLMOptions$1);
|
|
6243
|
+
/**
|
|
6244
|
+
* Pre-call DNS / TLS warmup for the Anthropic Messages API.
|
|
6245
|
+
* Issues a lightweight ``GET https://api.anthropic.com/v1/models`` so
|
|
6246
|
+
* DNS, TLS and HTTP/2 are already up by the time the first ``messages``
|
|
6247
|
+
* call lands. Best-effort: 5 s timeout, exceptions swallowed at debug.
|
|
6248
|
+
*/
|
|
6249
|
+
warmup(): Promise<void>;
|
|
5154
6250
|
/** Stream Patter-format LLM chunks for the given OpenAI-style chat history. */
|
|
5155
6251
|
stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
|
|
5156
6252
|
}
|
|
@@ -5238,6 +6334,8 @@ interface GroqLLMOptions$1 {
|
|
|
5238
6334
|
}
|
|
5239
6335
|
/** LLM provider backed by Groq's OpenAI-compatible Chat Completions API. */
|
|
5240
6336
|
declare class GroqLLMProvider implements LLMProvider {
|
|
6337
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
6338
|
+
static readonly providerKey = "groq";
|
|
5241
6339
|
private readonly apiKey;
|
|
5242
6340
|
readonly model: string;
|
|
5243
6341
|
private readonly baseUrl;
|
|
@@ -5252,6 +6350,11 @@ declare class GroqLLMProvider implements LLMProvider {
|
|
|
5252
6350
|
private readonly presencePenalty?;
|
|
5253
6351
|
private readonly stop?;
|
|
5254
6352
|
constructor(options: GroqLLMOptions$1);
|
|
6353
|
+
/**
|
|
6354
|
+
* Pre-call DNS / TLS warmup for the Groq inference endpoint.
|
|
6355
|
+
* Best-effort: 5 s timeout, all exceptions swallowed at debug level.
|
|
6356
|
+
*/
|
|
6357
|
+
warmup(): Promise<void>;
|
|
5255
6358
|
/** Stream Patter-format LLM chunks from the Groq chat completions API. */
|
|
5256
6359
|
stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
|
|
5257
6360
|
}
|
|
@@ -5371,6 +6474,8 @@ interface CerebrasLLMOptions$1 {
|
|
|
5371
6474
|
* - zai-glm-4.7
|
|
5372
6475
|
*/
|
|
5373
6476
|
declare class CerebrasLLMProvider implements LLMProvider {
|
|
6477
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
6478
|
+
static readonly providerKey = "cerebras";
|
|
5374
6479
|
private readonly apiKey;
|
|
5375
6480
|
readonly model: string;
|
|
5376
6481
|
private readonly baseUrl;
|
|
@@ -5386,6 +6491,11 @@ declare class CerebrasLLMProvider implements LLMProvider {
|
|
|
5386
6491
|
private readonly presencePenalty?;
|
|
5387
6492
|
private readonly stop?;
|
|
5388
6493
|
constructor(options: CerebrasLLMOptions$1);
|
|
6494
|
+
/**
|
|
6495
|
+
* Pre-call DNS / TLS warmup for the Cerebras inference endpoint.
|
|
6496
|
+
* Best-effort: 5 s timeout, all exceptions swallowed at debug level.
|
|
6497
|
+
*/
|
|
6498
|
+
warmup(): Promise<void>;
|
|
5389
6499
|
/** Stream Patter-format LLM chunks from the Cerebras chat completions API. */
|
|
5390
6500
|
stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
|
|
5391
6501
|
}
|
|
@@ -5468,12 +6578,22 @@ interface GoogleLLMOptions$1 {
|
|
|
5468
6578
|
}
|
|
5469
6579
|
/** LLM provider backed by Google Gemini (Developer API, streaming SSE). */
|
|
5470
6580
|
declare class GoogleLLMProvider implements LLMProvider {
|
|
6581
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
6582
|
+
static readonly providerKey = "google";
|
|
5471
6583
|
private readonly apiKey;
|
|
5472
6584
|
readonly model: string;
|
|
5473
6585
|
private readonly baseUrl;
|
|
5474
6586
|
private readonly temperature?;
|
|
5475
6587
|
private readonly maxOutputTokens?;
|
|
5476
6588
|
constructor(options: GoogleLLMOptions$1);
|
|
6589
|
+
/**
|
|
6590
|
+
* Pre-call DNS / TLS warmup for the Gemini API.
|
|
6591
|
+
* Issues a lightweight ``GET ${baseUrl}/models?key=...`` so DNS, TLS
|
|
6592
|
+
* and HTTP/2 are already up by the time the first
|
|
6593
|
+
* ``streamGenerateContent`` call lands. Best-effort: 5 s timeout, all
|
|
6594
|
+
* exceptions swallowed at debug level.
|
|
6595
|
+
*/
|
|
6596
|
+
warmup(): Promise<void>;
|
|
5477
6597
|
/** Stream Patter-format LLM chunks from the Gemini SSE endpoint. */
|
|
5478
6598
|
stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
|
|
5479
6599
|
}
|
|
@@ -5597,7 +6717,10 @@ declare class SileroVAD implements VADProvider {
|
|
|
5597
6717
|
* - `activationThreshold = 0.5` — upstream `threshold`
|
|
5598
6718
|
* - `deactivationThreshold = 0.35` — upstream `neg_threshold = threshold - 0.15`
|
|
5599
6719
|
* - `minSpeechDuration = 0.25` — upstream `min_speech_duration_ms = 250`
|
|
5600
|
-
* - `minSilenceDuration = 0.
|
|
6720
|
+
* - `minSilenceDuration = 0.4` — telephony default (was 0.1, bumped after
|
|
6721
|
+
* round 10f found speech_end firing on inter-sentence pauses < 250 ms,
|
|
6722
|
+
* causing double-talk dispatch). 400 ms matches the industry telephony
|
|
6723
|
+
* default and the inter_utterance_gap_ms debounce in stream-handler.ts.
|
|
5601
6724
|
* - `prefixPaddingDuration = 0.03` — upstream `speech_pad_ms = 30`
|
|
5602
6725
|
*
|
|
5603
6726
|
* Override any field by passing `options`. Deployments that experience
|
|
@@ -5639,6 +6762,298 @@ declare class SileroVAD implements VADProvider {
|
|
|
5639
6762
|
private advanceState;
|
|
5640
6763
|
/** Mark the VAD as closed; subsequent processFrame calls throw. */
|
|
5641
6764
|
close(): Promise<void>;
|
|
6765
|
+
/**
|
|
6766
|
+
* Reset all per-utterance state so the next ``processFrame`` starts from
|
|
6767
|
+
* a clean SILENCE state.
|
|
6768
|
+
*
|
|
6769
|
+
* Called by the stream handler between agent turns to prevent a "stuck
|
|
6770
|
+
* SPEECH" condition where PSTN echo / loopback kept the detector's
|
|
6771
|
+
* probability above ``deactivationThreshold`` for the entire agent turn.
|
|
6772
|
+
* Without this reset the next user utterance would never trigger a
|
|
6773
|
+
* SILENCE→SPEECH transition and barge-in would feel "one-shot" (works
|
|
6774
|
+
* once, then never again until the call ends).
|
|
6775
|
+
*
|
|
6776
|
+
* Safe to call any time including on a closed instance (no-op).
|
|
6777
|
+
*/
|
|
6778
|
+
reset(): void;
|
|
6779
|
+
}
|
|
6780
|
+
|
|
6781
|
+
/** Options accepted by {@link DeepFilterNetFilter}. */
|
|
6782
|
+
interface DeepFilterNetOptions {
|
|
6783
|
+
/** Absolute path to a DeepFilterNet ONNX model. If omitted, the filter
|
|
6784
|
+
* logs a warning and becomes a pass-through. */
|
|
6785
|
+
modelPath?: string;
|
|
6786
|
+
/** When true, disable the pass-through warning (used by tests). */
|
|
6787
|
+
silenceWarnings?: boolean;
|
|
6788
|
+
}
|
|
6789
|
+
/** OSS noise-suppression filter backed by a DeepFilterNet ONNX model. */
|
|
6790
|
+
declare class DeepFilterNetFilter implements AudioFilter {
|
|
6791
|
+
private readonly modelPath;
|
|
6792
|
+
private readonly silenceWarnings;
|
|
6793
|
+
private session;
|
|
6794
|
+
private ort;
|
|
6795
|
+
private warned;
|
|
6796
|
+
private closed;
|
|
6797
|
+
private _resamplerSrcRate;
|
|
6798
|
+
private _upsamplerInst;
|
|
6799
|
+
private _downsamplerInst;
|
|
6800
|
+
constructor(options?: DeepFilterNetOptions);
|
|
6801
|
+
private ensureSession;
|
|
6802
|
+
/** Run noise suppression on a PCM16 chunk; pass-through when no model is loaded. */
|
|
6803
|
+
process(pcmChunk: Buffer, sampleRate: number): Promise<Buffer>;
|
|
6804
|
+
/** Flush resamplers, release the ONNX session, and mark the filter closed. */
|
|
6805
|
+
close(): Promise<void>;
|
|
6806
|
+
}
|
|
6807
|
+
|
|
6808
|
+
/**
|
|
6809
|
+
* Krisp VIVA noise-reduction AudioFilter — TypeScript scaffold.
|
|
6810
|
+
*
|
|
6811
|
+
* Mirrors the API of the Python `getpatter.providers.krisp_filter.KrispVivaFilter`
|
|
6812
|
+
* for SDK parity. As of 2026-05 Krisp does not publish an official Node.js
|
|
6813
|
+
* (server) SDK; third-party browser/RN wrappers exist but cannot process
|
|
6814
|
+
* server-received PCM/mulaw audio. This class throws at construction time
|
|
6815
|
+
* and points the caller at the available paths (Python SDK or DeepFilterNet
|
|
6816
|
+
* on TS).
|
|
6817
|
+
*
|
|
6818
|
+
* When Krisp publishes an official Node binding — or a community NAPI/WASM
|
|
6819
|
+
* wrapper becomes available — the import below and `process()` body will
|
|
6820
|
+
* fill in. The class signature is intentionally compatible with the Python
|
|
6821
|
+
* one so callers do not need to migrate code: `camelCase` ↔ `snake_case`,
|
|
6822
|
+
* `modelPath` ↔ `model_path`, etc.
|
|
6823
|
+
*
|
|
6824
|
+
* Krisp VIVA is a proprietary SDK and requires a commercial license plus a
|
|
6825
|
+
* `.kef` model file provided by the user. Patter ships only the
|
|
6826
|
+
* AudioFilter interface scaffold — never the SDK or model.
|
|
6827
|
+
*
|
|
6828
|
+
* @see https://krisp.ai/developers/
|
|
6829
|
+
*/
|
|
6830
|
+
|
|
6831
|
+
/** Krisp-supported sample rates (parity with Python `KrispSampleRate`). */
|
|
6832
|
+
declare const KrispSampleRate: {
|
|
6833
|
+
readonly HZ_8000: 8000;
|
|
6834
|
+
readonly HZ_16000: 16000;
|
|
6835
|
+
readonly HZ_32000: 32000;
|
|
6836
|
+
readonly HZ_44100: 44100;
|
|
6837
|
+
readonly HZ_48000: 48000;
|
|
6838
|
+
};
|
|
6839
|
+
type KrispSampleRate = (typeof KrispSampleRate)[keyof typeof KrispSampleRate];
|
|
6840
|
+
/** Krisp-supported frame durations in ms (parity with Python `KrispFrameDuration`). */
|
|
6841
|
+
declare const KrispFrameDuration: {
|
|
6842
|
+
readonly MS_10: 10;
|
|
6843
|
+
readonly MS_15: 15;
|
|
6844
|
+
readonly MS_20: 20;
|
|
6845
|
+
readonly MS_30: 30;
|
|
6846
|
+
readonly MS_32: 32;
|
|
6847
|
+
};
|
|
6848
|
+
type KrispFrameDuration = (typeof KrispFrameDuration)[keyof typeof KrispFrameDuration];
|
|
6849
|
+
/** Options accepted by {@link KrispVivaFilter}. */
|
|
6850
|
+
interface KrispVivaFilterOptions {
|
|
6851
|
+
/**
|
|
6852
|
+
* Path to the Krisp `.kef` model file. If omitted, falls back to the
|
|
6853
|
+
* `KRISP_VIVA_FILTER_MODEL_PATH` environment variable.
|
|
6854
|
+
*/
|
|
6855
|
+
readonly modelPath?: string;
|
|
6856
|
+
/** Noise-suppression strength in `[0, 100]`. Defaults to `100`. */
|
|
6857
|
+
readonly noiseSuppressionLevel?: number;
|
|
6858
|
+
/** Frame duration in ms. One of `10, 15, 20, 30, 32`. Defaults to `10`. */
|
|
6859
|
+
readonly frameDurationMs?: KrispFrameDuration | number;
|
|
6860
|
+
/** Initial sample rate in Hz. Defaults to `16000`. Re-created lazily if it changes mid-call. */
|
|
6861
|
+
readonly sampleRate?: KrispSampleRate | number;
|
|
6862
|
+
}
|
|
6863
|
+
/**
|
|
6864
|
+
* Krisp VIVA noise-reduction filter — TypeScript scaffold (NOT YET IMPLEMENTED).
|
|
6865
|
+
*
|
|
6866
|
+
* Construction throws with a guidance message because Krisp does not ship a
|
|
6867
|
+
* Node.js SDK. The class exists for API parity with the Python
|
|
6868
|
+
* `KrispVivaFilter` so that user code does not need to be rewritten when a
|
|
6869
|
+
* Node binding lands.
|
|
6870
|
+
*
|
|
6871
|
+
* For TS users today, use {@link DeepFilterNetFilter} from
|
|
6872
|
+
* `./deepfilternet-filter` instead — same `AudioFilter` interface, no
|
|
6873
|
+
* license required.
|
|
6874
|
+
*
|
|
6875
|
+
* @example
|
|
6876
|
+
* ```ts
|
|
6877
|
+
* // FUTURE — when Krisp publishes a Node SDK:
|
|
6878
|
+
* import { KrispVivaFilter } from 'getpatter/providers/krisp-filter';
|
|
6879
|
+
* const filter = new KrispVivaFilter({ modelPath: '/path/to/model.kef' });
|
|
6880
|
+
* const agent = phone.agent({ audioFilter: filter, ... });
|
|
6881
|
+
* ```
|
|
6882
|
+
*/
|
|
6883
|
+
declare class KrispVivaFilter implements AudioFilter {
|
|
6884
|
+
static readonly providerKey = "krisp_viva";
|
|
6885
|
+
constructor(_options?: KrispVivaFilterOptions);
|
|
6886
|
+
process(pcmChunk: Buffer, _sampleRate: number): Promise<Buffer>;
|
|
6887
|
+
close(): Promise<void>;
|
|
6888
|
+
}
|
|
6889
|
+
|
|
6890
|
+
/**
|
|
6891
|
+
* OpenAI Realtime adapter for the GA Realtime API (`gpt-realtime-2`).
|
|
6892
|
+
*
|
|
6893
|
+
* `gpt-realtime-2` is served from the same `wss://api.openai.com/v1/realtime`
|
|
6894
|
+
* endpoint as the v1-beta family, but the GA endpoint:
|
|
6895
|
+
* - REJECTS the legacy `OpenAI-Beta: realtime=v1` header (returns
|
|
6896
|
+
* `invalid_model` with message "Model X is only available on the GA API").
|
|
6897
|
+
* - REQUIRES `session.type === "realtime"` at the root of `session.update`.
|
|
6898
|
+
* - Uses `output_modalities` (was `modalities`).
|
|
6899
|
+
* - Nests audio config under `audio.{input,output}` with MIME `type`
|
|
6900
|
+
* strings (`audio/pcmu`, `audio/pcma`, `audio/pcm`) instead of the v1
|
|
6901
|
+
* enum strings (`g711_ulaw`, `g711_alaw`, `pcm16`) and moves `voice`
|
|
6902
|
+
* under `audio.output.voice`, `transcription` + `turn_detection`
|
|
6903
|
+
* under `audio.input`.
|
|
6904
|
+
*
|
|
6905
|
+
* Everything ELSE (event names, audio delta dispatch, barge-in / truncate
|
|
6906
|
+
* semantics, heartbeat, tool calling) is API-compatible with the v1 family,
|
|
6907
|
+
* so this adapter subclasses {@link OpenAIRealtimeAdapter} and overrides
|
|
6908
|
+
* only `connect()`. The runtime behaviour (`sendAudio`, `cancelResponse`,
|
|
6909
|
+
* `sendText`, `sendFirstMessage`, …) is inherited unchanged.
|
|
6910
|
+
*/
|
|
6911
|
+
|
|
6912
|
+
/**
|
|
6913
|
+
* Realtime WebSocket adapter speaking OpenAI's GA Realtime API.
|
|
6914
|
+
*
|
|
6915
|
+
* Note on audio transport: the GA endpoint accepts only PCM-16-LE with
|
|
6916
|
+
* `rate >= 24000` for both `session.audio.input.format` and
|
|
6917
|
+
* `session.audio.output.format`. The `audio/pcmu` MIME type appears to be
|
|
6918
|
+
* accepted at the protocol level but the server's audio engine does not
|
|
6919
|
+
* actually decode mulaw 8 kHz frames — they're silently dropped, the input
|
|
6920
|
+
* buffer stays empty, `input_audio_buffer.commit` returns
|
|
6921
|
+
* "buffer only has 0.00ms of audio", and the call ends up muted. Until
|
|
6922
|
+
* OpenAI documents native g711_ulaw on the GA endpoint we transcode on
|
|
6923
|
+
* both directions on the Patter side:
|
|
6924
|
+
* - inbound (Twilio/Telnyx → model): mulaw 8 kHz → PCM 24 kHz
|
|
6925
|
+
* - outbound (model → Twilio/Telnyx): PCM 24 kHz → mulaw 8 kHz
|
|
6926
|
+
*
|
|
6927
|
+
* The outbound path needs a stateful resampler instance because the
|
|
6928
|
+
* 24 kHz → 8 kHz decimator carries phase between chunks; sharing a single
|
|
6929
|
+
* instance across the call eliminates the boundary clicks a stateless
|
|
6930
|
+
* helper would produce on every audio delta.
|
|
6931
|
+
*/
|
|
6932
|
+
declare class OpenAIRealtime2Adapter extends OpenAIRealtimeAdapter {
|
|
6933
|
+
/** Two-stage outbound resampler for 24 kHz → 8 kHz. Created lazily on
|
|
6934
|
+
* the first audio frame so each Realtime session has its own state.
|
|
6935
|
+
*
|
|
6936
|
+
* We chain `24k → 16k → 8k` instead of using the direct `24k → 8k`
|
|
6937
|
+
* variant of {@link StatefulResampler}: the direct path is a 3:1
|
|
6938
|
+
* decimation with linear interpolation only — no anti-alias filter
|
|
6939
|
+
* — so any energy above 4 kHz in the source aliases down into the
|
|
6940
|
+
* audible band and is heard as raspy/scratchy artefacts on speech.
|
|
6941
|
+
* `gpt-realtime-2` outputs voice with significant content above
|
|
6942
|
+
* 4 kHz. The second stage (16k → 8k) uses a 5-tap FIR anti-alias
|
|
6943
|
+
* filter which removes the offending band before decimation, and
|
|
6944
|
+
* empirically (see commit message) the chain produces audibly
|
|
6945
|
+
* cleaner output. The 24k → 16k step is still pure linear-interp
|
|
6946
|
+
* but the inputs to it stay below the Nyquist of the 16 kHz stage,
|
|
6947
|
+
* so it doesn't introduce new artefacts.
|
|
6948
|
+
*/
|
|
6949
|
+
private outboundResampler24To16;
|
|
6950
|
+
private outboundResampler16To8;
|
|
6951
|
+
/** Last 8 kHz input sample carried across chunk boundaries for the
|
|
6952
|
+
* direct 3× linear upsample (see `transcodeInboundMulaw8ToPcm24`).
|
|
6953
|
+
* The carry guarantees the very first output of each chunk
|
|
6954
|
+
* interpolates from the *real* preceding sample, not from the chunk's
|
|
6955
|
+
* own first sample replicated — without it every 20 ms Twilio frame
|
|
6956
|
+
* boundary becomes a small DC step that the GA server VAD interprets
|
|
6957
|
+
* as constant low-energy noise, which never crosses the speech
|
|
6958
|
+
* threshold. */
|
|
6959
|
+
private inbound8kCarry;
|
|
6960
|
+
/** GA-shape `session.update` payload. See module-level docstring. */
|
|
6961
|
+
private buildGASessionConfig;
|
|
6962
|
+
/**
|
|
6963
|
+
* Open the Realtime WebSocket against the GA endpoint and apply the GA
|
|
6964
|
+
* session configuration. Header `OpenAI-Beta: realtime=v1` is OMITTED
|
|
6965
|
+
* (the GA endpoint rejects it). Wire shape uses nested `audio.{input,
|
|
6966
|
+
* output}` + `output_modalities` + `session.type === "realtime"`.
|
|
6967
|
+
*/
|
|
6968
|
+
connect(): Promise<void>;
|
|
6969
|
+
/**
|
|
6970
|
+
* GA-API variant of {@link OpenAIRealtimeAdapter.openParkedConnection}.
|
|
6971
|
+
* Opens a fresh Realtime WS against the GA endpoint, exchanges
|
|
6972
|
+
* `session.created` → GA-shape `session.update` → `session.updated`
|
|
6973
|
+
* so the upstream session is fully primed, and returns the OPEN
|
|
6974
|
+
* socket WITHOUT taking it on `this.ws` or arming the heartbeat /
|
|
6975
|
+
* message listener.
|
|
6976
|
+
*
|
|
6977
|
+
* Used by `Patter.parkProviderConnections` during the carrier
|
|
6978
|
+
* ringing window so the per-call `StreamHandler` can adopt the
|
|
6979
|
+
* primed socket at carrier `start` — eliminating the TCP + TLS +
|
|
6980
|
+
* HTTP-101 + `session.update` ack round-trip from the critical path.
|
|
6981
|
+
* Saves ~300-600 ms of first-audible-word latency.
|
|
6982
|
+
*
|
|
6983
|
+
* Bounded by 8 s. Throws on timeout / handshake failure / GA-side
|
|
6984
|
+
* rejection. Callers treat any error as a cache miss and fall
|
|
6985
|
+
* through to the cold {@link connect} path.
|
|
6986
|
+
*
|
|
6987
|
+
* Billing safety: confirmed by OpenAI's Managing Realtime Costs
|
|
6988
|
+
* guide — `session.update` does NOT invoke the model and bills no
|
|
6989
|
+
* tokens. An idle parked socket costs $0.
|
|
6990
|
+
*/
|
|
6991
|
+
openParkedConnection(): Promise<WebSocket__default>;
|
|
6992
|
+
/**
|
|
6993
|
+
* GA-API variant of {@link OpenAIRealtimeAdapter.adoptWebSocket}. Takes
|
|
6994
|
+
* over a WS that {@link openParkedConnection} produced (already through
|
|
6995
|
+
* `session.created` + `session.update` + `session.updated`) and arms
|
|
6996
|
+
* the heartbeat + message listener so the GA event-translation shim
|
|
6997
|
+
* is wired up. Skips the cold-connect path — saves ~300-600 ms on
|
|
6998
|
+
* first audible word.
|
|
6999
|
+
*
|
|
7000
|
+
* Caller MUST verify `ws.readyState === OPEN` before calling. If the
|
|
7001
|
+
* parked WS died between park and adopt, fall back to {@link connect}.
|
|
7002
|
+
*/
|
|
7003
|
+
adoptWebSocket(ws: WebSocket__default): void;
|
|
7004
|
+
/**
|
|
7005
|
+
* GA-API variant of {@link OpenAIRealtimeAdapter.sendFirstMessage}. Two
|
|
7006
|
+
* differences from the v1 path:
|
|
7007
|
+
*
|
|
7008
|
+
* 1. The v1 implementation sends `response.modalities` which the GA
|
|
7009
|
+
* endpoint rejects with `Unknown parameter: 'response.modalities'`.
|
|
7010
|
+
* Use `output_modalities` to match the GA `session.update` shape.
|
|
7011
|
+
*
|
|
7012
|
+
* 2. The GA `response.create` does NOT inherit `audio.output.voice`
|
|
7013
|
+
* from the session — it falls back to the server-side default
|
|
7014
|
+
* (`marin`, female) when the field is omitted on the response
|
|
7015
|
+
* itself. Session-level `voice: "alloy"` only affects subsequent
|
|
7016
|
+
* server-VAD-triggered responses, NOT this explicit
|
|
7017
|
+
* `response.create`. We re-inject the configured voice here so the
|
|
7018
|
+
* first-message voice matches the rest of the call.
|
|
7019
|
+
*/
|
|
7020
|
+
/**
|
|
7021
|
+
* Override the parent `sendAudio` to transcode inbound carrier audio
|
|
7022
|
+
* (mulaw 8 kHz from Twilio/Telnyx) into PCM-16 24 kHz before sending
|
|
7023
|
+
* `input_audio_buffer.append`. The GA server's audio engine ignores
|
|
7024
|
+
* mulaw frames (commit returns "buffer only has 0.00ms of audio") even
|
|
7025
|
+
* though it accepts `audio/pcmu` at the protocol level.
|
|
7026
|
+
*/
|
|
7027
|
+
sendAudio(mulawAudio: Buffer): void;
|
|
7028
|
+
/**
|
|
7029
|
+
* mulaw 8 kHz Buffer → PCM-16-LE 24 kHz Buffer.
|
|
7030
|
+
*
|
|
7031
|
+
* Direct 3× linear-interpolation upsample with a one-sample carry
|
|
7032
|
+
* across chunk boundaries. For every consecutive pair of 8 kHz
|
|
7033
|
+
* samples `(s_a, s_b)` we emit three 24 kHz samples:
|
|
7034
|
+
*
|
|
7035
|
+
* out_0 = s_a
|
|
7036
|
+
* out_1 = 2/3·s_a + 1/3·s_b
|
|
7037
|
+
* out_2 = 1/3·s_a + 2/3·s_b
|
|
7038
|
+
*
|
|
7039
|
+
* The carry stores the last 8 kHz sample of the chunk so the next
|
|
7040
|
+
* chunk can start by pairing `(carry, firstNewSample)` — that's what
|
|
7041
|
+
* keeps the output rate exact (each input sample → 3 output samples)
|
|
7042
|
+
* and eliminates the chunk-boundary DC step that confused the GA
|
|
7043
|
+
* server VAD. The first chunk has no carry and loses 3 samples at
|
|
7044
|
+
* the leading edge (375 µs of audio); that's well below any audible
|
|
7045
|
+
* artefact and well below the GA VAD's 300 ms prefix-padding window.
|
|
7046
|
+
*/
|
|
7047
|
+
private transcodeInboundMulaw8ToPcm24;
|
|
7048
|
+
/**
|
|
7049
|
+
* Base64 PCM-16-LE 24 kHz → Base64 mulaw 8 kHz. Used by the WS
|
|
7050
|
+
* translation shim on each `response.output_audio.delta`. The stateful
|
|
7051
|
+
* resampler is created lazily and reused across all deltas in this
|
|
7052
|
+
* session so the 3:1 decimator's phase carries across chunk
|
|
7053
|
+
* boundaries — without that, every chunk boundary produces a click.
|
|
7054
|
+
*/
|
|
7055
|
+
private transcodeOutboundPcm24ToMulaw8Buffer;
|
|
7056
|
+
sendFirstMessage(text: string): Promise<void>;
|
|
5642
7057
|
}
|
|
5643
7058
|
|
|
5644
7059
|
/**
|
|
@@ -6273,10 +7688,18 @@ declare class TwilioAdapter {
|
|
|
6273
7688
|
/** Place an outbound call. Returns the Twilio call SID. */
|
|
6274
7689
|
initiateCall(opts: InitiateCallOptions$1): Promise<InitiateCallResult$1>;
|
|
6275
7690
|
/**
|
|
6276
|
-
* Build a
|
|
6277
|
-
*
|
|
7691
|
+
* Build a ``<Response><Connect><Stream url="...">`` TwiML document.
|
|
7692
|
+
*
|
|
7693
|
+
* ``parameters`` is forwarded as ``<Parameter name="..." value="..."/>``
|
|
7694
|
+
* children of ``<Stream>``. Twilio Media Streams strips query-string params
|
|
7695
|
+
* from the ``<Stream url=...>`` before the WS handshake, so
|
|
7696
|
+
* ``<Parameter>`` tags are the supported way to pre-populate
|
|
7697
|
+
* ``start.customParameters`` on the WS ``start`` frame. Used by the
|
|
7698
|
+
* inbound path to carry caller / callee through to the bridge.
|
|
7699
|
+
*
|
|
7700
|
+
* Mirrors the Python adapter's ``generate_stream_twiml``.
|
|
6278
7701
|
*/
|
|
6279
|
-
static generateStreamTwiml(streamUrl: string): string;
|
|
7702
|
+
static generateStreamTwiml(streamUrl: string, parameters?: Record<string, string>): string;
|
|
6280
7703
|
/** Force-complete an in-progress call. */
|
|
6281
7704
|
endCall(callSid: string): Promise<void>;
|
|
6282
7705
|
}
|
|
@@ -6379,6 +7802,8 @@ declare class TelnyxSTT {
|
|
|
6379
7802
|
private readonly transcriptionEngine;
|
|
6380
7803
|
private readonly sampleRate;
|
|
6381
7804
|
private readonly baseUrl;
|
|
7805
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
7806
|
+
static readonly providerKey = "telnyx_stt";
|
|
6382
7807
|
private ws;
|
|
6383
7808
|
private callbacks;
|
|
6384
7809
|
private headerSent;
|
|
@@ -6425,6 +7850,8 @@ declare class TelnyxTTS {
|
|
|
6425
7850
|
private readonly apiKey;
|
|
6426
7851
|
private readonly voice;
|
|
6427
7852
|
private readonly baseUrl;
|
|
7853
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
7854
|
+
static readonly providerKey = "telnyx_tts";
|
|
6428
7855
|
constructor(apiKey: string, voice?: string, baseUrl?: string);
|
|
6429
7856
|
/** Collect every audio chunk into a single Buffer. */
|
|
6430
7857
|
synthesize(text: string): Promise<Buffer>;
|
|
@@ -6504,4 +7931,4 @@ interface CallEvent {
|
|
|
6504
7931
|
readonly direction?: string;
|
|
6505
7932
|
}
|
|
6506
7933
|
|
|
6507
|
-
export { type AgentOptions, type AgentState, AllProvidersFailedError, type AnthropicConversion, LLM$3 as AnthropicLLM, type AnthropicLLMOptions, type AnthropicMessage, AssemblyAIEncoding, AssemblyAIModel, STT$1 as AssemblyAISTT, type AssemblyAISTTOptions, type AudioConfig, type AudioSource, AuthenticationError, type BackgroundAudioOptions, BackgroundAudioPlayer, BuiltinAudioClip, type BuiltinAudioClipName, type BuiltinPcmSource, type CallControl, type CallEvent, type CallEventHandler, type CallMetrics, CallMetricsAccumulator, type CallRecord, type CartesiaEncoding, STT$3 as CartesiaSTT, type CartesiaSTTOptions, TTS$3 as CartesiaTTS, type CartesiaTTSOptions, LLM$1 as CerebrasLLM, type CerebrasLLMOptions, ChatContext, type ChatMessage, type ChatRole, CloudflareTunnel, type ConversationStateSnapshot, type CostBreakdown, DEFAULT_MIN_SENTENCE_LEN, DEFAULT_PRICING, DTMF_EVENTS, STT$6 as DeepgramSTT, type DeepgramSTTOptions, DefaultToolExecutor, type DefaultToolExecutorOptions, type DefineToolInput, type DtmfEvent, ConvAI as ElevenLabsConvAI, ElevenLabsConvAIAdapter, type ConvAIOptions as ElevenLabsConvAIOptions, TTS$6 as ElevenLabsTTS, type ElevenLabsTTSOptions, type ElevenLabsWebSocketOptions, TTS$5 as ElevenLabsWebSocketTTS, type EouTrigger, ErrorCode, EventBus, FallbackLLMProvider, type FallbackLLMProviderOptions, type FilePcmSource, GEMINI_DEFAULT_INPUT_SR, GEMINI_DEFAULT_OUTPUT_SR, GeminiLiveAdapter, type GeminiLiveEventHandler, LLM as GoogleLLM, type GoogleLLMOptions, LLM$2 as GroqLLM, type GroqLLMOptions, Guardrail$1 as Guardrail, type GuardrailOptions, type HookContext, IVRActivity, type IVRActivityOptions, type IVRToolDefinition, type IncomingMessage, type InitTracingOptions, TTS as InworldTTS, type InworldTTSOptions, type JobCallback, type LLMChunk, LLMLoop, type LLMProvider, LMNTAudioFormat, LMNTModel, LMNTSampleRate, TTS$1 as LMNTTTS, type LMNTTTSOptions, type LatencyBreakdown, type LocalCallOptions, type LocalConfig, type LocalOptions, type Logger, type 
LoopCallback, type MessageHandler, MetricsStore, Ngrok, LLM$4 as OpenAILLM, type OpenAILLMOptions, OpenAILLMProvider, type OpenAIMessage, Realtime as OpenAIRealtime, OpenAIRealtimeAdapter, type RealtimeOptions as OpenAIRealtimeOptions, TTS$4 as OpenAITTS, type OpenAITTSOptions, STT$4 as OpenAITranscribeSTT, type OpenAITranscribeSTTOptions, type ParamSpec, PartialStreamError, Patter, PatterConnectionError, PatterError, type PatterEventType, PatterTool, type PatterToolExecuteArgs, type PatterToolOptions, type PatterToolResult, PcmCarry, PipelineHookExecutor, type PipelineHooks, type PipelineMessageHandler, type ProviderPricing, ProvisionError, RateLimitError, type RawPcmSource, type RealtimeConfig, RemoteMessageHandler, TTS$2 as RimeTTS, type RimeTTSOptions, SPAN_BARGEIN, SPAN_CALL, SPAN_ENDPOINT, SPAN_LLM, SPAN_STT, SPAN_TOOL, SPAN_TTS, type SSEEvent, type STTConfig, type ScheduleHandle, SentenceChunker, type ServeOptions, type SilenceCallback, type SileroSampleRate, SileroVAD, type SileroVADOptions, STT$2 as SonioxSTT, type SonioxSTTOptions$1 as SonioxSTTOptions, type Span, type SpeechEventCallback, SpeechEvents, SpeechmaticsAudioEncoding, SpeechmaticsOperatingPoint, STT as SpeechmaticsSTT, type SpeechmaticsSTTOptions, SpeechmaticsSampleRate, SpeechmaticsServerMessage, TurnDetectionMode as SpeechmaticsTurnDetectionMode, StatefulResampler, type StatefulResamplerOptions, Static as StaticTunnel, type TTSConfig, Carrier as Telnyx, TelnyxAdapter, type TelnyxCarrierOptions, type ConfigureNumberOptions as TelnyxConfigureNumberOptions, type EndCallOptions as TelnyxEndCallOptions, type InitiateCallOptions as TelnyxInitiateCallOptions, type InitiateCallResult as TelnyxInitiateCallResult, type ProvisionNumberOptions as TelnyxProvisionNumberOptions, type ProvisionNumberResult as TelnyxProvisionNumberResult, TelnyxSTT, TelnyxSTTInputFormat, TelnyxSTTSampleRate, type Transcript as TelnyxSTTTranscript, TelnyxTTS, TelnyxTTSSampleRate, TelnyxTTSVoice, type 
TelnyxTranscriptionEngine, TestSession, TfidfLoopDetector, type TfidfLoopDetectorOptions, Tool, type ToolDefinition, type ToolExecutor, type ToolHandler, type ToolOptions, type TunnelHandle, type TurnMetrics, Carrier$1 as Twilio, TwilioAdapter, type TwilioAdapterOptions, type TwilioCarrierOptions, type ConfigureNumberOptions$1 as TwilioConfigureNumberOptions, type InitiateCallOptions$1 as TwilioInitiateCallOptions, type InitiateCallResult$1 as TwilioInitiateCallResult, type ProvisionNumberOptions$1 as TwilioProvisionNumberOptions, type ProvisionNumberResult$1 as TwilioProvisionNumberResult, ULTRAVOX_DEFAULT_API_BASE, ULTRAVOX_DEFAULT_SR, type UltravoxEventHandler, UltravoxRealtimeAdapter, type UserState, STT$5 as WhisperSTT, type WhisperSTTOptions, assemblyai, builtinClipPath, calculateRealtimeCost, calculateSttCost, calculateTelephonyCost, calculateTtsCost, callsToCsv, callsToJson, cartesia, createResampler16kTo8k, createResampler24kTo16k, createResampler24kTo8k, createResampler8kTo16k, deepgram, defineTool, elevenlabs, filterEmoji, filterForTTS, filterMarkdown, formatDtmf, geminiLive, getLogger, guardrail, initTracing, isRemoteUrl, isTracingEnabled, isWebSocketUrl, lmnt, makeAuthMiddleware, mergePricing, mixPcm, mountApi, mountDashboard, mulawToPcm16, notifyDashboard, openaiTts, pcm16ToMulaw, resample16kTo8k, resample24kTo16k, resample8kTo16k, resamplePcm, rime, scheduleCron, scheduleInterval, scheduleOnce, selectSoundFromList, setLogger, soniox, speechmatics, startSpan, startTunnel, tool, ultravox, whisper };
|
|
7934
|
+
export { type AgentOptions, type AgentState, AllProvidersFailedError, type AnthropicConversion, LLM$3 as AnthropicLLM, type AnthropicLLMOptions, type AnthropicMessage, AssemblyAIEncoding, AssemblyAIModel, STT$1 as AssemblyAISTT, type AssemblyAISTTOptions, type AudioConfig, type AudioSource, AuthenticationError, type BackgroundAudioOptions, BackgroundAudioPlayer, type EvaluateContext as BargeInEvaluateContext, type BargeInStrategy, BuiltinAudioClip, type BuiltinAudioClipName, type BuiltinPcmSource, type CallControl, type CallEvent, type CallEventHandler, type CallMetrics, CallMetricsAccumulator, type CallRecord, type CartesiaEncoding, STT$3 as CartesiaSTT, type CartesiaSTTOptions, TTS$3 as CartesiaTTS, CartesiaTTSModel, type CartesiaTTSOptions, CartesiaTTSVoiceMode, LLM$1 as CerebrasLLM, type CerebrasLLMOptions, ChatContext, type ChatMessage, type ChatRole, CloudflareTunnel, type ConversationStateSnapshot, type CostBreakdown, DEFAULT_MIN_SENTENCE_LEN, DEFAULT_PRICING, DTMF_EVENTS, DeepFilterNetFilter, type DeepFilterNetOptions, DeepgramModel, STT$6 as DeepgramSTT, type DeepgramSTTOptions, DefaultToolExecutor, type DefaultToolExecutorOptions, type DefineToolInput, type DtmfEvent, ConvAI as ElevenLabsConvAI, ElevenLabsConvAIAdapter, type ConvAIOptions as ElevenLabsConvAIOptions, ElevenLabsModel, ElevenLabsOutputFormat, ElevenLabsTTS as ElevenLabsRestTTS, TTS$6 as ElevenLabsTTS, type ElevenLabsTTSOptions, type ElevenLabsWebSocketOptions, TTS$5 as ElevenLabsWebSocketTTS, type EouTrigger, ErrorCode, EventBus, FallbackLLMProvider, type FallbackLLMProviderOptions, type FilePcmSource, GEMINI_DEFAULT_INPUT_SR, GEMINI_DEFAULT_OUTPUT_SR, GeminiLiveAdapter, type GeminiLiveEventHandler, LLM as GoogleLLM, type GoogleLLMOptions, LLM$2 as GroqLLM, type GroqLLMOptions, Guardrail$1 as Guardrail, type GuardrailOptions, type HookContext, IVRActivity, type IVRActivityOptions, type IVRToolDefinition, type IncomingMessage, type InitTracingOptions, TTS as InworldTTS, type 
InworldTTSOptions, type JobCallback, KrispFrameDuration, KrispSampleRate, KrispVivaFilter, type KrispVivaFilterOptions, type LLMChunk, LLMLoop, type LLMProvider, LMNTAudioFormat, LMNTModel, LMNTSampleRate, TTS$1 as LMNTTTS, type LMNTTTSOptions, type LatencyBreakdown, type LocalCallOptions, type LocalConfig, type LocalOptions, type Logger, type LoopCallback, type MessageHandler, MetricsStore, MinWordsStrategy, type MinWordsStrategyOptions, type ModelPricing, Ngrok, LLM$4 as OpenAILLM, type OpenAILLMOptions, OpenAILLMProvider, type OpenAIMessage, Realtime as OpenAIRealtime, Realtime2 as OpenAIRealtime2, OpenAIRealtime2Adapter, type Realtime2Options as OpenAIRealtime2Options, OpenAIRealtimeAdapter, OpenAIRealtimeAudioFormat, OpenAIRealtimeModel, type RealtimeOptions as OpenAIRealtimeOptions, OpenAIRealtimeVADType, TTS$4 as OpenAITTS, type OpenAITTSOptions, STT$4 as OpenAITranscribeSTT, type OpenAITranscribeSTTOptions, OpenAITranscriptionModel, OpenAIVoice, PRICING_LAST_UPDATED, PRICING_VERSION, type ParamSpec, PartialStreamError, Patter, PatterConnectionError, PatterError, type PatterEventType, PatterTool, type PatterToolExecuteArgs, type PatterToolOptions, type PatterToolResult, PcmCarry, PipelineHookExecutor, type PipelineHooks, type PipelineMessageHandler, PricingUnit, type PricingUnitValue, type ProviderPricing, ProvisionError, RateLimitError, type RawPcmSource, type RealtimeConfig, RemoteMessageHandler, RimeAudioFormat, RimeModel, TTS$2 as RimeTTS, type RimeTTSOptions, SPAN_BARGEIN, SPAN_CALL, SPAN_ENDPOINT, SPAN_LLM, SPAN_STT, SPAN_TOOL, SPAN_TTS, type SSEEvent, type STTConfig, type ScheduleHandle, SentenceChunker, type ServeOptions, type SilenceCallback, type SileroSampleRate, SileroVAD, type SileroVADOptions, STT$2 as SonioxSTT, type SonioxSTTOptions$1 as SonioxSTTOptions, type Span, type SpeechEventCallback, SpeechEvents, SpeechmaticsAudioEncoding, SpeechmaticsOperatingPoint, STT as SpeechmaticsSTT, type SpeechmaticsSTTOptions, SpeechmaticsSampleRate, 
SpeechmaticsServerMessage, TurnDetectionMode as SpeechmaticsTurnDetectionMode, StatefulResampler, type StatefulResamplerOptions, Static as StaticTunnel, type TTSConfig, Carrier as Telnyx, TelnyxAdapter, type TelnyxCarrierOptions, type ConfigureNumberOptions as TelnyxConfigureNumberOptions, type EndCallOptions as TelnyxEndCallOptions, type InitiateCallOptions as TelnyxInitiateCallOptions, type InitiateCallResult as TelnyxInitiateCallResult, type ProvisionNumberOptions as TelnyxProvisionNumberOptions, type ProvisionNumberResult as TelnyxProvisionNumberResult, TelnyxSTT, TelnyxSTTInputFormat, TelnyxSTTSampleRate, type Transcript as TelnyxSTTTranscript, TelnyxTTS, TelnyxTTSSampleRate, TelnyxTTSVoice, type TelnyxTranscriptionEngine, TestSession, TfidfLoopDetector, type TfidfLoopDetectorOptions, Tool, type ToolDefinition, type ToolExecutor, type ToolHandler, type ToolOptions, type TunnelHandle, type TurnMetrics, Carrier$1 as Twilio, TwilioAdapter, type TwilioAdapterOptions, type TwilioCarrierOptions, type ConfigureNumberOptions$1 as TwilioConfigureNumberOptions, type InitiateCallOptions$1 as TwilioInitiateCallOptions, type InitiateCallResult$1 as TwilioInitiateCallResult, type ProvisionNumberOptions$1 as TwilioProvisionNumberOptions, type ProvisionNumberResult$1 as TwilioProvisionNumberResult, ULTRAVOX_DEFAULT_API_BASE, ULTRAVOX_DEFAULT_SR, type UltravoxEventHandler, UltravoxRealtimeAdapter, type UserState, STT$5 as WhisperSTT, type WhisperSTTOptions, assemblyai, builtinClipPath, calculateRealtimeCost, calculateSttCost, calculateTelephonyCost, calculateTtsCost, callsToCsv, callsToJson, cartesia, createResampler16kTo8k, createResampler24kTo16k, createResampler24kTo8k, createResampler8kTo16k, deepgram, defineTool, elevenlabs, evaluateStrategies as evaluateBargeInStrategies, filterEmoji, filterForTTS, filterMarkdown, formatDtmf, geminiLive, getLogger, guardrail, initTracing, isRemoteUrl, isTracingEnabled, isWebSocketUrl, lmnt, makeAuthMiddleware, mergePricing, mixPcm, 
mountApi, mountDashboard, mulawToPcm16, notifyDashboard, openaiTts, pcm16ToMulaw, resample16kTo8k, resample24kTo16k, resample8kTo16k, resamplePcm, resetStrategies as resetBargeInStrategies, rime, scheduleCron, scheduleInterval, scheduleOnce, selectSoundFromList, setLogger, soniox, speechmatics, startSpan, startTunnel, tool, ultravox, whisper };
|