getpatter 0.6.0 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -1,3 +1,5 @@
1
+ import * as WebSocket from 'ws';
2
+ import WebSocket__default from 'ws';
1
3
  import { EventEmitter } from 'events';
2
4
  import { Request, Response, NextFunction, Express } from 'express';
3
5
 
@@ -59,7 +61,11 @@ declare class Carrier {
59
61
  interface RealtimeOptions {
60
62
  /** API key. Falls back to OPENAI_API_KEY env var when omitted. */
61
63
  apiKey?: string;
62
- /** Realtime model. Defaults to gpt-4o-mini-realtime-preview. */
64
+ /**
65
+ * Realtime model. Defaults to ``gpt-realtime-mini`` (bumped from the
66
+ * deprecated ``gpt-4o-mini-realtime-preview`` on 2026-05-25 for
67
+ * parity with the Python SDK and the GA Realtime API surface).
68
+ */
63
69
  model?: string;
64
70
  /** Voice preset. Defaults to alloy. */
65
71
  voice?: string;
@@ -104,6 +110,61 @@ declare class Realtime {
104
110
  constructor(opts?: RealtimeOptions);
105
111
  }
106
112
 
113
+ /**
114
+ * OpenAI Realtime 2 engine — marker class for Patter client dispatch.
115
+ *
116
+ * Wraps `gpt-realtime-2` (GA Realtime API). Separate marker from
117
+ * {@link import('./openai').Realtime} because the GA endpoint speaks a
118
+ * different `session.update` wire shape; the client dispatches to
119
+ * `OpenAIRealtime2Adapter` when this marker is passed.
120
+ */
121
+ /** Constructor options for the OpenAI `Realtime2` engine marker. */
122
+ interface Realtime2Options {
123
+ /** API key. Falls back to OPENAI_API_KEY env var when omitted. */
124
+ apiKey?: string;
125
+ /** GA Realtime model. Defaults to `gpt-realtime-2`. */
126
+ model?: string;
127
+ /** Voice preset. Defaults to alloy. */
128
+ voice?: string;
129
+ /**
130
+ * Reasoning-effort tier. When omitted the field is not sent and the
131
+ * server default applies. OpenAI recommends `"low"` for production
132
+ * voice flows — higher tiers add measurable per-turn latency.
133
+ */
134
+ reasoningEffort?: 'minimal' | 'low' | 'medium' | 'high';
135
+ /**
136
+ * Override for `audio.input.transcription.model`. Omit to keep the
137
+ * adapter default (`whisper-1`). Use `"gpt-realtime-whisper"` for
138
+ * low-latency transcript partials.
139
+ */
140
+ inputAudioTranscriptionModel?: string;
141
+ }
142
+ /**
143
+ * OpenAI Realtime 2 engine marker — selects `gpt-realtime-2` on the GA
144
+ * Realtime API.
145
+ *
146
+ * @example
147
+ * ```ts
148
+ * import { Patter, Twilio, OpenAIRealtime2 } from "getpatter";
149
+ *
150
+ * const phone = new Patter({ carrier: new Twilio(), phoneNumber: "+1..." });
151
+ * const agent = phone.agent({
152
+ * engine: new OpenAIRealtime2({ reasoningEffort: "low" }),
153
+ * systemPrompt: "You are a friendly receptionist.",
154
+ * firstMessage: "Hello! How can I help?",
155
+ * });
156
+ * ```
157
+ */
158
+ declare class Realtime2 {
159
+ readonly kind: "openai_realtime_2";
160
+ readonly apiKey: string;
161
+ readonly model: string;
162
+ readonly voice: string;
163
+ readonly reasoningEffort?: 'minimal' | 'low' | 'medium' | 'high';
164
+ readonly inputAudioTranscriptionModel?: string;
165
+ constructor(opts?: Realtime2Options);
166
+ }
167
+
107
168
  /** ElevenLabs ConvAI engine — marker class for Patter client dispatch. */
108
169
  /** Constructor options for the ElevenLabs `ConvAI` engine marker. */
109
170
  interface ConvAIOptions {
@@ -273,71 +334,6 @@ declare class Tool implements ToolDefinition {
273
334
  /** Factory helper mirroring Python's `tool(...)` function. */
274
335
  declare function tool(opts: ToolOptions): Tool;
275
336
 
276
- /**
277
- * Shared STT / TTS adapter dispatch.
278
- *
279
- * In v0.5.0+ callers always pass pre-instantiated adapters (``agent.stt`` /
280
- * ``agent.tts`` are ``STTAdapter`` / ``TTSAdapter`` instances), so these
281
- * helpers are thin pass-throughs that return the instance or null. Kept as
282
- * functions so the Twilio/Telnyx bridges have a single dispatch point.
283
- */
284
-
285
- /** Per-word timings / metadata (Deepgram-shaped). Optional on every adapter. */
286
- interface STTWord {
287
- readonly word?: string;
288
- readonly start?: number;
289
- readonly end?: number;
290
- readonly confidence?: number;
291
- readonly punctuated_word?: string;
292
- readonly speaker?: number;
293
- }
294
- /**
295
- * Facade transcript shape — widened to surface richer provider fields
296
- * (Deepgram emits all of them) without forcing adapters that only know
297
- * ``text``/``isFinal`` to change. All non-text fields are optional.
298
- */
299
- interface STTTranscript {
300
- text: string;
301
- isFinal?: boolean;
302
- /** Overall transcript confidence in [0, 1]. */
303
- confidence?: number;
304
- /** Provider-side end-of-utterance hint (faster than ``isFinal``). */
305
- speechFinal?: boolean;
306
- /** True when the result was produced in response to a Finalize command. */
307
- fromFinalize?: boolean;
308
- /** Provider request id (Deepgram populates this from the Metadata frame). */
309
- requestId?: string;
310
- /** Per-word timings / metadata when the provider emits them. */
311
- words?: ReadonlyArray<STTWord>;
312
- /** Which provider event this transcript represents (e.g. ``Results``). */
313
- eventType?: string;
314
- }
315
- /** Callback invoked by an `STTAdapter` for each (partial or final) transcript event. */
316
- type STTTranscriptCallback = (t: STTTranscript) => Promise<void> | void;
317
- /** Shape shared by every STT adapter in the SDK. */
318
- interface STTAdapter {
319
- connect(): Promise<void>;
320
- sendAudio(pcm: Buffer): void | Promise<void>;
321
- onTranscript(cb: STTTranscriptCallback): void;
322
- close(): void | Promise<void>;
323
- /**
324
- * Optional: ask the provider to immediately finalise the in-flight
325
- * utterance (rather than waiting for its own endpoint timer). Called by
326
- * ``StreamHandler`` whenever the SDK's VAD signals ``speech_end``, and
327
- * after a barge-in cancel — both moments where waiting for the
328
- * provider's endpoint heuristic stalls the next turn.
329
- *
330
- * Implementations that do not support utterance-level finalisation
331
- * (e.g. one-shot transcribers like Whisper) should omit this method
332
- * entirely; the stream handler does an optional-chained call.
333
- */
334
- finalize?(): void | Promise<void>;
335
- }
336
- /** Shape shared by every TTS adapter in the SDK. */
337
- interface TTSAdapter {
338
- synthesizeStream(text: string): AsyncIterable<Buffer>;
339
- }
340
-
341
337
  /**
342
338
  * Pipeline hook executor for pipeline mode.
343
339
  *
@@ -616,6 +612,22 @@ interface LLMStreamOptions {
616
612
  }
617
613
  interface LLMProvider {
618
614
  stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
615
+ /**
616
+ * Optional best-effort pre-call DNS / TLS / HTTP-keepalive warmup.
617
+ *
618
+ * Called once per outbound call from ``Patter.call`` when the agent has
619
+ * ``prewarm: true`` (the default). Concrete providers (OpenAI,
620
+ * Anthropic, Google, Cerebras, Groq) override this to issue a
621
+ * lightweight HTTPS GET to their inference endpoint so by the time the
622
+ * first ``stream()`` call lands, the connection pool already has a
623
+ * warm socket. Failures are logged at debug level and never abort the
624
+ * call — pure latency optimisation.
625
+ *
626
+ * Optional on the interface (``warmup?: ...``) so providers without a
627
+ * warmup hook still satisfy the type. Detected via runtime
628
+ * ``typeof provider.warmup === 'function'`` in the client.
629
+ */
630
+ warmup?(): Promise<void>;
619
631
  }
620
632
  /** Optional sampling kwargs forwarded into the OpenAI Chat Completions body. */
621
633
  interface OpenAILLMSamplingOptions {
@@ -642,6 +654,8 @@ interface OpenAILLMSamplingOptions {
642
654
  }
643
655
  /** LLM provider backed by OpenAI Chat Completions (streaming). */
644
656
  declare class OpenAILLMProvider implements LLMProvider {
657
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
658
+ static readonly providerKey = "openai";
645
659
  private readonly apiKey;
646
660
  readonly model: string;
647
661
  private readonly temperature?;
@@ -655,6 +669,23 @@ declare class OpenAILLMProvider implements LLMProvider {
655
669
  private readonly presencePenalty?;
656
670
  private readonly stop?;
657
671
  constructor(apiKey: string, model: string, sampling?: OpenAILLMSamplingOptions);
672
+ /** Subclasses (Cerebras, Groq) override this with their own host. */
673
+ protected get baseUrl(): string;
674
+ /**
675
+ * Pre-call DNS / TLS / HTTP-keepalive warmup.
676
+ *
677
+ * Issues a lightweight ``GET ${baseUrl}/models`` so DNS, TLS and HTTP/2
678
+ * are already up by the time the first ``chat.completions`` call lands.
679
+ * Best-effort: 5 s timeout, all exceptions swallowed at debug level.
680
+ *
681
+ * Note: an HTTPS GET warms DNS + TLS + connection pool but does NOT
682
+ * warm the inference path itself; for true inference warmup a real
683
+ * low-token request is needed, left as a follow-up. STT / TTS providers ship concrete
684
+ * WebSocket-based prewarms (Cartesia / Deepgram / AssemblyAI for STT;
685
+ * ElevenLabs WS for TTS) which save 200-500 ms each — those dominate
686
+ * the cold-start latency budget.
687
+ */
688
+ warmup(): Promise<void>;
658
689
  /** Stream OpenAI Chat Completions chunks for the given messages/tools. */
659
690
  stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
660
691
  }
@@ -669,6 +700,8 @@ declare class LLMLoop {
669
700
  private eventBus?;
670
701
  private readonly _providerName;
671
702
  private readonly _modelName;
703
+ private _usageMissingCount;
704
+ private _loggedUsageFallback;
672
705
  private onToolCall?;
673
706
  constructor(apiKey: string, model: string, systemPrompt: string, tools?: ToolDefinition[] | null, llmProvider?: LLMProvider, disablePhonePreamble?: boolean);
674
707
  /**
@@ -706,6 +739,87 @@ declare class LLMLoop {
706
739
  private buildMessages;
707
740
  }
708
741
 
742
+ /**
743
+ * Barge-in confirmation strategies.
744
+ *
745
+ * When a caller starts speaking while the agent's TTS is in flight, the SDK
746
+ * has to decide whether the speech is a real interruption or just a brief
747
+ * backchannel ("uh-huh", "okay") / room noise / cough. The default
748
+ * behaviour is to treat any VAD speech_start as a confirmed barge-in and
749
+ * cancel the agent immediately. That is fine for clean inputs but
750
+ * produces frequent false positives on PSTN: the agent gets cut
751
+ * mid-sentence by background chatter, breath, or filler words and never
752
+ * recovers the conversational thread.
753
+ *
754
+ * Each ``BargeInStrategy`` is consulted on every STT transcript while a
755
+ * barge-in is *pending* (VAD fired, but the agent has not yet been
756
+ * cancelled). The first strategy that returns ``true`` confirms the
757
+ * barge-in; if none do within the configured timeout the pending state
758
+ * is dropped and the agent resumes streaming TTS as if nothing happened.
759
+ * With an empty ``bargeInStrategies`` array the SDK falls back to the
760
+ * legacy "interrupt immediately on VAD" path, so adding strategies is
761
+ * a strict opt-in.
762
+ */
763
+ interface EvaluateContext {
764
+ /** Latest STT output text (interim or final). */
765
+ readonly transcript: string;
766
+ /** ``true`` for interim partials, ``false`` for finals. */
767
+ readonly isInterim: boolean;
768
+ /** Whether the agent's TTS is currently in flight. */
769
+ readonly agentSpeaking: boolean;
770
+ }
771
+ /**
772
+ * Decides whether a pending barge-in should be confirmed.
773
+ *
774
+ * Implementations must be safe to call from any number of evaluations
775
+ * per turn. ``reset`` is invoked when the agent finishes speaking
776
+ * naturally and when a pending barge-in times out without
777
+ * confirmation.
778
+ */
779
+ interface BargeInStrategy {
780
+ evaluate(ctx: EvaluateContext): Promise<boolean> | boolean;
781
+ reset?(): Promise<void> | void;
782
+ }
783
+ interface MinWordsStrategyOptions {
784
+ /**
785
+ * Minimum word count required while the agent is speaking. Reasonable
786
+ * values are 2-5; 3 is a good starting point for production phone
787
+ * agents. Must be ``>= 1``.
788
+ */
789
+ readonly minWords: number;
790
+ /**
791
+ * When ``true`` (default), interim STT partials are evaluated as soon
792
+ * as they arrive. Set to ``false`` to wait for finals only — slower
793
+ * but free of partial-word noise on jittery STT providers.
794
+ */
795
+ readonly useInterim?: boolean;
796
+ }
797
+ /**
798
+ * Confirm barge-in only after the caller has spoken ``minWords`` words.
799
+ *
800
+ * Filters short backchannels, single-word utterances, and stray
801
+ * transcription fragments that VAD picked up but were not real
802
+ * interruptions. While the agent is silent the strategy permits any
803
+ * speech to count (one word is enough), so the first user turn is not
804
+ * delayed.
805
+ */
806
+ declare class MinWordsStrategy implements BargeInStrategy {
807
+ private readonly minWords;
808
+ private readonly useInterim;
809
+ constructor(options: MinWordsStrategyOptions);
810
+ evaluate(ctx: EvaluateContext): boolean;
811
+ reset(): Promise<void>;
812
+ }
813
+ /**
814
+ * Short-circuit-OR composition: first strategy that confirms wins.
815
+ * Returns ``false`` for an empty array so callers can use the empty
816
+ * default to mean "no opt-in confirmation, fall back to legacy
817
+ * interrupt-on-VAD".
818
+ */
819
+ declare function evaluateStrategies(strategies: readonly BargeInStrategy[], ctx: EvaluateContext): Promise<boolean>;
820
+ /** Call ``reset()`` on every strategy, swallowing per-strategy errors. */
821
+ declare function resetStrategies(strategies: readonly BargeInStrategy[]): Promise<void>;
822
+
709
823
  /**
710
824
  * Public type definitions for the Patter SDK — agent options, pipeline hooks,
711
825
  * provider config envelopes, and serve/call request/response shapes.
@@ -967,6 +1081,15 @@ interface VADEvent {
967
1081
  interface VADProvider {
968
1082
  processFrame(pcmChunk: Buffer, sampleRate: number): Promise<VADEvent | null>;
969
1083
  close(): Promise<void>;
1084
+ /**
1085
+ * Optional: reset all per-utterance state so the next ``processFrame``
1086
+ * starts from a clean SILENCE state. Useful between agent turns to
1087
+ * prevent a "stuck SPEECH" condition where PSTN echo / loopback kept the
1088
+ * detector's internal probability above the deactivation threshold for
1089
+ * the full agent turn, leaving the VAD unable to emit ``speech_start``
1090
+ * on the next user utterance (one-shot barge-in bug).
1091
+ */
1092
+ reset?(): Promise<void> | void;
970
1093
  }
971
1094
  /** Pre-STT audio filter — noise cancellation, gain, EQ. */
972
1095
  interface AudioFilter {
@@ -1062,7 +1185,7 @@ interface AgentOptions {
1062
1185
  * matching mode (``openai_realtime`` or ``elevenlabs_convai``). When absent,
1063
1186
  * pipeline mode is selected if ``stt`` and ``tts`` are provided.
1064
1187
  */
1065
- engine?: Realtime | ConvAI;
1188
+ engine?: Realtime | Realtime2 | ConvAI;
1066
1189
  /**
1067
1190
  * Provider mode. Normally derived from ``engine`` / ``stt`` + ``tts``. Pass
1068
1191
  * ``'pipeline'`` explicitly when building a pipeline-mode agent without
@@ -1103,6 +1226,60 @@ interface AgentOptions {
1103
1226
  * Default: 300.
1104
1227
  */
1105
1228
  bargeInThresholdMs?: number;
1229
+ /**
1230
+ * Opt-in barge-in confirmation strategies (pipeline mode). With the
1231
+ * default empty array the SDK falls back to the legacy
1232
+ * "interrupt immediately on VAD speech_start" behaviour. When at
1233
+ * least one strategy is provided, a VAD speech_start during TTS
1234
+ * marks the barge-in as *pending* — the agent's TTS continues
1235
+ * streaming naturally and its in-flight LLM stream is preserved —
1236
+ * and the strategies are consulted on every STT transcript. The first strategy that
1237
+ * returns ``true`` confirms the barge-in (cancels TTS, flushes the
1238
+ * inbound ring buffer); if none confirm within
1239
+ * ``bargeInConfirmMs`` the pending state is dropped and TTS resumes.
1240
+ *
1241
+ * See ``getpatter`` exports ``BargeInStrategy`` /
1242
+ * ``MinWordsStrategy`` for the protocol and a reference
1243
+ * implementation.
1244
+ */
1245
+ bargeInStrategies?: readonly BargeInStrategy[];
1246
+ /**
1247
+ * Maximum time (ms) to wait for at least one strategy to confirm a
1248
+ * pending barge-in before discarding the pending state and resuming
1249
+ * TTS. Only consulted when ``bargeInStrategies`` is non-empty.
1250
+ * Default: 1500.
1251
+ */
1252
+ bargeInConfirmMs?: number;
1253
+ /**
1254
+ * When ``true`` (default), ``Patter.call`` warms up the STT, TTS, and
1255
+ * LLM provider connections in parallel with the carrier-side
1256
+ * ``initiateCall`` request so DNS, TLS, and HTTP/2 handshakes are
1257
+ * already complete by the time the callee answers. Adapters expose a
1258
+ * ``warmup()`` method returning ``Promise<void>`` (default no-op) —
1259
+ * providers can override to dial open a persistent connection ahead
1260
+ * of the WebSocket bridge. Best-effort: warmup failures are logged
1261
+ * at debug level and never abort the call. Default: ``true``.
1262
+ */
1263
+ prewarm?: boolean;
1264
+ /**
1265
+ * When ``true`` (default since 0.6.2 in pipeline mode), ``Patter.call``
1266
+ * pre-renders ``firstMessage`` to TTS audio bytes during the ringing
1267
+ * window and streams the cached buffer immediately when the carrier
1268
+ * emits ``start``. Eliminates the 200-700 ms TTS first-byte latency
1269
+ * on the greeting that dominated first-turn ``p95`` on every pipeline
1270
+ * acceptance run. The trade-off is paying the TTS bill even if the
1271
+ * call is never answered (silently logged at warn level when the call
1272
+ * fails) — typically $0.001-$0.005 per ringing call depending on TTS
1273
+ * provider. Opt out by passing ``prewarmFirstMessage: false`` (e.g.
1274
+ * for very high-volume outbound where un-answered TTS spend matters).
1275
+ *
1276
+ * **Pipeline mode only.** Realtime / ConvAI provider modes never
1277
+ * consume the prewarm cache (the StreamHandler for those modes runs
1278
+ * its first-message emit through the provider's own audio path), so
1279
+ * ``Patter.call`` refuses to spawn the prewarm task and emits a warn
1280
+ * when ``provider !== 'pipeline'``.
1281
+ */
1282
+ prewarmFirstMessage?: boolean;
1106
1283
  /**
1107
1284
  * When true, the sentence chunker emits the first clause of each response
1108
1285
  * on a soft punctuation boundary (",", em-dash, en-dash) once ~40 chars
@@ -1227,92 +1404,618 @@ interface LocalCallOptions {
1227
1404
  }
1228
1405
 
1229
1406
  /**
1230
- * In-memory metrics store for the local dashboard.
1231
- *
1232
- * Keeps the last `maxCalls` completed calls and tracks active calls.
1233
- * Supports SSE event subscribers for real-time updates.
1407
+ * Shared STT / TTS adapter dispatch.
1234
1408
  *
1235
- * Optional disk hydration: when `CallLogger` writes per-call records under
1236
- * `<root>/calls/YYYY/MM/DD/<call_id>/metadata.json`, calling
1237
- * `hydrate(logRoot)` on a fresh store rebuilds the in-memory list from those
1238
- * files so the dashboard survives process restarts (the persistence is in
1239
- * the JSONL/JSON files, the store is just a cache on top).
1409
+ * In v0.5.0+ callers always pass pre-instantiated adapters (``agent.stt`` /
1410
+ * ``agent.tts`` are ``STTAdapter`` / ``TTSAdapter`` instances), so these
1411
+ * helpers are thin pass-throughs that return the instance or null. Kept as
1412
+ * functions so the Twilio/Telnyx bridges have a single dispatch point.
1240
1413
  */
1241
1414
 
1242
- /** Snapshot of a call as held by the dashboard store. */
1243
- interface CallRecord {
1244
- call_id: string;
1245
- caller: string;
1246
- callee: string;
1247
- direction: string;
1248
- started_at: number;
1249
- ended_at?: number;
1250
- /**
1251
- * Current lifecycle state: ``initiated`` (pre-registered), ``ringing``,
1252
- * ``in-progress``, ``completed``, ``no-answer``, ``busy``, ``failed``,
1253
- * ``canceled``, or ``webhook_error``.
1254
- */
1255
- status?: string;
1256
- transcript?: Array<{
1257
- role: string;
1258
- text: string;
1259
- timestamp: number;
1260
- }>;
1261
- turns?: unknown[];
1262
- metrics?: Record<string, unknown> | null;
1263
- [key: string]: unknown;
1415
+ /** Per-word timings / metadata (Deepgram-shaped). Optional on every adapter. */
1416
+ interface STTWord {
1417
+ readonly word?: string;
1418
+ readonly start?: number;
1419
+ readonly end?: number;
1420
+ readonly confidence?: number;
1421
+ readonly punctuated_word?: string;
1422
+ readonly speaker?: number;
1264
1423
  }
1265
- /** Server-Sent-Event payload broadcast by `MetricsStore` for live UI updates. */
1266
- interface SSEEvent {
1267
- type: string;
1268
- data: Record<string, unknown>;
1424
+ /**
1425
+ * Facade transcript shape — widened to surface richer provider fields
1426
+ * (Deepgram emits all of them) without forcing adapters that only know
1427
+ * ``text``/``isFinal`` to change. All non-text fields are optional.
1428
+ */
1429
+ interface STTTranscript {
1430
+ text: string;
1431
+ isFinal?: boolean;
1432
+ /** Overall transcript confidence in [0, 1]. */
1433
+ confidence?: number;
1434
+ /** Provider-side end-of-utterance hint (faster than ``isFinal``). */
1435
+ speechFinal?: boolean;
1436
+ /** True when the result was produced in response to a Finalize command. */
1437
+ fromFinalize?: boolean;
1438
+ /** Provider request id (Deepgram populates this from the Metadata frame). */
1439
+ requestId?: string;
1440
+ /** Per-word timings / metadata when the provider emits them. */
1441
+ words?: ReadonlyArray<STTWord>;
1442
+ /** Which provider event this transcript represents (e.g. ``Results``). */
1443
+ eventType?: string;
1269
1444
  }
1270
- /** In-memory bounded ring buffer of recent calls plus active-call tracking. */
1271
- declare class MetricsStore extends EventEmitter {
1272
- private readonly maxCalls;
1273
- private calls;
1274
- private activeCalls;
1445
+ /** Callback invoked by an `STTAdapter` for each (partial or final) transcript event. */
1446
+ type STTTranscriptCallback = (t: STTTranscript) => Promise<void> | void;
1447
+ /** Shape shared by every STT adapter in the SDK. */
1448
+ interface STTAdapter {
1449
+ connect(): Promise<void>;
1450
+ sendAudio(pcm: Buffer): void | Promise<void>;
1451
+ onTranscript(cb: STTTranscriptCallback): void;
1452
+ close(): void | Promise<void>;
1275
1453
  /**
1276
- * Accepts either a numeric ``maxCalls`` (legacy positional matches the
1277
- * original TS API) or an options object ``{ maxCalls }`` to align with the
1278
- * Python SDK's keyword-argument style. Plain literals also work:
1279
- * ``new MetricsStore()`` / ``new MetricsStore(100)`` / ``new MetricsStore({ maxCalls: 100 })``.
1454
+ * Optional: ask the provider to immediately finalise the in-flight
1455
+ * utterance (rather than waiting for its own endpoint timer). Called by
1456
+ * ``StreamHandler`` whenever the SDK's VAD signals ``speech_end``, and
1457
+ * after a barge-in cancel — both moments where waiting for the
1458
+ * provider's endpoint heuristic stalls the next turn.
1459
+ *
1460
+ * Implementations that do not support utterance-level finalisation
1461
+ * (e.g. one-shot transcribers like Whisper) should omit this method
1462
+ * entirely; the stream handler does an optional-chained call.
1280
1463
  */
1281
- constructor(maxCallsOrOpts?: number | {
1282
- maxCalls?: number;
1283
- });
1284
- private publish;
1285
- /** Mark a call as in-progress (creates the row if it does not yet exist). */
1286
- recordCallStart(data: Record<string, unknown>): void;
1464
+ finalize?(): void | Promise<void>;
1287
1465
  /**
1288
- * Pre-register an outbound call before any webhook fires. Lets the
1289
- * dashboard surface attempts that never reach media (no-answer, busy,
1290
- * carrier-rejected). Mirrors the Python ``record_call_initiated``.
1466
+ * Optional best-effort pre-call DNS / TLS / HTTP-keepalive warmup.
1467
+ * Default behaviour is a no-op — providers that benefit (e.g.
1468
+ * provider WebSockets with a slow handshake) can override. Failures
1469
+ * must never abort the call.
1291
1470
  */
1292
- recordCallInitiated(data: Record<string, unknown>): void;
1471
+ warmup?(): Promise<void>;
1472
+ }
1473
+ /** Shape shared by every TTS adapter in the SDK. */
1474
+ interface TTSAdapter {
1475
+ synthesizeStream(text: string): AsyncIterable<Buffer>;
1293
1476
  /**
1294
- * Update the status of an active or completed call. Terminal states
1295
- * (completed, no-answer, busy, failed, canceled, webhook_error) move the
1296
- * row from active to completed so the UI freezes the live duration timer.
1477
+ * Optional best-effort pre-call DNS / TLS / HTTP-keepalive warmup.
1478
+ * Default behaviour is a no-op. Failures must never abort the call.
1479
+ */
1480
+ warmup?(): Promise<void>;
1481
+ }
1482
+
1483
+ /**
1484
+ * Known stable ElevenLabs voice models (from the official ElevenLabs API
1485
+ * reference). Exposed as a typed `as const` object so callers can pass
1486
+ * `ElevenLabsModel.FLASH_V2_5` and get autocomplete / static checking; the
1487
+ * public `modelId` option also accepts an arbitrary `string` so users can
1488
+ * pass forward-compat IDs we haven't enumerated yet.
1489
+ *
1490
+ * - `V3` — newest, highest quality (slower TTFT than Flash).
1491
+ * - `FLASH_V2_5` — current default, fastest (~75 ms TTFT).
1492
+ * - `TURBO_V2_5` — balanced quality/speed.
1493
+ * - `MULTILINGUAL_V2` — best multilingual support.
1494
+ * - `MONOLINGUAL_V1` — legacy English-only.
1495
+ */
1496
+ declare const ElevenLabsModel: {
1497
+ readonly V3: "eleven_v3";
1498
+ readonly FLASH_V2_5: "eleven_flash_v2_5";
1499
+ readonly TURBO_V2_5: "eleven_turbo_v2_5";
1500
+ readonly MULTILINGUAL_V2: "eleven_multilingual_v2";
1501
+ readonly MONOLINGUAL_V1: "eleven_monolingual_v1";
1502
+ };
1503
+ /** Union of {@link ElevenLabsModel} string values. */
1504
+ type ElevenLabsModel = (typeof ElevenLabsModel)[keyof typeof ElevenLabsModel];
1505
+ declare const ElevenLabsOutputFormat: {
1506
+ readonly MP3_22050_32: "mp3_22050_32";
1507
+ readonly MP3_44100_32: "mp3_44100_32";
1508
+ readonly MP3_44100_64: "mp3_44100_64";
1509
+ readonly MP3_44100_96: "mp3_44100_96";
1510
+ readonly MP3_44100_128: "mp3_44100_128";
1511
+ readonly MP3_44100_192: "mp3_44100_192";
1512
+ readonly PCM_8000: "pcm_8000";
1513
+ readonly PCM_16000: "pcm_16000";
1514
+ readonly PCM_22050: "pcm_22050";
1515
+ readonly PCM_24000: "pcm_24000";
1516
+ readonly PCM_44100: "pcm_44100";
1517
+ readonly ULAW_8000: "ulaw_8000";
1518
+ };
1519
+ /** Union of {@link ElevenLabsOutputFormat} string values. */
1520
+ type ElevenLabsOutputFormat = (typeof ElevenLabsOutputFormat)[keyof typeof ElevenLabsOutputFormat];
1521
+ /** ElevenLabs voice tuning knobs forwarded as `voice_settings` in the request. */
1522
+ interface ElevenLabsVoiceSettings {
1523
+ stability?: number;
1524
+ similarity_boost?: number;
1525
+ style?: number;
1526
+ use_speaker_boost?: boolean;
1527
+ }
1528
+ /** Constructor options for {@link ElevenLabsTTS}. */
1529
+ interface ElevenLabsTTSOptions$1 {
1530
+ voiceId?: string;
1531
+ /**
1532
+ * ElevenLabs voice model ID. The default ``eleven_flash_v2_5`` has the
1533
+ * lowest TTFT (~75 ms). Pass ``eleven_v3`` for highest quality, or any
1534
+ * arbitrary string for forward-compat with future models.
1535
+ */
1536
+ modelId?: ElevenLabsModel | string;
1537
+ outputFormat?: ElevenLabsOutputFormat;
1538
+ voiceSettings?: ElevenLabsVoiceSettings;
1539
+ languageCode?: string;
1540
+ chunkSize?: number;
1541
+ }
1542
+ /**
1543
+ * ElevenLabs streaming TTS adapter.
1544
+ *
1545
+ * Supported `modelId` values are autocompleted via {@link ElevenLabsModel}.
1546
+ * Default is `eleven_flash_v2_5` (lowest TTFT, ~75 ms).
1547
+ *
1548
+ * **Telephony optimization** — the constructor default
1549
+ * `outputFormat='pcm_16000'` is correct for web playback, dashboard
1550
+ * previews, and 16 kHz pipelines. For real phone calls, use the
1551
+ * carrier-specific factories instead:
1552
+ *
1553
+ * - {@link ElevenLabsTTS.forTwilio} emits `ulaw_8000` natively. Twilio's
1554
+ * media-stream WebSocket expects μ-law @ 8 kHz, so the SDK normally
1555
+ * resamples 16 kHz → 8 kHz and PCM → μ-law before sending. Asking
1556
+ * ElevenLabs to produce μ-law directly skips that step (saves
1557
+ * ~30–80 ms first-byte plus per-frame CPU and avoids any resampling
1558
+ * aliasing).
1559
+ * - {@link ElevenLabsTTS.forTelnyx} emits `pcm_16000`. Telnyx negotiates
1560
+ * L16/16000 on its bidirectional media WebSocket, so 16 kHz PCM is
1561
+ * already the format used end-to-end and no transcoding happens.
1562
+ * ElevenLabs *also* supports `ulaw_8000` if your Telnyx profile is
1563
+ * pinned to PCMU/8000 — pass `outputFormat: 'ulaw_8000'` explicitly
1564
+ * in that case.
1565
+ */
1566
+ declare class ElevenLabsTTS {
1567
+ static readonly providerKey = "elevenlabs";
1568
+ private readonly apiKey;
1569
+ private readonly voiceId;
1570
+ private readonly modelId;
1571
+ private _outputFormat;
1572
+ private readonly _outputFormatExplicit;
1573
+ private readonly voiceSettings;
1574
+ private readonly languageCode;
1575
+ private readonly chunkSize;
1576
+ /**
1577
+ * Public view of the (possibly auto-flipped) wire format. Read by the
1578
+ * stream-handler to decide whether to skip the client-side resample +
1579
+ * mulaw encode when the bytes are already in the carrier's wire codec.
1580
+ */
1581
+ get outputFormat(): ElevenLabsOutputFormat;
1582
+ constructor(apiKey: string, voiceId?: string, modelId?: string, outputFormat?: ElevenLabsOutputFormat | string);
1583
+ constructor(apiKey: string, options: ElevenLabsTTSOptions$1);
1584
+ /**
1585
+ * Hook called by ``StreamHandler.initPipeline`` to advise the carrier
1586
+ * wire format. When the user did NOT pass an explicit ``outputFormat``,
1587
+ * auto-flip to the carrier's native codec so the audio bytes ElevenLabs
1588
+ * returns are already in Twilio/Telnyx wire format — eliminating the
1589
+ * client-side 16 kHz → 8 kHz resample and PCM → μ-law encode. The
1590
+ * resample/encode chain was a source of audible artifacts on the
1591
+ * prewarmed firstMessage (see 0.6.2 acceptance notes — burst delivery
1592
+ * of resampled audio crackled on the carrier-side jitter buffer).
1593
+ *
1594
+ * No-op when the caller passed an explicit ``outputFormat`` (incl. via
1595
+ * the ``forTwilio`` / ``forTelnyx`` factories) — user wins.
1596
+ *
1597
+ * Parity with {@link ElevenLabsWebSocketTTS.setTelephonyCarrier}.
1598
+ */
1599
+ setTelephonyCarrier(carrier: string): void;
1600
+ /**
1601
+ * Construct an instance pre-configured for Twilio Media Streams.
1602
+ *
1603
+ * Sets `outputFormat='ulaw_8000'` so ElevenLabs emits μ-law @ 8 kHz
1604
+ * directly — the exact wire format Twilio's media stream uses — letting
1605
+ * the SDK skip the 16 kHz→8 kHz resample and PCM→μ-law conversion in
1606
+ * `TwilioAudioSender`. Saves ~30–80 ms first-byte and per-frame CPU,
1607
+ * and removes a potential aliasing source.
1608
+ *
1609
+ * `voiceSettings` defaults to a low-bandwidth-friendly profile
1610
+ * (speaker boost off, modest stability) which sounds cleaner at 8 kHz
1611
+ * μ-law than the studio default. Pass an explicit object to override.
1612
+ */
1613
+ static forTwilio(apiKey: string, options?: Omit<ElevenLabsTTSOptions$1, 'outputFormat'>): ElevenLabsTTS;
1614
+ /**
1615
+ * Construct an instance pre-configured for Telnyx bidirectional media.
1616
+ *
1617
+ * Telnyx's default media-streaming codec is L16 PCM @ 16 kHz, which
1618
+ * matches our default Telnyx handler. We pick `pcm_16000` so the audio
1619
+ * flows end-to-end with zero resampling or transcoding.
1620
+ *
1621
+ * Trade-off: if your Telnyx profile is pinned to PCMU/8000 (μ-law),
1622
+ * construct `ElevenLabsTTS` directly with `outputFormat: 'ulaw_8000'`
1623
+ * — Telnyx supports that natively too.
1624
+ */
1625
+ static forTelnyx(apiKey: string, options?: Omit<ElevenLabsTTSOptions$1, 'outputFormat'>): ElevenLabsTTS;
1626
+ /**
1627
+ * Synthesise text to speech and return the full audio as a single Buffer.
1628
+ *
1629
+ * For large chunks (or when latency matters) call `synthesizeStream` instead.
1630
+ */
1631
+ synthesize(text: string): Promise<Buffer>;
1632
+ /**
1633
+ * Synthesise text and yield audio chunks as they arrive (streaming).
1634
+ *
1635
+ * The yielded buffers are raw PCM at 16 kHz (or whatever `outputFormat` is
1636
+ * configured to). `chunkSize` controls the maximum yield size — 512 is a
1637
+ * good choice for low-latency telephony.
1638
+ */
1639
+ synthesizeStream(text: string): AsyncGenerator<Buffer>;
1640
+ }
1641
+
1642
+ /**
1643
+ * WebSocket-based ElevenLabs TTS provider — opt-in low-latency variant.
1644
+ *
1645
+ * Targets the ElevenLabs streaming-input WebSocket endpoint
1646
+ * (`/v1/text-to-speech/{voice_id}/stream-input`) instead of the HTTP
1647
+ * `/stream` endpoint used by `ElevenLabsTTS`. Saves the HTTP request setup
1648
+ * time per utterance (~50 ms) and avoids the HTTP cold-start TLS handshake
1649
+ * when calls are bursty.
1650
+ *
1651
+ * API matches `ElevenLabsTTS` (`synthesizeStream(text)` returns an
1652
+ * `AsyncGenerator<Buffer>`) so it can be passed anywhere a TTSAdapter is
1653
+ * expected.
1654
+ *
1655
+ * Behaviour notes
1656
+ * - WebSocket is opened **per-utterance** (matches HTTP semantics). A
1657
+ * future revision may pool a WS across utterances of the same call
1658
+ * session — see roadmap Phase 5b.
1659
+ * - `auto_mode=true` is enabled by default. Pass `autoMode: false` to
1660
+ * send a custom `chunk_length_schedule`.
1661
+ * - `outputFormat` is exposed as a query parameter so `ulaw_8000` (Twilio
1662
+ * native) and `pcm_16000` (Telnyx native) work without resampling.
1663
+ * - `eleven_v3` is **not** supported — the WS endpoint rejects it.
1664
+ * - `optimize_streaming_latency` is officially deprecated and is not
1665
+ * exposed.
1666
+ */
1667
+
1668
+ /** Constructor options for {@link ElevenLabsWebSocketTTS}. */
1669
+ interface ElevenLabsWebSocketTTSOptions {
1670
+ apiKey: string;
1671
+ voiceId?: string;
1672
+ modelId?: ElevenLabsModel | string;
1673
+ outputFormat?: string;
1674
+ voiceSettings?: Record<string, unknown>;
1675
+ languageCode?: string;
1676
+ /** Let the server pick chunk timing. Default true. */
1677
+ autoMode?: boolean;
1678
+ /** WS keep-alive timeout in seconds (5–180). Default 60. */
1679
+ inactivityTimeout?: number;
1680
+ /**
1681
+ * Manual chunk schedule, only used when ``autoMode: false``. Each value
1682
+ * must be 5–500. ElevenLabs default is ``[120, 160, 250, 290]``.
1683
+ */
1684
+ chunkLengthSchedule?: number[];
1685
+ /** Outgoing audio re-chunk size in bytes. Default 4096. */
1686
+ chunkSize?: number;
1687
+ }
1688
+ /**
1689
+ * Parked WS handle returned by {@link ElevenLabsWebSocketTTS.openParkedConnection}.
1690
+ *
1691
+ * `bosSent` records whether the BOS frame (`{"text": " ", ...}`) has
1692
+ * already been written to the wire. The prewarm pipeline always sends
1693
+ * the BOS so the upstream worker is selected on the parked connection;
1694
+ * `synthesizeStream` adopts the WS and SKIPS its own BOS send to avoid
1695
+ * a protocol error.
1696
+ */
1697
+ interface ElevenLabsParkedWS {
1698
+ ws: WebSocket__default;
1699
+ bosSent: boolean;
1700
+ }
1701
+ /** WebSocket-based ElevenLabs TTS adapter — opt-in low-latency variant. */
1702
+ declare class ElevenLabsWebSocketTTS implements TTSAdapter {
1703
+ static readonly providerKey = "elevenlabs_ws";
1704
+ readonly apiKey: string;
1705
+ readonly voiceId: string;
1706
+ readonly modelId: string;
1707
+ readonly voiceSettings?: Record<string, unknown>;
1708
+ readonly languageCode?: string;
1709
+ readonly autoMode: boolean;
1710
+ readonly inactivityTimeout: number;
1711
+ readonly chunkLengthSchedule?: number[];
1712
+ readonly chunkSize: number;
1713
+ /**
1714
+ * Single-slot adoption queue. The prewarm pipeline parks one WS per
1715
+ * outbound call here; the next `synthesizeStream` call consumes it
1716
+ * (skipping `new WebSocket()` and the BOS send) instead of opening
1717
+ * a fresh socket. The slot is consumed exactly once: if several
1718
+ * `synthesizeStream` calls are made, only the first to run benefits.
1719
+ *
1720
+ * We keep this on the adapter (not in a parameter) so the existing
1721
+ * `for await (const chunk of agent.tts.synthesizeStream(...))` call
1722
+ * site in `StreamHandler` continues to work without signature
1723
+ * changes.
1724
+ */
1725
+ private adoptedConnection;
1726
+ /**
1727
+ * Active WS for the in-flight ``synthesizeStream`` call, if any. Set
1728
+ * when a stream starts, cleared in its ``finally`` block. The
1729
+ * stream-handler calls ``cancelActiveStream()`` from ``cancelSpeaking``
1730
+ * to unblock the generator's inner ``await Promise<frame>`` — without
1731
+ * it, a barge-in on the firstMessage live path leaves the for-await
1732
+ * stuck waiting for the next frame; ElevenLabs never sends
1733
+ * ``isFinal=true`` after the consumer breaks, the 30 s frame timeout
1734
+ * fires post-call, and meanwhile ``initPipeline`` never returns so
1735
+ * the STT ``onTranscript`` callback never registers and subsequent
1736
+ * user turns are silently dropped (root cause of the 2026-05-20
1737
+ * "first message OK, then no response" symptom).
1738
+ */
1739
+ private activeStreamWs;
1740
+ /**
1741
+ * The wire format requested over the ElevenLabs WS. Initially set from
1742
+ * the constructor; ``setTelephonyCarrier`` may auto-flip it to the
1743
+ * carrier's native codec when the caller did NOT pass ``outputFormat``
1744
+ * explicitly.
1745
+ */
1746
+ private _outputFormat;
1747
+ private readonly _outputFormatExplicit;
1748
+ /** Public read-only view of the (possibly auto-flipped) wire format. */
1749
+ get outputFormat(): string;
1750
+ constructor(opts: ElevenLabsWebSocketTTSOptions);
1751
+ /**
1752
+ * Hook called by ``StreamHandler`` to advise the carrier wire format.
1753
+ *
1754
+ * When the user did NOT pass an explicit ``outputFormat`` in the
1755
+ * constructor options, this flips the format to the carrier's native
1756
+ * wire codec — saving a client-side transcode step. Calling with an
1757
+ * unknown carrier (``""`` / ``"custom"``) is a no-op.
1758
+ *
1759
+ * When ``outputFormat`` was explicitly passed (incl. via the
1760
+ * ``forTwilio`` / ``forTelnyx`` factories), this method is a no-op —
1761
+ * the user's choice always wins.
1762
+ */
1763
+ setTelephonyCarrier(carrier: string): void;
1764
+ /**
1765
+ * Force-close the WebSocket of any in-flight ``synthesizeStream`` call.
1766
+ * Called by the stream-handler from ``cancelSpeaking`` (barge-in) so
1767
+ * the generator's inner ``await Promise<frame>`` loop unblocks cleanly
1768
+ * via the ``onClose`` handler — instead of waiting up to 30 s for the
1769
+ * ``FRAME_TIMEOUT_MS`` watchdog to fire. No-op when no stream is in
1770
+ * flight or when the WS is already closing.
1771
+ *
1772
+ * Without this, a barge-in during the firstMessage live path left the
1773
+ * for-await stuck (ElevenLabs never sends ``isFinal=true`` after the
1774
+ * consumer breaks), ``initPipeline`` never returned, the STT
1775
+ * ``onTranscript`` callback never registered, and the entire remainder
1776
+ * of the call was silent for the user. Surfaced during the 2026-05-20
1777
+ * acceptance run.
1778
+ */
1779
+ cancelActiveStream(): void;
1780
+ /** Pre-configured for Twilio Media Streams (`ulaw_8000`). */
1781
+ static forTwilio(opts: Omit<ElevenLabsWebSocketTTSOptions, 'outputFormat'>): ElevenLabsWebSocketTTS;
1782
+ /** Pre-configured for Telnyx (`pcm_16000`). */
1783
+ static forTelnyx(opts: Omit<ElevenLabsWebSocketTTSOptions, 'outputFormat'>): ElevenLabsWebSocketTTS;
1784
+ private buildUrl;
1785
+ /**
1786
+ * Build the protocol-required BOS frame sent on every fresh WS.
1787
+ *
1788
+ * The single-space `{"text": " "}` keep-alive establishes the session
1789
+ * without committing any synthesis (no `flush: true`, no real text).
1790
+ * Production `synthesizeStream()` and `warmup()` share this exact
1791
+ * construction so the upstream worker chooses the same per-session
1792
+ * config in both cases — otherwise the warm session is on a different
1793
+ * worker than the live request, which defeats the warmup goal.
1794
+ */
1795
+ private buildBosFrame;
1796
+ /**
1797
+ * Single-shot synthesis: open WS, send text, yield bytes, close.
1798
+ *
1799
+ * Resilience contract:
1800
+ * - Connection bounded by ``CONNECT_TIMEOUT_MS`` (5s, was 15s).
1801
+ * - Each idle wait bounded by ``FRAME_TIMEOUT_MS`` (30s) so a stalled
1802
+ * server cannot keep the generator alive indefinitely.
1803
+ * - Permanent error handler attached BEFORE the open await — prevents
1804
+ * ``uncaughtException`` if an error fires after the once-listener
1805
+ * resolves.
1806
+ * - All event listeners removed in ``finally`` (no closure leak past
1807
+ * socket close).
1808
+ * - Server-reported ``error`` raises ``ElevenLabsTTSError``.
1809
+ * - Per-frame audio payload capped at ``MAX_AUDIO_B64_BYTES``.
1810
+ * - Best-effort EOS ``{"text":""}`` sent in finally (not immediately
1811
+ * after flush — auto_mode could otherwise truncate the tail audio).
1812
+ */
1813
+ synthesizeStream(text: string): AsyncGenerator<Buffer>;
1814
+ /**
1815
+ * Pre-call WebSocket warmup for the ElevenLabs `/stream-input` endpoint.
1816
+ *
1817
+ * Opens the WS (DNS + TLS + auth handshake), sends the EXACT same BOS
1818
+ * frame the production `synthesizeStream()` path sends — including
1819
+ * `voice_settings` and (when configured) `generation_config` — so
1820
+ * ElevenLabs instantiates the same per-session worker for both
1821
+ * warmup and the live request. If the BOS frames differ, the server
1822
+ * may route warmup and the real call to two different workers, and
1823
+ * the warmed worker is wasted. Idles ~250 ms, then closes. By the
1824
+ * time the first `synthesizeStream()` call lands during the call,
1825
+ * the connection pool has the upstream warm — net wire time saving
1826
+ * of 200-500 ms.
1827
+ *
1828
+ * Billing safety: ElevenLabs bills on synthesised characters
1829
+ * delivered via `audio` frames (per https://elevenlabs.io/pricing).
1830
+ * The keepalive (single-space `text`, no `flush: true`, no real
1831
+ * transcript) is documented as the session-establishment frame and
1832
+ * does NOT generate synthesis. Closing without sending the actual
1833
+ * transcript does not consume billable characters. Best-effort:
1834
+ * failures logged at debug level.
1835
+ */
1836
+ warmup(): Promise<void>;
1837
+ /**
1838
+ * Open a fresh WS, send the EXACT BOS frame the live `synthesizeStream`
1839
+ * sends, and return the OPEN socket without closing it. Used by the
1840
+ * prewarm pipeline to park a TTS connection during the carrier ringing
1841
+ * window so the next `synthesizeStream` call can adopt it via
1842
+ * {@link adoptWebSocket} and skip ~400-900 ms of TLS + BOS round-trip.
1843
+ *
1844
+ * Returns a parked-handle the caller stashes; the next
1845
+ * `synthesizeStream` will detect the adoption queue and skip its own
1846
+ * `new WebSocket()` + BOS send.
1847
+ *
1848
+ * Billing safety: BOS is the documented session-establishment frame
1849
+ * (single space `text`, no `flush: true`) and does not generate
1850
+ * synthesis. ElevenLabs bills on `audio` frames received from the
1851
+ * server, not on BOS bytes sent by the client.
1852
+ */
1853
+ openParkedConnection(): Promise<ElevenLabsParkedWS>;
1854
+ /**
1855
+ * Stash a parked WS handle so the next `synthesizeStream` call adopts
1856
+ * it instead of opening a fresh socket. Caller is responsible for
1857
+ * holding the handle alive until either the live request consumes it
1858
+ * or the call ends (in which case `discardAdoptedConnection()`
1859
+ * cleans it up).
1860
+ */
1861
+ adoptWebSocket(parked: ElevenLabsParkedWS): void;
1862
+ /**
1863
+ * Drop and close any pending parked WS without consuming it. Used on
1864
+ * call-failure paths so a never-started call does not leak a TTS WS
1865
+ * that ElevenLabs will close after its inactivity timeout anyway.
1866
+ */
1867
+ discardAdoptedConnection(): void;
1868
+ /** No-op — connections are per-utterance and torn down inside synthesizeStream. */
1869
+ close(): Promise<void>;
1870
+ }
1871
+
1872
+ /**
1873
+ * In-memory metrics store for the local dashboard.
1874
+ *
1875
+ * Keeps the last `maxCalls` completed calls and tracks active calls.
1876
+ * Supports SSE event subscribers for real-time updates.
1877
+ *
1878
+ * Optional disk hydration: when `CallLogger` writes per-call records under
1879
+ * `<root>/calls/YYYY/MM/DD/<call_id>/metadata.json`, calling
1880
+ * `hydrate(logRoot)` on a fresh store rebuilds the in-memory list from those
1881
+ * files so the dashboard survives process restarts (the persistence is in
1882
+ * the JSONL/JSON files, the store is just a cache on top).
1883
+ */
1884
+
1885
+ /** Snapshot of a call as held by the dashboard store. */
1886
+ interface CallRecord {
1887
+ call_id: string;
1888
+ caller: string;
1889
+ callee: string;
1890
+ direction: string;
1891
+ started_at: number;
1892
+ ended_at?: number;
1893
+ /**
1894
+ * Current lifecycle state: ``initiated`` (pre-registered), ``ringing``,
1895
+ * ``in-progress``, ``completed``, ``no-answer``, ``busy``, ``failed``,
1896
+ * ``canceled``, or ``webhook_error``.
1897
+ */
1898
+ status?: string;
1899
+ transcript?: Array<{
1900
+ role: string;
1901
+ text: string;
1902
+ timestamp: number;
1903
+ }>;
1904
+ turns?: unknown[];
1905
+ metrics?: Record<string, unknown> | null;
1906
+ [key: string]: unknown;
1907
+ }
1908
+ /** Server-Sent-Event payload broadcast by `MetricsStore` for live UI updates. */
1909
+ interface SSEEvent {
1910
+ type: string;
1911
+ data: Record<string, unknown>;
1912
+ }
1913
+ /** In-memory bounded ring buffer of recent calls plus active-call tracking. */
1914
+ declare class MetricsStore extends EventEmitter {
1915
+ private readonly maxCalls;
1916
+ private calls;
1917
+ private activeCalls;
1918
+ /**
1919
+ * User-driven soft delete: call_ids the operator removed from the
1920
+ * dashboard view. The on-disk artefacts written by ``CallLogger``
1921
+ * (``metadata.json``, ``transcript.jsonl``) are intentionally NOT
1922
+ * touched — they serve as the durable backup. All read paths
1923
+ * (``getCalls`` / ``getCall`` / ``getAggregates`` / ``getCallsInRange``
1924
+ * / ``hydrate``) filter against this set so the call is invisible
1925
+ * to the UI and excluded from rolling metrics. Populated from
1926
+ * ``<logRoot>/.deleted_call_ids.json`` on hydrate so deletions
1927
+ * survive a process restart. Parity with Python.
1928
+ */
1929
+ private deletedCallIds;
1930
+ private deletedIdsPath;
1931
+ /**
1932
+ * Accepts either a numeric ``maxCalls`` (legacy positional — matches the
1933
+ * original TS API) or an options object ``{ maxCalls }`` to align with the
1934
+ * Python SDK's keyword-argument style. Plain literals also work:
1935
+ * ``new MetricsStore()`` / ``new MetricsStore(100)`` / ``new MetricsStore({ maxCalls: 100 })``.
1936
+ */
1937
+ constructor(maxCallsOrOpts?: number | {
1938
+ maxCalls?: number;
1939
+ });
1940
+ private publish;
1941
+ /** Mark a call as in-progress (creates the row if it does not yet exist). */
1942
+ recordCallStart(data: Record<string, unknown>): void;
1943
+ /**
1944
+ * Pre-register an outbound call before any webhook fires. Lets the
1945
+ * dashboard surface attempts that never reach media (no-answer, busy,
1946
+ * carrier-rejected). Mirrors the Python ``record_call_initiated``.
1947
+ */
1948
+ recordCallInitiated(data: Record<string, unknown>): void;
1949
+ /**
1950
+ * Update the status of an active or completed call. Terminal states
1951
+ * (completed, no-answer, busy, failed, canceled, webhook_error) move the
1952
+ * row from active to completed so the UI freezes the live duration timer.
1953
+ */
1954
+ updateCallStatus(callId: string, status: string, extra?: Record<string, unknown>): void;
1955
+ /** Append a single conversation turn to an active call and broadcast it via SSE. */
1956
+ recordTurn(data: Record<string, unknown>): void;
1957
+ /** Move a call from active to completed and persist its final metrics. */
1958
+ recordCallEnd(data: Record<string, unknown>, metrics?: Record<string, unknown> | null): void;
1959
+ /**
1960
+ * Return a window of completed calls in newest-first order.
1961
+ *
1962
+ * Soft-deleted call_ids (see ``deleteCalls``) are filtered out so the
1963
+ * dashboard never re-shows a row the user removed. The on-disk
1964
+ * artefacts are intentionally preserved as a backup.
1965
+ */
1966
+ getCalls(limit?: number, offset?: number): CallRecord[];
1967
+ /**
1968
+ * Look up a completed call by id (newest match wins).
1969
+ *
1970
+ * Soft-deleted call_ids resolve to ``null`` so the SPA's detail pane
1971
+ * cannot render a row the user removed.
1972
+ */
1973
+ getCall(callId: string): CallRecord | null;
1974
+ /**
1975
+ * Soft-delete one or more calls from the dashboard view.
1976
+ *
1977
+ * Adds each ``call_id`` to an in-memory set. Subsequent reads via
1978
+ * ``getCalls`` / ``getCall`` / ``getAggregates`` / ``getCallsInRange``
1979
+ * exclude the deleted ids, so rolling metrics (avg latency, total
1980
+ * spend) are recomputed without them. The on-disk
1981
+ * ``metadata.json`` / ``transcript.jsonl`` files written by
1982
+ * ``CallLogger`` are NOT touched — they serve as a durable backup
1983
+ * the operator can audit outside the dashboard.
1984
+ *
1985
+ * Active calls are never deletable. A call_id that is currently
1986
+ * in ``activeCalls`` is silently skipped so a mid-call delete
1987
+ * from the UI cannot orphan the live transcript pane.
1988
+ *
1989
+ * Persisted to ``<logRoot>/.deleted_call_ids.json`` (best-effort)
1990
+ * when ``hydrate()`` has been called with a log root. Parity with
1991
+ * Python ``delete_calls``.
1992
+ *
1993
+ * @returns The list of call_ids actually accepted as deleted.
1994
+ */
1995
+ deleteCalls(callIds: readonly string[]): string[];
1996
+ /** Whether ``callId`` was soft-deleted from the dashboard. */
1997
+ isDeleted(callId: string): boolean;
1998
+ /** Snapshot of soft-deleted call_ids (sorted). */
1999
+ getDeletedCallIds(): string[];
2000
+ /** Atomically persist the deleted-ids set to disk. Best-effort. */
2001
+ private persistDeletedIds;
2002
+ /** Look up an active call by id (returns undefined if not active or unknown). */
2003
+ getActive(callId: string): CallRecord | undefined;
2004
+ /** Return all currently active (not yet ended) calls. */
2005
+ getActiveCalls(): CallRecord[];
2006
+ /**
2007
+ * Compute summary statistics across the buffered call history.
2008
+ *
2009
+ * Soft-deleted calls are excluded so rolling metrics (avg latency,
2010
+ * total spend) match exactly what the operator sees in the call list.
2011
+ */
2012
+ getAggregates(): Record<string, unknown>;
2013
+ /**
2014
+ * Return calls whose `started_at` falls within `[fromTs, toTs]` (Unix
2015
+ * seconds). Soft-deleted calls are filtered out.
1297
2016
  */
1298
- updateCallStatus(callId: string, status: string, extra?: Record<string, unknown>): void;
1299
- /** Append a single conversation turn to an active call and broadcast it via SSE. */
1300
- recordTurn(data: Record<string, unknown>): void;
1301
- /** Move a call from active to completed and persist its final metrics. */
1302
- recordCallEnd(data: Record<string, unknown>, metrics?: Record<string, unknown> | null): void;
1303
- /** Return a window of completed calls in newest-first order. */
1304
- getCalls(limit?: number, offset?: number): CallRecord[];
1305
- /** Look up a completed call by id (newest match wins). */
1306
- getCall(callId: string): CallRecord | null;
1307
- /** Look up an active call by id (returns undefined if not active or unknown). */
1308
- getActive(callId: string): CallRecord | undefined;
1309
- /** Return all currently active (not yet ended) calls. */
1310
- getActiveCalls(): CallRecord[];
1311
- /** Compute summary statistics across the buffered call history. */
1312
- getAggregates(): Record<string, unknown>;
1313
- /** Return calls whose `started_at` falls within `[fromTs, toTs]` (Unix seconds). */
1314
2017
  getCallsInRange(fromTs?: number, toTs?: number): CallRecord[];
1315
- /** Number of completed calls currently in the ring buffer. */
2018
+ /** Number of completed (non-deleted) calls currently in the ring buffer. */
1316
2019
  get callCount(): number;
1317
2020
  /**
1318
2021
  * Rebuild the in-memory call list from `metadata.json` files written by
@@ -1455,6 +2158,19 @@ declare class SpeechEvents {
1455
2158
  private dispatch;
1456
2159
  }
1457
2160
 
2161
+ /** Parked provider WebSockets ready for adoption by a per-call StreamHandler. */
2162
+ interface ParkedProviderConnections {
2163
+ /** Pre-opened STT WS (Cartesia today; other adapters may add support later). */
2164
+ stt?: WebSocket.WebSocket;
2165
+ /**
2166
+ * Pre-opened TTS WS handle (ElevenLabs WS today). The `bosSent` flag
2167
+ * lets the live `synthesizeStream` skip its own BOS send when the
2168
+ * prewarm pipeline already wrote it.
2169
+ */
2170
+ tts?: ElevenLabsParkedWS;
2171
+ /** Pre-opened OpenAI Realtime WS (already through `session.updated`). */
2172
+ openaiRealtime?: WebSocket.WebSocket;
2173
+ }
1458
2174
  /** Top-level SDK entry point — wraps a carrier + embedded server + agent loop. */
1459
2175
  declare class Patter {
1460
2176
  private localConfig;
@@ -1476,6 +2192,65 @@ declare class Patter {
1476
2192
  * ``Cannot use both tunnel: true and webhookUrl``.
1477
2193
  */
1478
2194
  private tunnelOwnsWebhookUrl;
2195
+ /**
2196
+ * Pre-rendered first-message TTS audio per outbound call_id. Populated
2197
+ * by {@link call} when ``agent.prewarmFirstMessage`` is true; consumed
2198
+ * by the StreamHandler firstMessage emit so the greeting streams
2199
+ * instantly on ``start`` instead of paying the 200-700 ms TTS first-byte
2200
+ * latency. See ``AgentOptions.prewarmFirstMessage``.
2201
+ *
2202
+ * Stores raw bytes in the TTS provider's native sample rate; the
2203
+ * carrier-side audio sender resamples on emit.
2204
+ */
2205
+ private prewarmAudio;
2206
+ /**
2207
+ * Call IDs whose prewarm cache slot has already been consumed —
2208
+ * either by ``popPrewarmAudio`` (cache hit OR miss on the firstMessage
2209
+ * emit path) or by ``recordPrewarmWaste`` (call ended before pickup).
2210
+ * The prewarm task checks this set BEFORE writing bytes so a slow
2211
+ * synth that finishes after the consumer already polled doesn't
2212
+ * orphan bytes in ``prewarmAudio``. See FIX #92 in the parity audit.
2213
+ */
2214
+ private prewarmConsumed;
2215
+ /**
2216
+ * Background tasks tracked so {@link disconnect} can wait on / drop any
2217
+ * still-running prewarm-first-message synth before tearing down.
2218
+ */
2219
+ private prewarmTasks;
2220
+ /**
2221
+ * TTL eviction timers keyed by call_id so {@link disconnect} (and
2222
+ * normal consumption / waste-record paths) can cancel any pending
2223
+ * timer when the slot drains naturally. Without this, the timer
2224
+ * would WARN spuriously after the cache was already emptied.
2225
+ */
2226
+ private prewarmTtlTimers;
2227
+ /**
2228
+ * Pre-opened, fully-handshaked provider WebSockets keyed by
2229
+ * carrier-issued call_id. Populated by ``parkProviderConnections``
2230
+ * during the carrier ringing window; consumed by the per-call
2231
+ * StreamHandler at ``start`` via ``adoptWebSocket(...)`` so STT / TTS
2232
+ * / Realtime audio can flow on the first turn without paying the
2233
+ * 150-900 ms TLS + WS-upgrade + protocol-handshake round-trip again.
2234
+ *
2235
+ * Distinct from ``prewarmAudio`` (which holds pre-rendered TTS bytes
2236
+ * for the first message); the two features are complementary and
2237
+ * orthogonal — both can be active for the same call.
2238
+ *
2239
+ * Each slot may hold up to three parked connections (STT, TTS,
2240
+ * Realtime). Drained by:
2241
+ * - {@link popPrewarmedConnections} on the carrier ``start`` event
2242
+ * (consumed normally — the handles transfer to the StreamHandler)
2243
+ * - {@link recordPrewarmWaste} on call-termination paths (no-answer,
2244
+ * busy, failed, canceled, AMD voicemail). Closes parked sockets.
2245
+ * - {@link disconnect} on Patter teardown. Closes all parked sockets.
2246
+ */
2247
+ private prewarmedConnections;
2248
+ /**
2249
+ * TTL eviction handles keyed by call_id for connections that are never
2250
+ * adopted (e.g. a carrier that swallows ``start``). Closes the parked
2251
+ * sockets so they don't leak past the safety window.
2252
+ */
2253
+ private prewarmedConnTimers;
1479
2254
  /**
1480
2255
  * Speech-edge events for turn-taking instrumentation. Public surface: the
1481
2256
  * seven `on*` proxy accessors below plus the `conversationState` snapshot.
@@ -1483,7 +2258,7 @@ declare class Patter {
1483
2258
  * the previous behaviour.
1484
2259
  *
1485
2260
  * See `src/_speech-events.ts` for the full event taxonomy and the
1486
- * industry-alignment table (LiveKit / Pipecat / OpenAI Realtime).
2261
+ * OpenAI Realtime alignment table.
1487
2262
  */
1488
2263
  readonly speechEvents: SpeechEvents;
1489
2264
  get onUserSpeechStarted(): SpeechEventCallback | null;
@@ -1502,8 +2277,8 @@ declare class Patter {
1502
2277
  set onAudioOut(cb: SpeechEventCallback | null);
1503
2278
  /**
1504
2279
  * Snapshot of the current per-side state of the call.
1505
- * Mirrors LiveKit's `user_state_changed` / `agent_state_changed`
1506
- * payloads. Read-only and safe to call at any time.
2280
+ * Returns the user_state / agent_state payload shape — read-only and
2281
+ * safe to call at any time.
1507
2282
  */
1508
2283
  get conversationState(): ConversationStateSnapshot;
1509
2284
  /**
@@ -1553,12 +2328,115 @@ declare class Patter {
1553
2328
  private _serveImpl;
1554
2329
  /** Run the agent in interactive terminal-test mode (no real telephony). */
1555
2330
  test(opts: ServeOptions): Promise<void>;
2331
+ /**
2332
+ * Pop and return the pre-synthesised first-message audio for ``callId``.
2333
+ *
2334
+ * Returns ``undefined`` when ``agent.prewarmFirstMessage`` was not set
2335
+ * for the originating outbound call, or when the synth was still in
2336
+ * flight at the moment the carrier emitted ``start`` (cache miss — the
2337
+ * StreamHandler falls back to live TTS).
2338
+ *
2339
+ * Called by the per-call StreamHandler at the start of the firstMessage
2340
+ * emit. Returning bytes here lets the handler skip the live TTS
2341
+ * synthesis and stream the cached buffer directly.
2342
+ *
2343
+ * Marks ``callId`` as consumed regardless of cache hit/miss so a slow
2344
+ * synth task that finishes after this call drops its bytes instead of
2345
+ * orphaning them in ``prewarmAudio``. See FIX #92.
2346
+ */
2347
+ popPrewarmAudio: (callId: string) => Buffer | undefined;
2348
+ /**
2349
+ * Log a warning if a prewarmed greeting was paid for but never used.
2350
+ * The TTS bill for ``agent.firstMessage`` has already been incurred by
2351
+ * the background synth task, so the user should know — opt-in feature
2352
+ * with a known cost surface.
2353
+ *
2354
+ * Idempotent: the second call for the same ``callId`` is a no-op, so
2355
+ * the status callback firing first and ``endCall`` running afterwards
2356
+ * (or vice-versa) does not double-WARN. Public so the embedded
2357
+ * server's webhook handlers can invoke it on no-answer / busy /
2358
+ * failed / canceled / AMD-machine paths. See FIX #91.
2359
+ */
2360
+ recordPrewarmWaste: (callId: string) => void;
2361
+ /**
2362
+ * Pop and return the parked provider WebSockets for ``callId``, or
2363
+ * ``undefined`` when no parked connections exist.
2364
+ *
2365
+ * Wired into ``EmbeddedServer.popPrewarmedConnections`` so the
2366
+ * per-call ``StreamHandler`` can adopt the parked sockets at the
2367
+ * carrier ``start`` event instead of opening fresh ones — saving
2368
+ * ~150-900 ms of cold-start handshake on the first turn.
2369
+ */
2370
+ popPrewarmedConnections: (callId: string) => ParkedProviderConnections | undefined;
2371
+ /**
2372
+ * Close any parked provider WebSockets for ``callId``. Wired into
2373
+ * ``EmbeddedServer.closePrewarmedConnections`` so call-termination
2374
+ * paths (no-answer, busy, failed, canceled, AMD voicemail) drop the
2375
+ * sockets cleanly instead of leaving them to the upstream timeout.
2376
+ */
2377
+ closePrewarmedConnections: (callId: string) => void;
2378
+ /**
2379
+ * Open and park provider WebSockets in parallel with the carrier-side
2380
+ * ``initiateCall``. Unlike {@link spawnProviderWarmup} (which closes
2381
+ * the WS after a brief idle), the sockets opened here stay OPEN and
2382
+ * are handed off to the per-call ``StreamHandler`` on ``start``.
2383
+ *
2384
+ * This is the structural fix for first-turn cold-start: on Node's
2385
+ * ``ws`` package, opening + closing a WS does NOT warm TLS for the
2386
+ * next open — every fresh ``new WebSocket()`` re-pays the full
2387
+ * TCP + TLS + HTTP-101 round-trip. By keeping the WS open and
2388
+ * adopting it directly, the live first turn skips the handshake
2389
+ * entirely (saves ~150-900 ms depending on provider).
2390
+ *
2391
+ * Best-effort: each provider's parking task is wrapped in
2392
+ * ``Promise.allSettled`` so a slow or failing endpoint cannot block
2393
+ * the others. Providers without ``openParkedConnection`` contribute
2394
+ * nothing — the call falls through to the cold ``connect()`` path
2395
+ * for that provider.
2396
+ */
2397
+ private parkProviderConnections;
2398
+ /**
2399
+ * Spawn a fire-and-forget task that warms up STT / TTS / LLM in
2400
+ * parallel with the carrier-side ``initiateCall``.
2401
+ *
2402
+ * Best-effort: each provider's optional ``warmup()`` is wrapped in
2403
+ * ``Promise.allSettled`` so a slow or failing endpoint cannot block
2404
+ * the others. Providers without ``warmup`` contribute nothing.
2405
+ */
2406
+ private spawnProviderWarmup;
2407
+ /**
2408
+ * Pre-render ``agent.firstMessage`` to TTS bytes during the ringing
2409
+ * window and stash them in ``prewarmAudio.set(callId, buf)``.
2410
+ *
2411
+ * Skipped silently when ``agent.prewarmFirstMessage`` is false or
2412
+ * when ``agent.tts`` / ``agent.firstMessage`` is missing. The synth
2413
+ * is bounded by ``ringTimeout`` (default 25 s) so a never-answered
2414
+ * call doesn't tie up the TTS connection. On timeout / error the
2415
+ * cache is left empty and the StreamHandler falls back to live TTS.
2416
+ *
2417
+ * **Pipeline mode only.** Realtime / ConvAI provider modes never
2418
+ * consume the prewarm cache (the StreamHandler for those modes runs
2419
+ * its first-message emit through the provider's own audio path).
2420
+ * Spawning the prewarm in those modes pays the TTS bill for nothing
2421
+ * — refused with a warn.
2422
+ *
2423
+ * **Capped at ``PREWARM_CACHE_MAX`` concurrent entries.** Refused
2424
+ * with a warn when the cap is reached (the call still proceeds —
2425
+ * StreamHandler falls back to live TTS).
2426
+ */
2427
+ private spawnPrewarmFirstMessage;
1556
2428
  /** Place an outbound call via the configured carrier. */
1557
2429
  call(options: LocalCallOptions): Promise<void>;
1558
2430
  /**
1559
2431
  * Stop the embedded server and any running tunnel. Safe to call multiple
1560
2432
  * times. Leaves the instance reusable: a subsequent ``serve()`` works as
1561
2433
  * if the previous lifecycle never happened.
2434
+ *
2435
+ * Also clears any pending TTL eviction timers, awaits in-flight
2436
+ * prewarm-first-message synth tasks (best-effort, with a 1 s safety
2437
+ * timeout), and clears the prewarm cache. Without this a still-running
2438
+ * TTS WS keeps the user billed long after SDK teardown, and stale
2439
+ * entries leak across ``serve`` / ``disconnect`` cycles. See FIX #93.
1562
2440
  */
1563
2441
  disconnect(): Promise<void>;
1564
2442
  /**
@@ -1957,6 +2835,27 @@ declare function geminiLive(opts: {
1957
2835
  voice?: string;
1958
2836
  }): RealtimeConfig;
1959
2837
 
2838
+ /**
2839
+ * Default provider pricing and merge utilities.
2840
+ *
2841
+ * Pricing reflects public provider rates as of 2026. Each provider entry
2842
+ * carries provider-level defaults (the model Patter ships with by default)
2843
+ * plus an optional ``models`` map keyed by model identifier with per-model
2844
+ * overrides. Cost-calc functions take an optional ``model`` arg and
2845
+ * auto-resolve the rate via {@link resolveProviderRates} (longest-prefix
2846
+ * match for versioned model IDs). When the agent's adapter exposes
2847
+ * ``model`` and the metrics layer threads it through, the dashboard bills
2848
+ * with model accuracy out of the box — no manual override needed.
2849
+ *
2850
+ * User overrides via ``new Patter({ pricing: {...} })`` keep working as
2851
+ * before. To register a new model rate without touching the SDK source:
2852
+ *
2853
+ * new Patter({ pricing: { elevenlabs: { models: { my_custom: { price: 0.075 } } } } })
2854
+ */
2855
+ /** Pricing table version identifier, updated in lockstep with the Python SDK. */
2856
+ declare const PRICING_VERSION = "2026.3";
2857
+ /** ISO date the pricing table was last refreshed against public provider rates. */
2858
+ declare const PRICING_LAST_UPDATED = "2026-05-08";
1960
2859
  /**
1961
2860
  * Billing units used by ``DEFAULT_PRICING`` entries. String values keep the
1962
2861
  * pricing table JSON-serialisable and backwards-compatible with consumers
@@ -2075,7 +2974,22 @@ declare function calculateTelephonyCost(provider: string, durationSeconds: numbe
2075
2974
 
2076
2975
  /** Per-turn latency breakdown across the STT/LLM/TTS pipeline. */
2077
2976
  interface LatencyBreakdown {
2977
+ /**
2978
+ * STT finalization time: end-of-speech (VAD stop or STT speech_final) →
2979
+ * final transcript delivery. This is the engineering metric — pure STT
2980
+ * processing latency, independent of how long the user spoke. Industry
2981
+ * benchmarks (Picovoice, Deepgram, Gladia, Speechmatics) all report this
2982
+ * number as "STT latency". Falls back to turn_start when the endpoint
2983
+ * signal is unavailable (degraded provider, batch STT, etc.).
2984
+ */
2078
2985
  stt_ms: number;
2986
+ /**
2987
+ * Duration of the user's utterance (turn_start → end-of-speech). Useful
2988
+ * to distinguish "user spoke for 4s" from "STT took 4s to finalize" —
2989
+ * they used to be conflated in stt_ms before 0.6.1. Optional — undefined
2990
+ * when the endpoint signal is unavailable.
2991
+ */
2992
+ user_speech_duration_ms?: number;
2079
2993
  /**
2080
2994
  * Backwards-compatible LLM bucket. With the split below, this now reflects
2081
2995
  * the user-perceived first-token latency (TTFT) when streaming is available
@@ -2164,6 +3078,12 @@ interface CallMetrics {
2164
3078
  tts_provider: string;
2165
3079
  llm_provider: string;
2166
3080
  telephony_provider: string;
3081
+ /** Model identifiers per provider (e.g. "ink-whisper", "eleven_flash_v2_5",
3082
+ * "gpt-oss-120b"). Surfaced on the dashboard cost breakdown so operators
3083
+ * can attribute per-call spend to a specific model. */
3084
+ stt_model?: string;
3085
+ tts_model?: string;
3086
+ llm_model?: string;
2167
3087
  }
2168
3088
  /** Programmatic control surface for a live call (transfer, hangup, DTMF). */
2169
3089
  interface CallControl {
@@ -2226,6 +3146,21 @@ declare class CallMetricsAccumulator {
2226
3146
  private _bargeinStoppedAt;
2227
3147
  private _turnUserText;
2228
3148
  private _turnSttAudioSeconds;
3149
+ /**
3150
+ * Guard against the recordTurnInterrupted / recordTurnComplete race.
3151
+ *
3152
+ * A VAD-path barge-in fires ``recordTurnInterrupted`` synchronously
3153
+ * inside ``handleAudioAsync`` while the in-flight pipeline LLM stream
3154
+ * keeps unwinding on its own task. When the LLM stream eventually
3155
+ * exits, the existing pipeline path falls through to
3156
+ * ``recordTurnComplete``, which would push a second turn for the same
3157
+ * logical exchange (this time carrying ``user_text=''`` because the
3158
+ * field was already reset). ``_turnAlreadyClosed`` is flipped by
3159
+ * ``recordTurnInterrupted`` and read by ``recordTurnComplete`` so the
3160
+ * late ``recordTurnComplete`` becomes a no-op until the next
3161
+ * ``startTurn`` re-arms the accumulator.
3162
+ */
3163
+ private _turnAlreadyClosed;
2229
3164
  private _totalSttAudioSeconds;
2230
3165
  private _totalTtsCharacters;
2231
3166
  private _totalRealtimeCost;
@@ -2236,6 +3171,7 @@ declare class CallMetricsAccumulator {
2236
3171
  private _actualTelephonyCost;
2237
3172
  private _actualSttCost;
2238
3173
  private _totalLlmCost;
3174
+ private _llmModel;
2239
3175
  private _eventBus;
2240
3176
  /** Timestamp (hrTimeMs) when VAD emitted speech_end. */
2241
3177
  private _vadStoppedAt;
@@ -2250,6 +3186,21 @@ declare class CallMetricsAccumulator {
2250
3186
  private _overlapStartedAt;
2251
3187
  private _reportOnlyInitialTtfb;
2252
3188
  private _initialTtfbEmitted;
3189
+ /**
3190
+ * Last barge-in detection timestamp (hrTimeMs). Used by
3191
+ * ``_computeTurnLatency`` to gate endpoint_ms / stt_ms emission on turns
3192
+ * that started immediately after a barge-in — those turns have unreliable
3193
+ * VAD/STT anchors and would otherwise pollute the p95 distribution with
3194
+ * synthetic 6+ second spikes.
3195
+ */
3196
+ private _lastBargeinAt;
3197
+ /**
3198
+ * Count of turns where ``recordSttComplete`` fired but no legitimate VAD
3199
+ * ``speech_end`` had stamped ``_endpointSignalAt``. Exposed via metrics so
3200
+ * we can spot environments where PSTN packet loss is dropping VAD stops
3201
+ * (the common cause of missing endpoint signals).
3202
+ */
3203
+ private _endpointSignalMissingCount;
2253
3204
  constructor(opts: {
2254
3205
  callId: string;
2255
3206
  providerMode: string;
@@ -2285,6 +3236,31 @@ declare class CallMetricsAccumulator {
2285
3236
  * on the first audio byte rather than just before recordSttComplete().
2286
3237
  */
2287
3238
  startTurnIfIdle(): void;
3239
+ /**
3240
+ * Anchor the current turn at a legitimate VAD ``speech_start`` event.
3241
+ *
3242
+ * Industry-standard pattern: every VAD ``speech_start`` that fires while the agent
3243
+ * is NOT in the suppressed warmup window re-anchors the turn timer to
3244
+ * the wall-clock moment the user actually started speaking. Re-anchors:
3245
+ *
3246
+ * * ``_turnStart`` — fixes the case where a phantom ``speech_start``
3247
+ * during agent TTS or a partial transcript from the previous user
3248
+ * attempt already stamped the field. Without this, the legitimate
3249
+ * user-speech ``speech_start`` no-op'd and ``user_speech_duration_ms``
3250
+ * inflated from ~1 s to 5-7 s (the original "I waited 7 seconds"
3251
+ * dashboard symptom).
3252
+ * * ``_endpointSignalAt``, ``_vadStoppedAt``, ``_sttFinalAt`` — any
3253
+ * stale anchor from a rejected barge-in / dropped final transcript
3254
+ * on the same uncommitted turn is cleared, so the next
3255
+ * ``recordVadStop`` / ``recordSttFinalTimestamp`` stamps fresh.
3256
+ * * ``_sttComplete``, ``_llmFirstToken``, ``_initialTtfbEmitted`` — same
3257
+ * rationale for the downstream pipeline timestamps.
3258
+ *
3259
+ * No-op once the turn is committed (``_turnCommittedMono`` set): a
3260
+ * VAD ``speech_start`` after commit belongs to the NEXT turn's
3261
+ * barge-in path, handled by ``recordTurnInterrupted`` instead.
3262
+ */
3263
+ anchorUserSpeechStart(): void;
2288
3264
  /** Stamp end-of-STT, capture the user's transcript, and accrue billed STT seconds. */
2289
3265
  recordSttComplete(text: string, audioSeconds?: number): void;
2290
3266
  /** Record the timestamp of the first LLM token (TTFT). No-op after first call. */
@@ -2320,9 +3296,26 @@ declare class CallMetricsAccumulator {
2320
3296
  * to compute ``bargein_ms``.
2321
3297
  */
2322
3298
  recordTtsStopped(ts?: number): void;
2323
- /** Close the current turn cleanly and append a `TurnMetrics` record. */
2324
- recordTurnComplete(agentText: string): TurnMetrics;
2325
- /** Close the current turn as interrupted (barge-in) and return the recorded metrics. */
3299
+ /**
3300
+ * Close the current turn cleanly and append a `TurnMetrics` record.
3301
+ *
3302
+ * Returns ``null`` when ``recordTurnInterrupted`` has already closed
3303
+ * the current turn — this protects against the VAD-barge-in /
3304
+ * pipeline-LLM race where both paths try to finalise the same logical
3305
+ * turn and the second would otherwise push a phantom entry with
3306
+ * ``user_text=''``. The caller treats ``null`` as "nothing to emit";
3307
+ * ``emitTurnMetrics`` is already null-safe.
3308
+ */
3309
+ recordTurnComplete(agentText: string): TurnMetrics | null;
3310
+ /**
3311
+ * Close the current turn as interrupted (barge-in) and return the
3312
+ * recorded metrics. Returns ``null`` when no turn is open, OR when
3313
+ * ``recordTurnComplete`` has already finalised the current turn —
3314
+ * bidirectional parity with the guard at the top of
3315
+ * ``recordTurnComplete``. Prevents an out-of-order interruption (e.g.
3316
+ * a future refactor that reorders the bargein + LLM-unwind paths)
3317
+ * from overwriting a turn that the complete path already emitted.
3318
+ */
2326
3319
  recordTurnInterrupted(): TurnMetrics | null;
2327
3320
  /**
2328
3321
  * Record the moment VAD emitted speech_end for the current utterance.
@@ -2419,6 +3412,13 @@ declare class CallMetricsAccumulator {
2419
3412
  endCall(): CallMetrics;
2420
3413
  /** Return the cost breakdown for the call so far without ending it. */
2421
3414
  getCostSoFar(): CostBreakdown;
3415
+ /**
3416
+ * Number of turns where recordSttComplete fired without a prior legitimate
3417
+ * VAD speech_end. Surfaced for diagnostics — a non-zero value points at
3418
+ * dropped VAD stops (commonly PSTN packet loss), which is why we stopped
3419
+ * faking _endpointSignalAt from _sttComplete in 0.6.x.
3420
+ */
3421
+ get endpointSignalMissingCount(): number;
2422
3422
  private _resetTurnState;
2423
3423
  private _computeTurnLatency;
2424
3424
  private _computeCost;
@@ -2442,6 +3442,7 @@ declare class CallMetricsAccumulator {
2442
3442
  * {@link OpenAIRealtimeAdapter}. Audio negotiation defaults to
2443
3443
  * `g711_ulaw` so traffic flows through Twilio/Telnyx without transcoding.
2444
3444
  */
3445
+
2445
3446
  /**
2446
3447
  * Supported OpenAI Realtime wire audio formats. See
2447
3448
  * https://platform.openai.com/docs/guides/realtime for the full list.
@@ -2456,6 +3457,67 @@ declare const OpenAIRealtimeAudioFormat: {
2456
3457
  };
2457
3458
  /** Union of {@link OpenAIRealtimeAudioFormat} string values. */
2458
3459
  type OpenAIRealtimeAudioFormat = (typeof OpenAIRealtimeAudioFormat)[keyof typeof OpenAIRealtimeAudioFormat];
3460
+ /**
3461
+ * Known OpenAI Realtime API model identifiers.
3462
+ *
3463
+ * `GPT_REALTIME_2` is OpenAI's most-capable realtime voice model
3464
+ * (speech-to-speech with configurable reasoning effort, stronger
3465
+ * instruction following, 128K context). It accepts the same session
3466
+ * update wire format as the v1 `gpt-realtime` family but supports an
3467
+ * additional `reasoning.effort` field — see `reasoningEffort` on
3468
+ * {@link OpenAIRealtimeOptions}. Pricing differs from the mini default;
3469
+ * override `DEFAULT_PRICING.openai_realtime` with the values in
3470
+ * `DEFAULT_PRICING.openai_realtime_2` when selecting it.
3471
+ */
3472
+ declare const OpenAIRealtimeModel: {
3473
+ readonly GPT_REALTIME: "gpt-realtime";
3474
+ readonly GPT_REALTIME_2: "gpt-realtime-2";
3475
+ readonly GPT_REALTIME_MINI: "gpt-realtime-mini";
3476
+ readonly GPT_4O_REALTIME_PREVIEW: "gpt-4o-realtime-preview";
3477
+ readonly GPT_4O_MINI_REALTIME_PREVIEW: "gpt-4o-mini-realtime-preview";
3478
+ };
3479
+ /** Union of {@link OpenAIRealtimeModel} string values. */
3480
+ type OpenAIRealtimeModel = (typeof OpenAIRealtimeModel)[keyof typeof OpenAIRealtimeModel];
3481
+ /** OpenAI Realtime / TTS voice identifiers. */
3482
+ declare const OpenAIVoice: {
3483
+ readonly ALLOY: "alloy";
3484
+ readonly ASH: "ash";
3485
+ readonly BALLAD: "ballad";
3486
+ readonly CORAL: "coral";
3487
+ readonly ECHO: "echo";
3488
+ readonly FABLE: "fable";
3489
+ readonly NOVA: "nova";
3490
+ readonly ONYX: "onyx";
3491
+ readonly SAGE: "sage";
3492
+ readonly SHIMMER: "shimmer";
3493
+ readonly VERSE: "verse";
3494
+ };
3495
+ /** Union of {@link OpenAIVoice} string values. */
3496
+ type OpenAIVoice = (typeof OpenAIVoice)[keyof typeof OpenAIVoice];
3497
+ /**
3498
+ * Models accepted by `input_audio_transcription` on Realtime sessions.
3499
+ *
3500
+ * `GPT_REALTIME_WHISPER` is OpenAI's streaming-optimised Whisper variant
3501
+ * designed for low-latency transcript deltas inside a Realtime session.
3502
+ * Billed per minute of audio (separate from the conversational model
3503
+ * tokens). Use it when you want faster partial transcripts than
3504
+ * `whisper-1` at lower cost than `gpt-4o-transcribe`.
3505
+ */
3506
+ declare const OpenAITranscriptionModel: {
3507
+ readonly WHISPER_1: "whisper-1";
3508
+ readonly GPT_4O_TRANSCRIBE: "gpt-4o-transcribe";
3509
+ readonly GPT_4O_MINI_TRANSCRIBE: "gpt-4o-mini-transcribe";
3510
+ readonly GPT_REALTIME_WHISPER: "gpt-realtime-whisper";
3511
+ };
3512
+ /** Union of {@link OpenAITranscriptionModel} string values. */
3513
+ type OpenAITranscriptionModel = (typeof OpenAITranscriptionModel)[keyof typeof OpenAITranscriptionModel];
3514
+ /** Server-side voice-activity-detection modes. */
3515
+ declare const OpenAIRealtimeVADType: {
3516
+ readonly SERVER_VAD: "server_vad";
3517
+ readonly SEMANTIC_VAD: "semantic_vad";
3518
+ };
3519
+ /** Union of {@link OpenAIRealtimeVADType} string values. */
3520
+ type OpenAIRealtimeVADType = (typeof OpenAIRealtimeVADType)[keyof typeof OpenAIRealtimeVADType];
2459
3521
  /** Callback signature for events emitted by {@link OpenAIRealtimeAdapter}. */
2460
3522
  type RealtimeEventCallback = (type: string, data: unknown) => void | Promise<void>;
2461
3523
  /** Constructor options for {@link OpenAIRealtimeAdapter}. */
@@ -2483,28 +3545,96 @@ interface OpenAIRealtimeOptions {
2483
3545
  }
2484
3546
  /** Realtime WebSocket adapter for OpenAI's `gpt-realtime` family. */
2485
3547
  declare class OpenAIRealtimeAdapter {
2486
- private readonly apiKey;
2487
- private readonly model;
2488
- private readonly voice;
2489
- private readonly instructions;
2490
- private readonly tools?;
2491
- private readonly audioFormat;
2492
- private ws;
3548
+ protected readonly apiKey: string;
3549
+ protected readonly model: string;
3550
+ protected readonly voice: string;
3551
+ protected readonly instructions: string;
3552
+ protected readonly tools?: Array<{
3553
+ name: string;
3554
+ description: string;
3555
+ parameters: Record<string, unknown>;
3556
+ strict?: boolean;
3557
+ }> | undefined;
3558
+ protected readonly audioFormat: OpenAIRealtimeAudioFormat;
3559
+ protected ws: WebSocket__default | null;
2493
3560
  private readonly eventCallbacks;
2494
3561
  private messageListenerAttached;
2495
3562
  private heartbeat;
2496
3563
  private currentResponseItemId;
2497
3564
  private currentResponseAudioMs;
2498
3565
  private currentResponseFirstAudioAt;
2499
- private readonly options;
3566
+ protected readonly options: OpenAIRealtimeOptions;
2500
3567
  constructor(apiKey: string, model?: string, voice?: string, instructions?: string, tools?: Array<{
2501
3568
  name: string;
2502
3569
  description: string;
2503
3570
  parameters: Record<string, unknown>;
2504
3571
  strict?: boolean;
2505
3572
  }> | undefined, audioFormat?: OpenAIRealtimeAudioFormat, options?: OpenAIRealtimeOptions);
3573
+ /**
3574
+ * Build the production session.update body. Mirrors the body sent
3575
+ * inside `connect()` so warmup can apply identical configuration to
3576
+ * the upstream session and prime it without billing.
3577
+ */
3578
+ private buildSessionConfig;
3579
+ /**
3580
+ * Pre-call WebSocket warmup for the OpenAI Realtime endpoint.
3581
+ *
3582
+ * The canonical session-only warm step on the Realtime API: open the
3583
+ * WS, wait for `session.created`, send a single `session.update`
3584
+ * containing the same fields that the production `connect()` path
3585
+ * applies (`input_audio_format`, `output_audio_format`, `voice`,
3586
+ * `instructions`, `turn_detection`, `input_audio_transcription`,
3587
+ * plus any opt-in fields populated on the adapter), wait for the
3588
+ * matching `session.updated` ack, then close cleanly. This primes
3589
+ * the per-session state on the OpenAI side — DNS + TLS + auth
3590
+ * handshake + initial config exchange — without ever invoking the
3591
+ * model.
3592
+ *
3593
+ * Earlier revisions sent `response.create` with
3594
+ * `{"response": {"generate": false}}` to prime the inference path.
3595
+ * That field is NOT in the OpenAI Realtime API schema; the server
3596
+ * either ignores it (and bills tokens for a real model response) or
3597
+ * rejects the request with `invalid_request_error`. Both behaviours
3598
+ * are billing-unsafe or a no-op beyond TLS warm. The
3599
+ * `session.update` flow is documented and side-effect-free.
3600
+ *
3601
+ * Billing safety: `session.update` only mutates session
3602
+ * configuration. It does NOT invoke the model, does NOT consume any
3603
+ * audio buffer, and does NOT trigger token generation, so no
3604
+ * per-token cost is accrued. Best-effort: failures are logged at
3605
+ * debug level and never raised.
3606
+ */
3607
+ warmup(): Promise<void>;
2506
3608
  /** Open the Realtime WebSocket and apply the session configuration. */
2507
3609
  connect(): Promise<void>;
3610
+ /**
3611
+ * Adopt a pre-opened, already-`session.updated` Realtime WebSocket
3612
+ * produced by the prewarm pipeline (see `Patter.parkProviderConnections`).
3613
+ * Skips the fresh `new WebSocket()` + `session.created` /
3614
+ * `session.update` round-trip — saves ~250-450 ms on first turn.
3615
+ *
3616
+ * Caller MUST verify `ws.readyState === OPEN` before calling and MUST
3617
+ * have already received `session.updated` on the parked socket. If
3618
+ * the parked WS died between park and adopt, fall back to `connect()`.
3619
+ */
3620
+ adoptWebSocket(ws: WebSocket__default): void;
3621
+ protected armHeartbeatAndListener(): void;
3622
+ /**
3623
+ * Open a fresh Realtime WS, exchange `session.created` /
3624
+ * `session.update` / `session.updated` (so the upstream session is
3625
+ * fully primed), and return the OPEN socket WITHOUT arming the
3626
+ * heartbeat / message listener. Used by the prewarm pipeline to park
3627
+ * a Realtime connection during ringing; the live consumer adopts it
3628
+ * via {@link adoptWebSocket}.
3629
+ *
3630
+ * Bounded by 8 s. Throws on timeout / handshake failure — callers
3631
+ * (the prewarm pipeline) treat any error as a cache miss and the
3632
+ * call falls through to the cold `connect()` path.
3633
+ *
3634
+ * Billing safety: `session.update` does not invoke the model. No
3635
+ * tokens are billed.
3636
+ */
3637
+ openParkedConnection(): Promise<WebSocket__default>;
2508
3638
  /** Append a base64-encoded audio chunk to the realtime input buffer. */
2509
3639
  sendAudio(mulawAudio: Buffer): void;
2510
3640
  /**
@@ -2518,7 +3648,7 @@ declare class OpenAIRealtimeAdapter {
2518
3648
  onEvent(callback: RealtimeEventCallback): void;
2519
3649
  /** Remove a previously registered {@link onEvent} callback. */
2520
3650
  offEvent(callback: RealtimeEventCallback): void;
2521
- private ensureMessageListener;
3651
+ protected ensureMessageListener(): void;
2522
3652
  /** Truncate the in-flight assistant turn and cancel the active response.
2523
3653
  *
2524
3654
  * ``audio_end_ms`` MUST reflect what the caller actually heard, not what
@@ -2535,6 +3665,17 @@ declare class OpenAIRealtimeAdapter {
2535
3665
  cancelResponse(): void;
2536
3666
  /** Inject a user text turn and request a new response. */
2537
3667
  sendText(text: string): Promise<void>;
3668
+ /**
3669
+ * Trigger `response.create` with no new user item.
3670
+ *
3671
+ * Used by the Realtime stream-handler to drive a response after the
3672
+ * client-side hallucination filter accepts an
3673
+ * `input_audio_transcription.completed` event. The server VAD config
3674
+ * sets `create_response: false` so OpenAI no longer auto-creates a
3675
+ * response on every `input_audio_buffer.committed`; Patter is now
3676
+ * responsible for triggering it explicitly when a real user turn lands.
3677
+ */
3678
+ requestResponse(): Promise<void>;
2538
3679
  /**
2539
3680
  * Make the AI speak ``text`` as its opening line.
2540
3681
  *
@@ -2684,11 +3825,6 @@ declare function isRemoteUrl(onMessage: unknown): onMessage is string;
2684
3825
  /** Check if a URL is a WebSocket URL. */
2685
3826
  declare function isWebSocketUrl(url: string): boolean;
2686
3827
 
2687
- /**
2688
- * Embedded HTTP/WebSocket server — wires Express webhooks for the configured
2689
- * carrier (Twilio or Telnyx) into the per-call `StreamHandler` and dashboard.
2690
- */
2691
-
2692
3828
  /** Resolved configuration consumed by `EmbeddedServer` (carrier credentials, webhook URL, etc.). */
2693
3829
  interface LocalConfig {
2694
3830
  twilioSid?: string;
@@ -3322,6 +4458,8 @@ interface SonioxSTTOptions$1 {
3322
4458
  }
3323
4459
  /** Streaming STT adapter for Soniox's real-time WebSocket API. */
3324
4460
  declare class SonioxSTT {
4461
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
4462
+ static readonly providerKey = "soniox";
3325
4463
  private ws;
3326
4464
  private callbacks;
3327
4465
  private final;
@@ -3430,6 +4568,8 @@ interface AssemblyAISTTOptions$1 {
3430
4568
  declare class AssemblyAISTT {
3431
4569
  private readonly apiKey;
3432
4570
  private readonly options;
4571
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
4572
+ static readonly providerKey = "assemblyai";
3433
4573
  private ws;
3434
4574
  private readonly callbacks;
3435
4575
  private closing;
@@ -3460,6 +4600,22 @@ declare class AssemblyAISTT {
3460
4600
  static forTwilio(apiKey: string, model?: AssemblyAIModel): AssemblyAISTT;
3461
4601
  private buildUrl;
3462
4602
  private buildHeaders;
4603
+ /**
4604
+ * Pre-call WebSocket warmup for the AssemblyAI v3 `/v3/ws` endpoint.
4605
+ *
4606
+ * Opens the WS (DNS + TLS + auth handshake), idles ~250 ms so the
4607
+ * AssemblyAI edge keeps the session state warm, then sends Terminate
4608
+ * and closes. By the time `connect()` is invoked at call-pickup the
4609
+ * resolver and TLS session are hot — net wire time saving of
4610
+ * 200-500 ms.
4611
+ *
4612
+ * Billing safety: AssemblyAI Universal Streaming bills on streamed
4613
+ * audio seconds (per https://www.assemblyai.com/pricing). Opening +
4614
+ * closing the WebSocket without forwarding any audio frames does
4615
+ * not consume billable seconds. Best-effort: failures logged at
4616
+ * debug level.
4617
+ */
4618
+ warmup(): Promise<void>;
3463
4619
  /** Open the streaming WebSocket and arm message handlers. */
3464
4620
  connect(): Promise<void>;
3465
4621
  private awaitOpen;
@@ -3500,6 +4656,7 @@ declare class AssemblyAISTT {
3500
4656
  * Implements a `DeepgramSTT`-shaped provider using Cartesia's streaming
3501
4657
  * WebSocket API. Pure `ws` transport — does NOT depend on the vendor SDK.
3502
4658
  */
4659
+
3503
4660
  /** Patter-normalised transcript event emitted by {@link CartesiaSTT}. */
3504
4661
  interface Transcript$4 {
3505
4662
  readonly text: string;
@@ -3546,6 +4703,8 @@ interface CartesiaSTTOptions$1 {
3546
4703
  declare class CartesiaSTT {
3547
4704
  private readonly apiKey;
3548
4705
  private readonly options;
4706
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
4707
+ static readonly providerKey = "cartesia_stt";
3549
4708
  private ws;
3550
4709
  private callbacks;
3551
4710
  private keepaliveTimer;
@@ -3555,13 +4714,65 @@ declare class CartesiaSTT {
3555
4714
  */
3556
4715
  requestId: string | null;
3557
4716
  constructor(apiKey: string, options?: CartesiaSTTOptions$1);
4717
+ /**
4718
+ * Open a fresh WebSocket without arming any message / keepalive handlers
4719
+ * and without taking ownership on `this.ws`. Returns the OPEN socket so
4720
+ * the caller (the prewarm pipeline) can park it for later adoption via
4721
+ * `adoptWebSocket`. Bounded by `CONNECT_TIMEOUT_MS`.
4722
+ *
4723
+ * Billing safety: opening + parking the WS does not stream audio
4724
+ * (Cartesia STT bills on streamed audio seconds), so no charge is
4725
+ * incurred. Close the returned WS yourself if it is never adopted.
4726
+ */
4727
+ openParkedConnection(): Promise<WebSocket__default>;
3558
4728
  private buildWsUrl;
4729
+ /**
4730
+ * Pre-call WebSocket warmup for the Cartesia STT `/stt/websocket` endpoint.
4731
+ *
4732
+ * Opens the WS (DNS + TLS + auth handshake), idles ~250 ms so the
4733
+ * Cartesia edge keeps session state warm, then closes. By the time
4734
+ * `connect()` is invoked at call-pickup the resolver and TLS session
4735
+ * are hot — net wire time saving of 200-500 ms.
4736
+ *
4737
+ * Billing safety: Cartesia STT bills on streamed audio seconds (per
4738
+ * https://docs.cartesia.ai/2025-04-16/api-reference/stt/stt). Opening
4739
+ * + closing the WebSocket without forwarding audio does not consume
4740
+ * billable seconds. Best-effort: failures logged at debug level.
4741
+ */
4742
+ warmup(): Promise<void>;
3559
4743
  /** Open the streaming WebSocket and arm message + keepalive handlers. */
3560
4744
  connect(): Promise<void>;
4745
+ /**
4746
+ * Adopt a pre-opened, already-OPEN WebSocket produced by the prewarm
4747
+ * pipeline (see `Patter.parkProviderConnections`). Skips the fresh
4748
+ * `new WebSocket()` + handshake — the WS is already through DNS, TLS
4749
+ * and HTTP-101 so audio frames can flow on this turn instead of
4750
+ * paying ~150-400 ms of handshake.
4751
+ *
4752
+ * Caller MUST verify `ws.readyState === OPEN` before calling. If the
4753
+ * parked WS died between park and adopt, fall back to `connect()`.
4754
+ */
4755
+ adoptWebSocket(ws: WebSocket__default): void;
4756
+ private armMessageAndKeepalive;
3561
4757
  private handleEvent;
3562
4758
  private emit;
3563
4759
  /** Send a binary PCM16-LE audio chunk to Cartesia for transcription. */
3564
4760
  sendAudio(audio: Buffer): void;
4761
+ /**
4762
+ * Force Cartesia to finalise the in-flight utterance immediately.
4763
+ *
4764
+ * Sends a ``finalize`` text frame on the live WebSocket. Cartesia
4765
+ * replies with the final transcript followed by ``flush_done``,
4766
+ * bypassing its conservative internal silence heuristic (which can
4767
+ * wait 2-7 s on PSTN audio before naturally finalising). Wired
4768
+ * into ``StreamHandler`` on the VAD ``speech_end`` event so the
4769
+ * SDK's authoritative end-of-speech detection forces an immediate
4770
+ * STT finalisation — turning Cartesia's natural-pause endpointing
4771
+ * into a deterministic VAD-driven one, parity with the Deepgram
4772
+ * fast-path. No-op when the WS isn't open. Parity with Python
4773
+ * ``CartesiaSTT.finalize``.
4774
+ */
4775
+ finalize(): Promise<void>;
3565
4776
  /** Register a transcript listener. */
3566
4777
  onTranscript(callback: TranscriptCallback$4): void;
3567
4778
  /** Remove a previously registered transcript callback. */
@@ -3624,6 +4835,8 @@ interface LMNTTTSOptions$1 {
3624
4835
  }
3625
4836
  /** LMNT TTS adapter backed by the `/v1/ai/speech/bytes` HTTP streaming endpoint. */
3626
4837
  declare class LMNTTTS {
4838
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
4839
+ static readonly providerKey = "lmnt";
3627
4840
  private readonly apiKey;
3628
4841
  private readonly model;
3629
4842
  private readonly voice;
@@ -3678,6 +4891,18 @@ interface Transcript$3 {
3678
4891
  }
3679
4892
  type TranscriptCallback$3 = (transcript: Transcript$3) => void;
3680
4893
  type ErrorCallback$1 = (error: Error) => void;
4894
+ /** Known Deepgram STT models. */
4895
+ declare const DeepgramModel: {
4896
+ readonly NOVA_3: "nova-3";
4897
+ readonly NOVA_2: "nova-2";
4898
+ readonly NOVA_2_PHONECALL: "nova-2-phonecall";
4899
+ readonly NOVA_2_GENERAL: "nova-2-general";
4900
+ readonly NOVA_2_MEETING: "nova-2-meeting";
4901
+ readonly NOVA: "nova";
4902
+ readonly ENHANCED: "enhanced";
4903
+ readonly BASE: "base";
4904
+ };
4905
+ type DeepgramModel = (typeof DeepgramModel)[keyof typeof DeepgramModel];
3681
4906
  /**
3682
4907
  * Optional tuning knobs for Deepgram live transcription.
3683
4908
  *
@@ -3698,92 +4923,306 @@ interface DeepgramSTTOptions$1 {
3698
4923
  */
3699
4924
  readonly endpointingMs?: number;
3700
4925
  /**
3701
- * End-of-utterance silence window in milliseconds. Deepgram enforces a
3702
- * hard minimum of 1000 ms. Set to ``null`` to disable. Default ``1000``.
4926
+ * End-of-utterance silence window in milliseconds. Deepgram enforces a
4927
+ * hard minimum of 1000 ms. Set to ``null`` to disable. Default ``1000``.
4928
+ */
4929
+ readonly utteranceEndMs?: number | null;
4930
+ /**
4931
+ * Enable smart formatting (punctuation + numerals). Default ``false`` —
4932
+ * smart formatting adds roughly 50–150 ms to TTFT on each final transcript
4933
+ * and is rarely useful for telephony pipelines that pass the text straight
4934
+ * to an LLM. Set to ``true`` for use cases (dashboards, raw transcripts)
4935
+ * where the formatted text is surfaced directly to humans.
4936
+ */
4937
+ readonly smartFormat?: boolean;
4938
+ /** Emit interim (non-final) transcripts. Default ``true``. */
4939
+ readonly interimResults?: boolean;
4940
+ /** Emit VAD events (``SpeechStarted`` / ``UtteranceEnd``). Default ``true``. */
4941
+ readonly vadEvents?: boolean;
4942
+ }
4943
+ /** Streaming STT adapter for Deepgram's `/v1/listen` WebSocket API. */
4944
+ declare class DeepgramSTT {
4945
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
4946
+ static readonly providerKey = "deepgram";
4947
+ private ws;
4948
+ private readonly transcriptCallbacks;
4949
+ private readonly errorCallbacks;
4950
+ private keepaliveTimer;
4951
+ private running;
4952
+ private reconnectAttempted;
4953
+ /** Request ID from Deepgram — used to query actual cost post-call. */
4954
+ requestId: string;
4955
+ private readonly apiKey;
4956
+ private readonly language;
4957
+ private readonly model;
4958
+ private readonly encoding;
4959
+ private readonly sampleRate;
4960
+ private readonly endpointingMs;
4961
+ private readonly utteranceEndMs;
4962
+ private readonly smartFormat;
4963
+ private readonly interimResults;
4964
+ private readonly vadEvents;
4965
+ /**
4966
+ * New ergonomic constructor accepting an options object (mirrors Python kwargs).
4967
+ *
4968
+ * Also accepts the legacy positional form
4969
+ * ``(apiKey, language?, model?, encoding?, sampleRate?)`` for backward
4970
+ * compatibility with code that predated BUG #13.
4971
+ */
4972
+ constructor(apiKey: string, language?: string, model?: string, encoding?: string, sampleRate?: number, options?: DeepgramSTTOptions$1);
4973
+ constructor(apiKey: string, options: DeepgramSTTOptions$1 & {
4974
+ language?: string;
4975
+ });
4976
+ /** Factory for Twilio calls — mulaw 8 kHz. Forwards tuning options through. */
4977
+ static forTwilio(apiKey: string, language?: string, model?: string, options?: DeepgramSTTOptions$1): DeepgramSTT;
4978
+ private buildUrl;
4979
+ /**
4980
+ * Pre-call WebSocket warmup for the Deepgram `/v1/listen` endpoint.
4981
+ *
4982
+ * Opens the WS (full DNS + TLS + auth handshake), idles ~250 ms so the
4983
+ * provider edge keeps the session warm in its routing table, then
4984
+ * closes cleanly. By the time `connect()` is invoked at call-pickup
4985
+ * the DNS resolver is hot, the TCP+TLS session is in the connection
4986
+ * pool, and recent WS auth is still warm at Deepgram's edge — net
4987
+ * wire time saving of 200-500 ms vs a cold WS open.
4988
+ *
4989
+ * Billing safety: Deepgram bills on streamed audio seconds (per
4990
+ * https://deepgram.com/pricing). Opening + closing the WebSocket
4991
+ * without sending any audio frames does not consume billable seconds.
4992
+ * Best-effort: any failure is logged at debug level and never raised.
4993
+ */
4994
+ warmup(): Promise<void>;
4995
+ /** Open the streaming WebSocket and arm message + keepalive handlers. */
4996
+ connect(): Promise<void>;
4997
+ private openSocket;
4998
+ private clearKeepalive;
4999
+ private handleMessage;
5000
+ private emitTranscript;
5001
+ private emitError;
5002
+ private handleError;
5003
+ private handleClose;
5004
+ /** Send a binary audio chunk to Deepgram for transcription. */
5005
+ sendAudio(audio: Buffer): void;
5006
+ private audioSentCount;
5007
+ private audioDroppedCount;
5008
+ /** Register a transcript listener. */
5009
+ onTranscript(callback: TranscriptCallback$3): void;
5010
+ /** Remove a previously registered transcript listener. */
5011
+ offTranscript(callback: TranscriptCallback$3): void;
5012
+ /** Register an error listener for socket / API failures. */
5013
+ onError(callback: ErrorCallback$1): void;
5014
+ /** Remove a previously registered error listener. */
5015
+ offError(callback: ErrorCallback$1): void;
5016
+ /**
5017
+ * Force Deepgram to immediately emit a final ``Results`` frame for the
5018
+ * in-flight utterance, rather than waiting for its own endpoint
5019
+ * heuristic (utterance_end_ms ~1 s + natural-pause endpointing).
5020
+ * Called by the SDK on VAD ``speech_end`` and after barge-in cancel —
5021
+ * both moments where the SDK already knows the user has stopped
5022
+ * speaking and waiting for Deepgram's own endpointing only adds
5023
+ * dead air.
5024
+ *
5025
+ * Idempotent: safe to call when the socket is closed/closing.
5026
+ */
5027
+ finalize(): void;
5028
+ /** Send Finalize, briefly drain trailing transcripts, then close the socket. */
5029
+ close(): void;
5030
+ }
5031
+
5032
+ /**
5033
+ * Cartesia TTS provider — HTTP `/tts/bytes` endpoint.
5034
+ *
5035
+ * Cartesia also offers a WebSocket streaming mode with word timestamps;
5036
+ * this provider focuses on the chunked-bytes HTTP API which maps cleanly
5037
+ * onto Patter's `synthesize(text)` contract and keeps the provider
5038
+ * dependency-free (just `fetch`).
5039
+ *
5040
+ * Default model is `sonic-3` (GA snapshot `sonic-3-2026-01-12`) — Cartesia's
5041
+ * current GA model with a documented ~90 ms TTFB target. Voice IDs from the
5042
+ * sonic-2 generation (including the default Katie voice) remain compatible.
5043
+ *
5044
+ * **Telephony optimization** — the constructor default
5045
+ * `sampleRate=16000` is correct for web playback, dashboard previews, and
5046
+ * 16 kHz pipelines. For real phone calls, use the carrier-specific
5047
+ * factories instead:
5048
+ *
5049
+ * - {@link CartesiaTTS.forTwilio} requests `sampleRate=8000` natively from
5050
+ * Cartesia. Twilio's media-stream WebSocket expects μ-law @ 8 kHz, so
5051
+ * the SDK normally resamples 16 kHz → 8 kHz before doing the PCM →
5052
+ * μ-law transcode in `TwilioAudioSender`. Asking Cartesia for 8 kHz
5053
+ * PCM at the source skips the resample step (saves ~10–30 ms first-
5054
+ * byte plus per-frame CPU and removes a potential aliasing source).
5055
+ * The PCM → μ-law transcode still happens client-side.
5056
+ * - {@link CartesiaTTS.forTelnyx} requests `sampleRate=16000`. Telnyx
5057
+ * negotiates L16/16000 on its bidirectional media WebSocket, so
5058
+ * 16 kHz PCM is already the format used end-to-end and no
5059
+ * transcoding happens. This is the same as the bare-constructor
5060
+ * default and exists for API symmetry with the Twilio factory.
5061
+ */
5062
+ /** Known Cartesia TTS models. */
5063
+ declare const CartesiaTTSModel: {
5064
+ readonly SONIC_3: "sonic-3";
5065
+ readonly SONIC_2: "sonic-2";
5066
+ readonly SONIC: "sonic";
5067
+ };
5068
+ type CartesiaTTSModel = (typeof CartesiaTTSModel)[keyof typeof CartesiaTTSModel];
5069
+ /** Common PCM sample rates accepted by the Cartesia bytes endpoint. */
5070
+ declare const CartesiaTTSSampleRate: {
5071
+ readonly HZ_8000: 8000;
5072
+ readonly HZ_16000: 16000;
5073
+ readonly HZ_22050: 22050;
5074
+ readonly HZ_24000: 24000;
5075
+ readonly HZ_44100: 44100;
5076
+ };
5077
+ type CartesiaTTSSampleRate = (typeof CartesiaTTSSampleRate)[keyof typeof CartesiaTTSSampleRate];
5078
+ /** Voice-selection mode passed in the Cartesia bytes payload. */
5079
+ declare const CartesiaTTSVoiceMode: {
5080
+ readonly ID: "id";
5081
+ readonly EMBEDDING: "embedding";
5082
+ };
5083
+ type CartesiaTTSVoiceMode = (typeof CartesiaTTSVoiceMode)[keyof typeof CartesiaTTSVoiceMode];
5084
+ /** Constructor options for {@link CartesiaTTS}. */
5085
+ interface CartesiaTTSOptions$1 {
5086
+ model?: CartesiaTTSModel | string;
5087
+ voice?: string;
5088
+ language?: string;
5089
+ sampleRate?: CartesiaTTSSampleRate | number;
5090
+ speed?: string | number;
5091
+ emotion?: string | string[];
5092
+ volume?: number;
5093
+ baseUrl?: string;
5094
+ apiVersion?: string;
5095
+ }
5096
+ /** Cartesia TTS provider backed by the HTTP `/tts/bytes` streaming endpoint. */
5097
+ declare class CartesiaTTS {
5098
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
5099
+ static readonly providerKey = "cartesia_tts";
5100
+ private readonly apiKey;
5101
+ private readonly model;
5102
+ private readonly voice;
5103
+ private readonly language;
5104
+ private readonly sampleRate;
5105
+ private readonly speed?;
5106
+ private readonly emotion?;
5107
+ private readonly volume?;
5108
+ private readonly baseUrl;
5109
+ private readonly apiVersion;
5110
+ constructor(apiKey: string, opts?: CartesiaTTSOptions$1);
5111
+ /**
5112
+ * Construct an instance pre-configured for Twilio Media Streams.
5113
+ *
5114
+ * Sets `sampleRate=8000` so Cartesia emits PCM_S16LE @ 8 kHz directly.
5115
+ * Twilio's media stream uses μ-law @ 8 kHz so the SDK still does the
5116
+ * PCM → μ-law transcode client-side, but the 16 kHz → 8 kHz resample
5117
+ * step is skipped. Saves ~10–30 ms first-byte plus per-frame CPU and
5118
+ * removes a potential aliasing source.
5119
+ */
5120
+ static forTwilio(apiKey: string, options?: Omit<CartesiaTTSOptions$1, 'sampleRate'>): CartesiaTTS;
5121
+ /**
5122
+ * Construct an instance pre-configured for Telnyx bidirectional media.
5123
+ *
5124
+ * Sets `sampleRate=16000` to match Telnyx's L16/16000 default codec —
5125
+ * audio flows end-to-end with zero resampling or transcoding. Same as
5126
+ * the bare-constructor default; exists for API symmetry with
5127
+ * {@link CartesiaTTS.forTwilio}.
5128
+ */
5129
+ static forTelnyx(apiKey: string, options?: Omit<CartesiaTTSOptions$1, 'sampleRate'>): CartesiaTTS;
5130
+ /** Build the JSON payload for the Cartesia bytes endpoint. */
5131
+ private buildPayload;
5132
+ /**
5133
+ * Pre-call HTTP warmup for the Cartesia `/tts/bytes` endpoint.
5134
+ *
5135
+ * Issues a lightweight `GET <baseUrl>/voices` so DNS, TLS, and HTTP/2
5136
+ * are already up by the time the first `synthesizeStream()` POST
5137
+ * lands. Best-effort: 5 s timeout, all exceptions swallowed at
5138
+ * debug level.
5139
+ *
5140
+ * Billing safety: `GET /voices` is a free metadata read on
5141
+ * Cartesia's REST surface (per https://docs.cartesia.ai). It does
5142
+ * not consume synthesis credits. The actual synthesis is billed
5143
+ * only when `POST /tts/bytes` runs with a non-empty `transcript`.
5144
+ *
5145
+ * Note: Cartesia TTS uses the HTTP path (vs the WebSocket variant
5146
+ * Cartesia also exposes) — connection warmup is therefore HTTP-GET
5147
+ * based, not WebSocket pre-handshake. The latency win is smaller
5148
+ * (~50-150 ms vs the ~200-500 ms of a WS prewarm) but still real.
3703
5149
  */
3704
- readonly utteranceEndMs?: number | null;
5150
+ warmup(): Promise<void>;
5151
+ /** Synthesize text and return the concatenated audio buffer. */
5152
+ synthesize(text: string): Promise<Buffer>;
3705
5153
  /**
3706
- * Enable smart formatting (punctuation + numerals). Default ``false`` —
3707
- * smart formatting adds roughly 50–150 ms to TTFT on each final transcript
3708
- * and is rarely useful for telephony pipelines that pass the text straight
3709
- * to an LLM. Set to ``true`` for use cases (dashboards, raw transcripts)
3710
- * where the formatted text is surfaced directly to humans.
5154
+ * Synthesize text and yield raw PCM_S16LE chunks at the configured
5155
+ * `sampleRate` as they arrive from Cartesia.
3711
5156
  */
3712
- readonly smartFormat?: boolean;
3713
- /** Emit interim (non-final) transcripts. Default ``true``. */
3714
- readonly interimResults?: boolean;
3715
- /** Emit VAD events (``SpeechStarted`` / ``UtteranceEnd``). Default ``true``. */
3716
- readonly vadEvents?: boolean;
5157
+ synthesizeStream(text: string): AsyncGenerator<Buffer>;
3717
5158
  }
3718
- /** Streaming STT adapter for Deepgram's `/v1/listen` WebSocket API. */
3719
- declare class DeepgramSTT {
3720
- private ws;
3721
- private readonly transcriptCallbacks;
3722
- private readonly errorCallbacks;
3723
- private keepaliveTimer;
3724
- private running;
3725
- private reconnectAttempted;
3726
- /** Request ID from Deepgram — used to query actual cost post-call. */
3727
- requestId: string;
5159
+
5160
+ /**
5161
+ * Rime TTS provider — HTTP chunked endpoint.
5162
+ *
5163
+ * Supports both Arcana and Mist model families. The Arcana model can take
5164
+ * up to ~80% of the output audio's duration to synthesize, so its request
5165
+ * timeout is bumped to 4 minutes.
5166
+ */
5167
+ /** Rime TTS model families. */
5168
+ declare const RimeModel: {
5169
+ readonly ARCANA: "arcana";
5170
+ readonly MIST: "mist";
5171
+ readonly MIST_V2: "mistv2";
5172
+ };
5173
+ type RimeModel = (typeof RimeModel)[keyof typeof RimeModel];
5174
+ /** Supported response Content-Type accept headers for Rime TTS. */
5175
+ declare const RimeAudioFormat: {
5176
+ readonly PCM: "audio/pcm";
5177
+ readonly MP3: "audio/mp3";
5178
+ readonly WAV: "audio/wav";
5179
+ readonly MULAW: "audio/mulaw";
5180
+ };
5181
+ type RimeAudioFormat = (typeof RimeAudioFormat)[keyof typeof RimeAudioFormat];
5182
+ /** Constructor options for {@link RimeTTS}. */
5183
+ interface RimeTTSOptions$1 {
5184
+ model?: string;
5185
+ speaker?: string;
5186
+ lang?: string;
5187
+ sampleRate?: number;
5188
+ repetitionPenalty?: number;
5189
+ temperature?: number;
5190
+ topP?: number;
5191
+ maxTokens?: number;
5192
+ speedAlpha?: number;
5193
+ reduceLatency?: boolean;
5194
+ pauseBetweenBrackets?: boolean;
5195
+ phonemizeBetweenBrackets?: boolean;
5196
+ baseUrl?: string;
5197
+ }
5198
+ /** Rime TTS adapter for the `users.rime.ai/v1/rime-tts` HTTP streaming endpoint. */
5199
+ declare class RimeTTS {
5200
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
5201
+ static readonly providerKey = "rime";
3728
5202
  private readonly apiKey;
3729
- private readonly language;
3730
5203
  private readonly model;
3731
- private readonly encoding;
5204
+ private readonly speaker;
5205
+ private readonly lang;
3732
5206
  private readonly sampleRate;
3733
- private readonly endpointingMs;
3734
- private readonly utteranceEndMs;
3735
- private readonly smartFormat;
3736
- private readonly interimResults;
3737
- private readonly vadEvents;
3738
- /**
3739
- * New ergonomic constructor accepting an options object (mirrors Python kwargs).
3740
- *
3741
- * Also accepts the legacy positional form
3742
- * ``(apiKey, language?, model?, encoding?, sampleRate?)`` for backward
3743
- * compatibility with code that predated BUG #13.
3744
- */
3745
- constructor(apiKey: string, language?: string, model?: string, encoding?: string, sampleRate?: number, options?: DeepgramSTTOptions$1);
3746
- constructor(apiKey: string, options: DeepgramSTTOptions$1 & {
3747
- language?: string;
3748
- });
3749
- /** Factory for Twilio calls — mulaw 8 kHz. Forwards tuning options through. */
3750
- static forTwilio(apiKey: string, language?: string, model?: string, options?: DeepgramSTTOptions$1): DeepgramSTT;
3751
- private buildUrl;
3752
- /** Open the streaming WebSocket and arm message + keepalive handlers. */
3753
- connect(): Promise<void>;
3754
- private openSocket;
3755
- private clearKeepalive;
3756
- private handleMessage;
3757
- private emitTranscript;
3758
- private emitError;
3759
- private handleError;
3760
- private handleClose;
3761
- /** Send a binary audio chunk to Deepgram for transcription. */
3762
- sendAudio(audio: Buffer): void;
3763
- private audioSentCount;
3764
- private audioDroppedCount;
3765
- /** Register a transcript listener. */
3766
- onTranscript(callback: TranscriptCallback$3): void;
3767
- /** Remove a previously registered transcript listener. */
3768
- offTranscript(callback: TranscriptCallback$3): void;
3769
- /** Register an error listener for socket / API failures. */
3770
- onError(callback: ErrorCallback$1): void;
3771
- /** Remove a previously registered error listener. */
3772
- offError(callback: ErrorCallback$1): void;
5207
+ private readonly repetitionPenalty?;
5208
+ private readonly temperature?;
5209
+ private readonly topP?;
5210
+ private readonly maxTokens?;
5211
+ private readonly speedAlpha?;
5212
+ private readonly reduceLatency?;
5213
+ private readonly pauseBetweenBrackets?;
5214
+ private readonly phonemizeBetweenBrackets?;
5215
+ private readonly baseUrl;
5216
+ private readonly totalTimeoutMs;
5217
+ constructor(apiKey: string, opts?: RimeTTSOptions$1);
5218
+ private buildPayload;
5219
+ /** Synthesize text and return the concatenated audio buffer. */
5220
+ synthesize(text: string): Promise<Buffer>;
3773
5221
  /**
3774
- * Force Deepgram to immediately emit a final ``Results`` frame for the
3775
- * in-flight utterance, rather than waiting for its own endpoint
3776
- * heuristic (utterance_end_ms ~1 s + natural-pause endpointing).
3777
- * Called by the SDK on VAD ``speech_end`` and after barge-in cancel —
3778
- * both moments where the SDK already knows the user has stopped
3779
- * speaking and waiting for Deepgram's own endpointing only adds
3780
- * dead air.
3781
- *
3782
- * Idempotent: safe to call when the socket is closed/closing.
5222
+ * Synthesize text and yield raw PCM_S16LE chunks at the configured
5223
+ * `sampleRate` as they stream in.
3783
5224
  */
3784
- finalize(): void;
3785
- /** Send Finalize, briefly drain trailing transcripts, then close the socket. */
3786
- close(): void;
5225
+ synthesizeStream(text: string): AsyncGenerator<Buffer>;
3787
5226
  }
3788
5227
 
3789
5228
  /** Deepgram streaming STT for Patter pipeline mode. */
@@ -3825,6 +5264,8 @@ type TranscriptCallback$2 = (transcript: Transcript$2) => void;
3825
5264
  type WhisperResponseFormat = 'json' | 'verbose_json';
3826
5265
  /** Buffered STT adapter for OpenAI's Whisper transcription HTTP API. */
3827
5266
  declare class WhisperSTT {
5267
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
5268
+ static readonly providerKey: string;
3828
5269
  private readonly apiKey;
3829
5270
  private readonly model;
3830
5271
  private readonly language;
@@ -3913,6 +5354,8 @@ declare class STT$5 extends WhisperSTT {
3913
5354
 
3914
5355
  /** STT adapter restricted to OpenAI's GPT-4o Transcribe model family. */
3915
5356
  declare class OpenAITranscribeSTT extends WhisperSTT {
5357
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
5358
+ static readonly providerKey: string;
3916
5359
  /**
3917
5360
  * @param apiKey OpenAI API key.
3918
5361
  * @param language ISO-639-1 language code (e.g. ``"en"``, ``"it"``). Optional.
@@ -4172,206 +5615,73 @@ interface SpeechmaticsSTTOptions$1 {
4172
5615
  * ```
4173
5616
  */
4174
5617
  declare class SpeechmaticsSTT {
5618
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
5619
+ static readonly providerKey = "speechmatics";
4175
5620
  private ws;
4176
5621
  private readonly transcriptCallbacks;
4177
- private readonly errorCallbacks;
4178
- private running;
4179
- /** Sequence number of the last audio chunk acknowledged via `AudioAdded`. */
4180
- private lastSeqNo;
4181
- private readonly apiKey;
4182
- private readonly baseUrl;
4183
- private readonly language;
4184
- private readonly turnDetectionMode;
4185
- private readonly sampleRate;
4186
- private readonly enableDiarization;
4187
- private readonly maxDelay;
4188
- private readonly endOfUtteranceSilenceTrigger;
4189
- private readonly endOfUtteranceMaxDelay;
4190
- private readonly includePartials;
4191
- private readonly additionalVocab;
4192
- private readonly operatingPoint;
4193
- private readonly domain;
4194
- private readonly outputLocale;
4195
- constructor(apiKey: string, options?: SpeechmaticsSTTOptions$1);
4196
- /** Build the JSON `StartRecognition` payload sent on connect. */
4197
- private buildStartRecognition;
4198
- /** Open the streaming WebSocket and send the `StartRecognition` frame. */
4199
- connect(): Promise<void>;
4200
- /** Send a binary PCM16-LE audio chunk to Speechmatics for transcription. */
4201
- sendAudio(audio: Buffer): void;
4202
- /** Register a transcript listener. */
4203
- onTranscript(callback: TranscriptCallback$1): void;
4204
- /** Remove a previously registered transcript listener. */
4205
- offTranscript(callback: TranscriptCallback$1): void;
4206
- /** Register an error listener for socket / API failures. */
4207
- onError(callback: ErrorCallback): void;
4208
- /** Remove a previously registered error listener. */
4209
- offError(callback: ErrorCallback): void;
4210
- private handleMessage;
4211
- /** Translate a Speechmatics transcript message into a Patter `Transcript`. */
4212
- private toTranscript;
4213
- private emitTranscript;
4214
- private emitError;
4215
- private handleError;
4216
- private handleClose;
4217
- /** Send `EndOfStream` and close the WebSocket. Idempotent. */
4218
- close(): void;
4219
- }
4220
-
4221
- /** Speechmatics streaming STT for Patter pipeline mode. */
4222
-
4223
- type SpeechmaticsSTTOptions = SpeechmaticsSTTOptions$1 & {
4224
- /** API key. Falls back to SPEECHMATICS_API_KEY env var when omitted. */
4225
- apiKey?: string;
4226
- };
4227
- /**
4228
- * Speechmatics streaming STT.
4229
- *
4230
- * @example
4231
- * ```ts
4232
- * import * as speechmatics from "getpatter/stt/speechmatics";
4233
- * const stt = new speechmatics.STT(); // reads SPEECHMATICS_API_KEY
4234
- * const stt = new speechmatics.STT({ apiKey: "sm_...", language: "en" });
4235
- * ```
4236
- */
4237
- declare class STT extends SpeechmaticsSTT {
4238
- static readonly providerKey = "speechmatics";
4239
- constructor(opts?: SpeechmaticsSTTOptions);
4240
- }
4241
-
4242
- /**
4243
- * Known stable ElevenLabs voice models (from the official ElevenLabs API
4244
- * reference). Exposed as a typed `as const` object so callers can pass
4245
- * `ElevenLabsModel.FLASH_V2_5` and get autocomplete / static checking; the
4246
- * public `modelId` option also accepts an arbitrary `string` so users can
4247
- * pass forward-compat IDs we haven't enumerated yet.
4248
- *
4249
- * - `V3` — newest, highest quality (slower TTFT than Flash).
4250
- * - `FLASH_V2_5` — current default, fastest (~75 ms TTFT).
4251
- * - `TURBO_V2_5` — balanced quality/speed.
4252
- * - `MULTILINGUAL_V2` — best multilingual support.
4253
- * - `MONOLINGUAL_V1` — legacy English-only.
4254
- */
4255
- declare const ElevenLabsModel: {
4256
- readonly V3: "eleven_v3";
4257
- readonly FLASH_V2_5: "eleven_flash_v2_5";
4258
- readonly TURBO_V2_5: "eleven_turbo_v2_5";
4259
- readonly MULTILINGUAL_V2: "eleven_multilingual_v2";
4260
- readonly MONOLINGUAL_V1: "eleven_monolingual_v1";
4261
- };
4262
- /** Union of {@link ElevenLabsModel} string values. */
4263
- type ElevenLabsModel = (typeof ElevenLabsModel)[keyof typeof ElevenLabsModel];
4264
- declare const ElevenLabsOutputFormat: {
4265
- readonly MP3_22050_32: "mp3_22050_32";
4266
- readonly MP3_44100_32: "mp3_44100_32";
4267
- readonly MP3_44100_64: "mp3_44100_64";
4268
- readonly MP3_44100_96: "mp3_44100_96";
4269
- readonly MP3_44100_128: "mp3_44100_128";
4270
- readonly MP3_44100_192: "mp3_44100_192";
4271
- readonly PCM_8000: "pcm_8000";
4272
- readonly PCM_16000: "pcm_16000";
4273
- readonly PCM_22050: "pcm_22050";
4274
- readonly PCM_24000: "pcm_24000";
4275
- readonly PCM_44100: "pcm_44100";
4276
- readonly ULAW_8000: "ulaw_8000";
4277
- };
4278
- /** Union of {@link ElevenLabsOutputFormat} string values. */
4279
- type ElevenLabsOutputFormat = (typeof ElevenLabsOutputFormat)[keyof typeof ElevenLabsOutputFormat];
4280
- /** ElevenLabs voice tuning knobs forwarded as `voice_settings` in the request. */
4281
- interface ElevenLabsVoiceSettings {
4282
- stability?: number;
4283
- similarity_boost?: number;
4284
- style?: number;
4285
- use_speaker_boost?: boolean;
4286
- }
4287
- /** Constructor options for {@link ElevenLabsTTS}. */
4288
- interface ElevenLabsTTSOptions$1 {
4289
- voiceId?: string;
4290
- /**
4291
- * ElevenLabs voice model ID. The default ``eleven_flash_v2_5`` has the
4292
- * lowest TTFT (~75 ms). Pass ``eleven_v3`` for highest quality, or any
4293
- * arbitrary string for forward-compat with future models.
4294
- */
4295
- modelId?: ElevenLabsModel | string;
4296
- outputFormat?: ElevenLabsOutputFormat;
4297
- voiceSettings?: ElevenLabsVoiceSettings;
4298
- languageCode?: string;
4299
- chunkSize?: number;
5622
+ private readonly errorCallbacks;
5623
+ private running;
5624
+ /** Sequence number of the last audio chunk acknowledged via `AudioAdded`. */
5625
+ private lastSeqNo;
5626
+ private readonly apiKey;
5627
+ private readonly baseUrl;
5628
+ private readonly language;
5629
+ private readonly turnDetectionMode;
5630
+ private readonly sampleRate;
5631
+ private readonly enableDiarization;
5632
+ private readonly maxDelay;
5633
+ private readonly endOfUtteranceSilenceTrigger;
5634
+ private readonly endOfUtteranceMaxDelay;
5635
+ private readonly includePartials;
5636
+ private readonly additionalVocab;
5637
+ private readonly operatingPoint;
5638
+ private readonly domain;
5639
+ private readonly outputLocale;
5640
+ constructor(apiKey: string, options?: SpeechmaticsSTTOptions$1);
5641
+ /** Build the JSON `StartRecognition` payload sent on connect. */
5642
+ private buildStartRecognition;
5643
+ /** Open the streaming WebSocket and send the `StartRecognition` frame. */
5644
+ connect(): Promise<void>;
5645
+ /** Send a binary PCM16-LE audio chunk to Speechmatics for transcription. */
5646
+ sendAudio(audio: Buffer): void;
5647
+ /** Register a transcript listener. */
5648
+ onTranscript(callback: TranscriptCallback$1): void;
5649
+ /** Remove a previously registered transcript listener. */
5650
+ offTranscript(callback: TranscriptCallback$1): void;
5651
+ /** Register an error listener for socket / API failures. */
5652
+ onError(callback: ErrorCallback): void;
5653
+ /** Remove a previously registered error listener. */
5654
+ offError(callback: ErrorCallback): void;
5655
+ private handleMessage;
5656
+ /** Translate a Speechmatics transcript message into a Patter `Transcript`. */
5657
+ private toTranscript;
5658
+ private emitTranscript;
5659
+ private emitError;
5660
+ private handleError;
5661
+ private handleClose;
5662
+ /** Send `EndOfStream` and close the WebSocket. Idempotent. */
5663
+ close(): void;
4300
5664
  }
5665
+
5666
+ /** Speechmatics streaming STT for Patter pipeline mode. */
5667
+
5668
+ type SpeechmaticsSTTOptions = SpeechmaticsSTTOptions$1 & {
5669
+ /** API key. Falls back to SPEECHMATICS_API_KEY env var when omitted. */
5670
+ apiKey?: string;
5671
+ };
4301
5672
  /**
4302
- * ElevenLabs streaming TTS adapter.
4303
- *
4304
- * Supported `modelId` values are autocompleted via {@link ElevenLabsModel}.
4305
- * Default is `eleven_flash_v2_5` (lowest TTFT, ~75 ms).
4306
- *
4307
- * **Telephony optimization** — the constructor default
4308
- * `outputFormat='pcm_16000'` is correct for web playback, dashboard
4309
- * previews, and 16 kHz pipelines. For real phone calls, use the
4310
- * carrier-specific factories instead:
5673
+ * Speechmatics streaming STT.
4311
5674
  *
4312
- * - {@link ElevenLabsTTS.forTwilio} emits `ulaw_8000` natively. Twilio's
4313
- * media-stream WebSocket expects μ-law @ 8 kHz, so the SDK normally
4314
- * resamples 16 kHz → 8 kHz and PCM → μ-law before sending. Asking
4315
- * ElevenLabs to produce μ-law directly skips that step (saves
4316
- * ~30–80 ms first-byte plus per-frame CPU and avoids any resampling
4317
- * aliasing).
4318
- * - {@link ElevenLabsTTS.forTelnyx} emits `pcm_16000`. Telnyx negotiates
4319
- * L16/16000 on its bidirectional media WebSocket, so 16 kHz PCM is
4320
- * already the format used end-to-end and no transcoding happens.
4321
- * ElevenLabs *also* supports `ulaw_8000` if your Telnyx profile is
4322
- * pinned to PCMU/8000 — pass `outputFormat: 'ulaw_8000'` explicitly
4323
- * in that case.
5675
+ * @example
5676
+ * ```ts
5677
+ * import * as speechmatics from "getpatter/stt/speechmatics";
5678
+ * const stt = new speechmatics.STT(); // reads SPEECHMATICS_API_KEY
5679
+ * const stt = new speechmatics.STT({ apiKey: "sm_...", language: "en" });
5680
+ * ```
4324
5681
  */
4325
- declare class ElevenLabsTTS {
4326
- private readonly apiKey;
4327
- private readonly voiceId;
4328
- private readonly modelId;
4329
- private readonly outputFormat;
4330
- private readonly voiceSettings;
4331
- private readonly languageCode;
4332
- private readonly chunkSize;
4333
- constructor(apiKey: string, voiceId?: string, modelId?: string, outputFormat?: ElevenLabsOutputFormat | string);
4334
- constructor(apiKey: string, options: ElevenLabsTTSOptions$1);
4335
- /**
4336
- * Construct an instance pre-configured for Twilio Media Streams.
4337
- *
4338
- * Sets `outputFormat='ulaw_8000'` so ElevenLabs emits μ-law @ 8 kHz
4339
- * directly — the exact wire format Twilio's media stream uses — letting
4340
- * the SDK skip the 16 kHz→8 kHz resample and PCM→μ-law conversion in
4341
- * `TwilioAudioSender`. Saves ~30–80 ms first-byte and per-frame CPU,
4342
- * and removes a potential aliasing source.
4343
- *
4344
- * `voiceSettings` defaults to a low-bandwidth-friendly profile
4345
- * (speaker boost off, modest stability) which sounds cleaner at 8 kHz
4346
- * μ-law than the studio default. Pass an explicit object to override.
4347
- */
4348
- static forTwilio(apiKey: string, options?: Omit<ElevenLabsTTSOptions$1, 'outputFormat'>): ElevenLabsTTS;
4349
- /**
4350
- * Construct an instance pre-configured for Telnyx bidirectional media.
4351
- *
4352
- * Telnyx's default media-streaming codec is L16 PCM @ 16 kHz, which
4353
- * matches our default Telnyx handler. We pick `pcm_16000` so the audio
4354
- * flows end-to-end with zero resampling or transcoding.
4355
- *
4356
- * Trade-off: if your Telnyx profile is pinned to PCMU/8000 (μ-law),
4357
- * construct `ElevenLabsTTS` directly with `outputFormat: 'ulaw_8000'`
4358
- * — Telnyx supports that natively too.
4359
- */
4360
- static forTelnyx(apiKey: string, options?: Omit<ElevenLabsTTSOptions$1, 'outputFormat'>): ElevenLabsTTS;
4361
- /**
4362
- * Synthesise text to speech and return the full audio as a single Buffer.
4363
- *
4364
- * For large chunks (or when latency matters) call `synthesizeStream` instead.
4365
- */
4366
- synthesize(text: string): Promise<Buffer>;
4367
- /**
4368
- * Synthesise text and yield audio chunks as they arrive (streaming).
4369
- *
4370
- * The yielded buffers are raw PCM at 16 kHz (or whatever `outputFormat` is
4371
- * configured to). `chunkSize` controls the maximum yield size — 512 is a
4372
- * good choice for low-latency telephony.
4373
- */
4374
- synthesizeStream(text: string): AsyncGenerator<Buffer>;
5682
+ declare class STT extends SpeechmaticsSTT {
5683
+ static readonly providerKey = "speechmatics";
5684
+ constructor(opts?: SpeechmaticsSTTOptions);
4375
5685
  }
4376
5686
 
4377
5687
  /** ElevenLabs TTS for Patter pipeline mode. */
@@ -4404,133 +5714,24 @@ type ElevenLabsCarrierOptions = Omit<ElevenLabsTTSOptions, "outputFormat">;
4404
5714
  * @example
4405
5715
  * ```ts
4406
5716
  * import * as elevenlabs from "getpatter/tts/elevenlabs";
4407
- * const tts = new elevenlabs.TTS(); // reads ELEVENLABS_API_KEY
4408
- * const tts = new elevenlabs.TTS({ apiKey: "...", voiceId: "rachel" });
4409
- * ```
4410
- *
4411
- * **Telephony optimization** — use {@link TTS.forTwilio} (μ-law @ 8 kHz,
4412
- * native Twilio Media Streams format) or {@link TTS.forTelnyx} (PCM @
4413
- * 16 kHz, native Telnyx default) on phone calls to skip the SDK-side
4414
- * resampling / transcoding step.
4415
- */
4416
- declare class TTS$6 extends ElevenLabsTTS {
4417
- static readonly providerKey = "elevenlabs";
4418
- constructor(opts?: ElevenLabsTTSOptions);
4419
- /** Pipeline TTS pre-configured for Twilio Media Streams (`ulaw_8000`). */
4420
- static forTwilio(opts?: ElevenLabsCarrierOptions): TTS$6;
4421
- static forTwilio(apiKey: string, options?: Omit<ElevenLabsTTSOptions, "outputFormat">): TTS$6;
4422
- /** Pipeline TTS pre-configured for Telnyx (`pcm_16000`). */
4423
- static forTelnyx(opts?: ElevenLabsCarrierOptions): TTS$6;
4424
- static forTelnyx(apiKey: string, options?: Omit<ElevenLabsTTSOptions, "outputFormat">): TTS$6;
4425
- }
4426
-
4427
- /**
4428
- * WebSocket-based ElevenLabs TTS provider — opt-in low-latency variant.
4429
- *
4430
- * Targets the ElevenLabs streaming-input WebSocket endpoint
4431
- * (`/v1/text-to-speech/{voice_id}/stream-input`) instead of the HTTP
4432
- * `/stream` endpoint used by `ElevenLabsTTS`. Saves the HTTP request setup
4433
- * time per utterance (~50 ms) and avoids the HTTP cold-start TLS handshake
4434
- * when calls are bursty.
4435
- *
4436
- * API matches `ElevenLabsTTS` (`synthesizeStream(text)` returns an
4437
- * `AsyncGenerator<Buffer>`) so it can be passed anywhere a TTSAdapter is
4438
- * expected.
4439
- *
4440
- * Behaviour notes
4441
- * - WebSocket is opened **per-utterance** (matches HTTP semantics). A
4442
- * future revision may pool a WS across utterances of the same call
4443
- * session — see roadmap Phase 5b.
4444
- * - `auto_mode=true` is enabled by default. Pass `autoMode: false` to
4445
- * send a custom `chunk_length_schedule`.
4446
- * - `outputFormat` is exposed as a query parameter so `ulaw_8000` (Twilio
4447
- * native) and `pcm_16000` (Telnyx native) work without resampling.
4448
- * - `eleven_v3` is **not** supported — the WS endpoint rejects it.
4449
- * - `optimize_streaming_latency` is officially deprecated and is not
4450
- * exposed.
4451
- */
4452
-
4453
- /** Constructor options for {@link ElevenLabsWebSocketTTS}. */
4454
- interface ElevenLabsWebSocketTTSOptions {
4455
- apiKey: string;
4456
- voiceId?: string;
4457
- modelId?: ElevenLabsModel | string;
4458
- outputFormat?: string;
4459
- voiceSettings?: Record<string, unknown>;
4460
- languageCode?: string;
4461
- /** Let the server pick chunk timing. Default true. */
4462
- autoMode?: boolean;
4463
- /** WS keep-alive timeout in seconds (5–180). Default 60. */
4464
- inactivityTimeout?: number;
4465
- /**
4466
- * Manual chunk schedule, only used when ``autoMode: false``. Each value
4467
- * must be 5–500. ElevenLabs default is ``[120, 160, 250, 290]``.
4468
- */
4469
- chunkLengthSchedule?: number[];
4470
- /** Outgoing audio re-chunk size in bytes. Default 4096. */
4471
- chunkSize?: number;
4472
- }
4473
- /** WebSocket-based ElevenLabs TTS adapter — opt-in low-latency variant. */
4474
- declare class ElevenLabsWebSocketTTS implements TTSAdapter {
4475
- static readonly providerKey = "elevenlabs_ws";
4476
- readonly apiKey: string;
4477
- readonly voiceId: string;
4478
- readonly modelId: string;
4479
- readonly voiceSettings?: Record<string, unknown>;
4480
- readonly languageCode?: string;
4481
- readonly autoMode: boolean;
4482
- readonly inactivityTimeout: number;
4483
- readonly chunkLengthSchedule?: number[];
4484
- readonly chunkSize: number;
4485
- /**
4486
- * The wire format requested over the ElevenLabs WS. Initially set from
4487
- * the constructor; ``setTelephonyCarrier`` may auto-flip it to the
4488
- * carrier's native codec when the caller did NOT pass ``outputFormat``
4489
- * explicitly.
4490
- */
4491
- private _outputFormat;
4492
- private readonly _outputFormatExplicit;
4493
- /** Public read-only view of the (possibly auto-flipped) wire format. */
4494
- get outputFormat(): string;
4495
- constructor(opts: ElevenLabsWebSocketTTSOptions);
4496
- /**
4497
- * Hook called by ``StreamHandler`` to advise the carrier wire format.
4498
- *
4499
- * When the user did NOT pass an explicit ``outputFormat`` in the
4500
- * constructor options, this flips the format to the carrier's native
4501
- * wire codec — saving a client-side transcode step. Calling with an
4502
- * unknown carrier (``""`` / ``"custom"``) is a no-op.
4503
- *
4504
- * When ``outputFormat`` was explicitly passed (incl. via the
4505
- * ``forTwilio`` / ``forTelnyx`` factories), this method is a no-op —
4506
- * the user's choice always wins.
4507
- */
4508
- setTelephonyCarrier(carrier: string): void;
4509
- /** Pre-configured for Twilio Media Streams (`ulaw_8000`). */
4510
- static forTwilio(opts: Omit<ElevenLabsWebSocketTTSOptions, 'outputFormat'>): ElevenLabsWebSocketTTS;
4511
- /** Pre-configured for Telnyx (`pcm_16000`). */
4512
- static forTelnyx(opts: Omit<ElevenLabsWebSocketTTSOptions, 'outputFormat'>): ElevenLabsWebSocketTTS;
4513
- private buildUrl;
4514
- /**
4515
- * Single-shot synthesis: open WS, send text, yield bytes, close.
4516
- *
4517
- * Resilience contract:
4518
- * - Connection bounded by ``CONNECT_TIMEOUT_MS`` (5s, was 15s).
4519
- * - Each idle wait bounded by ``FRAME_TIMEOUT_MS`` (30s) so a stalled
4520
- * server cannot keep the generator alive indefinitely.
4521
- * - Permanent error handler attached BEFORE the open await — prevents
4522
- * ``uncaughtException`` if an error fires after the once-listener
4523
- * resolves.
4524
- * - All event listeners removed in ``finally`` (no closure leak past
4525
- * socket close).
4526
- * - Server-reported ``error`` raises ``ElevenLabsTTSError``.
4527
- * - Per-frame audio payload capped at ``MAX_AUDIO_B64_BYTES``.
4528
- * - Best-effort EOS ``{"text":""}`` sent in finally (not immediately
4529
- * after flush — auto_mode could otherwise truncate the tail audio).
4530
- */
4531
- synthesizeStream(text: string): AsyncGenerator<Buffer>;
4532
- /** No-op — connections are per-utterance and torn down inside synthesizeStream. */
4533
- close(): Promise<void>;
5717
+ * const tts = new elevenlabs.TTS(); // reads ELEVENLABS_API_KEY
5718
+ * const tts = new elevenlabs.TTS({ apiKey: "...", voiceId: "rachel" });
5719
+ * ```
5720
+ *
5721
+ * **Telephony optimization** — use {@link TTS.forTwilio} (μ-law @ 8 kHz,
5722
+ * native Twilio Media Streams format) or {@link TTS.forTelnyx} (PCM @
5723
+ * 16 kHz, native Telnyx default) on phone calls to skip the SDK-side
5724
+ * resampling / transcoding step.
5725
+ */
5726
+ declare class TTS$6 extends ElevenLabsTTS {
5727
+ static readonly providerKey = "elevenlabs";
5728
+ constructor(opts?: ElevenLabsTTSOptions);
5729
+ /** Pipeline TTS pre-configured for Twilio Media Streams (`ulaw_8000`). */
5730
+ static forTwilio(opts?: ElevenLabsCarrierOptions): TTS$6;
5731
+ static forTwilio(apiKey: string, options?: Omit<ElevenLabsTTSOptions, "outputFormat">): TTS$6;
5732
+ /** Pipeline TTS pre-configured for Telnyx (`pcm_16000`). */
5733
+ static forTelnyx(opts?: ElevenLabsCarrierOptions): TTS$6;
5734
+ static forTelnyx(apiKey: string, options?: Omit<ElevenLabsTTSOptions, "outputFormat">): TTS$6;
4534
5735
  }
4535
5736
 
4536
5737
  /** ElevenLabs WebSocket TTS for Patter pipeline mode (opt-in low-latency). */
@@ -4595,6 +5796,8 @@ declare class OpenAITTS {
4595
5796
  private readonly speed;
4596
5797
  private readonly antiAlias;
4597
5798
  private readonly targetSampleRate;
5799
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
5800
+ static readonly providerKey = "openai_tts";
4598
5801
  constructor(apiKey: string, voice?: string, model?: string, instructions?: string | null, speed?: number | null, antiAlias?: boolean, targetSampleRate?: number);
4599
5802
  /**
4600
5803
  * Synthesise text to speech and return the full audio as a single Buffer.
@@ -4676,107 +5879,6 @@ declare class TTS$4 extends OpenAITTS {
4676
5879
  constructor(opts?: OpenAITTSOptions);
4677
5880
  }
4678
5881
 
4679
- /**
4680
- * Cartesia TTS provider — HTTP `/tts/bytes` endpoint.
4681
- *
4682
- * Cartesia also offers a WebSocket streaming mode with word timestamps;
4683
- * this provider focuses on the chunked-bytes HTTP API which maps cleanly
4684
- * onto Patter's `synthesize(text)` contract and keeps the provider
4685
- * dependency-free (just `fetch`).
4686
- *
4687
- * Default model is `sonic-3` (GA snapshot `sonic-3-2026-01-12`) — Cartesia's
4688
- * current GA model with a documented ~90 ms TTFB target. Voice IDs from the
4689
- * sonic-2 generation (including the default Katie voice) remain compatible.
4690
- *
4691
- * **Telephony optimization** — the constructor default
4692
- * `sampleRate=16000` is correct for web playback, dashboard previews, and
4693
- * 16 kHz pipelines. For real phone calls, use the carrier-specific
4694
- * factories instead:
4695
- *
4696
- * - {@link CartesiaTTS.forTwilio} requests `sampleRate=8000` natively from
4697
- * Cartesia. Twilio's media-stream WebSocket expects μ-law @ 8 kHz, so
4698
- * the SDK normally resamples 16 kHz → 8 kHz before doing the PCM →
4699
- * μ-law transcode in `TwilioAudioSender`. Asking Cartesia for 8 kHz
4700
- * PCM at the source skips the resample step (saves ~10–30 ms first-
4701
- * byte plus per-frame CPU and removes a potential aliasing source).
4702
- * The PCM → μ-law transcode still happens client-side.
4703
- * - {@link CartesiaTTS.forTelnyx} requests `sampleRate=16000`. Telnyx
4704
- * negotiates L16/16000 on its bidirectional media WebSocket, so
4705
- * 16 kHz PCM is already the format used end-to-end and no
4706
- * transcoding happens. This is the same as the bare-constructor
4707
- * default and exists for API symmetry with the Twilio factory.
4708
- */
4709
- /** Known Cartesia TTS models. */
4710
- declare const CartesiaTTSModel: {
4711
- readonly SONIC_3: "sonic-3";
4712
- readonly SONIC_2: "sonic-2";
4713
- readonly SONIC: "sonic";
4714
- };
4715
- type CartesiaTTSModel = (typeof CartesiaTTSModel)[keyof typeof CartesiaTTSModel];
4716
- /** Common PCM sample rates accepted by the Cartesia bytes endpoint. */
4717
- declare const CartesiaTTSSampleRate: {
4718
- readonly HZ_8000: 8000;
4719
- readonly HZ_16000: 16000;
4720
- readonly HZ_22050: 22050;
4721
- readonly HZ_24000: 24000;
4722
- readonly HZ_44100: 44100;
4723
- };
4724
- type CartesiaTTSSampleRate = (typeof CartesiaTTSSampleRate)[keyof typeof CartesiaTTSSampleRate];
4725
- /** Constructor options for {@link CartesiaTTS}. */
4726
- interface CartesiaTTSOptions$1 {
4727
- model?: CartesiaTTSModel | string;
4728
- voice?: string;
4729
- language?: string;
4730
- sampleRate?: CartesiaTTSSampleRate | number;
4731
- speed?: string | number;
4732
- emotion?: string | string[];
4733
- volume?: number;
4734
- baseUrl?: string;
4735
- apiVersion?: string;
4736
- }
4737
- /** Cartesia TTS provider backed by the HTTP `/tts/bytes` streaming endpoint. */
4738
- declare class CartesiaTTS {
4739
- private readonly apiKey;
4740
- private readonly model;
4741
- private readonly voice;
4742
- private readonly language;
4743
- private readonly sampleRate;
4744
- private readonly speed?;
4745
- private readonly emotion?;
4746
- private readonly volume?;
4747
- private readonly baseUrl;
4748
- private readonly apiVersion;
4749
- constructor(apiKey: string, opts?: CartesiaTTSOptions$1);
4750
- /**
4751
- * Construct an instance pre-configured for Twilio Media Streams.
4752
- *
4753
- * Sets `sampleRate=8000` so Cartesia emits PCM_S16LE @ 8 kHz directly.
4754
- * Twilio's media stream uses μ-law @ 8 kHz so the SDK still does the
4755
- * PCM → μ-law transcode client-side, but the 16 kHz → 8 kHz resample
4756
- * step is skipped. Saves ~10–30 ms first-byte plus per-frame CPU and
4757
- * removes a potential aliasing source.
4758
- */
4759
- static forTwilio(apiKey: string, options?: Omit<CartesiaTTSOptions$1, 'sampleRate'>): CartesiaTTS;
4760
- /**
4761
- * Construct an instance pre-configured for Telnyx bidirectional media.
4762
- *
4763
- * Sets `sampleRate=16000` to match Telnyx's L16/16000 default codec —
4764
- * audio flows end-to-end with zero resampling or transcoding. Same as
4765
- * the bare-constructor default; exists for API symmetry with
4766
- * {@link CartesiaTTS.forTwilio}.
4767
- */
4768
- static forTelnyx(apiKey: string, options?: Omit<CartesiaTTSOptions$1, 'sampleRate'>): CartesiaTTS;
4769
- /** Build the JSON payload for the Cartesia bytes endpoint. */
4770
- private buildPayload;
4771
- /** Synthesize text and return the concatenated audio buffer. */
4772
- synthesize(text: string): Promise<Buffer>;
4773
- /**
4774
- * Synthesize text and yield raw PCM_S16LE chunks at the configured
4775
- * `sampleRate` as they arrive from Cartesia.
4776
- */
4777
- synthesizeStream(text: string): AsyncGenerator<Buffer>;
4778
- }
4779
-
4780
5882
  /** Cartesia TTS for Patter pipeline mode. */
4781
5883
 
4782
5884
  /** Constructor options for the Cartesia `TTS` adapter. */
@@ -4825,50 +5927,6 @@ declare class TTS$3 extends CartesiaTTS {
4825
5927
  static forTelnyx(apiKey: string, options?: Omit<CartesiaTTSOptions, "sampleRate">): TTS$3;
4826
5928
  }
4827
5929
 
4828
- /** Constructor options for {@link RimeTTS}. */
4829
- interface RimeTTSOptions$1 {
4830
- model?: string;
4831
- speaker?: string;
4832
- lang?: string;
4833
- sampleRate?: number;
4834
- repetitionPenalty?: number;
4835
- temperature?: number;
4836
- topP?: number;
4837
- maxTokens?: number;
4838
- speedAlpha?: number;
4839
- reduceLatency?: boolean;
4840
- pauseBetweenBrackets?: boolean;
4841
- phonemizeBetweenBrackets?: boolean;
4842
- baseUrl?: string;
4843
- }
4844
- /** Rime TTS adapter for the `users.rime.ai/v1/rime-tts` HTTP streaming endpoint. */
4845
- declare class RimeTTS {
4846
- private readonly apiKey;
4847
- private readonly model;
4848
- private readonly speaker;
4849
- private readonly lang;
4850
- private readonly sampleRate;
4851
- private readonly repetitionPenalty?;
4852
- private readonly temperature?;
4853
- private readonly topP?;
4854
- private readonly maxTokens?;
4855
- private readonly speedAlpha?;
4856
- private readonly reduceLatency?;
4857
- private readonly pauseBetweenBrackets?;
4858
- private readonly phonemizeBetweenBrackets?;
4859
- private readonly baseUrl;
4860
- private readonly totalTimeoutMs;
4861
- constructor(apiKey: string, opts?: RimeTTSOptions$1);
4862
- private buildPayload;
4863
- /** Synthesize text and return the concatenated audio buffer. */
4864
- synthesize(text: string): Promise<Buffer>;
4865
- /**
4866
- * Synthesize text and yield raw PCM_S16LE chunks at the configured
4867
- * `sampleRate` as they stream in.
4868
- */
4869
- synthesizeStream(text: string): AsyncGenerator<Buffer>;
4870
- }
4871
-
4872
5930
  /** Rime TTS for Patter pipeline mode. */
4873
5931
 
4874
5932
  /** Constructor options for the Rime `TTS` adapter. */
@@ -5001,6 +6059,8 @@ interface InworldTTSOptions$1 {
5001
6059
  * before calling the constructor.
5002
6060
  */
5003
6061
  declare class InworldTTS {
6062
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
6063
+ static readonly providerKey = "inworld";
5004
6064
  private readonly authToken;
5005
6065
  private readonly model;
5006
6066
  private readonly voice;
@@ -5014,6 +6074,33 @@ declare class InworldTTS {
5014
6074
  private readonly baseUrl;
5015
6075
  constructor(authToken: string, opts?: InworldTTSOptions$1);
5016
6076
  private buildPayload;
6077
+ /**
6078
+ * Pre-call HTTP warmup for the Inworld TTS API.
6079
+ *
6080
+ * Issues a lightweight `GET /tts/v1/voices` against the API host so
6081
+ * DNS + TLS + HTTP/2 connection are already up by the time the first
6082
+ * `synthesizeStream()` POST lands. Best-effort: 5 s timeout, all
6083
+ * exceptions swallowed at debug level.
6084
+ *
6085
+ * Earlier revisions issued `HEAD` against the streaming endpoint
6086
+ * (`/tts/v1/voice:stream`). That endpoint is POST-only so HEAD
6087
+ * returns `405 Method Not Allowed` — the warmup still completed the
6088
+ * TLS handshake but spammed 405 errors into Inworld's audit logs and
6089
+ * into our own logs. Switching to a documented `GET /tts/v1/voices`
6090
+ * metadata read is a 2xx-clean equivalent.
6091
+ *
6092
+ * Billing safety: `GET /tts/v1/voices` is a free metadata endpoint
6093
+ * (per https://docs.inworld.ai/). It returns the voice catalogue
6094
+ * without invoking the synthesis pipeline. The actual synthesis is
6095
+ * billed only when `POST /tts/v1/voice:stream` runs with a non-empty
6096
+ * `text`.
6097
+ *
6098
+ * Note: Inworld TTS uses the HTTP NDJSON streaming path rather than
6099
+ * a persistent WebSocket — connection warmup is therefore HTTP-based,
6100
+ * not WebSocket pre-handshake. The latency win is smaller (~50-150 ms)
6101
+ * than the WS-based prewarms but still real on cold-start calls.
6102
+ */
6103
+ warmup(): Promise<void>;
5017
6104
  /** Synthesize text and return the concatenated audio buffer. */
5018
6105
  synthesize(text: string): Promise<Buffer>;
5019
6106
  /**
@@ -5143,6 +6230,8 @@ interface AnthropicLLMOptions$1 {
5143
6230
  }
5144
6231
  /** LLM provider backed by Anthropic's Messages API (streaming). */
5145
6232
  declare class AnthropicLLMProvider implements LLMProvider {
6233
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
6234
+ static readonly providerKey = "anthropic";
5146
6235
  private readonly apiKey;
5147
6236
  private readonly model;
5148
6237
  private readonly maxTokens;
@@ -5151,6 +6240,13 @@ declare class AnthropicLLMProvider implements LLMProvider {
5151
6240
  private readonly anthropicVersion;
5152
6241
  private readonly promptCaching;
5153
6242
  constructor(options: AnthropicLLMOptions$1);
6243
+ /**
6244
+ * Pre-call DNS / TLS warmup for the Anthropic Messages API.
6245
+ * Issues a lightweight ``GET https://api.anthropic.com/v1/models`` so
6246
+ * DNS, TLS and HTTP/2 are already up by the time the first ``messages``
6247
+ * call lands. Best-effort: 5 s timeout, exceptions swallowed at debug.
6248
+ */
6249
+ warmup(): Promise<void>;
5154
6250
  /** Stream Patter-format LLM chunks for the given OpenAI-style chat history. */
5155
6251
  stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
5156
6252
  }
@@ -5238,6 +6334,8 @@ interface GroqLLMOptions$1 {
5238
6334
  }
5239
6335
  /** LLM provider backed by Groq's OpenAI-compatible Chat Completions API. */
5240
6336
  declare class GroqLLMProvider implements LLMProvider {
6337
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
6338
+ static readonly providerKey = "groq";
5241
6339
  private readonly apiKey;
5242
6340
  readonly model: string;
5243
6341
  private readonly baseUrl;
@@ -5252,6 +6350,11 @@ declare class GroqLLMProvider implements LLMProvider {
5252
6350
  private readonly presencePenalty?;
5253
6351
  private readonly stop?;
5254
6352
  constructor(options: GroqLLMOptions$1);
6353
+ /**
6354
+ * Pre-call DNS / TLS warmup for the Groq inference endpoint.
6355
+ * Best-effort: 5 s timeout, all exceptions swallowed at debug level.
6356
+ */
6357
+ warmup(): Promise<void>;
5255
6358
  /** Stream Patter-format LLM chunks from the Groq chat completions API. */
5256
6359
  stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
5257
6360
  }
@@ -5371,6 +6474,8 @@ interface CerebrasLLMOptions$1 {
5371
6474
  * - zai-glm-4.7
5372
6475
  */
5373
6476
  declare class CerebrasLLMProvider implements LLMProvider {
6477
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
6478
+ static readonly providerKey = "cerebras";
5374
6479
  private readonly apiKey;
5375
6480
  readonly model: string;
5376
6481
  private readonly baseUrl;
@@ -5386,6 +6491,11 @@ declare class CerebrasLLMProvider implements LLMProvider {
5386
6491
  private readonly presencePenalty?;
5387
6492
  private readonly stop?;
5388
6493
  constructor(options: CerebrasLLMOptions$1);
6494
+ /**
6495
+ * Pre-call DNS / TLS warmup for the Cerebras inference endpoint.
6496
+ * Best-effort: 5 s timeout, all exceptions swallowed at debug level.
6497
+ */
6498
+ warmup(): Promise<void>;
5389
6499
  /** Stream Patter-format LLM chunks from the Cerebras chat completions API. */
5390
6500
  stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
5391
6501
  }
@@ -5468,12 +6578,22 @@ interface GoogleLLMOptions$1 {
5468
6578
  }
5469
6579
  /** LLM provider backed by Google Gemini (Developer API, streaming SSE). */
5470
6580
  declare class GoogleLLMProvider implements LLMProvider {
6581
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
6582
+ static readonly providerKey = "google";
5471
6583
  private readonly apiKey;
5472
6584
  readonly model: string;
5473
6585
  private readonly baseUrl;
5474
6586
  private readonly temperature?;
5475
6587
  private readonly maxOutputTokens?;
5476
6588
  constructor(options: GoogleLLMOptions$1);
6589
+ /**
6590
+ * Pre-call DNS / TLS warmup for the Gemini API.
6591
+ * Issues a lightweight ``GET ${baseUrl}/models?key=...`` so DNS, TLS
6592
+ * and HTTP/2 are already up by the time the first
6593
+ * ``streamGenerateContent`` call lands. Best-effort: 5 s timeout, all
6594
+ * exceptions swallowed at debug level.
6595
+ */
6596
+ warmup(): Promise<void>;
5477
6597
  /** Stream Patter-format LLM chunks from the Gemini SSE endpoint. */
5478
6598
  stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
5479
6599
  }
@@ -5597,7 +6717,10 @@ declare class SileroVAD implements VADProvider {
5597
6717
  * - `activationThreshold = 0.5` — upstream `threshold`
5598
6718
  * - `deactivationThreshold = 0.35` — upstream `neg_threshold = threshold - 0.15`
5599
6719
  * - `minSpeechDuration = 0.25` — upstream `min_speech_duration_ms = 250`
5600
- * - `minSilenceDuration = 0.1` — upstream `min_silence_duration_ms = 100`
6720
+ * - `minSilenceDuration = 0.4` — telephony default (was 0.1, bumped after
6721
+ * round 10f found speech_end firing on inter-sentence pauses < 250 ms,
6722
+ * causing double-talk dispatch). 400 ms matches the industry telephony
6723
+ * default and the inter_utterance_gap_ms debounce in stream-handler.ts.
5601
6724
  * - `prefixPaddingDuration = 0.03` — upstream `speech_pad_ms = 30`
5602
6725
  *
5603
6726
  * Override any field by passing `options`. Deployments that experience
@@ -5639,6 +6762,298 @@ declare class SileroVAD implements VADProvider {
5639
6762
  private advanceState;
5640
6763
  /** Mark the VAD as closed; subsequent processFrame calls throw. */
5641
6764
  close(): Promise<void>;
6765
+ /**
6766
+ * Reset all per-utterance state so the next ``processFrame`` starts from
6767
+ * a clean SILENCE state.
6768
+ *
6769
+ * Called by the stream handler between agent turns to prevent a "stuck
6770
+ * SPEECH" condition where PSTN echo / loopback kept the detector's
6771
+ * probability above ``deactivationThreshold`` for the entire agent turn.
6772
+ * Without this reset the next user utterance would never trigger a
6773
+ * SILENCE→SPEECH transition and barge-in would feel "one-shot" (works
6774
+ * once, then never again until the call ends).
6775
+ *
6776
+ * Safe to call any time including on a closed instance (no-op).
6777
+ */
6778
+ reset(): void;
6779
+ }
6780
+
6781
+ /** Options accepted by {@link DeepFilterNetFilter}. */
6782
+ interface DeepFilterNetOptions {
6783
+ /** Absolute path to a DeepFilterNet ONNX model. If omitted, the filter
6784
+ * logs a warning and becomes a pass-through. */
6785
+ modelPath?: string;
6786
+ /** When true, disable the pass-through warning (used by tests). */
6787
+ silenceWarnings?: boolean;
6788
+ }
6789
+ /** OSS noise-suppression filter backed by a DeepFilterNet ONNX model. */
6790
+ declare class DeepFilterNetFilter implements AudioFilter {
6791
+ private readonly modelPath;
6792
+ private readonly silenceWarnings;
6793
+ private session;
6794
+ private ort;
6795
+ private warned;
6796
+ private closed;
6797
+ private _resamplerSrcRate;
6798
+ private _upsamplerInst;
6799
+ private _downsamplerInst;
6800
+ constructor(options?: DeepFilterNetOptions);
6801
+ private ensureSession;
6802
+ /** Run noise suppression on a PCM16 chunk; pass-through when no model is loaded. */
6803
+ process(pcmChunk: Buffer, sampleRate: number): Promise<Buffer>;
6804
+ /** Flush resamplers, release the ONNX session, and mark the filter closed. */
6805
+ close(): Promise<void>;
6806
+ }
6807
+
6808
+ /**
6809
+ * Krisp VIVA noise-reduction AudioFilter — TypeScript scaffold.
6810
+ *
6811
+ * Mirrors the API of the Python `getpatter.providers.krisp_filter.KrispVivaFilter`
6812
+ * for SDK parity. As of 2026-05 Krisp does not publish an official Node.js
6813
+ * (server) SDK; third-party browser/RN wrappers exist but cannot process
6814
+ * server-received PCM/mulaw audio. This class throws at construction time
6815
+ * and points the caller at the available paths (Python SDK or DeepFilterNet
6816
+ * on TS).
6817
+ *
6818
+ * When Krisp publishes an official Node binding — or a community NAPI/WASM
6819
+ * wrapper becomes available — the import below and `process()` body will
6820
+ * fill in. The class signature is intentionally compatible with the Python
6821
+ * one so callers do not need to migrate code: `camelCase` ↔ `snake_case`,
6822
+ * `modelPath` ↔ `model_path`, etc.
6823
+ *
6824
+ * Krisp VIVA is a proprietary SDK and requires a commercial license plus a
6825
+ * `.kef` model file provided by the user. Patter ships only the
6826
+ * AudioFilter interface scaffold — never the SDK or model.
6827
+ *
6828
+ * @see https://krisp.ai/developers/
6829
+ */
6830
+
6831
+ /** Krisp-supported sample rates (parity with Python `KrispSampleRate`). */
6832
+ declare const KrispSampleRate: {
6833
+ readonly HZ_8000: 8000;
6834
+ readonly HZ_16000: 16000;
6835
+ readonly HZ_32000: 32000;
6836
+ readonly HZ_44100: 44100;
6837
+ readonly HZ_48000: 48000;
6838
+ };
6839
+ type KrispSampleRate = (typeof KrispSampleRate)[keyof typeof KrispSampleRate];
6840
+ /** Krisp-supported frame durations in ms (parity with Python `KrispFrameDuration`). */
6841
+ declare const KrispFrameDuration: {
6842
+ readonly MS_10: 10;
6843
+ readonly MS_15: 15;
6844
+ readonly MS_20: 20;
6845
+ readonly MS_30: 30;
6846
+ readonly MS_32: 32;
6847
+ };
6848
+ type KrispFrameDuration = (typeof KrispFrameDuration)[keyof typeof KrispFrameDuration];
6849
+ /** Options accepted by {@link KrispVivaFilter}. */
6850
+ interface KrispVivaFilterOptions {
6851
+ /**
6852
+ * Path to the Krisp `.kef` model file. If omitted, falls back to the
6853
+ * `KRISP_VIVA_FILTER_MODEL_PATH` environment variable.
6854
+ */
6855
+ readonly modelPath?: string;
6856
+ /** Noise-suppression strength in `[0, 100]`. Defaults to `100`. */
6857
+ readonly noiseSuppressionLevel?: number;
6858
+ /** Frame duration in ms. One of `10, 15, 20, 30, 32`. Defaults to `10`. */
6859
+ readonly frameDurationMs?: KrispFrameDuration | number;
6860
+ /** Initial sample rate in Hz. Defaults to `16000`. Re-created lazily if it changes mid-call. */
6861
+ readonly sampleRate?: KrispSampleRate | number;
6862
+ }
6863
+ /**
6864
+ * Krisp VIVA noise-reduction filter — TypeScript scaffold (NOT YET IMPLEMENTED).
6865
+ *
6866
+ * Construction throws with a guidance message because Krisp does not ship a
6867
+ * Node.js SDK. The class exists for API parity with the Python
6868
+ * `KrispVivaFilter` so that user code does not need to be rewritten when a
6869
+ * Node binding lands.
6870
+ *
6871
+ * For TS users today, use {@link DeepFilterNetFilter} from
6872
+ * `./deepfilternet-filter` instead — same `AudioFilter` interface, no
6873
+ * license required.
6874
+ *
6875
+ * @example
6876
+ * ```ts
6877
+ * // FUTURE — when Krisp publishes a Node SDK:
6878
+ * import { KrispVivaFilter } from 'getpatter/providers/krisp-filter';
6879
+ * const filter = new KrispVivaFilter({ modelPath: '/path/to/model.kef' });
6880
+ * const agent = phone.agent({ audioFilter: filter, ... });
6881
+ * ```
6882
+ */
6883
+ declare class KrispVivaFilter implements AudioFilter {
6884
+ static readonly providerKey = "krisp_viva";
6885
+ constructor(_options?: KrispVivaFilterOptions);
6886
+ process(pcmChunk: Buffer, _sampleRate: number): Promise<Buffer>;
6887
+ close(): Promise<void>;
6888
+ }
6889
+
6890
+ /**
6891
+ * OpenAI Realtime adapter for the GA Realtime API (`gpt-realtime-2`).
6892
+ *
6893
+ * `gpt-realtime-2` is served from the same `wss://api.openai.com/v1/realtime`
6894
+ * endpoint as the v1-beta family, but the GA endpoint:
6895
+ * - REJECTS the legacy `OpenAI-Beta: realtime=v1` header (returns
6896
+ * `invalid_model` with message "Model X is only available on the GA API").
6897
+ * - REQUIRES `session.type === "realtime"` at the root of `session.update`.
6898
+ * - Uses `output_modalities` (was `modalities`).
6899
+ * - Nests audio config under `audio.{input,output}` with MIME `type`
6900
+ * strings (`audio/pcmu`, `audio/pcma`, `audio/pcm`) instead of the v1
6901
+ * enum strings (`g711_ulaw`, `g711_alaw`, `pcm16`) and moves `voice`
6902
+ * under `audio.output.voice`, `transcription` + `turn_detection`
6903
+ * under `audio.input`.
6904
+ *
6905
+ * Everything ELSE (event names, audio delta dispatch, barge-in / truncate
6906
+ * semantics, heartbeat, tool calling) is API-compatible with the v1 family,
6907
+ * so this adapter subclasses {@link OpenAIRealtimeAdapter} and overrides
6908
+ * only `connect()`. The runtime behaviour (`sendAudio`, `cancelResponse`,
6909
+ * `sendText`, `sendFirstMessage`, …) is inherited unchanged.
6910
+ */
6911
+
6912
+ /**
6913
+ * Realtime WebSocket adapter speaking OpenAI's GA Realtime API.
6914
+ *
6915
+ * Note on audio transport: the GA endpoint accepts only PCM-16-LE with
6916
+ * `rate >= 24000` for both `session.audio.input.format` and
6917
+ * `session.audio.output.format`. The `audio/pcmu` MIME type appears to be
6918
+ * accepted at the protocol level but the server's audio engine does not
6919
+ * actually decode mulaw 8 kHz frames — they're silently dropped, the input
6920
+ * buffer stays empty, `input_audio_buffer.commit` returns
6921
+ * "buffer only has 0.00ms of audio", and the call ends up muted. Until
6922
+ * OpenAI documents native g711_ulaw on the GA endpoint we transcode on
6923
+ * both directions on the Patter side:
6924
+ * - inbound (Twilio/Telnyx → model): mulaw 8 kHz → PCM 24 kHz
6925
+ * - outbound (model → Twilio/Telnyx): PCM 24 kHz → mulaw 8 kHz
6926
+ *
6927
+ * The outbound path needs a stateful resampler instance because the
6928
+ * 24 kHz → 8 kHz decimator carries phase between chunks; sharing a single
6929
+ * instance across the call eliminates the boundary clicks a stateless
6930
+ * helper would produce on every audio delta.
6931
+ */
6932
+ declare class OpenAIRealtime2Adapter extends OpenAIRealtimeAdapter {
6933
+ /** Two-stage outbound resampler for 24 kHz → 8 kHz. Created lazily on
6934
+ * the first audio frame so each Realtime session has its own state.
6935
+ *
6936
+ * We chain `24k → 16k → 8k` instead of using the direct `24k → 8k`
6937
+ * variant of {@link StatefulResampler}: the direct path is a 3:1
6938
+ * decimation with linear interpolation only — no anti-alias filter
6939
+ * — so any energy above 4 kHz in the source aliases down into the
6940
+ * audible band and is heard as raspy/scratchy artefacts on speech.
6941
+ * `gpt-realtime-2` outputs voice with significant content above
6942
+ * 4 kHz. The second stage (16k → 8k) uses a 5-tap FIR anti-alias
6943
+ * filter which removes the offending band before decimation, and
6944
+ * empirically (see commit message) the chain produces audibly
6945
+ * cleaner output. The 24k → 16k step is still pure linear-interp
6946
+ * but the inputs to it stay below the Nyquist of the 16 kHz stage,
6947
+ * so it doesn't introduce new artefacts.
6948
+ */
6949
+ private outboundResampler24To16;
6950
+ private outboundResampler16To8;
6951
+ /** Last 8 kHz input sample carried across chunk boundaries for the
6952
+ * direct 3× linear upsample (see `transcodeInboundMulaw8ToPcm24`).
6953
+ * The carry guarantees the very first output of each chunk
6954
+ * interpolates from the *real* preceding sample, not from the chunk's
6955
+ * own first sample replicated — without it every 20 ms Twilio frame
6956
+ * boundary becomes a small DC step that the GA server VAD interprets
6957
+ * as constant low-energy noise, which never crosses the speech
6958
+ * threshold. */
6959
+ private inbound8kCarry;
6960
+ /** GA-shape `session.update` payload. See module-level docstring. */
6961
+ private buildGASessionConfig;
6962
+ /**
6963
+ * Open the Realtime WebSocket against the GA endpoint and apply the GA
6964
+ * session configuration. Header `OpenAI-Beta: realtime=v1` is OMITTED
6965
+ * (the GA endpoint rejects it). Wire shape uses nested `audio.{input,
6966
+ * output}` + `output_modalities` + `session.type === "realtime"`.
6967
+ */
6968
+ connect(): Promise<void>;
6969
+ /**
6970
+ * GA-API variant of {@link OpenAIRealtimeAdapter.openParkedConnection}.
6971
+ * Opens a fresh Realtime WS against the GA endpoint, exchanges
6972
+ * `session.created` → GA-shape `session.update` → `session.updated`
6973
+ * so the upstream session is fully primed, and returns the OPEN
6974
+ * socket WITHOUT taking it on `this.ws` or arming the heartbeat /
6975
+ * message listener.
6976
+ *
6977
+ * Used by `Patter.parkProviderConnections` during the carrier
6978
+ * ringing window so the per-call `StreamHandler` can adopt the
6979
+ * primed socket at carrier `start` — eliminating the TCP + TLS +
6980
+ * HTTP-101 + `session.update` ack round-trip from the critical path.
6981
+ * Saves ~300-600 ms of first-audible-word latency.
6982
+ *
6983
+ * Bounded by 8 s. Throws on timeout / handshake failure / GA-side
6984
+ * rejection. Callers treat any error as a cache miss and fall
6985
+ * through to the cold {@link connect} path.
6986
+ *
6987
+ * Billing safety: confirmed by OpenAI's Managing Realtime Costs
6988
+ * guide — `session.update` does NOT invoke the model and bills no
6989
+ * tokens. An idle parked socket costs $0.
6990
+ */
6991
+ openParkedConnection(): Promise<WebSocket__default>;
6992
+ /**
6993
+ * GA-API variant of {@link OpenAIRealtimeAdapter.adoptWebSocket}. Takes
6994
+ * over a WS that {@link openParkedConnection} produced (already through
6995
+ * `session.created` + `session.update` + `session.updated`) and arms
6996
+ * the heartbeat + message listener so the GA event-translation shim
6997
+ * is wired up. Skips the cold-connect path — saves ~300-600 ms on
6998
+ * first audible word.
6999
+ *
7000
+ * Caller MUST verify `ws.readyState === OPEN` before calling. If the
7001
+ * parked WS died between park and adopt, fall back to {@link connect}.
7002
+ */
7003
+ adoptWebSocket(ws: WebSocket__default): void;
7004
+ /**
7005
+ * GA-API variant of {@link OpenAIRealtimeAdapter.sendFirstMessage}. Two
7006
+ * differences from the v1 path:
7007
+ *
7008
+ * 1. The v1 implementation sends `response.modalities` which the GA
7009
+ * endpoint rejects with `Unknown parameter: 'response.modalities'`.
7010
+ * Use `output_modalities` to match the GA `session.update` shape.
7011
+ *
7012
+ * 2. The GA `response.create` does NOT inherit `audio.output.voice`
7013
+ * from the session — it falls back to the server-side default
7014
+ * (`marin`, female) when the field is omitted on the response
7015
+ * itself. Session-level `voice: "alloy"` only affects subsequent
7016
+ * server-VAD-triggered responses, NOT this explicit
7017
+ * `response.create`. We re-inject the configured voice here so the
7018
+ * first-message voice matches the rest of the call.
7019
+ */
7020
+ /**
7021
+ * Override the parent `sendAudio` to transcode inbound carrier audio
7022
+ * (mulaw 8 kHz from Twilio/Telnyx) into PCM-16 24 kHz before sending
7023
+ * `input_audio_buffer.append`. The GA server's audio engine ignores
7024
+ * mulaw frames (commit returns "buffer only has 0.00ms of audio") even
7025
+ * though it accepts `audio/pcmu` at the protocol level.
7026
+ */
7027
+ sendAudio(mulawAudio: Buffer): void;
7028
+ /**
7029
+ * mulaw 8 kHz Buffer → PCM-16-LE 24 kHz Buffer.
7030
+ *
7031
+ * Direct 3× linear-interpolation upsample with a one-sample carry
7032
+ * across chunk boundaries. For every consecutive pair of 8 kHz
7033
+ * samples `(s_a, s_b)` we emit three 24 kHz samples:
7034
+ *
7035
+ * out_0 = s_a
7036
+ * out_1 = 2/3·s_a + 1/3·s_b
7037
+ * out_2 = 1/3·s_a + 2/3·s_b
7038
+ *
7039
+ * The carry stores the last 8 kHz sample of the chunk so the next
7040
+ * chunk can start by pairing `(carry, firstNewSample)` — that's what
7041
+ * keeps the output rate exact (each input sample → 3 output samples)
7042
+ * and eliminates the chunk-boundary DC step that confused the GA
7043
+ * server VAD. The first chunk has no carry and loses 3 output samples
7044
+ * at the leading edge (125 µs at 24 kHz); that's well below any audible
7045
+ * artefact and well below the GA VAD's 300 ms prefix-padding window.
7046
+ */
7047
+ private transcodeInboundMulaw8ToPcm24;
7048
+ /**
7049
+ * Base64 PCM-16-LE 24 kHz → Base64 mulaw 8 kHz. Used by the WS
7050
+ * translation shim on each `response.output_audio.delta`. The stateful
7051
+ * resamplers are created lazily and reused across all deltas in this
7052
+ * session so the 24k → 16k → 8k chain's phase carries across chunk
7053
+ * boundaries — without that, every chunk boundary produces a click.
7054
+ */
7055
+ private transcodeOutboundPcm24ToMulaw8Buffer;
7056
+ sendFirstMessage(text: string): Promise<void>;
5642
7057
  }
5643
7058
 
5644
7059
  /**
@@ -6273,10 +7688,18 @@ declare class TwilioAdapter {
6273
7688
  /** Place an outbound call. Returns the Twilio call SID. */
6274
7689
  initiateCall(opts: InitiateCallOptions$1): Promise<InitiateCallResult$1>;
6275
7690
  /**
6276
- * Build a minimal ``<Response><Connect><Stream url="..."/></Connect></Response>``
6277
- * TwiML document. Mirrors the Python adapter's ``generate_stream_twiml``.
7691
+ * Build a ``<Response><Connect><Stream url="...">`` TwiML document.
7692
+ *
7693
+ * ``parameters`` is forwarded as ``<Parameter name="..." value="..."/>``
7694
+ * children of ``<Stream>``. Twilio Media Streams strips query-string params
7695
+ * from the ``<Stream url=...>`` before the WS handshake, so
7696
+ * ``<Parameter>`` tags are the supported way to pre-populate
7697
+ * ``start.customParameters`` on the WS ``start`` frame. Used by the
7698
+ * inbound path to carry caller / callee through to the bridge.
7699
+ *
7700
+ * Mirrors the Python adapter's ``generate_stream_twiml``.
6278
7701
  */
6279
- static generateStreamTwiml(streamUrl: string): string;
7702
+ static generateStreamTwiml(streamUrl: string, parameters?: Record<string, string>): string;
6280
7703
  /** Force-complete an in-progress call. */
6281
7704
  endCall(callSid: string): Promise<void>;
6282
7705
  }
@@ -6379,6 +7802,8 @@ declare class TelnyxSTT {
6379
7802
  private readonly transcriptionEngine;
6380
7803
  private readonly sampleRate;
6381
7804
  private readonly baseUrl;
7805
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
7806
+ static readonly providerKey = "telnyx_stt";
6382
7807
  private ws;
6383
7808
  private callbacks;
6384
7809
  private headerSent;
@@ -6425,6 +7850,8 @@ declare class TelnyxTTS {
6425
7850
  private readonly apiKey;
6426
7851
  private readonly voice;
6427
7852
  private readonly baseUrl;
7853
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
7854
+ static readonly providerKey = "telnyx_tts";
6428
7855
  constructor(apiKey: string, voice?: string, baseUrl?: string);
6429
7856
  /** Collect every audio chunk into a single Buffer. */
6430
7857
  synthesize(text: string): Promise<Buffer>;
@@ -6504,4 +7931,4 @@ interface CallEvent {
6504
7931
  readonly direction?: string;
6505
7932
  }
6506
7933
 
6507
- export { type AgentOptions, type AgentState, AllProvidersFailedError, type AnthropicConversion, LLM$3 as AnthropicLLM, type AnthropicLLMOptions, type AnthropicMessage, AssemblyAIEncoding, AssemblyAIModel, STT$1 as AssemblyAISTT, type AssemblyAISTTOptions, type AudioConfig, type AudioSource, AuthenticationError, type BackgroundAudioOptions, BackgroundAudioPlayer, BuiltinAudioClip, type BuiltinAudioClipName, type BuiltinPcmSource, type CallControl, type CallEvent, type CallEventHandler, type CallMetrics, CallMetricsAccumulator, type CallRecord, type CartesiaEncoding, STT$3 as CartesiaSTT, type CartesiaSTTOptions, TTS$3 as CartesiaTTS, type CartesiaTTSOptions, LLM$1 as CerebrasLLM, type CerebrasLLMOptions, ChatContext, type ChatMessage, type ChatRole, CloudflareTunnel, type ConversationStateSnapshot, type CostBreakdown, DEFAULT_MIN_SENTENCE_LEN, DEFAULT_PRICING, DTMF_EVENTS, STT$6 as DeepgramSTT, type DeepgramSTTOptions, DefaultToolExecutor, type DefaultToolExecutorOptions, type DefineToolInput, type DtmfEvent, ConvAI as ElevenLabsConvAI, ElevenLabsConvAIAdapter, type ConvAIOptions as ElevenLabsConvAIOptions, TTS$6 as ElevenLabsTTS, type ElevenLabsTTSOptions, type ElevenLabsWebSocketOptions, TTS$5 as ElevenLabsWebSocketTTS, type EouTrigger, ErrorCode, EventBus, FallbackLLMProvider, type FallbackLLMProviderOptions, type FilePcmSource, GEMINI_DEFAULT_INPUT_SR, GEMINI_DEFAULT_OUTPUT_SR, GeminiLiveAdapter, type GeminiLiveEventHandler, LLM as GoogleLLM, type GoogleLLMOptions, LLM$2 as GroqLLM, type GroqLLMOptions, Guardrail$1 as Guardrail, type GuardrailOptions, type HookContext, IVRActivity, type IVRActivityOptions, type IVRToolDefinition, type IncomingMessage, type InitTracingOptions, TTS as InworldTTS, type InworldTTSOptions, type JobCallback, type LLMChunk, LLMLoop, type LLMProvider, LMNTAudioFormat, LMNTModel, LMNTSampleRate, TTS$1 as LMNTTTS, type LMNTTTSOptions, type LatencyBreakdown, type LocalCallOptions, type LocalConfig, type LocalOptions, type Logger, type 
LoopCallback, type MessageHandler, MetricsStore, Ngrok, LLM$4 as OpenAILLM, type OpenAILLMOptions, OpenAILLMProvider, type OpenAIMessage, Realtime as OpenAIRealtime, OpenAIRealtimeAdapter, type RealtimeOptions as OpenAIRealtimeOptions, TTS$4 as OpenAITTS, type OpenAITTSOptions, STT$4 as OpenAITranscribeSTT, type OpenAITranscribeSTTOptions, type ParamSpec, PartialStreamError, Patter, PatterConnectionError, PatterError, type PatterEventType, PatterTool, type PatterToolExecuteArgs, type PatterToolOptions, type PatterToolResult, PcmCarry, PipelineHookExecutor, type PipelineHooks, type PipelineMessageHandler, type ProviderPricing, ProvisionError, RateLimitError, type RawPcmSource, type RealtimeConfig, RemoteMessageHandler, TTS$2 as RimeTTS, type RimeTTSOptions, SPAN_BARGEIN, SPAN_CALL, SPAN_ENDPOINT, SPAN_LLM, SPAN_STT, SPAN_TOOL, SPAN_TTS, type SSEEvent, type STTConfig, type ScheduleHandle, SentenceChunker, type ServeOptions, type SilenceCallback, type SileroSampleRate, SileroVAD, type SileroVADOptions, STT$2 as SonioxSTT, type SonioxSTTOptions$1 as SonioxSTTOptions, type Span, type SpeechEventCallback, SpeechEvents, SpeechmaticsAudioEncoding, SpeechmaticsOperatingPoint, STT as SpeechmaticsSTT, type SpeechmaticsSTTOptions, SpeechmaticsSampleRate, SpeechmaticsServerMessage, TurnDetectionMode as SpeechmaticsTurnDetectionMode, StatefulResampler, type StatefulResamplerOptions, Static as StaticTunnel, type TTSConfig, Carrier as Telnyx, TelnyxAdapter, type TelnyxCarrierOptions, type ConfigureNumberOptions as TelnyxConfigureNumberOptions, type EndCallOptions as TelnyxEndCallOptions, type InitiateCallOptions as TelnyxInitiateCallOptions, type InitiateCallResult as TelnyxInitiateCallResult, type ProvisionNumberOptions as TelnyxProvisionNumberOptions, type ProvisionNumberResult as TelnyxProvisionNumberResult, TelnyxSTT, TelnyxSTTInputFormat, TelnyxSTTSampleRate, type Transcript as TelnyxSTTTranscript, TelnyxTTS, TelnyxTTSSampleRate, TelnyxTTSVoice, type 
TelnyxTranscriptionEngine, TestSession, TfidfLoopDetector, type TfidfLoopDetectorOptions, Tool, type ToolDefinition, type ToolExecutor, type ToolHandler, type ToolOptions, type TunnelHandle, type TurnMetrics, Carrier$1 as Twilio, TwilioAdapter, type TwilioAdapterOptions, type TwilioCarrierOptions, type ConfigureNumberOptions$1 as TwilioConfigureNumberOptions, type InitiateCallOptions$1 as TwilioInitiateCallOptions, type InitiateCallResult$1 as TwilioInitiateCallResult, type ProvisionNumberOptions$1 as TwilioProvisionNumberOptions, type ProvisionNumberResult$1 as TwilioProvisionNumberResult, ULTRAVOX_DEFAULT_API_BASE, ULTRAVOX_DEFAULT_SR, type UltravoxEventHandler, UltravoxRealtimeAdapter, type UserState, STT$5 as WhisperSTT, type WhisperSTTOptions, assemblyai, builtinClipPath, calculateRealtimeCost, calculateSttCost, calculateTelephonyCost, calculateTtsCost, callsToCsv, callsToJson, cartesia, createResampler16kTo8k, createResampler24kTo16k, createResampler24kTo8k, createResampler8kTo16k, deepgram, defineTool, elevenlabs, filterEmoji, filterForTTS, filterMarkdown, formatDtmf, geminiLive, getLogger, guardrail, initTracing, isRemoteUrl, isTracingEnabled, isWebSocketUrl, lmnt, makeAuthMiddleware, mergePricing, mixPcm, mountApi, mountDashboard, mulawToPcm16, notifyDashboard, openaiTts, pcm16ToMulaw, resample16kTo8k, resample24kTo16k, resample8kTo16k, resamplePcm, rime, scheduleCron, scheduleInterval, scheduleOnce, selectSoundFromList, setLogger, soniox, speechmatics, startSpan, startTunnel, tool, ultravox, whisper };
7934
+ export { type AgentOptions, type AgentState, AllProvidersFailedError, type AnthropicConversion, LLM$3 as AnthropicLLM, type AnthropicLLMOptions, type AnthropicMessage, AssemblyAIEncoding, AssemblyAIModel, STT$1 as AssemblyAISTT, type AssemblyAISTTOptions, type AudioConfig, type AudioSource, AuthenticationError, type BackgroundAudioOptions, BackgroundAudioPlayer, type EvaluateContext as BargeInEvaluateContext, type BargeInStrategy, BuiltinAudioClip, type BuiltinAudioClipName, type BuiltinPcmSource, type CallControl, type CallEvent, type CallEventHandler, type CallMetrics, CallMetricsAccumulator, type CallRecord, type CartesiaEncoding, STT$3 as CartesiaSTT, type CartesiaSTTOptions, TTS$3 as CartesiaTTS, CartesiaTTSModel, type CartesiaTTSOptions, CartesiaTTSVoiceMode, LLM$1 as CerebrasLLM, type CerebrasLLMOptions, ChatContext, type ChatMessage, type ChatRole, CloudflareTunnel, type ConversationStateSnapshot, type CostBreakdown, DEFAULT_MIN_SENTENCE_LEN, DEFAULT_PRICING, DTMF_EVENTS, DeepFilterNetFilter, type DeepFilterNetOptions, DeepgramModel, STT$6 as DeepgramSTT, type DeepgramSTTOptions, DefaultToolExecutor, type DefaultToolExecutorOptions, type DefineToolInput, type DtmfEvent, ConvAI as ElevenLabsConvAI, ElevenLabsConvAIAdapter, type ConvAIOptions as ElevenLabsConvAIOptions, ElevenLabsModel, ElevenLabsOutputFormat, ElevenLabsTTS as ElevenLabsRestTTS, TTS$6 as ElevenLabsTTS, type ElevenLabsTTSOptions, type ElevenLabsWebSocketOptions, TTS$5 as ElevenLabsWebSocketTTS, type EouTrigger, ErrorCode, EventBus, FallbackLLMProvider, type FallbackLLMProviderOptions, type FilePcmSource, GEMINI_DEFAULT_INPUT_SR, GEMINI_DEFAULT_OUTPUT_SR, GeminiLiveAdapter, type GeminiLiveEventHandler, LLM as GoogleLLM, type GoogleLLMOptions, LLM$2 as GroqLLM, type GroqLLMOptions, Guardrail$1 as Guardrail, type GuardrailOptions, type HookContext, IVRActivity, type IVRActivityOptions, type IVRToolDefinition, type IncomingMessage, type InitTracingOptions, TTS as InworldTTS, type 
InworldTTSOptions, type JobCallback, KrispFrameDuration, KrispSampleRate, KrispVivaFilter, type KrispVivaFilterOptions, type LLMChunk, LLMLoop, type LLMProvider, LMNTAudioFormat, LMNTModel, LMNTSampleRate, TTS$1 as LMNTTTS, type LMNTTTSOptions, type LatencyBreakdown, type LocalCallOptions, type LocalConfig, type LocalOptions, type Logger, type LoopCallback, type MessageHandler, MetricsStore, MinWordsStrategy, type MinWordsStrategyOptions, type ModelPricing, Ngrok, LLM$4 as OpenAILLM, type OpenAILLMOptions, OpenAILLMProvider, type OpenAIMessage, Realtime as OpenAIRealtime, Realtime2 as OpenAIRealtime2, OpenAIRealtime2Adapter, type Realtime2Options as OpenAIRealtime2Options, OpenAIRealtimeAdapter, OpenAIRealtimeAudioFormat, OpenAIRealtimeModel, type RealtimeOptions as OpenAIRealtimeOptions, OpenAIRealtimeVADType, TTS$4 as OpenAITTS, type OpenAITTSOptions, STT$4 as OpenAITranscribeSTT, type OpenAITranscribeSTTOptions, OpenAITranscriptionModel, OpenAIVoice, PRICING_LAST_UPDATED, PRICING_VERSION, type ParamSpec, PartialStreamError, Patter, PatterConnectionError, PatterError, type PatterEventType, PatterTool, type PatterToolExecuteArgs, type PatterToolOptions, type PatterToolResult, PcmCarry, PipelineHookExecutor, type PipelineHooks, type PipelineMessageHandler, PricingUnit, type PricingUnitValue, type ProviderPricing, ProvisionError, RateLimitError, type RawPcmSource, type RealtimeConfig, RemoteMessageHandler, RimeAudioFormat, RimeModel, TTS$2 as RimeTTS, type RimeTTSOptions, SPAN_BARGEIN, SPAN_CALL, SPAN_ENDPOINT, SPAN_LLM, SPAN_STT, SPAN_TOOL, SPAN_TTS, type SSEEvent, type STTConfig, type ScheduleHandle, SentenceChunker, type ServeOptions, type SilenceCallback, type SileroSampleRate, SileroVAD, type SileroVADOptions, STT$2 as SonioxSTT, type SonioxSTTOptions$1 as SonioxSTTOptions, type Span, type SpeechEventCallback, SpeechEvents, SpeechmaticsAudioEncoding, SpeechmaticsOperatingPoint, STT as SpeechmaticsSTT, type SpeechmaticsSTTOptions, SpeechmaticsSampleRate, 
SpeechmaticsServerMessage, TurnDetectionMode as SpeechmaticsTurnDetectionMode, StatefulResampler, type StatefulResamplerOptions, Static as StaticTunnel, type TTSConfig, Carrier as Telnyx, TelnyxAdapter, type TelnyxCarrierOptions, type ConfigureNumberOptions as TelnyxConfigureNumberOptions, type EndCallOptions as TelnyxEndCallOptions, type InitiateCallOptions as TelnyxInitiateCallOptions, type InitiateCallResult as TelnyxInitiateCallResult, type ProvisionNumberOptions as TelnyxProvisionNumberOptions, type ProvisionNumberResult as TelnyxProvisionNumberResult, TelnyxSTT, TelnyxSTTInputFormat, TelnyxSTTSampleRate, type Transcript as TelnyxSTTTranscript, TelnyxTTS, TelnyxTTSSampleRate, TelnyxTTSVoice, type TelnyxTranscriptionEngine, TestSession, TfidfLoopDetector, type TfidfLoopDetectorOptions, Tool, type ToolDefinition, type ToolExecutor, type ToolHandler, type ToolOptions, type TunnelHandle, type TurnMetrics, Carrier$1 as Twilio, TwilioAdapter, type TwilioAdapterOptions, type TwilioCarrierOptions, type ConfigureNumberOptions$1 as TwilioConfigureNumberOptions, type InitiateCallOptions$1 as TwilioInitiateCallOptions, type InitiateCallResult$1 as TwilioInitiateCallResult, type ProvisionNumberOptions$1 as TwilioProvisionNumberOptions, type ProvisionNumberResult$1 as TwilioProvisionNumberResult, ULTRAVOX_DEFAULT_API_BASE, ULTRAVOX_DEFAULT_SR, type UltravoxEventHandler, UltravoxRealtimeAdapter, type UserState, STT$5 as WhisperSTT, type WhisperSTTOptions, assemblyai, builtinClipPath, calculateRealtimeCost, calculateSttCost, calculateTelephonyCost, calculateTtsCost, callsToCsv, callsToJson, cartesia, createResampler16kTo8k, createResampler24kTo16k, createResampler24kTo8k, createResampler8kTo16k, deepgram, defineTool, elevenlabs, evaluateStrategies as evaluateBargeInStrategies, filterEmoji, filterForTTS, filterMarkdown, formatDtmf, geminiLive, getLogger, guardrail, initTracing, isRemoteUrl, isTracingEnabled, isWebSocketUrl, lmnt, makeAuthMiddleware, mergePricing, mixPcm, 
mountApi, mountDashboard, mulawToPcm16, notifyDashboard, openaiTts, pcm16ToMulaw, resample16kTo8k, resample24kTo16k, resample8kTo16k, resamplePcm, resetStrategies as resetBargeInStrategies, rime, scheduleCron, scheduleInterval, scheduleOnce, selectSoundFromList, setLogger, soniox, speechmatics, startSpan, startTunnel, tool, ultravox, whisper };