getpatter 0.6.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,3 +1,5 @@
1
+ import * as WebSocket from 'ws';
2
+ import WebSocket__default from 'ws';
1
3
  import { EventEmitter } from 'events';
2
4
  import { Request, Response, NextFunction, Express } from 'express';
3
5
 
@@ -104,6 +106,61 @@ declare class Realtime {
104
106
  constructor(opts?: RealtimeOptions);
105
107
  }
106
108
 
109
+ /**
110
+ * OpenAI Realtime 2 engine — marker class for Patter client dispatch.
111
+ *
112
+ * Wraps `gpt-realtime-2` (GA Realtime API). Separate marker from
113
+ * {@link Realtime} because the GA endpoint speaks a
114
+ * different `session.update` wire shape; the client dispatches to
115
+ * `OpenAIRealtime2Adapter` when this marker is passed.
116
+ */
117
+ /** Constructor options for the OpenAI `Realtime2` engine marker. */
118
+ interface Realtime2Options {
119
+ /** API key. Falls back to OPENAI_API_KEY env var when omitted. */
120
+ apiKey?: string;
121
+ /** GA Realtime model. Defaults to `gpt-realtime-2`. */
122
+ model?: string;
123
+ /** Voice preset. Defaults to alloy. */
124
+ voice?: string;
125
+ /**
126
+ * Reasoning-effort tier. When omitted the field is not sent and the
127
+ * server default applies. OpenAI recommends `"low"` for production
128
+ * voice flows — higher tiers add measurable per-turn latency.
129
+ */
130
+ reasoningEffort?: 'minimal' | 'low' | 'medium' | 'high';
131
+ /**
132
+ * Override for `audio.input.transcription.model`. Omit to keep the
133
+ * adapter default (`whisper-1`). Use `"gpt-realtime-whisper"` for
134
+ * low-latency transcript partials.
135
+ */
136
+ inputAudioTranscriptionModel?: string;
137
+ }
138
+ /**
139
+ * OpenAI Realtime 2 engine marker — selects `gpt-realtime-2` on the GA
140
+ * Realtime API.
141
+ *
142
+ * @example
143
+ * ```ts
144
+ * import { Patter, Twilio, OpenAIRealtime2 } from "getpatter";
145
+ *
146
+ * const phone = new Patter({ carrier: new Twilio(), phoneNumber: "+1..." });
147
+ * const agent = phone.agent({
148
+ * engine: new OpenAIRealtime2({ reasoningEffort: "low" }),
149
+ * systemPrompt: "You are a friendly receptionist.",
150
+ * firstMessage: "Hello! How can I help?",
151
+ * });
152
+ * ```
153
+ */
154
+ declare class Realtime2 {
155
+ readonly kind: "openai_realtime_2";
156
+ readonly apiKey: string;
157
+ readonly model: string;
158
+ readonly voice: string;
159
+ readonly reasoningEffort?: 'minimal' | 'low' | 'medium' | 'high';
160
+ readonly inputAudioTranscriptionModel?: string;
161
+ constructor(opts?: Realtime2Options);
162
+ }
163
+
107
164
  /** ElevenLabs ConvAI engine — marker class for Patter client dispatch. */
108
165
  /** Constructor options for the ElevenLabs `ConvAI` engine marker. */
109
166
  interface ConvAIOptions {
@@ -273,71 +330,6 @@ declare class Tool implements ToolDefinition {
273
330
  /** Factory helper mirroring Python's `tool(...)` function. */
274
331
  declare function tool(opts: ToolOptions): Tool;
275
332
 
276
- /**
277
- * Shared STT / TTS adapter dispatch.
278
- *
279
- * In v0.5.0+ callers always pass pre-instantiated adapters (``agent.stt`` /
280
- * ``agent.tts`` are ``STTAdapter`` / ``TTSAdapter`` instances), so these
281
- * helpers are thin pass-throughs that return the instance or null. Kept as
282
- * functions so the Twilio/Telnyx bridges have a single dispatch point.
283
- */
284
-
285
- /** Per-word timings / metadata (Deepgram-shaped). Optional on every adapter. */
286
- interface STTWord {
287
- readonly word?: string;
288
- readonly start?: number;
289
- readonly end?: number;
290
- readonly confidence?: number;
291
- readonly punctuated_word?: string;
292
- readonly speaker?: number;
293
- }
294
- /**
295
- * Facade transcript shape — widened to surface richer provider fields
296
- * (Deepgram emits all of them) without forcing adapters that only know
297
- * ``text``/``isFinal`` to change. All non-text fields are optional.
298
- */
299
- interface STTTranscript {
300
- text: string;
301
- isFinal?: boolean;
302
- /** Overall transcript confidence in [0, 1]. */
303
- confidence?: number;
304
- /** Provider-side end-of-utterance hint (faster than ``isFinal``). */
305
- speechFinal?: boolean;
306
- /** True when the result was produced in response to a Finalize command. */
307
- fromFinalize?: boolean;
308
- /** Provider request id (Deepgram populates this from the Metadata frame). */
309
- requestId?: string;
310
- /** Per-word timings / metadata when the provider emits them. */
311
- words?: ReadonlyArray<STTWord>;
312
- /** Which provider event this transcript represents (e.g. ``Results``). */
313
- eventType?: string;
314
- }
315
- /** Callback invoked by an `STTAdapter` for each (partial or final) transcript event. */
316
- type STTTranscriptCallback = (t: STTTranscript) => Promise<void> | void;
317
- /** Shape shared by every STT adapter in the SDK. */
318
- interface STTAdapter {
319
- connect(): Promise<void>;
320
- sendAudio(pcm: Buffer): void | Promise<void>;
321
- onTranscript(cb: STTTranscriptCallback): void;
322
- close(): void | Promise<void>;
323
- /**
324
- * Optional: ask the provider to immediately finalise the in-flight
325
- * utterance (rather than waiting for its own endpoint timer). Called by
326
- * ``StreamHandler`` whenever the SDK's VAD signals ``speech_end``, and
327
- * after a barge-in cancel — both moments where waiting for the
328
- * provider's endpoint heuristic stalls the next turn.
329
- *
330
- * Implementations that do not support utterance-level finalisation
331
- * (e.g. one-shot transcribers like Whisper) should omit this method
332
- * entirely; the stream handler does an optional-chained call.
333
- */
334
- finalize?(): void | Promise<void>;
335
- }
336
- /** Shape shared by every TTS adapter in the SDK. */
337
- interface TTSAdapter {
338
- synthesizeStream(text: string): AsyncIterable<Buffer>;
339
- }
340
-
341
333
  /**
342
334
  * Pipeline hook executor for pipeline mode.
343
335
  *
@@ -616,6 +608,22 @@ interface LLMStreamOptions {
616
608
  }
617
609
  interface LLMProvider {
618
610
  stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
611
+ /**
612
+ * Optional best-effort pre-call DNS / TLS / HTTP-keepalive warmup.
613
+ *
614
+ * Called once per outbound call from ``Patter.call`` when the agent has
615
+ * ``prewarm: true`` (the default). Concrete providers (OpenAI,
616
+ * Anthropic, Google, Cerebras, Groq) override this to issue a
617
+ * lightweight HTTPS GET to their inference endpoint so by the time the
618
+ * first ``stream()`` call lands, the connection pool already has a
619
+ * warm socket. Failures are logged at debug level and never abort the
620
+ * call — pure latency optimisation.
621
+ *
622
+ * Optional on the interface (``warmup?: ...``) so providers without a
623
+ * warmup hook still satisfy the type. Detected via runtime
624
+ * ``typeof provider.warmup === 'function'`` in the client.
625
+ */
626
+ warmup?(): Promise<void>;
619
627
  }
620
628
  /** Optional sampling kwargs forwarded into the OpenAI Chat Completions body. */
621
629
  interface OpenAILLMSamplingOptions {
@@ -642,6 +650,8 @@ interface OpenAILLMSamplingOptions {
642
650
  }
643
651
  /** LLM provider backed by OpenAI Chat Completions (streaming). */
644
652
  declare class OpenAILLMProvider implements LLMProvider {
653
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
654
+ static readonly providerKey = "openai";
645
655
  private readonly apiKey;
646
656
  readonly model: string;
647
657
  private readonly temperature?;
@@ -655,6 +665,23 @@ declare class OpenAILLMProvider implements LLMProvider {
655
665
  private readonly presencePenalty?;
656
666
  private readonly stop?;
657
667
  constructor(apiKey: string, model: string, sampling?: OpenAILLMSamplingOptions);
668
+ /** Subclasses (Cerebras, Groq) override this with their own host. */
669
+ protected get baseUrl(): string;
670
+ /**
671
+ * Pre-call DNS / TLS / HTTP-keepalive warmup.
672
+ *
673
+ * Issues a lightweight ``GET ${baseUrl}/models`` so DNS, TLS and HTTP/2
674
+ * are already up by the time the first ``chat.completions`` call lands.
675
+ * Best-effort: 5 s timeout, all exceptions swallowed at debug level.
676
+ *
677
+ * Note: an HTTPS GET warms DNS + TLS + connection pool but does NOT
678
+ * warm the inference path itself; for true inference warmup a real
679
+ * low-token request is needed, left as a follow-up. STT / TTS providers ship concrete
680
+ * WebSocket-based prewarms (Cartesia / Deepgram / AssemblyAI for STT;
681
+ * ElevenLabs WS for TTS) which save 200-500 ms each — those dominate
682
+ * the cold-start latency budget.
683
+ */
684
+ warmup(): Promise<void>;
658
685
  /** Stream OpenAI Chat Completions chunks for the given messages/tools. */
659
686
  stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
660
687
  }
@@ -669,6 +696,8 @@ declare class LLMLoop {
669
696
  private eventBus?;
670
697
  private readonly _providerName;
671
698
  private readonly _modelName;
699
+ private _usageMissingCount;
700
+ private _loggedUsageFallback;
672
701
  private onToolCall?;
673
702
  constructor(apiKey: string, model: string, systemPrompt: string, tools?: ToolDefinition[] | null, llmProvider?: LLMProvider, disablePhonePreamble?: boolean);
674
703
  /**
@@ -706,6 +735,87 @@ declare class LLMLoop {
706
735
  private buildMessages;
707
736
  }
708
737
 
738
+ /**
739
+ * Barge-in confirmation strategies.
740
+ *
741
+ * When a caller starts speaking while the agent's TTS is in flight, the SDK
742
+ * has to decide whether the speech is a real interruption or just a brief
743
+ * backchannel ("uh-huh", "okay") / room noise / cough. The default
744
+ * behaviour is to treat any VAD speech_start as a confirmed barge-in and
745
+ * cancel the agent immediately. That is fine for clean inputs but
746
+ * produces frequent false positives on PSTN: the agent gets cut
747
+ * mid-sentence by background chatter, breath, or filler words and never
748
+ * recovers the conversational thread.
749
+ *
750
+ * Each ``BargeInStrategy`` is consulted on every STT transcript while a
751
+ * barge-in is *pending* (VAD fired, but the agent has not yet been
752
+ * cancelled). The first strategy that returns ``true`` confirms the
753
+ * barge-in; if none do within the configured timeout the pending state
754
+ * is dropped and the agent resumes streaming TTS as if nothing happened.
755
+ * With an empty ``bargeInStrategies`` array the SDK falls back to the
756
+ * legacy "interrupt immediately on VAD" path, so adding strategies is
757
+ * a strict opt-in.
758
+ */
759
+ interface EvaluateContext {
760
+ /** Latest STT output text (interim or final). */
761
+ readonly transcript: string;
762
+ /** ``true`` for interim partials, ``false`` for finals. */
763
+ readonly isInterim: boolean;
764
+ /** Whether the agent's TTS is currently in flight. */
765
+ readonly agentSpeaking: boolean;
766
+ }
767
+ /**
768
+ * Decides whether a pending barge-in should be confirmed.
769
+ *
770
+ * ``evaluate`` must be safe to call any number of times
771
+ * per turn. ``reset`` is invoked when the agent finishes speaking
772
+ * naturally and when a pending barge-in times out without
773
+ * confirmation.
774
+ */
775
+ interface BargeInStrategy {
776
+ evaluate(ctx: EvaluateContext): Promise<boolean> | boolean;
777
+ reset?(): Promise<void> | void;
778
+ }
779
+ interface MinWordsStrategyOptions {
780
+ /**
781
+ * Minimum word count required while the agent is speaking. Reasonable
782
+ * values are 2-5; 3 is a good starting point for production phone
783
+ * agents. Must be ``>= 1``.
784
+ */
785
+ readonly minWords: number;
786
+ /**
787
+ * When ``true`` (default), interim STT partials are evaluated as soon
788
+ * as they arrive. Set to ``false`` to wait for finals only — slower
789
+ * but free of partial-word noise on jittery STT providers.
790
+ */
791
+ readonly useInterim?: boolean;
792
+ }
793
+ /**
794
+ * Confirm barge-in only after the caller has spoken ``minWords`` words.
795
+ *
796
+ * Filters short backchannels, single-word utterances, and stray
797
+ * transcription fragments that VAD picked up but were not real
798
+ * interruptions. While the agent is silent the strategy permits any
799
+ * speech to count (one word is enough), so the first user turn is not
800
+ * delayed.
801
+ */
802
+ declare class MinWordsStrategy implements BargeInStrategy {
803
+ private readonly minWords;
804
+ private readonly useInterim;
805
+ constructor(options: MinWordsStrategyOptions);
806
+ evaluate(ctx: EvaluateContext): boolean;
807
+ reset(): Promise<void>;
808
+ }
809
+ /**
810
+ * Short-circuit-OR composition: first strategy that confirms wins.
811
+ * Returns ``false`` for an empty array so callers can use the empty
812
+ * default to mean "no opt-in confirmation, fall back to legacy
813
+ * interrupt-on-VAD".
814
+ */
815
+ declare function evaluateStrategies(strategies: readonly BargeInStrategy[], ctx: EvaluateContext): Promise<boolean>;
816
+ /** Call ``reset()`` on every strategy, swallowing per-strategy errors. */
817
+ declare function resetStrategies(strategies: readonly BargeInStrategy[]): Promise<void>;
818
+
709
819
  /**
710
820
  * Public type definitions for the Patter SDK — agent options, pipeline hooks,
711
821
  * provider config envelopes, and serve/call request/response shapes.
@@ -967,6 +1077,15 @@ interface VADEvent {
967
1077
  interface VADProvider {
968
1078
  processFrame(pcmChunk: Buffer, sampleRate: number): Promise<VADEvent | null>;
969
1079
  close(): Promise<void>;
1080
+ /**
1081
+ * Optional: reset all per-utterance state so the next ``processFrame``
1082
+ * starts from a clean SILENCE state. Useful between agent turns to
1083
+ * prevent a "stuck SPEECH" condition where PSTN echo / loopback kept the
1084
+ * detector's internal probability above the deactivation threshold for
1085
+ * the full agent turn, leaving the VAD unable to emit ``speech_start``
1086
+ * on the next user utterance (one-shot barge-in bug).
1087
+ */
1088
+ reset?(): Promise<void> | void;
970
1089
  }
971
1090
  /** Pre-STT audio filter — noise cancellation, gain, EQ. */
972
1091
  interface AudioFilter {
@@ -1062,7 +1181,7 @@ interface AgentOptions {
1062
1181
  * matching mode (``openai_realtime`` or ``elevenlabs_convai``). When absent,
1063
1182
  * pipeline mode is selected if ``stt`` and ``tts`` are provided.
1064
1183
  */
1065
- engine?: Realtime | ConvAI;
1184
+ engine?: Realtime | Realtime2 | ConvAI;
1066
1185
  /**
1067
1186
  * Provider mode. Normally derived from ``engine`` / ``stt`` + ``tts``. Pass
1068
1187
  * ``'pipeline'`` explicitly when building a pipeline-mode agent without
@@ -1103,6 +1222,59 @@ interface AgentOptions {
1103
1222
  * Default: 300.
1104
1223
  */
1105
1224
  bargeInThresholdMs?: number;
1225
+ /**
1226
+ * Opt-in barge-in confirmation strategies (pipeline mode). With the
1227
+ * default empty array the SDK falls back to the legacy
1228
+ * "interrupt immediately on VAD speech_start" behaviour. When at
1229
+ * least one strategy is provided, a VAD speech_start during TTS
1230
+ * marks the barge-in as *pending* — the agent's TTS continues
1231
+ * streaming naturally and its in-flight LLM stream is preserved —
1232
+ * and the strategies are consulted on every STT transcript. The first strategy that
1233
+ * returns ``true`` confirms the barge-in (cancels TTS, flushes the
1234
+ * inbound ring buffer); if none confirm within
1235
+ * ``bargeInConfirmMs`` the pending state is dropped and TTS resumes.
1236
+ *
1237
+ * See ``getpatter`` exports ``BargeInStrategy`` /
1238
+ * ``MinWordsStrategy`` for the protocol and a reference
1239
+ * implementation.
1240
+ */
1241
+ bargeInStrategies?: readonly BargeInStrategy[];
1242
+ /**
1243
+ * Maximum time (ms) to wait for at least one strategy to confirm a
1244
+ * pending barge-in before discarding the pending state and resuming
1245
+ * TTS. Only consulted when ``bargeInStrategies`` is non-empty.
1246
+ * Default: 1500.
1247
+ */
1248
+ bargeInConfirmMs?: number;
1249
+ /**
1250
+ * When ``true`` (default), ``Patter.call`` warms up the STT, TTS, and
1251
+ * LLM provider connections in parallel with the carrier-side
1252
+ * ``initiateCall`` request so DNS, TLS, and HTTP/2 handshakes are
1253
+ * already complete by the time the callee answers. Adapters expose a
1254
+ * ``warmup()`` method returning ``Promise<void>`` (default no-op) —
1255
+ * providers can override to dial open a persistent connection ahead
1256
+ * of the WebSocket bridge. Best-effort: warmup failures are logged
1257
+ * at debug level and never abort the call. Default: ``true``.
1258
+ */
1259
+ prewarm?: boolean;
1260
+ /**
1261
+ * When ``true`` (default ``false``), ``Patter.call`` also pre-renders
1262
+ * ``firstMessage`` to TTS audio bytes during the ringing window and
1263
+ * streams the cached buffer immediately when the carrier emits
1264
+ * ``start``. Eliminates the 200-700 ms TTS first-byte latency on the
1265
+ * greeting at the cost of paying the TTS bill even if the call is
1266
+ * never answered (silently logged at warn level when the call
1267
+ * fails). Off by default to preserve the prior cost surface; opt-in
1268
+ * for production outbound where every millisecond of greeting
1269
+ * latency hurts conversion. Default: ``false``.
1270
+ *
1271
+ * **Pipeline mode only.** Realtime / ConvAI provider modes never
1272
+ * consume the prewarm cache (the StreamHandler for those modes runs
1273
+ * its first-message emit through the provider's own audio path), so
1274
+ * ``Patter.call`` refuses to spawn the prewarm task and emits a warn
1275
+ * when ``provider !== 'pipeline'``.
1276
+ */
1277
+ prewarmFirstMessage?: boolean;
1106
1278
  /**
1107
1279
  * When true, the sentence chunker emits the first clause of each response
1108
1280
  * on a soft punctuation boundary (",", em-dash, en-dash) once ~40 chars
@@ -1194,36 +1366,449 @@ interface LocalCallOptions {
1194
1366
  to: string;
1195
1367
  agent: AgentOptions;
1196
1368
  /**
1197
- * Enable answering-machine detection. **Defaults to ``true``** — the SDK
1198
- * asks Twilio (``MachineDetection=DetectMessageEnd`` + Async AMD) or
1199
- * Telnyx (``answering_machine_detection=greeting_end``) to classify
1200
- * whoever picks up. Async AMD on Twilio adds ~0 answer-latency on human
1201
- * pickups (the call connects immediately and the result arrives via
1202
- * webhook 2-5 s later), so ON-by-default is safe. Pass ``false`` to
1203
- * disable when you want to skip per-call AMD billing or you already
1204
- * know the destination is a human.
1369
+ * Enable answering-machine detection. **Defaults to ``true``** — the SDK
1370
+ * asks Twilio (``MachineDetection=DetectMessageEnd`` + Async AMD) or
1371
+ * Telnyx (``answering_machine_detection=greeting_end``) to classify
1372
+ * whoever picks up. Async AMD on Twilio adds ~0 answer-latency on human
1373
+ * pickups (the call connects immediately and the result arrives via
1374
+ * webhook 2-5 s later), so ON-by-default is safe. Pass ``false`` to
1375
+ * disable when you want to skip per-call AMD billing or you already
1376
+ * know the destination is a human.
1377
+ */
1378
+ machineDetection?: boolean;
1379
+ /**
1380
+ * Called once when the carrier finishes the AMD check. Fires for both
1381
+ * ``human`` and ``machine`` outcomes. Combine with ``voicemailMessage``
1382
+ * to get both the legacy voicemail-drop AND a result callback (the SDK
1383
+ * fires the callback after the drop is queued). Acceptance tests use
1384
+ * this to mark a run INVALID when ``classification !== 'human'``.
1385
+ */
1386
+ onMachineDetection?: (result: MachineDetectionResult) => void | Promise<void>;
1387
+ /** If set, spoken as a voicemail message when AMD detects a machine. Implicitly enables ``machineDetection``. */
1388
+ voicemailMessage?: string;
1389
+ /** Dynamic variables merged into agent.variables before call. Override agent-level variables. */
1390
+ variables?: Record<string, string>;
1391
+ /**
1392
+ * Ring timeout in seconds. Forwarded to Twilio as `Timeout` and to Telnyx
1393
+ * as `timeout_secs`. Defaults to **25 s** — the production-recommended
1394
+ * value that limits phantom calls. Pass `60` for legacy carrier-default
1395
+ * parity, or `null` to omit the parameter entirely (carrier picks its
1396
+ * own default).
1397
+ */
1398
+ ringTimeout?: number | null;
1399
+ }
1400
+
1401
+ /**
1402
+ * Shared STT / TTS adapter dispatch.
1403
+ *
1404
+ * In v0.5.0+ callers always pass pre-instantiated adapters (``agent.stt`` /
1405
+ * ``agent.tts`` are ``STTAdapter`` / ``TTSAdapter`` instances), so these
1406
+ * helpers are thin pass-throughs that return the instance or null. Kept as
1407
+ * functions so the Twilio/Telnyx bridges have a single dispatch point.
1408
+ */
1409
+
1410
+ /** Per-word timings / metadata (Deepgram-shaped). Optional on every adapter. */
1411
+ interface STTWord {
1412
+ readonly word?: string;
1413
+ readonly start?: number;
1414
+ readonly end?: number;
1415
+ readonly confidence?: number;
1416
+ readonly punctuated_word?: string;
1417
+ readonly speaker?: number;
1418
+ }
1419
+ /**
1420
+ * Facade transcript shape — widened to surface richer provider fields
1421
+ * (Deepgram emits all of them) without forcing adapters that only know
1422
+ * ``text``/``isFinal`` to change. All non-text fields are optional.
1423
+ */
1424
+ interface STTTranscript {
1425
+ text: string;
1426
+ isFinal?: boolean;
1427
+ /** Overall transcript confidence in [0, 1]. */
1428
+ confidence?: number;
1429
+ /** Provider-side end-of-utterance hint (faster than ``isFinal``). */
1430
+ speechFinal?: boolean;
1431
+ /** True when the result was produced in response to a Finalize command. */
1432
+ fromFinalize?: boolean;
1433
+ /** Provider request id (Deepgram populates this from the Metadata frame). */
1434
+ requestId?: string;
1435
+ /** Per-word timings / metadata when the provider emits them. */
1436
+ words?: ReadonlyArray<STTWord>;
1437
+ /** Which provider event this transcript represents (e.g. ``Results``). */
1438
+ eventType?: string;
1439
+ }
1440
+ /** Callback invoked by an `STTAdapter` for each (partial or final) transcript event. */
1441
+ type STTTranscriptCallback = (t: STTTranscript) => Promise<void> | void;
1442
+ /** Shape shared by every STT adapter in the SDK. */
1443
+ interface STTAdapter {
1444
+ connect(): Promise<void>;
1445
+ sendAudio(pcm: Buffer): void | Promise<void>;
1446
+ onTranscript(cb: STTTranscriptCallback): void;
1447
+ close(): void | Promise<void>;
1448
+ /**
1449
+ * Optional: ask the provider to immediately finalise the in-flight
1450
+ * utterance (rather than waiting for its own endpoint timer). Called by
1451
+ * ``StreamHandler`` whenever the SDK's VAD signals ``speech_end``, and
1452
+ * after a barge-in cancel — both moments where waiting for the
1453
+ * provider's endpoint heuristic stalls the next turn.
1454
+ *
1455
+ * Implementations that do not support utterance-level finalisation
1456
+ * (e.g. one-shot transcribers like Whisper) should omit this method
1457
+ * entirely; the stream handler does an optional-chained call.
1458
+ */
1459
+ finalize?(): void | Promise<void>;
1460
+ /**
1461
+ * Optional best-effort pre-call DNS / TLS / HTTP-keepalive warmup.
1462
+ * Default behaviour is a no-op — adapters that benefit (e.g.
1463
+ * those backed by a WebSocket with a slow handshake) can override. Failures
1464
+ * must never abort the call.
1465
+ */
1466
+ warmup?(): Promise<void>;
1467
+ }
1468
+ /** Shape shared by every TTS adapter in the SDK. */
1469
+ interface TTSAdapter {
1470
+ synthesizeStream(text: string): AsyncIterable<Buffer>;
1471
+ /**
1472
+ * Optional best-effort pre-call DNS / TLS / HTTP-keepalive warmup.
1473
+ * Default behaviour is a no-op. Failures must never abort the call.
1474
+ */
1475
+ warmup?(): Promise<void>;
1476
+ }
1477
+
1478
+ /**
1479
+ * Known stable ElevenLabs voice models (from the official ElevenLabs API
1480
+ * reference). Exposed as a typed `as const` object so callers can pass
1481
+ * `ElevenLabsModel.FLASH_V2_5` and get autocomplete / static checking; the
1482
+ * public `modelId` option also accepts an arbitrary `string` so users can
1483
+ * pass forward-compat IDs we haven't enumerated yet.
1484
+ *
1485
+ * - `V3` — newest, highest quality (slower TTFT than Flash).
1486
+ * - `FLASH_V2_5` — current default, fastest (~75 ms TTFT).
1487
+ * - `TURBO_V2_5` — balanced quality/speed.
1488
+ * - `MULTILINGUAL_V2` — best multilingual support.
1489
+ * - `MONOLINGUAL_V1` — legacy English-only.
1490
+ */
1491
+ declare const ElevenLabsModel: {
1492
+ readonly V3: "eleven_v3";
1493
+ readonly FLASH_V2_5: "eleven_flash_v2_5";
1494
+ readonly TURBO_V2_5: "eleven_turbo_v2_5";
1495
+ readonly MULTILINGUAL_V2: "eleven_multilingual_v2";
1496
+ readonly MONOLINGUAL_V1: "eleven_monolingual_v1";
1497
+ };
1498
+ /** Union of {@link ElevenLabsModel} string values. */
1499
+ type ElevenLabsModel = (typeof ElevenLabsModel)[keyof typeof ElevenLabsModel];
1500
+ declare const ElevenLabsOutputFormat: {
1501
+ readonly MP3_22050_32: "mp3_22050_32";
1502
+ readonly MP3_44100_32: "mp3_44100_32";
1503
+ readonly MP3_44100_64: "mp3_44100_64";
1504
+ readonly MP3_44100_96: "mp3_44100_96";
1505
+ readonly MP3_44100_128: "mp3_44100_128";
1506
+ readonly MP3_44100_192: "mp3_44100_192";
1507
+ readonly PCM_8000: "pcm_8000";
1508
+ readonly PCM_16000: "pcm_16000";
1509
+ readonly PCM_22050: "pcm_22050";
1510
+ readonly PCM_24000: "pcm_24000";
1511
+ readonly PCM_44100: "pcm_44100";
1512
+ readonly ULAW_8000: "ulaw_8000";
1513
+ };
1514
+ /** Union of {@link ElevenLabsOutputFormat} string values. */
1515
+ type ElevenLabsOutputFormat = (typeof ElevenLabsOutputFormat)[keyof typeof ElevenLabsOutputFormat];
1516
+ /** ElevenLabs voice tuning knobs forwarded as `voice_settings` in the request. */
1517
+ interface ElevenLabsVoiceSettings {
1518
+ stability?: number;
1519
+ similarity_boost?: number;
1520
+ style?: number;
1521
+ use_speaker_boost?: boolean;
1522
+ }
1523
+ /** Constructor options for {@link ElevenLabsTTS}. */
1524
+ interface ElevenLabsTTSOptions$1 {
1525
+ voiceId?: string;
1526
+ /**
1527
+ * ElevenLabs voice model ID. The default ``eleven_flash_v2_5`` has the
1528
+ * lowest TTFT (~75 ms). Pass ``eleven_v3`` for highest quality, or any
1529
+ * arbitrary string for forward-compat with future models.
1530
+ */
1531
+ modelId?: ElevenLabsModel | string;
1532
+ outputFormat?: ElevenLabsOutputFormat;
1533
+ voiceSettings?: ElevenLabsVoiceSettings;
1534
+ languageCode?: string;
1535
+ chunkSize?: number;
1536
+ }
1537
+ /**
1538
+ * ElevenLabs streaming TTS adapter.
1539
+ *
1540
+ * Supported `modelId` values are autocompleted via {@link ElevenLabsModel}.
1541
+ * Default is `eleven_flash_v2_5` (lowest TTFT, ~75 ms).
1542
+ *
1543
+ * **Telephony optimization** — the constructor default
1544
+ * `outputFormat='pcm_16000'` is correct for web playback, dashboard
1545
+ * previews, and 16 kHz pipelines. For real phone calls, use the
1546
+ * carrier-specific factories instead:
1547
+ *
1548
+ * - {@link ElevenLabsTTS.forTwilio} emits `ulaw_8000` natively. Twilio's
1549
+ * media-stream WebSocket expects μ-law @ 8 kHz, so the SDK normally
1550
+ * resamples 16 kHz → 8 kHz and PCM → μ-law before sending. Asking
1551
+ * ElevenLabs to produce μ-law directly skips that step (saves
1552
+ * ~30–80 ms first-byte plus per-frame CPU and avoids any resampling
1553
+ * aliasing).
1554
+ * - {@link ElevenLabsTTS.forTelnyx} emits `pcm_16000`. Telnyx negotiates
1555
+ * L16/16000 on its bidirectional media WebSocket, so 16 kHz PCM is
1556
+ * already the format used end-to-end and no transcoding happens.
1557
+ * ElevenLabs *also* supports `ulaw_8000` if your Telnyx profile is
1558
+ * pinned to PCMU/8000 — pass `outputFormat: 'ulaw_8000'` explicitly
1559
+ * in that case.
1560
+ */
1561
+ declare class ElevenLabsTTS {
1562
+ static readonly providerKey = "elevenlabs";
1563
+ private readonly apiKey;
1564
+ private readonly voiceId;
1565
+ private readonly modelId;
1566
+ private readonly outputFormat;
1567
+ private readonly voiceSettings;
1568
+ private readonly languageCode;
1569
+ private readonly chunkSize;
1570
+ constructor(apiKey: string, voiceId?: string, modelId?: string, outputFormat?: ElevenLabsOutputFormat | string);
1571
+ constructor(apiKey: string, options: ElevenLabsTTSOptions$1);
1572
+ /**
1573
+ * Construct an instance pre-configured for Twilio Media Streams.
1574
+ *
1575
+ * Sets `outputFormat='ulaw_8000'` so ElevenLabs emits μ-law @ 8 kHz
1576
+ * directly — the exact wire format Twilio's media stream uses — letting
1577
+ * the SDK skip the 16 kHz→8 kHz resample and PCM→μ-law conversion in
1578
+ * `TwilioAudioSender`. Saves ~30–80 ms first-byte and per-frame CPU,
1579
+ * and removes a potential aliasing source.
1580
+ *
1581
+ * `voiceSettings` defaults to a low-bandwidth-friendly profile
1582
+ * (speaker boost off, modest stability) which sounds cleaner at 8 kHz
1583
+ * μ-law than the studio default. Pass an explicit object to override.
1584
+ */
1585
+ static forTwilio(apiKey: string, options?: Omit<ElevenLabsTTSOptions$1, 'outputFormat'>): ElevenLabsTTS;
1586
+ /**
1587
+ * Construct an instance pre-configured for Telnyx bidirectional media.
1588
+ *
1589
+ * Telnyx's default media-streaming codec is L16 PCM @ 16 kHz, which
1590
+ * matches our default Telnyx handler. We pick `pcm_16000` so the audio
1591
+ * flows end-to-end with zero resampling or transcoding.
1592
+ *
1593
+ * Trade-off: if your Telnyx profile is pinned to PCMU/8000 (μ-law),
1594
+ * construct `ElevenLabsTTS` directly with `outputFormat: 'ulaw_8000'`
1595
+ * — Telnyx supports that natively too.
1596
+ */
1597
+ static forTelnyx(apiKey: string, options?: Omit<ElevenLabsTTSOptions$1, 'outputFormat'>): ElevenLabsTTS;
1598
+ /**
1599
+ * Synthesise text to speech and return the full audio as a single Buffer.
1600
+ *
1601
+ * For large chunks (or when latency matters) call `synthesizeStream` instead.
1602
+ */
1603
+ synthesize(text: string): Promise<Buffer>;
1604
+ /**
1605
+ * Synthesise text and yield audio chunks as they arrive (streaming).
1606
+ *
1607
+ * The yielded buffers are raw PCM at 16 kHz (or whichever format
1608
+ * `outputFormat` selects). `chunkSize` controls the maximum yield size — 512 is a
1609
+ * good choice for low-latency telephony.
1610
+ */
1611
+ synthesizeStream(text: string): AsyncGenerator<Buffer>;
1612
+ }
1613
+
1614
+ /**
1615
+ * WebSocket-based ElevenLabs TTS provider — opt-in low-latency variant.
1616
+ *
1617
+ * Targets the ElevenLabs streaming-input WebSocket endpoint
1618
+ * (`/v1/text-to-speech/{voice_id}/stream-input`) instead of the HTTP
1619
+ * `/stream` endpoint used by `ElevenLabsTTS`. Saves the HTTP request setup
1620
+ * time per utterance (~50 ms) and avoids the HTTP cold-start TLS handshake
1621
+ * when calls are bursty.
1622
+ *
1623
+ * API matches `ElevenLabsTTS` (`synthesizeStream(text)` returns an
1624
+ * `AsyncGenerator<Buffer>`) so it can be passed anywhere a TTSAdapter is
1625
+ * expected.
1626
+ *
1627
+ * Behaviour notes
1628
+ * - WebSocket is opened **per-utterance** (matches HTTP semantics). A
1629
+ * future revision may pool a WS across utterances of the same call
1630
+ * session — see roadmap Phase 5b.
1631
+ * - `auto_mode=true` is enabled by default. Pass `autoMode: false` to
1632
+ * send a custom `chunk_length_schedule`.
1633
+ * - `outputFormat` is exposed as a query parameter so `ulaw_8000` (Twilio
1634
+ * native) and `pcm_16000` (Telnyx native) work without resampling.
1635
+ * - `eleven_v3` is **not** supported — the WS endpoint rejects it.
1636
+ * - `optimize_streaming_latency` is officially deprecated and is not
1637
+ * exposed.
1638
+ */
1639
+
1640
+ /** Constructor options for {@link ElevenLabsWebSocketTTS}. */
1641
+ interface ElevenLabsWebSocketTTSOptions {
1642
+ apiKey: string;
1643
+ voiceId?: string;
1644
+ modelId?: ElevenLabsModel | string;
1645
+ outputFormat?: string;
1646
+ voiceSettings?: Record<string, unknown>;
1647
+ languageCode?: string;
1648
+ /** Let the server pick chunk timing. Default true. */
1649
+ autoMode?: boolean;
1650
+ /** WS keep-alive timeout in seconds (5–180). Default 60. */
1651
+ inactivityTimeout?: number;
1652
+ /**
1653
+ * Manual chunk schedule, only used when ``autoMode: false``. Each value
1654
+ * must be 5–500. ElevenLabs default is ``[120, 160, 250, 290]``.
1655
+ */
1656
+ chunkLengthSchedule?: number[];
1657
+ /** Outgoing audio re-chunk size in bytes. Default 4096. */
1658
+ chunkSize?: number;
1659
+ }
1660
+ /**
1661
+ * Parked WS handle returned by {@link ElevenLabsWebSocketTTS.openParkedConnection}.
1662
+ *
1663
+ * `bosSent` records whether the BOS frame (`{"text": " ", ...}`) has
1664
+ * already been written to the wire. The prewarm pipeline always sends
1665
+ * the BOS so the upstream worker is selected on the parked connection;
1666
+ * `synthesizeStream` adopts the WS and SKIPS its own BOS send to avoid
1667
+ * a protocol error.
1668
+ */
1669
+ interface ElevenLabsParkedWS {
1670
+ ws: WebSocket__default;
1671
+ bosSent: boolean;
1672
+ }
1673
+ /** WebSocket-based ElevenLabs TTS adapter — opt-in low-latency variant. */
1674
+ declare class ElevenLabsWebSocketTTS implements TTSAdapter {
1675
+ static readonly providerKey = "elevenlabs_ws";
1676
+ readonly apiKey: string;
1677
+ readonly voiceId: string;
1678
+ readonly modelId: string;
1679
+ readonly voiceSettings?: Record<string, unknown>;
1680
+ readonly languageCode?: string;
1681
+ readonly autoMode: boolean;
1682
+ readonly inactivityTimeout: number;
1683
+ readonly chunkLengthSchedule?: number[];
1684
+ readonly chunkSize: number;
1685
+ /**
1686
+ * Single-slot adoption queue. The prewarm pipeline parks one WS per
1687
+ * outbound call here; the next `synthesizeStream` call consumes it
1688
+ * (skipping `new WebSocket()` and the BOS send) instead of opening
1689
+ * a fresh socket. The slot is consumed exactly once: if a second
1690
+ * `synthesizeStream` starts while the first is still running, only the first benefits.
1691
+ *
1692
+ * We keep this on the adapter (not in a parameter) so the existing
1693
+ * `for await (const chunk of agent.tts.synthesizeStream(...))` call
1694
+ * site in `StreamHandler` continues to work without signature
1695
+ * changes.
1696
+ */
1697
+ private adoptedConnection;
1698
+ /**
1699
+ * The wire format requested over the ElevenLabs WS. Initially set from
1700
+ * the constructor; ``setTelephonyCarrier`` may auto-flip it to the
1701
+ * carrier's native codec when the caller did NOT pass ``outputFormat``
1702
+ * explicitly.
1703
+ */
1704
+ private _outputFormat;
1705
+ private readonly _outputFormatExplicit;
1706
+ /** Public read-only view of the (possibly auto-flipped) wire format. */
1707
+ get outputFormat(): string;
1708
+ constructor(opts: ElevenLabsWebSocketTTSOptions);
1709
+ /**
1710
+ * Hook called by ``StreamHandler`` to advise the carrier wire format.
1711
+ *
1712
+ * When the user did NOT pass an explicit ``outputFormat`` in the
1713
+ * constructor options, this flips the format to the carrier's native
1714
+ * wire codec — saving a client-side transcode step. Calling with an
1715
+ * unknown carrier (``""`` / ``"custom"``) is a no-op.
1716
+ *
1717
+ * When ``outputFormat`` was explicitly passed (incl. via the
1718
+ * ``forTwilio`` / ``forTelnyx`` factories), this method is a no-op —
1719
+ * the user's choice always wins.
1720
+ */
1721
+ setTelephonyCarrier(carrier: string): void;
1722
+ /** Pre-configured for Twilio Media Streams (`ulaw_8000`). */
1723
+ static forTwilio(opts: Omit<ElevenLabsWebSocketTTSOptions, 'outputFormat'>): ElevenLabsWebSocketTTS;
1724
+ /** Pre-configured for Telnyx (`pcm_16000`). */
1725
+ static forTelnyx(opts: Omit<ElevenLabsWebSocketTTSOptions, 'outputFormat'>): ElevenLabsWebSocketTTS;
1726
+ private buildUrl;
1727
+ /**
1728
+ * Build the protocol-required BOS frame sent on every fresh WS.
1729
+ *
1730
+ * The single-space `{"text": " "}` keep-alive establishes the session
1731
+ * without committing any synthesis (no `flush: true`, no real text).
1732
+ * Production `synthesizeStream()` and `warmup()` share this exact
1733
+ * construction so the upstream worker chooses the same per-session
1734
+ * config in both cases — otherwise the warm session is on a different
1735
+ * worker than the live request, which defeats the warmup goal.
1736
+ */
1737
+ private buildBosFrame;
1738
+ /**
1739
+ * Single-shot synthesis: open WS, send text, yield bytes, close.
1740
+ *
1741
+ * Resilience contract:
1742
+ * - Connection bounded by ``CONNECT_TIMEOUT_MS`` (5s, was 15s).
1743
+ * - Each idle wait bounded by ``FRAME_TIMEOUT_MS`` (30s) so a stalled
1744
+ * server cannot keep the generator alive indefinitely.
1745
+ * - Permanent error handler attached BEFORE the open await — prevents
1746
+ * ``uncaughtException`` if an error fires after the once-listener
1747
+ * resolves.
1748
+ * - All event listeners removed in ``finally`` (no closure leak past
1749
+ * socket close).
1750
+ * - Server-reported ``error`` raises ``ElevenLabsTTSError``.
1751
+ * - Per-frame audio payload capped at ``MAX_AUDIO_B64_BYTES``.
1752
+ * - Best-effort EOS ``{"text":""}`` sent in finally (not immediately
1753
+ * after flush — auto_mode could otherwise truncate the tail audio).
1754
+ */
1755
+ synthesizeStream(text: string): AsyncGenerator<Buffer>;
1756
+ /**
1757
+ * Pre-call WebSocket warmup for the ElevenLabs `/stream-input` endpoint.
1758
+ *
1759
+ * Opens the WS (DNS + TLS + auth handshake), sends the EXACT same BOS
1760
+ * frame the production `synthesizeStream()` path sends — including
1761
+ * `voice_settings` and (when configured) `generation_config` — so
1762
+ * ElevenLabs instantiates the same per-session worker for both
1763
+ * warmup and the live request. If the BOS frames differ, the server
1764
+ * may route warmup and the real call to two different workers, and
1765
+ * the warmed worker is wasted. Idles ~250 ms, then closes. By the
1766
+ * time the first `synthesizeStream()` call lands during the call,
1767
+ * the connection pool has the upstream warm — net wire time saving
1768
+ * of 200-500 ms.
1769
+ *
1770
+ * Billing safety: ElevenLabs bills on synthesised characters
1771
+ * delivered via `audio` frames (per https://elevenlabs.io/pricing).
1772
+ * The keepalive (single-space `text`, no `flush: true`, no real
1773
+ * transcript) is documented as the session-establishment frame and
1774
+ * does NOT generate synthesis. Closing without sending the actual
1775
+ * transcript does not consume billable characters. Best-effort:
1776
+ * failures logged at debug level.
1777
+ */
1778
+ warmup(): Promise<void>;
1779
+ /**
1780
+ * Open a fresh WS, send the EXACT BOS frame the live `synthesizeStream`
1781
+ * sends, and return the OPEN socket without closing it. Used by the
1782
+ * prewarm pipeline to park a TTS connection during the carrier ringing
1783
+ * window so the next `synthesizeStream` call can adopt it via
1784
+ * {@link adoptWebSocket} and skip ~400-900 ms of TLS + BOS round-trip.
1785
+ *
1786
+ * Returns a parked-handle the caller stashes; the next
1787
+ * `synthesizeStream` will detect the adoption queue and skip its own
1788
+ * `new WebSocket()` + BOS send.
1789
+ *
1790
+ * Billing safety: BOS is the documented session-establishment frame
1791
+ * (single space `text`, no `flush: true`) and does not generate
1792
+ * synthesis. ElevenLabs bills on `audio` frames received from the
1793
+ * server, not on BOS bytes sent by the client.
1205
1794
  */
1206
- machineDetection?: boolean;
1795
+ openParkedConnection(): Promise<ElevenLabsParkedWS>;
1207
1796
  /**
1208
- * Called once when the carrier finishes the AMD check. Fires for both
1209
- * ``human`` and ``machine`` outcomes. Combine with ``voicemailMessage``
1210
- * to get both the legacy voicemail-drop AND a result callback (the SDK
1211
- * fires the callback after the drop is queued). Acceptance tests use
1212
- * this to mark a run INVALID when ``classification !== 'human'``.
1797
+ * Stash a parked WS handle so the next `synthesizeStream` call adopts
1798
+ * it instead of opening a fresh socket. Caller is responsible for
1799
+ * holding the handle alive until either the live request consumes it
1800
+ * or the call ends (in which case `discardAdoptedConnection()`
1801
+ * cleans it up).
1213
1802
  */
1214
- onMachineDetection?: (result: MachineDetectionResult) => void | Promise<void>;
1215
- /** If set, spoken as a voicemail message when AMD detects a machine. Implicitly enables ``machineDetection``. */
1216
- voicemailMessage?: string;
1217
- /** Dynamic variables merged into agent.variables before call. Override agent-level variables. */
1218
- variables?: Record<string, string>;
1803
+ adoptWebSocket(parked: ElevenLabsParkedWS): void;
1219
1804
  /**
1220
- * Ring timeout in seconds. Forwarded to Twilio as `Timeout` and to Telnyx
1221
- * as `timeout_secs`. Defaults to **25 s** the production-recommended
1222
- * value that limits phantom calls. Pass `60` for legacy carrier-default
1223
- * parity, or `null` to omit the parameter entirely (carrier picks its
1224
- * own default).
1805
+ * Drop and close any pending parked WS without consuming it. Used on
1806
+ * call-failure paths so a never-started call does not leak a TTS WS
1807
+ * that ElevenLabs will close after its inactivity timeout anyway.
1225
1808
  */
1226
- ringTimeout?: number | null;
1809
+ discardAdoptedConnection(): void;
1810
+ /** No-op — connections are per-utterance and torn down inside synthesizeStream. */
1811
+ close(): Promise<void>;
1227
1812
  }
1228
1813
 
1229
1814
  /**
@@ -1272,6 +1857,19 @@ declare class MetricsStore extends EventEmitter {
1272
1857
  private readonly maxCalls;
1273
1858
  private calls;
1274
1859
  private activeCalls;
1860
+ /**
1861
+ * User-driven soft delete: call_ids the operator removed from the
1862
+ * dashboard view. The on-disk artefacts written by ``CallLogger``
1863
+ * (``metadata.json``, ``transcript.jsonl``) are intentionally NOT
1864
+ * touched — they serve as the durable backup. All read paths
1865
+ * (``getCalls`` / ``getCall`` / ``getAggregates`` / ``getCallsInRange``
1866
+ * / ``hydrate``) filter against this set so the call is invisible
1867
+ * to the UI and excluded from rolling metrics. Populated from
1868
+ * ``<logRoot>/.deleted_call_ids.json`` on hydrate so deletions
1869
+ * survive a process restart. Parity with Python.
1870
+ */
1871
+ private deletedCallIds;
1872
+ private deletedIdsPath;
1275
1873
  /**
1276
1874
  * Accepts either a numeric ``maxCalls`` (legacy positional — matches the
1277
1875
  * original TS API) or an options object ``{ maxCalls }`` to align with the
@@ -1300,19 +1898,66 @@ declare class MetricsStore extends EventEmitter {
1300
1898
  recordTurn(data: Record<string, unknown>): void;
1301
1899
  /** Move a call from active to completed and persist its final metrics. */
1302
1900
  recordCallEnd(data: Record<string, unknown>, metrics?: Record<string, unknown> | null): void;
1303
- /** Return a window of completed calls in newest-first order. */
1901
+ /**
1902
+ * Return a window of completed calls in newest-first order.
1903
+ *
1904
+ * Soft-deleted call_ids (see ``deleteCalls``) are filtered out so the
1905
+ * dashboard never re-shows a row the user removed. The on-disk
1906
+ * artefacts are intentionally preserved as a backup.
1907
+ */
1304
1908
  getCalls(limit?: number, offset?: number): CallRecord[];
1305
- /** Look up a completed call by id (newest match wins). */
1909
+ /**
1910
+ * Look up a completed call by id (newest match wins).
1911
+ *
1912
+ * Soft-deleted call_ids resolve to ``null`` so the SPA's detail pane
1913
+ * cannot render a row the user removed.
1914
+ */
1306
1915
  getCall(callId: string): CallRecord | null;
1916
+ /**
1917
+ * Soft-delete one or more calls from the dashboard view.
1918
+ *
1919
+ * Adds each ``call_id`` to an in-memory set. Subsequent reads via
1920
+ * ``getCalls`` / ``getCall`` / ``getAggregates`` / ``getCallsInRange``
1921
+ * exclude the deleted ids, so rolling metrics (avg latency, total
1922
+ * spend) are recomputed without them. The on-disk
1923
+ * ``metadata.json`` / ``transcript.jsonl`` files written by
1924
+ * ``CallLogger`` are NOT touched — they serve as a durable backup
1925
+ * the operator can audit outside the dashboard.
1926
+ *
1927
+ * Active calls are never deletable. A call_id that is currently
1928
+ * in ``activeCalls`` is silently skipped so a mid-call delete
1929
+ * from the UI cannot orphan the live transcript pane.
1930
+ *
1931
+ * Persisted to ``<logRoot>/.deleted_call_ids.json`` (best-effort)
1932
+ * when ``hydrate()`` has been called with a log root. Parity with
1933
+ * Python ``delete_calls``.
1934
+ *
1935
+ * @returns The list of call_ids actually accepted as deleted.
1936
+ */
1937
+ deleteCalls(callIds: readonly string[]): string[];
1938
+ /** Whether ``callId`` was soft-deleted from the dashboard. */
1939
+ isDeleted(callId: string): boolean;
1940
+ /** Snapshot of soft-deleted call_ids (sorted). */
1941
+ getDeletedCallIds(): string[];
1942
+ /** Atomically persist the deleted-ids set to disk. Best-effort. */
1943
+ private persistDeletedIds;
1307
1944
  /** Look up an active call by id (returns undefined if not active or unknown). */
1308
1945
  getActive(callId: string): CallRecord | undefined;
1309
1946
  /** Return all currently active (not yet ended) calls. */
1310
1947
  getActiveCalls(): CallRecord[];
1311
- /** Compute summary statistics across the buffered call history. */
1948
+ /**
1949
+ * Compute summary statistics across the buffered call history.
1950
+ *
1951
+ * Soft-deleted calls are excluded so rolling metrics (avg latency,
1952
+ * total spend) match exactly what the operator sees in the call list.
1953
+ */
1312
1954
  getAggregates(): Record<string, unknown>;
1313
- /** Return calls whose `started_at` falls within `[fromTs, toTs]` (Unix seconds). */
1955
+ /**
1956
+ * Return calls whose `started_at` falls within `[fromTs, toTs]` (Unix
1957
+ * seconds). Soft-deleted calls are filtered out.
1958
+ */
1314
1959
  getCallsInRange(fromTs?: number, toTs?: number): CallRecord[];
1315
- /** Number of completed calls currently in the ring buffer. */
1960
+ /** Number of completed (non-deleted) calls currently in the ring buffer. */
1316
1961
  get callCount(): number;
1317
1962
  /**
1318
1963
  * Rebuild the in-memory call list from `metadata.json` files written by
@@ -1455,6 +2100,19 @@ declare class SpeechEvents {
1455
2100
  private dispatch;
1456
2101
  }
1457
2102
 
2103
+ /** Parked provider WebSockets ready for adoption by a per-call StreamHandler. */
2104
+ interface ParkedProviderConnections {
2105
+ /** Pre-opened STT WS (Cartesia today; other adapters may add support later). */
2106
+ stt?: WebSocket.WebSocket;
2107
+ /**
2108
+ * Pre-opened TTS WS handle (ElevenLabs WS today). The `bosSent` flag
2109
+ * lets the live `synthesizeStream` skip its own BOS send when the
2110
+ * prewarm pipeline already wrote it.
2111
+ */
2112
+ tts?: ElevenLabsParkedWS;
2113
+ /** Pre-opened OpenAI Realtime WS (already through `session.updated`). */
2114
+ openaiRealtime?: WebSocket.WebSocket;
2115
+ }
1458
2116
  /** Top-level SDK entry point — wraps a carrier + embedded server + agent loop. */
1459
2117
  declare class Patter {
1460
2118
  private localConfig;
@@ -1476,6 +2134,65 @@ declare class Patter {
1476
2134
  * ``Cannot use both tunnel: true and webhookUrl``.
1477
2135
  */
1478
2136
  private tunnelOwnsWebhookUrl;
2137
+ /**
2138
+ * Pre-rendered first-message TTS audio per outbound call_id. Populated
2139
+ * by {@link call} when ``agent.prewarmFirstMessage`` is true; consumed
2140
+ * by the StreamHandler firstMessage emit so the greeting streams
2141
+ * instantly on ``start`` instead of paying the 200-700 ms TTS first-byte
2142
+ * latency. See ``AgentOptions.prewarmFirstMessage``.
2143
+ *
2144
+ * Stores raw bytes in the TTS provider's native sample rate; the
2145
+ * carrier-side audio sender resamples on emit.
2146
+ */
2147
+ private prewarmAudio;
2148
+ /**
2149
+ * Call IDs whose prewarm cache slot has already been consumed —
2150
+ * either by ``popPrewarmAudio`` (cache hit OR miss on the firstMessage
2151
+ * emit path) or by ``recordPrewarmWaste`` (call ended before pickup).
2152
+ * The prewarm task checks this set BEFORE writing bytes so a slow
2153
+ * synth that finishes after the consumer already polled doesn't
2154
+ * orphan bytes in ``prewarmAudio``. See FIX #92 in the parity audit.
2155
+ */
2156
+ private prewarmConsumed;
2157
+ /**
2158
+ * Background tasks tracked so {@link disconnect} can wait on / drop any
2159
+ * still-running prewarm-first-message synth before tearing down.
2160
+ */
2161
+ private prewarmTasks;
2162
+ /**
2163
+ * TTL eviction timers keyed by call_id so {@link disconnect} (and
2164
+ * normal consumption / waste-record paths) can cancel any pending
2165
+ * timer when the slot drains naturally. Without this, the timer
2166
+ * would WARN spuriously after the cache was already emptied.
2167
+ */
2168
+ private prewarmTtlTimers;
2169
+ /**
2170
+ * Pre-opened, fully-handshaked provider WebSockets keyed by
2171
+ * carrier-issued call_id. Populated by ``parkProviderConnections``
2172
+ * during the carrier ringing window; consumed by the per-call
2173
+ * StreamHandler at ``start`` via ``adoptWebSocket(...)`` so STT / TTS
2174
+ * / Realtime audio can flow on the first turn without paying the
2175
+ * 150-900 ms TLS + WS-upgrade + protocol-handshake round-trip again.
2176
+ *
2177
+ * Distinct from ``prewarmAudio`` (which holds pre-rendered TTS bytes
2178
+ * for the first message); the two features are complementary and
2179
+ * orthogonal — both can be active for the same call.
2180
+ *
2181
+ * Each slot may hold up to three parked connections (STT, TTS,
2182
+ * Realtime). Drained by:
2183
+ * - {@link popPrewarmedConnections} on the carrier ``start`` event
2184
+ * (consumed normally — the handles transfer to the StreamHandler)
2185
+ * - {@link recordPrewarmWaste} on call-termination paths (no-answer,
2186
+ * busy, failed, canceled, AMD voicemail). Closes parked sockets.
2187
+ * - {@link disconnect} on Patter teardown. Closes all parked sockets.
2188
+ */
2189
+ private prewarmedConnections;
2190
+ /**
2191
+ * TTL eviction handles keyed by call_id for connections that are never
2192
+ * adopted (e.g. a carrier that swallows ``start``). Closes the parked
2193
+ * sockets so they don't leak past the safety window.
2194
+ */
2195
+ private prewarmedConnTimers;
1479
2196
  /**
1480
2197
  * Speech-edge events for turn-taking instrumentation. Public surface: the
1481
2198
  * seven `on*` proxy accessors below plus the `conversationState` snapshot.
@@ -1483,7 +2200,7 @@ declare class Patter {
1483
2200
  * the previous behaviour.
1484
2201
  *
1485
2202
  * See `src/_speech-events.ts` for the full event taxonomy and the
1486
- * industry-alignment table (LiveKit / Pipecat / OpenAI Realtime).
2203
+ * OpenAI Realtime alignment table.
1487
2204
  */
1488
2205
  readonly speechEvents: SpeechEvents;
1489
2206
  get onUserSpeechStarted(): SpeechEventCallback | null;
@@ -1502,8 +2219,8 @@ declare class Patter {
1502
2219
  set onAudioOut(cb: SpeechEventCallback | null);
1503
2220
  /**
1504
2221
  * Snapshot of the current per-side state of the call.
1505
- * Mirrors LiveKit's `user_state_changed` / `agent_state_changed`
1506
- * payloads. Read-only and safe to call at any time.
2222
+ * Returns the user_state / agent_state payload shape — read-only and
2223
+ * safe to call at any time.
1507
2224
  */
1508
2225
  get conversationState(): ConversationStateSnapshot;
1509
2226
  /**
@@ -1553,12 +2270,115 @@ declare class Patter {
1553
2270
  private _serveImpl;
1554
2271
  /** Run the agent in interactive terminal-test mode (no real telephony). */
1555
2272
  test(opts: ServeOptions): Promise<void>;
2273
+ /**
2274
+ * Pop and return the pre-synthesised first-message audio for ``callId``.
2275
+ *
2276
+ * Returns ``undefined`` when ``agent.prewarmFirstMessage`` was not set
2277
+ * for the originating outbound call, or when the synth was still in
2278
+ * flight at the moment the carrier emitted ``start`` (cache miss — the
2279
+ * StreamHandler falls back to live TTS).
2280
+ *
2281
+ * Called by the per-call StreamHandler at the start of the firstMessage
2282
+ * emit. Returning bytes here lets the handler skip the live TTS
2283
+ * synthesis and stream the cached buffer directly.
2284
+ *
2285
+ * Marks ``callId`` as consumed regardless of cache hit/miss so a slow
2286
+ * synth task that finishes after this call drops its bytes instead of
2287
+ * orphaning them in ``prewarmAudio``. See FIX #92.
2288
+ */
2289
+ popPrewarmAudio: (callId: string) => Buffer | undefined;
2290
+ /**
2291
+ * Log a warning if a prewarmed greeting was paid for but never used.
2292
+ * The TTS bill for ``agent.firstMessage`` has already been incurred by
2293
+ * the background synth task, so the user should know — opt-in feature
2294
+ * with a known cost surface.
2295
+ *
2296
+ * Idempotent: the second call for the same ``callId`` is a no-op, so
2297
+ * the status callback firing first and ``endCall`` running afterwards
2298
+ * (or vice-versa) does not double-WARN. Public so the embedded
2299
+ * server's webhook handlers can invoke it on no-answer / busy /
2300
+ * failed / canceled / AMD-machine paths. See FIX #91.
2301
+ */
2302
+ recordPrewarmWaste: (callId: string) => void;
2303
+ /**
2304
+ * Pop and return the parked provider WebSockets for ``callId``, or
2305
+ * ``undefined`` when no parked connections exist.
2306
+ *
2307
+ * Wired into ``EmbeddedServer.popPrewarmedConnections`` so the
2308
+ * per-call ``StreamHandler`` can adopt the parked sockets at the
2309
+ * carrier ``start`` event instead of opening fresh ones — saving
2310
+ * ~150-900 ms of cold-start handshake on the first turn.
2311
+ */
2312
+ popPrewarmedConnections: (callId: string) => ParkedProviderConnections | undefined;
2313
+ /**
2314
+ * Close any parked provider WebSockets for ``callId``. Wired into
2315
+ * ``EmbeddedServer.closePrewarmedConnections`` so call-termination
2316
+ * paths (no-answer, busy, failed, canceled, AMD voicemail) drop the
2317
+ * sockets cleanly instead of leaving them to the upstream timeout.
2318
+ */
2319
+ closePrewarmedConnections: (callId: string) => void;
2320
+ /**
2321
+ * Open and park provider WebSockets in parallel with the carrier-side
2322
+ * ``initiateCall``. Unlike {@link spawnProviderWarmup} (which closes
2323
+ * the WS after a brief idle), the sockets opened here stay OPEN and
2324
+ * are handed off to the per-call ``StreamHandler`` on ``start``.
2325
+ *
2326
+ * This is the structural fix for first-turn cold-start: on Node's
2327
+ * ``ws`` package, opening + closing a WS does NOT warm TLS for the
2328
+ * next open — every fresh ``new WebSocket()`` re-pays the full
2329
+ * TCP + TLS + HTTP-101 round-trip. By keeping the WS open and
2330
+ * adopting it directly, the live first turn skips the handshake
2331
+ * entirely (saves ~150-900 ms depending on provider).
2332
+ *
2333
+ * Best-effort: each provider's parking task is wrapped in
2334
+ * ``Promise.allSettled`` so a slow or failing endpoint cannot block
2335
+ * the others. Providers without ``openParkedConnection`` contribute
2336
+ * nothing — the call falls through to the cold ``connect()`` path
2337
+ * for that provider.
2338
+ */
2339
+ private parkProviderConnections;
2340
+ /**
2341
+ * Spawn a fire-and-forget task that warms up STT / TTS / LLM in
2342
+ * parallel with the carrier-side ``initiateCall``.
2343
+ *
2344
+ * Best-effort: each provider's optional ``warmup()`` is wrapped in
2345
+ * ``Promise.allSettled`` so a slow or failing endpoint cannot block
2346
+ * the others. Providers without ``warmup`` contribute nothing.
2347
+ */
2348
+ private spawnProviderWarmup;
2349
+ /**
2350
+ * Pre-render ``agent.firstMessage`` to TTS bytes during the ringing
2351
+ * window and stash them in ``prewarmAudio.set(callId, buf)``.
2352
+ *
2353
+ * Skipped silently when ``agent.prewarmFirstMessage`` is false or
2354
+ * when ``agent.tts`` / ``agent.firstMessage`` is missing. The synth
2355
+ * is bounded by ``ringTimeout`` (default 25 s) so a never-answered
2356
+ * call doesn't tie up the TTS connection. On timeout / error the
2357
+ * cache is left empty and the StreamHandler falls back to live TTS.
2358
+ *
2359
+ * **Pipeline mode only.** Realtime / ConvAI provider modes never
2360
+ * consume the prewarm cache (the StreamHandler for those modes runs
2361
+ * its first-message emit through the provider's own audio path).
2362
+ * Spawning the prewarm in those modes pays the TTS bill for nothing
2363
+ * — refused with a warn.
2364
+ *
2365
+ * **Capped at ``PREWARM_CACHE_MAX`` concurrent entries.** Refused
2366
+ * with a warn when the cap is reached (the call still proceeds —
2367
+ * StreamHandler falls back to live TTS).
2368
+ */
2369
+ private spawnPrewarmFirstMessage;
1556
2370
  /** Place an outbound call via the configured carrier. */
1557
2371
  call(options: LocalCallOptions): Promise<void>;
1558
2372
  /**
1559
2373
  * Stop the embedded server and any running tunnel. Safe to call multiple
1560
2374
  * times. Leaves the instance reusable: a subsequent ``serve()`` works as
1561
2375
  * if the previous lifecycle never happened.
2376
+ *
2377
+ * Also clears any pending TTL eviction timers, awaits in-flight
2378
+ * prewarm-first-message synth tasks (best-effort, with a 1 s safety
2379
+ * timeout), and clears the prewarm cache. Without this a still-running
2380
+ * TTS WS keeps the user billed long after SDK teardown, and stale
2381
+ * entries leak across ``serve`` / ``disconnect`` cycles. See FIX #93.
1562
2382
  */
1563
2383
  disconnect(): Promise<void>;
1564
2384
  /**
@@ -2075,7 +2895,22 @@ declare function calculateTelephonyCost(provider: string, durationSeconds: numbe
2075
2895
 
2076
2896
  /** Per-turn latency breakdown across the STT/LLM/TTS pipeline. */
2077
2897
  interface LatencyBreakdown {
2898
+ /**
2899
+ * STT finalization time: end-of-speech (VAD stop or STT speech_final) →
2900
+ * final transcript delivery. This is the engineering metric — pure STT
2901
+ * processing latency, independent of how long the user spoke. Industry
2902
+ * benchmarks (Picovoice, Deepgram, Gladia, Speechmatics) all report this
2903
+ * number as "STT latency". Falls back to turn_start when the endpoint
2904
+ * signal is unavailable (degraded provider, batch STT, etc.).
2905
+ */
2078
2906
  stt_ms: number;
2907
+ /**
2908
+ * Duration of the user's utterance (turn_start → end-of-speech). Useful
2909
+ * to distinguish "user spoke for 4s" from "STT took 4s to finalize" —
2910
+ * they used to be conflated in stt_ms before 0.6.1. Optional — undefined
2911
+ * when the endpoint signal is unavailable.
2912
+ */
2913
+ user_speech_duration_ms?: number;
2079
2914
  /**
2080
2915
  * Backwards-compatible LLM bucket. With the split below, this now reflects
2081
2916
  * the user-perceived first-token latency (TTFT) when streaming is available
@@ -2164,6 +2999,12 @@ interface CallMetrics {
2164
2999
  tts_provider: string;
2165
3000
  llm_provider: string;
2166
3001
  telephony_provider: string;
3002
+ /** Model identifiers per provider (e.g. "ink-whisper", "eleven_flash_v2_5",
3003
+ * "gpt-oss-120b"). Surfaced on the dashboard cost breakdown so operators
3004
+ * can attribute per-call spend to a specific model. */
3005
+ stt_model?: string;
3006
+ tts_model?: string;
3007
+ llm_model?: string;
2167
3008
  }
2168
3009
  /** Programmatic control surface for a live call (transfer, hangup, DTMF). */
2169
3010
  interface CallControl {
@@ -2236,6 +3077,7 @@ declare class CallMetricsAccumulator {
2236
3077
  private _actualTelephonyCost;
2237
3078
  private _actualSttCost;
2238
3079
  private _totalLlmCost;
3080
+ private _llmModel;
2239
3081
  private _eventBus;
2240
3082
  /** Timestamp (hrTimeMs) when VAD emitted speech_end. */
2241
3083
  private _vadStoppedAt;
@@ -2250,6 +3092,21 @@ declare class CallMetricsAccumulator {
2250
3092
  private _overlapStartedAt;
2251
3093
  private _reportOnlyInitialTtfb;
2252
3094
  private _initialTtfbEmitted;
3095
+ /**
3096
+ * Last barge-in detection timestamp (hrTimeMs). Used by
3097
+ * ``_computeTurnLatency`` to gate endpoint_ms / stt_ms emission on turns
3098
+ * that started immediately after a barge-in — those turns have unreliable
3099
+ * VAD/STT anchors and would otherwise pollute the p95 distribution with
3100
+ * synthetic 6+ second spikes.
3101
+ */
3102
+ private _lastBargeinAt;
3103
+ /**
3104
+ * Count of turns where ``recordSttComplete`` fired but no legitimate VAD
3105
+ * ``speech_end`` had stamped ``_endpointSignalAt``. Exposed via metrics so
3106
+ * we can spot environments where PSTN packet loss is dropping VAD stops
3107
+ * (the common cause of missing endpoint signals).
3108
+ */
3109
+ private _endpointSignalMissingCount;
2253
3110
  constructor(opts: {
2254
3111
  callId: string;
2255
3112
  providerMode: string;
@@ -2285,6 +3142,31 @@ declare class CallMetricsAccumulator {
2285
3142
  * on the first audio byte rather than just before recordSttComplete().
2286
3143
  */
2287
3144
  startTurnIfIdle(): void;
3145
+ /**
3146
+ * Anchor the current turn at a legitimate VAD ``speech_start`` event.
3147
+ *
3148
+ * Industry-standard pattern: every VAD ``speech_start`` that fires while the agent
3149
+ * is NOT in the suppressed warmup window re-anchors the turn timer to
3150
+ * the wall-clock moment the user actually started speaking. Re-anchors:
3151
+ *
3152
+ * * ``_turnStart`` — fixes the case where a phantom ``speech_start``
3153
+ * during agent TTS or a partial transcript from the previous user
3154
+ * attempt already stamped the field. Without this, the legitimate
3155
+ * user-speech ``speech_start`` no-op'd and ``user_speech_duration_ms``
3156
+ * inflated from ~1 s to 5-7 s (the original "I waited 7 seconds"
3157
+ * dashboard symptom).
3158
+ * * ``_endpointSignalAt``, ``_vadStoppedAt``, ``_sttFinalAt`` — any
3159
+ * stale anchor from a rejected barge-in / dropped final transcript
3160
+ * on the same uncommitted turn is cleared, so the next
3161
+ * ``recordVadStop`` / ``recordSttFinalTimestamp`` stamps fresh.
3162
+ * * ``_sttComplete``, ``_llmFirstToken``, ``_initialTtfbEmitted`` — same
3163
+ * rationale for the downstream pipeline timestamps.
3164
+ *
3165
+ * No-op once the turn is committed (``_turnCommittedMono`` set): a
3166
+ * VAD ``speech_start`` after commit belongs to the NEXT turn's
3167
+ * barge-in path, handled by ``recordTurnInterrupted`` instead.
3168
+ */
3169
+ anchorUserSpeechStart(): void;
2288
3170
  /** Stamp end-of-STT, capture the user's transcript, and accrue billed STT seconds. */
2289
3171
  recordSttComplete(text: string, audioSeconds?: number): void;
2290
3172
  /** Record the timestamp of the first LLM token (TTFT). No-op after first call. */
@@ -2419,6 +3301,13 @@ declare class CallMetricsAccumulator {
2419
3301
  endCall(): CallMetrics;
2420
3302
  /** Return the cost breakdown for the call so far without ending it. */
2421
3303
  getCostSoFar(): CostBreakdown;
3304
+ /**
3305
+ * Number of turns where recordSttComplete fired without a prior legitimate
3306
+ * VAD speech_end. Surfaced for diagnostics — a non-zero value points at
3307
+ * dropped VAD stops (commonly PSTN packet loss), which is why we stopped
3308
+ * faking _endpointSignalAt from _sttComplete in 0.6.x.
3309
+ */
3310
+ get endpointSignalMissingCount(): number;
2422
3311
  private _resetTurnState;
2423
3312
  private _computeTurnLatency;
2424
3313
  private _computeCost;
@@ -2442,6 +3331,7 @@ declare class CallMetricsAccumulator {
2442
3331
  * {@link OpenAIRealtimeAdapter}. Audio negotiation defaults to
2443
3332
  * `g711_ulaw` so traffic flows through Twilio/Telnyx without transcoding.
2444
3333
  */
3334
+
2445
3335
  /**
2446
3336
  * Supported OpenAI Realtime wire audio formats. See
2447
3337
  * https://platform.openai.com/docs/guides/realtime for the full list.
@@ -2483,28 +3373,96 @@ interface OpenAIRealtimeOptions {
2483
3373
  }
2484
3374
  /** Realtime WebSocket adapter for OpenAI's `gpt-realtime` family. */
2485
3375
  declare class OpenAIRealtimeAdapter {
2486
- private readonly apiKey;
2487
- private readonly model;
2488
- private readonly voice;
2489
- private readonly instructions;
2490
- private readonly tools?;
2491
- private readonly audioFormat;
2492
- private ws;
3376
+ protected readonly apiKey: string;
3377
+ protected readonly model: string;
3378
+ protected readonly voice: string;
3379
+ protected readonly instructions: string;
3380
+ protected readonly tools?: Array<{
3381
+ name: string;
3382
+ description: string;
3383
+ parameters: Record<string, unknown>;
3384
+ strict?: boolean;
3385
+ }> | undefined;
3386
+ protected readonly audioFormat: OpenAIRealtimeAudioFormat;
3387
+ protected ws: WebSocket__default | null;
2493
3388
  private readonly eventCallbacks;
2494
3389
  private messageListenerAttached;
2495
3390
  private heartbeat;
2496
3391
  private currentResponseItemId;
2497
3392
  private currentResponseAudioMs;
2498
3393
  private currentResponseFirstAudioAt;
2499
- private readonly options;
3394
+ protected readonly options: OpenAIRealtimeOptions;
2500
3395
  constructor(apiKey: string, model?: string, voice?: string, instructions?: string, tools?: Array<{
2501
3396
  name: string;
2502
3397
  description: string;
2503
3398
  parameters: Record<string, unknown>;
2504
3399
  strict?: boolean;
2505
3400
  }> | undefined, audioFormat?: OpenAIRealtimeAudioFormat, options?: OpenAIRealtimeOptions);
3401
+ /**
3402
+ * Build the production session.update body. Mirrors the body sent
3403
+ * inside `connect()` so warmup can apply identical configuration to
3404
+ * the upstream session and prime it without billing.
3405
+ */
3406
+ private buildSessionConfig;
3407
+ /**
3408
+ * Pre-call WebSocket warmup for the OpenAI Realtime endpoint.
3409
+ *
3410
+ * The canonical session-only warm step on the Realtime API: open the
3411
+ * WS, wait for `session.created`, send a single `session.update`
3412
+ * containing the same fields that the production `connect()` path
3413
+ * applies (`input_audio_format`, `output_audio_format`, `voice`,
3414
+ * `instructions`, `turn_detection`, `input_audio_transcription`,
3415
+ * plus any opt-in fields populated on the adapter), wait for the
3416
+ * matching `session.updated` ack, then close cleanly. This primes
3417
+ * the per-session state on the OpenAI side — DNS + TLS + auth
3418
+ * handshake + initial config exchange — without ever invoking the
3419
+ * model.
3420
+ *
3421
+ * Earlier revisions sent `response.create` with
3422
+ * `{"response": {"generate": false}}` to prime the inference path.
3423
+ * That field is NOT in the OpenAI Realtime API schema; the server
3424
+ * either ignores it (and bills tokens for a real model response) or
3425
+ * rejects the request with `invalid_request_error`. The former is
3426
+ * billing-unsafe and the latter is a no-op beyond TLS warm-up. The
3427
+ * `session.update` flow is documented and side-effect-free.
3428
+ *
3429
+ * Billing safety: `session.update` only mutates session
3430
+ * configuration. It does NOT invoke the model, does NOT consume any
3431
+ * audio buffer, and does NOT trigger token generation, so no
3432
+ * per-token cost is accrued. Best-effort: failures are logged at
3433
+ * debug level and never raised.
3434
+ */
3435
+ warmup(): Promise<void>;
2506
3436
  /** Open the Realtime WebSocket and apply the session configuration. */
2507
3437
  connect(): Promise<void>;
3438
+ /**
3439
+ * Adopt a pre-opened, already-`session.updated` Realtime WebSocket
3440
+ * produced by the prewarm pipeline (see `Patter.parkProviderConnections`).
3441
+ * Skips the fresh `new WebSocket()` + `session.created` /
3442
+ * `session.update` round-trip — saves ~250-450 ms on first turn.
3443
+ *
3444
+ * Caller MUST verify `ws.readyState === OPEN` before calling and MUST
3445
+ * have already received `session.updated` on the parked socket. If
3446
+ * the parked WS died between park and adopt, fall back to `connect()`.
3447
+ */
3448
+ adoptWebSocket(ws: WebSocket__default): void;
3449
+ protected armHeartbeatAndListener(): void;
3450
+ /**
3451
+ * Open a fresh Realtime WS, exchange `session.created` /
3452
+ * `session.update` / `session.updated` (so the upstream session is
3453
+ * fully primed), and return the OPEN socket WITHOUT arming the
3454
+ * heartbeat / message listener. Used by the prewarm pipeline to park
3455
+ * a Realtime connection during ringing; the live consumer adopts it
3456
+ * via {@link adoptWebSocket}.
3457
+ *
3458
+ * Bounded by 8 s. Throws on timeout / handshake failure — callers
3459
+ * (the prewarm pipeline) treat any error as a cache miss and the
3460
+ * call falls through to the cold `connect()` path.
3461
+ *
3462
+ * Billing safety: `session.update` does not invoke the model. No
3463
+ * tokens are billed.
3464
+ */
3465
+ openParkedConnection(): Promise<WebSocket__default>;
2508
3466
  /** Append a base64-encoded audio chunk to the realtime input buffer. */
2509
3467
  sendAudio(mulawAudio: Buffer): void;
2510
3468
  /**
@@ -2518,7 +3476,7 @@ declare class OpenAIRealtimeAdapter {
2518
3476
  onEvent(callback: RealtimeEventCallback): void;
2519
3477
  /** Remove a previously registered {@link onEvent} callback. */
2520
3478
  offEvent(callback: RealtimeEventCallback): void;
2521
- private ensureMessageListener;
3479
+ protected ensureMessageListener(): void;
2522
3480
  /** Truncate the in-flight assistant turn and cancel the active response.
2523
3481
  *
2524
3482
  * ``audio_end_ms`` MUST reflect what the caller actually heard, not what
@@ -2684,11 +3642,6 @@ declare function isRemoteUrl(onMessage: unknown): onMessage is string;
2684
3642
  /** Check if a URL is a WebSocket URL. */
2685
3643
  declare function isWebSocketUrl(url: string): boolean;
2686
3644
 
2687
- /**
2688
- * Embedded HTTP/WebSocket server — wires Express webhooks for the configured
2689
- * carrier (Twilio or Telnyx) into the per-call `StreamHandler` and dashboard.
2690
- */
2691
-
2692
3645
  /** Resolved configuration consumed by `EmbeddedServer` (carrier credentials, webhook URL, etc.). */
2693
3646
  interface LocalConfig {
2694
3647
  twilioSid?: string;
@@ -3322,6 +4275,8 @@ interface SonioxSTTOptions$1 {
3322
4275
  }
3323
4276
  /** Streaming STT adapter for Soniox's real-time WebSocket API. */
3324
4277
  declare class SonioxSTT {
4278
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
4279
+ static readonly providerKey = "soniox";
3325
4280
  private ws;
3326
4281
  private callbacks;
3327
4282
  private final;
@@ -3430,6 +4385,8 @@ interface AssemblyAISTTOptions$1 {
3430
4385
  declare class AssemblyAISTT {
3431
4386
  private readonly apiKey;
3432
4387
  private readonly options;
4388
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
4389
+ static readonly providerKey = "assemblyai";
3433
4390
  private ws;
3434
4391
  private readonly callbacks;
3435
4392
  private closing;
@@ -3460,6 +4417,22 @@ declare class AssemblyAISTT {
3460
4417
  static forTwilio(apiKey: string, model?: AssemblyAIModel): AssemblyAISTT;
3461
4418
  private buildUrl;
3462
4419
  private buildHeaders;
4420
+ /**
4421
+ * Pre-call WebSocket warmup for the AssemblyAI v3 `/v3/ws` endpoint.
4422
+ *
4423
+ * Opens the WS (DNS + TLS + auth handshake), idles ~250 ms so the
4424
+ * AssemblyAI edge keeps the session state warm, then sends Terminate
4425
+ * and closes. By the time `connect()` is invoked at call-pickup the
4426
+ * resolver and TLS session are hot — net wire time saving of
4427
+ * 200-500 ms.
4428
+ *
4429
+ * Billing safety: AssemblyAI Universal Streaming bills on streamed
4430
+ * audio seconds (per https://www.assemblyai.com/pricing). Opening +
4431
+ * closing the WebSocket without forwarding any audio frames does
4432
+ * not consume billable seconds. Best-effort: failures logged at
4433
+ * debug level.
4434
+ */
4435
+ warmup(): Promise<void>;
3463
4436
  /** Open the streaming WebSocket and arm message handlers. */
3464
4437
  connect(): Promise<void>;
3465
4438
  private awaitOpen;
@@ -3500,6 +4473,7 @@ declare class AssemblyAISTT {
3500
4473
  * Implements a `DeepgramSTT`-shaped provider using Cartesia's streaming
3501
4474
  * WebSocket API. Pure `ws` transport — does NOT depend on the vendor SDK.
3502
4475
  */
4476
+
3503
4477
  /** Patter-normalised transcript event emitted by {@link CartesiaSTT}. */
3504
4478
  interface Transcript$4 {
3505
4479
  readonly text: string;
@@ -3546,6 +4520,8 @@ interface CartesiaSTTOptions$1 {
3546
4520
  declare class CartesiaSTT {
3547
4521
  private readonly apiKey;
3548
4522
  private readonly options;
4523
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
4524
+ static readonly providerKey = "cartesia_stt";
3549
4525
  private ws;
3550
4526
  private callbacks;
3551
4527
  private keepaliveTimer;
@@ -3555,13 +4531,65 @@ declare class CartesiaSTT {
3555
4531
  */
3556
4532
  requestId: string | null;
3557
4533
  constructor(apiKey: string, options?: CartesiaSTTOptions$1);
4534
+ /**
4535
+ * Open a fresh WebSocket without arming any message / keepalive handlers
4536
+ * and without taking ownership on `this.ws`. Returns the OPEN socket so
4537
+ * the caller (the prewarm pipeline) can park it for later adoption via
4538
+ * `adoptWebSocket`. Bounded by `CONNECT_TIMEOUT_MS`.
4539
+ *
4540
+ * Billing safety: opening + parking the WS does not stream audio
4541
+ * (Cartesia STT bills on streamed audio seconds), so no charge is
4542
+ * incurred. Close the returned WS yourself if it is never adopted.
4543
+ */
4544
+ openParkedConnection(): Promise<WebSocket__default>;
3558
4545
  private buildWsUrl;
4546
+ /**
4547
+ * Pre-call WebSocket warmup for the Cartesia STT `/stt/websocket` endpoint.
4548
+ *
4549
+ * Opens the WS (DNS + TLS + auth handshake), idles ~250 ms so the
4550
+ * Cartesia edge keeps session state warm, then closes. By the time
4551
+ * `connect()` is invoked at call-pickup the resolver and TLS session
4552
+ * are hot — net wire time saving of 200-500 ms.
4553
+ *
4554
+ * Billing safety: Cartesia STT bills on streamed audio seconds (per
4555
+ * https://docs.cartesia.ai/2025-04-16/api-reference/stt/stt). Opening
4556
+ * + closing the WebSocket without forwarding audio does not consume
4557
+ * billable seconds. Best-effort: failures logged at debug level.
4558
+ */
4559
+ warmup(): Promise<void>;
3559
4560
  /** Open the streaming WebSocket and arm message + keepalive handlers. */
3560
4561
  connect(): Promise<void>;
4562
+ /**
4563
+ * Adopt a pre-opened, already-OPEN WebSocket produced by the prewarm
4564
+ * pipeline (see `Patter.parkProviderConnections`). Skips the fresh
4565
+ * `new WebSocket()` + handshake — the WS is already through DNS, TLS
4566
+ * and HTTP-101 so audio frames can flow on this turn instead of
4567
+ * paying ~150-400 ms of handshake.
4568
+ *
4569
+ * Caller MUST verify `ws.readyState === OPEN` before calling. If the
4570
+ * parked WS died between park and adopt, fall back to `connect()`.
4571
+ */
4572
+ adoptWebSocket(ws: WebSocket__default): void;
4573
+ private armMessageAndKeepalive;
3561
4574
  private handleEvent;
3562
4575
  private emit;
3563
4576
  /** Send a binary PCM16-LE audio chunk to Cartesia for transcription. */
3564
4577
  sendAudio(audio: Buffer): void;
4578
+ /**
4579
+ * Force Cartesia to finalise the in-flight utterance immediately.
4580
+ *
4581
+ * Sends a ``finalize`` text frame on the live WebSocket. Cartesia
4582
+ * replies with the final transcript followed by ``flush_done``,
4583
+ * bypassing its conservative internal silence heuristic (which can
4584
+ * wait 2-7 s on PSTN audio before naturally finalising). Wired
4585
+ * into ``StreamHandler`` on the VAD ``speech_end`` event so the
4586
+ * SDK's authoritative end-of-speech detection forces an immediate
4587
+ * STT finalisation — turning Cartesia's natural-pause endpointing
4588
+ * into a deterministic VAD-driven one, parity with the Deepgram
4589
+ * fast-path. No-op when the WS isn't open. Parity with Python
4590
+ * ``CartesiaSTT.finalize``.
4591
+ */
4592
+ finalize(): Promise<void>;
3565
4593
  /** Register a transcript listener. */
3566
4594
  onTranscript(callback: TranscriptCallback$4): void;
3567
4595
  /** Remove a previously registered transcript callback. */
@@ -3624,6 +4652,8 @@ interface LMNTTTSOptions$1 {
3624
4652
  }
3625
4653
  /** LMNT TTS adapter backed by the `/v1/ai/speech/bytes` HTTP streaming endpoint. */
3626
4654
  declare class LMNTTTS {
4655
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
4656
+ static readonly providerKey = "lmnt";
3627
4657
  private readonly apiKey;
3628
4658
  private readonly model;
3629
4659
  private readonly voice;
@@ -3717,6 +4747,8 @@ interface DeepgramSTTOptions$1 {
3717
4747
  }
3718
4748
  /** Streaming STT adapter for Deepgram's `/v1/listen` WebSocket API. */
3719
4749
  declare class DeepgramSTT {
4750
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
4751
+ static readonly providerKey = "deepgram";
3720
4752
  private ws;
3721
4753
  private readonly transcriptCallbacks;
3722
4754
  private readonly errorCallbacks;
@@ -3749,6 +4781,22 @@ declare class DeepgramSTT {
3749
4781
  /** Factory for Twilio calls — mulaw 8 kHz. Forwards tuning options through. */
3750
4782
  static forTwilio(apiKey: string, language?: string, model?: string, options?: DeepgramSTTOptions$1): DeepgramSTT;
3751
4783
  private buildUrl;
4784
+ /**
4785
+ * Pre-call WebSocket warmup for the Deepgram `/v1/listen` endpoint.
4786
+ *
4787
+ * Opens the WS (full DNS + TLS + auth handshake), idles ~250 ms so the
4788
+ * provider edge keeps the session warm in its routing table, then
4789
+ * closes cleanly. By the time `connect()` is invoked at call-pickup
4790
+ * the DNS resolver is hot, the TCP+TLS session is in the connection
4791
+ * pool, and recent WS auth is still warm at Deepgram's edge — net
4792
+ * wire time saving of 200-500 ms vs a cold WS open.
4793
+ *
4794
+ * Billing safety: Deepgram bills on streamed audio seconds (per
4795
+ * https://deepgram.com/pricing). Opening + closing the WebSocket
4796
+ * without sending any audio frames does not consume billable seconds.
4797
+ * Best-effort: any failure is logged at debug level and never raised.
4798
+ */
4799
+ warmup(): Promise<void>;
3752
4800
  /** Open the streaming WebSocket and arm message + keepalive handlers. */
3753
4801
  connect(): Promise<void>;
3754
4802
  private openSocket;
@@ -3825,6 +4873,8 @@ type TranscriptCallback$2 = (transcript: Transcript$2) => void;
3825
4873
  type WhisperResponseFormat = 'json' | 'verbose_json';
3826
4874
  /** Buffered STT adapter for OpenAI's Whisper transcription HTTP API. */
3827
4875
  declare class WhisperSTT {
4876
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
4877
+ static readonly providerKey: string;
3828
4878
  private readonly apiKey;
3829
4879
  private readonly model;
3830
4880
  private readonly language;
@@ -3913,6 +4963,8 @@ declare class STT$5 extends WhisperSTT {
3913
4963
 
3914
4964
  /** STT adapter restricted to OpenAI's GPT-4o Transcribe model family. */
3915
4965
  declare class OpenAITranscribeSTT extends WhisperSTT {
4966
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
4967
+ static readonly providerKey: string;
3916
4968
  /**
3917
4969
  * @param apiKey OpenAI API key.
3918
4970
  * @param language ISO-639-1 language code (e.g. ``"en"``, ``"it"``). Optional.
@@ -4172,6 +5224,8 @@ interface SpeechmaticsSTTOptions$1 {
4172
5224
  * ```
4173
5225
  */
4174
5226
  declare class SpeechmaticsSTT {
5227
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
5228
+ static readonly providerKey = "speechmatics";
4175
5229
  private ws;
4176
5230
  private readonly transcriptCallbacks;
4177
5231
  private readonly errorCallbacks;
@@ -4231,147 +5285,12 @@ type SpeechmaticsSTTOptions = SpeechmaticsSTTOptions$1 & {
4231
5285
  * ```ts
4232
5286
  * import * as speechmatics from "getpatter/stt/speechmatics";
4233
5287
  * const stt = new speechmatics.STT(); // reads SPEECHMATICS_API_KEY
4234
- * const stt = new speechmatics.STT({ apiKey: "sm_...", language: "en" });
4235
- * ```
4236
- */
4237
- declare class STT extends SpeechmaticsSTT {
4238
- static readonly providerKey = "speechmatics";
4239
- constructor(opts?: SpeechmaticsSTTOptions);
4240
- }
4241
-
4242
- /**
4243
- * Known stable ElevenLabs voice models (from the official ElevenLabs API
4244
- * reference). Exposed as a typed `as const` object so callers can pass
4245
- * `ElevenLabsModel.FLASH_V2_5` and get autocomplete / static checking; the
4246
- * public `modelId` option also accepts an arbitrary `string` so users can
4247
- * pass forward-compat IDs we haven't enumerated yet.
4248
- *
4249
- * - `V3` — newest, highest quality (slower TTFT than Flash).
4250
- * - `FLASH_V2_5` — current default, fastest (~75 ms TTFT).
4251
- * - `TURBO_V2_5` — balanced quality/speed.
4252
- * - `MULTILINGUAL_V2` — best multilingual support.
4253
- * - `MONOLINGUAL_V1` — legacy English-only.
4254
- */
4255
- declare const ElevenLabsModel: {
4256
- readonly V3: "eleven_v3";
4257
- readonly FLASH_V2_5: "eleven_flash_v2_5";
4258
- readonly TURBO_V2_5: "eleven_turbo_v2_5";
4259
- readonly MULTILINGUAL_V2: "eleven_multilingual_v2";
4260
- readonly MONOLINGUAL_V1: "eleven_monolingual_v1";
4261
- };
4262
- /** Union of {@link ElevenLabsModel} string values. */
4263
- type ElevenLabsModel = (typeof ElevenLabsModel)[keyof typeof ElevenLabsModel];
4264
- declare const ElevenLabsOutputFormat: {
4265
- readonly MP3_22050_32: "mp3_22050_32";
4266
- readonly MP3_44100_32: "mp3_44100_32";
4267
- readonly MP3_44100_64: "mp3_44100_64";
4268
- readonly MP3_44100_96: "mp3_44100_96";
4269
- readonly MP3_44100_128: "mp3_44100_128";
4270
- readonly MP3_44100_192: "mp3_44100_192";
4271
- readonly PCM_8000: "pcm_8000";
4272
- readonly PCM_16000: "pcm_16000";
4273
- readonly PCM_22050: "pcm_22050";
4274
- readonly PCM_24000: "pcm_24000";
4275
- readonly PCM_44100: "pcm_44100";
4276
- readonly ULAW_8000: "ulaw_8000";
4277
- };
4278
- /** Union of {@link ElevenLabsOutputFormat} string values. */
4279
- type ElevenLabsOutputFormat = (typeof ElevenLabsOutputFormat)[keyof typeof ElevenLabsOutputFormat];
4280
- /** ElevenLabs voice tuning knobs forwarded as `voice_settings` in the request. */
4281
- interface ElevenLabsVoiceSettings {
4282
- stability?: number;
4283
- similarity_boost?: number;
4284
- style?: number;
4285
- use_speaker_boost?: boolean;
4286
- }
4287
- /** Constructor options for {@link ElevenLabsTTS}. */
4288
- interface ElevenLabsTTSOptions$1 {
4289
- voiceId?: string;
4290
- /**
4291
- * ElevenLabs voice model ID. The default ``eleven_flash_v2_5`` has the
4292
- * lowest TTFT (~75 ms). Pass ``eleven_v3`` for highest quality, or any
4293
- * arbitrary string for forward-compat with future models.
4294
- */
4295
- modelId?: ElevenLabsModel | string;
4296
- outputFormat?: ElevenLabsOutputFormat;
4297
- voiceSettings?: ElevenLabsVoiceSettings;
4298
- languageCode?: string;
4299
- chunkSize?: number;
4300
- }
4301
- /**
4302
- * ElevenLabs streaming TTS adapter.
4303
- *
4304
- * Supported `modelId` values are autocompleted via {@link ElevenLabsModel}.
4305
- * Default is `eleven_flash_v2_5` (lowest TTFT, ~75 ms).
4306
- *
4307
- * **Telephony optimization** — the constructor default
4308
- * `outputFormat='pcm_16000'` is correct for web playback, dashboard
4309
- * previews, and 16 kHz pipelines. For real phone calls, use the
4310
- * carrier-specific factories instead:
4311
- *
4312
- * - {@link ElevenLabsTTS.forTwilio} emits `ulaw_8000` natively. Twilio's
4313
- * media-stream WebSocket expects μ-law @ 8 kHz, so the SDK normally
4314
- * resamples 16 kHz → 8 kHz and PCM → μ-law before sending. Asking
4315
- * ElevenLabs to produce μ-law directly skips that step (saves
4316
- * ~30–80 ms first-byte plus per-frame CPU and avoids any resampling
4317
- * aliasing).
4318
- * - {@link ElevenLabsTTS.forTelnyx} emits `pcm_16000`. Telnyx negotiates
4319
- * L16/16000 on its bidirectional media WebSocket, so 16 kHz PCM is
4320
- * already the format used end-to-end and no transcoding happens.
4321
- * ElevenLabs *also* supports `ulaw_8000` if your Telnyx profile is
4322
- * pinned to PCMU/8000 — pass `outputFormat: 'ulaw_8000'` explicitly
4323
- * in that case.
5288
+ * const stt = new speechmatics.STT({ apiKey: "sm_...", language: "en" });
5289
+ * ```
4324
5290
  */
4325
- declare class ElevenLabsTTS {
4326
- private readonly apiKey;
4327
- private readonly voiceId;
4328
- private readonly modelId;
4329
- private readonly outputFormat;
4330
- private readonly voiceSettings;
4331
- private readonly languageCode;
4332
- private readonly chunkSize;
4333
- constructor(apiKey: string, voiceId?: string, modelId?: string, outputFormat?: ElevenLabsOutputFormat | string);
4334
- constructor(apiKey: string, options: ElevenLabsTTSOptions$1);
4335
- /**
4336
- * Construct an instance pre-configured for Twilio Media Streams.
4337
- *
4338
- * Sets `outputFormat='ulaw_8000'` so ElevenLabs emits μ-law @ 8 kHz
4339
- * directly — the exact wire format Twilio's media stream uses — letting
4340
- * the SDK skip the 16 kHz→8 kHz resample and PCM→μ-law conversion in
4341
- * `TwilioAudioSender`. Saves ~30–80 ms first-byte and per-frame CPU,
4342
- * and removes a potential aliasing source.
4343
- *
4344
- * `voiceSettings` defaults to a low-bandwidth-friendly profile
4345
- * (speaker boost off, modest stability) which sounds cleaner at 8 kHz
4346
- * μ-law than the studio default. Pass an explicit object to override.
4347
- */
4348
- static forTwilio(apiKey: string, options?: Omit<ElevenLabsTTSOptions$1, 'outputFormat'>): ElevenLabsTTS;
4349
- /**
4350
- * Construct an instance pre-configured for Telnyx bidirectional media.
4351
- *
4352
- * Telnyx's default media-streaming codec is L16 PCM @ 16 kHz, which
4353
- * matches our default Telnyx handler. We pick `pcm_16000` so the audio
4354
- * flows end-to-end with zero resampling or transcoding.
4355
- *
4356
- * Trade-off: if your Telnyx profile is pinned to PCMU/8000 (μ-law),
4357
- * construct `ElevenLabsTTS` directly with `outputFormat: 'ulaw_8000'`
4358
- * — Telnyx supports that natively too.
4359
- */
4360
- static forTelnyx(apiKey: string, options?: Omit<ElevenLabsTTSOptions$1, 'outputFormat'>): ElevenLabsTTS;
4361
- /**
4362
- * Synthesise text to speech and return the full audio as a single Buffer.
4363
- *
4364
- * For large chunks (or when latency matters) call `synthesizeStream` instead.
4365
- */
4366
- synthesize(text: string): Promise<Buffer>;
4367
- /**
4368
- * Synthesise text and yield audio chunks as they arrive (streaming).
4369
- *
4370
- * The yielded buffers are raw PCM at 16 kHz (or whatever `outputFormat` is
4371
- * configured to). `chunkSize` controls the maximum yield size — 512 is a
4372
- * good choice for low-latency telephony.
4373
- */
4374
- synthesizeStream(text: string): AsyncGenerator<Buffer>;
5291
+ declare class STT extends SpeechmaticsSTT {
5292
+ static readonly providerKey = "speechmatics";
5293
+ constructor(opts?: SpeechmaticsSTTOptions);
4375
5294
  }
4376
5295
 
4377
5296
  /** ElevenLabs TTS for Patter pipeline mode. */
@@ -4424,115 +5343,6 @@ declare class TTS$6 extends ElevenLabsTTS {
4424
5343
  static forTelnyx(apiKey: string, options?: Omit<ElevenLabsTTSOptions, "outputFormat">): TTS$6;
4425
5344
  }
4426
5345
 
4427
- /**
4428
- * WebSocket-based ElevenLabs TTS provider — opt-in low-latency variant.
4429
- *
4430
- * Targets the ElevenLabs streaming-input WebSocket endpoint
4431
- * (`/v1/text-to-speech/{voice_id}/stream-input`) instead of the HTTP
4432
- * `/stream` endpoint used by `ElevenLabsTTS`. Saves the HTTP request setup
4433
- * time per utterance (~50 ms) and avoids the HTTP cold-start TLS handshake
4434
- * when calls are bursty.
4435
- *
4436
- * API matches `ElevenLabsTTS` (`synthesizeStream(text)` returns an
4437
- * `AsyncGenerator<Buffer>`) so it can be passed anywhere a TTSAdapter is
4438
- * expected.
4439
- *
4440
- * Behaviour notes
4441
- * - WebSocket is opened **per-utterance** (matches HTTP semantics). A
4442
- * future revision may pool a WS across utterances of the same call
4443
- * session — see roadmap Phase 5b.
4444
- * - `auto_mode=true` is enabled by default. Pass `autoMode: false` to
4445
- * send a custom `chunk_length_schedule`.
4446
- * - `outputFormat` is exposed as a query parameter so `ulaw_8000` (Twilio
4447
- * native) and `pcm_16000` (Telnyx native) work without resampling.
4448
- * - `eleven_v3` is **not** supported — the WS endpoint rejects it.
4449
- * - `optimize_streaming_latency` is officially deprecated and is not
4450
- * exposed.
4451
- */
4452
-
4453
- /** Constructor options for {@link ElevenLabsWebSocketTTS}. */
4454
- interface ElevenLabsWebSocketTTSOptions {
4455
- apiKey: string;
4456
- voiceId?: string;
4457
- modelId?: ElevenLabsModel | string;
4458
- outputFormat?: string;
4459
- voiceSettings?: Record<string, unknown>;
4460
- languageCode?: string;
4461
- /** Let the server pick chunk timing. Default true. */
4462
- autoMode?: boolean;
4463
- /** WS keep-alive timeout in seconds (5–180). Default 60. */
4464
- inactivityTimeout?: number;
4465
- /**
4466
- * Manual chunk schedule, only used when ``autoMode: false``. Each value
4467
- * must be 5–500. ElevenLabs default is ``[120, 160, 250, 290]``.
4468
- */
4469
- chunkLengthSchedule?: number[];
4470
- /** Outgoing audio re-chunk size in bytes. Default 4096. */
4471
- chunkSize?: number;
4472
- }
4473
- /** WebSocket-based ElevenLabs TTS adapter — opt-in low-latency variant. */
4474
- declare class ElevenLabsWebSocketTTS implements TTSAdapter {
4475
- static readonly providerKey = "elevenlabs_ws";
4476
- readonly apiKey: string;
4477
- readonly voiceId: string;
4478
- readonly modelId: string;
4479
- readonly voiceSettings?: Record<string, unknown>;
4480
- readonly languageCode?: string;
4481
- readonly autoMode: boolean;
4482
- readonly inactivityTimeout: number;
4483
- readonly chunkLengthSchedule?: number[];
4484
- readonly chunkSize: number;
4485
- /**
4486
- * The wire format requested over the ElevenLabs WS. Initially set from
4487
- * the constructor; ``setTelephonyCarrier`` may auto-flip it to the
4488
- * carrier's native codec when the caller did NOT pass ``outputFormat``
4489
- * explicitly.
4490
- */
4491
- private _outputFormat;
4492
- private readonly _outputFormatExplicit;
4493
- /** Public read-only view of the (possibly auto-flipped) wire format. */
4494
- get outputFormat(): string;
4495
- constructor(opts: ElevenLabsWebSocketTTSOptions);
4496
- /**
4497
- * Hook called by ``StreamHandler`` to advise the carrier wire format.
4498
- *
4499
- * When the user did NOT pass an explicit ``outputFormat`` in the
4500
- * constructor options, this flips the format to the carrier's native
4501
- * wire codec — saving a client-side transcode step. Calling with an
4502
- * unknown carrier (``""`` / ``"custom"``) is a no-op.
4503
- *
4504
- * When ``outputFormat`` was explicitly passed (incl. via the
4505
- * ``forTwilio`` / ``forTelnyx`` factories), this method is a no-op —
4506
- * the user's choice always wins.
4507
- */
4508
- setTelephonyCarrier(carrier: string): void;
4509
- /** Pre-configured for Twilio Media Streams (`ulaw_8000`). */
4510
- static forTwilio(opts: Omit<ElevenLabsWebSocketTTSOptions, 'outputFormat'>): ElevenLabsWebSocketTTS;
4511
- /** Pre-configured for Telnyx (`pcm_16000`). */
4512
- static forTelnyx(opts: Omit<ElevenLabsWebSocketTTSOptions, 'outputFormat'>): ElevenLabsWebSocketTTS;
4513
- private buildUrl;
4514
- /**
4515
- * Single-shot synthesis: open WS, send text, yield bytes, close.
4516
- *
4517
- * Resilience contract:
4518
- * - Connection bounded by ``CONNECT_TIMEOUT_MS`` (5s, was 15s).
4519
- * - Each idle wait bounded by ``FRAME_TIMEOUT_MS`` (30s) so a stalled
4520
- * server cannot keep the generator alive indefinitely.
4521
- * - Permanent error handler attached BEFORE the open await — prevents
4522
- * ``uncaughtException`` if an error fires after the once-listener
4523
- * resolves.
4524
- * - All event listeners removed in ``finally`` (no closure leak past
4525
- * socket close).
4526
- * - Server-reported ``error`` raises ``ElevenLabsTTSError``.
4527
- * - Per-frame audio payload capped at ``MAX_AUDIO_B64_BYTES``.
4528
- * - Best-effort EOS ``{"text":""}`` sent in finally (not immediately
4529
- * after flush — auto_mode could otherwise truncate the tail audio).
4530
- */
4531
- synthesizeStream(text: string): AsyncGenerator<Buffer>;
4532
- /** No-op — connections are per-utterance and torn down inside synthesizeStream. */
4533
- close(): Promise<void>;
4534
- }
4535
-
4536
5346
  /** ElevenLabs WebSocket TTS for Patter pipeline mode (opt-in low-latency). */
4537
5347
 
4538
5348
  /** Constructor options for the ElevenLabs WebSocket `TTS` adapter. */
@@ -4595,6 +5405,8 @@ declare class OpenAITTS {
4595
5405
  private readonly speed;
4596
5406
  private readonly antiAlias;
4597
5407
  private readonly targetSampleRate;
5408
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
5409
+ static readonly providerKey = "openai_tts";
4598
5410
  constructor(apiKey: string, voice?: string, model?: string, instructions?: string | null, speed?: number | null, antiAlias?: boolean, targetSampleRate?: number);
4599
5411
  /**
4600
5412
  * Synthesise text to speech and return the full audio as a single Buffer.
@@ -4736,6 +5548,8 @@ interface CartesiaTTSOptions$1 {
4736
5548
  }
4737
5549
  /** Cartesia TTS provider backed by the HTTP `/tts/bytes` streaming endpoint. */
4738
5550
  declare class CartesiaTTS {
5551
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
5552
+ static readonly providerKey = "cartesia_tts";
4739
5553
  private readonly apiKey;
4740
5554
  private readonly model;
4741
5555
  private readonly voice;
@@ -4768,6 +5582,25 @@ declare class CartesiaTTS {
4768
5582
  static forTelnyx(apiKey: string, options?: Omit<CartesiaTTSOptions$1, 'sampleRate'>): CartesiaTTS;
4769
5583
  /** Build the JSON payload for the Cartesia bytes endpoint. */
4770
5584
  private buildPayload;
5585
+ /**
5586
+ * Pre-call HTTP warmup for the Cartesia `/tts/bytes` endpoint.
5587
+ *
5588
+ * Issues a lightweight `GET <baseUrl>/voices` so DNS, TLS, and HTTP/2
5589
+ * are already up by the time the first `synthesizeStream()` POST
5590
+ * lands. Best-effort: 5 s timeout, all exceptions swallowed at
5591
+ * debug level.
5592
+ *
5593
+ * Billing safety: `GET /voices` is a free metadata read on
5594
+ * Cartesia's REST surface (per https://docs.cartesia.ai). It does
5595
+ * not consume synthesis credits. The actual synthesis is billed
5596
+ * only when `POST /tts/bytes` runs with a non-empty `transcript`.
5597
+ *
5598
+ * Note: Cartesia TTS uses the HTTP path (vs the WebSocket variant
5599
+ * Cartesia also exposes) — connection warmup is therefore HTTP-GET
5600
+ * based, not WebSocket pre-handshake. The latency win is smaller
5601
+ * (~50-150 ms vs the ~200-500 ms of a WS prewarm) but still real.
5602
+ */
5603
+ warmup(): Promise<void>;
4771
5604
  /** Synthesize text and return the concatenated audio buffer. */
4772
5605
  synthesize(text: string): Promise<Buffer>;
4773
5606
  /**
@@ -4843,6 +5676,8 @@ interface RimeTTSOptions$1 {
4843
5676
  }
4844
5677
  /** Rime TTS adapter for the `users.rime.ai/v1/rime-tts` HTTP streaming endpoint. */
4845
5678
  declare class RimeTTS {
5679
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
5680
+ static readonly providerKey = "rime";
4846
5681
  private readonly apiKey;
4847
5682
  private readonly model;
4848
5683
  private readonly speaker;
@@ -5001,6 +5836,8 @@ interface InworldTTSOptions$1 {
5001
5836
  * before calling the constructor.
5002
5837
  */
5003
5838
  declare class InworldTTS {
5839
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
5840
+ static readonly providerKey = "inworld";
5004
5841
  private readonly authToken;
5005
5842
  private readonly model;
5006
5843
  private readonly voice;
@@ -5014,6 +5851,33 @@ declare class InworldTTS {
5014
5851
  private readonly baseUrl;
5015
5852
  constructor(authToken: string, opts?: InworldTTSOptions$1);
5016
5853
  private buildPayload;
5854
+ /**
5855
+ * Pre-call HTTP warmup for the Inworld TTS API.
5856
+ *
5857
+ * Issues a lightweight `GET /tts/v1/voices` against the API host so
5858
+ * DNS + TLS + HTTP/2 connection are already up by the time the first
5859
+ * `synthesizeStream()` POST lands. Best-effort: 5 s timeout, all
5860
+ * exceptions swallowed at debug level.
5861
+ *
5862
+ * Earlier revisions issued `HEAD` against the streaming endpoint
5863
+ * (`/tts/v1/voice:stream`). That endpoint is POST-only so HEAD
5864
+ * returns `405 Method Not Allowed` — the warmup still completed the
5865
+ * TLS handshake but spammed 405 errors into Inworld's audit logs and
5866
+ * into our own logs. Switching to a documented `GET /tts/v1/voices`
5867
+ * metadata read is a 2xx-clean equivalent.
5868
+ *
5869
+ * Billing safety: `GET /tts/v1/voices` is a free metadata endpoint
5870
+ * (per https://docs.inworld.ai/). It returns the voice catalogue
5871
+ * without invoking the synthesis pipeline. The actual synthesis is
5872
+ * billed only when `POST /tts/v1/voice:stream` runs with a non-empty
5873
+ * `text`.
5874
+ *
5875
+ * Note: Inworld TTS uses the HTTP NDJSON streaming path rather than
5876
+ * a persistent WebSocket — connection warmup is therefore HTTP-based,
5877
+ * not WebSocket pre-handshake. The latency win is smaller (~50-150 ms)
5878
+ * than the WS-based prewarms but still real on cold-start calls.
5879
+ */
5880
+ warmup(): Promise<void>;
5017
5881
  /** Synthesize text and return the concatenated audio buffer. */
5018
5882
  synthesize(text: string): Promise<Buffer>;
5019
5883
  /**
@@ -5143,6 +6007,8 @@ interface AnthropicLLMOptions$1 {
5143
6007
  }
5144
6008
  /** LLM provider backed by Anthropic's Messages API (streaming). */
5145
6009
  declare class AnthropicLLMProvider implements LLMProvider {
6010
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
6011
+ static readonly providerKey = "anthropic";
5146
6012
  private readonly apiKey;
5147
6013
  private readonly model;
5148
6014
  private readonly maxTokens;
@@ -5151,6 +6017,13 @@ declare class AnthropicLLMProvider implements LLMProvider {
5151
6017
  private readonly anthropicVersion;
5152
6018
  private readonly promptCaching;
5153
6019
  constructor(options: AnthropicLLMOptions$1);
6020
+ /**
6021
+ * Pre-call DNS / TLS warmup for the Anthropic Messages API.
6022
+ * Issues a lightweight ``GET https://api.anthropic.com/v1/models`` so
6023
+ * DNS, TLS and HTTP/2 are already up by the time the first ``messages``
6024
+ * call lands. Best-effort: 5 s timeout, exceptions swallowed at debug.
6025
+ */
6026
+ warmup(): Promise<void>;
5154
6027
  /** Stream Patter-format LLM chunks for the given OpenAI-style chat history. */
5155
6028
  stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
5156
6029
  }
@@ -5238,6 +6111,8 @@ interface GroqLLMOptions$1 {
5238
6111
  }
5239
6112
  /** LLM provider backed by Groq's OpenAI-compatible Chat Completions API. */
5240
6113
  declare class GroqLLMProvider implements LLMProvider {
6114
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
6115
+ static readonly providerKey = "groq";
5241
6116
  private readonly apiKey;
5242
6117
  readonly model: string;
5243
6118
  private readonly baseUrl;
@@ -5252,6 +6127,11 @@ declare class GroqLLMProvider implements LLMProvider {
5252
6127
  private readonly presencePenalty?;
5253
6128
  private readonly stop?;
5254
6129
  constructor(options: GroqLLMOptions$1);
6130
+ /**
6131
+ * Pre-call DNS / TLS warmup for the Groq inference endpoint.
6132
+ * Best-effort: 5 s timeout, all exceptions swallowed at debug level.
6133
+ */
6134
+ warmup(): Promise<void>;
5255
6135
  /** Stream Patter-format LLM chunks from the Groq chat completions API. */
5256
6136
  stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
5257
6137
  }
@@ -5371,6 +6251,8 @@ interface CerebrasLLMOptions$1 {
5371
6251
  * - zai-glm-4.7
5372
6252
  */
5373
6253
  declare class CerebrasLLMProvider implements LLMProvider {
6254
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
6255
+ static readonly providerKey = "cerebras";
5374
6256
  private readonly apiKey;
5375
6257
  readonly model: string;
5376
6258
  private readonly baseUrl;
@@ -5386,6 +6268,11 @@ declare class CerebrasLLMProvider implements LLMProvider {
5386
6268
  private readonly presencePenalty?;
5387
6269
  private readonly stop?;
5388
6270
  constructor(options: CerebrasLLMOptions$1);
6271
+ /**
6272
+ * Pre-call DNS / TLS warmup for the Cerebras inference endpoint.
6273
+ * Best-effort: 5 s timeout, all exceptions swallowed at debug level.
6274
+ */
6275
+ warmup(): Promise<void>;
5389
6276
  /** Stream Patter-format LLM chunks from the Cerebras chat completions API. */
5390
6277
  stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
5391
6278
  }
@@ -5468,12 +6355,22 @@ interface GoogleLLMOptions$1 {
5468
6355
  }
5469
6356
  /** LLM provider backed by Google Gemini (Developer API, streaming SSE). */
5470
6357
  declare class GoogleLLMProvider implements LLMProvider {
6358
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
6359
+ static readonly providerKey = "google";
5471
6360
  private readonly apiKey;
5472
6361
  readonly model: string;
5473
6362
  private readonly baseUrl;
5474
6363
  private readonly temperature?;
5475
6364
  private readonly maxOutputTokens?;
5476
6365
  constructor(options: GoogleLLMOptions$1);
6366
+ /**
6367
+ * Pre-call DNS / TLS warmup for the Gemini API.
6368
+ * Issues a lightweight ``GET ${baseUrl}/models?key=...`` so DNS, TLS
6369
+ * and HTTP/2 are already up by the time the first
6370
+ * ``streamGenerateContent`` call lands. Best-effort: 5 s timeout, all
6371
+ * exceptions swallowed at debug level.
6372
+ */
6373
+ warmup(): Promise<void>;
5477
6374
  /** Stream Patter-format LLM chunks from the Gemini SSE endpoint. */
5478
6375
  stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
5479
6376
  }
@@ -5597,7 +6494,10 @@ declare class SileroVAD implements VADProvider {
5597
6494
  * - `activationThreshold = 0.5` — upstream `threshold`
5598
6495
  * - `deactivationThreshold = 0.35` — upstream `neg_threshold = threshold - 0.15`
5599
6496
  * - `minSpeechDuration = 0.25` — upstream `min_speech_duration_ms = 250`
5600
- * - `minSilenceDuration = 0.1` — upstream `min_silence_duration_ms = 100`
6497
+ * - `minSilenceDuration = 0.4` — telephony default (was 0.1, bumped after
6498
+ * round 10f found speech_end firing on inter-sentence pauses < 250 ms,
6499
+ * causing double-talk dispatch). 400 ms matches the industry telephony
6500
+ * default and the inter_utterance_gap_ms debounce in stream-handler.ts.
5601
6501
  * - `prefixPaddingDuration = 0.03` — upstream `speech_pad_ms = 30`
5602
6502
  *
5603
6503
  * Override any field by passing `options`. Deployments that experience
@@ -5639,6 +6539,263 @@ declare class SileroVAD implements VADProvider {
5639
6539
  private advanceState;
5640
6540
  /** Mark the VAD as closed; subsequent processFrame calls throw. */
5641
6541
  close(): Promise<void>;
6542
+ /**
6543
+ * Reset all per-utterance state so the next ``processFrame`` starts from
6544
+ * a clean SILENCE state.
6545
+ *
6546
+ * Called by the stream handler between agent turns to prevent a "stuck
6547
+ * SPEECH" condition where PSTN echo / loopback kept the detector's
6548
+ * probability above ``deactivationThreshold`` for the entire agent turn.
6549
+ * Without this reset the next user utterance would never trigger a
6550
+ * SILENCE→SPEECH transition and barge-in would feel "one-shot" (works
6551
+ * once, then never again until the call ends).
6552
+ *
6553
+ * Safe to call any time including on a closed instance (no-op).
6554
+ */
6555
+ reset(): void;
6556
+ }
6557
+
6558
+ /** Options accepted by {@link DeepFilterNetFilter}. */
6559
+ interface DeepFilterNetOptions {
6560
+ /** Absolute path to a DeepFilterNet ONNX model. If omitted, the filter
6561
+ * logs a warning and becomes a pass-through. */
6562
+ modelPath?: string;
6563
+ /** When true, disable the pass-through warning (used by tests). */
6564
+ silenceWarnings?: boolean;
6565
+ }
6566
+ /** OSS noise-suppression filter backed by a DeepFilterNet ONNX model. */
6567
+ declare class DeepFilterNetFilter implements AudioFilter {
6568
+ private readonly modelPath;
6569
+ private readonly silenceWarnings;
6570
+ private session;
6571
+ private ort;
6572
+ private warned;
6573
+ private closed;
6574
+ private _resamplerSrcRate;
6575
+ private _upsamplerInst;
6576
+ private _downsamplerInst;
6577
+ constructor(options?: DeepFilterNetOptions);
6578
+ private ensureSession;
6579
+ /** Run noise suppression on a PCM16 chunk; pass-through when no model is loaded. */
6580
+ process(pcmChunk: Buffer, sampleRate: number): Promise<Buffer>;
6581
+ /** Flush resamplers, release the ONNX session, and mark the filter closed. */
6582
+ close(): Promise<void>;
6583
+ }
6584
+
6585
+ /**
6586
+ * Krisp VIVA noise-reduction AudioFilter — TypeScript scaffold.
6587
+ *
6588
+ * Mirrors the API of the Python `getpatter.providers.krisp_filter.KrispVivaFilter`
6589
+ * for SDK parity. As of 2026-05 Krisp does not publish an official Node.js
6590
+ * (server) SDK; third-party browser/RN wrappers exist but cannot process
6591
+ * server-received PCM/mulaw audio. This class throws at construction time
6592
+ * and points the caller at the available paths (Python SDK or DeepFilterNet
6593
+ * on TS).
6594
+ *
6595
+ * When Krisp publishes an official Node binding — or a community NAPI/WASM
6596
+ * wrapper becomes available — the import below and `process()` body will
6597
+ * fill in. The class signature is intentionally compatible with the Python
6598
+ * one so callers do not need to migrate code: `camelCase` ↔ `snake_case`,
6599
+ * `modelPath` ↔ `model_path`, etc.
6600
+ *
6601
+ * Krisp VIVA is a proprietary SDK and requires a commercial license plus a
6602
+ * `.kef` model file provided by the user. Patter ships only the
6603
+ * AudioFilter interface scaffold — never the SDK or model.
6604
+ *
6605
+ * @see https://krisp.ai/developers/
6606
+ */
6607
+
6608
+ /** Krisp-supported sample rates (parity with Python `KrispSampleRate`). */
6609
+ declare const KrispSampleRate: {
6610
+ readonly HZ_8000: 8000;
6611
+ readonly HZ_16000: 16000;
6612
+ readonly HZ_32000: 32000;
6613
+ readonly HZ_44100: 44100;
6614
+ readonly HZ_48000: 48000;
6615
+ };
6616
+ type KrispSampleRate = (typeof KrispSampleRate)[keyof typeof KrispSampleRate];
6617
+ /** Krisp-supported frame durations in ms (parity with Python `KrispFrameDuration`). */
6618
+ declare const KrispFrameDuration: {
6619
+ readonly MS_10: 10;
6620
+ readonly MS_15: 15;
6621
+ readonly MS_20: 20;
6622
+ readonly MS_30: 30;
6623
+ readonly MS_32: 32;
6624
+ };
6625
+ type KrispFrameDuration = (typeof KrispFrameDuration)[keyof typeof KrispFrameDuration];
6626
+ /** Options accepted by {@link KrispVivaFilter}. */
6627
+ interface KrispVivaFilterOptions {
6628
+ /**
6629
+ * Path to the Krisp `.kef` model file. If omitted, falls back to the
6630
+ * `KRISP_VIVA_FILTER_MODEL_PATH` environment variable.
6631
+ */
6632
+ readonly modelPath?: string;
6633
+ /** Noise-suppression strength in `[0, 100]`. Defaults to `100`. */
6634
+ readonly noiseSuppressionLevel?: number;
6635
+ /** Frame duration in ms. One of `10, 15, 20, 30, 32`. Defaults to `10`. */
6636
+ readonly frameDurationMs?: KrispFrameDuration | number;
6637
+ /** Initial sample rate in Hz. Defaults to `16000`. Re-created lazily if it changes mid-call. */
6638
+ readonly sampleRate?: KrispSampleRate | number;
6639
+ }
6640
+ /**
6641
+ * Krisp VIVA noise-reduction filter — TypeScript scaffold (NOT YET IMPLEMENTED).
6642
+ *
6643
+ * Construction throws with a guidance message because Krisp does not ship a
6644
+ * Node.js SDK. The class exists for API parity with the Python
6645
+ * `KrispVivaFilter` so that user code does not need to be rewritten when a
6646
+ * Node binding lands.
6647
+ *
6648
+ * For TS users today, use {@link DeepFilterNetFilter} from
6649
+ * `./deepfilternet-filter` instead — same `AudioFilter` interface, no
6650
+ * license required.
6651
+ *
6652
+ * @example
6653
+ * ```ts
6654
+ * // FUTURE — when Krisp publishes a Node SDK:
6655
+ * import { KrispVivaFilter } from 'getpatter/providers/krisp-filter';
6656
+ * const filter = new KrispVivaFilter({ modelPath: '/path/to/model.kef' });
6657
+ * const agent = phone.agent({ audioFilter: filter, ... });
6658
+ * ```
6659
+ */
6660
+ declare class KrispVivaFilter implements AudioFilter {
6661
+ static readonly providerKey = "krisp_viva";
6662
+ constructor(_options?: KrispVivaFilterOptions);
6663
+ process(pcmChunk: Buffer, _sampleRate: number): Promise<Buffer>;
6664
+ close(): Promise<void>;
6665
+ }
6666
+
6667
+ /**
6668
+ * OpenAI Realtime adapter for the GA Realtime API (`gpt-realtime-2`).
6669
+ *
6670
+ * `gpt-realtime-2` is served from the same `wss://api.openai.com/v1/realtime`
6671
+ * endpoint as the v1-beta family, but the GA endpoint:
6672
+ * - REJECTS the legacy `OpenAI-Beta: realtime=v1` header (returns
6673
+ * `invalid_model` with message "Model X is only available on the GA API").
6674
+ * - REQUIRES `session.type === "realtime"` at the root of `session.update`.
6675
+ * - Uses `output_modalities` (was `modalities`).
6676
+ * - Nests audio config under `audio.{input,output}` with MIME `type`
6677
+ * strings (`audio/pcmu`, `audio/pcma`, `audio/pcm`) instead of the v1
6678
+ * enum strings (`g711_ulaw`, `g711_alaw`, `pcm16`) and moves `voice`
6679
+ * under `audio.output.voice`, `transcription` + `turn_detection`
6680
+ * under `audio.input`.
6681
+ *
6682
+ * Everything ELSE (event names, audio delta dispatch, barge-in / truncate
6683
+ * semantics, heartbeat, tool calling) is API-compatible with the v1 family,
6684
+ * so this adapter subclasses {@link OpenAIRealtimeAdapter} and overrides
6685
+ * only `connect()`. The runtime behaviour (`sendAudio`, `cancelResponse`,
6686
+ * `sendText`, `sendFirstMessage`, …) is inherited unchanged.
6687
+ */
6688
+
6689
+ /**
6690
+ * Realtime WebSocket adapter speaking OpenAI's GA Realtime API.
6691
+ *
6692
+ * Note on audio transport: the GA endpoint accepts only PCM-16-LE with
6693
+ * `rate >= 24000` for both `session.audio.input.format` and
6694
+ * `session.audio.output.format`. The `audio/pcmu` MIME type appears to be
6695
+ * accepted at the protocol level but the server's audio engine does not
6696
+ * actually decode mulaw 8 kHz frames — they're silently dropped, the input
6697
+ * buffer stays empty, `input_audio_buffer.commit` returns
6698
+ * "buffer only has 0.00ms of audio", and the call ends up muted. Until
6699
+ * OpenAI documents native g711_ulaw on the GA endpoint we transcode in
6700
+ * both directions on the Patter side:
6701
+ * - inbound (Twilio/Telnyx → model): mulaw 8 kHz → PCM 24 kHz
6702
+ * - outbound (model → Twilio/Telnyx): PCM 24 kHz → mulaw 8 kHz
6703
+ *
6704
+ * The outbound path needs a stateful resampler instance because the
6705
+ * 24 kHz → 8 kHz decimator carries phase between chunks; sharing a single
6706
+ * instance across the call eliminates the boundary clicks a stateless
6707
+ * helper would produce on every audio delta.
6708
+ */
6709
+ declare class OpenAIRealtime2Adapter extends OpenAIRealtimeAdapter {
6710
+ /** Two-stage outbound resampler for 24 kHz → 8 kHz. Created lazily on
6711
+ * the first audio frame so each Realtime session has its own state.
6712
+ *
6713
+ * We chain `24k → 16k → 8k` instead of using the direct `24k → 8k`
6714
+ * variant of {@link StatefulResampler}: the direct path is a 3:1
6715
+ * decimation with linear interpolation only — no anti-alias filter
6716
+ * — so any energy above 4 kHz in the source aliases down into the
6717
+ * audible band and is heard as raspy/scratchy artefacts on speech.
6718
+ * `gpt-realtime-2` outputs voice with significant content above
6719
+ * 4 kHz. The second stage (16k → 8k) uses a 5-tap FIR anti-alias
6720
+ * filter which removes the offending band before decimation, and
6721
+ * empirically (see commit message) the chain produces audibly
6722
+ * cleaner output. The 24k → 16k step is still pure linear-interp
6723
+ * but the inputs to it stay below the Nyquist of the 16 kHz stage,
6724
+ * so it doesn't introduce new artefacts.
6725
+ */
6726
+ private outboundResampler24To16;
6727
+ private outboundResampler16To8;
6728
+ /** Last 8 kHz input sample carried across chunk boundaries for the
6729
+ * direct 3× linear upsample (see `transcodeInboundMulaw8ToPcm24`).
6730
+ * The carry guarantees the very first output of each chunk
6731
+ * interpolates from the *real* preceding sample, not from the chunk's
6732
+ * own first sample replicated — without it every 20 ms Twilio frame
6733
+ * boundary becomes a small DC step that the GA server VAD interprets
6734
+ * as constant low-energy noise, which never crosses the speech
6735
+ * threshold. */
6736
+ private inbound8kCarry;
6737
+ /** GA-shape `session.update` payload. See module-level docstring. */
6738
+ private buildGASessionConfig;
6739
+ /**
6740
+ * Open the Realtime WebSocket against the GA endpoint and apply the GA
6741
+ * session configuration. Header `OpenAI-Beta: realtime=v1` is OMITTED
6742
+ * (the GA endpoint rejects it). Wire shape uses nested `audio.{input,
6743
+ * output}` + `output_modalities` + `session.type === "realtime"`.
6744
+ */
6745
+ connect(): Promise<void>;
6746
+ /**
6747
+ * GA-API variant of {@link OpenAIRealtimeAdapter.sendFirstMessage}. Two
6748
+ * differences from the v1 path:
6749
+ *
6750
+ * 1. The v1 implementation sends `response.modalities` which the GA
6751
+ * endpoint rejects with `Unknown parameter: 'response.modalities'`.
6752
+ * Use `output_modalities` to match the GA `session.update` shape.
6753
+ *
6754
+ * 2. The GA `response.create` does NOT inherit `audio.output.voice`
6755
+ * from the session — it falls back to the server-side default
6756
+ * (`marin`, female) when the field is omitted on the response
6757
+ * itself. Session-level `voice: "alloy"` only affects subsequent
6758
+ * server-VAD-triggered responses, NOT this explicit
6759
+ * `response.create`. We re-inject the configured voice here so the
6760
+ * first-message voice matches the rest of the call.
6761
+ */
6762
+ /**
6763
+ * Override the parent `sendAudio` to transcode inbound carrier audio
6764
+ * (mulaw 8 kHz from Twilio/Telnyx) into PCM-16 24 kHz before sending
6765
+ * `input_audio_buffer.append`. The GA server's audio engine ignores
6766
+ * mulaw frames (commit returns "buffer only has 0.00ms of audio") even
6767
+ * though it accepts `audio/pcmu` at the protocol level.
6768
+ */
6769
+ sendAudio(mulawAudio: Buffer): void;
6770
+ /**
6771
+ * mulaw 8 kHz Buffer → PCM-16-LE 24 kHz Buffer.
6772
+ *
6773
+ * Direct 3× linear-interpolation upsample with a one-sample carry
6774
+ * across chunk boundaries. For every consecutive pair of 8 kHz
6775
+ * samples `(s_a, s_b)` we emit three 24 kHz samples:
6776
+ *
6777
+ * out_0 = s_a
6778
+ * out_1 = 2/3·s_a + 1/3·s_b
6779
+ * out_2 = 1/3·s_a + 2/3·s_b
6780
+ *
6781
+ * The carry stores the last 8 kHz sample of the chunk so the next
6782
+ * chunk can start by pairing `(carry, firstNewSample)` — that's what
6783
+ * keeps the output rate exact (each input sample → 3 output samples)
6784
+ * and eliminates the chunk-boundary DC step that confused the GA
6785
+ * server VAD. The first chunk has no carry and loses 3 samples at
6786
+ * the leading edge (125 µs of audio); that's well below any audible
6787
+ * artefact and well below the GA VAD's 300 ms prefix-padding window.
6788
+ */
6789
+ private transcodeInboundMulaw8ToPcm24;
6790
+ /**
6791
+ * Base64 PCM-16-LE 24 kHz → Base64 mulaw 8 kHz. Used by the WS
6792
+ * translation shim on each `response.output_audio.delta`. The stateful
6793
+ * resamplers (24k → 16k, then 16k → 8k) are created lazily and reused
6793
+ * across all deltas in this session so their phase carries across chunk
6794
+ * boundaries — without that, every chunk boundary produces a click.
6796
+ */
6797
+ private transcodeOutboundPcm24ToMulaw8Buffer;
6798
+ sendFirstMessage(text: string): Promise<void>;
5642
6799
  }
5643
6800
 
5644
6801
  /**
@@ -6379,6 +7536,8 @@ declare class TelnyxSTT {
6379
7536
  private readonly transcriptionEngine;
6380
7537
  private readonly sampleRate;
6381
7538
  private readonly baseUrl;
7539
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
7540
+ static readonly providerKey = "telnyx_stt";
6382
7541
  private ws;
6383
7542
  private callbacks;
6384
7543
  private headerSent;
@@ -6425,6 +7584,8 @@ declare class TelnyxTTS {
6425
7584
  private readonly apiKey;
6426
7585
  private readonly voice;
6427
7586
  private readonly baseUrl;
7587
+ /** Stable pricing/dashboard key — read by stream-handler/metrics. */
7588
+ static readonly providerKey = "telnyx_tts";
6428
7589
  constructor(apiKey: string, voice?: string, baseUrl?: string);
6429
7590
  /** Collect every audio chunk into a single Buffer. */
6430
7591
  synthesize(text: string): Promise<Buffer>;
@@ -6504,4 +7665,4 @@ interface CallEvent {
6504
7665
  readonly direction?: string;
6505
7666
  }
6506
7667
 
6507
- export { type AgentOptions, type AgentState, AllProvidersFailedError, type AnthropicConversion, LLM$3 as AnthropicLLM, type AnthropicLLMOptions, type AnthropicMessage, AssemblyAIEncoding, AssemblyAIModel, STT$1 as AssemblyAISTT, type AssemblyAISTTOptions, type AudioConfig, type AudioSource, AuthenticationError, type BackgroundAudioOptions, BackgroundAudioPlayer, BuiltinAudioClip, type BuiltinAudioClipName, type BuiltinPcmSource, type CallControl, type CallEvent, type CallEventHandler, type CallMetrics, CallMetricsAccumulator, type CallRecord, type CartesiaEncoding, STT$3 as CartesiaSTT, type CartesiaSTTOptions, TTS$3 as CartesiaTTS, type CartesiaTTSOptions, LLM$1 as CerebrasLLM, type CerebrasLLMOptions, ChatContext, type ChatMessage, type ChatRole, CloudflareTunnel, type ConversationStateSnapshot, type CostBreakdown, DEFAULT_MIN_SENTENCE_LEN, DEFAULT_PRICING, DTMF_EVENTS, STT$6 as DeepgramSTT, type DeepgramSTTOptions, DefaultToolExecutor, type DefaultToolExecutorOptions, type DefineToolInput, type DtmfEvent, ConvAI as ElevenLabsConvAI, ElevenLabsConvAIAdapter, type ConvAIOptions as ElevenLabsConvAIOptions, TTS$6 as ElevenLabsTTS, type ElevenLabsTTSOptions, type ElevenLabsWebSocketOptions, TTS$5 as ElevenLabsWebSocketTTS, type EouTrigger, ErrorCode, EventBus, FallbackLLMProvider, type FallbackLLMProviderOptions, type FilePcmSource, GEMINI_DEFAULT_INPUT_SR, GEMINI_DEFAULT_OUTPUT_SR, GeminiLiveAdapter, type GeminiLiveEventHandler, LLM as GoogleLLM, type GoogleLLMOptions, LLM$2 as GroqLLM, type GroqLLMOptions, Guardrail$1 as Guardrail, type GuardrailOptions, type HookContext, IVRActivity, type IVRActivityOptions, type IVRToolDefinition, type IncomingMessage, type InitTracingOptions, TTS as InworldTTS, type InworldTTSOptions, type JobCallback, type LLMChunk, LLMLoop, type LLMProvider, LMNTAudioFormat, LMNTModel, LMNTSampleRate, TTS$1 as LMNTTTS, type LMNTTTSOptions, type LatencyBreakdown, type LocalCallOptions, type LocalConfig, type LocalOptions, type Logger, type 
LoopCallback, type MessageHandler, MetricsStore, Ngrok, LLM$4 as OpenAILLM, type OpenAILLMOptions, OpenAILLMProvider, type OpenAIMessage, Realtime as OpenAIRealtime, OpenAIRealtimeAdapter, type RealtimeOptions as OpenAIRealtimeOptions, TTS$4 as OpenAITTS, type OpenAITTSOptions, STT$4 as OpenAITranscribeSTT, type OpenAITranscribeSTTOptions, type ParamSpec, PartialStreamError, Patter, PatterConnectionError, PatterError, type PatterEventType, PatterTool, type PatterToolExecuteArgs, type PatterToolOptions, type PatterToolResult, PcmCarry, PipelineHookExecutor, type PipelineHooks, type PipelineMessageHandler, type ProviderPricing, ProvisionError, RateLimitError, type RawPcmSource, type RealtimeConfig, RemoteMessageHandler, TTS$2 as RimeTTS, type RimeTTSOptions, SPAN_BARGEIN, SPAN_CALL, SPAN_ENDPOINT, SPAN_LLM, SPAN_STT, SPAN_TOOL, SPAN_TTS, type SSEEvent, type STTConfig, type ScheduleHandle, SentenceChunker, type ServeOptions, type SilenceCallback, type SileroSampleRate, SileroVAD, type SileroVADOptions, STT$2 as SonioxSTT, type SonioxSTTOptions$1 as SonioxSTTOptions, type Span, type SpeechEventCallback, SpeechEvents, SpeechmaticsAudioEncoding, SpeechmaticsOperatingPoint, STT as SpeechmaticsSTT, type SpeechmaticsSTTOptions, SpeechmaticsSampleRate, SpeechmaticsServerMessage, TurnDetectionMode as SpeechmaticsTurnDetectionMode, StatefulResampler, type StatefulResamplerOptions, Static as StaticTunnel, type TTSConfig, Carrier as Telnyx, TelnyxAdapter, type TelnyxCarrierOptions, type ConfigureNumberOptions as TelnyxConfigureNumberOptions, type EndCallOptions as TelnyxEndCallOptions, type InitiateCallOptions as TelnyxInitiateCallOptions, type InitiateCallResult as TelnyxInitiateCallResult, type ProvisionNumberOptions as TelnyxProvisionNumberOptions, type ProvisionNumberResult as TelnyxProvisionNumberResult, TelnyxSTT, TelnyxSTTInputFormat, TelnyxSTTSampleRate, type Transcript as TelnyxSTTTranscript, TelnyxTTS, TelnyxTTSSampleRate, TelnyxTTSVoice, type 
TelnyxTranscriptionEngine, TestSession, TfidfLoopDetector, type TfidfLoopDetectorOptions, Tool, type ToolDefinition, type ToolExecutor, type ToolHandler, type ToolOptions, type TunnelHandle, type TurnMetrics, Carrier$1 as Twilio, TwilioAdapter, type TwilioAdapterOptions, type TwilioCarrierOptions, type ConfigureNumberOptions$1 as TwilioConfigureNumberOptions, type InitiateCallOptions$1 as TwilioInitiateCallOptions, type InitiateCallResult$1 as TwilioInitiateCallResult, type ProvisionNumberOptions$1 as TwilioProvisionNumberOptions, type ProvisionNumberResult$1 as TwilioProvisionNumberResult, ULTRAVOX_DEFAULT_API_BASE, ULTRAVOX_DEFAULT_SR, type UltravoxEventHandler, UltravoxRealtimeAdapter, type UserState, STT$5 as WhisperSTT, type WhisperSTTOptions, assemblyai, builtinClipPath, calculateRealtimeCost, calculateSttCost, calculateTelephonyCost, calculateTtsCost, callsToCsv, callsToJson, cartesia, createResampler16kTo8k, createResampler24kTo16k, createResampler24kTo8k, createResampler8kTo16k, deepgram, defineTool, elevenlabs, filterEmoji, filterForTTS, filterMarkdown, formatDtmf, geminiLive, getLogger, guardrail, initTracing, isRemoteUrl, isTracingEnabled, isWebSocketUrl, lmnt, makeAuthMiddleware, mergePricing, mixPcm, mountApi, mountDashboard, mulawToPcm16, notifyDashboard, openaiTts, pcm16ToMulaw, resample16kTo8k, resample24kTo16k, resample8kTo16k, resamplePcm, rime, scheduleCron, scheduleInterval, scheduleOnce, selectSoundFromList, setLogger, soniox, speechmatics, startSpan, startTunnel, tool, ultravox, whisper };
7668
+ export { type AgentOptions, type AgentState, AllProvidersFailedError, type AnthropicConversion, LLM$3 as AnthropicLLM, type AnthropicLLMOptions, type AnthropicMessage, AssemblyAIEncoding, AssemblyAIModel, STT$1 as AssemblyAISTT, type AssemblyAISTTOptions, type AudioConfig, type AudioSource, AuthenticationError, type BackgroundAudioOptions, BackgroundAudioPlayer, type EvaluateContext as BargeInEvaluateContext, type BargeInStrategy, BuiltinAudioClip, type BuiltinAudioClipName, type BuiltinPcmSource, type CallControl, type CallEvent, type CallEventHandler, type CallMetrics, CallMetricsAccumulator, type CallRecord, type CartesiaEncoding, STT$3 as CartesiaSTT, type CartesiaSTTOptions, TTS$3 as CartesiaTTS, type CartesiaTTSOptions, LLM$1 as CerebrasLLM, type CerebrasLLMOptions, ChatContext, type ChatMessage, type ChatRole, CloudflareTunnel, type ConversationStateSnapshot, type CostBreakdown, DEFAULT_MIN_SENTENCE_LEN, DEFAULT_PRICING, DTMF_EVENTS, DeepFilterNetFilter, type DeepFilterNetOptions, STT$6 as DeepgramSTT, type DeepgramSTTOptions, DefaultToolExecutor, type DefaultToolExecutorOptions, type DefineToolInput, type DtmfEvent, ConvAI as ElevenLabsConvAI, ElevenLabsConvAIAdapter, type ConvAIOptions as ElevenLabsConvAIOptions, ElevenLabsTTS as ElevenLabsRestTTS, TTS$6 as ElevenLabsTTS, type ElevenLabsTTSOptions, type ElevenLabsWebSocketOptions, TTS$5 as ElevenLabsWebSocketTTS, type EouTrigger, ErrorCode, EventBus, FallbackLLMProvider, type FallbackLLMProviderOptions, type FilePcmSource, GEMINI_DEFAULT_INPUT_SR, GEMINI_DEFAULT_OUTPUT_SR, GeminiLiveAdapter, type GeminiLiveEventHandler, LLM as GoogleLLM, type GoogleLLMOptions, LLM$2 as GroqLLM, type GroqLLMOptions, Guardrail$1 as Guardrail, type GuardrailOptions, type HookContext, IVRActivity, type IVRActivityOptions, type IVRToolDefinition, type IncomingMessage, type InitTracingOptions, TTS as InworldTTS, type InworldTTSOptions, type JobCallback, KrispFrameDuration, KrispSampleRate, KrispVivaFilter, type 
KrispVivaFilterOptions, type LLMChunk, LLMLoop, type LLMProvider, LMNTAudioFormat, LMNTModel, LMNTSampleRate, TTS$1 as LMNTTTS, type LMNTTTSOptions, type LatencyBreakdown, type LocalCallOptions, type LocalConfig, type LocalOptions, type Logger, type LoopCallback, type MessageHandler, MetricsStore, MinWordsStrategy, type MinWordsStrategyOptions, Ngrok, LLM$4 as OpenAILLM, type OpenAILLMOptions, OpenAILLMProvider, type OpenAIMessage, Realtime as OpenAIRealtime, Realtime2 as OpenAIRealtime2, OpenAIRealtime2Adapter, type Realtime2Options as OpenAIRealtime2Options, OpenAIRealtimeAdapter, type RealtimeOptions as OpenAIRealtimeOptions, TTS$4 as OpenAITTS, type OpenAITTSOptions, STT$4 as OpenAITranscribeSTT, type OpenAITranscribeSTTOptions, type ParamSpec, PartialStreamError, Patter, PatterConnectionError, PatterError, type PatterEventType, PatterTool, type PatterToolExecuteArgs, type PatterToolOptions, type PatterToolResult, PcmCarry, PipelineHookExecutor, type PipelineHooks, type PipelineMessageHandler, type ProviderPricing, ProvisionError, RateLimitError, type RawPcmSource, type RealtimeConfig, RemoteMessageHandler, TTS$2 as RimeTTS, type RimeTTSOptions, SPAN_BARGEIN, SPAN_CALL, SPAN_ENDPOINT, SPAN_LLM, SPAN_STT, SPAN_TOOL, SPAN_TTS, type SSEEvent, type STTConfig, type ScheduleHandle, SentenceChunker, type ServeOptions, type SilenceCallback, type SileroSampleRate, SileroVAD, type SileroVADOptions, STT$2 as SonioxSTT, type SonioxSTTOptions$1 as SonioxSTTOptions, type Span, type SpeechEventCallback, SpeechEvents, SpeechmaticsAudioEncoding, SpeechmaticsOperatingPoint, STT as SpeechmaticsSTT, type SpeechmaticsSTTOptions, SpeechmaticsSampleRate, SpeechmaticsServerMessage, TurnDetectionMode as SpeechmaticsTurnDetectionMode, StatefulResampler, type StatefulResamplerOptions, Static as StaticTunnel, type TTSConfig, Carrier as Telnyx, TelnyxAdapter, type TelnyxCarrierOptions, type ConfigureNumberOptions as TelnyxConfigureNumberOptions, type EndCallOptions as TelnyxEndCallOptions, 
type InitiateCallOptions as TelnyxInitiateCallOptions, type InitiateCallResult as TelnyxInitiateCallResult, type ProvisionNumberOptions as TelnyxProvisionNumberOptions, type ProvisionNumberResult as TelnyxProvisionNumberResult, TelnyxSTT, TelnyxSTTInputFormat, TelnyxSTTSampleRate, type Transcript as TelnyxSTTTranscript, TelnyxTTS, TelnyxTTSSampleRate, TelnyxTTSVoice, type TelnyxTranscriptionEngine, TestSession, TfidfLoopDetector, type TfidfLoopDetectorOptions, Tool, type ToolDefinition, type ToolExecutor, type ToolHandler, type ToolOptions, type TunnelHandle, type TurnMetrics, Carrier$1 as Twilio, TwilioAdapter, type TwilioAdapterOptions, type TwilioCarrierOptions, type ConfigureNumberOptions$1 as TwilioConfigureNumberOptions, type InitiateCallOptions$1 as TwilioInitiateCallOptions, type InitiateCallResult$1 as TwilioInitiateCallResult, type ProvisionNumberOptions$1 as TwilioProvisionNumberOptions, type ProvisionNumberResult$1 as TwilioProvisionNumberResult, ULTRAVOX_DEFAULT_API_BASE, ULTRAVOX_DEFAULT_SR, type UltravoxEventHandler, UltravoxRealtimeAdapter, type UserState, STT$5 as WhisperSTT, type WhisperSTTOptions, assemblyai, builtinClipPath, calculateRealtimeCost, calculateSttCost, calculateTelephonyCost, calculateTtsCost, callsToCsv, callsToJson, cartesia, createResampler16kTo8k, createResampler24kTo16k, createResampler24kTo8k, createResampler8kTo16k, deepgram, defineTool, elevenlabs, evaluateStrategies as evaluateBargeInStrategies, filterEmoji, filterForTTS, filterMarkdown, formatDtmf, geminiLive, getLogger, guardrail, initTracing, isRemoteUrl, isTracingEnabled, isWebSocketUrl, lmnt, makeAuthMiddleware, mergePricing, mixPcm, mountApi, mountDashboard, mulawToPcm16, notifyDashboard, openaiTts, pcm16ToMulaw, resample16kTo8k, resample24kTo16k, resample8kTo16k, resamplePcm, resetStrategies as resetBargeInStrategies, rime, scheduleCron, scheduleInterval, scheduleOnce, selectSoundFromList, setLogger, soniox, speechmatics, startSpan, startTunnel, tool, ultravox, 
whisper };