getpatter 0.6.7 → 0.6.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,5 +1,5 @@
1
1
  import * as WebSocket from 'ws';
2
- import WebSocket__default from 'ws';
2
+ import WebSocket__default, { WebSocket as WebSocket$1 } from 'ws';
3
3
  import { EventEmitter } from 'events';
4
4
  import { Request, Response, NextFunction, Express } from 'express';
5
5
 
@@ -184,6 +184,149 @@ declare class SpeechEvents {
184
184
  private dispatch;
185
185
  }
186
186
 
187
+ /**
188
+ * Carrier-neutral local call recording — interleaved stereo WAV on disk.
189
+ *
190
+ * `LocalCallRecorder` taps the audio that already flows through the SDK's
191
+ * per-call `StreamHandler` and writes a synchronized two-channel recording:
192
+ *
193
+ * - **left channel = caller** (inbound, post mulaw→PCM16 decode)
194
+ * - **right channel = agent** (outbound TTS / Realtime / ConvAI audio)
195
+ *
196
+ * Unlike the carrier-side `recording: true` option (Twilio Recordings API,
197
+ * Telnyx `record_start`, Plivo Record API) this works on ANY carrier because
198
+ * nothing leaves the process: the recorder is fed at the transport tap points
199
+ * and appends PCM to `recording.wav` incrementally. Enable via
200
+ * `Patter.serve({ localRecording: true })` (or a directory string). Parity
201
+ * with Python `getpatter/audio/call_recorder.py`.
202
+ *
203
+ * ## Format
204
+ *
205
+ * 16-bit little-endian PCM, 16 kHz, 2 channels. A placeholder RIFF header is
206
+ * written at open with zero sizes; {@link LocalCallRecorder.close} patches
207
+ * the `RIFF` and `data` chunk sizes so the file is parseable by any WAV
208
+ * reader. The handler teardown paths (`handleStop` / `handleWsClose` →
209
+ * `fireCallEnd`) call `close()`, so a truncated call still yields a valid
210
+ * file.
211
+ *
212
+ * ## Time alignment (documented approach)
213
+ *
214
+ * The recorder is **caller-clocked**. On PSTN the inbound media stream is a
215
+ * continuous realtime sequence of ~20 ms frames (silence included), so the
216
+ * caller channel is used as the wall clock:
217
+ *
218
+ * - Each caller chunk advances the file by exactly its own duration. The
219
+ * right channel for that span is popped from a bounded agent FIFO and
220
+ * zero-padded (silence) when the agent has nothing buffered — i.e. the
221
+ * lagging channel is padded with silence whenever the other channel writes.
222
+ * - Agent audio is appended to the FIFO on arrival and drained at the caller
223
+ * clock rate. TTS providers push audio faster than realtime (multi-second
224
+ * bursts); draining at the caller rate mirrors the carrier's own playout
225
+ * buffer, so agent audio lands on the timeline roughly where the caller
226
+ * actually heard it (frame-granularity ≈ the inbound chunk size, ~20 ms).
227
+ *
228
+ * Alignment is therefore duration-based, not wall-clock-based: cross-channel
229
+ * skew is bounded by the carrier playout backlog.
230
+ *
231
+ * ## Memory profile (allocation-light, no per-frame disk I/O)
232
+ *
233
+ * Interleaved frames accumulate in a 64 KiB write buffer (~1 s of stereo
234
+ * audio) that is flushed with a single `writeSync` when full and on
235
+ * `close()` — the 20 ms hot path never issues per-frame disk writes.
236
+ * Nothing accumulates for the call duration. Internal state is bounded: at
237
+ * most one odd carry byte on each PCM path, a fixed-state resampler per
238
+ * channel, the bounded write buffer, and an agent FIFO capped at
239
+ * {@link AGENT_BACKLOG_CAP_S} seconds (~1.9 MB) whose overflow is
240
+ * force-flushed against left-channel silence.
241
+ *
242
+ * Any I/O error disables the recorder for the rest of the call (logged
243
+ * once) — recording must never take down a live phone call.
244
+ */
245
+ /** Output sample rate (both channels) — the SDK's internal PCM16 rate. */
246
+ declare const RECORDING_SAMPLE_RATE = 16000;
247
+ /**
248
+ * Maximum buffered agent audio (seconds) awaiting caller-clock drain. The
249
+ * agent FIFO mirrors the carrier playout buffer; 60 s covers even very long
250
+ * single-turn monologues. Overflow is force-flushed (silence on the caller
251
+ * channel), bounding memory at ~1.9 MB per recorded call.
252
+ */
253
+ declare const AGENT_BACKLOG_CAP_S = 60;
254
+ /** Wire encodings the taps may feed; everything is decoded to PCM16 16 kHz. */
255
+ type RecorderEncoding = 'pcm16_16k' | 'mulaw_8k' | 'pcm16_8k' | 'pcm16_24k';
256
+ /**
257
+ * Incremental stereo WAV writer for one call (left=caller, right=agent).
258
+ *
259
+ * The file is opened (and the placeholder header written) eagerly so
260
+ * permission errors surface at call start, not mid-call. All `add*` methods
261
+ * are synchronous and cheap (one decode + one buffered append per ~20 ms
262
+ * frame; disk writes happen in 64 KiB batches); errors never propagate —
263
+ * the recorder disables itself and logs a single warning.
264
+ */
265
+ declare class LocalCallRecorder {
266
+ private readonly filePath;
267
+ private fd;
268
+ private dataBytes;
269
+ private isClosed;
270
+ private broken;
271
+ private readonly callerDecoder;
272
+ private readonly agentDecoder;
273
+ /** Bounded FIFO of decoded agent PCM16@16k awaiting caller-clock drain. */
274
+ private agentBacklog;
275
+ private agentBacklogBytes;
276
+ /** Test hook: shrink to exercise the overflow path without 60 s of audio. */
277
+ maxBacklogBytes: number;
278
+ private callerSampleCount;
279
+ private agentSampleCount;
280
+ /** Interleaved stereo bytes awaiting the next batched disk write. */
281
+ private pendingChunks;
282
+ private pendingBytes;
283
+ /** Stereo data bytes already flushed to disk (file offset = 44 + this). */
284
+ private flushedBytes;
285
+ constructor(filePath: string);
286
+ /** Target WAV path. */
287
+ get path(): string;
288
+ /** True once `close()` has run (or the recorder broke). */
289
+ get closed(): boolean;
290
+ /**
291
+ * Stereo frames appended so far (the 64 KiB write buffer flushes roughly
292
+ * once per second of audio and always on `close()`).
293
+ */
294
+ get framesWritten(): number;
295
+ /** Real caller samples ingested (excludes padding). */
296
+ get callerSamples(): number;
297
+ /** Real agent samples ingested (excludes padding). */
298
+ get agentSamples(): number;
299
+ /** Recorded duration so far (committed stereo frames / 16 kHz). */
300
+ get durationSeconds(): number;
301
+ /**
302
+ * Record a caller-side chunk; advances the file by its duration. The
303
+ * right channel for the span is drained from the agent FIFO and
304
+ * zero-padded (silence) when the agent is not speaking.
305
+ */
306
+ addCallerAudio(data: Buffer, encoding?: RecorderEncoding): void;
307
+ /** Buffer an agent-side chunk; it drains at the caller clock rate. */
308
+ addAgentAudio(data: Buffer, encoding?: RecorderEncoding): void;
309
+ /**
310
+ * Drain the agent FIFO, patch the WAV header, and close the file.
311
+ *
312
+ * Idempotent and exception-safe — callable from every teardown path
313
+ * (carrier `stop`, abnormal WS close, `fireCallEnd`). Returns the
314
+ * recording path when a playable file exists, else `null`.
315
+ */
316
+ close(): string | null;
317
+ /** Pop up to `numBytes` from the agent FIFO, zero-padded to size. */
318
+ private popAgentBacklog;
319
+ /**
320
+ * Interleave equal-length mono buffers and append to the write buffer.
321
+ * The buffer is flushed to disk in 64 KiB batches (and on `close()`),
322
+ * so the per-frame cost is one `Buffer.alloc` + array push — no syscall.
323
+ */
324
+ private writeFrames;
325
+ /** Write all buffered stereo bytes in one `writeSync` call. */
326
+ private flushPending;
327
+ private markBroken;
328
+ }
329
+
187
330
  /**
188
331
  * OpenAI Realtime WebSocket adapter for Patter's realtime mode.
189
332
  *
@@ -347,8 +490,8 @@ declare class OpenAIRealtimeAdapter {
347
490
  protected readonly apiKey: string;
348
491
  protected readonly model: string;
349
492
  protected readonly voice: string;
350
- protected readonly instructions: string;
351
- protected readonly tools?: Array<{
493
+ protected instructions: string;
494
+ protected tools?: Array<{
352
495
  name: string;
353
496
  description: string;
354
497
  parameters: Record<string, unknown>;
@@ -547,6 +690,41 @@ declare class OpenAIRealtimeAdapter {
547
690
  sendReassurance(text: string): Promise<void>;
548
691
  /** Submit a tool/function-call result and request the next response. */
549
692
  sendFunctionResult(callId: string, result: string): Promise<void>;
693
+ /**
694
+ * Build the partial `session` body for a mid-session swap.
695
+ *
696
+ * v1-beta wire shape: flat `{ instructions, tools }`. The GA adapter
697
+ * overrides this to add the mandatory `"type": "realtime"` envelope.
698
+ * OpenAI merges partial `session.update` payloads server-side, so only the
699
+ * swapped fields are sent — audio formats, VAD tuning, and voice are
700
+ * untouched. Mirrors Python `_build_session_update_patch`.
701
+ */
702
+ protected buildSessionUpdatePatch(instructions: string | undefined, tools: Array<{
703
+ name: string;
704
+ description: string;
705
+ parameters: Record<string, unknown>;
706
+ strict?: boolean;
707
+ }> | undefined): Record<string, unknown>;
708
+ /**
709
+ * Apply a mid-session `instructions` / `tools` swap (multi-agent handoff).
710
+ *
711
+ * Sends a partial `session.update` carrying only the supplied fields and
712
+ * records them on the adapter so reconnect/warmup paths rebuild the
713
+ * session with the post-handoff config. Voice is intentionally NOT
714
+ * updatable here — OpenAI Realtime rejects a voice change once the session
715
+ * has produced audio, so the session keeps the voice established at call
716
+ * start (documented limitation). Mirrors Python
717
+ * `OpenAIRealtimeAdapter.update_session`.
718
+ */
719
+ updateSession(update: {
720
+ instructions?: string;
721
+ tools?: Array<{
722
+ name: string;
723
+ description: string;
724
+ parameters: Record<string, unknown>;
725
+ strict?: boolean;
726
+ }>;
727
+ }): Promise<void>;
550
728
  /** Stop the heartbeat, drop listeners, and close the Realtime WebSocket. */
551
729
  close(): void;
552
730
  }
@@ -620,6 +798,8 @@ declare class ElevenLabsConvAIAdapter {
620
798
  private finalizeAgentTurn;
621
799
  private scheduleSilenceDone;
622
800
  private handleMessage;
801
+ /** Answer a ``client_tool_call`` from the ElevenLabs agent. */
802
+ sendClientToolResult(toolCallId: string, result: string, isError?: boolean): void;
623
803
  /** Send a caller-side audio chunk to ConvAI as a base64 `user_audio_chunk`. */
624
804
  sendAudio(audioBytes: Buffer): void;
625
805
  /** Register the event callback that receives ConvAI server messages. */
@@ -775,7 +955,7 @@ declare function calculateTelephonyCost(provider: string, durationSeconds: numbe
775
955
  * Patter logger rather than being swallowed or crashing the call.
776
956
  */
777
957
  /** String tag identifying every event type the `EventBus` knows how to dispatch. */
778
- type PatterEventType = 'turn_started' | 'turn_ended' | 'eou_metrics' | 'interruption' | 'llm_metrics' | 'tts_metrics' | 'stt_metrics' | 'metrics_collected' | 'call_ended' | 'transcript_partial' | 'transcript_final' | 'llm_chunk' | 'tts_chunk' | 'tool_call_started';
958
+ type PatterEventType = 'turn_started' | 'turn_ended' | 'eou_metrics' | 'interruption' | 'llm_metrics' | 'tts_metrics' | 'stt_metrics' | 'metrics_collected' | 'call_ended' | 'transcript_partial' | 'transcript_final' | 'llm_chunk' | 'tts_chunk' | 'tool_call_started' | 'false_interruption';
779
959
  type Listener<T = unknown> = (payload: T) => void | Promise<void>;
780
960
  /** In-process pub/sub for Patter call-lifecycle events. */
781
961
  declare class EventBus {
@@ -912,11 +1092,29 @@ interface CallMetrics {
912
1092
  /** Terminal error code when the call ended abnormally (a lowercased ErrorCode
913
1093
  * value or "other"); empty/absent for a clean call. Never the message. */
914
1094
  readonly error_code?: string;
1095
+ /** PREEMPTIVE GENERATION counters (pipeline mode, only non-zero when
1096
+ * `agent.preemptiveGeneration` is true). `preemptive_hits` counts
1097
+ * speculative turns released on a matching final transcript (latency win);
1098
+ * `preemptive_misses` counts speculations started but discarded
1099
+ * (mismatched final, barge-in, replaced by a newer interim, buffer
1100
+ * overflow) — i.e. wasted LLM/TTS spend. Mirrors Python
1101
+ * `CallMetrics.preemptive_hits` / `preemptive_misses`. */
1102
+ readonly preemptive_hits?: number;
1103
+ readonly preemptive_misses?: number;
915
1104
  }
916
1105
  /** Programmatic control surface for a live call (transfer, hangup, DTMF). */
917
1106
  interface CallControl {
918
- /** Transfer the call to a different number or SIP URI. */
919
- transfer(number: string): Promise<void>;
1107
+ /**
1108
+ * Transfer the call to a different number or SIP URI.
1109
+ *
1110
+ * `options.mode === 'warm'` requests a hold-announce-bridge warm transfer
1111
+ * (Twilio only for now); omitted / `'cold'` runs the historical blind
1112
+ * redirect byte-identically. Warm mode resolves a
1113
+ * {@link TransferCallResult} envelope (`{ error }` when unsupported or
1114
+ * failed — the call keeps running); cold mode may resolve `void` (legacy
1115
+ * contract). Mirrors Python `CallControl.transfer(number, mode=, summary=)`.
1116
+ */
1117
+ transfer(number: string, options?: TransferCallOptions): Promise<TransferCallResult | void>;
920
1118
  /** Hang up the call. */
921
1119
  hangup(): Promise<void>;
922
1120
  /**
@@ -1000,6 +1198,8 @@ declare class CallMetricsAccumulator {
1000
1198
  private _actualTelephonyCost;
1001
1199
  private _actualSttCost;
1002
1200
  private _totalLlmCost;
1201
+ private _preemptiveHits;
1202
+ private _preemptiveMisses;
1003
1203
  private _llmModel;
1004
1204
  private _eventBus;
1005
1205
  /** Timestamp (hrTimeMs) when VAD emitted speech_end. */
@@ -1243,6 +1443,13 @@ declare class CallMetricsAccumulator {
1243
1443
  text_tokens?: number;
1244
1444
  };
1245
1445
  }, model?: string | null): void;
1446
+ /** Count a preemptive (speculative) turn RELEASED on a matching final
1447
+ * transcript — the buffered LLM+TTS work became the real turn. */
1448
+ recordPreemptiveHit(): void;
1449
+ /** Count a preemptive (speculative) turn started but discarded without
1450
+ * release (mismatched final, barge-in during speculation, replaced by a
1451
+ * newer interim, or buffer overflow). */
1452
+ recordPreemptiveMiss(): void;
1246
1453
  /** Override the carrier-billed telephony cost (e.g. exact value reported via Twilio API). */
1247
1454
  setActualTelephonyCost(cost: number): void;
1248
1455
  /** Override the provider-billed STT cost when an exact figure is available. */
@@ -1527,6 +1734,1403 @@ declare class MetricsStore extends EventEmitter {
1527
1734
  hydrate(logRoot: string | null | undefined): number;
1528
1735
  }
1529
1736
 
1737
+ type AIAdapter = OpenAIRealtimeAdapter | ElevenLabsConvAIAdapter;
1738
+ /** Provider-specific operations that differ between Twilio, Telnyx and Plivo. */
1739
+ interface TelephonyBridge {
1740
+ /** Human-readable label for log messages. */
1741
+ readonly label: string;
1742
+ /** Telephony provider name for metrics. */
1743
+ readonly telephonyProvider: CarrierKind;
1744
+ /** Wire format of the inbound media stream after the carrier has accepted
1745
+ * the call. Lets the StreamHandler decide whether to decode + resample
1746
+ * inbound audio without needing carrier-name knowledge — mulaw 8 kHz
1747
+ * carriers (Twilio, Plivo) say ``ulaw_8000``, PCM 16 kHz carriers
1748
+ * (Telnyx with PCMU bidirectional negotiation off) say ``pcm_16000``. */
1749
+ readonly inputWireFormat: 'ulaw_8000' | 'pcm_16000';
1750
+ /** Send an audio chunk (base64-encoded) to the telephony WebSocket. */
1751
+ sendAudio(ws: WebSocket$1, audioBase64: string, streamSid: string): void;
1752
+ /** Send a mark event to track audio playback progress (no-op for Telnyx). */
1753
+ sendMark(ws: WebSocket$1, markName: string, streamSid: string): void;
1754
+ /** Send a clear/interrupt event to stop audio playback. */
1755
+ sendClear(ws: WebSocket$1, streamSid: string): void;
1756
+ /** Transfer the call to a different number or SIP URI via provider API.
1757
+ * ``options.mode === 'warm'`` requests a hold-announce-bridge warm
1758
+ * transfer (Twilio only for now); the default / omitted options run the
1759
+ * historical cold (blind) redirect byte-identically. Returns a
1760
+ * {@link TransferCallResult} envelope for warm mode (``{ error }`` when
1761
+ * unsupported / failed — the call keeps running); cold mode may resolve
1762
+ * ``void`` (legacy contract). */
1763
+ transferCall(callId: string, toNumber: string, options?: TransferCallOptions): Promise<TransferCallResult | void>;
1764
+ /** Hang up the call via provider API. */
1765
+ endCall(callId: string, ws: WebSocket$1): Promise<void>;
1766
+ /** Send DTMF digits to the caller. Carriers using REST (Telnyx) ignore
1767
+ * ``ws``; carriers that send DTMF as a media-stream message (Plivo) use it. */
1768
+ sendDtmf?(ws: WebSocket$1, callId: string, digits: string, delayMs: number): Promise<void>;
1769
+ /** Start call recording via provider API (optional). */
1770
+ startRecording?(callId: string): Promise<void>;
1771
+ /** Stop call recording via provider API (optional). */
1772
+ stopRecording?(callId: string): Promise<void>;
1773
+ /** Create an STT instance appropriate for this provider's audio format.
1774
+ * Returns any of the supported STT adapters (DeepgramSTT, WhisperSTT,
1775
+ * CartesiaSTT, SonioxSTT, AssemblyAISTT) or null when no STT is configured. */
1776
+ createStt(agent: AgentOptions): Promise<STTAdapter | null>;
1777
+ /** Query actual telephony costs after call ends. */
1778
+ queryTelephonyCost(metricsAcc: CallMetricsAccumulator, callId: string): Promise<void>;
1779
+ }
1780
+ /** Per-call dependencies injected into `StreamHandler` (immutable for the call's lifetime). */
1781
+ interface StreamHandlerDeps {
1782
+ readonly config: {
1783
+ readonly openaiKey?: string;
1784
+ readonly twilioSid?: string;
1785
+ readonly twilioToken?: string;
1786
+ };
1787
+ readonly agent: AgentOptions;
1788
+ readonly bridge: TelephonyBridge;
1789
+ readonly metricsStore: MetricsStore;
1790
+ readonly pricing: Record<string, Partial<ProviderPricing>> | null;
1791
+ readonly remoteHandler: RemoteMessageHandler;
1792
+ /**
1793
+ * Per-call start callback. A returned object is treated as PER-CALL AGENT
1794
+ * OVERRIDES (snake_case keys: system_prompt, voice, model, language,
1795
+ * first_message, provider, tools, variables) — parity with Python's
1796
+ * ``apply_call_overrides``. Return nothing for the legacy observe-only
1797
+ * behaviour.
1798
+ */
1799
+ readonly onCallStart?: (data: Record<string, unknown>) => Promise<void | Record<string, unknown> | undefined> | void | Record<string, unknown>;
1800
+ readonly onCallEnd?: (data: Record<string, unknown>) => Promise<void>;
1801
+ readonly onTranscript?: (data: Record<string, unknown>) => Promise<void>;
1802
+ readonly onMessage?: PipelineMessageHandler | string;
1803
+ readonly onMetrics?: (data: Record<string, unknown>) => Promise<void>;
1804
+ readonly recording: boolean;
1805
+ /**
1806
+ * Optional factory returning a carrier-neutral local call recorder for
1807
+ * ``callId`` (wired by ``EmbeddedServer.makeLocalRecorder`` when
1808
+ * ``serve({ localRecording })`` is on). Returning ``null`` / leaving the
1809
+ * field unset keeps every recording tap a no-op. The handler owns the
1810
+ * recorder lifetime: created in ``handleCallStart``, finalized in
1811
+ * ``fireCallEnd`` (every teardown path funnels there).
1812
+ */
1813
+ readonly makeLocalRecorder?: (callId: string) => LocalCallRecorder | null;
1814
+ /** When true, only the first TTFB per call is forwarded to the event bus. Default false. */
1815
+ readonly reportOnlyInitialTtfb?: boolean;
1816
+ /**
1817
+ * Optional speech-edge events dispatcher. When provided, the handler emits
1818
+ * turn-taking edges (VAD start/stop, EOU commit, agent first/last wire
1819
+ * chunk) as the call progresses. ``undefined`` means no events are fired
1820
+ * — exact prior behaviour. See ``src/_speech-events.ts``.
1821
+ */
1822
+ readonly speechEvents?: SpeechEvents;
1823
+ /** Build an AI adapter (OpenAI Realtime or ElevenLabs ConvAI). Injected to avoid circular imports. */
1824
+ readonly buildAIAdapter: (resolvedPrompt: string, tools?: readonly ToolDefinition[]) => AIAdapter;
1825
+ /** Sanitize untrusted key-value variables map. */
1826
+ readonly sanitizeVariables: (raw: Record<string, unknown>) => Record<string, string>;
1827
+ /** Replace {key} placeholders in a template string. */
1828
+ readonly resolveVariables: (template: string, variables: Record<string, string>) => string;
1829
+ /**
1830
+ * Optional accessor returning pre-rendered first-message audio for
1831
+ * ``callId``. Wired by ``Patter.serve()`` when the parent client has
1832
+ * ``agent.prewarmFirstMessage: true``. Returning ``undefined`` means
1833
+ * "no prewarm — always run live TTS".
1834
+ */
1835
+ readonly popPrewarmAudio?: (callId: string) => Buffer | undefined;
1836
+ /**
1837
+ * Optional accessor returning pre-opened, fully-handshaked provider
1838
+ * WebSockets for ``callId`` so the per-call StreamHandler can
1839
+ * adopt them at ``start`` instead of paying the cold handshake on
1840
+ * the first turn. Wired by ``Patter.serve()``. Returning
1841
+ * ``undefined`` (or any sub-field unset) means "no parked socket
1842
+ * for this provider — fall back to fresh ``connect()``".
1843
+ */
1844
+ readonly popPrewarmedConnections?: (callId: string) => ParkedProviderConnections | undefined;
1845
+ }
1846
+ /** Per-call session controller — owns the AI adapter, STT/TTS pipeline, and metrics. */
1847
+ declare class StreamHandler {
1848
+ private readonly deps;
1849
+ private readonly ws;
1850
+ private caller;
1851
+ private callee;
1852
+ private streamSid;
1853
+ private callId;
1854
+ private adapter;
1855
+ private stt;
1856
+ private tts;
1857
+ private isSpeaking;
1858
+ /**
1859
+ * True only while the post-TTS tail-grace window is pending: the agent has
1860
+ * finished its turn but ``isSpeaking`` is still held for
1861
+ * ``PATTER_TTS_TAIL_GRACE_MS`` to swallow the fading echo tail. A VAD
1862
+ * ``speech_start`` (or a transcript) during this window is the user's NEXT
1863
+ * turn, not a barge-in — there is nothing left to interrupt. Set by
1864
+ * ``endSpeakingWithGrace``; cleared by ``beginSpeaking``, the grace flip,
1865
+ * ``cancelSpeaking``, and ``endTailGraceForNewTurn``. Parity with Python
1866
+ * ``_tail_grace_active``.
1867
+ */
1868
+ private tailGraceActive;
1869
+ /**
1870
+ * Ring buffer of inbound PCM16 16 kHz frames captured while the agent
1871
+ * is speaking and the self-hearing guard is dropping audio. On
1872
+ * barge-in we flush this buffer to STT so Deepgram (or any other
1873
+ * streaming STT) receives the user's first ~500 ms of speech — which
1874
+ * would otherwise be lost while the VAD's `minSpeechDuration` window
1875
+ * accumulated and fired `speech_start`. Each frame is 20 ms × 32 bytes/ms
1876
+ * (16 kHz × 16-bit mono) = 640 bytes.
1877
+ *
1878
+ * Capped to ``INBOUND_AUDIO_RING_FRAMES`` to recover only the
1879
+ * VAD-missed leading edge of the user's speech (default 250 ms,
1880
+ * matching SileroVAD ``minSpeechDuration``). Earlier values up to
1881
+ * 600 ms included ~350 ms of pre-speech silence/agent-bleed in
1882
+ * the replay; on PSTN (where AEC is a no-op) Deepgram trained on
1883
+ * English happily transcribes that bleed as English garbage
1884
+ * (``"The same as Edgar,"``, ``"Permadees."``) and commits it to
1885
+ * the LLM as a phantom user transcript. See BUGS.md 2026-05-05
1886
+ * post-barge-in bleed-transcription entry.
1887
+ */
1888
+ private inboundAudioRing;
1889
+ private static readonly INBOUND_AUDIO_RING_FRAMES;
1890
+ /**
1891
+ * Cached LLM provider tag used by speech-event payloads. Mirrors the
1892
+ * value passed to the metrics accumulator at construction time so the
1893
+ * speech-edge events report the same provider classification as
1894
+ * dashboard / pricing rows.
1895
+ */
1896
+ private llmProviderTag;
1897
+ /**
1898
+ * Auto-loaded SileroVAD when ``agent.vad`` is undefined. Populated by
1899
+ * ``initPipeline`` and queried alongside ``agent.vad`` on every audio frame.
1900
+ * Stays null when ``onnxruntime-node`` is not installed — the pipeline
1901
+ * then falls back to the STT-endpoint heuristic (legacy behaviour).
1902
+ */
1903
+ private autoVad;
1904
+ /**
1905
+ * Acoustic echo canceller (NLMS adaptive filter). Lazily instantiated in
1906
+ * ``initPipeline`` when ``agent.echoCancellation`` is true. ``null``
1907
+ * otherwise — the mic path stays a pure pass-through for handset /
1908
+ * headset deployments that don't have TTS bleed.
1909
+ */
1910
+ private aec;
1911
+ /**
1912
+ * Carrier-neutral local call recorder (stereo WAV; left=caller,
1913
+ * right=agent). Created in ``handleCallStart`` via
1914
+ * ``deps.makeLocalRecorder`` when ``serve({ localRecording })`` is on;
1915
+ * ``null`` keeps every tap a no-op. Finalized (header patched, file
1916
+ * closed) in ``fireCallEnd`` — both ``handleStop`` and ``handleWsClose``
1917
+ * funnel there, so abnormal teardown still yields a parseable file.
1918
+ * Parity with Python ``StreamHandler.local_recorder``.
1919
+ */
1920
+ private localRecorder;
1921
+ /**
1922
+ * Monotonic counter incremented on every TTS-start. The grace timer
1923
+ * scheduled by ``endSpeakingWithGrace`` only flips ``isSpeaking=false``
1924
+ * if the counter still matches its capture — a new turn that started in
1925
+ * the meantime invalidates the obsolete timer instead of clobbering its
1926
+ * own ``isSpeaking=true``.
1927
+ */
1928
+ private speakingGeneration;
1929
+ /**
1930
+ * Wall-clock timestamp (ms since epoch) when the current TTS turn
1931
+ * started — captured by ``beginSpeaking`` and cleared by
1932
+ * ``cancelSpeaking`` / the grace flip. Used to gate barge-in: we
1933
+ * suppress the cancel for the first
1934
+ * ``MIN_AGENT_SPEAKING_MS_BEFORE_BARGE_IN_AEC`` of every turn (when AEC
1935
+ * is on) so the AEC filter has time to converge — otherwise residual
1936
+ * TTS bleed in the mic stream looks like user speech to VAD and
1937
+ * triggers an immediate self-cancellation of the agent's first
1938
+ * sentence.
1939
+ */
1940
+ private speakingStartedAt;
1941
+ /**
1942
+ * Wall-clock (ms) when the FIRST TTS audio chunk actually reached the
1943
+ * carrier wire — set in ``markFirstAudioSent`` after ``bridge.sendAudio``
1944
+ * succeeds, cleared by ``beginSpeaking`` / ``cancelSpeaking``. The barge-in
1945
+ * gate measures elapsed from this instant, NOT from ``speakingStartedAt``,
1946
+ * because ElevenLabs (and other cloud TTS) take 200-700 ms to emit the
1947
+ * first byte. A gate anchored to ``beginSpeaking`` would expire on
1948
+ * background noise before any audio went out, exit the TTS loop on
1949
+ * ``isSpeaking=false``, and silently cut the agent's first turn.
1950
+ */
1951
+ private firstAudioSentAt;
1952
+ /**
1953
+ * Estimated wall-clock (ms) when the LAST audio byte pushed to the carrier
1954
+ * finishes PLAYING on the phone. The pipeline pushes TTS audio as fast as
1955
+ * the provider synthesizes it (no pacing) and the carrier buffers + plays
1956
+ * at realtime, so "we finished pushing" and "the caller finished hearing"
1957
+ * can diverge by tens of seconds — especially with agent-runtime LLMs
1958
+ * (Hermes/OpenClaw) that deliver a long reply all at once after a thinking
1959
+ * pause. ``endSpeakingWithGrace`` holds ``isSpeaking=true`` (with
1960
+ * ``tailGraceActive=false``) until this cursor passes, so a barge-in during
1961
+ * the audible backlog still takes the cancel path (``sendClear`` drops the
1962
+ * carrier buffer) instead of being treated as a calm next turn. Advanced by
1963
+ * ``trackOutboundPlayback``; reset by ``cancelSpeaking`` (the buffer is
1964
+ * cleared) and ``endTailGraceForNewTurn``.
1965
+ */
1966
+ private playbackBufferedUntil;
1967
+ /**
1968
+ * Per-turn playback timeline used to estimate the response prefix the
1969
+ * caller actually HEARD when a barge-in lands. ``turnPlaybackTotalMs``
1970
+ * accumulates the playout duration of every chunk pushed this turn
1971
+ * (including filler audio, which keeps the timeline aligned);
1972
+ * ``turnSpokenSegments`` records ``{text, startMs}`` for each RESPONSE
1973
+ * sentence at its first audible chunk (filler / error-fallback audio
1974
+ * advances the clock but adds no segment). ``heard = total - backlog``
1975
+ * then maps to a sentence-granular prefix — see ``heardResponsePrefix``.
1976
+ * Both reset at ``beginSpeaking``. Mirrors Python
1977
+ * ``_turn_playback_total_s`` / ``_turn_spoken_segments``.
1978
+ */
1979
+ private turnPlaybackTotalMs;
1980
+ private turnSpokenSegments;
1981
+ /**
1982
+ * Optional barge-in confirmation strategies. With an empty array the
1983
+ * SDK falls back to the legacy "cancel on first VAD speech_start"
1984
+ * behaviour. With one or more strategies, a VAD speech_start during
1985
+ * TTS marks the barge-in as *pending* — TTS keeps streaming naturally
1986
+ * — and the strategies are consulted on every STT transcript via
1987
+ * ``handleBargeIn``. The first strategy that returns ``true`` cancels
1988
+ * the agent; if none confirm within ``bargeInConfirmMs`` the pending
1989
+ * state is dropped and the agent finishes its sentence.
1990
+ */
1991
+ private readonly bargeInStrategies;
1992
+ /** Pending-barge-in confirmation timeout in milliseconds. */
1993
+ private readonly bargeInConfirmMs;
1994
+ /** Wall-clock (ms) when the current pending barge-in started, or
1995
+ * ``null`` if no barge-in is pending. */
1996
+ private bargeInPendingSince;
1997
+ /** Timer that fires the pending-barge-in timeout. In
1998
+ * ``bargeInMode: 'pause_resume'`` this same handle holds the
1999
+ * false-interruption resume timer. */
2000
+ private bargeInPendingTimer;
2001
+ /**
2002
+ * Pause-and-resume false-interruption handling (opt-in
2003
+ * ``agent.bargeInMode: 'pause_resume'``; default ``'cancel'`` keeps
2004
+ * today's behaviour byte-identical): PAUSE output on
2005
+ * VAD speech_start (carrier cleared, sends gated on ``outputPaused``),
2006
+ * KILL on a committed final transcript within ``bargeInConfirmMs``,
2007
+ * RESUME from the first not-fully-heard sentence otherwise. Mirrors
2008
+ * Python ``_barge_in_mode`` / ``_output_paused``.
2009
+ */
2010
+ private readonly bargeInMode;
2011
+ /** True while output is paused: ``synthesizeSentence`` queues chunks
2012
+ * into per-sentence retention entries instead of sending, and the LLM
2013
+ * loops buffer whole sentences as text. */
2014
+ private outputPaused;
2015
+ /** Per-pause decision latch — resolved when the pause resolves
2016
+ * (resume, kill, or teardown) so loop-side waiters can proceed. */
2017
+ private pauseDecision;
2018
+ /** Sentences produced by the LLM while paused (text, pre-guardrail).
2019
+ * Spoken in order on resume; discarded on kill. Bounded by
2020
+ * ``PAUSE_MAX_BUFFERED_SENTENCES`` — overflow degrades to a full
2021
+ * cancel so memory stays bounded against a runaway stream. */
2022
+ private pausedSentences;
2023
+ /**
2024
+ * Per-turn retained sentence audio (pause_resume mode only): one entry
2025
+ * per response sentence holding every TTS chunk produced for it.
2026
+ * ``sent`` counts chunks actually delivered to the carrier — the
2027
+ * resume path resets it to 0 for the unheard tail and re-sends from
2028
+ * memory (no TTS re-billing). Index-aligned with
2029
+ * ``turnSpokenSegments`` for the stamped prefix. Bounded by
2030
+ * ``PAUSE_RESUME_MAX_RETAINED_S``.
2031
+ */
2032
+ private turnSentenceAudio;
2033
+ private pauseRetainedBytes;
2034
+ /** Set when the retained-audio cap was exceeded while NOT paused (very
2035
+ * long carrier backlog): retention is released and pause_resume falls
2036
+ * back to legacy cancel for the rest of the turn. Reset at
2037
+ * ``beginSpeaking``. */
2038
+ private pauseResumeOverflowed;
2039
+ /** Sentence index (into ``turnSpokenSegments`` / ``turnSentenceAudio``)
2040
+ * of the first sentence the caller had NOT fully heard at pause time —
2041
+ * the resume offset. Sentence granularity: the partially-played
2042
+ * sentence is replayed from its start (natural-sounding repair) rather
2043
+ * than resumed mid-word. */
2044
+ private pauseResumeIndex;
2045
+ /** False until the turn body finishes pushing audio (the
2046
+ * ``endSpeakingWithGrace`` call in its finally). The resume path uses
2047
+ * it to decide whether the #164 grace machinery must be re-armed for
2048
+ * the re-sent tail (post-complete pause) or whether the still-running
2049
+ * turn body will arm it itself. */
2050
+ private turnOutputDone;
2051
+ /** Cap on sentences buffered as text while output is paused. A pause
2052
+ * lasts at most ``bargeInConfirmMs`` (1.5 s default) so this is
2053
+ * generous; overflow degrades to a full cancel. Mirrors Python
2054
+ * ``_PAUSE_MAX_BUFFERED_SENTENCES``. */
2055
+ private static readonly PAUSE_MAX_BUFFERED_SENTENCES;
2056
+ /** Cap (seconds of playout) on retained per-sentence TTS audio — both
2057
+ * the already-sent tail kept for re-send and chunks queued while
2058
+ * paused. 15 s ≈ 480 KB of PCM16 @ 16 kHz per concurrent call.
2059
+ * Overflow while paused → degrade to full cancel; overflow while
2060
+ * speaking → release retention and fall back to legacy cancel for the
2061
+ * rest of the turn. Mirrors Python ``_PAUSE_RESUME_MAX_RETAINED_S``. */
2062
+ private static readonly PAUSE_RESUME_MAX_RETAINED_S;
2063
+ /**
2064
+ * Set to true when a VAD ``speech_start`` was suppressed by the
2065
+ * anti-echo gate during the current agent turn. Cleared on
2066
+ * ``beginSpeaking`` and ``cancelSpeaking``. When the turn ends
2067
+ * naturally (grace timer), the inbound audio ring is flushed to STT
2068
+ * so the user's speech is not silently discarded.
2069
+ */
2070
+ private suppressedSpeechPending;
2071
+ /** Rolling window byte budget: the last 8 s of PCM16 @ 16 kHz. */
2072
+ private static readonly SEMANTIC_WINDOW_MAX_BYTES;
2073
+ /** Re-score cadence while holding: one prediction per this much silence. */
2074
+ private static readonly SEMANTIC_POLL_MS;
2075
+ /** Rolling buffer of post-decode PCM16-16k frames (bounded to 8 s). */
2076
+ private semanticAudioRing;
2077
+ private semanticAudioRingBytes;
2078
+ /** True while a sub-threshold prediction is holding the finalize open. */
2079
+ private semanticHoldActive;
2080
+ /** Wall-clock (ms) deadline for the hard cap, null when idle. */
2081
+ private semanticHoldDeadlineMs;
2082
+ /** Invalidates the backstop timer once its hold has been resolved. */
2083
+ private semanticHoldGeneration;
2084
+ /** Wall-clock backstop — finalizes at the cap even if audio stalls. */
2085
+ private semanticHoldTimer;
2086
+ /** Bytes accumulated since the last prediction while holding. */
2087
+ private semanticPollPendingBytes;
2088
+ /**
2089
+ * Set on the FIRST detector failure: semantic endpointing is then
2090
+ * disabled for the remainder of the call (one clear warning, plain
2091
+ * VAD-silence behavior) instead of warning per turn against a
2092
+ * permanently broken model. Mirrors Python ``_semantic_detector_failed``
2093
+ * and the existing ``vadDisabled`` fail-once pattern.
2094
+ */
2095
+ private turnDetectorFailed;
2096
+ /**
2097
+ * EOU trigger for the NEXT committed turn. Stamped by the semantic
2098
+ * finalize paths, consumed (and reset) on transcript commit. Parity
2099
+ * with Python ``_last_eou_trigger``.
2100
+ */
2101
+ private lastEouTrigger;
2102
+ /** Hard cap (ms) a semantic hold may defer the finalize. */
2103
+ private readonly maxSemanticHoldMs;
2104
+ /**
2105
+ * Minimum wall-clock duration (ms) the agent must have been speaking
2106
+ * before barge-in is allowed to fire when AEC is active. Covers the
2107
+ * AEC warmup window (~500 ms) plus a safety margin so residual bleed
2108
+ * during the convergence period does not self-trigger barge-in.
2109
+ */
2110
+ private static readonly MIN_AGENT_SPEAKING_MS_BEFORE_BARGE_IN_AEC;
2111
+ /**
2112
+ * Same as the AEC variant but for deployments where AEC is OFF
2113
+ * (default on PSTN — Twilio/Telnyx). Without an adaptive filter to
2114
+ * converge, the only justification for a gate is anti-flicker on
2115
+ * micro-events (cough, click). Raised 100 → 500 ms on 2026-05-19
2116
+ * after the 0.6.2 acceptance run showed a phantom VAD speech_start
2117
+ * firing on the very first inbound frame (~500 ms into the call,
2118
+ * which is past a 100 ms gate). The phantom barge-in cancelled the
2119
+ * prewarmed firstMessage, the user heard a clipped, scratchy
2120
+ * audio fragment, and the SDK left ``_turnAlreadyClosed=true`` so
2121
+ * subsequent ``recordTurnComplete`` calls were no-ops. 500 ms
2122
+ * filters those phantoms while still letting a real interruption
2123
+ * land within half a second of agent onset.
2124
+ */
2125
+ private static readonly MIN_AGENT_SPEAKING_MS_BEFORE_BARGE_IN_NO_AEC;
2126
+ /** Handle for the pending grace-period timer, so it can be cleared on cleanup. */
2127
+ private graceTimer;
2128
+ /**
2129
+ * AbortController for the current LLM streaming consumption. Aborted by
2130
+ * ``cancelSpeaking`` so the in-flight LLM stream stops generating tokens
2131
+ * we will never speak — saves provider cost and frees the connection
2132
+ * earlier. Mirrors Python ``_llm_cancel_event``.
2133
+ */
2134
+ private llmAbort;
2135
+ /**
2136
+ * Wall-clock timestamp of the most recent ``cancelSpeaking`` call, or
2137
+ * ``null`` if no cancel has fired since the call started. Used by
2138
+ * ``beginSpeaking`` to enforce a short post-cancel drain window so the
2139
+ * remote PSTN player finishes flushing the previous turn's in-flight
2140
+ * audio before the next TTS chunk lands on top of it. Without this,
2141
+ * the first sentence of a post-barge-in turn audibly overlaps with
2142
+ * the tail of the cancelled turn (~50-200 ms of doubled audio).
2143
+ */
2144
+ private lastCancelAt;
2145
+ /**
2146
+ * Promise queue tracking outstanding Twilio marks the SDK has sent but
2147
+ * not yet seen echoed back. Used by the firstMessage send loop to bound
2148
+ * the depth of audio queued at the carrier — without this the loop
2149
+ * pushes the entire TTS stream into Twilio's WebSocket in one burst,
2150
+ * and a sendClear issued mid-buffer races against several seconds of
2151
+ * already-queued media frames (BUG #128). The window depth is
2152
+ * ``FIRST_MESSAGE_MARK_WINDOW``; ``onMark`` drains entries as Twilio
2153
+ * confirms playback, ``cancelSpeaking`` resolves every pending entry so
2154
+ * any awaiter exits immediately. Telnyx never populates this queue
2155
+ * (Telnyx's media-stream protocol has no mark concept — the loop
2156
+ * falls back to time-based pacing on that carrier).
2157
+ */
2158
+ private pendingMarks;
2159
+ /**
2160
+ * Monotonic counter for first-message mark names. Distinct from
2161
+ * ``chunkCount`` (which the Realtime path uses) so the two paths can
2162
+ * coexist without name collisions even when firstMessage finishes while
2163
+ * a Realtime turn is still streaming.
2164
+ */
2165
+ /**
2166
+ * Minimum drain window (ms) between a ``cancelSpeaking`` and the next
2167
+ * ``beginSpeaking``. 150 ms covers a typical PSTN jitter buffer drain
2168
+ * + Twilio Media Stream clear propagation. Lower values risk audio
2169
+ * overlap on the first chunk; higher values increase the perceived
2170
+ * "agent ack" latency after a barge-in. 150 ms is the smallest value
2171
+ * that consistently eliminated the overlap during 0.6.0 acceptance.
2172
+ */
2173
+ private static readonly POST_CANCEL_DRAIN_MS;
2174
+ /**
2175
+ * Mark the start of a TTS span. Use instead of setting isSpeaking
2176
+ * directly. Awaits the post-cancel drain window before flipping state
2177
+ * so the remote player has time to flush the cancelled turn's tail.
2178
+ */
2179
+ private beginSpeaking;
2180
+ /**
2181
+ * Record that the first TTS audio chunk of the current turn has hit the
2182
+ * carrier wire. Idempotent within a turn — only the first call sets the
2183
+ * timestamp; later chunks are no-ops. Must be invoked AFTER the underlying
2184
+ * ``bridge.sendAudio`` resolves so the gate is anchored to "audio actually
2185
+ * went out", not "we asked the carrier to send it".
2186
+ */
2187
+ private markFirstAudioSent;
2188
+ /**
2189
+ * Advance ``playbackBufferedUntil`` by the playout duration of an outbound
2190
+ * TTS chunk. ``numBytes`` is the size of the chunk BEFORE carrier encoding
2191
+ * (the same buffer handed to ``encodePipelineAudio``): PCM16 @ 16 kHz in
2192
+ * the default path (32 bytes/ms), or the carrier's native μ-law @ 8 kHz
2193
+ * (8 bytes/ms) when the TTS adapter emits wire format directly
2194
+ * (``ttsOutputFormatNativeForCarrier`` — Twilio/Plivo ``ulaw_8000``;
2195
+ * Telnyx native is ``pcm_16000`` so it stays at 32 bytes/ms).
2196
+ */
2197
+ private trackOutboundPlayback;
2198
+ /**
2199
+ * Estimate the response prefix the caller actually HEARD this turn.
2200
+ *
2201
+ * The pipeline pushes audio faster than realtime, so at barge-in time
2202
+ * ``heard = totalPushed - carrierBacklog`` ms of audio have actually
2203
+ * played. Mapped at sentence granularity against ``turnSpokenSegments``:
2204
+ * a sentence counts as heard once its playback has STARTED
2205
+ * (``startMs <= heardMs``), so the sentence playing at the moment of
2206
+ * interruption is included.
2207
+ *
2208
+ * Returns ``null`` when no segments were tracked this turn (nothing
2209
+ * synthesized through the tracked path — callers fall back to the legacy
2210
+ * full-text behaviour). Mirrors Python ``_heard_response_prefix``.
2211
+ */
2212
+ private heardResponsePrefix;
2213
+ /**
2214
+ * Replace the text of the most recent assistant entry in the conversation
2215
+ * history. No-op when the last entry is not an assistant turn (e.g. the
2216
+ * caller's next turn was already committed).
2217
+ */
2218
+ private rewriteLastAssistantEntry;
2219
+ /**
2220
+ * Heard-prefix semantics for a barge-in that lands AFTER
2221
+ * the turn completed, while the carrier is still playing the buffered
2222
+ * tail.
2223
+ *
2224
+ * The completed turn already recorded its FULL reply in history, but the
2225
+ * caller only heard part of it before interrupting — a stateful agent
2226
+ * runtime (Hermes / OpenClaw) would otherwise "remember saying" things
2227
+ * the caller never heard. Rewrites the last assistant entry to the heard
2228
+ * prefix + ``[interrupted by caller]``.
2229
+ *
2230
+ * MUST run BEFORE ``cancelSpeaking`` resets ``playbackBufferedUntil``
2231
+ * (the backlog is the heard-prefix input). No-op when a turn is still in
2232
+ * flight (the streaming path applies its own marker), when there is no
2233
+ * backlog, or when everything was already heard. Mirrors Python
2234
+ * ``_maybe_truncate_completed_turn_history``.
2235
+ */
2236
+ private maybeTruncateCompletedTurnHistory;
2237
+ /**
2238
+ * Atomically end speaking AND invalidate any pending grace timer.
2239
+ * Use instead of ``this.isSpeaking = false`` at barge-in sites.
2240
+ *
2241
+ * Also aborts the in-flight LLM stream (if any) so the provider stops
2242
+ * billing tokens we will never speak.
2243
+ */
2244
+ private cancelSpeaking;
2245
+ /**
2246
+ * Resolve every entry in ``pendingMarks`` and empty the queue. Idempotent
2247
+ * — safe to call from ``cancelSpeaking`` and again from the grace path
2248
+ * without leaking pending promises.
2249
+ */
2250
+ private drainPendingMarks;
2251
+ /**
2252
+ * Bytes-per-millisecond for a 16 kHz PCM16 mono stream. Used by
2253
+ * ``sendPacedFirstMessageBytes`` to translate chunk size into a
2254
+ * playout-duration sleep so we never deliver faster than the carrier
2255
+ * can decode + play out (which manifested as severe crackling on the
2256
+ * HTTP-TTS path with client-side resampling). 16000 samples/sec × 2
2257
+ * bytes/sample = 32 bytes/ms.
2258
+ */
2259
+ private static readonly PCM16_16K_BYTES_PER_MS;
2260
+ /** Cancel and clear the pending grace timer, if any. */
2261
+ private clearGraceTimer;
2262
+ /**
2263
+ * Mark the agent as no longer producing TTS, honoring a grace period that
2264
+ * approximates the carrier's playback buffer. The user may still hear the
2265
+ * agent for ~1 s after we finish pushing audio (Twilio buffers ~1500 ms);
2266
+ * keeping isSpeaking=true through that window keeps the VAD-driven
2267
+ * barge-in armed during the audible tail. Tunable via env.
2268
+ */
2269
+ private endSpeakingWithGrace;
2270
+ /**
2271
+ * End the post-TTS tail-grace window because the user has begun their next
2272
+ * turn. Unlike a barge-in, the agent's response already played out in full
2273
+ * — there is nothing to cancel and no turn was interrupted. We flip the
2274
+ * speaking flag off (bumping ``speakingGeneration`` so the scheduled grace
2275
+ * timer no-ops), recover any leading audio the self-hearing guard captured
2276
+ * into the ring (the user's first ~250 ms, which VAD needed before it could
2277
+ * emit ``speech_start``), and let the live STT stream take over. We do NOT
2278
+ * call ``sendClear``, ``recordBargeinDetected`` or ``recordTurnInterrupted``
2279
+ * — none apply to a turn that completed normally.
2280
+ *
2281
+ * Without this, fast next-turn speech (humans reply in 200-700 ms, well
2282
+ * inside the 1500 ms default grace) is withheld from STT and recorded as an
2283
+ * empty ``[interrupted]`` turn, after which the agent goes silent for the
2284
+ * rest of the call. Parity with Python ``_end_tail_grace_for_new_turn``.
2285
+ */
2286
+ private endTailGraceForNewTurn;
2287
+ private resetBargeInStrategies;
2288
+ /**
2289
+ * Reset the active VAD provider's per-utterance state. No-op when the
2290
+ * provider does not implement the optional ``reset()`` hook. Safe to call
2291
+ * from any context — failures are swallowed and the VAD is disabled for
2292
+ * the rest of the call so a flaky reset can never silently kill barge-in
2293
+ * for every subsequent turn.
2294
+ */
2295
+ private resetVad;
2296
+ /**
2297
+ * Whether barge-in is allowed to fire right now. Gate length depends
2298
+ * on whether AEC is active: 1 s with AEC (covers filter warmup),
2299
+ * 500 ms without (anti-flicker only — keeps PSTN barge-in responsive).
2300
+ */
2301
+ private canBargeIn;
2302
+ /**
2303
+ * Replay the audio captured by the self-hearing guard right before a
2304
+ * confirmed barge-in. VAD's ``minSpeechDuration`` window (default
2305
+ * 250 ms) means ``speech_start`` fires only AFTER the user has been
2306
+ * talking for that long; without this replay STT sees only the tail
2307
+ * of the user's interruption and produces "the line is breaking up"
2308
+ * partial transcripts. We deliberately do NOT call this on natural
2309
+ * turn end — see the comment in ``endSpeakingWithGrace`` for why.
2310
+ */
2311
+ private flushInboundAudioRing;
2312
+ /**
2313
+ * Per-call resolved tool list. Starts as ``null`` (falls back to
2314
+ * ``deps.agent.tools``). Populated by ``initMcpTools`` when MCP servers
2315
+ * are configured so discovered tools are merged in without mutating the
2316
+ * shared ``AgentOptions`` object. Code that needs the effective tool list
2317
+ * should read ``this.resolvedTools ?? this.deps.agent.tools``.
2318
+ */
2319
+ private resolvedTools;
2320
+ /**
2321
+ * Per-call effective agent configuration. Starts as ``deps.agent`` and is
2322
+ * REPLACED (never mutated — ``AgentOptions`` is shared and readonly) by a
2323
+ * multi-agent ``handoff_to`` so the rest of the call runs with the target
2324
+ * agent's LLM-visible config (system prompt, tools, variables, guardrails,
2325
+ * text transforms, onward handoffs). Parity with the Python handler's
2326
+ * ``self.agent`` swap.
2327
+ */
2328
+ private currentAgent;
2329
+ private llmLoop;
2330
+ /**
2331
+ * Per-call tool executor — provides retry-with-exponential-backoff and a
2332
+ * per-tool circuit breaker for Realtime function calls. Pipeline mode
2333
+ * uses its own executor inside ``LLMLoop``; this one is dedicated to
2334
+ * the Realtime path so a flaky downstream (DB outage, vendor rate
2335
+ * limit) returns a structured ``{ error, fallback: true }`` instead of
2336
+ * hanging the model on retries that will keep failing.
2337
+ */
2338
+ private readonly toolExecutor;
2339
+ /**
2340
+ * MCP server connection manager — populated lazily in
2341
+ * ``initMcpTools()`` when the agent declares ``mcpServers``. Holds
2342
+ * the open MCP client connections for the lifetime of the call so
2343
+ * we can dispatch ``tools/call`` without re-handshaking on every
2344
+ * function invocation. Cleared in ``fireCallEnd``.
2345
+ */
2346
+ private mcpManager;
2347
+ private chunkCount;
2348
+ private callEndFired;
2349
+ private sttClosed;
2350
+ private currentAgentText;
2351
+ private responseAudioStarted;
2352
+ /**
2353
+ * Realtime turn ordering buffer. OpenAI Realtime emits
2354
+ * `input_audio_transcription.completed` (user transcript) AFTER
2355
+ * `response.done` (assistant complete) because Whisper transcription
2356
+ * runs in parallel with — and slower than — model response. Without
2357
+ * this buffer the pushed `history` order is [assistant, user, ...]
2358
+ * which renders out-of-order in the dashboard.
2359
+ *
2360
+ * Behaviour:
2361
+ * - `onAdapterSpeechStopped` flips `userTranscriptPending = true`
2362
+ * - `onAdapterResponseDone` checks the flag; if set, stashes the
2363
+ * assistant text + a fallback timer
2364
+ * - `onAdapterTranscriptInput` clears the flag, pushes user, then
2365
+ * flushes any pending assistant turn
2366
+ * - The fallback timer flushes the assistant alone if the user
2367
+ * transcript never arrives (silence misclassified as speech, etc.)
2368
+ */
2369
+ private userTranscriptPending;
2370
+ private pendingAssistantTurn;
2371
+ private pendingAssistantTimer;
2372
+ /**
2373
+ * Reserved monotonic turn index for the in-flight Realtime turn (issue
2374
+ * #154, fix 5/6). Reserved in ``onAdapterSpeechStopped`` via
2375
+ * ``metricsAcc.reserveTurnIndex()`` the moment the turn OPENS, then threaded
2376
+ * through to the live per-line transcript events (``recordTranscriptLine``)
2377
+ * and into ``recordTurnComplete`` / ``recordTurnInterrupted`` so the
2378
+ * dashboard can sort a late-arriving user line ABOVE its agent line by
2379
+ * ``(turnIndex, role)``. ``null`` until the first turn opens. Parity with
2380
+ * Python ``_current_turn_index``.
2381
+ */
2382
+ private currentTurnIndex;
2383
+ /**
2384
+ * Hard cap on how long we wait for the user transcript before flushing
2385
+ * the buffered assistant turn alone. 3 s covers OpenAI Whisper's typical
2386
+ * 200-800 ms post-response delay with substantial headroom for slow
2387
+ * cellular audio uploads. Beyond this we accept the order will look
2388
+ * "assistant-only" rather than block the call's transcript display.
2389
+ */
2390
+ private static readonly REALTIME_USER_TRANSCRIPT_WAIT_MS;
2391
+ private maxDurationTimer;
2392
+ private transcriptProcessing;
2393
+ private transcriptQueue;
2394
+ /**
2395
+ * The in-flight turn dispatch (LLM + TTS) runs as a SINGLE tracked promise
2396
+ * so the transcript drain loop keeps running ``handleBargeIn`` against the
2397
+ * LIVE turn during a long (30-90 s) agent-runtime response, instead of
2398
+ * head-of-line-blocking on it. Exactly one is in flight: the launcher awaits
2399
+ * the previous one to settle (fast — a barge-in already aborted it) before
2400
+ * starting the next, preserving history/metrics ordering. Parity with
2401
+ * Python ``_dispatch_task``.
2402
+ */
2403
+ private dispatchTask;
2404
+ /** Background greeting playback (see playFirstMessage). */
2405
+ private firstMessageTask;
2406
+ /**
2407
+ * Cap (ms) on how long teardown waits for the backgrounded dispatch to
2408
+ * settle. JS promises are not cancellable, so a user-supplied ``onMessage``
2409
+ * (which receives no AbortSignal) parked on a hung external call could block
2410
+ * call cleanup indefinitely — `llmAbort.abort()` only unblocks the built-in
2411
+ * LLM/TTS paths. We bound the WAIT (Python hard-cancels the task instead).
2412
+ * 30 s matches the webhook ceiling.
2413
+ */
2414
+ private static readonly DISPATCH_SETTLE_TIMEOUT_MS;
2415
+ /**
2416
+ * Opt-in (default OFF): forward inbound audio to STT even while the agent is
2417
+ * speaking, so the transcript barge-in path can receive a transcript on
2418
+ * echo-masked PSTN links where the VAD never fires. ECHO RISK without AEC.
2419
+ * Parity with Python ``_forward_stt_while_speaking``.
2420
+ */
2421
+ private readonly forwardSttWhileSpeaking;
2422
+ private lastCommitText;
2423
+ private lastCommitAt;
2424
+ private readonly preemptiveEnabled;
2425
+ private readonly preemptiveMinStableMs;
2426
+ /** The single in-flight speculation (at most one). ``null`` when idle,
2427
+ * when discarded, or once released (a released speculation becomes the
2428
+ * live turn tracked by ``dispatchTask`` instead). */
2429
+ private speculation;
2430
+ private interimNorm;
2431
+ private interimText;
2432
+ private interimStableTimer;
2433
+ /** Hard cap (ms of playout) on TTS audio buffered by a speculative turn.
2434
+ * Overflow aborts the speculation. Parity with Python
2435
+ * ``_PREEMPTIVE_MAX_BUFFER_S``. */
2436
+ private static readonly PREEMPTIVE_MAX_BUFFER_MS;
2437
+ /** The agent's spoken text for the CURRENT turn, accumulated as tokens stream.
2438
+ * The echo guard rejects transcripts matching it (the agent's own TTS bleeding
2439
+ * back into STT when audio is forwarded during TTS without effective AEC).
2440
+ * Reset in ``beginSpeaking``; only consulted while ``forwardSttWhileSpeaking``.
2441
+ * Parity with Python ``_current_agent_spoken_text``. */
2442
+ private currentAgentSpokenText;
2443
+ private ttsByteCarry;
2444
+ private readonly inboundResampler;
2445
+ private readonly outboundResampler;
2446
+ /**
2447
+ * Inbound audio processing chain: decode (mulaw→PCM16) → stateful 8k→16k
2448
+ * resample → AEC near-end → ``agent.audioFilter`` → VAD (slice 1 of the
2449
+ * pipeline-stages decomposition — docs/architecture/pipeline-stages.md).
2450
+ * Shares ``inboundResampler`` so ``flushResamplers`` keeps draining the
2451
+ * tail on call close; AEC / filter / VAD are late-bound getters because
2452
+ * ``initPipeline`` (and the unit suites) install ``aec`` / ``autoVad``
2453
+ * after construction. Owns the per-call VAD error kill switch that
2454
+ * previously lived here as ``vadDisabled``.
2455
+ */
2456
+ private readonly inputChain;
2457
+ private readonly history;
2458
+ private readonly metricsAcc;
2459
+ private readonly _eventBus;
2460
+ constructor(deps: StreamHandlerDeps, ws: WebSocket$1, caller: string, callee: string);
2461
+ /**
2462
+ * Record a completed turn in the dashboard store and fire the user-supplied
2463
+ * ``onMetrics`` callback. Centralises the 4 emit sites (firstMessage, pipeline
2464
+ * streaming/regular LLM, WebSocket remote, Realtime response_done) so the
2465
+ * payload shape lives in one place.
2466
+ */
2467
+ /**
2468
+ * Emit a live per-line transcript event to the dashboard store (issue #154,
2469
+ * fix 5). Routed through a single helper so the call shape lives in one
2470
+ * place. ``recordTranscriptLine`` appends the line to the active call's
2471
+ * transcript and publishes a ``transcript_line`` SSE event; the dashboard
2472
+ * sorts by (turnIndex, user<assistant) so a late user line lands above its
2473
+ * agent line. No-op when no turn index has been reserved yet.
2474
+ */
2475
+ private emitTranscriptLine;
2476
+ private emitTurnMetrics;
2477
+ /** Reset the TTS odd-byte carry — call at every TTS stream entry/exit. */
2478
+ private resetTtsCarry;
2479
+ /**
2480
+ * Flush both stateful resamplers and any TTS byte carry on call close.
2481
+ * Emits tail bytes through the telephony bridge so the last ~20 ms of audio
2482
+ * is not silently clipped on hangup. No-op if the WebSocket is already gone.
2483
+ */
2484
+ private flushResamplers;
2485
+ /**
2486
+ * Start call recording when configured. Bridges expose
2487
+ * ``startRecording`` for carrier parity (Twilio and Telnyx supported).
2488
+ */
2489
+ private startRecordingIfRequested;
2490
+ /**
2491
+ * Subscribe to a Patter event on the per-call EventBus.
2492
+ *
2493
+ * The most common use-case is 'metrics_collected' — fired after every
2494
+ * completed turn with the TurnMetrics payload.
2495
+ *
2496
+ * Returns an unsubscribe function; call it to stop receiving events.
2497
+ *
2498
+ * @example
2499
+ * const off = handler.addObserver((payload) => {
2500
+ * console.log('turn metrics:', payload);
2501
+ * });
2502
+ * // later:
2503
+ * off();
2504
+ */
2505
+ addObserver<T = unknown>(cb: (payload: T) => void | Promise<void>, event?: PatterEventType): () => void;
2506
+ /**
2507
+ * Handle the call-start event.
2508
+ *
2509
+ * @param callId Call SID (Twilio) or call_control_id (Telnyx)
2510
+ * @param customParams TwiML custom parameters (Twilio only, empty for Telnyx)
2511
+ */
2512
+ /** Initialize per-call state, build the AI adapter, and dispatch the `onCallStart` callback. */
2513
+ handleCallStart(callId: string, customParams?: Record<string, string>): Promise<void>;
2514
+ /**
2515
+ * Connect to every configured MCP server, discover their tools via
2516
+ * ``tools/list``, and merge them into ``agent.tools`` before the
2517
+ * adapter is built. The synthetic handlers dispatch back through the
2518
+ * MCP client so ``DefaultToolExecutor`` can invoke them like any
2519
+ * other handler-tool. No-op when ``agent.mcpServers`` is empty or the
2520
+ * optional ``@modelcontextprotocol/sdk`` is not installed.
2521
+ */
2522
+ private initMcpTools;
2523
+ /**
2524
+ * Merge the built-in ``consult`` tool into the per-call tool list when
2525
+ * ``agent.consult`` is set, mirroring {@link initMcpTools}: the shared
2526
+ * ``deps.agent`` is NOT mutated; the merged list is stored on
2527
+ * ``this.resolvedTools`` so ``buildAIAdapter`` (Realtime) and the pipeline
2528
+ * ``LLMLoop`` both see it. Idempotent — a no-op if a tool with the same name
2529
+ * is already present.
2530
+ */
2531
+ private injectConsultTool;
2532
+ /** Set the stream SID (Twilio only, called after parsing 'start' event). */
2533
+ /** Set the carrier-side stream id (Twilio `streamSid` / Telnyx stream identifier). */
2534
+ setStreamSid(sid: string): void;
2535
+ /**
2536
+ * Record a terminal/processing error as a coarse, anonymous code on the call
2537
+ * metrics (code only, never the message). Surfaced via `call_completed`
2538
+ * telemetry. Safe to call with any value; last write wins.
2539
+ */
2540
+ recordError(err: unknown): void;
2541
+ /** Handle an incoming audio chunk (already decoded from base64). */
2542
+ /** Forward inbound audio bytes to the AI adapter and (in pipeline mode) the STT provider. */
2543
+ handleAudio(audioBuffer: Buffer): Promise<void>;
2544
+ /** Handle a DTMF keypress event (Twilio only). */
2545
+ /** Handle an inbound DTMF tone from the caller. */
2546
+ handleDtmf(digit: string): Promise<void>;
2547
+ /**
2548
+ * Last mark name Twilio has confirmed playback of. Mirrors the Python
2549
+ * ``TwilioAudioSender.last_confirmed_mark`` field — barge-in heuristics
2550
+ * compare this against the latest sent mark to decide whether the agent's
2551
+ * audio has actually reached the caller yet.
2552
+ */
2553
+ lastConfirmedMark: string;
2554
+ /**
2555
+ * Handle a Twilio ``mark`` event acknowledging that a previously sent
2556
+ * audio chunk has been played out. Mirrors Python's
2557
+ * ``twilio_handler.py``: ``audio_sender.on_mark_confirmed(mark_name)`` +
2558
+ * ``handler.on_mark(mark_name)``.
2559
+ */
2560
+ /** Handle a Twilio Media Streams `mark` event acknowledging audio playback boundaries. */
2561
+ onMark(markName: string): Promise<void>;
2562
+ /**
2563
+ * Await the backgrounded turn dispatch during teardown, but never block
2564
+ * longer than ``DISPATCH_SETTLE_TIMEOUT_MS``. The earlier ``llmAbort.abort()``
2565
+ * settles the built-in LLM/TTS paths immediately; the cap only bites a
2566
+ * misbehaving user ``onMessage`` parked on a hung external call (JS promises
2567
+ * can't be cancelled). No-op when nothing is in flight.
2568
+ */
2569
+ private settleDispatchForTeardown;
2570
+ /**
2571
+ * Apply per-call agent overrides returned by ``onCallStart``. snake_case
2572
+ * keys mirror the Python payload contract (``apply_call_overrides``);
2573
+ * ``stt_config``/``tts_config`` dicts are Python-only (TS agents carry
2574
+ * adapter instances, not configs) and are ignored here with a warning.
2575
+ * The deps object is per-handler, so swapping its ``agent`` is call-local.
2576
+ */
2577
+ private applyCallOverrides;
2578
+ /** Handle call stop / stream end. */
2579
+ /** Handle a carrier-emitted `stop` event signalling the call has ended. */
2580
+ /** Append a post-decode PCM16-16k frame to the rolling 8 s window. */
2581
+ private semanticBufferAppend;
2582
+ /** Concatenate the rolling window for one detector prediction. */
2583
+ private semanticWindowBytes;
2584
+ /**
2585
+ * Drop the rolling window — called when a turn commits so the next
2586
+ * turn's window contains only its own audio (mirrors the reference
2587
+ * smart-turn integrations, which score per-turn audio).
2588
+ */
2589
+ private resetSemanticWindow;
2590
+ /**
2591
+ * Score the rolling window; finalize, or hold for more silence.
2592
+ *
2593
+ * Fail-open AND fail-once: the first detector error falls back to the
2594
+ * legacy immediate finalize (``vad_silence`` trigger) and disables
2595
+ * semantic endpointing for the remainder of the call — a broken model
2596
+ * must never stall a live phone call, and a permanently broken one
2597
+ * (onnxruntime-node missing/incompatible, model file gone) must produce
2598
+ * a single clear warning, not one per turn.
2599
+ */
2600
+ private semanticEouCheck;
2601
+ /** Arm the hold state + the wall-clock backstop for the hard cap. */
2602
+ private beginSemanticHold;
2603
+ /** Drop the hold (and its backstop timer) without finalizing. Idempotent. */
2604
+ private cancelSemanticHold;
2605
+ /**
2606
+ * Advance the audio clock of an active hold by one inbound frame.
2607
+ *
2608
+ * Finalizes (``vad_silence``) once the hard cap is reached; otherwise
2609
+ * re-runs the detector after each additional ``SEMANTIC_POLL_MS`` of
2610
+ * silence so a model that flips to "complete" with more trailing
2611
+ * silence commits the turn as ``semantic_turn_detector``.
2612
+ */
2613
+ private pollSemanticHold;
2614
+ /**
2615
+ * Hard cap reached: finalize anyway so the turn can never hang. The
2616
+ * semantic model never agreed, so the commit reason is the accumulated
2617
+ * silence — the EOU trigger stays ``vad_silence``.
2618
+ */
2619
+ private resolveSemanticHoldCap;
2620
+ /**
2621
+ * Ask the STT provider to finalize the in-flight utterance NOW.
2622
+ * Optional-chained — Whisper-class adapters that don't support
2623
+ * per-utterance finalisation simply skip. Extracted verbatim from the
2624
+ * VAD ``speech_end`` branch so the default path stays byte-identical
2625
+ * and the semantic turn-detector paths reuse it.
2626
+ */
2627
+ private finalizeSttForEou;
2628
+ handleStop(): Promise<void>;
2629
+ /** Handle WebSocket close event. */
2630
+ /** Tear down adapter, STT/TTS, and per-call state when the carrier WebSocket closes. */
2631
+ handleWsClose(): Promise<void>;
2632
+ /** Close STT at most once; swallow errors. */
2633
+ private closeSttOnce;
2634
+ /**
2635
+ * Encode a PCM 16kHz audio chunk for the telephony provider.
2636
+ *
2637
+ * Both Twilio and Telnyx negotiate PCMU (mulaw) 8 kHz on the bidirectional
2638
+ * media stream — Twilio always, and Telnyx because ``streaming_start``
2639
+ * (server.ts) requests ``stream_bidirectional_codec=PCMU`` at 8 kHz. So
2640
+ * the wire format for both providers is mulaw 8 kHz; we resample 16 kHz
2641
+ * PCM16 → 8 kHz then encode to mulaw. Mirrors the Python pipeline path
2642
+ * (libraries/python/getpatter/handlers/telnyx_handler.py::TelnyxAudioSender).
2643
+ *
2644
+ * Maintains a 1-byte carry across calls so unaligned HTTP chunks from
2645
+ * streaming TTS providers never byte-swap the PCM16 samples downstream.
2646
+ */
2647
+ private encodePipelineAudio;
2648
+ /**
2649
+ * Cached result of ``isTtsOutputFormatNativeForCarrier()`` — settled
2650
+ * once at ``initPipeline`` time after ``setTelephonyCarrier`` has run
2651
+ * on the TTS adapter. Stable for the call lifetime: changes to the
2652
+ * adapter's output format mid-call would NOT flip this. ``true`` means
2653
+ * ``encodePipelineAudio`` can take the bypass path.
2654
+ */
2655
+ private ttsOutputFormatNativeForCarrier;
2656
+ /**
2657
+ * Probe whether the TTS adapter is configured to emit bytes already in
2658
+ * the carrier's wire codec. Currently: Twilio expects ``ulaw_8000``,
2659
+ * Telnyx expects ``pcm_16000`` (no client transcode in either case if
2660
+ * matched). Anything else takes the resample-and-encode path.
2661
+ */
2662
+ private isTtsOutputFormatNativeForCarrier;
2663
+ /**
2664
+ * Prepend any carry byte from the previous chunk, return the even-length
2665
+ * portion, and stash the final odd byte (if any) for the next call.
2666
+ */
2667
+ private alignPcm16;
2668
+ /**
2669
+ * Stream a cached firstMessage buffer in pacing-friendly chunks.
2670
+ *
2671
+ * Splits ``prewarmBytes`` into 20 ms slices (matching Twilio's PSTN
2672
+ * frame quantum) and forwards each through
2673
+ * ``deps.bridge.sendAudio`` exactly like the live TTS path does —
2674
+ * preserving Twilio mark/clear granularity. A
2675
+ * single multi-second sendAudio call would push the whole intro into
2676
+ * the carrier in one go and a ``sendClear`` issued mid-buffer would
2677
+ * have nothing to clear ("agent keeps talking after barge-in" UX bug
2678
+ * on the very first turn).
2679
+ *
2680
+ * Returns ``true`` when at least one chunk hit the wire — the caller
2681
+ * uses that to decide whether to record TTS-first-byte / turn-complete
2682
+ * metrics.
2683
+ */
2684
+ private streamPrewarmBytes;
2685
+ /**
2686
+ * Iterate ``bytes`` in 20 ms slices (Twilio PSTN frame quantum) and
2687
+ * forward each via ``deps.bridge.sendAudio`` with mark-gated pacing
2688
+ * (Twilio) or playout-time-based pacing (Telnyx). Caps the carrier-
2689
+ * side buffer at ``FIRST_MESSAGE_MARK_WINDOW`` chunks so a barge-in's
2690
+ * ``sendClear`` has ~120 ms (Twilio) or zero (Telnyx, immediately
2691
+ * after the latest sleep) of audio to flush.
2692
+ *
2693
+ * Bails immediately when ``isSpeaking`` flips to false — both via the
2694
+ * loop's pre-iteration check and via ``drainPendingMarks`` (called from
2695
+ * ``cancelSpeaking``) which unblocks any in-flight ``waitForMarkWindow``.
2696
+ *
2697
+ * Returns ``true`` when at least one chunk hit the wire — the caller
2698
+ * uses that to decide whether to record TTS-first-byte / turn-complete
2699
+ * metrics. See BUG #128 for the regression this fix targets.
2700
+ */
2701
+ /**
2702
+ * Stream the configured greeting — runs as a BACKGROUND task.
2703
+ *
2704
+ * ``handleCallStart`` used to execute this inline; with the carrier WS
2705
+ * events now serialized onto a per-connection FIFO, that blocked EVERY
2706
+ * media frame for the whole greeting (VAD/barge-in structurally
2707
+ * impossible on the first message, mark acks unread). ``handleCallStart``
2708
+ * awaits ``beginSpeaking(true)`` BEFORE spawning this task so the
2709
+ * self-hearing guard engages from the very first inbound frame.
2710
+ * Mirrors the Python ``_play_first_message`` task.
2711
+ */
2712
+ private playFirstMessage;
2713
+ private sendPacedFirstMessageBytes;
2714
+ private initPipeline;
2715
+ /** Build a HookContext for the current call state. */
2716
+ private buildHookContext;
2717
+ /** Synthesize a single sentence through TTS with hooks, sending audio to telephony. */
2718
+ private synthesizeSentence;
2719
+ /** Handle a final transcript from STT in pipeline mode. */
2720
+ private handleTranscript;
2721
+ private processTranscript;
2722
+ /**
2723
+ * Post-commit turn body (LLM dispatch → TTS → turn-complete) run as a
2724
+ * tracked background task so the transcript drain loop is not blocked for
2725
+ * the whole (possibly 30-90 s) agent-runtime turn. A barge-in — transcript
2726
+ * (now reachable mid-turn) or VAD — aborts the in-flight ``llmAbort`` and
2727
+ * flips ``isSpeaking``, which the LLM/TTS loops here observe and break on.
2728
+ * Parity with Python ``_dispatch_turn``.
2729
+ */
2730
+ private dispatchTurn;
2731
+ /**
2732
+ * Barge-in: caller spoke over in-flight TTS. Flip ``isSpeaking`` so the
2733
+ * sentence loop exits on its next check, clear downstream audio buffers,
2734
+ * record the interruption, and return ``true`` so the caller skips the
2735
+ * turn-complete record.
2736
+ */
2737
+ private handleBargeInAsync;
2738
+ /**
2739
+ * Synchronous wrapper that callers in legacy code paths can keep using.
2740
+ * When ``bargeInStrategies`` is empty the work is fully synchronous and
2741
+ * the result is correct. With strategies the call is dispatched as a
2742
+ * floating promise — non-confirmed transcripts simply skip the cancel
2743
+ * and the legacy boolean return is meaningless under that opt-in path.
2744
+ */
2745
+ private handleBargeIn;
2746
+ /**
2747
+ * Run the cancel/flush sequence for a confirmed barge-in. Shared by
2748
+ * the legacy synchronous path and the strategy-confirmed async path.
2749
+ */
2750
+ private runBargeInCancel;
2751
+ /** Mark a VAD-detected barge-in as pending (no cancel yet). */
2752
+ private startPendingBargeIn;
2753
+ /** Drop pending state without cancelling — used on confirm and on
2754
+ * agent stop. Idempotent. */
2755
+ private clearPendingBargeIn;
2756
+ /** Retained-audio cap in bytes for the active TTS chunk format (mirrors
2757
+ * the bytes-per-ms logic of ``trackOutboundPlayback``). */
2758
+ private pauseRetainedCapBytes;
2759
+ /**
2760
+ * Whether a VAD ``speech_start`` during the agent's turn should take the
2761
+ * pause-and-resume path instead of cancel/pending. Requires
2762
+ * ``bargeInMode: 'pause_resume'`` AND resumable state: a dispatch in
2763
+ * flight (the sentence/TTS loops honour the pause gate) or retained
2764
+ * sentence audio from a just-completed turn still playing out of the
2765
+ * carrier buffer. The firstMessage paced sender keeps today's
2766
+ * immediate-cancel behaviour (its prewarm-bytes path has no retained
2767
+ * sentences to resume from — known limitation).
2768
+ */
2769
+ private shouldPauseForBargeIn;
2770
+ /**
2771
+ * Resume offset at SENTENCE granularity: the index (into
2772
+ * ``turnSpokenSegments`` / ``turnSentenceAudio``) of the first sentence
2773
+ * whose playback had NOT completed when the pause landed — computed from the #164
2774
+ * playback-cursor bookkeeping (``heard = totalPushed - carrierBacklog``).
2775
+ * Granularity choice: the partially-played sentence is replayed from its
2776
+ * start (mark/clear bookkeeping is per-sentence and a clipped sentence
2777
+ * restarted at its boundary sounds like a natural repair), rather than
2778
+ * resumed mid-word at a byte offset.
2779
+ */
2780
+ private computePauseResumePoint;
2781
+ /**
2782
+ * PAUSE the agent's output on a VAD ``speech_start`` (pause_resume
2783
+ * mode): gate further sends on ``outputPaused``, ``sendClear`` the
2784
+ * carrier so queued audio stops quickly, and schedule the
2785
+ * false-interruption resume timer. The LLM stream and the TTS provider
2786
+ * stream are deliberately NOT cancelled — tokens keep buffering as
2787
+ * sentences and synthesized audio queues in memory (both bounded) so a
2788
+ * resume can pick up seamlessly.
2789
+ */
2790
+ private startPauseResume;
2791
+ /**
2792
+ * RESUME output after a pause that no transcript confirmed. Re-sends the
2793
+ * cleared-but-unheard tail from the retained sentence audio (sentence
2794
+ * granularity, no TTS re-billing), unpauses the live send path, and
2795
+ * records the event as a FALSE interruption: the overlap closes via
2796
+ * ``recordOverlapEnd(false)`` (the backchannel counter — the
2797
+ * interruption count is NOT incremented) and the turn is never marked
2798
+ * interrupted.
2799
+ */
2800
+ private resumeAfterFalseInterruption;
2801
+ /** Drop all pause-and-resume state (flags + buffers) and wake any
2802
+ * pause-decision waiter. Used by the kill path, fresh turns, and
2803
+ * teardown. Idempotent. */
2804
+ private discardPauseState;
2805
+ /** Block until the in-flight pause resolves. ``true`` → resumed (keep
2806
+ * speaking); ``false`` → killed (turn interrupted). Bounded: fails open
2807
+ * past the confirm window plus margin (the resume timer guarantees a
2808
+ * decision; the margin covers teardown races). Mirrors Python
2809
+ * ``_await_pause_decision``. */
2810
+ private awaitPauseDecision;
2811
+ /** While paused, buffer ``sentence`` (pre-guardrail text) for the resume
2812
+ * drain and return ``true``; return ``false`` when not paused (caller
2813
+ * synthesizes normally). Overflow degrades to a full cancel — the
2814
+ * bounded buffer is a memory-safety valve, not a speech queue. */
2815
+ private bufferSentenceIfPaused;
2816
+ /** Pop-and-return every sentence buffered during the pause. */
2817
+ private releasePausedSentences;
2818
+ /** Open a retention entry for a response sentence (pause_resume mode
2819
+ * only — returns ``null`` otherwise, keeping the legacy send path
2820
+ * byte-identical). Filler / error-fallback audio is never retained
2821
+ * (``recordSegment=false`` callers skip this). */
2822
+ private beginRetainedSentence;
2823
+ /** Append ``chunk`` to the sentence's retention entry, enforcing the
2824
+ * retained-audio cap. Returns ``true`` when retained; ``false`` on
2825
+ * overflow (paused → the turn was just killed; speaking → retention was
2826
+ * released and the caller falls back to direct sends). */
2827
+ private retainPauseChunk;
2828
+ /**
2829
+ * Send every not-yet-sent chunk of a retention entry to the carrier
2830
+ * (claim-then-send so concurrent drains can never double-send). Stamps
2831
+ * the sentence's heard-prefix segment at its first sent chunk — a replay
2832
+ * (``sent`` reset to 0) re-stamps at the new timeline position.
2833
+ * ``force=true`` bypasses the pause gate (resume path only).
2834
+ */
2835
+ private drainSentenceEntry;
2836
+ /** Whether a transcript may KILL a paused turn: it must be a committed
2837
+ * FINAL (interims cannot confirm), not a known STT hallucination, and
2838
+ * not a duplicate of the last committed utterance — the same filter
2839
+ * family ``commitTranscript`` applies, evaluated without consuming its
2840
+ * dedup state (the transcript still flows on to ``commitTranscript`` to
2841
+ * become the user's next turn). */
2842
+ private passesPausedKillFilters;
2843
+ /**
2844
+ * Dedup + throttle + hallucination filter for final STT transcripts.
2845
+ * Mirrors ``PipelineStreamHandler._stt_loop`` on the Python side.
2846
+ * Returns ``true`` when the transcript should be committed to a turn,
2847
+ * ``false`` when it must be dropped. Drop reasons:
2848
+ * - text matches common short hallucinations ("you", "thanks", ...)
2849
+ * - duplicate final within 2 s of previous commit
2850
+ * - back-to-back finals under 500 ms (too tight to be real utterances)
2851
+ */
2852
+ private commitTranscript;
2853
+ /**
2854
+ * Whether a speculative dispatch may start right now. Built-in LLM loop
2855
+ * only (an ``onMessage`` handler may have external side effects per
2856
+ * invocation, so it is never run speculatively), and only while the agent
2857
+ * is idle: not speaking (an interim during agent speech is barge-in
2858
+ * material, not a next turn) and no turn dispatch in flight
2859
+ * (single-in-flight contract). Parity with Python ``_can_speculate``.
2860
+ */
2861
+ private canSpeculate;
2862
+ /**
2863
+ * Read-only mirror of the ``commitTranscript`` filters: a candidate
2864
+ * interim must pass the same hallucination / echo / duplicate checks a
2865
+ * final would face at commit time — otherwise we would speculate on text
2866
+ * whose final is guaranteed to be dropped. Never mutates the dedup state.
2867
+ * Parity with Python ``_speculation_input_ok``.
2868
+ */
2869
+ private speculationInputOk;
2870
+ /**
2871
+ * Track an interim transcript and start a speculation when it qualifies:
2872
+ * (a) it ends with sentence-final punctuation (immediate), or (b) it has
2873
+ * been unchanged for ``preemptiveMinStableMs`` (one-shot stability timer).
2874
+ * No-op when preemptive generation is disabled or the handler cannot
2875
+ * speculate right now. Parity with Python ``_note_interim_transcript``.
2876
+ */
2877
+ private noteInterimTranscript;
2878
+ /** Cancel the pending interim-stability timer, if any. Idempotent. */
2879
+ private clearInterimStabilityTimer;
2880
+ /** Drop interim-stability state — called once a final commits (the
2881
+ * utterance is decided) and on teardown. */
2882
+ private resetInterimTracking;
2883
+ /**
2884
+ * Launch a speculative dispatch for ``interimText``, replacing (and
2885
+ * counting as a miss) any previous speculation on different text. The
2886
+ * task's rejection handler is attached at creation (same contract as
2887
+ * ``dispatchTask``). Parity with Python ``_start_speculation``.
2888
+ */
2889
+ private startSpeculation;
2890
+ /**
2891
+ * Discard the current speculation (if any): signal abort, await the task's
2892
+ * unwind (bounded — JS promises are not cancellable, so a provider that
2893
+ * ignores the signal must not block the caller), and count a miss unless
2894
+ * this is teardown. The speculative task never touched history / carrier /
2895
+ * per-turn metrics, so there is nothing to roll back. Idempotent. Parity
2896
+ * with Python ``_abort_speculation``.
2897
+ */
2898
+ private abortSpeculation;
2899
+ /**
2900
+ * Self-abort from WITHIN the speculative task (LLM error, buffer overflow,
2901
+ * afterTranscribe veto). Marks the speculation unreleasable and
2902
+ * deregisters it so the commit path dispatches normally. Never awaits (the
2903
+ * caller IS the task). Parity with Python ``_fail_speculation_inline``.
2904
+ */
2905
+ private failSpeculationInline;
2906
+ /**
2907
+ * Commit-time decision for the in-flight speculation. Returns ``true``
2908
+ * when the speculation was RELEASED — the caller must NOT dispatch a
2909
+ * normal turn (the speculative task is now the live turn, tracked via
2910
+ * ``dispatchTask``). Returns ``false`` when there was no usable
2911
+ * speculation (none in flight, failed, or mismatched — the mismatch is
2912
+ * discarded here) and the normal dispatch must run.
2913
+ *
2914
+ * On release, the commit-point bookkeeping the normal path performs in
2915
+ * ``processTranscript`` happens HERE — the ``onTranscript`` callback, the
2916
+ * conversation-history user push (final transcript text), and the
2917
+ * turn-committed metric anchors (so TTFT/latency reflect user-perceived
2918
+ * timing from the REAL final-transcript commit) — exactly once per turn.
2919
+ * Parity with Python ``_try_release_speculation``.
2920
+ */
2921
+ private tryReleaseSpeculation;
2922
+ /** Playout duration (ms) of the audio a speculation has buffered so far.
2923
+ * Same bytes-per-ms model as ``trackOutboundPlayback``. */
2924
+ private specBufferMs;
2925
+ /** Push one (already hook-processed) audio chunk of a RELEASED speculation
2926
+ * to the carrier — the same per-chunk bookkeeping ``synthesizeSentence``
2927
+ * performs on the live path. */
2928
+ private specSendChunk;
2929
+ /**
2930
+ * Idempotent release flush: take the floor (``beginSpeaking``), stamp the
2931
+ * post-commit LLM markers, and stream every buffered sentence to the
2932
+ * carrier in order. After this the speculative task continues as a plain
2933
+ * live turn. No-op until the speculation is released. Parity with Python
2934
+ * ``_spec_ensure_flushed``.
2935
+ */
2936
+ private specEnsureFlushed;
2937
+ /**
2938
+ * Synthesize one sentence of an UNRELEASED speculation, holding the audio
2939
+ * in ``spec.buffered``. Transitions to live sending mid-sentence the
2940
+ * moment the release lands. Returns ``false`` when the speculation must
2941
+ * stop (aborted, overflow, or barge-in after a mid-sentence release).
2942
+ * Parity with Python ``_spec_synthesize_buffered``.
2943
+ */
2944
+ private specSynthesizeBuffered;
2945
+ /**
2946
+ * Guardrails + tier-2 hook + synthesis for one speculative sentence —
2947
+ * buffered pre-release, live post-release (same transforms either way).
2948
+ * Returns ``false`` when the turn must stop. Parity with Python
2949
+ * ``_spec_speak_sentence``.
2950
+ */
2951
+ private specSpeakSentence;
2952
+ /**
2953
+ * Turn-complete bookkeeping for a RELEASED speculation — mirrors the tail
2954
+ * of ``runPipelineLlm`` + ``dispatchTurn`` (metrics turn record,
2955
+ * interrupted heard-prefix marker, assistant history entry). Runs exactly
2956
+ * once, after all audio was sent/cancelled. Parity with Python
2957
+ * ``_finish_released_speculation``.
2958
+ */
2959
+ private finishReleasedSpeculation;
2960
+ /**
2961
+ * Body of one speculative turn: LLM stream → sentence chunking → buffered
2962
+ * TTS, then commit-or-discard.
2963
+ *
2964
+ * Until release this task is side-effect free outside ``spec`` itself —
2965
+ * no conversation-history writes, no carrier audio, no per-turn metrics
2966
+ * (LLM token usage/cost IS recorded by ``LLMLoop``: the tokens were
2967
+ * genuinely consumed either way). After release it behaves exactly like a
2968
+ * live ``dispatchTurn`` body. Parity with Python
2969
+ * ``_run_speculative_dispatch``.
2970
+ */
2971
+ private runSpeculativeDispatch;
2972
+ /**
2973
+ * Schedule the opt-in long-turn filler and return its async ``clear()``.
2974
+ *
2975
+ * When ``agent.longTurnMessage`` is unset / empty the returned clear is a
2976
+ * no-op (byte-identical to today's behaviour). Otherwise a one-shot timer
2977
+ * fires after ``agent.longTurnMessageAfterS`` seconds and, IFF no audio has
2978
+ * reached the carrier this turn (``!ttsFirstByteSent.value``) AND we still own
2979
+ * the floor (``this.isSpeaking``), synthesizes the filler ONCE via the same
2980
+ * per-sentence TTS primitive every sentence uses.
2981
+ *
2982
+ * The returned ``clear()`` is **async**: it stops the timer AND, if the filler
2983
+ * already started synthesizing (its ``setTimeout`` callback runs in a separate
2984
+ * macro-task, so it can fire just before the first real sentence), AWAITS the
2985
+ * in-flight synthesis so the filler audio can never interleave with the real
2986
+ * sentence that follows. Idempotent; self-synthesis failure degrades to
2987
+ * silence (never crashes the turn). The caller must clear on first real audio,
2988
+ * on the error branch, and in the finally.
2989
+ */
2990
+ private scheduleLongTurnFiller;
2991
+ /**
2992
+ * Streaming built-in LLM path with sentence chunking and per-sentence
2993
+ * guardrails/TTS. Returns the concatenated (plain) response text plus whether
2994
+ * the turn was cut short by a barge-in — the caller applies the interrupted
2995
+ * marker to history only, keeping metrics on the plain text.
2996
+ */
2997
+ private runPipelineLlm;
2998
+ /**
2999
+ * Non-streaming path (onMessage function / webhook): apply output guardrails,
3000
+ * push to history, sentence-chunk the text, synthesize. Returns ``true`` if
3001
+ * TTS was interrupted mid-flight so the caller can skip turn-complete.
3002
+ */
3003
+ private runRegularLlm;
3004
+ /** Handle streaming WebSocket remote response with TTS. */
3005
+ private handleWebSocketResponse;
3006
+ private initRealtimeAdapter;
3007
+ private handleAdapterEvent;
3008
+ /** Event-type → handler dispatch table for the Realtime adapter. */
3009
+ private readonly adapterEventHandlers;
3010
+ private userSpeechStartMs;
3011
+ private agentTurnStartMs;
3012
+ private emitUserSpeechStarted;
3013
+ private emitUserSpeechEnded;
3014
+ private emitUserSpeechEos;
3015
+ private emitAgentSpeechStarted;
3016
+ private emitAgentSpeechEnded;
3017
+ /** Fire the per-turn LLM TTFT marker. Idempotent in the dispatcher
3018
+ * — guarded by `firstTokenForTurn` on the SpeechEvents instance. */
3019
+ private emitLlmFirstToken;
3020
+ /** Fire the per-turn first-TTS-audio marker. Idempotent in the
3021
+ * dispatcher — guarded by `firstAudioForTurn`. The provider tag falls
3022
+ * back to the engine name for Realtime / ConvAI (no separate TTS). */
3023
+ private emitAudioOut;
3024
+ private onAdapterAudio;
3025
+ private onAdapterSpeechStopped;
3026
+ private onAdapterTranscriptInput;
3027
+ /**
3028
+ * Push an assistant turn into history, fire `onTranscript`, and emit
3029
+ * turn-complete metrics. Shared between the immediate path (no user
3030
+ * transcript pending) and the buffered path (flushed after user
3031
+ * transcript arrives or fallback timer fires).
3032
+ */
3033
+ private flushAssistantTurn;
3034
+ /**
3035
+ * Push an assistant turn into history and fire `onTranscript` so host
3036
+ * applications observe pipeline-mode replies the same way they observe
3037
+ * realtime-mode replies. Mirrors `_emit_assistant_transcript` in the
3038
+ * Python SDK and parallels `flushAssistantTurn` (realtime path).
3039
+ * Caller is responsible for filtering empty strings.
3040
+ */
3041
+ private emitAssistantTranscript;
3042
+ /**
3043
+ * Surface a tool invocation from pipeline mode into the transcript
3044
+ * timeline. Emits TWO events: one for the call (`name(argsJson)`) and
3045
+ * one for the result (`name(...) → result`, truncated to 200 chars).
3046
+ * Mirrors realtime mode's two `emitToolEvent` calls in
3047
+ * `handleFunctionCall`. Wired as the `LLMLoop` `onToolCall` observer.
3048
+ */
3049
+ private recordToolCall;
3050
+ private onAdapterTranscriptOutput;
3051
+ private onAdapterResponseDone;
3052
+ private onAdapterSpeechInterrupt;
3053
+ /**
3054
+ * Handle a Realtime ``error`` event (issue #154, fix 4).
3055
+ *
3056
+ * Both Realtime providers dispatch ``('error', …)`` for server-side errors,
3057
+ * non-normal socket closes, and socket errors, but the stream handler
3058
+ * previously had no entry for it in the dispatch table so these were
3059
+ * silently swallowed. We surface them at WARN level with ONLY the error
3060
+ * envelope fields (``type`` / ``code`` / ``message``) — never any audio or
3061
+ * transcript body, to avoid logging PII. The call is NOT terminated: the
3062
+ * provider decides whether to recover, and many of these (e.g. a transient
3063
+ * ``input_audio_buffer_commit_empty``) are non-fatal. Parity with the
3064
+ * Python ``elif ev_type == 'error'`` branches.
3065
+ */
3066
+ private onAdapterError;
3067
+ /**
3068
+ * Emit a tool-invocation event into the transcript timeline. Pushes a
3069
+ * `role=tool` entry into `history` (so it appears in the dashboard
3070
+ * transcript next to user/assistant turns) AND fires `onTranscript` so
3071
+ * the host application can log / persist / render it. `result` is
3072
+ * truncated for log readability — the full payload is in history.
3073
+ */
3074
+ private emitToolEvent;
3075
+ /**
3076
+ * Execute an ElevenLabs ``client_tool_call`` and ALWAYS answer it — a
3077
+ * missing client_tool_result stalls the ElevenLabs agent until its own
3078
+ * tool timeout. transfer_call/end_call declared as ElevenLabs client
3079
+ * tools route to the carrier helpers. Mirrors Python
3080
+ * ``_handle_convai_client_tool``.
3081
+ */
3082
+ private handleConvAIClientTool;
3083
+ private handleFunctionCall;
3084
+ /**
3085
+ * The effective per-call tool list for the CURRENT agent: target tools plus
3086
+ * the built-in consult tool when configured (deduped by name). Used after a
3087
+ * handoff to rebuild `resolvedTools`.
3088
+ */
3089
+ private effectiveToolsForCurrentAgent;
3090
+ /**
3091
+ * Dispatch the built-in `handoff_to` tool on the Realtime path.
3092
+ *
3093
+ * Swaps the live session to the target agent's configuration via a
3094
+ * mid-session `session.update` (new `instructions` + `tools`), updates
3095
+ * `currentAgent` / `resolvedTools` so subsequent tool dispatch resolves
3096
+ * against the target's tool list, and records a system-style history entry
3097
+ * so transcripts show the handoff. ALWAYS sends a function result — an
3098
+ * unknown name / malformed args produce an error envelope, never silence
3099
+ * (a missing function result would wedge the model).
3100
+ *
3101
+ * Voice is intentionally NOT swapped: OpenAI Realtime rejects a voice
3102
+ * change once the session has produced audio, so the session keeps the
3103
+ * voice established at call start (documented limitation; an info log is
3104
+ * emitted when the target requested a different voice). Parity with the
3105
+ * Python `_handle_handoff_function_call`.
3106
+ */
3107
+ private handleHandoffFunctionCall;
3108
+ /**
3109
+ * Swap the live pipeline call to the named handoff target agent.
3110
+ *
3111
+ * Updates `currentAgent` (the shared `AgentOptions` is never mutated),
3112
+ * swaps the LLM loop's system prompt + tool list so the NEXT turn runs as
3113
+ * the target agent, and appends a system-style history entry recording the
3114
+ * handoff. ALWAYS returns a tool-result string — an unknown name produces
3115
+ * an error envelope, never silence.
3116
+ *
3117
+ * Live audio infrastructure (STT/TTS/VAD instances — and therefore the
3118
+ * speaking voice) established at call start is intentionally retained:
3119
+ * swapping a connected TTS provider mid-call is not supported in v1.
3120
+ * Parity with the Python `_perform_handoff`.
3121
+ */
3122
+ private performHandoff;
3123
+ /**
3124
+ * Build the full pipeline tool list for the CURRENT agent: user tools +
3125
+ * built-in `transfer_call` / `end_call` + the `handoff_to` tool when
3126
+ * handoff targets are configured. Re-invoked after a handoff so the LLM
3127
+ * loop advertises the target agent's tools (including its onward handoff
3128
+ * map). Parity with the Python `_build_combined_pipeline_tools`.
3129
+ */
3130
+ private buildPipelineLlmTools;
3131
+ private fireCallEnd;
3132
+ }
3133
+
1530
3134
  /** Resolved configuration consumed by `EmbeddedServer` (carrier credentials, webhook URL, etc.). */
1531
3135
  interface LocalConfig {
1532
3136
  twilioSid?: string;
@@ -2370,10 +3974,11 @@ declare class OpenAILLMProvider implements LLMProvider {
2370
3974
  /** Pipeline-mode LLM driver: runs the chat loop, dispatches tool calls, and emits text deltas. */
2371
3975
  declare class LLMLoop {
2372
3976
  private readonly provider;
2373
- private readonly systemPrompt;
2374
- private readonly tools;
2375
- private readonly openaiTools;
2376
- private readonly toolMap;
3977
+ private systemPrompt;
3978
+ private disablePhonePreamble;
3979
+ private tools;
3980
+ private openaiTools;
3981
+ private toolMap;
2377
3982
  private toolExecutor;
2378
3983
  private eventBus?;
2379
3984
  private readonly _providerName;
@@ -2382,6 +3987,28 @@ declare class LLMLoop {
2382
3987
  private _loggedUsageFallback;
2383
3988
  private onToolCall?;
2384
3989
  constructor(apiKey: string, model: string, systemPrompt: string, tools?: ToolDefinition[] | null, llmProvider?: LLMProvider, disablePhonePreamble?: boolean);
3990
+ /**
3991
+ * Prepend {@link DEFAULT_PHONE_PREAMBLE} unless disabled — byte-identical
3992
+ * to the historical inline constructor logic.
3993
+ */
3994
+ private static applyPhonePreamble;
3995
+ /** (Re)build `openaiTools` and `toolMap` from a tool list. */
3996
+ private rebuildToolState;
3997
+ /**
3998
+ * Swap the system prompt and/or tool list mid-call (multi-agent handoff).
3999
+ *
4000
+ * Takes effect on the NEXT turn — `run` builds its messages array (with
4001
+ * the system prompt at index 0) per turn, and reads `openaiTools` per
4002
+ * provider iteration, so a swap that lands while a turn is in flight
4003
+ * finishes the current turn under the old prompt and runs every subsequent
4004
+ * turn as the new agent. Omitted fields keep the corresponding current
4005
+ * value. Mirrors Python `LLMLoop.update_agent`.
4006
+ */
4007
+ updateAgent(update: {
4008
+ systemPrompt?: string;
4009
+ tools?: ToolDefinition[];
4010
+ disablePhonePreamble?: boolean;
4011
+ }): void;
2385
4012
  /**
2386
4013
  * Swap in a custom tool executor (e.g. different retry policy, metrics
2387
4014
  * wrapping, tenant-aware fan-out). The default is ``DefaultToolExecutor``.
@@ -2786,6 +4413,42 @@ interface OpenAICompatibleConsult {
2786
4413
  */
2787
4414
  readonly sessionHeader?: string;
2788
4415
  }
4416
+ /**
4417
+ * Options for a call transfer initiated via the built-in `transfer_call`
4418
+ * tool or `TelephonyBridge.transferCall`.
4419
+ *
4420
+ * Mirrors Python's `mode` / `summary` keywords on the per-carrier transfer
4421
+ * functions.
4422
+ */
4423
+ interface TransferCallOptions {
4424
+ /**
4425
+ * `'cold'` (default) redirects the caller immediately — byte-identical to
4426
+ * the historical blind transfer. `'warm'` puts the caller on hold music,
4427
+ * dials the target with an announced {@link summary}, then bridges the two
4428
+ * together (Twilio only for now; other carriers return an error envelope).
4429
+ */
4430
+ readonly mode?: 'cold' | 'warm';
4431
+ /**
4432
+ * Warm mode only — one or two sentences announced to the human agent
4433
+ * before the caller is bridged (who is calling and what they need).
4434
+ */
4435
+ readonly summary?: string;
4436
+ }
4437
+ /**
4438
+ * Result of a transfer attempt. Cold transfers may resolve `void` (legacy
4439
+ * contract); warm transfers resolve a result envelope —
4440
+ * `{ status: 'transferring', mode: 'warm', ... }` on success or
4441
+ * `{ error: ... }` when warm transfer is unsupported on the carrier or the
4442
+ * carrier REST sequence failed (the call keeps running in that case).
4443
+ */
4444
+ interface TransferCallResult {
4445
+ readonly status?: string;
4446
+ readonly mode?: 'cold' | 'warm';
4447
+ readonly to?: string;
4448
+ /** Per-call conference name (Twilio warm transfers). */
4449
+ readonly conference?: string;
4450
+ readonly error?: string;
4451
+ }
2789
4452
  /** Constructor options for `new Patter({...})` in local-server mode. */
2790
4453
  interface LocalOptions {
2791
4454
  /**
@@ -2945,6 +4608,29 @@ interface VADProvider {
2945
4608
  */
2946
4609
  reset?(): Promise<void> | void;
2947
4610
  }
4611
+ /**
4612
+ * Semantic end-of-utterance (turn) detector.
4613
+ *
4614
+ * Predicts whether the caller has FINISHED their turn — as opposed to a
4615
+ * VAD, which only reports whether they are currently producing sound.
4616
+ * Implementations include `SmartTurnDetector` (pipecat-ai smart-turn v3,
4617
+ * ONNX). Used via `Agent.turnDetector`; integrated in the pipeline stream
4618
+ * handler on the VAD `speech_end` edge to defer the STT finalize until the
4619
+ * model agrees the turn is complete (bounded by `Agent.maxSemanticHoldMs`).
4620
+ * Mirror of the Python `TurnDetectorProvider` ABC.
4621
+ */
4622
+ interface TurnDetectorProvider {
4623
+ /** End-of-turn probability at/above which the turn is complete. */
4624
+ readonly threshold: number;
4625
+ /**
4626
+ * Return the end-of-turn probability in `[0, 1]` for the window.
4627
+ * `pcm16Window` is mono int16 little-endian PCM at 16 kHz covering the
4628
+ * most recent seconds of caller audio (the handler keeps a rolling
4629
+ * ~8 s buffer).
4630
+ */
4631
+ predict(pcm16Window: Buffer): Promise<number>;
4632
+ close(): Promise<void>;
4633
+ }
2948
4634
  /** Pre-STT audio filter — noise cancellation, gain, EQ. */
2949
4635
  interface AudioFilter {
2950
4636
  process(pcmChunk: Buffer, sampleRate: number): Promise<Buffer>;
@@ -3078,6 +4764,18 @@ interface AgentOptions {
3078
4764
  * disables it. See {@link ConsultConfig}.
3079
4765
  */
3080
4766
  readonly consult?: ConsultConfig;
4767
+ /**
4768
+ * Multi-agent handoff targets: ``{ name: agentOptions }``. When set, Patter
4769
+ * auto-injects a built-in ``handoff_to(name, reason?)`` tool (Realtime +
4770
+ * Pipeline modes); calling it swaps the CURRENT call to the target agent's
4771
+ * configuration mid-call — system prompt, tools, variables, guardrails,
4772
+ * and onward ``handoffs`` are taken from the target. Audio infrastructure
4773
+ * established at call start (STT/TTS/engine connection — and therefore
4774
+ * voice on engines that cannot switch voice mid-session) is retained.
4775
+ * Chained handoffs follow the TARGET's own ``handoffs`` map. ``undefined``
4776
+ * (default) disables the tool. Mirrors Python ``Agent.handoffs``.
4777
+ */
4778
+ readonly handoffs?: Readonly<Record<string, AgentOptions>>;
3081
4779
  /**
3082
4780
  * When ``true``, ship ``systemPrompt`` to the LLM verbatim. Default
3083
4781
  * (``false``) prepends a phone-friendly preamble that instructs the
@@ -3132,6 +4830,26 @@ interface AgentOptions {
3132
4830
  readonly textTransforms?: ReadonlyArray<(text: string) => string>;
3133
4831
  /** Optional server-side VAD (e.g., Silero). Pipeline mode only. */
3134
4832
  readonly vad?: VADProvider;
4833
+ /**
4834
+ * Opt-in semantic end-of-utterance model (e.g. `SmartTurnDetector.load()`
4835
+ * — pipecat-ai smart-turn v3, ONNX). Pipeline mode only. When set, a VAD
4836
+ * `speech_end` no longer finalizes the STT utterance immediately: the
4837
+ * detector scores the last ~8 s of caller audio and the turn is committed
4838
+ * only once the end-of-turn probability reaches `turnDetector.threshold`
4839
+ * (the EOU trigger is then stamped `semantic_turn_detector`). While the
4840
+ * model says "incomplete" the handler re-polls on subsequent silence,
4841
+ * bounded by `maxSemanticHoldMs`. Undefined (default) keeps today's pure
4842
+ * VAD-silence endpointing byte-identical.
4843
+ */
4844
+ readonly turnDetector?: TurnDetectorProvider;
4845
+ /**
4846
+ * Hard cap (ms) on how long the semantic turn detector may hold a turn
4847
+ * open past the VAD `speech_end` before the SDK finalizes anyway (with
4848
+ * the `vad_silence` trigger), so a turn can never hang on a model that
4849
+ * keeps predicting "incomplete". Only consulted when `turnDetector` is
4850
+ * set. Default 1200 ms.
4851
+ */
4852
+ readonly maxSemanticHoldMs?: number;
3135
4853
  /** Optional pre-STT audio filter (noise cancellation). Pipeline mode only. */
3136
4854
  readonly audioFilter?: AudioFilter;
3137
4855
  /** Optional background audio mixer (hold music, thinking cues). Pipeline mode only. */
@@ -3163,10 +4881,32 @@ interface AgentOptions {
3163
4881
  /**
3164
4882
  * Maximum time (ms) to wait for at least one strategy to confirm a
3165
4883
  * pending barge-in before discarding the pending state and resuming
3166
- * TTS. Only consulted when ``bargeInStrategies`` is non-empty.
4884
+ * TTS. Consulted when ``bargeInStrategies`` is non-empty AND as the
4885
+ * false-interruption window for ``bargeInMode: 'pause_resume'``.
3167
4886
  * Default: 1500.
3168
4887
  */
3169
4888
  readonly bargeInConfirmMs?: number;
4889
+ /**
4890
+ * How a VAD ``speech_start`` during the agent's turn is handled
4891
+ * (pipeline mode):
4892
+ *
4893
+ * - ``'cancel'`` (default): today's behaviour — the in-flight turn is
4894
+ * cancelled immediately (or marked pending when
4895
+ * ``bargeInStrategies`` are configured).
4896
+ * - ``'pause_resume'`` (false-interruption handling):
4897
+ * output is PAUSED immediately — the carrier buffer is cleared and
4898
+ * no further TTS audio is sent — while the LLM stream and the TTS
4899
+ * provider stream stay alive (tokens buffer as sentences,
4900
+ * synthesized audio queues in memory, both bounded). If a committed
4901
+ * final transcript confirms the interruption within
4902
+ * ``bargeInConfirmMs`` the turn is cancelled exactly as in
4903
+ * ``'cancel'`` mode; if the window expires with no transcript (a
4904
+ * cough, line noise) the agent RESUMES from the first sentence the
4905
+ * caller had not fully heard, re-sending retained audio without
4906
+ * re-billing TTS, and the event is recorded as a false interruption
4907
+ * (a backchannel — not an interruption — in metrics).
4908
+ */
4909
+ readonly bargeInMode?: 'cancel' | 'pause_resume';
3170
4910
  /**
3171
4911
  * When ``true`` (default), ``Patter.call`` warms up the STT, TTS, and
3172
4912
  * LLM provider connections in parallel with the carrier-side
@@ -3209,6 +4949,31 @@ interface AgentOptions {
3209
4949
  * currency, balanced delimiter, ellipsis).
3210
4950
  */
3211
4951
  readonly aggressiveFirstFlush?: boolean;
4952
+ /**
4953
+ * PREEMPTIVE GENERATION (pipeline mode, built-in LLM loop only; opt-in).
4954
+ * When ``true`` the SDK starts the LLM — and sentence-chunked TTS
4955
+ * synthesis — EARLY on a confident INTERIM transcript (one that ends with
4956
+ * sentence-final punctuation, or that has been unchanged for
4957
+ * ``preemptiveMinStableMs``), holding all synthesized audio in memory.
4958
+ * When the FINAL transcript commits: if it matches the speculated interim
4959
+ * (normalized — case/punctuation/whitespace-insensitive) the buffered
4960
+ * audio is RELEASED to the carrier immediately (the LLM+TTS latency was
4961
+ * paid during the user's own end-of-utterance silence); if it differs,
4962
+ * the speculation is discarded silently and the turn dispatches normally
4963
+ * on the final. History and metrics record exactly one turn either way.
4964
+ * The standard voice-AI "preemptive generation" pattern. Default:
4965
+ * ``false`` — every turn waits for the final transcript, as today.
4966
+ * Mirrors Python ``preemptive_generation``.
4967
+ */
4968
+ readonly preemptiveGeneration?: boolean;
4969
+ /**
4970
+ * Interim-stability window (ms) for preemptive generation: an interim
4971
+ * transcript that does NOT end with sentence-final punctuation qualifies
4972
+ * for speculation only once it has remained unchanged for this long.
4973
+ * Only consulted when ``preemptiveGeneration`` is true. Default: 300.
4974
+ * Mirrors Python ``preemptive_min_stable_ms``.
4975
+ */
4976
+ readonly preemptiveMinStableMs?: number;
3212
4977
  /**
3213
4978
  * Input noise reduction for speakerphone / conference audio (OpenAI
3214
4979
  * Realtime mode only). `undefined` (default) omits the field entirely
@@ -3284,7 +5049,13 @@ interface ServeOptions {
3284
5049
  readonly port?: number;
3285
5050
  /** When true, start a cloudflared tunnel automatically (requires `cloudflared` npm package). */
3286
5051
  readonly tunnel?: boolean;
3287
- readonly onCallStart?: (data: Record<string, unknown>) => Promise<void>;
5052
+ /**
5053
+ * Called when a call's media stream starts. Returning an object applies
5054
+ * PER-CALL AGENT OVERRIDES (snake_case keys: system_prompt, voice, model,
5055
+ * language, first_message, provider, tools, variables) — parity with the
5056
+ * Python SDK. Return nothing to just observe.
5057
+ */
5058
+ readonly onCallStart?: (data: Record<string, unknown>) => Promise<void | Record<string, unknown> | undefined> | void | Record<string, unknown>;
3288
5059
  readonly onCallEnd?: (data: Record<string, unknown>) => Promise<void>;
3289
5060
  readonly onTranscript?: (data: Record<string, unknown>) => Promise<void>;
3290
5061
  /** Pipeline mode only — called with the user's transcript; return value is spoken.
@@ -3294,6 +5065,19 @@ interface ServeOptions {
3294
5065
  readonly onMetrics?: (data: Record<string, unknown>) => Promise<void>;
3295
5066
  /** When true, record calls via the Twilio Recordings API. */
3296
5067
  readonly recording?: boolean;
5068
+ /**
5069
+ * Carrier-neutral local call recording. When `true`, the SDK records each
5070
+ * call at the transport as an interleaved stereo WAV — left channel =
5071
+ * caller, right channel = agent — at 16 kHz PCM16, written incrementally
5072
+ * to `<call_log_dir>/recording.wav` when call logging (`persist` /
5073
+ * `PATTER_LOG_DIR`) is enabled, else to `./recordings/<call_id>.wav`.
5074
+ * Pass a directory string to choose where the WAVs go. Works on every
5075
+ * carrier (Twilio, Telnyx, Plivo) and every engine mode; independent of
5076
+ * the carrier-side `recording` flag (both can be on). The final path is
5077
+ * surfaced as `recording_path` in the `onCallEnd` payload and in the
5078
+ * call-log metadata. Default `false`.
5079
+ */
5080
+ readonly localRecording?: boolean | string;
3297
5081
  /** If set, spoken as a voicemail message when AMD detects a machine. */
3298
5082
  readonly voicemailMessage?: string;
3299
5083
  /** Custom pricing overrides for cost calculation. */
@@ -3373,6 +5157,13 @@ interface MachineDetectionResult {
3373
5157
  interface LocalCallOptions {
3374
5158
  readonly to: string;
3375
5159
  readonly agent: AgentOptions;
5160
+ /**
5161
+ * Per-call greeting override — what the AI says when the callee answers.
5162
+ * Overrides ``agent.firstMessage`` for this call only (prewarm synthesis
5163
+ * and the stream handler both read the overridden value). Parity with
5164
+ * Python ``call(first_message=...)``.
5165
+ */
5166
+ readonly firstMessage?: string;
3376
5167
  /**
3377
5168
  * Enable answering-machine detection. **Defaults to ``true``** — the SDK
3378
5169
  * asks Twilio (``MachineDetection=DetectMessageEnd`` + Async AMD) or
@@ -3465,15 +5256,6 @@ interface CallResult {
3465
5256
  readonly metrics: CallMetrics | null;
3466
5257
  }
3467
5258
 
3468
- /**
3469
- * Shared STT / TTS adapter dispatch.
3470
- *
3471
- * In v0.5.0+ callers always pass pre-instantiated adapters (``agent.stt`` /
3472
- * ``agent.tts`` are ``STTAdapter`` / ``TTSAdapter`` instances), so these
3473
- * helpers are thin pass-throughs that return the instance or null. Kept as
3474
- * functions so the Twilio/Telnyx bridges have a single dispatch point.
3475
- */
3476
-
3477
5259
  /** Per-word timings / metadata (Deepgram-shaped). Optional on every adapter. */
3478
5260
  interface STTWord {
3479
5261
  readonly word?: string;
@@ -3618,7 +5400,7 @@ interface ElevenLabsTTSOptions$1 {
3618
5400
  * ElevenLabs to produce μ-law directly skips that step (saves
3619
5401
  * ~30–80 ms first-byte plus per-frame CPU and avoids any resampling
3620
5402
  * aliasing).
3621
- * - {@link ElevenLabsTTS.forTelnyx} emits `pcm_16000`. Telnyx negotiates
5403
+ * - {@link ElevenLabsTTS.forTelnyx} emits `ulaw_8000`. The SDK pins Telnyx to
3622
5404
  * PCMU (μ-law @ 8 kHz) on its bidirectional media WebSocket, so μ-law is
3623
5405
  * already the format used end-to-end and no transcoding happens.
3624
5406
  * ElevenLabs *also* supports `ulaw_8000` if your Telnyx profile is
@@ -3676,13 +5458,9 @@ declare class ElevenLabsTTS {
3676
5458
  /**
3677
5459
  * Construct an instance pre-configured for Telnyx bidirectional media.
3678
5460
  *
3679
- * Telnyx's default media-streaming codec is L16 PCM @ 16 kHz, which
3680
- * matches our default Telnyx handler. We pick `pcm_16000` so the audio
3681
- * flows end-to-end with zero resampling or transcoding.
3682
- *
3683
- * Trade-off: if your Telnyx profile is pinned to PCMU/8000 (μ-law),
3684
- * construct `ElevenLabsTTS` directly with `outputFormat: 'ulaw_8000'`
3685
- * — Telnyx supports that natively too.
5461
+ * The SDK's ``streaming_start`` pins the Telnyx wire to PCMU/μ-law @
5462
+ * 8 kHz (stream_bidirectional_codec=PCMU), so μ-law output flows
5463
+ * end-to-end with zero resampling or transcoding.
3686
5464
  */
3687
5465
  static forTelnyx(apiKey: string, options?: Omit<ElevenLabsTTSOptions$1, 'outputFormat'>): ElevenLabsTTS;
3688
5466
  /**
@@ -3841,7 +5619,7 @@ declare class ElevenLabsWebSocketTTS implements TTSAdapter {
3841
5619
  cancelActiveStream(): void;
3842
5620
  /** Pre-configured for Twilio Media Streams (`ulaw_8000`). */
3843
5621
  static forTwilio(opts: Omit<ElevenLabsWebSocketTTSOptions, 'outputFormat'>): ElevenLabsWebSocketTTS;
3844
- /** Pre-configured for Telnyx (`pcm_16000`). */
5622
+ /** Pre-configured for Telnyx (μ-law 8 kHz wire). */
3845
5623
  static forTelnyx(opts: Omit<ElevenLabsWebSocketTTSOptions, 'outputFormat'>): ElevenLabsWebSocketTTS;
3846
5624
  private buildUrl;
3847
5625
  /**
@@ -4148,6 +5926,14 @@ declare class Patter {
4148
5926
  * carrier ``start`` event instead of opening fresh ones — saving
4149
5927
  * ~150-900 ms of cold-start handshake on the first turn.
4150
5928
  */
5929
+ /**
5930
+ * Re-key prewarm caches from a dial-time id to the live carrier id.
5931
+ * Plivo issues ``request_uuid`` at dial time but the media stream and
5932
+ * webhooks carry ``CallUUID`` — without re-keying, prewarmed first-message
5933
+ * audio and parked provider sockets never matched and always TTL-evicted
5934
+ * as "wasted". Mirrors Python ``_alias_prewarm``.
5935
+ */
5936
+ aliasPrewarm: (oldId: string, newId: string) => void;
4151
5937
  popPrewarmedConnections: (callId: string) => ParkedProviderConnections | undefined;
4152
5938
  /**
4153
5939
  * Close any parked provider WebSockets for ``callId``. Wired into
@@ -5126,7 +6912,7 @@ declare class TestSession {
5126
6912
  agent: AgentOptions;
5127
6913
  openaiKey?: string;
5128
6914
  onMessage?: PipelineMessageHandler;
5129
- onCallStart?: (data: Record<string, unknown>) => Promise<void>;
6915
+ onCallStart?: (data: Record<string, unknown>) => Promise<void | Record<string, unknown> | undefined> | void | Record<string, unknown>;
5130
6916
  onCallEnd?: (data: Record<string, unknown>) => Promise<void>;
5131
6917
  }): Promise<void>;
5132
6918
  }
@@ -5151,7 +6937,7 @@ declare class TestSession {
5151
6937
  declare const GEMINI_DEFAULT_INPUT_SR = 16000;
5152
6938
  declare const GEMINI_DEFAULT_OUTPUT_SR = 24000;
5153
6939
  /** Callback signature for events emitted by {@link GeminiLiveAdapter}. */
5154
- type GeminiLiveEventHandler = (type: 'audio' | 'transcript_output' | 'function_call' | 'speech_started' | 'response_done' | 'error', data: unknown) => void | Promise<void>;
6940
+ type GeminiLiveEventHandler = (type: 'audio' | 'transcript_input' | 'transcript_output' | 'function_call' | 'speech_started' | 'response_done' | 'error', data: unknown) => void | Promise<void>;
5155
6941
  interface GeminiLiveOptions {
5156
6942
  model?: string;
5157
6943
  voice?: string;
@@ -5244,6 +7030,10 @@ declare class UltravoxRealtimeAdapter {
5244
7030
  private readonly sampleRate;
5245
7031
  private readonly firstMessage;
5246
7032
  private ws;
7033
+ /** Last Ultravox state string (turn-end transition detection). */
7034
+ private lastUltravoxState;
7035
+ /** Whether the current agent turn streamed delta frames (dedupe finals). */
7036
+ private agentStreamedDeltas;
5247
7037
  private handlers;
5248
7038
  /** Exposed for diagnostics — true while the underlying socket is open. */
5249
7039
  running: boolean;
@@ -5361,6 +7151,7 @@ declare class SonioxSTT {
5361
7151
  private ws;
5362
7152
  private readonly callbacks;
5363
7153
  private final;
7154
+ private lastInterimText;
5364
7155
  private keepaliveTimer;
5365
7156
  private readonly apiKey;
5366
7157
  private readonly model;
@@ -5373,17 +7164,31 @@ declare class SonioxSTT {
5373
7164
  private readonly maxEndpointDelayMs;
5374
7165
  private readonly clientReferenceId?;
5375
7166
  private readonly baseUrl;
7167
+ /** Construction args replayed by clone(). */
7168
+ private readonly patterCtorArgs;
5376
7169
  constructor(apiKey: string, options?: SonioxSTTOptions$1);
5377
7170
  /** Factory for Twilio-style 8 kHz linear PCM. */
5378
7171
  static forTwilio(apiKey: string, languageHints?: string[]): SonioxSTT;
5379
7172
  private buildConfig;
5380
7173
  /** Open the streaming WebSocket and send the initial config payload. */
7174
+ /**
7175
+ * Fresh adapter built with this instance's construction arguments —
7176
+ * called per call by the stream handler so concurrent calls never share
7177
+ * connection state (sockets/queues; cross-call transcript bleed).
7178
+ */
7179
+ clone(): this;
5381
7180
  connect(): Promise<void>;
5382
7181
  private clearKeepalive;
5383
7182
  private handleMessage;
5384
7183
  private emit;
5385
7184
  /** Send a binary PCM16-LE audio chunk to Soniox for transcription. */
5386
7185
  sendAudio(audio: Buffer): void;
7186
+ /**
7187
+ * Ask Soniox to finalize buffered audio immediately. The pipeline's VAD
7188
+ * ``speech_end`` fast-path duck-types ``stt.finalize`` — without this
7189
+ * every Soniox turn waited out the full endpointing delay. Mirrors Python.
7190
+ */
7191
+ finalize(): void;
5387
7192
  /** Register a transcript listener. */
5388
7193
  onTranscript(callback: TranscriptCallback$6): void;
5389
7194
  /** Unregister a previously registered transcript listener. */
@@ -5495,6 +7300,8 @@ declare class AssemblyAISTT {
5495
7300
  sessionId: string | null;
5496
7301
  /** Unix timestamp when the AssemblyAI session expires. */
5497
7302
  expiresAt: number | null;
7303
+ /** Construction args replayed by clone(). */
7304
+ private readonly patterCtorArgs;
5498
7305
  constructor(apiKey: string, options?: AssemblyAISTTOptions$1);
5499
7306
  /** Factory for Twilio calls — mulaw 8 kHz. */
5500
7307
  static forTwilio(apiKey: string, model?: AssemblyAIModel): AssemblyAISTT;
@@ -5517,6 +7324,12 @@ declare class AssemblyAISTT {
5517
7324
  */
5518
7325
  warmup(): Promise<void>;
5519
7326
  /** Open the streaming WebSocket and arm message handlers. */
7327
+ /**
7328
+ * Fresh adapter built with this instance's construction arguments —
7329
+ * called per call by the stream handler so concurrent calls never share
7330
+ * connection state (sockets/queues; cross-call transcript bleed).
7331
+ */
7332
+ clone(): this;
5520
7333
  connect(): Promise<void>;
5521
7334
  private awaitOpen;
5522
7335
  private attachHandlers;
@@ -5613,6 +7426,8 @@ declare class CartesiaSTT {
5613
7426
  * `null` until the first transcript event arrives (matches Python's `None`).
5614
7427
  */
5615
7428
  requestId: string | null;
7429
+ /** Construction args replayed by clone(). */
7430
+ private readonly patterCtorArgs;
5616
7431
  constructor(apiKey: string, options?: CartesiaSTTOptions$1);
5617
7432
  /**
5618
7433
  * Open a fresh WebSocket without arming any message / keepalive handlers
@@ -5641,6 +7456,12 @@ declare class CartesiaSTT {
5641
7456
  */
5642
7457
  warmup(): Promise<void>;
5643
7458
  /** Open the streaming WebSocket and arm message + keepalive handlers. */
7459
+ /**
7460
+ * Fresh adapter built with this instance's construction arguments —
7461
+ * called per call by the stream handler so concurrent calls never share
7462
+ * connection state (sockets/queues; cross-call transcript bleed).
7463
+ */
7464
+ clone(): this;
5644
7465
  connect(): Promise<void>;
5645
7466
  /**
5646
7467
  * Adopt a pre-opened, already-OPEN WebSocket produced by the prewarm
@@ -5869,6 +7690,8 @@ declare class DeepgramSTT {
5869
7690
  * ``(apiKey, language?, model?, encoding?, sampleRate?)`` for backward
5870
7691
  * compatibility with code that predated BUG #13.
5871
7692
  */
7693
+ /** Construction args replayed by clone(). */
7694
+ private readonly patterCtorArgs;
5872
7695
  constructor(apiKey: string, language?: string, model?: string, encoding?: string, sampleRate?: number, options?: DeepgramSTTOptions$1);
5873
7696
  constructor(apiKey: string, options: DeepgramSTTOptions$1 & {
5874
7697
  language?: string;
@@ -5893,6 +7716,12 @@ declare class DeepgramSTT {
5893
7716
  */
5894
7717
  warmup(): Promise<void>;
5895
7718
  /** Open the streaming WebSocket and arm message + keepalive handlers. */
7719
+ /**
7720
+ * Fresh adapter built with this instance's construction arguments —
7721
+ * called per call by the stream handler so concurrent calls never share
7722
+ * connection state (sockets/queues; cross-call transcript bleed).
7723
+ */
7724
+ clone(): this;
5896
7725
  connect(): Promise<void>;
5897
7726
  private openSocket;
5898
7727
  private clearKeepalive;
@@ -5946,13 +7775,11 @@ declare class DeepgramSTT {
5946
7775
  * 16 kHz pipelines. For real phone calls, use the carrier-specific
5947
7776
  * factories instead:
5948
7777
  *
5949
- * - {@link CartesiaTTS.forTwilio} requests `sampleRate=8000` natively from
5950
- * Cartesia. Twilio's media-stream WebSocket expects μ-law @ 8 kHz, so
5951
- * the SDK normally resamples 16 kHz → 8 kHz before doing the PCM →
5952
- * μ-law transcode in `TwilioAudioSender`. Asking Cartesia for 8 kHz
5953
- * PCM at the source skips the resample step (saves ~10–30 ms first-
5954
- * byte plus per-frame CPU and removes a potential aliasing source).
5955
- * The PCM → μ-law transcode still happens client-side.
7778
+ * - {@link CartesiaTTS.forTwilio} requests `sampleRate=16000` — the rate
7779
+ * the pipeline's carrier-side encoder expects. The previous 8 kHz
7780
+ * shortcut had no consuming hook: the audio sender unconditionally runs
7781
+ * its fixed 16 kHz → 8 kHz decimator, so 8 kHz input was decimated
7782
+ * AGAIN and played back at ~2x speed (chipmunk audio).
5956
7783
  * - {@link CartesiaTTS.forTelnyx} requests `sampleRate=16000`. Telnyx
5957
7784
  * negotiates L16/16000 on its bidirectional media WebSocket, so
5958
7785
  * 16 kHz PCM is already the format used end-to-end and no
@@ -6188,10 +8015,18 @@ declare class WhisperSTT {
6188
8015
  * ``(apiKey, model, language, bufferSize, responseFormat)`` — callers using
6189
8016
  * the old order will need to swap ``language`` and ``model``.
6190
8017
  */
8018
+ /** Construction args replayed by clone(). */
8019
+ private readonly patterCtorArgs;
6191
8020
  constructor(apiKey: string, language?: string, model?: string, bufferSize?: number, responseFormat?: WhisperResponseFormat);
6192
8021
  /** Factory for Twilio calls — mulaw 8 kHz is transcoded upstream, so we still receive PCM 16-bit. */
6193
8022
  static forTwilio(apiKey: string, language?: string, model?: string): WhisperSTT;
6194
8023
  /** Reset the audio buffer and arm the adapter for incoming chunks. */
8024
+ /**
8025
+ * Fresh adapter built with this instance's construction arguments —
8026
+ * called per call by the stream handler so concurrent calls never share
8027
+ * connection state (sockets/queues; cross-call transcript bleed).
8028
+ */
8029
+ clone(): this;
6195
8030
  connect(): Promise<void>;
6196
8031
  /** Buffer a PCM16 chunk; flushes to Whisper once `bufferSize` bytes are reached. */
6197
8032
  sendAudio(audio: Buffer): void;
@@ -6277,7 +8112,7 @@ interface OpenAITranscribeSTTOptions {
6277
8112
  model?: string;
6278
8113
  language?: string;
6279
8114
  bufferSize?: number;
6280
- /** ``"verbose_json"`` exposes segment-level confidence / timestamps. */
8115
+ /** ``"json"`` only — the gpt-4o transcribe models reject ``"verbose_json"`` (whisper-1/WhisperSTT supports it). */
6281
8116
  responseFormat?: WhisperResponseFormat;
6282
8117
  }
6283
8118
  /**
@@ -6537,10 +8372,18 @@ declare class SpeechmaticsSTT {
6537
8372
  private readonly operatingPoint;
6538
8373
  private readonly domain;
6539
8374
  private readonly outputLocale;
8375
+ /** Construction args replayed by clone(). */
8376
+ private readonly patterCtorArgs;
6540
8377
  constructor(apiKey: string, options?: SpeechmaticsSTTOptions$1);
6541
8378
  /** Build the JSON `StartRecognition` payload sent on connect. */
6542
8379
  private buildStartRecognition;
6543
8380
  /** Open the streaming WebSocket and send the `StartRecognition` frame. */
8381
+ /**
8382
+ * Fresh adapter built with this instance's construction arguments —
8383
+ * called per call by the stream handler so concurrent calls never share
8384
+ * connection state (sockets/queues; cross-call transcript bleed).
8385
+ */
8386
+ clone(): this;
6544
8387
  connect(): Promise<void>;
6545
8388
  /** Send a binary PCM16-LE audio chunk to Speechmatics for transcription. */
6546
8389
  sendAudio(audio: Buffer): void;
@@ -6811,15 +8654,15 @@ type CartesiaCarrierOptions = Omit<CartesiaTTSOptions, "sampleRate">;
6811
8654
  * const tts = new cartesia.TTS({ apiKey: "..." });
6812
8655
  * ```
6813
8656
  *
6814
- * **Telephony optimization** — use {@link TTS.forTwilio} (PCM @ 8 kHz,
6815
- * skipping the SDK-side 16 kHz → 8 kHz resample before μ-law transcoding)
8657
+ * **Telephony** — use {@link TTS.forTwilio} (PCM @ 16 kHz; the pipeline's
8658
+ * carrier-side encoder performs the 16 kHz → 8 kHz + μ-law step itself)
6816
8659
  * or {@link TTS.forTelnyx} (PCM @ 16 kHz, native Telnyx default) on
6817
8660
  * phone calls.
6818
8661
  */
6819
8662
  declare class TTS$3 extends CartesiaTTS {
6820
8663
  static readonly providerKey = "cartesia_tts";
6821
8664
  constructor(opts?: CartesiaTTSOptions);
6822
- /** Pipeline TTS pre-configured for Twilio Media Streams (PCM @ 8 kHz). */
8665
+ /** Pipeline TTS pre-configured for Twilio Media Streams (PCM @ 16 kHz — the pipeline decimates to the 8 kHz wire itself; 8 kHz here was decimated AGAIN → chipmunk audio). */
6823
8666
  static forTwilio(opts?: CartesiaCarrierOptions): TTS$3;
6824
8667
  static forTwilio(apiKey: string, options?: Omit<CartesiaTTSOptions, "sampleRate">): TTS$3;
6825
8668
  /** Pipeline TTS pre-configured for Telnyx (PCM @ 16 kHz). */
@@ -7794,8 +9637,8 @@ declare class LLM$3 extends OpenAICompatibleLLMProvider {
7794
9637
  /**
7795
9638
  * Custom LLM — point Patter's pipeline at ANY OpenAI-compatible endpoint.
7796
9639
  *
7797
- * The industry-standard "Custom LLM" pattern (the same concept ElevenLabs
7798
- * Agents, Retell, and Vapi expose under that name): Patter owns the phone leg
9640
+ * The industry-standard "Custom LLM" pattern (the name the voice-AI
9641
+ * ecosystem uses for this concept): Patter owns the phone leg
7799
9642
  * — carrier, STT, turn-taking, barge-in, TTS — and POSTs each conversation
7800
9643
  * turn to YOUR ``/chat/completions`` endpoint. That endpoint can be:
7801
9644
  *
@@ -8160,6 +10003,166 @@ declare class SileroVAD implements VADProvider {
8160
10003
  reset(): void;
8161
10004
  }
8162
10005
 
10006
+ /**
10007
+ * Smart-turn v3 semantic turn detector (ONNX).
10008
+ *
10009
+ * Audio-native end-of-utterance model from the pipecat-ai project
10010
+ * (https://github.com/pipecat-ai/smart-turn, Apache-2.0 — ~8 M params,
10011
+ * <100 ms CPU inference). Unlike a VAD — which only knows *whether* the
10012
+ * caller is producing sound — smart-turn looks at the prosody of the last
10013
+ * few seconds of speech and predicts whether the caller has *finished
10014
+ * their turn* or is merely pausing mid-sentence ("My phone number is…").
10015
+ *
10016
+ * Wiring: pass an instance as `agent.turnDetector`. The pipeline stream
10017
+ * handler then defers the STT finalize that normally fires on a VAD
10018
+ * `speech_end` until the model agrees the turn is complete (probability
10019
+ * ≥ {@link SmartTurnDetector.threshold}), holding for at most
10020
+ * `agent.maxSemanticHoldMs` (default 1200 ms) so a turn can never hang.
10021
+ *
10022
+ * Model file: the ONNX weights are NOT bundled with the SDK (~30 MB).
10023
+ * Download a `smart-turn-v3*.onnx` file from
10024
+ * https://huggingface.co/pipecat-ai/smart-turn-v3 and point the SDK at it
10025
+ * via the `PATTER_SMART_TURN_MODEL` environment variable or the
10026
+ * `modelPath` option of {@link SmartTurnDetector.load}.
10027
+ *
10028
+ * Preprocessing (matches `pipecat-ai/smart-turn` `inference.py` exactly —
10029
+ * the v3 ONNX graph takes Whisper log-mel features, not a raw waveform):
10030
+ *
10031
+ * 1. int16 LE PCM → float in [-1, 1] (÷ 32768).
10032
+ * 2. Keep the LAST 8 s of 16 kHz audio; left-pad with zeros to exactly
10033
+ * 128 000 samples so the speech sits at the END of the window
10034
+ * (`truncate_audio_to_last_n_seconds`).
10035
+ * 3. Zero-mean / unit-variance normalize the full padded window
10036
+ * (`WhisperFeatureExtractor(..., do_normalize=True)`).
10037
+ * 4. Whisper log-mel: 400-point Hann STFT (hop 160, reflect-padded,
10038
+ * centered), 80 Slaney-scale mel filters, `log10` with 1e-10 floor,
10039
+ * clamp to `max - 8`, scale `(x + 4) / 4`, drop the trailing frame
10040
+ * → `(80, 800)` float32.
10041
+ * 5. `session.run({ input_features })` — the graph applies the sigmoid
10042
+ * internally and returns the end-of-turn probability directly.
10043
+ * `probability ≥ threshold` (default 0.5) ⇒ turn complete.
10044
+ *
10045
+ * `onnxruntime-node` is loaded lazily as an optional dependency (same
10046
+ * pattern as `providers/silero-vad.ts`). The feature extraction is pure
10047
+ * TypeScript (mixed-radix FFT) and yields to the event loop periodically
10048
+ * so a per-turn prediction never blocks inbound audio handling.
10049
+ */
10050
+
10051
+ /** Env var consulted by {@link SmartTurnDetector.load} when no `modelPath` is given. */
10052
+ declare const SMART_TURN_MODEL_ENV_VAR = "PATTER_SMART_TURN_MODEL";
10053
+ /** Options accepted by {@link SmartTurnDetector.load}. */
10054
+ interface SmartTurnDetectorOptions {
10055
+ /**
10056
+ * End-of-turn probability at/above which the turn is considered
10057
+ * complete. Default 0.5 per the smart-turn v3 reference.
10058
+ */
10059
+ readonly threshold?: number;
10060
+ /**
10061
+ * Path to the `smart-turn-v3*.onnx` file. Falls back to the
10062
+ * `PATTER_SMART_TURN_MODEL` environment variable when omitted.
10063
+ */
10064
+ readonly modelPath?: string;
10065
+ /** Restrict ONNX Runtime to the CPU execution provider (default true). */
10066
+ readonly forceCpu?: boolean;
10067
+ }
10068
+ /**
10069
+ * Semantic end-of-utterance detector backed by smart-turn v3 (ONNX).
10070
+ *
10071
+ * Load via {@link SmartTurnDetector.load} (throws with actionable
10072
+ * instructions when the optional deps or the model file are missing) or
10073
+ * {@link SmartTurnDetector.maybeLoad} (warns once and resolves `undefined`
10074
+ * instead, so the agent degrades to plain VAD-silence endpointing rather
10075
+ * than crashing):
10076
+ *
10077
+ * ```ts
10078
+ * const detector = await SmartTurnDetector.load(); // PATTER_SMART_TURN_MODEL
10079
+ * // or: await SmartTurnDetector.load({ modelPath: '…/smart-turn-v3.0.onnx' });
10080
+ * // or: await SmartTurnDetector.maybeLoad(); // undefined when unprovisioned
10081
+ *
10082
+ * const agent = phone.agent({ ..., turnDetector: detector });
10083
+ * ```
10084
+ *
10085
+ * {@link predict} takes the most recent window of mono int16 LE PCM at
10086
+ * 16 kHz (up to 8 s — longer windows are truncated to the last 8 s,
10087
+ * shorter ones are left-padded) and returns the probability in `[0, 1]`
10088
+ * that the caller has finished their turn. The pipeline handler compares
10089
+ * it against {@link threshold} (default 0.5).
10090
+ *
10091
+ * The model is stateless (no streaming RNN state), so a single instance
10092
+ * may be shared across concurrent calls.
10093
+ */
10094
+ declare class SmartTurnDetector implements TurnDetectorProvider {
10095
+ private readonly runtime;
10096
+ private session;
10097
+ private readonly thresholdValue;
10098
+ private closed;
10099
+ private constructor();
10100
+ /**
10101
+ * Load the smart-turn v3 ONNX model and return a ready detector.
10102
+ * Throws with download instructions when no model file is configured
10103
+ * (see {@link SMART_TURN_MODEL_ENV_VAR}), and with install instructions
10104
+ * when `onnxruntime-node` is missing.
10105
+ */
10106
+ static load(options?: SmartTurnDetectorOptions): Promise<SmartTurnDetector>;
10107
+ /**
10108
+ * Like {@link load}, but degrade instead of throw.
10109
+ *
10110
+ * Resolves to `undefined` — after a single clear warning — when semantic
10111
+ * turn detection is not provisioned: the optional `onnxruntime-node`
10112
+ * dependency is missing, no model file is configured, or the configured
10113
+ * file cannot be loaded. Intended for deployments where the detector is
10114
+ * a soft upgrade:
10115
+ *
10116
+ * ```ts
10117
+ * const agent = phone.agent({
10118
+ * ...,
10119
+ * turnDetector: await SmartTurnDetector.maybeLoad(),
10120
+ * });
10121
+ * ```
10122
+ *
10123
+ * `turnDetector: undefined` keeps the plain VAD-silence endpointing, so
10124
+ * the agent starts (and the call behaves) exactly as if the feature were
10125
+ * never enabled — it never crashes the app.
10126
+ *
10127
+ * An out-of-range `threshold` still throws: that is a configuration bug,
10128
+ * not a provisioning gap. Mirror of the Python
10129
+ * `SmartTurnDetector.maybe_load`.
10130
+ */
10131
+ static maybeLoad(options?: SmartTurnDetectorOptions): Promise<SmartTurnDetector | undefined>;
10132
+ /**
10133
+ * Internal factory used by tests — bypasses onnxruntime-node loading.
10134
+ * @internal
10135
+ */
10136
+ static fromOnnxSession(runtime: OnnxRuntime, session: OnnxInferenceSession, options?: {
10137
+ threshold?: number;
10138
+ }): SmartTurnDetector;
10139
+ /** Identifier of the underlying model (`smart-turn-v3`). */
10140
+ get model(): string;
10141
+ /** Identifier of the runtime backend (`ONNX`). */
10142
+ get provider(): string;
10143
+ /** Input sample rate the model expects (16 000 Hz). */
10144
+ get sampleRate(): number;
10145
+ /** Maximum audio context the model consumes per prediction (8 s). */
10146
+ get maxWindowSeconds(): number;
10147
+ /** End-of-turn probability at/above which the turn is complete. */
10148
+ get threshold(): number;
10149
+ /**
10150
+ * End-of-turn probability for the given recent-audio window.
10151
+ *
10152
+ * @param pcm16Window Mono int16 little-endian PCM at 16 kHz — ideally
10153
+ * the full audio of the caller's current turn, up to 8 s (the
10154
+ * handler keeps a rolling 8 s buffer). Longer input is truncated to
10155
+ * the most recent 8 s; shorter input is left-padded with silence,
10156
+ * matching the reference preprocessing exactly.
10157
+ * @returns Probability in `[0, 1]` that the turn is COMPLETE (the
10158
+ * graph applies the sigmoid internally). Returns 0 for an empty
10159
+ * window.
10160
+ */
10161
+ predict(pcm16Window: Buffer): Promise<number>;
10162
+ /** Release the ONNX session. Idempotent. */
10163
+ close(): Promise<void>;
10164
+ }
10165
+
8163
10166
  /** Options accepted by {@link DeepFilterNetFilter}. */
8164
10167
  interface DeepFilterNetOptions {
8165
10168
  /** Absolute path to a DeepFilterNet ONNX model. If omitted, the filter
@@ -8341,6 +10344,21 @@ declare class OpenAIRealtime2Adapter extends OpenAIRealtimeAdapter {
8341
10344
  private inbound8kCarry;
8342
10345
  /** GA-shape `session.update` payload. See module-level docstring. */
8343
10346
  private buildGASessionConfig;
10347
+ /**
10348
+ * GA-shape partial `session.update` body for a mid-session swap.
10349
+ *
10350
+ * Identical to the base v1 patch plus the mandatory `"type": "realtime"`
10351
+ * discriminator — the GA endpoint rejects a `session.update` without it.
10352
+ * Used by {@link OpenAIRealtimeAdapter.updateSession} (inherited unchanged)
10353
+ * for the multi-agent `handoff_to` flow. Mirrors the Python
10354
+ * `OpenAIRealtime2Adapter._build_session_update_patch`.
10355
+ */
10356
+ protected buildSessionUpdatePatch(instructions: string | undefined, tools: Array<{
10357
+ name: string;
10358
+ description: string;
10359
+ parameters: Record<string, unknown>;
10360
+ strict?: boolean;
10361
+ }> | undefined): Record<string, unknown>;
8344
10362
  /**
8345
10363
  * Open the Realtime WebSocket against the GA endpoint and apply the GA
8346
10364
  * session configuration. Header `OpenAI-Beta: realtime=v1` is OMITTED
@@ -8567,7 +10585,15 @@ declare class StatefulResampler {
8567
10585
  private readonly dstRate;
8568
10586
  private firHistory;
8569
10587
  private firHistoryValid;
8570
- private firPendingSample;
10588
+ /**
10589
+ * Samples after the last emitted FIR center, carried to the next chunk
10590
+ * (1–2 samples: the next center and/or its missing lookahead). Replaces
10591
+ * the old single ``firPendingSample`` — that design wrote the pending
10592
+ * sample into history too (processed twice, true s-2 lost) and edge-
10593
+ * replicated the +2 tap at every chunk end, producing audible crackle at
10594
+ * chunk boundaries.
10595
+ */
10596
+ private firCarry;
8571
10597
  private upsampleLast;
8572
10598
  private upsampleHasHistory;
8573
10599
  private resample24Last;
@@ -9129,6 +11155,31 @@ declare class TwilioAdapter {
9129
11155
  * Mirrors the Python adapter's ``generate_stream_twiml``.
9130
11156
  */
9131
11157
  static generateStreamTwiml(streamUrl: string, parameters?: Record<string, string>): string;
11158
+ /**
11159
+ * TwiML that parks the CALLER leg in the warm-transfer conference.
11160
+ *
11161
+ * `startConferenceOnEnter="false"` keeps the caller on Twilio's default
11162
+ * hold music until a participant with `startConferenceOnEnter="true"` (the
11163
+ * human agent) joins; `endConferenceOnExit="true"` tears the conference
11164
+ * down if the caller hangs up while waiting. When `statusCallbackUrl` is
11165
+ * provided, conference lifecycle events (start / end / join / leave) are
11166
+ * posted there for observability.
11167
+ *
11168
+ * Mirrors the Python adapter's `generate_warm_transfer_caller_twiml`.
11169
+ */
11170
+ static generateWarmTransferCallerTwiml(conferenceName: string, statusCallbackUrl?: string): string;
11171
+ /**
11172
+ * TwiML executed on the TARGET (human agent) leg of a warm transfer.
11173
+ *
11174
+ * Speaks the agent-provided handoff `summary` first (skipped when empty),
11175
+ * then joins the conference with `startConferenceOnEnter="true"` — which
11176
+ * starts the conference, stops the caller's hold music, and bridges the
11177
+ * two. `endConferenceOnExit="true"` ends the conference (and therefore the
11178
+ * caller leg) when the human agent hangs up.
11179
+ *
11180
+ * Mirrors the Python adapter's `generate_warm_transfer_target_twiml`.
11181
+ */
11182
+ static generateWarmTransferTargetTwiml(conferenceName: string, summary?: string): string;
9132
11183
  /** Force-complete an in-progress call. */
9133
11184
  endCall(callSid: string): Promise<void>;
9134
11185
  }
@@ -9291,7 +11342,15 @@ declare class TelnyxSTT {
9291
11342
  private ws;
9292
11343
  private callbacks;
9293
11344
  private headerSent;
11345
+ /** Construction args replayed by clone(). */
11346
+ private readonly patterCtorArgs;
9294
11347
  constructor(apiKey: string, language?: string, transcriptionEngine?: TelnyxTranscriptionEngine, sampleRate?: number, baseUrl?: string);
11348
+ /**
11349
+ * Fresh adapter built with this instance's construction arguments —
11350
+ * called per call by the stream handler so concurrent calls never share
11351
+ * connection state (sockets/queues) and transcripts never bleed across calls.
11352
+ */
11353
+ clone(): this;
9295
11354
  /** Open the streaming WebSocket and arm message handlers. */
9296
11355
  connect(): Promise<void>;
9297
11356
  /** Send a binary PCM16 audio chunk; emits the WAV header on the first call. */
@@ -9350,6 +11409,655 @@ declare class TelnyxTTS {
9350
11409
  synthesizeStream(text: string): AsyncGenerator<Buffer>;
9351
11410
  }
9352
11411
 
11412
+ /**
11413
+ * Eval case data model.
11414
+ *
11415
+ * An {@link EvalCase} is a scripted scenario: a sequence of user turns, an
11416
+ * expected-behavior description, and a rubric used by the judge LLM.
11417
+ *
11418
+ * Designed to be loaded from a YAML/JSON suite file — see
11419
+ * {@link loadSuite} in `runner.ts`. Mirrors the Python
11420
+ * `getpatter.evals.case` module (frozen dataclasses → readonly interfaces).
11421
+ */
11422
+
11423
+ /** A single user utterance in a scripted conversation. */
11424
+ interface EvalTurn {
11425
+ readonly user: string;
11426
+ /**
11427
+ * Optional substrings the agent's reply should contain — used as a cheap
11428
+ * pre-filter (logged, never fatal) before invoking the LLM judge.
11429
+ */
11430
+ readonly expectedContains?: ReadonlyArray<string>;
11431
+ }
11432
+ /** A complete evaluation scenario. */
11433
+ interface EvalCase {
11434
+ readonly name: string;
11435
+ readonly turns: ReadonlyArray<EvalTurn>;
11436
+ readonly expectedBehavior: string;
11437
+ readonly rubric: string;
11438
+ /** Optional metadata for reporting/filtering. */
11439
+ readonly tags?: ReadonlyArray<string>;
11440
+ /**
11441
+ * Optional first-message the agent should emit before any user turn.
11442
+ * On the real-pipeline path (``agent`` set) it overrides the agent's own
11443
+ * ``firstMessage`` so the REAL handler speaks it; on the legacy
11444
+ * ``reply()`` path it is prepended to the transcript for display only.
11445
+ */
11446
+ readonly firstMessage?: string;
11447
+ /**
11448
+ * Optional REAL-pipeline target. When ``agent`` is set, the runner drives
11449
+ * the case through {@link EvalSession} — the real `StreamHandler`
11450
+ * pipeline call loop (tools, hooks, guardrails, history) — instead of the
11451
+ * legacy ``reply()``-callable factory. ``llmProvider`` optionally
11452
+ * overrides ``agent.llm`` (e.g. a {@link ScriptedLLMProvider} for CI).
11453
+ * Both default to ``undefined`` so existing suites are unaffected.
11454
+ */
11455
+ readonly agent?: AgentOptions;
11456
+ readonly llmProvider?: LLMProvider;
11457
+ }
11458
+ /** The judge's verdict on one case. */
11459
+ interface JudgeResult {
11460
+ /** Score in [0.0, 1.0]. */
11461
+ readonly score: number;
11462
+ readonly passed: boolean;
11463
+ readonly reasoning: string;
11464
+ }
11465
+ /** One line of a judge-facing transcript. */
11466
+ interface TranscriptEntry {
11467
+ readonly role: string;
11468
+ readonly text: string;
11469
+ }
11470
+ /** The result of running a single {@link EvalCase}. */
11471
+ interface EvalResult {
11472
+ readonly caseName: string;
11473
+ readonly transcript: ReadonlyArray<TranscriptEntry>;
11474
+ readonly judge: JudgeResult;
11475
+ readonly durationS: number;
11476
+ readonly error: string | null;
11477
+ }
11478
+ /**
11479
+ * Render an {@link EvalResult} as the JSON-report row shape — mirrors the
11480
+ * Python `EvalResult.to_dict()` (snake_case keys, stable across SDKs so CI
11481
+ * artefacts are interchangeable).
11482
+ */
11483
+ declare function evalResultToDict(result: EvalResult): Record<string, unknown>;
11484
+
11485
+ /**
11486
+ * LLM-as-judge scoring for eval cases.
11487
+ *
11488
+ * Mirrors the Python `getpatter.evals.llm_judge` module. The judge is
11489
+ * intentionally provider-specific (OpenAI chat completions) because
11490
+ * reliability of structured JSON output matters more than provider
11491
+ * flexibility for evals. Callers who need a different backend can implement
11492
+ * a {@link JudgeBackend} and inject it via the ``backend`` option.
11493
+ */
11494
+
11495
+ /** Pluggable judge backend — anything exposing ``judge(prompt)``. */
11496
+ interface JudgeBackend {
11497
+ judge(prompt: string): Promise<string>;
11498
+ }
11499
+ /** Options for {@link LLMJudge}. Defaults match the Python SDK byte-for-byte. */
11500
+ interface LLMJudgeOptions {
11501
+ /** Model the judge should use. Default: ``gpt-4o-mini``. */
11502
+ readonly model?: string;
11503
+ /** OpenAI API key. Falls back to ``OPENAI_API_KEY`` when unset. */
11504
+ readonly apiKey?: string;
11505
+ /** Score threshold for a pass. Default: ``0.7``. */
11506
+ readonly passThreshold?: number;
11507
+ /** Test/alternative backend — any object exposing ``judge(prompt)``. */
11508
+ readonly backend?: JudgeBackend;
11509
+ }
11510
+ /** Scores conversation transcripts against a rubric via an OpenAI model. */
11511
+ declare class LLMJudge {
11512
+ readonly model: string;
11513
+ readonly passThreshold: number;
11514
+ private readonly apiKey?;
11515
+ private readonly backend?;
11516
+ constructor(options?: LLMJudgeOptions);
11517
+ /** Return a {@link JudgeResult} for the given transcript. */
11518
+ judgeCase(evalCase: EvalCase, transcript: ReadonlyArray<TranscriptEntry>): Promise<JudgeResult>;
11519
+ private buildPrompt;
11520
+ /** Call OpenAI chat completions directly over fetch (no SDK dependency). */
11521
+ private callOpenAI;
11522
+ /** Parse the judge's JSON — tolerant of extra whitespace / code fences. */
11523
+ private parse;
11524
+ }
11525
+
11526
+ /**
11527
+ * Eval runner — executes an {@link EvalSuite} against a scripted agent.
11528
+ *
11529
+ * Two execution paths per case (mirrors the Python `getpatter.evals.runner`):
11530
+ *
11531
+ * - **Legacy** (default): the caller supplies an ``agentFactory`` returning
11532
+ * an async ``reply(text) => string`` callable — no SDK machinery involved.
11533
+ * - **Real pipeline**: when an {@link EvalCase} carries ``agent`` (an
11534
+ * {@link AgentOptions}, optionally with ``llmProvider``), the runner
11535
+ * drives the case through {@link EvalSession} — the real `StreamHandler`
11536
+ * pipeline call loop with tools, hooks, guardrails, and history handling.
11537
+ *
11538
+ * ```ts
11539
+ * const evalCase: EvalCase = {
11540
+ * name: 'books a table',
11541
+ * turns: [{ user: 'table for two at eight' }],
11542
+ * expectedBehavior: 'Agent books and confirms the table.',
11543
+ * rubric: 'Pass if a booking is confirmed.',
11544
+ * agent: myAgent, // real agent under test
11545
+ * llmProvider: new ScriptedLLMProvider([...]), // or a real provider
11546
+ * };
11547
+ * const results = await new EvalRunner({ judge }).run({ name: 's', cases: [evalCase] });
11548
+ * ```
11549
+ */
11550
+
11551
+ /**
11552
+ * An agent callable receives one user turn and returns the agent's final
11553
+ * text response. This decouples the runner from the real Patter Agent
11554
+ * wiring and lets callers plug in any chat-completions client or mock.
11555
+ */
11556
+ type AgentCallable = (text: string) => Promise<string>;
11557
+ /** A factory takes no arguments and returns an {@link AgentCallable}. */
11558
+ type AgentFactory = () => AgentCallable | Promise<AgentCallable>;
11559
+ /** A named collection of {@link EvalCase} to run together. */
11560
+ interface EvalSuite {
11561
+ readonly name: string;
11562
+ readonly cases: ReadonlyArray<EvalCase>;
11563
+ readonly metadata?: Readonly<Record<string, unknown>>;
11564
+ }
11565
+ /** Options for {@link EvalRunner}. */
11566
+ interface EvalRunnerOptions {
11567
+ /** Judge used to score each case. Default: ``new LLMJudge()``. */
11568
+ readonly judge?: LLMJudge;
11569
+ }
11570
+ /** Drives one or more cases against an agent and produces a JSON report. */
11571
+ declare class EvalRunner {
11572
+ readonly judge: LLMJudge;
11573
+ constructor(options?: EvalRunnerOptions);
11574
+ /**
11575
+ * Run every case in ``suite`` sequentially.
11576
+ *
11577
+ * ``agentFactory`` is required only for cases that do NOT carry their own
11578
+ * ``agent`` (the legacy ``reply()`` path).
11579
+ */
11580
+ run(suite: EvalSuite, agentFactory?: AgentFactory): Promise<EvalResult[]>;
11581
+ /**
11582
+ * Run a single case and return its {@link EvalResult}.
11583
+ *
11584
+ * Routes through the real-pipeline {@link EvalSession} when
11585
+ * ``evalCase.agent`` is set; otherwise uses the legacy ``reply()``-callable
11586
+ * ``agentFactory`` (unchanged behaviour).
11587
+ */
11588
+ runCase(evalCase: EvalCase, agentFactory?: AgentFactory): Promise<EvalResult>;
11589
+ /**
11590
+ * Legacy path — drives the case against a ``reply()`` callable.
11591
+ *
11592
+ * Appends into ``transcript`` in place so a mid-case exception still
11593
+ * leaves the partial transcript for the judge (existing semantics).
11594
+ */
11595
+ private runTurnsWithReply;
11596
+ /**
11597
+ * Real-pipeline path — drives the case through {@link EvalSession}.
11598
+ *
11599
+ * The agent's REAL handler emits its own ``firstMessage`` (a
11600
+ * ``evalCase.firstMessage`` overrides the agent's), tools/hooks/guardrails
11601
+ * run for real, and the transcript mirrors what the pipeline actually
11602
+ * said. Appends into ``transcript`` in place (partial-on-error, same as
11603
+ * the legacy path).
11604
+ */
11605
+ private runTurnsWithSession;
11606
+ /** Render a JSON report suitable for CI artefacts. */
11607
+ report(suite: EvalSuite, results: ReadonlyArray<EvalResult>): string;
11608
+ }
11609
+ /**
11610
+ * Load a suite from YAML or JSON.
11611
+ *
11612
+ * Schema (YAML):
11613
+ *
11614
+ * ```yaml
11615
+ * name: "customer support v1"
11616
+ * cases:
11617
+ * - name: "greeting is warm"
11618
+ * expected_behavior: "Agent greets the caller warmly and asks how it can help."
11619
+ * rubric: "Pass if reply contains a greeting and an open-ended question."
11620
+ * turns:
11621
+ * - user: "hi"
11622
+ * ```
11623
+ *
11624
+ * Suite files use snake_case keys (shared byte-for-byte with the Python
11625
+ * SDK so one suite file drives both); camelCase aliases are also accepted.
11626
+ */
11627
+ declare function loadSuite(path: string): Promise<EvalSuite>;
11628
+
11629
+ /**
11630
+ * Eval-harness fakes — the ONLY mocked boundary of {@link EvalSession}.
11631
+ *
11632
+ * Everything inward of these (transcript commit filters, barge-in state
11633
+ * machine, LLM loop, tool executor, hooks, guardrails, sentence chunking,
11634
+ * metrics) is the REAL pipeline code. The fakes stand in for the
11635
+ * paid/external surfaces a live call touches: the telephony carrier
11636
+ * (audio sender), the STT socket, and the TTS socket.
11637
+ */
11638
+
11639
+ /**
11640
+ * Records every carrier-bound audio operation; auto-acks marks.
11641
+ *
11642
+ * The TS analogue of the carrier boundary is the {@link TelephonyBridge}, so
11643
+ * this fake implements it directly. ``sentAudio`` collects the decoded wire
11644
+ * chunks handed to the carrier, ``clears`` counts ``sendClear`` calls
11645
+ * (barge-in flushes), ``marks`` records mark names, ``endedCalls`` /
11646
+ * ``transfers`` record call-control invocations. When attached to a handler,
11647
+ * each ``sendMark`` is echoed back through ``handler.onMark`` so mark-gated
11648
+ * pacing never deadlocks.
11649
+ */
11650
+ declare class FakeAudioSender implements TelephonyBridge {
11651
+ readonly label = "Eval";
11652
+ readonly telephonyProvider: CarrierKind;
11653
+ readonly inputWireFormat: "pcm_16000";
11654
+ readonly sentAudio: Buffer[];
11655
+ readonly marks: string[];
11656
+ readonly endedCalls: string[];
11657
+ readonly transfers: Array<{
11658
+ readonly callId: string;
11659
+ readonly toNumber: string;
11660
+ }>;
11661
+ clears: number;
11662
+ private handler;
11663
+ private sttFactory;
11664
+ /** Wire the auto-ack loopback for ``sendMark`` → ``onMark``. */
11665
+ attachHandler(handler: {
11666
+ onMark(markName: string): Promise<void>;
11667
+ }): void;
11668
+ /** Wire the STT instance ``createStt`` should hand to the handler. */
11669
+ attachSttFactory(factory: () => STTAdapter | null): void;
11670
+ sendAudio(_ws: WebSocket$1, audioBase64: string, _streamSid: string): void;
11671
+ sendMark(_ws: WebSocket$1, markName: string, _streamSid: string): void;
11672
+ sendClear(_ws: WebSocket$1, _streamSid: string): void;
11673
+ transferCall(callId: string, toNumber: string, _options?: TransferCallOptions): Promise<TransferCallResult | void>;
11674
+ endCall(callId: string, _ws: WebSocket$1): Promise<void>;
11675
+ createStt(_agent: AgentOptions): Promise<STTAdapter | null>;
11676
+ queryTelephonyCost(): Promise<void>;
11677
+ }
11678
+ /**
11679
+ * Callback-backed STT double feeding the handler's REAL transcript path.
11680
+ *
11681
+ * The harness pushes finalised transcripts via {@link FakeSTT.pushFinal};
11682
+ * the handler consumes them through the same ``onTranscript`` callback a
11683
+ * live Deepgram adapter drives, so ``handleBargeIn`` → ``commitTranscript``
11684
+ * → ``dispatchTurn`` run exactly as on a real call. ``pushFinal`` resolves
11685
+ * once the handler's transcript drain loop has fully processed the
11686
+ * transcript (i.e. the dispatch task for it — if any — has been created),
11687
+ * which is what {@link EvalSession.userSays} awaits before grabbing the
11688
+ * dispatch task.
11689
+ */
11690
+ declare class FakeSTT implements STTAdapter {
11691
+ readonly sentAudio: Buffer[];
11692
+ connected: boolean;
11693
+ closed: boolean;
11694
+ private callback;
11695
+ connect(): Promise<void>;
11696
+ sendAudio(pcm: Buffer): void;
11697
+ onTranscript(cb: STTTranscriptCallback): void;
11698
+ /** Inject one final transcript through the handler's real receive path. */
11699
+ pushFinal(text: string): Promise<void>;
11700
+ close(): Promise<void>;
11701
+ }
11702
+ /**
11703
+ * TTS double: records the text it is asked to speak, yields silence.
11704
+ *
11705
+ * The default chunk is 320 bytes of PCM16 @ 16 kHz (10 ms of silence) so the
11706
+ * handler's playback-backlog accounting stays in the sub-frame range and a
11707
+ * following turn is never misclassified as a barge-in against a phantom
11708
+ * carrier backlog. ``closed`` flips when the handler's teardown invokes
11709
+ * ``cancelActiveStream`` (the TS TTS adapters are per-request — teardown
11710
+ * cancels the active stream rather than closing a connection).
11711
+ */
11712
+ declare class FakeTTS implements TTSAdapter {
11713
+ readonly spoken: string[];
11714
+ closed: boolean;
11715
+ private readonly chunk;
11716
+ constructor(chunk?: Buffer);
11717
+ synthesizeStream(text: string): AsyncIterable<Buffer>;
11718
+ /** Invoked by the handler's real teardown (``handleStop``). */
11719
+ cancelActiveStream(): void;
11720
+ }
11721
+
11722
+ /**
11723
+ * Deterministic scripted LLM provider for CI evals.
11724
+ *
11725
+ * The fourth fake-able boundary of {@link EvalSession}: instead of a paid
11726
+ * model API, a {@link ScriptedLLMProvider} replays pre-scripted streaming
11727
+ * chunks through the REAL {@link LLMLoop} — tool dispatch, history
11728
+ * threading, usage accounting, and abort handling all run for real.
11729
+ */
11730
+
11731
+ /** Build one scripted assistant turn that streams ``text`` then usage. */
11732
+ declare function textTurn(text: string, options?: {
11733
+ readonly inputTokens?: number;
11734
+ readonly outputTokens?: number;
11735
+ }): LLMChunk[];
11736
+ /**
11737
+ * Build one scripted turn that emits a single complete tool call.
11738
+ *
11739
+ * The {@link LLMLoop} executes the tool via the real ``DefaultToolExecutor``
11740
+ * and re-submits — so a tool scenario needs a follow-up scripted turn
11741
+ * (usually {@link textTurn}) for the post-tool-result response.
11742
+ */
11743
+ declare function toolCallTurn(name: string, args?: Record<string, unknown>, options?: {
11744
+ readonly callId?: string;
11745
+ }): LLMChunk[];
11746
+ /** One recorded {@link ScriptedLLMProvider.stream} request. */
11747
+ interface ScriptedLLMCall {
11748
+ readonly messages: Array<Record<string, unknown>>;
11749
+ readonly tools: Array<Record<string, unknown>> | null;
11750
+ readonly callId: string | null;
11751
+ }
11752
+ /**
11753
+ * Deterministic {@link LLMProvider}.
11754
+ *
11755
+ * Pops one scripted chunk-list per ``stream()`` call (i.e. per LLM-loop
11756
+ * iteration — a tool-call turn consumes one script for the call and one for
11757
+ * the post-result response). Records every request's ``messages`` and
11758
+ * ``tools`` in {@link ScriptedLLMProvider.calls} so tests can assert exactly
11759
+ * what the real pipeline sent to the model. Honours the per-turn abort
11760
+ * signal between chunks like a well-behaved provider.
11761
+ */
11762
+ declare class ScriptedLLMProvider implements LLMProvider {
11763
+ /** Stable pricing/dashboard key (no real pricing entry — cost is 0). */
11764
+ static readonly providerKey = "scripted";
11765
+ readonly calls: ScriptedLLMCall[];
11766
+ private readonly scripts;
11767
+ constructor(turns?: ReadonlyArray<ReadonlyArray<LLMChunk>>);
11768
+ /** Append another scripted turn (chunk list) to the script queue. */
11769
+ addTurn(chunks: ReadonlyArray<LLMChunk>): void;
11770
+ stream(messages: Array<Record<string, unknown>>, tools?: Array<Record<string, unknown>> | null, opts?: LLMStreamOptions): AsyncGenerator<LLMChunk, void, unknown>;
11771
+ }
11772
+
11773
+ /**
11774
+ * Stream-handler utilities — capped conversation history and SSRF-validated
11775
+ * tool-webhook execution shared across the various per-call handlers.
11776
+ */
11777
+ /** A single entry in the per-call conversation history. */
11778
+ interface HistoryEntry {
11779
+ readonly role: string;
11780
+ readonly text: string;
11781
+ readonly timestamp: number;
11782
+ }
11783
+
11784
+ /**
11785
+ * Eval session — drives the REAL pipeline call loop, no telephony required.
11786
+ *
11787
+ * Unlike the legacy ``reply()``-callable path in `runner.ts` (which never
11788
+ * touches the SDK), {@link EvalSession} constructs a real
11789
+ * {@link StreamHandler} in pipeline mode and injects user turns through the
11790
+ * exact same path a live phone call uses:
11791
+ *
11792
+ * ``FakeSTT → onTranscript → handleBargeIn → commitTranscript →
11793
+ * dispatchTurn → LLMLoop (real DefaultToolExecutor, hooks, guardrails) →
11794
+ * SentenceChunker → FakeTTS → FakeAudioSender``
11795
+ *
11796
+ * so tool calling, pipeline hooks, guardrail replacement,
11797
+ * dedup/hallucination filtering, history handling, metrics, and the
11798
+ * turn-taking state machine are all exercised for real. Only the
11799
+ * paid/external boundary is faked: the telephony bridge (audio sender), the
11800
+ * STT socket, the TTS socket, and — optionally — the LLM (a deterministic
11801
+ * {@link ScriptedLLMProvider} for CI, or any real {@link LLMProvider} for
11802
+ * live evals). See `fakes.ts` / `scripted-llm.ts`.
11803
+ *
11804
+ * Lifecycle (TS idiom — no async context managers in JS):
11805
+ *
11806
+ * ```ts
11807
+ * const session = await EvalSession.create({ agent, llmProvider });
11808
+ * try {
11809
+ * const result = await session.userSays('where is my order?');
11810
+ * expect(result)
11811
+ * .toolCalled('lookup_order', { orderId: 'A1' })
11812
+ * .agentTextContains('tomorrow');
11813
+ * } finally {
11814
+ * await session.close();
11815
+ * }
11816
+ * ```
11817
+ *
11818
+ * ``EvalSession.create()`` = ``new EvalSession(options)`` + ``await
11819
+ * session.start()``; ``close()`` runs the handler's real teardown
11820
+ * (``handleStop``) and is idempotent. Always pair ``create`` with a
11821
+ * ``finally { await session.close() }`` so fakes and timers are released
11822
+ * even when an assertion throws.
11823
+ *
11824
+ * Notes
11825
+ * -----
11826
+ * - ``agent.stt`` / ``agent.tts`` configs are ignored (replaced by the
11827
+ * fakes); ``agent.provider`` is forced to ``'pipeline'``.
11828
+ * - ``onMessage``-style agents are not supported — the session targets the
11829
+ * built-in {@link LLMLoop} path.
11830
+ * - ``TurnResult.agentText`` is what the caller HEARD (post-guardrail,
11831
+ * post-hook, post-text-transform sentences handed to TTS).
11832
+ * ``historySnapshot`` mirrors the dashboard conversation history, where
11833
+ * the streaming path records the raw LLM text.
11834
+ */
11835
+
11836
+ /**
11837
+ * One tool invocation observed through the handler's tool-event path.
11838
+ *
11839
+ * ``arguments`` is the parsed-JSON object the {@link LLMLoop} handed to the real
11840
+ * ``DefaultToolExecutor``; ``result`` is the executor's string return value
11841
+ * (``null`` only for hand-built records). Caller-immutable by convention.
11842
+ */
11843
+ interface ToolCallRecord {
11844
+ readonly name: string;
11845
+ readonly arguments: Readonly<Record<string, unknown>>;
11846
+ readonly result: string | null;
11847
+ }
11848
+ /**
11849
+ * The observable outcome of one {@link EvalSession.userSays} turn.
11850
+ *
11851
+ * - ``userText`` — the injected user utterance.
11852
+ * - ``agentText`` — what the caller heard this turn: the sentences handed to
11853
+ * TTS after guardrails / hooks / text transforms, joined by a single
11854
+ * space. Falls back to the assistant history entry when no TTS sentence
11855
+ * was produced (e.g. the turn was cut short before synthesis).
11856
+ * - ``toolCalls`` — tool invocations recorded via the handler's tool-event
11857
+ * path (``LLMLoop`` ``onToolCall`` → ``recordToolCall``), in execution
11858
+ * order.
11859
+ * - ``historySnapshot`` — the handler's full conversation history right
11860
+ * after the turn settled (``role`` / ``text`` / ``timestamp`` entries,
11861
+ * including ``role: 'tool'`` timeline entries).
11862
+ * - ``interrupted`` — true when the turn was cut short (barge-in cancel,
11863
+ * hook veto, or an LLM error that interrupted the turn) — derived from
11864
+ * the handler's ``turn_ended`` events carrying ``[interrupted]``.
11865
+ * - ``metricsTurn`` — the {@link TurnMetrics} emitted for this turn via the
11866
+ * handler's ``onMetrics`` callback, or ``null`` when the turn did not
11867
+ * complete normally (vetoed / interrupted).
11868
+ */
11869
+ interface TurnResult {
11870
+ readonly userText: string;
11871
+ readonly agentText: string;
11872
+ readonly toolCalls: ReadonlyArray<ToolCallRecord>;
11873
+ readonly historySnapshot: ReadonlyArray<HistoryEntry>;
11874
+ readonly interrupted: boolean;
11875
+ readonly metricsTurn: TurnMetrics | null;
11876
+ }
11877
+ /**
11878
+ * Render a conversation history in the judge transcript shape
11879
+ * (``[{ role: 'user'|'agent'|'tool', text }]`` — assistant → agent).
11880
+ * Shared by {@link EvalSession.transcript} and the assertion ``judge``.
11881
+ */
11882
+ declare function historyTranscript(history: ReadonlyArray<{
11883
+ readonly role: string;
11884
+ readonly text: string;
11885
+ }>): TranscriptEntry[];
11886
+ /** Options for {@link EvalSession}. Defaults match the Python SDK. */
11887
+ interface EvalSessionOptions {
11888
+ /**
11889
+ * The agent under test. Its ``stt`` / ``tts`` configs are replaced by
11890
+ * fakes and ``provider`` is forced to ``'pipeline'``; everything else
11891
+ * (tools, guardrails, hooks, text transforms, firstMessage, variables,
11892
+ * ...) is live.
11893
+ */
11894
+ readonly agent: AgentOptions;
11895
+ /**
11896
+ * Optional LLM provider override. Defaults to ``agent.llm``. Pass a
11897
+ * {@link ScriptedLLMProvider} for deterministic CI evals or a real
11898
+ * provider for live evals.
11899
+ */
11900
+ readonly llmProvider?: LLMProvider;
11901
+ /**
11902
+ * Legacy fallback — when neither ``llmProvider`` nor ``agent.llm`` is
11903
+ * set, the built-in OpenAI provider is built from this key (live evals
11904
+ * only; never use in CI).
11905
+ */
11906
+ readonly openaiKey?: string;
11907
+ /** Call identity threaded through the handler, tools' call context, and metrics. */
11908
+ readonly callId?: string;
11909
+ readonly caller?: string;
11910
+ readonly callee?: string;
11911
+ /**
11912
+ * Optional per-call variables resolved into the system prompt exactly
11913
+ * like ``phone.call({ customParams })``.
11914
+ */
11915
+ readonly customParams?: Readonly<Record<string, string>>;
11916
+ /**
11917
+ * Per-turn ceiling in seconds for {@link EvalSession.userSays} (LLM +
11918
+ * tools + TTS). Generous default (60) for live providers; scripted
11919
+ * providers finish in milliseconds.
11920
+ */
11921
+ readonly turnTimeoutS?: number;
11922
+ }
11923
+ /**
11924
+ * Harness around a real pipeline-mode {@link StreamHandler}.
11925
+ *
11926
+ * See the eval-session overview comment earlier in this file for usage. Construction is cheap; the handler is
11927
+ * built and started in {@link EvalSession.start} (``EvalSession.create``
11928
+ * calls it).
11929
+ */
11930
+ declare class EvalSession {
11931
+ readonly callId: string;
11932
+ readonly caller: string;
11933
+ readonly callee: string;
11934
+ /** The faked carrier boundary — records audio / clears / marks. */
11935
+ readonly audioSender: FakeAudioSender;
11936
+ /** The faked STT boundary — inject transcripts via the session, not directly. */
11937
+ readonly stt: FakeSTT;
11938
+ /** The faked TTS boundary — ``spoken`` records every synthesised sentence. */
11939
+ readonly tts: FakeTTS;
11940
+ /**
11941
+ * Every payload the handler fired through ``onTranscript`` — user /
11942
+ * assistant / tool events, in emission order.
11943
+ */
11944
+ readonly transcriptEvents: Array<Record<string, unknown>>;
11945
+ private readonly sourceAgent;
11946
+ private readonly llmProvider?;
11947
+ private readonly openaiKey;
11948
+ private readonly customParams?;
11949
+ private readonly turnTimeoutS;
11950
+ private streamHandler;
11951
+ private readonly toolCalls;
11952
+ private readonly metricsTurns;
11953
+ private interruptedTurns;
11954
+ private offTurnEnded;
11955
+ private started;
11956
+ private closed;
11957
+ constructor(options: EvalSessionOptions);
11958
+ /** Build AND start a session — the canonical entry point. */
11959
+ static create(options: EvalSessionOptions): Promise<EvalSession>;
11960
+ /** Build and start the real handler (idempotent). */
11961
+ start(): Promise<void>;
11962
+ /** Run the handler's real teardown (``handleStop``). Idempotent. */
11963
+ close(): Promise<void>;
11964
+ /**
11965
+ * Inject one final user transcript and await the full agent turn.
11966
+ *
11967
+ * The transcript flows through the handler's real receive path — barge-in
11968
+ * handling, dedup/hallucination filtering, hooks, the LLM loop with real
11969
+ * tool execution, guardrails, sentence chunking, and TTS — exactly as on
11970
+ * a live call. Returns once the turn's dispatch task settles.
11971
+ *
11972
+ * Throws:
11973
+ * - ``PatterError`` (``INPUT_VALIDATION``) — the REAL pipeline dropped
11974
+ * the transcript (duplicate within the 2 s throttle window, known STT
11975
+ * hallucination, or empty text) — same behaviour as a live call.
11976
+ * - ``PatterError`` (``TIMEOUT``) — the turn did not settle within
11977
+ * ``timeoutS`` (default: the session's ``turnTimeoutS``).
11978
+ */
11979
+ userSays(text: string, options?: {
11980
+ readonly timeoutS?: number;
11981
+ }): Promise<TurnResult>;
11982
+ /** The live {@link StreamHandler} (``null`` before ``start``). */
11983
+ get handler(): StreamHandler | null;
11984
+ /** Current conversation history (shallow copies of the entries). */
11985
+ get history(): HistoryEntry[];
11986
+ /** Full-session transcript in the judge shape (assistant → agent). */
11987
+ transcript(): TranscriptEntry[];
11988
+ /** Return the agent reshaped for harness execution. */
11989
+ private buildEvalAgent;
11990
+ private buildTurnResult;
11991
+ }
11992
+
11993
+ /**
11994
+ * Fluent assertions for {@link TurnResult}.
11995
+ *
11996
+ * Chainable expectations against the outcome of one real pipeline turn
11997
+ * driven by {@link EvalSession}:
11998
+ *
11999
+ * ```ts
12000
+ * const result = await session.userSays('book me a table for two');
12001
+ * expect(result)
12002
+ * .toolCalled('book_table', { partySize: 2 })
12003
+ * .agentTextContains('booked');
12004
+ *
12005
+ * // Semantic check via the LLMJudge (async, ends the chain):
12006
+ * await expect(result).judge(new LLMJudge(), {
12007
+ * intent: 'The agent confirms the booking and offers help.',
12008
+ * });
12009
+ * ```
12010
+ *
12011
+ * Every failed expectation throws Node's ``AssertionError`` with the
12012
+ * observed values, so plain vitest reports are actionable without extra
12013
+ * plumbing. Mirrors the Python `getpatter.evals.assertions` module.
12014
+ */
12015
+
12016
+ /** Wrap a {@link TurnResult} in a chainable expectation object. */
12017
+ declare function expect(result: TurnResult): TurnExpectation;
12018
+ /** Options for {@link TurnExpectation.agentTextContains}. */
12019
+ interface AgentTextContainsOptions {
12020
+ /** Compare needles case-sensitively. Default: false (Python parity). */
12021
+ readonly caseSensitive?: boolean;
12022
+ }
12023
+ /** Chainable assertions over one turn. See {@link expect}. */
12024
+ declare class TurnExpectation {
12025
+ private readonly turnResult;
12026
+ constructor(result: TurnResult);
12027
+ /** The wrapped {@link TurnResult} (escape hatch for ad-hoc asserts). */
12028
+ get result(): TurnResult;
12029
+ /**
12030
+ * Assert that tool ``name`` ran this turn.
12031
+ *
12032
+ * ``argsSubset`` (optional) must be recursively contained in the args of
12033
+ * at least one matching invocation — extra argument keys are allowed,
12034
+ * listed keys must match exactly.
12035
+ */
12036
+ toolCalled(name: string, argsSubset?: Record<string, unknown>): TurnExpectation;
12037
+ /** Assert that no tool ran this turn (or that ``name`` did not). */
12038
+ noToolCalled(name?: string): TurnExpectation;
12039
+ /**
12040
+ * Assert that every needle appears in the spoken agent text.
12041
+ *
12042
+ * Variadic form is case-insensitive (Python default); pass a single needle or an
12043
+ * array plus an options object for ``caseSensitive: true``.
12044
+ */
12045
+ agentTextContains(...needles: string[]): TurnExpectation;
12046
+ agentTextContains(needles: string | ReadonlyArray<string>, options?: AgentTextContainsOptions): TurnExpectation;
12047
+ /**
12048
+ * Score this turn against ``intent`` with the LLM judge.
12049
+ *
12050
+ * Builds a synthetic {@link EvalCase} whose ``expectedBehavior`` is
12051
+ * ``intent`` and judges the turn's full history snapshot. Throws
12052
+ * ``AssertionError`` when the judge fails the turn; returns the
12053
+ * {@link JudgeResult} otherwise (chain-ending, async).
12054
+ */
12055
+ judge(llmJudge: LLMJudge, options: {
12056
+ readonly intent: string;
12057
+ readonly rubric?: string;
12058
+ }): Promise<JudgeResult>;
12059
+ }
12060
+
9353
12061
  declare const SPAN_CALL = "getpatter.call";
9354
12062
  declare const SPAN_STT = "getpatter.stt";
9355
12063
  declare const SPAN_LLM = "getpatter.llm";
@@ -9384,6 +12092,11 @@ interface InitTracingOptions {
9384
12092
  * may be a no-op if the host hasn't registered one).
9385
12093
  */
9386
12094
  declare function initTracing(options?: InitTracingOptions): boolean;
12095
+ /**
12096
+ * Flush any pending spans and tear down the tracer provider. Safe to call
12097
+ * unconditionally — returns immediately when tracing was never wired up.
12098
+ */
12099
+ declare function shutdownTracing(): Promise<void>;
9387
12100
  /** True only if the env flag is set AND the tracer initialized cleanly. */
9388
12101
  declare function isTracingEnabled(): boolean;
9389
12102
  /**
@@ -9397,6 +12110,75 @@ declare function isTracingEnabled(): boolean;
9397
12110
  * Returns a no-op span when tracing is disabled or unavailable.
9398
12111
  */
9399
12112
  declare function startSpan(name: string, attrs?: Record<string, unknown>): Span;
12113
+ /**
12114
+ * Convenience wrapper — starts a span, runs ``fn``, records exceptions on
12115
+ * throw, and always ends the span (try/finally). Mirrors Python's
12116
+ * ``with start_span(...):`` context-manager ergonomics.
12117
+ *
12118
+ * ```ts
12119
+ * await withSpan(SPAN_LLM, { 'llm.model': 'gpt-4o' }, async (span) => {
12120
+ * span.setAttribute('llm.tokens', 123);
12121
+ * return await callLLM();
12122
+ * });
12123
+ * ```
12124
+ */
12125
+ declare function withSpan<T>(name: string, attrs: Record<string, unknown> | undefined, fn: (span: Span) => Promise<T>): Promise<T>;
12126
+
12127
+ /**
12128
+ * Stamp ``patter.*`` attributes on the current span, augmenting them with
12129
+ * the ambient ``patter.call_id`` / ``patter.side`` from the active
12130
+ * ``patterCallScope``. No-op when tracing is disabled or no scope is
12131
+ * active.
12132
+ *
12133
+ * Behaviour mirrors the Python helper:
12134
+ * - If an active recording span exists, attributes are stamped on it.
12135
+ * - Otherwise a transient zero-duration ``patter.billable`` span is
12136
+ * opened to carry the attributes. Some collectors filter
12137
+ * zero-duration spans; callers that need guaranteed attribution
12138
+ * should wrap their billable work in their own span.
12139
+ *
12140
+ * Caller-provided ``patter.call_id`` / ``patter.side`` keys win over the
12141
+ * scope's values.
12142
+ */
12143
+ declare function recordPatterAttrs(attrs: Readonly<Record<string, unknown>>): void;
12144
+ /**
12145
+ * Bind ``callId`` and ``side`` to the active span scope for the duration
12146
+ * of ``fn``. Mirrors the Python ``patter_call_scope`` context manager:
12147
+ * any ``recordPatterAttrs`` call made inside ``fn`` (or anything ``fn``
12148
+ * awaits) sees the bound values.
12149
+ *
12150
+ * Note: JavaScript has no ContextVar equivalent, so this uses a
12151
+ * module-level stack. Concurrent overlapping scopes on the same event
12152
+ * loop will see the innermost scope's values — fine for the SDK's
12153
+ * one-call-per-handler model. If callers need true async-context
12154
+ * isolation, install ``AsyncLocalStorage``-backed propagation via the
12155
+ * OTel SDK's context manager.
12156
+ */
12157
+ declare function patterCallScope<T>(options: {
12158
+ readonly callId: string;
12159
+ readonly side?: string;
12160
+ }, fn: () => Promise<T>): Promise<T>;
12161
+ /**
12162
+ * Wire an OTel ``SpanExporter`` into the SDK's tracer provider and
12163
+ * remember the configured ``side`` on the Patter instance so the
12164
+ * per-call handler reads it when entering ``patterCallScope``.
12165
+ *
12166
+ * Mirrors the Python ``attach_span_exporter`` contract:
12167
+ * - Stores ``side`` on ``patterInstance._patterSide`` unconditionally
12168
+ * (works even when ``@opentelemetry/*`` peer deps are missing).
12169
+ * - Idempotent on the *same exporter object reference*. Two distinct
12170
+ * exporter instances pointing at the same backend will both be
12171
+ * attached and spans will be exported twice — hold a single
12172
+ * exporter object across calls to avoid duplicates.
12173
+ *
12174
+ * When tracing isn't enabled (env flag off / SDK peer deps absent), the
12175
+ * call is a no-op aside from storing ``_patterSide``.
12176
+ */
12177
+ declare function attachSpanExporter(patterInstance: {
12178
+ _patterSide?: string;
12179
+ } & Record<string, unknown>, exporter: unknown, options?: {
12180
+ readonly side?: string;
12181
+ }): void;
9400
12182
 
9401
12183
  /**
9402
12184
  * Observability entrypoint — re-exports the tracing API.
@@ -9430,4 +12212,4 @@ declare const custom: Readonly<{
9430
12212
  LLM: typeof LLM$2;
9431
12213
  }>;
9432
12214
 
9433
- export { type AgentOptions, type AgentState, AllProvidersFailedError, type AnthropicConversion, LLM$7 as AnthropicLLM, type AnthropicLLMOptions, type AnthropicMessage, AssemblyAIEncoding, AssemblyAIModel, STT$1 as AssemblyAISTT, type AssemblyAISTTOptions, type AudioConfig, type AudioSource, AuthenticationError, type BackgroundAudioOptions, BackgroundAudioPlayer, type EvaluateContext as BargeInEvaluateContext, type BargeInStrategy, BuiltinAudioClip, type BuiltinAudioClipName, type BuiltinPcmSource, type CallControl, type CallEvent, type CallEventHandler, type CallMetrics, CallMetricsAccumulator, type CallOutcome, type CallRecord, type CallResult, type CarrierKind, type CartesiaEncoding, STT$3 as CartesiaSTT, type CartesiaSTTOptions, TTS$3 as CartesiaTTS, CartesiaTTSModel, type CartesiaTTSOptions, CartesiaTTSVoiceMode, LLM$5 as CerebrasLLM, type CerebrasLLMOptions, ChatContext, type ChatMessage, type ChatRole, CloudflareTunnel, type ConsultConfig, type ConversationStateSnapshot, type CostBreakdown, LLM$2 as CustomLLM, type CustomLLMOptions, DEFAULT_MIN_SENTENCE_LEN, DEFAULT_PRICING, DTMF_EVENTS, DeepFilterNetFilter, type DeepFilterNetOptions, DeepgramModel, STT$6 as DeepgramSTT, type DeepgramSTTOptions, DefaultToolExecutor, type DefaultToolExecutorOptions, type DefineToolInput, type DtmfEvent, ConvAI as ElevenLabsConvAI, ElevenLabsConvAIAdapter, type ConvAIOptions as ElevenLabsConvAIOptions, ElevenLabsModel, ElevenLabsOutputFormat, ElevenLabsTTS as ElevenLabsRestTTS, TTS$6 as ElevenLabsTTS, type ElevenLabsTTSOptions, type ElevenLabsWebSocketOptions, TTS$5 as ElevenLabsWebSocketTTS, type EouTrigger, ErrorCode, EventBus, FallbackLLMProvider, type FallbackLLMProviderOptions, type FilePcmSource, GEMINI_DEFAULT_INPUT_SR, GEMINI_DEFAULT_OUTPUT_SR, GeminiLiveAdapter, type GeminiLiveEventHandler, LLM$4 as GoogleLLM, type GoogleLLMOptions, LLM$6 as GroqLLM, type GroqLLMOptions, Guardrail$1 as Guardrail, type GuardrailOptions, LLM$1 as HermesLLM, type HermesLLMOptions, type 
HookContext, IVRActivity, type IVRActivityOptions, type IVRToolDefinition, type IncomingMessage, type InitTracingOptions, TTS as InworldTTS, type InworldTTSOptions, type JobCallback, KrispFrameDuration, KrispSampleRate, KrispVivaFilter, type KrispVivaFilterOptions, type LLMChunk, LLMLoop, type LLMProvider, LMNTAudioFormat, LMNTModel, LMNTSampleRate, TTS$1 as LMNTTTS, type LMNTTTSOptions, type LatencyBreakdown, type LocalCallOptions, type LocalConfig, type LocalOptions, type Logger, type LoopCallback, type MessageHandler, MetricsStore, MinWordsStrategy, type MinWordsStrategyOptions, type ModelPricing, Ngrok, type OpenAICompatibleConsult, LLM$3 as OpenAICompatibleLLM, type OpenAICompatibleLLMOptions, OpenAICompatibleLLMProvider, LLM$8 as OpenAILLM, type OpenAILLMOptions, OpenAILLMProvider, type OpenAIMessage, Realtime as OpenAIRealtime, Realtime2 as OpenAIRealtime2, OpenAIRealtime2Adapter, type Realtime2Options as OpenAIRealtime2Options, OpenAIRealtimeAdapter, OpenAIRealtimeAudioFormat, OpenAIRealtimeModel, type RealtimeOptions as OpenAIRealtimeOptions, OpenAIRealtimeVADType, TTS$4 as OpenAITTS, type OpenAITTSOptions, STT$4 as OpenAITranscribeSTT, type OpenAITranscribeSTTOptions, OpenAITranscriptionModel, OpenAIVoice, LLM as OpenClawLLM, type OpenClawLLMOptions, PRICING_LAST_UPDATED, PRICING_VERSION, type ParamSpec, PartialStreamError, Patter, PatterConfigError, PatterConnectionError, PatterError, type PatterEventType, PatterTool, type PatterToolExecuteArgs, type PatterToolOptions, type PatterToolResult, PcmCarry, PipelineHookExecutor, type PipelineHooks, type PipelineMessageHandler, Carrier as Plivo, PlivoAdapter, type PlivoCarrierOptions, type InitiateCallOptions as PlivoInitiateCallOptions, type InitiateCallResult as PlivoInitiateCallResult, PricingUnit, type PricingUnitValue, type ProviderPricing, ProvisionError, RateLimitError, type RawPcmSource, type RealtimeConfig, type RealtimeTurnDetection, RemoteMessageHandler, RimeAudioFormat, RimeModel, TTS$2 as RimeTTS, 
type RimeTTSOptions, SPAN_BARGEIN, SPAN_CALL, SPAN_ENDPOINT, SPAN_LLM, SPAN_STT, SPAN_TOOL, SPAN_TTS, type SSEEvent, type STTConfig, type ScheduleHandle, SentenceChunker, type ServeOptions, type SessionContext, type SilenceCallback, type SileroSampleRate, SileroVAD, type SileroVADOptions, STT$2 as SonioxSTT, type SonioxSTTOptions$1 as SonioxSTTOptions, type Span, type SpeechEventCallback, SpeechEvents, SpeechmaticsAudioEncoding, SpeechmaticsOperatingPoint, STT as SpeechmaticsSTT, type SpeechmaticsSTTOptions, SpeechmaticsSampleRate, SpeechmaticsServerMessage, TurnDetectionMode as SpeechmaticsTurnDetectionMode, StatefulResampler, type StatefulResamplerOptions, Static as StaticTunnel, type TTSConfig, Carrier$1 as Telnyx, TelnyxAdapter, type TelnyxCarrierOptions, type ConfigureNumberOptions as TelnyxConfigureNumberOptions, type EndCallOptions as TelnyxEndCallOptions, type InitiateCallOptions$1 as TelnyxInitiateCallOptions, type InitiateCallResult$1 as TelnyxInitiateCallResult, type ProvisionNumberOptions as TelnyxProvisionNumberOptions, type ProvisionNumberResult as TelnyxProvisionNumberResult, TelnyxSTT, TelnyxSTTInputFormat, TelnyxSTTSampleRate, type Transcript as TelnyxSTTTranscript, TelnyxTTS, TelnyxTTSSampleRate, TelnyxTTSVoice, type TelnyxTranscriptionEngine, TestSession, TfidfLoopDetector, type TfidfLoopDetectorOptions, Tool, type ToolDefinition, type ToolExecutor, type ToolHandler, type ToolOptions, type TunnelHandle, type TurnMetrics, Carrier$2 as Twilio, TwilioAdapter, type TwilioAdapterOptions, type TwilioCarrierOptions, type ConfigureNumberOptions$1 as TwilioConfigureNumberOptions, type InitiateCallOptions$2 as TwilioInitiateCallOptions, type InitiateCallResult$2 as TwilioInitiateCallResult, type ProvisionNumberOptions$1 as TwilioProvisionNumberOptions, type ProvisionNumberResult$1 as TwilioProvisionNumberResult, ULTRAVOX_DEFAULT_API_BASE, ULTRAVOX_DEFAULT_SR, type UltravoxEventHandler, UltravoxRealtimeAdapter, type UserState, STT$5 as WhisperSTT, type 
WhisperSTTOptions, assemblyai, builtinClipPath, calculateRealtimeCost, calculateSttCost, calculateTelephonyCost, calculateTtsCost, callsToCsv, callsToJson, cartesia, createResampler16kTo8k, createResampler24kTo16k, createResampler24kTo8k, createResampler8kTo16k, custom, deepgram, defineTool, elevenlabs, evaluateStrategies as evaluateBargeInStrategies, filterEmoji, filterForTTS, filterMarkdown, formatDtmf, geminiLive, getLogger, guardrail, hashCaller, hermes, initTracing, isRemoteUrl, isTracingEnabled, isWebSocketUrl, lmnt, makeAuthMiddleware, mergePricing, mixPcm, mountApi, mountDashboard, mulawToPcm16, notifyDashboard, openaiCompatible, openaiTts, openclaw, openclawConsult, openclawPostCallNotifier, pcm16ToMulaw, resample16kTo8k, resample24kTo16k, resample8kTo16k, resamplePcm, resetStrategies as resetBargeInStrategies, rime, scheduleCron, scheduleInterval, scheduleOnce, selectSoundFromList, setLogger, soniox, speechmatics, startSpan, startTunnel, tool, ultravox, whisper };
12215
+ export { AGENT_BACKLOG_CAP_S, type AgentCallable, type AgentFactory, type AgentOptions, type AgentState, type AgentTextContainsOptions, AllProvidersFailedError, type AnthropicConversion, LLM$7 as AnthropicLLM, type AnthropicLLMOptions, type AnthropicMessage, AssemblyAIEncoding, AssemblyAIModel, STT$1 as AssemblyAISTT, type AssemblyAISTTOptions, type AudioConfig, type AudioSource, AuthenticationError, type BackgroundAudioOptions, BackgroundAudioPlayer, type EvaluateContext as BargeInEvaluateContext, type BargeInStrategy, BuiltinAudioClip, type BuiltinAudioClipName, type BuiltinPcmSource, type CallControl, type CallEvent, type CallEventHandler, type CallMetrics, CallMetricsAccumulator, type CallOutcome, type CallRecord, type CallResult, type CarrierKind, type CartesiaEncoding, STT$3 as CartesiaSTT, type CartesiaSTTOptions, TTS$3 as CartesiaTTS, CartesiaTTSModel, type CartesiaTTSOptions, CartesiaTTSVoiceMode, LLM$5 as CerebrasLLM, type CerebrasLLMOptions, ChatContext, type ChatMessage, type ChatRole, CloudflareTunnel, type ConsultConfig, type ConversationStateSnapshot, type CostBreakdown, LLM$2 as CustomLLM, type CustomLLMOptions, DEFAULT_MIN_SENTENCE_LEN, DEFAULT_PRICING, DTMF_EVENTS, DeepFilterNetFilter, type DeepFilterNetOptions, DeepgramModel, STT$6 as DeepgramSTT, type DeepgramSTTOptions, DefaultToolExecutor, type DefaultToolExecutorOptions, type DefineToolInput, type DtmfEvent, ConvAI as ElevenLabsConvAI, ElevenLabsConvAIAdapter, type ConvAIOptions as ElevenLabsConvAIOptions, ElevenLabsModel, ElevenLabsOutputFormat, ElevenLabsTTS as ElevenLabsRestTTS, TTS$6 as ElevenLabsTTS, type ElevenLabsTTSOptions, type ElevenLabsWebSocketOptions, TTS$5 as ElevenLabsWebSocketTTS, type EouTrigger, ErrorCode, type EvalCase, type EvalResult, EvalRunner, type EvalRunnerOptions, EvalSession, type EvalSessionOptions, type EvalSuite, type EvalTurn, EventBus, FakeAudioSender, FakeSTT, FakeTTS, FallbackLLMProvider, type FallbackLLMProviderOptions, type FilePcmSource, 
GEMINI_DEFAULT_INPUT_SR, GEMINI_DEFAULT_OUTPUT_SR, GeminiLiveAdapter, type GeminiLiveEventHandler, LLM$4 as GoogleLLM, type GoogleLLMOptions, LLM$6 as GroqLLM, type GroqLLMOptions, Guardrail$1 as Guardrail, type GuardrailOptions, LLM$1 as HermesLLM, type HermesLLMOptions, type HookContext, IVRActivity, type IVRActivityOptions, type IVRToolDefinition, type IncomingMessage, type InitTracingOptions, TTS as InworldTTS, type InworldTTSOptions, type JobCallback, type JudgeBackend, type JudgeResult, KrispFrameDuration, KrispSampleRate, KrispVivaFilter, type KrispVivaFilterOptions, type LLMChunk, LLMJudge, type LLMJudgeOptions, LLMLoop, type LLMProvider, LMNTAudioFormat, LMNTModel, LMNTSampleRate, TTS$1 as LMNTTTS, type LMNTTTSOptions, type LatencyBreakdown, type LocalCallOptions, LocalCallRecorder, type LocalConfig, type LocalOptions, type Logger, type LoopCallback, type MessageHandler, MetricsStore, MinWordsStrategy, type MinWordsStrategyOptions, type ModelPricing, Ngrok, type OpenAICompatibleConsult, LLM$3 as OpenAICompatibleLLM, type OpenAICompatibleLLMOptions, OpenAICompatibleLLMProvider, LLM$8 as OpenAILLM, type OpenAILLMOptions, OpenAILLMProvider, type OpenAIMessage, Realtime as OpenAIRealtime, Realtime2 as OpenAIRealtime2, OpenAIRealtime2Adapter, type Realtime2Options as OpenAIRealtime2Options, OpenAIRealtimeAdapter, OpenAIRealtimeAudioFormat, OpenAIRealtimeModel, type RealtimeOptions as OpenAIRealtimeOptions, OpenAIRealtimeVADType, TTS$4 as OpenAITTS, type OpenAITTSOptions, STT$4 as OpenAITranscribeSTT, type OpenAITranscribeSTTOptions, OpenAITranscriptionModel, OpenAIVoice, LLM as OpenClawLLM, type OpenClawLLMOptions, PRICING_LAST_UPDATED, PRICING_VERSION, type ParamSpec, PartialStreamError, Patter, PatterConfigError, PatterConnectionError, PatterError, type PatterEventType, PatterTool, type PatterToolExecuteArgs, type PatterToolOptions, type PatterToolResult, PcmCarry, PipelineHookExecutor, type PipelineHooks, type PipelineMessageHandler, Carrier as Plivo, 
PlivoAdapter, type PlivoCarrierOptions, type InitiateCallOptions as PlivoInitiateCallOptions, type InitiateCallResult as PlivoInitiateCallResult, PricingUnit, type PricingUnitValue, type ProviderPricing, ProvisionError, RECORDING_SAMPLE_RATE, RateLimitError, type RawPcmSource, type RealtimeConfig, type RealtimeTurnDetection, type RecorderEncoding, RemoteMessageHandler, RimeAudioFormat, RimeModel, TTS$2 as RimeTTS, type RimeTTSOptions, SMART_TURN_MODEL_ENV_VAR, SPAN_BARGEIN, SPAN_CALL, SPAN_ENDPOINT, SPAN_LLM, SPAN_STT, SPAN_TOOL, SPAN_TTS, type SSEEvent, type STTConfig, type ScheduleHandle, type ScriptedLLMCall, ScriptedLLMProvider, SentenceChunker, type ServeOptions, type SessionContext, type SilenceCallback, type SileroSampleRate, SileroVAD, type SileroVADOptions, SmartTurnDetector, type SmartTurnDetectorOptions, STT$2 as SonioxSTT, type SonioxSTTOptions$1 as SonioxSTTOptions, type Span, type SpeechEventCallback, SpeechEvents, SpeechmaticsAudioEncoding, SpeechmaticsOperatingPoint, STT as SpeechmaticsSTT, type SpeechmaticsSTTOptions, SpeechmaticsSampleRate, SpeechmaticsServerMessage, TurnDetectionMode as SpeechmaticsTurnDetectionMode, StatefulResampler, type StatefulResamplerOptions, Static as StaticTunnel, type TTSConfig, Carrier$1 as Telnyx, TelnyxAdapter, type TelnyxCarrierOptions, type ConfigureNumberOptions as TelnyxConfigureNumberOptions, type EndCallOptions as TelnyxEndCallOptions, type InitiateCallOptions$1 as TelnyxInitiateCallOptions, type InitiateCallResult$1 as TelnyxInitiateCallResult, type ProvisionNumberOptions as TelnyxProvisionNumberOptions, type ProvisionNumberResult as TelnyxProvisionNumberResult, TelnyxSTT, TelnyxSTTInputFormat, TelnyxSTTSampleRate, type Transcript as TelnyxSTTTranscript, TelnyxTTS, TelnyxTTSSampleRate, TelnyxTTSVoice, type TelnyxTranscriptionEngine, TestSession, TfidfLoopDetector, type TfidfLoopDetectorOptions, Tool, type ToolCallRecord, type ToolDefinition, type ToolExecutor, type ToolHandler, type ToolOptions, type 
TranscriptEntry, type TransferCallOptions, type TransferCallResult, type TunnelHandle, TurnExpectation, type TurnMetrics, type TurnResult, Carrier$2 as Twilio, TwilioAdapter, type TwilioAdapterOptions, type TwilioCarrierOptions, type ConfigureNumberOptions$1 as TwilioConfigureNumberOptions, type InitiateCallOptions$2 as TwilioInitiateCallOptions, type InitiateCallResult$2 as TwilioInitiateCallResult, type ProvisionNumberOptions$1 as TwilioProvisionNumberOptions, type ProvisionNumberResult$1 as TwilioProvisionNumberResult, ULTRAVOX_DEFAULT_API_BASE, ULTRAVOX_DEFAULT_SR, type UltravoxEventHandler, UltravoxRealtimeAdapter, type UserState, STT$5 as WhisperSTT, type WhisperSTTOptions, assemblyai, attachSpanExporter, builtinClipPath, calculateRealtimeCost, calculateSttCost, calculateTelephonyCost, calculateTtsCost, callsToCsv, callsToJson, cartesia, createResampler16kTo8k, createResampler24kTo16k, createResampler24kTo8k, createResampler8kTo16k, custom, deepgram, defineTool, elevenlabs, evalResultToDict, evaluateStrategies as evaluateBargeInStrategies, expect, filterEmoji, filterForTTS, filterMarkdown, formatDtmf, geminiLive, getLogger, guardrail, hashCaller, hermes, historyTranscript, initTracing, isRemoteUrl, isTracingEnabled, isWebSocketUrl, lmnt, loadSuite, makeAuthMiddleware, mergePricing, mixPcm, mountApi, mountDashboard, mulawToPcm16, notifyDashboard, openaiCompatible, openaiTts, openclaw, openclawConsult, openclawPostCallNotifier, patterCallScope, pcm16ToMulaw, recordPatterAttrs, resample16kTo8k, resample24kTo16k, resample8kTo16k, resamplePcm, resetStrategies as resetBargeInStrategies, rime, scheduleCron, scheduleInterval, scheduleOnce, selectSoundFromList, setLogger, shutdownTracing, soniox, speechmatics, startSpan, startTunnel, textTurn, tool, toolCallTurn, ultravox, whisper, withSpan };