npm - @framers/agentos - Versions diffs - 0.1.108 → 0.1.109 - Mend

@framers/agentos 0.1.108 → 0.1.109

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

package/dist/voice-pipeline/types.d.ts (changed)

@@ -8,41 +8,78 @@
  * system. All heavy I/O crosses EventEmitter-based session boundaries to keep
  * the hot path non-blocking.
  *
- * Dependency order (no circular refs):
+ * ## Dependency order (no circular refs)
+ *
+ * ```
  *   AudioFrame / EncodedAudioChunk
- *   → Transport (IStreamTransport)
- *   → STT (IStreamingSTT + StreamingSTTSession)
- *   → Endpoint detection (IEndpointDetector + VadEvent)
- *   → Diarization (IDiarizationEngine + DiarizationSession)
- *   → TTS (IStreamingTTS + StreamingTTSSession)
- *   → Barge-in (IBargeinHandler)
- *   → Session (VoicePipelineSession)
- *   → Protocol messages (ClientTextMessage, ServerTextMessage)
+ *   -> Transport (IStreamTransport)
+ *   -> STT (IStreamingSTT + StreamingSTTSession)
+ *   -> Endpoint detection (IEndpointDetector + VadEvent)
+ *   -> Diarization (IDiarizationEngine + DiarizationSession)
+ *   -> TTS (IStreamingTTS + StreamingTTSSession)
+ *   -> Barge-in (IBargeinHandler)
+ *   -> Session (VoicePipelineSession)
+ *   -> Protocol messages (ClientTextMessage, ServerTextMessage)
+ * ```
+ *
+ * ## Design rationale
+ *
+ * Every interface in this module is kept deliberately narrow so that
+ * implementations can be swapped at runtime (e.g. Deepgram STT vs Whisper
+ * vs browser WebSpeechAPI) without touching the orchestrator. The
+ * EventEmitter-based session pattern was chosen over callback interfaces
+ * because it naturally supports fan-out (multiple listeners); backpressure
+ * is handled at the transport level rather than per-callback.
  */
 import type { EventEmitter } from 'node:events';
 /**
  * A single frame of raw PCM audio, as produced by a microphone capture or
- * a VAD pre-processor. Each frame typically represents 10–20 ms of audio.
+ * a VAD pre-processor. Each frame typically represents 10-20 ms of audio.
+ *
+ * @see {@link EncodedAudioChunk} for the compressed counterpart used in TTS output.
+ *
+ * @example
+ * ```typescript
+ * const frame: AudioFrame = {
+ *   samples: new Float32Array(320),   // 20 ms @ 16 kHz
+ *   sampleRate: 16000,
+ *   timestamp: Date.now(),
+ * };
+ * ```
  */
 export interface AudioFrame {
     /**
      * Interleaved 32-bit float PCM samples, normalised to [-1, 1].
      * For mono audio this is a flat array; stereo interleaves L/R pairs.
+     *
+     * Float32Array is chosen over Int16Array because it avoids quantisation
+     * artefacts in DSP operations (e.g. energy calculation, resampling) and
+     * is the native format for Web Audio API.
      */
     samples: Float32Array;
     /**
      * Samples per second (e.g. 16000, 24000, 48000).
+     *
+     * 16 kHz is the standard for telephony and most STT engines. 24 kHz is
+     * typical for TTS output. The pipeline resamples internally when STT
+     * and TTS sample rates differ.
      */
     sampleRate: number;
     /**
      * Unix epoch millisecond timestamp at which this frame was captured.
      * Used for synchronisation across STT, VAD, and diarization streams.
+     *
+     * Must be monotonically increasing within a session. Out-of-order
+     * frames degrade STT accuracy and confuse the endpoint detector's
+     * duration tracking.
      */
     timestamp: number;
     /**
      * Optional hint from the capture layer identifying the speaker source
      * (e.g. a hardware device label or a WebRTC peer ID). Used by the
      * diarization engine when native speaker IDs are unavailable.
+     *
+     * @see {@link DiarizedSegment.speakerId} for the post-diarization label.
      */
     speakerHint?: string;
 }
@@ -50,14 +87,32 @@ export interface AudioFrame {
  * A compressed audio chunk ready for transmission over the wire (e.g. to a
  * TTS websocket or a playback buffer). Contains the rendered text to allow
  * barge-in handlers to track interrupted utterance state.
+ *
+ * @see {@link AudioFrame} for the uncompressed PCM counterpart used in capture.
+ * @see {@link StreamingTTSSession} which emits these on the `'audio'` event.
+ *
+ * @example
+ * ```typescript
+ * const chunk: EncodedAudioChunk = {
+ *   audio: Buffer.from([...opusBytes]),
+ *   format: 'opus',
+ *   sampleRate: 24000,
+ *   durationMs: 60,
+ *   text: 'Hello there!',
+ * };
+ * ```
  */
 export interface EncodedAudioChunk {
     /**
-     * Raw encoded bytes in the format specified by `format`.
+     * Raw encoded bytes in the format specified by {@link format}.
      */
     audio: Buffer;
     /**
-     * Codec/container format of `audio`.
+     * Codec/container format of {@link audio}.
+     *
+     * - `'pcm'` -- raw signed 16-bit LE samples (lowest latency, highest bandwidth).
+     * - `'mp3'` -- MPEG Layer 3 (wide browser support, moderate latency).
+     * - `'opus'` -- Opus in an Ogg container (best quality/size ratio, recommended default).
      */
     format: 'pcm' | 'mp3' | 'opus';
     /**
@@ -66,17 +121,29 @@ export interface EncodedAudioChunk {
     sampleRate: number;
     /**
      * Playback duration of this chunk in milliseconds.
+     * Used by the orchestrator to track cumulative played time for
+     * barge-in context ({@link BargeinContext.playedDurationMs}).
      */
     durationMs: number;
     /**
      * The text fragment that was synthesised into this chunk. Preserved so
-     * barge-in handlers can report `interruptedRemainder` accurately.
+     * barge-in handlers can report {@link VoiceTurnMetadata.interruptedRemainder}
+     * accurately when playback is cut short.
      */
     text: string;
 }
 /**
  * Discriminated union of control messages sent from the pipeline to the
  * underlying stream transport (e.g. a WebSocket or WebRTC data-channel).
+ *
+ * @see {@link IStreamTransport.sendControl} which accepts these messages.
+ * @see {@link ServerTextMessage} for the full server-to-client protocol.
+ *
+ * @example
+ * ```typescript
+ * const muteMsg: TransportControlMessage = { type: 'mute' };
+ * const stopMsg: TransportControlMessage = { type: 'stop', reason: 'session timeout' };
+ * ```
  */
 export type TransportControlMessage = {
     /** Mute the outbound audio stream without closing the session. */
@@ -99,50 +166,76 @@ export type TransportControlMessage = {
  * Abstraction over any bidirectional audio/text stream transport.
  * Implementations include WebSocket, WebRTC data-channel, and in-process pipes.
  *
- * Emits:
- * - `'audio'` (AudioFrame) — inbound audio from the remote client.
- * - `'message'` (ClientTextMessage) — inbound JSON control message from the client.
- * - `'close'` () — transport has been closed (either side).
- * - `'error'` (Error) — fatal transport error.
+ * The transport layer is intentionally thin: it handles framing and I/O but
+ * knows nothing about STT, TTS, or conversation state. This separation lets
+ * the pipeline swap transports (e.g. WebSocket -> WebRTC) without touching
+ * any voice logic.
+ *
+ * ## Events emitted
+ *
+ * | Event       | Payload               | Description                            |
+ * |-------------|-----------------------|----------------------------------------|
+ * | `'audio'`   | {@link AudioFrame}    | Inbound audio from the remote client.  |
+ * | `'message'` | {@link ClientTextMessage} | Inbound JSON control from the client. |
+ * | `'close'`   | *(none)*              | Transport has been closed (either side). |
+ * | `'error'`   | `Error`               | Fatal transport error.                 |
+ *
+ * @see {@link WebSocketStreamTransport} for the canonical WebSocket implementation.
  */
 export interface IStreamTransport extends EventEmitter {
     /**
      * Stable identifier for this transport connection (e.g. a UUID or socket ID).
+     * Used as a correlation key in logs and metrics.
      */
     readonly id: string;
     /**
      * Current connection state.
-     * - `'connecting'` — handshake in progress.
-     * - `'open'` — fully established and ready.
-     * - `'closing'` — graceful teardown initiated.
-     * - `'closed'` — no longer usable.
+     * - `'connecting'` -- handshake in progress.
+     * - `'open'` -- fully established and ready.
+     * - `'closing'` -- graceful teardown initiated.
+     * - `'closed'` -- no longer usable.
      */
     readonly state: 'connecting' | 'open' | 'closing' | 'closed';
     /**
      * Send a synthesised audio chunk to the remote client for playback.
      * Resolves once the chunk has been handed to the underlying I/O layer.
      *
-     * @param chunk — Encoded audio to deliver.
+     * @param chunk - Encoded audio to deliver.
+     * @returns Resolves when the data has been buffered for transmission.
+     * @throws {Error} If the transport is not in `'open'` state.
      */
     sendAudio(chunk: EncodedAudioChunk): Promise<void>;
     /**
      * Send a JSON control message to the remote client.
      *
-     * @param message — Server-side protocol message.
+     * @param message - Server-side protocol message.
+     * @returns Resolves when the data has been buffered for transmission.
+     * @throws {Error} If the transport is not in `'open'` state.
      */
     sendControl(message: ServerTextMessage): Promise<void>;
     /**
      * Close the transport, optionally supplying a WebSocket-style close code and
      * human-readable reason string for diagnostics.
      *
-     * @param code — Optional numeric close code (defaults to 1000 normal closure).
-     * @param reason — Optional human-readable close reason.
+     * @param code - Optional numeric close code (defaults to 1000 normal closure).
+     * @param reason - Optional human-readable close reason.
      */
     close(code?: number, reason?: string): void;
 }
 /**
  * Configuration passed to {@link IStreamingSTT.startSession} when opening a new
  * speech recognition stream.
+ *
+ * @see {@link VoicePipelineConfig.sttOptions} for provider-level overrides.
+ *
+ * @example
+ * ```typescript
+ * const config: StreamingSTTConfig = {
+ *   language: 'en-US',
+ *   interimResults: true,
+ *   punctuate: true,
+ * };
+ * ```
  */
 export interface StreamingSTTConfig {
     /**
@@ -153,11 +246,14 @@ export interface StreamingSTTConfig {
     /**
      * Whether to emit interim (non-final) transcript events. When `true`,
      * partial results arrive more frequently at the cost of higher word error rate.
+     * Interim results are useful for real-time UI display and early endpoint hints.
      * @defaultValue true
      */
     interimResults?: boolean;
     /**
      * Enable automatic punctuation insertion if the provider supports it.
+     * Punctuation is critical for the {@link HeuristicEndpointDetector} which
+     * uses terminal punctuation (`.`, `?`, `!`) as a turn-completion signal.
      * @defaultValue true
      */
     punctuate?: boolean;
@@ -176,6 +272,8 @@ export interface StreamingSTTConfig {
 /**
  * A single word within a {@link TranscriptEvent}, augmented with timing and
  * optional speaker attribution.
+ *
+ * @see {@link TranscriptEvent.words} which contains an array of these.
  */
 export interface TranscriptWord {
     /**
@@ -192,6 +290,7 @@ export interface TranscriptWord {
     end: number;
     /**
      * Recognition confidence in the range [0, 1]. Higher is better.
+     * Typically 0.8+ for clear speech, 0.4-0.7 for noisy or accented audio.
      */
     confidence: number;
     /**
@@ -204,6 +303,17 @@ export interface TranscriptWord {
 /**
  * Emitted by a {@link StreamingSTTSession} each time the provider produces a
  * recognition hypothesis.
+ *
+ * @see {@link IEndpointDetector.pushTranscript} which consumes these events.
+ *
+ * @example
+ * ```typescript
+ * sttSession.on('transcript', (event: TranscriptEvent) => {
+ *   if (event.isFinal) {
+ *     console.log(`Final: "${event.text}" (confidence: ${event.confidence})`);
+ *   }
+ * });
+ * ```
  */
 export interface TranscriptEvent {
     /**
@@ -211,7 +321,7 @@ export interface TranscriptEvent {
      */
     text: string;
     /**
-     * Aggregate confidence score for `text` in the range [0, 1].
+     * Aggregate confidence score for {@link text} in the range [0, 1].
      */
     confidence: number;
     /**
@@ -222,6 +332,9 @@ export interface TranscriptEvent {
     /**
      * `true` when this hypothesis is stable and will not be revised.
      * `false` for interim (streaming) hypotheses.
+     *
+     * The {@link HeuristicEndpointDetector} only accumulates final transcripts;
+     * interim results are discarded to avoid double-counting.
      */
     isFinal: boolean;
     /**
@@ -234,17 +347,22 @@ export interface TranscriptEvent {
  * An active streaming speech-to-text session. Audio frames are pushed in
  * and transcript events flow out via EventEmitter.
  *
- * Emits:
- * - `'transcript'` (TranscriptEvent) — interim or final hypothesis.
- * - `'error'` (Error) — unrecoverable provider error.
- * - `'close'` () — session has been fully terminated.
+ * ## Events emitted
+ *
+ * | Event          | Payload               | Description                        |
+ * |----------------|-----------------------|------------------------------------|
+ * | `'transcript'` | {@link TranscriptEvent} | Interim or final hypothesis.     |
+ * | `'error'`      | `Error`               | Unrecoverable provider error.      |
+ * | `'close'`      | *(none)*              | Session has been fully terminated. |
+ *
+ * @see {@link IStreamingSTT.startSession} which creates these sessions.
  */
 export interface StreamingSTTSession extends EventEmitter {
     /**
      * Push a raw audio frame into the recognition stream. Frames must arrive
      * in capture order; gaps or out-of-order frames degrade accuracy.
      *
-     * @param frame — PCM audio frame to process.
+     * @param frame - PCM audio frame to process.
      */
     pushAudio(frame: AudioFrame): void;
     /**
@@ -263,6 +381,8 @@ export interface StreamingSTTSession extends EventEmitter {
  *
  * Implementations are registered via the `EXTENSION_KIND_STREAMING_STT`
  * extension kind and resolved by the voice pipeline at session creation time.
+ *
+ * @see {@link StreamingSTTSession} for the session interface returned by {@link startSession}.
  */
 export interface IStreamingSTT {
     /**
@@ -276,61 +396,107 @@ export interface IStreamingSTT {
     /**
      * Open a new streaming recognition session.
      *
-     * @param config — Session-level configuration overriding provider defaults.
+     * @param config - Session-level configuration overriding provider defaults.
      * @returns A ready-to-use session whose lifecycle is independent of this factory.
+     * @throws {Error} If the provider fails to initialise (e.g. invalid API key).
      */
     startSession(config?: StreamingSTTConfig): Promise<StreamingSTTSession>;
 }
 /**
  * A VAD (Voice Activity Detection) or STT-derived event describing speech
  * energy transitions over time.
+ *
+ * @see {@link IEndpointDetector.pushVadEvent} which consumes these.
+ *
+ * @example
+ * ```typescript
+ * const speechStart: VadEvent = {
+ *   type: 'speech_start',
+ *   timestamp: Date.now(),
+ *   source: 'vad',
+ *   energyLevel: 0.42,
+ * };
+ * ```
  */
 export interface VadEvent {
     /**
      * Type of the VAD transition:
-     * - `'speech_start'` — voice energy detected after silence.
-     * - `'speech_end'` — voice energy fell below the silence threshold.
-     * - `'silence'` — periodic silence heartbeat (emitted at `silenceIntervalMs` cadence).
+     * - `'speech_start'` -- voice energy detected after silence.
+     * - `'speech_end'` -- voice energy fell below the silence threshold.
+     * - `'silence'` -- periodic silence heartbeat (emitted at `silenceIntervalMs` cadence).
      */
     type: 'speech_start' | 'speech_end' | 'silence';
     /**
      * Unix epoch millisecond timestamp at which this transition was detected.
+     * Used by the endpoint detector to compute speech duration.
      */
     timestamp: number;
     /**
      * Optional raw energy level used to trigger this event (implementation-defined scale).
+     * Useful for debugging VAD sensitivity but not consumed by the pipeline logic.
      */
     energyLevel?: number;
     /**
      * Origin of the VAD event:
-     * - `'vad'` — emitted by a standalone VAD model (e.g. Silero, WebRTC VAD).
-     * - `'stt'` — inferred from STT activity (e.g. provider-side endpointing signals).
+     * - `'vad'` -- emitted by a standalone VAD model (e.g. Silero, WebRTC VAD).
+     * - `'stt'` -- inferred from STT activity (e.g. provider-side endpointing signals).
+     *
+     * The pipeline synthesises STT-derived `speech_start`/`speech_end` events when
+     * a dedicated VAD is not available, using the `source` field to distinguish them.
      */
     source?: 'vad' | 'stt';
 }
 /**
  * Semantic reason why the endpoint detector decided the user has finished speaking.
+ *
+ * Each reason maps to a different detection strategy within the endpoint detector:
+ *
+ * | Reason             | Detection strategy                                     |
+ * |--------------------|--------------------------------------------------------|
+ * | `silence_timeout`  | VAD silence exceeded configured threshold               |
+ * | `punctuation`      | STT final result ends with `.`, `?`, or `!`            |
+ * | `syntax_complete`  | Syntax model determined utterance is grammatically complete |
+ * | `semantic_model`   | Small LM scored intent as complete                     |
+ * | `manual`           | Explicitly triggered by a ClientTextMessage control     |
+ * | `timeout`          | Hard maximum turn duration elapsed                     |
+ *
+ * @see {@link TurnCompleteEvent.reason} which carries this value.
+ * @see {@link VoiceTurnMetadata.endpointReason} where it is forwarded to the agent.
  */
 export type EndpointReason = 'silence_timeout' | 'punctuation' | 'syntax_complete' | 'semantic_model' | 'manual' | 'timeout';
 /**
  * Emitted by {@link IEndpointDetector} when it determines the user has finished
  * their turn and the pipeline should hand off to the agent.
+ *
+ * @see {@link IEndpointDetector} which emits these on the `'turn_complete'` event.
+ * @see {@link VoicePipelineOrchestrator} which transitions to `'processing'` state upon receipt.
+ *
+ * @example
+ * ```typescript
+ * detector.on('turn_complete', (event: TurnCompleteEvent) => {
+ *   console.log(`User said: "${event.transcript}" (reason: ${event.reason})`);
+ * });
+ * ```
  */
 export interface TurnCompleteEvent {
     /**
      * The final consolidated transcript for this turn.
+     * May be empty for acoustic-only detectors that have no transcript access.
      */
     transcript: string;
     /**
      * Aggregate STT confidence score for the transcript, in the range [0, 1].
+     * Zero when no STT data is available (e.g. acoustic-only mode).
      */
     confidence: number;
     /**
      * Total duration of detected speech in this turn, in milliseconds.
+     * Computed as `speechEndTimestamp - speechStartTimestamp`.
      */
     durationMs: number;
     /**
      * The semantic reason that triggered turn completion.
+     * @see {@link EndpointReason} for the full set of possible values.
      */
     reason: EndpointReason;
 }
@@ -339,29 +505,36 @@ export interface TurnCompleteEvent {
  * Combines VAD events with linguistic signals to decide when the user
  * has finished speaking.
  *
- * Emits:
- * - `'turn_complete'` (TurnCompleteEvent) — the user's turn has ended.
- * - `'speech_start'` () — the user has started speaking (re-emitted from VAD).
- * - `'barge_in_detected'` () — user started speaking while TTS was playing.
+ * ## Events emitted
+ *
+ * | Event                  | Payload                 | Description                            |
+ * |------------------------|-------------------------|----------------------------------------|
+ * | `'turn_complete'`      | {@link TurnCompleteEvent} | The user's turn has ended.           |
+ * | `'speech_start'`       | *(none)*                | The user has started speaking.         |
+ * | `'barge_in_detected'`  | *(none)*                | User spoke while TTS was playing.      |
+ *
+ * @see {@link HeuristicEndpointDetector} for the rule-based implementation.
+ * @see {@link AcousticEndpointDetector} for the purely acoustic implementation.
  */
 export interface IEndpointDetector extends EventEmitter {
     /**
      * Active detection strategy:
-     * - `'silence'` — pure silence-timeout based.
-     * - `'hybrid'` — silence + linguistic completeness signals.
-     * - `'semantic'` — small LM scoring utterance completeness.
+     * - `'acoustic'` -- pure silence-timeout based (no transcript analysis).
+     * - `'heuristic'` -- silence + terminal punctuation + backchannel filtering.
+     * - `'semantic'` -- small LM scoring utterance completeness.
      */
     readonly mode: 'acoustic' | 'heuristic' | 'semantic';
     /**
      * Push a VAD event from the upstream voice activity detector.
      *
-     * @param event — The VAD event to process.
+     * @param event - The VAD event to process.
      */
     pushVadEvent(event: VadEvent): void;
     /**
      * Push a partial or final STT result for linguistic analysis.
+     * Acoustic-mode detectors may no-op this method.
      *
-     * @param event — Transcript event from the STT session.
+     * @param event - Transcript event from the STT session.
      */
     pushTranscript(event: TranscriptEvent): void;
     /**
@@ -373,11 +546,14 @@ export interface IEndpointDetector extends EventEmitter {
 /**
  * Configuration for a diarization session. Controls expected speaker count and
  * chunking behaviour for providers that require buffered audio.
+ *
+ * @see {@link IDiarizationEngine.startSession} which accepts this config.
  */
 export interface DiarizationConfig {
     /**
      * Hint to the provider about how many distinct speakers are expected.
-     * When omitted, the provider uses auto-detection.
+     * When omitted, the provider uses auto-detection (which typically adds
+     * latency as it needs more audio to stabilise speaker count).
      */
     expectedSpeakers?: number;
     /**
@@ -401,6 +577,8 @@ export interface DiarizationConfig {
 }
 /**
  * A contiguous segment of transcript text with millisecond timing metadata.
+ *
+ * @see {@link DiarizedSegment} which extends this with speaker attribution.
  */
 export interface TranscriptSegment {
     /**
@@ -419,6 +597,15 @@ export interface TranscriptSegment {
 /**
  * A {@link TranscriptSegment} extended with speaker attribution produced by the
  * diarization engine.
+ *
+ * @see {@link DiarizationSession} which emits these on the `'segment'` event.
+ *
+ * @example
+ * ```typescript
+ * diarizationSession.on('segment', (seg: DiarizedSegment) => {
+ *   console.log(`[${seg.speakerId}]: "${seg.text}"`);
+ * });
+ * ```
  */
 export interface DiarizedSegment extends TranscriptSegment {
     /**
@@ -428,7 +615,7 @@ export interface DiarizedSegment extends TranscriptSegment {
      */
     speakerId: string;
     /**
-     * Confidence that this segment belongs to `speakerId`, in the range [0, 1].
+     * Confidence that this segment belongs to {@link speakerId}, in the range [0, 1].
      */
     speakerConfidence: number;
 }
@@ -436,32 +623,39 @@ export interface DiarizedSegment extends TranscriptSegment {
  * An active diarization session. Accepts raw audio and outputs speaker-attributed
  * transcript segments via EventEmitter.
  *
- * Emits:
- * - `'segment'` (DiarizedSegment) — a diarized transcript segment is ready.
- * - `'speaker_change'` ({ from: string; to: string }) — speaker transition detected.
- * - `'error'` (Error) — unrecoverable engine error.
- * - `'close'` () — session terminated.
+ * ## Events emitted
+ *
+ * | Event              | Payload                                  | Description                    |
+ * |--------------------|------------------------------------------|--------------------------------|
+ * | `'segment'`        | {@link DiarizedSegment}                  | A diarized segment is ready.   |
+ * | `'speaker_change'` | `{ from: string; to: string }`           | Speaker transition detected.   |
+ * | `'error'`          | `Error`                                  | Unrecoverable engine error.    |
+ * | `'close'`          | *(none)*                                 | Session terminated.            |
+ *
+ * @see {@link IDiarizationEngine.startSession} which creates these sessions.
  */
 export interface DiarizationSession extends EventEmitter {
     /**
      * Push a raw audio frame for diarization analysis.
      *
-     * @param frame — PCM audio frame from the capture stream.
+     * @param frame - PCM audio frame from the capture stream.
      */
     pushAudio(frame: AudioFrame): void;
     /**
      * Apply speaker labels to an existing transcript using the session's
      * current speaker model. Returns labelled segments.
      *
-     * @param transcript — Plain transcript segments to label.
+     * @param transcript - Plain transcript segments to label.
+     * @returns Speaker-attributed segments with confidence scores.
      */
     labelTranscript(transcript: TranscriptSegment[]): Promise<DiarizedSegment[]>;
     /**
      * Enroll a known speaker so subsequent audio is attributed to a named identity
      * rather than an anonymous `SPEAKER_N` label.
      *
-     * @param speakerId — Stable identifier for the speaker (e.g. user UUID).
-     * @param samples — Representative audio frames for the speaker's voice.
+     * @param speakerId - Stable identifier for the speaker (e.g. user UUID).
+     * @param samples - Representative audio frames for the speaker's voice.
+     *   Typically 10-30 seconds of clean speech produces the best embeddings.
      */
     enrollSpeaker(speakerId: string, samples: AudioFrame[]): Promise<void>;
     /**
@@ -473,18 +667,33 @@ export interface DiarizationSession extends EventEmitter {
  * Factory interface for diarization (speaker separation) engines.
  *
  * Registered via `EXTENSION_KIND_DIARIZATION`.
+ *
+ * @see {@link DiarizationSession} for the session interface returned by {@link startSession}.
  */
 export interface IDiarizationEngine {
     /**
      * Open a new diarization session.
      *
-     * @param config — Session configuration controlling chunking and speaker hints.
+     * @param config - Session configuration controlling chunking and speaker hints.
+     * @returns A live session that accepts audio and emits diarized segments.
      */
     startSession(config?: DiarizationConfig): Promise<DiarizationSession>;
 }
 /**
  * Configuration passed to {@link IStreamingTTS.startSession} when opening a new
  * text-to-speech synthesis stream.
+ *
+ * @see {@link VoicePipelineConfig.ttsOptions} for provider-level overrides.
+ *
+ * @example
+ * ```typescript
+ * const config: StreamingTTSConfig = {
+ *   voice: 'nova',
+ *   format: 'opus',
+ *   sampleRate: 24000,
+ *   chunkingMode: 'sentence',
+ * };
+ * ```
  */
 export interface StreamingTTSConfig {
     /**
@@ -498,22 +707,22 @@ export interface StreamingTTSConfig {
      */
     format?: 'pcm' | 'mp3' | 'opus';
     /**
-     * Output sample rate in Hz. Must be supported by the chosen `format`.
+     * Output sample rate in Hz. Must be supported by the chosen {@link format}.
      * @defaultValue 24000
      */
     sampleRate?: number;
     /**
      * Controls how the provider segments incoming token streams into synthesis
      * requests:
-     * - `'sentence'` — flush at sentence boundaries (lower latency).
-     * - `'word'` — flush at word boundaries (minimum latency, may sound choppy).
-     * - `'paragraph'` — flush at paragraph boundaries (highest quality).
+     * - `'sentence'` -- flush at sentence boundaries (lower latency).
+     * - `'word'` -- flush at word boundaries (minimum latency, may sound choppy).
+     * - `'paragraph'` -- flush at paragraph boundaries (highest quality).
      * @defaultValue 'sentence'
      */
     chunkingMode?: 'sentence' | 'word' | 'paragraph';
     /**
      * Maximum number of milliseconds of audio to buffer before forcing a flush,
-     * regardless of `chunkingMode`. Prevents unbounded memory growth for very
+     * regardless of {@link chunkingMode}. Prevents unbounded memory growth for very
      * long utterances.
      * @defaultValue 3000
      */
@@ -527,18 +736,23 @@ export interface StreamingTTSConfig {
  * An active streaming TTS session. Token text is pushed in and encoded audio
  * chunks flow out via EventEmitter.
  *
- * Emits:
- * - `'audio'` (EncodedAudioChunk) — a synthesised audio chunk ready for playback.
- * - `'flush_complete'` () — all queued tokens have been synthesised.
- * - `'error'` (Error) — unrecoverable synthesis error.
- * - `'close'` () — session terminated.
+ * ## Events emitted
+ *
+ * | Event              | Payload                   | Description                          |
+ * |--------------------|---------------------------|--------------------------------------|
+ * | `'audio'`          | {@link EncodedAudioChunk}  | A synthesised chunk ready for playback. |
+ * | `'flush_complete'`  | *(none)*                  | All queued tokens have been synthesised. |
+ * | `'error'`          | `Error`                   | Unrecoverable synthesis error.       |
+ * | `'close'`          | *(none)*                  | Session terminated.                  |
+ *
+ * @see {@link IStreamingTTS.startSession} which creates these sessions.
  */
 export interface StreamingTTSSession extends EventEmitter {
     /**
      * Push one or more LLM output tokens into the synthesis buffer.
-     * The session will chunk and synthesise them according to `chunkingMode`.
+     * The session will chunk and synthesise them according to {@link StreamingTTSConfig.chunkingMode}.
      *
-     * @param tokens — Text tokens to synthesise (may be partial words).
+     * @param tokens - Text tokens to synthesise (may be partial words).
      */
     pushTokens(tokens: string): void;
     /**
@@ -549,6 +763,7 @@ export interface StreamingTTSSession extends EventEmitter {
     /**
      * Immediately stop synthesis and discard all buffered tokens. Audio chunks
      * currently in-flight are not recalled; the caller must stop playback separately.
+     * Used during barge-in to halt the agent's response.
      */
     cancel(): void;
     /**
@@ -560,6 +775,8 @@ export interface StreamingTTSSession extends EventEmitter {
  * Factory interface for streaming text-to-speech providers.
  *
  * Registered via `EXTENSION_KIND_STREAMING_TTS`.
+ *
+ * @see {@link StreamingTTSSession} for the session interface returned by {@link startSession}.
  */
 export interface IStreamingTTS {
     /**
@@ -569,33 +786,64 @@ export interface IStreamingTTS {
     /**
      * Open a new streaming synthesis session.
      *
-     * @param config — Session-level configuration overriding provider defaults.
+     * @param config - Session-level configuration overriding provider defaults.
+     * @returns A live session that accepts tokens and emits audio chunks.
+     * @throws {Error} If the provider fails to initialise (e.g. invalid API key).
      */
     startSession(config?: StreamingTTSConfig): Promise<StreamingTTSSession>;
 }
 /**
  * Contextual information supplied to {@link IBargeinHandler.handleBargein} so the
  * handler can make an informed decision about how to respond to interruption.
+ *
+ * @see {@link IBargeinHandler} which consumes this context.
+ * @see {@link HardCutBargeinHandler} and {@link SoftFadeBargeinHandler} for concrete handlers.
+ *
+ * @example
+ * ```typescript
+ * const context: BargeinContext = {
+ *   speechDurationMs: 450,
+ *   interruptedText: 'I was explaining the process of...',
+ *   playedDurationMs: 2300,
+ * };
+ * ```
  */
 export interface BargeinContext {
     /**
      * Duration of detected user speech before the barge-in was confirmed, in ms.
-     * Short durations may indicate accidental noise rather than intentional interruption.
+     * Short durations (< 100 ms) often indicate accidental noise, lip smacks,
+     * or breaths rather than intentional interruption.
+     *
+     * @see {@link HardCutBargeinHandler} which uses a 300 ms default threshold.
+     * @see {@link SoftFadeBargeinHandler} which uses a tiered threshold system.
      */
     speechDurationMs: number;
     /**
-     * The partial TTS text that was interrupted. Used to construct `interruptedRemainder`
-     * in {@link VoiceTurnMetadata}.
+     * The partial TTS text that was interrupted. Used to construct
+     * {@link VoiceTurnMetadata.interruptedRemainder} so the agent knows what
+     * information was cut off and can avoid repeating it.
      */
     interruptedText: string;
     /**
      * How many milliseconds of audio had been played at the point of interruption.
+     * Combined with {@link interruptedText}, this allows the agent to estimate
+     * how much of the response the user actually heard.
      */
     playedDurationMs: number;
 }
 /**
  * Action the pipeline should take in response to a detected barge-in.
  * Returned by {@link IBargeinHandler.handleBargein}.
+ *
+ * @see {@link IBargeinHandler} which returns this type.
+ *
+ * @example
+ * ```typescript
+ * const cancelAction: BargeinAction = { type: 'cancel', injectMarker: '[interrupted]' };
+ * const pauseAction: BargeinAction  = { type: 'pause', fadeMs: 150 };
+ * const resumeAction: BargeinAction = { type: 'resume' };
+ * const ignoreAction: BargeinAction = { type: 'ignore' };
+ * ```
  */
 export type BargeinAction = {
     /** Immediately stop all TTS output and discard the remainder of the response. */
@@ -606,7 +854,7 @@ export type BargeinAction = {
      */
     injectMarker?: string;
 } | {
-    /** Fade out TTS audio over `fadeMs` milliseconds then pause. */
+    /** Fade out TTS audio over {@link fadeMs} milliseconds then pause. */
     type: 'pause';
     /** Duration of the fade-out in milliseconds. @defaultValue 150 */
     fadeMs?: number;
@@ -627,19 +875,22 @@ export type BargeinAction = {
  * Handles the policy decision when a barge-in (user speaking over TTS) is detected.
  *
  * Registered via `EXTENSION_KIND_BARGEIN_HANDLER`.
+ *
+ * @see {@link HardCutBargeinHandler} for the immediate-stop strategy.
+ * @see {@link SoftFadeBargeinHandler} for the three-tier fade strategy.
  */
 export interface IBargeinHandler {
     /**
      * Interruption strategy implemented by this handler:
-     * - `'hard-cut'` — TTS audio is stopped immediately with no fade.
-     * - `'soft-fade'` — TTS audio fades out over a short window before stopping.
+     * - `'hard-cut'` -- TTS audio is stopped immediately with no fade.
+     * - `'soft-fade'` -- TTS audio fades out over a short window before stopping.
      */
     readonly mode: 'hard-cut' | 'soft-fade';
     /**
      * Called by the pipeline when a barge-in is confirmed. The handler evaluates
      * the context and returns the action the pipeline should execute.
      *
-     * @param context — Contextual snapshot at the moment of interruption.
+     * @param context - Contextual snapshot at the moment of interruption.
      * @returns The action to perform (or a promise resolving to one).
      */
     handleBargein(context: BargeinContext): BargeinAction | Promise<BargeinAction>;
@@ -647,21 +898,32 @@ export interface IBargeinHandler {
 /**
  * Adapts any AgentOS agent to the voice pipeline's turn-based protocol.
  *
- * The pipeline calls {@link IVoicePipelineAgentSession.sendText} with the user's
- * final transcript and streams the response back as text tokens for TTS synthesis.
+ * The pipeline calls {@link sendText} with the user's final transcript and
+ * streams the response back as text tokens for TTS synthesis.
+ *
+ * @see {@link VoicePipelineOrchestrator} which invokes this during the
+ *   `PROCESSING -> SPEAKING` state transition.
  */
 export interface IVoicePipelineAgentSession {
     /**
      * Send the user's utterance to the agent and receive a streaming text response.
      *
-     * @param text — Final transcript from the STT + endpoint detection pipeline.
-     * @param metadata — Rich metadata about the current voice turn.
+     * @param text - Final transcript from the STT + endpoint detection pipeline.
+     * @param metadata - Rich metadata about the current voice turn.
      * @returns An async iterable of text tokens (suitable for streaming into TTS).
+     *
+     * @example
+     * ```typescript
+     * const tokens = agentSession.sendText('What is the weather?', metadata);
+     * for await (const token of tokens) {
+     *   ttsSession.pushTokens(token);
+     * }
+     * ```
      */
     sendText(text: string, metadata: VoiceTurnMetadata): AsyncIterable<string>;
     /**
      * Abort the current agent response mid-stream (called on barge-in when
-     * `BargeinAction.type === 'cancel'`).
+     * the {@link BargeinAction} type is `'cancel'`).
      *
      * Implementations should cancel any in-flight LLM requests. The pipeline
      * will discard any tokens emitted after `abort()` is called.
@@ -671,6 +933,19 @@ export interface IVoicePipelineAgentSession {
 /**
  * Rich metadata attached to each voice turn and passed to the agent session.
  * Enables the agent to tailor its response based on conversation dynamics.
+ *
+ * @see {@link IVoicePipelineAgentSession.sendText} which receives this metadata.
+ *
+ * @example
+ * ```typescript
+ * const metadata: VoiceTurnMetadata = {
+ *   speakers: ['user'],
+ *   endpointReason: 'punctuation',
+ *   speechDurationMs: 3200,
+ *   wasInterrupted: false,
+ *   transcriptConfidence: 0.92,
+ * };
+ * ```
  */
 export interface VoiceTurnMetadata {
     /**
@@ -680,6 +955,7 @@ export interface VoiceTurnMetadata {
     speakers: string[];
     /**
      * The reason the endpoint detector decided the user had finished speaking.
+     * @see {@link EndpointReason} for the full set of possible values.
      */
     endpointReason: EndpointReason;
     /**
@@ -692,8 +968,9 @@ export interface VoiceTurnMetadata {
      */
     wasInterrupted: boolean;
     /**
-     * When `wasInterrupted` is `true`, the text remainder of the agent response
-     * that was cut off. Useful for the agent to avoid re-stating information.
+     * When {@link wasInterrupted} is `true`, the text remainder of the agent response
+     * that was cut off. Useful for the agent to resume from the cut-off point
+     * without re-stating information the user has already heard.
      */
     interruptedRemainder?: string;
     /**
@@ -704,6 +981,21 @@ export interface VoiceTurnMetadata {
 /**
  * Top-level configuration for the {@link VoicePipelineSession}.
  * Specifies which providers to use and their session-level options.
+ *
+ * @see {@link VoicePipelineOrchestrator} which consumes this configuration.
+ *
+ * @example
+ * ```typescript
+ * const config: VoicePipelineConfig = {
+ *   stt: 'deepgram',
+ *   tts: 'openai',
+ *   endpointing: 'heuristic',
+ *   bargeIn: 'hard-cut',
+ *   voice: 'nova',
+ *   format: 'opus',
+ *   language: 'en-US',
+ * };
+ * ```
  */
 export interface VoicePipelineConfig {
     /**
@@ -719,7 +1011,8 @@ export interface VoicePipelineConfig {
      */
     tts: string;
     /**
-     * Endpoint detection strategy. Defaults to `'hybrid'` when omitted.
+     * Endpoint detection strategy. Defaults to `'heuristic'` when omitted.
+     * @see {@link IEndpointDetector.mode} for the strategy descriptions.
      */
     endpointing?: 'acoustic' | 'heuristic' | 'semantic';
     /**
@@ -728,6 +1021,7 @@ export interface VoicePipelineConfig {
     diarization?: boolean;
     /**
      * Barge-in (interruption) handling mode. Defaults to `'hard-cut'` when omitted.
+     * @see {@link HardCutBargeinHandler} and {@link SoftFadeBargeinHandler}.
      */
     bargeIn?: 'hard-cut' | 'soft-fade' | 'disabled';
     /**
@@ -761,26 +1055,35 @@ export interface VoicePipelineConfig {
 /**
  * Lifecycle state of a {@link VoicePipelineSession}.
  *
- * Valid transitions:
+ * ## Valid state transitions
+ *
  * ```
- * idle → listening → processing → speaking → listening
- *                                          → interrupting → listening
- * any  → closed
+ * idle -> listening -> processing -> speaking -> listening
+ *                                             -> interrupting -> listening
+ * any  -> closed
  * ```
+ *
+ * The state machine is enforced by {@link VoicePipelineOrchestrator._setState}
+ * which emits `'state_change'` on every transition.
  */
 export type PipelineState = 'idle' | 'listening' | 'processing' | 'speaking' | 'interrupting' | 'closed';
 /**
  * A live voice pipeline session binding a transport, STT, endpoint detection,
  * optional diarization, agent, and TTS into a single coordinated lifecycle.
  *
- * Emits:
- * - `'state_change'` (PipelineState) — pipeline state machine transition.
- * - `'turn_complete'` (TurnCompleteEvent) — user turn detected.
- * - `'agent_response_start'` () — agent has begun generating a response.
- * - `'agent_response_end'` () — agent response fully synthesised and played.
- * - `'barge_in'` (BargeinContext) — user interrupted TTS playback.
- * - `'error'` (Error) — unrecoverable pipeline error.
- * - `'close'` () — session has been fully torn down.
+ * ## Events emitted
+ *
+ * | Event                    | Payload                   | Description                             |
+ * |--------------------------|---------------------------|-----------------------------------------|
+ * | `'state_change'`         | {@link PipelineState}     | Pipeline state machine transition.      |
+ * | `'turn_complete'`        | {@link TurnCompleteEvent} | User turn detected.                     |
+ * | `'agent_response_start'` | *(none)*                  | Agent has begun generating a response.  |
+ * | `'agent_response_end'`   | *(none)*                  | Agent response fully played.            |
+ * | `'barge_in'`             | {@link BargeinContext}     | User interrupted TTS playback.          |
+ * | `'error'`                | `Error`                   | Unrecoverable pipeline error.           |
+ * | `'close'`                | *(none)*                  | Session has been fully torn down.       |
+ *
+ * @see {@link VoicePipelineOrchestrator.startSession} which creates these sessions.
  */
 export interface VoicePipelineSession extends EventEmitter {
     /**
@@ -789,6 +1092,7 @@ export interface VoicePipelineSession extends EventEmitter {
     readonly sessionId: string;
     /**
      * Current pipeline state machine state.
+     * @see {@link PipelineState} for the full set of states and transitions.
      */
     readonly state: PipelineState;
     /**
@@ -797,16 +1101,26 @@ export interface VoicePipelineSession extends EventEmitter {
      */
     readonly transport: IStreamTransport;
     /**
-     * Gracefully close the session — flush in-flight audio, tear down all sub-sessions,
+     * Gracefully close the session -- flush in-flight audio, tear down all sub-sessions,
      * and emit `'close'`.
      *
-     * @param reason — Optional human-readable reason for diagnostics.
+     * @param reason - Optional human-readable reason for diagnostics.
      */
     close(reason?: string): Promise<void>;
 }
 /**
  * Messages sent from the client (browser/app) to the server over the transport.
  * All messages are JSON-serialised.
+ *
+ * @see {@link ServerTextMessage} for the server-to-client counterpart.
+ *
+ * @example
+ * ```typescript
+ * const configMsg: ClientTextMessage = {
+ *   type: 'config',
+ *   config: { stt: 'deepgram', tts: 'openai' },
+ * };
+ * ```
  */
 export type ClientTextMessage = {
     /**
@@ -827,6 +1141,18 @@ export type ClientTextMessage = {
 /**
  * Messages sent from the server to the client over the transport.
  * All messages are JSON-serialised.
+ *
+ * @see {@link ClientTextMessage} for the client-to-server counterpart.
+ * @see {@link IStreamTransport.sendControl} which sends these messages.
+ *
+ * @example
+ * ```typescript
+ * const sessionStarted: ServerTextMessage = {
+ *   type: 'session_started',
+ *   sessionId: 'abc-123',
+ *   config: { stt: 'deepgram', tts: 'openai' },
+ * };
+ * ```
  */
 export type ServerTextMessage = {
     /**
@@ -858,8 +1184,8 @@ export type ServerTextMessage = {
     type: 'agent_thinking';
 } | {
     /**
-     * Emitted when TTS synthesis begins — audio chunks will follow over the audio channel.
-     * Clients may hide thinking indicators.
+     * Emitted when TTS synthesis begins -- audio chunks will follow over the audio channel.
+     * Clients may hide thinking indicators and prepare audio playback.
      */
     type: 'agent_speaking';
     /**
@@ -890,16 +1216,16 @@ export type ServerTextMessage = {
      * The session will be closed after this message.
      */
     type: 'error';
-    /** Machine-readable error code. */
+    /** Machine-readable error code (e.g. `'STT_PROVIDER_ERROR'`). */
     code: string;
-    /** Human-readable description. */
+    /** Human-readable description of the error. */
     message: string;
 } | {
     /**
      * Emitted as the final message before the server closes the transport.
      */
     type: 'session_ended';
-    /** Optional human-readable reason. */
+    /** Optional human-readable reason for the session ending. */
     reason?: string;
 };
 //# sourceMappingURL=types.d.ts.map