@framers/agentos 0.1.108 → 0.1.110

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/dist/orchestration/runtime/GraphRuntime.d.ts.map +1 -1
  2. package/dist/orchestration/runtime/GraphRuntime.js +11 -4
  3. package/dist/orchestration/runtime/GraphRuntime.js.map +1 -1
  4. package/dist/orchestration/runtime/safeExpressionEvaluator.d.ts.map +1 -1
  5. package/dist/orchestration/runtime/safeExpressionEvaluator.js +35 -16
  6. package/dist/orchestration/runtime/safeExpressionEvaluator.js.map +1 -1
  7. package/dist/voice-pipeline/AcousticEndpointDetector.d.ts +95 -20
  8. package/dist/voice-pipeline/AcousticEndpointDetector.d.ts.map +1 -1
  9. package/dist/voice-pipeline/AcousticEndpointDetector.js +110 -24
  10. package/dist/voice-pipeline/AcousticEndpointDetector.js.map +1 -1
  11. package/dist/voice-pipeline/HardCutBargeinHandler.d.ts +66 -15
  12. package/dist/voice-pipeline/HardCutBargeinHandler.d.ts.map +1 -1
  13. package/dist/voice-pipeline/HardCutBargeinHandler.js +65 -13
  14. package/dist/voice-pipeline/HardCutBargeinHandler.js.map +1 -1
  15. package/dist/voice-pipeline/HeuristicEndpointDetector.d.ts +116 -42
  16. package/dist/voice-pipeline/HeuristicEndpointDetector.d.ts.map +1 -1
  17. package/dist/voice-pipeline/HeuristicEndpointDetector.js +159 -52
  18. package/dist/voice-pipeline/HeuristicEndpointDetector.js.map +1 -1
  19. package/dist/voice-pipeline/SoftFadeBargeinHandler.d.ts +89 -24
  20. package/dist/voice-pipeline/SoftFadeBargeinHandler.d.ts.map +1 -1
  21. package/dist/voice-pipeline/SoftFadeBargeinHandler.js +74 -20
  22. package/dist/voice-pipeline/SoftFadeBargeinHandler.js.map +1 -1
  23. package/dist/voice-pipeline/VoiceInterruptError.d.ts +68 -10
  24. package/dist/voice-pipeline/VoiceInterruptError.d.ts.map +1 -1
  25. package/dist/voice-pipeline/VoiceInterruptError.js +53 -6
  26. package/dist/voice-pipeline/VoiceInterruptError.js.map +1 -1
  27. package/dist/voice-pipeline/VoicePipelineOrchestrator.d.ts +190 -39
  28. package/dist/voice-pipeline/VoicePipelineOrchestrator.d.ts.map +1 -1
  29. package/dist/voice-pipeline/VoicePipelineOrchestrator.js +266 -53
  30. package/dist/voice-pipeline/VoicePipelineOrchestrator.js.map +1 -1
  31. package/dist/voice-pipeline/WebSocketStreamTransport.d.ts +135 -43
  32. package/dist/voice-pipeline/WebSocketStreamTransport.d.ts.map +1 -1
  33. package/dist/voice-pipeline/WebSocketStreamTransport.js +109 -47
  34. package/dist/voice-pipeline/WebSocketStreamTransport.js.map +1 -1
  35. package/dist/voice-pipeline/index.d.ts +34 -1
  36. package/dist/voice-pipeline/index.d.ts.map +1 -1
  37. package/dist/voice-pipeline/index.js +41 -1
  38. package/dist/voice-pipeline/index.js.map +1 -1
  39. package/dist/voice-pipeline/types.d.ts +432 -106
  40. package/dist/voice-pipeline/types.d.ts.map +1 -1
  41. package/dist/voice-pipeline/types.js +21 -9
  42. package/dist/voice-pipeline/types.js.map +1 -1
  43. package/package.json +1 -1
@@ -8,41 +8,78 @@
8
8
  * system. All heavy I/O crosses EventEmitter-based session boundaries to keep
9
9
  * the hot path non-blocking.
10
10
  *
11
- * Dependency order (no circular refs):
11
+ * ## Dependency order (no circular refs)
12
+ *
13
+ * ```
12
14
  * AudioFrame / EncodedAudioChunk
13
- * Transport (IStreamTransport)
14
- * STT (IStreamingSTT + StreamingSTTSession)
15
- * Endpoint detection (IEndpointDetector + VadEvent)
16
- * Diarization (IDiarizationEngine + DiarizationSession)
17
- * TTS (IStreamingTTS + StreamingTTSSession)
18
- * Barge-in (IBargeinHandler)
19
- * Session (VoicePipelineSession)
20
- * Protocol messages (ClientTextMessage, ServerTextMessage)
15
+ * -> Transport (IStreamTransport)
16
+ * -> STT (IStreamingSTT + StreamingSTTSession)
17
+ * -> Endpoint detection (IEndpointDetector + VadEvent)
18
+ * -> Diarization (IDiarizationEngine + DiarizationSession)
19
+ * -> TTS (IStreamingTTS + StreamingTTSSession)
20
+ * -> Barge-in (IBargeinHandler)
21
+ * -> Session (VoicePipelineSession)
22
+ * -> Protocol messages (ClientTextMessage, ServerTextMessage)
23
+ * ```
24
+ *
25
+ * ## Design rationale
26
+ *
27
+ * Every interface in this module is kept deliberately narrow so that
28
+ * implementations can be swapped at runtime (e.g. Deepgram STT vs Whisper
29
+ * vs browser WebSpeechAPI) without touching the orchestrator. The
30
+ * EventEmitter-based session pattern was chosen over callback interfaces
31
+ * because it naturally supports fan-out (multiple listeners), and because
32
+ * backpressure is handled at the transport level rather than per-callback.
21
33
  */
22
34
  import type { EventEmitter } from 'node:events';
23
35
  /**
24
36
  * A single frame of raw PCM audio, as produced by a microphone capture or
25
- * a VAD pre-processor. Each frame typically represents 1020 ms of audio.
37
+ * a VAD pre-processor. Each frame typically represents 10-20 ms of audio.
38
+ *
39
+ * @see {@link EncodedAudioChunk} for the compressed counterpart used in TTS output.
40
+ *
41
+ * @example
42
+ * ```typescript
43
+ * const frame: AudioFrame = {
44
+ * samples: new Float32Array(320), // 20 ms @ 16 kHz
45
+ * sampleRate: 16000,
46
+ * timestamp: Date.now(),
47
+ * };
48
+ * ```
26
49
  */
27
50
  export interface AudioFrame {
28
51
  /**
29
52
  * Interleaved 32-bit float PCM samples, normalised to [-1, 1].
30
53
  * For mono audio this is a flat array; stereo interleaves L/R pairs.
54
+ *
55
+ * Float32Array is chosen over Int16Array because it avoids quantisation
56
+ * artefacts in DSP operations (e.g. energy calculation, resampling) and
57
+ * is the native format for Web Audio API.
31
58
  */
32
59
  samples: Float32Array;
33
60
  /**
34
61
  * Samples per second (e.g. 16000, 24000, 48000).
62
+ *
63
+ * 16 kHz is the standard for telephony and most STT engines. 24 kHz is
64
+ * typical for TTS output. The pipeline resamples internally when STT
65
+ * and TTS sample rates differ.
35
66
  */
36
67
  sampleRate: number;
37
68
  /**
38
69
  * Unix epoch millisecond timestamp at which this frame was captured.
39
70
  * Used for synchronisation across STT, VAD, and diarization streams.
71
+ *
72
+ * Must be monotonically increasing within a session. Out-of-order
73
+ * frames degrade STT accuracy and confuse the endpoint detector's
74
+ * duration tracking.
40
75
  */
41
76
  timestamp: number;
42
77
  /**
43
78
  * Optional hint from the capture layer identifying the speaker source
44
79
  * (e.g. a hardware device label or a WebRTC peer ID). Used by the
45
80
  * diarization engine when native speaker IDs are unavailable.
81
+ *
82
+ * @see {@link DiarizedSegment.speakerId} for the post-diarization label.
46
83
  */
47
84
  speakerHint?: string;
48
85
  }
@@ -50,14 +87,32 @@ export interface AudioFrame {
50
87
  * A compressed audio chunk ready for transmission over the wire (e.g. to a
51
88
  * TTS websocket or a playback buffer). Contains the rendered text to allow
52
89
  * barge-in handlers to track interrupted utterance state.
90
+ *
91
+ * @see {@link AudioFrame} for the uncompressed PCM counterpart used in capture.
92
+ * @see {@link StreamingTTSSession} which emits these on the `'audio'` event.
93
+ *
94
+ * @example
95
+ * ```typescript
96
+ * const chunk: EncodedAudioChunk = {
97
+ * audio: Buffer.from([...opusBytes]),
98
+ * format: 'opus',
99
+ * sampleRate: 24000,
100
+ * durationMs: 60,
101
+ * text: 'Hello there!',
102
+ * };
103
+ * ```
53
104
  */
54
105
  export interface EncodedAudioChunk {
55
106
  /**
56
- * Raw encoded bytes in the format specified by `format`.
107
+ * Raw encoded bytes in the format specified by {@link format}.
57
108
  */
58
109
  audio: Buffer;
59
110
  /**
60
- * Codec/container format of `audio`.
111
+ * Codec/container format of {@link audio}.
112
+ *
113
+ * - `'pcm'` -- raw signed 16-bit LE samples (lowest latency, highest bandwidth).
114
+ * - `'mp3'` -- MPEG Layer 3 (wide browser support, moderate latency).
115
+ * - `'opus'` -- Opus in OGG container (best quality/size ratio, recommended default).
61
116
  */
62
117
  format: 'pcm' | 'mp3' | 'opus';
63
118
  /**
@@ -66,17 +121,29 @@ export interface EncodedAudioChunk {
66
121
  sampleRate: number;
67
122
  /**
68
123
  * Playback duration of this chunk in milliseconds.
124
+ * Used by the orchestrator to track cumulative played time for
125
+ * barge-in context ({@link BargeinContext.playedDurationMs}).
69
126
  */
70
127
  durationMs: number;
71
128
  /**
72
129
  * The text fragment that was synthesised into this chunk. Preserved so
73
- * barge-in handlers can report `interruptedRemainder` accurately.
130
+ * barge-in handlers can report {@link VoiceTurnMetadata.interruptedRemainder}
131
+ * accurately when playback is cut short.
74
132
  */
75
133
  text: string;
76
134
  }
77
135
  /**
78
136
  * Discriminated union of control messages sent from the pipeline to the
79
137
  * underlying stream transport (e.g. a WebSocket or WebRTC data-channel).
138
+ *
139
+ * @see {@link IStreamTransport.sendControl} which accepts these messages.
140
+ * @see {@link ServerTextMessage} for the full server-to-client protocol.
141
+ *
142
+ * @example
143
+ * ```typescript
144
+ * const muteMsg: TransportControlMessage = { type: 'mute' };
145
+ * const stopMsg: TransportControlMessage = { type: 'stop', reason: 'session timeout' };
146
+ * ```
80
147
  */
81
148
  export type TransportControlMessage = {
82
149
  /** Mute the outbound audio stream without closing the session. */
@@ -99,50 +166,76 @@ export type TransportControlMessage = {
99
166
  * Abstraction over any bidirectional audio/text stream transport.
100
167
  * Implementations include WebSocket, WebRTC data-channel, and in-process pipes.
101
168
  *
102
- * Emits:
103
- * - `'audio'` (AudioFrame) inbound audio from the remote client.
104
- * - `'message'` (ClientTextMessage) inbound JSON control message from the client.
105
- * - `'close'` () — transport has been closed (either side).
106
- * - `'error'` (Error) — fatal transport error.
169
+ * The transport layer is intentionally thin: it handles framing and I/O but
170
+ * knows nothing about STT, TTS, or conversation state. This separation lets
171
+ * the pipeline swap transports (e.g. WebSocket -> WebRTC) without touching
172
+ * any voice logic.
173
+ *
174
+ * ## Events emitted
175
+ *
176
+ * | Event | Payload | Description |
177
+ * |-------------|-----------------------|----------------------------------------|
178
+ * | `'audio'` | {@link AudioFrame} | Inbound audio from the remote client. |
179
+ * | `'message'` | {@link ClientTextMessage} | Inbound JSON control from the client. |
180
+ * | `'close'` | *(none)* | Transport has been closed (either side). |
181
+ * | `'error'` | `Error` | Fatal transport error. |
182
+ *
183
+ * @see {@link WebSocketStreamTransport} for the canonical WebSocket implementation.
107
184
  */
108
185
  export interface IStreamTransport extends EventEmitter {
109
186
  /**
110
187
  * Stable identifier for this transport connection (e.g. a UUID or socket ID).
188
+ * Used as a correlation key in logs and metrics.
111
189
  */
112
190
  readonly id: string;
113
191
  /**
114
192
  * Current connection state.
115
- * - `'connecting'` handshake in progress.
116
- * - `'open'` fully established and ready.
117
- * - `'closing'` graceful teardown initiated.
118
- * - `'closed'` no longer usable.
193
+ * - `'connecting'` -- handshake in progress.
194
+ * - `'open'` -- fully established and ready.
195
+ * - `'closing'` -- graceful teardown initiated.
196
+ * - `'closed'` -- no longer usable.
119
197
  */
120
198
  readonly state: 'connecting' | 'open' | 'closing' | 'closed';
121
199
  /**
122
200
  * Send a synthesised audio chunk to the remote client for playback.
123
201
  * Resolves once the chunk has been handed to the underlying I/O layer.
124
202
  *
125
- * @param chunk Encoded audio to deliver.
203
+ * @param chunk - Encoded audio to deliver.
204
+ * @returns Resolves when the data has been buffered for transmission.
205
+ * @throws {Error} If the transport is not in `'open'` state.
126
206
  */
127
207
  sendAudio(chunk: EncodedAudioChunk): Promise<void>;
128
208
  /**
129
209
  * Send a JSON control message to the remote client.
130
210
  *
131
- * @param message Server-side protocol message.
211
+ * @param message - Server-side protocol message.
212
+ * @returns Resolves when the data has been buffered for transmission.
213
+ * @throws {Error} If the transport is not in `'open'` state.
132
214
  */
133
215
  sendControl(message: ServerTextMessage): Promise<void>;
134
216
  /**
135
217
  * Close the transport, optionally supplying a WebSocket-style close code and
136
218
  * human-readable reason string for diagnostics.
137
219
  *
138
- * @param code Optional numeric close code (defaults to 1000 normal closure).
139
- * @param reason Optional human-readable close reason.
220
+ * @param code - Optional numeric close code (defaults to 1000 normal closure).
221
+ * @param reason - Optional human-readable close reason.
140
222
  */
141
223
  close(code?: number, reason?: string): void;
142
224
  }
143
225
  /**
144
226
  * Configuration passed to {@link IStreamingSTT.startSession} when opening a new
145
227
  * speech recognition stream.
228
+ *
229
+ * @see {@link VoicePipelineConfig.sttOptions} for provider-level overrides.
230
+ *
231
+ * @example
232
+ * ```typescript
233
+ * const config: StreamingSTTConfig = {
234
+ * language: 'en-US',
235
+ * interimResults: true,
236
+ * punctuate: true,
237
+ * };
238
+ * ```
146
239
  */
147
240
  export interface StreamingSTTConfig {
148
241
  /**
@@ -153,11 +246,14 @@ export interface StreamingSTTConfig {
153
246
  /**
154
247
  * Whether to emit interim (non-final) transcript events. When `true`,
155
248
  * partial results arrive more frequently at the cost of higher word error rate.
249
+ * Interim results are useful for real-time UI display and early endpoint hints.
156
250
  * @defaultValue true
157
251
  */
158
252
  interimResults?: boolean;
159
253
  /**
160
254
  * Enable automatic punctuation insertion if the provider supports it.
255
+ * Punctuation is critical for the {@link HeuristicEndpointDetector} which
256
+ * uses terminal punctuation (`.`, `?`, `!`) as a turn-completion signal.
161
257
  * @defaultValue true
162
258
  */
163
259
  punctuate?: boolean;
@@ -176,6 +272,8 @@ export interface StreamingSTTConfig {
176
272
  /**
177
273
  * A single word within a {@link TranscriptEvent}, augmented with timing and
178
274
  * optional speaker attribution.
275
+ *
276
+ * @see {@link TranscriptEvent.words} which contains an array of these.
179
277
  */
180
278
  export interface TranscriptWord {
181
279
  /**
@@ -192,6 +290,7 @@ export interface TranscriptWord {
192
290
  end: number;
193
291
  /**
194
292
  * Recognition confidence in the range [0, 1]. Higher is better.
293
+ * Typically 0.8+ for clear speech, 0.4-0.7 for noisy or accented audio.
195
294
  */
196
295
  confidence: number;
197
296
  /**
@@ -204,6 +303,17 @@ export interface TranscriptWord {
204
303
  /**
205
304
  * Emitted by a {@link StreamingSTTSession} each time the provider produces a
206
305
  * recognition hypothesis.
306
+ *
307
+ * @see {@link IEndpointDetector.pushTranscript} which consumes these events.
308
+ *
309
+ * @example
310
+ * ```typescript
311
+ * sttSession.on('transcript', (event: TranscriptEvent) => {
312
+ * if (event.isFinal) {
313
+ * console.log(`Final: "${event.text}" (confidence: ${event.confidence})`);
314
+ * }
315
+ * });
316
+ * ```
207
317
  */
208
318
  export interface TranscriptEvent {
209
319
  /**
@@ -211,7 +321,7 @@ export interface TranscriptEvent {
211
321
  */
212
322
  text: string;
213
323
  /**
214
- * Aggregate confidence score for `text` in the range [0, 1].
324
+ * Aggregate confidence score for {@link text} in the range [0, 1].
215
325
  */
216
326
  confidence: number;
217
327
  /**
@@ -222,6 +332,9 @@ export interface TranscriptEvent {
222
332
  /**
223
333
  * `true` when this hypothesis is stable and will not be revised.
224
334
  * `false` for interim (streaming) hypotheses.
335
+ *
336
+ * The {@link HeuristicEndpointDetector} only accumulates final transcripts;
337
+ * interim results are discarded to avoid double-counting.
225
338
  */
226
339
  isFinal: boolean;
227
340
  /**
@@ -234,17 +347,22 @@ export interface TranscriptEvent {
234
347
  * An active streaming speech-to-text session. Audio frames are pushed in
235
348
  * and transcript events flow out via EventEmitter.
236
349
  *
237
- * Emits:
238
- * - `'transcript'` (TranscriptEvent) — interim or final hypothesis.
239
- * - `'error'` (Error) — unrecoverable provider error.
240
- * - `'close'` () — session has been fully terminated.
350
+ * ## Events emitted
351
+ *
352
+ * | Event | Payload | Description |
353
+ * |----------------|-----------------------|------------------------------------|
354
+ * | `'transcript'` | {@link TranscriptEvent} | Interim or final hypothesis. |
355
+ * | `'error'` | `Error` | Unrecoverable provider error. |
356
+ * | `'close'` | *(none)* | Session has been fully terminated. |
357
+ *
358
+ * @see {@link IStreamingSTT.startSession} which creates these sessions.
241
359
  */
242
360
  export interface StreamingSTTSession extends EventEmitter {
243
361
  /**
244
362
  * Push a raw audio frame into the recognition stream. Frames must arrive
245
363
  * in capture order; gaps or out-of-order frames degrade accuracy.
246
364
  *
247
- * @param frame PCM audio frame to process.
365
+ * @param frame - PCM audio frame to process.
248
366
  */
249
367
  pushAudio(frame: AudioFrame): void;
250
368
  /**
@@ -263,6 +381,8 @@ export interface StreamingSTTSession extends EventEmitter {
263
381
  *
264
382
  * Implementations are registered via the `EXTENSION_KIND_STREAMING_STT`
265
383
  * extension kind and resolved by the voice pipeline at session creation time.
384
+ *
385
+ * @see {@link StreamingSTTSession} for the session interface returned by {@link startSession}.
266
386
  */
267
387
  export interface IStreamingSTT {
268
388
  /**
@@ -276,61 +396,107 @@ export interface IStreamingSTT {
276
396
  /**
277
397
  * Open a new streaming recognition session.
278
398
  *
279
- * @param config Session-level configuration overriding provider defaults.
399
+ * @param config - Session-level configuration overriding provider defaults.
280
400
  * @returns A ready-to-use session whose lifecycle is independent of this factory.
401
+ * @throws {Error} If the provider fails to initialise (e.g. invalid API key).
281
402
  */
282
403
  startSession(config?: StreamingSTTConfig): Promise<StreamingSTTSession>;
283
404
  }
284
405
  /**
285
406
  * A VAD (Voice Activity Detection) or STT-derived event describing speech
286
407
  * energy transitions over time.
408
+ *
409
+ * @see {@link IEndpointDetector.pushVadEvent} which consumes these.
410
+ *
411
+ * @example
412
+ * ```typescript
413
+ * const speechStart: VadEvent = {
414
+ * type: 'speech_start',
415
+ * timestamp: Date.now(),
416
+ * source: 'vad',
417
+ * energyLevel: 0.42,
418
+ * };
419
+ * ```
287
420
  */
288
421
  export interface VadEvent {
289
422
  /**
290
423
  * Type of the VAD transition:
291
- * - `'speech_start'` voice energy detected after silence.
292
- * - `'speech_end'` voice energy fell below the silence threshold.
293
- * - `'silence'` periodic silence heartbeat (emitted at `silenceIntervalMs` cadence).
424
+ * - `'speech_start'` -- voice energy detected after silence.
425
+ * - `'speech_end'` -- voice energy fell below the silence threshold.
426
+ * - `'silence'` -- periodic silence heartbeat (emitted at `silenceIntervalMs` cadence).
294
427
  */
295
428
  type: 'speech_start' | 'speech_end' | 'silence';
296
429
  /**
297
430
  * Unix epoch millisecond timestamp at which this transition was detected.
431
+ * Used by the endpoint detector to compute speech duration.
298
432
  */
299
433
  timestamp: number;
300
434
  /**
301
435
  * Optional raw energy level used to trigger this event (implementation-defined scale).
436
+ * Useful for debugging VAD sensitivity but not consumed by the pipeline logic.
302
437
  */
303
438
  energyLevel?: number;
304
439
  /**
305
440
  * Origin of the VAD event:
306
- * - `'vad'` emitted by a standalone VAD model (e.g. Silero, WebRTC VAD).
307
- * - `'stt'` inferred from STT activity (e.g. provider-side endpointing signals).
441
+ * - `'vad'` -- emitted by a standalone VAD model (e.g. Silero, WebRTC VAD).
442
+ * - `'stt'` -- inferred from STT activity (e.g. provider-side endpointing signals).
443
+ *
444
+ * The pipeline synthesises STT-derived speech_start/speech_end events when
445
+ * a dedicated VAD is not available, using the source field to distinguish them.
308
446
  */
309
447
  source?: 'vad' | 'stt';
310
448
  }
311
449
  /**
312
450
  * Semantic reason why the endpoint detector decided the user has finished speaking.
451
+ *
452
+ * Each reason maps to a different detection strategy within the endpoint detector:
453
+ *
454
+ * | Reason | Detection strategy |
455
+ * |--------------------|--------------------------------------------------------|
456
+ * | `silence_timeout` | VAD silence exceeded configured threshold |
457
+ * | `punctuation` | STT final result ends with `.`, `?`, or `!` |
458
+ * | `syntax_complete` | Syntax model determined utterance is grammatically complete |
459
+ * | `semantic_model` | Small LM scored intent as complete |
460
+ * | `manual` | Explicitly triggered by a ClientTextMessage control |
461
+ * | `timeout` | Hard maximum turn duration elapsed |
462
+ *
463
+ * @see {@link TurnCompleteEvent.reason} which carries this value.
464
+ * @see {@link VoiceTurnMetadata.endpointReason} where it is forwarded to the agent.
313
465
  */
314
466
  export type EndpointReason = 'silence_timeout' | 'punctuation' | 'syntax_complete' | 'semantic_model' | 'manual' | 'timeout';
315
467
  /**
316
468
  * Emitted by {@link IEndpointDetector} when it determines the user has finished
317
469
  * their turn and the pipeline should hand off to the agent.
470
+ *
471
+ * @see {@link IEndpointDetector} which emits these on the `'turn_complete'` event.
472
+ * @see {@link VoicePipelineOrchestrator} which transitions to `'processing'` state upon receipt.
473
+ *
474
+ * @example
475
+ * ```typescript
476
+ * detector.on('turn_complete', (event: TurnCompleteEvent) => {
477
+ * console.log(`User said: "${event.transcript}" (reason: ${event.reason})`);
478
+ * });
479
+ * ```
318
480
  */
319
481
  export interface TurnCompleteEvent {
320
482
  /**
321
483
  * The final consolidated transcript for this turn.
484
+ * May be empty for acoustic-only detectors that have no transcript access.
322
485
  */
323
486
  transcript: string;
324
487
  /**
325
488
  * Aggregate STT confidence score for the transcript, in the range [0, 1].
489
+ * Zero when no STT data is available (e.g. acoustic-only mode).
326
490
  */
327
491
  confidence: number;
328
492
  /**
329
493
  * Total duration of detected speech in this turn, in milliseconds.
494
+ * Computed as `speechEndTimestamp - speechStartTimestamp`.
330
495
  */
331
496
  durationMs: number;
332
497
  /**
333
498
  * The semantic reason that triggered turn completion.
499
+ * @see {@link EndpointReason} for the full set of possible values.
334
500
  */
335
501
  reason: EndpointReason;
336
502
  }
@@ -339,29 +505,36 @@ export interface TurnCompleteEvent {
339
505
  * Combines VAD events with linguistic signals to decide when the user
340
506
  * has finished speaking.
341
507
  *
342
- * Emits:
343
- * - `'turn_complete'` (TurnCompleteEvent) — the user's turn has ended.
344
- * - `'speech_start'` () — the user has started speaking (re-emitted from VAD).
345
- * - `'barge_in_detected'` () — user started speaking while TTS was playing.
508
+ * ## Events emitted
509
+ *
510
+ * | Event | Payload | Description |
511
+ * |------------------------|-------------------------|----------------------------------------|
512
+ * | `'turn_complete'` | {@link TurnCompleteEvent} | The user's turn has ended. |
513
+ * | `'speech_start'` | *(none)* | The user has started speaking. |
514
+ * | `'barge_in_detected'` | *(none)* | User spoke while TTS was playing. |
515
+ *
516
+ * @see {@link HeuristicEndpointDetector} for the rule-based implementation.
517
+ * @see {@link AcousticEndpointDetector} for the purely acoustic implementation.
346
518
  */
347
519
  export interface IEndpointDetector extends EventEmitter {
348
520
  /**
349
521
  * Active detection strategy:
350
- * - `'silence'` pure silence-timeout based.
351
- * - `'hybrid'` silence + linguistic completeness signals.
352
- * - `'semantic'` small LM scoring utterance completeness.
522
+ * - `'acoustic'` -- pure silence-timeout based (no transcript analysis).
523
+ * - `'heuristic'` -- silence + terminal punctuation + backchannel filtering.
524
+ * - `'semantic'` -- small LM scoring utterance completeness.
353
525
  */
354
526
  readonly mode: 'acoustic' | 'heuristic' | 'semantic';
355
527
  /**
356
528
  * Push a VAD event from the upstream voice activity detector.
357
529
  *
358
- * @param event The VAD event to process.
530
+ * @param event - The VAD event to process.
359
531
  */
360
532
  pushVadEvent(event: VadEvent): void;
361
533
  /**
362
534
  * Push a partial or final STT result for linguistic analysis.
535
+ * Acoustic-mode detectors may no-op this method.
363
536
  *
364
- * @param event Transcript event from the STT session.
537
+ * @param event - Transcript event from the STT session.
365
538
  */
366
539
  pushTranscript(event: TranscriptEvent): void;
367
540
  /**
@@ -373,11 +546,14 @@ export interface IEndpointDetector extends EventEmitter {
373
546
  /**
374
547
  * Configuration for a diarization session. Controls expected speaker count and
375
548
  * chunking behaviour for providers that require buffered audio.
549
+ *
550
+ * @see {@link IDiarizationEngine.startSession} which accepts this config.
376
551
  */
377
552
  export interface DiarizationConfig {
378
553
  /**
379
554
  * Hint to the provider about how many distinct speakers are expected.
380
- * When omitted, the provider uses auto-detection.
555
+ * When omitted, the provider uses auto-detection (which typically adds
556
+ * latency as it needs more audio to stabilise speaker count).
381
557
  */
382
558
  expectedSpeakers?: number;
383
559
  /**
@@ -401,6 +577,8 @@ export interface DiarizationConfig {
401
577
  }
402
578
  /**
403
579
  * A contiguous segment of transcript text with millisecond timing metadata.
580
+ *
581
+ * @see {@link DiarizedSegment} which extends this with speaker attribution.
404
582
  */
405
583
  export interface TranscriptSegment {
406
584
  /**
@@ -419,6 +597,15 @@ export interface TranscriptSegment {
419
597
  /**
420
598
  * A {@link TranscriptSegment} extended with speaker attribution produced by the
421
599
  * diarization engine.
600
+ *
601
+ * @see {@link DiarizationSession} which emits these on the `'segment'` event.
602
+ *
603
+ * @example
604
+ * ```typescript
605
+ * diarizationSession.on('segment', (seg: DiarizedSegment) => {
606
+ * console.log(`[${seg.speakerId}]: "${seg.text}"`);
607
+ * });
608
+ * ```
422
609
  */
423
610
  export interface DiarizedSegment extends TranscriptSegment {
424
611
  /**
@@ -428,7 +615,7 @@ export interface DiarizedSegment extends TranscriptSegment {
428
615
  */
429
616
  speakerId: string;
430
617
  /**
431
- * Confidence that this segment belongs to `speakerId`, in the range [0, 1].
618
+ * Confidence that this segment belongs to {@link speakerId}, in the range [0, 1].
432
619
  */
433
620
  speakerConfidence: number;
434
621
  }
@@ -436,32 +623,39 @@ export interface DiarizedSegment extends TranscriptSegment {
436
623
  * An active diarization session. Accepts raw audio and outputs speaker-attributed
437
624
  * transcript segments via EventEmitter.
438
625
  *
439
- * Emits:
440
- * - `'segment'` (DiarizedSegment) — a diarized transcript segment is ready.
441
- * - `'speaker_change'` ({ from: string; to: string }) — speaker transition detected.
442
- * - `'error'` (Error) — unrecoverable engine error.
443
- * - `'close'` () session terminated.
626
+ * ## Events emitted
627
+ *
628
+ * | Event | Payload | Description |
629
+ * |--------------------|------------------------------------------|--------------------------------|
630
+ * | `'segment'` | {@link DiarizedSegment} | A diarized segment is ready. |
631
+ * | `'speaker_change'` | `{ from: string; to: string }` | Speaker transition detected. |
632
+ * | `'error'` | `Error` | Unrecoverable engine error. |
633
+ * | `'close'` | *(none)* | Session terminated. |
634
+ *
635
+ * @see {@link IDiarizationEngine.startSession} which creates these sessions.
444
636
  */
445
637
  export interface DiarizationSession extends EventEmitter {
446
638
  /**
447
639
  * Push a raw audio frame for diarization analysis.
448
640
  *
449
- * @param frame PCM audio frame from the capture stream.
641
+ * @param frame - PCM audio frame from the capture stream.
450
642
  */
451
643
  pushAudio(frame: AudioFrame): void;
452
644
  /**
453
645
  * Apply speaker labels to an existing transcript using the session's
454
646
  * current speaker model. Returns labelled segments.
455
647
  *
456
- * @param transcript Plain transcript segments to label.
648
+ * @param transcript - Plain transcript segments to label.
649
+ * @returns Speaker-attributed segments with confidence scores.
457
650
  */
458
651
  labelTranscript(transcript: TranscriptSegment[]): Promise<DiarizedSegment[]>;
459
652
  /**
460
653
  * Enroll a known speaker so subsequent audio is attributed to a named identity
461
654
  * rather than an anonymous `SPEAKER_N` label.
462
655
  *
463
- * @param speakerId Stable identifier for the speaker (e.g. user UUID).
464
- * @param samples Representative audio frames for the speaker's voice.
656
+ * @param speakerId - Stable identifier for the speaker (e.g. user UUID).
657
+ * @param samples - Representative audio frames for the speaker's voice.
658
+ * Typically 10-30 seconds of clean speech produces the best embeddings.
465
659
  */
466
660
  enrollSpeaker(speakerId: string, samples: AudioFrame[]): Promise<void>;
467
661
  /**
@@ -473,18 +667,33 @@ export interface DiarizationSession extends EventEmitter {
473
667
  * Factory interface for diarization (speaker separation) engines.
474
668
  *
475
669
  * Registered via `EXTENSION_KIND_DIARIZATION`.
670
+ *
671
+ * @see {@link DiarizationSession} for the session interface returned by {@link startSession}.
476
672
  */
477
673
  export interface IDiarizationEngine {
478
674
  /**
479
675
  * Open a new diarization session.
480
676
  *
481
- * @param config Session configuration controlling chunking and speaker hints.
677
+ * @param config - Session configuration controlling chunking and speaker hints.
678
+ * @returns A live session that accepts audio and emits diarized segments.
482
679
  */
483
680
  startSession(config?: DiarizationConfig): Promise<DiarizationSession>;
484
681
  }
485
682
  /**
486
683
  * Configuration passed to {@link IStreamingTTS.startSession} when opening a new
487
684
  * text-to-speech synthesis stream.
685
+ *
686
+ * @see {@link VoicePipelineConfig.ttsOptions} for provider-level overrides.
687
+ *
688
+ * @example
689
+ * ```typescript
690
+ * const config: StreamingTTSConfig = {
691
+ * voice: 'nova',
692
+ * format: 'opus',
693
+ * sampleRate: 24000,
694
+ * chunkingMode: 'sentence',
695
+ * };
696
+ * ```
488
697
  */
489
698
  export interface StreamingTTSConfig {
490
699
  /**
@@ -498,22 +707,22 @@ export interface StreamingTTSConfig {
498
707
  */
499
708
  format?: 'pcm' | 'mp3' | 'opus';
500
709
  /**
501
- * Output sample rate in Hz. Must be supported by the chosen `format`.
710
+ * Output sample rate in Hz. Must be supported by the chosen {@link format}.
502
711
  * @defaultValue 24000
503
712
  */
504
713
  sampleRate?: number;
505
714
  /**
506
715
  * Controls how the provider segments incoming token streams into synthesis
507
716
  * requests:
508
- * - `'sentence'` — flush at sentence boundaries (lower latency).
509
- * - `'word'` — flush at word boundaries (minimum latency, may sound choppy).
510
- * - `'paragraph'` — flush at paragraph boundaries (highest quality).
717
+ * - `'sentence'` -- flush at sentence boundaries (lower latency).
718
+ * - `'word'` -- flush at word boundaries (minimum latency, may sound choppy).
719
+ * - `'paragraph'` -- flush at paragraph boundaries (highest quality).
511
720
  * @defaultValue 'sentence'
512
721
  */
513
722
  chunkingMode?: 'sentence' | 'word' | 'paragraph';
514
723
  /**
515
724
  * Maximum number of milliseconds of audio to buffer before forcing a flush,
516
- * regardless of `chunkingMode`. Prevents unbounded memory growth for very
725
+ * regardless of {@link chunkingMode}. Prevents unbounded memory growth for very
517
726
  * long utterances.
518
727
  * @defaultValue 3000
519
728
  */
@@ -527,18 +736,23 @@ export interface StreamingTTSConfig {
527
736
  * An active streaming TTS session. Token text is pushed in and encoded audio
528
737
  * chunks flow out via EventEmitter.
529
738
  *
530
- * Emits:
531
- * - `'audio'` (EncodedAudioChunk) — a synthesised audio chunk ready for playback.
532
- * - `'flush_complete'` () — all queued tokens have been synthesised.
533
- * - `'error'` (Error) — unrecoverable synthesis error.
534
- * - `'close'` () — session terminated.
739
+ * ## Events emitted
740
+ *
741
+ * | Event | Payload | Description |
742
+ * |--------------------|---------------------------|--------------------------------------|
743
+ * | `'audio'` | {@link EncodedAudioChunk} | A synthesised chunk ready for playback. |
744
+ * | `'flush_complete'` | *(none)* | All queued tokens have been synthesised. |
745
+ * | `'error'` | `Error` | Unrecoverable synthesis error. |
746
+ * | `'close'` | *(none)* | Session terminated. |
747
+ *
748
+ * @see {@link IStreamingTTS.startSession} which creates these sessions.
535
749
  */
536
750
  export interface StreamingTTSSession extends EventEmitter {
537
751
  /**
538
752
  * Push one or more LLM output tokens into the synthesis buffer.
539
- * The session will chunk and synthesise them according to `chunkingMode`.
753
+ * The session will chunk and synthesise them according to {@link StreamingTTSConfig.chunkingMode}.
540
754
  *
541
- * @param tokens Text tokens to synthesise (may be partial words).
755
+ * @param tokens - Text tokens to synthesise (may be partial words).
542
756
  */
543
757
  pushTokens(tokens: string): void;
544
758
  /**
@@ -549,6 +763,7 @@ export interface StreamingTTSSession extends EventEmitter {
549
763
  /**
550
764
  * Immediately stop synthesis and discard all buffered tokens. Audio chunks
551
765
  * currently in-flight are not recalled; the caller must stop playback separately.
766
+ * Used during barge-in to halt the agent's response.
552
767
  */
553
768
  cancel(): void;
554
769
  /**
@@ -560,6 +775,8 @@ export interface StreamingTTSSession extends EventEmitter {
560
775
  * Factory interface for streaming text-to-speech providers.
561
776
  *
562
777
  * Registered via `EXTENSION_KIND_STREAMING_TTS`.
778
+ *
779
+ * @see {@link StreamingTTSSession} for the session interface returned by {@link startSession}.
563
780
  */
564
781
  export interface IStreamingTTS {
565
782
  /**
@@ -569,33 +786,64 @@ export interface IStreamingTTS {
569
786
  /**
570
787
  * Open a new streaming synthesis session.
571
788
  *
572
- * @param config Session-level configuration overriding provider defaults.
789
+ * @param config - Session-level configuration overriding provider defaults.
790
+ * @returns A live session that accepts tokens and emits audio chunks.
791
+ * @throws {Error} If the provider fails to initialise (e.g. invalid API key).
573
792
  */
574
793
  startSession(config?: StreamingTTSConfig): Promise<StreamingTTSSession>;
575
794
  }
576
795
  /**
577
796
  * Contextual information supplied to {@link IBargeinHandler.handleBargein} so the
578
797
  * handler can make an informed decision about how to respond to interruption.
798
+ *
799
+ * @see {@link IBargeinHandler} which consumes this context.
800
+ * @see {@link HardCutBargeinHandler} and {@link SoftFadeBargeinHandler} for concrete handlers.
801
+ *
802
+ * @example
803
+ * ```typescript
804
+ * const context: BargeinContext = {
805
+ * speechDurationMs: 450,
806
+ * interruptedText: 'I was explaining the process of...',
807
+ * playedDurationMs: 2300,
808
+ * };
809
+ * ```
579
810
  */
580
811
  export interface BargeinContext {
581
812
  /**
582
813
  * Duration of detected user speech before the barge-in was confirmed, in ms.
583
- * Short durations may indicate accidental noise rather than intentional interruption.
814
+ * Short durations (< 100 ms) often indicate accidental noise, lip smacks,
815
+ * or breaths rather than intentional interruption.
816
+ *
817
+ * @see {@link HardCutBargeinHandler} which uses a 300 ms default threshold.
818
+ * @see {@link SoftFadeBargeinHandler} which uses a tiered threshold system.
584
819
  */
585
820
  speechDurationMs: number;
586
821
  /**
587
- * The partial TTS text that was interrupted. Used to construct `interruptedRemainder`
588
- * in {@link VoiceTurnMetadata}.
822
+ * The partial TTS text that was interrupted. Used to construct
823
+ * {@link VoiceTurnMetadata.interruptedRemainder} so the agent knows what
824
+ * information was cut off and can avoid repeating it.
589
825
  */
590
826
  interruptedText: string;
591
827
  /**
592
828
  * How many milliseconds of audio had been played at the point of interruption.
829
+ * Combined with {@link interruptedText}, this allows the agent to estimate
830
+ * how much of the response the user actually heard.
593
831
  */
594
832
  playedDurationMs: number;
595
833
  }
596
834
  /**
597
835
  * Action the pipeline should take in response to a detected barge-in.
598
836
  * Returned by {@link IBargeinHandler.handleBargein}.
837
+ *
838
+ * @see {@link IBargeinHandler} which returns this type.
839
+ *
840
+ * @example
841
+ * ```typescript
842
+ * const cancelAction: BargeinAction = { type: 'cancel', injectMarker: '[interrupted]' };
843
+ * const pauseAction: BargeinAction = { type: 'pause', fadeMs: 150 };
844
+ * const resumeAction: BargeinAction = { type: 'resume' };
845
+ * const ignoreAction: BargeinAction = { type: 'ignore' };
846
+ * ```
599
847
  */
600
848
  export type BargeinAction = {
601
849
  /** Immediately stop all TTS output and discard the remainder of the response. */
@@ -606,7 +854,7 @@ export type BargeinAction = {
606
854
  */
607
855
  injectMarker?: string;
608
856
  } | {
609
- /** Fade out TTS audio over `fadeMs` milliseconds then pause. */
857
+ /** Fade out TTS audio over {@link fadeMs} milliseconds then pause. */
610
858
  type: 'pause';
611
859
  /** Duration of the fade-out in milliseconds. @defaultValue 150 */
612
860
  fadeMs?: number;
@@ -627,19 +875,22 @@ export type BargeinAction = {
627
875
  * Handles the policy decision when a barge-in (user speaking over TTS) is detected.
628
876
  *
629
877
  * Registered via `EXTENSION_KIND_BARGEIN_HANDLER`.
878
+ *
879
+ * @see {@link HardCutBargeinHandler} for the immediate-stop strategy.
880
+ * @see {@link SoftFadeBargeinHandler} for the three-tier fade strategy.
630
881
  */
631
882
  export interface IBargeinHandler {
632
883
  /**
633
884
  * Interruption strategy implemented by this handler:
634
- * - `'hard-cut'` — TTS audio is stopped immediately with no fade.
635
- * - `'soft-fade'` — TTS audio fades out over a short window before stopping.
885
+ * - `'hard-cut'` -- TTS audio is stopped immediately with no fade.
886
+ * - `'soft-fade'` -- TTS audio fades out over a short window before stopping.
636
887
  */
637
888
  readonly mode: 'hard-cut' | 'soft-fade';
638
889
  /**
639
890
  * Called by the pipeline when a barge-in is confirmed. The handler evaluates
640
891
  * the context and returns the action the pipeline should execute.
641
892
  *
642
- * @param context Contextual snapshot at the moment of interruption.
893
+ * @param context - Contextual snapshot at the moment of interruption.
643
894
  * @returns The action to perform (or a promise resolving to one).
644
895
  */
645
896
  handleBargein(context: BargeinContext): BargeinAction | Promise<BargeinAction>;
@@ -647,21 +898,32 @@ export interface IBargeinHandler {
647
898
  /**
648
899
  * Adapts any AgentOS agent to the voice pipeline's turn-based protocol.
649
900
  *
650
- * The pipeline calls {@link IVoicePipelineAgentSession.sendText} with the user's
651
- * final transcript and streams the response back as text tokens for TTS synthesis.
901
+ * The pipeline calls {@link sendText} with the user's final transcript and
902
+ * streams the response back as text tokens for TTS synthesis.
903
+ *
904
+ * @see {@link VoicePipelineOrchestrator} which invokes this during the
905
+ * `PROCESSING -> SPEAKING` state transition.
652
906
  */
653
907
  export interface IVoicePipelineAgentSession {
654
908
  /**
655
909
  * Send the user's utterance to the agent and receive a streaming text response.
656
910
  *
657
- * @param text Final transcript from the STT + endpoint detection pipeline.
658
- * @param metadata Rich metadata about the current voice turn.
911
+ * @param text - Final transcript from the STT + endpoint detection pipeline.
912
+ * @param metadata - Rich metadata about the current voice turn.
659
913
  * @returns An async iterable of text tokens (suitable for streaming into TTS).
914
+ *
915
+ * @example
916
+ * ```typescript
917
+ * const tokens = agentSession.sendText('What is the weather?', metadata);
918
+ * for await (const token of tokens) {
919
+ * ttsSession.pushTokens(token);
920
+ * }
921
+ * ```
660
922
  */
661
923
  sendText(text: string, metadata: VoiceTurnMetadata): AsyncIterable<string>;
662
924
  /**
663
925
  * Abort the current agent response mid-stream (called on barge-in when
664
- * `BargeinAction.type === 'cancel'`).
926
+ * the {@link BargeinAction} `type` is `'cancel'`).
665
927
  *
666
928
  * Implementations should cancel any in-flight LLM requests. The pipeline
667
929
  * will discard any tokens emitted after `abort()` is called.
@@ -671,6 +933,19 @@ export interface IVoicePipelineAgentSession {
671
933
  /**
672
934
  * Rich metadata attached to each voice turn and passed to the agent session.
673
935
  * Enables the agent to tailor its response based on conversation dynamics.
936
+ *
937
+ * @see {@link IVoicePipelineAgentSession.sendText} which receives this metadata.
938
+ *
939
+ * @example
940
+ * ```typescript
941
+ * const metadata: VoiceTurnMetadata = {
942
+ * speakers: ['user'],
943
+ * endpointReason: 'punctuation',
944
+ * speechDurationMs: 3200,
945
+ * wasInterrupted: false,
946
+ * transcriptConfidence: 0.92,
947
+ * };
948
+ * ```
674
949
  */
675
950
  export interface VoiceTurnMetadata {
676
951
  /**
@@ -680,6 +955,7 @@ export interface VoiceTurnMetadata {
680
955
  speakers: string[];
681
956
  /**
682
957
  * The reason the endpoint detector decided the user had finished speaking.
958
+ * @see {@link EndpointReason} for the full set of possible values.
683
959
  */
684
960
  endpointReason: EndpointReason;
685
961
  /**
@@ -692,8 +968,9 @@ export interface VoiceTurnMetadata {
692
968
  */
693
969
  wasInterrupted: boolean;
694
970
  /**
695
- * When `wasInterrupted` is `true`, the text remainder of the agent response
696
- * that was cut off. Useful for the agent to avoid re-stating information.
971
+ * When {@link wasInterrupted} is `true`, the text remainder of the agent response
972
+ * that was cut off. Useful for the agent to avoid re-stating information
973
+ * the user has already heard.
697
974
  */
698
975
  interruptedRemainder?: string;
699
976
  /**
@@ -704,6 +981,21 @@ export interface VoiceTurnMetadata {
704
981
  /**
705
982
  * Top-level configuration for the {@link VoicePipelineSession}.
706
983
  * Specifies which providers to use and their session-level options.
984
+ *
985
+ * @see {@link VoicePipelineOrchestrator} which consumes this configuration.
986
+ *
987
+ * @example
988
+ * ```typescript
989
+ * const config: VoicePipelineConfig = {
990
+ * stt: 'deepgram',
991
+ * tts: 'openai',
992
+ * endpointing: 'heuristic',
993
+ * bargeIn: 'hard-cut',
994
+ * voice: 'nova',
995
+ * format: 'opus',
996
+ * language: 'en-US',
997
+ * };
998
+ * ```
707
999
  */
708
1000
  export interface VoicePipelineConfig {
709
1001
  /**
@@ -719,7 +1011,8 @@ export interface VoicePipelineConfig {
719
1011
  */
720
1012
  tts: string;
721
1013
  /**
722
- * Endpoint detection strategy. Defaults to `'hybrid'` when omitted.
1014
+ * Endpoint detection strategy. Defaults to `'heuristic'` when omitted.
1015
+ * @see {@link IEndpointDetector.mode} for the strategy descriptions.
723
1016
  */
724
1017
  endpointing?: 'acoustic' | 'heuristic' | 'semantic';
725
1018
  /**
@@ -728,6 +1021,7 @@ export interface VoicePipelineConfig {
728
1021
  diarization?: boolean;
729
1022
  /**
730
1023
  * Barge-in (interruption) handling mode. Defaults to `'hard-cut'` when omitted.
1024
+ * @see {@link HardCutBargeinHandler} and {@link SoftFadeBargeinHandler}.
731
1025
  */
732
1026
  bargeIn?: 'hard-cut' | 'soft-fade' | 'disabled';
733
1027
  /**
@@ -761,26 +1055,35 @@ export interface VoicePipelineConfig {
761
1055
  /**
762
1056
  * Lifecycle state of a {@link VoicePipelineSession}.
763
1057
  *
764
- * Valid transitions:
1058
+ * ## Valid state transitions
1059
+ *
765
1060
  * ```
766
- * idle → listening → processing → speaking → listening
767
- * → interrupting → listening
768
- * any → closed
1061
+ * idle -> listening -> processing -> speaking -> listening
1062
+ * -> interrupting -> listening
1063
+ * any -> closed
769
1064
  * ```
1065
+ *
1066
+ * The state machine is enforced by {@link VoicePipelineOrchestrator._setState}
1067
+ * which emits `'state_changed'` on every transition.
770
1068
  */
771
1069
  export type PipelineState = 'idle' | 'listening' | 'processing' | 'speaking' | 'interrupting' | 'closed';
772
1070
  /**
773
1071
  * A live voice pipeline session binding a transport, STT, endpoint detection,
774
1072
  * optional diarization, agent, and TTS into a single coordinated lifecycle.
775
1073
  *
776
- * Emits:
777
- * - `'state_change'` (PipelineState) — pipeline state machine transition.
778
- * - `'turn_complete'` (TurnCompleteEvent) — user turn detected.
779
- * - `'agent_response_start'` () — agent has begun generating a response.
780
- * - `'agent_response_end'` () — agent response fully synthesised and played.
781
- * - `'barge_in'` (BargeinContext) — user interrupted TTS playback.
782
- * - `'error'` (Error) — unrecoverable pipeline error.
783
- * - `'close'` () — session has been fully torn down.
1074
+ * ## Events emitted
1075
+ *
1076
+ * | Event | Payload | Description |
1077
+ * |--------------------------|---------------------------|-----------------------------------------|
1078
+ * | `'state_change'` | {@link PipelineState} | Pipeline state machine transition. |
1079
+ * | `'turn_complete'` | {@link TurnCompleteEvent} | User turn detected. |
1080
+ * | `'agent_response_start'` | *(none)* | Agent has begun generating a response. |
1081
+ * | `'agent_response_end'` | *(none)* | Agent response fully played. |
1082
+ * | `'barge_in'` | {@link BargeinContext} | User interrupted TTS playback. |
1083
+ * | `'error'` | `Error` | Unrecoverable pipeline error. |
1084
+ * | `'close'` | *(none)* | Session has been fully torn down. |
1085
+ *
1086
+ * @see {@link VoicePipelineOrchestrator.startSession} which creates these sessions.
784
1087
  */
785
1088
  export interface VoicePipelineSession extends EventEmitter {
786
1089
  /**
@@ -789,6 +1092,7 @@ export interface VoicePipelineSession extends EventEmitter {
789
1092
  readonly sessionId: string;
790
1093
  /**
791
1094
  * Current pipeline state machine state.
1095
+ * @see {@link PipelineState} for the full set of states and transitions.
792
1096
  */
793
1097
  readonly state: PipelineState;
794
1098
  /**
@@ -797,16 +1101,26 @@ export interface VoicePipelineSession extends EventEmitter {
797
1101
  */
798
1102
  readonly transport: IStreamTransport;
799
1103
  /**
800
- * Gracefully close the session — flush in-flight audio, tear down all sub-sessions,
1104
+ * Gracefully close the session -- flush in-flight audio, tear down all sub-sessions,
801
1105
  * and emit `'close'`.
802
1106
  *
803
- * @param reason Optional human-readable reason for diagnostics.
1107
+ * @param reason - Optional human-readable reason for diagnostics.
804
1108
  */
805
1109
  close(reason?: string): Promise<void>;
806
1110
  }
807
1111
  /**
808
1112
  * Messages sent from the client (browser/app) to the server over the transport.
809
1113
  * All messages are JSON-serialised.
1114
+ *
1115
+ * @see {@link ServerTextMessage} for the server-to-client counterpart.
1116
+ *
1117
+ * @example
1118
+ * ```typescript
1119
+ * const configMsg: ClientTextMessage = {
1120
+ * type: 'config',
1121
+ * config: { stt: 'deepgram', tts: 'openai' },
1122
+ * };
1123
+ * ```
810
1124
  */
811
1125
  export type ClientTextMessage = {
812
1126
  /**
@@ -827,6 +1141,18 @@ export type ClientTextMessage = {
827
1141
  /**
828
1142
  * Messages sent from the server to the client over the transport.
829
1143
  * All messages are JSON-serialised.
1144
+ *
1145
+ * @see {@link ClientTextMessage} for the client-to-server counterpart.
1146
+ * @see {@link IStreamTransport.sendControl} which sends these messages.
1147
+ *
1148
+ * @example
1149
+ * ```typescript
1150
+ * const sessionStarted: ServerTextMessage = {
1151
+ * type: 'session_started',
1152
+ * sessionId: 'abc-123',
1153
+ * config: { stt: 'deepgram', tts: 'openai' },
1154
+ * };
1155
+ * ```
830
1156
  */
831
1157
  export type ServerTextMessage = {
832
1158
  /**
@@ -858,8 +1184,8 @@ export type ServerTextMessage = {
858
1184
  type: 'agent_thinking';
859
1185
  } | {
860
1186
  /**
861
- * Emitted when TTS synthesis begins — audio chunks will follow over the audio channel.
862
- * Clients may hide thinking indicators.
1187
+ * Emitted when TTS synthesis begins -- audio chunks will follow over the audio channel.
1188
+ * Clients may hide thinking indicators and prepare audio playback.
863
1189
  */
864
1190
  type: 'agent_speaking';
865
1191
  /**
@@ -890,16 +1216,16 @@ export type ServerTextMessage = {
890
1216
  * The session will be closed after this message.
891
1217
  */
892
1218
  type: 'error';
893
- /** Machine-readable error code. */
1219
+ /** Machine-readable error code (e.g. `'STT_PROVIDER_ERROR'`). */
894
1220
  code: string;
895
- /** Human-readable description. */
1221
+ /** Human-readable description of the error. */
896
1222
  message: string;
897
1223
  } | {
898
1224
  /**
899
1225
  * Emitted as the final message before the server closes the transport.
900
1226
  */
901
1227
  type: 'session_ended';
902
- /** Optional human-readable reason. */
1228
+ /** Optional human-readable reason for the session ending. */
903
1229
  reason?: string;
904
1230
  };
905
1231
  //# sourceMappingURL=types.d.ts.map