@framers/agentos 0.1.108 → 0.1.109
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/voice-pipeline/AcousticEndpointDetector.d.ts +95 -20
- package/dist/voice-pipeline/AcousticEndpointDetector.d.ts.map +1 -1
- package/dist/voice-pipeline/AcousticEndpointDetector.js +110 -24
- package/dist/voice-pipeline/AcousticEndpointDetector.js.map +1 -1
- package/dist/voice-pipeline/HardCutBargeinHandler.d.ts +66 -15
- package/dist/voice-pipeline/HardCutBargeinHandler.d.ts.map +1 -1
- package/dist/voice-pipeline/HardCutBargeinHandler.js +65 -13
- package/dist/voice-pipeline/HardCutBargeinHandler.js.map +1 -1
- package/dist/voice-pipeline/HeuristicEndpointDetector.d.ts +116 -42
- package/dist/voice-pipeline/HeuristicEndpointDetector.d.ts.map +1 -1
- package/dist/voice-pipeline/HeuristicEndpointDetector.js +159 -52
- package/dist/voice-pipeline/HeuristicEndpointDetector.js.map +1 -1
- package/dist/voice-pipeline/SoftFadeBargeinHandler.d.ts +89 -24
- package/dist/voice-pipeline/SoftFadeBargeinHandler.d.ts.map +1 -1
- package/dist/voice-pipeline/SoftFadeBargeinHandler.js +74 -20
- package/dist/voice-pipeline/SoftFadeBargeinHandler.js.map +1 -1
- package/dist/voice-pipeline/VoiceInterruptError.d.ts +68 -10
- package/dist/voice-pipeline/VoiceInterruptError.d.ts.map +1 -1
- package/dist/voice-pipeline/VoiceInterruptError.js +53 -6
- package/dist/voice-pipeline/VoiceInterruptError.js.map +1 -1
- package/dist/voice-pipeline/VoicePipelineOrchestrator.d.ts +190 -39
- package/dist/voice-pipeline/VoicePipelineOrchestrator.d.ts.map +1 -1
- package/dist/voice-pipeline/VoicePipelineOrchestrator.js +266 -53
- package/dist/voice-pipeline/VoicePipelineOrchestrator.js.map +1 -1
- package/dist/voice-pipeline/WebSocketStreamTransport.d.ts +135 -43
- package/dist/voice-pipeline/WebSocketStreamTransport.d.ts.map +1 -1
- package/dist/voice-pipeline/WebSocketStreamTransport.js +109 -47
- package/dist/voice-pipeline/WebSocketStreamTransport.js.map +1 -1
- package/dist/voice-pipeline/index.d.ts +34 -1
- package/dist/voice-pipeline/index.d.ts.map +1 -1
- package/dist/voice-pipeline/index.js +41 -1
- package/dist/voice-pipeline/index.js.map +1 -1
- package/dist/voice-pipeline/types.d.ts +432 -106
- package/dist/voice-pipeline/types.d.ts.map +1 -1
- package/dist/voice-pipeline/types.js +21 -9
- package/dist/voice-pipeline/types.js.map +1 -1
- package/package.json +1 -1
|
@@ -8,41 +8,78 @@
|
|
|
8
8
|
* system. All heavy I/O crosses EventEmitter-based session boundaries to keep
|
|
9
9
|
* the hot path non-blocking.
|
|
10
10
|
*
|
|
11
|
-
* Dependency order (no circular refs)
|
|
11
|
+
* ## Dependency order (no circular refs)
|
|
12
|
+
*
|
|
13
|
+
* ```
|
|
12
14
|
* AudioFrame / EncodedAudioChunk
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
17
|
-
*
|
|
18
|
-
*
|
|
19
|
-
*
|
|
20
|
-
*
|
|
15
|
+
* -> Transport (IStreamTransport)
|
|
16
|
+
* -> STT (IStreamingSTT + StreamingSTTSession)
|
|
17
|
+
* -> Endpoint detection (IEndpointDetector + VadEvent)
|
|
18
|
+
* -> Diarization (IDiarizationEngine + DiarizationSession)
|
|
19
|
+
* -> TTS (IStreamingTTS + StreamingTTSSession)
|
|
20
|
+
* -> Barge-in (IBargeinHandler)
|
|
21
|
+
* -> Session (VoicePipelineSession)
|
|
22
|
+
* -> Protocol messages (ClientTextMessage, ServerTextMessage)
|
|
23
|
+
* ```
|
|
24
|
+
*
|
|
25
|
+
* ## Design rationale
|
|
26
|
+
*
|
|
27
|
+
* Every interface in this module is kept deliberately narrow so that
|
|
28
|
+
* implementations can be swapped at runtime (e.g. Deepgram STT vs Whisper
|
|
29
|
+
* vs browser WebSpeechAPI) without touching the orchestrator. The
|
|
30
|
+
* EventEmitter-based session pattern was chosen over callback interfaces
|
|
31
|
+
* because it naturally supports fan-out (multiple listeners), and because
|
|
32
|
+
* backpressure is handled at the transport level rather than per-callback.
|
|
21
33
|
*/
|
|
22
34
|
import type { EventEmitter } from 'node:events';
|
|
23
35
|
/**
|
|
24
36
|
* A single frame of raw PCM audio, as produced by a microphone capture or
|
|
25
|
-
* a VAD pre-processor. Each frame typically represents 10
|
|
37
|
+
* a VAD pre-processor. Each frame typically represents 10-20 ms of audio.
|
|
38
|
+
*
|
|
39
|
+
* @see {@link EncodedAudioChunk} for the compressed counterpart used in TTS output.
|
|
40
|
+
*
|
|
41
|
+
* @example
|
|
42
|
+
* ```typescript
|
|
43
|
+
* const frame: AudioFrame = {
|
|
44
|
+
* samples: new Float32Array(320), // 20 ms @ 16 kHz
|
|
45
|
+
* sampleRate: 16000,
|
|
46
|
+
* timestamp: Date.now(),
|
|
47
|
+
* };
|
|
48
|
+
* ```
|
|
26
49
|
*/
|
|
27
50
|
export interface AudioFrame {
|
|
28
51
|
/**
|
|
29
52
|
* Interleaved 32-bit float PCM samples, normalised to [-1, 1].
|
|
30
53
|
* For mono audio this is a flat array; stereo interleaves L/R pairs.
|
|
54
|
+
*
|
|
55
|
+
* Float32Array is chosen over Int16Array because it avoids quantisation
|
|
56
|
+
* artefacts in DSP operations (e.g. energy calculation, resampling) and
|
|
57
|
+
* is the native format for Web Audio API.
|
|
31
58
|
*/
|
|
32
59
|
samples: Float32Array;
|
|
33
60
|
/**
|
|
34
61
|
* Samples per second (e.g. 16000, 24000, 48000).
|
|
62
|
+
*
|
|
63
|
+
* 16 kHz is the standard for telephony and most STT engines. 24 kHz is
|
|
64
|
+
* typical for TTS output. The pipeline resamples internally when STT
|
|
65
|
+
* and TTS sample rates differ.
|
|
35
66
|
*/
|
|
36
67
|
sampleRate: number;
|
|
37
68
|
/**
|
|
38
69
|
* Unix epoch millisecond timestamp at which this frame was captured.
|
|
39
70
|
* Used for synchronisation across STT, VAD, and diarization streams.
|
|
71
|
+
*
|
|
72
|
+
* Must be monotonically increasing within a session. Out-of-order
|
|
73
|
+
* frames degrade STT accuracy and confuse the endpoint detector's
|
|
74
|
+
* duration tracking.
|
|
40
75
|
*/
|
|
41
76
|
timestamp: number;
|
|
42
77
|
/**
|
|
43
78
|
* Optional hint from the capture layer identifying the speaker source
|
|
44
79
|
* (e.g. a hardware device label or a WebRTC peer ID). Used by the
|
|
45
80
|
* diarization engine when native speaker IDs are unavailable.
|
|
81
|
+
*
|
|
82
|
+
* @see {@link DiarizedSegment.speakerId} for the post-diarization label.
|
|
46
83
|
*/
|
|
47
84
|
speakerHint?: string;
|
|
48
85
|
}
|
|
@@ -50,14 +87,32 @@ export interface AudioFrame {
|
|
|
50
87
|
* A compressed audio chunk ready for transmission over the wire (e.g. to a
|
|
51
88
|
* TTS websocket or a playback buffer). Contains the rendered text to allow
|
|
52
89
|
* barge-in handlers to track interrupted utterance state.
|
|
90
|
+
*
|
|
91
|
+
* @see {@link AudioFrame} for the uncompressed PCM counterpart used in capture.
|
|
92
|
+
* @see {@link StreamingTTSSession} which emits these on the `'audio'` event.
|
|
93
|
+
*
|
|
94
|
+
* @example
|
|
95
|
+
* ```typescript
|
|
96
|
+
* const chunk: EncodedAudioChunk = {
|
|
97
|
+
* audio: Buffer.from([...opusBytes]),
|
|
98
|
+
* format: 'opus',
|
|
99
|
+
* sampleRate: 24000,
|
|
100
|
+
* durationMs: 60,
|
|
101
|
+
* text: 'Hello there!',
|
|
102
|
+
* };
|
|
103
|
+
* ```
|
|
53
104
|
*/
|
|
54
105
|
export interface EncodedAudioChunk {
|
|
55
106
|
/**
|
|
56
|
-
* Raw encoded bytes in the format specified by
|
|
107
|
+
* Raw encoded bytes in the format specified by {@link format}.
|
|
57
108
|
*/
|
|
58
109
|
audio: Buffer;
|
|
59
110
|
/**
|
|
60
|
-
* Codec/container format of
|
|
111
|
+
* Codec/container format of {@link audio}.
|
|
112
|
+
*
|
|
113
|
+
* - `'pcm'` -- raw signed 16-bit LE samples (lowest latency, highest bandwidth).
|
|
114
|
+
* - `'mp3'` -- MPEG Layer 3 (wide browser support, moderate latency).
|
|
115
|
+
* - `'opus'` -- Opus in OGG container (best quality/size ratio, recommended default).
|
|
61
116
|
*/
|
|
62
117
|
format: 'pcm' | 'mp3' | 'opus';
|
|
63
118
|
/**
|
|
@@ -66,17 +121,29 @@ export interface EncodedAudioChunk {
|
|
|
66
121
|
sampleRate: number;
|
|
67
122
|
/**
|
|
68
123
|
* Playback duration of this chunk in milliseconds.
|
|
124
|
+
* Used by the orchestrator to track cumulative played time for
|
|
125
|
+
* barge-in context ({@link BargeinContext.playedDurationMs}).
|
|
69
126
|
*/
|
|
70
127
|
durationMs: number;
|
|
71
128
|
/**
|
|
72
129
|
* The text fragment that was synthesised into this chunk. Preserved so
|
|
73
|
-
* barge-in handlers can report
|
|
130
|
+
* barge-in handlers can report {@link VoiceTurnMetadata.interruptedRemainder}
|
|
131
|
+
* accurately when playback is cut short.
|
|
74
132
|
*/
|
|
75
133
|
text: string;
|
|
76
134
|
}
|
|
77
135
|
/**
|
|
78
136
|
* Discriminated union of control messages sent from the pipeline to the
|
|
79
137
|
* underlying stream transport (e.g. a WebSocket or WebRTC data-channel).
|
|
138
|
+
*
|
|
139
|
+
* @see {@link IStreamTransport.sendControl} which accepts these messages.
|
|
140
|
+
* @see {@link ServerTextMessage} for the full server-to-client protocol.
|
|
141
|
+
*
|
|
142
|
+
* @example
|
|
143
|
+
* ```typescript
|
|
144
|
+
* const muteMsg: TransportControlMessage = { type: 'mute' };
|
|
145
|
+
* const stopMsg: TransportControlMessage = { type: 'stop', reason: 'session timeout' };
|
|
146
|
+
* ```
|
|
80
147
|
*/
|
|
81
148
|
export type TransportControlMessage = {
|
|
82
149
|
/** Mute the outbound audio stream without closing the session. */
|
|
@@ -99,50 +166,76 @@ export type TransportControlMessage = {
|
|
|
99
166
|
* Abstraction over any bidirectional audio/text stream transport.
|
|
100
167
|
* Implementations include WebSocket, WebRTC data-channel, and in-process pipes.
|
|
101
168
|
*
|
|
102
|
-
*
|
|
103
|
-
*
|
|
104
|
-
*
|
|
105
|
-
*
|
|
106
|
-
*
|
|
169
|
+
* The transport layer is intentionally thin: it handles framing and I/O but
|
|
170
|
+
* knows nothing about STT, TTS, or conversation state. This separation lets
|
|
171
|
+
* the pipeline swap transports (e.g. WebSocket -> WebRTC) without touching
|
|
172
|
+
* any voice logic.
|
|
173
|
+
*
|
|
174
|
+
* ## Events emitted
|
|
175
|
+
*
|
|
176
|
+
* | Event | Payload | Description |
|
|
177
|
+
* |-------------|-----------------------|----------------------------------------|
|
|
178
|
+
* | `'audio'` | {@link AudioFrame} | Inbound audio from the remote client. |
|
|
179
|
+
* | `'message'` | {@link ClientTextMessage} | Inbound JSON control from the client. |
|
|
180
|
+
* | `'close'` | *(none)* | Transport has been closed (either side). |
|
|
181
|
+
* | `'error'` | `Error` | Fatal transport error. |
|
|
182
|
+
*
|
|
183
|
+
* @see {@link WebSocketStreamTransport} for the canonical WebSocket implementation.
|
|
107
184
|
*/
|
|
108
185
|
export interface IStreamTransport extends EventEmitter {
|
|
109
186
|
/**
|
|
110
187
|
* Stable identifier for this transport connection (e.g. a UUID or socket ID).
|
|
188
|
+
* Used as a correlation key in logs and metrics.
|
|
111
189
|
*/
|
|
112
190
|
readonly id: string;
|
|
113
191
|
/**
|
|
114
192
|
* Current connection state.
|
|
115
|
-
* - `'connecting'`
|
|
116
|
-
* - `'open'`
|
|
117
|
-
* - `'closing'`
|
|
118
|
-
* - `'closed'`
|
|
193
|
+
* - `'connecting'` -- handshake in progress.
|
|
194
|
+
* - `'open'` -- fully established and ready.
|
|
195
|
+
* - `'closing'` -- graceful teardown initiated.
|
|
196
|
+
* - `'closed'` -- no longer usable.
|
|
119
197
|
*/
|
|
120
198
|
readonly state: 'connecting' | 'open' | 'closing' | 'closed';
|
|
121
199
|
/**
|
|
122
200
|
* Send a synthesised audio chunk to the remote client for playback.
|
|
123
201
|
* Resolves once the chunk has been handed to the underlying I/O layer.
|
|
124
202
|
*
|
|
125
|
-
* @param chunk
|
|
203
|
+
* @param chunk - Encoded audio to deliver.
|
|
204
|
+
* @returns Resolves when the data has been buffered for transmission.
|
|
205
|
+
* @throws {Error} If the transport is not in `'open'` state.
|
|
126
206
|
*/
|
|
127
207
|
sendAudio(chunk: EncodedAudioChunk): Promise<void>;
|
|
128
208
|
/**
|
|
129
209
|
* Send a JSON control message to the remote client.
|
|
130
210
|
*
|
|
131
|
-
* @param message
|
|
211
|
+
* @param message - Server-side protocol message.
|
|
212
|
+
* @returns Resolves when the data has been buffered for transmission.
|
|
213
|
+
* @throws {Error} If the transport is not in `'open'` state.
|
|
132
214
|
*/
|
|
133
215
|
sendControl(message: ServerTextMessage): Promise<void>;
|
|
134
216
|
/**
|
|
135
217
|
* Close the transport, optionally supplying a WebSocket-style close code and
|
|
136
218
|
* human-readable reason string for diagnostics.
|
|
137
219
|
*
|
|
138
|
-
* @param code
|
|
139
|
-
* @param reason
|
|
220
|
+
* @param code - Optional numeric close code (defaults to 1000 normal closure).
|
|
221
|
+
* @param reason - Optional human-readable close reason.
|
|
140
222
|
*/
|
|
141
223
|
close(code?: number, reason?: string): void;
|
|
142
224
|
}
|
|
143
225
|
/**
|
|
144
226
|
* Configuration passed to {@link IStreamingSTT.startSession} when opening a new
|
|
145
227
|
* speech recognition stream.
|
|
228
|
+
*
|
|
229
|
+
* @see {@link VoicePipelineConfig.sttOptions} for provider-level overrides.
|
|
230
|
+
*
|
|
231
|
+
* @example
|
|
232
|
+
* ```typescript
|
|
233
|
+
* const config: StreamingSTTConfig = {
|
|
234
|
+
* language: 'en-US',
|
|
235
|
+
* interimResults: true,
|
|
236
|
+
* punctuate: true,
|
|
237
|
+
* };
|
|
238
|
+
* ```
|
|
146
239
|
*/
|
|
147
240
|
export interface StreamingSTTConfig {
|
|
148
241
|
/**
|
|
@@ -153,11 +246,14 @@ export interface StreamingSTTConfig {
|
|
|
153
246
|
/**
|
|
154
247
|
* Whether to emit interim (non-final) transcript events. When `true`,
|
|
155
248
|
* partial results arrive more frequently at the cost of higher word error rate.
|
|
249
|
+
* Interim results are useful for real-time UI display and early endpoint hints.
|
|
156
250
|
* @defaultValue true
|
|
157
251
|
*/
|
|
158
252
|
interimResults?: boolean;
|
|
159
253
|
/**
|
|
160
254
|
* Enable automatic punctuation insertion if the provider supports it.
|
|
255
|
+
* Punctuation is critical for the {@link HeuristicEndpointDetector} which
|
|
256
|
+
* uses terminal punctuation (`.`, `?`, `!`) as a turn-completion signal.
|
|
161
257
|
* @defaultValue true
|
|
162
258
|
*/
|
|
163
259
|
punctuate?: boolean;
|
|
@@ -176,6 +272,8 @@ export interface StreamingSTTConfig {
|
|
|
176
272
|
/**
|
|
177
273
|
* A single word within a {@link TranscriptEvent}, augmented with timing and
|
|
178
274
|
* optional speaker attribution.
|
|
275
|
+
*
|
|
276
|
+
* @see {@link TranscriptEvent.words} which contains an array of these.
|
|
179
277
|
*/
|
|
180
278
|
export interface TranscriptWord {
|
|
181
279
|
/**
|
|
@@ -192,6 +290,7 @@ export interface TranscriptWord {
|
|
|
192
290
|
end: number;
|
|
193
291
|
/**
|
|
194
292
|
* Recognition confidence in the range [0, 1]. Higher is better.
|
|
293
|
+
* Typically 0.8+ for clear speech, 0.4-0.7 for noisy or accented audio.
|
|
195
294
|
*/
|
|
196
295
|
confidence: number;
|
|
197
296
|
/**
|
|
@@ -204,6 +303,17 @@ export interface TranscriptWord {
|
|
|
204
303
|
/**
|
|
205
304
|
* Emitted by a {@link StreamingSTTSession} each time the provider produces a
|
|
206
305
|
* recognition hypothesis.
|
|
306
|
+
*
|
|
307
|
+
* @see {@link IEndpointDetector.pushTranscript} which consumes these events.
|
|
308
|
+
*
|
|
309
|
+
* @example
|
|
310
|
+
* ```typescript
|
|
311
|
+
* sttSession.on('transcript', (event: TranscriptEvent) => {
|
|
312
|
+
* if (event.isFinal) {
|
|
313
|
+
* console.log(`Final: "${event.text}" (confidence: ${event.confidence})`);
|
|
314
|
+
* }
|
|
315
|
+
* });
|
|
316
|
+
* ```
|
|
207
317
|
*/
|
|
208
318
|
export interface TranscriptEvent {
|
|
209
319
|
/**
|
|
@@ -211,7 +321,7 @@ export interface TranscriptEvent {
|
|
|
211
321
|
*/
|
|
212
322
|
text: string;
|
|
213
323
|
/**
|
|
214
|
-
* Aggregate confidence score for
|
|
324
|
+
* Aggregate confidence score for {@link text} in the range [0, 1].
|
|
215
325
|
*/
|
|
216
326
|
confidence: number;
|
|
217
327
|
/**
|
|
@@ -222,6 +332,9 @@ export interface TranscriptEvent {
|
|
|
222
332
|
/**
|
|
223
333
|
* `true` when this hypothesis is stable and will not be revised.
|
|
224
334
|
* `false` for interim (streaming) hypotheses.
|
|
335
|
+
*
|
|
336
|
+
* The {@link HeuristicEndpointDetector} only accumulates final transcripts;
|
|
337
|
+
* interim results are discarded to avoid double-counting.
|
|
225
338
|
*/
|
|
226
339
|
isFinal: boolean;
|
|
227
340
|
/**
|
|
@@ -234,17 +347,22 @@ export interface TranscriptEvent {
|
|
|
234
347
|
* An active streaming speech-to-text session. Audio frames are pushed in
|
|
235
348
|
* and transcript events flow out via EventEmitter.
|
|
236
349
|
*
|
|
237
|
-
*
|
|
238
|
-
*
|
|
239
|
-
*
|
|
240
|
-
*
|
|
350
|
+
* ## Events emitted
|
|
351
|
+
*
|
|
352
|
+
* | Event | Payload | Description |
|
|
353
|
+
* |----------------|-----------------------|------------------------------------|
|
|
354
|
+
* | `'transcript'` | {@link TranscriptEvent} | Interim or final hypothesis. |
|
|
355
|
+
* | `'error'` | `Error` | Unrecoverable provider error. |
|
|
356
|
+
* | `'close'` | *(none)* | Session has been fully terminated. |
|
|
357
|
+
*
|
|
358
|
+
* @see {@link IStreamingSTT.startSession} which creates these sessions.
|
|
241
359
|
*/
|
|
242
360
|
export interface StreamingSTTSession extends EventEmitter {
|
|
243
361
|
/**
|
|
244
362
|
* Push a raw audio frame into the recognition stream. Frames must arrive
|
|
245
363
|
* in capture order; gaps or out-of-order frames degrade accuracy.
|
|
246
364
|
*
|
|
247
|
-
* @param frame
|
|
365
|
+
* @param frame - PCM audio frame to process.
|
|
248
366
|
*/
|
|
249
367
|
pushAudio(frame: AudioFrame): void;
|
|
250
368
|
/**
|
|
@@ -263,6 +381,8 @@ export interface StreamingSTTSession extends EventEmitter {
|
|
|
263
381
|
*
|
|
264
382
|
* Implementations are registered via the `EXTENSION_KIND_STREAMING_STT`
|
|
265
383
|
* extension kind and resolved by the voice pipeline at session creation time.
|
|
384
|
+
*
|
|
385
|
+
* @see {@link StreamingSTTSession} for the session interface returned by {@link startSession}.
|
|
266
386
|
*/
|
|
267
387
|
export interface IStreamingSTT {
|
|
268
388
|
/**
|
|
@@ -276,61 +396,107 @@ export interface IStreamingSTT {
|
|
|
276
396
|
/**
|
|
277
397
|
* Open a new streaming recognition session.
|
|
278
398
|
*
|
|
279
|
-
* @param config
|
|
399
|
+
* @param config - Session-level configuration overriding provider defaults.
|
|
280
400
|
* @returns A ready-to-use session whose lifecycle is independent of this factory.
|
|
401
|
+
* @throws {Error} If the provider fails to initialise (e.g. invalid API key).
|
|
281
402
|
*/
|
|
282
403
|
startSession(config?: StreamingSTTConfig): Promise<StreamingSTTSession>;
|
|
283
404
|
}
|
|
284
405
|
/**
|
|
285
406
|
* A VAD (Voice Activity Detection) or STT-derived event describing speech
|
|
286
407
|
* energy transitions over time.
|
|
408
|
+
*
|
|
409
|
+
* @see {@link IEndpointDetector.pushVadEvent} which consumes these.
|
|
410
|
+
*
|
|
411
|
+
* @example
|
|
412
|
+
* ```typescript
|
|
413
|
+
* const speechStart: VadEvent = {
|
|
414
|
+
* type: 'speech_start',
|
|
415
|
+
* timestamp: Date.now(),
|
|
416
|
+
* source: 'vad',
|
|
417
|
+
* energyLevel: 0.42,
|
|
418
|
+
* };
|
|
419
|
+
* ```
|
|
287
420
|
*/
|
|
288
421
|
export interface VadEvent {
|
|
289
422
|
/**
|
|
290
423
|
* Type of the VAD transition:
|
|
291
|
-
* - `'speech_start'`
|
|
292
|
-
* - `'speech_end'`
|
|
293
|
-
* - `'silence'`
|
|
424
|
+
* - `'speech_start'` -- voice energy detected after silence.
|
|
425
|
+
* - `'speech_end'` -- voice energy fell below the silence threshold.
|
|
426
|
+
* - `'silence'` -- periodic silence heartbeat (emitted at `silenceIntervalMs` cadence).
|
|
294
427
|
*/
|
|
295
428
|
type: 'speech_start' | 'speech_end' | 'silence';
|
|
296
429
|
/**
|
|
297
430
|
* Unix epoch millisecond timestamp at which this transition was detected.
|
|
431
|
+
* Used by the endpoint detector to compute speech duration.
|
|
298
432
|
*/
|
|
299
433
|
timestamp: number;
|
|
300
434
|
/**
|
|
301
435
|
* Optional raw energy level used to trigger this event (implementation-defined scale).
|
|
436
|
+
* Useful for debugging VAD sensitivity but not consumed by the pipeline logic.
|
|
302
437
|
*/
|
|
303
438
|
energyLevel?: number;
|
|
304
439
|
/**
|
|
305
440
|
* Origin of the VAD event:
|
|
306
|
-
* - `'vad'`
|
|
307
|
-
* - `'stt'`
|
|
441
|
+
* - `'vad'` -- emitted by a standalone VAD model (e.g. Silero, WebRTC VAD).
|
|
442
|
+
* - `'stt'` -- inferred from STT activity (e.g. provider-side endpointing signals).
|
|
443
|
+
*
|
|
444
|
+
* The pipeline synthesises STT-derived speech_start/speech_end events when
|
|
445
|
+
* a dedicated VAD is not available, using the `source` field to distinguish them.
|
|
308
446
|
*/
|
|
309
447
|
source?: 'vad' | 'stt';
|
|
310
448
|
}
|
|
311
449
|
/**
|
|
312
450
|
* Semantic reason why the endpoint detector decided the user has finished speaking.
|
|
451
|
+
*
|
|
452
|
+
* Each reason maps to a different detection strategy within the endpoint detector:
|
|
453
|
+
*
|
|
454
|
+
* | Reason | Detection strategy |
|
|
455
|
+
* |--------------------|--------------------------------------------------------|
|
|
456
|
+
* | `silence_timeout` | VAD silence exceeded configured threshold |
|
|
457
|
+
* | `punctuation` | STT final result ends with `.`, `?`, or `!` |
|
|
458
|
+
* | `syntax_complete` | Syntax model determined utterance is grammatically complete |
|
|
459
|
+
* | `semantic_model` | Small LM scored intent as complete |
|
|
460
|
+
* | `manual` | Explicitly triggered by a ClientTextMessage control |
|
|
461
|
+
* | `timeout` | Hard maximum turn duration elapsed |
|
|
462
|
+
*
|
|
463
|
+
* @see {@link TurnCompleteEvent.reason} which carries this value.
|
|
464
|
+
* @see {@link VoiceTurnMetadata.endpointReason} where it is forwarded to the agent.
|
|
313
465
|
*/
|
|
314
466
|
export type EndpointReason = 'silence_timeout' | 'punctuation' | 'syntax_complete' | 'semantic_model' | 'manual' | 'timeout';
|
|
315
467
|
/**
|
|
316
468
|
* Emitted by {@link IEndpointDetector} when it determines the user has finished
|
|
317
469
|
* their turn and the pipeline should hand off to the agent.
|
|
470
|
+
*
|
|
471
|
+
* @see {@link IEndpointDetector} which emits these on the `'turn_complete'` event.
|
|
472
|
+
* @see {@link VoicePipelineOrchestrator} which transitions to `'processing'` state upon receipt.
|
|
473
|
+
*
|
|
474
|
+
* @example
|
|
475
|
+
* ```typescript
|
|
476
|
+
* detector.on('turn_complete', (event: TurnCompleteEvent) => {
|
|
477
|
+
* console.log(`User said: "${event.transcript}" (reason: ${event.reason})`);
|
|
478
|
+
* });
|
|
479
|
+
* ```
|
|
318
480
|
*/
|
|
319
481
|
export interface TurnCompleteEvent {
|
|
320
482
|
/**
|
|
321
483
|
* The final consolidated transcript for this turn.
|
|
484
|
+
* May be empty for acoustic-only detectors that have no transcript access.
|
|
322
485
|
*/
|
|
323
486
|
transcript: string;
|
|
324
487
|
/**
|
|
325
488
|
* Aggregate STT confidence score for the transcript, in the range [0, 1].
|
|
489
|
+
* Zero when no STT data is available (e.g. acoustic-only mode).
|
|
326
490
|
*/
|
|
327
491
|
confidence: number;
|
|
328
492
|
/**
|
|
329
493
|
* Total duration of detected speech in this turn, in milliseconds.
|
|
494
|
+
* Computed as `speechEndTimestamp - speechStartTimestamp`.
|
|
330
495
|
*/
|
|
331
496
|
durationMs: number;
|
|
332
497
|
/**
|
|
333
498
|
* The semantic reason that triggered turn completion.
|
|
499
|
+
* @see {@link EndpointReason} for the full set of possible values.
|
|
334
500
|
*/
|
|
335
501
|
reason: EndpointReason;
|
|
336
502
|
}
|
|
@@ -339,29 +505,36 @@ export interface TurnCompleteEvent {
|
|
|
339
505
|
* Combines VAD events with linguistic signals to decide when the user
|
|
340
506
|
* has finished speaking.
|
|
341
507
|
*
|
|
342
|
-
*
|
|
343
|
-
*
|
|
344
|
-
*
|
|
345
|
-
*
|
|
508
|
+
* ## Events emitted
|
|
509
|
+
*
|
|
510
|
+
* | Event | Payload | Description |
|
|
511
|
+
* |------------------------|-------------------------|----------------------------------------|
|
|
512
|
+
* | `'turn_complete'` | {@link TurnCompleteEvent} | The user's turn has ended. |
|
|
513
|
+
* | `'speech_start'` | *(none)* | The user has started speaking. |
|
|
514
|
+
* | `'barge_in_detected'` | *(none)* | User spoke while TTS was playing. |
|
|
515
|
+
*
|
|
516
|
+
* @see {@link HeuristicEndpointDetector} for the rule-based implementation.
|
|
517
|
+
* @see {@link AcousticEndpointDetector} for the purely acoustic implementation.
|
|
346
518
|
*/
|
|
347
519
|
export interface IEndpointDetector extends EventEmitter {
|
|
348
520
|
/**
|
|
349
521
|
* Active detection strategy:
|
|
350
|
-
* - `'
|
|
351
|
-
* - `'
|
|
352
|
-
* - `'semantic'`
|
|
522
|
+
* - `'acoustic'` -- pure silence-timeout based (no transcript analysis).
|
|
523
|
+
* - `'heuristic'` -- silence + terminal punctuation + backchannel filtering.
|
|
524
|
+
* - `'semantic'` -- small LM scoring utterance completeness.
|
|
353
525
|
*/
|
|
354
526
|
readonly mode: 'acoustic' | 'heuristic' | 'semantic';
|
|
355
527
|
/**
|
|
356
528
|
* Push a VAD event from the upstream voice activity detector.
|
|
357
529
|
*
|
|
358
|
-
* @param event
|
|
530
|
+
* @param event - The VAD event to process.
|
|
359
531
|
*/
|
|
360
532
|
pushVadEvent(event: VadEvent): void;
|
|
361
533
|
/**
|
|
362
534
|
* Push a partial or final STT result for linguistic analysis.
|
|
535
|
+
* Acoustic-mode detectors may no-op this method.
|
|
363
536
|
*
|
|
364
|
-
* @param event
|
|
537
|
+
* @param event - Transcript event from the STT session.
|
|
365
538
|
*/
|
|
366
539
|
pushTranscript(event: TranscriptEvent): void;
|
|
367
540
|
/**
|
|
@@ -373,11 +546,14 @@ export interface IEndpointDetector extends EventEmitter {
|
|
|
373
546
|
/**
|
|
374
547
|
* Configuration for a diarization session. Controls expected speaker count and
|
|
375
548
|
* chunking behaviour for providers that require buffered audio.
|
|
549
|
+
*
|
|
550
|
+
* @see {@link IDiarizationEngine.startSession} which accepts this config.
|
|
376
551
|
*/
|
|
377
552
|
export interface DiarizationConfig {
|
|
378
553
|
/**
|
|
379
554
|
* Hint to the provider about how many distinct speakers are expected.
|
|
380
|
-
* When omitted, the provider uses auto-detection
|
|
555
|
+
* When omitted, the provider uses auto-detection (which typically adds
|
|
556
|
+
* latency as it needs more audio to stabilise the speaker count).
|
|
381
557
|
*/
|
|
382
558
|
expectedSpeakers?: number;
|
|
383
559
|
/**
|
|
@@ -401,6 +577,8 @@ export interface DiarizationConfig {
|
|
|
401
577
|
}
|
|
402
578
|
/**
|
|
403
579
|
* A contiguous segment of transcript text with millisecond timing metadata.
|
|
580
|
+
*
|
|
581
|
+
* @see {@link DiarizedSegment} which extends this with speaker attribution.
|
|
404
582
|
*/
|
|
405
583
|
export interface TranscriptSegment {
|
|
406
584
|
/**
|
|
@@ -419,6 +597,15 @@ export interface TranscriptSegment {
|
|
|
419
597
|
/**
|
|
420
598
|
* A {@link TranscriptSegment} extended with speaker attribution produced by the
|
|
421
599
|
* diarization engine.
|
|
600
|
+
*
|
|
601
|
+
* @see {@link DiarizationSession} which emits these on the `'segment'` event.
|
|
602
|
+
*
|
|
603
|
+
* @example
|
|
604
|
+
* ```typescript
|
|
605
|
+
* diarizationSession.on('segment', (seg: DiarizedSegment) => {
|
|
606
|
+
* console.log(`[${seg.speakerId}]: "${seg.text}"`);
|
|
607
|
+
* });
|
|
608
|
+
* ```
|
|
422
609
|
*/
|
|
423
610
|
export interface DiarizedSegment extends TranscriptSegment {
|
|
424
611
|
/**
|
|
@@ -428,7 +615,7 @@ export interface DiarizedSegment extends TranscriptSegment {
|
|
|
428
615
|
*/
|
|
429
616
|
speakerId: string;
|
|
430
617
|
/**
|
|
431
|
-
* Confidence that this segment belongs to
|
|
618
|
+
* Confidence that this segment belongs to {@link speakerId}, in the range [0, 1].
|
|
432
619
|
*/
|
|
433
620
|
speakerConfidence: number;
|
|
434
621
|
}
|
|
@@ -436,32 +623,39 @@ export interface DiarizedSegment extends TranscriptSegment {
|
|
|
436
623
|
* An active diarization session. Accepts raw audio and outputs speaker-attributed
|
|
437
624
|
* transcript segments via EventEmitter.
|
|
438
625
|
*
|
|
439
|
-
*
|
|
440
|
-
*
|
|
441
|
-
*
|
|
442
|
-
*
|
|
443
|
-
*
|
|
626
|
+
* ## Events emitted
|
|
627
|
+
*
|
|
628
|
+
* | Event | Payload | Description |
|
|
629
|
+
* |--------------------|------------------------------------------|--------------------------------|
|
|
630
|
+
* | `'segment'` | {@link DiarizedSegment} | A diarized segment is ready. |
|
|
631
|
+
* | `'speaker_change'` | `{ from: string; to: string }` | Speaker transition detected. |
|
|
632
|
+
* | `'error'` | `Error` | Unrecoverable engine error. |
|
|
633
|
+
* | `'close'` | *(none)* | Session terminated. |
|
|
634
|
+
*
|
|
635
|
+
* @see {@link IDiarizationEngine.startSession} which creates these sessions.
|
|
444
636
|
*/
|
|
445
637
|
export interface DiarizationSession extends EventEmitter {
|
|
446
638
|
/**
|
|
447
639
|
* Push a raw audio frame for diarization analysis.
|
|
448
640
|
*
|
|
449
|
-
* @param frame
|
|
641
|
+
* @param frame - PCM audio frame from the capture stream.
|
|
450
642
|
*/
|
|
451
643
|
pushAudio(frame: AudioFrame): void;
|
|
452
644
|
/**
|
|
453
645
|
* Apply speaker labels to an existing transcript using the session's
|
|
454
646
|
* current speaker model. Returns labelled segments.
|
|
455
647
|
*
|
|
456
|
-
* @param transcript
|
|
648
|
+
* @param transcript - Plain transcript segments to label.
|
|
649
|
+
* @returns Speaker-attributed segments with confidence scores.
|
|
457
650
|
*/
|
|
458
651
|
labelTranscript(transcript: TranscriptSegment[]): Promise<DiarizedSegment[]>;
|
|
459
652
|
/**
|
|
460
653
|
* Enroll a known speaker so subsequent audio is attributed to a named identity
|
|
461
654
|
* rather than an anonymous `SPEAKER_N` label.
|
|
462
655
|
*
|
|
463
|
-
* @param speakerId
|
|
464
|
-
* @param samples
|
|
656
|
+
* @param speakerId - Stable identifier for the speaker (e.g. user UUID).
|
|
657
|
+
* @param samples - Representative audio frames for the speaker's voice.
|
|
658
|
+
* Typically 10-30 seconds of clean speech produces the best embeddings.
|
|
465
659
|
*/
|
|
466
660
|
enrollSpeaker(speakerId: string, samples: AudioFrame[]): Promise<void>;
|
|
467
661
|
/**
|
|
@@ -473,18 +667,33 @@ export interface DiarizationSession extends EventEmitter {
|
|
|
473
667
|
* Factory interface for diarization (speaker separation) engines.
|
|
474
668
|
*
|
|
475
669
|
* Registered via `EXTENSION_KIND_DIARIZATION`.
|
|
670
|
+
*
|
|
671
|
+
* @see {@link DiarizationSession} for the session interface returned by {@link startSession}.
|
|
476
672
|
*/
|
|
477
673
|
export interface IDiarizationEngine {
|
|
478
674
|
/**
|
|
479
675
|
* Open a new diarization session.
|
|
480
676
|
*
|
|
481
|
-
* @param config
|
|
677
|
+
* @param config - Session configuration controlling chunking and speaker hints.
|
|
678
|
+
* @returns A live session that accepts audio and emits diarized segments.
|
|
482
679
|
*/
|
|
483
680
|
startSession(config?: DiarizationConfig): Promise<DiarizationSession>;
|
|
484
681
|
}
|
|
485
682
|
/**
|
|
486
683
|
* Configuration passed to {@link IStreamingTTS.startSession} when opening a new
|
|
487
684
|
* text-to-speech synthesis stream.
|
|
685
|
+
*
|
|
686
|
+
* @see {@link VoicePipelineConfig.ttsOptions} for provider-level overrides.
|
|
687
|
+
*
|
|
688
|
+
* @example
|
|
689
|
+
* ```typescript
|
|
690
|
+
* const config: StreamingTTSConfig = {
|
|
691
|
+
* voice: 'nova',
|
|
692
|
+
* format: 'opus',
|
|
693
|
+
* sampleRate: 24000,
|
|
694
|
+
* chunkingMode: 'sentence',
|
|
695
|
+
* };
|
|
696
|
+
* ```
|
|
488
697
|
*/
|
|
489
698
|
export interface StreamingTTSConfig {
|
|
490
699
|
/**
|
|
@@ -498,22 +707,22 @@ export interface StreamingTTSConfig {
|
|
|
498
707
|
*/
|
|
499
708
|
format?: 'pcm' | 'mp3' | 'opus';
|
|
500
709
|
/**
|
|
501
|
-
* Output sample rate in Hz. Must be supported by the chosen
|
|
710
|
+
* Output sample rate in Hz. Must be supported by the chosen {@link format}.
|
|
502
711
|
* @defaultValue 24000
|
|
503
712
|
*/
|
|
504
713
|
sampleRate?: number;
|
|
505
714
|
/**
|
|
506
715
|
* Controls how the provider segments incoming token streams into synthesis
|
|
507
716
|
* requests:
|
|
508
|
-
* - `'sentence'`
|
|
509
|
-
* - `'word'`
|
|
510
|
-
* - `'paragraph'`
|
|
717
|
+
* - `'sentence'` -- flush at sentence boundaries (lower latency than `'paragraph'`).
|
|
718
|
+
* - `'word'` -- flush at word boundaries (minimum latency, may sound choppy).
|
|
719
|
+
* - `'paragraph'` -- flush at paragraph boundaries (highest quality).
|
|
511
720
|
* @defaultValue 'sentence'
|
|
512
721
|
*/
|
|
513
722
|
chunkingMode?: 'sentence' | 'word' | 'paragraph';
|
|
514
723
|
/**
|
|
515
724
|
* Maximum number of milliseconds of audio to buffer before forcing a flush,
|
|
516
|
-
* regardless of
|
|
725
|
+
* regardless of {@link chunkingMode}. Prevents unbounded memory growth for very
|
|
517
726
|
* long utterances.
|
|
518
727
|
* @defaultValue 3000
|
|
519
728
|
*/
|
|
@@ -527,18 +736,23 @@ export interface StreamingTTSConfig {
|
|
|
527
736
|
* An active streaming TTS session. Token text is pushed in and encoded audio
|
|
528
737
|
* chunks flow out via EventEmitter.
|
|
529
738
|
*
|
|
530
|
-
*
|
|
531
|
-
*
|
|
532
|
-
*
|
|
533
|
-
*
|
|
534
|
-
*
|
|
739
|
+
* ## Events emitted
|
|
740
|
+
*
|
|
741
|
+
* | Event | Payload | Description |
|
|
742
|
+
* |--------------------|---------------------------|--------------------------------------|
|
|
743
|
+
* | `'audio'` | {@link EncodedAudioChunk} | A synthesised chunk ready for playback. |
|
|
744
|
+
* | `'flush_complete'` | *(none)* | All queued tokens have been synthesised. |
|
|
745
|
+
* | `'error'` | `Error` | Unrecoverable synthesis error. |
|
|
746
|
+
* | `'close'` | *(none)* | Session terminated. |
|
|
747
|
+
*
|
|
748
|
+
* @see {@link IStreamingTTS.startSession} which creates these sessions.
|
|
535
749
|
*/
|
|
536
750
|
export interface StreamingTTSSession extends EventEmitter {
|
|
537
751
|
/**
|
|
538
752
|
* Push one or more LLM output tokens into the synthesis buffer.
|
|
539
|
-
* The session will chunk and synthesise them according to
|
|
753
|
+
* The session will chunk and synthesise them according to {@link StreamingTTSConfig.chunkingMode}.
|
|
540
754
|
*
|
|
541
|
-
* @param tokens
|
|
755
|
+
* @param tokens - Text tokens to synthesise (may be partial words).
|
|
542
756
|
*/
|
|
543
757
|
pushTokens(tokens: string): void;
|
|
544
758
|
/**
|
|
@@ -549,6 +763,7 @@ export interface StreamingTTSSession extends EventEmitter {
|
|
|
549
763
|
/**
|
|
550
764
|
* Immediately stop synthesis and discard all buffered tokens. Audio chunks
|
|
551
765
|
* currently in-flight are not recalled; the caller must stop playback separately.
|
|
766
|
+
* Used during barge-in to halt the agent's response.
|
|
552
767
|
*/
|
|
553
768
|
cancel(): void;
|
|
554
769
|
/**
|
|
@@ -560,6 +775,8 @@ export interface StreamingTTSSession extends EventEmitter {
|
|
|
560
775
|
* Factory interface for streaming text-to-speech providers.
|
|
561
776
|
*
|
|
562
777
|
* Registered via `EXTENSION_KIND_STREAMING_TTS`.
|
|
778
|
+
*
|
|
779
|
+
* @see {@link StreamingTTSSession} for the session interface returned by {@link startSession}.
|
|
563
780
|
*/
|
|
564
781
|
export interface IStreamingTTS {
|
|
565
782
|
/**
|
|
@@ -569,33 +786,64 @@ export interface IStreamingTTS {
|
|
|
569
786
|
/**
|
|
570
787
|
* Open a new streaming synthesis session.
|
|
571
788
|
*
|
|
572
|
-
* @param config
|
|
789
|
+
* @param config - Session-level configuration overriding provider defaults.
|
|
790
|
+
* @returns A live session that accepts tokens and emits audio chunks.
|
|
791
|
+
* @throws {Error} If the provider fails to initialise (e.g. invalid API key).
|
|
573
792
|
*/
|
|
574
793
|
startSession(config?: StreamingTTSConfig): Promise<StreamingTTSSession>;
|
|
575
794
|
}
|
|
576
795
|
/**
|
|
577
796
|
* Contextual information supplied to {@link IBargeinHandler.handleBargein} so the
|
|
578
797
|
* handler can make an informed decision about how to respond to interruption.
|
|
798
|
+
*
|
|
799
|
+
* @see {@link IBargeinHandler} which consumes this context.
|
|
800
|
+
* @see {@link HardCutBargeinHandler} and {@link SoftFadeBargeinHandler} for concrete handlers.
|
|
801
|
+
*
|
|
802
|
+
* @example
|
|
803
|
+
* ```typescript
|
|
804
|
+
* const context: BargeinContext = {
|
|
805
|
+
* speechDurationMs: 450,
|
|
806
|
+
* interruptedText: 'I was explaining the process of...',
|
|
807
|
+
* playedDurationMs: 2300,
|
|
808
|
+
* };
|
|
809
|
+
* ```
|
|
579
810
|
*/
|
|
580
811
|
export interface BargeinContext {
|
|
581
812
|
/**
|
|
582
813
|
* Duration of detected user speech before the barge-in was confirmed, in ms.
|
|
583
|
-
* Short durations
|
|
814
|
+
* Short durations (< 100 ms) often indicate accidental noise, lip smacks,
|
|
815
|
+
* or breaths rather than intentional interruption.
|
|
816
|
+
*
|
|
817
|
+
* @see {@link HardCutBargeinHandler} which uses a 300 ms default threshold.
|
|
818
|
+
* @see {@link SoftFadeBargeinHandler} which uses a tiered threshold system.
|
|
584
819
|
*/
|
|
585
820
|
speechDurationMs: number;
|
|
586
821
|
/**
|
|
587
|
-
* The partial TTS text that was interrupted. Used to construct
|
|
588
|
-
*
|
|
822
|
+
* The partial TTS text that was interrupted. Used to construct
|
|
823
|
+
* {@link VoiceTurnMetadata.interruptedRemainder} so the agent knows what
|
|
824
|
+
* information was cut off and can avoid repeating what the user already heard.
|
|
589
825
|
*/
|
|
590
826
|
interruptedText: string;
|
|
591
827
|
/**
|
|
592
828
|
* How many milliseconds of audio had been played at the point of interruption.
|
|
829
|
+
* Combined with {@link interruptedText}, this allows the agent to estimate
|
|
830
|
+
* how much of the response the user actually heard.
|
|
593
831
|
*/
|
|
594
832
|
playedDurationMs: number;
|
|
595
833
|
}
|
|
596
834
|
/**
|
|
597
835
|
* Action the pipeline should take in response to a detected barge-in.
|
|
598
836
|
* Returned by {@link IBargeinHandler.handleBargein}.
|
|
837
|
+
*
|
|
838
|
+
* @see {@link IBargeinHandler} which returns this type.
|
|
839
|
+
*
|
|
840
|
+
* @example
|
|
841
|
+
* ```typescript
|
|
842
|
+
* const cancelAction: BargeinAction = { type: 'cancel', injectMarker: '[interrupted]' };
|
|
843
|
+
* const pauseAction: BargeinAction = { type: 'pause', fadeMs: 150 };
|
|
844
|
+
* const resumeAction: BargeinAction = { type: 'resume' };
|
|
845
|
+
* const ignoreAction: BargeinAction = { type: 'ignore' };
|
|
846
|
+
* ```
|
|
599
847
|
*/
|
|
600
848
|
export type BargeinAction = {
|
|
601
849
|
/** Immediately stop all TTS output and discard the remainder of the response. */
|
|
@@ -606,7 +854,7 @@ export type BargeinAction = {
|
|
|
606
854
|
*/
|
|
607
855
|
injectMarker?: string;
|
|
608
856
|
} | {
|
|
609
|
-
/** Fade out TTS audio over
|
|
857
|
+
/** Fade out TTS audio over {@link fadeMs} milliseconds then pause. */
|
|
610
858
|
type: 'pause';
|
|
611
859
|
/** Duration of the fade-out in milliseconds. @defaultValue 150 */
|
|
612
860
|
fadeMs?: number;
|
|
@@ -627,19 +875,22 @@ export type BargeinAction = {
|
|
|
627
875
|
* Handles the policy decision when a barge-in (user speaking over TTS) is detected.
|
|
628
876
|
*
|
|
629
877
|
* Registered via `EXTENSION_KIND_BARGEIN_HANDLER`.
|
|
878
|
+
*
|
|
879
|
+
* @see {@link HardCutBargeinHandler} for the immediate-stop strategy.
|
|
880
|
+
* @see {@link SoftFadeBargeinHandler} for the three-tier fade strategy.
|
|
630
881
|
*/
|
|
631
882
|
export interface IBargeinHandler {
|
|
632
883
|
/**
|
|
633
884
|
* Interruption strategy implemented by this handler:
|
|
634
|
-
* - `'hard-cut'`
|
|
635
|
-
* - `'soft-fade'`
|
|
885
|
+
* - `'hard-cut'` -- TTS audio is stopped immediately with no fade.
|
|
886
|
+
* - `'soft-fade'` -- TTS audio fades out over a short window before stopping.
|
|
636
887
|
*/
|
|
637
888
|
readonly mode: 'hard-cut' | 'soft-fade';
|
|
638
889
|
/**
|
|
639
890
|
* Called by the pipeline when a barge-in is confirmed. The handler evaluates
|
|
640
891
|
* the context and returns the action the pipeline should execute.
|
|
641
892
|
*
|
|
642
|
-
* @param context
|
|
893
|
+
* @param context - Contextual snapshot at the moment of interruption.
|
|
643
894
|
* @returns The action to perform (or a promise resolving to one).
|
|
644
895
|
*/
|
|
645
896
|
handleBargein(context: BargeinContext): BargeinAction | Promise<BargeinAction>;
|
|
@@ -647,21 +898,32 @@ export interface IBargeinHandler {
|
|
|
647
898
|
/**
|
|
648
899
|
* Adapts any AgentOS agent to the voice pipeline's turn-based protocol.
|
|
649
900
|
*
|
|
650
|
-
* The pipeline calls {@link
|
|
651
|
-
*
|
|
901
|
+
* The pipeline calls {@link sendText} with the user's final transcript and
|
|
902
|
+
* streams the response back as text tokens for TTS synthesis.
|
|
903
|
+
*
|
|
904
|
+
* @see {@link VoicePipelineOrchestrator} which invokes this during the
|
|
905
|
+
* `PROCESSING -> SPEAKING` state transition.
|
|
652
906
|
*/
|
|
653
907
|
export interface IVoicePipelineAgentSession {
|
|
654
908
|
/**
|
|
655
909
|
* Send the user's utterance to the agent and receive a streaming text response.
|
|
656
910
|
*
|
|
657
|
-
* @param text
|
|
658
|
-
* @param metadata
|
|
911
|
+
* @param text - Final transcript from the STT + endpoint detection pipeline.
|
|
912
|
+
* @param metadata - Rich metadata about the current voice turn.
|
|
659
913
|
* @returns An async iterable of text tokens (suitable for streaming into TTS).
|
|
914
|
+
*
|
|
915
|
+
* @example
|
|
916
|
+
* ```typescript
|
|
917
|
+
* const tokens = agentSession.sendText('What is the weather?', metadata);
|
|
918
|
+
* for await (const token of tokens) {
|
|
919
|
+
* ttsSession.pushTokens(token);
|
|
920
|
+
* }
|
|
921
|
+
* ```
|
|
660
922
|
*/
|
|
661
923
|
sendText(text: string, metadata: VoiceTurnMetadata): AsyncIterable<string>;
|
|
662
924
|
/**
|
|
663
925
|
* Abort the current agent response mid-stream (called on barge-in when
|
|
664
|
-
*
|
|
926
|
+
* {@link BargeinAction} type is `'cancel'`).
|
|
665
927
|
*
|
|
666
928
|
* Implementations should cancel any in-flight LLM requests. The pipeline
|
|
667
929
|
* will discard any tokens emitted after `abort()` is called.
|
|
@@ -671,6 +933,19 @@ export interface IVoicePipelineAgentSession {
|
|
|
671
933
|
/**
|
|
672
934
|
* Rich metadata attached to each voice turn and passed to the agent session.
|
|
673
935
|
* Enables the agent to tailor its response based on conversation dynamics.
|
|
936
|
+
*
|
|
937
|
+
* @see {@link IVoicePipelineAgentSession.sendText} which receives this metadata.
|
|
938
|
+
*
|
|
939
|
+
* @example
|
|
940
|
+
* ```typescript
|
|
941
|
+
* const metadata: VoiceTurnMetadata = {
|
|
942
|
+
* speakers: ['user'],
|
|
943
|
+
* endpointReason: 'punctuation',
|
|
944
|
+
* speechDurationMs: 3200,
|
|
945
|
+
* wasInterrupted: false,
|
|
946
|
+
* transcriptConfidence: 0.92,
|
|
947
|
+
* };
|
|
948
|
+
* ```
|
|
674
949
|
*/
|
|
675
950
|
export interface VoiceTurnMetadata {
|
|
676
951
|
/**
|
|
@@ -680,6 +955,7 @@ export interface VoiceTurnMetadata {
|
|
|
680
955
|
speakers: string[];
|
|
681
956
|
/**
|
|
682
957
|
* The reason the endpoint detector decided the user had finished speaking.
|
|
958
|
+
* @see {@link EndpointReason} for the full set of possible values.
|
|
683
959
|
*/
|
|
684
960
|
endpointReason: EndpointReason;
|
|
685
961
|
/**
|
|
@@ -692,8 +968,9 @@ export interface VoiceTurnMetadata {
|
|
|
692
968
|
*/
|
|
693
969
|
wasInterrupted: boolean;
|
|
694
970
|
/**
|
|
695
|
-
* When
|
|
696
|
-
* that was cut off. Useful for the agent to avoid re-stating information
|
|
971
|
+
* When {@link wasInterrupted} is `true`, the text remainder of the agent response
|
|
972
|
+
* that was cut off. Useful for the agent to avoid re-stating information
|
|
973
|
+
* the user has already heard.
|
|
697
974
|
*/
|
|
698
975
|
interruptedRemainder?: string;
|
|
699
976
|
/**
|
|
@@ -704,6 +981,21 @@ export interface VoiceTurnMetadata {
|
|
|
704
981
|
/**
|
|
705
982
|
* Top-level configuration for the {@link VoicePipelineSession}.
|
|
706
983
|
* Specifies which providers to use and their session-level options.
|
|
984
|
+
*
|
|
985
|
+
* @see {@link VoicePipelineOrchestrator} which consumes this configuration.
|
|
986
|
+
*
|
|
987
|
+
* @example
|
|
988
|
+
* ```typescript
|
|
989
|
+
* const config: VoicePipelineConfig = {
|
|
990
|
+
* stt: 'deepgram',
|
|
991
|
+
* tts: 'openai',
|
|
992
|
+
* endpointing: 'heuristic',
|
|
993
|
+
* bargeIn: 'hard-cut',
|
|
994
|
+
* voice: 'nova',
|
|
995
|
+
* format: 'opus',
|
|
996
|
+
* language: 'en-US',
|
|
997
|
+
* };
|
|
998
|
+
* ```
|
|
707
999
|
*/
|
|
708
1000
|
export interface VoicePipelineConfig {
|
|
709
1001
|
/**
|
|
@@ -719,7 +1011,8 @@ export interface VoicePipelineConfig {
|
|
|
719
1011
|
*/
|
|
720
1012
|
tts: string;
|
|
721
1013
|
/**
|
|
722
|
-
* Endpoint detection strategy. Defaults to `'
|
|
1014
|
+
* Endpoint detection strategy. Defaults to `'heuristic'` when omitted.
|
|
1015
|
+
* @see {@link IEndpointDetector.mode} for the strategy descriptions.
|
|
723
1016
|
*/
|
|
724
1017
|
endpointing?: 'acoustic' | 'heuristic' | 'semantic';
|
|
725
1018
|
/**
|
|
@@ -728,6 +1021,7 @@ export interface VoicePipelineConfig {
|
|
|
728
1021
|
diarization?: boolean;
|
|
729
1022
|
/**
|
|
730
1023
|
* Barge-in (interruption) handling mode. Defaults to `'hard-cut'` when omitted.
|
|
1024
|
+
* @see {@link HardCutBargeinHandler} and {@link SoftFadeBargeinHandler}.
|
|
731
1025
|
*/
|
|
732
1026
|
bargeIn?: 'hard-cut' | 'soft-fade' | 'disabled';
|
|
733
1027
|
/**
|
|
@@ -761,26 +1055,35 @@ export interface VoicePipelineConfig {
|
|
|
761
1055
|
/**
|
|
762
1056
|
* Lifecycle state of a {@link VoicePipelineSession}.
|
|
763
1057
|
*
|
|
764
|
-
* Valid transitions
|
|
1058
|
+
* ## Valid state transitions
|
|
1059
|
+
*
|
|
765
1060
|
* ```
|
|
766
|
-
* idle
|
|
767
|
-
*
|
|
768
|
-
* any
|
|
1061
|
+
* idle -> listening -> processing -> speaking -> listening
|
|
1062
|
+
* speaking -> interrupting -> listening
|
|
1063
|
+
* any -> closed
|
|
769
1064
|
* ```
|
|
1065
|
+
*
|
|
1066
|
+
* The state machine is enforced by {@link VoicePipelineOrchestrator._setState}
|
|
1067
|
+
* which emits `'state_changed'` on every transition.
|
|
770
1068
|
*/
|
|
771
1069
|
export type PipelineState = 'idle' | 'listening' | 'processing' | 'speaking' | 'interrupting' | 'closed';
|
|
772
1070
|
/**
|
|
773
1071
|
* A live voice pipeline session binding a transport, STT, endpoint detection,
|
|
774
1072
|
* optional diarization, agent, and TTS into a single coordinated lifecycle.
|
|
775
1073
|
*
|
|
776
|
-
*
|
|
777
|
-
*
|
|
778
|
-
*
|
|
779
|
-
*
|
|
780
|
-
*
|
|
781
|
-
*
|
|
782
|
-
*
|
|
783
|
-
*
|
|
1074
|
+
* ## Events emitted
|
|
1075
|
+
*
|
|
1076
|
+
* | Event | Payload | Description |
|
|
1077
|
+
* |--------------------------|---------------------------|-----------------------------------------|
|
|
1078
|
+
* | `'state_changed'` | {@link PipelineState} | Pipeline state machine transition. |
|
|
1079
|
+
* | `'turn_complete'` | {@link TurnCompleteEvent} | User turn detected. |
|
|
1080
|
+
* | `'agent_response_start'` | *(none)* | Agent has begun generating a response. |
|
|
1081
|
+
* | `'agent_response_end'` | *(none)* | Agent response fully played. |
|
|
1082
|
+
* | `'barge_in'` | {@link BargeinContext} | User interrupted TTS playback. |
|
|
1083
|
+
* | `'error'` | `Error` | Unrecoverable pipeline error. |
|
|
1084
|
+
* | `'close'` | *(none)* | Session has been fully torn down. |
|
|
1085
|
+
*
|
|
1086
|
+
* @see {@link VoicePipelineOrchestrator.startSession} which creates these sessions.
|
|
784
1087
|
*/
|
|
785
1088
|
export interface VoicePipelineSession extends EventEmitter {
|
|
786
1089
|
/**
|
|
@@ -789,6 +1092,7 @@ export interface VoicePipelineSession extends EventEmitter {
|
|
|
789
1092
|
readonly sessionId: string;
|
|
790
1093
|
/**
|
|
791
1094
|
* Current pipeline state machine state.
|
|
1095
|
+
* @see {@link PipelineState} for the full set of states and transitions.
|
|
792
1096
|
*/
|
|
793
1097
|
readonly state: PipelineState;
|
|
794
1098
|
/**
|
|
@@ -797,16 +1101,26 @@ export interface VoicePipelineSession extends EventEmitter {
|
|
|
797
1101
|
*/
|
|
798
1102
|
readonly transport: IStreamTransport;
|
|
799
1103
|
/**
|
|
800
|
-
* Gracefully close the session
|
|
1104
|
+
* Gracefully close the session -- flush in-flight audio, tear down all sub-sessions,
|
|
801
1105
|
* and emit `'close'`.
|
|
802
1106
|
*
|
|
803
|
-
* @param reason
|
|
1107
|
+
* @param reason - Optional human-readable reason for diagnostics.
|
|
804
1108
|
*/
|
|
805
1109
|
close(reason?: string): Promise<void>;
|
|
806
1110
|
}
|
|
807
1111
|
/**
|
|
808
1112
|
* Messages sent from the client (browser/app) to the server over the transport.
|
|
809
1113
|
* All messages are JSON-serialised.
|
|
1114
|
+
*
|
|
1115
|
+
* @see {@link ServerTextMessage} for the server-to-client counterpart.
|
|
1116
|
+
*
|
|
1117
|
+
* @example
|
|
1118
|
+
* ```typescript
|
|
1119
|
+
* const configMsg: ClientTextMessage = {
|
|
1120
|
+
* type: 'config',
|
|
1121
|
+
* config: { stt: 'deepgram', tts: 'openai' },
|
|
1122
|
+
* };
|
|
1123
|
+
* ```
|
|
810
1124
|
*/
|
|
811
1125
|
export type ClientTextMessage = {
|
|
812
1126
|
/**
|
|
@@ -827,6 +1141,18 @@ export type ClientTextMessage = {
|
|
|
827
1141
|
/**
|
|
828
1142
|
* Messages sent from the server to the client over the transport.
|
|
829
1143
|
* All messages are JSON-serialised.
|
|
1144
|
+
*
|
|
1145
|
+
* @see {@link ClientTextMessage} for the client-to-server counterpart.
|
|
1146
|
+
* @see {@link IStreamTransport.sendControl} which sends these messages.
|
|
1147
|
+
*
|
|
1148
|
+
* @example
|
|
1149
|
+
* ```typescript
|
|
1150
|
+
* const sessionStarted: ServerTextMessage = {
|
|
1151
|
+
* type: 'session_started',
|
|
1152
|
+
* sessionId: 'abc-123',
|
|
1153
|
+
* config: { stt: 'deepgram', tts: 'openai' },
|
|
1154
|
+
* };
|
|
1155
|
+
* ```
|
|
830
1156
|
*/
|
|
831
1157
|
export type ServerTextMessage = {
|
|
832
1158
|
/**
|
|
@@ -858,8 +1184,8 @@ export type ServerTextMessage = {
|
|
|
858
1184
|
type: 'agent_thinking';
|
|
859
1185
|
} | {
|
|
860
1186
|
/**
|
|
861
|
-
* Emitted when TTS synthesis begins
|
|
862
|
-
* Clients may hide thinking indicators.
|
|
1187
|
+
* Emitted when TTS synthesis begins -- audio chunks will follow over the audio channel.
|
|
1188
|
+
* Clients may hide thinking indicators and prepare audio playback.
|
|
863
1189
|
*/
|
|
864
1190
|
type: 'agent_speaking';
|
|
865
1191
|
/**
|
|
@@ -890,16 +1216,16 @@ export type ServerTextMessage = {
|
|
|
890
1216
|
* The session will be closed after this message.
|
|
891
1217
|
*/
|
|
892
1218
|
type: 'error';
|
|
893
|
-
/** Machine-readable error code. */
|
|
1219
|
+
/** Machine-readable error code (e.g. `'STT_PROVIDER_ERROR'`). */
|
|
894
1220
|
code: string;
|
|
895
|
-
/** Human-readable description. */
|
|
1221
|
+
/** Human-readable description of the error. */
|
|
896
1222
|
message: string;
|
|
897
1223
|
} | {
|
|
898
1224
|
/**
|
|
899
1225
|
* Emitted as the final message before the server closes the transport.
|
|
900
1226
|
*/
|
|
901
1227
|
type: 'session_ended';
|
|
902
|
-
/** Optional human-readable reason. */
|
|
1228
|
+
/** Optional human-readable reason for the session ending. */
|
|
903
1229
|
reason?: string;
|
|
904
1230
|
};
|
|
905
1231
|
//# sourceMappingURL=types.d.ts.map
|