@framers/agentos 0.1.75 → 0.1.76
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +139 -34
- package/dist/core/agency/AgentCommunicationBus.d.ts +1 -0
- package/dist/core/agency/AgentCommunicationBus.d.ts.map +1 -1
- package/dist/core/agency/AgentCommunicationBus.js +62 -8
- package/dist/core/agency/AgentCommunicationBus.js.map +1 -1
- package/dist/core/agency/IAgentCommunicationBus.d.ts +1 -1
- package/dist/core/agency/IAgentCommunicationBus.d.ts.map +1 -1
- package/dist/orchestration/runtime/LoopController.d.ts +10 -10
- package/dist/orchestration/runtime/LoopController.d.ts.map +1 -1
- package/dist/orchestration/runtime/LoopController.js +1 -1
- package/dist/orchestration/runtime/LoopController.js.map +1 -1
- package/dist/orchestration/runtime/index.d.ts +1 -1
- package/dist/orchestration/runtime/index.d.ts.map +1 -1
- package/dist/orchestration/runtime/index.js.map +1 -1
- package/dist/speech/FallbackProxy.d.ts +104 -0
- package/dist/speech/FallbackProxy.d.ts.map +1 -0
- package/dist/speech/FallbackProxy.js +151 -0
- package/dist/speech/FallbackProxy.js.map +1 -0
- package/dist/speech/SpeechProviderResolver.d.ts +103 -0
- package/dist/speech/SpeechProviderResolver.d.ts.map +1 -0
- package/dist/speech/SpeechProviderResolver.js +256 -0
- package/dist/speech/SpeechProviderResolver.js.map +1 -0
- package/dist/speech/SpeechRuntime.d.ts +23 -1
- package/dist/speech/SpeechRuntime.d.ts.map +1 -1
- package/dist/speech/SpeechRuntime.js +82 -8
- package/dist/speech/SpeechRuntime.js.map +1 -1
- package/dist/speech/index.d.ts +6 -0
- package/dist/speech/index.d.ts.map +1 -1
- package/dist/speech/index.js +6 -0
- package/dist/speech/index.js.map +1 -1
- package/dist/speech/providerCatalog.d.ts.map +1 -1
- package/dist/speech/providerCatalog.js +15 -1
- package/dist/speech/providerCatalog.js.map +1 -1
- package/dist/speech/providers/AssemblyAISTTProvider.d.ts +49 -0
- package/dist/speech/providers/AssemblyAISTTProvider.d.ts.map +1 -0
- package/dist/speech/providers/AssemblyAISTTProvider.js +151 -0
- package/dist/speech/providers/AssemblyAISTTProvider.js.map +1 -0
- package/dist/speech/providers/AzureSpeechSTTProvider.d.ts +48 -0
- package/dist/speech/providers/AzureSpeechSTTProvider.d.ts.map +1 -0
- package/dist/speech/providers/AzureSpeechSTTProvider.js +90 -0
- package/dist/speech/providers/AzureSpeechSTTProvider.js.map +1 -0
- package/dist/speech/providers/AzureSpeechTTSProvider.d.ts +60 -0
- package/dist/speech/providers/AzureSpeechTTSProvider.d.ts.map +1 -0
- package/dist/speech/providers/AzureSpeechTTSProvider.js +127 -0
- package/dist/speech/providers/AzureSpeechTTSProvider.js.map +1 -0
- package/dist/speech/providers/DeepgramBatchSTTProvider.d.ts +55 -0
- package/dist/speech/providers/DeepgramBatchSTTProvider.d.ts.map +1 -0
- package/dist/speech/providers/DeepgramBatchSTTProvider.js +102 -0
- package/dist/speech/providers/DeepgramBatchSTTProvider.js.map +1 -0
- package/dist/speech/types.d.ts +35 -0
- package/dist/speech/types.d.ts.map +1 -1
- package/dist/voice/CallManager.d.ts +1 -1
- package/dist/voice/CallManager.d.ts.map +1 -1
- package/dist/voice/CallManager.js +9 -0
- package/dist/voice/CallManager.js.map +1 -1
- package/dist/voice/MediaStreamParser.d.ts +83 -0
- package/dist/voice/MediaStreamParser.d.ts.map +1 -0
- package/dist/voice/MediaStreamParser.js +2 -0
- package/dist/voice/MediaStreamParser.js.map +1 -0
- package/dist/voice/TelephonyStreamTransport.d.ts +112 -0
- package/dist/voice/TelephonyStreamTransport.d.ts.map +1 -0
- package/dist/voice/TelephonyStreamTransport.js +208 -0
- package/dist/voice/TelephonyStreamTransport.js.map +1 -0
- package/dist/voice/index.d.ts +10 -0
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +11 -0
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/parsers/PlivoMediaStreamParser.d.ts +43 -0
- package/dist/voice/parsers/PlivoMediaStreamParser.d.ts.map +1 -0
- package/dist/voice/parsers/PlivoMediaStreamParser.js +92 -0
- package/dist/voice/parsers/PlivoMediaStreamParser.js.map +1 -0
- package/dist/voice/parsers/TelnyxMediaStreamParser.d.ts +51 -0
- package/dist/voice/parsers/TelnyxMediaStreamParser.d.ts.map +1 -0
- package/dist/voice/parsers/TelnyxMediaStreamParser.js +103 -0
- package/dist/voice/parsers/TelnyxMediaStreamParser.js.map +1 -0
- package/dist/voice/parsers/TwilioMediaStreamParser.d.ts +50 -0
- package/dist/voice/parsers/TwilioMediaStreamParser.d.ts.map +1 -0
- package/dist/voice/parsers/TwilioMediaStreamParser.js +144 -0
- package/dist/voice/parsers/TwilioMediaStreamParser.js.map +1 -0
- package/dist/voice/providers/plivo.d.ts +77 -0
- package/dist/voice/providers/plivo.d.ts.map +1 -0
- package/dist/voice/providers/plivo.js +180 -0
- package/dist/voice/providers/plivo.js.map +1 -0
- package/dist/voice/providers/telnyx.d.ts +93 -0
- package/dist/voice/providers/telnyx.d.ts.map +1 -0
- package/dist/voice/providers/telnyx.js +193 -0
- package/dist/voice/providers/telnyx.js.map +1 -0
- package/dist/voice/providers/twilio.d.ts +79 -0
- package/dist/voice/providers/twilio.d.ts.map +1 -0
- package/dist/voice/providers/twilio.js +191 -0
- package/dist/voice/providers/twilio.js.map +1 -0
- package/dist/voice/twiml.d.ts +69 -0
- package/dist/voice/twiml.d.ts.map +1 -0
- package/dist/voice/twiml.js +92 -0
- package/dist/voice/twiml.js.map +1 -0
- package/dist/voice/types.d.ts +9 -1
- package/dist/voice/types.d.ts.map +1 -1
- package/dist/voice-pipeline/AcousticEndpointDetector.d.ts +90 -0
- package/dist/voice-pipeline/AcousticEndpointDetector.d.ts.map +1 -0
- package/dist/voice-pipeline/AcousticEndpointDetector.js +123 -0
- package/dist/voice-pipeline/AcousticEndpointDetector.js.map +1 -0
- package/dist/voice-pipeline/HardCutBargeinHandler.d.ts +67 -0
- package/dist/voice-pipeline/HardCutBargeinHandler.d.ts.map +1 -0
- package/dist/voice-pipeline/HardCutBargeinHandler.js +55 -0
- package/dist/voice-pipeline/HardCutBargeinHandler.js.map +1 -0
- package/dist/voice-pipeline/HeuristicEndpointDetector.d.ts +128 -0
- package/dist/voice-pipeline/HeuristicEndpointDetector.d.ts.map +1 -0
- package/dist/voice-pipeline/HeuristicEndpointDetector.js +240 -0
- package/dist/voice-pipeline/HeuristicEndpointDetector.js.map +1 -0
- package/dist/voice-pipeline/SoftFadeBargeinHandler.d.ts +96 -0
- package/dist/voice-pipeline/SoftFadeBargeinHandler.d.ts.map +1 -0
- package/dist/voice-pipeline/SoftFadeBargeinHandler.js +69 -0
- package/dist/voice-pipeline/SoftFadeBargeinHandler.js.map +1 -0
- package/dist/voice-pipeline/VoicePipelineOrchestrator.d.ts +122 -0
- package/dist/voice-pipeline/VoicePipelineOrchestrator.d.ts.map +1 -0
- package/dist/voice-pipeline/VoicePipelineOrchestrator.js +317 -0
- package/dist/voice-pipeline/VoicePipelineOrchestrator.js.map +1 -0
- package/dist/voice-pipeline/WebSocketStreamTransport.d.ts +148 -0
- package/dist/voice-pipeline/WebSocketStreamTransport.d.ts.map +1 -0
- package/dist/voice-pipeline/WebSocketStreamTransport.js +207 -0
- package/dist/voice-pipeline/WebSocketStreamTransport.js.map +1 -0
- package/dist/voice-pipeline/index.d.ts +13 -0
- package/dist/voice-pipeline/index.d.ts.map +1 -0
- package/dist/voice-pipeline/index.js +13 -0
- package/dist/voice-pipeline/index.js.map +1 -0
- package/dist/voice-pipeline/types.d.ts +905 -0
- package/dist/voice-pipeline/types.d.ts.map +1 -0
- package/dist/voice-pipeline/types.js +23 -0
- package/dist/voice-pipeline/types.js.map +1 -0
- package/package.json +6 -1
|
@@ -0,0 +1,905 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @module voice-pipeline/types
|
|
3
|
+
*
|
|
4
|
+
* Core interfaces and types for the AgentOS streaming voice pipeline.
|
|
5
|
+
*
|
|
6
|
+
* The voice pipeline connects speech-to-text, endpoint detection, diarization,
|
|
7
|
+
* agent processing, and text-to-speech into a low-latency, real-time conversation
|
|
8
|
+
* system. All heavy I/O crosses EventEmitter-based session boundaries to keep
|
|
9
|
+
* the hot path non-blocking.
|
|
10
|
+
*
|
|
11
|
+
* Dependency order (no circular refs):
|
|
12
|
+
* AudioFrame / EncodedAudioChunk
|
|
13
|
+
* → Transport (IStreamTransport)
|
|
14
|
+
* → STT (IStreamingSTT + StreamingSTTSession)
|
|
15
|
+
* → Endpoint detection (IEndpointDetector + VadEvent)
|
|
16
|
+
* → Diarization (IDiarizationEngine + DiarizationSession)
|
|
17
|
+
* → TTS (IStreamingTTS + StreamingTTSSession)
|
|
18
|
+
* → Barge-in (IBargeinHandler)
|
|
19
|
+
* → Session (VoicePipelineSession)
|
|
20
|
+
* → Protocol messages (ClientTextMessage, ServerTextMessage)
|
|
21
|
+
*/
|
|
22
|
+
import type { EventEmitter } from 'node:events';
|
|
23
|
+
/**
|
|
24
|
+
* A single frame of raw PCM audio, as produced by a microphone capture or
|
|
25
|
+
* a VAD pre-processor. Each frame typically represents 10–20 ms of audio.
|
|
26
|
+
*/
|
|
27
|
+
export interface AudioFrame {
|
|
28
|
+
/**
|
|
29
|
+
* Interleaved 32-bit float PCM samples, normalised to [-1, 1].
|
|
30
|
+
* For mono audio this is a flat array; stereo interleaves L/R pairs.
|
|
31
|
+
*/
|
|
32
|
+
samples: Float32Array;
|
|
33
|
+
/**
|
|
34
|
+
* Samples per second (e.g. 16000, 24000, 48000).
|
|
35
|
+
*/
|
|
36
|
+
sampleRate: number;
|
|
37
|
+
/**
|
|
38
|
+
* Unix epoch millisecond timestamp at which this frame was captured.
|
|
39
|
+
* Used for synchronisation across STT, VAD, and diarization streams.
|
|
40
|
+
*/
|
|
41
|
+
timestamp: number;
|
|
42
|
+
/**
|
|
43
|
+
* Optional hint from the capture layer identifying the speaker source
|
|
44
|
+
* (e.g. a hardware device label or a WebRTC peer ID). Used by the
|
|
45
|
+
* diarization engine when native speaker IDs are unavailable.
|
|
46
|
+
*/
|
|
47
|
+
speakerHint?: string;
|
|
48
|
+
}
|
|
49
|
+
/**
|
|
50
|
+
* An encoded audio chunk ready for transmission over the wire (e.g. to a
|
|
51
|
+
* TTS websocket or a playback buffer). Contains the rendered text to allow
|
|
52
|
+
* barge-in handlers to track interrupted utterance state.
|
|
53
|
+
*/
|
|
54
|
+
export interface EncodedAudioChunk {
|
|
55
|
+
/**
|
|
56
|
+
* Raw encoded bytes in the format specified by `format`.
|
|
57
|
+
*/
|
|
58
|
+
audio: Buffer;
|
|
59
|
+
/**
|
|
60
|
+
* Codec/container format of `audio`.
|
|
61
|
+
*/
|
|
62
|
+
format: 'pcm' | 'mp3' | 'opus';
|
|
63
|
+
/**
|
|
64
|
+
* Samples per second for the encoded stream.
|
|
65
|
+
*/
|
|
66
|
+
sampleRate: number;
|
|
67
|
+
/**
|
|
68
|
+
* Playback duration of this chunk in milliseconds.
|
|
69
|
+
*/
|
|
70
|
+
durationMs: number;
|
|
71
|
+
/**
|
|
72
|
+
* The text fragment that was synthesised into this chunk. Preserved so
|
|
73
|
+
* barge-in handlers can report `interruptedRemainder` accurately.
|
|
74
|
+
*/
|
|
75
|
+
text: string;
|
|
76
|
+
}
|
|
77
|
+
/**
|
|
78
|
+
* Discriminated union of control messages sent from the pipeline to the
|
|
79
|
+
* underlying stream transport (e.g. a WebSocket or WebRTC data-channel).
|
|
80
|
+
*/
|
|
81
|
+
export type TransportControlMessage = {
|
|
82
|
+
/** Mute the outbound audio stream without closing the session. */
|
|
83
|
+
type: 'mute';
|
|
84
|
+
} | {
|
|
85
|
+
/** Unmute the outbound audio stream previously muted. */
|
|
86
|
+
type: 'unmute';
|
|
87
|
+
} | {
|
|
88
|
+
/** Reconfigure transport-layer parameters at runtime. */
|
|
89
|
+
type: 'config';
|
|
90
|
+
/** Partial configuration overrides. Keys are transport-specific. */
|
|
91
|
+
params: Record<string, unknown>;
|
|
92
|
+
} | {
|
|
93
|
+
/** Gracefully stop the transport and signal end-of-stream. */
|
|
94
|
+
type: 'stop';
|
|
95
|
+
/** Optional human-readable reason included in the closing handshake. */
|
|
96
|
+
reason?: string;
|
|
97
|
+
};
|
|
98
|
+
/**
|
|
99
|
+
* Abstraction over any bidirectional audio/text stream transport.
|
|
100
|
+
* Implementations include WebSocket, WebRTC data-channel, and in-process pipes.
|
|
101
|
+
*
|
|
102
|
+
* Emits:
|
|
103
|
+
* - `'audio'` (AudioFrame) — inbound audio from the remote client.
|
|
104
|
+
* - `'message'` (ClientTextMessage) — inbound JSON control message from the client.
|
|
105
|
+
* - `'close'` () — transport has been closed (either side).
|
|
106
|
+
* - `'error'` (Error) — fatal transport error.
|
|
107
|
+
*/
|
|
108
|
+
export interface IStreamTransport extends EventEmitter {
|
|
109
|
+
/**
|
|
110
|
+
* Stable identifier for this transport connection (e.g. a UUID or socket ID).
|
|
111
|
+
*/
|
|
112
|
+
readonly id: string;
|
|
113
|
+
/**
|
|
114
|
+
* Current connection state.
|
|
115
|
+
* - `'connecting'` — handshake in progress.
|
|
116
|
+
* - `'open'` — fully established and ready.
|
|
117
|
+
* - `'closing'` — graceful teardown initiated.
|
|
118
|
+
* - `'closed'` — no longer usable.
|
|
119
|
+
*/
|
|
120
|
+
readonly state: 'connecting' | 'open' | 'closing' | 'closed';
|
|
121
|
+
/**
|
|
122
|
+
* Send a synthesised audio chunk to the remote client for playback.
|
|
123
|
+
* Resolves once the chunk has been handed to the underlying I/O layer.
|
|
124
|
+
*
|
|
125
|
+
* @param chunk — Encoded audio to deliver.
|
|
126
|
+
*/
|
|
127
|
+
sendAudio(chunk: EncodedAudioChunk): Promise<void>;
|
|
128
|
+
/**
|
|
129
|
+
* Send a JSON control message to the remote client.
|
|
130
|
+
*
|
|
131
|
+
* @param message — Server-side protocol message.
|
|
132
|
+
*/
|
|
133
|
+
sendControl(message: ServerTextMessage): Promise<void>;
|
|
134
|
+
/**
|
|
135
|
+
* Close the transport, optionally supplying a WebSocket-style close code and
|
|
136
|
+
* human-readable reason string for diagnostics.
|
|
137
|
+
*
|
|
138
|
+
* @param code — Optional numeric close code (defaults to 1000 normal closure).
|
|
139
|
+
* @param reason — Optional human-readable close reason.
|
|
140
|
+
*/
|
|
141
|
+
close(code?: number, reason?: string): void;
|
|
142
|
+
}
|
|
143
|
+
/**
|
|
144
|
+
* Configuration passed to {@link IStreamingSTT.startSession} when opening a new
|
|
145
|
+
* speech recognition stream.
|
|
146
|
+
*/
|
|
147
|
+
export interface StreamingSTTConfig {
|
|
148
|
+
/**
|
|
149
|
+
* BCP-47 language code for recognition (e.g. `'en-US'`, `'fr-FR'`).
|
|
150
|
+
* Falls back to the provider default when omitted.
|
|
151
|
+
*/
|
|
152
|
+
language?: string;
|
|
153
|
+
/**
|
|
154
|
+
* Whether to emit interim (non-final) transcript events. When `true`,
|
|
155
|
+
* partial results arrive more frequently at the cost of higher word error rate.
|
|
156
|
+
* @defaultValue true
|
|
157
|
+
*/
|
|
158
|
+
interimResults?: boolean;
|
|
159
|
+
/**
|
|
160
|
+
* Enable automatic punctuation insertion if the provider supports it.
|
|
161
|
+
* @defaultValue true
|
|
162
|
+
*/
|
|
163
|
+
punctuate?: boolean;
|
|
164
|
+
/**
|
|
165
|
+
* Mask profanity in transcripts if supported by the provider.
|
|
166
|
+
* @defaultValue false
|
|
167
|
+
*/
|
|
168
|
+
profanityFilter?: boolean;
|
|
169
|
+
/**
|
|
170
|
+
* Pass-through options forwarded verbatim to the underlying provider SDK.
|
|
171
|
+
* Useful for enabling provider-specific features (e.g. custom vocabulary,
|
|
172
|
+
* speaker adaptation models) without modifying the interface.
|
|
173
|
+
*/
|
|
174
|
+
providerOptions?: Record<string, unknown>;
|
|
175
|
+
}
|
|
176
|
+
/**
|
|
177
|
+
* A single word within a {@link TranscriptEvent}, augmented with timing and
|
|
178
|
+
* optional speaker attribution.
|
|
179
|
+
*/
|
|
180
|
+
export interface TranscriptWord {
|
|
181
|
+
/**
|
|
182
|
+
* The recognised word token (may include punctuation if `punctuate` is enabled).
|
|
183
|
+
*/
|
|
184
|
+
word: string;
|
|
185
|
+
/**
|
|
186
|
+
* Millisecond offset from the start of the utterance at which this word begins.
|
|
187
|
+
*/
|
|
188
|
+
start: number;
|
|
189
|
+
/**
|
|
190
|
+
* Millisecond offset from the start of the utterance at which this word ends.
|
|
191
|
+
*/
|
|
192
|
+
end: number;
|
|
193
|
+
/**
|
|
194
|
+
* Recognition confidence in the range [0, 1]. Higher is better.
|
|
195
|
+
*/
|
|
196
|
+
confidence: number;
|
|
197
|
+
/**
|
|
198
|
+
* Speaker label when diarization is performed natively by the STT provider
|
|
199
|
+
* (e.g. Deepgram's `diarize` option). When diarization is handled by a
|
|
200
|
+
* separate {@link IDiarizationEngine}, this field is populated post-hoc.
|
|
201
|
+
*/
|
|
202
|
+
speaker?: string;
|
|
203
|
+
}
|
|
204
|
+
/**
|
|
205
|
+
* Emitted by a {@link StreamingSTTSession} each time the provider produces a
|
|
206
|
+
* recognition hypothesis.
|
|
207
|
+
*/
|
|
208
|
+
export interface TranscriptEvent {
|
|
209
|
+
/**
|
|
210
|
+
* Full transcript text for the current utterance hypothesis.
|
|
211
|
+
*/
|
|
212
|
+
text: string;
|
|
213
|
+
/**
|
|
214
|
+
* Aggregate confidence score for `text` in the range [0, 1].
|
|
215
|
+
*/
|
|
216
|
+
confidence: number;
|
|
217
|
+
/**
|
|
218
|
+
* Word-level detail, sorted by `start` time. May be empty for interim events
|
|
219
|
+
* from providers that only supply word timing in final results.
|
|
220
|
+
*/
|
|
221
|
+
words: TranscriptWord[];
|
|
222
|
+
/**
|
|
223
|
+
* `true` when this hypothesis is stable and will not be revised.
|
|
224
|
+
* `false` for interim (streaming) hypotheses.
|
|
225
|
+
*/
|
|
226
|
+
isFinal: boolean;
|
|
227
|
+
/**
|
|
228
|
+
* Duration of the recognised speech segment in milliseconds.
|
|
229
|
+
* Populated only on final events where the provider supplies timing.
|
|
230
|
+
*/
|
|
231
|
+
durationMs?: number;
|
|
232
|
+
}
|
|
233
|
+
/**
|
|
234
|
+
* An active streaming speech-to-text session. Audio frames are pushed in
|
|
235
|
+
* and transcript events flow out via EventEmitter.
|
|
236
|
+
*
|
|
237
|
+
* Emits:
|
|
238
|
+
* - `'transcript'` (TranscriptEvent) — interim or final hypothesis.
|
|
239
|
+
* - `'error'` (Error) — unrecoverable provider error.
|
|
240
|
+
* - `'close'` () — session has been fully terminated.
|
|
241
|
+
*/
|
|
242
|
+
export interface StreamingSTTSession extends EventEmitter {
|
|
243
|
+
/**
|
|
244
|
+
* Push a raw audio frame into the recognition stream. Frames must arrive
|
|
245
|
+
* in capture order; gaps or out-of-order frames degrade accuracy.
|
|
246
|
+
*
|
|
247
|
+
* @param frame — PCM audio frame to process.
|
|
248
|
+
*/
|
|
249
|
+
pushAudio(frame: AudioFrame): void;
|
|
250
|
+
/**
|
|
251
|
+
* Signal end-of-utterance to the provider. The provider will flush any
|
|
252
|
+
* buffered audio and emit a final {@link TranscriptEvent} before `'close'`.
|
|
253
|
+
*/
|
|
254
|
+
flush(): Promise<void>;
|
|
255
|
+
/**
|
|
256
|
+
* Immediately terminate the session without waiting for a final result.
|
|
257
|
+
* Useful during barge-in where the in-flight hypothesis is discarded.
|
|
258
|
+
*/
|
|
259
|
+
close(): void;
|
|
260
|
+
}
|
|
261
|
+
/**
|
|
262
|
+
* Factory interface for streaming speech-to-text providers.
|
|
263
|
+
*
|
|
264
|
+
* Implementations are registered via the `EXTENSION_KIND_STREAMING_STT`
|
|
265
|
+
* extension kind and resolved by the voice pipeline at session creation time.
|
|
266
|
+
*/
|
|
267
|
+
export interface IStreamingSTT {
|
|
268
|
+
/**
|
|
269
|
+
* Unique, stable identifier for this provider (e.g. `'deepgram'`, `'whisper-live'`).
|
|
270
|
+
*/
|
|
271
|
+
readonly providerId: string;
|
|
272
|
+
/**
|
|
273
|
+
* `true` when the provider has at least one active session open.
|
|
274
|
+
*/
|
|
275
|
+
readonly isStreaming: boolean;
|
|
276
|
+
/**
|
|
277
|
+
* Open a new streaming recognition session.
|
|
278
|
+
*
|
|
279
|
+
* @param config — Session-level configuration overriding provider defaults.
|
|
280
|
+
* @returns A ready-to-use session whose lifecycle is independent of this factory.
|
|
281
|
+
*/
|
|
282
|
+
startSession(config?: StreamingSTTConfig): Promise<StreamingSTTSession>;
|
|
283
|
+
}
|
|
284
|
+
/**
|
|
285
|
+
* A VAD (Voice Activity Detection) or STT-derived event describing speech
|
|
286
|
+
* energy transitions over time.
|
|
287
|
+
*/
|
|
288
|
+
export interface VadEvent {
|
|
289
|
+
/**
|
|
290
|
+
* Type of the VAD transition:
|
|
291
|
+
* - `'speech_start'` — voice energy detected after silence.
|
|
292
|
+
* - `'speech_end'` — voice energy fell below the silence threshold.
|
|
293
|
+
* - `'silence'` — periodic silence heartbeat (emitted at `silenceIntervalMs` cadence).
|
|
294
|
+
*/
|
|
295
|
+
type: 'speech_start' | 'speech_end' | 'silence';
|
|
296
|
+
/**
|
|
297
|
+
* Unix epoch millisecond timestamp at which this transition was detected.
|
|
298
|
+
*/
|
|
299
|
+
timestamp: number;
|
|
300
|
+
/**
|
|
301
|
+
* Optional raw energy level used to trigger this event (implementation-defined scale).
|
|
302
|
+
*/
|
|
303
|
+
energyLevel?: number;
|
|
304
|
+
/**
|
|
305
|
+
* Origin of the VAD event:
|
|
306
|
+
* - `'vad'` — emitted by a standalone VAD model (e.g. Silero, WebRTC VAD).
|
|
307
|
+
* - `'stt'` — inferred from STT activity (e.g. provider-side endpointing signals).
|
|
308
|
+
*/
|
|
309
|
+
source?: 'vad' | 'stt';
|
|
310
|
+
}
|
|
311
|
+
/**
|
|
312
|
+
* Semantic reason why the endpoint detector decided the user has finished speaking.
|
|
313
|
+
*/
|
|
314
|
+
export type EndpointReason = 'silence_timeout' | 'punctuation' | 'syntax_complete' | 'semantic_model' | 'manual' | 'timeout';
|
|
315
|
+
/**
|
|
316
|
+
* Emitted by {@link IEndpointDetector} when it determines the user has finished
|
|
317
|
+
* their turn and the pipeline should hand off to the agent.
|
|
318
|
+
*/
|
|
319
|
+
export interface TurnCompleteEvent {
|
|
320
|
+
/**
|
|
321
|
+
* The final consolidated transcript for this turn.
|
|
322
|
+
*/
|
|
323
|
+
transcript: string;
|
|
324
|
+
/**
|
|
325
|
+
* Aggregate STT confidence score for the transcript, in the range [0, 1].
|
|
326
|
+
*/
|
|
327
|
+
confidence: number;
|
|
328
|
+
/**
|
|
329
|
+
* Total duration of detected speech in this turn, in milliseconds.
|
|
330
|
+
*/
|
|
331
|
+
durationMs: number;
|
|
332
|
+
/**
|
|
333
|
+
* The semantic reason that triggered turn completion.
|
|
334
|
+
*/
|
|
335
|
+
reason: EndpointReason;
|
|
336
|
+
}
|
|
337
|
+
/**
|
|
338
|
+
* Detects turn boundaries in a continuous audio/transcript stream.
|
|
339
|
+
* Combines VAD events with linguistic signals to decide when the user
|
|
340
|
+
* has finished speaking.
|
|
341
|
+
*
|
|
342
|
+
* Emits:
|
|
343
|
+
* - `'turn_complete'` (TurnCompleteEvent) — the user's turn has ended.
|
|
344
|
+
* - `'speech_start'` () — the user has started speaking (re-emitted from VAD).
|
|
345
|
+
* - `'barge_in_detected'` () — user started speaking while TTS was playing.
|
|
346
|
+
*/
|
|
347
|
+
export interface IEndpointDetector extends EventEmitter {
|
|
348
|
+
/**
|
|
349
|
+
* Active detection strategy:
|
|
350
|
+
* - `'acoustic'` — pure silence-timeout based.
|
|
351
|
+
* - `'heuristic'` — silence + linguistic completeness signals.
|
|
352
|
+
* - `'semantic'` — small LM scoring utterance completeness.
|
|
353
|
+
*/
|
|
354
|
+
readonly mode: 'acoustic' | 'heuristic' | 'semantic';
|
|
355
|
+
/**
|
|
356
|
+
* Push a VAD event from the upstream voice activity detector.
|
|
357
|
+
*
|
|
358
|
+
* @param event — The VAD event to process.
|
|
359
|
+
*/
|
|
360
|
+
pushVadEvent(event: VadEvent): void;
|
|
361
|
+
/**
|
|
362
|
+
* Push a partial or final STT result for linguistic analysis.
|
|
363
|
+
*
|
|
364
|
+
* @param event — Transcript event from the STT session.
|
|
365
|
+
*/
|
|
366
|
+
pushTranscript(event: TranscriptEvent): void;
|
|
367
|
+
/**
|
|
368
|
+
* Reset all internal state (timers, partial transcripts) without destroying
|
|
369
|
+
* the detector instance. Called at the start of each new turn.
|
|
370
|
+
*/
|
|
371
|
+
reset(): void;
|
|
372
|
+
}
|
|
373
|
+
/**
|
|
374
|
+
* Configuration for a diarization session. Controls expected speaker count and
|
|
375
|
+
* chunking behaviour for providers that require buffered audio.
|
|
376
|
+
*/
|
|
377
|
+
export interface DiarizationConfig {
|
|
378
|
+
/**
|
|
379
|
+
* Hint to the provider about how many distinct speakers are expected.
|
|
380
|
+
* When omitted, the provider uses auto-detection.
|
|
381
|
+
*/
|
|
382
|
+
expectedSpeakers?: number;
|
|
383
|
+
/**
|
|
384
|
+
* When `true`, use the provider's built-in diarization instead of the
|
|
385
|
+
* AgentOS diarization engine (e.g. Deepgram `diarize` option).
|
|
386
|
+
* @defaultValue false
|
|
387
|
+
*/
|
|
388
|
+
preferProviderNative?: boolean;
|
|
389
|
+
/**
|
|
390
|
+
* Size of audio chunks processed per diarization inference, in milliseconds.
|
|
391
|
+
* Smaller values reduce latency; larger values improve accuracy.
|
|
392
|
+
* @defaultValue 500
|
|
393
|
+
*/
|
|
394
|
+
chunkSizeMs?: number;
|
|
395
|
+
/**
|
|
396
|
+
* Overlap between consecutive chunks in milliseconds. Overlap improves
|
|
397
|
+
* speaker boundary accuracy at the cost of extra compute.
|
|
398
|
+
* @defaultValue 100
|
|
399
|
+
*/
|
|
400
|
+
overlapMs?: number;
|
|
401
|
+
}
|
|
402
|
+
/**
|
|
403
|
+
* A contiguous segment of transcript text with millisecond timing metadata.
|
|
404
|
+
*/
|
|
405
|
+
export interface TranscriptSegment {
|
|
406
|
+
/**
|
|
407
|
+
* The text content of the segment.
|
|
408
|
+
*/
|
|
409
|
+
text: string;
|
|
410
|
+
/**
|
|
411
|
+
* Start of the segment in milliseconds from the beginning of the stream.
|
|
412
|
+
*/
|
|
413
|
+
startMs: number;
|
|
414
|
+
/**
|
|
415
|
+
* End of the segment in milliseconds from the beginning of the stream.
|
|
416
|
+
*/
|
|
417
|
+
endMs: number;
|
|
418
|
+
}
|
|
419
|
+
/**
|
|
420
|
+
* A {@link TranscriptSegment} extended with speaker attribution produced by the
|
|
421
|
+
* diarization engine.
|
|
422
|
+
*/
|
|
423
|
+
export interface DiarizedSegment extends TranscriptSegment {
|
|
424
|
+
/**
|
|
425
|
+
* Stable speaker label assigned by the diarization engine (e.g. `'SPEAKER_0'`).
|
|
426
|
+
* The label is consistent within a session but not across sessions unless
|
|
427
|
+
* speaker enrollment is used.
|
|
428
|
+
*/
|
|
429
|
+
speakerId: string;
|
|
430
|
+
/**
|
|
431
|
+
* Confidence that this segment belongs to `speakerId`, in the range [0, 1].
|
|
432
|
+
*/
|
|
433
|
+
speakerConfidence: number;
|
|
434
|
+
}
|
|
435
|
+
/**
|
|
436
|
+
* An active diarization session. Accepts raw audio and outputs speaker-attributed
|
|
437
|
+
* transcript segments via EventEmitter.
|
|
438
|
+
*
|
|
439
|
+
* Emits:
|
|
440
|
+
* - `'segment'` (DiarizedSegment) — a diarized transcript segment is ready.
|
|
441
|
+
* - `'speaker_change'` ({ from: string; to: string }) — speaker transition detected.
|
|
442
|
+
* - `'error'` (Error) — unrecoverable engine error.
|
|
443
|
+
* - `'close'` () — session terminated.
|
|
444
|
+
*/
|
|
445
|
+
export interface DiarizationSession extends EventEmitter {
|
|
446
|
+
/**
|
|
447
|
+
* Push a raw audio frame for diarization analysis.
|
|
448
|
+
*
|
|
449
|
+
* @param frame — PCM audio frame from the capture stream.
|
|
450
|
+
*/
|
|
451
|
+
pushAudio(frame: AudioFrame): void;
|
|
452
|
+
/**
|
|
453
|
+
* Apply speaker labels to an existing transcript using the session's
|
|
454
|
+
* current speaker model. Returns labelled segments.
|
|
455
|
+
*
|
|
456
|
+
* @param transcript — Plain transcript segments to label.
|
|
457
|
+
*/
|
|
458
|
+
labelTranscript(transcript: TranscriptSegment[]): Promise<DiarizedSegment[]>;
|
|
459
|
+
/**
|
|
460
|
+
* Enroll a known speaker so subsequent audio is attributed to a named identity
|
|
461
|
+
* rather than an anonymous `SPEAKER_N` label.
|
|
462
|
+
*
|
|
463
|
+
* @param speakerId — Stable identifier for the speaker (e.g. user UUID).
|
|
464
|
+
* @param samples — Representative audio frames for the speaker's voice.
|
|
465
|
+
*/
|
|
466
|
+
enrollSpeaker(speakerId: string, samples: AudioFrame[]): Promise<void>;
|
|
467
|
+
/**
|
|
468
|
+
* Terminate the session and release all provider-side resources.
|
|
469
|
+
*/
|
|
470
|
+
close(): void;
|
|
471
|
+
}
|
|
472
|
+
/**
|
|
473
|
+
* Factory interface for diarization (speaker separation) engines.
|
|
474
|
+
*
|
|
475
|
+
* Registered via `EXTENSION_KIND_DIARIZATION`.
|
|
476
|
+
*/
|
|
477
|
+
export interface IDiarizationEngine {
|
|
478
|
+
/**
|
|
479
|
+
* Open a new diarization session.
|
|
480
|
+
*
|
|
481
|
+
* @param config — Session configuration controlling chunking and speaker hints.
|
|
482
|
+
*/
|
|
483
|
+
startSession(config?: DiarizationConfig): Promise<DiarizationSession>;
|
|
484
|
+
}
|
|
485
|
+
/**
|
|
486
|
+
* Configuration passed to {@link IStreamingTTS.startSession} when opening a new
|
|
487
|
+
* text-to-speech synthesis stream.
|
|
488
|
+
*/
|
|
489
|
+
export interface StreamingTTSConfig {
|
|
490
|
+
/**
|
|
491
|
+
* Provider-specific voice identifier (e.g. `'alloy'`, `'nova'`, `'en-US-Wavenet-D'`).
|
|
492
|
+
* Defaults to the provider's built-in default when omitted.
|
|
493
|
+
*/
|
|
494
|
+
voice?: string;
|
|
495
|
+
/**
|
|
496
|
+
* Output audio format.
|
|
497
|
+
* @defaultValue 'opus'
|
|
498
|
+
*/
|
|
499
|
+
format?: 'pcm' | 'mp3' | 'opus';
|
|
500
|
+
/**
|
|
501
|
+
* Output sample rate in Hz. Must be supported by the chosen `format`.
|
|
502
|
+
* @defaultValue 24000
|
|
503
|
+
*/
|
|
504
|
+
sampleRate?: number;
|
|
505
|
+
/**
|
|
506
|
+
* Controls how the provider segments incoming token streams into synthesis
|
|
507
|
+
* requests:
|
|
508
|
+
* - `'sentence'` — flush at sentence boundaries (lower latency).
|
|
509
|
+
* - `'word'` — flush at word boundaries (minimum latency, may sound choppy).
|
|
510
|
+
* - `'paragraph'` — flush at paragraph boundaries (highest quality).
|
|
511
|
+
* @defaultValue 'sentence'
|
|
512
|
+
*/
|
|
513
|
+
chunkingMode?: 'sentence' | 'word' | 'paragraph';
|
|
514
|
+
/**
|
|
515
|
+
* Maximum number of milliseconds of audio to buffer before forcing a flush,
|
|
516
|
+
* regardless of `chunkingMode`. Prevents unbounded memory growth for very
|
|
517
|
+
* long utterances.
|
|
518
|
+
* @defaultValue 3000
|
|
519
|
+
*/
|
|
520
|
+
maxBufferMs?: number;
|
|
521
|
+
/**
|
|
522
|
+
* Pass-through options forwarded to the underlying provider SDK.
|
|
523
|
+
*/
|
|
524
|
+
providerOptions?: Record<string, unknown>;
|
|
525
|
+
}
|
|
526
|
+
/**
|
|
527
|
+
* An active streaming TTS session. Token text is pushed in and encoded audio
|
|
528
|
+
* chunks flow out via EventEmitter.
|
|
529
|
+
*
|
|
530
|
+
* Emits:
|
|
531
|
+
* - `'audio'` (EncodedAudioChunk) — a synthesised audio chunk ready for playback.
|
|
532
|
+
* - `'flush_complete'` () — all queued tokens have been synthesised.
|
|
533
|
+
* - `'error'` (Error) — unrecoverable synthesis error.
|
|
534
|
+
* - `'close'` () — session terminated.
|
|
535
|
+
*/
|
|
536
|
+
export interface StreamingTTSSession extends EventEmitter {
|
|
537
|
+
/**
|
|
538
|
+
* Push one or more LLM output tokens into the synthesis buffer.
|
|
539
|
+
* The session will chunk and synthesise them according to `chunkingMode`.
|
|
540
|
+
*
|
|
541
|
+
* @param tokens — Text tokens to synthesise (may be partial words).
|
|
542
|
+
*/
|
|
543
|
+
pushTokens(tokens: string): void;
|
|
544
|
+
/**
|
|
545
|
+
* Force synthesis of all buffered tokens, then emit `'flush_complete'`.
|
|
546
|
+
* Call at end-of-response or when transitioning between agent turns.
|
|
547
|
+
*/
|
|
548
|
+
flush(): Promise<void>;
|
|
549
|
+
/**
|
|
550
|
+
* Immediately stop synthesis and discard all buffered tokens. Audio chunks
|
|
551
|
+
* currently in-flight are not recalled; the caller must stop playback separately.
|
|
552
|
+
*/
|
|
553
|
+
cancel(): void;
|
|
554
|
+
/**
|
|
555
|
+
* Terminate the session and release provider-side resources.
|
|
556
|
+
*/
|
|
557
|
+
close(): void;
|
|
558
|
+
}
|
|
559
|
+
/**
|
|
560
|
+
* Factory interface for streaming text-to-speech providers.
|
|
561
|
+
*
|
|
562
|
+
* Registered via `EXTENSION_KIND_STREAMING_TTS`.
|
|
563
|
+
*/
|
|
564
|
+
export interface IStreamingTTS {
|
|
565
|
+
/**
|
|
566
|
+
* Unique, stable identifier for this provider (e.g. `'openai'`, `'elevenlabs'`).
|
|
567
|
+
*/
|
|
568
|
+
readonly providerId: string;
|
|
569
|
+
/**
|
|
570
|
+
* Open a new streaming synthesis session.
|
|
571
|
+
*
|
|
572
|
+
* @param config - Session-level configuration overriding provider defaults.
|
|
573
|
+
*/
|
|
574
|
+
startSession(config?: StreamingTTSConfig): Promise<StreamingTTSSession>;
|
|
575
|
+
}
|
|
576
|
+
/**
|
|
577
|
+
* Contextual information supplied to {@link IBargeinHandler.handleBargein} so the
|
|
578
|
+
* handler can make an informed decision about how to respond to interruption.
|
|
579
|
+
*/
|
|
580
|
+
export interface BargeinContext {
|
|
581
|
+
/**
|
|
582
|
+
* Duration of detected user speech before the barge-in was confirmed, in ms.
|
|
583
|
+
* Short durations may indicate accidental noise rather than intentional interruption.
|
|
584
|
+
*/
|
|
585
|
+
speechDurationMs: number;
|
|
586
|
+
/**
|
|
587
|
+
* The partial TTS text that was interrupted. Used to construct `interruptedRemainder`
|
|
588
|
+
* in {@link VoiceTurnMetadata}.
|
|
589
|
+
*/
|
|
590
|
+
interruptedText: string;
|
|
591
|
+
/**
|
|
592
|
+
* How many milliseconds of audio had been played at the point of interruption.
|
|
593
|
+
*/
|
|
594
|
+
playedDurationMs: number;
|
|
595
|
+
}
|
|
596
|
+
/**
|
|
597
|
+
* Action the pipeline should take in response to a detected barge-in.
|
|
598
|
+
* Returned by {@link IBargeinHandler.handleBargein}.
|
|
599
|
+
*/
|
|
600
|
+
export type BargeinAction = {
|
|
601
|
+
/** Immediately stop all TTS output and discard the remainder of the response. */
|
|
602
|
+
type: 'cancel';
|
|
603
|
+
/**
|
|
604
|
+
* Optional text marker injected into the conversation context to signal that
|
|
605
|
+
* the agent's turn was cut short (e.g. `'[interrupted]'`).
|
|
606
|
+
*/
|
|
607
|
+
injectMarker?: string;
|
|
608
|
+
} | {
|
|
609
|
+
/** Fade out TTS audio over `fadeMs` milliseconds then pause. */
|
|
610
|
+
type: 'pause';
|
|
611
|
+
/** Duration of the fade-out in milliseconds. @defaultValue 150 */
|
|
612
|
+
fadeMs?: number;
|
|
613
|
+
} | {
|
|
614
|
+
/**
|
|
615
|
+
* Resume TTS playback from where it was paused (only valid after a prior
|
|
616
|
+
* `'pause'` action).
|
|
617
|
+
*/
|
|
618
|
+
type: 'resume';
|
|
619
|
+
} | {
|
|
620
|
+
/**
|
|
621
|
+
* Treat the detected barge-in as noise and continue TTS playback uninterrupted.
|
|
622
|
+
* Appropriate for very short, low-confidence speech detections.
|
|
623
|
+
*/
|
|
624
|
+
type: 'ignore';
|
|
625
|
+
};
|
|
626
|
+
/**
|
|
627
|
+
* Handles the policy decision when a barge-in (user speaking over TTS) is detected.
|
|
628
|
+
*
|
|
629
|
+
* Registered via `EXTENSION_KIND_BARGEIN_HANDLER`.
|
|
630
|
+
*/
|
|
631
|
+
export interface IBargeinHandler {
|
|
632
|
+
/**
|
|
633
|
+
* Interruption strategy implemented by this handler:
|
|
634
|
+
* - `'hard-cut'` — TTS audio is stopped immediately with no fade.
|
|
635
|
+
* - `'soft-fade'` — TTS audio fades out over a short window before stopping.
|
|
636
|
+
*/
|
|
637
|
+
readonly mode: 'hard-cut' | 'soft-fade';
|
|
638
|
+
/**
|
|
639
|
+
* Called by the pipeline when a barge-in is confirmed. The handler evaluates
|
|
640
|
+
* the context and returns the action the pipeline should execute.
|
|
641
|
+
*
|
|
642
|
+
* @param context - Contextual snapshot at the moment of interruption.
|
|
643
|
+
* @returns The action to perform (or a promise resolving to one).
|
|
644
|
+
*/
|
|
645
|
+
handleBargein(context: BargeinContext): BargeinAction | Promise<BargeinAction>;
|
|
646
|
+
}
|
|
647
|
+
/**
|
|
648
|
+
* Adapts any AgentOS agent to the voice pipeline's turn-based protocol.
|
|
649
|
+
*
|
|
650
|
+
* The pipeline calls {@link IVoicePipelineAgentSession.sendText} with the user's
|
|
651
|
+
* final transcript and streams the response back as text tokens for TTS synthesis.
|
|
652
|
+
*/
|
|
653
|
+
export interface IVoicePipelineAgentSession {
|
|
654
|
+
/**
|
|
655
|
+
* Send the user's utterance to the agent and receive a streaming text response.
|
|
656
|
+
*
|
|
657
|
+
* @param text - Final transcript from the STT + endpoint detection pipeline.
|
|
658
|
+
* @param metadata - Rich metadata about the current voice turn.
|
|
659
|
+
* @returns An async iterable of text tokens (suitable for streaming into TTS).
|
|
660
|
+
*/
|
|
661
|
+
sendText(text: string, metadata: VoiceTurnMetadata): AsyncIterable<string>;
|
|
662
|
+
/**
|
|
663
|
+
* Abort the current agent response mid-stream (called on barge-in when
|
|
664
|
+
* `BargeinAction.type === 'cancel'`).
|
|
665
|
+
*
|
|
666
|
+
* Implementations should cancel any in-flight LLM requests. The pipeline
|
|
667
|
+
* will discard any tokens emitted after `abort()` is called.
|
|
668
|
+
*/
|
|
669
|
+
abort?(): void;
|
|
670
|
+
}
|
|
671
|
+
/**
|
|
672
|
+
* Rich metadata attached to each voice turn and passed to the agent session.
|
|
673
|
+
* Enables the agent to tailor its response based on conversation dynamics.
|
|
674
|
+
*/
|
|
675
|
+
export interface VoiceTurnMetadata {
|
|
676
|
+
/**
|
|
677
|
+
* Speaker labels present in this turn. Contains at least one entry (the user).
|
|
678
|
+
* Multi-speaker turns arise in conference call or multi-party scenarios.
|
|
679
|
+
*/
|
|
680
|
+
speakers: string[];
|
|
681
|
+
/**
|
|
682
|
+
* The reason the endpoint detector decided the user had finished speaking.
|
|
683
|
+
*/
|
|
684
|
+
endpointReason: EndpointReason;
|
|
685
|
+
/**
|
|
686
|
+
* Duration of active user speech in this turn, in milliseconds.
|
|
687
|
+
* Does not include silence periods.
|
|
688
|
+
*/
|
|
689
|
+
speechDurationMs: number;
|
|
690
|
+
/**
|
|
691
|
+
* Whether the user's turn interrupted an in-progress TTS response.
|
|
692
|
+
*/
|
|
693
|
+
wasInterrupted: boolean;
|
|
694
|
+
/**
|
|
695
|
+
* When `wasInterrupted` is `true`, the text remainder of the agent response
|
|
696
|
+
* that was cut off. Useful for the agent to avoid re-stating information.
|
|
697
|
+
*/
|
|
698
|
+
interruptedRemainder?: string;
|
|
699
|
+
/**
|
|
700
|
+
* Aggregate STT confidence for the complete transcript, in the range [0, 1].
|
|
701
|
+
*/
|
|
702
|
+
transcriptConfidence: number;
|
|
703
|
+
}
|
|
704
|
+
/**
|
|
705
|
+
* Top-level configuration for the {@link VoicePipelineSession}.
|
|
706
|
+
* Specifies which providers to use and their session-level options.
|
|
707
|
+
*/
|
|
708
|
+
export interface VoicePipelineConfig {
|
|
709
|
+
/**
|
|
710
|
+
* Identifier of the streaming STT provider to use (must be registered via
|
|
711
|
+
* `EXTENSION_KIND_STREAMING_STT`).
|
|
712
|
+
* Examples: `'deepgram'`, `'whisper-live'`, `'whisper-chunked'`.
|
|
713
|
+
*/
|
|
714
|
+
stt: string;
|
|
715
|
+
/**
|
|
716
|
+
* Identifier of the streaming TTS provider to use (must be registered via
|
|
717
|
+
* `EXTENSION_KIND_STREAMING_TTS`).
|
|
718
|
+
* Examples: `'openai'`, `'elevenlabs'`, `'cartesia'`.
|
|
719
|
+
*/
|
|
720
|
+
tts: string;
|
|
721
|
+
/**
|
|
722
|
+
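* Endpoint detection strategy. NOTE(review): the documented default `'hybrid'` is not a member of the accepted union (`'acoustic' | 'heuristic' | 'semantic'`) — confirm the actual default applied when omitted.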
|
|
723
|
+
*/
|
|
724
|
+
endpointing?: 'acoustic' | 'heuristic' | 'semantic';
|
|
725
|
+
/**
|
|
726
|
+
* Enable speaker diarization for multi-speaker scenarios. Disabled by default.
|
|
727
|
+
*/
|
|
728
|
+
diarization?: boolean;
|
|
729
|
+
/**
|
|
730
|
+
* Barge-in (interruption) handling mode. Defaults to `'hard-cut'` when omitted.
|
|
731
|
+
*/
|
|
732
|
+
bargeIn?: 'hard-cut' | 'soft-fade' | 'disabled';
|
|
733
|
+
/**
|
|
734
|
+
* TTS voice identifier. Forwarded to {@link StreamingTTSConfig.voice}.
|
|
735
|
+
*/
|
|
736
|
+
voice?: string;
|
|
737
|
+
/**
|
|
738
|
+
* Output audio format for TTS. Forwarded to {@link StreamingTTSConfig.format}.
|
|
739
|
+
* @defaultValue 'opus'
|
|
740
|
+
*/
|
|
741
|
+
format?: 'pcm' | 'mp3' | 'opus';
|
|
742
|
+
/**
|
|
743
|
+
* BCP-47 language code. Forwarded to both STT and TTS sessions.
|
|
744
|
+
*/
|
|
745
|
+
language?: string;
|
|
746
|
+
/**
|
|
747
|
+
* Hard cap on how long a single user turn may last, in milliseconds.
|
|
748
|
+
* When exceeded, the endpoint detector fires with reason `'timeout'`.
|
|
749
|
+
* @defaultValue 30000
|
|
750
|
+
*/
|
|
751
|
+
maxTurnDurationMs?: number;
|
|
752
|
+
/**
|
|
753
|
+
* Provider-level STT options merged into {@link StreamingSTTConfig.providerOptions}.
|
|
754
|
+
*/
|
|
755
|
+
sttOptions?: Record<string, unknown>;
|
|
756
|
+
/**
|
|
757
|
+
* Provider-level TTS options merged into {@link StreamingTTSConfig.providerOptions}.
|
|
758
|
+
*/
|
|
759
|
+
ttsOptions?: Record<string, unknown>;
|
|
760
|
+
}
|
|
761
|
+
/**
|
|
762
|
+
* Lifecycle state of a {@link VoicePipelineSession}.
|
|
763
|
+
*
|
|
764
|
+
* Valid transitions:
|
|
765
|
+
* ```
|
|
766
|
+
* idle → listening → processing → speaking → listening
|
|
767
|
+
* speaking → interrupting → listening
|
|
768
|
+
* any → closed
|
|
769
|
+
* ```
|
|
770
|
+
*/
|
|
771
|
+
export type PipelineState = 'idle' | 'listening' | 'processing' | 'speaking' | 'interrupting' | 'closed';
|
|
772
|
+
/**
|
|
773
|
+
* A live voice pipeline session binding a transport, STT, endpoint detection,
|
|
774
|
+
* optional diarization, agent, and TTS into a single coordinated lifecycle.
|
|
775
|
+
*
|
|
776
|
+
* Emits:
|
|
777
|
+
* - `'state_change'` (PipelineState) — pipeline state machine transition.
|
|
778
|
+
* - `'turn_complete'` (TurnCompleteEvent) — user turn detected.
|
|
779
|
+
* - `'agent_response_start'` () — agent has begun generating a response.
|
|
780
|
+
* - `'agent_response_end'` () — agent response fully synthesised and played.
|
|
781
|
+
* - `'barge_in'` (BargeinContext) — user interrupted TTS playback.
|
|
782
|
+
* - `'error'` (Error) — unrecoverable pipeline error.
|
|
783
|
+
* - `'close'` () — session has been fully torn down.
|
|
784
|
+
*/
|
|
785
|
+
export interface VoicePipelineSession extends EventEmitter {
|
|
786
|
+
/**
|
|
787
|
+
* Unique, stable identifier for this session (UUID).
|
|
788
|
+
*/
|
|
789
|
+
readonly sessionId: string;
|
|
790
|
+
/**
|
|
791
|
+
* Current pipeline state machine state.
|
|
792
|
+
*/
|
|
793
|
+
readonly state: PipelineState;
|
|
794
|
+
/**
|
|
795
|
+
* The transport this session is bound to. Useful for sending out-of-band
|
|
796
|
+
* control messages without going through the pipeline.
|
|
797
|
+
*/
|
|
798
|
+
readonly transport: IStreamTransport;
|
|
799
|
+
/**
|
|
800
|
+
* Gracefully close the session — flush in-flight audio, tear down all sub-sessions,
|
|
801
|
+
* and emit `'close'`.
|
|
802
|
+
*
|
|
803
|
+
* @param reason - Optional human-readable reason for diagnostics.
|
|
804
|
+
*/
|
|
805
|
+
close(reason?: string): Promise<void>;
|
|
806
|
+
}
|
|
807
|
+
/**
|
|
808
|
+
* Messages sent from the client (browser/app) to the server over the transport.
|
|
809
|
+
* All messages are JSON-serialised.
|
|
810
|
+
*/
|
|
811
|
+
export type ClientTextMessage = {
|
|
812
|
+
/**
|
|
813
|
+
* Initial configuration sent once after the WebSocket connection is established.
|
|
814
|
+
* The server responds with `session_started` after applying the config.
|
|
815
|
+
*/
|
|
816
|
+
type: 'config';
|
|
817
|
+
/** Pipeline configuration requested by the client. */
|
|
818
|
+
config: VoicePipelineConfig;
|
|
819
|
+
} | {
|
|
820
|
+
/**
|
|
821
|
+
* Runtime control commands sent during an active session.
|
|
822
|
+
*/
|
|
823
|
+
type: 'control';
|
|
824
|
+
/** The control action to perform. */
|
|
825
|
+
action: TransportControlMessage;
|
|
826
|
+
};
|
|
827
|
+
/**
|
|
828
|
+
* Messages sent from the server to the client over the transport.
|
|
829
|
+
* All messages are JSON-serialised.
|
|
830
|
+
*/
|
|
831
|
+
export type ServerTextMessage = {
|
|
832
|
+
/**
|
|
833
|
+
* Sent once after the server has applied the client's `config` message
|
|
834
|
+
* and is ready to receive audio.
|
|
835
|
+
*/
|
|
836
|
+
type: 'session_started';
|
|
837
|
+
/** The server-assigned session ID. */
|
|
838
|
+
sessionId: string;
|
|
839
|
+
/** Echo of the effective configuration (may differ from client request). */
|
|
840
|
+
config: VoicePipelineConfig;
|
|
841
|
+
} | {
|
|
842
|
+
/**
|
|
843
|
+
* Emitted for each STT hypothesis (interim and final).
|
|
844
|
+
* Clients may display these in real time for visual feedback.
|
|
845
|
+
*/
|
|
846
|
+
type: 'transcript';
|
|
847
|
+
/** Transcript text for this event. */
|
|
848
|
+
text: string;
|
|
849
|
+
/** Whether this hypothesis is final. */
|
|
850
|
+
isFinal: boolean;
|
|
851
|
+
/** Aggregate confidence score [0, 1]. */
|
|
852
|
+
confidence: number;
|
|
853
|
+
} | {
|
|
854
|
+
/**
|
|
855
|
+
* Emitted when the agent has received the transcript and begun generating a reply.
|
|
856
|
+
* Clients may show a thinking indicator.
|
|
857
|
+
*/
|
|
858
|
+
type: 'agent_thinking';
|
|
859
|
+
} | {
|
|
860
|
+
/**
|
|
861
|
+
* Emitted when TTS synthesis begins — audio chunks will follow over the audio channel.
|
|
862
|
+
* Clients may hide thinking indicators.
|
|
863
|
+
*/
|
|
864
|
+
type: 'agent_speaking';
|
|
865
|
+
/**
|
|
866
|
+
* Speculative text of the agent's response accumulated so far. May be partial
|
|
867
|
+
* if the TTS is streaming token-by-token.
|
|
868
|
+
*/
|
|
869
|
+
text: string;
|
|
870
|
+
} | {
|
|
871
|
+
/**
|
|
872
|
+
* Emitted when the agent's complete response has been synthesised and sent.
|
|
873
|
+
*/
|
|
874
|
+
type: 'agent_done';
|
|
875
|
+
/** Full text of the completed response. */
|
|
876
|
+
text: string;
|
|
877
|
+
/** Duration of the synthesised audio in milliseconds. */
|
|
878
|
+
durationMs: number;
|
|
879
|
+
} | {
|
|
880
|
+
/**
|
|
881
|
+
* Emitted when the pipeline detects that the user has started speaking
|
|
882
|
+
* over the current TTS output (barge-in).
|
|
883
|
+
*/
|
|
884
|
+
type: 'barge_in';
|
|
885
|
+
/** The action the pipeline is taking in response. */
|
|
886
|
+
action: BargeinAction;
|
|
887
|
+
} | {
|
|
888
|
+
/**
|
|
889
|
+
* Emitted when an unrecoverable error occurs in the pipeline.
|
|
890
|
+
* The session will be closed after this message.
|
|
891
|
+
*/
|
|
892
|
+
type: 'error';
|
|
893
|
+
/** Machine-readable error code. */
|
|
894
|
+
code: string;
|
|
895
|
+
/** Human-readable description. */
|
|
896
|
+
message: string;
|
|
897
|
+
} | {
|
|
898
|
+
/**
|
|
899
|
+
* Emitted as the final message before the server closes the transport.
|
|
900
|
+
*/
|
|
901
|
+
type: 'session_ended';
|
|
902
|
+
/** Optional human-readable reason. */
|
|
903
|
+
reason?: string;
|
|
904
|
+
};
|
|
905
|
+
//# sourceMappingURL=types.d.ts.map
|