@voice-kit/core 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +2137 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1466 -4
- package/dist/index.d.ts +1466 -4
- package/dist/index.js +2102 -1
- package/dist/index.js.map +1 -1
- package/package.json +1 -31
- package/dist/audio.cjs +0 -533
- package/dist/audio.cjs.map +0 -1
- package/dist/audio.d.cts +0 -260
- package/dist/audio.d.ts +0 -260
- package/dist/audio.js +0 -514
- package/dist/audio.js.map +0 -1
- package/dist/compliance.cjs +0 -343
- package/dist/compliance.cjs.map +0 -1
- package/dist/compliance.d.cts +0 -163
- package/dist/compliance.d.ts +0 -163
- package/dist/compliance.js +0 -335
- package/dist/compliance.js.map +0 -1
- package/dist/errors.cjs +0 -284
- package/dist/errors.cjs.map +0 -1
- package/dist/errors.d.cts +0 -100
- package/dist/errors.d.ts +0 -100
- package/dist/errors.js +0 -262
- package/dist/errors.js.map +0 -1
- package/dist/index-D3KfRXMP.d.cts +0 -319
- package/dist/index-D3KfRXMP.d.ts +0 -319
- package/dist/memory.cjs +0 -121
- package/dist/memory.cjs.map +0 -1
- package/dist/memory.d.cts +0 -29
- package/dist/memory.d.ts +0 -29
- package/dist/memory.js +0 -115
- package/dist/memory.js.map +0 -1
- package/dist/observability.cjs +0 -229
- package/dist/observability.cjs.map +0 -1
- package/dist/observability.d.cts +0 -122
- package/dist/observability.d.ts +0 -122
- package/dist/observability.js +0 -222
- package/dist/observability.js.map +0 -1
- package/dist/stt.cjs +0 -828
- package/dist/stt.cjs.map +0 -1
- package/dist/stt.d.cts +0 -308
- package/dist/stt.d.ts +0 -308
- package/dist/stt.js +0 -815
- package/dist/stt.js.map +0 -1
- package/dist/telephony.errors-BQYr6-vl.d.cts +0 -80
- package/dist/telephony.errors-C0-nScrF.d.ts +0 -80
- package/dist/tts.cjs +0 -429
- package/dist/tts.cjs.map +0 -1
- package/dist/tts.d.cts +0 -151
- package/dist/tts.d.ts +0 -151
- package/dist/tts.js +0 -418
- package/dist/tts.js.map +0 -1
package/dist/index.d.ts
CHANGED
|
@@ -1,4 +1,1466 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
1
|
+
import * as ai from 'ai';
|
|
2
|
+
import { PassThrough } from 'node:stream';
|
|
3
|
+
import { EventEmitter } from 'node:events';
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* @voice-kit/core — Type definitions
|
|
7
|
+
*/
|
|
8
|
+
/**
|
|
9
|
+
* A single word with timing information from an STT provider.
|
|
10
|
+
*/
|
|
11
|
+
interface WordTimestamp {
|
|
12
|
+
word: string;
|
|
13
|
+
startMs: number;
|
|
14
|
+
endMs: number;
|
|
15
|
+
confidence: number;
|
|
16
|
+
}
|
|
17
|
+
/**
|
|
18
|
+
* The result of a speech-to-text transcription, either streaming partial
|
|
19
|
+
* or final. `isFinal` distinguishes the two.
|
|
20
|
+
*
|
|
21
|
+
* @example
|
|
22
|
+
* ```ts
|
|
23
|
+
* for await (const result of stt.transcribeStream(audioIterable)) {
|
|
24
|
+
* if (result.isFinal) console.log('Final:', result.transcript)
|
|
25
|
+
* }
|
|
26
|
+
* ```
|
|
27
|
+
*/
|
|
28
|
+
interface STTResult {
|
|
29
|
+
/** The transcribed text. May be a partial result if `isFinal` is false. */
|
|
30
|
+
transcript: string;
|
|
31
|
+
/** Whether this is the final result for this utterance. */
|
|
32
|
+
isFinal: boolean;
|
|
33
|
+
/** Confidence score from the provider, 0–1. */
|
|
34
|
+
confidence: number;
|
|
35
|
+
/** BCP-47 language tag, e.g. 'hi-IN', 'en-IN'. */
|
|
36
|
+
language: string;
|
|
37
|
+
/** True if a mid-sentence language switch was detected (e.g. Hinglish). */
|
|
38
|
+
languageSwitchDetected: boolean;
|
|
39
|
+
/** Word-level timestamps if supported by the provider. */
|
|
40
|
+
words?: WordTimestamp[];
|
|
41
|
+
/** Time from audio start to this result being emitted, in ms. */
|
|
42
|
+
latencyMs: number;
|
|
43
|
+
}
|
|
44
|
+
/**
|
|
45
|
+
* Configuration for STT provider instantiation.
|
|
46
|
+
*/
|
|
47
|
+
interface STTConfig {
|
|
48
|
+
/** BCP-47 language code. Defaults to 'en-IN'. */
|
|
49
|
+
language?: string;
|
|
50
|
+
/** Additional languages to detect for code-switching. */
|
|
51
|
+
alternateLanguages?: string[];
|
|
52
|
+
/** API key. Falls back to provider-specific env var if omitted. */
|
|
53
|
+
apiKey?: string;
|
|
54
|
+
/** Custom model name. Provider-specific. */
|
|
55
|
+
model?: string;
|
|
56
|
+
/** Enable word-level timestamps. Default false. */
|
|
57
|
+
wordTimestamps?: boolean;
|
|
58
|
+
/** Enable interim / partial results. Default true. */
|
|
59
|
+
interimResults?: boolean;
|
|
60
|
+
/** Deepgram-specific: smart formatting. Default true. */
|
|
61
|
+
smartFormat?: boolean;
|
|
62
|
+
/** Sarvam-specific: region hint. */
|
|
63
|
+
region?: string;
|
|
64
|
+
}
|
|
65
|
+
/**
|
|
66
|
+
* The STTProvider interface. Obtained via `createSTT()` — never instantiate
|
|
67
|
+
* provider classes directly.
|
|
68
|
+
*
|
|
69
|
+
* @example
|
|
70
|
+
* ```ts
|
|
71
|
+
* const stt = createSTT('deepgram', { language: 'en-IN' })
|
|
72
|
+
* for await (const result of stt.transcribeStream(audioStream)) {
|
|
73
|
+
* console.log(result.transcript)
|
|
74
|
+
* }
|
|
75
|
+
* ```
|
|
76
|
+
*/
|
|
77
|
+
interface STTProvider {
|
|
78
|
+
/** Stream audio in, stream STTResults out. Primary realtime path. */
|
|
79
|
+
transcribeStream(audio: AsyncIterable<Buffer>): AsyncIterable<STTResult>;
|
|
80
|
+
/** Batch transcription for recordings. Returns single final result. */
|
|
81
|
+
transcribeBatch(audio: Buffer): Promise<STTResult>;
|
|
82
|
+
/** Whether this provider supports streaming (all except Whisper). */
|
|
83
|
+
readonly supportsStreaming: boolean;
|
|
84
|
+
/** BCP-47 codes this provider can handle. */
|
|
85
|
+
readonly supportedLanguages: string[];
|
|
86
|
+
/** Human-readable provider name for logging. */
|
|
87
|
+
readonly name: string;
|
|
88
|
+
}
|
|
89
|
+
/**
|
|
90
|
+
* Configuration for TTS provider instantiation.
|
|
91
|
+
*/
|
|
92
|
+
interface TTSConfig {
|
|
93
|
+
/** Voice identifier. Provider-specific. */
|
|
94
|
+
voiceId?: string;
|
|
95
|
+
/** Output sample rate. Defaults to provider native rate. */
|
|
96
|
+
sampleRate?: number;
|
|
97
|
+
/** Speaking speed multiplier. Default 1.0. */
|
|
98
|
+
speed?: number;
|
|
99
|
+
/** Pitch adjustment. Provider-specific. */
|
|
100
|
+
pitch?: number;
|
|
101
|
+
/** API key. Falls back to provider-specific env var if omitted. */
|
|
102
|
+
apiKey?: string;
|
|
103
|
+
/** ElevenLabs-specific: model ID. */
|
|
104
|
+
modelId?: string;
|
|
105
|
+
/** Cartesia-specific: emotion control. */
|
|
106
|
+
emotion?: string;
|
|
107
|
+
/** Sarvam-specific: target language for Indic voices. */
|
|
108
|
+
targetLanguage?: string;
|
|
109
|
+
}
|
|
110
|
+
/**
|
|
111
|
+
* The TTSProvider interface. Obtained via `createTTS()` — never instantiate
|
|
112
|
+
* provider classes directly.
|
|
113
|
+
*
|
|
114
|
+
* @example
|
|
115
|
+
* ```ts
|
|
116
|
+
* const tts = createTTS('elevenlabs', { voiceId: 'your-voice-id' })
|
|
117
|
+
* for await (const chunk of tts.synthesizeStream('Hello, how can I help?')) {
|
|
118
|
+
* socket.write(chunk)
|
|
119
|
+
* }
|
|
120
|
+
* ```
|
|
121
|
+
*/
|
|
122
|
+
interface TTSProvider {
|
|
123
|
+
/** Stream synthesis — preferred for realtime. First chunk < 300ms. */
|
|
124
|
+
synthesizeStream(text: string, config?: TTSConfig): AsyncIterable<Buffer>;
|
|
125
|
+
/** Synthesize full audio — for pre-recorded prompts or caching. */
|
|
126
|
+
synthesizeFull(text: string, config?: TTSConfig): Promise<Buffer>;
|
|
127
|
+
/** Native output sample rate of this provider in Hz. */
|
|
128
|
+
readonly outputSampleRate: number;
|
|
129
|
+
/** Native output format before any resampling. */
|
|
130
|
+
readonly outputFormat: 'pcm' | 'mulaw' | 'opus' | 'mp3';
|
|
131
|
+
/** Human-readable provider name for logging. */
|
|
132
|
+
readonly name: string;
|
|
133
|
+
}
|
|
134
|
+
/**
|
|
135
|
+
* A frame of audio classified by the VAD engine.
|
|
136
|
+
* Developers subscribe to these events — never to raw VAD API.
|
|
137
|
+
*/
|
|
138
|
+
interface VoiceFrame {
|
|
139
|
+
/** Event type. */
|
|
140
|
+
type: 'speech_start' | 'speech_end' | 'speech';
|
|
141
|
+
/** VAD confidence 0–1. */
|
|
142
|
+
confidence: number;
|
|
143
|
+
/** Raw PCM audio bytes for this frame. */
|
|
144
|
+
audioBuffer: Buffer;
|
|
145
|
+
/** Duration of audio in this frame, in ms. */
|
|
146
|
+
durationMs: number;
|
|
147
|
+
}
|
|
148
|
+
/**
|
|
149
|
+
* Configuration for the VAD engine.
|
|
150
|
+
*/
|
|
151
|
+
interface VADConfig {
|
|
152
|
+
/** Activation threshold 0–1. Default 0.6. */
|
|
153
|
+
threshold?: number;
|
|
154
|
+
/** Consecutive positive frames before speech_start. Default 3. */
|
|
155
|
+
positiveSpeechFrames?: number;
|
|
156
|
+
/** Consecutive negative frames before speech_end. Default 5. */
|
|
157
|
+
negativeSpeechFrames?: number;
|
|
158
|
+
/** Debounce window in ms to prevent rapid flip-flop. Default 150. */
|
|
159
|
+
debounceMs?: number;
|
|
160
|
+
/** Input sample rate. Auto-set by AudioPipeline — do not override. */
|
|
161
|
+
sampleRate?: number;
|
|
162
|
+
}
|
|
163
|
+
/**
|
|
164
|
+
* Configuration for call memory (LRU-backed sliding window of turns).
|
|
165
|
+
*/
|
|
166
|
+
interface CallMemoryConfig {
|
|
167
|
+
/** Maximum number of turns to retain. Default 20. */
|
|
168
|
+
maxTurns?: number;
|
|
169
|
+
/** Maximum bytes of conversation history to retain. Default 512KB. */
|
|
170
|
+
maxBytes?: number;
|
|
171
|
+
/** TTL for the entire call memory entry in ms. Default 30 minutes. */
|
|
172
|
+
ttlMs?: number;
|
|
173
|
+
}
|
|
174
|
+
/**
|
|
175
|
+
* In-process LRU-backed call memory. Obtained via `createCallMemory()`.
|
|
176
|
+
*
|
|
177
|
+
* @example
|
|
178
|
+
* ```ts
|
|
179
|
+
* const memory = createCallMemory({ maxTurns: 20 })
|
|
180
|
+
* memory.addTurn(callId, { role: 'user', content: 'Hello' })
|
|
181
|
+
* const history = memory.getTurns(callId)
|
|
182
|
+
* ```
|
|
183
|
+
*/
|
|
184
|
+
interface CallMemory {
|
|
185
|
+
addTurn(callId: string, message: ai.ModelMessage): void;
|
|
186
|
+
getTurns(callId: string): ai.ModelMessage[];
|
|
187
|
+
clearCall(callId: string): void;
|
|
188
|
+
getTokenEstimate(callId: string): number;
|
|
189
|
+
/** Truncate oldest turns to stay within budget. */
|
|
190
|
+
trimToTokenBudget(callId: string, maxTokens: number): void;
|
|
191
|
+
}
|
|
192
|
+
/**
|
|
193
|
+
* Purpose category of a call, used for TRAI DND (do-not-disturb) classification; see `DNCCheckParams.purpose`.
|
|
194
|
+
*/
|
|
195
|
+
type CallPurpose = 'TRANSACTIONAL' | 'PROMOTIONAL' | 'SERVICE' | 'EMERGENCY';
|
|
196
|
+
/**
|
|
197
|
+
* TRAI DNC check parameters.
|
|
198
|
+
*/
|
|
199
|
+
interface DNCCheckParams {
|
|
200
|
+
/** E.164 format phone number, validated via libphonenumber-js. */
|
|
201
|
+
to: string;
|
|
202
|
+
/** Purpose category for TRAI classification. */
|
|
203
|
+
purpose: CallPurpose;
|
|
204
|
+
/** Scheduled call time. Defaults to now. */
|
|
205
|
+
scheduledAt?: Date;
|
|
206
|
+
}
|
|
207
|
+
/**
|
|
208
|
+
* Result of a TRAI DNC check.
|
|
209
|
+
*/
|
|
210
|
+
interface DNCCheckResult {
|
|
211
|
+
/** Whether the call is permitted. */
|
|
212
|
+
allowed: boolean;
|
|
213
|
+
/** Human-readable reason if not allowed. */
|
|
214
|
+
reason?: string;
|
|
215
|
+
/** When this result was fetched (from LRU cache). */
|
|
216
|
+
cachedAt?: Date;
|
|
217
|
+
/** Whether result came from local LRU cache. */
|
|
218
|
+
fromCache: boolean;
|
|
219
|
+
}
|
|
220
|
+
/**
|
|
221
|
+
* Consent record stored for TRAI compliance.
|
|
222
|
+
*/
|
|
223
|
+
interface ConsentRecord {
|
|
224
|
+
phoneNumber: string;
|
|
225
|
+
consentedAt: Date;
|
|
226
|
+
/** Channel through which consent was obtained. */
|
|
227
|
+
channel: 'voice' | 'sms' | 'web' | 'ivr';
|
|
228
|
+
/** Call purpose consent was given for. */
|
|
229
|
+
purpose: CallPurpose;
|
|
230
|
+
/** Optional reference ID (e.g. recording URL). */
|
|
231
|
+
referenceId?: string;
|
|
232
|
+
}
|
|
233
|
+
/**
|
|
234
|
+
* TRAI compliance configuration.
|
|
235
|
+
*/
|
|
236
|
+
interface TRAIConfig {
|
|
237
|
+
/** Disable TRAI checks entirely. Default false. */
|
|
238
|
+
disabled?: boolean;
|
|
239
|
+
/** Calling timezone override. Default 'Asia/Kolkata'. */
|
|
240
|
+
timezone?: string;
|
|
241
|
+
/** Override calling hours start (24h). Default 9. */
|
|
242
|
+
callingHoursStart?: number;
|
|
243
|
+
/** Override calling hours end (24h). Default 21. */
|
|
244
|
+
callingHoursEnd?: number;
|
|
245
|
+
/** Custom DNC API endpoint. Default: mock endpoint (must be replaced in production). */
|
|
246
|
+
dncApiEndpoint?: string;
|
|
247
|
+
}
|
|
248
|
+
/**
|
|
249
|
+
* Aggregated metrics for a completed or in-progress call.
|
|
250
|
+
*/
|
|
251
|
+
interface CallMetricsSummary {
|
|
252
|
+
callId: string;
|
|
253
|
+
sttFirstByteMs: number[];
|
|
254
|
+
ttsFirstByteMs: number[];
|
|
255
|
+
llmFirstTokenMs: number[];
|
|
256
|
+
turnLatencyMs: number[];
|
|
257
|
+
interruptionCount: number;
|
|
258
|
+
interruptionPositions: number[];
|
|
259
|
+
tokenCost: {
|
|
260
|
+
model: string;
|
|
261
|
+
inputTokens: number;
|
|
262
|
+
outputTokens: number;
|
|
263
|
+
estimatedUsdCost: number;
|
|
264
|
+
}[];
|
|
265
|
+
avgTurnLatencyMs: number;
|
|
266
|
+
p95TurnLatencyMs: number;
|
|
267
|
+
}
|
|
268
|
+
/**
|
|
269
|
+
* Error severity level.
|
|
270
|
+
*/
|
|
271
|
+
type ErrorSeverity = 'low' | 'medium' | 'high' | 'critical';
|
|
272
|
+
/**
|
|
273
|
+
* Base error context shared by all VoiceKit errors.
|
|
274
|
+
*/
|
|
275
|
+
interface VoiceKitErrorContext {
|
|
276
|
+
/** Error code for programmatic handling. */
|
|
277
|
+
code: string;
|
|
278
|
+
/** Associated call ID if applicable. */
|
|
279
|
+
callId?: string;
|
|
280
|
+
/** The provider that threw (e.g. 'deepgram', 'elevenlabs'). */
|
|
281
|
+
provider?: string;
|
|
282
|
+
/** Whether this error is safe to retry. */
|
|
283
|
+
retryable: boolean;
|
|
284
|
+
/** Severity for alerting/logging. */
|
|
285
|
+
severity: ErrorSeverity;
|
|
286
|
+
/** Original upstream error if wrapping. */
|
|
287
|
+
cause?: unknown;
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
/**
|
|
291
|
+
* @voice-kit/core — Typed error hierarchy
|
|
292
|
+
*
|
|
293
|
+
* All VoiceKit errors extend VoiceKitError. Never throw raw Error.
|
|
294
|
+
* Every error carries: code, message, provider, callId, retryable, severity.
|
|
295
|
+
*/
|
|
296
|
+
|
|
297
|
+
/**
|
|
298
|
+
* Base class for all VoiceKit errors. Provides structured context for
|
|
299
|
+
* logging, alerting, and programmatic error handling.
|
|
300
|
+
*
|
|
301
|
+
* @example
|
|
302
|
+
* ```ts
|
|
303
|
+
* try {
|
|
304
|
+
* await stt.transcribeBatch(audio)
|
|
305
|
+
* } catch (err) {
|
|
306
|
+
* if (err instanceof STTError) {
|
|
307
|
+
* console.error(err.code, err.provider, err.retryable)
|
|
308
|
+
* }
|
|
309
|
+
* }
|
|
310
|
+
* ```
|
|
311
|
+
*/
|
|
312
|
+
declare class VoiceKitError extends Error {
|
|
313
|
+
readonly code: string;
|
|
314
|
+
readonly callId?: string;
|
|
315
|
+
readonly provider?: string;
|
|
316
|
+
readonly retryable: boolean;
|
|
317
|
+
readonly severity: ErrorSeverity;
|
|
318
|
+
readonly cause?: unknown;
|
|
319
|
+
constructor(params: {
|
|
320
|
+
code: string;
|
|
321
|
+
message: string;
|
|
322
|
+
callId?: string;
|
|
323
|
+
provider?: string;
|
|
324
|
+
retryable?: boolean;
|
|
325
|
+
severity?: ErrorSeverity;
|
|
326
|
+
cause?: unknown;
|
|
327
|
+
});
|
|
328
|
+
toJSON(): {
|
|
329
|
+
name: string;
|
|
330
|
+
code: string;
|
|
331
|
+
message: string;
|
|
332
|
+
callId: string | undefined;
|
|
333
|
+
provider: string | undefined;
|
|
334
|
+
retryable: boolean;
|
|
335
|
+
severity: ErrorSeverity;
|
|
336
|
+
};
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
/**
|
|
340
|
+
* Errors from agent orchestration (turn engine, handoff, injection).
|
|
341
|
+
*/
|
|
342
|
+
declare class AgentError extends VoiceKitError {
|
|
343
|
+
}
|
|
344
|
+
/** Agent error that exposes the turn state (`fromState`) and the event (`event`) involved in a transition. NOTE(review): presumably thrown for an invalid turn-engine transition — confirm against the implementation. */
declare class TurnTransitionError extends AgentError {
|
|
345
|
+
/** State the turn engine was in, as passed to the constructor. */
readonly fromState: string;
|
|
346
|
+
/** Event associated with the transition. */
readonly event: string;
|
|
347
|
+
/** NOTE(review): `toEvent` is presumably what populates the `event` field; the names differ — confirm. */
constructor(fromState: string, toEvent: string, callId?: string);
|
|
348
|
+
}
|
|
349
|
+
declare class AgentHandoffError extends AgentError {
|
|
350
|
+
constructor(capability: string, cause?: unknown, callId?: string);
|
|
351
|
+
}
|
|
352
|
+
|
|
353
|
+
/**
|
|
354
|
+
* Errors from compliance checks (TRAI DNC, calling hours, consent).
|
|
355
|
+
*/
|
|
356
|
+
declare class ComplianceError extends VoiceKitError {
|
|
357
|
+
readonly phoneNumber?: string;
|
|
358
|
+
constructor(params: {
|
|
359
|
+
code: string;
|
|
360
|
+
message: string;
|
|
361
|
+
callId?: string;
|
|
362
|
+
phoneNumber?: string;
|
|
363
|
+
retryable?: boolean;
|
|
364
|
+
severity?: ErrorSeverity;
|
|
365
|
+
cause?: unknown;
|
|
366
|
+
});
|
|
367
|
+
}
|
|
368
|
+
declare class DNCBlockedError extends ComplianceError {
|
|
369
|
+
constructor(phoneNumber: string, callId?: string);
|
|
370
|
+
}
|
|
371
|
+
declare class CallingHoursError extends ComplianceError {
|
|
372
|
+
constructor(phoneNumber: string, currentTime: string, callId?: string);
|
|
373
|
+
}
|
|
374
|
+
declare class ConsentMissingError extends ComplianceError {
|
|
375
|
+
constructor(phoneNumber: string, callId?: string);
|
|
376
|
+
}
|
|
377
|
+
|
|
378
|
+
/**
|
|
379
|
+
* Errors from Inngest background task dispatch.
|
|
380
|
+
*/
|
|
381
|
+
declare class InngestError extends VoiceKitError {
|
|
382
|
+
readonly taskName?: string;
|
|
383
|
+
constructor(params: {
|
|
384
|
+
code: string;
|
|
385
|
+
message: string;
|
|
386
|
+
callId?: string;
|
|
387
|
+
taskName?: string;
|
|
388
|
+
cause?: unknown;
|
|
389
|
+
});
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
/**
|
|
393
|
+
* Errors from speech-to-text providers.
|
|
394
|
+
*/
|
|
395
|
+
declare class STTError extends VoiceKitError {
|
|
396
|
+
readonly languageCode?: string;
|
|
397
|
+
constructor(params: {
|
|
398
|
+
code: string;
|
|
399
|
+
message: string;
|
|
400
|
+
callId?: string;
|
|
401
|
+
provider?: string;
|
|
402
|
+
retryable?: boolean;
|
|
403
|
+
severity?: ErrorSeverity;
|
|
404
|
+
cause?: unknown;
|
|
405
|
+
languageCode?: string;
|
|
406
|
+
});
|
|
407
|
+
}
|
|
408
|
+
declare class STTConnectionError extends STTError {
|
|
409
|
+
constructor(provider: string, cause?: unknown, callId?: string);
|
|
410
|
+
}
|
|
411
|
+
declare class STTStreamError extends STTError {
|
|
412
|
+
constructor(provider: string, cause?: unknown, callId?: string);
|
|
413
|
+
}
|
|
414
|
+
declare class STTLanguageNotSupportedError extends STTError {
|
|
415
|
+
constructor(provider: string, language: string);
|
|
416
|
+
}
|
|
417
|
+
|
|
418
|
+
/**
|
|
419
|
+
* Errors from telephony providers.
|
|
420
|
+
*/
|
|
421
|
+
declare class TelephonyError extends VoiceKitError {
|
|
422
|
+
readonly to?: string;
|
|
423
|
+
readonly from?: string;
|
|
424
|
+
constructor(params: {
|
|
425
|
+
code: string;
|
|
426
|
+
message: string;
|
|
427
|
+
callId?: string;
|
|
428
|
+
provider?: string;
|
|
429
|
+
retryable?: boolean;
|
|
430
|
+
severity?: ErrorSeverity;
|
|
431
|
+
cause?: unknown;
|
|
432
|
+
to?: string;
|
|
433
|
+
from?: string;
|
|
434
|
+
});
|
|
435
|
+
}
|
|
436
|
+
declare class CallConnectionError extends TelephonyError {
|
|
437
|
+
constructor(provider: string, to: string, cause?: unknown);
|
|
438
|
+
}
|
|
439
|
+
declare class CallNotFoundError extends TelephonyError {
|
|
440
|
+
constructor(callId: string, provider: string);
|
|
441
|
+
}
|
|
442
|
+
declare class AudioTransportError extends TelephonyError {
|
|
443
|
+
constructor(provider: string, cause?: unknown, callId?: string);
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
/**
|
|
447
|
+
* Errors from text-to-speech providers.
|
|
448
|
+
*/
|
|
449
|
+
declare class TTSError extends VoiceKitError {
|
|
450
|
+
}
|
|
451
|
+
declare class TTSConnectionError extends TTSError {
|
|
452
|
+
constructor(provider: string, cause?: unknown, callId?: string);
|
|
453
|
+
}
|
|
454
|
+
declare class TTSStreamError extends TTSError {
|
|
455
|
+
constructor(provider: string, cause?: unknown, callId?: string);
|
|
456
|
+
}
|
|
457
|
+
declare class TTSVoiceNotFoundError extends TTSError {
|
|
458
|
+
constructor(provider: string, voiceId: string);
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
/**
|
|
462
|
+
* @voice-kit/core — G.711 µ-law codec
|
|
463
|
+
*
|
|
464
|
+
* Pure TypeScript implementation of G.711 µ-law (mu-law) encode/decode.
|
|
465
|
+
* No external codec library needed for µ-law. This is 100% internal —
|
|
466
|
+
* never exported from the public API.
|
|
467
|
+
*
|
|
468
|
+
* Used by AudioPipeline to convert Twilio/Exotel µ-law audio ↔ PCM.
|
|
469
|
+
*/
|
|
470
|
+
/**
|
|
471
|
+
* Convert a single µ-law encoded byte (0–255) to a 16-bit linear PCM sample.
|
|
472
|
+
* Algorithm: ITU-T G.711 Section 3.
|
|
473
|
+
*
|
|
474
|
+
* @internal
|
|
475
|
+
*/
|
|
476
|
+
declare function mulawToLinear(sample: number): number;
|
|
477
|
+
/**
|
|
478
|
+
* Convert a 16-bit linear PCM sample to a µ-law encoded byte.
|
|
479
|
+
* Algorithm: ITU-T G.711 Section 3.
|
|
480
|
+
*
|
|
481
|
+
* @internal
|
|
482
|
+
*/
|
|
483
|
+
declare function linearToMulaw(sample: number): number;
|
|
484
|
+
/**
|
|
485
|
+
* Convert a Buffer of µ-law encoded bytes to 16-bit little-endian PCM.
|
|
486
|
+
* Each µ-law byte expands to 2 PCM bytes (16-bit LE signed).
|
|
487
|
+
*
|
|
488
|
+
* Input: N bytes (µ-law, 8kHz mono as sent by Twilio/Exotel)
|
|
489
|
+
* Output: N*2 bytes (PCM 16-bit LE, same sample rate)
|
|
490
|
+
*
|
|
491
|
+
* @internal
|
|
492
|
+
*/
|
|
493
|
+
declare function mulawBufferToPcm(buf: Buffer): Buffer;
|
|
494
|
+
/**
|
|
495
|
+
* Convert a Buffer of 16-bit little-endian PCM to µ-law bytes.
|
|
496
|
+
* Each pair of PCM bytes compresses to 1 µ-law byte.
|
|
497
|
+
*
|
|
498
|
+
* Input: N bytes (PCM 16-bit LE)
|
|
499
|
+
* Output: N/2 bytes (µ-law)
|
|
500
|
+
*
|
|
501
|
+
* @internal
|
|
502
|
+
*/
|
|
503
|
+
declare function pcmBufferToMulaw(buf: Buffer): Buffer;
|
|
504
|
+
/**
|
|
505
|
+
* Convert a base64-encoded µ-law string (as sent by Twilio Media Streams)
|
|
506
|
+
* directly to PCM Buffer. Convenience wrapper used in TwilioProvider.
|
|
507
|
+
*
|
|
508
|
+
* @internal
|
|
509
|
+
*/
|
|
510
|
+
declare function base64MulawToPcm(base64: string): Buffer;
|
|
511
|
+
/**
|
|
512
|
+
* Convert a PCM Buffer to a base64-encoded µ-law string (for sending
|
|
513
|
+
* back to Twilio Media Streams).
|
|
514
|
+
*
|
|
515
|
+
* @internal
|
|
516
|
+
*/
|
|
517
|
+
declare function pcmToBase64Mulaw(pcm: Buffer): string;
|
|
518
|
+
|
|
519
|
+
/**
|
|
520
|
+
* @voice-kit/core — AudioPipeline
|
|
521
|
+
*
|
|
522
|
+
* Automatically selects codec, sample rate, and VAD config based on the
|
|
523
|
+
* telephony provider. Developers never configure codecs — the pipeline
|
|
524
|
+
* handles all conversions transparently.
|
|
525
|
+
*
|
|
526
|
+
* Provider audio formats:
|
|
527
|
+
* Twilio / Exotel → 8kHz µ-law → decode → 8kHz PCM → upsample → 16kHz PCM (for STT)
|
|
528
|
+
* Plivo / Telnyx → 8kHz µ-law (same as Twilio)
|
|
529
|
+
* LiveKit → 48kHz Opus → decode → 48kHz PCM → downsample → 16kHz PCM (for STT)
|
|
530
|
+
* SIP (generic) → 8kHz G.711 (same as Twilio)
|
|
531
|
+
*
|
|
532
|
+
* TTS output path (reverse):
|
|
533
|
+
* STT/LLM → TTS PCM (provider-native rate) → resample → telephony-native rate → encode
|
|
534
|
+
*/
|
|
535
|
+
|
|
536
|
+
/** Telephony providers handled by the pipeline. */
|
|
537
|
+
type TelephonyProviderName = 'twilio' | 'exotel' | 'plivo' | 'telnyx' | 'livekit' | 'sip';
|
|
538
|
+
/**
|
|
539
|
+
* AudioPipeline: auto-wires codec → resample → VAD for a specific telephony provider.
|
|
540
|
+
*
|
|
541
|
+
* Developers never call this directly — it is instantiated by TelephonyProvider
|
|
542
|
+
* implementations and consumed by VoiceAgent.
|
|
543
|
+
*
|
|
544
|
+
* @internal
|
|
545
|
+
*/
|
|
546
|
+
declare class AudioPipeline {
|
|
547
|
+
private readonly profile;
|
|
548
|
+
readonly provider: TelephonyProviderName;
|
|
549
|
+
constructor(provider: TelephonyProviderName);
|
|
550
|
+
/**
|
|
551
|
+
* Transform incoming telephony audio to 16kHz PCM for STT.
|
|
552
|
+
* Handles µ-law decode + resampling automatically.
|
|
553
|
+
*
|
|
554
|
+
* @param raw Raw audio bytes as received from telephony provider
|
|
555
|
+
* @returns Async iterable of 16kHz PCM buffers for STT
|
|
556
|
+
*
|
|
557
|
+
* @internal
|
|
558
|
+
*/
|
|
559
|
+
inboundForSTT(raw: AsyncIterable<Buffer>): AsyncIterable<Buffer>;
|
|
560
|
+
/**
|
|
561
|
+
* Transform TTS output PCM to telephony-native format for sending to caller.
|
|
562
|
+
* Handles resampling + µ-law encode automatically.
|
|
563
|
+
*
|
|
564
|
+
* @param ttsAudio Raw PCM from TTS provider (at TTS provider's native rate)
|
|
565
|
+
* @param ttsSampleRate Native sample rate of the TTS provider
|
|
566
|
+
* @returns Async iterable of audio bytes ready to send to telephony provider
|
|
567
|
+
*
|
|
568
|
+
* @internal
|
|
569
|
+
*/
|
|
570
|
+
outboundFromTTS(ttsAudio: AsyncIterable<Buffer>, ttsSampleRate: number): AsyncIterable<Buffer>;
|
|
571
|
+
/** Get the VAD config tuned for this provider's audio quality. @internal */
|
|
572
|
+
get vadConfig(): Required<VADConfig>;
|
|
573
|
+
/** Sample rate that STT expects (post-pipeline). @internal */
|
|
574
|
+
get sttSampleRate(): number;
|
|
575
|
+
/** Async generator: decode µ-law stream to PCM. @internal */
|
|
576
|
+
private decodeMulaw;
|
|
577
|
+
}
|
|
578
|
+
/**
|
|
579
|
+
* Factory: create an AudioPipeline pre-configured for the given telephony provider.
|
|
580
|
+
*
|
|
581
|
+
* @internal — used by TelephonyProvider implementations
|
|
582
|
+
*/
|
|
583
|
+
declare function createAudioPipeline(provider: TelephonyProviderName): AudioPipeline;
|
|
584
|
+
|
|
585
|
+
/**
|
|
586
|
+
* @voice-kit/core — PCM audio resampler
|
|
587
|
+
*
|
|
588
|
+
* Resamples raw PCM audio between sample rates using fluent-ffmpeg.
|
|
589
|
+
* 100% internal — never exported from the public API.
|
|
590
|
+
* Used by AudioPipeline to convert provider-native rates to STT-required rates.
|
|
591
|
+
*/
|
|
592
|
+
|
|
593
|
+
/**
|
|
594
|
+
* Resample a PCM Buffer from one sample rate to another.
|
|
595
|
+
* Both input and output are signed 16-bit little-endian PCM, mono.
|
|
596
|
+
*
|
|
597
|
+
* Common conversions:
|
|
598
|
+
* 8kHz → 16kHz (Twilio/Exotel µ-law decoded → Deepgram input)
|
|
599
|
+
* 48kHz → 16kHz (LiveKit Opus decoded → Deepgram input)
|
|
600
|
+
* 24kHz → 8kHz (ElevenLabs output → Twilio send)
|
|
601
|
+
*
|
|
602
|
+
* @param buf Raw PCM bytes (s16le mono)
|
|
603
|
+
* @param fromHz Source sample rate in Hz
|
|
604
|
+
* @param toHz Target sample rate in Hz
|
|
605
|
+
* @returns Resampled PCM bytes (s16le mono)
|
|
606
|
+
*
|
|
607
|
+
* @internal
|
|
608
|
+
*/
|
|
609
|
+
declare function resample(buf: Buffer, fromHz: number, toHz: number): Promise<Buffer>;
|
|
610
|
+
/**
|
|
611
|
+
* Create a streaming resampler Transform stream.
|
|
612
|
+
* More efficient than buffering for large audio chunks.
|
|
613
|
+
*
|
|
614
|
+
* @param fromHz Source sample rate in Hz
|
|
615
|
+
* @param toHz Target sample rate in Hz
|
|
616
|
+
* @returns Node.js PassThrough stream (a Transform): PCM in, resampled PCM out
|
|
617
|
+
*
|
|
618
|
+
* @internal
|
|
619
|
+
*/
|
|
620
|
+
declare function createResamplerStream(fromHz: number, toHz: number): PassThrough;
|
|
621
|
+
/**
|
|
622
|
+
* Async generator that resamples chunks from an audio iterable on the fly.
|
|
623
|
+
* Used by AudioPipeline for realtime streaming paths.
|
|
624
|
+
*
|
|
625
|
+
* @param audio Async iterable of raw PCM buffers at fromHz
|
|
626
|
+
* @param fromHz Source sample rate
|
|
627
|
+
* @param toHz Target sample rate
|
|
628
|
+
*
|
|
629
|
+
* @internal
|
|
630
|
+
*/
|
|
631
|
+
declare function resampleStream(audio: AsyncIterable<Buffer>, fromHz: number, toHz: number): AsyncIterable<Buffer>;
|
|
632
|
+
|
|
633
|
+
/**
|
|
634
|
+
* @voice-kit/core — Voice Activity Detection engine
|
|
635
|
+
*
|
|
636
|
+
* Wraps @ricky0123/vad-web and emits strongly-typed VoiceFrame events.
|
|
637
|
+
* Developers subscribe to VoiceFrame events — they never touch the raw VAD API.
|
|
638
|
+
*
|
|
639
|
+
* @example
|
|
640
|
+
* ```ts
|
|
641
|
+
* const vad = createVAD({ threshold: 0.6 })
|
|
642
|
+
* vad.on('frame', (frame) => {
|
|
643
|
+
* if (frame.type === 'speech_start') startRecording()
|
|
644
|
+
* if (frame.type === 'speech_end') stopRecording()
|
|
645
|
+
* })
|
|
646
|
+
* await vad.processStream(audioStream)
|
|
647
|
+
* ```
|
|
648
|
+
*/
|
|
649
|
+
|
|
650
|
+
/** Event map used as the type parameter of VADEngine's EventEmitter: event name → listener argument tuple. */
type VADEventMap = {
|
|
651
|
+
/** 'frame' listeners receive a single VoiceFrame. */
frame: [VoiceFrame];
|
|
652
|
+
/** 'error' listeners receive a single AudioTransportError. */
error: [AudioTransportError];
|
|
653
|
+
};
|
|
654
|
+
/**
|
|
655
|
+
* Internal VAD engine. Processes a 16kHz PCM stream and emits VoiceFrame events.
|
|
656
|
+
* Automatically debounces rapid speech_start/speech_end transitions.
|
|
657
|
+
*
|
|
658
|
+
* Input: 16kHz, 16-bit little-endian PCM, mono.
|
|
659
|
+
* Output: VoiceFrame events on the emitter.
|
|
660
|
+
*/
|
|
661
|
+
declare class VADEngine extends EventEmitter<VADEventMap> {
|
|
662
|
+
private readonly config;
|
|
663
|
+
private isSpeaking;
|
|
664
|
+
private positiveFrameCount;
|
|
665
|
+
private negativeFrameCount;
|
|
666
|
+
private debounceTimer;
|
|
667
|
+
private frameBuffer;
|
|
668
|
+
private vadModel;
|
|
669
|
+
constructor(config?: VADConfig);
|
|
670
|
+
/**
|
|
671
|
+
* Process an async stream of PCM audio frames.
|
|
672
|
+
* Automatically frames the input into 30ms chunks for VAD processing.
|
|
673
|
+
*
|
|
674
|
+
* @param audio Async iterable of PCM buffers (16kHz, s16le, mono)
|
|
675
|
+
*/
|
|
676
|
+
processStream(audio: AsyncIterable<Buffer>): Promise<void>;
|
|
677
|
+
/**
|
|
678
|
+
* Process a single 30ms PCM frame through the VAD model.
|
|
679
|
+
*
|
|
680
|
+
* @internal
|
|
681
|
+
*/
|
|
682
|
+
private processFrame;
|
|
683
|
+
/**
|
|
684
|
+
* Run Silero VAD model inference on a single frame.
|
|
685
|
+
* Returns confidence score 0–1.
|
|
686
|
+
*
|
|
687
|
+
* @internal
|
|
688
|
+
*/
|
|
689
|
+
private runVADInference;
|
|
690
|
+
private emitFrame;
|
|
691
|
+
private scheduleDebounce;
|
|
692
|
+
private clearDebounce;
|
|
693
|
+
/**
|
|
694
|
+
* Load the Silero VAD model if not already loaded.
|
|
695
|
+
* @internal
|
|
696
|
+
*/
|
|
697
|
+
private ensureModelLoaded;
|
|
698
|
+
/** Clean up resources. Call when the call ends. */
|
|
699
|
+
destroy(): void;
|
|
700
|
+
}
|
|
701
|
+
/**
|
|
702
|
+
* Create a configured VAD engine instance.
|
|
703
|
+
* Input must be 16kHz, 16-bit LE, mono PCM (handled automatically by AudioPipeline).
|
|
704
|
+
*
|
|
705
|
+
* @example
|
|
706
|
+
* ```ts
|
|
707
|
+
* const vad = createVAD({ threshold: 0.7, debounceMs: 200 })
|
|
708
|
+
* vad.on('frame', (frame) => handleFrame(frame))
|
|
709
|
+
* await vad.processStream(audioStream)
|
|
710
|
+
* ```
|
|
711
|
+
*/
|
|
712
|
+
declare function createVAD(config?: VADConfig): VADEngine;
|
|
713
|
+
|
|
714
|
+
/**
|
|
715
|
+
* @voice-kit/core — Call audit log
|
|
716
|
+
*
|
|
717
|
+
* Immutable append-only audit log for compliance and debugging.
|
|
718
|
+
* In-memory (LRU) + optional file sink. Once written, entries cannot be modified.
|
|
719
|
+
*/
|
|
720
|
+
type AuditEventType = 'call.started' | 'call.ended' | 'compliance.checked' | 'compliance.blocked' | 'consent.recorded' | 'consent.verified' | 'turn.started' | 'turn.ended' | 'interruption' | 'agent.handoff' | 'tool.called' | 'error';
|
|
721
|
+
interface AuditEntry {
|
|
722
|
+
readonly id: string;
|
|
723
|
+
readonly callId: string;
|
|
724
|
+
readonly type: AuditEventType;
|
|
725
|
+
readonly timestamp: Date;
|
|
726
|
+
readonly data: Readonly<Record<string, unknown>>;
|
|
727
|
+
}
|
|
728
|
+
/**
|
|
729
|
+
* Immutable append-only call audit log.
|
|
730
|
+
*
|
|
731
|
+
* Entries are written to LRU in-process memory and optionally to a JSONL file.
|
|
732
|
+
* Once written, entries are frozen — no modification is possible.
|
|
733
|
+
*
|
|
734
|
+
* @example
|
|
735
|
+
* ```ts
|
|
736
|
+
* const audit = new CallAuditLog({ filePath: '/var/log/voice-kit/audit.jsonl' })
|
|
737
|
+
* audit.append(callId, 'call.started', { from: '+91...', to: '+91...' })
|
|
738
|
+
* const entries = audit.getEntries(callId)
|
|
739
|
+
* ```
|
|
740
|
+
*/
|
|
741
|
+
declare class CallAuditLog {
|
|
742
|
+
/** LRU: up to 10,000 calls × 200 entries each = 2M entries max */
|
|
743
|
+
private readonly cache;
|
|
744
|
+
private readonly filePath?;
|
|
745
|
+
constructor(options?: {
|
|
746
|
+
filePath?: string;
|
|
747
|
+
maxCalls?: number;
|
|
748
|
+
});
|
|
749
|
+
/**
|
|
750
|
+
* Append an immutable audit entry for a call.
|
|
751
|
+
*
|
|
752
|
+
* @param callId The call identifier
|
|
753
|
+
* @param type Audit event type
|
|
754
|
+
* @param data Additional structured data
|
|
755
|
+
*/
|
|
756
|
+
append(callId: string, type: AuditEventType, data?: Record<string, unknown>): AuditEntry;
|
|
757
|
+
/**
|
|
758
|
+
* Get all audit entries for a call, in insertion order.
|
|
759
|
+
*
|
|
760
|
+
* @param callId The call identifier
|
|
761
|
+
*/
|
|
762
|
+
getEntries(callId: string): ReadonlyArray<AuditEntry>;
|
|
763
|
+
/**
|
|
764
|
+
* Get entries of a specific type for a call.
|
|
765
|
+
*/
|
|
766
|
+
getEntriesByType(callId: string, type: AuditEventType): ReadonlyArray<AuditEntry>;
|
|
767
|
+
/** Write entry to JSONL file. @internal */
|
|
768
|
+
private writeToFile;
|
|
769
|
+
}
|
|
770
|
+
|
|
771
|
+
/**
|
|
772
|
+
* @voice-kit/core — TRAI Compliance
|
|
773
|
+
*
|
|
774
|
+
* TRAI (Telecom Regulatory Authority of India) compliance utilities:
|
|
775
|
+
* - DNC (Do Not Call) registry check with 24h LRU cache
|
|
776
|
+
* - Calling hours enforcement (9 AM – 9 PM IST)
|
|
777
|
+
* - Consent tracking (180-day validity)
|
|
778
|
+
*
|
|
779
|
+
* Auto-enabled for +91 numbers. Opt-out, not opt-in.
|
|
780
|
+
*/
|
|
781
|
+
|
|
782
|
+
/**
|
|
783
|
+
* TRAI compliance engine.
|
|
784
|
+
*
|
|
785
|
+
* Enforces DNC registry, calling hours, and consent rules for Indian numbers.
|
|
786
|
+
* Results are cached in LRU to minimize API round-trips.
|
|
787
|
+
*
|
|
788
|
+
* @example
|
|
789
|
+
* ```ts
|
|
790
|
+
* const trai = new TRAICompliance()
|
|
791
|
+
*
|
|
792
|
+
* const result = await trai.checkCallPermission({
|
|
793
|
+
* to: '+919876543210',
|
|
794
|
+
* purpose: 'TRANSACTIONAL',
|
|
795
|
+
* })
|
|
796
|
+
*
|
|
797
|
+
* if (!result.allowed) throw new Error(result.reason)
|
|
798
|
+
* ```
|
|
799
|
+
*/
|
|
800
|
+
declare class TRAICompliance {
|
|
801
|
+
private readonly config;
|
|
802
|
+
private readonly http;
|
|
803
|
+
/** DNC check results cached for 24 hours per number. */
|
|
804
|
+
private readonly dncCache;
|
|
805
|
+
/** Consent records cached for 180 days. */
|
|
806
|
+
private readonly consentCache;
|
|
807
|
+
constructor(config?: TRAIConfig);
|
|
808
|
+
/**
|
|
809
|
+
* Check whether a call is permitted under TRAI rules.
|
|
810
|
+
* Checks: valid E.164, DNC registry, calling hours.
|
|
811
|
+
*
|
|
812
|
+
* @param params Call permission check parameters
|
|
813
|
+
* @throws DNCBlockedError if number is on DNC registry
|
|
814
|
+
* @throws CallingHoursError if outside allowed calling hours
|
|
815
|
+
* @throws ComplianceError if phone number is invalid
|
|
816
|
+
*
|
|
817
|
+
* @example
|
|
818
|
+
* ```ts
|
|
819
|
+
* const result = await trai.checkCallPermission({
|
|
820
|
+
* to: '+919876543210',
|
|
821
|
+
* purpose: 'TRANSACTIONAL',
|
|
822
|
+
* })
|
|
823
|
+
* if (!result.allowed) console.log(result.reason)
|
|
824
|
+
* ```
|
|
825
|
+
*/
|
|
826
|
+
checkCallPermission(params: DNCCheckParams): Promise<DNCCheckResult>;
|
|
827
|
+
/**
|
|
828
|
+
* Check if the current time (or a given time) is within TRAI calling hours.
|
|
829
|
+
* Allowed: 9:00 AM – 9:00 PM IST.
|
|
830
|
+
* Uses Intl.DateTimeFormat only — no date-fns or dayjs dependency.
|
|
831
|
+
*
|
|
832
|
+
* @param at Time to check. Defaults to now.
|
|
833
|
+
* @param timezone IANA timezone. Defaults to 'Asia/Kolkata'.
|
|
834
|
+
*
|
|
835
|
+
* @example
|
|
836
|
+
* ```ts
|
|
837
|
+
* trai.isWithinCallingHours() // Check now
|
|
838
|
+
* trai.isWithinCallingHours(new Date()) // Explicit time
|
|
839
|
+
* ```
|
|
840
|
+
*/
|
|
841
|
+
isWithinCallingHours(at?: Date, timezone?: string): boolean;
|
|
842
|
+
/**
|
|
843
|
+
* Record explicit consent from a user for future calls.
|
|
844
|
+
* Consent is valid for 180 days per TRAI guidelines.
|
|
845
|
+
*
|
|
846
|
+
* @param params Consent record details
|
|
847
|
+
*
|
|
848
|
+
* @example
|
|
849
|
+
* ```ts
|
|
850
|
+
* await trai.recordConsent({
|
|
851
|
+
* phoneNumber: '+919876543210',
|
|
852
|
+
* consentedAt: new Date(),
|
|
853
|
+
* channel: 'ivr',
|
|
854
|
+
* purpose: 'PROMOTIONAL',
|
|
855
|
+
* })
|
|
856
|
+
* ```
|
|
857
|
+
*/
|
|
858
|
+
recordConsent(params: ConsentRecord): Promise<void>;
|
|
859
|
+
/**
|
|
860
|
+
* Check if a number has valid (non-expired) consent on record.
|
|
861
|
+
*
|
|
862
|
+
* @param phoneNumber E.164 phone number
|
|
863
|
+
* @returns True if valid consent exists
|
|
864
|
+
*/
|
|
865
|
+
hasValidConsent(phoneNumber: string): Promise<boolean>;
|
|
866
|
+
/**
|
|
867
|
+
* Fetch DNC status from TRAI DND API.
|
|
868
|
+
* @internal
|
|
869
|
+
*/
|
|
870
|
+
private fetchDNCStatus;
|
|
871
|
+
}
|
|
872
|
+
|
|
873
|
+
/**
|
|
874
|
+
* @voice-kit/core — LRU-backed call memory
|
|
875
|
+
*
|
|
876
|
+
* Provides a sliding window of conversation turns per call.
|
|
877
|
+
* Uses lru-cache for bounded in-process storage — no Redis, no DB.
|
|
878
|
+
* Every cache has explicit max size and TTL to prevent unbounded growth.
|
|
879
|
+
*/
|
|
880
|
+
|
|
881
|
+
/**
|
|
882
|
+
* Create an LRU-backed call memory instance.
|
|
883
|
+
* This is the ONLY in-process conversation memory system in the SDK.
|
|
884
|
+
*
|
|
885
|
+
* @param config Memory configuration
|
|
886
|
+
*
|
|
887
|
+
* @example
|
|
888
|
+
* ```ts
|
|
889
|
+
* // Default: 20 turns, 512KB, 30min TTL
|
|
890
|
+
* const memory = createCallMemory()
|
|
891
|
+
*
|
|
892
|
+
* // Custom
|
|
893
|
+
* const memory = createCallMemory({ maxTurns: 30, maxBytes: 1_000_000 })
|
|
894
|
+
* ```
|
|
895
|
+
*/
|
|
896
|
+
declare function createCallMemory(config?: CallMemoryConfig): CallMemory;
|
|
897
|
+
|
|
898
|
+
/**
|
|
899
|
+
* @voice-kit/core — CallMetrics
|
|
900
|
+
*
|
|
901
|
+
* Records per-call performance metrics: TTFB, turn latency, token cost, interruption rate.
|
|
902
|
+
* In-process LRU storage — exported via getCallSummary().
|
|
903
|
+
*/
|
|
904
|
+
|
|
905
|
+
/**
|
|
906
|
+
* Per-call performance metrics recorder.
|
|
907
|
+
*
|
|
908
|
+
* @example
|
|
909
|
+
* ```ts
|
|
910
|
+
* const metrics = new CallMetrics()
|
|
911
|
+
* metrics.recordSTTFirstByte(callId, 180)
|
|
912
|
+
* metrics.recordTurnLatency(callId, 340)
|
|
913
|
+
* const summary = metrics.getCallSummary(callId)
|
|
914
|
+
* console.log(summary.avgTurnLatencyMs) // 340
|
|
915
|
+
* ```
|
|
916
|
+
*/
|
|
917
|
+
declare class CallMetrics {
|
|
918
|
+
private readonly store;
|
|
919
|
+
constructor();
|
|
920
|
+
private getOrCreate;
|
|
921
|
+
/** Record time from audio start to first STT partial result. */
|
|
922
|
+
recordSTTFirstByte(callId: string, ms: number): void;
|
|
923
|
+
/** Record time from TTS request to first audio chunk. */
|
|
924
|
+
recordTTSFirstByte(callId: string, ms: number): void;
|
|
925
|
+
/** Record time from LLM request to first token. */
|
|
926
|
+
recordLLMFirstToken(callId: string, ms: number): void;
|
|
927
|
+
/**
|
|
928
|
+
* Record end-to-end turn latency: speech_end → first TTS audio byte.
|
|
929
|
+
* This is the primary latency metric for voice agent quality.
|
|
930
|
+
*/
|
|
931
|
+
recordTurnLatency(callId: string, ms: number): void;
|
|
932
|
+
/**
|
|
933
|
+
* Record an interruption event.
|
|
934
|
+
*
|
|
935
|
+
* @param callId Call identifier
|
|
936
|
+
* @param positionPct Fraction from 0 to 1 (not a percentage): how far through the TTS stream the interruption occurred
|
|
937
|
+
*/
|
|
938
|
+
recordInterruption(callId: string, positionPct: number): void;
|
|
939
|
+
/** Record token usage and estimated cost for a model call. */
|
|
940
|
+
recordTokenCost(callId: string, model: string, inputTokens: number, outputTokens: number): void;
|
|
941
|
+
/**
|
|
942
|
+
* Get a full summary of metrics for a call.
|
|
943
|
+
*
|
|
944
|
+
* @param callId The call identifier
|
|
945
|
+
* @returns Aggregated metrics summary
|
|
946
|
+
*/
|
|
947
|
+
getCallSummary(callId: string): CallMetricsSummary;
|
|
948
|
+
/** Remove metrics for a call. Call on call.ended to free memory. */
|
|
949
|
+
clearCall(callId: string): void;
|
|
950
|
+
}
|
|
951
|
+
|
|
952
|
+
/**
|
|
953
|
+
* @voice-kit/core — OpenTelemetry tracing
|
|
954
|
+
*
|
|
955
|
+
* VoiceSDKTracer: wraps every external provider call with OTel spans.
|
|
956
|
+
* Auto-exports to OTLP endpoint if OTEL_EXPORTER_OTLP_ENDPOINT is set.
|
|
957
|
+
*/
|
|
958
|
+
/**
|
|
959
|
+
* OpenTelemetry tracer for VoiceKit. Wraps every external I/O with spans.
|
|
960
|
+
*
|
|
961
|
+
* @example
|
|
962
|
+
* ```ts
|
|
963
|
+
* const tracer = new VoiceSDKTracer()
|
|
964
|
+
* const result = await tracer.traceSTT(
|
|
965
|
+
* () => stt.transcribeBatch(audio),
|
|
966
|
+
* { provider: 'deepgram', language: 'en-IN' }
|
|
967
|
+
* )
|
|
968
|
+
* ```
|
|
969
|
+
*/
|
|
970
|
+
declare class VoiceSDKTracer {
|
|
971
|
+
private readonly tracer;
|
|
972
|
+
constructor();
|
|
973
|
+
/**
|
|
974
|
+
* Trace an STT operation with provider + language attributes.
|
|
975
|
+
*/
|
|
976
|
+
traceSTT<T>(fn: () => Promise<T>, attrs: {
|
|
977
|
+
provider: string;
|
|
978
|
+
language: string;
|
|
979
|
+
callId?: string;
|
|
980
|
+
}): Promise<T>;
|
|
981
|
+
/**
|
|
982
|
+
* Trace a TTS synthesis operation.
|
|
983
|
+
*/
|
|
984
|
+
traceTTS<T>(fn: () => Promise<T>, attrs: {
|
|
985
|
+
provider: string;
|
|
986
|
+
voice: string;
|
|
987
|
+
chars: number;
|
|
988
|
+
callId?: string;
|
|
989
|
+
}): Promise<T>;
|
|
990
|
+
/**
|
|
991
|
+
* Trace an LLM generation call.
|
|
992
|
+
*/
|
|
993
|
+
traceLLM<T>(fn: () => Promise<T>, attrs: {
|
|
994
|
+
model: string;
|
|
995
|
+
inputTokens: number;
|
|
996
|
+
callId?: string;
|
|
997
|
+
}): Promise<T>;
|
|
998
|
+
/**
|
|
999
|
+
* Trace a full call lifecycle.
|
|
1000
|
+
*/
|
|
1001
|
+
traceCall<T>(fn: () => Promise<T>, attrs: {
|
|
1002
|
+
callId: string;
|
|
1003
|
+
direction: 'inbound' | 'outbound';
|
|
1004
|
+
}): Promise<T>;
|
|
1005
|
+
/**
|
|
1006
|
+
* Trace a single conversation turn.
|
|
1007
|
+
*/
|
|
1008
|
+
traceTurn<T>(fn: () => Promise<T>, attrs: {
|
|
1009
|
+
turnIndex: number;
|
|
1010
|
+
callId: string;
|
|
1011
|
+
}): Promise<T>;
|
|
1012
|
+
/** Generic span wrapper. @internal */
|
|
1013
|
+
private withSpan;
|
|
1014
|
+
}
|
|
1015
|
+
|
|
1016
|
+
/**
|
|
1017
|
+
* @voice-kit/core — AssemblyAI STT Provider
|
|
1018
|
+
*
|
|
1019
|
+
* Async long-form transcription using AssemblyAI SDK.
|
|
1020
|
+
* Best for post-call recordings, meeting notes, long interviews.
|
|
1021
|
+
* Does not support realtime streaming — use Deepgram for live calls.
|
|
1022
|
+
*/
|
|
1023
|
+
|
|
1024
|
+
/**
|
|
1025
|
+
* AssemblyAI async transcription provider.
|
|
1026
|
+
* @internal — obtained via createSTT('assemblyai', config)
|
|
1027
|
+
*/
|
|
1028
|
+
declare class AssemblyAISTTProvider implements STTProvider {
|
|
1029
|
+
readonly name = "assemblyai";
|
|
1030
|
+
readonly supportsStreaming = false;
|
|
1031
|
+
readonly supportedLanguages: string[];
|
|
1032
|
+
private readonly client;
|
|
1033
|
+
private readonly config;
|
|
1034
|
+
constructor(config: STTConfig);
|
|
1035
|
+
/**
|
|
1036
|
+
* Batch-transcribes collected audio. AssemblyAI has no realtime streaming.
|
|
1037
|
+
* Collects all audio from the iterable, uploads, then polls for result.
|
|
1038
|
+
*
|
|
1039
|
+
* @param audio Async iterable of PCM buffers
|
|
1040
|
+
*/
|
|
1041
|
+
transcribeStream(audio: AsyncIterable<Buffer>): AsyncIterable<STTResult>;
|
|
1042
|
+
/**
|
|
1043
|
+
* Upload audio to AssemblyAI and wait for async transcription.
|
|
1044
|
+
* Suitable for call recordings. Average latency: 15–45s per minute of audio.
|
|
1045
|
+
*
|
|
1046
|
+
* @param audio Raw WAV/PCM/MP3 buffer
|
|
1047
|
+
*
|
|
1048
|
+
* @example
|
|
1049
|
+
* ```ts
|
|
1050
|
+
* const stt = createSTT('assemblyai', { wordTimestamps: true })
|
|
1051
|
+
* const result = await stt.transcribeBatch(recordingBuffer)
|
|
1052
|
+
* console.log(result.words) // Word-level timestamps
|
|
1053
|
+
* ```
|
|
1054
|
+
*/
|
|
1055
|
+
transcribeBatch(audio: Buffer): Promise<STTResult>;
|
|
1056
|
+
}
|
|
1057
|
+
|
|
1058
|
+
/**
|
|
1059
|
+
* @voice-kit/core — Deepgram Nova-3 STT Provider
|
|
1060
|
+
*
|
|
1061
|
+
* Streaming STT using Deepgram Nova-3. Handles WebSocket reconnect with
|
|
1062
|
+
* exponential backoff, interim + final results, language detection.
|
|
1063
|
+
* Never instantiate directly — use createSTT('deepgram', config).
|
|
1064
|
+
*
|
|
1065
|
+
* SDK: @deepgram/sdk v5 (beta) — https://github.com/deepgram/deepgram-js-sdk
|
|
1066
|
+
*/
|
|
1067
|
+
|
|
1068
|
+
/**
|
|
1069
|
+
* Deepgram Nova-3 streaming STT provider.
|
|
1070
|
+
* @internal — obtained via createSTT('deepgram', config)
|
|
1071
|
+
*/
|
|
1072
|
+
declare class DeepgramSTTProvider implements STTProvider {
|
|
1073
|
+
readonly name = "deepgram";
|
|
1074
|
+
readonly supportsStreaming = true;
|
|
1075
|
+
readonly supportedLanguages: string[];
|
|
1076
|
+
private readonly client;
|
|
1077
|
+
private readonly config;
|
|
1078
|
+
constructor(config: STTConfig);
|
|
1079
|
+
/**
|
|
1080
|
+
* Stream audio to Deepgram and receive interim + final transcription results.
|
|
1081
|
+
* Handles reconnection transparently with exponential backoff.
|
|
1082
|
+
*
|
|
1083
|
+
* @param audio Async iterable of 16kHz PCM buffers from AudioPipeline
|
|
1084
|
+
*
|
|
1085
|
+
* @example
|
|
1086
|
+
* ```ts
|
|
1087
|
+
* const stt = createSTT('deepgram', { language: 'hi-IN' })
|
|
1088
|
+
* for await (const result of stt.transcribeStream(audioIterable)) {
|
|
1089
|
+
* if (result.isFinal) console.log('User said:', result.transcript)
|
|
1090
|
+
* }
|
|
1091
|
+
* ```
|
|
1092
|
+
*/
|
|
1093
|
+
transcribeStream(audio: AsyncIterable<Buffer>): AsyncIterable<STTResult>;
|
|
1094
|
+
/**
|
|
1095
|
+
* Transcribe a complete audio buffer (non-streaming).
|
|
1096
|
+
* Uses Deepgram pre-recorded API.
|
|
1097
|
+
*
|
|
1098
|
+
* @param audio Raw PCM or WAV buffer
|
|
1099
|
+
*/
|
|
1100
|
+
transcribeBatch(audio: Buffer): Promise<STTResult>;
|
|
1101
|
+
/**
|
|
1102
|
+
* Create and open a live WebSocket connection to Deepgram.
|
|
1103
|
+
*
|
|
1104
|
+
* v5 connection lifecycle (3 explicit steps):
|
|
1105
|
+
* 1. await listen.v1.connect(options) — constructs the connection object
|
|
1106
|
+
* 2. connection.connect() — initiates the WebSocket handshake
|
|
1107
|
+
* 3. await connection.waitForOpen() — resolves once the socket is ready
|
|
1108
|
+
*
|
|
1109
|
+
* @internal
|
|
1110
|
+
*/
|
|
1111
|
+
private connectWithRetry;
|
|
1112
|
+
}
|
|
1113
|
+
|
|
1114
|
+
/**
|
|
1115
|
+
* @voice-kit/core — Sarvam AI Indic STT Provider
|
|
1116
|
+
*
|
|
1117
|
+
* Sarvam AI provides state-of-the-art STT for Indian languages:
|
|
1118
|
+
* hi-IN, kn-IN, ta-IN, te-IN, mr-IN, bn-IN, gu-IN, pa-IN, or-IN
|
|
1119
|
+
*
|
|
1120
|
+
* Uses axios for HTTP calls. No official JS SDK — we use the REST API directly.
|
|
1121
|
+
*/
|
|
1122
|
+
|
|
1123
|
+
/**
|
|
1124
|
+
* Sarvam AI Indic STT provider.
|
|
1125
|
+
* @internal — obtained via createSTT('sarvam', config)
|
|
1126
|
+
*/
|
|
1127
|
+
declare class SarvamSTTProvider implements STTProvider {
|
|
1128
|
+
readonly name = "sarvam";
|
|
1129
|
+
readonly supportsStreaming = false;
|
|
1130
|
+
readonly supportedLanguages: string[];
|
|
1131
|
+
private readonly http;
|
|
1132
|
+
private readonly config;
|
|
1133
|
+
constructor(config: STTConfig);
|
|
1134
|
+
/**
|
|
1135
|
+
* Collects audio and transcribes via Sarvam batch API.
|
|
1136
|
+
* Sarvam doesn't support realtime streaming.
|
|
1137
|
+
*
|
|
1138
|
+
* @param audio Async iterable of 16kHz PCM buffers
|
|
1139
|
+
*/
|
|
1140
|
+
transcribeStream(audio: AsyncIterable<Buffer>): AsyncIterable<STTResult>;
|
|
1141
|
+
/**
|
|
1142
|
+
* Transcribe a WAV/PCM audio buffer in an Indic language.
|
|
1143
|
+
*
|
|
1144
|
+
* @param audio 16kHz PCM or WAV buffer
|
|
1145
|
+
*
|
|
1146
|
+
* @example
|
|
1147
|
+
* ```ts
|
|
1148
|
+
* const stt = createSTT('sarvam', { language: 'ta-IN' })
|
|
1149
|
+
* const result = await stt.transcribeBatch(tamilAudioBuffer)
|
|
1150
|
+
* console.log(result.transcript) // Tamil text
|
|
1151
|
+
* ```
|
|
1152
|
+
*/
|
|
1153
|
+
transcribeBatch(audio: Buffer): Promise<STTResult>;
|
|
1154
|
+
}
|
|
1155
|
+
|
|
1156
|
+
/**
|
|
1157
|
+
* @voice-kit/core — Hinglish language switch detector
|
|
1158
|
+
*
|
|
1159
|
+
* Detects mid-sentence Hindi↔English (Hinglish) code-switching in realtime STT output.
|
|
1160
|
+
* Pure algorithmic detection — no external API calls, no latency overhead.
|
|
1161
|
+
*
|
|
1162
|
+
* Detection signals:
|
|
1163
|
+
* 1. Devanagari Unicode range (U+0900–U+097F) for Hindi
|
|
1164
|
+
* 2. Latin character runs for English
|
|
1165
|
+
* 3. Common Hinglish transition patterns (e.g. "main think karta hun")
|
|
1166
|
+
* 4. Script boundary crossing mid-sentence
|
|
1167
|
+
*/
|
|
1168
|
+
|
|
1169
|
+
type LanguageCode = 'hi-IN' | 'en-IN' | 'unknown';
|
|
1170
|
+
interface LanguageSwitchEvent {
|
|
1171
|
+
/** Language switched from. */
|
|
1172
|
+
from: LanguageCode;
|
|
1173
|
+
/** Language switched to. */
|
|
1174
|
+
to: LanguageCode;
|
|
1175
|
+
/** Position in transcript where switch occurred (word index). */
|
|
1176
|
+
position: number;
|
|
1177
|
+
/** Confidence of the detection 0–1. */
|
|
1178
|
+
confidence: number;
|
|
1179
|
+
/** Full transcript at time of detection. */
|
|
1180
|
+
transcript: string;
|
|
1181
|
+
/** Timestamp of detection. */
|
|
1182
|
+
detectedAt: Date;
|
|
1183
|
+
}
|
|
1184
|
+
type LanguageDetectorEventMap = {
|
|
1185
|
+
'language.switched': [LanguageSwitchEvent];
|
|
1186
|
+
};
|
|
1187
|
+
/**
|
|
1188
|
+
* Hinglish language switch detector.
|
|
1189
|
+
*
|
|
1190
|
+
* Analyzes STT transcripts word-by-word in realtime.
|
|
1191
|
+
* Emits 'language.switched' events when a significant script change is detected.
|
|
1192
|
+
*
|
|
1193
|
+
* @example
|
|
1194
|
+
* ```ts
|
|
1195
|
+
* const detector = new LanguageSwitchDetector('en-IN')
|
|
1196
|
+
* detector.on('language.switched', ({ from, to, transcript }) => {
|
|
1197
|
+
* console.log(`Language switched: ${from} → ${to} in: "${transcript}"`)
|
|
1198
|
+
* })
|
|
1199
|
+
*
|
|
1200
|
+
* // Call on every STT final result
|
|
1201
|
+
* detector.analyze('main yeh kaam kal karonga I promise')
|
|
1202
|
+
* ```
|
|
1203
|
+
*/
|
|
1204
|
+
declare class LanguageSwitchDetector extends EventEmitter<LanguageDetectorEventMap> {
|
|
1205
|
+
private currentLanguage;
|
|
1206
|
+
private readonly primaryLanguage;
|
|
1207
|
+
/** Rolling window of recent language classifications for smoothing. */
|
|
1208
|
+
private recentClassifications;
|
|
1209
|
+
private readonly windowSize;
|
|
1210
|
+
constructor(primaryLanguage?: LanguageCode);
|
|
1211
|
+
/**
|
|
1212
|
+
* Analyze a transcript for language switches.
|
|
1213
|
+
* Should be called on every STT final result.
|
|
1214
|
+
*
|
|
1215
|
+
* @param transcript The transcribed text to analyze
|
|
1216
|
+
* @returns Detected language of the transcript
|
|
1217
|
+
*/
|
|
1218
|
+
analyze(transcript: string): LanguageCode;
|
|
1219
|
+
/**
|
|
1220
|
+
* Analyze a transcript and return per-word language classification.
|
|
1221
|
+
* Useful for word-level Hinglish mixing visualization.
|
|
1222
|
+
*
|
|
1223
|
+
* @param transcript Text to analyze
|
|
1224
|
+
* @returns Array of { word, language } pairs
|
|
1225
|
+
*/
|
|
1226
|
+
analyzeWords(transcript: string): Array<{
|
|
1227
|
+
word: string;
|
|
1228
|
+
language: LanguageCode;
|
|
1229
|
+
}>;
|
|
1230
|
+
/** Reset to primary language (e.g., on new call). */
|
|
1231
|
+
reset(): void;
|
|
1232
|
+
/** Current detected language. */
|
|
1233
|
+
get language(): LanguageCode;
|
|
1234
|
+
private tokenize;
|
|
1235
|
+
private classifyWord;
|
|
1236
|
+
private classifySegment;
|
|
1237
|
+
private computeConfidence;
|
|
1238
|
+
private smoothedLanguage;
|
|
1239
|
+
}
|
|
1240
|
+
/**
|
|
1241
|
+
* Detect whether a transcript contains mixed Hindi+English (Hinglish).
|
|
1242
|
+
* Stateless convenience function for one-shot analysis.
|
|
1243
|
+
*
|
|
1244
|
+
* @param transcript Text to analyze
|
|
1245
|
+
* @returns True if both Devanagari and Latin characters are present
|
|
1246
|
+
*
|
|
1247
|
+
* @example
|
|
1248
|
+
* ```ts
|
|
1249
|
+
* isInglish('main kal office jaaunga') // true
|
|
1250
|
+
* isInglish('I will go to the office') // false
|
|
1251
|
+
* isInglish('मैं कल ऑफिस जाऊंगा') // false (pure Hindi)
|
|
1252
|
+
* ```
|
|
1253
|
+
*/
|
|
1254
|
+
declare function isInglish(transcript: string): boolean;
|
|
1255
|
+
|
|
1256
|
+
/**
|
|
1257
|
+
* @voice-kit/core — STT factory
|
|
1258
|
+
*
|
|
1259
|
+
* createSTT() is the ONLY public API for speech-to-text.
|
|
1260
|
+
* Never instantiate provider classes directly.
|
|
1261
|
+
*/
|
|
1262
|
+
|
|
1263
|
+
/**
|
|
1264
|
+
* Create an STT provider instance. This is the ONLY public API for STT.
|
|
1265
|
+
*
|
|
1266
|
+
* Provider selection guide:
|
|
1267
|
+
* - 'deepgram' → Default. Realtime streaming, best latency, supports en-IN + Indic
|
|
1268
|
+
* - 'sarvam' → Best accuracy for pure Indic languages (hi-IN, ta-IN, kn-IN, te-IN, mr-IN)
|
|
1269
|
+
* - 'assemblyai' → Best for long-form recordings (post-call analysis)
|
|
1270
|
+
* - 'whisper' → Fallback batch transcription, broad language support
|
|
1271
|
+
*
|
|
1272
|
+
* @example
|
|
1273
|
+
* ```ts
|
|
1274
|
+
* // Realtime English (India) — default
|
|
1275
|
+
* const stt = createSTT('deepgram', { language: 'en-IN' })
|
|
1276
|
+
*
|
|
1277
|
+
* // Realtime Hindi
|
|
1278
|
+
* const stt = createSTT('deepgram', { language: 'hi-IN' })
|
|
1279
|
+
*
|
|
1280
|
+
* // Best Indic accuracy
|
|
1281
|
+
* const stt = createSTT('sarvam', { language: 'ta-IN' })
|
|
1282
|
+
*
|
|
1283
|
+
* // Post-call recording
|
|
1284
|
+
* const stt = createSTT('assemblyai', { wordTimestamps: true })
|
|
1285
|
+
* ```
|
|
1286
|
+
*/
|
|
1287
|
+
declare function createSTT(provider: 'deepgram' | 'whisper' | 'assemblyai' | 'sarvam', config?: STTConfig): STTProvider;
|
|
1288
|
+
|
|
1289
|
+
/**
|
|
1290
|
+
* @voice-kit/core — OpenAI Whisper STT Provider (batch fallback)
|
|
1291
|
+
*
|
|
1292
|
+
* Uses @ai-sdk/openai for batch transcription. Does not support streaming.
|
|
1293
|
+
* Use as fallback for long-form audio or when Deepgram is unavailable.
|
|
1294
|
+
*/
|
|
1295
|
+
|
|
1296
|
+
/**
|
|
1297
|
+
* OpenAI Whisper STT provider. Batch-only — does not support streaming.
|
|
1298
|
+
* @internal — obtained via createSTT('whisper', config)
|
|
1299
|
+
*/
|
|
1300
|
+
declare class WhisperSTTProvider implements STTProvider {
|
|
1301
|
+
readonly name = "whisper";
|
|
1302
|
+
readonly supportsStreaming = false;
|
|
1303
|
+
readonly supportedLanguages: string[];
|
|
1304
|
+
private readonly config;
|
|
1305
|
+
constructor(config: STTConfig);
|
|
1306
|
+
/**
|
|
1307
|
+
* Streaming not supported by Whisper. Collects all audio then transcribes.
|
|
1308
|
+
* For realtime use, use createSTT('deepgram') instead.
|
|
1309
|
+
*/
|
|
1310
|
+
transcribeStream(audio: AsyncIterable<Buffer>): AsyncIterable<STTResult>;
|
|
1311
|
+
/**
|
|
1312
|
+
* Transcribe a complete audio buffer via Whisper.
|
|
1313
|
+
*
|
|
1314
|
+
* @param audio WAV or PCM buffer
|
|
1315
|
+
*/
|
|
1316
|
+
transcribeBatch(audio: Buffer): Promise<STTResult>;
|
|
1317
|
+
}
|
|
1318
|
+
|
|
1319
|
+
/**
|
|
1320
|
+
* @voice-kit/core — Cartesia TTS Provider
|
|
1321
|
+
*
|
|
1322
|
+
* Ultra-low-latency streaming TTS via @cartesia/cartesia-js.
|
|
1323
|
+
* Target first chunk: < 90ms. Best for latency-critical applications.
|
|
1324
|
+
*/
|
|
1325
|
+
|
|
1326
|
+
/**
|
|
1327
|
+
* Cartesia ultra-low-latency TTS provider.
|
|
1328
|
+
* @internal — obtained via createTTS('cartesia', config)
|
|
1329
|
+
*/
|
|
1330
|
+
declare class CartesiaTTSProvider implements TTSProvider {
|
|
1331
|
+
readonly name = "cartesia";
|
|
1332
|
+
readonly outputSampleRate = 22050;
|
|
1333
|
+
readonly outputFormat: "pcm";
|
|
1334
|
+
private readonly client;
|
|
1335
|
+
private readonly config;
|
|
1336
|
+
constructor(config: TTSConfig);
|
|
1337
|
+
/**
|
|
1338
|
+
* Stream audio from Cartesia. Typically delivers first chunk in < 90ms.
|
|
1339
|
+
*
|
|
1340
|
+
* @example
|
|
1341
|
+
* ```ts
|
|
1342
|
+
* const tts = createTTS('cartesia', { voiceId: 'your-voice-id' })
|
|
1343
|
+
* for await (const chunk of tts.synthesizeStream('Hello!')) {
|
|
1344
|
+
* sendToTelephony(chunk)
|
|
1345
|
+
* }
|
|
1346
|
+
* ```
|
|
1347
|
+
*/
|
|
1348
|
+
synthesizeStream(text: string, config?: TTSConfig): AsyncIterable<Buffer>;
|
|
1349
|
+
/** Synthesize complete audio. */
|
|
1350
|
+
synthesizeFull(text: string, config?: TTSConfig): Promise<Buffer>;
|
|
1351
|
+
}
|
|
1352
|
+
|
|
1353
|
+
/**
|
|
1354
|
+
* @voice-kit/core — ElevenLabs TTS Provider
|
|
1355
|
+
*
|
|
1356
|
+
* Streaming TTS using ElevenLabs SDK. Features:
|
|
1357
|
+
* - 100ms lookahead jitter buffer to smooth burst delivery
|
|
1358
|
+
* - Voice clone support
|
|
1359
|
+
* - Sub-300ms first chunk target
|
|
1360
|
+
*/
|
|
1361
|
+
|
|
1362
|
+
/**
|
|
1363
|
+
* ElevenLabs streaming TTS provider.
|
|
1364
|
+
* @internal — obtained via createTTS('elevenlabs', config)
|
|
1365
|
+
*/
|
|
1366
|
+
declare class ElevenLabsTTSProvider implements TTSProvider {
|
|
1367
|
+
readonly name = "elevenlabs";
|
|
1368
|
+
readonly outputSampleRate = 24000;
|
|
1369
|
+
readonly outputFormat: "pcm";
|
|
1370
|
+
private readonly client;
|
|
1371
|
+
private readonly config;
|
|
1372
|
+
constructor(config: TTSConfig);
|
|
1373
|
+
/**
|
|
1374
|
+
* Stream synthesized audio from ElevenLabs.
|
|
1375
|
+
* First chunk target: < 300ms. Uses streaming API endpoint.
|
|
1376
|
+
*
|
|
1377
|
+
* A 100ms jitter buffer smooths burst packet delivery without adding
|
|
1378
|
+
* perceptible latency.
|
|
1379
|
+
*
|
|
1380
|
+
* @param text Text to synthesize (should be a sentence boundary chunk)
|
|
1381
|
+
* @param config Per-call config overrides
|
|
1382
|
+
*
|
|
1383
|
+
* @example
|
|
1384
|
+
* ```ts
|
|
1385
|
+
* const tts = createTTS('elevenlabs', { voiceId: 'your-voice-id' })
|
|
1386
|
+
* for await (const chunk of tts.synthesizeStream('Hello, how can I help?')) {
|
|
1387
|
+
* telephony.sendAudio(chunk)
|
|
1388
|
+
* }
|
|
1389
|
+
* ```
|
|
1390
|
+
*/
|
|
1391
|
+
synthesizeStream(text: string, config?: TTSConfig): AsyncIterable<Buffer>;
|
|
1392
|
+
/**
|
|
1393
|
+
* Synthesize full audio (for pre-caching greetings, IVR prompts).
|
|
1394
|
+
* Collects all streaming chunks into a single buffer.
|
|
1395
|
+
*
|
|
1396
|
+
* @param text Text to synthesize
|
|
1397
|
+
* @param config Per-call config overrides
|
|
1398
|
+
*/
|
|
1399
|
+
synthesizeFull(text: string, config?: TTSConfig): Promise<Buffer>;
|
|
1400
|
+
}
|
|
1401
|
+
|
|
1402
|
+
/**
|
|
1403
|
+
* @voice-kit/core — Sarvam AI TTS Provider
|
|
1404
|
+
*
|
|
1405
|
+
* Sarvam AI TTS for Hindi/Hinglish and regional Indian languages.
|
|
1406
|
+
* Supports natural-sounding Indian voices with regional accents.
|
|
1407
|
+
*/
|
|
1408
|
+
|
|
1409
|
+
/**
|
|
1410
|
+
* Sarvam AI TTS provider for Indic languages.
|
|
1411
|
+
* @internal — obtained via createTTS('sarvam', config)
|
|
1412
|
+
*/
|
|
1413
|
+
declare class SarvamTTSProvider implements TTSProvider {
|
|
1414
|
+
readonly name = "sarvam";
|
|
1415
|
+
readonly outputSampleRate = 22050;
|
|
1416
|
+
readonly outputFormat: "mp3";
|
|
1417
|
+
private readonly http;
|
|
1418
|
+
private readonly config;
|
|
1419
|
+
constructor(config: TTSConfig);
|
|
1420
|
+
/**
|
|
1421
|
+
* Synthesize text in an Indic language and stream audio chunks.
|
|
1422
|
+
* Sarvam returns full audio segments — we chunk them for streaming compatibility.
|
|
1423
|
+
*
|
|
1424
|
+
* @example
|
|
1425
|
+
* ```ts
|
|
1426
|
+
* const tts = createTTS('sarvam', { targetLanguage: 'hi-IN' })
|
|
1427
|
+
* for await (const chunk of tts.synthesizeStream('नमस्ते, मैं आपकी कैसे मदद कर सकता हूँ?')) {
|
|
1428
|
+
* telephony.sendAudio(chunk)
|
|
1429
|
+
* }
|
|
1430
|
+
* ```
|
|
1431
|
+
*/
|
|
1432
|
+
synthesizeStream(text: string, config?: TTSConfig): AsyncIterable<Buffer>;
|
|
1433
|
+
/** Synthesize complete audio buffer. */
|
|
1434
|
+
synthesizeFull(text: string, config?: TTSConfig): Promise<Buffer>;
|
|
1435
|
+
}
|
|
1436
|
+
|
|
1437
|
+
/**
|
|
1438
|
+
* @voice-kit/core — TTS factory
|
|
1439
|
+
*
|
|
1440
|
+
* createTTS() is the ONLY public API for text-to-speech.
|
|
1441
|
+
* Never instantiate provider classes directly.
|
|
1442
|
+
*/
|
|
1443
|
+
|
|
1444
|
+
/**
|
|
1445
|
+
* Create a TTS provider instance.
|
|
1446
|
+
*
|
|
1447
|
+
* Provider selection guide:
|
|
1448
|
+
* - 'elevenlabs' → Best voice quality, cloning support, en-IN
|
|
1449
|
+
* - 'cartesia' → Lowest latency (< 90ms TTFB), good for fast-paced agents
|
|
1450
|
+
* - 'sarvam' → Best for Indic languages (hi-IN, ta-IN, kn-IN, te-IN, mr-IN)
|
|
1451
|
+
*
|
|
1452
|
+
* @example
|
|
1453
|
+
* ```ts
|
|
1454
|
+
* // English with voice cloning
|
|
1455
|
+
* const tts = createTTS('elevenlabs', { voiceId: 'your-cloned-voice-id' })
|
|
1456
|
+
*
|
|
1457
|
+
* // Ultra-low latency English
|
|
1458
|
+
* const tts = createTTS('cartesia', { voiceId: 'your-voice-id' })
|
|
1459
|
+
*
|
|
1460
|
+
* // Hindi
|
|
1461
|
+
* const tts = createTTS('sarvam', { targetLanguage: 'hi-IN', voiceId: 'meera' })
|
|
1462
|
+
* ```
|
|
1463
|
+
*/
|
|
1464
|
+
declare function createTTS(provider: 'elevenlabs' | 'cartesia' | 'sarvam', config?: TTSConfig): TTSProvider;
|
|
1465
|
+
|
|
1466
|
+
export { AgentError, AgentHandoffError, AssemblyAISTTProvider, AudioPipeline, AudioTransportError, type AuditEntry, type AuditEventType, CallAuditLog, CallConnectionError, type CallMemory, type CallMemoryConfig, CallMetrics, type CallMetricsSummary, CallNotFoundError, type CallPurpose, CallingHoursError, CartesiaTTSProvider, ComplianceError, ConsentMissingError, type ConsentRecord, DNCBlockedError, type DNCCheckParams, type DNCCheckResult, DeepgramSTTProvider, ElevenLabsTTSProvider, type ErrorSeverity, InngestError, type LanguageCode, LanguageSwitchDetector, type LanguageSwitchEvent, type STTConfig, STTConnectionError, STTError, STTLanguageNotSupportedError, type STTProvider, type STTResult, STTStreamError, SarvamSTTProvider, SarvamTTSProvider, TRAICompliance, type TRAIConfig, type TTSConfig, TTSConnectionError, TTSError, type TTSProvider, TTSStreamError, TTSVoiceNotFoundError, TelephonyError, type TelephonyProviderName, TurnTransitionError, type VADConfig, VADEngine, type VoiceFrame, VoiceKitError, type VoiceKitErrorContext, VoiceSDKTracer, WhisperSTTProvider, type WordTimestamp, base64MulawToPcm, createAudioPipeline, createCallMemory, createResamplerStream, createSTT, createTTS, createVAD, isInglish, linearToMulaw, mulawBufferToPcm, mulawToLinear, pcmBufferToMulaw, pcmToBase64Mulaw, resample, resampleStream };
|