@voice-kit/core 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. package/dist/index.cjs +2137 -0
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.cts +1466 -4
  4. package/dist/index.d.ts +1466 -4
  5. package/dist/index.js +2102 -1
  6. package/dist/index.js.map +1 -1
  7. package/package.json +1 -31
  8. package/dist/audio.cjs +0 -533
  9. package/dist/audio.cjs.map +0 -1
  10. package/dist/audio.d.cts +0 -260
  11. package/dist/audio.d.ts +0 -260
  12. package/dist/audio.js +0 -514
  13. package/dist/audio.js.map +0 -1
  14. package/dist/compliance.cjs +0 -343
  15. package/dist/compliance.cjs.map +0 -1
  16. package/dist/compliance.d.cts +0 -163
  17. package/dist/compliance.d.ts +0 -163
  18. package/dist/compliance.js +0 -335
  19. package/dist/compliance.js.map +0 -1
  20. package/dist/errors.cjs +0 -284
  21. package/dist/errors.cjs.map +0 -1
  22. package/dist/errors.d.cts +0 -100
  23. package/dist/errors.d.ts +0 -100
  24. package/dist/errors.js +0 -262
  25. package/dist/errors.js.map +0 -1
  26. package/dist/index-D3KfRXMP.d.cts +0 -319
  27. package/dist/index-D3KfRXMP.d.ts +0 -319
  28. package/dist/memory.cjs +0 -121
  29. package/dist/memory.cjs.map +0 -1
  30. package/dist/memory.d.cts +0 -29
  31. package/dist/memory.d.ts +0 -29
  32. package/dist/memory.js +0 -115
  33. package/dist/memory.js.map +0 -1
  34. package/dist/observability.cjs +0 -229
  35. package/dist/observability.cjs.map +0 -1
  36. package/dist/observability.d.cts +0 -122
  37. package/dist/observability.d.ts +0 -122
  38. package/dist/observability.js +0 -222
  39. package/dist/observability.js.map +0 -1
  40. package/dist/stt.cjs +0 -828
  41. package/dist/stt.cjs.map +0 -1
  42. package/dist/stt.d.cts +0 -308
  43. package/dist/stt.d.ts +0 -308
  44. package/dist/stt.js +0 -815
  45. package/dist/stt.js.map +0 -1
  46. package/dist/telephony.errors-BQYr6-vl.d.cts +0 -80
  47. package/dist/telephony.errors-C0-nScrF.d.ts +0 -80
  48. package/dist/tts.cjs +0 -429
  49. package/dist/tts.cjs.map +0 -1
  50. package/dist/tts.d.cts +0 -151
  51. package/dist/tts.d.ts +0 -151
  52. package/dist/tts.js +0 -418
  53. package/dist/tts.js.map +0 -1
package/dist/index.d.ts CHANGED
@@ -1,4 +1,1466 @@
1
- export { a as CallMemory, C as CallMemoryConfig, d as CallMetricsSummary, j as CallPurpose, c as ConsentRecord, D as DNCCheckParams, b as DNCCheckResult, E as ErrorSeverity, h as STTConfig, S as STTProvider, i as STTResult, T as TRAIConfig, f as TTSConfig, e as TTSProvider, V as VADConfig, g as VoiceFrame, k as VoiceKitErrorContext, W as WordTimestamp, l as createCallMemory, m as createSTT, n as createTTS } from './index-D3KfRXMP.js';
2
- export { AgentError, AgentHandoffError, CallingHoursError, ComplianceError, ConsentMissingError, DNCBlockedError, InngestError, STTConnectionError, STTError, STTLanguageNotSupportedError, STTStreamError, TTSConnectionError, TTSError, TTSStreamError, TTSVoiceNotFoundError, TurnTransitionError } from './errors.js';
3
- export { A as AudioTransportError, C as CallConnectionError, a as CallNotFoundError, T as TelephonyError, V as VoiceKitError } from './telephony.errors-C0-nScrF.js';
4
- import 'ai';
1
+ import * as ai from 'ai';
2
+ import { PassThrough } from 'node:stream';
3
+ import { EventEmitter } from 'node:events';
4
+
5
+ /**
6
+ * @voice-kit/core — Type definitions
7
+ */
8
+ /**
9
+ * A single word with timing information from an STT provider.
10
+ */
11
+ interface WordTimestamp {
12
+ word: string;
13
+ startMs: number;
14
+ endMs: number;
15
+ confidence: number;
16
+ }
17
+ /**
18
+ * The result of a speech-to-text transcription, either streaming partial
19
+ * or final. `isFinal` distinguishes the two.
20
+ *
21
+ * @example
22
+ * ```ts
23
+ * for await (const result of stt.transcribeStream(audioIterable)) {
24
+ * if (result.isFinal) console.log('Final:', result.transcript)
25
+ * }
26
+ * ```
27
+ */
28
+ interface STTResult {
29
+ /** The transcribed text. May be a partial result if `isFinal` is false. */
30
+ transcript: string;
31
+ /** Whether this is the final result for this utterance. */
32
+ isFinal: boolean;
33
+ /** Confidence score from the provider, 0–1. */
34
+ confidence: number;
35
+ /** BCP-47 language tag, e.g. 'hi-IN', 'en-IN'. */
36
+ language: string;
37
+ /** True if a mid-sentence language switch was detected (e.g. Hinglish). */
38
+ languageSwitchDetected: boolean;
39
+ /** Word-level timestamps if supported by the provider. */
40
+ words?: WordTimestamp[];
41
+ /** Time from audio start to this result being emitted, in ms. */
42
+ latencyMs: number;
43
+ }
44
+ /**
45
+ * Configuration for STT provider instantiation.
46
+ */
47
+ interface STTConfig {
48
+ /** BCP-47 language code. Defaults to 'en-IN'. */
49
+ language?: string;
50
+ /** Additional languages to detect for code-switching. */
51
+ alternateLanguages?: string[];
52
+ /** API key. Falls back to provider-specific env var if omitted. */
53
+ apiKey?: string;
54
+ /** Custom model name. Provider-specific. */
55
+ model?: string;
56
+ /** Enable word-level timestamps. Default false. */
57
+ wordTimestamps?: boolean;
58
+ /** Enable interim / partial results. Default true. */
59
+ interimResults?: boolean;
60
+ /** Deepgram-specific: smart formatting. Default true. */
61
+ smartFormat?: boolean;
62
+ /** Sarvam-specific: region hint. */
63
+ region?: string;
64
+ }
65
+ /**
66
+ * The STTProvider interface. Obtained via `createSTT()` — never instantiate
67
+ * provider classes directly.
68
+ *
69
+ * @example
70
+ * ```ts
71
+ * const stt = createSTT('deepgram', { language: 'en-IN' })
72
+ * for await (const result of stt.transcribeStream(audioStream)) {
73
+ * console.log(result.transcript)
74
+ * }
75
+ * ```
76
+ */
77
+ interface STTProvider {
78
+ /** Stream audio in, stream STTResults out. Primary realtime path. */
79
+ transcribeStream(audio: AsyncIterable<Buffer>): AsyncIterable<STTResult>;
80
+ /** Batch transcription for recordings. Returns single final result. */
81
+ transcribeBatch(audio: Buffer): Promise<STTResult>;
82
+ /** Whether this provider supports streaming (all except Whisper). */
83
+ readonly supportsStreaming: boolean;
84
+ /** BCP-47 codes this provider can handle. */
85
+ readonly supportedLanguages: string[];
86
+ /** Human-readable provider name for logging. */
87
+ readonly name: string;
88
+ }
89
+ /**
90
+ * Configuration for TTS provider instantiation.
91
+ */
92
+ interface TTSConfig {
93
+ /** Voice identifier. Provider-specific. */
94
+ voiceId?: string;
95
+ /** Output sample rate. Defaults to provider native rate. */
96
+ sampleRate?: number;
97
+ /** Speaking speed multiplier. Default 1.0. */
98
+ speed?: number;
99
+ /** Pitch adjustment. Provider-specific. */
100
+ pitch?: number;
101
+ /** API key. Falls back to provider-specific env var if omitted. */
102
+ apiKey?: string;
103
+ /** ElevenLabs-specific: model ID. */
104
+ modelId?: string;
105
+ /** Cartesia-specific: emotion control. */
106
+ emotion?: string;
107
+ /** Sarvam-specific: target language for Indic voices. */
108
+ targetLanguage?: string;
109
+ }
110
+ /**
111
+ * The TTSProvider interface. Obtained via `createTTS()` — never instantiate
112
+ * provider classes directly.
113
+ *
114
+ * @example
115
+ * ```ts
116
+ * const tts = createTTS('elevenlabs', { voiceId: 'your-voice-id' })
117
+ * for await (const chunk of tts.synthesizeStream('Hello, how can I help?')) {
118
+ * socket.write(chunk)
119
+ * }
120
+ * ```
121
+ */
122
+ interface TTSProvider {
123
+ /** Stream synthesis — preferred for realtime. First chunk < 300ms. */
124
+ synthesizeStream(text: string, config?: TTSConfig): AsyncIterable<Buffer>;
125
+ /** Synthesize full audio — for pre-recorded prompts or caching. */
126
+ synthesizeFull(text: string, config?: TTSConfig): Promise<Buffer>;
127
+ /** Native output sample rate of this provider in Hz. */
128
+ readonly outputSampleRate: number;
129
+ /** Native output format before any resampling. */
130
+ readonly outputFormat: 'pcm' | 'mulaw' | 'opus' | 'mp3';
131
+ /** Human-readable provider name for logging. */
132
+ readonly name: string;
133
+ }
134
+ /**
135
+ * A frame of audio classified by the VAD engine.
136
+ * Developers subscribe to these events — never to raw VAD API.
137
+ */
138
+ interface VoiceFrame {
139
+ /** Event type. */
140
+ type: 'speech_start' | 'speech_end' | 'speech';
141
+ /** VAD confidence 0–1. */
142
+ confidence: number;
143
+ /** Raw PCM audio bytes for this frame. */
144
+ audioBuffer: Buffer;
145
+ /** Duration of audio in this frame, in ms. */
146
+ durationMs: number;
147
+ }
148
+ /**
149
+ * Configuration for the VAD engine.
150
+ */
151
+ interface VADConfig {
152
+ /** Activation threshold 0–1. Default 0.6. */
153
+ threshold?: number;
154
+ /** Consecutive positive frames before speech_start. Default 3. */
155
+ positiveSpeechFrames?: number;
156
+ /** Consecutive negative frames before speech_end. Default 5. */
157
+ negativeSpeechFrames?: number;
158
+ /** Debounce window in ms to prevent rapid flip-flop. Default 150. */
159
+ debounceMs?: number;
160
+ /** Input sample rate. Auto-set by AudioPipeline — do not override. */
161
+ sampleRate?: number;
162
+ }
163
+ /**
164
+ * Configuration for call memory (LRU-backed sliding window of turns).
165
+ */
166
+ interface CallMemoryConfig {
167
+ /** Maximum number of turns to retain. Default 20. */
168
+ maxTurns?: number;
169
+ /** Maximum bytes of conversation history to retain. Default 512KB. */
170
+ maxBytes?: number;
171
+ /** TTL for the entire call memory entry in ms. Default 30 minutes. */
172
+ ttlMs?: number;
173
+ }
174
+ /**
175
+ * In-process LRU-backed call memory. Obtained via `createCallMemory()`.
176
+ *
177
+ * @example
178
+ * ```ts
179
+ * const memory = createCallMemory({ maxTurns: 20 })
180
+ * memory.addTurn(callId, { role: 'user', content: 'Hello' })
181
+ * const history = memory.getTurns(callId)
182
+ * ```
183
+ */
184
+ interface CallMemory {
185
+ addTurn(callId: string, message: ai.ModelMessage): void;
186
+ getTurns(callId: string): ai.ModelMessage[];
187
+ clearCall(callId: string): void;
188
+ getTokenEstimate(callId: string): number;
189
+ /** Truncate oldest turns to stay within budget. */
190
+ trimToTokenBudget(callId: string, maxTokens: number): void;
191
+ }
192
+ /**
193
+ * Type of call for TRAI DND classification.
194
+ */
195
+ type CallPurpose = 'TRANSACTIONAL' | 'PROMOTIONAL' | 'SERVICE' | 'EMERGENCY';
196
+ /**
197
+ * TRAI DNC check parameters.
198
+ */
199
+ interface DNCCheckParams {
200
+ /** E.164 format phone number, validated via libphonenumber-js. */
201
+ to: string;
202
+ /** Purpose category for TRAI classification. */
203
+ purpose: CallPurpose;
204
+ /** Scheduled call time. Defaults to now. */
205
+ scheduledAt?: Date;
206
+ }
207
+ /**
208
+ * Result of a TRAI DNC check.
209
+ */
210
+ interface DNCCheckResult {
211
+ /** Whether the call is permitted. */
212
+ allowed: boolean;
213
+ /** Human-readable reason if not allowed. */
214
+ reason?: string;
215
+ /** When this result was fetched (from LRU cache). */
216
+ cachedAt?: Date;
217
+ /** Whether result came from local LRU cache. */
218
+ fromCache: boolean;
219
+ }
220
+ /**
221
+ * Consent record stored for TRAI compliance.
222
+ */
223
+ interface ConsentRecord {
224
+ phoneNumber: string;
225
+ consentedAt: Date;
226
+ /** Channel through which consent was obtained. */
227
+ channel: 'voice' | 'sms' | 'web' | 'ivr';
228
+ /** Call purpose consent was given for. */
229
+ purpose: CallPurpose;
230
+ /** Optional reference ID (e.g. recording URL). */
231
+ referenceId?: string;
232
+ }
233
+ /**
234
+ * TRAI compliance configuration.
235
+ */
236
+ interface TRAIConfig {
237
+ /** Disable TRAI checks entirely. Default false. */
238
+ disabled?: boolean;
239
+ /** Calling timezone override. Default 'Asia/Kolkata'. */
240
+ timezone?: string;
241
+ /** Override calling hours start (24h). Default 9. */
242
+ callingHoursStart?: number;
243
+ /** Override calling hours end (24h). Default 21. */
244
+ callingHoursEnd?: number;
245
+ /** Custom DNC API endpoint. Default: mock endpoint (must be replaced in production). */
246
+ dncApiEndpoint?: string;
247
+ }
248
+ /**
249
+ * Aggregated metrics for a completed or in-progress call.
250
+ */
251
+ interface CallMetricsSummary {
252
+ callId: string;
253
+ sttFirstByteMs: number[];
254
+ ttsFirstByteMs: number[];
255
+ llmFirstTokenMs: number[];
256
+ turnLatencyMs: number[];
257
+ interruptionCount: number;
258
+ interruptionPositions: number[];
259
+ tokenCost: {
260
+ model: string;
261
+ inputTokens: number;
262
+ outputTokens: number;
263
+ estimatedUsdCost: number;
264
+ }[];
265
+ avgTurnLatencyMs: number;
266
+ p95TurnLatencyMs: number;
267
+ }
268
+ /**
269
+ * Error severity level.
270
+ */
271
+ type ErrorSeverity = 'low' | 'medium' | 'high' | 'critical';
272
+ /**
273
+ * Base error context shared by all VoiceKit errors.
274
+ */
275
+ interface VoiceKitErrorContext {
276
+ /** Error code for programmatic handling. */
277
+ code: string;
278
+ /** Associated call ID if applicable. */
279
+ callId?: string;
280
+ /** The provider that threw (e.g. 'deepgram', 'elevenlabs'). */
281
+ provider?: string;
282
+ /** Whether this error is safe to retry. */
283
+ retryable: boolean;
284
+ /** Severity for alerting/logging. */
285
+ severity: ErrorSeverity;
286
+ /** Original upstream error if wrapping. */
287
+ cause?: unknown;
288
+ }
289
+
290
+ /**
291
+ * @voice-kit/core — Typed error hierarchy
292
+ *
293
+ * All VoiceKit errors extend VoiceKitError. Never throw raw Error.
294
+ * Every error carries: code, message, provider, callId, retryable, severity.
295
+ */
296
+
297
+ /**
298
+ * Base class for all VoiceKit errors. Provides structured context for
299
+ * logging, alerting, and programmatic error handling.
300
+ *
301
+ * @example
302
+ * ```ts
303
+ * try {
304
+ * await stt.transcribeBatch(audio)
305
+ * } catch (err) {
306
+ * if (err instanceof STTError) {
307
+ * console.error(err.code, err.provider, err.retryable)
308
+ * }
309
+ * }
310
+ * ```
311
+ */
312
+ declare class VoiceKitError extends Error {
313
+ readonly code: string;
314
+ readonly callId?: string;
315
+ readonly provider?: string;
316
+ readonly retryable: boolean;
317
+ readonly severity: ErrorSeverity;
318
+ readonly cause?: unknown;
319
+ constructor(params: {
320
+ code: string;
321
+ message: string;
322
+ callId?: string;
323
+ provider?: string;
324
+ retryable?: boolean;
325
+ severity?: ErrorSeverity;
326
+ cause?: unknown;
327
+ });
328
+ toJSON(): {
329
+ name: string;
330
+ code: string;
331
+ message: string;
332
+ callId: string | undefined;
333
+ provider: string | undefined;
334
+ retryable: boolean;
335
+ severity: ErrorSeverity;
336
+ };
337
+ }
338
+
339
+ /**
340
+ * Errors from agent orchestration (turn engine, handoff, injection).
341
+ */
342
+ declare class AgentError extends VoiceKitError {
343
+ }
344
+ declare class TurnTransitionError extends AgentError {
345
+ readonly fromState: string;
346
+ readonly event: string;
347
+ constructor(fromState: string, toEvent: string, callId?: string);
348
+ }
349
+ declare class AgentHandoffError extends AgentError {
350
+ constructor(capability: string, cause?: unknown, callId?: string);
351
+ }
352
+
353
+ /**
354
+ * Errors from compliance checks (TRAI DNC, calling hours, consent).
355
+ */
356
+ declare class ComplianceError extends VoiceKitError {
357
+ readonly phoneNumber?: string;
358
+ constructor(params: {
359
+ code: string;
360
+ message: string;
361
+ callId?: string;
362
+ phoneNumber?: string;
363
+ retryable?: boolean;
364
+ severity?: ErrorSeverity;
365
+ cause?: unknown;
366
+ });
367
+ }
368
+ declare class DNCBlockedError extends ComplianceError {
369
+ constructor(phoneNumber: string, callId?: string);
370
+ }
371
+ declare class CallingHoursError extends ComplianceError {
372
+ constructor(phoneNumber: string, currentTime: string, callId?: string);
373
+ }
374
+ declare class ConsentMissingError extends ComplianceError {
375
+ constructor(phoneNumber: string, callId?: string);
376
+ }
377
+
378
+ /**
379
+ * Errors from Inngest background task dispatch.
380
+ */
381
+ declare class InngestError extends VoiceKitError {
382
+ readonly taskName?: string;
383
+ constructor(params: {
384
+ code: string;
385
+ message: string;
386
+ callId?: string;
387
+ taskName?: string;
388
+ cause?: unknown;
389
+ });
390
+ }
391
+
392
+ /**
393
+ * Errors from speech-to-text providers.
394
+ */
395
+ declare class STTError extends VoiceKitError {
396
+ readonly languageCode?: string;
397
+ constructor(params: {
398
+ code: string;
399
+ message: string;
400
+ callId?: string;
401
+ provider?: string;
402
+ retryable?: boolean;
403
+ severity?: ErrorSeverity;
404
+ cause?: unknown;
405
+ languageCode?: string;
406
+ });
407
+ }
408
+ declare class STTConnectionError extends STTError {
409
+ constructor(provider: string, cause?: unknown, callId?: string);
410
+ }
411
+ declare class STTStreamError extends STTError {
412
+ constructor(provider: string, cause?: unknown, callId?: string);
413
+ }
414
+ declare class STTLanguageNotSupportedError extends STTError {
415
+ constructor(provider: string, language: string);
416
+ }
417
+
418
+ /**
419
+ * Errors from telephony providers.
420
+ */
421
+ declare class TelephonyError extends VoiceKitError {
422
+ readonly to?: string;
423
+ readonly from?: string;
424
+ constructor(params: {
425
+ code: string;
426
+ message: string;
427
+ callId?: string;
428
+ provider?: string;
429
+ retryable?: boolean;
430
+ severity?: ErrorSeverity;
431
+ cause?: unknown;
432
+ to?: string;
433
+ from?: string;
434
+ });
435
+ }
436
+ declare class CallConnectionError extends TelephonyError {
437
+ constructor(provider: string, to: string, cause?: unknown);
438
+ }
439
+ declare class CallNotFoundError extends TelephonyError {
440
+ constructor(callId: string, provider: string);
441
+ }
442
+ declare class AudioTransportError extends TelephonyError {
443
+ constructor(provider: string, cause?: unknown, callId?: string);
444
+ }
445
+
446
+ /**
447
+ * Errors from text-to-speech providers.
448
+ */
449
+ declare class TTSError extends VoiceKitError {
450
+ }
451
+ declare class TTSConnectionError extends TTSError {
452
+ constructor(provider: string, cause?: unknown, callId?: string);
453
+ }
454
+ declare class TTSStreamError extends TTSError {
455
+ constructor(provider: string, cause?: unknown, callId?: string);
456
+ }
457
+ declare class TTSVoiceNotFoundError extends TTSError {
458
+ constructor(provider: string, voiceId: string);
459
+ }
460
+
461
+ /**
462
+ * @voice-kit/core — G.711 µ-law codec
463
+ *
464
+ * Pure TypeScript implementation of G.711 µ-law (mu-law) encode/decode.
465
+ * No external codec library needed for µ-law. This is 100% internal —
466
+ * never exported from the public API.
467
+ *
468
+ * Used by AudioPipeline to convert Twilio/Exotel µ-law audio ↔ PCM.
469
+ */
470
+ /**
471
+ * Convert a single µ-law encoded byte (0–255) to a 16-bit linear PCM sample.
472
+ * Algorithm: ITU-T G.711 Section 3.
473
+ *
474
+ * @internal
475
+ */
476
+ declare function mulawToLinear(sample: number): number;
477
+ /**
478
+ * Convert a 16-bit linear PCM sample to a µ-law encoded byte.
479
+ * Algorithm: ITU-T G.711 Section 3.
480
+ *
481
+ * @internal
482
+ */
483
+ declare function linearToMulaw(sample: number): number;
484
+ /**
485
+ * Convert a Buffer of µ-law encoded bytes to 16-bit little-endian PCM.
486
+ * Each µ-law byte expands to 2 PCM bytes (16-bit LE signed).
487
+ *
488
+ * Input: N bytes (µ-law, 8kHz mono as sent by Twilio/Exotel)
489
+ * Output: N*2 bytes (PCM 16-bit LE, same sample rate)
490
+ *
491
+ * @internal
492
+ */
493
+ declare function mulawBufferToPcm(buf: Buffer): Buffer;
494
+ /**
495
+ * Convert a Buffer of 16-bit little-endian PCM to µ-law bytes.
496
+ * Each pair of PCM bytes compresses to 1 µ-law byte.
497
+ *
498
+ * Input: N bytes (PCM 16-bit LE)
499
+ * Output: N/2 bytes (µ-law)
500
+ *
501
+ * @internal
502
+ */
503
+ declare function pcmBufferToMulaw(buf: Buffer): Buffer;
504
+ /**
505
+ * Convert a base64-encoded µ-law string (as sent by Twilio Media Streams)
506
+ * directly to PCM Buffer. Convenience wrapper used in TwilioProvider.
507
+ *
508
+ * @internal
509
+ */
510
+ declare function base64MulawToPcm(base64: string): Buffer;
511
+ /**
512
+ * Convert a PCM Buffer to a base64-encoded µ-law string (for sending
513
+ * back to Twilio Media Streams).
514
+ *
515
+ * @internal
516
+ */
517
+ declare function pcmToBase64Mulaw(pcm: Buffer): string;
518
+
519
+ /**
520
+ * @voice-kit/core — AudioPipeline
521
+ *
522
+ * Automatically selects codec, sample rate, and VAD config based on the
523
+ * telephony provider. Developers never configure codecs — the pipeline
524
+ * handles all conversions transparently.
525
+ *
526
+ * Provider audio formats:
527
+ * Twilio / Exotel → 8kHz µ-law → decode → 8kHz PCM → upsample → 16kHz PCM (for STT)
528
+ * Plivo / Telnyx → 8kHz µ-law (same as Twilio)
529
+ * LiveKit → 48kHz Opus → decode → 48kHz PCM → downsample → 16kHz PCM (for STT)
530
+ * SIP (generic) → 8kHz G.711 (same as Twilio)
531
+ *
532
+ * TTS output path (reverse):
533
+ * STT/LLM → TTS PCM (provider-native rate) → resample → telephony-native rate → encode
534
+ */
535
+
536
+ /** Telephony providers handled by the pipeline. */
537
+ type TelephonyProviderName = 'twilio' | 'exotel' | 'plivo' | 'telnyx' | 'livekit' | 'sip';
538
+ /**
539
+ * AudioPipeline: auto-wires codec → resample → VAD for a specific telephony provider.
540
+ *
541
+ * Developers never call this directly — it is instantiated by TelephonyProvider
542
+ * implementations and consumed by VoiceAgent.
543
+ *
544
+ * @internal
545
+ */
546
+ declare class AudioPipeline {
547
+ private readonly profile;
548
+ readonly provider: TelephonyProviderName;
549
+ constructor(provider: TelephonyProviderName);
550
+ /**
551
+ * Transform incoming telephony audio to 16kHz PCM for STT.
552
+ * Handles µ-law decode + resampling automatically.
553
+ *
554
+ * @param raw Raw audio bytes as received from telephony provider
555
+ * @returns Async iterable of 16kHz PCM buffers for STT
556
+ *
557
+ * @internal
558
+ */
559
+ inboundForSTT(raw: AsyncIterable<Buffer>): AsyncIterable<Buffer>;
560
+ /**
561
+ * Transform TTS output PCM to telephony-native format for sending to caller.
562
+ * Handles resampling + µ-law encode automatically.
563
+ *
564
+ * @param ttsAudio Raw PCM from TTS provider (at TTS provider's native rate)
565
+ * @param ttsSampleRate Native sample rate of the TTS provider
566
+ * @returns Async iterable of audio bytes ready to send to telephony provider
567
+ *
568
+ * @internal
569
+ */
570
+ outboundFromTTS(ttsAudio: AsyncIterable<Buffer>, ttsSampleRate: number): AsyncIterable<Buffer>;
571
+ /** Get the VAD config tuned for this provider's audio quality. @internal */
572
+ get vadConfig(): Required<VADConfig>;
573
+ /** Sample rate that STT expects (post-pipeline). @internal */
574
+ get sttSampleRate(): number;
575
+ /** Async generator: decode µ-law stream to PCM. @internal */
576
+ private decodeMulaw;
577
+ }
578
+ /**
579
+ * Factory: create an AudioPipeline pre-configured for the given telephony provider.
580
+ *
581
+ * @internal — used by TelephonyProvider implementations
582
+ */
583
+ declare function createAudioPipeline(provider: TelephonyProviderName): AudioPipeline;
584
+
585
+ /**
586
+ * @voice-kit/core — PCM audio resampler
587
+ *
588
+ * Resamples raw PCM audio between sample rates using fluent-ffmpeg.
589
+ * 100% internal — never exported from the public API.
590
+ * Used by AudioPipeline to convert provider-native rates to STT-required rates.
591
+ */
592
+
593
+ /**
594
+ * Resample a PCM Buffer from one sample rate to another.
595
+ * Both input and output are signed 16-bit little-endian PCM, mono.
596
+ *
597
+ * Common conversions:
598
+ * 8kHz → 16kHz (Twilio/Exotel µ-law decoded → Deepgram input)
599
+ * 48kHz → 16kHz (LiveKit Opus decoded → Deepgram input)
600
+ * 24kHz → 8kHz (ElevenLabs output → Twilio send)
601
+ *
602
+ * @param buf Raw PCM bytes (s16le mono)
603
+ * @param fromHz Source sample rate in Hz
604
+ * @param toHz Target sample rate in Hz
605
+ * @returns Resampled PCM bytes (s16le mono)
606
+ *
607
+ * @internal
608
+ */
609
+ declare function resample(buf: Buffer, fromHz: number, toHz: number): Promise<Buffer>;
610
+ /**
611
+ * Create a streaming resampler Transform stream.
612
+ * More efficient than buffering for large audio chunks.
613
+ *
614
+ * @param fromHz Source sample rate in Hz
615
+ * @param toHz Target sample rate in Hz
616
+ * @returns Node.js Transform stream: PCM in, resampled PCM out
617
+ *
618
+ * @internal
619
+ */
620
+ declare function createResamplerStream(fromHz: number, toHz: number): PassThrough;
621
+ /**
622
+ * Async generator that resamples chunks from an audio iterable on the fly.
623
+ * Used by AudioPipeline for realtime streaming paths.
624
+ *
625
+ * @param audio Async iterable of raw PCM buffers at fromHz
626
+ * @param fromHz Source sample rate
627
+ * @param toHz Target sample rate
628
+ *
629
+ * @internal
630
+ */
631
+ declare function resampleStream(audio: AsyncIterable<Buffer>, fromHz: number, toHz: number): AsyncIterable<Buffer>;
632
+
633
+ /**
634
+ * @voice-kit/core — Voice Activity Detection engine
635
+ *
636
+ * Wraps @ricky0123/vad-web and emits strongly-typed VoiceFrame events.
637
+ * Developers subscribe to VoiceFrame events — they never touch the raw VAD API.
638
+ *
639
+ * @example
640
+ * ```ts
641
+ * const vad = createVAD({ threshold: 0.6 })
642
+ * vad.on('frame', (frame) => {
643
+ * if (frame.type === 'speech_start') startRecording()
644
+ * if (frame.type === 'speech_end') stopRecording()
645
+ * })
646
+ * await vad.processStream(audioStream)
647
+ * ```
648
+ */
649
+
650
+ type VADEventMap = {
651
+ frame: [VoiceFrame];
652
+ error: [AudioTransportError];
653
+ };
654
+ /**
655
+ * Internal VAD engine. Processes a 16kHz PCM stream and emits VoiceFrame events.
656
+ * Automatically debounces rapid speech_start/speech_end transitions.
657
+ *
658
+ * Input: 16kHz, 16-bit little-endian PCM, mono.
659
+ * Output: VoiceFrame events on the emitter.
660
+ */
661
+ declare class VADEngine extends EventEmitter<VADEventMap> {
662
+ private readonly config;
663
+ private isSpeaking;
664
+ private positiveFrameCount;
665
+ private negativeFrameCount;
666
+ private debounceTimer;
667
+ private frameBuffer;
668
+ private vadModel;
669
+ constructor(config?: VADConfig);
670
+ /**
671
+ * Process an async stream of PCM audio frames.
672
+ * Automatically frames the input into 30ms chunks for VAD processing.
673
+ *
674
+ * @param audio Async iterable of PCM buffers (16kHz, s16le, mono)
675
+ */
676
+ processStream(audio: AsyncIterable<Buffer>): Promise<void>;
677
+ /**
678
+ * Process a single 30ms PCM frame through the VAD model.
679
+ *
680
+ * @internal
681
+ */
682
+ private processFrame;
683
+ /**
684
+ * Run Silero VAD model inference on a single frame.
685
+ * Returns confidence score 0–1.
686
+ *
687
+ * @internal
688
+ */
689
+ private runVADInference;
690
+ private emitFrame;
691
+ private scheduleDebounce;
692
+ private clearDebounce;
693
+ /**
694
+ * Load the Silero VAD model if not already loaded.
695
+ * @internal
696
+ */
697
+ private ensureModelLoaded;
698
+ /** Clean up resources. Call when the call ends. */
699
+ destroy(): void;
700
+ }
701
+ /**
702
+ * Create a configured VAD engine instance.
703
+ * Input must be 16kHz, 16-bit LE, mono PCM (handled automatically by AudioPipeline).
704
+ *
705
+ * @example
706
+ * ```ts
707
+ * const vad = createVAD({ threshold: 0.7, debounceMs: 200 })
708
+ * vad.on('frame', (frame) => handleFrame(frame))
709
+ * await vad.processStream(audioStream)
710
+ * ```
711
+ */
712
+ declare function createVAD(config?: VADConfig): VADEngine;
713
+
714
+ /**
715
+ * @voice-kit/core — Call audit log
716
+ *
717
+ * Immutable append-only audit log for compliance and debugging.
718
+ * In-memory (LRU) + optional file sink. Once written, entries cannot be modified.
719
+ */
720
+ type AuditEventType = 'call.started' | 'call.ended' | 'compliance.checked' | 'compliance.blocked' | 'consent.recorded' | 'consent.verified' | 'turn.started' | 'turn.ended' | 'interruption' | 'agent.handoff' | 'tool.called' | 'error';
721
+ interface AuditEntry {
722
+ readonly id: string;
723
+ readonly callId: string;
724
+ readonly type: AuditEventType;
725
+ readonly timestamp: Date;
726
+ readonly data: Readonly<Record<string, unknown>>;
727
+ }
728
+ /**
729
+ * Immutable append-only call audit log.
730
+ *
731
+ * Entries are written to LRU in-process memory and optionally to a JSONL file.
732
+ * Once written, entries are frozen — no modification is possible.
733
+ *
734
+ * @example
735
+ * ```ts
736
+ * const audit = new CallAuditLog({ filePath: '/var/log/voice-kit/audit.jsonl' })
737
+ * audit.append(callId, 'call.started', { from: '+91...', to: '+91...' })
738
+ * const entries = audit.getEntries(callId)
739
+ * ```
740
+ */
741
+ declare class CallAuditLog {
742
+ /** LRU: up to 10,000 calls × 200 entries each = 2M entries max */
743
+ private readonly cache;
744
+ private readonly filePath?;
745
+ constructor(options?: {
746
+ filePath?: string;
747
+ maxCalls?: number;
748
+ });
749
+ /**
750
+ * Append an immutable audit entry for a call.
751
+ *
752
+ * @param callId The call identifier
753
+ * @param type Audit event type
754
+ * @param data Additional structured data
755
+ */
756
+ append(callId: string, type: AuditEventType, data?: Record<string, unknown>): AuditEntry;
757
+ /**
758
+ * Get all audit entries for a call, in insertion order.
759
+ *
760
+ * @param callId The call identifier
761
+ */
762
+ getEntries(callId: string): ReadonlyArray<AuditEntry>;
763
+ /**
764
+ * Get entries of a specific type for a call.
765
+ */
766
+ getEntriesByType(callId: string, type: AuditEventType): ReadonlyArray<AuditEntry>;
767
+ /** Write entry to JSONL file. @internal */
768
+ private writeToFile;
769
+ }
770
+
771
+ /**
772
+ * @voice-kit/core — TRAI Compliance
773
+ *
774
+ * TRAI (Telecom Regulatory Authority of India) compliance utilities:
775
+ * - DNC (Do Not Call) registry check with 24h LRU cache
776
+ * - Calling hours enforcement (9 AM – 9 PM IST)
777
+ * - Consent tracking (180-day validity)
778
+ *
779
+ * Auto-enabled for +91 numbers. Opt-out, not opt-in.
780
+ */
781
+
782
+ /**
783
+ * TRAI compliance engine.
784
+ *
785
+ * Enforces DNC registry, calling hours, and consent rules for Indian numbers.
786
+ * Results are cached in LRU to minimize API round-trips.
787
+ *
788
+ * @example
789
+ * ```ts
790
+ * const trai = new TRAICompliance()
791
+ *
792
+ * const result = await trai.checkCallPermission({
793
+ * to: '+919876543210',
794
+ * purpose: 'TRANSACTIONAL',
795
+ * })
796
+ *
797
+ * if (!result.allowed) throw new Error(result.reason)
798
+ * ```
799
+ */
800
+ declare class TRAICompliance {
801
+ private readonly config;
802
+ private readonly http;
803
+ /** DNC check results cached for 24 hours per number. */
804
+ private readonly dncCache;
805
+ /** Consent records cached for 180 days. */
806
+ private readonly consentCache;
807
+ constructor(config?: TRAIConfig);
808
+ /**
809
+ * Check whether a call is permitted under TRAI rules.
810
+ * Checks: valid E.164, DNC registry, calling hours.
811
+ *
812
+ * @param params Call permission check parameters
813
+ * @throws DNCBlockedError if number is on DNC registry
814
+ * @throws CallingHoursError if outside allowed calling hours
815
+ * @throws ComplianceError if phone number is invalid
816
+ *
817
+ * @example
818
+ * ```ts
819
+ * const result = await trai.checkCallPermission({
820
+ * to: '+919876543210',
821
+ * purpose: 'TRANSACTIONAL',
822
+ * })
823
+ * if (!result.allowed) console.log(result.reason)
824
+ * ```
825
+ */
826
+ checkCallPermission(params: DNCCheckParams): Promise<DNCCheckResult>;
827
+ /**
828
+ * Check if the current time (or a given time) is within TRAI calling hours.
829
+ * Allowed: 9:00 AM – 9:00 PM IST.
830
+ * Uses Intl.DateTimeFormat only — no date-fns or dayjs dependency.
831
+ *
832
+ * @param at Time to check. Defaults to now.
833
+ * @param timezone IANA timezone. Defaults to 'Asia/Kolkata'.
834
+ *
835
+ * @example
836
+ * ```ts
837
+ * trai.isWithinCallingHours() // Check now
838
+ * trai.isWithinCallingHours(new Date()) // Explicit time
839
+ * ```
840
+ */
841
+ isWithinCallingHours(at?: Date, timezone?: string): boolean;
842
+ /**
843
+ * Record explicit consent from a user for future calls.
844
+ * Consent is valid for 180 days per TRAI guidelines.
845
+ *
846
+ * @param params Consent record details
847
+ *
848
+ * @example
849
+ * ```ts
850
+ * await trai.recordConsent({
851
+ * phoneNumber: '+919876543210',
852
+ * consentedAt: new Date(),
853
+ * channel: 'ivr',
854
+ * purpose: 'PROMOTIONAL',
855
+ * })
856
+ * ```
857
+ */
858
+ recordConsent(params: ConsentRecord): Promise<void>;
859
+ /**
860
+ * Check if a number has valid (non-expired) consent on record.
861
+ *
862
+ * @param phoneNumber E.164 phone number
863
+ * @returns True if valid consent exists
864
+ */
865
+ hasValidConsent(phoneNumber: string): Promise<boolean>;
866
+ /**
867
+ * Fetch DNC status from TRAI DND API.
868
+ * @internal
869
+ */
870
+ private fetchDNCStatus;
871
+ }
872
+
873
+ /**
874
+ * @voice-kit/core — LRU-backed call memory
875
+ *
876
+ * Provides a sliding window of conversation turns per call.
877
+ * Uses lru-cache for bounded in-process storage — no Redis, no DB.
878
+ * Every cache has explicit max size and TTL to prevent unbounded growth.
879
+ */
880
+
881
+ /**
882
+ * Create an LRU-backed call memory instance.
883
+ * This is the ONLY in-process memory system in the SDK.
884
+ *
885
+ * @param config Memory configuration
886
+ *
887
+ * @example
888
+ * ```ts
889
+ * // Default: 20 turns, 512KB, 30min TTL
890
+ * const memory = createCallMemory()
891
+ *
892
+ * // Custom
893
+ * const memory = createCallMemory({ maxTurns: 30, maxBytes: 1_000_000 })
894
+ * ```
895
+ */
896
+ declare function createCallMemory(config?: CallMemoryConfig): CallMemory;
897
+
898
+ /**
899
+ * @voice-kit/core — CallMetrics
900
+ *
901
+ * Records per-call performance metrics: TTFB, turn latency, token cost, interruption rate.
902
+ * In-process LRU storage — exported via getCallSummary().
903
+ */
904
+
905
+ /**
906
+ * Per-call performance metrics recorder.
907
+ *
908
+ * @example
909
+ * ```ts
910
+ * const metrics = new CallMetrics()
911
+ * metrics.recordSTTFirstByte(callId, 180)
912
+ * metrics.recordTurnLatency(callId, 340)
913
+ * const summary = metrics.getCallSummary(callId)
914
+ * console.log(summary.avgTurnLatencyMs) // 340
915
+ * ```
916
+ */
917
+ declare class CallMetrics {
918
+ private readonly store;
919
+ constructor();
920
+ private getOrCreate;
921
+ /** Record time from audio start to first STT partial result. */
922
+ recordSTTFirstByte(callId: string, ms: number): void;
923
+ /** Record time from TTS request to first audio chunk. */
924
+ recordTTSFirstByte(callId: string, ms: number): void;
925
+ /** Record time from LLM request to first token. */
926
+ recordLLMFirstToken(callId: string, ms: number): void;
927
+ /**
928
+ * Record end-to-end turn latency: speech_end → first TTS audio byte.
929
+ * This is the primary latency metric for voice agent quality.
930
+ */
931
+ recordTurnLatency(callId: string, ms: number): void;
932
+ /**
933
+ * Record an interruption event.
934
+ *
935
+ * @param callId Call identifier
936
+ * @param positionPct 0–1, how far through the TTS stream the interruption occurred
937
+ */
938
+ recordInterruption(callId: string, positionPct: number): void;
939
+ /** Record token usage and estimated cost for a model call. */
940
+ recordTokenCost(callId: string, model: string, inputTokens: number, outputTokens: number): void;
941
+ /**
942
+ * Get a full summary of metrics for a call.
943
+ *
944
+ * @param callId The call identifier
945
+ * @returns Aggregated metrics summary
946
+ */
947
+ getCallSummary(callId: string): CallMetricsSummary;
948
+ /** Remove metrics for a call. Call on call.ended to free memory. */
949
+ clearCall(callId: string): void;
950
+ }
951
+
952
+ /**
953
+ * @voice-kit/core — OpenTelemetry tracing
954
+ *
955
+ * VoiceSDKTracer: wraps every external provider call with OTel spans.
956
+ * Auto-exports to OTLP endpoint if OTEL_EXPORTER_OTLP_ENDPOINT is set.
957
+ */
958
+ /**
959
+ * OpenTelemetry tracer for VoiceKit. Wraps every external I/O with spans.
960
+ *
961
+ * @example
962
+ * ```ts
963
+ * const tracer = new VoiceSDKTracer()
964
+ * const result = await tracer.traceSTT(
965
+ * () => stt.transcribeBatch(audio),
966
+ * { provider: 'deepgram', language: 'en-IN' }
967
+ * )
968
+ * ```
969
+ */
970
+ declare class VoiceSDKTracer {
971
+ private readonly tracer;
972
+ constructor();
973
+ /**
974
+ * Trace an STT operation with provider + language attributes.
975
+ */
976
+ traceSTT<T>(fn: () => Promise<T>, attrs: {
977
+ provider: string;
978
+ language: string;
979
+ callId?: string;
980
+ }): Promise<T>;
981
+ /**
982
+ * Trace a TTS synthesis operation.
983
+ */
984
+ traceTTS<T>(fn: () => Promise<T>, attrs: {
985
+ provider: string;
986
+ voice: string;
987
+ chars: number;
988
+ callId?: string;
989
+ }): Promise<T>;
990
+ /**
991
+ * Trace an LLM generation call.
992
+ */
993
+ traceLLM<T>(fn: () => Promise<T>, attrs: {
994
+ model: string;
995
+ inputTokens: number;
996
+ callId?: string;
997
+ }): Promise<T>;
998
+ /**
999
+ * Trace a full call lifecycle.
1000
+ */
1001
+ traceCall<T>(fn: () => Promise<T>, attrs: {
1002
+ callId: string;
1003
+ direction: 'inbound' | 'outbound';
1004
+ }): Promise<T>;
1005
+ /**
1006
+ * Trace a single conversation turn.
1007
+ */
1008
+ traceTurn<T>(fn: () => Promise<T>, attrs: {
1009
+ turnIndex: number;
1010
+ callId: string;
1011
+ }): Promise<T>;
1012
+ /** Generic span wrapper. @internal */
1013
+ private withSpan;
1014
+ }
1015
+
1016
+ /**
1017
+ * @voice-kit/core — AssemblyAI STT Provider
1018
+ *
1019
+ * Async long-form transcription using AssemblyAI SDK.
1020
+ * Best for post-call recordings, meeting notes, long interviews.
1021
+ * Does not support realtime streaming — use Deepgram for live calls.
1022
+ */
1023
+
1024
+ /**
1025
+ * AssemblyAI async transcription provider.
1026
+ * @internal — obtained via createSTT('assemblyai', config)
1027
+ */
1028
+ declare class AssemblyAISTTProvider implements STTProvider {
1029
+ readonly name = "assemblyai";
1030
+ readonly supportsStreaming = false;
1031
+ readonly supportedLanguages: string[];
1032
+ private readonly client;
1033
+ private readonly config;
1034
+ constructor(config: STTConfig);
1035
+ /**
1036
+ * Batch-transcribes collected audio. AssemblyAI has no realtime streaming.
1037
+ * Collects all audio from the iterable, uploads, then polls for result.
1038
+ *
1039
+ * @param audio Async iterable of PCM buffers
1040
+ */
1041
+ transcribeStream(audio: AsyncIterable<Buffer>): AsyncIterable<STTResult>;
1042
+ /**
1043
+ * Upload audio to AssemblyAI and wait for async transcription.
1044
+ * Suitable for call recordings. Average latency: 15–45s per minute of audio.
1045
+ *
1046
+ * @param audio Raw WAV/PCM/MP3 buffer
1047
+ *
1048
+ * @example
1049
+ * ```ts
1050
+ * const stt = createSTT('assemblyai', { wordTimestamps: true })
1051
+ * const result = await stt.transcribeBatch(recordingBuffer)
1052
+ * console.log(result.words) // Word-level timestamps
1053
+ * ```
1054
+ */
1055
+ transcribeBatch(audio: Buffer): Promise<STTResult>;
1056
+ }
1057
+
1058
+ /**
1059
+ * @voice-kit/core — Deepgram Nova-3 STT Provider
1060
+ *
1061
+ * Streaming STT using Deepgram Nova-3. Handles WebSocket reconnect with
1062
+ * exponential backoff, interim + final results, language detection.
1063
+ * Never instantiate directly — use createSTT('deepgram', config).
1064
+ *
1065
+ * SDK: @deepgram/sdk v5 (beta) — https://github.com/deepgram/deepgram-js-sdk
1066
+ */
1067
+
1068
+ /**
1069
+ * Deepgram Nova-3 streaming STT provider.
1070
+ * @internal — obtained via createSTT('deepgram', config)
1071
+ */
1072
+ declare class DeepgramSTTProvider implements STTProvider {
1073
+ readonly name = "deepgram";
1074
+ readonly supportsStreaming = true;
1075
+ readonly supportedLanguages: string[];
1076
+ private readonly client;
1077
+ private readonly config;
1078
+ constructor(config: STTConfig);
1079
+ /**
1080
+ * Stream audio to Deepgram and receive interim + final transcription results.
1081
+ * Handles reconnection transparently with exponential backoff.
1082
+ *
1083
+ * @param audio Async iterable of 16kHz PCM buffers from AudioPipeline
1084
+ *
1085
+ * @example
1086
+ * ```ts
1087
+ * const stt = createSTT('deepgram', { language: 'hi-IN' })
1088
+ * for await (const result of stt.transcribeStream(audioIterable)) {
1089
+ * if (result.isFinal) console.log('User said:', result.transcript)
1090
+ * }
1091
+ * ```
1092
+ */
1093
+ transcribeStream(audio: AsyncIterable<Buffer>): AsyncIterable<STTResult>;
1094
+ /**
1095
+ * Transcribe a complete audio buffer (non-streaming).
1096
+ * Uses Deepgram pre-recorded API.
1097
+ *
1098
+ * @param audio Raw PCM or WAV buffer
1099
+ */
1100
+ transcribeBatch(audio: Buffer): Promise<STTResult>;
1101
+ /**
1102
+ * Create and open a live WebSocket connection to Deepgram.
1103
+ *
1104
+ * v5 connection lifecycle (3 explicit steps):
1105
+ * 1. await listen.v1.connect(options) — constructs the connection object
1106
+ * 2. connection.connect() — initiates the WebSocket handshake
1107
+ * 3. await connection.waitForOpen() — resolves once the socket is ready
1108
+ *
1109
+ * @internal
1110
+ */
1111
+ private connectWithRetry;
1112
+ }
1113
+
1114
+ /**
1115
+ * @voice-kit/core — Sarvam AI Indic STT Provider
1116
+ *
1117
+ * Sarvam AI provides state-of-the-art STT for Indian languages:
1118
+ * hi-IN, kn-IN, ta-IN, te-IN, mr-IN, bn-IN, gu-IN, pa-IN, or-IN
1119
+ *
1120
+ * Uses axios for HTTP calls. No official JS SDK — we use the REST API directly.
1121
+ */
1122
+
1123
+ /**
1124
+ * Sarvam AI Indic STT provider.
1125
+ * @internal — obtained via createSTT('sarvam', config)
1126
+ */
1127
+ declare class SarvamSTTProvider implements STTProvider {
1128
+ readonly name = "sarvam";
1129
+ readonly supportsStreaming = false;
1130
+ readonly supportedLanguages: string[];
1131
+ private readonly http;
1132
+ private readonly config;
1133
+ constructor(config: STTConfig);
1134
+ /**
1135
+ * Collects audio and transcribes via Sarvam batch API.
1136
+ * Sarvam doesn't support realtime streaming.
1137
+ *
1138
+ * @param audio Async iterable of 16kHz PCM buffers
1139
+ */
1140
+ transcribeStream(audio: AsyncIterable<Buffer>): AsyncIterable<STTResult>;
1141
+ /**
1142
+ * Transcribe a WAV/PCM audio buffer in an Indic language.
1143
+ *
1144
+ * @param audio 16kHz PCM or WAV buffer
1145
+ *
1146
+ * @example
1147
+ * ```ts
1148
+ * const stt = createSTT('sarvam', { language: 'ta-IN' })
1149
+ * const result = await stt.transcribeBatch(tamilAudioBuffer)
1150
+ * console.log(result.transcript) // Tamil text
1151
+ * ```
1152
+ */
1153
+ transcribeBatch(audio: Buffer): Promise<STTResult>;
1154
+ }
1155
+
1156
+ /**
1157
+ * @voice-kit/core — Hinglish language switch detector
1158
+ *
1159
+ * Detects mid-sentence Hindi↔English (Hinglish) code-switching in realtime STT output.
1160
+ * Pure algorithmic detection — no external API calls, no latency overhead.
1161
+ *
1162
+ * Detection signals:
1163
+ * 1. Devanagari Unicode range (U+0900–U+097F) for Hindi
1164
+ * 2. Latin character runs for English
1165
+ * 3. Common Hinglish transition patterns (e.g. "main think karta hun")
1166
+ * 4. Script boundary crossing mid-sentence
1167
+ */
1168
+
1169
+ type LanguageCode = 'hi-IN' | 'en-IN' | 'unknown';
1170
+ interface LanguageSwitchEvent {
1171
+ /** Language switched from. */
1172
+ from: LanguageCode;
1173
+ /** Language switched to. */
1174
+ to: LanguageCode;
1175
+ /** Position in transcript where switch occurred (word index). */
1176
+ position: number;
1177
+ /** Confidence of the detection 0–1. */
1178
+ confidence: number;
1179
+ /** Full transcript at time of detection. */
1180
+ transcript: string;
1181
+ /** Timestamp of detection. */
1182
+ detectedAt: Date;
1183
+ }
1184
+ type LanguageDetectorEventMap = {
1185
+ 'language.switched': [LanguageSwitchEvent];
1186
+ };
1187
+ /**
1188
+ * Hinglish language switch detector.
1189
+ *
1190
+ * Analyzes STT transcripts word-by-word in realtime.
1191
+ * Emits 'language.switched' events when a significant script change is detected.
1192
+ *
1193
+ * @example
1194
+ * ```ts
1195
+ * const detector = new LanguageSwitchDetector('en-IN')
1196
+ * detector.on('language.switched', ({ from, to, transcript }) => {
1197
+ * console.log(`Language switched: ${from} → ${to} in: "${transcript}"`)
1198
+ * })
1199
+ *
1200
+ * // Call on every STT final result
1201
+ * detector.analyze('main yeh kaam kal karonga I promise')
1202
+ * ```
1203
+ */
1204
+ declare class LanguageSwitchDetector extends EventEmitter<LanguageDetectorEventMap> {
1205
+ private currentLanguage;
1206
+ private readonly primaryLanguage;
1207
+ /** Rolling window of recent language classifications for smoothing. */
1208
+ private recentClassifications;
1209
+ private readonly windowSize;
1210
+ constructor(primaryLanguage?: LanguageCode);
1211
+ /**
1212
+ * Analyze a transcript for language switches.
1213
+ * Should be called on every STT final result.
1214
+ *
1215
+ * @param transcript The transcribed text to analyze
1216
+ * @returns Detected language of the transcript
1217
+ */
1218
+ analyze(transcript: string): LanguageCode;
1219
+ /**
1220
+ * Analyze a transcript and return per-word language classification.
1221
+ * Useful for word-level Hinglish mixing visualization.
1222
+ *
1223
+ * @param transcript Text to analyze
1224
+ * @returns Array of { word, language } pairs
1225
+ */
1226
+ analyzeWords(transcript: string): Array<{
1227
+ word: string;
1228
+ language: LanguageCode;
1229
+ }>;
1230
+ /** Reset to primary language (e.g., on new call). */
1231
+ reset(): void;
1232
+ /** Current detected language. */
1233
+ get language(): LanguageCode;
1234
+ private tokenize;
1235
+ private classifyWord;
1236
+ private classifySegment;
1237
+ private computeConfidence;
1238
+ private smoothedLanguage;
1239
+ }
1240
+ /**
1241
+ * Detect whether a transcript contains mixed Hindi+English (Hinglish).
1242
+ * Stateless convenience function for one-shot analysis.
+ * NOTE(review): the exported name `isInglish` appears to be a typo of `isHinglish`
+ * (all surrounding docs say "Hinglish"); renaming would break the published API,
+ * so consider shipping an `isHinglish` alias in a future release.
1243
+ *
1244
+ * @param transcript Text to analyze
1245
+ * @returns True if the transcript mixes Hindi and English. (NOTE(review): the first
+ * example below returns true for romanized Hindi with no Devanagari characters, so
+ * detection is presumably broader than "both scripts present" — confirm against the
+ * implementation.)
1246
+ *
1247
+ * @example
1248
+ * ```ts
1249
+ * isInglish('main kal office jaaunga') // true
1250
+ * isInglish('I will go to the office') // false
1251
+ * isInglish('मैं कल ऑफिस जाऊंगा') // false (pure Hindi)
1252
+ * ```
1253
+ */
1254
+ declare function isInglish(transcript: string): boolean;
1255
+
1256
+ /**
1257
+ * @voice-kit/core — STT factory
1258
+ *
1259
+ * createSTT() is the ONLY public API for speech-to-text.
1260
+ * Never instantiate provider classes directly.
1261
+ */
1262
+
1263
+ /**
1264
+ * Create an STT provider instance. This is the ONLY public API for STT.
1265
+ *
1266
+ * Provider selection guide:
1267
+ * - 'deepgram' → Default. Realtime streaming, best latency, supports en-IN + Indic
1268
+ * - 'sarvam' → Best accuracy for pure Indic languages (hi-IN, ta-IN, kn-IN, te-IN, mr-IN)
1269
+ * - 'assemblyai' → Best for long-form recordings (post-call analysis)
1270
+ * - 'whisper' → Fallback batch transcription, broad language support
1271
+ *
1272
+ * @example
1273
+ * ```ts
1274
+ * // Realtime English (India) — default
1275
+ * const stt = createSTT('deepgram', { language: 'en-IN' })
1276
+ *
1277
+ * // Realtime Hindi
1278
+ * const stt = createSTT('deepgram', { language: 'hi-IN' })
1279
+ *
1280
+ * // Best Indic accuracy
1281
+ * const stt = createSTT('sarvam', { language: 'ta-IN' })
1282
+ *
1283
+ * // Post-call recording
1284
+ * const stt = createSTT('assemblyai', { wordTimestamps: true })
1285
+ * ```
1286
+ */
1287
+ declare function createSTT(provider: 'deepgram' | 'whisper' | 'assemblyai' | 'sarvam', config?: STTConfig): STTProvider;
1288
+
1289
+ /**
1290
+ * @voice-kit/core — OpenAI Whisper STT Provider (batch fallback)
1291
+ *
1292
+ * Uses @ai-sdk/openai for batch transcription. Does not support streaming.
1293
+ * Use as fallback for long-form audio or when Deepgram is unavailable.
1294
+ */
1295
+
1296
+ /**
1297
+ * OpenAI Whisper STT provider. Batch-only — does not support streaming.
1298
+ * @internal — obtained via createSTT('whisper', config)
1299
+ */
1300
+ declare class WhisperSTTProvider implements STTProvider {
1301
+ readonly name = "whisper";
1302
+ readonly supportsStreaming = false;
1303
+ readonly supportedLanguages: string[];
1304
+ private readonly config;
1305
+ constructor(config: STTConfig);
1306
+ /**
1307
+ * Streaming not supported by Whisper. Collects all audio then transcribes.
1308
+ * For realtime use, use createSTT('deepgram') instead.
1309
+ */
1310
+ transcribeStream(audio: AsyncIterable<Buffer>): AsyncIterable<STTResult>;
1311
+ /**
1312
+ * Transcribe a complete audio buffer via Whisper.
1313
+ *
1314
+ * @param audio WAV or PCM buffer
1315
+ */
1316
+ transcribeBatch(audio: Buffer): Promise<STTResult>;
1317
+ }
1318
+
1319
+ /**
1320
+ * @voice-kit/core — Cartesia TTS Provider
1321
+ *
1322
+ * Ultra-low-latency streaming TTS via @cartesia/cartesia-js.
1323
+ * Target first chunk: < 90ms. Best for latency-critical applications.
1324
+ */
1325
+
1326
+ /**
1327
+ * Cartesia ultra-low-latency TTS provider.
1328
+ * @internal — obtained via createTTS('cartesia', config)
1329
+ */
1330
+ declare class CartesiaTTSProvider implements TTSProvider {
1331
+ readonly name = "cartesia";
1332
+ readonly outputSampleRate = 22050;
1333
+ readonly outputFormat: "pcm";
1334
+ private readonly client;
1335
+ private readonly config;
1336
+ constructor(config: TTSConfig);
1337
+ /**
1338
+ * Stream audio from Cartesia. Typically delivers first chunk in < 90ms.
1339
+ *
1340
+ * @example
1341
+ * ```ts
1342
+ * const tts = createTTS('cartesia', { voiceId: 'your-voice-id' })
1343
+ * for await (const chunk of tts.synthesizeStream('Hello!')) {
1344
+ * sendToTelephony(chunk)
1345
+ * }
1346
+ * ```
1347
+ */
1348
+ synthesizeStream(text: string, config?: TTSConfig): AsyncIterable<Buffer>;
1349
+ /** Synthesize complete audio. */
1350
+ synthesizeFull(text: string, config?: TTSConfig): Promise<Buffer>;
1351
+ }
1352
+
1353
+ /**
1354
+ * @voice-kit/core — ElevenLabs TTS Provider
1355
+ *
1356
+ * Streaming TTS using ElevenLabs SDK. Features:
1357
+ * - 100ms lookahead jitter buffer to smooth burst delivery
1358
+ * - Voice clone support
1359
+ * - Sub-300ms first chunk target
1360
+ */
1361
+
1362
+ /**
1363
+ * ElevenLabs streaming TTS provider.
1364
+ * @internal — obtained via createTTS('elevenlabs', config)
1365
+ */
1366
+ declare class ElevenLabsTTSProvider implements TTSProvider {
1367
+ readonly name = "elevenlabs";
1368
+ readonly outputSampleRate = 24000;
1369
+ readonly outputFormat: "pcm";
1370
+ private readonly client;
1371
+ private readonly config;
1372
+ constructor(config: TTSConfig);
1373
+ /**
1374
+ * Stream synthesized audio from ElevenLabs.
1375
+ * First chunk target: < 300ms. Uses streaming API endpoint.
1376
+ *
1377
+ * A 100ms jitter buffer smooths burst packet delivery without adding
1378
+ * perceptible latency.
1379
+ *
1380
+ * @param text Text to synthesize (should be a sentence boundary chunk)
1381
+ * @param config Per-call config overrides
1382
+ *
1383
+ * @example
1384
+ * ```ts
1385
+ * const tts = createTTS('elevenlabs', { voiceId: 'your-voice-id' })
1386
+ * for await (const chunk of tts.synthesizeStream('Hello, how can I help?')) {
1387
+ * telephony.sendAudio(chunk)
1388
+ * }
1389
+ * ```
1390
+ */
1391
+ synthesizeStream(text: string, config?: TTSConfig): AsyncIterable<Buffer>;
1392
+ /**
1393
+ * Synthesize full audio (for pre-caching greetings, IVR prompts).
1394
+ * Collects all streaming chunks into a single buffer.
1395
+ *
1396
+ * @param text Text to synthesize
1397
+ * @param config Per-call config overrides
1398
+ */
1399
+ synthesizeFull(text: string, config?: TTSConfig): Promise<Buffer>;
1400
+ }
1401
+
1402
+ /**
1403
+ * @voice-kit/core — Sarvam AI TTS Provider
1404
+ *
1405
+ * Sarvam AI TTS for Hindi/Hinglish and regional Indian languages.
1406
+ * Supports natural-sounding Indian voices with regional accents.
1407
+ */
1408
+
1409
+ /**
1410
+ * Sarvam AI TTS provider for Indic languages.
1411
+ * @internal — obtained via createTTS('sarvam', config)
1412
+ */
1413
+ declare class SarvamTTSProvider implements TTSProvider {
1414
+ readonly name = "sarvam";
1415
+ readonly outputSampleRate = 22050;
1416
+ readonly outputFormat: "mp3";
1417
+ private readonly http;
1418
+ private readonly config;
1419
+ constructor(config: TTSConfig);
1420
+ /**
1421
+ * Synthesize text in an Indic language and stream audio chunks.
1422
+ * Sarvam returns full audio segments — we chunk them for streaming compatibility.
1423
+ *
1424
+ * @example
1425
+ * ```ts
1426
+ * const tts = createTTS('sarvam', { targetLanguage: 'hi-IN' })
1427
+ * for await (const chunk of tts.synthesizeStream('नमस्ते, मैं आपकी कैसे मदद कर सकता हूँ?')) {
1428
+ * telephony.sendAudio(chunk)
1429
+ * }
1430
+ * ```
1431
+ */
1432
+ synthesizeStream(text: string, config?: TTSConfig): AsyncIterable<Buffer>;
1433
+ /** Synthesize complete audio buffer. */
1434
+ synthesizeFull(text: string, config?: TTSConfig): Promise<Buffer>;
1435
+ }
1436
+
1437
+ /**
1438
+ * @voice-kit/core — TTS factory
1439
+ *
1440
+ * createTTS() is the ONLY public API for text-to-speech.
1441
+ * Never instantiate provider classes directly.
1442
+ */
1443
+
1444
+ /**
1445
+ * Create a TTS provider instance.
1446
+ *
1447
+ * Provider selection guide:
1448
+ * - 'elevenlabs' → Best voice quality, cloning support, en-IN
1449
+ * - 'cartesia' → Lowest latency (< 90ms TTFB), good for fast-paced agents
1450
+ * - 'sarvam' → Best for Indic languages (hi-IN, ta-IN, kn-IN, te-IN, mr-IN)
1451
+ *
1452
+ * @example
1453
+ * ```ts
1454
+ * // English with voice cloning
1455
+ * const tts = createTTS('elevenlabs', { voiceId: 'your-cloned-voice-id' })
1456
+ *
1457
+ * // Ultra-low latency English
1458
+ * const tts = createTTS('cartesia', { voiceId: 'your-voice-id' })
1459
+ *
1460
+ * // Hindi
1461
+ * const tts = createTTS('sarvam', { targetLanguage: 'hi-IN', voiceId: 'meera' })
1462
+ * ```
1463
+ */
1464
+ declare function createTTS(provider: 'elevenlabs' | 'cartesia' | 'sarvam', config?: TTSConfig): TTSProvider;
1465
+
1466
+ export { AgentError, AgentHandoffError, AssemblyAISTTProvider, AudioPipeline, AudioTransportError, type AuditEntry, type AuditEventType, CallAuditLog, CallConnectionError, type CallMemory, type CallMemoryConfig, CallMetrics, type CallMetricsSummary, CallNotFoundError, type CallPurpose, CallingHoursError, CartesiaTTSProvider, ComplianceError, ConsentMissingError, type ConsentRecord, DNCBlockedError, type DNCCheckParams, type DNCCheckResult, DeepgramSTTProvider, ElevenLabsTTSProvider, type ErrorSeverity, InngestError, type LanguageCode, LanguageSwitchDetector, type LanguageSwitchEvent, type STTConfig, STTConnectionError, STTError, STTLanguageNotSupportedError, type STTProvider, type STTResult, STTStreamError, SarvamSTTProvider, SarvamTTSProvider, TRAICompliance, type TRAIConfig, type TTSConfig, TTSConnectionError, TTSError, type TTSProvider, TTSStreamError, TTSVoiceNotFoundError, TelephonyError, type TelephonyProviderName, TurnTransitionError, type VADConfig, VADEngine, type VoiceFrame, VoiceKitError, type VoiceKitErrorContext, VoiceSDKTracer, WhisperSTTProvider, type WordTimestamp, base64MulawToPcm, createAudioPipeline, createCallMemory, createResamplerStream, createSTT, createTTS, createVAD, isInglish, linearToMulaw, mulawBufferToPcm, mulawToLinear, pcmBufferToMulaw, pcmToBase64Mulaw, resample, resampleStream };