@voice-kit/core 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/dist/index.cjs +2137 -0
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.cts +1466 -4
  4. package/dist/index.d.ts +1466 -4
  5. package/dist/index.js +2102 -1
  6. package/dist/index.js.map +1 -1
  7. package/package.json +1 -31
  8. package/dist/audio.cjs +0 -533
  9. package/dist/audio.cjs.map +0 -1
  10. package/dist/audio.d.cts +0 -260
  11. package/dist/audio.d.ts +0 -260
  12. package/dist/audio.js +0 -514
  13. package/dist/audio.js.map +0 -1
  14. package/dist/compliance.cjs +0 -343
  15. package/dist/compliance.cjs.map +0 -1
  16. package/dist/compliance.d.cts +0 -163
  17. package/dist/compliance.d.ts +0 -163
  18. package/dist/compliance.js +0 -335
  19. package/dist/compliance.js.map +0 -1
  20. package/dist/errors.cjs +0 -284
  21. package/dist/errors.cjs.map +0 -1
  22. package/dist/errors.d.cts +0 -100
  23. package/dist/errors.d.ts +0 -100
  24. package/dist/errors.js +0 -262
  25. package/dist/errors.js.map +0 -1
  26. package/dist/index-D3KfRXMP.d.cts +0 -319
  27. package/dist/index-D3KfRXMP.d.ts +0 -319
  28. package/dist/memory.cjs +0 -121
  29. package/dist/memory.cjs.map +0 -1
  30. package/dist/memory.d.cts +0 -29
  31. package/dist/memory.d.ts +0 -29
  32. package/dist/memory.js +0 -115
  33. package/dist/memory.js.map +0 -1
  34. package/dist/observability.cjs +0 -229
  35. package/dist/observability.cjs.map +0 -1
  36. package/dist/observability.d.cts +0 -122
  37. package/dist/observability.d.ts +0 -122
  38. package/dist/observability.js +0 -222
  39. package/dist/observability.js.map +0 -1
  40. package/dist/stt.cjs +0 -828
  41. package/dist/stt.cjs.map +0 -1
  42. package/dist/stt.d.cts +0 -308
  43. package/dist/stt.d.ts +0 -308
  44. package/dist/stt.js +0 -815
  45. package/dist/stt.js.map +0 -1
  46. package/dist/telephony.errors-BQYr6-vl.d.cts +0 -80
  47. package/dist/telephony.errors-C0-nScrF.d.ts +0 -80
  48. package/dist/tts.cjs +0 -429
  49. package/dist/tts.cjs.map +0 -1
  50. package/dist/tts.d.cts +0 -151
  51. package/dist/tts.d.ts +0 -151
  52. package/dist/tts.js +0 -418
  53. package/dist/tts.js.map +0 -1
package/dist/index.js CHANGED
@@ -1,3 +1,23 @@
1
+ import ffmpeg from 'fluent-ffmpeg';
2
+ import { Readable, PassThrough } from 'stream';
3
+ import pino from 'pino';
4
+ import { EventEmitter } from 'events';
5
+ import { LRUCache } from 'lru-cache';
6
+ import { appendFile } from 'fs/promises';
7
+ import axios from 'axios';
8
+ import { isValidPhoneNumber, parsePhoneNumberFromString } from 'libphonenumber-js';
9
+ import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node';
10
+ import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
11
+ import { SimpleSpanProcessor } from '@opentelemetry/sdk-trace-base';
12
+ import { ATTR_SERVICE_NAME } from '@opentelemetry/semantic-conventions';
13
+ import { trace, SpanStatusCode } from '@opentelemetry/api';
14
+ import { resourceFromAttributes } from '@opentelemetry/resources';
15
+ import { AssemblyAI } from 'assemblyai';
16
+ import { DeepgramClient } from '@deepgram/sdk';
17
+ import { createOpenAI } from '@ai-sdk/openai';
18
+ import Cartesia from '@cartesia/cartesia-js';
19
+ import { ElevenLabsClient } from 'elevenlabs';
20
+
1
21
  // src/errors/base.ts
2
22
  var VoiceKitError = class extends Error {
3
23
  code;
@@ -257,6 +277,2087 @@ var TTSVoiceNotFoundError = class extends TTSError {
257
277
  }
258
278
  };
259
279
 
260
- export { AgentError, AgentHandoffError, AudioTransportError, CallConnectionError, CallNotFoundError, CallingHoursError, ComplianceError, ConsentMissingError, DNCBlockedError, InngestError, STTConnectionError, STTError, STTLanguageNotSupportedError, STTStreamError, TTSConnectionError, TTSError, TTSStreamError, TTSVoiceNotFoundError, TelephonyError, TurnTransitionError, VoiceKitError };
280
// src/audio/codec/index.ts
// Canonical ITU-T G.711 mu-law companding constants.
// BIAS 0x84 (132) is the standard encoder bias; MULAW_MAX (32635) is the
// clip ceiling chosen so biased magnitudes never overflow the segment table.
// (The previous bias of 33 made the decoder inconsistent with the encoder's
// exponent scan, so encode(decode(code)) did not round-trip.)
var MULAW_BIAS = 132;
var MULAW_MAX = 32635;
/**
 * Decode one 8-bit mu-law code into a signed 16-bit linear PCM sample.
 * Inverse of linearToMulaw: for every code c except 0x7F (negative zero,
 * which aliases to +0), linearToMulaw(mulawToLinear(c)) === c.
 *
 * @param sample 8-bit mu-law code (0-255)
 * @returns Signed 16-bit PCM value in [-32124, 32124]
 */
function mulawToLinear(sample) {
  sample = ~sample & 255;
  const sign = sample & 128;
  const exponent = sample >> 4 & 7;
  const mantissa = sample & 15;
  // Reconstruct the biased magnitude: implicit leading bit + 4 mantissa bits,
  // shifted by the segment (exponent), then remove the bias.
  let linear = ((mantissa << 3) + MULAW_BIAS) << exponent;
  linear -= MULAW_BIAS;
  return sign !== 0 ? -linear : linear;
}
/**
 * Encode a signed 16-bit linear PCM sample as one 8-bit mu-law code.
 * Input is clamped to the int16 range, then clipped to MULAW_MAX before the
 * bias is added so the biased value stays within the 8-segment table.
 *
 * @param sample Signed 16-bit PCM value
 * @returns 8-bit mu-law code (0-255)
 */
function linearToMulaw(sample) {
  sample = Math.max(-32768, Math.min(32767, sample));
  const sign = sample < 0 ? 128 : 0;
  if (sample < 0) sample = -sample;
  if (sample > MULAW_MAX) sample = MULAW_MAX;
  sample += MULAW_BIAS;
  // Locate the segment: highest set bit between bit 7 (exponent 0, always
  // set thanks to the bias) and bit 14 (exponent 7).
  let exponent = 7;
  let expMask = 16384;
  for (; exponent > 0; exponent--) {
    if ((sample & expMask) !== 0) break;
    expMask >>= 1;
  }
  const mantissa = sample >> exponent + 3 & 15;
  const mulaw = ~(sign | exponent << 4 | mantissa) & 255;
  return mulaw;
}
308
/**
 * Decode a buffer of 8-bit mu-law codes into 16-bit little-endian PCM.
 * The output is exactly twice the input length (one s16le sample per byte).
 */
function mulawBufferToPcm(buf) {
  const out = Buffer.allocUnsafe(buf.length * 2);
  let offset = 0;
  for (const code of buf) {
    out.writeInt16LE(mulawToLinear(code ?? 0), offset);
    offset += 2;
  }
  return out;
}
316
/**
 * Encode 16-bit little-endian PCM into 8-bit mu-law, halving the byte length.
 * A trailing odd byte (incomplete sample) is silently ignored.
 */
function pcmBufferToMulaw(buf) {
  const sampleCount = buf.length >> 1;
  const out = Buffer.allocUnsafe(sampleCount);
  for (let s = 0; s < sampleCount; s++) {
    out[s] = linearToMulaw(buf.readInt16LE(s * 2));
  }
  return out;
}
325
/** Decode a base64-encoded mu-law payload (e.g. a telephony media frame) into 16-bit PCM. */
function base64MulawToPcm(base64) {
  const encoded = Buffer.from(base64, "base64");
  return mulawBufferToPcm(encoded);
}
329
/** Encode 16-bit PCM as a base64 mu-law payload for telephony media messages. */
function pcmToBase64Mulaw(pcm) {
  const mulawBytes = pcmBufferToMulaw(pcm);
  return mulawBytes.toString("base64");
}
332
/**
 * Resample a complete s16le mono PCM buffer from `fromHz` to `toHz` by piping
 * it through an ffmpeg child process (fluent-ffmpeg).
 *
 * @param buf    PCM audio, signed 16-bit little-endian, mono
 * @param fromHz Source sample rate in Hz
 * @param toHz   Target sample rate in Hz
 * @returns The fully buffered resampled PCM; resolves when ffmpeg's output ends
 * @throws AudioTransportError (source "ffmpeg-resampler") if ffmpeg or the
 *         output stream errors
 */
async function resample(buf, fromHz, toHz) {
  // Fast path: nothing to convert when the rates already match.
  if (fromHz === toHz) return buf;
  return new Promise((resolve, reject) => {
    const chunks = [];
    // Wrap the in-memory buffer as a one-shot Readable: push(null) ends the
    // stream after the single payload, so read() is never invoked again.
    const input = new Readable({
      read() {
        this.push(buf);
        this.push(null);
      }
    });
    const output = new PassThrough();
    output.on("data", (chunk) => chunks.push(chunk));
    output.on("end", () => resolve(Buffer.concat(chunks)));
    // Errors can surface on the output stream or on the ffmpeg command
    // itself; both paths reject with the same wrapped error type.
    output.on(
      "error",
      (err) => reject(
        new AudioTransportError(
          "ffmpeg-resampler",
          err
        )
      )
    );
    // Raw (headerless) s16le mono in and out; only the sample rate changes.
    ffmpeg(input).inputOptions([
      "-f s16le",
      `-ar ${fromHz}`,
      "-ac 1"
    ]).outputOptions([
      "-f s16le",
      `-ar ${toHz}`,
      "-ac 1"
    ]).on(
      "error",
      (err) => reject(
        new AudioTransportError("ffmpeg-resampler", err)
      )
    ).pipe(output, { end: true });
  });
}
370
/**
 * Create a stream intended to carry resampled audio.
 *
 * NOTE(review): this is currently a passthrough STUB — no resampling is
 * performed regardless of `fromHz`/`toHz`. The original had a dead
 * conditional that returned the identical untouched PassThrough in both
 * branches; the conditional has been removed so the behavior is explicit.
 * Callers that need a real rate conversion should use resample() or
 * resampleStream() instead.
 *
 * @param fromHz Source sample rate (currently unused)
 * @param toHz   Target sample rate (currently unused)
 * @returns A PassThrough stream that forwards audio unchanged
 */
function createResamplerStream(fromHz, toHz) {
  return new PassThrough();
}
377
/**
 * Stream-friendly resampler: accumulates incoming PCM into fixed-size chunks,
 * resamples each full chunk via resample(), and flushes whatever partial
 * chunk remains once the source stream ends. Matching source/target rates
 * short-circuit to a plain passthrough of the input.
 */
async function* resampleStream(audio, fromHz, toHz) {
  if (fromHz === toHz) {
    yield* audio;
    return;
  }
  // Chunk size in bytes; large enough to amortize the per-call ffmpeg cost.
  const CHUNK_SIZE = 16e3;
  let carry = Buffer.alloc(0);
  for await (const piece of audio) {
    carry = Buffer.concat([carry, piece]);
    while (carry.length >= CHUNK_SIZE) {
      const head = carry.subarray(0, CHUNK_SIZE);
      carry = carry.subarray(CHUNK_SIZE);
      yield await resample(head, fromHz, toHz);
    }
  }
  // Flush the trailing partial chunk, if any.
  if (carry.length > 0) {
    yield await resample(carry, fromHz, toHz);
  }
}
396
// Structured logger for the audio pipeline module.
var logger = pino({ name: "@voice-kit/core:pipeline" });
/**
 * Per-provider audio I/O presets consumed by AudioPipeline.
 * Each profile declares the wire format/sample rate of inbound audio, the
 * rate STT expects, the outbound format/rate, and VAD tuning for that
 * provider's typical audio quality. All PSTN providers use 8kHz mu-law;
 * VAD always runs at 16kHz (post-resample).
 */
var AUDIO_PROFILES = {
  twilio: {
    inputFormat: "mulaw",
    inputSampleRate: 8e3,
    sttSampleRate: 16e3,
    outputSampleRate: 8e3,
    outputFormat: "mulaw",
    vadConfig: {
      threshold: 0.6,
      positiveSpeechFrames: 3,
      negativeSpeechFrames: 5,
      debounceMs: 150,
      sampleRate: 16e3
    }
  },
  exotel: {
    inputFormat: "mulaw",
    inputSampleRate: 8e3,
    sttSampleRate: 16e3,
    outputSampleRate: 8e3,
    outputFormat: "mulaw",
    vadConfig: {
      threshold: 0.55,
      // Exotel has slightly more background noise on IN PSTN
      positiveSpeechFrames: 3,
      negativeSpeechFrames: 6,
      debounceMs: 200,
      sampleRate: 16e3
    }
  },
  plivo: {
    inputFormat: "mulaw",
    inputSampleRate: 8e3,
    sttSampleRate: 16e3,
    outputSampleRate: 8e3,
    outputFormat: "mulaw",
    vadConfig: {
      threshold: 0.6,
      positiveSpeechFrames: 3,
      negativeSpeechFrames: 5,
      debounceMs: 150,
      sampleRate: 16e3
    }
  },
  telnyx: {
    inputFormat: "mulaw",
    inputSampleRate: 8e3,
    sttSampleRate: 16e3,
    outputSampleRate: 8e3,
    outputFormat: "mulaw",
    vadConfig: {
      threshold: 0.6,
      positiveSpeechFrames: 3,
      negativeSpeechFrames: 5,
      debounceMs: 150,
      sampleRate: 16e3
    }
  },
  livekit: {
    inputFormat: "opus",
    // LiveKit delivers decoded PCM via SDK — we handle 48kHz
    inputSampleRate: 48e3,
    sttSampleRate: 16e3,
    outputSampleRate: 48e3,
    outputFormat: "opus",
    vadConfig: {
      threshold: 0.5,
      // Higher quality audio = can lower threshold
      positiveSpeechFrames: 2,
      negativeSpeechFrames: 4,
      debounceMs: 100,
      sampleRate: 16e3
    }
  },
  sip: {
    inputFormat: "mulaw",
    inputSampleRate: 8e3,
    sttSampleRate: 16e3,
    outputSampleRate: 8e3,
    outputFormat: "mulaw",
    vadConfig: {
      threshold: 0.6,
      positiveSpeechFrames: 3,
      negativeSpeechFrames: 5,
      debounceMs: 150,
      sampleRate: 16e3
    }
  }
};
486
/**
 * Provider-aware audio transcoding pipeline.
 * Wraps the codec + resampler helpers with the per-provider settings from
 * AUDIO_PROFILES, so callers only deal with two directions: inbound audio
 * prepared for STT, and TTS output prepared for the telephony wire format.
 */
var AudioPipeline = class {
  profile;
  provider;
  constructor(provider) {
    this.provider = provider;
    // NOTE(review): an unknown provider key leaves `profile` undefined and
    // the logger.debug call below will throw — consider validating up front.
    this.profile = AUDIO_PROFILES[provider];
    logger.debug(
      {
        provider,
        inputFormat: this.profile.inputFormat,
        inputSampleRate: this.profile.inputSampleRate,
        sttSampleRate: this.profile.sttSampleRate
      },
      "AudioPipeline initialized"
    );
  }
  /**
   * Transform incoming telephony audio to 16kHz PCM for STT.
   * Handles µ-law decode + resampling automatically.
   *
   * @param raw Raw audio bytes as received from telephony provider
   * @returns Async iterable of 16kHz PCM buffers for STT
   *
   * @internal
   */
  async *inboundForSTT(raw) {
    let decoded;
    // Only mu-law needs a decode step; other formats (e.g. LiveKit's
    // SDK-decoded PCM) are passed straight to the resampler.
    if (this.profile.inputFormat === "mulaw") {
      decoded = this.decodeMulaw(raw);
    } else {
      decoded = raw;
    }
    yield* resampleStream(decoded, this.profile.inputSampleRate, this.profile.sttSampleRate);
  }
  /**
   * Transform TTS output PCM to telephony-native format for sending to caller.
   * Handles resampling + µ-law encode automatically.
   *
   * @param ttsAudio Raw PCM from TTS provider (at TTS provider's native rate)
   * @param ttsSampleRate Native sample rate of the TTS provider
   * @returns Async iterable of audio bytes ready to send to telephony provider
   *
   * @internal
   */
  async *outboundFromTTS(ttsAudio, ttsSampleRate) {
    const resampled = resampleStream(
      ttsAudio,
      ttsSampleRate,
      this.profile.outputSampleRate
    );
    if (this.profile.outputFormat === "mulaw") {
      for await (const chunk of resampled) {
        yield pcmBufferToMulaw(chunk);
      }
    } else {
      yield* resampled;
    }
  }
  /** Get the VAD config tuned for this provider's audio quality. @internal */
  get vadConfig() {
    return this.profile.vadConfig;
  }
  /** Sample rate that STT expects (post-pipeline). @internal */
  get sttSampleRate() {
    return this.profile.sttSampleRate;
  }
  /** Async generator: decode µ-law stream to PCM. @internal */
  async *decodeMulaw(raw) {
    for await (const chunk of raw) {
      yield mulawBufferToPcm(chunk);
    }
  }
};
559
/** Factory: build an AudioPipeline preconfigured for the given telephony provider. */
function createAudioPipeline(provider) {
  const pipeline = new AudioPipeline(provider);
  return pipeline;
}
562
// Structured logger for the VAD module.
var logger2 = pino({ name: "@voice-kit/core:vad" });
// 480 samples per frame = 30ms at the 16kHz VAD rate.
var FRAME_SIZE_SAMPLES = 480;
// Two bytes per s16le sample.
var FRAME_SIZE_BYTES = FRAME_SIZE_SAMPLES * 2;
// Baseline VAD tuning; per-provider overrides live in AUDIO_PROFILES.
var VAD_DEFAULTS = {
  threshold: 0.6,
  positiveSpeechFrames: 3,
  negativeSpeechFrames: 5,
  debounceMs: 150,
  sampleRate: 16e3
};
572
/**
 * Frame-based voice-activity detector.
 * Buffers incoming 16kHz s16le PCM into 30ms frames, scores each frame with
 * the loaded model, and emits 'frame' events of type speech_start / speech /
 * speech_end. speech_end is debounced so brief dips below the threshold do
 * not end the utterance prematurely.
 */
var VADEngine = class extends EventEmitter {
  config;
  // Running state
  isSpeaking = false;
  positiveFrameCount = 0;
  negativeFrameCount = 0;
  debounceTimer = null;
  frameBuffer = Buffer.alloc(0);
  // Silero VAD model — loaded lazily
  vadModel = null;
  constructor(config) {
    super();
    this.config = { ...VAD_DEFAULTS, ...config };
  }
  /**
   * Process an async stream of PCM audio frames.
   * Automatically frames the input into 30ms chunks for VAD processing.
   *
   * @param audio Async iterable of PCM buffers (16kHz, s16le, mono)
   * @throws AudioTransportError (source "vad"); also emitted as 'error'
   */
  async processStream(audio) {
    try {
      await this.ensureModelLoaded();
      for await (const chunk of audio) {
        this.frameBuffer = Buffer.concat([this.frameBuffer, chunk]);
        while (this.frameBuffer.length >= FRAME_SIZE_BYTES) {
          const frame = this.frameBuffer.subarray(0, FRAME_SIZE_BYTES);
          this.frameBuffer = this.frameBuffer.subarray(FRAME_SIZE_BYTES);
          await this.processFrame(frame);
        }
      }
      if (this.isSpeaking) {
        // FIX: cancel any pending debounced speech_end and reset state before
        // the final emit, so a still-armed timer cannot fire a duplicate
        // speech_end after the stream has finished.
        this.clearDebounce();
        this.isSpeaking = false;
        this.emitFrame("speech_end", 0, Buffer.alloc(0));
      }
    } catch (err) {
      const error = new AudioTransportError("vad", err);
      this.emit("error", error);
      throw error;
    }
  }
  /**
   * Process a single 30ms PCM frame through the VAD model.
   *
   * @internal
   */
  async processFrame(frame) {
    const confidence = await this.runVADInference(frame);
    const durationMs = FRAME_SIZE_SAMPLES / this.config.sampleRate * 1e3;
    if (confidence >= this.config.threshold) {
      this.positiveFrameCount++;
      this.negativeFrameCount = 0;
      if (this.isSpeaking) {
        // FIX: speech resumed while a debounced speech_end may still be
        // pending — cancel it, otherwise the stale timer would fire
        // mid-speech and emit a spurious speech_end while the caller is
        // still talking.
        this.clearDebounce();
        this.emitFrame("speech", confidence, frame, durationMs);
      } else if (this.positiveFrameCount >= this.config.positiveSpeechFrames) {
        this.clearDebounce();
        this.isSpeaking = true;
        this.emitFrame("speech_start", confidence, frame, durationMs);
        logger2.debug({ confidence, frames: this.positiveFrameCount }, "VAD: speech_start");
      }
    } else {
      this.negativeFrameCount++;
      this.positiveFrameCount = 0;
      if (this.isSpeaking) {
        this.emitFrame("speech", confidence, frame, durationMs);
        if (this.negativeFrameCount >= this.config.negativeSpeechFrames) {
          // Arm (or re-arm) the debounce; speech_end only fires if no
          // positive frame arrives before debounceMs elapses.
          this.scheduleDebounce(() => {
            this.isSpeaking = false;
            this.negativeFrameCount = 0;
            this.emitFrame("speech_end", confidence, Buffer.alloc(0), 0);
            logger2.debug({ confidence }, "VAD: speech_end");
          });
        }
      }
    }
  }
  /**
   * Run Silero VAD model inference on a single frame.
   * Returns confidence score 0–1.
   *
   * @internal
   */
  async runVADInference(frame) {
    if (!this.vadModel) throw new Error("VAD model not loaded");
    const samples = new Float32Array(FRAME_SIZE_SAMPLES);
    for (let i = 0; i < FRAME_SIZE_SAMPLES; i++) {
      // Normalize s16le to [-1, 1) floats for the model.
      samples[i] = frame.readInt16LE(i * 2) / 32768;
    }
    return this.vadModel.predict(samples);
  }
  /** Build and emit a single VAD frame event. @internal */
  emitFrame(type, confidence, audioBuffer, durationMs = 0) {
    const frame = { type, confidence, audioBuffer, durationMs };
    this.emit("frame", frame);
  }
  /** Arm (replacing any existing) the speech_end debounce timer. @internal */
  scheduleDebounce(fn) {
    this.clearDebounce();
    this.debounceTimer = setTimeout(fn, this.config.debounceMs);
  }
  /** Cancel a pending debounce timer, if any. @internal */
  clearDebounce() {
    if (this.debounceTimer !== null) {
      clearTimeout(this.debounceTimer);
      this.debounceTimer = null;
    }
  }
  /**
   * Load the Silero VAD model if not already loaded.
   * Falls back to a simple energy-based detector if the import fails.
   * @internal
   */
  async ensureModelLoaded() {
    if (this.vadModel) return;
    logger2.debug("Loading Silero VAD model...");
    try {
      const { MicVAD } = await import('@ricky0123/vad-web');
      const vad = await MicVAD.new();
      this.vadModel = new SileroVADAdapter(vad);
      logger2.info("VAD model loaded successfully");
    } catch (err) {
      logger2.warn({ err }, "VAD model load failed \u2014 falling back to silence-based detection");
      this.vadModel = new EnergyBasedVAD();
    }
  }
  /** Clean up resources. Call when the call ends. */
  destroy() {
    this.clearDebounce();
    this.removeAllListeners();
    this.frameBuffer = Buffer.alloc(0);
    this.vadModel = null;
  }
};
700
/**
 * Fallback detector used when the Silero model cannot be loaded.
 * Scores a frame purely by RMS energy: rms >= energyThreshold saturates
 * the pseudo-confidence at 1.
 */
var EnergyBasedVAD = class {
  // RMS level treated as "definitely speech"; quieter frames scale linearly.
  energyThreshold = 0.01;
  /** Return a pseudo-confidence in [0, 1] for one frame of float samples. */
  async predict(samples) {
    const energy = samples.reduce((acc, s) => acc + s * s, 0);
    const rms = Math.sqrt(energy / samples.length);
    return Math.min(1, rms / this.energyThreshold);
  }
};
711
/** Factory: construct a VADEngine with the given (partial) configuration. */
function createVAD(config) {
  const engine = new VADEngine(config);
  return engine;
}
714
/**
 * Adapter wrapping a loaded @ricky0123/vad-web MicVAD instance.
 *
 * NOTE(review): predict() never consults this.vad — it returns an RMS-energy
 * heuristic (rms * 10, capped at 1). Presumably a placeholder until real
 * Silero inference is wired in; confirm before relying on model accuracy.
 */
var SileroVADAdapter = class {
  constructor(vad) {
    this.vad = vad;
  }
  /** Pseudo-confidence in [0, 1] derived from the frame's RMS energy. */
  async predict(samples) {
    const totalEnergy = samples.reduce((acc, s) => acc + s * s, 0);
    const rms = Math.sqrt(totalEnergy / samples.length);
    return Math.min(1, rms * 10);
  }
};
727
// Structured logger for the compliance audit-log module.
var logger3 = pino({ name: "@voice-kit/core:compliance:audit" });
/**
 * In-memory append-only audit trail keyed by call id, with optional JSONL
 * file persistence. Entries are frozen at creation so recorded history
 * cannot be mutated after the fact.
 */
var CallAuditLog = class {
  /** LRU: up to 10,000 calls × 200 entries each = 2M entries max */
  cache;
  filePath;
  constructor(options) {
    this.filePath = options?.filePath;
    this.cache = new LRUCache({
      max: options?.maxCalls ?? 1e4,
      ttl: 4 * 60 * 60 * 1e3
      // 4 hours
    });
  }
  /**
   * Append an immutable audit entry for a call.
   *
   * @param callId The call identifier
   * @param type Audit event type
   * @param data Additional structured data
   */
  append(callId, type, data = {}) {
    const stamp = Date.now();
    const nonce = Math.random().toString(36).slice(2, 9);
    const entry = Object.freeze({
      id: `${callId}-${stamp}-${nonce}`,
      callId,
      type,
      timestamp: new Date(),
      data: Object.freeze({ ...data })
    });
    const history = this.cache.get(callId) ?? [];
    this.cache.set(callId, [...history, entry]);
    logger3.debug({ callId, type, entryId: entry.id }, "Audit entry appended");
    if (this.filePath) {
      // Fire-and-forget: persistence failures are logged, never thrown.
      this.writeToFile(entry).catch(
        (err) => logger3.error({ err, callId, type }, "Audit file write failed")
      );
    }
    return entry;
  }
  /**
   * Get all audit entries for a call, in insertion order.
   *
   * @param callId The call identifier
   */
  getEntries(callId) {
    const entries = this.cache.get(callId) ?? [];
    return Object.freeze(entries);
  }
  /** Get entries of a specific type for a call. */
  getEntriesByType(callId, type) {
    return this.getEntries(callId).filter((entry) => entry.type === type);
  }
  /** Write entry to JSONL file. @internal */
  async writeToFile(entry) {
    if (!this.filePath) return;
    const serializable = {
      ...entry,
      timestamp: entry.timestamp.toISOString()
    };
    const line = JSON.stringify(serializable) + "\n";
    await appendFile(this.filePath, line, "utf-8");
  }
};
789
// Structured logger for the TRAI compliance module.
var logger4 = pino({ name: "@voice-kit/core:compliance:trai" });
// NOTE(review): the "MOCK" suffix suggests this is a stand-in endpoint, not a
// confirmed public TRAI API — verify the production DND integration point.
var TRAI_DND_API_MOCK = "https://api.trai.gov.in/dnd/check";
// Default compliance config: checks enabled, IST timezone, 9:00–21:00 window
// (compared as [start, end) by isWithinCallingHours), mock DND endpoint.
var DEFAULTS = {
  disabled: false,
  timezone: "Asia/Kolkata",
  callingHoursStart: 9,
  callingHoursEnd: 21,
  dncApiEndpoint: TRAI_DND_API_MOCK
};
// DNC lookups are cached for 24 hours per number+purpose.
var DNC_CACHE_TTL_MS = 24 * 60 * 60 * 1e3;
// Consent records are treated as valid for 180 days.
var CONSENT_VALIDITY_MS = 180 * 24 * 60 * 60 * 1e3;
800
/**
 * TRAI (India) outbound-calling compliance checks: phone validation, DNC
 * registry lookups (cached 24h), calling-hours enforcement, and consent
 * tracking (cached 180 days). Non-Indian numbers are allowed through
 * without DNC/hours checks; DNC API failures fail OPEN (call allowed).
 */
var TRAICompliance = class {
  config;
  http;
  /** DNC check results cached for 24 hours per number. */
  dncCache;
  /** Consent records cached for 180 days. */
  consentCache;
  constructor(config) {
    this.config = { ...DEFAULTS, ...config };
    this.dncCache = new LRUCache({
      max: 1e5,
      ttl: DNC_CACHE_TTL_MS
    });
    this.consentCache = new LRUCache({
      max: 5e4,
      ttl: CONSENT_VALIDITY_MS
    });
    this.http = axios.create({
      baseURL: this.config.dncApiEndpoint,
      timeout: 5e3,
      headers: { "Content-Type": "application/json" }
    });
  }
  /**
   * Check whether a call is permitted under TRAI rules.
   * Checks: valid E.164, DNC registry, calling hours.
   *
   * Check order: disabled → number validity → non-Indian bypass → calling
   * hours → EMERGENCY bypass → DNC cache → DNC fetch.
   *
   * @param params Call permission check parameters
   * @throws DNCBlockedError if number is on DNC registry
   * @throws CallingHoursError if outside allowed calling hours
   * @throws ComplianceError if phone number is invalid
   *
   * @example
   * ```ts
   * const result = await trai.checkCallPermission({
   *   to: '+919876543210',
   *   purpose: 'TRANSACTIONAL',
   * })
   * if (!result.allowed) console.log(result.reason)
   * ```
   */
  async checkCallPermission(params) {
    if (this.config.disabled) {
      return { allowed: true, fromCache: false };
    }
    if (!isValidPhoneNumber(params.to)) {
      throw new ComplianceError({
        code: "COMPLIANCE_INVALID_NUMBER",
        message: `Invalid phone number: ${params.to}`,
        phoneNumber: params.to,
        retryable: false,
        severity: "low"
      });
    }
    const parsed = parsePhoneNumberFromString(params.to);
    const isIndianNumber = parsed?.countryCallingCode === "91";
    // TRAI rules only apply to Indian (+91) numbers.
    if (!isIndianNumber) {
      return { allowed: true, fromCache: false };
    }
    const scheduledAt = params.scheduledAt ?? /* @__PURE__ */ new Date();
    // NOTE(review): the hours check runs BEFORE the EMERGENCY exemption
    // below, so EMERGENCY calls outside 9–21 still throw — confirm intended.
    if (!this.isWithinCallingHours(scheduledAt)) {
      const timeStr = new Intl.DateTimeFormat("en-IN", {
        timeZone: this.config.timezone,
        hour: "2-digit",
        minute: "2-digit",
        hour12: false
      }).format(scheduledAt);
      throw new CallingHoursError(params.to, timeStr);
    }
    if (params.purpose === "EMERGENCY") {
      return { allowed: true, fromCache: false };
    }
    const cacheKey = `${params.to}:${params.purpose}`;
    const cached = this.dncCache.get(cacheKey);
    // Cached "not allowed" results are returned rather than re-thrown as
    // DNCBlockedError — only fresh fetches throw (see below).
    if (cached) {
      logger4.debug({ to: params.to, purpose: params.purpose, allowed: cached.allowed }, "DNC cache hit");
      return { ...cached, fromCache: true };
    }
    const result = await this.fetchDNCStatus(params);
    this.dncCache.set(cacheKey, result);
    if (!result.allowed) {
      throw new DNCBlockedError(params.to);
    }
    return result;
  }
  /**
   * Check if the current time (or a given time) is within TRAI calling hours.
   * Allowed: 9:00 AM – 9:00 PM IST.
   * Uses Intl.DateTimeFormat only — no date-fns or dayjs dependency.
   *
   * @param at Time to check. Defaults to now.
   * @param timezone IANA timezone. Defaults to 'Asia/Kolkata'.
   *
   * @example
   * ```ts
   * trai.isWithinCallingHours() // Check now
   * trai.isWithinCallingHours(new Date()) // Explicit time
   * ```
   */
  isWithinCallingHours(at, timezone) {
    const tz = timezone ?? this.config.timezone;
    const date = at ?? /* @__PURE__ */ new Date();
    const parts = new Intl.DateTimeFormat("en-IN", {
      timeZone: tz,
      hour: "numeric",
      hour12: false
    }).formatToParts(date);
    const hourPart = parts.find((p) => p.type === "hour");
    const hour = parseInt(hourPart?.value ?? "0", 10);
    // Half-open window: hour 21 itself is already outside.
    return hour >= this.config.callingHoursStart && hour < this.config.callingHoursEnd;
  }
  /**
   * Record explicit consent from a user for future calls.
   * Consent is valid for 180 days per TRAI guidelines.
   *
   * @param params Consent record details
   *
   * @example
   * ```ts
   * await trai.recordConsent({
   *   phoneNumber: '+919876543210',
   *   consentedAt: new Date(),
   *   channel: 'ivr',
   *   purpose: 'PROMOTIONAL',
   * })
   * ```
   */
  async recordConsent(params) {
    // NOTE(review): if the number cannot be parsed, `normalized` is
    // undefined and the record is cached under the undefined key — consider
    // rejecting unparsable numbers here.
    const normalized = parsePhoneNumberFromString(params.phoneNumber)?.format("E.164");
    this.consentCache.set(normalized, params);
    logger4.info(
      { phoneNumber: normalized, purpose: params.purpose, channel: params.channel },
      "Consent recorded"
    );
  }
  /**
   * Check if a number has valid (non-expired) consent on record.
   *
   * @param phoneNumber E.164 phone number
   * @returns True if valid consent exists
   */
  async hasValidConsent(phoneNumber) {
    let normalized;
    try {
      normalized = parsePhoneNumberFromString(phoneNumber)?.format("E.164");
    } catch {
      return false;
    }
    const record = this.consentCache.get(normalized);
    if (!record) return false;
    // Belt-and-braces: the cache already expires entries at
    // CONSENT_VALIDITY_MS, but the age is re-checked explicitly too.
    const ageMs = Date.now() - record.consentedAt.getTime();
    return ageMs < CONSENT_VALIDITY_MS;
  }
  /**
   * Fetch DNC status from TRAI DND API.
   * Fails OPEN: API errors (other than an explicit 404 "not registered")
   * result in an allowed=true response with an explanatory reason.
   * @internal
   */
  async fetchDNCStatus(params) {
    try {
      logger4.debug({ to: params.to, purpose: params.purpose }, "Fetching DNC status from TRAI");
      const response = await this.http.post("", {
        phone: params.to,
        type: params.purpose
      });
      const result = {
        allowed: !response.data.registered,
        reason: response.data.registered ? `Number is registered on DNC for category: ${response.data.category ?? "ALL"}` : void 0,
        cachedAt: /* @__PURE__ */ new Date(),
        fromCache: false
      };
      logger4.info({ to: params.to, allowed: result.allowed }, "DNC status fetched");
      return result;
    } catch (err) {
      // 404 is interpreted as "number not on the registry".
      if (axios.isAxiosError(err) && err.response?.status === 404) {
        return { allowed: true, cachedAt: /* @__PURE__ */ new Date(), fromCache: false };
      }
      logger4.error({ err, to: params.to }, "TRAI DNC API unavailable \u2014 failing open");
      return {
        allowed: true,
        reason: "DNC check unavailable \u2014 failing open",
        cachedAt: /* @__PURE__ */ new Date(),
        fromCache: false
      };
    }
  }
};
986
// Structured logger for the call-memory module.
var logger5 = pino({ name: "@voice-kit/core:memory" });
// Default conversation-window limits (turn count, serialized size, idle TTL).
var DEFAULTS2 = {
  maxTurns: 20,
  maxBytes: 512e3,
  // 512KB
  ttlMs: 30 * 6e4
  // 30 minutes
};
994
/**
 * Rough LLM token estimate for a message list: total content characters
 * divided by 4 (the common chars-per-token heuristic), rounded up.
 * Non-string content is measured via its JSON serialization.
 */
function estimateTokens(messages) {
  const totalChars = messages.reduce((sum, msg) => {
    const text = typeof msg.content === "string" ? msg.content : JSON.stringify(msg.content);
    return sum + text.length;
  }, 0);
  return Math.ceil(totalChars / 4);
}
1002
/** Approximate in-memory footprint of a message window (JSON character count). */
function estimateBytes(messages) {
  const serialized = JSON.stringify(messages);
  return serialized.length;
}
1005
/**
 * Sliding-window conversation memory keyed by call id, backed by an LRU
 * cache so idle calls expire automatically while active calls stay warm.
 */
var LRUCallMemory = class {
  cache;
  config;
  constructor(config) {
    this.config = config;
    this.cache = new LRUCache({
      max: 1e3,
      // max concurrent calls in memory
      ttl: config.ttlMs,
      updateAgeOnGet: true
      // reset TTL on access (active calls stay warm)
    });
  }
  /**
   * Add a turn to the call's conversation window.
   * Automatically trims oldest turns when maxTurns or maxBytes is exceeded.
   *
   * @param callId The call identifier
   * @param message ModelMessage to append
   */
  addTurn(callId, message) {
    const prior = this.cache.get(callId) ?? [];
    let window = [...prior, message];
    // Cap the turn count first, then shed oldest turns until the
    // serialized size fits (always keeping at least one turn).
    if (window.length > this.config.maxTurns) {
      window = window.slice(window.length - this.config.maxTurns);
    }
    while (window.length > 1 && estimateBytes(window) > this.config.maxBytes) {
      window = window.slice(1);
    }
    this.cache.set(callId, window);
    logger5.debug(
      { callId, turns: window.length, bytes: estimateBytes(window) },
      "Memory: turn added"
    );
  }
  /**
   * Get all turns for a call.
   *
   * @param callId The call identifier
   * @returns Array of ModelMessage (empty if call not found)
   */
  getTurns(callId) {
    return this.cache.get(callId) ?? [];
  }
  /**
   * Clear all turns for a call. Call this on call.ended to free memory.
   *
   * @param callId The call identifier
   */
  clearCall(callId) {
    this.cache.delete(callId);
    logger5.debug({ callId }, "Memory: call cleared");
  }
  /**
   * Estimate the number of LLM tokens used by a call's history.
   *
   * @param callId The call identifier
   */
  getTokenEstimate(callId) {
    return estimateTokens(this.cache.get(callId) ?? []);
  }
  /**
   * Trim oldest turns to stay within a token budget.
   * Called by VoiceAgent before each LLM call to prevent context overflow.
   *
   * @param callId The call identifier
   * @param maxTokens Maximum tokens to retain
   */
  trimToTokenBudget(callId, maxTokens) {
    let window = this.cache.get(callId) ?? [];
    while (window.length > 1 && estimateTokens(window) > maxTokens) {
      window = window.slice(1);
    }
    this.cache.set(callId, window);
    logger5.debug(
      { callId, turns: window.length, estimatedTokens: estimateTokens(window) },
      "Memory: trimmed to token budget"
    );
  }
};
1085
/**
 * Factory: build an LRUCallMemory, filling unspecified limits from DEFAULTS2.
 * Per-key ?? (rather than object spread) is deliberate so an explicitly
 * undefined option still falls back to its default.
 */
function createCallMemory(config) {
  return new LRUCallMemory({
    maxTurns: config?.maxTurns ?? DEFAULTS2.maxTurns,
    maxBytes: config?.maxBytes ?? DEFAULTS2.maxBytes,
    ttlMs: config?.ttlMs ?? DEFAULTS2.ttlMs
  });
}
1093
// Structured logger for the metrics module.
var logger6 = pino({ name: "@voice-kit/core:metrics" });
// Model costs per 1M tokens (input/output), used by recordTokenCost to
// estimate spend. NOTE(review): prices drift — keep this table up to date.
var TOKEN_COSTS_PER_M = {
  "gpt-4o": { input: 5, output: 15 },
  "gpt-4o-mini": { input: 0.15, output: 0.6 },
  "claude-3-5-sonnet": { input: 3, output: 15 },
  "llama-3.3-70b": { input: 0.59, output: 0.79 }
};
1100
/**
 * 95th-percentile of a numeric sample (floor-index variant): sorts a copy
 * ascending and picks element floor(n * 0.95), clamped to the last index.
 * Returns 0 for an empty input.
 */
function p95(values) {
  if (values.length === 0) return 0;
  const ascending = Array.from(values).sort((left, right) => left - right);
  const rank = Math.min(Math.floor(ascending.length * 0.95), ascending.length - 1);
  return ascending[rank] ?? 0;
}
1106
/** Arithmetic mean of a numeric sample; 0 for an empty input. */
function avg(values) {
  const count = values.length;
  if (count === 0) return 0;
  let total = 0;
  for (const value of values) {
    total += value;
  }
  return total / count;
}
1110
/**
 * In-memory, per-call aggregator for latency, interruption, and token-cost
 * metrics. Backed by an LRU cache (max 10k calls, 2-hour TTL) so records for
 * abandoned calls are evicted automatically.
 */
var CallMetrics = class {
  store;
  constructor() {
    this.store = new LRUCache({
      max: 1e4,
      // Entries expire 2 hours after last write.
      ttl: 2 * 60 * 60 * 1e3
    });
  }
  /** Look up the record for a call, creating an empty one on first use. */
  getOrCreate(callId) {
    const found = this.store.get(callId);
    if (found) {
      return found;
    }
    const record = {
      sttFirstByteMs: [],
      ttsFirstByteMs: [],
      llmFirstTokenMs: [],
      turnLatencyMs: [],
      interruptionCount: 0,
      interruptionPositions: [],
      tokenCost: []
    };
    this.store.set(callId, record);
    return record;
  }
  /** Record time from audio start to first STT partial result. */
  recordSTTFirstByte(callId, ms) {
    const record = this.getOrCreate(callId);
    record.sttFirstByteMs.push(ms);
    logger6.debug({ callId, ms }, "Metric: STT TTFB");
  }
  /** Record time from TTS request to first audio chunk. */
  recordTTSFirstByte(callId, ms) {
    const record = this.getOrCreate(callId);
    record.ttsFirstByteMs.push(ms);
    logger6.debug({ callId, ms }, "Metric: TTS TTFB");
  }
  /** Record time from LLM request to first token. */
  recordLLMFirstToken(callId, ms) {
    const record = this.getOrCreate(callId);
    record.llmFirstTokenMs.push(ms);
    logger6.debug({ callId, ms }, "Metric: LLM first token");
  }
  /**
   * Record end-to-end turn latency: speech_end → first TTS audio byte.
   * This is the primary latency metric for voice agent quality.
   */
  recordTurnLatency(callId, ms) {
    const record = this.getOrCreate(callId);
    record.turnLatencyMs.push(ms);
    logger6.debug({ callId, ms }, "Metric: turn latency");
  }
  /**
   * Record an interruption event.
   *
   * @param callId Call identifier
   * @param positionPct 0–1, how far through the TTS stream the interruption occurred
   */
  recordInterruption(callId, positionPct) {
    const record = this.getOrCreate(callId);
    record.interruptionCount += 1;
    record.interruptionPositions.push(positionPct);
    logger6.debug({ callId, positionPct }, "Metric: interruption");
  }
  /** Record token usage and estimated cost for a model call. */
  recordTokenCost(callId, model, inputTokens, outputTokens) {
    // Unknown models are costed at $0 rather than rejected.
    const rates = TOKEN_COSTS_PER_M[model] ?? { input: 0, output: 0 };
    const estimatedUsdCost = inputTokens / 1e6 * rates.input + outputTokens / 1e6 * rates.output;
    const record = this.getOrCreate(callId);
    record.tokenCost.push({
      model,
      inputTokens,
      outputTokens,
      estimatedUsdCost
    });
    logger6.debug({ callId, model, inputTokens, outputTokens, estimatedUsdCost }, "Metric: token cost");
  }
  /**
   * Get a full summary of metrics for a call.
   *
   * @param callId The call identifier
   * @returns Aggregated metrics summary; arrays are defensive copies so the
   *   caller cannot mutate the stored record.
   */
  getCallSummary(callId) {
    const record = this.getOrCreate(callId);
    return {
      callId,
      sttFirstByteMs: record.sttFirstByteMs.slice(),
      ttsFirstByteMs: record.ttsFirstByteMs.slice(),
      llmFirstTokenMs: record.llmFirstTokenMs.slice(),
      turnLatencyMs: record.turnLatencyMs.slice(),
      interruptionCount: record.interruptionCount,
      interruptionPositions: record.interruptionPositions.slice(),
      tokenCost: record.tokenCost.slice(),
      avgTurnLatencyMs: Math.round(avg(record.turnLatencyMs)),
      p95TurnLatencyMs: Math.round(p95(record.turnLatencyMs))
    };
  }
  /** Remove metrics for a call. Call on call.ended to free memory. */
  clearCall(callId) {
    this.store.delete(callId);
  }
};
1207
// Logger for the observability subsystem.
var logger7 = pino({ name: "@voice-kit/core:observability" });
// Module-level singleton tracer provider; initialized lazily on first use.
var _provider = null;
/**
 * Lazily create, register, and memoize the global NodeTracerProvider.
 * An OTLP exporter is only attached when OTEL_EXPORTER_OTLP_ENDPOINT is set;
 * otherwise the provider is registered with no span processors (spans are
 * created but not exported).
 */
function getOrInitProvider() {
  if (_provider) return _provider;
  const endpoint = process.env["OTEL_EXPORTER_OTLP_ENDPOINT"];
  _provider = new NodeTracerProvider({
    resource: resourceFromAttributes({
      [ATTR_SERVICE_NAME]: "voice-kit"
    }),
    // Pass span processors directly in constructor — addSpanProcessor doesn't exist in this version
    spanProcessors: endpoint ? [new SimpleSpanProcessor(new OTLPTraceExporter({ url: endpoint }))] : []
  });
  if (endpoint) {
    logger7.info({ endpoint }, "OTel OTLP exporter configured");
  }
  // Registers this provider as the global OpenTelemetry tracer provider.
  _provider.register();
  return _provider;
}
1225
/**
 * Thin OpenTelemetry wrapper that creates consistently-named spans for
 * voice-pipeline operations (STT, TTS, LLM, full call, single turn).
 * Constructing an instance registers the shared tracer provider as a
 * side effect (see getOrInitProvider).
 */
var VoiceSDKTracer = class {
  tracer;
  constructor() {
    // Side effect: lazily registers the global NodeTracerProvider.
    getOrInitProvider();
    this.tracer = trace.getTracer("@voice-kit/core", "0.1.0");
  }
  /**
   * Trace an STT operation with provider + language attributes.
   */
  async traceSTT(fn, attrs) {
    return this.withSpan(`stt.${attrs.provider}`, fn, {
      "stt.provider": attrs.provider,
      "stt.language": attrs.language,
      // call.id is only attached when a callId was supplied
      ...attrs.callId && { "call.id": attrs.callId }
    });
  }
  /**
   * Trace a TTS synthesis operation.
   */
  async traceTTS(fn, attrs) {
    return this.withSpan(`tts.${attrs.provider}`, fn, {
      "tts.provider": attrs.provider,
      "tts.voice_id": attrs.voice,
      "tts.char_count": attrs.chars,
      ...attrs.callId && { "call.id": attrs.callId }
    });
  }
  /**
   * Trace an LLM generation call.
   */
  async traceLLM(fn, attrs) {
    return this.withSpan(`llm.${attrs.model}`, fn, {
      "llm.model": attrs.model,
      "llm.input_tokens": attrs.inputTokens,
      ...attrs.callId && { "call.id": attrs.callId }
    });
  }
  /**
   * Trace a full call lifecycle.
   */
  async traceCall(fn, attrs) {
    return this.withSpan("call", fn, {
      "call.id": attrs.callId,
      "call.direction": attrs.direction
    });
  }
  /**
   * Trace a single conversation turn.
   */
  async traceTurn(fn, attrs) {
    return this.withSpan("turn", fn, {
      "turn.index": attrs.turnIndex,
      "call.id": attrs.callId
    });
  }
  /**
   * Generic span wrapper: runs `fn` inside a span, sets OK/ERROR status,
   * records exceptions, and always ends the span.
   * Note: duration_ms is only set on the success path, not on error.
   * @internal
   */
  async withSpan(name, fn, attributes) {
    const span = this.tracer.startSpan(name, { attributes });
    const startMs = Date.now();
    try {
      const result = await fn();
      span.setStatus({ code: SpanStatusCode.OK });
      span.setAttribute("duration_ms", Date.now() - startMs);
      return result;
    } catch (err) {
      span.setStatus({
        code: SpanStatusCode.ERROR,
        message: err instanceof Error ? err.message : String(err)
      });
      // recordException expects an Error; coerce non-Error throwables.
      span.recordException(err instanceof Error ? err : new Error(String(err)));
      throw err;
    } finally {
      span.end();
    }
  }
};
1301
// Logger for the AssemblyAI STT provider.
var logger8 = pino({ name: "@voice-kit/core:stt:assemblyai" });
// Language codes advertised by the AssemblyAI provider. Note: the provider's
// constructor does not validate against this list — unsupported codes are
// passed through to the API as-is.
var SUPPORTED_LANGUAGES = [
  "en",
  "en_au",
  "en_uk",
  "en_us",
  "hi",
  "fr",
  "de",
  "es",
  "it",
  "pt",
  "nl",
  "ja",
  "zh"
];
1317
/**
 * AssemblyAI STT provider (batch/async transcription only — no realtime
 * streaming). Wraps the official AssemblyAI SDK client.
 */
var AssemblyAISTTProvider = class {
  name = "assemblyai";
  supportsStreaming = false;
  supportedLanguages = SUPPORTED_LANGUAGES;
  client;
  config;
  /**
   * @param config STT config; requires ASSEMBLYAI_API_KEY (or config.apiKey).
   * @throws STTConnectionError when no API key is available
   */
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["ASSEMBLYAI_API_KEY"];
    if (!apiKey) throw new STTConnectionError("assemblyai", new Error("ASSEMBLYAI_API_KEY not set"));
    this.client = new AssemblyAI({ apiKey });
    this.config = {
      language: config.language ?? "en",
      alternateLanguages: config.alternateLanguages ?? [],
      apiKey,
      model: config.model ?? "best",
      // Word timestamps default ON for this provider (unlike Deepgram).
      wordTimestamps: config.wordTimestamps ?? true,
      interimResults: false,
      smartFormat: config.smartFormat ?? true,
      region: ""
    };
  }
  /**
   * Batch-transcribes collected audio. AssemblyAI has no realtime streaming.
   * Collects all audio from the iterable, uploads, then polls for result.
   *
   * @param audio Async iterable of PCM buffers
   */
  async *transcribeStream(audio) {
    const chunks = [];
    for await (const chunk of audio) chunks.push(chunk);
    const result = await this.transcribeBatch(Buffer.concat(chunks));
    yield result;
  }
  /**
   * Upload audio to AssemblyAI and wait for async transcription.
   * Suitable for call recordings. Average latency: 15–45s per minute of audio.
   *
   * @param audio Raw WAV/PCM/MP3 buffer
   * @returns Final transcription result (confidence defaults to 0.9 when the
   *   API omits it)
   * @throws STTStreamError on transcription failure or any API error
   *
   * @example
   * ```ts
   * const stt = createSTT('assemblyai', { wordTimestamps: true })
   * const result = await stt.transcribeBatch(recordingBuffer)
   * console.log(result.words) // Word-level timestamps
   * ```
   */
  async transcribeBatch(audio) {
    const startMs = Date.now();
    try {
      logger8.debug({ bytes: audio.length, language: this.config.language }, "AssemblyAI transcription started");
      const transcript = await this.client.transcripts.transcribe({
        audio,
        language_code: this.config.language,
        speech_model: this.config.model,
        punctuate: this.config.smartFormat,
        format_text: this.config.smartFormat,
        word_boost: [],
        ...this.config.wordTimestamps && { timestamps: true }
      });
      // The SDK resolves with a transcript object even on failure; the
      // "error" status carries the failure message.
      if (transcript.status === "error") {
        throw new STTStreamError("assemblyai", new Error(transcript.error ?? "Transcription failed"));
      }
      logger8.info(
        { id: transcript.id, duration: transcript.audio_duration, latencyMs: Date.now() - startMs },
        "AssemblyAI transcription complete"
      );
      return {
        transcript: transcript.text ?? "",
        isFinal: true,
        confidence: transcript.confidence ?? 0.9,
        language: this.config.language,
        languageSwitchDetected: false,
        words: this.config.wordTimestamps && transcript.words ? transcript.words.map((w) => ({
          word: w.text,
          startMs: w.start,
          endMs: w.end,
          confidence: w.confidence
        })) : void 0,
        latencyMs: Date.now() - startMs
      };
    } catch (err) {
      if (err instanceof STTStreamError) throw err;
      // Fix: coerce non-Error throwables to Error before wrapping, matching
      // the Deepgram provider's error handling (previously `err` was passed
      // through unconverted).
      throw new STTStreamError("assemblyai", err instanceof Error ? err : new Error(String(err)));
    }
  }
};
1403
// Logger for the Deepgram STT provider.
var logger9 = pino({ name: "@voice-kit/core:stt:deepgram" });
// Languages advertised by the Deepgram provider (India-focused Indic locales
// plus major English locales). Not enforced in the constructor.
var SUPPORTED_LANGUAGES2 = [
  "en-IN",
  "hi-IN",
  "ta-IN",
  "te-IN",
  "kn-IN",
  "mr-IN",
  "en-US",
  "en-GB",
  "en-AU"
];
1415
/** Reconnect backoff policy: exponential base with ±20% jitter, capped at 5s. */
var BACKOFF = {
  baseMs: 100,
  maxMs: 5e3,
  jitterPct: 0.2,
  maxAttempts: 3
};
/**
 * Delay in ms before reconnect attempt `attempt` (0-based):
 * min(baseMs * 2^attempt, maxMs) with uniform ±jitterPct jitter, rounded.
 */
function backoffDelay(attempt) {
  const capped = Math.min(BACKOFF.baseMs * 2 ** attempt, BACKOFF.maxMs);
  const spread = capped * BACKOFF.jitterPct;
  const jittered = capped + spread * (Math.random() * 2 - 1);
  return Math.round(jittered);
}
1426
/**
 * Deepgram STT provider. Supports realtime WebSocket streaming with
 * reconnect/backoff (transcribeStream) and pre-recorded batch transcription
 * (transcribeBatch).
 */
var DeepgramSTTProvider = class {
  name = "deepgram";
  supportsStreaming = true;
  supportedLanguages = SUPPORTED_LANGUAGES2;
  client;
  config;
  /**
   * @param config STT config; requires DEEPGRAM_API_KEY (or config.apiKey).
   * @throws STTConnectionError when no API key is available
   */
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["DEEPGRAM_API_KEY"];
    if (!apiKey) throw new STTConnectionError("deepgram", new Error("DEEPGRAM_API_KEY not set"));
    this.client = new DeepgramClient({ apiKey });
    this.config = {
      language: config.language ?? "en-IN",
      alternateLanguages: config.alternateLanguages ?? [],
      apiKey,
      // nova-3 is now Deepgram's latest recommended model
      model: config.model ?? "nova-3",
      wordTimestamps: config.wordTimestamps ?? false,
      interimResults: config.interimResults ?? true,
      smartFormat: config.smartFormat ?? true,
      region: config.region ?? ""
    };
  }
  /**
   * Stream audio to Deepgram and receive interim + final transcription results.
   * Handles reconnection transparently with exponential backoff.
   *
   * Implementation note: the socket's message handler buffers results into an
   * array; the generator drains that array, polling every 10ms while empty,
   * until the socket closes. On error before maxAttempts the socket is closed
   * and a new connection attempt is made; at/after maxAttempts the error is
   * rethrown as STTStreamError.
   *
   * @param audio Async iterable of 16kHz PCM buffers from AudioPipeline
   *
   * @example
   * ```ts
   * const stt = createSTT('deepgram', { language: 'hi-IN' })
   * for await (const result of stt.transcribeStream(audioIterable)) {
   *   if (result.isFinal) console.log('User said:', result.transcript)
   * }
   * ```
   */
  async *transcribeStream(audio) {
    let attempt = 0;
    const startMs = Date.now();
    while (attempt <= BACKOFF.maxAttempts) {
      const connection = await this.connectWithRetry(attempt);
      const results = [];
      let done = false;
      let error = null;
      connection.on("message", (data) => {
        // Only transcription frames are relevant; metadata frames are ignored.
        if (data.type !== "Results") return;
        const alt = data.channel?.alternatives?.[0];
        if (!alt?.transcript) return;
        const isFinal = data.is_final === true;
        const result = {
          transcript: alt.transcript,
          isFinal,
          // speech_final=true means Deepgram detected end-of-utterance (endpointing).
          // A frame can be speech_final without is_final — callers should act on both.
          confidence: alt.confidence ?? 0,
          // alt.languages populated when detect_language is enabled
          language: alt.languages?.[0] ?? this.config.language,
          languageSwitchDetected: false,
          words: this.config.wordTimestamps ? alt.words?.map((w) => ({
            word: w.word ?? "",
            // Deepgram reports seconds; convert to milliseconds.
            startMs: (w.start ?? 0) * 1e3,
            endMs: (w.end ?? 0) * 1e3,
            confidence: w.confidence ?? 0,
            punctuatedWord: w.punctuated_word
          })) : void 0,
          latencyMs: Date.now() - startMs
        };
        results.push(result);
        if (isFinal) {
          logger9.debug(
            { transcript: result.transcript, confidence: result.confidence, language: result.language },
            "Deepgram final transcript"
          );
        }
      });
      connection.on("close", () => {
        done = true;
      });
      connection.on("error", (err) => {
        error = err;
        logger9.warn({ err, attempt }, "Deepgram stream error");
      });
      // Pump the audio iterable into the socket concurrently with draining
      // results below; a trailing Finalize message flushes pending audio.
      const sendAudio = async () => {
        try {
          for await (const chunk of audio) {
            connection.socket.send(chunk);
          }
          connection.socket.send(JSON.stringify({ type: "Finalize" }));
        } catch (err) {
          error = err instanceof Error ? err : new Error(String(err));
        }
      };
      const sendPromise = sendAudio();
      let resultIndex = 0;
      while (!done || resultIndex < results.length) {
        if (resultIndex < results.length) {
          yield results[resultIndex++];
        } else {
          // Nothing buffered yet — poll again shortly.
          await new Promise((r) => setTimeout(r, 10));
        }
        if (error && attempt < BACKOFF.maxAttempts) {
          // Retriable error: drop this connection and reconnect below.
          try {
            connection.socket.close();
          } catch {
          }
          break;
        }
        if (error && attempt >= BACKOFF.maxAttempts) {
          // Out of retries: settle the sender, then surface the error.
          await sendPromise.catch(() => {
          });
          throw new STTStreamError("deepgram", error);
        }
      }
      await sendPromise.catch(() => {
      });
      // Clean close with no error means the stream finished normally.
      if (!error) return;
      attempt++;
      await new Promise((r) => setTimeout(r, backoffDelay(attempt)));
      logger9.info({ attempt }, "Deepgram reconnecting...");
    }
    throw new STTStreamError("deepgram", new Error("Max reconnect attempts exceeded"));
  }
  /**
   * Transcribe a complete audio buffer (non-streaming).
   * Uses Deepgram pre-recorded API.
   *
   * @param audio Raw PCM or WAV buffer
   * @throws STTStreamError on any API failure
   */
  async transcribeBatch(audio) {
    const startMs = Date.now();
    try {
      const response = await this.client.listen.v1.media.transcribeFile(
        audio,
        {
          model: this.config.model,
          language: this.config.language,
          // NOTE(review): the live API below passes booleans as strings, but
          // booleans are used here — confirm the v5 batch API accepts both.
          smart_format: true,
          diarize: false
        }
      );
      const alt = response?.results?.channels?.[0]?.alternatives?.[0];
      return {
        transcript: alt?.transcript ?? "",
        isFinal: true,
        confidence: alt?.confidence ?? 0,
        language: this.config.language,
        languageSwitchDetected: false,
        latencyMs: Date.now() - startMs
      };
    } catch (err) {
      if (err instanceof STTStreamError) throw err;
      throw new STTStreamError("deepgram", err instanceof Error ? err : new Error(String(err)));
    }
  }
  /**
   * Create and open a live WebSocket connection to Deepgram.
   *
   * v5 connection lifecycle (3 explicit steps):
   * 1. await listen.v1.connect(options) — constructs the connection object
   * 2. connection.connect() — initiates the WebSocket handshake
   * 3. await connection.waitForOpen() — resolves once the socket is ready
   *
   * The open wait is raced against a 10s timeout.
   *
   * @param attempt 0-based attempt index; attempts > 0 sleep backoffDelay first
   * @throws STTConnectionError on timeout or connection failure
   * @internal
   */
  async connectWithRetry(attempt) {
    const delay = attempt > 0 ? backoffDelay(attempt) : 0;
    if (delay > 0) await new Promise((r) => setTimeout(r, delay));
    try {
      logger9.debug({ attempt, language: this.config.language }, "Connecting to Deepgram");
      const connection = await this.client.listen.v1.connect({
        model: this.config.model,
        language: this.config.language,
        // v5: boolean-like options must be strings
        smart_format: "true",
        interim_results: String(this.config.interimResults),
        encoding: "linear16",
        sample_rate: 16e3,
        channels: 1,
        utterance_end_ms: "1000",
        ...this.config.alternateLanguages.length > 0 && {
          detect_language: "true",
          // language must be omitted when detect_language is enabled
          language: void 0
        },
        // NOTE(review): Authorization is passed in the connect options here,
        // while the client was already constructed with the apiKey — verify
        // the v5 SDK actually forwards this as a header.
        Authorization: `Token ${this.config.apiKey}`
      });
      connection.connect();
      await Promise.race([
        connection.waitForOpen(),
        new Promise(
          (_, reject) => setTimeout(
            () => reject(new STTConnectionError("deepgram", new Error("Connection timeout"))),
            1e4
          )
        )
      ]);
      logger9.info({ attempt, language: this.config.language }, "Deepgram connected");
      return connection;
    } catch (err) {
      if (err instanceof STTConnectionError) throw err;
      throw new STTConnectionError("deepgram", err instanceof Error ? err : new Error(String(err)));
    }
  }
};
1631
// Logger for the Sarvam STT provider.
var logger10 = pino({ name: "@voice-kit/core:stt:sarvam" });
// Base URL for Sarvam's REST API.
var SARVAM_API_BASE = "https://api.sarvam.ai";
// Indic languages supported by Sarvam. Unlike the other providers, this list
// IS enforced: the constructor throws STTLanguageNotSupportedError for
// anything not listed here.
var SUPPORTED_LANGUAGES3 = [
  "hi-IN",
  "kn-IN",
  "ta-IN",
  "te-IN",
  "mr-IN",
  "bn-IN",
  "gu-IN",
  "pa-IN",
  "or-IN",
  "ml-IN"
];
1645
// Per-language default model used when config.model is not given. Every
// supported language currently maps to the same model ("saarika:v1"); the
// table exists so future per-language models can be slotted in.
var SARVAM_MODELS = {
  "hi-IN": "saarika:v1",
  "kn-IN": "saarika:v1",
  "ta-IN": "saarika:v1",
  "te-IN": "saarika:v1",
  "mr-IN": "saarika:v1",
  "bn-IN": "saarika:v1",
  "gu-IN": "saarika:v1",
  "pa-IN": "saarika:v1",
  "or-IN": "saarika:v1",
  "ml-IN": "saarika:v1"
};
1657
/**
 * Sarvam STT provider for Indic languages. Batch-only: audio is uploaded as
 * multipart/form-data to Sarvam's REST /speech-to-text endpoint via axios.
 */
var SarvamSTTProvider = class {
  name = "sarvam";
  supportsStreaming = false;
  // Sarvam REST API is batch-only
  supportedLanguages = SUPPORTED_LANGUAGES3;
  http;
  config;
  /**
   * @param config STT config; requires SARVAM_API_KEY (or config.apiKey).
   * @throws STTConnectionError when no API key is available
   * @throws STTLanguageNotSupportedError when the language is not in
   *   SUPPORTED_LANGUAGES3
   */
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["SARVAM_API_KEY"];
    if (!apiKey) throw new STTConnectionError("sarvam", new Error("SARVAM_API_KEY not set"));
    const language = config.language ?? "hi-IN";
    if (!SUPPORTED_LANGUAGES3.includes(language)) {
      throw new STTLanguageNotSupportedError("sarvam", language);
    }
    // Pre-configured axios instance; Sarvam authenticates via the
    // API-Subscription-Key header. 30s request timeout.
    this.http = axios.create({
      baseURL: SARVAM_API_BASE,
      headers: {
        "API-Subscription-Key": apiKey,
        "Content-Type": "multipart/form-data"
      },
      timeout: 3e4
    });
    this.config = {
      language,
      alternateLanguages: config.alternateLanguages ?? [],
      apiKey,
      model: config.model ?? SARVAM_MODELS[language] ?? "saarika:v1",
      wordTimestamps: false,
      // Sarvam doesn't support word timestamps yet
      interimResults: false,
      smartFormat: config.smartFormat ?? true,
      region: config.region ?? ""
    };
  }
  /**
   * Collects audio and transcribes via Sarvam batch API.
   * Sarvam doesn't support realtime streaming.
   *
   * @param audio Async iterable of 16kHz PCM buffers
   */
  async *transcribeStream(audio) {
    const chunks = [];
    for await (const chunk of audio) chunks.push(chunk);
    const result = await this.transcribeBatch(Buffer.concat(chunks));
    yield result;
  }
  /**
   * Transcribe a WAV/PCM audio buffer in an Indic language.
   *
   * @param audio 16kHz PCM or WAV buffer
   * @returns Final transcription result (confidence defaults to 0.9 when the
   *   API omits it)
   * @throws STTStreamError on any API failure (axios errors include the HTTP
   *   status and response body in the wrapped message)
   *
   * @example
   * ```ts
   * const stt = createSTT('sarvam', { language: 'ta-IN' })
   * const result = await stt.transcribeBatch(tamilAudioBuffer)
   * console.log(result.transcript) // Tamil text
   * ```
   */
  async transcribeBatch(audio) {
    const startMs = Date.now();
    try {
      logger10.debug(
        { language: this.config.language, bytes: audio.length },
        "Sarvam transcription request"
      );
      const form = new FormData();
      form.append("file", new Blob([audio], { type: "audio/wav" }), "audio.wav");
      form.append("language_code", this.config.language);
      form.append("model", this.config.model);
      if (this.config.smartFormat) {
        // smartFormat maps to suppressing disfluencies in the output.
        form.append("with_disfluencies", "false");
      }
      const response = await this.http.post(
        "/speech-to-text",
        form
      );
      const data = response.data;
      logger10.info(
        { language: data.language_code, confidence: data.confidence, latencyMs: Date.now() - startMs },
        "Sarvam transcription complete"
      );
      return {
        transcript: data.transcript,
        isFinal: true,
        confidence: data.confidence ?? 0.9,
        language: data.language_code ?? this.config.language,
        languageSwitchDetected: false,
        latencyMs: Date.now() - startMs
      };
    } catch (err) {
      if (axios.isAxiosError(err)) {
        throw new STTStreamError(
          "sarvam",
          new Error(`Sarvam API error: ${err.response?.status} ${JSON.stringify(err.response?.data)}`)
        );
      }
      throw new STTStreamError("sarvam", err);
    }
  }
};
1757
// Logger for the language-switch detection subsystem.
var logger11 = pino({ name: "@voice-kit/core:stt:language-detect" });
// Unicode range covering the Devanagari script; used to classify Hindi words.
var DEVANAGARI_RANGE = /[\u0900-\u097F]/;
// Fewer classifiable words than this yields confidence 0 (no switch emitted).
var MIN_WORDS_FOR_CLASSIFICATION = 2;
// Minimum confidence required before a language.switched event is emitted.
var SWITCH_CONFIDENCE_THRESHOLD = 0.6;
// Tokens common to romanized Hindi and English (fillers, function words);
// excluded from tokenization because they carry no language signal.
var NEUTRAL_TOKENS = /* @__PURE__ */ new Set([
  "ok",
  "okay",
  "haan",
  "nahin",
  "nahi",
  "kya",
  "hai",
  "ho",
  "na",
  "toh",
  "aur",
  "ya",
  "matlab",
  "yani",
  "i",
  "a",
  "the",
  "is",
  "are",
  "and",
  "or"
]);
1784
/**
 * Script-based Hindi/English (Hinglish) language-switch detector.
 * Classifies each transcript by script (Devanagari → hi-IN, pure Latin →
 * en-IN), smooths over a rolling window of recent classifications, and emits
 * a "language.switched" event when the smoothed language changes with
 * sufficient confidence.
 */
var LanguageSwitchDetector = class extends EventEmitter {
  // Language the detector currently considers active for the conversation.
  currentLanguage;
  // Language the detector starts in and resets to.
  primaryLanguage;
  /** Rolling window of recent language classifications for smoothing. */
  recentClassifications = [];
  windowSize = 5;
  constructor(primaryLanguage = "en-IN") {
    super();
    this.primaryLanguage = primaryLanguage;
    this.currentLanguage = primaryLanguage;
  }
  /**
   * Analyze a transcript for language switches.
   * Should be called on every STT final result.
   * Side effects: updates the rolling window and currentLanguage, and emits
   * "language.switched" when a confident switch is detected.
   *
   * @param transcript The transcribed text to analyze
   * @returns Detected language of the transcript
   */
  analyze(transcript) {
    const words = this.tokenize(transcript);
    if (words.length === 0) return this.currentLanguage;
    const classification = this.classifySegment(words);
    const confidence = this.computeConfidence(words, classification);
    this.recentClassifications.push(classification);
    if (this.recentClassifications.length > this.windowSize) {
      this.recentClassifications.shift();
    }
    const smoothed = this.smoothedLanguage();
    // A switch requires: smoothed language differs, confidence meets the
    // threshold, and the smoothed classification is not "unknown".
    if (smoothed !== this.currentLanguage && confidence >= SWITCH_CONFIDENCE_THRESHOLD && smoothed !== "unknown") {
      const event = {
        from: this.currentLanguage,
        to: smoothed,
        position: 0,
        // position in full conversation
        confidence,
        transcript,
        detectedAt: /* @__PURE__ */ new Date()
      };
      const prev = this.currentLanguage;
      this.currentLanguage = smoothed;
      logger11.info(
        { from: prev, to: smoothed, confidence, transcript: transcript.slice(0, 50) },
        "Language switch detected"
      );
      this.emit("language.switched", event);
    }
    return this.currentLanguage;
  }
  /**
   * Analyze a transcript and return per-word language classification.
   * Useful for word-level Hinglish mixing visualization.
   * Pure: does not update detector state.
   *
   * @param transcript Text to analyze
   * @returns Array of { word, language } pairs
   */
  analyzeWords(transcript) {
    const words = this.tokenize(transcript);
    return words.map((word) => ({
      word,
      language: this.classifyWord(word)
    }));
  }
  /** Reset to primary language (e.g., on new call). */
  reset() {
    this.currentLanguage = this.primaryLanguage;
    this.recentClassifications = [];
  }
  /** Current detected language. */
  get language() {
    return this.currentLanguage;
  }
  // ─── Private helpers ────────────────────────────────────────────────────────
  // Lowercase, split on whitespace, and drop neutral filler tokens.
  tokenize(text) {
    return text.toLowerCase().split(/\s+/).filter((w) => w.length > 0 && !NEUTRAL_TOKENS.has(w));
  }
  // Script-based single-word classification; anything that is neither
  // Devanagari nor pure lowercase Latin (digits, mixed tokens) is "unknown".
  classifyWord(word) {
    if (DEVANAGARI_RANGE.test(word)) return "hi-IN";
    if (/^[a-z]+$/.test(word)) return "en-IN";
    return "unknown";
  }
  // Majority vote across the segment's words; ties fall back to the
  // primary language.
  classifySegment(words) {
    let hindiCount = 0;
    let englishCount = 0;
    for (const word of words) {
      const lang = this.classifyWord(word);
      if (lang === "hi-IN") hindiCount++;
      else if (lang === "en-IN") englishCount++;
    }
    if (hindiCount === 0 && englishCount === 0) return "unknown";
    if (hindiCount > englishCount) return "hi-IN";
    if (englishCount > hindiCount) return "en-IN";
    return this.primaryLanguage;
  }
  // Fraction of classifiable words agreeing with the segment classification;
  // 0 when too few classifiable words are present.
  computeConfidence(words, classification) {
    const relevant = words.filter((w) => this.classifyWord(w) !== "unknown");
    if (relevant.length < MIN_WORDS_FOR_CLASSIFICATION) return 0;
    const matching = relevant.filter((w) => this.classifyWord(w) === classification);
    return matching.length / relevant.length;
  }
  // Majority vote over the rolling window; ties keep the current language.
  smoothedLanguage() {
    if (this.recentClassifications.length === 0) return this.primaryLanguage;
    const counts = { "hi-IN": 0, "en-IN": 0, "unknown": 0 };
    for (const lang of this.recentClassifications) {
      counts[lang]++;
    }
    if (counts["hi-IN"] > counts["en-IN"]) return "hi-IN";
    if (counts["en-IN"] > counts["hi-IN"]) return "en-IN";
    return this.currentLanguage;
  }
};
1894
/**
 * True when a transcript mixes Devanagari and Latin script in the same
 * utterance (code-mixed "Hinglish").
 */
function isInglish(transcript) {
  if (!DEVANAGARI_RANGE.test(transcript)) {
    return false;
  }
  return /[a-zA-Z]/.test(transcript);
}
1899
// Logger for the Whisper STT provider.
var logger12 = pino({ name: "@voice-kit/core:stt:whisper" });
// ISO-639-1 codes accepted by the Whisper provider. The constructor checks
// the prefix of the configured BCP-47 tag (e.g. "hi-IN" → "hi") against
// this list.
var WHISPER_LANGUAGES = [
  "en",
  "hi",
  "ta",
  "te",
  "kn",
  "mr",
  "bn",
  "gu",
  "pa",
  "ur",
  "fr",
  "de",
  "es",
  "pt",
  "it",
  "nl",
  "pl",
  "ru",
  "ja",
  "zh"
];
1922
/**
 * OpenAI Whisper STT provider. Batch-only; calls the
 * /v1/audio/transcriptions REST endpoint directly via fetch.
 */
var WhisperSTTProvider = class {
  name = "whisper";
  supportsStreaming = false;
  supportedLanguages = WHISPER_LANGUAGES;
  config;
  /**
   * @param config STT config; `language` may be a BCP-47 tag ("hi-IN") whose
   *   ISO-639 prefix is sent to Whisper. Requires OPENAI_API_KEY (or
   *   config.apiKey).
   * @throws STTStreamError when no API key is available
   * @throws STTLanguageNotSupportedError for unsupported language prefixes
   */
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["OPENAI_API_KEY"];
    if (!apiKey) throw new STTStreamError("whisper", new Error("OPENAI_API_KEY not set"));
    const language = config.language ?? "en-IN";
    const whisperLang = language.split("-")[0] ?? "en";
    if (!WHISPER_LANGUAGES.includes(whisperLang)) {
      throw new STTLanguageNotSupportedError("whisper", language);
    }
    this.config = {
      language,
      alternateLanguages: config.alternateLanguages ?? [],
      apiKey,
      model: config.model ?? "whisper-1",
      wordTimestamps: config.wordTimestamps ?? false,
      interimResults: false,
      smartFormat: false,
      region: ""
    };
  }
  /**
   * Streaming not supported by Whisper. Collects all audio then transcribes.
   * For realtime use, use createSTT('deepgram') instead.
   */
  async *transcribeStream(audio) {
    const chunks = [];
    for await (const chunk of audio) chunks.push(chunk);
    const result = await this.transcribeBatch(Buffer.concat(chunks));
    yield result;
  }
  /**
   * Transcribe a complete audio buffer via Whisper.
   *
   * @param audio WAV or PCM buffer
   * @returns Final transcription result (confidence is a fixed 0.95 because
   *   the Whisper API does not report confidence)
   * @throws STTStreamError on any API failure
   */
  async transcribeBatch(audio) {
    const startMs = Date.now();
    const language = this.config.language.split("-")[0] ?? "en";
    try {
      logger12.debug({ language, bytes: audio.length }, "Whisper batch transcription");
      // Fix: removed an unused `createOpenAI(...)` client that was constructed
      // here but never referenced — the request goes through fetch below.
      const file = new File([audio], "audio.wav", { type: "audio/wav" });
      const formData = new FormData();
      formData.append("file", file);
      formData.append("model", this.config.model);
      formData.append("language", language);
      if (this.config.wordTimestamps) {
        // Word-level timestamps require the verbose_json response format.
        formData.append("timestamp_granularities[]", "word");
        formData.append("response_format", "verbose_json");
      }
      const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
        method: "POST",
        headers: { Authorization: `Bearer ${this.config.apiKey}` },
        body: formData
      });
      if (!response.ok) {
        throw new Error(`Whisper API error: ${response.status} ${response.statusText}`);
      }
      const data = await response.json();
      return {
        transcript: data.text,
        isFinal: true,
        confidence: 0.95,
        // Whisper doesn't return confidence
        language: this.config.language,
        languageSwitchDetected: false,
        words: this.config.wordTimestamps && data.words ? data.words.map((w) => ({
          word: w.word,
          // Whisper reports seconds; convert to milliseconds.
          startMs: w.start * 1e3,
          endMs: w.end * 1e3,
          confidence: 0.95
        })) : void 0,
        latencyMs: Date.now() - startMs
      };
    } catch (err) {
      if (err instanceof STTStreamError) throw err;
      // Coerce non-Error throwables so the wrapped cause is always an Error
      // (consistent with the Deepgram provider).
      throw new STTStreamError("whisper", err instanceof Error ? err : new Error(String(err)));
    }
  }
};
2006
+
2007
// src/stt/STT-factory.ts
/**
 * Instantiate an STT provider by name.
 *
 * @param provider One of "deepgram" | "whisper" | "assemblyai" | "sarvam"
 * @param config Optional provider config; defaults to {}
 * @throws Error for an unrecognized provider name
 */
function createSTT(provider, config) {
  const resolved = config ?? {};
  if (provider === "deepgram") {
    return new DeepgramSTTProvider(resolved);
  }
  if (provider === "whisper") {
    return new WhisperSTTProvider(resolved);
  }
  if (provider === "assemblyai") {
    return new AssemblyAISTTProvider(resolved);
  }
  if (provider === "sarvam") {
    return new SarvamSTTProvider(resolved);
  }
  throw new Error(`Unknown STT provider: ${String(provider)}`);
}
2025
// Logger for the Cartesia TTS provider.
var logger13 = pino({ name: "@voice-kit/core:tts:cartesia" });
// Fallback Cartesia voice id used when config.voiceId is not supplied.
var DEFAULT_VOICE_ID = "a0e99841-438c-4a64-b679-ae501e7d6091";
2027
/**
 * Cartesia TTS provider. Streams raw 16-bit little-endian PCM via Cartesia's
 * server-sent-events (SSE) endpoint.
 */
var CartesiaTTSProvider = class {
  name = "cartesia";
  outputSampleRate = 22050;
  // Cartesia default
  outputFormat = "pcm";
  client;
  config;
  /**
   * @param config TTS config; requires CARTESIA_API_KEY (or config.apiKey).
   * @throws TTSConnectionError when no API key is available
   */
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["CARTESIA_API_KEY"];
    if (!apiKey) throw new TTSConnectionError("cartesia", new Error("CARTESIA_API_KEY not set"));
    this.client = new Cartesia({ apiKey });
    this.config = {
      voiceId: config.voiceId ?? DEFAULT_VOICE_ID,
      sampleRate: config.sampleRate ?? 22050,
      speed: config.speed ?? 1,
      pitch: config.pitch ?? 0,
      apiKey,
      modelId: config.modelId ?? "sonic-english",
      emotion: config.emotion ?? "",
      targetLanguage: config.targetLanguage ?? "en"
    };
  }
  /**
   * Stream audio from Cartesia. Typically delivers first chunk in < 90ms.
   *
   * @param text Text to synthesize
   * @param config Optional per-call overrides (voiceId, modelId, sampleRate)
   * @throws TTSStreamError on any failure during the SSE stream
   *
   * @example
   * ```ts
   * const tts = createTTS('cartesia', { voiceId: 'your-voice-id' })
   * for await (const chunk of tts.synthesizeStream('Hello!')) {
   *   sendToTelephony(chunk)
   * }
   * ```
   */
  async *synthesizeStream(text, config) {
    const voiceId = config?.voiceId ?? this.config.voiceId;
    const startMs = Date.now();
    logger13.debug({ voiceId, chars: text.length }, "Cartesia TTS stream start");
    try {
      const stream = await this.client.tts.generateSse({
        model_id: config?.modelId ?? this.config.modelId,
        transcript: text,
        voice: {
          mode: "id",
          id: voiceId,
          // Emotion control is an experimental Cartesia feature; only sent
          // when an emotion is configured.
          ...this.config.emotion && {
            __experimental_controls: {
              emotion: [this.config.emotion]
            }
          }
        },
        output_format: {
          container: "raw",
          encoding: "pcm_s16le",
          // Requested rate is snapped to a Cartesia-supported value.
          sample_rate: toValidSampleRate(config?.sampleRate ?? this.config.sampleRate)
        }
      });
      let firstChunk = true;
      for await (const event of stream) {
        // Skip empty frames and the SSE terminator sentinel.
        if (!event.data || event.data === "[DONE]") continue;
        let payload;
        try {
          payload = JSON.parse(event.data);
        } catch {
          // Non-JSON frames are ignored (best-effort stream parsing).
          continue;
        }
        if (!payload.chunk?.audio) continue;
        // Audio arrives base64-encoded inside the JSON payload.
        const buf = Buffer.from(payload.chunk.audio, "base64");
        if (firstChunk) {
          firstChunk = false;
          logger13.debug({ ttfb: Date.now() - startMs, voiceId }, "Cartesia first audio chunk");
        }
        yield buf;
      }
    } catch (err) {
      throw new TTSStreamError("cartesia", err);
    }
  }
  /** Synthesize complete audio by draining the stream into one buffer. */
  async synthesizeFull(text, config) {
    const chunks = [];
    for await (const chunk of this.synthesizeStream(text, config)) {
      chunks.push(chunk);
    }
    return Buffer.concat(chunks);
  }
};
2113
// Sample rates accepted by Cartesia's raw PCM output format.
var VALID_SAMPLE_RATES = [22050, 8000, 16000, 24000, 44100, 48000];
/**
 * Clamp an arbitrary sample rate to one Cartesia accepts.
 * Unknown rates fall back to 8000 Hz (the telephony baseline).
 */
function toValidSampleRate(rate) {
  for (const candidate of VALID_SAMPLE_RATES) {
    if (candidate === rate) return rate;
  }
  return 8000;
}
2117
var logger14 = pino({ name: "@voice-kit/core:tts:elevenlabs" });
var DEFAULT_VOICE_ID2 = "21m00Tcm4TlvDq8ikWAM";
var JITTER_BUFFER_MS = 100;
var ElevenLabsTTSProvider = class {
  name = "elevenlabs";
  outputSampleRate = 24e3;
  // ElevenLabs default: 24kHz
  outputFormat = "pcm";
  client;
  config;
  /**
   * Build an ElevenLabs-backed TTS provider.
   * Falls back to the ELEVENLABS_API_KEY env var when no key is configured.
   * @throws {TTSConnectionError} if no API key can be resolved.
   */
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["ELEVENLABS_API_KEY"];
    if (!apiKey) throw new TTSConnectionError("elevenlabs", new Error("ELEVENLABS_API_KEY not set"));
    this.client = new ElevenLabsClient({ apiKey });
    this.config = {
      voiceId: config.voiceId ?? DEFAULT_VOICE_ID2,
      sampleRate: config.sampleRate ?? 24e3,
      speed: config.speed ?? 1,
      pitch: config.pitch ?? 0,
      apiKey,
      modelId: config.modelId ?? "eleven_turbo_v2_5",
      emotion: config.emotion ?? "",
      targetLanguage: config.targetLanguage ?? "en-IN"
    };
  }
  /**
   * Stream synthesized audio from ElevenLabs.
   * First chunk target: < 300ms. Uses streaming API endpoint.
   *
   * A 100ms jitter buffer smooths burst packet delivery without adding
   * perceptible latency: chunks are released once either ~100ms of audio
   * (4800 bytes of 24kHz s16le mono) is buffered or the oldest buffered
   * chunk has waited JITTER_BUFFER_MS.
   *
   * FIX: the previous implementation armed a setTimeout with an EMPTY
   * callback as the "jitter timer" — it never flushed anything, so only
   * the byte threshold had any effect. The timer is replaced with a
   * timestamp-based flush condition.
   *
   * @param text Text to synthesize (should be a sentence boundary chunk)
   * @param config Per-call config overrides
   * @throws {TTSVoiceNotFoundError} when the API reports 404 for the voice
   * @throws {TTSStreamError} for any other streaming failure
   *
   * @example
   * ```ts
   * const tts = createTTS('elevenlabs', { voiceId: 'your-voice-id' })
   * for await (const chunk of tts.synthesizeStream('Hello, how can I help?')) {
   *   telephony.sendAudio(chunk)
   * }
   * ```
   */
  async *synthesizeStream(text, config) {
    const voiceId = config?.voiceId ?? this.config.voiceId;
    const modelId = config?.modelId ?? this.config.modelId;
    const startMs = Date.now();
    logger14.debug({ voiceId, modelId, chars: text.length }, "ElevenLabs TTS stream start");
    try {
      const audioStream = await this.client.generate({
        voice: voiceId,
        text,
        model_id: modelId,
        voice_settings: {
          stability: 0.5,
          similarity_boost: 0.8,
          speed: config?.speed ?? this.config.speed
        },
        output_format: "pcm_24000",
        stream: true
      });
      let firstChunk = true;
      let jitterBuffer = [];
      let bufferedBytes = 0;
      let oldestBufferedAt = 0;
      const flushJitterBuffer = function* () {
        yield* jitterBuffer;
        jitterBuffer = [];
        bufferedBytes = 0;
      };
      for await (const chunk of audioStream) {
        // Normalize Uint8Array/ArrayBuffer chunks to Buffer.
        const buf = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
        if (firstChunk) {
          firstChunk = false;
          const ttfb = Date.now() - startMs;
          logger14.debug({ ttfb, voiceId }, "ElevenLabs first audio chunk");
        }
        if (jitterBuffer.length === 0) oldestBufferedAt = Date.now();
        jitterBuffer.push(buf);
        bufferedBytes += buf.length;
        // Flush on size (~100ms of 24kHz s16le audio) or age of the buffer.
        if (bufferedBytes >= 4800 || Date.now() - oldestBufferedAt >= JITTER_BUFFER_MS) {
          yield* flushJitterBuffer();
        }
      }
      // Drain whatever is left when the upstream stream ends.
      yield* flushJitterBuffer();
      logger14.debug({ voiceId, totalMs: Date.now() - startMs }, "ElevenLabs TTS stream complete");
    } catch (err) {
      if (err.statusCode === 404) {
        throw new TTSVoiceNotFoundError("elevenlabs", voiceId);
      }
      throw new TTSStreamError("elevenlabs", err);
    }
  }
  /**
   * Synthesize full audio (for pre-caching greetings, IVR prompts).
   * Collects all streaming chunks into a single buffer.
   *
   * @param text Text to synthesize
   * @param config Per-call config overrides
   */
  async synthesizeFull(text, config) {
    const chunks = [];
    for await (const chunk of this.synthesizeStream(text, config)) {
      chunks.push(chunk);
    }
    return Buffer.concat(chunks);
  }
};
2235
var logger15 = pino({ name: "@voice-kit/core:tts:sarvam" });
var SARVAM_API_BASE2 = "https://api.sarvam.ai";
// Per-language default speakers for Sarvam's bulbul voices.
var DEFAULT_VOICES = {
  "hi-IN": "meera",
  "kn-IN": "pavithra",
  "ta-IN": "pavithra",
  "te-IN": "pavithra",
  "mr-IN": "meera",
  "bn-IN": "meera",
  "gu-IN": "meera",
  "pa-IN": "meera"
};
var SarvamTTSProvider = class {
  name = "sarvam";
  outputSampleRate = 22050;
  // Sarvam default
  outputFormat = "mp3";
  http;
  config;
  /**
   * Build a Sarvam-backed TTS provider for Indic languages.
   * Falls back to the SARVAM_API_KEY env var when no key is configured.
   * @throws {TTSConnectionError} if no API key can be resolved.
   */
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["SARVAM_API_KEY"];
    if (!apiKey) throw new TTSConnectionError("sarvam", new Error("SARVAM_API_KEY not set"));
    const targetLanguage = config.targetLanguage ?? "hi-IN";
    const defaultVoice = DEFAULT_VOICES[targetLanguage] ?? "meera";
    this.http = axios.create({
      baseURL: SARVAM_API_BASE2,
      headers: {
        "API-Subscription-Key": apiKey,
        "Content-Type": "application/json"
      },
      timeout: 15e3
    });
    this.config = {
      voiceId: config.voiceId ?? defaultVoice,
      sampleRate: config.sampleRate ?? 22050,
      speed: config.speed ?? 1,
      pitch: config.pitch ?? 0,
      apiKey,
      modelId: config.modelId ?? "bulbul:v1",
      emotion: "",
      targetLanguage
    };
  }
  /**
   * Synthesize text in an Indic language and stream audio chunks.
   * Sarvam returns full audio segments — we chunk them for streaming compatibility.
   *
   * @throws {TTSStreamError} when the API returns no audio or the request fails
   *
   * @example
   * ```ts
   * const tts = createTTS('sarvam', { targetLanguage: 'hi-IN' })
   * for await (const chunk of tts.synthesizeStream('नमस्ते, मैं आपकी कैसे मदद कर सकता हूँ?')) {
   *   telephony.sendAudio(chunk)
   * }
   * ```
   */
  async *synthesizeStream(text, config) {
    const startMs = Date.now();
    const targetLanguage = config?.targetLanguage ?? this.config.targetLanguage;
    const speaker = config?.voiceId ?? this.config.voiceId;
    logger15.debug({ targetLanguage, speaker, chars: text.length }, "Sarvam TTS request");
    try {
      const response = await this.http.post("/text-to-speech", {
        inputs: [text],
        target_language_code: targetLanguage,
        speaker,
        model: config?.modelId ?? this.config.modelId,
        pitch: config?.pitch ?? this.config.pitch,
        pace: config?.speed ?? this.config.speed,
        loudness: 1,
        speech_sample_rate: config?.sampleRate ?? this.config.sampleRate,
        enable_preprocessing: true
      });
      const audioSegments = response.data.audios;
      if (!audioSegments || audioSegments.length === 0) {
        throw new TTSStreamError("sarvam", new Error("No audio returned from Sarvam TTS"));
      }
      logger15.debug(
        { segments: audioSegments.length, latencyMs: Date.now() - startMs },
        "Sarvam TTS response received"
      );
      // Re-chunk each base64 segment so downstream consumers see a steady stream.
      for (const segment of audioSegments) {
        const buf = Buffer.from(segment, "base64");
        const CHUNK_SIZE = 4096;
        let offset = 0;
        while (offset < buf.length) {
          yield buf.subarray(offset, Math.min(offset + CHUNK_SIZE, buf.length));
          offset += CHUNK_SIZE;
        }
      }
    } catch (err) {
      // FIX: the "No audio returned" TTSStreamError thrown above lands in this
      // catch and was previously re-wrapped in a second TTSStreamError,
      // obscuring the original. Rethrow our own errors untouched.
      if (err instanceof TTSStreamError) {
        throw err;
      }
      if (axios.isAxiosError(err)) {
        throw new TTSStreamError(
          "sarvam",
          new Error(`Sarvam TTS API error: ${err.response?.status} ${JSON.stringify(err.response?.data)}`)
        );
      }
      throw new TTSStreamError("sarvam", err);
    }
  }
  /** Synthesize complete audio buffer. */
  async synthesizeFull(text, config) {
    const chunks = [];
    for await (const chunk of this.synthesizeStream(text, config)) {
      chunks.push(chunk);
    }
    return Buffer.concat(chunks);
  }
};
2343
+
2344
// src/tts/TTS-factory.ts
/**
 * Factory for TTS providers. Resolves the provider name to its class and
 * constructs it with the given (possibly empty) config.
 * @throws {Error} for an unrecognized provider name.
 */
function createTTS(provider, config) {
  const cfg = config ?? {};
  const registry = {
    elevenlabs: ElevenLabsTTSProvider,
    cartesia: CartesiaTTSProvider,
    sarvam: SarvamTTSProvider
  };
  // Object.hasOwn guards against inherited keys like "toString".
  if (!Object.hasOwn(registry, provider)) {
    throw new Error(`Unknown TTS provider: ${String(provider)}`);
  }
  const ProviderClass = registry[provider];
  return new ProviderClass(cfg);
}
2360
+
2361
+ export { AgentError, AgentHandoffError, AssemblyAISTTProvider, AudioPipeline, AudioTransportError, CallAuditLog, CallConnectionError, CallMetrics, CallNotFoundError, CallingHoursError, CartesiaTTSProvider, ComplianceError, ConsentMissingError, DNCBlockedError, DeepgramSTTProvider, ElevenLabsTTSProvider, InngestError, LanguageSwitchDetector, STTConnectionError, STTError, STTLanguageNotSupportedError, STTStreamError, SarvamSTTProvider, SarvamTTSProvider, TRAICompliance, TTSConnectionError, TTSError, TTSStreamError, TTSVoiceNotFoundError, TelephonyError, TurnTransitionError, VADEngine, VoiceKitError, VoiceSDKTracer, WhisperSTTProvider, base64MulawToPcm, createAudioPipeline, createCallMemory, createResamplerStream, createSTT, createTTS, createVAD, isInglish, linearToMulaw, mulawBufferToPcm, mulawToLinear, pcmBufferToMulaw, pcmToBase64Mulaw, resample, resampleStream };
261
2362
  //# sourceMappingURL=index.js.map
262
2363
  //# sourceMappingURL=index.js.map