@voice-kit/core 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/dist/index.cjs +2137 -0
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.cts +1466 -4
  4. package/dist/index.d.ts +1466 -4
  5. package/dist/index.js +2102 -1
  6. package/dist/index.js.map +1 -1
  7. package/package.json +1 -31
  8. package/dist/audio.cjs +0 -533
  9. package/dist/audio.cjs.map +0 -1
  10. package/dist/audio.d.cts +0 -260
  11. package/dist/audio.d.ts +0 -260
  12. package/dist/audio.js +0 -514
  13. package/dist/audio.js.map +0 -1
  14. package/dist/compliance.cjs +0 -343
  15. package/dist/compliance.cjs.map +0 -1
  16. package/dist/compliance.d.cts +0 -163
  17. package/dist/compliance.d.ts +0 -163
  18. package/dist/compliance.js +0 -335
  19. package/dist/compliance.js.map +0 -1
  20. package/dist/errors.cjs +0 -284
  21. package/dist/errors.cjs.map +0 -1
  22. package/dist/errors.d.cts +0 -100
  23. package/dist/errors.d.ts +0 -100
  24. package/dist/errors.js +0 -262
  25. package/dist/errors.js.map +0 -1
  26. package/dist/index-D3KfRXMP.d.cts +0 -319
  27. package/dist/index-D3KfRXMP.d.ts +0 -319
  28. package/dist/memory.cjs +0 -121
  29. package/dist/memory.cjs.map +0 -1
  30. package/dist/memory.d.cts +0 -29
  31. package/dist/memory.d.ts +0 -29
  32. package/dist/memory.js +0 -115
  33. package/dist/memory.js.map +0 -1
  34. package/dist/observability.cjs +0 -229
  35. package/dist/observability.cjs.map +0 -1
  36. package/dist/observability.d.cts +0 -122
  37. package/dist/observability.d.ts +0 -122
  38. package/dist/observability.js +0 -222
  39. package/dist/observability.js.map +0 -1
  40. package/dist/stt.cjs +0 -828
  41. package/dist/stt.cjs.map +0 -1
  42. package/dist/stt.d.cts +0 -308
  43. package/dist/stt.d.ts +0 -308
  44. package/dist/stt.js +0 -815
  45. package/dist/stt.js.map +0 -1
  46. package/dist/telephony.errors-BQYr6-vl.d.cts +0 -80
  47. package/dist/telephony.errors-C0-nScrF.d.ts +0 -80
  48. package/dist/tts.cjs +0 -429
  49. package/dist/tts.cjs.map +0 -1
  50. package/dist/tts.d.cts +0 -151
  51. package/dist/tts.d.ts +0 -151
  52. package/dist/tts.js +0 -418
  53. package/dist/tts.js.map +0 -1
package/dist/index.cjs CHANGED
@@ -1,5 +1,32 @@
1
1
  'use strict';
2
2
 
3
+ var ffmpeg = require('fluent-ffmpeg');
4
+ var stream = require('stream');
5
+ var pino = require('pino');
6
+ var events = require('events');
7
+ var lruCache = require('lru-cache');
8
+ var promises = require('fs/promises');
9
+ var axios = require('axios');
10
+ var libphonenumberJs = require('libphonenumber-js');
11
+ var sdkTraceNode = require('@opentelemetry/sdk-trace-node');
12
+ var exporterTraceOtlpHttp = require('@opentelemetry/exporter-trace-otlp-http');
13
+ var sdkTraceBase = require('@opentelemetry/sdk-trace-base');
14
+ var semanticConventions = require('@opentelemetry/semantic-conventions');
15
+ var api = require('@opentelemetry/api');
16
+ var resources = require('@opentelemetry/resources');
17
+ var assemblyai = require('assemblyai');
18
+ var sdk = require('@deepgram/sdk');
19
+ var openai = require('@ai-sdk/openai');
20
+ var Cartesia = require('@cartesia/cartesia-js');
21
+ var elevenlabs = require('elevenlabs');
22
+
23
+ function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
24
+
25
+ var ffmpeg__default = /*#__PURE__*/_interopDefault(ffmpeg);
26
+ var pino__default = /*#__PURE__*/_interopDefault(pino);
27
+ var axios__default = /*#__PURE__*/_interopDefault(axios);
28
+ var Cartesia__default = /*#__PURE__*/_interopDefault(Cartesia);
29
+
3
30
  // src/errors/base.ts
4
31
  var VoiceKitError = class extends Error {
5
32
  code;
@@ -259,26 +286,2136 @@ var TTSVoiceNotFoundError = class extends TTSError {
259
286
  }
260
287
  };
261
288
 
289
// src/audio/codec/index.ts
// Standard G.711 mu-law companding. Encoding works on a 14-bit magnitude
// (16-bit sample >> 2) with a bias of 33 so the segment scan always finds a
// leading bit; decoding is the exact inverse, scaled back up to 16-bit PCM.
// (Previously encode biased in the 16-bit domain while decode reconstructed
// in the 14-bit domain without rescaling, so decoded audio came out at ~1/4
// amplitude and the codebook was not invertible.)
var MULAW_BIAS = 33;
var MULAW_MAX = 32767;

/**
 * Decode one 8-bit mu-law byte to a 16-bit linear PCM sample.
 *
 * @param sample mu-law byte (0-255)
 * @returns signed 16-bit PCM value (full scale +/-32124, per G.711)
 */
function mulawToLinear(sample) {
  sample = ~sample & 255;
  const sign = sample & 128;
  const exponent = (sample >> 4) & 7;
  const mantissa = sample & 15;
  // Reconstruct the 14-bit segment midpoint, strip the bias, then shift
  // back up to 16-bit range.
  let linear = ((mantissa << 1) + MULAW_BIAS) << exponent;
  linear = (linear - MULAW_BIAS) << 2;
  return sign !== 0 ? -linear : linear;
}

/**
 * Encode a 16-bit linear PCM sample as an 8-bit mu-law byte.
 * Exact inverse of {@link mulawToLinear} up to quantization.
 *
 * @param sample signed PCM value; clamped to [-32768, 32767]
 * @returns mu-law byte (0-255)
 */
function linearToMulaw(sample) {
  sample = Math.max(-32768, Math.min(MULAW_MAX, sample));
  const sign = sample < 0 ? 128 : 0;
  if (sample < 0) sample = -sample;
  // Work in the 14-bit domain; the bias guarantees a set bit in the
  // segment range so the exponent scan below is well-defined.
  sample = (sample >> 2) + MULAW_BIAS;
  if (sample > 8191) sample = 8191;
  // Find the segment: highest set bit among bits 12..5.
  let exponent = 7;
  let expMask = 4096;
  for (; exponent > 0; exponent--) {
    if ((sample & expMask) !== 0) break;
    expMask >>= 1;
  }
  const mantissa = (sample >> (exponent + 1)) & 15;
  const mulaw = ~(sign | (exponent << 4) | mantissa) & 255;
  return mulaw;
}
317
/**
 * Decode a buffer of 8-bit mu-law bytes into 16-bit little-endian PCM.
 * Output length is exactly twice the input length.
 *
 * @param buf Buffer of mu-law bytes
 * @returns Buffer of s16le PCM samples
 */
function mulawBufferToPcm(buf) {
  const pcm = Buffer.allocUnsafe(buf.length * 2);
  let offset = 0;
  for (const byte of buf) {
    pcm.writeInt16LE(mulawToLinear(byte ?? 0), offset);
    offset += 2;
  }
  return pcm;
}
325
/**
 * Encode a buffer of 16-bit little-endian PCM samples as 8-bit mu-law.
 * Output is half the input length; a trailing odd byte is ignored.
 *
 * @param buf Buffer of s16le PCM samples
 * @returns Buffer of mu-law bytes
 */
function pcmBufferToMulaw(buf) {
  const sampleCount = buf.length >> 1;
  const out = Buffer.allocUnsafe(sampleCount);
  for (let s = 0; s < sampleCount; s++) {
    out[s] = linearToMulaw(buf.readInt16LE(s * 2));
  }
  return out;
}
334
/**
 * Decode a base64-encoded mu-law payload (e.g. a telephony media frame)
 * to 16-bit little-endian PCM.
 *
 * @param base64 Base64 string of mu-law bytes
 * @returns Buffer of s16le PCM samples
 */
function base64MulawToPcm(base64) {
  return mulawBufferToPcm(Buffer.from(base64, "base64"));
}
338
/**
 * Encode 16-bit little-endian PCM as a base64 mu-law payload.
 *
 * @param pcm Buffer of s16le PCM samples
 * @returns Base64 string of mu-law bytes
 */
function pcmToBase64Mulaw(pcm) {
  const mulaw = pcmBufferToMulaw(pcm);
  return mulaw.toString("base64");
}
341
/**
 * Resample a buffer of s16le mono PCM from `fromHz` to `toHz` by piping it
 * through an ffmpeg child process (fluent-ffmpeg).
 *
 * @param buf    s16le mono PCM at `fromHz`
 * @param fromHz input sample rate in Hz
 * @param toHz   output sample rate in Hz
 * @returns      s16le mono PCM at `toHz`; the original buffer unchanged when
 *               the rates already match
 * @throws AudioTransportError when ffmpeg or the output stream errors
 */
async function resample(buf, fromHz, toHz) {
  if (fromHz === toHz) return buf;
  return new Promise((resolve, reject) => {
    const chunks = [];
    // Wrap the whole payload in a Readable: the first read() pushes the
    // buffer followed immediately by EOF.
    const input = new stream.Readable({
      read() {
        this.push(buf);
        this.push(null);
      }
    });
    const output = new stream.PassThrough();
    output.on("data", (chunk) => chunks.push(chunk));
    output.on("end", () => resolve(Buffer.concat(chunks)));
    output.on(
      "error",
      (err) => reject(
        new AudioTransportError(
          "ffmpeg-resampler",
          err
        )
      )
    );
    // Raw (headerless) s16le mono in and out — only the rate changes.
    // Errors can surface on either the ffmpeg command or the output stream,
    // so both paths reject; extra settles after the first are no-ops.
    ffmpeg__default.default(input).inputOptions([
      "-f s16le",
      `-ar ${fromHz}`,
      "-ac 1"
    ]).outputOptions([
      "-f s16le",
      `-ar ${toHz}`,
      "-ac 1"
    ]).on(
      "error",
      (err) => reject(
        new AudioTransportError("ffmpeg-resampler", err)
      )
    ).pipe(output, { end: true });
  });
}
379
/**
 * Create a stream intended to resample s16le PCM from `fromHz` to `toHz`.
 *
 * NOTE(review): when `fromHz !== toHz` this still returns a bare
 * PassThrough — no resampler is ever wired up, so audio flows through at
 * the ORIGINAL rate. The two branches below are identical; the non-equal
 * path looks unimplemented. Callers needing real rate conversion should
 * use `resample`/`resampleStream` until this is fixed.
 *
 * @param fromHz input sample rate in Hz
 * @param toHz   output sample rate in Hz
 * @returns PassThrough stream (currently always an identity pipe)
 */
function createResamplerStream(fromHz, toHz) {
  const output = new stream.PassThrough();
  if (fromHz === toHz) {
    return output;
  }
  return output;
}
386
/**
 * Streamed wrapper around `resample`: buffers incoming PCM and converts it
 * in fixed 16,000-byte slices, flushing whatever remains when the source
 * ends. When the rates already match, chunks pass straight through.
 *
 * NOTE(review): each slice goes through an independent ffmpeg invocation,
 * so slice boundaries are not filter-continuous — confirm acceptable.
 *
 * @param audio  async iterable of s16le PCM buffers at `fromHz`
 * @param fromHz input sample rate in Hz
 * @param toHz   output sample rate in Hz
 */
async function* resampleStream(audio, fromHz, toHz) {
  if (fromHz === toHz) {
    yield* audio;
    return;
  }
  const SLICE_BYTES = 16e3;
  let buffered = Buffer.alloc(0);
  for await (const piece of audio) {
    buffered = Buffer.concat([buffered, piece]);
    while (buffered.length >= SLICE_BYTES) {
      const head = buffered.subarray(0, SLICE_BYTES);
      buffered = buffered.subarray(SLICE_BYTES);
      yield await resample(head, fromHz, toHz);
    }
  }
  if (buffered.length > 0) {
    yield await resample(buffered, fromHz, toHz);
  }
}
405
// Pipeline logger.
var logger = pino__default.default({ name: "@voice-kit/core:pipeline" });
/**
 * Per-provider audio profiles: the wire format and sample rate on the
 * telephony side, the 16 kHz rate fed to STT/VAD, and VAD tuning matched
 * to each provider's typical line quality.
 */
var AUDIO_PROFILES = {
  // Twilio media streams: 8 kHz mu-law in both directions.
  twilio: {
    inputFormat: "mulaw",
    inputSampleRate: 8e3,
    sttSampleRate: 16e3,
    outputSampleRate: 8e3,
    outputFormat: "mulaw",
    vadConfig: {
      threshold: 0.6,
      positiveSpeechFrames: 3,
      negativeSpeechFrames: 5,
      debounceMs: 150,
      sampleRate: 16e3
    }
  },
  exotel: {
    inputFormat: "mulaw",
    inputSampleRate: 8e3,
    sttSampleRate: 16e3,
    outputSampleRate: 8e3,
    outputFormat: "mulaw",
    vadConfig: {
      threshold: 0.55,
      // Exotel has slightly more background noise on IN PSTN
      positiveSpeechFrames: 3,
      negativeSpeechFrames: 6,
      debounceMs: 200,
      sampleRate: 16e3
    }
  },
  // Plivo: identical wire profile to Twilio.
  plivo: {
    inputFormat: "mulaw",
    inputSampleRate: 8e3,
    sttSampleRate: 16e3,
    outputSampleRate: 8e3,
    outputFormat: "mulaw",
    vadConfig: {
      threshold: 0.6,
      positiveSpeechFrames: 3,
      negativeSpeechFrames: 5,
      debounceMs: 150,
      sampleRate: 16e3
    }
  },
  // Telnyx: identical wire profile to Twilio.
  telnyx: {
    inputFormat: "mulaw",
    inputSampleRate: 8e3,
    sttSampleRate: 16e3,
    outputSampleRate: 8e3,
    outputFormat: "mulaw",
    vadConfig: {
      threshold: 0.6,
      positiveSpeechFrames: 3,
      negativeSpeechFrames: 5,
      debounceMs: 150,
      sampleRate: 16e3
    }
  },
  livekit: {
    inputFormat: "opus",
    // LiveKit delivers decoded PCM via SDK — we handle 48kHz
    inputSampleRate: 48e3,
    sttSampleRate: 16e3,
    outputSampleRate: 48e3,
    outputFormat: "opus",
    vadConfig: {
      threshold: 0.5,
      // Higher quality audio = can lower threshold
      positiveSpeechFrames: 2,
      negativeSpeechFrames: 4,
      debounceMs: 100,
      sampleRate: 16e3
    }
  },
  // Generic SIP trunk: assume PSTN-grade 8 kHz mu-law.
  sip: {
    inputFormat: "mulaw",
    inputSampleRate: 8e3,
    sttSampleRate: 16e3,
    outputSampleRate: 8e3,
    outputFormat: "mulaw",
    vadConfig: {
      threshold: 0.6,
      positiveSpeechFrames: 3,
      negativeSpeechFrames: 5,
      debounceMs: 150,
      sampleRate: 16e3
    }
  }
};
495
/**
 * Bridges provider-native telephony audio and the 16 kHz PCM world of
 * STT/VAD, and converts TTS output back to the provider's wire format.
 * Profiles come from AUDIO_PROFILES, keyed by provider name.
 */
var AudioPipeline = class {
  profile;
  provider;
  /**
   * @param provider Telephony provider key (must exist in AUDIO_PROFILES)
   * @throws Error when the provider has no audio profile — fail fast with a
   *         useful message instead of a TypeError on first property access.
   */
  constructor(provider) {
    const profile = AUDIO_PROFILES[provider];
    if (!profile) {
      throw new Error(
        `Unknown telephony provider "${provider}" — expected one of: ${Object.keys(AUDIO_PROFILES).join(", ")}`
      );
    }
    this.provider = provider;
    this.profile = profile;
    logger.debug(
      {
        provider,
        inputFormat: this.profile.inputFormat,
        inputSampleRate: this.profile.inputSampleRate,
        sttSampleRate: this.profile.sttSampleRate
      },
      "AudioPipeline initialized"
    );
  }
  /**
   * Transform incoming telephony audio to 16kHz PCM for STT.
   * Handles µ-law decode + resampling automatically.
   *
   * @param raw Raw audio bytes as received from telephony provider
   * @returns Async iterable of 16kHz PCM buffers for STT
   *
   * @internal
   */
  async *inboundForSTT(raw) {
    let decoded;
    if (this.profile.inputFormat === "mulaw") {
      decoded = this.decodeMulaw(raw);
    } else {
      // Non-mulaw providers (e.g. LiveKit) already deliver PCM.
      decoded = raw;
    }
    yield* resampleStream(decoded, this.profile.inputSampleRate, this.profile.sttSampleRate);
  }
  /**
   * Transform TTS output PCM to telephony-native format for sending to caller.
   * Handles resampling + µ-law encode automatically.
   *
   * @param ttsAudio Raw PCM from TTS provider (at TTS provider's native rate)
   * @param ttsSampleRate Native sample rate of the TTS provider
   * @returns Async iterable of audio bytes ready to send to telephony provider
   *
   * @internal
   */
  async *outboundFromTTS(ttsAudio, ttsSampleRate) {
    const resampled = resampleStream(
      ttsAudio,
      ttsSampleRate,
      this.profile.outputSampleRate
    );
    if (this.profile.outputFormat === "mulaw") {
      for await (const chunk of resampled) {
        yield pcmBufferToMulaw(chunk);
      }
    } else {
      yield* resampled;
    }
  }
  /** Get the VAD config tuned for this provider's audio quality. @internal */
  get vadConfig() {
    return this.profile.vadConfig;
  }
  /** Sample rate that STT expects (post-pipeline). @internal */
  get sttSampleRate() {
    return this.profile.sttSampleRate;
  }
  /** Async generator: decode µ-law stream to PCM. @internal */
  async *decodeMulaw(raw) {
    for await (const chunk of raw) {
      yield mulawBufferToPcm(chunk);
    }
  }
};
568
/**
 * Factory for an AudioPipeline bound to one telephony provider's profile.
 *
 * @param provider Telephony provider key (e.g. "twilio", "livekit")
 */
function createAudioPipeline(provider) {
  return new AudioPipeline(provider);
}
571
// VAD subsystem logger.
var logger2 = pino__default.default({ name: "@voice-kit/core:vad" });
// One analysis frame = 30 ms of 16 kHz audio: 480 samples, 960 bytes s16le.
var FRAME_SIZE_SAMPLES = 480;
var FRAME_SIZE_BYTES = FRAME_SIZE_SAMPLES * 2;
// Baseline tuning; AUDIO_PROFILES supplies per-provider overrides.
var VAD_DEFAULTS = {
  threshold: 0.6,
  // min model confidence for a frame to count as speech
  positiveSpeechFrames: 3,
  // consecutive speech frames before speech_start fires
  negativeSpeechFrames: 5,
  // consecutive silence frames before speech_end is scheduled
  debounceMs: 150,
  // delay before speech_end actually fires (absorbs short pauses)
  sampleRate: 16e3
};
581
/**
 * Voice-activity detector with frame hysteresis and debounced speech_end.
 *
 * State machine: consecutive frames above `threshold` trigger speech_start
 * after `positiveSpeechFrames`; consecutive frames below it schedule a
 * debounced speech_end after `negativeSpeechFrames`. Emits "frame" events
 * ({ type, confidence, audioBuffer, durationMs }) and "error".
 */
var VADEngine = class extends events.EventEmitter {
  config;
  // Running state
  isSpeaking = false;
  positiveFrameCount = 0;
  negativeFrameCount = 0;
  debounceTimer = null;
  // Carries partial (<30ms) audio between incoming chunks.
  frameBuffer = Buffer.alloc(0);
  // Silero VAD model — loaded lazily
  vadModel = null;
  constructor(config) {
    super();
    this.config = { ...VAD_DEFAULTS, ...config };
  }
  /**
   * Process an async stream of PCM audio frames.
   * Automatically frames the input into 30ms chunks for VAD processing.
   *
   * @param audio Async iterable of PCM buffers (16kHz, s16le, mono)
   */
  async processStream(audio) {
    try {
      await this.ensureModelLoaded();
      for await (const chunk of audio) {
        this.frameBuffer = Buffer.concat([this.frameBuffer, chunk]);
        // Drain complete 30ms frames; the remainder waits for more audio.
        while (this.frameBuffer.length >= FRAME_SIZE_BYTES) {
          const frame = this.frameBuffer.subarray(0, FRAME_SIZE_BYTES);
          this.frameBuffer = this.frameBuffer.subarray(FRAME_SIZE_BYTES);
          await this.processFrame(frame);
        }
      }
      // Stream ended mid-utterance: emit speech_end immediately,
      // bypassing the debounce timer.
      if (this.isSpeaking) {
        this.emitFrame("speech_end", 0, Buffer.alloc(0));
      }
    } catch (err) {
      const error = new AudioTransportError("vad", err);
      this.emit("error", error);
      throw error;
    }
  }
  /**
   * Process a single 30ms PCM frame through the VAD model.
   *
   * @internal
   */
  async processFrame(frame) {
    const confidence = await this.runVADInference(frame);
    const durationMs = FRAME_SIZE_SAMPLES / this.config.sampleRate * 1e3;
    if (confidence >= this.config.threshold) {
      this.positiveFrameCount++;
      this.negativeFrameCount = 0;
      if (this.isSpeaking) {
        this.emitFrame("speech", confidence, frame, durationMs);
      } else if (this.positiveFrameCount >= this.config.positiveSpeechFrames) {
        // Hysteresis satisfied: cancel any pending speech_end and start.
        this.clearDebounce();
        this.isSpeaking = true;
        this.emitFrame("speech_start", confidence, frame, durationMs);
        logger2.debug({ confidence, frames: this.positiveFrameCount }, "VAD: speech_start");
      }
    } else {
      this.negativeFrameCount++;
      this.positiveFrameCount = 0;
      if (this.isSpeaking) {
        // Still inside an utterance: low-confidence frames are forwarded
        // as "speech" until the debounce fires.
        this.emitFrame("speech", confidence, frame, durationMs);
        if (this.negativeFrameCount >= this.config.negativeSpeechFrames) {
          // isSpeaking stays true until the timer actually fires, so a
          // quick resumption of speech cancels the pending speech_end.
          this.scheduleDebounce(() => {
            this.isSpeaking = false;
            this.negativeFrameCount = 0;
            this.emitFrame("speech_end", confidence, Buffer.alloc(0), 0);
            logger2.debug({ confidence }, "VAD: speech_end");
          });
        }
      }
    }
  }
  /**
   * Run Silero VAD model inference on a single frame.
   * Returns confidence score 0–1.
   *
   * @internal
   */
  async runVADInference(frame) {
    if (!this.vadModel) throw new Error("VAD model not loaded");
    // Convert s16le to normalized float32 in [-1, 1).
    const samples = new Float32Array(FRAME_SIZE_SAMPLES);
    for (let i = 0; i < FRAME_SIZE_SAMPLES; i++) {
      samples[i] = frame.readInt16LE(i * 2) / 32768;
    }
    return this.vadModel.predict(samples);
  }
  // Emit one "frame" event. @internal
  emitFrame(type, confidence, audioBuffer, durationMs = 0) {
    const frame = { type, confidence, audioBuffer, durationMs };
    this.emit("frame", frame);
  }
  // (Re)arm the speech_end debounce; only one timer is ever pending. @internal
  scheduleDebounce(fn) {
    this.clearDebounce();
    this.debounceTimer = setTimeout(fn, this.config.debounceMs);
  }
  // Cancel any pending speech_end. @internal
  clearDebounce() {
    if (this.debounceTimer !== null) {
      clearTimeout(this.debounceTimer);
      this.debounceTimer = null;
    }
  }
  /**
   * Load the Silero VAD model if not already loaded.
   * @internal
   */
  async ensureModelLoaded() {
    if (this.vadModel) return;
    logger2.debug("Loading Silero VAD model...");
    try {
      // NOTE(review): MicVAD is the browser/microphone entry point of
      // @ricky0123/vad-web; in a Node process this import/new likely fails,
      // landing in the energy-based fallback below — confirm intended.
      const { MicVAD } = await import('@ricky0123/vad-web');
      const vad = await MicVAD.new();
      this.vadModel = new SileroVADAdapter(vad);
      logger2.info("VAD model loaded successfully");
    } catch (err) {
      logger2.warn({ err }, "VAD model load failed \u2014 falling back to silence-based detection");
      this.vadModel = new EnergyBasedVAD();
    }
  }
  /** Clean up resources. Call when the call ends. */
  destroy() {
    this.clearDebounce();
    this.removeAllListeners();
    this.frameBuffer = Buffer.alloc(0);
    this.vadModel = null;
  }
};
709
/**
 * Fallback detector used when the Silero model cannot be loaded: maps the
 * frame's RMS energy linearly onto a 0–1 pseudo-confidence, saturating at
 * 1.0 once RMS reaches `energyThreshold`.
 */
var EnergyBasedVAD = class {
  energyThreshold = 0.01;
  /**
   * @param samples Float32Array (or array) of samples in [-1, 1]
   * @returns pseudo-confidence in [0, 1]
   */
  async predict(samples) {
    let total = 0;
    for (let i = 0; i < samples.length; i++) {
      total += samples[i] * samples[i];
    }
    const rms = Math.sqrt(total / samples.length);
    return Math.min(1, rms / this.energyThreshold);
  }
};
720
/**
 * Create a VADEngine; partial config is merged over VAD_DEFAULTS by the
 * engine's constructor.
 *
 * @param config Optional VAD tuning overrides
 */
function createVAD(config) {
  return new VADEngine(config);
}
723
/**
 * Adapter exposing a loaded @ricky0123/vad-web instance behind the same
 * `predict(samples)` interface as EnergyBasedVAD.
 *
 * NOTE(review): despite holding the model in `this.vad`, predict() never
 * consults it — it returns an RMS-energy heuristic (scaled ×10, whereas
 * EnergyBasedVAD effectively scales ×100). Confirm whether real Silero
 * inference was intended here.
 */
var SileroVADAdapter = class {
  constructor(vad) {
    // Retained but currently unused by predict() below.
    this.vad = vad;
  }
  async predict(samples) {
    let sumSq = 0;
    for (const s of samples) {
      sumSq += s * s;
    }
    const rms = Math.sqrt(sumSq / samples.length);
    return Math.min(1, rms * 10);
  }
};
736
// Audit-log logger.
var logger3 = pino__default.default({ name: "@voice-kit/core:compliance:audit" });
/**
 * In-memory, append-only audit trail keyed by call id, with optional
 * best-effort persistence to a JSONL file. Entries are frozen on creation.
 */
var CallAuditLog = class {
  /** LRU: up to 10,000 calls × 200 entries each = 2M entries max */
  cache;
  filePath;
  constructor(options) {
    this.filePath = options?.filePath;
    this.cache = new lruCache.LRUCache({
      max: options?.maxCalls ?? 1e4,
      ttl: 4 * 60 * 60 * 1e3
      // 4 hours
    });
  }
  /**
   * Append an immutable audit entry for a call.
   *
   * @param callId The call identifier
   * @param type Audit event type
   * @param data Additional structured data
   */
  append(callId, type, data = {}) {
    const entry = Object.freeze({
      // Timestamp + random suffix keeps ids unique within a call.
      id: `${callId}-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`,
      callId,
      type,
      timestamp: /* @__PURE__ */ new Date(),
      data: Object.freeze({ ...data })
    });
    const existing = this.cache.get(callId) ?? [];
    this.cache.set(callId, [...existing, entry]);
    logger3.debug({ callId, type, entryId: entry.id }, "Audit entry appended");
    if (this.filePath) {
      // Fire-and-forget: a failed disk write is logged, never thrown.
      this.writeToFile(entry).catch(
        (err) => logger3.error({ err, callId, type }, "Audit file write failed")
      );
    }
    return entry;
  }
  /**
   * Get all audit entries for a call, in insertion order.
   *
   * @param callId The call identifier
   */
  getEntries(callId) {
    return Object.freeze(this.cache.get(callId) ?? []);
  }
  /**
   * Get entries of a specific type for a call.
   */
  getEntriesByType(callId, type) {
    return this.getEntries(callId).filter((e) => e.type === type);
  }
  /** Write entry to JSONL file. @internal */
  async writeToFile(entry) {
    if (!this.filePath) return;
    const line = JSON.stringify({
      ...entry,
      timestamp: entry.timestamp.toISOString()
    }) + "\n";
    await promises.appendFile(this.filePath, line, "utf-8");
  }
};
798
// TRAI compliance logger.
var logger4 = pino__default.default({ name: "@voice-kit/core:compliance:trai" });
// Placeholder DND registry endpoint; override via config.dncApiEndpoint.
var TRAI_DND_API_MOCK = "https://api.trai.gov.in/dnd/check";
var DEFAULTS = {
  disabled: false,
  timezone: "Asia/Kolkata",
  callingHoursStart: 9,
  // 9 AM IST (inclusive)
  callingHoursEnd: 21,
  // 9 PM IST (exclusive)
  dncApiEndpoint: TRAI_DND_API_MOCK
};
// DNC lookups cached for 24h; consent records honored for 180 days.
var DNC_CACHE_TTL_MS = 24 * 60 * 60 * 1e3;
var CONSENT_VALIDITY_MS = 180 * 24 * 60 * 60 * 1e3;
809
/**
 * TRAI (India) outbound-calling compliance: E.164 validation, DNC registry
 * lookup with caching, calling-hours enforcement, and consent tracking.
 * Non-Indian numbers and EMERGENCY calls bypass the DNC check.
 */
var TRAICompliance = class {
  config;
  http;
  /** DNC check results cached for 24 hours per number. */
  dncCache;
  /** Consent records cached for 180 days. */
  consentCache;
  constructor(config) {
    this.config = { ...DEFAULTS, ...config };
    this.dncCache = new lruCache.LRUCache({
      max: 1e5,
      ttl: DNC_CACHE_TTL_MS
    });
    this.consentCache = new lruCache.LRUCache({
      max: 5e4,
      ttl: CONSENT_VALIDITY_MS
    });
    this.http = axios__default.default.create({
      baseURL: this.config.dncApiEndpoint,
      timeout: 5e3,
      headers: { "Content-Type": "application/json" }
    });
  }
  /**
   * Check whether a call is permitted under TRAI rules.
   * Checks: valid E.164, DNC registry, calling hours.
   *
   * @param params Call permission check parameters
   * @throws DNCBlockedError if number is on DNC registry
   * @throws CallingHoursError if outside allowed calling hours
   * @throws ComplianceError if phone number is invalid
   *
   * @example
   * ```ts
   * const result = await trai.checkCallPermission({
   *   to: '+919876543210',
   *   purpose: 'TRANSACTIONAL',
   * })
   * if (!result.allowed) console.log(result.reason)
   * ```
   */
  async checkCallPermission(params) {
    // Kill switch: compliance disabled entirely via config.
    if (this.config.disabled) {
      return { allowed: true, fromCache: false };
    }
    if (!libphonenumberJs.isValidPhoneNumber(params.to)) {
      throw new ComplianceError({
        code: "COMPLIANCE_INVALID_NUMBER",
        message: `Invalid phone number: ${params.to}`,
        phoneNumber: params.to,
        retryable: false,
        severity: "low"
      });
    }
    const parsed = libphonenumberJs.parsePhoneNumberFromString(params.to);
    const isIndianNumber = parsed?.countryCallingCode === "91";
    // TRAI rules only govern +91 numbers.
    if (!isIndianNumber) {
      return { allowed: true, fromCache: false };
    }
    const scheduledAt = params.scheduledAt ?? /* @__PURE__ */ new Date();
    if (!this.isWithinCallingHours(scheduledAt)) {
      const timeStr = new Intl.DateTimeFormat("en-IN", {
        timeZone: this.config.timezone,
        hour: "2-digit",
        minute: "2-digit",
        hour12: false
      }).format(scheduledAt);
      throw new CallingHoursError(params.to, timeStr);
    }
    // Emergency calls skip the DNC registry entirely.
    if (params.purpose === "EMERGENCY") {
      return { allowed: true, fromCache: false };
    }
    const cacheKey = `${params.to}:${params.purpose}`;
    const cached = this.dncCache.get(cacheKey);
    if (cached) {
      logger4.debug({ to: params.to, purpose: params.purpose, allowed: cached.allowed }, "DNC cache hit");
      return { ...cached, fromCache: true };
    }
    const result = await this.fetchDNCStatus(params);
    // NOTE: blocked results are cached too, so the throw below repeats
    // from cache for 24h without re-hitting the API.
    this.dncCache.set(cacheKey, result);
    if (!result.allowed) {
      throw new DNCBlockedError(params.to);
    }
    return result;
  }
  /**
   * Check if the current time (or a given time) is within TRAI calling hours.
   * Allowed: 9:00 AM – 9:00 PM IST.
   * Uses Intl.DateTimeFormat only — no date-fns or dayjs dependency.
   *
   * @param at Time to check. Defaults to now.
   * @param timezone IANA timezone. Defaults to 'Asia/Kolkata'.
   *
   * @example
   * ```ts
   * trai.isWithinCallingHours() // Check now
   * trai.isWithinCallingHours(new Date()) // Explicit time
   * ```
   */
  isWithinCallingHours(at, timezone) {
    const tz = timezone ?? this.config.timezone;
    const date = at ?? /* @__PURE__ */ new Date();
    const parts = new Intl.DateTimeFormat("en-IN", {
      timeZone: tz,
      hour: "numeric",
      hour12: false
    }).formatToParts(date);
    const hourPart = parts.find((p) => p.type === "hour");
    const hour = parseInt(hourPart?.value ?? "0", 10);
    // Start inclusive, end exclusive: 9 → allowed, 21 → blocked.
    return hour >= this.config.callingHoursStart && hour < this.config.callingHoursEnd;
  }
  /**
   * Record explicit consent from a user for future calls.
   * Consent is valid for 180 days per TRAI guidelines.
   *
   * NOTE(review): if the number fails to parse, `normalized` is undefined
   * and the record is cached under the key `undefined` — confirm whether
   * invalid numbers should be rejected here instead.
   *
   * @param params Consent record details
   *
   * @example
   * ```ts
   * await trai.recordConsent({
   *   phoneNumber: '+919876543210',
   *   consentedAt: new Date(),
   *   channel: 'ivr',
   *   purpose: 'PROMOTIONAL',
   * })
   * ```
   */
  async recordConsent(params) {
    const normalized = libphonenumberJs.parsePhoneNumberFromString(params.phoneNumber)?.format("E.164");
    this.consentCache.set(normalized, params);
    logger4.info(
      { phoneNumber: normalized, purpose: params.purpose, channel: params.channel },
      "Consent recorded"
    );
  }
  /**
   * Check if a number has valid (non-expired) consent on record.
   *
   * @param phoneNumber E.164 phone number
   * @returns True if valid consent exists
   */
  async hasValidConsent(phoneNumber) {
    let normalized;
    try {
      // parsePhoneNumberFromString returns undefined rather than throwing
      // for most bad input; the catch is belt-and-braces.
      normalized = libphonenumberJs.parsePhoneNumberFromString(phoneNumber)?.format("E.164");
    } catch {
      return false;
    }
    const record = this.consentCache.get(normalized);
    if (!record) return false;
    // Explicit age check on top of the cache TTL.
    const ageMs = Date.now() - record.consentedAt.getTime();
    return ageMs < CONSENT_VALIDITY_MS;
  }
  /**
   * Fetch DNC status from TRAI DND API.
   * Fails open (allows the call) when the API errors or returns 404.
   * @internal
   */
  async fetchDNCStatus(params) {
    try {
      logger4.debug({ to: params.to, purpose: params.purpose }, "Fetching DNC status from TRAI");
      const response = await this.http.post("", {
        phone: params.to,
        type: params.purpose
      });
      const result = {
        allowed: !response.data.registered,
        reason: response.data.registered ? `Number is registered on DNC for category: ${response.data.category ?? "ALL"}` : void 0,
        cachedAt: /* @__PURE__ */ new Date(),
        fromCache: false
      };
      logger4.info({ to: params.to, allowed: result.allowed }, "DNC status fetched");
      return result;
    } catch (err) {
      // 404 = number not found in the registry → not on DNC.
      if (axios__default.default.isAxiosError(err) && err.response?.status === 404) {
        return { allowed: true, cachedAt: /* @__PURE__ */ new Date(), fromCache: false };
      }
      logger4.error({ err, to: params.to }, "TRAI DNC API unavailable \u2014 failing open");
      return {
        allowed: true,
        reason: "DNC check unavailable \u2014 failing open",
        cachedAt: /* @__PURE__ */ new Date(),
        fromCache: false
      };
    }
  }
};
995
// Call-memory logger.
var logger5 = pino__default.default({ name: "@voice-kit/core:memory" });
// Default windowing for per-call conversation memory.
var DEFAULTS2 = {
  maxTurns: 20,
  maxBytes: 512e3,
  // 512KB
  ttlMs: 30 * 6e4
  // 30 minutes
};
1003
/**
 * Rough LLM token estimate for a message list using the ~4 chars/token
 * heuristic. Non-string content is measured via its JSON serialization.
 *
 * @param messages Array of { content } messages
 * @returns estimated token count (ceiling of total chars / 4)
 */
function estimateTokens(messages) {
  const totalChars = messages.reduce((sum, msg) => {
    const text = typeof msg.content === "string" ? msg.content : JSON.stringify(msg.content);
    return sum + text.length;
  }, 0);
  return Math.ceil(totalChars / 4);
}
1011
/**
 * Approximate serialized size of a message list in bytes.
 * Uses Buffer.byteLength so multi-byte UTF-8 characters are counted
 * correctly — String#length counts UTF-16 code units, not bytes, and
 * would undercount non-ASCII transcripts.
 *
 * @param messages Array of messages
 * @returns UTF-8 byte length of the JSON serialization
 */
function estimateBytes(messages) {
  return Buffer.byteLength(JSON.stringify(messages), "utf8");
}
1014
/**
 * Sliding-window conversation memory per call, backed by an LRU cache.
 * Windows are bounded three ways: turn count, serialized byte size, and
 * (on demand) an LLM token budget. Idle calls expire after `ttlMs`.
 */
var LRUCallMemory = class {
  cache;
  config;
  constructor(config) {
    this.config = config;
    this.cache = new lruCache.LRUCache({
      max: 1e3,
      // max concurrent calls in memory
      ttl: config.ttlMs,
      updateAgeOnGet: true
      // reset TTL on access (active calls stay warm)
    });
  }
  /**
   * Add a turn to the call's conversation window.
   * Automatically trims oldest turns when maxTurns or maxBytes is exceeded.
   *
   * @param callId The call identifier
   * @param message ModelMessage to append
   */
  addTurn(callId, message) {
    const existing = this.cache.get(callId) ?? [];
    const updated = [...existing, message];
    // Pass 1: enforce the turn-count cap (drop from the front).
    const trimmed = updated.length > this.config.maxTurns ? updated.slice(updated.length - this.config.maxTurns) : updated;
    // Pass 2: enforce the byte cap, always keeping at least one turn.
    let bytesTrimmed = trimmed;
    while (bytesTrimmed.length > 1 && estimateBytes(bytesTrimmed) > this.config.maxBytes) {
      bytesTrimmed = bytesTrimmed.slice(1);
    }
    this.cache.set(callId, bytesTrimmed);
    logger5.debug(
      { callId, turns: bytesTrimmed.length, bytes: estimateBytes(bytesTrimmed) },
      "Memory: turn added"
    );
  }
  /**
   * Get all turns for a call.
   *
   * @param callId The call identifier
   * @returns Array of ModelMessage (empty if call not found)
   */
  getTurns(callId) {
    return this.cache.get(callId) ?? [];
  }
  /**
   * Clear all turns for a call. Call this on call.ended to free memory.
   *
   * @param callId The call identifier
   */
  clearCall(callId) {
    this.cache.delete(callId);
    logger5.debug({ callId }, "Memory: call cleared");
  }
  /**
   * Estimate the number of LLM tokens used by a call's history.
   *
   * @param callId The call identifier
   */
  getTokenEstimate(callId) {
    const messages = this.cache.get(callId) ?? [];
    return estimateTokens(messages);
  }
  /**
   * Trim oldest turns to stay within a token budget.
   * Called by VoiceAgent before each LLM call to prevent context overflow.
   *
   * @param callId The call identifier
   * @param maxTokens Maximum tokens to retain
   */
  trimToTokenBudget(callId, maxTokens) {
    let messages = this.cache.get(callId) ?? [];
    // Drop from the front (oldest first); always keep the latest turn.
    while (messages.length > 1 && estimateTokens(messages) > maxTokens) {
      messages = messages.slice(1);
    }
    this.cache.set(callId, messages);
    logger5.debug(
      { callId, turns: messages.length, estimatedTokens: estimateTokens(messages) },
      "Memory: trimmed to token budget"
    );
  }
};
1094
/**
 * Build an LRUCallMemory, filling any missing options from DEFAULTS2.
 *
 * @param config Optional partial memory config (maxTurns, maxBytes, ttlMs)
 */
function createCallMemory(config) {
  return new LRUCallMemory({
    maxTurns: config?.maxTurns ?? DEFAULTS2.maxTurns,
    maxBytes: config?.maxBytes ?? DEFAULTS2.maxBytes,
    ttlMs: config?.ttlMs ?? DEFAULTS2.ttlMs
  });
}
1102
// Metrics logger.
var logger6 = pino__default.default({ name: "@voice-kit/core:metrics" });
// USD cost per 1M tokens by model, input/output priced separately.
// NOTE(review): point-in-time price snapshots — verify before any billing use.
var TOKEN_COSTS_PER_M = {
  "gpt-4o": { input: 5, output: 15 },
  "gpt-4o-mini": { input: 0.15, output: 0.6 },
  "claude-3-5-sonnet": { input: 3, output: 15 },
  "llama-3.3-70b": { input: 0.59, output: 0.79 }
};
1109
/**
 * 95th percentile of a sample list: the element at index floor(n * 0.95)
 * of the ascending sort. Returns 0 for an empty list; never mutates input.
 *
 * @param values numeric samples
 */
function p95(values) {
  if (values.length === 0) return 0;
  const ascending = Array.from(values).sort((x, y) => x - y);
  const rank = Math.min(Math.floor(ascending.length * 0.95), ascending.length - 1);
  return ascending[rank] ?? 0;
}
1115
/**
 * Arithmetic mean of a sample list; 0 for an empty list.
 *
 * @param values numeric samples
 */
function avg(values) {
  if (values.length === 0) return 0;
  let total = 0;
  for (const v of values) {
    total += v;
  }
  return total / values.length;
}
1119
/**
 * In-memory, per-call metrics aggregator for the voice pipeline.
 *
 * Latency samples (STT/TTS first-byte, LLM first-token, turn latency),
 * interruption stats, and token-cost entries are kept per callId in a
 * bounded LRU cache (10k calls, 2h TTL) so entries for abandoned calls
 * age out automatically. Call clearCall() on call.ended to free eagerly.
 */
var CallMetrics = class {
  store;
  constructor() {
    this.store = new lruCache.LRUCache({
      max: 1e4,
      // Entries expire 2 hours after last touch.
      ttl: 2 * 60 * 60 * 1e3
    });
  }
  // Fetch the live metrics record for a call, creating an empty one on first use.
  getOrCreate(callId) {
    let data = this.store.get(callId);
    if (!data) {
      data = {
        sttFirstByteMs: [],
        ttsFirstByteMs: [],
        llmFirstTokenMs: [],
        turnLatencyMs: [],
        interruptionCount: 0,
        interruptionPositions: [],
        tokenCost: []
      };
      this.store.set(callId, data);
    }
    return data;
  }
  /** Record time from audio start to first STT partial result. */
  recordSTTFirstByte(callId, ms) {
    this.getOrCreate(callId).sttFirstByteMs.push(ms);
    logger6.debug({ callId, ms }, "Metric: STT TTFB");
  }
  /** Record time from TTS request to first audio chunk. */
  recordTTSFirstByte(callId, ms) {
    this.getOrCreate(callId).ttsFirstByteMs.push(ms);
    logger6.debug({ callId, ms }, "Metric: TTS TTFB");
  }
  /** Record time from LLM request to first token. */
  recordLLMFirstToken(callId, ms) {
    this.getOrCreate(callId).llmFirstTokenMs.push(ms);
    logger6.debug({ callId, ms }, "Metric: LLM first token");
  }
  /**
   * Record end-to-end turn latency: speech_end → first TTS audio byte.
   * This is the primary latency metric for voice agent quality.
   */
  recordTurnLatency(callId, ms) {
    this.getOrCreate(callId).turnLatencyMs.push(ms);
    logger6.debug({ callId, ms }, "Metric: turn latency");
  }
  /**
   * Record an interruption event.
   *
   * @param callId Call identifier
   * @param positionPct 0–1, how far through the TTS stream the interruption occurred
   */
  recordInterruption(callId, positionPct) {
    const record = this.getOrCreate(callId);
    record.interruptionCount += 1;
    record.interruptionPositions.push(positionPct);
    logger6.debug({ callId, positionPct }, "Metric: interruption");
  }
  /** Record token usage and estimated cost for a model call. */
  recordTokenCost(callId, model, inputTokens, outputTokens) {
    // Unknown models cost $0 rather than failing the metric write.
    const pricing = TOKEN_COSTS_PER_M[model] ?? { input: 0, output: 0 };
    const estimatedUsdCost = inputTokens / 1e6 * pricing.input + outputTokens / 1e6 * pricing.output;
    const entry = { model, inputTokens, outputTokens, estimatedUsdCost };
    this.getOrCreate(callId).tokenCost.push(entry);
    logger6.debug({ callId, model, inputTokens, outputTokens, estimatedUsdCost }, "Metric: token cost");
  }
  /**
   * Get a full summary of metrics for a call.
   *
   * @param callId The call identifier
   * @returns Aggregated metrics summary (arrays are defensive copies)
   */
  getCallSummary(callId) {
    const data = this.getOrCreate(callId);
    return {
      callId,
      sttFirstByteMs: data.sttFirstByteMs.slice(),
      ttsFirstByteMs: data.ttsFirstByteMs.slice(),
      llmFirstTokenMs: data.llmFirstTokenMs.slice(),
      turnLatencyMs: data.turnLatencyMs.slice(),
      interruptionCount: data.interruptionCount,
      interruptionPositions: data.interruptionPositions.slice(),
      tokenCost: data.tokenCost.slice(),
      avgTurnLatencyMs: Math.round(avg(data.turnLatencyMs)),
      p95TurnLatencyMs: Math.round(p95(data.turnLatencyMs))
    };
  }
  /** Remove metrics for a call. Call on call.ended to free memory. */
  clearCall(callId) {
    this.store.delete(callId);
  }
};
// Logger + lazily-created singleton OpenTelemetry tracer provider.
var logger7 = pino__default.default({ name: "@voice-kit/core:observability" });
// Module-level singleton; all VoiceSDKTracer instances share one provider.
var _provider = null;
/**
 * Create (once) and register the global NodeTracerProvider.
 *
 * Spans are exported via OTLP/HTTP only when OTEL_EXPORTER_OTLP_ENDPOINT is
 * set; otherwise the provider is registered with no span processors, so
 * tracing is effectively a no-op.
 *
 * @returns The shared NodeTracerProvider instance
 */
function getOrInitProvider() {
  if (_provider) return _provider;
  const endpoint = process.env["OTEL_EXPORTER_OTLP_ENDPOINT"];
  _provider = new sdkTraceNode.NodeTracerProvider({
    resource: resources.resourceFromAttributes({
      [semanticConventions.ATTR_SERVICE_NAME]: "voice-kit"
    }),
    // Pass span processors directly in constructor — addSpanProcessor doesn't exist in this version
    spanProcessors: endpoint ? [new sdkTraceBase.SimpleSpanProcessor(new exporterTraceOtlpHttp.OTLPTraceExporter({ url: endpoint }))] : []
  });
  if (endpoint) {
    logger7.info({ endpoint }, "OTel OTLP exporter configured");
  }
  // Registers this provider as the global tracer provider for @opentelemetry/api.
  _provider.register();
  return _provider;
}
/**
 * High-level tracing facade over OpenTelemetry for voice pipeline spans
 * (STT, TTS, LLM, whole calls, and individual turns). Ensures the shared
 * tracer provider is initialized before acquiring a tracer.
 */
var VoiceSDKTracer = class {
  tracer;
  constructor() {
    getOrInitProvider();
    this.tracer = api.trace.getTracer("@voice-kit/core", "0.1.0");
  }
  /**
   * Trace an STT operation with provider + language attributes.
   */
  async traceSTT(fn, attrs) {
    const attributes = {
      "stt.provider": attrs.provider,
      "stt.language": attrs.language
    };
    if (attrs.callId) attributes["call.id"] = attrs.callId;
    return this.withSpan(`stt.${attrs.provider}`, fn, attributes);
  }
  /**
   * Trace a TTS synthesis operation.
   */
  async traceTTS(fn, attrs) {
    const attributes = {
      "tts.provider": attrs.provider,
      "tts.voice_id": attrs.voice,
      "tts.char_count": attrs.chars
    };
    if (attrs.callId) attributes["call.id"] = attrs.callId;
    return this.withSpan(`tts.${attrs.provider}`, fn, attributes);
  }
  /**
   * Trace an LLM generation call.
   */
  async traceLLM(fn, attrs) {
    const attributes = {
      "llm.model": attrs.model,
      "llm.input_tokens": attrs.inputTokens
    };
    if (attrs.callId) attributes["call.id"] = attrs.callId;
    return this.withSpan(`llm.${attrs.model}`, fn, attributes);
  }
  /**
   * Trace a full call lifecycle.
   */
  async traceCall(fn, attrs) {
    return this.withSpan("call", fn, {
      "call.id": attrs.callId,
      "call.direction": attrs.direction
    });
  }
  /**
   * Trace a single conversation turn.
   */
  async traceTurn(fn, attrs) {
    return this.withSpan("turn", fn, {
      "turn.index": attrs.turnIndex,
      "call.id": attrs.callId
    });
  }
  /**
   * Generic span wrapper: times fn, records status and any exception,
   * and always ends the span. @internal
   */
  async withSpan(name, fn, attributes) {
    const span = this.tracer.startSpan(name, { attributes });
    const startedAt = Date.now();
    try {
      const result = await fn();
      span.setStatus({ code: api.SpanStatusCode.OK });
      span.setAttribute("duration_ms", Date.now() - startedAt);
      return result;
    } catch (err) {
      // Normalize non-Error throw values for the span record; rethrow the original.
      const failure = err instanceof Error ? err : new Error(String(err));
      span.setStatus({ code: api.SpanStatusCode.ERROR, message: failure.message });
      span.recordException(failure);
      throw err;
    } finally {
      span.end();
    }
  }
};
// Logger for the AssemblyAI batch STT provider.
var logger8 = pino__default.default({ name: "@voice-kit/core:stt:assemblyai" });
// Language codes this provider advertises for AssemblyAI transcription.
// NOTE(review): snapshot list — confirm against AssemblyAI's current docs.
var SUPPORTED_LANGUAGES = [
  "en",
  "en_au",
  "en_uk",
  "en_us",
  "hi",
  "fr",
  "de",
  "es",
  "it",
  "pt",
  "nl",
  "ja",
  "zh"
];
/**
 * AssemblyAI speech-to-text provider (batch only — no realtime socket).
 * Requires an API key via config.apiKey or the ASSEMBLYAI_API_KEY env var.
 */
var AssemblyAISTTProvider = class {
  name = "assemblyai";
  supportsStreaming = false;
  supportedLanguages = SUPPORTED_LANGUAGES;
  client;
  config;
  /**
   * @param config Provider config; missing apiKey falls back to env.
   * @throws STTConnectionError when no API key is available
   */
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["ASSEMBLYAI_API_KEY"];
    if (!apiKey) throw new STTConnectionError("assemblyai", new Error("ASSEMBLYAI_API_KEY not set"));
    this.client = new assemblyai.AssemblyAI({ apiKey });
    this.config = {
      language: config.language ?? "en",
      alternateLanguages: config.alternateLanguages ?? [],
      apiKey,
      model: config.model ?? "best",
      wordTimestamps: config.wordTimestamps ?? true,
      // Forced off: this provider is batch-only, so interim results never apply.
      interimResults: false,
      smartFormat: config.smartFormat ?? true,
      region: ""
    };
  }
  /**
   * Batch-transcribes collected audio. AssemblyAI has no realtime streaming.
   * Collects all audio from the iterable, uploads, then polls for result.
   *
   * @param audio Async iterable of PCM buffers
   */
  async *transcribeStream(audio) {
    // Drain the entire stream into memory before making a single batch request.
    const chunks = [];
    for await (const chunk of audio) chunks.push(chunk);
    const result = await this.transcribeBatch(Buffer.concat(chunks));
    yield result;
  }
  /**
   * Upload audio to AssemblyAI and wait for async transcription.
   * Suitable for call recordings. Average latency: 15–45s per minute of audio.
   *
   * @param audio Raw WAV/PCM/MP3 buffer
   * @returns Final transcription result (optionally with word timestamps)
   * @throws STTStreamError when the API reports an error or the request fails
   *
   * @example
   * ```ts
   * const stt = createSTT('assemblyai', { wordTimestamps: true })
   * const result = await stt.transcribeBatch(recordingBuffer)
   * console.log(result.words) // Word-level timestamps
   * ```
   */
  async transcribeBatch(audio) {
    const startMs = Date.now();
    try {
      logger8.debug({ bytes: audio.length, language: this.config.language }, "AssemblyAI transcription started");
      const transcript = await this.client.transcripts.transcribe({
        audio,
        language_code: this.config.language,
        speech_model: this.config.model,
        punctuate: this.config.smartFormat,
        format_text: this.config.smartFormat,
        word_boost: [],
        // NOTE(review): `timestamps` is not an obviously documented transcribe
        // option (word objects are returned by default) — confirm against the SDK.
        ...this.config.wordTimestamps && { timestamps: true }
      });
      if (transcript.status === "error") {
        throw new STTStreamError("assemblyai", new Error(transcript.error ?? "Transcription failed"));
      }
      logger8.info(
        { id: transcript.id, duration: transcript.audio_duration, latencyMs: Date.now() - startMs },
        "AssemblyAI transcription complete"
      );
      return {
        transcript: transcript.text ?? "",
        isFinal: true,
        // 0.9 is an assumed default when the API omits confidence.
        confidence: transcript.confidence ?? 0.9,
        language: this.config.language,
        languageSwitchDetected: false,
        words: this.config.wordTimestamps && transcript.words ? transcript.words.map((w) => ({
          word: w.text,
          startMs: w.start,
          endMs: w.end,
          confidence: w.confidence
        })) : void 0,
        latencyMs: Date.now() - startMs
      };
    } catch (err) {
      // Preserve typed errors; wrap anything else in a provider-scoped error.
      if (err instanceof STTStreamError) throw err;
      throw new STTStreamError("assemblyai", err);
    }
  }
};
// Logger for the Deepgram streaming STT provider.
var logger9 = pino__default.default({ name: "@voice-kit/core:stt:deepgram" });
// Locales this provider advertises for Deepgram live transcription.
var SUPPORTED_LANGUAGES2 = [
  "en-IN",
  "hi-IN",
  "ta-IN",
  "te-IN",
  "kn-IN",
  "mr-IN",
  "en-US",
  "en-GB",
  "en-AU"
];
/** Reconnect/backoff policy for the Deepgram live socket. */
var BACKOFF = {
  baseMs: 100,
  maxMs: 5e3,
  jitterPct: 0.2,
  maxAttempts: 3
};
/**
 * Exponential backoff with symmetric jitter:
 * delay = min(baseMs * 2^attempt, maxMs) ± jitterPct, rounded to whole ms.
 *
 * @param attempt Zero-based retry attempt number
 * @returns Delay in milliseconds before the next attempt
 */
function backoffDelay(attempt) {
  const capped = Math.min(BACKOFF.baseMs * 2 ** attempt, BACKOFF.maxMs);
  const spread = capped * BACKOFF.jitterPct;
  return Math.round(capped + spread * (Math.random() * 2 - 1));
}
/**
 * Deepgram speech-to-text provider with realtime WebSocket streaming and a
 * pre-recorded batch path. Reconnects with exponential backoff (see BACKOFF).
 * Requires an API key via config.apiKey or the DEEPGRAM_API_KEY env var.
 */
var DeepgramSTTProvider = class {
  name = "deepgram";
  supportsStreaming = true;
  supportedLanguages = SUPPORTED_LANGUAGES2;
  client;
  config;
  /**
   * @param config Provider config; missing apiKey falls back to env.
   * @throws STTConnectionError when no API key is available
   */
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["DEEPGRAM_API_KEY"];
    if (!apiKey) throw new STTConnectionError("deepgram", new Error("DEEPGRAM_API_KEY not set"));
    this.client = new sdk.DeepgramClient({ apiKey });
    this.config = {
      language: config.language ?? "en-IN",
      alternateLanguages: config.alternateLanguages ?? [],
      apiKey,
      // nova-3 is now Deepgram's latest recommended model
      model: config.model ?? "nova-3",
      wordTimestamps: config.wordTimestamps ?? false,
      interimResults: config.interimResults ?? true,
      smartFormat: config.smartFormat ?? true,
      region: config.region ?? ""
    };
  }
  /**
   * Stream audio to Deepgram and receive interim + final transcription results.
   * Handles reconnection transparently with exponential backoff.
   *
   * NOTE(review): on reconnect the `audio` iterable has already been partially
   * consumed by the previous attempt — a second for-await over the same
   * iterator will not replay earlier audio. Verify reconnection semantics.
   *
   * @param audio Async iterable of 16kHz PCM buffers from AudioPipeline
   *
   * @example
   * ```ts
   * const stt = createSTT('deepgram', { language: 'hi-IN' })
   * for await (const result of stt.transcribeStream(audioIterable)) {
   *   if (result.isFinal) console.log('User said:', result.transcript)
   * }
   * ```
   */
  async *transcribeStream(audio) {
    let attempt = 0;
    const startMs = Date.now();
    while (attempt <= BACKOFF.maxAttempts) {
      const connection = await this.connectWithRetry(attempt);
      // Shared state between the socket handlers (producer) and the yield loop
      // (consumer): results buffer, close flag, and last stream error.
      const results = [];
      let done = false;
      let error = null;
      connection.on("message", (data) => {
        if (data.type !== "Results") return;
        const alt = data.channel?.alternatives?.[0];
        if (!alt?.transcript) return;
        const isFinal = data.is_final === true;
        const result = {
          transcript: alt.transcript,
          isFinal,
          // speech_final=true means Deepgram detected end-of-utterance (endpointing).
          // A frame can be speech_final without is_final — callers should act on both.
          confidence: alt.confidence ?? 0,
          // alt.languages populated when detect_language is enabled
          language: alt.languages?.[0] ?? this.config.language,
          languageSwitchDetected: false,
          words: this.config.wordTimestamps ? alt.words?.map((w) => ({
            word: w.word ?? "",
            startMs: (w.start ?? 0) * 1e3,
            endMs: (w.end ?? 0) * 1e3,
            confidence: w.confidence ?? 0,
            punctuatedWord: w.punctuated_word
          })) : void 0,
          latencyMs: Date.now() - startMs
        };
        results.push(result);
        if (isFinal) {
          logger9.debug(
            { transcript: result.transcript, confidence: result.confidence, language: result.language },
            "Deepgram final transcript"
          );
        }
      });
      connection.on("close", () => {
        done = true;
      });
      connection.on("error", (err) => {
        error = err;
        logger9.warn({ err, attempt }, "Deepgram stream error");
      });
      // Producer: pump caller audio into the socket, then ask Deepgram to flush.
      const sendAudio = async () => {
        try {
          for await (const chunk of audio) {
            connection.socket.send(chunk);
          }
          connection.socket.send(JSON.stringify({ type: "Finalize" }));
        } catch (err) {
          error = err instanceof Error ? err : new Error(String(err));
        }
      };
      const sendPromise = sendAudio();
      // Consumer: yield buffered results as the message handler appends them,
      // polling on a 10ms tick while the socket is open.
      let resultIndex = 0;
      while (!done || resultIndex < results.length) {
        if (resultIndex < results.length) {
          yield results[resultIndex++];
        } else {
          await new Promise((r) => setTimeout(r, 10));
        }
        // Retries remain: tear down this socket and fall through to reconnect.
        if (error && attempt < BACKOFF.maxAttempts) {
          try {
            connection.socket.close();
          } catch {
          }
          break;
        }
        // Retries exhausted: surface the stream error to the caller.
        if (error && attempt >= BACKOFF.maxAttempts) {
          await sendPromise.catch(() => {
          });
          throw new STTStreamError("deepgram", error);
        }
      }
      await sendPromise.catch(() => {
      });
      // Clean close (no error): the stream ended normally.
      if (!error) return;
      attempt++;
      await new Promise((r) => setTimeout(r, backoffDelay(attempt)));
      logger9.info({ attempt }, "Deepgram reconnecting...");
    }
    throw new STTStreamError("deepgram", new Error("Max reconnect attempts exceeded"));
  }
  /**
   * Transcribe a complete audio buffer (non-streaming).
   * Uses Deepgram pre-recorded API.
   *
   * @param audio Raw PCM or WAV buffer
   * @throws STTStreamError when the request fails
   */
  async transcribeBatch(audio) {
    const startMs = Date.now();
    try {
      const response = await this.client.listen.v1.media.transcribeFile(
        audio,
        {
          model: this.config.model,
          language: this.config.language,
          // v5: boolean-like options must be strings
          smart_format: true,
          diarize: false
        }
      );
      // First alternative of the first channel is the primary transcript.
      const alt = response?.results?.channels?.[0]?.alternatives?.[0];
      return {
        transcript: alt?.transcript ?? "",
        isFinal: true,
        confidence: alt?.confidence ?? 0,
        language: this.config.language,
        languageSwitchDetected: false,
        latencyMs: Date.now() - startMs
      };
    } catch (err) {
      if (err instanceof STTStreamError) throw err;
      throw new STTStreamError("deepgram", err instanceof Error ? err : new Error(String(err)));
    }
  }
  /**
   * Create and open a live WebSocket connection to Deepgram.
   *
   * v5 connection lifecycle (3 explicit steps):
   * 1. await listen.v1.connect(options) — constructs the connection object
   * 2. connection.connect() — initiates the WebSocket handshake
   * 3. await connection.waitForOpen() — resolves once the socket is ready
   *
   * @param attempt Zero-based attempt number; >0 applies backoff before dialing
   * @throws STTConnectionError on timeout (10s) or connection failure
   * @internal
   */
  async connectWithRetry(attempt) {
    const delay = attempt > 0 ? backoffDelay(attempt) : 0;
    if (delay > 0) await new Promise((r) => setTimeout(r, delay));
    try {
      logger9.debug({ attempt, language: this.config.language }, "Connecting to Deepgram");
      const connection = await this.client.listen.v1.connect({
        model: this.config.model,
        language: this.config.language,
        // v5: boolean-like options must be strings
        smart_format: "true",
        interim_results: String(this.config.interimResults),
        encoding: "linear16",
        sample_rate: 16e3,
        channels: 1,
        utterance_end_ms: "1000",
        ...this.config.alternateLanguages.length > 0 && {
          detect_language: "true",
          // language must be omitted when detect_language is enabled
          language: void 0
        },
        Authorization: `Token ${this.config.apiKey}`
      });
      connection.connect();
      await Promise.race([
        connection.waitForOpen(),
        new Promise(
          (_, reject) => setTimeout(
            () => reject(new STTConnectionError("deepgram", new Error("Connection timeout"))),
            1e4
          )
        )
      ]);
      logger9.info({ attempt, language: this.config.language }, "Deepgram connected");
      return connection;
    } catch (err) {
      if (err instanceof STTConnectionError) throw err;
      throw new STTConnectionError("deepgram", err instanceof Error ? err : new Error(String(err)));
    }
  }
};
// Logger, REST endpoint, and language/model tables for Sarvam (Indic STT).
var logger10 = pino__default.default({ name: "@voice-kit/core:stt:sarvam" });
var SARVAM_API_BASE = "https://api.sarvam.ai";
// BCP-47 tags this provider accepts; constructor rejects anything else.
var SUPPORTED_LANGUAGES3 = [
  "hi-IN",
  "kn-IN",
  "ta-IN",
  "te-IN",
  "mr-IN",
  "bn-IN",
  "gu-IN",
  "pa-IN",
  "or-IN",
  "ml-IN"
];
// Per-language default model (currently saarika:v1 for every language).
var SARVAM_MODELS = {
  "hi-IN": "saarika:v1",
  "kn-IN": "saarika:v1",
  "ta-IN": "saarika:v1",
  "te-IN": "saarika:v1",
  "mr-IN": "saarika:v1",
  "bn-IN": "saarika:v1",
  "gu-IN": "saarika:v1",
  "pa-IN": "saarika:v1",
  "or-IN": "saarika:v1",
  "ml-IN": "saarika:v1"
};
/**
 * Sarvam AI speech-to-text provider for Indic languages (batch REST API).
 * Requires an API key via config.apiKey or the SARVAM_API_KEY env var.
 */
var SarvamSTTProvider = class {
  name = "sarvam";
  supportsStreaming = false;
  // Sarvam REST API is batch-only
  supportedLanguages = SUPPORTED_LANGUAGES3;
  http;
  config;
  /**
   * @param config Provider config; missing apiKey falls back to env.
   * @throws STTConnectionError when no API key is available
   * @throws STTLanguageNotSupportedError for languages Sarvam cannot handle
   */
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["SARVAM_API_KEY"];
    if (!apiKey) throw new STTConnectionError("sarvam", new Error("SARVAM_API_KEY not set"));
    const language = config.language ?? "hi-IN";
    // Fail fast on unsupported locales instead of letting the API reject later.
    if (!SUPPORTED_LANGUAGES3.includes(language)) {
      throw new STTLanguageNotSupportedError("sarvam", language);
    }
    this.http = axios__default.default.create({
      baseURL: SARVAM_API_BASE,
      headers: {
        "API-Subscription-Key": apiKey,
        "Content-Type": "multipart/form-data"
      },
      // 30s request timeout
      timeout: 3e4
    });
    this.config = {
      language,
      alternateLanguages: config.alternateLanguages ?? [],
      apiKey,
      model: config.model ?? SARVAM_MODELS[language] ?? "saarika:v1",
      wordTimestamps: false,
      // Sarvam doesn't support word timestamps yet
      interimResults: false,
      smartFormat: config.smartFormat ?? true,
      region: config.region ?? ""
    };
  }
  /**
   * Collects audio and transcribes via Sarvam batch API.
   * Sarvam doesn't support realtime streaming.
   *
   * @param audio Async iterable of 16kHz PCM buffers
   */
  async *transcribeStream(audio) {
    // Drain the entire stream into memory before a single batch request.
    const chunks = [];
    for await (const chunk of audio) chunks.push(chunk);
    const result = await this.transcribeBatch(Buffer.concat(chunks));
    yield result;
  }
  /**
   * Transcribe a WAV/PCM audio buffer in an Indic language.
   *
   * @param audio 16kHz PCM or WAV buffer
   * @returns Final transcription result
   * @throws STTStreamError on API or transport failure
   *
   * @example
   * ```ts
   * const stt = createSTT('sarvam', { language: 'ta-IN' })
   * const result = await stt.transcribeBatch(tamilAudioBuffer)
   * console.log(result.transcript) // Tamil text
   * ```
   */
  async transcribeBatch(audio) {
    const startMs = Date.now();
    try {
      logger10.debug(
        { language: this.config.language, bytes: audio.length },
        "Sarvam transcription request"
      );
      // Multipart upload: audio file + language + model selection.
      const form = new FormData();
      form.append("file", new Blob([audio], { type: "audio/wav" }), "audio.wav");
      form.append("language_code", this.config.language);
      form.append("model", this.config.model);
      if (this.config.smartFormat) {
        form.append("with_disfluencies", "false");
      }
      const response = await this.http.post(
        "/speech-to-text",
        form
      );
      const data = response.data;
      logger10.info(
        { language: data.language_code, confidence: data.confidence, latencyMs: Date.now() - startMs },
        "Sarvam transcription complete"
      );
      return {
        transcript: data.transcript,
        isFinal: true,
        // 0.9 is an assumed default when the API omits confidence.
        confidence: data.confidence ?? 0.9,
        language: data.language_code ?? this.config.language,
        languageSwitchDetected: false,
        latencyMs: Date.now() - startMs
      };
    } catch (err) {
      // Surface HTTP status + body for API-level failures.
      if (axios__default.default.isAxiosError(err)) {
        throw new STTStreamError(
          "sarvam",
          new Error(`Sarvam API error: ${err.response?.status} ${JSON.stringify(err.response?.data)}`)
        );
      }
      throw new STTStreamError("sarvam", err);
    }
  }
};
// Logger + tunables for heuristic Hindi/English code-switch detection.
var logger11 = pino__default.default({ name: "@voice-kit/core:stt:language-detect" });
// Unicode block for Devanagari script — a word containing any such character
// is classified as Hindi.
var DEVANAGARI_RANGE = /[\u0900-\u097F]/;
// Require at least this many classifiable words before trusting a confidence score.
var MIN_WORDS_FOR_CLASSIFICATION = 2;
// Minimum confidence needed before a language-switch event is emitted.
var SWITCH_CONFIDENCE_THRESHOLD = 0.6;
// Fillers and romanized-Hindi words that appear in both languages; excluded
// from classification so they don't bias the word counts.
var NEUTRAL_TOKENS = /* @__PURE__ */ new Set([
  "ok",
  "okay",
  "haan",
  "nahin",
  "nahi",
  "kya",
  "hai",
  "ho",
  "na",
  "toh",
  "aur",
  "ya",
  "matlab",
  "yani",
  "i",
  "a",
  "the",
  "is",
  "are",
  "and",
  "or"
]);
/**
 * Heuristic Hindi/English (Hinglish) code-switch detector.
 *
 * Classifies each transcript by script (Devanagari vs Latin), smooths the
 * classification over a rolling window of 5 segments, and emits a
 * "language.switched" event when the smoothed language changes with
 * sufficient confidence.
 */
var LanguageSwitchDetector = class extends events.EventEmitter {
  currentLanguage;
  primaryLanguage;
  /** Rolling window of recent language classifications for smoothing. */
  recentClassifications = [];
  windowSize = 5;
  constructor(primaryLanguage = "en-IN") {
    super();
    this.primaryLanguage = primaryLanguage;
    this.currentLanguage = primaryLanguage;
  }
  /**
   * Analyze a transcript for language switches.
   * Should be called on every STT final result.
   *
   * @param transcript The transcribed text to analyze
   * @returns Detected language of the transcript
   */
  analyze(transcript) {
    const words = this.tokenize(transcript);
    if (words.length === 0) return this.currentLanguage;
    const classification = this.classifySegment(words);
    const confidence = this.computeConfidence(words, classification);
    // Push into the rolling window before smoothing, then trim to windowSize.
    this.recentClassifications.push(classification);
    if (this.recentClassifications.length > this.windowSize) {
      this.recentClassifications.shift();
    }
    const smoothed = this.smoothedLanguage();
    // Only flip (and emit) when the smoothed language differs, is confidently
    // classified, and is a concrete language (not "unknown").
    if (smoothed !== this.currentLanguage && confidence >= SWITCH_CONFIDENCE_THRESHOLD && smoothed !== "unknown") {
      const event = {
        from: this.currentLanguage,
        to: smoothed,
        position: 0,
        // position in full conversation
        confidence,
        transcript,
        detectedAt: /* @__PURE__ */ new Date()
      };
      const prev = this.currentLanguage;
      this.currentLanguage = smoothed;
      logger11.info(
        { from: prev, to: smoothed, confidence, transcript: transcript.slice(0, 50) },
        "Language switch detected"
      );
      this.emit("language.switched", event);
    }
    return this.currentLanguage;
  }
  /**
   * Analyze a transcript and return per-word language classification.
   * Useful for word-level Hinglish mixing visualization.
   *
   * @param transcript Text to analyze
   * @returns Array of { word, language } pairs
   */
  analyzeWords(transcript) {
    const words = this.tokenize(transcript);
    return words.map((word) => ({
      word,
      language: this.classifyWord(word)
    }));
  }
  /** Reset to primary language (e.g., on new call). */
  reset() {
    this.currentLanguage = this.primaryLanguage;
    this.recentClassifications = [];
  }
  /** Current detected language. */
  get language() {
    return this.currentLanguage;
  }
  // ─── Private helpers ────────────────────────────────────────────────────────
  // Lowercase, split on whitespace, and drop neutral filler tokens.
  tokenize(text) {
    return text.toLowerCase().split(/\s+/).filter((w) => w.length > 0 && !NEUTRAL_TOKENS.has(w));
  }
  // Script-based classification: Devanagari → Hindi, pure Latin → English.
  // Mixed/numeric/punctuated tokens are "unknown".
  classifyWord(word) {
    if (DEVANAGARI_RANGE.test(word)) return "hi-IN";
    if (/^[a-z]+$/.test(word)) return "en-IN";
    return "unknown";
  }
  // Majority vote across the segment; ties fall back to the primary language.
  classifySegment(words) {
    let hindiCount = 0;
    let englishCount = 0;
    for (const word of words) {
      const lang = this.classifyWord(word);
      if (lang === "hi-IN") hindiCount++;
      else if (lang === "en-IN") englishCount++;
    }
    if (hindiCount === 0 && englishCount === 0) return "unknown";
    if (hindiCount > englishCount) return "hi-IN";
    if (englishCount > hindiCount) return "en-IN";
    return this.primaryLanguage;
  }
  // Fraction of classifiable words agreeing with the segment classification;
  // 0 when too few classifiable words to judge.
  computeConfidence(words, classification) {
    const relevant = words.filter((w) => this.classifyWord(w) !== "unknown");
    if (relevant.length < MIN_WORDS_FOR_CLASSIFICATION) return 0;
    const matching = relevant.filter((w) => this.classifyWord(w) === classification);
    return matching.length / relevant.length;
  }
  // Majority vote over the rolling window; ties keep the current language.
  smoothedLanguage() {
    if (this.recentClassifications.length === 0) return this.primaryLanguage;
    const counts = { "hi-IN": 0, "en-IN": 0, "unknown": 0 };
    for (const lang of this.recentClassifications) {
      counts[lang]++;
    }
    if (counts["hi-IN"] > counts["en-IN"]) return "hi-IN";
    if (counts["en-IN"] > counts["hi-IN"]) return "en-IN";
    return this.currentLanguage;
  }
};
/**
 * Heuristic: true when a transcript mixes Devanagari and Latin script in the
 * same text (code-mixed "Hinglish"); false for single-script or empty input.
 */
function isInglish(transcript) {
  if (!DEVANAGARI_RANGE.test(transcript)) return false;
  return /[a-zA-Z]/.test(transcript);
}
// Logger for the Whisper (OpenAI) batch STT provider.
var logger12 = pino__default.default({ name: "@voice-kit/core:stt:whisper" });
// ISO-639-1 primary subtags Whisper accepts; BCP-47 tags like "hi-IN" are
// reduced to their primary subtag before the check.
var WHISPER_LANGUAGES = [
  "en",
  "hi",
  "ta",
  "te",
  "kn",
  "mr",
  "bn",
  "gu",
  "pa",
  "ur",
  "fr",
  "de",
  "es",
  "pt",
  "it",
  "nl",
  "pl",
  "ru",
  "ja",
  "zh"
];
/**
 * OpenAI Whisper speech-to-text provider (batch only).
 * Requires an API key via config.apiKey or the OPENAI_API_KEY env var.
 */
var WhisperSTTProvider = class {
  name = "whisper";
  supportsStreaming = false;
  supportedLanguages = WHISPER_LANGUAGES;
  config;
  /**
   * @param config Provider config; `language` may be a BCP-47 tag ("hi-IN") —
   *   only the primary subtag is sent to Whisper.
   * @throws STTStreamError when no API key is available
   * @throws STTLanguageNotSupportedError for languages Whisper cannot handle
   */
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["OPENAI_API_KEY"];
    if (!apiKey) throw new STTStreamError("whisper", new Error("OPENAI_API_KEY not set"));
    const language = config.language ?? "en-IN";
    const whisperLang = language.split("-")[0] ?? "en";
    if (!WHISPER_LANGUAGES.includes(whisperLang)) {
      throw new STTLanguageNotSupportedError("whisper", language);
    }
    this.config = {
      language,
      alternateLanguages: config.alternateLanguages ?? [],
      apiKey,
      model: config.model ?? "whisper-1",
      wordTimestamps: config.wordTimestamps ?? false,
      // Batch-only provider: interim results and smart formatting never apply.
      interimResults: false,
      smartFormat: false,
      region: ""
    };
  }
  /**
   * Streaming not supported by Whisper. Collects all audio then transcribes.
   * For realtime use, use createSTT('deepgram') instead.
   */
  async *transcribeStream(audio) {
    const chunks = [];
    for await (const chunk of audio) chunks.push(chunk);
    const result = await this.transcribeBatch(Buffer.concat(chunks));
    yield result;
  }
  /**
   * Transcribe a complete audio buffer via Whisper.
   *
   * @param audio WAV or PCM buffer
   * @returns Final transcription result (optionally with word timestamps)
   * @throws STTStreamError on HTTP or transport failure
   */
  async transcribeBatch(audio) {
    const startMs = Date.now();
    const language = this.config.language.split("-")[0] ?? "en";
    try {
      logger12.debug({ language, bytes: audio.length }, "Whisper batch transcription");
      // Fix: the previous revision also constructed an unused `createOpenAI`
      // client here; the request is made with raw fetch, so that dead client
      // creation was removed.
      const file = new File([audio], "audio.wav", { type: "audio/wav" });
      const formData = new FormData();
      formData.append("file", file);
      formData.append("model", this.config.model);
      formData.append("language", language);
      if (this.config.wordTimestamps) {
        // verbose_json is required for word-level timestamp granularity.
        formData.append("timestamp_granularities[]", "word");
        formData.append("response_format", "verbose_json");
      }
      const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
        method: "POST",
        headers: { Authorization: `Bearer ${this.config.apiKey}` },
        body: formData
      });
      if (!response.ok) {
        throw new Error(`Whisper API error: ${response.status} ${response.statusText}`);
      }
      const data = await response.json();
      return {
        transcript: data.text,
        isFinal: true,
        confidence: 0.95,
        // Whisper doesn't return confidence
        language: this.config.language,
        languageSwitchDetected: false,
        words: this.config.wordTimestamps && data.words ? data.words.map((w) => ({
          word: w.word,
          startMs: w.start * 1e3,
          endMs: w.end * 1e3,
          confidence: 0.95
        })) : void 0,
        latencyMs: Date.now() - startMs
      };
    } catch (err) {
      if (err instanceof STTStreamError) throw err;
      // Fix: wrap non-Error throw values, consistent with the Deepgram provider.
      throw new STTStreamError("whisper", err instanceof Error ? err : new Error(String(err)));
    }
  }
};
+
2016
+ // src/stt/STT-factory.ts
2017
/**
 * Factory: construct an STT provider implementation by name.
 *
 * @param provider One of "deepgram" | "whisper" | "assemblyai" | "sarvam"
 * @param config Optional provider configuration (API key, language, model, …)
 * @returns A provider instance implementing the common STT interface
 * @throws Error for an unrecognized provider name
 */
function createSTT(provider, config) {
  const cfg = config ?? {};
  switch (provider) {
    case "deepgram":
      return new DeepgramSTTProvider(cfg);
    case "whisper":
      return new WhisperSTTProvider(cfg);
    case "assemblyai":
      return new AssemblyAISTTProvider(cfg);
    case "sarvam":
      return new SarvamSTTProvider(cfg);
    default:
      throw new Error(`Unknown STT provider: ${String(provider)}`);
  }
}
// Logger for the Cartesia TTS provider.
var logger13 = pino__default.default({ name: "@voice-kit/core:tts:cartesia" });
// Default Cartesia voice ID used when config.voiceId is not provided.
var DEFAULT_VOICE_ID = "a0e99841-438c-4a64-b679-ae501e7d6091";
/**
 * Cartesia text-to-speech provider. Streams raw 16-bit PCM via Cartesia's
 * SSE endpoint. Requires CARTESIA_API_KEY (or config.apiKey).
 */
var CartesiaTTSProvider = class {
  name = "cartesia";
  outputSampleRate = 22050;
  // Cartesia default
  outputFormat = "pcm";
  client;
  config;
  /**
   * @param config TTS config; missing apiKey falls back to env.
   * @throws TTSConnectionError when no API key is available
   */
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["CARTESIA_API_KEY"];
    if (!apiKey) throw new TTSConnectionError("cartesia", new Error("CARTESIA_API_KEY not set"));
    this.client = new Cartesia__default.default({ apiKey });
    this.config = {
      voiceId: config.voiceId ?? DEFAULT_VOICE_ID,
      sampleRate: config.sampleRate ?? 22050,
      speed: config.speed ?? 1,
      pitch: config.pitch ?? 0,
      apiKey,
      modelId: config.modelId ?? "sonic-english",
      emotion: config.emotion ?? "",
      targetLanguage: config.targetLanguage ?? "en"
    };
  }
  /**
   * Stream audio from Cartesia. Typically delivers first chunk in < 90ms.
   *
   * @param text Text to synthesize
   * @param config Optional per-call overrides (voiceId, modelId, sampleRate)
   * @throws TTSStreamError on API or transport failure
   *
   * @example
   * ```ts
   * const tts = createTTS('cartesia', { voiceId: 'your-voice-id' })
   * for await (const chunk of tts.synthesizeStream('Hello!')) {
   *   sendToTelephony(chunk)
   * }
   * ```
   */
  async *synthesizeStream(text, config) {
    const voiceId = config?.voiceId ?? this.config.voiceId;
    const startMs = Date.now();
    logger13.debug({ voiceId, chars: text.length }, "Cartesia TTS stream start");
    try {
      const stream = await this.client.tts.generateSse({
        model_id: config?.modelId ?? this.config.modelId,
        transcript: text,
        voice: {
          mode: "id",
          id: voiceId,
          // Emotion steering is an experimental Cartesia control; only sent
          // when an emotion is configured.
          ...this.config.emotion && {
            __experimental_controls: {
              emotion: [this.config.emotion]
            }
          }
        },
        output_format: {
          container: "raw",
          encoding: "pcm_s16le",
          sample_rate: toValidSampleRate(config?.sampleRate ?? this.config.sampleRate)
        }
      });
      let firstChunk = true;
      for await (const event of stream) {
        // SSE frames: skip keep-alives and the terminal "[DONE]" sentinel.
        if (!event.data || event.data === "[DONE]") continue;
        let payload;
        try {
          payload = JSON.parse(event.data);
        } catch {
          // Non-JSON frame — ignore rather than abort the stream.
          continue;
        }
        if (!payload.chunk?.audio) continue;
        // Audio arrives base64-encoded inside the JSON payload.
        const buf = Buffer.from(payload.chunk.audio, "base64");
        if (firstChunk) {
          firstChunk = false;
          logger13.debug({ ttfb: Date.now() - startMs, voiceId }, "Cartesia first audio chunk");
        }
        yield buf;
      }
    } catch (err) {
      throw new TTSStreamError("cartesia", err);
    }
  }
  /** Synthesize complete audio. */
  async synthesizeFull(text, config) {
    const chunks = [];
    for await (const chunk of this.synthesizeStream(text, config)) {
      chunks.push(chunk);
    }
    return Buffer.concat(chunks);
  }
};
// Sample rates accepted by Cartesia's raw PCM output format.
var VALID_SAMPLE_RATES = [22050, 8e3, 16e3, 24e3, 44100, 48e3];
/**
 * Coerce an arbitrary sample rate to one Cartesia accepts.
 * Unsupported rates fall back to 8000 Hz.
 */
function toValidSampleRate(rate) {
  if (VALID_SAMPLE_RATES.indexOf(rate) !== -1) {
    return rate;
  }
  return 8e3;
}
2126
// Scoped pino logger for the ElevenLabs TTS provider.
var logger14 = pino__default.default({ name: "@voice-kit/core:tts:elevenlabs" });
// Fallback ElevenLabs voice id applied when the caller configures none.
var DEFAULT_VOICE_ID2 = "21m00Tcm4TlvDq8ikWAM";
// Jitter-buffer window (ms) used by ElevenLabsTTSProvider.synthesizeStream.
var JITTER_BUFFER_MS = 100;
2129
var ElevenLabsTTSProvider = class {
  name = "elevenlabs";
  outputSampleRate = 24e3;
  // ElevenLabs default: 24kHz
  outputFormat = "pcm";
  // ElevenLabs SDK client, created in the constructor.
  client;
  // Fully-resolved provider configuration (all defaults applied).
  config;
  /**
   * Resolve configuration defaults and build the ElevenLabs client.
   * The API key comes from config.apiKey or the ELEVENLABS_API_KEY env var.
   *
   * @throws TTSConnectionError when no API key is available.
   */
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["ELEVENLABS_API_KEY"];
    if (!apiKey) throw new TTSConnectionError("elevenlabs", new Error("ELEVENLABS_API_KEY not set"));
    this.client = new elevenlabs.ElevenLabsClient({ apiKey });
    this.config = {
      voiceId: config.voiceId ?? DEFAULT_VOICE_ID2,
      sampleRate: config.sampleRate ?? 24e3,
      speed: config.speed ?? 1,
      pitch: config.pitch ?? 0,
      apiKey,
      modelId: config.modelId ?? "eleven_turbo_v2_5",
      emotion: config.emotion ?? "",
      targetLanguage: config.targetLanguage ?? "en-IN"
    };
  }
  /**
   * Stream synthesized audio from ElevenLabs.
   * First chunk target: < 300ms. Uses streaming API endpoint.
   *
   * Incoming packets are batched into ~100ms bursts before being yielded,
   * smoothing bursty delivery without adding perceptible latency. At
   * pcm_24000 (16-bit mono) 4800 bytes == exactly 100ms of audio, so the
   * byte threshold below IS the jitter buffer. (A previous revision also
   * armed a setTimeout with an empty callback; it never flushed anything —
   * a generator cannot yield from a timer callback — and has been removed
   * as dead code.)
   *
   * @param text Text to synthesize (should be a sentence boundary chunk)
   * @param config Per-call config overrides
   * @throws TTSVoiceNotFoundError when the API reports a 404 for the voice
   * @throws TTSStreamError wrapping any other SDK/stream failure
   *
   * @example
   * ```ts
   * const tts = createTTS('elevenlabs', { voiceId: 'your-voice-id' })
   * for await (const chunk of tts.synthesizeStream('Hello, how can I help?')) {
   *   telephony.sendAudio(chunk)
   * }
   * ```
   */
  async *synthesizeStream(text, config) {
    const voiceId = config?.voiceId ?? this.config.voiceId;
    const modelId = config?.modelId ?? this.config.modelId;
    const startMs = Date.now();
    logger14.debug({ voiceId, modelId, chars: text.length }, "ElevenLabs TTS stream start");
    try {
      const audioStream = await this.client.generate({
        voice: voiceId,
        text,
        model_id: modelId,
        voice_settings: {
          stability: 0.5,
          similarity_boost: 0.8,
          speed: config?.speed ?? this.config.speed
        },
        output_format: "pcm_24000",
        stream: true
      });
      // 100ms of pcm_24000 16-bit mono audio: 24000 Hz * 2 bytes * 0.1 s.
      const FLUSH_BYTES = 4800;
      let firstChunk = true;
      let pending = [];
      // Running byte count avoids re-reducing the buffer on every chunk.
      let pendingBytes = 0;
      for await (const chunk of audioStream) {
        // The SDK may deliver Buffers or plain byte arrays; normalize.
        const buf = chunk instanceof Buffer ? chunk : Buffer.from(chunk);
        if (firstChunk) {
          firstChunk = false;
          const ttfb = Date.now() - startMs;
          logger14.debug({ ttfb, voiceId }, "ElevenLabs first audio chunk");
        }
        pending.push(buf);
        pendingBytes += buf.length;
        if (pendingBytes >= FLUSH_BYTES) {
          yield* pending;
          pending = [];
          pendingBytes = 0;
        }
      }
      // Flush any tail shorter than one buffer window.
      yield* pending;
      logger14.debug({ voiceId, totalMs: Date.now() - startMs }, "ElevenLabs TTS stream complete");
    } catch (err) {
      if (err.statusCode === 404) {
        throw new TTSVoiceNotFoundError("elevenlabs", voiceId);
      }
      throw new TTSStreamError("elevenlabs", err);
    }
  }
  /**
   * Synthesize full audio (for pre-caching greetings, IVR prompts).
   * Collects all streaming chunks into a single buffer.
   *
   * @param text Text to synthesize
   * @param config Per-call config overrides
   */
  async synthesizeFull(text, config) {
    const chunks = [];
    for await (const chunk of this.synthesizeStream(text, config)) {
      chunks.push(chunk);
    }
    return Buffer.concat(chunks);
  }
};
2244
// Scoped pino logger for the Sarvam TTS provider.
var logger15 = pino__default.default({ name: "@voice-kit/core:tts:sarvam" });
// Base URL for the Sarvam REST API.
var SARVAM_API_BASE2 = "https://api.sarvam.ai";
// Default speaker per Indic target language; callers fall back to "meera"
// for languages not listed here.
var DEFAULT_VOICES = {
  "hi-IN": "meera",
  "kn-IN": "pavithra",
  "ta-IN": "pavithra",
  "te-IN": "pavithra",
  "mr-IN": "meera",
  "bn-IN": "meera",
  "gu-IN": "meera",
  "pa-IN": "meera"
};
2256
var SarvamTTSProvider = class {
  name = "sarvam";
  outputSampleRate = 22050;
  // Sarvam default
  outputFormat = "mp3";
  // Axios instance pre-configured with base URL, auth header, and timeout.
  http;
  // Fully-resolved provider configuration (all defaults applied).
  config;
  /**
   * Resolve configuration defaults and build the HTTP client.
   * The API key comes from config.apiKey or the SARVAM_API_KEY env var.
   *
   * NOTE(review): emotion is always "" here (config.emotion is ignored),
   * unlike the Cartesia/ElevenLabs providers — confirm this is intentional.
   *
   * @throws TTSConnectionError when no API key is available.
   */
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["SARVAM_API_KEY"];
    if (!apiKey) throw new TTSConnectionError("sarvam", new Error("SARVAM_API_KEY not set"));
    const targetLanguage = config.targetLanguage ?? "hi-IN";
    // Pick a language-appropriate default speaker; "meera" when unmapped.
    const defaultVoice = DEFAULT_VOICES[targetLanguage] ?? "meera";
    this.http = axios__default.default.create({
      baseURL: SARVAM_API_BASE2,
      headers: {
        "API-Subscription-Key": apiKey,
        "Content-Type": "application/json"
      },
      timeout: 15e3
    });
    this.config = {
      voiceId: config.voiceId ?? defaultVoice,
      sampleRate: config.sampleRate ?? 22050,
      speed: config.speed ?? 1,
      pitch: config.pitch ?? 0,
      apiKey,
      modelId: config.modelId ?? "bulbul:v1",
      emotion: "",
      targetLanguage
    };
  }
  /**
   * Synthesize text in an Indic language and stream audio chunks.
   * Sarvam returns full audio segments — we chunk them for streaming compatibility.
   *
   * @param text Text to synthesize
   * @param config Per-call config overrides
   * @throws TTSStreamError when the API errors or returns no audio
   *
   * @example
   * ```ts
   * const tts = createTTS('sarvam', { targetLanguage: 'hi-IN' })
   * for await (const chunk of tts.synthesizeStream('नमस्ते, मैं आपकी कैसे मदद कर सकता हूँ?')) {
   *   telephony.sendAudio(chunk)
   * }
   * ```
   */
  async *synthesizeStream(text, config) {
    const startMs = Date.now();
    const targetLanguage = config?.targetLanguage ?? this.config.targetLanguage;
    const speaker = config?.voiceId ?? this.config.voiceId;
    logger15.debug({ targetLanguage, speaker, chars: text.length }, "Sarvam TTS request");
    try {
      const response = await this.http.post("/text-to-speech", {
        inputs: [text],
        target_language_code: targetLanguage,
        speaker,
        model: config?.modelId ?? this.config.modelId,
        pitch: config?.pitch ?? this.config.pitch,
        pace: config?.speed ?? this.config.speed,
        loudness: 1,
        speech_sample_rate: config?.sampleRate ?? this.config.sampleRate,
        enable_preprocessing: true
      });
      // Sarvam returns one base64-encoded audio segment per input string.
      const audioSegments = response.data.audios;
      if (!audioSegments || audioSegments.length === 0) {
        throw new TTSStreamError("sarvam", new Error("No audio returned from Sarvam TTS"));
      }
      logger15.debug(
        { segments: audioSegments.length, latencyMs: Date.now() - startMs },
        "Sarvam TTS response received"
      );
      for (const segment of audioSegments) {
        const buf = Buffer.from(segment, "base64");
        // Slice full segments into 4KB chunks so downstream consumers can
        // treat this like a real-time stream.
        const CHUNK_SIZE = 4096;
        let offset = 0;
        while (offset < buf.length) {
          yield buf.subarray(offset, Math.min(offset + CHUNK_SIZE, buf.length));
          offset += CHUNK_SIZE;
        }
      }
    } catch (err) {
      // Fix: don't double-wrap the "No audio returned" error thrown above —
      // it is already a TTSStreamError; rethrow it as-is.
      if (err instanceof TTSStreamError) throw err;
      if (axios__default.default.isAxiosError(err)) {
        throw new TTSStreamError(
          "sarvam",
          new Error(`Sarvam TTS API error: ${err.response?.status} ${JSON.stringify(err.response?.data)}`)
        );
      }
      throw new TTSStreamError("sarvam", err);
    }
  }
  /** Synthesize complete audio buffer. */
  async synthesizeFull(text, config) {
    const chunks = [];
    for await (const chunk of this.synthesizeStream(text, config)) {
      chunks.push(chunk);
    }
    return Buffer.concat(chunks);
  }
};
2352
+
2353
// src/tts/TTS-factory.ts
/**
 * Instantiate a text-to-speech provider by name.
 *
 * @param provider One of "elevenlabs" | "cartesia" | "sarvam".
 * @param config   Optional provider configuration; an empty object is used when omitted.
 * @returns A concrete TTS provider instance.
 * @throws Error when the provider name is not recognized.
 */
function createTTS(provider, config) {
  const options = config ?? {};
  if (provider === "elevenlabs") return new ElevenLabsTTSProvider(options);
  if (provider === "cartesia") return new CartesiaTTSProvider(options);
  if (provider === "sarvam") return new SarvamTTSProvider(options);
  throw new Error(`Unknown TTS provider: ${String(provider)}`);
}
2369
+
262
2370
// Public CommonJS export surface of the @voice-kit/core bundle:
// error classes, STT/TTS provider implementations and factories,
// audio pipeline/codec helpers, and compliance/memory/observability utilities.
exports.AgentError = AgentError;
exports.AgentHandoffError = AgentHandoffError;
exports.AssemblyAISTTProvider = AssemblyAISTTProvider;
exports.AudioPipeline = AudioPipeline;
exports.AudioTransportError = AudioTransportError;
exports.CallAuditLog = CallAuditLog;
exports.CallConnectionError = CallConnectionError;
exports.CallMetrics = CallMetrics;
exports.CallNotFoundError = CallNotFoundError;
exports.CallingHoursError = CallingHoursError;
exports.CartesiaTTSProvider = CartesiaTTSProvider;
exports.ComplianceError = ComplianceError;
exports.ConsentMissingError = ConsentMissingError;
exports.DNCBlockedError = DNCBlockedError;
exports.DeepgramSTTProvider = DeepgramSTTProvider;
exports.ElevenLabsTTSProvider = ElevenLabsTTSProvider;
exports.InngestError = InngestError;
exports.LanguageSwitchDetector = LanguageSwitchDetector;
exports.STTConnectionError = STTConnectionError;
exports.STTError = STTError;
exports.STTLanguageNotSupportedError = STTLanguageNotSupportedError;
exports.STTStreamError = STTStreamError;
exports.SarvamSTTProvider = SarvamSTTProvider;
exports.SarvamTTSProvider = SarvamTTSProvider;
exports.TRAICompliance = TRAICompliance;
exports.TTSConnectionError = TTSConnectionError;
exports.TTSError = TTSError;
exports.TTSStreamError = TTSStreamError;
exports.TTSVoiceNotFoundError = TTSVoiceNotFoundError;
exports.TelephonyError = TelephonyError;
exports.TurnTransitionError = TurnTransitionError;
exports.VADEngine = VADEngine;
exports.VoiceKitError = VoiceKitError;
exports.VoiceSDKTracer = VoiceSDKTracer;
exports.WhisperSTTProvider = WhisperSTTProvider;
exports.base64MulawToPcm = base64MulawToPcm;
exports.createAudioPipeline = createAudioPipeline;
exports.createCallMemory = createCallMemory;
exports.createResamplerStream = createResamplerStream;
exports.createSTT = createSTT;
exports.createTTS = createTTS;
exports.createVAD = createVAD;
exports.isInglish = isInglish;
exports.linearToMulaw = linearToMulaw;
exports.mulawBufferToPcm = mulawBufferToPcm;
exports.mulawToLinear = mulawToLinear;
exports.pcmBufferToMulaw = pcmBufferToMulaw;
exports.pcmToBase64Mulaw = pcmToBase64Mulaw;
exports.resample = resample;
exports.resampleStream = resampleStream;
// NOTE(review): the sourceMappingURL directive appears twice — likely a
// bundler artifact; tools honor the last occurrence. Deduplicate upstream.
//# sourceMappingURL=index.cjs.map
//# sourceMappingURL=index.cjs.map