@voice-kit/core 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +2137 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1466 -4
- package/dist/index.d.ts +1466 -4
- package/dist/index.js +2102 -1
- package/dist/index.js.map +1 -1
- package/package.json +1 -31
- package/dist/audio.cjs +0 -533
- package/dist/audio.cjs.map +0 -1
- package/dist/audio.d.cts +0 -260
- package/dist/audio.d.ts +0 -260
- package/dist/audio.js +0 -514
- package/dist/audio.js.map +0 -1
- package/dist/compliance.cjs +0 -343
- package/dist/compliance.cjs.map +0 -1
- package/dist/compliance.d.cts +0 -163
- package/dist/compliance.d.ts +0 -163
- package/dist/compliance.js +0 -335
- package/dist/compliance.js.map +0 -1
- package/dist/errors.cjs +0 -284
- package/dist/errors.cjs.map +0 -1
- package/dist/errors.d.cts +0 -100
- package/dist/errors.d.ts +0 -100
- package/dist/errors.js +0 -262
- package/dist/errors.js.map +0 -1
- package/dist/index-D3KfRXMP.d.cts +0 -319
- package/dist/index-D3KfRXMP.d.ts +0 -319
- package/dist/memory.cjs +0 -121
- package/dist/memory.cjs.map +0 -1
- package/dist/memory.d.cts +0 -29
- package/dist/memory.d.ts +0 -29
- package/dist/memory.js +0 -115
- package/dist/memory.js.map +0 -1
- package/dist/observability.cjs +0 -229
- package/dist/observability.cjs.map +0 -1
- package/dist/observability.d.cts +0 -122
- package/dist/observability.d.ts +0 -122
- package/dist/observability.js +0 -222
- package/dist/observability.js.map +0 -1
- package/dist/stt.cjs +0 -828
- package/dist/stt.cjs.map +0 -1
- package/dist/stt.d.cts +0 -308
- package/dist/stt.d.ts +0 -308
- package/dist/stt.js +0 -815
- package/dist/stt.js.map +0 -1
- package/dist/telephony.errors-BQYr6-vl.d.cts +0 -80
- package/dist/telephony.errors-C0-nScrF.d.ts +0 -80
- package/dist/tts.cjs +0 -429
- package/dist/tts.cjs.map +0 -1
- package/dist/tts.d.cts +0 -151
- package/dist/tts.d.ts +0 -151
- package/dist/tts.js +0 -418
- package/dist/tts.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -1,3 +1,23 @@
|
|
|
1
|
+
import ffmpeg from 'fluent-ffmpeg';
|
|
2
|
+
import { Readable, PassThrough } from 'stream';
|
|
3
|
+
import pino from 'pino';
|
|
4
|
+
import { EventEmitter } from 'events';
|
|
5
|
+
import { LRUCache } from 'lru-cache';
|
|
6
|
+
import { appendFile } from 'fs/promises';
|
|
7
|
+
import axios from 'axios';
|
|
8
|
+
import { isValidPhoneNumber, parsePhoneNumberFromString } from 'libphonenumber-js';
|
|
9
|
+
import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node';
|
|
10
|
+
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
|
|
11
|
+
import { SimpleSpanProcessor } from '@opentelemetry/sdk-trace-base';
|
|
12
|
+
import { ATTR_SERVICE_NAME } from '@opentelemetry/semantic-conventions';
|
|
13
|
+
import { trace, SpanStatusCode } from '@opentelemetry/api';
|
|
14
|
+
import { resourceFromAttributes } from '@opentelemetry/resources';
|
|
15
|
+
import { AssemblyAI } from 'assemblyai';
|
|
16
|
+
import { DeepgramClient } from '@deepgram/sdk';
|
|
17
|
+
import { createOpenAI } from '@ai-sdk/openai';
|
|
18
|
+
import Cartesia from '@cartesia/cartesia-js';
|
|
19
|
+
import { ElevenLabsClient } from 'elevenlabs';
|
|
20
|
+
|
|
1
21
|
// src/errors/base.ts
|
|
2
22
|
var VoiceKitError = class extends Error {
|
|
3
23
|
code;
|
|
@@ -257,6 +277,2087 @@ var TTSVoiceNotFoundError = class extends TTSError {
|
|
|
257
277
|
}
|
|
258
278
|
};
|
|
259
279
|
|
|
260
|
-
|
|
280
|
+
// src/audio/codec/index.ts
//
// G.711 mu-law codec for 16-bit signed little-endian mono PCM.
//
// FIX: the previous decoder used `(mantissa << 1) + 33` with bias 0x21 —
// exactly 1/4 of the standard G.711 magnitude — while the encoder used
// near-standard segment boundaries. As a result encode(decode(b)) !== b
// (e.g. 128 round-tripped to 160) and decoded audio was ~12 dB too quiet.
// This version implements standard G.711 (bias 0x84 = 132, pre-bias clip at
// 32635, `mantissa << 3`), for which encode(decode(b)) === b for every byte
// except the canonical negative-zero collision 0x7F -> 0xFF.
var MULAW_BIAS = 132;
var MULAW_MAX = 32635;
/**
 * Decode one 8-bit mu-law byte to a 16-bit signed linear PCM sample.
 *
 * @param sample mu-law byte (0-255)
 * @returns linear sample in [-32124, 32124]
 */
function mulawToLinear(sample) {
  sample = ~sample & 255;
  const sign = sample & 128;
  const exponent = sample >> 4 & 7;
  const mantissa = sample & 15;
  // Reconstruct the biased magnitude for this segment, then remove the bias.
  let linear = (mantissa << 3) + MULAW_BIAS << exponent;
  linear -= MULAW_BIAS;
  return sign !== 0 ? -linear : linear;
}
/**
 * Encode one 16-bit signed linear PCM sample as an 8-bit mu-law byte.
 *
 * @param sample linear sample; values outside [-32768, 32767] are clamped
 * @returns mu-law byte (0-255)
 */
function linearToMulaw(sample) {
  sample = Math.max(-32768, Math.min(32767, sample));
  const sign = sample < 0 ? 128 : 0;
  if (sample < 0) sample = -sample;
  // Clip BEFORE adding the bias (G.711): 32635 + 132 = 32767 keeps the
  // biased magnitude within 15 bits so the mantissa cannot overflow.
  if (sample > MULAW_MAX) sample = MULAW_MAX;
  sample += MULAW_BIAS;
  // Find the segment: highest set bit among bits 8..14 selects exponent 1..7,
  // otherwise exponent 0.
  let exponent = 7;
  let expMask = 16384;
  for (; exponent > 0; exponent--) {
    if ((sample & expMask) !== 0) break;
    expMask >>= 1;
  }
  const mantissa = sample >> exponent + 3 & 15;
  const mulaw = ~(sign | exponent << 4 | mantissa) & 255;
  return mulaw;
}
/**
 * Decode a buffer of mu-law bytes to an s16le PCM buffer (2 bytes/sample).
 *
 * @param buf mu-law bytes
 * @returns PCM buffer of length buf.length * 2
 */
function mulawBufferToPcm(buf) {
  const pcm = Buffer.allocUnsafe(buf.length * 2);
  for (let i = 0; i < buf.length; i++) {
    const sample = mulawToLinear(buf[i] ?? 0);
    pcm.writeInt16LE(sample, i * 2);
  }
  return pcm;
}
/**
 * Encode an s16le PCM buffer as mu-law bytes (1 byte/sample).
 * A trailing odd byte, if any, is ignored.
 *
 * @param buf PCM buffer (s16le, mono)
 * @returns mu-law buffer of length floor(buf.length / 2)
 */
function pcmBufferToMulaw(buf) {
  const samples = buf.length >> 1;
  const mulaw = Buffer.allocUnsafe(samples);
  for (let i = 0; i < samples; i++) {
    const sample = buf.readInt16LE(i * 2);
    mulaw[i] = linearToMulaw(sample);
  }
  return mulaw;
}
/**
 * Decode a base64-encoded mu-law payload (as delivered by e.g. Twilio media
 * streams) to an s16le PCM buffer.
 */
function base64MulawToPcm(base64) {
  const mulaw = Buffer.from(base64, "base64");
  return mulawBufferToPcm(mulaw);
}
/**
 * Encode an s16le PCM buffer as a base64 mu-law payload.
 */
function pcmToBase64Mulaw(pcm) {
  return pcmBufferToMulaw(pcm).toString("base64");
}
|
|
332
|
+
/**
 * Resample a buffer of s16le mono PCM between sample rates using ffmpeg.
 * Returns the input buffer untouched when the rates already match.
 *
 * @param buf    PCM buffer (s16le, mono) at fromHz
 * @param fromHz source sample rate
 * @param toHz   target sample rate
 * @returns resampled PCM buffer at toHz
 * @throws AudioTransportError if ffmpeg or the output stream fails
 */
async function resample(buf, fromHz, toHz) {
  if (fromHz === toHz) return buf;
  return new Promise((resolve, reject) => {
    const collected = [];
    const fail = (err) => reject(new AudioTransportError("ffmpeg-resampler", err));
    // One-shot readable that feeds the whole buffer to ffmpeg then ends.
    const source = new Readable({
      read() {
        this.push(buf);
        this.push(null);
      }
    });
    const sink = new PassThrough();
    sink.on("data", (piece) => collected.push(piece));
    sink.on("end", () => resolve(Buffer.concat(collected)));
    sink.on("error", fail);
    ffmpeg(source)
      .inputOptions(["-f s16le", `-ar ${fromHz}`, "-ac 1"])
      .outputOptions(["-f s16le", `-ar ${toHz}`, "-ac 1"])
      .on("error", fail)
      .pipe(sink, { end: true });
  });
}
|
|
370
|
+
/**
 * Create a stream intended to resample PCM from fromHz to toHz.
 *
 * FIXME(review): both branches return a plain PassThrough — when
 * fromHz !== toHz NO resampling is performed and callers receive audio at
 * the source rate. This looks like an unfinished stub (compare resample()
 * above, which does wire ffmpeg); confirm intent before relying on it.
 */
function createResamplerStream(fromHz, toHz) {
  const output = new PassThrough();
  if (fromHz === toHz) {
    return output;
  }
  return output;
}
|
|
377
|
+
/**
 * Async generator: resample a stream of s16le PCM chunks between rates.
 * Pass-through when the rates already match; otherwise input is coalesced
 * into 16 KB slices and each slice is resampled via resample().
 *
 * NOTE(review): each slice is resampled independently, which can introduce
 * artifacts at slice boundaries — presumably acceptable for telephony audio;
 * confirm.
 *
 * @param audio  async (or sync) iterable of PCM buffers at fromHz
 * @param fromHz source sample rate
 * @param toHz   target sample rate
 * @yields PCM buffers at toHz
 */
async function* resampleStream(audio, fromHz, toHz) {
  if (fromHz === toHz) {
    yield* audio;
    return;
  }
  const SLICE_BYTES = 16e3;
  let carry = Buffer.alloc(0);
  for await (const piece of audio) {
    carry = Buffer.concat([carry, piece]);
    while (carry.length >= SLICE_BYTES) {
      const head = carry.subarray(0, SLICE_BYTES);
      carry = carry.subarray(SLICE_BYTES);
      yield await resample(head, fromHz, toHz);
    }
  }
  // Flush whatever is left over (shorter than one slice).
  if (carry.length > 0) {
    yield await resample(carry, fromHz, toHz);
  }
}
|
|
396
|
+
// Module-scoped structured logger for the audio pipeline (pino child name).
var logger = pino({ name: "@voice-kit/core:pipeline" });
|
|
397
|
+
// VAD tuning shared by all narrowband (8 kHz PSTN) providers.
var NARROWBAND_VAD = {
  threshold: 0.6,
  positiveSpeechFrames: 3,
  negativeSpeechFrames: 5,
  debounceMs: 150,
  sampleRate: 16e3
};
// Wire format shared by all narrowband providers: mu-law at 8 kHz in/out,
// upsampled to 16 kHz for STT.
var NARROWBAND_BASE = {
  inputFormat: "mulaw",
  inputSampleRate: 8e3,
  sttSampleRate: 16e3,
  outputSampleRate: 8e3,
  outputFormat: "mulaw"
};
/**
 * Per-provider audio profiles: wire format, sample rates, and VAD tuning.
 * Twilio/Plivo/Telnyx/SIP share the narrowband defaults; Exotel gets a more
 * tolerant VAD; LiveKit carries 48 kHz Opus-derived PCM.
 */
var AUDIO_PROFILES = {
  twilio: { ...NARROWBAND_BASE, vadConfig: { ...NARROWBAND_VAD } },
  exotel: {
    ...NARROWBAND_BASE,
    vadConfig: {
      ...NARROWBAND_VAD,
      threshold: 0.55,
      // Exotel has slightly more background noise on IN PSTN
      negativeSpeechFrames: 6,
      debounceMs: 200
    }
  },
  plivo: { ...NARROWBAND_BASE, vadConfig: { ...NARROWBAND_VAD } },
  telnyx: { ...NARROWBAND_BASE, vadConfig: { ...NARROWBAND_VAD } },
  livekit: {
    inputFormat: "opus",
    // LiveKit delivers decoded PCM via SDK — we handle 48kHz
    inputSampleRate: 48e3,
    sttSampleRate: 16e3,
    outputSampleRate: 48e3,
    outputFormat: "opus",
    vadConfig: {
      threshold: 0.5,
      // Higher quality audio = can lower threshold
      positiveSpeechFrames: 2,
      negativeSpeechFrames: 4,
      debounceMs: 100,
      sampleRate: 16e3
    }
  },
  sip: { ...NARROWBAND_BASE, vadConfig: { ...NARROWBAND_VAD } }
};
|
|
486
|
+
/**
 * Bidirectional audio transcoder between a telephony provider's wire format
 * and the 16 kHz PCM the STT/TTS stack works with. Configuration comes from
 * AUDIO_PROFILES keyed by provider name.
 */
var AudioPipeline = class {
  profile;
  provider;
  /** @param provider Key into AUDIO_PROFILES ('twilio' | 'exotel' | ...). */
  constructor(provider) {
    this.provider = provider;
    this.profile = AUDIO_PROFILES[provider];
    logger.debug(
      {
        provider,
        inputFormat: this.profile.inputFormat,
        inputSampleRate: this.profile.inputSampleRate,
        sttSampleRate: this.profile.sttSampleRate
      },
      "AudioPipeline initialized"
    );
  }
  /**
   * Transform incoming telephony audio to 16kHz PCM for STT.
   * Handles µ-law decode + resampling automatically.
   *
   * @param raw Raw audio bytes as received from telephony provider
   * @returns Async iterable of 16kHz PCM buffers for STT
   *
   * @internal
   */
  async *inboundForSTT(raw) {
    const source = this.profile.inputFormat === "mulaw" ? this.decodeMulaw(raw) : raw;
    yield* resampleStream(source, this.profile.inputSampleRate, this.profile.sttSampleRate);
  }
  /**
   * Transform TTS output PCM to telephony-native format for sending to caller.
   * Handles resampling + µ-law encode automatically.
   *
   * @param ttsAudio Raw PCM from TTS provider (at TTS provider's native rate)
   * @param ttsSampleRate Native sample rate of the TTS provider
   * @returns Async iterable of audio bytes ready to send to telephony provider
   *
   * @internal
   */
  async *outboundFromTTS(ttsAudio, ttsSampleRate) {
    const { outputSampleRate, outputFormat } = this.profile;
    const resampled = resampleStream(ttsAudio, ttsSampleRate, outputSampleRate);
    if (outputFormat !== "mulaw") {
      yield* resampled;
      return;
    }
    for await (const piece of resampled) {
      yield pcmBufferToMulaw(piece);
    }
  }
  /** Get the VAD config tuned for this provider's audio quality. @internal */
  get vadConfig() {
    return this.profile.vadConfig;
  }
  /** Sample rate that STT expects (post-pipeline). @internal */
  get sttSampleRate() {
    return this.profile.sttSampleRate;
  }
  /** Async generator: decode µ-law stream to PCM. @internal */
  async *decodeMulaw(raw) {
    for await (const piece of raw) {
      yield mulawBufferToPcm(piece);
    }
  }
};
|
|
559
|
+
/**
 * Factory: build an AudioPipeline preconfigured for the given telephony
 * provider (a key of AUDIO_PROFILES).
 */
function createAudioPipeline(provider) {
  return new AudioPipeline(provider);
}
|
|
562
|
+
// Module-scoped logger for the VAD engine.
var logger2 = pino({ name: "@voice-kit/core:vad" });
// 480 samples @ 16 kHz = one 30 ms VAD frame.
var FRAME_SIZE_SAMPLES = 480;
// s16le mono: 2 bytes per sample.
var FRAME_SIZE_BYTES = FRAME_SIZE_SAMPLES * 2;
// Default hysteresis/tuning; callers usually override with a per-provider
// vadConfig from AUDIO_PROFILES.
var VAD_DEFAULTS = {
  threshold: 0.6,
  positiveSpeechFrames: 3,
  negativeSpeechFrames: 5,
  debounceMs: 150,
  sampleRate: 16e3
};
|
|
572
|
+
/**
 * Voice-activity detector with frame hysteresis and a debounced speech_end.
 *
 * Emits:
 *  - 'frame'  — { type: 'speech_start' | 'speech' | 'speech_end', confidence,
 *                 audioBuffer, durationMs }
 *  - 'error'  — AudioTransportError wrapping any failure in processStream
 *
 * State machine: N consecutive frames above threshold => speech_start;
 * N consecutive frames below threshold => speech_end after debounceMs.
 */
var VADEngine = class extends EventEmitter {
  config;
  // Running state
  isSpeaking = false;
  positiveFrameCount = 0;
  negativeFrameCount = 0;
  debounceTimer = null;
  frameBuffer = Buffer.alloc(0);
  // Silero VAD model — loaded lazily
  vadModel = null;
  constructor(config) {
    super();
    // Caller config overrides VAD_DEFAULTS field-by-field.
    this.config = { ...VAD_DEFAULTS, ...config };
  }
  /**
   * Process an async stream of PCM audio frames.
   * Automatically frames the input into 30ms chunks for VAD processing.
   *
   * NOTE(review): the trailing speech_end emitted at end-of-stream does not
   * reset isSpeaking — reusing the engine for a second stream would start in
   * the "speaking" state. Confirm engines are one-shot per call.
   *
   * @param audio Async iterable of PCM buffers (16kHz, s16le, mono)
   */
  async processStream(audio) {
    try {
      await this.ensureModelLoaded();
      for await (const chunk of audio) {
        // Accumulate bytes until at least one full 30 ms frame is available.
        this.frameBuffer = Buffer.concat([this.frameBuffer, chunk]);
        while (this.frameBuffer.length >= FRAME_SIZE_BYTES) {
          const frame = this.frameBuffer.subarray(0, FRAME_SIZE_BYTES);
          this.frameBuffer = this.frameBuffer.subarray(FRAME_SIZE_BYTES);
          await this.processFrame(frame);
        }
      }
      // Stream ended mid-utterance: close the open speech segment.
      if (this.isSpeaking) {
        this.emitFrame("speech_end", 0, Buffer.alloc(0));
      }
    } catch (err) {
      const error = new AudioTransportError("vad", err);
      this.emit("error", error);
      throw error;
    }
  }
  /**
   * Process a single 30ms PCM frame through the VAD model.
   *
   * NOTE(review): a positive frame arriving while isSpeaking does not cancel
   * a pending speech_end debounce (clearDebounce only runs on the
   * speech_start transition) — a brief dip followed by resumed speech can
   * still fire speech_end. Verify this is the intended endpointing behavior.
   *
   * @internal
   */
  async processFrame(frame) {
    const confidence = await this.runVADInference(frame);
    const durationMs = FRAME_SIZE_SAMPLES / this.config.sampleRate * 1e3;
    if (confidence >= this.config.threshold) {
      this.positiveFrameCount++;
      this.negativeFrameCount = 0;
      if (this.isSpeaking) {
        this.emitFrame("speech", confidence, frame, durationMs);
      } else if (this.positiveFrameCount >= this.config.positiveSpeechFrames) {
        this.clearDebounce();
        this.isSpeaking = true;
        this.emitFrame("speech_start", confidence, frame, durationMs);
        logger2.debug({ confidence, frames: this.positiveFrameCount }, "VAD: speech_start");
      }
    } else {
      this.negativeFrameCount++;
      this.positiveFrameCount = 0;
      if (this.isSpeaking) {
        // Still inside an utterance: keep forwarding audio while deciding
        // whether the silence is long enough to end the segment.
        this.emitFrame("speech", confidence, frame, durationMs);
        if (this.negativeFrameCount >= this.config.negativeSpeechFrames) {
          this.scheduleDebounce(() => {
            this.isSpeaking = false;
            this.negativeFrameCount = 0;
            this.emitFrame("speech_end", confidence, Buffer.alloc(0), 0);
            logger2.debug({ confidence }, "VAD: speech_end");
          });
        }
      }
    }
  }
  /**
   * Run Silero VAD model inference on a single frame.
   * Returns confidence score 0–1.
   *
   * @internal
   */
  async runVADInference(frame) {
    if (!this.vadModel) throw new Error("VAD model not loaded");
    // Convert s16le bytes to normalized float32 samples in [-1, 1).
    const samples = new Float32Array(FRAME_SIZE_SAMPLES);
    for (let i = 0; i < FRAME_SIZE_SAMPLES; i++) {
      samples[i] = frame.readInt16LE(i * 2) / 32768;
    }
    return this.vadModel.predict(samples);
  }
  // Build and emit a 'frame' event payload.
  emitFrame(type, confidence, audioBuffer, durationMs = 0) {
    const frame = { type, confidence, audioBuffer, durationMs };
    this.emit("frame", frame);
  }
  // (Re)arm the speech_end debounce; any previously pending timer is dropped.
  scheduleDebounce(fn) {
    this.clearDebounce();
    this.debounceTimer = setTimeout(fn, this.config.debounceMs);
  }
  // Cancel a pending speech_end debounce, if any.
  clearDebounce() {
    if (this.debounceTimer !== null) {
      clearTimeout(this.debounceTimer);
      this.debounceTimer = null;
    }
  }
  /**
   * Load the Silero VAD model if not already loaded.
   * Falls back to an energy-based detector when the model import fails
   * (e.g. the optional dependency is not installed).
   * NOTE(review): '@ricky0123/vad-web' is a browser/mic-oriented package —
   * confirm MicVAD.new() actually works in this Node runtime.
   * @internal
   */
  async ensureModelLoaded() {
    if (this.vadModel) return;
    logger2.debug("Loading Silero VAD model...");
    try {
      const { MicVAD } = await import('@ricky0123/vad-web');
      const vad = await MicVAD.new();
      this.vadModel = new SileroVADAdapter(vad);
      logger2.info("VAD model loaded successfully");
    } catch (err) {
      logger2.warn({ err }, "VAD model load failed \u2014 falling back to silence-based detection");
      this.vadModel = new EnergyBasedVAD();
    }
  }
  /** Clean up resources. Call when the call ends. */
  destroy() {
    this.clearDebounce();
    this.removeAllListeners();
    this.frameBuffer = Buffer.alloc(0);
    this.vadModel = null;
  }
};
|
|
700
|
+
/**
 * Fallback detector used when the Silero model cannot be loaded.
 * Scores a frame by RMS energy: an RMS equal to energyThreshold (0.01)
 * maps to confidence 1.0; the score is clamped to [0, 1].
 */
var EnergyBasedVAD = class {
  energyThreshold = 0.01;
  /** @param samples Float32Array of normalized samples. @returns 0–1 score */
  async predict(samples) {
    let acc = 0;
    for (let i = 0; i < samples.length; i++) {
      acc += samples[i] * samples[i];
    }
    const rms = Math.sqrt(acc / samples.length);
    return Math.min(1, rms / this.energyThreshold);
  }
};
|
|
711
|
+
/**
 * Factory: create a VADEngine with the given (partial) config.
 * Missing fields fall back to VAD_DEFAULTS inside the engine constructor.
 */
function createVAD(config) {
  return new VADEngine(config);
}
|
|
714
|
+
/**
 * Adapter exposing a loaded Silero VAD instance behind the predict() shape
 * shared with EnergyBasedVAD.
 *
 * FIXME(review): predict() never consults `this.vad` — it returns an RMS
 * energy score (rms * 10, clamped to 1) instead of running the Silero model.
 * Looks like a placeholder; confirm before relying on Silero accuracy.
 */
var SileroVADAdapter = class {
  constructor(vad) {
    this.vad = vad;
  }
  /** @param samples Float32Array of normalized samples. @returns 0–1 score */
  async predict(samples) {
    let acc = 0;
    for (let i = 0; i < samples.length; i++) {
      acc += samples[i] * samples[i];
    }
    const rms = Math.sqrt(acc / samples.length);
    return Math.min(1, rms * 10);
  }
};
|
|
727
|
+
// Module-scoped logger for the audit-trail component.
var logger3 = pino({ name: "@voice-kit/core:compliance:audit" });
/**
 * In-memory, append-only audit trail of compliance-relevant call events,
 * optionally mirrored to a JSONL file.
 *
 * NOTE(review): entries live in an LRUCache with a 4-hour TTL — in-memory
 * audit history is silently evicted after 4 hours (or under LRU pressure).
 * Only the optional file is durable; confirm that is acceptable for
 * compliance retention requirements.
 */
var CallAuditLog = class {
  /** LRU: up to 10,000 calls × 200 entries each = 2M entries max */
  cache;
  // Optional JSONL path; when set, every entry is also appended to disk.
  filePath;
  constructor(options) {
    this.filePath = options?.filePath;
    this.cache = new LRUCache({
      max: options?.maxCalls ?? 1e4,
      ttl: 4 * 60 * 60 * 1e3
      // 4 hours
    });
  }
  /**
   * Append an immutable audit entry for a call.
   *
   * The entry and its data payload are frozen (shallow freeze — nested
   * objects inside `data` remain mutable). The id combines callId, a
   * timestamp, and Math.random — unique enough for correlation but NOT
   * cryptographically strong.
   *
   * The file write, if configured, is fire-and-forget: failures are logged
   * but do not fail the append.
   *
   * @param callId The call identifier
   * @param type Audit event type
   * @param data Additional structured data
   * @returns the frozen entry
   */
  append(callId, type, data = {}) {
    const entry = Object.freeze({
      id: `${callId}-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`,
      callId,
      type,
      timestamp: /* @__PURE__ */ new Date(),
      data: Object.freeze({ ...data })
    });
    const existing = this.cache.get(callId) ?? [];
    this.cache.set(callId, [...existing, entry]);
    logger3.debug({ callId, type, entryId: entry.id }, "Audit entry appended");
    if (this.filePath) {
      this.writeToFile(entry).catch(
        (err) => logger3.error({ err, callId, type }, "Audit file write failed")
      );
    }
    return entry;
  }
  /**
   * Get all audit entries for a call, in insertion order.
   * The returned array is frozen (read-only).
   *
   * @param callId The call identifier
   */
  getEntries(callId) {
    return Object.freeze(this.cache.get(callId) ?? []);
  }
  /**
   * Get entries of a specific type for a call.
   */
  getEntriesByType(callId, type) {
    return this.getEntries(callId).filter((e) => e.type === type);
  }
  /** Write entry to JSONL file (timestamp serialized as ISO-8601). @internal */
  async writeToFile(entry) {
    if (!this.filePath) return;
    const line = JSON.stringify({
      ...entry,
      timestamp: entry.timestamp.toISOString()
    }) + "\n";
    await appendFile(this.filePath, line, "utf-8");
  }
};
|
|
789
|
+
// Module-scoped logger for the TRAI compliance component.
var logger4 = pino({ name: "@voice-kit/core:compliance:trai" });
// NOTE(review): endpoint name says MOCK — confirm the production DND API URL
// is injected via config before go-live.
var TRAI_DND_API_MOCK = "https://api.trai.gov.in/dnd/check";
// Default policy: IST timezone, calls allowed 09:00–21:00 per TRAI rules.
var DEFAULTS = {
  disabled: false,
  timezone: "Asia/Kolkata",
  callingHoursStart: 9,
  callingHoursEnd: 21,
  dncApiEndpoint: TRAI_DND_API_MOCK
};
// DNC lookups are cached per number+purpose for 24 hours.
var DNC_CACHE_TTL_MS = 24 * 60 * 60 * 1e3;
// Consent records are retained for 180 days (TRAI guideline).
var CONSENT_VALIDITY_MS = 180 * 24 * 60 * 60 * 1e3;
/**
 * TRAI (Indian telecom regulator) outbound-calling compliance checks:
 * number validity, DNC registry, calling-hours window, and consent records.
 * Non-Indian numbers are exempted from DNC/hours checks.
 */
var TRAICompliance = class {
  config;
  // Axios instance bound to the DNC API endpoint (5 s timeout).
  http;
  /** DNC check results cached for 24 hours per number. */
  dncCache;
  /** Consent records cached for 180 days. */
  consentCache;
  constructor(config) {
    this.config = { ...DEFAULTS, ...config };
    this.dncCache = new LRUCache({
      max: 1e5,
      ttl: DNC_CACHE_TTL_MS
    });
    this.consentCache = new LRUCache({
      max: 5e4,
      ttl: CONSENT_VALIDITY_MS
    });
    this.http = axios.create({
      baseURL: this.config.dncApiEndpoint,
      timeout: 5e3,
      headers: { "Content-Type": "application/json" }
    });
  }
  /**
   * Check whether a call is permitted under TRAI rules.
   * Checks: valid E.164, DNC registry, calling hours.
   *
   * @param params Call permission check parameters
   * @throws DNCBlockedError if number is on DNC registry
   * @throws CallingHoursError if outside allowed calling hours
   * @throws ComplianceError if phone number is invalid
   *
   * @example
   * ```ts
   * const result = await trai.checkCallPermission({
   *   to: '+919876543210',
   *   purpose: 'TRANSACTIONAL',
   * })
   * if (!result.allowed) console.log(result.reason)
   * ```
   */
  async checkCallPermission(params) {
    // Kill switch for test environments.
    if (this.config.disabled) {
      return { allowed: true, fromCache: false };
    }
    if (!isValidPhoneNumber(params.to)) {
      throw new ComplianceError({
        code: "COMPLIANCE_INVALID_NUMBER",
        message: `Invalid phone number: ${params.to}`,
        phoneNumber: params.to,
        retryable: false,
        severity: "low"
      });
    }
    // TRAI rules only apply to Indian (+91) destinations.
    const parsed = parsePhoneNumberFromString(params.to);
    const isIndianNumber = parsed?.countryCallingCode === "91";
    if (!isIndianNumber) {
      return { allowed: true, fromCache: false };
    }
    const scheduledAt = params.scheduledAt ?? /* @__PURE__ */ new Date();
    if (!this.isWithinCallingHours(scheduledAt)) {
      const timeStr = new Intl.DateTimeFormat("en-IN", {
        timeZone: this.config.timezone,
        hour: "2-digit",
        minute: "2-digit",
        hour12: false
      }).format(scheduledAt);
      throw new CallingHoursError(params.to, timeStr);
    }
    // Emergency calls bypass the DNC registry (but not the hours check above).
    if (params.purpose === "EMERGENCY") {
      return { allowed: true, fromCache: false };
    }
    const cacheKey = `${params.to}:${params.purpose}`;
    const cached = this.dncCache.get(cacheKey);
    if (cached) {
      // FIXME(review): a cache hit RETURNS the result even when
      // cached.allowed is false, whereas the fresh-fetch path below THROWS
      // DNCBlockedError. Because blocked results are cached before the
      // throw, the first check of a DNC number throws and subsequent checks
      // within 24 h quietly return { allowed: false }. Callers that only
      // catch the error will treat the cached case as permitted.
      logger4.debug({ to: params.to, purpose: params.purpose, allowed: cached.allowed }, "DNC cache hit");
      return { ...cached, fromCache: true };
    }
    const result = await this.fetchDNCStatus(params);
    this.dncCache.set(cacheKey, result);
    if (!result.allowed) {
      throw new DNCBlockedError(params.to);
    }
    return result;
  }
  /**
   * Check if the current time (or a given time) is within TRAI calling hours.
   * Allowed: 9:00 AM – 9:00 PM IST.
   * Uses Intl.DateTimeFormat only — no date-fns or dayjs dependency.
   *
   * @param at Time to check. Defaults to now.
   * @param timezone IANA timezone. Defaults to 'Asia/Kolkata'.
   *
   * @example
   * ```ts
   * trai.isWithinCallingHours() // Check now
   * trai.isWithinCallingHours(new Date()) // Explicit time
   * ```
   */
  isWithinCallingHours(at, timezone) {
    const tz = timezone ?? this.config.timezone;
    const date = at ?? /* @__PURE__ */ new Date();
    // Extract the hour-of-day in the target timezone.
    const parts = new Intl.DateTimeFormat("en-IN", {
      timeZone: tz,
      hour: "numeric",
      hour12: false
    }).formatToParts(date);
    const hourPart = parts.find((p) => p.type === "hour");
    const hour = parseInt(hourPart?.value ?? "0", 10);
    // Half-open window: [start, end) — 21:00 itself is out of hours.
    return hour >= this.config.callingHoursStart && hour < this.config.callingHoursEnd;
  }
  /**
   * Record explicit consent from a user for future calls.
   * Consent is valid for 180 days per TRAI guidelines.
   *
   * NOTE(review): if the number fails to parse, `normalized` is undefined
   * and the record is cached under the key `undefined` — consider validating
   * params.phoneNumber up front.
   *
   * @param params Consent record details
   *
   * @example
   * ```ts
   * await trai.recordConsent({
   *   phoneNumber: '+919876543210',
   *   consentedAt: new Date(),
   *   channel: 'ivr',
   *   purpose: 'PROMOTIONAL',
   * })
   * ```
   */
  async recordConsent(params) {
    const normalized = parsePhoneNumberFromString(params.phoneNumber)?.format("E.164");
    this.consentCache.set(normalized, params);
    logger4.info(
      { phoneNumber: normalized, purpose: params.purpose, channel: params.channel },
      "Consent recorded"
    );
  }
  /**
   * Check if a number has valid (non-expired) consent on record.
   *
   * @param phoneNumber E.164 phone number
   * @returns True if valid consent exists
   */
  async hasValidConsent(phoneNumber) {
    let normalized;
    try {
      normalized = parsePhoneNumberFromString(phoneNumber)?.format("E.164");
    } catch {
      return false;
    }
    const record = this.consentCache.get(normalized);
    if (!record) return false;
    // Double-check age explicitly even though the cache TTL should expire it.
    const ageMs = Date.now() - record.consentedAt.getTime();
    return ageMs < CONSENT_VALIDITY_MS;
  }
  /**
   * Fetch DNC status from TRAI DND API.
   *
   * Policy: 404 and transport/API failures both FAIL OPEN (call allowed).
   * NOTE(review): fail-open on registry outage is a regulatory-risk decision;
   * confirm it is deliberate.
   *
   * @internal
   */
  async fetchDNCStatus(params) {
    try {
      logger4.debug({ to: params.to, purpose: params.purpose }, "Fetching DNC status from TRAI");
      const response = await this.http.post("", {
        phone: params.to,
        type: params.purpose
      });
      const result = {
        allowed: !response.data.registered,
        reason: response.data.registered ? `Number is registered on DNC for category: ${response.data.category ?? "ALL"}` : void 0,
        cachedAt: /* @__PURE__ */ new Date(),
        fromCache: false
      };
      logger4.info({ to: params.to, allowed: result.allowed }, "DNC status fetched");
      return result;
    } catch (err) {
      if (axios.isAxiosError(err) && err.response?.status === 404) {
        return { allowed: true, cachedAt: /* @__PURE__ */ new Date(), fromCache: false };
      }
      logger4.error({ err, to: params.to }, "TRAI DNC API unavailable \u2014 failing open");
      return {
        allowed: true,
        reason: "DNC check unavailable \u2014 failing open",
        cachedAt: /* @__PURE__ */ new Date(),
        fromCache: false
      };
    }
  }
};
|
|
986
|
+
// Module-scoped logger for conversation memory.
var logger5 = pino({ name: "@voice-kit/core:memory" });
// Default limits for per-call conversation windows: at most 20 turns,
// ~512 KB serialized, evicted after 30 minutes of inactivity.
var DEFAULTS2 = {
  maxTurns: 20,
  maxBytes: 512e3,
  // 512KB
  ttlMs: 30 * 6e4
  // 30 minutes
};
|
|
994
|
+
/**
 * Rough LLM token estimate for a message list: total content characters / 4
 * (the common ~4-chars-per-token heuristic), rounded up.
 * Non-string content is measured via its JSON serialization.
 */
function estimateTokens(messages) {
  const totalChars = messages.reduce((sum, msg) => {
    const text = typeof msg.content === "string" ? msg.content : JSON.stringify(msg.content);
    return sum + text.length;
  }, 0);
  return Math.ceil(totalChars / 4);
}
|
|
1002
|
+
/**
 * Approximate in-memory footprint of a message list: the length of its JSON
 * serialization (characters, not UTF-8 bytes — a deliberate cheap estimate).
 */
function estimateBytes(messages) {
  const serialized = JSON.stringify(messages);
  return serialized.length;
}
|
|
1005
|
+
// Per-call conversation window held in an in-process LRU cache.
// Entries are evicted by TTL (config.ttlMs) or when more than 1000 calls are
// tracked at once; eviction silently drops that call's history.
var LRUCallMemory = class {
  cache;  // LRUCache mapping callId -> message array
  config; // { maxTurns, maxBytes, ttlMs } — fully merged by createCallMemory
  constructor(config) {
    this.config = config;
    this.cache = new LRUCache({
      max: 1e3,
      // max concurrent calls in memory
      ttl: config.ttlMs,
      updateAgeOnGet: true
      // reset TTL on access (active calls stay warm)
    });
  }
  /**
   * Add a turn to the call's conversation window.
   * Automatically trims oldest turns when maxTurns or maxBytes is exceeded.
   * Always keeps at least the newest message, even if it alone exceeds maxBytes.
   *
   * @param callId The call identifier
   * @param message ModelMessage to append
   */
  addTurn(callId, message) {
    const existing = this.cache.get(callId) ?? [];
    const updated = [...existing, message];
    // Cap turn count first: keep only the newest maxTurns messages.
    const trimmed = updated.length > this.config.maxTurns ? updated.slice(updated.length - this.config.maxTurns) : updated;
    // Then cap bytes: drop oldest messages until under maxBytes (or one left).
    let bytesTrimmed = trimmed;
    while (bytesTrimmed.length > 1 && estimateBytes(bytesTrimmed) > this.config.maxBytes) {
      bytesTrimmed = bytesTrimmed.slice(1);
    }
    this.cache.set(callId, bytesTrimmed);
    logger5.debug(
      { callId, turns: bytesTrimmed.length, bytes: estimateBytes(bytesTrimmed) },
      "Memory: turn added"
    );
  }
  /**
   * Get all turns for a call.
   *
   * @param callId The call identifier
   * @returns Array of ModelMessage (empty if call not found or TTL-expired)
   */
  getTurns(callId) {
    return this.cache.get(callId) ?? [];
  }
  /**
   * Clear all turns for a call. Call this on call.ended to free memory.
   *
   * @param callId The call identifier
   */
  clearCall(callId) {
    this.cache.delete(callId);
    logger5.debug({ callId }, "Memory: call cleared");
  }
  /**
   * Estimate the number of LLM tokens used by a call's history
   * (~4 chars/token heuristic — see estimateTokens).
   *
   * @param callId The call identifier
   */
  getTokenEstimate(callId) {
    const messages = this.cache.get(callId) ?? [];
    return estimateTokens(messages);
  }
  /**
   * Trim oldest turns to stay within a token budget.
   * Called by VoiceAgent before each LLM call to prevent context overflow.
   * Note: writes the (possibly empty) array back, which also refreshes TTL
   * and can create an entry for an unknown callId.
   *
   * @param callId The call identifier
   * @param maxTokens Maximum tokens to retain
   */
  trimToTokenBudget(callId, maxTokens) {
    let messages = this.cache.get(callId) ?? [];
    while (messages.length > 1 && estimateTokens(messages) > maxTokens) {
      messages = messages.slice(1);
    }
    this.cache.set(callId, messages);
    logger5.debug(
      { callId, turns: messages.length, estimatedTokens: estimateTokens(messages) },
      "Memory: trimmed to token budget"
    );
  }
};
|
|
1085
|
+
/**
 * Build an LRUCallMemory, filling any omitted limit with the module defaults
 * (DEFAULTS2: 20 turns, 512KB, 30-minute TTL).
 *
 * @param config Optional partial { maxTurns, maxBytes, ttlMs }
 * @returns A fully configured LRUCallMemory instance
 */
function createCallMemory(config) {
  const { maxTurns, maxBytes, ttlMs } = config ?? {};
  return new LRUCallMemory({
    maxTurns: maxTurns ?? DEFAULTS2.maxTurns,
    maxBytes: maxBytes ?? DEFAULTS2.maxBytes,
    ttlMs: ttlMs ?? DEFAULTS2.ttlMs
  });
}
|
|
1093
|
+
// Logger scoped to the call-metrics subsystem.
var logger6 = pino({ name: "@voice-kit/core:metrics" });
// USD list price per 1M tokens, keyed by model id; unknown models cost $0.
// NOTE(review): point-in-time prices — verify before using for billing.
var TOKEN_COSTS_PER_M = {
  "gpt-4o": { input: 5, output: 15 },
  "gpt-4o-mini": { input: 0.15, output: 0.6 },
  "claude-3-5-sonnet": { input: 3, output: 15 },
  "llama-3.3-70b": { input: 0.59, output: 0.79 }
};
|
|
1100
|
+
/**
 * 95th-percentile of a numeric sample (nearest-rank style).
 * Does not mutate the input; returns 0 for an empty sample.
 *
 * @param values Array of numbers
 * @returns The value at the 95th-percentile rank
 */
function p95(values) {
  if (values.length === 0) return 0;
  const ascending = values.slice().sort((x, y) => x - y);
  const rank = Math.min(Math.floor(ascending.length * 0.95), ascending.length - 1);
  return ascending[rank] ?? 0;
}
|
|
1106
|
+
/**
 * Arithmetic mean of a numeric sample; 0 for an empty sample (rather than NaN).
 *
 * @param values Array of numbers
 * @returns Mean value
 */
function avg(values) {
  const count = values.length;
  if (count === 0) return 0;
  let total = 0;
  for (const v of values) total += v;
  return total / count;
}
|
|
1110
|
+
// Per-call latency / interruption / token-cost metrics, held in an in-process
// LRU (max 10k calls, 2h TTL) so abandoned calls age out automatically.
var CallMetrics = class {
  store; // LRUCache mapping callId -> mutable metrics record
  constructor() {
    this.store = new LRUCache({
      max: 1e4,
      ttl: 2 * 60 * 60 * 1e3
      // 2 hours
    });
  }
  // Fetch the metrics record for a call, creating an empty one on first use.
  getOrCreate(callId) {
    const existing = this.store.get(callId);
    if (existing) return existing;
    const data = {
      sttFirstByteMs: [],
      ttsFirstByteMs: [],
      llmFirstTokenMs: [],
      turnLatencyMs: [],
      interruptionCount: 0,
      interruptionPositions: [],
      tokenCost: []
    };
    this.store.set(callId, data);
    return data;
  }
  /** Record time from audio start to first STT partial result. */
  recordSTTFirstByte(callId, ms) {
    this.getOrCreate(callId).sttFirstByteMs.push(ms);
    logger6.debug({ callId, ms }, "Metric: STT TTFB");
  }
  /** Record time from TTS request to first audio chunk. */
  recordTTSFirstByte(callId, ms) {
    this.getOrCreate(callId).ttsFirstByteMs.push(ms);
    logger6.debug({ callId, ms }, "Metric: TTS TTFB");
  }
  /** Record time from LLM request to first token. */
  recordLLMFirstToken(callId, ms) {
    this.getOrCreate(callId).llmFirstTokenMs.push(ms);
    logger6.debug({ callId, ms }, "Metric: LLM first token");
  }
  /**
   * Record end-to-end turn latency: speech_end → first TTS audio byte.
   * This is the primary latency metric for voice agent quality.
   */
  recordTurnLatency(callId, ms) {
    this.getOrCreate(callId).turnLatencyMs.push(ms);
    logger6.debug({ callId, ms }, "Metric: turn latency");
  }
  /**
   * Record an interruption event.
   *
   * @param callId Call identifier
   * @param positionPct 0–1, how far through the TTS stream the interruption occurred
   */
  recordInterruption(callId, positionPct) {
    const data = this.getOrCreate(callId);
    data.interruptionCount++;
    data.interruptionPositions.push(positionPct);
    logger6.debug({ callId, positionPct }, "Metric: interruption");
  }
  /**
   * Record token usage and estimated cost for a model call.
   * Unknown model ids fall back to zero cost (see TOKEN_COSTS_PER_M).
   */
  recordTokenCost(callId, model, inputTokens, outputTokens) {
    const costs = TOKEN_COSTS_PER_M[model] ?? { input: 0, output: 0 };
    const estimatedUsdCost = inputTokens / 1e6 * costs.input + outputTokens / 1e6 * costs.output;
    this.getOrCreate(callId).tokenCost.push({
      model,
      inputTokens,
      outputTokens,
      estimatedUsdCost
    });
    logger6.debug({ callId, model, inputTokens, outputTokens, estimatedUsdCost }, "Metric: token cost");
  }
  /**
   * Get a full summary of metrics for a call.
   * Arrays are shallow-copied so callers cannot mutate live metric state.
   *
   * @param callId The call identifier
   * @returns Aggregated metrics summary (avg/p95 rounded to whole ms)
   */
  getCallSummary(callId) {
    const data = this.getOrCreate(callId);
    return {
      callId,
      sttFirstByteMs: [...data.sttFirstByteMs],
      ttsFirstByteMs: [...data.ttsFirstByteMs],
      llmFirstTokenMs: [...data.llmFirstTokenMs],
      turnLatencyMs: [...data.turnLatencyMs],
      interruptionCount: data.interruptionCount,
      interruptionPositions: [...data.interruptionPositions],
      tokenCost: [...data.tokenCost],
      avgTurnLatencyMs: Math.round(avg(data.turnLatencyMs)),
      p95TurnLatencyMs: Math.round(p95(data.turnLatencyMs))
    };
  }
  /** Remove metrics for a call. Call on call.ended to free memory. */
  clearCall(callId) {
    this.store.delete(callId);
  }
};
|
|
1207
|
+
// Logger scoped to the observability subsystem.
var logger7 = pino({ name: "@voice-kit/core:observability" });
// Lazily-initialized module-level singleton tracer provider (see getOrInitProvider).
var _provider = null;
|
|
1209
|
+
/**
 * Lazily create, register, and memoize the process-wide OTel tracer provider.
 * Exports spans over OTLP only when OTEL_EXPORTER_OTLP_ENDPOINT is set;
 * otherwise the provider is registered with no span processors (spans are
 * created but never exported).
 *
 * NOTE(review): uses SimpleSpanProcessor (per-span, synchronous export);
 * BatchSpanProcessor is usually preferred in production — confirm intent.
 */
function getOrInitProvider() {
  if (_provider) return _provider;
  const endpoint = process.env["OTEL_EXPORTER_OTLP_ENDPOINT"];
  _provider = new NodeTracerProvider({
    resource: resourceFromAttributes({
      [ATTR_SERVICE_NAME]: "voice-kit"
    }),
    // Pass span processors directly in constructor — addSpanProcessor doesn't exist in this version
    spanProcessors: endpoint ? [new SimpleSpanProcessor(new OTLPTraceExporter({ url: endpoint }))] : []
  });
  if (endpoint) {
    logger7.info({ endpoint }, "OTel OTLP exporter configured");
  }
  // Registers this provider as the process-global tracer provider.
  _provider.register();
  return _provider;
}
|
|
1225
|
+
// Thin OpenTelemetry wrapper giving named span helpers for each stage of a
// voice call (STT, TTS, LLM, whole call, single turn). Construction triggers
// global provider registration via getOrInitProvider().
var VoiceSDKTracer = class {
  tracer; // OTel Tracer instance for this package
  constructor() {
    getOrInitProvider();
    // NOTE(review): tracer version is hard-coded "0.1.0" — may lag package version.
    this.tracer = trace.getTracer("@voice-kit/core", "0.1.0");
  }
  /**
   * Trace an STT operation with provider + language attributes.
   */
  async traceSTT(fn, attrs) {
    return this.withSpan(`stt.${attrs.provider}`, fn, {
      "stt.provider": attrs.provider,
      "stt.language": attrs.language,
      ...attrs.callId && { "call.id": attrs.callId }
    });
  }
  /**
   * Trace a TTS synthesis operation.
   */
  async traceTTS(fn, attrs) {
    return this.withSpan(`tts.${attrs.provider}`, fn, {
      "tts.provider": attrs.provider,
      "tts.voice_id": attrs.voice,
      "tts.char_count": attrs.chars,
      ...attrs.callId && { "call.id": attrs.callId }
    });
  }
  /**
   * Trace an LLM generation call.
   */
  async traceLLM(fn, attrs) {
    return this.withSpan(`llm.${attrs.model}`, fn, {
      "llm.model": attrs.model,
      "llm.input_tokens": attrs.inputTokens,
      ...attrs.callId && { "call.id": attrs.callId }
    });
  }
  /**
   * Trace a full call lifecycle.
   */
  async traceCall(fn, attrs) {
    return this.withSpan("call", fn, {
      "call.id": attrs.callId,
      "call.direction": attrs.direction
    });
  }
  /**
   * Trace a single conversation turn.
   */
  async traceTurn(fn, attrs) {
    return this.withSpan("turn", fn, {
      "turn.index": attrs.turnIndex,
      "call.id": attrs.callId
    });
  }
  /**
   * Generic span wrapper: runs fn inside a span, marks OK/ERROR status,
   * records a duration_ms attribute on success and the exception on failure,
   * always ends the span, and rethrows errors unchanged. @internal
   */
  async withSpan(name, fn, attributes) {
    const span = this.tracer.startSpan(name, { attributes });
    const startMs = Date.now();
    try {
      const result = await fn();
      span.setStatus({ code: SpanStatusCode.OK });
      span.setAttribute("duration_ms", Date.now() - startMs);
      return result;
    } catch (err) {
      span.setStatus({
        code: SpanStatusCode.ERROR,
        message: err instanceof Error ? err.message : String(err)
      });
      span.recordException(err instanceof Error ? err : new Error(String(err)));
      throw err;
    } finally {
      span.end();
    }
  }
};
|
|
1301
|
+
// Logger scoped to the AssemblyAI STT provider.
var logger8 = pino({ name: "@voice-kit/core:stt:assemblyai" });
// Language codes this wrapper accepts for AssemblyAI batch transcription.
var SUPPORTED_LANGUAGES = [
  "en",
  "en_au",
  "en_uk",
  "en_us",
  "hi",
  "fr",
  "de",
  "es",
  "it",
  "pt",
  "nl",
  "ja",
  "zh"
];
|
|
1317
|
+
// STT provider backed by AssemblyAI's async (batch) transcription API.
// No realtime streaming: transcribeStream simply buffers all audio and
// delegates to transcribeBatch.
var AssemblyAISTTProvider = class {
  name = "assemblyai";
  supportsStreaming = false;
  supportedLanguages = SUPPORTED_LANGUAGES;
  client; // AssemblyAI SDK client
  config; // normalized STT config (interimResults/region are fixed here)
  // Throws STTConnectionError when no API key is provided via config or
  // the ASSEMBLYAI_API_KEY environment variable.
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["ASSEMBLYAI_API_KEY"];
    if (!apiKey) throw new STTConnectionError("assemblyai", new Error("ASSEMBLYAI_API_KEY not set"));
    this.client = new AssemblyAI({ apiKey });
    this.config = {
      language: config.language ?? "en",
      alternateLanguages: config.alternateLanguages ?? [],
      apiKey,
      model: config.model ?? "best",
      wordTimestamps: config.wordTimestamps ?? true,
      // batch-only provider: interim results and region are not applicable
      interimResults: false,
      smartFormat: config.smartFormat ?? true,
      region: ""
    };
  }
  /**
   * Batch-transcribes collected audio. AssemblyAI has no realtime streaming.
   * Collects all audio from the iterable, uploads, then polls for result.
   * Yields exactly one (final) result.
   *
   * @param audio Async iterable of PCM buffers
   */
  async *transcribeStream(audio) {
    const chunks = [];
    for await (const chunk of audio) chunks.push(chunk);
    const result = await this.transcribeBatch(Buffer.concat(chunks));
    yield result;
  }
  /**
   * Upload audio to AssemblyAI and wait for async transcription.
   * Suitable for call recordings. Average latency: 15–45s per minute of audio.
   * Wraps any failure (including transcript.status === "error") in STTStreamError.
   *
   * @param audio Raw WAV/PCM/MP3 buffer
   *
   * @example
   * ```ts
   * const stt = createSTT('assemblyai', { wordTimestamps: true })
   * const result = await stt.transcribeBatch(recordingBuffer)
   * console.log(result.words) // Word-level timestamps
   * ```
   */
  async transcribeBatch(audio) {
    const startMs = Date.now();
    try {
      logger8.debug({ bytes: audio.length, language: this.config.language }, "AssemblyAI transcription started");
      const transcript = await this.client.transcripts.transcribe({
        audio,
        language_code: this.config.language,
        speech_model: this.config.model,
        punctuate: this.config.smartFormat,
        format_text: this.config.smartFormat,
        word_boost: [],
        ...this.config.wordTimestamps && { timestamps: true }
      });
      if (transcript.status === "error") {
        throw new STTStreamError("assemblyai", new Error(transcript.error ?? "Transcription failed"));
      }
      logger8.info(
        { id: transcript.id, duration: transcript.audio_duration, latencyMs: Date.now() - startMs },
        "AssemblyAI transcription complete"
      );
      return {
        transcript: transcript.text ?? "",
        isFinal: true,
        // NOTE(review): 0.9 is a fabricated fallback confidence — confirm desired default.
        confidence: transcript.confidence ?? 0.9,
        language: this.config.language,
        languageSwitchDetected: false,
        words: this.config.wordTimestamps && transcript.words ? transcript.words.map((w) => ({
          word: w.text,
          startMs: w.start,
          endMs: w.end,
          confidence: w.confidence
        })) : void 0,
        latencyMs: Date.now() - startMs
      };
    } catch (err) {
      if (err instanceof STTStreamError) throw err;
      throw new STTStreamError("assemblyai", err);
    }
  }
};
|
|
1403
|
+
// Logger scoped to the Deepgram STT provider.
var logger9 = pino({ name: "@voice-kit/core:stt:deepgram" });
// BCP-47 language codes this wrapper accepts for Deepgram streaming/batch.
var SUPPORTED_LANGUAGES2 = [
  "en-IN",
  "hi-IN",
  "ta-IN",
  "te-IN",
  "kn-IN",
  "mr-IN",
  "en-US",
  "en-GB",
  "en-AU"
];
|
|
1415
|
+
// Reconnect policy shared by the Deepgram streaming provider.
var BACKOFF = {
  baseMs: 100,
  maxMs: 5e3,
  jitterPct: 0.2,
  maxAttempts: 3
};
/**
 * Exponential backoff delay with symmetric jitter.
 * Doubles baseMs per attempt, caps at maxMs, then adds ±jitterPct noise.
 *
 * @param attempt Zero-based retry attempt number
 * @returns Delay in whole milliseconds
 */
function backoffDelay(attempt) {
  const capped = Math.min(BACKOFF.baseMs * 2 ** attempt, BACKOFF.maxMs);
  const spread = (Math.random() * 2 - 1) * BACKOFF.jitterPct * capped;
  return Math.round(capped + spread);
}
|
|
1426
|
+
// Realtime STT provider backed by Deepgram's v5 SDK.
// Streaming uses a WebSocket with reconnect-and-backoff; batch uses the
// pre-recorded API.
var DeepgramSTTProvider = class {
  name = "deepgram";
  supportsStreaming = true;
  supportedLanguages = SUPPORTED_LANGUAGES2;
  client; // DeepgramClient SDK instance
  config; // normalized STT config
  // Throws STTConnectionError when no API key is provided via config or
  // the DEEPGRAM_API_KEY environment variable.
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["DEEPGRAM_API_KEY"];
    if (!apiKey) throw new STTConnectionError("deepgram", new Error("DEEPGRAM_API_KEY not set"));
    this.client = new DeepgramClient({ apiKey });
    this.config = {
      language: config.language ?? "en-IN",
      alternateLanguages: config.alternateLanguages ?? [],
      apiKey,
      // nova-3 is now Deepgram's latest recommended model
      model: config.model ?? "nova-3",
      wordTimestamps: config.wordTimestamps ?? false,
      interimResults: config.interimResults ?? true,
      smartFormat: config.smartFormat ?? true,
      region: config.region ?? ""
    };
  }
  /**
   * Stream audio to Deepgram and receive interim + final transcription results.
   * Handles reconnection transparently with exponential backoff.
   *
   * Implementation notes:
   * - Listener callbacks append to `results`; the generator drains that queue,
   *   polling every 10ms while it is empty and the socket is open.
   * - NOTE(review): on reconnect the same `audio` async iterable is re-iterated;
   *   if the source iterator is already (partially) consumed, retries may send
   *   no/partial audio — confirm the upstream iterable is replayable.
   *
   * @param audio Async iterable of 16kHz PCM buffers from AudioPipeline
   *
   * @example
   * ```ts
   * const stt = createSTT('deepgram', { language: 'hi-IN' })
   * for await (const result of stt.transcribeStream(audioIterable)) {
   *   if (result.isFinal) console.log('User said:', result.transcript)
   * }
   * ```
   */
  async *transcribeStream(audio) {
    let attempt = 0;
    const startMs = Date.now();
    while (attempt <= BACKOFF.maxAttempts) {
      const connection = await this.connectWithRetry(attempt);
      const results = [];
      let done = false;
      let error = null;
      connection.on("message", (data) => {
        if (data.type !== "Results") return;
        const alt = data.channel?.alternatives?.[0];
        if (!alt?.transcript) return;
        const isFinal = data.is_final === true;
        const result = {
          transcript: alt.transcript,
          isFinal,
          // speech_final=true means Deepgram detected end-of-utterance (endpointing).
          // A frame can be speech_final without is_final — callers should act on both.
          confidence: alt.confidence ?? 0,
          // alt.languages populated when detect_language is enabled
          language: alt.languages?.[0] ?? this.config.language,
          languageSwitchDetected: false,
          words: this.config.wordTimestamps ? alt.words?.map((w) => ({
            word: w.word ?? "",
            startMs: (w.start ?? 0) * 1e3,
            endMs: (w.end ?? 0) * 1e3,
            confidence: w.confidence ?? 0,
            punctuatedWord: w.punctuated_word
          })) : void 0,
          latencyMs: Date.now() - startMs
        };
        results.push(result);
        if (isFinal) {
          logger9.debug(
            { transcript: result.transcript, confidence: result.confidence, language: result.language },
            "Deepgram final transcript"
          );
        }
      });
      connection.on("close", () => {
        done = true;
      });
      connection.on("error", (err) => {
        error = err;
        logger9.warn({ err, attempt }, "Deepgram stream error");
      });
      // Pump the caller's audio into the socket, then ask Deepgram to flush.
      const sendAudio = async () => {
        try {
          for await (const chunk of audio) {
            connection.socket.send(chunk);
          }
          connection.socket.send(JSON.stringify({ type: "Finalize" }));
        } catch (err) {
          error = err instanceof Error ? err : new Error(String(err));
        }
      };
      const sendPromise = sendAudio();
      let resultIndex = 0;
      // Drain loop: yield queued results; 10ms poll when the queue is empty.
      while (!done || resultIndex < results.length) {
        if (resultIndex < results.length) {
          yield results[resultIndex++];
        } else {
          await new Promise((r) => setTimeout(r, 10));
        }
        // Retries remain: close this socket and fall through to reconnect.
        if (error && attempt < BACKOFF.maxAttempts) {
          try {
            connection.socket.close();
          } catch {
          }
          break;
        }
        // Retries exhausted: surface the stream error to the consumer.
        if (error && attempt >= BACKOFF.maxAttempts) {
          await sendPromise.catch(() => {
          });
          throw new STTStreamError("deepgram", error);
        }
      }
      await sendPromise.catch(() => {
      });
      // Clean close with no error: stream finished normally.
      if (!error) return;
      attempt++;
      await new Promise((r) => setTimeout(r, backoffDelay(attempt)));
      logger9.info({ attempt }, "Deepgram reconnecting...");
    }
    throw new STTStreamError("deepgram", new Error("Max reconnect attempts exceeded"));
  }
  /**
   * Transcribe a complete audio buffer (non-streaming).
   * Uses Deepgram pre-recorded API. Failures are wrapped in STTStreamError.
   *
   * @param audio Raw PCM or WAV buffer
   */
  async transcribeBatch(audio) {
    const startMs = Date.now();
    try {
      const response = await this.client.listen.v1.media.transcribeFile(
        audio,
        {
          model: this.config.model,
          language: this.config.language,
          // NOTE(review): unlike the live connect() options below, booleans are
          // passed here as real booleans — confirm the v5 batch API accepts them.
          smart_format: true,
          diarize: false
        }
      );
      const alt = response?.results?.channels?.[0]?.alternatives?.[0];
      return {
        transcript: alt?.transcript ?? "",
        isFinal: true,
        confidence: alt?.confidence ?? 0,
        language: this.config.language,
        languageSwitchDetected: false,
        latencyMs: Date.now() - startMs
      };
    } catch (err) {
      if (err instanceof STTStreamError) throw err;
      throw new STTStreamError("deepgram", err instanceof Error ? err : new Error(String(err)));
    }
  }
  /**
   * Create and open a live WebSocket connection to Deepgram.
   * Sleeps a backoff delay first when attempt > 0; times out the handshake
   * after 10s and wraps any failure in STTConnectionError.
   *
   * v5 connection lifecycle (3 explicit steps):
   * 1. await listen.v1.connect(options) — constructs the connection object
   * 2. connection.connect() — initiates the WebSocket handshake
   * 3. await connection.waitForOpen() — resolves once the socket is ready
   *
   * @internal
   */
  async connectWithRetry(attempt) {
    const delay = attempt > 0 ? backoffDelay(attempt) : 0;
    if (delay > 0) await new Promise((r) => setTimeout(r, delay));
    try {
      logger9.debug({ attempt, language: this.config.language }, "Connecting to Deepgram");
      const connection = await this.client.listen.v1.connect({
        model: this.config.model,
        language: this.config.language,
        // v5: boolean-like options must be strings
        smart_format: "true",
        interim_results: String(this.config.interimResults),
        encoding: "linear16",
        sample_rate: 16e3,
        channels: 1,
        utterance_end_ms: "1000",
        ...this.config.alternateLanguages.length > 0 && {
          detect_language: "true",
          // language must be omitted when detect_language is enabled
          language: void 0
        },
        Authorization: `Token ${this.config.apiKey}`
      });
      connection.connect();
      await Promise.race([
        connection.waitForOpen(),
        new Promise(
          (_, reject) => setTimeout(
            () => reject(new STTConnectionError("deepgram", new Error("Connection timeout"))),
            1e4
          )
        )
      ]);
      logger9.info({ attempt, language: this.config.language }, "Deepgram connected");
      return connection;
    } catch (err) {
      if (err instanceof STTConnectionError) throw err;
      throw new STTConnectionError("deepgram", err instanceof Error ? err : new Error(String(err)));
    }
  }
};
|
|
1631
|
+
// Logger scoped to the Sarvam STT provider.
var logger10 = pino({ name: "@voice-kit/core:stt:sarvam" });
// Base URL of the Sarvam REST API.
var SARVAM_API_BASE = "https://api.sarvam.ai";
// Indic language codes accepted by the Sarvam speech-to-text endpoint.
var SUPPORTED_LANGUAGES3 = [
  "hi-IN",
  "kn-IN",
  "ta-IN",
  "te-IN",
  "mr-IN",
  "bn-IN",
  "gu-IN",
  "pa-IN",
  "or-IN",
  "ml-IN"
];
// Default model per language (currently saarika:v1 across the board; kept as
// a map so per-language defaults can diverge later).
var SARVAM_MODELS = {
  "hi-IN": "saarika:v1",
  "kn-IN": "saarika:v1",
  "ta-IN": "saarika:v1",
  "te-IN": "saarika:v1",
  "mr-IN": "saarika:v1",
  "bn-IN": "saarika:v1",
  "gu-IN": "saarika:v1",
  "pa-IN": "saarika:v1",
  "or-IN": "saarika:v1",
  "ml-IN": "saarika:v1"
};
|
|
1657
|
+
// STT provider backed by Sarvam's REST speech-to-text API (Indic languages).
// Batch-only; transcribeStream buffers all audio then calls transcribeBatch.
var SarvamSTTProvider = class {
  name = "sarvam";
  supportsStreaming = false;
  // Sarvam REST API is batch-only
  supportedLanguages = SUPPORTED_LANGUAGES3;
  http;   // axios instance preconfigured with base URL, auth header, 30s timeout
  config; // normalized STT config
  // Throws STTConnectionError when no API key is available, and
  // STTLanguageNotSupportedError for languages outside SUPPORTED_LANGUAGES3.
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["SARVAM_API_KEY"];
    if (!apiKey) throw new STTConnectionError("sarvam", new Error("SARVAM_API_KEY not set"));
    const language = config.language ?? "hi-IN";
    if (!SUPPORTED_LANGUAGES3.includes(language)) {
      throw new STTLanguageNotSupportedError("sarvam", language);
    }
    this.http = axios.create({
      baseURL: SARVAM_API_BASE,
      headers: {
        "API-Subscription-Key": apiKey,
        "Content-Type": "multipart/form-data"
      },
      timeout: 3e4
    });
    this.config = {
      language,
      alternateLanguages: config.alternateLanguages ?? [],
      apiKey,
      model: config.model ?? SARVAM_MODELS[language] ?? "saarika:v1",
      wordTimestamps: false,
      // Sarvam doesn't support word timestamps yet
      interimResults: false,
      smartFormat: config.smartFormat ?? true,
      region: config.region ?? ""
    };
  }
  /**
   * Collects audio and transcribes via Sarvam batch API.
   * Sarvam doesn't support realtime streaming. Yields exactly one final result.
   *
   * @param audio Async iterable of 16kHz PCM buffers
   */
  async *transcribeStream(audio) {
    const chunks = [];
    for await (const chunk of audio) chunks.push(chunk);
    const result = await this.transcribeBatch(Buffer.concat(chunks));
    yield result;
  }
  /**
   * Transcribe a WAV/PCM audio buffer in an Indic language.
   * HTTP failures are rethrown as STTStreamError with status + response body.
   *
   * @param audio 16kHz PCM or WAV buffer
   *
   * @example
   * ```ts
   * const stt = createSTT('sarvam', { language: 'ta-IN' })
   * const result = await stt.transcribeBatch(tamilAudioBuffer)
   * console.log(result.transcript) // Tamil text
   * ```
   */
  async transcribeBatch(audio) {
    const startMs = Date.now();
    try {
      logger10.debug(
        { language: this.config.language, bytes: audio.length },
        "Sarvam transcription request"
      );
      const form = new FormData();
      // NOTE(review): buffer is labeled audio/wav regardless of actual content — confirm upstream always sends WAV.
      form.append("file", new Blob([audio], { type: "audio/wav" }), "audio.wav");
      form.append("language_code", this.config.language);
      form.append("model", this.config.model);
      if (this.config.smartFormat) {
        form.append("with_disfluencies", "false");
      }
      const response = await this.http.post(
        "/speech-to-text",
        form
      );
      const data = response.data;
      logger10.info(
        { language: data.language_code, confidence: data.confidence, latencyMs: Date.now() - startMs },
        "Sarvam transcription complete"
      );
      return {
        transcript: data.transcript,
        isFinal: true,
        // NOTE(review): 0.9 is a fabricated fallback confidence — confirm desired default.
        confidence: data.confidence ?? 0.9,
        language: data.language_code ?? this.config.language,
        languageSwitchDetected: false,
        latencyMs: Date.now() - startMs
      };
    } catch (err) {
      if (axios.isAxiosError(err)) {
        throw new STTStreamError(
          "sarvam",
          new Error(`Sarvam API error: ${err.response?.status} ${JSON.stringify(err.response?.data)}`)
        );
      }
      throw new STTStreamError("sarvam", err);
    }
  }
};
|
|
1757
|
+
// Logger scoped to the Hinglish language-switch detector.
var logger11 = pino({ name: "@voice-kit/core:stt:language-detect" });
// Unicode Devanagari block — a word containing any such char is classed hi-IN.
var DEVANAGARI_RANGE = /[\u0900-\u097F]/;
// Minimum token count before a segment classification is considered meaningful.
var MIN_WORDS_FOR_CLASSIFICATION = 2;
// Confidence required before emitting a language.switched event.
var SWITCH_CONFIDENCE_THRESHOLD = 0.6;
// Filler/ambiguous tokens (romanized Hindi + English function words) that are
// excluded from classification because they carry no language signal.
var NEUTRAL_TOKENS = /* @__PURE__ */ new Set([
  "ok",
  "okay",
  "haan",
  "nahin",
  "nahi",
  "kya",
  "hai",
  "ho",
  "na",
  "toh",
  "aur",
  "ya",
  "matlab",
  "yani",
  "i",
  "a",
  "the",
  "is",
  "are",
  "and",
  "or"
]);
|
|
1784
|
+
var LanguageSwitchDetector = class extends EventEmitter {
  currentLanguage;
  primaryLanguage;
  /** Rolling window of recent segment classifications used for smoothing. */
  recentClassifications = [];
  windowSize = 5;
  /**
   * @param primaryLanguage Language the detector starts in (default "en-IN").
   */
  constructor(primaryLanguage = "en-IN") {
    super();
    this.primaryLanguage = primaryLanguage;
    this.currentLanguage = primaryLanguage;
  }
  /**
   * Analyze a transcript for language switches.
   * Should be called on every STT final result.
   *
   * Emits "language.switched" when the smoothed classification differs from
   * the current language and the segment confidence clears the threshold.
   *
   * @param transcript The transcribed text to analyze
   * @returns Detected language of the transcript
   */
  analyze(transcript) {
    const tokens = this.tokenize(transcript);
    if (tokens.length === 0) return this.currentLanguage;
    const segmentLang = this.classifySegment(tokens);
    const confidence = this.computeConfidence(tokens, segmentLang);
    this.recentClassifications.push(segmentLang);
    while (this.recentClassifications.length > this.windowSize) {
      this.recentClassifications.shift();
    }
    const smoothed = this.smoothedLanguage();
    const switching = smoothed !== this.currentLanguage && smoothed !== "unknown" && confidence >= SWITCH_CONFIDENCE_THRESHOLD;
    if (switching) {
      const previous = this.currentLanguage;
      const event = {
        from: previous,
        to: smoothed,
        position: 0,
        // position in full conversation
        confidence,
        transcript,
        detectedAt: /* @__PURE__ */ new Date()
      };
      this.currentLanguage = smoothed;
      logger11.info(
        { from: previous, to: smoothed, confidence, transcript: transcript.slice(0, 50) },
        "Language switch detected"
      );
      this.emit("language.switched", event);
    }
    return this.currentLanguage;
  }
  /**
   * Analyze a transcript and return per-word language classification.
   * Useful for word-level Hinglish mixing visualization.
   *
   * @param transcript Text to analyze
   * @returns Array of { word, language } pairs
   */
  analyzeWords(transcript) {
    return this.tokenize(transcript).map((word) => ({
      word,
      language: this.classifyWord(word)
    }));
  }
  /** Reset to primary language (e.g., on new call). */
  reset() {
    this.currentLanguage = this.primaryLanguage;
    this.recentClassifications = [];
  }
  /** Current detected language. */
  get language() {
    return this.currentLanguage;
  }
  // ─── Private helpers ────────────────────────────────────────────────────────
  /** Lowercase, split on whitespace, drop empty and neutral (filler) tokens. */
  tokenize(text) {
    const parts = text.toLowerCase().split(/\s+/);
    return parts.filter((token) => token.length > 0 && !NEUTRAL_TOKENS.has(token));
  }
  /** Script-based single-word classification: Devanagari → hi-IN, pure ASCII letters → en-IN. */
  classifyWord(word) {
    if (DEVANAGARI_RANGE.test(word)) return "hi-IN";
    if (/^[a-z]+$/.test(word)) return "en-IN";
    return "unknown";
  }
  /** Majority vote over the tokens; ties fall back to the primary language. */
  classifySegment(words) {
    const tally = { "hi-IN": 0, "en-IN": 0 };
    for (const token of words) {
      const lang = this.classifyWord(token);
      if (lang !== "unknown") tally[lang] += 1;
    }
    if (tally["hi-IN"] === 0 && tally["en-IN"] === 0) return "unknown";
    if (tally["hi-IN"] !== tally["en-IN"]) {
      return tally["hi-IN"] > tally["en-IN"] ? "hi-IN" : "en-IN";
    }
    return this.primaryLanguage;
  }
  /** Fraction of classifiable tokens agreeing with the segment classification; 0 if too few. */
  computeConfidence(words, classification) {
    let relevant = 0;
    let matching = 0;
    for (const token of words) {
      const lang = this.classifyWord(token);
      if (lang === "unknown") continue;
      relevant += 1;
      if (lang === classification) matching += 1;
    }
    if (relevant < MIN_WORDS_FOR_CLASSIFICATION) return 0;
    return matching / relevant;
  }
  /** Majority language across the rolling window; ties keep the current language. */
  smoothedLanguage() {
    if (this.recentClassifications.length === 0) return this.primaryLanguage;
    let hindi = 0;
    let english = 0;
    for (const lang of this.recentClassifications) {
      if (lang === "hi-IN") hindi += 1;
      else if (lang === "en-IN") english += 1;
    }
    if (hindi > english) return "hi-IN";
    if (english > hindi) return "en-IN";
    return this.currentLanguage;
  }
};
|
|
1894
|
+
/**
 * True when a transcript mixes scripts: it contains at least one Devanagari
 * character AND at least one Latin letter (Hinglish-style code-mixing).
 *
 * @param transcript Text to inspect
 * @returns Whether both scripts are present
 */
function isInglish(transcript) {
  if (!DEVANAGARI_RANGE.test(transcript)) return false;
  return /[a-zA-Z]/.test(transcript);
}
|
|
1899
|
+
var logger12 = pino({ name: "@voice-kit/core:stt:whisper" });
// Two-letter language codes this Whisper provider accepts; the configured
// locale's prefix (e.g. "en" from "en-IN") is validated against this list.
var WHISPER_LANGUAGES = [
  "en", "hi", "ta", "te", "kn", "mr", "bn", "gu", "pa", "ur",
  "fr", "de", "es", "pt", "it", "nl", "pl", "ru", "ja", "zh"
];
|
|
1922
|
+
var WhisperSTTProvider = class {
  name = "whisper";
  // Whisper has no realtime endpoint; transcribeStream buffers the whole utterance first.
  supportsStreaming = false;
  supportedLanguages = WHISPER_LANGUAGES;
  // Fully-resolved config (every field defaulted in the constructor).
  config;
  /**
   * @param config STT config; apiKey falls back to the OPENAI_API_KEY env var.
   * @throws STTStreamError when no API key is available
   * @throws STTLanguageNotSupportedError when the language prefix is not in WHISPER_LANGUAGES
   */
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["OPENAI_API_KEY"];
    if (!apiKey) throw new STTStreamError("whisper", new Error("OPENAI_API_KEY not set"));
    const language = config.language ?? "en-IN";
    // Whisper expects bare ISO codes ("en"), not BCP-47 locales ("en-IN").
    const whisperLang = language.split("-")[0] ?? "en";
    if (!WHISPER_LANGUAGES.includes(whisperLang)) {
      throw new STTLanguageNotSupportedError("whisper", language);
    }
    this.config = {
      language,
      alternateLanguages: config.alternateLanguages ?? [],
      apiKey,
      model: config.model ?? "whisper-1",
      wordTimestamps: config.wordTimestamps ?? false,
      interimResults: false,
      smartFormat: false,
      region: ""
    };
  }
  /**
   * Streaming not supported by Whisper. Collects all audio then transcribes.
   * For realtime use, use createSTT('deepgram') instead.
   *
   * @param audio Async iterable of audio Buffers
   * @yields A single final transcription result
   */
  async *transcribeStream(audio) {
    const chunks = [];
    for await (const chunk of audio) chunks.push(chunk);
    const result = await this.transcribeBatch(Buffer.concat(chunks));
    yield result;
  }
  /**
   * Transcribe a complete audio buffer via Whisper's HTTP API.
   *
   * Fix: removed an unused OpenAI client (`createOpenAI(...)`) that was
   * constructed on every call and never referenced — the request is made
   * directly with `fetch` below.
   *
   * @param audio WAV or PCM buffer
   * @returns Transcription result (confidence is a fixed 0.95 — Whisper reports none)
   * @throws STTStreamError on API or network failure
   */
  async transcribeBatch(audio) {
    const startMs = Date.now();
    const language = this.config.language.split("-")[0] ?? "en";
    try {
      logger12.debug({ language, bytes: audio.length }, "Whisper batch transcription");
      const file = new File([audio], "audio.wav", { type: "audio/wav" });
      const formData = new FormData();
      formData.append("file", file);
      formData.append("model", this.config.model);
      formData.append("language", language);
      if (this.config.wordTimestamps) {
        // Word-level timestamps require the verbose JSON response format.
        formData.append("timestamp_granularities[]", "word");
        formData.append("response_format", "verbose_json");
      }
      const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
        method: "POST",
        headers: { Authorization: `Bearer ${this.config.apiKey}` },
        body: formData
      });
      if (!response.ok) {
        throw new Error(`Whisper API error: ${response.status} ${response.statusText}`);
      }
      const data = await response.json();
      return {
        transcript: data.text,
        isFinal: true,
        confidence: 0.95,
        // Whisper doesn't return confidence
        language: this.config.language,
        languageSwitchDetected: false,
        words: this.config.wordTimestamps && data.words ? data.words.map((w) => ({
          word: w.word,
          // API reports seconds; convert to milliseconds.
          startMs: w.start * 1e3,
          endMs: w.end * 1e3,
          confidence: 0.95
        })) : void 0,
        latencyMs: Date.now() - startMs
      };
    } catch (err) {
      if (err instanceof STTStreamError) throw err;
      throw new STTStreamError("whisper", err);
    }
  }
};
|
|
2006
|
+
|
|
2007
|
+
// src/stt/STT-factory.ts
/**
 * Instantiate an STT provider by name.
 *
 * @param provider One of "deepgram" | "whisper" | "assemblyai" | "sarvam"
 * @param config Optional provider config (defaults to an empty object)
 * @throws Error for an unrecognized provider name
 */
function createSTT(provider, config) {
  const cfg = config ?? {};
  const constructors = new Map([
    ["deepgram", DeepgramSTTProvider],
    ["whisper", WhisperSTTProvider],
    ["assemblyai", AssemblyAISTTProvider],
    ["sarvam", SarvamSTTProvider]
  ]);
  const Ctor = constructors.get(provider);
  if (Ctor === undefined) {
    throw new Error(`Unknown STT provider: ${String(provider)}`);
  }
  return new Ctor(cfg);
}
|
|
2025
|
+
// Scoped pino logger for the Cartesia TTS provider.
var logger13 = pino({ name: "@voice-kit/core:tts:cartesia" });
// Fallback Cartesia voice UUID applied when config.voiceId is omitted.
var DEFAULT_VOICE_ID = "a0e99841-438c-4a64-b679-ae501e7d6091";
|
|
2027
|
+
var CartesiaTTSProvider = class {
  name = "cartesia";
  outputSampleRate = 22050;
  // Cartesia default
  outputFormat = "pcm";
  // Cartesia SDK client, constructed once with the resolved API key.
  client;
  // Fully-resolved config: every field defaulted in the constructor.
  config;
  /**
   * Resolve the API key (config value or CARTESIA_API_KEY env var) and fill
   * remaining config fields with defaults.
   * @throws TTSConnectionError when no API key is available.
   */
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["CARTESIA_API_KEY"];
    if (!apiKey) throw new TTSConnectionError("cartesia", new Error("CARTESIA_API_KEY not set"));
    this.client = new Cartesia({ apiKey });
    this.config = {
      voiceId: config.voiceId ?? DEFAULT_VOICE_ID,
      sampleRate: config.sampleRate ?? 22050,
      speed: config.speed ?? 1,
      pitch: config.pitch ?? 0,
      apiKey,
      modelId: config.modelId ?? "sonic-english",
      emotion: config.emotion ?? "",
      targetLanguage: config.targetLanguage ?? "en"
    };
  }
  /**
   * Stream audio from Cartesia. Typically delivers first chunk in < 90ms.
   *
   * Consumes Cartesia's server-sent-events endpoint and yields raw PCM
   * (s16le) Buffers decoded from each event's base64 payload.
   *
   * @param text Text to synthesize
   * @param config Per-call overrides (voiceId, modelId, sampleRate)
   * @throws TTSStreamError wrapping any transport or SDK failure
   *
   * @example
   * ```ts
   * const tts = createTTS('cartesia', { voiceId: 'your-voice-id' })
   * for await (const chunk of tts.synthesizeStream('Hello!')) {
   *   sendToTelephony(chunk)
   * }
   * ```
   */
  async *synthesizeStream(text, config) {
    const voiceId = config?.voiceId ?? this.config.voiceId;
    const startMs = Date.now();
    logger13.debug({ voiceId, chars: text.length }, "Cartesia TTS stream start");
    try {
      const stream = await this.client.tts.generateSse({
        model_id: config?.modelId ?? this.config.modelId,
        transcript: text,
        voice: {
          mode: "id",
          id: voiceId,
          // Experimental Cartesia emotion steering — only sent when configured.
          ...this.config.emotion && {
            __experimental_controls: {
              emotion: [this.config.emotion]
            }
          }
        },
        output_format: {
          container: "raw",
          encoding: "pcm_s16le",
          // Cartesia rejects arbitrary rates; clamp to a supported one.
          sample_rate: toValidSampleRate(config?.sampleRate ?? this.config.sampleRate)
        }
      });
      let firstChunk = true;
      // Each SSE event carries base64 audio under payload.chunk.audio;
      // "[DONE]" marks end of stream.
      for await (const event of stream) {
        if (!event.data || event.data === "[DONE]") continue;
        let payload;
        try {
          payload = JSON.parse(event.data);
        } catch {
          // Skip malformed / non-JSON events rather than aborting the stream.
          continue;
        }
        if (!payload.chunk?.audio) continue;
        const buf = Buffer.from(payload.chunk.audio, "base64");
        if (firstChunk) {
          firstChunk = false;
          logger13.debug({ ttfb: Date.now() - startMs, voiceId }, "Cartesia first audio chunk");
        }
        yield buf;
      }
    } catch (err) {
      throw new TTSStreamError("cartesia", err);
    }
  }
  /** Synthesize complete audio by collecting all streamed chunks into one Buffer. */
  async synthesizeFull(text, config) {
    const chunks = [];
    for await (const chunk of this.synthesizeStream(text, config)) {
      chunks.push(chunk);
    }
    return Buffer.concat(chunks);
  }
};
|
|
2113
|
+
// Sample rates accepted by Cartesia's raw PCM output format.
var VALID_SAMPLE_RATES = [22050, 8e3, 16e3, 24e3, 44100, 48e3];
/**
 * Clamp a requested sample rate to one Cartesia accepts.
 * Unsupported (or missing) values fall back to 8000 Hz.
 *
 * @param rate Requested sample rate in Hz
 * @returns A rate guaranteed to be in VALID_SAMPLE_RATES
 */
function toValidSampleRate(rate) {
  if (VALID_SAMPLE_RATES.includes(rate)) {
    return rate;
  }
  return 8e3;
}
|
|
2117
|
+
// Scoped pino logger for the ElevenLabs TTS provider.
var logger14 = pino({ name: "@voice-kit/core:tts:elevenlabs" });
// Fallback ElevenLabs voice id applied when config.voiceId is omitted.
var DEFAULT_VOICE_ID2 = "21m00Tcm4TlvDq8ikWAM";
// Amount of audio (milliseconds) buffered before flushing downstream,
// to smooth bursty packet delivery from the streaming endpoint.
var JITTER_BUFFER_MS = 100;
|
|
2120
|
+
var ElevenLabsTTSProvider = class {
  name = "elevenlabs";
  outputSampleRate = 24e3;
  // ElevenLabs default: 24kHz
  outputFormat = "pcm";
  // ElevenLabs SDK client, constructed once with the resolved API key.
  client;
  // Fully-resolved config: every field defaulted in the constructor.
  config;
  /**
   * Resolve the API key (config value or ELEVENLABS_API_KEY env var) and fill
   * remaining config fields with defaults.
   * @throws TTSConnectionError when no API key is available.
   */
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["ELEVENLABS_API_KEY"];
    if (!apiKey) throw new TTSConnectionError("elevenlabs", new Error("ELEVENLABS_API_KEY not set"));
    this.client = new ElevenLabsClient({ apiKey });
    this.config = {
      voiceId: config.voiceId ?? DEFAULT_VOICE_ID2,
      sampleRate: config.sampleRate ?? 24e3,
      speed: config.speed ?? 1,
      pitch: config.pitch ?? 0,
      apiKey,
      modelId: config.modelId ?? "eleven_turbo_v2_5",
      emotion: config.emotion ?? "",
      targetLanguage: config.targetLanguage ?? "en-IN"
    };
  }
  /**
   * Stream synthesized audio from ElevenLabs.
   * First chunk target: < 300ms. Uses streaming API endpoint.
   *
   * Chunks are accumulated in a jitter buffer and flushed once roughly
   * JITTER_BUFFER_MS worth of audio has arrived, smoothing burst packet
   * delivery without adding perceptible latency.
   *
   * Fixes vs. previous revision:
   * - Removed a `setTimeout` whose callback was empty: it never flushed
   *   anything (flushing only ever happened on the byte threshold) and only
   *   kept the event loop alive, along with its pointless clearTimeout
   *   bookkeeping.
   * - Replaced the per-chunk `reduce` over the whole buffer (accidental
   *   O(n²)) with a running byte counter.
   * - The flush threshold is now derived from JITTER_BUFFER_MS and the
   *   output format (24000 Hz × 2 bytes × 0.1 s = 4800 bytes, same value
   *   as the old magic constant).
   *
   * @param text Text to synthesize (should be a sentence boundary chunk)
   * @param config Per-call config overrides
   * @throws TTSVoiceNotFoundError when the API reports 404 for the voice
   * @throws TTSStreamError wrapping any other failure
   *
   * @example
   * ```ts
   * const tts = createTTS('elevenlabs', { voiceId: 'your-voice-id' })
   * for await (const chunk of tts.synthesizeStream('Hello, how can I help?')) {
   *   telephony.sendAudio(chunk)
   * }
   * ```
   */
  async *synthesizeStream(text, config) {
    const voiceId = config?.voiceId ?? this.config.voiceId;
    const modelId = config?.modelId ?? this.config.modelId;
    const startMs = Date.now();
    logger14.debug({ voiceId, modelId, chars: text.length }, "ElevenLabs TTS stream start");
    // 16-bit mono PCM: bytes/ms = sampleRate * 2 / 1000.
    const flushThresholdBytes = this.outputSampleRate * 2 * (JITTER_BUFFER_MS / 1e3);
    try {
      const audioStream = await this.client.generate({
        voice: voiceId,
        text,
        model_id: modelId,
        voice_settings: {
          stability: 0.5,
          similarity_boost: 0.8,
          speed: config?.speed ?? this.config.speed
        },
        output_format: "pcm_24000",
        stream: true
      });
      let firstChunk = true;
      let jitterBuffer = [];
      let bufferedBytes = 0;
      for await (const chunk of audioStream) {
        const buf = chunk instanceof Buffer ? chunk : Buffer.from(chunk);
        if (firstChunk) {
          firstChunk = false;
          logger14.debug({ ttfb: Date.now() - startMs, voiceId }, "ElevenLabs first audio chunk");
        }
        jitterBuffer.push(buf);
        bufferedBytes += buf.length;
        if (bufferedBytes >= flushThresholdBytes) {
          yield* jitterBuffer;
          jitterBuffer = [];
          bufferedBytes = 0;
        }
      }
      // Flush whatever is left after the upstream stream ends.
      yield* jitterBuffer;
      logger14.debug({ voiceId, totalMs: Date.now() - startMs }, "ElevenLabs TTS stream complete");
    } catch (err) {
      // Optional chaining: the rejection value may not be an object.
      if (err?.statusCode === 404) {
        throw new TTSVoiceNotFoundError("elevenlabs", voiceId);
      }
      throw new TTSStreamError("elevenlabs", err);
    }
  }
  /**
   * Synthesize full audio (for pre-caching greetings, IVR prompts).
   * Collects all streaming chunks into a single buffer.
   *
   * @param text Text to synthesize
   * @param config Per-call config overrides
   */
  async synthesizeFull(text, config) {
    const chunks = [];
    for await (const chunk of this.synthesizeStream(text, config)) {
      chunks.push(chunk);
    }
    return Buffer.concat(chunks);
  }
};
|
|
2235
|
+
// Scoped pino logger for the Sarvam TTS provider.
var logger15 = pino({ name: "@voice-kit/core:tts:sarvam" });
// Base URL for Sarvam AI's REST API.
var SARVAM_API_BASE2 = "https://api.sarvam.ai";
// Default speaker per Indic target language; the provider falls back to
// "meera" for languages not listed here.
var DEFAULT_VOICES = {
  "hi-IN": "meera",
  "kn-IN": "pavithra",
  "ta-IN": "pavithra",
  "te-IN": "pavithra",
  "mr-IN": "meera",
  "bn-IN": "meera",
  "gu-IN": "meera",
  "pa-IN": "meera"
};
|
|
2247
|
+
var SarvamTTSProvider = class {
  name = "sarvam";
  outputSampleRate = 22050;
  // Sarvam default
  outputFormat = "mp3";
  // NOTE(review): segments are forwarded exactly as returned (base64-decoded);
  // confirm the API's container format for this model really is MP3.
  // Pre-configured axios instance: auth header + JSON content type + 15s timeout.
  http;
  // Fully-resolved config: every field defaulted in the constructor.
  config;
  /**
   * Resolve the API key (config value or SARVAM_API_KEY env var), pick a
   * default speaker for the target language, and fill config defaults.
   * @throws TTSConnectionError when no API key is available.
   */
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["SARVAM_API_KEY"];
    if (!apiKey) throw new TTSConnectionError("sarvam", new Error("SARVAM_API_KEY not set"));
    const targetLanguage = config.targetLanguage ?? "hi-IN";
    const defaultVoice = DEFAULT_VOICES[targetLanguage] ?? "meera";
    this.http = axios.create({
      baseURL: SARVAM_API_BASE2,
      headers: {
        "API-Subscription-Key": apiKey,
        "Content-Type": "application/json"
      },
      timeout: 15e3
    });
    this.config = {
      voiceId: config.voiceId ?? defaultVoice,
      sampleRate: config.sampleRate ?? 22050,
      speed: config.speed ?? 1,
      pitch: config.pitch ?? 0,
      apiKey,
      modelId: config.modelId ?? "bulbul:v1",
      emotion: "",
      targetLanguage
    };
  }
  /**
   * Synthesize text in an Indic language and stream audio chunks.
   * Sarvam returns full audio segments — we chunk them for streaming compatibility.
   *
   * @param text Text to synthesize
   * @param config Per-call overrides (voiceId, modelId, pitch, speed, sampleRate, targetLanguage)
   * @throws TTSStreamError wrapping API errors (with status and body) or empty responses
   *
   * @example
   * ```ts
   * const tts = createTTS('sarvam', { targetLanguage: 'hi-IN' })
   * for await (const chunk of tts.synthesizeStream('नमस्ते, मैं आपकी कैसे मदद कर सकता हूँ?')) {
   *   telephony.sendAudio(chunk)
   * }
   * ```
   */
  async *synthesizeStream(text, config) {
    const startMs = Date.now();
    const targetLanguage = config?.targetLanguage ?? this.config.targetLanguage;
    const speaker = config?.voiceId ?? this.config.voiceId;
    logger15.debug({ targetLanguage, speaker, chars: text.length }, "Sarvam TTS request");
    try {
      const response = await this.http.post("/text-to-speech", {
        inputs: [text],
        target_language_code: targetLanguage,
        speaker,
        model: config?.modelId ?? this.config.modelId,
        pitch: config?.pitch ?? this.config.pitch,
        pace: config?.speed ?? this.config.speed,
        loudness: 1,
        speech_sample_rate: config?.sampleRate ?? this.config.sampleRate,
        enable_preprocessing: true
      });
      // Response carries base64-encoded audio segments, one per input.
      const audioSegments = response.data.audios;
      if (!audioSegments || audioSegments.length === 0) {
        throw new TTSStreamError("sarvam", new Error("No audio returned from Sarvam TTS"));
      }
      logger15.debug(
        { segments: audioSegments.length, latencyMs: Date.now() - startMs },
        "Sarvam TTS response received"
      );
      for (const segment of audioSegments) {
        const buf = Buffer.from(segment, "base64");
        // Re-slice the full segment into 4 KiB chunks so consumers see the
        // same streaming interface as the true-streaming providers.
        const CHUNK_SIZE = 4096;
        let offset = 0;
        while (offset < buf.length) {
          yield buf.subarray(offset, Math.min(offset + CHUNK_SIZE, buf.length));
          offset += CHUNK_SIZE;
        }
      }
    } catch (err) {
      if (axios.isAxiosError(err)) {
        // Surface the HTTP status and response body for easier debugging.
        throw new TTSStreamError(
          "sarvam",
          new Error(`Sarvam TTS API error: ${err.response?.status} ${JSON.stringify(err.response?.data)}`)
        );
      }
      throw new TTSStreamError("sarvam", err);
    }
  }
  /** Synthesize complete audio buffer by collecting all streamed chunks. */
  async synthesizeFull(text, config) {
    const chunks = [];
    for await (const chunk of this.synthesizeStream(text, config)) {
      chunks.push(chunk);
    }
    return Buffer.concat(chunks);
  }
};
|
|
2343
|
+
|
|
2344
|
+
// src/tts/TTS-factory.ts
/**
 * Instantiate a TTS provider by name.
 *
 * @param provider One of "elevenlabs" | "cartesia" | "sarvam"
 * @param config Optional provider config (defaults to an empty object)
 * @throws Error for an unrecognized provider name
 */
function createTTS(provider, config) {
  const cfg = config ?? {};
  const constructors = new Map([
    ["elevenlabs", ElevenLabsTTSProvider],
    ["cartesia", CartesiaTTSProvider],
    ["sarvam", SarvamTTSProvider]
  ]);
  const Ctor = constructors.get(provider);
  if (Ctor === undefined) {
    throw new Error(`Unknown TTS provider: ${String(provider)}`);
  }
  return new Ctor(cfg);
}
|
|
2360
|
+
|
|
2361
|
+
export { AgentError, AgentHandoffError, AssemblyAISTTProvider, AudioPipeline, AudioTransportError, CallAuditLog, CallConnectionError, CallMetrics, CallNotFoundError, CallingHoursError, CartesiaTTSProvider, ComplianceError, ConsentMissingError, DNCBlockedError, DeepgramSTTProvider, ElevenLabsTTSProvider, InngestError, LanguageSwitchDetector, STTConnectionError, STTError, STTLanguageNotSupportedError, STTStreamError, SarvamSTTProvider, SarvamTTSProvider, TRAICompliance, TTSConnectionError, TTSError, TTSStreamError, TTSVoiceNotFoundError, TelephonyError, TurnTransitionError, VADEngine, VoiceKitError, VoiceSDKTracer, WhisperSTTProvider, base64MulawToPcm, createAudioPipeline, createCallMemory, createResamplerStream, createSTT, createTTS, createVAD, isInglish, linearToMulaw, mulawBufferToPcm, mulawToLinear, pcmBufferToMulaw, pcmToBase64Mulaw, resample, resampleStream };
|
|
261
2362
|
//# sourceMappingURL=index.js.map
|
|
262
2363
|
//# sourceMappingURL=index.js.map
|