@voice-kit/core 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/audio.cjs +533 -0
- package/dist/audio.cjs.map +1 -0
- package/dist/audio.d.cts +260 -0
- package/dist/audio.d.ts +260 -0
- package/dist/audio.js +514 -0
- package/dist/audio.js.map +1 -0
- package/dist/compliance.d.cts +1 -1
- package/dist/compliance.d.ts +1 -1
- package/dist/errors.d.cts +4 -79
- package/dist/errors.d.ts +4 -79
- package/dist/index-D3KfRXMP.d.cts +319 -0
- package/dist/index-D3KfRXMP.d.ts +319 -0
- package/dist/index.cjs +280 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -319
- package/dist/index.d.ts +4 -319
- package/dist/index.js +259 -0
- package/dist/index.js.map +1 -1
- package/dist/memory.d.cts +1 -1
- package/dist/memory.d.ts +1 -1
- package/dist/observability.d.cts +1 -1
- package/dist/observability.d.ts +1 -1
- package/dist/stt.d.cts +1 -1
- package/dist/stt.d.ts +1 -1
- package/dist/telephony.errors-BQYr6-vl.d.cts +80 -0
- package/dist/telephony.errors-C0-nScrF.d.ts +80 -0
- package/dist/tts.d.cts +1 -1
- package/dist/tts.d.ts +1 -1
- package/package.json +9 -1
package/dist/audio.cjs
ADDED
|
@@ -0,0 +1,533 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
var ffmpeg = require('fluent-ffmpeg');
|
|
4
|
+
var stream = require('stream');
|
|
5
|
+
var pino = require('pino');
|
|
6
|
+
var events = require('events');
|
|
7
|
+
|
|
8
|
+
/**
 * Bundler interop helper: returns a value that can always be read via `.default`.
 *
 * @param e A required module (or any value).
 * @returns `e` itself when it is truthy and flagged `__esModule`; otherwise a
 *          wrapper object `{ default: e }`.
 */
function _interopDefault(e) {
  if (e && e.__esModule) {
    return e;
  }
  return { default: e };
}
|
|
9
|
+
|
|
10
|
+
var ffmpeg__default = /*#__PURE__*/_interopDefault(ffmpeg);
|
|
11
|
+
var pino__default = /*#__PURE__*/_interopDefault(pino);
|
|
12
|
+
|
|
13
|
+
// src/audio/codec/index.ts
|
|
14
|
+
// G.711 µ-law bias expressed on the 16-bit linear scale (0x84 = 132).
// The encoder's segment search (bit 14 down to bit 8) and its
// `exponent + 3` mantissa shift only work with this bias: it guarantees the
// biased magnitude always has bit 7 or higher set, so the 4 mantissa bits sit
// directly below the leading bit. The 14-bit bias (33) made segment 0 alias.
var MULAW_BIAS = 0x84;
// Upper bound for the biased magnitude (keeps it within 15 bits). Clamping
// here after adding the bias is equivalent to clipping the input at 32635.
var MULAW_MAX = 32767;

/**
 * Decode one 8-bit µ-law code into a signed linear PCM sample on the
 * 16-bit scale (range -32124..32124).
 *
 * @param sample µ-law byte (0-255); only the low 8 bits are used.
 * @returns Signed linear sample suitable for `writeInt16LE`.
 */
function mulawToLinear(sample) {
  // µ-law bytes are transmitted bit-inverted.
  sample = ~sample & 255;
  const sign = sample & 128;
  const exponent = (sample >> 4) & 7;
  const mantissa = sample & 15;
  // Rebuild the biased magnitude for this segment, then remove the bias.
  const magnitude = (((mantissa << 3) + MULAW_BIAS) << exponent) - MULAW_BIAS;
  // `0 - magnitude` (rather than unary minus) avoids returning -0 for 0x7F.
  return sign !== 0 ? 0 - magnitude : magnitude;
}

/**
 * Encode one signed 16-bit linear PCM sample as an 8-bit µ-law code.
 * Input outside -32768..32767 is clamped first.
 *
 * @param sample Linear PCM sample.
 * @returns µ-law byte (0-255); silence (0) encodes to 0xFF.
 */
function linearToMulaw(sample) {
  sample = Math.max(-32768, Math.min(32767, sample));
  const sign = sample < 0 ? 128 : 0;
  if (sample < 0) sample = -sample;
  sample += MULAW_BIAS;
  if (sample > MULAW_MAX) sample = MULAW_MAX;
  // Find the segment: exponent e means bit (e + 7) is the highest set bit.
  let exponent = 7;
  let expMask = 16384;
  for (; exponent > 0; exponent--) {
    if ((sample & expMask) !== 0) break;
    expMask >>= 1;
  }
  // The four bits directly below the segment's leading bit.
  const mantissa = (sample >> (exponent + 3)) & 15;
  // Codes are stored bit-inverted.
  return ~(sign | (exponent << 4) | mantissa) & 255;
}
|
|
41
|
+
/**
 * Expand a buffer of µ-law bytes into little-endian 16-bit PCM.
 *
 * @param buf Buffer holding one µ-law code per byte.
 * @returns New buffer of `buf.length * 2` bytes, one Int16LE sample per input byte.
 */
function mulawBufferToPcm(buf) {
  const out = Buffer.allocUnsafe(buf.length * 2);
  let offset = 0;
  for (let idx = 0; idx < buf.length; idx += 1) {
    const decoded = mulawToLinear(buf[idx] ?? 0);
    // writeInt16LE returns the offset just past the bytes it wrote.
    offset = out.writeInt16LE(decoded, offset);
  }
  return out;
}
|
|
49
|
+
/**
 * Compress little-endian 16-bit PCM into µ-law, one output byte per sample.
 * A trailing odd byte (half a sample) is ignored.
 *
 * @param buf Buffer of Int16LE samples.
 * @returns New buffer of `buf.length >> 1` µ-law bytes.
 */
function pcmBufferToMulaw(buf) {
  const sampleCount = buf.length >> 1;
  const encoded = Buffer.allocUnsafe(sampleCount);
  for (let idx = 0, offset = 0; idx < sampleCount; idx += 1, offset += 2) {
    encoded[idx] = linearToMulaw(buf.readInt16LE(offset));
  }
  return encoded;
}
|
|
58
|
+
/**
 * Decode a base64 string of µ-law bytes straight to 16-bit PCM.
 *
 * @param base64 Base64-encoded µ-law audio.
 * @returns PCM buffer produced by `mulawBufferToPcm`.
 */
function base64MulawToPcm(base64) {
  return mulawBufferToPcm(Buffer.from(base64, "base64"));
}
|
|
62
|
+
/**
 * Encode 16-bit PCM as µ-law and return it as a base64 string.
 *
 * @param pcm Buffer of Int16LE samples.
 * @returns Base64 text of the µ-law bytes produced by `pcmBufferToMulaw`.
 */
function pcmToBase64Mulaw(pcm) {
  const encoded = pcmBufferToMulaw(pcm);
  return encoded.toString("base64");
}
|
|
65
|
+
|
|
66
|
+
// src/errors/base.ts
|
|
67
|
+
/**
 * Base error type for the package. Carries a machine-readable `code` plus
 * optional call/provider context, a retry hint and a severity level.
 */
var VoiceKitError = class extends Error {
  code;
  callId;
  provider;
  retryable;
  severity;
  cause;
  /**
   * @param params Object with `message` and `code`, and optionally `callId`,
   *               `provider`, `retryable` (defaults to false), `severity`
   *               (defaults to "medium") and `cause`.
   */
  constructor(params) {
    super(params.message);
    const { code, callId, provider, retryable, severity, cause } = params;
    // Report the concrete subclass name rather than the generic "Error".
    this.name = this.constructor.name;
    this.code = code;
    this.callId = callId;
    this.provider = provider;
    this.retryable = retryable ?? false;
    this.severity = severity ?? "medium";
    this.cause = cause;
    // Re-pin the prototype to the class actually being constructed.
    Object.setPrototypeOf(this, new.target.prototype);
  }
  /** Plain-object form used by JSON.stringify; `cause` is not included. */
  toJSON() {
    const { name, code, message, callId, provider, retryable, severity } = this;
    return { name, code, message, callId, provider, retryable, severity };
  }
};
|
|
97
|
+
|
|
98
|
+
// src/errors/telephony.errors.ts
|
|
99
|
+
/**
 * Error raised by telephony code; extends VoiceKitError with the `to` and
 * `from` values copied from the constructor params.
 */
var TelephonyError = class extends VoiceKitError {
  to;
  from;
  /**
   * @param params VoiceKitError params, optionally with `to` and `from`.
   */
  constructor(params) {
    super(params);
    const { to, from } = params;
    this.to = to;
    this.from = from;
  }
};
|
|
108
|
+
/**
 * Telephony error with code "AUDIO_TRANSPORT_ERROR"; always marked
 * retryable with "high" severity.
 */
var AudioTransportError = class extends TelephonyError {
  /**
   * @param provider Name of the component that failed (used in the message).
   * @param cause    Underlying error, stored as `cause`.
   * @param callId   Optional call identifier.
   */
  constructor(provider, cause, callId) {
    const message = `Audio transport error on ${provider}`;
    super({
      code: "AUDIO_TRANSPORT_ERROR",
      message,
      provider,
      callId,
      retryable: true,
      severity: "high",
      cause
    });
  }
};
|
|
121
|
+
|
|
122
|
+
// src/audio/resampler/index.ts
|
|
123
|
+
/**
 * Resample a buffer of mono s16le PCM by piping it through ffmpeg.
 *
 * @param buf    PCM bytes to convert.
 * @param fromHz Sample rate of `buf`.
 * @param toHz   Desired sample rate.
 * @returns `buf` itself when the rates match; otherwise a promise for the
 *          concatenated ffmpeg output. Rejects with AudioTransportError
 *          ("ffmpeg-resampler") if ffmpeg or the output stream errors.
 */
async function resample(buf, fromHz, toHz) {
  if (fromHz === toHz) return buf;
  return new Promise((resolve, reject) => {
    // Both the output stream and the ffmpeg command report failures the same way.
    const fail = (err) => reject(new AudioTransportError("ffmpeg-resampler", err));
    const collected = [];
    // One-shot readable: emits the whole buffer, then end-of-stream.
    const source = new stream.Readable({
      read() {
        this.push(buf);
        this.push(null);
      }
    });
    const sink = new stream.PassThrough();
    sink.on("data", (piece) => {
      collected.push(piece);
    });
    sink.on("end", () => {
      resolve(Buffer.concat(collected));
    });
    sink.on("error", fail);
    const command = ffmpeg__default
      .default(source)
      .inputOptions(["-f s16le", `-ar ${fromHz}`, "-ac 1"])
      .outputOptions(["-f s16le", `-ar ${toHz}`, "-ac 1"])
      .on("error", fail);
    command.pipe(sink, { end: true });
  });
}
|
|
161
|
+
/**
 * Create the stream object for a streaming resampler.
 *
 * NOTE(review): as written this returns a plain PassThrough whether or not
 * the rates differ, so data written to it comes out unchanged — no sample
 * rate conversion is performed here. Confirm whether callers rely on that.
 *
 * @param fromHz Source sample rate (currently unused).
 * @param toHz   Target sample rate (currently unused).
 * @returns A new PassThrough stream.
 */
function createResamplerStream(fromHz, toHz) {
  return new stream.PassThrough();
}
|
|
168
|
+
/**
 * Resample an async stream of PCM buffers.
 *
 * When the rates match the input is forwarded untouched. Otherwise incoming
 * data is regrouped into 16000-byte slices, each slice is passed to
 * `resample`, and whatever is left once the input ends is resampled last.
 *
 * @param audio  Async iterable of PCM buffers.
 * @param fromHz Sample rate of the input.
 * @param toHz   Desired sample rate.
 * @yields Resampled PCM buffers, in input order.
 */
async function* resampleStream(audio, fromHz, toHz) {
  if (fromHz === toHz) {
    yield* audio;
    return;
  }
  const CHUNK_SIZE = 16e3;
  let backlog = Buffer.alloc(0);
  for await (const incoming of audio) {
    backlog = Buffer.concat([backlog, incoming]);
    // Drain every complete slice currently available.
    while (backlog.length >= CHUNK_SIZE) {
      const head = backlog.subarray(0, CHUNK_SIZE);
      backlog = backlog.subarray(CHUNK_SIZE);
      const converted = await resample(head, fromHz, toHz);
      yield converted;
    }
  }
  // Flush the final partial slice, if any.
  if (backlog.length > 0) {
    const converted = await resample(backlog, fromHz, toHz);
    yield converted;
  }
}
|
|
187
|
+
// Module-level pino logger; created with the name "@voice-kit/core:pipeline".
var logger = pino__default.default({ name: "@voice-kit/core:pipeline" });
|
|
188
|
+
/**
 * Per-provider audio settings: wire format and sample rate in each
 * direction, the sample rate handed to STT, and VAD tuning.
 * Every provider gets its own object instances (nothing is shared).
 */
var AUDIO_PROFILES = (() => {
  // VAD settings used by twilio, plivo, telnyx and sip.
  const standardVad = () => ({
    threshold: 0.6,
    positiveSpeechFrames: 3,
    negativeSpeechFrames: 5,
    debounceMs: 150,
    sampleRate: 16000
  });
  // 8 kHz µ-law in both directions, 16 kHz towards STT.
  const mulaw8k = (vadConfig) => ({
    inputFormat: "mulaw",
    inputSampleRate: 8000,
    sttSampleRate: 16000,
    outputSampleRate: 8000,
    outputFormat: "mulaw",
    vadConfig
  });
  return {
    twilio: mulaw8k(standardVad()),
    exotel: mulaw8k({
      // Exotel has slightly more background noise on IN PSTN
      threshold: 0.55,
      positiveSpeechFrames: 3,
      negativeSpeechFrames: 6,
      debounceMs: 200,
      sampleRate: 16000
    }),
    plivo: mulaw8k(standardVad()),
    telnyx: mulaw8k(standardVad()),
    livekit: {
      // LiveKit delivers decoded PCM via SDK — we handle 48kHz
      inputFormat: "opus",
      inputSampleRate: 48000,
      sttSampleRate: 16000,
      outputSampleRate: 48000,
      outputFormat: "opus",
      vadConfig: {
        // Higher quality audio = can lower threshold
        threshold: 0.5,
        positiveSpeechFrames: 2,
        negativeSpeechFrames: 4,
        debounceMs: 100,
        sampleRate: 16000
      }
    },
    sip: mulaw8k(standardVad())
  };
})();
|
|
277
|
+
/**
 * Converts audio between a telephony provider's wire format and the PCM
 * formats used for STT and TTS, using that provider's AUDIO_PROFILES entry.
 */
var AudioPipeline = class {
  profile;
  provider;
  /**
   * @param provider Key of AUDIO_PROFILES (e.g. "twilio", "livekit").
   * @throws {TypeError} If `provider` is not an own key of AUDIO_PROFILES.
   */
  constructor(provider) {
    // Fail fast with a descriptive message. Without this check an unknown
    // provider leaves `profile` undefined and the log call below crashes with
    // an unrelated "cannot read properties of undefined" error; the
    // own-property test also rejects inherited keys such as "constructor".
    if (!Object.prototype.hasOwnProperty.call(AUDIO_PROFILES, provider)) {
      throw new TypeError(
        `Unknown audio provider "${String(provider)}"; expected one of: ${Object.keys(AUDIO_PROFILES).join(", ")}`
      );
    }
    this.provider = provider;
    this.profile = AUDIO_PROFILES[provider];
    logger.debug(
      {
        provider,
        inputFormat: this.profile.inputFormat,
        inputSampleRate: this.profile.inputSampleRate,
        sttSampleRate: this.profile.sttSampleRate
      },
      "AudioPipeline initialized"
    );
  }
  /**
   * Transform incoming telephony audio to PCM at the profile's STT rate.
   * µ-law input is decoded first; any other input format is passed to the
   * resampler as-is.
   *
   * @param raw Async iterable of raw audio buffers from the provider
   * @returns Async iterable of PCM buffers for STT
   *
   * @internal
   */
  async *inboundForSTT(raw) {
    const decoded = this.profile.inputFormat === "mulaw" ? this.decodeMulaw(raw) : raw;
    yield* resampleStream(decoded, this.profile.inputSampleRate, this.profile.sttSampleRate);
  }
  /**
   * Transform TTS output PCM to the provider's outbound format.
   * Resamples to the profile's output rate, then µ-law encodes when the
   * profile's output format is "mulaw".
   *
   * @param ttsAudio Async iterable of raw PCM from the TTS provider
   * @param ttsSampleRate Sample rate of `ttsAudio`
   * @returns Async iterable of audio buffers ready to send to the provider
   *
   * @internal
   */
  async *outboundFromTTS(ttsAudio, ttsSampleRate) {
    const resampled = resampleStream(ttsAudio, ttsSampleRate, this.profile.outputSampleRate);
    if (this.profile.outputFormat === "mulaw") {
      for await (const chunk of resampled) {
        yield pcmBufferToMulaw(chunk);
      }
    } else {
      yield* resampled;
    }
  }
  /** VAD config from this provider's profile. @internal */
  get vadConfig() {
    return this.profile.vadConfig;
  }
  /** Sample rate of the audio produced by inboundForSTT. @internal */
  get sttSampleRate() {
    return this.profile.sttSampleRate;
  }
  /** Async generator: decode each µ-law chunk to PCM. @internal */
  async *decodeMulaw(raw) {
    for await (const chunk of raw) {
      yield mulawBufferToPcm(chunk);
    }
  }
};
|
|
350
|
+
/**
 * Factory wrapper around the AudioPipeline constructor.
 *
 * @param provider Provider key passed to `new AudioPipeline(...)`.
 * @returns The newly constructed AudioPipeline.
 */
function createAudioPipeline(provider) {
  const pipeline = new AudioPipeline(provider);
  return pipeline;
}
|
|
353
|
+
// Module-level pino logger; created with the name "@voice-kit/core:vad".
var logger2 = pino__default.default({ name: "@voice-kit/core:vad" });
|
|
354
|
+
// Samples per VAD frame (480 samples is 30 ms at the default 16000 Hz rate).
var FRAME_SIZE_SAMPLES = 480;
// Bytes per VAD frame: two bytes for every 16-bit sample.
var FRAME_SIZE_BYTES = 2 * FRAME_SIZE_SAMPLES;
// Baseline VAD settings; values passed to the VADEngine constructor override these.
var VAD_DEFAULTS = {
  threshold: 0.6,
  positiveSpeechFrames: 3,
  negativeSpeechFrames: 5,
  debounceMs: 150,
  sampleRate: 16000
};
|
|
363
|
+
/**
 * Voice-activity detector. Splits incoming PCM into fixed-size frames, scores
 * each frame with the loaded model and emits "frame" events of type
 * "speech_start", "speech" and "speech_end".
 */
var VADEngine = class extends events.EventEmitter {
  config;
  // Running state
  isSpeaking = false;
  positiveFrameCount = 0;
  negativeFrameCount = 0;
  // Handle of the pending speech_end timer, or null when none is pending.
  debounceTimer = null;
  // Bytes received but not yet long enough to form a full frame.
  frameBuffer = Buffer.alloc(0);
  // VAD model — loaded lazily by ensureModelLoaded()
  vadModel = null;
  /**
   * @param config Partial VAD settings; merged over VAD_DEFAULTS.
   */
  constructor(config) {
    super();
    this.config = { ...VAD_DEFAULTS, ...config };
  }
  /**
   * Process an async stream of PCM audio.
   * Input is re-framed into FRAME_SIZE_BYTES chunks before scoring. If the
   * stream ends mid-utterance a single final "speech_end" is emitted.
   *
   * @param audio Async iterable of PCM buffers (16-bit little-endian samples)
   * @throws {AudioTransportError} Wraps any failure; also emitted as "error".
   */
  async processStream(audio) {
    try {
      await this.ensureModelLoaded();
      for await (const chunk of audio) {
        this.frameBuffer = Buffer.concat([this.frameBuffer, chunk]);
        while (this.frameBuffer.length >= FRAME_SIZE_BYTES) {
          const frame = this.frameBuffer.subarray(0, FRAME_SIZE_BYTES);
          this.frameBuffer = this.frameBuffer.subarray(FRAME_SIZE_BYTES);
          await this.processFrame(frame);
        }
      }
      if (this.isSpeaking) {
        // Close the utterance exactly once: cancel any pending debounced
        // speech_end and reset the state so the timer cannot fire a duplicate.
        this.clearDebounce();
        this.isSpeaking = false;
        this.positiveFrameCount = 0;
        this.negativeFrameCount = 0;
        this.emitFrame("speech_end", 0, Buffer.alloc(0));
      }
    } catch (err) {
      const error = new AudioTransportError("vad", err);
      this.emit("error", error);
      throw error;
    }
  }
  /**
   * Score one frame and advance the speaking / not-speaking state machine.
   *
   * @param frame Buffer of FRAME_SIZE_BYTES bytes.
   * @internal
   */
  async processFrame(frame) {
    const confidence = await this.runVADInference(frame);
    const durationMs = FRAME_SIZE_SAMPLES / this.config.sampleRate * 1e3;
    if (confidence >= this.config.threshold) {
      this.positiveFrameCount++;
      this.negativeFrameCount = 0;
      if (this.isSpeaking) {
        // Speech resumed before the debounce elapsed: the utterance is not
        // over, so a pending speech_end must not fire.
        this.clearDebounce();
        this.emitFrame("speech", confidence, frame, durationMs);
      } else if (this.positiveFrameCount >= this.config.positiveSpeechFrames) {
        this.clearDebounce();
        this.isSpeaking = true;
        this.emitFrame("speech_start", confidence, frame, durationMs);
        logger2.debug({ confidence, frames: this.positiveFrameCount }, "VAD: speech_start");
      }
    } else {
      this.negativeFrameCount++;
      this.positiveFrameCount = 0;
      if (this.isSpeaking) {
        this.emitFrame("speech", confidence, frame, durationMs);
        // Arm the timer only once per silence run. Re-arming it on every
        // silent frame would keep pushing the deadline back, and speech_end
        // would never fire while frames keep arriving faster than debounceMs.
        if (this.negativeFrameCount >= this.config.negativeSpeechFrames && this.debounceTimer === null) {
          this.scheduleDebounce(() => {
            this.isSpeaking = false;
            this.negativeFrameCount = 0;
            this.emitFrame("speech_end", confidence, Buffer.alloc(0), 0);
            logger2.debug({ confidence }, "VAD: speech_end");
          });
        }
      }
    }
  }
  /**
   * Run the loaded model on a single frame.
   * Samples are read as Int16LE and scaled by 1/32768 before prediction.
   *
   * @param frame Buffer holding at least FRAME_SIZE_SAMPLES 16-bit samples.
   * @returns The model's confidence score.
   * @throws {Error} If no model has been loaded.
   * @internal
   */
  async runVADInference(frame) {
    if (!this.vadModel) throw new Error("VAD model not loaded");
    const samples = new Float32Array(FRAME_SIZE_SAMPLES);
    for (let i = 0; i < FRAME_SIZE_SAMPLES; i++) {
      samples[i] = frame.readInt16LE(i * 2) / 32768;
    }
    return this.vadModel.predict(samples);
  }
  /** Emit a "frame" event carrying `{ type, confidence, audioBuffer, durationMs }`. */
  emitFrame(type, confidence, audioBuffer, durationMs = 0) {
    const frame = { type, confidence, audioBuffer, durationMs };
    this.emit("frame", frame);
  }
  /** Run `fn` after `config.debounceMs`, replacing any pending timer. */
  scheduleDebounce(fn) {
    this.clearDebounce();
    this.debounceTimer = setTimeout(() => {
      // Mark the timer as spent before running the callback so that a later
      // silence run is able to arm a new one.
      this.debounceTimer = null;
      fn();
    }, this.config.debounceMs);
  }
  /** Cancel the pending debounce timer, if any. */
  clearDebounce() {
    if (this.debounceTimer !== null) {
      clearTimeout(this.debounceTimer);
      this.debounceTimer = null;
    }
  }
  /**
   * Load the VAD model if not already loaded. Falls back to EnergyBasedVAD
   * when importing or initialising '@ricky0123/vad-web' fails.
   * @internal
   */
  async ensureModelLoaded() {
    if (this.vadModel) return;
    logger2.debug("Loading Silero VAD model...");
    try {
      const { MicVAD } = await import('@ricky0123/vad-web');
      const vad = await MicVAD.new();
      this.vadModel = new SileroVADAdapter(vad);
      logger2.info("VAD model loaded successfully");
    } catch (err) {
      logger2.warn({ err }, "VAD model load failed \u2014 falling back to silence-based detection");
      this.vadModel = new EnergyBasedVAD();
    }
  }
  /** Clean up resources. Call when the call ends. */
  destroy() {
    this.clearDebounce();
    this.removeAllListeners();
    this.frameBuffer = Buffer.alloc(0);
    this.vadModel = null;
  }
};
|
|
491
|
+
/**
 * Fallback voice-activity scorer based purely on signal energy.
 */
var EnergyBasedVAD = class {
  // RMS level that maps to full confidence (1.0).
  energyThreshold = 0.01;
  /**
   * Score a frame by its RMS energy.
   *
   * @param samples Array-like of numeric samples (the engine passes values
   *                scaled to roughly -1..1).
   * @returns RMS divided by `energyThreshold`, capped at 1. An empty frame
   *          scores 0 (previously 0/0 produced NaN).
   */
  async predict(samples) {
    // No samples means no energy; also avoids dividing by zero below.
    if (samples.length === 0) return 0;
    let sumSq = 0;
    for (const s of samples) {
      sumSq += s * s;
    }
    const rms = Math.sqrt(sumSq / samples.length);
    return Math.min(1, rms / this.energyThreshold);
  }
};
|
|
502
|
+
/**
 * Factory wrapper around the VADEngine constructor.
 *
 * @param config Settings passed to `new VADEngine(...)`.
 * @returns The newly constructed VADEngine.
 */
function createVAD(config) {
  const engine = new VADEngine(config);
  return engine;
}
|
|
505
|
+
/**
 * Adapter that holds the loaded VAD object and exposes `predict`.
 *
 * NOTE(review): `predict` as written never calls `this.vad`; it scores the
 * frame from RMS energy only. Confirm whether model inference is still to
 * be wired in.
 */
var SileroVADAdapter = class {
  vad;
  /**
   * @param vad The loaded VAD object; stored on `this.vad`.
   */
  constructor(vad) {
    this.vad = vad;
  }
  /**
   * Score a frame by its RMS energy.
   *
   * @param samples Array-like of numeric samples.
   * @returns RMS multiplied by 10, capped at 1. An empty frame scores 0
   *          (previously 0/0 produced NaN).
   */
  async predict(samples) {
    // No samples means no energy; also avoids dividing by zero below.
    if (samples.length === 0) return 0;
    let sumSq = 0;
    for (const s of samples) {
      sumSq += s * s;
    }
    const rms = Math.sqrt(sumSq / samples.length);
    return Math.min(1, rms * 10);
  }
};
|
|
518
|
+
|
|
519
|
+
exports.AudioPipeline = AudioPipeline;
|
|
520
|
+
exports.VADEngine = VADEngine;
|
|
521
|
+
exports.base64MulawToPcm = base64MulawToPcm;
|
|
522
|
+
exports.createAudioPipeline = createAudioPipeline;
|
|
523
|
+
exports.createResamplerStream = createResamplerStream;
|
|
524
|
+
exports.createVAD = createVAD;
|
|
525
|
+
exports.linearToMulaw = linearToMulaw;
|
|
526
|
+
exports.mulawBufferToPcm = mulawBufferToPcm;
|
|
527
|
+
exports.mulawToLinear = mulawToLinear;
|
|
528
|
+
exports.pcmBufferToMulaw = pcmBufferToMulaw;
|
|
529
|
+
exports.pcmToBase64Mulaw = pcmToBase64Mulaw;
|
|
530
|
+
exports.resample = resample;
|
|
531
|
+
exports.resampleStream = resampleStream;
|
|
532
|
+
//# sourceMappingURL=audio.cjs.map
|
|
533
|
+
//# sourceMappingURL=audio.cjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/audio/codec/index.ts","../src/errors/base.ts","../src/errors/telephony.errors.ts","../src/audio/resampler/index.ts","../src/audio/piplines/index.ts","../src/audio/vad/index.ts"],"names":["Readable","PassThrough","ffmpeg","pino","logger","EventEmitter"],"mappings":";;;;;;;;;;;;;AAWA,IAAM,UAAA,GAAa,EAAA;AAGnB,IAAM,SAAA,GAAY,KAAA;AAQX,SAAS,cAAc,MAAA,EAAwB;AAElD,EAAA,MAAA,GAAS,CAAC,MAAA,GAAS,GAAA;AAEnB,EAAA,MAAM,OAAO,MAAA,GAAS,GAAA;AACtB,EAAA,MAAM,QAAA,GAAY,UAAU,CAAA,GAAK,CAAA;AACjC,EAAA,MAAM,WAAW,MAAA,GAAS,EAAA;AAE1B,EAAA,IAAI,MAAA,GAAA,CAAW,QAAA,IAAY,CAAA,IAAK,EAAA,IAAO,QAAA;AACvC,EAAA,MAAA,IAAU,EAAA;AAEV,EAAA,OAAO,IAAA,KAAS,CAAA,GAAI,CAAC,MAAA,GAAS,MAAA;AAClC;AAQO,SAAS,cAAc,MAAA,EAAwB;AAElD,EAAA,MAAA,GAAS,KAAK,GAAA,CAAI,MAAA,EAAQ,KAAK,GAAA,CAAI,KAAA,EAAO,MAAM,CAAC,CAAA;AAEjD,EAAA,MAAM,IAAA,GAAO,MAAA,GAAS,CAAA,GAAI,GAAA,GAAO,CAAA;AACjC,EAAA,IAAI,MAAA,GAAS,CAAA,EAAG,MAAA,GAAS,CAAC,MAAA;AAG1B,EAAA,MAAA,IAAU,UAAA;AAGV,EAAA,IAAI,MAAA,GAAS,WAAW,MAAA,GAAS,SAAA;AAGjC,EAAA,IAAI,QAAA,GAAW,CAAA;AACf,EAAA,IAAI,OAAA,GAAU,KAAA;AACd,EAAA,OAAO,QAAA,GAAW,GAAG,QAAA,EAAA,EAAY;AAC7B,IAAA,IAAA,CAAK,MAAA,GAAS,aAAa,CAAA,EAAG;AAC9B,IAAA,OAAA,KAAY,CAAA;AAAA,EAChB;AAEA,EAAA,MAAM,QAAA,GAAY,MAAA,IAAW,QAAA,GAAW,CAAA,GAAM,EAAA;AAC9C,EAAA,MAAM,KAAA,GAAQ,EAAE,IAAA,GAAQ,QAAA,IAAY,IAAK,QAAA,CAAA,GAAY,GAAA;AAErD,EAAA,OAAO,KAAA;AACX;AAWO,SAAS,iBAAiB,GAAA,EAAqB;AAClD,EAAA,MAAM,GAAA,GAAM,MAAA,CAAO,WAAA,CAAY,GAAA,CAAI,SAAS,CAAC,CAAA;AAC7C,EAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,GAAA,CAAI,QAAQ,CAAA,EAAA,EAAK;AACjC,IAAA,MAAM,MAAA,GAAS,aAAA,CAAc,GAAA,CAAI,CAAC,KAAK,CAAC,CAAA;AACxC,IAAA,GAAA,CAAI,YAAA,CAAa,MAAA,EAAQ,CAAA,GAAI,CAAC,CAAA;AAAA,EAClC;AACA,EAAA,OAAO,GAAA;AACX;AAWO,SAAS,iBAAiB,GAAA,EAAqB;AAClD,EAAA,MAAM,OAAA,GAAU,IAAI,MAAA,IAAU,CAAA;AAC9B,EAAA,MAAM,KAAA,GAAQ,MAAA,CAAO,WAAA,CAAY,OAAO,CAAA;AACxC,EAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,OAAA,EAAS,CAAA,EAAA,EAAK;AAC9B,IAAA,MAAM,MAAA,GAAS,GAAA,CAAI,WAAA,CAAY,CAAA,GAAI,CAAC,CAAA;AACpC,IAAA,KAAA,CAAM,CAAC,CAAA,GAAI
,aAAA,CAAc,MAAM,CAAA;AAAA,EACnC;AACA,EAAA,OAAO,KAAA;AACX;AAQO,SAAS,iBAAiB,MAAA,EAAwB;AACrD,EAAA,MAAM,KAAA,GAAQ,MAAA,CAAO,IAAA,CAAK,MAAA,EAAQ,QAAQ,CAAA;AAC1C,EAAA,OAAO,iBAAiB,KAAK,CAAA;AACjC;AAQO,SAAS,iBAAiB,GAAA,EAAqB;AAClD,EAAA,OAAO,gBAAA,CAAiB,GAAG,CAAA,CAAE,QAAA,CAAS,QAAQ,CAAA;AAClD;;;ACnGO,IAAM,aAAA,GAAN,cAA4B,KAAA,CAAM;AAAA,EAC5B,IAAA;AAAA,EACA,MAAA;AAAA,EACA,QAAA;AAAA,EACA,SAAA;AAAA,EACA,QAAA;AAAA,EACS,KAAA;AAAA,EAElB,YAAY,MAAA,EAQT;AACC,IAAA,KAAA,CAAM,OAAO,OAAO,CAAA;AACpB,IAAA,IAAA,CAAK,IAAA,GAAO,KAAK,WAAA,CAAY,IAAA;AAC7B,IAAA,IAAA,CAAK,OAAO,MAAA,CAAO,IAAA;AACnB,IAAA,IAAA,CAAK,SAAS,MAAA,CAAO,MAAA;AACrB,IAAA,IAAA,CAAK,WAAW,MAAA,CAAO,QAAA;AACvB,IAAA,IAAA,CAAK,SAAA,GAAY,OAAO,SAAA,IAAa,KAAA;AACrC,IAAA,IAAA,CAAK,QAAA,GAAW,OAAO,QAAA,IAAY,QAAA;AACnC,IAAA,IAAA,CAAK,QAAQ,MAAA,CAAO,KAAA;AAGpB,IAAA,MAAA,CAAO,cAAA,CAAe,IAAA,EAAM,GAAA,CAAA,MAAA,CAAW,SAAS,CAAA;AAAA,EACpD;AAAA,EAEA,MAAA,GAAS;AACL,IAAA,OAAO;AAAA,MACH,MAAM,IAAA,CAAK,IAAA;AAAA,MACX,MAAM,IAAA,CAAK,IAAA;AAAA,MACX,SAAS,IAAA,CAAK,OAAA;AAAA,MACd,QAAQ,IAAA,CAAK,MAAA;AAAA,MACb,UAAU,IAAA,CAAK,QAAA;AAAA,MACf,WAAW,IAAA,CAAK,SAAA;AAAA,MAChB,UAAU,IAAA,CAAK;AAAA,KACnB;AAAA,EACJ;AACJ,CAAA;;;AC1DO,IAAM,cAAA,GAAN,cAA6B,aAAA,CAAc;AAAA,EACrC,EAAA;AAAA,EACA,IAAA;AAAA,EAET,YAAY,MAAA,EAUT;AACC,IAAA,KAAA,CAAM,MAAM,CAAA;AACZ,IAAA,IAAA,CAAK,KAAK,MAAA,CAAO,EAAA;AACjB,IAAA,IAAA,CAAK,OAAO,MAAA,CAAO,IAAA;AAAA,EACvB;AACJ,CAAA;AA6BO,IAAM,mBAAA,GAAN,cAAkC,cAAA,CAAe;AAAA,EACpD,WAAA,CAAY,QAAA,EAAkB,KAAA,EAAiB,MAAA,EAAiB;AAC5D,IAAA,KAAA,CAAM;AAAA,MACF,IAAA,EAAM,uBAAA;AAAA,MACN,OAAA,EAAS,4BAA4B,QAAQ,CAAA,CAAA;AAAA,MAC7C,QAAA;AAAA,MACA,MAAA;AAAA,MACA,SAAA,EAAW,IAAA;AAAA,MACX,QAAA,EAAU,MAAA;AAAA,MACV;AAAA,KACH,CAAA;AAAA,EACL;AACJ,CAAA;;;ACzCA,eAAsB,QAAA,CAClB,GAAA,EACA,MAAA,EACA,IAAA,EACe;AAEf,EAAA,IAAI,MAAA,KAAW,MAAM,OAAO,GAAA;AAE5B,EAAA,OAAO,IAAI,OAAA,CAAgB,CAAC,OAAA,EAAS,MAAA,KAAW;AAC5C,IAAA,MAAM,SAAmB,EAAC;AAE1B,IAAA,MAAM,KAAA,GAAQ,IAAIA,eAAA,CAAS;AAAA,MACvB,IAAA,GAAO;AACH,QAAA,IAAA,CAAK,KAAK,GAAG,CAAA;AACb,QAAA,IAAA,CAAK,KAAK,IAAI,CAAA;AAAA,
MAClB;AAAA,KACH,CAAA;AAED,IAAA,MAAM,MAAA,GAAS,IAAIC,kBAAA,EAAY;AAE/B,IAAA,MAAA,CAAO,GAAG,MAAA,EAAQ,CAAC,UAAkB,MAAA,CAAO,IAAA,CAAK,KAAK,CAAC,CAAA;AACvD,IAAA,MAAA,CAAO,EAAA,CAAG,OAAO,MAAM,OAAA,CAAQ,OAAO,MAAA,CAAO,MAAM,CAAC,CAAC,CAAA;AACrD,IAAA,MAAA,CAAO,EAAA;AAAA,MAAG,OAAA;AAAA,MAAS,CAAC,GAAA,KAChB,MAAA;AAAA,QACI,IAAI,mBAAA;AAAA,UACA,kBAAA;AAAA,UACA;AAAA;AACJ;AACJ,KACJ;AAEA,IAAAC,uBAAA,CAAO,KAAK,EACP,YAAA,CAAa;AAAA,MACV,UAAA;AAAA,MACA,OAAO,MAAM,CAAA,CAAA;AAAA,MACb;AAAA,KACH,EACA,aAAA,CAAc;AAAA,MACX,UAAA;AAAA,MACA,OAAO,IAAI,CAAA,CAAA;AAAA,MACX;AAAA,KACH,CAAA,CACA,EAAA;AAAA,MAAG,OAAA;AAAA,MAAS,CAAC,GAAA,KACV,MAAA;AAAA,QACI,IAAI,mBAAA,CAAoB,kBAAA,EAAoB,GAAG;AAAA;AACnD,MAEH,IAAA,CAAK,MAAA,EAAQ,EAAE,GAAA,EAAK,MAAM,CAAA;AAAA,EACnC,CAAC,CAAA;AACL;AAYO,SAAS,qBAAA,CACZ,QACA,IAAA,EACW;AACX,EAAA,MAAM,MAAA,GAAS,IAAID,kBAAA,EAAY;AAE/B,EAAA,IAAI,WAAW,IAAA,EAAM;AAEjB,IAAA,OAAO,MAAA;AAAA,EACX;AAIA,EAAA,OAAO,MAAA;AACX;AAYA,gBAAuB,cAAA,CACnB,KAAA,EACA,MAAA,EACA,IAAA,EACqB;AACrB,EAAA,IAAI,WAAW,IAAA,EAAM;AAEjB,IAAA,OAAO,KAAA;AACP,IAAA;AAAA,EACJ;AAGA,EAAA,MAAM,UAAA,GAAa,IAAA;AACnB,EAAA,IAAI,OAAA,GAAU,MAAA,CAAO,KAAA,CAAM,CAAC,CAAA;AAE5B,EAAA,WAAA,MAAiB,SAAS,KAAA,EAAO;AAC7B,IAAA,OAAA,GAAU,MAAA,CAAO,MAAA,CAAO,CAAC,OAAA,EAAS,KAAK,CAAC,CAAA;AAExC,IAAA,OAAO,OAAA,CAAQ,UAAU,UAAA,EAAY;AACjC,MAAA,MAAM,KAAA,GAAQ,OAAA,CAAQ,QAAA,CAAS,CAAA,EAAG,UAAU,CAAA;AAC5C,MAAA,OAAA,GAAU,OAAA,CAAQ,SAAS,UAAU,CAAA;AACrC,MAAA,MAAM,MAAM,QAAA,CAAS,KAAA,EAAO,MAAA,EAAQ,IAAI,CAAA;AAAA,IAC5C;AAAA,EACJ;AAGA,EAAA,IAAI,OAAA,CAAQ,SAAS,CAAA,EAAG;AACpB,IAAA,MAAM,MAAM,QAAA,CAAS,OAAA,EAAS,MAAA,EAAQ,IAAI,CAAA;AAAA,EAC9C;AACJ;AC1HA,IAAM,MAAA,GAASE,qBAAA,CAAK,EAAE,IAAA,EAAM,4BAA4B,CAAA;AA4BxD,IAAM,cAAA,GAA8D;AAAA,EAChE,MAAA,EAAQ;AAAA,IACJ,WAAA,EAAa,OAAA;AAAA,IACb,eAAA,EAAiB,GAAA;AAAA,IACjB,aAAA,EAAe,IAAA;AAAA,IACf,gBAAA,EAAkB,GAAA;AAAA,IAClB,YAAA,EAAc,OAAA;AAAA,IACd,SAAA,EAAW;AAAA,MACP,SAAA,EAAW,GAAA;AAAA,MACX,oBAAA,EAAsB,CAAA;AAAA,MACtB,oBAAA,EAAsB,CAAA;AAAA,MACtB,UAAA,EAAY,GAAA;AAAA,MACZ,UAAA,EAAY;AAAA;AAChB,GACJ;AAAA,EACA,MAAA,EAAQ
;AAAA,IACJ,WAAA,EAAa,OAAA;AAAA,IACb,eAAA,EAAiB,GAAA;AAAA,IACjB,aAAA,EAAe,IAAA;AAAA,IACf,gBAAA,EAAkB,GAAA;AAAA,IAClB,YAAA,EAAc,OAAA;AAAA,IACd,SAAA,EAAW;AAAA,MACP,SAAA,EAAW,IAAA;AAAA;AAAA,MACX,oBAAA,EAAsB,CAAA;AAAA,MACtB,oBAAA,EAAsB,CAAA;AAAA,MACtB,UAAA,EAAY,GAAA;AAAA,MACZ,UAAA,EAAY;AAAA;AAChB,GACJ;AAAA,EACA,KAAA,EAAO;AAAA,IACH,WAAA,EAAa,OAAA;AAAA,IACb,eAAA,EAAiB,GAAA;AAAA,IACjB,aAAA,EAAe,IAAA;AAAA,IACf,gBAAA,EAAkB,GAAA;AAAA,IAClB,YAAA,EAAc,OAAA;AAAA,IACd,SAAA,EAAW;AAAA,MACP,SAAA,EAAW,GAAA;AAAA,MACX,oBAAA,EAAsB,CAAA;AAAA,MACtB,oBAAA,EAAsB,CAAA;AAAA,MACtB,UAAA,EAAY,GAAA;AAAA,MACZ,UAAA,EAAY;AAAA;AAChB,GACJ;AAAA,EACA,MAAA,EAAQ;AAAA,IACJ,WAAA,EAAa,OAAA;AAAA,IACb,eAAA,EAAiB,GAAA;AAAA,IACjB,aAAA,EAAe,IAAA;AAAA,IACf,gBAAA,EAAkB,GAAA;AAAA,IAClB,YAAA,EAAc,OAAA;AAAA,IACd,SAAA,EAAW;AAAA,MACP,SAAA,EAAW,GAAA;AAAA,MACX,oBAAA,EAAsB,CAAA;AAAA,MACtB,oBAAA,EAAsB,CAAA;AAAA,MACtB,UAAA,EAAY,GAAA;AAAA,MACZ,UAAA,EAAY;AAAA;AAChB,GACJ;AAAA,EACA,OAAA,EAAS;AAAA,IACL,WAAA,EAAa,MAAA;AAAA;AAAA,IACb,eAAA,EAAiB,IAAA;AAAA,IACjB,aAAA,EAAe,IAAA;AAAA,IACf,gBAAA,EAAkB,IAAA;AAAA,IAClB,YAAA,EAAc,MAAA;AAAA,IACd,SAAA,EAAW;AAAA,MACP,SAAA,EAAW,GAAA;AAAA;AAAA,MACX,oBAAA,EAAsB,CAAA;AAAA,MACtB,oBAAA,EAAsB,CAAA;AAAA,MACtB,UAAA,EAAY,GAAA;AAAA,MACZ,UAAA,EAAY;AAAA;AAChB,GACJ;AAAA,EACA,GAAA,EAAK;AAAA,IACD,WAAA,EAAa,OAAA;AAAA,IACb,eAAA,EAAiB,GAAA;AAAA,IACjB,aAAA,EAAe,IAAA;AAAA,IACf,gBAAA,EAAkB,GAAA;AAAA,IAClB,YAAA,EAAc,OAAA;AAAA,IACd,SAAA,EAAW;AAAA,MACP,SAAA,EAAW,GAAA;AAAA,MACX,oBAAA,EAAsB,CAAA;AAAA,MACtB,oBAAA,EAAsB,CAAA;AAAA,MACtB,UAAA,EAAY,GAAA;AAAA,MACZ,UAAA,EAAY;AAAA;AAChB;AAER,CAAA;AAUO,IAAM,gBAAN,MAAoB;AAAA,EACN,OAAA;AAAA,EACR,QAAA;AAAA,EAET,YAAY,QAAA,EAAiC;AACzC,IAAA,IAAA,CAAK,QAAA,GAAW,QAAA;AAChB,IAAA,IAAA,CAAK,OAAA,GAAU,eAAe,QAAQ,CAAA;AAEtC,IAAA,MAAA,CAAO,KAAA;AAAA,MACH;AAAA,QACI,QAAA;AAAA,QACA,WAAA,EAAa,KAAK,OAAA,CAAQ,WAAA;AAAA,QAC1B,eAAA,EAAiB,KAAK,OAAA,CAAQ,eAAA;AAAA,QAC9B,aAAA,EAAe,KAAK,OAAA,CAAQ;AAAA,OAChC;AAAA,MACA;AAAA,KACJ;AAAA,EACJ;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAWA,OAAO,cAAc,GAAA
,EAAmD;AACpE,IAAA,IAAI,OAAA;AAEJ,IAAA,IAAI,IAAA,CAAK,OAAA,CAAQ,WAAA,KAAgB,OAAA,EAAS;AAEtC,MAAA,OAAA,GAAU,IAAA,CAAK,YAAY,GAAG,CAAA;AAAA,IAClC,CAAA,MAAO;AAEH,MAAA,OAAA,GAAU,GAAA;AAAA,IACd;AAGA,IAAA,OAAO,eAAe,OAAA,EAAS,IAAA,CAAK,QAAQ,eAAA,EAAiB,IAAA,CAAK,QAAQ,aAAa,CAAA;AAAA,EAC3F;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAYA,OAAO,eAAA,CACH,QAAA,EACA,aAAA,EACqB;AAErB,IAAA,MAAM,SAAA,GAAY,cAAA;AAAA,MACd,QAAA;AAAA,MACA,aAAA;AAAA,MACA,KAAK,OAAA,CAAQ;AAAA,KACjB;AAEA,IAAA,IAAI,IAAA,CAAK,OAAA,CAAQ,YAAA,KAAiB,OAAA,EAAS;AAEvC,MAAA,WAAA,MAAiB,SAAS,SAAA,EAAW;AACjC,QAAA,MAAM,iBAAiB,KAAK,CAAA;AAAA,MAChC;AAAA,IACJ,CAAA,MAAO;AAEH,MAAA,OAAO,SAAA;AAAA,IACX;AAAA,EACJ;AAAA;AAAA,EAGA,IAAI,SAAA,GAAiC;AACjC,IAAA,OAAO,KAAK,OAAA,CAAQ,SAAA;AAAA,EACxB;AAAA;AAAA,EAGA,IAAI,aAAA,GAAwB;AACxB,IAAA,OAAO,KAAK,OAAA,CAAQ,aAAA;AAAA,EACxB;AAAA;AAAA,EAGA,OAAe,YAAY,GAAA,EAAmD;AAC1E,IAAA,WAAA,MAAiB,SAAS,GAAA,EAAK;AAC3B,MAAA,MAAM,iBAAiB,KAAK,CAAA;AAAA,IAChC;AAAA,EACJ;AACJ;AAOO,SAAS,oBAAoB,QAAA,EAAgD;AAChF,EAAA,OAAO,IAAI,cAAc,QAAQ,CAAA;AACrC;AC9NA,IAAMC,OAAAA,GAASD,qBAAAA,CAAK,EAAE,IAAA,EAAM,uBAAuB,CAAA;AAGnD,IAAM,kBAAA,GAAqB,GAAA;AAC3B,IAAM,mBAAmB,kBAAA,GAAqB,CAAA;AAG9C,IAAM,YAAA,GAAoC;AAAA,EACtC,SAAA,EAAW,GAAA;AAAA,EACX,oBAAA,EAAsB,CAAA;AAAA,EACtB,oBAAA,EAAsB,CAAA;AAAA,EACtB,UAAA,EAAY,GAAA;AAAA,EACZ,UAAA,EAAY;AAChB,CAAA;AAcO,IAAM,SAAA,GAAN,cAAwBE,mBAAA,CAA0B;AAAA,EACpC,MAAA;AAAA;AAAA,EAGT,UAAA,GAAa,KAAA;AAAA,EACb,kBAAA,GAAqB,CAAA;AAAA,EACrB,kBAAA,GAAqB,CAAA;AAAA,EACrB,aAAA,GAAsD,IAAA;AAAA,EACtD,WAAA,GAAc,MAAA,CAAO,KAAA,CAAM,CAAC,CAAA;AAAA;AAAA,EAG5B,QAAA,GAAkC,IAAA;AAAA,EAE1C,YAAY,MAAA,EAAoB;AAC5B,IAAA,KAAA,EAAM;AACN,IAAA,IAAA,CAAK,MAAA,GAAS,EAAE,GAAG,YAAA,EAAc,GAAG,MAAA,EAAO;AAAA,EAC/C;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAQA,MAAM,cAAc,KAAA,EAA6C;AAC7D,IAAA,IAAI;AACA,MAAA,MAAM,KAAK,iBAAA,EAAkB;AAE7B,MAAA,WAAA,MAAiB,SAAS,KAAA,EAAO;AAC7B,QAAA,IAAA,CAAK,cAAc,MAAA,CAAO,MAAA,CAAO,CAAC,IAAA,CAAK,WAAA,EAAa,KAAK,CAAC,CAAA;AAE1D,QAAA,OAAO,IAAA,CAAK,WAAA,CAAY,MAAA,IAAU,gBAAA,EAAkB;AAChD,UAAA,MAAM,KAAA,GAA
Q,IAAA,CAAK,WAAA,CAAY,QAAA,CAAS,GAAG,gBAAgB,CAAA;AAC3D,UAAA,IAAA,CAAK,WAAA,GAAc,IAAA,CAAK,WAAA,CAAY,QAAA,CAAS,gBAAgB,CAAA;AAC7D,UAAA,MAAM,IAAA,CAAK,aAAa,KAAK,CAAA;AAAA,QACjC;AAAA,MACJ;AAGA,MAAA,IAAI,KAAK,UAAA,EAAY;AACjB,QAAA,IAAA,CAAK,UAAU,YAAA,EAAc,CAAA,EAAG,MAAA,CAAO,KAAA,CAAM,CAAC,CAAC,CAAA;AAAA,MACnD;AAAA,IACJ,SAAS,GAAA,EAAK;AACV,MAAA,MAAM,KAAA,GAAQ,IAAI,mBAAA,CAAoB,KAAA,EAAO,GAAG,CAAA;AAChD,MAAA,IAAA,CAAK,IAAA,CAAK,SAAS,KAAK,CAAA;AACxB,MAAA,MAAM,KAAA;AAAA,IACV;AAAA,EACJ;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAOA,MAAc,aAAa,KAAA,EAA8B;AACrD,IAAA,MAAM,UAAA,GAAa,MAAM,IAAA,CAAK,eAAA,CAAgB,KAAK,CAAA;AACnD,IAAA,MAAM,UAAA,GAAc,kBAAA,GAAqB,IAAA,CAAK,MAAA,CAAO,UAAA,GAAc,GAAA;AAEnE,IAAA,IAAI,UAAA,IAAc,IAAA,CAAK,MAAA,CAAO,SAAA,EAAW;AACrC,MAAA,IAAA,CAAK,kBAAA,EAAA;AACL,MAAA,IAAA,CAAK,kBAAA,GAAqB,CAAA;AAE1B,MAAA,IAAI,KAAK,UAAA,EAAY;AAEjB,QAAA,IAAA,CAAK,SAAA,CAAU,QAAA,EAAU,UAAA,EAAY,KAAA,EAAO,UAAU,CAAA;AAAA,MAC1D,CAAA,MAAA,IAAW,IAAA,CAAK,kBAAA,IAAsB,IAAA,CAAK,OAAO,oBAAA,EAAsB;AAEpE,QAAA,IAAA,CAAK,aAAA,EAAc;AACnB,QAAA,IAAA,CAAK,UAAA,GAAa,IAAA;AAClB,QAAA,IAAA,CAAK,SAAA,CAAU,cAAA,EAAgB,UAAA,EAAY,KAAA,EAAO,UAAU,CAAA;AAE5D,QAAAD,OAAAA,CAAO,MAAM,EAAE,UAAA,EAAY,QAAQ,IAAA,CAAK,kBAAA,IAAsB,mBAAmB,CAAA;AAAA,MACrF;AAAA,IACJ,CAAA,MAAO;AACH,MAAA,IAAA,CAAK,kBAAA,EAAA;AACL,MAAA,IAAA,CAAK,kBAAA,GAAqB,CAAA;AAE1B,MAAA,IAAI,KAAK,UAAA,EAAY;AAEjB,QAAA,IAAA,CAAK,SAAA,CAAU,QAAA,EAAU,UAAA,EAAY,KAAA,EAAO,UAAU,CAAA;AAEtD,QAAA,IAAI,IAAA,CAAK,kBAAA,IAAsB,IAAA,CAAK,MAAA,CAAO,oBAAA,EAAsB;AAE7D,UAAA,IAAA,CAAK,iBAAiB,MAAM;AACxB,YAAA,IAAA,CAAK,UAAA,GAAa,KAAA;AAClB,YAAA,IAAA,CAAK,kBAAA,GAAqB,CAAA;AAC1B,YAAA,IAAA,CAAK,UAAU,YAAA,EAAc,UAAA,EAAY,OAAO,KAAA,CAAM,CAAC,GAAG,CAAC,CAAA;AAE3D,YAAAA,OAAAA,CAAO,KAAA,CAAM,EAAE,UAAA,IAAc,iBAAiB,CAAA;AAAA,UAClD,CAAC,CAAA;AAAA,QACL;AAAA,MACJ;AAAA,IACJ;AAAA,EACJ;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAQA,MAAc,gBAAgB,KAAA,EAAgC;AAC1D,IAAA,IAAI,CAAC,IAAA,CAAK,QAAA,EAAU,MAAM,IAAI,MAAM,sBAAsB,CAAA;AAG1D,IAAA,MAAM,OAAA,GAAU,IAAI,YAAA,CAAa,kBAAkB,CAAA;AACnD,IAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,C
AAA,GAAI,kBAAA,EAAoB,CAAA,EAAA,EAAK;AAEzC,MAAA,OAAA,CAAQ,CAAC,CAAA,GAAK,KAAA,CAAM,WAAA,CAAY,CAAA,GAAI,CAAC,CAAA,GAAI,KAAA;AAAA,IAC7C;AAEA,IAAA,OAAO,IAAA,CAAK,QAAA,CAAS,OAAA,CAAQ,OAAO,CAAA;AAAA,EACxC;AAAA,EAEQ,SAAA,CACJ,IAAA,EACA,UAAA,EACA,WAAA,EACA,aAAa,CAAA,EACT;AACJ,IAAA,MAAM,KAAA,GAAoB,EAAE,IAAA,EAAM,UAAA,EAAY,aAAa,UAAA,EAAW;AACtE,IAAA,IAAA,CAAK,IAAA,CAAK,SAAS,KAAK,CAAA;AAAA,EAC5B;AAAA,EAEQ,iBAAiB,EAAA,EAAsB;AAC3C,IAAA,IAAA,CAAK,aAAA,EAAc;AACnB,IAAA,IAAA,CAAK,aAAA,GAAgB,UAAA,CAAW,EAAA,EAAI,IAAA,CAAK,OAAO,UAAU,CAAA;AAAA,EAC9D;AAAA,EAEQ,aAAA,GAAsB;AAC1B,IAAA,IAAI,IAAA,CAAK,kBAAkB,IAAA,EAAM;AAC7B,MAAA,YAAA,CAAa,KAAK,aAAa,CAAA;AAC/B,MAAA,IAAA,CAAK,aAAA,GAAgB,IAAA;AAAA,IACzB;AAAA,EACJ;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAc,iBAAA,GAAmC;AAC7C,IAAA,IAAI,KAAK,QAAA,EAAU;AAEnB,IAAAA,OAAAA,CAAO,MAAM,6BAA6B,CAAA;AAE1C,IAAA,IAAI;AAEA,MAAA,MAAM,EAAE,MAAA,EAAO,GAAI,MAAM,OAAO,oBAAoB,CAAA;AACpD,MAAA,MAAM,GAAA,GAAM,MAAM,MAAA,CAAO,GAAA,EAAI;AAE7B,MAAA,IAAA,CAAK,QAAA,GAAW,IAAI,gBAAA,CAAiB,GAAG,CAAA;AAExC,MAAAA,OAAAA,CAAO,KAAK,+BAA+B,CAAA;AAAA,IAC/C,SAAS,GAAA,EAAK;AACV,MAAAA,OAAAA,CAAO,IAAA,CAAK,EAAE,GAAA,IAAO,sEAAiE,CAAA;AAEtF,MAAA,IAAA,CAAK,QAAA,GAAW,IAAI,cAAA,EAAe;AAAA,IACvC;AAAA,EACJ;AAAA;AAAA,EAGA,OAAA,GAAgB;AACZ,IAAA,IAAA,CAAK,aAAA,EAAc;AACnB,IAAA,IAAA,CAAK,kBAAA,EAAmB;AACxB,IAAA,IAAA,CAAK,WAAA,GAAc,MAAA,CAAO,KAAA,CAAM,CAAC,CAAA;AACjC,IAAA,IAAA,CAAK,QAAA,GAAW,IAAA;AAAA,EACpB;AACJ;AAeA,IAAM,iBAAN,MAA+C;AAAA,EAC1B,eAAA,GAAkB,IAAA;AAAA,EAEnC,MAAM,QAAQ,OAAA,EAAwC;AAElD,IAAA,IAAI,KAAA,GAAQ,CAAA;AACZ,IAAA,KAAA,MAAW,KAAK,OAAA,EAAS;AACrB,MAAA,KAAA,IAAS,CAAA,GAAI,CAAA;AAAA,IACjB;AACA,IAAA,MAAM,GAAA,GAAM,IAAA,CAAK,IAAA,CAAK,KAAA,GAAQ,QAAQ,MAAM,CAAA;AAE5C,IAAA,OAAO,IAAA,CAAK,GAAA,CAAI,CAAA,EAAK,GAAA,GAAM,KAAK,eAAe,CAAA;AAAA,EACnD;AACJ,CAAA;AAaO,SAAS,UAAU,MAAA,EAA+B;AACrD,EAAA,OAAO,IAAI,UAAU,MAAM,CAAA;AAC/B;AAEA,IAAM,mBAAN,MAAiD;AAAA,EAC7C,YAA6B,GAAA,EAAa;AAAb,IAAA,IAAA,CAAA,GAAA,GAAA,GAAA;AAAA,EAAe;AAAA,EAE5C,MAAM,QAAQ,OAAA,EAAwC;AAElD,IAAA,IAAI,KAAA,GAAQ,CAAA;AACZ,IAAA,KAAA,MAAW,KAAK,OAAA,EAA
S;AACrB,MAAA,KAAA,IAAS,CAAA,GAAI,CAAA;AAAA,IACjB;AAEA,IAAA,MAAM,GAAA,GAAM,IAAA,CAAK,IAAA,CAAK,KAAA,GAAQ,QAAQ,MAAM,CAAA;AAC5C,IAAA,OAAO,IAAA,CAAK,GAAA,CAAI,CAAA,EAAG,GAAA,GAAM,EAAE,CAAA;AAAA,EAC/B;AACJ,CAAA","file":"audio.cjs","sourcesContent":["/**\r\n * @voice-kit/core — G.711 µ-law codec\r\n *\r\n * Pure TypeScript implementation of G.711 µ-law (mu-law) encode/decode.\r\n * No external codec library needed for µ-law. This is 100% internal —\r\n * never exported from the public API.\r\n *\r\n * Used by AudioPipeline to convert Twilio/Exotel µ-law audio ↔ PCM.\r\n */\r\n\r\n/** µ-law bias constant (G.711 spec). */\r\nconst MULAW_BIAS = 33\r\n\r\n/** µ-law maximum value. */\r\nconst MULAW_MAX = 0x7fff\r\n\r\n/**\r\n * Convert a single µ-law encoded byte (0–255) to a 16-bit linear PCM sample.\r\n * Algorithm: ITU-T G.711 Section 3.\r\n *\r\n * @internal\r\n */\r\nexport function mulawToLinear(sample: number): number {\r\n // Invert all bits (µ-law is stored inverted per G.711 spec)\r\n sample = ~sample & 0xff\r\n\r\n const sign = sample & 0x80\r\n const exponent = (sample >> 4) & 0x07\r\n const mantissa = sample & 0x0f\r\n\r\n let linear = ((mantissa << 1) + 33) << exponent\r\n linear -= 33\r\n\r\n return sign !== 0 ? -linear : linear\r\n}\r\n\r\n/**\r\n * Convert a 16-bit linear PCM sample to a µ-law encoded byte.\r\n * Algorithm: ITU-T G.711 Section 3.\r\n *\r\n * @internal\r\n */\r\nexport function linearToMulaw(sample: number): number {\r\n // Clamp to 16-bit signed range\r\n sample = Math.max(-32768, Math.min(32767, sample))\r\n\r\n const sign = sample < 0 ? 
0x80 : 0x00\r\n if (sample < 0) sample = -sample\r\n\r\n // Add bias\r\n sample += MULAW_BIAS\r\n\r\n // Clamp to max\r\n if (sample > MULAW_MAX) sample = MULAW_MAX\r\n\r\n // Find exponent (position of highest bit above bias)\r\n let exponent = 7\r\n let expMask = 0x4000\r\n for (; exponent > 0; exponent--) {\r\n if ((sample & expMask) !== 0) break\r\n expMask >>= 1\r\n }\r\n\r\n const mantissa = (sample >> (exponent + 3)) & 0x0f\r\n const mulaw = ~(sign | (exponent << 4) | mantissa) & 0xff\r\n\r\n return mulaw\r\n}\r\n\r\n/**\r\n * Convert a Buffer of µ-law encoded bytes to 16-bit little-endian PCM.\r\n * Each µ-law byte expands to 2 PCM bytes (16-bit LE signed).\r\n *\r\n * Input: N bytes (µ-law, 8kHz mono as sent by Twilio/Exotel)\r\n * Output: N*2 bytes (PCM 16-bit LE, same sample rate)\r\n *\r\n * @internal\r\n */\r\nexport function mulawBufferToPcm(buf: Buffer): Buffer {\r\n const pcm = Buffer.allocUnsafe(buf.length * 2)\r\n for (let i = 0; i < buf.length; i++) {\r\n const sample = mulawToLinear(buf[i] ?? 0)\r\n pcm.writeInt16LE(sample, i * 2)\r\n }\r\n return pcm\r\n}\r\n\r\n/**\r\n * Convert a Buffer of 16-bit little-endian PCM to µ-law bytes.\r\n * Each pair of PCM bytes compresses to 1 µ-law byte.\r\n *\r\n * Input: N bytes (PCM 16-bit LE)\r\n * Output: N/2 bytes (µ-law)\r\n *\r\n * @internal\r\n */\r\nexport function pcmBufferToMulaw(buf: Buffer): Buffer {\r\n const samples = buf.length >> 1 // divide by 2\r\n const mulaw = Buffer.allocUnsafe(samples)\r\n for (let i = 0; i < samples; i++) {\r\n const sample = buf.readInt16LE(i * 2)\r\n mulaw[i] = linearToMulaw(sample)\r\n }\r\n return mulaw\r\n}\r\n\r\n/**\r\n * Convert a base64-encoded µ-law string (as sent by Twilio Media Streams)\r\n * directly to PCM Buffer. 
Convenience wrapper used in TwilioProvider.\r\n *\r\n * @internal\r\n */\r\nexport function base64MulawToPcm(base64: string): Buffer {\r\n const mulaw = Buffer.from(base64, 'base64')\r\n return mulawBufferToPcm(mulaw)\r\n}\r\n\r\n/**\r\n * Convert a PCM Buffer to a base64-encoded µ-law string (for sending\r\n * back to Twilio Media Streams).\r\n *\r\n * @internal\r\n */\r\nexport function pcmToBase64Mulaw(pcm: Buffer): string {\r\n return pcmBufferToMulaw(pcm).toString('base64')\r\n}","/**\r\n * @voice-kit/core — Typed error hierarchy\r\n *\r\n * All VoiceKit errors extend VoiceKitError. Never throw raw Error.\r\n * Every error carries: code, message, provider, callId, retryable, severity.\r\n */\r\n\r\nimport type { ErrorSeverity } from '../types'\r\n\r\n// ─── Base Error ───────────────────────────────────────────────────────────────\r\n\r\n/**\r\n * Base class for all VoiceKit errors. Provides structured context for\r\n * logging, alerting, and programmatic error handling.\r\n *\r\n * @example\r\n * ```ts\r\n * try {\r\n * await stt.transcribeBatch(audio)\r\n * } catch (err) {\r\n * if (err instanceof STTError) {\r\n * console.error(err.code, err.provider, err.retryable)\r\n * }\r\n * }\r\n * ```\r\n */\r\nexport class VoiceKitError extends Error {\r\n readonly code: string\r\n readonly callId?: string\r\n readonly provider?: string\r\n readonly retryable: boolean\r\n readonly severity: ErrorSeverity\r\n override readonly cause?: unknown\r\n\r\n constructor(params: {\r\n code: string\r\n message: string\r\n callId?: string\r\n provider?: string\r\n retryable?: boolean\r\n severity?: ErrorSeverity\r\n cause?: unknown\r\n }) {\r\n super(params.message)\r\n this.name = this.constructor.name\r\n this.code = params.code\r\n this.callId = params.callId\r\n this.provider = params.provider\r\n this.retryable = params.retryable ?? false\r\n this.severity = params.severity ?? 
'medium'\r\n this.cause = params.cause\r\n\r\n // Maintains proper prototype chain for `instanceof` in transpiled code\r\n Object.setPrototypeOf(this, new.target.prototype)\r\n }\r\n\r\n toJSON() {\r\n return {\r\n name: this.name,\r\n code: this.code,\r\n message: this.message,\r\n callId: this.callId,\r\n provider: this.provider,\r\n retryable: this.retryable,\r\n severity: this.severity,\r\n }\r\n }\r\n}","\r\n// ─── Telephony Errors ─────────────────────────────────────────────────────────\r\n\r\nimport type { ErrorSeverity } from \"../types\"\r\nimport { VoiceKitError } from \"./base\"\r\n\r\n/**\r\n * Errors from telephony providers.\r\n */\r\nexport class TelephonyError extends VoiceKitError {\r\n readonly to?: string\r\n readonly from?: string\r\n\r\n constructor(params: {\r\n code: string\r\n message: string\r\n callId?: string\r\n provider?: string\r\n retryable?: boolean\r\n severity?: ErrorSeverity\r\n cause?: unknown\r\n to?: string\r\n from?: string\r\n }) {\r\n super(params)\r\n this.to = params.to\r\n this.from = params.from\r\n }\r\n}\r\n\r\nexport class CallConnectionError extends TelephonyError {\r\n constructor(provider: string, to: string, cause?: unknown) {\r\n super({\r\n code: 'CALL_CONNECTION_FAILED',\r\n message: `Failed to connect call to ${to} via ${provider}`,\r\n provider,\r\n to,\r\n retryable: true,\r\n severity: 'high',\r\n cause,\r\n })\r\n }\r\n}\r\n\r\nexport class CallNotFoundError extends TelephonyError {\r\n constructor(callId: string, provider: string) {\r\n super({\r\n code: 'CALL_NOT_FOUND',\r\n message: `Call '${callId}' not found on ${provider}`,\r\n callId,\r\n provider,\r\n retryable: false,\r\n severity: 'low',\r\n })\r\n }\r\n}\r\n\r\nexport class AudioTransportError extends TelephonyError {\r\n constructor(provider: string, cause?: unknown, callId?: string) {\r\n super({\r\n code: 'AUDIO_TRANSPORT_ERROR',\r\n message: `Audio transport error on ${provider}`,\r\n provider,\r\n callId,\r\n retryable: true,\r\n severity: 
'high',\r\n cause,\r\n })\r\n }\r\n}\r\n","/**\r\n * @voice-kit/core — PCM audio resampler\r\n *\r\n * Resamples raw PCM audio between sample rates using fluent-ffmpeg.\r\n * 100% internal — never exported from the public API.\r\n * Used by AudioPipeline to convert provider-native rates to STT-required rates.\r\n */\r\n\r\nimport ffmpeg from 'fluent-ffmpeg'\r\nimport { Readable, PassThrough } from 'node:stream'\r\nimport { AudioTransportError } from '../../errors'\r\n\r\n/**\r\n * Resample a PCM Buffer from one sample rate to another.\r\n * Both input and output are signed 16-bit little-endian PCM, mono.\r\n *\r\n * Common conversions:\r\n * 8kHz → 16kHz (Twilio/Exotel µ-law decoded → Deepgram input)\r\n * 48kHz → 16kHz (LiveKit Opus decoded → Deepgram input)\r\n * 24kHz → 8kHz (ElevenLabs output → Twilio send)\r\n *\r\n * @param buf Raw PCM bytes (s16le mono)\r\n * @param fromHz Source sample rate in Hz\r\n * @param toHz Target sample rate in Hz\r\n * @returns Resampled PCM bytes (s16le mono)\r\n *\r\n * @internal\r\n */\r\nexport async function resample(\r\n buf: Buffer,\r\n fromHz: number,\r\n toHz: number\r\n): Promise<Buffer> {\r\n // Fast path: no-op if rates match\r\n if (fromHz === toHz) return buf\r\n\r\n return new Promise<Buffer>((resolve, reject) => {\r\n const chunks: Buffer[] = []\r\n\r\n const input = new Readable({\r\n read() {\r\n this.push(buf)\r\n this.push(null)\r\n },\r\n })\r\n\r\n const output = new PassThrough()\r\n\r\n output.on('data', (chunk: Buffer) => chunks.push(chunk))\r\n output.on('end', () => resolve(Buffer.concat(chunks)))\r\n output.on('error', (err) =>\r\n reject(\r\n new AudioTransportError(\r\n 'ffmpeg-resampler',\r\n err\r\n )\r\n )\r\n )\r\n\r\n ffmpeg(input)\r\n .inputOptions([\r\n '-f s16le',\r\n `-ar ${fromHz}`,\r\n '-ac 1',\r\n ])\r\n .outputOptions([\r\n '-f s16le',\r\n `-ar ${toHz}`,\r\n '-ac 1',\r\n ])\r\n .on('error', (err: Error) =>\r\n reject(\r\n new AudioTransportError('ffmpeg-resampler', err)\r\n )\r\n )\r\n 
.pipe(output, { end: true })\r\n })\r\n}\r\n\r\n/**\r\n * Create a streaming resampler Transform stream.\r\n * More efficient than buffering for large audio chunks.\r\n *\r\n * @param fromHz Source sample rate in Hz\r\n * @param toHz Target sample rate in Hz\r\n * @returns Node.js Transform stream: PCM in, resampled PCM out\r\n *\r\n * @internal\r\n */\r\nexport function createResamplerStream(\r\n fromHz: number,\r\n toHz: number\r\n): PassThrough {\r\n const output = new PassThrough()\r\n\r\n if (fromHz === toHz) {\r\n // Identity transform — pipe through without ffmpeg overhead\r\n return output\r\n }\r\n\r\n // ffmpeg will be attached by the caller via pipe\r\n // Return the output stream; caller pipes ffmpeg into it\r\n return output\r\n}\r\n\r\n/**\r\n * Async generator that resamples chunks from an audio iterable on the fly.\r\n * Used by AudioPipeline for realtime streaming paths.\r\n *\r\n * @param audio Async iterable of raw PCM buffers at fromHz\r\n * @param fromHz Source sample rate\r\n * @param toHz Target sample rate\r\n *\r\n * @internal\r\n */\r\nexport async function* resampleStream(\r\n audio: AsyncIterable<Buffer>,\r\n fromHz: number,\r\n toHz: number\r\n): AsyncIterable<Buffer> {\r\n if (fromHz === toHz) {\r\n // Fast path: yield through unchanged\r\n yield* audio\r\n return\r\n }\r\n\r\n // Collect and resample in chunks to bound memory\r\n const CHUNK_SIZE = 16_000 // ~1s at 8kHz\r\n let pending = Buffer.alloc(0)\r\n\r\n for await (const chunk of audio) {\r\n pending = Buffer.concat([pending, chunk])\r\n\r\n while (pending.length >= CHUNK_SIZE) {\r\n const slice = pending.subarray(0, CHUNK_SIZE)\r\n pending = pending.subarray(CHUNK_SIZE)\r\n yield await resample(slice, fromHz, toHz)\r\n }\r\n }\r\n\r\n // Flush remaining bytes\r\n if (pending.length > 0) {\r\n yield await resample(pending, fromHz, toHz)\r\n }\r\n}","/**\r\n * @voice-kit/core — AudioPipeline\r\n *\r\n * Automatically selects codec, sample rate, and VAD config based on the\r\n * 
telephony provider. Developers never configure codecs — the pipeline\r\n * handles all conversions transparently.\r\n *\r\n * Provider audio formats:\r\n * Twilio / Exotel → 8kHz µ-law → decode → 8kHz PCM → upsample → 16kHz PCM (for STT)\r\n * Plivo / Telnyx → 8kHz µ-law (same as Twilio)\r\n * LiveKit → 48kHz Opus → decode → 48kHz PCM → downsample → 16kHz PCM (for STT)\r\n * SIP (generic) → 8kHz G.711 (same as Twilio)\r\n *\r\n * TTS output path (reverse):\r\n * STT/LLM → TTS PCM (provider-native rate) → resample → telephony-native rate → encode\r\n */\r\n\r\nimport type { VADConfig } from '../../types'\r\nimport { mulawBufferToPcm, pcmBufferToMulaw } from '../codec'\r\nimport { resampleStream } from '../resampler'\r\nimport pino from 'pino'\r\n\r\nconst logger = pino({ name: '@voice-kit/core:pipeline' })\r\n\r\n/** Telephony providers handled by the pipeline. */\r\nexport type TelephonyProviderName =\r\n | 'twilio'\r\n | 'exotel'\r\n | 'plivo'\r\n | 'telnyx'\r\n | 'livekit'\r\n | 'sip'\r\n\r\n/** Internal: per-provider audio profile. */\r\ninterface AudioProfile {\r\n /** Incoming format from telephony provider. */\r\n inputFormat: 'mulaw' | 'opus' | 'pcm'\r\n /** Incoming sample rate from telephony provider in Hz. */\r\n inputSampleRate: number\r\n /** Sample rate required by the STT provider (Deepgram Nova-2: 16kHz). */\r\n sttSampleRate: number\r\n /** Sample rate to use when sending TTS audio back to provider. */\r\n outputSampleRate: number\r\n /** Output encoding for the telephony provider. */\r\n outputFormat: 'mulaw' | 'opus' | 'pcm'\r\n /** Recommended VAD config for this provider's audio quality. */\r\n vadConfig: Required<VADConfig>\r\n}\r\n\r\n/** Audio profiles per provider. 
@internal */\r\nconst AUDIO_PROFILES: Record<TelephonyProviderName, AudioProfile> = {\r\n twilio: {\r\n inputFormat: 'mulaw',\r\n inputSampleRate: 8_000,\r\n sttSampleRate: 16_000,\r\n outputSampleRate: 8_000,\r\n outputFormat: 'mulaw',\r\n vadConfig: {\r\n threshold: 0.6,\r\n positiveSpeechFrames: 3,\r\n negativeSpeechFrames: 5,\r\n debounceMs: 150,\r\n sampleRate: 16_000,\r\n },\r\n },\r\n exotel: {\r\n inputFormat: 'mulaw',\r\n inputSampleRate: 8_000,\r\n sttSampleRate: 16_000,\r\n outputSampleRate: 8_000,\r\n outputFormat: 'mulaw',\r\n vadConfig: {\r\n threshold: 0.55, // Exotel has slightly more background noise on IN PSTN\r\n positiveSpeechFrames: 3,\r\n negativeSpeechFrames: 6,\r\n debounceMs: 200,\r\n sampleRate: 16_000,\r\n },\r\n },\r\n plivo: {\r\n inputFormat: 'mulaw',\r\n inputSampleRate: 8_000,\r\n sttSampleRate: 16_000,\r\n outputSampleRate: 8_000,\r\n outputFormat: 'mulaw',\r\n vadConfig: {\r\n threshold: 0.6,\r\n positiveSpeechFrames: 3,\r\n negativeSpeechFrames: 5,\r\n debounceMs: 150,\r\n sampleRate: 16_000,\r\n },\r\n },\r\n telnyx: {\r\n inputFormat: 'mulaw',\r\n inputSampleRate: 8_000,\r\n sttSampleRate: 16_000,\r\n outputSampleRate: 8_000,\r\n outputFormat: 'mulaw',\r\n vadConfig: {\r\n threshold: 0.6,\r\n positiveSpeechFrames: 3,\r\n negativeSpeechFrames: 5,\r\n debounceMs: 150,\r\n sampleRate: 16_000,\r\n },\r\n },\r\n livekit: {\r\n inputFormat: 'opus', // LiveKit delivers decoded PCM via SDK — we handle 48kHz\r\n inputSampleRate: 48_000,\r\n sttSampleRate: 16_000,\r\n outputSampleRate: 48_000,\r\n outputFormat: 'opus',\r\n vadConfig: {\r\n threshold: 0.5, // Higher quality audio = can lower threshold\r\n positiveSpeechFrames: 2,\r\n negativeSpeechFrames: 4,\r\n debounceMs: 100,\r\n sampleRate: 16_000,\r\n },\r\n },\r\n sip: {\r\n inputFormat: 'mulaw',\r\n inputSampleRate: 8_000,\r\n sttSampleRate: 16_000,\r\n outputSampleRate: 8_000,\r\n outputFormat: 'mulaw',\r\n vadConfig: {\r\n threshold: 0.6,\r\n positiveSpeechFrames: 3,\r\n 
negativeSpeechFrames: 5,\r\n debounceMs: 150,\r\n sampleRate: 16_000,\r\n },\r\n },\r\n}\r\n\r\n/**\r\n * AudioPipeline: auto-wires codec → resample → VAD for a specific telephony provider.\r\n *\r\n * Developers never call this directly — it is instantiated by TelephonyProvider\r\n * implementations and consumed by VoiceAgent.\r\n *\r\n * @internal\r\n */\r\nexport class AudioPipeline {\r\n private readonly profile: AudioProfile\r\n readonly provider: TelephonyProviderName\r\n\r\n constructor(provider: TelephonyProviderName) {\r\n this.provider = provider\r\n this.profile = AUDIO_PROFILES[provider]\r\n\r\n logger.debug(\r\n {\r\n provider,\r\n inputFormat: this.profile.inputFormat,\r\n inputSampleRate: this.profile.inputSampleRate,\r\n sttSampleRate: this.profile.sttSampleRate,\r\n },\r\n 'AudioPipeline initialized'\r\n )\r\n }\r\n\r\n /**\r\n * Transform incoming telephony audio to 16kHz PCM for STT.\r\n * Handles µ-law decode + resampling automatically.\r\n *\r\n * @param raw Raw audio bytes as received from telephony provider\r\n * @returns Async iterable of 16kHz PCM buffers for STT\r\n *\r\n * @internal\r\n */\r\n async *inboundForSTT(raw: AsyncIterable<Buffer>): AsyncIterable<Buffer> {\r\n let decoded: AsyncIterable<Buffer>\r\n\r\n if (this.profile.inputFormat === 'mulaw') {\r\n // Decode µ-law to 8kHz PCM first\r\n decoded = this.decodeMulaw(raw)\r\n } else {\r\n // LiveKit: audio arrives as PCM after SDK decodes Opus\r\n decoded = raw\r\n }\r\n\r\n // Resample to 16kHz for Deepgram\r\n yield* resampleStream(decoded, this.profile.inputSampleRate, this.profile.sttSampleRate)\r\n }\r\n\r\n /**\r\n * Transform TTS output PCM to telephony-native format for sending to caller.\r\n * Handles resampling + µ-law encode automatically.\r\n *\r\n * @param ttsAudio Raw PCM from TTS provider (at TTS provider's native rate)\r\n * @param ttsSampleRate Native sample rate of the TTS provider\r\n * @returns Async iterable of audio bytes ready to send to telephony provider\r\n 
*\r\n * @internal\r\n */\r\n async *outboundFromTTS(\r\n ttsAudio: AsyncIterable<Buffer>,\r\n ttsSampleRate: number\r\n ): AsyncIterable<Buffer> {\r\n // Resample from TTS rate → telephony output rate\r\n const resampled = resampleStream(\r\n ttsAudio,\r\n ttsSampleRate,\r\n this.profile.outputSampleRate\r\n )\r\n\r\n if (this.profile.outputFormat === 'mulaw') {\r\n // Encode PCM → µ-law for Twilio/Exotel/Plivo/Telnyx/SIP\r\n for await (const chunk of resampled) {\r\n yield pcmBufferToMulaw(chunk)\r\n }\r\n } else {\r\n // LiveKit: emit raw PCM; SDK handles Opus encode\r\n yield* resampled\r\n }\r\n }\r\n\r\n /** Get the VAD config tuned for this provider's audio quality. @internal */\r\n get vadConfig(): Required<VADConfig> {\r\n return this.profile.vadConfig\r\n }\r\n\r\n /** Sample rate that STT expects (post-pipeline). @internal */\r\n get sttSampleRate(): number {\r\n return this.profile.sttSampleRate\r\n }\r\n\r\n /** Async generator: decode µ-law stream to PCM. @internal */\r\n private async *decodeMulaw(raw: AsyncIterable<Buffer>): AsyncIterable<Buffer> {\r\n for await (const chunk of raw) {\r\n yield mulawBufferToPcm(chunk)\r\n }\r\n }\r\n}\r\n\r\n/**\r\n * Factory: create an AudioPipeline pre-configured for the given telephony provider.\r\n *\r\n * @internal — used by TelephonyProvider implementations\r\n */\r\nexport function createAudioPipeline(provider: TelephonyProviderName): AudioPipeline {\r\n return new AudioPipeline(provider)\r\n}","/**\r\n * @voice-kit/core — Voice Activity Detection engine\r\n *\r\n * Wraps @ricky0123/vad-web and emits strongly-typed VoiceFrame events.\r\n * Developers subscribe to VoiceFrame events — they never touch the raw VAD API.\r\n *\r\n * @example\r\n * ```ts\r\n * const vad = createVAD({ threshold: 0.6 })\r\n * vad.on('frame', (frame) => {\r\n * if (frame.type === 'speech_start') startRecording()\r\n * if (frame.type === 'speech_end') stopRecording()\r\n * })\r\n * await vad.processStream(audioStream)\r\n * ```\r\n 
*/\r\n\r\nimport { EventEmitter } from 'node:events'\r\nimport type { VADConfig, VoiceFrame } from '../../types'\r\nimport { AudioTransportError } from '../../errors'\r\nimport pino from 'pino'\r\nimport type { MicVAD } from '@ricky0123/vad-web'\r\n\r\nconst logger = pino({ name: '@voice-kit/core:vad' })\r\n\r\n/** VAD frame size: 30ms at 16kHz = 480 samples = 960 bytes (s16le) */\r\nconst FRAME_SIZE_SAMPLES = 480\r\nconst FRAME_SIZE_BYTES = FRAME_SIZE_SAMPLES * 2 // 16-bit\r\n\r\n/** Default VAD configuration. */\r\nconst VAD_DEFAULTS: Required<VADConfig> = {\r\n threshold: 0.6,\r\n positiveSpeechFrames: 3,\r\n negativeSpeechFrames: 5,\r\n debounceMs: 150,\r\n sampleRate: 16_000,\r\n}\r\n\r\ntype VADEventMap = {\r\n frame: [VoiceFrame]\r\n error: [AudioTransportError]\r\n}\r\n\r\n/**\r\n * Internal VAD engine. Processes a 16kHz PCM stream and emits VoiceFrame events.\r\n * Automatically debounces rapid speech_start/speech_end transitions.\r\n *\r\n * Input: 16kHz, 16-bit little-endian PCM, mono.\r\n * Output: VoiceFrame events on the emitter.\r\n */\r\nexport class VADEngine extends EventEmitter<VADEventMap> {\r\n private readonly config: Required<VADConfig>\r\n\r\n // Running state\r\n private isSpeaking = false\r\n private positiveFrameCount = 0\r\n private negativeFrameCount = 0\r\n private debounceTimer: ReturnType<typeof setTimeout> | null = null\r\n private frameBuffer = Buffer.alloc(0)\r\n\r\n // Silero VAD model — loaded lazily\r\n private vadModel: SileroVADModel | null = null\r\n\r\n constructor(config?: VADConfig) {\r\n super()\r\n this.config = { ...VAD_DEFAULTS, ...config }\r\n }\r\n\r\n /**\r\n * Process an async stream of PCM audio frames.\r\n * Automatically frames the input into 30ms chunks for VAD processing.\r\n *\r\n * @param audio Async iterable of PCM buffers (16kHz, s16le, mono)\r\n */\r\n async processStream(audio: AsyncIterable<Buffer>): Promise<void> {\r\n try {\r\n await this.ensureModelLoaded()\r\n\r\n for await (const chunk of audio) 
{\r\n this.frameBuffer = Buffer.concat([this.frameBuffer, chunk])\r\n\r\n while (this.frameBuffer.length >= FRAME_SIZE_BYTES) {\r\n const frame = this.frameBuffer.subarray(0, FRAME_SIZE_BYTES)\r\n this.frameBuffer = this.frameBuffer.subarray(FRAME_SIZE_BYTES)\r\n await this.processFrame(frame)\r\n }\r\n }\r\n\r\n // Flush remaining if speech was active\r\n if (this.isSpeaking) {\r\n this.emitFrame('speech_end', 0, Buffer.alloc(0))\r\n }\r\n } catch (err) {\r\n const error = new AudioTransportError('vad', err)\r\n this.emit('error', error)\r\n throw error\r\n }\r\n }\r\n\r\n /**\r\n * Process a single 30ms PCM frame through the VAD model.\r\n *\r\n * @internal\r\n */\r\n private async processFrame(frame: Buffer): Promise<void> {\r\n const confidence = await this.runVADInference(frame)\r\n const durationMs = (FRAME_SIZE_SAMPLES / this.config.sampleRate) * 1000\r\n\r\n if (confidence >= this.config.threshold) {\r\n this.positiveFrameCount++\r\n this.negativeFrameCount = 0\r\n\r\n if (this.isSpeaking) {\r\n // Ongoing speech\r\n this.emitFrame('speech', confidence, frame, durationMs)\r\n } else if (this.positiveFrameCount >= this.config.positiveSpeechFrames) {\r\n // Transition to speaking\r\n this.clearDebounce()\r\n this.isSpeaking = true\r\n this.emitFrame('speech_start', confidence, frame, durationMs)\r\n\r\n logger.debug({ confidence, frames: this.positiveFrameCount }, 'VAD: speech_start')\r\n }\r\n } else {\r\n this.negativeFrameCount++\r\n this.positiveFrameCount = 0\r\n\r\n if (this.isSpeaking) {\r\n // Emit silence frame while still in grace period\r\n this.emitFrame('speech', confidence, frame, durationMs)\r\n\r\n if (this.negativeFrameCount >= this.config.negativeSpeechFrames) {\r\n // Debounce before committing to speech_end (prevents TTS audio bleed-through)\r\n this.scheduleDebounce(() => {\r\n this.isSpeaking = false\r\n this.negativeFrameCount = 0\r\n this.emitFrame('speech_end', confidence, Buffer.alloc(0), 0)\r\n\r\n logger.debug({ confidence }, 'VAD: 
speech_end')\r\n })\r\n }\r\n }\r\n }\r\n }\r\n\r\n /**\r\n * Run Silero VAD model inference on a single frame.\r\n * Returns confidence score 0–1.\r\n *\r\n * @internal\r\n */\r\n private async runVADInference(frame: Buffer): Promise<number> {\r\n if (!this.vadModel) throw new Error('VAD model not loaded')\r\n\r\n // Convert Buffer to Float32Array for Silero model\r\n const samples = new Float32Array(FRAME_SIZE_SAMPLES)\r\n for (let i = 0; i < FRAME_SIZE_SAMPLES; i++) {\r\n // Normalize s16le to -1.0..1.0\r\n samples[i] = (frame.readInt16LE(i * 2) / 32768.0)\r\n }\r\n\r\n return this.vadModel.predict(samples)\r\n }\r\n\r\n private emitFrame(\r\n type: VoiceFrame['type'],\r\n confidence: number,\r\n audioBuffer: Buffer,\r\n durationMs = 0\r\n ): void {\r\n const frame: VoiceFrame = { type, confidence, audioBuffer, durationMs }\r\n this.emit('frame', frame)\r\n }\r\n\r\n private scheduleDebounce(fn: () => void): void {\r\n this.clearDebounce()\r\n this.debounceTimer = setTimeout(fn, this.config.debounceMs)\r\n }\r\n\r\n private clearDebounce(): void {\r\n if (this.debounceTimer !== null) {\r\n clearTimeout(this.debounceTimer)\r\n this.debounceTimer = null\r\n }\r\n }\r\n\r\n /**\r\n * Load the Silero VAD model if not already loaded.\r\n * @internal\r\n */\r\n private async ensureModelLoaded(): Promise<void> {\r\n if (this.vadModel) return\r\n\r\n logger.debug('Loading Silero VAD model...')\r\n\r\n try {\r\n\r\n const { MicVAD } = await import('@ricky0123/vad-web')\r\n const vad = await MicVAD.new()\r\n\r\n this.vadModel = new SileroVADAdapter(vad)\r\n\r\n logger.info('VAD model loaded successfully')\r\n } catch (err) {\r\n logger.warn({ err }, 'VAD model load failed — falling back to silence-based detection')\r\n // Graceful fallback: use energy-based detection\r\n this.vadModel = new EnergyBasedVAD()\r\n }\r\n }\r\n\r\n /** Clean up resources. Call when the call ends. 
*/\r\n destroy(): void {\r\n this.clearDebounce()\r\n this.removeAllListeners()\r\n this.frameBuffer = Buffer.alloc(0)\r\n this.vadModel = null\r\n }\r\n}\r\n\r\n// ─── Internal model interface ─────────────────────────────────────────────────\r\n\r\n/** Common interface for VAD model backends. @internal */\r\ninterface SileroVADModel {\r\n predict(samples: Float32Array): Promise<number>\r\n}\r\n\r\n/**\r\n * Fallback energy-based VAD when Silero model fails to load.\r\n * Uses RMS energy threshold. Less accurate but zero dependencies.\r\n *\r\n * @internal\r\n */\r\nclass EnergyBasedVAD implements SileroVADModel {\r\n private readonly energyThreshold = 0.01\r\n\r\n async predict(samples: Float32Array): Promise<number> {\r\n // Compute RMS energy\r\n let sumSq = 0\r\n for (const s of samples) {\r\n sumSq += s * s\r\n }\r\n const rms = Math.sqrt(sumSq / samples.length)\r\n // Normalize to 0–1 confidence\r\n return Math.min(1.0, rms / this.energyThreshold)\r\n }\r\n}\r\n\r\n/**\r\n * Create a configured VAD engine instance.\r\n * Input must be 16kHz, 16-bit LE, mono PCM (handled automatically by AudioPipeline).\r\n *\r\n * @example\r\n * ```ts\r\n * const vad = createVAD({ threshold: 0.7, debounceMs: 200 })\r\n * vad.on('frame', (frame) => handleFrame(frame))\r\n * await vad.processStream(audioStream)\r\n * ```\r\n */\r\nexport function createVAD(config?: VADConfig): VADEngine {\r\n return new VADEngine(config)\r\n}\r\n\r\nclass SileroVADAdapter implements SileroVADModel {\r\n constructor(private readonly vad: MicVAD) { }\r\n\r\n async predict(samples: Float32Array): Promise<number> {\r\n // Since MicVAD doesn't expose predict(), use energy estimation\r\n let sumSq = 0\r\n for (const s of samples) {\r\n sumSq += s * s\r\n }\r\n\r\n const rms = Math.sqrt(sumSq / samples.length)\r\n return Math.min(1, rms * 10)\r\n }\r\n}"]}
|