@pompeii-labs/audio 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/dist/index-o4B-ThOL.d.mts +15 -0
- package/dist/index-o4B-ThOL.d.ts +15 -0
- package/dist/index.d.mts +44 -0
- package/dist/index.d.ts +44 -0
- package/dist/index.js +533 -0
- package/dist/index.mjs +522 -0
- package/dist/voice.d.mts +146 -0
- package/dist/voice.d.ts +146 -0
- package/dist/voice.js +756 -0
- package/dist/voice.mjs +739 -0
- package/package.json +57 -0
package/dist/voice.js
ADDED
|
@@ -0,0 +1,756 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
var sdk = require('@deepgram/sdk');
|
|
4
|
+
var elevenlabsJs = require('@elevenlabs/elevenlabs-js');
|
|
5
|
+
var hume = require('hume');
|
|
6
|
+
var OpenAI = require('openai');
|
|
7
|
+
|
|
8
|
+
function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
|
|
9
|
+
|
|
10
|
+
var OpenAI__default = /*#__PURE__*/_interopDefault(OpenAI);
|
|
11
|
+
|
|
12
|
+
// src/helpers/bufferToInt16Array.ts
|
|
13
|
+
/**
 * Reinterpret a Node Buffer as a view of 16-bit signed PCM samples.
 * Zero-copy: the returned Int16Array shares memory with `buffer`.
 * Assumes buffer.byteLength is even (a whole number of samples).
 */
function bufferToInt16Array(buffer) {
  const sampleCount = buffer.byteLength / 2;
  return new Int16Array(buffer.buffer, buffer.byteOffset, sampleCount);
}
|
|
16
|
+
|
|
17
|
+
// src/decoders/mulaw.ts
|
|
18
|
+
/**
 * Decode a byte array of mu-law codes into 16-bit linear PCM samples.
 * @param {Uint8Array|Buffer} mulawData - one mu-law code per byte.
 * @returns {Int16Array} decoded samples, same length as the input.
 */
function mulawToPcm16(mulawData) {
  const decoded = new Int16Array(mulawData.length);
  let idx = 0;
  for (const code of mulawData) {
    decoded[idx++] = mulawToLinear(code);
  }
  return decoded;
}
/**
 * Expand a single mu-law byte to its linear value.
 * Codes are stored complemented; bit 7 is the sign, bits 4-6 the segment,
 * bits 0-3 the step within the segment.
 * NOTE(review): for segment 0 the -33 bias drives small codes negative
 * (e.g. 0xFF decodes to -32 rather than 0), which differs from canonical
 * G.711 decode tables -- confirm this matches the original TS source intent.
 */
function mulawToLinear(mulawByte) {
  const complemented = mulawByte ^ 0xff;
  const negative = (complemented & 0x80) !== 0;
  const segment = (complemented & 0x70) >> 4;
  const step = complemented & 0x0f;
  let magnitude;
  if (segment === 0) {
    magnitude = (step << 1) + 1;
  } else {
    magnitude = ((step << 1) + 1 + 32) << (segment + 2);
  }
  magnitude -= 33;
  return negative ? -magnitude : magnitude;
}
|
|
39
|
+
|
|
40
|
+
// src/encoders/mulaw.ts
|
|
41
|
+
// src/encoders/mulaw.ts
// Bias added before segment lookup and the clip ceiling for 16-bit input.
var BIAS = 132;
var CLIP = 32635;
// Segment lookup table: encodeTable[i] is the index of the highest set bit
// of i (0 for i = 0). Generated instead of spelled out literal-by-literal;
// the resulting 256 entries are 2x0, 2x1, 4x2, 8x3, 16x4, 32x5, 64x6, 128x7.
var encodeTable = (() => {
  const table = [0];
  for (let i = 1; i < 256; i++) {
    table.push(31 - Math.clz32(i));
  }
  return table;
})();
/**
 * Encode one 16-bit PCM sample as a mu-law code. The value is returned
 * bitwise-complemented (may be negative); callers store it into a
 * Uint8Array, which truncates it to the low 8 bits.
 */
function encodeSample(sample) {
  const sign = (sample >> 8) & 0x80;
  if (sign !== 0) sample = -sample;
  sample += BIAS;
  if (sample > CLIP) sample = CLIP;
  const exponent = encodeTable[(sample >> 7) & 0xff];
  const mantissa = (sample >> (exponent + 3)) & 0x0f;
  return ~(sign | (exponent << 4) | mantissa);
}
/**
 * Encode an array of 16-bit PCM samples into mu-law bytes.
 * @param {Int16Array} pcmData
 * @returns {Uint8Array} one mu-law byte per input sample.
 */
function pcm16ToMulaw(pcmData) {
  const encoded = new Uint8Array(pcmData.length);
  pcmData.forEach((sample, i) => {
    encoded[i] = encodeSample(sample);
  });
  return encoded;
}
|
|
317
|
+
|
|
318
|
+
// src/helpers/int16ArrayToBuffer.ts
|
|
319
|
+
// src/helpers/int16ArrayToBuffer.ts
/**
 * Wrap an Int16Array's underlying storage in a Node Buffer.
 * Zero-copy: the Buffer aliases the same memory as the typed array.
 */
function int16ArrayToBuffer(int16Array) {
  const { buffer, byteOffset, byteLength } = int16Array;
  return Buffer.from(buffer, byteOffset, byteLength);
}
|
|
322
|
+
|
|
323
|
+
// src/helpers/convertAudioFormat.ts
|
|
324
|
+
// src/helpers/convertAudioFormat.ts
/**
 * Encode 16-bit PCM samples into the requested wire encoding.
 * @param {Int16Array} audio - linear PCM samples.
 * @param {string} encoding - "mulaw" or "pcm".
 * @returns {Buffer} encoded bytes.
 * @throws {Error} for any other encoding value.
 */
function encodePcm(audio, encoding) {
  if (encoding === "mulaw") {
    return Buffer.from(pcm16ToMulaw(audio));
  }
  if (encoding === "pcm") {
    return int16ArrayToBuffer(audio);
  }
  throw new Error(`Could not encode audio: Unsupported encoding: ${encoding}`);
}
|
|
334
|
+
/**
 * Decode incoming audio bytes to 16-bit linear PCM samples.
 * @param {Buffer} audio - raw bytes in the given encoding.
 * @param {string} encoding - "mulaw" or "pcm".
 * @returns {Int16Array} decoded samples.
 * @throws {Error} for any other encoding value.
 */
function decodeToPcm(audio, encoding) {
  if (encoding === "mulaw") {
    return mulawToPcm16(audio);
  }
  if (encoding === "pcm") {
    return bufferToInt16Array(audio);
  }
  throw new Error(`Could not decode audio: Unsupported encoding: ${encoding}`);
}
|
|
344
|
+
|
|
345
|
+
// src/helpers/generateFadeOutSamples.ts
|
|
346
|
+
// src/helpers/generateFadeOutSamples.ts
/**
 * Build a linear fade-out ramp from `lastSampleValue` down to 0, returned
 * as the raw bytes of 16-bit PCM (a Uint8Array over an Int16Array buffer).
 * Used to avoid an audible click when audio ends on a non-zero sample.
 *
 * @param {number} lastSampleValue - final PCM sample of the preceding audio.
 * @param {number} fadeDurationMs - ramp length in milliseconds.
 * @param {number} sampleRate - samples per second of the target stream.
 * @returns {Uint8Array} byte view of the ramp; empty when the duration
 *   rounds to zero samples.
 */
function generateFadeOutSamples(lastSampleValue, fadeDurationMs, sampleRate) {
  const fadeNumSamples = Math.ceil(fadeDurationMs / 1e3 * sampleRate);
  const fadeSamples = new Int16Array(fadeNumSamples);
  if (fadeNumSamples === 1) {
    // A one-sample ramp goes straight to silence. (Previously this path
    // computed 0/0 = NaN, which the Int16Array silently coerced to 0;
    // now the result is explicit.)
    fadeSamples[0] = 0;
    return new Uint8Array(fadeSamples.buffer);
  }
  for (let i = 0; i < fadeNumSamples; i++) {
    // progress runs from 1 (first sample) down to 0 (last sample) inclusive.
    const progress = 1 - i / (fadeNumSamples - 1);
    fadeSamples[i] = Math.round(lastSampleValue * progress);
  }
  return new Uint8Array(fadeSamples.buffer);
}
|
|
355
|
+
|
|
356
|
+
// src/helpers/resamplePcm.ts
|
|
357
|
+
// src/helpers/resamplePcm.ts
/**
 * Resample 16-bit PCM via linear interpolation between neighboring samples.
 * @param {Int16Array} pcm - input samples.
 * @param {number} originalSampleRate
 * @param {number} targetSampleRate
 * @returns {Int16Array} resampled output, length floor(n * target / original).
 */
function resamplePcm(pcm, originalSampleRate, targetSampleRate) {
  const step = originalSampleRate / targetSampleRate;
  const outLength = Math.floor(pcm.length / step);
  const out = new Int16Array(outLength);
  for (let i = 0; i < outLength; i++) {
    const srcPos = i * step;
    const left = Math.floor(srcPos);
    // Clamp so the final output sample never reads past the input.
    const right = Math.min(left + 1, pcm.length - 1);
    const frac = srcPos - left;
    out[i] = Math.round(pcm[left] + (pcm[right] - pcm[left]) * frac);
  }
  return out;
}
|
|
372
|
+
|
|
373
|
+
// src/voice/helpers.ts
|
|
374
|
+
// src/voice/helpers.ts
/**
 * Greedily split `text` into sentence-terminated chunks of at least
 * `targetLength` characters. Scanning begins at index `targetLength`, so
 * every emitted chunk is at least that long and ends with '.', '!' or '?'.
 * Trailing text with no terminator (or shorter than `targetLength`) is NOT
 * returned -- callers keep it buffered for a later call.
 *
 * @param {string} text
 * @param {number} [targetLength=100] minimum chunk length before a terminator counts.
 * @returns {string[]} completed chunks, in order.
 */
function splitTextIntoChunks(text, targetLength = 100) {
  const endOfSentencePunctuation = [".", "!", "?"];
  const sentences = [];
  for (let i = targetLength; i < text.length; i++) {
    if (endOfSentencePunctuation.includes(text[i])) {
      sentences.push(text.slice(0, i + 1));
      text = text.slice(i + 1);
      // Restart the scan at index targetLength of the remaining text.
      // Set to targetLength - 1 because the loop's i++ runs next; the
      // previous code used targetLength, which skipped the character at
      // exactly that index and could miss a terminator there.
      i = targetLength - 1;
    }
  }
  return sentences;
}
|
|
386
|
+
|
|
387
|
+
// src/voice/client.ts
|
|
388
|
+
// src/voice/client.ts
// Internal pipeline sample rate: STT input and TTS output are normalized
// to 48 kHz before being converted to the caller's formats.
var uniformSampleRate = 48e3;
// Orchestrates a full voice pipeline: caller audio -> STT -> text out, and
// caller text -> TTS -> audio out. Text is queued in sentence-sized chunks
// and synthesized one chunk at a time (guarded by `generatingAudio`);
// a falsy value from the TTS provider marks end-of-utterance.
var MagmaFlow = class {
  constructor(args) {
    this.textBuffer = "";
    this.textQueue = [];
    this.generatingAudio = false;
    this.audioBuffer = [];
    this.stt = args.stt;
    this.tts = args.tts;
    this.inputFormat = args.inputFormat;
    this.outputFormat = args.outputFormat;
    this.onAudioOutput = args.onAudioOutput;
    // TTS completion protocol: chunks are buffered until the provider
    // emits a falsy sentinel, then the utterance is flushed with a short
    // fade-out and the next queued chunk is started.
    this.tts.onOutput = (audio) => {
      if (!audio) {
        // Append a 500 ms fade from the last sample to 0 to avoid a click.
        // The 48e3 literal matches uniformSampleRate (buffered audio is
        // still at the internal rate here).
        const lastChunk = this.audioBuffer[this.audioBuffer.length - 1];
        if (lastChunk) {
          const lastChunkSamples = bufferToInt16Array(lastChunk);
          const lastSampleValue = lastChunkSamples[lastChunkSamples.length - 1];
          this.audioBuffer.push(
            Buffer.from(generateFadeOutSamples(lastSampleValue, 500, 48e3))
          );
        }
        this.sendAudio();
        // Release the latch BEFORE generateAudio() so the next chunk runs.
        this.generatingAudio = false;
        this.generateAudio();
        return;
      }
      this.audioBuffer.push(audio);
    };
    this.stt.onOutput = args.onTextOutput;
    this.stt.onSpeechDetected = args.onSpeechDetected;
  }
  // Feed caller audio into STT: decode from the caller's encoding and
  // resample up/down to the internal 48 kHz rate.
  inputAudio(audio) {
    const decodedAudio = decodeToPcm(audio, this.inputFormat.encoding);
    const resampledPCM = resamplePcm(decodedAudio, this.inputFormat.sampleRate, 48e3);
    this.stt.input(int16ArrayToBuffer(resampledPCM));
  }
  // Feed caller text toward TTS. A falsy `text` flushes whatever is
  // buffered as a final chunk; otherwise text accumulates until
  // splitTextIntoChunks can carve off sentence-terminated chunks (>= 50 chars).
  inputText(text) {
    if (!text) {
      this.textQueue.push(this.textBuffer);
      this.textBuffer = "";
      this.generateAudio();
      return;
    }
    this.textBuffer += text;
    const chunks = splitTextIntoChunks(this.textBuffer, 50);
    for (const chunk of chunks) {
      this.textQueue.push(chunk);
      // Chunks are prefixes of textBuffer in order, so slicing by length
      // removes exactly the emitted chunk.
      this.textBuffer = this.textBuffer.slice(chunk.length);
      this.generateAudio();
    }
  }
  // Start synthesis of the next queued chunk unless one is in flight.
  // Re-invoked from tts.onOutput when the current chunk finishes.
  generateAudio() {
    if (this.generatingAudio) return;
    const chunk = this.textQueue.shift();
    if (!chunk) return;
    this.generatingAudio = true;
    this.tts.input(chunk);
  }
  // Flush buffered 48 kHz PCM to the caller: concatenate, resample to the
  // caller's rate, encode, and deliver via onAudioOutput.
  // NOTE(review): bufferToInt16Array assumes the concatenated buffer has an
  // even byte length, i.e. providers emit whole 16-bit samples -- confirm.
  sendAudio() {
    if (this.audioBuffer.length === 0) return;
    const concatenatedBuffer = Buffer.concat(this.audioBuffer);
    const resampledPCM = resamplePcm(
      bufferToInt16Array(concatenatedBuffer),
      uniformSampleRate,
      this.outputFormat.sampleRate
    );
    const encodedAudio = encodePcm(resampledPCM, this.outputFormat.encoding);
    this.audioBuffer = [];
    // A throwing user callback must not break the pipeline state.
    try {
      this.onAudioOutput(encodedAudio);
    } catch (error) {
      console.error("Audio output callback error:", error);
    }
  }
  // Tear down both providers and drop all buffered state.
  kill() {
    this.stt.kill();
    this.tts.kill();
    this.audioBuffer = [];
    this.textQueue = [];
    this.textBuffer = "";
    this.generatingAudio = false;
  }
};
|
|
472
|
+
|
|
473
|
+
// src/voice/speechToText/base.ts
|
|
474
|
+
// src/voice/speechToText/base.ts
/**
 * Base class for speech-to-text providers. MagmaFlow overwrites the
 * `onSpeechDetected` / `onOutput` hooks with its own callbacks; the
 * defaults below only log so an unwired provider is still observable.
 */
var MagmaFlowSpeechToText = class {
  constructor() {
  }
  /** Hook invoked whenever the provider hears speech. Default: log only. */
  onSpeechDetected() {
    console.log(`[Default STT] Speech detected`);
  }
  /** Hook invoked with finalized transcript text. Default: log only. */
  onOutput(text) {
    console.log(`[Default STT] Output: ${text}`);
  }
};
|
|
484
|
+
// Interval between Deepgram websocket keep-alive pings, in milliseconds.
var kKeepAliveInterval = 5e3;
// Runtime enum of supported Deepgram STT models.
var DeepgramModel = Object.assign(DeepgramModel || {}, {
  NOVA_3: "nova-3"
});
// Runtime enum of supported Deepgram transcription languages.
var DeepgramLanguage = Object.assign(DeepgramLanguage || {}, {
  EN_US: "en-US"
});
|
|
493
|
+
// Deepgram live-transcription STT provider. Opens a websocket lazily on
// first input(), streams 48 kHz linear16 audio, and forwards finalized
// transcripts (with a confidence prefix) through onOutput.
var DeepgramSTT = class extends MagmaFlowSpeechToText {
  constructor(args) {
    super();
    // Live websocket connection; created lazily by setup() on first input().
    this.connection = null;
    // Accumulates finalized transcript segments until speech_final fires.
    this.textBuffer = "";
    // Defaults match the pipeline's internal format (48 kHz mono linear16);
    // caller-supplied config overrides them via spread.
    this.config = {
      model: args.model,
      vad_events: true,
      interim_results: true,
      encoding: "linear16",
      sample_rate: 48e3,
      channels: 1,
      ...args.config
    };
    this.client = args.client ?? new sdk.DeepgramClient({
      key: process.env.DEEPGRAM_API_KEY
    });
  }
  // Open the live connection and attach event handlers. Non-transcript
  // events are only logged.
  setup() {
    this.connection = this.client.listen.live(this.config);
    this.connection.on(sdk.LiveTranscriptionEvents.Error, (event) => {
      console.log(`[Deepgram] Error: ${JSON.stringify(event)}`);
    });
    this.connection.on(sdk.LiveTranscriptionEvents.Close, (event) => {
      console.log(`[Deepgram] Close: ${JSON.stringify(event)}`);
    });
    this.connection.on(sdk.LiveTranscriptionEvents.Open, this.onOpen.bind(this));
    this.connection.on(sdk.LiveTranscriptionEvents.Unhandled, (event) => {
      console.log(`[Deepgram] Unhandled event: ${JSON.stringify(event)}`);
    });
    this.connection.on(
      sdk.LiveTranscriptionEvents.Transcript,
      this.handleTranscriptionEvent.bind(this)
    );
    this.connection.on(sdk.LiveTranscriptionEvents.UtteranceEnd, (event) => {
      console.log(`[Deepgram] Utterance end: ${JSON.stringify(event)}`);
    });
    }
  // Stream an audio chunk to Deepgram, lazily connecting first.
  // NOTE(review): `audio.buffer` sends the underlying ArrayBuffer; if a
  // caller ever passed a pooled/sliced Buffer this would send extra bytes.
  // Current callers build the Buffer over a fresh typed array -- confirm
  // that invariant holds for all callers.
  input(audio) {
    if (!this.connection) {
      this.setup();
      // Single-level recursion: connection is non-null after setup().
      return this.input(audio);
    }
    this.connection?.send(audio.buffer);
  }
  // Ask Deepgram to finalize any pending partial transcript.
  flush() {
    this.connection?.finalize();
  }
  // Close the connection; a later input() will reconnect via setup().
  kill() {
    this.connection?.requestClose();
    this.connection = null;
  }
  // Buffer finalized transcript segments; deliver the accumulated text
  // through onOutput once Deepgram marks the end of speech (speech_final).
  handleTranscriptionEvent(transcriptionEvent) {
    const transcriptOption = transcriptionEvent.channel.alternatives[0];
    if (transcriptOption.transcript.trim() === "") {
      return;
    }
    // Any non-empty transcript (including interim ones) counts as speech.
    this.onSpeechDetected();
    if (transcriptionEvent.is_final || transcriptionEvent.speech_final || transcriptionEvent.from_finalize) {
      const confidencePct = Math.round(transcriptOption.confidence * 100);
      const text = `[transcription confidence=${confidencePct}%]: ${transcriptOption.transcript}`;
      this.textBuffer += text + " ";
      if (transcriptionEvent.speech_final) {
        this.onOutput(this.textBuffer);
        this.textBuffer = "";
      }
    }
  }
  // Connection-open handler: start the keep-alive ping loop.
  onOpen() {
    console.log(`[Deepgram] Connected`);
    this.keepAlive();
  }
  // Self-rescheduling ping every kKeepAliveInterval ms; stops once the
  // connection reports it is no longer connected.
  keepAlive() {
    setTimeout(() => {
      if (this.connection?.isConnected()) {
        this.connection.keepAlive();
        this.keepAlive();
      } else {
        return;
      }
    }, kKeepAliveInterval);
  }
};
|
|
576
|
+
|
|
577
|
+
// src/voice/textToSpeech/base.ts
|
|
578
|
+
// src/voice/textToSpeech/base.ts
/**
 * Base class for text-to-speech providers. MagmaFlow overwrites the
 * `onOutput` hook with its own audio-buffering callback; the default
 * below only logs so an unwired provider is still observable.
 */
var MagmaFlowTextToSpeech = class {
  constructor() {
  }
  /** Hook invoked with each audio chunk (or a null terminator). Default: log only. */
  onOutput(audio) {
    console.log("[Default TTS] Output:", audio);
  }
};
|
|
585
|
+
// Deepgram Aura TTS provider. Streams 48 kHz linear16 PCM chunks through
// output(), then signals end-of-utterance with a null chunk.
var DeepgramTTS = class extends MagmaFlowTextToSpeech {
  /**
   * @param {{ client?: object }} args - optional preconfigured Deepgram
   *   client; otherwise one is built from DEEPGRAM_API_KEY.
   */
  constructor(args) {
    super();
    this.client = args.client ?? new sdk.DeepgramClient({ key: process.env.DEEPGRAM_API_KEY });
  }
  /** No warm-up needed; synthesis uses one-shot requests. */
  async setup() {
  }
  /**
   * Synthesize `text` and emit audio chunks via output(), ending with a
   * null terminator. Fire-and-forget: the request runs asynchronously.
   */
  input(text) {
    if (!text) {
      return;
    }
    this.client.speak.request(
      {
        text
      },
      {
        sample_rate: 48e3,
        encoding: "linear16",
        model: "aura-2-thalia-en",
        container: "none"
      }
    ).then(async (response) => {
      const stream = await response.getStream();
      if (!stream) {
        return;
      }
      for await (const chunk of stream) {
        this.output(Buffer.from(chunk));
      }
      this.output(null);
      console.log("[Deepgram] Finished:", text);
    }).catch((error) => {
      // Previously this chain had no rejection handler, so any request or
      // stream failure surfaced as an unhandled promise rejection. Log it
      // and emit the null terminator so the pipeline's generation latch
      // is released instead of sticking forever.
      console.error("[Deepgram] TTS error:", error);
      this.output(null);
    });
  }
  /** Forward a PCM chunk (or null terminator) to the registered handler. */
  output(audio) {
    this.onOutput(audio);
  }
  /** Nothing persistent to tear down; requests are one-shot. */
  kill() {
  }
  /** No internal state to reset. */
  reset() {
  }
};
|
|
626
|
+
// Friendly-name -> ElevenLabs voice-id lookup.
var ElevenVoice = Object.assign(ElevenVoice || {}, {
  chris: "iP95p4xoKVk53GoZ742B",
  josh: "TxGEqnHWrfWFTfGW9XjX",
  rachel: "21m00Tcm4TlvDq8ikWAM",
  laura: "FGY2WhTYpPnrIDTdsKH5",
  felicity: "aTbnroHRGIomiKpqAQR8"
});
|
|
634
|
+
// ElevenLabs TTS provider. Streams 48 kHz PCM chunks through output(),
// then signals end-of-utterance with a null chunk.
var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
  /**
   * @param {{ client?: object, model: string, voice: string, config?: object }} args -
   *   optional preconfigured ElevenLabsClient (otherwise built from
   *   ELEVENLABS_API_KEY), model id, voice id, and extra request options
   *   spread into each stream call.
   */
  constructor(args) {
    super();
    this.client = args.client ?? new elevenlabsJs.ElevenLabsClient({
      apiKey: process.env.ELEVENLABS_API_KEY
    });
    this.model = args.model;
    this.voice = args.voice;
    this.config = args.config ?? {};
  }
  /** No warm-up needed; synthesis uses one-shot streaming requests. */
  async setup() {
  }
  /**
   * Synthesize `text` and emit audio chunks via output(), ending with a
   * null terminator. Fire-and-forget: the request runs asynchronously.
   */
  input(text) {
    if (!text) {
      return;
    }
    this.client.textToSpeech.stream(this.voice, {
      text,
      outputFormat: "pcm_48000",
      modelId: this.model,
      ...this.config
    }).then(async (stream) => {
      for await (const chunk of stream) {
        this.output(chunk);
      }
      this.output(null);
      console.log("[ElevenLabs] Finished:", text);
    }).catch((error) => {
      // Previously this chain had no rejection handler, so any request or
      // stream failure surfaced as an unhandled promise rejection. Log it
      // and emit the null terminator so the pipeline's generation latch
      // is released instead of sticking forever.
      console.error("[ElevenLabs] TTS error:", error);
      this.output(null);
    });
  }
  /** Forward a PCM chunk (or null terminator) to the registered handler. */
  output(audio) {
    this.onOutput(audio);
  }
  /** Nothing persistent to tear down; requests are one-shot. */
  kill() {
  }
  /** No internal state to reset. */
  reset() {
  }
};
|
|
671
|
+
// Hume TTS provider. Streams PCM chunks (delivered base64-encoded in the
// JSON stream) through output(), then signals end-of-utterance with null.
var HumeTTS = class extends MagmaFlowTextToSpeech {
  /**
   * @param {{ client?: object }} args - optional preconfigured HumeClient;
   *   otherwise one is built from HUME_API_KEY.
   */
  constructor(args) {
    super();
    this.client = args.client ?? new hume.HumeClient({ apiKey: process.env.HUME_API_KEY });
  }
  /** No warm-up needed; synthesis uses one-shot streaming requests. */
  async setup() {
  }
  /**
   * Synthesize `text` and emit decoded audio chunks via output(), ending
   * with a null terminator. Fire-and-forget: the request runs asynchronously.
   */
  input(text) {
    if (!text) {
      return;
    }
    this.client.tts.synthesizeJsonStreaming({
      utterances: [
        {
          text
        }
      ],
      format: {
        type: "pcm"
      },
      instantMode: true
    }).then(async (stream) => {
      for await (const chunk of stream) {
        // Each streamed JSON message carries its audio payload as base64.
        this.output(Buffer.from(chunk.audio, "base64"));
      }
      this.output(null);
      console.log("[Hume] Finished:", text);
    }).catch((error) => {
      // Previously this chain had no rejection handler, so any request or
      // stream failure surfaced as an unhandled promise rejection. Log it
      // and emit the null terminator so the pipeline's generation latch
      // is released instead of sticking forever.
      console.error("[Hume] TTS error:", error);
      this.output(null);
    });
  }
  /** Forward a PCM chunk (or null terminator) to the registered handler. */
  output(audio) {
    this.onOutput(audio);
  }
  /** Nothing persistent to tear down; requests are one-shot. */
  kill() {
  }
  /** No internal state to reset. */
  reset() {
  }
};
|
|
708
|
+
// OpenAI TTS provider (class named WhisperTTS in the public API). Fetches
// the full synthesized clip, resamples 24 kHz -> 48 kHz, emits it as one
// chunk, then signals end-of-utterance with null.
var WhisperTTS = class extends MagmaFlowTextToSpeech {
  /**
   * @param {{ client?: object }} args - optional preconfigured OpenAI
   *   client; otherwise one is built from OPENAI_API_KEY.
   */
  constructor(args) {
    super();
    this.client = args.client ?? new OpenAI__default.default({ apiKey: process.env.OPENAI_API_KEY });
  }
  /** No warm-up needed; synthesis uses one-shot requests. */
  async setup() {
  }
  /**
   * Synthesize `text`, emit the full clip via output(), then the null
   * terminator. Fire-and-forget: the request runs asynchronously.
   */
  input(text) {
    if (!text) {
      return;
    }
    this.client.audio.speech.create({
      model: "gpt-4o-mini-tts",
      voice: "alloy",
      input: text,
      response_format: "pcm"
    }).then(async (res) => {
      const result = await res.arrayBuffer();
      // The PCM response is upsampled from 24 kHz to the pipeline's
      // internal 48 kHz rate before delivery.
      const resampledPCM = resamplePcm(
        bufferToInt16Array(Buffer.from(result)),
        24e3,
        48e3
      );
      this.output(int16ArrayToBuffer(resampledPCM));
      this.output(null);
      console.log("[Whisper] Finished:", text);
    }).catch((error) => {
      // Previously this chain had no rejection handler, so any request
      // failure surfaced as an unhandled promise rejection. Log it and
      // emit the null terminator so the pipeline's generation latch is
      // released instead of sticking forever.
      console.error("[Whisper] TTS error:", error);
      this.output(null);
    });
  }
  /** Forward a PCM chunk (or null terminator) to the registered handler. */
  output(audio) {
    this.onOutput(audio);
  }
  /** Nothing persistent to tear down; requests are one-shot. */
  kill() {
  }
  /** No internal state to reset. */
  reset() {
  }
};
|
|
744
|
+
|
|
745
|
+
// Public CommonJS API of @pompeii-labs/audio's voice module.
Object.assign(exports, {
  DeepgramLanguage,
  DeepgramModel,
  DeepgramSTT,
  DeepgramTTS,
  ElevenLabsTTS,
  ElevenVoice,
  HumeTTS,
  MagmaFlow,
  MagmaFlowSpeechToText,
  MagmaFlowTextToSpeech,
  WhisperTTS,
  splitTextIntoChunks
});