@pompeii-labs/audio 0.0.5 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/voice.d.mts +4 -1
- package/dist/voice.d.ts +4 -1
- package/dist/voice.js +52 -17
- package/dist/voice.mjs +52 -17
- package/package.json +6 -3
package/dist/voice.d.mts
CHANGED
|
@@ -23,7 +23,7 @@ declare abstract class MagmaFlowTextToSpeech {
|
|
|
23
23
|
|
|
24
24
|
type MagmaFlowConfig = {
|
|
25
25
|
pauseDurationMs?: number;
|
|
26
|
-
|
|
26
|
+
sentenceChunkLength?: number;
|
|
27
27
|
};
|
|
28
28
|
type MagmaFlowArgs = {
|
|
29
29
|
stt: MagmaFlowSpeechToText;
|
|
@@ -73,12 +73,15 @@ declare class DeepgramSTT extends MagmaFlowSpeechToText {
|
|
|
73
73
|
private connection;
|
|
74
74
|
private config;
|
|
75
75
|
private textBuffer;
|
|
76
|
+
private utteranceEnded;
|
|
76
77
|
constructor(args: DeepgramSTTArgs);
|
|
77
78
|
private setup;
|
|
78
79
|
input(audio: Buffer): void;
|
|
79
80
|
flush(): void;
|
|
80
81
|
kill(): void;
|
|
81
82
|
private handleTranscriptionEvent;
|
|
83
|
+
private handleUtteranceEnd;
|
|
84
|
+
private sendOutput;
|
|
82
85
|
private onOpen;
|
|
83
86
|
private keepAlive;
|
|
84
87
|
}
|
package/dist/voice.d.ts
CHANGED
|
@@ -23,7 +23,7 @@ declare abstract class MagmaFlowTextToSpeech {
|
|
|
23
23
|
|
|
24
24
|
type MagmaFlowConfig = {
|
|
25
25
|
pauseDurationMs?: number;
|
|
26
|
-
|
|
26
|
+
sentenceChunkLength?: number;
|
|
27
27
|
};
|
|
28
28
|
type MagmaFlowArgs = {
|
|
29
29
|
stt: MagmaFlowSpeechToText;
|
|
@@ -73,12 +73,15 @@ declare class DeepgramSTT extends MagmaFlowSpeechToText {
|
|
|
73
73
|
private connection;
|
|
74
74
|
private config;
|
|
75
75
|
private textBuffer;
|
|
76
|
+
private utteranceEnded;
|
|
76
77
|
constructor(args: DeepgramSTTArgs);
|
|
77
78
|
private setup;
|
|
78
79
|
input(audio: Buffer): void;
|
|
79
80
|
flush(): void;
|
|
80
81
|
kill(): void;
|
|
81
82
|
private handleTranscriptionEvent;
|
|
83
|
+
private handleUtteranceEnd;
|
|
84
|
+
private sendOutput;
|
|
82
85
|
private onOpen;
|
|
83
86
|
private keepAlive;
|
|
84
87
|
}
|
package/dist/voice.js
CHANGED
|
@@ -375,7 +375,7 @@ function splitTextIntoChunks(text, targetLength = 100) {
|
|
|
375
375
|
const endOfSentencePunctuation = [".", "!", "?"];
|
|
376
376
|
const sentences = [];
|
|
377
377
|
for (let i = targetLength; i < text.length; i++) {
|
|
378
|
-
if (endOfSentencePunctuation.includes(text[i])) {
|
|
378
|
+
if (endOfSentencePunctuation.includes(text[i]) && (i === text.length - 1 || text[i + 1] === " ")) {
|
|
379
379
|
sentences.push(text.slice(0, i + 1));
|
|
380
380
|
text = text.slice(i + 1);
|
|
381
381
|
i = targetLength;
|
|
@@ -387,16 +387,21 @@ function splitTextIntoChunks(text, targetLength = 100) {
|
|
|
387
387
|
// src/voice/client.ts
|
|
388
388
|
var uniformSampleRate = 48e3;
|
|
389
389
|
var MagmaFlow = class {
|
|
390
|
+
stt;
|
|
391
|
+
tts;
|
|
392
|
+
inputFormat;
|
|
393
|
+
outputFormat;
|
|
394
|
+
onAudioOutput;
|
|
395
|
+
textBuffer = "";
|
|
396
|
+
textQueue = [];
|
|
397
|
+
generatingAudio = false;
|
|
398
|
+
currentRequestId = null;
|
|
399
|
+
audioBuffer = [];
|
|
400
|
+
config = {
|
|
401
|
+
pauseDurationMs: 500,
|
|
402
|
+
sentenceChunkLength: 50
|
|
403
|
+
};
|
|
390
404
|
constructor(args) {
|
|
391
|
-
this.textBuffer = "";
|
|
392
|
-
this.textQueue = [];
|
|
393
|
-
this.generatingAudio = false;
|
|
394
|
-
this.currentRequestId = null;
|
|
395
|
-
this.audioBuffer = [];
|
|
396
|
-
this.config = {
|
|
397
|
-
pauseDurationMs: 500,
|
|
398
|
-
setenceChunkLength: 50
|
|
399
|
-
};
|
|
400
405
|
this.stt = args.stt;
|
|
401
406
|
this.tts = args.tts;
|
|
402
407
|
this.inputFormat = args.inputFormat;
|
|
@@ -446,7 +451,7 @@ var MagmaFlow = class {
|
|
|
446
451
|
return;
|
|
447
452
|
}
|
|
448
453
|
this.textBuffer += text;
|
|
449
|
-
const chunks = splitTextIntoChunks(this.textBuffer, this.config.
|
|
454
|
+
const chunks = splitTextIntoChunks(this.textBuffer, this.config.sentenceChunkLength ?? 50);
|
|
450
455
|
for (const chunk of chunks) {
|
|
451
456
|
this.textQueue.push(chunk);
|
|
452
457
|
this.textBuffer = this.textBuffer.slice(chunk.length);
|
|
@@ -517,10 +522,13 @@ var DeepgramLanguage = /* @__PURE__ */ ((DeepgramLanguage2) => {
|
|
|
517
522
|
return DeepgramLanguage2;
|
|
518
523
|
})(DeepgramLanguage || {});
|
|
519
524
|
var DeepgramSTT = class extends MagmaFlowSpeechToText {
|
|
525
|
+
client;
|
|
526
|
+
connection = null;
|
|
527
|
+
config;
|
|
528
|
+
textBuffer = "";
|
|
529
|
+
utteranceEnded = false;
|
|
520
530
|
constructor(args) {
|
|
521
531
|
super();
|
|
522
|
-
this.connection = null;
|
|
523
|
-
this.textBuffer = "";
|
|
524
532
|
this.config = {
|
|
525
533
|
model: args.model,
|
|
526
534
|
vad_events: true,
|
|
@@ -528,6 +536,7 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
|
|
|
528
536
|
encoding: "linear16",
|
|
529
537
|
sample_rate: 48e3,
|
|
530
538
|
channels: 1,
|
|
539
|
+
utterance_end_ms: 1500,
|
|
531
540
|
...args.config
|
|
532
541
|
};
|
|
533
542
|
this.client = args.client ?? new sdk.DeepgramClient({
|
|
@@ -552,6 +561,7 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
|
|
|
552
561
|
);
|
|
553
562
|
this.connection.on(sdk.LiveTranscriptionEvents.UtteranceEnd, (event) => {
|
|
554
563
|
console.log(`[Deepgram] Utterance end: ${JSON.stringify(event)}`);
|
|
564
|
+
this.handleUtteranceEnd();
|
|
555
565
|
});
|
|
556
566
|
}
|
|
557
567
|
input(audio) {
|
|
@@ -574,16 +584,33 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
|
|
|
574
584
|
return;
|
|
575
585
|
}
|
|
576
586
|
this.onSpeechDetected();
|
|
587
|
+
if (transcriptionEvent.speech_final) {
|
|
588
|
+
this.utteranceEnded = false;
|
|
589
|
+
}
|
|
577
590
|
if (transcriptionEvent.is_final || transcriptionEvent.speech_final || transcriptionEvent.from_finalize) {
|
|
578
591
|
const confidencePct = Math.round(transcriptOption.confidence * 100);
|
|
579
592
|
const text = `[transcription confidence=${confidencePct}%]: ${transcriptOption.transcript}`;
|
|
580
593
|
this.textBuffer += text + " ";
|
|
581
594
|
if (transcriptionEvent.speech_final) {
|
|
582
|
-
this.
|
|
583
|
-
this.textBuffer = "";
|
|
595
|
+
this.sendOutput();
|
|
584
596
|
}
|
|
585
597
|
}
|
|
586
598
|
}
|
|
599
|
+
handleUtteranceEnd() {
|
|
600
|
+
this.utteranceEnded = true;
|
|
601
|
+
this.sendOutput();
|
|
602
|
+
}
|
|
603
|
+
sendOutput() {
|
|
604
|
+
if (!this.utteranceEnded) {
|
|
605
|
+
return;
|
|
606
|
+
}
|
|
607
|
+
if (this.textBuffer.trim() === "") {
|
|
608
|
+
this.textBuffer = "[unintelligible]";
|
|
609
|
+
}
|
|
610
|
+
this.onOutput(this.textBuffer);
|
|
611
|
+
this.textBuffer = "";
|
|
612
|
+
this.utteranceEnded = false;
|
|
613
|
+
}
|
|
587
614
|
onOpen() {
|
|
588
615
|
console.log(`[Deepgram] Connected`);
|
|
589
616
|
this.keepAlive();
|
|
@@ -609,6 +636,7 @@ var MagmaFlowTextToSpeech = class {
|
|
|
609
636
|
}
|
|
610
637
|
};
|
|
611
638
|
var DeepgramTTS = class extends MagmaFlowTextToSpeech {
|
|
639
|
+
client;
|
|
612
640
|
constructor(args) {
|
|
613
641
|
super();
|
|
614
642
|
this.client = args.client ?? new sdk.DeepgramClient({ key: process.env.DEEPGRAM_API_KEY });
|
|
@@ -655,6 +683,10 @@ var ElevenVoice = /* @__PURE__ */ ((ElevenVoice2) => {
|
|
|
655
683
|
return ElevenVoice2;
|
|
656
684
|
})(ElevenVoice || {});
|
|
657
685
|
var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
|
|
686
|
+
client;
|
|
687
|
+
model;
|
|
688
|
+
voice;
|
|
689
|
+
config;
|
|
658
690
|
constructor(args) {
|
|
659
691
|
super();
|
|
660
692
|
this.client = args.client ?? new elevenlabsJs.ElevenLabsClient({
|
|
@@ -670,8 +702,9 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
|
|
|
670
702
|
if (!text) {
|
|
671
703
|
return;
|
|
672
704
|
}
|
|
705
|
+
const textToSend = text.replaceAll(/([a-zA-Z])-([a-zA-Z])/g, "$1 - $2").replaceAll(/(-\s*[a-zA-z])\s+([a-zA-z]\s*-)/g, "$1 - $2");
|
|
673
706
|
this.client.textToSpeech.stream(this.voice, {
|
|
674
|
-
text,
|
|
707
|
+
text: textToSend,
|
|
675
708
|
outputFormat: "pcm_48000",
|
|
676
709
|
modelId: this.model,
|
|
677
710
|
...this.config
|
|
@@ -680,7 +713,7 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
|
|
|
680
713
|
this.onOutput(chunk, requestId);
|
|
681
714
|
}
|
|
682
715
|
this.onOutput(null, requestId);
|
|
683
|
-
console.log("[ElevenLabs] Finished:",
|
|
716
|
+
console.log("[ElevenLabs] Finished:", textToSend);
|
|
684
717
|
});
|
|
685
718
|
}
|
|
686
719
|
kill() {
|
|
@@ -689,6 +722,7 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
|
|
|
689
722
|
}
|
|
690
723
|
};
|
|
691
724
|
var HumeTTS = class extends MagmaFlowTextToSpeech {
|
|
725
|
+
client;
|
|
692
726
|
constructor(args) {
|
|
693
727
|
super();
|
|
694
728
|
this.client = args.client ?? new hume.HumeClient({ apiKey: process.env.HUME_API_KEY });
|
|
@@ -723,6 +757,7 @@ var HumeTTS = class extends MagmaFlowTextToSpeech {
|
|
|
723
757
|
}
|
|
724
758
|
};
|
|
725
759
|
var WhisperTTS = class extends MagmaFlowTextToSpeech {
|
|
760
|
+
client;
|
|
726
761
|
constructor(args) {
|
|
727
762
|
super();
|
|
728
763
|
this.client = args.client ?? new OpenAI__default.default({ apiKey: process.env.OPENAI_API_KEY });
|
package/dist/voice.mjs
CHANGED
|
@@ -369,7 +369,7 @@ function splitTextIntoChunks(text, targetLength = 100) {
|
|
|
369
369
|
const endOfSentencePunctuation = [".", "!", "?"];
|
|
370
370
|
const sentences = [];
|
|
371
371
|
for (let i = targetLength; i < text.length; i++) {
|
|
372
|
-
if (endOfSentencePunctuation.includes(text[i])) {
|
|
372
|
+
if (endOfSentencePunctuation.includes(text[i]) && (i === text.length - 1 || text[i + 1] === " ")) {
|
|
373
373
|
sentences.push(text.slice(0, i + 1));
|
|
374
374
|
text = text.slice(i + 1);
|
|
375
375
|
i = targetLength;
|
|
@@ -381,16 +381,21 @@ function splitTextIntoChunks(text, targetLength = 100) {
|
|
|
381
381
|
// src/voice/client.ts
|
|
382
382
|
var uniformSampleRate = 48e3;
|
|
383
383
|
var MagmaFlow = class {
|
|
384
|
+
stt;
|
|
385
|
+
tts;
|
|
386
|
+
inputFormat;
|
|
387
|
+
outputFormat;
|
|
388
|
+
onAudioOutput;
|
|
389
|
+
textBuffer = "";
|
|
390
|
+
textQueue = [];
|
|
391
|
+
generatingAudio = false;
|
|
392
|
+
currentRequestId = null;
|
|
393
|
+
audioBuffer = [];
|
|
394
|
+
config = {
|
|
395
|
+
pauseDurationMs: 500,
|
|
396
|
+
sentenceChunkLength: 50
|
|
397
|
+
};
|
|
384
398
|
constructor(args) {
|
|
385
|
-
this.textBuffer = "";
|
|
386
|
-
this.textQueue = [];
|
|
387
|
-
this.generatingAudio = false;
|
|
388
|
-
this.currentRequestId = null;
|
|
389
|
-
this.audioBuffer = [];
|
|
390
|
-
this.config = {
|
|
391
|
-
pauseDurationMs: 500,
|
|
392
|
-
setenceChunkLength: 50
|
|
393
|
-
};
|
|
394
399
|
this.stt = args.stt;
|
|
395
400
|
this.tts = args.tts;
|
|
396
401
|
this.inputFormat = args.inputFormat;
|
|
@@ -440,7 +445,7 @@ var MagmaFlow = class {
|
|
|
440
445
|
return;
|
|
441
446
|
}
|
|
442
447
|
this.textBuffer += text;
|
|
443
|
-
const chunks = splitTextIntoChunks(this.textBuffer, this.config.
|
|
448
|
+
const chunks = splitTextIntoChunks(this.textBuffer, this.config.sentenceChunkLength ?? 50);
|
|
444
449
|
for (const chunk of chunks) {
|
|
445
450
|
this.textQueue.push(chunk);
|
|
446
451
|
this.textBuffer = this.textBuffer.slice(chunk.length);
|
|
@@ -511,10 +516,13 @@ var DeepgramLanguage = /* @__PURE__ */ ((DeepgramLanguage2) => {
|
|
|
511
516
|
return DeepgramLanguage2;
|
|
512
517
|
})(DeepgramLanguage || {});
|
|
513
518
|
var DeepgramSTT = class extends MagmaFlowSpeechToText {
|
|
519
|
+
client;
|
|
520
|
+
connection = null;
|
|
521
|
+
config;
|
|
522
|
+
textBuffer = "";
|
|
523
|
+
utteranceEnded = false;
|
|
514
524
|
constructor(args) {
|
|
515
525
|
super();
|
|
516
|
-
this.connection = null;
|
|
517
|
-
this.textBuffer = "";
|
|
518
526
|
this.config = {
|
|
519
527
|
model: args.model,
|
|
520
528
|
vad_events: true,
|
|
@@ -522,6 +530,7 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
|
|
|
522
530
|
encoding: "linear16",
|
|
523
531
|
sample_rate: 48e3,
|
|
524
532
|
channels: 1,
|
|
533
|
+
utterance_end_ms: 1500,
|
|
525
534
|
...args.config
|
|
526
535
|
};
|
|
527
536
|
this.client = args.client ?? new DeepgramClient({
|
|
@@ -546,6 +555,7 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
|
|
|
546
555
|
);
|
|
547
556
|
this.connection.on(LiveTranscriptionEvents.UtteranceEnd, (event) => {
|
|
548
557
|
console.log(`[Deepgram] Utterance end: ${JSON.stringify(event)}`);
|
|
558
|
+
this.handleUtteranceEnd();
|
|
549
559
|
});
|
|
550
560
|
}
|
|
551
561
|
input(audio) {
|
|
@@ -568,16 +578,33 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
|
|
|
568
578
|
return;
|
|
569
579
|
}
|
|
570
580
|
this.onSpeechDetected();
|
|
581
|
+
if (transcriptionEvent.speech_final) {
|
|
582
|
+
this.utteranceEnded = false;
|
|
583
|
+
}
|
|
571
584
|
if (transcriptionEvent.is_final || transcriptionEvent.speech_final || transcriptionEvent.from_finalize) {
|
|
572
585
|
const confidencePct = Math.round(transcriptOption.confidence * 100);
|
|
573
586
|
const text = `[transcription confidence=${confidencePct}%]: ${transcriptOption.transcript}`;
|
|
574
587
|
this.textBuffer += text + " ";
|
|
575
588
|
if (transcriptionEvent.speech_final) {
|
|
576
|
-
this.
|
|
577
|
-
this.textBuffer = "";
|
|
589
|
+
this.sendOutput();
|
|
578
590
|
}
|
|
579
591
|
}
|
|
580
592
|
}
|
|
593
|
+
handleUtteranceEnd() {
|
|
594
|
+
this.utteranceEnded = true;
|
|
595
|
+
this.sendOutput();
|
|
596
|
+
}
|
|
597
|
+
sendOutput() {
|
|
598
|
+
if (!this.utteranceEnded) {
|
|
599
|
+
return;
|
|
600
|
+
}
|
|
601
|
+
if (this.textBuffer.trim() === "") {
|
|
602
|
+
this.textBuffer = "[unintelligible]";
|
|
603
|
+
}
|
|
604
|
+
this.onOutput(this.textBuffer);
|
|
605
|
+
this.textBuffer = "";
|
|
606
|
+
this.utteranceEnded = false;
|
|
607
|
+
}
|
|
581
608
|
onOpen() {
|
|
582
609
|
console.log(`[Deepgram] Connected`);
|
|
583
610
|
this.keepAlive();
|
|
@@ -603,6 +630,7 @@ var MagmaFlowTextToSpeech = class {
|
|
|
603
630
|
}
|
|
604
631
|
};
|
|
605
632
|
var DeepgramTTS = class extends MagmaFlowTextToSpeech {
|
|
633
|
+
client;
|
|
606
634
|
constructor(args) {
|
|
607
635
|
super();
|
|
608
636
|
this.client = args.client ?? new DeepgramClient({ key: process.env.DEEPGRAM_API_KEY });
|
|
@@ -649,6 +677,10 @@ var ElevenVoice = /* @__PURE__ */ ((ElevenVoice2) => {
|
|
|
649
677
|
return ElevenVoice2;
|
|
650
678
|
})(ElevenVoice || {});
|
|
651
679
|
var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
|
|
680
|
+
client;
|
|
681
|
+
model;
|
|
682
|
+
voice;
|
|
683
|
+
config;
|
|
652
684
|
constructor(args) {
|
|
653
685
|
super();
|
|
654
686
|
this.client = args.client ?? new ElevenLabsClient({
|
|
@@ -664,8 +696,9 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
|
|
|
664
696
|
if (!text) {
|
|
665
697
|
return;
|
|
666
698
|
}
|
|
699
|
+
const textToSend = text.replaceAll(/([a-zA-Z])-([a-zA-Z])/g, "$1 - $2").replaceAll(/(-\s*[a-zA-z])\s+([a-zA-z]\s*-)/g, "$1 - $2");
|
|
667
700
|
this.client.textToSpeech.stream(this.voice, {
|
|
668
|
-
text,
|
|
701
|
+
text: textToSend,
|
|
669
702
|
outputFormat: "pcm_48000",
|
|
670
703
|
modelId: this.model,
|
|
671
704
|
...this.config
|
|
@@ -674,7 +707,7 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
|
|
|
674
707
|
this.onOutput(chunk, requestId);
|
|
675
708
|
}
|
|
676
709
|
this.onOutput(null, requestId);
|
|
677
|
-
console.log("[ElevenLabs] Finished:",
|
|
710
|
+
console.log("[ElevenLabs] Finished:", textToSend);
|
|
678
711
|
});
|
|
679
712
|
}
|
|
680
713
|
kill() {
|
|
@@ -683,6 +716,7 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
|
|
|
683
716
|
}
|
|
684
717
|
};
|
|
685
718
|
var HumeTTS = class extends MagmaFlowTextToSpeech {
|
|
719
|
+
client;
|
|
686
720
|
constructor(args) {
|
|
687
721
|
super();
|
|
688
722
|
this.client = args.client ?? new HumeClient({ apiKey: process.env.HUME_API_KEY });
|
|
@@ -717,6 +751,7 @@ var HumeTTS = class extends MagmaFlowTextToSpeech {
|
|
|
717
751
|
}
|
|
718
752
|
};
|
|
719
753
|
var WhisperTTS = class extends MagmaFlowTextToSpeech {
|
|
754
|
+
client;
|
|
720
755
|
constructor(args) {
|
|
721
756
|
super();
|
|
722
757
|
this.client = args.client ?? new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@pompeii-labs/audio",
|
|
3
|
-
"version": "0.0.5",
|
|
3
|
+
"version": "0.0.7",
|
|
4
4
|
"description": "The Audio SDK from Pompeii Labs",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"Pompeii",
|
|
@@ -20,15 +20,18 @@
|
|
|
20
20
|
],
|
|
21
21
|
"repository": "pompeii-labs/pompeii-audio",
|
|
22
22
|
"main": "dist/index.js",
|
|
23
|
+
"module": "dist/index.mjs",
|
|
23
24
|
"types": "dist/index.d.ts",
|
|
24
25
|
"exports": {
|
|
25
26
|
".": {
|
|
26
27
|
"types": "./dist/index.d.ts",
|
|
27
|
-
"
|
|
28
|
+
"import": "./dist/index.mjs",
|
|
29
|
+
"require": "./dist/index.js"
|
|
28
30
|
},
|
|
29
31
|
"./voice": {
|
|
30
32
|
"types": "./dist/voice.d.ts",
|
|
31
|
-
"
|
|
33
|
+
"import": "./dist/voice.mjs",
|
|
34
|
+
"require": "./dist/voice.js"
|
|
32
35
|
}
|
|
33
36
|
},
|
|
34
37
|
"scripts": {
|