@pompeii-labs/audio 0.0.5 → 0.0.7

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/dist/voice.d.mts CHANGED
@@ -23,7 +23,7 @@ declare abstract class MagmaFlowTextToSpeech {
23
23
 
24
24
  type MagmaFlowConfig = {
25
25
  pauseDurationMs?: number;
26
- setenceChunkLength?: number;
26
+ sentenceChunkLength?: number;
27
27
  };
28
28
  type MagmaFlowArgs = {
29
29
  stt: MagmaFlowSpeechToText;
@@ -73,12 +73,15 @@ declare class DeepgramSTT extends MagmaFlowSpeechToText {
73
73
  private connection;
74
74
  private config;
75
75
  private textBuffer;
76
+ private utteranceEnded;
76
77
  constructor(args: DeepgramSTTArgs);
77
78
  private setup;
78
79
  input(audio: Buffer): void;
79
80
  flush(): void;
80
81
  kill(): void;
81
82
  private handleTranscriptionEvent;
83
+ private handleUtteranceEnd;
84
+ private sendOutput;
82
85
  private onOpen;
83
86
  private keepAlive;
84
87
  }
package/dist/voice.d.ts CHANGED
@@ -23,7 +23,7 @@ declare abstract class MagmaFlowTextToSpeech {
23
23
 
24
24
  type MagmaFlowConfig = {
25
25
  pauseDurationMs?: number;
26
- setenceChunkLength?: number;
26
+ sentenceChunkLength?: number;
27
27
  };
28
28
  type MagmaFlowArgs = {
29
29
  stt: MagmaFlowSpeechToText;
@@ -73,12 +73,15 @@ declare class DeepgramSTT extends MagmaFlowSpeechToText {
73
73
  private connection;
74
74
  private config;
75
75
  private textBuffer;
76
+ private utteranceEnded;
76
77
  constructor(args: DeepgramSTTArgs);
77
78
  private setup;
78
79
  input(audio: Buffer): void;
79
80
  flush(): void;
80
81
  kill(): void;
81
82
  private handleTranscriptionEvent;
83
+ private handleUtteranceEnd;
84
+ private sendOutput;
82
85
  private onOpen;
83
86
  private keepAlive;
84
87
  }
package/dist/voice.js CHANGED
@@ -375,7 +375,7 @@ function splitTextIntoChunks(text, targetLength = 100) {
375
375
  const endOfSentencePunctuation = [".", "!", "?"];
376
376
  const sentences = [];
377
377
  for (let i = targetLength; i < text.length; i++) {
378
- if (endOfSentencePunctuation.includes(text[i])) {
378
+ if (endOfSentencePunctuation.includes(text[i]) && (i === text.length - 1 || text[i + 1] === " ")) {
379
379
  sentences.push(text.slice(0, i + 1));
380
380
  text = text.slice(i + 1);
381
381
  i = targetLength;
@@ -387,16 +387,21 @@ function splitTextIntoChunks(text, targetLength = 100) {
387
387
  // src/voice/client.ts
388
388
  var uniformSampleRate = 48e3;
389
389
  var MagmaFlow = class {
390
+ stt;
391
+ tts;
392
+ inputFormat;
393
+ outputFormat;
394
+ onAudioOutput;
395
+ textBuffer = "";
396
+ textQueue = [];
397
+ generatingAudio = false;
398
+ currentRequestId = null;
399
+ audioBuffer = [];
400
+ config = {
401
+ pauseDurationMs: 500,
402
+ sentenceChunkLength: 50
403
+ };
390
404
  constructor(args) {
391
- this.textBuffer = "";
392
- this.textQueue = [];
393
- this.generatingAudio = false;
394
- this.currentRequestId = null;
395
- this.audioBuffer = [];
396
- this.config = {
397
- pauseDurationMs: 500,
398
- setenceChunkLength: 50
399
- };
400
405
  this.stt = args.stt;
401
406
  this.tts = args.tts;
402
407
  this.inputFormat = args.inputFormat;
@@ -446,7 +451,7 @@ var MagmaFlow = class {
446
451
  return;
447
452
  }
448
453
  this.textBuffer += text;
449
- const chunks = splitTextIntoChunks(this.textBuffer, this.config.setenceChunkLength ?? 50);
454
+ const chunks = splitTextIntoChunks(this.textBuffer, this.config.sentenceChunkLength ?? 50);
450
455
  for (const chunk of chunks) {
451
456
  this.textQueue.push(chunk);
452
457
  this.textBuffer = this.textBuffer.slice(chunk.length);
@@ -517,10 +522,13 @@ var DeepgramLanguage = /* @__PURE__ */ ((DeepgramLanguage2) => {
517
522
  return DeepgramLanguage2;
518
523
  })(DeepgramLanguage || {});
519
524
  var DeepgramSTT = class extends MagmaFlowSpeechToText {
525
+ client;
526
+ connection = null;
527
+ config;
528
+ textBuffer = "";
529
+ utteranceEnded = false;
520
530
  constructor(args) {
521
531
  super();
522
- this.connection = null;
523
- this.textBuffer = "";
524
532
  this.config = {
525
533
  model: args.model,
526
534
  vad_events: true,
@@ -528,6 +536,7 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
528
536
  encoding: "linear16",
529
537
  sample_rate: 48e3,
530
538
  channels: 1,
539
+ utterance_end_ms: 1500,
531
540
  ...args.config
532
541
  };
533
542
  this.client = args.client ?? new sdk.DeepgramClient({
@@ -552,6 +561,7 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
552
561
  );
553
562
  this.connection.on(sdk.LiveTranscriptionEvents.UtteranceEnd, (event) => {
554
563
  console.log(`[Deepgram] Utterance end: ${JSON.stringify(event)}`);
564
+ this.handleUtteranceEnd();
555
565
  });
556
566
  }
557
567
  input(audio) {
@@ -574,16 +584,33 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
574
584
  return;
575
585
  }
576
586
  this.onSpeechDetected();
587
+ if (transcriptionEvent.speech_final) {
588
+ this.utteranceEnded = false;
589
+ }
577
590
  if (transcriptionEvent.is_final || transcriptionEvent.speech_final || transcriptionEvent.from_finalize) {
578
591
  const confidencePct = Math.round(transcriptOption.confidence * 100);
579
592
  const text = `[transcription confidence=${confidencePct}%]: ${transcriptOption.transcript}`;
580
593
  this.textBuffer += text + " ";
581
594
  if (transcriptionEvent.speech_final) {
582
- this.onOutput(this.textBuffer);
583
- this.textBuffer = "";
595
+ this.sendOutput();
584
596
  }
585
597
  }
586
598
  }
599
+ handleUtteranceEnd() {
600
+ this.utteranceEnded = true;
601
+ this.sendOutput();
602
+ }
603
+ sendOutput() {
604
+ if (!this.utteranceEnded) {
605
+ return;
606
+ }
607
+ if (this.textBuffer.trim() === "") {
608
+ this.textBuffer = "[unintelligible]";
609
+ }
610
+ this.onOutput(this.textBuffer);
611
+ this.textBuffer = "";
612
+ this.utteranceEnded = false;
613
+ }
587
614
  onOpen() {
588
615
  console.log(`[Deepgram] Connected`);
589
616
  this.keepAlive();
@@ -609,6 +636,7 @@ var MagmaFlowTextToSpeech = class {
609
636
  }
610
637
  };
611
638
  var DeepgramTTS = class extends MagmaFlowTextToSpeech {
639
+ client;
612
640
  constructor(args) {
613
641
  super();
614
642
  this.client = args.client ?? new sdk.DeepgramClient({ key: process.env.DEEPGRAM_API_KEY });
@@ -655,6 +683,10 @@ var ElevenVoice = /* @__PURE__ */ ((ElevenVoice2) => {
655
683
  return ElevenVoice2;
656
684
  })(ElevenVoice || {});
657
685
  var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
686
+ client;
687
+ model;
688
+ voice;
689
+ config;
658
690
  constructor(args) {
659
691
  super();
660
692
  this.client = args.client ?? new elevenlabsJs.ElevenLabsClient({
@@ -670,8 +702,9 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
670
702
  if (!text) {
671
703
  return;
672
704
  }
705
+ const textToSend = text.replaceAll(/([a-zA-Z])-([a-zA-Z])/g, "$1 - $2").replaceAll(/(-\s*[a-zA-z])\s+([a-zA-z]\s*-)/g, "$1 - $2");
673
706
  this.client.textToSpeech.stream(this.voice, {
674
- text,
707
+ text: textToSend,
675
708
  outputFormat: "pcm_48000",
676
709
  modelId: this.model,
677
710
  ...this.config
@@ -680,7 +713,7 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
680
713
  this.onOutput(chunk, requestId);
681
714
  }
682
715
  this.onOutput(null, requestId);
683
- console.log("[ElevenLabs] Finished:", text);
716
+ console.log("[ElevenLabs] Finished:", textToSend);
684
717
  });
685
718
  }
686
719
  kill() {
@@ -689,6 +722,7 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
689
722
  }
690
723
  };
691
724
  var HumeTTS = class extends MagmaFlowTextToSpeech {
725
+ client;
692
726
  constructor(args) {
693
727
  super();
694
728
  this.client = args.client ?? new hume.HumeClient({ apiKey: process.env.HUME_API_KEY });
@@ -723,6 +757,7 @@ var HumeTTS = class extends MagmaFlowTextToSpeech {
723
757
  }
724
758
  };
725
759
  var WhisperTTS = class extends MagmaFlowTextToSpeech {
760
+ client;
726
761
  constructor(args) {
727
762
  super();
728
763
  this.client = args.client ?? new OpenAI__default.default({ apiKey: process.env.OPENAI_API_KEY });
package/dist/voice.mjs CHANGED
@@ -369,7 +369,7 @@ function splitTextIntoChunks(text, targetLength = 100) {
369
369
  const endOfSentencePunctuation = [".", "!", "?"];
370
370
  const sentences = [];
371
371
  for (let i = targetLength; i < text.length; i++) {
372
- if (endOfSentencePunctuation.includes(text[i])) {
372
+ if (endOfSentencePunctuation.includes(text[i]) && (i === text.length - 1 || text[i + 1] === " ")) {
373
373
  sentences.push(text.slice(0, i + 1));
374
374
  text = text.slice(i + 1);
375
375
  i = targetLength;
@@ -381,16 +381,21 @@ function splitTextIntoChunks(text, targetLength = 100) {
381
381
  // src/voice/client.ts
382
382
  var uniformSampleRate = 48e3;
383
383
  var MagmaFlow = class {
384
+ stt;
385
+ tts;
386
+ inputFormat;
387
+ outputFormat;
388
+ onAudioOutput;
389
+ textBuffer = "";
390
+ textQueue = [];
391
+ generatingAudio = false;
392
+ currentRequestId = null;
393
+ audioBuffer = [];
394
+ config = {
395
+ pauseDurationMs: 500,
396
+ sentenceChunkLength: 50
397
+ };
384
398
  constructor(args) {
385
- this.textBuffer = "";
386
- this.textQueue = [];
387
- this.generatingAudio = false;
388
- this.currentRequestId = null;
389
- this.audioBuffer = [];
390
- this.config = {
391
- pauseDurationMs: 500,
392
- setenceChunkLength: 50
393
- };
394
399
  this.stt = args.stt;
395
400
  this.tts = args.tts;
396
401
  this.inputFormat = args.inputFormat;
@@ -440,7 +445,7 @@ var MagmaFlow = class {
440
445
  return;
441
446
  }
442
447
  this.textBuffer += text;
443
- const chunks = splitTextIntoChunks(this.textBuffer, this.config.setenceChunkLength ?? 50);
448
+ const chunks = splitTextIntoChunks(this.textBuffer, this.config.sentenceChunkLength ?? 50);
444
449
  for (const chunk of chunks) {
445
450
  this.textQueue.push(chunk);
446
451
  this.textBuffer = this.textBuffer.slice(chunk.length);
@@ -511,10 +516,13 @@ var DeepgramLanguage = /* @__PURE__ */ ((DeepgramLanguage2) => {
511
516
  return DeepgramLanguage2;
512
517
  })(DeepgramLanguage || {});
513
518
  var DeepgramSTT = class extends MagmaFlowSpeechToText {
519
+ client;
520
+ connection = null;
521
+ config;
522
+ textBuffer = "";
523
+ utteranceEnded = false;
514
524
  constructor(args) {
515
525
  super();
516
- this.connection = null;
517
- this.textBuffer = "";
518
526
  this.config = {
519
527
  model: args.model,
520
528
  vad_events: true,
@@ -522,6 +530,7 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
522
530
  encoding: "linear16",
523
531
  sample_rate: 48e3,
524
532
  channels: 1,
533
+ utterance_end_ms: 1500,
525
534
  ...args.config
526
535
  };
527
536
  this.client = args.client ?? new DeepgramClient({
@@ -546,6 +555,7 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
546
555
  );
547
556
  this.connection.on(LiveTranscriptionEvents.UtteranceEnd, (event) => {
548
557
  console.log(`[Deepgram] Utterance end: ${JSON.stringify(event)}`);
558
+ this.handleUtteranceEnd();
549
559
  });
550
560
  }
551
561
  input(audio) {
@@ -568,16 +578,33 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
568
578
  return;
569
579
  }
570
580
  this.onSpeechDetected();
581
+ if (transcriptionEvent.speech_final) {
582
+ this.utteranceEnded = false;
583
+ }
571
584
  if (transcriptionEvent.is_final || transcriptionEvent.speech_final || transcriptionEvent.from_finalize) {
572
585
  const confidencePct = Math.round(transcriptOption.confidence * 100);
573
586
  const text = `[transcription confidence=${confidencePct}%]: ${transcriptOption.transcript}`;
574
587
  this.textBuffer += text + " ";
575
588
  if (transcriptionEvent.speech_final) {
576
- this.onOutput(this.textBuffer);
577
- this.textBuffer = "";
589
+ this.sendOutput();
578
590
  }
579
591
  }
580
592
  }
593
+ handleUtteranceEnd() {
594
+ this.utteranceEnded = true;
595
+ this.sendOutput();
596
+ }
597
+ sendOutput() {
598
+ if (!this.utteranceEnded) {
599
+ return;
600
+ }
601
+ if (this.textBuffer.trim() === "") {
602
+ this.textBuffer = "[unintelligible]";
603
+ }
604
+ this.onOutput(this.textBuffer);
605
+ this.textBuffer = "";
606
+ this.utteranceEnded = false;
607
+ }
581
608
  onOpen() {
582
609
  console.log(`[Deepgram] Connected`);
583
610
  this.keepAlive();
@@ -603,6 +630,7 @@ var MagmaFlowTextToSpeech = class {
603
630
  }
604
631
  };
605
632
  var DeepgramTTS = class extends MagmaFlowTextToSpeech {
633
+ client;
606
634
  constructor(args) {
607
635
  super();
608
636
  this.client = args.client ?? new DeepgramClient({ key: process.env.DEEPGRAM_API_KEY });
@@ -649,6 +677,10 @@ var ElevenVoice = /* @__PURE__ */ ((ElevenVoice2) => {
649
677
  return ElevenVoice2;
650
678
  })(ElevenVoice || {});
651
679
  var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
680
+ client;
681
+ model;
682
+ voice;
683
+ config;
652
684
  constructor(args) {
653
685
  super();
654
686
  this.client = args.client ?? new ElevenLabsClient({
@@ -664,8 +696,9 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
664
696
  if (!text) {
665
697
  return;
666
698
  }
699
+ const textToSend = text.replaceAll(/([a-zA-Z])-([a-zA-Z])/g, "$1 - $2").replaceAll(/(-\s*[a-zA-z])\s+([a-zA-z]\s*-)/g, "$1 - $2");
667
700
  this.client.textToSpeech.stream(this.voice, {
668
- text,
701
+ text: textToSend,
669
702
  outputFormat: "pcm_48000",
670
703
  modelId: this.model,
671
704
  ...this.config
@@ -674,7 +707,7 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
674
707
  this.onOutput(chunk, requestId);
675
708
  }
676
709
  this.onOutput(null, requestId);
677
- console.log("[ElevenLabs] Finished:", text);
710
+ console.log("[ElevenLabs] Finished:", textToSend);
678
711
  });
679
712
  }
680
713
  kill() {
@@ -683,6 +716,7 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
683
716
  }
684
717
  };
685
718
  var HumeTTS = class extends MagmaFlowTextToSpeech {
719
+ client;
686
720
  constructor(args) {
687
721
  super();
688
722
  this.client = args.client ?? new HumeClient({ apiKey: process.env.HUME_API_KEY });
@@ -717,6 +751,7 @@ var HumeTTS = class extends MagmaFlowTextToSpeech {
717
751
  }
718
752
  };
719
753
  var WhisperTTS = class extends MagmaFlowTextToSpeech {
754
+ client;
720
755
  constructor(args) {
721
756
  super();
722
757
  this.client = args.client ?? new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pompeii-labs/audio",
3
- "version": "0.0.5",
3
+ "version": "0.0.7",
4
4
  "description": "The Audio SDK from Pompeii Labs",
5
5
  "keywords": [
6
6
  "Pompeii",
@@ -20,15 +20,18 @@
20
20
  ],
21
21
  "repository": "pompeii-labs/pompeii-audio",
22
22
  "main": "dist/index.js",
23
+ "module": "dist/index.mjs",
23
24
  "types": "dist/index.d.ts",
24
25
  "exports": {
25
26
  ".": {
26
27
  "types": "./dist/index.d.ts",
27
- "default": "./dist/index.js"
28
+ "import": "./dist/index.mjs",
29
+ "require": "./dist/index.js"
28
30
  },
29
31
  "./voice": {
30
32
  "types": "./dist/voice.d.ts",
31
- "default": "./dist/voice.js"
33
+ "import": "./dist/voice.mjs",
34
+ "require": "./dist/voice.js"
32
35
  }
33
36
  },
34
37
  "scripts": {