npm - @pompeii-labs/audio - Versions diffs - 0.0.5 → 0.0.7 - Mend

@pompeii-labs/audio 0.0.5 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/dist/voice.d.mts CHANGED

@@ -23,7 +23,7 @@ declare abstract class MagmaFlowTextToSpeech {
 type MagmaFlowConfig = {
     pauseDurationMs?: number;
-    setenceChunkLength?: number;
+    sentenceChunkLength?: number;
 };
 type MagmaFlowArgs = {
     stt: MagmaFlowSpeechToText;
@@ -73,12 +73,15 @@ declare class DeepgramSTT extends MagmaFlowSpeechToText {
     private connection;
     private config;
     private textBuffer;
+    private utteranceEnded;
     constructor(args: DeepgramSTTArgs);
     private setup;
     input(audio: Buffer): void;
     flush(): void;
     kill(): void;
     private handleTranscriptionEvent;
+    private handleUtteranceEnd;
+    private sendOutput;
     private onOpen;
     private keepAlive;
 }

package/dist/voice.d.ts CHANGED

@@ -23,7 +23,7 @@ declare abstract class MagmaFlowTextToSpeech {
 type MagmaFlowConfig = {
     pauseDurationMs?: number;
-    setenceChunkLength?: number;
+    sentenceChunkLength?: number;
 };
 type MagmaFlowArgs = {
     stt: MagmaFlowSpeechToText;
@@ -73,12 +73,15 @@ declare class DeepgramSTT extends MagmaFlowSpeechToText {
     private connection;
     private config;
     private textBuffer;
+    private utteranceEnded;
     constructor(args: DeepgramSTTArgs);
     private setup;
     input(audio: Buffer): void;
     flush(): void;
     kill(): void;
     private handleTranscriptionEvent;
+    private handleUtteranceEnd;
+    private sendOutput;
     private onOpen;
     private keepAlive;
 }

package/dist/voice.js CHANGED

@@ -375,7 +375,7 @@ function splitTextIntoChunks(text, targetLength = 100) {
   const endOfSentencePunctuation = [".", "!", "?"];
   const sentences = [];
   for (let i = targetLength; i < text.length; i++) {
-    if (endOfSentencePunctuation.includes(text[i])) {
+    if (endOfSentencePunctuation.includes(text[i]) && (i === text.length - 1 || text[i + 1] === " ")) {
       sentences.push(text.slice(0, i + 1));
       text = text.slice(i + 1);
       i = targetLength;
@@ -387,16 +387,21 @@ function splitTextIntoChunks(text, targetLength = 100) {
 // src/voice/client.ts
 var uniformSampleRate = 48e3;
 var MagmaFlow = class {
+  stt;
+  tts;
+  inputFormat;
+  outputFormat;
+  onAudioOutput;
+  textBuffer = "";
+  textQueue = [];
+  generatingAudio = false;
+  currentRequestId = null;
+  audioBuffer = [];
+  config = {
+    pauseDurationMs: 500,
+    sentenceChunkLength: 50
+  };
   constructor(args) {
-    this.textBuffer = "";
-    this.textQueue = [];
-    this.generatingAudio = false;
-    this.currentRequestId = null;
-    this.audioBuffer = [];
-    this.config = {
-      pauseDurationMs: 500,
-      setenceChunkLength: 50
-    };
     this.stt = args.stt;
     this.tts = args.tts;
     this.inputFormat = args.inputFormat;
@@ -446,7 +451,7 @@ var MagmaFlow = class {
       return;
     }
     this.textBuffer += text;
-    const chunks = splitTextIntoChunks(this.textBuffer, this.config.setenceChunkLength ?? 50);
+    const chunks = splitTextIntoChunks(this.textBuffer, this.config.sentenceChunkLength ?? 50);
     for (const chunk of chunks) {
       this.textQueue.push(chunk);
       this.textBuffer = this.textBuffer.slice(chunk.length);
@@ -517,10 +522,13 @@ var DeepgramLanguage = /* @__PURE__ */ ((DeepgramLanguage2) => {
   return DeepgramLanguage2;
 })(DeepgramLanguage || {});
 var DeepgramSTT = class extends MagmaFlowSpeechToText {
+  client;
+  connection = null;
+  config;
+  textBuffer = "";
+  utteranceEnded = false;
   constructor(args) {
     super();
-    this.connection = null;
-    this.textBuffer = "";
     this.config = {
       model: args.model,
       vad_events: true,
@@ -528,6 +536,7 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
       encoding: "linear16",
       sample_rate: 48e3,
       channels: 1,
+      utterance_end_ms: 1500,
       ...args.config
     };
     this.client = args.client ?? new sdk.DeepgramClient({
@@ -552,6 +561,7 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
     );
     this.connection.on(sdk.LiveTranscriptionEvents.UtteranceEnd, (event) => {
       console.log(`[Deepgram] Utterance end: ${JSON.stringify(event)}`);
+      this.handleUtteranceEnd();
     });
   }
   input(audio) {
@@ -574,16 +584,33 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
       return;
     }
     this.onSpeechDetected();
+    if (transcriptionEvent.speech_final) {
+      this.utteranceEnded = false;
+    }
     if (transcriptionEvent.is_final || transcriptionEvent.speech_final || transcriptionEvent.from_finalize) {
       const confidencePct = Math.round(transcriptOption.confidence * 100);
       const text = `[transcription confidence=${confidencePct}%]: ${transcriptOption.transcript}`;
       this.textBuffer += text + " ";
       if (transcriptionEvent.speech_final) {
-        this.onOutput(this.textBuffer);
-        this.textBuffer = "";
+        this.sendOutput();
       }
     }
   }
+  handleUtteranceEnd() {
+    this.utteranceEnded = true;
+    this.sendOutput();
+  }
+  sendOutput() {
+    if (!this.utteranceEnded) {
+      return;
+    }
+    if (this.textBuffer.trim() === "") {
+      this.textBuffer = "[unintelligible]";
+    }
+    this.onOutput(this.textBuffer);
+    this.textBuffer = "";
+    this.utteranceEnded = false;
+  }
   onOpen() {
     console.log(`[Deepgram] Connected`);
     this.keepAlive();
@@ -609,6 +636,7 @@ var MagmaFlowTextToSpeech = class {
   }
 };
 var DeepgramTTS = class extends MagmaFlowTextToSpeech {
+  client;
   constructor(args) {
     super();
     this.client = args.client ?? new sdk.DeepgramClient({ key: process.env.DEEPGRAM_API_KEY });
@@ -655,6 +683,10 @@ var ElevenVoice = /* @__PURE__ */ ((ElevenVoice2) => {
   return ElevenVoice2;
 })(ElevenVoice || {});
 var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
+  client;
+  model;
+  voice;
+  config;
   constructor(args) {
     super();
     this.client = args.client ?? new elevenlabsJs.ElevenLabsClient({
@@ -670,8 +702,9 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
     if (!text) {
       return;
     }
+    const textToSend = text.replaceAll(/([a-zA-Z])-([a-zA-Z])/g, "$1 - $2").replaceAll(/(-\s*[a-zA-z])\s+([a-zA-z]\s*-)/g, "$1 - $2");
     this.client.textToSpeech.stream(this.voice, {
-      text,
+      text: textToSend,
       outputFormat: "pcm_48000",
       modelId: this.model,
       ...this.config
@@ -680,7 +713,7 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
         this.onOutput(chunk, requestId);
       }
       this.onOutput(null, requestId);
-      console.log("[ElevenLabs] Finished:", text);
+      console.log("[ElevenLabs] Finished:", textToSend);
     });
   }
   kill() {
@@ -689,6 +722,7 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
   }
 };
 var HumeTTS = class extends MagmaFlowTextToSpeech {
+  client;
   constructor(args) {
     super();
     this.client = args.client ?? new hume.HumeClient({ apiKey: process.env.HUME_API_KEY });
@@ -723,6 +757,7 @@ var HumeTTS = class extends MagmaFlowTextToSpeech {
   }
 };
 var WhisperTTS = class extends MagmaFlowTextToSpeech {
+  client;
   constructor(args) {
     super();
     this.client = args.client ?? new OpenAI__default.default({ apiKey: process.env.OPENAI_API_KEY });

package/dist/voice.mjs CHANGED

@@ -369,7 +369,7 @@ function splitTextIntoChunks(text, targetLength = 100) {
   const endOfSentencePunctuation = [".", "!", "?"];
   const sentences = [];
   for (let i = targetLength; i < text.length; i++) {
-    if (endOfSentencePunctuation.includes(text[i])) {
+    if (endOfSentencePunctuation.includes(text[i]) && (i === text.length - 1 || text[i + 1] === " ")) {
       sentences.push(text.slice(0, i + 1));
       text = text.slice(i + 1);
       i = targetLength;
@@ -381,16 +381,21 @@ function splitTextIntoChunks(text, targetLength = 100) {
 // src/voice/client.ts
 var uniformSampleRate = 48e3;
 var MagmaFlow = class {
+  stt;
+  tts;
+  inputFormat;
+  outputFormat;
+  onAudioOutput;
+  textBuffer = "";
+  textQueue = [];
+  generatingAudio = false;
+  currentRequestId = null;
+  audioBuffer = [];
+  config = {
+    pauseDurationMs: 500,
+    sentenceChunkLength: 50
+  };
   constructor(args) {
-    this.textBuffer = "";
-    this.textQueue = [];
-    this.generatingAudio = false;
-    this.currentRequestId = null;
-    this.audioBuffer = [];
-    this.config = {
-      pauseDurationMs: 500,
-      setenceChunkLength: 50
-    };
     this.stt = args.stt;
     this.tts = args.tts;
     this.inputFormat = args.inputFormat;
@@ -440,7 +445,7 @@ var MagmaFlow = class {
       return;
     }
     this.textBuffer += text;
-    const chunks = splitTextIntoChunks(this.textBuffer, this.config.setenceChunkLength ?? 50);
+    const chunks = splitTextIntoChunks(this.textBuffer, this.config.sentenceChunkLength ?? 50);
     for (const chunk of chunks) {
       this.textQueue.push(chunk);
       this.textBuffer = this.textBuffer.slice(chunk.length);
@@ -511,10 +516,13 @@ var DeepgramLanguage = /* @__PURE__ */ ((DeepgramLanguage2) => {
   return DeepgramLanguage2;
 })(DeepgramLanguage || {});
 var DeepgramSTT = class extends MagmaFlowSpeechToText {
+  client;
+  connection = null;
+  config;
+  textBuffer = "";
+  utteranceEnded = false;
   constructor(args) {
     super();
-    this.connection = null;
-    this.textBuffer = "";
     this.config = {
       model: args.model,
       vad_events: true,
@@ -522,6 +530,7 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
       encoding: "linear16",
       sample_rate: 48e3,
       channels: 1,
+      utterance_end_ms: 1500,
       ...args.config
     };
     this.client = args.client ?? new DeepgramClient({
@@ -546,6 +555,7 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
     );
     this.connection.on(LiveTranscriptionEvents.UtteranceEnd, (event) => {
       console.log(`[Deepgram] Utterance end: ${JSON.stringify(event)}`);
+      this.handleUtteranceEnd();
     });
   }
   input(audio) {
@@ -568,16 +578,33 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
       return;
     }
     this.onSpeechDetected();
+    if (transcriptionEvent.speech_final) {
+      this.utteranceEnded = false;
+    }
     if (transcriptionEvent.is_final || transcriptionEvent.speech_final || transcriptionEvent.from_finalize) {
       const confidencePct = Math.round(transcriptOption.confidence * 100);
       const text = `[transcription confidence=${confidencePct}%]: ${transcriptOption.transcript}`;
       this.textBuffer += text + " ";
       if (transcriptionEvent.speech_final) {
-        this.onOutput(this.textBuffer);
-        this.textBuffer = "";
+        this.sendOutput();
       }
     }
   }
+  handleUtteranceEnd() {
+    this.utteranceEnded = true;
+    this.sendOutput();
+  }
+  sendOutput() {
+    if (!this.utteranceEnded) {
+      return;
+    }
+    if (this.textBuffer.trim() === "") {
+      this.textBuffer = "[unintelligible]";
+    }
+    this.onOutput(this.textBuffer);
+    this.textBuffer = "";
+    this.utteranceEnded = false;
+  }
   onOpen() {
     console.log(`[Deepgram] Connected`);
     this.keepAlive();
@@ -603,6 +630,7 @@ var MagmaFlowTextToSpeech = class {
   }
 };
 var DeepgramTTS = class extends MagmaFlowTextToSpeech {
+  client;
   constructor(args) {
     super();
     this.client = args.client ?? new DeepgramClient({ key: process.env.DEEPGRAM_API_KEY });
@@ -649,6 +677,10 @@ var ElevenVoice = /* @__PURE__ */ ((ElevenVoice2) => {
   return ElevenVoice2;
 })(ElevenVoice || {});
 var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
+  client;
+  model;
+  voice;
+  config;
   constructor(args) {
     super();
     this.client = args.client ?? new ElevenLabsClient({
@@ -664,8 +696,9 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
     if (!text) {
       return;
     }
+    const textToSend = text.replaceAll(/([a-zA-Z])-([a-zA-Z])/g, "$1 - $2").replaceAll(/(-\s*[a-zA-z])\s+([a-zA-z]\s*-)/g, "$1 - $2");
     this.client.textToSpeech.stream(this.voice, {
-      text,
+      text: textToSend,
       outputFormat: "pcm_48000",
       modelId: this.model,
       ...this.config
@@ -674,7 +707,7 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
         this.onOutput(chunk, requestId);
       }
       this.onOutput(null, requestId);
-      console.log("[ElevenLabs] Finished:", text);
+      console.log("[ElevenLabs] Finished:", textToSend);
     });
   }
   kill() {
@@ -683,6 +716,7 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
   }
 };
 var HumeTTS = class extends MagmaFlowTextToSpeech {
+  client;
   constructor(args) {
     super();
     this.client = args.client ?? new HumeClient({ apiKey: process.env.HUME_API_KEY });
@@ -717,6 +751,7 @@ var HumeTTS = class extends MagmaFlowTextToSpeech {
   }
 };
 var WhisperTTS = class extends MagmaFlowTextToSpeech {
+  client;
   constructor(args) {
     super();
     this.client = args.client ?? new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@pompeii-labs/audio",
-  "version": "0.0.5",
+  "version": "0.0.7",
   "description": "The Audio SDK from Pompeii Labs",
   "keywords": [
     "Pompeii",
@@ -20,15 +20,18 @@
   ],
   "repository": "pompeii-labs/pompeii-audio",
   "main": "dist/index.js",
+  "module": "dist/index.mjs",
   "types": "dist/index.d.ts",
   "exports": {
     ".": {
       "types": "./dist/index.d.ts",
-      "default": "./dist/index.js"
+      "import": "./dist/index.mjs",
+      "require": "./dist/index.js"
     },
     "./voice": {
       "types": "./dist/voice.d.ts",
-      "default": "./dist/voice.js"
+      "import": "./dist/voice.mjs",
+      "require": "./dist/voice.js"
     }
   },
   "scripts": {