npm - @pompeii-labs/audio - Versions diffs - 0.1.5 → 0.2.0 - Mend

@pompeii-labs/audio 0.1.5 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/dist/voice.d.mts CHANGED

@@ -1,7 +1,5 @@
 import { A as AudioFormat } from './index-o4B-ThOL.mjs';
 import { DeepgramClient, LiveSchema } from '@deepgram/sdk';
-import { ElevenLabsClient } from '@elevenlabs/elevenlabs-js';
-import { StreamTextToSpeechRequest } from '@elevenlabs/elevenlabs-js/api/index.js';
 import { HumeClient } from 'hume';
 import OpenAI from 'openai';
@@ -58,6 +56,7 @@ declare class MagmaFlow {
     private generatingAudio;
     private currentRequestId;
     private audioBuffer;
+    private lastChunk;
     private config;
     constructor(args: MagmaFlowArgs);
     inputAudio(audio: Buffer): void;
@@ -120,22 +119,44 @@ declare class DeepgramTTS extends MagmaFlowTextToSpeech {
     reset(): void;
 }
-declare enum ElevenVoice {
+declare enum ElevenLabsVoice {
     chris = "iP95p4xoKVk53GoZ742B",
     josh = "TxGEqnHWrfWFTfGW9XjX",
     rachel = "21m00Tcm4TlvDq8ikWAM",
     laura = "FGY2WhTYpPnrIDTdsKH5",
     felicity = "aTbnroHRGIomiKpqAQR8"
 }
-type ElevenLabsConfig = Omit<StreamTextToSpeechRequest, 'outputFormat' | 'text' | 'modelId'>;
+type StreamSpeechConfig = {
+    text: string;
+    model_id?: string;
+    language_code?: string;
+    voice_settings?: {
+        stability?: number;
+        use_speaker_boost?: boolean;
+        similarity_boost?: number;
+        style?: number;
+        speed?: number;
+    };
+    pronunciation_dictionary_locators?: {
+        pronunciation_dictionary_id: string;
+        version_id?: string;
+    }[];
+    seed?: number;
+    next_text?: string;
+    previous_request_ids?: string[];
+    next_request_ids?: string[];
+    apply_text_normalization?: 'auto' | 'on' | 'off';
+    apply_language_text_normalization?: boolean;
+};
+type ElevenLabsConfig = Omit<StreamSpeechConfig, 'text' | 'model_id'>;
 type ElevenLabsTTSArgs = {
-    client?: ElevenLabsClient;
     model: string;
-    voice: ElevenVoice | string;
+    voice: ElevenLabsVoice | string;
     config?: ElevenLabsConfig;
+    apiKey?: string;
 };
 declare class ElevenLabsTTS extends MagmaFlowTextToSpeech {
-    private client;
+    private apiKey;
     private model;
     private voice;
     private config;
@@ -172,4 +193,4 @@ declare class WhisperTTS extends MagmaFlowTextToSpeech {
 declare function splitTextIntoChunks(text: string, targetLength?: number): string[];
-export { DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenVoice, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
+export { DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };

package/dist/voice.d.ts CHANGED

@@ -1,7 +1,5 @@
 import { A as AudioFormat } from './index-o4B-ThOL.js';
 import { DeepgramClient, LiveSchema } from '@deepgram/sdk';
-import { ElevenLabsClient } from '@elevenlabs/elevenlabs-js';
-import { StreamTextToSpeechRequest } from '@elevenlabs/elevenlabs-js/api/index.js';
 import { HumeClient } from 'hume';
 import OpenAI from 'openai';
@@ -58,6 +56,7 @@ declare class MagmaFlow {
     private generatingAudio;
     private currentRequestId;
     private audioBuffer;
+    private lastChunk;
     private config;
     constructor(args: MagmaFlowArgs);
     inputAudio(audio: Buffer): void;
@@ -120,22 +119,44 @@ declare class DeepgramTTS extends MagmaFlowTextToSpeech {
     reset(): void;
 }
-declare enum ElevenVoice {
+declare enum ElevenLabsVoice {
     chris = "iP95p4xoKVk53GoZ742B",
     josh = "TxGEqnHWrfWFTfGW9XjX",
     rachel = "21m00Tcm4TlvDq8ikWAM",
     laura = "FGY2WhTYpPnrIDTdsKH5",
     felicity = "aTbnroHRGIomiKpqAQR8"
 }
-type ElevenLabsConfig = Omit<StreamTextToSpeechRequest, 'outputFormat' | 'text' | 'modelId'>;
+type StreamSpeechConfig = {
+    text: string;
+    model_id?: string;
+    language_code?: string;
+    voice_settings?: {
+        stability?: number;
+        use_speaker_boost?: boolean;
+        similarity_boost?: number;
+        style?: number;
+        speed?: number;
+    };
+    pronunciation_dictionary_locators?: {
+        pronunciation_dictionary_id: string;
+        version_id?: string;
+    }[];
+    seed?: number;
+    next_text?: string;
+    previous_request_ids?: string[];
+    next_request_ids?: string[];
+    apply_text_normalization?: 'auto' | 'on' | 'off';
+    apply_language_text_normalization?: boolean;
+};
+type ElevenLabsConfig = Omit<StreamSpeechConfig, 'text' | 'model_id'>;
 type ElevenLabsTTSArgs = {
-    client?: ElevenLabsClient;
     model: string;
-    voice: ElevenVoice | string;
+    voice: ElevenLabsVoice | string;
     config?: ElevenLabsConfig;
+    apiKey?: string;
 };
 declare class ElevenLabsTTS extends MagmaFlowTextToSpeech {
-    private client;
+    private apiKey;
     private model;
     private voice;
     private config;
@@ -172,4 +193,4 @@ declare class WhisperTTS extends MagmaFlowTextToSpeech {
 declare function splitTextIntoChunks(text: string, targetLength?: number): string[];
-export { DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenVoice, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
+export { DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };

package/dist/voice.js CHANGED

@@ -1,7 +1,6 @@
 'use strict';
 var sdk = require('@deepgram/sdk');
-var elevenlabsJs = require('@elevenlabs/elevenlabs-js');
 var hume = require('hume');
 var OpenAI = require('openai');
@@ -429,6 +428,7 @@ var MagmaFlow = class {
   generatingAudio = false;
   currentRequestId = null;
   audioBuffer = [];
+  lastChunk = null;
   config = {
     pauseDurationMs: 500,
     sentenceChunkLength: 50
@@ -446,9 +446,8 @@ var MagmaFlow = class {
         return;
       }
       if (!audio) {
-        const lastChunk = this.audioBuffer[this.audioBuffer.length - 1];
-        if (lastChunk) {
-          const lastChunkSamples = bufferToInt16Array(lastChunk);
+        if (this.lastChunk) {
+          const lastChunkSamples = bufferToInt16Array(this.lastChunk);
           const lastSampleValue = lastChunkSamples[lastChunkSamples.length - 1];
           this.audioBuffer.push(
             Buffer.from(
@@ -462,10 +461,15 @@ var MagmaFlow = class {
         }
         this.sendAudio();
         this.generatingAudio = false;
+        this.lastChunk = null;
         this.generateAudio();
         return;
       }
       this.audioBuffer.push(audio);
+      this.lastChunk = audio;
+      if (this.audioBuffer.reduce((acc, curr) => acc + curr.length, 0) % (2 * this.outputFormat.channels) === 0) {
+        this.sendAudio();
+      }
     };
     this.stt.onOutput = args.onTranscription;
     this.stt.onSpeechDetected = args.onSpeechDetected;
@@ -505,13 +509,13 @@ var MagmaFlow = class {
   sendAudio() {
     if (this.audioBuffer.length === 0) return;
     const concatenatedBuffer = Buffer.concat(this.audioBuffer);
+    this.audioBuffer = [];
     const resampledPCM = resamplePcm(
       bufferToInt16Array(concatenatedBuffer),
       uniformSampleRate,
       this.outputFormat.sampleRate
     );
     const encodedAudio = encodePcm(resampledPCM, this.outputFormat.encoding);
-    this.audioBuffer = [];
     try {
       this.onAudioOutput(encodedAudio);
     } catch (error) {
@@ -642,7 +646,7 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
     }
     const text = this.turnBuffer.map((turn) => turn.text).join(" ");
     let turns = void 0;
-    if (this.turnBuffer.every((turn) => !!turn.speaker)) {
+    if (this.turnBuffer.every((turn) => turn.speaker !== void 0 && turn.speaker !== null)) {
       turns = this.turnBuffer.reduce((acc, turn) => {
         if (acc.at(-1)?.speaker === turn.speaker) {
           acc.at(-1).text += turn.text;
@@ -760,24 +764,24 @@ var DeepgramTTS = class extends MagmaFlowTextToSpeech {
   reset() {
   }
 };
-var ElevenVoice = /* @__PURE__ */ ((ElevenVoice2) => {
-  ElevenVoice2["chris"] = "iP95p4xoKVk53GoZ742B";
-  ElevenVoice2["josh"] = "TxGEqnHWrfWFTfGW9XjX";
-  ElevenVoice2["rachel"] = "21m00Tcm4TlvDq8ikWAM";
-  ElevenVoice2["laura"] = "FGY2WhTYpPnrIDTdsKH5";
-  ElevenVoice2["felicity"] = "aTbnroHRGIomiKpqAQR8";
-  return ElevenVoice2;
-})(ElevenVoice || {});
+// src/voice/textToSpeech/elevenlabs.ts
+var ElevenLabsVoice = /* @__PURE__ */ ((ElevenLabsVoice2) => {
+  ElevenLabsVoice2["chris"] = "iP95p4xoKVk53GoZ742B";
+  ElevenLabsVoice2["josh"] = "TxGEqnHWrfWFTfGW9XjX";
+  ElevenLabsVoice2["rachel"] = "21m00Tcm4TlvDq8ikWAM";
+  ElevenLabsVoice2["laura"] = "FGY2WhTYpPnrIDTdsKH5";
+  ElevenLabsVoice2["felicity"] = "aTbnroHRGIomiKpqAQR8";
+  return ElevenLabsVoice2;
+})(ElevenLabsVoice || {});
 var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
-  client;
+  apiKey;
   model;
   voice;
   config;
   constructor(args) {
     super();
-    this.client = args.client ?? new elevenlabsJs.ElevenLabsClient({
-      apiKey: process.env.ELEVENLABS_API_KEY
-    });
+    this.apiKey = args.apiKey ?? process.env.ELEVENLABS_API_KEY;
     this.model = args.model;
     this.voice = args.voice;
     this.config = args.config ?? {};
@@ -789,14 +793,28 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
       return;
     }
     const textToSend = text.replaceAll(/([A-Z])-([A-Z])/g, "$1 - $2").replaceAll(/([0-9])-([0-9])/g, "$1 - $2").replaceAll(/(-\s*[A-Z])\s+([A-Z]\s*-)/g, "$1 - $2").replaceAll(/(-\s*[0-9])\s+([0-9]\s*-)/g, "$1 - $2");
-    this.client.textToSpeech.stream(this.voice, {
-      text: textToSend,
-      outputFormat: "pcm_48000",
-      modelId: this.model,
-      ...this.config
-    }).then(async (stream) => {
-      for await (const chunk of stream) {
-        this.onOutput(chunk, requestId);
+    fetch(
+      `https://api.elevenlabs.io/v1/text-to-speech/${this.voice}/stream?output_format=pcm_48000`,
+      {
+        method: "POST",
+        headers: {
+          "Content-Type": "application/json",
+          "xi-api-key": this.apiKey
+        },
+        body: JSON.stringify({
+          text: textToSend,
+          model_id: this.model,
+          ...this.config
+        })
+      }
+    ).then(async (response) => {
+      const reader = response.body?.getReader();
+      if (!reader) return;
+      new TextDecoder();
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) break;
+        this.onOutput(Buffer.from(value), requestId);
       }
       this.onOutput(null, requestId);
       console.log("[ElevenLabs] Finished:", textToSend);
@@ -882,7 +900,7 @@ exports.DeepgramModel = DeepgramModel;
 exports.DeepgramSTT = DeepgramSTT;
 exports.DeepgramTTS = DeepgramTTS;
 exports.ElevenLabsTTS = ElevenLabsTTS;
-exports.ElevenVoice = ElevenVoice;
+exports.ElevenLabsVoice = ElevenLabsVoice;
 exports.HumeTTS = HumeTTS;
 exports.MagmaFlow = MagmaFlow;
 exports.MagmaFlowSpeechToText = MagmaFlowSpeechToText;

package/dist/voice.mjs CHANGED

@@ -1,5 +1,4 @@
 import { DeepgramClient, LiveTranscriptionEvents } from '@deepgram/sdk';
-import { ElevenLabsClient } from '@elevenlabs/elevenlabs-js';
 import { HumeClient } from 'hume';
 import OpenAI from 'openai';
@@ -423,6 +422,7 @@ var MagmaFlow = class {
   generatingAudio = false;
   currentRequestId = null;
   audioBuffer = [];
+  lastChunk = null;
   config = {
     pauseDurationMs: 500,
     sentenceChunkLength: 50
@@ -440,9 +440,8 @@ var MagmaFlow = class {
         return;
       }
       if (!audio) {
-        const lastChunk = this.audioBuffer[this.audioBuffer.length - 1];
-        if (lastChunk) {
-          const lastChunkSamples = bufferToInt16Array(lastChunk);
+        if (this.lastChunk) {
+          const lastChunkSamples = bufferToInt16Array(this.lastChunk);
           const lastSampleValue = lastChunkSamples[lastChunkSamples.length - 1];
           this.audioBuffer.push(
             Buffer.from(
@@ -456,10 +455,15 @@ var MagmaFlow = class {
         }
         this.sendAudio();
         this.generatingAudio = false;
+        this.lastChunk = null;
         this.generateAudio();
         return;
       }
       this.audioBuffer.push(audio);
+      this.lastChunk = audio;
+      if (this.audioBuffer.reduce((acc, curr) => acc + curr.length, 0) % (2 * this.outputFormat.channels) === 0) {
+        this.sendAudio();
+      }
     };
     this.stt.onOutput = args.onTranscription;
     this.stt.onSpeechDetected = args.onSpeechDetected;
@@ -499,13 +503,13 @@ var MagmaFlow = class {
   sendAudio() {
     if (this.audioBuffer.length === 0) return;
     const concatenatedBuffer = Buffer.concat(this.audioBuffer);
+    this.audioBuffer = [];
     const resampledPCM = resamplePcm(
       bufferToInt16Array(concatenatedBuffer),
       uniformSampleRate,
       this.outputFormat.sampleRate
     );
     const encodedAudio = encodePcm(resampledPCM, this.outputFormat.encoding);
-    this.audioBuffer = [];
     try {
       this.onAudioOutput(encodedAudio);
     } catch (error) {
@@ -636,7 +640,7 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
     }
     const text = this.turnBuffer.map((turn) => turn.text).join(" ");
     let turns = void 0;
-    if (this.turnBuffer.every((turn) => !!turn.speaker)) {
+    if (this.turnBuffer.every((turn) => turn.speaker !== void 0 && turn.speaker !== null)) {
       turns = this.turnBuffer.reduce((acc, turn) => {
         if (acc.at(-1)?.speaker === turn.speaker) {
           acc.at(-1).text += turn.text;
@@ -754,24 +758,24 @@ var DeepgramTTS = class extends MagmaFlowTextToSpeech {
   reset() {
   }
 };
-var ElevenVoice = /* @__PURE__ */ ((ElevenVoice2) => {
-  ElevenVoice2["chris"] = "iP95p4xoKVk53GoZ742B";
-  ElevenVoice2["josh"] = "TxGEqnHWrfWFTfGW9XjX";
-  ElevenVoice2["rachel"] = "21m00Tcm4TlvDq8ikWAM";
-  ElevenVoice2["laura"] = "FGY2WhTYpPnrIDTdsKH5";
-  ElevenVoice2["felicity"] = "aTbnroHRGIomiKpqAQR8";
-  return ElevenVoice2;
-})(ElevenVoice || {});
+// src/voice/textToSpeech/elevenlabs.ts
+var ElevenLabsVoice = /* @__PURE__ */ ((ElevenLabsVoice2) => {
+  ElevenLabsVoice2["chris"] = "iP95p4xoKVk53GoZ742B";
+  ElevenLabsVoice2["josh"] = "TxGEqnHWrfWFTfGW9XjX";
+  ElevenLabsVoice2["rachel"] = "21m00Tcm4TlvDq8ikWAM";
+  ElevenLabsVoice2["laura"] = "FGY2WhTYpPnrIDTdsKH5";
+  ElevenLabsVoice2["felicity"] = "aTbnroHRGIomiKpqAQR8";
+  return ElevenLabsVoice2;
+})(ElevenLabsVoice || {});
 var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
-  client;
+  apiKey;
   model;
   voice;
   config;
   constructor(args) {
     super();
-    this.client = args.client ?? new ElevenLabsClient({
-      apiKey: process.env.ELEVENLABS_API_KEY
-    });
+    this.apiKey = args.apiKey ?? process.env.ELEVENLABS_API_KEY;
     this.model = args.model;
     this.voice = args.voice;
     this.config = args.config ?? {};
@@ -783,14 +787,28 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
       return;
     }
     const textToSend = text.replaceAll(/([A-Z])-([A-Z])/g, "$1 - $2").replaceAll(/([0-9])-([0-9])/g, "$1 - $2").replaceAll(/(-\s*[A-Z])\s+([A-Z]\s*-)/g, "$1 - $2").replaceAll(/(-\s*[0-9])\s+([0-9]\s*-)/g, "$1 - $2");
-    this.client.textToSpeech.stream(this.voice, {
-      text: textToSend,
-      outputFormat: "pcm_48000",
-      modelId: this.model,
-      ...this.config
-    }).then(async (stream) => {
-      for await (const chunk of stream) {
-        this.onOutput(chunk, requestId);
+    fetch(
+      `https://api.elevenlabs.io/v1/text-to-speech/${this.voice}/stream?output_format=pcm_48000`,
+      {
+        method: "POST",
+        headers: {
+          "Content-Type": "application/json",
+          "xi-api-key": this.apiKey
+        },
+        body: JSON.stringify({
+          text: textToSend,
+          model_id: this.model,
+          ...this.config
+        })
+      }
+    ).then(async (response) => {
+      const reader = response.body?.getReader();
+      if (!reader) return;
+      new TextDecoder();
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) break;
+        this.onOutput(Buffer.from(value), requestId);
       }
       this.onOutput(null, requestId);
       console.log("[ElevenLabs] Finished:", textToSend);
@@ -871,4 +889,4 @@ var WhisperTTS = class extends MagmaFlowTextToSpeech {
   }
 };
-export { DeepgramLanguage, DeepgramModel, DeepgramSTT, DeepgramTTS, ElevenLabsTTS, ElevenVoice, HumeTTS, MagmaFlow, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, WhisperTTS, splitTextIntoChunks };
+export { DeepgramLanguage, DeepgramModel, DeepgramSTT, DeepgramTTS, ElevenLabsTTS, ElevenLabsVoice, HumeTTS, MagmaFlow, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, WhisperTTS, splitTextIntoChunks };

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@pompeii-labs/audio",
-  "version": "0.1.5",
+  "version": "0.2.0",
   "description": "The Audio SDK from Pompeii Labs",
   "keywords": [
     "Pompeii",
@@ -42,7 +42,6 @@
   },
   "dependencies": {
     "@deepgram/sdk": "4.2.0",
-    "@elevenlabs/elevenlabs-js": "2.2.0",
     "hume": "0.11.1",
     "openai": "4.86.2"
   },