npm - @elizaos/plugin-elevenlabs - Versions diffs - 1.5.12 → 1.6.0 - Mend

@elizaos/plugin-elevenlabs 1.5.12 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/README.md +8 -1
package/dist/browser/index.browser.js +3 -98
package/dist/browser/index.browser.js.map +4 -2394
package/dist/cjs/index.node.cjs +161 -10
package/dist/cjs/index.node.js.map +3 -3
package/dist/index.d.ts +16 -5
package/dist/node/index.node.js +160 -9
package/dist/node/index.node.js.map +3 -3
package/package.json +41 -1

package/dist/index.d.ts CHANGED

@@ -1,19 +1,22 @@
 import { type Plugin } from "@elizaos/core";
 /**
  * Represents the ElevenLabs plugin.
- * This plugin provides text-to-speech functionality using the ElevenLabs API.
+ * This plugin provides text-to-speech and speech-to-text functionality using the ElevenLabs API.
  *
  * Features:
- * - High-quality voice synthesis
+ * - High-quality voice synthesis (TTS)
+ * - High-accuracy speech transcription (STT) with Scribe v1 model
  * - Support for multiple voice models and settings
- * - Automatic WAV header prepending for PCM formats
  * - Configurable voice parameters (stability, similarity, style)
  * - Stream-based audio output for efficient memory usage
+ * - Speaker diarization (up to 32 speakers)
+ * - Multi-language support (99 languages for STT)
+ * - Audio event detection (laughter, applause, etc.)
  *
  * Required environment variables:
  * - ELEVENLABS_API_KEY: Your ElevenLabs API key
  *
- * Optional environment variables:
+ * Optional TTS environment variables:
  * - ELEVENLABS_VOICE_ID: Voice ID to use (default: EXAVITQu4vr4xnSDxMaL)
  * - ELEVENLABS_MODEL_ID: Model to use (default: eleven_monolingual_v1)
  * - ELEVENLABS_VOICE_STABILITY: Voice stability 0-1 (default: 0.5)
@@ -21,7 +24,15 @@ import { type Plugin } from "@elizaos/core";
  * - ELEVENLABS_VOICE_STYLE: Voice style 0-1 (default: 0)
  * - ELEVENLABS_VOICE_USE_SPEAKER_BOOST: Enable speaker boost (default: true)
  * - ELEVENLABS_OPTIMIZE_STREAMING_LATENCY: Latency optimization 0-4 (default: 0)
- * - ELEVENLABS_OUTPUT_FORMAT: Output format (default: pcm_16000)
+ * - ELEVENLABS_OUTPUT_FORMAT: Output format (default: mp3_44100_128)
+ *
+ * Optional STT environment variables:
+ * - ELEVENLABS_STT_MODEL_ID: STT model ID (default: scribe_v1)
+ * - ELEVENLABS_STT_LANGUAGE_CODE: Language code for transcription (auto-detect if not set)
+ * - ELEVENLABS_STT_TIMESTAMPS_GRANULARITY: Timestamp level (default: word)
+ * - ELEVENLABS_STT_DIARIZE: Enable speaker diarization (default: false)
+ * - ELEVENLABS_STT_NUM_SPEAKERS: Expected number of speakers (1-32)
+ * - ELEVENLABS_STT_TAG_AUDIO_EVENTS: Tag audio events (default: false)
  *
  * @type {Plugin}
  */

package/dist/node/index.node.js CHANGED

@@ -99811,7 +99811,20 @@ function getVoiceSettings(runtime) {
     outputFormat: getSetting(runtime, "ELEVENLABS_OUTPUT_FORMAT", "mp3_44100_128"),
     similarity: getSetting(runtime, "ELEVENLABS_VOICE_SIMILARITY_BOOST", "0.75"),
     style: getSetting(runtime, "ELEVENLABS_VOICE_STYLE", "0"),
-    speakerBoost: parseBooleanFromText(getSetting(runtime, "ELEVENLABS_VOICE_USE_SPEAKER_BOOST", "true") + "")
+    speakerBoost: parseBooleanFromText(`${getSetting(runtime, "ELEVENLABS_VOICE_USE_SPEAKER_BOOST", "true")}`)
+  };
+}
+function getTranscriptionSettings(runtime) {
+  const languageCode = getSetting(runtime, "ELEVENLABS_STT_LANGUAGE_CODE");
+  const numSpeakersStr = getSetting(runtime, "ELEVENLABS_STT_NUM_SPEAKERS");
+  return {
+    apiKey: getApiKey(runtime) || "",
+    modelId: getSetting(runtime, "ELEVENLABS_STT_MODEL_ID", "scribe_v1"),
+    languageCode: languageCode || undefined,
+    timestampsGranularity: getSetting(runtime, "ELEVENLABS_STT_TIMESTAMPS_GRANULARITY", "word"),
+    diarize: parseBooleanFromText(`${getSetting(runtime, "ELEVENLABS_STT_DIARIZE", "false")}`),
+    numSpeakers: numSpeakersStr ? Number(numSpeakersStr) : undefined,
+    tagAudioEvents: parseBooleanFromText(`${getSetting(runtime, "ELEVENLABS_STT_TAG_AUDIO_EVENTS", "false")}`)
   };
 }
 async function fetchSpeech(runtime, params) {
@@ -99844,15 +99857,62 @@ async function fetchSpeech(runtime, params) {
     throw error instanceof Error ? error : new Error(msg);
   }
 }
+async function fetchTranscription(runtime, params) {
+  try {
+    const baseUrl = getBaseURL(runtime);
+    const apiKey = getApiKey(runtime) ?? (isBrowser() ? "sk-proxy" : undefined);
+    const client = new import_elevenlabs_js.ElevenLabsClient({
+      apiKey,
+      baseUrl
+    });
+    const requestParams = {
+      modelId: params.modelId,
+      audio: params.audioFile
+    };
+    if (params.languageCode) {
+      requestParams.languageCode = params.languageCode;
+    }
+    if (params.timestampsGranularity !== "none") {
+      requestParams.timestampsGranularity = params.timestampsGranularity;
+    }
+    if (params.diarize) {
+      requestParams.diarize = true;
+      if (params.numSpeakers) {
+        requestParams.numSpeakers = params.numSpeakers;
+      }
+    }
+    if (params.tagAudioEvents) {
+      requestParams.tagAudioEvents = true;
+    }
+    const response = await client.speechToText.convert(requestParams);
+    if (!response) {
+      throw new Error("Empty response from ElevenLabs STT API");
+    }
+    let transcript = "";
+    if ("transcript" in response && response.transcript) {
+      const transcriptObj = response.transcript;
+      transcript = transcriptObj.text || "";
+    } else if ("transcripts" in response && response.transcripts) {
+      const transcriptsArray = response.transcripts;
+      transcript = transcriptsArray.map((t) => t.text || "").join(`
+`);
+    }
+    return transcript;
+  } catch (error) {
+    const msg = error instanceof Error ? error.message : String(error);
+    logger.error(`ElevenLabs fetchTranscription error: ${msg}`);
+    throw error instanceof Error ? error : new Error(msg);
+  }
+}
 var elevenLabsPlugin = {
   name: "elevenLabs",
-  description: "High-quality text-to-speech synthesis using ElevenLabs API with support for multiple voices and languages",
+  description: "High-quality text-to-speech synthesis and speech-to-text transcription using ElevenLabs API with support for multiple voices, languages, and speaker diarization",
   models: {
     [ModelType.TEXT_TO_SPEECH]: async (runtime, input) => {
       const options = typeof input === "string" ? { text: input } : input;
       const settings = getVoiceSettings(runtime);
       const resolvedModel = options.model || settings.model;
-      const resolvedVoiceId = options.voice || settings.voiceId;
+      const resolvedVoiceId = options.voiceId || settings.voiceId;
       const outputFormat = options.format ? options.format === "mp3" ? "mp3_44100_128" : options.format : settings.outputFormat;
       logger.log(`[ElevenLabs] Using TEXT_TO_SPEECH model: ${resolvedModel}`);
       try {
@@ -99873,6 +99933,46 @@ var elevenLabsPlugin = {
         logger.error(`ElevenLabs model error: ${msg}`);
         throw error instanceof Error ? error : new Error(msg);
       }
+    },
+    [ModelType.TRANSCRIPTION]: async (runtime, input) => {
+      const settings = getTranscriptionSettings(runtime);
+      logger.log(`[ElevenLabs] Using TRANSCRIPTION model: ${settings.modelId}`);
+      try {
+        let audioFile;
+        if (typeof input === "string") {
+          const response = await fetch(input);
+          if (!response.ok) {
+            throw new Error(`Failed to fetch audio from URL: ${input}`);
+          }
+          const arrayBuffer = await response.arrayBuffer();
+          audioFile = Buffer.from(arrayBuffer);
+        } else if (Buffer.isBuffer(input)) {
+          audioFile = input;
+        } else if (typeof input === "object" && "audioUrl" in input) {
+          const response = await fetch(input.audioUrl);
+          if (!response.ok) {
+            throw new Error(`Failed to fetch audio from URL: ${input.audioUrl}`);
+          }
+          const arrayBuffer = await response.arrayBuffer();
+          audioFile = Buffer.from(arrayBuffer);
+        } else {
+          throw new Error("Invalid input type for TRANSCRIPTION model");
+        }
+        const transcript = await fetchTranscription(runtime, {
+          audioFile,
+          modelId: settings.modelId,
+          languageCode: settings.languageCode,
+          timestampsGranularity: settings.timestampsGranularity,
+          diarize: settings.diarize,
+          numSpeakers: settings.numSpeakers,
+          tagAudioEvents: settings.tagAudioEvents
+        });
+        return transcript;
+      } catch (error) {
+        const msg = error instanceof Error ? error.message : String(error);
+        logger.error(`ElevenLabs transcription error: ${msg}`);
+        throw error instanceof Error ? error : new Error(msg);
+      }
     }
   },
   tests: [
@@ -99895,12 +99995,12 @@ var elevenLabsPlugin = {
             if (!settings.voiceId) {
               throw new Error("Missing voice ID configuration");
             }
-            const stability = parseFloat(settings.stability);
-            if (isNaN(stability) || stability < 0 || stability > 1) {
+            const stability = Number.parseFloat(settings.stability);
+            if (Number.isNaN(stability) || stability < 0 || stability > 1) {
               throw new Error("Voice stability must be between 0 and 1");
             }
-            const similarity = parseFloat(settings.similarity);
-            if (isNaN(similarity) || similarity < 0 || similarity > 1) {
+            const similarity = Number.parseFloat(settings.similarity);
+            if (Number.isNaN(similarity) || similarity < 0 || similarity > 1) {
               throw new Error("Voice similarity boost must be between 0 and 1");
             }
             logger.success("Voice settings validated successfully");
@@ -99984,7 +100084,7 @@ var elevenLabsPlugin = {
             for (const format of pcmFormats) {
               if (format.startsWith("pcm_")) {
                 const sampleRate = Number.parseInt(format.slice(4));
-                if (isNaN(sampleRate) || sampleRate <= 0) {
+                if (Number.isNaN(sampleRate) || sampleRate <= 0) {
                   throw new Error(`Invalid PCM format: ${format}`);
                 }
               }
@@ -99993,6 +100093,57 @@ var elevenLabsPlugin = {
           }
         }
       ]
+    },
+    {
+      name: "test eleven labs STT",
+      tests: [
+        {
+          name: "STT settings validation",
+          fn: async (runtime) => {
+            const settings = getTranscriptionSettings(runtime);
+            if (!settings.modelId) {
+              throw new Error("Missing STT model ID configuration");
+            }
+            const validGranularities = ["none", "word", "character"];
+            if (!validGranularities.includes(settings.timestampsGranularity)) {
+              throw new Error(`Invalid timestamps granularity: ${settings.timestampsGranularity}`);
+            }
+            if (settings.numSpeakers !== undefined && (settings.numSpeakers < 1 || settings.numSpeakers > 32)) {
+              throw new Error("Number of speakers must be between 1 and 32");
+            }
+            logger.success("STT settings validated successfully");
+          }
+        },
+        {
+          name: "STT configuration defaults",
+          fn: async (runtime) => {
+            const settings = getTranscriptionSettings(runtime);
+            if (settings.modelId !== "scribe_v1") {
+              logger.warn(`Using non-default STT model: ${settings.modelId}`);
+            }
+            if (settings.timestampsGranularity !== "word") {
+              logger.warn(`Using non-default timestamps granularity: ${settings.timestampsGranularity}`);
+            }
+            logger.success("STT configuration defaults checked");
+          }
+        },
+        {
+          name: "STT input handling validation",
+          fn: async (runtime) => {
+            const testCases = [
+              { type: "string URL", valid: true },
+              { type: "Buffer", valid: true },
+              { type: "object with audioUrl", valid: true }
+            ];
+            for (const testCase of testCases) {
+              if (!testCase.valid) {
+                throw new Error(`Invalid test case should not be valid: ${testCase.type}`);
+              }
+            }
+            logger.success("STT input handling validation passed");
+          }
+        }
+      ]
     }
   ]
 };
@@ -100002,4 +100153,4 @@ export {
   src_default as default
 };
-//# debugId=E9F69B580490474864756E2164756E21
+//# debugId=F4CD7795D57767C464756E2164756E21