@elizaos/plugin-elevenlabs 1.5.12 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,19 +1,22 @@
1
1
  import { type Plugin } from "@elizaos/core";
2
2
  /**
3
3
  * Represents the ElevenLabs plugin.
4
- * This plugin provides text-to-speech functionality using the ElevenLabs API.
4
+ * This plugin provides text-to-speech and speech-to-text functionality using the ElevenLabs API.
5
5
  *
6
6
  * Features:
7
- * - High-quality voice synthesis
7
+ * - High-quality voice synthesis (TTS)
8
+ * - High-accuracy speech transcription (STT) with Scribe v1 model
8
9
  * - Support for multiple voice models and settings
9
- * - Automatic WAV header prepending for PCM formats
10
10
  * - Configurable voice parameters (stability, similarity, style)
11
11
  * - Stream-based audio output for efficient memory usage
12
+ * - Speaker diarization (up to 32 speakers)
13
+ * - Multi-language support (99 languages for STT)
14
+ * - Audio event detection (laughter, applause, etc.)
12
15
  *
13
16
  * Required environment variables:
14
17
  * - ELEVENLABS_API_KEY: Your ElevenLabs API key
15
18
  *
16
- * Optional environment variables:
19
+ * Optional TTS environment variables:
17
20
  * - ELEVENLABS_VOICE_ID: Voice ID to use (default: EXAVITQu4vr4xnSDxMaL)
18
21
  * - ELEVENLABS_MODEL_ID: Model to use (default: eleven_monolingual_v1)
19
22
  * - ELEVENLABS_VOICE_STABILITY: Voice stability 0-1 (default: 0.5)
@@ -21,7 +24,15 @@ import { type Plugin } from "@elizaos/core";
21
24
  * - ELEVENLABS_VOICE_STYLE: Voice style 0-1 (default: 0)
22
25
  * - ELEVENLABS_VOICE_USE_SPEAKER_BOOST: Enable speaker boost (default: true)
23
26
  * - ELEVENLABS_OPTIMIZE_STREAMING_LATENCY: Latency optimization 0-4 (default: 0)
24
- * - ELEVENLABS_OUTPUT_FORMAT: Output format (default: pcm_16000)
27
+ * - ELEVENLABS_OUTPUT_FORMAT: Output format (default: mp3_44100_128)
28
+ *
29
+ * Optional STT environment variables:
30
+ * - ELEVENLABS_STT_MODEL_ID: STT model ID (default: scribe_v1)
31
+ * - ELEVENLABS_STT_LANGUAGE_CODE: Language code for transcription (auto-detect if not set)
32
+ * - ELEVENLABS_STT_TIMESTAMPS_GRANULARITY: Timestamp level (default: word)
33
+ * - ELEVENLABS_STT_DIARIZE: Enable speaker diarization (default: false)
34
+ * - ELEVENLABS_STT_NUM_SPEAKERS: Expected number of speakers (1-32)
35
+ * - ELEVENLABS_STT_TAG_AUDIO_EVENTS: Tag audio events (default: false)
25
36
  *
26
37
  * @type {Plugin}
27
38
  */
@@ -99811,7 +99811,20 @@ function getVoiceSettings(runtime) {
99811
99811
  outputFormat: getSetting(runtime, "ELEVENLABS_OUTPUT_FORMAT", "mp3_44100_128"),
99812
99812
  similarity: getSetting(runtime, "ELEVENLABS_VOICE_SIMILARITY_BOOST", "0.75"),
99813
99813
  style: getSetting(runtime, "ELEVENLABS_VOICE_STYLE", "0"),
99814
- speakerBoost: parseBooleanFromText(getSetting(runtime, "ELEVENLABS_VOICE_USE_SPEAKER_BOOST", "true") + "")
99814
+ speakerBoost: parseBooleanFromText(`${getSetting(runtime, "ELEVENLABS_VOICE_USE_SPEAKER_BOOST", "true")}`)
99815
+ };
99816
+ }
99817
/**
 * Collects the speech-to-text (STT) configuration for a runtime.
 *
 * Values are read through `getSetting`; the fallbacks passed here are
 * `scribe_v1` (model), `word` (timestamp granularity) and `false` for both
 * diarization and audio-event tagging. The language code and the speaker
 * count have no fallback and are reported as `undefined` when unset.
 *
 * @param {object} runtime - Runtime handed to `getSetting` / `getApiKey`.
 * @returns {object} Settings with the keys `apiKey`, `modelId`,
 *   `languageCode`, `timestampsGranularity`, `diarize`, `numSpeakers` and
 *   `tagAudioEvents`. `apiKey` is an empty string when `getApiKey` returns
 *   a falsy value.
 */
function getTranscriptionSettings(runtime) {
  const languageCode = getSetting(runtime, "ELEVENLABS_STT_LANGUAGE_CODE");
  const numSpeakersStr = getSetting(runtime, "ELEVENLABS_STT_NUM_SPEAKERS");

  // Number("abc") is NaN. Every comparison against NaN is false, so a NaN
  // speaker count would pass any `< 1 || > 32` range check and then be
  // dropped silently by truthiness checks downstream. Surface it here.
  let numSpeakers = numSpeakersStr ? Number(numSpeakersStr) : undefined;
  if (Number.isNaN(numSpeakers)) {
    logger.warn(`Ignoring non-numeric ELEVENLABS_STT_NUM_SPEAKERS: ${numSpeakersStr}`);
    numSpeakers = undefined;
  }

  return {
    apiKey: getApiKey(runtime) || "",
    modelId: getSetting(runtime, "ELEVENLABS_STT_MODEL_ID", "scribe_v1"),
    languageCode: languageCode || undefined,
    timestampsGranularity: getSetting(runtime, "ELEVENLABS_STT_TIMESTAMPS_GRANULARITY", "word"),
    diarize: parseBooleanFromText(`${getSetting(runtime, "ELEVENLABS_STT_DIARIZE", "false")}`),
    numSpeakers,
    tagAudioEvents: parseBooleanFromText(`${getSetting(runtime, "ELEVENLABS_STT_TAG_AUDIO_EVENTS", "false")}`)
  };
}
99817
99830
  async function fetchSpeech(runtime, params) {
@@ -99844,15 +99857,62 @@ async function fetchSpeech(runtime, params) {
99844
99857
  throw error instanceof Error ? error : new Error(msg);
99845
99858
  }
99846
99859
  }
99860
/**
 * Transcribes audio through the ElevenLabs speech-to-text client.
 *
 * Optional request fields are only attached when they carry information:
 * the language code when set, the timestamp granularity unless it is
 * `"none"`, the speaker count only together with diarization, and audio-event
 * tagging only when enabled.
 *
 * @param {object} runtime - Runtime used to resolve the base URL and API key.
 * @param {object} params - Transcription parameters.
 * @param {*} params.audioFile - Audio payload forwarded to the client.
 * @param {string} params.modelId - STT model identifier.
 * @param {string} [params.languageCode] - Language code; omitted when falsy.
 * @param {string} [params.timestampsGranularity] - Omitted when `"none"`.
 * @param {boolean} [params.diarize] - Enables speaker diarization.
 * @param {number} [params.numSpeakers] - Only sent when `diarize` is truthy.
 * @param {boolean} [params.tagAudioEvents] - Enables audio-event tagging.
 * @returns {Promise<string>} The transcript text, or `""` when the response
 *   contains none of the recognized text fields.
 * @throws {Error} When the response is empty or the client call fails; the
 *   failure is logged before being rethrown.
 */
async function fetchTranscription(runtime, params) {
  try {
    const baseUrl = getBaseURL(runtime);
    // In a browser without a key, a placeholder key is used instead.
    const apiKey = getApiKey(runtime) ?? (isBrowser() ? "sk-proxy" : undefined);
    const client = new import_elevenlabs_js.ElevenLabsClient({
      apiKey,
      baseUrl
    });

    // NOTE(review): the ElevenLabs SDK documentation names the upload field
    // `file`; confirm that `audio` is accepted by the bundled SDK version.
    const requestParams = {
      modelId: params.modelId,
      audio: params.audioFile
    };
    if (params.languageCode) {
      requestParams.languageCode = params.languageCode;
    }
    if (params.timestampsGranularity !== "none") {
      requestParams.timestampsGranularity = params.timestampsGranularity;
    }
    if (params.diarize) {
      requestParams.diarize = true;
      if (params.numSpeakers) {
        requestParams.numSpeakers = params.numSpeakers;
      }
    }
    if (params.tagAudioEvents) {
      requestParams.tagAudioEvents = true;
    }

    const response = await client.speechToText.convert(requestParams);
    if (!response) {
      throw new Error("Empty response from ElevenLabs STT API");
    }

    let transcript = "";
    if ("transcript" in response && response.transcript) {
      transcript = response.transcript.text || "";
    } else if ("transcripts" in response && response.transcripts) {
      // One entry per transcript; joined with newlines.
      transcript = response.transcripts.map((t) => t.text || "").join("\n");
    } else if (typeof response.text === "string") {
      // The documented single-channel response carries the text at the top
      // level. Without this branch such a response yielded an empty string.
      transcript = response.text;
    }
    return transcript;
  } catch (error) {
    const msg = error instanceof Error ? error.message : String(error);
    logger.error(`ElevenLabs fetchTranscription error: ${msg}`);
    throw error instanceof Error ? error : new Error(msg);
  }
}
99847
99907
  var elevenLabsPlugin = {
99848
99908
  name: "elevenLabs",
99849
- description: "High-quality text-to-speech synthesis using ElevenLabs API with support for multiple voices and languages",
99909
+ description: "High-quality text-to-speech synthesis and speech-to-text transcription using ElevenLabs API with support for multiple voices, languages, and speaker diarization",
99850
99910
  models: {
99851
99911
  [ModelType.TEXT_TO_SPEECH]: async (runtime, input) => {
99852
99912
  const options = typeof input === "string" ? { text: input } : input;
99853
99913
  const settings = getVoiceSettings(runtime);
99854
99914
  const resolvedModel = options.model || settings.model;
99855
- const resolvedVoiceId = options.voice || settings.voiceId;
99915
+ const resolvedVoiceId = options.voiceId || settings.voiceId;
99856
99916
  const outputFormat = options.format ? options.format === "mp3" ? "mp3_44100_128" : options.format : settings.outputFormat;
99857
99917
  logger.log(`[ElevenLabs] Using TEXT_TO_SPEECH model: ${resolvedModel}`);
99858
99918
  try {
@@ -99873,6 +99933,46 @@ var elevenLabsPlugin = {
99873
99933
  logger.error(`ElevenLabs model error: ${msg}`);
99874
99934
  throw error instanceof Error ? error : new Error(msg);
99875
99935
  }
99936
+ },
99937
+ [ModelType.TRANSCRIPTION]: async (runtime, input) => {
99938
+ const settings = getTranscriptionSettings(runtime);
99939
+ logger.log(`[ElevenLabs] Using TRANSCRIPTION model: ${settings.modelId}`);
99940
+ try {
99941
+ let audioFile;
99942
+ if (typeof input === "string") {
99943
+ const response = await fetch(input);
99944
+ if (!response.ok) {
99945
+ throw new Error(`Failed to fetch audio from URL: ${input}`);
99946
+ }
99947
+ const arrayBuffer = await response.arrayBuffer();
99948
+ audioFile = Buffer.from(arrayBuffer);
99949
+ } else if (Buffer.isBuffer(input)) {
99950
+ audioFile = input;
99951
+ } else if (typeof input === "object" && "audioUrl" in input) {
99952
+ const response = await fetch(input.audioUrl);
99953
+ if (!response.ok) {
99954
+ throw new Error(`Failed to fetch audio from URL: ${input.audioUrl}`);
99955
+ }
99956
+ const arrayBuffer = await response.arrayBuffer();
99957
+ audioFile = Buffer.from(arrayBuffer);
99958
+ } else {
99959
+ throw new Error("Invalid input type for TRANSCRIPTION model");
99960
+ }
99961
+ const transcript = await fetchTranscription(runtime, {
99962
+ audioFile,
99963
+ modelId: settings.modelId,
99964
+ languageCode: settings.languageCode,
99965
+ timestampsGranularity: settings.timestampsGranularity,
99966
+ diarize: settings.diarize,
99967
+ numSpeakers: settings.numSpeakers,
99968
+ tagAudioEvents: settings.tagAudioEvents
99969
+ });
99970
+ return transcript;
99971
+ } catch (error) {
99972
+ const msg = error instanceof Error ? error.message : String(error);
99973
+ logger.error(`ElevenLabs transcription error: ${msg}`);
99974
+ throw error instanceof Error ? error : new Error(msg);
99975
+ }
99876
99976
  }
99877
99977
  },
99878
99978
  tests: [
@@ -99895,12 +99995,12 @@ var elevenLabsPlugin = {
99895
99995
  if (!settings.voiceId) {
99896
99996
  throw new Error("Missing voice ID configuration");
99897
99997
  }
99898
- const stability = parseFloat(settings.stability);
99899
- if (isNaN(stability) || stability < 0 || stability > 1) {
99998
+ const stability = Number.parseFloat(settings.stability);
99999
+ if (Number.isNaN(stability) || stability < 0 || stability > 1) {
99900
100000
  throw new Error("Voice stability must be between 0 and 1");
99901
100001
  }
99902
- const similarity = parseFloat(settings.similarity);
99903
- if (isNaN(similarity) || similarity < 0 || similarity > 1) {
100002
+ const similarity = Number.parseFloat(settings.similarity);
100003
+ if (Number.isNaN(similarity) || similarity < 0 || similarity > 1) {
99904
100004
  throw new Error("Voice similarity boost must be between 0 and 1");
99905
100005
  }
99906
100006
  logger.success("Voice settings validated successfully");
@@ -99984,7 +100084,7 @@ var elevenLabsPlugin = {
99984
100084
  for (const format of pcmFormats) {
99985
100085
  if (format.startsWith("pcm_")) {
99986
100086
  const sampleRate = Number.parseInt(format.slice(4));
99987
- if (isNaN(sampleRate) || sampleRate <= 0) {
100087
+ if (Number.isNaN(sampleRate) || sampleRate <= 0) {
99988
100088
  throw new Error(`Invalid PCM format: ${format}`);
99989
100089
  }
99990
100090
  }
@@ -99993,6 +100093,57 @@ var elevenLabsPlugin = {
99993
100093
  }
99994
100094
  }
99995
100095
  ]
100096
+ },
100097
+ {
100098
+ name: "test eleven labs STT",
100099
+ tests: [
100100
+ {
100101
+ name: "STT settings validation",
100102
+ fn: async (runtime) => {
100103
+ const settings = getTranscriptionSettings(runtime);
100104
+ if (!settings.modelId) {
100105
+ throw new Error("Missing STT model ID configuration");
100106
+ }
100107
+ const validGranularities = ["none", "word", "character"];
100108
+ if (!validGranularities.includes(settings.timestampsGranularity)) {
100109
+ throw new Error(`Invalid timestamps granularity: ${settings.timestampsGranularity}`);
100110
+ }
100111
+ if (settings.numSpeakers !== undefined && (settings.numSpeakers < 1 || settings.numSpeakers > 32)) {
100112
+ throw new Error("Number of speakers must be between 1 and 32");
100113
+ }
100114
+ logger.success("STT settings validated successfully");
100115
+ }
100116
+ },
100117
+ {
100118
+ name: "STT configuration defaults",
100119
+ fn: async (runtime) => {
100120
+ const settings = getTranscriptionSettings(runtime);
100121
+ if (settings.modelId !== "scribe_v1") {
100122
+ logger.warn(`Using non-default STT model: ${settings.modelId}`);
100123
+ }
100124
+ if (settings.timestampsGranularity !== "word") {
100125
+ logger.warn(`Using non-default timestamps granularity: ${settings.timestampsGranularity}`);
100126
+ }
100127
+ logger.success("STT configuration defaults checked");
100128
+ }
100129
+ },
100130
+ {
100131
+ name: "STT input handling validation",
100132
+ fn: async (runtime) => {
100133
+ const testCases = [
100134
+ { type: "string URL", valid: true },
100135
+ { type: "Buffer", valid: true },
100136
+ { type: "object with audioUrl", valid: true }
100137
+ ];
100138
+ for (const testCase of testCases) {
100139
+ if (!testCase.valid) {
100140
+ throw new Error(`Invalid test case should not be valid: ${testCase.type}`);
100141
+ }
100142
+ }
100143
+ logger.success("STT input handling validation passed");
100144
+ }
100145
+ }
100146
+ ]
99996
100147
  }
99997
100148
  ]
99998
100149
  };
@@ -100002,4 +100153,4 @@ export {
100002
100153
  src_default as default
100003
100154
  };
100004
100155
 
100005
- //# debugId=E9F69B580490474864756E2164756E21
100156
+ //# debugId=F4CD7795D57767C464756E2164756E21