@elizaos/plugin-elevenlabs 1.5.12 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -1
- package/dist/browser/index.browser.js +3 -98
- package/dist/browser/index.browser.js.map +4 -2394
- package/dist/cjs/index.node.cjs +161 -10
- package/dist/cjs/index.node.js.map +3 -3
- package/dist/index.d.ts +16 -5
- package/dist/node/index.node.js +160 -9
- package/dist/node/index.node.js.map +3 -3
- package/package.json +41 -1
package/dist/index.d.ts
CHANGED
|
@@ -1,19 +1,22 @@
|
|
|
1
1
|
import { type Plugin } from "@elizaos/core";
|
|
2
2
|
/**
|
|
3
3
|
* Represents the ElevenLabs plugin.
|
|
4
|
-
* This plugin provides text-to-speech functionality using the ElevenLabs API.
|
|
4
|
+
* This plugin provides text-to-speech and speech-to-text functionality using the ElevenLabs API.
|
|
5
5
|
*
|
|
6
6
|
* Features:
|
|
7
|
-
* - High-quality voice synthesis
|
|
7
|
+
* - High-quality voice synthesis (TTS)
|
|
8
|
+
* - High-accuracy speech transcription (STT) with Scribe v1 model
|
|
8
9
|
* - Support for multiple voice models and settings
|
|
9
|
-
* - Automatic WAV header prepending for PCM formats
|
|
10
10
|
* - Configurable voice parameters (stability, similarity, style)
|
|
11
11
|
* - Stream-based audio output for efficient memory usage
|
|
12
|
+
* - Speaker diarization (up to 32 speakers)
|
|
13
|
+
* - Multi-language support (99 languages for STT)
|
|
14
|
+
* - Audio event detection (laughter, applause, etc.)
|
|
12
15
|
*
|
|
13
16
|
* Required environment variables:
|
|
14
17
|
* - ELEVENLABS_API_KEY: Your ElevenLabs API key
|
|
15
18
|
*
|
|
16
|
-
* Optional environment variables:
|
|
19
|
+
* Optional TTS environment variables:
|
|
17
20
|
* - ELEVENLABS_VOICE_ID: Voice ID to use (default: EXAVITQu4vr4xnSDxMaL)
|
|
18
21
|
* - ELEVENLABS_MODEL_ID: Model to use (default: eleven_monolingual_v1)
|
|
19
22
|
* - ELEVENLABS_VOICE_STABILITY: Voice stability 0-1 (default: 0.5)
|
|
@@ -21,7 +24,15 @@ import { type Plugin } from "@elizaos/core";
|
|
|
21
24
|
* - ELEVENLABS_VOICE_STYLE: Voice style 0-1 (default: 0)
|
|
22
25
|
* - ELEVENLABS_VOICE_USE_SPEAKER_BOOST: Enable speaker boost (default: true)
|
|
23
26
|
* - ELEVENLABS_OPTIMIZE_STREAMING_LATENCY: Latency optimization 0-4 (default: 0)
|
|
24
|
-
* - ELEVENLABS_OUTPUT_FORMAT: Output format (default:
|
|
27
|
+
* - ELEVENLABS_OUTPUT_FORMAT: Output format (default: mp3_44100_128)
|
|
28
|
+
*
|
|
29
|
+
* Optional STT environment variables:
|
|
30
|
+
* - ELEVENLABS_STT_MODEL_ID: STT model ID (default: scribe_v1)
|
|
31
|
+
* - ELEVENLABS_STT_LANGUAGE_CODE: Language code for transcription (auto-detect if not set)
|
|
32
|
+
* - ELEVENLABS_STT_TIMESTAMPS_GRANULARITY: Timestamp level (default: word)
|
|
33
|
+
* - ELEVENLABS_STT_DIARIZE: Enable speaker diarization (default: false)
|
|
34
|
+
* - ELEVENLABS_STT_NUM_SPEAKERS: Expected number of speakers (1-32)
|
|
35
|
+
* - ELEVENLABS_STT_TAG_AUDIO_EVENTS: Tag audio events (default: false)
|
|
25
36
|
*
|
|
26
37
|
* @type {Plugin}
|
|
27
38
|
*/
|
package/dist/node/index.node.js
CHANGED
|
@@ -99811,7 +99811,20 @@ function getVoiceSettings(runtime) {
|
|
|
99811
99811
|
outputFormat: getSetting(runtime, "ELEVENLABS_OUTPUT_FORMAT", "mp3_44100_128"),
|
|
99812
99812
|
similarity: getSetting(runtime, "ELEVENLABS_VOICE_SIMILARITY_BOOST", "0.75"),
|
|
99813
99813
|
style: getSetting(runtime, "ELEVENLABS_VOICE_STYLE", "0"),
|
|
99814
|
-
speakerBoost: parseBooleanFromText(getSetting(runtime, "ELEVENLABS_VOICE_USE_SPEAKER_BOOST", "true"))
|
|
99814
|
+
speakerBoost: parseBooleanFromText(`${getSetting(runtime, "ELEVENLABS_VOICE_USE_SPEAKER_BOOST", "true")}`)
|
|
99815
|
+
};
|
|
99816
|
+
}
|
|
99817
|
+
function getTranscriptionSettings(runtime) {
|
|
99818
|
+
const languageCode = getSetting(runtime, "ELEVENLABS_STT_LANGUAGE_CODE");
|
|
99819
|
+
const numSpeakersStr = getSetting(runtime, "ELEVENLABS_STT_NUM_SPEAKERS");
|
|
99820
|
+
return {
|
|
99821
|
+
apiKey: getApiKey(runtime) || "",
|
|
99822
|
+
modelId: getSetting(runtime, "ELEVENLABS_STT_MODEL_ID", "scribe_v1"),
|
|
99823
|
+
languageCode: languageCode || undefined,
|
|
99824
|
+
timestampsGranularity: getSetting(runtime, "ELEVENLABS_STT_TIMESTAMPS_GRANULARITY", "word"),
|
|
99825
|
+
diarize: parseBooleanFromText(`${getSetting(runtime, "ELEVENLABS_STT_DIARIZE", "false")}`),
|
|
99826
|
+
numSpeakers: numSpeakersStr ? Number(numSpeakersStr) : undefined,
|
|
99827
|
+
tagAudioEvents: parseBooleanFromText(`${getSetting(runtime, "ELEVENLABS_STT_TAG_AUDIO_EVENTS", "false")}`)
|
|
99815
99828
|
};
|
|
99816
99829
|
}
|
|
99817
99830
|
async function fetchSpeech(runtime, params) {
|
|
@@ -99844,15 +99857,62 @@ async function fetchSpeech(runtime, params) {
|
|
|
99844
99857
|
throw error instanceof Error ? error : new Error(msg);
|
|
99845
99858
|
}
|
|
99846
99859
|
}
|
|
99860
|
+
async function fetchTranscription(runtime, params) {
|
|
99861
|
+
try {
|
|
99862
|
+
const baseUrl = getBaseURL(runtime);
|
|
99863
|
+
const apiKey = getApiKey(runtime) ?? (isBrowser() ? "sk-proxy" : undefined);
|
|
99864
|
+
const client = new import_elevenlabs_js.ElevenLabsClient({
|
|
99865
|
+
apiKey,
|
|
99866
|
+
baseUrl
|
|
99867
|
+
});
|
|
99868
|
+
const requestParams = {
|
|
99869
|
+
modelId: params.modelId,
|
|
99870
|
+
audio: params.audioFile
|
|
99871
|
+
};
|
|
99872
|
+
if (params.languageCode) {
|
|
99873
|
+
requestParams.languageCode = params.languageCode;
|
|
99874
|
+
}
|
|
99875
|
+
if (params.timestampsGranularity !== "none") {
|
|
99876
|
+
requestParams.timestampsGranularity = params.timestampsGranularity;
|
|
99877
|
+
}
|
|
99878
|
+
if (params.diarize) {
|
|
99879
|
+
requestParams.diarize = true;
|
|
99880
|
+
if (params.numSpeakers) {
|
|
99881
|
+
requestParams.numSpeakers = params.numSpeakers;
|
|
99882
|
+
}
|
|
99883
|
+
}
|
|
99884
|
+
if (params.tagAudioEvents) {
|
|
99885
|
+
requestParams.tagAudioEvents = true;
|
|
99886
|
+
}
|
|
99887
|
+
const response = await client.speechToText.convert(requestParams);
|
|
99888
|
+
if (!response) {
|
|
99889
|
+
throw new Error("Empty response from ElevenLabs STT API");
|
|
99890
|
+
}
|
|
99891
|
+
let transcript = "";
|
|
99892
|
+
if ("transcript" in response && response.transcript) {
|
|
99893
|
+
const transcriptObj = response.transcript;
|
|
99894
|
+
transcript = transcriptObj.text || "";
|
|
99895
|
+
} else if ("transcripts" in response && response.transcripts) {
|
|
99896
|
+
const transcriptsArray = response.transcripts;
|
|
99897
|
+
transcript = transcriptsArray.map((t) => t.text || "").join(`
|
|
99898
|
+
`);
|
|
99899
|
+
}
|
|
99900
|
+
return transcript;
|
|
99901
|
+
} catch (error) {
|
|
99902
|
+
const msg = error instanceof Error ? error.message : String(error);
|
|
99903
|
+
logger.error(`ElevenLabs fetchTranscription error: ${msg}`);
|
|
99904
|
+
throw error instanceof Error ? error : new Error(msg);
|
|
99905
|
+
}
|
|
99906
|
+
}
|
|
99847
99907
|
var elevenLabsPlugin = {
|
|
99848
99908
|
name: "elevenLabs",
|
|
99849
|
-
description: "High-quality text-to-speech synthesis using ElevenLabs API with support for multiple voices and
|
|
99909
|
+
description: "High-quality text-to-speech synthesis and speech-to-text transcription using ElevenLabs API with support for multiple voices, languages, and speaker diarization",
|
|
99850
99910
|
models: {
|
|
99851
99911
|
[ModelType.TEXT_TO_SPEECH]: async (runtime, input) => {
|
|
99852
99912
|
const options = typeof input === "string" ? { text: input } : input;
|
|
99853
99913
|
const settings = getVoiceSettings(runtime);
|
|
99854
99914
|
const resolvedModel = options.model || settings.model;
|
|
99855
|
-
const resolvedVoiceId = options.
|
|
99915
|
+
const resolvedVoiceId = options.voiceId || settings.voiceId;
|
|
99856
99916
|
const outputFormat = options.format ? options.format === "mp3" ? "mp3_44100_128" : options.format : settings.outputFormat;
|
|
99857
99917
|
logger.log(`[ElevenLabs] Using TEXT_TO_SPEECH model: ${resolvedModel}`);
|
|
99858
99918
|
try {
|
|
@@ -99873,6 +99933,46 @@ var elevenLabsPlugin = {
|
|
|
99873
99933
|
logger.error(`ElevenLabs model error: ${msg}`);
|
|
99874
99934
|
throw error instanceof Error ? error : new Error(msg);
|
|
99875
99935
|
}
|
|
99936
|
+
},
|
|
99937
|
+
[ModelType.TRANSCRIPTION]: async (runtime, input) => {
|
|
99938
|
+
const settings = getTranscriptionSettings(runtime);
|
|
99939
|
+
logger.log(`[ElevenLabs] Using TRANSCRIPTION model: ${settings.modelId}`);
|
|
99940
|
+
try {
|
|
99941
|
+
let audioFile;
|
|
99942
|
+
if (typeof input === "string") {
|
|
99943
|
+
const response = await fetch(input);
|
|
99944
|
+
if (!response.ok) {
|
|
99945
|
+
throw new Error(`Failed to fetch audio from URL: ${input}`);
|
|
99946
|
+
}
|
|
99947
|
+
const arrayBuffer = await response.arrayBuffer();
|
|
99948
|
+
audioFile = Buffer.from(arrayBuffer);
|
|
99949
|
+
} else if (Buffer.isBuffer(input)) {
|
|
99950
|
+
audioFile = input;
|
|
99951
|
+
} else if (typeof input === "object" && "audioUrl" in input) {
|
|
99952
|
+
const response = await fetch(input.audioUrl);
|
|
99953
|
+
if (!response.ok) {
|
|
99954
|
+
throw new Error(`Failed to fetch audio from URL: ${input.audioUrl}`);
|
|
99955
|
+
}
|
|
99956
|
+
const arrayBuffer = await response.arrayBuffer();
|
|
99957
|
+
audioFile = Buffer.from(arrayBuffer);
|
|
99958
|
+
} else {
|
|
99959
|
+
throw new Error("Invalid input type for TRANSCRIPTION model");
|
|
99960
|
+
}
|
|
99961
|
+
const transcript = await fetchTranscription(runtime, {
|
|
99962
|
+
audioFile,
|
|
99963
|
+
modelId: settings.modelId,
|
|
99964
|
+
languageCode: settings.languageCode,
|
|
99965
|
+
timestampsGranularity: settings.timestampsGranularity,
|
|
99966
|
+
diarize: settings.diarize,
|
|
99967
|
+
numSpeakers: settings.numSpeakers,
|
|
99968
|
+
tagAudioEvents: settings.tagAudioEvents
|
|
99969
|
+
});
|
|
99970
|
+
return transcript;
|
|
99971
|
+
} catch (error) {
|
|
99972
|
+
const msg = error instanceof Error ? error.message : String(error);
|
|
99973
|
+
logger.error(`ElevenLabs transcription error: ${msg}`);
|
|
99974
|
+
throw error instanceof Error ? error : new Error(msg);
|
|
99975
|
+
}
|
|
99876
99976
|
}
|
|
99877
99977
|
},
|
|
99878
99978
|
tests: [
|
|
@@ -99895,12 +99995,12 @@ var elevenLabsPlugin = {
|
|
|
99895
99995
|
if (!settings.voiceId) {
|
|
99896
99996
|
throw new Error("Missing voice ID configuration");
|
|
99897
99997
|
}
|
|
99898
|
-
const stability = parseFloat(settings.stability);
|
|
99899
|
-
if (isNaN(stability) || stability < 0 || stability > 1) {
|
|
99998
|
+
const stability = Number.parseFloat(settings.stability);
|
|
99999
|
+
if (Number.isNaN(stability) || stability < 0 || stability > 1) {
|
|
99900
100000
|
throw new Error("Voice stability must be between 0 and 1");
|
|
99901
100001
|
}
|
|
99902
|
-
const similarity = parseFloat(settings.similarity);
|
|
99903
|
-
if (isNaN(similarity) || similarity < 0 || similarity > 1) {
|
|
100002
|
+
const similarity = Number.parseFloat(settings.similarity);
|
|
100003
|
+
if (Number.isNaN(similarity) || similarity < 0 || similarity > 1) {
|
|
99904
100004
|
throw new Error("Voice similarity boost must be between 0 and 1");
|
|
99905
100005
|
}
|
|
99906
100006
|
logger.success("Voice settings validated successfully");
|
|
@@ -99984,7 +100084,7 @@ var elevenLabsPlugin = {
|
|
|
99984
100084
|
for (const format of pcmFormats) {
|
|
99985
100085
|
if (format.startsWith("pcm_")) {
|
|
99986
100086
|
const sampleRate = Number.parseInt(format.slice(4));
|
|
99987
|
-
if (isNaN(sampleRate) || sampleRate <= 0) {
|
|
100087
|
+
if (Number.isNaN(sampleRate) || sampleRate <= 0) {
|
|
99988
100088
|
throw new Error(`Invalid PCM format: ${format}`);
|
|
99989
100089
|
}
|
|
99990
100090
|
}
|
|
@@ -99993,6 +100093,57 @@ var elevenLabsPlugin = {
|
|
|
99993
100093
|
}
|
|
99994
100094
|
}
|
|
99995
100095
|
]
|
|
100096
|
+
},
|
|
100097
|
+
{
|
|
100098
|
+
name: "test eleven labs STT",
|
|
100099
|
+
tests: [
|
|
100100
|
+
{
|
|
100101
|
+
name: "STT settings validation",
|
|
100102
|
+
fn: async (runtime) => {
|
|
100103
|
+
const settings = getTranscriptionSettings(runtime);
|
|
100104
|
+
if (!settings.modelId) {
|
|
100105
|
+
throw new Error("Missing STT model ID configuration");
|
|
100106
|
+
}
|
|
100107
|
+
const validGranularities = ["none", "word", "character"];
|
|
100108
|
+
if (!validGranularities.includes(settings.timestampsGranularity)) {
|
|
100109
|
+
throw new Error(`Invalid timestamps granularity: ${settings.timestampsGranularity}`);
|
|
100110
|
+
}
|
|
100111
|
+
if (settings.numSpeakers !== undefined && (settings.numSpeakers < 1 || settings.numSpeakers > 32)) {
|
|
100112
|
+
throw new Error("Number of speakers must be between 1 and 32");
|
|
100113
|
+
}
|
|
100114
|
+
logger.success("STT settings validated successfully");
|
|
100115
|
+
}
|
|
100116
|
+
},
|
|
100117
|
+
{
|
|
100118
|
+
name: "STT configuration defaults",
|
|
100119
|
+
fn: async (runtime) => {
|
|
100120
|
+
const settings = getTranscriptionSettings(runtime);
|
|
100121
|
+
if (settings.modelId !== "scribe_v1") {
|
|
100122
|
+
logger.warn(`Using non-default STT model: ${settings.modelId}`);
|
|
100123
|
+
}
|
|
100124
|
+
if (settings.timestampsGranularity !== "word") {
|
|
100125
|
+
logger.warn(`Using non-default timestamps granularity: ${settings.timestampsGranularity}`);
|
|
100126
|
+
}
|
|
100127
|
+
logger.success("STT configuration defaults checked");
|
|
100128
|
+
}
|
|
100129
|
+
},
|
|
100130
|
+
{
|
|
100131
|
+
name: "STT input handling validation",
|
|
100132
|
+
fn: async (runtime) => {
|
|
100133
|
+
const testCases = [
|
|
100134
|
+
{ type: "string URL", valid: true },
|
|
100135
|
+
{ type: "Buffer", valid: true },
|
|
100136
|
+
{ type: "object with audioUrl", valid: true }
|
|
100137
|
+
];
|
|
100138
|
+
for (const testCase of testCases) {
|
|
100139
|
+
if (!testCase.valid) {
|
|
100140
|
+
throw new Error(`Invalid test case should not be valid: ${testCase.type}`);
|
|
100141
|
+
}
|
|
100142
|
+
}
|
|
100143
|
+
logger.success("STT input handling validation passed");
|
|
100144
|
+
}
|
|
100145
|
+
}
|
|
100146
|
+
]
|
|
99996
100147
|
}
|
|
99997
100148
|
]
|
|
99998
100149
|
};
|
|
@@ -100002,4 +100153,4 @@ export {
|
|
|
100002
100153
|
src_default as default
|
|
100003
100154
|
};
|
|
100004
100155
|
|
|
100005
|
-
//# debugId=
|
|
100156
|
+
//# debugId=F4CD7795D57767C464756E2164756E21
|