@r16t/multimodal-mcp 1.2.3 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +42 -17
- package/build/config.d.ts +2 -0
- package/build/config.js +8 -0
- package/build/errors.js +1 -0
- package/build/providers/bfl.d.ts +15 -0
- package/build/providers/bfl.js +87 -0
- package/build/providers/elevenlabs.d.ts +14 -0
- package/build/providers/elevenlabs.js +96 -0
- package/build/providers/google.js +1 -0
- package/build/providers/openai.d.ts +2 -1
- package/build/providers/openai.js +14 -0
- package/build/providers/registry.d.ts +1 -0
- package/build/providers/registry.js +3 -0
- package/build/providers/types.d.ts +12 -0
- package/build/providers/xai.js +1 -0
- package/build/read-media-file.js +8 -0
- package/build/server.js +26 -8
- package/build/tools/list-providers.js +3 -1
- package/build/tools/transcribe-audio.d.ts +19 -0
- package/build/tools/transcribe-audio.js +48 -0
- package/package.json +2 -1
package/README.md
CHANGED
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
# multimodal-mcp
|
|
2
2
|
|
|
3
|
-
Multi-provider media generation MCP server. Generate images, videos, and
|
|
3
|
+
Multi-provider media generation MCP server. Generate images, videos, audio, and transcriptions from text prompts using OpenAI, xAI, Gemini, ElevenLabs, and BFL (FLUX) through a single unified interface.
|
|
4
4
|
|
|
5
5
|
## Features
|
|
6
6
|
|
|
7
|
-
- 🎨 **Image Generation** — Generate images via OpenAI (gpt-image-1), xAI (grok-imagine-image),
|
|
7
|
+
- 🎨 **Image Generation** — Generate images via OpenAI (gpt-image-1), xAI (grok-imagine-image), Gemini (imagen-4), or BFL (FLUX Pro 1.1)
|
|
8
|
+
- ✏️ **Image Editing** — Edit images via OpenAI, xAI, Gemini, or BFL (FLUX Kontext)
|
|
8
9
|
- 🎬 **Video Generation** — Generate videos via OpenAI (sora-2), xAI (grok-imagine-video), or Gemini (veo-3.1)
|
|
9
|
-
- 🔊 **Audio Generation** — Text-to-speech via OpenAI (tts-1) or
|
|
10
|
+
- 🔊 **Audio Generation** — Text-to-speech via OpenAI (tts-1), Gemini, or ElevenLabs (Flash v2.5). Sound effects via ElevenLabs
|
|
11
|
+
- 🎙️ **Audio Transcription** — Speech-to-text via OpenAI (Whisper) or ElevenLabs (Scribe)
|
|
10
12
|
- 🔄 **Auto-Discovery** — Automatically detects configured providers from environment variables
|
|
11
13
|
- 🎯 **Provider Selection** — Auto-selects or explicitly choose a provider per request
|
|
12
14
|
- 📁 **File Output** — Saves all generated media to disk with descriptive filenames
|
|
@@ -24,6 +26,12 @@ claude mcp add multimodal-mcp -e OPENAI_API_KEY=sk-... -- npx @r16t/multimodal-m
|
|
|
24
26
|
|
|
25
27
|
# Or using Gemini
|
|
26
28
|
# claude mcp add multimodal-mcp -e GEMINI_API_KEY=AIza... -- npx @r16t/multimodal-mcp@latest
|
|
29
|
+
|
|
30
|
+
# Or using ElevenLabs (audio + transcription)
|
|
31
|
+
# claude mcp add multimodal-mcp -e ELEVENLABS_API_KEY=xi-... -- npx @r16t/multimodal-mcp@latest
|
|
32
|
+
|
|
33
|
+
# Or using BFL/FLUX (images)
|
|
34
|
+
# claude mcp add multimodal-mcp -e BFL_API_KEY=... -- npx @r16t/multimodal-mcp@latest
|
|
27
35
|
```
|
|
28
36
|
|
|
29
37
|
Using a different editor? See [setup instructions](#editor-setup) for Claude Desktop, Cursor, VS Code, Windsurf, and Cline.
|
|
@@ -32,10 +40,12 @@ Using a different editor? See [setup instructions](#editor-setup) for Claude Des
|
|
|
32
40
|
|
|
33
41
|
| Variable | Required | Description |
|
|
34
42
|
|----------|----------|-------------|
|
|
35
|
-
| `OPENAI_API_KEY` | At least one provider key | OpenAI API key — enables image, video,
|
|
43
|
+
| `OPENAI_API_KEY` | At least one provider key | OpenAI API key — enables image, video, audio generation, and transcription via gpt-image-1, sora-2, tts-1, and whisper-1 |
|
|
36
44
|
| `XAI_API_KEY` | At least one provider key | xAI API key — enables image and video generation via grok-imagine-image and grok-imagine-video |
|
|
37
45
|
| `GEMINI_API_KEY` | At least one provider key | Gemini API key — enables image, video, and audio generation via imagen-4, veo-3.1, and gemini-2.5-flash-preview-tts |
|
|
38
46
|
| `GOOGLE_API_KEY` | — | Alias for `GEMINI_API_KEY`; either name is accepted |
|
|
47
|
+
| `ELEVENLABS_API_KEY` | At least one provider key | ElevenLabs API key — enables audio generation (TTS, sound effects) and transcription via Flash v2.5 and Scribe v1 |
|
|
48
|
+
| `BFL_API_KEY` | At least one provider key | BFL API key — enables image generation and editing via FLUX Pro 1.1 and FLUX Kontext |
|
|
39
49
|
| `MEDIA_OUTPUT_DIR` | No | Directory for saved media files. Defaults to the current working directory |
|
|
40
50
|
|
|
41
51
|
## Available Tools
|
|
@@ -47,7 +57,7 @@ Generate an image from a text prompt.
|
|
|
47
57
|
| Parameter | Type | Required | Description |
|
|
48
58
|
|-----------|------|----------|-------------|
|
|
49
59
|
| `prompt` | string | Yes | Text description of the image to generate |
|
|
50
|
-
| `provider` | string | No | Provider to use: `openai`, `xai`, `google`. Auto-selects if omitted |
|
|
60
|
+
| `provider` | string | No | Provider to use: `openai`, `xai`, `google`, `bfl`. Auto-selects if omitted |
|
|
51
61
|
| `aspectRatio` | string | No | Aspect ratio: `1:1`, `16:9`, `9:16`, `4:3`, `3:4` |
|
|
52
62
|
| `quality` | string | No | Quality level: `low`, `standard`, `high` |
|
|
53
63
|
| `outputDirectory` | string | No | Directory to save the generated file. Absolute or relative path. Defaults to `MEDIA_OUTPUT_DIR` or cwd |
|
|
@@ -69,16 +79,27 @@ Generate a video from a text prompt. Video generation is asynchronous and may ta
|
|
|
69
79
|
|
|
70
80
|
### `generate_audio`
|
|
71
81
|
|
|
72
|
-
Generate audio
|
|
82
|
+
Generate audio from text. Supports text-to-speech and sound effects. Audio generation is synchronous.
|
|
73
83
|
|
|
74
84
|
| Parameter | Type | Required | Description |
|
|
75
85
|
|-----------|------|----------|-------------|
|
|
76
|
-
| `text` | string | Yes | Text to convert to speech |
|
|
77
|
-
| `provider` | string | No | Provider to use: `openai`, `google`. Auto-selects if omitted |
|
|
78
|
-
| `voice` | string | No | Voice name (provider-specific). OpenAI: `alloy`, `ash`, `coral`, `echo`, `fable`, `nova`, `onyx`, `sage`, `shimmer`. Google: `Kore`, `Charon`, `Fenrir`, `Aoede`, `Puck`, etc. |
|
|
86
|
+
| `text` | string | Yes | Text to convert to speech, or a description of the sound effect to generate |
|
|
87
|
+
| `provider` | string | No | Provider to use: `openai`, `google`, `elevenlabs`. Auto-selects if omitted |
|
|
88
|
+
| `voice` | string | No | Voice name (provider-specific). OpenAI: `alloy`, `ash`, `coral`, `echo`, `fable`, `nova`, `onyx`, `sage`, `shimmer`. Google: `Kore`, `Charon`, `Fenrir`, `Aoede`, `Puck`, etc. ElevenLabs: voice ID |
|
|
79
89
|
| `speed` | number | No | Speech speed multiplier (OpenAI only): `0.25` to `4.0` |
|
|
80
90
|
| `format` | string | No | Output format (OpenAI only): `mp3`, `opus`, `aac`, `flac`, `wav`, `pcm` |
|
|
81
91
|
| `outputDirectory` | string | No | Directory to save the generated file. Absolute or relative path. Defaults to `MEDIA_OUTPUT_DIR` or cwd |
|
|
92
|
+
| `providerOptions` | object | No | Provider-specific parameters passed through directly. ElevenLabs: set `mode: "sound-effect"` for sound effects, `model` for TTS model selection |
|
|
93
|
+
|
|
94
|
+
### `transcribe_audio`
|
|
95
|
+
|
|
96
|
+
Transcribe audio to text (speech-to-text).
|
|
97
|
+
|
|
98
|
+
| Parameter | Type | Required | Description |
|
|
99
|
+
|-----------|------|----------|-------------|
|
|
100
|
+
| `audioPath` | string | Yes | Absolute path to the audio file to transcribe |
|
|
101
|
+
| `provider` | string | No | Provider to use: `openai`, `elevenlabs`. Auto-selects if omitted |
|
|
102
|
+
| `language` | string | No | Language code (e.g., `en`, `fr`, `es`) to hint the transcription language |
|
|
82
103
|
| `providerOptions` | object | No | Provider-specific parameters passed through directly |
|
|
83
104
|
|
|
84
105
|
### `list_providers`
|
|
@@ -87,11 +108,13 @@ List all configured media generation providers and their capabilities. Takes no
|
|
|
87
108
|
|
|
88
109
|
## Provider Capabilities
|
|
89
110
|
|
|
90
|
-
| Provider | Image |
|
|
91
|
-
|
|
92
|
-
| OpenAI | ✅ | ✅ | ✅ | gpt-image-1
|
|
93
|
-
| xAI | ✅ | ✅ | — | grok-imagine-image
|
|
94
|
-
| Gemini | ✅ | ✅ | ✅ | imagen-4
|
|
111
|
+
| Provider | Image | Image Editing | Video | Audio | Transcription | Key Models |
|
|
112
|
+
|----------|:-----:|:------------:|:-----:|:-----:|:------------:|------------|
|
|
113
|
+
| OpenAI | ✅ | ✅ | ✅ | ✅ | ✅ | gpt-image-1, sora-2, tts-1, whisper-1 |
|
|
114
|
+
| xAI | ✅ | ✅ | ✅ | — | — | grok-imagine-image, grok-imagine-video |
|
|
115
|
+
| Gemini | ✅ | ✅ | ✅ | ✅ | — | imagen-4, veo-3.1, gemini-2.5-flash-preview-tts |
|
|
116
|
+
| ElevenLabs | — | — | — | ✅ | ✅ | eleven_flash_v2_5, scribe_v1 |
|
|
117
|
+
| BFL | ✅ | ✅ | — | — | — | flux-pro-1.1, flux-kontext-pro |
|
|
95
118
|
|
|
96
119
|
### Image Aspect Ratios
|
|
97
120
|
|
|
@@ -100,6 +123,7 @@ List all configured media generation providers and their capabilities. Takes no
|
|
|
100
123
|
| OpenAI | ✅ | ✅ | ✅ | ✅ | ✅ |
|
|
101
124
|
| xAI | ✅ | ✅ | ✅ | ✅ | ✅ |
|
|
102
125
|
| Gemini | ✅ | ✅ | ✅ | ✅ | ✅ |
|
|
126
|
+
| BFL | ✅ | ✅ | ✅ | ✅ | ✅ |
|
|
103
127
|
|
|
104
128
|
### Video Aspect Ratios & Resolutions
|
|
105
129
|
|
|
@@ -115,6 +139,7 @@ List all configured media generation providers and their capabilities. Takes no
|
|
|
115
139
|
|----------|:---:|:----:|:---:|:----:|:---:|:---:|
|
|
116
140
|
| OpenAI | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
|
117
141
|
| Gemini | — | — | — | — | ✅ | — |
|
|
142
|
+
| ElevenLabs | ✅ | ✅ | — | — | — | ✅ |
|
|
118
143
|
|
|
119
144
|
## Troubleshooting
|
|
120
145
|
|
|
@@ -124,11 +149,11 @@ List all configured media generation providers and their capabilities. Takes no
|
|
|
124
149
|
[config] No provider API keys detected
|
|
125
150
|
```
|
|
126
151
|
|
|
127
|
-
Set at least one of `OPENAI_API_KEY`, `XAI_API_KEY`, or `
|
|
152
|
+
Set at least one of `OPENAI_API_KEY`, `XAI_API_KEY`, `GEMINI_API_KEY`, `ELEVENLABS_API_KEY`, or `BFL_API_KEY` in the MCP server's `env` block.
|
|
128
153
|
|
|
129
154
|
### Provider not available for requested media type
|
|
130
155
|
|
|
131
|
-
|
|
156
|
+
Each provider supports different media types (see [Provider Capabilities](#provider-capabilities)). If you specify a `provider` that isn't configured (no API key) or doesn't support the requested media type, you'll receive an error. Omit the `provider` parameter to auto-select from configured providers.
|
|
132
157
|
|
|
133
158
|
### Video generation timeout
|
|
134
159
|
|
|
@@ -154,7 +179,7 @@ npm run dev # Watch mode for TypeScript compilation
|
|
|
154
179
|
|
|
155
180
|
## Editor Setup
|
|
156
181
|
|
|
157
|
-
Replace `OPENAI_API_KEY` with your provider of choice (`XAI_API_KEY`, `GEMINI_API_KEY`). You can set multiple keys to enable multiple providers.
|
|
182
|
+
Replace `OPENAI_API_KEY` with your provider of choice (`XAI_API_KEY`, `GEMINI_API_KEY`, `ELEVENLABS_API_KEY`, `BFL_API_KEY`). You can set multiple keys to enable multiple providers.
|
|
158
183
|
|
|
159
184
|
### Claude Desktop
|
|
160
185
|
|
package/build/config.d.ts
CHANGED
|
@@ -3,6 +3,8 @@ declare const configSchema: z.ZodObject<{
|
|
|
3
3
|
openaiApiKey: z.ZodOptional<z.ZodString>;
|
|
4
4
|
xaiApiKey: z.ZodOptional<z.ZodString>;
|
|
5
5
|
googleApiKey: z.ZodOptional<z.ZodString>;
|
|
6
|
+
elevenlabsApiKey: z.ZodOptional<z.ZodString>;
|
|
7
|
+
bflApiKey: z.ZodOptional<z.ZodString>;
|
|
6
8
|
outputDirectory: z.ZodString;
|
|
7
9
|
}, z.core.$strip>;
|
|
8
10
|
export type Config = z.infer<typeof configSchema>;
|
package/build/config.js
CHANGED
|
@@ -3,6 +3,8 @@ const configSchema = z.object({
|
|
|
3
3
|
openaiApiKey: z.string().optional(),
|
|
4
4
|
xaiApiKey: z.string().optional(),
|
|
5
5
|
googleApiKey: z.string().optional(),
|
|
6
|
+
elevenlabsApiKey: z.string().optional(),
|
|
7
|
+
bflApiKey: z.string().optional(),
|
|
6
8
|
outputDirectory: z.string(),
|
|
7
9
|
});
|
|
8
10
|
function resolveGeminiKey() {
|
|
@@ -13,6 +15,8 @@ export function loadConfig() {
|
|
|
13
15
|
openaiApiKey: process.env.OPENAI_API_KEY || undefined,
|
|
14
16
|
xaiApiKey: process.env.XAI_API_KEY || undefined,
|
|
15
17
|
googleApiKey: resolveGeminiKey(),
|
|
18
|
+
elevenlabsApiKey: process.env.ELEVENLABS_API_KEY || undefined,
|
|
19
|
+
bflApiKey: process.env.BFL_API_KEY || undefined,
|
|
16
20
|
outputDirectory: process.env.MEDIA_OUTPUT_DIR || process.cwd(),
|
|
17
21
|
});
|
|
18
22
|
const detected = [];
|
|
@@ -22,6 +26,10 @@ export function loadConfig() {
|
|
|
22
26
|
detected.push("xAI");
|
|
23
27
|
if (config.googleApiKey)
|
|
24
28
|
detected.push("Gemini");
|
|
29
|
+
if (config.elevenlabsApiKey)
|
|
30
|
+
detected.push("ElevenLabs");
|
|
31
|
+
if (config.bflApiKey)
|
|
32
|
+
detected.push("BFL");
|
|
25
33
|
if (detected.length > 0) {
|
|
26
34
|
console.error(`[config] Detected providers: ${detected.join(", ")}`);
|
|
27
35
|
}
|
package/build/errors.js
CHANGED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import type { MediaProvider, ProviderCapabilities, ImageParams, EditImageParams, VideoParams, AudioParams, GeneratedMedia } from "./types.js";
|
|
2
|
+
export declare class BFLProvider implements MediaProvider {
|
|
3
|
+
readonly name = "bfl";
|
|
4
|
+
readonly capabilities: ProviderCapabilities;
|
|
5
|
+
private apiKey;
|
|
6
|
+
constructor(apiKey: string);
|
|
7
|
+
generateImage(params: ImageParams): Promise<GeneratedMedia>;
|
|
8
|
+
editImage(params: EditImageParams): Promise<GeneratedMedia>;
|
|
9
|
+
generateVideo(_params: VideoParams): Promise<GeneratedMedia>;
|
|
10
|
+
generateAudio(_params: AudioParams): Promise<GeneratedMedia>;
|
|
11
|
+
private submitTask;
|
|
12
|
+
private pollTask;
|
|
13
|
+
private downloadResult;
|
|
14
|
+
private mapAspectRatio;
|
|
15
|
+
}
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import { pollForCompletion } from "./polling.js";
|
|
2
|
+
const BFL_BASE_URL = "https://api.bfl.ml/v1";
|
|
3
|
+
const IMAGE_MODEL = "flux-pro-1.1";
|
|
4
|
+
const EDIT_MODEL = "flux-kontext-pro";
|
|
5
|
+
const ASPECT_RATIO_MAP = {
|
|
6
|
+
"1:1": { width: 1024, height: 1024 },
|
|
7
|
+
"16:9": { width: 1344, height: 768 },
|
|
8
|
+
"9:16": { width: 768, height: 1344 },
|
|
9
|
+
"4:3": { width: 1152, height: 896 },
|
|
10
|
+
"3:4": { width: 896, height: 1152 },
|
|
11
|
+
};
|
|
12
|
+
export class BFLProvider {
|
|
13
|
+
name = "bfl";
|
|
14
|
+
capabilities = {
|
|
15
|
+
supportsImageGeneration: true,
|
|
16
|
+
supportsImageEditing: true,
|
|
17
|
+
supportsVideoGeneration: false,
|
|
18
|
+
supportsAudioGeneration: false,
|
|
19
|
+
supportsTranscription: false,
|
|
20
|
+
supportedImageAspectRatios: ["1:1", "16:9", "9:16", "4:3", "3:4"],
|
|
21
|
+
supportedVideoAspectRatios: [],
|
|
22
|
+
supportedVideoResolutions: [],
|
|
23
|
+
supportedAudioFormats: [],
|
|
24
|
+
maxVideoDurationSeconds: 0,
|
|
25
|
+
};
|
|
26
|
+
apiKey;
|
|
27
|
+
constructor(apiKey) {
|
|
28
|
+
this.apiKey = apiKey;
|
|
29
|
+
}
|
|
30
|
+
async generateImage(params) {
|
|
31
|
+
const { model, ...options } = params.providerOptions ?? {};
|
|
32
|
+
const modelName = model ?? IMAGE_MODEL;
|
|
33
|
+
const { width, height } = this.mapAspectRatio(params.aspectRatio);
|
|
34
|
+
const task = await this.submitTask(modelName, { prompt: params.prompt, width, height, ...options });
|
|
35
|
+
const result = await this.pollTask(task.id);
|
|
36
|
+
return this.downloadResult(result.result.sample, modelName);
|
|
37
|
+
}
|
|
38
|
+
async editImage(params) {
|
|
39
|
+
const { model, ...options } = params.providerOptions ?? {};
|
|
40
|
+
const modelName = model ?? EDIT_MODEL;
|
|
41
|
+
const input_image = params.imageData.toString("base64");
|
|
42
|
+
const task = await this.submitTask(modelName, { prompt: params.prompt, input_image, ...options });
|
|
43
|
+
const result = await this.pollTask(task.id);
|
|
44
|
+
return this.downloadResult(result.result.sample, modelName);
|
|
45
|
+
}
|
|
46
|
+
async generateVideo(_params) {
|
|
47
|
+
throw new Error("BFL does not support video generation");
|
|
48
|
+
}
|
|
49
|
+
async generateAudio(_params) {
|
|
50
|
+
throw new Error("BFL does not support audio generation");
|
|
51
|
+
}
|
|
52
|
+
async submitTask(model, body) {
|
|
53
|
+
const response = await fetch(`${BFL_BASE_URL}/${model}`, {
|
|
54
|
+
method: "POST",
|
|
55
|
+
headers: { "Content-Type": "application/json", "X-Key": this.apiKey },
|
|
56
|
+
body: JSON.stringify(body),
|
|
57
|
+
});
|
|
58
|
+
if (!response.ok) {
|
|
59
|
+
throw new Error(`BFL task submission failed: ${response.status}`);
|
|
60
|
+
}
|
|
61
|
+
return response.json();
|
|
62
|
+
}
|
|
63
|
+
async pollTask(taskId) {
|
|
64
|
+
return pollForCompletion(async () => {
|
|
65
|
+
const response = await fetch(`${BFL_BASE_URL}/get_result?id=${taskId}`, {
|
|
66
|
+
headers: { "X-Key": this.apiKey },
|
|
67
|
+
});
|
|
68
|
+
return response.json();
|
|
69
|
+
}, (result) => result.status === "Ready", { timeoutMs: 300_000, intervalMs: 3_000 });
|
|
70
|
+
}
|
|
71
|
+
async downloadResult(url, model) {
|
|
72
|
+
const response = await fetch(url);
|
|
73
|
+
if (!response.ok) {
|
|
74
|
+
throw new Error(`BFL image download failed: ${response.status}`);
|
|
75
|
+
}
|
|
76
|
+
const mimeType = response.headers.get("content-type") ?? "image/png";
|
|
77
|
+
const data = Buffer.from(await response.arrayBuffer());
|
|
78
|
+
return { data, mimeType, metadata: { model, provider: "bfl" } };
|
|
79
|
+
}
|
|
80
|
+
mapAspectRatio(ratio) {
|
|
81
|
+
const dimensions = ASPECT_RATIO_MAP[ratio];
|
|
82
|
+
if (!dimensions) {
|
|
83
|
+
throw new Error(`BFL does not support aspect ratio: ${ratio}`);
|
|
84
|
+
}
|
|
85
|
+
return dimensions;
|
|
86
|
+
}
|
|
87
|
+
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { MediaProvider, ProviderCapabilities, ImageParams, EditImageParams, VideoParams, AudioParams, GeneratedMedia, TranscribeParams, TranscribedText } from "./types.js";
|
|
2
|
+
export declare class ElevenLabsProvider implements MediaProvider {
|
|
3
|
+
readonly name = "elevenlabs";
|
|
4
|
+
readonly capabilities: ProviderCapabilities;
|
|
5
|
+
private apiKey;
|
|
6
|
+
constructor(apiKey: string);
|
|
7
|
+
generateImage(_params: ImageParams): Promise<GeneratedMedia>;
|
|
8
|
+
editImage(_params: EditImageParams): Promise<GeneratedMedia>;
|
|
9
|
+
generateVideo(_params: VideoParams): Promise<GeneratedMedia>;
|
|
10
|
+
generateAudio(params: AudioParams): Promise<GeneratedMedia>;
|
|
11
|
+
transcribeAudio(params: TranscribeParams): Promise<TranscribedText>;
|
|
12
|
+
private generateSpeech;
|
|
13
|
+
private generateSoundEffect;
|
|
14
|
+
}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
const BASE_URL = "https://api.elevenlabs.io/v1";
|
|
2
|
+
const DEFAULT_VOICE_ID = "JBFqnCBsd6RMkjVDRZzb";
|
|
3
|
+
const DEFAULT_TTS_MODEL = "eleven_flash_v2_5";
|
|
4
|
+
const TRANSCRIPTION_MODEL = "scribe_v1";
|
|
5
|
+
export class ElevenLabsProvider {
|
|
6
|
+
name = "elevenlabs";
|
|
7
|
+
capabilities = {
|
|
8
|
+
supportsImageGeneration: false,
|
|
9
|
+
supportsImageEditing: false,
|
|
10
|
+
supportsVideoGeneration: false,
|
|
11
|
+
supportsAudioGeneration: true,
|
|
12
|
+
supportsTranscription: true,
|
|
13
|
+
supportedImageAspectRatios: [],
|
|
14
|
+
supportedVideoAspectRatios: [],
|
|
15
|
+
supportedVideoResolutions: [],
|
|
16
|
+
supportedAudioFormats: ["mp3", "pcm", "ulaw", "opus"],
|
|
17
|
+
maxVideoDurationSeconds: 0,
|
|
18
|
+
};
|
|
19
|
+
apiKey;
|
|
20
|
+
constructor(apiKey) {
|
|
21
|
+
this.apiKey = apiKey;
|
|
22
|
+
}
|
|
23
|
+
async generateImage(_params) {
|
|
24
|
+
throw new Error("ElevenLabs does not support image generation");
|
|
25
|
+
}
|
|
26
|
+
async editImage(_params) {
|
|
27
|
+
throw new Error("ElevenLabs does not support image editing");
|
|
28
|
+
}
|
|
29
|
+
async generateVideo(_params) {
|
|
30
|
+
throw new Error("ElevenLabs does not support video generation");
|
|
31
|
+
}
|
|
32
|
+
async generateAudio(params) {
|
|
33
|
+
const mode = params.providerOptions?.mode;
|
|
34
|
+
if (mode === "sound-effect") {
|
|
35
|
+
return this.generateSoundEffect(params);
|
|
36
|
+
}
|
|
37
|
+
return this.generateSpeech(params);
|
|
38
|
+
}
|
|
39
|
+
async transcribeAudio(params) {
|
|
40
|
+
const formData = new FormData();
|
|
41
|
+
const blob = new Blob([new Uint8Array(params.audioData)], { type: params.audioMimeType });
|
|
42
|
+
formData.append("file", blob, "audio");
|
|
43
|
+
formData.append("model_id", TRANSCRIPTION_MODEL);
|
|
44
|
+
if (params.language) {
|
|
45
|
+
formData.append("language_code", params.language);
|
|
46
|
+
}
|
|
47
|
+
const response = await fetch(`${BASE_URL}/speech-to-text`, {
|
|
48
|
+
method: "POST",
|
|
49
|
+
headers: { "xi-api-key": this.apiKey },
|
|
50
|
+
body: formData,
|
|
51
|
+
});
|
|
52
|
+
if (!response.ok) {
|
|
53
|
+
throw new Error(`ElevenLabs transcription failed: ${response.status}`);
|
|
54
|
+
}
|
|
55
|
+
const result = (await response.json());
|
|
56
|
+
return {
|
|
57
|
+
text: result.text,
|
|
58
|
+
metadata: { model: TRANSCRIPTION_MODEL, provider: "elevenlabs" },
|
|
59
|
+
};
|
|
60
|
+
}
|
|
61
|
+
async generateSpeech(params) {
|
|
62
|
+
const voiceId = params.voice ?? DEFAULT_VOICE_ID;
|
|
63
|
+
const options = params.providerOptions ?? {};
|
|
64
|
+
const modelId = options.model ?? DEFAULT_TTS_MODEL;
|
|
65
|
+
const filtered = Object.fromEntries(Object.entries(options).filter(([k]) => k !== "mode" && k !== "model"));
|
|
66
|
+
const response = await fetch(`${BASE_URL}/text-to-speech/${voiceId}`, {
|
|
67
|
+
method: "POST",
|
|
68
|
+
headers: { "Content-Type": "application/json", "xi-api-key": this.apiKey },
|
|
69
|
+
body: JSON.stringify({ text: params.text, model_id: modelId, ...filtered }),
|
|
70
|
+
});
|
|
71
|
+
if (!response.ok) {
|
|
72
|
+
throw new Error(`ElevenLabs TTS failed: ${response.status}`);
|
|
73
|
+
}
|
|
74
|
+
return {
|
|
75
|
+
data: Buffer.from(await response.arrayBuffer()),
|
|
76
|
+
mimeType: "audio/mpeg",
|
|
77
|
+
metadata: { model: modelId, provider: "elevenlabs", voice: voiceId },
|
|
78
|
+
};
|
|
79
|
+
}
|
|
80
|
+
async generateSoundEffect(params) {
|
|
81
|
+
const filtered = Object.fromEntries(Object.entries(params.providerOptions ?? {}).filter(([k]) => k !== "mode"));
|
|
82
|
+
const response = await fetch(`${BASE_URL}/text-to-sound-effects`, {
|
|
83
|
+
method: "POST",
|
|
84
|
+
headers: { "Content-Type": "application/json", "xi-api-key": this.apiKey },
|
|
85
|
+
body: JSON.stringify({ text: params.text, ...filtered }),
|
|
86
|
+
});
|
|
87
|
+
if (!response.ok) {
|
|
88
|
+
throw new Error(`ElevenLabs sound effect generation failed: ${response.status}`);
|
|
89
|
+
}
|
|
90
|
+
return {
|
|
91
|
+
data: Buffer.from(await response.arrayBuffer()),
|
|
92
|
+
mimeType: "audio/mpeg",
|
|
93
|
+
metadata: { provider: "elevenlabs" },
|
|
94
|
+
};
|
|
95
|
+
}
|
|
96
|
+
}
|
|
@@ -7,6 +7,7 @@ export class GoogleProvider {
|
|
|
7
7
|
supportsImageEditing: true,
|
|
8
8
|
supportsVideoGeneration: true,
|
|
9
9
|
supportsAudioGeneration: true,
|
|
10
|
+
supportsTranscription: false,
|
|
10
11
|
supportedImageAspectRatios: ["1:1", "16:9", "9:16", "4:3", "3:4"],
|
|
11
12
|
supportedVideoAspectRatios: ["16:9", "9:16"],
|
|
12
13
|
supportedVideoResolutions: ["720p", "1080p"],
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { MediaProvider, ProviderCapabilities, ImageParams, EditImageParams, VideoParams, AudioParams, GeneratedMedia } from "./types.js";
|
|
1
|
+
import type { MediaProvider, ProviderCapabilities, ImageParams, EditImageParams, VideoParams, AudioParams, GeneratedMedia, TranscribeParams, TranscribedText } from "./types.js";
|
|
2
2
|
export declare class OpenAIProvider implements MediaProvider {
|
|
3
3
|
readonly name = "openai";
|
|
4
4
|
readonly capabilities: ProviderCapabilities;
|
|
@@ -8,6 +8,7 @@ export declare class OpenAIProvider implements MediaProvider {
|
|
|
8
8
|
editImage(params: EditImageParams): Promise<GeneratedMedia>;
|
|
9
9
|
generateVideo(params: VideoParams): Promise<GeneratedMedia>;
|
|
10
10
|
generateAudio(params: AudioParams): Promise<GeneratedMedia>;
|
|
11
|
+
transcribeAudio(params: TranscribeParams): Promise<TranscribedText>;
|
|
11
12
|
private audioFormatToMimeType;
|
|
12
13
|
private mapAspectRatioToSize;
|
|
13
14
|
}
|
|
@@ -14,6 +14,7 @@ export class OpenAIProvider {
|
|
|
14
14
|
supportsImageEditing: true,
|
|
15
15
|
supportsVideoGeneration: true,
|
|
16
16
|
supportsAudioGeneration: true,
|
|
17
|
+
supportsTranscription: true,
|
|
17
18
|
supportedImageAspectRatios: ["1:1", "16:9", "9:16", "4:3", "3:4"],
|
|
18
19
|
supportedVideoAspectRatios: ["16:9", "9:16", "1:1"],
|
|
19
20
|
supportedVideoResolutions: ["480p", "720p", "1080p"],
|
|
@@ -96,6 +97,19 @@ export class OpenAIProvider {
|
|
|
96
97
|
metadata: { model: "tts-1", provider: "openai", voice, format },
|
|
97
98
|
};
|
|
98
99
|
}
|
|
100
|
+
async transcribeAudio(params) {
|
|
101
|
+
const audioFile = new File([new Uint8Array(params.audioData)], "audio.wav", { type: params.audioMimeType });
|
|
102
|
+
const response = await this.client.audio.transcriptions.create({
|
|
103
|
+
model: "whisper-1",
|
|
104
|
+
file: audioFile,
|
|
105
|
+
language: params.language,
|
|
106
|
+
...params.providerOptions,
|
|
107
|
+
});
|
|
108
|
+
return {
|
|
109
|
+
text: response.text,
|
|
110
|
+
metadata: { model: "whisper-1", provider: "openai" },
|
|
111
|
+
};
|
|
112
|
+
}
|
|
99
113
|
audioFormatToMimeType(format) {
|
|
100
114
|
const mimeTypes = {
|
|
101
115
|
mp3: "audio/mpeg",
|
|
@@ -22,6 +22,9 @@ export class ProviderRegistry {
|
|
|
22
22
|
getAudioProviders() {
|
|
23
23
|
return [...this.providers.values()].filter((p) => p.capabilities.supportsAudioGeneration);
|
|
24
24
|
}
|
|
25
|
+
getTranscriptionProviders() {
|
|
26
|
+
return [...this.providers.values()].filter((p) => p.capabilities.supportsTranscription);
|
|
27
|
+
}
|
|
25
28
|
listCapabilities() {
|
|
26
29
|
return [...this.providers.values()].map((p) => ({
|
|
27
30
|
name: p.name,
|
|
@@ -5,12 +5,14 @@ export interface MediaProvider {
|
|
|
5
5
|
editImage(params: EditImageParams): Promise<GeneratedMedia>;
|
|
6
6
|
generateVideo(params: VideoParams): Promise<GeneratedMedia>;
|
|
7
7
|
generateAudio(params: AudioParams): Promise<GeneratedMedia>;
|
|
8
|
+
transcribeAudio?(params: TranscribeParams): Promise<TranscribedText>;
|
|
8
9
|
}
|
|
9
10
|
export interface ProviderCapabilities {
|
|
10
11
|
supportsImageGeneration: boolean;
|
|
11
12
|
supportsImageEditing: boolean;
|
|
12
13
|
supportsVideoGeneration: boolean;
|
|
13
14
|
supportsAudioGeneration: boolean;
|
|
15
|
+
supportsTranscription: boolean;
|
|
14
16
|
supportedImageAspectRatios: string[];
|
|
15
17
|
supportedVideoAspectRatios: string[];
|
|
16
18
|
supportedVideoResolutions: string[];
|
|
@@ -50,6 +52,16 @@ export interface GeneratedMedia {
|
|
|
50
52
|
mimeType: string;
|
|
51
53
|
metadata: Record<string, unknown>;
|
|
52
54
|
}
|
|
55
|
+
export interface TranscribeParams {
|
|
56
|
+
audioData: Buffer;
|
|
57
|
+
audioMimeType: string;
|
|
58
|
+
language?: string;
|
|
59
|
+
providerOptions?: Record<string, unknown>;
|
|
60
|
+
}
|
|
61
|
+
export interface TranscribedText {
|
|
62
|
+
text: string;
|
|
63
|
+
metadata: Record<string, unknown>;
|
|
64
|
+
}
|
|
53
65
|
export interface ProviderInfo {
|
|
54
66
|
name: string;
|
|
55
67
|
capabilities: ProviderCapabilities;
|
package/build/providers/xai.js
CHANGED
|
@@ -10,6 +10,7 @@ export class XAIProvider {
|
|
|
10
10
|
supportsImageEditing: true,
|
|
11
11
|
supportsVideoGeneration: true,
|
|
12
12
|
supportsAudioGeneration: false,
|
|
13
|
+
supportsTranscription: false,
|
|
13
14
|
supportedImageAspectRatios: ["1:1", "16:9", "9:16", "4:3", "3:4"],
|
|
14
15
|
supportedVideoAspectRatios: ["16:9", "9:16", "1:1"],
|
|
15
16
|
supportedVideoResolutions: ["720p", "1080p"],
|
package/build/read-media-file.js
CHANGED
|
@@ -7,6 +7,14 @@ const EXTENSION_TO_MIME = {
|
|
|
7
7
|
".webp": "image/webp",
|
|
8
8
|
".gif": "image/gif",
|
|
9
9
|
".mp4": "video/mp4",
|
|
10
|
+
".mp3": "audio/mpeg",
|
|
11
|
+
".wav": "audio/wav",
|
|
12
|
+
".flac": "audio/flac",
|
|
13
|
+
".ogg": "audio/ogg",
|
|
14
|
+
".m4a": "audio/mp4",
|
|
15
|
+
".aac": "audio/aac",
|
|
16
|
+
".opus": "audio/opus",
|
|
17
|
+
".webm": "audio/webm",
|
|
10
18
|
};
|
|
11
19
|
export async function readMediaFile(filePath) {
|
|
12
20
|
const absolutePath = resolve(filePath);
|
package/build/server.js
CHANGED
|
@@ -4,11 +4,14 @@ import { ProviderRegistry } from "./providers/registry.js";
|
|
|
4
4
|
import { OpenAIProvider } from "./providers/openai.js";
|
|
5
5
|
import { XAIProvider } from "./providers/xai.js";
|
|
6
6
|
import { GoogleProvider } from "./providers/google.js";
|
|
7
|
+
import { ElevenLabsProvider } from "./providers/elevenlabs.js";
|
|
8
|
+
import { BFLProvider } from "./providers/bfl.js";
|
|
7
9
|
import { FileManager } from "./file-manager.js";
|
|
8
10
|
import { buildGenerateImageHandler } from "./tools/generate-image.js";
|
|
9
11
|
import { buildEditImageHandler } from "./tools/edit-image.js";
|
|
10
12
|
import { buildGenerateVideoHandler } from "./tools/generate-video.js";
|
|
11
13
|
import { buildGenerateAudioHandler } from "./tools/generate-audio.js";
|
|
14
|
+
import { buildTranscribeAudioHandler } from "./tools/transcribe-audio.js";
|
|
12
15
|
import { buildListProvidersHandler } from "./tools/list-providers.js";
|
|
13
16
|
export function createServer(config) {
|
|
14
17
|
const registry = new ProviderRegistry();
|
|
@@ -25,25 +28,34 @@ export function createServer(config) {
|
|
|
25
28
|
registry.register(new GoogleProvider(config.googleApiKey));
|
|
26
29
|
console.error("[server] Registered Google provider");
|
|
27
30
|
}
|
|
31
|
+
if (config.elevenlabsApiKey) {
|
|
32
|
+
registry.register(new ElevenLabsProvider(config.elevenlabsApiKey));
|
|
33
|
+
console.error("[server] Registered ElevenLabs provider");
|
|
34
|
+
}
|
|
35
|
+
if (config.bflApiKey) {
|
|
36
|
+
registry.register(new BFLProvider(config.bflApiKey));
|
|
37
|
+
console.error("[server] Registered BFL provider");
|
|
38
|
+
}
|
|
28
39
|
const generateImageHandler = buildGenerateImageHandler(registry, fileManager);
|
|
29
40
|
const editImageHandler = buildEditImageHandler(registry, fileManager);
|
|
30
41
|
const generateVideoHandler = buildGenerateVideoHandler(registry, fileManager);
|
|
31
42
|
const generateAudioHandler = buildGenerateAudioHandler(registry, fileManager);
|
|
43
|
+
const transcribeAudioHandler = buildTranscribeAudioHandler(registry);
|
|
32
44
|
const listProvidersHandler = buildListProvidersHandler(registry);
|
|
33
45
|
const providerNames = registry.listCapabilities().map((p) => p.name).join(", ") || "none configured";
|
|
34
46
|
const server = new McpServer({ name: "multimodal-mcp", version: "1.0.0" });
|
|
35
|
-
server.tool("generate_image", `Generate an image from a text prompt using AI. Available
|
|
47
|
+
server.tool("generate_image", `Generate an image from a text prompt using AI. Providers: openai (DALL-E), xai (Aurora), google (Imagen), bfl (FLUX). Available: ${providerNames}`, {
|
|
36
48
|
prompt: z.string().describe("Text description of the image to generate"),
|
|
37
|
-
provider: z.string().optional().describe("Provider to use: openai, xai, google. Auto-selects if omitted."),
|
|
49
|
+
provider: z.string().optional().describe("Provider to use: openai, xai, google, bfl. Auto-selects if omitted."),
|
|
38
50
|
aspectRatio: z.string().optional().describe("Aspect ratio: 1:1, 16:9, 9:16, 4:3, 3:4"),
|
|
39
51
|
quality: z.string().optional().describe("Quality level: low, standard, high"),
|
|
40
52
|
outputDirectory: z.string().optional().describe("Directory to save the generated file. Supports absolute or relative paths (resolved from cwd). Defaults to MEDIA_OUTPUT_DIR env var or cwd."),
|
|
41
53
|
providerOptions: z.record(z.string(), z.unknown()).optional().describe("Provider-specific parameters passed through directly"),
|
|
42
54
|
}, async (params) => generateImageHandler(params));
|
|
43
|
-
server.tool("edit_image", `Edit an existing image using AI. Provide the path to an image and a text prompt describing the desired edits. Available
|
|
55
|
+
server.tool("edit_image", `Edit an existing image using AI. Provide the path to an image and a text prompt describing the desired edits. Providers: openai, xai, google, bfl (FLUX Kontext). Available: ${providerNames}`, {
|
|
44
56
|
imagePath: z.string().describe("Absolute path to the source image file to edit"),
|
|
45
57
|
prompt: z.string().describe("Text description of the edits to apply to the image"),
|
|
46
|
-
provider: z.string().optional().describe("Provider to use: openai, xai, google. Auto-selects if omitted."),
|
|
58
|
+
provider: z.string().optional().describe("Provider to use: openai, xai, google, bfl. Auto-selects if omitted."),
|
|
47
59
|
outputDirectory: z.string().optional().describe("Directory to save the edited file. Supports absolute or relative paths (resolved from cwd). Defaults to MEDIA_OUTPUT_DIR env var or cwd."),
|
|
48
60
|
providerOptions: z.record(z.string(), z.unknown()).optional().describe("Provider-specific parameters passed through directly"),
|
|
49
61
|
}, async (params) => editImageHandler(params));
|
|
@@ -57,15 +69,21 @@ export function createServer(config) {
|
|
|
57
69
|
outputDirectory: z.string().optional().describe("Directory to save the generated file. Supports absolute or relative paths (resolved from cwd). Defaults to MEDIA_OUTPUT_DIR env var or cwd."),
|
|
58
70
|
providerOptions: z.record(z.string(), z.unknown()).optional().describe("Provider-specific parameters passed through directly"),
|
|
59
71
|
}, async (params) => generateVideoHandler(params));
|
|
60
|
-
server.tool("generate_audio", `Generate audio
|
|
61
|
-
text: z.string().describe("Text to convert to speech"),
|
|
62
|
-
provider: z.string().optional().describe("Provider to use: openai, google. Auto-selects if omitted."),
|
|
63
|
-
voice: z.string().optional().describe("Voice name (provider-specific). OpenAI: alloy, ash, coral, echo, fable, nova, onyx, sage, shimmer. Google: Kore, Charon, Fenrir, Aoede, Puck, etc."),
|
|
72
|
+
server.tool("generate_audio", `Generate audio from text using AI. Supports text-to-speech and sound effects. Providers: openai, google, elevenlabs. ElevenLabs: use providerOptions.mode = "sound-effect" for sound effects. Available: ${providerNames}`, {
|
|
73
|
+
text: z.string().describe("Text to convert to speech, or a description of the sound effect to generate"),
|
|
74
|
+
provider: z.string().optional().describe("Provider to use: openai, google, elevenlabs. Auto-selects if omitted."),
|
|
75
|
+
voice: z.string().optional().describe("Voice name (provider-specific). OpenAI: alloy, ash, coral, echo, fable, nova, onyx, sage, shimmer. Google: Kore, Charon, Fenrir, Aoede, Puck, etc. ElevenLabs: voice ID."),
|
|
64
76
|
speed: z.number().optional().describe("Speech speed multiplier (OpenAI only): 0.25 to 4.0"),
|
|
65
77
|
format: z.string().optional().describe("Output format (OpenAI only): mp3, opus, aac, flac, wav, pcm"),
|
|
66
78
|
outputDirectory: z.string().optional().describe("Directory to save the generated file. Supports absolute or relative paths (resolved from cwd). Defaults to MEDIA_OUTPUT_DIR env var or cwd."),
|
|
67
79
|
providerOptions: z.record(z.string(), z.unknown()).optional().describe("Provider-specific parameters passed through directly"),
|
|
68
80
|
}, async (params) => generateAudioHandler(params));
|
|
81
|
+
server.tool("transcribe_audio", `Transcribe audio to text using AI (speech-to-text). Providers: openai (Whisper), elevenlabs (Scribe). Available: ${providerNames}`, {
|
|
82
|
+
audioPath: z.string().describe("Absolute path to the audio file to transcribe"),
|
|
83
|
+
provider: z.string().optional().describe("Provider to use: openai, elevenlabs. Auto-selects if omitted."),
|
|
84
|
+
language: z.string().optional().describe("Language code (e.g., 'en', 'fr', 'es') to hint the transcription language"),
|
|
85
|
+
providerOptions: z.record(z.string(), z.unknown()).optional().describe("Provider-specific parameters passed through directly"),
|
|
86
|
+
}, async (params) => transcribeAudioHandler(params));
|
|
69
87
|
server.tool("list_providers", "List all configured media generation providers and their capabilities", async () => listProvidersHandler());
|
|
70
88
|
return server;
|
|
71
89
|
}
|
|
@@ -5,7 +5,7 @@ export function buildListProvidersHandler(registry) {
|
|
|
5
5
|
return {
|
|
6
6
|
content: [{
|
|
7
7
|
type: "text",
|
|
8
|
-
text: "No providers configured. Set one or more API keys: OPENAI_API_KEY, XAI_API_KEY, GEMINI_API_KEY",
|
|
8
|
+
text: "No providers configured. Set one or more API keys: OPENAI_API_KEY, XAI_API_KEY, GEMINI_API_KEY, ELEVENLABS_API_KEY, BFL_API_KEY",
|
|
9
9
|
}],
|
|
10
10
|
};
|
|
11
11
|
}
|
|
@@ -19,6 +19,8 @@ export function buildListProvidersHandler(registry) {
|
|
|
19
19
|
caps.push("video");
|
|
20
20
|
if (p.capabilities.supportsAudioGeneration)
|
|
21
21
|
caps.push("audio");
|
|
22
|
+
if (p.capabilities.supportsTranscription)
|
|
23
|
+
caps.push("transcription");
|
|
22
24
|
return `- ${p.name}: ${caps.join(", ")}`;
|
|
23
25
|
});
|
|
24
26
|
return {
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import type { ProviderRegistry } from "../providers/registry.js";
/**
 * Builds the handler for the `transcribe_audio` tool.
 *
 * NOTE(review): this is a generated .d.ts — edit the TypeScript source, not this file.
 *
 * @param registry - Provider registry used to resolve a transcription-capable provider.
 * @returns An async handler taking the tool's parameters:
 *   - audioPath: path to the audio file to transcribe
 *   - provider: optional explicit provider name (auto-selected when omitted — presumably
 *     the first transcription-capable provider; confirm against the implementation)
 *   - language: optional language hint
 *   - providerOptions: provider-specific passthrough options
 *   The handler resolves to either an error result (isError: true with a text message)
 *   or a success result carrying the transcription text.
 */
export declare function buildTranscribeAudioHandler(registry: ProviderRegistry): (params: {
    audioPath: string;
    provider?: string;
    language?: string;
    providerOptions?: Record<string, unknown>;
}) => Promise<{
    isError: true;
    content: {
        type: "text";
        text: string;
    }[];
} | {
    content: {
        type: "text";
        text: string;
    }[];
    isError?: undefined;
}>;
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import { readMediaFile } from "../read-media-file.js";
|
|
2
|
+
import { sanitizeError } from "../errors.js";
|
|
3
|
+
export function buildTranscribeAudioHandler(registry) {
    // Human-readable list of providers that can transcribe ("none" when empty).
    const listTranscribers = () => registry.getTranscriptionProviders().map((p) => p.name).join(", ") || "none";
    // Uniform MCP error result shape.
    const fail = (text) => ({ isError: true, content: [{ type: "text", text }] });
    /**
     * Handler for the transcribe_audio tool.
     * Resolves a provider (explicit name, or first transcription-capable one),
     * verifies it supports transcription, reads the audio file, and returns the
     * transcribed text. All failures come back as isError results, never throws.
     */
    return async (params) => {
        let provider;
        if (params.provider) {
            provider = registry.getProvider(params.provider);
        }
        else {
            [provider] = registry.getTranscriptionProviders();
        }
        if (!provider) {
            const reason = params.provider
                ? `Provider "${params.provider}" is not configured or does not support transcription.`
                : "No transcription provider available.";
            return fail(`${reason} Available transcription providers: ${listTranscribers()}`);
        }
        // Guard against a provider that exists but lacks transcription support.
        if (!(provider.capabilities.supportsTranscription && provider.transcribeAudio)) {
            return fail(`Provider "${provider.name}" does not support transcription. Available transcription providers: ${listTranscribers()}`);
        }
        try {
            const media = await readMediaFile(params.audioPath);
            const result = await provider.transcribeAudio({
                audioData: media.data,
                audioMimeType: media.mimeType,
                language: params.language,
                providerOptions: params.providerOptions,
            });
            return { content: [{ type: "text", text: result.text }] };
        }
        catch (error) {
            // sanitizeError strips provider internals (e.g. API keys) from the message.
            return fail(`Transcription failed: ${sanitizeError(error)}`);
        }
    };
}
|