@r16t/multimodal-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +178 -0
- package/build/config.d.ts +10 -0
- package/build/config.js +33 -0
- package/build/errors.d.ts +1 -0
- package/build/errors.js +24 -0
- package/build/file-manager.d.ts +7 -0
- package/build/file-manager.js +37 -0
- package/build/index.d.ts +2 -0
- package/build/index.js +16 -0
- package/build/providers/google.d.ts +10 -0
- package/build/providers/google.js +105 -0
- package/build/providers/openai.d.ts +12 -0
- package/build/providers/openai.js +92 -0
- package/build/providers/polling.d.ts +6 -0
- package/build/providers/polling.js +14 -0
- package/build/providers/registry.d.ts +10 -0
- package/build/providers/registry.js +28 -0
- package/build/providers/types.d.ts +46 -0
- package/build/providers/types.js +1 -0
- package/build/providers/xai.d.ts +11 -0
- package/build/providers/xai.js +80 -0
- package/build/server.d.ts +3 -0
- package/build/server.js +58 -0
- package/build/tools/generate-audio.d.ts +22 -0
- package/build/tools/generate-audio.js +48 -0
- package/build/tools/generate-image.d.ts +21 -0
- package/build/tools/generate-image.js +35 -0
- package/build/tools/generate-video.d.ts +22 -0
- package/build/tools/generate-video.js +36 -0
- package/build/tools/list-providers.d.ts +7 -0
- package/build/tools/list-providers.js +27 -0
- package/package.json +53 -0
package/README.md
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
# multimodal-mcp
|
|
2
|
+
|
|
3
|
+
Multi-provider media generation MCP server. Generate images, videos, and audio from text prompts using OpenAI, xAI, and Google through a single unified interface.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- 🎨 **Image Generation** — Generate images via OpenAI (gpt-image-1), xAI (grok-imagine-image), or Google (imagen-4)
|
|
8
|
+
- 🎬 **Video Generation** — Generate videos via OpenAI (sora-2), xAI (grok-imagine-video), or Google (veo-3.1)
|
|
9
|
+
- 🔊 **Audio Generation** — Text-to-speech via OpenAI (tts-1) or Google (gemini-2.5-flash-preview-tts)
|
|
10
|
+
- 🔄 **Auto-Discovery** — Automatically detects configured providers from environment variables
|
|
11
|
+
- 🎯 **Provider Selection** — Auto-select a provider or explicitly choose one per request
|
|
12
|
+
- 📁 **File Output** — Saves all generated media to disk with descriptive filenames
|
|
13
|
+
|
|
14
|
+
## Quick Start
|
|
15
|
+
|
|
16
|
+
### Claude Desktop
|
|
17
|
+
|
|
18
|
+
Add to your Claude Desktop config (on macOS: `~/Library/Application Support/Claude/claude_desktop_config.json`):
|
|
19
|
+
|
|
20
|
+
```json
|
|
21
|
+
{
|
|
22
|
+
"mcpServers": {
|
|
23
|
+
"multimodal-mcp": {
|
|
24
|
+
"command": "npx",
|
|
25
|
+
"args": ["@r16t/multimodal-mcp"],
|
|
26
|
+
"env": {
|
|
27
|
+
"OPENAI_API_KEY": "sk-...",
|
|
28
|
+
"XAI_API_KEY": "xai-...",
|
|
29
|
+
"GOOGLE_API_KEY": "AIza...",
|
|
30
|
+
"MEDIA_OUTPUT_DIR": "/tmp/media"
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
You only need to set keys for the providers you want to use. At least one is required.
|
|
38
|
+
|
|
39
|
+
### Cursor / Other MCP Clients
|
|
40
|
+
|
|
41
|
+
```json
|
|
42
|
+
{
|
|
43
|
+
"mcpServers": {
|
|
44
|
+
"multimodal-mcp": {
|
|
45
|
+
"command": "npx",
|
|
46
|
+
"args": ["@r16t/multimodal-mcp"],
|
|
47
|
+
"env": {
|
|
48
|
+
"OPENAI_API_KEY": "sk-..."
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Environment Variables
|
|
56
|
+
|
|
57
|
+
| Variable | Required | Description |
|
|
58
|
+
|----------|----------|-------------|
|
|
59
|
+
| `OPENAI_API_KEY` | At least one provider key | OpenAI API key — enables image, video, and audio generation via gpt-image-1, sora-2, and tts-1 |
|
|
60
|
+
| `XAI_API_KEY` | At least one provider key | xAI API key — enables image and video generation via grok-imagine-image and grok-imagine-video |
|
|
61
|
+
| `GOOGLE_API_KEY` | At least one provider key | Google API key — enables image, video, and audio generation via imagen-4, veo-3.1, and gemini-2.5-flash-preview-tts |
|
|
62
|
+
| `GEMINI_API_KEY` | — | Alias for `GOOGLE_API_KEY`; either name is accepted, and `GOOGLE_API_KEY` takes precedence when both are set |
|
|
63
|
+
| `MEDIA_OUTPUT_DIR` | No | Directory for saved media files. Defaults to the system temp directory |
|
|
64
|
+
|
|
65
|
+
## Available Tools
|
|
66
|
+
|
|
67
|
+
### `generate_image`
|
|
68
|
+
|
|
69
|
+
Generate an image from a text prompt.
|
|
70
|
+
|
|
71
|
+
| Parameter | Type | Required | Description |
|
|
72
|
+
|-----------|------|----------|-------------|
|
|
73
|
+
| `prompt` | string | Yes | Text description of the image to generate |
|
|
74
|
+
| `provider` | string | No | Provider to use: `openai`, `xai`, `google`. Auto-selects if omitted |
|
|
75
|
+
| `aspectRatio` | string | No | Aspect ratio: `1:1`, `16:9`, `9:16`, `4:3`, `3:4` |
|
|
76
|
+
| `quality` | string | No | Quality level: `low`, `standard`, `high` |
|
|
77
|
+
| `providerOptions` | object | No | Provider-specific parameters passed through directly |
|
|
78
|
+
|
|
79
|
+
### `generate_video`
|
|
80
|
+
|
|
81
|
+
Generate a video from a text prompt. Video generation is asynchronous and may take several minutes.
|
|
82
|
+
|
|
83
|
+
| Parameter | Type | Required | Description |
|
|
84
|
+
|-----------|------|----------|-------------|
|
|
85
|
+
| `prompt` | string | Yes | Text description of the video to generate |
|
|
86
|
+
| `provider` | string | No | Provider to use: `openai`, `xai`, `google`. Auto-selects if omitted |
|
|
87
|
+
| `duration` | number | No | Video duration in seconds (provider limits apply) |
|
|
88
|
+
| `aspectRatio` | string | No | Aspect ratio: `16:9`, `9:16`, `1:1` |
|
|
89
|
+
| `resolution` | string | No | Resolution: `480p`, `720p`, `1080p` |
|
|
90
|
+
| `providerOptions` | object | No | Provider-specific parameters passed through directly |
|
|
91
|
+
|
|
92
|
+
### `generate_audio`
|
|
93
|
+
|
|
94
|
+
Generate audio (text-to-speech) from text. Audio generation is synchronous.
|
|
95
|
+
|
|
96
|
+
| Parameter | Type | Required | Description |
|
|
97
|
+
|-----------|------|----------|-------------|
|
|
98
|
+
| `text` | string | Yes | Text to convert to speech |
|
|
99
|
+
| `provider` | string | No | Provider to use: `openai`, `google`. Auto-selects if omitted |
|
|
100
|
+
| `voice` | string | No | Voice name (provider-specific). OpenAI: `alloy`, `ash`, `coral`, `echo`, `fable`, `nova`, `onyx`, `sage`, `shimmer`. Google: `Kore`, `Charon`, `Fenrir`, `Aoede`, `Puck`, etc. |
|
|
101
|
+
| `speed` | number | No | Speech speed multiplier (OpenAI only): `0.25` to `4.0` |
|
|
102
|
+
| `format` | string | No | Output format (OpenAI only): `mp3`, `opus`, `aac`, `flac`, `wav`, `pcm` |
|
|
103
|
+
| `providerOptions` | object | No | Provider-specific parameters passed through directly |
|
|
104
|
+
|
|
105
|
+
### `list_providers`
|
|
106
|
+
|
|
107
|
+
List all configured media generation providers and their capabilities. Takes no parameters.
|
|
108
|
+
|
|
109
|
+
## Provider Capabilities
|
|
110
|
+
|
|
111
|
+
| Provider | Image | Video | Audio | Image Model | Video Model | Audio Model |
|
|
112
|
+
|----------|:-----:|:-----:|:-----:|-------------|-------------|-------------|
|
|
113
|
+
| OpenAI | ✅ | ✅ | ✅ | gpt-image-1 | sora-2 | tts-1 |
|
|
114
|
+
| xAI | ✅ | ✅ | — | grok-imagine-image | grok-imagine-video | — |
|
|
115
|
+
| Google | ✅ | ✅ | ✅ | imagen-4 | veo-3.1 | gemini-2.5-flash-preview-tts |
|
|
116
|
+
|
|
117
|
+
### Image Aspect Ratios
|
|
118
|
+
|
|
119
|
+
| Provider | 1:1 | 16:9 | 9:16 | 4:3 | 3:4 |
|
|
120
|
+
|----------|:---:|:----:|:----:|:---:|:---:|
|
|
121
|
+
| OpenAI | ✅ | ✅ | ✅ | ✅ | ✅ |
|
|
122
|
+
| xAI | ✅ | ✅ | ✅ | ✅ | ✅ |
|
|
123
|
+
| Google | ✅ | ✅ | ✅ | ✅ | ✅ |
|
|
124
|
+
|
|
125
|
+
### Video Aspect Ratios & Resolutions
|
|
126
|
+
|
|
127
|
+
| Provider | 16:9 | 9:16 | 1:1 | 480p | 720p | 1080p |
|
|
128
|
+
|----------|:----:|:----:|:---:|:----:|:----:|:-----:|
|
|
129
|
+
| OpenAI | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
|
130
|
+
| xAI | ✅ | ✅ | ✅ | — | ✅ | ✅ |
|
|
131
|
+
| Google | ✅ | ✅ | — | — | ✅ | ✅ |
|
|
132
|
+
|
|
133
|
+
### Audio Formats
|
|
134
|
+
|
|
135
|
+
| Provider | mp3 | opus | aac | flac | wav | pcm |
|
|
136
|
+
|----------|:---:|:----:|:---:|:----:|:---:|:---:|
|
|
137
|
+
| OpenAI | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
|
138
|
+
| Google | — | — | — | — | ✅ | — |
|
|
139
|
+
|
|
140
|
+
## Troubleshooting
|
|
141
|
+
|
|
142
|
+
### No providers configured
|
|
143
|
+
|
|
144
|
+
```
|
|
145
|
+
[config] No provider API keys detected
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
Set at least one of `OPENAI_API_KEY`, `XAI_API_KEY`, or `GOOGLE_API_KEY` in the MCP server's `env` block.
|
|
149
|
+
|
|
150
|
+
### Provider not available for requested media type
|
|
151
|
+
|
|
152
|
+
All three providers support image and video generation. Audio generation (text-to-speech) is supported by OpenAI and Google. xAI does not currently offer a standalone TTS API. If you specify a `provider` that isn't configured (no API key) or doesn't support the requested media type, you'll receive an error. Omit the `provider` parameter to auto-select from configured providers.
|
|
153
|
+
|
|
154
|
+
### Video generation timeout
|
|
155
|
+
|
|
156
|
+
Video generation polls for up to 10 minutes. If your video hasn't completed in that window, the request will fail with a timeout error. Try a shorter `duration` or a simpler `prompt`.
|
|
157
|
+
|
|
158
|
+
### xAI image generation returned no data
|
|
159
|
+
|
|
160
|
+
This indicates the xAI API returned an empty response. Check that your `XAI_API_KEY` is valid and that your prompt does not violate xAI content policies.
|
|
161
|
+
|
|
162
|
+
### Google image/video generation failed: 403
|
|
163
|
+
|
|
164
|
+
Verify your `GOOGLE_API_KEY` has the Generative Language API enabled in Google Cloud Console.
|
|
165
|
+
|
|
166
|
+
## Development
|
|
167
|
+
|
|
168
|
+
```bash
|
|
169
|
+
npm run build # Compile TypeScript to build/
|
|
170
|
+
npm test # Run tests with Vitest
|
|
171
|
+
npm run lint # Lint and auto-fix with ESLint
|
|
172
|
+
npm run typecheck # Type-check without emitting
|
|
173
|
+
npm run dev # Watch mode for TypeScript compilation
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
## License
|
|
177
|
+
|
|
178
|
+
MIT
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
/**
 * Zod schema for the server's runtime configuration.
 * The three provider keys are optional; `outputDirectory` is always a string.
 */
declare const configSchema: z.ZodObject<{
    openaiApiKey: z.ZodOptional<z.ZodString>;
    xaiApiKey: z.ZodOptional<z.ZodString>;
    googleApiKey: z.ZodOptional<z.ZodString>;
    outputDirectory: z.ZodString;
}, z.core.$strip>;
/** Validated configuration object, inferred from `configSchema`. */
export type Config = z.infer<typeof configSchema>;
/**
 * Builds and validates the configuration.
 * @returns The parsed configuration object.
 */
export declare function loadConfig(): Config;
|
|
10
|
+
export {};
|
package/build/config.js
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
import { tmpdir } from "node:os";
|
|
3
|
+
/**
 * Shape of the runtime configuration: one optional API key per provider and
 * a mandatory output directory.
 */
const configSchema = z.object({
    openaiApiKey: z.string().optional(),
    xaiApiKey: z.string().optional(),
    googleApiKey: z.string().optional(),
    outputDirectory: z.string(),
});

/**
 * Picks the Google key from the environment.
 * `GOOGLE_API_KEY` wins over `GEMINI_API_KEY`; empty strings count as unset.
 *
 * @returns {string | undefined} The key, or `undefined` when neither variable is set.
 */
function resolveGoogleKey() {
    const { GOOGLE_API_KEY, GEMINI_API_KEY } = process.env;
    return GOOGLE_API_KEY || GEMINI_API_KEY || undefined;
}

/**
 * Reads provider keys and the output directory from `process.env`, validates
 * them against `configSchema`, and reports which providers were detected.
 *
 * The report is written with `console.error`, i.e. to stderr.
 *
 * @returns The validated configuration object.
 * @throws If `configSchema.parse` rejects the assembled values.
 */
export function loadConfig() {
    const env = process.env;
    const config = configSchema.parse({
        openaiApiKey: env.OPENAI_API_KEY || undefined,
        xaiApiKey: env.XAI_API_KEY || undefined,
        googleApiKey: resolveGoogleKey(),
        // Falls back to the OS temp directory when MEDIA_OUTPUT_DIR is unset or empty.
        outputDirectory: env.MEDIA_OUTPUT_DIR || tmpdir(),
    });

    // Label/key pairs in the order they are reported.
    const detected = [
        ["OpenAI", config.openaiApiKey],
        ["xAI", config.xaiApiKey],
        ["Google", config.googleApiKey],
    ]
        .filter(([, key]) => Boolean(key))
        .map(([label]) => label);

    const summary = detected.length > 0
        ? `[config] Detected providers: ${detected.join(", ")}`
        : "[config] No provider API keys detected";
    console.error(summary);

    return config;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/**
 * Turns a thrown value into a message string that is safe to show to a client.
 * @param error The caught value; may be an `Error`, a string, or anything else.
 * @returns The scrubbed message, or `"Unknown error"` when no message is available.
 */
export declare function sanitizeError(error: unknown): string;
|
package/build/errors.js
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/** Patterns for credential-looking substrings that must never be echoed back. */
const API_KEY_PATTERNS = [
    /sk-[a-zA-Z0-9_-]{10,}/g,
    /xai-[a-zA-Z0-9_-]{10,}/g,
    /AIzaSy[a-zA-Z0-9_-]{10,}/g,
    /key=[a-zA-Z0-9_-]{20,}/g,
];

/** Matches the frames of a V8 stack trace ("\n    at fn (file:1:2)"). */
const STACK_FRAME_PATTERN = /\n\s+at .+/g;

/**
 * Matches absolute paths to JS/TS source files, with optional `:line:col`.
 * The negative lookahead stops the extension from matching the prefix of a
 * longer one (e.g. `.json`, `.tsv`), which would leave a mangled remainder.
 */
const SOURCE_PATH_PATTERN = /\/[^\s:]+\.[cm]?[tj]sx?(?![\w])(:\d+)?(:\d+)?/g;

/**
 * Converts a thrown value into a message string with secrets and internal
 * details removed.
 *
 * - API-key-looking substrings become `[REDACTED]`.
 * - Stack-trace frames are dropped.
 * - Absolute paths to JS/TS source files become `[internal]`.
 *
 * @param {unknown} error The caught value.
 * @returns {string} The scrubbed message. `"Unknown error"` is returned when
 *   the value is neither an `Error` nor a non-empty string, or when nothing is
 *   left after scrubbing.
 */
export function sanitizeError(error) {
    let message;
    if (error instanceof Error) {
        message = error.message;
    }
    else if (typeof error === "string" && error.length > 0) {
        message = error;
    }
    else {
        return "Unknown error";
    }

    for (const pattern of API_KEY_PATTERNS) {
        message = message.replace(pattern, "[REDACTED]");
    }
    message = message.replace(STACK_FRAME_PATTERN, "");
    message = message.replace(SOURCE_PATH_PATTERN, "[internal]");

    return message.trim() || "Unknown error";
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { GeneratedMedia } from "./providers/types.js";
|
|
2
|
+
/** Writes generated media to files inside a single output directory. */
export declare class FileManager {
    /** Directory that receives every saved file. */
    private readonly outputDirectory;
    /**
     * @param outputDirectory Directory where media files are written.
     */
    constructor(outputDirectory: string);
    /**
     * Writes the media bytes to a new file.
     * @param media Generated media to persist.
     * @param type Media category; used in the filename and to pick the extension.
     * @returns Path of the written file.
     */
    save(media: GeneratedMedia, type: "image" | "video" | "audio"): Promise<string>;
    /** Chooses a file extension from the media category and MIME type. */
    private getExtension;
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
2
|
+
import { join, resolve } from "node:path";
|
|
3
|
+
import { randomBytes } from "node:crypto";
|
|
4
|
+
/** File extension for each audio MIME type (lower-case, parameters removed). */
const AUDIO_EXTENSIONS_BY_MIME_TYPE = Object.freeze({
    "audio/mpeg": "mp3",
    "audio/mp3": "mp3",
    "audio/opus": "opus",
    "audio/aac": "aac",
    "audio/flac": "flac",
    "audio/wav": "wav",
    "audio/wave": "wav",
    "audio/x-wav": "wav",
    "audio/pcm": "pcm",
    // Raw 16-bit linear PCM, e.g. "audio/L16;codec=pcm;rate=24000".
    "audio/l16": "pcm",
});

/**
 * Reduces a MIME type to its bare lower-case `type/subtype`, dropping any
 * `;param=value` suffix. Non-string input yields an empty string.
 */
function normalizeMimeType(mimeType) {
    if (typeof mimeType !== "string") {
        return "";
    }
    return mimeType.split(";", 1)[0].trim().toLowerCase();
}

/** Writes generated media to files inside a single output directory. */
export class FileManager {
    outputDirectory;

    /**
     * @param {string} outputDirectory Directory for saved files; resolved to an
     *   absolute path once, at construction time.
     */
    constructor(outputDirectory) {
        this.outputDirectory = resolve(outputDirectory);
    }

    /**
     * Writes `media.data` to a new file named
     * `<type>-<timestamp>-<provider>-<random>.<extension>`.
     *
     * @param {{ data: Buffer, mimeType: string, metadata: Record<string, unknown> }} media
     *   Generated media to persist.
     * @param {"image" | "video" | "audio"} type Media category.
     * @returns {Promise<string>} Absolute path of the written file.
     * @throws If the directory cannot be created or the file cannot be written.
     */
    async save(media, type) {
        await mkdir(this.outputDirectory, { recursive: true });

        const extension = this.getExtension(type, media.mimeType);
        // The provider label becomes part of the filename, so strip anything
        // (path separators, dots, spaces) that could change where the file lands.
        const provider = String(media.metadata?.provider || "unknown").replace(/[^a-zA-Z0-9_-]/g, "_");
        const timestamp = Date.now();
        // Random suffix keeps names unique when two files share a millisecond.
        const random = randomBytes(4).toString("hex");

        const filename = `${type}-${timestamp}-${provider}-${random}.${extension}`;
        const filePath = join(this.outputDirectory, filename);
        await writeFile(filePath, media.data);
        return filePath;
    }

    /**
     * Chooses a file extension.
     *
     * - video: always `mp4`.
     * - audio: looked up by MIME type (case-insensitive, parameters ignored);
     *   unknown types fall back to `mp3`.
     * - image: the subtype of an `image/*` MIME type, else `png`.
     *
     * @param {"image" | "video" | "audio"} type Media category.
     * @param {string} mimeType MIME type reported by the provider.
     * @returns {string} Extension without the leading dot.
     */
    getExtension(type, mimeType) {
        if (type === "video") {
            return "mp4";
        }
        const normalized = normalizeMimeType(mimeType);
        if (type === "audio") {
            return Object.hasOwn(AUDIO_EXTENSIONS_BY_MIME_TYPE, normalized)
                ? AUDIO_EXTENSIONS_BY_MIME_TYPE[normalized]
                : "mp3";
        }
        const match = normalized.match(/image\/(\w+)/);
        return match ? match[1] : "png";
    }
}
|
package/build/index.d.ts
ADDED
package/build/index.js
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
3
|
+
import { loadConfig } from "./config.js";
|
|
4
|
+
import { createServer } from "./server.js";
|
|
5
|
+
/**
 * Entry point: loads configuration, builds the MCP server, and attaches it
 * to a stdio transport. Progress is logged with `console.error`.
 */
async function main() {
    console.error("[multimodal-mcp] Starting server...");
    const server = createServer(loadConfig());
    await server.connect(new StdioServerTransport());
    console.error("[multimodal-mcp] Server connected and ready");
}

// Any startup failure is logged and turned into a non-zero exit code.
main().catch((error) => {
    console.error("[multimodal-mcp] Fatal error:", error);
    process.exit(1);
});
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type { MediaProvider, ProviderCapabilities, ImageParams, VideoParams, AudioParams, GeneratedMedia } from "./types.js";
|
|
2
|
+
/** Media provider backed by Google's Generative Language API. */
export declare class GoogleProvider implements MediaProvider {
    /** Registry key for this provider. */
    readonly name = "google";
    /** Static description of what this provider supports. */
    readonly capabilities: ProviderCapabilities;
    /** API key sent with every request. */
    private apiKey;
    /**
     * @param apiKey Google API key.
     */
    constructor(apiKey: string);
    /** Generates an image from a text prompt. */
    generateImage(params: ImageParams): Promise<GeneratedMedia>;
    /** Generates a video from a text prompt; resolves once the job has finished. */
    generateVideo(params: VideoParams): Promise<GeneratedMedia>;
    /** Generates speech audio from text. */
    generateAudio(params: AudioParams): Promise<GeneratedMedia>;
}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import { pollForCompletion } from "./polling.js";
|
|
2
|
+
const GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";

/**
 * Builds request headers carrying the API key.
 * The key travels in the `x-goog-api-key` header instead of a `?key=` query
 * parameter so it never becomes part of a URL that may be logged or echoed
 * in an error message.
 */
function googleHeaders(apiKey, extra = {}) {
    return { "x-goog-api-key": apiKey, ...extra };
}

/** True when `uri` is an https URL on googleapis.com (or a subdomain). */
function isGoogleApiUrl(uri) {
    try {
        const { protocol, hostname } = new URL(uri);
        return protocol === "https:" && (hostname === "googleapis.com" || hostname.endsWith(".googleapis.com"));
    }
    catch {
        return false;
    }
}

/**
 * Media provider backed by Google's Generative Language API
 * (image: imagen-4, video: veo-3.1, audio: gemini-2.5-flash-preview-tts).
 *
 * NOTE(review): the request/response field names used below (`generateImages`,
 * `config`, `generatedImages`, `response.videos`) are kept exactly as they
 * were; confirm them against the current REST reference.
 */
export class GoogleProvider {
    name = "google";
    capabilities = {
        supportsImageGeneration: true,
        supportsVideoGeneration: true,
        supportsAudioGeneration: true,
        supportedImageAspectRatios: ["1:1", "16:9", "9:16", "4:3", "3:4"],
        supportedVideoAspectRatios: ["16:9", "9:16"],
        supportedVideoResolutions: ["720p", "1080p"],
        supportedAudioFormats: ["wav"],
        maxVideoDurationSeconds: 8,
    };
    apiKey;

    /**
     * @param {string} apiKey Google API key sent with every request.
     */
    constructor(apiKey) {
        this.apiKey = apiKey;
    }

    /**
     * Generates an image from a text prompt.
     *
     * @param {{ prompt: string, aspectRatio: string, quality: string, providerOptions?: Record<string, unknown> }} params
     *   `providerOptions` is merged into the request `config` and may override `aspectRatio`.
     * @returns {Promise<{ data: Buffer, mimeType: string, metadata: Record<string, unknown> }>}
     * @throws {Error} On a non-2xx response or when the response carries no image bytes.
     */
    async generateImage(params) {
        const response = await fetch(`${GEMINI_BASE_URL}/models/imagen-4:generateImages`, {
            method: "POST",
            headers: googleHeaders(this.apiKey, { "Content-Type": "application/json" }),
            body: JSON.stringify({
                prompt: params.prompt,
                config: {
                    aspectRatio: params.aspectRatio,
                    ...params.providerOptions,
                },
            }),
        });
        if (!response.ok) {
            throw new Error(`Google image generation failed: ${response.status}`);
        }
        const result = await response.json();
        const base64 = result?.generatedImages?.[0]?.image?.bytesBase64Encoded;
        if (!base64) {
            throw new Error("Google image generation returned no image data");
        }
        return {
            data: Buffer.from(base64, "base64"),
            mimeType: "image/png",
            metadata: { model: "imagen-4", provider: "google" },
        };
    }

    /**
     * Generates a video: submits a long-running operation, polls it every
     * 5 seconds for up to 10 minutes, then downloads the result.
     *
     * `params.resolution` is not forwarded by this method.
     *
     * @param {{ prompt: string, duration: number, aspectRatio: string, resolution: string, providerOptions?: Record<string, unknown> }} params
     * @returns {Promise<{ data: Buffer, mimeType: string, metadata: Record<string, unknown> }>}
     * @throws {Error} On any non-2xx response, when the operation reports an
     *   error, when no video URI is returned, or when polling times out.
     */
    async generateVideo(params) {
        const submitResponse = await fetch(`${GEMINI_BASE_URL}/models/veo-3.1:predictLongRunning`, {
            method: "POST",
            headers: googleHeaders(this.apiKey, { "Content-Type": "application/json" }),
            body: JSON.stringify({
                prompt: params.prompt,
                config: {
                    aspectRatio: params.aspectRatio,
                    durationSeconds: params.duration,
                    ...params.providerOptions,
                },
            }),
        });
        if (!submitResponse.ok) {
            throw new Error(`Google video generation failed: ${submitResponse.status}`);
        }
        const operation = await submitResponse.json();
        if (!operation?.name) {
            throw new Error("Google video generation returned no operation name");
        }

        const result = await pollForCompletion(async () => {
            const statusResponse = await fetch(`${GEMINI_BASE_URL}/${operation.name}`, {
                headers: googleHeaders(this.apiKey),
            });
            // Fail fast instead of polling an error payload until the timeout.
            if (!statusResponse.ok) {
                throw new Error(`Google video generation failed: ${statusResponse.status}`);
            }
            return statusResponse.json();
        }, (status) => status.done === true, { timeoutMs: 600_000, intervalMs: 5_000 });

        // A finished long-running operation carries either `response` or `error`.
        if (result.error) {
            throw new Error(`Google video generation failed: ${result.error.message ?? "unknown error"}`);
        }
        const videoUri = result.response?.videos?.[0]?.uri;
        if (!videoUri) {
            throw new Error("Google video generation returned no video data");
        }

        // Only attach the key when the download really targets a Google API host.
        const downloadInit = isGoogleApiUrl(videoUri) ? { headers: googleHeaders(this.apiKey) } : undefined;
        const videoResponse = await fetch(videoUri, downloadInit);
        if (!videoResponse.ok) {
            throw new Error(`Google video download failed: ${videoResponse.status}`);
        }
        const data = Buffer.from(await videoResponse.arrayBuffer());
        return {
            data,
            mimeType: "video/mp4",
            metadata: { model: "veo-3.1", provider: "google", operationName: operation.name },
        };
    }

    /**
     * Generates speech audio from text.
     *
     * @param {{ text: string, voice?: string, speed?: number, format?: string, providerOptions?: Record<string, unknown> }} params
     *   `voice` defaults to `"Kore"`. `speed` and `format` are not used by this provider.
     *   `providerOptions` is merged into `generationConfig`.
     * @returns {Promise<{ data: Buffer, mimeType: string, metadata: Record<string, unknown> }>}
     *   `mimeType` is whatever the API reports, or `"audio/wav"` when it reports none.
     * @throws {Error} On a non-2xx response or when no audio part is returned.
     */
    async generateAudio(params) {
        const voice = params.voice ?? "Kore";
        const model = "gemini-2.5-flash-preview-tts";
        const response = await fetch(`${GEMINI_BASE_URL}/models/${model}:generateContent`, {
            method: "POST",
            headers: googleHeaders(this.apiKey, { "Content-Type": "application/json" }),
            body: JSON.stringify({
                contents: [{ parts: [{ text: params.text }] }],
                generationConfig: {
                    response_modalities: ["AUDIO"],
                    speech_config: {
                        voiceConfig: {
                            prebuiltVoiceConfig: { voiceName: voice },
                        },
                    },
                    ...params.providerOptions,
                },
            }),
        });
        if (!response.ok) {
            throw new Error(`Google audio generation failed: ${response.status}`);
        }
        const result = await response.json();
        const audioPart = result?.candidates?.[0]?.content?.parts?.find((part) => part.inlineData !== undefined);
        if (!audioPart?.inlineData) {
            throw new Error("Google audio generation returned no audio data");
        }
        return {
            data: Buffer.from(audioPart.inlineData.data, "base64"),
            mimeType: audioPart.inlineData.mimeType || "audio/wav",
            metadata: { model, provider: "google", voice },
        };
    }
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import type { MediaProvider, ProviderCapabilities, ImageParams, VideoParams, AudioParams, GeneratedMedia } from "./types.js";
|
|
2
|
+
/** Media provider backed by the OpenAI API. */
export declare class OpenAIProvider implements MediaProvider {
    /** Registry key for this provider. */
    readonly name = "openai";
    /** Static description of what this provider supports. */
    readonly capabilities: ProviderCapabilities;
    /** OpenAI SDK client used for all requests. */
    private client;
    /**
     * @param apiKey OpenAI API key.
     */
    constructor(apiKey: string);
    /** Generates an image from a text prompt. */
    generateImage(params: ImageParams): Promise<GeneratedMedia>;
    /** Generates a video from a text prompt; resolves once the job has finished. */
    generateVideo(params: VideoParams): Promise<GeneratedMedia>;
    /** Generates speech audio from text. */
    generateAudio(params: AudioParams): Promise<GeneratedMedia>;
    /** Maps an audio format name to its MIME type. */
    private audioFormatToMimeType;
    /** Maps an aspect ratio to an image size string. */
    private mapAspectRatioToSize;
}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import OpenAI from "openai";
|
|
2
|
+
import { pollForCompletion } from "./polling.js";
|
|
3
|
+
/**
 * Aspect ratio -> image size.
 * gpt-image-1 only accepts 1024x1024, 1536x1024 and 1024x1536, so 4:3 and 3:4
 * are mapped to the closest supported landscape/portrait size.
 */
const ASPECT_RATIO_TO_SIZE = {
    "1:1": "1024x1024",
    "16:9": "1536x1024",
    "9:16": "1024x1536",
    "4:3": "1536x1024",
    "3:4": "1024x1536",
};

/** Tool-level quality names -> the quality values gpt-image-1 accepts. */
const QUALITY_TO_OPENAI = new Map([
    ["low", "low"],
    ["standard", "medium"],
    ["high", "high"],
]);

/**
 * Media provider backed by the OpenAI API
 * (image: gpt-image-1, video: sora-2, audio: tts-1).
 */
export class OpenAIProvider {
    name = "openai";
    capabilities = {
        supportsImageGeneration: true,
        supportsVideoGeneration: true,
        supportsAudioGeneration: true,
        supportedImageAspectRatios: ["1:1", "16:9", "9:16", "4:3", "3:4"],
        supportedVideoAspectRatios: ["16:9", "9:16", "1:1"],
        supportedVideoResolutions: ["480p", "720p", "1080p"],
        supportedAudioFormats: ["mp3", "opus", "aac", "flac", "wav", "pcm"],
        maxVideoDurationSeconds: 20,
    };
    client;

    /**
     * @param {string} apiKey OpenAI API key.
     */
    constructor(apiKey) {
        this.client = new OpenAI({ apiKey });
    }

    /**
     * Generates an image from a text prompt.
     *
     * `response_format` is deliberately not sent: gpt-image-1 always returns
     * base64 data and does not accept that parameter.
     *
     * @param {{ prompt: string, aspectRatio: string, quality: string, providerOptions?: Record<string, unknown> }} params
     *   Unknown `quality` values become `"auto"`; unknown aspect ratios become 1024x1024.
     *   `providerOptions` is spread last and may override any request field.
     * @returns {Promise<{ data: Buffer, mimeType: string, metadata: Record<string, unknown> }>}
     * @throws {Error} When the response carries no base64 image data.
     */
    async generateImage(params) {
        const response = await this.client.images.generate({
            model: "gpt-image-1",
            prompt: params.prompt,
            size: this.mapAspectRatioToSize(params.aspectRatio),
            quality: QUALITY_TO_OPENAI.get(params.quality) ?? "auto",
            ...params.providerOptions,
        });
        const base64Data = response?.data?.[0]?.b64_json;
        if (!base64Data) {
            throw new Error("OpenAI image generation returned no data");
        }
        return {
            data: Buffer.from(base64Data, "base64"),
            mimeType: "image/png",
            metadata: { model: "gpt-image-1", provider: "openai" },
        };
    }

    /**
     * Generates a video: creates a job, polls it every 5 seconds for up to
     * 10 minutes, then downloads the result.
     *
     * `params.aspectRatio` and `params.resolution` are not forwarded by this method.
     *
     * NOTE(review): the `duration` request field and the `url` result field are
     * kept as they were; confirm both against the installed SDK's videos API.
     *
     * @param {{ prompt: string, duration: number, aspectRatio: string, resolution: string, providerOptions?: Record<string, unknown> }} params
     * @returns {Promise<{ data: Buffer, mimeType: string, metadata: Record<string, unknown> }>}
     * @throws {Error} When the job fails, no download URL is returned, the
     *   download fails, or polling times out.
     */
    async generateVideo(params) {
        const videos = this.client.videos;
        const job = await videos.create({
            model: "sora-2",
            prompt: params.prompt,
            duration: params.duration,
            ...params.providerOptions,
        });

        // Treat "failed" as terminal too, so a failed job is reported at once
        // rather than being polled until the timeout.
        const result = await pollForCompletion(
            () => videos.retrieve(job.id),
            (status) => status.status === "completed" || status.status === "failed",
            { timeoutMs: 600_000, intervalMs: 5_000 },
        );
        if (result.status === "failed") {
            const detail = result.error?.message ? `: ${result.error.message}` : "";
            throw new Error(`OpenAI video generation failed${detail}`);
        }

        const videoUrl = result.url;
        if (!videoUrl) {
            throw new Error("OpenAI video generation returned no video URL");
        }
        const videoResponse = await fetch(videoUrl);
        if (!videoResponse.ok) {
            throw new Error(`OpenAI video download failed: ${videoResponse.status}`);
        }
        const data = Buffer.from(await videoResponse.arrayBuffer());
        return {
            data,
            mimeType: "video/mp4",
            metadata: { model: "sora-2", provider: "openai", jobId: job.id },
        };
    }

    /**
     * Generates speech audio from text.
     *
     * @param {{ text: string, voice?: string, speed?: number, format?: string, providerOptions?: Record<string, unknown> }} params
     *   Defaults: `voice` "alloy", `format` "mp3", `speed` 1.0.
     * @returns {Promise<{ data: Buffer, mimeType: string, metadata: Record<string, unknown> }>}
     */
    async generateAudio(params) {
        const format = params.format ?? "mp3";
        const voice = params.voice ?? "alloy";
        const response = await this.client.audio.speech.create({
            model: "tts-1",
            input: params.text,
            voice: voice,
            response_format: format,
            speed: params.speed ?? 1.0,
            ...params.providerOptions,
        });
        const data = Buffer.from(await response.arrayBuffer());
        return {
            data,
            mimeType: this.audioFormatToMimeType(format),
            metadata: { model: "tts-1", provider: "openai", voice, format },
        };
    }

    /**
     * Maps an audio format name to its MIME type.
     *
     * @param {string} format One of mp3, opus, aac, flac, wav, pcm.
     * @returns {string} The MIME type; unknown formats yield `"audio/mpeg"`.
     */
    audioFormatToMimeType(format) {
        const mimeTypes = {
            mp3: "audio/mpeg",
            opus: "audio/opus",
            aac: "audio/aac",
            flac: "audio/flac",
            wav: "audio/wav",
            pcm: "audio/pcm",
        };
        return mimeTypes[format] ?? "audio/mpeg";
    }

    /**
     * Maps an aspect ratio to an image size string.
     *
     * @param {string} aspectRatio e.g. "16:9".
     * @returns {string} The size; unknown ratios yield `"1024x1024"`.
     */
    mapAspectRatioToSize(aspectRatio) {
        return ASPECT_RATIO_TO_SIZE[aspectRatio] ?? "1024x1024";
    }
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
 * Repeatedly calls `checkStatus` until `isComplete` accepts its result or the
 * time budget runs out.
 *
 * The wait between attempts is clamped to the time remaining, so the call
 * never overruns `timeoutMs` by a whole interval before reporting the timeout.
 *
 * @template T
 * @param {() => Promise<T>} checkStatus Fetches the current status.
 * @param {(result: T) => boolean} isComplete Returns true for a terminal status.
 * @param {{ timeoutMs: number, intervalMs: number, onPoll?: (attempt: number) => void }} options
 *   `onPoll`, when given, is called after every status check with the
 *   1-based attempt number.
 * @returns {Promise<T>} The first result accepted by `isComplete`.
 * @throws {Error} `Generation timed out after N seconds` when the budget is
 *   exhausted; anything `checkStatus` throws is propagated unchanged.
 */
export async function pollForCompletion(checkStatus, isComplete, options) {
    const deadline = Date.now() + options.timeoutMs;
    let attempt = 0;
    while (Date.now() < deadline) {
        const result = await checkStatus();
        attempt += 1;
        options.onPoll?.(attempt);
        if (isComplete(result)) {
            return result;
        }
        const remainingMs = deadline - Date.now();
        if (remainingMs <= 0) {
            break;
        }
        const waitMs = Math.min(options.intervalMs, remainingMs);
        await new Promise((resolve) => setTimeout(resolve, waitMs));
    }
    throw new Error(`Generation timed out after ${options.timeoutMs / 1000} seconds`);
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type { MediaProvider, ProviderInfo } from "./types.js";
|
|
2
|
+
/** Keeps track of the registered media providers, keyed by name. */
export declare class ProviderRegistry {
    /** Registered providers. */
    private providers;
    /** Adds a provider, replacing any existing one with the same name. */
    register(provider: MediaProvider): void;
    /**
     * Looks up a provider.
     * @param name Provider name; when omitted, the first registered provider is returned.
     * @returns The provider, or `undefined` when there is no match.
     */
    getProvider(name?: string): MediaProvider | undefined;
    /** Providers whose capabilities include image generation. */
    getImageProviders(): MediaProvider[];
    /** Providers whose capabilities include video generation. */
    getVideoProviders(): MediaProvider[];
    /** Providers whose capabilities include audio generation. */
    getAudioProviders(): MediaProvider[];
    /** Name and capabilities of every registered provider. */
    listCapabilities(): ProviderInfo[];
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
 * Collects, in registration order, the providers whose capability flag is truthy.
 */
function providersSupporting(registered, capabilityFlag) {
    const matches = [];
    for (const candidate of registered.values()) {
        if (candidate.capabilities[capabilityFlag]) {
            matches.push(candidate);
        }
    }
    return matches;
}

/** Keeps track of the registered media providers, keyed by `provider.name`. */
export class ProviderRegistry {
    providers = new Map();

    /**
     * Adds a provider; a provider already stored under the same name is replaced.
     */
    register(provider) {
        this.providers.set(provider.name, provider);
    }

    /**
     * Looks up a provider.
     *
     * @param {string} [name] Provider name. When omitted (or empty), the first
     *   registered provider is returned.
     * @returns The provider, or `undefined` when there is no match or the
     *   registry is empty.
     */
    getProvider(name) {
        if (!name) {
            const [firstRegistered] = this.providers.values();
            return firstRegistered;
        }
        return this.providers.get(name);
    }

    /** @returns Providers whose `supportsImageGeneration` flag is set. */
    getImageProviders() {
        return providersSupporting(this.providers, "supportsImageGeneration");
    }

    /** @returns Providers whose `supportsVideoGeneration` flag is set. */
    getVideoProviders() {
        return providersSupporting(this.providers, "supportsVideoGeneration");
    }

    /** @returns Providers whose `supportsAudioGeneration` flag is set. */
    getAudioProviders() {
        return providersSupporting(this.providers, "supportsAudioGeneration");
    }

    /** @returns A `{ name, capabilities }` entry for every registered provider. */
    listCapabilities() {
        return Array.from(this.providers.values(), ({ name, capabilities }) => ({ name, capabilities }));
    }
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/** Contract implemented by every media generation provider. */
export interface MediaProvider {
    /** Unique provider name; used as the registry key. */
    readonly name: string;
    /** What this provider can generate and with which options. */
    readonly capabilities: ProviderCapabilities;
    /** Generates an image from a text prompt. */
    generateImage(params: ImageParams): Promise<GeneratedMedia>;
    /** Generates a video from a text prompt. */
    generateVideo(params: VideoParams): Promise<GeneratedMedia>;
    /** Generates speech audio from text. */
    generateAudio(params: AudioParams): Promise<GeneratedMedia>;
}
/** Static description of a provider's supported media types and options. */
export interface ProviderCapabilities {
    supportsImageGeneration: boolean;
    supportsVideoGeneration: boolean;
    supportsAudioGeneration: boolean;
    /** Image aspect ratios, e.g. "1:1", "16:9". */
    supportedImageAspectRatios: string[];
    /** Video aspect ratios, e.g. "16:9", "9:16". */
    supportedVideoAspectRatios: string[];
    /** Video resolutions, e.g. "720p", "1080p". */
    supportedVideoResolutions: string[];
    /** Audio output formats, e.g. "mp3", "wav"; empty when audio is unsupported. */
    supportedAudioFormats: string[];
    /** Longest video the provider will generate, in seconds. */
    maxVideoDurationSeconds: number;
}
/** Input for image generation. */
export interface ImageParams {
    /** Text description of the image. */
    prompt: string;
    /** Requested aspect ratio, e.g. "16:9". */
    aspectRatio: string;
    /** Requested quality level: "low", "standard" or "high". */
    quality: string;
    /** Provider-specific parameters merged into the provider request. */
    providerOptions?: Record<string, unknown>;
}
/** Input for video generation. */
export interface VideoParams {
    /** Text description of the video. */
    prompt: string;
    /** Requested duration in seconds. */
    duration: number;
    /** Requested aspect ratio, e.g. "16:9". */
    aspectRatio: string;
    /** Requested resolution, e.g. "720p". */
    resolution: string;
    /** Provider-specific parameters merged into the provider request. */
    providerOptions?: Record<string, unknown>;
}
/** Input for audio (text-to-speech) generation. */
export interface AudioParams {
    /** Text to convert to speech. */
    text: string;
    /** Provider-specific voice name. */
    voice?: string;
    /** Speech speed multiplier. */
    speed?: number;
    /** Output format, e.g. "mp3". */
    format?: string;
    /** Provider-specific parameters merged into the provider request. */
    providerOptions?: Record<string, unknown>;
}
/** Result of a generation call. */
export interface GeneratedMedia {
    /** Raw media bytes. */
    data: Buffer;
    /** MIME type of `data`, e.g. "image/png". */
    mimeType: string;
    /** Provider-supplied details such as `model` and `provider`. */
    metadata: Record<string, unknown>;
}
/** Entry returned when listing providers. */
export interface ProviderInfo {
    name: string;
    capabilities: ProviderCapabilities;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { MediaProvider, ProviderCapabilities, ImageParams, VideoParams, AudioParams, GeneratedMedia } from "./types.js";
|
|
2
|
+
/** Media provider backed by the xAI API. */
export declare class XAIProvider implements MediaProvider {
    /** Registry key for this provider. */
    readonly name = "xai";
    /** Static description of what this provider supports. */
    readonly capabilities: ProviderCapabilities;
    /** SDK client used for image requests. */
    private client;
    /** API key used for direct HTTP video requests. */
    private apiKey;
    /**
     * @param apiKey xAI API key.
     */
    constructor(apiKey: string);
    /** Generates an image from a text prompt. */
    generateImage(params: ImageParams): Promise<GeneratedMedia>;
    /** Generates a video from a text prompt; resolves once the job has finished. */
    generateVideo(params: VideoParams): Promise<GeneratedMedia>;
    /** Always rejects: this provider does not generate audio. */
    generateAudio(_params: AudioParams): Promise<GeneratedMedia>;
}
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import OpenAI from "openai";
|
|
2
|
+
import { pollForCompletion } from "./polling.js";
|
|
3
|
+
const XAI_BASE_URL = "https://api.x.ai/v1";
const IMAGE_MODEL = "grok-imagine-image";
const VIDEO_MODEL = "grok-imagine-video";

/**
 * Media provider backed by the xAI API (image and video only).
 *
 * Image requests go through the OpenAI SDK client pointed at the xAI base
 * URL; video requests use `fetch` directly.
 */
export class XAIProvider {
    name = "xai";
    capabilities = {
        supportsImageGeneration: true,
        supportsVideoGeneration: true,
        supportsAudioGeneration: false,
        supportedImageAspectRatios: ["1:1", "16:9", "9:16", "4:3", "3:4"],
        supportedVideoAspectRatios: ["16:9", "9:16", "1:1"],
        supportedVideoResolutions: ["720p", "1080p"],
        supportedAudioFormats: [],
        maxVideoDurationSeconds: 15,
    };
    client;
    apiKey;

    /**
     * @param {string} apiKey xAI API key.
     */
    constructor(apiKey) {
        this.apiKey = apiKey;
        this.client = new OpenAI({ apiKey, baseURL: XAI_BASE_URL });
    }

    /**
     * Generates an image from a text prompt.
     *
     * `params.aspectRatio` and `params.quality` are not forwarded by this
     * method; pass provider-specific fields through `providerOptions`.
     *
     * @param {{ prompt: string, aspectRatio: string, quality: string, providerOptions?: Record<string, unknown> }} params
     * @returns {Promise<{ data: Buffer, mimeType: string, metadata: Record<string, unknown> }>}
     * @throws {Error} When the response carries no base64 image data.
     */
    async generateImage(params) {
        const response = await this.client.images.generate({
            model: IMAGE_MODEL,
            prompt: params.prompt,
            response_format: "b64_json",
            ...params.providerOptions,
        });
        const imageData = response.data?.[0];
        if (!imageData?.b64_json) {
            throw new Error("xAI image generation returned no data");
        }
        return {
            data: Buffer.from(imageData.b64_json, "base64"),
            mimeType: "image/png",
            metadata: { model: IMAGE_MODEL, provider: "xai" },
        };
    }

    /**
     * Generates a video: submits a request, polls it every 5 seconds for up to
     * 10 minutes, then downloads the result.
     *
     * `params.resolution` is not forwarded by this method.
     *
     * @param {{ prompt: string, duration: number, aspectRatio: string, resolution: string, providerOptions?: Record<string, unknown> }} params
     * @returns {Promise<{ data: Buffer, mimeType: string, metadata: Record<string, unknown> }>}
     * @throws {Error} On any non-2xx response, when no request id or video URL
     *   is returned, or when polling times out.
     */
    async generateVideo(params) {
        const authorization = { Authorization: `Bearer ${this.apiKey}` };

        const submitResponse = await fetch(`${XAI_BASE_URL}/videos/generations`, {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
                ...authorization,
            },
            body: JSON.stringify({
                model: VIDEO_MODEL,
                prompt: params.prompt,
                duration: params.duration,
                aspect_ratio: params.aspectRatio,
                ...params.providerOptions,
            }),
        });
        if (!submitResponse.ok) {
            throw new Error(`xAI video generation failed: ${submitResponse.status}`);
        }
        const submitResult = await submitResponse.json();
        const requestId = submitResult?.request_id;
        if (!requestId) {
            // Without an id the status URL would end in "/undefined".
            throw new Error("xAI video generation returned no request id");
        }

        const statusResult = await pollForCompletion(async () => {
            const statusResponse = await fetch(`${XAI_BASE_URL}/videos/generations/${requestId}`, { headers: authorization });
            // Fail fast instead of polling an error payload until the timeout.
            if (!statusResponse.ok) {
                throw new Error(`xAI video status check failed: ${statusResponse.status}`);
            }
            return statusResponse.json();
        }, (result) => result.status === "done", { timeoutMs: 600_000, intervalMs: 5_000 });

        if (!statusResult.video_url) {
            throw new Error("xAI video generation returned no video URL");
        }
        const videoResponse = await fetch(statusResult.video_url);
        if (!videoResponse.ok) {
            throw new Error(`xAI video download failed: ${videoResponse.status}`);
        }
        const data = Buffer.from(await videoResponse.arrayBuffer());
        return {
            data,
            mimeType: "video/mp4",
            metadata: {
                model: VIDEO_MODEL,
                provider: "xai",
                requestId,
            },
        };
    }

    /**
     * Always rejects: this provider does not generate audio.
     *
     * @throws {Error} `xAI does not support audio generation`.
     */
    async generateAudio(_params) {
        throw new Error("xAI does not support audio generation");
    }
}
|
package/build/server.js
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
2
|
+
import { z } from "zod";
|
|
3
|
+
import { ProviderRegistry } from "./providers/registry.js";
|
|
4
|
+
import { OpenAIProvider } from "./providers/openai.js";
|
|
5
|
+
import { XAIProvider } from "./providers/xai.js";
|
|
6
|
+
import { GoogleProvider } from "./providers/google.js";
|
|
7
|
+
import { FileManager } from "./file-manager.js";
|
|
8
|
+
import { buildGenerateImageHandler } from "./tools/generate-image.js";
|
|
9
|
+
import { buildGenerateVideoHandler } from "./tools/generate-video.js";
|
|
10
|
+
import { buildGenerateAudioHandler } from "./tools/generate-audio.js";
|
|
11
|
+
import { buildListProvidersHandler } from "./tools/list-providers.js";
|
|
12
|
+
/**
 * Builds the MCP server: registers every provider whose API key is present in
 * `config`, wires the tool handlers, and declares the four tools.
 *
 * Each generation tool's description lists only the providers that support
 * that media type, so clients are not pointed at a provider that cannot serve
 * the request (e.g. xAI for audio).
 *
 * @param {{ outputDirectory: string, openaiApiKey?: string, xaiApiKey?: string,
 *   googleApiKey?: string }} config
 * @returns {McpServer} The configured server; the caller connects it to a transport.
 */
export function createServer(config) {
    const registry = new ProviderRegistry();
    const fileManager = new FileManager(config.outputDirectory);
    // Logging goes to stderr via console.error.
    if (config.openaiApiKey) {
        registry.register(new OpenAIProvider(config.openaiApiKey));
        console.error("[server] Registered OpenAI provider");
    }
    if (config.xaiApiKey) {
        registry.register(new XAIProvider(config.xaiApiKey));
        console.error("[server] Registered xAI provider");
    }
    if (config.googleApiKey) {
        registry.register(new GoogleProvider(config.googleApiKey));
        console.error("[server] Registered Google provider");
    }
    const generateImageHandler = buildGenerateImageHandler(registry, fileManager);
    const generateVideoHandler = buildGenerateVideoHandler(registry, fileManager);
    const generateAudioHandler = buildGenerateAudioHandler(registry, fileManager);
    const listProvidersHandler = buildListProvidersHandler(registry);
    // Provider names are resolved per capability so each tool advertises only usable providers.
    const describeProviders = (providers) => providers.map((p) => p.name).join(", ") || "none configured";
    const imageProviderNames = describeProviders(registry.getImageProviders());
    const videoProviderNames = describeProviders(registry.getVideoProviders());
    const audioProviderNames = describeProviders(registry.getAudioProviders());
    const server = new McpServer({ name: "multimodal-mcp", version: "1.0.0" });
    server.tool("generate_image", `Generate an image from a text prompt using AI. Available providers: ${imageProviderNames}`, {
        prompt: z.string().describe("Text description of the image to generate"),
        provider: z.string().optional().describe("Provider to use: openai, xai, google. Auto-selects if omitted."),
        aspectRatio: z.string().optional().describe("Aspect ratio: 1:1, 16:9, 9:16, 4:3, 3:4"),
        quality: z.string().optional().describe("Quality level: low, standard, high"),
        providerOptions: z.record(z.string(), z.unknown()).optional().describe("Provider-specific parameters passed through directly"),
    }, async (params) => generateImageHandler(params));
    server.tool("generate_video", `Generate a video from a text prompt using AI. Available providers: ${videoProviderNames}`, {
        prompt: z.string().describe("Text description of the video to generate"),
        provider: z.string().optional().describe("Provider to use: openai, xai, google. Auto-selects if omitted."),
        duration: z.number().optional().describe("Video duration in seconds (provider limits apply)"),
        aspectRatio: z.string().optional().describe("Aspect ratio: 16:9, 9:16, 1:1"),
        resolution: z.string().optional().describe("Resolution: 480p, 720p, 1080p"),
        providerOptions: z.record(z.string(), z.unknown()).optional().describe("Provider-specific parameters passed through directly"),
    }, async (params) => generateVideoHandler(params));
    server.tool("generate_audio", `Generate audio (text-to-speech) from text using AI. Available providers: ${audioProviderNames}`, {
        text: z.string().describe("Text to convert to speech"),
        provider: z.string().optional().describe("Provider to use: openai, google. Auto-selects if omitted."),
        voice: z.string().optional().describe("Voice name (provider-specific). OpenAI: alloy, ash, coral, echo, fable, nova, onyx, sage, shimmer. Google: Kore, Charon, Fenrir, Aoede, Puck, etc."),
        speed: z.number().optional().describe("Speech speed multiplier (OpenAI only): 0.25 to 4.0"),
        format: z.string().optional().describe("Output format (OpenAI only): mp3, opus, aac, flac, wav, pcm"),
        providerOptions: z.record(z.string(), z.unknown()).optional().describe("Provider-specific parameters passed through directly"),
    }, async (params) => generateAudioHandler(params));
    server.tool("list_providers", "List all configured media generation providers and their capabilities", async () => listProvidersHandler());
    return server;
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import type { ProviderRegistry } from "../providers/registry.js";
|
|
2
|
+
import type { FileManager } from "../file-manager.js";
|
|
3
|
+
/**
 * Creates the handler behind the `generate_audio` tool.
 *
 * The returned function resolves to an MCP-style result: a list of text
 * content items, with `isError: true` set on failure.
 */
export declare function buildGenerateAudioHandler(registry: ProviderRegistry, fileManager: FileManager): (params: {
    text: string;
    provider?: string;
    voice?: string;
    speed?: number;
    format?: string;
    providerOptions?: Record<string, unknown>;
}) => Promise<{
    isError: true;
    content: Array<{
        type: "text";
        text: string;
    }>;
} | {
    content: Array<{
        type: "text";
        text: string;
    }>;
    isError?: undefined;
}>;
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import { sanitizeError } from "../errors.js";
|
|
2
|
+
/**
 * Creates the `generate_audio` tool handler.
 *
 * The handler picks the requested provider (or the first audio-capable one
 * when none is named), rejects providers without audio support, and otherwise
 * generates the audio and stores it through `fileManager`.
 *
 * @param registry - Provider lookup (`getProvider`, `getAudioProviders`).
 * @param fileManager - Persists the generated media via `save(media, "audio")`.
 * @returns An async handler resolving to `{ content }` on success or
 *   `{ isError: true, content }` on failure; it does not throw for provider errors.
 */
export function buildGenerateAudioHandler(registry, fileManager) {
    // Shared shape of every failure result.
    const fail = (text) => ({
        isError: true,
        content: [{ type: "text", text }],
    });
    // Comma-separated names of audio-capable providers, or "none".
    const audioProviderNames = () => {
        const names = registry.getAudioProviders().map((p) => p.name);
        return names.join(", ") || "none";
    };
    return async (params) => {
        let provider;
        if (params.provider) {
            provider = registry.getProvider(params.provider);
        }
        else {
            provider = registry.getAudioProviders()[0];
        }
        if (!provider) {
            const available = audioProviderNames();
            if (params.provider) {
                return fail(`Provider "${params.provider}" is not configured or does not support audio. Available audio providers: ${available}`);
            }
            return fail("No audio provider available. Configure one of: OPENAI_API_KEY, GOOGLE_API_KEY");
        }
        if (!provider.capabilities.supportsAudioGeneration) {
            const available = audioProviderNames();
            return fail(`Provider "${provider.name}" does not support audio generation. Available audio providers: ${available}`);
        }
        try {
            const { text, voice, speed, format, providerOptions } = params;
            const media = await provider.generateAudio({ text, voice, speed, format, providerOptions });
            const filePath = await fileManager.save(media, "audio");
            return {
                content: [{ type: "text", text: `Audio saved to ${filePath}` }],
            };
        }
        catch (error) {
            return fail(`Audio generation failed: ${sanitizeError(error)}`);
        }
    };
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import type { ProviderRegistry } from "../providers/registry.js";
|
|
2
|
+
import type { FileManager } from "../file-manager.js";
|
|
3
|
+
/**
 * Creates the handler behind the `generate_image` tool.
 *
 * The returned function resolves to an MCP-style result: a list of text
 * content items, with `isError: true` set on failure.
 */
export declare function buildGenerateImageHandler(registry: ProviderRegistry, fileManager: FileManager): (params: {
    prompt: string;
    provider?: string;
    aspectRatio?: string;
    quality?: string;
    providerOptions?: Record<string, unknown>;
}) => Promise<{
    isError: true;
    content: Array<{
        type: "text";
        text: string;
    }>;
} | {
    content: Array<{
        type: "text";
        text: string;
    }>;
    isError?: undefined;
}>;
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import { sanitizeError } from "../errors.js";
|
|
2
|
+
/**
 * Creates the `generate_image` tool handler.
 *
 * The handler resolves a provider through `registry.getProvider`, refuses
 * providers that declare no image support (mirroring the audio handler's
 * capability guard), and otherwise generates the image and stores it through
 * `fileManager`. Missing `aspectRatio` / `quality` default to "1:1" / "standard".
 *
 * NOTE(review): when `params.provider` is omitted this relies on
 * `registry.getProvider(undefined)` performing the auto-selection — confirm
 * against the registry implementation.
 *
 * @param registry - Provider lookup (`getProvider`, `getImageProviders`).
 * @param fileManager - Persists the generated media via `save(media, "image")`.
 * @returns An async handler resolving to `{ content }` on success or
 *   `{ isError: true, content }` on failure; provider errors are reported, not thrown.
 */
export function buildGenerateImageHandler(registry, fileManager) {
    return async (params) => {
        const provider = registry.getProvider(params.provider);
        if (!provider) {
            const availableNames = registry.getImageProviders().map((p) => p.name).join(", ");
            const text = params.provider
                ? `Provider "${params.provider}" is not configured. Available providers: ${availableNames || "none"}`
                : "No image provider available. Configure one of: OPENAI_API_KEY, XAI_API_KEY, GOOGLE_API_KEY";
            return {
                isError: true,
                content: [{ type: "text", text }],
            };
        }
        // Only an explicit `false` blocks the call, so providers that do not
        // declare capabilities keep working exactly as before.
        if (provider.capabilities?.supportsImageGeneration === false) {
            const available = registry.getImageProviders().map((p) => p.name).join(", ") || "none";
            return {
                isError: true,
                content: [{
                        type: "text",
                        text: `Provider "${provider.name}" does not support image generation. Available image providers: ${available}`,
                    }],
            };
        }
        try {
            const media = await provider.generateImage({
                prompt: params.prompt,
                aspectRatio: params.aspectRatio ?? "1:1",
                quality: params.quality ?? "standard",
                providerOptions: params.providerOptions,
            });
            const filePath = await fileManager.save(media, "image");
            return {
                content: [{ type: "text", text: `Image saved to ${filePath}` }],
            };
        }
        catch (error) {
            const message = sanitizeError(error);
            return {
                isError: true,
                content: [{ type: "text", text: `Image generation failed: ${message}` }],
            };
        }
    };
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import type { ProviderRegistry } from "../providers/registry.js";
|
|
2
|
+
import type { FileManager } from "../file-manager.js";
|
|
3
|
+
/**
 * Creates the handler behind the `generate_video` tool.
 *
 * The returned function resolves to an MCP-style result: a list of text
 * content items, with `isError: true` set on failure.
 */
export declare function buildGenerateVideoHandler(registry: ProviderRegistry, fileManager: FileManager): (params: {
    prompt: string;
    provider?: string;
    duration?: number;
    aspectRatio?: string;
    resolution?: string;
    providerOptions?: Record<string, unknown>;
}) => Promise<{
    isError: true;
    content: Array<{
        type: "text";
        text: string;
    }>;
} | {
    content: Array<{
        type: "text";
        text: string;
    }>;
    isError?: undefined;
}>;
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import { sanitizeError } from "../errors.js";
|
|
2
|
+
/**
 * Creates the `generate_video` tool handler.
 *
 * The handler resolves a provider through `registry.getProvider`, refuses
 * providers that declare no video support (mirroring the audio handler's
 * capability guard), and otherwise generates the video and stores it through
 * `fileManager`. Missing `duration` / `aspectRatio` / `resolution` default to
 * 5 / "16:9" / "720p".
 *
 * NOTE(review): when `params.provider` is omitted this relies on
 * `registry.getProvider(undefined)` performing the auto-selection — confirm
 * against the registry implementation.
 *
 * @param registry - Provider lookup (`getProvider`, `getVideoProviders`).
 * @param fileManager - Persists the generated media via `save(media, "video")`.
 * @returns An async handler resolving to `{ content }` on success or
 *   `{ isError: true, content }` on failure; provider errors are reported, not thrown.
 */
export function buildGenerateVideoHandler(registry, fileManager) {
    return async (params) => {
        const provider = registry.getProvider(params.provider);
        if (!provider) {
            const available = registry.getVideoProviders().map((p) => p.name).join(", ") || "none";
            const text = params.provider
                ? `Provider "${params.provider}" is not configured. Available providers: ${available}`
                : "No video provider available. Configure one of: OPENAI_API_KEY, XAI_API_KEY, GOOGLE_API_KEY";
            return {
                isError: true,
                content: [{ type: "text", text }],
            };
        }
        // Only an explicit `false` blocks the call, so providers that do not
        // declare capabilities keep working exactly as before.
        if (provider.capabilities?.supportsVideoGeneration === false) {
            const available = registry.getVideoProviders().map((p) => p.name).join(", ") || "none";
            return {
                isError: true,
                content: [{
                        type: "text",
                        text: `Provider "${provider.name}" does not support video generation. Available video providers: ${available}`,
                    }],
            };
        }
        try {
            const media = await provider.generateVideo({
                prompt: params.prompt,
                duration: params.duration ?? 5,
                aspectRatio: params.aspectRatio ?? "16:9",
                resolution: params.resolution ?? "720p",
                providerOptions: params.providerOptions,
            });
            const filePath = await fileManager.save(media, "video");
            return {
                content: [{ type: "text", text: `Video saved to ${filePath}` }],
            };
        }
        catch (error) {
            const message = sanitizeError(error);
            return {
                isError: true,
                content: [{ type: "text", text: `Video generation failed: ${message}` }],
            };
        }
    };
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
 * Creates the `list_providers` tool handler.
 *
 * The handler reports one line per configured provider with the media types
 * it supports (image, video, audio), or a setup hint when nothing is configured.
 *
 * @param registry - Source of provider summaries via `listCapabilities()`;
 *   each entry exposes `name` and a `capabilities` object of `supports*` flags.
 * @returns An async handler resolving to `{ content: [{ type: "text", text }] }`.
 */
export function buildListProvidersHandler(registry) {
    // Flag-to-label pairs, in the order they are printed.
    const CAPABILITY_LABELS = [
        ["supportsImageGeneration", "image"],
        ["supportsVideoGeneration", "video"],
        ["supportsAudioGeneration", "audio"],
    ];
    return async () => {
        const providers = registry.listCapabilities();
        if (providers.length === 0) {
            return {
                content: [{
                        type: "text",
                        text: "No providers configured. Set one or more API keys: OPENAI_API_KEY, XAI_API_KEY, GOOGLE_API_KEY",
                    }],
            };
        }
        const lines = providers.map((p) => {
            const caps = CAPABILITY_LABELS
                .filter(([flag]) => p.capabilities[flag])
                .map(([, label]) => label);
            // Spell out the empty case rather than leaving a dangling "name: ".
            return `- ${p.name}: ${caps.join(", ") || "none"}`;
        });
        return {
            content: [{
                    type: "text",
                    text: `Configured providers:\n${lines.join("\n")}`,
                }],
        };
    };
}
|
package/package.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@r16t/multimodal-mcp",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Multi-provider media generation MCP server",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "build/index.js",
|
|
7
|
+
"bin": {
|
|
8
|
+
"multimodal-mcp": "build/index.js",
|
|
9
|
+
"@r16t/multimodal-mcp": "build/index.js"
|
|
10
|
+
},
|
|
11
|
+
"files": [
|
|
12
|
+
"build/"
|
|
13
|
+
],
|
|
14
|
+
"scripts": {
|
|
15
|
+
"build": "tsc && chmod +x build/index.js",
|
|
16
|
+
"prepare": "npm run build",
|
|
17
|
+
"dev": "tsc --watch",
|
|
18
|
+
"test": "vitest",
|
|
19
|
+
"lint": "eslint src --fix",
|
|
20
|
+
"typecheck": "tsc --noEmit"
|
|
21
|
+
},
|
|
22
|
+
"license": "MIT",
|
|
23
|
+
"keywords": [
|
|
24
|
+
"mcp",
|
|
25
|
+
"media",
|
|
26
|
+
"image-generation",
|
|
27
|
+
"video-generation",
|
|
28
|
+
"audio-generation",
|
|
29
|
+
"text-to-speech",
|
|
30
|
+
"openai",
|
|
31
|
+
"xai",
|
|
32
|
+
"google",
|
|
33
|
+
"claude"
|
|
34
|
+
],
|
|
35
|
+
"engines": {
|
|
36
|
+
"node": ">=18"
|
|
37
|
+
},
|
|
38
|
+
"dependencies": {
|
|
39
|
+
"@google/generative-ai": "^0.24.1",
|
|
40
|
+
"@modelcontextprotocol/sdk": "^1.27.1",
|
|
41
|
+
"openai": "^6.25.0",
|
|
42
|
+
"zod": "^4.3.6"
|
|
43
|
+
},
|
|
44
|
+
"devDependencies": {
|
|
45
|
+
"@eslint/js": "^10.0.1",
|
|
46
|
+
"@types/node": "^25.3.3",
|
|
47
|
+
"eslint": "^10.0.2",
|
|
48
|
+
"globals": "^17.4.0",
|
|
49
|
+
"typescript": "^5.9.3",
|
|
50
|
+
"typescript-eslint": "^8.56.1",
|
|
51
|
+
"vitest": "^4.0.18"
|
|
52
|
+
}
|
|
53
|
+
}
|