@r16t/multimodal-mcp 1.1.2 → 1.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -9
- package/build/providers/google.d.ts +2 -1
- package/build/providers/google.js +69 -13
- package/build/providers/openai.d.ts +2 -1
- package/build/providers/openai.js +24 -3
- package/build/providers/registry.d.ts +1 -0
- package/build/providers/registry.js +3 -0
- package/build/providers/types.d.ts +10 -0
- package/build/providers/xai.d.ts +2 -1
- package/build/providers/xai.js +40 -0
- package/build/read-media-file.d.ts +4 -0
- package/build/read-media-file.js +21 -0
- package/build/server.js +11 -1
- package/build/tools/edit-image.d.ts +21 -0
- package/build/tools/edit-image.js +49 -0
- package/build/tools/generate-video.d.ts +1 -0
- package/build/tools/generate-video.js +10 -0
- package/build/tools/list-providers.js +2 -0
- package/package.json +6 -3
package/README.md
CHANGED
|
@@ -17,13 +17,13 @@ Set the API key for at least one provider. Most users only need one — add more
|
|
|
17
17
|
|
|
18
18
|
```bash
|
|
19
19
|
# Using OpenAI
|
|
20
|
-
claude mcp add multimodal-mcp -e OPENAI_API_KEY=sk-... -- npx @r16t/multimodal-mcp
|
|
20
|
+
claude mcp add multimodal-mcp -e OPENAI_API_KEY=sk-... -- npx @r16t/multimodal-mcp@latest
|
|
21
21
|
|
|
22
22
|
# Or using xAI
|
|
23
|
-
# claude mcp add multimodal-mcp -e XAI_API_KEY=xai-... -- npx @r16t/multimodal-mcp
|
|
23
|
+
# claude mcp add multimodal-mcp -e XAI_API_KEY=xai-... -- npx @r16t/multimodal-mcp@latest
|
|
24
24
|
|
|
25
25
|
# Or using Gemini
|
|
26
|
-
# claude mcp add multimodal-mcp -e GEMINI_API_KEY=AIza... -- npx @r16t/multimodal-mcp
|
|
26
|
+
# claude mcp add multimodal-mcp -e GEMINI_API_KEY=AIza... -- npx @r16t/multimodal-mcp@latest
|
|
27
27
|
```
|
|
28
28
|
|
|
29
29
|
Using a different editor? See [setup instructions](#editor-setup) for Claude Desktop, Cursor, VS Code, Windsurf, and Cline.
|
|
@@ -36,7 +36,7 @@ Using a different editor? See [setup instructions](#editor-setup) for Claude Des
|
|
|
36
36
|
| `XAI_API_KEY` | At least one provider key | xAI API key — enables image and video generation via grok-imagine-image and grok-imagine-video |
|
|
37
37
|
| `GEMINI_API_KEY` | At least one provider key | Gemini API key — enables image, video, and audio generation via imagen-4, veo-3.1, and gemini-2.5-flash-preview-tts |
|
|
38
38
|
| `GOOGLE_API_KEY` | — | Alias for `GEMINI_API_KEY`; either name is accepted |
|
|
39
|
-
| `MEDIA_OUTPUT_DIR` | No | Directory for saved media files. Defaults to the
|
|
39
|
+
| `MEDIA_OUTPUT_DIR` | No | Directory for saved media files. Defaults to the current working directory |
|
|
40
40
|
|
|
41
41
|
## Available Tools
|
|
42
42
|
|
|
@@ -50,6 +50,7 @@ Generate an image from a text prompt.
|
|
|
50
50
|
| `provider` | string | No | Provider to use: `openai`, `xai`, `google`. Auto-selects if omitted |
|
|
51
51
|
| `aspectRatio` | string | No | Aspect ratio: `1:1`, `16:9`, `9:16`, `4:3`, `3:4` |
|
|
52
52
|
| `quality` | string | No | Quality level: `low`, `standard`, `high` |
|
|
53
|
+
| `outputDirectory` | string | No | Directory to save the generated file. Absolute or relative path. Defaults to `MEDIA_OUTPUT_DIR` or cwd |
|
|
53
54
|
| `providerOptions` | object | No | Provider-specific parameters passed through directly |
|
|
54
55
|
|
|
55
56
|
### `generate_video`
|
|
@@ -63,6 +64,7 @@ Generate a video from a text prompt. Video generation is asynchronous and may ta
|
|
|
63
64
|
| `duration` | number | No | Video duration in seconds (provider limits apply) |
|
|
64
65
|
| `aspectRatio` | string | No | Aspect ratio: `16:9`, `9:16`, `1:1` |
|
|
65
66
|
| `resolution` | string | No | Resolution: `480p`, `720p`, `1080p` |
|
|
67
|
+
| `outputDirectory` | string | No | Directory to save the generated file. Absolute or relative path. Defaults to `MEDIA_OUTPUT_DIR` or cwd |
|
|
66
68
|
| `providerOptions` | object | No | Provider-specific parameters passed through directly |
|
|
67
69
|
|
|
68
70
|
### `generate_audio`
|
|
@@ -76,6 +78,7 @@ Generate audio (text-to-speech) from text. Audio generation is synchronous.
|
|
|
76
78
|
| `voice` | string | No | Voice name (provider-specific). OpenAI: `alloy`, `ash`, `coral`, `echo`, `fable`, `nova`, `onyx`, `sage`, `shimmer`. Google: `Kore`, `Charon`, `Fenrir`, `Aoede`, `Puck`, etc. |
|
|
77
79
|
| `speed` | number | No | Speech speed multiplier (OpenAI only): `0.25` to `4.0` |
|
|
78
80
|
| `format` | string | No | Output format (OpenAI only): `mp3`, `opus`, `aac`, `flac`, `wav`, `pcm` |
|
|
81
|
+
| `outputDirectory` | string | No | Directory to save the generated file. Absolute or relative path. Defaults to `MEDIA_OUTPUT_DIR` or cwd |
|
|
79
82
|
| `providerOptions` | object | No | Provider-specific parameters passed through directly |
|
|
80
83
|
|
|
81
84
|
### `list_providers`
|
|
@@ -162,7 +165,7 @@ Add to `~/Library/Application Support/Claude/claude_desktop_config.json`:
|
|
|
162
165
|
"mcpServers": {
|
|
163
166
|
"multimodal-mcp": {
|
|
164
167
|
"command": "npx",
|
|
165
|
-
"args": ["@r16t/multimodal-mcp"],
|
|
168
|
+
"args": ["@r16t/multimodal-mcp@latest"],
|
|
166
169
|
"env": {
|
|
167
170
|
"OPENAI_API_KEY": "sk-..."
|
|
168
171
|
}
|
|
@@ -180,7 +183,7 @@ Add to `.cursor/mcp.json` in your project root (or `~/.cursor/mcp.json` globally
|
|
|
180
183
|
"mcpServers": {
|
|
181
184
|
"multimodal-mcp": {
|
|
182
185
|
"command": "npx",
|
|
183
|
-
"args": ["@r16t/multimodal-mcp"],
|
|
186
|
+
"args": ["@r16t/multimodal-mcp@latest"],
|
|
184
187
|
"env": {
|
|
185
188
|
"OPENAI_API_KEY": "sk-..."
|
|
186
189
|
}
|
|
@@ -198,7 +201,7 @@ Add to `.vscode/mcp.json` in your project root:
|
|
|
198
201
|
"servers": {
|
|
199
202
|
"multimodal-mcp": {
|
|
200
203
|
"command": "npx",
|
|
201
|
-
"args": ["@r16t/multimodal-mcp"],
|
|
204
|
+
"args": ["@r16t/multimodal-mcp@latest"],
|
|
202
205
|
"env": {
|
|
203
206
|
"OPENAI_API_KEY": "sk-..."
|
|
204
207
|
}
|
|
@@ -216,7 +219,7 @@ Add to `~/.codeium/windsurf/mcp_config.json`:
|
|
|
216
219
|
"mcpServers": {
|
|
217
220
|
"multimodal-mcp": {
|
|
218
221
|
"command": "npx",
|
|
219
|
-
"args": ["@r16t/multimodal-mcp"],
|
|
222
|
+
"args": ["@r16t/multimodal-mcp@latest"],
|
|
220
223
|
"env": {
|
|
221
224
|
"OPENAI_API_KEY": "sk-..."
|
|
222
225
|
}
|
|
@@ -234,7 +237,7 @@ Add to `~/Library/Application Support/Code/User/globalStorage/saoudrizwan.claude
|
|
|
234
237
|
"mcpServers": {
|
|
235
238
|
"multimodal-mcp": {
|
|
236
239
|
"command": "npx",
|
|
237
|
-
"args": ["@r16t/multimodal-mcp"],
|
|
240
|
+
"args": ["@r16t/multimodal-mcp@latest"],
|
|
238
241
|
"env": {
|
|
239
242
|
"OPENAI_API_KEY": "sk-..."
|
|
240
243
|
}
|
package/build/providers/google.d.ts
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
|
-
import type { MediaProvider, ProviderCapabilities, ImageParams, VideoParams, AudioParams, GeneratedMedia } from "./types.js";
|
|
1
|
+
import type { MediaProvider, ProviderCapabilities, ImageParams, EditImageParams, VideoParams, AudioParams, GeneratedMedia } from "./types.js";
|
|
2
2
|
export declare class GoogleProvider implements MediaProvider {
|
|
3
3
|
readonly name = "google";
|
|
4
4
|
readonly capabilities: ProviderCapabilities;
|
|
5
5
|
private apiKey;
|
|
6
6
|
constructor(apiKey: string);
|
|
7
7
|
generateImage(params: ImageParams): Promise<GeneratedMedia>;
|
|
8
|
+
editImage(params: EditImageParams): Promise<GeneratedMedia>;
|
|
8
9
|
generateVideo(params: VideoParams): Promise<GeneratedMedia>;
|
|
9
10
|
generateAudio(params: AudioParams): Promise<GeneratedMedia>;
|
|
10
11
|
}
|
package/build/providers/google.js
CHANGED
|
@@ -4,6 +4,7 @@ export class GoogleProvider {
|
|
|
4
4
|
name = "google";
|
|
5
5
|
capabilities = {
|
|
6
6
|
supportsImageGeneration: true,
|
|
7
|
+
supportsImageEditing: true,
|
|
7
8
|
supportsVideoGeneration: true,
|
|
8
9
|
supportsAudioGeneration: true,
|
|
9
10
|
supportedImageAspectRatios: ["1:1", "16:9", "9:16", "4:3", "3:4"],
|
|
@@ -17,12 +18,13 @@ export class GoogleProvider {
|
|
|
17
18
|
this.apiKey = apiKey;
|
|
18
19
|
}
|
|
19
20
|
async generateImage(params) {
|
|
20
|
-
const response = await fetch(`${GEMINI_BASE_URL}/models/imagen-4:
|
|
21
|
+
const response = await fetch(`${GEMINI_BASE_URL}/models/imagen-4.0-generate-001:predict?key=${this.apiKey}`, {
|
|
21
22
|
method: "POST",
|
|
22
23
|
headers: { "Content-Type": "application/json" },
|
|
23
24
|
body: JSON.stringify({
|
|
24
|
-
prompt: params.prompt,
|
|
25
|
-
|
|
25
|
+
instances: [{ prompt: params.prompt }],
|
|
26
|
+
parameters: {
|
|
27
|
+
sampleCount: 1,
|
|
26
28
|
aspectRatio: params.aspectRatio,
|
|
27
29
|
...params.providerOptions,
|
|
28
30
|
},
|
|
@@ -36,22 +38,76 @@ export class GoogleProvider {
|
|
|
36
38
|
return {
|
|
37
39
|
data: Buffer.from(base64, "base64"),
|
|
38
40
|
mimeType: "image/png",
|
|
39
|
-
metadata: { model: "imagen-4", provider: "google" },
|
|
41
|
+
metadata: { model: "imagen-4.0-generate-001", provider: "google" },
|
|
40
42
|
};
|
|
41
43
|
}
|
|
42
|
-
async generateVideo(params) {
|
|
43
|
-
const
|
|
44
|
+
async editImage(params) {
|
|
45
|
+
const base64Image = params.imageData.toString("base64");
|
|
46
|
+
const response = await fetch(`${GEMINI_BASE_URL}/models/gemini-2.5-flash-preview-image:generateContent?key=${this.apiKey}`, {
|
|
44
47
|
method: "POST",
|
|
45
48
|
headers: { "Content-Type": "application/json" },
|
|
46
49
|
body: JSON.stringify({
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
50
|
+
contents: [{
|
|
51
|
+
parts: [
|
|
52
|
+
{ text: params.prompt },
|
|
53
|
+
{
|
|
54
|
+
inlineData: {
|
|
55
|
+
mimeType: params.imageMimeType,
|
|
56
|
+
data: base64Image,
|
|
57
|
+
},
|
|
58
|
+
},
|
|
59
|
+
],
|
|
60
|
+
}],
|
|
61
|
+
generationConfig: {
|
|
62
|
+
responseModalities: ["IMAGE"],
|
|
51
63
|
...params.providerOptions,
|
|
52
64
|
},
|
|
53
65
|
}),
|
|
54
66
|
});
|
|
67
|
+
if (!response.ok) {
|
|
68
|
+
throw new Error(`Google image editing failed: ${response.status}`);
|
|
69
|
+
}
|
|
70
|
+
const result = (await response.json());
|
|
71
|
+
const imagePart = result.candidates[0]?.content?.parts?.find((part) => part.inlineData !== undefined);
|
|
72
|
+
if (!imagePart?.inlineData) {
|
|
73
|
+
throw new Error("Google image editing returned no image data");
|
|
74
|
+
}
|
|
75
|
+
return {
|
|
76
|
+
data: Buffer.from(imagePart.inlineData.data, "base64"),
|
|
77
|
+
mimeType: imagePart.inlineData.mimeType || "image/png",
|
|
78
|
+
metadata: {
|
|
79
|
+
model: "gemini-2.5-flash-preview-image",
|
|
80
|
+
provider: "google",
|
|
81
|
+
operation: "edit",
|
|
82
|
+
},
|
|
83
|
+
};
|
|
84
|
+
}
|
|
85
|
+
async generateVideo(params) {
|
|
86
|
+
const instance = {
|
|
87
|
+
prompt: params.prompt,
|
|
88
|
+
};
|
|
89
|
+
if (params.imageData) {
|
|
90
|
+
const base64Image = params.imageData.toString("base64");
|
|
91
|
+
instance.image = {
|
|
92
|
+
inlineData: {
|
|
93
|
+
mimeType: params.imageMimeType ?? "image/png",
|
|
94
|
+
data: base64Image,
|
|
95
|
+
},
|
|
96
|
+
};
|
|
97
|
+
}
|
|
98
|
+
const requestBody = {
|
|
99
|
+
instances: [instance],
|
|
100
|
+
parameters: {
|
|
101
|
+
aspectRatio: params.aspectRatio,
|
|
102
|
+
durationSeconds: params.duration,
|
|
103
|
+
...params.providerOptions,
|
|
104
|
+
},
|
|
105
|
+
};
|
|
106
|
+
const submitResponse = await fetch(`${GEMINI_BASE_URL}/models/veo-3.1-generate-preview:predictLongRunning?key=${this.apiKey}`, {
|
|
107
|
+
method: "POST",
|
|
108
|
+
headers: { "Content-Type": "application/json" },
|
|
109
|
+
body: JSON.stringify(requestBody),
|
|
110
|
+
});
|
|
55
111
|
if (!submitResponse.ok) {
|
|
56
112
|
throw new Error(`Google video generation failed: ${submitResponse.status}`);
|
|
57
113
|
}
|
|
@@ -66,7 +122,7 @@ export class GoogleProvider {
|
|
|
66
122
|
return {
|
|
67
123
|
data,
|
|
68
124
|
mimeType: "video/mp4",
|
|
69
|
-
metadata: { model: "veo-3.1", provider: "google", operationName: operation.name },
|
|
125
|
+
metadata: { model: "veo-3.1-generate-preview", provider: "google", operationName: operation.name },
|
|
70
126
|
};
|
|
71
127
|
}
|
|
72
128
|
async generateAudio(params) {
|
|
@@ -78,8 +134,8 @@ export class GoogleProvider {
|
|
|
78
134
|
body: JSON.stringify({
|
|
79
135
|
contents: [{ parts: [{ text: params.text }] }],
|
|
80
136
|
generationConfig: {
|
|
81
|
-
|
|
82
|
-
|
|
137
|
+
responseModalities: ["AUDIO"],
|
|
138
|
+
speechConfig: {
|
|
83
139
|
voiceConfig: {
|
|
84
140
|
prebuiltVoiceConfig: { voiceName: voice },
|
|
85
141
|
},
|
package/build/providers/openai.d.ts
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
|
-
import type { MediaProvider, ProviderCapabilities, ImageParams, VideoParams, AudioParams, GeneratedMedia } from "./types.js";
|
|
1
|
+
import type { MediaProvider, ProviderCapabilities, ImageParams, EditImageParams, VideoParams, AudioParams, GeneratedMedia } from "./types.js";
|
|
2
2
|
export declare class OpenAIProvider implements MediaProvider {
|
|
3
3
|
readonly name = "openai";
|
|
4
4
|
readonly capabilities: ProviderCapabilities;
|
|
5
5
|
private client;
|
|
6
6
|
constructor(apiKey: string);
|
|
7
7
|
generateImage(params: ImageParams): Promise<GeneratedMedia>;
|
|
8
|
+
editImage(params: EditImageParams): Promise<GeneratedMedia>;
|
|
8
9
|
generateVideo(params: VideoParams): Promise<GeneratedMedia>;
|
|
9
10
|
generateAudio(params: AudioParams): Promise<GeneratedMedia>;
|
|
10
11
|
private audioFormatToMimeType;
|
package/build/providers/openai.js
CHANGED
|
@@ -11,6 +11,7 @@ export class OpenAIProvider {
|
|
|
11
11
|
name = "openai";
|
|
12
12
|
capabilities = {
|
|
13
13
|
supportsImageGeneration: true,
|
|
14
|
+
supportsImageEditing: true,
|
|
14
15
|
supportsVideoGeneration: true,
|
|
15
16
|
supportsAudioGeneration: true,
|
|
16
17
|
supportedImageAspectRatios: ["1:1", "16:9", "9:16", "4:3", "3:4"],
|
|
@@ -39,14 +40,34 @@ export class OpenAIProvider {
|
|
|
39
40
|
metadata: { model: "gpt-image-1", provider: "openai" },
|
|
40
41
|
};
|
|
41
42
|
}
|
|
43
|
+
async editImage(params) {
|
|
44
|
+
const imageFile = new File([new Uint8Array(params.imageData)], "input.png", { type: params.imageMimeType });
|
|
45
|
+
const response = await this.client.images.edit({
|
|
46
|
+
model: "gpt-image-1",
|
|
47
|
+
image: imageFile,
|
|
48
|
+
prompt: params.prompt,
|
|
49
|
+
...params.providerOptions,
|
|
50
|
+
});
|
|
51
|
+
const base64Data = response.data[0].b64_json;
|
|
52
|
+
return {
|
|
53
|
+
data: Buffer.from(base64Data, "base64"),
|
|
54
|
+
mimeType: "image/png",
|
|
55
|
+
metadata: { model: "gpt-image-1", provider: "openai", operation: "edit" },
|
|
56
|
+
};
|
|
57
|
+
}
|
|
42
58
|
async generateVideo(params) {
|
|
43
59
|
const videos = this.client.videos;
|
|
44
|
-
const
|
|
60
|
+
const createParams = {
|
|
45
61
|
model: "sora-2",
|
|
46
62
|
prompt: params.prompt,
|
|
47
|
-
|
|
63
|
+
seconds: String(params.duration),
|
|
48
64
|
...params.providerOptions,
|
|
49
|
-
}
|
|
65
|
+
};
|
|
66
|
+
if (params.imageData) {
|
|
67
|
+
const imageFile = new File([new Uint8Array(params.imageData)], "first-frame.png", { type: params.imageMimeType ?? "image/png" });
|
|
68
|
+
createParams.input_reference = imageFile;
|
|
69
|
+
}
|
|
70
|
+
const job = await videos.create(createParams);
|
|
50
71
|
const result = await pollForCompletion(() => videos.retrieve(job.id), (status) => status.status === "completed", { timeoutMs: 600_000, intervalMs: 5_000 });
|
|
51
72
|
const videoUrl = result.url;
|
|
52
73
|
const videoResponse = await fetch(videoUrl);
|
package/build/providers/registry.d.ts
CHANGED
|
@@ -4,6 +4,7 @@ export declare class ProviderRegistry {
|
|
|
4
4
|
register(provider: MediaProvider): void;
|
|
5
5
|
getProvider(name?: string): MediaProvider | undefined;
|
|
6
6
|
getImageProviders(): MediaProvider[];
|
|
7
|
+
getImageEditProviders(): MediaProvider[];
|
|
7
8
|
getVideoProviders(): MediaProvider[];
|
|
8
9
|
getAudioProviders(): MediaProvider[];
|
|
9
10
|
listCapabilities(): ProviderInfo[];
|
package/build/providers/registry.js
CHANGED
|
@@ -13,6 +13,9 @@ export class ProviderRegistry {
|
|
|
13
13
|
getImageProviders() {
|
|
14
14
|
return [...this.providers.values()].filter((p) => p.capabilities.supportsImageGeneration);
|
|
15
15
|
}
|
|
16
|
+
getImageEditProviders() {
|
|
17
|
+
return [...this.providers.values()].filter((p) => p.capabilities.supportsImageEditing);
|
|
18
|
+
}
|
|
16
19
|
getVideoProviders() {
|
|
17
20
|
return [...this.providers.values()].filter((p) => p.capabilities.supportsVideoGeneration);
|
|
18
21
|
}
|
package/build/providers/types.d.ts
CHANGED
|
@@ -2,11 +2,13 @@ export interface MediaProvider {
|
|
|
2
2
|
readonly name: string;
|
|
3
3
|
readonly capabilities: ProviderCapabilities;
|
|
4
4
|
generateImage(params: ImageParams): Promise<GeneratedMedia>;
|
|
5
|
+
editImage(params: EditImageParams): Promise<GeneratedMedia>;
|
|
5
6
|
generateVideo(params: VideoParams): Promise<GeneratedMedia>;
|
|
6
7
|
generateAudio(params: AudioParams): Promise<GeneratedMedia>;
|
|
7
8
|
}
|
|
8
9
|
export interface ProviderCapabilities {
|
|
9
10
|
supportsImageGeneration: boolean;
|
|
11
|
+
supportsImageEditing: boolean;
|
|
10
12
|
supportsVideoGeneration: boolean;
|
|
11
13
|
supportsAudioGeneration: boolean;
|
|
12
14
|
supportedImageAspectRatios: string[];
|
|
@@ -21,11 +23,19 @@ export interface ImageParams {
|
|
|
21
23
|
quality: string;
|
|
22
24
|
providerOptions?: Record<string, unknown>;
|
|
23
25
|
}
|
|
26
|
+
export interface EditImageParams {
|
|
27
|
+
imageData: Buffer;
|
|
28
|
+
imageMimeType: string;
|
|
29
|
+
prompt: string;
|
|
30
|
+
providerOptions?: Record<string, unknown>;
|
|
31
|
+
}
|
|
24
32
|
export interface VideoParams {
|
|
25
33
|
prompt: string;
|
|
26
34
|
duration: number;
|
|
27
35
|
aspectRatio: string;
|
|
28
36
|
resolution: string;
|
|
37
|
+
imageData?: Buffer;
|
|
38
|
+
imageMimeType?: string;
|
|
29
39
|
providerOptions?: Record<string, unknown>;
|
|
30
40
|
}
|
|
31
41
|
export interface AudioParams {
|
package/build/providers/xai.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { MediaProvider, ProviderCapabilities, ImageParams, VideoParams, AudioParams, GeneratedMedia } from "./types.js";
|
|
1
|
+
import type { MediaProvider, ProviderCapabilities, ImageParams, EditImageParams, VideoParams, AudioParams, GeneratedMedia } from "./types.js";
|
|
2
2
|
export declare class XAIProvider implements MediaProvider {
|
|
3
3
|
readonly name = "xai";
|
|
4
4
|
readonly capabilities: ProviderCapabilities;
|
|
@@ -6,6 +6,7 @@ export declare class XAIProvider implements MediaProvider {
|
|
|
6
6
|
private apiKey;
|
|
7
7
|
constructor(apiKey: string);
|
|
8
8
|
generateImage(params: ImageParams): Promise<GeneratedMedia>;
|
|
9
|
+
editImage(params: EditImageParams): Promise<GeneratedMedia>;
|
|
9
10
|
generateVideo(params: VideoParams): Promise<GeneratedMedia>;
|
|
10
11
|
generateAudio(_params: AudioParams): Promise<GeneratedMedia>;
|
|
11
12
|
}
|
package/build/providers/xai.js
CHANGED
|
@@ -7,6 +7,7 @@ export class XAIProvider {
|
|
|
7
7
|
name = "xai";
|
|
8
8
|
capabilities = {
|
|
9
9
|
supportsImageGeneration: true,
|
|
10
|
+
supportsImageEditing: true,
|
|
10
11
|
supportsVideoGeneration: true,
|
|
11
12
|
supportsAudioGeneration: false,
|
|
12
13
|
supportedImageAspectRatios: ["1:1", "16:9", "9:16", "4:3", "3:4"],
|
|
@@ -39,6 +40,45 @@ export class XAIProvider {
|
|
|
39
40
|
metadata: { model: IMAGE_MODEL, provider: "xai" },
|
|
40
41
|
};
|
|
41
42
|
}
|
|
43
|
+
async editImage(params) {
|
|
44
|
+
const base64Data = params.imageData.toString("base64");
|
|
45
|
+
const dataUri = `data:${params.imageMimeType};base64,${base64Data}`;
|
|
46
|
+
const response = await fetch(`${XAI_BASE_URL}/images/edits`, {
|
|
47
|
+
method: "POST",
|
|
48
|
+
headers: {
|
|
49
|
+
"Content-Type": "application/json",
|
|
50
|
+
Authorization: `Bearer ${this.apiKey}`,
|
|
51
|
+
},
|
|
52
|
+
body: JSON.stringify({
|
|
53
|
+
model: IMAGE_MODEL,
|
|
54
|
+
prompt: params.prompt,
|
|
55
|
+
image: { url: dataUri, type: "image_url" },
|
|
56
|
+
...params.providerOptions,
|
|
57
|
+
}),
|
|
58
|
+
});
|
|
59
|
+
if (!response.ok) {
|
|
60
|
+
throw new Error(`xAI image editing failed: ${response.status}`);
|
|
61
|
+
}
|
|
62
|
+
const result = (await response.json());
|
|
63
|
+
const imageResult = result.data?.[0];
|
|
64
|
+
if (imageResult?.b64_json) {
|
|
65
|
+
return {
|
|
66
|
+
data: Buffer.from(imageResult.b64_json, "base64"),
|
|
67
|
+
mimeType: "image/png",
|
|
68
|
+
metadata: { model: IMAGE_MODEL, provider: "xai", operation: "edit" },
|
|
69
|
+
};
|
|
70
|
+
}
|
|
71
|
+
if (imageResult?.url) {
|
|
72
|
+
const imageResponse = await fetch(imageResult.url);
|
|
73
|
+
const data = Buffer.from(await imageResponse.arrayBuffer());
|
|
74
|
+
return {
|
|
75
|
+
data,
|
|
76
|
+
mimeType: "image/png",
|
|
77
|
+
metadata: { model: IMAGE_MODEL, provider: "xai", operation: "edit" },
|
|
78
|
+
};
|
|
79
|
+
}
|
|
80
|
+
throw new Error("xAI image editing returned no data");
|
|
81
|
+
}
|
|
42
82
|
async generateVideo(params) {
|
|
43
83
|
const submitResponse = await fetch(`${XAI_BASE_URL}/videos/generations`, {
|
|
44
84
|
method: "POST",
|
package/build/read-media-file.js
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { readFile } from "node:fs/promises";
|
|
2
|
+
import { extname, resolve } from "node:path";
|
|
3
|
+
const EXTENSION_TO_MIME = {
|
|
4
|
+
".png": "image/png",
|
|
5
|
+
".jpg": "image/jpeg",
|
|
6
|
+
".jpeg": "image/jpeg",
|
|
7
|
+
".webp": "image/webp",
|
|
8
|
+
".gif": "image/gif",
|
|
9
|
+
".mp4": "video/mp4",
|
|
10
|
+
};
|
|
11
|
+
export async function readMediaFile(filePath) {
|
|
12
|
+
const absolutePath = resolve(filePath);
|
|
13
|
+
const extension = extname(absolutePath).toLowerCase();
|
|
14
|
+
const mimeType = EXTENSION_TO_MIME[extension];
|
|
15
|
+
if (!mimeType) {
|
|
16
|
+
const supported = Object.keys(EXTENSION_TO_MIME).join(", ");
|
|
17
|
+
throw new Error(`Unsupported file extension "${extension}". Supported: ${supported}`);
|
|
18
|
+
}
|
|
19
|
+
const data = await readFile(absolutePath);
|
|
20
|
+
return { data: Buffer.from(data), mimeType };
|
|
21
|
+
}
|
package/build/server.js
CHANGED
|
@@ -6,6 +6,7 @@ import { XAIProvider } from "./providers/xai.js";
|
|
|
6
6
|
import { GoogleProvider } from "./providers/google.js";
|
|
7
7
|
import { FileManager } from "./file-manager.js";
|
|
8
8
|
import { buildGenerateImageHandler } from "./tools/generate-image.js";
|
|
9
|
+
import { buildEditImageHandler } from "./tools/edit-image.js";
|
|
9
10
|
import { buildGenerateVideoHandler } from "./tools/generate-video.js";
|
|
10
11
|
import { buildGenerateAudioHandler } from "./tools/generate-audio.js";
|
|
11
12
|
import { buildListProvidersHandler } from "./tools/list-providers.js";
|
|
@@ -25,6 +26,7 @@ export function createServer(config) {
|
|
|
25
26
|
console.error("[server] Registered Google provider");
|
|
26
27
|
}
|
|
27
28
|
const generateImageHandler = buildGenerateImageHandler(registry, fileManager);
|
|
29
|
+
const editImageHandler = buildEditImageHandler(registry, fileManager);
|
|
28
30
|
const generateVideoHandler = buildGenerateVideoHandler(registry, fileManager);
|
|
29
31
|
const generateAudioHandler = buildGenerateAudioHandler(registry, fileManager);
|
|
30
32
|
const listProvidersHandler = buildListProvidersHandler(registry);
|
|
@@ -38,12 +40,20 @@ export function createServer(config) {
|
|
|
38
40
|
outputDirectory: z.string().optional().describe("Directory to save the generated file. Supports absolute or relative paths (resolved from cwd). Defaults to MEDIA_OUTPUT_DIR env var or cwd."),
|
|
39
41
|
providerOptions: z.record(z.string(), z.unknown()).optional().describe("Provider-specific parameters passed through directly"),
|
|
40
42
|
}, async (params) => generateImageHandler(params));
|
|
41
|
-
server.tool("
|
|
43
|
+
server.tool("edit_image", `Edit an existing image using AI. Provide the path to an image and a text prompt describing the desired edits. Available providers: ${providerNames}`, {
|
|
44
|
+
imagePath: z.string().describe("Absolute path to the source image file to edit"),
|
|
45
|
+
prompt: z.string().describe("Text description of the edits to apply to the image"),
|
|
46
|
+
provider: z.string().optional().describe("Provider to use: openai, xai, google. Auto-selects if omitted."),
|
|
47
|
+
outputDirectory: z.string().optional().describe("Directory to save the edited file. Supports absolute or relative paths (resolved from cwd). Defaults to MEDIA_OUTPUT_DIR env var or cwd."),
|
|
48
|
+
providerOptions: z.record(z.string(), z.unknown()).optional().describe("Provider-specific parameters passed through directly"),
|
|
49
|
+
}, async (params) => editImageHandler(params));
|
|
50
|
+
server.tool("generate_video", `Generate a video from a text prompt using AI. Optionally provide an image as the first frame. Available providers: ${providerNames}`, {
|
|
42
51
|
prompt: z.string().describe("Text description of the video to generate"),
|
|
43
52
|
provider: z.string().optional().describe("Provider to use: openai, xai, google. Auto-selects if omitted."),
|
|
44
53
|
duration: z.number().optional().describe("Video duration in seconds (provider limits apply)"),
|
|
45
54
|
aspectRatio: z.string().optional().describe("Aspect ratio: 16:9, 9:16, 1:1"),
|
|
46
55
|
resolution: z.string().optional().describe("Resolution: 480p, 720p, 1080p"),
|
|
56
|
+
imagePath: z.string().optional().describe("Path to an image to use as the first frame of the video (OpenAI and Google only)"),
|
|
47
57
|
outputDirectory: z.string().optional().describe("Directory to save the generated file. Supports absolute or relative paths (resolved from cwd). Defaults to MEDIA_OUTPUT_DIR env var or cwd."),
|
|
48
58
|
providerOptions: z.record(z.string(), z.unknown()).optional().describe("Provider-specific parameters passed through directly"),
|
|
49
59
|
}, async (params) => generateVideoHandler(params));
|
package/build/tools/edit-image.d.ts
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import type { ProviderRegistry } from "../providers/registry.js";
|
|
2
|
+
import type { FileManager } from "../file-manager.js";
|
|
3
|
+
export declare function buildEditImageHandler(registry: ProviderRegistry, fileManager: FileManager): (params: {
|
|
4
|
+
imagePath: string;
|
|
5
|
+
prompt: string;
|
|
6
|
+
provider?: string;
|
|
7
|
+
outputDirectory?: string;
|
|
8
|
+
providerOptions?: Record<string, unknown>;
|
|
9
|
+
}) => Promise<{
|
|
10
|
+
isError: true;
|
|
11
|
+
content: {
|
|
12
|
+
type: "text";
|
|
13
|
+
text: string;
|
|
14
|
+
}[];
|
|
15
|
+
} | {
|
|
16
|
+
content: {
|
|
17
|
+
type: "text";
|
|
18
|
+
text: string;
|
|
19
|
+
}[];
|
|
20
|
+
isError?: undefined;
|
|
21
|
+
}>;
|
package/build/tools/edit-image.js
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import { readMediaFile } from "../read-media-file.js";
|
|
2
|
+
import { sanitizeError } from "../errors.js";
|
|
3
|
+
export function buildEditImageHandler(registry, fileManager) {
|
|
4
|
+
return async (params) => {
|
|
5
|
+
const provider = params.provider
|
|
6
|
+
? registry.getProvider(params.provider)
|
|
7
|
+
: registry.getImageEditProviders()[0];
|
|
8
|
+
if (!provider) {
|
|
9
|
+
const availableNames = registry.getImageEditProviders().map((p) => p.name).join(", ");
|
|
10
|
+
const text = params.provider
|
|
11
|
+
? `Provider "${params.provider}" is not configured. Available providers: ${availableNames || "none"}`
|
|
12
|
+
: "No image editing provider available. Configure one of: OPENAI_API_KEY, XAI_API_KEY, GEMINI_API_KEY";
|
|
13
|
+
return {
|
|
14
|
+
isError: true,
|
|
15
|
+
content: [{ type: "text", text }],
|
|
16
|
+
};
|
|
17
|
+
}
|
|
18
|
+
if (!provider.capabilities.supportsImageEditing) {
|
|
19
|
+
const availableNames = registry.getImageEditProviders().map((p) => p.name).join(", ");
|
|
20
|
+
return {
|
|
21
|
+
isError: true,
|
|
22
|
+
content: [{
|
|
23
|
+
type: "text",
|
|
24
|
+
text: `Provider "${provider.name}" does not support image editing. Available: ${availableNames || "none"}`,
|
|
25
|
+
}],
|
|
26
|
+
};
|
|
27
|
+
}
|
|
28
|
+
try {
|
|
29
|
+
const { data, mimeType } = await readMediaFile(params.imagePath);
|
|
30
|
+
const media = await provider.editImage({
|
|
31
|
+
imageData: data,
|
|
32
|
+
imageMimeType: mimeType,
|
|
33
|
+
prompt: params.prompt,
|
|
34
|
+
providerOptions: params.providerOptions,
|
|
35
|
+
});
|
|
36
|
+
const filePath = await fileManager.save(media, "image", params.outputDirectory);
|
|
37
|
+
return {
|
|
38
|
+
content: [{ type: "text", text: `Edited image saved to ${filePath}` }],
|
|
39
|
+
};
|
|
40
|
+
}
|
|
41
|
+
catch (error) {
|
|
42
|
+
const message = sanitizeError(error);
|
|
43
|
+
return {
|
|
44
|
+
isError: true,
|
|
45
|
+
content: [{ type: "text", text: `Image editing failed: ${message}` }],
|
|
46
|
+
};
|
|
47
|
+
}
|
|
48
|
+
};
|
|
49
|
+
}
|
package/build/tools/generate-video.js
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { readMediaFile } from "../read-media-file.js";
|
|
1
2
|
import { sanitizeError } from "../errors.js";
|
|
2
3
|
export function buildGenerateVideoHandler(registry, fileManager) {
|
|
3
4
|
return async (params) => {
|
|
@@ -13,11 +14,20 @@ export function buildGenerateVideoHandler(registry, fileManager) {
|
|
|
13
14
|
};
|
|
14
15
|
}
|
|
15
16
|
try {
|
|
17
|
+
let imageData;
|
|
18
|
+
let imageMimeType;
|
|
19
|
+
if (params.imagePath) {
|
|
20
|
+
const file = await readMediaFile(params.imagePath);
|
|
21
|
+
imageData = file.data;
|
|
22
|
+
imageMimeType = file.mimeType;
|
|
23
|
+
}
|
|
16
24
|
const media = await provider.generateVideo({
|
|
17
25
|
prompt: params.prompt,
|
|
18
26
|
duration: params.duration ?? 5,
|
|
19
27
|
aspectRatio: params.aspectRatio ?? "16:9",
|
|
20
28
|
resolution: params.resolution ?? "720p",
|
|
29
|
+
imageData,
|
|
30
|
+
imageMimeType,
|
|
21
31
|
providerOptions: params.providerOptions,
|
|
22
32
|
});
|
|
23
33
|
const filePath = await fileManager.save(media, "video", params.outputDirectory);
|
package/build/tools/list-providers.js
CHANGED
|
@@ -13,6 +13,8 @@ export function buildListProvidersHandler(registry) {
|
|
|
13
13
|
const caps = [];
|
|
14
14
|
if (p.capabilities.supportsImageGeneration)
|
|
15
15
|
caps.push("image");
|
|
16
|
+
if (p.capabilities.supportsImageEditing)
|
|
17
|
+
caps.push("image editing");
|
|
16
18
|
if (p.capabilities.supportsVideoGeneration)
|
|
17
19
|
caps.push("video");
|
|
18
20
|
if (p.capabilities.supportsAudioGeneration)
|
package/package.json
CHANGED
|
@@ -1,12 +1,11 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@r16t/multimodal-mcp",
|
|
3
|
-
"version": "1.1.2",
|
|
3
|
+
"version": "1.2.3",
|
|
4
4
|
"description": "Multi-provider media generation MCP server",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "build/index.js",
|
|
7
7
|
"bin": {
|
|
8
|
-
"multimodal-mcp": "build/index.js"
|
|
9
|
-
"@r16t/multimodal-mcp": "build/index.js"
|
|
8
|
+
"multimodal-mcp": "build/index.js"
|
|
10
9
|
},
|
|
11
10
|
"files": [
|
|
12
11
|
"build/"
|
|
@@ -19,6 +18,10 @@
|
|
|
19
18
|
"lint": "eslint src --fix",
|
|
20
19
|
"typecheck": "tsc --noEmit"
|
|
21
20
|
},
|
|
21
|
+
"repository": {
|
|
22
|
+
"type": "git",
|
|
23
|
+
"url": "https://github.com/rsmdt/multimodal-mcp"
|
|
24
|
+
},
|
|
22
25
|
"license": "MIT",
|
|
23
26
|
"keywords": [
|
|
24
27
|
"mcp",
|