npm - @r16t/multimodal-mcp - Versions diffs - 1.0.0 → 1.1.2 - Mend

@r16t/multimodal-mcp 1.0.0 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

package/LICENSE +21 -0
package/README.md +118 -48
package/build/config.js +5 -6
package/build/file-manager.d.ts +1 -1
package/build/file-manager.js +6 -3
package/build/providers/openai.js +2 -2
package/build/server.js +3 -0
package/build/tools/generate-audio.d.ts +1 -0
package/build/tools/generate-audio.js +2 -2
package/build/tools/generate-image.d.ts +1 -0
package/build/tools/generate-image.js +2 -2
package/build/tools/generate-video.d.ts +1 -0
package/build/tools/generate-video.js +2 -2
package/build/tools/list-providers.js +3 -1
package/package.json +1 -1

package/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Rudolf S.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

package/README.md CHANGED Viewed

@@ -1,65 +1,41 @@
 # multimodal-mcp
-Multi-provider media generation MCP server. Generate images, videos, and audio from text prompts using OpenAI, xAI, and Google through a single unified interface.
+Multi-provider media generation MCP server. Generate images, videos, and audio from text prompts using OpenAI, xAI, and Gemini through a single unified interface.
 ## Features
-- 🎨 **Image Generation** — Generate images via OpenAI (gpt-image-1), xAI (grok-imagine-image), or Google (imagen-4)
-- 🎬 **Video Generation** — Generate videos via OpenAI (sora-2), xAI (grok-imagine-video), or Google (veo-3.1)
-- 🔊 **Audio Generation** — Text-to-speech via OpenAI (tts-1) or Google (gemini-2.5-flash-preview-tts)
+- 🎨 **Image Generation** — Generate images via OpenAI (gpt-image-1), xAI (grok-imagine-image), or Gemini (imagen-4)
+- 🎬 **Video Generation** — Generate videos via OpenAI (sora-2), xAI (grok-imagine-video), or Gemini (veo-3.1)
+- 🔊 **Audio Generation** — Text-to-speech via OpenAI (tts-1) or Gemini (gemini-2.5-flash-preview-tts)
 - 🔄 **Auto-Discovery** — Automatically detects configured providers from environment variables
 - 🎯 **Provider Selection** — Auto-selects a provider, or lets you explicitly choose one per request
 - 📁 **File Output** — Saves all generated media to disk with descriptive filenames
 ## Quick Start
-### Claude Desktop
-Add to your Claude Desktop config (`~/Library/Application Support/Claude/claude_desktop_config.json`):
-```json
-{
-  "mcpServers": {
-    "multimodal-mcp": {
-      "command": "npx",
-      "args": ["@r16t/multimodal-mcp"],
-      "env": {
-        "OPENAI_API_KEY": "sk-...",
-        "XAI_API_KEY": "xai-...",
-        "GOOGLE_API_KEY": "AIza...",
-        "MEDIA_OUTPUT_DIR": "/tmp/media"
-      }
-    }
-  }
-}
-```
+Set the API key for at least one provider. Most users only need one — add more to access additional providers.
-You only need to set keys for the providers you want to use. At least one is required.
+```bash
+# Using OpenAI
+claude mcp add multimodal-mcp -e OPENAI_API_KEY=sk-... -- npx @r16t/multimodal-mcp
-### Cursor / Other MCP Clients
+# Or using xAI
+# claude mcp add multimodal-mcp -e XAI_API_KEY=xai-... -- npx @r16t/multimodal-mcp
-```json
-{
-  "mcpServers": {
-    "multimodal-mcp": {
-      "command": "npx",
-      "args": ["@r16t/multimodal-mcp"],
-      "env": {
-        "OPENAI_API_KEY": "sk-..."
-      }
-    }
-  }
-}
+# Or using Gemini
+# claude mcp add multimodal-mcp -e GEMINI_API_KEY=AIza... -- npx @r16t/multimodal-mcp
 ```
+Using a different editor? See [setup instructions](#editor-setup) for Claude Desktop, Cursor, VS Code, Windsurf, and Cline.
 ## Environment Variables
 | Variable | Required | Description |
 |----------|----------|-------------|
 | `OPENAI_API_KEY` | At least one provider key | OpenAI API key — enables image, video, and audio generation via gpt-image-1, sora-2, and tts-1 |
 | `XAI_API_KEY` | At least one provider key | xAI API key — enables image and video generation via grok-imagine-image and grok-imagine-video |
-| `GOOGLE_API_KEY` | At least one provider key | Google API key — enables image, video, and audio generation via imagen-4, veo-3.1, and gemini-2.5-flash-preview-tts |
-| `GEMINI_API_KEY` | — | Alias for `GOOGLE_API_KEY`; either name is accepted |
+| `GEMINI_API_KEY` | At least one provider key | Gemini API key — enables image, video, and audio generation via imagen-4, veo-3.1, and gemini-2.5-flash-preview-tts |
+| `GOOGLE_API_KEY` | — | Alias for `GEMINI_API_KEY`; either name is accepted |
 | `MEDIA_OUTPUT_DIR` | No | Directory for saved media files. Defaults to the system temp directory |
 ## Available Tools
@@ -112,7 +88,7 @@ List all configured media generation providers and their capabilities. Takes no
 |----------|:-----:|:-----:|:-----:|-------------|-------------|-------------|
 | OpenAI | ✅ | ✅ | ✅ | gpt-image-1 | sora-2 | tts-1 |
 | xAI | ✅ | ✅ | — | grok-imagine-image | grok-imagine-video | — |
-| Google | ✅ | ✅ | ✅ | imagen-4 | veo-3.1 | gemini-2.5-flash-preview-tts |
+| Gemini | ✅ | ✅ | ✅ | imagen-4 | veo-3.1 | gemini-2.5-flash-preview-tts |
 ### Image Aspect Ratios
@@ -120,7 +96,7 @@ List all configured media generation providers and their capabilities. Takes no
 |----------|:---:|:----:|:----:|:---:|:---:|
 | OpenAI | ✅ | ✅ | ✅ | ✅ | ✅ |
 | xAI | ✅ | ✅ | ✅ | ✅ | ✅ |
-| Google | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Gemini | ✅ | ✅ | ✅ | ✅ | ✅ |
 ### Video Aspect Ratios & Resolutions
@@ -128,14 +104,14 @@ List all configured media generation providers and their capabilities. Takes no
 |----------|:----:|:----:|:---:|:----:|:----:|:-----:|
 | OpenAI | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | xAI | ✅ | ✅ | ✅ | — | ✅ | ✅ |
-| Google | ✅ | ✅ | — | — | ✅ | ✅ |
+| Gemini | ✅ | ✅ | — | — | ✅ | ✅ |
 ### Audio Formats
 | Provider | mp3 | opus | aac | flac | wav | pcm |
 |----------|:---:|:----:|:---:|:----:|:---:|:---:|
 | OpenAI | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
-| Google | — | — | — | — | ✅ | — |
+| Gemini | — | — | — | — | ✅ | — |
 ## Troubleshooting
@@ -145,11 +121,11 @@ List all configured media generation providers and their capabilities. Takes no
 [config] No provider API keys detected
 ```
-Set at least one of `OPENAI_API_KEY`, `XAI_API_KEY`, or `GOOGLE_API_KEY` in the MCP server's `env` block.
+Set at least one of `OPENAI_API_KEY`, `XAI_API_KEY`, or `GEMINI_API_KEY` in the MCP server's `env` block.
 ### Provider not available for requested media type
-All three providers support image and video generation. Audio generation (text-to-speech) is supported by OpenAI and Google. xAI does not currently offer a standalone TTS API. If you specify a `provider` that isn't configured (no API key) or doesn't support the requested media type, you'll receive an error. Omit the `provider` parameter to auto-select from configured providers.
+All three providers support image and video generation. Audio generation (text-to-speech) is supported by OpenAI and Gemini. xAI does not currently offer a standalone TTS API. If you specify a `provider` that isn't configured (no API key) or doesn't support the requested media type, you'll receive an error. Omit the `provider` parameter to auto-select from configured providers.
 ### Video generation timeout
@@ -159,9 +135,9 @@ Video generation polls for up to 10 minutes. If your video hasn't completed in t
 This indicates the xAI API returned an empty response. Check that your `XAI_API_KEY` is valid and that your prompt does not violate xAI content policies.
-### Google image/video generation failed: 403
+### Gemini image/video generation failed: 403
-Verify your `GOOGLE_API_KEY` has the Generative Language API enabled in Google Cloud Console.
+Verify your `GEMINI_API_KEY` has the Generative Language API enabled in Google Cloud Console.
 ## Development
@@ -173,6 +149,100 @@ npm run typecheck  # Type-check without emitting
 npm run dev        # Watch mode for TypeScript compilation
 ```
+## Editor Setup
+Replace `OPENAI_API_KEY` with the key variable for your provider of choice (`XAI_API_KEY` or `GEMINI_API_KEY`). You can set multiple keys to enable multiple providers.
+### Claude Desktop
+Add to `~/Library/Application Support/Claude/claude_desktop_config.json`:
+```json
+{
+  "mcpServers": {
+    "multimodal-mcp": {
+      "command": "npx",
+      "args": ["@r16t/multimodal-mcp"],
+      "env": {
+        "OPENAI_API_KEY": "sk-..."
+      }
+    }
+  }
+}
+```
+### Cursor
+Add to `.cursor/mcp.json` in your project root (or `~/.cursor/mcp.json` globally):
+```json
+{
+  "mcpServers": {
+    "multimodal-mcp": {
+      "command": "npx",
+      "args": ["@r16t/multimodal-mcp"],
+      "env": {
+        "OPENAI_API_KEY": "sk-..."
+      }
+    }
+  }
+}
+```
+### VS Code (GitHub Copilot)
+Add to `.vscode/mcp.json` in your project root:
+```json
+{
+  "servers": {
+    "multimodal-mcp": {
+      "command": "npx",
+      "args": ["@r16t/multimodal-mcp"],
+      "env": {
+        "OPENAI_API_KEY": "sk-..."
+      }
+    }
+  }
+}
+```
+### Windsurf
+Add to `~/.codeium/windsurf/mcp_config.json`:
+```json
+{
+  "mcpServers": {
+    "multimodal-mcp": {
+      "command": "npx",
+      "args": ["@r16t/multimodal-mcp"],
+      "env": {
+        "OPENAI_API_KEY": "sk-..."
+      }
+    }
+  }
+}
+```
+### Cline
+Add to `~/Library/Application Support/Code/User/globalStorage/saoudrizwan.claude-dev/settings/cline_mcp_settings.json`:
+```json
+{
+  "mcpServers": {
+    "multimodal-mcp": {
+      "command": "npx",
+      "args": ["@r16t/multimodal-mcp"],
+      "env": {
+        "OPENAI_API_KEY": "sk-..."
+      }
+    }
+  }
+}
+```
 ## License
 MIT

package/build/config.js CHANGED Viewed

@@ -1,20 +1,19 @@
 import { z } from "zod";
-import { tmpdir } from "node:os";
 const configSchema = z.object({
     openaiApiKey: z.string().optional(),
     xaiApiKey: z.string().optional(),
     googleApiKey: z.string().optional(),
     outputDirectory: z.string(),
 });
-function resolveGoogleKey() {
-    return process.env.GOOGLE_API_KEY || process.env.GEMINI_API_KEY || undefined;
+function resolveGeminiKey() {
+    return process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY || undefined;
 }
 export function loadConfig() {
     const config = configSchema.parse({
         openaiApiKey: process.env.OPENAI_API_KEY || undefined,
         xaiApiKey: process.env.XAI_API_KEY || undefined,
-        googleApiKey: resolveGoogleKey(),
-        outputDirectory: process.env.MEDIA_OUTPUT_DIR || tmpdir(),
+        googleApiKey: resolveGeminiKey(),
+        outputDirectory: process.env.MEDIA_OUTPUT_DIR || process.cwd(),
     });
     const detected = [];
     if (config.openaiApiKey)
@@ -22,7 +21,7 @@ export function loadConfig() {
     if (config.xaiApiKey)
         detected.push("xAI");
     if (config.googleApiKey)
-        detected.push("Google");
+        detected.push("Gemini");
     if (detected.length > 0) {
         console.error(`[config] Detected providers: ${detected.join(", ")}`);
     }

package/build/file-manager.d.ts CHANGED Viewed

@@ -2,6 +2,6 @@ import type { GeneratedMedia } from "./providers/types.js";
 export declare class FileManager {
     private readonly outputDirectory;
     constructor(outputDirectory: string);
-    save(media: GeneratedMedia, type: "image" | "video" | "audio"): Promise<string>;
+    save(media: GeneratedMedia, type: "image" | "video" | "audio", outputDirectory?: string): Promise<string>;
     private getExtension;
 }

package/build/file-manager.js CHANGED Viewed

@@ -6,14 +6,17 @@ export class FileManager {
     constructor(outputDirectory) {
         this.outputDirectory = resolve(outputDirectory);
     }
-    async save(media, type) {
-        await mkdir(this.outputDirectory, { recursive: true });
+    async save(media, type, outputDirectory) {
+        const targetDirectory = outputDirectory
+            ? resolve(outputDirectory)
+            : this.outputDirectory;
+        await mkdir(targetDirectory, { recursive: true });
         const extension = this.getExtension(type, media.mimeType);
         const provider = media.metadata.provider || "unknown";
         const timestamp = Date.now();
         const random = randomBytes(4).toString("hex");
         const filename = `${type}-${timestamp}-${provider}-${random}.${extension}`;
-        const filePath = join(this.outputDirectory, filename);
+        const filePath = join(targetDirectory, filename);
         await writeFile(filePath, media.data);
         return filePath;
     }

package/build/providers/openai.js CHANGED Viewed

@@ -28,8 +28,8 @@ export class OpenAIProvider {
             model: "gpt-image-1",
             prompt: params.prompt,
             size: this.mapAspectRatioToSize(params.aspectRatio),
-            quality: params.quality === "high" ? "hd" : "standard",
-            response_format: "b64_json",
+            quality: params.quality === "high" ? "high" : params.quality === "low" ? "low" : "medium",
+            output_format: "png",
             ...params.providerOptions,
         });
         const base64Data = response.data[0].b64_json;

package/build/server.js CHANGED Viewed

@@ -35,6 +35,7 @@ export function createServer(config) {
         provider: z.string().optional().describe("Provider to use: openai, xai, google. Auto-selects if omitted."),
         aspectRatio: z.string().optional().describe("Aspect ratio: 1:1, 16:9, 9:16, 4:3, 3:4"),
         quality: z.string().optional().describe("Quality level: low, standard, high"),
+        outputDirectory: z.string().optional().describe("Directory to save the generated file. Supports absolute or relative paths (resolved from cwd). Defaults to MEDIA_OUTPUT_DIR env var or cwd."),
         providerOptions: z.record(z.string(), z.unknown()).optional().describe("Provider-specific parameters passed through directly"),
     }, async (params) => generateImageHandler(params));
     server.tool("generate_video", `Generate a video from a text prompt using AI. Available providers: ${providerNames}`, {
@@ -43,6 +44,7 @@ export function createServer(config) {
         duration: z.number().optional().describe("Video duration in seconds (provider limits apply)"),
         aspectRatio: z.string().optional().describe("Aspect ratio: 16:9, 9:16, 1:1"),
         resolution: z.string().optional().describe("Resolution: 480p, 720p, 1080p"),
+        outputDirectory: z.string().optional().describe("Directory to save the generated file. Supports absolute or relative paths (resolved from cwd). Defaults to MEDIA_OUTPUT_DIR env var or cwd."),
         providerOptions: z.record(z.string(), z.unknown()).optional().describe("Provider-specific parameters passed through directly"),
     }, async (params) => generateVideoHandler(params));
     server.tool("generate_audio", `Generate audio (text-to-speech) from text using AI. Available providers: ${providerNames}`, {
@@ -51,6 +53,7 @@ export function createServer(config) {
         voice: z.string().optional().describe("Voice name (provider-specific). OpenAI: alloy, ash, coral, echo, fable, nova, onyx, sage, shimmer. Google: Kore, Charon, Fenrir, Aoede, Puck, etc."),
         speed: z.number().optional().describe("Speech speed multiplier (OpenAI only): 0.25 to 4.0"),
         format: z.string().optional().describe("Output format (OpenAI only): mp3, opus, aac, flac, wav, pcm"),
+        outputDirectory: z.string().optional().describe("Directory to save the generated file. Supports absolute or relative paths (resolved from cwd). Defaults to MEDIA_OUTPUT_DIR env var or cwd."),
         providerOptions: z.record(z.string(), z.unknown()).optional().describe("Provider-specific parameters passed through directly"),
     }, async (params) => generateAudioHandler(params));
     server.tool("list_providers", "List all configured media generation providers and their capabilities", async () => listProvidersHandler());

package/build/tools/generate-audio.d.ts CHANGED Viewed

@@ -6,6 +6,7 @@ export declare function buildGenerateAudioHandler(registry: ProviderRegistry, fi
     voice?: string;
     speed?: number;
     format?: string;
+    outputDirectory?: string;
     providerOptions?: Record<string, unknown>;
 }) => Promise<{
     isError: true;

package/build/tools/generate-audio.js CHANGED Viewed

@@ -8,7 +8,7 @@ export function buildGenerateAudioHandler(registry, fileManager) {
             const available = registry.getAudioProviders().map((p) => p.name).join(", ") || "none";
             const text = params.provider
                 ? `Provider "${params.provider}" is not configured or does not support audio. Available audio providers: ${available}`
-                : "No audio provider available. Configure one of: OPENAI_API_KEY, GOOGLE_API_KEY";
+                : "No audio provider available. Configure one of: OPENAI_API_KEY, GEMINI_API_KEY";
             return {
                 isError: true,
                 content: [{ type: "text", text }],
@@ -32,7 +32,7 @@ export function buildGenerateAudioHandler(registry, fileManager) {
                 format: params.format,
                 providerOptions: params.providerOptions,
             });
-            const filePath = await fileManager.save(media, "audio");
+            const filePath = await fileManager.save(media, "audio", params.outputDirectory);
             return {
                 content: [{ type: "text", text: `Audio saved to ${filePath}` }],
             };

package/build/tools/generate-image.d.ts CHANGED Viewed

@@ -5,6 +5,7 @@ export declare function buildGenerateImageHandler(registry: ProviderRegistry, fi
     provider?: string;
     aspectRatio?: string;
     quality?: string;
+    outputDirectory?: string;
     providerOptions?: Record<string, unknown>;
 }) => Promise<{
     isError: true;

package/build/tools/generate-image.js CHANGED Viewed

@@ -6,7 +6,7 @@ export function buildGenerateImageHandler(registry, fileManager) {
             const availableNames = registry.getImageProviders().map((p) => p.name).join(", ");
             const text = params.provider
                 ? `Provider "${params.provider}" is not configured. Available providers: ${availableNames || "none"}`
-                : "No image provider available. Configure one of: OPENAI_API_KEY, XAI_API_KEY, GOOGLE_API_KEY";
+                : "No image provider available. Configure one of: OPENAI_API_KEY, XAI_API_KEY, GEMINI_API_KEY";
             return {
                 isError: true,
                 content: [{ type: "text", text }],
@@ -19,7 +19,7 @@ export function buildGenerateImageHandler(registry, fileManager) {
                 quality: params.quality ?? "standard",
                 providerOptions: params.providerOptions,
             });
-            const filePath = await fileManager.save(media, "image");
+            const filePath = await fileManager.save(media, "image", params.outputDirectory);
             return {
                 content: [{ type: "text", text: `Image saved to ${filePath}` }],
             };

package/build/tools/generate-video.d.ts CHANGED Viewed

@@ -6,6 +6,7 @@ export declare function buildGenerateVideoHandler(registry: ProviderRegistry, fi
     duration?: number;
     aspectRatio?: string;
     resolution?: string;
+    outputDirectory?: string;
     providerOptions?: Record<string, unknown>;
 }) => Promise<{
     isError: true;

package/build/tools/generate-video.js CHANGED Viewed

@@ -6,7 +6,7 @@ export function buildGenerateVideoHandler(registry, fileManager) {
             const available = registry.getVideoProviders().map((p) => p.name).join(", ") || "none";
             const text = params.provider
                 ? `Provider "${params.provider}" is not configured. Available providers: ${available}`
-                : "No video provider available. Configure one of: OPENAI_API_KEY, XAI_API_KEY, GOOGLE_API_KEY";
+                : "No video provider available. Configure one of: OPENAI_API_KEY, XAI_API_KEY, GEMINI_API_KEY";
             return {
                 isError: true,
                 content: [{ type: "text", text }],
@@ -20,7 +20,7 @@ export function buildGenerateVideoHandler(registry, fileManager) {
                 resolution: params.resolution ?? "720p",
                 providerOptions: params.providerOptions,
             });
-            const filePath = await fileManager.save(media, "video");
+            const filePath = await fileManager.save(media, "video", params.outputDirectory);
             return {
                 content: [{ type: "text", text: `Video saved to ${filePath}` }],
             };

package/build/tools/list-providers.js CHANGED Viewed

@@ -5,7 +5,7 @@ export function buildListProvidersHandler(registry) {
             return {
                 content: [{
                         type: "text",
-                        text: "No providers configured. Set one or more API keys: OPENAI_API_KEY, XAI_API_KEY, GOOGLE_API_KEY",
+                        text: "No providers configured. Set one or more API keys: OPENAI_API_KEY, XAI_API_KEY, GEMINI_API_KEY",
                     }],
             };
         }
@@ -15,6 +15,8 @@ export function buildListProvidersHandler(registry) {
                 caps.push("image");
             if (p.capabilities.supportsVideoGeneration)
                 caps.push("video");
+            if (p.capabilities.supportsAudioGeneration)
+                caps.push("audio");
             return `- ${p.name}: ${caps.join(", ")}`;
         });
         return {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@r16t/multimodal-mcp",
-  "version": "1.0.0",
+  "version": "1.1.2",
   "description": "Multi-provider media generation MCP server",
   "type": "module",
   "main": "build/index.js",