npm - nvidia-vision-mcp - Versions diffs - 0.1.0 - Mend

nvidia-vision-mcp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 M Jupri Amin
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

package/README.md ADDED Viewed

@@ -0,0 +1,103 @@
+# NVIDIA Vision MCP
+A small MCP server for reading local images with NVIDIA vision models.
+This is useful when the AI model you are using cannot see images directly. A common case is browser debugging: Chrome DevTools can capture a screenshot, but the model still cannot inspect what is inside the image. This server gives the model a simple way to read that screenshot.
+## What It Does
+- Describes local images and screenshots
+- Extracts visible text from images
+- Answers specific questions about an image
+- Deletes temporary screenshot files after use
+## Setup
+Add the server to your MCP client config:
+```json
+{
+  "mcpServers": {
+    "nvidia-vision": {
+      "command": "npx",
+      "args": ["-y", "nvidia-vision-mcp"],
+      "env": {
+        "NVIDIA_MODEL": "meta/llama-4-maverick-17b-128e-instruct",
+        "NVIDIA_API_KEY": "your_nvidia_api_key"
+      }
+    }
+  }
+}
+```
+The API key is read from the MCP server environment. No `.env` file is needed.
+`NVIDIA_MODEL` is optional. If it is not set (or is set to an empty string), the server uses:
+```text
+meta/llama-4-maverick-17b-128e-instruct
+```
+You can replace it with another NVIDIA-hosted vision-capable chat model when needed.
+For local development from this folder:
+```json
+{
+  "mcpServers": {
+    "nvidia-vision": {
+      "command": "node",
+      "args": ["/path/to/nvidia-vision/src/server.js"],
+      "env": {
+        "NVIDIA_MODEL": "meta/llama-4-maverick-17b-128e-instruct",
+        "NVIDIA_API_KEY": "your_nvidia_api_key"
+      }
+    }
+  }
+}
+```
+## Tools
+`describe_image`
+Describes what is visible in a local image.
+`extract_text_from_image`
+Extracts text from an image or screenshot. Useful for UI errors, terminal output, form labels, dialogs, and short documents.
+`analyze_image`
+Answers a custom question about an image. For example, you can ask where a button is, what color an element uses, or whether an error message is visible.
+`delete_file`
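+Deletes the local file at the given path. This is mostly for cleaning up temporary screenshots, but the tool does not restrict which paths it will delete, so use it with care.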
+## Examples
+Read text from a screenshot:
+```text
+extract_text_from_image(image_path="/tmp/screenshot.png")
+```
+Ask about a specific part of the UI:
+```text
+analyze_image(
+  image_path="/tmp/screenshot.png",
+  question="What does the primary button say, and where is it located?"
+)
+```
+Describe a screenshot and remove it afterwards:
+```text
+describe_image(image_path="/tmp/screenshot.png", cleanup=true)
+```
+## Notes
+This server intentionally stays narrow. It exists to help models inspect local screenshots when another tool can produce the image file but cannot explain what is inside it.

package/package.json ADDED Viewed

@@ -0,0 +1,25 @@
+{
+  "name": "nvidia-vision-mcp",
+  "version": "0.1.0",
+  "description": "MCP server for reading local images with NVIDIA vision models.",
+  "license": "MIT",
+  "type": "module",
+  "bin": {
+    "nvidia-vision-mcp": "./src/server.js"
+  },
+  "files": [
+    "src",
+    "README.md",
+    "LICENSE"
+  ],
+  "engines": {
+    "node": ">=18"
+  },
+  "scripts": {
+    "check": "node --check src/server.js"
+  },
+  "dependencies": {
+    "@modelcontextprotocol/sdk": "^1.0.0",
+    "zod": "^3.24.0"
+  }
+}

package/src/server.js ADDED Viewed

@@ -0,0 +1,181 @@
+#!/usr/bin/env node
+import { readFile, unlink } from "node:fs/promises";
+import { existsSync } from "node:fs";
+import { extname, resolve } from "node:path";
+import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
+import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
+import { z } from "zod";
+const NVIDIA_BASE_URL = "https://integrate.api.nvidia.com/v1"; // API root; callVision appends /chat/completions to it.
+const DEFAULT_NVIDIA_MODEL = "meta/llama-4-maverick-17b-128e-instruct"; // Model id used by getModel when NVIDIA_MODEL is unset or empty.
+const server = new McpServer({ // Server instance on which the four tools below are registered.
+  name: "nvidia-vision",
+  version: "0.1.0",
+});
+function getApiKey() { // Returns the NVIDIA_API_KEY environment variable; throws an Error when it is missing.
+  const apiKey = process.env.NVIDIA_API_KEY;
+  if (!apiKey) { // Falsy check: an empty string is rejected the same way as an unset variable.
+    throw new Error("NVIDIA_API_KEY is not set. Add it to your MCP server environment.");
+  }
+  return apiKey;
+}
+function getModel() { // Returns the model id to send with each request.
+  return process.env.NVIDIA_MODEL || DEFAULT_NVIDIA_MODEL; // Logical OR, so an empty NVIDIA_MODEL also falls back to the default.
+}
+function getMimeType(filePath) { // Picks the image MIME type for filePath from its extension.
+  const mimeTypes = { // Keys are lower-case extensions including the leading dot, as returned by extname.
+    ".png": "image/png",
+    ".jpg": "image/jpeg",
+    ".jpeg": "image/jpeg",
+    ".webp": "image/webp",
+    ".gif": "image/gif",
+    ".bmp": "image/bmp",
+  };
+  return mimeTypes[extname(filePath).toLowerCase()] ?? "image/png"; // Case-insensitive lookup; unlisted or absent extensions default to image/png.
+}
+async function callVision(prompt, imagePath) { // Sends prompt plus the image at imagePath to the chat completions endpoint and resolves to the reply text.
+  const fullPath = resolve(imagePath);
+  if (!existsSync(fullPath)) { // A missing file produces a returned error string, not a thrown error.
+    return `Error: file not found: ${imagePath}`;
+  }
+  const image = await readFile(fullPath); // The whole file is read into memory; read failures reject this function.
+  const mimeType = getMimeType(fullPath);
+  const base64Image = image.toString("base64");
+  const response = await fetch(`${NVIDIA_BASE_URL}/chat/completions`, {
+    method: "POST",
+    headers: {
+      Authorization: `Bearer ${getApiKey()}`,
+      "Content-Type": "application/json",
+    },
+    body: JSON.stringify({
+      model: getModel(),
+      messages: [
+        { // Single user message carrying both the text prompt and the image.
+          role: "user",
+          content: [
+            { type: "text", text: prompt },
+            {
+              type: "image_url",
+              image_url: { url: `data:${mimeType};base64,${base64Image}` },
+            },
+          ],
+        },
+      ],
+      max_tokens: 1024,
+      temperature: 0.2,
+    }),
+  });
+  if (!response.ok) { // Unlike the missing-file case, a failed HTTP status throws; the message includes the status and response body.
+    const body = await response.text();
+    throw new Error(`NVIDIA API request failed (${response.status}): ${body}`);
+  }
+  const data = await response.json();
+  return data.choices?.[0]?.message?.content ?? "No response returned from NVIDIA API."; // Fixed fallback text when the first choice has no message content.
+}
+async function removeFile(filePath) { // Deletes filePath and resolves to a status string describing the outcome.
+  const fullPath = resolve(filePath);
+  if (!existsSync(fullPath)) { // Nothing to delete: reported in the returned string.
+    return `File not found: ${filePath}`;
+  }
+  try {
+    await unlink(fullPath);
+    return `Deleted: ${filePath}`;
+  } catch (error) { // unlink failures are turned into a status string instead of being rethrown.
+    return `Failed to delete ${filePath}: ${error.message}`;
+  }
+}
+function textResponse(text) { // Wraps text in the single-item content array that every tool handler in this file returns.
+  return {
+    content: [{ type: "text", text }],
+  };
+}
+server.tool( // Registers describe_image: name, description, input schema, handler.
+  "describe_image",
+  "Describe a local image in detail. Useful when the current AI model cannot see screenshots directly.",
+  {
+    image_path: z.string().describe("Absolute or relative path to the image file."),
+    cleanup: z.boolean().default(false).describe("Delete the image file after reading it."),
+  },
+  async ({ image_path, cleanup }) => { // Handler: sends a fixed description prompt for the image.
+    let result = await callVision(
+      "Describe this image in detail. Include visible elements, text, colors, layout, and anything unusual.",
+      image_path,
+    );
+    if (cleanup) { // Reached only when callVision returned; if it threw, the file is left in place. The deletion status is appended to the reply.
+      result += `\n\n${await removeFile(image_path)}`;
+    }
+    return textResponse(result);
+  },
+);
+server.tool( // Registers extract_text_from_image: name, description, input schema, handler.
+  "extract_text_from_image",
+  "Extract visible text from a local image or screenshot.",
+  {
+    image_path: z.string().describe("Absolute or relative path to the image file."),
+    cleanup: z.boolean().default(false).describe("Delete the image file after reading it."),
+  },
+  async ({ image_path, cleanup }) => { // Handler: sends a fixed text-extraction prompt for the image.
+    let result = await callVision(
+      "Extract all visible text from this image. Preserve line breaks and structure where possible. If there is no text, say that no text was found.",
+      image_path,
+    );
+    if (cleanup) { // Reached only when callVision returned; if it threw, the file is left in place. The deletion status is appended to the reply.
+      result += `\n\n${await removeFile(image_path)}`;
+    }
+    return textResponse(result);
+  },
+);
+server.tool( // Registers analyze_image: name, description, input schema, handler.
+  "analyze_image",
+  "Ask a specific question about a local image.",
+  {
+    image_path: z.string().describe("Absolute or relative path to the image file."),
+    question: z.string().describe("Question to answer using the image."),
+    cleanup: z.boolean().default(false).describe("Delete the image file after reading it."),
+  },
+  async ({ image_path, question, cleanup }) => { // Handler: the caller-supplied question is used directly as the prompt.
+    let result = await callVision(question, image_path);
+    if (cleanup) { // Reached only when callVision returned; if it threw, the file is left in place. The deletion status is appended to the reply.
+      result += `\n\n${await removeFile(image_path)}`;
+    }
+    return textResponse(result);
+  },
+);
+server.tool( // Registers delete_file: name, description, input schema, handler.
+  "delete_file",
+  "Delete a local file, usually a temporary screenshot that is no longer needed.",
+  {
+    file_path: z.string().describe("Path to the file to delete."),
+  },
+  async ({ file_path }) => textResponse(await removeFile(file_path)), // Any string path is passed straight to removeFile; no directory or file-type restriction is applied here.
+);
+const transport = new StdioServerTransport(); // NOTE(review): presumably talks to the MCP client over stdin/stdout, per the class name; confirm against the SDK.
+await server.connect(transport); // Top-level await; allowed because package.json declares type module.