nvidia-vision-mcp 0.1.0 → 0.2.0

This diff compares the contents of two publicly available versions of this package as released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (3)
  1. package/README.md +75 -0
  2. package/package.json +2 -2
  3. package/src/server.js +275 -16
package/README.md CHANGED
@@ -9,6 +9,13 @@ This is useful when the AI model you are using cannot see images directly. A com
9
9
  - Describes local images and screenshots
10
10
  - Extracts visible text from images
11
11
  - Answers specific questions about an image
12
+ - Turns a UI screenshot into code, a prompt, a spec, or a description
13
+ - OCRs screenshots optimized for code, terminal output, documents, or general text
14
+ - Diagnoses error screenshots and proposes fixes
15
+ - Interprets technical diagrams (architecture, flow, UML, ER, sequence, system)
16
+ - Reads charts and dashboards to surface insights and trends
17
+ - Compares two UI screenshots to flag visual drift
18
+ - General-purpose image understanding as a fallback
12
19
  - Deletes temporary screenshot files after use
13
20
 
14
21
  ## Setup
@@ -71,6 +78,39 @@ Extracts text from an image or screenshot. Useful for UI errors, terminal output
71
78
 
72
79
  Answers a custom question about an image. For example, you can ask where a button is, what color an element uses, or whether an error message is visible.
73
80
 
81
+ `ui_to_artifact`
82
+
83
+ Turns a UI screenshot into a reusable artifact. Choose `artifact_type`:
84
+
85
+ - `code` — production-ready code recreating the UI (optionally set `target`, e.g. `react + tailwind`).
86
+ - `prompt` — a text-to-UI prompt that reproduces the screenshot.
87
+ - `spec` — a structured UI specification.
88
+ - `description` — a written description for documentation.
89
+
90
+ `extract_text_from_screenshot`
91
+
92
+ OCR tuned for a specific `kind` of content: `code`, `terminal`, `document`, or `general` (default). Reproduces text verbatim with structure preserved.
93
+
94
+ `diagnose_error_screenshot`
95
+
96
+ Analyzes an error snapshot (stack trace, crash dialog, failed build, browser console). Extracts the error, explains it, finds the likely root cause, and lists ordered fix steps. Pass optional `context` for what was being attempted.
97
+
98
+ `understand_technical_diagram`
99
+
100
+ Interprets a technical diagram. Set `diagram_type` to `architecture`, `flow`, `uml`, `er`, `sequence`, `system`, or `auto` (default). Optionally ask a follow-up `question`.
101
+
102
+ `analyze_data_visualization`
103
+
104
+ Reads a chart, graph, or dashboard. Reports visualization type, axes/units, key values, trends, and insights. Optionally answer a specific `question`. Will not fabricate unreadable numbers.
105
+
106
+ `ui_diff_check`
107
+
108
+ Compares two UI screenshots (`image_path_a` / `image_path_b`) and flags visual or implementation drift, with per-difference severity and recommendations. Optionally `focus` on an aspect like `spacing`, `colors`, `layout`, or `typography`.
109
+
110
+ `image_analysis`
111
+
112
+ General-purpose image understanding when a more specific tool does not fit. Pass any freeform `task` instruction.
113
+
74
114
  `delete_file`
75
115
 
76
116
  Deletes a local file. This is mostly for cleaning up temporary screenshots.
@@ -98,6 +138,41 @@ Describe a screenshot and remove it afterwards:
98
138
  describe_image(image_path="/tmp/screenshot.png", cleanup=true)
99
139
  ```
100
140
 
141
+ Turn a UI screenshot into React + Tailwind code:
142
+
143
+ ```text
144
+ ui_to_artifact(
145
+ image_path="/tmp/screenshot.png",
146
+ artifact_type="code",
147
+ target="react + tailwind"
148
+ )
149
+ ```
150
+
151
+ OCR terminal output from a screenshot:
152
+
153
+ ```text
154
+ extract_text_from_screenshot(image_path="/tmp/terminal.png", kind="terminal")
155
+ ```
156
+
157
+ Diagnose a build error screenshot with context:
158
+
159
+ ```text
160
+ diagnose_error_screenshot(
161
+ image_path="/tmp/build-error.png",
162
+ context="Running vite build on a React + TypeScript project"
163
+ )
164
+ ```
165
+
166
+ Compare two versions of a UI:
167
+
168
+ ```text
169
+ ui_diff_check(
170
+ image_path_a="/tmp/before.png",
171
+ image_path_b="/tmp/after.png",
172
+ focus="spacing"
173
+ )
174
+ ```
175
+
101
176
  ## Notes
102
177
 
103
178
  This server intentionally stays narrow. It exists to help models inspect local screenshots when another tool can produce the image file but cannot explain what is inside it.
package/package.json CHANGED
@@ -1,11 +1,11 @@
1
1
  {
2
2
  "name": "nvidia-vision-mcp",
3
- "version": "0.1.0",
3
+ "version": "0.2.0",
4
4
  "description": "MCP server for reading local images with NVIDIA vision models.",
5
5
  "license": "MIT",
6
6
  "type": "module",
7
7
  "bin": {
8
- "nvidia-vision-mcp": "./src/server.js"
8
+ "nvidia-vision-mcp": "src/server.js"
9
9
  },
10
10
  "files": [
11
11
  "src",
package/src/server.js CHANGED
@@ -41,17 +41,44 @@ function getMimeType(filePath) {
41
41
  return mimeTypes[extname(filePath).toLowerCase()] ?? "image/png";
42
42
  }
43
43
 
44
- async function callVision(prompt, imagePath) {
44
+ async function loadImageContext(imagePath) {
45
45
  const fullPath = resolve(imagePath);
46
46
 
47
47
  if (!existsSync(fullPath)) {
48
- return `Error: file not found: ${imagePath}`;
48
+ const error = new Error(`file not found: ${imagePath}`);
49
+ error.code = "ENOENT";
50
+ error.imagePath = imagePath;
51
+ throw error;
49
52
  }
50
53
 
51
54
  const image = await readFile(fullPath);
52
55
  const mimeType = getMimeType(fullPath);
53
56
  const base64Image = image.toString("base64");
54
57
 
58
+ return {
59
+ type: "image_url",
60
+ image_url: { url: `data:${mimeType};base64,${base64Image}` },
61
+ };
62
+ }
63
+
64
+ async function callVision(prompt, imagePaths, options = {}) {
65
+ const paths = Array.isArray(imagePaths) ? imagePaths : [imagePaths];
66
+ const maxTokens = options.maxTokens ?? 1024;
67
+ const temperature = options.temperature ?? 0.2;
68
+
69
+ const content = [{ type: "text", text: prompt }];
70
+
71
+ for (const imagePath of paths) {
72
+ try {
73
+ content.push(await loadImageContext(imagePath));
74
+ } catch (error) {
75
+ if (error.code === "ENOENT") {
76
+ return `Error: ${error.message}`;
77
+ }
78
+ throw error;
79
+ }
80
+ }
81
+
55
82
  const response = await fetch(`${NVIDIA_BASE_URL}/chat/completions`, {
56
83
  method: "POST",
57
84
  headers: {
@@ -60,20 +87,9 @@ async function callVision(prompt, imagePath) {
60
87
  },
61
88
  body: JSON.stringify({
62
89
  model: getModel(),
63
- messages: [
64
- {
65
- role: "user",
66
- content: [
67
- { type: "text", text: prompt },
68
- {
69
- type: "image_url",
70
- image_url: { url: `data:${mimeType};base64,${base64Image}` },
71
- },
72
- ],
73
- },
74
- ],
75
- max_tokens: 1024,
76
- temperature: 0.2,
90
+ messages: [{ role: "user", content }],
91
+ max_tokens: maxTokens,
92
+ temperature,
77
93
  }),
78
94
  });
79
95
 
@@ -168,6 +184,249 @@ server.tool(
168
184
  },
169
185
  );
170
186
 
187
+ server.tool(
188
+ "ui_to_artifact",
189
+ "Turn a UI screenshot into a reusable artifact: code, a prompt, a spec, or a written description.",
190
+ {
191
+ image_path: z.string().describe("Absolute or relative path to the UI screenshot."),
192
+ artifact_type: z
193
+ .enum(["code", "prompt", "spec", "description"])
194
+ .describe("Kind of artifact to produce from the screenshot."),
195
+ target: z
196
+ .string()
197
+ .optional()
198
+ .describe("Optional target for the artifact, e.g. 'react + tailwind', 'html/css', or a component name."),
199
+ cleanup: z.boolean().default(false).describe("Delete the image file after reading it."),
200
+ },
201
+ async ({ image_path, artifact_type, target, cleanup }) => {
202
+ const targetHint = target ? ` Target stack/component: ${target}.` : "";
203
+
204
+ const prompts = {
205
+ code:
206
+ `Recreate this UI as production-ready code.${targetHint} Match layout, spacing, colors, typography, and component structure as closely as possible. Return only the code in a single fenced block, with a one-line summary before it.`,
207
+ prompt:
208
+ `Write a text-to-UI prompt that would reproduce this screenshot in a generative UI tool.${targetHint} Describe layout, sections, components, colors, typography, spacing, and interactions in enough detail to recreate it.`,
209
+ spec:
210
+ `Produce a concise UI specification for this screenshot.${targetHint} Cover layout, sections, components, copy, states, colors, typography, spacing, and accessibility notes as a structured list.`,
211
+ description:
212
+ `Describe this UI in detail for documentation purposes. Cover layout, visible components, content, colors, typography, and notable interaction cues.`,
213
+ };
214
+
215
+ let result = await callVision(prompts[artifact_type], image_path, { maxTokens: 2048 });
216
+
217
+ if (cleanup) {
218
+ result += `\n\n${await removeFile(image_path)}`;
219
+ }
220
+
221
+ return textResponse(result);
222
+ },
223
+ );
224
+
225
+ server.tool(
226
+ "extract_text_from_screenshot",
227
+ "OCR a screenshot optimized for a specific kind of content: code, terminal output, documents, or general text.",
228
+ {
229
+ image_path: z.string().describe("Absolute or relative path to the screenshot."),
230
+ kind: z
231
+ .enum(["code", "terminal", "document", "general"])
232
+ .default("general")
233
+ .describe("Type of content to optimize OCR for."),
234
+ cleanup: z.boolean().default(false).describe("Delete the image file after reading it."),
235
+ },
236
+ async ({ image_path, kind, cleanup }) => {
237
+ const prompts = {
238
+ code:
239
+ "Extract source code visible in this screenshot. Reproduce it verbatim in a single fenced code block, preserving indentation, language syntax, and line breaks. Add a language hint to the fence if you can infer it. Do not add commentary.",
240
+ terminal:
241
+ "Extract the terminal/command-line output visible in this screenshot verbatim. Preserve the original line breaks, prompts, paths, and ANSI-style structure. Do not add commentary.",
242
+ document:
243
+ "Extract the document text visible in this screenshot. Preserve headings, paragraphs, lists, and line breaks as faithfully as possible in markdown. Do not add commentary.",
244
+ general:
245
+ "Extract all visible text from this screenshot. Preserve line breaks and structure where possible. If there is no text, say that no text was found.",
246
+ };
247
+
248
+ let result = await callVision(prompts[kind], image_path, { maxTokens: 2048 });
249
+
250
+ if (cleanup) {
251
+ result += `\n\n${await removeFile(image_path)}`;
252
+ }
253
+
254
+ return textResponse(result);
255
+ },
256
+ );
257
+
258
+ server.tool(
259
+ "diagnose_error_screenshot",
260
+ "Analyze an error screenshot (stack trace, crash dialog, failed build, browser console) and propose actionable fixes.",
261
+ {
262
+ image_path: z.string().describe("Absolute or relative path to the error screenshot."),
263
+ context: z
264
+ .string()
265
+ .optional()
266
+ .describe("Optional context: what was being attempted, language, framework, or relevant logs."),
267
+ cleanup: z.boolean().default(false).describe("Delete the image file after reading it."),
268
+ },
269
+ async ({ image_path, context, cleanup }) => {
270
+ const prompt = `You are a senior engineer diagnosing an error from a screenshot.
271
+
272
+ Steps:
273
+ 1. Extract the exact error message, file paths, line numbers, and stack trace verbatim.
274
+ 2. Explain what the error means in plain language.
275
+ 3. Identify the most likely root cause.
276
+ 4. List concrete, ordered steps to fix it, including code or command examples where helpful.
277
+
278
+ ${context ? `Additional context from the user:\n${context}\n` : ""}
279
+ If the screenshot does not actually show an error, say so and describe what is visible instead.`;
280
+
281
+ let result = await callVision(prompt, image_path, { maxTokens: 2048 });
282
+
283
+ if (cleanup) {
284
+ result += `\n\n${await removeFile(image_path)}`;
285
+ }
286
+
287
+ return textResponse(result);
288
+ },
289
+ );
290
+
291
+ server.tool(
292
+ "understand_technical_diagram",
293
+ "Interpret a technical diagram (architecture, flow, UML, ER, sequence, system) and explain its structure.",
294
+ {
295
+ image_path: z.string().describe("Absolute or relative path to the diagram image."),
296
+ diagram_type: z
297
+ .enum(["architecture", "flow", "uml", "er", "sequence", "system", "auto"])
298
+ .default("auto")
299
+ .describe("Diagram type, or 'auto' to infer it."),
300
+ question: z
301
+ .string()
302
+ .optional()
303
+ .describe("Optional specific question about the diagram."),
304
+ cleanup: z.boolean().default(false).describe("Delete the image file after reading it."),
305
+ },
306
+ async ({ image_path, diagram_type, question, cleanup }) => {
307
+ const typeHint =
308
+ diagram_type === "auto"
309
+ ? "First identify the diagram type, then describe it."
310
+ : `Treat this as a ${diagram_type} diagram.`;
311
+
312
+ const questionHint = question ? `\n\nThen answer this question:\n${question}` : "";
313
+
314
+ const prompt = `Interpret this technical diagram.
315
+
316
+ ${typeHint}
317
+
318
+ Provide:
319
+ - A short summary of what the diagram represents.
320
+ - The key entities, components, or nodes and their roles.
321
+ - The relationships, connections, or flows between them (as a list).
322
+ - Any labels, annotations, or notable details.
323
+ - Inferred purpose or system being modeled.${questionHint}`;
324
+
325
+ let result = await callVision(prompt, image_path, { maxTokens: 2048 });
326
+
327
+ if (cleanup) {
328
+ result += `\n\n${await removeFile(image_path)}`;
329
+ }
330
+
331
+ return textResponse(result);
332
+ },
333
+ );
334
+
335
+ server.tool(
336
+ "analyze_data_visualization",
337
+ "Read a chart, graph, or dashboard from an image to surface insights, trends, and notable values.",
338
+ {
339
+ image_path: z.string().describe("Absolute or relative path to the chart/dashboard image."),
340
+ question: z
341
+ .string()
342
+ .optional()
343
+ .describe("Optional specific question about the data."),
344
+ cleanup: z.boolean().default(false).describe("Delete the image file after reading it."),
345
+ },
346
+ async ({ image_path, question, cleanup }) => {
347
+ const questionHint = question ? `\n\nThen answer this question:\n${question}` : "";
348
+
349
+ const prompt = `Analyze this data visualization carefully and accurately.
350
+
351
+ Provide:
352
+ - The type of visualization (bar, line, pie, scatter, dashboard, etc.) and its title if present.
353
+ - The axes, units, legend, and categories shown.
354
+ - The key values, ranges, highs, lows, and any outliers.
355
+ - The main trends, comparisons, and notable patterns.
356
+ - A concise set of insights a reader should take away.${questionHint}
357
+
358
+ If values are not readable, say so instead of guessing. Do not fabricate numbers.`;
359
+
360
+ let result = await callVision(prompt, image_path, { maxTokens: 2048 });
361
+
362
+ if (cleanup) {
363
+ result += `\n\n${await removeFile(image_path)}`;
364
+ }
365
+
366
+ return textResponse(result);
367
+ },
368
+ );
369
+
370
+ server.tool(
371
+ "ui_diff_check",
372
+ "Compare two UI screenshots and flag visual or implementation drift between them.",
373
+ {
374
+ image_path_a: z.string().describe("Path to the first (e.g. before / baseline) screenshot."),
375
+ image_path_b: z.string().describe("Path to the second (e.g. after / candidate) screenshot."),
376
+ label_a: z.string().default("A").describe("Label for the first screenshot."),
377
+ label_b: z.string().default("B").describe("Label for the second screenshot."),
378
+ focus: z
379
+ .string()
380
+ .optional()
381
+ .describe("Optional aspect to focus on, e.g. 'spacing', 'colors', 'layout', 'typography'."),
382
+ cleanup: z.boolean().default(false).describe("Delete both image files after reading them."),
383
+ },
384
+ async ({ image_path_a, image_path_b, label_a, label_b, focus, cleanup }) => {
385
+ const focusHint = focus ? `\nPay special attention to: ${focus}.` : "";
386
+
387
+ const prompt = `Compare these two UI screenshots. The first is "${label_a}", the second is "${label_b}".
388
+
389
+ Provide:
390
+ - A verdict: are they visually equivalent, or is there drift?
391
+ - A bulleted list of specific differences (layout, spacing, colors, typography, components, copy, states).
392
+ - For each difference, note which screenshot (${label_a} or ${label_b}) it applies to and the likely cause.
393
+ - A severity for each difference (blocking / minor / cosmetic).
394
+ - A short recommendation on what (if anything) should change.${focusHint}
395
+
396
+ If the two images are not comparable (different screens), say so and summarize each separately.`;
397
+
398
+ let result = await callVision(prompt, [image_path_a, image_path_b], { maxTokens: 2048 });
399
+
400
+ if (cleanup) {
401
+ result += `\n\n${await removeFile(image_path_a)}`;
402
+ result += `\n${await removeFile(image_path_b)}`;
403
+ }
404
+
405
+ return textResponse(result);
406
+ },
407
+ );
408
+
409
+ server.tool(
410
+ "image_analysis",
411
+ "General-purpose image understanding when a more specific tool does not fit.",
412
+ {
413
+ image_path: z.string().describe("Absolute or relative path to the image file."),
414
+ task: z
415
+ .string()
416
+ .describe("Freeform instruction or question describing what to do with the image."),
417
+ cleanup: z.boolean().default(false).describe("Delete the image file after reading it."),
418
+ },
419
+ async ({ image_path, task, cleanup }) => {
420
+ let result = await callVision(task, image_path, { maxTokens: 2048 });
421
+
422
+ if (cleanup) {
423
+ result += `\n\n${await removeFile(image_path)}`;
424
+ }
425
+
426
+ return textResponse(result);
427
+ },
428
+ );
429
+
171
430
  server.tool(
172
431
  "delete_file",
173
432
  "Delete a local file, usually a temporary screenshot that is no longer needed.",