z_ai_vision_mcp_server_clone 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,64 @@
1
+ # z_ai_vision_mcp_server_clone
2
+
3
+ OpenAI-compatible MCP server for running image analysis tools against your own vision model endpoint.
4
+
5
+ ## Tools
6
+
7
+ - `ui_to_artifact`
8
+ - `extract_text_from_screenshot`
9
+ - `diagnose_error_screenshot`
10
+ - `understand_technical_diagram`
11
+ - `analyze_data_visualization`
12
+ - `ui_diff_check`
13
+ - `analyze_image`
14
+
15
+ ## Configuration
16
+
17
+ Set either `VISION_ENDPOINT` or `VISION_BASE_URL`.
18
+
19
+ | Variable | Required | Description |
20
+ | --- | --- | --- |
21
+ | `VISION_ENDPOINT` | Yes, unless `VISION_BASE_URL` is set | Full chat completions endpoint. |
22
+ | `VISION_BASE_URL` | Yes, unless `VISION_ENDPOINT` is set | Base URL; `/chat/completions` is appended. |
23
+ | `VISION_MODEL` | Yes | Vision model name sent in the request body. |
24
+ | `VISION_API_KEY` | No | Bearer token. Falls back to `OPENAI_API_KEY` when unset. Omit both for local endpoints that do not require auth. |
25
+ | `VISION_PROVIDER` | No | Label for your provider. Defaults to `custom`. |
26
+ | `VISION_MAX_IMAGE_MB` | No | Maximum size of a local image file, in megabytes. Must be greater than 0. Defaults to `5`. |
27
+ | `VISION_TIMEOUT_MS` | No | Request timeout in milliseconds. Defaults to `300000` (5 minutes). |
28
+ | `VISION_TEMPERATURE` | No | Optional model temperature. |
29
+ | `VISION_TOP_P` | No | Optional model top_p. |
30
+ | `VISION_MAX_TOKENS` | No | Optional max_tokens. |
31
+
32
+ You can also place these values in a local `.env` file in the working directory where the server starts. Real environment variables override `.env` values.
33
+
34
+ ## Run
35
+
36
+ ```bash
37
+ npm install
38
+ npm run build
39
+ VISION_ENDPOINT=http://localhost:11434/v1/chat/completions VISION_MODEL=llava npm start
40
+ ```
41
+
42
+ Or with `.env`:
43
+
44
+ ```bash
45
+ npm start
46
+ ```
47
+
48
+ ## MCP Client Example
49
+
50
+ ```json
51
+ {
52
+ "mcpServers": {
53
+ "z-ai-vision-clone": {
54
+ "type": "stdio",
55
+ "command": "node",
56
+ "args": ["/absolute/path/to/z_ai_vision_mcp_server_clone/build/src/server.js"],
57
+ "env": {
58
+ "VISION_ENDPOINT": "http://localhost:11434/v1/chat/completions",
59
+ "VISION_MODEL": "llava"
60
+ }
61
+ }
62
+ }
63
+ }
64
+ ```
@@ -0,0 +1,391 @@
1
+ #!/usr/bin/env node
2
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { readFileSync, realpathSync } from "node:fs";
import { readFile, stat } from "node:fs/promises";
import { extname } from "node:path";
import { pathToFileURL } from "node:url";
import * as z from "zod/v4";
9
/** Names of the image tools, in the order they are listed for consumers of this module. */
export const IMAGE_TOOL_NAMES = [
    "ui_to_artifact",
    "extract_text_from_screenshot",
    "diagnose_error_screenshot",
    "understand_technical_diagram",
    "analyze_data_visualization",
    "ui_diff_check",
    "analyze_image",
];
// Shared input schemas: both the image source and the prompt must be non-empty strings.
const imageSourceSchema = z.string().min(1);
const promptSchema = z.string().min(1);
// Lower-cased file extension -> MIME type used when a local image is embedded as a data URL.
// Extensions missing from this table are rejected by imageContentFromSource.
const mimeTypes = {
    ".gif": "image/gif",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
    ".webp": "image/webp",
};
// System prompts, one per tool. `uiArtifact` is keyed by the tool's `output_type` value.
const prompts = {
    analyzeImage: "Analyze the provided image according to the user request. Be precise, practical, and mention uncertainty when details are not visible.",
    textExtraction: "Extract visible text from the screenshot. Preserve code, logs, whitespace, line breaks, and ordering as accurately as possible.",
    errorDiagnosis: "Diagnose the error shown in the screenshot. Identify the likely cause, the relevant evidence, and concrete next steps.",
    diagram: "Explain the technical diagram. Identify components, relationships, flow, assumptions, and any unclear parts.",
    dataVisualization: "Analyze the chart or dashboard. Extract the key metrics, trends, comparisons, anomalies, and business implications.",
    uiDiff: "Compare the reference UI and actual UI. Report visible differences, severity, likely causes, and fixes.",
    uiArtifact: {
        code: "Turn the UI screenshot into implementation guidance or frontend code. Keep the result faithful to the visible layout and states.",
        prompt: "Write a clear prompt that another AI system can use to recreate the UI screenshot.",
        spec: "Extract a design specification from the UI screenshot, including layout, spacing, typography, colors, and component behavior.",
        description: "Describe the UI screenshot in natural language with enough detail to understand its structure and purpose.",
    },
};
41
/**
 * Build the vision client configuration from environment-style key/value pairs.
 *
 * @param {Record<string, string | undefined>} [env] - Settings source. Defaults to the
 *   merged `.env` file and process environment returned by `loadRuntimeEnvironment()`.
 * @returns {{provider: string, endpoint: string, model: string, apiKey: (string|undefined),
 *   maxImageBytes: number, timeoutMs: number, temperature: (number|undefined),
 *   topP: (number|undefined), maxTokens: (number|undefined)}} Resolved configuration.
 * @throws {Error} When no endpoint or model is configured, or a numeric setting is
 *   not a finite number, or the image size limit / timeout is not greater than 0.
 */
export function loadVisionConfig(env = loadRuntimeEnvironment()) {
    // An explicit endpoint wins; otherwise derive it from the base URL.
    const endpoint = env.VISION_ENDPOINT || endpointFromBaseUrl(env.VISION_BASE_URL);
    if (!endpoint) {
        throw new Error("VISION_ENDPOINT or VISION_BASE_URL is required");
    }
    const model = env.VISION_MODEL;
    if (!model) {
        throw new Error("VISION_MODEL is required");
    }
    const maxImageBytes = readMegabytes(env.VISION_MAX_IMAGE_MB, 5) * 1024 * 1024;
    const timeoutMs = readNumber(env.VISION_TIMEOUT_MS, 300_000);
    // A zero or negative timeout would abort every request as soon as it starts,
    // so reject it here with a clear message instead of failing later as a "timeout".
    if (timeoutMs <= 0) {
        throw new Error("VISION_TIMEOUT_MS must be greater than 0");
    }
    return {
        provider: env.VISION_PROVIDER || "custom",
        endpoint,
        model,
        // NOTE(review): falls back to OPENAI_API_KEY, so that key is sent as the bearer
        // token to whichever endpoint is configured — confirm this is intended.
        apiKey: env.VISION_API_KEY || env.OPENAI_API_KEY,
        maxImageBytes,
        timeoutMs,
        temperature: readOptionalNumber(env.VISION_TEMPERATURE),
        topP: readOptionalNumber(env.VISION_TOP_P),
        maxTokens: readOptionalNumber(env.VISION_MAX_TOKENS),
    };
}
62
/**
 * Merge `.env` file values with the real process environment.
 *
 * @returns {Record<string, string | undefined>} Combined settings; process
 *   environment entries take precedence over `.env` entries with the same key.
 */
function loadRuntimeEnvironment() {
    const fileValues = readDotEnvFile();
    // Spread order matters: later keys win, so process.env overrides the file.
    const merged = { ...fileValues, ...process.env };
    return merged;
}
65
/**
 * Read `KEY=value` pairs from a `.env` file in the current working directory.
 *
 * Blank lines, `#` comment lines, and lines without a key before `=` are skipped.
 * A leading `export ` on the key is ignored, and one pair of matching surrounding
 * quotes (single or double) is removed from the value.
 *
 * @returns {Record<string, string>} Parsed pairs, or an empty object when the file
 *   does not exist.
 * @throws {Error} Any read failure other than the file being absent (ENOENT).
 */
function readDotEnvFile() {
    let text;
    try {
        text = readFileSync(".env", "utf8");
    }
    catch (error) {
        const code = error instanceof Error && "code" in error ? error.code : undefined;
        // A missing file is the normal "no .env configured" case, not an error.
        if (code === "ENOENT") {
            return {};
        }
        throw error;
    }
    const env = {};
    for (const rawLine of text.split(/\r?\n/)) {
        const line = rawLine.trim();
        if (!line || line.startsWith("#")) {
            continue;
        }
        const index = line.indexOf("=");
        // index < 1 covers both "no '=' at all" and "line starts with '='".
        if (index < 1) {
            continue;
        }
        // Accept shell-style `export KEY=value` lines, which are common in .env files.
        const key = line.slice(0, index).trim().replace(/^export\s+/, "");
        if (!key) {
            continue;
        }
        let value = line.slice(index + 1).trim();
        const quote = value[0];
        // Require at least two characters so a lone quote character is kept as-is
        // instead of being "unquoted" into an empty string.
        if (value.length >= 2 && (quote === "\"" || quote === "'") && value.endsWith(quote)) {
            value = value.slice(1, -1);
        }
        env[key] = value;
    }
    return env;
}
97
/**
 * Derive the chat completions endpoint from a base URL.
 *
 * @param {string | undefined} baseUrl - Base URL, with or without trailing slashes.
 * @returns {string | undefined} `<baseUrl>/chat/completions`, or `undefined` when
 *   no base URL was given.
 */
function endpointFromBaseUrl(baseUrl) {
    if (baseUrl) {
        // Drop trailing slashes so the joined path never contains "//".
        const withoutTrailingSlashes = baseUrl.replace(/\/+$/, "");
        return `${withoutTrailingSlashes}/chat/completions`;
    }
    return undefined;
}
103
/**
 * Parse the image size limit, expressed in megabytes.
 *
 * @param {string | undefined} value - Raw setting value.
 * @param {number} fallback - Value used when the setting is absent.
 * @returns {number} The parsed limit in megabytes.
 * @throws {Error} When the value is not numeric or is not greater than 0.
 */
function readMegabytes(value, fallback) {
    const megabytes = readNumber(value, fallback);
    const isTooSmall = megabytes <= 0;
    if (isTooSmall) {
        throw new Error("VISION_MAX_IMAGE_MB must be greater than 0");
    }
    return megabytes;
}
110
/**
 * Parse a numeric setting.
 *
 * @param {string | undefined} value - Raw setting value.
 * @param {number} fallback - Returned when the value is absent, empty, or blank.
 * @returns {number} The parsed finite number, or `fallback`.
 * @throws {Error} When the value is present but is not a finite number.
 */
function readNumber(value, fallback) {
    // Number("   ") is 0, so a whitespace-only value must be treated as "not set"
    // rather than silently parsed as zero.
    if (!value || (typeof value === "string" && value.trim() === "")) {
        return fallback;
    }
    const parsed = Number(value);
    if (!Number.isFinite(parsed)) {
        throw new Error(`Invalid numeric environment value: ${value}`);
    }
    return parsed;
}
120
/**
 * Parse a numeric setting that has no default.
 *
 * @param {string | undefined} value - Raw setting value.
 * @returns {number | undefined} The parsed number, or `undefined` when the setting
 *   is absent or empty.
 * @throws {Error} When the value is present but not a finite number.
 */
function readOptionalNumber(value) {
    if (!value) {
        return undefined;
    }
    // The fallback is never used here because the value is known to be non-empty.
    return readNumber(value, 0);
}
123
/**
 * Tell whether an image source is an http(s) URL rather than a local path.
 *
 * @param {string} source - Image source supplied by the tool caller.
 * @returns {boolean} `true` only for parseable URLs whose protocol is http or https.
 */
function isRemoteUrl(source) {
    let protocol;
    try {
        ({ protocol } = new URL(source));
    }
    catch {
        // Anything the URL parser rejects is treated as a local path.
        return false;
    }
    return ["http:", "https:"].includes(protocol);
}
132
/**
 * Turn an image source into an `image_url` message part.
 *
 * http(s) URLs are passed through unchanged. Any other source is treated as a local
 * file path: it is size-checked, its type is taken from the file extension, and its
 * bytes are embedded as a base64 data URL.
 *
 * @param {string} source - http(s) URL or local file path.
 * @param {number} [maxBytes] - Maximum allowed local file size in bytes (default 5 MiB).
 * @returns {Promise<{type: "image_url", image_url: {url: string}}>} The message part.
 * @throws {Error} When the path is not a regular file, exceeds `maxBytes`, has an
 *   unsupported extension, or cannot be read.
 */
export async function imageContentFromSource(source, maxBytes = 5 * 1024 * 1024) {
    const asImagePart = (url) => ({ type: "image_url", image_url: { url } });
    if (isRemoteUrl(source)) {
        return asImagePart(source);
    }
    const info = await stat(source);
    if (!info.isFile()) {
        throw new Error(`Image source is not a file: ${source}`);
    }
    if (info.size > maxBytes) {
        const sizeInMegabytes = (info.size / 1024 / 1024).toFixed(2);
        throw new Error(`Image file is too large: ${sizeInMegabytes}MB`);
    }
    const extension = extname(source).toLowerCase();
    const mimeType = mimeTypes[extension];
    if (!mimeType) {
        throw new Error(`Unsupported image format: ${extension || "(none)"}`);
    }
    const base64 = (await readFile(source)).toString("base64");
    return asImagePart(`data:${mimeType};base64,${base64}`);
}
154
/**
 * Assemble the chat messages for one vision request.
 *
 * @param {string} systemPrompt - Content of the system message.
 * @param {string} prompt - User text, placed after all image parts.
 * @param {Iterable<object>} images - Image parts placed first in the user message.
 * @returns {Array<{role: string, content: (string|object[])}>} System message followed
 *   by a single user message.
 */
export function buildVisionMessages(systemPrompt, prompt, images) {
    const userParts = [...images];
    userParts.push({ type: "text", text: prompt });
    const systemMessage = { role: "system", content: systemPrompt };
    const userMessage = { role: "user", content: userParts };
    return [systemMessage, userMessage];
}
160
/**
 * Load every image source and send one vision request covering all of them.
 *
 * @param {object} config - Configuration as returned by `loadVisionConfig`.
 * @param {string[]} sources - Image URLs or local paths, in the order to send them.
 * @param {string} systemPrompt - System message content.
 * @param {string} prompt - User text.
 * @returns {Promise<string>} Text returned by the vision endpoint.
 */
async function analyzeImageSources(config, sources, systemPrompt, prompt) {
    const imageParts = [];
    // Sources are loaded one at a time, so a failure stops at the first bad source.
    for (const source of sources) {
        const part = await imageContentFromSource(source, config.maxImageBytes);
        imageParts.push(part);
    }
    const messages = buildVisionMessages(systemPrompt, prompt, imageParts);
    return callVisionEndpoint(config, messages);
}
167
/**
 * POST a chat completion request to the configured endpoint and return its text.
 *
 * @param {object} config - Uses `endpoint`, `model`, `timeoutMs`, and the optional
 *   `apiKey`, `temperature`, `topP`, and `maxTokens` fields.
 * @param {object[]} messages - Chat messages to send.
 * @returns {Promise<string>} Text extracted from the response by `extractVisionText`.
 * @throws {Error} On a non-2xx status (`HTTP <status>: <body>`), on timeout (the
 *   original abort error is attached as `cause`), or on any fetch/parse failure.
 */
async function callVisionEndpoint(config, messages) {
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), config.timeoutMs);
    const headers = { "Content-Type": "application/json" };
    // The Authorization header is sent only when a key is configured.
    if (config.apiKey) {
        headers.Authorization = `Bearer ${config.apiKey}`;
    }
    const body = {
        model: config.model,
        messages,
        stream: false,
    };
    // Sampling options are sent only when explicitly configured (0 is a valid value).
    if (config.temperature !== undefined) {
        body.temperature = config.temperature;
    }
    if (config.topP !== undefined) {
        body.top_p = config.topP;
    }
    if (config.maxTokens !== undefined) {
        body.max_tokens = config.maxTokens;
    }
    try {
        const response = await fetch(config.endpoint, {
            method: "POST",
            headers,
            body: JSON.stringify(body),
            signal: controller.signal,
        });
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}: ${await response.text()}`);
        }
        return extractVisionText(await response.json());
    }
    catch (error) {
        if (error instanceof Error && error.name === "AbortError") {
            // Keep the underlying abort error reachable for debugging.
            throw new Error(`Vision endpoint timed out after ${config.timeoutMs}ms`, { cause: error });
        }
        throw error;
    }
    finally {
        // Always clear the timer so it cannot keep the process alive or fire late.
        clearTimeout(timeout);
    }
}
210
/**
 * Pull the assistant text out of a chat completions response payload.
 *
 * Only the first choice is inspected. Its message content may be a non-blank string
 * (returned as-is) or an array of parts, whose string `text` fields are joined with
 * newlines and trimmed.
 *
 * @param {unknown} payload - Parsed JSON response body.
 * @returns {string} The extracted text.
 * @throws {Error} When there are no choices or no usable message content.
 */
function extractVisionText(payload) {
    const choices = asRecord(payload)?.choices;
    if (!Array.isArray(choices) || choices.length === 0) {
        throw new Error("Vision endpoint response did not include choices");
    }
    const message = asRecord(asRecord(choices[0])?.message);
    const content = message?.content;
    if (typeof content === "string" && content.trim()) {
        return content;
    }
    if (Array.isArray(content)) {
        const pieces = [];
        for (const part of content) {
            const candidate = asRecord(part)?.text;
            // Parts without a string `text` field are ignored.
            if (typeof candidate === "string") {
                pieces.push(candidate);
            }
        }
        const joined = pieces.join("\n").trim();
        if (joined) {
            return joined;
        }
    }
    throw new Error("Vision endpoint response did not include message content");
}
234
/**
 * Narrow an unknown value to an object.
 *
 * @param {unknown} value - Any value.
 * @returns {object | undefined} The same value when it is a non-null object
 *   (arrays included); otherwise `undefined`.
 */
function asRecord(value) {
    if (value === null || typeof value !== "object") {
        return undefined;
    }
    return value;
}
239
/**
 * Wrap text as a successful tool result.
 *
 * @param {string} text - Text to return to the client.
 * @returns {{content: Array<{type: "text", text: string}>}} Result with one text part.
 */
function toolResponse(text) {
    const textPart = { type: "text", text };
    return { content: [textPart] };
}
242
/**
 * Wrap a thrown value as a failed tool result.
 *
 * @param {unknown} error - The caught value; non-Error values are stringified.
 * @returns {{content: Array<{type: "text", text: string}>, isError: true}} Result whose
 *   text is the message prefixed with `Error: `.
 */
function toolError(error) {
    let message;
    if (error instanceof Error) {
        message = error.message;
    }
    else {
        message = String(error);
    }
    const textPart = { type: "text", text: `Error: ${message}` };
    return { content: [textPart], isError: true };
}
249
/**
 * Append an optional hint to a prompt inside an XML-style tag.
 *
 * @param {string} prompt - Base prompt text.
 * @param {string} tag - Tag name wrapped around the hint.
 * @param {string | undefined} value - Hint text; ignored when absent or blank.
 * @returns {string} The prompt, followed by a blank line and `<tag>value</tag>` when a
 *   hint is present. The hint is inserted untrimmed.
 */
function withOptionalHint(prompt, tag, value) {
    if (!value?.trim()) {
        return prompt;
    }
    return `${prompt}\n\n<${tag}>${value}</${tag}>`;
}
252
/**
 * Create the MCP server and register the seven image tools.
 *
 * @param {() => object} [configLoader] - Returns the vision configuration. It is called
 *   on every tool invocation, so configuration errors surface as tool errors rather
 *   than at server creation.
 * @returns {McpServer} The configured, not yet connected, server.
 */
export function createServer(configLoader = loadVisionConfig) {
    const server = new McpServer({
        name: "custom-vision-mcp-server",
        version: "0.1.0",
    });
    const analyze = async (sources, systemPrompt, prompt) => analyzeImageSources(configLoader(), sources, systemPrompt, prompt);
    // Registers a tool whose handler returns text. Every failure, including errors
    // thrown while building the prompt, is reported as an `isError` tool result
    // instead of propagating out of the handler.
    const registerImageTool = (name, definition, run) => {
        server.registerTool(name, definition, async (args) => {
            try {
                return toolResponse(await run(args));
            }
            catch (error) {
                return toolError(error);
            }
        });
    };
    registerImageTool("ui_to_artifact", {
        title: "UI to Artifact",
        description: "Convert a UI screenshot into code guidance, a recreation prompt, a design spec, or a description.",
        inputSchema: {
            image_source: imageSourceSchema,
            output_type: z.enum(["code", "prompt", "spec", "description"]),
            prompt: promptSchema,
        },
    }, ({ image_source, output_type, prompt }) => analyze([image_source], prompts.uiArtifact[output_type], prompt));
    registerImageTool("extract_text_from_screenshot", {
        title: "Extract Text from Screenshot",
        description: "Extract visible text from a screenshot, including code, logs, terminal output, and documents.",
        inputSchema: {
            image_source: imageSourceSchema,
            prompt: promptSchema,
            programming_language: z.string().optional(),
        },
    }, ({ image_source, prompt, programming_language }) => {
        const enhancedPrompt = withOptionalHint(prompt, "programming_language", programming_language);
        return analyze([image_source], prompts.textExtraction, enhancedPrompt);
    });
    registerImageTool("diagnose_error_screenshot", {
        title: "Diagnose Error Screenshot",
        description: "Analyze an error screenshot and suggest likely causes and fixes.",
        inputSchema: {
            image_source: imageSourceSchema,
            prompt: promptSchema,
            context: z.string().optional(),
        },
    }, ({ image_source, prompt, context }) => {
        const enhancedPrompt = withOptionalHint(prompt, "error_context", context);
        return analyze([image_source], prompts.errorDiagnosis, enhancedPrompt);
    });
    registerImageTool("understand_technical_diagram", {
        title: "Understand Technical Diagram",
        description: "Explain architecture diagrams, flowcharts, UML, ER diagrams, and related technical drawings.",
        inputSchema: {
            image_source: imageSourceSchema,
            prompt: promptSchema,
            diagram_type: z.string().optional(),
        },
    }, ({ image_source, prompt, diagram_type }) => {
        const enhancedPrompt = withOptionalHint(prompt, "diagram_type", diagram_type);
        return analyze([image_source], prompts.diagram, enhancedPrompt);
    });
    registerImageTool("analyze_data_visualization", {
        title: "Analyze Data Visualization",
        description: "Analyze charts, graphs, and dashboards for metrics, trends, anomalies, and implications.",
        inputSchema: {
            image_source: imageSourceSchema,
            prompt: promptSchema,
            analysis_focus: z.string().optional(),
        },
    }, ({ image_source, prompt, analysis_focus }) => {
        const enhancedPrompt = withOptionalHint(prompt, "analysis_focus", analysis_focus);
        return analyze([image_source], prompts.dataVisualization, enhancedPrompt);
    });
    registerImageTool("ui_diff_check", {
        title: "UI Diff Check",
        description: "Compare a reference UI screenshot with an actual implementation screenshot.",
        inputSchema: {
            expected_image_source: imageSourceSchema,
            actual_image_source: imageSourceSchema,
            prompt: promptSchema,
        },
    }, ({ expected_image_source, actual_image_source, prompt }) => {
        // Image order is significant: the prompt tells the model which one is the reference.
        const enhancedPrompt = `The first image is the expected reference UI. The second image is the actual UI.\n\n${prompt}`;
        return analyze([expected_image_source, actual_image_source], prompts.uiDiff, enhancedPrompt);
    });
    registerImageTool("analyze_image", {
        title: "Analyze Image",
        description: "General-purpose image analysis for cases not covered by the specialized image tools.",
        inputSchema: {
            image_source: imageSourceSchema,
            prompt: promptSchema,
        },
    }, ({ image_source, prompt }) => analyze([image_source], prompts.analyzeImage, prompt));
    return server;
}
376
/**
 * Route `console.log`, `console.info`, and `console.debug` through `console.error`.
 *
 * `main` calls this before connecting the stdio transport.
 * NOTE(review): presumably this keeps stray log output off stdout, which the stdio
 * transport uses — confirm against the transport's documentation.
 */
export function redirectConsoleToStderr() {
    for (const method of ["log", "info", "debug"]) {
        console[method] = console.error.bind(console);
    }
}
381
/**
 * Start the server on the stdio transport.
 *
 * Console output is redirected first, then the server is created and connected.
 *
 * @returns {Promise<void>} Resolves once the transport is connected.
 */
async function main() {
    redirectConsoleToStderr();
    const server = createServer();
    const transport = new StdioServerTransport();
    await server.connect(transport);
}
386
/**
 * Tell whether this module is the script Node was asked to run.
 *
 * npm installs `bin` entries as symlinks on POSIX systems, so `process.argv[1]` can be
 * the symlink path while `import.meta.url` is the resolved file. The path is therefore
 * resolved with `realpathSync` before comparing; without that the server would exit
 * silently when started through the installed binary.
 *
 * @returns {boolean} `true` when the resolved entry script is this module.
 */
function isEntryPoint() {
    const entryPath = process.argv[1];
    if (!entryPath) {
        return false;
    }
    let resolvedPath;
    try {
        resolvedPath = realpathSync(entryPath);
    }
    catch {
        // Fall back to the raw path if it cannot be resolved.
        resolvedPath = entryPath;
    }
    return import.meta.url === pathToFileURL(resolvedPath).href;
}
if (isEntryPoint()) {
    main().catch((error) => {
        console.error(error instanceof Error ? error.message : String(error));
        process.exit(1);
    });
}
@@ -0,0 +1,68 @@
1
+ import assert from "node:assert/strict";
2
+ import { chdir, cwd } from "node:process";
3
+ import { mkdtemp, rm, writeFile } from "node:fs/promises";
4
+ import { tmpdir } from "node:os";
5
+ import { join } from "node:path";
6
+ import { IMAGE_TOOL_NAMES, buildVisionMessages, imageContentFromSource, loadVisionConfig, } from "../src/server.js";
7
// Smoke test for the public helpers of the server module. It runs as a plain script:
// any failed assertion throws and makes the process exit non-zero.
const expectedTools = [
    "ui_to_artifact",
    "extract_text_from_screenshot",
    "diagnose_error_screenshot",
    "understand_technical_diagram",
    "analyze_data_visualization",
    "ui_diff_check",
    "analyze_image",
];
assert.deepEqual(IMAGE_TOOL_NAMES, expectedTools);
// Configuration passed explicitly: nothing is read from disk or process.env.
const config = loadVisionConfig({
    VISION_PROVIDER: "local",
    VISION_ENDPOINT: "http://localhost:11434/v1/chat/completions",
    VISION_MODEL: "llava",
    VISION_API_KEY: "test-key",
});
assert.equal(config.provider, "local");
assert.equal(config.endpoint, "http://localhost:11434/v1/chat/completions");
assert.equal(config.model, "llava");
assert.equal(config.apiKey, "test-key");
const dir = await mkdtemp(join(tmpdir(), "custom-vision-mcp-"));
try {
    const originalCwd = cwd();
    const envKeys = ["VISION_PROVIDER", "VISION_ENDPOINT", "VISION_BASE_URL", "VISION_MODEL", "VISION_API_KEY"];
    const originalEnv = Object.fromEntries(envKeys.map((key) => [key, process.env[key]]));
    try {
        // Clear real variables so the values can only come from the .env file.
        for (const key of envKeys) {
            delete process.env[key];
        }
        chdir(dir);
        await writeFile(".env", "VISION_PROVIDER=file-provider\nVISION_ENDPOINT=http://example.test/v1/chat/completions\nVISION_MODEL=file-model\nVISION_API_KEY=file-key\n");
        const fileConfig = loadVisionConfig();
        assert.equal(fileConfig.provider, "file-provider");
        assert.equal(fileConfig.endpoint, "http://example.test/v1/chat/completions");
        assert.equal(fileConfig.model, "file-model");
        assert.equal(fileConfig.apiKey, "file-key");
    }
    finally {
        // Restore cwd and environment even when an assertion fails, so the process
        // is never left inside the temp directory that is deleted below.
        chdir(originalCwd);
        for (const [key, value] of Object.entries(originalEnv)) {
            if (value === undefined) {
                delete process.env[key];
            }
            else {
                process.env[key] = value;
            }
        }
    }
    const imagePath = join(dir, "sample.png");
    // Only the extension is used to pick the MIME type, so a 4-byte stub is enough.
    await writeFile(imagePath, Buffer.from([0x89, 0x50, 0x4e, 0x47]));
    const localImage = await imageContentFromSource(imagePath);
    assert.equal(localImage.type, "image_url");
    assert.match(localImage.image_url.url, /^data:image\/png;base64,/);
    const remoteImage = await imageContentFromSource("https://example.com/image.webp");
    assert.deepEqual(remoteImage, {
        type: "image_url",
        image_url: { url: "https://example.com/image.webp" },
    });
    const messages = buildVisionMessages("System", "Describe it", [localImage]);
    assert.equal(messages[0]?.role, "system");
    assert.equal(messages[1]?.role, "user");
    assert.equal(messages[1]?.content.at(-1)?.type, "text");
}
finally {
    await rm(dir, { recursive: true, force: true });
}
package/package.json ADDED
@@ -0,0 +1,39 @@
1
+ {
2
+ "name": "z_ai_vision_mcp_server_clone",
3
+ "version": "0.1.0",
4
+ "description": "OpenAI-compatible MCP server for custom vision model endpoints",
5
+ "main": "build/src/server.js",
6
+ "type": "module",
7
+ "bin": {
8
+ "z_ai_vision_mcp_server_clone": "build/src/server.js"
9
+ },
10
+ "repository": {
11
+ "type": "git",
12
+ "url": "git+https://github.com/1orzero/z_ai_vision_mcp_server_clone.git"
13
+ },
14
+ "bugs": {
15
+ "url": "https://github.com/1orzero/z_ai_vision_mcp_server_clone/issues"
16
+ },
17
+ "homepage": "https://github.com/1orzero/z_ai_vision_mcp_server_clone#readme",
18
+ "scripts": {
19
+ "build": "tsc",
20
+ "prepack": "npm run build",
21
+ "start": "node build/src/server.js",
22
+ "test": "npm run build && node build/test/vision.test.js"
23
+ },
24
+ "files": [
25
+ "build",
26
+ "README.md"
27
+ ],
28
+ "engines": {
29
+ "node": ">=20.0.0"
30
+ },
31
+ "dependencies": {
32
+ "@modelcontextprotocol/sdk": "1.26.0",
33
+ "zod": "^4.0.0"
34
+ },
35
+ "devDependencies": {
36
+ "@types/node": "^24.0.0",
37
+ "typescript": "^5.9.0"
38
+ }
39
+ }