@rlabs-inc/gemini-mcp 0.7.2 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/server.js +2 -0
- package/dist/tools/image-analyze.d.ts +15 -0
- package/dist/tools/image-analyze.js +363 -0
- package/package.json +1 -1
package/dist/server.js
CHANGED
@@ -25,6 +25,7 @@ import { registerCacheTool } from './tools/cache.js';
 import { registerSpeechTool } from './tools/speech.js';
 import { registerTokenCountTool } from './tools/token-count.js';
 import { registerDeepResearchTool } from './tools/deep-research.js';
+import { registerImageAnalyzeTool } from './tools/image-analyze.js';
 // Import Gemini client and logger
 import { initGeminiClient } from './gemini-client.js';
 import { setupLogger, logger } from './utils/logger.js';
@@ -134,6 +135,7 @@ For CLI mode, run: gemini --help
 registerSpeechTool(server);
 registerTokenCountTool(server);
 registerDeepResearchTool(server);
+registerImageAnalyzeTool(server);
 // Start server with stdio transport
 const transport = new StdioServerTransport();
 // Set up error handling for transport
package/dist/tools/image-analyze.d.ts
ADDED

@@ -0,0 +1,15 @@
+/**
+ * Image Analysis Tool - Analyze images with object detection and bounding boxes
+ *
+ * This tool uses Gemini's vision capabilities to analyze images and detect objects
+ * with bounding box coordinates. Returns both normalized box_2d format and pixel coordinates.
+ *
+ * Bounding Box Format:
+ * - box_2d: [y_min, x_min, y_max, x_max] in 0-1000 normalized coordinates
+ * - bbox_pixels: {x, y, width, height} in pixel coordinates (when dimensions available)
+ */
+import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
+/**
+ * Register image analysis tools with the MCP server
+ */
+export declare function registerImageAnalyzeTool(server: McpServer): void;
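Note on the coordinate formats documented above: box_2d values are normalized to a 0-1000 grid regardless of image size, and bbox_pixels is derived from them using the image's actual pixel dimensions. A minimal TypeScript sketch of that conversion, using a hypothetical box and 1920x1080 dimensions chosen only to illustrate the arithmetic:

// Convert a normalized [y_min, x_min, y_max, x_max] box (0-1000 scale)
// to pixel coordinates, mirroring the conversion the tool performs.
type Box2d = [number, number, number, number];

function toPixels(box: Box2d, width: number, height: number) {
    const [yMin, xMin, yMax, xMax] = box;
    return {
        x: Math.round((xMin / 1000) * width),
        y: Math.round((yMin / 1000) * height),
        width: Math.round(((xMax - xMin) / 1000) * width),
        height: Math.round(((yMax - yMin) / 1000) * height),
    };
}

// Hypothetical example: a centered box in a 1920x1080 image.
// [250, 250, 750, 750] -> { x: 480, y: 270, width: 960, height: 540 }
console.log(toPixels([250, 250, 750, 750], 1920, 1080));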
package/dist/tools/image-analyze.js
ADDED

@@ -0,0 +1,363 @@
+/**
+ * Image Analysis Tool - Analyze images with object detection and bounding boxes
+ *
+ * This tool uses Gemini's vision capabilities to analyze images and detect objects
+ * with bounding box coordinates. Returns both normalized box_2d format and pixel coordinates.
+ *
+ * Bounding Box Format:
+ * - box_2d: [y_min, x_min, y_max, x_max] in 0-1000 normalized coordinates
+ * - bbox_pixels: {x, y, width, height} in pixel coordinates (when dimensions available)
+ */
+import { z } from "zod";
+import { GoogleGenAI } from "@google/genai";
+import { logger } from "../utils/logger.js";
+import * as fs from "fs";
+import * as path from "path";
+/**
+ * Extract image dimensions from PNG or JPEG file
+ */
+function extractImageDimensions(filePath) {
+    try {
+        const buffer = fs.readFileSync(filePath);
+        const ext = path.extname(filePath).toLowerCase();
+        if (ext === ".png") {
+            // PNG format: width and height are at bytes 16-23 (big-endian)
+            if (buffer.length < 24)
+                return null;
+            if (buffer.toString("ascii", 1, 4) !== "PNG")
+                return null;
+            const width = buffer.readUInt32BE(16);
+            const height = buffer.readUInt32BE(20);
+            return { width, height };
+        }
+        else if (ext === ".jpg" || ext === ".jpeg") {
+            // JPEG format: scan for SOF0 or SOF2 markers
+            let offset = 2; // Skip initial 0xFFD8
+            while (offset < buffer.length - 8) {
+                if (buffer[offset] !== 0xff)
+                    break;
+                const marker = buffer[offset + 1];
+                const segmentLength = buffer.readUInt16BE(offset + 2);
+                // SOF0 (0xC0) or SOF2 (0xC2) markers contain dimensions
+                if (marker === 0xc0 || marker === 0xc2) {
+                    const height = buffer.readUInt16BE(offset + 5);
+                    const width = buffer.readUInt16BE(offset + 7);
+                    return { width, height };
+                }
+                offset += 2 + segmentLength;
+            }
+        }
+        return null;
+    }
+    catch (error) {
+        logger.debug(`Failed to extract dimensions: ${error}`);
+        return null;
+    }
+}
+/**
+ * Convert box_2d normalized coordinates to pixel coordinates
+ */
+function convertToPixelCoords(box2d, width, height) {
+    const [yMin, xMin, yMax, xMax] = box2d;
+    return {
+        x: Math.round((xMin / 1000) * width),
+        y: Math.round((yMin / 1000) * height),
+        width: Math.round(((xMax - xMin) / 1000) * width),
+        height: Math.round(((yMax - yMin) / 1000) * height),
+    };
+}
+/**
+ * Get MIME type from file extension
+ */
+function getImageMimeType(filePath) {
+    const ext = path.extname(filePath).toLowerCase();
+    const mimeTypes = {
+        ".jpg": "image/jpeg",
+        ".jpeg": "image/jpeg",
+        ".png": "image/png",
+        ".webp": "image/webp",
+        ".heic": "image/heic",
+        ".heif": "image/heif",
+        ".gif": "image/gif",
+    };
+    return mimeTypes[ext] || "image/jpeg";
+}
+/**
+ * Register image analysis tools with the MCP server
+ */
+export function registerImageAnalyzeTool(server) {
+    server.tool("gemini-analyze-image", {
+        imagePath: z
+            .string()
+            .describe("Path to image file. Supports JPEG, PNG, WebP, HEIC, HEIF, GIF"),
+        query: z
+            .string()
+            .optional()
+            .describe('Specific question about the image (e.g., "What objects are in this image?", "Count the people"). Default: "Analyze this image in detail."'),
+        detectObjects: z
+            .boolean()
+            .default(true)
+            .describe("Enable object detection with bounding boxes (returns box_2d coordinates). Default: true"),
+        model: z
+            .enum(["pro", "flash"])
+            .default("flash")
+            .describe("Model to use: pro (more accurate) or flash (faster). Default: flash"),
+        thinkingLevel: z
+            .enum(["minimal", "low", "medium", "high"])
+            .optional()
+            .describe("Reasoning depth: minimal/low for fast responses, medium/high for complex analysis. " +
+            "Pro supports low/high only. Flash supports all levels. Default: high"),
+        mediaResolution: z
+            .enum(["low", "medium", "high"])
+            .default("medium")
+            .describe("Resolution for processing: low (faster), medium (balanced), high (more detail). Default: medium"),
+    }, async ({ imagePath, query, detectObjects, model, thinkingLevel, mediaResolution }) => {
+        logger.info(`Analyzing image: ${imagePath}`);
+        try {
+            const apiKey = process.env.GEMINI_API_KEY;
+            if (!apiKey) {
+                throw new Error("GEMINI_API_KEY not set");
+            }
+            if (!fs.existsSync(imagePath)) {
+                throw new Error(`File not found: ${imagePath}`);
+            }
+            const fileBuffer = fs.readFileSync(imagePath);
+            const mimeType = getImageMimeType(imagePath);
+            const dimensions = extractImageDimensions(imagePath);
+            const genAI = new GoogleGenAI({ apiKey });
+            const modelName = model === "pro"
+                ? process.env.GEMINI_PRO_MODEL || "gemini-3-pro-preview"
+                : process.env.GEMINI_FLASH_MODEL || "gemini-3-flash-preview";
+            const fileSize = fileBuffer.length;
+            logger.debug(`Image size: ${fileSize} bytes, MIME type: ${mimeType}`);
+            // Log image dimensions (for pixel coordinate conversion)
+            if (dimensions) {
+                logger.debug(`Image dimensions: ${dimensions.width}x${dimensions.height}`);
+            }
+            else {
+                logger.debug("Could not extract image dimensions");
+            }
+            // Build prompt based on parameters
+            let prompt = query || "Analyze this image in detail.";
+            if (detectObjects) {
+                prompt += `\n\nFor each object you identify, provide bounding box coordinates in the box_2d format: [y_min, x_min, y_max, x_max] where coordinates are normalized to 0-1000 scale.
+
+Return your response as a JSON object with this structure:
+{
+"description": "Overall description of the image",
+"objects": [
+{
+"label": "object name",
+"confidence": "high/medium/low",
+"box_2d": [y_min, x_min, y_max, x_max]
+}
+]
+}
+
+If you cannot detect specific objects or bounding boxes are not applicable, return an empty objects array.`;
+            }
+            // Map resolution to API parameter
+            const resolutionMap = {
+                low: "media_resolution_low",
+                medium: "media_resolution_medium",
+                high: "media_resolution_high",
+            };
+            // For files <20MB, use inline data. For larger files, use Files API
+            let responseText;
+            if (fileSize > 20 * 1024 * 1024) {
+                // Upload using Files API
+                logger.info("Large file detected, uploading via Files API...");
+                const uploadedFile = await genAI.files.upload({
+                    file: new Blob([new Uint8Array(fileBuffer)], { type: mimeType }),
+                    config: { mimeType },
+                });
+                const config = {};
+                if (mediaResolution !== "medium") {
+                    config.mediaResolution = resolutionMap[mediaResolution];
+                }
+                // Add thinking config for Gemini 3
+                if (thinkingLevel) {
+                    // Pro only supports low/high, Flash supports all levels
+                    const effectiveLevel = model === "pro"
+                        ? (thinkingLevel === "minimal" || thinkingLevel === "low" ? "low" : "high")
+                        : thinkingLevel;
+                    config.thinkingConfig = { thinkingLevel: effectiveLevel };
+                    logger.debug(`Using thinking level: ${effectiveLevel}${model === "pro" && effectiveLevel !== thinkingLevel ? ` (requested: ${thinkingLevel})` : ""}`);
+                }
+                // Add structured output for object detection
+                if (detectObjects) {
+                    config.responseMimeType = "application/json";
+                    config.responseJsonSchema = {
+                        type: "object",
+                        properties: {
+                            description: { type: "string" },
+                            objects: {
+                                type: "array",
+                                items: {
+                                    type: "object",
+                                    properties: {
+                                        label: { type: "string" },
+                                        confidence: { type: "string" },
+                                        box_2d: {
+                                            type: "array",
+                                            items: { type: "number" },
+                                            minItems: 4,
+                                            maxItems: 4,
+                                        },
+                                    },
+                                    required: ["label", "confidence", "box_2d"],
+                                },
+                            },
+                        },
+                        required: ["description", "objects"],
+                    };
+                }
+                const response = await genAI.models.generateContent({
+                    model: modelName,
+                    contents: [
+                        {
+                            role: "user",
+                            parts: [
+                                {
+                                    fileData: {
+                                        fileUri: uploadedFile.uri,
+                                        mimeType: uploadedFile.mimeType,
+                                    },
+                                },
+                                { text: prompt },
+                            ],
+                        },
+                    ],
+                    config: Object.keys(config).length > 0 ? config : undefined,
+                });
+                responseText = response.text || "";
+            }
+            else {
+                // Use inline data for smaller files
+                const base64Data = fileBuffer.toString("base64");
+                const inlineConfig = {};
+                if (mediaResolution !== "medium") {
+                    inlineConfig.mediaResolution = resolutionMap[mediaResolution];
+                }
+                // Add thinking config for Gemini 3
+                if (thinkingLevel) {
+                    // Pro only supports low/high, Flash supports all levels
+                    const effectiveLevel = model === "pro"
+                        ? (thinkingLevel === "minimal" || thinkingLevel === "low" ? "low" : "high")
+                        : thinkingLevel;
+                    inlineConfig.thinkingConfig = { thinkingLevel: effectiveLevel };
+                    logger.debug(`Using thinking level: ${effectiveLevel}${model === "pro" && effectiveLevel !== thinkingLevel ? ` (requested: ${thinkingLevel})` : ""}`);
+                }
+                // Add structured output for object detection
+                if (detectObjects) {
+                    inlineConfig.responseMimeType = "application/json";
+                    inlineConfig.responseJsonSchema = {
+                        type: "object",
+                        properties: {
+                            description: { type: "string" },
+                            objects: {
+                                type: "array",
+                                items: {
+                                    type: "object",
+                                    properties: {
+                                        label: { type: "string" },
+                                        confidence: { type: "string" },
+                                        box_2d: {
+                                            type: "array",
+                                            items: { type: "number" },
+                                            minItems: 4,
+                                            maxItems: 4,
+                                        },
+                                    },
+                                    required: ["label", "confidence", "box_2d"],
+                                },
+                            },
+                        },
+                        required: ["description", "objects"],
+                    };
+                }
+                const response = await genAI.models.generateContent({
+                    model: modelName,
+                    contents: [
+                        {
+                            role: "user",
+                            parts: [
+                                {
+                                    inlineData: {
+                                        mimeType,
+                                        data: base64Data,
+                                    },
+                                },
+                                { text: prompt },
+                            ],
+                        },
+                    ],
+                    config: Object.keys(inlineConfig).length > 0 ? inlineConfig : undefined,
+                });
+                responseText = response.text || "";
+            }
+            // Process response
+            if (detectObjects && responseText) {
+                try {
+                    const parsed = JSON.parse(responseText);
+                    // Add pixel coordinates if dimensions are available
+                    if (dimensions && parsed.objects && Array.isArray(parsed.objects)) {
+                        parsed.objects = parsed.objects.map((obj) => {
+                            if (Array.isArray(obj.box_2d) && obj.box_2d.length === 4) {
+                                return {
+                                    ...obj,
+                                    bbox_pixels: convertToPixelCoords(obj.box_2d, dimensions.width, dimensions.height),
+                                };
+                            }
+                            return obj;
+                        });
+                    }
+                    const formattedJson = JSON.stringify(parsed, null, 2);
+                    logger.info("Image analysis completed with object detection");
+                    return {
+                        content: [
+                            {
+                                type: "text",
+                                text: `**Image Analysis Results:**\n\`\`\`json\n${formattedJson}\n\`\`\``,
+                            },
+                        ],
+                    };
+                }
+                catch (parseError) {
+                    // Fallback to plain text if JSON parsing fails
+                    logger.warn("Failed to parse JSON response, returning as text");
+                    return {
+                        content: [
+                            {
+                                type: "text",
+                                text: responseText,
+                            },
+                        ],
+                    };
+                }
+            }
+            logger.info("Image analysis completed");
+            return {
+                content: [
+                    {
+                        type: "text",
+                        text: responseText || "Unable to analyze image.",
+                    },
+                ],
+            };
+        }
+        catch (error) {
+            const errorMessage = error instanceof Error ? error.message : String(error);
+            logger.error(`Error in image analysis: ${errorMessage}`);
+            return {
+                content: [
+                    {
+                        type: "text",
+                        text: `Error analyzing image: ${errorMessage}`,
+                    },
+                ],
+                isError: true,
+            };
+        }
+    });
+}
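When detectObjects is enabled, the tool asks for JSON matching the responseJsonSchema above and, if the image dimensions could be read, appends a bbox_pixels field to each detected object before returning the formatted result. A TypeScript sketch of the shape such a result might take, with hypothetical example values for a 1920x1080 photo (not produced by the package itself):

// Structured result shape implied by the schema above; bbox_pixels is
// added by the tool after parsing the model's JSON output.
interface DetectedObject {
    label: string;
    confidence: string; // "high" | "medium" | "low"
    box_2d: [number, number, number, number]; // [y_min, x_min, y_max, x_max], 0-1000
    bbox_pixels?: { x: number; y: number; width: number; height: number };
}

interface ImageAnalysisResult {
    description: string;
    objects: DetectedObject[];
}

// Hypothetical example values for illustration only:
const example: ImageAnalysisResult = {
    description: "A dog sitting on a couch in a living room.",
    objects: [
        {
            label: "dog",
            confidence: "high",
            box_2d: [300, 100, 900, 600],
            bbox_pixels: { x: 192, y: 324, width: 960, height: 648 },
        },
    ],
};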
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@rlabs-inc/gemini-mcp",
-  "version": "0.7.2",
+  "version": "0.8.0",
   "mcpName": "io.github.RLabs-Inc/gemini-mcp",
   "description": "MCP server for Gemini 3 integration with Claude Code - 30+ AI tools including image/video generation, deep research, code execution, and beautiful CLI",
   "main": "dist/index.js",