@rlabs-inc/gemini-mcp 0.7.2 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/server.js CHANGED
@@ -25,6 +25,7 @@ import { registerCacheTool } from './tools/cache.js';
25
25
  import { registerSpeechTool } from './tools/speech.js';
26
26
  import { registerTokenCountTool } from './tools/token-count.js';
27
27
  import { registerDeepResearchTool } from './tools/deep-research.js';
28
+ import { registerImageAnalyzeTool } from './tools/image-analyze.js';
28
29
  // Import Gemini client and logger
29
30
  import { initGeminiClient } from './gemini-client.js';
30
31
  import { setupLogger, logger } from './utils/logger.js';
@@ -134,6 +135,7 @@ For CLI mode, run: gemini --help
134
135
  registerSpeechTool(server);
135
136
  registerTokenCountTool(server);
136
137
  registerDeepResearchTool(server);
138
+ registerImageAnalyzeTool(server);
137
139
  // Start server with stdio transport
138
140
  const transport = new StdioServerTransport();
139
141
  // Set up error handling for transport
@@ -0,0 +1,15 @@
1
+ /**
2
+ * Image Analysis Tool - Analyze images with object detection and bounding boxes
3
+ *
4
+ * This tool uses Gemini's vision capabilities to analyze images and detect objects
5
+ * with bounding box coordinates. Returns both normalized box_2d format and pixel coordinates.
6
+ *
7
+ * Bounding Box Format:
8
+ * - box_2d: [y_min, x_min, y_max, x_max] in 0-1000 normalized coordinates
9
+ * - bbox_pixels: {x, y, width, height} in pixel coordinates (when dimensions available)
10
+ */
11
+ import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
12
+ /**
13
+ * Register image analysis tools with the MCP server
14
+ */
15
+ export declare function registerImageAnalyzeTool(server: McpServer): void;
@@ -0,0 +1,363 @@
1
+ /**
2
+ * Image Analysis Tool - Analyze images with object detection and bounding boxes
3
+ *
4
+ * This tool uses Gemini's vision capabilities to analyze images and detect objects
5
+ * with bounding box coordinates. Returns both normalized box_2d format and pixel coordinates.
6
+ *
7
+ * Bounding Box Format:
8
+ * - box_2d: [y_min, x_min, y_max, x_max] in 0-1000 normalized coordinates
9
+ * - bbox_pixels: {x, y, width, height} in pixel coordinates (when dimensions available)
10
+ */
11
+ import { z } from "zod";
12
+ import { GoogleGenAI } from "@google/genai";
13
+ import { logger } from "../utils/logger.js";
14
+ import * as fs from "fs";
15
+ import * as path from "path";
16
/**
 * Read the pixel dimensions of a PNG or JPEG file from its header bytes.
 *
 * The decoder is selected by file extension (case-insensitive): `.png` uses
 * the IHDR chunk, `.jpg`/`.jpeg` scan the marker segments for a frame header.
 * Any other extension yields `null`.
 *
 * @param {string} filePath - Path of the image file to inspect.
 * @returns {{ width: number, height: number } | null} The dimensions, or
 *   `null` when the format is unsupported, the header is malformed, or the
 *   file cannot be read (read errors are logged at debug level, not thrown).
 */
function extractImageDimensions(filePath) {
    try {
        const buffer = fs.readFileSync(filePath);
        const ext = path.extname(filePath).toLowerCase();
        if (ext === ".png") {
            return readPngDimensions(buffer);
        }
        if (ext === ".jpg" || ext === ".jpeg") {
            return readJpegDimensions(buffer);
        }
        return null;
    }
    catch (error) {
        logger.debug(`Failed to extract dimensions: ${error}`);
        return null;
    }
}
/**
 * Parse width/height out of a PNG header.
 *
 * Layout: 8-byte signature, 4-byte chunk length, 4-byte chunk type, then the
 * IHDR payload whose first two big-endian uint32 values are width and height.
 * Returns `null` unless the first chunk really is IHDR, because otherwise
 * bytes 16-23 belong to some other chunk and are not dimensions.
 */
function readPngDimensions(buffer) {
    if (buffer.length < 24) {
        return null;
    }
    if (buffer.toString("ascii", 1, 4) !== "PNG") {
        return null;
    }
    if (buffer.toString("ascii", 12, 16) !== "IHDR") {
        return null;
    }
    const width = buffer.readUInt32BE(16);
    const height = buffer.readUInt32BE(20);
    return { width, height };
}
/**
 * Parse width/height out of a JPEG by walking its marker segments until a
 * start-of-frame (SOFn) segment is found. Returns `null` for malformed data
 * or when scan data / end of image is reached without a frame header.
 */
function readJpegDimensions(buffer) {
    // Every JPEG starts with the SOI marker 0xFFD8.
    if (buffer.length < 4 || buffer[0] !== 0xff || buffer[1] !== 0xd8) {
        return null;
    }
    let offset = 2;
    while (offset + 4 <= buffer.length) {
        if (buffer[offset] !== 0xff) {
            return null;
        }
        const marker = buffer[offset + 1];
        // A marker may be preceded by any number of 0xFF fill bytes.
        if (marker === 0xff) {
            offset += 1;
            continue;
        }
        // Standalone markers (TEM, RST0-RST7, SOI) carry no length field.
        if (marker === 0x01 || (marker >= 0xd0 && marker <= 0xd8)) {
            offset += 2;
            continue;
        }
        // EOI or start of scan: entropy-coded data follows, no frame header ahead.
        if (marker === 0xd9 || marker === 0xda) {
            return null;
        }
        const segmentLength = buffer.readUInt16BE(offset + 2);
        // The length field counts itself, so anything below 2 is corrupt.
        if (segmentLength < 2) {
            return null;
        }
        // SOF0-SOF15 hold the frame size, except 0xC4 (DHT), 0xC8 (JPG) and
        // 0xCC (DAC), which share the 0xC0-0xCF range but are not frame headers.
        const isStartOfFrame = marker >= 0xc0 &&
            marker <= 0xcf &&
            marker !== 0xc4 &&
            marker !== 0xc8 &&
            marker !== 0xcc;
        if (isStartOfFrame) {
            // Need marker(2) + length(2) + precision(1) + height(2) + width(2).
            if (offset + 9 > buffer.length) {
                return null;
            }
            const height = buffer.readUInt16BE(offset + 5);
            const width = buffer.readUInt16BE(offset + 7);
            return { width, height };
        }
        offset += 2 + segmentLength;
    }
    return null;
}
57
/**
 * Convert a normalized `box_2d` into a pixel-space rectangle.
 *
 * Each edge is scaled and rounded on its own, and the size is derived from
 * the rounded edges. Rounding the origin and the size independently can push
 * `x + width` (or `y + height`) one pixel past the image on rounding ties.
 *
 * @param {number[]} box2d - `[y_min, x_min, y_max, x_max]` on a 0-1000 scale.
 * @param {number} width - Image width in pixels.
 * @param {number} height - Image height in pixels.
 * @returns {{ x: number, y: number, width: number, height: number }}
 *   Top-left corner and size of the box in pixels.
 */
function convertToPixelCoords(box2d, width, height) {
    const [yMin, xMin, yMax, xMax] = box2d;
    const left = Math.round((xMin / 1000) * width);
    const top = Math.round((yMin / 1000) * height);
    const right = Math.round((xMax / 1000) * width);
    const bottom = Math.round((yMax / 1000) * height);
    return {
        x: left,
        y: top,
        width: right - left,
        height: bottom - top,
    };
}
69
/**
 * Map an image file's extension to the MIME type sent to the API.
 *
 * The extension is compared case-insensitively. Unrecognised or missing
 * extensions fall back to "image/jpeg".
 *
 * @param {string} filePath - Path (or bare name) of the image file.
 * @returns {string} The MIME type for the file.
 */
function getImageMimeType(filePath) {
    switch (path.extname(filePath).toLowerCase()) {
        case ".png":
            return "image/png";
        case ".webp":
            return "image/webp";
        case ".heic":
            return "image/heic";
        case ".heif":
            return "image/heif";
        case ".gif":
            return "image/gif";
        case ".jpg":
        case ".jpeg":
        default:
            return "image/jpeg";
    }
}
85
/**
 * Largest payload, in bytes, that is sent inline before switching to the
 * Files API.
 */
const INLINE_PAYLOAD_LIMIT_BYTES = 20 * 1024 * 1024;
/**
 * Build the prompt text: the caller's query (or a generic default) plus, when
 * object detection is on, instructions describing the expected JSON shape.
 */
function buildAnalysisPrompt(query, detectObjects) {
    const basePrompt = query || "Analyze this image in detail.";
    if (!detectObjects) {
        return basePrompt;
    }
    return (basePrompt +
        `\n\nFor each object you identify, provide bounding box coordinates in the box_2d format: [y_min, x_min, y_max, x_max] where coordinates are normalized to 0-1000 scale.

Return your response as a JSON object with this structure:
{
"description": "Overall description of the image",
"objects": [
{
"label": "object name",
"confidence": "high/medium/low",
"box_2d": [y_min, x_min, y_max, x_max]
}
]
}

If you cannot detect specific objects or bounding boxes are not applicable, return an empty objects array.`);
}
/**
 * JSON schema requested from the model when object detection is enabled:
 * a description plus a list of labelled objects with a 4-number box_2d.
 */
function buildObjectDetectionSchema() {
    return {
        type: "object",
        properties: {
            description: { type: "string" },
            objects: {
                type: "array",
                items: {
                    type: "object",
                    properties: {
                        label: { type: "string" },
                        confidence: { type: "string" },
                        box_2d: {
                            type: "array",
                            items: { type: "number" },
                            minItems: 4,
                            maxItems: 4,
                        },
                    },
                    required: ["label", "confidence", "box_2d"],
                },
            },
        },
        required: ["description", "objects"],
    };
}
/**
 * Assemble the generateContent `config` from the tool arguments.
 * Returns `undefined` when no option deviates from the API defaults, so the
 * request is sent without a config object in that case.
 */
function buildImageAnalysisConfig({ model, thinkingLevel, mediaResolution, detectObjects }) {
    const resolutionMap = {
        low: "media_resolution_low",
        medium: "media_resolution_medium",
        high: "media_resolution_high",
    };
    const config = {};
    // "medium" is left unset rather than sent explicitly.
    if (mediaResolution !== "medium") {
        config.mediaResolution = resolutionMap[mediaResolution];
    }
    if (thinkingLevel) {
        // Pro only supports low/high, Flash supports all levels
        const effectiveLevel = model === "pro"
            ? (thinkingLevel === "minimal" || thinkingLevel === "low" ? "low" : "high")
            : thinkingLevel;
        config.thinkingConfig = { thinkingLevel: effectiveLevel };
        logger.debug(`Using thinking level: ${effectiveLevel}${model === "pro" && effectiveLevel !== thinkingLevel ? ` (requested: ${thinkingLevel})` : ""}`);
    }
    // Structured JSON output so the response can be parsed for bounding boxes.
    if (detectObjects) {
        config.responseMimeType = "application/json";
        config.responseJsonSchema = buildObjectDetectionSchema();
    }
    return Object.keys(config).length > 0 ? config : undefined;
}
/**
 * Register image analysis tools with the MCP server.
 *
 * Registers the `gemini-analyze-image` tool. The handler reads the image from
 * disk, sends it to Gemini (inline for small payloads, via the Files API for
 * large ones) together with the prompt, and returns the model's answer as MCP
 * text content. With `detectObjects` enabled the JSON answer is parsed and,
 * when the image dimensions could be read, each object with a 4-element
 * `box_2d` gains a `bbox_pixels` rectangle. Failures are reported as an
 * `isError` result instead of being thrown.
 *
 * @param server - MCP server instance exposing `tool(name, schema, handler)`.
 */
export function registerImageAnalyzeTool(server) {
    server.tool("gemini-analyze-image", {
        imagePath: z
            .string()
            .describe("Path to image file. Supports JPEG, PNG, WebP, HEIC, HEIF, GIF"),
        query: z
            .string()
            .optional()
            .describe('Specific question about the image (e.g., "What objects are in this image?", "Count the people"). Default: "Analyze this image in detail."'),
        detectObjects: z
            .boolean()
            .default(true)
            .describe("Enable object detection with bounding boxes (returns box_2d coordinates). Default: true"),
        model: z
            .enum(["pro", "flash"])
            .default("flash")
            .describe("Model to use: pro (more accurate) or flash (faster). Default: flash"),
        thinkingLevel: z
            .enum(["minimal", "low", "medium", "high"])
            .optional()
            .describe("Reasoning depth: minimal/low for fast responses, medium/high for complex analysis. " +
            "Pro supports low/high only. Flash supports all levels. Default: high"),
        mediaResolution: z
            .enum(["low", "medium", "high"])
            .default("medium")
            .describe("Resolution for processing: low (faster), medium (balanced), high (more detail). Default: medium"),
    }, async ({ imagePath, query, detectObjects, model, thinkingLevel, mediaResolution }) => {
        logger.info(`Analyzing image: ${imagePath}`);
        try {
            const apiKey = process.env.GEMINI_API_KEY;
            if (!apiKey) {
                throw new Error("GEMINI_API_KEY not set");
            }
            if (!fs.existsSync(imagePath)) {
                throw new Error(`File not found: ${imagePath}`);
            }
            const fileBuffer = fs.readFileSync(imagePath);
            const mimeType = getImageMimeType(imagePath);
            const dimensions = extractImageDimensions(imagePath);
            const genAI = new GoogleGenAI({ apiKey });
            const modelName = model === "pro"
                ? process.env.GEMINI_PRO_MODEL || "gemini-3-pro-preview"
                : process.env.GEMINI_FLASH_MODEL || "gemini-3-flash-preview";
            const fileSize = fileBuffer.length;
            logger.debug(`Image size: ${fileSize} bytes, MIME type: ${mimeType}`);
            // Log image dimensions (for pixel coordinate conversion)
            if (dimensions) {
                logger.debug(`Image dimensions: ${dimensions.width}x${dimensions.height}`);
            }
            else {
                logger.debug("Could not extract image dimensions");
            }
            const prompt = buildAnalysisPrompt(query, detectObjects);
            // Inline data travels base64-encoded (about 4/3 of the raw size), so the
            // decision is made on the encoded size rather than the size on disk.
            // NOTE(review): 20 MB is taken to be the inline request limit - confirm
            // against the current Gemini API documentation.
            const encodedSize = Math.ceil(fileSize / 3) * 4;
            let imagePart;
            if (encodedSize > INLINE_PAYLOAD_LIMIT_BYTES) {
                // Upload using Files API
                logger.info("Large file detected, uploading via Files API...");
                const uploadedFile = await genAI.files.upload({
                    file: new Blob([new Uint8Array(fileBuffer)], { type: mimeType }),
                    config: { mimeType },
                });
                if (!uploadedFile.uri) {
                    throw new Error("File upload did not return a URI");
                }
                imagePart = {
                    fileData: {
                        fileUri: uploadedFile.uri,
                        mimeType: uploadedFile.mimeType,
                    },
                };
            }
            else {
                // Use inline data for smaller files
                imagePart = {
                    inlineData: {
                        mimeType,
                        data: fileBuffer.toString("base64"),
                    },
                };
            }
            const config = buildImageAnalysisConfig({
                model,
                thinkingLevel,
                mediaResolution,
                detectObjects,
            });
            const response = await genAI.models.generateContent({
                model: modelName,
                contents: [
                    {
                        role: "user",
                        parts: [imagePart, { text: prompt }],
                    },
                ],
                config,
            });
            const responseText = response.text || "";
            // Process response
            if (detectObjects && responseText) {
                try {
                    const parsed = JSON.parse(responseText);
                    // Add pixel coordinates if dimensions are available
                    if (dimensions && parsed.objects && Array.isArray(parsed.objects)) {
                        parsed.objects = parsed.objects.map((obj) => {
                            if (Array.isArray(obj.box_2d) && obj.box_2d.length === 4) {
                                return {
                                    ...obj,
                                    bbox_pixels: convertToPixelCoords(obj.box_2d, dimensions.width, dimensions.height),
                                };
                            }
                            return obj;
                        });
                    }
                    const formattedJson = JSON.stringify(parsed, null, 2);
                    logger.info("Image analysis completed with object detection");
                    return {
                        content: [
                            {
                                type: "text",
                                text: `**Image Analysis Results:**\n\`\`\`json\n${formattedJson}\n\`\`\``,
                            },
                        ],
                    };
                }
                catch (parseError) {
                    // Fallback to plain text if JSON parsing fails
                    logger.warn("Failed to parse JSON response, returning as text");
                    return {
                        content: [
                            {
                                type: "text",
                                text: responseText,
                            },
                        ],
                    };
                }
            }
            logger.info("Image analysis completed");
            return {
                content: [
                    {
                        type: "text",
                        text: responseText || "Unable to analyze image.",
                    },
                ],
            };
        }
        catch (error) {
            const errorMessage = error instanceof Error ? error.message : String(error);
            logger.error(`Error in image analysis: ${errorMessage}`);
            return {
                content: [
                    {
                        type: "text",
                        text: `Error analyzing image: ${errorMessage}`,
                    },
                ],
                isError: true,
            };
        }
    });
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@rlabs-inc/gemini-mcp",
3
- "version": "0.7.2",
3
+ "version": "0.8.0",
4
4
  "mcpName": "io.github.RLabs-Inc/gemini-mcp",
5
5
  "description": "MCP server for Gemini 3 integration with Claude Code - 30+ AI tools including image/video generation, deep research, code execution, and beautiful CLI",
6
6
  "main": "dist/index.js",