voyageai-cli 1.30.2 → 1.30.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "voyageai-cli",
3
- "version": "1.30.2",
3
+ "version": "1.30.3",
4
4
  "description": "CLI for Voyage AI embeddings, reranking, and MongoDB Atlas Vector Search",
5
5
  "bin": {
6
6
  "vai": "./src/cli.js"
@@ -1,11 +1,13 @@
1
1
  'use strict';
2
2
 
3
3
  const { getDefaultModel } = require('../lib/catalog');
4
- const { generateEmbeddings } = require('../lib/api');
5
- const { resolveTextInput } = require('../lib/input');
4
+ const { generateEmbeddings, generateMultimodalEmbeddings } = require('../lib/api');
5
+ const { resolveTextInput, readMediaAsBase64, isImageFile, isVideoFile } = require('../lib/input');
6
6
  const ui = require('../lib/ui');
7
7
  const { showCostSummary } = require('../lib/cost-display');
8
8
 
9
+ const MULTIMODAL_MODEL = 'voyage-multimodal-3.5';
10
+
9
11
  /**
10
12
  * Register the embed command on a Commander program.
11
13
  * @param {import('commander').Command} program
@@ -18,6 +20,8 @@ function registerEmbed(program) {
18
20
  .option('-t, --input-type <type>', 'Input type: query or document')
19
21
  .option('-d, --dimensions <n>', 'Output dimensions', (v) => parseInt(v, 10))
20
22
  .option('-f, --file <path>', 'Read text from file')
23
+ .option('--image <path>', 'Embed an image file (uses voyage-multimodal-3.5)')
24
+ .option('--video <path>', 'Embed a video file (uses voyage-multimodal-3.5)')
21
25
  .option('--truncation', 'Enable truncation for long inputs')
22
26
  .option('--no-truncation', 'Disable truncation')
23
27
  .option('--output-dtype <type>', 'Output data type: float, int8, uint8, binary, ubinary', 'float')
@@ -28,6 +32,121 @@ function registerEmbed(program) {
28
32
  .action(async (text, opts) => {
29
33
  try {
30
34
  const telemetry = require('../lib/telemetry');
35
+ const isMultimodal = !!(opts.image || opts.video);
36
+
37
+ // Validate: --image/--video are incompatible with --file
38
+ if (isMultimodal && opts.file) {
39
+ console.error(ui.error('Cannot combine --image or --video with --file. Use --image/--video for multimodal, or --file for text.'));
40
+ process.exit(1);
41
+ }
42
+
43
+ // Multimodal path: --image and/or --video
44
+ if (isMultimodal) {
45
+ const model = opts.model === getDefaultModel() ? MULTIMODAL_MODEL : opts.model;
46
+ const useColor = !opts.json;
47
+ const useSpinner = useColor && !opts.quiet;
48
+
49
+ // Build content array
50
+ const contentItems = [];
51
+ const mediaMeta = [];
52
+
53
+ // Add text if provided
54
+ if (text) {
55
+ contentItems.push({ type: 'text', text });
56
+ }
57
+
58
+ // Add image
59
+ if (opts.image) {
60
+ if (!isImageFile(opts.image)) {
61
+ console.error(ui.error(`Not a supported image format: ${opts.image}`));
62
+ process.exit(1);
63
+ }
64
+ const media = readMediaAsBase64(opts.image);
65
+ contentItems.push({ type: 'image_base64', image_base64: media.base64DataUrl });
66
+ mediaMeta.push({ type: 'image', path: opts.image, mime: media.mimeType, size: media.sizeBytes });
67
+ }
68
+
69
+ // Add video
70
+ if (opts.video) {
71
+ if (!isVideoFile(opts.video)) {
72
+ console.error(ui.error(`Not a supported video format: ${opts.video}`));
73
+ process.exit(1);
74
+ }
75
+ const media = readMediaAsBase64(opts.video);
76
+ contentItems.push({ type: 'video_base64', video_base64: media.base64DataUrl });
77
+ mediaMeta.push({ type: 'video', path: opts.video, mime: media.mimeType, size: media.sizeBytes });
78
+ }
79
+
80
+ if (contentItems.length === 0) {
81
+ console.error(ui.error('No content provided. Pass text, --image, or --video.'));
82
+ process.exit(1);
83
+ }
84
+
85
+ const done = telemetry.timer('cli_embed', {
86
+ model,
87
+ multimodal: true,
88
+ hasText: !!text,
89
+ hasImage: !!opts.image,
90
+ hasVideo: !!opts.video,
91
+ });
92
+
93
+ let spin;
94
+ if (useSpinner) {
95
+ spin = ui.spinner('Generating multimodal embeddings...');
96
+ spin.start();
97
+ }
98
+
99
+ const mmOpts = { model };
100
+ if (opts.inputType) mmOpts.inputType = opts.inputType;
101
+ if (opts.dimensions) mmOpts.outputDimension = opts.dimensions;
102
+
103
+ const result = await generateMultimodalEmbeddings([contentItems], mmOpts);
104
+
105
+ if (spin) spin.stop();
106
+
107
+ if (opts.outputFormat === 'array') {
108
+ console.log(JSON.stringify(result.data[0].embedding));
109
+ return;
110
+ }
111
+
112
+ if (opts.json) {
113
+ console.log(JSON.stringify(result, null, 2));
114
+ return;
115
+ }
116
+
117
+ // Friendly output
118
+ if (!opts.quiet) {
119
+ console.log(ui.label('Model', ui.cyan(model)));
120
+ console.log(ui.label('Mode', ui.cyan('multimodal')));
121
+ for (const m of mediaMeta) {
122
+ const sizeStr = m.size < 1024 * 1024
123
+ ? `${(m.size / 1024).toFixed(1)} KB`
124
+ : `${(m.size / (1024 * 1024)).toFixed(1)} MB`;
125
+ console.log(ui.label(m.type === 'image' ? 'Image' : 'Video', `${m.path} ${ui.dim(`(${m.mime}, ${sizeStr})`)}`));
126
+ }
127
+ if (text) {
128
+ console.log(ui.label('Text', ui.dim(text.slice(0, 80) + (text.length > 80 ? '...' : ''))));
129
+ }
130
+ if (result.usage) {
131
+ console.log(ui.label('Tokens', ui.dim(String(result.usage.total_tokens))));
132
+ }
133
+ const dims = result.data[0]?.embedding?.length || 'N/A';
134
+ console.log(ui.label('Dimensions', ui.bold(String(dims))));
135
+ console.log('');
136
+ }
137
+
138
+ const vector = result.data[0].embedding;
139
+ const preview = vector.slice(0, 5).map(v => v.toFixed(6)).join(', ');
140
+ console.log(`[${preview}, ...] (${vector.length} dims)`);
141
+
142
+ console.log('');
143
+ console.log(ui.success('Multimodal embedding generated'));
144
+
145
+ done({ dimensions: result.data[0]?.embedding?.length });
146
+ return;
147
+ }
148
+
149
+ // Standard text embedding path
31
150
  const texts = await resolveTextInput(text, opts.file);
32
151
 
33
152
  // --estimate: show cost comparison, optionally switch model
@@ -1406,9 +1406,58 @@ function createPlaygroundServer() {
1406
1406
  res.end(JSON.stringify({ error: 'inputs must be a non-empty array' }));
1407
1407
  return;
1408
1408
  }
1409
+
1410
+ // Optimize video inputs: downsample to 1fps to fit within 32k token context
1411
+ const os = require('os');
1412
+ const path = require('path');
1413
+ const fs = require('fs');
1414
+ const { execFileSync } = require('child_process');
1415
+ const optimizedInputs = [];
1416
+ for (const input of inputs) {
1417
+ const content = input.content;
1418
+ if (content && Array.isArray(content)) {
1419
+ const optimizedContent = [];
1420
+ for (const item of content) {
1421
+ if (item.type === 'video_base64' && item.video_base64) {
1422
+ // Downsample video to 1fps using ffmpeg to reduce token count
1423
+ try {
1424
+ const b64 = item.video_base64.replace(/^data:[^;]+;base64,/, '');
1425
+ const tmpIn = path.join(os.tmpdir(), `vai_vid_in_${Date.now()}.mp4`);
1426
+ const tmpOut = path.join(os.tmpdir(), `vai_vid_out_${Date.now()}.mp4`);
1427
+ fs.writeFileSync(tmpIn, Buffer.from(b64, 'base64'));
1428
+ try {
1429
+ execFileSync('ffmpeg', [
1430
+ '-y', '-i', tmpIn,
1431
+ '-vf', 'fps=1',
1432
+ '-c:v', 'libx264', '-preset', 'fast', '-crf', '23',
1433
+ '-an', // strip audio
1434
+ tmpOut
1435
+ ], { timeout: 30000, stdio: 'pipe' });
1436
+ const optimizedBuf = fs.readFileSync(tmpOut);
1437
+ const optimizedB64 = `data:video/mp4;base64,${optimizedBuf.toString('base64')}`;
1438
+ optimizedContent.push({ type: 'video_base64', video_base64: optimizedB64 });
1439
+ } finally {
1440
+ try { fs.unlinkSync(tmpIn); } catch (_) {}
1441
+ try { fs.unlinkSync(tmpOut); } catch (_) {}
1442
+ }
1443
+ } catch (err) {
1444
+ // If optimization fails, send original and let API error naturally
1445
+ console.warn('[Playground] Video optimization failed:', err.message);
1446
+ optimizedContent.push(item);
1447
+ }
1448
+ } else {
1449
+ optimizedContent.push(item);
1450
+ }
1451
+ }
1452
+ optimizedInputs.push({ ...input, content: optimizedContent });
1453
+ } else {
1454
+ optimizedInputs.push(input);
1455
+ }
1456
+ }
1457
+
1409
1458
  const { apiRequest } = require('../lib/api');
1410
1459
  const mmBody = {
1411
- inputs,
1460
+ inputs: optimizedInputs,
1412
1461
  model: model || 'voyage-multimodal-3.5',
1413
1462
  };
1414
1463
  if (input_type) mmBody.input_type = input_type;
@@ -1590,9 +1639,13 @@ function createPlaygroundServer() {
1590
1639
  else if (output.text) summary = output.text.slice(0, 100) + (output.text.length > 100 ? '...' : '');
1591
1640
  else summary = JSON.stringify(output).slice(0, 200);
1592
1641
  }
1642
+ // Extract usage data for cost tracking (then strip from output payload)
1643
+ const _usage = (output && output._usage) ? output._usage : undefined;
1644
+ const cleanOutput = _usage ? { ...output } : output;
1645
+ if (cleanOutput && cleanOutput._usage) delete cleanOutput._usage;
1593
1646
  res.write(`event: step_complete\ndata: ${JSON.stringify({
1594
- stepId, timeMs, summary,
1595
- output: JSON.stringify(output).length < 5000 ? output : { _truncated: true, summary },
1647
+ stepId, timeMs, summary, _usage,
1648
+ output: JSON.stringify(cleanOutput).length < 5000 ? cleanOutput : { _truncated: true, summary },
1596
1649
  })}\n\n`);
1597
1650
  },
1598
1651
  onStepSkip: (stepId, reason) => {
package/src/lib/api.js CHANGED
@@ -195,6 +195,36 @@ async function generateEmbeddings(texts, options = {}) {
195
195
  return apiRequest('/embeddings', body);
196
196
  }
197
197
 
/**
 * Generate multimodal embeddings for inputs containing text, images, and/or video.
 * Posts to the /multimodalembeddings endpoint, wrapping each content array as
 * one `{ content: [...] }` input object.
 *
 * @param {Array<Array<{type: string, text?: string, image_base64?: string, video_base64?: string}>>} inputs
 *   Array of content arrays. Each content array is a list of content items for one input.
 *   Example: [[{type: 'text', text: 'hello'}, {type: 'image_base64', image_base64: 'data:image/png;base64,...'}]]
 * @param {object} [options]
 * @param {string} [options.model] - Model name (default: voyage-multimodal-3.5)
 * @param {string} [options.inputType] - Input type (query|document)
 * @param {number} [options.outputDimension] - Output dimensions
 * @returns {Promise<object>} API response with embeddings
 * @throws {TypeError} (as a rejection) when `inputs` is not a non-empty array
 *   of non-empty content arrays
 */
async function generateMultimodalEmbeddings(inputs, options = {}) {
  // Fail fast on malformed input. The most likely mistake is passing a single
  // flat content array (forgetting the outer []), which would otherwise be
  // wrapped as `{ content: {...} }` and sent to the API.
  if (!Array.isArray(inputs) || inputs.length === 0) {
    throw new TypeError('inputs must be a non-empty array of content arrays');
  }
  inputs.forEach((contentArray, index) => {
    if (!Array.isArray(contentArray) || contentArray.length === 0) {
      throw new TypeError(`inputs[${index}] must be a non-empty array of content items`);
    }
  });

  // Tolerate an explicit null as well as an omitted options argument.
  const opts = options ?? {};

  const body = {
    inputs: inputs.map((contentArray) => ({ content: contentArray })),
    model: opts.model || 'voyage-multimodal-3.5',
  };

  // Optional fields are only sent when the caller supplied them.
  if (opts.inputType) {
    body.input_type = opts.inputType;
  }
  if (opts.outputDimension) {
    body.output_dimension = opts.outputDimension;
  }

  return apiRequest('/multimodalembeddings', body);
}
227
+
198
228
  module.exports = {
199
229
  API_BASE,
200
230
  ATLAS_API_BASE,
@@ -204,4 +234,5 @@ module.exports = {
204
234
  requireApiKey,
205
235
  apiRequest,
206
236
  generateEmbeddings,
237
+ generateMultimodalEmbeddings,
207
238
  };
package/src/lib/input.js CHANGED
@@ -1,6 +1,7 @@
1
1
  'use strict';
2
2
 
3
3
  const fs = require('fs');
4
+ const path = require('path');
4
5
 
5
6
  /**
6
7
  * Read text input from argument, --file flag, or stdin.
@@ -37,4 +38,94 @@ async function resolveTextInput(textArg, filePath) {
37
38
  process.exit(1);
38
39
  }
39
40
 
40
- module.exports = { resolveTextInput };
/**
 * MIME type mappings for supported image formats.
 * Keys are lower-case file extensions including the leading dot.
 */
const IMAGE_MIME_TYPES = {
  '.jpg': 'image/jpeg',
  '.jpeg': 'image/jpeg',
  '.png': 'image/png',
  '.gif': 'image/gif',
  '.webp': 'image/webp',
  '.bmp': 'image/bmp',
  '.tiff': 'image/tiff',
  '.tif': 'image/tiff',
};

/**
 * MIME type mappings for supported video formats.
 * Keys are lower-case file extensions including the leading dot.
 */
const VIDEO_MIME_TYPES = {
  '.mp4': 'video/mp4',
  '.mpeg': 'video/mpeg',
  '.mpg': 'video/mpeg',
  '.mov': 'video/quicktime',
  '.avi': 'video/x-msvideo',
  '.mkv': 'video/x-matroska',
  '.webm': 'video/webm',
  '.flv': 'video/x-flv',
  '.wmv': 'video/x-ms-wmv',
};

/**
 * Lower-cased extension (with leading dot) of a path.
 * Returns '' for values that are not strings or have no extension, so the
 * public predicates below never throw on bad input.
 * @param {*} filePath
 * @returns {string}
 */
function getMediaExtension(filePath) {
  return typeof filePath === 'string' ? path.extname(filePath).toLowerCase() : '';
}

/**
 * Look up an extension in a MIME table, considering own keys only.
 * @param {Object<string, string>} table
 * @param {string} ext
 * @returns {string|undefined}
 */
function lookupMimeType(table, ext) {
  return Object.prototype.hasOwnProperty.call(table, ext) ? table[ext] : undefined;
}

/**
 * Check if a file path is a supported image format.
 * Only the extension is inspected; the file is not opened.
 * @param {string} filePath
 * @returns {boolean}
 */
function isImageFile(filePath) {
  return lookupMimeType(IMAGE_MIME_TYPES, getMediaExtension(filePath)) !== undefined;
}

/**
 * Check if a file path is a supported video format.
 * Only the extension is inspected; the file is not opened.
 * @param {string} filePath
 * @returns {boolean}
 */
function isVideoFile(filePath) {
  return lookupMimeType(VIDEO_MIME_TYPES, getMediaExtension(filePath)) !== undefined;
}

/**
 * Read a media file (image or video) and return it as a base64 data URL.
 * @param {string} filePath - Path to the media file
 * @returns {{ base64DataUrl: string, mimeType: string, sizeBytes: number }}
 * @throws {Error} If the extension is unsupported, the file does not exist,
 *   the path is a directory, or the read fails for another reason.
 */
function readMediaAsBase64(filePath) {
  const ext = getMediaExtension(filePath);
  const mimeType = lookupMimeType(IMAGE_MIME_TYPES, ext) || lookupMimeType(VIDEO_MIME_TYPES, ext);

  if (!mimeType) {
    const supported = [
      ...Object.keys(IMAGE_MIME_TYPES),
      ...Object.keys(VIDEO_MIME_TYPES),
    ].join(', ');
    throw new Error(
      `Unsupported media format "${ext}". Supported: ${supported}`
    );
  }

  // Read directly and translate the failure instead of checking existence
  // first: a separate existsSync() call can go stale before the read, and it
  // also reports true for directories.
  let buffer;
  try {
    buffer = fs.readFileSync(filePath);
  } catch (err) {
    if (err.code === 'ENOENT' || err.code === 'ENOTDIR') {
      throw new Error(`File not found: ${filePath}`, { cause: err });
    }
    if (err.code === 'EISDIR') {
      throw new Error(`Not a file: ${filePath}`, { cause: err });
    }
    throw err;
  }

  return {
    base64DataUrl: `data:${mimeType};base64,${buffer.toString('base64')}`,
    mimeType,
    sizeBytes: buffer.length,
  };
}
123
+
124
+ module.exports = {
125
+ resolveTextInput,
126
+ readMediaAsBase64,
127
+ isImageFile,
128
+ isVideoFile,
129
+ IMAGE_MIME_TYPES,
130
+ VIDEO_MIME_TYPES,
131
+ };
@@ -1258,6 +1258,9 @@ async function executeQuery(inputs, defaults) {
1258
1258
  },
1259
1259
  ]).toArray();
1260
1260
 
1261
+ // Track embed usage
1262
+ const _usage = [{ op: 'embed', model: embRes.model, tokens: embRes.usage?.total_tokens || 0 }];
1263
+
1261
1264
  // Rerank if requested and results exist
1262
1265
  if (doRerank && results.length > 0) {
1263
1266
  const documents = results.map(r => r.text || r.content || '');
@@ -1268,15 +1271,17 @@ async function executeQuery(inputs, defaults) {
1268
1271
  documents,
1269
1272
  });
1270
1273
 
1274
+ _usage.push({ op: 'rerank', model: rerankRes.model || inputs.rerankModel || DEFAULT_RERANK_MODEL, tokens: rerankRes.usage?.total_tokens || 0 });
1275
+
1271
1276
  const reranked = (rerankRes.data || []).map(r => ({
1272
1277
  ...results[r.index],
1273
1278
  score: r.relevance_score,
1274
1279
  }));
1275
1280
 
1276
- return { results: reranked, resultCount: reranked.length };
1281
+ return { results: reranked, resultCount: reranked.length, _usage };
1277
1282
  }
1278
1283
 
1279
- return { results, resultCount: results.length };
1284
+ return { results, resultCount: results.length, _usage };
1280
1285
  } finally {
1281
1286
  await client.close();
1282
1287
  }
@@ -1315,7 +1320,8 @@ async function executeRerank(inputs) {
1315
1320
  score: r.relevance_score,
1316
1321
  }));
1317
1322
 
1318
- return { results, resultCount: results.length };
1323
+ const _usage = [{ op: 'rerank', model: res.model || model, tokens: res.usage?.total_tokens || 0 }];
1324
+ return { results, resultCount: results.length, _usage };
1319
1325
  }
1320
1326
 
1321
1327
  /**
@@ -1339,6 +1345,7 @@ async function executeEmbed(inputs, defaults) {
1339
1345
  embedding: res.data[0].embedding,
1340
1346
  model: res.model,
1341
1347
  dimensions: res.data[0].embedding.length,
1348
+ _usage: [{ op: 'embed', model: res.model, tokens: res.usage?.total_tokens || 0 }],
1342
1349
  };
1343
1350
  }
1344
1351
 
@@ -1360,7 +1367,11 @@ async function executeSimilarity(inputs, defaults) {
1360
1367
  const res = await generateEmbeddings([text1, text2], opts);
1361
1368
  const similarity = cosineSimilarity(res.data[0].embedding, res.data[1].embedding);
1362
1369
 
1363
- return { similarity, model: res.model };
1370
+ return {
1371
+ similarity,
1372
+ model: res.model,
1373
+ _usage: [{ op: 'similarity', model: res.model, tokens: res.usage?.total_tokens || 0 }],
1374
+ };
1364
1375
  }
1365
1376
 
1366
1377
  /**
@@ -1441,6 +1452,7 @@ async function executeIngest(inputs, defaults) {
1441
1452
  source,
1442
1453
  model: embRes.model,
1443
1454
  indexCreated,
1455
+ _usage: [{ op: 'ingest', model: embRes.model, tokens: embRes.usage?.total_tokens || 0 }],
1444
1456
  };
1445
1457
  } finally {
1446
1458
  await client.close();
@@ -1585,14 +1597,20 @@ async function executeGenerate(inputs) {
1585
1597
 
1586
1598
  // Collect streaming response
1587
1599
  let text = '';
1600
+ let llmUsage = { inputTokens: 0, outputTokens: 0 };
1588
1601
  for await (const chunk of provider.chat(messages, { stream: true })) {
1589
- text += chunk;
1602
+ if (chunk && typeof chunk === 'object' && chunk.__usage) {
1603
+ llmUsage = chunk.__usage;
1604
+ } else {
1605
+ text += chunk;
1606
+ }
1590
1607
  }
1591
1608
 
1592
1609
  return {
1593
1610
  text,
1594
1611
  model: provider.model,
1595
1612
  provider: provider.name,
1613
+ _usage: [{ op: 'llm', model: provider.model, provider: provider.name, inputTokens: llmUsage.inputTokens, outputTokens: llmUsage.outputTokens }],
1596
1614
  };
1597
1615
  }
1598
1616
 
@@ -1906,14 +1924,22 @@ async function executeWorkflow(definition, opts = {}) {
1906
1924
  }
1907
1925
 
1908
1926
  const durationMs = Date.now() - stepStart;
1909
- context[stepId] = { output };
1910
1927
 
1928
+ // Pass full output (with _usage) to onStepComplete for cost tracking
1911
1929
  if (opts.onStepComplete) opts.onStepComplete(stepId, output, durationMs);
1912
1930
 
1931
+ // Strip _usage from context so downstream steps don't receive it
1932
+ let cleanOutput = output;
1933
+ if (output && output._usage) {
1934
+ cleanOutput = { ...output };
1935
+ delete cleanOutput._usage;
1936
+ }
1937
+ context[stepId] = { output: cleanOutput };
1938
+
1913
1939
  stepResults.push({
1914
1940
  id: stepId,
1915
1941
  tool: step.tool,
1916
- output,
1942
+ output: cleanOutput,
1917
1943
  durationMs,
1918
1944
  });
1919
1945
  } catch (err) {
@@ -227,6 +227,17 @@ const generateWorkflowSchema = {
227
227
  .describe('Explicit list of tools to include (e.g., ["query", "rerank", "generate"]). If omitted, tools are inferred from the description.'),
228
228
  };
229
229
 
/**
 * vai_multimodal_embed input schema.
 * All content fields are optional here; the handler is responsible for
 * requiring that at least one of text / image_base64 / video_base64 is set.
 */
const multimodalEmbedSchema = {
  text: z.string().max(32000).optional().describe('Optional text content to embed alongside media'),
  image_base64: z.string().optional().describe('Base64 data URL for an image (e.g., data:image/jpeg;base64,...)'),
  video_base64: z.string().optional().describe('Base64 data URL for a video (e.g., data:video/mp4;base64,...)'),
  model: z.string().default('voyage-multimodal-3.5').describe('Multimodal embedding model'),
  inputType: z.enum(['document', 'query']).optional()
    .describe('Whether this input is a document or a query (affects embedding)'),
  // Reject 0 and negative values at validation time: the handler's truthiness
  // check would silently drop 0, and a negative value would only fail at the API.
  outputDimension: z.number().int().positive().optional().describe('Output dimensions (256, 512, 1024, or 2048)'),
};
240
+
230
241
  /** vai_validate_workflow input schema */
231
242
  const validateWorkflowSchema = {
232
243
  workflow: z.object({
@@ -267,6 +278,7 @@ module.exports = {
267
278
  codeQuerySchema,
268
279
  codeFindSimilarSchema,
269
280
  codeStatusSchema,
281
+ multimodalEmbedSchema,
270
282
  generateWorkflowSchema,
271
283
  validateWorkflowSchema,
272
284
  };
@@ -1,6 +1,6 @@
1
1
  'use strict';
2
2
 
3
- const { generateEmbeddings } = require('../../lib/api');
3
+ const { generateEmbeddings, generateMultimodalEmbeddings } = require('../../lib/api');
4
4
  const { cosineSimilarity } = require('../../lib/math');
5
5
 
6
6
  /**
@@ -56,7 +56,69 @@ async function handleVaiSimilarity(input) {
56
56
  }
57
57
 
58
58
  /**
59
- * Register embedding tools: vai_embed, vai_similarity
59
+ * Handler for vai_multimodal_embed: embed text, images, and/or video.
60
+ * @param {object} input - Validated input matching multimodalEmbedSchema
61
+ * @returns {Promise<{structuredContent: object, content: Array}>}
62
+ */
async function handleVaiMultimodalEmbed(input) {
  const { text, image_base64, video_base64, model, inputType, outputDimension } = input;

  // Require at least one content type; report the problem as tool output
  // rather than throwing.
  if (!text && !image_base64 && !video_base64) {
    return {
      structuredContent: { error: 'No content provided' },
      content: [{ type: 'text', text: 'Error: At least one of text, image_base64, or video_base64 must be provided.' }],
    };
  }

  // Build the single content array sent to the API, plus a parallel list of
  // human-readable part names used in the summary. Order: text, image, video.
  const contentItems = [];
  const parts = [];

  if (text) {
    contentItems.push({ type: 'text', text });
    parts.push('text');
  }
  if (image_base64) {
    contentItems.push({ type: 'image_base64', image_base64 });
    parts.push('image');
  }
  if (video_base64) {
    contentItems.push({ type: 'video_base64', video_base64 });
    parts.push('video');
  }

  // Only forward optional settings the caller actually supplied.
  const mmOpts = { model };
  if (inputType) mmOpts.inputType = inputType;
  if (outputDimension) mmOpts.outputDimension = outputDimension;

  const start = Date.now();
  const result = await generateMultimodalEmbeddings([contentItems], mmOpts);
  const timeMs = Date.now() - start;

  // Guard against a response without an embedding instead of failing with a
  // TypeError on property access.
  const vector = result?.data?.[0]?.embedding;
  if (!Array.isArray(vector)) {
    return {
      structuredContent: { error: 'No embedding returned' },
      content: [{ type: 'text', text: 'Error: The API response did not contain an embedding.' }],
    };
  }

  const structured = {
    model,
    contentTypes: parts,
    vector,
    dimensions: vector.length,
    inputType: inputType || null,
    timeMs,
  };
  if (text) structured.textPreview = text.slice(0, 100) + (text.length > 100 ? '...' : '');

  // Show at most five values; mention the remainder only when there is one,
  // so short vectors never produce a negative "more" count.
  const previewValues = vector.slice(0, 5).map((v) => v.toFixed(4)).join(', ');
  const remaining = vector.length - 5;
  const tail = remaining > 0 ? `, ... ${remaining} more` : '';

  return {
    structuredContent: structured,
    content: [{
      type: 'text',
      text: `Multimodal embedding (${parts.join(' + ')}, ${vector.length} dimensions, model: ${model}, ${timeMs}ms). ` +
        `Vector: [${previewValues}${tail}]`,
    }],
  };
}
119
+
120
+ /**
121
+ * Register embedding tools: vai_embed, vai_similarity, vai_multimodal_embed
60
122
  * @param {import('@modelcontextprotocol/sdk/server/mcp.js').McpServer} server
61
123
  * @param {object} schemas
62
124
  */
@@ -74,6 +136,13 @@ function registerEmbeddingTools(server, schemas) {
74
136
  schemas.similaritySchema,
75
137
  handleVaiSimilarity
76
138
  );
139
+
140
+ server.tool(
141
+ 'vai_multimodal_embed',
142
+ 'Generate multimodal embeddings for text, images, and/or video using voyage-multimodal-3.5. Accepts base64 data URLs for media. At least one of text, image, or video must be provided. Supports combining multiple content types in a single embedding.',
143
+ schemas.multimodalEmbedSchema,
144
+ handleVaiMultimodalEmbed
145
+ );
77
146
  }
78
147
 
79
- module.exports = { registerEmbeddingTools, handleVaiEmbed, handleVaiSimilarity };
148
+ module.exports = { registerEmbeddingTools, handleVaiEmbed, handleVaiSimilarity, handleVaiMultimodalEmbed };