voyageai-cli 1.30.2 → 1.30.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/commands/embed.js +121 -2
- package/src/commands/playground.js +56 -3
- package/src/lib/api.js +31 -0
- package/src/lib/input.js +92 -1
- package/src/lib/workflow.js +33 -7
- package/src/mcp/schemas/index.js +12 -0
- package/src/mcp/tools/embedding.js +72 -3
- package/src/playground/index.html +614 -82
package/package.json
CHANGED
package/src/commands/embed.js
CHANGED
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
3
|
const { getDefaultModel } = require('../lib/catalog');
|
|
4
|
-
const { generateEmbeddings } = require('../lib/api');
|
|
5
|
-
const { resolveTextInput } = require('../lib/input');
|
|
4
|
+
const { generateEmbeddings, generateMultimodalEmbeddings } = require('../lib/api');
|
|
5
|
+
const { resolveTextInput, readMediaAsBase64, isImageFile, isVideoFile } = require('../lib/input');
|
|
6
6
|
const ui = require('../lib/ui');
|
|
7
7
|
const { showCostSummary } = require('../lib/cost-display');
|
|
8
8
|
|
|
9
|
+
const MULTIMODAL_MODEL = 'voyage-multimodal-3.5';
|
|
10
|
+
|
|
9
11
|
/**
|
|
10
12
|
* Register the embed command on a Commander program.
|
|
11
13
|
* @param {import('commander').Command} program
|
|
@@ -18,6 +20,8 @@ function registerEmbed(program) {
|
|
|
18
20
|
.option('-t, --input-type <type>', 'Input type: query or document')
|
|
19
21
|
.option('-d, --dimensions <n>', 'Output dimensions', (v) => parseInt(v, 10))
|
|
20
22
|
.option('-f, --file <path>', 'Read text from file')
|
|
23
|
+
.option('--image <path>', 'Embed an image file (uses voyage-multimodal-3.5)')
|
|
24
|
+
.option('--video <path>', 'Embed a video file (uses voyage-multimodal-3.5)')
|
|
21
25
|
.option('--truncation', 'Enable truncation for long inputs')
|
|
22
26
|
.option('--no-truncation', 'Disable truncation')
|
|
23
27
|
.option('--output-dtype <type>', 'Output data type: float, int8, uint8, binary, ubinary', 'float')
|
|
@@ -28,6 +32,121 @@ function registerEmbed(program) {
|
|
|
28
32
|
.action(async (text, opts) => {
|
|
29
33
|
try {
|
|
30
34
|
const telemetry = require('../lib/telemetry');
|
|
35
|
+
const isMultimodal = !!(opts.image || opts.video);
|
|
36
|
+
|
|
37
|
+
// Validate: --image/--video are incompatible with --file
|
|
38
|
+
if (isMultimodal && opts.file) {
|
|
39
|
+
console.error(ui.error('Cannot combine --image or --video with --file. Use --image/--video for multimodal, or --file for text.'));
|
|
40
|
+
process.exit(1);
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
// Multimodal path: --image and/or --video
|
|
44
|
+
if (isMultimodal) {
|
|
45
|
+
const model = opts.model === getDefaultModel() ? MULTIMODAL_MODEL : opts.model;
|
|
46
|
+
const useColor = !opts.json;
|
|
47
|
+
const useSpinner = useColor && !opts.quiet;
|
|
48
|
+
|
|
49
|
+
// Build content array
|
|
50
|
+
const contentItems = [];
|
|
51
|
+
const mediaMeta = [];
|
|
52
|
+
|
|
53
|
+
// Add text if provided
|
|
54
|
+
if (text) {
|
|
55
|
+
contentItems.push({ type: 'text', text });
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
// Add image
|
|
59
|
+
if (opts.image) {
|
|
60
|
+
if (!isImageFile(opts.image)) {
|
|
61
|
+
console.error(ui.error(`Not a supported image format: ${opts.image}`));
|
|
62
|
+
process.exit(1);
|
|
63
|
+
}
|
|
64
|
+
const media = readMediaAsBase64(opts.image);
|
|
65
|
+
contentItems.push({ type: 'image_base64', image_base64: media.base64DataUrl });
|
|
66
|
+
mediaMeta.push({ type: 'image', path: opts.image, mime: media.mimeType, size: media.sizeBytes });
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
// Add video
|
|
70
|
+
if (opts.video) {
|
|
71
|
+
if (!isVideoFile(opts.video)) {
|
|
72
|
+
console.error(ui.error(`Not a supported video format: ${opts.video}`));
|
|
73
|
+
process.exit(1);
|
|
74
|
+
}
|
|
75
|
+
const media = readMediaAsBase64(opts.video);
|
|
76
|
+
contentItems.push({ type: 'video_base64', video_base64: media.base64DataUrl });
|
|
77
|
+
mediaMeta.push({ type: 'video', path: opts.video, mime: media.mimeType, size: media.sizeBytes });
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
if (contentItems.length === 0) {
|
|
81
|
+
console.error(ui.error('No content provided. Pass text, --image, or --video.'));
|
|
82
|
+
process.exit(1);
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
const done = telemetry.timer('cli_embed', {
|
|
86
|
+
model,
|
|
87
|
+
multimodal: true,
|
|
88
|
+
hasText: !!text,
|
|
89
|
+
hasImage: !!opts.image,
|
|
90
|
+
hasVideo: !!opts.video,
|
|
91
|
+
});
|
|
92
|
+
|
|
93
|
+
let spin;
|
|
94
|
+
if (useSpinner) {
|
|
95
|
+
spin = ui.spinner('Generating multimodal embeddings...');
|
|
96
|
+
spin.start();
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
const mmOpts = { model };
|
|
100
|
+
if (opts.inputType) mmOpts.inputType = opts.inputType;
|
|
101
|
+
if (opts.dimensions) mmOpts.outputDimension = opts.dimensions;
|
|
102
|
+
|
|
103
|
+
const result = await generateMultimodalEmbeddings([contentItems], mmOpts);
|
|
104
|
+
|
|
105
|
+
if (spin) spin.stop();
|
|
106
|
+
|
|
107
|
+
if (opts.outputFormat === 'array') {
|
|
108
|
+
console.log(JSON.stringify(result.data[0].embedding));
|
|
109
|
+
return;
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
if (opts.json) {
|
|
113
|
+
console.log(JSON.stringify(result, null, 2));
|
|
114
|
+
return;
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
// Friendly output
|
|
118
|
+
if (!opts.quiet) {
|
|
119
|
+
console.log(ui.label('Model', ui.cyan(model)));
|
|
120
|
+
console.log(ui.label('Mode', ui.cyan('multimodal')));
|
|
121
|
+
for (const m of mediaMeta) {
|
|
122
|
+
const sizeStr = m.size < 1024 * 1024
|
|
123
|
+
? `${(m.size / 1024).toFixed(1)} KB`
|
|
124
|
+
: `${(m.size / (1024 * 1024)).toFixed(1)} MB`;
|
|
125
|
+
console.log(ui.label(m.type === 'image' ? 'Image' : 'Video', `${m.path} ${ui.dim(`(${m.mime}, ${sizeStr})`)}`));
|
|
126
|
+
}
|
|
127
|
+
if (text) {
|
|
128
|
+
console.log(ui.label('Text', ui.dim(text.slice(0, 80) + (text.length > 80 ? '...' : ''))));
|
|
129
|
+
}
|
|
130
|
+
if (result.usage) {
|
|
131
|
+
console.log(ui.label('Tokens', ui.dim(String(result.usage.total_tokens))));
|
|
132
|
+
}
|
|
133
|
+
const dims = result.data[0]?.embedding?.length || 'N/A';
|
|
134
|
+
console.log(ui.label('Dimensions', ui.bold(String(dims))));
|
|
135
|
+
console.log('');
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
const vector = result.data[0].embedding;
|
|
139
|
+
const preview = vector.slice(0, 5).map(v => v.toFixed(6)).join(', ');
|
|
140
|
+
console.log(`[${preview}, ...] (${vector.length} dims)`);
|
|
141
|
+
|
|
142
|
+
console.log('');
|
|
143
|
+
console.log(ui.success('Multimodal embedding generated'));
|
|
144
|
+
|
|
145
|
+
done({ dimensions: result.data[0]?.embedding?.length });
|
|
146
|
+
return;
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
// Standard text embedding path
|
|
31
150
|
const texts = await resolveTextInput(text, opts.file);
|
|
32
151
|
|
|
33
152
|
// --estimate: show cost comparison, optionally switch model
|
|
@@ -1406,9 +1406,58 @@ function createPlaygroundServer() {
|
|
|
1406
1406
|
res.end(JSON.stringify({ error: 'inputs must be a non-empty array' }));
|
|
1407
1407
|
return;
|
|
1408
1408
|
}
|
|
1409
|
+
|
|
1410
|
+
// Optimize video inputs: downsample to 1fps to fit within 32k token context
|
|
1411
|
+
const os = require('os');
|
|
1412
|
+
const path = require('path');
|
|
1413
|
+
const fs = require('fs');
|
|
1414
|
+
const { execFileSync } = require('child_process');
|
|
1415
|
+
const optimizedInputs = [];
|
|
1416
|
+
for (const input of inputs) {
|
|
1417
|
+
const content = input.content;
|
|
1418
|
+
if (content && Array.isArray(content)) {
|
|
1419
|
+
const optimizedContent = [];
|
|
1420
|
+
for (const item of content) {
|
|
1421
|
+
if (item.type === 'video_base64' && item.video_base64) {
|
|
1422
|
+
// Downsample video to 1fps using ffmpeg to reduce token count
|
|
1423
|
+
try {
|
|
1424
|
+
const b64 = item.video_base64.replace(/^data:[^;]+;base64,/, '');
|
|
1425
|
+
const tmpIn = path.join(os.tmpdir(), `vai_vid_in_${Date.now()}.mp4`);
|
|
1426
|
+
const tmpOut = path.join(os.tmpdir(), `vai_vid_out_${Date.now()}.mp4`);
|
|
1427
|
+
fs.writeFileSync(tmpIn, Buffer.from(b64, 'base64'));
|
|
1428
|
+
try {
|
|
1429
|
+
execFileSync('ffmpeg', [
|
|
1430
|
+
'-y', '-i', tmpIn,
|
|
1431
|
+
'-vf', 'fps=1',
|
|
1432
|
+
'-c:v', 'libx264', '-preset', 'fast', '-crf', '23',
|
|
1433
|
+
'-an', // strip audio
|
|
1434
|
+
tmpOut
|
|
1435
|
+
], { timeout: 30000, stdio: 'pipe' });
|
|
1436
|
+
const optimizedBuf = fs.readFileSync(tmpOut);
|
|
1437
|
+
const optimizedB64 = `data:video/mp4;base64,${optimizedBuf.toString('base64')}`;
|
|
1438
|
+
optimizedContent.push({ type: 'video_base64', video_base64: optimizedB64 });
|
|
1439
|
+
} finally {
|
|
1440
|
+
try { fs.unlinkSync(tmpIn); } catch (_) {}
|
|
1441
|
+
try { fs.unlinkSync(tmpOut); } catch (_) {}
|
|
1442
|
+
}
|
|
1443
|
+
} catch (err) {
|
|
1444
|
+
// If optimization fails, send original and let API error naturally
|
|
1445
|
+
console.warn('[Playground] Video optimization failed:', err.message);
|
|
1446
|
+
optimizedContent.push(item);
|
|
1447
|
+
}
|
|
1448
|
+
} else {
|
|
1449
|
+
optimizedContent.push(item);
|
|
1450
|
+
}
|
|
1451
|
+
}
|
|
1452
|
+
optimizedInputs.push({ ...input, content: optimizedContent });
|
|
1453
|
+
} else {
|
|
1454
|
+
optimizedInputs.push(input);
|
|
1455
|
+
}
|
|
1456
|
+
}
|
|
1457
|
+
|
|
1409
1458
|
const { apiRequest } = require('../lib/api');
|
|
1410
1459
|
const mmBody = {
|
|
1411
|
-
inputs,
|
|
1460
|
+
inputs: optimizedInputs,
|
|
1412
1461
|
model: model || 'voyage-multimodal-3.5',
|
|
1413
1462
|
};
|
|
1414
1463
|
if (input_type) mmBody.input_type = input_type;
|
|
@@ -1590,9 +1639,13 @@ function createPlaygroundServer() {
|
|
|
1590
1639
|
else if (output.text) summary = output.text.slice(0, 100) + (output.text.length > 100 ? '...' : '');
|
|
1591
1640
|
else summary = JSON.stringify(output).slice(0, 200);
|
|
1592
1641
|
}
|
|
1642
|
+
// Extract usage data for cost tracking (then strip from output payload)
|
|
1643
|
+
const _usage = (output && output._usage) ? output._usage : undefined;
|
|
1644
|
+
const cleanOutput = _usage ? { ...output } : output;
|
|
1645
|
+
if (cleanOutput && cleanOutput._usage) delete cleanOutput._usage;
|
|
1593
1646
|
res.write(`event: step_complete\ndata: ${JSON.stringify({
|
|
1594
|
-
stepId, timeMs, summary,
|
|
1595
|
-
output: JSON.stringify(
|
|
1647
|
+
stepId, timeMs, summary, _usage,
|
|
1648
|
+
output: JSON.stringify(cleanOutput).length < 5000 ? cleanOutput : { _truncated: true, summary },
|
|
1596
1649
|
})}\n\n`);
|
|
1597
1650
|
},
|
|
1598
1651
|
onStepSkip: (stepId, reason) => {
|
package/src/lib/api.js
CHANGED
|
@@ -195,6 +195,36 @@ async function generateEmbeddings(texts, options = {}) {
|
|
|
195
195
|
return apiRequest('/embeddings', body);
|
|
196
196
|
}
|
|
197
197
|
|
|
198
|
+
/**
 * Generate multimodal embeddings for inputs containing text, images, and/or video.
 * Posts to the /multimodalembeddings endpoint, whose request body wraps every
 * content array as `{ content: [...] }` instead of taking plain strings.
 *
 * @param {Array<Array<{type: string, text?: string, image_base64?: string, video_base64?: string}>>} inputs
 *   Non-empty array of content arrays. Each content array is the non-empty list
 *   of content items that make up ONE input (and therefore one embedding).
 *   Example: [[{type: 'text', text: 'hello'}, {type: 'image_base64', image_base64: 'data:image/png;base64,...'}]]
 * @param {object} [options]
 * @param {string} [options.model] - Model name (default: voyage-multimodal-3.5)
 * @param {string} [options.inputType] - Input type (query|document); sent as `input_type`
 * @param {number} [options.outputDimension] - Output dimensions; sent as `output_dimension`
 * @returns {Promise<object>} Whatever apiRequest resolves with for this endpoint
 * @throws {TypeError} (as a rejection) when `inputs` is not a non-empty array
 *   of non-empty arrays
 */
async function generateMultimodalEmbeddings(inputs, options = {}) {
  // Fail fast with a readable message instead of an opaque ".map is not a
  // function" or a wasted round-trip to the API with an empty payload.
  if (!Array.isArray(inputs) || inputs.length === 0) {
    throw new TypeError('inputs must be a non-empty array of content arrays');
  }
  inputs.forEach((contentArray, index) => {
    if (!Array.isArray(contentArray) || contentArray.length === 0) {
      throw new TypeError(`inputs[${index}] must be a non-empty array of content items`);
    }
  });

  // The default parameter only covers `undefined`; tolerate an explicit null too.
  const { model, inputType, outputDimension } = options ?? {};

  const body = {
    inputs: inputs.map((contentArray) => ({ content: contentArray })),
    model: model || 'voyage-multimodal-3.5',
  };

  // Optional fields are only attached when set so the API applies its own defaults.
  if (inputType) {
    body.input_type = inputType;
  }
  if (outputDimension) {
    body.output_dimension = outputDimension;
  }

  return apiRequest('/multimodalembeddings', body);
}
|
|
227
|
+
|
|
198
228
|
module.exports = {
|
|
199
229
|
API_BASE,
|
|
200
230
|
ATLAS_API_BASE,
|
|
@@ -204,4 +234,5 @@ module.exports = {
|
|
|
204
234
|
requireApiKey,
|
|
205
235
|
apiRequest,
|
|
206
236
|
generateEmbeddings,
|
|
237
|
+
generateMultimodalEmbeddings,
|
|
207
238
|
};
|
package/src/lib/input.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
3
|
const fs = require('fs');
|
|
4
|
+
const path = require('path');
|
|
4
5
|
|
|
5
6
|
/**
|
|
6
7
|
* Read text input from argument, --file flag, or stdin.
|
|
@@ -37,4 +38,94 @@ async function resolveTextInput(textArg, filePath) {
|
|
|
37
38
|
process.exit(1);
|
|
38
39
|
}
|
|
39
40
|
|
|
40
|
-
|
|
41
|
+
/**
 * MIME type mappings for supported image formats.
 * Keys are lower-case file extensions including the leading dot.
 */
const IMAGE_MIME_TYPES = {
  '.jpg': 'image/jpeg',
  '.jpeg': 'image/jpeg',
  '.png': 'image/png',
  '.gif': 'image/gif',
  '.webp': 'image/webp',
  '.bmp': 'image/bmp',
  '.tiff': 'image/tiff',
  '.tif': 'image/tiff',
};

/**
 * MIME type mappings for supported video formats.
 * Keys are lower-case file extensions including the leading dot.
 */
const VIDEO_MIME_TYPES = {
  '.mp4': 'video/mp4',
  '.mpeg': 'video/mpeg',
  '.mpg': 'video/mpeg',
  '.mov': 'video/quicktime',
  '.avi': 'video/x-msvideo',
  '.mkv': 'video/x-matroska',
  '.webm': 'video/webm',
  '.flv': 'video/x-flv',
  '.wmv': 'video/x-ms-wmv',
};

/**
 * Check if a file path is a supported image format.
 * The decision is made from the extension only (case-insensitive); the file
 * itself is not opened.
 * @param {string} filePath
 * @returns {boolean} false for unsupported extensions and for non-string input
 */
function isImageFile(filePath) {
  // path.extname throws on non-strings; a predicate should simply answer "no".
  if (typeof filePath !== 'string') return false;
  const ext = path.extname(filePath).toLowerCase();
  // Own-property check so nothing inherited from Object.prototype can match.
  return Object.prototype.hasOwnProperty.call(IMAGE_MIME_TYPES, ext);
}

/**
 * Check if a file path is a supported video format.
 * The decision is made from the extension only (case-insensitive); the file
 * itself is not opened.
 * @param {string} filePath
 * @returns {boolean} false for unsupported extensions and for non-string input
 */
function isVideoFile(filePath) {
  if (typeof filePath !== 'string') return false;
  const ext = path.extname(filePath).toLowerCase();
  return Object.prototype.hasOwnProperty.call(VIDEO_MIME_TYPES, ext);
}

/**
 * Read a media file (image or video) and return it as a base64 data URL.
 * The MIME type is derived from the file extension, not from the file contents.
 * @param {string} filePath - Path to the media file
 * @returns {{ base64DataUrl: string, mimeType: string, sizeBytes: number }}
 *   `sizeBytes` is the size of the raw file, not of the base64 text.
 * @throws {Error} If the extension is unsupported, the path does not exist,
 *   or the path is not a regular file (e.g. a directory)
 */
function readMediaAsBase64(filePath) {
  const ext = path.extname(filePath).toLowerCase();
  const mimeType = IMAGE_MIME_TYPES[ext] || VIDEO_MIME_TYPES[ext];

  if (!mimeType) {
    const supported = [
      ...Object.keys(IMAGE_MIME_TYPES),
      ...Object.keys(VIDEO_MIME_TYPES),
    ].join(', ');
    throw new Error(
      `Unsupported media format "${ext}". Supported: ${supported}`
    );
  }

  if (!fs.existsSync(filePath)) {
    throw new Error(`File not found: ${filePath}`);
  }

  // existsSync is also true for directories; without this check the read
  // below would surface a raw EISDIR error to the user.
  if (!fs.statSync(filePath).isFile()) {
    throw new Error(`Not a regular file: ${filePath}`);
  }

  const buffer = fs.readFileSync(filePath);
  const base64DataUrl = `data:${mimeType};base64,${buffer.toString('base64')}`;

  return {
    base64DataUrl,
    mimeType,
    sizeBytes: buffer.length,
  };
}
|
|
123
|
+
|
|
124
|
+
module.exports = {
|
|
125
|
+
resolveTextInput,
|
|
126
|
+
readMediaAsBase64,
|
|
127
|
+
isImageFile,
|
|
128
|
+
isVideoFile,
|
|
129
|
+
IMAGE_MIME_TYPES,
|
|
130
|
+
VIDEO_MIME_TYPES,
|
|
131
|
+
};
|
package/src/lib/workflow.js
CHANGED
|
@@ -1258,6 +1258,9 @@ async function executeQuery(inputs, defaults) {
|
|
|
1258
1258
|
},
|
|
1259
1259
|
]).toArray();
|
|
1260
1260
|
|
|
1261
|
+
// Track embed usage
|
|
1262
|
+
const _usage = [{ op: 'embed', model: embRes.model, tokens: embRes.usage?.total_tokens || 0 }];
|
|
1263
|
+
|
|
1261
1264
|
// Rerank if requested and results exist
|
|
1262
1265
|
if (doRerank && results.length > 0) {
|
|
1263
1266
|
const documents = results.map(r => r.text || r.content || '');
|
|
@@ -1268,15 +1271,17 @@ async function executeQuery(inputs, defaults) {
|
|
|
1268
1271
|
documents,
|
|
1269
1272
|
});
|
|
1270
1273
|
|
|
1274
|
+
_usage.push({ op: 'rerank', model: rerankRes.model || inputs.rerankModel || DEFAULT_RERANK_MODEL, tokens: rerankRes.usage?.total_tokens || 0 });
|
|
1275
|
+
|
|
1271
1276
|
const reranked = (rerankRes.data || []).map(r => ({
|
|
1272
1277
|
...results[r.index],
|
|
1273
1278
|
score: r.relevance_score,
|
|
1274
1279
|
}));
|
|
1275
1280
|
|
|
1276
|
-
return { results: reranked, resultCount: reranked.length };
|
|
1281
|
+
return { results: reranked, resultCount: reranked.length, _usage };
|
|
1277
1282
|
}
|
|
1278
1283
|
|
|
1279
|
-
return { results, resultCount: results.length };
|
|
1284
|
+
return { results, resultCount: results.length, _usage };
|
|
1280
1285
|
} finally {
|
|
1281
1286
|
await client.close();
|
|
1282
1287
|
}
|
|
@@ -1315,7 +1320,8 @@ async function executeRerank(inputs) {
|
|
|
1315
1320
|
score: r.relevance_score,
|
|
1316
1321
|
}));
|
|
1317
1322
|
|
|
1318
|
-
|
|
1323
|
+
const _usage = [{ op: 'rerank', model: res.model || model, tokens: res.usage?.total_tokens || 0 }];
|
|
1324
|
+
return { results, resultCount: results.length, _usage };
|
|
1319
1325
|
}
|
|
1320
1326
|
|
|
1321
1327
|
/**
|
|
@@ -1339,6 +1345,7 @@ async function executeEmbed(inputs, defaults) {
|
|
|
1339
1345
|
embedding: res.data[0].embedding,
|
|
1340
1346
|
model: res.model,
|
|
1341
1347
|
dimensions: res.data[0].embedding.length,
|
|
1348
|
+
_usage: [{ op: 'embed', model: res.model, tokens: res.usage?.total_tokens || 0 }],
|
|
1342
1349
|
};
|
|
1343
1350
|
}
|
|
1344
1351
|
|
|
@@ -1360,7 +1367,11 @@ async function executeSimilarity(inputs, defaults) {
|
|
|
1360
1367
|
const res = await generateEmbeddings([text1, text2], opts);
|
|
1361
1368
|
const similarity = cosineSimilarity(res.data[0].embedding, res.data[1].embedding);
|
|
1362
1369
|
|
|
1363
|
-
return {
|
|
1370
|
+
return {
|
|
1371
|
+
similarity,
|
|
1372
|
+
model: res.model,
|
|
1373
|
+
_usage: [{ op: 'similarity', model: res.model, tokens: res.usage?.total_tokens || 0 }],
|
|
1374
|
+
};
|
|
1364
1375
|
}
|
|
1365
1376
|
|
|
1366
1377
|
/**
|
|
@@ -1441,6 +1452,7 @@ async function executeIngest(inputs, defaults) {
|
|
|
1441
1452
|
source,
|
|
1442
1453
|
model: embRes.model,
|
|
1443
1454
|
indexCreated,
|
|
1455
|
+
_usage: [{ op: 'ingest', model: embRes.model, tokens: embRes.usage?.total_tokens || 0 }],
|
|
1444
1456
|
};
|
|
1445
1457
|
} finally {
|
|
1446
1458
|
await client.close();
|
|
@@ -1585,14 +1597,20 @@ async function executeGenerate(inputs) {
|
|
|
1585
1597
|
|
|
1586
1598
|
// Collect streaming response
|
|
1587
1599
|
let text = '';
|
|
1600
|
+
let llmUsage = { inputTokens: 0, outputTokens: 0 };
|
|
1588
1601
|
for await (const chunk of provider.chat(messages, { stream: true })) {
|
|
1589
|
-
|
|
1602
|
+
if (chunk && typeof chunk === 'object' && chunk.__usage) {
|
|
1603
|
+
llmUsage = chunk.__usage;
|
|
1604
|
+
} else {
|
|
1605
|
+
text += chunk;
|
|
1606
|
+
}
|
|
1590
1607
|
}
|
|
1591
1608
|
|
|
1592
1609
|
return {
|
|
1593
1610
|
text,
|
|
1594
1611
|
model: provider.model,
|
|
1595
1612
|
provider: provider.name,
|
|
1613
|
+
_usage: [{ op: 'llm', model: provider.model, provider: provider.name, inputTokens: llmUsage.inputTokens, outputTokens: llmUsage.outputTokens }],
|
|
1596
1614
|
};
|
|
1597
1615
|
}
|
|
1598
1616
|
|
|
@@ -1906,14 +1924,22 @@ async function executeWorkflow(definition, opts = {}) {
|
|
|
1906
1924
|
}
|
|
1907
1925
|
|
|
1908
1926
|
const durationMs = Date.now() - stepStart;
|
|
1909
|
-
context[stepId] = { output };
|
|
1910
1927
|
|
|
1928
|
+
// Pass full output (with _usage) to onStepComplete for cost tracking
|
|
1911
1929
|
if (opts.onStepComplete) opts.onStepComplete(stepId, output, durationMs);
|
|
1912
1930
|
|
|
1931
|
+
// Strip _usage from context so downstream steps don't receive it
|
|
1932
|
+
let cleanOutput = output;
|
|
1933
|
+
if (output && output._usage) {
|
|
1934
|
+
cleanOutput = { ...output };
|
|
1935
|
+
delete cleanOutput._usage;
|
|
1936
|
+
}
|
|
1937
|
+
context[stepId] = { output: cleanOutput };
|
|
1938
|
+
|
|
1913
1939
|
stepResults.push({
|
|
1914
1940
|
id: stepId,
|
|
1915
1941
|
tool: step.tool,
|
|
1916
|
-
output,
|
|
1942
|
+
output: cleanOutput,
|
|
1917
1943
|
durationMs,
|
|
1918
1944
|
});
|
|
1919
1945
|
} catch (err) {
|
package/src/mcp/schemas/index.js
CHANGED
|
@@ -227,6 +227,17 @@ const generateWorkflowSchema = {
|
|
|
227
227
|
.describe('Explicit list of tools to include (e.g., ["query", "rerank", "generate"]). If omitted, tools are inferred from the description.'),
|
|
228
228
|
};
|
|
229
229
|
|
|
230
|
+
/**
 * vai_multimodal_embed input schema.
 *
 * Every content field (text, image_base64, video_base64) is optional here;
 * the "at least one must be present" rule is enforced by the tool handler,
 * not by this schema.
 */
const multimodalEmbedSchema = {
  text: z.string().max(32000).optional().describe('Optional text content to embed alongside media'),
  image_base64: z.string().optional().describe('Base64 data URL for an image (e.g., data:image/jpeg;base64,...)'),
  video_base64: z.string().optional().describe('Base64 data URL for a video (e.g., data:video/mp4;base64,...)'),
  model: z.string().default('voyage-multimodal-3.5').describe('Multimodal embedding model'),
  inputType: z.enum(['document', 'query']).optional()
    .describe('Whether this input is a document or a query (affects embedding)'),
  // Zero or a negative size can never be a valid dimension; reject it at
  // validation time instead of silently ignoring it or forwarding it to the API.
  outputDimension: z.number().int().positive().optional().describe('Output dimensions (256, 512, 1024, or 2048)'),
};
|
|
240
|
+
|
|
230
241
|
/** vai_validate_workflow input schema */
|
|
231
242
|
const validateWorkflowSchema = {
|
|
232
243
|
workflow: z.object({
|
|
@@ -267,6 +278,7 @@ module.exports = {
|
|
|
267
278
|
codeQuerySchema,
|
|
268
279
|
codeFindSimilarSchema,
|
|
269
280
|
codeStatusSchema,
|
|
281
|
+
multimodalEmbedSchema,
|
|
270
282
|
generateWorkflowSchema,
|
|
271
283
|
validateWorkflowSchema,
|
|
272
284
|
};
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
|
-
const { generateEmbeddings } = require('../../lib/api');
|
|
3
|
+
const { generateEmbeddings, generateMultimodalEmbeddings } = require('../../lib/api');
|
|
4
4
|
const { cosineSimilarity } = require('../../lib/math');
|
|
5
5
|
|
|
6
6
|
/**
|
|
@@ -56,7 +56,69 @@ async function handleVaiSimilarity(input) {
|
|
|
56
56
|
}
|
|
57
57
|
|
|
58
58
|
/**
 * Handler for vai_multimodal_embed: embed text, images, and/or video.
 *
 * All provided content types are combined into ONE input, so the call yields a
 * single embedding vector. Content items are sent in the fixed order
 * text, image, video.
 *
 * @param {object} input - Validated input matching multimodalEmbedSchema
 * @returns {Promise<{structuredContent: object, content: Array, isError?: boolean}>}
 *   On success, `structuredContent` carries the full vector and metadata and
 *   `content` a short human-readable summary. When no content was supplied the
 *   result describes the problem and is flagged with `isError: true`.
 * @throws {Error} (as a rejection) when the API response contains no embedding vector
 */
async function handleVaiMultimodalEmbed(input) {
  const { text, image_base64, video_base64, model, inputType, outputDimension } = input;

  // Require at least one content type
  if (!text && !image_base64 && !video_base64) {
    return {
      // Flag the result as a tool error so MCP clients do not treat it as success.
      isError: true,
      structuredContent: { error: 'No content provided' },
      content: [{ type: 'text', text: 'Error: At least one of text, image_base64, or video_base64 must be provided.' }],
    };
  }

  // Build the content array, tracking which kinds were included for reporting.
  const contentItems = [];
  const parts = [];

  if (text) {
    contentItems.push({ type: 'text', text });
    parts.push('text');
  }
  if (image_base64) {
    contentItems.push({ type: 'image_base64', image_base64 });
    parts.push('image');
  }
  if (video_base64) {
    contentItems.push({ type: 'video_base64', video_base64 });
    parts.push('video');
  }

  const start = Date.now();
  const mmOpts = { model };
  if (inputType) mmOpts.inputType = inputType;
  if (outputDimension) mmOpts.outputDimension = outputDimension;

  const result = await generateMultimodalEmbeddings([contentItems], mmOpts);
  const vector = result?.data?.[0]?.embedding;
  if (!Array.isArray(vector)) {
    // Surface a clear message instead of "Cannot read properties of undefined".
    throw new Error('Multimodal embedding response did not contain an embedding vector');
  }
  const timeMs = Date.now() - start;

  const structured = {
    model,
    contentTypes: parts,
    vector,
    dimensions: vector.length,
    inputType: inputType || null,
    timeMs,
  };
  if (text) structured.textPreview = text.slice(0, 100) + (text.length > 100 ? '...' : '');

  // Only the first few components are shown in the text summary; the full
  // vector is available in structuredContent.
  const preview = vector.slice(0, 5).map((v) => v.toFixed(4)).join(', ');
  const remaining = Math.max(vector.length - 5, 0);

  return {
    structuredContent: structured,
    content: [{
      type: 'text',
      text: `Multimodal embedding (${parts.join(' + ')}, ${vector.length} dimensions, model: ${model}, ${timeMs}ms). ` +
        `Vector: [${preview}, ... ${remaining} more]`,
    }],
  };
}
|
|
119
|
+
|
|
120
|
+
/**
|
|
121
|
+
* Register embedding tools: vai_embed, vai_similarity, vai_multimodal_embed
|
|
60
122
|
* @param {import('@modelcontextprotocol/sdk/server/mcp.js').McpServer} server
|
|
61
123
|
* @param {object} schemas
|
|
62
124
|
*/
|
|
@@ -74,6 +136,13 @@ function registerEmbeddingTools(server, schemas) {
|
|
|
74
136
|
schemas.similaritySchema,
|
|
75
137
|
handleVaiSimilarity
|
|
76
138
|
);
|
|
139
|
+
|
|
140
|
+
server.tool(
|
|
141
|
+
'vai_multimodal_embed',
|
|
142
|
+
'Generate multimodal embeddings for text, images, and/or video using voyage-multimodal-3.5. Accepts base64 data URLs for media. At least one of text, image, or video must be provided. Supports combining multiple content types in a single embedding.',
|
|
143
|
+
schemas.multimodalEmbedSchema,
|
|
144
|
+
handleVaiMultimodalEmbed
|
|
145
|
+
);
|
|
77
146
|
}
|
|
78
147
|
|
|
79
|
-
module.exports = { registerEmbeddingTools, handleVaiEmbed, handleVaiSimilarity };
|
|
148
|
+
module.exports = { registerEmbeddingTools, handleVaiEmbed, handleVaiSimilarity, handleVaiMultimodalEmbed };
|