npm - voyageai-cli - Versions diffs - 1.30.2 → 1.30.3 - Mend

voyageai-cli 1.30.2 → 1.30.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/package.json +1 -1
package/src/commands/embed.js +121 -2
package/src/commands/playground.js +56 -3
package/src/lib/api.js +31 -0
package/src/lib/input.js +92 -1
package/src/lib/workflow.js +33 -7
package/src/mcp/schemas/index.js +12 -0
package/src/mcp/tools/embedding.js +72 -3
package/src/playground/index.html +614 -82

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "voyageai-cli",
-  "version": "1.30.2",
+  "version": "1.30.3",
   "description": "CLI for Voyage AI embeddings, reranking, and MongoDB Atlas Vector Search",
   "bin": {
     "vai": "./src/cli.js"

package/src/commands/embed.js CHANGED Viewed

@@ -1,11 +1,13 @@
 'use strict';
 const { getDefaultModel } = require('../lib/catalog');
-const { generateEmbeddings } = require('../lib/api');
-const { resolveTextInput } = require('../lib/input');
+const { generateEmbeddings, generateMultimodalEmbeddings } = require('../lib/api');
+const { resolveTextInput, readMediaAsBase64, isImageFile, isVideoFile } = require('../lib/input');
 const ui = require('../lib/ui');
 const { showCostSummary } = require('../lib/cost-display');
+const MULTIMODAL_MODEL = 'voyage-multimodal-3.5';
 /**
  * Register the embed command on a Commander program.
  * @param {import('commander').Command} program
@@ -18,6 +20,8 @@ function registerEmbed(program) {
     .option('-t, --input-type <type>', 'Input type: query or document')
     .option('-d, --dimensions <n>', 'Output dimensions', (v) => parseInt(v, 10))
     .option('-f, --file <path>', 'Read text from file')
+    .option('--image <path>', 'Embed an image file (uses voyage-multimodal-3.5)')
+    .option('--video <path>', 'Embed a video file (uses voyage-multimodal-3.5)')
     .option('--truncation', 'Enable truncation for long inputs')
     .option('--no-truncation', 'Disable truncation')
     .option('--output-dtype <type>', 'Output data type: float, int8, uint8, binary, ubinary', 'float')
@@ -28,6 +32,121 @@ function registerEmbed(program) {
     .action(async (text, opts) => {
       try {
         const telemetry = require('../lib/telemetry');
+        const isMultimodal = !!(opts.image || opts.video);
+        // Validate: --image/--video are incompatible with --file
+        if (isMultimodal && opts.file) {
+          console.error(ui.error('Cannot combine --image or --video with --file. Use --image/--video for multimodal, or --file for text.'));
+          process.exit(1);
+        }
+        // Multimodal path: --image and/or --video
+        if (isMultimodal) {
+          const model = opts.model === getDefaultModel() ? MULTIMODAL_MODEL : opts.model;
+          const useColor = !opts.json;
+          const useSpinner = useColor && !opts.quiet;
+          // Build content array
+          const contentItems = [];
+          const mediaMeta = [];
+          // Add text if provided
+          if (text) {
+            contentItems.push({ type: 'text', text });
+          }
+          // Add image
+          if (opts.image) {
+            if (!isImageFile(opts.image)) {
+              console.error(ui.error(`Not a supported image format: ${opts.image}`));
+              process.exit(1);
+            }
+            const media = readMediaAsBase64(opts.image);
+            contentItems.push({ type: 'image_base64', image_base64: media.base64DataUrl });
+            mediaMeta.push({ type: 'image', path: opts.image, mime: media.mimeType, size: media.sizeBytes });
+          }
+          // Add video
+          if (opts.video) {
+            if (!isVideoFile(opts.video)) {
+              console.error(ui.error(`Not a supported video format: ${opts.video}`));
+              process.exit(1);
+            }
+            const media = readMediaAsBase64(opts.video);
+            contentItems.push({ type: 'video_base64', video_base64: media.base64DataUrl });
+            mediaMeta.push({ type: 'video', path: opts.video, mime: media.mimeType, size: media.sizeBytes });
+          }
+          if (contentItems.length === 0) {
+            console.error(ui.error('No content provided. Pass text, --image, or --video.'));
+            process.exit(1);
+          }
+          const done = telemetry.timer('cli_embed', {
+            model,
+            multimodal: true,
+            hasText: !!text,
+            hasImage: !!opts.image,
+            hasVideo: !!opts.video,
+          });
+          let spin;
+          if (useSpinner) {
+            spin = ui.spinner('Generating multimodal embeddings...');
+            spin.start();
+          }
+          const mmOpts = { model };
+          if (opts.inputType) mmOpts.inputType = opts.inputType;
+          if (opts.dimensions) mmOpts.outputDimension = opts.dimensions;
+          const result = await generateMultimodalEmbeddings([contentItems], mmOpts);
+          if (spin) spin.stop();
+          if (opts.outputFormat === 'array') {
+            console.log(JSON.stringify(result.data[0].embedding));
+            return;
+          }
+          if (opts.json) {
+            console.log(JSON.stringify(result, null, 2));
+            return;
+          }
+          // Friendly output
+          if (!opts.quiet) {
+            console.log(ui.label('Model', ui.cyan(model)));
+            console.log(ui.label('Mode', ui.cyan('multimodal')));
+            for (const m of mediaMeta) {
+              const sizeStr = m.size < 1024 * 1024
+                ? `${(m.size / 1024).toFixed(1)} KB`
+                : `${(m.size / (1024 * 1024)).toFixed(1)} MB`;
+              console.log(ui.label(m.type === 'image' ? 'Image' : 'Video', `${m.path} ${ui.dim(`(${m.mime}, ${sizeStr})`)}`));
+            }
+            if (text) {
+              console.log(ui.label('Text', ui.dim(text.slice(0, 80) + (text.length > 80 ? '...' : ''))));
+            }
+            if (result.usage) {
+              console.log(ui.label('Tokens', ui.dim(String(result.usage.total_tokens))));
+            }
+            const dims = result.data[0]?.embedding?.length || 'N/A';
+            console.log(ui.label('Dimensions', ui.bold(String(dims))));
+            console.log('');
+          }
+          const vector = result.data[0].embedding;
+          const preview = vector.slice(0, 5).map(v => v.toFixed(6)).join(', ');
+          console.log(`[${preview}, ...] (${vector.length} dims)`);
+          console.log('');
+          console.log(ui.success('Multimodal embedding generated'));
+          done({ dimensions: result.data[0]?.embedding?.length });
+          return;
+        }
+        // Standard text embedding path
         const texts = await resolveTextInput(text, opts.file);
         // --estimate: show cost comparison, optionally switch model

package/src/commands/playground.js CHANGED Viewed

@@ -1406,9 +1406,58 @@ function createPlaygroundServer() {
             res.end(JSON.stringify({ error: 'inputs must be a non-empty array' }));
             return;
           }
+          // Optimize video inputs: downsample to 1fps to fit within 32k token context
+          const os = require('os');
+          const path = require('path');
+          const fs = require('fs');
+          const { execFileSync } = require('child_process');
+          const optimizedInputs = [];
+          for (const input of inputs) {
+            const content = input.content;
+            if (content && Array.isArray(content)) {
+              const optimizedContent = [];
+              for (const item of content) {
+                if (item.type === 'video_base64' && item.video_base64) {
+                  // Downsample video to 1fps using ffmpeg to reduce token count
+                  try {
+                    const b64 = item.video_base64.replace(/^data:[^;]+;base64,/, '');
+                    const tmpIn = path.join(os.tmpdir(), `vai_vid_in_${Date.now()}.mp4`);
+                    const tmpOut = path.join(os.tmpdir(), `vai_vid_out_${Date.now()}.mp4`);
+                    fs.writeFileSync(tmpIn, Buffer.from(b64, 'base64'));
+                    try {
+                      execFileSync('ffmpeg', [
+                        '-y', '-i', tmpIn,
+                        '-vf', 'fps=1',
+                        '-c:v', 'libx264', '-preset', 'fast', '-crf', '23',
+                        '-an',  // strip audio
+                        tmpOut
+                      ], { timeout: 30000, stdio: 'pipe' });
+                      const optimizedBuf = fs.readFileSync(tmpOut);
+                      const optimizedB64 = `data:video/mp4;base64,${optimizedBuf.toString('base64')}`;
+                      optimizedContent.push({ type: 'video_base64', video_base64: optimizedB64 });
+                    } finally {
+                      try { fs.unlinkSync(tmpIn); } catch (_) {}
+                      try { fs.unlinkSync(tmpOut); } catch (_) {}
+                    }
+                  } catch (err) {
+                    // If optimization fails, send original and let API error naturally
+                    console.warn('[Playground] Video optimization failed:', err.message);
+                    optimizedContent.push(item);
+                  }
+                } else {
+                  optimizedContent.push(item);
+                }
+              }
+              optimizedInputs.push({ ...input, content: optimizedContent });
+            } else {
+              optimizedInputs.push(input);
+            }
+          }
           const { apiRequest } = require('../lib/api');
           const mmBody = {
-            inputs,
+            inputs: optimizedInputs,
             model: model || 'voyage-multimodal-3.5',
           };
           if (input_type) mmBody.input_type = input_type;
@@ -1590,9 +1639,13 @@ function createPlaygroundServer() {
                   else if (output.text) summary = output.text.slice(0, 100) + (output.text.length > 100 ? '...' : '');
                   else summary = JSON.stringify(output).slice(0, 200);
                 }
+                // Extract usage data for cost tracking (then strip from output payload)
+                const _usage = (output && output._usage) ? output._usage : undefined;
+                const cleanOutput = _usage ? { ...output } : output;
+                if (cleanOutput && cleanOutput._usage) delete cleanOutput._usage;
                 res.write(`event: step_complete\ndata: ${JSON.stringify({
-                  stepId, timeMs, summary,
-                  output: JSON.stringify(output).length < 5000 ? output : { _truncated: true, summary },
+                  stepId, timeMs, summary, _usage,
+                  output: JSON.stringify(cleanOutput).length < 5000 ? cleanOutput : { _truncated: true, summary },
                 })}\n\n`);
               },
               onStepSkip: (stepId, reason) => {

package/src/lib/api.js CHANGED Viewed

@@ -195,6 +195,36 @@ async function generateEmbeddings(texts, options = {}) {
   return apiRequest('/embeddings', body);
 }
+/**
+ * Generate multimodal embeddings for inputs containing text, images, and/or video.
+ * Posts to the /multimodalembeddings endpoint via apiRequest. Each content array
+ * is wrapped as `{ content: [...] }` inside the request body's `inputs` list.
+ * @param {Array<Array<{type: string, text?: string, image_base64?: string, video_base64?: string}>>} inputs
+ *   Array of content arrays. Each content array is a list of content items for one input.
+ *   Example: [[{type: 'text', text: 'hello'}, {type: 'image_base64', image_base64: 'data:image/png;base64,...'}]]
+ *   Must be an array: `.map` is called on it without a prior check.
+ * @param {object} options
+ * @param {string} [options.model] - Model name (default: voyage-multimodal-3.5)
+ * @param {string} [options.inputType] - Input type (query|document); sent as `input_type` only when truthy
+ * @param {number} [options.outputDimension] - Output dimensions; sent as `output_dimension` only when truthy
+ * @returns {Promise<object>} Whatever apiRequest resolves with. Callers in this package read
+ *   `data[0].embedding` and `usage.total_tokens` from it.
+ */
+async function generateMultimodalEmbeddings(inputs, options = {}) {
+  const model = options.model || 'voyage-multimodal-3.5'; // any falsy model (undefined, '') falls back to the default
+  const body = {
+    inputs: inputs.map(contentArray => ({ content: contentArray })), // one { content } object per input
+    model,
+  };
+  if (options.inputType) {
+    body.input_type = options.inputType; // camelCase option -> snake_case request field
+  }
+  if (options.outputDimension) {
+    body.output_dimension = options.outputDimension; // left out of the request when 0 or undefined
+  }
+  return apiRequest('/multimodalembeddings', body);
+}
 module.exports = {
   API_BASE,
   ATLAS_API_BASE,
@@ -204,4 +234,5 @@ module.exports = {
   requireApiKey,
   apiRequest,
   generateEmbeddings,
+  generateMultimodalEmbeddings,
 };

package/src/lib/input.js CHANGED Viewed

@@ -1,6 +1,7 @@
 'use strict';
 const fs = require('fs');
+const path = require('path');
 /**
  * Read text input from argument, --file flag, or stdin.
@@ -37,4 +38,94 @@ async function resolveTextInput(textArg, filePath) {
   process.exit(1);
 }
-module.exports = { resolveTextInput };
+/**
+ * MIME type mappings for supported image formats.
+ * Keys are lowercase file extensions including the leading dot; values are the
+ * MIME types that readMediaAsBase64 places in the data URL prefix. isImageFile
+ * treats a path as an image when its lowercased extension is a key here.
+ * NOTE(review): whether the embeddings API accepts every format listed is not
+ * verifiable from this file.
+ */
+const IMAGE_MIME_TYPES = {
+  '.jpg': 'image/jpeg',
+  '.jpeg': 'image/jpeg', // alias of .jpg
+  '.png': 'image/png',
+  '.gif': 'image/gif',
+  '.webp': 'image/webp',
+  '.bmp': 'image/bmp',
+  '.tiff': 'image/tiff',
+  '.tif': 'image/tiff', // alias of .tiff
+};
+/**
+ * MIME type mappings for supported video formats.
+ * Keys are lowercase file extensions including the leading dot; values are the
+ * MIME types that readMediaAsBase64 places in the data URL prefix. isVideoFile
+ * treats a path as a video when its lowercased extension is a key here.
+ * NOTE(review): whether the embeddings API accepts every container listed is
+ * not verifiable from this file.
+ */
+const VIDEO_MIME_TYPES = {
+  '.mp4': 'video/mp4',
+  '.mpeg': 'video/mpeg',
+  '.mpg': 'video/mpeg', // alias of .mpeg
+  '.mov': 'video/quicktime',
+  '.avi': 'video/x-msvideo',
+  '.mkv': 'video/x-matroska',
+  '.webm': 'video/webm',
+  '.flv': 'video/x-flv',
+  '.wmv': 'video/x-ms-wmv',
+};
+/**
+ * Check if a file path is a supported image format.
+ * The decision is based solely on the lowercased extension returned by
+ * path.extname; the file is not opened and does not need to exist.
+ * @param {string} filePath
+ * @returns {boolean} true when the extension is a key of IMAGE_MIME_TYPES
+ */
+function isImageFile(filePath) {
+  const ext = path.extname(filePath).toLowerCase(); // e.g. '.PNG' -> '.png'; '' when there is no extension
+  return ext in IMAGE_MIME_TYPES;
+}
+/**
+ * Check if a file path is a supported video format.
+ * The decision is based solely on the lowercased extension returned by
+ * path.extname; the file is not opened and does not need to exist.
+ * @param {string} filePath
+ * @returns {boolean} true when the extension is a key of VIDEO_MIME_TYPES
+ */
+function isVideoFile(filePath) {
+  const ext = path.extname(filePath).toLowerCase(); // e.g. '.MP4' -> '.mp4'; '' when there is no extension
+  return ext in VIDEO_MIME_TYPES;
+}
+/**
+ * Read a media file (image or video) and return it as a base64 data URL.
+ * The MIME type comes from the file extension (image table first, then video
+ * table); the file contents are not inspected. The whole file is read
+ * synchronously into memory before being encoded.
+ * @param {string} filePath - Path to the media file
+ * @returns {{ base64DataUrl: string, mimeType: string, sizeBytes: number }}
+ *   base64DataUrl has the form `data:<mimeType>;base64,<payload>`; sizeBytes is
+ *   the length of the raw file buffer, i.e. the size before base64 encoding.
+ * @throws {Error} When the extension is in neither MIME table. This check runs
+ *   first, before the filesystem is touched.
+ * @throws {Error} When fs.existsSync reports that the path does not exist. Other
+ *   read failures surface as whatever fs.readFileSync throws.
+ */
+function readMediaAsBase64(filePath) {
+  const ext = path.extname(filePath).toLowerCase();
+  const mimeType = IMAGE_MIME_TYPES[ext] || VIDEO_MIME_TYPES[ext]; // undefined when the extension is unknown
+  if (!mimeType) {
+    const supported = [
+      ...Object.keys(IMAGE_MIME_TYPES),
+      ...Object.keys(VIDEO_MIME_TYPES),
+    ].join(', '); // full extension list, used only in the error message
+    throw new Error(
+      `Unsupported media format "${ext}". Supported: ${supported}`
+    );
+  }
+  if (!fs.existsSync(filePath)) {
+    throw new Error(`File not found: ${filePath}`);
+  }
+  const buffer = fs.readFileSync(filePath); // no encoding argument, so this is a raw Buffer
+  const base64 = buffer.toString('base64');
+  const base64DataUrl = `data:${mimeType};base64,${base64}`;
+  return {
+    base64DataUrl,
+    mimeType,
+    sizeBytes: buffer.length, // byte count of the file, not of the base64 string
+  };
+}
+module.exports = {
+  resolveTextInput,
+  readMediaAsBase64,
+  isImageFile,
+  isVideoFile,
+  IMAGE_MIME_TYPES,
+  VIDEO_MIME_TYPES,
+};

package/src/lib/workflow.js CHANGED Viewed

@@ -1258,6 +1258,9 @@ async function executeQuery(inputs, defaults) {
       },
     ]).toArray();
+    // Track embed usage
+    const _usage = [{ op: 'embed', model: embRes.model, tokens: embRes.usage?.total_tokens || 0 }];
     // Rerank if requested and results exist
     if (doRerank && results.length > 0) {
       const documents = results.map(r => r.text || r.content || '');
@@ -1268,15 +1271,17 @@ async function executeQuery(inputs, defaults) {
         documents,
       });
+      _usage.push({ op: 'rerank', model: rerankRes.model || inputs.rerankModel || DEFAULT_RERANK_MODEL, tokens: rerankRes.usage?.total_tokens || 0 });
       const reranked = (rerankRes.data || []).map(r => ({
         ...results[r.index],
         score: r.relevance_score,
       }));
-      return { results: reranked, resultCount: reranked.length };
+      return { results: reranked, resultCount: reranked.length, _usage };
     }
-    return { results, resultCount: results.length };
+    return { results, resultCount: results.length, _usage };
   } finally {
     await client.close();
   }
@@ -1315,7 +1320,8 @@ async function executeRerank(inputs) {
     score: r.relevance_score,
   }));
-  return { results, resultCount: results.length };
+  const _usage = [{ op: 'rerank', model: res.model || model, tokens: res.usage?.total_tokens || 0 }];
+  return { results, resultCount: results.length, _usage };
 }
 /**
@@ -1339,6 +1345,7 @@ async function executeEmbed(inputs, defaults) {
     embedding: res.data[0].embedding,
     model: res.model,
     dimensions: res.data[0].embedding.length,
+    _usage: [{ op: 'embed', model: res.model, tokens: res.usage?.total_tokens || 0 }],
   };
 }
@@ -1360,7 +1367,11 @@ async function executeSimilarity(inputs, defaults) {
   const res = await generateEmbeddings([text1, text2], opts);
   const similarity = cosineSimilarity(res.data[0].embedding, res.data[1].embedding);
-  return { similarity, model: res.model };
+  return {
+    similarity,
+    model: res.model,
+    _usage: [{ op: 'similarity', model: res.model, tokens: res.usage?.total_tokens || 0 }],
+  };
 }
 /**
@@ -1441,6 +1452,7 @@ async function executeIngest(inputs, defaults) {
       source,
       model: embRes.model,
       indexCreated,
+      _usage: [{ op: 'ingest', model: embRes.model, tokens: embRes.usage?.total_tokens || 0 }],
     };
   } finally {
     await client.close();
@@ -1585,14 +1597,20 @@ async function executeGenerate(inputs) {
   // Collect streaming response
   let text = '';
+  let llmUsage = { inputTokens: 0, outputTokens: 0 };
   for await (const chunk of provider.chat(messages, { stream: true })) {
-    text += chunk;
+    if (chunk && typeof chunk === 'object' && chunk.__usage) {
+      llmUsage = chunk.__usage;
+    } else {
+      text += chunk;
+    }
   }
   return {
     text,
     model: provider.model,
     provider: provider.name,
+    _usage: [{ op: 'llm', model: provider.model, provider: provider.name, inputTokens: llmUsage.inputTokens, outputTokens: llmUsage.outputTokens }],
   };
 }
@@ -1906,14 +1924,22 @@ async function executeWorkflow(definition, opts = {}) {
         }
         const durationMs = Date.now() - stepStart;
-        context[stepId] = { output };
+        // Pass full output (with _usage) to onStepComplete for cost tracking
         if (opts.onStepComplete) opts.onStepComplete(stepId, output, durationMs);
+        // Strip _usage from context so downstream steps don't receive it
+        let cleanOutput = output;
+        if (output && output._usage) {
+          cleanOutput = { ...output };
+          delete cleanOutput._usage;
+        }
+        context[stepId] = { output: cleanOutput };
         stepResults.push({
           id: stepId,
           tool: step.tool,
-          output,
+          output: cleanOutput,
           durationMs,
         });
       } catch (err) {

package/src/mcp/schemas/index.js CHANGED Viewed

@@ -227,6 +227,17 @@ const generateWorkflowSchema = {
     .describe('Explicit list of tools to include (e.g., ["query", "rerank", "generate"]). If omitted, tools are inferred from the description.'),
 };
+/**
+ * vai_multimodal_embed input schema.
+ * text, image_base64 and video_base64 are each optional here; the rule that at
+ * least one of them must be present is enforced by the tool handler
+ * (handleVaiMultimodalEmbed in mcp/tools/embedding.js), not by this schema.
+ */
+const multimodalEmbedSchema = {
+  text: z.string().max(32000).optional().describe('Optional text content to embed alongside media'), // string length capped at 32000
+  image_base64: z.string().optional().describe('Base64 data URL for an image (e.g., data:image/jpeg;base64,...)'), // any string passes; the data-URL shape is not validated here
+  video_base64: z.string().optional().describe('Base64 data URL for a video (e.g., data:video/mp4;base64,...)'), // any string passes; the data-URL shape is not validated here
+  model: z.string().default('voyage-multimodal-3.5').describe('Multimodal embedding model'),
+  inputType: z.enum(['document', 'query']).optional() // omitted means no input type is forwarded
+    .describe('Whether this input is a document or a query (affects embedding)'),
+  outputDimension: z.number().int().optional().describe('Output dimensions (256, 512, 1024, or 2048)'), // only integer-ness is validated; the listed sizes are descriptive
+};
 /** vai_validate_workflow input schema */
 const validateWorkflowSchema = {
   workflow: z.object({
@@ -267,6 +278,7 @@ module.exports = {
   codeQuerySchema,
   codeFindSimilarSchema,
   codeStatusSchema,
+  multimodalEmbedSchema,
   generateWorkflowSchema,
   validateWorkflowSchema,
 };

package/src/mcp/tools/embedding.js CHANGED Viewed

@@ -1,6 +1,6 @@
 'use strict';
-const { generateEmbeddings } = require('../../lib/api');
+const { generateEmbeddings, generateMultimodalEmbeddings } = require('../../lib/api');
 const { cosineSimilarity } = require('../../lib/math');
 /**
@@ -56,7 +56,69 @@ async function handleVaiSimilarity(input) {
 }
 /**
- * Register embedding tools: vai_embed, vai_similarity
+ * Handler for vai_multimodal_embed: embed text, images, and/or video.
+ * @param {object} input - Validated input matching multimodalEmbedSchema
+ * @returns {Promise<{structuredContent: object, content: Array}>}
+ *   On success, structuredContent carries model, contentTypes, vector,
+ *   dimensions, inputType, timeMs and (when text was given) textPreview.
+ *   When none of text, image_base64 or video_base64 is truthy, resolves with
+ *   an error-shaped result ({ error: 'No content provided' }) instead of throwing.
+ *   Failures from generateMultimodalEmbeddings are not caught here and reject
+ *   the returned promise.
+ */
+async function handleVaiMultimodalEmbed(input) {
+  const { text, image_base64, video_base64, model, inputType, outputDimension } = input;
+  // Require at least one content type
+  if (!text && !image_base64 && !video_base64) {
+    return {
+      structuredContent: { error: 'No content provided' },
+      content: [{ type: 'text', text: 'Error: At least one of text, image_base64, or video_base64 must be provided.' }],
+    };
+  }
+  // Build content array
+  const contentItems = []; // sent to the API, always in the order text, image, video
+  const parts = []; // human-readable labels for the same items, used in the summary
+  if (text) {
+    contentItems.push({ type: 'text', text });
+    parts.push('text');
+  }
+  if (image_base64) {
+    contentItems.push({ type: 'image_base64', image_base64 });
+    parts.push('image');
+  }
+  if (video_base64) {
+    contentItems.push({ type: 'video_base64', video_base64 });
+    parts.push('video');
+  }
+  const start = Date.now();
+  const mmOpts = { model };
+  if (inputType) mmOpts.inputType = inputType;
+  if (outputDimension) mmOpts.outputDimension = outputDimension;
+  const result = await generateMultimodalEmbeddings([contentItems], mmOpts); // all items form ONE input, so one embedding is expected
+  const vector = result.data[0].embedding;
+  const timeMs = Date.now() - start; // covers the API call only; content assembly happened before `start`
+  const structured = {
+    model, // echoes the requested model name, not a value taken from the response
+    contentTypes: parts,
+    vector,
+    dimensions: vector.length,
+    inputType: inputType || null,
+    timeMs,
+  };
+  if (text) structured.textPreview = text.slice(0, 100) + (text.length > 100 ? '...' : '');
+  return {
+    structuredContent: structured,
+    content: [{
+      type: 'text',
+      text: `Multimodal embedding (${parts.join(' + ')}, ${vector.length} dimensions, model: ${model}, ${timeMs}ms). ` +
+            `Vector: [${vector.slice(0, 5).map(v => v.toFixed(4)).join(', ')}, ... ${vector.length - 5} more]`, // NOTE(review): the "N more" count goes negative if the vector has fewer than 5 entries
+    }],
+  };
+}
+/**
+ * Register embedding tools: vai_embed, vai_similarity, vai_multimodal_embed
  * @param {import('@modelcontextprotocol/sdk/server/mcp.js').McpServer} server
  * @param {object} schemas
  */
@@ -74,6 +136,13 @@ function registerEmbeddingTools(server, schemas) {
     schemas.similaritySchema,
     handleVaiSimilarity
   );
+  server.tool(
+    'vai_multimodal_embed',
+    'Generate multimodal embeddings for text, images, and/or video using voyage-multimodal-3.5. Accepts base64 data URLs for media. At least one of text, image, or video must be provided. Supports combining multiple content types in a single embedding.',
+    schemas.multimodalEmbedSchema,
+    handleVaiMultimodalEmbed
+  );
 }
-module.exports = { registerEmbeddingTools, handleVaiEmbed, handleVaiSimilarity };
+module.exports = { registerEmbeddingTools, handleVaiEmbed, handleVaiSimilarity, handleVaiMultimodalEmbed };