voyageai-cli 1.13.0 → 1.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/cli.js +6 -0
- package/src/commands/benchmark.js +164 -0
- package/src/commands/chunk.js +277 -0
- package/src/commands/completions.js +51 -1
- package/src/commands/estimate.js +209 -0
- package/src/commands/init.js +153 -0
- package/src/commands/models.js +32 -4
- package/src/lib/catalog.js +42 -18
- package/src/lib/chunker.js +341 -0
- package/src/lib/explanations.js +183 -0
- package/src/lib/project.js +122 -0
- package/src/lib/readers.js +239 -0
package/package.json
CHANGED
package/src/cli.js
CHANGED
@@ -20,6 +20,9 @@ const { registerIngest } = require('./commands/ingest');
 const { registerCompletions } = require('./commands/completions');
 const { registerPlayground } = require('./commands/playground');
 const { registerBenchmark } = require('./commands/benchmark');
+const { registerEstimate } = require('./commands/estimate');
+const { registerInit } = require('./commands/init');
+const { registerChunk } = require('./commands/chunk');
 const { registerAbout } = require('./commands/about');
 const { showBanner, showQuickStart, getVersion } = require('./lib/banner');

@@ -45,6 +48,9 @@ registerIngest(program);
 registerCompletions(program);
 registerPlayground(program);
 registerBenchmark(program);
+registerEstimate(program);
+registerInit(program);
+registerChunk(program);
 registerAbout(program);

 // Append disclaimer to all help output
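The bodies of the new estimate.js and init.js modules are not included in this diff view; only chunk.js appears in full further down. As a purely hypothetical sketch of the shared registration pattern, here is what registerEstimate might look like, with the flags taken from the completion scripts near the end of this diff and the handler name invented for illustration:

    // Hypothetical sketch, not the actual estimate.js source (omitted from this diff view).
    function registerEstimate(program) {
      program
        .command('estimate')
        .description('Estimate embedding costs — symmetric vs asymmetric')
        .option('--docs <n>', 'Number of documents', (v) => parseInt(v, 10))
        .option('--queries <n>', 'Queries per month', (v) => parseInt(v, 10))
        .option('--doc-tokens <n>', 'Avg tokens per document', (v) => parseInt(v, 10))
        .option('--query-tokens <n>', 'Avg tokens per query', (v) => parseInt(v, 10))
        .option('--doc-model <model>', 'Document embedding model')
        .option('--query-model <model>', 'Query embedding model')
        .option('--months <n>', 'Months to project', (v) => parseInt(v, 10))
        .option('--json', 'Machine-readable JSON output')
        .option('-q, --quiet', 'Suppress non-essential output')
        .action(estimateCosts); // hypothetical handler name
    }

    module.exports = { registerEstimate };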
package/src/commands/benchmark.js
CHANGED

@@ -1212,6 +1212,170 @@ function registerBenchmark(program) {
     .option('--json', 'Machine-readable JSON output')
     .option('-q, --quiet', 'Suppress non-essential output')
     .action(benchmarkAsymmetric);
+
+  // ── benchmark space ──
+  bench
+    .command('space')
+    .description('Validate shared embedding space — embed same text with all Voyage 4 models')
+    .option('--text <text>', 'Text to embed across models')
+    .option('--texts <texts>', 'Comma-separated texts to compare')
+    .option('--models <models>', 'Comma-separated models', 'voyage-4-large,voyage-4,voyage-4-lite')
+    .option('-d, --dimensions <n>', 'Output dimensions (must be supported by all models)')
+    .option('--json', 'Machine-readable JSON output')
+    .option('-q, --quiet', 'Suppress non-essential output')
+    .action(benchmarkSpace);
+}
+
+/**
+ * benchmark space — Validate shared embedding space across Voyage 4 models.
+ * Embeds the same text(s) with multiple models, then computes pairwise cosine
+ * similarities to prove they produce compatible embeddings.
+ */
+async function benchmarkSpace(opts) {
+  const models = opts.models
+    ? parseModels(opts.models)
+    : ['voyage-4-large', 'voyage-4', 'voyage-4-lite'];
+
+  const texts = opts.texts
+    ? opts.texts.split(',').map(t => t.trim())
+    : opts.text
+      ? [opts.text]
+      : [
+        'MongoDB Atlas provides a fully managed cloud database with vector search.',
+        'Machine learning models transform raw data into semantic embeddings.',
+        'The quick brown fox jumps over the lazy dog.',
+      ];
+
+  const dimensions = opts.dimensions ? parseInt(opts.dimensions, 10) : undefined;
+
+  if (!opts.json && !opts.quiet) {
+    console.log('');
+    console.log(ui.bold(' 🔮 Shared Embedding Space Validation'));
+    console.log(ui.dim(` Models: ${models.join(', ')}`));
+    console.log(ui.dim(` Texts: ${texts.length}${dimensions ? `, dimensions: ${dimensions}` : ''}`));
+    console.log('');
+  }
+
+  // Embed all texts with all models
+  const embeddings = {}; // { model: [[embedding for text 0], [embedding for text 1], ...] }
+
+  for (const model of models) {
+    const spin = (!opts.json && !opts.quiet) ? ui.spinner(` Embedding with ${model}...`) : null;
+    if (spin) spin.start();
+
+    try {
+      const embedOpts = { model, inputType: 'document' };
+      if (dimensions) embedOpts.dimensions = dimensions;
+      const result = await generateEmbeddings(texts, embedOpts);
+      embeddings[model] = result.data.map(d => d.embedding);
+      if (spin) spin.stop();
+    } catch (err) {
+      if (spin) spin.stop();
+      console.error(ui.warn(` ${model}: ${err.message} — skipping`));
+    }
+  }
+
+  const validModels = Object.keys(embeddings);
+  if (validModels.length < 2) {
+    console.error(ui.error('Need at least 2 models to compare embedding spaces.'));
+    process.exit(1);
+  }
+
+  // Compute pairwise cross-model similarities for each text
+  const results = [];
+
+  for (let t = 0; t < texts.length; t++) {
+    const textResult = {
+      text: texts[t],
+      pairs: [],
+    };
+
+    for (let i = 0; i < validModels.length; i++) {
+      for (let j = i + 1; j < validModels.length; j++) {
+        const modelA = validModels[i];
+        const modelB = validModels[j];
+        const sim = cosineSimilarity(embeddings[modelA][t], embeddings[modelB][t]);
+        textResult.pairs.push({
+          modelA,
+          modelB,
+          similarity: sim,
+        });
+      }
+    }
+
+    results.push(textResult);
+  }
+
+  // Also compute within-model similarity across different texts (baseline)
+  const withinModelSims = [];
+  if (texts.length >= 2) {
+    for (const model of validModels) {
+      const sim = cosineSimilarity(embeddings[model][0], embeddings[model][1]);
+      withinModelSims.push({ model, text0: texts[0], text1: texts[1], similarity: sim });
+    }
+  }
+
+  if (opts.json) {
+    console.log(JSON.stringify({ benchmark: 'space', models: validModels, texts, results, withinModelSims }, null, 2));
+    return;
+  }
+
+  // Display results
+  console.log(ui.bold(' Cross-Model Similarity (same text, different models):'));
+  console.log(ui.dim(' High similarity (>0.95) = shared embedding space confirmed'));
+  console.log('');
+
+  let allHigh = true;
+  for (const r of results) {
+    const preview = r.text.substring(0, 55) + (r.text.length > 55 ? '...' : '');
+    console.log(` ${ui.dim('Text:')} "${preview}"`);
+
+    for (const p of r.pairs) {
+      const simStr = p.similarity.toFixed(4);
+      const quality = p.similarity >= 0.98 ? ui.green('●')
+        : p.similarity >= 0.95 ? ui.cyan('●')
+        : p.similarity >= 0.90 ? ui.yellow('●')
+        : ui.red('●');
+      if (p.similarity < 0.95) allHigh = false;
+      console.log(` ${quality} ${rpad(p.modelA, 18)} ↔ ${rpad(p.modelB, 18)} ${ui.bold(simStr)}`);
+    }
+    console.log('');
+  }
+
+  // Show within-model cross-text similarity for context
+  if (withinModelSims.length > 0) {
+    console.log(ui.bold(' Within-Model Similarity (different texts, same model):'));
+    console.log(ui.dim(' Shows that cross-model same-text similarity is much higher'));
+    console.log('');
+
+    for (const w of withinModelSims) {
+      console.log(` ${ui.dim(rpad(w.model, 18))} text₀ ↔ text₁ ${ui.dim(w.similarity.toFixed(4))}`);
+    }
+    console.log('');
+  }
+
+  // Summary
+  const avgCrossModel = results.flatMap(r => r.pairs).reduce((sum, p) => sum + p.similarity, 0)
+    / results.flatMap(r => r.pairs).length;
+  const avgWithin = withinModelSims.length > 0
+    ? withinModelSims.reduce((sum, w) => sum + w.similarity, 0) / withinModelSims.length
+    : null;
+
+  if (allHigh) {
+    console.log(ui.success(`Shared embedding space confirmed! Avg cross-model similarity: ${avgCrossModel.toFixed(4)}`));
+  } else {
+    console.log(ui.warn(`Cross-model similarity lower than expected. Avg: ${avgCrossModel.toFixed(4)}`));
+  }
+
+  if (avgWithin !== null) {
+    const ratio = (avgCrossModel / avgWithin).toFixed(1);
+    console.log(ui.dim(` Cross-model same-text similarity is ${ratio}× higher than same-model different-text similarity.`));
+  }
+
+  console.log('');
+  console.log(ui.dim(' This means you can embed docs with voyage-4-large and query with voyage-4-lite'));
+  console.log(ui.dim(' — the embeddings live in the same space. See "vai explain shared-space".'));
+  console.log('');
 }

 module.exports = { registerBenchmark };
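benchmarkSpace relies on cosineSimilarity and rpad helpers defined elsewhere in benchmark.js, outside this hunk. A minimal sketch of the standard formula such a cosine helper implements, assuming two plain numeric arrays of equal length:

    // Sketch of the assumed helper; the package's actual implementation is not shown here.
    function cosineSimilarity(a, b) {
      // dot(a, b) / (|a| * |b|); 1.0 means the vectors point the same way.
      let dot = 0;
      let normA = 0;
      let normB = 0;
      for (let i = 0; i < a.length; i++) {
        dot += a[i] * b[i];
        normA += a[i] * a[i];
        normB += b[i] * b[i];
      }
      return dot / (Math.sqrt(normA) * Math.sqrt(normB));
    }

Read against the thresholds above, a cross-model value of 0.98+ means two models embedded the same text almost identically, while the within-model cross-text baseline is expected to be much lower.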
package/src/commands/chunk.js
ADDED

@@ -0,0 +1,277 @@
+'use strict';
+
+const fs = require('fs');
+const path = require('path');
+const { chunk, estimateTokens, STRATEGIES, DEFAULTS } = require('../lib/chunker');
+const { readFile, scanDirectory, isSupported, getReaderType } = require('../lib/readers');
+const { loadProject, mergeOptions } = require('../lib/project');
+const ui = require('../lib/ui');
+
+/**
+ * Format a number with commas.
+ */
+function fmtNum(n) {
+  return n.toLocaleString('en-US');
+}
+
+/**
+ * Build chunk metadata for a source file.
+ * @param {string} filePath - Source file path
+ * @param {string} basePath - Base directory for relative paths
+ * @param {number} index - Chunk index within the file
+ * @param {number} total - Total chunks from this file
+ * @returns {object}
+ */
+function buildMetadata(filePath, basePath, index, total) {
+  return {
+    source: path.relative(basePath, filePath),
+    chunk_index: index,
+    total_chunks: total,
+  };
+}
+
+/**
+ * Register the chunk command on a Commander program.
+ * @param {import('commander').Command} program
+ */
+function registerChunk(program) {
+  program
+    .command('chunk [input]')
+    .description('Chunk documents for embedding — files, directories, or stdin')
+    .option('-s, --strategy <strategy>', `Chunking strategy: ${STRATEGIES.join(', ')}`)
+    .option('-c, --chunk-size <n>', 'Target chunk size in characters', (v) => parseInt(v, 10))
+    .option('--overlap <n>', 'Overlap between chunks in characters', (v) => parseInt(v, 10))
+    .option('--min-size <n>', 'Minimum chunk size (drop smaller)', (v) => parseInt(v, 10))
+    .option('-o, --output <path>', 'Output file (JSONL). Omit for stdout')
+    .option('--text-field <name>', 'Text field name for JSON/JSONL input', 'text')
+    .option('--extensions <exts>', 'Comma-separated file extensions to include when scanning directories')
+    .option('--ignore <dirs>', 'Comma-separated directory names to skip', 'node_modules,.git,__pycache__')
+    .option('--dry-run', 'Show what would be chunked without processing')
+    .option('--stats', 'Show chunking statistics after processing')
+    .option('--json', 'Machine-readable JSON output')
+    .option('-q, --quiet', 'Suppress non-essential output')
+    .action(async (input, opts) => {
+      try {
+        // Load project config, merge with CLI opts
+        const { config: projectConfig } = loadProject();
+        const chunkConfig = projectConfig.chunk || {};
+
+        const strategy = opts.strategy || chunkConfig.strategy || DEFAULTS.strategy || 'recursive';
+        const chunkSize = opts.chunkSize || chunkConfig.size || DEFAULTS.size;
+        const overlap = opts.overlap != null ? opts.overlap : (chunkConfig.overlap != null ? chunkConfig.overlap : DEFAULTS.overlap);
+        const minSize = opts.minSize || chunkConfig.minSize || DEFAULTS.minSize;
+        const textField = opts.textField || 'text';
+
+        if (!STRATEGIES.includes(strategy)) {
+          console.error(ui.error(`Unknown strategy: "${strategy}". Available: ${STRATEGIES.join(', ')}`));
+          process.exit(1);
+        }
+
+        // Resolve input files
+        const files = resolveInput(input, opts);
+
+        if (files.length === 0) {
+          console.error(ui.error('No supported files found. Supported types: .txt, .md, .html, .json, .jsonl, .pdf'));
+          process.exit(1);
+        }
+
+        // Dry run
+        if (opts.dryRun) {
+          if (opts.json) {
+            console.log(JSON.stringify({ files: files.map(f => path.relative(process.cwd(), f)), strategy, chunkSize, overlap }, null, 2));
+          } else {
+            console.log(ui.bold(`Would chunk ${files.length} file(s) with strategy: ${strategy}`));
+            console.log(ui.dim(` Chunk size: ${chunkSize} chars, overlap: ${overlap} chars`));
+            console.log('');
+            for (const f of files) {
+              const size = fs.statSync(f).size;
+              console.log(` ${ui.dim(path.relative(process.cwd(), f))} (${fmtNum(size)} bytes)`);
+            }
+          }
+          return;
+        }
+
+        // Process files
+        const basePath = input && fs.existsSync(input) && fs.statSync(input).isDirectory()
+          ? path.resolve(input)
+          : process.cwd();
+
+        const allChunks = [];
+        const fileStats = [];
+
+        const showProgress = !opts.json && !opts.quiet && files.length > 1;
+        if (showProgress) {
+          console.log(ui.bold(`Chunking ${files.length} file(s) with strategy: ${strategy}`));
+          console.log(ui.dim(` Chunk size: ${chunkSize}, overlap: ${overlap}, min: ${minSize}`));
+          console.log('');
+        }
+
+        for (let fi = 0; fi < files.length; fi++) {
+          const filePath = files[fi];
+          const relPath = path.relative(basePath, filePath);
+          const readerType = getReaderType(filePath);
+
+          try {
+            const content = await readFile(filePath, { textField });
+
+            // readFile returns string for text/html/pdf, array for json/jsonl
+            let textsToChunk = [];
+
+            if (typeof content === 'string') {
+              textsToChunk = [{ text: content, metadata: {} }];
+            } else if (Array.isArray(content)) {
+              textsToChunk = content;
+            }
+
+            let fileChunkCount = 0;
+            for (const item of textsToChunk) {
+              const effectiveStrategy = readerType === 'text' && filePath.endsWith('.md') ? 'markdown' : strategy;
+              // Auto-detect markdown for .md files when using default strategy
+              const useStrategy = (strategy === 'recursive' && filePath.endsWith('.md')) ? 'markdown' : strategy;
+
+              const chunks = chunk(item.text, {
+                strategy: useStrategy,
+                size: chunkSize,
+                overlap,
+                minSize,
+              });
+
+              for (let ci = 0; ci < chunks.length; ci++) {
+                allChunks.push({
+                  text: chunks[ci],
+                  metadata: {
+                    ...item.metadata,
+                    ...buildMetadata(filePath, basePath, ci, chunks.length),
+                  },
+                });
+              }
+              fileChunkCount += chunks.length;
+            }
+
+            fileStats.push({
+              file: relPath,
+              inputChars: textsToChunk.reduce((sum, t) => sum + t.text.length, 0),
+              chunks: fileChunkCount,
+            });
+
+            if (showProgress) {
+              console.log(` ${ui.green('✓')} ${relPath} → ${fileChunkCount} chunks`);
+            }
+          } catch (err) {
+            fileStats.push({ file: relPath, error: err.message, chunks: 0 });
+            if (!opts.quiet) {
+              console.error(` ${ui.red('✗')} ${relPath}: ${err.message}`);
+            }
+          }
+        }
+
+        // Output
+        if (opts.json) {
+          const output = {
+            totalChunks: allChunks.length,
+            totalTokens: allChunks.reduce((sum, c) => sum + estimateTokens(c.text), 0),
+            strategy,
+            chunkSize,
+            overlap,
+            files: fileStats,
+            chunks: allChunks,
+          };
+          const jsonStr = JSON.stringify(output, null, 2);
+          if (opts.output) {
+            fs.writeFileSync(opts.output, jsonStr + '\n');
+          } else {
+            console.log(jsonStr);
+          }
+        } else {
+          // JSONL output
+          const lines = allChunks.map(c => JSON.stringify(c));
+          const jsonlStr = lines.join('\n') + '\n';
+
+          if (opts.output) {
+            fs.writeFileSync(opts.output, jsonlStr);
+            if (!opts.quiet) {
+              console.log('');
+              console.log(ui.success(`Wrote ${fmtNum(allChunks.length)} chunks to ${opts.output}`));
+            }
+          } else if (opts.quiet || !showProgress) {
+            // Stdout — write JSONL directly
+            process.stdout.write(jsonlStr);
+          } else {
+            // Progress was shown, write to stdout with separator
+            console.log('');
+            process.stdout.write(jsonlStr);
+          }
+        }
+
+        // Stats summary
+        if ((opts.stats || showProgress) && !opts.json) {
+          const totalChars = fileStats.reduce((sum, f) => sum + (f.inputChars || 0), 0);
+          const totalTokens = allChunks.reduce((sum, c) => sum + estimateTokens(c.text), 0);
+          const avgChunkSize = allChunks.length > 0
+            ? Math.round(allChunks.reduce((sum, c) => sum + c.text.length, 0) / allChunks.length)
+            : 0;
+          const errors = fileStats.filter(f => f.error).length;
+
+          console.log('');
+          console.log(ui.bold('Summary'));
+          console.log(ui.label('Files', `${fmtNum(files.length)}${errors ? ` (${errors} failed)` : ''}`));
+          console.log(ui.label('Input', `${fmtNum(totalChars)} chars`));
+          console.log(ui.label('Chunks', fmtNum(allChunks.length)));
+          console.log(ui.label('Avg chunk', `${fmtNum(avgChunkSize)} chars (~${fmtNum(Math.round(avgChunkSize / 4))} tokens)`));
+          console.log(ui.label('Est. tokens', `~${fmtNum(totalTokens)}`));
+
+          // Cost hint
+          const pricePerMToken = 0.12; // voyage-4-large default
+          const cost = (totalTokens / 1e6) * pricePerMToken;
+          if (cost > 0) {
+            console.log(ui.label('Est. cost', ui.dim(`~$${cost < 0.01 ? cost.toFixed(4) : cost.toFixed(2)} with voyage-4-large`)));
+          }
+        }
+      } catch (err) {
+        console.error(ui.error(err.message));
+        process.exit(1);
+      }
+    });
+}
+
+/**
+ * Resolve input to a list of file paths.
+ * @param {string} input - File path, directory path, or glob
+ * @param {object} opts
+ * @returns {string[]}
+ */
+function resolveInput(input, opts) {
+  if (!input) {
+    console.error(ui.error('Please provide a file or directory path.'));
+    console.error(ui.dim(' Usage: vai chunk <file-or-directory> [options]'));
+    process.exit(1);
+  }
+
+  const resolved = path.resolve(input);
+
+  if (!fs.existsSync(resolved)) {
+    console.error(ui.error(`Not found: ${input}`));
+    process.exit(1);
+  }
+
+  const stat = fs.statSync(resolved);
+
+  if (stat.isFile()) {
+    return [resolved];
+  }
+
+  if (stat.isDirectory()) {
+    const scanOpts = {};
+    if (opts.extensions) {
+      scanOpts.extensions = opts.extensions.split(',').map(e => e.trim());
+    }
+    if (opts.ignore) {
+      scanOpts.ignore = opts.ignore.split(',').map(d => d.trim());
+    }
+    return scanDirectory(resolved, scanOpts);
+  }
+
+  return [];
+}
+
+module.exports = { registerChunk };
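The chunk and estimateTokens implementations live in the new src/lib/chunker.js (+341 lines), whose body this diff view omits. Judging only from the command's options and the avgChunkSize / 4 arithmetic in its stats summary, a rough sketch of the token estimator and the simplest ('fixed') strategy could look like the following; the names and defaults below are illustrative, not the package's DEFAULTS:

    // Hypothetical sketch of lib/chunker.js internals; the real file is not shown in this diff.
    function estimateTokens(text) {
      // ~4 characters per token, the same heuristic the stats summary applies.
      return Math.ceil(text.length / 4);
    }

    function chunkFixed(text, { size = 1000, overlap = 100, minSize = 50 } = {}) {
      // Slide a fixed-size window, stepping back `overlap` chars so neighbors share context.
      // Assumes overlap < size so the loop always advances.
      const chunks = [];
      for (let start = 0; start < text.length; start += size - overlap) {
        const piece = text.slice(start, start + size);
        if (piece.length >= minSize) chunks.push(piece);
      }
      return chunks;
    }

End to end, something like vai chunk ./docs --strategy fixed --chunk-size 1000 --overlap 100 -o chunks.jsonl --stats would then emit one {text, metadata} object per line, ready for the embed or ingest commands.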
package/src/commands/completions.js
CHANGED

@@ -19,7 +19,7 @@ _vai_completions() {
   prev="\${COMP_WORDS[COMP_CWORD-1]}"

   # Top-level commands
-  commands="embed rerank store search index models ping config demo explain similarity ingest completions help"
+  commands="embed rerank store search index models ping config demo explain similarity ingest estimate init chunk completions help"

   # Subcommands
   local index_subs="create list delete"

@@ -102,6 +102,18 @@ _vai_completions() {
       COMPREPLY=( \$(compgen -W "--file --db --collection --field --model --input-type --dimensions --batch-size --text-field --text-column --strict --dry-run --json --quiet --help" -- "\$cur") )
       return 0
       ;;
+    estimate)
+      COMPREPLY=( \$(compgen -W "--docs --queries --doc-tokens --query-tokens --doc-model --query-model --months --json --quiet --help" -- "\$cur") )
+      return 0
+      ;;
+    init)
+      COMPREPLY=( \$(compgen -W "--yes --force --json --quiet --help" -- "\$cur") )
+      return 0
+      ;;
+    chunk)
+      COMPREPLY=( \$(compgen -W "--strategy --chunk-size --overlap --min-size --output --text-field --extensions --ignore --dry-run --stats --json --quiet --help" -- "\$cur") )
+      return 0
+      ;;
     completions)
       COMPREPLY=( \$(compgen -W "bash zsh --help" -- "\$cur") )
       return 0

@@ -172,6 +184,9 @@ _vai() {
     'explain:Learn about AI and vector search concepts'
     'similarity:Compute cosine similarity between texts'
     'ingest:Bulk import documents with progress'
+    'estimate:Estimate embedding costs — symmetric vs asymmetric'
+    'init:Initialize project with .vai.json'
+    'chunk:Chunk documents for embedding'
     'completions:Generate shell completion scripts'
     'help:Display help for command'
   )

@@ -375,6 +390,41 @@ _vai() {
       '--json[Machine-readable JSON output]' \\
       '(-q --quiet)'{-q,--quiet}'[Suppress non-essential output]'
       ;;
+    estimate)
+      _arguments \\
+        '--docs[Number of documents]:count:' \\
+        '--queries[Queries per month]:count:' \\
+        '--doc-tokens[Avg tokens per document]:tokens:' \\
+        '--query-tokens[Avg tokens per query]:tokens:' \\
+        '--doc-model[Document embedding model]:model:(\$models)' \\
+        '--query-model[Query embedding model]:model:(\$models)' \\
+        '--months[Months to project]:months:' \\
+        '--json[Machine-readable JSON output]' \\
+        '(-q --quiet)'{-q,--quiet}'[Suppress non-essential output]'
+      ;;
+    init)
+      _arguments \\
+        '(-y --yes)'{-y,--yes}'[Accept all defaults]' \\
+        '--force[Overwrite existing .vai.json]' \\
+        '--json[Output config as JSON]' \\
+        '(-q --quiet)'{-q,--quiet}'[Suppress non-essential output]'
+      ;;
+    chunk)
+      _arguments \\
+        '1:input:_files' \\
+        '(-s --strategy)'{-s,--strategy}'[Chunking strategy]:strategy:(fixed sentence paragraph recursive markdown)' \\
+        '(-c --chunk-size)'{-c,--chunk-size}'[Target chunk size]:size:' \\
+        '--overlap[Overlap between chunks]:chars:' \\
+        '--min-size[Minimum chunk size]:chars:' \\
+        '(-o --output)'{-o,--output}'[Output file]:file:_files' \\
+        '--text-field[Text field for JSON]:field:' \\
+        '--extensions[File extensions]:exts:' \\
+        '--ignore[Dirs to skip]:dirs:' \\
+        '--dry-run[Preview without processing]' \\
+        '--stats[Show statistics]' \\
+        '--json[JSON output]' \\
+        '(-q --quiet)'{-q,--quiet}'[Suppress non-essential output]'
+      ;;
     completions)
       _arguments \\
         '1:shell:(bash zsh)'