voyageai-cli 1.15.0 → 1.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/cli.js +8 -0
- package/src/commands/chunk.js +277 -0
- package/src/commands/completions.js +84 -1
- package/src/commands/init.js +153 -0
- package/src/commands/pipeline.js +311 -0
- package/src/commands/query.js +266 -0
- package/src/lib/chunker.js +341 -0
- package/src/lib/project.js +122 -0
- package/src/lib/readers.js +239 -0
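The headline change is four new commands (init, chunk, query, pipeline) wired into cli.js and backed by three new libraries (chunker.js, project.js, readers.js). Judging from the command descriptions and the next-steps hints printed by init, the intended workflow looks roughly like the sketch below; the flags and paths are illustrative, assembled from the option definitions later in this diff rather than quoted from the package's docs:

    vai init                                   # interactive setup, writes .vai.json
    vai chunk ./docs -o chunks.jsonl --stats   # chunk documents into JSONL
    vai pipeline ./docs --create-index         # chunk, embed, and store in one pass
    vai query "your question" --top-k 5        # search + rerank against the collection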
package/package.json
CHANGED
package/src/cli.js
CHANGED
@@ -21,6 +21,10 @@ const { registerCompletions } = require('./commands/completions');
 const { registerPlayground } = require('./commands/playground');
 const { registerBenchmark } = require('./commands/benchmark');
 const { registerEstimate } = require('./commands/estimate');
+const { registerInit } = require('./commands/init');
+const { registerChunk } = require('./commands/chunk');
+const { registerQuery } = require('./commands/query');
+const { registerPipeline } = require('./commands/pipeline');
 const { registerAbout } = require('./commands/about');
 const { showBanner, showQuickStart, getVersion } = require('./lib/banner');

@@ -47,6 +51,10 @@ registerCompletions(program);
 registerPlayground(program);
 registerBenchmark(program);
 registerEstimate(program);
+registerInit(program);
+registerChunk(program);
+registerQuery(program);
+registerPipeline(program);
 registerAbout(program);

 // Append disclaimer to all help output
package/src/commands/chunk.js
ADDED
@@ -0,0 +1,277 @@
'use strict';

const fs = require('fs');
const path = require('path');
const { chunk, estimateTokens, STRATEGIES, DEFAULTS } = require('../lib/chunker');
const { readFile, scanDirectory, isSupported, getReaderType } = require('../lib/readers');
const { loadProject, mergeOptions } = require('../lib/project');
const ui = require('../lib/ui');

/**
 * Format a number with commas.
 */
function fmtNum(n) {
  return n.toLocaleString('en-US');
}

/**
 * Build chunk metadata for a source file.
 * @param {string} filePath - Source file path
 * @param {string} basePath - Base directory for relative paths
 * @param {number} index - Chunk index within the file
 * @param {number} total - Total chunks from this file
 * @returns {object}
 */
function buildMetadata(filePath, basePath, index, total) {
  return {
    source: path.relative(basePath, filePath),
    chunk_index: index,
    total_chunks: total,
  };
}

/**
 * Register the chunk command on a Commander program.
 * @param {import('commander').Command} program
 */
function registerChunk(program) {
  program
    .command('chunk [input]')
    .description('Chunk documents for embedding — files, directories, or stdin')
    .option('-s, --strategy <strategy>', `Chunking strategy: ${STRATEGIES.join(', ')}`)
    .option('-c, --chunk-size <n>', 'Target chunk size in characters', (v) => parseInt(v, 10))
    .option('--overlap <n>', 'Overlap between chunks in characters', (v) => parseInt(v, 10))
    .option('--min-size <n>', 'Minimum chunk size (drop smaller)', (v) => parseInt(v, 10))
    .option('-o, --output <path>', 'Output file (JSONL). Omit for stdout')
    .option('--text-field <name>', 'Text field name for JSON/JSONL input', 'text')
    .option('--extensions <exts>', 'Comma-separated file extensions to include when scanning directories')
    .option('--ignore <dirs>', 'Comma-separated directory names to skip', 'node_modules,.git,__pycache__')
    .option('--dry-run', 'Show what would be chunked without processing')
    .option('--stats', 'Show chunking statistics after processing')
    .option('--json', 'Machine-readable JSON output')
    .option('-q, --quiet', 'Suppress non-essential output')
    .action(async (input, opts) => {
      try {
        // Load project config, merge with CLI opts
        const { config: projectConfig } = loadProject();
        const chunkConfig = projectConfig.chunk || {};

        const strategy = opts.strategy || chunkConfig.strategy || DEFAULTS.strategy || 'recursive';
        const chunkSize = opts.chunkSize || chunkConfig.size || DEFAULTS.size;
        const overlap = opts.overlap != null ? opts.overlap : (chunkConfig.overlap != null ? chunkConfig.overlap : DEFAULTS.overlap);
        const minSize = opts.minSize || chunkConfig.minSize || DEFAULTS.minSize;
        const textField = opts.textField || 'text';

        if (!STRATEGIES.includes(strategy)) {
          console.error(ui.error(`Unknown strategy: "${strategy}". Available: ${STRATEGIES.join(', ')}`));
          process.exit(1);
        }

        // Resolve input files
        const files = resolveInput(input, opts);

        if (files.length === 0) {
          console.error(ui.error('No supported files found. Supported types: .txt, .md, .html, .json, .jsonl, .pdf'));
          process.exit(1);
        }

        // Dry run
        if (opts.dryRun) {
          if (opts.json) {
            console.log(JSON.stringify({ files: files.map(f => path.relative(process.cwd(), f)), strategy, chunkSize, overlap }, null, 2));
          } else {
            console.log(ui.bold(`Would chunk ${files.length} file(s) with strategy: ${strategy}`));
            console.log(ui.dim(`  Chunk size: ${chunkSize} chars, overlap: ${overlap} chars`));
            console.log('');
            for (const f of files) {
              const size = fs.statSync(f).size;
              console.log(`  ${ui.dim(path.relative(process.cwd(), f))} (${fmtNum(size)} bytes)`);
            }
          }
          return;
        }

        // Process files
        const basePath = input && fs.existsSync(input) && fs.statSync(input).isDirectory()
          ? path.resolve(input)
          : process.cwd();

        const allChunks = [];
        const fileStats = [];

        const showProgress = !opts.json && !opts.quiet && files.length > 1;
        if (showProgress) {
          console.log(ui.bold(`Chunking ${files.length} file(s) with strategy: ${strategy}`));
          console.log(ui.dim(`  Chunk size: ${chunkSize}, overlap: ${overlap}, min: ${minSize}`));
          console.log('');
        }

        for (let fi = 0; fi < files.length; fi++) {
          const filePath = files[fi];
          const relPath = path.relative(basePath, filePath);
          const readerType = getReaderType(filePath);

          try {
            const content = await readFile(filePath, { textField });

            // readFile returns string for text/html/pdf, array for json/jsonl
            let textsToChunk = [];

            if (typeof content === 'string') {
              textsToChunk = [{ text: content, metadata: {} }];
            } else if (Array.isArray(content)) {
              textsToChunk = content;
            }

            let fileChunkCount = 0;
            for (const item of textsToChunk) {
              const effectiveStrategy = readerType === 'text' && filePath.endsWith('.md') ? 'markdown' : strategy;
              // Auto-detect markdown for .md files when using default strategy
              const useStrategy = (strategy === 'recursive' && filePath.endsWith('.md')) ? 'markdown' : strategy;

              const chunks = chunk(item.text, {
                strategy: useStrategy,
                size: chunkSize,
                overlap,
                minSize,
              });

              for (let ci = 0; ci < chunks.length; ci++) {
                allChunks.push({
                  text: chunks[ci],
                  metadata: {
                    ...item.metadata,
                    ...buildMetadata(filePath, basePath, ci, chunks.length),
                  },
                });
              }
              fileChunkCount += chunks.length;
            }

            fileStats.push({
              file: relPath,
              inputChars: textsToChunk.reduce((sum, t) => sum + t.text.length, 0),
              chunks: fileChunkCount,
            });

            if (showProgress) {
              console.log(`  ${ui.green('✓')} ${relPath} → ${fileChunkCount} chunks`);
            }
          } catch (err) {
            fileStats.push({ file: relPath, error: err.message, chunks: 0 });
            if (!opts.quiet) {
              console.error(`  ${ui.red('✗')} ${relPath}: ${err.message}`);
            }
          }
        }

        // Output
        if (opts.json) {
          const output = {
            totalChunks: allChunks.length,
            totalTokens: allChunks.reduce((sum, c) => sum + estimateTokens(c.text), 0),
            strategy,
            chunkSize,
            overlap,
            files: fileStats,
            chunks: allChunks,
          };
          const jsonStr = JSON.stringify(output, null, 2);
          if (opts.output) {
            fs.writeFileSync(opts.output, jsonStr + '\n');
          } else {
            console.log(jsonStr);
          }
        } else {
          // JSONL output
          const lines = allChunks.map(c => JSON.stringify(c));
          const jsonlStr = lines.join('\n') + '\n';

          if (opts.output) {
            fs.writeFileSync(opts.output, jsonlStr);
            if (!opts.quiet) {
              console.log('');
              console.log(ui.success(`Wrote ${fmtNum(allChunks.length)} chunks to ${opts.output}`));
            }
          } else if (opts.quiet || !showProgress) {
            // Stdout — write JSONL directly
            process.stdout.write(jsonlStr);
          } else {
            // Progress was shown, write to stdout with separator
            console.log('');
            process.stdout.write(jsonlStr);
          }
        }

        // Stats summary
        if ((opts.stats || showProgress) && !opts.json) {
          const totalChars = fileStats.reduce((sum, f) => sum + (f.inputChars || 0), 0);
          const totalTokens = allChunks.reduce((sum, c) => sum + estimateTokens(c.text), 0);
          const avgChunkSize = allChunks.length > 0
            ? Math.round(allChunks.reduce((sum, c) => sum + c.text.length, 0) / allChunks.length)
            : 0;
          const errors = fileStats.filter(f => f.error).length;

          console.log('');
          console.log(ui.bold('Summary'));
          console.log(ui.label('Files', `${fmtNum(files.length)}${errors ? ` (${errors} failed)` : ''}`));
          console.log(ui.label('Input', `${fmtNum(totalChars)} chars`));
          console.log(ui.label('Chunks', fmtNum(allChunks.length)));
          console.log(ui.label('Avg chunk', `${fmtNum(avgChunkSize)} chars (~${fmtNum(Math.round(avgChunkSize / 4))} tokens)`));
          console.log(ui.label('Est. tokens', `~${fmtNum(totalTokens)}`));

          // Cost hint
          const pricePerMToken = 0.12; // voyage-4-large default
          const cost = (totalTokens / 1e6) * pricePerMToken;
          if (cost > 0) {
            console.log(ui.label('Est. cost', ui.dim(`~$${cost < 0.01 ? cost.toFixed(4) : cost.toFixed(2)} with voyage-4-large`)));
          }
        }
      } catch (err) {
        console.error(ui.error(err.message));
        process.exit(1);
      }
    });
}

/**
 * Resolve input to a list of file paths.
 * @param {string} input - File path, directory path, or glob
 * @param {object} opts
 * @returns {string[]}
 */
function resolveInput(input, opts) {
  if (!input) {
    console.error(ui.error('Please provide a file or directory path.'));
    console.error(ui.dim('  Usage: vai chunk <file-or-directory> [options]'));
    process.exit(1);
  }

  const resolved = path.resolve(input);

  if (!fs.existsSync(resolved)) {
    console.error(ui.error(`Not found: ${input}`));
    process.exit(1);
  }

  const stat = fs.statSync(resolved);

  if (stat.isFile()) {
    return [resolved];
  }

  if (stat.isDirectory()) {
    const scanOpts = {};
    if (opts.extensions) {
      scanOpts.extensions = opts.extensions.split(',').map(e => e.trim());
    }
    if (opts.ignore) {
      scanOpts.ignore = opts.ignore.split(',').map(d => d.trim());
    }
    return scanDirectory(resolved, scanOpts);
  }

  return [];
}

module.exports = { registerChunk };
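Taken together, the options above support invocations like the following sketch (paths and sizes are illustrative, not taken from the package's docs):

    vai chunk ./docs --dry-run                            # list files that would be chunked
    vai chunk ./docs -s markdown -c 1200 --overlap 150 \
        -o chunks.jsonl --stats                           # write chunks and print a summary

Each JSONL record is one chunk of the form {"text": "...", "metadata": {"source": ..., "chunk_index": ..., "total_chunks": ...}}, with any metadata carried over from JSON/JSONL inputs merged in.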
package/src/commands/completions.js
CHANGED
@@ -19,7 +19,7 @@ _vai_completions() {
   prev="\${COMP_WORDS[COMP_CWORD-1]}"

   # Top-level commands
-  commands="embed rerank store search index models ping config demo explain similarity ingest estimate completions help"
+  commands="embed rerank store search index models ping config demo explain similarity ingest estimate init chunk query pipeline completions help"

   # Subcommands
   local index_subs="create list delete"
@@ -106,6 +106,22 @@ _vai_completions() {
       COMPREPLY=( \$(compgen -W "--docs --queries --doc-tokens --query-tokens --doc-model --query-model --months --json --quiet --help" -- "\$cur") )
       return 0
       ;;
+    init)
+      COMPREPLY=( \$(compgen -W "--yes --force --json --quiet --help" -- "\$cur") )
+      return 0
+      ;;
+    chunk)
+      COMPREPLY=( \$(compgen -W "--strategy --chunk-size --overlap --min-size --output --text-field --extensions --ignore --dry-run --stats --json --quiet --help" -- "\$cur") )
+      return 0
+      ;;
+    query)
+      COMPREPLY=( \$(compgen -W "--db --collection --index --field --model --dimensions --limit --top-k --rerank --no-rerank --rerank-model --text-field --filter --num-candidates --show-vectors --json --quiet --help" -- "\$cur") )
+      return 0
+      ;;
+    pipeline)
+      COMPREPLY=( \$(compgen -W "--db --collection --field --index --model --dimensions --strategy --chunk-size --overlap --batch-size --text-field --extensions --ignore --create-index --dry-run --json --quiet --help" -- "\$cur") )
+      return 0
+      ;;
     completions)
       COMPREPLY=( \$(compgen -W "bash zsh --help" -- "\$cur") )
       return 0
@@ -177,6 +193,10 @@ _vai() {
     'similarity:Compute cosine similarity between texts'
     'ingest:Bulk import documents with progress'
     'estimate:Estimate embedding costs — symmetric vs asymmetric'
+    'init:Initialize project with .vai.json'
+    'chunk:Chunk documents for embedding'
+    'query:Search + rerank in one shot'
+    'pipeline:Chunk, embed, and store documents'
     'completions:Generate shell completion scripts'
     'help:Display help for command'
   )
@@ -392,6 +412,69 @@ _vai() {
          '--json[Machine-readable JSON output]' \\
          '(-q --quiet)'{-q,--quiet}'[Suppress non-essential output]'
        ;;
+      init)
+        _arguments \\
+          '(-y --yes)'{-y,--yes}'[Accept all defaults]' \\
+          '--force[Overwrite existing .vai.json]' \\
+          '--json[Output config as JSON]' \\
+          '(-q --quiet)'{-q,--quiet}'[Suppress non-essential output]'
+        ;;
+      chunk)
+        _arguments \\
+          '1:input:_files' \\
+          '(-s --strategy)'{-s,--strategy}'[Chunking strategy]:strategy:(fixed sentence paragraph recursive markdown)' \\
+          '(-c --chunk-size)'{-c,--chunk-size}'[Target chunk size]:size:' \\
+          '--overlap[Overlap between chunks]:chars:' \\
+          '--min-size[Minimum chunk size]:chars:' \\
+          '(-o --output)'{-o,--output}'[Output file]:file:_files' \\
+          '--text-field[Text field for JSON]:field:' \\
+          '--extensions[File extensions]:exts:' \\
+          '--ignore[Dirs to skip]:dirs:' \\
+          '--dry-run[Preview without processing]' \\
+          '--stats[Show statistics]' \\
+          '--json[JSON output]' \\
+          '(-q --quiet)'{-q,--quiet}'[Suppress non-essential output]'
+        ;;
+      query)
+        _arguments \\
+          '1:query text:' \\
+          '--db[Database name]:database:' \\
+          '--collection[Collection name]:collection:' \\
+          '--index[Vector search index]:index:' \\
+          '--field[Embedding field]:field:' \\
+          '(-m --model)'{-m,--model}'[Embedding model]:model:(\$models)' \\
+          '(-d --dimensions)'{-d,--dimensions}'[Output dimensions]:dims:' \\
+          '(-l --limit)'{-l,--limit}'[Search candidates]:limit:' \\
+          '(-k --top-k)'{-k,--top-k}'[Final results]:k:' \\
+          '--rerank[Enable reranking]' \\
+          '--no-rerank[Skip reranking]' \\
+          '--rerank-model[Reranking model]:model:' \\
+          '--text-field[Document text field]:field:' \\
+          '--filter[Pre-filter JSON]:json:' \\
+          '--json[JSON output]' \\
+          '(-q --quiet)'{-q,--quiet}'[Suppress non-essential output]'
+        ;;
+      pipeline)
+        _arguments \\
+          '1:input:_files' \\
+          '--db[Database name]:database:' \\
+          '--collection[Collection name]:collection:' \\
+          '--field[Embedding field]:field:' \\
+          '--index[Vector search index]:index:' \\
+          '(-m --model)'{-m,--model}'[Embedding model]:model:(\$models)' \\
+          '(-d --dimensions)'{-d,--dimensions}'[Output dimensions]:dims:' \\
+          '(-s --strategy)'{-s,--strategy}'[Chunking strategy]:strategy:(fixed sentence paragraph recursive markdown)' \\
+          '(-c --chunk-size)'{-c,--chunk-size}'[Chunk size]:size:' \\
+          '--overlap[Chunk overlap]:chars:' \\
+          '--batch-size[Texts per API call]:size:' \\
+          '--text-field[Text field for JSON]:field:' \\
+          '--extensions[File extensions]:exts:' \\
+          '--ignore[Dirs to skip]:dirs:' \\
+          '--create-index[Auto-create vector index]' \\
+          '--dry-run[Preview without executing]' \\
+          '--json[JSON output]' \\
+          '(-q --quiet)'{-q,--quiet}'[Suppress non-essential output]'
+        ;;
       completions)
         _arguments \\
          '1:shell:(bash zsh)'
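Existing installs will need regenerated completion scripts to pick up the new commands. The completions command takes the shell name as its only argument; where you install the output depends on your shell setup:

    vai completions bash   # regenerate bash completions
    vai completions zsh    # regenerate zsh completions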
package/src/commands/init.js
ADDED
@@ -0,0 +1,153 @@
'use strict';

const fs = require('fs');
const path = require('path');
const readline = require('readline');
const { MODEL_CATALOG } = require('../lib/catalog');
const { STRATEGIES } = require('../lib/chunker');
const { defaultProjectConfig, saveProject, findProjectFile, PROJECT_FILE } = require('../lib/project');
const ui = require('../lib/ui');

/**
 * Prompt the user for input with a default value.
 * @param {readline.Interface} rl
 * @param {string} question
 * @param {string} [defaultVal]
 * @returns {Promise<string>}
 */
function ask(rl, question, defaultVal) {
  const suffix = defaultVal ? ` ${ui.dim(`(${defaultVal})`)}` : '';
  return new Promise((resolve) => {
    rl.question(`  ${question}${suffix}: `, (answer) => {
      resolve(answer.trim() || defaultVal || '');
    });
  });
}

/**
 * Prompt for a choice from a list.
 * @param {readline.Interface} rl
 * @param {string} question
 * @param {string[]} choices
 * @param {string} defaultVal
 * @returns {Promise<string>}
 */
async function askChoice(rl, question, choices, defaultVal) {
  console.log('');
  for (let i = 0; i < choices.length; i++) {
    const marker = choices[i] === defaultVal ? ui.cyan('→') : ' ';
    console.log(`  ${marker} ${i + 1}. ${choices[i]}`);
  }
  const answer = await ask(rl, question, defaultVal);
  // Accept number or value
  const num = parseInt(answer, 10);
  if (num >= 1 && num <= choices.length) return choices[num - 1];
  if (choices.includes(answer)) return answer;
  return defaultVal;
}

/**
 * Register the init command on a Commander program.
 * @param {import('commander').Command} program
 */
function registerInit(program) {
  program
    .command('init')
    .description('Initialize a project with .vai.json configuration')
    .option('-y, --yes', 'Accept all defaults (non-interactive)')
    .option('--force', 'Overwrite existing .vai.json')
    .option('--json', 'Output created config as JSON (non-interactive)')
    .option('-q, --quiet', 'Suppress non-essential output')
    .action(async (opts) => {
      // Check for existing config
      const existing = findProjectFile();
      if (existing && !opts.force) {
        const relPath = path.relative(process.cwd(), existing);
        console.error(ui.warn(`Project already initialized: ${relPath}`));
        console.error(ui.dim('  Use --force to overwrite.'));
        process.exit(1);
      }

      const defaults = defaultProjectConfig();

      // Non-interactive mode
      if (opts.yes || opts.json) {
        const filePath = saveProject(defaults);
        if (opts.json) {
          console.log(JSON.stringify(defaults, null, 2));
        } else if (!opts.quiet) {
          console.log(ui.success(`Created ${PROJECT_FILE}`));
        }
        return;
      }

      // Interactive mode
      console.log('');
      console.log(ui.bold('  🚀 Initialize Voyage AI Project'));
      console.log(ui.dim('  Creates .vai.json in the current directory.'));
      console.log(ui.dim('  Press Enter to accept defaults.'));
      console.log('');

      const rl = readline.createInterface({
        input: process.stdin,
        output: process.stdout,
      });

      try {
        // Embedding model
        const embeddingModels = MODEL_CATALOG
          .filter(m => m.type === 'embedding' && !m.legacy && !m.unreleased)
          .map(m => m.name);
        const model = await askChoice(rl, 'Embedding model', embeddingModels, defaults.model);

        // MongoDB settings
        console.log('');
        console.log(ui.bold('  MongoDB Atlas'));
        const db = await ask(rl, 'Database name', defaults.db || 'myapp');
        const collection = await ask(rl, 'Collection name', defaults.collection || 'documents');
        const field = await ask(rl, 'Embedding field', defaults.field);
        const index = await ask(rl, 'Vector index name', defaults.index);

        // Dimensions
        const modelInfo = MODEL_CATALOG.find(m => m.name === model);
        const defaultDims = modelInfo && modelInfo.dimensions.includes('1024') ? '1024' : '512';
        const dimensions = parseInt(await ask(rl, 'Dimensions', defaultDims), 10) || parseInt(defaultDims, 10);

        // Chunking
        console.log('');
        console.log(ui.bold('  Chunking'));
        const strategy = await askChoice(rl, 'Chunk strategy', STRATEGIES, defaults.chunk.strategy);
        const chunkSize = parseInt(await ask(rl, 'Chunk size (chars)', String(defaults.chunk.size)), 10);
        const chunkOverlap = parseInt(await ask(rl, 'Chunk overlap (chars)', String(defaults.chunk.overlap)), 10);

        const config = {
          model,
          db,
          collection,
          field,
          inputType: 'document',
          dimensions,
          index,
          chunk: {
            strategy,
            size: chunkSize,
            overlap: chunkOverlap,
          },
        };

        const filePath = saveProject(config);
        console.log('');
        console.log(ui.success(`Created ${path.relative(process.cwd(), filePath)}`));
        console.log('');
        console.log(ui.dim('  Next steps:'));
        console.log(ui.dim('    vai chunk ./docs/              # Chunk your documents'));
        console.log(ui.dim('    vai pipeline ./docs/           # Chunk → embed → store (coming soon)'));
        console.log(ui.dim('    vai search --query "..."       # Search your collection'));
        console.log('');
      } finally {
        rl.close();
      }
    });
}

module.exports = { registerInit };
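A non-interactive run accepts the defaults from defaultProjectConfig(), which lives in project.js and is not shown in this excerpt. The resulting .vai.json carries the keys assembled in the config object above; the values in this sketch are illustrative only:

    vai init --yes
    # .vai.json (illustrative values; actual defaults come from project.js):
    # {
    #   "model": "voyage-4-large",
    #   "db": "myapp",
    #   "collection": "documents",
    #   "field": "embedding",
    #   "inputType": "document",
    #   "dimensions": 1024,
    #   "index": "vector_index",
    #   "chunk": { "strategy": "recursive", "size": 1200, "overlap": 150 }
    # }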