voyageai-cli 1.15.0 → 1.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,311 @@
+ 'use strict';
+
+ const fs = require('fs');
+ const path = require('path');
+ const { chunk, estimateTokens, STRATEGIES } = require('../lib/chunker');
+ const { readFile, scanDirectory, isSupported, getReaderType } = require('../lib/readers');
+ const { loadProject } = require('../lib/project');
+ const { getDefaultModel } = require('../lib/catalog');
+ const { generateEmbeddings } = require('../lib/api');
+ const { getMongoCollection } = require('../lib/mongo');
+ const ui = require('../lib/ui');
+
+ /**
+  * Format number with commas.
+  */
+ function fmtNum(n) {
+   return n.toLocaleString('en-US');
+ }
+
+ /**
+  * Resolve input path(s) to file list.
+  */
+ function resolveFiles(input, opts) {
+   const resolved = path.resolve(input);
+   if (!fs.existsSync(resolved)) {
+     throw new Error(`Not found: ${input}`);
+   }
+
+   const stat = fs.statSync(resolved);
+   if (stat.isFile()) return [resolved];
+
+   if (stat.isDirectory()) {
+     const scanOpts = {};
+     if (opts.extensions) scanOpts.extensions = opts.extensions.split(',').map(e => e.trim());
+     if (opts.ignore) scanOpts.ignore = opts.ignore.split(',').map(d => d.trim());
+     return scanDirectory(resolved, scanOpts);
+   }
+
+   return [];
+ }
+
+ /**
+  * Register the pipeline command on a Commander program.
+  * @param {import('commander').Command} program
+  */
+ function registerPipeline(program) {
+   program
+     .command('pipeline <input>')
+     .description('End-to-end: chunk → embed → store in MongoDB Atlas')
+     .option('--db <database>', 'Database name')
+     .option('--collection <name>', 'Collection name')
+     .option('--field <name>', 'Embedding field name')
+     .option('--index <name>', 'Vector search index name')
+     .option('-m, --model <model>', 'Embedding model')
+     .option('-d, --dimensions <n>', 'Output dimensions', (v) => parseInt(v, 10))
+     .option('-s, --strategy <strategy>', 'Chunking strategy')
+     .option('-c, --chunk-size <n>', 'Target chunk size in characters', (v) => parseInt(v, 10))
+     .option('--overlap <n>', 'Overlap between chunks', (v) => parseInt(v, 10))
+     .option('--batch-size <n>', 'Texts per embedding API call', (v) => parseInt(v, 10), 25)
+     .option('--text-field <name>', 'Text field for JSON/JSONL input', 'text')
+     .option('--extensions <exts>', 'File extensions to include')
+     .option('--ignore <dirs>', 'Directory names to skip', 'node_modules,.git,__pycache__')
+     .option('--create-index', 'Auto-create vector search index if it doesn\'t exist')
+     .option('--dry-run', 'Show what would happen without executing')
+     .option('--json', 'Machine-readable JSON output')
+     .option('-q, --quiet', 'Suppress non-essential output')
+     .action(async (input, opts) => {
+       let client;
+       try {
+         // Merge project config
+         const { config: proj } = loadProject();
+         const projChunk = proj.chunk || {};
+
+         const db = opts.db || proj.db;
+         const collection = opts.collection || proj.collection;
+         const field = opts.field || proj.field || 'embedding';
+         const index = opts.index || proj.index || 'vector_index';
+         const model = opts.model || proj.model || getDefaultModel();
+         const dimensions = opts.dimensions || proj.dimensions;
+         const strategy = opts.strategy || projChunk.strategy || 'recursive';
+         const chunkSize = opts.chunkSize || projChunk.size || 512;
+         const overlap = opts.overlap != null ? opts.overlap : (projChunk.overlap != null ? projChunk.overlap : 50);
+         const batchSize = opts.batchSize || 25;
+         const textField = opts.textField || 'text';
+
+         if (!db || !collection) {
+           console.error(ui.error('Database and collection required. Use --db/--collection or "vai init".'));
+           process.exit(1);
+         }
+
+         if (!STRATEGIES.includes(strategy)) {
+           console.error(ui.error(`Unknown strategy: "${strategy}". Available: ${STRATEGIES.join(', ')}`));
+           process.exit(1);
+         }
+
+         // Step 1: Resolve files
+         const files = resolveFiles(input, opts);
+         if (files.length === 0) {
+           console.error(ui.error('No supported files found.'));
+           process.exit(1);
+         }
+
+         const basePath = fs.statSync(path.resolve(input)).isDirectory()
+           ? path.resolve(input)
+           : process.cwd();
+
+         const verbose = !opts.json && !opts.quiet;
+
+         if (verbose) {
+           console.log('');
+           console.log(ui.bold('🚀 Pipeline: chunk → embed → store'));
+           console.log(ui.dim(` Files: ${files.length} | Strategy: ${strategy} | Model: ${model}`));
+           console.log(ui.dim(` Target: ${db}.${collection} (field: ${field})`));
+           console.log('');
+         }
+
+         // Step 2: Chunk all files
+         if (verbose) console.log(ui.bold('Step 1/3 — Chunking'));
+
+         const allChunks = [];
+         let totalInputChars = 0;
+         const fileErrors = [];
+
+         for (const filePath of files) {
+           const relPath = path.relative(basePath, filePath);
+           try {
+             const content = await readFile(filePath, { textField });
+             const texts = typeof content === 'string'
+               ? [{ text: content, metadata: {} }]
+               : content;
+
+             for (const item of texts) {
+               const useStrategy = (strategy === 'recursive' && filePath.endsWith('.md'))
+                 ? 'markdown' : strategy;
+
+               const chunks = chunk(item.text, {
+                 strategy: useStrategy,
+                 size: chunkSize,
+                 overlap,
+               });
+
+               totalInputChars += item.text.length;
+
+               for (let ci = 0; ci < chunks.length; ci++) {
+                 allChunks.push({
+                   text: chunks[ci],
+                   metadata: {
+                     ...item.metadata,
+                     source: relPath,
+                     chunk_index: ci,
+                     total_chunks: chunks.length,
+                   },
+                 });
+               }
+             }
+
+             if (verbose) console.log(` ${ui.green('✓')} ${relPath} → ${allChunks.length} chunks total`);
+           } catch (err) {
+             fileErrors.push({ file: relPath, error: err.message });
+             if (verbose) console.error(` ${ui.red('✗')} ${relPath}: ${err.message}`);
+           }
+         }
+
+         if (allChunks.length === 0) {
+           console.error(ui.error('No chunks produced. Check your files and chunk settings.'));
+           process.exit(1);
+         }
+
+         const totalTokens = allChunks.reduce((sum, c) => sum + estimateTokens(c.text), 0);
+
+         if (verbose) {
+           console.log(ui.dim(` ${fmtNum(allChunks.length)} chunks, ~${fmtNum(totalTokens)} tokens`));
+           console.log('');
+         }
+
+         // Dry run — stop here
+         if (opts.dryRun) {
+           if (opts.json) {
+             console.log(JSON.stringify({
+               dryRun: true,
+               files: files.length,
+               chunks: allChunks.length,
+               estimatedTokens: totalTokens,
+               strategy, chunkSize, overlap, model, db, collection, field,
+             }, null, 2));
+           } else {
+             console.log(ui.success(`Dry run complete: ${fmtNum(allChunks.length)} chunks from ${files.length} files.`));
+             const cost = (totalTokens / 1e6) * 0.12;
+             console.log(ui.dim(` Estimated embedding cost: ~$${cost.toFixed(4)} with ${model}`));
+           }
+           return;
+         }
+
+         // Step 3: Embed in batches
+         if (verbose) console.log(ui.bold('Step 2/3 — Embedding'));
+
+         const batches = [];
+         for (let i = 0; i < allChunks.length; i += batchSize) {
+           batches.push(allChunks.slice(i, i + batchSize));
+         }
+
+         let embeddedCount = 0;
+         let totalApiTokens = 0;
+         const embeddings = new Array(allChunks.length);
+
+         for (let bi = 0; bi < batches.length; bi++) {
+           const batch = batches[bi];
+           const texts = batch.map(c => c.text);
+
+           if (verbose) {
+             const pct = Math.round(((bi + 1) / batches.length) * 100);
+             process.stderr.write(`\r Batch ${bi + 1}/${batches.length} (${pct}%)...`);
+           }
+
+           const embedOpts = { model, inputType: 'document' };
+           if (dimensions) embedOpts.dimensions = dimensions;
+
+           const result = await generateEmbeddings(texts, embedOpts);
+           totalApiTokens += result.usage?.total_tokens || 0;
+
+           for (let j = 0; j < result.data.length; j++) {
+             embeddings[embeddedCount + j] = result.data[j].embedding;
+           }
+           embeddedCount += batch.length;
+         }
+
+         if (verbose) {
+           process.stderr.write('\r');
+           console.log(` ${ui.green('✓')} Embedded ${fmtNum(embeddedCount)} chunks (${fmtNum(totalApiTokens)} tokens)`);
+           console.log('');
+         }
+
+         // Step 4: Store in MongoDB
+         if (verbose) console.log(ui.bold('Step 3/3 — Storing in MongoDB'));
+
+         const { client: c, collection: coll } = await getMongoCollection(db, collection);
+         client = c;
+
+         const documents = allChunks.map((chunk, i) => ({
+           text: chunk.text,
+           [field]: embeddings[i],
+           metadata: chunk.metadata,
+           _model: model,
+           _embeddedAt: new Date(),
+         }));
+
+         const insertResult = await coll.insertMany(documents);
+
+         if (verbose) {
+           console.log(` ${ui.green('✓')} Inserted ${fmtNum(insertResult.insertedCount)} documents`);
+         }
+
+         // Optional: create index
+         if (opts.createIndex) {
+           if (verbose) console.log('');
+           try {
+             const dim = embeddings[0]?.length || dimensions || 1024;
+             const indexDef = {
+               name: index,
+               type: 'vectorSearch',
+               definition: {
+                 fields: [{
+                   type: 'vector',
+                   path: field,
+                   numDimensions: dim,
+                   similarity: 'cosine',
+                 }],
+               },
+             };
+             await coll.createSearchIndex(indexDef);
+             if (verbose) console.log(` ${ui.green('✓')} Created vector index "${index}" (${dim} dims, cosine)`);
+           } catch (err) {
+             if (err.message?.includes('already exists')) {
+               if (verbose) console.log(` ${ui.dim('ℹ Index "' + index + '" already exists — skipping')}`);
+             } else {
+               if (verbose) console.error(` ${ui.yellow('⚠')} Index creation failed: ${err.message}`);
+             }
+           }
+         }
+
+         // Summary
+         if (opts.json) {
+           console.log(JSON.stringify({
+             files: files.length,
+             fileErrors: fileErrors.length,
+             chunks: allChunks.length,
+             tokens: totalApiTokens,
+             inserted: insertResult.insertedCount,
+             model, db, collection, field, strategy, chunkSize,
+             index: opts.createIndex ? index : null,
+           }, null, 2));
+         } else if (verbose) {
+           console.log('');
+           console.log(ui.success('Pipeline complete'));
+           console.log(ui.label('Files', `${fmtNum(files.length)}${fileErrors.length ? ` (${fileErrors.length} failed)` : ''}`));
+           console.log(ui.label('Chunks', fmtNum(allChunks.length)));
+           console.log(ui.label('Tokens', fmtNum(totalApiTokens)));
+           console.log(ui.label('Stored', `${fmtNum(insertResult.insertedCount)} docs → ${db}.${collection}`));
+           console.log('');
+           console.log(ui.dim(' Next: vai query "your search" --db ' + db + ' --collection ' + collection));
+         }
+       } catch (err) {
+         console.error(ui.error(err.message));
+         process.exit(1);
+       } finally {
+         if (client) await client.close();
+       }
+     });
+ }
+
+ module.exports = { registerPipeline };
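
For context, a minimal sketch of how this new command module could be wired into a Commander entry point, and what it writes. The bin script is not part of this diff, so the module path, the vai binary name (suggested by the command's own "vai init" and "vai query" hints), and the example flags below are assumptions.

    'use strict';
    // Hypothetical wiring sketch; not part of the published diff.
    const { Command } = require('commander');
    const { registerPipeline } = require('./commands/pipeline'); // assumed path for the file above

    const program = new Command().name('vai');
    registerPipeline(program);
    program.parseAsync(process.argv);

    // Assumed example run:
    //   vai pipeline ./docs --db kb --collection chunks --chunk-size 512 --create-index
    // Each inserted document has the shape
    //   { text, <field>: [...embedding...], metadata: { source, chunk_index, total_chunks }, _model, _embeddedAt }
    // where <field> defaults to "embedding".
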
@@ -0,0 +1,266 @@
+ 'use strict';
+
+ const { getDefaultModel, DEFAULT_RERANK_MODEL } = require('../lib/catalog');
+ const { generateEmbeddings, apiRequest } = require('../lib/api');
+ const { getMongoCollection } = require('../lib/mongo');
+ const { loadProject } = require('../lib/project');
+ const ui = require('../lib/ui');
+
+ /**
+  * Register the query command on a Commander program.
+  * @param {import('commander').Command} program
+  */
+ function registerQuery(program) {
+   program
+     .command('query <text>')
+     .description('Search + rerank in one shot — the two-stage retrieval pattern')
+     .option('--db <database>', 'Database name')
+     .option('--collection <name>', 'Collection name')
+     .option('--index <name>', 'Vector search index name')
+     .option('--field <name>', 'Embedding field name')
+     .option('-m, --model <model>', 'Embedding model for query')
+     .option('-d, --dimensions <n>', 'Output dimensions', (v) => parseInt(v, 10))
+     .option('-l, --limit <n>', 'Number of vector search candidates', (v) => parseInt(v, 10), 20)
+     .option('-k, --top-k <n>', 'Final results to return (after rerank)', (v) => parseInt(v, 10), 5)
+     .option('--rerank', 'Enable reranking (recommended)')
+     .option('--no-rerank', 'Skip reranking — vector search only')
+     .option('--rerank-model <model>', 'Reranking model')
+     .option('--text-field <name>', 'Document text field for reranking and display', 'text')
+     .option('--filter <json>', 'Pre-filter JSON for $vectorSearch')
+     .option('--num-candidates <n>', 'ANN candidates (default: limit × 15)', (v) => parseInt(v, 10))
+     .option('--show-vectors', 'Include embedding vectors in output')
+     .option('--json', 'Machine-readable JSON output')
+     .option('-q, --quiet', 'Suppress non-essential output')
+     .action(async (text, opts) => {
+       let client;
+       try {
+         // Merge project config
+         const { config: proj } = loadProject();
+         const db = opts.db || proj.db;
+         const collection = opts.collection || proj.collection;
+         const index = opts.index || proj.index || 'vector_index';
+         const field = opts.field || proj.field || 'embedding';
+         const model = opts.model || proj.model || getDefaultModel();
+         const rerankModel = opts.rerankModel || DEFAULT_RERANK_MODEL;
+         const textField = opts.textField || 'text';
+         const dimensions = opts.dimensions || proj.dimensions;
+         const doRerank = opts.rerank !== false;
+
+         if (!db || !collection) {
+           console.error(ui.error('Database and collection required. Use --db and --collection, or create .vai.json with "vai init".'));
+           process.exit(1);
+         }
+
+         const useColor = !opts.json;
+         const useSpinner = useColor && !opts.quiet;
+
+         // Step 1: Embed query
+         let spin;
+         if (useSpinner) {
+           spin = ui.spinner('Embedding query...');
+           spin.start();
+         }
+
+         const embedOpts = { model, inputType: 'query' };
+         if (dimensions) embedOpts.dimensions = dimensions;
+         const embedResult = await generateEmbeddings([text], embedOpts);
+         const queryVector = embedResult.data[0].embedding;
+         const embedTokens = embedResult.usage?.total_tokens || 0;
+
+         if (spin) spin.stop();
+
+         // Step 2: Vector search
+         if (useSpinner) {
+           spin = ui.spinner(`Searching ${db}.${collection}...`);
+           spin.start();
+         }
+
+         const { client: c, coll } = await connectCollection(db, collection);
+         client = c;
+
+         const numCandidates = opts.numCandidates || Math.min(opts.limit * 15, 10000);
+         const vectorSearchStage = {
+           index,
+           path: field,
+           queryVector,
+           numCandidates,
+           limit: opts.limit,
+         };
+
+         if (opts.filter) {
+           try {
+             vectorSearchStage.filter = JSON.parse(opts.filter);
+           } catch {
+             if (spin) spin.stop();
+             console.error(ui.error('Invalid --filter JSON.'));
+             process.exit(1);
+           }
+         }
+
+         const pipeline = [
+           { $vectorSearch: vectorSearchStage },
+           { $addFields: { _vsScore: { $meta: 'vectorSearchScore' } } },
+         ];
+
+         const searchResults = await coll.aggregate(pipeline).toArray();
+         if (spin) spin.stop();
+
+         if (searchResults.length === 0) {
+           if (opts.json) {
+             console.log(JSON.stringify({ query: text, results: [], stages: { search: 0, rerank: 0 } }, null, 2));
+           } else {
+             console.log(ui.yellow('No results found.'));
+           }
+           return;
+         }
+
+         // Step 3: Rerank (optional)
+         let finalResults;
+         let rerankTokens = 0;
+
+         if (doRerank && searchResults.length > 1) {
+           if (useSpinner) {
+             spin = ui.spinner(`Reranking ${searchResults.length} results...`);
+             spin.start();
+           }
+
+           // Extract text for reranking
+           const documents = searchResults.map(doc => {
+             const txt = doc[textField];
+             if (!txt) return JSON.stringify(doc);
+             return typeof txt === 'string' ? txt : JSON.stringify(txt);
+           });
+
+           const rerankBody = {
+             query: text,
+             documents,
+             model: rerankModel,
+             top_k: opts.topK,
+           };
+
+           const rerankResult = await apiRequest('/rerank', rerankBody);
+           rerankTokens = rerankResult.usage?.total_tokens || 0;
+
+           if (spin) spin.stop();
+
+           // Map reranked indices back to original docs
+           finalResults = (rerankResult.data || []).map(item => {
+             const doc = searchResults[item.index];
+             return {
+               ...doc,
+               _vsScore: doc._vsScore,
+               _rerankScore: item.relevance_score,
+               _finalScore: item.relevance_score,
+             };
+           });
+         } else {
+           // No rerank — just take top-k from vector search
+           finalResults = searchResults.slice(0, opts.topK).map(doc => ({
+             ...doc,
+             _finalScore: doc._vsScore,
+           }));
+         }
+
+         // Build output
+         const output = finalResults.map((doc, i) => {
+           const clean = {};
+           // Include key fields
+           if (doc._id) clean._id = doc._id;
+           if (doc[textField]) {
+             clean[textField] = doc[textField];
+           }
+           // Include metadata fields (skip internal scores; embedding only with --show-vectors)
+           for (const key of Object.keys(doc)) {
+             if (key === '_vsScore' || key === '_rerankScore' || key === '_finalScore') continue;
+             if (key === '_id' || key === textField) continue;
+             if (key === field && !opts.showVectors) continue;
+             clean[key] = doc[key];
+           }
+           // Scores
+           clean.score = doc._finalScore;
+           if (doc._vsScore !== undefined) clean.vectorScore = doc._vsScore;
+           if (doc._rerankScore !== undefined) clean.rerankScore = doc._rerankScore;
+           clean.rank = i + 1;
+           return clean;
+         });
+
+         if (opts.json) {
+           console.log(JSON.stringify({
+             query: text,
+             model,
+             rerankModel: doRerank ? rerankModel : null,
+             db,
+             collection,
+             stages: {
+               searchCandidates: searchResults.length,
+               finalResults: output.length,
+               reranked: doRerank && searchResults.length > 1,
+             },
+             tokens: { embed: embedTokens, rerank: rerankTokens },
+             results: output,
+           }, null, 2));
+           return;
+         }
+
+         // Pretty output
+         if (!opts.quiet) {
+           console.log('');
+           console.log(ui.label('Query', ui.cyan(`"${text}"`)));
+           console.log(ui.label('Search', `${searchResults.length} candidates from ${ui.dim(`${db}.${collection}`)}`));
+           if (doRerank && searchResults.length > 1) {
+             console.log(ui.label('Rerank', `Top ${output.length} via ${ui.dim(rerankModel)}`));
+           }
+           console.log(ui.label('Model', ui.dim(model)));
+           console.log('');
+         }
+
+         for (let i = 0; i < output.length; i++) {
+           const r = output[i];
+           const scoreStr = r.score != null ? ui.score(r.score) : 'N/A';
+           const vsStr = r.vectorScore != null ? ui.dim(`vs:${r.vectorScore.toFixed(3)}`) : '';
+           const rrStr = r.rerankScore != null ? ui.dim(`rr:${r.rerankScore.toFixed(3)}`) : '';
+           const scores = [vsStr, rrStr].filter(Boolean).join(' ');
+
+           console.log(`${ui.bold(`#${i + 1}`)} ${scoreStr} ${scores}`);
+
+           // Show text preview
+           const textVal = r[textField];
+           if (textVal) {
+             const preview = textVal.substring(0, 200);
+             const ellipsis = textVal.length > 200 ? '...' : '';
+             console.log(` ${preview}${ellipsis}`);
+           }
+
+           // Show source metadata if present
+           if (r.source) console.log(` ${ui.dim('source: ' + r.source)}`);
+           if (r.metadata?.source) console.log(` ${ui.dim('source: ' + r.metadata.source)}`);
+
+           console.log(` ${ui.dim('_id: ' + r._id)}`);
+           console.log('');
+         }
+
+         if (!opts.quiet) {
+           const totalTokens = embedTokens + rerankTokens;
+           console.log(ui.dim(` Tokens: ${totalTokens} (embed: ${embedTokens}${rerankTokens ? `, rerank: ${rerankTokens}` : ''})`));
+         }
+       } catch (err) {
+         console.error(ui.error(err.message));
+         process.exit(1);
+       } finally {
+         if (client) await client.close();
+       }
+     });
+ }
+
+ /**
+  * Connect to a MongoDB collection.
+  * @param {string} db
+  * @param {string} collName
+  * @returns {Promise<{client: MongoClient, coll: Collection}>}
+  */
+ async function connectCollection(db, collName) {
+   const { client, collection } = await getMongoCollection(db, collName);
+   return { client, coll: collection };
+ }
+
+ module.exports = { registerQuery };
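
The query command registers the same way. Again a sketch under assumptions (the module path, the vai binary name, and the example flags are not part of this diff).

    'use strict';
    // Hypothetical wiring sketch; not part of the published diff.
    const { Command } = require('commander');
    const { registerQuery } = require('./commands/query'); // assumed path for the file above

    const program = new Command().name('vai');
    registerQuery(program);
    program.parseAsync(process.argv);

    // Assumed example run:
    //   vai query "how are chunks stored" --db kb --collection chunks --limit 20 --top-k 5
    // Stage 1 pulls 20 candidates with $vectorSearch; stage 2 reorders them through the /rerank
    // endpoint and keeps the top 5. Passing --no-rerank stops after stage 1.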