voyageai-cli 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,414 @@
1
+ 'use strict';
2
+
3
+ const fs = require('fs');
4
+ const path = require('path');
5
+ const { getDefaultModel } = require('../lib/catalog');
6
+ const { generateEmbeddings } = require('../lib/api');
7
+ const { getMongoCollection } = require('../lib/mongo');
8
+ const ui = require('../lib/ui');
9
+
/**
 * Detect the input format of a file.
 *
 * The extension decides when it is one of .csv, .json, .jsonl or .ndjson.
 * Otherwise the file is read and its first non-blank line is inspected:
 *   - a line starting with "[" means a JSON array file ('json');
 *   - a line that parses to a JSON object means one object per line ('jsonl');
 *   - anything else is plain text ('text').
 *
 * A line that parses to a JSON primitive (e.g. `42`, `"quoted"`, `true`) is
 * treated as text: such lines are valid JSON but are not documents, so
 * classifying them as JSONL would only make later parsing fail.
 *
 * @param {string} filePath - Path to the input file (read synchronously when sniffing).
 * @returns {'csv'|'json'|'jsonl'|'text'}
 */
function detectFormat(filePath) {
  const ext = path.extname(filePath).toLowerCase();
  if (ext === '.csv') return 'csv';
  if (ext === '.json') return 'json';
  if (ext === '.jsonl' || ext === '.ndjson') return 'jsonl';

  // Unknown extension: fall back to sniffing the content.
  const content = fs.readFileSync(filePath, 'utf-8');
  const firstLine = content.split('\n').find((l) => l.trim());
  if (!firstLine) return 'text';

  // Trim once so a trailing "\r" or leading whitespace cannot affect the checks.
  const probe = firstLine.trim();

  // Check for a JSON array first (starts with "[").
  if (probe.startsWith('[')) return 'json';

  try {
    const parsed = JSON.parse(probe);
    // Only an object is a usable JSONL document; primitives fall through to text.
    if (parsed !== null && typeof parsed === 'object') return 'jsonl';
  } catch {
    // Not JSON: treat the file as plain text.
  }
  return 'text';
}
37
+
/**
 * Split a single CSV line into fields.
 *
 * Supports double-quoted fields: commas inside quotes are kept, and a doubled
 * quote ("") inside a quoted field yields one literal quote. A trailing
 * carriage return is dropped, so lines obtained by splitting a CRLF file on
 * "\n" do not leak "\r" into the last field.
 *
 * Quoted fields that span several lines are not supported; the input is one
 * physical line.
 *
 * @param {string} line
 * @returns {string[]} The field values, in order (always at least one element).
 */
function parseCSVLine(line) {
  // Strip the CR left behind when a CRLF-terminated file is split on "\n".
  const input = line.endsWith('\r') ? line.slice(0, -1) : line;

  const fields = [];
  let current = '';
  let inQuotes = false;

  for (let i = 0; i < input.length; i++) {
    const ch = input[i];

    if (inQuotes) {
      if (ch !== '"') {
        current += ch;
      } else if (input[i + 1] === '"') {
        // Doubled quote inside a quoted field is an escaped literal quote.
        current += '"';
        i++;
      } else {
        inQuotes = false;
      }
      continue;
    }

    if (ch === '"') {
      inQuotes = true;
    } else if (ch === ',') {
      fields.push(current);
      current = '';
    } else {
      current += ch;
    }
  }

  fields.push(current);
  return fields;
}
75
+
/**
 * Read a file and parse it into documents according to `format`.
 *
 * - 'jsonl': one JSON object per non-blank line.
 * - 'json':  a single JSON array of objects.
 * - 'csv':   a header row followed by data rows; every header becomes a key.
 * - any other value: plain text, one `{ text }` document per non-blank line.
 *
 * Both LF and CRLF line endings are accepted. For JSON/JSONL every document
 * must be an object whose text field is a non-empty string; for CSV the text
 * column must be non-empty in every row.
 *
 * @param {string} filePath
 * @param {string} format
 * @param {object} options
 * @param {string} [options.textField] - JSON/JSONL field for text (default: "text")
 * @param {string} [options.textColumn] - CSV column for text (required for CSV)
 * @returns {{documents: object[], textKey: string}}
 * @throws {Error} On invalid JSON, a non-array JSON file, a document without
 *   usable text, a CSV without data rows, or a missing/unknown text column.
 */
function parseFile(filePath, format, options = {}) {
  const content = fs.readFileSync(filePath, 'utf-8').trim();
  const textField = options.textField || 'text';

  // Non-blank lines, tolerant of CRLF so "\r" never ends up inside a value.
  const nonBlankLines = () => content.split(/\r?\n/).filter((l) => l.trim());

  // Validate one JSON/JSONL document; `where` names it in error messages.
  const requireText = (doc, where) => {
    if (doc === null || typeof doc !== 'object' || Array.isArray(doc)) {
      throw new Error(`${where} is not a JSON object.`);
    }
    const value = doc[textField];
    if (value === undefined || value === null || value === '') {
      throw new Error(`${where} missing "${textField}" field. Use --text-field to specify the text field.`);
    }
    if (typeof value !== 'string') {
      throw new Error(`${where} has a non-string "${textField}" field (${typeof value}); the text to embed must be a string.`);
    }
  };

  if (format === 'jsonl') {
    const documents = nonBlankLines().map((line, i) => {
      try {
        return JSON.parse(line);
      } catch (e) {
        throw new Error(`Invalid JSON on line ${i + 1}: ${e.message}`);
      }
    });
    documents.forEach((doc, i) => requireText(doc, `Document on line ${i + 1}`));
    return { documents, textKey: textField };
  }

  if (format === 'json') {
    let documents;
    try {
      documents = JSON.parse(content);
    } catch (e) {
      throw new Error(`Invalid JSON file: ${e.message}`);
    }
    if (!Array.isArray(documents)) {
      throw new Error('JSON file must contain an array of objects.');
    }
    documents.forEach((doc, i) => requireText(doc, `Document at index ${i}`));
    return { documents, textKey: textField };
  }

  if (format === 'csv') {
    const lines = nonBlankLines();
    if (lines.length < 2) {
      throw new Error('CSV file must have a header row and at least one data row.');
    }
    const headers = parseCSVLine(lines[0]);
    const textColumn = options.textColumn;
    if (!textColumn) {
      throw new Error('CSV files require --text-column to specify which column contains the text to embed.');
    }
    if (!headers.includes(textColumn)) {
      throw new Error(`Column "${textColumn}" not found in CSV headers: ${headers.join(', ')}`);
    }

    const documents = [];
    for (let i = 1; i < lines.length; i++) {
      const values = parseCSVLine(lines[i]);
      const doc = {};
      // Short rows are padded with empty strings so every document has every column.
      headers.forEach((header, j) => {
        doc[header] = values[j] !== undefined ? values[j] : '';
      });
      if (!doc[textColumn]) {
        throw new Error(`Row ${i + 1} has empty text column "${textColumn}".`);
      }
      documents.push(doc);
    }
    return { documents, textKey: textColumn };
  }

  // Plain text: one document per non-empty line.
  const documents = nonBlankLines().map((line) => ({ text: line.trim() }));
  return { documents, textKey: 'text' };
}
160
+
/**
 * Rough token estimate using the heuristic of ~4 characters per token.
 *
 * Entries that are not strings are measured by their string form, and
 * null/undefined entries count as empty, so the result is always a finite
 * number rather than NaN.
 *
 * @param {string[]} texts
 * @returns {number} Estimated token count, rounded up.
 */
function estimateTokens(texts) {
  let totalChars = 0;
  for (const t of texts) {
    if (t === null || t === undefined) continue;
    totalChars += (typeof t === 'string' ? t : String(t)).length;
  }
  return Math.ceil(totalChars / 4);
}
170
+
/**
 * Write a single-line progress bar to stderr.
 *
 * The line starts with a carriage return so successive calls overwrite each
 * other. The bar and percentage are computed from `current` clamped to
 * [0, total], so an overshooting counter or an empty job (total of 0) cannot
 * produce "NaN%" or a negative repeat count; the raw `current`/`total` values
 * are still printed as given.
 *
 * @param {number} current - Documents processed so far.
 * @param {number} total - Total documents.
 * @param {number} batch - Current batch number.
 * @param {number} totalBatches - Total number of batches.
 * @param {number} tokens - Tokens consumed so far.
 */
function updateProgress(current, total, batch, totalBatches, tokens) {
  const barLen = 20;
  const hasWork = total > 0;
  const shown = hasWork ? Math.min(Math.max(current, 0), total) : 0;

  const pct = hasWork ? Math.round((shown / total) * 100) : 0;
  const filled = hasWork ? Math.round(barLen * shown / total) : 0;

  const bar = '\u2588'.repeat(filled) + '\u2591'.repeat(barLen - filled);
  const line = ` ${bar} ${current}/${total} (${pct}%) | Batch ${batch}/${totalBatches} | ${tokens.toLocaleString()} tokens`;
  process.stderr.write(`\r${line}`);
}
187
+
/**
 * Register the `ingest` command on a Commander program.
 *
 * The command parses documents from --file, embeds their text in batches with
 * generateEmbeddings, and inserts each embedded batch into the collection
 * returned by getMongoCollection. With --dry-run it only parses the file and
 * prints statistics (no embedding, no database connection).
 *
 * Each ingested document is mutated in place before insertion: the embedding is
 * stored under --field, and `model`, `dimensions` and `ingestedAt` are set.
 *
 * Failure handling: invalid options, an unreadable/empty input, or a connection
 * error print a message and exit with code 1. A failing batch is recorded and
 * skipped, unless --strict is given, in which case the process exits with
 * code 1 on the first failing batch.
 *
 * @param {import('commander').Command} program
 */
function registerIngest(program) {
  program
    .command('ingest')
    .description('Bulk import documents: read file, embed in batches, store in MongoDB Atlas')
    .requiredOption('--file <path>', 'Input file (JSON, JSONL, CSV, or plain text)')
    .requiredOption('--db <database>', 'Database name')
    .requiredOption('--collection <name>', 'Collection name')
    .requiredOption('--field <name>', 'Embedding field name')
    .option('-m, --model <model>', 'Embedding model', getDefaultModel())
    .option('--input-type <type>', 'Input type: query or document', 'document')
    .option('-d, --dimensions <n>', 'Output dimensions', (v) => parseInt(v, 10))
    .option('--batch-size <n>', 'Documents per batch (default: 50, max: 128)', (v) => parseInt(v, 10), 50)
    .option('--text-column <name>', 'CSV column to embed (required for CSV)')
    .option('--text-field <name>', 'JSON/JSONL field containing text to embed', 'text')
    .option('--dry-run', 'Parse file and show stats without embedding or inserting')
    .option('--strict', 'Abort on first batch error')
    .option('--json', 'Machine-readable JSON output')
    .option('-q, --quiet', 'Suppress progress, show only final summary')
    .action(async (opts) => {
      const startTime = Date.now();
      // Progress and warnings go to stderr only in interactive (non-JSON, non-quiet) mode.
      const showProgress = !opts.quiet && !opts.json;

      // Validate file exists
      if (!fs.existsSync(opts.file)) {
        console.error(ui.error(`File not found: ${opts.file}`));
        process.exit(1);
      }

      // Validate batch size (out-of-range values are rejected, not clamped).
      // parseInt yields NaN for non-numeric input, and NaN passes both range
      // comparisons below, so it must be rejected explicitly.
      if (!Number.isInteger(opts.batchSize)) {
        console.error(ui.error('Batch size must be a whole number.'));
        process.exit(1);
      }
      if (opts.batchSize > 128) {
        console.error(ui.error('Batch size cannot exceed 128 (Voyage API limit).'));
        process.exit(1);
      }
      if (opts.batchSize < 1) {
        console.error(ui.error('Batch size must be at least 1.'));
        process.exit(1);
      }

      // Detect format
      const format = detectFormat(opts.file);

      // Parse documents
      let documents;
      let textKey;
      try {
        const parsed = parseFile(opts.file, format, {
          textField: opts.textField,
          textColumn: opts.textColumn,
        });
        documents = parsed.documents;
        textKey = parsed.textKey;
      } catch (err) {
        console.error(ui.error(err.message));
        process.exit(1);
      }

      if (documents.length === 0) {
        console.error(ui.error('No documents found in file.'));
        process.exit(1);
      }

      const texts = documents.map((d) => d[textKey]);
      const totalBatches = Math.ceil(documents.length / opts.batchSize);

      // Dry run mode
      if (opts.dryRun) {
        const estimated = estimateTokens(texts);
        if (opts.json) {
          console.log(JSON.stringify({
            dryRun: true,
            format,
            documents: documents.length,
            batches: totalBatches,
            batchSize: opts.batchSize,
            estimatedTokens: estimated,
            model: opts.model,
            textField: textKey,
          }, null, 2));
        } else {
          console.log(ui.info('Dry run — no embeddings generated, nothing inserted.\n'));
          console.log(ui.label('File', opts.file));
          console.log(ui.label('Format', format));
          console.log(ui.label('Documents', String(documents.length)));
          console.log(ui.label('Batches', `${totalBatches} (batch size: ${opts.batchSize})`));
          console.log(ui.label('Est. tokens', `~${estimated.toLocaleString()}`));
          console.log(ui.label('Model', opts.model));
          console.log(ui.label('Text field', textKey));
          console.log(ui.label('Target', `${opts.db}.${opts.collection}`));
          console.log(ui.label('Embed field', opts.field));
        }
        return;
      }

      // Real ingest
      let client;
      try {
        const { client: c, collection } = await getMongoCollection(opts.db, opts.collection);
        client = c;

        let totalTokens = 0;
        let succeeded = 0;
        let failed = 0;
        const errors = [];

        if (showProgress) {
          process.stderr.write('Ingesting documents...\n');
        }

        for (let i = 0; i < documents.length; i += opts.batchSize) {
          const batchNum = Math.floor(i / opts.batchSize) + 1;
          const batch = documents.slice(i, i + opts.batchSize);
          const batchTexts = batch.map((d) => d[textKey]);

          try {
            const embedResult = await generateEmbeddings(batchTexts, {
              model: opts.model,
              inputType: opts.inputType,
              dimensions: opts.dimensions,
            });

            // Attach embeddings to documents
            for (let j = 0; j < batch.length; j++) {
              const { embedding } = embedResult.data[j];
              batch[j][opts.field] = embedding;
              batch[j].model = opts.model;
              batch[j].dimensions = embedding.length;
              batch[j].ingestedAt = new Date();
            }

            // Insert batch into MongoDB
            await collection.insertMany(batch);

            totalTokens += embedResult.usage?.total_tokens || 0;
            succeeded += batch.length;
          } catch (err) {
            // The whole batch is counted as failed, whether embedding or insertion threw.
            failed += batch.length;
            errors.push({ batch: batchNum, error: err.message });

            if (opts.strict) {
              if (showProgress) {
                process.stderr.write('\n');
              }
              console.error(ui.error(`Batch ${batchNum} failed: ${err.message}`));
              console.error(ui.error('Aborting (--strict mode).'));
              process.exit(1);
            }

            if (showProgress) {
              process.stderr.write(`\n${ui.warn(`Batch ${batchNum} failed: ${err.message}`)}\n`);
            }
          }

          // Update progress
          if (showProgress) {
            updateProgress(
              Math.min(i + opts.batchSize, documents.length),
              documents.length,
              batchNum,
              totalBatches,
              totalTokens
            );
          }
        }

        // Terminate the progress line
        if (showProgress) {
          process.stderr.write('\n');
        }

        const duration = ((Date.now() - startTime) / 1000).toFixed(1);
        // Work with the rounded value as a number; a 0.0s run divides by 1 instead of 0.
        const durationSeconds = Number(duration);
        const rate = (succeeded / (durationSeconds > 0 ? durationSeconds : 1)).toFixed(1);

        if (opts.json) {
          const summary = {
            succeeded,
            failed,
            total: documents.length,
            database: opts.db,
            collection: opts.collection,
            batches: totalBatches,
            tokens: totalTokens,
            model: opts.model,
            durationSeconds,
            docsPerSecond: Number(rate),
          };
          if (errors.length > 0) {
            summary.errors = errors;
          }
          console.log(JSON.stringify(summary, null, 2));
        } else {
          if (failed === 0) {
            console.log(ui.success(`Ingested ${succeeded} documents into ${opts.db}.${opts.collection}`));
          } else {
            console.log(ui.warn(`Ingested ${succeeded} of ${documents.length} documents into ${opts.db}.${opts.collection} (${failed} failed)`));
          }
          console.log(ui.label('Batches', String(totalBatches)));
          console.log(ui.label('Tokens', totalTokens.toLocaleString()));
          console.log(ui.label('Model', opts.model));
          console.log(ui.label('Duration', `${duration}s`));
          console.log(ui.label('Rate', `${rate} docs/sec`));
          if (errors.length > 0) {
            console.log('');
            console.log(ui.warn(`${errors.length} batch(es) failed:`));
            for (const e of errors) {
              console.log(` Batch ${e.batch}: ${e.error}`);
            }
          }
        }
      } catch (err) {
        console.error(ui.error(err.message));
        process.exit(1);
      } finally {
        if (client) await client.close();
      }
    });
}
405
+
406
+ module.exports = {
407
+ registerIngest,
408
+ // The helpers below are exported only so tests can call them directly.
409
+ detectFormat,
410
+ parseFile,
411
+ parseCSVLine,
412
+ estimateTokens,
413
+ updateProgress,
414
+ };
@@ -1,10 +1,36 @@
1
1
  'use strict';
2
2
 
3
3
  const { MODEL_CATALOG } = require('../lib/catalog');
4
- const { API_BASE } = require('../lib/api');
4
+ const { getApiBase } = require('../lib/api');
5
5
  const { formatTable } = require('../lib/format');
6
6
  const ui = require('../lib/ui');
7
7
 
/**
 * Shorten a dimensions string for compact display.
 *
 * The entry marked "(default)" is kept and suffixed with "*"; the marker may
 * appear anywhere in the comma-separated list. Strings without a default
 * marker, the "—" placeholder, and non-string values are returned unchanged.
 *
 *   "1024 (default), 256, 512, 2048" → "1024*"
 *   "256, 1024 (default), 2048"      → "1024*"
 *   "1024"                           → "1024"
 *   "—"                              → "—"
 *
 * @param {string} dims
 * @returns {string}
 */
function compactDimensions(dims) {
  if (typeof dims !== 'string' || dims === '—') return dims;
  // Accept the default entry at the start of the list or after a comma.
  const match = dims.match(/(?:^|,\s*)(\d+)\s*\(default\)/);
  return match ? `${match[1]}*` : dims;
}
22
+
/**
 * Shorten a price string for compact display.
 *
 * Every occurrence of each pattern is rewritten, so prices with more than two
 * components are compacted completely.
 *
 *   "$0.12/1M tokens"      → "$0.12/1M"
 *   "$0.12/M + $0.60/B px" → "$0.12/M+$0.60/Bpx"
 *
 * @param {string} price
 * @returns {string}
 */
function compactPrice(price) {
  return price
    .replace(/\/1M tokens/g, '/1M')
    .replace(/ \+ /g, '+')
    .replace(/\/B px/g, '/Bpx');
}
33
+
8
34
  /**
9
35
  * Register the models command on a Commander program.
10
36
  * @param {import('commander').Command} program
@@ -14,6 +40,7 @@ function registerModels(program) {
14
40
  .command('models')
15
41
  .description('List available Voyage AI models')
16
42
  .option('-t, --type <type>', 'Filter by type: embedding, reranking, or all', 'all')
43
+ .option('-w, --wide', 'Wide output (show all columns untruncated)')
17
44
  .option('--json', 'Machine-readable JSON output')
18
45
  .option('-q, --quiet', 'Suppress non-essential output')
19
46
  .action((opts) => {
@@ -33,28 +60,49 @@ function registerModels(program) {
33
60
  return;
34
61
  }
35
62
 
63
+ const apiBase = getApiBase();
64
+
36
65
  if (!opts.quiet) {
37
66
  console.log(ui.bold('Voyage AI Models'));
38
- console.log(ui.dim(`(via MongoDB AI API — ${API_BASE})`));
67
+ console.log(ui.dim(`(via ${apiBase})`));
39
68
  console.log('');
40
69
  }
41
70
 
42
- const headers = ['Model', 'Type', 'Context', 'Dimensions', 'Price', 'Best For'];
43
- const rows = models.map(m => {
44
- const name = ui.cyan(m.name);
45
- const type = m.type === 'embedding' ? ui.green(m.type) : ui.yellow(m.type);
46
- const price = ui.dim(m.price);
47
- return [name, type, m.context, m.dimensions, price, m.bestFor];
48
- });
49
-
50
- // Use bold headers
51
- const boldHeaders = headers.map(h => ui.bold(h));
52
- console.log(formatTable(boldHeaders, rows));
71
+ if (opts.wide) {
72
+ // Full table with all details
73
+ const headers = ['Model', 'Type', 'Context', 'Dimensions', 'Price', 'Best For'];
74
+ const rows = models.map(m => {
75
+ const name = ui.cyan(m.name);
76
+ const type = m.type === 'embedding' ? ui.green(m.type) : ui.yellow(m.type);
77
+ const price = ui.dim(m.price);
78
+ return [name, type, m.context, m.dimensions, price, m.bestFor];
79
+ });
80
+ const boldHeaders = headers.map(h => ui.bold(h));
81
+ console.log(formatTable(boldHeaders, rows));
82
+ } else {
83
+ // Compact table — fits in 80 cols
84
+ const headers = ['Model', 'Type', 'Dims', 'Price', 'Use Case'];
85
+ const rows = models.map(m => {
86
+ const name = ui.cyan(m.name);
87
+ const type = m.type === 'embedding' ? ui.green('embed') : ui.yellow('rerank');
88
+ const dims = compactDimensions(m.dimensions);
89
+ const price = ui.dim(compactPrice(m.price));
90
+ return [name, type, dims, price, m.shortFor || m.bestFor];
91
+ });
92
+ const boldHeaders = headers.map(h => ui.bold(h));
93
+ console.log(formatTable(boldHeaders, rows));
94
+ }
53
95
 
54
96
  if (!opts.quiet) {
55
97
  console.log('');
98
+ if (!opts.wide) {
99
+ console.log(ui.dim('* = also supports 256, 512, 2048 dimensions'));
100
+ }
56
101
  console.log(ui.dim('Free tier: 200M tokens (most models), 50M (domain-specific)'));
57
102
  console.log(ui.dim('All 4-series models share the same embedding space.'));
103
+ if (!opts.wide) {
104
+ console.log(ui.dim('Use --wide for full details.'));
105
+ }
58
106
  }
59
107
  });
60
108
  }
@@ -1,6 +1,6 @@
1
1
  'use strict';
2
2
 
3
- const { API_BASE, requireApiKey } = require('../lib/api');
3
+ const { getApiBase, requireApiKey } = require('../lib/api');
4
4
  const ui = require('../lib/ui');
5
5
 
6
6
  /**
@@ -28,6 +28,7 @@ function registerPing(program) {
28
28
  const useColor = !opts.json;
29
29
  const useSpinner = useColor && !opts.quiet;
30
30
 
31
+ const apiBase = getApiBase();
31
32
  const model = 'voyage-4-lite';
32
33
  const startTime = Date.now();
33
34
 
@@ -38,7 +39,7 @@ function registerPing(program) {
38
39
  }
39
40
 
40
41
  try {
41
- const response = await fetch(`${API_BASE}/embeddings`, {
42
+ const response = await fetch(`${apiBase}/embeddings`, {
42
43
  method: 'POST',
43
44
  headers: {
44
45
  'Content-Type': 'application/json',
@@ -83,7 +84,7 @@ function registerPing(program) {
83
84
  const dims = data.data && data.data[0] ? data.data[0].embedding.length : 'unknown';
84
85
  const tokens = data.usage ? data.usage.total_tokens : 'unknown';
85
86
 
86
- results.voyage = { ok: true, elapsed, model, dimensions: dims, tokens, endpoint: API_BASE };
87
+ results.voyage = { ok: true, elapsed, model, dimensions: dims, tokens, endpoint: apiBase };
87
88
 
88
89
  if (spin) spin.stop();
89
90
 
@@ -93,7 +94,7 @@ function registerPing(program) {
93
94
  console.log(`ok ${elapsed}ms`);
94
95
  } else {
95
96
  console.log(ui.success(`Connected to Voyage AI API ${ui.dim('(' + elapsed + 'ms)')}`));
96
- console.log(ui.label('Endpoint', API_BASE));
97
+ console.log(ui.label('Endpoint', apiBase));
97
98
  console.log(ui.label('Model', model));
98
99
  console.log(ui.label('Dimensions', String(dims)));
99
100
  console.log(ui.label('Tokens', String(tokens)));