voyageai-cli 1.2.0 → 1.4.0

@@ -0,0 +1,414 @@
+ 'use strict';
+
+ const fs = require('fs');
+ const path = require('path');
+ const { getDefaultModel } = require('../lib/catalog');
+ const { generateEmbeddings } = require('../lib/api');
+ const { getMongoCollection } = require('../lib/mongo');
+ const ui = require('../lib/ui');
+
+ /**
+  * Detect file format from extension and content.
+  * @param {string} filePath
+  * @returns {'csv'|'json'|'jsonl'|'text'}
+  */
+ function detectFormat(filePath) {
+   const ext = path.extname(filePath).toLowerCase();
+   if (ext === '.csv') return 'csv';
+   if (ext === '.json') return 'json';
+   if (ext === '.jsonl' || ext === '.ndjson') return 'jsonl';
+
+   // Try to detect from content
+   const content = fs.readFileSync(filePath, 'utf-8');
+   const firstLine = content.split('\n').find(l => l.trim());
+   if (!firstLine) return 'text';
+
+   // Check for JSON array first (starts with [)
+   if (firstLine.trim().startsWith('[')) return 'json';
+
+   try {
+     JSON.parse(firstLine);
+     return 'jsonl';
+   } catch {
+     // not JSON per line
+   }
+   return 'text';
+ }
+
+ /**
+  * Parse a CSV line handling quoted fields.
+  * @param {string} line
+  * @returns {string[]}
+  */
+ function parseCSVLine(line) {
+   const fields = [];
+   let current = '';
+   let inQuotes = false;
+
+   for (let i = 0; i < line.length; i++) {
+     const ch = line[i];
+     if (inQuotes) {
+       if (ch === '"') {
+         if (i + 1 < line.length && line[i + 1] === '"') {
+           current += '"';
+           i++; // skip escaped quote
+         } else {
+           inQuotes = false;
+         }
+       } else {
+         current += ch;
+       }
+     } else {
+       if (ch === '"') {
+         inQuotes = true;
+       } else if (ch === ',') {
+         fields.push(current);
+         current = '';
+       } else {
+         current += ch;
+       }
+     }
+   }
+   fields.push(current);
+   return fields;
+ }
+
+ /**
+  * Parse documents from a file.
+  * @param {string} filePath
+  * @param {string} format
+  * @param {object} options
+  * @param {string} [options.textField] - JSON/JSONL field for text (default: "text")
+  * @param {string} [options.textColumn] - CSV column for text
+  * @returns {{documents: object[], textKey: string}}
+  */
+ function parseFile(filePath, format, options = {}) {
+   const content = fs.readFileSync(filePath, 'utf-8').trim();
+   const textField = options.textField || 'text';
+
+   if (format === 'jsonl') {
+     const lines = content.split('\n').filter(l => l.trim());
+     const documents = lines.map((line, i) => {
+       try {
+         return JSON.parse(line);
+       } catch (e) {
+         throw new Error(`Invalid JSON on line ${i + 1}: ${e.message}`);
+       }
+     });
+     // Validate text field exists
+     for (let i = 0; i < documents.length; i++) {
+       if (!documents[i][textField]) {
+         throw new Error(`Document on line ${i + 1} missing "${textField}" field. Use --text-field to specify the text field.`);
+       }
+     }
+     return { documents, textKey: textField };
+   }
+
+   if (format === 'json') {
+     let documents;
+     try {
+       documents = JSON.parse(content);
+     } catch (e) {
+       throw new Error(`Invalid JSON file: ${e.message}`);
+     }
+     if (!Array.isArray(documents)) {
+       throw new Error('JSON file must contain an array of objects.');
+     }
+     for (let i = 0; i < documents.length; i++) {
+       if (!documents[i][textField]) {
+         throw new Error(`Document at index ${i} missing "${textField}" field. Use --text-field to specify the text field.`);
+       }
+     }
+     return { documents, textKey: textField };
+   }
+
+   if (format === 'csv') {
+     const lines = content.split('\n').filter(l => l.trim());
+     if (lines.length < 2) {
+       throw new Error('CSV file must have a header row and at least one data row.');
+     }
+     const headers = parseCSVLine(lines[0]);
+     const textColumn = options.textColumn;
+     if (!textColumn) {
+       throw new Error('CSV files require --text-column to specify which column contains the text to embed.');
+     }
+     const textIndex = headers.indexOf(textColumn);
+     if (textIndex === -1) {
+       throw new Error(`Column "${textColumn}" not found in CSV headers: ${headers.join(', ')}`);
+     }
+
+     const documents = [];
+     for (let i = 1; i < lines.length; i++) {
+       const values = parseCSVLine(lines[i]);
+       const doc = {};
+       for (let j = 0; j < headers.length; j++) {
+         doc[headers[j]] = values[j] !== undefined ? values[j] : '';
+       }
+       if (!doc[textColumn]) {
+         throw new Error(`Row ${i + 1} has empty text column "${textColumn}".`);
+       }
+       documents.push(doc);
+     }
+     return { documents, textKey: textColumn };
+   }
+
+   // Plain text: one document per non-empty line
+   const lines = content.split('\n').filter(l => l.trim());
+   const documents = lines.map(line => ({ text: line.trim() }));
+   return { documents, textKey: 'text' };
+ }
+
+ /**
+  * Rough token estimate (~4 chars per token).
+  * @param {string[]} texts
+  * @returns {number}
+  */
+ function estimateTokens(texts) {
+   const totalChars = texts.reduce((sum, t) => sum + t.length, 0);
+   return Math.ceil(totalChars / 4);
+ }
+
+ /**
+  * Write a progress bar to stderr.
+  * @param {number} current
+  * @param {number} total
+  * @param {number} batch
+  * @param {number} totalBatches
+  * @param {number} tokens
+  */
+ function updateProgress(current, total, batch, totalBatches, tokens) {
+   const pct = Math.round((current / total) * 100);
+   const barLen = 20;
+   const filled = Math.round(barLen * current / total);
+   const bar = '\u2588'.repeat(filled) + '\u2591'.repeat(barLen - filled);
+   const line = ` ${bar} ${current}/${total} (${pct}%) | Batch ${batch}/${totalBatches} | ${tokens.toLocaleString()} tokens`;
+   process.stderr.write(`\r${line}`);
+ }
+
+ /**
+  * Register the ingest command on a Commander program.
+  * @param {import('commander').Command} program
+  */
+ function registerIngest(program) {
+   program
+     .command('ingest')
+     .description('Bulk import documents: read file, embed in batches, store in MongoDB Atlas')
+     .requiredOption('--file <path>', 'Input file (JSON, JSONL, CSV, or plain text)')
+     .requiredOption('--db <database>', 'Database name')
+     .requiredOption('--collection <name>', 'Collection name')
+     .requiredOption('--field <name>', 'Embedding field name')
+     .option('-m, --model <model>', 'Embedding model', getDefaultModel())
+     .option('--input-type <type>', 'Input type: query or document', 'document')
+     .option('-d, --dimensions <n>', 'Output dimensions', (v) => parseInt(v, 10))
+     .option('--batch-size <n>', 'Documents per batch (default: 50, max: 128)', (v) => parseInt(v, 10), 50)
+     .option('--text-column <name>', 'CSV column to embed (required for CSV)')
+     .option('--text-field <name>', 'JSON/JSONL field containing text to embed', 'text')
+     .option('--dry-run', 'Parse file and show stats without embedding or inserting')
+     .option('--strict', 'Abort on first batch error')
+     .option('--json', 'Machine-readable JSON output')
+     .option('-q, --quiet', 'Suppress progress, show only final summary')
+     .action(async (opts) => {
+       const startTime = Date.now();
+
+       // Validate file exists
+       if (!fs.existsSync(opts.file)) {
+         console.error(ui.error(`File not found: ${opts.file}`));
+         process.exit(1);
+       }
+
+       // Validate batch size (out-of-range values are rejected, not clamped)
+       if (opts.batchSize > 128) {
+         console.error(ui.error('Batch size cannot exceed 128 (Voyage API limit).'));
+         process.exit(1);
+       }
+       if (opts.batchSize < 1) {
+         console.error(ui.error('Batch size must be at least 1.'));
+         process.exit(1);
+       }
+
+       // Detect format
+       const format = detectFormat(opts.file);
+
+       // Parse documents
+       let documents, textKey;
+       try {
+         const parsed = parseFile(opts.file, format, {
+           textField: opts.textField,
+           textColumn: opts.textColumn,
+         });
+         documents = parsed.documents;
+         textKey = parsed.textKey;
+       } catch (err) {
+         console.error(ui.error(err.message));
+         process.exit(1);
+       }
+
+       if (documents.length === 0) {
+         console.error(ui.error('No documents found in file.'));
+         process.exit(1);
+       }
+
+       const texts = documents.map(d => d[textKey]);
+       const totalBatches = Math.ceil(documents.length / opts.batchSize);
+
+       // Dry run mode
+       if (opts.dryRun) {
+         const estimated = estimateTokens(texts);
+         if (opts.json) {
+           console.log(JSON.stringify({
+             dryRun: true,
+             format,
+             documents: documents.length,
+             batches: totalBatches,
+             batchSize: opts.batchSize,
+             estimatedTokens: estimated,
+             model: opts.model,
+             textField: textKey,
+           }, null, 2));
+         } else {
+           console.log(ui.info('Dry run — no embeddings generated, nothing inserted.\n'));
+           console.log(ui.label('File', opts.file));
+           console.log(ui.label('Format', format));
+           console.log(ui.label('Documents', String(documents.length)));
+           console.log(ui.label('Batches', `${totalBatches} (batch size: ${opts.batchSize})`));
+           console.log(ui.label('Est. tokens', `~${estimated.toLocaleString()}`));
+           console.log(ui.label('Model', opts.model));
+           console.log(ui.label('Text field', textKey));
+           console.log(ui.label('Target', `${opts.db}.${opts.collection}`));
+           console.log(ui.label('Embed field', opts.field));
+         }
+         return;
+       }
+
+       // Real ingest
+       let client;
+       try {
+         const { client: c, collection } = await getMongoCollection(opts.db, opts.collection);
+         client = c;
+
+         let totalTokens = 0;
+         let succeeded = 0;
+         let failed = 0;
+         const errors = [];
+
+         if (!opts.quiet && !opts.json) {
+           process.stderr.write('Ingesting documents...\n');
+         }
+
+         for (let i = 0; i < documents.length; i += opts.batchSize) {
+           const batchNum = Math.floor(i / opts.batchSize) + 1;
+           const batch = documents.slice(i, i + opts.batchSize);
+           const batchTexts = batch.map(d => d[textKey]);
+
+           try {
+             const embedResult = await generateEmbeddings(batchTexts, {
+               model: opts.model,
+               inputType: opts.inputType,
+               dimensions: opts.dimensions,
+             });
+
+             // Attach embeddings to documents
+             for (let j = 0; j < batch.length; j++) {
+               batch[j][opts.field] = embedResult.data[j].embedding;
+               batch[j].model = opts.model;
+               batch[j].dimensions = embedResult.data[j].embedding.length;
+               batch[j].ingestedAt = new Date();
+             }
+
+             // Insert batch into MongoDB
+             await collection.insertMany(batch);
+
+             totalTokens += embedResult.usage?.total_tokens || 0;
+             succeeded += batch.length;
+           } catch (err) {
+             failed += batch.length;
+             errors.push({ batch: batchNum, error: err.message });
+
+             if (opts.strict) {
+               if (!opts.quiet && !opts.json) {
+                 process.stderr.write('\n');
+               }
+               console.error(ui.error(`Batch ${batchNum} failed: ${err.message}`));
+               console.error(ui.error('Aborting (--strict mode).'));
+               process.exit(1);
+             }
+
+             if (!opts.quiet && !opts.json) {
+               process.stderr.write(`\n${ui.warn(`Batch ${batchNum} failed: ${err.message}`)}\n`);
+             }
+           }
+
+           // Update progress
+           if (!opts.quiet && !opts.json) {
+             updateProgress(
+               Math.min(i + opts.batchSize, documents.length),
+               documents.length,
+               batchNum,
+               totalBatches,
+               totalTokens
+             );
+           }
+         }
+
+         // Clear progress line
+         if (!opts.quiet && !opts.json) {
+           process.stderr.write('\n');
+         }
+
+         const duration = ((Date.now() - startTime) / 1000).toFixed(1);
+         const rate = (succeeded / (duration > 0 ? duration : 1)).toFixed(1);
+
+         if (opts.json) {
+           const summary = {
+             succeeded,
+             failed,
+             total: documents.length,
+             database: opts.db,
+             collection: opts.collection,
+             batches: totalBatches,
+             tokens: totalTokens,
+             model: opts.model,
+             durationSeconds: parseFloat(duration),
+             docsPerSecond: parseFloat(rate),
+           };
+           if (errors.length > 0) {
+             summary.errors = errors;
+           }
+           console.log(JSON.stringify(summary, null, 2));
+         } else {
+           if (failed === 0) {
+             console.log(ui.success(`Ingested ${succeeded} documents into ${opts.db}.${opts.collection}`));
+           } else {
+             console.log(ui.warn(`Ingested ${succeeded} of ${documents.length} documents into ${opts.db}.${opts.collection} (${failed} failed)`));
+           }
+           console.log(ui.label('Batches', String(totalBatches)));
+           console.log(ui.label('Tokens', totalTokens.toLocaleString()));
+           console.log(ui.label('Model', opts.model));
+           console.log(ui.label('Duration', `${duration}s`));
+           console.log(ui.label('Rate', `${rate} docs/sec`));
+           if (errors.length > 0) {
+             console.log('');
+             console.log(ui.warn(`${errors.length} batch(es) failed:`));
+             for (const e of errors) {
+               console.log(` Batch ${e.batch}: ${e.error}`);
+             }
+           }
+         }
+       } catch (err) {
+         console.error(ui.error(err.message));
+         process.exit(1);
+       } finally {
+         if (client) await client.close();
+       }
+     });
+ }
+
+ module.exports = {
+   registerIngest,
+   // Exported for testing
+   detectFormat,
+   parseFile,
+   parseCSVLine,
+   estimateTokens,
+   updateProgress,
+ };
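
The helpers above are exported for testing, so their behavior can be checked in isolation. The snippet below is an illustrative sketch by the editor, not package code; the `./ingest` require path assumes a file name this diff does not state.

  // Illustrative sketch only; the './ingest' path is an assumption.
  const { parseCSVLine, estimateTokens, detectFormat } = require('./ingest');

  // parseCSVLine splits on commas outside quotes and unescapes doubled quotes:
  parseCSVLine('id,"hello, ""world""",42');
  // => ['id', 'hello, "world"', '42']

  // detectFormat resolves .ndjson to the jsonl format from the extension alone:
  detectFormat('notes.ndjson');
  // => 'jsonl'

  // estimateTokens applies the ~4-characters-per-token heuristic:
  estimateTokens(['hello world', 'voyage embeddings']);
  // => 7  (28 characters / 4, rounded up)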
@@ -0,0 +1,175 @@
+ 'use strict';
+
+ const fs = require('fs');
+ const { generateEmbeddings } = require('../lib/api');
+ const { cosineSimilarity } = require('../lib/math');
+ const { getDefaultModel } = require('../lib/catalog');
+ const ui = require('../lib/ui');
+
+ /**
+  * Register the similarity command on a Commander program.
+  * @param {import('commander').Command} program
+  */
+ function registerSimilarity(program) {
+   program
+     .command('similarity')
+     .description('Compute cosine similarity between texts')
+     .argument('[texts...]', 'Two texts to compare')
+     .option('--against <texts...>', 'Compare first text against multiple texts')
+     .option('--file1 <path>', 'Read text A from file')
+     .option('--file2 <path>', 'Read text B from file')
+     .option('-m, --model <model>', 'Embedding model', getDefaultModel())
+     .option('--dimensions <n>', 'Output dimensions', (v) => parseInt(v, 10))
+     .option('--json', 'Machine-readable JSON output')
+     .option('-q, --quiet', 'Suppress non-essential output')
+     .action(async (texts, opts) => {
+       try {
+         let textA = null;
+         let compareTexts = [];
+         let isOneVsMany = false;
+
+         // Resolve text A
+         if (opts.file1) {
+           textA = fs.readFileSync(opts.file1, 'utf-8').trim();
+         } else if (texts.length > 0) {
+           textA = texts[0];
+         }
+
+         // Resolve comparison targets
+         if (opts.against && opts.against.length > 0) {
+           // One-vs-many mode
+           isOneVsMany = true;
+           compareTexts = opts.against;
+         } else if (opts.file2) {
+           compareTexts = [fs.readFileSync(opts.file2, 'utf-8').trim()];
+         } else if (texts.length >= 2) {
+           compareTexts = [texts[1]];
+         }
+
+         // Validate inputs
+         if (!textA) {
+           console.error(ui.error('No input text provided. Provide two texts, use --file1/--file2, or use --against.'));
+           process.exit(1);
+         }
+
+         if (compareTexts.length === 0) {
+           console.error(ui.error('Need at least two texts to compare. Provide a second text, --file2, or --against.'));
+           process.exit(1);
+         }
+
+         // Batch all texts into one API call
+         const allTexts = [textA, ...compareTexts];
+
+         const useSpinner = !opts.json && !opts.quiet;
+         let spin;
+         if (useSpinner) {
+           spin = ui.spinner('Computing similarity...');
+           spin.start();
+         }
+
+         const embeddingOpts = {
+           model: opts.model,
+         };
+         if (opts.dimensions) {
+           embeddingOpts.dimensions = opts.dimensions;
+         }
+         // Don't set inputType — we're comparing directly, not query/document
+
+         const result = await generateEmbeddings(allTexts, embeddingOpts);
+
+         if (spin) spin.stop();
+
+         const embeddings = result.data.map(d => d.embedding);
+         const tokens = result.usage?.total_tokens || 0;
+         const model = result.model || opts.model;
+
+         const refEmbedding = embeddings[0];
+
+         if (!isOneVsMany && compareTexts.length === 1) {
+           // Two-text comparison
+           const sim = cosineSimilarity(refEmbedding, embeddings[1]);
+
+           if (opts.json) {
+             console.log(JSON.stringify({
+               similarity: sim,
+               metric: 'cosine',
+               textA,
+               textB: compareTexts[0],
+               model,
+               tokens,
+             }, null, 2));
+             return;
+           }
+
+           if (opts.quiet) {
+             console.log(sim.toFixed(6));
+             return;
+           }
+
+           console.log('');
+           console.log(` Similarity: ${ui.score(sim)} (cosine)`);
+           console.log('');
+           console.log(ui.label('Text A', `"${truncate(textA, 70)}"`));
+           console.log(ui.label('Text B', `"${truncate(compareTexts[0], 70)}"`));
+           console.log(ui.label('Model', ui.cyan(model)));
+           console.log(ui.label('Tokens', ui.dim(String(tokens))));
+           console.log('');
+         } else {
+           // One-vs-many comparison
+           const results = compareTexts.map((text, i) => ({
+             text,
+             similarity: cosineSimilarity(refEmbedding, embeddings[i + 1]),
+           }));
+
+           // Sort by similarity descending
+           results.sort((a, b) => b.similarity - a.similarity);
+
+           if (opts.json) {
+             console.log(JSON.stringify({
+               query: textA,
+               results,
+               model,
+               tokens,
+             }, null, 2));
+             return;
+           }
+
+           if (opts.quiet) {
+             for (const r of results) {
+               console.log(`${r.similarity.toFixed(6)}\t"${truncate(r.text, 60)}"`);
+             }
+             return;
+           }
+
+           console.log('');
+           console.log(` Query: ${ui.cyan(`"${truncate(textA, 60)}"`)}`);
+           console.log(` Model: ${ui.cyan(model)}`);
+           console.log('');
+
+           for (const r of results) {
+             console.log(` ${ui.score(r.similarity)} "${truncate(r.text, 60)}"`);
+           }
+
+           console.log('');
+           console.log(` ${ui.dim(`${results.length} comparisons, ${tokens} tokens`)}`);
+           console.log('');
+         }
+       } catch (err) {
+         console.error(ui.error(err.message));
+         process.exit(1);
+       }
+     });
+ }
+
+ /**
+  * Truncate a string to maxLen, appending '...' if truncated.
+  * @param {string} str
+  * @param {number} maxLen
+  * @returns {string}
+  */
+ function truncate(str, maxLen) {
+   if (str.length <= maxLen) return str;
+   return str.substring(0, maxLen) + '...';
+ }
+
+ module.exports = { registerSimilarity };
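
The `cosineSimilarity` helper imported from `../lib/math` is not included in this diff. For readers following the math, a conventional implementation of the metric this command reports (dot product divided by the product of the vector magnitudes) looks roughly like the editor's sketch below; the package's actual helper may differ in detail.

  // Sketch only; the package's ../lib/math is not shown in this diff.
  function cosineSimilarity(a, b) {
    let dot = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
      dot += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }
    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
  }

  cosineSimilarity([1, 0], [1, 0]); // => 1 (same direction)
  cosineSimilarity([1, 0], [0, 1]); // => 0 (orthogonal)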
package/src/lib/banner.js CHANGED
@@ -35,6 +35,7 @@ function showBanner() {
  console.log(titleLine);
  console.log(taglineLine);
  console.log(bot);
+ console.log(pc.dim(' Community tool — not an official MongoDB or Voyage AI product'));
  console.log('');
  }