voyageai-cli 1.19.2 → 1.20.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +158 -23
- package/package.json +7 -1
- package/src/cli.js +11 -0
- package/src/commands/app.js +155 -0
- package/src/commands/completions.js +19 -6
- package/src/commands/eval.js +353 -32
- package/src/commands/playground.js +56 -0
- package/src/lib/explanations.js +267 -0
- package/src/lib/telemetry.js +72 -0
- package/src/playground/icons/dark/128.png +0 -0
- package/src/playground/icons/dark/16.png +0 -0
- package/src/playground/icons/dark/256.png +0 -0
- package/src/playground/icons/dark/32.png +0 -0
- package/src/playground/icons/dark/64.png +0 -0
- package/src/playground/icons/glyphs/Bulb.svg +5 -0
- package/src/playground/icons/glyphs/Config.svg +3 -0
- package/src/playground/icons/glyphs/Gauge.svg +4 -0
- package/src/playground/icons/glyphs/InfoWithCircle.svg +3 -0
- package/src/playground/icons/glyphs/LightningBolt.svg +3 -0
- package/src/playground/icons/glyphs/MagnifyingGlass.svg +3 -0
- package/src/playground/icons/glyphs/MultiDirectionArrow.svg +6 -0
- package/src/playground/icons/light/128.png +0 -0
- package/src/playground/icons/light/16.png +0 -0
- package/src/playground/icons/light/256.png +0 -0
- package/src/playground/icons/light/32.png +0 -0
- package/src/playground/icons/light/64.png +0 -0
- package/src/playground/index.html +4775 -1987
- package/NOTICE +0 -23
- package/demo-readme.gif +0 -0
package/src/commands/eval.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
3
|
const fs = require('fs');
|
|
4
|
-
const { getDefaultModel, DEFAULT_RERANK_MODEL } = require('../lib/catalog');
|
|
4
|
+
const { getDefaultModel, DEFAULT_RERANK_MODEL, MODEL_CATALOG } = require('../lib/catalog');
|
|
5
5
|
const { generateEmbeddings, apiRequest } = require('../lib/api');
|
|
6
6
|
const { getMongoCollection } = require('../lib/mongo');
|
|
7
7
|
const { loadProject } = require('../lib/project');
|
|
@@ -10,17 +10,41 @@ const ui = require('../lib/ui');
|
|
|
10
10
|
|
|
11
11
|
/**
|
|
12
12
|
* Load a test set from a JSONL file.
|
|
13
|
-
*
|
|
14
|
-
*
|
|
13
|
+
*
|
|
14
|
+
* Retrieval mode (default):
|
|
15
|
+
* { "query": "...", "relevant": ["id1", "id2"] }
|
|
16
|
+
* { "query": "...", "relevant_texts": ["text1", "text2"] }
|
|
17
|
+
*
|
|
18
|
+
* Rerank mode (--mode rerank):
|
|
19
|
+
* { "query": "...", "documents": ["doc1", "doc2", ...], "relevant": [0, 2] }
|
|
20
|
+
* relevant = indices into documents array that are considered relevant.
|
|
21
|
+
*
|
|
15
22
|
* @param {string} filePath
|
|
16
|
-
* @
|
|
23
|
+
* @param {string} mode - 'retrieval' or 'rerank'
|
|
24
|
+
* @returns {Array}
|
|
17
25
|
*/
|
|
18
|
-
function loadTestSet(filePath) {
|
|
26
|
+
function loadTestSet(filePath, mode = 'retrieval') {
|
|
19
27
|
const raw = fs.readFileSync(filePath, 'utf-8');
|
|
20
28
|
const lines = raw.split('\n').filter(l => l.trim().length > 0);
|
|
21
29
|
return lines.map((line, i) => {
|
|
22
30
|
const item = JSON.parse(line);
|
|
23
31
|
if (!item.query) throw new Error(`Line ${i + 1}: missing "query" field`);
|
|
32
|
+
|
|
33
|
+
if (mode === 'rerank') {
|
|
34
|
+
if (!item.documents || !Array.isArray(item.documents) || item.documents.length < 2) {
|
|
35
|
+
throw new Error(`Line ${i + 1}: rerank mode requires "documents" array (≥2 items)`);
|
|
36
|
+
}
|
|
37
|
+
if (!item.relevant || !Array.isArray(item.relevant) || item.relevant.length === 0) {
|
|
38
|
+
throw new Error(`Line ${i + 1}: rerank mode requires "relevant" array of document indices`);
|
|
39
|
+
}
|
|
40
|
+
return {
|
|
41
|
+
query: item.query,
|
|
42
|
+
documents: item.documents,
|
|
43
|
+
relevant: item.relevant, // indices into documents
|
|
44
|
+
};
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
// Retrieval mode
|
|
24
48
|
if (!item.relevant && !item.relevant_texts) {
|
|
25
49
|
throw new Error(`Line ${i + 1}: need "relevant" (doc IDs) or "relevant_texts" (text matches)`);
|
|
26
50
|
}
|
|
@@ -39,25 +63,34 @@ function loadTestSet(filePath) {
|
|
|
39
63
|
function registerEval(program) {
|
|
40
64
|
program
|
|
41
65
|
.command('eval')
|
|
42
|
-
.description('Evaluate retrieval quality —
|
|
66
|
+
.description('Evaluate retrieval & reranking quality — MRR, NDCG, Recall on your data')
|
|
43
67
|
.requiredOption('--test-set <path>', 'JSONL file with queries and expected results')
|
|
44
|
-
.option('--
|
|
45
|
-
.option('--
|
|
46
|
-
.option('--
|
|
47
|
-
.option('--
|
|
48
|
-
.option('
|
|
49
|
-
.option('-
|
|
68
|
+
.option('--mode <mode>', 'Evaluation mode: "retrieval" (default) or "rerank"', 'retrieval')
|
|
69
|
+
.option('--db <database>', 'Database name (retrieval mode)')
|
|
70
|
+
.option('--collection <name>', 'Collection name (retrieval mode)')
|
|
71
|
+
.option('--index <name>', 'Vector search index name (retrieval mode)')
|
|
72
|
+
.option('--field <name>', 'Embedding field name (retrieval mode)')
|
|
73
|
+
.option('-m, --model <model>', 'Embedding model (retrieval) or rerank model (rerank mode)')
|
|
74
|
+
.option('--models <models>', 'Compare multiple rerank models (comma-separated)')
|
|
75
|
+
.option('-d, --dimensions <n>', 'Output dimensions (retrieval mode)', (v) => parseInt(v, 10))
|
|
50
76
|
.option('-l, --limit <n>', 'Vector search candidates per query', (v) => parseInt(v, 10), 20)
|
|
51
77
|
.option('-k, --k-values <values>', 'Comma-separated K values for @K metrics', '1,3,5,10')
|
|
52
|
-
.option('--rerank', 'Enable reranking')
|
|
53
|
-
.option('--no-rerank', 'Skip reranking')
|
|
54
|
-
.option('--rerank-model <model>', 'Reranking model')
|
|
78
|
+
.option('--rerank', 'Enable reranking (retrieval mode)')
|
|
79
|
+
.option('--no-rerank', 'Skip reranking (retrieval mode)')
|
|
80
|
+
.option('--rerank-model <model>', 'Reranking model (retrieval mode)')
|
|
81
|
+
.option('--top-k <n>', 'Top-K results to return from reranker', (v) => parseInt(v, 10))
|
|
55
82
|
.option('--text-field <name>', 'Document text field', 'text')
|
|
56
83
|
.option('--id-field <name>', 'Document ID field for matching (default: _id)', '_id')
|
|
57
84
|
.option('--compare <configs>', 'Compare configs: "model1,model2" or "rerank,no-rerank"')
|
|
58
85
|
.option('--json', 'Machine-readable JSON output')
|
|
59
86
|
.option('-q, --quiet', 'Suppress non-essential output')
|
|
60
87
|
.action(async (opts) => {
|
|
88
|
+
// Dispatch to rerank eval mode
|
|
89
|
+
if (opts.mode === 'rerank') {
|
|
90
|
+
await evalRerank(opts);
|
|
91
|
+
return;
|
|
92
|
+
}
|
|
93
|
+
|
|
61
94
|
let client;
|
|
62
95
|
try {
|
|
63
96
|
// Merge project config
|
|
@@ -236,23 +269,7 @@ function registerEval(program) {
|
|
|
236
269
|
console.log(` ${label} ${bar} ${color}`);
|
|
237
270
|
}
|
|
238
271
|
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
// Highlight key metrics
|
|
242
|
-
const mrr = aggregated.mrr;
|
|
243
|
-
const recall5 = aggregated['r@5'];
|
|
244
|
-
const ndcg10 = aggregated['ndcg@10'];
|
|
245
|
-
|
|
246
|
-
if (mrr !== undefined) {
|
|
247
|
-
const grade = mrr >= 0.8 ? ui.green('Excellent') : mrr >= 0.6 ? ui.cyan('Good') : mrr >= 0.4 ? ui.yellow('Fair') : ui.red('Needs work');
|
|
248
|
-
console.log(ui.label('MRR', `${mrr.toFixed(4)} — ${grade}`));
|
|
249
|
-
}
|
|
250
|
-
if (recall5 !== undefined) {
|
|
251
|
-
console.log(ui.label('Recall@5', `${(recall5 * 100).toFixed(1)}% of relevant docs found in top 5`));
|
|
252
|
-
}
|
|
253
|
-
if (ndcg10 !== undefined) {
|
|
254
|
-
console.log(ui.label('NDCG@10', `${ndcg10.toFixed(4)} — ranking quality`));
|
|
255
|
-
}
|
|
272
|
+
printMetricHighlights(aggregated);
|
|
256
273
|
|
|
257
274
|
// Worst queries
|
|
258
275
|
if (worstQueries.length > 0 && worstQueries[0].metrics.mrr < 1) {
|
|
@@ -269,6 +286,9 @@ function registerEval(program) {
|
|
|
269
286
|
console.log(ui.dim(` ${testSet.length} queries evaluated | Tokens: embed ${totalEmbedTokens}${totalRerankTokens ? `, rerank ${totalRerankTokens}` : ''}`));
|
|
270
287
|
|
|
271
288
|
// Suggestions
|
|
289
|
+
const mrr = aggregated.mrr;
|
|
290
|
+
const recall5 = aggregated['r@5'];
|
|
291
|
+
|
|
272
292
|
console.log('');
|
|
273
293
|
if (mrr !== undefined && mrr < 0.6) {
|
|
274
294
|
console.log(ui.dim(' 💡 Low MRR? Try: larger model, more candidates (--limit), or enable reranking (--rerank)'));
|
|
@@ -276,6 +296,7 @@ function registerEval(program) {
|
|
|
276
296
|
if (recall5 !== undefined && recall5 < 0.5) {
|
|
277
297
|
console.log(ui.dim(' 💡 Low recall? Try: increasing --limit, different chunking strategy, or review your test set'));
|
|
278
298
|
}
|
|
299
|
+
console.log(ui.dim(' 💡 Evaluate reranking quality: vai eval --mode rerank --test-set rerank-test.jsonl'));
|
|
279
300
|
} catch (err) {
|
|
280
301
|
console.error(ui.error(err.message));
|
|
281
302
|
process.exit(1);
|
|
@@ -285,6 +306,306 @@ function registerEval(program) {
|
|
|
285
306
|
});
|
|
286
307
|
}
|
|
287
308
|
|
|
309
|
+
/**
 * Evaluate reranking quality.
 *
 * Test set format (JSONL):
 *   { "query": "...", "documents": ["doc1", "doc2", ...], "relevant": [0, 2, 5] }
 *   relevant = indices into the documents array that are considered relevant.
 *
 * Sends each query + docs to the rerank API, then evaluates how well
 * the reranker surfaces relevant docs using nDCG, Recall, MRR, MAP.
 *
 * Output is either a JSON document (opts.json), a detailed single-model
 * report, or a side-by-side comparison when several models are requested.
 * Any failure is reported through ui.error() and ends the process with
 * exit code 1.
 *
 * @param {object} opts - Parsed CLI options.
 * @param {string} opts.kValues - Comma-separated K values for @K metrics.
 * @param {string} opts.testSet - Path to the JSONL test set.
 * @param {string} [opts.models] - Comma-separated rerank models to compare.
 * @param {string} [opts.model] - Single rerank model (falls back to DEFAULT_RERANK_MODEL).
 * @param {number} [opts.topK] - Forwarded to the rerank API as top_k when truthy.
 * @param {boolean} [opts.json] - Emit machine-readable JSON only.
 * @param {boolean} [opts.quiet] - Suppress the banner and progress output.
 * @returns {Promise<void>}
 */
async function evalRerank(opts) {
  try {
    const kValues = opts.kValues
      .split(',')
      .map(v => Number.parseInt(v.trim(), 10))
      .filter(v => !Number.isNaN(v));

    // Load test set in rerank mode
    let testSet;
    try {
      testSet = loadTestSet(opts.testSet, 'rerank');
    } catch (err) {
      console.error(ui.error(`Failed to load test set: ${err.message}`));
      process.exit(1);
    }

    if (testSet.length === 0) {
      console.error(ui.error('Test set is empty.'));
      process.exit(1);
    }

    // Determine which models to evaluate. Blank entries (e.g. a trailing
    // comma in --models) are dropped so no request is sent with an empty
    // model name.
    const rerankModels = opts.models
      ? opts.models.split(',').map(m => m.trim()).filter(Boolean)
      : [opts.model || DEFAULT_RERANK_MODEL];

    if (rerankModels.length === 0) {
      throw new Error('--models requires at least one model name');
    }

    const topK = opts.topK || undefined;
    const verbose = !opts.json && !opts.quiet;

    if (verbose) {
      console.log('');
      console.log(ui.bold('📊 Rerank Evaluation'));
      console.log(ui.dim(` Test set: ${testSet.length} queries`));
      console.log(ui.dim(` Models: ${rerankModels.join(', ')}`));
      console.log(ui.dim(` K values: ${kValues.join(', ')}`));
      if (topK) console.log(ui.dim(` Top-K: ${topK}`));
      console.log('');
    }

    // Models are evaluated one after another so the progress line on
    // stderr always refers to a single model.
    const allModelResults = [];
    for (const rerankModel of rerankModels) {
      allModelResults.push(await runRerankModel(rerankModel, testSet, kValues, topK, verbose));
    }

    // JSON output
    if (opts.json) {
      console.log(JSON.stringify({
        mode: 'rerank',
        kValues,
        models: allModelResults.map(r => ({
          model: r.model,
          summary: r.aggregated,
          tokens: r.totalTokens,
          avgLatencyMs: r.avgLatencyMs,
          queries: r.queries,
          perQuery: r.perQuery,
        })),
      }, null, 2));
      return;
    }

    // Pretty output
    if (allModelResults.length === 1) {
      printSingleRerankReport(allModelResults[0]);
    } else {
      printRerankComparison(allModelResults, testSet.length);
    }

    // Suggestions, based on the model with the highest nDCG@5
    console.log('');
    const bestResult = allModelResults.reduce((a, b) =>
      (a.aggregated['ndcg@5'] || 0) >= (b.aggregated['ndcg@5'] || 0) ? a : b
    );
    const ndcg5 = bestResult.aggregated['ndcg@5'];
    const recall5 = bestResult.aggregated['r@5'];

    if (ndcg5 !== undefined && ndcg5 < 0.5) {
      console.log(ui.dim(' 💡 Low nDCG@5? Try: more documents in the candidate set, or a different reranker.'));
    }
    if (recall5 !== undefined && recall5 < 0.5) {
      console.log(ui.dim(' 💡 Low Recall@5? The reranker may need more candidates to work with (increase initial retrieval).'));
    }
    // Only point at --models when the user evaluated a single model;
    // a multi-model run is already doing the comparison.
    if (allModelResults.length === 1) {
      console.log(ui.dim(' 💡 Compare with: vai eval --mode rerank --models "rerank-2.5,rerank-2.5-lite" --test-set data.jsonl'));
    }

    console.log('');
  } catch (err) {
    console.error(ui.error(err.message));
    process.exit(1);
  }
}

/**
 * Look up the per-million-token price of a rerank model in MODEL_CATALOG.
 *
 * The catalog entry is matched on `name`, with or without a `rerank-`
 * prefix, and the first `$<number>` in its `price` string is parsed.
 *
 * @param {string} rerankModel - Model name as given on the command line.
 * @returns {number|null} Parsed price, or null when the model or a
 *   parseable price is not found. A price of 0 is returned as 0.
 */
function lookupRerankPrice(rerankModel) {
  const entry = MODEL_CATALOG.find(m => m.name === rerankModel || m.name === `rerank-${rerankModel}`);
  const match = typeof entry?.price === 'string' ? entry.price.match(/\$([0-9.]+)/) : null;
  if (!match) return null;

  const price = Number.parseFloat(match[1]);
  return Number.isNaN(price) ? null : price;
}

/**
 * Run every test case through one rerank model and collect its metrics.
 *
 * @param {string} rerankModel - Model name sent to the rerank API.
 * @param {Array<{query: string, documents: string[], relevant: number[]}>} testSet
 * @param {number[]} kValues - K values passed to computeMetrics().
 * @param {number|undefined} topK - Sent as top_k when truthy.
 * @param {boolean} verbose - Write a progress line to stderr.
 * @returns {Promise<object>} Aggregated metrics, per-query details, token
 *   and latency totals for the model.
 */
async function runRerankModel(rerankModel, testSet, kValues, topK, verbose) {
  const perQueryResults = [];
  let totalTokens = 0;
  let totalLatency = 0;

  for (const [qi, testCase] of testSet.entries()) {
    if (verbose) {
      process.stderr.write(`\r [${rerankModel}] Evaluating query ${qi + 1}/${testSet.length}...`);
    }

    // Call rerank API
    const start = Date.now();
    const rerankResult = await apiRequest('/rerank', {
      query: testCase.query,
      documents: testCase.documents,
      model: rerankModel,
      ...(topK ? { top_k: topK } : {}),
    });
    const elapsed = Date.now() - start;
    totalLatency += elapsed;
    totalTokens += rerankResult.usage?.total_tokens || 0;

    // Each returned item is read as { index, relevance_score }; the list
    // order is taken as the reranker's ranking.
    const rerankedItems = rerankResult.data || [];

    // Convert document indices to string IDs for the metrics library
    const relevantIdSet = new Set(testCase.relevant.map(idx => `doc_${idx}`));
    const retrievedIds = rerankedItems.map(item => `doc_${item.index}`);

    const metrics = computeMetrics(retrievedIds, [...relevantIdSet], kValues);

    perQueryResults.push({
      query: testCase.query,
      relevant: testCase.relevant,
      rerankedOrder: rerankedItems.map(r => r.index),
      scores: rerankedItems.map(r => ({ index: r.index, score: r.relevance_score })),
      metrics,
      hits: retrievedIds.filter(id => relevantIdSet.has(id)).length,
      latencyMs: elapsed,
    });
  }

  if (verbose) {
    // Wipe the progress line
    process.stderr.write('\r' + ' '.repeat(60) + '\r');
  }

  return {
    model: rerankModel,
    aggregated: aggregateMetrics(perQueryResults.map(r => r.metrics)),
    perQuery: perQueryResults,
    totalTokens,
    avgLatencyMs: totalLatency / testSet.length,
    pricePerMTokens: lookupRerankPrice(rerankModel),
    queries: testSet.length,
  };
}

/**
 * Print the detailed report for a single-model run: one bar per metric,
 * the metric highlights, the hardest queries, and a totals footer.
 *
 * @param {object} result - Result object produced by runRerankModel().
 */
function printSingleRerankReport(result) {
  console.log(ui.bold(`Results: ${result.model}`));
  console.log('');

  const metricKeys = Object.keys(result.aggregated);
  const maxKeyLen = Math.max(...metricKeys.map(k => k.length));

  for (const key of metricKeys) {
    const val = result.aggregated[key];
    const bar = renderBar(val, 20);
    const label = key.toUpperCase().padEnd(maxKeyLen + 1);
    const valStr = val.toFixed(4);
    const color = val >= 0.8 ? ui.green(valStr) : val >= 0.5 ? ui.cyan(valStr) : ui.yellow(valStr);
    console.log(` ${label} ${bar} ${color}`);
  }

  printMetricHighlights(result.aggregated);

  // Worst queries: lowest MRR first, shown only if at least one is imperfect
  const sorted = [...result.perQuery].sort((a, b) => a.metrics.mrr - b.metrics.mrr);
  const worstQueries = sorted.slice(0, Math.min(3, sorted.length));
  if (worstQueries.length > 0 && worstQueries[0].metrics.mrr < 1) {
    console.log('');
    console.log(ui.bold('Hardest queries:'));
    for (const wq of worstQueries) {
      const preview = wq.query.substring(0, 60) + (wq.query.length > 60 ? '...' : '');
      const mrrStr = wq.metrics.mrr === 0 ? ui.red('miss') : ui.yellow(wq.metrics.mrr.toFixed(2));
      console.log(` ${mrrStr} "${preview}" (${wq.hits}/${wq.relevant.length} relevant found)`);
    }
  }

  console.log('');
  console.log(ui.dim(` ${result.queries} queries | ${result.totalTokens} tokens | avg ${result.avgLatencyMs.toFixed(0)}ms/query`));
}

/**
 * Print the multi-model comparison: summary table, nDCG bars, ranking
 * agreement, and per-query latency/token/cost averages.
 *
 * @param {object[]} allModelResults - One runRerankModel() result per model.
 * @param {number} queryCount - Number of test cases that were evaluated.
 */
function printRerankComparison(allModelResults, queryCount) {
  console.log(ui.bold('Rerank Model Comparison'));
  console.log('');

  // Summary table; the first model decides which metric columns exist
  const keyMetrics = ['mrr', 'ndcg@5', 'ndcg@10', 'r@5', 'r@10', 'ap'];
  const reference = allModelResults[0].aggregated;
  const availableMetrics = keyMetrics.filter(k => reference[k] !== undefined);

  // Header
  const modelColW = Math.max(22, ...allModelResults.map(r => r.model.length + 2));
  const header = ` ${'Model'.padEnd(modelColW)} ${availableMetrics.map(m => m.toUpperCase().padStart(9)).join('')} ${'Latency'.padStart(9)} ${'$/1M tok'.padStart(9)}`;
  console.log(ui.dim(header));
  console.log(ui.dim(' ' + '─'.repeat(header.length - 2)));

  // Find best value per metric for highlighting
  const bestPerMetric = {};
  for (const m of availableMetrics) {
    bestPerMetric[m] = Math.max(...allModelResults.map(r => r.aggregated[m]));
  }

  for (const result of allModelResults) {
    const cols = availableMetrics.map(m => {
      const val = result.aggregated[m];
      const str = val.toFixed(4);
      return val === bestPerMetric[m] ? ui.green(str.padStart(9)) : str.padStart(9);
    }).join('');

    const latStr = `${result.avgLatencyMs.toFixed(0)}ms`.padStart(9);
    const priceStr = result.pricePerMTokens != null ? `$${result.pricePerMTokens.toFixed(3)}`.padStart(9) : 'N/A'.padStart(9);

    console.log(` ${result.model.padEnd(modelColW)} ${cols} ${latStr} ${priceStr}`);
  }

  console.log('');

  // Per-metric visual comparison
  for (const m of ['ndcg@5', 'ndcg@10']) {
    // Skip only when the metric was not computed; a score of 0 still
    // deserves its (empty) bar next to the other models.
    if (reference[m] === undefined) continue;
    console.log(ui.bold(` ${m.toUpperCase()}`));
    for (const result of allModelResults) {
      const val = result.aggregated[m];
      const bar = renderBar(val, 30);
      const color = val === bestPerMetric[m] ? ui.green(val.toFixed(4)) : ui.cyan(val.toFixed(4));
      console.log(` ${result.model.padEnd(modelColW - 2)} ${bar} ${color}`);
    }
    console.log('');
  }

  // Agreement analysis: count queries whose top-maxK order is identical
  // across all models. maxK is capped by the first query's result length.
  console.log(ui.bold('Ranking Agreement'));
  const maxK = Math.min(5, ...allModelResults.map(r => r.perQuery[0]?.rerankedOrder?.length || 5));
  let agreeCount = 0;
  for (let qi = 0; qi < queryCount; qi++) {
    const orders = allModelResults.map(r => r.perQuery[qi].rerankedOrder.slice(0, maxK).join(','));
    if (orders.every(o => o === orders[0])) agreeCount++;
  }
  const agreePct = ((agreeCount / queryCount) * 100).toFixed(0);
  console.log(` ${agreeCount}/${queryCount} queries (${agreePct}%) have identical top-${maxK} rankings`);

  if (Number.parseInt(agreePct, 10) > 80) {
    console.log(ui.info(' High agreement — the cheaper/faster model may be sufficient.'));
  } else {
    console.log(ui.warn(' Significant disagreement — the premium model may capture important nuances.'));
  }

  console.log('');

  // Token/cost summary
  console.log(ui.dim(' Per-query averages:'));
  for (const result of allModelResults) {
    const tokPerQ = result.totalTokens / result.queries;
    const costPerQ = result.pricePerMTokens != null ? (tokPerQ / 1e6) * result.pricePerMTokens : null;
    const costStr = costPerQ != null ? `$${costPerQ.toFixed(6)}/query` : '';
    console.log(ui.dim(` ${result.model}: ${result.avgLatencyMs.toFixed(0)}ms, ${tokPerQ.toFixed(0)} tokens ${costStr}`));
  }
}
|
|
582
|
+
|
|
583
|
+
/**
 * Print a short, human-readable interpretation of the headline metrics.
 *
 * Emits a blank line, then one labelled line for each of MRR, NDCG@5,
 * NDCG@10 and Recall@5 that is present in the aggregate (in that order).
 * Metrics that are undefined are silently skipped.
 *
 * @param {object} aggregated - Aggregated metrics keyed by name
 *   ('mrr', 'ndcg@5', 'ndcg@10', 'r@5', ...).
 */
function printMetricHighlights(aggregated) {
  console.log('');

  const {
    mrr,
    'ndcg@5': ndcg5,
    'ndcg@10': ndcg10,
    'r@5': recall5,
  } = aggregated;

  if (mrr !== undefined) {
    // Grade bands: >=0.8, >=0.6, >=0.4, otherwise lowest
    let grade;
    if (mrr >= 0.8) {
      grade = ui.green('Excellent');
    } else if (mrr >= 0.6) {
      grade = ui.cyan('Good');
    } else if (mrr >= 0.4) {
      grade = ui.yellow('Fair');
    } else {
      grade = ui.red('Needs work');
    }
    console.log(ui.label('MRR', `${mrr.toFixed(4)} — ${grade}`));
  }

  // The two nDCG lines differ only in their cutoff
  const ndcgRows = [
    ['NDCG@5', ndcg5, 5],
    ['NDCG@10', ndcg10, 10],
  ];
  for (const [title, score, cutoff] of ndcgRows) {
    if (score === undefined) continue;
    console.log(ui.label(title, `${score.toFixed(4)} — ranking precision (top ${cutoff})`));
  }

  if (recall5 !== undefined) {
    const pct = (recall5 * 100).toFixed(1);
    console.log(ui.label('Recall@5', `${pct}% of relevant docs found in top 5`));
  }
}
|
|
608
|
+
|
|
288
609
|
/**
|
|
289
610
|
* Render a simple ASCII bar chart.
|
|
290
611
|
* @param {number} value - 0.0 to 1.0
|
|
@@ -82,6 +82,30 @@ function createPlaygroundServer() {
|
|
|
82
82
|
return;
|
|
83
83
|
}
|
|
84
84
|
|
|
85
|
+
// Serve icon assets: /icons/{dark|light}/{size}.png
|
|
86
|
+
const iconMatch = req.url.match(/^\/icons\/(dark|light)\/(\d+)\.png$/);
|
|
87
|
+
if (req.method === 'GET' && iconMatch) {
|
|
88
|
+
const variant = iconMatch[1];
|
|
89
|
+
const size = iconMatch[2];
|
|
90
|
+
// Try portable path first (src/playground/icons/), then electron/icons/
|
|
91
|
+
const portablePath = path.join(__dirname, '..', 'playground', 'icons', variant, `${size}.png`);
|
|
92
|
+
const electronPath = path.join(__dirname, '..', '..', 'electron', 'icons', variant,
|
|
93
|
+
'AppIcons', 'Assets.xcassets', 'AppIcon.appiconset', `${size}.png`);
|
|
94
|
+
const iconPath = fs.existsSync(portablePath) ? portablePath : electronPath;
|
|
95
|
+
if (fs.existsSync(iconPath)) {
|
|
96
|
+
const data = fs.readFileSync(iconPath);
|
|
97
|
+
res.writeHead(200, {
|
|
98
|
+
'Content-Type': 'image/png',
|
|
99
|
+
'Cache-Control': 'public, max-age=86400',
|
|
100
|
+
});
|
|
101
|
+
res.end(data);
|
|
102
|
+
} else {
|
|
103
|
+
res.writeHead(404);
|
|
104
|
+
res.end('Icon not found');
|
|
105
|
+
}
|
|
106
|
+
return;
|
|
107
|
+
}
|
|
108
|
+
|
|
85
109
|
// API: Models
|
|
86
110
|
if (req.method === 'GET' && req.url === '/api/models') {
|
|
87
111
|
const models = MODEL_CATALOG.filter(m => !m.legacy && !m.local && !m.unreleased);
|
|
@@ -125,6 +149,17 @@ function createPlaygroundServer() {
|
|
|
125
149
|
|
|
126
150
|
// Parse JSON body for POST routes
|
|
127
151
|
if (req.method === 'POST') {
|
|
152
|
+
// Check for API key before processing any API calls
|
|
153
|
+
const apiKeyConfigured = !!(process.env.VOYAGE_API_KEY || getConfigValue('apiKey'));
|
|
154
|
+
if (!apiKeyConfigured) {
|
|
155
|
+
res.writeHead(401, { 'Content-Type': 'application/json' });
|
|
156
|
+
res.end(JSON.stringify({
|
|
157
|
+
error: 'No API key configured. Run: vai config set api-key <your-key>',
|
|
158
|
+
code: 'NO_API_KEY',
|
|
159
|
+
}));
|
|
160
|
+
return;
|
|
161
|
+
}
|
|
162
|
+
|
|
128
163
|
const body = await readBody(req);
|
|
129
164
|
let parsed;
|
|
130
165
|
try {
|
|
@@ -178,6 +213,27 @@ function createPlaygroundServer() {
|
|
|
178
213
|
return;
|
|
179
214
|
}
|
|
180
215
|
|
|
216
|
+
// API: Multimodal Embed
|
|
217
|
+
if (req.url === '/api/multimodal-embed') {
|
|
218
|
+
const { inputs, model, input_type, output_dimension } = parsed;
|
|
219
|
+
if (!inputs || !Array.isArray(inputs) || inputs.length === 0) {
|
|
220
|
+
res.writeHead(400, { 'Content-Type': 'application/json' });
|
|
221
|
+
res.end(JSON.stringify({ error: 'inputs must be a non-empty array' }));
|
|
222
|
+
return;
|
|
223
|
+
}
|
|
224
|
+
const { apiRequest } = require('../lib/api');
|
|
225
|
+
const mmBody = {
|
|
226
|
+
inputs,
|
|
227
|
+
model: model || 'voyage-multimodal-3.5',
|
|
228
|
+
};
|
|
229
|
+
if (input_type) mmBody.input_type = input_type;
|
|
230
|
+
if (output_dimension) mmBody.output_dimension = output_dimension;
|
|
231
|
+
const result = await apiRequest('/multimodalembeddings', mmBody);
|
|
232
|
+
res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
233
|
+
res.end(JSON.stringify(result));
|
|
234
|
+
return;
|
|
235
|
+
}
|
|
236
|
+
|
|
181
237
|
// API: Benchmark (single model, single round — UI calls this per model)
|
|
182
238
|
if (req.url === '/api/benchmark/embed') {
|
|
183
239
|
const { texts, model, inputType, dimensions } = parsed;
|