voyageai-cli 1.13.0 → 1.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/cli.js +6 -0
- package/src/commands/benchmark.js +164 -0
- package/src/commands/chunk.js +277 -0
- package/src/commands/completions.js +51 -1
- package/src/commands/estimate.js +209 -0
- package/src/commands/init.js +153 -0
- package/src/commands/models.js +32 -4
- package/src/lib/catalog.js +42 -18
- package/src/lib/chunker.js +341 -0
- package/src/lib/explanations.js +183 -0
- package/src/lib/project.js +122 -0
- package/src/lib/readers.js +239 -0
package/package.json
CHANGED
package/src/cli.js
CHANGED
@@ -20,6 +20,9 @@ const { registerIngest } = require('./commands/ingest');
 const { registerCompletions } = require('./commands/completions');
 const { registerPlayground } = require('./commands/playground');
 const { registerBenchmark } = require('./commands/benchmark');
+const { registerEstimate } = require('./commands/estimate');
+const { registerInit } = require('./commands/init');
+const { registerChunk } = require('./commands/chunk');
 const { registerAbout } = require('./commands/about');
 const { showBanner, showQuickStart, getVersion } = require('./lib/banner');

@@ -45,6 +48,9 @@ registerIngest(program);
 registerCompletions(program);
 registerPlayground(program);
 registerBenchmark(program);
+registerEstimate(program);
+registerInit(program);
+registerChunk(program);
 registerAbout(program);

 // Append disclaimer to all help output
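The bodies of the new estimate.js and init.js modules are not included in this diff view; only chunk.js appears in full further down. As a purely hypothetical sketch of the shared registration pattern, here is what registerEstimate might look like, with the flags taken from the completion scripts near the end of this diff and the handler name invented for illustration:

    // Hypothetical sketch, not the actual estimate.js source (omitted from this diff view).
    function registerEstimate(program) {
      program
        .command('estimate')
        .description('Estimate embedding costs — symmetric vs asymmetric')
        .option('--docs <n>', 'Number of documents', (v) => parseInt(v, 10))
        .option('--queries <n>', 'Queries per month', (v) => parseInt(v, 10))
        .option('--doc-tokens <n>', 'Avg tokens per document', (v) => parseInt(v, 10))
        .option('--query-tokens <n>', 'Avg tokens per query', (v) => parseInt(v, 10))
        .option('--doc-model <model>', 'Document embedding model')
        .option('--query-model <model>', 'Query embedding model')
        .option('--months <n>', 'Months to project', (v) => parseInt(v, 10))
        .option('--json', 'Machine-readable JSON output')
        .option('-q, --quiet', 'Suppress non-essential output')
        .action(estimateCosts); // hypothetical handler name
    }

    module.exports = { registerEstimate };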
package/src/commands/benchmark.js
CHANGED

@@ -1212,6 +1212,170 @@ function registerBenchmark(program) {
     .option('--json', 'Machine-readable JSON output')
     .option('-q, --quiet', 'Suppress non-essential output')
     .action(benchmarkAsymmetric);
+
+  // ── benchmark space ──
+  bench
+    .command('space')
+    .description('Validate shared embedding space — embed same text with all Voyage 4 models')
+    .option('--text <text>', 'Text to embed across models')
+    .option('--texts <texts>', 'Comma-separated texts to compare')
+    .option('--models <models>', 'Comma-separated models', 'voyage-4-large,voyage-4,voyage-4-lite')
+    .option('-d, --dimensions <n>', 'Output dimensions (must be supported by all models)')
+    .option('--json', 'Machine-readable JSON output')
+    .option('-q, --quiet', 'Suppress non-essential output')
+    .action(benchmarkSpace);
+}
+
+/**
+ * benchmark space — Validate shared embedding space across Voyage 4 models.
+ * Embeds the same text(s) with multiple models, then computes pairwise cosine
+ * similarities to prove they produce compatible embeddings.
+ */
+async function benchmarkSpace(opts) {
+  const models = opts.models
+    ? parseModels(opts.models)
+    : ['voyage-4-large', 'voyage-4', 'voyage-4-lite'];
+
+  const texts = opts.texts
+    ? opts.texts.split(',').map(t => t.trim())
+    : opts.text
+      ? [opts.text]
+      : [
+        'MongoDB Atlas provides a fully managed cloud database with vector search.',
+        'Machine learning models transform raw data into semantic embeddings.',
+        'The quick brown fox jumps over the lazy dog.',
+      ];
+
+  const dimensions = opts.dimensions ? parseInt(opts.dimensions, 10) : undefined;
+
+  if (!opts.json && !opts.quiet) {
+    console.log('');
+    console.log(ui.bold(' 🔮 Shared Embedding Space Validation'));
+    console.log(ui.dim(` Models: ${models.join(', ')}`));
+    console.log(ui.dim(` Texts: ${texts.length}${dimensions ? `, dimensions: ${dimensions}` : ''}`));
+    console.log('');
+  }
+
+  // Embed all texts with all models
+  const embeddings = {}; // { model: [[embedding for text 0], [embedding for text 1], ...] }
+
+  for (const model of models) {
+    const spin = (!opts.json && !opts.quiet) ? ui.spinner(` Embedding with ${model}...`) : null;
+    if (spin) spin.start();
+
+    try {
+      const embedOpts = { model, inputType: 'document' };
+      if (dimensions) embedOpts.dimensions = dimensions;
+      const result = await generateEmbeddings(texts, embedOpts);
+      embeddings[model] = result.data.map(d => d.embedding);
+      if (spin) spin.stop();
+    } catch (err) {
+      if (spin) spin.stop();
+      console.error(ui.warn(` ${model}: ${err.message} — skipping`));
+    }
+  }
+
+  const validModels = Object.keys(embeddings);
+  if (validModels.length < 2) {
+    console.error(ui.error('Need at least 2 models to compare embedding spaces.'));
+    process.exit(1);
+  }
+
+  // Compute pairwise cross-model similarities for each text
+  const results = [];
+
+  for (let t = 0; t < texts.length; t++) {
+    const textResult = {
+      text: texts[t],
+      pairs: [],
+    };
+
+    for (let i = 0; i < validModels.length; i++) {
+      for (let j = i + 1; j < validModels.length; j++) {
+        const modelA = validModels[i];
+        const modelB = validModels[j];
+        const sim = cosineSimilarity(embeddings[modelA][t], embeddings[modelB][t]);
+        textResult.pairs.push({
+          modelA,
+          modelB,
+          similarity: sim,
+        });
+      }
+    }
+
+    results.push(textResult);
+  }
+
+  // Also compute within-model similarity across different texts (baseline)
+  const withinModelSims = [];
+  if (texts.length >= 2) {
+    for (const model of validModels) {
+      const sim = cosineSimilarity(embeddings[model][0], embeddings[model][1]);
+      withinModelSims.push({ model, text0: texts[0], text1: texts[1], similarity: sim });
+    }
+  }
+
+  if (opts.json) {
+    console.log(JSON.stringify({ benchmark: 'space', models: validModels, texts, results, withinModelSims }, null, 2));
+    return;
+  }
+
+  // Display results
+  console.log(ui.bold(' Cross-Model Similarity (same text, different models):'));
+  console.log(ui.dim(' High similarity (>0.95) = shared embedding space confirmed'));
+  console.log('');
+
+  let allHigh = true;
+  for (const r of results) {
+    const preview = r.text.substring(0, 55) + (r.text.length > 55 ? '...' : '');
+    console.log(` ${ui.dim('Text:')} "${preview}"`);
+
+    for (const p of r.pairs) {
+      const simStr = p.similarity.toFixed(4);
+      const quality = p.similarity >= 0.98 ? ui.green('●')
+        : p.similarity >= 0.95 ? ui.cyan('●')
+        : p.similarity >= 0.90 ? ui.yellow('●')
+        : ui.red('●');
+      if (p.similarity < 0.95) allHigh = false;
+      console.log(` ${quality} ${rpad(p.modelA, 18)} ↔ ${rpad(p.modelB, 18)} ${ui.bold(simStr)}`);
+    }
+    console.log('');
+  }
+
+  // Show within-model cross-text similarity for context
+  if (withinModelSims.length > 0) {
+    console.log(ui.bold(' Within-Model Similarity (different texts, same model):'));
+    console.log(ui.dim(' Shows that cross-model same-text similarity is much higher'));
+    console.log('');
+
+    for (const w of withinModelSims) {
+      console.log(` ${ui.dim(rpad(w.model, 18))} text₀ ↔ text₁ ${ui.dim(w.similarity.toFixed(4))}`);
+    }
+    console.log('');
+  }
+
+  // Summary
+  const avgCrossModel = results.flatMap(r => r.pairs).reduce((sum, p) => sum + p.similarity, 0)
+    / results.flatMap(r => r.pairs).length;
+  const avgWithin = withinModelSims.length > 0
+    ? withinModelSims.reduce((sum, w) => sum + w.similarity, 0) / withinModelSims.length
+    : null;
+
+  if (allHigh) {
+    console.log(ui.success(`Shared embedding space confirmed! Avg cross-model similarity: ${avgCrossModel.toFixed(4)}`));
+  } else {
+    console.log(ui.warn(`Cross-model similarity lower than expected. Avg: ${avgCrossModel.toFixed(4)}`));
+  }
+
+  if (avgWithin !== null) {
+    const ratio = (avgCrossModel / avgWithin).toFixed(1);
+    console.log(ui.dim(` Cross-model same-text similarity is ${ratio}× higher than same-model different-text similarity.`));
+  }
+
+  console.log('');
+  console.log(ui.dim(' This means you can embed docs with voyage-4-large and query with voyage-4-lite'));
+  console.log(ui.dim(' — the embeddings live in the same space. See "vai explain shared-space".'));
+  console.log('');
 }

 module.exports = { registerBenchmark };
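benchmarkSpace relies on cosineSimilarity and rpad helpers defined elsewhere in benchmark.js, outside this hunk. A minimal sketch of the standard formula such a cosine helper implements, assuming two plain numeric arrays of equal length:

    // Sketch of the assumed helper; the package's actual implementation is not shown here.
    function cosineSimilarity(a, b) {
      // dot(a, b) / (|a| * |b|); 1.0 means the vectors point the same way.
      let dot = 0;
      let normA = 0;
      let normB = 0;
      for (let i = 0; i < a.length; i++) {
        dot += a[i] * b[i];
        normA += a[i] * a[i];
        normB += b[i] * b[i];
      }
      return dot / (Math.sqrt(normA) * Math.sqrt(normB));
    }

Read against the thresholds above, a cross-model value of 0.98+ means two models embedded the same text almost identically, while the within-model cross-text baseline is expected to be much lower.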
package/src/commands/chunk.js
ADDED

@@ -0,0 +1,277 @@
+'use strict';
+
+const fs = require('fs');
+const path = require('path');
+const { chunk, estimateTokens, STRATEGIES, DEFAULTS } = require('../lib/chunker');
+const { readFile, scanDirectory, isSupported, getReaderType } = require('../lib/readers');
+const { loadProject, mergeOptions } = require('../lib/project');
+const ui = require('../lib/ui');
+
+/**
+ * Format a number with commas.
+ */
+function fmtNum(n) {
+  return n.toLocaleString('en-US');
+}
+
+/**
+ * Build chunk metadata for a source file.
+ * @param {string} filePath - Source file path
+ * @param {string} basePath - Base directory for relative paths
+ * @param {number} index - Chunk index within the file
+ * @param {number} total - Total chunks from this file
+ * @returns {object}
+ */
+function buildMetadata(filePath, basePath, index, total) {
+  return {
+    source: path.relative(basePath, filePath),
+    chunk_index: index,
+    total_chunks: total,
+  };
+}
+
+/**
+ * Register the chunk command on a Commander program.
+ * @param {import('commander').Command} program
+ */
+function registerChunk(program) {
+  program
+    .command('chunk [input]')
+    .description('Chunk documents for embedding — files, directories, or stdin')
+    .option('-s, --strategy <strategy>', `Chunking strategy: ${STRATEGIES.join(', ')}`)
+    .option('-c, --chunk-size <n>', 'Target chunk size in characters', (v) => parseInt(v, 10))
+    .option('--overlap <n>', 'Overlap between chunks in characters', (v) => parseInt(v, 10))
+    .option('--min-size <n>', 'Minimum chunk size (drop smaller)', (v) => parseInt(v, 10))
+    .option('-o, --output <path>', 'Output file (JSONL). Omit for stdout')
+    .option('--text-field <name>', 'Text field name for JSON/JSONL input', 'text')
+    .option('--extensions <exts>', 'Comma-separated file extensions to include when scanning directories')
+    .option('--ignore <dirs>', 'Comma-separated directory names to skip', 'node_modules,.git,__pycache__')
+    .option('--dry-run', 'Show what would be chunked without processing')
+    .option('--stats', 'Show chunking statistics after processing')
+    .option('--json', 'Machine-readable JSON output')
+    .option('-q, --quiet', 'Suppress non-essential output')
+    .action(async (input, opts) => {
+      try {
+        // Load project config, merge with CLI opts
+        const { config: projectConfig } = loadProject();
+        const chunkConfig = projectConfig.chunk || {};
+
+        const strategy = opts.strategy || chunkConfig.strategy || DEFAULTS.strategy || 'recursive';
+        const chunkSize = opts.chunkSize || chunkConfig.size || DEFAULTS.size;
+        const overlap = opts.overlap != null ? opts.overlap : (chunkConfig.overlap != null ? chunkConfig.overlap : DEFAULTS.overlap);
+        const minSize = opts.minSize || chunkConfig.minSize || DEFAULTS.minSize;
+        const textField = opts.textField || 'text';
+
+        if (!STRATEGIES.includes(strategy)) {
+          console.error(ui.error(`Unknown strategy: "${strategy}". Available: ${STRATEGIES.join(', ')}`));
+          process.exit(1);
+        }
+
+        // Resolve input files
+        const files = resolveInput(input, opts);
+
+        if (files.length === 0) {
+          console.error(ui.error('No supported files found. Supported types: .txt, .md, .html, .json, .jsonl, .pdf'));
+          process.exit(1);
+        }
+
+        // Dry run
+        if (opts.dryRun) {
+          if (opts.json) {
+            console.log(JSON.stringify({ files: files.map(f => path.relative(process.cwd(), f)), strategy, chunkSize, overlap }, null, 2));
+          } else {
+            console.log(ui.bold(`Would chunk ${files.length} file(s) with strategy: ${strategy}`));
+            console.log(ui.dim(` Chunk size: ${chunkSize} chars, overlap: ${overlap} chars`));
+            console.log('');
+            for (const f of files) {
+              const size = fs.statSync(f).size;
+              console.log(` ${ui.dim(path.relative(process.cwd(), f))} (${fmtNum(size)} bytes)`);
+            }
+          }
+          return;
+        }
+
+        // Process files
+        const basePath = input && fs.existsSync(input) && fs.statSync(input).isDirectory()
+          ? path.resolve(input)
+          : process.cwd();
+
+        const allChunks = [];
+        const fileStats = [];
+
+        const showProgress = !opts.json && !opts.quiet && files.length > 1;
+        if (showProgress) {
+          console.log(ui.bold(`Chunking ${files.length} file(s) with strategy: ${strategy}`));
+          console.log(ui.dim(` Chunk size: ${chunkSize}, overlap: ${overlap}, min: ${minSize}`));
+          console.log('');
+        }
+
+        for (let fi = 0; fi < files.length; fi++) {
+          const filePath = files[fi];
+          const relPath = path.relative(basePath, filePath);
+          const readerType = getReaderType(filePath);
+
+          try {
+            const content = await readFile(filePath, { textField });
+
+            // readFile returns string for text/html/pdf, array for json/jsonl
+            let textsToChunk = [];
+
+            if (typeof content === 'string') {
+              textsToChunk = [{ text: content, metadata: {} }];
+            } else if (Array.isArray(content)) {
+              textsToChunk = content;
+            }
+
+            let fileChunkCount = 0;
+            for (const item of textsToChunk) {
+              const effectiveStrategy = readerType === 'text' && filePath.endsWith('.md') ? 'markdown' : strategy;
+              // Auto-detect markdown for .md files when using default strategy
+              const useStrategy = (strategy === 'recursive' && filePath.endsWith('.md')) ? 'markdown' : strategy;
+
+              const chunks = chunk(item.text, {
+                strategy: useStrategy,
+                size: chunkSize,
+                overlap,
+                minSize,
+              });
+
+              for (let ci = 0; ci < chunks.length; ci++) {
+                allChunks.push({
+                  text: chunks[ci],
+                  metadata: {
+                    ...item.metadata,
+                    ...buildMetadata(filePath, basePath, ci, chunks.length),
+                  },
+                });
+              }
+              fileChunkCount += chunks.length;
+            }
+
+            fileStats.push({
+              file: relPath,
+              inputChars: textsToChunk.reduce((sum, t) => sum + t.text.length, 0),
+              chunks: fileChunkCount,
+            });
+
+            if (showProgress) {
+              console.log(` ${ui.green('✓')} ${relPath} → ${fileChunkCount} chunks`);
+            }
+          } catch (err) {
+            fileStats.push({ file: relPath, error: err.message, chunks: 0 });
+            if (!opts.quiet) {
+              console.error(` ${ui.red('✗')} ${relPath}: ${err.message}`);
+            }
+          }
+        }
+
+        // Output
+        if (opts.json) {
+          const output = {
+            totalChunks: allChunks.length,
+            totalTokens: allChunks.reduce((sum, c) => sum + estimateTokens(c.text), 0),
+            strategy,
+            chunkSize,
+            overlap,
+            files: fileStats,
+            chunks: allChunks,
+          };
+          const jsonStr = JSON.stringify(output, null, 2);
+          if (opts.output) {
+            fs.writeFileSync(opts.output, jsonStr + '\n');
+          } else {
+            console.log(jsonStr);
+          }
+        } else {
+          // JSONL output
+          const lines = allChunks.map(c => JSON.stringify(c));
+          const jsonlStr = lines.join('\n') + '\n';
+
+          if (opts.output) {
+            fs.writeFileSync(opts.output, jsonlStr);
+            if (!opts.quiet) {
+              console.log('');
+              console.log(ui.success(`Wrote ${fmtNum(allChunks.length)} chunks to ${opts.output}`));
+            }
+          } else if (opts.quiet || !showProgress) {
+            // Stdout — write JSONL directly
+            process.stdout.write(jsonlStr);
+          } else {
+            // Progress was shown, write to stdout with separator
+            console.log('');
+            process.stdout.write(jsonlStr);
+          }
+        }
+
+        // Stats summary
+        if ((opts.stats || showProgress) && !opts.json) {
+          const totalChars = fileStats.reduce((sum, f) => sum + (f.inputChars || 0), 0);
+          const totalTokens = allChunks.reduce((sum, c) => sum + estimateTokens(c.text), 0);
+          const avgChunkSize = allChunks.length > 0
+            ? Math.round(allChunks.reduce((sum, c) => sum + c.text.length, 0) / allChunks.length)
+            : 0;
+          const errors = fileStats.filter(f => f.error).length;
+
+          console.log('');
+          console.log(ui.bold('Summary'));
+          console.log(ui.label('Files', `${fmtNum(files.length)}${errors ? ` (${errors} failed)` : ''}`));
+          console.log(ui.label('Input', `${fmtNum(totalChars)} chars`));
+          console.log(ui.label('Chunks', fmtNum(allChunks.length)));
+          console.log(ui.label('Avg chunk', `${fmtNum(avgChunkSize)} chars (~${fmtNum(Math.round(avgChunkSize / 4))} tokens)`));
+          console.log(ui.label('Est. tokens', `~${fmtNum(totalTokens)}`));
+
+          // Cost hint
+          const pricePerMToken = 0.12; // voyage-4-large default
+          const cost = (totalTokens / 1e6) * pricePerMToken;
+          if (cost > 0) {
+            console.log(ui.label('Est. cost', ui.dim(`~$${cost < 0.01 ? cost.toFixed(4) : cost.toFixed(2)} with voyage-4-large`)));
+          }
+        }
+      } catch (err) {
+        console.error(ui.error(err.message));
+        process.exit(1);
+      }
+    });
+}
+
+/**
+ * Resolve input to a list of file paths.
+ * @param {string} input - File path, directory path, or glob
+ * @param {object} opts
+ * @returns {string[]}
+ */
+function resolveInput(input, opts) {
+  if (!input) {
+    console.error(ui.error('Please provide a file or directory path.'));
+    console.error(ui.dim(' Usage: vai chunk <file-or-directory> [options]'));
+    process.exit(1);
+  }
+
+  const resolved = path.resolve(input);
+
+  if (!fs.existsSync(resolved)) {
+    console.error(ui.error(`Not found: ${input}`));
+    process.exit(1);
+  }
+
+  const stat = fs.statSync(resolved);
+
+  if (stat.isFile()) {
+    return [resolved];
+  }
+
+  if (stat.isDirectory()) {
+    const scanOpts = {};
+    if (opts.extensions) {
+      scanOpts.extensions = opts.extensions.split(',').map(e => e.trim());
+    }
+    if (opts.ignore) {
+      scanOpts.ignore = opts.ignore.split(',').map(d => d.trim());
+    }
+    return scanDirectory(resolved, scanOpts);
+  }
+
+  return [];
+}
+
+module.exports = { registerChunk };
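The chunk and estimateTokens implementations live in the new src/lib/chunker.js (+341 lines), whose body this diff view omits. Judging only from the command's options and the avgChunkSize / 4 arithmetic in its stats summary, a rough sketch of the token estimator and the simplest ('fixed') strategy could look like the following; the names and defaults below are illustrative, not the package's DEFAULTS:

    // Hypothetical sketch of lib/chunker.js internals; the real file is not shown in this diff.
    function estimateTokens(text) {
      // ~4 characters per token, the same heuristic the stats summary applies.
      return Math.ceil(text.length / 4);
    }

    function chunkFixed(text, { size = 1000, overlap = 100, minSize = 50 } = {}) {
      // Slide a fixed-size window, stepping back `overlap` chars so neighbors share context.
      // Assumes overlap < size so the loop always advances.
      const chunks = [];
      for (let start = 0; start < text.length; start += size - overlap) {
        const piece = text.slice(start, start + size);
        if (piece.length >= minSize) chunks.push(piece);
      }
      return chunks;
    }

End to end, something like vai chunk ./docs --strategy fixed --chunk-size 1000 --overlap 100 -o chunks.jsonl --stats would then emit one {text, metadata} object per line, ready for the embed or ingest commands.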
package/src/commands/completions.js
CHANGED

@@ -19,7 +19,7 @@ _vai_completions() {
   prev="\${COMP_WORDS[COMP_CWORD-1]}"

   # Top-level commands
-  commands="embed rerank store search index models ping config demo explain similarity ingest completions help"
+  commands="embed rerank store search index models ping config demo explain similarity ingest estimate init chunk completions help"

   # Subcommands
   local index_subs="create list delete"

@@ -102,6 +102,18 @@ _vai_completions() {
       COMPREPLY=( \$(compgen -W "--file --db --collection --field --model --input-type --dimensions --batch-size --text-field --text-column --strict --dry-run --json --quiet --help" -- "\$cur") )
       return 0
       ;;
+    estimate)
+      COMPREPLY=( \$(compgen -W "--docs --queries --doc-tokens --query-tokens --doc-model --query-model --months --json --quiet --help" -- "\$cur") )
+      return 0
+      ;;
+    init)
+      COMPREPLY=( \$(compgen -W "--yes --force --json --quiet --help" -- "\$cur") )
+      return 0
+      ;;
+    chunk)
+      COMPREPLY=( \$(compgen -W "--strategy --chunk-size --overlap --min-size --output --text-field --extensions --ignore --dry-run --stats --json --quiet --help" -- "\$cur") )
+      return 0
+      ;;
     completions)
       COMPREPLY=( \$(compgen -W "bash zsh --help" -- "\$cur") )
       return 0

@@ -172,6 +184,9 @@ _vai() {
     'explain:Learn about AI and vector search concepts'
     'similarity:Compute cosine similarity between texts'
     'ingest:Bulk import documents with progress'
+    'estimate:Estimate embedding costs — symmetric vs asymmetric'
+    'init:Initialize project with .vai.json'
+    'chunk:Chunk documents for embedding'
     'completions:Generate shell completion scripts'
     'help:Display help for command'
   )

@@ -375,6 +390,41 @@ _vai() {
       '--json[Machine-readable JSON output]' \\
       '(-q --quiet)'{-q,--quiet}'[Suppress non-essential output]'
       ;;
+    estimate)
+      _arguments \\
+        '--docs[Number of documents]:count:' \\
+        '--queries[Queries per month]:count:' \\
+        '--doc-tokens[Avg tokens per document]:tokens:' \\
+        '--query-tokens[Avg tokens per query]:tokens:' \\
+        '--doc-model[Document embedding model]:model:(\$models)' \\
+        '--query-model[Query embedding model]:model:(\$models)' \\
+        '--months[Months to project]:months:' \\
+        '--json[Machine-readable JSON output]' \\
+        '(-q --quiet)'{-q,--quiet}'[Suppress non-essential output]'
+      ;;
+    init)
+      _arguments \\
+        '(-y --yes)'{-y,--yes}'[Accept all defaults]' \\
+        '--force[Overwrite existing .vai.json]' \\
+        '--json[Output config as JSON]' \\
+        '(-q --quiet)'{-q,--quiet}'[Suppress non-essential output]'
+      ;;
+    chunk)
+      _arguments \\
+        '1:input:_files' \\
+        '(-s --strategy)'{-s,--strategy}'[Chunking strategy]:strategy:(fixed sentence paragraph recursive markdown)' \\
+        '(-c --chunk-size)'{-c,--chunk-size}'[Target chunk size]:size:' \\
+        '--overlap[Overlap between chunks]:chars:' \\
+        '--min-size[Minimum chunk size]:chars:' \\
+        '(-o --output)'{-o,--output}'[Output file]:file:_files' \\
+        '--text-field[Text field for JSON]:field:' \\
+        '--extensions[File extensions]:exts:' \\
+        '--ignore[Dirs to skip]:dirs:' \\
+        '--dry-run[Preview without processing]' \\
+        '--stats[Show statistics]' \\
+        '--json[JSON output]' \\
+        '(-q --quiet)'{-q,--quiet}'[Suppress non-essential output]'
+      ;;
     completions)
       _arguments \\
         '1:shell:(bash zsh)'