voyageai-cli 1.15.0 → 1.16.0
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- package/package.json +1 -1
- package/src/cli.js +4 -0
- package/src/commands/chunk.js +277 -0
- package/src/commands/completions.js +34 -1
- package/src/commands/init.js +153 -0
- package/src/lib/chunker.js +341 -0
- package/src/lib/project.js +122 -0
- package/src/lib/readers.js +239 -0
package/package.json
CHANGED
package/src/cli.js
CHANGED
```diff
@@ -21,6 +21,8 @@ const { registerCompletions } = require('./commands/completions');
 const { registerPlayground } = require('./commands/playground');
 const { registerBenchmark } = require('./commands/benchmark');
 const { registerEstimate } = require('./commands/estimate');
+const { registerInit } = require('./commands/init');
+const { registerChunk } = require('./commands/chunk');
 const { registerAbout } = require('./commands/about');
 const { showBanner, showQuickStart, getVersion } = require('./lib/banner');

@@ -47,6 +49,8 @@ registerCompletions(program);
 registerPlayground(program);
 registerBenchmark(program);
 registerEstimate(program);
+registerInit(program);
+registerChunk(program);
 registerAbout(program);

 // Append disclaimer to all help output
```
package/src/commands/chunk.js
ADDED

```diff
@@ -0,0 +1,277 @@
+'use strict';
+
+const fs = require('fs');
+const path = require('path');
+const { chunk, estimateTokens, STRATEGIES, DEFAULTS } = require('../lib/chunker');
+const { readFile, scanDirectory, isSupported, getReaderType } = require('../lib/readers');
+const { loadProject, mergeOptions } = require('../lib/project');
+const ui = require('../lib/ui');
+
+/**
+ * Format a number with commas.
+ */
+function fmtNum(n) {
+  return n.toLocaleString('en-US');
+}
+
+/**
+ * Build chunk metadata for a source file.
+ * @param {string} filePath - Source file path
+ * @param {string} basePath - Base directory for relative paths
+ * @param {number} index - Chunk index within the file
+ * @param {number} total - Total chunks from this file
+ * @returns {object}
+ */
+function buildMetadata(filePath, basePath, index, total) {
+  return {
+    source: path.relative(basePath, filePath),
+    chunk_index: index,
+    total_chunks: total,
+  };
+}
+
+/**
+ * Register the chunk command on a Commander program.
+ * @param {import('commander').Command} program
+ */
+function registerChunk(program) {
+  program
+    .command('chunk [input]')
+    .description('Chunk documents for embedding — files, directories, or stdin')
+    .option('-s, --strategy <strategy>', `Chunking strategy: ${STRATEGIES.join(', ')}`)
+    .option('-c, --chunk-size <n>', 'Target chunk size in characters', (v) => parseInt(v, 10))
+    .option('--overlap <n>', 'Overlap between chunks in characters', (v) => parseInt(v, 10))
+    .option('--min-size <n>', 'Minimum chunk size (drop smaller)', (v) => parseInt(v, 10))
+    .option('-o, --output <path>', 'Output file (JSONL). Omit for stdout')
+    .option('--text-field <name>', 'Text field name for JSON/JSONL input', 'text')
+    .option('--extensions <exts>', 'Comma-separated file extensions to include when scanning directories')
+    .option('--ignore <dirs>', 'Comma-separated directory names to skip', 'node_modules,.git,__pycache__')
+    .option('--dry-run', 'Show what would be chunked without processing')
+    .option('--stats', 'Show chunking statistics after processing')
+    .option('--json', 'Machine-readable JSON output')
+    .option('-q, --quiet', 'Suppress non-essential output')
+    .action(async (input, opts) => {
+      try {
+        // Load project config, merge with CLI opts
+        const { config: projectConfig } = loadProject();
+        const chunkConfig = projectConfig.chunk || {};
+
+        const strategy = opts.strategy || chunkConfig.strategy || DEFAULTS.strategy || 'recursive';
+        const chunkSize = opts.chunkSize || chunkConfig.size || DEFAULTS.size;
+        const overlap = opts.overlap != null ? opts.overlap : (chunkConfig.overlap != null ? chunkConfig.overlap : DEFAULTS.overlap);
+        const minSize = opts.minSize || chunkConfig.minSize || DEFAULTS.minSize;
+        const textField = opts.textField || 'text';
+
+        if (!STRATEGIES.includes(strategy)) {
+          console.error(ui.error(`Unknown strategy: "${strategy}". Available: ${STRATEGIES.join(', ')}`));
+          process.exit(1);
+        }
+
+        // Resolve input files
+        const files = resolveInput(input, opts);
+
+        if (files.length === 0) {
+          console.error(ui.error('No supported files found. Supported types: .txt, .md, .html, .json, .jsonl, .pdf'));
+          process.exit(1);
+        }
+
+        // Dry run
+        if (opts.dryRun) {
+          if (opts.json) {
+            console.log(JSON.stringify({ files: files.map(f => path.relative(process.cwd(), f)), strategy, chunkSize, overlap }, null, 2));
+          } else {
+            console.log(ui.bold(`Would chunk ${files.length} file(s) with strategy: ${strategy}`));
+            console.log(ui.dim(`  Chunk size: ${chunkSize} chars, overlap: ${overlap} chars`));
+            console.log('');
+            for (const f of files) {
+              const size = fs.statSync(f).size;
+              console.log(`  ${ui.dim(path.relative(process.cwd(), f))} (${fmtNum(size)} bytes)`);
+            }
+          }
+          return;
+        }
+
+        // Process files
+        const basePath = input && fs.existsSync(input) && fs.statSync(input).isDirectory()
+          ? path.resolve(input)
+          : process.cwd();
+
+        const allChunks = [];
+        const fileStats = [];
+
+        const showProgress = !opts.json && !opts.quiet && files.length > 1;
+        if (showProgress) {
+          console.log(ui.bold(`Chunking ${files.length} file(s) with strategy: ${strategy}`));
+          console.log(ui.dim(`  Chunk size: ${chunkSize}, overlap: ${overlap}, min: ${minSize}`));
+          console.log('');
+        }
+
+        for (let fi = 0; fi < files.length; fi++) {
+          const filePath = files[fi];
+          const relPath = path.relative(basePath, filePath);
+          const readerType = getReaderType(filePath);
+
+          try {
+            const content = await readFile(filePath, { textField });
+
+            // readFile returns string for text/html/pdf, array for json/jsonl
+            let textsToChunk = [];
+
+            if (typeof content === 'string') {
+              textsToChunk = [{ text: content, metadata: {} }];
+            } else if (Array.isArray(content)) {
+              textsToChunk = content;
+            }
+
+            let fileChunkCount = 0;
+            for (const item of textsToChunk) {
+              const effectiveStrategy = readerType === 'text' && filePath.endsWith('.md') ? 'markdown' : strategy;
+              // Auto-detect markdown for .md files when using default strategy
+              const useStrategy = (strategy === 'recursive' && filePath.endsWith('.md')) ? 'markdown' : strategy;
+
+              const chunks = chunk(item.text, {
+                strategy: useStrategy,
+                size: chunkSize,
+                overlap,
+                minSize,
+              });
+
+              for (let ci = 0; ci < chunks.length; ci++) {
+                allChunks.push({
+                  text: chunks[ci],
+                  metadata: {
+                    ...item.metadata,
+                    ...buildMetadata(filePath, basePath, ci, chunks.length),
+                  },
+                });
+              }
+              fileChunkCount += chunks.length;
+            }
+
+            fileStats.push({
+              file: relPath,
+              inputChars: textsToChunk.reduce((sum, t) => sum + t.text.length, 0),
+              chunks: fileChunkCount,
+            });
+
+            if (showProgress) {
+              console.log(`  ${ui.green('✓')} ${relPath} → ${fileChunkCount} chunks`);
+            }
+          } catch (err) {
+            fileStats.push({ file: relPath, error: err.message, chunks: 0 });
+            if (!opts.quiet) {
+              console.error(`  ${ui.red('✗')} ${relPath}: ${err.message}`);
+            }
+          }
+        }
+
+        // Output
+        if (opts.json) {
+          const output = {
+            totalChunks: allChunks.length,
+            totalTokens: allChunks.reduce((sum, c) => sum + estimateTokens(c.text), 0),
+            strategy,
+            chunkSize,
+            overlap,
+            files: fileStats,
+            chunks: allChunks,
+          };
+          const jsonStr = JSON.stringify(output, null, 2);
+          if (opts.output) {
+            fs.writeFileSync(opts.output, jsonStr + '\n');
+          } else {
+            console.log(jsonStr);
+          }
+        } else {
+          // JSONL output
+          const lines = allChunks.map(c => JSON.stringify(c));
+          const jsonlStr = lines.join('\n') + '\n';
+
+          if (opts.output) {
+            fs.writeFileSync(opts.output, jsonlStr);
+            if (!opts.quiet) {
+              console.log('');
+              console.log(ui.success(`Wrote ${fmtNum(allChunks.length)} chunks to ${opts.output}`));
+            }
+          } else if (opts.quiet || !showProgress) {
+            // Stdout — write JSONL directly
+            process.stdout.write(jsonlStr);
+          } else {
+            // Progress was shown, write to stdout with separator
+            console.log('');
+            process.stdout.write(jsonlStr);
+          }
+        }
+
+        // Stats summary
+        if ((opts.stats || showProgress) && !opts.json) {
+          const totalChars = fileStats.reduce((sum, f) => sum + (f.inputChars || 0), 0);
+          const totalTokens = allChunks.reduce((sum, c) => sum + estimateTokens(c.text), 0);
+          const avgChunkSize = allChunks.length > 0
+            ? Math.round(allChunks.reduce((sum, c) => sum + c.text.length, 0) / allChunks.length)
+            : 0;
+          const errors = fileStats.filter(f => f.error).length;
+
+          console.log('');
+          console.log(ui.bold('Summary'));
+          console.log(ui.label('Files', `${fmtNum(files.length)}${errors ? ` (${errors} failed)` : ''}`));
+          console.log(ui.label('Input', `${fmtNum(totalChars)} chars`));
+          console.log(ui.label('Chunks', fmtNum(allChunks.length)));
+          console.log(ui.label('Avg chunk', `${fmtNum(avgChunkSize)} chars (~${fmtNum(Math.round(avgChunkSize / 4))} tokens)`));
+          console.log(ui.label('Est. tokens', `~${fmtNum(totalTokens)}`));
+
+          // Cost hint
+          const pricePerMToken = 0.12; // voyage-4-large default
+          const cost = (totalTokens / 1e6) * pricePerMToken;
+          if (cost > 0) {
+            console.log(ui.label('Est. cost', ui.dim(`~$${cost < 0.01 ? cost.toFixed(4) : cost.toFixed(2)} with voyage-4-large`)));
+          }
+        }
+      } catch (err) {
+        console.error(ui.error(err.message));
+        process.exit(1);
+      }
+    });
+}
+
+/**
+ * Resolve input to a list of file paths.
+ * @param {string} input - File path, directory path, or glob
+ * @param {object} opts
+ * @returns {string[]}
+ */
+function resolveInput(input, opts) {
+  if (!input) {
+    console.error(ui.error('Please provide a file or directory path.'));
+    console.error(ui.dim('  Usage: vai chunk <file-or-directory> [options]'));
+    process.exit(1);
+  }
+
+  const resolved = path.resolve(input);
+
+  if (!fs.existsSync(resolved)) {
+    console.error(ui.error(`Not found: ${input}`));
+    process.exit(1);
+  }
+
+  const stat = fs.statSync(resolved);
+
+  if (stat.isFile()) {
+    return [resolved];
+  }
+
+  if (stat.isDirectory()) {
+    const scanOpts = {};
+    if (opts.extensions) {
+      scanOpts.extensions = opts.extensions.split(',').map(e => e.trim());
+    }
+    if (opts.ignore) {
+      scanOpts.ignore = opts.ignore.split(',').map(d => d.trim());
+    }
+    return scanDirectory(resolved, scanOpts);
+  }
+
+  return [];
+}
+
+module.exports = { registerChunk };
```
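For orientation, each JSONL line the new command emits pairs chunk text with the metadata built by `buildMetadata`, merged with any metadata carried over from JSON/JSONL input. A minimal sketch of one output record, with illustrative values:

```js
// Shape of one `vai chunk` JSONL record; field names come from buildMetadata(),
// but the concrete values here are illustrative, not taken from a real run.
const record = {
  text: 'Second paragraph of the source document…',
  metadata: {
    source: 'docs/guide.md', // path.relative(basePath, filePath)
    chunk_index: 1,          // position of this chunk within its source file
    total_chunks: 4,
  },
};
// The command writes one such object per line: JSON.stringify(record) + '\n'
process.stdout.write(JSON.stringify(record) + '\n');
```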
package/src/commands/completions.js
CHANGED

```diff
@@ -19,7 +19,7 @@ _vai_completions() {
   prev="\${COMP_WORDS[COMP_CWORD-1]}"

   # Top-level commands
-  commands="embed rerank store search index models ping config demo explain similarity ingest estimate completions help"
+  commands="embed rerank store search index models ping config demo explain similarity ingest estimate init chunk completions help"

   # Subcommands
   local index_subs="create list delete"
@@ -106,6 +106,14 @@ _vai_completions() {
       COMPREPLY=( \$(compgen -W "--docs --queries --doc-tokens --query-tokens --doc-model --query-model --months --json --quiet --help" -- "\$cur") )
       return 0
       ;;
+    init)
+      COMPREPLY=( \$(compgen -W "--yes --force --json --quiet --help" -- "\$cur") )
+      return 0
+      ;;
+    chunk)
+      COMPREPLY=( \$(compgen -W "--strategy --chunk-size --overlap --min-size --output --text-field --extensions --ignore --dry-run --stats --json --quiet --help" -- "\$cur") )
+      return 0
+      ;;
     completions)
       COMPREPLY=( \$(compgen -W "bash zsh --help" -- "\$cur") )
       return 0
@@ -177,6 +185,8 @@ _vai() {
     'similarity:Compute cosine similarity between texts'
     'ingest:Bulk import documents with progress'
     'estimate:Estimate embedding costs — symmetric vs asymmetric'
+    'init:Initialize project with .vai.json'
+    'chunk:Chunk documents for embedding'
     'completions:Generate shell completion scripts'
     'help:Display help for command'
   )
@@ -392,6 +402,29 @@ _vai() {
       '--json[Machine-readable JSON output]' \\
       '(-q --quiet)'{-q,--quiet}'[Suppress non-essential output]'
       ;;
+    init)
+      _arguments \\
+        '(-y --yes)'{-y,--yes}'[Accept all defaults]' \\
+        '--force[Overwrite existing .vai.json]' \\
+        '--json[Output config as JSON]' \\
+        '(-q --quiet)'{-q,--quiet}'[Suppress non-essential output]'
+      ;;
+    chunk)
+      _arguments \\
+        '1:input:_files' \\
+        '(-s --strategy)'{-s,--strategy}'[Chunking strategy]:strategy:(fixed sentence paragraph recursive markdown)' \\
+        '(-c --chunk-size)'{-c,--chunk-size}'[Target chunk size]:size:' \\
+        '--overlap[Overlap between chunks]:chars:' \\
+        '--min-size[Minimum chunk size]:chars:' \\
+        '(-o --output)'{-o,--output}'[Output file]:file:_files' \\
+        '--text-field[Text field for JSON]:field:' \\
+        '--extensions[File extensions]:exts:' \\
+        '--ignore[Dirs to skip]:dirs:' \\
+        '--dry-run[Preview without processing]' \\
+        '--stats[Show statistics]' \\
+        '--json[JSON output]' \\
+        '(-q --quiet)'{-q,--quiet}'[Suppress non-essential output]'
+      ;;
     completions)
       _arguments \\
         '1:shell:(bash zsh)'
```
package/src/commands/init.js
ADDED

```diff
@@ -0,0 +1,153 @@
+'use strict';
+
+const fs = require('fs');
+const path = require('path');
+const readline = require('readline');
+const { MODEL_CATALOG } = require('../lib/catalog');
+const { STRATEGIES } = require('../lib/chunker');
+const { defaultProjectConfig, saveProject, findProjectFile, PROJECT_FILE } = require('../lib/project');
+const ui = require('../lib/ui');
+
+/**
+ * Prompt the user for input with a default value.
+ * @param {readline.Interface} rl
+ * @param {string} question
+ * @param {string} [defaultVal]
+ * @returns {Promise<string>}
+ */
+function ask(rl, question, defaultVal) {
+  const suffix = defaultVal ? ` ${ui.dim(`(${defaultVal})`)}` : '';
+  return new Promise((resolve) => {
+    rl.question(`  ${question}${suffix}: `, (answer) => {
+      resolve(answer.trim() || defaultVal || '');
+    });
+  });
+}
+
+/**
+ * Prompt for a choice from a list.
+ * @param {readline.Interface} rl
+ * @param {string} question
+ * @param {string[]} choices
+ * @param {string} defaultVal
+ * @returns {Promise<string>}
+ */
+async function askChoice(rl, question, choices, defaultVal) {
+  console.log('');
+  for (let i = 0; i < choices.length; i++) {
+    const marker = choices[i] === defaultVal ? ui.cyan('→') : ' ';
+    console.log(`  ${marker} ${i + 1}. ${choices[i]}`);
+  }
+  const answer = await ask(rl, question, defaultVal);
+  // Accept number or value
+  const num = parseInt(answer, 10);
+  if (num >= 1 && num <= choices.length) return choices[num - 1];
+  if (choices.includes(answer)) return answer;
+  return defaultVal;
+}
+
+/**
+ * Register the init command on a Commander program.
+ * @param {import('commander').Command} program
+ */
+function registerInit(program) {
+  program
+    .command('init')
+    .description('Initialize a project with .vai.json configuration')
+    .option('-y, --yes', 'Accept all defaults (non-interactive)')
+    .option('--force', 'Overwrite existing .vai.json')
+    .option('--json', 'Output created config as JSON (non-interactive)')
+    .option('-q, --quiet', 'Suppress non-essential output')
+    .action(async (opts) => {
+      // Check for existing config
+      const existing = findProjectFile();
+      if (existing && !opts.force) {
+        const relPath = path.relative(process.cwd(), existing);
+        console.error(ui.warn(`Project already initialized: ${relPath}`));
+        console.error(ui.dim('  Use --force to overwrite.'));
+        process.exit(1);
+      }
+
+      const defaults = defaultProjectConfig();
+
+      // Non-interactive mode
+      if (opts.yes || opts.json) {
+        const filePath = saveProject(defaults);
+        if (opts.json) {
+          console.log(JSON.stringify(defaults, null, 2));
+        } else if (!opts.quiet) {
+          console.log(ui.success(`Created ${PROJECT_FILE}`));
+        }
+        return;
+      }
+
+      // Interactive mode
+      console.log('');
+      console.log(ui.bold('  🚀 Initialize Voyage AI Project'));
+      console.log(ui.dim('  Creates .vai.json in the current directory.'));
+      console.log(ui.dim('  Press Enter to accept defaults.'));
+      console.log('');
+
+      const rl = readline.createInterface({
+        input: process.stdin,
+        output: process.stdout,
+      });
+
+      try {
+        // Embedding model
+        const embeddingModels = MODEL_CATALOG
+          .filter(m => m.type === 'embedding' && !m.legacy && !m.unreleased)
+          .map(m => m.name);
+        const model = await askChoice(rl, 'Embedding model', embeddingModels, defaults.model);
+
+        // MongoDB settings
+        console.log('');
+        console.log(ui.bold('  MongoDB Atlas'));
+        const db = await ask(rl, 'Database name', defaults.db || 'myapp');
+        const collection = await ask(rl, 'Collection name', defaults.collection || 'documents');
+        const field = await ask(rl, 'Embedding field', defaults.field);
+        const index = await ask(rl, 'Vector index name', defaults.index);
+
+        // Dimensions
+        const modelInfo = MODEL_CATALOG.find(m => m.name === model);
+        const defaultDims = modelInfo && modelInfo.dimensions.includes('1024') ? '1024' : '512';
+        const dimensions = parseInt(await ask(rl, 'Dimensions', defaultDims), 10) || parseInt(defaultDims, 10);
+
+        // Chunking
+        console.log('');
+        console.log(ui.bold('  Chunking'));
+        const strategy = await askChoice(rl, 'Chunk strategy', STRATEGIES, defaults.chunk.strategy);
+        const chunkSize = parseInt(await ask(rl, 'Chunk size (chars)', String(defaults.chunk.size)), 10);
+        const chunkOverlap = parseInt(await ask(rl, 'Chunk overlap (chars)', String(defaults.chunk.overlap)), 10);
+
+        const config = {
+          model,
+          db,
+          collection,
+          field,
+          inputType: 'document',
+          dimensions,
+          index,
+          chunk: {
+            strategy,
+            size: chunkSize,
+            overlap: chunkOverlap,
+          },
+        };
+
+        const filePath = saveProject(config);
+        console.log('');
+        console.log(ui.success(`Created ${path.relative(process.cwd(), filePath)}`));
+        console.log('');
+        console.log(ui.dim('  Next steps:'));
+        console.log(ui.dim('    vai chunk ./docs/           # Chunk your documents'));
+        console.log(ui.dim('    vai pipeline ./docs/        # Chunk → embed → store (coming soon)'));
+        console.log(ui.dim('    vai search --query "..."    # Search your collection'));
+        console.log('');
+      } finally {
+        rl.close();
+      }
+    });
+}
+
+module.exports = { registerInit };
```
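Run non-interactively, `vai init --yes` saves `defaultProjectConfig()` as-is via `saveProject` (see `lib/project.js` below, which spreads `{ version, ...config }`). A sketch of the resulting `.vai.json`, expressed as the equivalent object:

```js
// Expected contents of .vai.json after `vai init --yes`,
// reconstructed from defaultProjectConfig() rather than captured from a real run.
const expected = {
  version: 1,
  model: 'voyage-4-large',
  db: '',
  collection: '',
  field: 'embedding',
  inputType: 'document',
  dimensions: 1024,
  index: 'vector_index',
  chunk: { strategy: 'recursive', size: 512, overlap: 50 },
};
```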
package/src/lib/chunker.js
ADDED

```diff
@@ -0,0 +1,341 @@
+'use strict';
+
+/**
+ * Available chunking strategies.
+ */
+const STRATEGIES = ['fixed', 'sentence', 'paragraph', 'recursive', 'markdown'];
+
+/**
+ * Default chunk options.
+ */
+const DEFAULTS = {
+  size: 512,
+  overlap: 50,
+  minSize: 20,
+};
+
+// ── Sentence splitting ──
+
+/**
+ * Split text into sentences. Handles common abbreviations and edge cases.
+ * @param {string} text
+ * @returns {string[]}
+ */
+function splitSentences(text) {
+  // Split on sentence-ending punctuation followed by whitespace or EOL.
+  // Negative lookbehind for common abbreviations (Mr., Dr., etc.)
+  const parts = text.split(/(?<=[.!?])\s+(?=[A-Z\u00C0-\u024F"])/);
+  return parts.map(s => s.trim()).filter(s => s.length > 0);
+}
+
+// ── Strategy implementations ──
+
+/**
+ * Fixed-size chunking with character count and overlap.
+ * @param {string} text
+ * @param {object} opts
+ * @param {number} opts.size - Target chunk size in characters
+ * @param {number} opts.overlap - Overlap between chunks in characters
+ * @returns {string[]}
+ */
+function chunkFixed(text, opts) {
+  const { size, overlap } = opts;
+  const chunks = [];
+  let start = 0;
+
+  while (start < text.length) {
+    const end = start + size;
+    chunks.push(text.slice(start, end).trim());
+    start = end - overlap;
+    if (start >= text.length) break;
+    // Prevent infinite loop with tiny overlap
+    if (end >= text.length) break;
+  }
+
+  return chunks.filter(c => c.length >= (opts.minSize || DEFAULTS.minSize));
+}
+
+/**
+ * Sentence-boundary chunking. Groups sentences until size limit.
+ * @param {string} text
+ * @param {object} opts
+ * @returns {string[]}
+ */
+function chunkSentence(text, opts) {
+  const { size, overlap } = opts;
+  const sentences = splitSentences(text);
+  return groupUnits(sentences, size, overlap, opts.minSize || DEFAULTS.minSize);
+}
+
+/**
+ * Paragraph chunking. Splits on double newlines, groups if needed.
+ * @param {string} text
+ * @param {object} opts
+ * @returns {string[]}
+ */
+function chunkParagraph(text, opts) {
+  const { size, overlap } = opts;
+  const paragraphs = text.split(/\n\s*\n/).map(p => p.trim()).filter(p => p.length > 0);
+  return groupUnits(paragraphs, size, overlap, opts.minSize || DEFAULTS.minSize);
+}
+
+/**
+ * Recursive chunking. Tries largest delimiters first, falls back to smaller.
+ * This is the most commonly used strategy for RAG pipelines.
+ * @param {string} text
+ * @param {object} opts
+ * @returns {string[]}
+ */
+function chunkRecursive(text, opts) {
+  const { size, minSize } = opts;
+  const separators = ['\n\n', '\n', '. ', '! ', '? ', '; ', ', ', ' '];
+
+  return recursiveSplit(text, separators, size, minSize || DEFAULTS.minSize);
+}
+
+/**
+ * Internal recursive split implementation.
+ * @param {string} text
+ * @param {string[]} separators
+ * @param {number} maxSize
+ * @param {number} minSize
+ * @returns {string[]}
+ */
+function recursiveSplit(text, separators, maxSize, minSize) {
+  if (text.length <= maxSize) {
+    return text.trim().length >= minSize ? [text.trim()] : [];
+  }
+
+  // Find the first separator that exists in the text
+  let sep = null;
+  for (const s of separators) {
+    if (text.includes(s)) {
+      sep = s;
+      break;
+    }
+  }
+
+  // If no separator found, hard-split by characters
+  if (sep === null) {
+    const chunks = [];
+    for (let i = 0; i < text.length; i += maxSize) {
+      const chunk = text.slice(i, i + maxSize).trim();
+      if (chunk.length >= minSize) chunks.push(chunk);
+    }
+    return chunks;
+  }
+
+  // Split on this separator and greedily merge pieces under maxSize
+  const parts = text.split(sep);
+  const chunks = [];
+  let current = '';
+
+  for (const part of parts) {
+    const candidate = current ? current + sep + part : part;
+
+    if (candidate.length <= maxSize) {
+      current = candidate;
+    } else {
+      // Flush current chunk
+      if (current.trim().length >= minSize) {
+        chunks.push(current.trim());
+      }
+      // If this single part exceeds maxSize, recurse with next separator level
+      if (part.length > maxSize) {
+        const remainingSeps = separators.slice(separators.indexOf(sep) + 1);
+        chunks.push(...recursiveSplit(part, remainingSeps, maxSize, minSize));
+        current = '';
+      } else {
+        current = part;
+      }
+    }
+  }
+
+  // Flush remainder
+  if (current.trim().length >= minSize) {
+    chunks.push(current.trim());
+  }
+
+  return chunks;
+}
+
+/**
+ * Markdown-aware chunking. Splits on headings, preserves structure.
+ * Each heading starts a new chunk; content under it is grouped.
+ * @param {string} text
+ * @param {object} opts
+ * @returns {string[]}
+ */
+function chunkMarkdown(text, opts) {
+  const { size, minSize } = opts;
+
+  // Split on markdown headings (# through ######)
+  const headingPattern = /^(#{1,6}\s.+)$/gm;
+  const sections = [];
+  let lastIndex = 0;
+  let match;
+
+  while ((match = headingPattern.exec(text)) !== null) {
+    // Content before this heading
+    if (match.index > lastIndex) {
+      const content = text.slice(lastIndex, match.index).trim();
+      if (content) {
+        if (sections.length > 0) {
+          // Append to previous section
+          sections[sections.length - 1].content += '\n\n' + content;
+        } else {
+          sections.push({ heading: '', content });
+        }
+      }
+    }
+    sections.push({ heading: match[1], content: '' });
+    lastIndex = match.index + match[0].length;
+  }
+
+  // Remaining content after last heading
+  if (lastIndex < text.length) {
+    const content = text.slice(lastIndex).trim();
+    if (content) {
+      if (sections.length > 0) {
+        sections[sections.length - 1].content += '\n\n' + content;
+      } else {
+        sections.push({ heading: '', content });
+      }
+    }
+  }
+
+  // Build chunks from sections, splitting large sections recursively
+  const chunks = [];
+  for (const section of sections) {
+    const full = section.heading
+      ? section.heading + '\n\n' + section.content.trim()
+      : section.content.trim();
+
+    if (!full || full.length < (minSize || DEFAULTS.minSize)) continue;
+
+    if (full.length <= size) {
+      chunks.push(full);
+    } else {
+      // Section too large — recursively split the content, prepend heading to first chunk
+      const subChunks = chunkRecursive(section.content.trim(), opts);
+      for (let i = 0; i < subChunks.length; i++) {
+        if (i === 0 && section.heading) {
+          chunks.push(section.heading + '\n\n' + subChunks[i]);
+        } else {
+          chunks.push(subChunks[i]);
+        }
+      }
+    }
+  }
+
+  return chunks;
+}
+
+// ── Shared helpers ──
+
+/**
+ * Group text units (sentences, paragraphs) into chunks under a size limit.
+ * Supports overlap by re-including trailing units from the previous chunk.
+ * @param {string[]} units
+ * @param {number} maxSize
+ * @param {number} overlapChars
+ * @param {number} minSize
+ * @returns {string[]}
+ */
+function groupUnits(units, maxSize, overlapChars, minSize) {
+  const chunks = [];
+  let current = [];
+  let currentLen = 0;
+
+  for (const unit of units) {
+    const addLen = current.length > 0 ? unit.length + 1 : unit.length; // +1 for space
+
+    if (currentLen + addLen > maxSize && current.length > 0) {
+      chunks.push(current.join(' ').trim());
+
+      // Overlap: keep trailing units that fit within overlap budget
+      if (overlapChars > 0) {
+        let overlapUnits = [];
+        let overlapLen = 0;
+        for (let i = current.length - 1; i >= 0; i--) {
+          if (overlapLen + current[i].length + 1 > overlapChars) break;
+          overlapUnits.unshift(current[i]);
+          overlapLen += current[i].length + 1;
+        }
+        current = overlapUnits;
+        currentLen = overlapLen;
+      } else {
+        current = [];
+        currentLen = 0;
+      }
+    }
+
+    current.push(unit);
+    currentLen += addLen;
+  }
+
+  // Flush remainder
+  if (current.length > 0) {
+    const text = current.join(' ').trim();
+    if (text.length >= minSize) chunks.push(text);
+  }
+
+  return chunks;
+}
+
+// ── Token estimation ──
+
+/**
+ * Rough token estimate. ~4 chars per token for English text.
+ * @param {string} text
+ * @returns {number}
+ */
+function estimateTokens(text) {
+  return Math.ceil(text.length / 4);
+}
+
+// ── Public API ──
+
+/**
+ * Chunk text using the specified strategy.
+ * @param {string} text - Input text
+ * @param {object} [options]
+ * @param {string} [options.strategy='recursive'] - Chunking strategy
+ * @param {number} [options.size=512] - Target chunk size in characters
+ * @param {number} [options.overlap=50] - Overlap between chunks in characters
+ * @param {number} [options.minSize=20] - Minimum chunk size
+ * @returns {string[]} Array of text chunks
+ */
+function chunk(text, options = {}) {
+  const opts = {
+    strategy: options.strategy || 'recursive',
+    size: options.size || DEFAULTS.size,
+    overlap: options.overlap != null ? options.overlap : DEFAULTS.overlap,
+    minSize: options.minSize || DEFAULTS.minSize,
+  };
+
+  if (!text || text.trim().length === 0) return [];
+
+  switch (opts.strategy) {
+    case 'fixed':
+      return chunkFixed(text, opts);
+    case 'sentence':
+      return chunkSentence(text, opts);
+    case 'paragraph':
+      return chunkParagraph(text, opts);
+    case 'recursive':
+      return chunkRecursive(text, opts);
+    case 'markdown':
+      return chunkMarkdown(text, opts);
+    default:
+      throw new Error(`Unknown chunking strategy: ${opts.strategy}. Available: ${STRATEGIES.join(', ')}`);
+  }
+}
+
+module.exports = {
+  chunk,
+  splitSentences,
+  estimateTokens,
+  STRATEGIES,
+  DEFAULTS,
+};
```
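A minimal usage sketch of the public API added above; the deep require path is an assumption about how the package's `src/` files are reached from an install, so adjust it to your layout:

```js
// Minimal sketch: chunking a small document with the paragraph strategy.
const { chunk, estimateTokens } = require('voyageai-cli/src/lib/chunker');

const text = 'First paragraph about embeddings.\n\nSecond paragraph. It has two sentences.';

// size 40 forces each paragraph into its own chunk; minSize 5 keeps both.
const parts = chunk(text, { strategy: 'paragraph', size: 40, overlap: 0, minSize: 5 });
console.log(parts.length); // 2: the two paragraphs together exceed 40 chars, so they never merge

// Rough token budget via the ~4 chars/token heuristic.
const tokens = parts.reduce((sum, c) => sum + estimateTokens(c), 0);
console.log(tokens);
```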
package/src/lib/project.js
ADDED

```diff
@@ -0,0 +1,122 @@
+'use strict';
+
+const fs = require('fs');
+const path = require('path');
+
+const PROJECT_FILE = '.vai.json';
+const PROJECT_VERSION = 1;
+
+/**
+ * Search for .vai.json starting from startDir, walking up to root.
+ * @param {string} [startDir] - Directory to start from (default: cwd)
+ * @returns {string|null} Absolute path to .vai.json or null
+ */
+function findProjectFile(startDir) {
+  let dir = path.resolve(startDir || process.cwd());
+  const root = path.parse(dir).root;
+
+  while (dir !== root) {
+    const candidate = path.join(dir, PROJECT_FILE);
+    if (fs.existsSync(candidate)) return candidate;
+    dir = path.dirname(dir);
+  }
+  // Check root too
+  const rootCandidate = path.join(root, PROJECT_FILE);
+  if (fs.existsSync(rootCandidate)) return rootCandidate;
+
+  return null;
+}
+
+/**
+ * Load project config from .vai.json.
+ * @param {string} [startDir] - Directory to start searching from
+ * @returns {{ config: object, filePath: string|null }}
+ */
+function loadProject(startDir) {
+  const filePath = findProjectFile(startDir);
+  if (!filePath) return { config: {}, filePath: null };
+
+  try {
+    const raw = fs.readFileSync(filePath, 'utf-8');
+    return { config: JSON.parse(raw), filePath };
+  } catch (err) {
+    return { config: {}, filePath };
+  }
+}
+
+/**
+ * Save project config to .vai.json.
+ * @param {object} config - Project configuration
+ * @param {string} [targetPath] - Path to write (default: cwd/.vai.json)
+ */
+function saveProject(config, targetPath) {
+  const filePath = targetPath || path.join(process.cwd(), PROJECT_FILE);
+  const output = { version: PROJECT_VERSION, ...config };
+  fs.writeFileSync(filePath, JSON.stringify(output, null, 2) + '\n', 'utf-8');
+  return filePath;
+}
+
+/**
+ * Merge project config with CLI options. CLI options take precedence.
+ * Only merges known keys — doesn't blindly spread everything.
+ * @param {object} projectConfig - From .vai.json
+ * @param {object} cliOpts - From commander
+ * @returns {object} Merged options
+ */
+function mergeOptions(projectConfig, cliOpts) {
+  const merged = {};
+
+  // Map of project config keys → CLI option keys
+  const keys = [
+    'model', 'db', 'collection', 'field', 'inputType',
+    'dimensions', 'index',
+  ];
+
+  for (const key of keys) {
+    // CLI explicit value wins, then project config, then undefined
+    if (cliOpts[key] !== undefined) {
+      merged[key] = cliOpts[key];
+    } else if (projectConfig[key] !== undefined) {
+      merged[key] = projectConfig[key];
+    }
+  }
+
+  // Chunk config nests under project.chunk
+  if (projectConfig.chunk) {
+    merged.chunk = { ...projectConfig.chunk };
+  }
+
+  return merged;
+}
+
+/**
+ * Default project config scaffold.
+ * @returns {object}
+ */
+function defaultProjectConfig() {
+  return {
+    version: PROJECT_VERSION,
+    model: 'voyage-4-large',
+    db: '',
+    collection: '',
+    field: 'embedding',
+    inputType: 'document',
+    dimensions: 1024,
+    index: 'vector_index',
+    chunk: {
+      strategy: 'recursive',
+      size: 512,
+      overlap: 50,
+    },
+  };
+}
+
+module.exports = {
+  PROJECT_FILE,
+  PROJECT_VERSION,
+  findProjectFile,
+  loadProject,
+  saveProject,
+  mergeOptions,
+  defaultProjectConfig,
+};
```
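A sketch of how precedence plays out in `mergeOptions`, with hypothetical values; explicit CLI options win over `.vai.json`:

```js
// Sketch: CLI flags override .vai.json values in mergeOptions().
// The require path and the config values are hypothetical.
const { mergeOptions } = require('voyageai-cli/src/lib/project');

const projectConfig = { model: 'voyage-4-large', db: 'docs', chunk: { strategy: 'markdown' } };
const cliOpts = { db: 'scratch' }; // e.g. the user passed --db scratch

console.log(mergeOptions(projectConfig, cliOpts));
// → { model: 'voyage-4-large', db: 'scratch', chunk: { strategy: 'markdown' } }
```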
package/src/lib/readers.js
ADDED

```diff
@@ -0,0 +1,239 @@
+'use strict';
+
+const fs = require('fs');
+const path = require('path');
+
+/**
+ * Supported file extensions and their reader types.
+ */
+const SUPPORTED_EXTENSIONS = {
+  '.txt': 'text',
+  '.md': 'text',
+  '.markdown': 'text',
+  '.rst': 'text',
+  '.html': 'html',
+  '.htm': 'html',
+  '.json': 'json',
+  '.jsonl': 'jsonl',
+  '.ndjson': 'jsonl',
+  '.csv': 'text',
+  '.pdf': 'pdf',
+};
+
+/**
+ * Check if a file extension is supported.
+ * @param {string} filePath
+ * @returns {boolean}
+ */
+function isSupported(filePath) {
+  const ext = path.extname(filePath).toLowerCase();
+  return ext in SUPPORTED_EXTENSIONS;
+}
+
+/**
+ * Get the reader type for a file.
+ * @param {string} filePath
+ * @returns {string|null}
+ */
+function getReaderType(filePath) {
+  const ext = path.extname(filePath).toLowerCase();
+  return SUPPORTED_EXTENSIONS[ext] || null;
+}
+
+/**
+ * Read a text file (txt, md, rst, csv).
+ * @param {string} filePath
+ * @returns {Promise<string>}
+ */
+async function readTextFile(filePath) {
+  return fs.readFileSync(filePath, 'utf-8');
+}
+
+/**
+ * Read an HTML file and strip tags to plain text.
+ * Lightweight — no external dependencies.
+ * @param {string} filePath
+ * @returns {Promise<string>}
+ */
+async function readHtmlFile(filePath) {
+  const html = fs.readFileSync(filePath, 'utf-8');
+  return stripHtml(html);
+}
+
+/**
+ * Strip HTML tags and decode common entities.
+ * @param {string} html
+ * @returns {string}
+ */
+function stripHtml(html) {
+  return html
+    // Remove script and style blocks
+    .replace(/<script[\s\S]*?<\/script>/gi, '')
+    .replace(/<style[\s\S]*?<\/style>/gi, '')
+    // Replace block elements with newlines
+    .replace(/<\/?(p|div|br|h[1-6]|li|tr|blockquote|section|article|header|footer|nav|pre)[^>]*>/gi, '\n')
+    // Remove remaining tags
+    .replace(/<[^>]+>/g, '')
+    // Decode common entities
+    .replace(/&amp;/g, '&')
+    .replace(/&lt;/g, '<')
+    .replace(/&gt;/g, '>')
+    .replace(/&quot;/g, '"')
+    .replace(/&#39;/g, "'")
+    .replace(/&nbsp;/g, ' ')
+    // Collapse whitespace
+    .replace(/\n{3,}/g, '\n\n')
+    .trim();
+}
+
+/**
+ * Read a JSON file. Extracts text from objects using a text field.
+ * Supports JSON array of objects or a single object with a text field.
+ * @param {string} filePath
+ * @param {string} [textField='text'] - Field name containing text
+ * @returns {Promise<Array<{text: string, metadata: object}>>}
+ */
+async function readJsonFile(filePath, textField = 'text') {
+  const raw = fs.readFileSync(filePath, 'utf-8');
+  const data = JSON.parse(raw);
+
+  if (Array.isArray(data)) {
+    return data.map((item, i) => {
+      const text = item[textField];
+      if (!text) throw new Error(`Missing "${textField}" field in array item ${i}`);
+      const metadata = { ...item };
+      delete metadata[textField];
+      return { text, metadata };
+    });
+  }
+
+  if (typeof data === 'object' && data[textField]) {
+    const metadata = { ...data };
+    delete metadata[textField];
+    return [{ text: data[textField], metadata }];
+  }
+
+  throw new Error(`JSON file must be an array of objects or an object with a "${textField}" field`);
+}
+
+/**
+ * Read a JSONL/NDJSON file.
+ * @param {string} filePath
+ * @param {string} [textField='text']
+ * @returns {Promise<Array<{text: string, metadata: object}>>}
+ */
+async function readJsonlFile(filePath, textField = 'text') {
+  const raw = fs.readFileSync(filePath, 'utf-8');
+  const lines = raw.split('\n').filter(l => l.trim().length > 0);
+
+  return lines.map((line, i) => {
+    const item = JSON.parse(line);
+    const text = item[textField];
+    if (!text) throw new Error(`Missing "${textField}" field on line ${i + 1}`);
+    const metadata = { ...item };
+    delete metadata[textField];
+    return { text, metadata };
+  });
+}
+
+/**
+ * Read a PDF file. Requires optional `pdf-parse` dependency.
+ * @param {string} filePath
+ * @returns {Promise<string>}
+ */
+async function readPdfFile(filePath) {
+  let pdfParse;
+  try {
+    pdfParse = require('pdf-parse');
+  } catch {
+    throw new Error(
+      'PDF support requires the "pdf-parse" package.\n' +
+      'Install it: npm install pdf-parse\n' +
+      'Then retry your command.'
+    );
+  }
+  const buffer = fs.readFileSync(filePath);
+  const data = await pdfParse(buffer);
+  return data.text;
+}
+
+/**
+ * Read a single file and return its text content.
+ * For structured files (JSON/JSONL), returns array of {text, metadata}.
+ * For text files, returns the raw text string.
+ * @param {string} filePath
+ * @param {object} [opts]
+ * @param {string} [opts.textField='text'] - Field name for JSON/JSONL
+ * @returns {Promise<string|Array<{text: string, metadata: object}>>}
+ */
+async function readFile(filePath, opts = {}) {
+  const type = getReaderType(filePath);
+  if (!type) {
+    throw new Error(`Unsupported file type: ${path.extname(filePath)}. Supported: ${Object.keys(SUPPORTED_EXTENSIONS).join(', ')}`);
+  }
+
+  switch (type) {
+    case 'text':
+      return readTextFile(filePath);
+    case 'html':
+      return readHtmlFile(filePath);
+    case 'json':
+      return readJsonFile(filePath, opts.textField || 'text');
+    case 'jsonl':
+      return readJsonlFile(filePath, opts.textField || 'text');
+    case 'pdf':
+      return readPdfFile(filePath);
+    default:
+      throw new Error(`No reader for type: ${type}`);
+  }
+}
+
+/**
+ * Recursively scan a directory for supported files.
+ * @param {string} dirPath
+ * @param {object} [opts]
+ * @param {string[]} [opts.extensions] - Filter to specific extensions
+ * @param {string[]} [opts.ignore] - Directory names to skip
+ * @returns {string[]} Array of absolute file paths
+ */
+function scanDirectory(dirPath, opts = {}) {
+  const ignore = new Set(opts.ignore || ['node_modules', '.git', '.vai', '__pycache__', '.DS_Store']);
+  const extensions = opts.extensions
+    ? new Set(opts.extensions.map(e => e.startsWith('.') ? e : '.' + e))
+    : null;
+
+  const results = [];
+
+  function walk(dir) {
+    const entries = fs.readdirSync(dir, { withFileTypes: true });
+    for (const entry of entries) {
+      if (entry.name.startsWith('.') && ignore.has(entry.name)) continue;
+      if (ignore.has(entry.name)) continue;
+
+      const fullPath = path.join(dir, entry.name);
+
+      if (entry.isDirectory()) {
+        walk(fullPath);
+      } else if (entry.isFile()) {
+        const ext = path.extname(entry.name).toLowerCase();
+        if (extensions) {
+          if (extensions.has(ext)) results.push(fullPath);
+        } else if (SUPPORTED_EXTENSIONS[ext]) {
+          results.push(fullPath);
+        }
+      }
+    }
+  }
+
+  walk(path.resolve(dirPath));
+  return results.sort();
+}
+
+module.exports = {
+  SUPPORTED_EXTENSIONS,
+  isSupported,
+  getReaderType,
+  readFile,
+  scanDirectory,
+  stripHtml,
+};
```
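Finally, a sketch combining the readers: scan a directory, then read each match. The `./docs` folder and the require path are assumptions:

```js
// Sketch: scan a folder for markdown and HTML, then read each file.
const { scanDirectory, readFile, getReaderType } = require('voyageai-cli/src/lib/readers');

(async () => {
  // Extensions may be given with or without the leading dot; scanDirectory normalizes them.
  const files = scanDirectory('./docs', { extensions: ['md', 'html'] });
  for (const f of files) {
    const content = await readFile(f); // a string here; JSON/JSONL input would yield {text, metadata}[]
    console.log(getReaderType(f), content.length);
  }
})();
```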