voyageai-cli 1.13.0 → 1.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/cli.js +6 -0
- package/src/commands/benchmark.js +164 -0
- package/src/commands/chunk.js +277 -0
- package/src/commands/completions.js +51 -1
- package/src/commands/estimate.js +209 -0
- package/src/commands/init.js +153 -0
- package/src/commands/models.js +32 -4
- package/src/lib/catalog.js +42 -18
- package/src/lib/chunker.js +341 -0
- package/src/lib/explanations.js +183 -0
- package/src/lib/project.js +122 -0
- package/src/lib/readers.js +239 -0
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const fs = require('fs');
|
|
4
|
+
const path = require('path');
|
|
5
|
+
|
|
6
|
+
/**
 * Lookup table from lower-case file extension (leading dot included) to the
 * name of the reader that handles it.
 *
 * The entry order is significant: the keys are joined, in insertion order,
 * into the "Unsupported file type" error message produced by readFile.
 */
const SUPPORTED_EXTENSIONS = Object.fromEntries([
  // Plain-text formats are returned verbatim.
  ['.txt', 'text'],
  ['.md', 'text'],
  ['.markdown', 'text'],
  ['.rst', 'text'],
  // Markup that is reduced to plain text.
  ['.html', 'html'],
  ['.htm', 'html'],
  // Structured records carrying a text field.
  ['.json', 'json'],
  ['.jsonl', 'jsonl'],
  ['.ndjson', 'jsonl'],
  // CSV is read as raw text, not parsed into rows.
  ['.csv', 'text'],
  ['.pdf', 'pdf'],
]);
|
|
22
|
+
|
|
23
|
+
/**
 * Tell whether a path ends in an extension listed in SUPPORTED_EXTENSIONS.
 * The extension is lower-cased before the lookup, so "A.TXT" and "a.txt"
 * are treated alike.
 * @param {string} filePath - Path or bare file name to inspect
 * @returns {boolean} true when the extension is a key of SUPPORTED_EXTENSIONS
 */
function isSupported(filePath) {
  const suffix = path.extname(filePath).toLowerCase();
  return Reflect.has(SUPPORTED_EXTENSIONS, suffix);
}
|
|
32
|
+
|
|
33
|
+
/**
 * Resolve the reader name registered for a path's extension.
 * The extension is lower-cased before it is looked up in SUPPORTED_EXTENSIONS.
 * @param {string} filePath - Path or bare file name to inspect
 * @returns {string|null} The reader name, or null when the lookup yields nothing
 */
function getReaderType(filePath) {
  const readerName = SUPPORTED_EXTENSIONS[path.extname(filePath).toLowerCase()];
  if (readerName) {
    return readerName;
  }
  return null;
}
|
|
42
|
+
|
|
43
|
+
/**
 * Load a file as UTF-8 text and hand the contents back unchanged.
 * The read itself is synchronous; the function is async so that every
 * reader exposes the same Promise-returning interface.
 * @param {string} filePath - File to read
 * @returns {Promise<string>} The decoded file contents
 */
async function readTextFile(filePath) {
  const contents = fs.readFileSync(filePath, { encoding: 'utf-8' });
  return contents;
}
|
|
51
|
+
|
|
52
|
+
/**
 * Load an HTML document as UTF-8 and reduce it to plain text by passing
 * the markup through stripHtml. No external dependencies are involved.
 * @param {string} filePath - HTML file to read
 * @returns {Promise<string>} Whatever stripHtml returns for the file's markup
 */
async function readHtmlFile(filePath) {
  return stripHtml(fs.readFileSync(filePath, 'utf-8'));
}
|
|
62
|
+
|
|
63
|
+
/**
 * Reduce an HTML string to plain text.
 *
 * Steps, in order: drop <script> and <style> blocks together with their
 * contents, turn common block-level tags into newlines, remove every other
 * tag, decode a small set of character entities, collapse runs of three or
 * more newlines into a blank line, and trim the result.
 *
 * Decoded entities: &amp; &lt; &gt; &quot; &#39; &apos; &nbsp;. Any other
 * entity is left untouched.
 * @param {string} html - Markup to convert
 * @returns {string} Plain-text rendering of the markup
 */
function stripHtml(html) {
  // Keyed by the text between "&" and ";".
  const entityText = {
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    '#39': "'",
    apos: "'",
    nbsp: ' ',
  };

  return html
    // Remove script and style blocks, contents included
    .replace(/<script[\s\S]*?<\/script>/gi, '')
    .replace(/<style[\s\S]*?<\/style>/gi, '')
    // Replace block elements with newlines
    .replace(/<\/?(p|div|br|h[1-6]|li|tr|blockquote|section|article|header|footer|nav|pre)[^>]*>/gi, '\n')
    // Remove remaining tags
    .replace(/<[^>]+>/g, '')
    // Decode entities in ONE pass. Chaining one replace per entity would
    // decode twice: "&amp;lt;" would become "&lt;" and then "<", although the
    // author wrote a literal "&lt;".
    .replace(/&(amp|lt|gt|quot|apos|nbsp|#39);/g, (whole, name) => entityText[name])
    // Collapse runs of blank lines
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
|
|
88
|
+
|
|
89
|
+
/**
 * Read a JSON file and extract text records from it.
 *
 * Two layouts are accepted:
 *  - an array, where every item must carry a truthy `textField` value;
 *  - a single object carrying a truthy `textField` value.
 * Every other property of an item is copied into that record's `metadata`.
 *
 * A value counts as missing when it is falsy, so an empty string is rejected.
 * @param {string} filePath - JSON file to read
 * @param {string} [textField='text'] - Field name containing text
 * @returns {Promise<Array<{text: string, metadata: object}>>}
 * @throws {SyntaxError} When the file is not valid JSON (raised by JSON.parse)
 * @throws {Error} When an array item lacks the text field, or the top-level
 *   value is neither an array nor an object with the text field
 */
async function readJsonFile(filePath, textField = 'text') {
  const raw = fs.readFileSync(filePath, 'utf-8');
  const data = JSON.parse(raw);

  // Split one source object into its text and the remaining properties.
  const toRecord = (source) => {
    const metadata = { ...source };
    delete metadata[textField];
    return { text: source[textField], metadata };
  };

  if (Array.isArray(data)) {
    return data.map((item, i) => {
      // JSON allows `null` items; indexing into null would raise a TypeError
      // instead of the descriptive error below.
      const text = item == null ? undefined : item[textField];
      if (!text) throw new Error(`Missing "${textField}" field in array item ${i}`);
      return toRecord(item);
    });
  }

  // `typeof null` is 'object', so null has to be excluded explicitly.
  if (data !== null && typeof data === 'object' && data[textField]) {
    return [toRecord(data)];
  }

  throw new Error(`JSON file must be an array of objects or an object with a "${textField}" field`);
}
|
|
118
|
+
|
|
119
|
+
/**
 * Read a JSONL/NDJSON file: one JSON value per line, blank lines ignored.
 *
 * Each record must carry a truthy `textField` value; every other property is
 * copied into that record's `metadata`. Line numbers in error messages are
 * 1-based positions in the file itself, so blank lines are counted.
 * @param {string} filePath - JSONL file to read
 * @param {string} [textField='text'] - Field name containing text
 * @returns {Promise<Array<{text: string, metadata: object}>>}
 * @throws {SyntaxError} When a line is not valid JSON (message names the line)
 * @throws {Error} When a record lacks the text field
 */
async function readJsonlFile(filePath, textField = 'text') {
  const raw = fs.readFileSync(filePath, 'utf-8');
  const records = [];

  // Iterate over the unfiltered lines so the index stays tied to the real
  // file position; filtering blanks first would shift the reported numbers.
  for (const [index, line] of raw.split('\n').entries()) {
    if (line.trim().length === 0) continue;
    const lineNumber = index + 1;

    let item;
    try {
      // A trailing "\r" from CRLF files is JSON whitespace and parses fine.
      item = JSON.parse(line);
    } catch (err) {
      // Keep the SyntaxError type but say where the bad record lives.
      throw new SyntaxError(`Invalid JSON on line ${lineNumber}: ${err.message}`);
    }

    // A literal `null` line parses successfully but has no fields to read.
    const text = item == null ? undefined : item[textField];
    if (!text) throw new Error(`Missing "${textField}" field on line ${lineNumber}`);

    const metadata = { ...item };
    delete metadata[textField];
    records.push({ text, metadata });
  }

  return records;
}
|
|
138
|
+
|
|
139
|
+
/**
 * Extract the text of a PDF using the optional `pdf-parse` package.
 *
 * The package is loaded on demand; if loading fails for any reason an Error
 * explaining how to install it is thrown before the file is touched.
 * @param {string} filePath - PDF file to read
 * @returns {Promise<string>} The `text` property of the pdf-parse result
 * @throws {Error} When `pdf-parse` cannot be loaded
 */
async function readPdfFile(filePath) {
  // Resolve the optional dependency lazily so the rest of the module works
  // without it being installed.
  const loadParser = () => {
    try {
      return require('pdf-parse');
    } catch {
      throw new Error(
        'PDF support requires the "pdf-parse" package.\n' +
        'Install it: npm install pdf-parse\n' +
        'Then retry your command.'
      );
    }
  };

  const parse = loadParser();
  const parsed = await parse(fs.readFileSync(filePath));
  return parsed.text;
}
|
|
159
|
+
|
|
160
|
+
/**
 * Read one file with the reader matching its extension.
 *
 * Structured files (JSON/JSONL) resolve to an array of {text, metadata};
 * the text, HTML and PDF readers resolve to a string.
 * @param {string} filePath - File to read
 * @param {object} [opts]
 * @param {string} [opts.textField='text'] - Field name for JSON/JSONL; a
 *   falsy value falls back to 'text'
 * @returns {Promise<string|Array<{text: string, metadata: object}>>}
 * @throws {Error} When the extension has no reader type, or the reader type
 *   has no reader
 */
async function readFile(filePath, opts = {}) {
  const type = getReaderType(filePath);
  if (!type) {
    throw new Error(`Unsupported file type: ${path.extname(filePath)}. Supported: ${Object.keys(SUPPORTED_EXTENSIONS).join(', ')}`);
  }

  // Each entry is a thunk so that only the selected reader is evaluated.
  const readers = {
    text: () => readTextFile(filePath),
    html: () => readHtmlFile(filePath),
    json: () => readJsonFile(filePath, opts.textField || 'text'),
    jsonl: () => readJsonlFile(filePath, opts.textField || 'text'),
    pdf: () => readPdfFile(filePath),
  };

  if (!Object.prototype.hasOwnProperty.call(readers, type)) {
    throw new Error(`No reader for type: ${type}`);
  }
  return readers[type]();
}
|
|
190
|
+
|
|
191
|
+
/**
 * Recursively scan a directory for files to ingest.
 *
 * Any entry (file or directory) whose name is in the ignore set is skipped.
 * Only regular files and directories are considered. A file is kept when
 * its lower-cased extension is in `opts.extensions`, or, when no filter is
 * given, when it is a key of SUPPORTED_EXTENSIONS.
 * @param {string} dirPath - Directory to scan; resolved to an absolute path
 * @param {object} [opts]
 * @param {string[]} [opts.extensions] - Filter to specific extensions; the
 *   leading dot is optional and matching is case-insensitive
 * @param {string[]} [opts.ignore] - Entry names to skip; replaces the
 *   default list when provided
 * @returns {string[]} Sorted array of absolute file paths
 */
function scanDirectory(dirPath, opts = {}) {
  const ignore = new Set(opts.ignore || ['node_modules', '.git', '.vai', '__pycache__', '.DS_Store']);

  // File extensions are lower-cased before comparison, so the filter must be
  // normalised the same way; otherwise "MD" or ".TXT" could never match.
  const extensions = opts.extensions
    ? new Set(opts.extensions.map((e) => {
      const normalized = e.trim().toLowerCase();
      return normalized.startsWith('.') ? normalized : `.${normalized}`;
    }))
    : null;

  const results = [];

  // Depth-first traversal that appends matching files to `results`.
  function walk(dir) {
    for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
      if (ignore.has(entry.name)) continue;

      const fullPath = path.join(dir, entry.name);

      if (entry.isDirectory()) {
        walk(fullPath);
        continue;
      }
      if (!entry.isFile()) continue;

      const ext = path.extname(entry.name).toLowerCase();
      const wanted = extensions ? extensions.has(ext) : Boolean(SUPPORTED_EXTENSIONS[ext]);
      if (wanted) results.push(fullPath);
    }
  }

  walk(path.resolve(dirPath));
  return results.sort();
}
|
|
231
|
+
|
|
232
|
+
module.exports = {
|
|
233
|
+
SUPPORTED_EXTENSIONS,
|
|
234
|
+
isSupported,
|
|
235
|
+
getReaderType,
|
|
236
|
+
readFile,
|
|
237
|
+
scanDirectory,
|
|
238
|
+
stripHtml,
|
|
239
|
+
};
|