voyageai-cli 1.13.0 → 1.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,239 @@
1
+ 'use strict';
2
+
3
+ const fs = require('fs');
4
+ const path = require('path');
5
+
6
/**
 * Lookup table from file extension (lower-case, with leading dot) to the
 * name of the reader that handles it.
 *
 * Insertion order is significant: the keys are listed verbatim in the
 * "Unsupported file type" error message produced by readFile().
 */
const SUPPORTED_EXTENSIONS = Object.fromEntries([
  // Plain-text formats are passed through unchanged.
  ['.txt', 'text'],
  ['.md', 'text'],
  ['.markdown', 'text'],
  ['.rst', 'text'],
  // Markup that is reduced to plain text.
  ['.html', 'html'],
  ['.htm', 'html'],
  // Structured formats that yield {text, metadata} records.
  ['.json', 'json'],
  ['.jsonl', 'jsonl'],
  ['.ndjson', 'jsonl'],
  // CSV is read as raw text, not parsed into rows.
  ['.csv', 'text'],
  ['.pdf', 'pdf'],
]);
22
+
23
/**
 * Report whether a path ends in an extension listed in
 * SUPPORTED_EXTENSIONS. The extension is lower-cased before the lookup,
 * so the check is case-insensitive.
 * @param {string} filePath - Path or bare file name to inspect
 * @returns {boolean} true when the extension is a key of SUPPORTED_EXTENSIONS
 */
function isSupported(filePath) {
  const extension = path.extname(filePath).toLowerCase();
  // Reflect.has is the function form of the `in` operator.
  return Reflect.has(SUPPORTED_EXTENSIONS, extension);
}
32
+
33
/**
 * Resolve the reader type registered for a path's extension.
 * The extension is lower-cased before it is looked up in
 * SUPPORTED_EXTENSIONS.
 * @param {string} filePath - Path or bare file name to inspect
 * @returns {string|null} The reader type, or null when the lookup yields
 *   nothing (or a falsy value)
 */
function getReaderType(filePath) {
  const readerType = SUPPORTED_EXTENSIONS[path.extname(filePath).toLowerCase()];
  return readerType ? readerType : null;
}
42
+
43
/**
 * Load a file's entire contents as a UTF-8 string.
 * The read itself is synchronous; the function is async only so that it
 * returns a promise and a read failure surfaces as a rejection.
 * @param {string} filePath - Path of the file to read
 * @returns {Promise<string>} The decoded file contents, unmodified
 */
async function readTextFile(filePath) {
  const contents = fs.readFileSync(filePath, { encoding: 'utf-8' });
  return contents;
}
51
+
52
/**
 * Load an HTML file as UTF-8 and reduce it to plain text by passing the
 * markup through stripHtml(). No external HTML parser is involved.
 * @param {string} filePath - Path of the HTML file to read
 * @returns {Promise<string>} Whatever stripHtml() returns for the file's markup
 */
async function readHtmlFile(filePath) {
  return stripHtml(fs.readFileSync(filePath, 'utf-8'));
}
62
+
63
/**
 * Reduce an HTML string to plain text using regular expressions only.
 *
 * Steps, in order: drop <script>/<style> blocks, turn block-level tags
 * into newlines, remove every remaining tag, decode a small set of
 * entities (&amp; &lt; &gt; &quot; &#39; &nbsp;), collapse runs of three
 * or more newlines to two, and trim the result.
 *
 * @param {string} html - Markup to convert
 * @returns {string} The text content with tags removed
 */
function stripHtml(html) {
  const entities = {
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    '&#39;': "'",
    '&nbsp;': ' ',
  };

  return html
    // Remove script and style blocks, including their contents
    .replace(/<script[\s\S]*?<\/script>/gi, '')
    .replace(/<style[\s\S]*?<\/style>/gi, '')
    // Replace block elements with newlines. The lookahead requires the tag
    // name to end here, so <param>, <link>, <track> etc. are not mistaken
    // for <p>, <li>, <tr>.
    .replace(/<\/?(?:p|div|br|h[1-6]|li|tr|blockquote|section|article|header|footer|nav|pre)(?=[\s\/>])[^>]*>/gi, '\n')
    // Remove remaining tags
    .replace(/<[^>]+>/g, '')
    // Decode entities in a single pass so that the output of one decoding
    // is never decoded again (e.g. "&amp;lt;" must become "&lt;", not "<").
    .replace(/&(?:amp|lt|gt|quot|#39|nbsp);/g, (entity) => entities[entity])
    // Collapse excess blank lines
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
88
+
89
/**
 * Read a JSON file and extract text records from it.
 *
 * Accepts either an array of objects or a single object. In both cases the
 * value under `textField` becomes `text` and all other properties are
 * copied into `metadata`.
 *
 * @param {string} filePath - Path of the JSON file
 * @param {string} [textField='text'] - Field name containing text
 * @returns {Promise<Array<{text: string, metadata: object}>>}
 * @throws {SyntaxError} If the file is not valid JSON
 * @throws {Error} If an array item lacks a truthy `textField`, or the
 *   top-level value is neither an array nor an object with that field
 */
async function readJsonFile(filePath, textField = 'text') {
  const raw = fs.readFileSync(filePath, 'utf-8');
  // Node does not strip a UTF-8 byte-order mark, and JSON.parse rejects it.
  const data = JSON.parse(raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw);

  // typeof null is 'object', so null has to be excluded explicitly.
  const isRecord = (value) => value !== null && typeof value === 'object';

  // Split one record into its text and the remaining properties.
  const toDocument = (record) => {
    const metadata = { ...record };
    delete metadata[textField];
    return { text: record[textField], metadata };
  };

  if (Array.isArray(data)) {
    return data.map((item, i) => {
      if (!isRecord(item) || !item[textField]) {
        throw new Error(`Missing "${textField}" field in array item ${i}`);
      }
      return toDocument(item);
    });
  }

  if (isRecord(data) && data[textField]) {
    return [toDocument(data)];
  }

  throw new Error(`JSON file must be an array of objects or an object with a "${textField}" field`);
}
118
+
119
/**
 * Read a JSONL/NDJSON file: one JSON object per non-blank line.
 *
 * For every record the value under `textField` becomes `text` and all
 * other properties are copied into `metadata`. Blank lines are skipped.
 * Line numbers in error messages are 1-based positions in the file, so
 * they stay correct when blank lines precede the offending line.
 *
 * @param {string} filePath - Path of the JSONL file
 * @param {string} [textField='text'] - Field name containing text
 * @returns {Promise<Array<{text: string, metadata: object}>>}
 * @throws {SyntaxError} If a line is not valid JSON (message names the line)
 * @throws {Error} If a record lacks a truthy `textField`
 */
async function readJsonlFile(filePath, textField = 'text') {
  const raw = fs.readFileSync(filePath, 'utf-8');
  // Node does not strip a UTF-8 byte-order mark, and JSON.parse rejects it.
  const body = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;
  const lines = body.split(/\r?\n/);

  const records = [];
  for (let index = 0; index < lines.length; index += 1) {
    const line = lines[index];
    if (line.trim().length === 0) continue;
    const lineNumber = index + 1;

    let item;
    try {
      item = JSON.parse(line);
    } catch (err) {
      // Keep the SyntaxError type but say where in the file it happened.
      throw new SyntaxError(`Invalid JSON on line ${lineNumber}: ${err.message}`);
    }

    // A line such as `null` or `42` is valid JSON but is not a record.
    const text = item !== null && typeof item === 'object' ? item[textField] : undefined;
    if (!text) throw new Error(`Missing "${textField}" field on line ${lineNumber}`);

    const metadata = { ...item };
    delete metadata[textField];
    records.push({ text, metadata });
  }
  return records;
}
138
+
139
+ /**
140
+ * Read a PDF file. Requires optional `pdf-parse` dependency.
141
+ * @param {string} filePath
142
+ * @returns {Promise<string>}
143
+ */
144
+ async function readPdfFile(filePath) {
145
+ let pdfParse;
146
+ try {
147
+ pdfParse = require('pdf-parse');
148
+ } catch {
149
+ throw new Error(
150
+ 'PDF support requires the "pdf-parse" package.\n' +
151
+ 'Install it: npm install pdf-parse\n' +
152
+ 'Then retry your command.'
153
+ );
154
+ }
155
+ const buffer = fs.readFileSync(filePath);
156
+ const data = await pdfParse(buffer);
157
+ return data.text;
158
+ }
159
+
160
/**
 * Read one file with the reader registered for its extension.
 *
 * Text, HTML and PDF files resolve to a single string; JSON and JSONL
 * files resolve to an array of {text, metadata} records.
 *
 * @param {string} filePath - Path of the file to read
 * @param {object} [opts]
 * @param {string} [opts.textField='text'] - Field name for JSON/JSONL
 * @returns {Promise<string|Array<{text: string, metadata: object}>>}
 * @throws {Error} If the extension has no reader type, or the reader type
 *   is not one this function knows how to dispatch
 */
async function readFile(filePath, opts = {}) {
  const readerType = getReaderType(filePath);

  if (!readerType) {
    const extension = path.extname(filePath);
    const supported = Object.keys(SUPPORTED_EXTENSIONS).join(', ');
    throw new Error(`Unsupported file type: ${extension}. Supported: ${supported}`);
  }

  if (readerType === 'text') return readTextFile(filePath);
  if (readerType === 'html') return readHtmlFile(filePath);
  // opts is only consulted by the structured readers.
  if (readerType === 'json') return readJsonFile(filePath, opts.textField || 'text');
  if (readerType === 'jsonl') return readJsonlFile(filePath, opts.textField || 'text');
  if (readerType === 'pdf') return readPdfFile(filePath);

  throw new Error(`No reader for type: ${readerType}`);
}
190
+
191
/**
 * Recursively collect the files under a directory that have a wanted
 * extension.
 *
 * Extension matching is case-insensitive on both sides: file extensions
 * and the entries of `opts.extensions` are lower-cased before comparison.
 * Entries that are neither regular files nor directories (as reported by
 * readdir's file types) are skipped.
 *
 * @param {string} dirPath - Directory to scan; resolved to an absolute path
 * @param {object} [opts]
 * @param {string[]} [opts.extensions] - Restrict results to these
 *   extensions, with or without a leading dot. When omitted, every
 *   extension in SUPPORTED_EXTENSIONS is accepted.
 * @param {string[]} [opts.ignore] - Entry names to skip. Applies to files
 *   as well as directories. Defaults to common tooling directories.
 * @returns {string[]} Sorted array of absolute file paths
 */
function scanDirectory(dirPath, opts = {}) {
  const ignore = new Set(opts.ignore || ['node_modules', '.git', '.vai', '__pycache__', '.DS_Store']);
  // Normalise the filter the same way file extensions are normalised
  // below, otherwise 'MD' or '.Txt' could never match.
  const extensions = opts.extensions
    ? new Set(opts.extensions.map((e) => (e.startsWith('.') ? e : '.' + e).toLowerCase()))
    : null;

  const results = [];

  function walk(dir) {
    for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
      if (ignore.has(entry.name)) continue;

      const fullPath = path.join(dir, entry.name);

      if (entry.isDirectory()) {
        walk(fullPath);
        continue;
      }
      if (!entry.isFile()) continue;

      const ext = path.extname(entry.name).toLowerCase();
      const wanted = extensions ? extensions.has(ext) : Boolean(SUPPORTED_EXTENSIONS[ext]);
      if (wanted) results.push(fullPath);
    }
  }

  walk(path.resolve(dirPath));
  return results.sort();
}
231
+
232
// Public surface of this module. The individual format readers
// (readTextFile, readHtmlFile, readJsonFile, readJsonlFile, readPdfFile)
// are not exported here.
const publicApi = {
  SUPPORTED_EXTENSIONS,
  isSupported,
  getReaderType,
  readFile,
  scanDirectory,
  stripHtml,
};

module.exports = publicApi;