voyageai-cli 1.20.6 → 1.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +142 -26
- package/README.md +130 -2
- package/package.json +3 -2
- package/src/cli.js +10 -0
- package/src/commands/bug.js +249 -0
- package/src/commands/eval.js +420 -10
- package/src/commands/generate.js +220 -0
- package/src/commands/playground.js +93 -0
- package/src/commands/purge.js +271 -0
- package/src/commands/refresh.js +322 -0
- package/src/commands/scaffold.js +217 -0
- package/src/lib/codegen.js +339 -0
- package/src/lib/explanations.js +155 -0
- package/src/lib/scaffold-structure.js +114 -0
- package/src/lib/templates/nextjs/README.md.tpl +106 -0
- package/src/lib/templates/nextjs/env.example.tpl +8 -0
- package/src/lib/templates/nextjs/layout.jsx.tpl +29 -0
- package/src/lib/templates/nextjs/lib-mongo.js.tpl +111 -0
- package/src/lib/templates/nextjs/lib-voyage.js.tpl +103 -0
- package/src/lib/templates/nextjs/package.json.tpl +33 -0
- package/src/lib/templates/nextjs/page-search.jsx.tpl +147 -0
- package/src/lib/templates/nextjs/route-ingest.js.tpl +114 -0
- package/src/lib/templates/nextjs/route-search.js.tpl +97 -0
- package/src/lib/templates/nextjs/theme.js.tpl +84 -0
- package/src/lib/templates/python/README.md.tpl +145 -0
- package/src/lib/templates/python/app.py.tpl +221 -0
- package/src/lib/templates/python/chunker.py.tpl +127 -0
- package/src/lib/templates/python/env.example.tpl +12 -0
- package/src/lib/templates/python/mongo_client.py.tpl +125 -0
- package/src/lib/templates/python/requirements.txt.tpl +10 -0
- package/src/lib/templates/python/voyage_client.py.tpl +124 -0
- package/src/lib/templates/vanilla/README.md.tpl +156 -0
- package/src/lib/templates/vanilla/client.js.tpl +103 -0
- package/src/lib/templates/vanilla/connection.js.tpl +126 -0
- package/src/lib/templates/vanilla/env.example.tpl +11 -0
- package/src/lib/templates/vanilla/ingest.js.tpl +231 -0
- package/src/lib/templates/vanilla/package.json.tpl +31 -0
- package/src/lib/templates/vanilla/retrieval.js.tpl +100 -0
- package/src/lib/templates/vanilla/search-api.js.tpl +175 -0
- package/src/lib/templates/vanilla/server.js.tpl +81 -0
- package/src/lib/zip.js +130 -0
- package/src/playground/index.html +708 -3
|
@@ -114,6 +114,99 @@ function createPlaygroundServer() {
|
|
|
114
114
|
return;
|
|
115
115
|
}
|
|
116
116
|
|
|
117
|
+
// API: Generate code
|
|
118
|
+
if (req.method === 'POST' && req.url === '/api/generate') {
|
|
119
|
+
let body = '';
|
|
120
|
+
req.on('data', chunk => { body += chunk; });
|
|
121
|
+
req.on('end', () => {
|
|
122
|
+
try {
|
|
123
|
+
const { target, component, config } = JSON.parse(body);
|
|
124
|
+
const codegen = require('../lib/codegen');
|
|
125
|
+
|
|
126
|
+
const templateMap = {
|
|
127
|
+
vanilla: { client: 'client.js', connection: 'connection.js', retrieval: 'retrieval.js', ingest: 'ingest.js', 'search-api': 'search-api.js' },
|
|
128
|
+
nextjs: { client: 'lib-voyage.js', connection: 'lib-mongo.js', retrieval: 'route-search.js', ingest: 'route-ingest.js', 'search-page': 'page-search.jsx' },
|
|
129
|
+
python: { client: 'voyage_client.py', connection: 'mongo_client.py', retrieval: 'app.py', ingest: 'chunker.py' },
|
|
130
|
+
};
|
|
131
|
+
|
|
132
|
+
const templateName = (templateMap[target] || {})[component];
|
|
133
|
+
if (!templateName) {
|
|
134
|
+
res.writeHead(400, { 'Content-Type': 'application/json' });
|
|
135
|
+
res.end(JSON.stringify({ error: `Unknown component: ${component}` }));
|
|
136
|
+
return;
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
const context = codegen.buildContext(config || {}, { projectName: 'my-app' });
|
|
140
|
+
const code = codegen.renderTemplate(target, templateName.replace(/\.(js|jsx|py)$/, ''), context);
|
|
141
|
+
|
|
142
|
+
res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
143
|
+
res.end(JSON.stringify({ code, filename: templateName }));
|
|
144
|
+
} catch (err) {
|
|
145
|
+
res.writeHead(500, { 'Content-Type': 'application/json' });
|
|
146
|
+
res.end(JSON.stringify({ error: err.message }));
|
|
147
|
+
}
|
|
148
|
+
});
|
|
149
|
+
return;
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
// API: Scaffold project (returns ZIP for web mode)
|
|
153
|
+
if (req.method === 'POST' && req.url === '/api/scaffold') {
|
|
154
|
+
let body = '';
|
|
155
|
+
req.on('data', chunk => { body += chunk; });
|
|
156
|
+
req.on('end', () => {
|
|
157
|
+
try {
|
|
158
|
+
const { projectName, target, config } = JSON.parse(body);
|
|
159
|
+
const codegen = require('../lib/codegen');
|
|
160
|
+
const { PROJECT_STRUCTURE } = require('../lib/scaffold-structure');
|
|
161
|
+
const { createZip } = require('../lib/zip');
|
|
162
|
+
|
|
163
|
+
const structure = PROJECT_STRUCTURE[target];
|
|
164
|
+
if (!structure) {
|
|
165
|
+
res.writeHead(400, { 'Content-Type': 'application/json' });
|
|
166
|
+
res.end(JSON.stringify({ error: `Unknown target: ${target}` }));
|
|
167
|
+
return;
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
const context = codegen.buildContext(config || {}, { projectName: projectName || 'my-app' });
|
|
171
|
+
const files = [];
|
|
172
|
+
|
|
173
|
+
// Render template files
|
|
174
|
+
for (const file of structure.files) {
|
|
175
|
+
const content = codegen.renderTemplate(target, file.template.replace(/\.(js|jsx|py|json|md|txt)$/, ''), context);
|
|
176
|
+
files.push({
|
|
177
|
+
name: `${projectName}/${file.output}`,
|
|
178
|
+
content,
|
|
179
|
+
});
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
// Add extra static files
|
|
183
|
+
if (structure.extraFiles) {
|
|
184
|
+
for (const file of structure.extraFiles) {
|
|
185
|
+
const content = typeof file.content === 'function' ? file.content(context) : file.content;
|
|
186
|
+
files.push({
|
|
187
|
+
name: `${projectName}/${file.output}`,
|
|
188
|
+
content,
|
|
189
|
+
});
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
// Create ZIP
|
|
194
|
+
const zipBuffer = createZip(files);
|
|
195
|
+
|
|
196
|
+
res.writeHead(200, {
|
|
197
|
+
'Content-Type': 'application/zip',
|
|
198
|
+
'Content-Disposition': `attachment; filename="${projectName}.zip"`,
|
|
199
|
+
'Content-Length': zipBuffer.length,
|
|
200
|
+
});
|
|
201
|
+
res.end(zipBuffer);
|
|
202
|
+
} catch (err) {
|
|
203
|
+
res.writeHead(500, { 'Content-Type': 'application/json' });
|
|
204
|
+
res.end(JSON.stringify({ error: err.message }));
|
|
205
|
+
}
|
|
206
|
+
});
|
|
207
|
+
return;
|
|
208
|
+
}
|
|
209
|
+
|
|
117
210
|
// API: Concepts (from vai explain)
|
|
118
211
|
if (req.method === 'GET' && req.url === '/api/concepts') {
|
|
119
212
|
const { concepts } = require('../lib/explanations');
|
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const fs = require('fs');
|
|
4
|
+
const path = require('path');
|
|
5
|
+
const p = require('@clack/prompts');
|
|
6
|
+
const { loadProject } = require('../lib/project');
|
|
7
|
+
const { connect, close } = require('../lib/mongo');
|
|
8
|
+
const ui = require('../lib/ui');
|
|
9
|
+
|
|
10
|
+
/**
 * Build a MongoDB filter from the provided criteria.
 *
 * Recognised options:
 *  - `source`: glob-like pattern matched against `metadata.source`. `*` matches
 *    any run of characters and `?` a single character; every other character
 *    is matched literally. The resulting regex is not anchored, so the pattern
 *    may match anywhere inside the stored value.
 *  - `before`: date string; matches documents whose `_embeddedAt` is earlier.
 *  - `model`: exact match on `_model`.
 *  - `filter`: raw MongoDB filter given as a JSON object string.
 *
 * @param {object} options - Criteria as described above.
 * @returns {object} `{}` when no criteria are set, the single condition when
 *   exactly one is set, otherwise `{ $and: [...] }`.
 * @throws {Error} If `before` is not a parseable date, or `filter` is not
 *   valid JSON or does not decode to a plain object.
 */
function buildFilter(options) {
  const conditions = [];

  // Filter by source pattern (glob-like)
  if (options.source) {
    // Escape every regex metacharacter except the glob wildcards first, so
    // names containing "+", "(", "[" etc. are matched literally and can never
    // produce an invalid or unintended regex; then translate the wildcards.
    const pattern = options.source
      .replace(/[.+^${}()|[\]\\]/g, '\\$&')
      .replace(/\*/g, '.*')
      .replace(/\?/g, '.');
    conditions.push({ 'metadata.source': { $regex: pattern } });
  }

  // Filter by embedded date
  if (options.before) {
    const date = new Date(options.before);
    if (Number.isNaN(date.getTime())) {
      throw new Error(`Invalid date format: ${options.before}`);
    }
    conditions.push({ _embeddedAt: { $lt: date } });
  }

  // Filter by model
  if (options.model) {
    conditions.push({ _model: options.model });
  }

  // Raw MongoDB filter
  if (options.filter) {
    let rawFilter;
    try {
      rawFilter = JSON.parse(options.filter);
    } catch (err) {
      throw new Error(`Invalid JSON filter: ${err.message}`);
    }
    // `null`, arrays and primitives are valid JSON but not valid filters.
    if (rawFilter === null || typeof rawFilter !== 'object' || Array.isArray(rawFilter)) {
      throw new Error('Invalid JSON filter: expected a JSON object');
    }
    conditions.push(rawFilter);
  }

  // Combine conditions with $and
  if (conditions.length === 0) {
    return {};
  }
  if (conditions.length === 1) {
    return conditions[0];
  }
  return { $and: conditions };
}
|
|
59
|
+
|
|
60
|
+
/**
 * Check which documents have stale source files (file no longer exists on disk).
 *
 * Only `_id` and `metadata.source` are requested from the collection, since
 * those are the only fields read here.
 *
 * @param {object} collection - Collection exposing `find(filter, options).toArray()`.
 * @param {string} baseDir - Directory that relative `metadata.source` values are resolved against.
 * @returns {Promise<Array>} The `_id` of every document whose source path does not exist.
 */
async function findStaleDocuments(collection, baseDir) {
  const docs = await collection
    .find(
      { 'metadata.source': { $exists: true } },
      { projection: { _id: 1, 'metadata.source': 1 } }
    )
    .toArray();
  const staleIds = [];

  for (const doc of docs) {
    const source = doc.metadata?.source;
    // Skip empty or non-string sources: they cannot be resolved to a path.
    if (typeof source !== 'string' || source === '') {
      continue;
    }

    // Resolve relative to baseDir or treat as absolute
    const filePath = path.isAbsolute(source) ? source : path.join(baseDir, source);
    if (!fs.existsSync(filePath)) {
      staleIds.push(doc._id);
    }
  }

  return staleIds;
}
|
|
80
|
+
|
|
81
|
+
/**
 * Format a sample of documents for display.
 *
 * Each line shows the document's `metadata.source` (falling back to its `_id`,
 * then "unknown"), its `_model`, and the date part of `_embeddedAt`. A missing
 * or unparseable `_embeddedAt` is shown as "unknown" instead of throwing.
 *
 * @param {Array<object>} docs - Documents to describe.
 * @param {number} [limit=5] - Maximum number of documents to include.
 * @returns {string} One bullet line per document, joined with newlines.
 */
function formatSample(docs, limit = 5) {
  return docs
    .slice(0, limit)
    .map(doc => {
      const source = doc.metadata?.source || doc._id?.toString() || 'unknown';
      const model = doc._model || 'unknown';

      let date = 'unknown';
      if (doc._embeddedAt) {
        const parsed = new Date(doc._embeddedAt);
        // toISOString() throws RangeError on an invalid date, so check first.
        if (!Number.isNaN(parsed.getTime())) {
          date = parsed.toISOString().split('T')[0];
        }
      }

      return ` • ${source} (model: ${model}, date: ${date})`;
    })
    .join('\n');
}
|
|
93
|
+
|
|
94
|
+
/**
 * Execute the purge command.
 *
 * Deletes documents from the configured collection that match the supplied
 * criteria. `stale` may be combined with `source` / `before` / `model` /
 * `filter`; in that case only documents that are stale AND match the other
 * criteria are removed.
 *
 * Flow: validate that at least one criterion is set, connect, build the
 * filter, count matches, show (or emit as JSON) a sample, stop on `dryRun`,
 * ask for confirmation unless `force` or `json` is set, then delete.
 *
 * @param {object} [options] - Parsed CLI options (db, collection, source,
 *   before, model, filter, stale, force, dryRun, json, quiet).
 * @returns {Promise<object>} `{ success, deleted }` after a delete,
 *   `{ success, dryRun, count }` for a dry run, `{ success, count: 0 }` when
 *   nothing matches, `{ success: false, cancelled: true }` when the prompt is
 *   declined, or `{ success: false, error }` on failure.
 */
async function purge(options = {}) {
  const quiet = options.quiet || options.json;
  // Number of documents shown in the preview / JSON sample.
  const sampleSize = 5;

  // Load project config
  const project = loadProject();
  const db = options.db || project.db || process.env.VAI_DB || 'vai';
  const collectionName = options.collection || project.collection || process.env.VAI_COLLECTION || 'embeddings';

  if (!quiet) {
    p.intro(ui.title('vai purge'));
  }

  const hasCriteria = Boolean(options.source || options.before || options.model || options.filter);

  // Validate that at least one filter is provided
  if (!hasCriteria && !options.stale) {
    if (options.json) {
      console.log(JSON.stringify({ error: 'No filter criteria provided. Use --source, --before, --model, --filter, or --stale.' }));
    } else {
      p.log.error('No filter criteria provided.');
      p.log.info('Use --source, --before, --model, --filter, or --stale to specify what to purge.');
    }
    return { success: false, error: 'No filter criteria' };
  }

  let client;
  try {
    // Connect to MongoDB
    if (!quiet) {
      p.log.step(`Connecting to database: ${db}`);
    }
    client = await connect(db);
    const collection = client.db(db).collection(collectionName);

    // Build the criteria filter first so invalid input fails before any scan.
    const criteriaFilter = hasCriteria ? buildFilter(options) : null;

    let filter = criteriaFilter;
    let staleIds = [];

    if (options.stale) {
      // Find documents with stale source files
      if (!quiet) {
        p.log.step('Scanning for stale documents (source files that no longer exist)...');
      }
      const baseDir = project.root || process.cwd();
      staleIds = await findStaleDocuments(collection, baseDir);

      if (staleIds.length === 0) {
        if (options.json) {
          console.log(JSON.stringify({ success: true, count: 0, message: 'No stale documents found' }));
        } else {
          p.log.success('No stale documents found.');
          p.outro('Nothing to purge.');
        }
        return { success: true, count: 0 };
      }

      // Narrow the stale set by any additional criteria instead of ignoring them.
      const staleFilter = { _id: { $in: staleIds } };
      filter = criteriaFilter ? { $and: [staleFilter, criteriaFilter] } : staleFilter;
    }

    // Count matching documents (a stale-only run already knows its count)
    const count = options.stale && !criteriaFilter
      ? staleIds.length
      : await collection.countDocuments(filter);

    if (count === 0) {
      if (options.json) {
        console.log(JSON.stringify({ success: true, count: 0, message: 'No matching documents found' }));
      } else {
        p.log.success('No matching documents found.');
        p.outro('Nothing to purge.');
      }
      return { success: true, count: 0 };
    }

    // Get sample for display
    const sampleDocs = await collection.find(filter).limit(sampleSize).toArray();

    if (options.json) {
      if (options.dryRun) {
        console.log(JSON.stringify({
          dryRun: true,
          count,
          sample: sampleDocs.map(d => ({
            id: d._id?.toString(),
            source: d.metadata?.source,
            model: d._model,
            embeddedAt: d._embeddedAt,
          })),
        }));
        return { success: true, dryRun: true, count };
      }
    } else {
      // Show what will be deleted
      p.log.warn(`Found ${count} document${count === 1 ? '' : 's'} matching criteria:`);
      console.log(formatSample(sampleDocs));
      if (count > sampleSize) {
        console.log(` ... and ${count - sampleSize} more`);
      }
      console.log();
    }

    // Dry run - stop here
    if (options.dryRun) {
      if (!quiet) {
        p.log.info('Dry run - no documents deleted.');
        p.outro(`Would delete ${count} document${count === 1 ? '' : 's'}.`);
      }
      return { success: true, dryRun: true, count };
    }

    // Confirm unless --force (JSON mode never prompts)
    if (!options.force && !options.json) {
      const confirmed = await p.confirm({
        message: `Delete ${count} document${count === 1 ? '' : 's'}? This cannot be undone.`,
        initialValue: false,
      });

      if (p.isCancel(confirmed) || !confirmed) {
        p.log.info('Purge cancelled.');
        p.outro('No documents deleted.');
        return { success: false, cancelled: true };
      }
    }

    // Delete documents
    if (!quiet) {
      p.log.step('Deleting documents...');
    }

    const result = await collection.deleteMany(filter);
    const deleted = result.deletedCount;

    if (options.json) {
      console.log(JSON.stringify({ success: true, deleted }));
    } else {
      p.log.success(`Deleted ${deleted} document${deleted === 1 ? '' : 's'}.`);
      p.outro('Purge complete.');
    }

    return { success: true, deleted };
  } catch (err) {
    if (options.json) {
      console.log(JSON.stringify({ error: err.message }));
    } else {
      p.log.error(`Purge failed: ${err.message}`);
    }
    return { success: false, error: err.message };
  } finally {
    if (client) {
      await close();
    }
  }
}
|
|
249
|
+
|
|
250
|
+
/**
 * Register the purge command with Commander.
 *
 * Declares the `purge` subcommand, attaches its options in a fixed order and
 * wires `purge` as the action handler.
 *
 * @param {object} program - Commander program (or any object with the same chainable API).
 */
function register(program) {
  // [flags, description] pairs, in the order they are attached.
  const optionSpecs = [
    ['--db <database>', 'Database name'],
    ['--collection <name>', 'Collection name'],
    ['--source <glob>', 'Filter by metadata.source pattern'],
    ['--before <date>', 'Filter by _embeddedAt before date (ISO 8601)'],
    ['-m, --model <model>', 'Filter by _model field'],
    ['--filter <json>', 'Raw MongoDB filter (JSON)'],
    ['--stale', 'Remove docs whose source files no longer exist'],
    ['--force', 'Skip confirmation prompt'],
    ['--dry-run', 'Show what would be deleted without acting'],
    ['--json', 'Machine-readable output'],
    ['-q, --quiet', 'Suppress non-essential output'],
  ];

  const command = program
    .command('purge')
    .description('Remove embeddings from MongoDB based on criteria');

  // Thread each call through the previous return value, exactly like a fluent chain.
  optionSpecs
    .reduce((chain, [flags, description]) => chain.option(flags, description), command)
    .action(purge);
}
|
|
270
|
+
|
|
271
|
+
module.exports = { register, purge, buildFilter };
|
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const p = require('@clack/prompts');
|
|
4
|
+
const { loadProject, saveProject } = require('../lib/project');
|
|
5
|
+
const { connect, close } = require('../lib/mongo');
|
|
6
|
+
const { generateEmbeddings } = require('../lib/api');
|
|
7
|
+
const { chunkText } = require('../lib/chunker');
|
|
8
|
+
const ui = require('../lib/ui');
|
|
9
|
+
|
|
10
|
+
/**
 * Process documents in batches.
 *
 * Embeds the text of every document in `docs` with a single `embedder` call
 * and returns copies of the documents with the embedding and bookkeeping
 * fields set. The input documents are not mutated.
 *
 * @param {Array<object>} docs - Documents to embed; text is read from `text`, falling back to `content`.
 * @param {Function} embedder - Async function taking an array of strings and resolving to one vector per string.
 * @param {object} options
 * @param {string} options.field - Name of the field that receives the embedding.
 * @param {string} options.model - Model name stored in `_model`.
 * @returns {Promise<Array<object>>} Copies of `docs` with `[field]`, `_model` and `_embeddedAt` set.
 * @throws {Error} If the embedder does not return exactly one vector per document.
 */
async function processBatch(docs, embedder, options) {
  // Nothing to embed: skip the embedder call.
  if (docs.length === 0) {
    return [];
  }

  // Same text lookup as rechunkDocument: `text` first, then `content`.
  const texts = docs.map(d => d.text || d.content || '');
  const embeddings = await embedder(texts);

  // Guard against silently writing `undefined` embeddings.
  if (!Array.isArray(embeddings) || embeddings.length !== docs.length) {
    const got = Array.isArray(embeddings) ? embeddings.length : 'none';
    throw new Error(`Embedding count mismatch: expected ${docs.length}, got ${got}`);
  }

  return docs.map((doc, i) => ({
    ...doc,
    [options.field]: embeddings[i],
    _model: options.model,
    _embeddedAt: new Date(),
  }));
}
|
|
24
|
+
|
|
25
|
+
/**
 * Re-chunk a document's text.
 *
 * Splits the document's `text` (or `content`) with `chunkText` and returns one
 * new document per chunk. Each chunk copies the source document's fields
 * except `_id`, so it can be inserted as a new document; the source `_id` is
 * kept as a string in `metadata.originalId`.
 *
 * @param {object} doc - Source document.
 * @param {object} options
 * @param {string} [options.strategy='recursive'] - Chunking strategy passed to `chunkText`.
 * @param {number} [options.chunkSize=512] - Chunk size passed to `chunkText`.
 * @param {number} [options.overlap=50] - Chunk overlap passed to `chunkText`; an explicit 0 is honoured.
 * @returns {Array<object>} The chunk documents, or `[doc]` unchanged when the document has no text.
 */
function rechunkDocument(doc, options) {
  const text = doc.text || doc.content || '';
  if (!text) return [doc];

  const chunks = chunkText(text, {
    strategy: options.strategy || 'recursive',
    chunkSize: options.chunkSize || 512,
    // Keep an explicit 0; only fall back when overlap is missing or not a number.
    overlap: Number.isFinite(options.overlap) ? options.overlap : 50,
  });

  // Drop `_id` so every chunk gets its own identifier on insert instead of
  // colliding with the source document.
  const { _id: sourceId, ...fields } = doc;
  const originalId = sourceId?.toString();

  return chunks.map((chunk, i) => ({
    ...fields,
    text: chunk.text,
    _chunkIndex: i,
    _chunkCount: chunks.length,
    metadata: {
      ...doc.metadata,
      chunkIndex: i,
      chunkCount: chunks.length,
      originalId,
    },
  }));
}
|
|
51
|
+
|
|
52
|
+
/**
 * Execute the refresh command.
 *
 * Re-embeds every document matching `options.filter` with the selected model
 * and dimensions. Without `rechunk` the embedding is updated in place. With
 * `rechunk`, each document that has text is split into chunk documents which
 * are inserted as new documents; the source document is deleted afterwards,
 * but only when none of the batches containing its chunks failed.
 *
 * @param {object} [options] - Parsed CLI options (db, collection, field,
 *   model, dimensions, rechunk, strategy, chunkSize, overlap, batchSize,
 *   filter, force, dryRun, json, quiet).
 * @returns {Promise<object>} `{ success, processed, errors }` after a run,
 *   `{ success, dryRun, count }` for a dry run, `{ success, count: 0 }` when
 *   nothing matches, `{ success: false, cancelled: true }` when the prompt is
 *   declined, or `{ success: false, error }` on failure.
 */
async function refresh(options = {}) {
  const quiet = options.quiet || options.json;

  // Load project config
  const project = loadProject();
  const db = options.db || project.db || process.env.VAI_DB || 'vai';
  const collectionName = options.collection || project.collection || process.env.VAI_COLLECTION || 'embeddings';
  const field = options.field || project.field || 'embedding';
  const model = options.model || project.model || 'voyage-3.5-lite';
  const dimensions = options.dimensions || project.dimensions;
  const batchSize = options.batchSize || 25;

  if (!quiet) {
    p.intro(ui.title('vai refresh'));
  }

  let client;
  // Declared outside the try block so the catch handler can stop it.
  let spinner = null;
  try {
    // Connect to MongoDB
    if (!quiet) {
      p.log.step(`Connecting to database: ${db}`);
    }
    client = await connect(db);
    const collection = client.db(db).collection(collectionName);

    // Build filter
    let filter = {};
    if (options.filter) {
      try {
        filter = JSON.parse(options.filter);
      } catch (err) {
        throw new Error(`Invalid JSON filter: ${err.message}`);
      }
    }

    // Count documents
    const totalCount = await collection.countDocuments(filter);

    if (totalCount === 0) {
      if (options.json) {
        console.log(JSON.stringify({ success: true, count: 0, message: 'No documents to refresh' }));
      } else {
        p.log.success('No documents to refresh.');
        p.outro('Nothing to do.');
      }
      return { success: true, count: 0 };
    }

    // Show plan
    const rechunkLabel = options.rechunk ? ` (re-chunking with ${options.strategy || 'recursive'})` : '';
    const dimLabel = dimensions ? ` @ ${dimensions}d` : '';

    if (options.json && options.dryRun) {
      console.log(JSON.stringify({
        dryRun: true,
        count: totalCount,
        model,
        dimensions: dimensions || 'default',
        rechunk: !!options.rechunk,
      }));
      return { success: true, dryRun: true, count: totalCount };
    }

    if (!quiet) {
      p.log.info(`Found ${totalCount} document${totalCount === 1 ? '' : 's'} to refresh`);
      p.log.info(`Target model: ${model}${dimLabel}${rechunkLabel}`);
    }

    // Dry run - stop here
    if (options.dryRun) {
      if (!quiet) {
        p.log.info('Dry run - no documents modified.');
        p.outro(`Would refresh ${totalCount} document${totalCount === 1 ? '' : 's'}.`);
      }
      return { success: true, dryRun: true, count: totalCount };
    }

    // Confirm unless --force (JSON mode never prompts)
    if (!options.force && !options.json) {
      const confirmed = await p.confirm({
        message: `Re-embed ${totalCount} document${totalCount === 1 ? '' : 's'}? This will update the embeddings in-place.`,
        initialValue: true,
      });

      if (p.isCancel(confirmed) || !confirmed) {
        p.log.info('Refresh cancelled.');
        p.outro('No documents modified.');
        return { success: false, cancelled: true };
      }
    }

    // Create embedder function
    const embedder = async (texts) => {
      const result = await generateEmbeddings(texts, {
        model,
        dimensions,
        inputType: 'document',
      });
      return result.embeddings;
    };

    // Process documents
    let processed = 0;
    let errors = 0;
    // Each entry is { doc, originalKey }; originalKey is the stringified _id of
    // the source document for a chunk, or null for an in-place update.
    let batch = [];
    // Source documents that were split into chunks: stringified _id -> _id.
    const originals = new Map();
    // Source documents with at least one chunk in a failed batch; never deleted.
    const failedOriginals = new Set();
    // Ids of chunk documents inserted during this run, so they are skipped if
    // the cursor returns them later.
    const insertedIds = new Set();

    spinner = !quiet ? p.spinner() : null;
    if (spinner) spinner.start('Processing documents...');

    // Embed and persist the pending batch, then clear it. A failure is counted
    // and reported but does not abort the run.
    const flushBatch = async () => {
      if (batch.length === 0) return;
      const entries = batch;
      batch = [];

      try {
        const updated = await processBatch(entries.map(entry => entry.doc), embedder, { field, model });

        for (let i = 0; i < updated.length; i += 1) {
          const updatedDoc = updated[i];
          if (entries[i].originalKey !== null) {
            // Chunk: insert as a new document. Any inherited _id is dropped so
            // the insert cannot collide with the source document.
            const { _id: inheritedId, ...chunkDoc } = updatedDoc;
            const result = await collection.insertOne(chunkDoc);
            if (result && result.insertedId != null) {
              insertedIds.add(String(result.insertedId));
            }
          } else {
            // Update in place
            await collection.updateOne(
              { _id: updatedDoc._id },
              { $set: { [field]: updatedDoc[field], _model: model, _embeddedAt: new Date() } }
            );
          }
        }

        processed += entries.length;
        if (spinner) spinner.message(`Processed ${processed}/${totalCount} documents...`);
      } catch (err) {
        errors += entries.length;
        for (const entry of entries) {
          if (entry.originalKey !== null) failedOriginals.add(entry.originalKey);
        }
        if (!quiet) {
          p.log.warn(`Batch error: ${err.message}`);
        }
      }
    };

    const cursor = collection.find(filter);
    while (await cursor.hasNext()) {
      const doc = await cursor.next();

      // Skip chunk documents created earlier in this run.
      if (insertedIds.has(String(doc._id))) continue;

      if (options.rechunk) {
        const chunks = rechunkDocument(doc, options);
        // rechunkDocument hands back the same object when there is no text to
        // split; such documents are updated in place.
        const unchanged = chunks.length === 1 && chunks[0] === doc;
        if (unchanged) {
          batch.push({ doc, originalKey: null });
        } else if (chunks.length > 0) {
          const originalKey = String(doc._id);
          originals.set(originalKey, doc._id);
          for (const chunk of chunks) {
            batch.push({ doc: chunk, originalKey });
          }
        }
      } else {
        batch.push({ doc, originalKey: null });
      }

      // Process when batch is full
      if (batch.length >= batchSize) {
        await flushBatch();
      }
    }

    // Process remaining batch
    await flushBatch();

    // If rechunking, delete the source documents whose chunks were all stored.
    if (options.rechunk) {
      const deletable = [...originals]
        .filter(([key]) => !failedOriginals.has(key))
        .map(([, id]) => id);

      if (deletable.length > 0) {
        await collection.deleteMany({ _id: { $in: deletable } });
      }
    }

    if (spinner) {
      spinner.stop('Processing complete.');
      spinner = null;
    }

    // Update project config if model/dimensions changed
    const configUpdated = (model !== project.model) || (dimensions && dimensions !== project.dimensions);
    if (configUpdated && !options.json) {
      try {
        saveProject({
          ...project,
          model,
          ...(dimensions && { dimensions }),
        });
        if (!quiet) {
          p.log.info('Updated .vai.json with new model/dimensions.');
        }
      } catch (err) {
        // Saving the config is best-effort: report it, but do not fail the run.
        if (!quiet) {
          p.log.warn(`Could not update .vai.json: ${err.message}`);
        }
      }
    }

    if (options.json) {
      console.log(JSON.stringify({ success: true, processed, errors }));
    } else {
      if (errors > 0) {
        p.log.warn(`Refreshed ${processed} documents with ${errors} errors.`);
      } else {
        p.log.success(`Refreshed ${processed} document${processed === 1 ? '' : 's'}.`);
      }
      p.outro('Refresh complete.');
    }

    return { success: true, processed, errors };
  } catch (err) {
    if (spinner) spinner.stop('Processing failed.');
    if (options.json) {
      console.log(JSON.stringify({ error: err.message }));
    } else {
      p.log.error(`Refresh failed: ${err.message}`);
    }
    return { success: false, error: err.message };
  } finally {
    if (client) {
      await close();
    }
  }
}
|
|
296
|
+
|
|
297
|
+
/**
 * Register the refresh command with Commander.
 *
 * Declares the `refresh` subcommand with its options and wires `refresh` as
 * the action handler. Numeric options are parsed as base-10 integers.
 *
 * @param {object} program - Commander program (or any object with the same chainable API).
 */
function register(program) {
  // Commander calls an option parser as (value, previousValue). Passing
  // parseInt directly would use previousValue as the radix, so wrap it and
  // always parse in base 10.
  const toInt = (value) => Number.parseInt(value, 10);

  program
    .command('refresh')
    .description('Re-embed documents with a new model, dimensions, or chunk settings')
    .option('--db <database>', 'Database name')
    .option('--collection <name>', 'Collection name')
    .option('--field <name>', 'Embedding field name')
    .option('-m, --model <model>', 'New embedding model')
    .option('-d, --dimensions <n>', 'New dimensions', toInt)
    .option('--rechunk', 'Re-chunk text before re-embedding')
    .option('-s, --strategy <strategy>', 'Chunk strategy (with --rechunk)')
    .option('-c, --chunk-size <n>', 'Chunk size (with --rechunk)', toInt)
    .option('--overlap <n>', 'Chunk overlap (with --rechunk)', toInt)
    .option('--batch-size <n>', 'Texts per API call (default: 25)', toInt)
    .option('--filter <json>', 'Only refresh matching documents (JSON)')
    .option('--force', 'Skip confirmation prompt')
    .option('--dry-run', 'Show plan without executing')
    .option('--json', 'Machine-readable output')
    .option('-q, --quiet', 'Suppress non-essential output')
    .action(refresh);
}
|
|
321
|
+
|
|
322
|
+
module.exports = { register, refresh };
|