vesper-wizard 2.1.5 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,464 @@
1
+ import { classifyDomain } from "../metadata/domain.js";
2
// Tokens with no ranking value: generic English stop words plus
// dataset-search filler words ("dataset", "find", "show", ...) that are
// stripped by tokenize() before extracting search keywords.
// (Note: "need" appears twice; Set construction dedupes it.)
const STOP_WORDS = new Set([
    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
    "of", "with", "by", "from", "as", "is", "was", "are", "were", "been",
    "be", "have", "has", "had", "do", "does", "did", "will", "would",
    "could", "should", "may", "might", "must", "shall", "can", "need",
    "about", "into", "through", "during", "before", "after", "above",
    "below", "between", "under", "again", "further", "then", "once",
    "here", "there", "when", "where", "why", "how", "all", "each",
    "few", "more", "most", "other", "some", "such", "no", "nor", "not",
    "only", "own", "same", "so", "than", "too", "very", "just", "also",
    "dataset", "datasets", "data", "find", "search", "looking", "need", "want",
    "give", "show", "me", "please"
]);
// Canonical language name -> spellings / ISO-style codes recognized in
// free text by detectLanguage() and getLanguageAliases().
const LANGUAGE_ALIASES = {
    english: ["english", "en", "eng"],
    spanish: ["spanish", "es", "spa"],
    french: ["french", "fr", "fra"],
    german: ["german", "de", "deu"],
    portuguese: ["portuguese", "pt", "por"],
    chinese: ["chinese", "zh", "cmn"],
    japanese: ["japanese", "ja", "jpn"],
    korean: ["korean", "ko", "kor"],
    arabic: ["arabic", "ar", "ara"],
    russian: ["russian", "ru", "rus"],
    hindi: ["hindi", "hi", "hin"],
    multilingual: ["multilingual", "bilingual", "cross-lingual", "crosslingual"],
};
// Ordered task detectors used by detectTask(); the first entry with a
// matching pattern wins, so order matters when patterns overlap
// (e.g. "detection" also appears under object-detection).
const TASK_PATTERNS = [
    { task: "translation", patterns: [/\btranslation\b/i, /\bmachine translation\b/i, /\bparallel corpus\b/i] },
    { task: "question-answering", patterns: [/\bquestion answering\b/i, /\bqa\b/i, /\bq&a\b/i] },
    { task: "summarization", patterns: [/\bsummarization\b/i, /\bsummary\b/i, /\btl;dr\b/i] },
    { task: "sentiment-analysis", patterns: [/\bsentiment\b/i, /\bsentiment analysis\b/i] },
    { task: "text-classification", patterns: [/\bclassification\b/i, /\bclassifier\b/i, /\btext classification\b/i] },
    { task: "token-classification", patterns: [/\bner\b/i, /\bnamed entity\b/i, /\btoken classification\b/i] },
    { task: "text-generation", patterns: [/\btext generation\b/i, /\bgenerative\b/i, /\binstruction\b/i, /\bchat\b/i] },
    { task: "image-classification", patterns: [/\bimage classification\b/i] },
    { task: "object-detection", patterns: [/\bobject detection\b/i, /\bdetection\b/i] },
];
// Memoizes intent-analysis promises keyed by "query::requirements".
const intentCache = new Map();
41
/**
 * Parse a free-text dataset query (plus optional requirements text) into a
 * structured search intent. Builds a heuristic intent first, then tries to
 * refine it with an LLM provider when API keys are configured.
 *
 * Results are memoized per (query, requirements) pair. Fix: a rejected
 * analysis promise is now evicted from the cache so one transient failure
 * (e.g. classifyDomain throwing) does not permanently poison that key.
 *
 * @param {string|undefined} query - Free-text dataset query.
 * @param {string|undefined} requirements - Extra requirement text.
 * @returns {Promise<object>} The parsed intent object.
 */
export async function analyzeDatasetQuery(query, requirements) {
    const cacheKey = `${query || ""}::${requirements || ""}`;
    const cached = intentCache.get(cacheKey);
    if (cached) {
        return cached;
    }
    const task = (async () => {
        const heuristic = buildHeuristicIntent(query, requirements);
        const llmIntent = await tryLlmIntent(heuristic, requirements);
        return llmIntent ? mergeIntent(heuristic, llmIntent) : heuristic;
    })();
    // Evict on failure so a transient error is retried on the next call
    // instead of being served from the cache forever. This handler also
    // prevents an unhandled-rejection warning when no caller awaits yet.
    task.catch(() => intentCache.delete(cacheKey));
    intentCache.set(cacheKey, task);
    return task;
}
55
/**
 * Score how well a dataset candidate matches a parsed search intent.
 * Adds boosts for language/task/domain/size/keyword matches and penalties
 * for mismatches or negative-term hits; the result is rounded to two
 * decimal places.
 *
 * Fixes: (1) free-text language matching is now word-boundary aware —
 * plain substring matching let short ISO codes such as "en" match inside
 * unrelated words ("sentiment", "generation"), producing false language
 * matches; (2) missing tags/languages/term arrays no longer throw.
 *
 * @param {object} dataset - Candidate dataset record.
 * @param {object|undefined} intent - Parsed intent; falsy scores 0.
 * @returns {number} Relevance score (may be negative).
 */
export function scoreDatasetAgainstIntent(dataset, intent) {
    if (!intent)
        return 0;
    // Guard against records that omit array fields so scoring never throws.
    const tags = Array.isArray(dataset.tags) ? dataset.tags : [];
    const languages = Array.isArray(dataset.languages) ? dataset.languages : [];
    const text = [
        dataset.name,
        dataset.description,
        dataset.task,
        dataset.domain || "",
        tags.join(" "),
        languages.join(" "),
    ].join(" ").toLowerCase();
    // Boundary-aware containment check for short alias tokens.
    const textHasToken = (token) => token.length > 0
        && new RegExp(`(^|[^a-z0-9])${escapeRegex(token)}([^a-z0-9]|$)`).test(text);
    let score = 0;
    if (intent.language) {
        const aliases = getLanguageAliases(intent.language);
        const datasetLanguages = languages.map(normalizeToken);
        const languageMatch = aliases.some(alias => datasetLanguages.includes(alias) || textHasToken(alias));
        const isMultilingualIntent = intent.language === "multilingual";
        if (languageMatch) {
            // Check if the dataset is monolingual in the requested language vs multilingual
            const nonRequestedLanguages = datasetLanguages.filter(lang => !aliases.includes(lang) && lang !== "" && lang !== "unknown");
            if (nonRequestedLanguages.length === 0 || isMultilingualIntent) {
                // Purely the requested language (or user wants multilingual) → full boost
                score += 0.55;
            }
            else {
                // Bilingual/multilingual dataset that CONTAINS the language but isn't exclusive
                // Penalize proportionally to how many other languages are present
                const ratio = nonRequestedLanguages.length / Math.max(datasetLanguages.length, 1);
                score += 0.1 - (ratio * 0.4); // ranges from +0.1 (mostly target lang) to -0.3 (mostly other langs)
            }
        }
        else if (languages.length > 0) {
            // Declares languages, but not the requested one → strong penalty.
            score -= 0.65;
        }
        else {
            // No language metadata at all → mild uncertainty penalty.
            score -= 0.1;
        }
    }
    if (intent.task) {
        if (matchesTask(dataset, intent.task, text)) {
            score += 0.35;
        }
        else {
            score -= 0.3;
        }
    }
    if (intent.domain && intent.domain !== "general" && intent.domain !== "unknown") {
        const datasetDomain = String(dataset.domain || "").toLowerCase();
        if (datasetDomain === intent.domain || text.includes(intent.domain)) {
            score += 0.25;
        }
        else {
            score -= 0.2;
        }
    }
    if (intent.minRows && intent.minRows > 0) {
        const totalExamples = Number(dataset.total_examples || 0);
        if (totalExamples > 0) {
            // Log-scaled bonus when the dataset meets the requested size;
            // increasingly harsh penalties the further it falls short.
            const ratio = totalExamples / intent.minRows;
            if (ratio >= 1) {
                score += Math.min(0.45, 0.18 + (Math.log10(ratio + 1) * 0.15));
            }
            else if (ratio < 0.05) {
                score -= 1.2;
            }
            else if (ratio < 0.25) {
                score -= 0.8;
            }
            else if (ratio < 0.5) {
                score -= 0.45;
            }
            else {
                score -= 0.15;
            }
        }
        else {
            // Size unknown → small penalty for uncertainty.
            score -= 0.08;
        }
    }
    const positiveTerms = intent.positiveTerms || [];
    if (positiveTerms.length > 0) {
        const matches = positiveTerms.filter(term => text.includes(term)).length;
        score += Math.min(0.25, matches * 0.06);
    }
    if ((intent.negativeTerms || []).some(term => text.includes(term))) {
        score -= 0.7;
    }
    return Math.round(score * 100) / 100;
}
143
/**
 * Return the search string for an intent. The string itself is assembled
 * during intent construction; this accessor keeps callers from reaching
 * into the intent shape directly.
 */
export function buildIntentSearchQuery(intent) {
    const { searchQuery } = intent;
    return searchQuery;
}
146
/**
 * Build HuggingFace-compatible filter tags from the parsed intent.
 * Returns e.g. ["language:en", "task_ids:text-classification"].
 * "multilingual" is not a single language, so it never produces a
 * language tag; unknown language names with no ISO code are skipped.
 */
export function buildHuggingFaceFilterTags(intent) {
    const tags = [];
    const wantsSpecificLanguage = intent.language && intent.language !== "multilingual";
    if (wantsSpecificLanguage) {
        const code = LANGUAGE_TO_CODE[intent.language];
        if (code) {
            tags.push(`language:${code}`);
        }
    }
    if (intent.task) {
        tags.push(`task_ids:${intent.task}`);
    }
    return tags;
}
162
// Canonical language name -> two-letter code emitted in HuggingFace
// `language:` filter tags by buildHuggingFaceFilterTags().
// "multilingual" deliberately has no entry (it is not a single language).
const LANGUAGE_TO_CODE = {
    english: "en",
    spanish: "es",
    french: "fr",
    german: "de",
    portuguese: "pt",
    chinese: "zh",
    japanese: "ja",
    korean: "ko",
    arabic: "ar",
    russian: "ru",
    hindi: "hi",
};
175
/**
 * Build a search intent from the raw query using regex/keyword heuristics
 * only (no LLM involved).
 *
 * @param {string|undefined} query - Free-text dataset query.
 * @param {string|undefined} requirements - Extra requirement text, appended to the query.
 * @returns {object} Intent: originalQuery, normalizedQuery, searchQuery,
 *   positiveTerms, negativeTerms, language, task, domain, minRows,
 *   llmBacked (always false here).
 */
function buildHeuristicIntent(query, requirements) {
    const originalQuery = `${query || ""} ${requirements || ""}`.trim();
    const normalizedQuery = originalQuery.toLowerCase();
    // Tokens written with a leading hyphen ("-medical") are exclusions.
    const negativeTerms = [...normalizedQuery.matchAll(/(?:^|\s)-([\w-]{2,})/g)].map(match => normalizeToken(match[1]));
    // Keep at most 8 salient keywords that are not also excluded.
    const positiveTerms = tokenize(normalizedQuery)
        .filter(token => !negativeTerms.includes(token))
        .slice(0, 8);
    const task = detectTask(normalizedQuery);
    const language = detectLanguage(normalizedQuery);
    // classifyDomain comes from ../metadata/domain.js — presumably maps
    // query text to a domain label like "general"/"unknown"; confirm there.
    const domain = classifyDomain(normalizedQuery, [], normalizedQuery, task);
    const minRows = extractRequestedRows(normalizedQuery);
    // Compose a compact search string: language, task, specific domain,
    // then keywords — deduplicated, truthy-only, capped at 6 terms below.
    const searchTerms = [
        language,
        task,
        domain !== "general" && domain !== "unknown" ? domain : undefined,
        ...positiveTerms,
    ].filter((value, index, self) => !!value && self.indexOf(value) === index);
    return {
        originalQuery,
        normalizedQuery,
        // Fall back to the whole normalized query when no terms survived.
        searchQuery: searchTerms.slice(0, 6).join(" ") || normalizedQuery,
        positiveTerms,
        negativeTerms,
        language,
        task: task || undefined,
        domain,
        minRows,
        llmBacked: false,
    };
}
205
/**
 * Merge an LLM-extracted intent into the heuristic baseline. LLM fields
 * win when present (after normalization); the heuristic values fill any
 * gaps. The search string is recomposed from the merged fields.
 */
function mergeIntent(base, llmIntent) {
    // Prefer the normalized LLM value, otherwise keep the heuristic one.
    const pickToken = (candidate, fallback) => (candidate ? normalizeToken(candidate) : fallback);
    const language = pickToken(llmIntent.language, base.language);
    const task = pickToken(llmIntent.task, base.task);
    const domain = pickToken(llmIntent.domain, base.domain);
    const llmRowsUsable = typeof llmIntent.minRows === "number" && Number.isFinite(llmIntent.minRows);
    const minRows = llmRowsUsable ? llmIntent.minRows : base.minRows;
    // Union of LLM and heuristic terms, normalized, deduped, empties dropped.
    const unionTerms = (llmTerms, baseTerms) => Array.from(new Set([...(llmTerms || []), ...baseTerms].map(normalizeToken))).filter(Boolean);
    const positiveTerms = unionTerms(llmIntent.positiveTerms, base.positiveTerms);
    const negativeTerms = unionTerms(llmIntent.negativeTerms, base.negativeTerms);
    const merged = {
        ...base,
        language,
        task,
        domain,
        minRows,
        positiveTerms,
        negativeTerms,
        llmBacked: true,
    };
    // Rebuild the search string: language, task, specific domain, keywords —
    // truthy, deduped, capped at 6; whole query as a last resort.
    const parts = [
        merged.language,
        merged.task,
        merged.domain !== "general" && merged.domain !== "unknown" ? merged.domain : undefined,
        ...merged.positiveTerms,
    ];
    const uniqueParts = parts.filter((value, index, self) => !!value && self.indexOf(value) === index);
    merged.searchQuery = uniqueParts.slice(0, 6).join(" ") || merged.normalizedQuery;
    return merged;
}
232
/**
 * Try to refine the heuristic intent with an LLM provider. OpenAI takes
 * precedence when configured; Gemini (via either of its key env vars) is
 * the fallback. Any provider failure degrades silently to undefined so
 * the caller keeps the heuristic-only intent.
 */
async function tryLlmIntent(base, requirements) {
    if (process.env.OPENAI_API_KEY) {
        return callOpenAiIntent(base, requirements).catch(() => undefined);
    }
    const geminiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY;
    if (geminiKey) {
        return callGeminiIntent(base, requirements, geminiKey).catch(() => undefined);
    }
    return undefined;
}
243
/**
 * Ask OpenAI's chat-completions API to extract structured search intent.
 * Returns the parsed intent payload, or undefined on a non-OK HTTP
 * response or unparsable content. Network errors and aborts reject; the
 * caller (tryLlmIntent) converts rejections to undefined.
 */
async function callOpenAiIntent(base, requirements) {
    // Abort the request if the provider takes longer than 5 seconds.
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), 5000);
    try {
        const response = await fetch("https://api.openai.com/v1/chat/completions", {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
                Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
            },
            body: JSON.stringify({
                model: process.env.OPENAI_MODEL || "gpt-4o-mini",
                // temperature 0 + json_object response format for
                // deterministic, machine-parsable output.
                temperature: 0,
                response_format: { type: "json_object" },
                messages: [
                    {
                        role: "system",
                        content: "Extract dataset search intent as JSON with keys: language, task, domain, minRows, positiveTerms, negativeTerms. Use null for unknowns.",
                    },
                    {
                        // The heuristic intent is included so the model can
                        // refine rather than start from scratch.
                        role: "user",
                        content: JSON.stringify({ query: base.originalQuery, requirements: requirements || null, heuristic: base }),
                    },
                ],
            }),
            signal: controller.signal,
        });
        if (!response.ok) {
            return undefined;
        }
        const body = await response.json();
        const content = body?.choices?.[0]?.message?.content;
        return parseIntentPayload(content);
    }
    finally {
        // Always clear the abort timer, on success and failure alike.
        clearTimeout(timeout);
    }
}
281
/**
 * Ask Google's Gemini generateContent API to extract structured search
 * intent. Mirrors callOpenAiIntent: 5s abort timeout, undefined on
 * non-OK/unparsable responses, rejections handled by the caller.
 *
 * @param {object} base - Heuristic intent (provides originalQuery and context).
 * @param {string|undefined} requirements - Extra requirement text.
 * @param {string} apiKey - Gemini/Google API key (passed as query param).
 */
async function callGeminiIntent(base, requirements, apiKey) {
    // Abort the request if the provider takes longer than 5 seconds.
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), 5000);
    try {
        const model = process.env.GEMINI_MODEL || "gemini-1.5-flash";
        const response = await fetch(`https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent?key=${encodeURIComponent(apiKey)}`, {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
            },
            body: JSON.stringify({
                generationConfig: {
                    // Deterministic JSON output.
                    temperature: 0,
                    responseMimeType: "application/json",
                },
                contents: [{
                        role: "user",
                        parts: [{
                                text: `Extract dataset search intent as JSON with keys language, task, domain, minRows, positiveTerms, negativeTerms. Query payload: ${JSON.stringify({ query: base.originalQuery, requirements: requirements || null, heuristic: base })}`,
                            }],
                    }],
            }),
            signal: controller.signal,
        });
        if (!response.ok) {
            return undefined;
        }
        const body = await response.json();
        // Gemini nests text under candidates[0].content.parts[0].
        const content = body?.candidates?.[0]?.content?.parts?.[0]?.text;
        return parseIntentPayload(content);
    }
    finally {
        // Always clear the abort timer, on success and failure alike.
        clearTimeout(timeout);
    }
}
316
/**
 * Parse an LLM response string into an intent-fields object. Accepts both
 * camelCase and snake_case keys; every field is type-checked and falls
 * back to undefined. Returns undefined for empty, brace-less, or
 * syntactically invalid payloads.
 */
function parseIntentPayload(content) {
    if (typeof content !== "string" || !content.trim()) {
        return undefined;
    }
    const jsonText = extractJsonObject(content);
    if (!jsonText) {
        return undefined;
    }
    let parsed;
    try {
        parsed = JSON.parse(jsonText);
    }
    catch {
        return undefined;
    }
    // Narrowing helpers: wrong-typed values become undefined rather than
    // leaking into the intent.
    const asString = (value) => (typeof value === "string" ? value : undefined);
    const asNumber = (value) => (typeof value === "number" ? value : undefined);
    const asStringArray = (value) => (Array.isArray(value)
        ? value.filter((item) => typeof item === "string")
        : undefined);
    return {
        language: asString(parsed.language),
        task: asString(parsed.task),
        domain: asString(parsed.domain),
        minRows: asNumber(parsed.minRows) ?? asNumber(parsed.min_rows),
        positiveTerms: asStringArray(parsed.positiveTerms) ?? asStringArray(parsed.positive_terms),
        negativeTerms: asStringArray(parsed.negativeTerms) ?? asStringArray(parsed.negative_terms),
    };
}
351
/**
 * Pull a JSON-object-looking substring out of LLM output. If the trimmed
 * text is already a bare {...} it is returned as-is; otherwise the widest
 * first-"{" to last-"}" span is extracted (handles prose or code fences
 * around the JSON). Returns undefined when no such span exists.
 */
function extractJsonObject(text) {
    const trimmed = text.trim();
    const isBareObject = trimmed.startsWith("{") && trimmed.endsWith("}");
    if (isBareObject) {
        return trimmed;
    }
    const openIndex = trimmed.indexOf("{");
    const closeIndex = trimmed.lastIndexOf("}");
    return openIndex >= 0 && closeIndex > openIndex
        ? trimmed.slice(openIndex, closeIndex + 1)
        : undefined;
}
363
/**
 * Detect the first canonical language whose alias appears in the text.
 * An alias must be delimited by non-letters (or string edges), so "en."
 * and "(fr)" match while "tench" does not. Returns undefined when no
 * alias is found. Iteration follows LANGUAGE_ALIASES insertion order.
 */
function detectLanguage(text) {
    const aliasAppears = (alias) => new RegExp(`(^|[^a-z])${escapeRegex(alias)}([^a-z]|$)`, "i").test(text);
    const hit = Object.entries(LANGUAGE_ALIASES).find(([, aliases]) => aliases.some(aliasAppears));
    return hit ? hit[0] : undefined;
}
371
/**
 * Return the task id of the first TASK_PATTERNS entry with a pattern
 * matching the text, or undefined when nothing matches.
 */
function detectTask(text) {
    for (const { task, patterns } of TASK_PATTERNS) {
        if (patterns.some((pattern) => pattern.test(text))) {
            return task;
        }
    }
    return undefined;
}
375
/**
 * Split text into unique, normalized keyword tokens. Punctuation (except
 * hyphens/underscores) becomes whitespace first; tokens shorter than 3
 * characters, stop words, and all-digit tokens are dropped. First-seen
 * order is preserved.
 */
function tokenize(text) {
    const unique = new Set();
    const rawTokens = text.replace(/[^\w\s-]/g, " ").split(/\s+/);
    for (const raw of rawTokens) {
        const token = normalizeToken(raw);
        if (token.length > 2 && !STOP_WORDS.has(token) && !/^\d+$/.test(token)) {
            unique.add(token);
        }
    }
    return Array.from(unique);
}
382
/**
 * Lowercase a token and strip non-alphanumerics from its edges. Trailing
 * hyphens are deliberately kept (the trailing class allows "-") so
 * hyphen-bearing tokens keep their shape; leading hyphens are stripped.
 */
function normalizeToken(value) {
    const lowered = value.toLowerCase();
    const withoutLeading = lowered.replace(/^[^a-z0-9]+/, "");
    return withoutLeading.replace(/[^a-z0-9-]+$/, "").trim();
}
385
/**
 * Extract a requested dataset size (row count) from free text. Strategies
 * run from most to least explicit; the first one that yields a positive
 * finite number wins:
 *   1. number + magnitude word ("2.5 million", "500 thousand")
 *   2. separated digits + unit noun ("12,500 rows")
 *   3. k/m/b suffix + unit noun ("250k samples")
 *   4. largest comma-grouped number anywhere ("10,000")
 *   5. largest k/m/b suffixed number anywhere ("500k")
 *   6. largest bare 4-9 digit integer (can false-positive on years)
 * Returns undefined when nothing usable is found.
 */
function extractRequestedRows(text) {
    const wordMultipliers = {
        thousand: 1_000, million: 1_000_000, billion: 1_000_000_000,
        mil: 1_000_000, bil: 1_000_000_000,
    };
    const suffixMultiplier = (suffix) => (suffix === "k" ? 1_000 : suffix === "m" ? 1_000_000 : 1_000_000_000);
    const positiveOrUndefined = (value) => (Number.isFinite(value) && value > 0 ? value : undefined);

    // 1. "1 million", "2.5 billion", "500 thousand", "2 mil" ...
    const wordPattern = new RegExp(`(\\d+(?:\\.\\d+)?)\\s*(${Object.keys(wordMultipliers).join("|")})\\b`, "i");
    const wordMatch = text.match(wordPattern);
    if (wordMatch) {
        const scaled = Math.round(Number(wordMatch[1]) * wordMultipliers[wordMatch[2].toLowerCase()]);
        const candidate = positiveOrUndefined(scaled);
        if (candidate !== undefined)
            return candidate;
    }
    // 2. Digits (commas/spaces allowed) immediately before a unit noun.
    const explicit = text.match(/(\d[\d,\s]{1,12})\s*(samples?|rows?|records?)/i);
    if (explicit) {
        const candidate = positiveOrUndefined(Number(explicit[1].replace(/[\s,]/g, "")));
        if (candidate !== undefined)
            return candidate;
    }
    // 3. k/m/b shorthand followed by a unit noun.
    const humanSized = text.match(/(\d+(?:\.\d+)?)\s*([kmb])\s*(samples?|rows?|records?)/i);
    if (humanSized) {
        const scaled = Math.round(Number(humanSized[1]) * suffixMultiplier(humanSized[2].toLowerCase()));
        const candidate = positiveOrUndefined(scaled);
        if (candidate !== undefined)
            return candidate;
    }
    // 4. Any comma-grouped number; pick the largest.
    const commaNumbers = [...text.matchAll(/\b\d{1,3}(?:,\d{3})+\b/g)]
        .map((match) => Number(match[0].replace(/,/g, "")))
        .filter((value) => Number.isFinite(value) && value > 0);
    if (commaNumbers.length > 0) {
        return Math.max(...commaNumbers);
    }
    // 5. k/m/b shorthand without a unit noun; pick the largest.
    const humanSizedAnywhere = [...text.matchAll(/\b(\d+(?:\.\d+)?)\s*([kmb])\b/gi)]
        .map((match) => Math.round(Number(match[1]) * suffixMultiplier(match[2].toLowerCase())))
        .filter((value) => Number.isFinite(value) && value > 0);
    if (humanSizedAnywhere.length > 0) {
        return Math.max(...humanSizedAnywhere);
    }
    // 6. Bare 4-9 digit integers as a last resort; pick the largest.
    const allNumbers = [...text.matchAll(/\b\d{4,9}\b/g)]
        .map((match) => Number(match[0]))
        .filter((value) => Number.isFinite(value) && value > 0);
    if (allNumbers.length > 0) {
        return Math.max(...allNumbers);
    }
    return undefined;
}
442
/**
 * Check whether a dataset plausibly supports the requested task by
 * matching the task (or any of its known alias spellings) against the
 * dataset's declared task field or the combined free-text blob.
 */
function matchesTask(dataset, task, text) {
    const normalizedTask = normalizeToken(task);
    const aliases = {
        "question-answering": ["question-answering", "qa", "question answering"],
        "text-classification": ["text-classification", "classification", "text classification"],
        "token-classification": ["token-classification", "ner", "named entity"],
        "sentiment-analysis": ["sentiment-analysis", "sentiment"],
        translation: ["translation", "machine-translation", "parallel corpus"],
        summarization: ["summarization", "summary"],
        "text-generation": ["text-generation", "generation", "chat", "instruction"],
        "image-classification": ["image-classification", "image classification"],
        "object-detection": ["object-detection", "object detection"],
    };
    // Unknown tasks fall back to matching their own normalized name.
    const variants = aliases[normalizedTask] || [normalizedTask];
    const declaredTask = normalizeToken(dataset.task);
    return variants.some((variant) => declaredTask.includes(variant) || text.includes(variant));
}
458
/**
 * Return the normalized alias list for a language name. Unknown languages
 * yield a single-element list containing their own normalized form.
 */
function getLanguageAliases(language) {
    const normalized = normalizeToken(language);
    const known = LANGUAGE_ALIASES[normalized];
    const aliases = known || [normalized];
    return aliases.map(normalizeToken);
}
462
/**
 * Escape every character that is special inside a RegExp pattern so the
 * value can be embedded as a literal match.
 */
function escapeRegex(value) {
    let escaped = "";
    for (const ch of value) {
        escaped += ".*+?^${}()|[]\\".includes(ch) ? `\\${ch}` : ch;
    }
    return escaped;
}
@@ -0,0 +1,130 @@
1
+ import { spawn } from "child_process";
2
+ import fs from "fs";
3
+ import os from "os";
4
+ import path from "path";
5
// Resolve the user's home directory, falling back through env vars and
// finally to the build directory when no home can be determined.
function getHomeDir(buildDir) {
    const detected = os.homedir();
    if (detected) {
        return detected;
    }
    return process.env.HOME || process.env.USERPROFILE || buildDir;
}
// All Vesper-managed state lives under <home>/.vesper.
export function getVesperDataRoot(buildDir = process.cwd()) {
    return path.join(getHomeDir(buildDir), ".vesper");
}
// Interpreter path inside the managed virtualenv; the venv layout differs
// between Windows (Scripts/python.exe) and POSIX (bin/python).
export function getManagedPythonPath(buildDir = process.cwd()) {
    const venvDir = path.join(getVesperDataRoot(buildDir), ".venv");
    if (process.platform === "win32") {
        return path.join(venvDir, "Scripts", "python.exe");
    }
    return path.join(venvDir, "bin", "python");
}
// System interpreter command used before the managed environment exists.
function getFallbackPythonCommand() {
    if (process.platform === "win32") {
        return "py";
    }
    return "python3";
}
20
/**
 * Pick the Python interpreter to run. Resolution order: the managed
 * ~/.vesper venv if it exists → an explicit VESPER_PYTHON override →
 * project-local .venv dirs (buildDir and its parent) → the platform's
 * default python command.
 */
export function resolvePythonCommand(buildDir = process.cwd()) {
    const managedPython = getManagedPythonPath(buildDir);
    if (fs.existsSync(managedPython)) {
        return managedPython;
    }
    if (process.env.VESPER_PYTHON) {
        return process.env.VESPER_PYTHON;
    }
    const venvRelative = process.platform === "win32"
        ? [".venv", "Scripts", "python.exe"]
        : [".venv", "bin", "python"];
    const localCandidates = [
        path.resolve(buildDir, ...venvRelative),
        path.resolve(buildDir, "..", ...venvRelative),
    ];
    const localHit = localCandidates.find((candidate) => fs.existsSync(candidate));
    return localHit || getFallbackPythonCommand();
}
45
/**
 * Run an executable and capture its output. Resolves with
 * { code, stdout, stderr } — including on non-zero exit and on timeout
 * (conventional code 124) — and rejects only when the process cannot be
 * spawned at all. PYTHONIOENCODING is forced to utf-8 so Python child
 * output decodes consistently.
 */
function runPythonCommand(pythonPath, args, timeoutMs = 300000) {
    return new Promise((resolve, reject) => {
        const child = spawn(pythonPath, args, {
            env: {
                ...process.env,
                PYTHONIOENCODING: "utf-8",
            },
        });
        let capturedOut = "";
        let capturedErr = "";
        // Kill the child and settle with code 124 if it outlives its budget.
        const watchdog = setTimeout(() => {
            child.kill();
            resolve({ code: 124, stdout: capturedOut, stderr: capturedErr || `Python command timed out after ${timeoutMs}ms` });
        }, timeoutMs);
        child.stdout.on("data", (chunk) => {
            capturedOut += chunk.toString();
        });
        child.stderr.on("data", (chunk) => {
            capturedErr += chunk.toString();
        });
        child.on("close", (exitCode) => {
            clearTimeout(watchdog);
            resolve({ code: exitCode ?? 1, stdout: capturedOut, stderr: capturedErr });
        });
        child.on("error", (error) => {
            clearTimeout(watchdog);
            reject(error);
        });
    });
}
75
/**
 * Create the managed virtualenv under ~/.vesper/.venv (idempotent: returns
 * immediately if its interpreter already exists), then upgrade pip inside
 * it. Tries platform-appropriate bootstrap interpreters in order and
 * throws with the last captured error when every attempt fails.
 *
 * @param {string} buildDir - Base directory used for home-dir fallback.
 * @returns {Promise<string>} Path to the managed interpreter.
 * @throws {Error} When no bootstrap interpreter can create the venv.
 */
async function createManagedPythonEnv(buildDir) {
    const dataRoot = getVesperDataRoot(buildDir);
    const venvDir = path.join(dataRoot, ".venv");
    const managedPython = getManagedPythonPath(buildDir);
    if (fs.existsSync(managedPython)) {
        return managedPython;
    }
    fs.mkdirSync(dataRoot, { recursive: true });
    // Windows prefers the "py -3" launcher; POSIX prefers python3.
    const bootstrapAttempts = process.platform === "win32"
        ? [
            { command: "py", args: ["-3", "-m", "venv", venvDir] },
            { command: "python", args: ["-m", "venv", venvDir] },
        ]
        : [
            { command: "python3", args: ["-m", "venv", venvDir] },
            { command: "python", args: ["-m", "venv", venvDir] },
        ];
    let lastError = "";
    for (const attempt of bootstrapAttempts) {
        try {
            // 3-minute budget for venv creation.
            const result = await runPythonCommand(attempt.command, attempt.args, 180000);
            // Require both a clean exit AND the interpreter actually existing.
            if (result.code === 0 && fs.existsSync(managedPython)) {
                // Best effort pip upgrade; its result is intentionally ignored.
                await runPythonCommand(managedPython, ["-m", "pip", "install", "--disable-pip-version-check", "--upgrade", "pip"], 300000);
                return managedPython;
            }
            lastError = (result.stderr || result.stdout || "Unknown venv creation error").trim();
        }
        catch (error) {
            // spawn failure (e.g. interpreter not installed); try the next one.
            lastError = error?.message || String(error);
        }
    }
    throw new Error(`Failed to create Vesper Python environment. ${lastError}`.trim());
}
108
/**
 * Ensure the given Python requirements are importable, installing any
 * missing packages with pip. Prefers the managed ~/.vesper interpreter;
 * if it cannot be created, silently falls back to whatever interpreter
 * resolvePythonCommand finds.
 *
 * @param {string} buildDir - Base directory used to locate interpreters.
 * @param {{module: string, packageName: string}[]} requirements - Importable
 *   module name plus the pip package that provides it.
 * @returns {Promise<string>} The interpreter path/command that now has the packages.
 * @throws {Error} When the pip install of missing packages fails.
 */
export async function ensurePythonPackages(buildDir, requirements) {
    const pythonPath = await createManagedPythonEnv(buildDir).catch(() => resolvePythonCommand(buildDir));
    const missing = [];
    for (const requirement of requirements) {
        // Probe with importlib: exit 0 = module present, 1 = missing.
        const check = await runPythonCommand(pythonPath, [
            "-c",
            `import importlib.util,sys; sys.exit(0 if importlib.util.find_spec(${JSON.stringify(requirement.module)}) else 1)`
        ], 20000);
        if (check.code !== 0) {
            missing.push(requirement);
        }
    }
    if (missing.length === 0) {
        return pythonPath;
    }
    // One pip invocation for all missing packages (deduped), 10-minute budget.
    const packages = [...new Set(missing.map(requirement => requirement.packageName))];
    const install = await runPythonCommand(pythonPath, ["-m", "pip", "install", "--disable-pip-version-check", ...packages], 600000);
    if (install.code !== 0) {
        const details = (install.stderr || install.stdout || "Unknown pip install error").trim();
        throw new Error(`Failed to install Python packages (${packages.join(", ")}). ${details}`);
    }
    return pythonPath;
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "vesper-wizard",
3
- "version": "2.1.5",
3
+ "version": "2.2.0",
4
4
  "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
5
5
  "type": "module",
6
6
  "main": "build/index.js",