vesper-wizard 2.1.4 → 2.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,12 @@
1
1
  import { spawn } from "child_process";
2
2
  import path from "path";
3
3
  import fs from "fs";
4
+ import { ensurePythonPackages, resolvePythonCommand } from "../utils/python-runtime.js";
4
5
  export class DataExporter {
5
- pythonPath = "python";
6
+ buildDir;
6
7
  scriptPath;
7
8
  constructor(buildDir = process.cwd()) {
9
+ this.buildDir = buildDir;
8
10
  const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
9
11
  const dataRoot = path.join(homeDir, ".vesper");
10
12
  const scriptPath0 = path.resolve(dataRoot, "python", "export_engine.py");
@@ -26,27 +28,38 @@ export class DataExporter {
26
28
  else {
27
29
  this.scriptPath = scriptPath0;
28
30
  }
29
- // Detect Python command
30
- if (process.platform === "win32") {
31
- this.pythonPath = "py";
32
- }
33
31
  }
34
32
  /**
35
33
  * Exports a dataset file to a specified format
36
34
  */
37
35
  async export(inputFile, outputFile, format, options = {}) {
36
+ const pythonRequirements = [
37
+ { module: "polars", packageName: "polars" },
38
+ ];
39
+ if (format === "feather") {
40
+ pythonRequirements.push({ module: "pyarrow", packageName: "pyarrow" });
41
+ }
42
+ if (format === "tfrecord") {
43
+ pythonRequirements.push({ module: "tensorflow", packageName: "tensorflow" });
44
+ }
45
+ const pythonPath = await ensurePythonPackages(this.buildDir, pythonRequirements).catch(() => resolvePythonCommand(this.buildDir));
38
46
  return new Promise((resolve, reject) => {
39
47
  if (!fs.existsSync(inputFile)) {
40
48
  reject(new Error(`Input file not found: ${inputFile}`));
41
49
  return;
42
50
  }
43
51
  const args = [this.scriptPath, inputFile, outputFile, format, JSON.stringify(options)];
44
- const process = spawn(this.pythonPath, args);
52
+ const childProcess = spawn(pythonPath, args, {
53
+ env: {
54
+ ...process.env,
55
+ PYTHONIOENCODING: "utf-8",
56
+ },
57
+ });
45
58
  let stdout = "";
46
59
  let stderr = "";
47
- process.stdout.on("data", (data) => stdout += data.toString());
48
- process.stderr.on("data", (data) => stderr += data.toString());
49
- process.on("close", (code) => {
60
+ childProcess.stdout.on("data", (data) => stdout += data.toString());
61
+ childProcess.stderr.on("data", (data) => stderr += data.toString());
62
+ childProcess.on("close", (code) => {
50
63
  if (code !== 0) {
51
64
  reject(new Error(`Export failed: ${stderr || stdout}`));
52
65
  return;
@@ -3,6 +3,7 @@ import path from "path";
3
3
  import http from "http";
4
4
  import https from "https";
5
5
  import { HuggingFaceScraper } from "../metadata/scraper.js";
6
+ import { analyzeDatasetQuery } from "../search/query-intent.js";
6
7
  export class UnifiedDatasetGateway {
7
8
  deps;
8
9
  constructor(deps) {
@@ -236,7 +237,7 @@ export class UnifiedDatasetGateway {
236
237
  async discoverFromSource(source, query, limit) {
237
238
  switch (source) {
238
239
  case "huggingface":
239
- return await new HuggingFaceScraper().scrape(limit, true, query);
240
+ return await new HuggingFaceScraper().scrape(limit, true, await analyzeDatasetQuery(query));
240
241
  case "openml":
241
242
  return await this.deps.openmlSource.discover(query, limit);
242
243
  case "kaggle":
package/build/index.js CHANGED
@@ -361,6 +361,21 @@ function extractRequestedRows(query, requirements) {
361
361
  if (Number.isFinite(n) && n > 0)
362
362
  return n;
363
363
  }
364
+ const commaNumbers = [...text.matchAll(/\b\d{1,3}(?:,\d{3})+\b/g)]
365
+ .map(m => Number(m[0].replace(/,/g, "")))
366
+ .filter(n => Number.isFinite(n) && n > 0);
367
+ if (commaNumbers.length > 0)
368
+ return Math.max(...commaNumbers);
369
+ const humanSized = [...text.matchAll(/\b(\d+(?:\.\d+)?)\s*([kmb])\b/gi)]
370
+ .map(m => {
371
+ const base = Number(m[1]);
372
+ const suffix = m[2].toLowerCase();
373
+ const multiplier = suffix === "k" ? 1_000 : suffix === "m" ? 1_000_000 : 1_000_000_000;
374
+ return Math.round(base * multiplier);
375
+ })
376
+ .filter(n => Number.isFinite(n) && n > 0);
377
+ if (humanSized.length > 0)
378
+ return Math.max(...humanSized);
364
379
  const allNums = [...text.matchAll(/\b\d{4,9}\b/g)]
365
380
  .map(m => Number(m[0]))
366
381
  .filter(n => Number.isFinite(n) && n > 0);
@@ -644,7 +659,7 @@ jobManager.on("processJob", async (job, execute) => {
644
659
  console.error(`[Vesper] Starting prepareDatasetTask for job ${job.id}...`);
645
660
  const metadata = job.metadata ? JSON.parse(job.metadata) : {};
646
661
  switch (job.type) {
647
- case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements);
662
+ case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements, metadata.outputDir);
648
663
  case "clean": return await handleCleanJob(job.id, metadata.datasetId, metadata.ops);
649
664
  default: throw new Error(`Unhandled job type: ${job.type}`);
650
665
  }
@@ -662,7 +677,7 @@ jobManager.on("processJob", async (job, execute) => {
662
677
  /**
663
678
  * Logic for preparing a dataset (Search + Ingest + Process)
664
679
  */
665
- async function handlePrepareJob(jobId, query, requirements) {
680
+ async function handlePrepareJob(jobId, query, requirements, outputDir) {
666
681
  hydrateExternalKeys();
667
682
  const update = (updates) => jobManager.updateJob(jobId, updates);
668
683
  const pipelineSteps = ["search", "validate", "download", "normalize", "quality", "register"];
@@ -689,6 +704,7 @@ async function handlePrepareJob(jobId, query, requirements) {
689
704
  // Continue anyway - direct file downloads may still work without datasets lib
690
705
  }
691
706
  const requestedRows = extractRequestedRows(query, requirements);
707
+ const searchQuery = requirements ? `${query} ${requirements}` : query;
692
708
  let selectedDataset;
693
709
  let datasetIdForDownload = "";
694
710
  let source;
@@ -729,7 +745,7 @@ async function handlePrepareJob(jobId, query, requirements) {
729
745
  else {
730
746
  markPipelineStep("search", "running");
731
747
  update({ progress: 10, status_text: "Searching for best dataset matching query..." });
732
- const results = await searchEngine.search(query, { limit: 10 });
748
+ const results = await searchEngine.search(searchQuery, { limit: 10 });
733
749
  if (results.length === 0) {
734
750
  markPipelineStep("search", "failed");
735
751
  throw new Error("No datasets found matching the query. Try refining your search terms.");
@@ -777,7 +793,7 @@ async function handlePrepareJob(jobId, query, requirements) {
777
793
  let currentRows = await countRows(rawFilePath);
778
794
  if (currentRows < requestedRows) {
779
795
  update({ progress: 64, status_text: `Only ${currentRows.toLocaleString()} rows found. Fetching more matching datasets...` });
780
- const additional = await searchEngine.search(query, { limit: 8 });
796
+ const additional = await searchEngine.search(searchQuery, { limit: 8 });
781
797
  const sourceFiles = [rawFilePath];
782
798
  let totalRows = currentRows;
783
799
  for (const ds of additional) {
@@ -882,7 +898,7 @@ async function handlePrepareJob(jobId, query, requirements) {
882
898
  }
883
899
  markPipelineStep("register", "running");
884
900
  update({ progress: 85, status_text: "Installing dataset into project..." });
885
- const installPath = await installService.install(datasetIdForDownload, rawFilePath);
901
+ const installPath = await installService.install(datasetIdForDownload, rawFilePath, outputDir);
886
902
  update({ progress: 100, status_text: "Preparation complete!" });
887
903
  // Register prepared dataset in local registry for lookup by export/list tools
888
904
  try {
@@ -1013,7 +1029,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1013
1029
  },
1014
1030
  target_dir: {
1015
1031
  type: "string",
1016
- description: "Optional output directory for operation='download'.",
1032
+ description: "Optional output directory for operation='download'. Defaults to the current working directory when omitted.",
1033
+ },
1034
+ output_dir: {
1035
+ type: "string",
1036
+ description: "Alias for target_dir. Defaults to the current working directory when omitted.",
1017
1037
  },
1018
1038
  public_only: {
1019
1039
  type: "boolean",
@@ -1052,7 +1072,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1052
1072
  },
1053
1073
  {
1054
1074
  name: "download_dataset",
1055
- description: "Download a dataset by source and ID/slug into local Vesper storage. Defaults to HuggingFace. Public providers work keylessly, while Kaggle and data.world can use server-managed credentials when configured.",
1075
+ description: "Download a dataset by source and ID/slug into a local directory. Defaults to HuggingFace. Public providers work keylessly, while Kaggle and data.world can use server-managed credentials when configured.",
1056
1076
  inputSchema: {
1057
1077
  type: "object",
1058
1078
  properties: {
@@ -1067,7 +1087,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1067
1087
  },
1068
1088
  target_dir: {
1069
1089
  type: "string",
1070
- description: "Optional target directory for downloaded files.",
1090
+ description: "Optional target directory for downloaded files. Defaults to the current working directory when omitted.",
1091
+ },
1092
+ output_dir: {
1093
+ type: "string",
1094
+ description: "Alias for target_dir. Defaults to the current working directory when omitted.",
1071
1095
  }
1072
1096
  },
1073
1097
  required: ["dataset_id"],
@@ -1194,6 +1218,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1194
1218
  properties: {
1195
1219
  query: { type: "string" },
1196
1220
  requirements: { type: "string" },
1221
+ target_dir: { type: "string", description: "Optional local directory for the prepared dataset. Defaults to the current working directory when omitted." },
1222
+ output_dir: { type: "string", description: "Alias for target_dir. Defaults to the current working directory when omitted." },
1197
1223
  download_images: { type: "boolean", description: "When true, enables post-prepare smart asset downloading for image/media datasets." },
1198
1224
  cleaning_options: { type: "object" },
1199
1225
  split_config: { type: "object" },
@@ -1238,7 +1264,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1238
1264
  },
1239
1265
  target_dir: {
1240
1266
  type: "string",
1241
- description: "Optional custom local directory for export (e.g., './naruto-quotes').",
1267
+ description: "Optional custom local directory for export. Defaults to the current working directory when omitted.",
1268
+ },
1269
+ output_dir: {
1270
+ type: "string",
1271
+ description: "Alias for target_dir. Defaults to the current working directory when omitted.",
1242
1272
  },
1243
1273
  format: {
1244
1274
  type: "string",
@@ -1425,7 +1455,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1425
1455
  if (tool === "vesper_export" && req === "split") {
1426
1456
  // Auto-trigger prepare_dataset (start a background prepare job)
1427
1457
  try {
1428
- jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false });
1458
+ jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false, outputDir: process.cwd() });
1429
1459
  // Mark split as complete so export can proceed; export handler will also wait for data if needed.
1430
1460
  markStepComplete(String(datasetId), "split");
1431
1461
  }
@@ -1481,6 +1511,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1481
1511
  if (!datasetId) {
1482
1512
  throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='download'");
1483
1513
  }
1514
+ const requestedTargetDir = request.params.arguments?.target_dir
1515
+ ? String(request.params.arguments.target_dir).trim()
1516
+ : request.params.arguments?.output_dir
1517
+ ? String(request.params.arguments.output_dir).trim()
1518
+ : "";
1519
+ const targetDir = requestedTargetDir || process.cwd();
1484
1520
  try {
1485
1521
  await ensurePythonModules([{ module: "datasets", packageName: "datasets" }]);
1486
1522
  }
@@ -1490,7 +1526,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1490
1526
  const result = await unifiedDatasetGateway.download({
1491
1527
  datasetId,
1492
1528
  source,
1493
- targetDir: request.params.arguments?.target_dir ? String(request.params.arguments.target_dir) : undefined,
1529
+ targetDir,
1494
1530
  });
1495
1531
  try {
1496
1532
  upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
@@ -1597,7 +1633,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1597
1633
  hydrateExternalKeys();
1598
1634
  const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1599
1635
  const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1600
- const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments.target_dir) : undefined;
1636
+ const requestedTargetDir = request.params.arguments?.target_dir
1637
+ ? String(request.params.arguments.target_dir).trim()
1638
+ : request.params.arguments?.output_dir
1639
+ ? String(request.params.arguments.output_dir).trim()
1640
+ : "";
1641
+ const targetDir = requestedTargetDir || process.cwd();
1601
1642
  if (!datasetId) {
1602
1643
  throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1603
1644
  }
@@ -1975,10 +2016,16 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1975
2016
  const query = String(request.params.arguments?.query);
1976
2017
  const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
1977
2018
  const downloadImages = request.params.arguments?.download_images === true;
2019
+ const requestedOutputDir = request.params.arguments?.target_dir
2020
+ ? String(request.params.arguments.target_dir).trim()
2021
+ : request.params.arguments?.output_dir
2022
+ ? String(request.params.arguments.output_dir).trim()
2023
+ : "";
2024
+ const outputDir = requestedOutputDir || process.cwd();
1978
2025
  if (!query || query === "undefined") {
1979
2026
  throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
1980
2027
  }
1981
- const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
2028
+ const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages, outputDir });
1982
2029
  return {
1983
2030
  content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
1984
2031
  };
@@ -2019,7 +2066,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2019
2066
  }
2020
2067
  case "export_dataset": {
2021
2068
  const datasetId = String(request.params.arguments?.dataset_id);
2022
- const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments?.target_dir) : undefined;
2069
+ const requestedTargetDir = request.params.arguments?.target_dir
2070
+ ? String(request.params.arguments?.target_dir).trim()
2071
+ : request.params.arguments?.output_dir
2072
+ ? String(request.params.arguments?.output_dir).trim()
2073
+ : "";
2074
+ const targetDir = requestedTargetDir || process.cwd();
2023
2075
  const requestedFormat = String(request.params.arguments?.format || "feather");
2024
2076
  const fastMode = request.params.arguments?.fast === true;
2025
2077
  const preview = request.params.arguments?.preview === true;
@@ -2032,7 +2084,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2032
2084
  console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
2033
2085
  // Start a prepare job for this dataset id (acts like calling prepare_dataset)
2034
2086
  try {
2035
- jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false });
2087
+ jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false, outputDir: process.cwd() });
2036
2088
  }
2037
2089
  catch (e) {
2038
2090
  console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
@@ -3,12 +3,18 @@ import { categorizeLicense } from "./license.js";
3
3
  import { calculateQualityScore } from "./quality.js";
4
4
  import { classifyDomain } from "./domain.js";
5
5
  import { retryWithBackoff, delayBetweenRequests } from "./rate-limiter.js";
6
+ import { analyzeDatasetQuery, buildIntentSearchQuery, scoreDatasetAgainstIntent } from "../search/query-intent.js";
6
7
  export class HuggingFaceScraper {
7
8
  /**
8
9
  * Bulk discovery: Fetch many datasets quickly without deep details.
9
10
  * Hits the 25k target in minutes.
10
11
  */
11
- async scrapeBulk(limit = 1000, query) {
12
+ async scrapeBulk(limit = 1000, queryOrIntent) {
13
+ const intent = typeof queryOrIntent === "string"
14
+ ? await analyzeDatasetQuery(queryOrIntent)
15
+ : queryOrIntent;
16
+ const query = typeof queryOrIntent === "string" ? queryOrIntent : intent?.searchQuery;
17
+ const hfQuery = intent ? buildIntentSearchQuery(intent) : query;
12
18
  const filterMsg = query ? `, query: ${query}` : "";
13
19
  console.error(`[Bulk Scraper] Fetching datasets (target limit: ${limit}${filterMsg})...`);
14
20
  const results = [];
@@ -18,7 +24,7 @@ export class HuggingFaceScraper {
18
24
  for await (const ds of listDatasets({
19
25
  limit: limit,
20
26
  additionalFields: ["description", "tags", "downloadsAllTime", "createdAt"],
21
- search: { query: query },
27
+ search: { query: hfQuery },
22
28
  ...(hfToken ? { accessToken: hfToken } : {})
23
29
  })) {
24
30
  if (results.length >= limit)
@@ -86,8 +92,12 @@ export class HuggingFaceScraper {
86
92
  }
87
93
  return results;
88
94
  }
89
- async scrape(limit = 100, applyMVPFilters = true, query // Use as general search query
90
- ) {
95
+ async scrape(limit = 100, applyMVPFilters = true, queryOrIntent) {
96
+ const intent = typeof queryOrIntent === "string"
97
+ ? await analyzeDatasetQuery(queryOrIntent)
98
+ : queryOrIntent;
99
+ const query = typeof queryOrIntent === "string" ? queryOrIntent : intent?.searchQuery;
100
+ const hfQuery = intent ? buildIntentSearchQuery(intent) : query;
91
101
  const filterMsg = query ? `, query: ${query}` : "";
92
102
  console.error(`Fetching datasets (target limit: ${limit}, MVP filters: ${applyMVPFilters}${filterMsg})...`);
93
103
  const results = [];
@@ -113,7 +123,7 @@ export class HuggingFaceScraper {
113
123
  for await (const ds of listDatasets({
114
124
  limit: fetchLimit,
115
125
  additionalFields: ["description", "tags"],
116
- search: { query: query },
126
+ search: { query: hfQuery },
117
127
  ...(hfToken ? { accessToken: hfToken } : {})
118
128
  })) {
119
129
  if (results.length >= limit)
@@ -290,6 +300,9 @@ export class HuggingFaceScraper {
290
300
  description_length: description.length,
291
301
  has_readme: !!(cardData.readme || cardData.readme_content)
292
302
  };
303
+ if (intent) {
304
+ metadata.intent_score = scoreDatasetAgainstIntent(metadata, intent);
305
+ }
293
306
  results.push(metadata);
294
307
  }
295
308
  catch (e) {
@@ -340,8 +353,12 @@ export class HuggingFaceScraper {
340
353
  if (otherErrors > 0) {
341
354
  console.error(`[HF Scraper] āš ļø ${otherErrors} datasets skipped due to errors`);
342
355
  }
343
- // Sort by downloads descending
344
- return results.sort((a, b) => b.downloads - a.downloads);
356
+ return results.sort((a, b) => {
357
+ const intentDelta = Number(b.intent_score || 0) - Number(a.intent_score || 0);
358
+ if (intentDelta !== 0)
359
+ return intentDelta;
360
+ return b.downloads - a.downloads;
361
+ });
345
362
  }
346
363
  extractTask(tags) {
347
364
  const taskTags = [
@@ -1,4 +1,5 @@
1
1
  import { JITOrchestrator } from "./jit-orchestrator.js";
2
+ import { analyzeDatasetQuery, scoreDatasetAgainstIntent } from "./query-intent.js";
2
3
  import fs from "fs";
3
4
  function log(msg) {
4
5
  fs.appendFileSync("debug.log", new Date().toISOString() + " " + msg + "\n");
@@ -17,9 +18,10 @@ export class SearchEngine {
17
18
  async search(query, options = {}) {
18
19
  const limit = options.limit || 5;
19
20
  const enableJIT = options.enableJIT !== false; // Default: true
21
+ const intent = await analyzeDatasetQuery(query);
20
22
  log(`Search request: "${query}" Limit=${limit} Safe=${options.safeOnly} JIT=${enableJIT}`);
21
23
  // 1. Perform local search
22
- const localResults = await this.localSearch(query, options);
24
+ const localResults = await this.localSearch(query, options, intent);
23
25
  // 2. Check if JIT should be triggered
24
26
  const shouldTrigger = enableJIT && this.shouldTriggerJIT(localResults, query);
25
27
  if (!shouldTrigger) {
@@ -28,10 +30,10 @@ export class SearchEngine {
28
30
  }
29
31
  // 3. Trigger JIT fallback
30
32
  console.error(`\nWARNING: Low confidence results (${localResults.length} found, top score: ${localResults[0]?.relevance_score || 0})`);
31
- await this.jitOrchestrator.fetchAndIngest(query, 10);
33
+ await this.jitOrchestrator.fetchAndIngest(query, 10, intent);
32
34
  // 4. Re-run local search with updated index
33
35
  console.error(`Re-searching with updated library...`);
34
- const enhancedResults = await this.localSearch(query, options);
36
+ const enhancedResults = await this.localSearch(query, options, intent);
35
37
  const newCount = enhancedResults.length - localResults.length;
36
38
  if (newCount > 0) {
37
39
  console.error(`Found ${newCount} additional results\n`);
@@ -41,7 +43,7 @@ export class SearchEngine {
41
43
  /**
42
44
  * Perform hybrid search (Vector + Lexical + Penalties)
43
45
  */
44
- async localSearch(query, options) {
46
+ async localSearch(query, options, intent) {
45
47
  const limit = options.limit || 5;
46
48
  // 1. Parse Query
47
49
  const words = query.toLowerCase().split(/\s+/);
@@ -136,11 +138,13 @@ export class SearchEngine {
136
138
  bonus = sourceBonuses[metadata.source] || 0;
137
139
  // Final Combined Score
138
140
  // 70% Vector, 30% Lexical, minus Penalties, plus Bonuses
139
- const finalScore = (vectorScore * 0.7) + (lexicalScore * 0.3) - penalty + bonus;
141
+ const intentScore = scoreDatasetAgainstIntent(metadata, intent);
142
+ const finalScore = (vectorScore * 0.7) + (lexicalScore * 0.3) - penalty + bonus + intentScore;
140
143
  metadata.relevance_score = Math.round(finalScore * 100) / 100;
141
144
  metadata.vector_score = Math.round(vectorScore * 100) / 100;
142
145
  metadata.lexical_score = Math.round(lexicalScore * 100) / 100;
143
146
  metadata.accessibility_bonus = bonus;
147
+ metadata.intent_score = intentScore;
144
148
  results.push(metadata);
145
149
  }
146
150
  // Sort by final score and limit
@@ -2,6 +2,7 @@ import { HuggingFaceScraper } from "../metadata/scraper.js";
2
2
  import { UCIScraper } from "../metadata/uci-scraper.js";
3
3
  import { GitHubScraper } from "../metadata/github-scraper.js";
4
4
  import { WorldBankScraper, NASAScraper } from "../metadata/institutional-scrapers.js";
5
+ import { analyzeDatasetQuery, buildIntentSearchQuery } from "./query-intent.js";
5
6
  // Common stop words to filter out for better search
6
7
  const STOP_WORDS = new Set([
7
8
  "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
@@ -61,7 +62,7 @@ export class JITOrchestrator {
61
62
  /**
62
63
  * Main JIT workflow: fetch, save, index, return new datasets
63
64
  */
64
- async fetchAndIngest(query, limit = 10) {
65
+ async fetchAndIngest(query, limit = 10, providedIntent) {
65
66
  // Rate limiting check
66
67
  if (!this.canTrigger(query)) {
67
68
  console.error(`[JIT] Query "${query}" was searched recently. Waiting...`);
@@ -69,9 +70,12 @@ export class JITOrchestrator {
69
70
  }
70
71
  console.error(`\n[JIT] Searching live sources for: "${query}"`);
71
72
  this.lastTriggerTime.set(query, Date.now());
72
- // Simplify query for better API results
73
- const keywords = this.simplifyQuery(query);
74
- if (keywords.length > 0) {
73
+ const intent = providedIntent || await analyzeDatasetQuery(query);
74
+ const keywords = this.simplifyQuery(buildIntentSearchQuery(intent));
75
+ if (intent.llmBacked || intent.language || intent.task || intent.domain || intent.minRows) {
76
+ console.error(`[JIT] Intent: ${JSON.stringify({ language: intent.language, task: intent.task, domain: intent.domain, minRows: intent.minRows, searchQuery: intent.searchQuery })}`);
77
+ }
78
+ else if (keywords.length > 0) {
75
79
  console.error(`[JIT] Keywords extracted: ${keywords.join(", ")}`);
76
80
  }
77
81
  const newDatasets = [];
@@ -81,15 +85,16 @@ export class JITOrchestrator {
81
85
  // Get existing dataset IDs to avoid duplicates
82
86
  const existing = this.metadataStore.getAllDatasets();
83
87
  existing.forEach(ds => existingIds.add(ds.id));
84
- // 1. Scrape HuggingFace - try each keyword separately for better results
85
- let hfResults = [];
86
- for (const keyword of keywords) {
87
- if (hfResults.length >= limit)
88
- break;
89
- const results = await this.scrapeHuggingFace(keyword, Math.ceil(limit / keywords.length));
90
- for (const ds of results) {
91
- if (!hfResults.some(existing => existing.id === ds.id)) {
92
- hfResults.push(ds);
88
+ let hfResults = await this.scrapeHuggingFace(intent, limit);
89
+ if (hfResults.length < Math.max(3, Math.floor(limit / 2))) {
90
+ for (const keyword of keywords) {
91
+ if (hfResults.length >= limit)
92
+ break;
93
+ const results = await this.scrapeHuggingFace(keyword, Math.ceil(limit / Math.max(keywords.length, 1)));
94
+ for (const ds of results) {
95
+ if (!hfResults.some(existing => existing.id === ds.id)) {
96
+ hfResults.push(ds);
97
+ }
93
98
  }
94
99
  }
95
100
  }
@@ -170,7 +175,6 @@ export class JITOrchestrator {
170
175
  async scrapeHuggingFace(query, limit) {
171
176
  const scraper = new HuggingFaceScraper();
172
177
  try {
173
- // Pass the query as a general search term
174
178
  return await scraper.scrape(limit, true, query);
175
179
  }
176
180
  catch (error) {
@@ -0,0 +1,409 @@
1
+ import { classifyDomain } from "../metadata/domain.js";
2
// Common English stop words plus dataset-search filler verbs/nouns
// ("dataset", "find", "show me", ...) that carry no signal when extracting
// search keywords from a user query.
const STOP_WORDS = new Set([
    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
    "of", "with", "by", "from", "as", "is", "was", "are", "were", "been",
    "be", "have", "has", "had", "do", "does", "did", "will", "would",
    "could", "should", "may", "might", "must", "shall", "can", "need",
    "about", "into", "through", "during", "before", "after", "above",
    "below", "between", "under", "again", "further", "then", "once",
    "here", "there", "when", "where", "why", "how", "all", "each",
    "few", "more", "most", "other", "some", "such", "no", "nor", "not",
    "only", "own", "same", "so", "than", "too", "very", "just", "also",
    "dataset", "datasets", "data", "find", "search", "looking", "need", "want",
    "give", "show", "me", "please"
]);
// Canonical language name -> aliases (ISO 639-1/639-3 codes and synonyms).
// Used both for detecting a language mention in a query and for matching
// dataset language metadata.
const LANGUAGE_ALIASES = {
    english: ["english", "en", "eng"],
    spanish: ["spanish", "es", "spa"],
    french: ["french", "fr", "fra"],
    german: ["german", "de", "deu"],
    portuguese: ["portuguese", "pt", "por"],
    chinese: ["chinese", "zh", "cmn"],
    japanese: ["japanese", "ja", "jpn"],
    korean: ["korean", "ko", "kor"],
    arabic: ["arabic", "ar", "ara"],
    russian: ["russian", "ru", "rus"],
    hindi: ["hindi", "hi", "hin"],
    multilingual: ["multilingual", "bilingual", "cross-lingual", "crosslingual"],
};
// Ordered task detectors: the first entry with a matching pattern wins, so
// more specific tasks should stay ahead of generic ones in this list.
const TASK_PATTERNS = [
    { task: "translation", patterns: [/\btranslation\b/i, /\bmachine translation\b/i, /\bparallel corpus\b/i] },
    { task: "question-answering", patterns: [/\bquestion answering\b/i, /\bqa\b/i, /\bq&a\b/i] },
    { task: "summarization", patterns: [/\bsummarization\b/i, /\bsummary\b/i, /\btl;dr\b/i] },
    { task: "sentiment-analysis", patterns: [/\bsentiment\b/i, /\bsentiment analysis\b/i] },
    { task: "text-classification", patterns: [/\bclassification\b/i, /\bclassifier\b/i, /\btext classification\b/i] },
    { task: "token-classification", patterns: [/\bner\b/i, /\bnamed entity\b/i, /\btoken classification\b/i] },
    { task: "text-generation", patterns: [/\btext generation\b/i, /\bgenerative\b/i, /\binstruction\b/i, /\bchat\b/i] },
    { task: "image-classification", patterns: [/\bimage classification\b/i] },
    { task: "object-detection", patterns: [/\bobject detection\b/i, /\bdetection\b/i] },
];
// Memoizes intent-analysis promises keyed by `${query}::${requirements}` so
// concurrent identical queries share one analysis (including the LLM call).
const intentCache = new Map();
41
// Upper bound on memoized intent promises; oldest entries are evicted first.
const INTENT_CACHE_LIMIT = 200;
/**
 * Analyzes a free-text dataset query (plus optional requirements string)
 * into a structured search intent. The in-flight promise is memoized so
 * concurrent identical queries share a single heuristic + LLM analysis.
 *
 * @param {string} query - User's dataset search query.
 * @param {string} [requirements] - Optional extra requirements text.
 * @returns {Promise<object>} The resolved intent (heuristic, optionally
 *   merged with LLM-extracted fields).
 */
export async function analyzeDatasetQuery(query, requirements) {
    const cacheKey = `${query || ""}::${requirements || ""}`;
    const cached = intentCache.get(cacheKey);
    if (cached) {
        return cached;
    }
    const task = (async () => {
        const heuristic = buildHeuristicIntent(query, requirements);
        const llmIntent = await tryLlmIntent(heuristic, requirements);
        return llmIntent ? mergeIntent(heuristic, llmIntent) : heuristic;
    })();
    // Fix: drop rejected promises from the cache so one transient failure is
    // not replayed to every future caller of the same query.
    task.catch(() => {
        intentCache.delete(cacheKey);
    });
    // Fix: bound the cache so a long-running server cannot grow it without
    // limit; Map preserves insertion order, so the first key is the oldest.
    if (intentCache.size >= INTENT_CACHE_LIMIT) {
        const oldestKey = intentCache.keys().next().value;
        intentCache.delete(oldestKey);
    }
    intentCache.set(cacheKey, task);
    return task;
}
55
/**
 * Scores how well a dataset record matches a parsed search intent. Positive
 * scores indicate a match; missed hard requirements (wrong language, far too
 * few rows) apply heavy penalties. Returns 0 when no intent is provided.
 *
 * @param {object} dataset - Dataset metadata (name, description, task, tags,
 *   languages, total_examples, domain).
 * @param {object|undefined} intent - Parsed intent from analyzeDatasetQuery.
 * @returns {number} Score rounded to two decimals (may be negative).
 */
export function scoreDatasetAgainstIntent(dataset, intent) {
    if (!intent)
        return 0;
    // Flatten all searchable dataset fields into one lowercase haystack.
    const text = [
        dataset.name,
        dataset.description,
        dataset.task,
        dataset.domain || "",
        dataset.tags.join(" "),
        dataset.languages.join(" "),
    ].join(" ").toLowerCase();
    let score = 0;
    if (intent.language) {
        const aliases = getLanguageAliases(intent.language);
        const datasetLanguages = dataset.languages.map(normalizeToken);
        const languageMatch = aliases.some(alias => datasetLanguages.includes(alias) || text.includes(alias));
        if (languageMatch) {
            score += 0.45;
        }
        else if (dataset.languages.length > 0) {
            // Dataset declares languages that exclude the requested one:
            // strong penalty.
            score -= 0.55;
        }
        else {
            // No language metadata at all: mild penalty only.
            score -= 0.1;
        }
    }
    if (intent.task) {
        if (matchesTask(dataset, intent.task, text)) {
            score += 0.35;
        }
        else {
            score -= 0.3;
        }
    }
    // Domain only contributes when it is a meaningful classification.
    if (intent.domain && intent.domain !== "general" && intent.domain !== "unknown") {
        const datasetDomain = String(dataset.domain || "").toLowerCase();
        if (datasetDomain === intent.domain || text.includes(intent.domain)) {
            score += 0.25;
        }
        else {
            score -= 0.2;
        }
    }
    if (intent.minRows && intent.minRows > 0) {
        const totalExamples = Number(dataset.total_examples || 0);
        if (totalExamples > 0) {
            const ratio = totalExamples / intent.minRows;
            if (ratio >= 1) {
                // Log-scaled bonus for exceeding the requested size, capped
                // at 0.45 so size cannot dominate relevance.
                score += Math.min(0.45, 0.18 + (Math.log10(ratio + 1) * 0.15));
            }
            else if (ratio < 0.05) {
                score -= 1.2;
            }
            else if (ratio < 0.25) {
                score -= 0.8;
            }
            else if (ratio < 0.5) {
                score -= 0.45;
            }
            else {
                score -= 0.15;
            }
        }
        else {
            // Unknown size: small penalty rather than disqualification.
            score -= 0.08;
        }
    }
    if (intent.positiveTerms.length > 0) {
        // Small additive bonus per matched keyword, capped at 0.25.
        const matches = intent.positiveTerms.filter(term => text.includes(term)).length;
        score += Math.min(0.25, matches * 0.06);
    }
    if (intent.negativeTerms.some(term => text.includes(term))) {
        score -= 0.7;
    }
    // Round to two decimals for stable display/sorting.
    return Math.round(score * 100) / 100;
}
131
/**
 * Returns the precomputed search string carried by a dataset intent.
 */
export function buildIntentSearchQuery(intent) {
    const { searchQuery } = intent;
    return searchQuery;
}
134
/**
 * Builds a search intent from the query text alone using regex/keyword
 * heuristics (no LLM involved). Always succeeds; fields that cannot be
 * inferred are left undefined.
 *
 * @param {string} query - User's dataset search query.
 * @param {string} [requirements] - Optional requirements text appended to it.
 * @returns {object} Heuristic intent with llmBacked=false.
 */
function buildHeuristicIntent(query, requirements) {
    const originalQuery = `${query || ""} ${requirements || ""}`.trim();
    const normalizedQuery = originalQuery.toLowerCase();
    // Terms prefixed with "-" (e.g. "-synthetic") are treated as exclusions.
    const negativeTerms = [...normalizedQuery.matchAll(/(?:^|\s)-([\w-]{2,})/g)].map(match => normalizeToken(match[1]));
    const positiveTerms = tokenize(normalizedQuery)
        .filter(token => !negativeTerms.includes(token))
        .slice(0, 8);
    const task = detectTask(normalizedQuery);
    const language = detectLanguage(normalizedQuery);
    const domain = classifyDomain(normalizedQuery, [], normalizedQuery, task);
    const minRows = extractRequestedRows(normalizedQuery);
    // Compose a deduplicated search string: language/task/domain first,
    // then the extracted keywords.
    const searchTerms = [
        language,
        task,
        domain !== "general" && domain !== "unknown" ? domain : undefined,
        ...positiveTerms,
    ].filter((value, index, self) => !!value && self.indexOf(value) === index);
    return {
        originalQuery,
        normalizedQuery,
        // Fall back to the raw normalized query when no terms survived.
        searchQuery: searchTerms.slice(0, 6).join(" ") || normalizedQuery,
        positiveTerms,
        negativeTerms,
        language,
        task: task || undefined,
        domain,
        minRows,
        llmBacked: false,
    };
}
164
/**
 * Overlays LLM-extracted fields onto the heuristic intent. LLM values win
 * when present; positive/negative term lists are unioned and normalized.
 * The merged search query is then rebuilt from the final field values.
 *
 * @param {object} base - Heuristic intent from buildHeuristicIntent.
 * @param {object} llmIntent - Partial intent parsed from an LLM response.
 * @returns {object} Merged intent with llmBacked=true.
 */
function mergeIntent(base, llmIntent) {
    const language = llmIntent.language ? normalizeToken(llmIntent.language) : base.language;
    const task = llmIntent.task ? normalizeToken(llmIntent.task) : base.task;
    const domain = llmIntent.domain ? normalizeToken(llmIntent.domain) : base.domain;
    // Only trust a numeric, finite row count from the LLM.
    const minRows = typeof llmIntent.minRows === "number" && Number.isFinite(llmIntent.minRows)
        ? llmIntent.minRows
        : base.minRows;
    const positiveTerms = Array.from(new Set([...(llmIntent.positiveTerms || []), ...base.positiveTerms].map(normalizeToken))).filter(Boolean);
    const negativeTerms = Array.from(new Set([...(llmIntent.negativeTerms || []), ...base.negativeTerms].map(normalizeToken))).filter(Boolean);
    const merged = {
        ...base,
        language,
        task,
        domain,
        minRows,
        positiveTerms,
        negativeTerms,
        llmBacked: true,
    };
    // Rebuild the search string: language/task/domain first, then keywords,
    // deduplicated and capped at six terms; fall back to the raw query.
    merged.searchQuery = [
        merged.language,
        merged.task,
        merged.domain !== "general" && merged.domain !== "unknown" ? merged.domain : undefined,
        ...merged.positiveTerms,
    ].filter((value, index, self) => !!value && self.indexOf(value) === index).slice(0, 6).join(" ") || merged.normalizedQuery;
    return merged;
}
191
/**
 * Attempts an LLM-backed intent extraction, preferring OpenAI when its key
 * is configured and otherwise falling back to Gemini. Returns undefined
 * when no provider key is set or the provider call fails.
 */
async function tryLlmIntent(base, requirements) {
    if (process.env.OPENAI_API_KEY) {
        try {
            return await callOpenAiIntent(base, requirements);
        }
        catch {
            return undefined;
        }
    }
    const geminiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY;
    if (!geminiKey) {
        return undefined;
    }
    try {
        return await callGeminiIntent(base, requirements, geminiKey);
    }
    catch {
        return undefined;
    }
}
202
/**
 * Asks the OpenAI chat-completions API to extract structured intent fields.
 * The request is aborted after 5s; a non-2xx response yields undefined.
 * Network/abort errors propagate to the caller (tryLlmIntent catches them).
 *
 * @param {object} base - Heuristic intent, sent as context for the model.
 * @param {string} [requirements] - Optional requirements text.
 * @returns {Promise<object|undefined>} Parsed partial intent, or undefined.
 */
async function callOpenAiIntent(base, requirements) {
    const controller = new AbortController();
    // Hard 5-second budget so intent analysis never stalls a search.
    const timeout = setTimeout(() => controller.abort(), 5000);
    try {
        const response = await fetch("https://api.openai.com/v1/chat/completions", {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
                Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
            },
            body: JSON.stringify({
                model: process.env.OPENAI_MODEL || "gpt-4o-mini",
                temperature: 0,
                // Forces the model to reply with a single JSON object.
                response_format: { type: "json_object" },
                messages: [
                    {
                        role: "system",
                        content: "Extract dataset search intent as JSON with keys: language, task, domain, minRows, positiveTerms, negativeTerms. Use null for unknowns.",
                    },
                    {
                        role: "user",
                        content: JSON.stringify({ query: base.originalQuery, requirements: requirements || null, heuristic: base }),
                    },
                ],
            }),
            signal: controller.signal,
        });
        if (!response.ok) {
            return undefined;
        }
        const body = await response.json();
        const content = body?.choices?.[0]?.message?.content;
        return parseIntentPayload(content);
    }
    finally {
        // Always cancel the abort timer so it cannot fire after completion.
        clearTimeout(timeout);
    }
}
240
/**
 * Asks the Gemini generateContent API to extract structured intent fields
 * as JSON. The request is aborted after 5s; a non-2xx response yields
 * undefined. Network/abort errors propagate (tryLlmIntent catches them).
 *
 * @param {object} base - Heuristic intent, embedded in the prompt.
 * @param {string} [requirements] - Optional requirements text.
 * @param {string} apiKey - Gemini/Google API key (sent as a query param).
 * @returns {Promise<object|undefined>} Parsed partial intent, or undefined.
 */
async function callGeminiIntent(base, requirements, apiKey) {
    const controller = new AbortController();
    // Hard 5-second budget so intent analysis never stalls a search.
    const timeout = setTimeout(() => controller.abort(), 5000);
    try {
        const model = process.env.GEMINI_MODEL || "gemini-1.5-flash";
        const response = await fetch(`https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent?key=${encodeURIComponent(apiKey)}`, {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
            },
            body: JSON.stringify({
                generationConfig: {
                    temperature: 0,
                    // Forces the model to answer with raw JSON.
                    responseMimeType: "application/json",
                },
                contents: [{
                        role: "user",
                        parts: [{
                                text: `Extract dataset search intent as JSON with keys language, task, domain, minRows, positiveTerms, negativeTerms. Query payload: ${JSON.stringify({ query: base.originalQuery, requirements: requirements || null, heuristic: base })}`,
                            }],
                    }],
            }),
            signal: controller.signal,
        });
        if (!response.ok) {
            return undefined;
        }
        const body = await response.json();
        const content = body?.candidates?.[0]?.content?.parts?.[0]?.text;
        return parseIntentPayload(content);
    }
    finally {
        // Always cancel the abort timer so it cannot fire after completion.
        clearTimeout(timeout);
    }
}
275
/**
 * Parses an LLM response body into a partial intent object. Accepts both
 * camelCase and snake_case key spellings; returns undefined when the content
 * is empty, contains no JSON object, or fails to parse.
 */
function parseIntentPayload(content) {
    if (typeof content !== "string" || !content.trim()) {
        return undefined;
    }
    const jsonText = extractJsonObject(content);
    if (!jsonText) {
        return undefined;
    }
    let parsed;
    try {
        parsed = JSON.parse(jsonText);
    }
    catch {
        return undefined;
    }
    // Type guards: drop any field whose value has the wrong shape.
    const asString = (value) => (typeof value === "string" ? value : undefined);
    const asNumber = (value) => (typeof value === "number" ? value : undefined);
    const asStringArray = (value) => Array.isArray(value)
        ? value.filter((item) => typeof item === "string")
        : undefined;
    return {
        language: asString(parsed.language),
        task: asString(parsed.task),
        domain: asString(parsed.domain),
        minRows: asNumber(parsed.minRows) ?? asNumber(parsed.min_rows),
        positiveTerms: asStringArray(parsed.positiveTerms) ?? asStringArray(parsed.positive_terms),
        negativeTerms: asStringArray(parsed.negativeTerms) ?? asStringArray(parsed.negative_terms),
    };
}
/**
 * Extracts the outermost JSON-object span from free-form text (e.g. an LLM
 * reply wrapped in prose or code fences). Returns undefined when no braces
 * are found.
 */
function extractJsonObject(text) {
    const trimmed = text.trim();
    if (trimmed.startsWith("{") && trimmed.endsWith("}")) {
        return trimmed;
    }
    const open = trimmed.indexOf("{");
    const close = trimmed.lastIndexOf("}");
    return open >= 0 && close > open ? trimmed.slice(open, close + 1) : undefined;
}
322
/**
 * Scans query text for the first known language whose alias appears as a
 * standalone word (bounded by non-letter characters). Returns the canonical
 * language name, or undefined when none matches.
 */
function detectLanguage(text) {
    for (const [language, aliases] of Object.entries(LANGUAGE_ALIASES)) {
        const hit = aliases.some((alias) => {
            const pattern = new RegExp(`(^|[^a-z])${escapeRegex(alias)}([^a-z]|$)`, "i");
            return pattern.test(text);
        });
        if (hit) {
            return language;
        }
    }
    return undefined;
}
330
/**
 * Maps query text to the first TASK_PATTERNS entry whose regex matches.
 * Returns undefined when no task pattern applies.
 */
function detectTask(text) {
    for (const entry of TASK_PATTERNS) {
        if (entry.patterns.some((pattern) => pattern.test(text))) {
            return entry.task;
        }
    }
    return undefined;
}
334
/**
 * Splits text into unique, normalized keyword tokens, discarding
 * punctuation, stop words, pure numbers, and tokens shorter than three
 * characters. Insertion order of first occurrence is preserved.
 */
function tokenize(text) {
    const cleaned = text.replace(/[^\w\s-]/g, " ");
    const normalized = cleaned.split(/\s+/).map(normalizeToken);
    const keywords = normalized.filter((token) => token.length > 2
        && !STOP_WORDS.has(token)
        && !/^\d+$/.test(token));
    return [...new Set(keywords)];
}
341
+ function normalizeToken(value) {
342
+ return value.toLowerCase().replace(/^[^a-z0-9]+|[^a-z0-9-]+$/g, "").trim();
343
+ }
344
/**
 * Infers the minimum row count a user asked for, trying progressively
 * looser patterns: explicit "<n> rows/samples/records", human-sized counts
 * ("50k samples"), comma-grouped numbers, bare k/m/b sizes anywhere, then
 * any 4-9 digit number. Returns undefined when nothing numeric is found.
 */
function extractRequestedRows(text) {
    // Maps a k/m/b suffix to its numeric multiplier.
    const toMultiplier = (suffix) => {
        const lowered = suffix.toLowerCase();
        if (lowered === "k") return 1_000;
        if (lowered === "m") return 1_000_000;
        return 1_000_000_000;
    };
    const explicit = text.match(/(\d[\d,\s]{1,12})\s*(samples?|rows?|records?)/i);
    if (explicit) {
        const count = Number(explicit[1].replace(/[\s,]/g, ""));
        if (Number.isFinite(count) && count > 0) {
            return count;
        }
    }
    const sized = text.match(/(\d+(?:\.\d+)?)\s*([kmb])\s*(samples?|rows?|records?)/i);
    if (sized) {
        const count = Math.round(Number(sized[1]) * toMultiplier(sized[2]));
        if (Number.isFinite(count) && count > 0) {
            return count;
        }
    }
    const grouped = [...text.matchAll(/\b\d{1,3}(?:,\d{3})+\b/g)]
        .map((match) => Number(match[0].replace(/,/g, "")))
        .filter((value) => Number.isFinite(value) && value > 0);
    if (grouped.length > 0) {
        return Math.max(...grouped);
    }
    const bareSized = [...text.matchAll(/\b(\d+(?:\.\d+)?)\s*([kmb])\b/gi)]
        .map((match) => Math.round(Number(match[1]) * toMultiplier(match[2])))
        .filter((value) => Number.isFinite(value) && value > 0);
    if (bareSized.length > 0) {
        return Math.max(...bareSized);
    }
    const plain = [...text.matchAll(/\b\d{4,9}\b/g)]
        .map((match) => Number(match[0]))
        .filter((value) => Number.isFinite(value) && value > 0);
    return plain.length > 0 ? Math.max(...plain) : undefined;
}
387
/**
 * Checks whether a dataset satisfies the requested task: either the
 * dataset's own task field or the flattened text contains one of the task's
 * alias spellings.
 */
function matchesTask(dataset, task, text) {
    const aliases = {
        "question-answering": ["question-answering", "qa", "question answering"],
        "text-classification": ["text-classification", "classification", "text classification"],
        "token-classification": ["token-classification", "ner", "named entity"],
        "sentiment-analysis": ["sentiment-analysis", "sentiment"],
        translation: ["translation", "machine-translation", "parallel corpus"],
        summarization: ["summarization", "summary"],
        "text-generation": ["text-generation", "generation", "chat", "instruction"],
        "image-classification": ["image-classification", "image classification"],
        "object-detection": ["object-detection", "object detection"],
    };
    const normalizedTask = normalizeToken(task);
    const variants = aliases[normalizedTask] || [normalizedTask];
    const datasetTask = normalizeToken(dataset.task);
    return variants.some((variant) => datasetTask.includes(variant) || text.includes(variant));
}
403
/**
 * Returns every normalized alias (codes and synonyms) for a language name,
 * falling back to the normalized name itself when the language is unknown.
 */
function getLanguageAliases(language) {
    const key = normalizeToken(language);
    const aliases = LANGUAGE_ALIASES[key] || [key];
    return aliases.map(normalizeToken);
}
407
/**
 * Escapes regex metacharacters so a literal string can be embedded safely
 * inside a RegExp pattern.
 */
function escapeRegex(value) {
    const metacharacters = /[.*+?^${}()|[\]\\]/g;
    return value.replace(metacharacters, "\\$&");
}
@@ -0,0 +1,130 @@
1
+ import { spawn } from "child_process";
2
+ import fs from "fs";
3
+ import os from "os";
4
+ import path from "path";
5
/**
 * Resolves the directory treated as "home": the OS home dir when available,
 * then the HOME/USERPROFILE env vars, then the supplied build directory.
 */
function getHomeDir(buildDir) {
    const detected = os.homedir() || process.env.HOME || process.env.USERPROFILE;
    return detected || buildDir;
}
/**
 * Root of Vesper's per-user data directory (~/.vesper).
 */
export function getVesperDataRoot(buildDir = process.cwd()) {
    return path.join(getHomeDir(buildDir), ".vesper");
}
/**
 * Absolute path of the Python executable inside Vesper's managed venv.
 * The file may not exist yet — callers check with fs.existsSync.
 */
export function getManagedPythonPath(buildDir = process.cwd()) {
    const venvDir = path.join(getVesperDataRoot(buildDir), ".venv");
    if (process.platform === "win32") {
        return path.join(venvDir, "Scripts", "python.exe");
    }
    return path.join(venvDir, "bin", "python");
}
/**
 * Interpreter launcher to try when no managed or configured Python exists.
 */
function getFallbackPythonCommand() {
    return process.platform === "win32" ? "py" : "python3";
}
20
/**
 * Picks the Python interpreter to run, in priority order:
 * 1. the managed ~/.vesper/.venv interpreter, when it exists;
 * 2. an explicit VESPER_PYTHON environment override;
 * 3. a .venv next to (or one level above) the build directory;
 * 4. the platform fallback launcher ("py" on Windows, "python3" elsewhere).
 */
export function resolvePythonCommand(buildDir = process.cwd()) {
    const managedPython = getManagedPythonPath(buildDir);
    if (fs.existsSync(managedPython)) {
        return managedPython;
    }
    if (process.env.VESPER_PYTHON) {
        return process.env.VESPER_PYTHON;
    }
    const venvSuffix = process.platform === "win32"
        ? ["Scripts", "python.exe"]
        : ["bin", "python"];
    const localCandidates = [
        path.resolve(buildDir, ".venv", ...venvSuffix),
        path.resolve(buildDir, "..", ".venv", ...venvSuffix),
    ];
    const found = localCandidates.find((candidate) => fs.existsSync(candidate));
    return found ?? getFallbackPythonCommand();
}
45
/**
 * Runs a Python interpreter with the given args, capturing stdout/stderr.
 * Resolves with { code, stdout, stderr }; a timeout kills the child and
 * resolves with code 124 instead of rejecting. Rejects only when the
 * process cannot be spawned at all.
 *
 * @param {string} pythonPath - Interpreter path or launcher command.
 * @param {string[]} args - Arguments passed to the interpreter.
 * @param {number} [timeoutMs=300000] - Wall-clock budget for the command.
 * @returns {Promise<{code: number, stdout: string, stderr: string}>}
 */
function runPythonCommand(pythonPath, args, timeoutMs = 300000) {
    return new Promise((resolve, reject) => {
        const proc = spawn(pythonPath, args, {
            env: {
                ...process.env,
                // Keep child output decodable regardless of platform locale.
                PYTHONIOENCODING: "utf-8",
            },
        });
        let stdout = "";
        let stderr = "";
        // Fix: settle exactly once — previously both the timeout handler and
        // the subsequent "close" event called resolve.
        let settled = false;
        const settle = (result) => {
            if (!settled) {
                settled = true;
                resolve(result);
            }
        };
        const timer = setTimeout(() => {
            proc.kill();
            // Fix: escalate if the interpreter ignores SIGTERM (e.g. stuck in
            // native code) so the child cannot linger indefinitely. The guard
            // skips processes that already exited; unref keeps the event loop free.
            setTimeout(() => {
                if (proc.exitCode === null && proc.signalCode === null) {
                    proc.kill("SIGKILL");
                }
            }, 2000).unref();
            settle({ code: 124, stdout, stderr: stderr || `Python command timed out after ${timeoutMs}ms` });
        }, timeoutMs);
        proc.stdout.on("data", (data) => {
            stdout += data.toString();
        });
        proc.stderr.on("data", (data) => {
            stderr += data.toString();
        });
        proc.on("close", (code) => {
            clearTimeout(timer);
            settle({ code: code ?? 1, stdout, stderr });
        });
        proc.on("error", (error) => {
            clearTimeout(timer);
            if (!settled) {
                settled = true;
                reject(error);
            }
        });
    });
}
75
/**
 * Creates (once) the managed venv under ~/.vesper/.venv and upgrades pip
 * inside it. Tries each platform-appropriate bootstrap interpreter in order
 * and throws with the last captured error when none can create the venv.
 *
 * @param {string} buildDir - Fallback base directory for path resolution.
 * @returns {Promise<string>} Path to the managed venv's Python executable.
 * @throws {Error} When no bootstrap interpreter could create the venv.
 */
async function createManagedPythonEnv(buildDir) {
    const dataRoot = getVesperDataRoot(buildDir);
    const venvDir = path.join(dataRoot, ".venv");
    const managedPython = getManagedPythonPath(buildDir);
    // Idempotent: reuse an already-created environment.
    if (fs.existsSync(managedPython)) {
        return managedPython;
    }
    fs.mkdirSync(dataRoot, { recursive: true });
    // Candidate interpreters for bootstrapping the venv, most specific first.
    const bootstrapAttempts = process.platform === "win32"
        ? [
            { command: "py", args: ["-3", "-m", "venv", venvDir] },
            { command: "python", args: ["-m", "venv", venvDir] },
        ]
        : [
            { command: "python3", args: ["-m", "venv", venvDir] },
            { command: "python", args: ["-m", "venv", venvDir] },
        ];
    let lastError = "";
    for (const attempt of bootstrapAttempts) {
        try {
            const result = await runPythonCommand(attempt.command, attempt.args, 180000);
            // Success requires both a zero exit AND the interpreter existing.
            if (result.code === 0 && fs.existsSync(managedPython)) {
                await runPythonCommand(managedPython, ["-m", "pip", "install", "--disable-pip-version-check", "--upgrade", "pip"], 300000);
                return managedPython;
            }
            lastError = (result.stderr || result.stdout || "Unknown venv creation error").trim();
        }
        catch (error) {
            // Spawn failures (command missing from PATH) fall through to the
            // next bootstrap attempt.
            lastError = error?.message || String(error);
        }
    }
    throw new Error(`Failed to create Vesper Python environment. ${lastError}`.trim());
}
108
/**
 * Guarantees the given Python modules are importable in the resolved
 * interpreter, pip-installing any that are missing. Falls back to the PATH
 * interpreter when the managed venv cannot be created.
 *
 * @param {string} buildDir - Base directory for interpreter resolution.
 * @param {{module: string, packageName: string}[]} requirements - Importable
 *   module name paired with the pip distribution that provides it.
 * @returns {Promise<string>} The interpreter path that now has the packages.
 * @throws {Error} When pip fails to install the missing packages.
 */
export async function ensurePythonPackages(buildDir, requirements) {
    const pythonPath = await createManagedPythonEnv(buildDir).catch(() => resolvePythonCommand(buildDir));
    const missing = [];
    for (const requirement of requirements) {
        // Probe importability without importing (find_spec avoids running
        // heavyweight module initialization such as tensorflow's).
        const check = await runPythonCommand(pythonPath, [
            "-c",
            `import importlib.util,sys; sys.exit(0 if importlib.util.find_spec(${JSON.stringify(requirement.module)}) else 1)`
        ], 20000);
        if (check.code !== 0) {
            missing.push(requirement);
        }
    }
    if (missing.length === 0) {
        return pythonPath;
    }
    // Dedupe pip package names (several modules may share one distribution).
    const packages = [...new Set(missing.map(requirement => requirement.packageName))];
    const install = await runPythonCommand(pythonPath, ["-m", "pip", "install", "--disable-pip-version-check", ...packages], 600000);
    if (install.code !== 0) {
        const details = (install.stderr || install.stdout || "Unknown pip install error").trim();
        throw new Error(`Failed to install Python packages (${packages.join(", ")}). ${details}`);
    }
    return pythonPath;
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "vesper-wizard",
3
- "version": "2.1.4",
3
+ "version": "2.1.6",
4
4
  "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
5
5
  "type": "module",
6
6
  "main": "build/index.js",
@@ -2,13 +2,34 @@
2
2
 
3
3
  const { execSync } = require('child_process');
4
4
  const fs = require('fs');
5
+ const os = require('os');
5
6
  const path = require('path');
6
7
 
7
8
  console.log('\nšŸš€ Setting up Vesper MCP Server...\n');
8
9
 
10
// Finds a Python launcher on PATH by probing `<cmd> --version`, preferring
// the Windows "py -3" launcher / POSIX "python3" over the bare "python".
// Returns the first working command string, or null when no interpreter
// responds (callers treat null as "Python unavailable").
function getPythonBootstrapCommand() {
  const attempts = process.platform === 'win32'
    ? ['py -3', 'python']
    : ['python3', 'python'];

  for (const command of attempts) {
    try {
      execSync(`${command} --version`, { stdio: 'pipe' });
      return command;
    } catch {
      // try next command
    }
  }

  return null;
}
26
+
9
27
  // 1. Check for Python
28
+ const pythonBootstrap = getPythonBootstrapCommand();
10
29
  try {
11
- execSync('python --version', { stdio: 'pipe' });
30
+ if (!pythonBootstrap) {
31
+ throw new Error('Python not found');
32
+ }
12
33
  console.log('āœ… Python found');
13
34
  } catch (e) {
14
35
  console.warn('āš ļø Python not found. Please install Python 3.8+ for full functionality.');
@@ -16,36 +37,15 @@ try {
16
37
  process.exit(0); // Don't fail installation
17
38
  }
18
39
 
19
- // 2. Install Python dependencies
20
- console.log('\nšŸ“¦ Installing Python dependencies...');
21
- const pythonPackages = [
22
- 'opencv-python',
23
- 'pillow',
24
- 'numpy',
25
- 'librosa',
26
- 'soundfile',
27
- 'aiohttp',
28
- 'aiofiles',
29
- 'datasets',
30
- 'webdataset',
31
- 'kaggle'
32
- ];
33
-
34
- try {
35
- execSync(`python -m pip install ${pythonPackages.join(' ')}`, {
36
- stdio: 'inherit',
37
- timeout: 120000 // 2 minutes timeout
38
- });
39
- console.log('āœ… Python dependencies installed');
40
- } catch (e) {
41
- console.warn('āš ļø Failed to install some Python dependencies.');
42
- console.warn(' You may need to install them manually:');
43
- console.warn(` pip install ${pythonPackages.join(' ')}\n`);
44
- }
45
-
46
- // 3. Create data directories
47
- const homeDir = process.env.HOME || process.env.USERPROFILE;
40
+ const homeDir = os.homedir() || process.env.HOME || process.env.USERPROFILE;
48
41
  const vesperDataDir = path.join(homeDir, '.vesper');
42
+ const managedVenvDir = path.join(vesperDataDir, '.venv');
43
+ const managedPython = process.platform === 'win32'
44
+ ? path.join(managedVenvDir, 'Scripts', 'python.exe')
45
+ : path.join(managedVenvDir, 'bin', 'python');
46
+ const requirementsPath = path.resolve(__dirname, '..', 'requirements.txt');
47
+
48
+ // 2. Create data directories
49
49
  const dirs = [
50
50
  vesperDataDir,
51
51
  path.join(vesperDataDir, 'data'),
@@ -62,7 +62,49 @@ dirs.forEach(dir => {
62
62
 
63
63
  console.log(`āœ… Data directories created at ${vesperDataDir}`);
64
64
 
65
- // 4. Rebuild better-sqlite3 for current Node.js version
65
+ // 3. Create a managed Vesper Python environment
66
+ console.log('\nšŸ Preparing managed Python environment...');
67
+ try {
68
+ if (!fs.existsSync(managedPython)) {
69
+ execSync(`${pythonBootstrap} -m venv "${managedVenvDir}"`, {
70
+ stdio: 'inherit',
71
+ timeout: 180000,
72
+ });
73
+ }
74
+ console.log(`āœ… Managed Python ready at ${managedVenvDir}`);
75
+ } catch (e) {
76
+ console.warn('āš ļø Failed to create the managed Vesper Python environment.');
77
+ console.warn(` Vesper will fall back to PATH Python and may need to self-heal at runtime. ${(e && e.message) || ''}`.trim());
78
+ }
79
+
80
+ // 4. Install Python dependencies into the managed environment
81
+ console.log('\nšŸ“¦ Installing Python dependencies...');
82
+ const pythonPackages = [
83
+ 'opencv-python',
84
+ 'pillow',
85
+ 'librosa',
86
+ 'soundfile',
87
+ 'pyarrow'
88
+ ];
89
+
90
+ try {
91
+ const targetPython = fs.existsSync(managedPython) ? `"${managedPython}"` : pythonBootstrap;
92
+ execSync(`${targetPython} -m pip install --disable-pip-version-check --upgrade pip`, {
93
+ stdio: 'inherit',
94
+ timeout: 180000,
95
+ });
96
+ execSync(`${targetPython} -m pip install --disable-pip-version-check -r "${requirementsPath}" ${pythonPackages.join(' ')}`, {
97
+ stdio: 'inherit',
98
+ timeout: 600000,
99
+ });
100
+ console.log('āœ… Python dependencies installed');
101
+ } catch (e) {
102
+ console.warn('āš ļø Failed to install some Python dependencies.');
103
+ console.warn(' You may need to install them manually into the Vesper runtime:');
104
+ console.warn(` ${fs.existsSync(managedPython) ? managedPython : pythonBootstrap} -m pip install -r "${requirementsPath}" ${pythonPackages.join(' ')}\n`);
105
+ }
106
+
107
+ // 5. Rebuild better-sqlite3 for current Node.js version
66
108
  console.log('\nšŸ”§ Rebuilding native modules for current Node.js...');
67
109
  try {
68
110
  execSync('npm rebuild better-sqlite3', {
@@ -76,7 +118,7 @@ try {
76
118
  console.warn(' If you see ERR_DLOPEN_FAILED, run: npm rebuild better-sqlite3');
77
119
  }
78
120
 
79
- // 5. Auto-configure Claude Desktop (Best Effort)
121
+ // 6. Auto-configure Claude Desktop (Best Effort)
80
122
  console.log('\nāš™ļø Attempting to auto-configure Claude Desktop...');
81
123
 
82
124
  function getClaudeConfigPath() {