@vespermcp/mcp-server 1.2.18 → 1.2.19

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -451,6 +451,7 @@ jobManager.on("processJob", async (job, execute) => {
451
451
  * Logic for preparing a dataset (Search + Ingest + Process)
452
452
  */
453
453
  async function handlePrepareJob(jobId, query, requirements) {
454
+ hydrateExternalKeys();
454
455
  const update = (updates) => jobManager.updateJob(jobId, updates);
455
456
  const requestedRows = extractRequestedRows(query, requirements);
456
457
  let selectedDataset;
@@ -480,7 +481,8 @@ async function handlePrepareJob(jobId, query, requirements) {
480
481
  datasetIdForDownload = explicitId.replace(/^dataworld:/i, "");
481
482
  }
482
483
  else {
483
- source = "kaggle";
484
+ // Default to HuggingFace for ambiguous refs (user/dataset without prefix)
485
+ source = "huggingface";
484
486
  datasetIdForDownload = explicitId;
485
487
  }
486
488
  update({
@@ -490,11 +492,21 @@ async function handlePrepareJob(jobId, query, requirements) {
490
492
  }
491
493
  else {
492
494
  update({ progress: 10, status_text: "Searching for best dataset matching query..." });
493
- const results = await searchEngine.search(query, { limit: 1 });
495
+ const results = await searchEngine.search(query, { limit: 10 });
494
496
  if (results.length === 0) {
495
497
  throw new Error("No datasets found matching the query. Try refining your search terms.");
496
498
  }
497
- selectedDataset = results[0];
499
+ // Pick the best result that we can actually download (skip sources requiring missing credentials)
500
+ const hasKaggleCreds = !!(process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY && process.env.KAGGLE_USERNAME !== "YOUR_KAGGLE_USERNAME");
501
+ const hasDwToken = hasDataWorldToken();
502
+ selectedDataset = results.find(r => {
503
+ const s = (r.source || "").toLowerCase();
504
+ if (s === "kaggle" && !hasKaggleCreds)
505
+ return false;
506
+ if (s === "dataworld" && !hasDwToken)
507
+ return false;
508
+ return true;
509
+ }) || results[0]; // Fallback to first if all require credentials
498
510
  datasetIdForDownload = selectedDataset.id;
499
511
  source = selectedDataset.source;
500
512
  update({
@@ -502,13 +514,16 @@ async function handlePrepareJob(jobId, query, requirements) {
502
514
  status_text: `Matched: ${selectedDataset.name} (${selectedDataset.source})`
503
515
  });
504
516
  }
505
- // Pre-check credentials for Kaggle
517
+ // Pre-check credentials for sources that require them
506
518
  if (source === "kaggle") {
507
- if (!process.env.KAGGLE_USERNAME || !process.env.KAGGLE_KEY ||
508
- process.env.KAGGLE_USERNAME === "YOUR_KAGGLE_USERNAME") {
509
- throw new Error("Kaggle credentials not set. Use 'kaggle login' or set KAGGLE_USERNAME/KAGGLE_KEY.");
519
+ const hasKaggleCreds = !!(process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY && process.env.KAGGLE_USERNAME !== "YOUR_KAGGLE_USERNAME");
520
+ if (!hasKaggleCreds) {
521
+ throw new Error("Kaggle credentials not set. Use the configure_keys tool or set KAGGLE_USERNAME/KAGGLE_KEY environment variables.");
510
522
  }
511
523
  }
524
+ if (source === "dataworld" && !hasDataWorldToken()) {
525
+ throw new Error("data.world token not set. Use the configure_keys tool or set DW_AUTH_TOKEN environment variable.");
526
+ }
512
527
  update({ progress: 30, status_text: `Starting download from ${source}...` });
513
528
  // ensureData handles download and returns path to the raw file
514
529
  let rawFilePath = await dataIngestor.ensureData(datasetIdForDownload, source, (msg, prog) => {
@@ -604,22 +619,49 @@ async function handlePrepareJob(jobId, query, requirements) {
604
619
  */
605
620
  async function handleCleanJob(jobId, datasetId, ops) {
606
621
  const update = (updates) => jobManager.updateJob(jobId, updates);
607
- const safeId = datasetId.replace(/\//g, "_");
608
- const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
609
- const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
610
- let filePath = parquetPath;
611
- if (!fs.existsSync(filePath)) {
612
- filePath = csvPath;
613
- }
614
- if (datasetId === "demo" || !fs.existsSync(filePath)) {
622
+ // Resolve dataset file path from multiple sources
623
+ let filePath;
624
+ // 1. Check registry (most reliable - includes prepared/fused datasets)
625
+ const regEntry = getRegistryEntry(datasetId);
626
+ const regPath = regEntry?.local_path || regEntry?.path;
627
+ if (regPath && fs.existsSync(regPath)) {
628
+ filePath = regPath;
629
+ }
630
+ // 2. Check download status from metadata store
631
+ if (!filePath) {
632
+ const dlStatus = metadataStore.getDownloadStatus(datasetId);
633
+ if (dlStatus?.local_path && fs.existsSync(dlStatus.local_path)) {
634
+ filePath = dlStatus.local_path;
635
+ }
636
+ }
637
+ // 3. Check standard raw data paths
638
+ if (!filePath) {
639
+ const safeId = datasetId.replace(/\//g, "_");
640
+ const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
641
+ const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
642
+ const featherPath = path.join(dataRoot, "data", "raw", `${safeId}.feather`);
643
+ if (fs.existsSync(parquetPath))
644
+ filePath = parquetPath;
645
+ else if (fs.existsSync(csvPath))
646
+ filePath = csvPath;
647
+ else if (fs.existsSync(featherPath))
648
+ filePath = featherPath;
649
+ }
650
+ // 4. Check if it's a direct file path
651
+ if (!filePath && fs.existsSync(datasetId)) {
652
+ filePath = datasetId;
653
+ }
654
+ // 5. Demo fallback
655
+ if (!filePath && datasetId === "demo") {
615
656
  const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
616
657
  const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
617
658
  if (fs.existsSync(demoParquetPath))
618
659
  filePath = demoParquetPath;
619
660
  else if (fs.existsSync(demoCsvPath))
620
661
  filePath = demoCsvPath;
621
- else
622
- throw new Error(`Data file not found for ${datasetId}`);
662
+ }
663
+ if (!filePath) {
664
+ throw new Error(`Data file not found for '${datasetId}'. Download the dataset first using download_dataset or prepare_dataset.`);
623
665
  }
624
666
  update({ status_text: "Cleaning dataset..." });
625
667
  const result = await dataCleaner.clean(filePath, ops);
@@ -684,14 +726,14 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
684
726
  },
685
727
  {
686
728
  name: "download_dataset",
687
- description: "Download a dataset by source and ID/slug into local Vesper storage. Kaggle requires optional API key.",
729
+ description: "Download a dataset by source and ID/slug into local Vesper storage. Defaults to HuggingFace. Kaggle and data.world require API keys (use configure_keys first).",
688
730
  inputSchema: {
689
731
  type: "object",
690
732
  properties: {
691
733
  source: {
692
734
  type: "string",
693
735
  enum: ["huggingface", "kaggle", "openml", "dataworld"],
694
- description: "Dataset source.",
736
+ description: "Dataset source (default: huggingface). HuggingFace and OpenML work without credentials.",
695
737
  },
696
738
  dataset_id: {
697
739
  type: "string",
@@ -702,7 +744,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
702
744
  description: "Optional target directory for downloaded files.",
703
745
  }
704
746
  },
705
- required: ["source", "dataset_id"],
747
+ required: ["dataset_id"],
706
748
  },
707
749
  },
708
750
  {
@@ -793,7 +835,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
793
835
  },
794
836
  {
795
837
  name: "custom_clean",
796
- description: "Apply specific cleaning operations to a dataset as an asynchronous job.",
838
+ description: "Apply specific cleaning operations to a dataset as an asynchronous job. Supports: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories. The dataset must be downloaded first.",
797
839
  inputSchema: {
798
840
  type: "object",
799
841
  properties: {
@@ -818,7 +860,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
818
860
  },
819
861
  {
820
862
  name: "prepare_dataset",
821
- description: "Full pipeline: Analyze, Clean, Split, and Export as an asynchronous job.",
863
+ description: "Full pipeline: Search, Download, Analyze, Clean, Split, and Install a dataset as an asynchronous job. Automatically selects the best available source (prefers HuggingFace/OpenML when no Kaggle credentials are set). Use check_job_status to monitor progress.",
822
864
  inputSchema: {
823
865
  type: "object",
824
866
  properties: {
@@ -1110,7 +1152,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1110
1152
  if (source === "kaggle") {
1111
1153
  if (!dataIngestor.hasKaggleCredentials()) {
1112
1154
  return {
1113
- content: [{ type: "text", text: `Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).` }],
1155
+ content: [{ type: "text", text: `Kaggle requires API credentials. Use the configure_keys tool to set kaggle_username and kaggle_key, or try source='huggingface' which works without credentials.` }],
1114
1156
  isError: true,
1115
1157
  };
1116
1158
  }
@@ -1166,20 +1208,20 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1166
1208
  }
1167
1209
  case "download_dataset": {
1168
1210
  hydrateExternalKeys();
1169
- const source = String(request.params.arguments?.source || "").toLowerCase();
1211
+ const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1170
1212
  const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1171
- if (!source || !datasetId) {
1172
- throw new McpError(ErrorCode.InvalidParams, "source and dataset_id are required");
1213
+ if (!datasetId) {
1214
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1173
1215
  }
1174
1216
  if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
1175
1217
  return {
1176
- content: [{ type: "text", text: `Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).` }],
1218
+ content: [{ type: "text", text: `Kaggle requires API credentials. Use the configure_keys tool to set kaggle_username and kaggle_key, or switch to source='huggingface' which works without credentials.` }],
1177
1219
  isError: true,
1178
1220
  };
1179
1221
  }
1180
1222
  if (source === "dataworld" && !hasDataWorldToken()) {
1181
1223
  return {
1182
- content: [{ type: "text", text: "data.world requires API token. Run 'vespermcp config keys' and set dataworld_token." }],
1224
+ content: [{ type: "text", text: "data.world requires API token. Use the configure_keys tool to set dataworld_token, or switch to source='huggingface' which works without credentials." }],
1183
1225
  isError: true,
1184
1226
  };
1185
1227
  }
@@ -1460,18 +1502,45 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1460
1502
  case "custom_clean": {
1461
1503
  const datasetId = String(request.params.arguments?.dataset_id);
1462
1504
  const ops = request.params.arguments?.operations;
1505
+ if (!datasetId || datasetId === "undefined") {
1506
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1507
+ }
1508
+ if (!ops || !Array.isArray(ops) || ops.length === 0) {
1509
+ throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
1510
+ }
1511
+ // Pre-check: verify dataset file exists before starting the job
1512
+ const cleanRegEntry = getRegistryEntry(datasetId);
1513
+ const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
1514
+ const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
1515
+ const cleanSafeId = datasetId.replace(/\//g, "_");
1516
+ const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
1517
+ (cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
1518
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
1519
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
1520
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
1521
+ fs.existsSync(datasetId);
1522
+ if (!cleanDataExists) {
1523
+ return {
1524
+ content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
1525
+ isError: true,
1526
+ };
1527
+ }
1463
1528
  const job = jobManager.createJob("clean", 0, { datasetId, ops });
1464
1529
  return {
1465
- content: [{ type: "text", text: `Job started successfully. ID: ${job.id}. Use check_job_status to monitor progress.` }]
1530
+ content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
1466
1531
  };
1467
1532
  }
1468
1533
  case "prepare_dataset": {
1534
+ hydrateExternalKeys();
1469
1535
  const query = String(request.params.arguments?.query);
1470
1536
  const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
1471
1537
  const downloadImages = request.params.arguments?.download_images === true;
1538
+ if (!query || query === "undefined") {
1539
+ throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
1540
+ }
1472
1541
  const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
1473
1542
  return {
1474
- content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you.` }]
1543
+ content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
1475
1544
  };
1476
1545
  }
1477
1546
  case "compare_datasets": {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vespermcp/mcp-server",
3
- "version": "1.2.18",
3
+ "version": "1.2.19",
4
4
  "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
5
5
  "type": "module",
6
6
  "main": "build/index.js",
package/scripts/wizard.js CHANGED
@@ -118,7 +118,7 @@ function getAllAgentConfigs() {
118
118
 
119
119
  function installMcpToAgent(agent) {
120
120
  const npxCmd = IS_WIN ? 'npx.cmd' : 'npx';
121
- const serverEntry = { command: npxCmd, args: ['-y', '-p', '@vespermcp/mcp-server@latest', 'vespermcp'] };
121
+ const serverEntry = { command: npxCmd, args: ['-y', '@vespermcp/mcp-server@latest'] };
122
122
 
123
123
  try {
124
124
  if (agent.format === 'toml') {
@@ -156,7 +156,7 @@ function installMcpToAgent(agent) {
156
156
  async function checkServerHealth() {
157
157
  try {
158
158
  // Quick stdio check — spawn server and see if it responds
159
- const result = spawnSync(IS_WIN ? 'npx.cmd' : 'npx', ['-y', '-p', '@vespermcp/mcp-server@latest', 'vespermcp', '--version'], {
159
+ const result = spawnSync(IS_WIN ? 'npx.cmd' : 'npx', ['-y', '@vespermcp/mcp-server@latest', '--version'], {
160
160
  timeout: 10000,
161
161
  encoding: 'utf8',
162
162
  stdio: ['pipe', 'pipe', 'pipe'],
@@ -202,13 +202,13 @@ async function main() {
202
202
  console.log(`\n ${dim('[')}${cyan('4/6')}${dim(']')} Installing Vesper MCP server...`);
203
203
  try {
204
204
  const npmCmd = IS_WIN ? 'npx.cmd' : 'npx';
205
- spawnSync(npmCmd, ['-y', '-p', '@vespermcp/mcp-server@latest', 'vespermcp', '--setup', '--silent'], {
205
+ spawnSync(npmCmd, ['-y', '@vespermcp/mcp-server@latest', '--setup', '--silent'], {
206
206
  stdio: 'inherit',
207
207
  timeout: 120000,
208
208
  });
209
209
  console.log(` ${green('✓')} @vespermcp/mcp-server installed`);
210
210
  } catch {
211
- console.log(` ${yellow('⚠')} Could not auto-install — run manually: npx -y -p @vespermcp/mcp-server@latest vespermcp --setup`);
211
+ console.log(` ${yellow('⚠')} Could not auto-install — run manually: npx -y @vespermcp/mcp-server@latest --setup`);
212
212
  }
213
213
 
214
214
  // ─── Step 5: Auto-configure all detected IDEs ──────────────