@vespermcp/mcp-server 1.2.18 → 1.2.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +99 -30
- package/package.json +1 -1
- package/scripts/wizard.js +4 -4
package/build/index.js
CHANGED
|
@@ -451,6 +451,7 @@ jobManager.on("processJob", async (job, execute) => {
|
|
|
451
451
|
* Logic for preparing a dataset (Search + Ingest + Process)
|
|
452
452
|
*/
|
|
453
453
|
async function handlePrepareJob(jobId, query, requirements) {
|
|
454
|
+
hydrateExternalKeys();
|
|
454
455
|
const update = (updates) => jobManager.updateJob(jobId, updates);
|
|
455
456
|
const requestedRows = extractRequestedRows(query, requirements);
|
|
456
457
|
let selectedDataset;
|
|
@@ -480,7 +481,8 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
480
481
|
datasetIdForDownload = explicitId.replace(/^dataworld:/i, "");
|
|
481
482
|
}
|
|
482
483
|
else {
|
|
483
|
-
|
|
484
|
+
// Default to HuggingFace for ambiguous refs (user/dataset without prefix)
|
|
485
|
+
source = "huggingface";
|
|
484
486
|
datasetIdForDownload = explicitId;
|
|
485
487
|
}
|
|
486
488
|
update({
|
|
@@ -490,11 +492,21 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
490
492
|
}
|
|
491
493
|
else {
|
|
492
494
|
update({ progress: 10, status_text: "Searching for best dataset matching query..." });
|
|
493
|
-
const results = await searchEngine.search(query, { limit:
|
|
495
|
+
const results = await searchEngine.search(query, { limit: 10 });
|
|
494
496
|
if (results.length === 0) {
|
|
495
497
|
throw new Error("No datasets found matching the query. Try refining your search terms.");
|
|
496
498
|
}
|
|
497
|
-
|
|
499
|
+
// Pick the best result that we can actually download (skip sources requiring missing credentials)
|
|
500
|
+
const hasKaggleCreds = !!(process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY && process.env.KAGGLE_USERNAME !== "YOUR_KAGGLE_USERNAME");
|
|
501
|
+
const hasDwToken = hasDataWorldToken();
|
|
502
|
+
selectedDataset = results.find(r => {
|
|
503
|
+
const s = (r.source || "").toLowerCase();
|
|
504
|
+
if (s === "kaggle" && !hasKaggleCreds)
|
|
505
|
+
return false;
|
|
506
|
+
if (s === "dataworld" && !hasDwToken)
|
|
507
|
+
return false;
|
|
508
|
+
return true;
|
|
509
|
+
}) || results[0]; // Fallback to first if all require credentials
|
|
498
510
|
datasetIdForDownload = selectedDataset.id;
|
|
499
511
|
source = selectedDataset.source;
|
|
500
512
|
update({
|
|
@@ -502,13 +514,16 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
502
514
|
status_text: `Matched: ${selectedDataset.name} (${selectedDataset.source})`
|
|
503
515
|
});
|
|
504
516
|
}
|
|
505
|
-
// Pre-check credentials for
|
|
517
|
+
// Pre-check credentials for sources that require them
|
|
506
518
|
if (source === "kaggle") {
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
throw new Error("Kaggle credentials not set. Use
|
|
519
|
+
const hasKaggleCreds = !!(process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY && process.env.KAGGLE_USERNAME !== "YOUR_KAGGLE_USERNAME");
|
|
520
|
+
if (!hasKaggleCreds) {
|
|
521
|
+
throw new Error("Kaggle credentials not set. Use the configure_keys tool or set KAGGLE_USERNAME/KAGGLE_KEY environment variables.");
|
|
510
522
|
}
|
|
511
523
|
}
|
|
524
|
+
if (source === "dataworld" && !hasDataWorldToken()) {
|
|
525
|
+
throw new Error("data.world token not set. Use the configure_keys tool or set DW_AUTH_TOKEN environment variable.");
|
|
526
|
+
}
|
|
512
527
|
update({ progress: 30, status_text: `Starting download from ${source}...` });
|
|
513
528
|
// ensureData handles download and returns path to the raw file
|
|
514
529
|
let rawFilePath = await dataIngestor.ensureData(datasetIdForDownload, source, (msg, prog) => {
|
|
@@ -604,22 +619,49 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
604
619
|
*/
|
|
605
620
|
async function handleCleanJob(jobId, datasetId, ops) {
|
|
606
621
|
const update = (updates) => jobManager.updateJob(jobId, updates);
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
622
|
+
// Resolve dataset file path from multiple sources
|
|
623
|
+
let filePath;
|
|
624
|
+
// 1. Check registry (most reliable - includes prepared/fused datasets)
|
|
625
|
+
const regEntry = getRegistryEntry(datasetId);
|
|
626
|
+
const regPath = regEntry?.local_path || regEntry?.path;
|
|
627
|
+
if (regPath && fs.existsSync(regPath)) {
|
|
628
|
+
filePath = regPath;
|
|
629
|
+
}
|
|
630
|
+
// 2. Check download status from metadata store
|
|
631
|
+
if (!filePath) {
|
|
632
|
+
const dlStatus = metadataStore.getDownloadStatus(datasetId);
|
|
633
|
+
if (dlStatus?.local_path && fs.existsSync(dlStatus.local_path)) {
|
|
634
|
+
filePath = dlStatus.local_path;
|
|
635
|
+
}
|
|
636
|
+
}
|
|
637
|
+
// 3. Check standard raw data paths
|
|
638
|
+
if (!filePath) {
|
|
639
|
+
const safeId = datasetId.replace(/\//g, "_");
|
|
640
|
+
const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
|
|
641
|
+
const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
|
|
642
|
+
const featherPath = path.join(dataRoot, "data", "raw", `${safeId}.feather`);
|
|
643
|
+
if (fs.existsSync(parquetPath))
|
|
644
|
+
filePath = parquetPath;
|
|
645
|
+
else if (fs.existsSync(csvPath))
|
|
646
|
+
filePath = csvPath;
|
|
647
|
+
else if (fs.existsSync(featherPath))
|
|
648
|
+
filePath = featherPath;
|
|
649
|
+
}
|
|
650
|
+
// 4. Check if it's a direct file path
|
|
651
|
+
if (!filePath && fs.existsSync(datasetId)) {
|
|
652
|
+
filePath = datasetId;
|
|
653
|
+
}
|
|
654
|
+
// 5. Demo fallback
|
|
655
|
+
if (!filePath && datasetId === "demo") {
|
|
615
656
|
const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
|
|
616
657
|
const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
|
|
617
658
|
if (fs.existsSync(demoParquetPath))
|
|
618
659
|
filePath = demoParquetPath;
|
|
619
660
|
else if (fs.existsSync(demoCsvPath))
|
|
620
661
|
filePath = demoCsvPath;
|
|
621
|
-
|
|
622
|
-
|
|
662
|
+
}
|
|
663
|
+
if (!filePath) {
|
|
664
|
+
throw new Error(`Data file not found for '${datasetId}'. Download the dataset first using download_dataset or prepare_dataset.`);
|
|
623
665
|
}
|
|
624
666
|
update({ status_text: "Cleaning dataset..." });
|
|
625
667
|
const result = await dataCleaner.clean(filePath, ops);
|
|
@@ -684,14 +726,14 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
684
726
|
},
|
|
685
727
|
{
|
|
686
728
|
name: "download_dataset",
|
|
687
|
-
description: "Download a dataset by source and ID/slug into local Vesper storage. Kaggle
|
|
729
|
+
description: "Download a dataset by source and ID/slug into local Vesper storage. Defaults to HuggingFace. Kaggle and data.world require API keys (use configure_keys first).",
|
|
688
730
|
inputSchema: {
|
|
689
731
|
type: "object",
|
|
690
732
|
properties: {
|
|
691
733
|
source: {
|
|
692
734
|
type: "string",
|
|
693
735
|
enum: ["huggingface", "kaggle", "openml", "dataworld"],
|
|
694
|
-
description: "Dataset source.",
|
|
736
|
+
description: "Dataset source (default: huggingface). HuggingFace and OpenML work without credentials.",
|
|
695
737
|
},
|
|
696
738
|
dataset_id: {
|
|
697
739
|
type: "string",
|
|
@@ -702,7 +744,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
702
744
|
description: "Optional target directory for downloaded files.",
|
|
703
745
|
}
|
|
704
746
|
},
|
|
705
|
-
required: ["
|
|
747
|
+
required: ["dataset_id"],
|
|
706
748
|
},
|
|
707
749
|
},
|
|
708
750
|
{
|
|
@@ -793,7 +835,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
793
835
|
},
|
|
794
836
|
{
|
|
795
837
|
name: "custom_clean",
|
|
796
|
-
description: "Apply specific cleaning operations to a dataset as an asynchronous job.",
|
|
838
|
+
description: "Apply specific cleaning operations to a dataset as an asynchronous job. Supports: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories. The dataset must be downloaded first.",
|
|
797
839
|
inputSchema: {
|
|
798
840
|
type: "object",
|
|
799
841
|
properties: {
|
|
@@ -818,7 +860,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
818
860
|
},
|
|
819
861
|
{
|
|
820
862
|
name: "prepare_dataset",
|
|
821
|
-
description: "Full pipeline: Analyze, Clean, Split, and
|
|
863
|
+
description: "Full pipeline: Search, Download, Analyze, Clean, Split, and Install a dataset as an asynchronous job. Automatically selects the best available source (prefers HuggingFace/OpenML when no Kaggle credentials are set). Use check_job_status to monitor progress.",
|
|
822
864
|
inputSchema: {
|
|
823
865
|
type: "object",
|
|
824
866
|
properties: {
|
|
@@ -1110,7 +1152,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1110
1152
|
if (source === "kaggle") {
|
|
1111
1153
|
if (!dataIngestor.hasKaggleCredentials()) {
|
|
1112
1154
|
return {
|
|
1113
|
-
content: [{ type: "text", text: `Kaggle
|
|
1155
|
+
content: [{ type: "text", text: `Kaggle requires API credentials. Use the configure_keys tool to set kaggle_username and kaggle_key, or try source='huggingface' which works without credentials.` }],
|
|
1114
1156
|
isError: true,
|
|
1115
1157
|
};
|
|
1116
1158
|
}
|
|
@@ -1166,20 +1208,20 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1166
1208
|
}
|
|
1167
1209
|
case "download_dataset": {
|
|
1168
1210
|
hydrateExternalKeys();
|
|
1169
|
-
const source = String(request.params.arguments?.source || "").toLowerCase();
|
|
1211
|
+
const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
|
|
1170
1212
|
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
1171
|
-
if (!
|
|
1172
|
-
throw new McpError(ErrorCode.InvalidParams, "
|
|
1213
|
+
if (!datasetId) {
|
|
1214
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
1173
1215
|
}
|
|
1174
1216
|
if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
|
|
1175
1217
|
return {
|
|
1176
|
-
content: [{ type: "text", text: `Kaggle
|
|
1218
|
+
content: [{ type: "text", text: `Kaggle requires API credentials. Use the configure_keys tool to set kaggle_username and kaggle_key, or switch to source='huggingface' which works without credentials.` }],
|
|
1177
1219
|
isError: true,
|
|
1178
1220
|
};
|
|
1179
1221
|
}
|
|
1180
1222
|
if (source === "dataworld" && !hasDataWorldToken()) {
|
|
1181
1223
|
return {
|
|
1182
|
-
content: [{ type: "text", text: "data.world requires API token.
|
|
1224
|
+
content: [{ type: "text", text: "data.world requires API token. Use the configure_keys tool to set dataworld_token, or switch to source='huggingface' which works without credentials." }],
|
|
1183
1225
|
isError: true,
|
|
1184
1226
|
};
|
|
1185
1227
|
}
|
|
@@ -1460,18 +1502,45 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1460
1502
|
case "custom_clean": {
|
|
1461
1503
|
const datasetId = String(request.params.arguments?.dataset_id);
|
|
1462
1504
|
const ops = request.params.arguments?.operations;
|
|
1505
|
+
if (!datasetId || datasetId === "undefined") {
|
|
1506
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
1507
|
+
}
|
|
1508
|
+
if (!ops || !Array.isArray(ops) || ops.length === 0) {
|
|
1509
|
+
throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
|
|
1510
|
+
}
|
|
1511
|
+
// Pre-check: verify dataset file exists before starting the job
|
|
1512
|
+
const cleanRegEntry = getRegistryEntry(datasetId);
|
|
1513
|
+
const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
|
|
1514
|
+
const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
|
|
1515
|
+
const cleanSafeId = datasetId.replace(/\//g, "_");
|
|
1516
|
+
const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
|
|
1517
|
+
(cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
|
|
1518
|
+
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
|
|
1519
|
+
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
|
|
1520
|
+
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
|
|
1521
|
+
fs.existsSync(datasetId);
|
|
1522
|
+
if (!cleanDataExists) {
|
|
1523
|
+
return {
|
|
1524
|
+
content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
|
|
1525
|
+
isError: true,
|
|
1526
|
+
};
|
|
1527
|
+
}
|
|
1463
1528
|
const job = jobManager.createJob("clean", 0, { datasetId, ops });
|
|
1464
1529
|
return {
|
|
1465
|
-
content: [{ type: "text", text: `
|
|
1530
|
+
content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
|
|
1466
1531
|
};
|
|
1467
1532
|
}
|
|
1468
1533
|
case "prepare_dataset": {
|
|
1534
|
+
hydrateExternalKeys();
|
|
1469
1535
|
const query = String(request.params.arguments?.query);
|
|
1470
1536
|
const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
|
|
1471
1537
|
const downloadImages = request.params.arguments?.download_images === true;
|
|
1538
|
+
if (!query || query === "undefined") {
|
|
1539
|
+
throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
|
|
1540
|
+
}
|
|
1472
1541
|
const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
|
|
1473
1542
|
return {
|
|
1474
|
-
content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you.` }]
|
|
1543
|
+
content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
|
|
1475
1544
|
};
|
|
1476
1545
|
}
|
|
1477
1546
|
case "compare_datasets": {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vespermcp/mcp-server",
|
|
3
|
-
"version": "1.2.18",
|
|
3
|
+
"version": "1.2.19",
|
|
4
4
|
"description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "build/index.js",
|
package/scripts/wizard.js
CHANGED
|
@@ -118,7 +118,7 @@ function getAllAgentConfigs() {
|
|
|
118
118
|
|
|
119
119
|
function installMcpToAgent(agent) {
|
|
120
120
|
const npxCmd = IS_WIN ? 'npx.cmd' : 'npx';
|
|
121
|
-
const serverEntry = { command: npxCmd, args: ['-y', '
|
|
121
|
+
const serverEntry = { command: npxCmd, args: ['-y', '@vespermcp/mcp-server@latest'] };
|
|
122
122
|
|
|
123
123
|
try {
|
|
124
124
|
if (agent.format === 'toml') {
|
|
@@ -156,7 +156,7 @@ function installMcpToAgent(agent) {
|
|
|
156
156
|
async function checkServerHealth() {
|
|
157
157
|
try {
|
|
158
158
|
// Quick stdio check — spawn server and see if it responds
|
|
159
|
-
const result = spawnSync(IS_WIN ? 'npx.cmd' : 'npx', ['-y', '
|
|
159
|
+
const result = spawnSync(IS_WIN ? 'npx.cmd' : 'npx', ['-y', '@vespermcp/mcp-server@latest', '--version'], {
|
|
160
160
|
timeout: 10000,
|
|
161
161
|
encoding: 'utf8',
|
|
162
162
|
stdio: ['pipe', 'pipe', 'pipe'],
|
|
@@ -202,13 +202,13 @@ async function main() {
|
|
|
202
202
|
console.log(`\n ${dim('[')}${cyan('4/6')}${dim(']')} Installing Vesper MCP server...`);
|
|
203
203
|
try {
|
|
204
204
|
const npmCmd = IS_WIN ? 'npx.cmd' : 'npx';
|
|
205
|
-
spawnSync(npmCmd, ['-y', '
|
|
205
|
+
spawnSync(npmCmd, ['-y', '@vespermcp/mcp-server@latest', '--setup', '--silent'], {
|
|
206
206
|
stdio: 'inherit',
|
|
207
207
|
timeout: 120000,
|
|
208
208
|
});
|
|
209
209
|
console.log(` ${green('✓')} @vespermcp/mcp-server installed`);
|
|
210
210
|
} catch {
|
|
211
|
-
console.log(` ${yellow('⚠')} Could not auto-install — run manually: npx -y
|
|
211
|
+
console.log(` ${yellow('⚠')} Could not auto-install — run manually: npx -y @vespermcp/mcp-server@latest --setup`);
|
|
212
212
|
}
|
|
213
213
|
|
|
214
214
|
// ─── Step 5: Auto-configure all detected IDEs ──────────────
|