@vespermcp/mcp-server 1.2.18 → 1.2.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +121 -30
- package/build/ingestion/ingestor.js +117 -18
- package/build/python/hf_fallback.py +147 -0
- package/package.json +1 -1
- package/scripts/wizard.js +4 -4
- package/src/python/hf_fallback.py +147 -0
package/build/index.js
CHANGED
|
@@ -451,7 +451,19 @@ jobManager.on("processJob", async (job, execute) => {
|
|
|
451
451
|
* Logic for preparing a dataset (Search + Ingest + Process)
|
|
452
452
|
*/
|
|
453
453
|
async function handlePrepareJob(jobId, query, requirements) {
|
|
454
|
+
hydrateExternalKeys();
|
|
454
455
|
const update = (updates) => jobManager.updateJob(jobId, updates);
|
|
456
|
+
// Ensure core Python packages are available for dataset operations
|
|
457
|
+
try {
|
|
458
|
+
await ensurePythonModules([
|
|
459
|
+
{ module: "polars", packageName: "polars" },
|
|
460
|
+
{ module: "datasets", packageName: "datasets" },
|
|
461
|
+
]);
|
|
462
|
+
}
|
|
463
|
+
catch (e) {
|
|
464
|
+
console.error(`[Prepare] Python dependency setup warning: ${e.message}`);
|
|
465
|
+
// Continue anyway - direct file downloads may still work without datasets lib
|
|
466
|
+
}
|
|
455
467
|
const requestedRows = extractRequestedRows(query, requirements);
|
|
456
468
|
let selectedDataset;
|
|
457
469
|
let datasetIdForDownload = "";
|
|
@@ -480,7 +492,8 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
480
492
|
datasetIdForDownload = explicitId.replace(/^dataworld:/i, "");
|
|
481
493
|
}
|
|
482
494
|
else {
|
|
483
|
-
|
|
495
|
+
// Default to HuggingFace for ambiguous refs (user/dataset without prefix)
|
|
496
|
+
source = "huggingface";
|
|
484
497
|
datasetIdForDownload = explicitId;
|
|
485
498
|
}
|
|
486
499
|
update({
|
|
@@ -490,11 +503,21 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
490
503
|
}
|
|
491
504
|
else {
|
|
492
505
|
update({ progress: 10, status_text: "Searching for best dataset matching query..." });
|
|
493
|
-
const results = await searchEngine.search(query, { limit:
|
|
506
|
+
const results = await searchEngine.search(query, { limit: 10 });
|
|
494
507
|
if (results.length === 0) {
|
|
495
508
|
throw new Error("No datasets found matching the query. Try refining your search terms.");
|
|
496
509
|
}
|
|
497
|
-
|
|
510
|
+
// Pick the best result that we can actually download (skip sources requiring missing credentials)
|
|
511
|
+
const hasKaggleCreds = !!(process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY && process.env.KAGGLE_USERNAME !== "YOUR_KAGGLE_USERNAME");
|
|
512
|
+
const hasDwToken = hasDataWorldToken();
|
|
513
|
+
selectedDataset = results.find(r => {
|
|
514
|
+
const s = (r.source || "").toLowerCase();
|
|
515
|
+
if (s === "kaggle" && !hasKaggleCreds)
|
|
516
|
+
return false;
|
|
517
|
+
if (s === "dataworld" && !hasDwToken)
|
|
518
|
+
return false;
|
|
519
|
+
return true;
|
|
520
|
+
}) || results[0]; // Fallback to first if all require credentials
|
|
498
521
|
datasetIdForDownload = selectedDataset.id;
|
|
499
522
|
source = selectedDataset.source;
|
|
500
523
|
update({
|
|
@@ -502,13 +525,16 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
502
525
|
status_text: `Matched: ${selectedDataset.name} (${selectedDataset.source})`
|
|
503
526
|
});
|
|
504
527
|
}
|
|
505
|
-
// Pre-check credentials for
|
|
528
|
+
// Pre-check credentials for sources that require them
|
|
506
529
|
if (source === "kaggle") {
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
throw new Error("Kaggle credentials not set. Use
|
|
530
|
+
const hasKaggleCreds = !!(process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY && process.env.KAGGLE_USERNAME !== "YOUR_KAGGLE_USERNAME");
|
|
531
|
+
if (!hasKaggleCreds) {
|
|
532
|
+
throw new Error("Kaggle credentials not set. Use the configure_keys tool or set KAGGLE_USERNAME/KAGGLE_KEY environment variables.");
|
|
510
533
|
}
|
|
511
534
|
}
|
|
535
|
+
if (source === "dataworld" && !hasDataWorldToken()) {
|
|
536
|
+
throw new Error("data.world token not set. Use the configure_keys tool or set DW_AUTH_TOKEN environment variable.");
|
|
537
|
+
}
|
|
512
538
|
update({ progress: 30, status_text: `Starting download from ${source}...` });
|
|
513
539
|
// ensureData handles download and returns path to the raw file
|
|
514
540
|
let rawFilePath = await dataIngestor.ensureData(datasetIdForDownload, source, (msg, prog) => {
|
|
@@ -604,22 +630,49 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
604
630
|
*/
|
|
605
631
|
async function handleCleanJob(jobId, datasetId, ops) {
|
|
606
632
|
const update = (updates) => jobManager.updateJob(jobId, updates);
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
633
|
+
// Resolve dataset file path from multiple sources
|
|
634
|
+
let filePath;
|
|
635
|
+
// 1. Check registry (most reliable - includes prepared/fused datasets)
|
|
636
|
+
const regEntry = getRegistryEntry(datasetId);
|
|
637
|
+
const regPath = regEntry?.local_path || regEntry?.path;
|
|
638
|
+
if (regPath && fs.existsSync(regPath)) {
|
|
639
|
+
filePath = regPath;
|
|
640
|
+
}
|
|
641
|
+
// 2. Check download status from metadata store
|
|
642
|
+
if (!filePath) {
|
|
643
|
+
const dlStatus = metadataStore.getDownloadStatus(datasetId);
|
|
644
|
+
if (dlStatus?.local_path && fs.existsSync(dlStatus.local_path)) {
|
|
645
|
+
filePath = dlStatus.local_path;
|
|
646
|
+
}
|
|
647
|
+
}
|
|
648
|
+
// 3. Check standard raw data paths
|
|
649
|
+
if (!filePath) {
|
|
650
|
+
const safeId = datasetId.replace(/\//g, "_");
|
|
651
|
+
const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
|
|
652
|
+
const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
|
|
653
|
+
const featherPath = path.join(dataRoot, "data", "raw", `${safeId}.feather`);
|
|
654
|
+
if (fs.existsSync(parquetPath))
|
|
655
|
+
filePath = parquetPath;
|
|
656
|
+
else if (fs.existsSync(csvPath))
|
|
657
|
+
filePath = csvPath;
|
|
658
|
+
else if (fs.existsSync(featherPath))
|
|
659
|
+
filePath = featherPath;
|
|
660
|
+
}
|
|
661
|
+
// 4. Check if it's a direct file path
|
|
662
|
+
if (!filePath && fs.existsSync(datasetId)) {
|
|
663
|
+
filePath = datasetId;
|
|
664
|
+
}
|
|
665
|
+
// 5. Demo fallback
|
|
666
|
+
if (!filePath && datasetId === "demo") {
|
|
615
667
|
const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
|
|
616
668
|
const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
|
|
617
669
|
if (fs.existsSync(demoParquetPath))
|
|
618
670
|
filePath = demoParquetPath;
|
|
619
671
|
else if (fs.existsSync(demoCsvPath))
|
|
620
672
|
filePath = demoCsvPath;
|
|
621
|
-
|
|
622
|
-
|
|
673
|
+
}
|
|
674
|
+
if (!filePath) {
|
|
675
|
+
throw new Error(`Data file not found for '${datasetId}'. Download the dataset first using download_dataset or prepare_dataset.`);
|
|
623
676
|
}
|
|
624
677
|
update({ status_text: "Cleaning dataset..." });
|
|
625
678
|
const result = await dataCleaner.clean(filePath, ops);
|
|
@@ -684,14 +737,14 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
684
737
|
},
|
|
685
738
|
{
|
|
686
739
|
name: "download_dataset",
|
|
687
|
-
description: "Download a dataset by source and ID/slug into local Vesper storage. Kaggle
|
|
740
|
+
description: "Download a dataset by source and ID/slug into local Vesper storage. Defaults to HuggingFace. Kaggle and data.world require API keys (use configure_keys first).",
|
|
688
741
|
inputSchema: {
|
|
689
742
|
type: "object",
|
|
690
743
|
properties: {
|
|
691
744
|
source: {
|
|
692
745
|
type: "string",
|
|
693
746
|
enum: ["huggingface", "kaggle", "openml", "dataworld"],
|
|
694
|
-
description: "Dataset source.",
|
|
747
|
+
description: "Dataset source (default: huggingface). HuggingFace and OpenML work without credentials.",
|
|
695
748
|
},
|
|
696
749
|
dataset_id: {
|
|
697
750
|
type: "string",
|
|
@@ -702,7 +755,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
702
755
|
description: "Optional target directory for downloaded files.",
|
|
703
756
|
}
|
|
704
757
|
},
|
|
705
|
-
required: ["
|
|
758
|
+
required: ["dataset_id"],
|
|
706
759
|
},
|
|
707
760
|
},
|
|
708
761
|
{
|
|
@@ -793,7 +846,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
793
846
|
},
|
|
794
847
|
{
|
|
795
848
|
name: "custom_clean",
|
|
796
|
-
description: "Apply specific cleaning operations to a dataset as an asynchronous job.",
|
|
849
|
+
description: "Apply specific cleaning operations to a dataset as an asynchronous job. Supports: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories. The dataset must be downloaded first.",
|
|
797
850
|
inputSchema: {
|
|
798
851
|
type: "object",
|
|
799
852
|
properties: {
|
|
@@ -818,7 +871,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
818
871
|
},
|
|
819
872
|
{
|
|
820
873
|
name: "prepare_dataset",
|
|
821
|
-
description: "Full pipeline: Analyze, Clean, Split, and
|
|
874
|
+
description: "Full pipeline: Search, Download, Analyze, Clean, Split, and Install a dataset as an asynchronous job. Automatically selects the best available source (prefers HuggingFace/OpenML when no Kaggle credentials are set). Use check_job_status to monitor progress.",
|
|
822
875
|
inputSchema: {
|
|
823
876
|
type: "object",
|
|
824
877
|
properties: {
|
|
@@ -1110,7 +1163,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1110
1163
|
if (source === "kaggle") {
|
|
1111
1164
|
if (!dataIngestor.hasKaggleCredentials()) {
|
|
1112
1165
|
return {
|
|
1113
|
-
content: [{ type: "text", text: `Kaggle
|
|
1166
|
+
content: [{ type: "text", text: `Kaggle requires API credentials. Use the configure_keys tool to set kaggle_username and kaggle_key, or try source='huggingface' which works without credentials.` }],
|
|
1114
1167
|
isError: true,
|
|
1115
1168
|
};
|
|
1116
1169
|
}
|
|
@@ -1166,23 +1219,34 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1166
1219
|
}
|
|
1167
1220
|
case "download_dataset": {
|
|
1168
1221
|
hydrateExternalKeys();
|
|
1169
|
-
const source = String(request.params.arguments?.source || "").toLowerCase();
|
|
1222
|
+
const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
|
|
1170
1223
|
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
1171
|
-
if (!
|
|
1172
|
-
throw new McpError(ErrorCode.InvalidParams, "
|
|
1224
|
+
if (!datasetId) {
|
|
1225
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
1173
1226
|
}
|
|
1174
1227
|
if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
|
|
1175
1228
|
return {
|
|
1176
|
-
content: [{ type: "text", text: `Kaggle
|
|
1229
|
+
content: [{ type: "text", text: `Kaggle requires API credentials. Use the configure_keys tool to set kaggle_username and kaggle_key, or switch to source='huggingface' which works without credentials.` }],
|
|
1177
1230
|
isError: true,
|
|
1178
1231
|
};
|
|
1179
1232
|
}
|
|
1180
1233
|
if (source === "dataworld" && !hasDataWorldToken()) {
|
|
1181
1234
|
return {
|
|
1182
|
-
content: [{ type: "text", text: "data.world requires API token.
|
|
1235
|
+
content: [{ type: "text", text: "data.world requires API token. Use the configure_keys tool to set dataworld_token, or switch to source='huggingface' which works without credentials." }],
|
|
1183
1236
|
isError: true,
|
|
1184
1237
|
};
|
|
1185
1238
|
}
|
|
1239
|
+
// Pre-install Python datasets library for HuggingFace fallback
|
|
1240
|
+
if (source === "huggingface") {
|
|
1241
|
+
try {
|
|
1242
|
+
await ensurePythonModules([
|
|
1243
|
+
{ module: "datasets", packageName: "datasets" },
|
|
1244
|
+
]);
|
|
1245
|
+
}
|
|
1246
|
+
catch {
|
|
1247
|
+
// Continue - direct download may still work
|
|
1248
|
+
}
|
|
1249
|
+
}
|
|
1186
1250
|
try {
|
|
1187
1251
|
const localPath = await dataIngestor.ensureData(datasetId, source, () => undefined);
|
|
1188
1252
|
try {
|
|
@@ -1460,18 +1524,45 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1460
1524
|
case "custom_clean": {
|
|
1461
1525
|
const datasetId = String(request.params.arguments?.dataset_id);
|
|
1462
1526
|
const ops = request.params.arguments?.operations;
|
|
1527
|
+
if (!datasetId || datasetId === "undefined") {
|
|
1528
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
1529
|
+
}
|
|
1530
|
+
if (!ops || !Array.isArray(ops) || ops.length === 0) {
|
|
1531
|
+
throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
|
|
1532
|
+
}
|
|
1533
|
+
// Pre-check: verify dataset file exists before starting the job
|
|
1534
|
+
const cleanRegEntry = getRegistryEntry(datasetId);
|
|
1535
|
+
const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
|
|
1536
|
+
const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
|
|
1537
|
+
const cleanSafeId = datasetId.replace(/\//g, "_");
|
|
1538
|
+
const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
|
|
1539
|
+
(cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
|
|
1540
|
+
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
|
|
1541
|
+
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
|
|
1542
|
+
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
|
|
1543
|
+
fs.existsSync(datasetId);
|
|
1544
|
+
if (!cleanDataExists) {
|
|
1545
|
+
return {
|
|
1546
|
+
content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
|
|
1547
|
+
isError: true,
|
|
1548
|
+
};
|
|
1549
|
+
}
|
|
1463
1550
|
const job = jobManager.createJob("clean", 0, { datasetId, ops });
|
|
1464
1551
|
return {
|
|
1465
|
-
content: [{ type: "text", text: `
|
|
1552
|
+
content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
|
|
1466
1553
|
};
|
|
1467
1554
|
}
|
|
1468
1555
|
case "prepare_dataset": {
|
|
1556
|
+
hydrateExternalKeys();
|
|
1469
1557
|
const query = String(request.params.arguments?.query);
|
|
1470
1558
|
const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
|
|
1471
1559
|
const downloadImages = request.params.arguments?.download_images === true;
|
|
1560
|
+
if (!query || query === "undefined") {
|
|
1561
|
+
throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
|
|
1562
|
+
}
|
|
1472
1563
|
const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
|
|
1473
1564
|
return {
|
|
1474
|
-
content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you.` }]
|
|
1565
|
+
content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
|
|
1475
1566
|
};
|
|
1476
1567
|
}
|
|
1477
1568
|
case "compare_datasets": {
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import path from "path";
|
|
2
2
|
import fs from "fs";
|
|
3
|
+
import { spawn } from "child_process";
|
|
3
4
|
import { HFDownloader } from "./hf-downloader.js";
|
|
4
5
|
import { KaggleSource } from "../metadata/kaggle-source.js";
|
|
5
6
|
import { OpenMLSource } from "../metadata/openml-source.js";
|
|
@@ -63,25 +64,42 @@ export class DataIngestor {
|
|
|
63
64
|
if (source === "huggingface") {
|
|
64
65
|
onProgress?.("Discovering data files on HuggingFace Hub...");
|
|
65
66
|
const remotePath = await this.hfDownloader.findBestFile(datasetId);
|
|
66
|
-
if (
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
67
|
+
if (remotePath) {
|
|
68
|
+
// Direct file download path (repo has raw data files)
|
|
69
|
+
const ext = path.extname(remotePath).substring(1).toLowerCase() || "csv";
|
|
70
|
+
const targetPath = this.getTargetPath(datasetId, ext);
|
|
71
|
+
this.store.registerDownload(datasetId, targetPath, "downloading");
|
|
72
|
+
try {
|
|
73
|
+
await this.hfDownloader.download(datasetId, remotePath, targetPath, (progress) => {
|
|
74
|
+
onProgress?.(`Downloading ${path.basename(remotePath)}...`, progress);
|
|
75
|
+
});
|
|
76
|
+
const resolvedPath = await this.hfDownloader.resolveExternalDataFromMetadata(targetPath, (progress) => {
|
|
77
|
+
onProgress?.("Resolving external dataset file...", progress);
|
|
78
|
+
});
|
|
79
|
+
const stats = fs.statSync(resolvedPath);
|
|
80
|
+
this.completeDownload(datasetId, resolvedPath, stats.size);
|
|
81
|
+
return resolvedPath;
|
|
82
|
+
}
|
|
83
|
+
catch (e) {
|
|
84
|
+
this.failDownload(datasetId, e.message);
|
|
85
|
+
throw e;
|
|
86
|
+
}
|
|
81
87
|
}
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
88
|
+
else {
|
|
89
|
+
// Fallback: Use Python datasets library to download and convert
|
|
90
|
+
onProgress?.("No raw files found. Using HuggingFace datasets library to download...");
|
|
91
|
+
const targetPath = this.getTargetPath(datasetId, "parquet");
|
|
92
|
+
this.store.registerDownload(datasetId, targetPath, "downloading");
|
|
93
|
+
try {
|
|
94
|
+
const result = await this.hfDatasetsFallback(datasetId, targetPath, onProgress);
|
|
95
|
+
const stats = fs.statSync(result);
|
|
96
|
+
this.completeDownload(datasetId, result, stats.size);
|
|
97
|
+
return result;
|
|
98
|
+
}
|
|
99
|
+
catch (e) {
|
|
100
|
+
this.failDownload(datasetId, e.message);
|
|
101
|
+
throw e;
|
|
102
|
+
}
|
|
85
103
|
}
|
|
86
104
|
}
|
|
87
105
|
else if (source === "kaggle") {
|
|
@@ -159,4 +177,85 @@ export class DataIngestor {
|
|
|
159
177
|
const safeId = datasetId.replace(/\//g, "_").replace(/:/g, "_");
|
|
160
178
|
return path.join(this.rawDataDir, `${safeId}.${extension}`);
|
|
161
179
|
}
|
|
180
|
+
/**
|
|
181
|
+
* Fallback: Use Python `datasets` library to download a HuggingFace dataset
|
|
182
|
+
* when no raw data files are found in the repo file listing.
|
|
183
|
+
*/
|
|
184
|
+
async hfDatasetsFallback(datasetId, targetPath, onProgress) {
|
|
185
|
+
const pyCmd = process.platform === "win32" ? "py" : "python";
|
|
186
|
+
// Resolve the fallback script path
|
|
187
|
+
const homeDir = process.env.HOME || process.env.USERPROFILE || this.projectRoot;
|
|
188
|
+
const dataRoot = path.join(homeDir, ".vesper");
|
|
189
|
+
const scriptCandidates = [
|
|
190
|
+
path.resolve(dataRoot, "python", "hf_fallback.py"),
|
|
191
|
+
path.resolve(this.projectRoot, "python", "hf_fallback.py"),
|
|
192
|
+
path.resolve(this.projectRoot, "..", "src", "python", "hf_fallback.py"),
|
|
193
|
+
path.resolve(this.projectRoot, "..", "python", "hf_fallback.py"),
|
|
194
|
+
];
|
|
195
|
+
let scriptPath = scriptCandidates.find(p => fs.existsSync(p));
|
|
196
|
+
if (!scriptPath) {
|
|
197
|
+
scriptPath = scriptCandidates[0]; // Will fail with a clear error
|
|
198
|
+
}
|
|
199
|
+
const token = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN || undefined;
|
|
200
|
+
const payload = {
|
|
201
|
+
repo_id: datasetId,
|
|
202
|
+
output_path: targetPath,
|
|
203
|
+
token: token || null,
|
|
204
|
+
max_rows: 500000,
|
|
205
|
+
};
|
|
206
|
+
onProgress?.("Downloading via datasets library (this may take a moment)...", 30);
|
|
207
|
+
return new Promise((resolve, reject) => {
|
|
208
|
+
const proc = spawn(pyCmd, [scriptPath, JSON.stringify(payload)], {
|
|
209
|
+
env: {
|
|
210
|
+
...process.env,
|
|
211
|
+
PYTHONUTF8: "1",
|
|
212
|
+
PIP_DISABLE_PIP_VERSION_CHECK: "1",
|
|
213
|
+
},
|
|
214
|
+
});
|
|
215
|
+
let stdout = "";
|
|
216
|
+
let stderr = "";
|
|
217
|
+
proc.stdout.on("data", (d) => (stdout += d.toString()));
|
|
218
|
+
proc.stderr.on("data", (d) => {
|
|
219
|
+
const msg = d.toString();
|
|
220
|
+
stderr += msg;
|
|
221
|
+
// Forward progress info
|
|
222
|
+
if (msg.includes("Downloading") || msg.includes("Loading")) {
|
|
223
|
+
onProgress?.(msg.trim().split("\n").pop() || "Downloading...", 50);
|
|
224
|
+
}
|
|
225
|
+
});
|
|
226
|
+
const timer = setTimeout(() => {
|
|
227
|
+
try {
|
|
228
|
+
proc.kill();
|
|
229
|
+
}
|
|
230
|
+
catch { /* no-op */ }
|
|
231
|
+
reject(new Error(`HuggingFace datasets download timed out after 10 minutes for ${datasetId}`));
|
|
232
|
+
}, 600000); // 10 min timeout
|
|
233
|
+
proc.on("close", (code) => {
|
|
234
|
+
clearTimeout(timer);
|
|
235
|
+
if (code !== 0) {
|
|
236
|
+
let errorMsg = stderr || stdout || `Python exited with code ${code}`;
|
|
237
|
+
try {
|
|
238
|
+
const parsed = JSON.parse(stdout);
|
|
239
|
+
if (parsed.error)
|
|
240
|
+
errorMsg = parsed.error;
|
|
241
|
+
}
|
|
242
|
+
catch { /* use stderr */ }
|
|
243
|
+
reject(new Error(`HuggingFace datasets fallback failed: ${errorMsg}`));
|
|
244
|
+
return;
|
|
245
|
+
}
|
|
246
|
+
try {
|
|
247
|
+
const result = JSON.parse(stdout);
|
|
248
|
+
if (!result.ok) {
|
|
249
|
+
reject(new Error(result.error || "Unknown error from HF fallback"));
|
|
250
|
+
return;
|
|
251
|
+
}
|
|
252
|
+
onProgress?.(`Downloaded ${result.rows?.toLocaleString() || "?"} rows (${result.columns?.length || "?"} columns)`, 90);
|
|
253
|
+
resolve(result.path);
|
|
254
|
+
}
|
|
255
|
+
catch {
|
|
256
|
+
reject(new Error(`Failed to parse HF fallback output: ${stdout}`));
|
|
257
|
+
}
|
|
258
|
+
});
|
|
259
|
+
});
|
|
260
|
+
}
|
|
162
261
|
}
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HuggingFace Datasets Library Fallback Downloader.
|
|
3
|
+
|
|
4
|
+
Used when the HF Hub file listing finds no suitable data files
|
|
5
|
+
(e.g. script-based datasets, gated datasets, datasets that use
|
|
6
|
+
the `datasets` library format).
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python hf_fallback.py '{"repo_id": "user/dataset", "output_path": "/path/to/output.parquet", "token": "optional", "max_rows": 100000, "split": "train"}'
|
|
10
|
+
|
|
11
|
+
Output: JSON to stdout
|
|
12
|
+
{"ok": true, "path": "/path/to/output.parquet", "rows": 12345, "columns": ["col1", "col2"]}
|
|
13
|
+
{"ok": false, "error": "..."}
|
|
14
|
+
"""
|
|
15
|
+
import sys
|
|
16
|
+
import json
|
|
17
|
+
import os
|
|
18
|
+
|
|
19
|
+
def _fail(message):
    """Print ``{"ok": false, "error": message}`` as JSON on stdout and exit with status 1."""
    print(json.dumps({"ok": False, "error": message}))
    sys.exit(1)


def _string_field(payload, key):
    """Return ``payload[key]`` with surrounding whitespace stripped.

    A missing or null value yields ``""`` so the caller's "is required" check
    applies; any other non-string value is reported through ``_fail``.
    """
    value = payload.get(key)
    if value is None:
        return ""
    if not isinstance(value, str):
        _fail(f"{key} must be a string")
    return value.strip()


def main():
    """Download a HuggingFace dataset with the ``datasets`` library and export it.

    Reads a JSON object from ``sys.argv[1]`` with the keys ``repo_id``,
    ``output_path`` and optionally ``token``, ``max_rows`` and ``split``.
    Prints exactly one JSON line on stdout: ``{"ok": true, "path", "rows",
    "columns", "split"}`` on success, or ``{"ok": false, "error"}`` followed by
    exit status 1 on any failure.
    """
    if len(sys.argv) < 2:
        _fail("Missing payload argument")

    try:
        payload = json.loads(sys.argv[1])
    except json.JSONDecodeError as e:
        _fail(f"Invalid JSON payload: {e}")

    # Valid JSON that is not an object (e.g. "[]") has no .get(); report it in
    # the documented JSON format instead of crashing with a traceback.
    if not isinstance(payload, dict):
        _fail("Payload must be a JSON object")

    repo_id = _string_field(payload, "repo_id")
    output_path = _string_field(payload, "output_path")
    token = payload.get("token") or os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or None
    max_rows = payload.get("max_rows", 500000)
    split = payload.get("split")  # None = auto-detect

    if not repo_id:
        _fail("repo_id is required")

    if not output_path:
        _fail("output_path is required")

    try:
        from datasets import load_dataset, DatasetDict
    except ImportError:
        _fail("Python 'datasets' library not installed. Install with: pip install datasets")

    try:
        # If split is not specified, try common ones; None means "load without
        # a split argument" and let the library return whatever it has.
        splits_to_try = [split] if split else ["train", "test", "validation", None]

        ds = None
        used_split = None
        last_error = None  # most recent error swallowed while probing splits

        for s in splits_to_try:
            kwargs = {
                "path": repo_id,
                # NOTE(review): this allows dataset loading scripts from the
                # repo to run locally. Confirm this is acceptable for
                # arbitrary, user-supplied repo ids.
                "trust_remote_code": True,
            }
            if token:
                kwargs["token"] = token
            if s:
                kwargs["split"] = s

            try:
                ds = load_dataset(**kwargs)
                used_split = s
                break
            except (ValueError, KeyError) as e:
                # Treated as "split doesn't exist", try next
                last_error = e
                continue
            except Exception as e:
                if "split" in str(e).lower() or "key" in str(e).lower():
                    last_error = e
                    continue
                raise

        if ds is None:
            # Surface the last swallowed error so the caller can see why every
            # attempt failed.
            detail = f" Last error: {last_error}" if last_error is not None else ""
            _fail(f"Could not load dataset '{repo_id}'. No valid splits found.{detail}")

        # Handle DatasetDict (when no split specified)
        if isinstance(ds, DatasetDict):
            # Pick the best split
            for preferred in ["train", "test", "validation"]:
                if preferred in ds:
                    ds = ds[preferred]
                    used_split = preferred
                    break
            else:
                # Just pick the first available split
                first_key = list(ds.keys())[0]
                ds = ds[first_key]
                used_split = first_key

        # Limit rows if needed. Only a positive number is a limit; zero,
        # negative or null values mean "no limit".
        total_rows = len(ds)
        if isinstance(max_rows, (int, float)) and not isinstance(max_rows, bool) and max_rows >= 1:
            limit = int(max_rows)
            if total_rows > limit:
                ds = ds.select(range(limit))
                total_rows = limit

        # Ensure output directory exists. A bare filename has an empty dirname,
        # which os.makedirs rejects.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        columns = ds.column_names

        # Export: CSV only when explicitly requested by extension, parquet otherwise.
        if output_path.endswith(".csv"):
            ds.to_csv(output_path)
        else:
            if not output_path.endswith(".parquet"):
                output_path = output_path + ".parquet"
            ds.to_parquet(output_path)

        print(json.dumps({
            "ok": True,
            "path": output_path,
            "rows": total_rows,
            "columns": columns,
            "split": used_split
        }))

    except Exception as e:
        error_msg = str(e) or type(e).__name__
        # Provide helpful hints
        if "401" in error_msg or "403" in error_msg or "gated" in error_msg.lower():
            error_msg += " (This dataset may be gated/private. Set HF_TOKEN via configure_keys tool.)"
        elif (
            isinstance(e, FileNotFoundError)  # str(e) does not contain the class name
            or "FileNotFoundError" in error_msg
            or "does not exist" in error_msg.lower()
        ):
            error_msg = f"Dataset '{repo_id}' not found on HuggingFace. Check the dataset ID."

        _fail(error_msg)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
if __name__ == "__main__":
|
|
147
|
+
main()
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vespermcp/mcp-server",
|
|
3
|
-
"version": "1.2.
|
|
3
|
+
"version": "1.2.20",
|
|
4
4
|
"description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "build/index.js",
|
package/scripts/wizard.js
CHANGED
|
@@ -118,7 +118,7 @@ function getAllAgentConfigs() {
|
|
|
118
118
|
|
|
119
119
|
function installMcpToAgent(agent) {
|
|
120
120
|
const npxCmd = IS_WIN ? 'npx.cmd' : 'npx';
|
|
121
|
-
const serverEntry = { command: npxCmd, args: ['-y', '
|
|
121
|
+
const serverEntry = { command: npxCmd, args: ['-y', '@vespermcp/mcp-server@latest'] };
|
|
122
122
|
|
|
123
123
|
try {
|
|
124
124
|
if (agent.format === 'toml') {
|
|
@@ -156,7 +156,7 @@ function installMcpToAgent(agent) {
|
|
|
156
156
|
async function checkServerHealth() {
|
|
157
157
|
try {
|
|
158
158
|
// Quick stdio check — spawn server and see if it responds
|
|
159
|
-
const result = spawnSync(IS_WIN ? 'npx.cmd' : 'npx', ['-y', '
|
|
159
|
+
const result = spawnSync(IS_WIN ? 'npx.cmd' : 'npx', ['-y', '@vespermcp/mcp-server@latest', '--version'], {
|
|
160
160
|
timeout: 10000,
|
|
161
161
|
encoding: 'utf8',
|
|
162
162
|
stdio: ['pipe', 'pipe', 'pipe'],
|
|
@@ -202,13 +202,13 @@ async function main() {
|
|
|
202
202
|
console.log(`\n ${dim('[')}${cyan('4/6')}${dim(']')} Installing Vesper MCP server...`);
|
|
203
203
|
try {
|
|
204
204
|
const npmCmd = IS_WIN ? 'npx.cmd' : 'npx';
|
|
205
|
-
spawnSync(npmCmd, ['-y', '
|
|
205
|
+
spawnSync(npmCmd, ['-y', '@vespermcp/mcp-server@latest', '--setup', '--silent'], {
|
|
206
206
|
stdio: 'inherit',
|
|
207
207
|
timeout: 120000,
|
|
208
208
|
});
|
|
209
209
|
console.log(` ${green('✓')} @vespermcp/mcp-server installed`);
|
|
210
210
|
} catch {
|
|
211
|
-
console.log(` ${yellow('⚠')} Could not auto-install — run manually: npx -y
|
|
211
|
+
console.log(` ${yellow('⚠')} Could not auto-install — run manually: npx -y @vespermcp/mcp-server@latest --setup`);
|
|
212
212
|
}
|
|
213
213
|
|
|
214
214
|
// ─── Step 5: Auto-configure all detected IDEs ──────────────
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HuggingFace Datasets Library Fallback Downloader.
|
|
3
|
+
|
|
4
|
+
Used when the HF Hub file listing finds no suitable data files
|
|
5
|
+
(e.g. script-based datasets, gated datasets, datasets that use
|
|
6
|
+
the `datasets` library format).
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python hf_fallback.py '{"repo_id": "user/dataset", "output_path": "/path/to/output.parquet", "token": "optional", "max_rows": 100000, "split": "train"}'
|
|
10
|
+
|
|
11
|
+
Output: JSON to stdout
|
|
12
|
+
{"ok": true, "path": "/path/to/output.parquet", "rows": 12345, "columns": ["col1", "col2"]}
|
|
13
|
+
{"ok": false, "error": "..."}
|
|
14
|
+
"""
|
|
15
|
+
import sys
|
|
16
|
+
import json
|
|
17
|
+
import os
|
|
18
|
+
|
|
19
|
+
def _fail(message):
    """Print ``{"ok": false, "error": message}`` as JSON on stdout and exit with status 1."""
    print(json.dumps({"ok": False, "error": message}))
    sys.exit(1)


def _string_field(payload, key):
    """Return ``payload[key]`` with surrounding whitespace stripped.

    A missing or null value yields ``""`` so the caller's "is required" check
    applies; any other non-string value is reported through ``_fail``.
    """
    value = payload.get(key)
    if value is None:
        return ""
    if not isinstance(value, str):
        _fail(f"{key} must be a string")
    return value.strip()


def main():
    """Download a HuggingFace dataset with the ``datasets`` library and export it.

    Reads a JSON object from ``sys.argv[1]`` with the keys ``repo_id``,
    ``output_path`` and optionally ``token``, ``max_rows`` and ``split``.
    Prints exactly one JSON line on stdout: ``{"ok": true, "path", "rows",
    "columns", "split"}`` on success, or ``{"ok": false, "error"}`` followed by
    exit status 1 on any failure.
    """
    if len(sys.argv) < 2:
        _fail("Missing payload argument")

    try:
        payload = json.loads(sys.argv[1])
    except json.JSONDecodeError as e:
        _fail(f"Invalid JSON payload: {e}")

    # Valid JSON that is not an object (e.g. "[]") has no .get(); report it in
    # the documented JSON format instead of crashing with a traceback.
    if not isinstance(payload, dict):
        _fail("Payload must be a JSON object")

    repo_id = _string_field(payload, "repo_id")
    output_path = _string_field(payload, "output_path")
    token = payload.get("token") or os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or None
    max_rows = payload.get("max_rows", 500000)
    split = payload.get("split")  # None = auto-detect

    if not repo_id:
        _fail("repo_id is required")

    if not output_path:
        _fail("output_path is required")

    try:
        from datasets import load_dataset, DatasetDict
    except ImportError:
        _fail("Python 'datasets' library not installed. Install with: pip install datasets")

    try:
        # If split is not specified, try common ones; None means "load without
        # a split argument" and let the library return whatever it has.
        splits_to_try = [split] if split else ["train", "test", "validation", None]

        ds = None
        used_split = None
        last_error = None  # most recent error swallowed while probing splits

        for s in splits_to_try:
            kwargs = {
                "path": repo_id,
                # NOTE(review): this allows dataset loading scripts from the
                # repo to run locally. Confirm this is acceptable for
                # arbitrary, user-supplied repo ids.
                "trust_remote_code": True,
            }
            if token:
                kwargs["token"] = token
            if s:
                kwargs["split"] = s

            try:
                ds = load_dataset(**kwargs)
                used_split = s
                break
            except (ValueError, KeyError) as e:
                # Treated as "split doesn't exist", try next
                last_error = e
                continue
            except Exception as e:
                if "split" in str(e).lower() or "key" in str(e).lower():
                    last_error = e
                    continue
                raise

        if ds is None:
            # Surface the last swallowed error so the caller can see why every
            # attempt failed.
            detail = f" Last error: {last_error}" if last_error is not None else ""
            _fail(f"Could not load dataset '{repo_id}'. No valid splits found.{detail}")

        # Handle DatasetDict (when no split specified)
        if isinstance(ds, DatasetDict):
            # Pick the best split
            for preferred in ["train", "test", "validation"]:
                if preferred in ds:
                    ds = ds[preferred]
                    used_split = preferred
                    break
            else:
                # Just pick the first available split
                first_key = list(ds.keys())[0]
                ds = ds[first_key]
                used_split = first_key

        # Limit rows if needed. Only a positive number is a limit; zero,
        # negative or null values mean "no limit".
        total_rows = len(ds)
        if isinstance(max_rows, (int, float)) and not isinstance(max_rows, bool) and max_rows >= 1:
            limit = int(max_rows)
            if total_rows > limit:
                ds = ds.select(range(limit))
                total_rows = limit

        # Ensure output directory exists. A bare filename has an empty dirname,
        # which os.makedirs rejects.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        columns = ds.column_names

        # Export: CSV only when explicitly requested by extension, parquet otherwise.
        if output_path.endswith(".csv"):
            ds.to_csv(output_path)
        else:
            if not output_path.endswith(".parquet"):
                output_path = output_path + ".parquet"
            ds.to_parquet(output_path)

        print(json.dumps({
            "ok": True,
            "path": output_path,
            "rows": total_rows,
            "columns": columns,
            "split": used_split
        }))

    except Exception as e:
        error_msg = str(e) or type(e).__name__
        # Provide helpful hints
        if "401" in error_msg or "403" in error_msg or "gated" in error_msg.lower():
            error_msg += " (This dataset may be gated/private. Set HF_TOKEN via configure_keys tool.)"
        elif (
            isinstance(e, FileNotFoundError)  # str(e) does not contain the class name
            or "FileNotFoundError" in error_msg
            or "does not exist" in error_msg.lower()
        ):
            error_msg = f"Dataset '{repo_id}' not found on HuggingFace. Check the dataset ID."

        _fail(error_msg)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
if __name__ == "__main__":
|
|
147
|
+
main()
|