@vespermcp/mcp-server 1.2.18 → 1.2.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -451,7 +451,19 @@ jobManager.on("processJob", async (job, execute) => {
451
451
  * Logic for preparing a dataset (Search + Ingest + Process)
452
452
  */
453
453
  async function handlePrepareJob(jobId, query, requirements) {
454
+ hydrateExternalKeys();
454
455
  const update = (updates) => jobManager.updateJob(jobId, updates);
456
+ // Ensure core Python packages are available for dataset operations
457
+ try {
458
+ await ensurePythonModules([
459
+ { module: "polars", packageName: "polars" },
460
+ { module: "datasets", packageName: "datasets" },
461
+ ]);
462
+ }
463
+ catch (e) {
464
+ console.error(`[Prepare] Python dependency setup warning: ${e.message}`);
465
+ // Continue anyway - direct file downloads may still work without datasets lib
466
+ }
455
467
  const requestedRows = extractRequestedRows(query, requirements);
456
468
  let selectedDataset;
457
469
  let datasetIdForDownload = "";
@@ -480,7 +492,8 @@ async function handlePrepareJob(jobId, query, requirements) {
480
492
  datasetIdForDownload = explicitId.replace(/^dataworld:/i, "");
481
493
  }
482
494
  else {
483
- source = "kaggle";
495
+ // Default to HuggingFace for ambiguous refs (user/dataset without prefix)
496
+ source = "huggingface";
484
497
  datasetIdForDownload = explicitId;
485
498
  }
486
499
  update({
@@ -490,11 +503,21 @@ async function handlePrepareJob(jobId, query, requirements) {
490
503
  }
491
504
  else {
492
505
  update({ progress: 10, status_text: "Searching for best dataset matching query..." });
493
- const results = await searchEngine.search(query, { limit: 1 });
506
+ const results = await searchEngine.search(query, { limit: 10 });
494
507
  if (results.length === 0) {
495
508
  throw new Error("No datasets found matching the query. Try refining your search terms.");
496
509
  }
497
- selectedDataset = results[0];
510
+ // Pick the best result that we can actually download (skip sources requiring missing credentials)
511
+ const hasKaggleCreds = !!(process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY && process.env.KAGGLE_USERNAME !== "YOUR_KAGGLE_USERNAME");
512
+ const hasDwToken = hasDataWorldToken();
513
+ selectedDataset = results.find(r => {
514
+ const s = (r.source || "").toLowerCase();
515
+ if (s === "kaggle" && !hasKaggleCreds)
516
+ return false;
517
+ if (s === "dataworld" && !hasDwToken)
518
+ return false;
519
+ return true;
520
+ }) || results[0]; // Fallback to first if all require credentials
498
521
  datasetIdForDownload = selectedDataset.id;
499
522
  source = selectedDataset.source;
500
523
  update({
@@ -502,13 +525,16 @@ async function handlePrepareJob(jobId, query, requirements) {
502
525
  status_text: `Matched: ${selectedDataset.name} (${selectedDataset.source})`
503
526
  });
504
527
  }
505
- // Pre-check credentials for Kaggle
528
+ // Pre-check credentials for sources that require them
506
529
  if (source === "kaggle") {
507
- if (!process.env.KAGGLE_USERNAME || !process.env.KAGGLE_KEY ||
508
- process.env.KAGGLE_USERNAME === "YOUR_KAGGLE_USERNAME") {
509
- throw new Error("Kaggle credentials not set. Use 'kaggle login' or set KAGGLE_USERNAME/KAGGLE_KEY.");
530
+ const hasKaggleCreds = !!(process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY && process.env.KAGGLE_USERNAME !== "YOUR_KAGGLE_USERNAME");
531
+ if (!hasKaggleCreds) {
532
+ throw new Error("Kaggle credentials not set. Use the configure_keys tool or set KAGGLE_USERNAME/KAGGLE_KEY environment variables.");
510
533
  }
511
534
  }
535
+ if (source === "dataworld" && !hasDataWorldToken()) {
536
+ throw new Error("data.world token not set. Use the configure_keys tool or set DW_AUTH_TOKEN environment variable.");
537
+ }
512
538
  update({ progress: 30, status_text: `Starting download from ${source}...` });
513
539
  // ensureData handles download and returns path to the raw file
514
540
  let rawFilePath = await dataIngestor.ensureData(datasetIdForDownload, source, (msg, prog) => {
@@ -604,22 +630,49 @@ async function handlePrepareJob(jobId, query, requirements) {
604
630
  */
605
631
  async function handleCleanJob(jobId, datasetId, ops) {
606
632
  const update = (updates) => jobManager.updateJob(jobId, updates);
607
- const safeId = datasetId.replace(/\//g, "_");
608
- const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
609
- const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
610
- let filePath = parquetPath;
611
- if (!fs.existsSync(filePath)) {
612
- filePath = csvPath;
613
- }
614
- if (datasetId === "demo" || !fs.existsSync(filePath)) {
633
+ // Resolve dataset file path from multiple sources
634
+ let filePath;
635
+ // 1. Check registry (most reliable - includes prepared/fused datasets)
636
+ const regEntry = getRegistryEntry(datasetId);
637
+ const regPath = regEntry?.local_path || regEntry?.path;
638
+ if (regPath && fs.existsSync(regPath)) {
639
+ filePath = regPath;
640
+ }
641
+ // 2. Check download status from metadata store
642
+ if (!filePath) {
643
+ const dlStatus = metadataStore.getDownloadStatus(datasetId);
644
+ if (dlStatus?.local_path && fs.existsSync(dlStatus.local_path)) {
645
+ filePath = dlStatus.local_path;
646
+ }
647
+ }
648
+ // 3. Check standard raw data paths
649
+ if (!filePath) {
650
+ const safeId = datasetId.replace(/\//g, "_");
651
+ const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
652
+ const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
653
+ const featherPath = path.join(dataRoot, "data", "raw", `${safeId}.feather`);
654
+ if (fs.existsSync(parquetPath))
655
+ filePath = parquetPath;
656
+ else if (fs.existsSync(csvPath))
657
+ filePath = csvPath;
658
+ else if (fs.existsSync(featherPath))
659
+ filePath = featherPath;
660
+ }
661
+ // 4. Check if it's a direct file path
662
+ if (!filePath && fs.existsSync(datasetId)) {
663
+ filePath = datasetId;
664
+ }
665
+ // 5. Demo fallback
666
+ if (!filePath && datasetId === "demo") {
615
667
  const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
616
668
  const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
617
669
  if (fs.existsSync(demoParquetPath))
618
670
  filePath = demoParquetPath;
619
671
  else if (fs.existsSync(demoCsvPath))
620
672
  filePath = demoCsvPath;
621
- else
622
- throw new Error(`Data file not found for ${datasetId}`);
673
+ }
674
+ if (!filePath) {
675
+ throw new Error(`Data file not found for '${datasetId}'. Download the dataset first using download_dataset or prepare_dataset.`);
623
676
  }
624
677
  update({ status_text: "Cleaning dataset..." });
625
678
  const result = await dataCleaner.clean(filePath, ops);
@@ -684,14 +737,14 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
684
737
  },
685
738
  {
686
739
  name: "download_dataset",
687
- description: "Download a dataset by source and ID/slug into local Vesper storage. Kaggle requires optional API key.",
740
+ description: "Download a dataset by source and ID/slug into local Vesper storage. Defaults to HuggingFace. Kaggle and data.world require API keys (use configure_keys first).",
688
741
  inputSchema: {
689
742
  type: "object",
690
743
  properties: {
691
744
  source: {
692
745
  type: "string",
693
746
  enum: ["huggingface", "kaggle", "openml", "dataworld"],
694
- description: "Dataset source.",
747
+ description: "Dataset source (default: huggingface). HuggingFace and OpenML work without credentials.",
695
748
  },
696
749
  dataset_id: {
697
750
  type: "string",
@@ -702,7 +755,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
702
755
  description: "Optional target directory for downloaded files.",
703
756
  }
704
757
  },
705
- required: ["source", "dataset_id"],
758
+ required: ["dataset_id"],
706
759
  },
707
760
  },
708
761
  {
@@ -793,7 +846,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
793
846
  },
794
847
  {
795
848
  name: "custom_clean",
796
- description: "Apply specific cleaning operations to a dataset as an asynchronous job.",
849
+ description: "Apply specific cleaning operations to a dataset as an asynchronous job. Supports: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories. The dataset must be downloaded first.",
797
850
  inputSchema: {
798
851
  type: "object",
799
852
  properties: {
@@ -818,7 +871,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
818
871
  },
819
872
  {
820
873
  name: "prepare_dataset",
821
- description: "Full pipeline: Analyze, Clean, Split, and Export as an asynchronous job.",
874
+ description: "Full pipeline: Search, Download, Analyze, Clean, Split, and Install a dataset as an asynchronous job. Automatically selects the best available source (prefers HuggingFace/OpenML when no Kaggle credentials are set). Use check_job_status to monitor progress.",
822
875
  inputSchema: {
823
876
  type: "object",
824
877
  properties: {
@@ -1110,7 +1163,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1110
1163
  if (source === "kaggle") {
1111
1164
  if (!dataIngestor.hasKaggleCredentials()) {
1112
1165
  return {
1113
- content: [{ type: "text", text: `Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).` }],
1166
+ content: [{ type: "text", text: `Kaggle requires API credentials. Use the configure_keys tool to set kaggle_username and kaggle_key, or try source='huggingface' which works without credentials.` }],
1114
1167
  isError: true,
1115
1168
  };
1116
1169
  }
@@ -1166,23 +1219,34 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1166
1219
  }
1167
1220
  case "download_dataset": {
1168
1221
  hydrateExternalKeys();
1169
- const source = String(request.params.arguments?.source || "").toLowerCase();
1222
+ const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1170
1223
  const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1171
- if (!source || !datasetId) {
1172
- throw new McpError(ErrorCode.InvalidParams, "source and dataset_id are required");
1224
+ if (!datasetId) {
1225
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1173
1226
  }
1174
1227
  if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
1175
1228
  return {
1176
- content: [{ type: "text", text: `Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).` }],
1229
+ content: [{ type: "text", text: `Kaggle requires API credentials. Use the configure_keys tool to set kaggle_username and kaggle_key, or switch to source='huggingface' which works without credentials.` }],
1177
1230
  isError: true,
1178
1231
  };
1179
1232
  }
1180
1233
  if (source === "dataworld" && !hasDataWorldToken()) {
1181
1234
  return {
1182
- content: [{ type: "text", text: "data.world requires API token. Run 'vespermcp config keys' and set dataworld_token." }],
1235
+ content: [{ type: "text", text: "data.world requires API token. Use the configure_keys tool to set dataworld_token, or switch to source='huggingface' which works without credentials." }],
1183
1236
  isError: true,
1184
1237
  };
1185
1238
  }
1239
+ // Pre-install Python datasets library for HuggingFace fallback
1240
+ if (source === "huggingface") {
1241
+ try {
1242
+ await ensurePythonModules([
1243
+ { module: "datasets", packageName: "datasets" },
1244
+ ]);
1245
+ }
1246
+ catch {
1247
+ // Continue - direct download may still work
1248
+ }
1249
+ }
1186
1250
  try {
1187
1251
  const localPath = await dataIngestor.ensureData(datasetId, source, () => undefined);
1188
1252
  try {
@@ -1460,18 +1524,45 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1460
1524
  case "custom_clean": {
1461
1525
  const datasetId = String(request.params.arguments?.dataset_id);
1462
1526
  const ops = request.params.arguments?.operations;
1527
+ if (!datasetId || datasetId === "undefined") {
1528
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1529
+ }
1530
+ if (!ops || !Array.isArray(ops) || ops.length === 0) {
1531
+ throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
1532
+ }
1533
+ // Pre-check: verify dataset file exists before starting the job
1534
+ const cleanRegEntry = getRegistryEntry(datasetId);
1535
+ const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
1536
+ const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
1537
+ const cleanSafeId = datasetId.replace(/\//g, "_");
1538
+ const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
1539
+ (cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
1540
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
1541
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
1542
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
1543
+ fs.existsSync(datasetId);
1544
+ if (!cleanDataExists) {
1545
+ return {
1546
+ content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
1547
+ isError: true,
1548
+ };
1549
+ }
1463
1550
  const job = jobManager.createJob("clean", 0, { datasetId, ops });
1464
1551
  return {
1465
- content: [{ type: "text", text: `Job started successfully. ID: ${job.id}. Use check_job_status to monitor progress.` }]
1552
+ content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
1466
1553
  };
1467
1554
  }
1468
1555
  case "prepare_dataset": {
1556
+ hydrateExternalKeys();
1469
1557
  const query = String(request.params.arguments?.query);
1470
1558
  const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
1471
1559
  const downloadImages = request.params.arguments?.download_images === true;
1560
+ if (!query || query === "undefined") {
1561
+ throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
1562
+ }
1472
1563
  const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
1473
1564
  return {
1474
- content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you.` }]
1565
+ content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
1475
1566
  };
1476
1567
  }
1477
1568
  case "compare_datasets": {
@@ -1,5 +1,6 @@
1
1
  import path from "path";
2
2
  import fs from "fs";
3
+ import { spawn } from "child_process";
3
4
  import { HFDownloader } from "./hf-downloader.js";
4
5
  import { KaggleSource } from "../metadata/kaggle-source.js";
5
6
  import { OpenMLSource } from "../metadata/openml-source.js";
@@ -63,25 +64,42 @@ export class DataIngestor {
63
64
  if (source === "huggingface") {
64
65
  onProgress?.("Discovering data files on HuggingFace Hub...");
65
66
  const remotePath = await this.hfDownloader.findBestFile(datasetId);
66
- if (!remotePath)
67
- throw new Error(`No suitable data files found in HuggingFace repo: ${datasetId}`);
68
- const ext = path.extname(remotePath).substring(1).toLowerCase() || "csv";
69
- const targetPath = this.getTargetPath(datasetId, ext);
70
- this.store.registerDownload(datasetId, targetPath, "downloading");
71
- try {
72
- await this.hfDownloader.download(datasetId, remotePath, targetPath, (progress) => {
73
- onProgress?.(`Downloading ${path.basename(remotePath)}...`, progress);
74
- });
75
- const resolvedPath = await this.hfDownloader.resolveExternalDataFromMetadata(targetPath, (progress) => {
76
- onProgress?.("Resolving external dataset file...", progress);
77
- });
78
- const stats = fs.statSync(resolvedPath);
79
- this.completeDownload(datasetId, resolvedPath, stats.size);
80
- return resolvedPath;
67
+ if (remotePath) {
68
+ // Direct file download path (repo has raw data files)
69
+ const ext = path.extname(remotePath).substring(1).toLowerCase() || "csv";
70
+ const targetPath = this.getTargetPath(datasetId, ext);
71
+ this.store.registerDownload(datasetId, targetPath, "downloading");
72
+ try {
73
+ await this.hfDownloader.download(datasetId, remotePath, targetPath, (progress) => {
74
+ onProgress?.(`Downloading ${path.basename(remotePath)}...`, progress);
75
+ });
76
+ const resolvedPath = await this.hfDownloader.resolveExternalDataFromMetadata(targetPath, (progress) => {
77
+ onProgress?.("Resolving external dataset file...", progress);
78
+ });
79
+ const stats = fs.statSync(resolvedPath);
80
+ this.completeDownload(datasetId, resolvedPath, stats.size);
81
+ return resolvedPath;
82
+ }
83
+ catch (e) {
84
+ this.failDownload(datasetId, e.message);
85
+ throw e;
86
+ }
81
87
  }
82
- catch (e) {
83
- this.failDownload(datasetId, e.message);
84
- throw e;
88
+ else {
89
+ // Fallback: Use Python datasets library to download and convert
90
+ onProgress?.("No raw files found. Using HuggingFace datasets library to download...");
91
+ const targetPath = this.getTargetPath(datasetId, "parquet");
92
+ this.store.registerDownload(datasetId, targetPath, "downloading");
93
+ try {
94
+ const result = await this.hfDatasetsFallback(datasetId, targetPath, onProgress);
95
+ const stats = fs.statSync(result);
96
+ this.completeDownload(datasetId, result, stats.size);
97
+ return result;
98
+ }
99
+ catch (e) {
100
+ this.failDownload(datasetId, e.message);
101
+ throw e;
102
+ }
85
103
  }
86
104
  }
87
105
  else if (source === "kaggle") {
@@ -159,4 +177,85 @@ export class DataIngestor {
159
177
  const safeId = datasetId.replace(/\//g, "_").replace(/:/g, "_");
160
178
  return path.join(this.rawDataDir, `${safeId}.${extension}`);
161
179
  }
180
+ /**
181
+ * Fallback: Use Python `datasets` library to download a HuggingFace dataset
182
+ * when no raw data files are found in the repo file listing.
183
+ */
184
+ async hfDatasetsFallback(datasetId, targetPath, onProgress) {
185
+ const pyCmd = process.platform === "win32" ? "py" : "python";
186
+ // Resolve the fallback script path
187
+ const homeDir = process.env.HOME || process.env.USERPROFILE || this.projectRoot;
188
+ const dataRoot = path.join(homeDir, ".vesper");
189
+ const scriptCandidates = [
190
+ path.resolve(dataRoot, "python", "hf_fallback.py"),
191
+ path.resolve(this.projectRoot, "python", "hf_fallback.py"),
192
+ path.resolve(this.projectRoot, "..", "src", "python", "hf_fallback.py"),
193
+ path.resolve(this.projectRoot, "..", "python", "hf_fallback.py"),
194
+ ];
195
+ let scriptPath = scriptCandidates.find(p => fs.existsSync(p));
196
+ if (!scriptPath) {
197
+ scriptPath = scriptCandidates[0]; // Will fail with a clear error
198
+ }
199
+ const token = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN || undefined;
200
+ const payload = {
201
+ repo_id: datasetId,
202
+ output_path: targetPath,
203
+ token: token || null,
204
+ max_rows: 500000,
205
+ };
206
+ onProgress?.("Downloading via datasets library (this may take a moment)...", 30);
207
+ return new Promise((resolve, reject) => {
208
+ const proc = spawn(pyCmd, [scriptPath, JSON.stringify(payload)], {
209
+ env: {
210
+ ...process.env,
211
+ PYTHONUTF8: "1",
212
+ PIP_DISABLE_PIP_VERSION_CHECK: "1",
213
+ },
214
+ });
215
+ let stdout = "";
216
+ let stderr = "";
217
+ proc.stdout.on("data", (d) => (stdout += d.toString()));
218
+ proc.stderr.on("data", (d) => {
219
+ const msg = d.toString();
220
+ stderr += msg;
221
+ // Forward progress info
222
+ if (msg.includes("Downloading") || msg.includes("Loading")) {
223
+ onProgress?.(msg.trim().split("\n").pop() || "Downloading...", 50);
224
+ }
225
+ });
226
+ const timer = setTimeout(() => {
227
+ try {
228
+ proc.kill();
229
+ }
230
+ catch { /* no-op */ }
231
+ reject(new Error(`HuggingFace datasets download timed out after 10 minutes for ${datasetId}`));
232
+ }, 600000); // 10 min timeout
233
+ proc.on("close", (code) => {
234
+ clearTimeout(timer);
235
+ if (code !== 0) {
236
+ let errorMsg = stderr || stdout || `Python exited with code ${code}`;
237
+ try {
238
+ const parsed = JSON.parse(stdout);
239
+ if (parsed.error)
240
+ errorMsg = parsed.error;
241
+ }
242
+ catch { /* use stderr */ }
243
+ reject(new Error(`HuggingFace datasets fallback failed: ${errorMsg}`));
244
+ return;
245
+ }
246
+ try {
247
+ const result = JSON.parse(stdout);
248
+ if (!result.ok) {
249
+ reject(new Error(result.error || "Unknown error from HF fallback"));
250
+ return;
251
+ }
252
+ onProgress?.(`Downloaded ${result.rows?.toLocaleString() || "?"} rows (${result.columns?.length || "?"} columns)`, 90);
253
+ resolve(result.path);
254
+ }
255
+ catch {
256
+ reject(new Error(`Failed to parse HF fallback output: ${stdout}`));
257
+ }
258
+ });
259
+ });
260
+ }
162
261
  }
@@ -0,0 +1,147 @@
1
+ """
2
+ HuggingFace Datasets Library Fallback Downloader.
3
+
4
+ Used when the HF Hub file listing finds no suitable data files
5
+ (e.g. script-based datasets, gated datasets, datasets that use
6
+ the `datasets` library format).
7
+
8
+ Usage:
9
+ python hf_fallback.py '{"repo_id": "user/dataset", "output_path": "/path/to/output.parquet", "token": "optional", "max_rows": 100000, "split": "train"}'
10
+
11
+ Output: JSON to stdout
12
+ {"ok": true, "path": "/path/to/output.parquet", "rows": 12345, "columns": ["col1", "col2"]}
13
+ {"ok": false, "error": "..."}
14
+ """
15
+ import sys
16
+ import json
17
+ import os
18
+
19
def main():
    """Download a HuggingFace dataset via the `datasets` library and export it.

    Reads one JSON object from ``sys.argv[1]`` with the keys:
      - ``repo_id`` (required): dataset id passed to ``load_dataset``.
      - ``output_path`` (required): destination file; ``.csv`` exports CSV,
        anything else exports Parquet (``.parquet`` is appended if missing).
      - ``token`` (optional): falls back to the ``HF_TOKEN`` /
        ``HUGGINGFACE_TOKEN`` environment variables.
      - ``max_rows`` (optional, default 500000): row cap applied after loading.
      - ``split`` (optional): split to load; when absent, ``train``, ``test``,
        ``validation`` and finally "no split" are tried in that order.

    Always prints exactly one JSON object to stdout:
    ``{"ok": true, "path", "rows", "columns", "split"}`` on success, or
    ``{"ok": false, "error"}`` followed by ``sys.exit(1)`` on failure.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"ok": False, "error": "Missing payload argument"}))
        sys.exit(1)

    try:
        payload = json.loads(sys.argv[1])
    except json.JSONDecodeError as e:
        print(json.dumps({"ok": False, "error": f"Invalid JSON payload: {e}"}))
        sys.exit(1)

    if not isinstance(payload, dict):
        print(json.dumps({"ok": False, "error": "Invalid JSON payload: expected a JSON object"}))
        sys.exit(1)

    # `or ""` + str() keep JSON null / non-string values from raising before
    # we can report the problem as JSON.
    repo_id = str(payload.get("repo_id") or "").strip()
    output_path = str(payload.get("output_path") or "").strip()
    token = payload.get("token") or os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or None
    max_rows = payload.get("max_rows", 500000)
    split = payload.get("split")  # None = auto-detect

    if not repo_id:
        print(json.dumps({"ok": False, "error": "repo_id is required"}))
        sys.exit(1)

    if not output_path:
        print(json.dumps({"ok": False, "error": "output_path is required"}))
        sys.exit(1)

    try:
        from datasets import DatasetDict, load_dataset
    except ImportError:
        print(json.dumps({"ok": False, "error": "Python 'datasets' library not installed. Install with: pip install datasets"}))
        sys.exit(1)

    try:
        # If split is not specified, try the common ones, then no split at all.
        splits_to_try = [split] if split else ["train", "test", "validation", None]

        ds = None
        used_split = None
        last_error = None  # kept so a "no valid splits" report can show the cause

        for s in splits_to_try:
            # SECURITY NOTE(review): trust_remote_code=True lets the dataset repo's
            # loading script execute on this machine. Kept because script-based
            # datasets are a stated use case of this fallback - confirm that this
            # is acceptable for untrusted repo ids.
            kwargs = {
                "path": repo_id,
                "trust_remote_code": True,
            }
            if token:
                kwargs["token"] = token
            if s:
                kwargs["split"] = s

            try:
                ds = load_dataset(**kwargs)
            except (ValueError, KeyError) as e:
                # Treated as "split doesn't exist"; try the next candidate.
                last_error = e
                continue
            except Exception as e:
                if "split" in str(e).lower() or "key" in str(e).lower():
                    last_error = e
                    continue
                raise
            used_split = s
            break

        if ds is None:
            detail = f" Last error: {last_error}" if last_error is not None else ""
            print(json.dumps({"ok": False, "error": f"Could not load dataset '{repo_id}'. No valid splits found.{detail}"}))
            sys.exit(1)

        # Handle DatasetDict (returned when no split was specified)
        if isinstance(ds, DatasetDict):
            used_split = next(
                (name for name in ("train", "test", "validation") if name in ds),
                None,
            )
            if used_split is None:
                available = list(ds.keys())
                if not available:
                    raise ValueError(f"Dataset '{repo_id}' contains no splits.")
                # Just pick the first available split
                used_split = available[0]
            ds = ds[used_split]

        # Limit rows if needed
        total_rows = len(ds)
        if max_rows and total_rows > max_rows:
            ds = ds.select(range(max_rows))
            total_rows = max_rows

        # Ensure output directory exists. A bare file name has an empty dirname,
        # and os.makedirs("") raises, so only create when there is a directory.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        columns = list(ds.column_names)

        # Export: CSV only when explicitly requested, otherwise Parquet.
        if output_path.endswith(".csv"):
            ds.to_csv(output_path)
        else:
            if not output_path.endswith(".parquet"):
                output_path = output_path + ".parquet"
            ds.to_parquet(output_path)

        print(json.dumps({
            "ok": True,
            "path": output_path,
            "rows": total_rows,
            "columns": columns,
            "split": used_split
        }))

    except Exception as e:
        error_msg = str(e)
        # Provide helpful hints
        if "401" in error_msg or "403" in error_msg or "gated" in error_msg.lower():
            error_msg += " (This dataset may be gated/private. Set HF_TOKEN via configure_keys tool.)"
        elif "FileNotFoundError" in error_msg or "does not exist" in error_msg.lower():
            error_msg = f"Dataset '{repo_id}' not found on HuggingFace. Check the dataset ID."

        print(json.dumps({"ok": False, "error": error_msg}))
        sys.exit(1)
144
+
145
+
146
+ if __name__ == "__main__":
147
+ main()
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vespermcp/mcp-server",
3
- "version": "1.2.18",
3
+ "version": "1.2.20",
4
4
  "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
5
5
  "type": "module",
6
6
  "main": "build/index.js",
package/scripts/wizard.js CHANGED
@@ -118,7 +118,7 @@ function getAllAgentConfigs() {
118
118
 
119
119
  function installMcpToAgent(agent) {
120
120
  const npxCmd = IS_WIN ? 'npx.cmd' : 'npx';
121
- const serverEntry = { command: npxCmd, args: ['-y', '-p', '@vespermcp/mcp-server@latest', 'vespermcp'] };
121
+ const serverEntry = { command: npxCmd, args: ['-y', '@vespermcp/mcp-server@latest'] };
122
122
 
123
123
  try {
124
124
  if (agent.format === 'toml') {
@@ -156,7 +156,7 @@ function installMcpToAgent(agent) {
156
156
  async function checkServerHealth() {
157
157
  try {
158
158
  // Quick stdio check — spawn server and see if it responds
159
- const result = spawnSync(IS_WIN ? 'npx.cmd' : 'npx', ['-y', '-p', '@vespermcp/mcp-server@latest', 'vespermcp', '--version'], {
159
+ const result = spawnSync(IS_WIN ? 'npx.cmd' : 'npx', ['-y', '@vespermcp/mcp-server@latest', '--version'], {
160
160
  timeout: 10000,
161
161
  encoding: 'utf8',
162
162
  stdio: ['pipe', 'pipe', 'pipe'],
@@ -202,13 +202,13 @@ async function main() {
202
202
  console.log(`\n ${dim('[')}${cyan('4/6')}${dim(']')} Installing Vesper MCP server...`);
203
203
  try {
204
204
  const npmCmd = IS_WIN ? 'npx.cmd' : 'npx';
205
- spawnSync(npmCmd, ['-y', '-p', '@vespermcp/mcp-server@latest', 'vespermcp', '--setup', '--silent'], {
205
+ spawnSync(npmCmd, ['-y', '@vespermcp/mcp-server@latest', '--setup', '--silent'], {
206
206
  stdio: 'inherit',
207
207
  timeout: 120000,
208
208
  });
209
209
  console.log(` ${green('✓')} @vespermcp/mcp-server installed`);
210
210
  } catch {
211
- console.log(` ${yellow('⚠')} Could not auto-install — run manually: npx -y -p @vespermcp/mcp-server@latest vespermcp --setup`);
211
+ console.log(` ${yellow('⚠')} Could not auto-install — run manually: npx -y @vespermcp/mcp-server@latest --setup`);
212
212
  }
213
213
 
214
214
  // ─── Step 5: Auto-configure all detected IDEs ──────────────
@@ -0,0 +1,147 @@
1
+ """
2
+ HuggingFace Datasets Library Fallback Downloader.
3
+
4
+ Used when the HF Hub file listing finds no suitable data files
5
+ (e.g. script-based datasets, gated datasets, datasets that use
6
+ the `datasets` library format).
7
+
8
+ Usage:
9
+ python hf_fallback.py '{"repo_id": "user/dataset", "output_path": "/path/to/output.parquet", "token": "optional", "max_rows": 100000, "split": "train"}'
10
+
11
+ Output: JSON to stdout
12
+ {"ok": true, "path": "/path/to/output.parquet", "rows": 12345, "columns": ["col1", "col2"]}
13
+ {"ok": false, "error": "..."}
14
+ """
15
+ import sys
16
+ import json
17
+ import os
18
+
19
def main():
    """Download a HuggingFace dataset via the `datasets` library and export it.

    Reads one JSON object from ``sys.argv[1]`` with the keys:
      - ``repo_id`` (required): dataset id passed to ``load_dataset``.
      - ``output_path`` (required): destination file; ``.csv`` exports CSV,
        anything else exports Parquet (``.parquet`` is appended if missing).
      - ``token`` (optional): falls back to the ``HF_TOKEN`` /
        ``HUGGINGFACE_TOKEN`` environment variables.
      - ``max_rows`` (optional, default 500000): row cap applied after loading.
      - ``split`` (optional): split to load; when absent, ``train``, ``test``,
        ``validation`` and finally "no split" are tried in that order.

    Always prints exactly one JSON object to stdout:
    ``{"ok": true, "path", "rows", "columns", "split"}`` on success, or
    ``{"ok": false, "error"}`` followed by ``sys.exit(1)`` on failure.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"ok": False, "error": "Missing payload argument"}))
        sys.exit(1)

    try:
        payload = json.loads(sys.argv[1])
    except json.JSONDecodeError as e:
        print(json.dumps({"ok": False, "error": f"Invalid JSON payload: {e}"}))
        sys.exit(1)

    if not isinstance(payload, dict):
        print(json.dumps({"ok": False, "error": "Invalid JSON payload: expected a JSON object"}))
        sys.exit(1)

    # `or ""` + str() keep JSON null / non-string values from raising before
    # we can report the problem as JSON.
    repo_id = str(payload.get("repo_id") or "").strip()
    output_path = str(payload.get("output_path") or "").strip()
    token = payload.get("token") or os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or None
    max_rows = payload.get("max_rows", 500000)
    split = payload.get("split")  # None = auto-detect

    if not repo_id:
        print(json.dumps({"ok": False, "error": "repo_id is required"}))
        sys.exit(1)

    if not output_path:
        print(json.dumps({"ok": False, "error": "output_path is required"}))
        sys.exit(1)

    try:
        from datasets import DatasetDict, load_dataset
    except ImportError:
        print(json.dumps({"ok": False, "error": "Python 'datasets' library not installed. Install with: pip install datasets"}))
        sys.exit(1)

    try:
        # If split is not specified, try the common ones, then no split at all.
        splits_to_try = [split] if split else ["train", "test", "validation", None]

        ds = None
        used_split = None
        last_error = None  # kept so a "no valid splits" report can show the cause

        for s in splits_to_try:
            # SECURITY NOTE(review): trust_remote_code=True lets the dataset repo's
            # loading script execute on this machine. Kept because script-based
            # datasets are a stated use case of this fallback - confirm that this
            # is acceptable for untrusted repo ids.
            kwargs = {
                "path": repo_id,
                "trust_remote_code": True,
            }
            if token:
                kwargs["token"] = token
            if s:
                kwargs["split"] = s

            try:
                ds = load_dataset(**kwargs)
            except (ValueError, KeyError) as e:
                # Treated as "split doesn't exist"; try the next candidate.
                last_error = e
                continue
            except Exception as e:
                if "split" in str(e).lower() or "key" in str(e).lower():
                    last_error = e
                    continue
                raise
            used_split = s
            break

        if ds is None:
            detail = f" Last error: {last_error}" if last_error is not None else ""
            print(json.dumps({"ok": False, "error": f"Could not load dataset '{repo_id}'. No valid splits found.{detail}"}))
            sys.exit(1)

        # Handle DatasetDict (returned when no split was specified)
        if isinstance(ds, DatasetDict):
            used_split = next(
                (name for name in ("train", "test", "validation") if name in ds),
                None,
            )
            if used_split is None:
                available = list(ds.keys())
                if not available:
                    raise ValueError(f"Dataset '{repo_id}' contains no splits.")
                # Just pick the first available split
                used_split = available[0]
            ds = ds[used_split]

        # Limit rows if needed
        total_rows = len(ds)
        if max_rows and total_rows > max_rows:
            ds = ds.select(range(max_rows))
            total_rows = max_rows

        # Ensure output directory exists. A bare file name has an empty dirname,
        # and os.makedirs("") raises, so only create when there is a directory.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        columns = list(ds.column_names)

        # Export: CSV only when explicitly requested, otherwise Parquet.
        if output_path.endswith(".csv"):
            ds.to_csv(output_path)
        else:
            if not output_path.endswith(".parquet"):
                output_path = output_path + ".parquet"
            ds.to_parquet(output_path)

        print(json.dumps({
            "ok": True,
            "path": output_path,
            "rows": total_rows,
            "columns": columns,
            "split": used_split
        }))

    except Exception as e:
        error_msg = str(e)
        # Provide helpful hints
        if "401" in error_msg or "403" in error_msg or "gated" in error_msg.lower():
            error_msg += " (This dataset may be gated/private. Set HF_TOKEN via configure_keys tool.)"
        elif "FileNotFoundError" in error_msg or "does not exist" in error_msg.lower():
            error_msg = f"Dataset '{repo_id}' not found on HuggingFace. Check the dataset ID."

        print(json.dumps({"ok": False, "error": error_msg}))
        sys.exit(1)
144
+
145
+
146
+ if __name__ == "__main__":
147
+ main()