npm - @vespermcp/mcp-server - Versions diffs - 1.2.18 → 1.2.20 - Mend

@vespermcp/mcp-server 1.2.18 → 1.2.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

package/build/index.js +121 -30
package/build/ingestion/ingestor.js +117 -18
package/build/python/hf_fallback.py +147 -0
package/package.json +1 -1
package/scripts/wizard.js +4 -4
package/src/python/hf_fallback.py +147 -0

package/build/index.js CHANGED Viewed

@@ -451,7 +451,19 @@ jobManager.on("processJob", async (job, execute) => {
  * Logic for preparing a dataset (Search + Ingest + Process)
  */
 async function handlePrepareJob(jobId, query, requirements) {
+    hydrateExternalKeys();
     const update = (updates) => jobManager.updateJob(jobId, updates);
+    // Ensure core Python packages are available for dataset operations
+    try {
+        await ensurePythonModules([
+            { module: "polars", packageName: "polars" },
+            { module: "datasets", packageName: "datasets" },
+        ]);
+    }
+    catch (e) {
+        console.error(`[Prepare] Python dependency setup warning: ${e.message}`);
+        // Continue anyway - direct file downloads may still work without datasets lib
+    }
     const requestedRows = extractRequestedRows(query, requirements);
     let selectedDataset;
     let datasetIdForDownload = "";
@@ -480,7 +492,8 @@ async function handlePrepareJob(jobId, query, requirements) {
             datasetIdForDownload = explicitId.replace(/^dataworld:/i, "");
         }
         else {
-            source = "kaggle";
+            // Default to HuggingFace for ambiguous refs (user/dataset without prefix)
+            source = "huggingface";
             datasetIdForDownload = explicitId;
         }
         update({
@@ -490,11 +503,21 @@ async function handlePrepareJob(jobId, query, requirements) {
     }
     else {
         update({ progress: 10, status_text: "Searching for best dataset matching query..." });
-        const results = await searchEngine.search(query, { limit: 1 });
+        const results = await searchEngine.search(query, { limit: 10 });
         if (results.length === 0) {
             throw new Error("No datasets found matching the query. Try refining your search terms.");
         }
-        selectedDataset = results[0];
+        // Pick the best result that we can actually download (skip sources requiring missing credentials)
+        const hasKaggleCreds = !!(process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY && process.env.KAGGLE_USERNAME !== "YOUR_KAGGLE_USERNAME");
+        const hasDwToken = hasDataWorldToken();
+        selectedDataset = results.find(r => {
+            const s = (r.source || "").toLowerCase();
+            if (s === "kaggle" && !hasKaggleCreds)
+                return false;
+            if (s === "dataworld" && !hasDwToken)
+                return false;
+            return true;
+        }) || results[0]; // Fallback to first if all require credentials
         datasetIdForDownload = selectedDataset.id;
         source = selectedDataset.source;
         update({
@@ -502,13 +525,16 @@ async function handlePrepareJob(jobId, query, requirements) {
             status_text: `Matched: ${selectedDataset.name} (${selectedDataset.source})`
         });
     }
-    // Pre-check credentials for Kaggle
+    // Pre-check credentials for sources that require them
     if (source === "kaggle") {
-        if (!process.env.KAGGLE_USERNAME || !process.env.KAGGLE_KEY ||
-            process.env.KAGGLE_USERNAME === "YOUR_KAGGLE_USERNAME") {
-            throw new Error("Kaggle credentials not set. Use 'kaggle login' or set KAGGLE_USERNAME/KAGGLE_KEY.");
+        const hasKaggleCreds = !!(process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY && process.env.KAGGLE_USERNAME !== "YOUR_KAGGLE_USERNAME");
+        if (!hasKaggleCreds) {
+            throw new Error("Kaggle credentials not set. Use the configure_keys tool or set KAGGLE_USERNAME/KAGGLE_KEY environment variables.");
         }
     }
+    if (source === "dataworld" && !hasDataWorldToken()) {
+        throw new Error("data.world token not set. Use the configure_keys tool or set DW_AUTH_TOKEN environment variable.");
+    }
     update({ progress: 30, status_text: `Starting download from ${source}...` });
     // ensureData handles download and returns path to the raw file
     let rawFilePath = await dataIngestor.ensureData(datasetIdForDownload, source, (msg, prog) => {
@@ -604,22 +630,49 @@ async function handlePrepareJob(jobId, query, requirements) {
  */
 async function handleCleanJob(jobId, datasetId, ops) {
     const update = (updates) => jobManager.updateJob(jobId, updates);
-    const safeId = datasetId.replace(/\//g, "_");
-    const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
-    const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
-    let filePath = parquetPath;
-    if (!fs.existsSync(filePath)) {
-        filePath = csvPath;
-    }
-    if (datasetId === "demo" || !fs.existsSync(filePath)) {
+    // Resolve dataset file path from multiple sources
+    let filePath;
+    // 1. Check registry (most reliable - includes prepared/fused datasets)
+    const regEntry = getRegistryEntry(datasetId);
+    const regPath = regEntry?.local_path || regEntry?.path;
+    if (regPath && fs.existsSync(regPath)) {
+        filePath = regPath;
+    }
+    // 2. Check download status from metadata store
+    if (!filePath) {
+        const dlStatus = metadataStore.getDownloadStatus(datasetId);
+        if (dlStatus?.local_path && fs.existsSync(dlStatus.local_path)) {
+            filePath = dlStatus.local_path;
+        }
+    }
+    // 3. Check standard raw data paths
+    if (!filePath) {
+        const safeId = datasetId.replace(/\//g, "_");
+        const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
+        const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
+        const featherPath = path.join(dataRoot, "data", "raw", `${safeId}.feather`);
+        if (fs.existsSync(parquetPath))
+            filePath = parquetPath;
+        else if (fs.existsSync(csvPath))
+            filePath = csvPath;
+        else if (fs.existsSync(featherPath))
+            filePath = featherPath;
+    }
+    // 4. Check if it's a direct file path
+    if (!filePath && fs.existsSync(datasetId)) {
+        filePath = datasetId;
+    }
+    // 5. Demo fallback
+    if (!filePath && datasetId === "demo") {
         const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
         const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
         if (fs.existsSync(demoParquetPath))
             filePath = demoParquetPath;
         else if (fs.existsSync(demoCsvPath))
             filePath = demoCsvPath;
-        else
-            throw new Error(`Data file not found for ${datasetId}`);
+    }
+    if (!filePath) {
+        throw new Error(`Data file not found for '${datasetId}'. Download the dataset first using download_dataset or prepare_dataset.`);
     }
     update({ status_text: "Cleaning dataset..." });
     const result = await dataCleaner.clean(filePath, ops);
@@ -684,14 +737,14 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
             },
             {
                 name: "download_dataset",
-                description: "Download a dataset by source and ID/slug into local Vesper storage. Kaggle requires optional API key.",
+                description: "Download a dataset by source and ID/slug into local Vesper storage. Defaults to HuggingFace. Kaggle and data.world require API keys (use configure_keys first).",
                 inputSchema: {
                     type: "object",
                     properties: {
                         source: {
                             type: "string",
                             enum: ["huggingface", "kaggle", "openml", "dataworld"],
-                            description: "Dataset source.",
+                            description: "Dataset source (default: huggingface). HuggingFace and OpenML work without credentials.",
                         },
                         dataset_id: {
                             type: "string",
@@ -702,7 +755,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
                             description: "Optional target directory for downloaded files.",
                         }
                     },
-                    required: ["source", "dataset_id"],
+                    required: ["dataset_id"],
                 },
             },
             {
@@ -793,7 +846,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
             },
             {
                 name: "custom_clean",
-                description: "Apply specific cleaning operations to a dataset as an asynchronous job.",
+                description: "Apply specific cleaning operations to a dataset as an asynchronous job. Supports: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories. The dataset must be downloaded first.",
                 inputSchema: {
                     type: "object",
                     properties: {
@@ -818,7 +871,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
             },
             {
                 name: "prepare_dataset",
-                description: "Full pipeline: Analyze, Clean, Split, and Export as an asynchronous job.",
+                description: "Full pipeline: Search, Download, Analyze, Clean, Split, and Install a dataset as an asynchronous job. Automatically selects the best available source (prefers HuggingFace/OpenML when no Kaggle credentials are set). Use check_job_status to monitor progress.",
                 inputSchema: {
                     type: "object",
                     properties: {
@@ -1110,7 +1163,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                 if (source === "kaggle") {
                     if (!dataIngestor.hasKaggleCredentials()) {
                         return {
-                            content: [{ type: "text", text: `Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).` }],
+                            content: [{ type: "text", text: `Kaggle requires API credentials. Use the configure_keys tool to set kaggle_username and kaggle_key, or try source='huggingface' which works without credentials.` }],
                             isError: true,
                         };
                     }
@@ -1166,23 +1219,34 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
         }
         case "download_dataset": {
             hydrateExternalKeys();
-            const source = String(request.params.arguments?.source || "").toLowerCase();
+            const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
             const datasetId = String(request.params.arguments?.dataset_id || "").trim();
-            if (!source || !datasetId) {
-                throw new McpError(ErrorCode.InvalidParams, "source and dataset_id are required");
+            if (!datasetId) {
+                throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
             }
             if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
                 return {
-                    content: [{ type: "text", text: `Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).` }],
+                    content: [{ type: "text", text: `Kaggle requires API credentials. Use the configure_keys tool to set kaggle_username and kaggle_key, or switch to source='huggingface' which works without credentials.` }],
                     isError: true,
                 };
             }
             if (source === "dataworld" && !hasDataWorldToken()) {
                 return {
-                    content: [{ type: "text", text: "data.world requires API token. Run 'vespermcp config keys' and set dataworld_token." }],
+                    content: [{ type: "text", text: "data.world requires API token. Use the configure_keys tool to set dataworld_token, or switch to source='huggingface' which works without credentials." }],
                     isError: true,
                 };
             }
+            // Pre-install Python datasets library for HuggingFace fallback
+            if (source === "huggingface") {
+                try {
+                    await ensurePythonModules([
+                        { module: "datasets", packageName: "datasets" },
+                    ]);
+                }
+                catch {
+                    // Continue - direct download may still work
+                }
+            }
             try {
                 const localPath = await dataIngestor.ensureData(datasetId, source, () => undefined);
                 try {
@@ -1460,18 +1524,45 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
         case "custom_clean": {
             const datasetId = String(request.params.arguments?.dataset_id);
             const ops = request.params.arguments?.operations;
+            if (!datasetId || datasetId === "undefined") {
+                throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
+            }
+            if (!ops || !Array.isArray(ops) || ops.length === 0) {
+                throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
+            }
+            // Pre-check: verify dataset file exists before starting the job
+            const cleanRegEntry = getRegistryEntry(datasetId);
+            const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
+            const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
+            const cleanSafeId = datasetId.replace(/\//g, "_");
+            const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
+                (cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
+                fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
+                fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
+                fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
+                fs.existsSync(datasetId);
+            if (!cleanDataExists) {
+                return {
+                    content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
+                    isError: true,
+                };
+            }
             const job = jobManager.createJob("clean", 0, { datasetId, ops });
             return {
-                content: [{ type: "text", text: `Job started successfully. ID: ${job.id}. Use check_job_status to monitor progress.` }]
+                content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
             };
         }
         case "prepare_dataset": {
+            hydrateExternalKeys();
             const query = String(request.params.arguments?.query);
             const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
             const downloadImages = request.params.arguments?.download_images === true;
+            if (!query || query === "undefined") {
+                throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
+            }
             const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
             return {
-                content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you.` }]
+                content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
             };
         }
         case "compare_datasets": {

package/build/ingestion/ingestor.js CHANGED Viewed

@@ -1,5 +1,6 @@
 import path from "path";
 import fs from "fs";
+import { spawn } from "child_process";
 import { HFDownloader } from "./hf-downloader.js";
 import { KaggleSource } from "../metadata/kaggle-source.js";
 import { OpenMLSource } from "../metadata/openml-source.js";
@@ -63,25 +64,42 @@ export class DataIngestor {
         if (source === "huggingface") {
             onProgress?.("Discovering data files on HuggingFace Hub...");
             const remotePath = await this.hfDownloader.findBestFile(datasetId);
-            if (!remotePath)
-                throw new Error(`No suitable data files found in HuggingFace repo: ${datasetId}`);
-            const ext = path.extname(remotePath).substring(1).toLowerCase() || "csv";
-            const targetPath = this.getTargetPath(datasetId, ext);
-            this.store.registerDownload(datasetId, targetPath, "downloading");
-            try {
-                await this.hfDownloader.download(datasetId, remotePath, targetPath, (progress) => {
-                    onProgress?.(`Downloading ${path.basename(remotePath)}...`, progress);
-                });
-                const resolvedPath = await this.hfDownloader.resolveExternalDataFromMetadata(targetPath, (progress) => {
-                    onProgress?.("Resolving external dataset file...", progress);
-                });
-                const stats = fs.statSync(resolvedPath);
-                this.completeDownload(datasetId, resolvedPath, stats.size);
-                return resolvedPath;
+            if (remotePath) {
+                // Direct file download path (repo has raw data files)
+                const ext = path.extname(remotePath).substring(1).toLowerCase() || "csv";
+                const targetPath = this.getTargetPath(datasetId, ext);
+                this.store.registerDownload(datasetId, targetPath, "downloading");
+                try {
+                    await this.hfDownloader.download(datasetId, remotePath, targetPath, (progress) => {
+                        onProgress?.(`Downloading ${path.basename(remotePath)}...`, progress);
+                    });
+                    const resolvedPath = await this.hfDownloader.resolveExternalDataFromMetadata(targetPath, (progress) => {
+                        onProgress?.("Resolving external dataset file...", progress);
+                    });
+                    const stats = fs.statSync(resolvedPath);
+                    this.completeDownload(datasetId, resolvedPath, stats.size);
+                    return resolvedPath;
+                }
+                catch (e) {
+                    this.failDownload(datasetId, e.message);
+                    throw e;
+                }
             }
-            catch (e) {
-                this.failDownload(datasetId, e.message);
-                throw e;
+            else {
+                // Fallback: Use Python datasets library to download and convert
+                onProgress?.("No raw files found. Using HuggingFace datasets library to download...");
+                const targetPath = this.getTargetPath(datasetId, "parquet");
+                this.store.registerDownload(datasetId, targetPath, "downloading");
+                try {
+                    const result = await this.hfDatasetsFallback(datasetId, targetPath, onProgress);
+                    const stats = fs.statSync(result);
+                    this.completeDownload(datasetId, result, stats.size);
+                    return result;
+                }
+                catch (e) {
+                    this.failDownload(datasetId, e.message);
+                    throw e;
+                }
             }
         }
         else if (source === "kaggle") {
@@ -159,4 +177,85 @@ export class DataIngestor {
         const safeId = datasetId.replace(/\//g, "_").replace(/:/g, "_");
         return path.join(this.rawDataDir, `${safeId}.${extension}`);
     }
+    /**
+     * Fallback: use the Python `datasets` library to download a HuggingFace dataset
+     * when no raw data files are found in the repo file listing.
+     *
+     * Spawns `hf_fallback.py` with a single JSON payload argument and parses the JSON
+     * object the script prints to stdout.
+     *
+     * @param {string} datasetId - HuggingFace repo id, sent to the script as `repo_id`.
+     * @param {string} targetPath - Output file path, sent to the script as `output_path`.
+     * @param {(message: string, progress?: number) => void} [onProgress] - Optional progress callback.
+     * @returns {Promise<string>} Resolves with the `path` field reported by the script.
+     * @throws {Error} When the interpreter cannot be started, the script exits non-zero,
+     *   reports `ok: false`, prints unparsable output, or runs longer than 10 minutes.
+     */
+    async hfDatasetsFallback(datasetId, targetPath, onProgress) {
+        const pyCmd = process.platform === "win32" ? "py" : "python";
+        // Resolve the fallback script path
+        const homeDir = process.env.HOME || process.env.USERPROFILE || this.projectRoot;
+        const dataRoot = path.join(homeDir, ".vesper");
+        const scriptCandidates = [
+            path.resolve(dataRoot, "python", "hf_fallback.py"),
+            path.resolve(this.projectRoot, "python", "hf_fallback.py"),
+            path.resolve(this.projectRoot, "..", "src", "python", "hf_fallback.py"),
+            path.resolve(this.projectRoot, "..", "python", "hf_fallback.py"),
+        ];
+        // If no candidate exists, keep the first one so Python fails with a clear error
+        const scriptPath = scriptCandidates.find(p => fs.existsSync(p)) || scriptCandidates[0];
+        const token = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN || undefined;
+        const payload = {
+            repo_id: datasetId,
+            output_path: targetPath,
+            token: token || null,
+            max_rows: 500000,
+        };
+        onProgress?.("Downloading via datasets library (this may take a moment)...", 30);
+        return new Promise((resolve, reject) => {
+            let settled = false;
+            let timer;
+            // Several events (error, close, timeout) can fire for one child; only the
+            // first one decides the outcome, and it always releases the timeout timer.
+            const settle = (fn, value) => {
+                if (settled)
+                    return;
+                settled = true;
+                clearTimeout(timer);
+                fn(value);
+            };
+            const proc = spawn(pyCmd, [scriptPath, JSON.stringify(payload)], {
+                env: {
+                    ...process.env,
+                    PYTHONUTF8: "1",
+                    PIP_DISABLE_PIP_VERSION_CHECK: "1",
+                },
+            });
+            // Attach before anything else: without an "error" listener a failed spawn
+            // (e.g. interpreter not on PATH) becomes an uncaught exception in Node.
+            proc.on("error", (err) => {
+                settle(reject, new Error(`HuggingFace datasets fallback failed: could not start '${pyCmd}': ${err.message}`, { cause: err }));
+            });
+            let stdout = "";
+            let stderr = "";
+            // The stdio streams may be missing when the child could not be spawned
+            proc.stdout?.on("data", (d) => (stdout += d.toString()));
+            proc.stderr?.on("data", (d) => {
+                const msg = d.toString();
+                stderr += msg;
+                // Forward progress info
+                if (msg.includes("Downloading") || msg.includes("Loading")) {
+                    onProgress?.(msg.trim().split("\n").pop() || "Downloading...", 50);
+                }
+            });
+            timer = setTimeout(() => {
+                try {
+                    proc.kill();
+                }
+                catch { /* no-op */ }
+                settle(reject, new Error(`HuggingFace datasets download timed out after 10 minutes for ${datasetId}`));
+            }, 600000); // 10 min timeout
+            proc.on("close", (code) => {
+                if (settled)
+                    return;
+                if (code !== 0) {
+                    let errorMsg = stderr || stdout || `Python exited with code ${code}`;
+                    try {
+                        // The script prints a JSON error object even on failure; prefer it
+                        const parsed = JSON.parse(stdout);
+                        if (parsed.error)
+                            errorMsg = parsed.error;
+                    }
+                    catch { /* use stderr */ }
+                    settle(reject, new Error(`HuggingFace datasets fallback failed: ${errorMsg}`));
+                    return;
+                }
+                let result;
+                try {
+                    result = JSON.parse(stdout);
+                }
+                catch {
+                    settle(reject, new Error(`Failed to parse HF fallback output: ${stdout}`));
+                    return;
+                }
+                if (!result.ok) {
+                    settle(reject, new Error(result.error || "Unknown error from HF fallback"));
+                    return;
+                }
+                onProgress?.(`Downloaded ${result.rows?.toLocaleString() || "?"} rows (${result.columns?.length || "?"} columns)`, 90);
+                settle(resolve, result.path);
+            });
+        });
+    }
 }

package/build/python/hf_fallback.py ADDED Viewed

@@ -0,0 +1,147 @@
+"""
+HuggingFace Datasets Library Fallback Downloader.
+Used when the HF Hub file listing finds no suitable data files
+(e.g. script-based datasets, gated datasets, datasets that use
+the `datasets` library format).
+Usage:
+    python hf_fallback.py '{"repo_id": "user/dataset", "output_path": "/path/to/output.parquet", "token": "optional", "max_rows": 100000, "split": "train"}'
+Output: JSON to stdout
+    {"ok": true, "path": "/path/to/output.parquet", "rows": 12345, "columns": ["col1", "col2"]}
+    {"ok": false, "error": "..."}
+"""
+import sys
+import json
+import os
+def main():
+    """Download a HuggingFace dataset with the `datasets` library and export it to a file.
+    Reads a JSON payload from sys.argv[1] (keys: repo_id, output_path, token, max_rows, split),
+    loads the dataset, keeps at most max_rows rows, writes CSV when output_path ends with
+    ".csv" and parquet otherwise, then prints one JSON result object to stdout.
+    On any failure prints {"ok": false, "error": "..."} and exits with status 1.
+    """
+    if len(sys.argv) < 2:
+        print(json.dumps({"ok": False, "error": "Missing payload argument"}))
+        sys.exit(1)
+    try:
+        payload = json.loads(sys.argv[1])
+    except json.JSONDecodeError as e:
+        print(json.dumps({"ok": False, "error": f"Invalid JSON payload: {e}"}))
+        sys.exit(1)
+    # Valid JSON that is not an object (e.g. a list) has no .get(); report it through
+    # the JSON error channel instead of crashing with a traceback.
+    if not isinstance(payload, dict):
+        print(json.dumps({"ok": False, "error": "Payload must be a JSON object"}))
+        sys.exit(1)
+    # `or ""` also covers explicit nulls, which would otherwise break .strip()
+    repo_id = str(payload.get("repo_id") or "").strip()
+    output_path = str(payload.get("output_path") or "").strip()
+    token = payload.get("token") or os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or None
+    max_rows = payload.get("max_rows", 500000)
+    split = payload.get("split")  # None = auto-detect
+    if not repo_id:
+        print(json.dumps({"ok": False, "error": "repo_id is required"}))
+        sys.exit(1)
+    if not output_path:
+        print(json.dumps({"ok": False, "error": "output_path is required"}))
+        sys.exit(1)
+    try:
+        from datasets import load_dataset
+    except ImportError:
+        print(json.dumps({"ok": False, "error": "Python 'datasets' library not installed. Install with: pip install datasets"}))
+        sys.exit(1)
+    try:
+        # If split is not specified, try common ones; None loads every split
+        splits_to_try = [split] if split else ["train", "test", "validation", None]
+        ds = None
+        used_split = None
+        for s in splits_to_try:
+            try:
+                # NOTE(review): trust_remote_code=True lets the dataset repo run its own
+                # loading script on this machine - confirm this is acceptable for
+                # user-supplied repo ids.
+                kwargs = {
+                    "path": repo_id,
+                    "trust_remote_code": True,
+                }
+                if token:
+                    kwargs["token"] = token
+                if s:
+                    kwargs["split"] = s
+                ds = load_dataset(**kwargs)
+                used_split = s
+                break
+            except (ValueError, KeyError):
+                # Split doesn't exist, try next
+                continue
+            except Exception as e:
+                # Heuristic: other error types that mention a split/key are treated
+                # as "this split is missing"; anything else is a real failure.
+                if "split" in str(e).lower() or "key" in str(e).lower():
+                    continue
+                raise
+        if ds is None:
+            print(json.dumps({"ok": False, "error": f"Could not load dataset '{repo_id}'. No valid splits found."}))
+            sys.exit(1)
+        # Handle DatasetDict (when no split specified)
+        from datasets import DatasetDict
+        if isinstance(ds, DatasetDict):
+            # Pick the best split
+            for preferred in ["train", "test", "validation"]:
+                if preferred in ds:
+                    ds = ds[preferred]
+                    used_split = preferred
+                    break
+            else:
+                # Just pick the first available split
+                first_key = list(ds.keys())[0]
+                ds = ds[first_key]
+                used_split = first_key
+        # Limit rows if needed
+        total_rows = len(ds)
+        if max_rows and total_rows > max_rows:
+            ds = ds.select(range(max_rows))
+            total_rows = max_rows
+        # Ensure output directory exists; a bare filename has an empty dirname and
+        # os.makedirs("") would raise, so only create it when there is one.
+        out_dir = os.path.dirname(output_path)
+        if out_dir:
+            os.makedirs(out_dir, exist_ok=True)
+        # Export: CSV only when explicitly requested by extension, parquet otherwise
+        columns = ds.column_names
+        if output_path.endswith(".csv"):
+            ds.to_csv(output_path)
+        else:
+            if not output_path.endswith(".parquet"):
+                output_path = output_path + ".parquet"
+            ds.to_parquet(output_path)
+        print(json.dumps({
+            "ok": True,
+            "path": output_path,
+            "rows": total_rows,
+            "columns": columns,
+            "split": used_split
+        }))
+    except Exception as e:
+        error_msg = str(e)
+        # Provide helpful hints
+        if "401" in error_msg or "403" in error_msg or "gated" in error_msg.lower():
+            error_msg += " (This dataset may be gated/private. Set HF_TOKEN via configure_keys tool.)"
+        elif "FileNotFoundError" in error_msg or "does not exist" in error_msg.lower():
+            error_msg = f"Dataset '{repo_id}' not found on HuggingFace. Check the dataset ID."
+        print(json.dumps({"ok": False, "error": error_msg}))
+        sys.exit(1)
+if __name__ == "__main__":
+    main()

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@vespermcp/mcp-server",
-  "version": "1.2.18",
+  "version": "1.2.20",
   "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
   "type": "module",
   "main": "build/index.js",

package/scripts/wizard.js CHANGED Viewed

@@ -118,7 +118,7 @@ function getAllAgentConfigs() {
 function installMcpToAgent(agent) {
   const npxCmd = IS_WIN ? 'npx.cmd' : 'npx';
-  const serverEntry = { command: npxCmd, args: ['-y', '-p', '@vespermcp/mcp-server@latest', 'vespermcp'] };
+  const serverEntry = { command: npxCmd, args: ['-y', '@vespermcp/mcp-server@latest'] };
   try {
     if (agent.format === 'toml') {
@@ -156,7 +156,7 @@ function installMcpToAgent(agent) {
 async function checkServerHealth() {
   try {
     // Quick stdio check — spawn server and see if it responds
-    const result = spawnSync(IS_WIN ? 'npx.cmd' : 'npx', ['-y', '-p', '@vespermcp/mcp-server@latest', 'vespermcp', '--version'], {
+    const result = spawnSync(IS_WIN ? 'npx.cmd' : 'npx', ['-y', '@vespermcp/mcp-server@latest', '--version'], {
       timeout: 10000,
       encoding: 'utf8',
       stdio: ['pipe', 'pipe', 'pipe'],
@@ -202,13 +202,13 @@ async function main() {
   console.log(`\n  ${dim('[')}${cyan('4/6')}${dim(']')} Installing Vesper MCP server...`);
   try {
     const npmCmd = IS_WIN ? 'npx.cmd' : 'npx';
-    spawnSync(npmCmd, ['-y', '-p', '@vespermcp/mcp-server@latest', 'vespermcp', '--setup', '--silent'], {
+    spawnSync(npmCmd, ['-y', '@vespermcp/mcp-server@latest', '--setup', '--silent'], {
       stdio: 'inherit',
       timeout: 120000,
     });
     console.log(`      ${green('✓')} @vespermcp/mcp-server installed`);
   } catch {
-    console.log(`      ${yellow('⚠')} Could not auto-install — run manually: npx -y -p @vespermcp/mcp-server@latest vespermcp --setup`);
+    console.log(`      ${yellow('⚠')} Could not auto-install — run manually: npx -y @vespermcp/mcp-server@latest --setup`);
   }
   // ─── Step 5: Auto-configure all detected IDEs ──────────────

package/src/python/hf_fallback.py ADDED Viewed

@@ -0,0 +1,147 @@
+"""
+HuggingFace Datasets Library Fallback Downloader.
+Used when the HF Hub file listing finds no suitable data files
+(e.g. script-based datasets, gated datasets, datasets that use
+the `datasets` library format).
+Usage:
+    python hf_fallback.py '{"repo_id": "user/dataset", "output_path": "/path/to/output.parquet", "token": "optional", "max_rows": 100000, "split": "train"}'
+Output: JSON to stdout
+    {"ok": true, "path": "/path/to/output.parquet", "rows": 12345, "columns": ["col1", "col2"], "split": "train"}
+    {"ok": false, "error": "..."}
+"""
+import sys
+import json
+import os
+def main():
+    """Download a HuggingFace dataset with the `datasets` library and export it.
+    Reads a JSON payload from sys.argv[1] with the keys:
+      repo_id      (required) dataset id passed to load_dataset as `path`
+      output_path  (required) destination file; ".csv" writes CSV, anything
+                   else writes parquet (".parquet" is appended if missing)
+      token        (optional) falls back to the HF_TOKEN / HUGGINGFACE_TOKEN
+                   environment variables
+      max_rows     (optional, default 500000) keep at most this many rows;
+                   0 or a negative value disables the limit
+      split        (optional) split name; auto-detected when absent
+    Prints a JSON object to stdout: {"ok": true, "path", "rows", "columns",
+    "split"} on success, or {"ok": false, "error"} on failure, in which case
+    the process exits with status 1.
+    """
+    if len(sys.argv) < 2:
+        print(json.dumps({"ok": False, "error": "Missing payload argument"}))
+        sys.exit(1)
+    try:
+        payload = json.loads(sys.argv[1])
+    except json.JSONDecodeError as e:
+        print(json.dumps({"ok": False, "error": f"Invalid JSON payload: {e}"}))
+        sys.exit(1)
+    # Valid JSON is not necessarily an object; reject lists/strings/numbers
+    # here so the .get() calls below cannot crash with a raw traceback.
+    if not isinstance(payload, dict):
+        print(json.dumps({"ok": False, "error": "Payload must be a JSON object"}))
+        sys.exit(1)
+    # `or ""` also covers explicit JSON nulls, which .get()'s default does not.
+    repo_id = str(payload.get("repo_id") or "").strip()
+    output_path = str(payload.get("output_path") or "").strip()
+    token = payload.get("token") or os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or None
+    max_rows = payload.get("max_rows", 500000)
+    split = payload.get("split")  # None = auto-detect
+    if not repo_id:
+        print(json.dumps({"ok": False, "error": "repo_id is required"}))
+        sys.exit(1)
+    if not output_path:
+        print(json.dumps({"ok": False, "error": "output_path is required"}))
+        sys.exit(1)
+    try:
+        from datasets import load_dataset
+    except ImportError:
+        print(json.dumps({"ok": False, "error": "Python 'datasets' library not installed. Install with: pip install datasets"}))
+        sys.exit(1)
+    try:
+        # If split is not specified, try the common ones in order; the final
+        # None loads every split and yields a DatasetDict (handled below).
+        # Note: this is a regular (non-streaming) load.
+        splits_to_try = [split] if split else ["train", "test", "validation", None]
+        ds = None
+        used_split = None
+        last_error = None  # kept so a total failure can report its cause
+        for s in splits_to_try:
+            # NOTE(review): trust_remote_code=True lets the dataset repo run
+            # its own loading script on this machine. Confirm this is
+            # acceptable for untrusted repo ids.
+            kwargs = {
+                "path": repo_id,
+                "trust_remote_code": True,
+            }
+            if token:
+                kwargs["token"] = token
+            if s:
+                kwargs["split"] = s
+            try:
+                ds = load_dataset(**kwargs)
+            except Exception as e:
+                lowered = str(e).lower()
+                looks_like_missing_split = (
+                    isinstance(e, (ValueError, KeyError))
+                    or "split" in lowered
+                    or "key" in lowered
+                )
+                if not looks_like_missing_split:
+                    raise
+                # Split doesn't exist, try next
+                last_error = e
+                continue
+            used_split = s
+            break
+        if ds is None:
+            reason = f" Last error: {last_error}" if last_error is not None else ""
+            print(json.dumps({"ok": False, "error": f"Could not load dataset '{repo_id}'. No valid splits found.{reason}"}))
+            sys.exit(1)
+        # Handle DatasetDict (when no split specified)
+        from datasets import DatasetDict
+        if isinstance(ds, DatasetDict):
+            # Pick the best split
+            for preferred in ["train", "test", "validation"]:
+                if preferred in ds:
+                    ds = ds[preferred]
+                    used_split = preferred
+                    break
+            else:
+                # Just pick the first available split
+                first_key = list(ds.keys())[0]
+                ds = ds[first_key]
+                used_split = first_key
+        # Limit rows if needed; a non-positive max_rows means "no limit"
+        # (a negative value would otherwise select an empty range).
+        total_rows = len(ds)
+        if max_rows and max_rows > 0 and total_rows > max_rows:
+            ds = ds.select(range(max_rows))
+            total_rows = max_rows
+        # Ensure output directory exists. dirname() is "" for a bare file
+        # name, and os.makedirs("") raises, so only create a real directory.
+        output_dir = os.path.dirname(output_path)
+        if output_dir:
+            os.makedirs(output_dir, exist_ok=True)
+        # Export: CSV only when explicitly requested, parquet otherwise
+        columns = ds.column_names
+        if output_path.endswith(".csv"):
+            ds.to_csv(output_path)
+        else:
+            if not output_path.endswith(".parquet"):
+                output_path = output_path + ".parquet"
+            ds.to_parquet(output_path)
+        print(json.dumps({
+            "ok": True,
+            "path": output_path,
+            "rows": total_rows,
+            "columns": columns,
+            "split": used_split
+        }))
+    except Exception as e:
+        error_msg = str(e)
+        # Provide helpful hints
+        if "401" in error_msg or "403" in error_msg or "gated" in error_msg.lower():
+            error_msg += " (This dataset may be gated/private. Set HF_TOKEN via configure_keys tool.)"
+        elif "FileNotFoundError" in error_msg or "does not exist" in error_msg.lower():
+            error_msg = f"Dataset '{repo_id}' not found on HuggingFace. Check the dataset ID."
+        print(json.dumps({"ok": False, "error": error_msg}))
+        sys.exit(1)
+# Run main() only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()