@vespermcp/mcp-server 1.2.14 ā 1.2.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/build/config/config-manager.js +1 -1
- package/build/index.js +128 -31
- package/build/ingestion/hf-downloader.js +71 -3
- package/build/ingestion/ingestor.js +6 -3
- package/build/python/export_engine.py +16 -0
- package/build/python/quality_engine.py +32 -8
- package/build/tools/formatter.js +6 -0
- package/mcp-config-template.json +5 -2
- package/package.json +3 -2
- package/scripts/wizard.js +307 -0
- package/src/python/export_engine.py +16 -0
- package/src/python/quality_engine.py +32 -8
- package/src/scripts/wizard.js +0 -77
package/README.md
CHANGED
|
@@ -36,7 +36,7 @@ Vesper is a Model Context Protocol (MCP) server that helps you find, analyze, an
|
|
|
36
36
|
The fastest way to install Vesper and configure it for **GitHub Copilot Chat** or **Cursor** is to run the automated setup:
|
|
37
37
|
|
|
38
38
|
```bash
|
|
39
|
-
npx -y @vespermcp/mcp-server@latest --setup
|
|
39
|
+
npx -y -p @vespermcp/mcp-server@latest vespermcp --setup
|
|
40
40
|
```
|
|
41
41
|
|
|
42
42
|
1. Select **Visual Studio Code (Settings.json)** from the list.
|
|
@@ -91,7 +91,7 @@ export class ConfigManager {
|
|
|
91
91
|
const isWin = process.platform === "win32";
|
|
92
92
|
return {
|
|
93
93
|
command: isWin ? "npx.cmd" : "npx",
|
|
94
|
-
args: ["-y", "@vespermcp/mcp-server@latest"],
|
|
94
|
+
args: ["-y", "-p", "@vespermcp/mcp-server@latest", "vespermcp"],
|
|
95
95
|
};
|
|
96
96
|
}
|
|
97
97
|
/**
|
package/build/index.js
CHANGED
|
@@ -43,11 +43,12 @@ function upsertRegistry(dataset_id, local_path, status) {
|
|
|
43
43
|
function getRegistryEntry(dataset_id) {
|
|
44
44
|
const norm_id = normalize_dataset_id(dataset_id);
|
|
45
45
|
console.error(`[Registry] Lookup key: ${norm_id}`);
|
|
46
|
-
return readRegistry().find(e => e.dataset_id === norm_id);
|
|
46
|
+
return readRegistry().find(e => (e.dataset_id || e.id) === norm_id);
|
|
47
47
|
}
|
|
48
48
|
// --- Pipeline State Tracker ---
|
|
49
49
|
// Tracks completed steps per session/job/dataset
|
|
50
50
|
const pipelineState = {};
|
|
51
|
+
const jobStatusLastPoll = {};
|
|
51
52
|
function getPipelineKey(datasetId) {
|
|
52
53
|
return datasetId;
|
|
53
54
|
}
|
|
@@ -77,6 +78,7 @@ import { fileURLToPath } from "url";
|
|
|
77
78
|
import path from "path";
|
|
78
79
|
import fs from "fs";
|
|
79
80
|
import { spawn } from "child_process";
|
|
81
|
+
import { spawnSync } from "child_process";
|
|
80
82
|
import { MetadataStore } from "./metadata/store.js";
|
|
81
83
|
import { VectorStore } from "./search/vector-store.js";
|
|
82
84
|
import { Embedder } from "./search/embedder.js";
|
|
@@ -348,7 +350,7 @@ function syncPythonScripts(appRoot, dataRoot) {
|
|
|
348
350
|
let shouldCopy = true;
|
|
349
351
|
if (fs.existsSync(destPath)) {
|
|
350
352
|
const destStat = fs.statSync(destPath);
|
|
351
|
-
if (srcStat.size === destStat.size)
|
|
353
|
+
if (srcStat.size === destStat.size && srcStat.mtimeMs <= destStat.mtimeMs)
|
|
352
354
|
shouldCopy = false;
|
|
353
355
|
}
|
|
354
356
|
if (shouldCopy) {
|
|
@@ -450,17 +452,55 @@ jobManager.on("processJob", async (job, execute) => {
|
|
|
450
452
|
async function handlePrepareJob(jobId, query, requirements) {
|
|
451
453
|
const update = (updates) => jobManager.updateJob(jobId, updates);
|
|
452
454
|
const requestedRows = extractRequestedRows(query, requirements);
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
455
|
+
let selectedDataset;
|
|
456
|
+
let datasetIdForDownload = "";
|
|
457
|
+
let source;
|
|
458
|
+
const parsedQuery = parseDatasetId(query);
|
|
459
|
+
const isExplicitDatasetRef = /^(kaggle:|hf:|huggingface:|openml:|dataworld:)/i.test(parsedQuery) || (query.includes("/") && !query.includes(" "));
|
|
460
|
+
if (isExplicitDatasetRef) {
|
|
461
|
+
let explicitId = parsedQuery;
|
|
462
|
+
if (/^hf:/i.test(explicitId)) {
|
|
463
|
+
explicitId = explicitId.replace(/^hf:/i, "huggingface:");
|
|
464
|
+
}
|
|
465
|
+
if (/^kaggle:/i.test(explicitId)) {
|
|
466
|
+
source = "kaggle";
|
|
467
|
+
datasetIdForDownload = explicitId.replace(/^kaggle:/i, "");
|
|
468
|
+
}
|
|
469
|
+
else if (/^huggingface:/i.test(explicitId)) {
|
|
470
|
+
source = "huggingface";
|
|
471
|
+
datasetIdForDownload = explicitId.replace(/^huggingface:/i, "");
|
|
472
|
+
}
|
|
473
|
+
else if (/^openml:/i.test(explicitId)) {
|
|
474
|
+
source = "openml";
|
|
475
|
+
datasetIdForDownload = explicitId.replace(/^openml:/i, "");
|
|
476
|
+
}
|
|
477
|
+
else if (/^dataworld:/i.test(explicitId)) {
|
|
478
|
+
source = "dataworld";
|
|
479
|
+
datasetIdForDownload = explicitId.replace(/^dataworld:/i, "");
|
|
480
|
+
}
|
|
481
|
+
else {
|
|
482
|
+
source = "kaggle";
|
|
483
|
+
datasetIdForDownload = explicitId;
|
|
484
|
+
}
|
|
485
|
+
update({
|
|
486
|
+
progress: 20,
|
|
487
|
+
status_text: `Using explicit dataset id: ${datasetIdForDownload} (${source})`
|
|
488
|
+
});
|
|
489
|
+
}
|
|
490
|
+
else {
|
|
491
|
+
update({ progress: 10, status_text: "Searching for best dataset matching query..." });
|
|
492
|
+
const results = await searchEngine.search(query, { limit: 1 });
|
|
493
|
+
if (results.length === 0) {
|
|
494
|
+
throw new Error("No datasets found matching the query. Try refining your search terms.");
|
|
495
|
+
}
|
|
496
|
+
selectedDataset = results[0];
|
|
497
|
+
datasetIdForDownload = selectedDataset.id;
|
|
498
|
+
source = selectedDataset.source;
|
|
499
|
+
update({
|
|
500
|
+
progress: 20,
|
|
501
|
+
status_text: `Matched: ${selectedDataset.name} (${selectedDataset.source})`
|
|
502
|
+
});
|
|
457
503
|
}
|
|
458
|
-
const topDataset = results[0];
|
|
459
|
-
update({
|
|
460
|
-
progress: 20,
|
|
461
|
-
status_text: `Matched: ${topDataset.name} (${topDataset.source})`
|
|
462
|
-
});
|
|
463
|
-
const source = topDataset.source;
|
|
464
504
|
// Pre-check credentials for Kaggle
|
|
465
505
|
if (source === "kaggle") {
|
|
466
506
|
if (!process.env.KAGGLE_USERNAME || !process.env.KAGGLE_KEY ||
|
|
@@ -470,10 +510,10 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
470
510
|
}
|
|
471
511
|
update({ progress: 30, status_text: `Starting download from ${source}...` });
|
|
472
512
|
// ensureData handles download and returns path to the raw file
|
|
473
|
-
let rawFilePath = await dataIngestor.ensureData(
|
|
513
|
+
let rawFilePath = await dataIngestor.ensureData(datasetIdForDownload, source, (msg, prog) => {
|
|
474
514
|
update({ status_text: msg, progress: 30 + (prog ? Math.floor(prog * 0.4) : 0) });
|
|
475
515
|
});
|
|
476
|
-
if (requestedRows && requestedRows > 0) {
|
|
516
|
+
if (requestedRows && requestedRows > 0 && !isExplicitDatasetRef) {
|
|
477
517
|
update({ progress: 62, status_text: `Validating requested sample count (${requestedRows.toLocaleString()})...` });
|
|
478
518
|
let currentRows = await countRows(rawFilePath);
|
|
479
519
|
if (currentRows < requestedRows) {
|
|
@@ -482,7 +522,7 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
482
522
|
const sourceFiles = [rawFilePath];
|
|
483
523
|
let totalRows = currentRows;
|
|
484
524
|
for (const ds of additional) {
|
|
485
|
-
if (ds.id ===
|
|
525
|
+
if (ds.id === datasetIdForDownload)
|
|
486
526
|
continue;
|
|
487
527
|
try {
|
|
488
528
|
const dsSource = ds.source;
|
|
@@ -516,10 +556,10 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
516
556
|
rawFilePath = fusionResult.output_path;
|
|
517
557
|
try {
|
|
518
558
|
// Register fused output for this top dataset so export can find it
|
|
519
|
-
upsertRegistry(
|
|
559
|
+
upsertRegistry(datasetIdForDownload, rawFilePath, "completed");
|
|
520
560
|
}
|
|
521
561
|
catch (e) {
|
|
522
|
-
console.error(`[Registry] Failed to write registry for fused output ${
|
|
562
|
+
console.error(`[Registry] Failed to write registry for fused output ${datasetIdForDownload}: ${e?.message || e}`);
|
|
523
563
|
}
|
|
524
564
|
currentRows = await countRows(rawFilePath);
|
|
525
565
|
}
|
|
@@ -530,22 +570,31 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
530
570
|
update({ progress: 69, status_text: `Sample target met: ${currentRows.toLocaleString()} rows` });
|
|
531
571
|
}
|
|
532
572
|
}
|
|
573
|
+
let qualityScore = selectedDataset?.quality_score ?? 70;
|
|
533
574
|
update({ progress: 70, status_text: "Analyzing dataset quality..." });
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
575
|
+
try {
|
|
576
|
+
const report = await qualityAnalyzer.analyze(rawFilePath);
|
|
577
|
+
qualityScore = report.overall_score;
|
|
578
|
+
}
|
|
579
|
+
catch (error) {
|
|
580
|
+
console.error(`[Prepare] Quality analysis failed for ${datasetIdForDownload}: ${error?.message || error}`);
|
|
581
|
+
update({ progress: 78, status_text: "Quality analysis skipped (unsupported schema). Continuing installation..." });
|
|
582
|
+
}
|
|
583
|
+
if (selectedDataset) {
|
|
584
|
+
metadataStore.saveDataset({
|
|
585
|
+
...selectedDataset,
|
|
586
|
+
quality_score: qualityScore
|
|
587
|
+
});
|
|
588
|
+
}
|
|
540
589
|
update({ progress: 85, status_text: "Installing dataset into project..." });
|
|
541
|
-
const installPath = await installService.install(
|
|
590
|
+
const installPath = await installService.install(datasetIdForDownload, rawFilePath);
|
|
542
591
|
update({ progress: 100, status_text: "Preparation complete!" });
|
|
543
592
|
// Register prepared dataset in local registry for lookup by export/list tools
|
|
544
593
|
try {
|
|
545
|
-
upsertRegistry(
|
|
594
|
+
upsertRegistry(datasetIdForDownload, installPath, "completed");
|
|
546
595
|
}
|
|
547
596
|
catch (e) {
|
|
548
|
-
console.error(`[Registry] Failed to write registry for ${
|
|
597
|
+
console.error(`[Registry] Failed to write registry for ${datasetIdForDownload}: ${e?.message || e}`);
|
|
549
598
|
}
|
|
550
599
|
return installPath;
|
|
551
600
|
}
|
|
@@ -1443,6 +1492,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1443
1492
|
if (!job) {
|
|
1444
1493
|
throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
|
|
1445
1494
|
}
|
|
1495
|
+
const activeStatuses = new Set(["pending", "queued", "running", "retrying"]);
|
|
1496
|
+
const now = Date.now();
|
|
1497
|
+
const last = jobStatusLastPoll[jobId] || 0;
|
|
1498
|
+
const minPollMs = 3000;
|
|
1499
|
+
if (activeStatuses.has(job.status) && (now - last) < minPollMs) {
|
|
1500
|
+
const waitMs = minPollMs - (now - last);
|
|
1501
|
+
return {
|
|
1502
|
+
content: [{ type: "text", text: `No significant status change yet. Please poll again in ~${Math.ceil(waitMs / 1000)}s.` }]
|
|
1503
|
+
};
|
|
1504
|
+
}
|
|
1505
|
+
jobStatusLastPoll[jobId] = now;
|
|
1446
1506
|
return {
|
|
1447
1507
|
content: [{ type: "text", text: formatJobStatus(job) }]
|
|
1448
1508
|
};
|
|
@@ -1482,9 +1542,9 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1482
1542
|
catch (e) {
|
|
1483
1543
|
console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
|
|
1484
1544
|
}
|
|
1485
|
-
// Poll for download status until local_path appears or timeout
|
|
1545
|
+
// Poll for download status or registry entry until local_path appears or timeout
|
|
1486
1546
|
const wait = (ms) => new Promise(res => setTimeout(res, ms));
|
|
1487
|
-
const maxWait =
|
|
1547
|
+
const maxWait = 120_000; // 120s
|
|
1488
1548
|
const interval = 2000;
|
|
1489
1549
|
let waited = 0;
|
|
1490
1550
|
while (waited < maxWait) {
|
|
@@ -1494,13 +1554,20 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1494
1554
|
console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
|
|
1495
1555
|
break;
|
|
1496
1556
|
}
|
|
1557
|
+
const reg = getRegistryEntry(datasetId);
|
|
1558
|
+
const regPath = reg?.local_path || reg?.path;
|
|
1559
|
+
if (regPath && fs.existsSync(regPath)) {
|
|
1560
|
+
sourcePath = regPath;
|
|
1561
|
+
console.error(`[Export] Local data found in registry for ${datasetId}: ${sourcePath}`);
|
|
1562
|
+
break;
|
|
1563
|
+
}
|
|
1497
1564
|
await wait(interval);
|
|
1498
1565
|
waited += interval;
|
|
1499
1566
|
}
|
|
1500
1567
|
// If still no sourcePath, return helpful error listing prepared datasets
|
|
1501
1568
|
if (!sourcePath) {
|
|
1502
1569
|
const entries = readRegistry();
|
|
1503
|
-
const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id}: ${e.local_path}`).join("\n");
|
|
1570
|
+
const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id || e.id || "unknown"}: ${e.local_path || e.path || "unknown"}`).slice(0, 10).join("\n") + (entries.length > 10 ? "\n...and " + (entries.length - 10) + " more" : "");
|
|
1504
1571
|
return {
|
|
1505
1572
|
content: [{ type: "text", text: `ERROR: No local data found for ${datasetId} after attempting prepare. Check credentials and try running prepare_dataset manually. Prepared datasets:\n${listText}` }],
|
|
1506
1573
|
isError: true
|
|
@@ -1511,7 +1578,11 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1511
1578
|
if (!fastMode) {
|
|
1512
1579
|
const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
|
|
1513
1580
|
const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
|
|
1514
|
-
|
|
1581
|
+
const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
|
|
1582
|
+
if (!pipelineCompatibleInput) {
|
|
1583
|
+
console.error(`[Export] Skipping pipeline for ${currentExt} input; using direct exporter conversion.`);
|
|
1584
|
+
}
|
|
1585
|
+
else if (currentExt !== pipelineFmt) {
|
|
1515
1586
|
console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
|
|
1516
1587
|
try {
|
|
1517
1588
|
const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
|
|
@@ -1853,7 +1924,15 @@ async function main() {
|
|
|
1853
1924
|
const transport = new StdioServerTransport();
|
|
1854
1925
|
await server.connect(transport);
|
|
1855
1926
|
console.error("Vesper MCP server running on stdio");
|
|
1856
|
-
console.error("Tip: To configure Vesper for your IDE, run: npx @vespermcp/mcp-server --setup");
|
|
1927
|
+
console.error("Tip: To configure Vesper for your IDE, run: npx -y -p @vespermcp/mcp-server@latest vespermcp --setup");
|
|
1928
|
+
await new Promise((resolve) => {
|
|
1929
|
+
const done = () => resolve();
|
|
1930
|
+
process.stdin.resume();
|
|
1931
|
+
process.stdin.once("end", done);
|
|
1932
|
+
process.stdin.once("close", done);
|
|
1933
|
+
process.once("SIGINT", done);
|
|
1934
|
+
process.once("SIGTERM", done);
|
|
1935
|
+
});
|
|
1857
1936
|
console.error("[Vesper] Main loop finished");
|
|
1858
1937
|
}
|
|
1859
1938
|
async function runConfigCli(args) {
|
|
@@ -2161,6 +2240,24 @@ async function runFuseCli(args) {
|
|
|
2161
2240
|
console.log("Next: run vespermcp split/export on the fused dataset");
|
|
2162
2241
|
}
|
|
2163
2242
|
async function runSetupWizard(silent = false) {
|
|
2243
|
+
if (!silent && process.stdin.isTTY) {
|
|
2244
|
+
const wizardCandidates = [
|
|
2245
|
+
path.join(appRoot, "scripts", "wizard.js"),
|
|
2246
|
+
path.join(appRoot, "src", "scripts", "wizard.js"),
|
|
2247
|
+
path.join(process.cwd(), "vesper-wizard", "wizard.js"),
|
|
2248
|
+
];
|
|
2249
|
+
const wizardScript = wizardCandidates.find(candidate => fs.existsSync(candidate));
|
|
2250
|
+
if (wizardScript) {
|
|
2251
|
+
console.error("[Vesper Setup] Running guided wizard...");
|
|
2252
|
+
const result = spawnSync(process.execPath, [wizardScript], {
|
|
2253
|
+
stdio: "inherit",
|
|
2254
|
+
env: process.env,
|
|
2255
|
+
});
|
|
2256
|
+
if ((result.status ?? 1) !== 0) {
|
|
2257
|
+
console.error("[Vesper Setup] Wizard exited with non-zero status, continuing with automatic MCP config only.");
|
|
2258
|
+
}
|
|
2259
|
+
}
|
|
2260
|
+
}
|
|
2164
2261
|
const configManager = new ConfigManager();
|
|
2165
2262
|
if (!silent) {
|
|
2166
2263
|
console.error(`\nVesper MCP - Universal Setup`);
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { listFiles } from "@huggingface/hub";
|
|
2
|
+
import fs from "fs";
|
|
2
3
|
import path from "path";
|
|
3
4
|
import { RobustDownloader } from "../utils/downloader.js";
|
|
4
5
|
export class HFDownloader {
|
|
@@ -19,6 +20,7 @@ export class HFDownloader {
|
|
|
19
20
|
try {
|
|
20
21
|
const token = this.getToken();
|
|
21
22
|
const files = [];
|
|
23
|
+
const metadataFiles = [];
|
|
22
24
|
const blacklist = [
|
|
23
25
|
".gitattributes",
|
|
24
26
|
".gitignore",
|
|
@@ -29,6 +31,15 @@ export class HFDownloader {
|
|
|
29
31
|
"requirements.txt",
|
|
30
32
|
"setup.py"
|
|
31
33
|
];
|
|
34
|
+
const metadataNamePatterns = [
|
|
35
|
+
/^dataset_infos?\.json$/i,
|
|
36
|
+
/^dataset_dict\.json$/i,
|
|
37
|
+
/^state\.json$/i,
|
|
38
|
+
/^config\.json$/i,
|
|
39
|
+
/^metadata\.json$/i,
|
|
40
|
+
/^stats\.json$/i,
|
|
41
|
+
/^index\.json$/i
|
|
42
|
+
];
|
|
32
43
|
for await (const file of listFiles({
|
|
33
44
|
repo: { type: "dataset", name: repoId },
|
|
34
45
|
recursive: true,
|
|
@@ -36,7 +47,11 @@ export class HFDownloader {
|
|
|
36
47
|
})) {
|
|
37
48
|
if (file.type === "file") {
|
|
38
49
|
const fileName = path.basename(file.path);
|
|
39
|
-
|
|
50
|
+
const isMetadataJson = metadataNamePatterns.some(p => p.test(fileName));
|
|
51
|
+
if (isMetadataJson) {
|
|
52
|
+
metadataFiles.push(file.path);
|
|
53
|
+
}
|
|
54
|
+
if (!blacklist.includes(fileName) && !fileName.startsWith(".") && !isMetadataJson) {
|
|
40
55
|
files.push(file.path);
|
|
41
56
|
}
|
|
42
57
|
}
|
|
@@ -49,7 +64,15 @@ export class HFDownloader {
|
|
|
49
64
|
/train.*\.csv$/i,
|
|
50
65
|
/data.*\.csv$/i,
|
|
51
66
|
/.*\.csv$/i,
|
|
67
|
+
/train.*\.tsv$/i,
|
|
68
|
+
/data.*\.tsv$/i,
|
|
69
|
+
/.*\.tsv$/i,
|
|
70
|
+
/train.*\.txt$/i,
|
|
71
|
+
/data.*\.txt$/i,
|
|
72
|
+
/.*\.txt$/i,
|
|
52
73
|
/.*\.jsonl$/i,
|
|
74
|
+
/.*\.ndjson$/i,
|
|
75
|
+
// Keep plain JSON as lowest priority to avoid selecting metadata-like files.
|
|
53
76
|
/.*\.json$/i
|
|
54
77
|
];
|
|
55
78
|
for (const pattern of priorities) {
|
|
@@ -58,12 +81,16 @@ export class HFDownloader {
|
|
|
58
81
|
return match;
|
|
59
82
|
}
|
|
60
83
|
// Strict fallback: Only return the first file if it has a data-like extension
|
|
61
|
-
const dataExtensions = [".csv", ".parquet", ".jsonl", ".
|
|
84
|
+
const dataExtensions = [".csv", ".parquet", ".jsonl", ".ndjson", ".tsv", ".txt", ".json", ".avro", ".orc"];
|
|
62
85
|
const fallback = files.find(f => {
|
|
63
86
|
const ext = path.extname(f).toLowerCase();
|
|
64
87
|
return dataExtensions.includes(ext);
|
|
65
88
|
});
|
|
66
|
-
|
|
89
|
+
if (fallback)
|
|
90
|
+
return fallback;
|
|
91
|
+
// Last-resort: allow dataset metadata file, then resolve external raw URLs later.
|
|
92
|
+
const metadataFallback = metadataFiles.find(f => /dataset_infos?\.json$/i.test(path.basename(f)));
|
|
93
|
+
return metadataFallback || null;
|
|
67
94
|
}
|
|
68
95
|
catch (error) {
|
|
69
96
|
const msg = String(error?.message || error);
|
|
@@ -90,4 +117,45 @@ export class HFDownloader {
|
|
|
90
117
|
}
|
|
91
118
|
});
|
|
92
119
|
}
|
|
120
|
+
/**
|
|
121
|
+
* If downloaded file is dataset metadata (dataset_infos.json), resolve and download a real data URL.
|
|
122
|
+
* Returns the actual local data path to use.
|
|
123
|
+
*/
|
|
124
|
+
async resolveExternalDataFromMetadata(localPath, onProgress) {
|
|
125
|
+
const ext = path.extname(localPath).toLowerCase();
|
|
126
|
+
if (ext !== ".json") {
|
|
127
|
+
return localPath;
|
|
128
|
+
}
|
|
129
|
+
try {
|
|
130
|
+
const raw = fs.readFileSync(localPath, "utf-8");
|
|
131
|
+
const parsed = JSON.parse(raw);
|
|
132
|
+
const firstConfig = parsed?.default || Object.values(parsed || {})[0];
|
|
133
|
+
const checksums = firstConfig?.download_checksums;
|
|
134
|
+
if (!checksums || typeof checksums !== "object") {
|
|
135
|
+
return localPath;
|
|
136
|
+
}
|
|
137
|
+
const candidateUrls = Object.keys(checksums).filter((u) => /^https?:\/\//i.test(u));
|
|
138
|
+
if (candidateUrls.length === 0) {
|
|
139
|
+
return localPath;
|
|
140
|
+
}
|
|
141
|
+
const preferred = candidateUrls.find(u => /train|data/i.test(path.basename(u))) || candidateUrls[0];
|
|
142
|
+
const ext = path.extname(preferred).toLowerCase() || ".csv";
|
|
143
|
+
const resolvedPath = localPath.replace(/\.json$/i, ext);
|
|
144
|
+
await this.downloader.download(preferred, resolvedPath, {
|
|
145
|
+
resume: true,
|
|
146
|
+
onProgress: (bytes, total) => {
|
|
147
|
+
if (total > 0 && onProgress) {
|
|
148
|
+
onProgress(Math.round((bytes / total) * 100));
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
});
|
|
152
|
+
if (fs.existsSync(resolvedPath) && fs.statSync(resolvedPath).size > 0) {
|
|
153
|
+
return resolvedPath;
|
|
154
|
+
}
|
|
155
|
+
return localPath;
|
|
156
|
+
}
|
|
157
|
+
catch {
|
|
158
|
+
return localPath;
|
|
159
|
+
}
|
|
160
|
+
}
|
|
93
161
|
}
|
|
@@ -72,9 +72,12 @@ export class DataIngestor {
|
|
|
72
72
|
await this.hfDownloader.download(datasetId, remotePath, targetPath, (progress) => {
|
|
73
73
|
onProgress?.(`Downloading ${path.basename(remotePath)}...`, progress);
|
|
74
74
|
});
|
|
75
|
-
const
|
|
76
|
-
|
|
77
|
-
|
|
75
|
+
const resolvedPath = await this.hfDownloader.resolveExternalDataFromMetadata(targetPath, (progress) => {
|
|
76
|
+
onProgress?.("Resolving external dataset file...", progress);
|
|
77
|
+
});
|
|
78
|
+
const stats = fs.statSync(resolvedPath);
|
|
79
|
+
this.completeDownload(datasetId, resolvedPath, stats.size);
|
|
80
|
+
return resolvedPath;
|
|
78
81
|
}
|
|
79
82
|
catch (e) {
|
|
80
83
|
this.failDownload(datasetId, e.message);
|
|
@@ -31,6 +31,19 @@ def _load(file_path: str, options: dict) -> pl.DataFrame:
|
|
|
31
31
|
ext = os.path.splitext(file_path)[1].lower()
|
|
32
32
|
if ext == ".csv":
|
|
33
33
|
df = pl.read_csv(file_path, ignore_errors=True)
|
|
34
|
+
elif ext == ".tsv":
|
|
35
|
+
df = pl.read_csv(file_path, separator="\t", ignore_errors=True)
|
|
36
|
+
elif ext == ".txt":
|
|
37
|
+
# Heuristic delimiter detection for plain text tabular files.
|
|
38
|
+
sep = ","
|
|
39
|
+
try:
|
|
40
|
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
|
|
41
|
+
first_line = fh.readline()
|
|
42
|
+
if "\t" in first_line:
|
|
43
|
+
sep = "\t"
|
|
44
|
+
except Exception:
|
|
45
|
+
sep = ","
|
|
46
|
+
df = pl.read_csv(file_path, separator=sep, ignore_errors=True)
|
|
34
47
|
elif ext in (".parquet", ".pq"):
|
|
35
48
|
df = pl.read_parquet(file_path)
|
|
36
49
|
elif ext in (".feather", ".ftr", ".arrow", ".ipc"):
|
|
@@ -40,6 +53,9 @@ def _load(file_path: str, options: dict) -> pl.DataFrame:
|
|
|
40
53
|
else:
|
|
41
54
|
raise ValueError(f"Unsupported input format: {ext}")
|
|
42
55
|
|
|
56
|
+
if len(df) == 0:
|
|
57
|
+
raise ValueError("empty CSV")
|
|
58
|
+
|
|
43
59
|
# Column selection (before sampling for speed)
|
|
44
60
|
if columns:
|
|
45
61
|
valid = [c for c in columns if c in df.columns]
|
|
@@ -102,6 +102,18 @@ def main():
|
|
|
102
102
|
file_path_lower = file_path.lower()
|
|
103
103
|
if file_path_lower.endswith(".csv"):
|
|
104
104
|
df = pl.read_csv(file_path, ignore_errors=True, n_rows=10000)
|
|
105
|
+
elif file_path_lower.endswith(".tsv"):
|
|
106
|
+
df = pl.read_csv(file_path, separator="\t", ignore_errors=True, n_rows=10000)
|
|
107
|
+
elif file_path_lower.endswith(".txt"):
|
|
108
|
+
sep = ","
|
|
109
|
+
try:
|
|
110
|
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
|
|
111
|
+
first_line = fh.readline()
|
|
112
|
+
if "\t" in first_line:
|
|
113
|
+
sep = "\t"
|
|
114
|
+
except Exception:
|
|
115
|
+
sep = ","
|
|
116
|
+
df = pl.read_csv(file_path, separator=sep, ignore_errors=True, n_rows=10000)
|
|
105
117
|
elif file_path_lower.endswith(".parquet"):
|
|
106
118
|
try:
|
|
107
119
|
# Try scanning first (faster for large files)
|
|
@@ -133,10 +145,18 @@ def main():
|
|
|
133
145
|
column_count = len(df.columns)
|
|
134
146
|
|
|
135
147
|
# Duplicate detection (exact)
|
|
148
|
+
# NOTE: Some Polars versions can panic on is_duplicated() for nested/null rows.
|
|
149
|
+
# Use a Python fallback that is slower but robust for the 10k sampled rows.
|
|
150
|
+
duplicate_count = 0
|
|
136
151
|
try:
|
|
137
|
-
|
|
152
|
+
seen = set()
|
|
153
|
+
for row in df.to_dicts():
|
|
154
|
+
row_key = json.dumps(row, sort_keys=True, default=str)
|
|
155
|
+
if row_key in seen:
|
|
156
|
+
duplicate_count += 1
|
|
157
|
+
else:
|
|
158
|
+
seen.add(row_key)
|
|
138
159
|
except Exception:
|
|
139
|
-
# Duplicate check might fail on complex nested types (List, Struct)
|
|
140
160
|
duplicate_count = 0
|
|
141
161
|
|
|
142
162
|
columns_stats = []
|
|
@@ -165,12 +185,16 @@ def main():
|
|
|
165
185
|
if duplicate_count == 0 and len(text_cols) > 0:
|
|
166
186
|
# Pick longest text column as likely "content"
|
|
167
187
|
# In real impl, we'd use heuristics. For now, first text col.
|
|
168
|
-
target_col = text_cols[0]
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
188
|
+
target_col = text_cols[0]
|
|
189
|
+
try:
|
|
190
|
+
text_dupes = df.select(pl.col(target_col)).is_duplicated().sum()
|
|
191
|
+
if text_dupes > 0:
|
|
192
|
+
report["text_duplicates"] = int(text_dupes)
|
|
193
|
+
if text_dupes > (row_count * 0.2):
|
|
194
|
+
report["warnings"].append(f"High text duplication in '{target_col}' ({text_dupes} rows)")
|
|
195
|
+
except Exception:
|
|
196
|
+
# Skip text duplicate warning if backend cannot compute duplicates for this dtype
|
|
197
|
+
pass
|
|
174
198
|
|
|
175
199
|
# Integrity Check 2: Contamination / Leakage (Basic)
|
|
176
200
|
# (Skipping correlation for now)
|
package/build/tools/formatter.js
CHANGED
|
@@ -20,6 +20,12 @@ export function formatJobStatus(job) {
|
|
|
20
20
|
output += `Status: ${statusText}\n`;
|
|
21
21
|
output += `Progress: ${bar} ${job.progress}%\n`;
|
|
22
22
|
output += `Activity: ${job.status_text}\n`;
|
|
23
|
+
if (job.status === "running" || job.status === "retrying" || job.status === "queued" || job.status === "pending") {
|
|
24
|
+
output += `Polling hint: check again in 5-10 seconds.\n`;
|
|
25
|
+
}
|
|
26
|
+
else {
|
|
27
|
+
output += `Polling hint: no further polling required.\n`;
|
|
28
|
+
}
|
|
23
29
|
if (job.result_url) {
|
|
24
30
|
output += `\nResult: ${job.result_url}\n`;
|
|
25
31
|
}
|
package/mcp-config-template.json
CHANGED
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"mcpServers": {
|
|
3
3
|
"vesper": {
|
|
4
|
-
"command": "
|
|
4
|
+
"command": "npx",
|
|
5
5
|
"args": [
|
|
6
|
-
"
|
|
6
|
+
"-y",
|
|
7
|
+
"-p",
|
|
8
|
+
"@vespermcp/mcp-server@latest",
|
|
9
|
+
"vespermcp"
|
|
7
10
|
],
|
|
8
11
|
"env": {
|
|
9
12
|
"KAGGLE_USERNAME": "your-kaggle-username",
|
package/package.json
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vespermcp/mcp-server",
|
|
3
|
-
"version": "1.2.
|
|
3
|
+
"version": "1.2.16",
|
|
4
4
|
"description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "build/index.js",
|
|
7
7
|
"bin": {
|
|
8
8
|
"vespermcp": "./build/index.js",
|
|
9
|
+
"mcp-server": "./build/index.js",
|
|
9
10
|
"@vespermcp/mcp-server": "./build/index.js",
|
|
10
|
-
"vesper-wizard": "
|
|
11
|
+
"vesper-wizard": "scripts/wizard.js"
|
|
11
12
|
},
|
|
12
13
|
"files": [
|
|
13
14
|
"build/**/*",
|
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
|
|
4
|
+
// vesper-wizard ā Zero-friction local setup for Vesper MCP
|
|
5
|
+
// Run: npx vesper-wizard@latest
|
|
6
|
+
// āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
|
|
7
|
+
|
|
8
|
+
const fs = require('fs');
|
|
9
|
+
const path = require('path');
|
|
10
|
+
const os = require('os');
|
|
11
|
+
const crypto = require('crypto');
|
|
12
|
+
const { execSync, spawnSync } = require('child_process');
|
|
13
|
+
|
|
14
|
+
// āā Paths āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
|
|
15
|
+
const HOME = os.homedir();
|
|
16
|
+
const VESPER_DIR = path.join(HOME, '.vesper');
|
|
17
|
+
const CONFIG_TOML = path.join(VESPER_DIR, 'config.toml');
|
|
18
|
+
const DATA_DIR = path.join(VESPER_DIR, 'data');
|
|
19
|
+
const IS_WIN = process.platform === 'win32';
|
|
20
|
+
const APPDATA = process.env.APPDATA || path.join(HOME, 'AppData', 'Roaming');
|
|
21
|
+
|
|
22
|
+
// āā Helpers āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
|
|
23
|
+
function ensureDir(dir) {
|
|
24
|
+
if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
function generateLocalKey() {
|
|
28
|
+
const random = crypto.randomBytes(24).toString('hex');
|
|
29
|
+
return `vesper_sk_local_${random}`;
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
function readToml(filePath) {
|
|
33
|
+
if (!fs.existsSync(filePath)) return {};
|
|
34
|
+
const content = fs.readFileSync(filePath, 'utf8');
|
|
35
|
+
const obj = {};
|
|
36
|
+
for (const line of content.split('\n')) {
|
|
37
|
+
const m = line.match(/^\s*(\w+)\s*=\s*"(.*)"\s*$/);
|
|
38
|
+
if (m) obj[m[1]] = m[2];
|
|
39
|
+
}
|
|
40
|
+
return obj;
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
function writeToml(filePath, data) {
|
|
44
|
+
ensureDir(path.dirname(filePath));
|
|
45
|
+
const lines = Object.entries(data).map(([k, v]) => `${k} = "${v}"`);
|
|
46
|
+
fs.writeFileSync(filePath, lines.join('\n') + '\n', 'utf8');
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
function dim(text) { return `\x1b[2m${text}\x1b[0m`; }
|
|
50
|
+
function bold(text) { return `\x1b[1m${text}\x1b[0m`; }
|
|
51
|
+
function green(text) { return `\x1b[32m${text}\x1b[0m`; }
|
|
52
|
+
function cyan(text) { return `\x1b[36m${text}\x1b[0m`; }
|
|
53
|
+
function yellow(text) { return `\x1b[33m${text}\x1b[0m`; }
|
|
54
|
+
function red(text) { return `\x1b[31m${text}\x1b[0m`; }
|
|
55
|
+
function magenta(text) { return `\x1b[35m${text}\x1b[0m`; }
|
|
56
|
+
|
|
57
|
+
function printBanner() {
|
|
58
|
+
console.log(`
|
|
59
|
+
${dim('āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā')}
|
|
60
|
+
|
|
61
|
+
${bold('āā āā āāāāāāā āāāāāāā āāāāāā āāāāāāā āāāāāā')}
|
|
62
|
+
${bold('āā āā āā āā āā āā āā āā āā')}
|
|
63
|
+
${bold('āā āā āāāāā āāāāāāā āāāāāā āāāāā āāāāāā')}
|
|
64
|
+
${bold(' āā āā āā āā āā āā āā āā')}
|
|
65
|
+
${bold(' āāāā āāāāāāā āāāāāāā āā āāāāāāā āā āā')}
|
|
66
|
+
|
|
67
|
+
${cyan('dataset intelligence layer')}
|
|
68
|
+
${dim('local-first ⢠zero-config ⢠agent-native')}
|
|
69
|
+
|
|
70
|
+
${dim('āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā')}
|
|
71
|
+
`);
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
// āā MCP Auto-Config āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
|
|
75
|
+
/**
 * Enumerate every coding agent whose MCP config this wizard can edit.
 * Each descriptor carries the agent's display name, its config-file
 * path for the current OS, and the file format consumed by
 * installMcpToAgent(): 'mcpServers' / 'servers' (JSON top-level key)
 * or 'toml' (Codex's config.toml).
 * @returns {{name: string, path: string, format: string}[]}
 */
function getAllAgentConfigs() {
  const onMac = process.platform === 'darwin';
  // Ternaries stay lazy on purpose: APPDATA is only dereferenced on
  // Windows, where it is expected to be defined.
  const claudeDesktopPath = IS_WIN
    ? path.join(APPDATA, 'Claude', 'claude_desktop_config.json')
    : onMac
      ? path.join(HOME, 'Library', 'Application Support', 'Claude', 'claude_desktop_config.json')
      : path.join(HOME, '.config', 'claude', 'claude_desktop_config.json');
  const vsCodePath = IS_WIN
    ? path.join(APPDATA, 'Code', 'User', 'mcp.json')
    : onMac
      ? path.join(HOME, 'Library', 'Application Support', 'Code', 'User', 'mcp.json')
      : path.join(HOME, '.config', 'Code', 'User', 'mcp.json');
  return [
    { name: 'Claude Code', path: path.join(HOME, '.claude.json'), format: 'mcpServers' },
    { name: 'Claude Desktop', path: claudeDesktopPath, format: 'mcpServers' },
    { name: 'Cursor', path: path.join(HOME, '.cursor', 'mcp.json'), format: 'mcpServers' },
    { name: 'VS Code', path: vsCodePath, format: 'servers' },
    { name: 'Codex', path: path.join(HOME, '.codex', 'config.toml'), format: 'toml' },
    { name: 'Gemini CLI', path: path.join(HOME, '.gemini', 'settings.json'), format: 'mcpServers' },
  ];
}
|
|
118
|
+
|
|
119
|
+
/**
 * Register the Vesper MCP server in a single agent's config file.
 * TOML configs (Codex) get an appended `[mcp_servers.vesper]` table;
 * JSON configs get a `vesper` entry under `mcpServers` (or `servers`
 * with an explicit `type: 'stdio'` for VS Code).
 * @param {{name: string, path: string, format: string}} agent - Descriptor from getAllAgentConfigs().
 * @returns {boolean} true on success or if already configured; false on any failure.
 */
function installMcpToAgent(agent) {
  const npxCmd = IS_WIN ? 'npx.cmd' : 'npx';
  const serverEntry = { command: npxCmd, args: ['-y', '-p', '@vespermcp/mcp-server@latest', 'vespermcp'] };

  try {
    if (agent.format === 'toml') {
      let content = fs.existsSync(agent.path) ? fs.readFileSync(agent.path, 'utf8') : '';
      // Already configured — do not append a duplicate table.
      // NOTE(review): stale `args` from older releases are left as-is here;
      // upgrading them would require rewriting the existing table.
      if (content.includes('[mcp_servers.vesper]')) return true;
      ensureDir(path.dirname(agent.path));
      content += `\n[mcp_servers.vesper]\ncommand = "${serverEntry.command}"\nargs = [${serverEntry.args.map(a => `"${a}"`).join(', ')}]\n`;
      fs.writeFileSync(agent.path, content, 'utf8');
      return true;
    }

    let config = {};
    if (fs.existsSync(agent.path)) {
      // BUGFIX: an unparseable existing config (e.g. JSONC with comments,
      // or a hand-edited file) used to be silently replaced with {} and
      // then rewritten, wiping the user's other MCP server entries.
      // Bail out instead of clobbering.
      try {
        config = JSON.parse(fs.readFileSync(agent.path, 'utf8').trim() || '{}');
      } catch {
        return false;
      }
    } else {
      ensureDir(path.dirname(agent.path));
    }

    const key = agent.format === 'servers' ? 'servers' : 'mcpServers';
    if (!config[key]) config[key] = {};

    // VS Code's `servers` schema requires an explicit transport type.
    const entry = agent.format === 'servers'
      ? { type: 'stdio', ...serverEntry }
      : serverEntry;

    config[key].vesper = entry;
    fs.writeFileSync(agent.path, JSON.stringify(config, null, 2), 'utf8');
    return true;
  } catch {
    // Filesystem errors (permissions, read-only dirs) are reported as failure.
    return false;
  }
}
|
|
154
|
+
|
|
155
|
+
// āā Server Health Check āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
|
|
156
|
+
/**
 * Best-effort health probe: launch the published server through npx
 * with `--version` and report whether it responded within 10 seconds.
 * Healthy means a zero exit status, or stderr output containing "Vesper".
 * NOTE(review): spawnSync blocks the event loop for the duration of the
 * probe; the async signature only keeps the call site promise-shaped.
 * @returns {Promise<boolean|string|undefined>} truthy when healthy.
 */
async function checkServerHealth() {
  try {
    const launcher = IS_WIN ? 'npx.cmd' : 'npx';
    const probeArgs = ['-y', '-p', '@vespermcp/mcp-server@latest', 'vespermcp', '--version'];
    const probe = spawnSync(launcher, probeArgs, {
      timeout: 10000,
      encoding: 'utf8',
      stdio: ['pipe', 'pipe', 'pipe'],
    });
    // Expression kept verbatim: callers rely only on truthiness.
    return probe.status === 0 || (probe.stderr && probe.stderr.includes('Vesper'));
  } catch {
    return false;
  }
}
|
|
169
|
+
|
|
170
|
+
// āā Main Wizard āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
|
|
171
|
+
/**
 * Interactive setup wizard. Sequentially:
 *   1. creates Vesper's local directory tree,
 *   2. generates (or reuses) a local API key persisted to CONFIG_TOML,
 *   3. records the local-unified auth mode,
 *   4. installs the MCP server via npx,
 *   5. auto-configures every detected coding agent,
 *   6. verifies what exists on disk and prints a summary.
 *
 * Fix vs. previous version: `spawnSync` never throws on launch failure
 * or a non-zero child exit — it reports via `result.error` /
 * `result.status` — so the old catch-based "Could not auto-install"
 * warning was dead code and failures printed as success. Step 4 now
 * inspects the result explicitly.
 */
async function main() {
  printBanner();

  console.log(` ${green('ā')} Setting up Vesper on ${bold(os.hostname())}\n`);

  // Step 1: create the local directory tree (idempotent).
  process.stdout.write(` ${dim('[')}${cyan('1/6')}${dim(']')} Creating local directories...`);
  ensureDir(VESPER_DIR);
  ensureDir(DATA_DIR);
  ensureDir(path.join(DATA_DIR, 'raw'));
  ensureDir(path.join(DATA_DIR, 'processed'));
  ensureDir(path.join(VESPER_DIR, 'datasets'));
  console.log(` ${green('ā')}`);

  // Step 2: generate or reuse the local API key and persist it.
  process.stdout.write(` ${dim('[')}${cyan('2/6')}${dim(']')} Generating local API key...`);
  const existing = readToml(CONFIG_TOML);
  const localKey = existing.api_key || generateLocalKey();
  const configData = { ...existing, api_key: localKey };
  writeToml(CONFIG_TOML, configData);
  console.log(` ${green('ā')}`);
  console.log(` ${dim('Key:')} ${dim(localKey.slice(0, 20) + '...')} ${dim('ā')} ${dim(CONFIG_TOML)}`);

  // Step 3: record the auth mode (defaults to the single-local-key mode).
  process.stdout.write(`\n ${dim('[')}${cyan('3/6')}${dim(']')} Initializing local credentials vault...`);
  configData.auth_mode = configData.auth_mode || 'local_unified';
  writeToml(CONFIG_TOML, configData);
  console.log(` ${green('ā')}`);
  console.log(` ${dim('Mode:')} ${dim('single local Vesper key (no external keys required)')}`);

  // Step 4: install the published MCP server via npx.
  console.log(`\n ${dim('[')}${cyan('4/6')}${dim(']')} Installing Vesper MCP server...`);
  try {
    const npmCmd = IS_WIN ? 'npx.cmd' : 'npx';
    const installResult = spawnSync(npmCmd, ['-y', '-p', '@vespermcp/mcp-server@latest', 'vespermcp', '--setup', '--silent'], {
      stdio: 'inherit',
      timeout: 120000,
    });
    // spawnSync reports launch failures via `.error` and child failures
    // via `.status`; it does not throw, so check the result explicitly.
    if (installResult.error || installResult.status !== 0) {
      console.log(` ${yellow('ā ')} Could not auto-install ā run manually: npx -y -p @vespermcp/mcp-server@latest vespermcp --setup`);
    } else {
      console.log(` ${green('ā')} @vespermcp/mcp-server installed`);
    }
  } catch {
    console.log(` ${yellow('ā ')} Could not auto-install ā run manually: npx -y -p @vespermcp/mcp-server@latest vespermcp --setup`);
  }

  // Step 5: write the MCP entry into every agent whose config dir exists.
  process.stdout.write(`\n ${dim('[')}${cyan('5/6')}${dim(']')} Configuring coding agents...`);
  const agents = getAllAgentConfigs();
  const configuredAgents = [];
  // NOTE(review): skippedAgents is collected but never surfaced to the user.
  const skippedAgents = [];

  for (const agent of agents) {
    const dirExists = fs.existsSync(path.dirname(agent.path));
    const fileExists = fs.existsSync(agent.path);
    if (fileExists || dirExists) {
      const ok = installMcpToAgent(agent);
      if (ok) configuredAgents.push(agent.name);
      else skippedAgents.push(agent.name);
    }
  }
  console.log(` ${green('ā')}`);

  if (configuredAgents.length > 0) {
    console.log(`\n āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā`);
    console.log(` ā ${bold('MCP Auto-Configured')} ā`);
    console.log(` āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā¤`);
    for (const name of configuredAgents) {
      console.log(` ā ${green('ā')} ${name.padEnd(42)}ā`);
    }
    console.log(` āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā`);
  }

  // Step 6: report what exists on disk; missing artifacts are built lazily.
  console.log(`\n ${dim('[')}${cyan('6/6')}${dim(']')} Verifying installation...`);

  const dbExists = fs.existsSync(path.join(DATA_DIR, 'metadata.db'));
  const vecExists = fs.existsSync(path.join(DATA_DIR, 'vectors.json')) || fs.existsSync(path.join(DATA_DIR, 'vectors.bin'));
  const keyStored = fs.existsSync(CONFIG_TOML);

  console.log(` ${keyStored ? green('ā') : red('ā')} Local API key ${dim(CONFIG_TOML)}`);
  console.log(` ${dbExists ? green('ā') : yellow('ā ')} Dataset index ${dim(dbExists ? 'ready' : 'will build on first search')}`);
  console.log(` ${vecExists ? green('ā') : yellow('ā ')} Vector store ${dim(vecExists ? 'ready' : 'will build on first search')}`);
  console.log(` ${configuredAgents.length > 0 ? green('ā') : yellow('ā ')} MCP agents ${dim(configuredAgents.length + ' configured')}`);

  // Final summary for the user (purely informational).
  console.log(`
 ${dim('āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā')}

 ${green(bold('ā Vesper is ready!'))}

 ${bold('Your local API key:')}
 ${cyan(localKey)}

 ${bold('Config file:')}
 ${dim(CONFIG_TOML)}

 ${bold('What just happened:')}
 ${dim('1.')} Generated a local API key (never leaves your machine)
 ${dim('2.')} Initialized local credentials vault
 ${dim('3.')} Auto-configured MCP for ${configuredAgents.length > 0 ? configuredAgents.join(', ') : 'detected agents'}
 ${dim('4.')} Vesper server ready on stdio transport

 ${dim('āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā')}

 ${bold('Quick start ā try in your AI assistant:')}

 ${cyan('Search datasets')}
 ${dim('>')} vesper_search(query="sentiment analysis")

 ${cyan('Download & prepare')}
 ${dim('>')} prepare_dataset(query="image classification cats dogs")

 ${cyan('Quality analysis')}
 ${dim('>')} analyze_quality(dataset_id="imdb")

 ${cyan('Export to your project')}
 ${dim('>')} export_dataset(dataset_id="imdb", format="parquet")

 ${dim('āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā')}

 ${bold('Unified API ā one interface, every source:')}
 HuggingFace Ā· Kaggle Ā· OpenML Ā· data.world

 ${dim('Agents call localhost Vesper APIs with one local key.')}
 ${dim('Vesper adapters handle provider routing internally.')}

 ${dim('āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā')}

 ${yellow('ā')} Restart your IDE to activate MCP
 ${dim('Docs:')} https://github.com/vesper/mcp-server

 ${dim('āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā')}
 `);
}
|
|
303
|
+
|
|
304
|
+
// Entry point: run the wizard and exit non-zero on any unhandled
// failure so shell callers (and CI) can detect the error.
main().catch((err) => {
  const detail = err.message || err;
  console.error(`\n${red('Error:')} ${detail}`);
  process.exit(1);
});
|
|
@@ -31,6 +31,19 @@ def _load(file_path: str, options: dict) -> pl.DataFrame:
|
|
|
31
31
|
ext = os.path.splitext(file_path)[1].lower()
|
|
32
32
|
if ext == ".csv":
|
|
33
33
|
df = pl.read_csv(file_path, ignore_errors=True)
|
|
34
|
+
elif ext == ".tsv":
|
|
35
|
+
df = pl.read_csv(file_path, separator="\t", ignore_errors=True)
|
|
36
|
+
elif ext == ".txt":
|
|
37
|
+
# Heuristic delimiter detection for plain text tabular files.
|
|
38
|
+
sep = ","
|
|
39
|
+
try:
|
|
40
|
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
|
|
41
|
+
first_line = fh.readline()
|
|
42
|
+
if "\t" in first_line:
|
|
43
|
+
sep = "\t"
|
|
44
|
+
except Exception:
|
|
45
|
+
sep = ","
|
|
46
|
+
df = pl.read_csv(file_path, separator=sep, ignore_errors=True)
|
|
34
47
|
elif ext in (".parquet", ".pq"):
|
|
35
48
|
df = pl.read_parquet(file_path)
|
|
36
49
|
elif ext in (".feather", ".ftr", ".arrow", ".ipc"):
|
|
@@ -40,6 +53,9 @@ def _load(file_path: str, options: dict) -> pl.DataFrame:
|
|
|
40
53
|
else:
|
|
41
54
|
raise ValueError(f"Unsupported input format: {ext}")
|
|
42
55
|
|
|
56
|
+
if len(df) == 0:
|
|
57
|
+
raise ValueError("empty CSV")
|
|
58
|
+
|
|
43
59
|
# Column selection (before sampling for speed)
|
|
44
60
|
if columns:
|
|
45
61
|
valid = [c for c in columns if c in df.columns]
|
|
@@ -102,6 +102,18 @@ def main():
|
|
|
102
102
|
file_path_lower = file_path.lower()
|
|
103
103
|
if file_path_lower.endswith(".csv"):
|
|
104
104
|
df = pl.read_csv(file_path, ignore_errors=True, n_rows=10000)
|
|
105
|
+
elif file_path_lower.endswith(".tsv"):
|
|
106
|
+
df = pl.read_csv(file_path, separator="\t", ignore_errors=True, n_rows=10000)
|
|
107
|
+
elif file_path_lower.endswith(".txt"):
|
|
108
|
+
sep = ","
|
|
109
|
+
try:
|
|
110
|
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
|
|
111
|
+
first_line = fh.readline()
|
|
112
|
+
if "\t" in first_line:
|
|
113
|
+
sep = "\t"
|
|
114
|
+
except Exception:
|
|
115
|
+
sep = ","
|
|
116
|
+
df = pl.read_csv(file_path, separator=sep, ignore_errors=True, n_rows=10000)
|
|
105
117
|
elif file_path_lower.endswith(".parquet"):
|
|
106
118
|
try:
|
|
107
119
|
# Try scanning first (faster for large files)
|
|
@@ -133,10 +145,18 @@ def main():
|
|
|
133
145
|
column_count = len(df.columns)
|
|
134
146
|
|
|
135
147
|
# Duplicate detection (exact)
|
|
148
|
+
# NOTE: Some Polars versions can panic on is_duplicated() for nested/null rows.
|
|
149
|
+
# Use a Python fallback that is slower but robust for the 10k sampled rows.
|
|
150
|
+
duplicate_count = 0
|
|
136
151
|
try:
|
|
137
|
-
|
|
152
|
+
seen = set()
|
|
153
|
+
for row in df.to_dicts():
|
|
154
|
+
row_key = json.dumps(row, sort_keys=True, default=str)
|
|
155
|
+
if row_key in seen:
|
|
156
|
+
duplicate_count += 1
|
|
157
|
+
else:
|
|
158
|
+
seen.add(row_key)
|
|
138
159
|
except Exception:
|
|
139
|
-
# Duplicate check might fail on complex nested types (List, Struct)
|
|
140
160
|
duplicate_count = 0
|
|
141
161
|
|
|
142
162
|
columns_stats = []
|
|
@@ -165,12 +185,16 @@ def main():
|
|
|
165
185
|
if duplicate_count == 0 and len(text_cols) > 0:
|
|
166
186
|
# Pick longest text column as likely "content"
|
|
167
187
|
# In real impl, we'd use heuristics. For now, first text col.
|
|
168
|
-
target_col = text_cols[0]
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
188
|
+
target_col = text_cols[0]
|
|
189
|
+
try:
|
|
190
|
+
text_dupes = df.select(pl.col(target_col)).is_duplicated().sum()
|
|
191
|
+
if text_dupes > 0:
|
|
192
|
+
report["text_duplicates"] = int(text_dupes)
|
|
193
|
+
if text_dupes > (row_count * 0.2):
|
|
194
|
+
report["warnings"].append(f"High text duplication in '{target_col}' ({text_dupes} rows)")
|
|
195
|
+
except Exception:
|
|
196
|
+
# Skip text duplicate warning if backend cannot compute duplicates for this dtype
|
|
197
|
+
pass
|
|
174
198
|
|
|
175
199
|
# Integrity Check 2: Contamination / Leakage (Basic)
|
|
176
200
|
# (Skipping correlation for now)
|
package/src/scripts/wizard.js
DELETED
|
@@ -1,77 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
// Vesper Wizard CLI: Interactive setup for fast configuration
|
|
4
|
-
const inquirer = require('inquirer');
|
|
5
|
-
const fs = require('fs');
|
|
6
|
-
const path = require('path');
|
|
7
|
-
|
|
8
|
-
async function main() {
|
|
9
|
-
console.log('\nš§ Welcome to the Vesper Wizard!\n');
|
|
10
|
-
|
|
11
|
-
// Step 1: Project basics
|
|
12
|
-
const { projectName } = await inquirer.prompt([
|
|
13
|
-
{
|
|
14
|
-
type: 'input',
|
|
15
|
-
name: 'projectName',
|
|
16
|
-
message: 'Project name:',
|
|
17
|
-
default: path.basename(process.cwd()),
|
|
18
|
-
},
|
|
19
|
-
]);
|
|
20
|
-
|
|
21
|
-
// Step 2: Data directory
|
|
22
|
-
const { dataDir } = await inquirer.prompt([
|
|
23
|
-
{
|
|
24
|
-
type: 'input',
|
|
25
|
-
name: 'dataDir',
|
|
26
|
-
message: 'Path to your data directory:',
|
|
27
|
-
default: './datasets',
|
|
28
|
-
},
|
|
29
|
-
]);
|
|
30
|
-
|
|
31
|
-
// Step 3: Default export format
|
|
32
|
-
const { exportFormat } = await inquirer.prompt([
|
|
33
|
-
{
|
|
34
|
-
type: 'list',
|
|
35
|
-
name: 'exportFormat',
|
|
36
|
-
message: 'Default export format:',
|
|
37
|
-
choices: ['parquet', 'csv', 'feather'],
|
|
38
|
-
default: 'parquet',
|
|
39
|
-
},
|
|
40
|
-
]);
|
|
41
|
-
|
|
42
|
-
// Step 4: Add tokens/credentials
|
|
43
|
-
const { addTokens } = await inquirer.prompt([
|
|
44
|
-
{
|
|
45
|
-
type: 'confirm',
|
|
46
|
-
name: 'addTokens',
|
|
47
|
-
message: 'Would you like to add API tokens or credentials now?',
|
|
48
|
-
default: true,
|
|
49
|
-
},
|
|
50
|
-
]);
|
|
51
|
-
let tokens = {};
|
|
52
|
-
if (addTokens) {
|
|
53
|
-
const { kaggleToken } = await inquirer.prompt([
|
|
54
|
-
{
|
|
55
|
-
type: 'input',
|
|
56
|
-
name: 'kaggleToken',
|
|
57
|
-
message: 'Kaggle API token (leave blank to skip):',
|
|
58
|
-
},
|
|
59
|
-
]);
|
|
60
|
-
if (kaggleToken) tokens.kaggle = kaggleToken;
|
|
61
|
-
// Add more tokens as needed
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
// Step 5: Write config file
|
|
65
|
-
const config = {
|
|
66
|
-
project: projectName,
|
|
67
|
-
dataDir,
|
|
68
|
-
exportFormat,
|
|
69
|
-
tokens,
|
|
70
|
-
};
|
|
71
|
-
const configPath = path.join(process.cwd(), 'vesper-mcp-config.json');
|
|
72
|
-
fs.writeFileSync(configPath, JSON.stringify(config, null, 2));
|
|
73
|
-
console.log(`\nā
Configuration saved to ${configPath}`);
|
|
74
|
-
console.log('\nš Vesper is ready to use!\n');
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
main();
|