@vespermcp/mcp-server 1.2.21 → 1.2.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/README.md +49 -0
  2. package/build/cache/service.js +7 -0
  3. package/build/cloud/adapters/supabase.js +49 -0
  4. package/build/cloud/storage-manager.js +6 -0
  5. package/build/export/exporter.js +22 -9
  6. package/build/gateway/unified-dataset-gateway.js +441 -0
  7. package/build/index.js +1815 -839
  8. package/build/ingestion/ingestor.js +7 -4
  9. package/build/install/install-service.js +11 -6
  10. package/build/lib/supabase.js +3 -0
  11. package/build/metadata/arxiv-source.js +229 -0
  12. package/build/metadata/circuit-breaker.js +62 -0
  13. package/build/metadata/github-source.js +203 -0
  14. package/build/metadata/hackernews-source.js +123 -0
  15. package/build/metadata/quality.js +27 -0
  16. package/build/metadata/scraper.js +85 -14
  17. package/build/metadata/semantic-scholar-source.js +138 -0
  18. package/build/python/asset_downloader_engine.py +2 -0
  19. package/build/python/convert_engine.py +92 -0
  20. package/build/python/export_engine.py +45 -0
  21. package/build/python/kaggle_engine.py +77 -5
  22. package/build/python/normalize_engine.py +83 -0
  23. package/build/python/vesper/core/asset_downloader.py +5 -1
  24. package/build/scripts/test-phase1-webcore-quality.js +104 -0
  25. package/build/search/engine.js +45 -6
  26. package/build/search/jit-orchestrator.js +18 -14
  27. package/build/search/query-intent.js +509 -0
  28. package/build/tools/formatter.js +6 -3
  29. package/build/utils/python-runtime.js +130 -0
  30. package/build/web/extract-web.js +297 -0
  31. package/build/web/fusion-engine.js +457 -0
  32. package/build/web/types.js +1 -0
  33. package/build/web/web-core.js +242 -0
  34. package/package.json +12 -5
  35. package/scripts/postinstall.cjs +87 -31
  36. package/scripts/wizard.cjs +652 -0
  37. package/scripts/wizard.js +338 -12
  38. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  39. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  40. package/src/python/asset_downloader_engine.py +2 -0
  41. package/src/python/convert_engine.py +92 -0
  42. package/src/python/export_engine.py +45 -0
  43. package/src/python/kaggle_engine.py +77 -5
  44. package/src/python/normalize_engine.py +83 -0
  45. package/src/python/requirements.txt +12 -0
  46. package/src/python/vesper/core/asset_downloader.py +5 -1
  47. package/wizard.cjs +3 -0
package/build/index.js CHANGED
@@ -1,12 +1,39 @@
1
1
  #!/usr/bin/env node
2
2
  // --- Dataset ID Normalization ---
3
3
  function normalize_dataset_id(dataset_id) {
4
- // Remove kaggle: prefix for storage key
5
- let id = dataset_id.replace(/^kaggle:/, "");
4
+ const trimmed = dataset_id.trim();
5
+ const sourceMatch = trimmed.match(/^(kaggle|huggingface|hf|openml|dataworld):/i);
6
+ let id = trimmed.replace(/^(kaggle|huggingface|hf|openml|dataworld):/i, "");
6
7
  // Replace / and : with _ for filesystem safety
7
- id = id.replace(/[/:]/g, "_");
8
- // Always store and lookup using the same normalized format
9
- return dataset_id.startsWith("kaggle:") ? `kaggle_${id}` : id;
8
+ id = id.replace(/[\\/:]/g, "_");
9
+ if (!sourceMatch) {
10
+ return id;
11
+ }
12
+ const source = sourceMatch[1].toLowerCase() === "hf" ? "huggingface" : sourceMatch[1].toLowerCase();
13
+ return `${source}_${id}`;
14
+ }
15
+ function getDatasetIdAliases(dataset_id) {
16
+ const trimmed = dataset_id.trim();
17
+ const aliases = new Set([trimmed]);
18
+ const sourceMatch = trimmed.match(/^(kaggle|huggingface|hf|openml|dataworld):/i);
19
+ if (sourceMatch) {
20
+ const stripped = trimmed.replace(/^(kaggle|huggingface|hf|openml|dataworld):/i, "");
21
+ aliases.add(stripped);
22
+ if (sourceMatch[1].toLowerCase() === "hf") {
23
+ aliases.add(`huggingface:${stripped}`);
24
+ }
25
+ }
26
+ else {
27
+ aliases.add(`kaggle:${trimmed}`);
28
+ aliases.add(`huggingface:${trimmed}`);
29
+ aliases.add(`hf:${trimmed}`);
30
+ aliases.add(`openml:${trimmed}`);
31
+ aliases.add(`dataworld:${trimmed}`);
32
+ }
33
+ return Array.from(aliases);
34
+ }
35
+ function toSafeDatasetPathFragment(dataset_id) {
36
+ return normalize_dataset_id(dataset_id);
10
37
  }
11
38
  // --- Dataset Registry Helpers ---
12
39
  function getRegistryPath() {
@@ -29,10 +56,11 @@ function writeRegistry(entries) {
29
56
  fs.writeFileSync(registryPath, JSON.stringify(entries, null, 2));
30
57
  }
31
58
  function upsertRegistry(dataset_id, local_path, status) {
32
- const norm_id = normalize_dataset_id(dataset_id);
59
+ const aliases = getDatasetIdAliases(dataset_id).map(normalize_dataset_id);
60
+ const norm_id = aliases[0];
33
61
  console.error(`[Registry] Writing key: ${norm_id}`);
34
62
  const entries = readRegistry();
35
- const idx = entries.findIndex(e => e.dataset_id === norm_id);
63
+ const idx = entries.findIndex(e => aliases.includes(e.dataset_id || e.id));
36
64
  if (idx >= 0) {
37
65
  entries[idx] = { dataset_id: norm_id, local_path, status };
38
66
  }
@@ -42,9 +70,163 @@ function upsertRegistry(dataset_id, local_path, status) {
42
70
  writeRegistry(entries);
43
71
  }
44
72
  function getRegistryEntry(dataset_id) {
45
- const norm_id = normalize_dataset_id(dataset_id);
46
- console.error(`[Registry] Lookup key: ${norm_id}`);
47
- return readRegistry().find(e => (e.dataset_id || e.id) === norm_id);
73
+ const aliases = getDatasetIdAliases(dataset_id).map(normalize_dataset_id);
74
+ console.error(`[Registry] Lookup keys: ${aliases.join(", ")}`);
75
+ return readRegistry().find(e => aliases.includes((e.dataset_id || e.id)));
76
+ }
77
+ const STRUCTURED_FILE_EXTENSIONS = [".parquet", ".csv", ".jsonl", ".json", ".feather", ".arrow", ".tsv", ".txt"];
78
+ const IMAGE_FILE_EXTENSIONS = new Set([".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff", ".tif", ".svg"]);
79
+ function walkFilesRecursive(rootDir) {
80
+ const out = [];
81
+ const stack = [rootDir];
82
+ while (stack.length > 0) {
83
+ const currentDir = stack.pop();
84
+ const entries = fs.readdirSync(currentDir, { withFileTypes: true });
85
+ for (const entry of entries) {
86
+ const fullPath = path.join(currentDir, entry.name);
87
+ if (entry.isDirectory()) {
88
+ stack.push(fullPath);
89
+ }
90
+ else if (entry.isFile()) {
91
+ out.push(fullPath);
92
+ }
93
+ }
94
+ }
95
+ out.sort();
96
+ return out;
97
+ }
98
+ function inferImageManifestRecord(rootDir, fullPath, index) {
99
+ const relativePath = path.relative(rootDir, fullPath).replace(/\\/g, "/");
100
+ const parentDir = path.posix.dirname(relativePath);
101
+ const parts = parentDir.split("/").filter(part => part && part !== ".");
102
+ let split;
103
+ let label;
104
+ if (parts.length > 0) {
105
+ const first = parts[0].toLowerCase();
106
+ if (["train", "test", "val", "valid", "validation"].includes(first)) {
107
+ split = parts[0];
108
+ if (parts.length > 1) {
109
+ label = parts[parts.length - 1];
110
+ }
111
+ }
112
+ else {
113
+ label = parts[parts.length - 1];
114
+ }
115
+ }
116
+ return {
117
+ id: index,
118
+ image_path: path.resolve(fullPath),
119
+ relative_path: relativePath,
120
+ file_name: path.basename(fullPath),
121
+ extension: path.extname(fullPath).toLowerCase().replace(/^\./, ""),
122
+ ...(split ? { split } : {}),
123
+ ...(label ? { label } : {}),
124
+ };
125
+ }
126
+ function createImageManifestFromDirectory(rootDir) {
127
+ const imageFiles = walkFilesRecursive(rootDir).filter(filePath => IMAGE_FILE_EXTENSIONS.has(path.extname(filePath).toLowerCase()));
128
+ if (imageFiles.length === 0) {
129
+ throw new Error(`No image files found under ${rootDir}`);
130
+ }
131
+ const manifestPath = path.join(rootDir, "_vesper_image_manifest.jsonl");
132
+ const lines = imageFiles.map((filePath, index) => JSON.stringify(inferImageManifestRecord(rootDir, filePath, index)));
133
+ fs.writeFileSync(manifestPath, `${lines.join("\n")}\n`, "utf-8");
134
+ return manifestPath;
135
+ }
136
+ function ensureExportableLocalPath(localPath) {
137
+ if (!fs.existsSync(localPath)) {
138
+ throw new Error(`Local path not found: ${localPath}`);
139
+ }
140
+ const stats = fs.statSync(localPath);
141
+ if (stats.isFile()) {
142
+ return localPath;
143
+ }
144
+ const manifestPath = path.join(localPath, "_vesper_image_manifest.jsonl");
145
+ if (fs.existsSync(manifestPath)) {
146
+ return manifestPath;
147
+ }
148
+ const candidates = walkFilesRecursive(localPath);
149
+ for (const ext of STRUCTURED_FILE_EXTENSIONS) {
150
+ const match = candidates.find(candidate => path.extname(candidate).toLowerCase() === ext);
151
+ if (match) {
152
+ return match;
153
+ }
154
+ }
155
+ return createImageManifestFromDirectory(localPath);
156
+ }
157
+ function isPathWithinDirectory(candidatePath, directoryPath) {
158
+ const relativePath = path.relative(path.resolve(directoryPath), path.resolve(candidatePath));
159
+ return relativePath === "" || (!relativePath.startsWith("..") && !path.isAbsolute(relativePath));
160
+ }
161
+ function buildDatasetCandidatePaths(baseDir, safeId) {
162
+ return [
163
+ path.join(baseDir, `${safeId}.parquet`),
164
+ path.join(baseDir, `${safeId}.csv`),
165
+ path.join(baseDir, `${safeId}.jsonl`),
166
+ path.join(baseDir, `${safeId}.json`),
167
+ path.join(baseDir, `${safeId}.feather`),
168
+ path.join(baseDir, `${safeId}.arrow`),
169
+ path.join(baseDir, safeId),
170
+ ];
171
+ }
172
+ function shouldTrackExportPath(localPath) {
173
+ return isPathWithinDirectory(localPath, dataRoot);
174
+ }
175
+ function isDirectLocalDatasetReference(datasetIdOrPath) {
176
+ return fs.existsSync(datasetIdOrPath);
177
+ }
178
+ function getExportFileStem(datasetIdOrPath) {
179
+ if (isDirectLocalDatasetReference(datasetIdOrPath)) {
180
+ const resolvedPath = path.resolve(datasetIdOrPath);
181
+ const stats = fs.statSync(resolvedPath);
182
+ const baseName = stats.isDirectory()
183
+ ? path.basename(resolvedPath)
184
+ : path.parse(resolvedPath).name;
185
+ return toSafeDatasetPathFragment(baseName);
186
+ }
187
+ return toSafeDatasetPathFragment(datasetIdOrPath);
188
+ }
189
+ function ensureLocalPipelineSource(sourcePath, datasetId, targetDir) {
190
+ const resolvedTargetDir = path.resolve(targetDir);
191
+ const resolvedSourcePath = path.resolve(sourcePath);
192
+ if (path.dirname(resolvedSourcePath) === resolvedTargetDir) {
193
+ return resolvedSourcePath;
194
+ }
195
+ if (!fs.existsSync(resolvedTargetDir)) {
196
+ fs.mkdirSync(resolvedTargetDir, { recursive: true });
197
+ }
198
+ const stagedPath = path.join(resolvedTargetDir, `${toSafeDatasetPathFragment(datasetId)}${path.extname(resolvedSourcePath)}`);
199
+ if (resolvedSourcePath !== stagedPath) {
200
+ fs.copyFileSync(resolvedSourcePath, stagedPath);
201
+ }
202
+ return stagedPath;
203
+ }
204
+ function resolveDatasetLocalPath(datasetIdOrPath, preferredDirs = []) {
205
+ if (fs.existsSync(datasetIdOrPath)) {
206
+ return ensureExportableLocalPath(datasetIdOrPath);
207
+ }
208
+ const safeId = toSafeDatasetPathFragment(datasetIdOrPath);
209
+ const uniquePreferredDirs = Array.from(new Set(preferredDirs
210
+ .filter((dir) => typeof dir === "string" && dir.trim().length > 0)
211
+ .map(dir => path.resolve(dir))));
212
+ for (const preferredDir of uniquePreferredDirs) {
213
+ const localMatch = buildDatasetCandidatePaths(preferredDir, safeId).find(candidate => fs.existsSync(candidate));
214
+ if (localMatch) {
215
+ return ensureExportableLocalPath(localMatch);
216
+ }
217
+ }
218
+ const downloadStatus = metadataStore.getDownloadStatus(datasetIdOrPath);
219
+ if (downloadStatus?.local_path && fs.existsSync(downloadStatus.local_path)) {
220
+ return ensureExportableLocalPath(downloadStatus.local_path);
221
+ }
222
+ const reg = getRegistryEntry(datasetIdOrPath);
223
+ const regPath = reg?.local_path || reg?.path;
224
+ if (regPath && fs.existsSync(regPath)) {
225
+ return ensureExportableLocalPath(regPath);
226
+ }
227
+ const rawCandidates = buildDatasetCandidatePaths(path.join(dataRoot, "data", "raw"), safeId);
228
+ const match = rawCandidates.find(candidate => fs.existsSync(candidate));
229
+ return match ? ensureExportableLocalPath(match) : undefined;
48
230
  }
49
231
  // --- Pipeline State Tracker ---
50
232
  // Tracks completed steps per session/job/dataset
@@ -66,7 +248,7 @@ export function hasStep(datasetId, step) {
66
248
  // --- Dataset ID Auto-Detection ---
67
249
  export function parseDatasetId(id) {
68
250
  const trimmed = id.trim();
69
- if (/^(kaggle:|hf:|huggingface:|openml:|dataworld:|http|https):/i.test(trimmed))
251
+ if (/^(kaggle:|hf:|huggingface:|openml:|dataworld:|arxiv:|github:|http|https):/i.test(trimmed))
70
252
  return trimmed;
71
253
  if (trimmed.includes("/") && !trimmed.includes(":"))
72
254
  return `kaggle:${trimmed}`;
@@ -88,6 +270,14 @@ import { HuggingFaceScraper } from "./metadata/scraper.js";
88
270
  import { KaggleSource } from "./metadata/kaggle-source.js";
89
271
  import { OpenMLSource } from "./metadata/openml-source.js";
90
272
  import { DataWorldSource } from "./metadata/dataworld-source.js";
273
+ import { ArxivSource } from "./metadata/arxiv-source.js";
274
+ import { GithubSource } from "./metadata/github-source.js";
275
+ import { UnifiedDatasetGateway } from "./gateway/unified-dataset-gateway.js";
276
+ import { WebCoreEngine } from "./web/web-core.js";
277
+ import { WebFusionEngine } from "./web/fusion-engine.js";
278
+ import { WebExtractorEngine } from "./web/extract-web.js";
279
+ import { SemanticScholarSource } from "./metadata/semantic-scholar-source.js";
280
+ import { HackerNewsSource } from "./metadata/hackernews-source.js";
91
281
  import { formatSearchResults, formatDatasetInfo, formatJobStatus } from "./tools/formatter.js";
92
282
  import { JobManager } from "./jobs/manager.js";
93
283
  import { QualityAnalyzer } from "./quality/analyzer.js";
@@ -131,6 +321,34 @@ function logError(err, context) {
131
321
  fs.appendFileSync(errorLogPath, msg);
132
322
  console.error(`[Vesper] Critical error logged to ${errorLogPath}`);
133
323
  }
324
+ // --- Request Queue: serialize all MCP tool calls to prevent crashes ---
325
+ class RequestQueue {
326
+ queue = [];
327
+ running = false;
328
+ enqueue(task) {
329
+ return new Promise((resolve, reject) => {
330
+ this.queue.push({ resolve, reject, task });
331
+ this.drain();
332
+ });
333
+ }
334
+ async drain() {
335
+ if (this.running)
336
+ return;
337
+ this.running = true;
338
+ while (this.queue.length > 0) {
339
+ const item = this.queue.shift();
340
+ try {
341
+ const result = await item.task();
342
+ item.resolve(result);
343
+ }
344
+ catch (err) {
345
+ item.reject(err);
346
+ }
347
+ }
348
+ this.running = false;
349
+ }
350
+ }
351
+ const requestQueue = new RequestQueue();
134
352
  const SPINNER_FRAMES = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];
135
353
  function printLaunchScreen() {
136
354
  const screen = `
@@ -198,6 +416,21 @@ function extractRequestedRows(query, requirements) {
198
416
  if (Number.isFinite(n) && n > 0)
199
417
  return n;
200
418
  }
419
+ const commaNumbers = [...text.matchAll(/\b\d{1,3}(?:,\d{3})+\b/g)]
420
+ .map(m => Number(m[0].replace(/,/g, "")))
421
+ .filter(n => Number.isFinite(n) && n > 0);
422
+ if (commaNumbers.length > 0)
423
+ return Math.max(...commaNumbers);
424
+ const humanSized = [...text.matchAll(/\b(\d+(?:\.\d+)?)\s*([kmb])\b/gi)]
425
+ .map(m => {
426
+ const base = Number(m[1]);
427
+ const suffix = m[2].toLowerCase();
428
+ const multiplier = suffix === "k" ? 1_000 : suffix === "m" ? 1_000_000 : 1_000_000_000;
429
+ return Math.round(base * multiplier);
430
+ })
431
+ .filter(n => Number.isFinite(n) && n > 0);
432
+ if (humanSized.length > 0)
433
+ return Math.max(...humanSized);
201
434
  const allNums = [...text.matchAll(/\b\d{4,9}\b/g)]
202
435
  .map(m => Number(m[0]))
203
436
  .filter(n => Number.isFinite(n) && n > 0);
@@ -367,7 +600,45 @@ function syncPythonScripts(appRoot, dataRoot) {
367
600
  }
368
601
  // Sync scripts immediately
369
602
  syncPythonScripts(appRoot, dataRoot);
370
- const metadataStore = new MetadataStore(dbPath);
603
+ // Auto-rebuild better-sqlite3 if native binary doesn't match current Node version
604
+ function tryRebuildSqlite() {
605
+ try {
606
+ const { execSync } = require("child_process");
607
+ const pkgRoot = path.resolve(__dirname, "..");
608
+ console.error("[Vesper] Rebuilding better-sqlite3 for Node " + process.version + "...");
609
+ execSync("npm rebuild better-sqlite3", {
610
+ stdio: "pipe",
611
+ timeout: 60000,
612
+ cwd: pkgRoot,
613
+ });
614
+ console.error("[Vesper] Rebuild succeeded. Retrying...");
615
+ // Clear require cache so the rebuilt module is loaded
616
+ for (const key of Object.keys(require.cache)) {
617
+ if (key.includes("better-sqlite3") || key.includes("better_sqlite3")) {
618
+ delete require.cache[key];
619
+ }
620
+ }
621
+ return true;
622
+ }
623
+ catch (e) {
624
+ console.error("[Vesper] Auto-rebuild failed: " + (e?.message || e));
625
+ return false;
626
+ }
627
+ }
628
+ let metadataStore;
629
+ try {
630
+ metadataStore = new MetadataStore(dbPath);
631
+ }
632
+ catch (e) {
633
+ if (e?.code === "ERR_DLOPEN_FAILED" && tryRebuildSqlite()) {
634
+ metadataStore = new MetadataStore(dbPath);
635
+ }
636
+ else {
637
+ console.error("[Vesper] FATAL: Cannot load better-sqlite3.");
638
+ console.error("[Vesper] Run: npm rebuild better-sqlite3");
639
+ throw e;
640
+ }
641
+ }
371
642
  const vectorStore = new VectorStore(vectorPath);
372
643
  const embedder = Embedder.getInstance();
373
644
  const searchEngine = new SearchEngine(metadataStore, vectorStore, embedder);
@@ -382,7 +653,16 @@ const dataSplitter = new DataSplitter(__dirname);
382
653
  const dataExporter = new DataExporter(__dirname);
383
654
  const fusionEngine = new DataFusionEngine(__dirname);
384
655
  const kaggleSource = new KaggleSource(__dirname);
656
+ const openmlSource = new OpenMLSource(__dirname);
657
+ const dataworldSource = new DataWorldSource(__dirname);
658
+ const arxivSource = new ArxivSource(cacheService);
659
+ const githubSource = new GithubSource(cacheService);
385
660
  const secureKeys = new SecureKeysManager(__dirname);
661
+ const semanticScholarSource = new SemanticScholarSource(cacheService);
662
+ const hackerNewsSource = new HackerNewsSource(cacheService);
663
+ const webCoreEngine = new WebCoreEngine({ arxivSource, githubSource, semanticScholarSource, hackerNewsSource });
664
+ const webFusionEngine = new WebFusionEngine({ webCoreEngine, embedder, cache: cacheService });
665
+ const webExtractorEngine = new WebExtractorEngine(cacheService);
386
666
  function hydrateExternalKeys() {
387
667
  const keys = secureKeys.getAll();
388
668
  if (!process.env.HF_TOKEN && !process.env.HUGGINGFACE_TOKEN && keys.hf_token) {
@@ -401,6 +681,17 @@ function hydrateExternalKeys() {
401
681
  function hasDataWorldToken() {
402
682
  return !!(process.env.DW_AUTH_TOKEN || secureKeys.getAll().dataworld_token);
403
683
  }
684
+ const unifiedDatasetGateway = new UnifiedDatasetGateway({
685
+ metadataStore,
686
+ dataIngestor,
687
+ dataRoot,
688
+ kaggleSource,
689
+ openmlSource,
690
+ dataworldSource,
691
+ arxivSource,
692
+ githubSource,
693
+ hasDataWorldToken,
694
+ });
404
695
  // CRITICAL FIX: Pass __dirname (build directory) to analyzers
405
696
  // Python scripts are in build/python/, so analyzers should look relative to build/
406
697
  // NOT relative to project root (appRoot)
@@ -432,7 +723,7 @@ jobManager.on("processJob", async (job, execute) => {
432
723
  console.error(`[Vesper] Starting prepareDatasetTask for job ${job.id}...`);
433
724
  const metadata = job.metadata ? JSON.parse(job.metadata) : {};
434
725
  switch (job.type) {
435
- case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements);
726
+ case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements, metadata.outputDir);
436
727
  case "clean": return await handleCleanJob(job.id, metadata.datasetId, metadata.ops);
437
728
  default: throw new Error(`Unhandled job type: ${job.type}`);
438
729
  }
@@ -450,9 +741,21 @@ jobManager.on("processJob", async (job, execute) => {
450
741
  /**
451
742
  * Logic for preparing a dataset (Search + Ingest + Process)
452
743
  */
453
- async function handlePrepareJob(jobId, query, requirements) {
744
+ async function handlePrepareJob(jobId, query, requirements, outputDir) {
454
745
  hydrateExternalKeys();
455
746
  const update = (updates) => jobManager.updateJob(jobId, updates);
747
+ const pipelineSteps = ["search", "validate", "download", "normalize", "quality", "register"];
748
+ const stepStatus = {};
749
+ for (const s of pipelineSteps)
750
+ stepStatus[s] = "pending";
751
+ const markPipelineStep = (step, status) => {
752
+ stepStatus[step] = status;
753
+ const summary = pipelineSteps.map(s => {
754
+ const st = stepStatus[s];
755
+ return st === "done" ? `[${s}]` : st === "running" ? `>${s}<` : st === "failed" ? `!${s}!` : st === "skipped" ? `~${s}~` : ` ${s} `;
756
+ }).join(" → ");
757
+ console.error(`[Pipeline] ${summary}`);
758
+ };
456
759
  // Ensure core Python packages are available for dataset operations
457
760
  try {
458
761
  await ensurePythonModules([
@@ -465,11 +768,12 @@ async function handlePrepareJob(jobId, query, requirements) {
465
768
  // Continue anyway - direct file downloads may still work without datasets lib
466
769
  }
467
770
  const requestedRows = extractRequestedRows(query, requirements);
771
+ const searchQuery = requirements ? `${query} ${requirements}` : query;
468
772
  let selectedDataset;
469
773
  let datasetIdForDownload = "";
470
774
  let source;
471
775
  const parsedQuery = parseDatasetId(query);
472
- const isExplicitDatasetRef = /^(kaggle:|hf:|huggingface:|openml:|dataworld:)/i.test(parsedQuery) || (query.includes("/") && !query.includes(" "));
776
+ const isExplicitDatasetRef = /^(kaggle:|hf:|huggingface:|openml:|dataworld:|arxiv:|github:)/i.test(parsedQuery) || (query.includes("/") && !query.includes(" "));
473
777
  if (isExplicitDatasetRef) {
474
778
  let explicitId = parsedQuery;
475
779
  if (/^hf:/i.test(explicitId)) {
@@ -491,6 +795,12 @@ async function handlePrepareJob(jobId, query, requirements) {
491
795
  source = "dataworld";
492
796
  datasetIdForDownload = explicitId.replace(/^dataworld:/i, "");
493
797
  }
798
+ else if (/^arxiv:/i.test(explicitId)) {
799
+ throw new Error("prepare_dataset does not support direct arXiv downloads yet. Use unified_dataset_api with operation='discover' or 'info' for arXiv.");
800
+ }
801
+ else if (/^github:/i.test(explicitId)) {
802
+ throw new Error("prepare_dataset does not support direct GitHub downloads yet. Use unified_dataset_api with operation='discover' or 'info' for GitHub.");
803
+ }
494
804
  else {
495
805
  // Default to HuggingFace for ambiguous refs (user/dataset without prefix)
496
806
  source = "huggingface";
@@ -500,11 +810,14 @@ async function handlePrepareJob(jobId, query, requirements) {
500
810
  progress: 20,
501
811
  status_text: `Using explicit dataset id: ${datasetIdForDownload} (${source})`
502
812
  });
813
+ markPipelineStep("search", "skipped");
503
814
  }
504
815
  else {
816
+ markPipelineStep("search", "running");
505
817
  update({ progress: 10, status_text: "Searching for best dataset matching query..." });
506
- const results = await searchEngine.search(query, { limit: 10 });
818
+ const results = await searchEngine.search(searchQuery, { limit: 10 });
507
819
  if (results.length === 0) {
820
+ markPipelineStep("search", "failed");
508
821
  throw new Error("No datasets found matching the query. Try refining your search terms.");
509
822
  }
510
823
  // Pick the best result that we can actually download (skip sources requiring missing credentials)
@@ -512,20 +825,32 @@ async function handlePrepareJob(jobId, query, requirements) {
512
825
  const hasDwToken = hasDataWorldToken();
513
826
  selectedDataset = results.find(r => {
514
827
  const s = (r.source || "").toLowerCase();
828
+ if (s === "arxiv")
829
+ return false; // Phase 1: discover/info only, no direct download yet
830
+ if (s === "github")
831
+ return false; // Phase 1: discover/info only, no direct download yet
515
832
  if (s === "kaggle" && !hasKaggleCreds)
516
833
  return false;
517
834
  if (s === "dataworld" && !hasDwToken)
518
835
  return false;
519
836
  return true;
520
837
  }) || results[0]; // Fallback to first if all require credentials
838
+ if ((selectedDataset.source || "").toLowerCase() === "arxiv") {
839
+ throw new Error("Matched an arXiv paper, but prepare_dataset currently supports downloadable dataset providers only.");
840
+ }
841
+ if ((selectedDataset.source || "").toLowerCase() === "github") {
842
+ throw new Error("Matched a GitHub repo, but prepare_dataset currently supports downloadable dataset providers only.");
843
+ }
521
844
  datasetIdForDownload = selectedDataset.id;
522
845
  source = selectedDataset.source;
523
846
  update({
524
847
  progress: 20,
525
848
  status_text: `Matched: ${selectedDataset.name} (${selectedDataset.source})`
526
849
  });
850
+ markPipelineStep("search", "done");
527
851
  }
528
852
  // Pre-check credentials for sources that require them
853
+ markPipelineStep("validate", "running");
529
854
  if (source === "kaggle") {
530
855
  const hasKaggleCreds = !!(process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY && process.env.KAGGLE_USERNAME !== "YOUR_KAGGLE_USERNAME");
531
856
  if (!hasKaggleCreds) {
@@ -533,8 +858,11 @@ async function handlePrepareJob(jobId, query, requirements) {
533
858
  }
534
859
  }
535
860
  if (source === "dataworld" && !hasDataWorldToken()) {
861
+ markPipelineStep("validate", "failed");
536
862
  throw new Error("data.world token not set. Use the configure_keys tool or set DW_AUTH_TOKEN environment variable.");
537
863
  }
864
+ markPipelineStep("validate", "done");
865
+ markPipelineStep("download", "running");
538
866
  update({ progress: 30, status_text: `Starting download from ${source}...` });
539
867
  // ensureData handles download and returns path to the raw file
540
868
  let rawFilePath = await dataIngestor.ensureData(datasetIdForDownload, source, (msg, prog) => {
@@ -545,7 +873,7 @@ async function handlePrepareJob(jobId, query, requirements) {
545
873
  let currentRows = await countRows(rawFilePath);
546
874
  if (currentRows < requestedRows) {
547
875
  update({ progress: 64, status_text: `Only ${currentRows.toLocaleString()} rows found. Fetching more matching datasets...` });
548
- const additional = await searchEngine.search(query, { limit: 8 });
876
+ const additional = await searchEngine.search(searchQuery, { limit: 8 });
549
877
  const sourceFiles = [rawFilePath];
550
878
  let totalRows = currentRows;
551
879
  for (const ds of additional) {
@@ -597,15 +925,50 @@ async function handlePrepareJob(jobId, query, requirements) {
597
925
  update({ progress: 69, status_text: `Sample target met: ${currentRows.toLocaleString()} rows` });
598
926
  }
599
927
  }
928
+ markPipelineStep("download", "done");
929
+ // ── Normalize step: convert any raw format → parquet ──
930
+ markPipelineStep("normalize", "running");
931
+ const rawExt = path.extname(rawFilePath).toLowerCase();
932
+ if (rawExt !== ".parquet" && rawExt !== ".pq") {
933
+ update({ progress: 70, status_text: "Normalizing to parquet..." });
934
+ const normalizedDir = path.join(dataRoot, "data", "normalized");
935
+ if (!fs.existsSync(normalizedDir))
936
+ fs.mkdirSync(normalizedDir, { recursive: true });
937
+ const safeId = toSafeDatasetPathFragment(datasetIdForDownload);
938
+ const normalizedPath = path.join(normalizedDir, `${safeId}.parquet`);
939
+ try {
940
+ const normScript = path.join(dataRoot, "python", "normalize_engine.py");
941
+ const normResult = await runPythonJson(normScript, [rawFilePath, normalizedPath]);
942
+ if (normResult.ok && fs.existsSync(normalizedPath)) {
943
+ console.error(`[Prepare] Normalized ${rawExt} → parquet (${normResult.rows} rows)`);
944
+ rawFilePath = normalizedPath;
945
+ markPipelineStep("normalize", "done");
946
+ }
947
+ else {
948
+ console.error(`[Prepare] Normalize failed: ${normResult.error}, continuing with raw file`);
949
+ markPipelineStep("normalize", "skipped");
950
+ }
951
+ }
952
+ catch (e) {
953
+ console.error(`[Prepare] Normalize step failed: ${e?.message || e}, continuing with raw file`);
954
+ markPipelineStep("normalize", "skipped");
955
+ }
956
+ }
957
+ else {
958
+ markPipelineStep("normalize", "done");
959
+ }
600
960
  let qualityScore = selectedDataset?.quality_score ?? 70;
601
- update({ progress: 70, status_text: "Analyzing dataset quality..." });
961
+ markPipelineStep("quality", "running");
962
+ update({ progress: 75, status_text: "Analyzing dataset quality..." });
602
963
  try {
603
964
  const report = await qualityAnalyzer.analyze(rawFilePath);
604
965
  qualityScore = report.overall_score;
966
+ markPipelineStep("quality", "done");
605
967
  }
606
968
  catch (error) {
607
969
  console.error(`[Prepare] Quality analysis failed for ${datasetIdForDownload}: ${error?.message || error}`);
608
970
  update({ progress: 78, status_text: "Quality analysis skipped (unsupported schema). Continuing installation..." });
971
+ markPipelineStep("quality", "skipped");
609
972
  }
610
973
  if (selectedDataset) {
611
974
  metadataStore.saveDataset({
@@ -613,15 +976,62 @@ async function handlePrepareJob(jobId, query, requirements) {
613
976
  quality_score: qualityScore
614
977
  });
615
978
  }
979
+ else {
980
+ // Even for explicit dataset refs, save minimal metadata so get_dataset_info can find it
981
+ try {
982
+ const existingMeta = metadataStore.getDataset(datasetIdForDownload);
983
+ if (!existingMeta) {
984
+ metadataStore.saveDataset({
985
+ id: datasetIdForDownload,
986
+ source: source,
987
+ name: datasetIdForDownload.split("/").pop() || datasetIdForDownload,
988
+ description: `Dataset prepared from ${source}:${datasetIdForDownload}`,
989
+ quality_warnings: [],
990
+ downloads: 0,
991
+ likes: 0,
992
+ stars: 0,
993
+ tags: [],
994
+ last_updated: new Date().toISOString(),
995
+ task: "unknown",
996
+ domain: "unknown",
997
+ languages: [],
998
+ splits: [],
999
+ license: { id: "unknown", category: "unknown", usage_restrictions: [], warnings: [] },
1000
+ quality_score: qualityScore,
1001
+ download_url: source === "huggingface" ? `https://huggingface.co/datasets/${datasetIdForDownload}` : "",
1002
+ total_examples: 0,
1003
+ is_structured: false,
1004
+ has_target_column: false,
1005
+ is_safe_source: true,
1006
+ has_personal_data: false,
1007
+ is_paywalled: false,
1008
+ is_scraped_web_data: false,
1009
+ uses_https: true,
1010
+ has_train_split: false,
1011
+ has_test_split: false,
1012
+ has_validation_split: false,
1013
+ description_length: 0,
1014
+ has_readme: false,
1015
+ });
1016
+ }
1017
+ }
1018
+ catch (e) {
1019
+ console.error(`[Prepare] Failed to save minimal metadata for ${datasetIdForDownload}: ${e?.message || e}`);
1020
+ }
1021
+ }
1022
+ markPipelineStep("register", "running");
616
1023
  update({ progress: 85, status_text: "Installing dataset into project..." });
617
- const installPath = await installService.install(datasetIdForDownload, rawFilePath);
1024
+ const installPath = await installService.install(datasetIdForDownload, rawFilePath, outputDir);
618
1025
  update({ progress: 100, status_text: "Preparation complete!" });
619
1026
  // Register prepared dataset in local registry for lookup by export/list tools
620
1027
  try {
621
1028
  upsertRegistry(datasetIdForDownload, installPath, "completed");
1029
+ markPipelineStep("register", "done");
1030
+ markStepComplete(datasetIdForDownload, "prepare");
622
1031
  }
623
1032
  catch (e) {
624
1033
  console.error(`[Registry] Failed to write registry for ${datasetIdForDownload}: ${e?.message || e}`);
1034
+ markPipelineStep("register", "failed");
625
1035
  }
626
1036
  return installPath;
627
1037
  }
@@ -647,7 +1057,7 @@ async function handleCleanJob(jobId, datasetId, ops) {
647
1057
  }
648
1058
  // 3. Check standard raw data paths
649
1059
  if (!filePath) {
650
- const safeId = datasetId.replace(/\//g, "_");
1060
+ const safeId = toSafeDatasetPathFragment(datasetId);
651
1061
  const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
652
1062
  const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
653
1063
  const featherPath = path.join(dataRoot, "data", "raw", `${safeId}.feather`);
@@ -712,9 +1122,146 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
712
1122
  required: ["query"],
713
1123
  },
714
1124
  },
1125
+ {
1126
+ name: "unified_dataset_api",
1127
+ description: "Single facade over multiple external dataset providers. Supports provider discovery, dataset search, dataset download, and dataset info through one MCP tool using public access and server-managed credentials when available.",
1128
+ inputSchema: {
1129
+ type: "object",
1130
+ properties: {
1131
+ operation: {
1132
+ type: "string",
1133
+ enum: ["providers", "discover", "download", "info"],
1134
+ description: "Gateway operation to execute.",
1135
+ },
1136
+ source: {
1137
+ type: "string",
1138
+ enum: ["auto", "huggingface", "kaggle", "openml", "dataworld", "arxiv", "github", "s3", "bigquery"],
1139
+ description: "Optional provider selector. Use 'auto' to let Vesper choose a compatible backend.",
1140
+ },
1141
+ query: {
1142
+ type: "string",
1143
+ description: "Dataset discovery query. Required for operation='discover'.",
1144
+ },
1145
+ dataset_id: {
1146
+ type: "string",
1147
+ description: "Dataset identifier or object reference. Required for operation='download' and operation='info'. Supports prefixed ids like 'huggingface:user/dataset' and public S3 URIs like 's3://bucket/key'.",
1148
+ },
1149
+ limit: {
1150
+ type: "number",
1151
+ description: "Max results for operation='discover' (default: 10).",
1152
+ },
1153
+ target_dir: {
1154
+ type: "string",
1155
+ description: "Optional output directory for operation='download'. Defaults to the current working directory when omitted.",
1156
+ },
1157
+ output_dir: {
1158
+ type: "string",
1159
+ description: "Alias for target_dir. Defaults to the current working directory when omitted.",
1160
+ },
1161
+ public_only: {
1162
+ type: "boolean",
1163
+ description: "When true, discover/info stay on public providers only unless a specific source is requested.",
1164
+ },
1165
+ include_unavailable: {
1166
+ type: "boolean",
1167
+ description: "When true, operation='providers' includes connectors that are scaffolded but not currently configured.",
1168
+ },
1169
+ },
1170
+ required: ["operation"],
1171
+ },
1172
+ },
1173
+ {
1174
+ name: "vesper_web_find",
1175
+ description: "Phase 1 Web Core: search web-native sources (ArXiv, GitHub) and return structured, validated documents using a unified schema (source_type, source_url, content, metadata_json, quality_score, collected_at, content_type).",
1176
+ inputSchema: {
1177
+ type: "object",
1178
+ properties: {
1179
+ query: { type: "string", description: "Natural language query, e.g. 'agentic RAG evaluation'" },
1180
+ sources: {
1181
+ type: "array",
1182
+ items: { type: "string", enum: ["arxiv", "github", "semantic_scholar", "hackernews"] },
1183
+ description: "Optional subset of sources. Defaults to ['arxiv','github'] when omitted.",
1184
+ },
1185
+ limit: { type: "number", description: "Max documents to return (default 10, max 50)." },
1186
+ arxiv_full_text: { type: "boolean", description: "When true, fetch and parse ArXiv PDFs and return full text as document content (slower)." },
1187
+ github_include_readme: { type: "boolean", description: "When true, fetch and include GitHub README.md text as document content (slower)." },
1188
+ },
1189
+ required: ["query"],
1190
+ },
1191
+ },
1192
+ {
1193
+ name: "vesper.fuse",
1194
+ description: "Phase 2 Data Fusion: fuse results from multiple web-native sources into one unified, deduplicated corpus (provenance via source_chain).",
1195
+ inputSchema: {
1196
+ type: "object",
1197
+ properties: {
1198
+ sources: {
1199
+ type: "array",
1200
+ description: "Web sources to collect from, each with its own query.",
1201
+ items: {
1202
+ type: "object",
1203
+ properties: {
1204
+ type: { type: "string", enum: ["arxiv", "github", "semantic_scholar", "hackernews", "s3"] },
1205
+ query: { type: "string", description: "Query for this source." },
1206
+ max_results: { type: "number", description: "Max results for this source (optional)." },
1207
+ min_stars: { type: "number", description: "Optional popularity filter (GitHub) based on stars/proxy fields." },
1208
+ bucket: { type: "string", description: "S3 bucket (for type='s3')." },
1209
+ path: { type: "string", description: "S3 prefix/path (for type='s3')." },
1210
+ region: { type: "string", description: "AWS region (for type='s3')." },
1211
+ credentials: {
1212
+ type: "object",
1213
+ description: "Pass-through AWS credentials (optional; not persisted).",
1214
+ properties: {
1215
+ accessKeyId: { type: "string" },
1216
+ secretAccessKey: { type: "string" },
1217
+ sessionToken: { type: "string" },
1218
+ roleArn: { type: "string" },
1219
+ }
1220
+ },
1221
+ },
1222
+ required: ["type", "query"],
1223
+ },
1224
+ },
1225
+ merge_strategy: {
1226
+ type: "string",
1227
+ enum: ["union", "dedup"],
1228
+ description: "How to merge collected documents.",
1229
+ },
1230
+ deduplication: {
1231
+ type: "string",
1232
+ enum: ["semantic", "exact", "none"],
1233
+ description: "How to deduplicate across sources.",
1234
+ },
1235
+ },
1236
+ required: ["sources"],
1237
+ },
1238
+ },
1239
+ {
1240
+ name: "vesper.extract_web",
1241
+ description: "Phase 3 Structured Web extraction. Whitelist-only domains, deterministic extraction (tables/lists/infobox), schema validation, and cache fallback on live extraction failure.",
1242
+ inputSchema: {
1243
+ type: "object",
1244
+ properties: {
1245
+ url: { type: "string", description: "Target URL from approved whitelist domains." },
1246
+ mode: { type: "string", enum: ["auto", "table", "list", "infobox"], description: "Extraction mode (default auto)." },
1247
+ strict_schema: { type: "boolean", description: "When true (default), enforce domain-specific required fields." },
1248
+ schema: {
1249
+ type: "object",
1250
+ properties: {
1251
+ required_fields: {
1252
+ type: "array",
1253
+ items: { type: "string" },
1254
+ description: "Optional required top-level fields in extracted data payload."
1255
+ }
1256
+ }
1257
+ }
1258
+ },
1259
+ required: ["url"],
1260
+ },
1261
+ },
715
1262
  {
716
1263
  name: "discover_datasets",
717
- description: "Discover datasets from a specific source. Kaggle is optional and requires user-provided API key.",
1264
+ description: "Discover datasets from a specific source. Public providers work keylessly; Kaggle and data.world can also be exposed through server-managed credentials.",
718
1265
  inputSchema: {
719
1266
  type: "object",
720
1267
  properties: {
@@ -724,7 +1271,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
724
1271
  },
725
1272
  source: {
726
1273
  type: "string",
727
- enum: ["huggingface", "kaggle", "openml", "dataworld"],
1274
+ enum: ["huggingface", "kaggle", "openml", "dataworld", "arxiv", "github"],
728
1275
  description: "Data source to discover from.",
729
1276
  },
730
1277
  limit: {
@@ -737,7 +1284,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
737
1284
  },
738
1285
  {
739
1286
  name: "download_dataset",
740
- description: "Download a dataset by source and ID/slug into local Vesper storage. Defaults to HuggingFace. Kaggle and data.world require API keys (use configure_keys first).",
1287
+ description: "Download a dataset by source and ID/slug into a local directory. Defaults to HuggingFace. Public providers work keylessly, while Kaggle and data.world can use server-managed credentials when configured.",
741
1288
  inputSchema: {
742
1289
  type: "object",
743
1290
  properties: {
@@ -752,7 +1299,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
752
1299
  },
753
1300
  target_dir: {
754
1301
  type: "string",
755
- description: "Optional target directory for downloaded files.",
1302
+ description: "Optional target directory for downloaded files. Defaults to the current working directory when omitted.",
1303
+ },
1304
+ output_dir: {
1305
+ type: "string",
1306
+ description: "Alias for target_dir. Defaults to the current working directory when omitted.",
756
1307
  }
757
1308
  },
758
1309
  required: ["dataset_id"],
@@ -770,6 +1321,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
770
1321
  kaggle_ref: { type: "string", description: "Kaggle dataset ref (owner/dataset)." },
771
1322
  urls: { type: "array", items: { type: "string" }, description: "Direct asset URLs." },
772
1323
  output_format: { type: "string", enum: ["webdataset", "imagefolder", "parquet"], description: "Output asset format." },
1324
+ target_dir: { type: "string", description: "Optional local directory where downloaded assets should be written. If provided, Vesper writes directly to this directory instead of managed asset storage." },
1325
+ output_dir: { type: "string", description: "Alias for target_dir. When provided, downloaded assets are written directly to this local directory." },
773
1326
  max_items: { type: "number", description: "Optional cap on number of assets to fetch." },
774
1327
  workers: { type: "number", description: "Parallel worker count (default 8)." },
775
1328
  image_column: { type: "string", description: "Explicit image column name. If omitted, auto-detected from HF features, column names, and sample values." },
@@ -877,6 +1430,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
877
1430
  properties: {
878
1431
  query: { type: "string" },
879
1432
  requirements: { type: "string" },
1433
+ target_dir: { type: "string", description: "Optional local directory for the prepared dataset. Defaults to the current working directory when omitted." },
1434
+ output_dir: { type: "string", description: "Alias for target_dir. Defaults to the current working directory when omitted." },
880
1435
  download_images: { type: "boolean", description: "When true, enables post-prepare smart asset downloading for image/media datasets." },
881
1436
  cleaning_options: { type: "object" },
882
1437
  split_config: { type: "object" },
@@ -921,7 +1476,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
921
1476
  },
922
1477
  target_dir: {
923
1478
  type: "string",
924
- description: "Optional custom local directory for export (e.g., './naruto-quotes').",
1479
+ description: "Optional custom local directory for export. Defaults to the current working directory when omitted.",
1480
+ },
1481
+ output_dir: {
1482
+ type: "string",
1483
+ description: "Alias for target_dir. Defaults to the current working directory when omitted.",
925
1484
  },
926
1485
  format: {
927
1486
  type: "string",
@@ -962,6 +1521,25 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
962
1521
  properties: {},
963
1522
  },
964
1523
  },
1524
+ {
1525
+ name: "vesper_convert_format",
1526
+ description: "Convert a dataset file between formats (CSV, Parquet, JSON, JSONL). Auto-detects input format from extension. Saves output in the same directory with the new extension and registers it in the Vesper registry.",
1527
+ inputSchema: {
1528
+ type: "object",
1529
+ properties: {
1530
+ file_path: {
1531
+ type: "string",
1532
+ description: "Absolute path to the input dataset file.",
1533
+ },
1534
+ target_format: {
1535
+ type: "string",
1536
+ enum: ["csv", "parquet", "json", "jsonl"],
1537
+ description: "The desired output format.",
1538
+ },
1539
+ },
1540
+ required: ["file_path", "target_format"],
1541
+ },
1542
+ },
965
1543
  {
966
1544
  name: "fuse_datasets",
967
1545
  description: "Fuse/combine multiple datasets via concat or join. Optionally runs quality & leakage checks afterward.",
@@ -1069,925 +1647,1225 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1069
1647
  ],
1070
1648
  };
1071
1649
  });
1072
- // Call Tool
1650
+ // Call Tool — all requests are serialized through a queue to prevent crashes from parallel calls
1073
1651
  server.setRequestHandler(CallToolRequestSchema, async (request) => {
1074
- // --- Pipeline Enforcement ---
1075
- // Map tool names to pipeline steps
1076
- const toolToStep = {
1077
- vesper_search: "search",
1078
- vesper_download: "download",
1079
- vesper_analyze: "analyze",
1080
- vesper_clean: "clean",
1081
- vesper_split: "split",
1082
- vesper_export: "export",
1083
- prepare_dataset: "prepare",
1084
- };
1085
- // Extract dataset_id if present and normalize
1086
- let datasetId = request.params.arguments?.dataset_id || request.params.arguments?.query || "";
1087
- if (datasetId)
1088
- datasetId = parseDatasetId(String(datasetId));
1089
- // Pipeline rules
1090
- const stepOrder = ["search", "download", "analyze", "clean", "split", "export"];
1091
- const prereqs = {
1092
- vesper_download: ["search"],
1093
- vesper_analyze: ["download"],
1094
- vesper_clean: ["analyze"],
1095
- vesper_split: ["clean"],
1096
- vesper_export: ["split"],
1097
- };
1098
- const tool = String(request.params.name);
1099
- const step = toolToStep[tool];
1100
- if (step && datasetId) {
1101
- // Check prerequisites
1102
- const required = prereqs[tool] || [];
1103
- for (const req of required) {
1104
- if (!hasStep(String(datasetId), req)) {
1105
- // Auto-run missing step if possible, else error
1106
- // For export, auto-run prepare_dataset if split missing
1107
- if (tool === "vesper_export" && req === "split") {
1108
- // Auto-trigger prepare_dataset (start a background prepare job)
1109
- try {
1110
- jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false });
1111
- // Mark split as complete so export can proceed; export handler will also wait for data if needed.
1112
- markStepComplete(String(datasetId), "split");
1652
+ return requestQueue.enqueue(async () => {
1653
+ // --- Pipeline Enforcement ---
1654
+ // Map tool names to pipeline steps
1655
+ const toolToStep = {
1656
+ vesper_search: "search",
1657
+ vesper_download: "download",
1658
+ vesper_analyze: "analyze",
1659
+ vesper_clean: "clean",
1660
+ vesper_split: "split",
1661
+ vesper_export: "export",
1662
+ prepare_dataset: "prepare",
1663
+ };
1664
+ // Extract dataset_id if present and normalize
1665
+ let datasetId = request.params.arguments?.dataset_id || request.params.arguments?.query || "";
1666
+ if (datasetId)
1667
+ datasetId = parseDatasetId(String(datasetId));
1668
+ // Pipeline rules
1669
+ const stepOrder = ["search", "download", "analyze", "clean", "split", "export"];
1670
+ const prereqs = {
1671
+ vesper_download: ["search"],
1672
+ vesper_analyze: ["download"],
1673
+ vesper_clean: ["analyze"],
1674
+ vesper_split: ["clean"],
1675
+ vesper_export: ["split"],
1676
+ };
1677
+ const tool = String(request.params.name);
1678
+ const step = toolToStep[tool];
1679
+ if (step && datasetId) {
1680
+ // Check prerequisites
1681
+ const required = prereqs[tool] || [];
1682
+ for (const req of required) {
1683
+ if (!hasStep(String(datasetId), req)) {
1684
+ // Auto-run missing step if possible, else error
1685
+ // For export, auto-run prepare_dataset if split missing
1686
+ if (tool === "vesper_export" && req === "split") {
1687
+ // Auto-trigger prepare_dataset (start a background prepare job)
1688
+ try {
1689
+ jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false, outputDir: process.cwd() });
1690
+ // Mark split as complete so export can proceed; export handler will also wait for data if needed.
1691
+ markStepComplete(String(datasetId), "split");
1692
+ }
1693
+ catch (e) {
1694
+ console.error(`[Pipeline] Failed to auto-trigger prepare for ${datasetId}: ${e?.message || e}`);
1695
+ return {
1696
+ content: [{ type: "text", text: `ERROR: Failed to auto-run prepare for ${datasetId}. Please run prepare_dataset first.` }],
1697
+ isError: true,
1698
+ };
1699
+ }
1113
1700
  }
1114
- catch (e) {
1115
- console.error(`[Pipeline] Failed to auto-trigger prepare for ${datasetId}: ${e?.message || e}`);
1701
+ else {
1116
1702
  return {
1117
- content: [{ type: "text", text: `ERROR: Failed to auto-run prepare for ${datasetId}. Please run prepare_dataset first.` }],
1703
+ content: [{ type: "text", text: `ERROR: Cannot run ${tool} before ${req}. Please run ${req} first.` }],
1118
1704
  isError: true,
1119
1705
  };
1120
1706
  }
1121
1707
  }
1122
- else {
1708
+ }
1709
+ // Mark this step as complete
1710
+ markStepComplete(String(datasetId), String(step));
1711
+ }
1712
+ switch (request.params.name) {
1713
+ case "vesper_web_find": {
1714
+ hydrateExternalKeys();
1715
+ const query = String(request.params.arguments?.query || "").trim();
1716
+ const limit = Number(request.params.arguments?.limit || 10);
1717
+ const sources = Array.isArray(request.params.arguments?.sources)
1718
+ ? (request.params.arguments?.sources).map(s => String(s).trim().toLowerCase()).filter(Boolean)
1719
+ : undefined;
1720
+ try {
1721
+ const result = await webCoreEngine.find({
1722
+ query,
1723
+ sources: sources,
1724
+ limit,
1725
+ arxiv_full_text: request.params.arguments?.arxiv_full_text === true,
1726
+ github_include_readme: request.params.arguments?.github_include_readme === true,
1727
+ });
1123
1728
  return {
1124
- content: [{ type: "text", text: `ERROR: Cannot run ${tool} before ${req}. Please run ${req} first.` }],
1729
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1730
+ };
1731
+ }
1732
+ catch (error) {
1733
+ return {
1734
+ content: [{ type: "text", text: `ERROR: web_find failed: ${error.message}` }],
1125
1735
  isError: true,
1126
1736
  };
1127
1737
  }
1128
1738
  }
1129
- }
1130
- // Mark this step as complete
1131
- markStepComplete(String(datasetId), String(step));
1132
- }
1133
- switch (request.params.name) {
1134
- case "vesper_search": {
1135
- const query = String(request.params.arguments?.query);
1136
- const limit = 5;
1137
- const safeOnly = true; // Enable safe filter by default
1138
- const enableJIT = request.params.arguments?.enable_jit === true;
1139
- if (!query) {
1140
- throw new McpError(ErrorCode.InvalidParams, "Query is required");
1739
+ case "vesper.fuse": {
1740
+ hydrateExternalKeys();
1741
+ const sources = Array.isArray(request.params.arguments?.sources)
1742
+ ? request.params.arguments?.sources
1743
+ : undefined;
1744
+ if (!sources || !Array.isArray(sources)) {
1745
+ return {
1746
+ content: [{ type: "text", text: "ERROR: vesper.fuse requires 'sources' array." }],
1747
+ isError: true,
1748
+ };
1749
+ }
1750
+ try {
1751
+ const mergeStrategyRaw = request.params.arguments?.merge_strategy
1752
+ ? String(request.params.arguments?.merge_strategy).toLowerCase()
1753
+ : undefined;
1754
+ const dedupRaw = request.params.arguments?.deduplication
1755
+ ? String(request.params.arguments?.deduplication).toLowerCase()
1756
+ : undefined;
1757
+ const merge_strategy = mergeStrategyRaw && ["union", "dedup"].includes(mergeStrategyRaw)
1758
+ ? mergeStrategyRaw
1759
+ : undefined;
1760
+ const deduplication = dedupRaw && ["semantic", "exact", "none"].includes(dedupRaw)
1761
+ ? dedupRaw
1762
+ : undefined;
1763
+ const result = await webFusionEngine.fuse({
1764
+ sources: sources.map((s) => ({
1765
+ type: String(s?.type || "").trim().toLowerCase(),
1766
+ query: String(s?.query || "").trim(),
1767
+ max_results: s?.max_results !== undefined ? Number(s.max_results) : undefined,
1768
+ min_stars: s?.min_stars !== undefined ? Number(s.min_stars) : undefined,
1769
+ bucket: s?.bucket !== undefined ? String(s.bucket) : undefined,
1770
+ path: s?.path !== undefined ? String(s.path) : undefined,
1771
+ region: s?.region !== undefined ? String(s.region) : undefined,
1772
+ credentials: s?.credentials ? {
1773
+ accessKeyId: s.credentials.accessKeyId !== undefined ? String(s.credentials.accessKeyId) : undefined,
1774
+ secretAccessKey: s.credentials.secretAccessKey !== undefined ? String(s.credentials.secretAccessKey) : undefined,
1775
+ sessionToken: s.credentials.sessionToken !== undefined ? String(s.credentials.sessionToken) : undefined,
1776
+ roleArn: s.credentials.roleArn !== undefined ? String(s.credentials.roleArn) : undefined,
1777
+ } : undefined,
1778
+ })),
1779
+ merge_strategy,
1780
+ deduplication,
1781
+ });
1782
+ return {
1783
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1784
+ };
1785
+ }
1786
+ catch (error) {
1787
+ return {
1788
+ content: [{ type: "text", text: `ERROR: vesper.fuse failed: ${error.message}` }],
1789
+ isError: true,
1790
+ };
1791
+ }
1141
1792
  }
1142
- const results = await searchEngine.search(query, { limit, safeOnly, enableJIT });
1143
- const formattedOutput = formatSearchResults(results);
1144
- return {
1145
- content: [
1146
- {
1147
- type: "text",
1148
- text: formattedOutput,
1149
- },
1150
- ],
1151
- };
1152
- }
1153
- case "discover_datasets": {
1154
- hydrateExternalKeys();
1155
- const query = String(request.params.arguments?.query || "").trim();
1156
- const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1157
- const limit = Number(request.params.arguments?.limit || 10);
1158
- if (!query) {
1159
- throw new McpError(ErrorCode.InvalidParams, "query is required");
1793
+ case "vesper.extract_web": {
1794
+ hydrateExternalKeys();
1795
+ const url = String(request.params.arguments?.url || "").trim();
1796
+ const mode = request.params.arguments?.mode
1797
+ ? String(request.params.arguments?.mode).trim().toLowerCase()
1798
+ : "auto";
1799
+ const schema = request.params.arguments?.schema && typeof request.params.arguments.schema === "object"
1800
+ ? request.params.arguments.schema
1801
+ : undefined;
1802
+ if (!url) {
1803
+ return {
1804
+ content: [{ type: "text", text: "ERROR: vesper.extract_web requires 'url'." }],
1805
+ isError: true,
1806
+ };
1807
+ }
1808
+ try {
1809
+ const out = await webExtractorEngine.extract({
1810
+ url,
1811
+ mode: mode,
1812
+ strict_schema: request.params.arguments?.strict_schema !== false,
1813
+ schema: schema,
1814
+ });
1815
+ return {
1816
+ content: [{ type: "text", text: JSON.stringify(out, null, 2) }],
1817
+ };
1818
+ }
1819
+ catch (error) {
1820
+ return {
1821
+ content: [{ type: "text", text: `ERROR: vesper.extract_web failed: ${error.message}` }],
1822
+ isError: true,
1823
+ };
1824
+ }
1160
1825
  }
1161
- try {
1162
- let results = [];
1163
- if (source === "kaggle") {
1164
- if (!dataIngestor.hasKaggleCredentials()) {
1826
+ case "unified_dataset_api": {
1827
+ hydrateExternalKeys();
1828
+ const operation = String(request.params.arguments?.operation || "").trim().toLowerCase();
1829
+ const source = String(request.params.arguments?.source || "auto").trim().toLowerCase();
1830
+ const includeUnavailable = request.params.arguments?.include_unavailable === true;
1831
+ const publicOnly = request.params.arguments?.public_only !== false;
1832
+ try {
1833
+ if (operation === "providers") {
1165
1834
  return {
1166
- content: [{ type: "text", text: `Kaggle requires API credentials. Use the configure_keys tool to set kaggle_username and kaggle_key, or try source='huggingface' which works without credentials.` }],
1167
- isError: true,
1835
+ content: [{ type: "text", text: JSON.stringify({ providers: unifiedDatasetGateway.getProviderStatuses(includeUnavailable) }, null, 2) }],
1168
1836
  };
1169
1837
  }
1170
- results = await kaggleSource.discover(query, limit);
1171
- }
1172
- else if (source === "openml") {
1173
- const openmlSource = new OpenMLSource();
1174
- results = await openmlSource.discover(query, limit);
1175
- }
1176
- else if (source === "dataworld") {
1177
- if (!hasDataWorldToken()) {
1838
+ if (operation === "discover") {
1839
+ const query = String(request.params.arguments?.query || "").trim();
1840
+ if (!query) {
1841
+ throw new McpError(ErrorCode.InvalidParams, "query is required for operation='discover'");
1842
+ }
1843
+ const result = await unifiedDatasetGateway.discover({
1844
+ query,
1845
+ source,
1846
+ limit: Number(request.params.arguments?.limit || 10),
1847
+ publicOnly,
1848
+ });
1178
1849
  return {
1179
- content: [{ type: "text", text: "data.world requires API token. Run 'vespermcp config keys' and set dataworld_token." }],
1180
- isError: true,
1850
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1181
1851
  };
1182
1852
  }
1183
- const dataworldSource = new DataWorldSource();
1184
- results = await dataworldSource.discover(query, limit);
1185
- }
1186
- else {
1187
- const hf = new HuggingFaceScraper();
1188
- results = await hf.scrape(Math.max(1, limit), true, query);
1189
- }
1190
- const recipeScript = path.join(dataRoot, "python", "asset_downloader_engine.py");
1191
- for (const ds of results.slice(0, limit)) {
1192
- const info = {
1193
- dataset_id: ds.id,
1194
- id: ds.id,
1195
- source: ds.source,
1196
- repo_id: ds.id,
1197
- total_images: ds.total_examples || 0,
1198
- image_column: undefined,
1199
- recipes_dir: path.join(dataRoot, "recipes"),
1200
- };
1201
- try {
1202
- await runPythonJson(recipeScript, ["build_recipe", JSON.stringify(info)]);
1853
+ if (operation === "download") {
1854
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1855
+ if (!datasetId) {
1856
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='download'");
1857
+ }
1858
+ const requestedTargetDir = request.params.arguments?.target_dir
1859
+ ? String(request.params.arguments.target_dir).trim()
1860
+ : request.params.arguments?.output_dir
1861
+ ? String(request.params.arguments.output_dir).trim()
1862
+ : "";
1863
+ const targetDir = requestedTargetDir || process.cwd();
1864
+ try {
1865
+ await ensurePythonModules([{ module: "datasets", packageName: "datasets" }]);
1866
+ }
1867
+ catch {
1868
+ // best effort; non-HF providers do not require this
1869
+ }
1870
+ const result = await unifiedDatasetGateway.download({
1871
+ datasetId,
1872
+ source,
1873
+ targetDir,
1874
+ });
1875
+ try {
1876
+ upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
1877
+ }
1878
+ catch (e) {
1879
+ console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
1880
+ }
1881
+ return {
1882
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1883
+ };
1203
1884
  }
1204
- catch {
1205
- // best-effort recipe generation; ignore discovery-time recipe failures
1885
+ if (operation === "info") {
1886
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1887
+ if (!datasetId) {
1888
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='info'");
1889
+ }
1890
+ const result = await unifiedDatasetGateway.info({
1891
+ datasetId,
1892
+ source,
1893
+ publicOnly,
1894
+ });
1895
+ return {
1896
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1897
+ };
1206
1898
  }
1899
+ throw new McpError(ErrorCode.InvalidParams, `Unsupported operation: ${operation}`);
1900
+ }
1901
+ catch (error) {
1902
+ return {
1903
+ content: [{ type: "text", text: `ERROR: unified gateway failed: ${error.message}` }],
1904
+ isError: true,
1905
+ };
1207
1906
  }
1208
- const formattedOutput = formatSearchResults(results.slice(0, limit));
1209
- return {
1210
- content: [{ type: "text", text: formattedOutput }]
1211
- };
1212
- }
1213
- catch (error) {
1214
- return {
1215
- content: [{ type: "text", text: `ERROR: discover failed: ${error.message}` }],
1216
- isError: true,
1217
- };
1218
- }
1219
- }
1220
- case "download_dataset": {
1221
- hydrateExternalKeys();
1222
- const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1223
- const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1224
- if (!datasetId) {
1225
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1226
- }
1227
- if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
1228
- return {
1229
- content: [{ type: "text", text: `Kaggle requires API credentials. Use the configure_keys tool to set kaggle_username and kaggle_key, or switch to source='huggingface' which works without credentials.` }],
1230
- isError: true,
1231
- };
1232
1907
  }
1233
- if (source === "dataworld" && !hasDataWorldToken()) {
1908
+ case "vesper_search": {
1909
+ const query = String(request.params.arguments?.query);
1910
+ const limit = 5;
1911
+ const safeOnly = true; // Enable safe filter by default
1912
+ const enableJIT = request.params.arguments?.enable_jit === true;
1913
+ if (!query) {
1914
+ throw new McpError(ErrorCode.InvalidParams, "Query is required");
1915
+ }
1916
+ const results = await searchEngine.search(query, { limit, safeOnly, enableJIT });
1917
+ const formattedOutput = formatSearchResults(results);
1234
1918
  return {
1235
- content: [{ type: "text", text: "data.world requires API token. Use the configure_keys tool to set dataworld_token, or switch to source='huggingface' which works without credentials." }],
1236
- isError: true,
1919
+ content: [
1920
+ {
1921
+ type: "text",
1922
+ text: formattedOutput,
1923
+ },
1924
+ ],
1237
1925
  };
1238
1926
  }
1239
- // Pre-install Python datasets library for HuggingFace fallback
1240
- if (source === "huggingface") {
1927
+ case "discover_datasets": {
1928
+ hydrateExternalKeys();
1929
+ const query = String(request.params.arguments?.query || "").trim();
1930
+ const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1931
+ const limit = Number(request.params.arguments?.limit || 10);
1932
+ if (!query) {
1933
+ throw new McpError(ErrorCode.InvalidParams, "query is required");
1934
+ }
1241
1935
  try {
1242
- await ensurePythonModules([
1243
- { module: "datasets", packageName: "datasets" },
1244
- ]);
1936
+ const gatewayResult = await unifiedDatasetGateway.discover({
1937
+ query,
1938
+ source,
1939
+ limit,
1940
+ publicOnly: false,
1941
+ });
1942
+ const results = gatewayResult.results;
1943
+ const recipeScript = path.join(dataRoot, "python", "asset_downloader_engine.py");
1944
+ for (const ds of results.slice(0, limit)) {
1945
+ const info = {
1946
+ dataset_id: ds.id,
1947
+ id: ds.id,
1948
+ source: ds.source,
1949
+ repo_id: ds.id,
1950
+ total_images: ds.total_examples || 0,
1951
+ image_column: undefined,
1952
+ recipes_dir: path.join(dataRoot, "recipes"),
1953
+ };
1954
+ try {
1955
+ await runPythonJson(recipeScript, ["build_recipe", JSON.stringify(info)]);
1956
+ }
1957
+ catch {
1958
+ // best-effort recipe generation; ignore discovery-time recipe failures
1959
+ }
1960
+ }
1961
+ const formattedOutput = formatSearchResults(results.slice(0, limit));
1962
+ const noteBlock = gatewayResult.notes.length > 0
1963
+ ? `\nGateway notes:\n- ${gatewayResult.notes.join("\n- ")}`
1964
+ : "";
1965
+ return {
1966
+ content: [{ type: "text", text: `${formattedOutput}${noteBlock}` }]
1967
+ };
1245
1968
  }
1246
- catch {
1247
- // Continue - direct download may still work
1969
+ catch (error) {
1970
+ return {
1971
+ content: [{ type: "text", text: `ERROR: discover failed: ${error.message}` }],
1972
+ isError: true,
1973
+ };
1248
1974
  }
1249
1975
  }
1250
- try {
1251
- const localPath = await dataIngestor.ensureData(datasetId, source, () => undefined);
1976
+ case "download_dataset": {
1977
+ hydrateExternalKeys();
1978
+ const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1979
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1980
+ const requestedTargetDir = request.params.arguments?.target_dir
1981
+ ? String(request.params.arguments.target_dir).trim()
1982
+ : request.params.arguments?.output_dir
1983
+ ? String(request.params.arguments.output_dir).trim()
1984
+ : "";
1985
+ const targetDir = requestedTargetDir || process.cwd();
1986
+ if (!datasetId) {
1987
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1988
+ }
1989
+ // Pre-install Python datasets library for HuggingFace fallback
1990
+ if (source === "huggingface") {
1991
+ try {
1992
+ await ensurePythonModules([
1993
+ { module: "datasets", packageName: "datasets" },
1994
+ ]);
1995
+ }
1996
+ catch {
1997
+ // Continue - direct download may still work
1998
+ }
1999
+ }
1252
2000
  try {
1253
- upsertRegistry(datasetId, localPath, "completed");
2001
+ const result = await unifiedDatasetGateway.download({
2002
+ datasetId,
2003
+ source,
2004
+ targetDir,
2005
+ });
2006
+ try {
2007
+ upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
2008
+ }
2009
+ catch (e) {
2010
+ console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
2011
+ }
2012
+ const noteBlock = result.notes.length > 0 ? `\nNotes:\n- ${result.notes.join("\n- ")}` : "";
2013
+ return {
2014
+ content: [{ type: "text", text: `Download complete: ${result.copied_to || result.local_path}${noteBlock}` }]
2015
+ };
1254
2016
  }
1255
- catch (e) {
1256
- console.error(`[Registry] Failed to write registry for ${datasetId}: ${e?.message || e}`);
2017
+ catch (error) {
2018
+ return {
2019
+ content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
2020
+ isError: true,
2021
+ };
1257
2022
  }
1258
- return {
1259
- content: [{ type: "text", text: `Download complete: ${localPath}` }]
1260
- };
1261
2023
  }
1262
- catch (error) {
1263
- return {
1264
- content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
1265
- isError: true,
2024
+ case "vesper_download_assets": {
2025
+ hydrateExternalKeys();
2026
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
2027
+ const source = String(request.params.arguments?.source || "").trim().toLowerCase();
2028
+ // Auto-infer repo_id from dataset_id if not provided (common for HuggingFace)
2029
+ const repoId = request.params.arguments?.repo_id
2030
+ ? String(request.params.arguments.repo_id)
2031
+ : (source === "huggingface" && datasetId.includes("/") ? datasetId : undefined);
2032
+ const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
2033
+ const urls = Array.isArray(request.params.arguments?.urls)
2034
+ ? (request.params.arguments?.urls).map(v => String(v))
2035
+ : undefined;
2036
+ const outputFormat = String(request.params.arguments?.output_format || "webdataset");
2037
+ const requestedOutputDir = request.params.arguments?.target_dir
2038
+ ? String(request.params.arguments.target_dir).trim()
2039
+ : request.params.arguments?.output_dir
2040
+ ? String(request.params.arguments.output_dir).trim()
2041
+ : undefined;
2042
+ const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
2043
+ const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
2044
+ const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
2045
+ if (!datasetId || !source) {
2046
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
2047
+ }
2048
+ if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
2049
+ return {
2050
+ content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
2051
+ isError: true,
2052
+ };
2053
+ }
2054
+ const requiredModules = [
2055
+ { module: "aiohttp", packageName: "aiohttp" },
2056
+ ];
2057
+ if (source === "url") {
2058
+ requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
2059
+ }
2060
+ if (source === "huggingface") {
2061
+ requiredModules.push({ module: "datasets", packageName: "datasets" });
2062
+ requiredModules.push({ module: "PIL", packageName: "Pillow" });
2063
+ }
2064
+ if (source === "kaggle") {
2065
+ requiredModules.push({ module: "kaggle", packageName: "kaggle" });
2066
+ }
2067
+ try {
2068
+ await ensurePythonModules(requiredModules);
2069
+ }
2070
+ catch (error) {
2071
+ return {
2072
+ content: [{ type: "text", text: `ERROR: Python dependency setup failed: ${error.message}` }],
2073
+ isError: true,
2074
+ };
2075
+ }
2076
+ const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
2077
+ const payload = {
2078
+ dataset_id: datasetId,
2079
+ source,
2080
+ repo_id: repoId,
2081
+ kaggle_ref: kaggleRef,
2082
+ urls,
2083
+ output_format: outputFormat,
2084
+ output_dir: requestedOutputDir,
2085
+ max_items: maxItems,
2086
+ workers,
2087
+ image_column: imageColumn,
2088
+ output_root: requestedOutputDir || process.cwd(),
2089
+ recipes_dir: path.join(dataRoot, "recipes"),
1266
2090
  };
2091
+ try {
2092
+ const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
2093
+ if (!result?.ok) {
2094
+ const errMsg = result?.error || "Unknown error";
2095
+ // Enhance error messages for common failures
2096
+ let hint = "";
2097
+ if (errMsg.includes("No image column")) {
2098
+ hint = "\n\nHint: Specify image_column parameter with the column name containing images/URLs.";
2099
+ }
2100
+ else if (errMsg.includes("Authentication") || errMsg.includes("401") || errMsg.includes("403")) {
2101
+ hint = "\n\nHint: Use configure_keys tool to set HF_TOKEN for gated/private datasets.";
2102
+ }
2103
+ return {
2104
+ content: [{ type: "text", text: `ERROR: asset download failed: ${errMsg}${hint}` }],
2105
+ isError: true,
2106
+ };
2107
+ }
2108
+ return {
2109
+ content: [{ type: "text", text: JSON.stringify(result.result, null, 2) }],
2110
+ };
2111
+ }
2112
+ catch (error) {
2113
+ return {
2114
+ content: [{ type: "text", text: `ERROR: asset downloader execution failed: ${error.message}` }],
2115
+ isError: true,
2116
+ };
2117
+ }
1267
2118
  }
1268
- }
1269
- case "vesper_download_assets": {
1270
- hydrateExternalKeys();
1271
- const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1272
- const source = String(request.params.arguments?.source || "").trim().toLowerCase();
1273
- // Auto-infer repo_id from dataset_id if not provided (common for HuggingFace)
1274
- const repoId = request.params.arguments?.repo_id
1275
- ? String(request.params.arguments.repo_id)
1276
- : (source === "huggingface" && datasetId.includes("/") ? datasetId : undefined);
1277
- const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
1278
- const urls = Array.isArray(request.params.arguments?.urls)
1279
- ? (request.params.arguments?.urls).map(v => String(v))
1280
- : undefined;
1281
- const outputFormat = String(request.params.arguments?.output_format || "webdataset");
1282
- const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
1283
- const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
1284
- const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
1285
- if (!datasetId || !source) {
1286
- throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
1287
- }
1288
- if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
2119
+ case "configure_kaggle": {
2120
+ const username = String(request.params.arguments?.username || "").trim();
2121
+ const key = String(request.params.arguments?.key || "").trim();
2122
+ if (!username || !key) {
2123
+ throw new McpError(ErrorCode.InvalidParams, "username and key are required");
2124
+ }
2125
+ const r1 = secureKeys.set("kaggle_username", username);
2126
+ const r2 = secureKeys.set("kaggle_key", key);
2127
+ process.env.KAGGLE_USERNAME = username;
2128
+ process.env.KAGGLE_KEY = key;
1289
2129
  return {
1290
- content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
1291
- isError: true,
2130
+ content: [{ type: "text", text: `Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
1292
2131
  };
1293
2132
  }
1294
- const requiredModules = [
1295
- { module: "aiohttp", packageName: "aiohttp" },
1296
- ];
1297
- if (source === "url") {
1298
- requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
1299
- }
1300
- if (source === "huggingface") {
1301
- requiredModules.push({ module: "datasets", packageName: "datasets" });
1302
- requiredModules.push({ module: "PIL", packageName: "Pillow" });
1303
- }
1304
- if (source === "kaggle") {
1305
- requiredModules.push({ module: "kaggle", packageName: "kaggle" });
1306
- }
1307
- try {
1308
- await ensurePythonModules(requiredModules);
1309
- }
1310
- catch (error) {
2133
+ case "configure_keys": {
2134
+ const hfToken = String(request.params.arguments?.hf_token || "").trim();
2135
+ const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
2136
+ const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
2137
+ const dataworldToken = String(request.params.arguments?.dataworld_token || "").trim();
2138
+ const saved = [];
2139
+ const methods = [];
2140
+ if (hfToken) {
2141
+ const r = secureKeys.set("hf_token", hfToken);
2142
+ if (r.ok) {
2143
+ process.env.HF_TOKEN = hfToken;
2144
+ saved.push("HF token");
2145
+ if (r.method)
2146
+ methods.push(r.method);
2147
+ }
2148
+ }
2149
+ if (kaggleUsername) {
2150
+ const r = secureKeys.set("kaggle_username", kaggleUsername);
2151
+ if (r.ok) {
2152
+ process.env.KAGGLE_USERNAME = kaggleUsername;
2153
+ saved.push("Kaggle username");
2154
+ if (r.method)
2155
+ methods.push(r.method);
2156
+ }
2157
+ }
2158
+ if (kaggleKey) {
2159
+ const r = secureKeys.set("kaggle_key", kaggleKey);
2160
+ if (r.ok) {
2161
+ process.env.KAGGLE_KEY = kaggleKey;
2162
+ saved.push("Kaggle key");
2163
+ if (r.method)
2164
+ methods.push(r.method);
2165
+ }
2166
+ }
2167
+ if (dataworldToken) {
2168
+ const r = secureKeys.set("dataworld_token", dataworldToken);
2169
+ if (r.ok) {
2170
+ process.env.DW_AUTH_TOKEN = dataworldToken;
2171
+ saved.push("data.world token");
2172
+ if (r.method)
2173
+ methods.push(r.method);
2174
+ }
2175
+ }
2176
+ if (saved.length === 0) {
2177
+ return {
2178
+ content: [{ type: "text", text: "No keys provided. Core Vesper tools continue to work without API keys." }]
2179
+ };
2180
+ }
1311
2181
  return {
1312
- content: [{ type: "text", text: `ERROR: Python dependency setup failed: ${error.message}` }],
1313
- isError: true,
2182
+ content: [{ type: "text", text: `Key saved securely. Updated: ${saved.join(", ")}.` }]
1314
2183
  };
1315
2184
  }
1316
- const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
1317
- const payload = {
1318
- dataset_id: datasetId,
1319
- source,
1320
- repo_id: repoId,
1321
- kaggle_ref: kaggleRef,
1322
- urls,
1323
- output_format: outputFormat,
1324
- max_items: maxItems,
1325
- workers,
1326
- image_column: imageColumn,
1327
- output_root: path.join(dataRoot, "data", "assets"),
1328
- recipes_dir: path.join(dataRoot, "recipes"),
1329
- };
1330
- try {
1331
- const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
1332
- if (!result?.ok) {
1333
- const errMsg = result?.error || "Unknown error";
1334
- // Enhance error messages for common failures
1335
- let hint = "";
1336
- if (errMsg.includes("No image column")) {
1337
- hint = "\n\nHint: Specify image_column parameter with the column name containing images/URLs.";
1338
- }
1339
- else if (errMsg.includes("Authentication") || errMsg.includes("401") || errMsg.includes("403")) {
1340
- hint = "\n\nHint: Use configure_keys tool to set HF_TOKEN for gated/private datasets.";
2185
+ case "get_dataset_info": {
2186
+ const datasetId = String(request.params.arguments?.dataset_id);
2187
+ if (!datasetId) {
2188
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
2189
+ }
2190
+ const dataset = metadataStore.getDataset(datasetId);
2191
+ if (!dataset) {
2192
+ // Fallback: check the registry for local path info
2193
+ const regEntry = getRegistryEntry(datasetId);
2194
+ const regPath = regEntry?.local_path || regEntry?.path;
2195
+ if (regEntry) {
2196
+ const exists = regPath && fs.existsSync(regPath);
2197
+ return {
2198
+ content: [{ type: "text", text: `**${datasetId}** (from registry)\n- Local path: ${regPath || "unknown"}\n- Status: ${regEntry.status || "unknown"}${exists ? "" : " (file missing)"}\n\nNote: Full metadata not available in metadata store. Use prepare_dataset to get full details.` }],
2199
+ };
1341
2200
  }
1342
2201
  return {
1343
- content: [{ type: "text", text: `ERROR: asset download failed: ${errMsg}${hint}` }],
2202
+ content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}. Use vesper_list_datasets to see available datasets, or prepare_dataset to add new ones.` }],
1344
2203
  isError: true,
1345
2204
  };
1346
2205
  }
2206
+ // Enrich: if total_examples is 0-ish, try the HF datasets-server /size API
2207
+ if ((!dataset.total_examples || dataset.total_examples === 0) && dataset.source === "huggingface") {
2208
+ try {
2209
+ const sizeResp = await fetch(`https://datasets-server.huggingface.co/size?dataset=${encodeURIComponent(dataset.id)}`);
2210
+ if (sizeResp.ok) {
2211
+ const sizeData = await sizeResp.json();
2212
+ const numRows = sizeData?.size?.dataset?.num_rows;
2213
+ if (numRows && numRows > 0) {
2214
+ dataset.total_examples = numRows;
2215
+ // Also backfill splits
2216
+ if (sizeData?.size?.splits && Array.isArray(sizeData.size.splits)) {
2217
+ dataset.splits = sizeData.size.splits.map((s) => ({
2218
+ name: s.split,
2219
+ num_examples: s.num_rows || 0,
2220
+ size_bytes: s.num_bytes_parquet_files || 0,
2221
+ }));
2222
+ dataset.has_train_split = dataset.splits.some((s) => s.name === "train");
2223
+ dataset.has_test_split = dataset.splits.some((s) => s.name === "test");
2224
+ dataset.has_validation_split = dataset.splits.some((s) => s.name === "validation" || s.name === "val");
2225
+ }
2226
+ // Persist enriched metadata
2227
+ metadataStore.saveDataset(dataset);
2228
+ }
2229
+ }
2230
+ }
2231
+ catch {
2232
+ // Enrichment is best-effort; continue with whatever we have
2233
+ }
2234
+ }
2235
+ const formattedOutput = formatDatasetInfo(dataset);
2236
+ return { content: [{ type: "text", text: formattedOutput }] };
2237
+ }
2238
+ case "analyze_quality": {
2239
+ const datasetId = String(request.params.arguments?.dataset_id);
2240
+ const safeId = toSafeDatasetPathFragment(datasetId);
2241
+ const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
2242
+ const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
2243
+ let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
2244
+ // Demo Fallback for easy testing
2245
+ if (datasetId === "demo" || !fs.existsSync(filePath)) {
2246
+ const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
2247
+ const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
2248
+ if (fs.existsSync(demoParquetPath)) {
2249
+ filePath = demoParquetPath;
2250
+ }
2251
+ else if (fs.existsSync(demoCsvPath)) {
2252
+ filePath = demoCsvPath;
2253
+ }
2254
+ else if (datasetId !== "demo") {
2255
+ return {
2256
+ content: [{ type: "text", text: `ERROR: Local data file for ${datasetId} not found. Try running prepare_dataset first, or use 'demo' as the dataset_id.` }],
2257
+ isError: true
2258
+ };
2259
+ }
2260
+ }
2261
+ const report = await qualityAnalyzer.analyze(filePath);
1347
2262
  return {
1348
- content: [{ type: "text", text: JSON.stringify(result.result, null, 2) }],
2263
+ content: [{ type: "text", text: JSON.stringify(report, null, 2) }]
1349
2264
  };
1350
2265
  }
1351
- catch (error) {
2266
+ case "preview_cleaning": {
2267
+ const datasetId = String(request.params.arguments?.dataset_id);
2268
+ const safeId = toSafeDatasetPathFragment(datasetId);
2269
+ const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
2270
+ const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
2271
+ let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
2272
+ if (datasetId === "demo" || !fs.existsSync(filePath)) {
2273
+ const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
2274
+ const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
2275
+ if (fs.existsSync(demoParquetPath)) {
2276
+ filePath = demoParquetPath;
2277
+ }
2278
+ else if (fs.existsSync(demoCsvPath)) {
2279
+ filePath = demoCsvPath;
2280
+ }
2281
+ else {
2282
+ throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}. Please run prepare_dataset first.`);
2283
+ }
2284
+ }
2285
+ const report = await qualityAnalyzer.analyze(filePath);
2286
+ // Phase 1: Target Detection
2287
+ // We use the same TargetDetector instance inside CleaningPlanner now?
2288
+ // Actually, we instantiated it inside CleaningPlanner, so we just need to pass the file path
2289
+ // OR let the planner handle it if we update its signature to accept filePath.
2290
+ // Let's check `CleaningPlanner.generatePlan` signature again.
2291
+ // We updated it to accept `targetInfo`.
2292
+ // So we need to run detection HERE and pass it.
2293
+ // But `TargetDetector` is not exposed in `index.ts` scope yet.
2294
+ // Let's create a global instance or use the one inside planner if exposed (it's private).
2295
+ // Better approach: Instantiate TargetDetector here in index.ts for the tool content.
2296
+ // Quick fix: Instantiate local detector or make global.
2297
+ // I'll make a global `targetDetector` constant in index.ts
2298
+ // But wait, I updated `CleaningPlanner` to instantiate its own detector.
2299
+ // Does `CleaningPlanner` use it? No, I commented out the logic because it needed `filePath`.
2300
+ // RETRY STRATEGY:
2301
+ // 1. Instantiate `targetDetector` in `index.ts`.
2302
+ // 2. Run `detectTarget(filePath)`.
2303
+ // 3. Pass result to `cleaningPlanner.generatePlan(..., targetInfo)`.
2304
+ // I need to add `const targetDetector = new TargetDetector(__dirname);` to imports/init section first.
2305
+ // But since I'm in this tool, I can't look back.
2306
+ // I will assume I can add it, or just do it inside the case for now.
2307
+ // To do it properly, I should have added `targetDetector` to the global scope in previous step.
2308
+ // Let's do that in a separate step if needed.
2309
+ // For now, I'll instantiate it here.
2310
+ const { TargetDetector } = await import("./preparation/target-detector.js");
2311
+ const detector = new TargetDetector(__dirname);
2312
+ const targetResult = await detector.detectTarget(filePath);
2313
+ const targetInfo = targetResult.target_column ? {
2314
+ target: targetResult.target_column,
2315
+ confidence: targetResult.confidence
2316
+ } : undefined;
2317
+ const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
2318
+ let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
2319
+ if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
2320
+ explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
2321
+ explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
2322
+ }
2323
+ explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
2324
+ if (plan.operations.length === 0) {
2325
+ explanation += "No cleaning operations required.";
2326
+ }
2327
+ else {
2328
+ plan.operations.forEach((op, i) => {
2329
+ explanation += `${i + 1}. **${op.type}**: ${op.reason}\n`;
2330
+ });
2331
+ }
1352
2332
  return {
1353
- content: [{ type: "text", text: `ERROR: asset downloader execution failed: ${error.message}` }],
1354
- isError: true,
2333
+ content: [{ type: "text", text: explanation }]
1355
2334
  };
1356
2335
  }
1357
- }
1358
- case "configure_kaggle": {
1359
- const username = String(request.params.arguments?.username || "").trim();
1360
- const key = String(request.params.arguments?.key || "").trim();
1361
- if (!username || !key) {
1362
- throw new McpError(ErrorCode.InvalidParams, "username and key are required");
1363
- }
1364
- const r1 = secureKeys.set("kaggle_username", username);
1365
- const r2 = secureKeys.set("kaggle_key", key);
1366
- process.env.KAGGLE_USERNAME = username;
1367
- process.env.KAGGLE_KEY = key;
1368
- return {
1369
- content: [{ type: "text", text: `Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
1370
- };
1371
- }
1372
- case "configure_keys": {
1373
- const hfToken = String(request.params.arguments?.hf_token || "").trim();
1374
- const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
1375
- const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
1376
- const dataworldToken = String(request.params.arguments?.dataworld_token || "").trim();
1377
- const saved = [];
1378
- const methods = [];
1379
- if (hfToken) {
1380
- const r = secureKeys.set("hf_token", hfToken);
1381
- if (r.ok) {
1382
- process.env.HF_TOKEN = hfToken;
1383
- saved.push("HF token");
1384
- if (r.method)
1385
- methods.push(r.method);
2336
+ case "custom_clean": {
2337
+ const datasetId = String(request.params.arguments?.dataset_id);
2338
+ const ops = request.params.arguments?.operations;
2339
+ if (!datasetId || datasetId === "undefined") {
2340
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1386
2341
  }
1387
- }
1388
- if (kaggleUsername) {
1389
- const r = secureKeys.set("kaggle_username", kaggleUsername);
1390
- if (r.ok) {
1391
- process.env.KAGGLE_USERNAME = kaggleUsername;
1392
- saved.push("Kaggle username");
1393
- if (r.method)
1394
- methods.push(r.method);
2342
+ if (!ops || !Array.isArray(ops) || ops.length === 0) {
2343
+ throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
1395
2344
  }
1396
- }
1397
- if (kaggleKey) {
1398
- const r = secureKeys.set("kaggle_key", kaggleKey);
1399
- if (r.ok) {
1400
- process.env.KAGGLE_KEY = kaggleKey;
1401
- saved.push("Kaggle key");
1402
- if (r.method)
1403
- methods.push(r.method);
2345
+ // Pre-check: verify dataset file exists before starting the job
2346
+ const cleanRegEntry = getRegistryEntry(datasetId);
2347
+ const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
2348
+ const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
2349
+ const cleanSafeId = toSafeDatasetPathFragment(datasetId);
2350
+ const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
2351
+ (cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
2352
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
2353
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
2354
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
2355
+ fs.existsSync(datasetId);
2356
+ if (!cleanDataExists) {
2357
+ return {
2358
+ content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
2359
+ isError: true,
2360
+ };
1404
2361
  }
2362
+ const job = jobManager.createJob("clean", 0, { datasetId, ops });
2363
+ return {
2364
+ content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
2365
+ };
1405
2366
  }
1406
- if (dataworldToken) {
1407
- const r = secureKeys.set("dataworld_token", dataworldToken);
1408
- if (r.ok) {
1409
- process.env.DW_AUTH_TOKEN = dataworldToken;
1410
- saved.push("data.world token");
1411
- if (r.method)
1412
- methods.push(r.method);
2367
+ case "prepare_dataset": {
2368
+ hydrateExternalKeys();
2369
+ const query = String(request.params.arguments?.query);
2370
+ const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
2371
+ const downloadImages = request.params.arguments?.download_images === true;
2372
+ const requestedOutputDir = request.params.arguments?.target_dir
2373
+ ? String(request.params.arguments.target_dir).trim()
2374
+ : request.params.arguments?.output_dir
2375
+ ? String(request.params.arguments.output_dir).trim()
2376
+ : "";
2377
+ const outputDir = requestedOutputDir || process.cwd();
2378
+ if (!query || query === "undefined") {
2379
+ throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
1413
2380
  }
1414
- }
1415
- if (saved.length === 0) {
2381
+ const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages, outputDir });
1416
2382
  return {
1417
- content: [{ type: "text", text: "No keys provided. Core Vesper tools continue to work without API keys." }]
2383
+ content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
1418
2384
  };
1419
2385
  }
1420
- return {
1421
- content: [{ type: "text", text: `Key saved securely. Updated: ${saved.join(", ")}.` }]
1422
- };
1423
- }
1424
- case "get_dataset_info": {
1425
- const datasetId = String(request.params.arguments?.dataset_id);
1426
- if (!datasetId) {
1427
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1428
- }
1429
- const dataset = metadataStore.getDataset(datasetId);
1430
- if (!dataset) {
2386
+ case "compare_datasets": {
2387
+ const datasetIds = request.params.arguments?.dataset_ids;
2388
+ const datasets = datasetIds.map(id => metadataStore.getDataset(id)).filter(d => !!d);
2389
+ let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
2390
+ comparison += "| :--- | " + datasets.map(() => " :---: ").join("|") + " |\n";
2391
+ comparison += "| Quality Score | " + datasets.map(d => `${d.quality_score}/100`).join(" | ") + " |\n";
2392
+ comparison += "| License | " + datasets.map(d => d.license.category).join(" | ") + " |\n";
2393
+ comparison += "| Downloads | " + datasets.map(d => d.downloads).join(" | ") + " |\n";
2394
+ comparison += "| Domain | " + datasets.map(d => d.domain).join(" | ") + " |\n";
1431
2395
  return {
1432
- content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}` }],
1433
- isError: true,
2396
+ content: [{ type: "text", text: comparison }]
1434
2397
  };
1435
2398
  }
1436
- const formattedOutput = formatDatasetInfo(dataset);
1437
- return { content: [{ type: "text", text: formattedOutput }] };
1438
- }
1439
- case "analyze_quality": {
1440
- const datasetId = String(request.params.arguments?.dataset_id);
1441
- const safeId = datasetId.replace(/\//g, "_");
1442
- const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
1443
- const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
1444
- let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
1445
- // Demo Fallback for easy testing
1446
- if (datasetId === "demo" || !fs.existsSync(filePath)) {
1447
- const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
1448
- const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
1449
- if (fs.existsSync(demoParquetPath)) {
1450
- filePath = demoParquetPath;
1451
- }
1452
- else if (fs.existsSync(demoCsvPath)) {
1453
- filePath = demoCsvPath;
1454
- }
1455
- else if (datasetId !== "demo") {
2399
+ case "check_job_status": {
2400
+ const jobId = String(request.params.arguments?.job_id);
2401
+ const job = metadataStore.getJob(jobId);
2402
+ if (!job) {
2403
+ throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
2404
+ }
2405
+ const activeStatuses = new Set(["pending", "queued", "running", "retrying"]);
2406
+ const now = Date.now();
2407
+ const last = jobStatusLastPoll[jobId] || 0;
2408
+ const minPollMs = 3000;
2409
+ if (activeStatuses.has(job.status) && (now - last) < minPollMs) {
2410
+ const waitMs = minPollMs - (now - last);
1456
2411
  return {
1457
- content: [{ type: "text", text: `ERROR: Local data file for ${datasetId} not found. Try running prepare_dataset first, or use 'demo' as the dataset_id.` }],
1458
- isError: true
2412
+ content: [{ type: "text", text: `No significant status change yet. Please poll again in ~${Math.ceil(waitMs / 1000)}s.` }]
1459
2413
  };
1460
2414
  }
2415
+ jobStatusLastPoll[jobId] = now;
2416
+ return {
2417
+ content: [{ type: "text", text: formatJobStatus(job) }]
2418
+ };
1461
2419
  }
1462
- const report = await qualityAnalyzer.analyze(filePath);
1463
- return {
1464
- content: [{ type: "text", text: JSON.stringify(report, null, 2) }]
1465
- };
1466
- }
1467
- case "preview_cleaning": {
1468
- const datasetId = String(request.params.arguments?.dataset_id);
1469
- const safeId = datasetId.replace(/\//g, "_");
1470
- const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
1471
- const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
1472
- let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
1473
- if (datasetId === "demo" || !fs.existsSync(filePath)) {
1474
- const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
1475
- const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
1476
- if (fs.existsSync(demoParquetPath)) {
1477
- filePath = demoParquetPath;
1478
- }
1479
- else if (fs.existsSync(demoCsvPath)) {
1480
- filePath = demoCsvPath;
2420
+ case "export_dataset": {
2421
+ const datasetId = String(request.params.arguments?.dataset_id);
2422
+ const isDirectLocalInput = isDirectLocalDatasetReference(datasetId);
2423
+ const requestedTargetDir = request.params.arguments?.target_dir
2424
+ ? String(request.params.arguments?.target_dir).trim()
2425
+ : request.params.arguments?.output_dir
2426
+ ? String(request.params.arguments?.output_dir).trim()
2427
+ : "";
2428
+ const targetDir = path.resolve(requestedTargetDir || process.cwd());
2429
+ const requestedFormat = String(request.params.arguments?.format || "feather");
2430
+ const fastMode = request.params.arguments?.fast === true;
2431
+ const preview = request.params.arguments?.preview === true;
2432
+ const sampleRows = request.params.arguments?.sample_rows ? Number(request.params.arguments.sample_rows) : undefined;
2433
+ const columns = request.params.arguments?.columns;
2434
+ const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
2435
+ // Use Metadata or Registry to find the actual local file
2436
+ const preferredLookupDirs = [targetDir, process.cwd()];
2437
+ let sourcePath = resolveDatasetLocalPath(datasetId, preferredLookupDirs);
2438
+ if (!sourcePath) {
2439
+ console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
2440
+ // Start a prepare job for this dataset id (acts like calling prepare_dataset)
2441
+ try {
2442
+ jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false, outputDir: process.cwd() });
2443
+ }
2444
+ catch (e) {
2445
+ console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
2446
+ }
2447
+ // Poll for download status or registry entry until local_path appears or timeout
2448
+ const wait = (ms) => new Promise(res => setTimeout(res, ms));
2449
+ const maxWait = 120_000; // 120s
2450
+ const interval = 2000;
2451
+ let waited = 0;
2452
+ while (waited < maxWait) {
2453
+ const resolved = resolveDatasetLocalPath(datasetId, preferredLookupDirs);
2454
+ if (resolved) {
2455
+ sourcePath = resolved;
2456
+ console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
2457
+ break;
2458
+ }
2459
+ await wait(interval);
2460
+ waited += interval;
2461
+ }
2462
+ // If still no sourcePath, return helpful error listing prepared datasets
2463
+ if (!sourcePath) {
2464
+ const entries = readRegistry();
2465
+ const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id || e.id || "unknown"}: ${e.local_path || e.path || "unknown"}`).slice(0, 10).join("\n") + (entries.length > 10 ? "\n...and " + (entries.length - 10) + " more" : "");
2466
+ return {
2467
+ content: [{ type: "text", text: `ERROR: No local data found for ${datasetId} after attempting prepare. Check credentials and try running prepare_dataset manually. Prepared datasets:\n${listText}` }],
2468
+ isError: true
2469
+ };
2470
+ }
2471
+ }
2472
+ sourcePath = ensureExportableLocalPath(sourcePath);
2473
+ try {
2474
+ if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
2475
+ upsertRegistry(datasetId, sourcePath, "completed");
2476
+ }
2477
+ }
2478
+ catch (e) {
2479
+ console.error(`[Registry] Failed to normalize registry path for ${datasetId}: ${e?.message || e}`);
2480
+ }
2481
+ // If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
2482
+ if (!fastMode) {
2483
+ const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
2484
+ const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
2485
+ const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
2486
+ if (!pipelineCompatibleInput) {
2487
+ console.error(`[Export] Skipping pipeline for ${currentExt} input; using direct exporter conversion.`);
2488
+ }
2489
+ else if (currentExt !== pipelineFmt) {
2490
+ console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
2491
+ try {
2492
+ sourcePath = ensureLocalPipelineSource(sourcePath, datasetId, targetDir);
2493
+ const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
2494
+ if (pipelineResult.final_output_path) {
2495
+ sourcePath = pipelineResult.final_output_path;
2496
+ try {
2497
+ // Update registry to point to pipeline's final output
2498
+ if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
2499
+ upsertRegistry(datasetId, sourcePath, "completed");
2500
+ }
2501
+ }
2502
+ catch (e) {
2503
+ console.error(`[Registry] Failed to update registry for ${datasetId}: ${e?.message || e}`);
2504
+ }
2505
+ }
2506
+ }
2507
+ catch (err) {
2508
+ console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
2509
+ }
2510
+ }
1481
2511
  }
1482
2512
  else {
1483
- throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}. Please run prepare_dataset first.`);
2513
+ console.error(`[Export] Fast mode - skipping quality analysis and cleaning`);
2514
+ }
2515
+ // Build export options
2516
+ const exportOpts = {};
2517
+ if (compression)
2518
+ exportOpts.compression = compression;
2519
+ if (preview)
2520
+ exportOpts.preview = true;
2521
+ if (sampleRows)
2522
+ exportOpts.sample_rows = sampleRows;
2523
+ if (columns)
2524
+ exportOpts.columns = columns;
2525
+ try {
2526
+ // Determine output file name
2527
+ const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
2528
+ const ext = extMap[requestedFormat] || ".feather";
2529
+ const safeName = getExportFileStem(datasetId);
2530
+ const outDir = targetDir;
2531
+ if (!fs.existsSync(outDir))
2532
+ fs.mkdirSync(outDir, { recursive: true });
2533
+ const outputFile = path.join(outDir, `${safeName}${ext}`);
2534
+ const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
2535
+ // Build rich response
2536
+ let msg = `**Export complete**\n`;
2537
+ msg += `- **File**: ${result.output_path}\n`;
2538
+ msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
2539
+ msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
2540
+ if (result.file_size_mb !== undefined)
2541
+ msg += `- **Size**: ${result.file_size_mb} MB\n`;
2542
+ if (result.elapsed_seconds !== undefined)
2543
+ msg += `- **Time**: ${result.elapsed_seconds}s\n`;
2544
+ if (result.preview_path)
2545
+ msg += `- **Preview**: ${result.preview_path}\n`;
2546
+ msg += `\n`;
2547
+ if (requestedFormat === "feather") {
2548
+ msg += `**Inspect with:**\n`;
2549
+ msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
2550
+ msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
2551
+ }
2552
+ else if (requestedFormat === "parquet") {
2553
+ msg += `**Inspect with:**\n`;
2554
+ msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
2555
+ msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
2556
+ }
2557
+ return { content: [{ type: "text", text: msg }] };
2558
+ }
2559
+ catch (error) {
2560
+ return {
2561
+ content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
2562
+ isError: true
2563
+ };
1484
2564
  }
1485
2565
  }
1486
- const report = await qualityAnalyzer.analyze(filePath);
1487
- // Phase 1: Target Detection
1488
- // We use the same TargetDetector instance inside CleaningPlanner now?
1489
- // Actually, we instantiated it inside CleaningPlanner, so we just need to pass the file path
1490
- // OR let the planner handle it if we update its signature to accept filePath.
1491
- // Let's check `CleaningPlanner.generatePlan` signature again.
1492
- // We updated it to accept `targetInfo`.
1493
- // So we need to run detection HERE and pass it.
1494
- // But `TargetDetector` is not exposed in `index.ts` scope yet.
1495
- // Let's create a global instance or use the one inside planner if exposed (it's private).
1496
- // Better approach: Instantiate TargetDetector here in index.ts for the tool content.
1497
- // Quick fix: Instantiate local detector or make global.
1498
- // I'll make a global `targetDetector` constant in index.ts
1499
- // But wait, I updated `CleaningPlanner` to instantiate its own detector.
1500
- // Does `CleaningPlanner` use it? No, I commented out the logic because it needed `filePath`.
1501
- // RETRY STRATEGY:
1502
- // 1. Instantiate `targetDetector` in `index.ts`.
1503
- // 2. Run `detectTarget(filePath)`.
1504
- // 3. Pass result to `cleaningPlanner.generatePlan(..., targetInfo)`.
1505
- // I need to add `const targetDetector = new TargetDetector(__dirname);` to imports/init section first.
1506
- // But since I'm in this tool, I can't look back.
1507
- // I will assume I can add it, or just do it inside the case for now.
1508
- // To do it properly, I should have added `targetDetector` to the global scope in previous step.
1509
- // Let's do that in a separate step if needed.
1510
- // For now, I'll instantiate it here.
1511
- const { TargetDetector } = await import("./preparation/target-detector.js");
1512
- const detector = new TargetDetector(__dirname);
1513
- const targetResult = await detector.detectTarget(filePath);
1514
- const targetInfo = targetResult.target_column ? {
1515
- target: targetResult.target_column,
1516
- confidence: targetResult.confidence
1517
- } : undefined;
1518
- const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
1519
- let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
1520
- if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
1521
- explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
1522
- explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
1523
- }
1524
- explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
1525
- if (plan.operations.length === 0) {
1526
- explanation += "No cleaning operations required.";
1527
- }
1528
- else {
1529
- plan.operations.forEach((op, i) => {
1530
- explanation += `${i + 1}. **${op.type}**: ${op.reason}\n`;
2566
+ case "vesper_list_datasets": {
2567
+ const entries = readRegistry();
2568
+ if (entries.length === 0) {
2569
+ return {
2570
+ content: [{ type: "text", text: "No prepared datasets found in the Vesper registry. Use prepare_dataset or download_dataset to add datasets." }]
2571
+ };
2572
+ }
2573
+ const lines = entries.map((e, i) => {
2574
+ const id = e.dataset_id || e.id || "unknown";
2575
+ const localPath = e.local_path || e.path || "unknown";
2576
+ const exists = typeof localPath === "string" && localPath !== "unknown" && fs.existsSync(localPath);
2577
+ return `${i + 1}. **${id}**\n Path: ${localPath}\n Status: ${e.status || "unknown"}${exists ? "" : " (file missing)"}`;
1531
2578
  });
1532
- }
1533
- return {
1534
- content: [{ type: "text", text: explanation }]
1535
- };
1536
- }
1537
- case "custom_clean": {
1538
- const datasetId = String(request.params.arguments?.dataset_id);
1539
- const ops = request.params.arguments?.operations;
1540
- if (!datasetId || datasetId === "undefined") {
1541
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1542
- }
1543
- if (!ops || !Array.isArray(ops) || ops.length === 0) {
1544
- throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
1545
- }
1546
- // Pre-check: verify dataset file exists before starting the job
1547
- const cleanRegEntry = getRegistryEntry(datasetId);
1548
- const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
1549
- const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
1550
- const cleanSafeId = datasetId.replace(/\//g, "_");
1551
- const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
1552
- (cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
1553
- fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
1554
- fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
1555
- fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
1556
- fs.existsSync(datasetId);
1557
- if (!cleanDataExists) {
1558
- return {
1559
- content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
1560
- isError: true,
1561
- };
1562
- }
1563
- const job = jobManager.createJob("clean", 0, { datasetId, ops });
1564
- return {
1565
- content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
1566
- };
1567
- }
1568
- case "prepare_dataset": {
1569
- hydrateExternalKeys();
1570
- const query = String(request.params.arguments?.query);
1571
- const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
1572
- const downloadImages = request.params.arguments?.download_images === true;
1573
- if (!query || query === "undefined") {
1574
- throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
1575
- }
1576
- const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
1577
- return {
1578
- content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
1579
- };
1580
- }
1581
- case "compare_datasets": {
1582
- const datasetIds = request.params.arguments?.dataset_ids;
1583
- const datasets = datasetIds.map(id => metadataStore.getDataset(id)).filter(d => !!d);
1584
- let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
1585
- comparison += "| :--- | " + datasets.map(() => " :---: ").join("|") + " |\n";
1586
- comparison += "| Quality Score | " + datasets.map(d => `${d.quality_score}/100`).join(" | ") + " |\n";
1587
- comparison += "| License | " + datasets.map(d => d.license.category).join(" | ") + " |\n";
1588
- comparison += "| Downloads | " + datasets.map(d => d.downloads).join(" | ") + " |\n";
1589
- comparison += "| Domain | " + datasets.map(d => d.domain).join(" | ") + " |\n";
1590
- return {
1591
- content: [{ type: "text", text: comparison }]
1592
- };
1593
- }
1594
- case "check_job_status": {
1595
- const jobId = String(request.params.arguments?.job_id);
1596
- const job = metadataStore.getJob(jobId);
1597
- if (!job) {
1598
- throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
1599
- }
1600
- const activeStatuses = new Set(["pending", "queued", "running", "retrying"]);
1601
- const now = Date.now();
1602
- const last = jobStatusLastPoll[jobId] || 0;
1603
- const minPollMs = 3000;
1604
- if (activeStatuses.has(job.status) && (now - last) < minPollMs) {
1605
- const waitMs = minPollMs - (now - last);
1606
2579
  return {
1607
- content: [{ type: "text", text: `No significant status change yet. Please poll again in ~${Math.ceil(waitMs / 1000)}s.` }]
2580
+ content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
1608
2581
  };
1609
2582
  }
1610
- jobStatusLastPoll[jobId] = now;
1611
- return {
1612
- content: [{ type: "text", text: formatJobStatus(job) }]
1613
- };
1614
- }
1615
- case "export_dataset": {
1616
- const datasetId = String(request.params.arguments?.dataset_id);
1617
- const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments?.target_dir) : undefined;
1618
- const requestedFormat = String(request.params.arguments?.format || "feather");
1619
- const fastMode = request.params.arguments?.fast === true;
1620
- const preview = request.params.arguments?.preview === true;
1621
- const sampleRows = request.params.arguments?.sample_rows ? Number(request.params.arguments.sample_rows) : undefined;
1622
- const columns = request.params.arguments?.columns;
1623
- const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
1624
- const dataset = metadataStore.getDataset(datasetId);
1625
- if (!dataset) {
1626
- throw new McpError(ErrorCode.InvalidParams, `Dataset not found: ${datasetId}`);
1627
- }
1628
- // Use Metadata or Registry to find the actual local file
1629
- let sourcePath = undefined;
1630
- const downloadStatus = metadataStore.getDownloadStatus(datasetId);
1631
- if (downloadStatus && fs.existsSync(downloadStatus.local_path)) {
1632
- sourcePath = downloadStatus.local_path;
1633
- }
1634
- else {
1635
- // Fallback to local registry
1636
- const reg = getRegistryEntry(datasetId);
1637
- if (reg && fs.existsSync(reg.local_path)) {
1638
- sourcePath = reg.local_path;
2583
+ case "vesper_convert_format": {
2584
+ const filePath = String(request.params.arguments?.file_path || "").trim();
2585
+ const targetFormat = String(request.params.arguments?.target_format || "").trim().toLowerCase();
2586
+ if (!filePath) {
2587
+ throw new McpError(ErrorCode.InvalidParams, "file_path is required");
1639
2588
  }
1640
- }
1641
- if (!sourcePath) {
1642
- console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
1643
- // Start a prepare job for this dataset id (acts like calling prepare_dataset)
2589
+ if (!["csv", "parquet", "json", "jsonl"].includes(targetFormat)) {
2590
+ throw new McpError(ErrorCode.InvalidParams, "target_format must be one of: csv, parquet, json, jsonl");
2591
+ }
2592
+ if (!fs.existsSync(filePath)) {
2593
+ return {
2594
+ content: [{ type: "text", text: `ERROR: File not found: ${filePath}` }],
2595
+ isError: true,
2596
+ };
2597
+ }
2598
+ const inputExt = path.extname(filePath).toLowerCase();
2599
+ const extMap = { csv: ".csv", parquet: ".parquet", json: ".json", jsonl: ".jsonl" };
2600
+ const outputExt = extMap[targetFormat];
2601
+ if (inputExt === outputExt) {
2602
+ return {
2603
+ content: [{ type: "text", text: `File is already in ${targetFormat} format: ${filePath}` }],
2604
+ };
2605
+ }
2606
+ const outputPath = filePath.replace(/\.[^.]+$/, outputExt);
1644
2607
  try {
1645
- jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false });
2608
+ await ensurePythonModules([{ module: "polars", packageName: "polars" }]);
2609
+ const convertScript = path.join(dataRoot, "python", "convert_engine.py");
2610
+ const result = await runPythonJson(convertScript, [filePath, outputPath]);
2611
+ if (!result.ok) {
2612
+ return {
2613
+ content: [{ type: "text", text: `ERROR: Conversion failed: ${result.error}` }],
2614
+ isError: true,
2615
+ };
2616
+ }
2617
+ // Register converted file in the registry
2618
+ const datasetId = path.basename(outputPath, outputExt);
2619
+ try {
2620
+ upsertRegistry(datasetId, outputPath, "completed");
2621
+ }
2622
+ catch (e) {
2623
+ console.error(`[Convert] Registry write failed: ${e?.message || e}`);
2624
+ }
2625
+ let msg = `**Conversion complete**\n`;
2626
+ msg += `- **Input**: ${filePath} (${inputExt.slice(1)})\n`;
2627
+ msg += `- **Output**: ${result.output_path} (${targetFormat})\n`;
2628
+ msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
2629
+ if (result.size_mb !== undefined)
2630
+ msg += `- **Size**: ${result.size_mb} MB\n`;
2631
+ return { content: [{ type: "text", text: msg }] };
1646
2632
  }
1647
- catch (e) {
1648
- console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
1649
- }
1650
- // Poll for download status or registry entry until local_path appears or timeout
1651
- const wait = (ms) => new Promise(res => setTimeout(res, ms));
1652
- const maxWait = 120_000; // 120s
1653
- const interval = 2000;
1654
- let waited = 0;
1655
- while (waited < maxWait) {
1656
- const ds = metadataStore.getDownloadStatus(datasetId);
1657
- if (ds && ds.local_path && fs.existsSync(ds.local_path)) {
1658
- sourcePath = ds.local_path;
1659
- console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
1660
- break;
2633
+ catch (error) {
2634
+ return {
2635
+ content: [{ type: "text", text: `ERROR: Conversion failed: ${error.message}` }],
2636
+ isError: true,
2637
+ };
2638
+ }
2639
+ }
2640
+ case "fuse_datasets": {
2641
+ const rawSources = request.params.arguments?.sources;
2642
+ if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
2643
+ throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
2644
+ }
2645
+ const strategy = request.params.arguments?.strategy || "concat";
2646
+ const joinOn = request.params.arguments?.join_on;
2647
+ const how = request.params.arguments?.how || "inner";
2648
+ const dedup = request.params.arguments?.dedup !== false;
2649
+ const runQualityAfter = request.params.arguments?.run_quality_after !== false;
2650
+ const leakageCheck = request.params.arguments?.leakage_check !== false;
2651
+ const outputFormat = request.params.arguments?.output_format || "feather";
2652
+ const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
2653
+ const preview = request.params.arguments?.preview !== false;
2654
+ const resolvedPaths = [];
2655
+ const unresolved = [];
2656
+ for (const src of rawSources) {
2657
+ if (fs.existsSync(src)) {
2658
+ resolvedPaths.push(src);
2659
+ continue;
1661
2660
  }
1662
- const reg = getRegistryEntry(datasetId);
1663
- const regPath = reg?.local_path || reg?.path;
1664
- if (regPath && fs.existsSync(regPath)) {
1665
- sourcePath = regPath;
1666
- console.error(`[Export] Local data found in registry for ${datasetId}: ${sourcePath}`);
1667
- break;
2661
+ const status = metadataStore.getDownloadStatus(src);
2662
+ if (status?.local_path && fs.existsSync(status.local_path)) {
2663
+ resolvedPaths.push(status.local_path);
2664
+ continue;
1668
2665
  }
1669
- await wait(interval);
1670
- waited += interval;
2666
+ unresolved.push(src);
1671
2667
  }
1672
- // If still no sourcePath, return helpful error listing prepared datasets
1673
- if (!sourcePath) {
1674
- const entries = readRegistry();
1675
- const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id || e.id || "unknown"}: ${e.local_path || e.path || "unknown"}`).slice(0, 10).join("\n") + (entries.length > 10 ? "\n...and " + (entries.length - 10) + " more" : "");
2668
+ if (unresolved.length > 0) {
1676
2669
  return {
1677
- content: [{ type: "text", text: `ERROR: No local data found for ${datasetId} after attempting prepare. Check credentials and try running prepare_dataset manually. Prepared datasets:\n${listText}` }],
2670
+ content: [{
2671
+ type: "text",
2672
+ text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
2673
+ }],
1678
2674
  isError: true
1679
2675
  };
1680
2676
  }
1681
- }
1682
- // If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
1683
- if (!fastMode) {
1684
- const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
1685
- const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
1686
- const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
1687
- if (!pipelineCompatibleInput) {
1688
- console.error(`[Export] Skipping pipeline for ${currentExt} input; using direct exporter conversion.`);
1689
- }
1690
- else if (currentExt !== pipelineFmt) {
1691
- console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
2677
+ try {
2678
+ const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
2679
+ const ext = extMap[outputFormat] || ".feather";
2680
+ const outDir = process.cwd();
2681
+ if (!fs.existsSync(outDir))
2682
+ fs.mkdirSync(outDir, { recursive: true });
2683
+ const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
2684
+ console.error(`[Fusion] Resolved output directory: ${outDir}`);
2685
+ const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
2686
+ strategy,
2687
+ join_on: joinOn,
2688
+ how,
2689
+ dedup,
2690
+ run_quality_after: runQualityAfter,
2691
+ leakage_check: leakageCheck,
2692
+ output_format: outputFormat,
2693
+ compression: compression,
2694
+ preview,
2695
+ });
2696
+ const nullDelta = result.stats.null_delta;
2697
+ const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
2698
+ // Register fused dataset under a generated id so users can export it easily
2699
+ const fusedId = `fused_${Date.now()}`;
1692
2700
  try {
1693
- const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
1694
- if (pipelineResult.final_output_path) {
1695
- sourcePath = pipelineResult.final_output_path;
1696
- try {
1697
- // Update registry to point to pipeline's final output
1698
- upsertRegistry(datasetId, sourcePath, "completed");
1699
- }
1700
- catch (e) {
1701
- console.error(`[Registry] Failed to update registry for ${datasetId}: ${e?.message || e}`);
1702
- }
1703
- }
2701
+ upsertRegistry(fusedId, result.output_path, "completed");
1704
2702
  }
1705
- catch (err) {
1706
- console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
2703
+ catch (e) {
2704
+ console.error(`[Registry] Failed to register fused dataset ${fusedId}: ${e?.message || e}`);
1707
2705
  }
2706
+ let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
2707
+ msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
2708
+ msg += `- Null change: ${nullText}\n`;
2709
+ msg += `- Output: ${result.output_path}\n`;
2710
+ if (result.preview_path)
2711
+ msg += `- Preview: ${result.preview_path}\n`;
2712
+ if (result.leakage_report) {
2713
+ msg += `- Leakage: ${result.leakage_report.leakage_detected ? "detected" : "none"}`;
2714
+ if (result.leakage_report.leakage_count) {
2715
+ msg += ` (${result.leakage_report.leakage_count})`;
2716
+ }
2717
+ msg += "\n";
2718
+ }
2719
+ msg += `\nNext: run split_dataset/export_dataset on fused output. Registered fused dataset id: ${fusedId}`;
2720
+ return { content: [{ type: "text", text: msg }] };
1708
2721
  }
1709
- }
1710
- else {
1711
- console.error(`[Export] Fast mode - skipping quality analysis and cleaning`);
1712
- }
1713
- // Build export options
1714
- const exportOpts = {};
1715
- if (compression)
1716
- exportOpts.compression = compression;
1717
- if (preview)
1718
- exportOpts.preview = true;
1719
- if (sampleRows)
1720
- exportOpts.sample_rows = sampleRows;
1721
- if (columns)
1722
- exportOpts.columns = columns;
1723
- try {
1724
- // Determine output file name
1725
- const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
1726
- const ext = extMap[requestedFormat] || ".feather";
1727
- const safeName = datasetId.replace(/\//g, "_");
1728
- const outDir = targetDir || path.join(dataRoot, "exports");
1729
- if (!fs.existsSync(outDir))
1730
- fs.mkdirSync(outDir, { recursive: true });
1731
- const outputFile = path.join(outDir, `${safeName}${ext}`);
1732
- const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
1733
- // Build rich response
1734
- let msg = `**Export complete**\n`;
1735
- msg += `- **File**: ${result.output_path}\n`;
1736
- msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
1737
- msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
1738
- if (result.file_size_mb !== undefined)
1739
- msg += `- **Size**: ${result.file_size_mb} MB\n`;
1740
- if (result.elapsed_seconds !== undefined)
1741
- msg += `- **Time**: ${result.elapsed_seconds}s\n`;
1742
- if (result.preview_path)
1743
- msg += `- **Preview**: ${result.preview_path}\n`;
1744
- msg += `\n`;
1745
- if (requestedFormat === "feather") {
1746
- msg += `**Inspect with:**\n`;
1747
- msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
1748
- msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
1749
- }
1750
- else if (requestedFormat === "parquet") {
1751
- msg += `**Inspect with:**\n`;
1752
- msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
1753
- msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
1754
- }
1755
- return { content: [{ type: "text", text: msg }] };
1756
- }
1757
- catch (error) {
1758
- return {
1759
- content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
1760
- isError: true
1761
- };
1762
- }
1763
- }
1764
- case "fuse_datasets": {
1765
- const rawSources = request.params.arguments?.sources;
1766
- if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
1767
- throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
1768
- }
1769
- const strategy = request.params.arguments?.strategy || "concat";
1770
- const joinOn = request.params.arguments?.join_on;
1771
- const how = request.params.arguments?.how || "inner";
1772
- const dedup = request.params.arguments?.dedup !== false;
1773
- const runQualityAfter = request.params.arguments?.run_quality_after !== false;
1774
- const leakageCheck = request.params.arguments?.leakage_check !== false;
1775
- const outputFormat = request.params.arguments?.output_format || "feather";
1776
- const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
1777
- const preview = request.params.arguments?.preview !== false;
1778
- const resolvedPaths = [];
1779
- const unresolved = [];
1780
- for (const src of rawSources) {
1781
- if (fs.existsSync(src)) {
1782
- resolvedPaths.push(src);
1783
- continue;
1784
- }
1785
- const status = metadataStore.getDownloadStatus(src);
1786
- if (status?.local_path && fs.existsSync(status.local_path)) {
1787
- resolvedPaths.push(status.local_path);
1788
- continue;
2722
+ catch (error) {
2723
+ return {
2724
+ content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
2725
+ isError: true
2726
+ };
1789
2727
  }
1790
- unresolved.push(src);
1791
- }
1792
- if (unresolved.length > 0) {
1793
- return {
1794
- content: [{
1795
- type: "text",
1796
- text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
1797
- }],
1798
- isError: true
1799
- };
1800
2728
  }
1801
- try {
1802
- const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
1803
- const ext = extMap[outputFormat] || ".feather";
1804
- const outDir = path.join(dataRoot, "fusion");
1805
- if (!fs.existsSync(outDir))
1806
- fs.mkdirSync(outDir, { recursive: true });
1807
- const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
1808
- const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
1809
- strategy,
1810
- join_on: joinOn,
1811
- how,
1812
- dedup,
1813
- run_quality_after: runQualityAfter,
1814
- leakage_check: leakageCheck,
1815
- output_format: outputFormat,
1816
- compression: compression,
1817
- preview,
1818
- });
1819
- const nullDelta = result.stats.null_delta;
1820
- const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
1821
- // Register fused dataset under a generated id so users can export it easily
1822
- const fusedId = `fused_${Date.now()}`;
1823
- try {
1824
- upsertRegistry(fusedId, result.output_path, "completed");
2729
+ case "analyze_image_quality": {
2730
+ const inputPath = String(request.params.arguments?.path);
2731
+ if (!fs.existsSync(inputPath)) {
2732
+ throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
1825
2733
  }
1826
- catch (e) {
1827
- console.error(`[Registry] Failed to register fused dataset ${fusedId}: ${e?.message || e}`);
1828
- }
1829
- let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
1830
- msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
1831
- msg += `- Null change: ${nullText}\n`;
1832
- msg += `- Output: ${result.output_path}\n`;
1833
- if (result.preview_path)
1834
- msg += `- Preview: ${result.preview_path}\n`;
1835
- if (result.leakage_report) {
1836
- msg += `- Leakage: ${result.leakage_report.leakage_detected ? "detected" : "none"}`;
1837
- if (result.leakage_report.leakage_count) {
1838
- msg += ` (${result.leakage_report.leakage_count})`;
2734
+ try {
2735
+ const report = await imageAnalyzer.analyze(inputPath);
2736
+ let output = `## Image Quality Report\n\n`;
2737
+ output += `- **Total Images**: ${report.total_images}\n`;
2738
+ output += `- **Corrupted**: ${report.corrupted_count}\n`;
2739
+ output += `- **Average Resolution**: ${Math.round(report.average_width)}x${Math.round(report.average_height)}\n`;
2740
+ output += `- **Blurry Images**: ${report.blurry_count}\n\n`;
2741
+ if (report.individual_results.length > 0) {
2742
+ output += `### Sample Detail (Top 5)\n`;
2743
+ report.individual_results.slice(0, 5).forEach(img => {
2744
+ const statusLabel = img.status === "ok" ? (img.is_blurry ? "[warn]" : "[ok]") : "[error]";
2745
+ output += `${statusLabel} **${img.filename}**: ${img.width}x${img.height} | Mode: ${img.mode} | Blur: ${img.blur_score?.toFixed(1)}\n`;
2746
+ });
1839
2747
  }
1840
- msg += "\n";
2748
+ return {
2749
+ content: [{ type: "text", text: output }]
2750
+ };
2751
+ }
2752
+ catch (error) {
2753
+ return {
2754
+ content: [{ type: "text", text: `ERROR: Image analysis failed: ${error.message}` }],
2755
+ isError: true
2756
+ };
1841
2757
  }
1842
- msg += `\nNext: run split_dataset/export_dataset on fused output. Registered fused dataset id: ${fusedId}`;
1843
- return { content: [{ type: "text", text: msg }] };
1844
- }
1845
- catch (error) {
1846
- return {
1847
- content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
1848
- isError: true
1849
- };
1850
- }
1851
- }
1852
- case "analyze_image_quality": {
1853
- const inputPath = String(request.params.arguments?.path);
1854
- if (!fs.existsSync(inputPath)) {
1855
- throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
1856
2758
  }
1857
- try {
1858
- const report = await imageAnalyzer.analyze(inputPath);
1859
- let output = `## Image Quality Report\n\n`;
1860
- output += `- **Total Images**: ${report.total_images}\n`;
1861
- output += `- **Corrupted**: ${report.corrupted_count}\n`;
1862
- output += `- **Average Resolution**: ${Math.round(report.average_width)}x${Math.round(report.average_height)}\n`;
1863
- output += `- **Blurry Images**: ${report.blurry_count}\n\n`;
1864
- if (report.individual_results.length > 0) {
1865
- output += `### Sample Detail (Top 5)\n`;
1866
- report.individual_results.slice(0, 5).forEach(img => {
1867
- const statusLabel = img.status === "ok" ? (img.is_blurry ? "[warn]" : "[ok]") : "[error]";
1868
- output += `${statusLabel} **${img.filename}**: ${img.width}x${img.height} | Mode: ${img.mode} | Blur: ${img.blur_score?.toFixed(1)}\n`;
2759
+ case "analyze_media_quality": {
2760
+ const inputPath = String(request.params.arguments?.path);
2761
+ if (!fs.existsSync(inputPath)) {
2762
+ throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
2763
+ }
2764
+ try {
2765
+ const report = await mediaAnalyzer.analyze(inputPath);
2766
+ let output = `## Media Quality Report\n\n`;
2767
+ output += `- **Total Files**: ${report.total_files}\n`;
2768
+ output += `- **OK Files**: ${report.ok_files}\n`;
2769
+ output += `- **Failed Files**: ${report.failed_files}\n`;
2770
+ if ('avg_audio_duration' in report && report.avg_audio_duration) {
2771
+ output += `- **Average Audio Duration**: ${report.avg_audio_duration.toFixed(2)}s\n`;
2772
+ }
2773
+ if ('avg_video_duration' in report && report.avg_video_duration) {
2774
+ output += `- **Average Video Duration**: ${report.avg_video_duration.toFixed(2)}s\n`;
2775
+ output += `- **Average FPS**: ${report.avg_fps?.toFixed(1)}\n`;
2776
+ }
2777
+ output += `\n### Sample Detail (Top 5)\n`;
2778
+ report.details.slice(0, 5).forEach(item => {
2779
+ const statusLabel = item.status === "ok" ? "[ok]" : "[error]";
2780
+ if (item.type === "audio" && 'sample_rate' in item) {
2781
+ output += `${statusLabel} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
2782
+ }
2783
+ else if (item.type === "video" && 'width' in item) {
2784
+ output += `${statusLabel} **${item.filename}**: ${item.width}x${item.height} | ${item.fps?.toFixed(1)}fps | Risk: ${item.corruption_risk}\n`;
2785
+ }
2786
+ else {
2787
+ output += `${statusLabel} **${item.filename}**: ${item.error}\n`;
2788
+ }
1869
2789
  });
2790
+ return {
2791
+ content: [{ type: "text", text: output }]
2792
+ };
2793
+ }
2794
+ catch (error) {
2795
+ return {
2796
+ content: [{ type: "text", text: `ERROR: Media analysis failed: ${error.message}` }],
2797
+ isError: true
2798
+ };
1870
2799
  }
1871
- return {
1872
- content: [{ type: "text", text: output }]
1873
- };
1874
- }
1875
- catch (error) {
1876
- return {
1877
- content: [{ type: "text", text: `ERROR: Image analysis failed: ${error.message}` }],
1878
- isError: true
1879
- };
1880
- }
1881
- }
1882
- case "analyze_media_quality": {
1883
- const inputPath = String(request.params.arguments?.path);
1884
- if (!fs.existsSync(inputPath)) {
1885
- throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
1886
2800
  }
1887
- try {
1888
- const report = await mediaAnalyzer.analyze(inputPath);
1889
- let output = `## Media Quality Report\n\n`;
1890
- output += `- **Total Files**: ${report.total_files}\n`;
1891
- output += `- **OK Files**: ${report.ok_files}\n`;
1892
- output += `- **Failed Files**: ${report.failed_files}\n`;
1893
- if ('avg_audio_duration' in report && report.avg_audio_duration) {
1894
- output += `- **Average Audio Duration**: ${report.avg_audio_duration.toFixed(2)}s\n`;
1895
- }
1896
- if ('avg_video_duration' in report && report.avg_video_duration) {
1897
- output += `- **Average Video Duration**: ${report.avg_video_duration.toFixed(2)}s\n`;
1898
- output += `- **Average FPS**: ${report.avg_fps?.toFixed(1)}\n`;
1899
- }
1900
- output += `\n### Sample Detail (Top 5)\n`;
1901
- report.details.slice(0, 5).forEach(item => {
1902
- const statusLabel = item.status === "ok" ? "[ok]" : "[error]";
1903
- if (item.type === "audio" && 'sample_rate' in item) {
1904
- output += `${statusLabel} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
2801
+ case "generate_quality_report": {
2802
+ const datasetId = String(request.params.arguments?.dataset_id);
2803
+ const datasetPath = String(request.params.arguments?.dataset_path);
2804
+ if (!fs.existsSync(datasetPath)) {
2805
+ throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
2806
+ }
2807
+ try {
2808
+ // Optionally load text quality from metadata if available
2809
+ const metadata = await metadataStore.getDataset(datasetId);
2810
+ // TODO: Integrate text quality analysis when available
2811
+ const textQuality = null;
2812
+ const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
2813
+ // Save report to metadata
2814
+ if (metadata) {
2815
+ metadata.unified_quality_report = report;
2816
+ await metadataStore.saveDataset(metadata);
1905
2817
  }
1906
- else if (item.type === "video" && 'width' in item) {
1907
- output += `${statusLabel} **${item.filename}**: ${item.width}x${item.height} | ${item.fps?.toFixed(1)}fps | Risk: ${item.corruption_risk}\n`;
2818
+ let output = `# Unified Quality Report\n\n`;
2819
+ output += `**Dataset**: ${datasetId}\n`;
2820
+ output += `**Modalities**: ${report.modalities.join(", ")}\n`;
2821
+ output += `**Overall Quality Score**: ${report.overall_quality_score}/100\n\n`;
2822
+ if (report.text_quality) {
2823
+ output += `## Text Quality\n`;
2824
+ output += `- Rows: ${report.text_quality.row_count}\n`;
2825
+ output += `- Columns: ${report.text_quality.column_count}\n`;
2826
+ output += `- Missing: ${report.text_quality.missing_percentage.toFixed(1)}%\n`;
2827
+ output += `- Duplicates: ${report.text_quality.duplicate_percentage.toFixed(1)}%\n\n`;
1908
2828
  }
1909
- else {
1910
- output += `${statusLabel} **${item.filename}**: ${item.error}\n`;
2829
+ if (report.image_quality) {
2830
+ output += `## Image Quality\n`;
2831
+ output += `- Total Images: ${report.image_quality.total_images}\n`;
2832
+ output += `- Corrupted: ${report.image_quality.corrupted_count}\n`;
2833
+ output += `- Avg Resolution: ${report.image_quality.avg_resolution}\n`;
2834
+ output += `- Blurry: ${report.image_quality.blurry_percentage.toFixed(1)}%\n\n`;
1911
2835
  }
1912
- });
1913
- return {
1914
- content: [{ type: "text", text: output }]
1915
- };
1916
- }
1917
- catch (error) {
1918
- return {
1919
- content: [{ type: "text", text: `ERROR: Media analysis failed: ${error.message}` }],
1920
- isError: true
1921
- };
1922
- }
1923
- }
1924
- case "generate_quality_report": {
1925
- const datasetId = String(request.params.arguments?.dataset_id);
1926
- const datasetPath = String(request.params.arguments?.dataset_path);
1927
- if (!fs.existsSync(datasetPath)) {
1928
- throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
1929
- }
1930
- try {
1931
- // Optionally load text quality from metadata if available
1932
- const metadata = await metadataStore.getDataset(datasetId);
1933
- // TODO: Integrate text quality analysis when available
1934
- const textQuality = null;
1935
- const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
1936
- // Save report to metadata
1937
- if (metadata) {
1938
- metadata.unified_quality_report = report;
1939
- await metadataStore.saveDataset(metadata);
1940
- }
1941
- let output = `# Unified Quality Report\n\n`;
1942
- output += `**Dataset**: ${datasetId}\n`;
1943
- output += `**Modalities**: ${report.modalities.join(", ")}\n`;
1944
- output += `**Overall Quality Score**: ${report.overall_quality_score}/100\n\n`;
1945
- if (report.text_quality) {
1946
- output += `## Text Quality\n`;
1947
- output += `- Rows: ${report.text_quality.row_count}\n`;
1948
- output += `- Columns: ${report.text_quality.column_count}\n`;
1949
- output += `- Missing: ${report.text_quality.missing_percentage.toFixed(1)}%\n`;
1950
- output += `- Duplicates: ${report.text_quality.duplicate_percentage.toFixed(1)}%\n\n`;
1951
- }
1952
- if (report.image_quality) {
1953
- output += `## Image Quality\n`;
1954
- output += `- Total Images: ${report.image_quality.total_images}\n`;
1955
- output += `- Corrupted: ${report.image_quality.corrupted_count}\n`;
1956
- output += `- Avg Resolution: ${report.image_quality.avg_resolution}\n`;
1957
- output += `- Blurry: ${report.image_quality.blurry_percentage.toFixed(1)}%\n\n`;
1958
- }
1959
- if (report.audio_quality) {
1960
- output += `## Audio Quality\n`;
1961
- output += `- Total Files: ${report.audio_quality.total_files}\n`;
1962
- output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
1963
- output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
1964
- output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
1965
- }
1966
- if (report.video_quality) {
1967
- output += `## Video Quality\n`;
1968
- output += `- Total Files: ${report.video_quality.total_files}\n`;
1969
- output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
1970
- output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
1971
- output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
1972
- }
1973
- output += `## Recommendations\n`;
1974
- report.recommendations.forEach(rec => {
1975
- output += `- ${rec}\n`;
1976
- });
1977
- return {
1978
- content: [{ type: "text", text: output }]
1979
- };
1980
- }
1981
- catch (error) {
1982
- return {
1983
- content: [{ type: "text", text: `ERROR: Quality report generation failed: ${error.message}` }],
1984
- isError: true
1985
- };
2836
+ if (report.audio_quality) {
2837
+ output += `## Audio Quality\n`;
2838
+ output += `- Total Files: ${report.audio_quality.total_files}\n`;
2839
+ output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
2840
+ output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
2841
+ output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
2842
+ }
2843
+ if (report.video_quality) {
2844
+ output += `## Video Quality\n`;
2845
+ output += `- Total Files: ${report.video_quality.total_files}\n`;
2846
+ output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
2847
+ output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
2848
+ output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
2849
+ }
2850
+ output += `## Recommendations\n`;
2851
+ report.recommendations.forEach(rec => {
2852
+ output += `- ${rec}\n`;
2853
+ });
2854
+ return {
2855
+ content: [{ type: "text", text: output }]
2856
+ };
2857
+ }
2858
+ catch (error) {
2859
+ return {
2860
+ content: [{ type: "text", text: `ERROR: Quality report generation failed: ${error.message}` }],
2861
+ isError: true
2862
+ };
2863
+ }
1986
2864
  }
2865
+ default:
2866
+ throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
1987
2867
  }
1988
- default:
1989
- throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
1990
- }
2868
+ }); // end requestQueue.enqueue
1991
2869
  });
1992
2870
  async function main() {
1993
2871
  const args = process.argv.slice(2);
@@ -1995,6 +2873,7 @@ async function main() {
1995
2873
  const isFuse = args.includes("fuse");
1996
2874
  const isDiscover = args.includes("discover");
1997
2875
  const isDownload = args.includes("download");
2876
+ const isExport = args.includes("export");
1998
2877
  const isConfig = args.includes("config") || args.includes("configure");
1999
2878
  const isSetup = args.includes("--setup") || args.includes("setup");
2000
2879
  const isSilent = args.includes("--silent");
@@ -2017,6 +2896,10 @@ async function main() {
2017
2896
  await runDownloadCli(args);
2018
2897
  return;
2019
2898
  }
2899
+ if (isExport) {
2900
+ await runExportCli(args);
2901
+ return;
2902
+ }
2020
2903
  // If run in explicit setup mode, show setup wizard (do not auto-run on server startup)
2021
2904
  if (isSetup) {
2022
2905
  await runSetupWizard(isSilent);
@@ -2289,6 +3172,99 @@ async function runDownloadCli(args) {
2289
3172
  }
2290
3173
  console.log(`Download complete: ${localPath}`);
2291
3174
  }
3175
/**
 * CLI handler for `vespermcp export <dataset-id|local-path> [options]`.
 *
 * Resolves the dataset reference to a local file, optionally runs the
 * conversion pipeline (skipped with --fast), then delegates to
 * `dataExporter.export` and prints a summary to stdout. Status/diagnostic
 * messages go to stderr so stdout stays machine-readable.
 *
 * Exits the process with code 1 when no dataset ref is given or when no
 * local data can be found for it.
 *
 * Supported flags: --format, --target-dir, --compression, --sample-rows,
 * --columns, --fast, --preview.
 *
 * @param {string[]} args - Raw CLI args (process.argv.slice(2)); includes the
 *   "export" command word followed by the dataset id or local path.
 */
async function runExportCli(args) {
    // Read the value following a `--flag`. Returns undefined when the flag is
    // absent, is the last token, or the following token is itself a flag
    // (fix: previously `--format --preview` consumed "--preview" as the
    // format value).
    const getArgValue = (name) => {
        const idx = args.findIndex(a => a === name);
        if (idx >= 0 && idx + 1 < args.length && !args[idx + 1].startsWith("--")) {
            return args[idx + 1];
        }
        return undefined;
    };
    // Positional args: drop flags and the values consumed by value-taking flags.
    const VALUE_FLAGS = ["--target-dir", "--format", "--compression", "--sample-rows", "--columns"];
    const nonFlags = args.filter((arg, index) => {
        if (arg.startsWith("--"))
            return false;
        const previous = index > 0 ? args[index - 1] : "";
        return !VALUE_FLAGS.includes(previous);
    });
    // nonFlags[0] is the "export" command word itself.
    const datasetId = nonFlags[1] || "";
    if (!datasetId) {
        console.error("Usage: vespermcp export <dataset-id|local-path> [--format parquet|feather|csv|jsonl|arrow] [--target-dir C:/path] [--compression snappy] [--fast] [--preview] [--sample-rows N] [--columns col1,col2]");
        process.exit(1);
    }
    const requestedFormat = getArgValue("--format") || "parquet";
    const targetDir = getArgValue("--target-dir");
    const compression = getArgValue("--compression");
    const sampleRows = getArgValue("--sample-rows");
    const columns = getArgValue("--columns");
    const fastMode = args.includes("--fast");
    const preview = args.includes("--preview");
    const isDirectLocalInput = isDirectLocalDatasetReference(datasetId);
    const resolvedTargetDir = path.resolve(targetDir || process.cwd());
    let sourcePath = resolveDatasetLocalPath(datasetId, [resolvedTargetDir, process.cwd()]);
    if (!sourcePath) {
        console.error(`Export failed: no local data found for ${datasetId}. Run download or prepare first, or pass a direct local path.`);
        process.exit(1);
    }
    sourcePath = ensureExportableLocalPath(sourcePath);
    // Best-effort registry bookkeeping; never block the export on it.
    try {
        if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
            upsertRegistry(datasetId, sourcePath, "completed");
        }
    }
    catch (e) {
        console.error(`[Registry] Failed to normalize registry path for ${datasetId}: ${e?.message || e}`);
    }
    if (!fastMode) {
        // The conversion pipeline only understands csv/parquet; run it when the
        // on-disk format differs from the (pipeline-compatible) requested one.
        const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
        const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
        const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
        if (pipelineCompatibleInput && currentExt !== pipelineFmt) {
            try {
                sourcePath = ensureLocalPipelineSource(sourcePath, datasetId, resolvedTargetDir);
                const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
                if (pipelineResult.final_output_path) {
                    sourcePath = pipelineResult.final_output_path;
                    if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
                        upsertRegistry(datasetId, sourcePath, "completed");
                    }
                }
            }
            catch (err) {
                // Pipeline failure is non-fatal: export the raw file instead.
                console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
            }
        }
    }
    const exportOpts = {};
    if (compression)
        exportOpts.compression = compression;
    if (preview)
        exportOpts.preview = true;
    if (sampleRows) {
        const parsedSampleRows = Number(sampleRows);
        if (Number.isInteger(parsedSampleRows) && parsedSampleRows > 0) {
            exportOpts.sample_rows = parsedSampleRows;
        }
        else {
            // fix: previously a non-numeric --sample-rows value produced NaN
            // and was passed straight through to the exporter.
            console.error(`[Export] Ignoring invalid --sample-rows value: ${sampleRows}`);
        }
    }
    if (columns)
        exportOpts.columns = columns.split(",").map(col => col.trim()).filter(Boolean);
    const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
    const ext = extMap[requestedFormat];
    if (!ext) {
        // The raw format string is still forwarded to the exporter below, but
        // surface the mismatch instead of silently naming the file .parquet.
        console.error(`[Export] Unknown format "${requestedFormat}"; defaulting output extension to .parquet`);
    }
    const safeName = getExportFileStem(datasetId);
    const outDir = resolvedTargetDir;
    if (!fs.existsSync(outDir))
        fs.mkdirSync(outDir, { recursive: true });
    const outputFile = path.join(outDir, `${safeName}${ext || ".parquet"}`);
    console.error(`[Export] Resolved output directory: ${outDir}`);
    console.error(`[Export] Output file: ${outputFile}`);
    const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
    console.log(`Export complete: ${result.output_path}`);
    console.log(`Format: ${result.format}${result.compression ? ` (${result.compression})` : ""}`);
    if (result.rows !== undefined)
        console.log(`Rows: ${result.rows.toLocaleString()}`);
    if (result.columns !== undefined)
        console.log(`Columns: ${result.columns}`);
    if (result.file_size_mb !== undefined)
        console.log(`Size: ${result.file_size_mb} MB`);
    if (result.preview_path)
        console.log(`Preview: ${result.preview_path}`);
}
2292
3268
  async function runFuseCli(args) {
2293
3269
  const getArgValue = (name) => {
2294
3270
  const idx = args.findIndex(a => a === name);