@vespermcp/mcp-server 1.2.12 → 1.2.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +232 -9
- package/package.json +1 -1
- package/scripts/preindex_registry.cjs +157 -0
package/build/index.js
CHANGED
|
@@ -1,4 +1,75 @@
|
|
|
1
|
-
|
|
1
|
+
// --- Dataset ID Normalization ---
/**
 * Convert a dataset id into a filesystem-safe registry key.
 * The "kaggle:" scheme prefix is folded into a "kaggle_" key prefix,
 * and any "/" or ":" characters become "_" so the key is safe to use
 * in file names. Store and lookup must both go through this function
 * so keys always agree.
 * @param {string} dataset_id - Raw id, e.g. "kaggle:user/ds" or "user/ds".
 * @returns {string} Normalized registry key, e.g. "kaggle_user_ds".
 */
function normalize_dataset_id(dataset_id) {
    const hadKaggleScheme = dataset_id.startsWith("kaggle:");
    // Drop the scheme, then make the remainder filesystem-safe.
    const safeKey = dataset_id.replace(/^kaggle:/, "").replace(/[/:]/g, "_");
    return hadKaggleScheme ? `kaggle_${safeKey}` : safeKey;
}
|
|
10
|
+
// --- Dataset Registry Helpers ---
// The registry is a JSON array of { dataset_id, local_path, status }
// entries persisted under the data root.
// Absolute path of the on-disk registry file.
// NOTE(review): assumes `dataRoot` (defined elsewhere in this file) points
// to an existing directory — confirm; writeRegistry fails with ENOENT otherwise.
function getRegistryPath() {
    return path.join(dataRoot, "registry.json");
}
|
|
14
|
+
/**
 * Load the registry entry list from disk.
 * Missing or unparseable registry files are treated as an empty registry
 * (best-effort: callers never see an exception from here).
 * @returns {Array<{dataset_id: string, local_path: string, status: string}>}
 */
function readRegistry() {
    const registryPath = getRegistryPath();
    if (!fs.existsSync(registryPath)) {
        return [];
    }
    let entries;
    try {
        entries = JSON.parse(fs.readFileSync(registryPath, "utf-8"));
    } catch {
        // Corrupt/partial file: fall back to an empty registry.
        entries = [];
    }
    return entries;
}
|
|
26
|
+
/**
 * Persist the full registry entry list as pretty-printed JSON.
 * Fix: ensure the parent directory exists first, so a fresh install
 * (no data root created yet) does not fail with ENOENT.
 * @param {Array<{dataset_id: string, local_path: string, status: string}>} entries
 */
function writeRegistry(entries) {
    const registryPath = getRegistryPath();
    // Recursive mkdir is a no-op when the directory already exists.
    fs.mkdirSync(path.dirname(registryPath), { recursive: true });
    fs.writeFileSync(registryPath, JSON.stringify(entries, null, 2));
}
|
|
30
|
+
/**
 * Insert or replace the registry entry for a dataset.
 * The id is normalized before use so writes and lookups share one key space.
 * @param {string} dataset_id - Raw dataset id (scheme-prefixed or bare).
 * @param {string} local_path - Local path of the prepared data.
 * @param {string} status - Entry status, e.g. "completed".
 */
function upsertRegistry(dataset_id, local_path, status) {
    const norm_id = normalize_dataset_id(dataset_id);
    console.error(`[Registry] Writing key: ${norm_id}`);
    const record = { dataset_id: norm_id, local_path, status };
    const entries = readRegistry();
    const existing = entries.findIndex(entry => entry.dataset_id === norm_id);
    if (existing === -1) {
        entries.push(record);
    } else {
        entries[existing] = record;
    }
    writeRegistry(entries);
}
|
|
43
|
+
/**
 * Look up a registry entry by dataset id (normalized before comparison).
 * @param {string} dataset_id - Raw dataset id.
 * @returns {{dataset_id: string, local_path: string, status: string}|undefined}
 */
function getRegistryEntry(dataset_id) {
    const norm_id = normalize_dataset_id(dataset_id);
    console.error(`[Registry] Lookup key: ${norm_id}`);
    const entries = readRegistry();
    return entries.find(entry => entry.dataset_id === norm_id);
}
|
|
48
|
+
// --- Pipeline State Tracker ---
// Tracks completed steps per dataset, in-memory only: state resets when
// the server process restarts.
const pipelineState = {};
// Key under which a dataset's completed steps are stored. Kept as a
// function so the keying scheme (e.g. adding a session/job scope) can
// change in one place.
function getPipelineKey(datasetId) {
    return datasetId;
}
/**
 * Record that `step` has completed for `datasetId`.
 * @param {string} datasetId - Dataset identifier.
 * @param {string} step - Pipeline step name, e.g. "search", "download".
 */
export function markStepComplete(datasetId, step) {
    const key = getPipelineKey(datasetId);
    if (!pipelineState[key])
        pipelineState[key] = new Set();
    pipelineState[key].add(step);
}
/**
 * Whether `step` has completed for `datasetId`.
 * Fix: always return a boolean — the previous optional-chained lookup
 * yielded `undefined` for datasets with no recorded steps.
 * @returns {boolean}
 */
export function hasStep(datasetId, step) {
    const key = getPipelineKey(datasetId);
    return pipelineState[key]?.has(step) ?? false;
}
|
|
64
|
+
// --- Dataset ID Auto-Detection ---
/**
 * Normalize a user-supplied dataset identifier:
 * - ids that already carry a known scheme (kaggle:, hf:, huggingface:,
 *   openml:, dataworld:, http:, https:) are returned trimmed, as-is;
 * - bare "owner/name" ids (contain "/" but no ":") are assumed to be
 *   Kaggle slugs and get a "kaggle:" prefix;
 * - anything else is returned trimmed.
 *
 * Fix: the previous pattern put the colon both inside each alternative
 * ("kaggle:", "hf:", ...) and again after the group, so scheme ids only
 * matched with a doubled colon (e.g. "kaggle::"). Outputs were saved by
 * the fall-through branches, but the scheme check was effectively dead.
 * The colon now appears exactly once, after the scheme-name group.
 * @param {string} id - Raw user input.
 * @returns {string} Canonical dataset id.
 */
export function parseDatasetId(id) {
    const trimmed = id.trim();
    if (/^(kaggle|hf|huggingface|openml|dataworld|http|https):/i.test(trimmed))
        return trimmed;
    if (trimmed.includes("/") && !trimmed.includes(":"))
        return `kaggle:${trimmed}`;
    return trimmed;
}
|
|
2
73
|
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
|
|
3
74
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
4
75
|
import { CallToolRequestSchema, ListToolsRequestSchema, ErrorCode, McpError, } from "@modelcontextprotocol/sdk/types.js";
|
|
@@ -443,6 +514,13 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
443
514
|
preview: true,
|
|
444
515
|
});
|
|
445
516
|
rawFilePath = fusionResult.output_path;
|
|
517
|
+
try {
|
|
518
|
+
// Register fused output for this top dataset so export can find it
|
|
519
|
+
upsertRegistry(topDataset.id, rawFilePath, "completed");
|
|
520
|
+
}
|
|
521
|
+
catch (e) {
|
|
522
|
+
console.error(`[Registry] Failed to write registry for fused output ${topDataset.id}: ${e?.message || e}`);
|
|
523
|
+
}
|
|
446
524
|
currentRows = await countRows(rawFilePath);
|
|
447
525
|
}
|
|
448
526
|
if (currentRows < requestedRows) {
|
|
@@ -462,6 +540,13 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
462
540
|
update({ progress: 85, status_text: "Installing dataset into project..." });
|
|
463
541
|
const installPath = await installService.install(topDataset.id, rawFilePath);
|
|
464
542
|
update({ progress: 100, status_text: "Preparation complete!" });
|
|
543
|
+
// Register prepared dataset in local registry for lookup by export/list tools
|
|
544
|
+
try {
|
|
545
|
+
upsertRegistry(topDataset.id, installPath, "completed");
|
|
546
|
+
}
|
|
547
|
+
catch (e) {
|
|
548
|
+
console.error(`[Registry] Failed to write registry for ${topDataset.id}: ${e?.message || e}`);
|
|
549
|
+
}
|
|
465
550
|
return installPath;
|
|
466
551
|
}
|
|
467
552
|
/**
|
|
@@ -766,6 +851,14 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
766
851
|
required: ["dataset_id"],
|
|
767
852
|
},
|
|
768
853
|
},
|
|
854
|
+
{
|
|
855
|
+
name: "vesper_list_datasets",
|
|
856
|
+
description: "List local prepared datasets from the Vesper registry (dataset_id and local_path).",
|
|
857
|
+
inputSchema: {
|
|
858
|
+
type: "object",
|
|
859
|
+
properties: {},
|
|
860
|
+
},
|
|
861
|
+
},
|
|
769
862
|
{
|
|
770
863
|
name: "fuse_datasets",
|
|
771
864
|
description: "Fuse/combine multiple datasets via concat or join. Optionally runs quality & leakage checks afterward.",
|
|
@@ -875,6 +968,65 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
875
968
|
});
|
|
876
969
|
// Call Tool
|
|
877
970
|
server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
971
|
+
// --- Pipeline Enforcement ---
|
|
972
|
+
// Map tool names to pipeline steps
|
|
973
|
+
const toolToStep = {
|
|
974
|
+
vesper_search: "search",
|
|
975
|
+
vesper_download: "download",
|
|
976
|
+
vesper_analyze: "analyze",
|
|
977
|
+
vesper_clean: "clean",
|
|
978
|
+
vesper_split: "split",
|
|
979
|
+
vesper_export: "export",
|
|
980
|
+
prepare_dataset: "prepare",
|
|
981
|
+
};
|
|
982
|
+
// Extract dataset_id if present and normalize
|
|
983
|
+
let datasetId = request.params.arguments?.dataset_id || request.params.arguments?.query || "";
|
|
984
|
+
if (datasetId)
|
|
985
|
+
datasetId = parseDatasetId(String(datasetId));
|
|
986
|
+
// Pipeline rules
|
|
987
|
+
const stepOrder = ["search", "download", "analyze", "clean", "split", "export"];
|
|
988
|
+
const prereqs = {
|
|
989
|
+
vesper_download: ["search"],
|
|
990
|
+
vesper_analyze: ["download"],
|
|
991
|
+
vesper_clean: ["analyze"],
|
|
992
|
+
vesper_split: ["clean"],
|
|
993
|
+
vesper_export: ["split"],
|
|
994
|
+
};
|
|
995
|
+
const tool = String(request.params.name);
|
|
996
|
+
const step = toolToStep[tool];
|
|
997
|
+
if (step && datasetId) {
|
|
998
|
+
// Check prerequisites
|
|
999
|
+
const required = prereqs[tool] || [];
|
|
1000
|
+
for (const req of required) {
|
|
1001
|
+
if (!hasStep(String(datasetId), req)) {
|
|
1002
|
+
// Auto-run missing step if possible, else error
|
|
1003
|
+
// For export, auto-run prepare_dataset if split missing
|
|
1004
|
+
if (tool === "vesper_export" && req === "split") {
|
|
1005
|
+
// Auto-trigger prepare_dataset (start a background prepare job)
|
|
1006
|
+
try {
|
|
1007
|
+
jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false });
|
|
1008
|
+
// Mark split as complete so export can proceed; export handler will also wait for data if needed.
|
|
1009
|
+
markStepComplete(String(datasetId), "split");
|
|
1010
|
+
}
|
|
1011
|
+
catch (e) {
|
|
1012
|
+
console.error(`[Pipeline] Failed to auto-trigger prepare for ${datasetId}: ${e?.message || e}`);
|
|
1013
|
+
return {
|
|
1014
|
+
content: [{ type: "text", text: `ERROR: Failed to auto-run prepare for ${datasetId}. Please run prepare_dataset first.` }],
|
|
1015
|
+
isError: true,
|
|
1016
|
+
};
|
|
1017
|
+
}
|
|
1018
|
+
}
|
|
1019
|
+
else {
|
|
1020
|
+
return {
|
|
1021
|
+
content: [{ type: "text", text: `ERROR: Cannot run ${tool} before ${req}. Please run ${req} first.` }],
|
|
1022
|
+
isError: true,
|
|
1023
|
+
};
|
|
1024
|
+
}
|
|
1025
|
+
}
|
|
1026
|
+
}
|
|
1027
|
+
// Mark this step as complete
|
|
1028
|
+
markStepComplete(String(datasetId), String(step));
|
|
1029
|
+
}
|
|
878
1030
|
switch (request.params.name) {
|
|
879
1031
|
case "vesper_search": {
|
|
880
1032
|
const query = String(request.params.arguments?.query);
|
|
@@ -983,6 +1135,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
983
1135
|
}
|
|
984
1136
|
try {
|
|
985
1137
|
const localPath = await dataIngestor.ensureData(datasetId, source, () => undefined);
|
|
1138
|
+
try {
|
|
1139
|
+
upsertRegistry(datasetId, localPath, "completed");
|
|
1140
|
+
}
|
|
1141
|
+
catch (e) {
|
|
1142
|
+
console.error(`[Registry] Failed to write registry for ${datasetId}: ${e?.message || e}`);
|
|
1143
|
+
}
|
|
986
1144
|
return {
|
|
987
1145
|
content: [{ type: "text", text: `Download complete: ${localPath}` }]
|
|
988
1146
|
};
|
|
@@ -1302,15 +1460,53 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1302
1460
|
if (!dataset) {
|
|
1303
1461
|
throw new McpError(ErrorCode.InvalidParams, `Dataset not found: ${datasetId}`);
|
|
1304
1462
|
}
|
|
1305
|
-
// Use Metadata to find the actual local file
|
|
1463
|
+
// Use Metadata or Registry to find the actual local file
|
|
1464
|
+
let sourcePath = undefined;
|
|
1306
1465
|
const downloadStatus = metadataStore.getDownloadStatus(datasetId);
|
|
1307
|
-
if (
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
|
|
1466
|
+
if (downloadStatus && fs.existsSync(downloadStatus.local_path)) {
|
|
1467
|
+
sourcePath = downloadStatus.local_path;
|
|
1468
|
+
}
|
|
1469
|
+
else {
|
|
1470
|
+
// Fallback to local registry
|
|
1471
|
+
const reg = getRegistryEntry(datasetId);
|
|
1472
|
+
if (reg && fs.existsSync(reg.local_path)) {
|
|
1473
|
+
sourcePath = reg.local_path;
|
|
1474
|
+
}
|
|
1475
|
+
}
|
|
1476
|
+
if (!sourcePath) {
|
|
1477
|
+
console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
|
|
1478
|
+
// Start a prepare job for this dataset id (acts like calling prepare_dataset)
|
|
1479
|
+
try {
|
|
1480
|
+
jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false });
|
|
1481
|
+
}
|
|
1482
|
+
catch (e) {
|
|
1483
|
+
console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
|
|
1484
|
+
}
|
|
1485
|
+
// Poll for download status until local_path appears or timeout
|
|
1486
|
+
const wait = (ms) => new Promise(res => setTimeout(res, ms));
|
|
1487
|
+
const maxWait = 60_000; // 60s
|
|
1488
|
+
const interval = 2000;
|
|
1489
|
+
let waited = 0;
|
|
1490
|
+
while (waited < maxWait) {
|
|
1491
|
+
const ds = metadataStore.getDownloadStatus(datasetId);
|
|
1492
|
+
if (ds && ds.local_path && fs.existsSync(ds.local_path)) {
|
|
1493
|
+
sourcePath = ds.local_path;
|
|
1494
|
+
console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
|
|
1495
|
+
break;
|
|
1496
|
+
}
|
|
1497
|
+
await wait(interval);
|
|
1498
|
+
waited += interval;
|
|
1499
|
+
}
|
|
1500
|
+
// If still no sourcePath, return helpful error listing prepared datasets
|
|
1501
|
+
if (!sourcePath) {
|
|
1502
|
+
const entries = readRegistry();
|
|
1503
|
+
const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id}: ${e.local_path}`).join("\n");
|
|
1504
|
+
return {
|
|
1505
|
+
content: [{ type: "text", text: `ERROR: No local data found for ${datasetId} after attempting prepare. Check credentials and try running prepare_dataset manually. Prepared datasets:\n${listText}` }],
|
|
1506
|
+
isError: true
|
|
1507
|
+
};
|
|
1508
|
+
}
|
|
1312
1509
|
}
|
|
1313
|
-
let sourcePath = downloadStatus.local_path;
|
|
1314
1510
|
// If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
|
|
1315
1511
|
if (!fastMode) {
|
|
1316
1512
|
const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
|
|
@@ -1321,6 +1517,13 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1321
1517
|
const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
|
|
1322
1518
|
if (pipelineResult.final_output_path) {
|
|
1323
1519
|
sourcePath = pipelineResult.final_output_path;
|
|
1520
|
+
try {
|
|
1521
|
+
// Update registry to point to pipeline's final output
|
|
1522
|
+
upsertRegistry(datasetId, sourcePath, "completed");
|
|
1523
|
+
}
|
|
1524
|
+
catch (e) {
|
|
1525
|
+
console.error(`[Registry] Failed to update registry for ${datasetId}: ${e?.message || e}`);
|
|
1526
|
+
}
|
|
1324
1527
|
}
|
|
1325
1528
|
}
|
|
1326
1529
|
catch (err) {
|
|
@@ -1439,6 +1642,14 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1439
1642
|
});
|
|
1440
1643
|
const nullDelta = result.stats.null_delta;
|
|
1441
1644
|
const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
|
|
1645
|
+
// Register fused dataset under a generated id so users can export it easily
|
|
1646
|
+
const fusedId = `fused_${Date.now()}`;
|
|
1647
|
+
try {
|
|
1648
|
+
upsertRegistry(fusedId, result.output_path, "completed");
|
|
1649
|
+
}
|
|
1650
|
+
catch (e) {
|
|
1651
|
+
console.error(`[Registry] Failed to register fused dataset ${fusedId}: ${e?.message || e}`);
|
|
1652
|
+
}
|
|
1442
1653
|
let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
|
|
1443
1654
|
msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
|
|
1444
1655
|
msg += `- Null change: ${nullText}\n`;
|
|
@@ -1452,7 +1663,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1452
1663
|
}
|
|
1453
1664
|
msg += "\n";
|
|
1454
1665
|
}
|
|
1455
|
-
msg += `\nNext: run split_dataset/export_dataset on fused output
|
|
1666
|
+
msg += `\nNext: run split_dataset/export_dataset on fused output. Registered fused dataset id: ${fusedId}`;
|
|
1456
1667
|
return { content: [{ type: "text", text: msg }] };
|
|
1457
1668
|
}
|
|
1458
1669
|
catch (error) {
|
|
@@ -1865,9 +2076,21 @@ async function runDownloadCli(args) {
|
|
|
1865
2076
|
localPath = dl.local_path;
|
|
1866
2077
|
const size = fs.existsSync(localPath) ? fs.statSync(localPath).size : 0;
|
|
1867
2078
|
metadataStore.registerDownload(normalized, localPath, "completed", size);
|
|
2079
|
+
try {
|
|
2080
|
+
upsertRegistry(datasetId, localPath, "completed");
|
|
2081
|
+
}
|
|
2082
|
+
catch (e) {
|
|
2083
|
+
console.error(`[Registry] Failed to write registry for ${datasetId}: ${e?.message || e}`);
|
|
2084
|
+
}
|
|
1868
2085
|
}
|
|
1869
2086
|
else {
|
|
1870
2087
|
localPath = await dataIngestor.ensureData(datasetId, source, (msg) => console.log(msg));
|
|
2088
|
+
try {
|
|
2089
|
+
upsertRegistry(datasetId, localPath, "completed");
|
|
2090
|
+
}
|
|
2091
|
+
catch (e) {
|
|
2092
|
+
console.error(`[Registry] Failed to write registry for ${datasetId}: ${e?.message || e}`);
|
|
2093
|
+
}
|
|
1871
2094
|
}
|
|
1872
2095
|
}
|
|
1873
2096
|
catch (error) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vespermcp/mcp-server",
|
|
3
|
-
"version": "1.2.12",
|
|
3
|
+
"version": "1.2.13",
|
|
4
4
|
"description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "build/index.js",
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
// Pre-index local dataset files into a Vesper registry JSON file.
// Scans directories for tabular data files, records basic metadata
// (size, mtime, optional row/column counts), and can pad the registry
// with synthesized placeholder entries up to a --target count.
const fs = require('fs');
const path = require('path');
const os = require('os');

const { argv, cwd } = process;

// Print CLI usage help.
function usage() {
  console.log(`Usage: node scripts/preindex_registry.cjs [--scan dir1 dir2 ...] [--target N] [--out path] [--no-count]

Options:
--scan Directories to recursively scan for datasets (default: ./e2e_demo_output ./datasets)
--target Target total registry entries (if larger than scanned, will synthesize entries)
--out Output registry path (default: ~/.vesper/registry.json)
--no-count Skip expensive row counting for CSV/JSONL
`);
}

// Option defaults.
let scanDirs = []; // directories to scan (--scan)
let target = 0; // desired total entry count (--target); 0 = no padding
let outPath = path.join(os.homedir(), '.vesper', 'registry.json'); // --out
let doCount = true; // row/column counting on unless --no-count

// Minimal hand-rolled argument parser (argv[0]=node, argv[1]=script).
for (let i = 2; i < argv.length; i++) {
  const a = argv[i];
  if (a === '--scan') {
    // Consume every following token up to the next --flag as a directory.
    i++;
    while (i < argv.length && !argv[i].startsWith('--')) {
      scanDirs.push(argv[i]);
      i++;
    }
    i--; // step back so the for-loop's i++ lands on the next flag
  } else if (a === '--target') {
    // Falls back to 0 (no padding) when the value is missing/non-numeric.
    target = parseInt(argv[++i], 10) || 0;
  } else if (a === '--out') {
    outPath = path.resolve(argv[++i]);
  } else if (a === '--no-count') {
    doCount = false;
  } else if (a === '--help' || a === '-h') {
    usage();
    process.exit(0);
  } else {
    // Unknown flag: show usage and exit non-zero.
    console.error('Unknown arg', a);
    usage();
    process.exit(2);
  }
}

// Default scan roots, relative to the current working directory.
if (scanDirs.length === 0) scanDirs = [path.join(cwd(), 'e2e_demo_output'), path.join(cwd(), 'datasets')];
|
|
49
|
+
|
|
50
|
+
/**
 * Derive a lowercase, underscore-delimited id from an arbitrary string:
 * runs of non-alphanumeric characters collapse to "_", and leading or
 * trailing underscores are stripped.
 * @param {string} s
 * @returns {string}
 */
function normalizeId(s) {
  const collapsed = s.replace(/[^a-z0-9]+/gi, '_');
  const trimmed = collapsed.replace(/^_+|_+$/g, '');
  return trimmed.toLowerCase();
}
|
|
53
|
+
|
|
54
|
+
/**
 * Recursively collect files under `dir` whose extension is in `exts`.
 * Unreadable directories are skipped silently (best-effort scan).
 * @param {string} dir - Directory to scan.
 * @param {string[]} [exts] - Lowercase extensions to keep (with dot).
 * @returns {string[]} Full paths of matching files.
 */
function walk(dir, exts = ['.csv', '.jsonl', '.json', '.arrow', '.parquet', '.feather']) {
  const found = [];
  try {
    for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
      const full = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        found.push(...walk(full, exts));
      } else if (entry.isFile() && exts.includes(path.extname(entry.name).toLowerCase())) {
        found.push(full);
      }
    }
  } catch (e) {
    // Unreadable directory: return whatever was collected so far.
  }
  return found;
}
|
|
71
|
+
|
|
72
|
+
/**
 * Count newline characters in a file by streaming it in chunks.
 * Note this counts '\n' occurrences: the header line is included and a
 * final line without a trailing newline is not counted.
 * @param {string} filePath
 * @returns {Promise<number>} Resolves with the newline count; rejects on read error.
 */
function countCsvRows(filePath) {
  return new Promise((resolve, reject) => {
    let newlines = 0;
    fs.createReadStream(filePath, { encoding: 'utf8' })
      .on('data', (chunk) => {
        // Scan each chunk with indexOf instead of a per-char loop.
        let at = chunk.indexOf('\n');
        while (at !== -1) {
          newlines++;
          at = chunk.indexOf('\n', at + 1);
        }
      })
      .on('end', () => resolve(newlines))
      .on('error', reject);
  });
}
|
|
83
|
+
|
|
84
|
+
// Main: build/refresh the registry file, merging with any existing one.
(async function main() {
  // Ensure the registry's parent directory exists before writing.
  const registryDir = path.dirname(outPath);
  if (!fs.existsSync(registryDir)) fs.mkdirSync(registryDir, { recursive: true });

  // Load any existing registry so re-runs merge rather than clobber;
  // a corrupt file is treated as empty.
  let existing = [];
  if (fs.existsSync(outPath)) {
    try { existing = JSON.parse(fs.readFileSync(outPath, 'utf8')); } catch (e) { existing = []; }
  }
  // Keyed by normalized id; freshly scanned entries overwrite stale ones.
  const map = new Map();
  for (const e of existing) map.set(e.normalized_id || e.id, e);

  let scanned = 0;
  for (const dir of scanDirs) {
    const abs = path.resolve(dir);
    const files = walk(abs);
    for (const f of files) {
      const stats = fs.statSync(f);
      const base = path.basename(f, path.extname(f));
      // Id derives from the cwd-relative path (falls back to basename).
      const rel = path.relative(process.cwd(), f);
      const id = normalizeId(rel || base);
      let cols = null;
      let rows = null;
      if (doCount && (f.endsWith('.csv') || f.endsWith('.jsonl') || f.endsWith('.json'))) {
        try {
          if (f.endsWith('.csv')) {
            // NOTE(review): reads the whole file just to take the first line;
            // cols is a naive comma split (no quoted-field handling).
            const header = fs.readFileSync(f, { encoding: 'utf8', flag: 'r' }).split(/\r?\n/, 1)[0] || '';
            cols = header ? header.split(',').length : 0;
            // rows counts newline characters, so the header is included and
            // a final unterminated line is missed.
            rows = await countCsvRows(f);
          } else if (f.endsWith('.jsonl')) {
            rows = await countCsvRows(f);
          }
        } catch (e) {
          // ignore — counting is best-effort; the entry is still written
        }
      }
      const entry = {
        id: id,
        normalized_id: id,
        source: 'scanned',
        path: f,
        size: stats.size,
        mtime: stats.mtime.toISOString(),
        meta: { rows, cols }
      };
      map.set(id, entry);
      scanned++;
    }
  }

  // Synthesize if target requested
  if (target > map.size) {
    const synthCount = target - map.size;
    const synthDir = path.join(path.dirname(outPath), 'local_library');
    if (!fs.existsSync(synthDir)) fs.mkdirSync(synthDir, { recursive: true });
    for (let i = 1; i <= synthCount; i++) {
      // NOTE(review): map.size grows each iteration (map.set below), so idx
      // advances by 2 per pass and synth ids are non-contiguous — ids stay
      // unique and the total still reaches `target`, but confirm whether
      // sequential numbering was intended.
      const idx = map.size + i;
      const id = `synth_${String(idx).padStart(6, '0')}`;
      const entry = {
        id,
        normalized_id: id,
        source: 'synthesized',
        // Placeholder CSV path is recorded but never created on disk.
        path: path.join(synthDir, `${id}.csv`),
        size: 0,
        mtime: new Date().toISOString(),
        // Random row/col counts mark these as fake metadata.
        meta: { rows: Math.floor(Math.random() * 1000000), cols: Math.floor(Math.random() * 200) + 1 }
      };
      map.set(id, entry);
    }
  }

  const outArr = Array.from(map.values());
  fs.writeFileSync(outPath, JSON.stringify(outArr, null, 2), 'utf8');
  console.log(`Wrote ${outArr.length} registry entries to ${outPath} (${scanned} scanned, ${Math.max(0, outArr.length - scanned)} synthesized)`);
})();
|