@vespermcp/mcp-server 1.0.5 → 1.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/README.md +6 -4
  2. package/build/cleaning/cleaner.js +27 -2
  3. package/build/cleaning/executor.js +7 -6
  4. package/build/cleaning/planner.js +16 -4
  5. package/build/config/config-manager.js +215 -0
  6. package/build/export/exporter.js +26 -2
  7. package/build/index.js +273 -92
  8. package/build/ingestion/ingestor.js +5 -22
  9. package/build/install/install-service.js +1 -1
  10. package/build/jobs/manager.js +17 -10
  11. package/build/metadata/monitoring-service.js +2 -2
  12. package/build/metadata/scraper.js +8 -8
  13. package/build/metadata/store.js +17 -2
  14. package/build/monitoring/observability.js +2 -2
  15. package/build/preparation/target-detector.js +75 -0
  16. package/build/python/cleaner.py +226 -0
  17. package/build/python/export_engine.py +131 -0
  18. package/build/python/framework_adapters.py +100 -0
  19. package/build/python/github_adapter.py +106 -0
  20. package/build/python/image_engine.py +86 -0
  21. package/build/python/media_engine.py +133 -0
  22. package/build/python/nasa_adapter.py +82 -0
  23. package/build/python/quality_engine.py +243 -0
  24. package/build/python/splitter_engine.py +283 -0
  25. package/build/python/target_engine.py +154 -0
  26. package/build/python/test_framework_adapters.py +61 -0
  27. package/build/python/uci_adapter.py +94 -0
  28. package/build/python/worldbank_adapter.py +99 -0
  29. package/build/quality/analyzer.js +40 -4
  30. package/build/quality/image-analyzer.js +28 -2
  31. package/build/quality/media-analyzer.js +28 -2
  32. package/build/scripts/cleanup-kaggle.js +41 -0
  33. package/build/scripts/repro-bug.js +37 -0
  34. package/build/scripts/repro-export-bug.js +56 -0
  35. package/build/scripts/test-mcp-v5.js +12 -11
  36. package/build/scripts/test-production-sync.js +36 -0
  37. package/build/scripts/test-target-detector.js +29 -0
  38. package/build/scripts/test-write.js +14 -0
  39. package/build/scripts/verify-integration.js +57 -0
  40. package/build/scripts/verify-priority.js +33 -0
  41. package/build/search/engine.js +13 -2
  42. package/build/search/jit-orchestrator.js +6 -40
  43. package/build/search/vector-store.js +18 -0
  44. package/build/splitting/splitter.js +27 -2
  45. package/build/tools/formatter.js +15 -6
  46. package/build/utils/downloader.js +2 -2
  47. package/build/utils/selector.js +69 -0
  48. package/package.json +8 -4
  49. package/src/python/cleaner.py +33 -3
  50. package/src/python/export_engine.py +19 -0
  51. package/src/python/target_engine.py +154 -0
@@ -1,10 +1,35 @@
1
1
  import { spawn } from "child_process";
2
2
  import path from "path";
3
+ import fs from "fs";
3
4
  export class DataSplitter {
4
5
  pythonPath = "python";
5
6
  scriptPath;
6
- constructor(projectRoot = process.cwd()) {
7
- this.scriptPath = path.join(projectRoot, "src", "python", "splitter_engine.py");
7
+ constructor(buildDir = process.cwd()) {
8
+ const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
9
+ const dataRoot = path.join(homeDir, ".vesper");
10
+ const scriptPath0 = path.resolve(dataRoot, "python", "splitter_engine.py");
11
+ const scriptPath1 = path.resolve(buildDir, "python", "splitter_engine.py");
12
+ const scriptPath2 = path.resolve(buildDir, "..", "src", "python", "splitter_engine.py");
13
+ const scriptPath3 = path.resolve(buildDir, "..", "python", "splitter_engine.py");
14
+ if (fs.existsSync(scriptPath0)) {
15
+ this.scriptPath = scriptPath0;
16
+ }
17
+ else if (fs.existsSync(scriptPath1)) {
18
+ this.scriptPath = scriptPath1;
19
+ }
20
+ else if (fs.existsSync(scriptPath2)) {
21
+ this.scriptPath = scriptPath2;
22
+ }
23
+ else if (fs.existsSync(scriptPath3)) {
24
+ this.scriptPath = scriptPath3;
25
+ }
26
+ else {
27
+ this.scriptPath = scriptPath0;
28
+ }
29
+ // Detect Python command
30
+ if (process.platform === "win32") {
31
+ this.pythonPath = "py";
32
+ }
8
33
  }
9
34
  /**
10
35
  * Splits a dataset into Train/Val/Test sets based on config
@@ -47,8 +47,11 @@ export function formatSearchResults(results) {
47
47
  output += "═".repeat(80) + "\n\n";
48
48
  results.forEach((ds, index) => {
49
49
  const relevanceScore = ds.relevance_score || 0;
50
- // Source badge
51
- const sourceBadge = ds.source === "huggingface" ? "HuggingFace" : "Kaggle";
50
+ // Source badge and access level
51
+ const openSources = ["huggingface", "uci", "github", "worldbank", "nasa"];
52
+ const isOpen = openSources.includes(ds.source);
53
+ const sourceLabel = ds.source.charAt(0).toUpperCase() + ds.source.slice(1);
54
+ const accessBadge = isOpen ? "🔓 Open Access" : "🔒 Requires API Key";
52
55
  // Safety indicator
53
56
  let safetyIndicator = "";
54
57
  if (ds.license.category === "safe") {
@@ -62,8 +65,8 @@ export function formatSearchResults(results) {
62
65
  }
63
66
  // Header
64
67
  output += `${index + 1}. ${ds.name}\n`;
65
- output += ` ${sourceBadge} | ${safetyIndicator} | Relevance: ${(relevanceScore * 100).toFixed(0)}%\n`;
66
- output += ` ID: ${ds.id}\n\n`;
68
+ output += ` Source: ${sourceLabel} | ${accessBadge} | ${safetyIndicator}\n`;
69
+ output += ` Relevance: ${(relevanceScore * 100).toFixed(0)}% | ID: ${ds.id}\n\n`;
67
70
  // Description
68
71
  if (ds.description && ds.description.length > 0) {
69
72
  const shortDesc = ds.description.length > 200
@@ -122,7 +125,10 @@ export function formatDatasetInfo(ds) {
122
125
  output += `${ds.name}\n`;
123
126
  output += "═".repeat(80) + "\n\n";
124
127
  // Source and safety
125
- const sourceBadge = ds.source === "huggingface" ? "HuggingFace" : "Kaggle";
128
+ const openSources = ["huggingface", "uci", "github", "worldbank", "nasa"];
129
+ const isOpen = openSources.includes(ds.source);
130
+ const sourceLabel = ds.source.charAt(0).toUpperCase() + ds.source.slice(1);
131
+ const accessBadge = isOpen ? "🔓 Open Access" : "🔒 Requires API Key";
126
132
  let safetyIndicator = "";
127
133
  if (ds.license.category === "safe") {
128
134
  safetyIndicator = "Safe for use";
@@ -133,9 +139,12 @@ export function formatDatasetInfo(ds) {
133
139
  else {
134
140
  safetyIndicator = "Unknown license - Use with caution";
135
141
  }
136
- output += `Source: ${sourceBadge}\n`;
142
+ output += `Source: ${sourceLabel} (${accessBadge})\n`;
137
143
  output += `Safety: ${safetyIndicator}\n`;
138
144
  output += `ID: ${ds.id}\n\n`;
145
+ if (!isOpen && ds.source === "kaggle") {
146
+ output += `⚠️ NOTE: This dataset requires a Kaggle API key (KAGGLE_USERNAME/KAGGLE_KEY) to prepare.\n\n`;
147
+ }
139
148
  // Description
140
149
  if (ds.description) {
141
150
  output += "Description:\n";
@@ -15,13 +15,13 @@ export class RobustDownloader {
15
15
  startByte = fs.statSync(targetPath).size;
16
16
  if (startByte > 0) {
17
17
  headers["Range"] = `bytes=${startByte}-`;
18
- console.log(`[Downloader] Resuming from byte ${startByte}`);
18
+ console.error(`[Downloader] Resuming from byte ${startByte}`);
19
19
  }
20
20
  }
21
21
  const response = await fetch(url, { headers });
22
22
  if (response.status === 416) {
23
23
  // Requested range not satisfiable - likely already finished
24
- console.log("[Downloader] Range not satisfiable, file might be complete.");
24
+ console.error("[Downloader] Range not satisfiable, file might be complete.");
25
25
  return;
26
26
  }
27
27
  if (!response.ok && response.status !== 206) {
@@ -0,0 +1,69 @@
1
import readline from "readline";

/**
 * Interactive terminal multi-select menu.
 *
 * Renders a checkbox list; the user moves with the arrow keys, toggles
 * entries with Space, and confirms with Enter. Each option is an object
 * `{ name, value, selected }` — `name` is the printed label, `value` is
 * what is returned for a checked entry, and `selected` is the initial
 * checked state (mutated in place as the user toggles).
 */
export class Selector {
    currentIndex = 0; // row the cursor is currently on
    options;
    title;

    /**
     * @param {string} title - Heading printed above the option list.
     * @param {{name: string, value: *, selected: boolean}[]} options - Menu entries.
     */
    constructor(title, options) {
        this.title = title;
        this.options = options;
    }

    /**
     * Redraws the menu in place: clears the previously printed region
     * (options + title + hint line) then prints the current state.
     */
    render() {
        process.stdout.write("\x1b[?25l"); // Hide cursor while redrawing
        readline.cursorTo(process.stdout, 0);
        // Clear the lines we used before (options + title + blank line)
        for (let i = 0; i <= this.options.length + 1; i++) {
            readline.clearLine(process.stdout, 0);
            process.stdout.write("\x1b[1A"); // Move up one line
        }
        readline.clearLine(process.stdout, 0);
        console.log(`\n${this.title}`);
        this.options.forEach((opt, idx) => {
            const isCurrent = idx === this.currentIndex;
            const checkbox = opt.selected ? "[\x1b[32mX\x1b[0m]" : "[ ]";
            const cursor = isCurrent ? "\x1b[36m>\x1b[0m " : " ";
            const label = isCurrent ? `\x1b[36m${opt.name}\x1b[0m` : opt.name;
            console.log(`${cursor}${checkbox} ${label}`);
        });
        console.log("\x1b[2m(Use arrows to move, Space to toggle, Enter to confirm)\x1b[0m");
    }

    /**
     * Runs the interactive loop.
     *
     * Resolves immediately with `[]` when there are no options. Otherwise
     * resolves with the `value` of every checked option once the user
     * presses Enter.
     *
     * Fixes over the original: `setRawMode(false)` is now guarded by the
     * same TTY check used to enable raw mode (the original crashed on
     * Enter/Ctrl-C when stdin was piped), stdin is paused after resolving
     * so the process can exit cleanly, the cursor is restored on Ctrl-C,
     * and keypress events without key metadata are ignored.
     *
     * @returns {Promise<*[]>} values of the options checked on confirm
     */
    async run() {
        if (this.options.length === 0)
            return [];
        readline.emitKeypressEvents(process.stdin);
        const isTTY = Boolean(process.stdin.isTTY);
        if (isTTY) {
            process.stdin.setRawMode(true);
        }
        // Initial render room (print blank lines to be cleared)
        console.log("\n".repeat(this.options.length + 1));
        this.render();
        return new Promise((resolve) => {
            const handleKey = (str, key) => {
                if (!key) return; // some inputs emit keypress without key info
                if (key.name === "up") {
                    this.currentIndex = (this.currentIndex - 1 + this.options.length) % this.options.length;
                    this.render();
                }
                else if (key.name === "down") {
                    this.currentIndex = (this.currentIndex + 1) % this.options.length;
                    this.render();
                }
                else if (key.name === "space") {
                    this.options[this.currentIndex].selected = !this.options[this.currentIndex].selected;
                    this.render();
                }
                else if (key.name === "return") {
                    if (isTTY) {
                        process.stdin.setRawMode(false); // guarded: crashes on piped stdin otherwise
                    }
                    process.stdin.removeListener("keypress", handleKey);
                    process.stdin.pause(); // release stdin so the event loop can drain
                    process.stdout.write("\x1b[?25h"); // Show cursor
                    console.log("");
                    resolve(this.options.filter(o => o.selected).map(o => o.value));
                }
                else if (key.ctrl && key.name === "c") {
                    if (isTTY) {
                        process.stdin.setRawMode(false);
                    }
                    process.stdout.write("\x1b[?25h"); // restore cursor before exiting
                    process.exit();
                }
            };
            process.stdin.on("keypress", handleKey);
        });
    }
}
package/package.json CHANGED
@@ -1,11 +1,12 @@
1
1
  {
2
2
  "name": "@vespermcp/mcp-server",
3
- "version": "1.0.5",
3
+ "version": "1.0.8",
4
4
  "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
5
5
  "type": "module",
6
6
  "main": "build/index.js",
7
7
  "bin": {
8
- "vesper": "./build/index.js"
8
+ "vespermcp": "./build/index.js",
9
+ "@vespermcp/mcp-server": "./build/index.js"
9
10
  },
10
11
  "files": [
11
12
  "build/**/*",
@@ -16,7 +17,7 @@
16
17
  "mcp-config-template.json"
17
18
  ],
18
19
  "scripts": {
19
- "build": "tsc",
20
+ "build": "tsc && node -e \"const fs=require('fs');const path=require('path');const src='src/python';const dest='build/python';if(!fs.existsSync(dest))fs.mkdirSync(dest,{recursive:true});fs.readdirSync(src).forEach(f=>{if(f.endsWith('.py'))fs.copyFileSync(path.join(src,f),path.join(dest,f));});console.log('✅ Copied Python scripts to build/python');\"",
20
21
  "dev": "tsx watch src/index.ts",
21
22
  "postinstall": "node scripts/postinstall.cjs",
22
23
  "scrape": "tsx src/scripts/scrape-metadata.ts",
@@ -26,7 +27,10 @@
26
27
  "check-db": "tsx src/scripts/check-db.ts",
27
28
  "test-jit": "tsx src/scripts/test-jit.ts",
28
29
  "demo-ui": "tsx src/scripts/demo-ui.ts",
29
- "test": "vitest"
30
+ "setup": "node build/index.js --setup",
31
+ "setup:silent": "node build/index.js --setup --silent",
32
+ "test": "vitest",
33
+ "start": "node build/index.js"
30
34
  },
31
35
  "keywords": [
32
36
  "mcp",
@@ -152,7 +152,15 @@ def main():
152
152
  op_type = op["type"]
153
153
  params = op.get("params", {})
154
154
 
155
- if op_type in OPERATIONS:
155
+ if op_type == "RenameTarget":
156
+ old_name = params.get("old_name")
157
+ new_name = params.get("new_name", "target")
158
+ if old_name and old_name in df.columns:
159
+ df = df.rename({old_name: new_name})
160
+ logs.append(f"Renamed column '{old_name}' to '{new_name}'")
161
+ else:
162
+ logs.append(f"Failed RenameTarget: Column '{old_name}' not found or not specified.")
163
+ elif op_type in OPERATIONS:
156
164
  try:
157
165
  df, stats = OPERATIONS[op_type](df, params)
158
166
  logs.append(f"Executed {op_type}: {stats}")
@@ -176,6 +184,28 @@ def main():
176
184
  base_name = file_path.rsplit(".", 1)[0]
177
185
  if output_format == "csv":
178
186
  output_path = f"{base_name}_cleaned.csv"
187
+ # Stringify ANY column that might not be CSV-safe (List, Struct, Object, etc.)
188
+ for col in df.columns:
189
+ dtype = df.schema[col]
190
+ # Only keep simple types; stringify everything else for CSV
191
+ is_simple = (
192
+ dtype.is_numeric() or
193
+ dtype.is_temporal() or
194
+ str(dtype).lower() in ["string", "utf8", "boolean", "bool"]
195
+ )
196
+ if not is_simple:
197
+ # Use a robust helper for clean JSON serialization
198
+ def safe_serialize(val):
199
+ try:
200
+ # Handle Polars nested types (convert to Python list/dict first)
201
+ if hasattr(val, "to_list"):
202
+ return json.dumps(val.to_list())
203
+ if hasattr(val, "to_dict"):
204
+ return json.dumps(val.to_dict())
205
+ return json.dumps(val)
206
+ except:
207
+ return str(val)
208
+ df = df.with_columns(pl.col(col).map_elements(safe_serialize, return_dtype=pl.Utf8))
179
209
  df.write_csv(output_path)
180
210
  else:
181
211
  output_path = f"{base_name}_cleaned.parquet"
@@ -186,10 +216,10 @@ def main():
186
216
  "output_path": output_path,
187
217
  "rows_affected": total_rows_affected,
188
218
  "logs": logs
189
- }))
219
+ }, default=str))
190
220
 
191
221
  except Exception as e:
192
- print(json.dumps({"success": False, "error": str(e)}))
222
+ print(json.dumps({"success": False, "error": str(e)}, default=str))
193
223
  sys.exit(1)
194
224
 
195
225
  if __name__ == "__main__":
@@ -31,6 +31,25 @@ def export_data(file_path, output_path, format, options=None):
31
31
  try:
32
32
  # Export Logic
33
33
  if format == "csv":
34
+ # Stringify ANY column that might not be CSV-safe (List, Struct, Object, etc.)
35
+ for col in df.columns:
36
+ dtype = df.schema[col]
37
+ is_simple = (
38
+ dtype.is_numeric() or
39
+ dtype.is_temporal() or
40
+ str(dtype).lower() in ["string", "utf8", "boolean", "bool"]
41
+ )
42
+ if not is_simple:
43
+ def safe_serialize(val):
44
+ try:
45
+ if hasattr(val, "to_list"):
46
+ return json.dumps(val.to_list())
47
+ if hasattr(val, "to_dict"):
48
+ return json.dumps(val.to_dict())
49
+ return json.dumps(val)
50
+ except:
51
+ return str(val)
52
+ df = df.with_columns(pl.col(col).map_elements(safe_serialize, return_dtype=pl.Utf8))
34
53
  df.write_csv(output_path)
35
54
 
36
55
  elif format == "parquet":
@@ -0,0 +1,154 @@
1
+ import sys
2
+ import json
3
+ import pandas as pd
4
+ import numpy as np
5
+
6
+ # Common names for target variables in datasets
7
+ TARGET_CANDIDATES = [
8
+ 'target', 'label', 'class', 'outcome', 'y',
9
+ 'price', 'saleprice', 'sales', 'cost', 'value', 'total',
10
+ 'diagnosis', 'species', 'churn', 'survived', 'credit_risk'
11
+ ]
12
+
13
def load_data(file_path):
    """Load a dataset from disk into a pandas DataFrame.

    Supports CSV and Parquet files, selected by file extension.
    The extension check is case-insensitive (accepts ``.CSV`` /
    ``.Parquet`` etc.), generalizing the original case-sensitive match.

    Args:
        file_path: Path to the dataset file.

    Returns:
        pandas.DataFrame with the file's contents.

    Raises:
        ValueError: If the extension is neither ``.csv`` nor ``.parquet``.
    """
    lowered = file_path.lower()
    if lowered.endswith('.csv'):
        return pd.read_csv(file_path)
    elif lowered.endswith('.parquet'):
        return pd.read_parquet(file_path)
    else:
        raise ValueError("Unsupported file format")
20
+
21
def detect_target(file_path):
    """Heuristically identify the most likely target (label) column.

    Every column is scored on four signals: name match against
    TARGET_CANDIDATES, position (the last column is often the target),
    completeness (missing values penalized), and cardinality (all-unique
    columns are penalized as probable IDs). Columns scoring above 0.3 are
    reported as candidates, sorted by confidence.

    Args:
        file_path: Path to a CSV/Parquet dataset readable by load_data().

    Returns:
        dict with keys ``target_column`` (best candidate or None),
        ``confidence``, ``candidates`` (list of scored columns), and
        ``is_unified`` (always False; the wrapper handles unification).
        On failure returns ``{"error": <message>}``.
    """
    try:
        df = load_data(file_path)
        candidates = []

        for col_original in df.columns:
            col_lower = col_original.lower()
            confidence = 0.0
            reasons = []

            # 1. Exact name match against the known target-name list
            if col_lower in TARGET_CANDIDATES:
                confidence += 0.6
                reasons.append(f"Matches common target name '{col_lower}'")

            # Boost if exact match 'target', 'label' or 'class'
            if col_lower in ['target', 'label', 'class']:
                confidence += 0.2

            # 2. Position heuristic (last column is often the target)
            if col_original == df.columns[-1]:
                confidence += 0.3
                reasons.append("Is the last column")

            # 3. Completeness: penalize columns with missing data
            missing_rate = df[col_original].isnull().mean()
            if missing_rate > 0.5:
                confidence -= 0.5
                reasons.append(f"High missing rate ({missing_rate:.1%})")
            elif missing_rate > 0:
                confidence -= 0.1
                reasons.append(f"Has missing values ({missing_rate:.1%})")

            # 4. Cardinality: an all-unique column is almost certainly an ID,
            # not a target, so penalize heavily.
            n_unique = df[col_original].nunique()
            if n_unique == len(df):
                confidence -= 0.8
                reasons.append("All values are unique (likely ID)")

            if confidence > 0.3:
                candidates.append({
                    "column": col_original,
                    "confidence": min(confidence, 1.0),
                    "reason": reasons
                })

        # Sort by confidence, best first
        candidates.sort(key=lambda x: x['confidence'], reverse=True)

        best_target = None
        best_conf = 0.0
        if candidates:
            best_target = candidates[0]['column']
            best_conf = candidates[0]['confidence']

        return {
            "target_column": best_target,
            "confidence": best_conf,
            "candidates": candidates,
            "is_unified": False  # Wrapper will handle unification logic
        }

    except Exception as e:
        return {"error": str(e)}
89
+
90
def validate_target(file_path, target_column):
    """Check that a chosen target column exists and characterize it.

    Infers the problem type (regression vs. classification) from the
    column's dtype and cardinality, counts missing values, and flags a
    dominant class (>90%) for classification targets.

    Args:
        file_path: Path to a CSV/Parquet dataset readable by load_data().
        target_column: Name of the column to validate.

    Returns:
        dict with ``valid``, ``problem_type``, ``missing_count``,
        ``total_rows`` and ``warnings`` on success, or ``{"error": ...}``
        when the column is absent or loading fails.
    """
    try:
        df = load_data(file_path)
        if target_column not in df.columns:
            return {"error": f"Column '{target_column}' not found in dataset."}

        series = df[target_column]
        total_rows = len(df)
        missing_count = series.isnull().sum()
        numeric = pd.api.types.is_numeric_dtype(series)
        distinct = series.nunique()

        # Problem-type heuristic: many numeric levels → regression;
        # few levels → classification; otherwise fall back on dtype.
        if numeric and distinct > 20:
            problem_type = "regression"
        elif distinct < 50:
            problem_type = "classification"
        else:
            problem_type = "regression" if numeric else "classification"

        warnings = []
        if missing_count > 0:
            warnings.append(f"Target has {missing_count} missing values.")

        if problem_type == "classification":
            # Warn when a single class dominates (>90% of rows).
            shares = series.value_counts(normalize=True)
            if shares.iloc[0] > 0.9:
                warnings.append(f"Highly imbalanced target: Class '{shares.index[0]}' is {shares.iloc[0]:.1%}")

        return {
            "valid": True,
            "problem_type": problem_type,
            "missing_count": int(missing_count),
            "total_rows": total_rows,
            "warnings": warnings
        }

    except Exception as e:
        return {"error": str(e)}
133
+
134
if __name__ == "__main__":
    # CLI entry point: target_engine.py <action> <file_path> [target_column]
    # Emits a single JSON object on stdout for the Node wrapper to parse.
    if len(sys.argv) < 3:
        # Fixed: removed the stray C-style semicolon after print(...).
        print(json.dumps({"error": "Usage: target_engine.py <action> <file_path> [args]"}))
        sys.exit(1)

    action = sys.argv[1]
    file_path = sys.argv[2]

    result = {}
    if action == "detect":
        result = detect_target(file_path)
    elif action == "validate":
        # "validate" additionally requires the target column name.
        target_col = sys.argv[3] if len(sys.argv) > 3 else None
        if target_col:
            result = validate_target(file_path, target_col)
        else:
            result = {"error": "Target column required for validation"}
    else:
        result = {"error": f"Unknown action: {action}"}

    print(json.dumps(result))