@vespermcp/mcp-server 1.0.5 → 1.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/README.md +6 -4
  2. package/build/cleaning/cleaner.js +27 -2
  3. package/build/cleaning/executor.js +7 -6
  4. package/build/cleaning/planner.js +16 -4
  5. package/build/config/config-manager.js +215 -0
  6. package/build/export/exporter.js +26 -2
  7. package/build/index.js +273 -92
  8. package/build/ingestion/ingestor.js +5 -22
  9. package/build/install/install-service.js +1 -1
  10. package/build/jobs/manager.js +17 -10
  11. package/build/metadata/monitoring-service.js +2 -2
  12. package/build/metadata/scraper.js +8 -8
  13. package/build/metadata/store.js +17 -2
  14. package/build/monitoring/observability.js +2 -2
  15. package/build/preparation/target-detector.js +75 -0
  16. package/build/python/cleaner.py +226 -0
  17. package/build/python/export_engine.py +131 -0
  18. package/build/python/framework_adapters.py +100 -0
  19. package/build/python/github_adapter.py +106 -0
  20. package/build/python/image_engine.py +86 -0
  21. package/build/python/media_engine.py +133 -0
  22. package/build/python/nasa_adapter.py +82 -0
  23. package/build/python/quality_engine.py +243 -0
  24. package/build/python/splitter_engine.py +283 -0
  25. package/build/python/target_engine.py +154 -0
  26. package/build/python/test_framework_adapters.py +61 -0
  27. package/build/python/uci_adapter.py +94 -0
  28. package/build/python/worldbank_adapter.py +99 -0
  29. package/build/quality/analyzer.js +40 -4
  30. package/build/quality/image-analyzer.js +28 -2
  31. package/build/quality/media-analyzer.js +28 -2
  32. package/build/scripts/cleanup-kaggle.js +41 -0
  33. package/build/scripts/repro-bug.js +37 -0
  34. package/build/scripts/repro-export-bug.js +56 -0
  35. package/build/scripts/test-mcp-v5.js +12 -11
  36. package/build/scripts/test-production-sync.js +36 -0
  37. package/build/scripts/test-target-detector.js +29 -0
  38. package/build/scripts/test-write.js +14 -0
  39. package/build/scripts/verify-integration.js +57 -0
  40. package/build/scripts/verify-priority.js +33 -0
  41. package/build/search/engine.js +13 -2
  42. package/build/search/jit-orchestrator.js +6 -40
  43. package/build/search/vector-store.js +18 -0
  44. package/build/splitting/splitter.js +27 -2
  45. package/build/tools/formatter.js +15 -6
  46. package/build/utils/downloader.js +2 -2
  47. package/build/utils/selector.js +69 -0
  48. package/package.json +8 -4
  49. package/src/python/cleaner.py +33 -3
  50. package/src/python/export_engine.py +19 -0
  51. package/src/python/target_engine.py +154 -0
@@ -1,10 +1,35 @@
1
1
  import { spawn } from "child_process";
2
2
  import path from "path";
3
+ import fs from "fs";
3
4
  export class DataSplitter {
4
5
  pythonPath = "python";
5
6
  scriptPath;
6
- constructor(projectRoot = process.cwd()) {
7
- this.scriptPath = path.join(projectRoot, "src", "python", "splitter_engine.py");
7
+ constructor(buildDir = process.cwd()) {
8
+ const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
9
+ const dataRoot = path.join(homeDir, ".vesper");
10
+ const scriptPath0 = path.resolve(dataRoot, "python", "splitter_engine.py");
11
+ const scriptPath1 = path.resolve(buildDir, "python", "splitter_engine.py");
12
+ const scriptPath2 = path.resolve(buildDir, "..", "src", "python", "splitter_engine.py");
13
+ const scriptPath3 = path.resolve(buildDir, "..", "python", "splitter_engine.py");
14
+ if (fs.existsSync(scriptPath0)) {
15
+ this.scriptPath = scriptPath0;
16
+ }
17
+ else if (fs.existsSync(scriptPath1)) {
18
+ this.scriptPath = scriptPath1;
19
+ }
20
+ else if (fs.existsSync(scriptPath2)) {
21
+ this.scriptPath = scriptPath2;
22
+ }
23
+ else if (fs.existsSync(scriptPath3)) {
24
+ this.scriptPath = scriptPath3;
25
+ }
26
+ else {
27
+ this.scriptPath = scriptPath0;
28
+ }
29
+ // Detect Python command
30
+ if (process.platform === "win32") {
31
+ this.pythonPath = "py";
32
+ }
8
33
  }
9
34
  /**
10
35
  * Splits a dataset into Train/Val/Test sets based on config
@@ -47,8 +47,11 @@ export function formatSearchResults(results) {
47
47
  output += "═".repeat(80) + "\n\n";
48
48
  results.forEach((ds, index) => {
49
49
  const relevanceScore = ds.relevance_score || 0;
50
- // Source badge
51
- const sourceBadge = ds.source === "huggingface" ? "HuggingFace" : "Kaggle";
50
+ // Source badge and access level
51
+ const openSources = ["huggingface", "uci", "github", "worldbank", "nasa"];
52
+ const isOpen = openSources.includes(ds.source);
53
+ const sourceLabel = ds.source.charAt(0).toUpperCase() + ds.source.slice(1);
54
+ const accessBadge = isOpen ? "🔓 Open Access" : "🔒 Requires API Key";
52
55
  // Safety indicator
53
56
  let safetyIndicator = "";
54
57
  if (ds.license.category === "safe") {
@@ -62,8 +65,8 @@ export function formatSearchResults(results) {
62
65
  }
63
66
  // Header
64
67
  output += `${index + 1}. ${ds.name}\n`;
65
- output += ` ${sourceBadge} | ${safetyIndicator} | Relevance: ${(relevanceScore * 100).toFixed(0)}%\n`;
66
- output += ` ID: ${ds.id}\n\n`;
68
+ output += ` Source: ${sourceLabel} | ${accessBadge} | ${safetyIndicator}\n`;
69
+ output += ` Relevance: ${(relevanceScore * 100).toFixed(0)}% | ID: ${ds.id}\n\n`;
67
70
  // Description
68
71
  if (ds.description && ds.description.length > 0) {
69
72
  const shortDesc = ds.description.length > 200
@@ -122,7 +125,10 @@ export function formatDatasetInfo(ds) {
122
125
  output += `${ds.name}\n`;
123
126
  output += "═".repeat(80) + "\n\n";
124
127
  // Source and safety
125
- const sourceBadge = ds.source === "huggingface" ? "HuggingFace" : "Kaggle";
128
+ const openSources = ["huggingface", "uci", "github", "worldbank", "nasa"];
129
+ const isOpen = openSources.includes(ds.source);
130
+ const sourceLabel = ds.source.charAt(0).toUpperCase() + ds.source.slice(1);
131
+ const accessBadge = isOpen ? "🔓 Open Access" : "🔒 Requires API Key";
126
132
  let safetyIndicator = "";
127
133
  if (ds.license.category === "safe") {
128
134
  safetyIndicator = "Safe for use";
@@ -133,9 +139,12 @@ export function formatDatasetInfo(ds) {
133
139
  else {
134
140
  safetyIndicator = "Unknown license - Use with caution";
135
141
  }
136
- output += `Source: ${sourceBadge}\n`;
142
+ output += `Source: ${sourceLabel} (${accessBadge})\n`;
137
143
  output += `Safety: ${safetyIndicator}\n`;
138
144
  output += `ID: ${ds.id}\n\n`;
145
+ if (!isOpen && ds.source === "kaggle") {
146
+ output += `⚠️ NOTE: This dataset requires a Kaggle API key (KAGGLE_USERNAME/KAGGLE_KEY) to prepare.\n\n`;
147
+ }
139
148
  // Description
140
149
  if (ds.description) {
141
150
  output += "Description:\n";
@@ -15,13 +15,13 @@ export class RobustDownloader {
15
15
  startByte = fs.statSync(targetPath).size;
16
16
  if (startByte > 0) {
17
17
  headers["Range"] = `bytes=${startByte}-`;
18
- console.log(`[Downloader] Resuming from byte ${startByte}`);
18
+ console.error(`[Downloader] Resuming from byte ${startByte}`);
19
19
  }
20
20
  }
21
21
  const response = await fetch(url, { headers });
22
22
  if (response.status === 416) {
23
23
  // Requested range not satisfiable - likely already finished
24
- console.log("[Downloader] Range not satisfiable, file might be complete.");
24
+ console.error("[Downloader] Range not satisfiable, file might be complete.");
25
25
  return;
26
26
  }
27
27
  if (!response.ok && response.status !== 206) {
@@ -0,0 +1,69 @@
1
import readline from "readline";

/**
 * Interactive terminal multi-select menu.
 *
 * Renders a checkbox list; the user moves with the arrow keys, toggles
 * entries with Space, and confirms with Enter. Each option is an object
 * `{ name, value, selected }` — `name` is the printed label, `value` is
 * what is returned for a checked entry, and `selected` is the initial
 * checked state (mutated in place as the user toggles).
 */
export class Selector {
    currentIndex = 0; // row the cursor is currently on
    options;
    title;

    /**
     * @param {string} title - Heading printed above the option list.
     * @param {{name: string, value: *, selected: boolean}[]} options - Menu entries.
     */
    constructor(title, options) {
        this.title = title;
        this.options = options;
    }

    /**
     * Redraws the menu in place: clears the previously printed region
     * (options + title + hint line) then prints the current state.
     */
    render() {
        process.stdout.write("\x1b[?25l"); // Hide cursor while redrawing
        readline.cursorTo(process.stdout, 0);
        // Clear the lines we used before (options + title + blank line)
        for (let i = 0; i <= this.options.length + 1; i++) {
            readline.clearLine(process.stdout, 0);
            process.stdout.write("\x1b[1A"); // Move up one line
        }
        readline.clearLine(process.stdout, 0);
        console.log(`\n${this.title}`);
        this.options.forEach((opt, idx) => {
            const isCurrent = idx === this.currentIndex;
            const checkbox = opt.selected ? "[\x1b[32mX\x1b[0m]" : "[ ]";
            const cursor = isCurrent ? "\x1b[36m>\x1b[0m " : " ";
            const label = isCurrent ? `\x1b[36m${opt.name}\x1b[0m` : opt.name;
            console.log(`${cursor}${checkbox} ${label}`);
        });
        console.log("\x1b[2m(Use arrows to move, Space to toggle, Enter to confirm)\x1b[0m");
    }

    /**
     * Runs the interactive loop.
     *
     * Resolves immediately with `[]` when there are no options. Otherwise
     * resolves with the `value` of every checked option once the user
     * presses Enter.
     *
     * Fixes over the original: `setRawMode(false)` is now guarded by the
     * same TTY check used to enable raw mode (the original crashed on
     * Enter/Ctrl-C when stdin was piped), stdin is paused after resolving
     * so the process can exit cleanly, the cursor is restored on Ctrl-C,
     * and keypress events without key metadata are ignored.
     *
     * @returns {Promise<*[]>} values of the options checked on confirm
     */
    async run() {
        if (this.options.length === 0)
            return [];
        readline.emitKeypressEvents(process.stdin);
        const isTTY = Boolean(process.stdin.isTTY);
        if (isTTY) {
            process.stdin.setRawMode(true);
        }
        // Initial render room (print blank lines to be cleared)
        console.log("\n".repeat(this.options.length + 1));
        this.render();
        return new Promise((resolve) => {
            const handleKey = (str, key) => {
                if (!key) return; // some inputs emit keypress without key info
                if (key.name === "up") {
                    this.currentIndex = (this.currentIndex - 1 + this.options.length) % this.options.length;
                    this.render();
                }
                else if (key.name === "down") {
                    this.currentIndex = (this.currentIndex + 1) % this.options.length;
                    this.render();
                }
                else if (key.name === "space") {
                    this.options[this.currentIndex].selected = !this.options[this.currentIndex].selected;
                    this.render();
                }
                else if (key.name === "return") {
                    if (isTTY) {
                        process.stdin.setRawMode(false); // guarded: crashes on piped stdin otherwise
                    }
                    process.stdin.removeListener("keypress", handleKey);
                    process.stdin.pause(); // release stdin so the event loop can drain
                    process.stdout.write("\x1b[?25h"); // Show cursor
                    console.log("");
                    resolve(this.options.filter(o => o.selected).map(o => o.value));
                }
                else if (key.ctrl && key.name === "c") {
                    if (isTTY) {
                        process.stdin.setRawMode(false);
                    }
                    process.stdout.write("\x1b[?25h"); // restore cursor before exiting
                    process.exit();
                }
            };
            process.stdin.on("keypress", handleKey);
        });
    }
}
package/package.json CHANGED
@@ -1,11 +1,12 @@
1
1
  {
2
2
  "name": "@vespermcp/mcp-server",
3
- "version": "1.0.5",
3
+ "version": "1.0.8",
4
4
  "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
5
5
  "type": "module",
6
6
  "main": "build/index.js",
7
7
  "bin": {
8
- "vesper": "./build/index.js"
8
+ "vespermcp": "./build/index.js",
9
+ "@vespermcp/mcp-server": "./build/index.js"
9
10
  },
10
11
  "files": [
11
12
  "build/**/*",
@@ -16,7 +17,7 @@
16
17
  "mcp-config-template.json"
17
18
  ],
18
19
  "scripts": {
19
- "build": "tsc",
20
+ "build": "tsc && node -e \"const fs=require('fs');const path=require('path');const src='src/python';const dest='build/python';if(!fs.existsSync(dest))fs.mkdirSync(dest,{recursive:true});fs.readdirSync(src).forEach(f=>{if(f.endsWith('.py'))fs.copyFileSync(path.join(src,f),path.join(dest,f));});console.log('✅ Copied Python scripts to build/python');\"",
20
21
  "dev": "tsx watch src/index.ts",
21
22
  "postinstall": "node scripts/postinstall.cjs",
22
23
  "scrape": "tsx src/scripts/scrape-metadata.ts",
@@ -26,7 +27,10 @@
26
27
  "check-db": "tsx src/scripts/check-db.ts",
27
28
  "test-jit": "tsx src/scripts/test-jit.ts",
28
29
  "demo-ui": "tsx src/scripts/demo-ui.ts",
29
- "test": "vitest"
30
+ "setup": "node build/index.js --setup",
31
+ "setup:silent": "node build/index.js --setup --silent",
32
+ "test": "vitest",
33
+ "start": "node build/index.js"
30
34
  },
31
35
  "keywords": [
32
36
  "mcp",
@@ -152,7 +152,15 @@ def main():
152
152
  op_type = op["type"]
153
153
  params = op.get("params", {})
154
154
 
155
- if op_type in OPERATIONS:
155
+ if op_type == "RenameTarget":
156
+ old_name = params.get("old_name")
157
+ new_name = params.get("new_name", "target")
158
+ if old_name and old_name in df.columns:
159
+ df = df.rename({old_name: new_name})
160
+ logs.append(f"Renamed column '{old_name}' to '{new_name}'")
161
+ else:
162
+ logs.append(f"Failed RenameTarget: Column '{old_name}' not found or not specified.")
163
+ elif op_type in OPERATIONS:
156
164
  try:
157
165
  df, stats = OPERATIONS[op_type](df, params)
158
166
  logs.append(f"Executed {op_type}: {stats}")
@@ -176,6 +184,28 @@ def main():
176
184
  base_name = file_path.rsplit(".", 1)[0]
177
185
  if output_format == "csv":
178
186
  output_path = f"{base_name}_cleaned.csv"
187
+ # Stringify ANY column that might not be CSV-safe (List, Struct, Object, etc.)
188
+ for col in df.columns:
189
+ dtype = df.schema[col]
190
+ # Only keep simple types; stringify everything else for CSV
191
+ is_simple = (
192
+ dtype.is_numeric() or
193
+ dtype.is_temporal() or
194
+ str(dtype).lower() in ["string", "utf8", "boolean", "bool"]
195
+ )
196
+ if not is_simple:
197
+ # Use a robust helper for clean JSON serialization
198
+ def safe_serialize(val):
199
+ try:
200
+ # Handle Polars nested types (convert to Python list/dict first)
201
+ if hasattr(val, "to_list"):
202
+ return json.dumps(val.to_list())
203
+ if hasattr(val, "to_dict"):
204
+ return json.dumps(val.to_dict())
205
+ return json.dumps(val)
206
+ except:
207
+ return str(val)
208
+ df = df.with_columns(pl.col(col).map_elements(safe_serialize, return_dtype=pl.Utf8))
179
209
  df.write_csv(output_path)
180
210
  else:
181
211
  output_path = f"{base_name}_cleaned.parquet"
@@ -186,10 +216,10 @@ def main():
186
216
  "output_path": output_path,
187
217
  "rows_affected": total_rows_affected,
188
218
  "logs": logs
189
- }))
219
+ }, default=str))
190
220
 
191
221
  except Exception as e:
192
- print(json.dumps({"success": False, "error": str(e)}))
222
+ print(json.dumps({"success": False, "error": str(e)}, default=str))
193
223
  sys.exit(1)
194
224
 
195
225
  if __name__ == "__main__":
@@ -31,6 +31,25 @@ def export_data(file_path, output_path, format, options=None):
31
31
  try:
32
32
  # Export Logic
33
33
  if format == "csv":
34
+ # Stringify ANY column that might not be CSV-safe (List, Struct, Object, etc.)
35
+ for col in df.columns:
36
+ dtype = df.schema[col]
37
+ is_simple = (
38
+ dtype.is_numeric() or
39
+ dtype.is_temporal() or
40
+ str(dtype).lower() in ["string", "utf8", "boolean", "bool"]
41
+ )
42
+ if not is_simple:
43
+ def safe_serialize(val):
44
+ try:
45
+ if hasattr(val, "to_list"):
46
+ return json.dumps(val.to_list())
47
+ if hasattr(val, "to_dict"):
48
+ return json.dumps(val.to_dict())
49
+ return json.dumps(val)
50
+ except:
51
+ return str(val)
52
+ df = df.with_columns(pl.col(col).map_elements(safe_serialize, return_dtype=pl.Utf8))
34
53
  df.write_csv(output_path)
35
54
 
36
55
  elif format == "parquet":
@@ -0,0 +1,154 @@
1
+ import sys
2
+ import json
3
+ import pandas as pd
4
+ import numpy as np
5
+
6
+ # Common names for target variables in datasets
7
+ TARGET_CANDIDATES = [
8
+ 'target', 'label', 'class', 'outcome', 'y',
9
+ 'price', 'saleprice', 'sales', 'cost', 'value', 'total',
10
+ 'diagnosis', 'species', 'churn', 'survived', 'credit_risk'
11
+ ]
12
+
13
def load_data(file_path):
    """Load a dataset from disk into a pandas DataFrame.

    Supports CSV and Parquet files, selected by file extension.
    The extension check is case-insensitive (accepts ``.CSV`` /
    ``.Parquet`` etc.), generalizing the original case-sensitive match.

    Args:
        file_path: Path to the dataset file.

    Returns:
        pandas.DataFrame with the file's contents.

    Raises:
        ValueError: If the extension is neither ``.csv`` nor ``.parquet``.
    """
    lowered = file_path.lower()
    if lowered.endswith('.csv'):
        return pd.read_csv(file_path)
    elif lowered.endswith('.parquet'):
        return pd.read_parquet(file_path)
    else:
        raise ValueError("Unsupported file format")
20
+
21
def detect_target(file_path):
    """Heuristically identify the most likely target (label) column.

    Every column is scored on four signals: name match against
    TARGET_CANDIDATES, position (the last column is often the target),
    completeness (missing values penalized), and cardinality (all-unique
    columns are penalized as probable IDs). Columns scoring above 0.3 are
    reported as candidates, sorted by confidence.

    Args:
        file_path: Path to a CSV/Parquet dataset readable by load_data().

    Returns:
        dict with keys ``target_column`` (best candidate or None),
        ``confidence``, ``candidates`` (list of scored columns), and
        ``is_unified`` (always False; the wrapper handles unification).
        On failure returns ``{"error": <message>}``.
    """
    try:
        df = load_data(file_path)
        candidates = []

        for col_original in df.columns:
            col_lower = col_original.lower()
            confidence = 0.0
            reasons = []

            # 1. Exact name match against the known target-name list
            if col_lower in TARGET_CANDIDATES:
                confidence += 0.6
                reasons.append(f"Matches common target name '{col_lower}'")

            # Boost if exact match 'target', 'label' or 'class'
            if col_lower in ['target', 'label', 'class']:
                confidence += 0.2

            # 2. Position heuristic (last column is often the target)
            if col_original == df.columns[-1]:
                confidence += 0.3
                reasons.append("Is the last column")

            # 3. Completeness: penalize columns with missing data
            missing_rate = df[col_original].isnull().mean()
            if missing_rate > 0.5:
                confidence -= 0.5
                reasons.append(f"High missing rate ({missing_rate:.1%})")
            elif missing_rate > 0:
                confidence -= 0.1
                reasons.append(f"Has missing values ({missing_rate:.1%})")

            # 4. Cardinality: an all-unique column is almost certainly an ID,
            # not a target, so penalize heavily.
            n_unique = df[col_original].nunique()
            if n_unique == len(df):
                confidence -= 0.8
                reasons.append("All values are unique (likely ID)")

            if confidence > 0.3:
                candidates.append({
                    "column": col_original,
                    "confidence": min(confidence, 1.0),
                    "reason": reasons
                })

        # Sort by confidence, best first
        candidates.sort(key=lambda x: x['confidence'], reverse=True)

        best_target = None
        best_conf = 0.0
        if candidates:
            best_target = candidates[0]['column']
            best_conf = candidates[0]['confidence']

        return {
            "target_column": best_target,
            "confidence": best_conf,
            "candidates": candidates,
            "is_unified": False  # Wrapper will handle unification logic
        }

    except Exception as e:
        return {"error": str(e)}
89
+
90
def validate_target(file_path, target_column):
    """Check that a chosen target column exists and characterize it.

    Infers the problem type (regression vs. classification) from the
    column's dtype and cardinality, counts missing values, and flags a
    dominant class (>90%) for classification targets.

    Args:
        file_path: Path to a CSV/Parquet dataset readable by load_data().
        target_column: Name of the column to validate.

    Returns:
        dict with ``valid``, ``problem_type``, ``missing_count``,
        ``total_rows`` and ``warnings`` on success, or ``{"error": ...}``
        when the column is absent or loading fails.
    """
    try:
        df = load_data(file_path)
        if target_column not in df.columns:
            return {"error": f"Column '{target_column}' not found in dataset."}

        series = df[target_column]
        total_rows = len(df)
        missing_count = series.isnull().sum()
        numeric = pd.api.types.is_numeric_dtype(series)
        distinct = series.nunique()

        # Problem-type heuristic: many numeric levels → regression;
        # few levels → classification; otherwise fall back on dtype.
        if numeric and distinct > 20:
            problem_type = "regression"
        elif distinct < 50:
            problem_type = "classification"
        else:
            problem_type = "regression" if numeric else "classification"

        warnings = []
        if missing_count > 0:
            warnings.append(f"Target has {missing_count} missing values.")

        if problem_type == "classification":
            # Warn when a single class dominates (>90% of rows).
            shares = series.value_counts(normalize=True)
            if shares.iloc[0] > 0.9:
                warnings.append(f"Highly imbalanced target: Class '{shares.index[0]}' is {shares.iloc[0]:.1%}")

        return {
            "valid": True,
            "problem_type": problem_type,
            "missing_count": int(missing_count),
            "total_rows": total_rows,
            "warnings": warnings
        }

    except Exception as e:
        return {"error": str(e)}
133
+
134
if __name__ == "__main__":
    # CLI entry point: target_engine.py <action> <file_path> [target_column]
    # Emits a single JSON object on stdout for the Node wrapper to parse.
    if len(sys.argv) < 3:
        # Fixed: removed the stray C-style semicolon after print(...).
        print(json.dumps({"error": "Usage: target_engine.py <action> <file_path> [args]"}))
        sys.exit(1)

    action = sys.argv[1]
    file_path = sys.argv[2]

    result = {}
    if action == "detect":
        result = detect_target(file_path)
    elif action == "validate":
        # "validate" additionally requires the target column name.
        target_col = sys.argv[3] if len(sys.argv) > 3 else None
        if target_col:
            result = validate_target(file_path, target_col)
        else:
            result = {"error": "Target column required for validation"}
    else:
        result = {"error": f"Unknown action: {action}"}

    print(json.dumps(result))