@vespermcp/mcp-server 1.0.4 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/README.md +6 -4
  2. package/build/cleaning/cleaner.js +27 -2
  3. package/build/cleaning/executor.js +7 -6
  4. package/build/cleaning/planner.js +16 -4
  5. package/build/config/config-manager.js +199 -0
  6. package/build/export/exporter.js +26 -2
  7. package/build/index.js +272 -72
  8. package/build/ingestion/ingestor.js +17 -16
  9. package/build/ingestion/kaggle-downloader.js +25 -2
  10. package/build/install/install-service.js +1 -1
  11. package/build/jobs/manager.js +17 -10
  12. package/build/metadata/monitoring-service.js +2 -2
  13. package/build/metadata/scraper.js +8 -8
  14. package/build/metadata/store.js +17 -2
  15. package/build/monitoring/observability.js +2 -2
  16. package/build/preparation/target-detector.js +75 -0
  17. package/build/python/cleaner.py +226 -0
  18. package/build/python/export_engine.py +131 -0
  19. package/build/python/framework_adapters.py +100 -0
  20. package/build/python/github_adapter.py +106 -0
  21. package/build/python/image_engine.py +86 -0
  22. package/build/python/media_engine.py +133 -0
  23. package/build/python/nasa_adapter.py +82 -0
  24. package/build/python/quality_engine.py +243 -0
  25. package/build/python/splitter_engine.py +283 -0
  26. package/build/python/target_engine.py +154 -0
  27. package/build/python/test_framework_adapters.py +61 -0
  28. package/build/python/uci_adapter.py +94 -0
  29. package/build/python/worldbank_adapter.py +99 -0
  30. package/build/quality/analyzer.js +40 -4
  31. package/build/quality/image-analyzer.js +73 -5
  32. package/build/quality/media-analyzer.js +74 -5
  33. package/build/scripts/cleanup-kaggle.js +41 -0
  34. package/build/scripts/repro-bug.js +37 -0
  35. package/build/scripts/repro-export-bug.js +56 -0
  36. package/build/scripts/test-mcp-v5.js +12 -11
  37. package/build/scripts/test-production-sync.js +36 -0
  38. package/build/scripts/test-target-detector.js +29 -0
  39. package/build/scripts/test-write.js +14 -0
  40. package/build/scripts/verify-integration.js +57 -0
  41. package/build/scripts/verify-priority.js +33 -0
  42. package/build/search/engine.js +13 -2
  43. package/build/search/jit-orchestrator.js +6 -40
  44. package/build/search/vector-store.js +18 -0
  45. package/build/splitting/splitter.js +27 -2
  46. package/build/tools/formatter.js +23 -8
  47. package/build/utils/downloader.js +2 -2
  48. package/build/utils/selector.js +69 -0
  49. package/package.json +8 -4
  50. package/src/python/cleaner.py +33 -3
  51. package/src/python/export_engine.py +19 -0
  52. package/src/python/target_engine.py +154 -0
package/build/search/jit-orchestrator.js CHANGED
@@ -1,5 +1,4 @@
  import { HuggingFaceScraper } from "../metadata/scraper.js";
- import { KaggleMetadataScraper } from "../metadata/kaggle-scraper.js";
  import { UCIScraper } from "../metadata/uci-scraper.js";
  import { GitHubScraper } from "../metadata/github-scraper.js";
  import { WorldBankScraper, NASAScraper } from "../metadata/institutional-scrapers.js";
@@ -45,7 +44,7 @@ export class JITOrchestrator {
          // Get existing dataset IDs to avoid duplicates
          const existing = this.metadataStore.getAllDatasets();
          existing.forEach(ds => existingIds.add(ds.id));
-         // 1. Scrape HuggingFace
+         // 1. Scrape HuggingFace (Open Access)
          const hfResults = await this.scrapeHuggingFace(query, limit);
          console.error(` HuggingFace: Found ${hfResults.length} datasets`);
          for (const ds of hfResults) {
@@ -54,21 +53,7 @@ export class JITOrchestrator {
                  existingIds.add(ds.id);
              }
          }
-         // 2. Scrape Kaggle (if credentials available)
-         const kaggleUser = process.env.KAGGLE_USERNAME;
-         const kaggleKey = process.env.KAGGLE_KEY;
-         if (kaggleUser && kaggleKey) {
-             const kaggleResults = await this.scrapeKaggle(query, Math.floor(limit / 2));
-             console.error(` Kaggle: Found ${kaggleResults.length} datasets`);
-             for (const ds of kaggleResults) {
-                 ds.id = `kaggle:${ds.id}`;
-                 if (!existingIds.has(ds.id)) {
-                     newDatasets.push(ds);
-                     existingIds.add(ds.id);
-                 }
-             }
-         }
-         // 3. Scrape UCI
+         // 2. Scrape UCI (Open Access)
          const uciResults = await this.scrapeUCI(query, Math.floor(limit / 2));
          console.error(` UCI: Found ${uciResults.length} datasets`);
          for (const ds of uciResults) {
@@ -77,7 +62,7 @@ export class JITOrchestrator {
                  existingIds.add(ds.id);
              }
          }
-         // 4. Scrape GitHub
+         // 3. Scrape GitHub (Open Access)
          const githubResults = await this.scrapeGitHub(query, Math.floor(limit / 2));
          console.error(` GitHub: Found ${githubResults.length} datasets`);
          for (const ds of githubResults) {
@@ -86,7 +71,7 @@ export class JITOrchestrator {
                  existingIds.add(ds.id);
              }
          }
-         // 5. Scrape World Bank
+         // 4. Scrape World Bank (Open Access)
          const wbResults = await this.scrapeWorldBank(query, Math.floor(limit / 2));
          console.error(` World Bank: Found ${wbResults.length} datasets`);
          for (const ds of wbResults) {
@@ -95,7 +80,7 @@ export class JITOrchestrator {
                  existingIds.add(ds.id);
              }
          }
-         // 6. Scrape NASA
+         // 5. Scrape NASA (Open Access)
          const nasaResults = await this.scrapeNASA(query, Math.floor(limit / 2));
          console.error(` NASA: Found ${nasaResults.length} datasets`);
          for (const ds of nasaResults) {
@@ -125,8 +110,7 @@ export class JITOrchestrator {
      async scrapeHuggingFace(query, limit) {
          const scraper = new HuggingFaceScraper();
          try {
-             // Use the query as a domain filter for now
-             // In the future, we can add a freeTextSearch parameter to the scraper
+             // Pass the query as a general search term
              return await scraper.scrape(limit, true, query);
          }
          catch (error) {
@@ -134,24 +118,6 @@ export class JITOrchestrator {
              return [];
          }
      }
-     /**
-      * Scrape Kaggle with search query
-      */
-     async scrapeKaggle(query, limit) {
-         const kaggleUser = process.env.KAGGLE_USERNAME;
-         const kaggleKey = process.env.KAGGLE_KEY;
-         if (!kaggleUser || !kaggleKey) {
-             return [];
-         }
-         try {
-             const scraper = new KaggleMetadataScraper(kaggleUser, kaggleKey);
-             return await scraper.scrape(query, limit);
-         }
-         catch (error) {
-             console.error(` ERROR: Kaggle scrape failed: ${error.message}`);
-             return [];
-         }
-     }
      /**
       * Scrape UCI
       */
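
Note: each remaining open-access source now runs through the same scrape-then-dedupe loop. A minimal sketch of that shared pattern, assuming the surrounding variable names from the diff (the collect helper itself is hypothetical, not shipped code):

    type Dataset = { id: string };

    // Hypothetical helper showing the per-source pattern the diff repeats:
    // scrape, then keep only datasets whose IDs are not already known.
    async function collect(
        scrape: (query: string, limit: number) => Promise<Dataset[]>,
        query: string,
        limit: number,
        existingIds: Set<string>,
        newDatasets: Dataset[],
    ): Promise<void> {
        const results = await scrape(query, limit);
        for (const ds of results) {
            if (!existingIds.has(ds.id)) {
                newDatasets.push(ds);
                existingIds.add(ds.id);
            }
        }
    }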
package/build/search/vector-store.js CHANGED
@@ -74,6 +74,24 @@ export class VectorStore {
      add(id, vector) {
          this.idToVector.set(id, vector instanceof Float32Array ? vector : new Float32Array(vector));
      }
+     /**
+      * Delete a vector by ID
+      */
+     delete(id) {
+         return this.idToVector.delete(id);
+     }
+     /**
+      * Delete multiple vectors by IDs
+      */
+     deleteMany(ids) {
+         let count = 0;
+         for (const id of ids) {
+             if (this.idToVector.delete(id)) {
+                 count++;
+             }
+         }
+         return count;
+     }
      search(queryVector, limit = 10) {
          const q = queryVector instanceof Float32Array ? queryVector : new Float32Array(queryVector);
          const results = [];
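
A short usage sketch of the new deletion API (construction shown without arguments and the IDs/vectors are illustrative):

    const store = new VectorStore();
    store.add("hf:squad", [0.1, 0.2, 0.3]);
    store.add("uci:iris", new Float32Array([0.4, 0.5, 0.6]));

    store.delete("hf:squad");                             // -> true (ID existed)
    const n = store.deleteMany(["hf:squad", "uci:iris"]); // -> 1 (only iris remained)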
package/build/splitting/splitter.js CHANGED
@@ -1,10 +1,35 @@
  import { spawn } from "child_process";
  import path from "path";
+ import fs from "fs";
  export class DataSplitter {
      pythonPath = "python";
      scriptPath;
-     constructor(projectRoot = process.cwd()) {
-         this.scriptPath = path.join(projectRoot, "src", "python", "splitter_engine.py");
+     constructor(buildDir = process.cwd()) {
+         const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
+         const dataRoot = path.join(homeDir, ".vesper");
+         const scriptPath0 = path.resolve(dataRoot, "python", "splitter_engine.py");
+         const scriptPath1 = path.resolve(buildDir, "python", "splitter_engine.py");
+         const scriptPath2 = path.resolve(buildDir, "..", "src", "python", "splitter_engine.py");
+         const scriptPath3 = path.resolve(buildDir, "..", "python", "splitter_engine.py");
+         if (fs.existsSync(scriptPath0)) {
+             this.scriptPath = scriptPath0;
+         }
+         else if (fs.existsSync(scriptPath1)) {
+             this.scriptPath = scriptPath1;
+         }
+         else if (fs.existsSync(scriptPath2)) {
+             this.scriptPath = scriptPath2;
+         }
+         else if (fs.existsSync(scriptPath3)) {
+             this.scriptPath = scriptPath3;
+         }
+         else {
+             this.scriptPath = scriptPath0;
+         }
+         // Detect Python command
+         if (process.platform === "win32") {
+             this.pythonPath = "py";
+         }
      }
      /**
       * Splits a dataset into Train/Val/Test sets based on config
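
The constructor now probes several candidate locations (user data directory first, then the build and source trees) and falls back to the ~/.vesper path. The same lookup expressed as a candidate-list scan (resolveScript is an illustrative helper, not the shipped code):

    import fs from "fs";
    import path from "path";

    // Sketch of the fallback search above, generalized over the script name.
    function resolveScript(buildDir: string, name: string): string {
        const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
        const candidates = [
            path.resolve(homeDir, ".vesper", "python", name),
            path.resolve(buildDir, "python", name),
            path.resolve(buildDir, "..", "src", "python", name),
            path.resolve(buildDir, "..", "python", name),
        ];
        // Fall back to the home-directory path if nothing exists yet.
        return candidates.find((p) => fs.existsSync(p)) ?? candidates[0];
    }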
package/build/tools/formatter.js CHANGED
@@ -21,10 +21,16 @@ export function formatJobStatus(job) {
      output += `Progress: ${bar} ${job.progress}%\n`;
      output += `Activity: ${job.status_text}\n`;
      if (job.result_url) {
-         output += `Result: ${job.result_url}\n`;
+         output += `\n✅ Result: ${job.result_url}\n`;
      }
      if (job.error) {
-         output += `Error: ${job.error}\n`;
+         output += `\n❌ ERROR:\n`;
+         // Format multi-line errors nicely
+         const errorLines = job.error.split('\n');
+         errorLines.forEach(line => {
+             output += ` ${line}\n`;
+         });
+         output += `\n`;
      }
      output += `Updated: ${new Date(job.updated_at).toLocaleTimeString()}\n`;
      output += "═".repeat(25) + "\n";
@@ -41,8 +47,11 @@ export function formatSearchResults(results) {
      output += "═".repeat(80) + "\n\n";
      results.forEach((ds, index) => {
          const relevanceScore = ds.relevance_score || 0;
-         // Source badge
-         const sourceBadge = ds.source === "huggingface" ? "HuggingFace" : "Kaggle";
+         // Source badge and access level
+         const openSources = ["huggingface", "uci", "github", "worldbank", "nasa"];
+         const isOpen = openSources.includes(ds.source);
+         const sourceLabel = ds.source.charAt(0).toUpperCase() + ds.source.slice(1);
+         const accessBadge = isOpen ? "🔓 Open Access" : "🔒 Requires API Key";
          // Safety indicator
          let safetyIndicator = "";
          if (ds.license.category === "safe") {
@@ -56,8 +65,8 @@ export function formatSearchResults(results) {
          }
          // Header
          output += `${index + 1}. ${ds.name}\n`;
-         output += ` ${sourceBadge} | ${safetyIndicator} | Relevance: ${(relevanceScore * 100).toFixed(0)}%\n`;
-         output += ` ID: ${ds.id}\n\n`;
+         output += ` Source: ${sourceLabel} | ${accessBadge} | ${safetyIndicator}\n`;
+         output += ` Relevance: ${(relevanceScore * 100).toFixed(0)}% | ID: ${ds.id}\n\n`;
          // Description
          if (ds.description && ds.description.length > 0) {
              const shortDesc = ds.description.length > 200
@@ -116,7 +125,10 @@ export function formatDatasetInfo(ds) {
      output += `${ds.name}\n`;
      output += "═".repeat(80) + "\n\n";
      // Source and safety
-     const sourceBadge = ds.source === "huggingface" ? "HuggingFace" : "Kaggle";
+     const openSources = ["huggingface", "uci", "github", "worldbank", "nasa"];
+     const isOpen = openSources.includes(ds.source);
+     const sourceLabel = ds.source.charAt(0).toUpperCase() + ds.source.slice(1);
+     const accessBadge = isOpen ? "🔓 Open Access" : "🔒 Requires API Key";
      let safetyIndicator = "";
      if (ds.license.category === "safe") {
          safetyIndicator = "Safe for use";
@@ -127,9 +139,12 @@ export function formatDatasetInfo(ds) {
      else {
          safetyIndicator = "Unknown license - Use with caution";
      }
-     output += `Source: ${sourceBadge}\n`;
+     output += `Source: ${sourceLabel} (${accessBadge})\n`;
      output += `Safety: ${safetyIndicator}\n`;
      output += `ID: ${ds.id}\n\n`;
+     if (!isOpen && ds.source === "kaggle") {
+         output += `⚠️ NOTE: This dataset requires a Kaggle API key (KAGGLE_USERNAME/KAGGLE_KEY) to prepare.\n\n`;
+     }
      // Description
      if (ds.description) {
          output += "Description:\n";
package/build/utils/downloader.js CHANGED
@@ -15,13 +15,13 @@ export class RobustDownloader {
              startByte = fs.statSync(targetPath).size;
              if (startByte > 0) {
                  headers["Range"] = `bytes=${startByte}-`;
-                 console.log(`[Downloader] Resuming from byte ${startByte}`);
+                 console.error(`[Downloader] Resuming from byte ${startByte}`);
              }
          }
          const response = await fetch(url, { headers });
          if (response.status === 416) {
              // Requested range not satisfiable - likely already finished
-             console.log("[Downloader] Range not satisfiable, file might be complete.");
+             console.error("[Downloader] Range not satisfiable, file might be complete.");
              return;
          }
          if (!response.ok && response.status !== 206) {
package/build/utils/selector.js ADDED
@@ -0,0 +1,69 @@
+ import readline from "readline";
+ export class Selector {
+     currentIndex = 0;
+     options;
+     title;
+     constructor(title, options) {
+         this.title = title;
+         this.options = options;
+     }
+     render() {
+         // Clear previous lines
+         process.stdout.write("\x1b[?25l"); // Hide cursor
+         readline.cursorTo(process.stdout, 0);
+         // Clear the lines we used before (options + title + blank line)
+         for (let i = 0; i <= this.options.length + 1; i++) {
+             readline.clearLine(process.stdout, 0);
+             process.stdout.write("\x1b[1A"); // Move up one line
+         }
+         readline.clearLine(process.stdout, 0);
+         console.log(`\n${this.title}`);
+         this.options.forEach((opt, idx) => {
+             const isCurrent = idx === this.currentIndex;
+             const checkbox = opt.selected ? "[\x1b[32mX\x1b[0m]" : "[ ]";
+             const cursor = isCurrent ? "\x1b[36m>\x1b[0m " : " ";
+             const label = isCurrent ? `\x1b[36m${opt.name}\x1b[0m` : opt.name;
+             console.log(`${cursor}${checkbox} ${label}`);
+         });
+         console.log("\x1b[2m(Use arrows to move, Space to toggle, Enter to confirm)\x1b[0m");
+     }
+     async run() {
+         if (this.options.length === 0)
+             return [];
+         readline.emitKeypressEvents(process.stdin);
+         if (process.stdin.isTTY) {
+             process.stdin.setRawMode(true);
+         }
+         // Initial render room (print blank lines to be cleared)
+         console.log("\n".repeat(this.options.length + 1));
+         this.render();
+         return new Promise((resolve) => {
+             const handleKey = (str, key) => {
+                 if (key.name === "up") {
+                     this.currentIndex = (this.currentIndex - 1 + this.options.length) % this.options.length;
+                     this.render();
+                 }
+                 else if (key.name === "down") {
+                     this.currentIndex = (this.currentIndex + 1) % this.options.length;
+                     this.render();
+                 }
+                 else if (key.name === "space") {
+                     this.options[this.currentIndex].selected = !this.options[this.currentIndex].selected;
+                     this.render();
+                 }
+                 else if (key.name === "return") {
+                     process.stdin.setRawMode(false);
+                     process.stdin.removeListener("keypress", handleKey);
+                     process.stdout.write("\x1b[?25h"); // Show cursor
+                     console.log("");
+                     resolve(this.options.filter(o => o.selected).map(o => o.value));
+                 }
+                 else if (key.ctrl && key.name === "c") {
+                     process.stdin.setRawMode(false);
+                     process.exit();
+                 }
+             };
+             process.stdin.on("keypress", handleKey);
+         });
+     }
+ }
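
A hedged usage sketch of the new interactive selector (option names and values are illustrative):

    const selector = new Selector("Select sources to enable:", [
        { name: "HuggingFace", value: "huggingface", selected: true },
        { name: "UCI", value: "uci", selected: false },
        { name: "GitHub", value: "github", selected: false },
    ]);
    // Resolves with the values of the checked options, e.g. ["huggingface"].
    const chosen = await selector.run();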
package/package.json CHANGED
@@ -1,11 +1,12 @@
  {
    "name": "@vespermcp/mcp-server",
-   "version": "1.0.4",
+   "version": "1.0.6",
    "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
    "type": "module",
    "main": "build/index.js",
    "bin": {
-     "vesper": "./build/index.js"
+     "vespermcp": "./build/index.js",
+     "@vespermcp/mcp-server": "./build/index.js"
    },
    "files": [
      "build/**/*",
@@ -16,7 +17,7 @@
      "mcp-config-template.json"
    ],
    "scripts": {
-     "build": "tsc",
+     "build": "tsc && node -e \"const fs=require('fs');const path=require('path');const src='src/python';const dest='build/python';if(!fs.existsSync(dest))fs.mkdirSync(dest,{recursive:true});fs.readdirSync(src).forEach(f=>{if(f.endsWith('.py'))fs.copyFileSync(path.join(src,f),path.join(dest,f));});console.log('✅ Copied Python scripts to build/python');\"",
      "dev": "tsx watch src/index.ts",
      "postinstall": "node scripts/postinstall.cjs",
      "scrape": "tsx src/scripts/scrape-metadata.ts",
@@ -26,7 +27,10 @@
      "check-db": "tsx src/scripts/check-db.ts",
      "test-jit": "tsx src/scripts/test-jit.ts",
      "demo-ui": "tsx src/scripts/demo-ui.ts",
-     "test": "vitest"
+     "setup": "node build/index.js --setup",
+     "setup:silent": "node build/index.js --setup --silent",
+     "test": "vitest",
+     "start": "node build/index.js"
    },
    "keywords": [
      "mcp",
package/src/python/cleaner.py CHANGED
@@ -152,7 +152,15 @@ def main():
          op_type = op["type"]
          params = op.get("params", {})

-         if op_type in OPERATIONS:
+         if op_type == "RenameTarget":
+             old_name = params.get("old_name")
+             new_name = params.get("new_name", "target")
+             if old_name and old_name in df.columns:
+                 df = df.rename({old_name: new_name})
+                 logs.append(f"Renamed column '{old_name}' to '{new_name}'")
+             else:
+                 logs.append(f"Failed RenameTarget: Column '{old_name}' not found or not specified.")
+         elif op_type in OPERATIONS:
              try:
                  df, stats = OPERATIONS[op_type](df, params)
                  logs.append(f"Executed {op_type}: {stats}")
@@ -176,6 +184,28 @@ def main():
          base_name = file_path.rsplit(".", 1)[0]
          if output_format == "csv":
              output_path = f"{base_name}_cleaned.csv"
+             # Stringify ANY column that might not be CSV-safe (List, Struct, Object, etc.)
+             for col in df.columns:
+                 dtype = df.schema[col]
+                 # Only keep simple types; stringify everything else for CSV
+                 is_simple = (
+                     dtype.is_numeric() or
+                     dtype.is_temporal() or
+                     str(dtype).lower() in ["string", "utf8", "boolean", "bool"]
+                 )
+                 if not is_simple:
+                     # Use a robust helper for clean JSON serialization
+                     def safe_serialize(val):
+                         try:
+                             # Handle Polars nested types (convert to Python list/dict first)
+                             if hasattr(val, "to_list"):
+                                 return json.dumps(val.to_list())
+                             if hasattr(val, "to_dict"):
+                                 return json.dumps(val.to_dict())
+                             return json.dumps(val)
+                         except:
+                             return str(val)
+                     df = df.with_columns(pl.col(col).map_elements(safe_serialize, return_dtype=pl.Utf8))
              df.write_csv(output_path)
          else:
              output_path = f"{base_name}_cleaned.parquet"
@@ -186,10 +216,10 @@ def main():
              "output_path": output_path,
              "rows_affected": total_rows_affected,
              "logs": logs
-         }))
+         }, default=str))

      except Exception as e:
-         print(json.dumps({"success": False, "error": str(e)}))
+         print(json.dumps({"success": False, "error": str(e)}, default=str))
          sys.exit(1)

  if __name__ == "__main__":
package/src/python/export_engine.py CHANGED
@@ -31,6 +31,25 @@ def export_data(file_path, output_path, format, options=None):
      try:
          # Export Logic
          if format == "csv":
+             # Stringify ANY column that might not be CSV-safe (List, Struct, Object, etc.)
+             for col in df.columns:
+                 dtype = df.schema[col]
+                 is_simple = (
+                     dtype.is_numeric() or
+                     dtype.is_temporal() or
+                     str(dtype).lower() in ["string", "utf8", "boolean", "bool"]
+                 )
+                 if not is_simple:
+                     def safe_serialize(val):
+                         try:
+                             if hasattr(val, "to_list"):
+                                 return json.dumps(val.to_list())
+                             if hasattr(val, "to_dict"):
+                                 return json.dumps(val.to_dict())
+                             return json.dumps(val)
+                         except:
+                             return str(val)
+                     df = df.with_columns(pl.col(col).map_elements(safe_serialize, return_dtype=pl.Utf8))
              df.write_csv(output_path)

          elif format == "parquet":
package/src/python/target_engine.py ADDED
@@ -0,0 +1,154 @@
+ import sys
+ import json
+ import pandas as pd
+ import numpy as np
+
+ # Common names for target variables in datasets
+ TARGET_CANDIDATES = [
+     'target', 'label', 'class', 'outcome', 'y',
+     'price', 'saleprice', 'sales', 'cost', 'value', 'total',
+     'diagnosis', 'species', 'churn', 'survived', 'credit_risk'
+ ]
+
+ def load_data(file_path):
+     if file_path.endswith('.csv'):
+         return pd.read_csv(file_path)
+     elif file_path.endswith('.parquet'):
+         return pd.read_parquet(file_path)
+     else:
+         raise ValueError("Unsupported file format")
+
+ def detect_target(file_path):
+     try:
+         df = load_data(file_path)
+         columns = [c.lower() for c in df.columns]
+         candidates = []
+
+         # 1. Exact Name Match
+         for col_original in df.columns:
+             col_lower = col_original.lower()
+             confidence = 0.0
+             reasons = []
+
+             if col_lower in TARGET_CANDIDATES:
+                 confidence += 0.6
+                 reasons.append(f"Matches common target name '{col_lower}'")
+
+             # Boost if exact match 'target' or 'label'
+             if col_lower in ['target', 'label', 'class']:
+                 confidence += 0.2
+
+             # 2. Position Heuristic (Last column is often target)
+             if col_original == df.columns[-1]:
+                 confidence += 0.3
+                 reasons.append("Is the last column")
+
+             # 3. Completeness
+             missing_rate = df[col_original].isnull().mean()
+             if missing_rate > 0.5:
+                 confidence -= 0.5
+                 reasons.append(f"High missing rate ({missing_rate:.1%})")
+             elif missing_rate > 0:
+                 confidence -= 0.1
+                 reasons.append(f"Has missing values ({missing_rate:.1%})")
+
+             # 4. Cardinality / Unique Values
+             # If regression-like (many unique numeric values) or class-like (few unique values)
+             # This is hard to score generally, but extremes are bad for targets (e.g. all unique = ID usually)
+             n_unique = df[col_original].nunique()
+             if n_unique == len(df):
+                 confidence -= 0.8
+                 reasons.append("All values are unique (likely ID)")
+
+             if confidence > 0.3:
+                 candidates.append({
+                     "column": col_original,
+                     "confidence": min(confidence, 1.0),
+                     "reason": reasons
+                 })
+
+         # Sort by confidence
+         candidates.sort(key=lambda x: x['confidence'], reverse=True)
+
+         best_target = None
+         best_conf = 0.0
+
+         if candidates:
+             best_target = candidates[0]['column']
+             best_conf = candidates[0]['confidence']
+
+         return {
+             "target_column": best_target,
+             "confidence": best_conf,
+             "candidates": candidates,
+             "is_unified": False  # Wrapper will handle unification logic
+         }
+
+     except Exception as e:
+         return {"error": str(e)}
+
+ def validate_target(file_path, target_column):
+     try:
+         df = load_data(file_path)
+         if target_column not in df.columns:
+             return {"error": f"Column '{target_column}' not found in dataset."}
+
+         series = df[target_column]
+         total_rows = len(df)
+         missing_count = series.isnull().sum()
+
+         # Determine type
+         is_numeric = pd.api.types.is_numeric_dtype(series)
+         n_unique = series.nunique()
+
+         problem_type = "unknown"
+         if is_numeric and n_unique > 20:
+             problem_type = "regression"
+         elif n_unique < 50:  # String or few numeric values
+             problem_type = "classification"
+         else:
+             # Heuristic fallback
+             problem_type = "regression" if is_numeric else "classification"
+
+         warnings = []
+         if missing_count > 0:
+             warnings.append(f"Target has {missing_count} missing values.")
+
+         # Imbalance check for classification
+         if problem_type == "classification":
+             counts = series.value_counts(normalize=True)
+             if counts.iloc[0] > 0.9:  # Dominant class > 90%
+                 warnings.append(f"Highly imbalanced target: Class '{counts.index[0]}' is {counts.iloc[0]:.1%}")
+
+         return {
+             "valid": True,
+             "problem_type": problem_type,
+             "missing_count": int(missing_count),
+             "total_rows": total_rows,
+             "warnings": warnings
+         }
+
+     except Exception as e:
+         return {"error": str(e)}
+
+ if __name__ == "__main__":
+     if len(sys.argv) < 3:
+         print(json.dumps({"error": "Usage: target_engine.py <action> <file_path> [args]"}))
+         sys.exit(1)
+
+     action = sys.argv[1]
+     file_path = sys.argv[2]
+
+     result = {}
+     if action == "detect":
+         result = detect_target(file_path)
+     elif action == "validate":
+         target_col = sys.argv[3] if len(sys.argv) > 3 else None
+         if target_col:
+             result = validate_target(file_path, target_col)
+         else:
+             result = {"error": "Target column required for validation"}
+     else:
+         result = {"error": f"Unknown action: {action}"}
+
+     print(json.dumps(result))
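
The engine exposes a small CLI ("detect" or "validate") and prints one JSON object on stdout, so the Node side presumably drives it via child_process like the other Python engines. A minimal sketch of such an invocation (the detectTarget helper is hypothetical):

    import { spawn } from "child_process";

    // Hypothetical Node-side call: run `python target_engine.py detect <file>`
    // and parse the single JSON object the script prints.
    function detectTarget(scriptPath: string, dataPath: string): Promise<unknown> {
        return new Promise((resolve, reject) => {
            const proc = spawn("python", [scriptPath, "detect", dataPath]);
            let out = "";
            proc.stdout.on("data", (chunk) => (out += chunk));
            proc.on("close", (code) =>
                code === 0 ? resolve(JSON.parse(out)) : reject(new Error(`target_engine exited ${code}`)));
        });
    }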