@vespermcp/mcp-server 1.0.4 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -4
- package/build/cleaning/cleaner.js +27 -2
- package/build/cleaning/executor.js +7 -6
- package/build/cleaning/planner.js +16 -4
- package/build/config/config-manager.js +199 -0
- package/build/export/exporter.js +26 -2
- package/build/index.js +272 -72
- package/build/ingestion/ingestor.js +17 -16
- package/build/ingestion/kaggle-downloader.js +25 -2
- package/build/install/install-service.js +1 -1
- package/build/jobs/manager.js +17 -10
- package/build/metadata/monitoring-service.js +2 -2
- package/build/metadata/scraper.js +8 -8
- package/build/metadata/store.js +17 -2
- package/build/monitoring/observability.js +2 -2
- package/build/preparation/target-detector.js +75 -0
- package/build/python/cleaner.py +226 -0
- package/build/python/export_engine.py +131 -0
- package/build/python/framework_adapters.py +100 -0
- package/build/python/github_adapter.py +106 -0
- package/build/python/image_engine.py +86 -0
- package/build/python/media_engine.py +133 -0
- package/build/python/nasa_adapter.py +82 -0
- package/build/python/quality_engine.py +243 -0
- package/build/python/splitter_engine.py +283 -0
- package/build/python/target_engine.py +154 -0
- package/build/python/test_framework_adapters.py +61 -0
- package/build/python/uci_adapter.py +94 -0
- package/build/python/worldbank_adapter.py +99 -0
- package/build/quality/analyzer.js +40 -4
- package/build/quality/image-analyzer.js +73 -5
- package/build/quality/media-analyzer.js +74 -5
- package/build/scripts/cleanup-kaggle.js +41 -0
- package/build/scripts/repro-bug.js +37 -0
- package/build/scripts/repro-export-bug.js +56 -0
- package/build/scripts/test-mcp-v5.js +12 -11
- package/build/scripts/test-production-sync.js +36 -0
- package/build/scripts/test-target-detector.js +29 -0
- package/build/scripts/test-write.js +14 -0
- package/build/scripts/verify-integration.js +57 -0
- package/build/scripts/verify-priority.js +33 -0
- package/build/search/engine.js +13 -2
- package/build/search/jit-orchestrator.js +6 -40
- package/build/search/vector-store.js +18 -0
- package/build/splitting/splitter.js +27 -2
- package/build/tools/formatter.js +23 -8
- package/build/utils/downloader.js +2 -2
- package/build/utils/selector.js +69 -0
- package/package.json +8 -4
- package/src/python/cleaner.py +33 -3
- package/src/python/export_engine.py +19 -0
- package/src/python/target_engine.py +154 -0
package/build/search/jit-orchestrator.js
CHANGED

@@ -1,5 +1,4 @@
 import { HuggingFaceScraper } from "../metadata/scraper.js";
-import { KaggleMetadataScraper } from "../metadata/kaggle-scraper.js";
 import { UCIScraper } from "../metadata/uci-scraper.js";
 import { GitHubScraper } from "../metadata/github-scraper.js";
 import { WorldBankScraper, NASAScraper } from "../metadata/institutional-scrapers.js";
@@ -45,7 +44,7 @@ export class JITOrchestrator {
         // Get existing dataset IDs to avoid duplicates
         const existing = this.metadataStore.getAllDatasets();
         existing.forEach(ds => existingIds.add(ds.id));
-        // 1. Scrape HuggingFace
+        // 1. Scrape HuggingFace (Open Access)
         const hfResults = await this.scrapeHuggingFace(query, limit);
         console.error(` HuggingFace: Found ${hfResults.length} datasets`);
         for (const ds of hfResults) {
@@ -54,21 +53,7 @@ export class JITOrchestrator {
                 existingIds.add(ds.id);
             }
         }
-        // 2. Scrape
-        const kaggleUser = process.env.KAGGLE_USERNAME;
-        const kaggleKey = process.env.KAGGLE_KEY;
-        if (kaggleUser && kaggleKey) {
-            const kaggleResults = await this.scrapeKaggle(query, Math.floor(limit / 2));
-            console.error(` Kaggle: Found ${kaggleResults.length} datasets`);
-            for (const ds of kaggleResults) {
-                ds.id = `kaggle:${ds.id}`;
-                if (!existingIds.has(ds.id)) {
-                    newDatasets.push(ds);
-                    existingIds.add(ds.id);
-                }
-            }
-        }
-        // 3. Scrape UCI
+        // 2. Scrape UCI (Open Access)
         const uciResults = await this.scrapeUCI(query, Math.floor(limit / 2));
         console.error(` UCI: Found ${uciResults.length} datasets`);
         for (const ds of uciResults) {
@@ -77,7 +62,7 @@ export class JITOrchestrator {
                 existingIds.add(ds.id);
             }
         }
-        //
+        // 3. Scrape GitHub (Open Access)
         const githubResults = await this.scrapeGitHub(query, Math.floor(limit / 2));
         console.error(` GitHub: Found ${githubResults.length} datasets`);
         for (const ds of githubResults) {
@@ -86,7 +71,7 @@ export class JITOrchestrator {
                 existingIds.add(ds.id);
             }
         }
-        //
+        // 4. Scrape World Bank (Open Access)
         const wbResults = await this.scrapeWorldBank(query, Math.floor(limit / 2));
         console.error(` World Bank: Found ${wbResults.length} datasets`);
         for (const ds of wbResults) {
@@ -95,7 +80,7 @@ export class JITOrchestrator {
                 existingIds.add(ds.id);
             }
         }
-        //
+        // 5. Scrape NASA (Open Access)
         const nasaResults = await this.scrapeNASA(query, Math.floor(limit / 2));
         console.error(` NASA: Found ${nasaResults.length} datasets`);
         for (const ds of nasaResults) {
@@ -125,8 +110,7 @@ export class JITOrchestrator {
     async scrapeHuggingFace(query, limit) {
         const scraper = new HuggingFaceScraper();
         try {
-            //
-            // In the future, we can add a freeTextSearch parameter to the scraper
+            // Pass the query as a general search term
             return await scraper.scrape(limit, true, query);
         }
         catch (error) {
@@ -134,24 +118,6 @@ export class JITOrchestrator {
             return [];
         }
     }
-    /**
-     * Scrape Kaggle with search query
-     */
-    async scrapeKaggle(query, limit) {
-        const kaggleUser = process.env.KAGGLE_USERNAME;
-        const kaggleKey = process.env.KAGGLE_KEY;
-        if (!kaggleUser || !kaggleKey) {
-            return [];
-        }
-        try {
-            const scraper = new KaggleMetadataScraper(kaggleUser, kaggleKey);
-            return await scraper.scrape(query, limit);
-        }
-        catch (error) {
-            console.error(` ERROR: Kaggle scrape failed: ${error.message}`);
-            return [];
-        }
-    }
     /**
      * Scrape UCI
      */
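
Every per-source block in the rewritten loop follows the same scrape → log → dedupe shape. Extracted as a pattern for clarity — a minimal sketch, not the shipped code (`collect` and its signature are hypothetical):

    // Hypothetical helper showing the shape each numbered source block shares.
    async function collect(label, scrapeFn, existingIds, newDatasets) {
        const results = await scrapeFn();
        console.error(` ${label}: Found ${results.length} datasets`);
        for (const ds of results) {
            if (!existingIds.has(ds.id)) { // skip anything already indexed
                newDatasets.push(ds);
                existingIds.add(ds.id);
            }
        }
    }

Each numbered block above is an inlined instance of this pattern, with the secondary sources capped at `Math.floor(limit / 2)`.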
package/build/search/vector-store.js
CHANGED

@@ -74,6 +74,24 @@ export class VectorStore {
     add(id, vector) {
         this.idToVector.set(id, vector instanceof Float32Array ? vector : new Float32Array(vector));
     }
+    /**
+     * Delete a vector by ID
+     */
+    delete(id) {
+        return this.idToVector.delete(id);
+    }
+    /**
+     * Delete multiple vectors by IDs
+     */
+    deleteMany(ids) {
+        let count = 0;
+        for (const id of ids) {
+            if (this.idToVector.delete(id)) {
+                count++;
+            }
+        }
+        return count;
+    }
     search(queryVector, limit = 10) {
         const q = queryVector instanceof Float32Array ? queryVector : new Float32Array(queryVector);
         const results = [];
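
The new deletion API mirrors `Map` semantics: `delete(id)` returns whether the vector existed, `deleteMany(ids)` returns how many were actually removed. A usage sketch, assuming a zero-argument constructor (the constructor is not shown in this hunk):

    import { VectorStore } from "./build/search/vector-store.js";

    const store = new VectorStore(); // assumption: zero-arg constructor
    store.add("ds-1", [0.1, 0.2, 0.3]);
    store.add("ds-2", [0.4, 0.5, 0.6]);

    store.delete("ds-1");                                  // true - the vector existed
    const removed = store.deleteMany(["ds-2", "ds-gone"]); // 1 - only "ds-2" was present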
package/build/splitting/splitter.js
CHANGED

@@ -1,10 +1,35 @@
 import { spawn } from "child_process";
 import path from "path";
+import fs from "fs";
 export class DataSplitter {
     pythonPath = "python";
     scriptPath;
-    constructor(
-
+    constructor(buildDir = process.cwd()) {
+        const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
+        const dataRoot = path.join(homeDir, ".vesper");
+        const scriptPath0 = path.resolve(dataRoot, "python", "splitter_engine.py");
+        const scriptPath1 = path.resolve(buildDir, "python", "splitter_engine.py");
+        const scriptPath2 = path.resolve(buildDir, "..", "src", "python", "splitter_engine.py");
+        const scriptPath3 = path.resolve(buildDir, "..", "python", "splitter_engine.py");
+        if (fs.existsSync(scriptPath0)) {
+            this.scriptPath = scriptPath0;
+        }
+        else if (fs.existsSync(scriptPath1)) {
+            this.scriptPath = scriptPath1;
+        }
+        else if (fs.existsSync(scriptPath2)) {
+            this.scriptPath = scriptPath2;
+        }
+        else if (fs.existsSync(scriptPath3)) {
+            this.scriptPath = scriptPath3;
+        }
+        else {
+            this.scriptPath = scriptPath0;
+        }
+        // Detect Python command
+        if (process.platform === "win32") {
+            this.pythonPath = "py";
+        }
     }
     /**
     * Splits a dataset into Train/Val/Test sets based on config
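
The constructor now probes four locations in priority order — the installed data dir (`~/.vesper/python`), the build tree, the source checkout, and a sibling `python` dir — falling back to the first candidate so error messages point at the expected install path. The same chain can be written as a lookup over an ordered list; a sketch of the equivalent logic, not the shipped code:

    import fs from "fs";
    import path from "path";

    // Equivalent of the if/else chain above: first existing candidate wins.
    function resolveEngine(buildDir, dataRoot, name) {
        const candidates = [
            path.resolve(dataRoot, "python", name),
            path.resolve(buildDir, "python", name),
            path.resolve(buildDir, "..", "src", "python", name),
            path.resolve(buildDir, "..", "python", name),
        ];
        return candidates.find(p => fs.existsSync(p)) ?? candidates[0];
    }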
package/build/tools/formatter.js
CHANGED

@@ -21,10 +21,16 @@ export function formatJobStatus(job) {
     output += `Progress: ${bar} ${job.progress}%\n`;
     output += `Activity: ${job.status_text}\n`;
     if (job.result_url) {
-        output +=
+        output += `\n✅ Result: ${job.result_url}\n`;
     }
     if (job.error) {
-        output +=
+        output += `\n❌ ERROR:\n`;
+        // Format multi-line errors nicely
+        const errorLines = job.error.split('\n');
+        errorLines.forEach(line => {
+            output += ` ${line}\n`;
+        });
+        output += `\n`;
     }
     output += `Updated: ${new Date(job.updated_at).toLocaleTimeString()}\n`;
     output += "═".repeat(25) + "\n";
@@ -41,8 +47,11 @@ export function formatSearchResults(results) {
     output += "═".repeat(80) + "\n\n";
     results.forEach((ds, index) => {
         const relevanceScore = ds.relevance_score || 0;
-        // Source badge
-        const
+        // Source badge and access level
+        const openSources = ["huggingface", "uci", "github", "worldbank", "nasa"];
+        const isOpen = openSources.includes(ds.source);
+        const sourceLabel = ds.source.charAt(0).toUpperCase() + ds.source.slice(1);
+        const accessBadge = isOpen ? "🔓 Open Access" : "🔒 Requires API Key";
         // Safety indicator
         let safetyIndicator = "";
         if (ds.license.category === "safe") {
@@ -56,8 +65,8 @@ export function formatSearchResults(results) {
         }
         // Header
         output += `${index + 1}. ${ds.name}\n`;
-        output += ` ${
-        output += ` ID: ${ds.id}\n\n`;
+        output += ` Source: ${sourceLabel} | ${accessBadge} | ${safetyIndicator}\n`;
+        output += ` Relevance: ${(relevanceScore * 100).toFixed(0)}% | ID: ${ds.id}\n\n`;
         // Description
         if (ds.description && ds.description.length > 0) {
             const shortDesc = ds.description.length > 200
@@ -116,7 +125,10 @@ export function formatDatasetInfo(ds) {
     output += `${ds.name}\n`;
     output += "═".repeat(80) + "\n\n";
     // Source and safety
-    const
+    const openSources = ["huggingface", "uci", "github", "worldbank", "nasa"];
+    const isOpen = openSources.includes(ds.source);
+    const sourceLabel = ds.source.charAt(0).toUpperCase() + ds.source.slice(1);
+    const accessBadge = isOpen ? "🔓 Open Access" : "🔒 Requires API Key";
     let safetyIndicator = "";
     if (ds.license.category === "safe") {
         safetyIndicator = "Safe for use";
@@ -127,9 +139,12 @@ export function formatDatasetInfo(ds) {
     else {
         safetyIndicator = "Unknown license - Use with caution";
     }
-    output += `Source: ${
+    output += `Source: ${sourceLabel} (${accessBadge})\n`;
     output += `Safety: ${safetyIndicator}\n`;
     output += `ID: ${ds.id}\n\n`;
+    if (!isOpen && ds.source === "kaggle") {
+        output += `⚠️ NOTE: This dataset requires a Kaggle API key (KAGGLE_USERNAME/KAGGLE_KEY) to prepare.\n\n`;
+    }
     // Description
     if (ds.description) {
         output += "Description:\n";
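
For a concrete sense of the new badges, here is the badge logic run against a hypothetical search result (the `ds` record is illustrative; its fields match what `formatSearchResults` reads):

    const ds = { name: "Iris", id: "uci:iris", source: "uci",
        relevance_score: 0.92, license: { category: "safe" } }; // hypothetical record

    const openSources = ["huggingface", "uci", "github", "worldbank", "nasa"];
    const isOpen = openSources.includes(ds.source);
    const sourceLabel = ds.source.charAt(0).toUpperCase() + ds.source.slice(1);
    const accessBadge = isOpen ? "🔓 Open Access" : "🔒 Requires API Key";
    console.log(`Source: ${sourceLabel} | ${accessBadge}`);
    // → "Source: Uci | 🔓 Open Access"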
package/build/utils/downloader.js
CHANGED

@@ -15,13 +15,13 @@ export class RobustDownloader {
         startByte = fs.statSync(targetPath).size;
         if (startByte > 0) {
             headers["Range"] = `bytes=${startByte}-`;
-            console.
+            console.error(`[Downloader] Resuming from byte ${startByte}`);
         }
     }
     const response = await fetch(url, { headers });
     if (response.status === 416) {
         // Requested range not satisfiable - likely already finished
-        console.
+        console.error("[Downloader] Range not satisfiable, file might be complete.");
         return;
     }
     if (!response.ok && response.status !== 206) {
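
The surrounding code is standard HTTP range resumption: send `Range: bytes=<offset>-`, treat 206 (Partial Content) as a signal to append, and 416 (Range Not Satisfiable) as already complete. A self-contained sketch of that protocol under those assumptions (names are illustrative, not the package's API):

    import fs from "fs";

    async function resumeDownload(url, targetPath) {
        const headers = {};
        const startByte = fs.existsSync(targetPath) ? fs.statSync(targetPath).size : 0;
        if (startByte > 0) headers["Range"] = `bytes=${startByte}-`;

        const response = await fetch(url, { headers });
        if (response.status === 416) return; // range past EOF: already complete
        if (!response.ok && response.status !== 206) {
            throw new Error(`Download failed: HTTP ${response.status}`);
        }
        // 206: server honored the range, append; 200: it did not, rewrite from scratch.
        const flag = response.status === 206 ? "a" : "w";
        fs.writeFileSync(targetPath, Buffer.from(await response.arrayBuffer()), { flag });
    }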
package/build/utils/selector.js
ADDED

@@ -0,0 +1,69 @@
+import readline from "readline";
+export class Selector {
+    currentIndex = 0;
+    options;
+    title;
+    constructor(title, options) {
+        this.title = title;
+        this.options = options;
+    }
+    render() {
+        // Clear previous lines
+        process.stdout.write("\x1b[?25l"); // Hide cursor
+        readline.cursorTo(process.stdout, 0);
+        // Clear the lines we used before (options + title + blank line)
+        for (let i = 0; i <= this.options.length + 1; i++) {
+            readline.clearLine(process.stdout, 0);
+            process.stdout.write("\x1b[1A"); // Move up one line
+        }
+        readline.clearLine(process.stdout, 0);
+        console.log(`\n${this.title}`);
+        this.options.forEach((opt, idx) => {
+            const isCurrent = idx === this.currentIndex;
+            const checkbox = opt.selected ? "[\x1b[32mX\x1b[0m]" : "[ ]";
+            const cursor = isCurrent ? "\x1b[36m>\x1b[0m " : " ";
+            const label = isCurrent ? `\x1b[36m${opt.name}\x1b[0m` : opt.name;
+            console.log(`${cursor}${checkbox} ${label}`);
+        });
+        console.log("\x1b[2m(Use arrows to move, Space to toggle, Enter to confirm)\x1b[0m");
+    }
+    async run() {
+        if (this.options.length === 0)
+            return [];
+        readline.emitKeypressEvents(process.stdin);
+        if (process.stdin.isTTY) {
+            process.stdin.setRawMode(true);
+        }
+        // Initial render room (print blank lines to be cleared)
+        console.log("\n".repeat(this.options.length + 1));
+        this.render();
+        return new Promise((resolve) => {
+            const handleKey = (str, key) => {
+                if (key.name === "up") {
+                    this.currentIndex = (this.currentIndex - 1 + this.options.length) % this.options.length;
+                    this.render();
+                }
+                else if (key.name === "down") {
+                    this.currentIndex = (this.currentIndex + 1) % this.options.length;
+                    this.render();
+                }
+                else if (key.name === "space") {
+                    this.options[this.currentIndex].selected = !this.options[this.currentIndex].selected;
+                    this.render();
+                }
+                else if (key.name === "return") {
+                    process.stdin.setRawMode(false);
+                    process.stdin.removeListener("keypress", handleKey);
+                    process.stdout.write("\x1b[?25h"); // Show cursor
+                    console.log("");
+                    resolve(this.options.filter(o => o.selected).map(o => o.value));
+                }
+                else if (key.ctrl && key.name === "c") {
+                    process.stdin.setRawMode(false);
+                    process.exit();
+                }
+            };
+            process.stdin.on("keypress", handleKey);
+        });
+    }
+}
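
Usage, inferred from the constructor and `run()` above — option objects carry `name`, `value`, and a `selected` flag, and `run()` resolves with the `value` of every checked option once Enter is pressed:

    import { Selector } from "./build/utils/selector.js";

    const picker = new Selector("Select sources to sync:", [
        { name: "HuggingFace", value: "huggingface", selected: true },
        { name: "UCI", value: "uci", selected: false },
        { name: "GitHub", value: "github", selected: false },
    ]);
    const chosen = await picker.run(); // e.g. ["huggingface", "uci"]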
package/package.json
CHANGED

@@ -1,11 +1,12 @@
 {
   "name": "@vespermcp/mcp-server",
-  "version": "1.0.4",
+  "version": "1.0.6",
   "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
   "type": "module",
   "main": "build/index.js",
   "bin": {
-    "
+    "vespermcp": "./build/index.js",
+    "@vespermcp/mcp-server": "./build/index.js"
   },
   "files": [
     "build/**/*",
@@ -16,7 +17,7 @@
     "mcp-config-template.json"
   ],
   "scripts": {
-    "build": "tsc",
+    "build": "tsc && node -e \"const fs=require('fs');const path=require('path');const src='src/python';const dest='build/python';if(!fs.existsSync(dest))fs.mkdirSync(dest,{recursive:true});fs.readdirSync(src).forEach(f=>{if(f.endsWith('.py'))fs.copyFileSync(path.join(src,f),path.join(dest,f));});console.log('✅ Copied Python scripts to build/python');\"",
     "dev": "tsx watch src/index.ts",
     "postinstall": "node scripts/postinstall.cjs",
     "scrape": "tsx src/scripts/scrape-metadata.ts",
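
The new `build` step inlines a copy script via `node -e`; this is what populates the `build/python/*.py` files in the list above. Unpacked into a standalone file for readability (a hypothetical `scripts/copy-python.cjs`; the package ships the logic inline, not as a file):

    // Hypothetical standalone version of the inline node -e step.
    const fs = require("fs");
    const path = require("path");

    const src = "src/python";
    const dest = "build/python";

    if (!fs.existsSync(dest)) fs.mkdirSync(dest, { recursive: true });
    // Mirror every .py engine into the published build tree.
    for (const f of fs.readdirSync(src)) {
        if (f.endsWith(".py")) fs.copyFileSync(path.join(src, f), path.join(dest, f));
    }
    console.log("✅ Copied Python scripts to build/python");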
@@ -26,7 +27,10 @@
     "check-db": "tsx src/scripts/check-db.ts",
     "test-jit": "tsx src/scripts/test-jit.ts",
     "demo-ui": "tsx src/scripts/demo-ui.ts",
-    "
+    "setup": "node build/index.js --setup",
+    "setup:silent": "node build/index.js --setup --silent",
+    "test": "vitest",
+    "start": "node build/index.js"
   },
   "keywords": [
     "mcp",
package/src/python/cleaner.py
CHANGED

@@ -152,7 +152,15 @@ def main():
         op_type = op["type"]
         params = op.get("params", {})

-        if op_type
+        if op_type == "RenameTarget":
+            old_name = params.get("old_name")
+            new_name = params.get("new_name", "target")
+            if old_name and old_name in df.columns:
+                df = df.rename({old_name: new_name})
+                logs.append(f"Renamed column '{old_name}' to '{new_name}'")
+            else:
+                logs.append(f"Failed RenameTarget: Column '{old_name}' not found or not specified.")
+        elif op_type in OPERATIONS:
             try:
                 df, stats = OPERATIONS[op_type](df, params)
                 logs.append(f"Executed {op_type}: {stats}")
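
The new `RenameTarget` branch reads its arguments from the op's `params`. In the plan this script consumes, such an op would look like the following (expressed as a JS object since plans are produced on the Node side; the surrounding plan structure is an assumption, the field names come straight from the branch above):

    const op = {
        type: "RenameTarget",
        params: { old_name: "SalePrice", new_name: "target" }, // new_name defaults to "target"
    };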
@@ -176,6 +184,28 @@ def main():
         base_name = file_path.rsplit(".", 1)[0]
         if output_format == "csv":
             output_path = f"{base_name}_cleaned.csv"
+            # Stringify ANY column that might not be CSV-safe (List, Struct, Object, etc.)
+            for col in df.columns:
+                dtype = df.schema[col]
+                # Only keep simple types; stringify everything else for CSV
+                is_simple = (
+                    dtype.is_numeric() or
+                    dtype.is_temporal() or
+                    str(dtype).lower() in ["string", "utf8", "boolean", "bool"]
+                )
+                if not is_simple:
+                    # Use a robust helper for clean JSON serialization
+                    def safe_serialize(val):
+                        try:
+                            # Handle Polars nested types (convert to Python list/dict first)
+                            if hasattr(val, "to_list"):
+                                return json.dumps(val.to_list())
+                            if hasattr(val, "to_dict"):
+                                return json.dumps(val.to_dict())
+                            return json.dumps(val)
+                        except:
+                            return str(val)
+                    df = df.with_columns(pl.col(col).map_elements(safe_serialize, return_dtype=pl.Utf8))
             df.write_csv(output_path)
         else:
             output_path = f"{base_name}_cleaned.parquet"
@@ -186,10 +216,10 @@ def main():
             "output_path": output_path,
             "rows_affected": total_rows_affected,
             "logs": logs
-        }))
+        }, default=str))

     except Exception as e:
-        print(json.dumps({"success": False, "error": str(e)}))
+        print(json.dumps({"success": False, "error": str(e)}, default=str))
         sys.exit(1)

 if __name__ == "__main__":
package/src/python/export_engine.py
CHANGED

@@ -31,6 +31,25 @@ def export_data(file_path, output_path, format, options=None):
     try:
         # Export Logic
         if format == "csv":
+            # Stringify ANY column that might not be CSV-safe (List, Struct, Object, etc.)
+            for col in df.columns:
+                dtype = df.schema[col]
+                is_simple = (
+                    dtype.is_numeric() or
+                    dtype.is_temporal() or
+                    str(dtype).lower() in ["string", "utf8", "boolean", "bool"]
+                )
+                if not is_simple:
+                    def safe_serialize(val):
+                        try:
+                            if hasattr(val, "to_list"):
+                                return json.dumps(val.to_list())
+                            if hasattr(val, "to_dict"):
+                                return json.dumps(val.to_dict())
+                            return json.dumps(val)
+                        except:
+                            return str(val)
+                    df = df.with_columns(pl.col(col).map_elements(safe_serialize, return_dtype=pl.Utf8))
             df.write_csv(output_path)

         elif format == "parquet":
package/src/python/target_engine.py
ADDED

@@ -0,0 +1,154 @@
+import sys
+import json
+import pandas as pd
+import numpy as np
+
+# Common names for target variables in datasets
+TARGET_CANDIDATES = [
+    'target', 'label', 'class', 'outcome', 'y',
+    'price', 'saleprice', 'sales', 'cost', 'value', 'total',
+    'diagnosis', 'species', 'churn', 'survived', 'credit_risk'
+]
+
+def load_data(file_path):
+    if file_path.endswith('.csv'):
+        return pd.read_csv(file_path)
+    elif file_path.endswith('.parquet'):
+        return pd.read_parquet(file_path)
+    else:
+        raise ValueError("Unsupported file format")
+
+def detect_target(file_path):
+    try:
+        df = load_data(file_path)
+        columns = [c.lower() for c in df.columns]
+        candidates = []
+
+        # 1. Exact Name Match
+        for col_original in df.columns:
+            col_lower = col_original.lower()
+            confidence = 0.0
+            reasons = []
+
+            if col_lower in TARGET_CANDIDATES:
+                confidence += 0.6
+                reasons.append(f"Matches common target name '{col_lower}'")
+
+                # Boost if exact match 'target' or 'label'
+                if col_lower in ['target', 'label', 'class']:
+                    confidence += 0.2
+
+            # 2. Position Heuristic (Last column is often target)
+            if col_original == df.columns[-1]:
+                confidence += 0.3
+                reasons.append("Is the last column")
+
+            # 3. Completeness
+            missing_rate = df[col_original].isnull().mean()
+            if missing_rate > 0.5:
+                confidence -= 0.5
+                reasons.append(f"High missing rate ({missing_rate:.1%})")
+            elif missing_rate > 0:
+                confidence -= 0.1
+                reasons.append(f"Has missing values ({missing_rate:.1%})")
+
+            # 4. Cardinality / Unique Values
+            # If regression-like (many unique numeric values) or class-like (few unique values)
+            # This is hard to score generally, but extremes are bad for targets (e.g. all unique = ID usually)
+            n_unique = df[col_original].nunique()
+            if n_unique == len(df):
+                confidence -= 0.8
+                reasons.append("All values are unique (likely ID)")
+
+            if confidence > 0.3:
+                candidates.append({
+                    "column": col_original,
+                    "confidence": min(confidence, 1.0),
+                    "reason": reasons
+                })
+
+        # Sort by confidence
+        candidates.sort(key=lambda x: x['confidence'], reverse=True)
+
+        best_target = None
+        best_conf = 0.0
+
+        if candidates:
+            best_target = candidates[0]['column']
+            best_conf = candidates[0]['confidence']
+
+        return {
+            "target_column": best_target,
+            "confidence": best_conf,
+            "candidates": candidates,
+            "is_unified": False  # Wrapper will handle unification logic
+        }
+
+    except Exception as e:
+        return {"error": str(e)}
+
+def validate_target(file_path, target_column):
+    try:
+        df = load_data(file_path)
+        if target_column not in df.columns:
+            return {"error": f"Column '{target_column}' not found in dataset."}
+
+        series = df[target_column]
+        total_rows = len(df)
+        missing_count = series.isnull().sum()
+
+        # Determine type
+        is_numeric = pd.api.types.is_numeric_dtype(series)
+        n_unique = series.nunique()
+
+        problem_type = "unknown"
+        if is_numeric and n_unique > 20:
+            problem_type = "regression"
+        elif n_unique < 50:  # String or few numeric values
+            problem_type = "classification"
+        else:
+            # Heuristic fallback
+            problem_type = "regression" if is_numeric else "classification"
+
+        warnings = []
+        if missing_count > 0:
+            warnings.append(f"Target has {missing_count} missing values.")
+
+        # Imbalance check for classification
+        if problem_type == "classification":
+            counts = series.value_counts(normalize=True)
+            if counts.iloc[0] > 0.9:  # Dominant class > 90%
+                warnings.append(f"Highly imbalanced target: Class '{counts.index[0]}' is {counts.iloc[0]:.1%}")
+
+        return {
+            "valid": True,
+            "problem_type": problem_type,
+            "missing_count": int(missing_count),
+            "total_rows": total_rows,
+            "warnings": warnings
+        }
+
+    except Exception as e:
+        return {"error": str(e)}
+
+if __name__ == "__main__":
+    if len(sys.argv) < 3:
+        print(json.dumps({"error": "Usage: target_engine.py <action> <file_path> [args]"}))
+        sys.exit(1)
+
+    action = sys.argv[1]
+    file_path = sys.argv[2]
+
+    result = {}
+    if action == "detect":
+        result = detect_target(file_path)
+    elif action == "validate":
+        target_col = sys.argv[3] if len(sys.argv) > 3 else None
+        if target_col:
+            result = validate_target(file_path, target_col)
+        else:
+            result = {"error": "Target column required for validation"}
+    else:
+        result = {"error": f"Unknown action: {action}"}
+
+    print(json.dumps(result))