@vespermcp/mcp-server 1.2.9 → 1.2.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/cleaning/cleaner.js +7 -1
- package/build/cleaning/executor.js +6 -5
- package/build/cleaning/planner.js +4 -0
- package/build/index.js +39 -16
- package/build/ingestion/ingestor.js +1 -1
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/config.py +14 -12
- package/build/python/kaggle_engine.py +6 -7
- package/build/python/vesper/core/asset_downloader.py +11 -15
- package/package.json +3 -2
- package/src/python/config.py +14 -12
- package/src/python/kaggle_engine.py +6 -7
- package/src/python/vesper/core/asset_downloader.py +11 -15
|
@@ -63,7 +63,13 @@ export class DataCleaner {
|
|
|
63
63
|
reject(new Error(result.error));
|
|
64
64
|
}
|
|
65
65
|
else {
|
|
66
|
-
resolve(
|
|
66
|
+
resolve({
|
|
67
|
+
success: true,
|
|
68
|
+
rows_affected: Number(result.rows_affected ?? 0),
|
|
69
|
+
columns_affected: Number(result.columns_affected ?? 0),
|
|
70
|
+
output_path: result.output_path,
|
|
71
|
+
logs: Array.isArray(result.logs) ? result.logs : [],
|
|
72
|
+
});
|
|
67
73
|
}
|
|
68
74
|
}
|
|
69
75
|
catch (e) {
|
|
@@ -14,7 +14,7 @@ export class PipelineExecutor {
|
|
|
14
14
|
/**
|
|
15
15
|
* Run the full Auto-Cleaning Pipeline on a dataset file
|
|
16
16
|
*/
|
|
17
|
-
async runPipeline(datasetId, filePath, outputFormat = "
|
|
17
|
+
async runPipeline(datasetId, filePath, outputFormat = "parquet", onProgress) {
|
|
18
18
|
// ... (logging setup)
|
|
19
19
|
const log = (m) => {
|
|
20
20
|
console.error(`[Pipeline] ${m}`);
|
|
@@ -26,9 +26,10 @@ export class PipelineExecutor {
|
|
|
26
26
|
// 2. Generate Plan
|
|
27
27
|
log(`Generating cleaning plan...`);
|
|
28
28
|
const plan = await this.planner.generatePlan(datasetId, qualityReport);
|
|
29
|
+
const rules = plan.operations;
|
|
29
30
|
// If no cleaning needed, we still might need format conversion
|
|
30
31
|
const needsConversion = !filePath.toLowerCase().endsWith(`.${outputFormat}`);
|
|
31
|
-
if (
|
|
32
|
+
if (rules.length === 0 && !needsConversion) {
|
|
32
33
|
log(`No cleaning or conversion needed.`);
|
|
33
34
|
return {
|
|
34
35
|
initial_quality: qualityReport,
|
|
@@ -42,9 +43,9 @@ export class PipelineExecutor {
|
|
|
42
43
|
};
|
|
43
44
|
}
|
|
44
45
|
// 3. Execute Plan (includes conversion if requested)
|
|
45
|
-
log(`Executing ${
|
|
46
|
-
|
|
47
|
-
const cleaningResult = await this.cleaner.clean(filePath,
|
|
46
|
+
log(`Executing ${rules.length} operations (Format: ${outputFormat})...`);
|
|
47
|
+
rules.forEach(op => console.error(` - ${op.type}: ${op.reason}`));
|
|
48
|
+
const cleaningResult = await this.cleaner.clean(filePath, rules, outputFormat);
|
|
48
49
|
if (cleaningResult.success) {
|
|
49
50
|
log(`Cleaning complete. Output: ${cleaningResult.output_path}`);
|
|
50
51
|
}
|
|
@@ -110,6 +110,10 @@ export class CleaningPlanner {
|
|
|
110
110
|
}
|
|
111
111
|
return plan;
|
|
112
112
|
}
|
|
113
|
+
async generateRules(datasetId, report, ruleSet, targetInfo) {
|
|
114
|
+
const plan = await this.generatePlan(datasetId, report, ruleSet, targetInfo);
|
|
115
|
+
return plan.operations;
|
|
116
|
+
}
|
|
113
117
|
shouldFixType(col) {
|
|
114
118
|
if (col.inferred_type && col.inferred_type.includes("Numeric") && (col.type.includes("String") || col.type.includes("Utf8"))) {
|
|
115
119
|
return true;
|
package/build/index.js
CHANGED
|
@@ -469,11 +469,20 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
469
469
|
*/
|
|
470
470
|
async function handleCleanJob(jobId, datasetId, ops) {
|
|
471
471
|
const update = (updates) => jobManager.updateJob(jobId, updates);
|
|
472
|
-
|
|
472
|
+
const safeId = datasetId.replace(/\//g, "_");
|
|
473
|
+
const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
|
|
474
|
+
const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
|
|
475
|
+
let filePath = parquetPath;
|
|
476
|
+
if (!fs.existsSync(filePath)) {
|
|
477
|
+
filePath = csvPath;
|
|
478
|
+
}
|
|
473
479
|
if (datasetId === "demo" || !fs.existsSync(filePath)) {
|
|
474
|
-
const
|
|
475
|
-
|
|
476
|
-
|
|
480
|
+
const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
|
|
481
|
+
const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
|
|
482
|
+
if (fs.existsSync(demoParquetPath))
|
|
483
|
+
filePath = demoParquetPath;
|
|
484
|
+
else if (fs.existsSync(demoCsvPath))
|
|
485
|
+
filePath = demoCsvPath;
|
|
477
486
|
else
|
|
478
487
|
throw new Error(`Data file not found for ${datasetId}`);
|
|
479
488
|
}
|
|
@@ -714,7 +723,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
714
723
|
},
|
|
715
724
|
{
|
|
716
725
|
name: "export_dataset",
|
|
717
|
-
description: "Export a dataset to a local directory. Use format='
|
|
726
|
+
description: "Export a dataset to a local directory. Use format='parquet' (default) for efficient analytics and broad interoperability. Add fast=true to skip quality/cleaning steps.",
|
|
718
727
|
inputSchema: {
|
|
719
728
|
type: "object",
|
|
720
729
|
properties: {
|
|
@@ -729,7 +738,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
729
738
|
format: {
|
|
730
739
|
type: "string",
|
|
731
740
|
enum: ["feather", "parquet", "csv", "jsonl", "arrow"],
|
|
732
|
-
description: "Output format.
|
|
741
|
+
description: "Output format. parquet (default, analytics-friendly), feather (fast local IO), csv (human-readable).",
|
|
733
742
|
},
|
|
734
743
|
compression: {
|
|
735
744
|
type: "string",
|
|
@@ -800,7 +809,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
800
809
|
output_format: {
|
|
801
810
|
type: "string",
|
|
802
811
|
enum: ["feather", "parquet", "csv", "jsonl", "arrow"],
|
|
803
|
-
description: "Output format (default:
|
|
812
|
+
description: "Output format (default: parquet).",
|
|
804
813
|
},
|
|
805
814
|
compression: {
|
|
806
815
|
type: "string",
|
|
@@ -1144,12 +1153,19 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1144
1153
|
}
|
|
1145
1154
|
case "analyze_quality": {
|
|
1146
1155
|
const datasetId = String(request.params.arguments?.dataset_id);
|
|
1147
|
-
|
|
1156
|
+
const safeId = datasetId.replace(/\//g, "_");
|
|
1157
|
+
const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
|
|
1158
|
+
const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
|
|
1159
|
+
let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
|
|
1148
1160
|
// Demo Fallback for easy testing
|
|
1149
1161
|
if (datasetId === "demo" || !fs.existsSync(filePath)) {
|
|
1150
|
-
const
|
|
1151
|
-
|
|
1152
|
-
|
|
1162
|
+
const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
|
|
1163
|
+
const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
|
|
1164
|
+
if (fs.existsSync(demoParquetPath)) {
|
|
1165
|
+
filePath = demoParquetPath;
|
|
1166
|
+
}
|
|
1167
|
+
else if (fs.existsSync(demoCsvPath)) {
|
|
1168
|
+
filePath = demoCsvPath;
|
|
1153
1169
|
}
|
|
1154
1170
|
else if (datasetId !== "demo") {
|
|
1155
1171
|
return {
|
|
@@ -1165,11 +1181,18 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1165
1181
|
}
|
|
1166
1182
|
case "preview_cleaning": {
|
|
1167
1183
|
const datasetId = String(request.params.arguments?.dataset_id);
|
|
1168
|
-
|
|
1184
|
+
const safeId = datasetId.replace(/\//g, "_");
|
|
1185
|
+
const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
|
|
1186
|
+
const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
|
|
1187
|
+
let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
|
|
1169
1188
|
if (datasetId === "demo" || !fs.existsSync(filePath)) {
|
|
1170
|
-
const
|
|
1171
|
-
|
|
1172
|
-
|
|
1189
|
+
const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
|
|
1190
|
+
const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
|
|
1191
|
+
if (fs.existsSync(demoParquetPath)) {
|
|
1192
|
+
filePath = demoParquetPath;
|
|
1193
|
+
}
|
|
1194
|
+
else if (fs.existsSync(demoCsvPath)) {
|
|
1195
|
+
filePath = demoCsvPath;
|
|
1173
1196
|
}
|
|
1174
1197
|
else {
|
|
1175
1198
|
throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}. Please run prepare_dataset first.`);
|
|
@@ -1291,7 +1314,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1291
1314
|
// If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
|
|
1292
1315
|
if (!fastMode) {
|
|
1293
1316
|
const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
|
|
1294
|
-
const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "
|
|
1317
|
+
const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
|
|
1295
1318
|
if (currentExt !== pipelineFmt) {
|
|
1296
1319
|
console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
|
|
1297
1320
|
try {
|
|
@@ -152,7 +152,7 @@ export class DataIngestor {
|
|
|
152
152
|
/**
|
|
153
153
|
* Generates a safe local filename for a dataset ID
|
|
154
154
|
*/
|
|
155
|
-
getTargetPath(datasetId, extension = "
|
|
155
|
+
getTargetPath(datasetId, extension = "parquet") {
|
|
156
156
|
const safeId = datasetId.replace(/\//g, "_").replace(/:/g, "_");
|
|
157
157
|
return path.join(this.rawDataDir, `${safeId}.${extension}`);
|
|
158
158
|
}
|
|
Binary file
|
package/build/python/config.py
CHANGED
|
@@ -155,12 +155,7 @@ def _get_from_env(name: str) -> Optional[str]:
|
|
|
155
155
|
|
|
156
156
|
|
|
157
157
|
def get_key(name: str) -> Optional[str]:
|
|
158
|
-
# 1)
|
|
159
|
-
env_val = _get_from_env(name)
|
|
160
|
-
if env_val:
|
|
161
|
-
return env_val
|
|
162
|
-
|
|
163
|
-
# 2) keyring (secure)
|
|
158
|
+
# 1) keyring (secure)
|
|
164
159
|
if HAS_KEYRING:
|
|
165
160
|
try:
|
|
166
161
|
val = keyring.get_password(SERVICE_NAME, name)
|
|
@@ -169,14 +164,21 @@ def get_key(name: str) -> Optional[str]:
|
|
|
169
164
|
except Exception:
|
|
170
165
|
pass
|
|
171
166
|
|
|
172
|
-
#
|
|
167
|
+
# 2) encrypted fallback config.toml
|
|
173
168
|
fallback = _read_fallback_toml()
|
|
174
169
|
enc = fallback.get(name)
|
|
175
|
-
if
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
170
|
+
if enc:
|
|
171
|
+
secret = _get_or_create_local_secret()
|
|
172
|
+
method = fallback.get("__method__", "fernet" if HAS_FERNET else "xor")
|
|
173
|
+
dec = _decrypt_value(enc, method, secret)
|
|
174
|
+
if dec:
|
|
175
|
+
return dec
|
|
176
|
+
|
|
177
|
+
# 3) env vars (fallback only)
|
|
178
|
+
env_val = _get_from_env(name)
|
|
179
|
+
if env_val:
|
|
180
|
+
return env_val
|
|
181
|
+
return None
|
|
180
182
|
|
|
181
183
|
|
|
182
184
|
def set_key(name: str, value: str) -> Dict[str, str]:
|
|
@@ -20,14 +20,13 @@ def _ensure_auth() -> Dict[str, Any]:
|
|
|
20
20
|
}
|
|
21
21
|
|
|
22
22
|
# Priority:
|
|
23
|
-
# 1)
|
|
24
|
-
# 2)
|
|
23
|
+
# 1) secure local store (keyring or ~/.vesper/config.toml)
|
|
24
|
+
# 2) existing env vars
|
|
25
25
|
# 3) ~/.kaggle/kaggle.json handled by KaggleApi.authenticate()
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
os.environ["KAGGLE_KEY"] = keys["kaggle_key"]
|
|
26
|
+
keys = get_all()
|
|
27
|
+
if keys.get("kaggle_username") and keys.get("kaggle_key"):
|
|
28
|
+
os.environ["KAGGLE_USERNAME"] = keys["kaggle_username"]
|
|
29
|
+
os.environ["KAGGLE_KEY"] = keys["kaggle_key"]
|
|
31
30
|
|
|
32
31
|
api = KaggleApi()
|
|
33
32
|
try:
|
|
@@ -62,23 +62,19 @@ class AssetDownloader:
|
|
|
62
62
|
|
|
63
63
|
@staticmethod
|
|
64
64
|
def _hydrate_kaggle_credentials() -> None:
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
from config import get_all # type: ignore
|
|
71
|
-
keys = get_all() or {}
|
|
72
|
-
except Exception:
|
|
73
|
-
keys = {}
|
|
65
|
+
try:
|
|
66
|
+
from config import get_all # type: ignore
|
|
67
|
+
keys = get_all() or {}
|
|
68
|
+
except Exception:
|
|
69
|
+
keys = {}
|
|
74
70
|
|
|
75
|
-
|
|
76
|
-
|
|
71
|
+
username = keys.get("kaggle_username") or os.getenv("KAGGLE_USERNAME")
|
|
72
|
+
key = keys.get("kaggle_key") or os.getenv("KAGGLE_KEY")
|
|
77
73
|
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
74
|
+
if username:
|
|
75
|
+
os.environ["KAGGLE_USERNAME"] = str(username)
|
|
76
|
+
if key:
|
|
77
|
+
os.environ["KAGGLE_KEY"] = str(key)
|
|
82
78
|
|
|
83
79
|
username = os.getenv("KAGGLE_USERNAME")
|
|
84
80
|
key = os.getenv("KAGGLE_KEY")
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vespermcp/mcp-server",
|
|
3
|
-
"version": "1.2.9",
|
|
3
|
+
"version": "1.2.11",
|
|
4
4
|
"description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "build/index.js",
|
|
@@ -91,5 +91,6 @@
|
|
|
91
91
|
"tsx": "^4.21.0",
|
|
92
92
|
"typescript": "^5.9.3",
|
|
93
93
|
"vitest": "^4.0.17"
|
|
94
|
-
}
|
|
94
|
+
},
|
|
95
|
+
"packageManager": "pnpm@10.18.1+sha512.77a884a165cbba2d8d1c19e3b4880eee6d2fcabd0d879121e282196b80042351d5eb3ca0935fa599da1dc51265cc68816ad2bddd2a2de5ea9fdf92adbec7cd34"
|
|
95
96
|
}
|
package/src/python/config.py
CHANGED
|
@@ -155,12 +155,7 @@ def _get_from_env(name: str) -> Optional[str]:
|
|
|
155
155
|
|
|
156
156
|
|
|
157
157
|
def get_key(name: str) -> Optional[str]:
|
|
158
|
-
# 1)
|
|
159
|
-
env_val = _get_from_env(name)
|
|
160
|
-
if env_val:
|
|
161
|
-
return env_val
|
|
162
|
-
|
|
163
|
-
# 2) keyring (secure)
|
|
158
|
+
# 1) keyring (secure)
|
|
164
159
|
if HAS_KEYRING:
|
|
165
160
|
try:
|
|
166
161
|
val = keyring.get_password(SERVICE_NAME, name)
|
|
@@ -169,14 +164,21 @@ def get_key(name: str) -> Optional[str]:
|
|
|
169
164
|
except Exception:
|
|
170
165
|
pass
|
|
171
166
|
|
|
172
|
-
#
|
|
167
|
+
# 2) encrypted fallback config.toml
|
|
173
168
|
fallback = _read_fallback_toml()
|
|
174
169
|
enc = fallback.get(name)
|
|
175
|
-
if
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
170
|
+
if enc:
|
|
171
|
+
secret = _get_or_create_local_secret()
|
|
172
|
+
method = fallback.get("__method__", "fernet" if HAS_FERNET else "xor")
|
|
173
|
+
dec = _decrypt_value(enc, method, secret)
|
|
174
|
+
if dec:
|
|
175
|
+
return dec
|
|
176
|
+
|
|
177
|
+
# 3) env vars (fallback only)
|
|
178
|
+
env_val = _get_from_env(name)
|
|
179
|
+
if env_val:
|
|
180
|
+
return env_val
|
|
181
|
+
return None
|
|
180
182
|
|
|
181
183
|
|
|
182
184
|
def set_key(name: str, value: str) -> Dict[str, str]:
|
|
@@ -20,14 +20,13 @@ def _ensure_auth() -> Dict[str, Any]:
|
|
|
20
20
|
}
|
|
21
21
|
|
|
22
22
|
# Priority:
|
|
23
|
-
# 1)
|
|
24
|
-
# 2)
|
|
23
|
+
# 1) secure local store (keyring or ~/.vesper/config.toml)
|
|
24
|
+
# 2) existing env vars
|
|
25
25
|
# 3) ~/.kaggle/kaggle.json handled by KaggleApi.authenticate()
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
os.environ["KAGGLE_KEY"] = keys["kaggle_key"]
|
|
26
|
+
keys = get_all()
|
|
27
|
+
if keys.get("kaggle_username") and keys.get("kaggle_key"):
|
|
28
|
+
os.environ["KAGGLE_USERNAME"] = keys["kaggle_username"]
|
|
29
|
+
os.environ["KAGGLE_KEY"] = keys["kaggle_key"]
|
|
31
30
|
|
|
32
31
|
api = KaggleApi()
|
|
33
32
|
try:
|
|
@@ -62,23 +62,19 @@ class AssetDownloader:
|
|
|
62
62
|
|
|
63
63
|
@staticmethod
|
|
64
64
|
def _hydrate_kaggle_credentials() -> None:
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
from config import get_all # type: ignore
|
|
71
|
-
keys = get_all() or {}
|
|
72
|
-
except Exception:
|
|
73
|
-
keys = {}
|
|
65
|
+
try:
|
|
66
|
+
from config import get_all # type: ignore
|
|
67
|
+
keys = get_all() or {}
|
|
68
|
+
except Exception:
|
|
69
|
+
keys = {}
|
|
74
70
|
|
|
75
|
-
|
|
76
|
-
|
|
71
|
+
username = keys.get("kaggle_username") or os.getenv("KAGGLE_USERNAME")
|
|
72
|
+
key = keys.get("kaggle_key") or os.getenv("KAGGLE_KEY")
|
|
77
73
|
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
74
|
+
if username:
|
|
75
|
+
os.environ["KAGGLE_USERNAME"] = str(username)
|
|
76
|
+
if key:
|
|
77
|
+
os.environ["KAGGLE_KEY"] = str(key)
|
|
82
78
|
|
|
83
79
|
username = os.getenv("KAGGLE_USERNAME")
|
|
84
80
|
key = os.getenv("KAGGLE_KEY")
|