@vespermcp/mcp-server 1.2.4 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -131,8 +131,83 @@ function extractRequestedRows(query, requirements) {
131
131
  return Math.max(...allNums);
132
132
  return undefined;
133
133
  }
134
+ const verifiedPythonModules = new Set();
135
/**
 * Picks the executable name used to launch Python on this host.
 *
 * @returns {string} "py" when process.platform is "win32", otherwise "python".
 */
function getPythonCommand() {
    if (process.platform === "win32") {
        return "py";
    }
    return "python";
}
138
/**
 * Spawns the Python interpreter returned by getPythonCommand() and captures its output.
 *
 * The child inherits the current environment plus PIP_DISABLE_PIP_VERSION_CHECK=1
 * and PYTHONUTF8=1.
 *
 * @param {string[]} args - Command-line arguments passed to the interpreter.
 * @param {number} [timeoutMs=300000] - Milliseconds to wait before killing the child.
 * @returns {Promise<{code: number, stdout: string, stderr: string}>} Resolves when the
 *   child closes; `code` falls back to 1 when the exit code is null.
 *   Rejects if the child cannot be spawned or the timeout elapses.
 */
function runPythonProcess(args, timeoutMs = 300000) {
    const pyCmd = getPythonCommand();
    return new Promise((resolve, reject) => {
        const proc = spawn(pyCmd, args, {
            env: {
                ...process.env,
                PIP_DISABLE_PIP_VERSION_CHECK: "1",
                PYTHONUTF8: "1",
            },
        });
        // Decode at the stream level: per-chunk toString() corrupts any multi-byte
        // UTF-8 character that happens to be split across two data events.
        proc.stdout.setEncoding("utf8");
        proc.stderr.setEncoding("utf8");
        let stdout = "";
        let stderr = "";
        proc.stdout.on("data", (chunk) => {
            stdout += chunk;
        });
        proc.stderr.on("data", (chunk) => {
            stderr += chunk;
        });
        let timer;
        let settled = false;
        // Timeout, "close" and "error" can all fire for one child; only the first wins.
        const settle = (finish, value) => {
            if (settled) {
                return;
            }
            settled = true;
            clearTimeout(timer);
            finish(value);
        };
        timer = setTimeout(() => {
            try {
                proc.kill();
            }
            catch {
                // Deliberate no-op: the timeout is reported below whether or not kill() worked.
            }
            settle(reject, new Error(`Python command timed out after ${timeoutMs}ms: ${args.join(" ")}`));
        }, timeoutMs);
        proc.on("close", (code) => {
            settle(resolve, { code: code ?? 1, stdout, stderr });
        });
        proc.on("error", (error) => {
            settle(reject, error);
        });
    });
}
171
/**
 * Makes sure every requested Python module can be located, installing missing ones with pip.
 *
 * Each module not yet recorded in `verifiedPythonModules` is probed with
 * `importlib.util.find_spec` in a separate interpreter run. Packages for the modules
 * that were not found are installed in a single `pip install`; if that fails the
 * install is retried once with `--user`. Modules are recorded as verified after a
 * successful probe or a successful install.
 *
 * @param {Array<{module: string, packageName: string}>} modulePackagePairs - Import
 *   name and the pip package that provides it.
 * @returns {Promise<void>}
 * @throws {Error} When both install attempts exit non-zero, or when running the
 *   interpreter itself fails.
 */
async function ensurePythonModules(modulePackagePairs) {
    const unresolved = [];
    for (const entry of modulePackagePairs) {
        if (verifiedPythonModules.has(entry.module)) {
            continue;
        }
        const probeSource = `import importlib.util,sys; sys.exit(0 if importlib.util.find_spec(${JSON.stringify(entry.module)}) else 1)`;
        const probe = await runPythonProcess(["-c", probeSource], 20000);
        if (probe.code !== 0) {
            unresolved.push(entry);
            continue;
        }
        verifiedPythonModules.add(entry.module);
    }
    if (unresolved.length === 0) {
        return;
    }
    // Several modules may map to the same pip package; install each package once.
    const packages = Array.from(new Set(unresolved.map((entry) => entry.packageName)));
    const packageList = packages.join(", ");
    console.error(`[Vesper] Installing missing Python packages: ${packageList}`);
    const pipBaseArgs = ["-m", "pip", "install", "--disable-pip-version-check"];
    let install = await runPythonProcess([...pipBaseArgs, ...packages], 600000);
    if (install.code !== 0) {
        console.error(`[Vesper] pip install failed (trying --user fallback): ${(install.stderr || "").slice(0, 300)}`);
        install = await runPythonProcess([...pipBaseArgs, "--user", ...packages], 600000);
    }
    if (install.code !== 0) {
        const details = (install.stderr || install.stdout || "Unknown pip install error").trim();
        throw new Error(`Failed to install required Python packages (${packageList}). ${details}`);
    }
    console.error(`[Vesper] Successfully installed: ${packageList}`);
    unresolved.forEach((entry) => {
        verifiedPythonModules.add(entry.module);
    });
}
134
209
  function runPythonJson(scriptPath, args) {
135
- const pyCmd = process.platform === "win32" ? "py" : "python";
210
+ const pyCmd = getPythonCommand();
136
211
  return new Promise((resolve, reject) => {
137
212
  const proc = spawn(pyCmd, [scriptPath, ...args]);
138
213
  let stdout = "";
@@ -167,6 +242,25 @@ function syncPythonScripts(appRoot, dataRoot) {
167
242
  const pythonDest = path.join(dataRoot, "python");
168
243
  if (!fs.existsSync(pythonDest))
169
244
  fs.mkdirSync(pythonDest, { recursive: true });
245
/**
 * Lists every regular file ending in ".py" beneath `dir`, descending into subdirectories.
 *
 * Uses an explicit stack instead of recursion. Returns an empty array when `dir`
 * does not exist.
 *
 * @param {string} dir - Directory to scan.
 * @returns {string[]} Paths built by joining `dir` with each nested entry name.
 */
const collectPyFiles = (dir) => {
    const found = [];
    if (!fs.existsSync(dir)) {
        return found;
    }
    const pending = [dir];
    while (pending.length > 0) {
        const current = pending.pop();
        const entries = fs.readdirSync(current, { withFileTypes: true });
        for (const entry of entries) {
            const entryPath = path.join(current, entry.name);
            if (entry.isDirectory()) {
                pending.push(entryPath);
                continue;
            }
            if (entry.isFile() && entryPath.endsWith(".py")) {
                found.push(entryPath);
            }
        }
    }
    return found;
};
170
264
  // Sources to check for Python scripts
171
265
  const sources = [
172
266
  path.join(appRoot, "src", "python"),
@@ -175,25 +269,21 @@ function syncPythonScripts(appRoot, dataRoot) {
175
269
  ];
176
270
  let syncedCount = 0;
177
271
  for (const src of sources) {
178
- if (fs.existsSync(src)) {
179
- const files = fs.readdirSync(src);
180
- for (const file of files) {
181
- if (file.endsWith(".py")) {
182
- const srcPath = path.join(src, file);
183
- const destPath = path.join(pythonDest, file);
184
- // Only copy if file doesn't exist or is different size (basic sync)
185
- const srcStat = fs.statSync(srcPath);
186
- let shouldCopy = true;
187
- if (fs.existsSync(destPath)) {
188
- const destStat = fs.statSync(destPath);
189
- if (srcStat.size === destStat.size)
190
- shouldCopy = false;
191
- }
192
- if (shouldCopy) {
193
- fs.copyFileSync(srcPath, destPath);
194
- syncedCount++;
195
- }
196
- }
272
+ const files = collectPyFiles(src);
273
+ for (const srcPath of files) {
274
+ const rel = path.relative(src, srcPath);
275
+ const destPath = path.join(pythonDest, rel);
276
+ const srcStat = fs.statSync(srcPath);
277
+ let shouldCopy = true;
278
+ if (fs.existsSync(destPath)) {
279
+ const destStat = fs.statSync(destPath);
280
+ if (srcStat.size === destStat.size)
281
+ shouldCopy = false;
282
+ }
283
+ if (shouldCopy) {
284
+ fs.mkdirSync(path.dirname(destPath), { recursive: true });
285
+ fs.copyFileSync(srcPath, destPath);
286
+ syncedCount++;
197
287
  }
198
288
  }
199
289
  }
@@ -471,6 +561,25 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
471
561
  required: ["source", "dataset_id"],
472
562
  },
473
563
  },
564
+ {
565
+ name: "vesper_download_assets",
566
+ description: "Download real image/media assets using smart source-aware recipes (HuggingFace, Kaggle, direct URL).",
567
+ inputSchema: {
568
+ type: "object",
569
+ properties: {
570
+ dataset_id: { type: "string", description: "Unique dataset identifier." },
571
+ source: { type: "string", enum: ["huggingface", "kaggle", "url"], description: "Asset source type." },
572
+ repo_id: { type: "string", description: "Repo ID for HuggingFace (e.g. cifar100)." },
573
+ kaggle_ref: { type: "string", description: "Kaggle dataset ref (owner/dataset)." },
574
+ urls: { type: "array", items: { type: "string" }, description: "Direct asset URLs." },
575
+ output_format: { type: "string", enum: ["webdataset", "imagefolder", "parquet"], description: "Output asset format." },
576
+ max_items: { type: "number", description: "Optional cap on number of assets to fetch." },
577
+ workers: { type: "number", description: "Parallel worker count (default 8)." },
578
+ image_column: { type: "string", description: "Optional explicit image column for HuggingFace datasets." },
579
+ },
580
+ required: ["dataset_id", "source"],
581
+ },
582
+ },
474
583
  {
475
584
  name: "configure_kaggle",
476
585
  description: "Optionally store Kaggle API credentials for Kaggle discover/download. Core Vesper works without this.",
@@ -571,6 +680,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
571
680
  properties: {
572
681
  query: { type: "string" },
573
682
  requirements: { type: "string" },
683
+ download_images: { type: "boolean", description: "When true, enables post-prepare smart asset downloading for image/media datasets." },
574
684
  cleaning_options: { type: "object" },
575
685
  split_config: { type: "object" },
576
686
  },
@@ -813,6 +923,24 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
813
923
  const hf = new HuggingFaceScraper();
814
924
  results = await hf.scrape(Math.max(1, limit), true, query);
815
925
  }
926
+ const recipeScript = path.join(dataRoot, "python", "asset_downloader_engine.py");
927
+ for (const ds of results.slice(0, limit)) {
928
+ const info = {
929
+ dataset_id: ds.id,
930
+ id: ds.id,
931
+ source: ds.source,
932
+ repo_id: ds.id,
933
+ total_images: ds.total_examples || 0,
934
+ image_column: undefined,
935
+ recipes_dir: path.join(dataRoot, "recipes"),
936
+ };
937
+ try {
938
+ await runPythonJson(recipeScript, ["build_recipe", JSON.stringify(info)]);
939
+ }
940
+ catch {
941
+ // best-effort recipe generation; ignore discovery-time recipe failures
942
+ }
943
+ }
816
944
  const formattedOutput = formatSearchResults(results.slice(0, limit));
817
945
  return {
818
946
  content: [{ type: "text", text: formattedOutput }]
@@ -857,6 +985,85 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
857
985
  };
858
986
  }
859
987
  }
988
+ case "vesper_download_assets": {
989
+ hydrateExternalKeys();
990
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
991
+ const source = String(request.params.arguments?.source || "").trim().toLowerCase();
992
+ const repoId = request.params.arguments?.repo_id ? String(request.params.arguments.repo_id) : undefined;
993
+ const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
994
+ const urls = Array.isArray(request.params.arguments?.urls)
995
+ ? (request.params.arguments?.urls).map(v => String(v))
996
+ : undefined;
997
+ const outputFormat = String(request.params.arguments?.output_format || "webdataset");
998
+ const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
999
+ const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
1000
+ const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
1001
+ if (!datasetId || !source) {
1002
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
1003
+ }
1004
+ if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
1005
+ return {
1006
+ content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
1007
+ isError: true,
1008
+ };
1009
+ }
1010
+ const requiredModules = [
1011
+ { module: "aiohttp", packageName: "aiohttp" },
1012
+ ];
1013
+ if (source === "url") {
1014
+ requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
1015
+ }
1016
+ if (source === "huggingface") {
1017
+ requiredModules.push({ module: "datasets", packageName: "datasets" });
1018
+ }
1019
+ if (source === "kaggle") {
1020
+ requiredModules.push({ module: "kaggle", packageName: "kaggle" });
1021
+ }
1022
+ if (outputFormat === "webdataset") {
1023
+ requiredModules.push({ module: "webdataset", packageName: "webdataset" });
1024
+ }
1025
+ try {
1026
+ await ensurePythonModules(requiredModules);
1027
+ }
1028
+ catch (error) {
1029
+ return {
1030
+ content: [{ type: "text", text: `ERROR: Python dependency setup failed: ${error.message}` }],
1031
+ isError: true,
1032
+ };
1033
+ }
1034
+ const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
1035
+ const payload = {
1036
+ dataset_id: datasetId,
1037
+ source,
1038
+ repo_id: repoId,
1039
+ kaggle_ref: kaggleRef,
1040
+ urls,
1041
+ output_format: outputFormat,
1042
+ max_items: maxItems,
1043
+ workers,
1044
+ image_column: imageColumn,
1045
+ output_root: path.join(dataRoot, "data", "assets"),
1046
+ recipes_dir: path.join(dataRoot, "recipes"),
1047
+ };
1048
+ try {
1049
+ const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
1050
+ if (!result?.ok) {
1051
+ return {
1052
+ content: [{ type: "text", text: `ERROR: asset download failed: ${result?.error || "Unknown error"}` }],
1053
+ isError: true,
1054
+ };
1055
+ }
1056
+ return {
1057
+ content: [{ type: "text", text: JSON.stringify(result.result, null, 2) }],
1058
+ };
1059
+ }
1060
+ catch (error) {
1061
+ return {
1062
+ content: [{ type: "text", text: `ERROR: asset downloader execution failed: ${error.message}` }],
1063
+ isError: true,
1064
+ };
1065
+ }
1066
+ }
860
1067
  case "configure_kaggle": {
861
1068
  const username = String(request.params.arguments?.username || "").trim();
862
1069
  const key = String(request.params.arguments?.key || "").trim();
@@ -1033,7 +1240,8 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1033
1240
  case "prepare_dataset": {
1034
1241
  const query = String(request.params.arguments?.query);
1035
1242
  const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
1036
- const job = jobManager.createJob("prepare", 0, { query, requirements });
1243
+ const downloadImages = request.params.arguments?.download_images === true;
1244
+ const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
1037
1245
  return {
1038
1246
  content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you.` }]
1039
1247
  };
@@ -0,0 +1,73 @@
1
+ import argparse
2
+ import asyncio
3
+ import json
4
+ import os
5
+ import sys
6
+ from pathlib import Path
7
+ from typing import Any, Dict
8
+
9
+ CURRENT_DIR = Path(__file__).resolve().parent
10
+ if str(CURRENT_DIR) not in sys.path:
11
+ sys.path.insert(0, str(CURRENT_DIR))
12
+
13
+ from vesper.core.asset_downloader import AssetDownloader
14
+ from vesper.core.download_recipe import build_download_recipe, save_recipe, get_download_recipe
15
+
16
+
17
def _print(payload: Dict[str, Any]) -> None:
    """Write ``payload`` to stdout as one JSON line, leaving non-ASCII characters unescaped."""
    encoded = json.dumps(payload, ensure_ascii=False)
    print(encoded)
19
+
20
+
21
async def _run_download(args: argparse.Namespace) -> Dict[str, Any]:
    """Run one asset download described by the JSON string in ``args.payload``.

    The payload must be a JSON object with a non-empty ``dataset_id``. Optional keys
    read here are ``output_root`` (defaults to ``~/.vesper/data/assets``), ``workers``
    (defaults to 8), ``recipes_dir``, ``source``, ``repo_id``, ``kaggle_ref``,
    ``urls``, ``output_format`` (defaults to ``"webdataset"``), ``max_items`` and
    ``image_column``.

    Returns:
        ``{"ok": True, "result": <value returned by AssetDownloader.download_assets>}``.

    Raises:
        ValueError: If the payload is not a JSON object or ``dataset_id`` is missing
            or blank. Previously a missing id was silently turned into the literal
            string ``"None"`` by ``str(None)``.
    """
    payload = json.loads(args.payload)
    if not isinstance(payload, dict):
        raise ValueError("download payload must be a JSON object")

    dataset_id = payload.get("dataset_id")
    if dataset_id is None or not str(dataset_id).strip():
        raise ValueError("dataset_id is required")

    output_root = payload.get("output_root") or str(Path.home() / ".vesper" / "data" / "assets")
    workers = int(payload.get("workers") or 8)
    recipes_dir = payload.get("recipes_dir")

    downloader = AssetDownloader(output_root=output_root, workers=workers, recipes_dir=recipes_dir)

    result = await downloader.download_assets(
        dataset_id=str(dataset_id),
        source=payload.get("source"),
        repo_id=payload.get("repo_id"),
        kaggle_ref=payload.get("kaggle_ref"),
        urls=payload.get("urls"),
        # `or` rather than a .get() default so an explicit JSON null also falls back.
        output_format=payload.get("output_format") or "webdataset",
        max_items=payload.get("max_items"),
        image_column=payload.get("image_column"),
    )
    return {"ok": True, "result": result}
40
+
41
+
42
def main() -> None:
    """Command-line entry point: ``<action> <json-payload>``.

    ``action`` is one of ``download``, ``build_recipe`` or ``get_recipe``. Every
    outcome is reported as a single JSON object through ``_print``; any exception
    raised while handling the action is reported as ``{"ok": False, "error": ...}``
    rather than propagated.
    """
    parser = argparse.ArgumentParser(description="Vesper Smart Asset Downloader Engine")
    parser.add_argument("action", choices=["download", "build_recipe", "get_recipe"])
    parser.add_argument("payload", help="JSON payload")
    args = parser.parse_args()

    try:
        if args.action == "download":
            _print(asyncio.run(_run_download(args)))
            return

        payload = json.loads(args.payload)
        if not isinstance(payload, dict):
            raise ValueError("payload must be a JSON object")

        if args.action == "build_recipe":
            recipe = build_download_recipe(payload)
            saved = save_recipe(recipe, payload.get("recipes_dir"))
            _print({"ok": True, "recipe": recipe, "saved_to": saved})
            return

        if args.action == "get_recipe":
            dataset_id = payload.get("dataset_id")
            # str(None) would otherwise look up a recipe for the literal id "None".
            if dataset_id is None or not str(dataset_id).strip():
                raise ValueError("dataset_id is required")
            recipe = get_download_recipe(str(dataset_id), payload.get("recipes_dir"))
            _print({"ok": True, "recipe": recipe})
            return

        # Defensive fallback; argparse `choices` already rejects other actions.
        _print({"ok": False, "error": f"Unknown action: {args.action}"})
    except Exception as e:
        _print({"ok": False, "error": str(e)})
71
+
72
+ if __name__ == "__main__":
73
+ main()
@@ -0,0 +1 @@
1
+ """Vesper Python runtime package."""
@@ -0,0 +1 @@
1
+ """Core data engines for Vesper."""