@vespermcp/mcp-server 1.2.3 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -167,6 +167,25 @@ function syncPythonScripts(appRoot, dataRoot) {
167
167
  const pythonDest = path.join(dataRoot, "python");
168
168
  if (!fs.existsSync(pythonDest))
169
169
  fs.mkdirSync(pythonDest, { recursive: true });
170
// Recursively gather the paths of every ".py" file located beneath `dir`.
//
// The walk is iterative (explicit stack) so deep trees cannot overflow the
// call stack. Returned paths are built with path.join from `dir`, so they are
// absolute exactly when `dir` is absolute.
//
// Any path that cannot be listed — it does not exist, it is a plain file
// rather than a directory, or it is unreadable — is skipped instead of
// throwing, so one bad directory cannot abort the whole script sync.
// A missing path is skipped silently (it is an expected case for optional
// source locations); other failures are reported on stderr.
const collectPyFiles = (dir) => {
  const found = [];
  const pending = [dir];
  while (pending.length > 0) {
    const current = pending.pop();
    let entries;
    try {
      entries = fs.readdirSync(current, { withFileTypes: true });
    } catch (err) {
      if (err?.code !== "ENOENT") {
        console.error(`[Vesper] Skipping unreadable Python source path ${current}: ${err?.message ?? err}`);
      }
      continue;
    }
    for (const entry of entries) {
      const full = path.join(current, entry.name);
      if (entry.isDirectory()) {
        pending.push(full);
      } else if (entry.isFile() && full.endsWith(".py")) {
        found.push(full);
      }
    }
  }
  return found;
};
170
189
  // Sources to check for Python scripts
171
190
  const sources = [
172
191
  path.join(appRoot, "src", "python"),
@@ -175,25 +194,21 @@ function syncPythonScripts(appRoot, dataRoot) {
175
194
  ];
176
195
  let syncedCount = 0;
177
196
  for (const src of sources) {
178
- if (fs.existsSync(src)) {
179
- const files = fs.readdirSync(src);
180
- for (const file of files) {
181
- if (file.endsWith(".py")) {
182
- const srcPath = path.join(src, file);
183
- const destPath = path.join(pythonDest, file);
184
- // Only copy if file doesn't exist or is different size (basic sync)
185
- const srcStat = fs.statSync(srcPath);
186
- let shouldCopy = true;
187
- if (fs.existsSync(destPath)) {
188
- const destStat = fs.statSync(destPath);
189
- if (srcStat.size === destStat.size)
190
- shouldCopy = false;
191
- }
192
- if (shouldCopy) {
193
- fs.copyFileSync(srcPath, destPath);
194
- syncedCount++;
195
- }
196
- }
197
+ const files = collectPyFiles(src);
198
+ for (const srcPath of files) {
199
+ const rel = path.relative(src, srcPath);
200
+ const destPath = path.join(pythonDest, rel);
201
+ const srcStat = fs.statSync(srcPath);
202
+ let shouldCopy = true;
203
+ if (fs.existsSync(destPath)) {
204
+ const destStat = fs.statSync(destPath);
205
+ if (srcStat.size === destStat.size)
206
+ shouldCopy = false;
207
+ }
208
+ if (shouldCopy) {
209
+ fs.mkdirSync(path.dirname(destPath), { recursive: true });
210
+ fs.copyFileSync(srcPath, destPath);
211
+ syncedCount++;
197
212
  }
198
213
  }
199
214
  }
@@ -471,6 +486,25 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
471
486
  required: ["source", "dataset_id"],
472
487
  },
473
488
  },
489
+ {
490
+ name: "vesper_download_assets",
491
+ description: "Download real image/media assets using smart source-aware recipes (HuggingFace, Kaggle, direct URL).",
492
+ inputSchema: {
493
+ type: "object",
494
+ properties: {
495
+ dataset_id: { type: "string", description: "Unique dataset identifier." },
496
+ source: { type: "string", enum: ["huggingface", "kaggle", "url"], description: "Asset source type." },
497
+ repo_id: { type: "string", description: "Repo ID for HuggingFace (e.g. cifar100)." },
498
+ kaggle_ref: { type: "string", description: "Kaggle dataset ref (owner/dataset)." },
499
+ urls: { type: "array", items: { type: "string" }, description: "Direct asset URLs." },
500
+ output_format: { type: "string", enum: ["webdataset", "imagefolder", "parquet"], description: "Output asset format." },
501
+ max_items: { type: "number", description: "Optional cap on number of assets to fetch." },
502
+ workers: { type: "number", description: "Parallel worker count (default 8)." },
503
+ image_column: { type: "string", description: "Optional explicit image column for HuggingFace datasets." },
504
+ },
505
+ required: ["dataset_id", "source"],
506
+ },
507
+ },
474
508
  {
475
509
  name: "configure_kaggle",
476
510
  description: "Optionally store Kaggle API credentials for Kaggle discover/download. Core Vesper works without this.",
@@ -571,6 +605,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
571
605
  properties: {
572
606
  query: { type: "string" },
573
607
  requirements: { type: "string" },
608
+ download_images: { type: "boolean", description: "When true, enables post-prepare smart asset downloading for image/media datasets." },
574
609
  cleaning_options: { type: "object" },
575
610
  split_config: { type: "object" },
576
611
  },
@@ -813,6 +848,24 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
813
848
  const hf = new HuggingFaceScraper();
814
849
  results = await hf.scrape(Math.max(1, limit), true, query);
815
850
  }
851
+ const recipeScript = path.join(dataRoot, "python", "asset_downloader_engine.py");
852
+ for (const ds of results.slice(0, limit)) {
853
+ const info = {
854
+ dataset_id: ds.id,
855
+ id: ds.id,
856
+ source: ds.source,
857
+ repo_id: ds.id,
858
+ total_images: ds.total_examples || 0,
859
+ image_column: undefined,
860
+ recipes_dir: path.join(dataRoot, "recipes"),
861
+ };
862
+ try {
863
+ await runPythonJson(recipeScript, ["build_recipe", JSON.stringify(info)]);
864
+ }
865
+ catch {
866
+ // best-effort recipe generation; ignore discovery-time recipe failures
867
+ }
868
+ }
816
869
  const formattedOutput = formatSearchResults(results.slice(0, limit));
817
870
  return {
818
871
  content: [{ type: "text", text: formattedOutput }]
@@ -857,6 +910,61 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
857
910
  };
858
911
  }
859
912
  }
913
+ case "vesper_download_assets": {
914
+ hydrateExternalKeys();
915
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
916
+ const source = String(request.params.arguments?.source || "").trim().toLowerCase();
917
+ const repoId = request.params.arguments?.repo_id ? String(request.params.arguments.repo_id) : undefined;
918
+ const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
919
+ const urls = Array.isArray(request.params.arguments?.urls)
920
+ ? (request.params.arguments?.urls).map(v => String(v))
921
+ : undefined;
922
+ const outputFormat = String(request.params.arguments?.output_format || "webdataset");
923
+ const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
924
+ const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
925
+ const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
926
+ if (!datasetId || !source) {
927
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
928
+ }
929
+ if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
930
+ return {
931
+ content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
932
+ isError: true,
933
+ };
934
+ }
935
+ const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
936
+ const payload = {
937
+ dataset_id: datasetId,
938
+ source,
939
+ repo_id: repoId,
940
+ kaggle_ref: kaggleRef,
941
+ urls,
942
+ output_format: outputFormat,
943
+ max_items: maxItems,
944
+ workers,
945
+ image_column: imageColumn,
946
+ output_root: path.join(dataRoot, "data", "assets"),
947
+ recipes_dir: path.join(dataRoot, "recipes"),
948
+ };
949
+ try {
950
+ const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
951
+ if (!result?.ok) {
952
+ return {
953
+ content: [{ type: "text", text: `ERROR: asset download failed: ${result?.error || "Unknown error"}` }],
954
+ isError: true,
955
+ };
956
+ }
957
+ return {
958
+ content: [{ type: "text", text: JSON.stringify(result.result, null, 2) }],
959
+ };
960
+ }
961
+ catch (error) {
962
+ return {
963
+ content: [{ type: "text", text: `ERROR: asset downloader execution failed: ${error.message}` }],
964
+ isError: true,
965
+ };
966
+ }
967
+ }
860
968
  case "configure_kaggle": {
861
969
  const username = String(request.params.arguments?.username || "").trim();
862
970
  const key = String(request.params.arguments?.key || "").trim();
@@ -1033,7 +1141,8 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1033
1141
  case "prepare_dataset": {
1034
1142
  const query = String(request.params.arguments?.query);
1035
1143
  const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
1036
- const job = jobManager.createJob("prepare", 0, { query, requirements });
1144
+ const downloadImages = request.params.arguments?.download_images === true;
1145
+ const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
1037
1146
  return {
1038
1147
  content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you.` }]
1039
1148
  };
@@ -0,0 +1,73 @@
1
+ import argparse
2
+ import asyncio
3
+ import json
4
+ import os
5
+ import sys
6
+ from pathlib import Path
7
+ from typing import Any, Dict
8
+
9
+ CURRENT_DIR = Path(__file__).resolve().parent
10
+ if str(CURRENT_DIR) not in sys.path:
11
+ sys.path.insert(0, str(CURRENT_DIR))
12
+
13
+ from vesper.core.asset_downloader import AssetDownloader
14
+ from vesper.core.download_recipe import build_download_recipe, save_recipe, get_download_recipe
15
+
16
+
17
+ def _print(payload: Dict[str, Any]) -> None:
18
+ print(json.dumps(payload, ensure_ascii=False))
19
+
20
+
21
+ async def _run_download(args: argparse.Namespace) -> Dict[str, Any]:
22
+ payload = json.loads(args.payload)
23
+ output_root = payload.get("output_root") or str(Path.home() / ".vesper" / "data" / "assets")
24
+ workers = int(payload.get("workers") or 8)
25
+ recipes_dir = payload.get("recipes_dir")
26
+
27
+ downloader = AssetDownloader(output_root=output_root, workers=workers, recipes_dir=recipes_dir)
28
+
29
+ result = await downloader.download_assets(
30
+ dataset_id=str(payload.get("dataset_id")),
31
+ source=payload.get("source"),
32
+ repo_id=payload.get("repo_id"),
33
+ kaggle_ref=payload.get("kaggle_ref"),
34
+ urls=payload.get("urls"),
35
+ output_format=payload.get("output_format", "webdataset"),
36
+ max_items=payload.get("max_items"),
37
+ image_column=payload.get("image_column"),
38
+ )
39
+ return {"ok": True, "result": result}
40
+
41
+
42
def main() -> None:
    """Command-line entry point: ``<action> <json-payload>``.

    Actions:
        download      -- run ``_run_download`` and print its response.
        build_recipe  -- build a recipe from the payload, save it under the
                         payload's ``recipes_dir`` and print the recipe plus
                         the value returned by ``save_recipe``.
        get_recipe    -- print the recipe stored for the payload's
                         ``dataset_id`` (looked up under ``recipes_dir``).

    Exactly one JSON object is printed to stdout. Any exception raised while
    handling the action is reported as ``{"ok": false, "error": "..."}``
    rather than propagated.
    """
    parser = argparse.ArgumentParser(description="Vesper Smart Asset Downloader Engine")
    parser.add_argument("action", choices=["download", "build_recipe", "get_recipe"])
    parser.add_argument("payload", help="JSON payload")
    args = parser.parse_args()

    try:
        if args.action == "download":
            response = asyncio.run(_run_download(args))
            _print(response)
            return

        payload = json.loads(args.payload)
        if not isinstance(payload, dict):
            # Fail with a clear message instead of an AttributeError on .get().
            raise ValueError("payload must be a JSON object")

        if args.action == "build_recipe":
            recipe = build_download_recipe(payload)
            saved = save_recipe(recipe, payload.get("recipes_dir"))
            _print({"ok": True, "recipe": recipe, "saved_to": saved})
            return

        if args.action == "get_recipe":
            # str(None) would look up a recipe literally named "None".
            dataset_id = str(payload.get("dataset_id") or "").strip()
            if not dataset_id:
                raise ValueError("dataset_id is required")
            recipe = get_download_recipe(dataset_id, payload.get("recipes_dir"))
            _print({"ok": True, "recipe": recipe})
            return

        # Defensive only: argparse's `choices` already rejects other actions.
        _print({"ok": False, "error": f"Unknown action: {args.action}"})
    except Exception as e:
        _print({"ok": False, "error": str(e)})
70
+
71
+
72
+ if __name__ == "__main__":
73
+ main()
@@ -109,8 +109,28 @@ def discover(query: str, limit: int = 20) -> Dict[str, Any]:
109
109
 
110
110
  api: KaggleApi = auth["api"]
111
111
  try:
112
- datasets = api.dataset_list(search=query, page_size=max(1, min(limit, 100)))
113
- items = [_dataset_to_dict(ds) for ds in datasets[:limit]]
112
+ desired = max(1, min(limit, 100))
113
+
114
+ try:
115
+ datasets = api.dataset_list(search=query, page_size=desired)
116
+ items = [_dataset_to_dict(ds) for ds in datasets[:limit]]
117
+ return {"ok": True, "results": items, "count": len(items)}
118
+ except TypeError:
119
+ pass
120
+
121
+ collected = []
122
+ page = 1
123
+ while len(collected) < limit:
124
+ page_items = api.dataset_list(search=query, page=page)
125
+ if not page_items:
126
+ break
127
+
128
+ collected.extend(page_items)
129
+ if len(page_items) < 20:
130
+ break
131
+ page += 1
132
+
133
+ items = [_dataset_to_dict(ds) for ds in collected[:limit]]
114
134
  return {"ok": True, "results": items, "count": len(items)}
115
135
  except Exception as e:
116
136
  return {"ok": False, "error": f"Kaggle discover failed: {str(e)}"}
@@ -0,0 +1 @@
1
+ """Vesper Python runtime package."""
@@ -0,0 +1 @@
1
+ """Core data engines for Vesper."""