@vespermcp/mcp-server 1.2.4 → 1.2.6

@@ -0,0 +1,415 @@
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ import mimetypes
+ import os
+ import shutil
+ import tempfile
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any, Awaitable, Callable, Dict, Iterable, List, Optional
+
+ import aiohttp
+
+ from vesper.core.download_recipe import get_download_recipe
+
+ try:
+     import aiofiles
+ except Exception:  # pragma: no cover
+     aiofiles = None
+
+ try:
+     import webdataset as wds
+ except Exception:  # pragma: no cover
+     wds = None
+
+
+ IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff"}
+
+
+ @dataclass
+ class DownloadResult:
+     dataset_id: str
+     source: str
+     output_dir: str
+     downloaded_assets: int
+     failed_assets: int
+     errors_file: str
+     metadata_file: str
+     output_format: str
+
+
+ class AssetDownloader:
+     def __init__(
+         self,
+         output_root: str,
+         workers: int = 8,
+         recipes_dir: Optional[str] = None,
+         progress_callback: Optional[Callable[[str, Dict[str, Any]], Awaitable[None] | None]] = None,
+     ) -> None:
+         self.output_root = Path(output_root)
+         self.workers = max(1, min(workers, 32))
+         self.recipes_dir = recipes_dir
+         self.progress_callback = progress_callback
+
+     async def _emit(self, stage: str, payload: Dict[str, Any]) -> None:
+         if not self.progress_callback:
+             return
+         maybe = self.progress_callback(stage, payload)
+         if asyncio.iscoroutine(maybe):
+             await maybe
+
+     @staticmethod
+     def find_image_column(dataset: Any) -> Optional[str]:
+         features = getattr(dataset, "features", None)
+         if features:
+             for name, feature in features.items():
+                 feature_name = feature.__class__.__name__.lower()
+                 feature_repr = str(feature).lower()
+                 if feature_name == "image" or "image(" in feature_repr:
+                     return str(name)
+                 lower = str(name).lower()
+                 if lower in {"image", "images", "img", "image_path", "image_url", "url"}:
+                     return str(name)
+
+         candidate_columns = ["image", "images", "img", "image_path", "image_url", "url", "file_name", "filepath"]
+         cols = getattr(dataset, "column_names", []) or []
+         for c in candidate_columns:
+             if c in cols:
+                 return c
+         return None
+
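Since find_image_column only needs an object exposing features or column_names, the heuristic can be checked without loading a real dataset. A tiny sketch, assuming the class above is in scope; the stand-in object is an illustrative assumption, not a real datasets.Dataset:

from types import SimpleNamespace

stub = SimpleNamespace(features=None, column_names=["text", "image_url", "label"])
# No features dict, so the candidate_columns fallback matches "image_url"
assert AssetDownloader.find_image_column(stub) == "image_url"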
+     async def download_assets(
+         self,
+         dataset_id: str,
+         source: Optional[str] = None,
+         repo_id: Optional[str] = None,
+         kaggle_ref: Optional[str] = None,
+         urls: Optional[List[str]] = None,
+         output_format: str = "webdataset",
+         max_items: Optional[int] = None,
+         image_column: Optional[str] = None,
+     ) -> Dict[str, Any]:
+         recipe = get_download_recipe(dataset_id, self.recipes_dir)
+         if recipe:
+             source = source or recipe.get("source")
+             repo_id = repo_id or recipe.get("repo_id")
+             image_column = image_column or recipe.get("image_column")
+
+         source = (source or "").lower()
+         if source not in {"huggingface", "kaggle", "url"}:
+             raise ValueError("source must be one of: huggingface, kaggle, url")
+
+         # --- Validate imports and args BEFORE creating any directories ---
+         if source == "huggingface":
+             if not repo_id:
+                 raise ValueError("repo_id is required for source=huggingface")
+             try:
+                 from datasets import load_dataset as _ld  # noqa: F401
+             except Exception as e:
+                 raise RuntimeError(
+                     f"datasets package is required for HuggingFace downloads. "
+                     f"Install with: pip install datasets. Details: {e}"
+                 ) from e
+         elif source == "kaggle":
+             ref = kaggle_ref or repo_id
+             if not ref:
+                 raise ValueError("kaggle_ref is required for source=kaggle")
+             try:
+                 from kaggle.api.kaggle_api_extended import KaggleApi as _Ka  # noqa: F401
+             except Exception as e:
+                 raise RuntimeError(
+                     f"kaggle package is required for Kaggle downloads. "
+                     f"Install with: pip install kaggle. Details: {e}"
+                 ) from e
+         else:
+             if not urls:
+                 raise ValueError("urls are required for source=url")
+
+         if output_format == "webdataset" and wds is None:
+             raise RuntimeError(
+                 "webdataset package is required for webdataset output. "
+                 "Install with: pip install webdataset"
+             )
+
+         # --- Now safe to create directories ---
+         dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
+         images_dir = dataset_dir / "images"
+         dataset_dir.mkdir(parents=True, exist_ok=True)
+         images_dir.mkdir(parents=True, exist_ok=True)
+
+         errors_file = dataset_dir / "errors.jsonl"
+         metadata_file = dataset_dir / "metadata.jsonl"
+
+         try:
+             if source == "huggingface":
+                 summary = await self._download_huggingface(repo_id, dataset_id, images_dir, metadata_file, errors_file, max_items, image_column)
+             elif source == "kaggle":
+                 ref = kaggle_ref or repo_id
+                 summary = await self._download_kaggle(ref, dataset_id, images_dir, metadata_file, errors_file, max_items)
+             else:
+                 summary = await self._download_urls(urls, dataset_id, images_dir, metadata_file, errors_file, max_items)
+         except Exception:
+             # Clean up empty directories on failure so we don't leave ghost artifacts
+             if images_dir.exists() and not any(images_dir.iterdir()):
+                 shutil.rmtree(dataset_dir, ignore_errors=True)
+             raise
+
+         if output_format == "webdataset":
+             await self._write_webdataset(dataset_dir, images_dir, metadata_file)
+         elif output_format == "parquet":
+             await self._write_parquet(dataset_dir, metadata_file)
+
+         result = DownloadResult(
+             dataset_id=dataset_id,
+             source=source,
+             output_dir=str(dataset_dir),
+             downloaded_assets=summary["downloaded"],
+             failed_assets=summary["failed"],
+             errors_file=str(errors_file),
+             metadata_file=str(metadata_file),
+             output_format=output_format,
+         )
+         return result.__dict__
+
+     async def _download_huggingface(
+         self,
+         repo_id: str,
+         dataset_id: str,
+         images_dir: Path,
+         metadata_file: Path,
+         errors_file: Path,
+         max_items: Optional[int],
+         image_column: Optional[str],
+     ) -> Dict[str, int]:
+         from datasets import load_dataset  # validated in download_assets()
+
+         await self._emit("start", {"source": "huggingface", "repo_id": repo_id})
+
+         try:
+             ds = load_dataset(repo_id, split="train")
+         except Exception:
+             # Fall back to the first available split when "train" does not exist
+             dd = load_dataset(repo_id)
+             first_split = list(dd.keys())[0]
+             ds = dd[first_split]
+
+         col = image_column or self.find_image_column(ds)
+         if not col:
+             raise RuntimeError(
+                 "No image column detected in HuggingFace dataset. Provide image_column or use fallback strategy with URL column."
+             )
+
+         total = len(ds) if hasattr(ds, "__len__") else 0
+         target = min(total, max_items) if max_items and total else (max_items or total or 0)
+
+         downloaded = 0
+         failed = 0
+
+         with metadata_file.open("w", encoding="utf-8") as mf, errors_file.open("w", encoding="utf-8") as ef:
+             for idx, row in enumerate(ds):
+                 if max_items and idx >= max_items:
+                     break
+                 try:
+                     out_name = f"{idx:08d}.jpg"
+                     out_path = images_dir / out_name
+                     self._save_image_value(row.get(col), out_path)
+
+                     record = {
+                         "dataset_id": dataset_id,
+                         "index": idx,
+                         "image_path": str(out_path),
+                         "source": "huggingface",
+                         "repo_id": repo_id,
+                     }
+                     mf.write(json.dumps(record, ensure_ascii=False) + "\n")
+                     downloaded += 1
+                     if downloaded % 50 == 0:
+                         await self._emit("progress", {"downloaded": downloaded, "failed": failed, "target": target})
+                 except Exception as e:
+                     failed += 1
+                     ef.write(json.dumps({"index": idx, "error": str(e)}, ensure_ascii=False) + "\n")
+
+         await self._emit("done", {"downloaded": downloaded, "failed": failed})
+         return {"downloaded": downloaded, "failed": failed}
+
+     async def _download_kaggle(
+         self,
+         kaggle_ref: str,
+         dataset_id: str,
+         images_dir: Path,
+         metadata_file: Path,
+         errors_file: Path,
+         max_items: Optional[int],
+     ) -> Dict[str, int]:
+         from kaggle.api.kaggle_api_extended import KaggleApi  # validated in download_assets()
+
+         await self._emit("start", {"source": "kaggle", "dataset": kaggle_ref})
+
+         api = KaggleApi()
+         api.authenticate()
+
+         tmp_dir = Path(tempfile.mkdtemp(prefix="vesper_kaggle_assets_"))
+         downloaded = 0
+         failed = 0
+
+         try:
+             api.dataset_download_files(kaggle_ref, path=str(tmp_dir), unzip=True, quiet=True)
+             candidates = [p for p in tmp_dir.rglob("*") if p.is_file() and p.suffix.lower() in IMAGE_EXTENSIONS]
+             if max_items:
+                 candidates = candidates[:max_items]
+
+             with metadata_file.open("w", encoding="utf-8") as mf, errors_file.open("w", encoding="utf-8") as ef:
+                 for idx, src_path in enumerate(candidates):
+                     try:
+                         out_name = f"{idx:08d}{src_path.suffix.lower()}"
+                         out_path = images_dir / out_name
+                         shutil.copy2(src_path, out_path)
+                         record = {
+                             "dataset_id": dataset_id,
+                             "index": idx,
+                             "image_path": str(out_path),
+                             "source": "kaggle",
+                             "repo_id": kaggle_ref,
+                         }
+                         mf.write(json.dumps(record, ensure_ascii=False) + "\n")
+                         downloaded += 1
+                     except Exception as e:
+                         failed += 1
+                         ef.write(json.dumps({"file": str(src_path), "error": str(e)}, ensure_ascii=False) + "\n")
+         finally:
+             shutil.rmtree(tmp_dir, ignore_errors=True)
+
+         await self._emit("done", {"downloaded": downloaded, "failed": failed})
+         return {"downloaded": downloaded, "failed": failed}
+
+     async def _download_urls(
+         self,
+         urls: List[str],
+         dataset_id: str,
+         images_dir: Path,
+         metadata_file: Path,
+         errors_file: Path,
+         max_items: Optional[int],
+     ) -> Dict[str, int]:
+         if aiofiles is None:
+             raise RuntimeError("aiofiles is required for URL downloads. Install with: pip install aiofiles")
+
+         selected = urls[:max_items] if max_items else urls
+         sem = asyncio.Semaphore(self.workers)
+
+         downloaded = 0
+         failed = 0
+         metadata_lock = asyncio.Lock()
+
+         async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=180)) as session:
+             async def worker(idx: int, url: str) -> None:
+                 nonlocal downloaded, failed
+                 async with sem:
+                     try:
+                         local_path = await self._download_one_url(session, idx, url, images_dir)
+                         async with metadata_lock:
+                             async with aiofiles.open(metadata_file, "a", encoding="utf-8") as mf:
+                                 await mf.write(json.dumps({
+                                     "dataset_id": dataset_id,
+                                     "index": idx,
+                                     "image_path": str(local_path),
+                                     "source": "url",
+                                     "url": url,
+                                 }, ensure_ascii=False) + "\n")
+                         downloaded += 1
+                     except Exception as e:
+                         failed += 1
+                         async with metadata_lock:
+                             async with aiofiles.open(errors_file, "a", encoding="utf-8") as ef:
+                                 await ef.write(json.dumps({"index": idx, "url": url, "error": str(e)}, ensure_ascii=False) + "\n")
+
+             tasks = [asyncio.create_task(worker(i, u)) for i, u in enumerate(selected)]
+             await asyncio.gather(*tasks)
+
+         await self._emit("done", {"downloaded": downloaded, "failed": failed})
+         return {"downloaded": downloaded, "failed": failed}
+
+     async def _download_one_url(self, session: aiohttp.ClientSession, idx: int, url: str, images_dir: Path) -> Path:
+         ext = Path(url.split("?")[0]).suffix.lower()
+         if ext not in IMAGE_EXTENSIONS:
+             ext = ".jpg"
+         out_path = images_dir / f"{idx:08d}{ext}"
+
+         existing_size = out_path.stat().st_size if out_path.exists() else 0
+         headers: Dict[str, str] = {}
+         if existing_size > 0:
+             headers["Range"] = f"bytes={existing_size}-"
+
+         async with session.get(url, headers=headers) as response:
+             if response.status not in (200, 206):
+                 raise RuntimeError(f"HTTP {response.status}")
+
+             mode = "ab" if response.status == 206 and existing_size > 0 else "wb"
+             async with aiofiles.open(out_path, mode) as f:
+                 async for chunk in response.content.iter_chunked(1024 * 256):
+                     await f.write(chunk)
+
+         return out_path
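+
+     # Note on _download_one_url's resume semantics: with no partial file, no
+     # Range header is sent and HTTP 200 is expected (mode "wb"). With a partial
+     # file, "Range: bytes=<size>-" is sent; a 206 response means the server
+     # resumed, so we append ("ab"), while a 200 means it ignored the Range and
+     # the file is rewritten from scratch.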
+
+     @staticmethod
+     def _save_image_value(value: Any, out_path: Path) -> None:
+         if value is None:
+             raise ValueError("empty image value")
+
+         # PIL.Image.Image (and HF Image feature values) expose .save()
+         if hasattr(value, "save"):
+             value.save(out_path)
+             return
+
+         if isinstance(value, dict):
+             if value.get("bytes"):
+                 out_path.write_bytes(value["bytes"])
+                 return
+             if value.get("path") and os.path.exists(value["path"]):
+                 shutil.copy2(value["path"], out_path)
+                 return
+             if value.get("url"):
+                 raise ValueError("image URL requires URL downloader fallback")
+
+         if isinstance(value, str):
+             if os.path.exists(value):
+                 shutil.copy2(value, out_path)
+                 return
+             raise ValueError("string image value is not a local path")
+
+         raise ValueError(f"Unsupported image value type: {type(value)}")
+
+     async def _write_webdataset(self, dataset_dir: Path, images_dir: Path, metadata_file: Path) -> None:
+         if wds is None:
+             raise RuntimeError("webdataset package is required for webdataset output. Install with: pip install webdataset")
+         if not metadata_file.exists():
+             # URL downloads only create metadata.jsonl on first success
+             return
+
+         shard_pattern = str(dataset_dir / "shard-%06d.tar")
+         with metadata_file.open("r", encoding="utf-8") as mf, wds.ShardWriter(shard_pattern, maxcount=5000) as sink:
+             for line in mf:
+                 row = json.loads(line)
+                 image_path = Path(row["image_path"])
+                 if not image_path.exists():
+                     continue
+                 key = image_path.stem
+                 ext = image_path.suffix.lstrip(".") or "jpg"
+                 sample = {
+                     "__key__": key,
+                     ext: image_path.read_bytes(),
+                     "json": json.dumps(row, ensure_ascii=False).encode("utf-8"),
+                 }
+                 sink.write(sample)
+
+     async def _write_parquet(self, dataset_dir: Path, metadata_file: Path) -> None:
+         try:
+             import pyarrow as pa
+             import pyarrow.parquet as pq
+         except Exception as e:
+             raise RuntimeError(f"pyarrow is required for parquet output: {e}") from e
+         if not metadata_file.exists():
+             return
+
+         rows: List[Dict[str, Any]] = []
+         with metadata_file.open("r", encoding="utf-8") as mf:
+             for line in mf:
+                 rows.append(json.loads(line))
+
+         table = pa.Table.from_pylist(rows)
+         pq.write_table(table, str(dataset_dir / "metadata.parquet"))
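End to end, the class is driven through download_assets. A minimal usage sketch, assuming aiohttp, aiofiles, and pyarrow are installed; the output root, dataset ID, and URL below are illustrative placeholders:

import asyncio

async def demo() -> None:
    dl = AssetDownloader(output_root="/tmp/vesper_assets", workers=4)
    result = await dl.download_assets(
        dataset_id="demo/street-signs",       # hypothetical ID
        source="url",
        urls=["https://example.com/a.jpg"],   # placeholder; use a real image URL
        output_format="parquet",              # avoids the webdataset dependency
        max_items=1,
    )
    # The returned dict mirrors DownloadResult's fields
    print(result["downloaded_assets"], result["failed_assets"], result["output_dir"])

asyncio.run(demo())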
@@ -0,0 +1,104 @@
+ from __future__ import annotations
+
+ import json
+ import re
+ from dataclasses import dataclass, asdict
+ from pathlib import Path
+ from typing import Any, Dict, Optional
+
+
+ DEFAULT_RECIPES_DIR = Path.home() / ".vesper" / "recipes"
+
+
+ @dataclass
+ class DownloadRecipe:
+     dataset_id: str
+     source: str
+     repo_id: str
+     image_column: Optional[str]
+     download_method: str
+     requires_auth: bool
+     estimated_asset_size_gb: float
+     total_images: int
+     fallback_strategy: list[str]
+
+
+ def _safe_name(value: str) -> str:
+     return re.sub(r"[^a-zA-Z0-9._-]+", "_", value).strip("_") or "dataset"
+
+
+ def build_download_recipe(dataset_info: Dict[str, Any]) -> Dict[str, Any]:
+     dataset_id = str(dataset_info.get("dataset_id") or dataset_info.get("id") or "")
+     source = str(dataset_info.get("source") or "unknown").lower()
+     repo_id = str(dataset_info.get("repo_id") or dataset_info.get("id") or dataset_id)
+
+     image_column = dataset_info.get("image_column")
+     if not image_column:
+         features = dataset_info.get("features") or {}
+         if isinstance(features, dict):
+             for key in features.keys():
+                 lower = str(key).lower()
+                 if lower in {"image", "images", "img", "img_path", "image_url", "url"}:
+                     image_column = key
+                     break
+
+     download_method = "url_list"
+     if source == "huggingface":
+         download_method = "hf_dataset_image_feature"
+     elif source == "kaggle":
+         download_method = "kaggle_archive"
+     elif source in {"dataworld", "openml"}:
+         download_method = "direct_file_scan"
+
+     requires_auth = bool(dataset_info.get("requires_auth", source in {"kaggle", "dataworld"}))
+
+     total_images = int(dataset_info.get("total_images") or dataset_info.get("total_examples") or 0)
+     if total_images <= 0:
+         total_images = 1000
+
+     estimated_asset_size_gb = round(float(dataset_info.get("estimated_asset_size_gb") or (total_images * 0.0004)), 3)
+
+     fallback_strategy = dataset_info.get("fallback_strategy") or [
+         "scan_archive_for_images",
+         "extract_url_column_and_download",
+         "export_metadata_only_with_actionable_error",
+     ]
+
+     recipe = DownloadRecipe(
+         dataset_id=dataset_id or repo_id,
+         source=source,
+         repo_id=repo_id,
+         image_column=image_column,
+         download_method=download_method,
+         requires_auth=requires_auth,
+         estimated_asset_size_gb=estimated_asset_size_gb,
+         total_images=total_images,
+         fallback_strategy=list(fallback_strategy),
+     )
+
+     return asdict(recipe)
+
+
+ def save_recipe(recipe: Dict[str, Any], recipes_dir: Optional[str] = None) -> str:
+     root = Path(recipes_dir) if recipes_dir else DEFAULT_RECIPES_DIR
+     root.mkdir(parents=True, exist_ok=True)
+
+     dataset_id = str(recipe.get("dataset_id") or recipe.get("repo_id") or "dataset")
+     recipe_dir = root / _safe_name(dataset_id)
+     recipe_dir.mkdir(parents=True, exist_ok=True)
+
+     out_path = recipe_dir / "download_recipe.json"
+     out_path.write_text(json.dumps(recipe, indent=2, ensure_ascii=False), encoding="utf-8")
+     return str(out_path)
+
+
+ def get_download_recipe(dataset_id: str, recipes_dir: Optional[str] = None) -> Optional[Dict[str, Any]]:
+     root = Path(recipes_dir) if recipes_dir else DEFAULT_RECIPES_DIR
+     path = root / _safe_name(dataset_id) / "download_recipe.json"
+     if not path.exists():
+         return None
+
+     try:
+         return json.loads(path.read_text(encoding="utf-8"))
+     except Exception:
+         return None
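A short round-trip sketch for the helpers above; the dataset metadata is a made-up example:

info = {
    "dataset_id": "hf_beans_demo",   # hypothetical ID
    "source": "huggingface",
    "repo_id": "beans",
    "features": {"image": "Image", "labels": "ClassLabel"},
    "total_examples": 1295,
}
recipe = build_download_recipe(info)
# -> image_column="image", download_method="hf_dataset_image_feature",
#    requires_auth=False, estimated_asset_size_gb=round(1295 * 0.0004, 3)
path = save_recipe(recipe, recipes_dir="/tmp/recipes")
assert get_download_recipe("hf_beans_demo", recipes_dir="/tmp/recipes") == recipe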
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@vespermcp/mcp-server",
-   "version": "1.2.4",
+   "version": "1.2.6",
    "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
    "type": "module",
    "main": "build/index.js",
@@ -17,7 +17,7 @@
    "mcp-config-template.json"
  ],
  "scripts": {
-   "build": "tsc && node -e \"const fs=require('fs');const path=require('path');const src='src/python';const dest='build/python';if(!fs.existsSync(dest))fs.mkdirSync(dest,{recursive:true});fs.readdirSync(src).forEach(f=>{if(f.endsWith('.py'))fs.copyFileSync(path.join(src,f),path.join(dest,f));});console.log('Copied Python scripts to build/python');\"",
+   "build": "tsc && node -e \"const fs=require('fs');const path=require('path');const src='src/python';const dest='build/python';const walk=(d)=>fs.readdirSync(d,{withFileTypes:true}).flatMap(e=>e.isDirectory()?walk(path.join(d,e.name)):[path.join(d,e.name)]);if(!fs.existsSync(dest))fs.mkdirSync(dest,{recursive:true});if(fs.existsSync(src)){for(const f of walk(src)){if(!f.endsWith('.py'))continue;const rel=path.relative(src,f);const out=path.join(dest,rel);fs.mkdirSync(path.dirname(out),{recursive:true});fs.copyFileSync(f,out);}}console.log('Copied Python scripts to build/python');\"",
    "dev": "tsx watch src/index.ts",
    "postinstall": "node scripts/postinstall.cjs",
    "scrape": "tsx src/scripts/scrape-metadata.ts",
@@ -23,7 +23,12 @@ const pythonPackages = [
    'pillow',
    'numpy',
    'librosa',
-   'soundfile'
+   'soundfile',
+   'aiohttp',
+   'aiofiles',
+   'datasets',
+   'webdataset',
+   'kaggle'
  ];
 
  try {
@@ -0,0 +1,73 @@
+ import argparse
+ import asyncio
+ import json
+ import os
+ import sys
+ from pathlib import Path
+ from typing import Any, Dict
+
+ CURRENT_DIR = Path(__file__).resolve().parent
+ if str(CURRENT_DIR) not in sys.path:
+     sys.path.insert(0, str(CURRENT_DIR))
+
+ from vesper.core.asset_downloader import AssetDownloader
+ from vesper.core.download_recipe import build_download_recipe, save_recipe, get_download_recipe
+
+
+ def _print(payload: Dict[str, Any]) -> None:
+     print(json.dumps(payload, ensure_ascii=False))
+
+
+ async def _run_download(args: argparse.Namespace) -> Dict[str, Any]:
+     payload = json.loads(args.payload)
+     output_root = payload.get("output_root") or str(Path.home() / ".vesper" / "data" / "assets")
+     workers = int(payload.get("workers") or 8)
+     recipes_dir = payload.get("recipes_dir")
+
+     downloader = AssetDownloader(output_root=output_root, workers=workers, recipes_dir=recipes_dir)
+
+     result = await downloader.download_assets(
+         dataset_id=str(payload.get("dataset_id")),
+         source=payload.get("source"),
+         repo_id=payload.get("repo_id"),
+         kaggle_ref=payload.get("kaggle_ref"),
+         urls=payload.get("urls"),
+         output_format=payload.get("output_format", "webdataset"),
+         max_items=payload.get("max_items"),
+         image_column=payload.get("image_column"),
+     )
+     return {"ok": True, "result": result}
+
+
+ def main() -> None:
+     parser = argparse.ArgumentParser(description="Vesper Smart Asset Downloader Engine")
+     parser.add_argument("action", choices=["download", "build_recipe", "get_recipe"])
+     parser.add_argument("payload", help="JSON payload")
+     args = parser.parse_args()
+
+     try:
+         if args.action == "download":
+             response = asyncio.run(_run_download(args))
+             _print(response)
+             return
+
+         payload = json.loads(args.payload)
+         if args.action == "build_recipe":
+             recipe = build_download_recipe(payload)
+             saved = save_recipe(recipe, payload.get("recipes_dir"))
+             _print({"ok": True, "recipe": recipe, "saved_to": saved})
+             return
+
+         if args.action == "get_recipe":
+             dataset_id = str(payload.get("dataset_id"))
+             recipe = get_download_recipe(dataset_id, payload.get("recipes_dir"))
+             _print({"ok": True, "recipe": recipe})
+             return
+
+         _print({"ok": False, "error": f"Unknown action: {args.action}"})
+     except Exception as e:
+         _print({"ok": False, "error": str(e)})
+
+
+ if __name__ == "__main__":
+     main()
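The engine prints exactly one JSON object per run, so a caller such as the Node server can shell out and parse stdout. A hedged invocation sketch; the script's path and file name below are assumptions, since the diff does not show them:

import json
import subprocess

payload = {
    "dataset_id": "demo",
    "source": "url",
    "urls": ["https://example.com/a.jpg"],   # placeholder URL
    "output_format": "parquet",
    "max_items": 1,
}
proc = subprocess.run(
    ["python", "build/python/asset_engine.py",   # assumed script path/name
     "download", json.dumps(payload)],
    capture_output=True, text=True,
)
print(json.loads(proc.stdout))  # {"ok": true, "result": {...}} or {"ok": false, "error": "..."}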
@@ -0,0 +1 @@
+ """Vesper Python runtime package."""
@@ -0,0 +1 @@
+ """Core data engines for Vesper."""