@vespermcp/mcp-server 1.2.4 → 1.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +129 -20
- package/build/python/asset_downloader_engine.py +73 -0
- package/build/python/vesper/__init__.py +1 -0
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +1 -0
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +388 -0
- package/build/python/vesper/core/download_recipe.py +104 -0
- package/package.json +2 -2
- package/src/python/asset_downloader_engine.py +73 -0
- package/src/python/vesper/__init__.py +1 -0
- package/src/python/vesper/core/__init__.py +1 -0
- package/src/python/vesper/core/asset_downloader.py +388 -0
- package/src/python/vesper/core/download_recipe.py +104 -0
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import re
|
|
5
|
+
from dataclasses import dataclass, asdict
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Dict, Optional
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Default per-user location where download recipes are persisted as JSON.
DEFAULT_RECIPES_DIR = Path.home() / ".vesper" / "recipes"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class DownloadRecipe:
    """Plan describing how a dataset's image assets should be downloaded.

    Built by ``build_download_recipe`` and serialized via ``asdict`` into the
    JSON recipe written by ``save_recipe``.
    """

    dataset_id: str  # canonical identifier for the dataset (falls back to repo_id)
    source: str  # lowercased registry name, e.g. "huggingface", "kaggle"
    repo_id: str  # repository/slug within the source registry
    image_column: Optional[str]  # column likely holding images/URLs, if detected
    download_method: str  # strategy keyword chosen from the source registry
    requires_auth: bool  # whether credentials are expected for the download
    estimated_asset_size_gb: float  # rough total asset size estimate, in GB
    total_images: int  # expected image count (defaulted to 1000 when unknown)
    fallback_strategy: list[str]  # ordered alternatives if the chosen method fails
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _safe_name(value: str) -> str:
|
|
27
|
+
return re.sub(r"[^a-zA-Z0-9._-]+", "_", value).strip("_") or "dataset"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def build_download_recipe(dataset_info: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize raw dataset metadata into a download-recipe dict.

    Fills source-appropriate defaults for any missing fields and returns
    ``asdict`` of a ``DownloadRecipe``.
    """
    dataset_id = str(dataset_info.get("dataset_id") or dataset_info.get("id") or "")
    source = str(dataset_info.get("source") or "unknown").lower()
    repo_id = str(dataset_info.get("repo_id") or dataset_info.get("id") or dataset_id)

    # Prefer an explicitly supplied image column; otherwise scan the declared
    # features for the first likely image/URL column name.
    image_column = dataset_info.get("image_column")
    if not image_column:
        features = dataset_info.get("features") or {}
        if isinstance(features, dict):
            likely_names = {"image", "images", "img", "img_path", "image_url", "url"}
            for column_name in features:
                if str(column_name).lower() in likely_names:
                    image_column = column_name
                    break

    # Map each known source registry to its download strategy; anything else
    # falls back to treating the dataset as a list of URLs.
    method_by_source = {
        "huggingface": "hf_dataset_image_feature",
        "kaggle": "kaggle_archive",
        "dataworld": "direct_file_scan",
        "openml": "direct_file_scan",
    }
    download_method = method_by_source.get(source, "url_list")

    # Kaggle/data.world downloads need credentials unless metadata says otherwise.
    requires_auth = bool(dataset_info.get("requires_auth", source in {"kaggle", "dataworld"}))

    total_images = int(dataset_info.get("total_images") or dataset_info.get("total_examples") or 0)
    if total_images <= 0:
        total_images = 1000  # assume a modest dataset when the count is unknown

    # Heuristic: roughly 0.4 MB per image when no explicit estimate is given.
    size_estimate = dataset_info.get("estimated_asset_size_gb") or (total_images * 0.0004)
    estimated_asset_size_gb = round(float(size_estimate), 3)

    fallback_strategy = dataset_info.get("fallback_strategy") or [
        "scan_archive_for_images",
        "extract_url_column_and_download",
        "export_metadata_only_with_actionable_error",
    ]

    return asdict(
        DownloadRecipe(
            dataset_id=dataset_id or repo_id,
            source=source,
            repo_id=repo_id,
            image_column=image_column,
            download_method=download_method,
            requires_auth=requires_auth,
            estimated_asset_size_gb=estimated_asset_size_gb,
            total_images=total_images,
            fallback_strategy=list(fallback_strategy),
        )
    )
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def save_recipe(recipe: Dict[str, Any], recipes_dir: Optional[str] = None) -> str:
    """Persist *recipe* as pretty-printed JSON and return the written file path.

    The file lands at ``<recipes_dir>/<safe dataset name>/download_recipe.json``;
    when *recipes_dir* is omitted, DEFAULT_RECIPES_DIR is used. Directories are
    created as needed.
    """
    if recipes_dir:
        base = Path(recipes_dir)
    else:
        base = DEFAULT_RECIPES_DIR
    base.mkdir(parents=True, exist_ok=True)

    # Directory name comes from the recipe's own identity fields.
    name = str(recipe.get("dataset_id") or recipe.get("repo_id") or "dataset")
    target_dir = base / _safe_name(name)
    target_dir.mkdir(parents=True, exist_ok=True)

    target = target_dir / "download_recipe.json"
    payload = json.dumps(recipe, indent=2, ensure_ascii=False)
    target.write_text(payload, encoding="utf-8")
    return str(target)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def get_download_recipe(dataset_id: str, recipes_dir: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Load the previously saved recipe for *dataset_id*.

    Returns the parsed recipe dict, or ``None`` when no recipe file exists or
    it cannot be read/parsed.
    """
    base = Path(recipes_dir) if recipes_dir else DEFAULT_RECIPES_DIR
    recipe_file = base / _safe_name(dataset_id) / "download_recipe.json"

    if not recipe_file.exists():
        return None

    try:
        raw_text = recipe_file.read_text(encoding="utf-8")
        return json.loads(raw_text)
    except Exception:
        # Unreadable or corrupt recipe files are treated as missing.
        return None
|