@vespermcp/mcp-server 1.2.4 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,104 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import re
5
+ from dataclasses import dataclass, asdict
6
+ from pathlib import Path
7
+ from typing import Any, Dict, Optional
8
+
9
+
10
# Directory used for stored recipes when a caller does not pass ``recipes_dir``:
# ``<user home>/.vesper/recipes``.
DEFAULT_RECIPES_DIR = Path.home().joinpath(".vesper", "recipes")
11
+
12
+
13
@dataclass
class DownloadRecipe:
    """Description of how the image assets of one dataset should be fetched.

    ``build_download_recipe`` fills in an instance of this class and hands
    it back to callers as a plain dict via ``dataclasses.asdict``, so the
    field names below are also the keys of the stored JSON recipe.
    """

    # Identifier of the dataset the recipe belongs to.
    dataset_id: str
    # Lower-cased name of the hosting platform ("unknown" when not given).
    source: str
    # Repository identifier on the hosting platform.
    repo_id: str
    # Name of the column/feature holding images or image URLs, if known.
    image_column: Optional[str]
    # Name of the download approach chosen for ``source``.
    download_method: str
    # Whether credentials are needed to download the assets.
    requires_auth: bool
    # Size of the assets in gigabytes (may be a rough estimate).
    estimated_asset_size_gb: float
    # Number of images expected in the dataset.
    total_images: int
    # Names of the strategies to try when the primary method fails.
    fallback_strategy: list[str]
24
+
25
+
26
+ def _safe_name(value: str) -> str:
27
+ return re.sub(r"[^a-zA-Z0-9._-]+", "_", value).strip("_") or "dataset"
28
+
29
+
30
def build_download_recipe(dataset_info: Dict[str, Any]) -> Dict[str, Any]:
    """Derive a download recipe from loosely structured dataset metadata.

    Keys read from ``dataset_info`` (all optional):

    * ``dataset_id`` / ``id`` / ``repo_id`` - identifiers; each falls back
      to the others when missing.
    * ``source`` - hosting platform, lower-cased; ``"unknown"`` if absent.
    * ``image_column`` - explicit image column; otherwise the first key of
      the ``features`` dict whose lower-cased name looks like an image or
      URL column is used.
    * ``requires_auth`` - explicit flag; when missing or ``None`` it
      defaults to ``True`` for ``kaggle`` and ``dataworld`` sources.
    * ``total_images`` / ``total_examples`` - image count; 1000 is assumed
      when it is missing, non-positive or not a number.
    * ``estimated_asset_size_gb`` - size in GB; estimated as
      ``total_images * 0.0004`` when missing or not a positive finite
      number.
    * ``fallback_strategy`` - list of strategy names (a single string is
      accepted as a one-element list); a built-in list is used if absent.

    Args:
        dataset_info: Metadata mapping describing the dataset.

    Returns:
        The recipe as a plain dict with the fields of ``DownloadRecipe``.
    """
    dataset_id = str(dataset_info.get("dataset_id") or dataset_info.get("id") or "")
    source = str(dataset_info.get("source") or "unknown").lower()
    repo_id = str(dataset_info.get("repo_id") or dataset_info.get("id") or dataset_id)

    image_column = dataset_info.get("image_column")
    if not image_column:
        features = dataset_info.get("features") or {}
        if isinstance(features, dict):
            image_like = {"image", "images", "img", "img_path", "image_url", "url"}
            # Keep the original (possibly falsy) value when nothing matches.
            image_column = next(
                (key for key in features if str(key).lower() in image_like),
                image_column,
            )

    method_by_source = {
        "huggingface": "hf_dataset_image_feature",
        "kaggle": "kaggle_archive",
        "dataworld": "direct_file_scan",
        "openml": "direct_file_scan",
    }
    download_method = method_by_source.get(source, "url_list")

    # An explicit ``None`` means "not specified", so it must not override
    # the per-source default the way ``bool(None)`` would.
    auth_flag = dataset_info.get("requires_auth")
    if auth_flag is None:
        auth_flag = source in {"kaggle", "dataworld"}
    requires_auth = bool(auth_flag)

    # Metadata often carries placeholders such as "unknown"; fall back to
    # the default count instead of failing on them.
    raw_total = dataset_info.get("total_images") or dataset_info.get("total_examples") or 0
    try:
        total_images = int(raw_total)
    except (TypeError, ValueError, OverflowError):
        total_images = 0
    if total_images <= 0:
        total_images = 1000

    estimated_size = total_images * 0.0004
    raw_size = dataset_info.get("estimated_asset_size_gb")
    if raw_size:
        try:
            declared_size = float(raw_size)
        except (TypeError, ValueError, OverflowError):
            declared_size = 0.0
        # Rejects NaN (all comparisons false), infinities and negatives.
        if 0 < declared_size < float("inf"):
            estimated_size = declared_size
    estimated_asset_size_gb = round(estimated_size, 3)

    fallback_strategy = dataset_info.get("fallback_strategy") or [
        "scan_archive_for_images",
        "extract_url_column_and_download",
        "export_metadata_only_with_actionable_error",
    ]
    if isinstance(fallback_strategy, str):
        # ``list("name")`` would split the name into single characters.
        fallback_strategy = [fallback_strategy]

    recipe = DownloadRecipe(
        dataset_id=dataset_id or repo_id,
        source=source,
        repo_id=repo_id,
        image_column=image_column,
        download_method=download_method,
        requires_auth=requires_auth,
        estimated_asset_size_gb=estimated_asset_size_gb,
        total_images=total_images,
        fallback_strategy=list(fallback_strategy),
    )

    return asdict(recipe)
80
+
81
+
82
def save_recipe(recipe: Dict[str, Any], recipes_dir: Optional[str] = None) -> str:
    """Write ``recipe`` to disk as pretty-printed UTF-8 JSON.

    The file is stored as ``<recipes dir>/<safe dataset name>/download_recipe.json``;
    missing directories are created. The dataset name comes from the
    recipe's ``dataset_id``, then ``repo_id``, then the literal ``"dataset"``.

    Args:
        recipe: Recipe mapping to serialise.
        recipes_dir: Directory to store recipes in; ``DEFAULT_RECIPES_DIR``
            is used when this is empty or ``None``.

    Returns:
        The path of the written file, as a string.
    """
    base_dir = DEFAULT_RECIPES_DIR if not recipes_dir else Path(recipes_dir)
    base_dir.mkdir(parents=True, exist_ok=True)

    identifier = recipe.get("dataset_id") or recipe.get("repo_id") or "dataset"
    target_dir = base_dir / _safe_name(str(identifier))
    target_dir.mkdir(parents=True, exist_ok=True)

    destination = target_dir / "download_recipe.json"
    payload = json.dumps(recipe, indent=2, ensure_ascii=False)
    destination.write_text(payload, encoding="utf-8")
    return str(destination)
93
+
94
+
95
def get_download_recipe(dataset_id: str, recipes_dir: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Load the stored recipe for ``dataset_id``, if one can be read.

    Looks for ``<recipes dir>/<safe dataset name>/download_recipe.json``.
    The lookup is best-effort: a missing, unreadable or malformed file
    yields ``None`` rather than an exception.

    Args:
        dataset_id: Identifier of the dataset whose recipe is wanted.
        recipes_dir: Directory recipes are stored in; ``DEFAULT_RECIPES_DIR``
            is used when this is empty or ``None``.

    Returns:
        The recipe dict, or ``None`` when no valid recipe is available.
    """
    root = Path(recipes_dir) if recipes_dir else DEFAULT_RECIPES_DIR
    path = root / _safe_name(dataset_id) / "download_recipe.json"

    # Read directly instead of checking ``exists()`` first: the file may
    # disappear between the check and the read. Only the failures that
    # reading and parsing can produce are treated as "no recipe":
    # OSError covers missing/unreadable files, ValueError covers invalid
    # JSON and invalid UTF-8, RecursionError covers absurdly nested JSON.
    try:
        loaded = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError, RecursionError):
        return None

    # A valid JSON file whose top level is not an object is not a recipe.
    return loaded if isinstance(loaded, dict) else None