@vespermcp/mcp-server 1.2.20 → 1.2.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -760,19 +760,19 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
760
760
  },
761
761
  {
762
762
  name: "vesper_download_assets",
763
- description: "Download real image/media assets using smart source-aware recipes (HuggingFace, Kaggle, direct URL).",
763
+ description: "Download real image/media assets using smart source-aware recipes (HuggingFace, Kaggle, direct URL). Auto-detects image columns from HF feature types, column names, and value patterns. Supports PIL Images, URL-based images, and binary image data.",
764
764
  inputSchema: {
765
765
  type: "object",
766
766
  properties: {
767
- dataset_id: { type: "string", description: "Unique dataset identifier." },
767
+ dataset_id: { type: "string", description: "Unique dataset identifier (e.g. 'user/dataset')." },
768
768
  source: { type: "string", enum: ["huggingface", "kaggle", "url"], description: "Asset source type." },
769
- repo_id: { type: "string", description: "Repo ID for HuggingFace (e.g. cifar100)." },
769
+ repo_id: { type: "string", description: "Repo ID for HuggingFace (e.g. 'user/dataset'). Auto-inferred from dataset_id if omitted." },
770
770
  kaggle_ref: { type: "string", description: "Kaggle dataset ref (owner/dataset)." },
771
771
  urls: { type: "array", items: { type: "string" }, description: "Direct asset URLs." },
772
772
  output_format: { type: "string", enum: ["webdataset", "imagefolder", "parquet"], description: "Output asset format." },
773
773
  max_items: { type: "number", description: "Optional cap on number of assets to fetch." },
774
774
  workers: { type: "number", description: "Parallel worker count (default 8)." },
775
- image_column: { type: "string", description: "Optional explicit image column for HuggingFace datasets." },
775
+ image_column: { type: "string", description: "Explicit image column name. If omitted, auto-detected from HF features, column names, and sample values." },
776
776
  },
777
777
  required: ["dataset_id", "source"],
778
778
  },
@@ -1270,7 +1270,10 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1270
1270
  hydrateExternalKeys();
1271
1271
  const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1272
1272
  const source = String(request.params.arguments?.source || "").trim().toLowerCase();
1273
- const repoId = request.params.arguments?.repo_id ? String(request.params.arguments.repo_id) : undefined;
1273
+ // Auto-infer repo_id from dataset_id if not provided (common for HuggingFace)
1274
+ const repoId = request.params.arguments?.repo_id
1275
+ ? String(request.params.arguments.repo_id)
1276
+ : (source === "huggingface" && datasetId.includes("/") ? datasetId : undefined);
1274
1277
  const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
1275
1278
  const urls = Array.isArray(request.params.arguments?.urls)
1276
1279
  ? (request.params.arguments?.urls).map(v => String(v))
@@ -1296,6 +1299,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1296
1299
  }
1297
1300
  if (source === "huggingface") {
1298
1301
  requiredModules.push({ module: "datasets", packageName: "datasets" });
1302
+ requiredModules.push({ module: "PIL", packageName: "Pillow" });
1299
1303
  }
1300
1304
  if (source === "kaggle") {
1301
1305
  requiredModules.push({ module: "kaggle", packageName: "kaggle" });
@@ -1326,8 +1330,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1326
1330
  try {
1327
1331
  const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
1328
1332
  if (!result?.ok) {
1333
+ const errMsg = result?.error || "Unknown error";
1334
+ // Enhance error messages for common failures
1335
+ let hint = "";
1336
+ if (errMsg.includes("No image column")) {
1337
+ hint = "\n\nHint: Specify image_column parameter with the column name containing images/URLs.";
1338
+ }
1339
+ else if (errMsg.includes("Authentication") || errMsg.includes("401") || errMsg.includes("403")) {
1340
+ hint = "\n\nHint: Use configure_keys tool to set HF_TOKEN for gated/private datasets.";
1341
+ }
1329
1342
  return {
1330
- content: [{ type: "text", text: `ERROR: asset download failed: ${result?.error || "Unknown error"}` }],
1343
+ content: [{ type: "text", text: `ERROR: asset download failed: ${errMsg}${hint}` }],
1331
1344
  isError: true,
1332
1345
  };
1333
1346
  }
@@ -94,8 +94,18 @@ export class HFDownloader {
94
94
  }
95
95
  catch (error) {
96
96
  const msg = String(error?.message || error);
97
- if (msg.includes("401") || msg.includes("403") || msg.toLowerCase().includes("unauthorized")) {
98
- throw new Error("Hugging Face gated/private dataset requires token. Run 'vespermcp config keys' to set HF token.");
97
+ if (msg.includes("401") || msg.toLowerCase().includes("unauthorized")) {
98
+ throw new Error(`Authentication required for dataset '${repoId}'. ` +
99
+ `This dataset may be gated or private. ` +
100
+ `Use the configure_keys tool to set your HF_TOKEN, then retry.`);
101
+ }
102
+ if (msg.includes("403") || msg.toLowerCase().includes("forbidden")) {
103
+ throw new Error(`Access denied for dataset '${repoId}'. ` +
104
+ `You may need to accept the dataset's usage agreement on huggingface.co, ` +
105
+ `then set HF_TOKEN via configure_keys tool.`);
106
+ }
107
+ if (msg.includes("404") || msg.toLowerCase().includes("not found")) {
108
+ throw new Error(`Dataset '${repoId}' not found on HuggingFace. Check the dataset ID.`);
99
109
  }
100
110
  console.error(`[HF] Failed to list files for ${repoId}:`, msg);
101
111
  return null;
@@ -81,13 +81,20 @@ export class DataIngestor {
81
81
  return resolvedPath;
82
82
  }
83
83
  catch (e) {
84
- this.failDownload(datasetId, e.message);
85
- throw e;
84
+ const msg = String(e?.message || e);
85
+ // If auth error, propagate immediately with helpful message
86
+ if (msg.includes("401") || msg.includes("403") || msg.includes("Authentication") || msg.includes("Access denied")) {
87
+ this.failDownload(datasetId, msg);
88
+ throw e;
89
+ }
90
+ // For other download errors, try the fallback
91
+ onProgress?.(`Direct download failed (${msg}), trying datasets library fallback...`);
86
92
  }
87
93
  }
88
- else {
89
- // Fallback: Use Python datasets library to download and convert
90
- onProgress?.("No raw files found. Using HuggingFace datasets library to download...");
94
+ // Fallback: Use Python datasets library to download and convert
95
+ // This runs when findBestFile returns null OR when direct download fails (non-auth)
96
+ if (!fs.existsSync(this.getTargetPath(datasetId, "parquet")) || !this.store.getDownloadStatus(datasetId)?.status?.includes("completed")) {
97
+ onProgress?.("Using HuggingFace datasets library to download...");
91
98
  const targetPath = this.getTargetPath(datasetId, "parquet");
92
99
  this.store.registerDownload(datasetId, targetPath, "downloading");
93
100
  try {
@@ -3,9 +3,14 @@ import asyncio
3
3
  import json
4
4
  import os
5
5
  import sys
6
+ import warnings
6
7
  from pathlib import Path
7
8
  from typing import Any, Dict
8
9
 
10
+ # Suppress noisy HF warnings
11
+ warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
12
+ warnings.filterwarnings("ignore", message=".*legacy.*")
13
+
9
14
  CURRENT_DIR = Path(__file__).resolve().parent
10
15
  if str(CURRENT_DIR) not in sys.path:
11
16
  sys.path.insert(0, str(CURRENT_DIR))
@@ -24,6 +29,11 @@ async def _run_download(args: argparse.Namespace) -> Dict[str, Any]:
24
29
  workers = int(payload.get("workers") or 8)
25
30
  recipes_dir = payload.get("recipes_dir")
26
31
 
32
+ # Auto-set HF token from payload if provided
33
+ token = payload.get("token") or payload.get("hf_token")
34
+ if token:
35
+ os.environ["HF_TOKEN"] = str(token)
36
+
27
37
  downloader = AssetDownloader(output_root=output_root, workers=workers, recipes_dir=recipes_dir)
28
38
 
29
39
  result = await downloader.download_assets(
@@ -66,7 +76,16 @@ def main() -> None:
66
76
 
67
77
  _print({"ok": False, "error": f"Unknown action: {args.action}"})
68
78
  except Exception as e:
69
- _print({"ok": False, "error": str(e)})
79
+ error_msg = str(e)
80
+ # Provide actionable error messages
81
+ if "401" in error_msg or "403" in error_msg or "Unauthorized" in error_msg:
82
+ error_msg = (
83
+ "Authentication required. This dataset may be gated/private. "
84
+ "Use configure_keys tool to set HF_TOKEN, then retry."
85
+ )
86
+ elif "No image column" in error_msg:
87
+ error_msg += " Hint: specify image_column parameter with the name of the column containing images."
88
+ _print({"ok": False, "error": error_msg})
70
89
 
71
90
 
72
91
  if __name__ == "__main__":
@@ -5,6 +5,12 @@ Used when the HF Hub file listing finds no suitable data files
5
5
  (e.g. script-based datasets, gated datasets, datasets that use
6
6
  the `datasets` library format).
7
7
 
8
+ Handles:
9
+ - Legacy script-based datasets (trust_remote_code)
10
+ - Gated/private datasets (token auth)
11
+ - Image datasets (PIL Image columns → stripped for tabular export)
12
+ - Various split formats (DatasetDict, single split)
13
+
8
14
  Usage:
9
15
  python hf_fallback.py '{"repo_id": "user/dataset", "output_path": "/path/to/output.parquet", "token": "optional", "max_rows": 100000, "split": "train"}'
10
16
 
@@ -15,6 +21,127 @@ Output: JSON to stdout
15
21
  import sys
16
22
  import json
17
23
  import os
24
+ import warnings
25
+
26
+ # Suppress noisy HF warnings about trust_remote_code etc.
27
+ warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
28
+ warnings.filterwarnings("ignore", message=".*legacy.*")
29
+
30
+
31
+ def _detect_image_columns(ds):
32
+ """Detect columns that contain HF Image features or PIL Image objects."""
33
+ image_cols = []
34
+ features = getattr(ds, "features", None)
35
+ if features:
36
+ for name, feat in features.items():
37
+ feat_cls = feat.__class__.__name__.lower()
38
+ feat_str = str(feat).lower()
39
+ if feat_cls == "image" or "image(" in feat_str:
40
+ image_cols.append(name)
41
+ return image_cols
42
+
43
+
44
+ def _strip_image_columns(ds, image_cols):
45
+ """Remove image columns from dataset so it can be exported to Parquet/CSV.
46
+
47
+ Image columns contain PIL Image objects that can't be serialized to tabular
48
+ formats. We replace them with a placeholder string indicating the column
49
+ was an image column.
50
+ """
51
+ if not image_cols:
52
+ return ds
53
+
54
+ # Remove the image columns entirely for tabular export
55
+ cols_to_keep = [c for c in ds.column_names if c not in image_cols]
56
+ if not cols_to_keep:
57
+ # Dataset is ALL image columns — keep them but cast to path strings if possible
58
+ return ds
59
+
60
+ return ds.select_columns(cols_to_keep)
61
+
62
+
63
+ def _load_dataset_robust(repo_id, token, split):
64
+ """Load a HuggingFace dataset with multiple fallback strategies.
65
+
66
+ Strategy order:
67
+ 1. Normal load with trust_remote_code=True (handles legacy script datasets)
68
+ 2. Load without trust_remote_code (newer datasets that reject it)
69
+ 3. Load with streaming=True then materialize (handles very large datasets)
70
+ """
71
+ from datasets import load_dataset, DatasetDict
72
+
73
+ errors = []
74
+ splits_to_try = [split] if split else ["train", "test", "validation", None]
75
+
76
+ # Strategy 1: Normal load with trust_remote_code
77
+ for s in splits_to_try:
78
+ try:
79
+ kwargs = {"path": repo_id, "trust_remote_code": True}
80
+ if token:
81
+ kwargs["token"] = token
82
+ if s:
83
+ kwargs["split"] = s
84
+ ds = load_dataset(**kwargs)
85
+ return ds, s
86
+ except (ValueError, KeyError):
87
+ continue
88
+ except Exception as e:
89
+ msg = str(e)
90
+ # Auth errors should be raised immediately, not retried
91
+ if any(x in msg for x in ["401", "403", "gated", "Unauthorized", "access"]):
92
+ raise
93
+ if "split" in msg.lower() or "key" in msg.lower():
94
+ continue
95
+ errors.append(f"trust_remote_code=True, split={s}: {msg}")
96
+
97
+ # Strategy 2: Load WITHOUT trust_remote_code (some repos reject it)
98
+ for s in splits_to_try:
99
+ try:
100
+ kwargs = {"path": repo_id}
101
+ if token:
102
+ kwargs["token"] = token
103
+ if s:
104
+ kwargs["split"] = s
105
+ ds = load_dataset(**kwargs)
106
+ return ds, s
107
+ except (ValueError, KeyError):
108
+ continue
109
+ except Exception as e:
110
+ msg = str(e)
111
+ if any(x in msg for x in ["401", "403", "gated", "Unauthorized", "access"]):
112
+ raise
113
+ if "split" in msg.lower() or "key" in msg.lower():
114
+ continue
115
+ errors.append(f"trust_remote_code=False, split={s}: {msg}")
116
+
117
+ # Strategy 3: Streaming fallback (for very large / oddly structured datasets)
118
+ for s in splits_to_try:
119
+ if s is None:
120
+ continue # streaming requires a split
121
+ try:
122
+ kwargs = {"path": repo_id, "streaming": True, "trust_remote_code": True}
123
+ if token:
124
+ kwargs["token"] = token
125
+ if s:
126
+ kwargs["split"] = s
127
+ ds_stream = load_dataset(**kwargs)
128
+ # Materialize from streaming iterator
129
+ from datasets import Dataset as HFDataset
130
+ rows = []
131
+ for i, row in enumerate(ds_stream):
132
+ if i >= 500000:
133
+ break
134
+ rows.append(row)
135
+ if rows:
136
+ ds = HFDataset.from_list(rows)
137
+ return ds, s
138
+ except Exception:
139
+ continue
140
+
141
+ # All strategies failed
142
+ error_summary = "; ".join(errors[:3]) if errors else "No valid configuration found"
143
+ return None, error_summary
144
+
18
145
 
19
146
  def main():
20
147
  if len(sys.argv) < 2:
@@ -53,37 +180,10 @@ def main():
53
180
  pl = None
54
181
 
55
182
  try:
56
- # Try loading with streaming first (memory-efficient)
57
- # If split is not specified, try common ones
58
- splits_to_try = [split] if split else ["train", "test", "validation", None]
59
-
60
- ds = None
61
- used_split = None
62
-
63
- for s in splits_to_try:
64
- try:
65
- kwargs = {
66
- "path": repo_id,
67
- "trust_remote_code": True,
68
- }
69
- if token:
70
- kwargs["token"] = token
71
- if s:
72
- kwargs["split"] = s
73
-
74
- ds = load_dataset(**kwargs)
75
- used_split = s
76
- break
77
- except (ValueError, KeyError):
78
- # Split doesn't exist, try next
79
- continue
80
- except Exception as e:
81
- if "split" in str(e).lower() or "key" in str(e).lower():
82
- continue
83
- raise
183
+ ds, used_split = _load_dataset_robust(repo_id, token, split)
84
184
 
85
185
  if ds is None:
86
- print(json.dumps({"ok": False, "error": f"Could not load dataset '{repo_id}'. No valid splits found."}))
186
+ print(json.dumps({"ok": False, "error": f"Could not load dataset '{repo_id}': {used_split}"}))
87
187
  sys.exit(1)
88
188
 
89
189
  # Handle DatasetDict (when no split specified)
@@ -107,37 +207,88 @@ def main():
107
207
  ds = ds.select(range(max_rows))
108
208
  total_rows = max_rows
109
209
 
210
+ # Detect and handle image columns (PIL Image objects can't be exported to Parquet)
211
+ image_cols = _detect_image_columns(ds)
212
+ has_images = len(image_cols) > 0
213
+
214
+ if has_images:
215
+ # Strip image columns for tabular export, note them in output
216
+ export_ds = _strip_image_columns(ds, image_cols)
217
+ else:
218
+ export_ds = ds
219
+
110
220
  # Ensure output directory exists
111
221
  os.makedirs(os.path.dirname(output_path), exist_ok=True)
112
222
 
113
223
  # Export to parquet
114
- columns = ds.column_names
224
+ columns = export_ds.column_names
115
225
 
116
- if output_path.endswith(".parquet"):
117
- ds.to_parquet(output_path)
118
- elif output_path.endswith(".csv"):
119
- ds.to_csv(output_path)
120
- else:
121
- # Default to parquet
122
- if not output_path.endswith(".parquet"):
123
- output_path = output_path + ".parquet"
124
- ds.to_parquet(output_path)
226
+ try:
227
+ if output_path.endswith(".parquet"):
228
+ export_ds.to_parquet(output_path)
229
+ elif output_path.endswith(".csv"):
230
+ export_ds.to_csv(output_path)
231
+ else:
232
+ # Default to parquet
233
+ if not output_path.endswith(".parquet"):
234
+ output_path = output_path + ".parquet"
235
+ export_ds.to_parquet(output_path)
236
+ except Exception as export_err:
237
+ # If parquet export fails (e.g. complex nested types), try CSV
238
+ csv_path = output_path.replace(".parquet", ".csv")
239
+ try:
240
+ export_ds.to_csv(csv_path)
241
+ output_path = csv_path
242
+ except Exception:
243
+ raise export_err # Re-raise original error
125
244
 
126
- print(json.dumps({
245
+ result = {
127
246
  "ok": True,
128
247
  "path": output_path,
129
248
  "rows": total_rows,
130
249
  "columns": columns,
131
250
  "split": used_split
132
- }))
251
+ }
252
+
253
+ if has_images:
254
+ result["image_columns"] = image_cols
255
+ result["note"] = (
256
+ f"This dataset contains image columns ({', '.join(image_cols)}). "
257
+ "Image data was stripped for tabular export. "
258
+ "Use vesper_download_assets with source='huggingface' to download the actual images."
259
+ )
260
+
261
+ print(json.dumps(result))
133
262
 
134
263
  except Exception as e:
135
264
  error_msg = str(e)
136
- # Provide helpful hints
137
- if "401" in error_msg or "403" in error_msg or "gated" in error_msg.lower():
138
- error_msg += " (This dataset may be gated/private. Set HF_TOKEN via configure_keys tool.)"
139
- elif "FileNotFoundError" in error_msg or "does not exist" in error_msg.lower():
265
+ # Provide helpful, actionable hints
266
+ if "401" in error_msg or "Unauthorized" in error_msg:
267
+ error_msg = (
268
+ f"Authentication required for dataset '{repo_id}'. "
269
+ "This dataset may be gated or private. "
270
+ "Use the configure_keys tool to set your HF_TOKEN, then retry."
271
+ )
272
+ elif "403" in error_msg or "Forbidden" in error_msg:
273
+ error_msg = (
274
+ f"Access denied for dataset '{repo_id}'. "
275
+ "You may need to accept the dataset's usage agreement on huggingface.co, "
276
+ "then set HF_TOKEN via configure_keys tool."
277
+ )
278
+ elif "gated" in error_msg.lower():
279
+ error_msg = (
280
+ f"Dataset '{repo_id}' is gated. "
281
+ "Visit https://huggingface.co/datasets/{repo_id} to request access, "
282
+ "then set HF_TOKEN via configure_keys tool."
283
+ ).format(repo_id=repo_id)
284
+ elif "FileNotFoundError" in error_msg or "does not exist" in error_msg.lower() or "doesn't exist" in error_msg.lower():
140
285
  error_msg = f"Dataset '{repo_id}' not found on HuggingFace. Check the dataset ID."
286
+ elif "script" in error_msg.lower() and "no longer supported" in error_msg.lower():
287
+ error_msg = (
288
+ f"Dataset '{repo_id}' uses a legacy loading script that is no longer supported "
289
+ "by the current version of the datasets library. "
290
+ "Try: pip install datasets --upgrade, or use an older datasets version."
291
+ )
141
292
 
142
293
  print(json.dumps({"ok": False, "error": error_msg}))
143
294
  sys.exit(1)