@vespermcp/mcp-server 1.2.19 → 1.2.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -453,6 +453,17 @@ jobManager.on("processJob", async (job, execute) => {
453
453
  async function handlePrepareJob(jobId, query, requirements) {
454
454
  hydrateExternalKeys();
455
455
  const update = (updates) => jobManager.updateJob(jobId, updates);
456
+ // Ensure core Python packages are available for dataset operations
457
+ try {
458
+ await ensurePythonModules([
459
+ { module: "polars", packageName: "polars" },
460
+ { module: "datasets", packageName: "datasets" },
461
+ ]);
462
+ }
463
+ catch (e) {
464
+ console.error(`[Prepare] Python dependency setup warning: ${e.message}`);
465
+ // Continue anyway - direct file downloads may still work without datasets lib
466
+ }
456
467
  const requestedRows = extractRequestedRows(query, requirements);
457
468
  let selectedDataset;
458
469
  let datasetIdForDownload = "";
@@ -749,19 +760,19 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
749
760
  },
750
761
  {
751
762
  name: "vesper_download_assets",
752
- description: "Download real image/media assets using smart source-aware recipes (HuggingFace, Kaggle, direct URL).",
763
+ description: "Download real image/media assets using smart source-aware recipes (HuggingFace, Kaggle, direct URL). Auto-detects image columns from HF feature types, column names, and value patterns. Supports PIL Images, URL-based images, and binary image data.",
753
764
  inputSchema: {
754
765
  type: "object",
755
766
  properties: {
756
- dataset_id: { type: "string", description: "Unique dataset identifier." },
767
+ dataset_id: { type: "string", description: "Unique dataset identifier (e.g. 'user/dataset')." },
757
768
  source: { type: "string", enum: ["huggingface", "kaggle", "url"], description: "Asset source type." },
758
- repo_id: { type: "string", description: "Repo ID for HuggingFace (e.g. cifar100)." },
769
+ repo_id: { type: "string", description: "Repo ID for HuggingFace (e.g. 'user/dataset'). Auto-inferred from dataset_id if omitted." },
759
770
  kaggle_ref: { type: "string", description: "Kaggle dataset ref (owner/dataset)." },
760
771
  urls: { type: "array", items: { type: "string" }, description: "Direct asset URLs." },
761
772
  output_format: { type: "string", enum: ["webdataset", "imagefolder", "parquet"], description: "Output asset format." },
762
773
  max_items: { type: "number", description: "Optional cap on number of assets to fetch." },
763
774
  workers: { type: "number", description: "Parallel worker count (default 8)." },
764
- image_column: { type: "string", description: "Optional explicit image column for HuggingFace datasets." },
775
+ image_column: { type: "string", description: "Explicit image column name. If omitted, auto-detected from HF features, column names, and sample values." },
765
776
  },
766
777
  required: ["dataset_id", "source"],
767
778
  },
@@ -1225,6 +1236,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1225
1236
  isError: true,
1226
1237
  };
1227
1238
  }
1239
+ // Pre-install Python datasets library for HuggingFace fallback
1240
+ if (source === "huggingface") {
1241
+ try {
1242
+ await ensurePythonModules([
1243
+ { module: "datasets", packageName: "datasets" },
1244
+ ]);
1245
+ }
1246
+ catch {
1247
+ // Continue - direct download may still work
1248
+ }
1249
+ }
1228
1250
  try {
1229
1251
  const localPath = await dataIngestor.ensureData(datasetId, source, () => undefined);
1230
1252
  try {
@@ -1248,7 +1270,10 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1248
1270
  hydrateExternalKeys();
1249
1271
  const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1250
1272
  const source = String(request.params.arguments?.source || "").trim().toLowerCase();
1251
- const repoId = request.params.arguments?.repo_id ? String(request.params.arguments.repo_id) : undefined;
1273
+ // Auto-infer repo_id from dataset_id if not provided (common for HuggingFace)
1274
+ const repoId = request.params.arguments?.repo_id
1275
+ ? String(request.params.arguments.repo_id)
1276
+ : (source === "huggingface" && datasetId.includes("/") ? datasetId : undefined);
1252
1277
  const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
1253
1278
  const urls = Array.isArray(request.params.arguments?.urls)
1254
1279
  ? (request.params.arguments?.urls).map(v => String(v))
@@ -1274,6 +1299,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1274
1299
  }
1275
1300
  if (source === "huggingface") {
1276
1301
  requiredModules.push({ module: "datasets", packageName: "datasets" });
1302
+ requiredModules.push({ module: "PIL", packageName: "Pillow" });
1277
1303
  }
1278
1304
  if (source === "kaggle") {
1279
1305
  requiredModules.push({ module: "kaggle", packageName: "kaggle" });
@@ -1304,8 +1330,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1304
1330
  try {
1305
1331
  const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
1306
1332
  if (!result?.ok) {
1333
+ const errMsg = result?.error || "Unknown error";
1334
+ // Enhance error messages for common failures
1335
+ let hint = "";
1336
+ if (errMsg.includes("No image column")) {
1337
+ hint = "\n\nHint: Specify image_column parameter with the column name containing images/URLs.";
1338
+ }
1339
+ else if (errMsg.includes("Authentication") || errMsg.includes("401") || errMsg.includes("403")) {
1340
+ hint = "\n\nHint: Use configure_keys tool to set HF_TOKEN for gated/private datasets.";
1341
+ }
1307
1342
  return {
1308
- content: [{ type: "text", text: `ERROR: asset download failed: ${result?.error || "Unknown error"}` }],
1343
+ content: [{ type: "text", text: `ERROR: asset download failed: ${errMsg}${hint}` }],
1309
1344
  isError: true,
1310
1345
  };
1311
1346
  }
@@ -94,8 +94,18 @@ export class HFDownloader {
94
94
  }
95
95
  catch (error) {
96
96
  const msg = String(error?.message || error);
97
- if (msg.includes("401") || msg.includes("403") || msg.toLowerCase().includes("unauthorized")) {
98
- throw new Error("Hugging Face gated/private dataset requires token. Run 'vespermcp config keys' to set HF token.");
97
+ if (msg.includes("401") || msg.toLowerCase().includes("unauthorized")) {
98
+ throw new Error(`Authentication required for dataset '${repoId}'. ` +
99
+ `This dataset may be gated or private. ` +
100
+ `Use the configure_keys tool to set your HF_TOKEN, then retry.`);
101
+ }
102
+ if (msg.includes("403") || msg.toLowerCase().includes("forbidden")) {
103
+ throw new Error(`Access denied for dataset '${repoId}'. ` +
104
+ `You may need to accept the dataset's usage agreement on huggingface.co, ` +
105
+ `then set HF_TOKEN via configure_keys tool.`);
106
+ }
107
+ if (msg.includes("404") || msg.toLowerCase().includes("not found")) {
108
+ throw new Error(`Dataset '${repoId}' not found on HuggingFace. Check the dataset ID.`);
99
109
  }
100
110
  console.error(`[HF] Failed to list files for ${repoId}:`, msg);
101
111
  return null;
@@ -1,5 +1,6 @@
1
1
  import path from "path";
2
2
  import fs from "fs";
3
+ import { spawn } from "child_process";
3
4
  import { HFDownloader } from "./hf-downloader.js";
4
5
  import { KaggleSource } from "../metadata/kaggle-source.js";
5
6
  import { OpenMLSource } from "../metadata/openml-source.js";
@@ -63,25 +64,49 @@ export class DataIngestor {
63
64
  if (source === "huggingface") {
64
65
  onProgress?.("Discovering data files on HuggingFace Hub...");
65
66
  const remotePath = await this.hfDownloader.findBestFile(datasetId);
66
- if (!remotePath)
67
- throw new Error(`No suitable data files found in HuggingFace repo: ${datasetId}`);
68
- const ext = path.extname(remotePath).substring(1).toLowerCase() || "csv";
69
- const targetPath = this.getTargetPath(datasetId, ext);
70
- this.store.registerDownload(datasetId, targetPath, "downloading");
71
- try {
72
- await this.hfDownloader.download(datasetId, remotePath, targetPath, (progress) => {
73
- onProgress?.(`Downloading ${path.basename(remotePath)}...`, progress);
74
- });
75
- const resolvedPath = await this.hfDownloader.resolveExternalDataFromMetadata(targetPath, (progress) => {
76
- onProgress?.("Resolving external dataset file...", progress);
77
- });
78
- const stats = fs.statSync(resolvedPath);
79
- this.completeDownload(datasetId, resolvedPath, stats.size);
80
- return resolvedPath;
67
+ if (remotePath) {
68
+ // Direct file download path (repo has raw data files)
69
+ const ext = path.extname(remotePath).substring(1).toLowerCase() || "csv";
70
+ const targetPath = this.getTargetPath(datasetId, ext);
71
+ this.store.registerDownload(datasetId, targetPath, "downloading");
72
+ try {
73
+ await this.hfDownloader.download(datasetId, remotePath, targetPath, (progress) => {
74
+ onProgress?.(`Downloading ${path.basename(remotePath)}...`, progress);
75
+ });
76
+ const resolvedPath = await this.hfDownloader.resolveExternalDataFromMetadata(targetPath, (progress) => {
77
+ onProgress?.("Resolving external dataset file...", progress);
78
+ });
79
+ const stats = fs.statSync(resolvedPath);
80
+ this.completeDownload(datasetId, resolvedPath, stats.size);
81
+ return resolvedPath;
82
+ }
83
+ catch (e) {
84
+ const msg = String(e?.message || e);
85
+ // If auth error, propagate immediately with helpful message
86
+ if (msg.includes("401") || msg.includes("403") || msg.includes("Authentication") || msg.includes("Access denied")) {
87
+ this.failDownload(datasetId, msg);
88
+ throw e;
89
+ }
90
+ // For other download errors, try the fallback
91
+ onProgress?.(`Direct download failed (${msg}), trying datasets library fallback...`);
92
+ }
81
93
  }
82
- catch (e) {
83
- this.failDownload(datasetId, e.message);
84
- throw e;
94
+ // Fallback: Use Python datasets library to download and convert
95
+ // This runs when findBestFile returns null OR when direct download fails (non-auth)
96
+ if (!fs.existsSync(this.getTargetPath(datasetId, "parquet")) || !this.store.getDownloadStatus(datasetId)?.status?.includes("completed")) {
97
+ onProgress?.("Using HuggingFace datasets library to download...");
98
+ const targetPath = this.getTargetPath(datasetId, "parquet");
99
+ this.store.registerDownload(datasetId, targetPath, "downloading");
100
+ try {
101
+ const result = await this.hfDatasetsFallback(datasetId, targetPath, onProgress);
102
+ const stats = fs.statSync(result);
103
+ this.completeDownload(datasetId, result, stats.size);
104
+ return result;
105
+ }
106
+ catch (e) {
107
+ this.failDownload(datasetId, e.message);
108
+ throw e;
109
+ }
85
110
  }
86
111
  }
87
112
  else if (source === "kaggle") {
@@ -159,4 +184,85 @@ export class DataIngestor {
159
184
  const safeId = datasetId.replace(/\//g, "_").replace(/:/g, "_");
160
185
  return path.join(this.rawDataDir, `${safeId}.${extension}`);
161
186
  }
187
+ /**
188
+ * Fallback: Use Python `datasets` library to download a HuggingFace dataset
189
+ * when no raw data files are found in the repo file listing.
190
+ */
191
+ async hfDatasetsFallback(datasetId, targetPath, onProgress) {
192
+ const pyCmd = process.platform === "win32" ? "py" : "python";
193
+ // Resolve the fallback script path
194
+ const homeDir = process.env.HOME || process.env.USERPROFILE || this.projectRoot;
195
+ const dataRoot = path.join(homeDir, ".vesper");
196
+ const scriptCandidates = [
197
+ path.resolve(dataRoot, "python", "hf_fallback.py"),
198
+ path.resolve(this.projectRoot, "python", "hf_fallback.py"),
199
+ path.resolve(this.projectRoot, "..", "src", "python", "hf_fallback.py"),
200
+ path.resolve(this.projectRoot, "..", "python", "hf_fallback.py"),
201
+ ];
202
+ let scriptPath = scriptCandidates.find(p => fs.existsSync(p));
203
+ if (!scriptPath) {
204
+ scriptPath = scriptCandidates[0]; // Will fail with a clear error
205
+ }
206
+ const token = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN || undefined;
207
+ const payload = {
208
+ repo_id: datasetId,
209
+ output_path: targetPath,
210
+ token: token || null,
211
+ max_rows: 500000,
212
+ };
213
+ onProgress?.("Downloading via datasets library (this may take a moment)...", 30);
214
+ return new Promise((resolve, reject) => {
215
+ const proc = spawn(pyCmd, [scriptPath, JSON.stringify(payload)], {
216
+ env: {
217
+ ...process.env,
218
+ PYTHONUTF8: "1",
219
+ PIP_DISABLE_PIP_VERSION_CHECK: "1",
220
+ },
221
+ });
222
+ let stdout = "";
223
+ let stderr = "";
224
+ proc.stdout.on("data", (d) => (stdout += d.toString()));
225
+ proc.stderr.on("data", (d) => {
226
+ const msg = d.toString();
227
+ stderr += msg;
228
+ // Forward progress info
229
+ if (msg.includes("Downloading") || msg.includes("Loading")) {
230
+ onProgress?.(msg.trim().split("\n").pop() || "Downloading...", 50);
231
+ }
232
+ });
233
+ const timer = setTimeout(() => {
234
+ try {
235
+ proc.kill();
236
+ }
237
+ catch { /* no-op */ }
238
+ reject(new Error(`HuggingFace datasets download timed out after 10 minutes for ${datasetId}`));
239
+ }, 600000); // 10 min timeout
240
+ proc.on("close", (code) => {
241
+ clearTimeout(timer);
242
+ if (code !== 0) {
243
+ let errorMsg = stderr || stdout || `Python exited with code ${code}`;
244
+ try {
245
+ const parsed = JSON.parse(stdout);
246
+ if (parsed.error)
247
+ errorMsg = parsed.error;
248
+ }
249
+ catch { /* use stderr */ }
250
+ reject(new Error(`HuggingFace datasets fallback failed: ${errorMsg}`));
251
+ return;
252
+ }
253
+ try {
254
+ const result = JSON.parse(stdout);
255
+ if (!result.ok) {
256
+ reject(new Error(result.error || "Unknown error from HF fallback"));
257
+ return;
258
+ }
259
+ onProgress?.(`Downloaded ${result.rows?.toLocaleString() || "?"} rows (${result.columns?.length || "?"} columns)`, 90);
260
+ resolve(result.path);
261
+ }
262
+ catch {
263
+ reject(new Error(`Failed to parse HF fallback output: ${stdout}`));
264
+ }
265
+ });
266
+ });
267
+ }
162
268
  }
@@ -3,9 +3,14 @@ import asyncio
3
3
  import json
4
4
  import os
5
5
  import sys
6
+ import warnings
6
7
  from pathlib import Path
7
8
  from typing import Any, Dict
8
9
 
10
+ # Suppress noisy HF warnings
11
+ warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
12
+ warnings.filterwarnings("ignore", message=".*legacy.*")
13
+
9
14
  CURRENT_DIR = Path(__file__).resolve().parent
10
15
  if str(CURRENT_DIR) not in sys.path:
11
16
  sys.path.insert(0, str(CURRENT_DIR))
@@ -24,6 +29,11 @@ async def _run_download(args: argparse.Namespace) -> Dict[str, Any]:
24
29
  workers = int(payload.get("workers") or 8)
25
30
  recipes_dir = payload.get("recipes_dir")
26
31
 
32
+ # Auto-set HF token from payload if provided
33
+ token = payload.get("token") or payload.get("hf_token")
34
+ if token:
35
+ os.environ["HF_TOKEN"] = str(token)
36
+
27
37
  downloader = AssetDownloader(output_root=output_root, workers=workers, recipes_dir=recipes_dir)
28
38
 
29
39
  result = await downloader.download_assets(
@@ -66,7 +76,16 @@ def main() -> None:
66
76
 
67
77
  _print({"ok": False, "error": f"Unknown action: {args.action}"})
68
78
  except Exception as e:
69
- _print({"ok": False, "error": str(e)})
79
+ error_msg = str(e)
80
+ # Provide actionable error messages
81
+ if "401" in error_msg or "403" in error_msg or "Unauthorized" in error_msg:
82
+ error_msg = (
83
+ "Authentication required. This dataset may be gated/private. "
84
+ "Use configure_keys tool to set HF_TOKEN, then retry."
85
+ )
86
+ elif "No image column" in error_msg:
87
+ error_msg += " Hint: specify image_column parameter with the name of the column containing images."
88
+ _print({"ok": False, "error": error_msg})
70
89
 
71
90
 
72
91
  if __name__ == "__main__":
@@ -0,0 +1,298 @@
1
+ """
2
+ HuggingFace Datasets Library Fallback Downloader.
3
+
4
+ Used when the HF Hub file listing finds no suitable data files
5
+ (e.g. script-based datasets, gated datasets, datasets that use
6
+ the `datasets` library format).
7
+
8
+ Handles:
9
+ - Legacy script-based datasets (trust_remote_code)
10
+ - Gated/private datasets (token auth)
11
+ - Image datasets (PIL Image columns → stripped for tabular export)
12
+ - Various split formats (DatasetDict, single split)
13
+
14
+ Usage:
15
+ python hf_fallback.py '{"repo_id": "user/dataset", "output_path": "/path/to/output.parquet", "token": "optional", "max_rows": 100000, "split": "train"}'
16
+
17
+ Output: JSON to stdout
18
+ {"ok": true, "path": "/path/to/output.parquet", "rows": 12345, "columns": ["col1", "col2"]}
19
+ {"ok": false, "error": "..."}
20
+ """
21
+ import sys
22
+ import json
23
+ import os
24
+ import warnings
25
+
26
+ # Suppress noisy HF warnings about trust_remote_code etc.
27
+ warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
28
+ warnings.filterwarnings("ignore", message=".*legacy.*")
29
+
30
+
31
+ def _detect_image_columns(ds):
32
+ """Detect columns that contain HF Image features or PIL Image objects."""
33
+ image_cols = []
34
+ features = getattr(ds, "features", None)
35
+ if features:
36
+ for name, feat in features.items():
37
+ feat_cls = feat.__class__.__name__.lower()
38
+ feat_str = str(feat).lower()
39
+ if feat_cls == "image" or "image(" in feat_str:
40
+ image_cols.append(name)
41
+ return image_cols
42
+
43
+
44
+ def _strip_image_columns(ds, image_cols):
45
+ """Remove image columns from dataset so it can be exported to Parquet/CSV.
46
+
47
+ Image columns contain PIL Image objects that can't be serialized to tabular
48
+ formats. We replace them with a placeholder string indicating the column
49
+ was an image column.
50
+ """
51
+ if not image_cols:
52
+ return ds
53
+
54
+ # Remove the image columns entirely for tabular export
55
+ cols_to_keep = [c for c in ds.column_names if c not in image_cols]
56
+ if not cols_to_keep:
57
+ # Dataset is ALL image columns — keep them but cast to path strings if possible
58
+ return ds
59
+
60
+ return ds.select_columns(cols_to_keep)
61
+
62
+
63
+ def _load_dataset_robust(repo_id, token, split):
64
+ """Load a HuggingFace dataset with multiple fallback strategies.
65
+
66
+ Strategy order:
67
+ 1. Normal load with trust_remote_code=True (handles legacy script datasets)
68
+ 2. Load without trust_remote_code (newer datasets that reject it)
69
+ 3. Load with streaming=True then materialize (handles very large datasets)
70
+ """
71
+ from datasets import load_dataset, DatasetDict
72
+
73
+ errors = []
74
+ splits_to_try = [split] if split else ["train", "test", "validation", None]
75
+
76
+ # Strategy 1: Normal load with trust_remote_code
77
+ for s in splits_to_try:
78
+ try:
79
+ kwargs = {"path": repo_id, "trust_remote_code": True}
80
+ if token:
81
+ kwargs["token"] = token
82
+ if s:
83
+ kwargs["split"] = s
84
+ ds = load_dataset(**kwargs)
85
+ return ds, s
86
+ except (ValueError, KeyError):
87
+ continue
88
+ except Exception as e:
89
+ msg = str(e)
90
+ # Auth errors should be raised immediately, not retried
91
+ if any(x in msg for x in ["401", "403", "gated", "Unauthorized", "access"]):
92
+ raise
93
+ if "split" in msg.lower() or "key" in msg.lower():
94
+ continue
95
+ errors.append(f"trust_remote_code=True, split={s}: {msg}")
96
+
97
+ # Strategy 2: Load WITHOUT trust_remote_code (some repos reject it)
98
+ for s in splits_to_try:
99
+ try:
100
+ kwargs = {"path": repo_id}
101
+ if token:
102
+ kwargs["token"] = token
103
+ if s:
104
+ kwargs["split"] = s
105
+ ds = load_dataset(**kwargs)
106
+ return ds, s
107
+ except (ValueError, KeyError):
108
+ continue
109
+ except Exception as e:
110
+ msg = str(e)
111
+ if any(x in msg for x in ["401", "403", "gated", "Unauthorized", "access"]):
112
+ raise
113
+ if "split" in msg.lower() or "key" in msg.lower():
114
+ continue
115
+ errors.append(f"trust_remote_code=False, split={s}: {msg}")
116
+
117
+ # Strategy 3: Streaming fallback (for very large / oddly structured datasets)
118
+ for s in splits_to_try:
119
+ if s is None:
120
+ continue # streaming requires a split
121
+ try:
122
+ kwargs = {"path": repo_id, "streaming": True, "trust_remote_code": True}
123
+ if token:
124
+ kwargs["token"] = token
125
+ if s:
126
+ kwargs["split"] = s
127
+ ds_stream = load_dataset(**kwargs)
128
+ # Materialize from streaming iterator
129
+ from datasets import Dataset as HFDataset
130
+ rows = []
131
+ for i, row in enumerate(ds_stream):
132
+ if i >= 500000:
133
+ break
134
+ rows.append(row)
135
+ if rows:
136
+ ds = HFDataset.from_list(rows)
137
+ return ds, s
138
+ except Exception:
139
+ continue
140
+
141
+ # All strategies failed
142
+ error_summary = "; ".join(errors[:3]) if errors else "No valid configuration found"
143
+ return None, error_summary
144
+
145
+
146
def _emit_failure(message):
    """Print a ``{"ok": false, "error": ...}`` JSON line and exit with status 1."""
    print(json.dumps({"ok": False, "error": message}))
    sys.exit(1)


def _friendly_error_message(error_msg, repo_id):
    """Map a raw loader/export error message to an actionable hint.

    Returns ``error_msg`` unchanged when no known pattern matches.
    """
    lowered = error_msg.lower()
    if "401" in error_msg or "Unauthorized" in error_msg:
        return (
            f"Authentication required for dataset '{repo_id}'. "
            "This dataset may be gated or private. "
            "Use the configure_keys tool to set your HF_TOKEN, then retry."
        )
    if "403" in error_msg or "Forbidden" in error_msg:
        return (
            f"Access denied for dataset '{repo_id}'. "
            "You may need to accept the dataset's usage agreement on huggingface.co, "
            "then set HF_TOKEN via configure_keys tool."
        )
    if "gated" in lowered:
        # Built with f-strings only: calling str.format() on text that
        # already contains the repo id would fail on ids containing braces.
        return (
            f"Dataset '{repo_id}' is gated. "
            f"Visit https://huggingface.co/datasets/{repo_id} to request access, "
            "then set HF_TOKEN via configure_keys tool."
        )
    if "FileNotFoundError" in error_msg or "does not exist" in lowered or "doesn't exist" in lowered:
        return f"Dataset '{repo_id}' not found on HuggingFace. Check the dataset ID."
    if "script" in lowered and "no longer supported" in lowered:
        return (
            f"Dataset '{repo_id}' uses a legacy loading script that is no longer supported "
            "by the current version of the datasets library. "
            "Try: pip install datasets --upgrade, or use an older datasets version."
        )
    return error_msg


def main():
    """Command-line entry point.

    Reads a JSON payload from ``sys.argv[1]`` with keys ``repo_id`` and
    ``output_path`` (required) and ``token``, ``max_rows``, ``split``
    (optional), loads the dataset, drops image columns, and writes it to
    ``output_path`` as Parquet (or CSV when requested, or when the Parquet
    export fails).

    Exactly one JSON object is printed to stdout: the success record
    (``ok``, ``path``, ``rows``, ``columns``, ``split`` and, for image
    datasets, ``image_columns`` and ``note``) or ``{"ok": false, "error"}``.
    Every failure exits with status 1.
    """
    if len(sys.argv) < 2:
        _emit_failure("Missing payload argument")

    try:
        payload = json.loads(sys.argv[1])
    except json.JSONDecodeError as e:
        _emit_failure(f"Invalid JSON payload: {e}")

    if not isinstance(payload, dict):
        _emit_failure("Payload must be a JSON object")

    # `or ""` also covers explicit JSON nulls, which .get()'s default does not.
    repo_id = str(payload.get("repo_id") or "").strip()
    output_path = str(payload.get("output_path") or "").strip()
    token = payload.get("token") or os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or None
    max_rows = payload.get("max_rows", 500000)
    split = payload.get("split")  # None = auto-detect

    if not repo_id:
        _emit_failure("repo_id is required")

    if not output_path:
        _emit_failure("output_path is required")

    try:
        from datasets import DatasetDict
    except ImportError:
        _emit_failure("Python 'datasets' library not installed. Install with: pip install datasets")

    try:
        ds, used_split = _load_dataset_robust(repo_id, token, split)

        if ds is None:
            # On failure the second tuple element carries the error summary.
            _emit_failure(f"Could not load dataset '{repo_id}': {used_split}")

        # Handle DatasetDict (when no split specified)
        if isinstance(ds, DatasetDict):
            available = list(ds.keys())
            if not available:
                _emit_failure(f"Dataset '{repo_id}' contains no splits")
            # Prefer the conventional splits, otherwise take the first one.
            used_split = next(
                (name for name in ("train", "test", "validation") if name in ds),
                available[0],
            )
            ds = ds[used_split]

        # Limit rows if needed
        total_rows = len(ds)
        if max_rows and total_rows > max_rows:
            ds = ds.select(range(max_rows))
            total_rows = max_rows

        # Image columns are dropped for the tabular export and reported in
        # the result so the caller can fetch them separately.
        image_cols = _detect_image_columns(ds)
        has_images = len(image_cols) > 0
        export_ds = _strip_image_columns(ds, image_cols) if has_images else ds

        # Ensure output directory exists. A bare file name has an empty
        # dirname, and os.makedirs("") raises, so only create when non-empty.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        columns = export_ds.column_names

        wants_csv = output_path.endswith(".csv")
        if not wants_csv and not output_path.endswith(".parquet"):
            # Unknown extension: default to parquet
            output_path = output_path + ".parquet"

        try:
            if wants_csv:
                export_ds.to_csv(output_path)
            else:
                export_ds.to_parquet(output_path)
        except Exception as export_err:
            if wants_csv:
                raise
            # If parquet export fails (e.g. complex nested types), try CSV.
            # splitext swaps only the final extension, never a directory
            # name that happens to contain ".parquet".
            csv_path = os.path.splitext(output_path)[0] + ".csv"
            try:
                export_ds.to_csv(csv_path)
                output_path = csv_path
            except Exception:
                raise export_err  # Re-raise original error

        result = {
            "ok": True,
            "path": output_path,
            "rows": total_rows,
            "columns": columns,
            "split": used_split,
        }

        if has_images:
            result["image_columns"] = image_cols
            result["note"] = (
                f"This dataset contains image columns ({', '.join(image_cols)}). "
                "Image data was stripped for tabular export. "
                "Use vesper_download_assets with source='huggingface' to download the actual images."
            )

        print(json.dumps(result))

    except Exception as e:
        # SystemExit raised by _emit_failure above is not an Exception
        # subclass, so it passes through this handler untouched.
        _emit_failure(_friendly_error_message(str(e), repo_id))


if __name__ == "__main__":
    main()