@vespermcp/mcp-server 1.2.19 → 1.2.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,298 @@
1
+ """
2
+ HuggingFace Datasets Library Fallback Downloader.
3
+
4
+ Used when the HF Hub file listing finds no suitable data files
5
+ (e.g. script-based datasets, gated datasets, datasets that use
6
+ the `datasets` library format).
7
+
8
+ Handles:
9
+ - Legacy script-based datasets (trust_remote_code)
10
+ - Gated/private datasets (token auth)
11
+ - Image datasets (PIL Image columns → stripped for tabular export)
12
+ - Various split formats (DatasetDict, single split)
13
+
14
+ Usage:
15
+ python hf_fallback.py '{"repo_id": "user/dataset", "output_path": "/path/to/output.parquet", "token": "optional", "max_rows": 100000, "split": "train"}'
16
+
17
+ Output: JSON to stdout
18
+ {"ok": true, "path": "/path/to/output.parquet", "rows": 12345, "columns": ["col1", "col2"]}
19
+ {"ok": false, "error": "..."}
20
+ """
21
+ import sys
22
+ import json
23
+ import os
24
+ import warnings
25
+
26
+ # Suppress noisy HF warnings about trust_remote_code etc.
27
+ warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
28
+ warnings.filterwarnings("ignore", message=".*legacy.*")
29
+
30
+
31
def _detect_image_columns(ds):
    """Return the names of columns whose declared feature looks like an HF Image.

    Only the ``features`` mapping of *ds* is inspected; no rows are read.
    A column is reported when its feature's class is named ``Image``
    (case-insensitive) or when the feature's string form contains ``image(``,
    which also catches image features nested inside another feature.

    Returns an empty list when *ds* has no ``features`` attribute or the
    mapping is empty.
    """
    declared = getattr(ds, "features", None)
    if not declared:
        return []

    def _is_image_feature(feature):
        # Either the feature itself is an Image, or its repr embeds one.
        class_name = feature.__class__.__name__.lower()
        rendered = str(feature).lower()
        return class_name == "image" or "image(" in rendered

    return [
        column
        for column, feature in declared.items()
        if _is_image_feature(feature)
    ]
42
+
43
+
44
def _strip_image_columns(ds, image_cols):
    """Drop image columns from *ds* so the rest can be exported as a table.

    The image columns are removed outright; no placeholder column is left
    behind. Two cases return *ds* itself, unchanged:

    - *image_cols* is empty, so there is nothing to remove;
    - every column is an image column, so removing them would leave an
      empty table. The caller then exports the image columns as they are.

    Args:
        ds: Object exposing ``column_names`` and either ``select_columns``
            or ``remove_columns`` (a HuggingFace ``Dataset``).
        image_cols: Names of the columns to drop.

    Returns:
        A dataset without the image columns, or *ds* itself in the two
        cases above.
    """
    if not image_cols:
        return ds

    to_drop = set(image_cols)
    cols_to_keep = [c for c in ds.column_names if c not in to_drop]
    if not cols_to_keep:
        # Dataset is ALL image columns: keep it intact rather than export
        # a table with no columns.
        return ds

    if hasattr(ds, "select_columns"):
        return ds.select_columns(cols_to_keep)

    # Older `datasets` releases lack select_columns; remove_columns gives the
    # same result when handed the complementary set of names.
    return ds.remove_columns([c for c in ds.column_names if c in to_drop])
61
+
62
+
63
def _load_dataset_robust(repo_id, token, split):
    """Load a HuggingFace dataset, trying progressively looser strategies.

    Strategy order:
      1. ``load_dataset(..., trust_remote_code=True)`` (legacy script datasets)
      2. ``load_dataset(...)`` without ``trust_remote_code``
      3. ``load_dataset(..., streaming=True)``, materializing at most
         500000 rows into an in-memory ``Dataset``

    Each strategy is tried for every candidate split: only *split* when it is
    given, otherwise ``train``, ``test``, ``validation`` and finally no split
    at all. Streaming skips the "no split" candidate.

    Args:
        repo_id: Dataset repository id, passed as ``path``.
        token: Optional access token; only forwarded when truthy.
        split: Optional split name; falsy means auto-detect.

    Returns:
        ``(dataset, split_used)`` on success, where ``split_used`` is ``None``
        when the dataset was loaded without a split. On failure returns
        ``(None, error_summary)`` with a human-readable summary of up to
        three failed attempts.

    Raises:
        Exception: The loader's own exception is re-raised unchanged when its
            message looks like an authentication/authorization failure during
            strategies 1 or 2, so the caller can turn it into a hint.
    """
    from itertools import islice

    from datasets import load_dataset

    auth_markers = ("401", "403", "gated", "Unauthorized", "access")
    stream_row_cap = 500000
    splits_to_try = [split] if split else ["train", "test", "validation", None]

    errors = []   # failures that look like genuine problems
    skipped = []  # split/config mismatches; only reported if nothing better exists

    def build_kwargs(split_name, **extra):
        kwargs = {"path": repo_id, **extra}
        if token:
            kwargs["token"] = token
        if split_name:
            kwargs["split"] = split_name
        return kwargs

    # Strategies 1 and 2 differ only in whether trust_remote_code is passed.
    # NOTE(security): trust_remote_code=True lets the dataset repository run
    # its own loading script on this machine; only use with trusted repos.
    for trust in (True, False):
        extra = {"trust_remote_code": True} if trust else {}
        for s in splits_to_try:
            label = f"trust_remote_code={trust}, split={s}"
            try:
                return load_dataset(**build_kwargs(s, **extra)), s
            except (ValueError, KeyError) as e:
                # Usually "unknown split", but may also carry an actionable
                # message (e.g. a missing config name), so keep it.
                skipped.append(f"{label}: {e}")
            except Exception as e:
                msg = str(e)
                # Auth errors should be raised immediately, not retried
                if any(marker in msg for marker in auth_markers):
                    raise
                lowered = msg.lower()
                if "split" in lowered or "key" in lowered:
                    skipped.append(f"{label}: {msg}")
                    continue
                errors.append(f"{label}: {msg}")

    # Strategy 3: streaming fallback (for very large / oddly structured datasets)
    for s in splits_to_try:
        if s is None:
            continue  # streaming requires a split
        try:
            stream = load_dataset(
                **build_kwargs(s, streaming=True, trust_remote_code=True)
            )
            rows = list(islice(stream, stream_row_cap))
            if rows:
                from datasets import Dataset as HFDataset

                return HFDataset.from_list(rows), s
        except Exception as e:
            # Best-effort fallback: remember why it failed and move on.
            skipped.append(f"streaming, split={s}: {e}")

    # All strategies failed
    details = errors or skipped
    error_summary = "; ".join(details[:3]) if details else "No valid configuration found"
    return None, error_summary
144
+
145
+
146
def _fail(message):
    """Print a ``{"ok": false, "error": ...}`` envelope to stdout and exit 1."""
    print(json.dumps({"ok": False, "error": message}))
    sys.exit(1)


def _payload_text(payload, key):
    """Return ``payload[key]`` stripped, or ``""`` when missing or not a string."""
    value = payload.get(key)
    return value.strip() if isinstance(value, str) else ""


def _friendly_error(repo_id, error_msg):
    """Map a raw loader/export error message to an actionable hint.

    Returns *error_msg* unchanged when no known pattern matches.
    """
    lowered = error_msg.lower()
    if "401" in error_msg or "Unauthorized" in error_msg:
        return (
            f"Authentication required for dataset '{repo_id}'. "
            "This dataset may be gated or private. "
            "Use the configure_keys tool to set your HF_TOKEN, then retry."
        )
    if "403" in error_msg or "Forbidden" in error_msg:
        return (
            f"Access denied for dataset '{repo_id}'. "
            "You may need to accept the dataset's usage agreement on huggingface.co, "
            "then set HF_TOKEN via configure_keys tool."
        )
    if "gated" in lowered:
        return (
            f"Dataset '{repo_id}' is gated. "
            f"Visit https://huggingface.co/datasets/{repo_id} to request access, "
            "then set HF_TOKEN via configure_keys tool."
        )
    if (
        "FileNotFoundError" in error_msg
        or "does not exist" in lowered
        or "doesn't exist" in lowered
    ):
        return f"Dataset '{repo_id}' not found on HuggingFace. Check the dataset ID."
    if "script" in lowered and "no longer supported" in lowered:
        return (
            f"Dataset '{repo_id}' uses a legacy loading script that is no longer supported "
            "by the current version of the datasets library. "
            "Try: pip install datasets --upgrade, or use an older datasets version."
        )
    return error_msg


def _export_dataset(export_ds, output_path):
    """Write *export_ds* to disk and return the path that was actually written.

    ``.csv`` paths are written as CSV. Anything else is written as Parquet,
    with ``.parquet`` appended when the path has a different suffix. If the
    Parquet export fails, a CSV export next to it is attempted; when that
    fails too, the original Parquet error is raised.
    """
    if output_path.endswith(".csv"):
        export_ds.to_csv(output_path)
        return output_path

    if not output_path.endswith(".parquet"):
        output_path = output_path + ".parquet"
    try:
        export_ds.to_parquet(output_path)
        return output_path
    except Exception as export_err:
        # If parquet export fails (e.g. complex nested types), try CSV.
        # Swap only the trailing suffix, never a ".parquet" inside a directory name.
        csv_path = output_path[: -len(".parquet")] + ".csv"
        try:
            export_ds.to_csv(csv_path)
        except Exception:
            raise export_err  # Re-raise original error
        return csv_path


def main():
    """CLI entry point: load a HF dataset and export it as Parquet/CSV.

    Reads a JSON object from ``sys.argv[1]`` with keys ``repo_id`` and
    ``output_path`` (required) plus ``token``, ``max_rows`` and ``split``
    (optional). The token falls back to the ``HF_TOKEN`` /
    ``HUGGINGFACE_TOKEN`` environment variables. ``max_rows`` defaults to
    500000; ``0`` or ``null`` disables the limit.

    Prints exactly one JSON object to stdout. On success it carries
    ``ok``, ``path``, ``rows``, ``columns`` and ``split``, plus
    ``image_columns`` and ``note`` when image columns were detected.
    On any failure it prints ``{"ok": false, "error": ...}`` and exits with
    status 1.
    """
    if len(sys.argv) < 2:
        _fail("Missing payload argument")

    try:
        payload = json.loads(sys.argv[1])
    except json.JSONDecodeError as e:
        _fail(f"Invalid JSON payload: {e}")

    if not isinstance(payload, dict):
        _fail("Invalid JSON payload: expected a JSON object")

    repo_id = _payload_text(payload, "repo_id")
    output_path = _payload_text(payload, "output_path")
    token = (
        payload.get("token")
        or os.environ.get("HF_TOKEN")
        or os.environ.get("HUGGINGFACE_TOKEN")
        or None
    )
    max_rows = payload.get("max_rows", 500000)
    split = payload.get("split")  # None = auto-detect

    if not repo_id:
        _fail("repo_id is required")

    if not output_path:
        _fail("output_path is required")

    if max_rows is not None and (
        isinstance(max_rows, bool) or not isinstance(max_rows, int) or max_rows < 0
    ):
        _fail("max_rows must be a non-negative integer")

    try:
        from datasets import load_dataset  # noqa: F401  (availability check only)
    except ImportError:
        _fail("Python 'datasets' library not installed. Install with: pip install datasets")

    try:
        ds, used_split = _load_dataset_robust(repo_id, token, split)

        if ds is None:
            # On failure the second element is the error summary, not a split.
            _fail(f"Could not load dataset '{repo_id}': {used_split}")

        # Handle DatasetDict (when no split specified)
        from datasets import DatasetDict

        if isinstance(ds, DatasetDict):
            if len(ds) == 0:
                _fail(f"Could not load dataset '{repo_id}': dataset contains no splits")
            used_split = next(
                (name for name in ("train", "test", "validation") if name in ds),
                None,
            )
            if used_split is None:
                # Just pick the first available split
                used_split = next(iter(ds))
            ds = ds[used_split]

        # Limit rows if needed
        total_rows = len(ds)
        if max_rows and total_rows > max_rows:
            ds = ds.select(range(max_rows))
            total_rows = max_rows

        # Detect and drop image columns so the rest exports cleanly as a table
        image_cols = _detect_image_columns(ds)
        export_ds = _strip_image_columns(ds, image_cols) if image_cols else ds
        columns = export_ds.column_names

        # Ensure output directory exists. A bare filename has an empty
        # dirname, and os.makedirs("") raises FileNotFoundError.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        output_path = _export_dataset(export_ds, output_path)

        result = {
            "ok": True,
            "path": output_path,
            "rows": total_rows,
            "columns": columns,
            "split": used_split,
        }

        if image_cols:
            result["image_columns"] = image_cols
            stripped = [c for c in image_cols if c not in columns]
            if stripped:
                result["note"] = (
                    f"This dataset contains image columns ({', '.join(image_cols)}). "
                    "Image data was stripped for tabular export. "
                    "Use vesper_download_assets with source='huggingface' to download the actual images."
                )
            else:
                # Nothing was removed: the dataset consists solely of image columns.
                result["note"] = (
                    f"This dataset contains only image columns ({', '.join(image_cols)}). "
                    "They were kept in the export because no other columns exist. "
                    "Use vesper_download_assets with source='huggingface' to download the actual images."
                )

        print(json.dumps(result))

    except Exception as e:
        # Provide helpful, actionable hints
        _fail(_friendly_error(repo_id, str(e)))


if __name__ == "__main__":
    main()
@@ -9,6 +9,7 @@ import tempfile
9
9
  from dataclasses import dataclass
10
10
  from pathlib import Path
11
11
  from typing import Any, Awaitable, Callable, Dict, Iterable, List, Optional
12
+ from urllib.parse import urlparse
12
13
 
13
14
  import aiohttp
14
15
 
@@ -25,7 +26,7 @@ except Exception: # pragma: no cover
25
26
  wds = None
26
27
 
27
28
 
28
- IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff"}
29
+ IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff", ".tif", ".svg"}
29
30
 
30
31
 
31
32
  @dataclass
@@ -98,22 +99,88 @@ class AssetDownloader:
98
99
 
99
100
  @staticmethod
100
101
  def find_image_column(dataset: Any) -> Optional[str]:
102
+ """Auto-detect the image column in a HuggingFace dataset.
103
+
104
+ Detection strategy (in priority order):
105
+ 1. HF Feature type: columns with Image() feature type
106
+ 2. Known column names: 'image', 'img', 'photo', 'image_url', etc.
107
+ 3. URL pattern detection: columns containing image URLs (http(s)://...jpg)
108
+ 4. Path pattern detection: columns with file paths ending in image extensions
109
+ """
110
+ # Strategy 1: Check HF Feature types (most reliable)
101
111
  features = getattr(dataset, "features", None)
102
112
  if features:
103
113
  for name, feature in features.items():
104
- feature_name = feature.__class__.__name__.lower()
105
- feature_repr = str(feature).lower()
106
- if feature_name == "image" or "image(" in feature_repr:
107
- return str(name)
108
- lower = str(name).lower()
109
- if lower in {"image", "images", "img", "image_path", "image_url", "url"}:
114
+ feat_cls = feature.__class__.__name__.lower()
115
+ feat_str = str(feature).lower()
116
+ if feat_cls == "image" or "image(" in feat_str:
110
117
  return str(name)
111
118
 
112
- candidate_columns = ["image", "images", "img", "image_path", "image_url", "url", "file_name", "filepath"]
119
+ # Strategy 2: Check known column names
113
120
  cols = getattr(dataset, "column_names", []) or []
114
- for c in candidate_columns:
121
+
122
+ # Exact match first (highest priority names)
123
+ priority_exact = ["image", "img", "photo", "picture", "images"]
124
+ for c in priority_exact:
115
125
  if c in cols:
116
126
  return c
127
+
128
+ # Partial match (column names containing image-related keywords)
129
+ priority_partial = [
130
+ "image_path", "image_url", "img_path", "img_url",
131
+ "image_file", "file_name", "filepath", "filename",
132
+ "photo_url", "picture_url", "thumbnail",
133
+ "url", "path", "file",
134
+ ]
135
+ for target in priority_partial:
136
+ for c in cols:
137
+ if c.lower() == target:
138
+ return c
139
+
140
+ # Strategy 3: Sample values to detect URL/path patterns
141
+ try:
142
+ sample_size = min(5, len(dataset)) if hasattr(dataset, "__len__") else 5
143
+ if sample_size > 0:
144
+ for c in cols:
145
+ is_image_col = False
146
+ for i in range(sample_size):
147
+ try:
148
+ val = dataset[i][c]
149
+ except Exception:
150
+ break
151
+
152
+ if val is None:
153
+ continue
154
+
155
+ # PIL Image object
156
+ if hasattr(val, "save") and hasattr(val, "size"):
157
+ is_image_col = True
158
+ break
159
+
160
+ # Dict with image data
161
+ if isinstance(val, dict) and any(k in val for k in ("bytes", "path", "url")):
162
+ is_image_col = True
163
+ break
164
+
165
+ # String: URL or file path
166
+ if isinstance(val, str):
167
+ val_lower = val.lower()
168
+ # Check for image URLs
169
+ if val_lower.startswith(("http://", "https://")) and any(
170
+ ext in val_lower.split("?")[0] for ext in IMAGE_EXTENSIONS
171
+ ):
172
+ is_image_col = True
173
+ break
174
+ # Check for file paths with image extensions
175
+ if any(val_lower.endswith(ext) for ext in IMAGE_EXTENSIONS):
176
+ is_image_col = True
177
+ break
178
+
179
+ if is_image_col:
180
+ return c
181
+ except Exception:
182
+ pass
183
+
117
184
  return None
118
185
 
119
186
  async def download_assets(
@@ -214,20 +281,73 @@ class AssetDownloader:
214
281
  image_column: Optional[str],
215
282
  ) -> Dict[str, int]:
216
283
  from datasets import load_dataset # validated in download_assets()
284
+ import warnings
285
+ warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
217
286
 
218
287
  await self._emit("start", {"source": "huggingface", "repo_id": repo_id})
219
288
 
220
- try:
221
- ds = load_dataset(repo_id, split="train")
222
- except Exception:
223
- dd = load_dataset(repo_id)
224
- first_split = list(dd.keys())[0]
225
- ds = dd[first_split]
289
+ token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or None
290
+
291
+ # Try loading with multiple strategies
292
+ ds = None
293
+ load_errors = []
294
+
295
+ for trust_rc in [True, False]:
296
+ for split_name in ["train", "test", "validation"]:
297
+ try:
298
+ kwargs = {"path": repo_id, "split": split_name}
299
+ if trust_rc:
300
+ kwargs["trust_remote_code"] = True
301
+ if token:
302
+ kwargs["token"] = token
303
+ ds = load_dataset(**kwargs)
304
+ break
305
+ except Exception as e:
306
+ msg = str(e)
307
+ # Immediately raise auth errors
308
+ if any(x in msg for x in ["401", "403", "gated", "Unauthorized"]):
309
+ raise RuntimeError(
310
+ f"Authentication required for '{repo_id}'. "
311
+ "This dataset may be gated or private. "
312
+ "Use the configure_keys tool to set HF_TOKEN, then retry."
313
+ )
314
+ load_errors.append(msg)
315
+ continue
316
+ if ds is not None:
317
+ break
318
+
319
+ # Fallback: load without split
320
+ if ds is None:
321
+ try:
322
+ kwargs = {"path": repo_id, "trust_remote_code": True}
323
+ if token:
324
+ kwargs["token"] = token
325
+ dd = load_dataset(**kwargs)
326
+ from datasets import DatasetDict
327
+ if isinstance(dd, DatasetDict):
328
+ first_split = list(dd.keys())[0]
329
+ ds = dd[first_split]
330
+ else:
331
+ ds = dd
332
+ except Exception as e:
333
+ msg = str(e)
334
+ if any(x in msg for x in ["401", "403", "gated", "Unauthorized"]):
335
+ raise RuntimeError(
336
+ f"Authentication required for '{repo_id}'. "
337
+ "Use the configure_keys tool to set HF_TOKEN, then retry."
338
+ )
339
+ combined = "; ".join(load_errors[:3])
340
+ raise RuntimeError(
341
+ f"Failed to load HuggingFace dataset '{repo_id}': {msg}. "
342
+ f"Previous attempts: {combined}"
343
+ )
226
344
 
227
345
  col = image_column or self.find_image_column(ds)
228
346
  if not col:
229
347
  raise RuntimeError(
230
- "No image column detected in HuggingFace dataset. Provide image_column or use fallback strategy with URL column."
348
+ f"No image column detected in HuggingFace dataset '{repo_id}'. "
349
+ "Available columns: " + ", ".join(getattr(ds, "column_names", [])) + ". "
350
+ "Provide image_column parameter explicitly."
231
351
  )
232
352
 
233
353
  total = len(ds) if hasattr(ds, "__len__") else 0
@@ -236,33 +356,63 @@ class AssetDownloader:
236
356
  downloaded = 0
237
357
  failed = 0
238
358
 
239
- with metadata_file.open("w", encoding="utf-8") as mf, errors_file.open("w", encoding="utf-8") as ef:
240
- for idx, row in enumerate(ds):
241
- if max_items and idx >= max_items:
242
- break
243
- try:
244
- out_name = f"{idx:08d}.jpg"
245
- out_path = images_dir / out_name
246
- self._save_image_value(row.get(col), out_path)
247
-
248
- record = {
249
- "dataset_id": dataset_id,
250
- "index": idx,
251
- "image_path": str(out_path),
252
- "source": "huggingface",
253
- "repo_id": repo_id,
254
- }
255
- mf.write(json.dumps(record, ensure_ascii=False) + "\n")
256
- downloaded += 1
257
- if downloaded % 50 == 0:
258
- await self._emit("progress", {"downloaded": downloaded, "failed": failed, "target": target})
259
- except Exception as e:
260
- failed += 1
261
- ef.write(json.dumps({"index": idx, "error": str(e)}, ensure_ascii=False) + "\n")
359
+ # Create an aiohttp session for URL-based images
360
+ session = None
361
+
362
+ try:
363
+ with metadata_file.open("w", encoding="utf-8") as mf, errors_file.open("w", encoding="utf-8") as ef:
364
+ for idx, row in enumerate(ds):
365
+ if max_items and idx >= max_items:
366
+ break
367
+ try:
368
+ out_name = f"{idx:08d}.jpg"
369
+ out_path = images_dir / out_name
370
+ value = row.get(col)
371
+
372
+ # Handle URL-based images inline
373
+ if isinstance(value, dict) and value.get("url") and not value.get("bytes") and not value.get("path"):
374
+ url = value["url"]
375
+ if session is None:
376
+ session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=60))
377
+ await self._download_image_from_url(session, url, out_path)
378
+ elif isinstance(value, str) and value.startswith(("http://", "https://")):
379
+ if session is None:
380
+ session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=60))
381
+ await self._download_image_from_url(session, value, out_path)
382
+ else:
383
+ self._save_image_value(value, out_path)
384
+
385
+ record = {
386
+ "dataset_id": dataset_id,
387
+ "index": idx,
388
+ "image_path": str(out_path),
389
+ "source": "huggingface",
390
+ "repo_id": repo_id,
391
+ }
392
+ mf.write(json.dumps(record, ensure_ascii=False) + "\n")
393
+ downloaded += 1
394
+ if downloaded % 50 == 0:
395
+ await self._emit("progress", {"downloaded": downloaded, "failed": failed, "target": target})
396
+ except Exception as e:
397
+ failed += 1
398
+ ef.write(json.dumps({"index": idx, "error": str(e)}, ensure_ascii=False) + "\n")
399
+ finally:
400
+ if session is not None:
401
+ await session.close()
262
402
 
263
403
  await self._emit("done", {"downloaded": downloaded, "failed": failed})
264
404
  return {"downloaded": downloaded, "failed": failed}
265
405
 
406
async def _download_image_from_url(self, session: aiohttp.ClientSession, url: str, out_path: Path) -> None:
    """Fetch *url* with *session* and write the response body to *out_path*.

    Raises:
        RuntimeError: If the HTTP status is not 200, or the body is empty.
    """
    async with session.get(url) as resp:
        status = resp.status
        if status == 200:
            body = await resp.read()
        else:
            raise RuntimeError(f"HTTP {status} downloading {url}")

        if body:
            out_path.write_bytes(body)
        else:
            raise RuntimeError(f"Empty response from {url}")
415
+
266
416
  async def _download_kaggle(
267
417
  self,
268
418
  kaggle_ref: str,
@@ -393,30 +543,66 @@ class AssetDownloader:
393
543
 
394
544
  @staticmethod
395
545
  def _save_image_value(value: Any, out_path: Path) -> None:
546
+ """Save an image value to disk. Handles multiple image representations:
547
+ - PIL Image objects (have .save method)
548
+ - dict with 'bytes' key (raw image bytes)
549
+ - dict with 'path' key (local file path)
550
+ - bytes/bytearray (raw image data)
551
+ - str (local file path)
552
+ """
396
553
  if value is None:
397
554
  raise ValueError("empty image value")
398
555
 
399
- if hasattr(value, "save"):
556
+ # PIL Image object
557
+ if hasattr(value, "save") and hasattr(value, "size"):
400
558
  value.save(out_path)
401
559
  return
402
560
 
561
+ # Raw bytes
562
+ if isinstance(value, (bytes, bytearray)):
563
+ out_path.write_bytes(value)
564
+ return
565
+
566
+ # Dict with image data
403
567
  if isinstance(value, dict):
404
568
  if value.get("bytes"):
405
- out_path.write_bytes(value["bytes"])
406
- return
407
- if value.get("path") and os.path.exists(value["path"]):
408
- shutil.copy2(value["path"], out_path)
569
+ raw = value["bytes"]
570
+ if isinstance(raw, (bytes, bytearray)):
571
+ out_path.write_bytes(raw)
572
+ else:
573
+ # Could be a list of ints
574
+ out_path.write_bytes(bytes(raw))
409
575
  return
576
+ if value.get("path"):
577
+ p = str(value["path"])
578
+ if os.path.exists(p):
579
+ shutil.copy2(p, out_path)
580
+ return
581
+ raise ValueError(f"Image path not found: {p}")
410
582
  if value.get("url"):
411
- raise ValueError("image URL requires URL downloader fallback")
583
+ raise ValueError("image URL detected — use async URL downloader")
412
584
 
585
+ # String: local file path
413
586
  if isinstance(value, str):
414
587
  if os.path.exists(value):
415
588
  shutil.copy2(value, out_path)
416
589
  return
417
- raise ValueError("string image value is not a local path")
590
+ if value.startswith(("http://", "https://")):
591
+ raise ValueError("image URL detected — use async URL downloader")
592
+ raise ValueError(f"Image path not found: {value}")
593
+
594
+ # numpy array (common in some datasets)
595
+ try:
596
+ import numpy as np
597
+ if isinstance(value, np.ndarray):
598
+ from PIL import Image
599
+ img = Image.fromarray(value)
600
+ img.save(out_path)
601
+ return
602
+ except (ImportError, Exception):
603
+ pass
418
604
 
419
- raise ValueError(f"Unsupported image value type: {type(value)}")
605
+ raise ValueError(f"Unsupported image value type: {type(value).__name__}")
420
606
 
421
607
  async def _write_webdataset(self, dataset_dir: Path, images_dir: Path, metadata_file: Path) -> None:
422
608
  """Write a webdataset-compatible tar archive.