@vespermcp/mcp-server 1.2.20 → 1.2.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +19 -6
- package/build/ingestion/hf-downloader.js +12 -2
- package/build/ingestion/ingestor.js +12 -5
- package/build/python/asset_downloader_engine.py +20 -1
- package/build/python/hf_fallback.py +196 -45
- package/build/python/vesper/core/asset_downloader.py +233 -47
- package/package.json +1 -1
- package/src/python/asset_downloader_engine.py +20 -1
- package/src/python/hf_fallback.py +196 -45
- package/src/python/vesper/core/asset_downloader.py +233 -47
|
@@ -5,6 +5,12 @@ Used when the HF Hub file listing finds no suitable data files
|
|
|
5
5
|
(e.g. script-based datasets, gated datasets, datasets that use
|
|
6
6
|
the `datasets` library format).
|
|
7
7
|
|
|
8
|
+
Handles:
|
|
9
|
+
- Legacy script-based datasets (trust_remote_code)
|
|
10
|
+
- Gated/private datasets (token auth)
|
|
11
|
+
- Image datasets (PIL Image columns → stripped for tabular export)
|
|
12
|
+
- Various split formats (DatasetDict, single split)
|
|
13
|
+
|
|
8
14
|
Usage:
|
|
9
15
|
python hf_fallback.py '{"repo_id": "user/dataset", "output_path": "/path/to/output.parquet", "token": "optional", "max_rows": 100000, "split": "train"}'
|
|
10
16
|
|
|
@@ -15,6 +21,127 @@ Output: JSON to stdout
|
|
|
15
21
|
import sys
|
|
16
22
|
import json
|
|
17
23
|
import os
|
|
24
|
+
import warnings
|
|
25
|
+
|
|
26
|
+
# Suppress noisy HF warnings about trust_remote_code etc.
|
|
27
|
+
warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
|
|
28
|
+
warnings.filterwarnings("ignore", message=".*legacy.*")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _detect_image_columns(ds):
    """List the columns of ``ds`` that are declared as HF ``Image`` features.

    Detection relies solely on the ``features`` mapping of the dataset; no row
    is read. A column qualifies when its feature class is named ``Image``
    (compared case-insensitively) or when the lower-cased string form of the
    feature contains ``image(``.

    Returns an empty list when ``ds`` exposes no ``features`` attribute or the
    attribute is empty.
    """
    schema = getattr(ds, "features", None)
    if not schema:
        return []

    def _is_image_feature(feature):
        class_name = feature.__class__.__name__.lower()
        rendered = str(feature).lower()
        return class_name == "image" or "image(" in rendered

    return [column for column, feature in schema.items() if _is_image_feature(feature)]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _strip_image_columns(ds, image_cols):
    """Drop image columns from ``ds`` so the rest can be exported as Parquet/CSV.

    The listed columns are removed outright; they are not replaced by
    placeholder values. ``ds`` is returned unchanged in two cases:

    - ``image_cols`` is empty, so there is nothing to strip;
    - every column of ``ds`` is an image column, because removing them all
      would leave nothing to export. The image columns are then left in place
      untouched (no conversion is attempted here).

    Args:
        ds: Dataset-like object exposing ``column_names`` and either
            ``select_columns`` or ``remove_columns``.
        image_cols: Names of the columns to drop. Names that are not columns
            of ``ds`` are ignored.

    Returns:
        ``ds`` itself, or a new dataset restricted to the non-image columns.
    """
    if not image_cols:
        return ds

    # Set lookup keeps the filtering linear in the number of columns.
    image_set = set(image_cols)
    cols_to_keep = [c for c in ds.column_names if c not in image_set]
    if not cols_to_keep:
        # Dataset is ALL image columns: hand it back as-is rather than
        # producing an empty table.
        return ds

    select_columns = getattr(ds, "select_columns", None)
    if select_columns is not None:
        return select_columns(cols_to_keep)

    # Fallback for dataset objects that lack ``select_columns``: dropping the
    # image columns that are actually present yields the same result.
    present_image_cols = [c for c in ds.column_names if c in image_set]
    return ds.remove_columns(present_image_cols)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# Substrings that mark an exception message as an authentication or
# authorization failure; such errors are re-raised instead of retried.
_AUTH_ERROR_MARKERS = ("401", "403", "gated", "Unauthorized", "access")

# Upper bound on the number of rows materialized from a streaming dataset.
_STREAMING_ROW_CAP = 500000


def _build_load_kwargs(repo_id, token, split, **extra):
    """Assemble ``load_dataset`` keyword arguments, omitting an empty token/split."""
    kwargs = {"path": repo_id}
    kwargs.update(extra)
    if token:
        kwargs["token"] = token
    if split:
        kwargs["split"] = split
    return kwargs


def _try_eager_load(load_dataset, repo_id, token, splits, errors, trust_remote_code):
    """Try a regular (non-streaming) load for each split, in order.

    Returns ``(dataset, split)`` for the first split that loads, or ``None``
    when none did. Failures that look like a missing split are skipped
    silently, auth failures are re-raised, anything else is appended to
    ``errors``.
    """
    extra = {"trust_remote_code": True} if trust_remote_code else {}
    for s in splits:
        try:
            return load_dataset(**_build_load_kwargs(repo_id, token, s, **extra)), s
        except (ValueError, KeyError):
            # Treated as "this split does not exist": move on to the next one.
            continue
        except Exception as e:
            msg = str(e)
            # Auth errors should be raised immediately, not retried.
            if any(marker in msg for marker in _AUTH_ERROR_MARKERS):
                raise
            lowered = msg.lower()
            if "split" in lowered or "key" in lowered:
                continue
            errors.append(f"trust_remote_code={trust_remote_code}, split={s}: {msg}")
    return None


def _try_streaming_load(load_dataset, repo_id, token, splits, errors):
    """Stream each named split and materialize up to ``_STREAMING_ROW_CAP`` rows.

    Returns ``(dataset, split)`` for the first split that yields rows, or
    ``None``. Every failure (including an empty stream) is appended to
    ``errors`` so it can show up in the final error summary.
    """
    from itertools import islice

    for s in splits:
        if s is None:
            continue  # streaming requires a split
        try:
            from datasets import Dataset as HFDataset

            stream = load_dataset(
                **_build_load_kwargs(
                    repo_id, token, s, streaming=True, trust_remote_code=True
                )
            )
            rows = list(islice(stream, _STREAMING_ROW_CAP))
            if rows:
                return HFDataset.from_list(rows), s
            errors.append(f"streaming, split={s}: no rows returned")
        except Exception as e:
            # Best-effort last resort: never abort here, but keep the reason.
            errors.append(f"streaming, split={s}: {e}")
    return None


def _load_dataset_robust(repo_id, token, split):
    """Load a HuggingFace dataset with multiple fallback strategies.

    Strategy order:
    1. Normal load with trust_remote_code=True (handles legacy script datasets)
    2. Load without trust_remote_code (newer datasets that reject it)
    3. Load with streaming=True then materialize (handles very large datasets)

    When ``split`` is falsy, "train", "test", "validation" and finally a
    split-less load are attempted in that order.

    Args:
        repo_id: Dataset repository id passed to ``load_dataset`` as ``path``.
        token: Optional access token; omitted from the call when falsy.
        split: Optional split name to load.

    Returns:
        ``(dataset, used_split)`` on success, where ``used_split`` is ``None``
        for a split-less load. On failure ``(None, error_summary)``, where
        ``error_summary`` is a string joining up to three recorded errors.

    Raises:
        Exception: during strategies 1 and 2, any exception that is not a
            ``ValueError``/``KeyError`` and whose message contains an
            authentication marker is re-raised unchanged.
    """
    from datasets import load_dataset

    errors = []
    splits_to_try = [split] if split else ["train", "test", "validation", None]

    # Strategies 1 and 2: eager load, first with then without trust_remote_code.
    for trust_remote_code in (True, False):
        loaded = _try_eager_load(
            load_dataset, repo_id, token, splits_to_try, errors, trust_remote_code
        )
        if loaded is not None:
            return loaded

    # Strategy 3: streaming fallback (for very large / oddly structured datasets).
    loaded = _try_streaming_load(load_dataset, repo_id, token, splits_to_try, errors)
    if loaded is not None:
        return loaded

    # All strategies failed.
    error_summary = "; ".join(errors[:3]) if errors else "No valid configuration found"
    return None, error_summary
|
|
144
|
+
|
|
18
145
|
|
|
19
146
|
def main():
|
|
20
147
|
if len(sys.argv) < 2:
|
|
@@ -53,37 +180,10 @@ def main():
|
|
|
53
180
|
pl = None
|
|
54
181
|
|
|
55
182
|
try:
|
|
56
|
-
|
|
57
|
-
# If split is not specified, try common ones
|
|
58
|
-
splits_to_try = [split] if split else ["train", "test", "validation", None]
|
|
59
|
-
|
|
60
|
-
ds = None
|
|
61
|
-
used_split = None
|
|
62
|
-
|
|
63
|
-
for s in splits_to_try:
|
|
64
|
-
try:
|
|
65
|
-
kwargs = {
|
|
66
|
-
"path": repo_id,
|
|
67
|
-
"trust_remote_code": True,
|
|
68
|
-
}
|
|
69
|
-
if token:
|
|
70
|
-
kwargs["token"] = token
|
|
71
|
-
if s:
|
|
72
|
-
kwargs["split"] = s
|
|
73
|
-
|
|
74
|
-
ds = load_dataset(**kwargs)
|
|
75
|
-
used_split = s
|
|
76
|
-
break
|
|
77
|
-
except (ValueError, KeyError):
|
|
78
|
-
# Split doesn't exist, try next
|
|
79
|
-
continue
|
|
80
|
-
except Exception as e:
|
|
81
|
-
if "split" in str(e).lower() or "key" in str(e).lower():
|
|
82
|
-
continue
|
|
83
|
-
raise
|
|
183
|
+
ds, used_split = _load_dataset_robust(repo_id, token, split)
|
|
84
184
|
|
|
85
185
|
if ds is None:
|
|
86
|
-
print(json.dumps({"ok": False, "error": f"Could not load dataset '{repo_id}'
|
|
186
|
+
print(json.dumps({"ok": False, "error": f"Could not load dataset '{repo_id}': {used_split}"}))
|
|
87
187
|
sys.exit(1)
|
|
88
188
|
|
|
89
189
|
# Handle DatasetDict (when no split specified)
|
|
@@ -107,37 +207,88 @@ def main():
|
|
|
107
207
|
ds = ds.select(range(max_rows))
|
|
108
208
|
total_rows = max_rows
|
|
109
209
|
|
|
210
|
+
# Detect and handle image columns (PIL Image objects can't be exported to Parquet)
|
|
211
|
+
image_cols = _detect_image_columns(ds)
|
|
212
|
+
has_images = len(image_cols) > 0
|
|
213
|
+
|
|
214
|
+
if has_images:
|
|
215
|
+
# Strip image columns for tabular export, note them in output
|
|
216
|
+
export_ds = _strip_image_columns(ds, image_cols)
|
|
217
|
+
else:
|
|
218
|
+
export_ds = ds
|
|
219
|
+
|
|
110
220
|
# Ensure output directory exists
|
|
111
221
|
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
|
112
222
|
|
|
113
223
|
# Export to parquet
|
|
114
|
-
columns =
|
|
224
|
+
columns = export_ds.column_names
|
|
115
225
|
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
226
|
+
try:
|
|
227
|
+
if output_path.endswith(".parquet"):
|
|
228
|
+
export_ds.to_parquet(output_path)
|
|
229
|
+
elif output_path.endswith(".csv"):
|
|
230
|
+
export_ds.to_csv(output_path)
|
|
231
|
+
else:
|
|
232
|
+
# Default to parquet
|
|
233
|
+
if not output_path.endswith(".parquet"):
|
|
234
|
+
output_path = output_path + ".parquet"
|
|
235
|
+
export_ds.to_parquet(output_path)
|
|
236
|
+
except Exception as export_err:
|
|
237
|
+
# If parquet export fails (e.g. complex nested types), try CSV
|
|
238
|
+
csv_path = output_path.replace(".parquet", ".csv")
|
|
239
|
+
try:
|
|
240
|
+
export_ds.to_csv(csv_path)
|
|
241
|
+
output_path = csv_path
|
|
242
|
+
except Exception:
|
|
243
|
+
raise export_err # Re-raise original error
|
|
125
244
|
|
|
126
|
-
|
|
245
|
+
result = {
|
|
127
246
|
"ok": True,
|
|
128
247
|
"path": output_path,
|
|
129
248
|
"rows": total_rows,
|
|
130
249
|
"columns": columns,
|
|
131
250
|
"split": used_split
|
|
132
|
-
}
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
if has_images:
|
|
254
|
+
result["image_columns"] = image_cols
|
|
255
|
+
result["note"] = (
|
|
256
|
+
f"This dataset contains image columns ({', '.join(image_cols)}). "
|
|
257
|
+
"Image data was stripped for tabular export. "
|
|
258
|
+
"Use vesper_download_assets with source='huggingface' to download the actual images."
|
|
259
|
+
)
|
|
260
|
+
|
|
261
|
+
print(json.dumps(result))
|
|
133
262
|
|
|
134
263
|
except Exception as e:
|
|
135
264
|
error_msg = str(e)
|
|
136
|
-
# Provide helpful hints
|
|
137
|
-
if "401" in error_msg or "
|
|
138
|
-
error_msg
|
|
139
|
-
|
|
265
|
+
# Provide helpful, actionable hints
|
|
266
|
+
if "401" in error_msg or "Unauthorized" in error_msg:
|
|
267
|
+
error_msg = (
|
|
268
|
+
f"Authentication required for dataset '{repo_id}'. "
|
|
269
|
+
"This dataset may be gated or private. "
|
|
270
|
+
"Use the configure_keys tool to set your HF_TOKEN, then retry."
|
|
271
|
+
)
|
|
272
|
+
elif "403" in error_msg or "Forbidden" in error_msg:
|
|
273
|
+
error_msg = (
|
|
274
|
+
f"Access denied for dataset '{repo_id}'. "
|
|
275
|
+
"You may need to accept the dataset's usage agreement on huggingface.co, "
|
|
276
|
+
"then set HF_TOKEN via configure_keys tool."
|
|
277
|
+
)
|
|
278
|
+
elif "gated" in error_msg.lower():
|
|
279
|
+
error_msg = (
|
|
280
|
+
f"Dataset '{repo_id}' is gated. "
|
|
281
|
+
"Visit https://huggingface.co/datasets/{repo_id} to request access, "
|
|
282
|
+
"then set HF_TOKEN via configure_keys tool."
|
|
283
|
+
).format(repo_id=repo_id)
|
|
284
|
+
elif "FileNotFoundError" in error_msg or "does not exist" in error_msg.lower() or "doesn't exist" in error_msg.lower():
|
|
140
285
|
error_msg = f"Dataset '{repo_id}' not found on HuggingFace. Check the dataset ID."
|
|
286
|
+
elif "script" in error_msg.lower() and "no longer supported" in error_msg.lower():
|
|
287
|
+
error_msg = (
|
|
288
|
+
f"Dataset '{repo_id}' uses a legacy loading script that is no longer supported "
|
|
289
|
+
"by the current version of the datasets library. "
|
|
290
|
+
"Try: pip install datasets --upgrade, or use an older datasets version."
|
|
291
|
+
)
|
|
141
292
|
|
|
142
293
|
print(json.dumps({"ok": False, "error": error_msg}))
|
|
143
294
|
sys.exit(1)
|
|
@@ -9,6 +9,7 @@ import tempfile
|
|
|
9
9
|
from dataclasses import dataclass
|
|
10
10
|
from pathlib import Path
|
|
11
11
|
from typing import Any, Awaitable, Callable, Dict, Iterable, List, Optional
|
|
12
|
+
from urllib.parse import urlparse
|
|
12
13
|
|
|
13
14
|
import aiohttp
|
|
14
15
|
|
|
@@ -25,7 +26,7 @@ except Exception: # pragma: no cover
|
|
|
25
26
|
wds = None
|
|
26
27
|
|
|
27
28
|
|
|
28
|
-
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff"}
|
|
29
|
+
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff", ".tif", ".svg"}
|
|
29
30
|
|
|
30
31
|
|
|
31
32
|
@dataclass
|
|
@@ -98,22 +99,88 @@ class AssetDownloader:
|
|
|
98
99
|
|
|
99
100
|
@staticmethod
def find_image_column(dataset: Any) -> Optional[str]:
    """Auto-detect the image column in a HuggingFace dataset.

    Detection strategy (in priority order):
    1. HF feature type: the first column whose feature class is named
       ``Image`` or whose string form contains ``image(``.
    2. Known column names: image-specific names first ('image', 'img', ...),
       then path/URL-like names ('image_url', 'file_name', 'url', ...). The
       whole column name is compared, case-insensitively.
    3. Sampled values: up to five rows are inspected for PIL-like objects,
       dicts with 'bytes'/'path'/'url' keys, image URLs, or strings ending
       in an image extension.

    Args:
        dataset: Dataset-like object; ``features``, ``column_names``,
            ``__len__`` and integer indexing are used when available.

    Returns:
        The name of the detected column, or ``None`` when nothing matched.
    """
    # Strategy 1: Check HF Feature types (most reliable)
    features = getattr(dataset, "features", None)
    if features:
        for name, feature in features.items():
            feat_cls = feature.__class__.__name__.lower()
            feat_str = str(feature).lower()
            if feat_cls == "image" or "image(" in feat_str:
                return str(name)

    # Strategy 2: Check known column names
    cols = list(getattr(dataset, "column_names", []) or [])

    # Lower-cased name -> original name; the first column wins on a collision.
    by_lower = {}
    for c in cols:
        by_lower.setdefault(str(c).lower(), c)

    known_names = (
        # Image-specific names (highest priority)
        "image", "img", "photo", "picture", "images",
        # Path / URL style names, most specific first
        "image_path", "image_url", "img_path", "img_url",
        "image_file", "file_name", "filepath", "filename",
        "photo_url", "picture_url", "thumbnail",
        "url", "path", "file",
    )
    for target in known_names:
        if target in cols:
            # An exact-case column is preferred over a case-insensitive one.
            return target
        if target in by_lower:
            return by_lower[target]

    # Strategy 3: Sample values to detect URL/path patterns
    image_exts = tuple(IMAGE_EXTENSIONS)

    def _url_points_to_image(url_lower: str) -> bool:
        # Only the URL path is inspected, segment by segment, so a host name
        # such as "photos.giftshop.com" cannot match ".gif" and a query
        # string cannot contribute an extension.
        try:
            path = urlparse(url_lower).path
        except ValueError:
            return False
        return any(segment.endswith(image_exts) for segment in path.split("/"))

    def _looks_like_image(val: Any) -> bool:
        # PIL Image object
        if hasattr(val, "save") and hasattr(val, "size"):
            return True
        # Dict with image data
        if isinstance(val, dict):
            return any(k in val for k in ("bytes", "path", "url"))
        # String: URL or file path
        if isinstance(val, str):
            val_lower = val.lower()
            if val_lower.startswith(("http://", "https://")) and _url_points_to_image(val_lower):
                return True
            return val_lower.endswith(image_exts)
        return False

    try:
        sample_size = min(5, len(dataset)) if hasattr(dataset, "__len__") else 5
        # Fetch each sample row once instead of once per column.
        rows = []
        for i in range(sample_size):
            try:
                rows.append(dataset[i])
            except Exception:
                break

        for c in cols:
            for row in rows:
                try:
                    val = row[c]
                except Exception:
                    break
                if _looks_like_image(val):
                    return c
    except Exception:
        # Sampling is best-effort: a dataset that cannot be probed simply
        # yields no detection.
        pass

    return None
|
|
118
185
|
|
|
119
186
|
async def download_assets(
|
|
@@ -214,20 +281,73 @@ class AssetDownloader:
|
|
|
214
281
|
image_column: Optional[str],
|
|
215
282
|
) -> Dict[str, int]:
|
|
216
283
|
from datasets import load_dataset # validated in download_assets()
|
|
284
|
+
import warnings
|
|
285
|
+
warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
|
|
217
286
|
|
|
218
287
|
await self._emit("start", {"source": "huggingface", "repo_id": repo_id})
|
|
219
288
|
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
289
|
+
token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or None
|
|
290
|
+
|
|
291
|
+
# Try loading with multiple strategies
|
|
292
|
+
ds = None
|
|
293
|
+
load_errors = []
|
|
294
|
+
|
|
295
|
+
for trust_rc in [True, False]:
|
|
296
|
+
for split_name in ["train", "test", "validation"]:
|
|
297
|
+
try:
|
|
298
|
+
kwargs = {"path": repo_id, "split": split_name}
|
|
299
|
+
if trust_rc:
|
|
300
|
+
kwargs["trust_remote_code"] = True
|
|
301
|
+
if token:
|
|
302
|
+
kwargs["token"] = token
|
|
303
|
+
ds = load_dataset(**kwargs)
|
|
304
|
+
break
|
|
305
|
+
except Exception as e:
|
|
306
|
+
msg = str(e)
|
|
307
|
+
# Immediately raise auth errors
|
|
308
|
+
if any(x in msg for x in ["401", "403", "gated", "Unauthorized"]):
|
|
309
|
+
raise RuntimeError(
|
|
310
|
+
f"Authentication required for '{repo_id}'. "
|
|
311
|
+
"This dataset may be gated or private. "
|
|
312
|
+
"Use the configure_keys tool to set HF_TOKEN, then retry."
|
|
313
|
+
)
|
|
314
|
+
load_errors.append(msg)
|
|
315
|
+
continue
|
|
316
|
+
if ds is not None:
|
|
317
|
+
break
|
|
318
|
+
|
|
319
|
+
# Fallback: load without split
|
|
320
|
+
if ds is None:
|
|
321
|
+
try:
|
|
322
|
+
kwargs = {"path": repo_id, "trust_remote_code": True}
|
|
323
|
+
if token:
|
|
324
|
+
kwargs["token"] = token
|
|
325
|
+
dd = load_dataset(**kwargs)
|
|
326
|
+
from datasets import DatasetDict
|
|
327
|
+
if isinstance(dd, DatasetDict):
|
|
328
|
+
first_split = list(dd.keys())[0]
|
|
329
|
+
ds = dd[first_split]
|
|
330
|
+
else:
|
|
331
|
+
ds = dd
|
|
332
|
+
except Exception as e:
|
|
333
|
+
msg = str(e)
|
|
334
|
+
if any(x in msg for x in ["401", "403", "gated", "Unauthorized"]):
|
|
335
|
+
raise RuntimeError(
|
|
336
|
+
f"Authentication required for '{repo_id}'. "
|
|
337
|
+
"Use the configure_keys tool to set HF_TOKEN, then retry."
|
|
338
|
+
)
|
|
339
|
+
combined = "; ".join(load_errors[:3])
|
|
340
|
+
raise RuntimeError(
|
|
341
|
+
f"Failed to load HuggingFace dataset '{repo_id}': {msg}. "
|
|
342
|
+
f"Previous attempts: {combined}"
|
|
343
|
+
)
|
|
226
344
|
|
|
227
345
|
col = image_column or self.find_image_column(ds)
|
|
228
346
|
if not col:
|
|
229
347
|
raise RuntimeError(
|
|
230
|
-
"No image column detected in HuggingFace dataset.
|
|
348
|
+
f"No image column detected in HuggingFace dataset '{repo_id}'. "
|
|
349
|
+
"Available columns: " + ", ".join(getattr(ds, "column_names", [])) + ". "
|
|
350
|
+
"Provide image_column parameter explicitly."
|
|
231
351
|
)
|
|
232
352
|
|
|
233
353
|
total = len(ds) if hasattr(ds, "__len__") else 0
|
|
@@ -236,33 +356,63 @@ class AssetDownloader:
|
|
|
236
356
|
downloaded = 0
|
|
237
357
|
failed = 0
|
|
238
358
|
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
"
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
359
|
+
# Create an aiohttp session for URL-based images
|
|
360
|
+
session = None
|
|
361
|
+
|
|
362
|
+
try:
|
|
363
|
+
with metadata_file.open("w", encoding="utf-8") as mf, errors_file.open("w", encoding="utf-8") as ef:
|
|
364
|
+
for idx, row in enumerate(ds):
|
|
365
|
+
if max_items and idx >= max_items:
|
|
366
|
+
break
|
|
367
|
+
try:
|
|
368
|
+
out_name = f"{idx:08d}.jpg"
|
|
369
|
+
out_path = images_dir / out_name
|
|
370
|
+
value = row.get(col)
|
|
371
|
+
|
|
372
|
+
# Handle URL-based images inline
|
|
373
|
+
if isinstance(value, dict) and value.get("url") and not value.get("bytes") and not value.get("path"):
|
|
374
|
+
url = value["url"]
|
|
375
|
+
if session is None:
|
|
376
|
+
session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=60))
|
|
377
|
+
await self._download_image_from_url(session, url, out_path)
|
|
378
|
+
elif isinstance(value, str) and value.startswith(("http://", "https://")):
|
|
379
|
+
if session is None:
|
|
380
|
+
session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=60))
|
|
381
|
+
await self._download_image_from_url(session, value, out_path)
|
|
382
|
+
else:
|
|
383
|
+
self._save_image_value(value, out_path)
|
|
384
|
+
|
|
385
|
+
record = {
|
|
386
|
+
"dataset_id": dataset_id,
|
|
387
|
+
"index": idx,
|
|
388
|
+
"image_path": str(out_path),
|
|
389
|
+
"source": "huggingface",
|
|
390
|
+
"repo_id": repo_id,
|
|
391
|
+
}
|
|
392
|
+
mf.write(json.dumps(record, ensure_ascii=False) + "\n")
|
|
393
|
+
downloaded += 1
|
|
394
|
+
if downloaded % 50 == 0:
|
|
395
|
+
await self._emit("progress", {"downloaded": downloaded, "failed": failed, "target": target})
|
|
396
|
+
except Exception as e:
|
|
397
|
+
failed += 1
|
|
398
|
+
ef.write(json.dumps({"index": idx, "error": str(e)}, ensure_ascii=False) + "\n")
|
|
399
|
+
finally:
|
|
400
|
+
if session is not None:
|
|
401
|
+
await session.close()
|
|
262
402
|
|
|
263
403
|
await self._emit("done", {"downloaded": downloaded, "failed": failed})
|
|
264
404
|
return {"downloaded": downloaded, "failed": failed}
|
|
265
405
|
|
|
406
|
+
async def _download_image_from_url(self, session: aiohttp.ClientSession, url: str, out_path: Path) -> None:
    """Fetch ``url`` through ``session`` and store the response body at ``out_path``.

    Raises:
        RuntimeError: if the response status is not 200, or if the response
            body is empty.
    """
    async with session.get(url) as resp:
        status = resp.status
        if status != 200:
            raise RuntimeError(f"HTTP {status} downloading {url}")

        payload = await resp.read()
        if not payload:
            raise RuntimeError(f"Empty response from {url}")

        out_path.write_bytes(payload)
|
|
415
|
+
|
|
266
416
|
async def _download_kaggle(
|
|
267
417
|
self,
|
|
268
418
|
kaggle_ref: str,
|
|
@@ -393,30 +543,66 @@ class AssetDownloader:
|
|
|
393
543
|
|
|
394
544
|
@staticmethod
def _save_image_value(value: Any, out_path: Path) -> None:
    """Save an image value to disk. Handles multiple image representations:

    - PIL-like image objects (anything with ``save`` and ``size``)
    - bytes/bytearray (raw image data, written verbatim)
    - dict with 'bytes' key (raw bytes, or an iterable of ints)
    - dict with 'path' key (local file path, copied)
    - str (local file path, copied)
    - numpy array (converted through ``PIL.Image.fromarray``)

    Args:
        value: The image value taken from a dataset row.
        out_path: Destination file path.

    Raises:
        ValueError: if ``value`` is None, refers to a URL (URLs must go
            through the async URL downloader), names a path that does not
            exist, is a numpy array that cannot be converted, or is of an
            unsupported type.
    """
    if value is None:
        raise ValueError("empty image value")

    def _save_pil(image: Any) -> None:
        # The JPEG writer only accepts a few modes; images in other modes
        # (for example RGBA or palette images) are converted to RGB first so
        # that saving to a .jpg/.jpeg destination does not fail.
        suffix = Path(out_path).suffix.lower()
        mode = getattr(image, "mode", None)
        jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
        if (
            suffix in (".jpg", ".jpeg")
            and mode is not None
            and mode not in jpeg_modes
            and hasattr(image, "convert")
        ):
            image = image.convert("RGB")
        image.save(out_path)

    # PIL Image object
    if hasattr(value, "save") and hasattr(value, "size"):
        _save_pil(value)
        return

    # Raw bytes
    if isinstance(value, (bytes, bytearray)):
        out_path.write_bytes(value)
        return

    # Dict with image data
    if isinstance(value, dict):
        if value.get("bytes"):
            raw = value["bytes"]
            # Non-bytes payloads could be a list of ints.
            out_path.write_bytes(raw if isinstance(raw, (bytes, bytearray)) else bytes(raw))
            return
        if value.get("path"):
            p = str(value["path"])
            if os.path.exists(p):
                shutil.copy2(p, out_path)
                return
            raise ValueError(f"Image path not found: {p}")
        if value.get("url"):
            raise ValueError("image URL detected — use async URL downloader")

    # String: local file path
    if isinstance(value, str):
        if os.path.exists(value):
            shutil.copy2(value, out_path)
            return
        if value.startswith(("http://", "https://")):
            raise ValueError("image URL detected — use async URL downloader")
        raise ValueError(f"Image path not found: {value}")

    # numpy array (common in some datasets); numpy is optional here.
    try:
        import numpy as np
    except ImportError:
        np = None
    if np is not None and isinstance(value, np.ndarray):
        try:
            from PIL import Image

            _save_pil(Image.fromarray(value))
        except Exception as exc:
            # Report the real cause instead of a misleading
            # "unsupported type" error.
            raise ValueError(f"Could not convert numpy array to image: {exc}") from exc
        return

    raise ValueError(f"Unsupported image value type: {type(value).__name__}")
|
|
420
606
|
|
|
421
607
|
async def _write_webdataset(self, dataset_dir: Path, images_dir: Path, metadata_file: Path) -> None:
|
|
422
608
|
"""Write a webdataset-compatible tar archive.
|