@vespermcp/mcp-server 1.2.20 → 1.2.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -0
- package/build/cloud/adapters/supabase.js +49 -0
- package/build/cloud/storage-manager.js +6 -0
- package/build/export/exporter.js +22 -9
- package/build/gateway/unified-dataset-gateway.js +410 -0
- package/build/index.js +1592 -837
- package/build/ingestion/hf-downloader.js +12 -2
- package/build/ingestion/ingestor.js +19 -9
- package/build/install/install-service.js +11 -6
- package/build/lib/supabase.js +3 -0
- package/build/metadata/scraper.js +85 -14
- package/build/python/asset_downloader_engine.py +22 -1
- package/build/python/convert_engine.py +92 -0
- package/build/python/export_engine.py +45 -0
- package/build/python/hf_fallback.py +196 -45
- package/build/python/kaggle_engine.py +77 -5
- package/build/python/normalize_engine.py +83 -0
- package/build/python/vesper/core/asset_downloader.py +238 -48
- package/build/search/engine.js +43 -5
- package/build/search/jit-orchestrator.js +18 -14
- package/build/search/query-intent.js +509 -0
- package/build/tools/formatter.js +6 -3
- package/build/utils/python-runtime.js +130 -0
- package/package.json +7 -5
- package/scripts/postinstall.cjs +87 -31
- package/scripts/wizard.cjs +601 -0
- package/scripts/wizard.js +306 -12
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +22 -1
- package/src/python/convert_engine.py +92 -0
- package/src/python/export_engine.py +45 -0
- package/src/python/hf_fallback.py +196 -45
- package/src/python/kaggle_engine.py +77 -5
- package/src/python/normalize_engine.py +83 -0
- package/src/python/requirements.txt +12 -0
- package/src/python/vesper/core/asset_downloader.py +238 -48
- package/wizard.cjs +3 -0
|
@@ -9,6 +9,7 @@ import tempfile
|
|
|
9
9
|
from dataclasses import dataclass
|
|
10
10
|
from pathlib import Path
|
|
11
11
|
from typing import Any, Awaitable, Callable, Dict, Iterable, List, Optional
|
|
12
|
+
from urllib.parse import urlparse
|
|
12
13
|
|
|
13
14
|
import aiohttp
|
|
14
15
|
|
|
@@ -25,7 +26,7 @@ except Exception: # pragma: no cover
|
|
|
25
26
|
wds = None
|
|
26
27
|
|
|
27
28
|
|
|
28
|
-
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff"}
|
|
29
|
+
# File extensions (lowercase, with leading dot) that identify a value as an
# image file; find_image_column compares them against lowercased strings.
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff", ".tif", ".svg"}
|
|
29
30
|
|
|
30
31
|
|
|
31
32
|
@dataclass
|
|
@@ -98,22 +99,88 @@ class AssetDownloader:
|
|
|
98
99
|
|
|
99
100
|
@staticmethod
|
|
100
101
|
def find_image_column(dataset: Any) -> Optional[str]:
    """Auto-detect the image column in a HuggingFace dataset.

    Detection strategy (in priority order):
    1. HF Feature type: the first column whose feature is an ``Image``.
    2. Known column names ('image', 'img', 'photo', 'image_url', ...),
       compared against the whole column name, case-insensitively.
    3. Value sampling: up to five rows are read once and every column is
       checked for PIL-like objects, image dicts, image URLs, or file
       paths ending in an image extension.

    Args:
        dataset: Object that may expose ``features``, ``column_names``,
            ``__len__`` and integer ``__getitem__``.

    Returns:
        The detected column name, or ``None`` when nothing matches.
        Sampling is best-effort: errors raised while reading rows are
        swallowed and simply end the detection.
    """

    def _is_image_reference(text: str) -> bool:
        """Return True when *text* is a URL or path that names an image file."""
        lowered = text.lower()
        if lowered.startswith(("http://", "https://")):
            try:
                url_path = urlparse(lowered).path
            except ValueError:
                # Malformed URL (e.g. bad IPv6 literal): fall back to the suffix test.
                url_path = ""
            # Look at the path only: matching the whole URL also matched the
            # host, so "https://www.gift.com/..." counted as a ".gif" image.
            if any(ext in url_path for ext in IMAGE_EXTENSIONS):
                return True
        return lowered.endswith(tuple(IMAGE_EXTENSIONS))

    def _looks_like_image(val: Any) -> bool:
        """Classify one sampled cell value."""
        if val is None:
            return False
        # PIL Image object (duck-typed).
        if hasattr(val, "save") and hasattr(val, "size"):
            return True
        # Dict with image data.
        if isinstance(val, dict):
            return any(k in val for k in ("bytes", "path", "url"))
        # String: URL or file path.
        if isinstance(val, str):
            return _is_image_reference(val)
        return False

    # Strategy 1: Check HF Feature types (most reliable)
    features = getattr(dataset, "features", None)
    if features:
        for name, feature in features.items():
            feat_cls = feature.__class__.__name__.lower()
            if feat_cls == "image" or "image(" in str(feature).lower():
                return str(name)

    # Strategy 2: Check known column names (whole-name match, by priority)
    cols = getattr(dataset, "column_names", []) or []
    known_names = (
        "image", "img", "photo", "picture", "images",
        "image_path", "image_url", "img_path", "img_url",
        "image_file", "file_name", "filepath", "filename",
        "photo_url", "picture_url", "thumbnail",
        "url", "path", "file",
    )
    by_lower: Dict[str, Any] = {}
    for c in cols:
        # First column wins when several differ only by case.
        by_lower.setdefault(str(c).lower(), c)
    for target in known_names:
        if target in cols:
            return target
        if target in by_lower:
            return by_lower[target]

    # Strategy 3: Sample values to detect image objects / URL / path patterns
    try:
        sample_size = min(5, len(dataset)) if hasattr(dataset, "__len__") else 5
        # Read each sampled row once instead of once per column.
        rows = []
        for i in range(sample_size):
            try:
                rows.append(dataset[i])
            except Exception:
                break
        for c in cols:
            for row in rows:
                try:
                    val = row[c]
                except Exception:
                    break
                if _looks_like_image(val):
                    return c
    except Exception:
        # Detection is advisory; callers report "no image column" on None.
        pass

    return None
|
|
118
185
|
|
|
119
186
|
async def download_assets(
|
|
@@ -124,6 +191,7 @@ class AssetDownloader:
|
|
|
124
191
|
kaggle_ref: Optional[str] = None,
|
|
125
192
|
urls: Optional[List[str]] = None,
|
|
126
193
|
output_format: str = "webdataset",
|
|
194
|
+
output_dir: Optional[str] = None,
|
|
127
195
|
max_items: Optional[int] = None,
|
|
128
196
|
image_column: Optional[str] = None,
|
|
129
197
|
) -> Dict[str, Any]:
|
|
@@ -164,7 +232,10 @@ class AssetDownloader:
|
|
|
164
232
|
raise ValueError("urls are required for source=url")
|
|
165
233
|
|
|
166
234
|
# --- Now safe to create directories ---
|
|
167
|
-
|
|
235
|
+
if output_dir:
|
|
236
|
+
dataset_dir = Path(output_dir).expanduser().resolve()
|
|
237
|
+
else:
|
|
238
|
+
dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
|
|
168
239
|
images_dir = dataset_dir / "images"
|
|
169
240
|
dataset_dir.mkdir(parents=True, exist_ok=True)
|
|
170
241
|
images_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -214,20 +285,73 @@ class AssetDownloader:
|
|
|
214
285
|
image_column: Optional[str],
|
|
215
286
|
) -> Dict[str, int]:
|
|
216
287
|
from datasets import load_dataset # validated in download_assets()
|
|
288
|
+
import warnings
|
|
289
|
+
warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
|
|
217
290
|
|
|
218
291
|
await self._emit("start", {"source": "huggingface", "repo_id": repo_id})
|
|
219
292
|
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
293
|
+
token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or None
|
|
294
|
+
|
|
295
|
+
# Try loading with multiple strategies
|
|
296
|
+
ds = None
|
|
297
|
+
load_errors = []
|
|
298
|
+
|
|
299
|
+
for trust_rc in [True, False]:
|
|
300
|
+
for split_name in ["train", "test", "validation"]:
|
|
301
|
+
try:
|
|
302
|
+
kwargs = {"path": repo_id, "split": split_name}
|
|
303
|
+
if trust_rc:
|
|
304
|
+
kwargs["trust_remote_code"] = True
|
|
305
|
+
if token:
|
|
306
|
+
kwargs["token"] = token
|
|
307
|
+
ds = load_dataset(**kwargs)
|
|
308
|
+
break
|
|
309
|
+
except Exception as e:
|
|
310
|
+
msg = str(e)
|
|
311
|
+
# Immediately raise auth errors
|
|
312
|
+
if any(x in msg for x in ["401", "403", "gated", "Unauthorized"]):
|
|
313
|
+
raise RuntimeError(
|
|
314
|
+
f"Authentication required for '{repo_id}'. "
|
|
315
|
+
"This dataset may be gated or private. "
|
|
316
|
+
"Use the configure_keys tool to set HF_TOKEN, then retry."
|
|
317
|
+
)
|
|
318
|
+
load_errors.append(msg)
|
|
319
|
+
continue
|
|
320
|
+
if ds is not None:
|
|
321
|
+
break
|
|
322
|
+
|
|
323
|
+
# Fallback: load without split
|
|
324
|
+
if ds is None:
|
|
325
|
+
try:
|
|
326
|
+
kwargs = {"path": repo_id, "trust_remote_code": True}
|
|
327
|
+
if token:
|
|
328
|
+
kwargs["token"] = token
|
|
329
|
+
dd = load_dataset(**kwargs)
|
|
330
|
+
from datasets import DatasetDict
|
|
331
|
+
if isinstance(dd, DatasetDict):
|
|
332
|
+
first_split = list(dd.keys())[0]
|
|
333
|
+
ds = dd[first_split]
|
|
334
|
+
else:
|
|
335
|
+
ds = dd
|
|
336
|
+
except Exception as e:
|
|
337
|
+
msg = str(e)
|
|
338
|
+
if any(x in msg for x in ["401", "403", "gated", "Unauthorized"]):
|
|
339
|
+
raise RuntimeError(
|
|
340
|
+
f"Authentication required for '{repo_id}'. "
|
|
341
|
+
"Use the configure_keys tool to set HF_TOKEN, then retry."
|
|
342
|
+
)
|
|
343
|
+
combined = "; ".join(load_errors[:3])
|
|
344
|
+
raise RuntimeError(
|
|
345
|
+
f"Failed to load HuggingFace dataset '{repo_id}': {msg}. "
|
|
346
|
+
f"Previous attempts: {combined}"
|
|
347
|
+
)
|
|
226
348
|
|
|
227
349
|
col = image_column or self.find_image_column(ds)
|
|
228
350
|
if not col:
|
|
229
351
|
raise RuntimeError(
|
|
230
|
-
"No image column detected in HuggingFace dataset.
|
|
352
|
+
f"No image column detected in HuggingFace dataset '{repo_id}'. "
|
|
353
|
+
"Available columns: " + ", ".join(getattr(ds, "column_names", [])) + ". "
|
|
354
|
+
"Provide image_column parameter explicitly."
|
|
231
355
|
)
|
|
232
356
|
|
|
233
357
|
total = len(ds) if hasattr(ds, "__len__") else 0
|
|
@@ -236,33 +360,63 @@ class AssetDownloader:
|
|
|
236
360
|
downloaded = 0
|
|
237
361
|
failed = 0
|
|
238
362
|
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
"
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
363
|
+
# Create an aiohttp session for URL-based images
|
|
364
|
+
session = None
|
|
365
|
+
|
|
366
|
+
try:
|
|
367
|
+
with metadata_file.open("w", encoding="utf-8") as mf, errors_file.open("w", encoding="utf-8") as ef:
|
|
368
|
+
for idx, row in enumerate(ds):
|
|
369
|
+
if max_items and idx >= max_items:
|
|
370
|
+
break
|
|
371
|
+
try:
|
|
372
|
+
out_name = f"{idx:08d}.jpg"
|
|
373
|
+
out_path = images_dir / out_name
|
|
374
|
+
value = row.get(col)
|
|
375
|
+
|
|
376
|
+
# Handle URL-based images inline
|
|
377
|
+
if isinstance(value, dict) and value.get("url") and not value.get("bytes") and not value.get("path"):
|
|
378
|
+
url = value["url"]
|
|
379
|
+
if session is None:
|
|
380
|
+
session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=60))
|
|
381
|
+
await self._download_image_from_url(session, url, out_path)
|
|
382
|
+
elif isinstance(value, str) and value.startswith(("http://", "https://")):
|
|
383
|
+
if session is None:
|
|
384
|
+
session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=60))
|
|
385
|
+
await self._download_image_from_url(session, value, out_path)
|
|
386
|
+
else:
|
|
387
|
+
self._save_image_value(value, out_path)
|
|
388
|
+
|
|
389
|
+
record = {
|
|
390
|
+
"dataset_id": dataset_id,
|
|
391
|
+
"index": idx,
|
|
392
|
+
"image_path": str(out_path),
|
|
393
|
+
"source": "huggingface",
|
|
394
|
+
"repo_id": repo_id,
|
|
395
|
+
}
|
|
396
|
+
mf.write(json.dumps(record, ensure_ascii=False) + "\n")
|
|
397
|
+
downloaded += 1
|
|
398
|
+
if downloaded % 50 == 0:
|
|
399
|
+
await self._emit("progress", {"downloaded": downloaded, "failed": failed, "target": target})
|
|
400
|
+
except Exception as e:
|
|
401
|
+
failed += 1
|
|
402
|
+
ef.write(json.dumps({"index": idx, "error": str(e)}, ensure_ascii=False) + "\n")
|
|
403
|
+
finally:
|
|
404
|
+
if session is not None:
|
|
405
|
+
await session.close()
|
|
262
406
|
|
|
263
407
|
await self._emit("done", {"downloaded": downloaded, "failed": failed})
|
|
264
408
|
return {"downloaded": downloaded, "failed": failed}
|
|
265
409
|
|
|
410
|
+
async def _download_image_from_url(self, session: aiohttp.ClientSession, url: str, out_path: Path) -> None:
    """Fetch *url* through *session* and store the response body at *out_path*.

    Raises:
        RuntimeError: when the response status is not 200 or the body is empty.
    """
    async with session.get(url) as resp:
        status = resp.status
        if status == 200:
            payload = await resp.read()
            if payload:
                # Written while the response context is still open.
                out_path.write_bytes(payload)
                return
            raise RuntimeError(f"Empty response from {url}")
        raise RuntimeError(f"HTTP {status} downloading {url}")
|
|
419
|
+
|
|
266
420
|
async def _download_kaggle(
|
|
267
421
|
self,
|
|
268
422
|
kaggle_ref: str,
|
|
@@ -393,30 +547,66 @@ class AssetDownloader:
|
|
|
393
547
|
|
|
394
548
|
@staticmethod
|
|
395
549
|
def _save_image_value(value: Any, out_path: Path) -> None:
    """Save an image value to disk.

    Supported representations:
    - PIL Image objects (anything exposing ``save`` and ``size``)
    - bytes/bytearray (raw, already-encoded image data)
    - dict with 'bytes' key (raw bytes, or an iterable of ints)
    - dict with 'path' key (local file path)
    - str (local file path)
    - numpy.ndarray (converted through PIL when both are importable)

    Raises:
        ValueError: for ``None``, for URLs (those need the async URL
            downloader), for paths that do not exist, when a numpy array
            cannot be converted, and for unsupported types.
    """
    # Modes Pillow's JPEG writer accepts; anything else raises OSError on save.
    jpeg_modes = {"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"}

    def _save_pil(img: Any) -> None:
        """Save a PIL-like image, normalising modes JPEG cannot store."""
        mode = getattr(img, "mode", None)
        if (
            str(out_path).lower().endswith((".jpg", ".jpeg"))
            and isinstance(mode, str)
            and mode not in jpeg_modes
            and hasattr(img, "convert")
        ):
            # e.g. RGBA / P / LA images from PNG sources.
            img = img.convert("RGB")
        img.save(out_path)

    if value is None:
        raise ValueError("empty image value")

    # PIL Image object
    if hasattr(value, "save") and hasattr(value, "size"):
        _save_pil(value)
        return

    # Raw bytes
    if isinstance(value, (bytes, bytearray)):
        out_path.write_bytes(value)
        return

    # Dict with image data
    if isinstance(value, dict):
        raw = value.get("bytes")
        if raw:
            # Non-bytes payloads could be a list of ints.
            payload = raw if isinstance(raw, (bytes, bytearray)) else bytes(raw)
            out_path.write_bytes(payload)
            return
        path_ref = value.get("path")
        if path_ref:
            p = str(path_ref)
            if os.path.exists(p):
                shutil.copy2(p, out_path)
                return
            raise ValueError(f"Image path not found: {p}")
        if value.get("url"):
            raise ValueError("image URL detected — use async URL downloader")

    # String: local file path
    if isinstance(value, str):
        if os.path.exists(value):
            shutil.copy2(value, out_path)
            return
        if value.startswith(("http://", "https://")):
            raise ValueError("image URL detected — use async URL downloader")
        raise ValueError(f"Image path not found: {value}")

    # numpy array (common in some datasets); numpy is optional.
    try:
        import numpy as np
    except ImportError:
        np = None
    if np is not None and isinstance(value, np.ndarray):
        try:
            from PIL import Image
        except ImportError as exc:
            raise ValueError("Pillow is required to save numpy array images") from exc
        try:
            _save_pil(Image.fromarray(value))
        except Exception as exc:
            # Surface the real cause instead of a misleading "unsupported type".
            raise ValueError(f"Could not convert numpy array to image: {exc}") from exc
        return

    raise ValueError(f"Unsupported image value type: {type(value).__name__}")
|
|
420
610
|
|
|
421
611
|
async def _write_webdataset(self, dataset_dir: Path, images_dir: Path, metadata_file: Path) -> None:
|
|
422
612
|
"""Write a webdataset-compatible tar archive.
|
package/wizard.cjs
ADDED