@vespermcp/mcp-server 1.2.20 → 1.2.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/README.md +49 -0
  2. package/build/cloud/adapters/supabase.js +49 -0
  3. package/build/cloud/storage-manager.js +6 -0
  4. package/build/export/exporter.js +22 -9
  5. package/build/gateway/unified-dataset-gateway.js +410 -0
  6. package/build/index.js +1592 -837
  7. package/build/ingestion/hf-downloader.js +12 -2
  8. package/build/ingestion/ingestor.js +19 -9
  9. package/build/install/install-service.js +11 -6
  10. package/build/lib/supabase.js +3 -0
  11. package/build/metadata/scraper.js +85 -14
  12. package/build/python/asset_downloader_engine.py +22 -1
  13. package/build/python/convert_engine.py +92 -0
  14. package/build/python/export_engine.py +45 -0
  15. package/build/python/hf_fallback.py +196 -45
  16. package/build/python/kaggle_engine.py +77 -5
  17. package/build/python/normalize_engine.py +83 -0
  18. package/build/python/vesper/core/asset_downloader.py +238 -48
  19. package/build/search/engine.js +43 -5
  20. package/build/search/jit-orchestrator.js +18 -14
  21. package/build/search/query-intent.js +509 -0
  22. package/build/tools/formatter.js +6 -3
  23. package/build/utils/python-runtime.js +130 -0
  24. package/package.json +7 -5
  25. package/scripts/postinstall.cjs +87 -31
  26. package/scripts/wizard.cjs +601 -0
  27. package/scripts/wizard.js +306 -12
  28. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  29. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  30. package/src/python/asset_downloader_engine.py +22 -1
  31. package/src/python/convert_engine.py +92 -0
  32. package/src/python/export_engine.py +45 -0
  33. package/src/python/hf_fallback.py +196 -45
  34. package/src/python/kaggle_engine.py +77 -5
  35. package/src/python/normalize_engine.py +83 -0
  36. package/src/python/requirements.txt +12 -0
  37. package/src/python/vesper/core/asset_downloader.py +238 -48
  38. package/wizard.cjs +3 -0
@@ -9,6 +9,7 @@ import tempfile
9
9
  from dataclasses import dataclass
10
10
  from pathlib import Path
11
11
  from typing import Any, Awaitable, Callable, Dict, Iterable, List, Optional
12
+ from urllib.parse import urlparse
12
13
 
13
14
  import aiohttp
14
15
 
@@ -25,7 +26,7 @@ except Exception: # pragma: no cover
25
26
  wds = None
26
27
 
27
28
 
28
- IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff"}
29
+ IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff", ".tif", ".svg"}
29
30
 
30
31
 
31
32
  @dataclass
@@ -98,22 +99,88 @@ class AssetDownloader:
98
99
 
99
100
  @staticmethod
100
101
  def find_image_column(dataset: Any) -> Optional[str]:
102
+ """Auto-detect the image column in a HuggingFace dataset.
103
+
104
+ Detection strategy (in priority order):
105
+ 1. HF Feature type: columns with Image() feature type
106
+ 2. Known column names: 'image', 'img', 'photo', 'image_url', etc.
107
+ 3. URL pattern detection: columns containing image URLs (http(s)://...jpg)
108
+ 4. Path pattern detection: columns with file paths ending in image extensions
109
+ """
110
+ # Strategy 1: Check HF Feature types (most reliable)
101
111
  features = getattr(dataset, "features", None)
102
112
  if features:
103
113
  for name, feature in features.items():
104
- feature_name = feature.__class__.__name__.lower()
105
- feature_repr = str(feature).lower()
106
- if feature_name == "image" or "image(" in feature_repr:
107
- return str(name)
108
- lower = str(name).lower()
109
- if lower in {"image", "images", "img", "image_path", "image_url", "url"}:
114
+ feat_cls = feature.__class__.__name__.lower()
115
+ feat_str = str(feature).lower()
116
+ if feat_cls == "image" or "image(" in feat_str:
110
117
  return str(name)
111
118
 
112
- candidate_columns = ["image", "images", "img", "image_path", "image_url", "url", "file_name", "filepath"]
119
+ # Strategy 2: Check known column names
113
120
  cols = getattr(dataset, "column_names", []) or []
114
- for c in candidate_columns:
121
+
122
+ # Exact match first (highest priority names)
123
+ priority_exact = ["image", "img", "photo", "picture", "images"]
124
+ for c in priority_exact:
115
125
  if c in cols:
116
126
  return c
127
+
128
+ # Partial match (column names containing image-related keywords)
129
+ priority_partial = [
130
+ "image_path", "image_url", "img_path", "img_url",
131
+ "image_file", "file_name", "filepath", "filename",
132
+ "photo_url", "picture_url", "thumbnail",
133
+ "url", "path", "file",
134
+ ]
135
+ for target in priority_partial:
136
+ for c in cols:
137
+ if c.lower() == target:
138
+ return c
139
+
140
+ # Strategy 3: Sample values to detect URL/path patterns
141
+ try:
142
+ sample_size = min(5, len(dataset)) if hasattr(dataset, "__len__") else 5
143
+ if sample_size > 0:
144
+ for c in cols:
145
+ is_image_col = False
146
+ for i in range(sample_size):
147
+ try:
148
+ val = dataset[i][c]
149
+ except Exception:
150
+ break
151
+
152
+ if val is None:
153
+ continue
154
+
155
+ # PIL Image object
156
+ if hasattr(val, "save") and hasattr(val, "size"):
157
+ is_image_col = True
158
+ break
159
+
160
+ # Dict with image data
161
+ if isinstance(val, dict) and any(k in val for k in ("bytes", "path", "url")):
162
+ is_image_col = True
163
+ break
164
+
165
+ # String: URL or file path
166
+ if isinstance(val, str):
167
+ val_lower = val.lower()
168
+ # Check for image URLs
169
+ if val_lower.startswith(("http://", "https://")) and any(
170
+ ext in val_lower.split("?")[0] for ext in IMAGE_EXTENSIONS
171
+ ):
172
+ is_image_col = True
173
+ break
174
+ # Check for file paths with image extensions
175
+ if any(val_lower.endswith(ext) for ext in IMAGE_EXTENSIONS):
176
+ is_image_col = True
177
+ break
178
+
179
+ if is_image_col:
180
+ return c
181
+ except Exception:
182
+ pass
183
+
117
184
  return None
118
185
 
119
186
  async def download_assets(
@@ -124,6 +191,7 @@ class AssetDownloader:
124
191
  kaggle_ref: Optional[str] = None,
125
192
  urls: Optional[List[str]] = None,
126
193
  output_format: str = "webdataset",
194
+ output_dir: Optional[str] = None,
127
195
  max_items: Optional[int] = None,
128
196
  image_column: Optional[str] = None,
129
197
  ) -> Dict[str, Any]:
@@ -164,7 +232,10 @@ class AssetDownloader:
164
232
  raise ValueError("urls are required for source=url")
165
233
 
166
234
  # --- Now safe to create directories ---
167
- dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
235
+ if output_dir:
236
+ dataset_dir = Path(output_dir).expanduser().resolve()
237
+ else:
238
+ dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
168
239
  images_dir = dataset_dir / "images"
169
240
  dataset_dir.mkdir(parents=True, exist_ok=True)
170
241
  images_dir.mkdir(parents=True, exist_ok=True)
@@ -214,20 +285,73 @@ class AssetDownloader:
214
285
  image_column: Optional[str],
215
286
  ) -> Dict[str, int]:
216
287
  from datasets import load_dataset # validated in download_assets()
288
+ import warnings
289
+ warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
217
290
 
218
291
  await self._emit("start", {"source": "huggingface", "repo_id": repo_id})
219
292
 
220
- try:
221
- ds = load_dataset(repo_id, split="train")
222
- except Exception:
223
- dd = load_dataset(repo_id)
224
- first_split = list(dd.keys())[0]
225
- ds = dd[first_split]
293
+ token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or None
294
+
295
+ # Try loading with multiple strategies
296
+ ds = None
297
+ load_errors = []
298
+
299
+ for trust_rc in [True, False]:
300
+ for split_name in ["train", "test", "validation"]:
301
+ try:
302
+ kwargs = {"path": repo_id, "split": split_name}
303
+ if trust_rc:
304
+ kwargs["trust_remote_code"] = True
305
+ if token:
306
+ kwargs["token"] = token
307
+ ds = load_dataset(**kwargs)
308
+ break
309
+ except Exception as e:
310
+ msg = str(e)
311
+ # Immediately raise auth errors
312
+ if any(x in msg for x in ["401", "403", "gated", "Unauthorized"]):
313
+ raise RuntimeError(
314
+ f"Authentication required for '{repo_id}'. "
315
+ "This dataset may be gated or private. "
316
+ "Use the configure_keys tool to set HF_TOKEN, then retry."
317
+ )
318
+ load_errors.append(msg)
319
+ continue
320
+ if ds is not None:
321
+ break
322
+
323
+ # Fallback: load without split
324
+ if ds is None:
325
+ try:
326
+ kwargs = {"path": repo_id, "trust_remote_code": True}
327
+ if token:
328
+ kwargs["token"] = token
329
+ dd = load_dataset(**kwargs)
330
+ from datasets import DatasetDict
331
+ if isinstance(dd, DatasetDict):
332
+ first_split = list(dd.keys())[0]
333
+ ds = dd[first_split]
334
+ else:
335
+ ds = dd
336
+ except Exception as e:
337
+ msg = str(e)
338
+ if any(x in msg for x in ["401", "403", "gated", "Unauthorized"]):
339
+ raise RuntimeError(
340
+ f"Authentication required for '{repo_id}'. "
341
+ "Use the configure_keys tool to set HF_TOKEN, then retry."
342
+ )
343
+ combined = "; ".join(load_errors[:3])
344
+ raise RuntimeError(
345
+ f"Failed to load HuggingFace dataset '{repo_id}': {msg}. "
346
+ f"Previous attempts: {combined}"
347
+ )
226
348
 
227
349
  col = image_column or self.find_image_column(ds)
228
350
  if not col:
229
351
  raise RuntimeError(
230
- "No image column detected in HuggingFace dataset. Provide image_column or use fallback strategy with URL column."
352
+ f"No image column detected in HuggingFace dataset '{repo_id}'. "
353
+ "Available columns: " + ", ".join(getattr(ds, "column_names", [])) + ". "
354
+ "Provide image_column parameter explicitly."
231
355
  )
232
356
 
233
357
  total = len(ds) if hasattr(ds, "__len__") else 0
@@ -236,33 +360,63 @@ class AssetDownloader:
236
360
  downloaded = 0
237
361
  failed = 0
238
362
 
239
- with metadata_file.open("w", encoding="utf-8") as mf, errors_file.open("w", encoding="utf-8") as ef:
240
- for idx, row in enumerate(ds):
241
- if max_items and idx >= max_items:
242
- break
243
- try:
244
- out_name = f"{idx:08d}.jpg"
245
- out_path = images_dir / out_name
246
- self._save_image_value(row.get(col), out_path)
247
-
248
- record = {
249
- "dataset_id": dataset_id,
250
- "index": idx,
251
- "image_path": str(out_path),
252
- "source": "huggingface",
253
- "repo_id": repo_id,
254
- }
255
- mf.write(json.dumps(record, ensure_ascii=False) + "\n")
256
- downloaded += 1
257
- if downloaded % 50 == 0:
258
- await self._emit("progress", {"downloaded": downloaded, "failed": failed, "target": target})
259
- except Exception as e:
260
- failed += 1
261
- ef.write(json.dumps({"index": idx, "error": str(e)}, ensure_ascii=False) + "\n")
363
+ # Create an aiohttp session for URL-based images
364
+ session = None
365
+
366
+ try:
367
+ with metadata_file.open("w", encoding="utf-8") as mf, errors_file.open("w", encoding="utf-8") as ef:
368
+ for idx, row in enumerate(ds):
369
+ if max_items and idx >= max_items:
370
+ break
371
+ try:
372
+ out_name = f"{idx:08d}.jpg"
373
+ out_path = images_dir / out_name
374
+ value = row.get(col)
375
+
376
+ # Handle URL-based images inline
377
+ if isinstance(value, dict) and value.get("url") and not value.get("bytes") and not value.get("path"):
378
+ url = value["url"]
379
+ if session is None:
380
+ session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=60))
381
+ await self._download_image_from_url(session, url, out_path)
382
+ elif isinstance(value, str) and value.startswith(("http://", "https://")):
383
+ if session is None:
384
+ session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=60))
385
+ await self._download_image_from_url(session, value, out_path)
386
+ else:
387
+ self._save_image_value(value, out_path)
388
+
389
+ record = {
390
+ "dataset_id": dataset_id,
391
+ "index": idx,
392
+ "image_path": str(out_path),
393
+ "source": "huggingface",
394
+ "repo_id": repo_id,
395
+ }
396
+ mf.write(json.dumps(record, ensure_ascii=False) + "\n")
397
+ downloaded += 1
398
+ if downloaded % 50 == 0:
399
+ await self._emit("progress", {"downloaded": downloaded, "failed": failed, "target": target})
400
+ except Exception as e:
401
+ failed += 1
402
+ ef.write(json.dumps({"index": idx, "error": str(e)}, ensure_ascii=False) + "\n")
403
+ finally:
404
+ if session is not None:
405
+ await session.close()
262
406
 
263
407
  await self._emit("done", {"downloaded": downloaded, "failed": failed})
264
408
  return {"downloaded": downloaded, "failed": failed}
265
409
 
410
+ async def _download_image_from_url(self, session: aiohttp.ClientSession, url: str, out_path: Path) -> None:
411
+ """Download an image from a URL to a local path."""
412
+ async with session.get(url) as response:
413
+ if response.status != 200:
414
+ raise RuntimeError(f"HTTP {response.status} downloading {url}")
415
+ data = await response.read()
416
+ if not data:
417
+ raise RuntimeError(f"Empty response from {url}")
418
+ out_path.write_bytes(data)
419
+
266
420
  async def _download_kaggle(
267
421
  self,
268
422
  kaggle_ref: str,
@@ -393,30 +547,66 @@ class AssetDownloader:
393
547
 
394
548
  @staticmethod
395
549
  def _save_image_value(value: Any, out_path: Path) -> None:
550
+ """Save an image value to disk. Handles multiple image representations:
551
+ - PIL Image objects (have .save method)
552
+ - dict with 'bytes' key (raw image bytes)
553
+ - dict with 'path' key (local file path)
554
+ - bytes/bytearray (raw image data)
555
+ - str (local file path)
556
+ """
396
557
  if value is None:
397
558
  raise ValueError("empty image value")
398
559
 
399
- if hasattr(value, "save"):
560
+ # PIL Image object
561
+ if hasattr(value, "save") and hasattr(value, "size"):
400
562
  value.save(out_path)
401
563
  return
402
564
 
565
+ # Raw bytes
566
+ if isinstance(value, (bytes, bytearray)):
567
+ out_path.write_bytes(value)
568
+ return
569
+
570
+ # Dict with image data
403
571
  if isinstance(value, dict):
404
572
  if value.get("bytes"):
405
- out_path.write_bytes(value["bytes"])
406
- return
407
- if value.get("path") and os.path.exists(value["path"]):
408
- shutil.copy2(value["path"], out_path)
573
+ raw = value["bytes"]
574
+ if isinstance(raw, (bytes, bytearray)):
575
+ out_path.write_bytes(raw)
576
+ else:
577
+ # Could be a list of ints
578
+ out_path.write_bytes(bytes(raw))
409
579
  return
580
+ if value.get("path"):
581
+ p = str(value["path"])
582
+ if os.path.exists(p):
583
+ shutil.copy2(p, out_path)
584
+ return
585
+ raise ValueError(f"Image path not found: {p}")
410
586
  if value.get("url"):
411
- raise ValueError("image URL requires URL downloader fallback")
587
+ raise ValueError("image URL detected — use async URL downloader")
412
588
 
589
+ # String: local file path
413
590
  if isinstance(value, str):
414
591
  if os.path.exists(value):
415
592
  shutil.copy2(value, out_path)
416
593
  return
417
- raise ValueError("string image value is not a local path")
594
+ if value.startswith(("http://", "https://")):
595
+ raise ValueError("image URL detected — use async URL downloader")
596
+ raise ValueError(f"Image path not found: {value}")
597
+
598
+ # numpy array (common in some datasets)
599
+ try:
600
+ import numpy as np
601
+ if isinstance(value, np.ndarray):
602
+ from PIL import Image
603
+ img = Image.fromarray(value)
604
+ img.save(out_path)
605
+ return
606
+ except (ImportError, Exception):
607
+ pass
418
608
 
419
- raise ValueError(f"Unsupported image value type: {type(value)}")
609
+ raise ValueError(f"Unsupported image value type: {type(value).__name__}")
420
610
 
421
611
  async def _write_webdataset(self, dataset_dir: Path, images_dir: Path, metadata_file: Path) -> None:
422
612
  """Write a webdataset-compatible tar archive.
package/wizard.cjs ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env node
2
+
3
+ require('./scripts/wizard.cjs');