npm - @vespermcp/mcp-server - Versions diffs - 1.2.19 → 1.2.21 - Mend

@vespermcp/mcp-server 1.2.19 → 1.2.21

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (10)

package/build/index.js +41 -6
package/build/ingestion/hf-downloader.js +12 -2
package/build/ingestion/ingestor.js +124 -18
package/build/python/asset_downloader_engine.py +20 -1
package/build/python/hf_fallback.py +298 -0
package/build/python/vesper/core/asset_downloader.py +233 -47
package/package.json +1 -1
package/src/python/asset_downloader_engine.py +20 -1
package/src/python/hf_fallback.py +298 -0
package/src/python/vesper/core/asset_downloader.py +233 -47

package/build/python/vesper/core/asset_downloader.py CHANGED Viewed

@@ -9,6 +9,7 @@ import tempfile
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Awaitable, Callable, Dict, Iterable, List, Optional
+from urllib.parse import urlparse
 import aiohttp
@@ -25,7 +26,7 @@ except Exception:  # pragma: no cover
     wds = None
-IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff"}
+IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff", ".tif", ".svg"}
 @dataclass
@@ -98,22 +99,88 @@ class AssetDownloader:
     @staticmethod
     def find_image_column(dataset: Any) -> Optional[str]:
+        """Auto-detect the image column in a HuggingFace dataset.
+        Detection strategy (in priority order):
+        1. HF Feature type: columns with Image() feature type
+        2. Known column names: 'image', 'img', 'photo', 'image_url', etc.
+        3. URL pattern detection: columns containing image URLs (http(s)://...jpg)
+        4. Path pattern detection: columns with file paths ending in image extensions
+        """
+        # Strategy 1: Check HF Feature types (most reliable)
         features = getattr(dataset, "features", None)
         if features:
             for name, feature in features.items():
-                feature_name = feature.__class__.__name__.lower()
-                feature_repr = str(feature).lower()
-                if feature_name == "image" or "image(" in feature_repr:
-                    return str(name)
-                lower = str(name).lower()
-                if lower in {"image", "images", "img", "image_path", "image_url", "url"}:
+                feat_cls = feature.__class__.__name__.lower()
+                feat_str = str(feature).lower()
+                if feat_cls == "image" or "image(" in feat_str:
                     return str(name)
-        candidate_columns = ["image", "images", "img", "image_path", "image_url", "url", "file_name", "filepath"]
+        # Strategy 2: Check known column names
         cols = getattr(dataset, "column_names", []) or []
-        for c in candidate_columns:
+        # Exact match first (highest priority names)
+        priority_exact = ["image", "img", "photo", "picture", "images"]
+        for c in priority_exact:
             if c in cols:
                 return c
+        # Partial match (column names containing image-related keywords)
+        priority_partial = [
+            "image_path", "image_url", "img_path", "img_url",
+            "image_file", "file_name", "filepath", "filename",
+            "photo_url", "picture_url", "thumbnail",
+            "url", "path", "file",
+        ]
+        for target in priority_partial:
+            for c in cols:
+                if c.lower() == target:
+                    return c
+        # Strategy 3: Sample values to detect URL/path patterns
+        try:
+            sample_size = min(5, len(dataset)) if hasattr(dataset, "__len__") else 5
+            if sample_size > 0:
+                for c in cols:
+                    is_image_col = False
+                    for i in range(sample_size):
+                        try:
+                            val = dataset[i][c]
+                        except Exception:
+                            break
+                        if val is None:
+                            continue
+                        # PIL Image object
+                        if hasattr(val, "save") and hasattr(val, "size"):
+                            is_image_col = True
+                            break
+                        # Dict with image data
+                        if isinstance(val, dict) and any(k in val for k in ("bytes", "path", "url")):
+                            is_image_col = True
+                            break
+                        # String: URL or file path
+                        if isinstance(val, str):
+                            val_lower = val.lower()
+                            # Check for image URLs
+                            if val_lower.startswith(("http://", "https://")) and any(
+                                ext in val_lower.split("?")[0] for ext in IMAGE_EXTENSIONS
+                            ):
+                                is_image_col = True
+                                break
+                            # Check for file paths with image extensions
+                            if any(val_lower.endswith(ext) for ext in IMAGE_EXTENSIONS):
+                                is_image_col = True
+                                break
+                    if is_image_col:
+                        return c
+        except Exception:
+            pass
         return None
     async def download_assets(
@@ -214,20 +281,73 @@ class AssetDownloader:
         image_column: Optional[str],
     ) -> Dict[str, int]:
         from datasets import load_dataset  # validated in download_assets()
+        import warnings
+        warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
         await self._emit("start", {"source": "huggingface", "repo_id": repo_id})
-        try:
-            ds = load_dataset(repo_id, split="train")
-        except Exception:
-            dd = load_dataset(repo_id)
-            first_split = list(dd.keys())[0]
-            ds = dd[first_split]
+        token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or None
+        # Try loading with multiple strategies
+        ds = None
+        load_errors = []
+        for trust_rc in [True, False]:
+            for split_name in ["train", "test", "validation"]:
+                try:
+                    kwargs = {"path": repo_id, "split": split_name}
+                    if trust_rc:
+                        kwargs["trust_remote_code"] = True
+                    if token:
+                        kwargs["token"] = token
+                    ds = load_dataset(**kwargs)
+                    break
+                except Exception as e:
+                    msg = str(e)
+                    # Immediately raise auth errors
+                    if any(x in msg for x in ["401", "403", "gated", "Unauthorized"]):
+                        raise RuntimeError(
+                            f"Authentication required for '{repo_id}'. "
+                            "This dataset may be gated or private. "
+                            "Use the configure_keys tool to set HF_TOKEN, then retry."
+                        )
+                    load_errors.append(msg)
+                    continue
+            if ds is not None:
+                break
+        # Fallback: load without split
+        if ds is None:
+            try:
+                kwargs = {"path": repo_id, "trust_remote_code": True}
+                if token:
+                    kwargs["token"] = token
+                dd = load_dataset(**kwargs)
+                from datasets import DatasetDict
+                if isinstance(dd, DatasetDict):
+                    first_split = list(dd.keys())[0]
+                    ds = dd[first_split]
+                else:
+                    ds = dd
+            except Exception as e:
+                msg = str(e)
+                if any(x in msg for x in ["401", "403", "gated", "Unauthorized"]):
+                    raise RuntimeError(
+                        f"Authentication required for '{repo_id}'. "
+                        "Use the configure_keys tool to set HF_TOKEN, then retry."
+                    )
+                combined = "; ".join(load_errors[:3])
+                raise RuntimeError(
+                    f"Failed to load HuggingFace dataset '{repo_id}': {msg}. "
+                    f"Previous attempts: {combined}"
+                )
         col = image_column or self.find_image_column(ds)
         if not col:
             raise RuntimeError(
-                "No image column detected in HuggingFace dataset. Provide image_column or use fallback strategy with URL column."
+                f"No image column detected in HuggingFace dataset '{repo_id}'. "
+                "Available columns: " + ", ".join(getattr(ds, "column_names", [])) + ". "
+                "Provide image_column parameter explicitly."
             )
         total = len(ds) if hasattr(ds, "__len__") else 0
@@ -236,33 +356,63 @@ class AssetDownloader:
         downloaded = 0
         failed = 0
-        with metadata_file.open("w", encoding="utf-8") as mf, errors_file.open("w", encoding="utf-8") as ef:
-            for idx, row in enumerate(ds):
-                if max_items and idx >= max_items:
-                    break
-                try:
-                    out_name = f"{idx:08d}.jpg"
-                    out_path = images_dir / out_name
-                    self._save_image_value(row.get(col), out_path)
-                    record = {
-                        "dataset_id": dataset_id,
-                        "index": idx,
-                        "image_path": str(out_path),
-                        "source": "huggingface",
-                        "repo_id": repo_id,
-                    }
-                    mf.write(json.dumps(record, ensure_ascii=False) + "\n")
-                    downloaded += 1
-                    if downloaded % 50 == 0:
-                        await self._emit("progress", {"downloaded": downloaded, "failed": failed, "target": target})
-                except Exception as e:
-                    failed += 1
-                    ef.write(json.dumps({"index": idx, "error": str(e)}, ensure_ascii=False) + "\n")
+        # Create an aiohttp session for URL-based images
+        session = None
+        try:
+            with metadata_file.open("w", encoding="utf-8") as mf, errors_file.open("w", encoding="utf-8") as ef:
+                for idx, row in enumerate(ds):
+                    if max_items and idx >= max_items:
+                        break
+                    try:
+                        out_name = f"{idx:08d}.jpg"
+                        out_path = images_dir / out_name
+                        value = row.get(col)
+                        # Handle URL-based images inline
+                        if isinstance(value, dict) and value.get("url") and not value.get("bytes") and not value.get("path"):
+                            url = value["url"]
+                            if session is None:
+                                session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=60))
+                            await self._download_image_from_url(session, url, out_path)
+                        elif isinstance(value, str) and value.startswith(("http://", "https://")):
+                            if session is None:
+                                session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=60))
+                            await self._download_image_from_url(session, value, out_path)
+                        else:
+                            self._save_image_value(value, out_path)
+                        record = {
+                            "dataset_id": dataset_id,
+                            "index": idx,
+                            "image_path": str(out_path),
+                            "source": "huggingface",
+                            "repo_id": repo_id,
+                        }
+                        mf.write(json.dumps(record, ensure_ascii=False) + "\n")
+                        downloaded += 1
+                        if downloaded % 50 == 0:
+                            await self._emit("progress", {"downloaded": downloaded, "failed": failed, "target": target})
+                    except Exception as e:
+                        failed += 1
+                        ef.write(json.dumps({"index": idx, "error": str(e)}, ensure_ascii=False) + "\n")
+        finally:
+            if session is not None:
+                await session.close()
         await self._emit("done", {"downloaded": downloaded, "failed": failed})
         return {"downloaded": downloaded, "failed": failed}
+    async def _download_image_from_url(self, session: aiohttp.ClientSession, url: str, out_path: Path) -> None:
+        """Download an image from a URL to a local path."""
+        async with session.get(url) as response:
+            if response.status != 200:
+                raise RuntimeError(f"HTTP {response.status} downloading {url}")
+            data = await response.read()
+            if not data:
+                raise RuntimeError(f"Empty response from {url}")
+            out_path.write_bytes(data)
     async def _download_kaggle(
         self,
         kaggle_ref: str,
@@ -393,30 +543,66 @@ class AssetDownloader:
     @staticmethod
     def _save_image_value(value: Any, out_path: Path) -> None:
+        """Save an image value to disk. Handles multiple image representations:
+        - PIL Image objects (have .save method)
+        - dict with 'bytes' key (raw image bytes)
+        - dict with 'path' key (local file path)
+        - bytes/bytearray (raw image data)
+        - str (local file path)
+        """
         if value is None:
             raise ValueError("empty image value")
-        if hasattr(value, "save"):
+        # PIL Image object
+        if hasattr(value, "save") and hasattr(value, "size"):
             value.save(out_path)
             return
+        # Raw bytes
+        if isinstance(value, (bytes, bytearray)):
+            out_path.write_bytes(value)
+            return
+        # Dict with image data
         if isinstance(value, dict):
             if value.get("bytes"):
-                out_path.write_bytes(value["bytes"])
-                return
-            if value.get("path") and os.path.exists(value["path"]):
-                shutil.copy2(value["path"], out_path)
+                raw = value["bytes"]
+                if isinstance(raw, (bytes, bytearray)):
+                    out_path.write_bytes(raw)
+                else:
+                    # Could be a list of ints
+                    out_path.write_bytes(bytes(raw))
                 return
+            if value.get("path"):
+                p = str(value["path"])
+                if os.path.exists(p):
+                    shutil.copy2(p, out_path)
+                    return
+                raise ValueError(f"Image path not found: {p}")
             if value.get("url"):
-                raise ValueError("image URL requires URL downloader fallback")
+                raise ValueError("image URL detected — use async URL downloader")
+        # String: local file path
         if isinstance(value, str):
             if os.path.exists(value):
                 shutil.copy2(value, out_path)
                 return
-            raise ValueError("string image value is not a local path")
+            if value.startswith(("http://", "https://")):
+                raise ValueError("image URL detected — use async URL downloader")
+            raise ValueError(f"Image path not found: {value}")
+        # numpy array (common in some datasets)
+        try:
+            import numpy as np
+            if isinstance(value, np.ndarray):
+                from PIL import Image
+                img = Image.fromarray(value)
+                img.save(out_path)
+                return
+        except (ImportError, Exception):
+            pass
-        raise ValueError(f"Unsupported image value type: {type(value)}")
+        raise ValueError(f"Unsupported image value type: {type(value).__name__}")
     async def _write_webdataset(self, dataset_dir: Path, images_dir: Path, metadata_file: Path) -> None:
         """Write a webdataset-compatible tar archive.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@vespermcp/mcp-server",
-  "version": "1.2.19",
+  "version": "1.2.21",
   "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
   "type": "module",
   "main": "build/index.js",

package/src/python/asset_downloader_engine.py CHANGED Viewed

@@ -3,9 +3,14 @@ import asyncio
 import json
 import os
 import sys
+import warnings
 from pathlib import Path
 from typing import Any, Dict
+# Suppress noisy HF warnings
+warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
+warnings.filterwarnings("ignore", message=".*legacy.*")
 CURRENT_DIR = Path(__file__).resolve().parent
 if str(CURRENT_DIR) not in sys.path:
     sys.path.insert(0, str(CURRENT_DIR))
@@ -24,6 +29,11 @@ async def _run_download(args: argparse.Namespace) -> Dict[str, Any]:
     workers = int(payload.get("workers") or 8)
     recipes_dir = payload.get("recipes_dir")
+    # Auto-set HF token from payload if provided
+    token = payload.get("token") or payload.get("hf_token")
+    if token:
+        os.environ["HF_TOKEN"] = str(token)
     downloader = AssetDownloader(output_root=output_root, workers=workers, recipes_dir=recipes_dir)
     result = await downloader.download_assets(
@@ -66,7 +76,16 @@ def main() -> None:
         _print({"ok": False, "error": f"Unknown action: {args.action}"})
     except Exception as e:
-        _print({"ok": False, "error": str(e)})
+        error_msg = str(e)
+        # Provide actionable error messages
+        if "401" in error_msg or "403" in error_msg or "Unauthorized" in error_msg:
+            error_msg = (
+                "Authentication required. This dataset may be gated/private. "
+                "Use configure_keys tool to set HF_TOKEN, then retry."
+            )
+        elif "No image column" in error_msg:
+            error_msg += " Hint: specify image_column parameter with the name of the column containing images."
+        _print({"ok": False, "error": error_msg})
 if __name__ == "__main__":