@vespermcp/mcp-server 1.2.5 → 1.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js
CHANGED
|
@@ -131,8 +131,83 @@ function extractRequestedRows(query, requirements) {
|
|
|
131
131
|
return Math.max(...allNums);
|
|
132
132
|
return undefined;
|
|
133
133
|
}
|
|
134
|
+
const verifiedPythonModules = new Set();
|
|
135
|
+
function getPythonCommand() {
|
|
136
|
+
return process.platform === "win32" ? "py" : "python";
|
|
137
|
+
}
|
|
138
|
+
function runPythonProcess(args, timeoutMs = 300000) {
|
|
139
|
+
const pyCmd = getPythonCommand();
|
|
140
|
+
return new Promise((resolve, reject) => {
|
|
141
|
+
const proc = spawn(pyCmd, args, {
|
|
142
|
+
env: {
|
|
143
|
+
...process.env,
|
|
144
|
+
PIP_DISABLE_PIP_VERSION_CHECK: "1",
|
|
145
|
+
PYTHONUTF8: "1",
|
|
146
|
+
},
|
|
147
|
+
});
|
|
148
|
+
let stdout = "";
|
|
149
|
+
let stderr = "";
|
|
150
|
+
proc.stdout.on("data", (d) => (stdout += d.toString()));
|
|
151
|
+
proc.stderr.on("data", (d) => (stderr += d.toString()));
|
|
152
|
+
const timer = setTimeout(() => {
|
|
153
|
+
try {
|
|
154
|
+
proc.kill();
|
|
155
|
+
}
|
|
156
|
+
catch {
|
|
157
|
+
// no-op
|
|
158
|
+
}
|
|
159
|
+
reject(new Error(`Python command timed out after ${timeoutMs}ms: ${args.join(" ")}`));
|
|
160
|
+
}, timeoutMs);
|
|
161
|
+
proc.on("close", (code) => {
|
|
162
|
+
clearTimeout(timer);
|
|
163
|
+
resolve({ code: code ?? 1, stdout, stderr });
|
|
164
|
+
});
|
|
165
|
+
proc.on("error", (error) => {
|
|
166
|
+
clearTimeout(timer);
|
|
167
|
+
reject(error);
|
|
168
|
+
});
|
|
169
|
+
});
|
|
170
|
+
}
|
|
171
|
+
async function ensurePythonModules(modulePackagePairs) {
|
|
172
|
+
const missing = [];
|
|
173
|
+
for (const pair of modulePackagePairs) {
|
|
174
|
+
if (verifiedPythonModules.has(pair.module)) {
|
|
175
|
+
continue;
|
|
176
|
+
}
|
|
177
|
+
const check = await runPythonProcess([
|
|
178
|
+
"-c",
|
|
179
|
+
`import importlib.util,sys; sys.exit(0 if importlib.util.find_spec(${JSON.stringify(pair.module)}) else 1)`
|
|
180
|
+
], 20000);
|
|
181
|
+
if (check.code === 0) {
|
|
182
|
+
verifiedPythonModules.add(pair.module);
|
|
183
|
+
}
|
|
184
|
+
else {
|
|
185
|
+
missing.push(pair);
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
if (missing.length === 0) {
|
|
189
|
+
return;
|
|
190
|
+
}
|
|
191
|
+
const packages = [...new Set(missing.map(m => m.packageName))];
|
|
192
|
+
console.error(`[Vesper] Installing missing Python packages: ${packages.join(", ")}`);
|
|
193
|
+
const installArgs = ["-m", "pip", "install", "--disable-pip-version-check", ...packages];
|
|
194
|
+
let install = await runPythonProcess(installArgs, 600000);
|
|
195
|
+
if (install.code !== 0) {
|
|
196
|
+
console.error(`[Vesper] pip install failed (trying --user fallback): ${(install.stderr || "").slice(0, 300)}`);
|
|
197
|
+
const userInstallArgs = ["-m", "pip", "install", "--disable-pip-version-check", "--user", ...packages];
|
|
198
|
+
install = await runPythonProcess(userInstallArgs, 600000);
|
|
199
|
+
}
|
|
200
|
+
if (install.code !== 0) {
|
|
201
|
+
const details = (install.stderr || install.stdout || "Unknown pip install error").trim();
|
|
202
|
+
throw new Error(`Failed to install required Python packages (${packages.join(", ")}). ${details}`);
|
|
203
|
+
}
|
|
204
|
+
console.error(`[Vesper] Successfully installed: ${packages.join(", ")}`);
|
|
205
|
+
for (const pair of missing) {
|
|
206
|
+
verifiedPythonModules.add(pair.module);
|
|
207
|
+
}
|
|
208
|
+
}
|
|
134
209
|
function runPythonJson(scriptPath, args) {
|
|
135
|
-
const pyCmd = process.platform === "win32" ? "py" : "python";
|
|
210
|
+
const pyCmd = getPythonCommand();
|
|
136
211
|
return new Promise((resolve, reject) => {
|
|
137
212
|
const proc = spawn(pyCmd, [scriptPath, ...args]);
|
|
138
213
|
let stdout = "";
|
|
@@ -932,6 +1007,27 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
932
1007
|
isError: true,
|
|
933
1008
|
};
|
|
934
1009
|
}
|
|
1010
|
+
const requiredModules = [
|
|
1011
|
+
{ module: "aiohttp", packageName: "aiohttp" },
|
|
1012
|
+
];
|
|
1013
|
+
if (source === "url") {
|
|
1014
|
+
requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
|
|
1015
|
+
}
|
|
1016
|
+
if (source === "huggingface") {
|
|
1017
|
+
requiredModules.push({ module: "datasets", packageName: "datasets" });
|
|
1018
|
+
}
|
|
1019
|
+
if (source === "kaggle") {
|
|
1020
|
+
requiredModules.push({ module: "kaggle", packageName: "kaggle" });
|
|
1021
|
+
}
|
|
1022
|
+
try {
|
|
1023
|
+
await ensurePythonModules(requiredModules);
|
|
1024
|
+
}
|
|
1025
|
+
catch (error) {
|
|
1026
|
+
return {
|
|
1027
|
+
content: [{ type: "text", text: `ERROR: Python dependency setup failed: ${error.message}` }],
|
|
1028
|
+
isError: true,
|
|
1029
|
+
};
|
|
1030
|
+
}
|
|
935
1031
|
const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
|
|
936
1032
|
const payload = {
|
|
937
1033
|
dataset_id: datasetId,
|
|
Binary file
|
|
@@ -101,27 +101,54 @@ class AssetDownloader:
|
|
|
101
101
|
if source not in {"huggingface", "kaggle", "url"}:
|
|
102
102
|
raise ValueError("source must be one of: huggingface, kaggle, url")
|
|
103
103
|
|
|
104
|
-
|
|
105
|
-
images_dir = dataset_dir / "images"
|
|
106
|
-
dataset_dir.mkdir(parents=True, exist_ok=True)
|
|
107
|
-
images_dir.mkdir(parents=True, exist_ok=True)
|
|
108
|
-
|
|
109
|
-
errors_file = dataset_dir / "errors.jsonl"
|
|
110
|
-
metadata_file = dataset_dir / "metadata.jsonl"
|
|
111
|
-
|
|
104
|
+
# --- Validate imports and args BEFORE creating any directories ---
|
|
112
105
|
if source == "huggingface":
|
|
113
106
|
if not repo_id:
|
|
114
107
|
raise ValueError("repo_id is required for source=huggingface")
|
|
115
|
-
|
|
108
|
+
try:
|
|
109
|
+
from datasets import load_dataset as _ld # noqa: F401
|
|
110
|
+
except Exception as e:
|
|
111
|
+
raise RuntimeError(
|
|
112
|
+
f"datasets package is required for HuggingFace downloads. "
|
|
113
|
+
f"Install with: pip install datasets. Details: {e}"
|
|
114
|
+
)
|
|
116
115
|
elif source == "kaggle":
|
|
117
116
|
ref = kaggle_ref or repo_id
|
|
118
117
|
if not ref:
|
|
119
118
|
raise ValueError("kaggle_ref is required for source=kaggle")
|
|
120
|
-
|
|
119
|
+
try:
|
|
120
|
+
from kaggle.api.kaggle_api_extended import KaggleApi as _Ka # noqa: F401
|
|
121
|
+
except Exception as e:
|
|
122
|
+
raise RuntimeError(
|
|
123
|
+
f"kaggle package is required for Kaggle downloads. "
|
|
124
|
+
f"Install with: pip install kaggle. Details: {e}"
|
|
125
|
+
)
|
|
121
126
|
else:
|
|
122
127
|
if not urls:
|
|
123
128
|
raise ValueError("urls are required for source=url")
|
|
124
|
-
|
|
129
|
+
|
|
130
|
+
# --- Now safe to create directories ---
|
|
131
|
+
dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
|
|
132
|
+
images_dir = dataset_dir / "images"
|
|
133
|
+
dataset_dir.mkdir(parents=True, exist_ok=True)
|
|
134
|
+
images_dir.mkdir(parents=True, exist_ok=True)
|
|
135
|
+
|
|
136
|
+
errors_file = dataset_dir / "errors.jsonl"
|
|
137
|
+
metadata_file = dataset_dir / "metadata.jsonl"
|
|
138
|
+
|
|
139
|
+
try:
|
|
140
|
+
if source == "huggingface":
|
|
141
|
+
summary = await self._download_huggingface(repo_id, dataset_id, images_dir, metadata_file, errors_file, max_items, image_column)
|
|
142
|
+
elif source == "kaggle":
|
|
143
|
+
ref = kaggle_ref or repo_id
|
|
144
|
+
summary = await self._download_kaggle(ref, dataset_id, images_dir, metadata_file, errors_file, max_items)
|
|
145
|
+
else:
|
|
146
|
+
summary = await self._download_urls(urls, dataset_id, images_dir, metadata_file, errors_file, max_items)
|
|
147
|
+
except Exception:
|
|
148
|
+
# Clean up empty directories on failure so we don't leave ghost artifacts
|
|
149
|
+
if images_dir.exists() and not any(images_dir.iterdir()):
|
|
150
|
+
shutil.rmtree(dataset_dir, ignore_errors=True)
|
|
151
|
+
raise
|
|
125
152
|
|
|
126
153
|
if output_format == "webdataset":
|
|
127
154
|
await self._write_webdataset(dataset_dir, images_dir, metadata_file)
|
|
@@ -150,10 +177,7 @@ class AssetDownloader:
|
|
|
150
177
|
max_items: Optional[int],
|
|
151
178
|
image_column: Optional[str],
|
|
152
179
|
) -> Dict[str, int]:
|
|
153
|
-
|
|
154
|
-
from datasets import load_dataset
|
|
155
|
-
except Exception as e:
|
|
156
|
-
raise RuntimeError(f"datasets package is required. Install with: pip install datasets. Details: {e}")
|
|
180
|
+
from datasets import load_dataset # validated in download_assets()
|
|
157
181
|
|
|
158
182
|
await self._emit("start", {"source": "huggingface", "repo_id": repo_id})
|
|
159
183
|
|
|
@@ -212,10 +236,7 @@ class AssetDownloader:
|
|
|
212
236
|
errors_file: Path,
|
|
213
237
|
max_items: Optional[int],
|
|
214
238
|
) -> Dict[str, int]:
|
|
215
|
-
|
|
216
|
-
from kaggle.api.kaggle_api_extended import KaggleApi
|
|
217
|
-
except Exception as e:
|
|
218
|
-
raise RuntimeError(f"kaggle package is required: {e}")
|
|
239
|
+
from kaggle.api.kaggle_api_extended import KaggleApi # validated in download_assets()
|
|
219
240
|
|
|
220
241
|
await self._emit("start", {"source": "kaggle", "dataset": kaggle_ref})
|
|
221
242
|
|
|
@@ -353,24 +374,59 @@ class AssetDownloader:
|
|
|
353
374
|
raise ValueError(f"Unsupported image value type: {type(value)}")
|
|
354
375
|
|
|
355
376
|
async def _write_webdataset(self, dataset_dir: Path, images_dir: Path, metadata_file: Path) -> None:
|
|
356
|
-
|
|
357
|
-
|
|
377
|
+
"""Write a webdataset-compatible tar archive.
|
|
378
|
+
|
|
379
|
+
Uses Python's built-in tarfile module instead of wds.ShardWriter to
|
|
380
|
+
avoid the gopen() handler issue on Windows (backslash paths).
|
|
381
|
+
The resulting .tar files are fully compatible with webdataset readers.
|
|
382
|
+
"""
|
|
383
|
+
import io
|
|
384
|
+
import tarfile as _tarfile
|
|
385
|
+
|
|
386
|
+
max_per_shard = 5000
|
|
387
|
+
shard_idx = 0
|
|
388
|
+
count_in_shard = 0
|
|
389
|
+
current_tar: _tarfile.TarFile | None = None
|
|
390
|
+
|
|
391
|
+
def _open_shard() -> _tarfile.TarFile:
|
|
392
|
+
nonlocal shard_idx
|
|
393
|
+
shard_path = dataset_dir / f"shard-{shard_idx:06d}.tar"
|
|
394
|
+
shard_idx += 1
|
|
395
|
+
return _tarfile.open(str(shard_path), "w")
|
|
358
396
|
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
ext
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
397
|
+
try:
|
|
398
|
+
current_tar = _open_shard()
|
|
399
|
+
|
|
400
|
+
with metadata_file.open("r", encoding="utf-8") as mf:
|
|
401
|
+
for line in mf:
|
|
402
|
+
row = json.loads(line)
|
|
403
|
+
image_path = Path(row["image_path"])
|
|
404
|
+
if not image_path.exists():
|
|
405
|
+
continue
|
|
406
|
+
|
|
407
|
+
key = image_path.stem
|
|
408
|
+
ext = image_path.suffix.lstrip(".") or "jpg"
|
|
409
|
+
|
|
410
|
+
# Add image file
|
|
411
|
+
img_data = image_path.read_bytes()
|
|
412
|
+
img_info = _tarfile.TarInfo(name=f"{key}.{ext}")
|
|
413
|
+
img_info.size = len(img_data)
|
|
414
|
+
current_tar.addfile(img_info, io.BytesIO(img_data))
|
|
415
|
+
|
|
416
|
+
# Add JSON metadata sidecar
|
|
417
|
+
json_data = json.dumps(row, ensure_ascii=False).encode("utf-8")
|
|
418
|
+
json_info = _tarfile.TarInfo(name=f"{key}.json")
|
|
419
|
+
json_info.size = len(json_data)
|
|
420
|
+
current_tar.addfile(json_info, io.BytesIO(json_data))
|
|
421
|
+
|
|
422
|
+
count_in_shard += 1
|
|
423
|
+
if count_in_shard >= max_per_shard:
|
|
424
|
+
current_tar.close()
|
|
425
|
+
current_tar = _open_shard()
|
|
426
|
+
count_in_shard = 0
|
|
427
|
+
finally:
|
|
428
|
+
if current_tar is not None:
|
|
429
|
+
current_tar.close()
|
|
374
430
|
|
|
375
431
|
async def _write_parquet(self, dataset_dir: Path, metadata_file: Path) -> None:
|
|
376
432
|
try:
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vespermcp/mcp-server",
|
|
3
|
-
"version": "1.2.5",
|
|
3
|
+
"version": "1.2.7",
|
|
4
4
|
"description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "build/index.js",
|
package/scripts/postinstall.cjs
CHANGED
|
@@ -101,27 +101,54 @@ class AssetDownloader:
|
|
|
101
101
|
if source not in {"huggingface", "kaggle", "url"}:
|
|
102
102
|
raise ValueError("source must be one of: huggingface, kaggle, url")
|
|
103
103
|
|
|
104
|
-
|
|
105
|
-
images_dir = dataset_dir / "images"
|
|
106
|
-
dataset_dir.mkdir(parents=True, exist_ok=True)
|
|
107
|
-
images_dir.mkdir(parents=True, exist_ok=True)
|
|
108
|
-
|
|
109
|
-
errors_file = dataset_dir / "errors.jsonl"
|
|
110
|
-
metadata_file = dataset_dir / "metadata.jsonl"
|
|
111
|
-
|
|
104
|
+
# --- Validate imports and args BEFORE creating any directories ---
|
|
112
105
|
if source == "huggingface":
|
|
113
106
|
if not repo_id:
|
|
114
107
|
raise ValueError("repo_id is required for source=huggingface")
|
|
115
|
-
|
|
108
|
+
try:
|
|
109
|
+
from datasets import load_dataset as _ld # noqa: F401
|
|
110
|
+
except Exception as e:
|
|
111
|
+
raise RuntimeError(
|
|
112
|
+
f"datasets package is required for HuggingFace downloads. "
|
|
113
|
+
f"Install with: pip install datasets. Details: {e}"
|
|
114
|
+
)
|
|
116
115
|
elif source == "kaggle":
|
|
117
116
|
ref = kaggle_ref or repo_id
|
|
118
117
|
if not ref:
|
|
119
118
|
raise ValueError("kaggle_ref is required for source=kaggle")
|
|
120
|
-
|
|
119
|
+
try:
|
|
120
|
+
from kaggle.api.kaggle_api_extended import KaggleApi as _Ka # noqa: F401
|
|
121
|
+
except Exception as e:
|
|
122
|
+
raise RuntimeError(
|
|
123
|
+
f"kaggle package is required for Kaggle downloads. "
|
|
124
|
+
f"Install with: pip install kaggle. Details: {e}"
|
|
125
|
+
)
|
|
121
126
|
else:
|
|
122
127
|
if not urls:
|
|
123
128
|
raise ValueError("urls are required for source=url")
|
|
124
|
-
|
|
129
|
+
|
|
130
|
+
# --- Now safe to create directories ---
|
|
131
|
+
dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
|
|
132
|
+
images_dir = dataset_dir / "images"
|
|
133
|
+
dataset_dir.mkdir(parents=True, exist_ok=True)
|
|
134
|
+
images_dir.mkdir(parents=True, exist_ok=True)
|
|
135
|
+
|
|
136
|
+
errors_file = dataset_dir / "errors.jsonl"
|
|
137
|
+
metadata_file = dataset_dir / "metadata.jsonl"
|
|
138
|
+
|
|
139
|
+
try:
|
|
140
|
+
if source == "huggingface":
|
|
141
|
+
summary = await self._download_huggingface(repo_id, dataset_id, images_dir, metadata_file, errors_file, max_items, image_column)
|
|
142
|
+
elif source == "kaggle":
|
|
143
|
+
ref = kaggle_ref or repo_id
|
|
144
|
+
summary = await self._download_kaggle(ref, dataset_id, images_dir, metadata_file, errors_file, max_items)
|
|
145
|
+
else:
|
|
146
|
+
summary = await self._download_urls(urls, dataset_id, images_dir, metadata_file, errors_file, max_items)
|
|
147
|
+
except Exception:
|
|
148
|
+
# Clean up empty directories on failure so we don't leave ghost artifacts
|
|
149
|
+
if images_dir.exists() and not any(images_dir.iterdir()):
|
|
150
|
+
shutil.rmtree(dataset_dir, ignore_errors=True)
|
|
151
|
+
raise
|
|
125
152
|
|
|
126
153
|
if output_format == "webdataset":
|
|
127
154
|
await self._write_webdataset(dataset_dir, images_dir, metadata_file)
|
|
@@ -150,10 +177,7 @@ class AssetDownloader:
|
|
|
150
177
|
max_items: Optional[int],
|
|
151
178
|
image_column: Optional[str],
|
|
152
179
|
) -> Dict[str, int]:
|
|
153
|
-
|
|
154
|
-
from datasets import load_dataset
|
|
155
|
-
except Exception as e:
|
|
156
|
-
raise RuntimeError(f"datasets package is required. Install with: pip install datasets. Details: {e}")
|
|
180
|
+
from datasets import load_dataset # validated in download_assets()
|
|
157
181
|
|
|
158
182
|
await self._emit("start", {"source": "huggingface", "repo_id": repo_id})
|
|
159
183
|
|
|
@@ -212,10 +236,7 @@ class AssetDownloader:
|
|
|
212
236
|
errors_file: Path,
|
|
213
237
|
max_items: Optional[int],
|
|
214
238
|
) -> Dict[str, int]:
|
|
215
|
-
|
|
216
|
-
from kaggle.api.kaggle_api_extended import KaggleApi
|
|
217
|
-
except Exception as e:
|
|
218
|
-
raise RuntimeError(f"kaggle package is required: {e}")
|
|
239
|
+
from kaggle.api.kaggle_api_extended import KaggleApi # validated in download_assets()
|
|
219
240
|
|
|
220
241
|
await self._emit("start", {"source": "kaggle", "dataset": kaggle_ref})
|
|
221
242
|
|
|
@@ -353,24 +374,59 @@ class AssetDownloader:
|
|
|
353
374
|
raise ValueError(f"Unsupported image value type: {type(value)}")
|
|
354
375
|
|
|
355
376
|
async def _write_webdataset(self, dataset_dir: Path, images_dir: Path, metadata_file: Path) -> None:
|
|
356
|
-
|
|
357
|
-
|
|
377
|
+
"""Write a webdataset-compatible tar archive.
|
|
378
|
+
|
|
379
|
+
Uses Python's built-in tarfile module instead of wds.ShardWriter to
|
|
380
|
+
avoid the gopen() handler issue on Windows (backslash paths).
|
|
381
|
+
The resulting .tar files are fully compatible with webdataset readers.
|
|
382
|
+
"""
|
|
383
|
+
import io
|
|
384
|
+
import tarfile as _tarfile
|
|
385
|
+
|
|
386
|
+
max_per_shard = 5000
|
|
387
|
+
shard_idx = 0
|
|
388
|
+
count_in_shard = 0
|
|
389
|
+
current_tar: _tarfile.TarFile | None = None
|
|
390
|
+
|
|
391
|
+
def _open_shard() -> _tarfile.TarFile:
|
|
392
|
+
nonlocal shard_idx
|
|
393
|
+
shard_path = dataset_dir / f"shard-{shard_idx:06d}.tar"
|
|
394
|
+
shard_idx += 1
|
|
395
|
+
return _tarfile.open(str(shard_path), "w")
|
|
358
396
|
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
ext
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
397
|
+
try:
|
|
398
|
+
current_tar = _open_shard()
|
|
399
|
+
|
|
400
|
+
with metadata_file.open("r", encoding="utf-8") as mf:
|
|
401
|
+
for line in mf:
|
|
402
|
+
row = json.loads(line)
|
|
403
|
+
image_path = Path(row["image_path"])
|
|
404
|
+
if not image_path.exists():
|
|
405
|
+
continue
|
|
406
|
+
|
|
407
|
+
key = image_path.stem
|
|
408
|
+
ext = image_path.suffix.lstrip(".") or "jpg"
|
|
409
|
+
|
|
410
|
+
# Add image file
|
|
411
|
+
img_data = image_path.read_bytes()
|
|
412
|
+
img_info = _tarfile.TarInfo(name=f"{key}.{ext}")
|
|
413
|
+
img_info.size = len(img_data)
|
|
414
|
+
current_tar.addfile(img_info, io.BytesIO(img_data))
|
|
415
|
+
|
|
416
|
+
# Add JSON metadata sidecar
|
|
417
|
+
json_data = json.dumps(row, ensure_ascii=False).encode("utf-8")
|
|
418
|
+
json_info = _tarfile.TarInfo(name=f"{key}.json")
|
|
419
|
+
json_info.size = len(json_data)
|
|
420
|
+
current_tar.addfile(json_info, io.BytesIO(json_data))
|
|
421
|
+
|
|
422
|
+
count_in_shard += 1
|
|
423
|
+
if count_in_shard >= max_per_shard:
|
|
424
|
+
current_tar.close()
|
|
425
|
+
current_tar = _open_shard()
|
|
426
|
+
count_in_shard = 0
|
|
427
|
+
finally:
|
|
428
|
+
if current_tar is not None:
|
|
429
|
+
current_tar.close()
|
|
374
430
|
|
|
375
431
|
async def _write_parquet(self, dataset_dir: Path, metadata_file: Path) -> None:
|
|
376
432
|
try:
|