npm - @vespermcp/mcp-server - Versions diffs - 1.2.6 → 1.2.7 - Mend

@vespermcp/mcp-server 1.2.6 → 1.2.7

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Files changed (5) hide show

package/build/index.js +0 -3
package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
package/build/python/vesper/core/asset_downloader.py +52 -23
package/package.json +1 -1
package/src/python/vesper/core/asset_downloader.py +52 -23

package/build/index.js CHANGED Viewed

@@ -1019,9 +1019,6 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
             if (source === "kaggle") {
                 requiredModules.push({ module: "kaggle", packageName: "kaggle" });
             }
-            if (outputFormat === "webdataset") {
-                requiredModules.push({ module: "webdataset", packageName: "webdataset" });
-            }
             try {
                 await ensurePythonModules(requiredModules);
             }

package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc CHANGED Viewed

Binary file

package/build/python/vesper/core/asset_downloader.py CHANGED Viewed

@@ -127,12 +127,6 @@ class AssetDownloader:
             if not urls:
                 raise ValueError("urls are required for source=url")
-        if output_format == "webdataset" and wds is None:
-            raise RuntimeError(
-                "webdataset package is required for webdataset output. "
-                "Install with: pip install webdataset"
-            )
         # --- Now safe to create directories ---
         dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
         images_dir = dataset_dir / "images"
@@ -380,24 +374,59 @@ class AssetDownloader:
         raise ValueError(f"Unsupported image value type: {type(value)}")
     async def _write_webdataset(self, dataset_dir: Path, images_dir: Path, metadata_file: Path) -> None:
-        if wds is None:
-            raise RuntimeError("webdataset package is required for webdataset output. Install with: pip install webdataset")
+        """Write a webdataset-compatible tar archive.
+        Uses Python's built-in tarfile module instead of wds.ShardWriter to
+        avoid the gopen() handler issue on Windows (backslash paths).
+        The resulting .tar files are fully compatible with webdataset readers.
+        """
+        import io
+        import tarfile as _tarfile
+        max_per_shard = 5000
+        shard_idx = 0
+        count_in_shard = 0
+        current_tar: _tarfile.TarFile | None = None
+        def _open_shard() -> _tarfile.TarFile:
+            nonlocal shard_idx
+            shard_path = dataset_dir / f"shard-{shard_idx:06d}.tar"
+            shard_idx += 1
+            return _tarfile.open(str(shard_path), "w")
-        shard_pattern = str(dataset_dir / "shard-%06d.tar")
-        with metadata_file.open("r", encoding="utf-8") as mf, wds.ShardWriter(shard_pattern, maxcount=5000) as sink:
-            for line in mf:
-                row = json.loads(line)
-                image_path = Path(row["image_path"])
-                if not image_path.exists():
-                    continue
-                key = image_path.stem
-                ext = image_path.suffix.lstrip(".") or "jpg"
-                sample = {
-                    "__key__": key,
-                    ext: image_path.read_bytes(),
-                    "json": json.dumps(row, ensure_ascii=False).encode("utf-8"),
-                }
-                sink.write(sample)
+        try:
+            current_tar = _open_shard()
+            with metadata_file.open("r", encoding="utf-8") as mf:
+                for line in mf:
+                    row = json.loads(line)
+                    image_path = Path(row["image_path"])
+                    if not image_path.exists():
+                        continue
+                    key = image_path.stem
+                    ext = image_path.suffix.lstrip(".") or "jpg"
+                    # Add image file
+                    img_data = image_path.read_bytes()
+                    img_info = _tarfile.TarInfo(name=f"{key}.{ext}")
+                    img_info.size = len(img_data)
+                    current_tar.addfile(img_info, io.BytesIO(img_data))
+                    # Add JSON metadata sidecar
+                    json_data = json.dumps(row, ensure_ascii=False).encode("utf-8")
+                    json_info = _tarfile.TarInfo(name=f"{key}.json")
+                    json_info.size = len(json_data)
+                    current_tar.addfile(json_info, io.BytesIO(json_data))
+                    count_in_shard += 1
+                    if count_in_shard >= max_per_shard:
+                        current_tar.close()
+                        current_tar = _open_shard()
+                        count_in_shard = 0
+        finally:
+            if current_tar is not None:
+                current_tar.close()
     async def _write_parquet(self, dataset_dir: Path, metadata_file: Path) -> None:
         try:

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@vespermcp/mcp-server",
-  "version": "1.2.6",
+  "version": "1.2.7",
   "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
   "type": "module",
   "main": "build/index.js",

package/src/python/vesper/core/asset_downloader.py CHANGED Viewed

@@ -127,12 +127,6 @@ class AssetDownloader:
             if not urls:
                 raise ValueError("urls are required for source=url")
-        if output_format == "webdataset" and wds is None:
-            raise RuntimeError(
-                "webdataset package is required for webdataset output. "
-                "Install with: pip install webdataset"
-            )
         # --- Now safe to create directories ---
         dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
         images_dir = dataset_dir / "images"
@@ -380,24 +374,59 @@ class AssetDownloader:
         raise ValueError(f"Unsupported image value type: {type(value)}")
     async def _write_webdataset(self, dataset_dir: Path, images_dir: Path, metadata_file: Path) -> None:
-        if wds is None:
-            raise RuntimeError("webdataset package is required for webdataset output. Install with: pip install webdataset")
+        """Write a webdataset-compatible tar archive.
+        Uses Python's built-in tarfile module instead of wds.ShardWriter to
+        avoid the gopen() handler issue on Windows (backslash paths).
+        The resulting .tar files are fully compatible with webdataset readers.
+        """
+        import io
+        import tarfile as _tarfile
+        max_per_shard = 5000
+        shard_idx = 0
+        count_in_shard = 0
+        current_tar: _tarfile.TarFile | None = None
+        def _open_shard() -> _tarfile.TarFile:
+            nonlocal shard_idx
+            shard_path = dataset_dir / f"shard-{shard_idx:06d}.tar"
+            shard_idx += 1
+            return _tarfile.open(str(shard_path), "w")
-        shard_pattern = str(dataset_dir / "shard-%06d.tar")
-        with metadata_file.open("r", encoding="utf-8") as mf, wds.ShardWriter(shard_pattern, maxcount=5000) as sink:
-            for line in mf:
-                row = json.loads(line)
-                image_path = Path(row["image_path"])
-                if not image_path.exists():
-                    continue
-                key = image_path.stem
-                ext = image_path.suffix.lstrip(".") or "jpg"
-                sample = {
-                    "__key__": key,
-                    ext: image_path.read_bytes(),
-                    "json": json.dumps(row, ensure_ascii=False).encode("utf-8"),
-                }
-                sink.write(sample)
+        try:
+            current_tar = _open_shard()
+            with metadata_file.open("r", encoding="utf-8") as mf:
+                for line in mf:
+                    row = json.loads(line)
+                    image_path = Path(row["image_path"])
+                    if not image_path.exists():
+                        continue
+                    key = image_path.stem
+                    ext = image_path.suffix.lstrip(".") or "jpg"
+                    # Add image file
+                    img_data = image_path.read_bytes()
+                    img_info = _tarfile.TarInfo(name=f"{key}.{ext}")
+                    img_info.size = len(img_data)
+                    current_tar.addfile(img_info, io.BytesIO(img_data))
+                    # Add JSON metadata sidecar
+                    json_data = json.dumps(row, ensure_ascii=False).encode("utf-8")
+                    json_info = _tarfile.TarInfo(name=f"{key}.json")
+                    json_info.size = len(json_data)
+                    current_tar.addfile(json_info, io.BytesIO(json_data))
+                    count_in_shard += 1
+                    if count_in_shard >= max_per_shard:
+                        current_tar.close()
+                        current_tar = _open_shard()
+                        count_in_shard = 0
+        finally:
+            if current_tar is not None:
+                current_tar.close()
     async def _write_parquet(self, dataset_dir: Path, metadata_file: Path) -> None:
         try: