@vespermcp/mcp-server 1.2.6 → 1.2.7

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -1019,9 +1019,6 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1019
1019
  if (source === "kaggle") {
1020
1020
  requiredModules.push({ module: "kaggle", packageName: "kaggle" });
1021
1021
  }
1022
- if (outputFormat === "webdataset") {
1023
- requiredModules.push({ module: "webdataset", packageName: "webdataset" });
1024
- }
1025
1022
  try {
1026
1023
  await ensurePythonModules(requiredModules);
1027
1024
  }
@@ -127,12 +127,6 @@ class AssetDownloader:
127
127
  if not urls:
128
128
  raise ValueError("urls are required for source=url")
129
129
 
130
- if output_format == "webdataset" and wds is None:
131
- raise RuntimeError(
132
- "webdataset package is required for webdataset output. "
133
- "Install with: pip install webdataset"
134
- )
135
-
136
130
  # --- Now safe to create directories ---
137
131
  dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
138
132
  images_dir = dataset_dir / "images"
@@ -380,24 +374,59 @@ class AssetDownloader:
380
374
  raise ValueError(f"Unsupported image value type: {type(value)}")
381
375
 
382
376
  async def _write_webdataset(self, dataset_dir: Path, images_dir: Path, metadata_file: Path) -> None:
383
- if wds is None:
384
- raise RuntimeError("webdataset package is required for webdataset output. Install with: pip install webdataset")
377
+ """Write a webdataset-compatible tar archive.
378
+
379
+ Uses Python's built-in tarfile module instead of wds.ShardWriter to
380
+ avoid the gopen() handler issue on Windows (backslash paths).
381
+ The resulting .tar files are fully compatible with webdataset readers.
382
+ """
383
+ import io
384
+ import tarfile as _tarfile
385
+
386
+ max_per_shard = 5000
387
+ shard_idx = 0
388
+ count_in_shard = 0
389
+ current_tar: _tarfile.TarFile | None = None
390
+
391
+ def _open_shard() -> _tarfile.TarFile:
392
+ nonlocal shard_idx
393
+ shard_path = dataset_dir / f"shard-{shard_idx:06d}.tar"
394
+ shard_idx += 1
395
+ return _tarfile.open(str(shard_path), "w")
385
396
 
386
- shard_pattern = str(dataset_dir / "shard-%06d.tar")
387
- with metadata_file.open("r", encoding="utf-8") as mf, wds.ShardWriter(shard_pattern, maxcount=5000) as sink:
388
- for line in mf:
389
- row = json.loads(line)
390
- image_path = Path(row["image_path"])
391
- if not image_path.exists():
392
- continue
393
- key = image_path.stem
394
- ext = image_path.suffix.lstrip(".") or "jpg"
395
- sample = {
396
- "__key__": key,
397
- ext: image_path.read_bytes(),
398
- "json": json.dumps(row, ensure_ascii=False).encode("utf-8"),
399
- }
400
- sink.write(sample)
397
+ try:
398
+ current_tar = _open_shard()
399
+
400
+ with metadata_file.open("r", encoding="utf-8") as mf:
401
+ for line in mf:
402
+ row = json.loads(line)
403
+ image_path = Path(row["image_path"])
404
+ if not image_path.exists():
405
+ continue
406
+
407
+ key = image_path.stem
408
+ ext = image_path.suffix.lstrip(".") or "jpg"
409
+
410
+ # Add image file
411
+ img_data = image_path.read_bytes()
412
+ img_info = _tarfile.TarInfo(name=f"{key}.{ext}")
413
+ img_info.size = len(img_data)
414
+ current_tar.addfile(img_info, io.BytesIO(img_data))
415
+
416
+ # Add JSON metadata sidecar
417
+ json_data = json.dumps(row, ensure_ascii=False).encode("utf-8")
418
+ json_info = _tarfile.TarInfo(name=f"{key}.json")
419
+ json_info.size = len(json_data)
420
+ current_tar.addfile(json_info, io.BytesIO(json_data))
421
+
422
+ count_in_shard += 1
423
+ if count_in_shard >= max_per_shard:
424
+ current_tar.close()
425
+ current_tar = _open_shard()
426
+ count_in_shard = 0
427
+ finally:
428
+ if current_tar is not None:
429
+ current_tar.close()
401
430
 
402
431
  async def _write_parquet(self, dataset_dir: Path, metadata_file: Path) -> None:
403
432
  try:
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vespermcp/mcp-server",
3
- "version": "1.2.6",
3
+ "version": "1.2.7",
4
4
  "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
5
5
  "type": "module",
6
6
  "main": "build/index.js",
@@ -127,12 +127,6 @@ class AssetDownloader:
127
127
  if not urls:
128
128
  raise ValueError("urls are required for source=url")
129
129
 
130
- if output_format == "webdataset" and wds is None:
131
- raise RuntimeError(
132
- "webdataset package is required for webdataset output. "
133
- "Install with: pip install webdataset"
134
- )
135
-
136
130
  # --- Now safe to create directories ---
137
131
  dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
138
132
  images_dir = dataset_dir / "images"
@@ -380,24 +374,59 @@ class AssetDownloader:
380
374
  raise ValueError(f"Unsupported image value type: {type(value)}")
381
375
 
382
376
  async def _write_webdataset(self, dataset_dir: Path, images_dir: Path, metadata_file: Path) -> None:
383
- if wds is None:
384
- raise RuntimeError("webdataset package is required for webdataset output. Install with: pip install webdataset")
377
+ """Write a webdataset-compatible tar archive.
378
+
379
+ Uses Python's built-in tarfile module instead of wds.ShardWriter to
380
+ avoid the gopen() handler issue on Windows (backslash paths).
381
+ The resulting .tar files are fully compatible with webdataset readers.
382
+ """
383
+ import io
384
+ import tarfile as _tarfile
385
+
386
+ max_per_shard = 5000
387
+ shard_idx = 0
388
+ count_in_shard = 0
389
+ current_tar: _tarfile.TarFile | None = None
390
+
391
+ def _open_shard() -> _tarfile.TarFile:
392
+ nonlocal shard_idx
393
+ shard_path = dataset_dir / f"shard-{shard_idx:06d}.tar"
394
+ shard_idx += 1
395
+ return _tarfile.open(str(shard_path), "w")
385
396
 
386
- shard_pattern = str(dataset_dir / "shard-%06d.tar")
387
- with metadata_file.open("r", encoding="utf-8") as mf, wds.ShardWriter(shard_pattern, maxcount=5000) as sink:
388
- for line in mf:
389
- row = json.loads(line)
390
- image_path = Path(row["image_path"])
391
- if not image_path.exists():
392
- continue
393
- key = image_path.stem
394
- ext = image_path.suffix.lstrip(".") or "jpg"
395
- sample = {
396
- "__key__": key,
397
- ext: image_path.read_bytes(),
398
- "json": json.dumps(row, ensure_ascii=False).encode("utf-8"),
399
- }
400
- sink.write(sample)
397
+ try:
398
+ current_tar = _open_shard()
399
+
400
+ with metadata_file.open("r", encoding="utf-8") as mf:
401
+ for line in mf:
402
+ row = json.loads(line)
403
+ image_path = Path(row["image_path"])
404
+ if not image_path.exists():
405
+ continue
406
+
407
+ key = image_path.stem
408
+ ext = image_path.suffix.lstrip(".") or "jpg"
409
+
410
+ # Add image file
411
+ img_data = image_path.read_bytes()
412
+ img_info = _tarfile.TarInfo(name=f"{key}.{ext}")
413
+ img_info.size = len(img_data)
414
+ current_tar.addfile(img_info, io.BytesIO(img_data))
415
+
416
+ # Add JSON metadata sidecar
417
+ json_data = json.dumps(row, ensure_ascii=False).encode("utf-8")
418
+ json_info = _tarfile.TarInfo(name=f"{key}.json")
419
+ json_info.size = len(json_data)
420
+ current_tar.addfile(json_info, io.BytesIO(json_data))
421
+
422
+ count_in_shard += 1
423
+ if count_in_shard >= max_per_shard:
424
+ current_tar.close()
425
+ current_tar = _open_shard()
426
+ count_in_shard = 0
427
+ finally:
428
+ if current_tar is not None:
429
+ current_tar.close()
401
430
 
402
431
  async def _write_parquet(self, dataset_dir: Path, metadata_file: Path) -> None:
403
432
  try: