@vespermcp/mcp-server 1.2.5 → 1.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -131,8 +131,83 @@ function extractRequestedRows(query, requirements) {
131
131
  return Math.max(...allNums);
132
132
  return undefined;
133
133
  }
134
+ const verifiedPythonModules = new Set();
135
// Returns the name of the Python executable to spawn: "py" when
// process.platform is "win32", and "python" on every other platform.
+ function getPythonCommand() {
136
+ return process.platform === "win32" ? "py" : "python";
137
+ }
138
// Spawns the platform's Python command with `args` and collects its output.
//
// Parameters:
//   args      - argument list passed to the Python executable.
//   timeoutMs - milliseconds to wait before giving up (default 300000).
//
// Returns a Promise that:
//   - resolves with { code, stdout, stderr } when the process closes. A
//     non-zero exit code still resolves; callers must inspect `code`.
//   - rejects with the spawn "error" event's error, or with a timeout Error
//     once `timeoutMs` elapses.
+ function runPythonProcess(args, timeoutMs = 300000) {
139
+ const pyCmd = getPythonCommand();
140
+ return new Promise((resolve, reject) => {
// The child inherits the current environment, with two variables forced to "1".
141
+ const proc = spawn(pyCmd, args, {
142
+ env: {
143
+ ...process.env,
144
+ PIP_DISABLE_PIP_VERSION_CHECK: "1",
145
+ PYTHONUTF8: "1",
146
+ },
147
+ });
148
+ let stdout = "";
149
+ let stderr = "";
// Each chunk is stringified and appended as it arrives.
150
+ proc.stdout.on("data", (d) => (stdout += d.toString()));
151
+ proc.stderr.on("data", (d) => (stderr += d.toString()));
// Timeout path: attempt to kill the child (ignoring any error thrown by
// kill()), then reject with a message that echoes the joined arguments.
152
+ const timer = setTimeout(() => {
153
+ try {
154
+ proc.kill();
155
+ }
156
+ catch {
157
+ // no-op
158
+ }
159
+ reject(new Error(`Python command timed out after ${timeoutMs}ms: ${args.join(" ")}`));
160
+ }, timeoutMs);
// Normal completion: cancel the timer; a null/undefined exit code is reported as 1.
161
+ proc.on("close", (code) => {
162
+ clearTimeout(timer);
163
+ resolve({ code: code ?? 1, stdout, stderr });
164
+ });
// "error" event on the child process: cancel the timer and reject with that error.
165
+ proc.on("error", (error) => {
166
+ clearTimeout(timer);
167
+ reject(error);
168
+ });
169
+ });
170
+ }
171
// Ensures every Python module in `modulePackagePairs` is importable,
// installing the corresponding pip packages when it is not.
//
// Parameters:
//   modulePackagePairs - iterable of { module, packageName } objects, where
//     `module` is the import name to probe and `packageName` is the name
//     handed to `pip install`.
//
// Returns nothing on success. Throws an Error when pip fails both with and
// without `--user`. Rejections from runPythonProcess are not caught here and
// propagate to the caller.
+ async function ensurePythonModules(modulePackagePairs) {
172
+ const missing = [];
173
+ for (const pair of modulePackagePairs) {
// Modules already recorded in verifiedPythonModules are not probed again.
174
+ if (verifiedPythonModules.has(pair.module)) {
175
+ continue;
176
+ }
// Probe: the Python one-liner exits 0 if importlib.util.find_spec returns a
// truthy spec for the module and 1 otherwise; 20000 ms timeout.
177
+ const check = await runPythonProcess([
178
+ "-c",
179
+ `import importlib.util,sys; sys.exit(0 if importlib.util.find_spec(${JSON.stringify(pair.module)}) else 1)`
180
+ ], 20000);
181
+ if (check.code === 0) {
182
+ verifiedPythonModules.add(pair.module);
183
+ }
184
+ else {
185
+ missing.push(pair);
186
+ }
187
+ }
188
+ if (missing.length === 0) {
189
+ return;
190
+ }
// De-duplicate package names so each is passed to pip only once.
191
+ const packages = [...new Set(missing.map(m => m.packageName))];
192
+ console.error(`[Vesper] Installing missing Python packages: ${packages.join(", ")}`);
// First attempt: plain `python -m pip install` with a 600000 ms timeout.
193
+ const installArgs = ["-m", "pip", "install", "--disable-pip-version-check", ...packages];
194
+ let install = await runPythonProcess(installArgs, 600000);
// Fallback: on a non-zero exit, log the first 300 characters of stderr and
// retry the same install with `--user`.
195
+ if (install.code !== 0) {
196
+ console.error(`[Vesper] pip install failed (trying --user fallback): ${(install.stderr || "").slice(0, 300)}`);
197
+ const userInstallArgs = ["-m", "pip", "install", "--disable-pip-version-check", "--user", ...packages];
198
+ install = await runPythonProcess(userInstallArgs, 600000);
199
+ }
// Still failing: surface pip's stderr (or stdout, or a generic message).
200
+ if (install.code !== 0) {
201
+ const details = (install.stderr || install.stdout || "Unknown pip install error").trim();
202
+ throw new Error(`Failed to install required Python packages (${packages.join(", ")}). ${details}`);
203
+ }
204
+ console.error(`[Vesper] Successfully installed: ${packages.join(", ")}`);
// The installed modules are recorded as verified without re-running the import probe.
205
+ for (const pair of missing) {
206
+ verifiedPythonModules.add(pair.module);
207
+ }
208
+ }
134
209
  function runPythonJson(scriptPath, args) {
135
- const pyCmd = process.platform === "win32" ? "py" : "python";
210
+ const pyCmd = getPythonCommand();
136
211
  return new Promise((resolve, reject) => {
137
212
  const proc = spawn(pyCmd, [scriptPath, ...args]);
138
213
  let stdout = "";
@@ -932,6 +1007,27 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
932
1007
  isError: true,
933
1008
  };
934
1009
  }
1010
+ const requiredModules = [
1011
+ { module: "aiohttp", packageName: "aiohttp" },
1012
+ ];
1013
+ if (source === "url") {
1014
+ requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
1015
+ }
1016
+ if (source === "huggingface") {
1017
+ requiredModules.push({ module: "datasets", packageName: "datasets" });
1018
+ }
1019
+ if (source === "kaggle") {
1020
+ requiredModules.push({ module: "kaggle", packageName: "kaggle" });
1021
+ }
1022
+ try {
1023
+ await ensurePythonModules(requiredModules);
1024
+ }
1025
+ catch (error) {
1026
+ return {
1027
+ content: [{ type: "text", text: `ERROR: Python dependency setup failed: ${error.message}` }],
1028
+ isError: true,
1029
+ };
1030
+ }
935
1031
  const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
936
1032
  const payload = {
937
1033
  dataset_id: datasetId,
@@ -101,27 +101,54 @@ class AssetDownloader:
101
101
  if source not in {"huggingface", "kaggle", "url"}:
102
102
  raise ValueError("source must be one of: huggingface, kaggle, url")
103
103
 
104
- dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
105
- images_dir = dataset_dir / "images"
106
- dataset_dir.mkdir(parents=True, exist_ok=True)
107
- images_dir.mkdir(parents=True, exist_ok=True)
108
-
109
- errors_file = dataset_dir / "errors.jsonl"
110
- metadata_file = dataset_dir / "metadata.jsonl"
111
-
104
+ # --- Validate imports and args BEFORE creating any directories ---
112
105
  if source == "huggingface":
113
106
  if not repo_id:
114
107
  raise ValueError("repo_id is required for source=huggingface")
115
- summary = await self._download_huggingface(repo_id, dataset_id, images_dir, metadata_file, errors_file, max_items, image_column)
108
+ try:
109
+ from datasets import load_dataset as _ld # noqa: F401
110
+ except Exception as e:
111
+ raise RuntimeError(
112
+ f"datasets package is required for HuggingFace downloads. "
113
+ f"Install with: pip install datasets. Details: {e}"
114
+ )
116
115
  elif source == "kaggle":
117
116
  ref = kaggle_ref or repo_id
118
117
  if not ref:
119
118
  raise ValueError("kaggle_ref is required for source=kaggle")
120
- summary = await self._download_kaggle(ref, dataset_id, images_dir, metadata_file, errors_file, max_items)
119
+ try:
120
+ from kaggle.api.kaggle_api_extended import KaggleApi as _Ka # noqa: F401
121
+ except Exception as e:
122
+ raise RuntimeError(
123
+ f"kaggle package is required for Kaggle downloads. "
124
+ f"Install with: pip install kaggle. Details: {e}"
125
+ )
121
126
  else:
122
127
  if not urls:
123
128
  raise ValueError("urls are required for source=url")
124
- summary = await self._download_urls(urls, dataset_id, images_dir, metadata_file, errors_file, max_items)
129
+
130
+ # --- Now safe to create directories ---
131
+ dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
132
+ images_dir = dataset_dir / "images"
133
+ dataset_dir.mkdir(parents=True, exist_ok=True)
134
+ images_dir.mkdir(parents=True, exist_ok=True)
135
+
136
+ errors_file = dataset_dir / "errors.jsonl"
137
+ metadata_file = dataset_dir / "metadata.jsonl"
138
+
139
+ try:
140
+ if source == "huggingface":
141
+ summary = await self._download_huggingface(repo_id, dataset_id, images_dir, metadata_file, errors_file, max_items, image_column)
142
+ elif source == "kaggle":
143
+ ref = kaggle_ref or repo_id
144
+ summary = await self._download_kaggle(ref, dataset_id, images_dir, metadata_file, errors_file, max_items)
145
+ else:
146
+ summary = await self._download_urls(urls, dataset_id, images_dir, metadata_file, errors_file, max_items)
147
+ except Exception:
148
+ # Clean up empty directories on failure so we don't leave ghost artifacts
149
+ if images_dir.exists() and not any(images_dir.iterdir()):
150
+ shutil.rmtree(dataset_dir, ignore_errors=True)
151
+ raise
125
152
 
126
153
  if output_format == "webdataset":
127
154
  await self._write_webdataset(dataset_dir, images_dir, metadata_file)
@@ -150,10 +177,7 @@ class AssetDownloader:
150
177
  max_items: Optional[int],
151
178
  image_column: Optional[str],
152
179
  ) -> Dict[str, int]:
153
- try:
154
- from datasets import load_dataset
155
- except Exception as e:
156
- raise RuntimeError(f"datasets package is required. Install with: pip install datasets. Details: {e}")
180
+ from datasets import load_dataset # validated in download_assets()
157
181
 
158
182
  await self._emit("start", {"source": "huggingface", "repo_id": repo_id})
159
183
 
@@ -212,10 +236,7 @@ class AssetDownloader:
212
236
  errors_file: Path,
213
237
  max_items: Optional[int],
214
238
  ) -> Dict[str, int]:
215
- try:
216
- from kaggle.api.kaggle_api_extended import KaggleApi
217
- except Exception as e:
218
- raise RuntimeError(f"kaggle package is required: {e}")
239
+ from kaggle.api.kaggle_api_extended import KaggleApi # validated in download_assets()
219
240
 
220
241
  await self._emit("start", {"source": "kaggle", "dataset": kaggle_ref})
221
242
 
@@ -353,24 +374,59 @@ class AssetDownloader:
353
374
  raise ValueError(f"Unsupported image value type: {type(value)}")
354
375
 
355
376
  async def _write_webdataset(self, dataset_dir: Path, images_dir: Path, metadata_file: Path) -> None:
356
- if wds is None:
357
- raise RuntimeError("webdataset package is required for webdataset output. Install with: pip install webdataset")
377
+ """Write a webdataset-compatible tar archive.
378
+
379
+ Uses Python's built-in tarfile module instead of wds.ShardWriter to
380
+ avoid the gopen() handler issue on Windows (backslash paths).
381
+ The resulting .tar files are fully compatible with webdataset readers.
382
+ """
383
+ import io
384
+ import tarfile as _tarfile
385
+
386
+ max_per_shard = 5000
387
+ shard_idx = 0
388
+ count_in_shard = 0
389
+ current_tar: _tarfile.TarFile | None = None
390
+
391
+ def _open_shard() -> _tarfile.TarFile:
392
+ nonlocal shard_idx
393
+ shard_path = dataset_dir / f"shard-{shard_idx:06d}.tar"
394
+ shard_idx += 1
395
+ return _tarfile.open(str(shard_path), "w")
358
396
 
359
- shard_pattern = str(dataset_dir / "shard-%06d.tar")
360
- with metadata_file.open("r", encoding="utf-8") as mf, wds.ShardWriter(shard_pattern, maxcount=5000) as sink:
361
- for line in mf:
362
- row = json.loads(line)
363
- image_path = Path(row["image_path"])
364
- if not image_path.exists():
365
- continue
366
- key = image_path.stem
367
- ext = image_path.suffix.lstrip(".") or "jpg"
368
- sample = {
369
- "__key__": key,
370
- ext: image_path.read_bytes(),
371
- "json": json.dumps(row, ensure_ascii=False).encode("utf-8"),
372
- }
373
- sink.write(sample)
397
+ try:
398
+ current_tar = _open_shard()
399
+
400
+ with metadata_file.open("r", encoding="utf-8") as mf:
401
+ for line in mf:
402
+ row = json.loads(line)
403
+ image_path = Path(row["image_path"])
404
+ if not image_path.exists():
405
+ continue
406
+
407
+ key = image_path.stem
408
+ ext = image_path.suffix.lstrip(".") or "jpg"
409
+
410
+ # Add image file
411
+ img_data = image_path.read_bytes()
412
+ img_info = _tarfile.TarInfo(name=f"{key}.{ext}")
413
+ img_info.size = len(img_data)
414
+ current_tar.addfile(img_info, io.BytesIO(img_data))
415
+
416
+ # Add JSON metadata sidecar
417
+ json_data = json.dumps(row, ensure_ascii=False).encode("utf-8")
418
+ json_info = _tarfile.TarInfo(name=f"{key}.json")
419
+ json_info.size = len(json_data)
420
+ current_tar.addfile(json_info, io.BytesIO(json_data))
421
+
422
+ count_in_shard += 1
423
+ if count_in_shard >= max_per_shard:
424
+ current_tar.close()
425
+ current_tar = _open_shard()
426
+ count_in_shard = 0
427
+ finally:
428
+ if current_tar is not None:
429
+ current_tar.close()
374
430
 
375
431
  async def _write_parquet(self, dataset_dir: Path, metadata_file: Path) -> None:
376
432
  try:
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vespermcp/mcp-server",
3
- "version": "1.2.5",
3
+ "version": "1.2.7",
4
4
  "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
5
5
  "type": "module",
6
6
  "main": "build/index.js",
@@ -23,7 +23,12 @@ const pythonPackages = [
23
23
  'pillow',
24
24
  'numpy',
25
25
  'librosa',
26
- 'soundfile'
26
+ 'soundfile',
27
+ 'aiohttp',
28
+ 'aiofiles',
29
+ 'datasets',
30
+ 'webdataset',
31
+ 'kaggle'
27
32
  ];
28
33
 
29
34
  try {
@@ -101,27 +101,54 @@ class AssetDownloader:
101
101
  if source not in {"huggingface", "kaggle", "url"}:
102
102
  raise ValueError("source must be one of: huggingface, kaggle, url")
103
103
 
104
- dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
105
- images_dir = dataset_dir / "images"
106
- dataset_dir.mkdir(parents=True, exist_ok=True)
107
- images_dir.mkdir(parents=True, exist_ok=True)
108
-
109
- errors_file = dataset_dir / "errors.jsonl"
110
- metadata_file = dataset_dir / "metadata.jsonl"
111
-
104
+ # --- Validate imports and args BEFORE creating any directories ---
112
105
  if source == "huggingface":
113
106
  if not repo_id:
114
107
  raise ValueError("repo_id is required for source=huggingface")
115
- summary = await self._download_huggingface(repo_id, dataset_id, images_dir, metadata_file, errors_file, max_items, image_column)
108
+ try:
109
+ from datasets import load_dataset as _ld # noqa: F401
110
+ except Exception as e:
111
+ raise RuntimeError(
112
+ f"datasets package is required for HuggingFace downloads. "
113
+ f"Install with: pip install datasets. Details: {e}"
114
+ )
116
115
  elif source == "kaggle":
117
116
  ref = kaggle_ref or repo_id
118
117
  if not ref:
119
118
  raise ValueError("kaggle_ref is required for source=kaggle")
120
- summary = await self._download_kaggle(ref, dataset_id, images_dir, metadata_file, errors_file, max_items)
119
+ try:
120
+ from kaggle.api.kaggle_api_extended import KaggleApi as _Ka # noqa: F401
121
+ except Exception as e:
122
+ raise RuntimeError(
123
+ f"kaggle package is required for Kaggle downloads. "
124
+ f"Install with: pip install kaggle. Details: {e}"
125
+ )
121
126
  else:
122
127
  if not urls:
123
128
  raise ValueError("urls are required for source=url")
124
- summary = await self._download_urls(urls, dataset_id, images_dir, metadata_file, errors_file, max_items)
129
+
130
+ # --- Now safe to create directories ---
131
+ dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
132
+ images_dir = dataset_dir / "images"
133
+ dataset_dir.mkdir(parents=True, exist_ok=True)
134
+ images_dir.mkdir(parents=True, exist_ok=True)
135
+
136
+ errors_file = dataset_dir / "errors.jsonl"
137
+ metadata_file = dataset_dir / "metadata.jsonl"
138
+
139
+ try:
140
+ if source == "huggingface":
141
+ summary = await self._download_huggingface(repo_id, dataset_id, images_dir, metadata_file, errors_file, max_items, image_column)
142
+ elif source == "kaggle":
143
+ ref = kaggle_ref or repo_id
144
+ summary = await self._download_kaggle(ref, dataset_id, images_dir, metadata_file, errors_file, max_items)
145
+ else:
146
+ summary = await self._download_urls(urls, dataset_id, images_dir, metadata_file, errors_file, max_items)
147
+ except Exception:
148
+ # Clean up empty directories on failure so we don't leave ghost artifacts
149
+ if images_dir.exists() and not any(images_dir.iterdir()):
150
+ shutil.rmtree(dataset_dir, ignore_errors=True)
151
+ raise
125
152
 
126
153
  if output_format == "webdataset":
127
154
  await self._write_webdataset(dataset_dir, images_dir, metadata_file)
@@ -150,10 +177,7 @@ class AssetDownloader:
150
177
  max_items: Optional[int],
151
178
  image_column: Optional[str],
152
179
  ) -> Dict[str, int]:
153
- try:
154
- from datasets import load_dataset
155
- except Exception as e:
156
- raise RuntimeError(f"datasets package is required. Install with: pip install datasets. Details: {e}")
180
+ from datasets import load_dataset # validated in download_assets()
157
181
 
158
182
  await self._emit("start", {"source": "huggingface", "repo_id": repo_id})
159
183
 
@@ -212,10 +236,7 @@ class AssetDownloader:
212
236
  errors_file: Path,
213
237
  max_items: Optional[int],
214
238
  ) -> Dict[str, int]:
215
- try:
216
- from kaggle.api.kaggle_api_extended import KaggleApi
217
- except Exception as e:
218
- raise RuntimeError(f"kaggle package is required: {e}")
239
+ from kaggle.api.kaggle_api_extended import KaggleApi # validated in download_assets()
219
240
 
220
241
  await self._emit("start", {"source": "kaggle", "dataset": kaggle_ref})
221
242
 
@@ -353,24 +374,59 @@ class AssetDownloader:
353
374
  raise ValueError(f"Unsupported image value type: {type(value)}")
354
375
 
355
376
  async def _write_webdataset(self, dataset_dir: Path, images_dir: Path, metadata_file: Path) -> None:
356
- if wds is None:
357
- raise RuntimeError("webdataset package is required for webdataset output. Install with: pip install webdataset")
377
+ """Write a webdataset-compatible tar archive.
378
+
379
+ Uses Python's built-in tarfile module instead of wds.ShardWriter to
380
+ avoid the gopen() handler issue on Windows (backslash paths).
381
+ The resulting .tar files are fully compatible with webdataset readers.
382
+ """
383
+ import io
384
+ import tarfile as _tarfile
385
+
386
+ max_per_shard = 5000
387
+ shard_idx = 0
388
+ count_in_shard = 0
389
+ current_tar: _tarfile.TarFile | None = None
390
+
391
+ def _open_shard() -> _tarfile.TarFile:
392
+ nonlocal shard_idx
393
+ shard_path = dataset_dir / f"shard-{shard_idx:06d}.tar"
394
+ shard_idx += 1
395
+ return _tarfile.open(str(shard_path), "w")
358
396
 
359
- shard_pattern = str(dataset_dir / "shard-%06d.tar")
360
- with metadata_file.open("r", encoding="utf-8") as mf, wds.ShardWriter(shard_pattern, maxcount=5000) as sink:
361
- for line in mf:
362
- row = json.loads(line)
363
- image_path = Path(row["image_path"])
364
- if not image_path.exists():
365
- continue
366
- key = image_path.stem
367
- ext = image_path.suffix.lstrip(".") or "jpg"
368
- sample = {
369
- "__key__": key,
370
- ext: image_path.read_bytes(),
371
- "json": json.dumps(row, ensure_ascii=False).encode("utf-8"),
372
- }
373
- sink.write(sample)
397
+ try:
398
+ current_tar = _open_shard()
399
+
400
+ with metadata_file.open("r", encoding="utf-8") as mf:
401
+ for line in mf:
402
+ row = json.loads(line)
403
+ image_path = Path(row["image_path"])
404
+ if not image_path.exists():
405
+ continue
406
+
407
+ key = image_path.stem
408
+ ext = image_path.suffix.lstrip(".") or "jpg"
409
+
410
+ # Add image file
411
+ img_data = image_path.read_bytes()
412
+ img_info = _tarfile.TarInfo(name=f"{key}.{ext}")
413
+ img_info.size = len(img_data)
414
+ current_tar.addfile(img_info, io.BytesIO(img_data))
415
+
416
+ # Add JSON metadata sidecar
417
+ json_data = json.dumps(row, ensure_ascii=False).encode("utf-8")
418
+ json_info = _tarfile.TarInfo(name=f"{key}.json")
419
+ json_info.size = len(json_data)
420
+ current_tar.addfile(json_info, io.BytesIO(json_data))
421
+
422
+ count_in_shard += 1
423
+ if count_in_shard >= max_per_shard:
424
+ current_tar.close()
425
+ current_tar = _open_shard()
426
+ count_in_shard = 0
427
+ finally:
428
+ if current_tar is not None:
429
+ current_tar.close()
374
430
 
375
431
  async def _write_parquet(self, dataset_dir: Path, metadata_file: Path) -> None:
376
432
  try: