@vespermcp/mcp-server 1.2.5 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -131,8 +131,83 @@ function extractRequestedRows(query, requirements) {
131
131
  return Math.max(...allNums);
132
132
  return undefined;
133
133
  }
134
+ const verifiedPythonModules = new Set(); // Names of Python modules that ensurePythonModules() has already found importable or has just pip-installed; entries here skip the interpreter probe on later calls.
135
/**
 * Pick the executable name used to launch Python child processes.
 *
 * @returns {string} "py" when process.platform is "win32", "python" otherwise.
 */
function getPythonCommand() {
    if (process.platform === "win32") {
        return "py";
    }
    return "python";
}
138
/**
 * Run the Python launcher returned by getPythonCommand() with the given
 * arguments and collect everything it writes to stdout and stderr.
 *
 * The child inherits the current environment plus
 * PIP_DISABLE_PIP_VERSION_CHECK=1 and PYTHONUTF8=1.
 *
 * @param {string[]} args - Arguments passed to the Python executable.
 * @param {number} [timeoutMs=300000] - Milliseconds to wait before the child
 *   is killed and the promise is rejected.
 * @returns {Promise<{code: number, stdout: string, stderr: string}>} Resolves
 *   when the child closes. `code` is the exit code, or 1 when the child
 *   reported none (for example, it was terminated by a signal).
 * @throws {Error} Rejects when the process cannot be spawned or when the
 *   timeout elapses first.
 */
function runPythonProcess(args, timeoutMs = 300000) {
    const pyCmd = getPythonCommand();
    return new Promise((resolve, reject) => {
        const proc = spawn(pyCmd, args, {
            env: {
                ...process.env,
                PIP_DISABLE_PIP_VERSION_CHECK: "1",
                PYTHONUTF8: "1",
            },
        });
        // Let the streams decode UTF-8 statefully. Converting each Buffer
        // chunk on its own corrupts any multi-byte character that happens to
        // straddle a chunk boundary.
        proc.stdout.setEncoding("utf8");
        proc.stderr.setEncoding("utf8");
        let stdout = "";
        let stderr = "";
        proc.stdout.on("data", (chunk) => {
            stdout += chunk;
        });
        proc.stderr.on("data", (chunk) => {
            stderr += chunk;
        });
        const timer = setTimeout(() => {
            try {
                proc.kill();
            }
            catch {
                // Best effort: the timeout is reported below even if the kill fails.
            }
            reject(new Error(`Python command timed out after ${timeoutMs}ms: ${args.join(" ")}`));
        }, timeoutMs);
        // A promise settles only once, so a "close" that arrives after a
        // timeout or an "error" is a harmless no-op.
        proc.on("close", (code) => {
            clearTimeout(timer);
            resolve({ code: code ?? 1, stdout, stderr });
        });
        proc.on("error", (error) => {
            clearTimeout(timer);
            reject(error);
        });
    });
}
171
/**
 * Make sure every listed Python module can be found by the interpreter,
 * installing the corresponding pip packages when it cannot.
 *
 * Each module not yet recorded in verifiedPythonModules is probed with
 * importlib.util.find_spec in a short-lived interpreter. Packages for the
 * modules that are missing are installed with one `pip install` call, retried
 * once with `--user` if the first attempt exits non-zero. Modules are recorded
 * in verifiedPythonModules after a successful probe or a successful install.
 *
 * @param {Array<{module: string, packageName?: string}>} [modulePackagePairs]
 *   Import name / pip package name pairs. A null, undefined or empty list is a
 *   no-op. When `packageName` is omitted the module name is used as the
 *   package name.
 * @returns {Promise<void>}
 * @throws {Error} When both install attempts fail, or when a Python child
 *   process cannot be started or times out.
 */
async function ensurePythonModules(modulePackagePairs) {
    const missing = [];
    // Modules already handled in this call, so a repeated entry does not
    // spawn a second interpreter probe.
    const seenModules = new Set();
    for (const pair of modulePackagePairs ?? []) {
        if (seenModules.has(pair.module) || verifiedPythonModules.has(pair.module)) {
            continue;
        }
        seenModules.add(pair.module);
        // Exit code 0 means find_spec located the module; anything else
        // (including an exception inside the probe) counts as missing.
        const check = await runPythonProcess([
            "-c",
            `import importlib.util,sys; sys.exit(0 if importlib.util.find_spec(${JSON.stringify(pair.module)}) else 1)`
        ], 20000);
        if (check.code === 0) {
            verifiedPythonModules.add(pair.module);
        }
        else {
            missing.push(pair);
        }
    }
    if (missing.length === 0) {
        return;
    }
    // Fall back to the import name so a pair without packageName never turns
    // into `pip install undefined`.
    const packages = [...new Set(missing.map((m) => m.packageName ?? m.module))];
    console.error(`[Vesper] Installing missing Python packages: ${packages.join(", ")}`);
    const installArgs = ["-m", "pip", "install", "--disable-pip-version-check", ...packages];
    let install = await runPythonProcess(installArgs, 600000);
    if (install.code !== 0) {
        // A plain install can fail on a non-writable site-packages directory;
        // retry into the per-user site before giving up.
        console.error(`[Vesper] pip install failed (trying --user fallback): ${(install.stderr || "").slice(0, 300)}`);
        const userInstallArgs = ["-m", "pip", "install", "--disable-pip-version-check", "--user", ...packages];
        install = await runPythonProcess(userInstallArgs, 600000);
    }
    if (install.code !== 0) {
        const details = (install.stderr || install.stdout || "Unknown pip install error").trim();
        throw new Error(`Failed to install required Python packages (${packages.join(", ")}). ${details}`);
    }
    console.error(`[Vesper] Successfully installed: ${packages.join(", ")}`);
    for (const pair of missing) {
        verifiedPythonModules.add(pair.module);
    }
}
134
209
  function runPythonJson(scriptPath, args) {
135
- const pyCmd = process.platform === "win32" ? "py" : "python";
210
+ const pyCmd = getPythonCommand();
136
211
  return new Promise((resolve, reject) => {
137
212
  const proc = spawn(pyCmd, [scriptPath, ...args]);
138
213
  let stdout = "";
@@ -932,6 +1007,30 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
932
1007
  isError: true,
933
1008
  };
934
1009
  }
1010
+ const requiredModules = [
1011
+ { module: "aiohttp", packageName: "aiohttp" },
1012
+ ];
1013
+ if (source === "url") {
1014
+ requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
1015
+ }
1016
+ if (source === "huggingface") {
1017
+ requiredModules.push({ module: "datasets", packageName: "datasets" });
1018
+ }
1019
+ if (source === "kaggle") {
1020
+ requiredModules.push({ module: "kaggle", packageName: "kaggle" });
1021
+ }
1022
+ if (outputFormat === "webdataset") {
1023
+ requiredModules.push({ module: "webdataset", packageName: "webdataset" });
1024
+ }
1025
+ try {
1026
+ await ensurePythonModules(requiredModules);
1027
+ }
1028
+ catch (error) {
1029
+ return {
1030
+ content: [{ type: "text", text: `ERROR: Python dependency setup failed: ${error.message}` }],
1031
+ isError: true,
1032
+ };
1033
+ }
935
1034
  const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
936
1035
  const payload = {
937
1036
  dataset_id: datasetId,
@@ -101,27 +101,60 @@ class AssetDownloader:
101
101
  if source not in {"huggingface", "kaggle", "url"}:
102
102
  raise ValueError("source must be one of: huggingface, kaggle, url")
103
103
 
104
- dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
105
- images_dir = dataset_dir / "images"
106
- dataset_dir.mkdir(parents=True, exist_ok=True)
107
- images_dir.mkdir(parents=True, exist_ok=True)
108
-
109
- errors_file = dataset_dir / "errors.jsonl"
110
- metadata_file = dataset_dir / "metadata.jsonl"
111
-
104
+ # --- Validate imports and args BEFORE creating any directories ---
112
105
  if source == "huggingface":
113
106
  if not repo_id:
114
107
  raise ValueError("repo_id is required for source=huggingface")
115
- summary = await self._download_huggingface(repo_id, dataset_id, images_dir, metadata_file, errors_file, max_items, image_column)
108
+ try:
109
+ from datasets import load_dataset as _ld # noqa: F401
110
+ except Exception as e:
111
+ raise RuntimeError(
112
+ f"datasets package is required for HuggingFace downloads. "
113
+ f"Install with: pip install datasets. Details: {e}"
114
+ )
116
115
  elif source == "kaggle":
117
116
  ref = kaggle_ref or repo_id
118
117
  if not ref:
119
118
  raise ValueError("kaggle_ref is required for source=kaggle")
120
- summary = await self._download_kaggle(ref, dataset_id, images_dir, metadata_file, errors_file, max_items)
119
+ try:
120
+ from kaggle.api.kaggle_api_extended import KaggleApi as _Ka # noqa: F401
121
+ except Exception as e:
122
+ raise RuntimeError(
123
+ f"kaggle package is required for Kaggle downloads. "
124
+ f"Install with: pip install kaggle. Details: {e}"
125
+ )
121
126
  else:
122
127
  if not urls:
123
128
  raise ValueError("urls are required for source=url")
124
- summary = await self._download_urls(urls, dataset_id, images_dir, metadata_file, errors_file, max_items)
129
+
130
+ if output_format == "webdataset" and wds is None:
131
+ raise RuntimeError(
132
+ "webdataset package is required for webdataset output. "
133
+ "Install with: pip install webdataset"
134
+ )
135
+
136
+ # --- Now safe to create directories ---
137
+ dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
138
+ images_dir = dataset_dir / "images"
139
+ dataset_dir.mkdir(parents=True, exist_ok=True)
140
+ images_dir.mkdir(parents=True, exist_ok=True)
141
+
142
+ errors_file = dataset_dir / "errors.jsonl"
143
+ metadata_file = dataset_dir / "metadata.jsonl"
144
+
145
+ try:
146
+ if source == "huggingface":
147
+ summary = await self._download_huggingface(repo_id, dataset_id, images_dir, metadata_file, errors_file, max_items, image_column)
148
+ elif source == "kaggle":
149
+ ref = kaggle_ref or repo_id
150
+ summary = await self._download_kaggle(ref, dataset_id, images_dir, metadata_file, errors_file, max_items)
151
+ else:
152
+ summary = await self._download_urls(urls, dataset_id, images_dir, metadata_file, errors_file, max_items)
153
+ except Exception:
154
+ # Clean up empty directories on failure so we don't leave ghost artifacts
155
+ if images_dir.exists() and not any(images_dir.iterdir()):
156
+ shutil.rmtree(dataset_dir, ignore_errors=True)
157
+ raise
125
158
 
126
159
  if output_format == "webdataset":
127
160
  await self._write_webdataset(dataset_dir, images_dir, metadata_file)
@@ -150,10 +183,7 @@ class AssetDownloader:
150
183
  max_items: Optional[int],
151
184
  image_column: Optional[str],
152
185
  ) -> Dict[str, int]:
153
- try:
154
- from datasets import load_dataset
155
- except Exception as e:
156
- raise RuntimeError(f"datasets package is required. Install with: pip install datasets. Details: {e}")
186
+ from datasets import load_dataset # validated in download_assets()
157
187
 
158
188
  await self._emit("start", {"source": "huggingface", "repo_id": repo_id})
159
189
 
@@ -212,10 +242,7 @@ class AssetDownloader:
212
242
  errors_file: Path,
213
243
  max_items: Optional[int],
214
244
  ) -> Dict[str, int]:
215
- try:
216
- from kaggle.api.kaggle_api_extended import KaggleApi
217
- except Exception as e:
218
- raise RuntimeError(f"kaggle package is required: {e}")
245
+ from kaggle.api.kaggle_api_extended import KaggleApi # validated in download_assets()
219
246
 
220
247
  await self._emit("start", {"source": "kaggle", "dataset": kaggle_ref})
221
248
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vespermcp/mcp-server",
3
- "version": "1.2.5",
3
+ "version": "1.2.6",
4
4
  "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
5
5
  "type": "module",
6
6
  "main": "build/index.js",
@@ -23,7 +23,12 @@ const pythonPackages = [
23
23
  'pillow',
24
24
  'numpy',
25
25
  'librosa',
26
- 'soundfile'
26
+ 'soundfile',
27
+ 'aiohttp',
28
+ 'aiofiles',
29
+ 'datasets',
30
+ 'webdataset',
31
+ 'kaggle'
27
32
  ];
28
33
 
29
34
  try {
@@ -101,27 +101,60 @@ class AssetDownloader:
101
101
  if source not in {"huggingface", "kaggle", "url"}:
102
102
  raise ValueError("source must be one of: huggingface, kaggle, url")
103
103
 
104
- dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
105
- images_dir = dataset_dir / "images"
106
- dataset_dir.mkdir(parents=True, exist_ok=True)
107
- images_dir.mkdir(parents=True, exist_ok=True)
108
-
109
- errors_file = dataset_dir / "errors.jsonl"
110
- metadata_file = dataset_dir / "metadata.jsonl"
111
-
104
+ # --- Validate imports and args BEFORE creating any directories ---
112
105
  if source == "huggingface":
113
106
  if not repo_id:
114
107
  raise ValueError("repo_id is required for source=huggingface")
115
- summary = await self._download_huggingface(repo_id, dataset_id, images_dir, metadata_file, errors_file, max_items, image_column)
108
+ try:
109
+ from datasets import load_dataset as _ld # noqa: F401
110
+ except Exception as e:
111
+ raise RuntimeError(
112
+ f"datasets package is required for HuggingFace downloads. "
113
+ f"Install with: pip install datasets. Details: {e}"
114
+ )
116
115
  elif source == "kaggle":
117
116
  ref = kaggle_ref or repo_id
118
117
  if not ref:
119
118
  raise ValueError("kaggle_ref is required for source=kaggle")
120
- summary = await self._download_kaggle(ref, dataset_id, images_dir, metadata_file, errors_file, max_items)
119
+ try:
120
+ from kaggle.api.kaggle_api_extended import KaggleApi as _Ka # noqa: F401
121
+ except Exception as e:
122
+ raise RuntimeError(
123
+ f"kaggle package is required for Kaggle downloads. "
124
+ f"Install with: pip install kaggle. Details: {e}"
125
+ )
121
126
  else:
122
127
  if not urls:
123
128
  raise ValueError("urls are required for source=url")
124
- summary = await self._download_urls(urls, dataset_id, images_dir, metadata_file, errors_file, max_items)
129
+
130
+ if output_format == "webdataset" and wds is None:
131
+ raise RuntimeError(
132
+ "webdataset package is required for webdataset output. "
133
+ "Install with: pip install webdataset"
134
+ )
135
+
136
+ # --- Now safe to create directories ---
137
+ dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
138
+ images_dir = dataset_dir / "images"
139
+ dataset_dir.mkdir(parents=True, exist_ok=True)
140
+ images_dir.mkdir(parents=True, exist_ok=True)
141
+
142
+ errors_file = dataset_dir / "errors.jsonl"
143
+ metadata_file = dataset_dir / "metadata.jsonl"
144
+
145
+ try:
146
+ if source == "huggingface":
147
+ summary = await self._download_huggingface(repo_id, dataset_id, images_dir, metadata_file, errors_file, max_items, image_column)
148
+ elif source == "kaggle":
149
+ ref = kaggle_ref or repo_id
150
+ summary = await self._download_kaggle(ref, dataset_id, images_dir, metadata_file, errors_file, max_items)
151
+ else:
152
+ summary = await self._download_urls(urls, dataset_id, images_dir, metadata_file, errors_file, max_items)
153
+ except Exception:
154
+ # Clean up empty directories on failure so we don't leave ghost artifacts
155
+ if images_dir.exists() and not any(images_dir.iterdir()):
156
+ shutil.rmtree(dataset_dir, ignore_errors=True)
157
+ raise
125
158
 
126
159
  if output_format == "webdataset":
127
160
  await self._write_webdataset(dataset_dir, images_dir, metadata_file)
@@ -150,10 +183,7 @@ class AssetDownloader:
150
183
  max_items: Optional[int],
151
184
  image_column: Optional[str],
152
185
  ) -> Dict[str, int]:
153
- try:
154
- from datasets import load_dataset
155
- except Exception as e:
156
- raise RuntimeError(f"datasets package is required. Install with: pip install datasets. Details: {e}")
186
+ from datasets import load_dataset # validated in download_assets()
157
187
 
158
188
  await self._emit("start", {"source": "huggingface", "repo_id": repo_id})
159
189
 
@@ -212,10 +242,7 @@ class AssetDownloader:
212
242
  errors_file: Path,
213
243
  max_items: Optional[int],
214
244
  ) -> Dict[str, int]:
215
- try:
216
- from kaggle.api.kaggle_api_extended import KaggleApi
217
- except Exception as e:
218
- raise RuntimeError(f"kaggle package is required: {e}")
245
+ from kaggle.api.kaggle_api_extended import KaggleApi # validated in download_assets()
219
246
 
220
247
  await self._emit("start", {"source": "kaggle", "dataset": kaggle_ref})
221
248