@vespermcp/mcp-server 1.2.19 → 1.2.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -453,6 +453,17 @@ jobManager.on("processJob", async (job, execute) => {
453
453
  async function handlePrepareJob(jobId, query, requirements) {
454
454
  hydrateExternalKeys();
455
455
  const update = (updates) => jobManager.updateJob(jobId, updates);
456
+ // Ensure core Python packages are available for dataset operations
457
+ try {
458
+ await ensurePythonModules([
459
+ { module: "polars", packageName: "polars" },
460
+ { module: "datasets", packageName: "datasets" },
461
+ ]);
462
+ }
463
+ catch (e) {
464
+ console.error(`[Prepare] Python dependency setup warning: ${e.message}`);
465
+ // Continue anyway - direct file downloads may still work without datasets lib
466
+ }
456
467
  const requestedRows = extractRequestedRows(query, requirements);
457
468
  let selectedDataset;
458
469
  let datasetIdForDownload = "";
@@ -1225,6 +1236,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1225
1236
  isError: true,
1226
1237
  };
1227
1238
  }
1239
+ // Pre-install Python datasets library for HuggingFace fallback
1240
+ if (source === "huggingface") {
1241
+ try {
1242
+ await ensurePythonModules([
1243
+ { module: "datasets", packageName: "datasets" },
1244
+ ]);
1245
+ }
1246
+ catch {
1247
+ // Continue - direct download may still work
1248
+ }
1249
+ }
1228
1250
  try {
1229
1251
  const localPath = await dataIngestor.ensureData(datasetId, source, () => undefined);
1230
1252
  try {
@@ -1,5 +1,6 @@
1
1
  import path from "path";
2
2
  import fs from "fs";
3
+ import { spawn } from "child_process";
3
4
  import { HFDownloader } from "./hf-downloader.js";
4
5
  import { KaggleSource } from "../metadata/kaggle-source.js";
5
6
  import { OpenMLSource } from "../metadata/openml-source.js";
@@ -63,25 +64,42 @@ export class DataIngestor {
63
64
  if (source === "huggingface") {
64
65
  onProgress?.("Discovering data files on HuggingFace Hub...");
65
66
  const remotePath = await this.hfDownloader.findBestFile(datasetId);
66
- if (!remotePath)
67
- throw new Error(`No suitable data files found in HuggingFace repo: ${datasetId}`);
68
- const ext = path.extname(remotePath).substring(1).toLowerCase() || "csv";
69
- const targetPath = this.getTargetPath(datasetId, ext);
70
- this.store.registerDownload(datasetId, targetPath, "downloading");
71
- try {
72
- await this.hfDownloader.download(datasetId, remotePath, targetPath, (progress) => {
73
- onProgress?.(`Downloading ${path.basename(remotePath)}...`, progress);
74
- });
75
- const resolvedPath = await this.hfDownloader.resolveExternalDataFromMetadata(targetPath, (progress) => {
76
- onProgress?.("Resolving external dataset file...", progress);
77
- });
78
- const stats = fs.statSync(resolvedPath);
79
- this.completeDownload(datasetId, resolvedPath, stats.size);
80
- return resolvedPath;
67
+ if (remotePath) {
68
+ // Direct file download path (repo has raw data files)
69
+ const ext = path.extname(remotePath).substring(1).toLowerCase() || "csv";
70
+ const targetPath = this.getTargetPath(datasetId, ext);
71
+ this.store.registerDownload(datasetId, targetPath, "downloading");
72
+ try {
73
+ await this.hfDownloader.download(datasetId, remotePath, targetPath, (progress) => {
74
+ onProgress?.(`Downloading ${path.basename(remotePath)}...`, progress);
75
+ });
76
+ const resolvedPath = await this.hfDownloader.resolveExternalDataFromMetadata(targetPath, (progress) => {
77
+ onProgress?.("Resolving external dataset file...", progress);
78
+ });
79
+ const stats = fs.statSync(resolvedPath);
80
+ this.completeDownload(datasetId, resolvedPath, stats.size);
81
+ return resolvedPath;
82
+ }
83
+ catch (e) {
84
+ this.failDownload(datasetId, e.message);
85
+ throw e;
86
+ }
81
87
  }
82
- catch (e) {
83
- this.failDownload(datasetId, e.message);
84
- throw e;
88
+ else {
89
+ // Fallback: Use Python datasets library to download and convert
90
+ onProgress?.("No raw files found. Using HuggingFace datasets library to download...");
91
+ const targetPath = this.getTargetPath(datasetId, "parquet");
92
+ this.store.registerDownload(datasetId, targetPath, "downloading");
93
+ try {
94
+ const result = await this.hfDatasetsFallback(datasetId, targetPath, onProgress);
95
+ const stats = fs.statSync(result);
96
+ this.completeDownload(datasetId, result, stats.size);
97
+ return result;
98
+ }
99
+ catch (e) {
100
+ this.failDownload(datasetId, e.message);
101
+ throw e;
102
+ }
85
103
  }
86
104
  }
87
105
  else if (source === "kaggle") {
@@ -159,4 +177,85 @@ export class DataIngestor {
159
177
  const safeId = datasetId.replace(/\//g, "_").replace(/:/g, "_");
160
178
  return path.join(this.rawDataDir, `${safeId}.${extension}`);
161
179
  }
180
+ /**
181
+ * Fallback: Use Python `datasets` library to download a HuggingFace dataset
182
+ * when no raw data files are found in the repo file listing.
183
+ */
184
+ async hfDatasetsFallback(datasetId, targetPath, onProgress) {
185
+ const pyCmd = process.platform === "win32" ? "py" : "python";
186
+ // Resolve the fallback script path
187
+ const homeDir = process.env.HOME || process.env.USERPROFILE || this.projectRoot;
188
+ const dataRoot = path.join(homeDir, ".vesper");
189
+ const scriptCandidates = [
190
+ path.resolve(dataRoot, "python", "hf_fallback.py"),
191
+ path.resolve(this.projectRoot, "python", "hf_fallback.py"),
192
+ path.resolve(this.projectRoot, "..", "src", "python", "hf_fallback.py"),
193
+ path.resolve(this.projectRoot, "..", "python", "hf_fallback.py"),
194
+ ];
195
+ let scriptPath = scriptCandidates.find(p => fs.existsSync(p));
196
+ if (!scriptPath) {
197
+ scriptPath = scriptCandidates[0]; // Will fail with a clear error
198
+ }
199
+ const token = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN || undefined;
200
+ const payload = {
201
+ repo_id: datasetId,
202
+ output_path: targetPath,
203
+ token: token || null,
204
+ max_rows: 500000,
205
+ };
206
+ onProgress?.("Downloading via datasets library (this may take a moment)...", 30);
207
+ return new Promise((resolve, reject) => {
208
+ const proc = spawn(pyCmd, [scriptPath, JSON.stringify(payload)], {
209
+ env: {
210
+ ...process.env,
211
+ PYTHONUTF8: "1",
212
+ PIP_DISABLE_PIP_VERSION_CHECK: "1",
213
+ },
214
+ });
215
+ let stdout = "";
216
+ let stderr = "";
217
+ proc.stdout.on("data", (d) => (stdout += d.toString()));
218
+ proc.stderr.on("data", (d) => {
219
+ const msg = d.toString();
220
+ stderr += msg;
221
+ // Forward progress info
222
+ if (msg.includes("Downloading") || msg.includes("Loading")) {
223
+ onProgress?.(msg.trim().split("\n").pop() || "Downloading...", 50);
224
+ }
225
+ });
226
+ const timer = setTimeout(() => {
227
+ try {
228
+ proc.kill();
229
+ }
230
+ catch { /* no-op */ }
231
+ reject(new Error(`HuggingFace datasets download timed out after 10 minutes for ${datasetId}`));
232
+ }, 600000); // 10 min timeout
233
+ proc.on("close", (code) => {
234
+ clearTimeout(timer);
235
+ if (code !== 0) {
236
+ let errorMsg = stderr || stdout || `Python exited with code ${code}`;
237
+ try {
238
+ const parsed = JSON.parse(stdout);
239
+ if (parsed.error)
240
+ errorMsg = parsed.error;
241
+ }
242
+ catch { /* use stderr */ }
243
+ reject(new Error(`HuggingFace datasets fallback failed: ${errorMsg}`));
244
+ return;
245
+ }
246
+ try {
247
+ const result = JSON.parse(stdout);
248
+ if (!result.ok) {
249
+ reject(new Error(result.error || "Unknown error from HF fallback"));
250
+ return;
251
+ }
252
+ onProgress?.(`Downloaded ${result.rows?.toLocaleString() || "?"} rows (${result.columns?.length || "?"} columns)`, 90);
253
+ resolve(result.path);
254
+ }
255
+ catch {
256
+ reject(new Error(`Failed to parse HF fallback output: ${stdout}`));
257
+ }
258
+ });
259
+ });
260
+ }
162
261
  }
@@ -0,0 +1,147 @@
1
+ """
2
+ HuggingFace Datasets Library Fallback Downloader.
3
+
4
+ Used when the HF Hub file listing finds no suitable data files
5
+ (e.g. script-based datasets, gated datasets, datasets that use
6
+ the `datasets` library format).
7
+
8
+ Usage:
9
+ python hf_fallback.py '{"repo_id": "user/dataset", "output_path": "/path/to/output.parquet", "token": "optional", "max_rows": 100000, "split": "train"}'
10
+
11
+ Output: JSON to stdout
12
+ {"ok": true, "path": "/path/to/output.parquet", "rows": 12345, "columns": ["col1", "col2"]}
13
+ {"ok": false, "error": "..."}
14
+ """
15
+ import sys
16
+ import json
17
+ import os
18
+
19
def _fail(message):
    """Print ``{"ok": false, "error": message}`` as JSON on stdout and exit with status 1."""
    print(json.dumps({"ok": False, "error": message}))
    sys.exit(1)


def _load_any_split(load_dataset, repo_id, token, split):
    """Try candidate splits in order and return ``(dataset, used_split, last_error)``.

    When ``split`` is given only that split is tried; otherwise "train",
    "test", "validation" and finally no split at all are attempted.
    ``dataset`` is ``None`` when every candidate failed, in which case
    ``last_error`` holds the most recent exception that was skipped.
    Exceptions that do not look split-related are re-raised.
    """
    splits_to_try = [split] if split else ["train", "test", "validation", None]
    last_error = None

    for s in splits_to_try:
        kwargs = {
            "path": repo_id,
            # SECURITY NOTE(review): trust_remote_code=True lets the dataset
            # repository run its own loading script on this machine. It is kept
            # because script-based datasets are a stated use case of this
            # fallback, but repo_id should only come from a trusted selection.
            "trust_remote_code": True,
        }
        if token:
            kwargs["token"] = token
        if s:
            kwargs["split"] = s

        try:
            return load_dataset(**kwargs), s, None
        except (ValueError, KeyError) as e:
            # Most likely the split doesn't exist; remember why and try the next.
            last_error = e
            continue
        except Exception as e:
            if "split" in str(e).lower() or "key" in str(e).lower():
                last_error = e
                continue
            raise

    return None, None, last_error


def main():
    """Download a HuggingFace dataset with the `datasets` library and export it.

    Reads a JSON object from ``sys.argv[1]`` with keys ``repo_id``,
    ``output_path`` and optionally ``token``, ``max_rows`` and ``split``.
    Always reports the outcome as a single JSON object on stdout and exits
    with status 1 on any failure.
    """
    if len(sys.argv) < 2:
        _fail("Missing payload argument")

    try:
        payload = json.loads(sys.argv[1])
    except json.JSONDecodeError as e:
        _fail(f"Invalid JSON payload: {e}")

    # Valid JSON that is not an object (e.g. a list) has no .get(); reject it
    # through the JSON error contract instead of crashing with a traceback.
    if not isinstance(payload, dict):
        _fail("Invalid JSON payload: expected a JSON object")

    # `or ""` also covers explicit nulls, which would otherwise break .strip().
    repo_id = str(payload.get("repo_id") or "").strip()
    output_path = str(payload.get("output_path") or "").strip()
    token = payload.get("token") or os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or None
    max_rows = payload.get("max_rows", 500000)
    split = payload.get("split")  # None = auto-detect

    if not repo_id:
        _fail("repo_id is required")

    if not output_path:
        _fail("output_path is required")

    try:
        from datasets import load_dataset, DatasetDict
    except ImportError:
        _fail("Python 'datasets' library not installed. Install with: pip install datasets")

    try:
        # Load the dataset (non-streaming). If split is not specified, try common ones.
        ds, used_split, last_error = _load_any_split(load_dataset, repo_id, token, split)

        if ds is None:
            detail = f" Last error: {last_error}" if last_error else ""
            _fail(f"Could not load dataset '{repo_id}'. No valid splits found.{detail}")

        # Handle DatasetDict (when no split specified)
        if isinstance(ds, DatasetDict):
            # Pick the best split
            for preferred in ["train", "test", "validation"]:
                if preferred in ds:
                    ds = ds[preferred]
                    used_split = preferred
                    break
            else:
                # Just pick the first available split
                first_key = list(ds.keys())[0]
                ds = ds[first_key]
                used_split = first_key

        # Limit rows if needed
        total_rows = len(ds)
        if max_rows and total_rows > max_rows:
            ds = ds.select(range(max_rows))
            total_rows = max_rows

        # Ensure output directory exists. A bare filename has an empty dirname,
        # and os.makedirs("") raises, so only create it when there is one.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        columns = ds.column_names

        # Export: CSV when explicitly requested by extension, parquet otherwise
        # (appending ".parquet" when the path has some other extension).
        if output_path.endswith(".csv"):
            ds.to_csv(output_path)
        else:
            if not output_path.endswith(".parquet"):
                output_path = output_path + ".parquet"
            ds.to_parquet(output_path)

        print(json.dumps({
            "ok": True,
            "path": output_path,
            "rows": total_rows,
            "columns": columns,
            "split": used_split
        }))

    except Exception as e:
        error_msg = str(e)
        # Provide helpful hints
        if "401" in error_msg or "403" in error_msg or "gated" in error_msg.lower():
            error_msg += " (This dataset may be gated/private. Set HF_TOKEN via configure_keys tool.)"
        elif "FileNotFoundError" in error_msg or "does not exist" in error_msg.lower():
            error_msg = f"Dataset '{repo_id}' not found on HuggingFace. Check the dataset ID."

        _fail(error_msg)
145
+
146
+ if __name__ == "__main__":
147
+ main()
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vespermcp/mcp-server",
3
- "version": "1.2.19",
3
+ "version": "1.2.20",
4
4
  "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
5
5
  "type": "module",
6
6
  "main": "build/index.js",
@@ -0,0 +1,147 @@
1
+ """
2
+ HuggingFace Datasets Library Fallback Downloader.
3
+
4
+ Used when the HF Hub file listing finds no suitable data files
5
+ (e.g. script-based datasets, gated datasets, datasets that use
6
+ the `datasets` library format).
7
+
8
+ Usage:
9
+ python hf_fallback.py '{"repo_id": "user/dataset", "output_path": "/path/to/output.parquet", "token": "optional", "max_rows": 100000, "split": "train"}'
10
+
11
+ Output: JSON to stdout
12
+ {"ok": true, "path": "/path/to/output.parquet", "rows": 12345, "columns": ["col1", "col2"]}
13
+ {"ok": false, "error": "..."}
14
+ """
15
+ import sys
16
+ import json
17
+ import os
18
+
19
def _fail(message):
    """Print ``{"ok": false, "error": message}`` as JSON on stdout and exit with status 1."""
    print(json.dumps({"ok": False, "error": message}))
    sys.exit(1)


def _load_any_split(load_dataset, repo_id, token, split):
    """Try candidate splits in order and return ``(dataset, used_split, last_error)``.

    When ``split`` is given only that split is tried; otherwise "train",
    "test", "validation" and finally no split at all are attempted.
    ``dataset`` is ``None`` when every candidate failed, in which case
    ``last_error`` holds the most recent exception that was skipped.
    Exceptions that do not look split-related are re-raised.
    """
    splits_to_try = [split] if split else ["train", "test", "validation", None]
    last_error = None

    for s in splits_to_try:
        kwargs = {
            "path": repo_id,
            # SECURITY NOTE(review): trust_remote_code=True lets the dataset
            # repository run its own loading script on this machine. It is kept
            # because script-based datasets are a stated use case of this
            # fallback, but repo_id should only come from a trusted selection.
            "trust_remote_code": True,
        }
        if token:
            kwargs["token"] = token
        if s:
            kwargs["split"] = s

        try:
            return load_dataset(**kwargs), s, None
        except (ValueError, KeyError) as e:
            # Most likely the split doesn't exist; remember why and try the next.
            last_error = e
            continue
        except Exception as e:
            if "split" in str(e).lower() or "key" in str(e).lower():
                last_error = e
                continue
            raise

    return None, None, last_error


def main():
    """Download a HuggingFace dataset with the `datasets` library and export it.

    Reads a JSON object from ``sys.argv[1]`` with keys ``repo_id``,
    ``output_path`` and optionally ``token``, ``max_rows`` and ``split``.
    Always reports the outcome as a single JSON object on stdout and exits
    with status 1 on any failure.
    """
    if len(sys.argv) < 2:
        _fail("Missing payload argument")

    try:
        payload = json.loads(sys.argv[1])
    except json.JSONDecodeError as e:
        _fail(f"Invalid JSON payload: {e}")

    # Valid JSON that is not an object (e.g. a list) has no .get(); reject it
    # through the JSON error contract instead of crashing with a traceback.
    if not isinstance(payload, dict):
        _fail("Invalid JSON payload: expected a JSON object")

    # `or ""` also covers explicit nulls, which would otherwise break .strip().
    repo_id = str(payload.get("repo_id") or "").strip()
    output_path = str(payload.get("output_path") or "").strip()
    token = payload.get("token") or os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or None
    max_rows = payload.get("max_rows", 500000)
    split = payload.get("split")  # None = auto-detect

    if not repo_id:
        _fail("repo_id is required")

    if not output_path:
        _fail("output_path is required")

    try:
        from datasets import load_dataset, DatasetDict
    except ImportError:
        _fail("Python 'datasets' library not installed. Install with: pip install datasets")

    try:
        # Load the dataset (non-streaming). If split is not specified, try common ones.
        ds, used_split, last_error = _load_any_split(load_dataset, repo_id, token, split)

        if ds is None:
            detail = f" Last error: {last_error}" if last_error else ""
            _fail(f"Could not load dataset '{repo_id}'. No valid splits found.{detail}")

        # Handle DatasetDict (when no split specified)
        if isinstance(ds, DatasetDict):
            # Pick the best split
            for preferred in ["train", "test", "validation"]:
                if preferred in ds:
                    ds = ds[preferred]
                    used_split = preferred
                    break
            else:
                # Just pick the first available split
                first_key = list(ds.keys())[0]
                ds = ds[first_key]
                used_split = first_key

        # Limit rows if needed
        total_rows = len(ds)
        if max_rows and total_rows > max_rows:
            ds = ds.select(range(max_rows))
            total_rows = max_rows

        # Ensure output directory exists. A bare filename has an empty dirname,
        # and os.makedirs("") raises, so only create it when there is one.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        columns = ds.column_names

        # Export: CSV when explicitly requested by extension, parquet otherwise
        # (appending ".parquet" when the path has some other extension).
        if output_path.endswith(".csv"):
            ds.to_csv(output_path)
        else:
            if not output_path.endswith(".parquet"):
                output_path = output_path + ".parquet"
            ds.to_parquet(output_path)

        print(json.dumps({
            "ok": True,
            "path": output_path,
            "rows": total_rows,
            "columns": columns,
            "split": used_split
        }))

    except Exception as e:
        error_msg = str(e)
        # Provide helpful hints
        if "401" in error_msg or "403" in error_msg or "gated" in error_msg.lower():
            error_msg += " (This dataset may be gated/private. Set HF_TOKEN via configure_keys tool.)"
        elif "FileNotFoundError" in error_msg or "does not exist" in error_msg.lower():
            error_msg = f"Dataset '{repo_id}' not found on HuggingFace. Check the dataset ID."

        _fail(error_msg)
145
+
146
+ if __name__ == "__main__":
147
+ main()