@vespermcp/mcp-server 1.2.2 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -12,6 +12,8 @@ import { Embedder } from "./search/embedder.js";
12
12
  import { SearchEngine } from "./search/engine.js";
13
13
  import { HuggingFaceScraper } from "./metadata/scraper.js";
14
14
  import { KaggleSource } from "./metadata/kaggle-source.js";
15
+ import { OpenMLSource } from "./metadata/openml-source.js";
16
+ import { DataWorldSource } from "./metadata/dataworld-source.js";
15
17
  import { formatSearchResults, formatDatasetInfo, formatJobStatus } from "./tools/formatter.js";
16
18
  import { JobManager } from "./jobs/manager.js";
17
19
  import { QualityAnalyzer } from "./quality/analyzer.js";
@@ -228,6 +230,12 @@ function hydrateExternalKeys() {
228
230
  if (!process.env.KAGGLE_KEY && keys.kaggle_key) {
229
231
  process.env.KAGGLE_KEY = String(keys.kaggle_key);
230
232
  }
233
+ if (!process.env.DW_AUTH_TOKEN && keys.dataworld_token) {
234
+ process.env.DW_AUTH_TOKEN = String(keys.dataworld_token);
235
+ }
236
+ }
237
+ function hasDataWorldToken() {
238
+ return !!(process.env.DW_AUTH_TOKEN || secureKeys.getAll().dataworld_token);
231
239
  }
232
240
  // CRITICAL FIX: Pass __dirname (build directory) to analyzers
233
241
  // Python scripts are in build/python/, so analyzers should look relative to build/
@@ -429,7 +437,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
429
437
  },
430
438
  source: {
431
439
  type: "string",
432
- enum: ["huggingface", "kaggle"],
440
+ enum: ["huggingface", "kaggle", "openml", "dataworld"],
433
441
  description: "Data source to discover from.",
434
442
  },
435
443
  limit: {
@@ -448,7 +456,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
448
456
  properties: {
449
457
  source: {
450
458
  type: "string",
451
- enum: ["huggingface", "kaggle"],
459
+ enum: ["huggingface", "kaggle", "openml", "dataworld"],
452
460
  description: "Dataset source.",
453
461
  },
454
462
  dataset_id: {
@@ -477,13 +485,14 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
477
485
  },
478
486
  {
479
487
  name: "configure_keys",
480
- description: "One-time optional key setup for external sources (Kaggle + gated HF). Core tools do not require keys.",
488
+ description: "One-time optional key setup for external sources (Kaggle, data.world, gated HF). Core tools do not require keys.",
481
489
  inputSchema: {
482
490
  type: "object",
483
491
  properties: {
484
492
  hf_token: { type: "string", description: "Optional Hugging Face token for gated/private datasets" },
485
493
  kaggle_username: { type: "string", description: "Optional Kaggle username" },
486
- kaggle_key: { type: "string", description: "Optional Kaggle API key" }
494
+ kaggle_key: { type: "string", description: "Optional Kaggle API key" },
495
+ dataworld_token: { type: "string", description: "Optional data.world API token" }
487
496
  },
488
497
  },
489
498
  },
@@ -495,7 +504,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
495
504
  properties: {
496
505
  dataset_id: {
497
506
  type: "string",
498
- description: "The unique dataset ID (e.g., 'user/dataset_name' for HuggingFace or 'kaggle:username/dataset' for Kaggle)",
507
+ description: "The unique dataset ID (e.g., 'user/dataset_name' for HuggingFace, 'kaggle:username/dataset' for Kaggle, 'openml:1234' for OpenML, or 'dataworld:owner/id' for data.world)",
499
508
  },
500
509
  },
501
510
  required: ["dataset_id"],
@@ -786,6 +795,20 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
786
795
  }
787
796
  results = await kaggleSource.discover(query, limit);
788
797
  }
798
+ else if (source === "openml") {
799
+ const openmlSource = new OpenMLSource();
800
+ results = await openmlSource.discover(query, limit);
801
+ }
802
+ else if (source === "dataworld") {
803
+ if (!hasDataWorldToken()) {
804
+ return {
805
+ content: [{ type: "text", text: "data.world requires API token. Run 'vespermcp config keys' and set dataworld_token." }],
806
+ isError: true,
807
+ };
808
+ }
809
+ const dataworldSource = new DataWorldSource();
810
+ results = await dataworldSource.discover(query, limit);
811
+ }
789
812
  else {
790
813
  const hf = new HuggingFaceScraper();
791
814
  results = await hf.scrape(Math.max(1, limit), true, query);
@@ -815,6 +838,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
815
838
  isError: true,
816
839
  };
817
840
  }
841
+ if (source === "dataworld" && !hasDataWorldToken()) {
842
+ return {
843
+ content: [{ type: "text", text: "data.world requires API token. Run 'vespermcp config keys' and set dataworld_token." }],
844
+ isError: true,
845
+ };
846
+ }
818
847
  try {
819
848
  const localPath = await dataIngestor.ensureData(datasetId, source, () => undefined);
820
849
  return {
@@ -846,6 +875,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
846
875
  const hfToken = String(request.params.arguments?.hf_token || "").trim();
847
876
  const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
848
877
  const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
878
+ const dataworldToken = String(request.params.arguments?.dataworld_token || "").trim();
849
879
  const saved = [];
850
880
  const methods = [];
851
881
  if (hfToken) {
@@ -875,6 +905,15 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
875
905
  methods.push(r.method);
876
906
  }
877
907
  }
908
+ if (dataworldToken) {
909
+ const r = secureKeys.set("dataworld_token", dataworldToken);
910
+ if (r.ok) {
911
+ process.env.DW_AUTH_TOKEN = dataworldToken;
912
+ saved.push("data.world token");
913
+ if (r.method)
914
+ methods.push(r.method);
915
+ }
916
+ }
878
917
  if (saved.length === 0) {
879
918
  return {
880
919
  content: [{ type: "text", text: "No keys provided. Core Vesper tools continue to work without API keys." }]
@@ -1402,6 +1441,7 @@ async function runConfigCli(args) {
1402
1441
  const hfToken = (await ask(`Hugging Face token [${current.hf_token ? "saved" : "empty"}]: `)).trim();
1403
1442
  const kaggleUsername = (await ask(`Kaggle username [${current.kaggle_username ? "saved" : "empty"}]: `)).trim();
1404
1443
  const kaggleKey = (await ask(`Kaggle key [${current.kaggle_key ? "saved" : "empty"}]: `)).trim();
1444
+ const dataworldToken = (await ask(`data.world token [${current.dataworld_token ? "saved" : "empty"}]: `)).trim();
1405
1445
  rl.close();
1406
1446
  const saved = [];
1407
1447
  if (hfToken) {
@@ -1425,12 +1465,19 @@ async function runConfigCli(args) {
1425
1465
  saved.push("Kaggle key");
1426
1466
  }
1427
1467
  }
1468
+ if (dataworldToken) {
1469
+ const res = secureKeys.set("dataworld_token", dataworldToken);
1470
+ if (res.ok) {
1471
+ process.env.DW_AUTH_TOKEN = dataworldToken;
1472
+ saved.push("data.world token");
1473
+ }
1474
+ }
1428
1475
  if (saved.length === 0) {
1429
1476
  console.log("No new keys saved (all skipped). Core tools continue to work without keys.");
1430
1477
  return;
1431
1478
  }
1432
1479
  console.log(`Key(s) saved securely: ${saved.join(", ")}`);
1433
- console.log("You can now use Kaggle and gated Hugging Face datasets.");
1480
+ console.log("You can now use Kaggle, data.world, and gated Hugging Face datasets.");
1434
1481
  return;
1435
1482
  }
1436
1483
  // Backward-compatible Kaggle-specific path
@@ -1479,7 +1526,7 @@ async function runDiscoverCli(args) {
1479
1526
  }
1480
1527
  const query = queryParts.join(" ").trim();
1481
1528
  if (!query) {
1482
- console.error("Usage: vespermcp discover --source kaggle \"credit risk\" --limit 10");
1529
+ console.error("Usage: vespermcp discover --source <huggingface|kaggle|openml|dataworld> \"credit risk\" --limit 10");
1483
1530
  process.exit(1);
1484
1531
  }
1485
1532
  if (source === "kaggle") {
@@ -1512,6 +1559,34 @@ async function runDiscoverCli(args) {
1512
1559
  }
1513
1560
  return;
1514
1561
  }
1562
+ else if (source === "openml") {
1563
+ try {
1564
+ const openmlSource = new OpenMLSource();
1565
+ const results = await openmlSource.discover(query, limit);
1566
+ console.log(formatSearchResults(results));
1567
+ }
1568
+ catch (error) {
1569
+ console.error(`OpenML discover failed: ${error.message || error}`);
1570
+ process.exit(1);
1571
+ }
1572
+ return;
1573
+ }
1574
+ else if (source === "dataworld") {
1575
+ if (!hasDataWorldToken()) {
1576
+ console.error("data.world requires API token. Run 'vespermcp config keys' and set dataworld_token.");
1577
+ process.exit(1);
1578
+ }
1579
+ try {
1580
+ const dataworldSource = new DataWorldSource();
1581
+ const results = await dataworldSource.discover(query, limit);
1582
+ console.log(formatSearchResults(results));
1583
+ }
1584
+ catch (error) {
1585
+ console.error(`data.world discover failed: ${error.message || error}`);
1586
+ process.exit(1);
1587
+ }
1588
+ return;
1589
+ }
1515
1590
  const hf = new HuggingFaceScraper();
1516
1591
  const results = await hf.scrape(limit, true, query);
1517
1592
  console.log(formatSearchResults(results));
@@ -1530,7 +1605,7 @@ async function runDownloadCli(args) {
1530
1605
  const source = (nonFlags[1] || "").toLowerCase();
1531
1606
  const datasetId = nonFlags[2] || "";
1532
1607
  if (!source || !datasetId) {
1533
- console.error("Usage: vespermcp download kaggle <username/dataset-name> [--target-dir C:/path]");
1608
+ console.error("Usage: vespermcp download <huggingface|kaggle|openml|dataworld> <dataset-id> [--target-dir C:/path]");
1534
1609
  process.exit(1);
1535
1610
  }
1536
1611
  if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
@@ -1546,6 +1621,10 @@ async function runDownloadCli(args) {
1546
1621
  if (!dataIngestor.hasKaggleCredentials())
1547
1622
  process.exit(1);
1548
1623
  }
1624
+ if (source === "dataworld" && !hasDataWorldToken()) {
1625
+ console.error("data.world requires API token. Run 'vespermcp config keys' and set dataworld_token.");
1626
+ process.exit(1);
1627
+ }
1549
1628
  let localPath = "";
1550
1629
  try {
1551
1630
  if (source === "kaggle" && targetDir) {
@@ -2,6 +2,8 @@ import path from "path";
2
2
  import fs from "fs";
3
3
  import { HFDownloader } from "./hf-downloader.js";
4
4
  import { KaggleSource } from "../metadata/kaggle-source.js";
5
+ import { OpenMLSource } from "../metadata/openml-source.js";
6
+ import { DataWorldSource } from "../metadata/dataworld-source.js";
5
7
  import { SecureKeysManager } from "../config/secure-keys.js";
6
8
  export class DataIngestor {
7
9
  projectRoot;
@@ -9,6 +11,8 @@ export class DataIngestor {
9
11
  rawDataDir;
10
12
  hfDownloader;
11
13
  kaggleSource;
14
+ openmlSource;
15
+ dataworldSource;
12
16
  secureKeys;
13
17
  constructor(projectRoot, store) {
14
18
  this.projectRoot = projectRoot;
@@ -19,6 +23,8 @@ export class DataIngestor {
19
23
  }
20
24
  this.hfDownloader = new HFDownloader();
21
25
  this.kaggleSource = new KaggleSource();
26
+ this.openmlSource = new OpenMLSource();
27
+ this.dataworldSource = new DataWorldSource();
22
28
  this.secureKeys = new SecureKeysManager();
23
29
  }
24
30
  /**
@@ -96,6 +102,38 @@ export class DataIngestor {
96
102
  throw e;
97
103
  }
98
104
  }
105
+ else if (source === "openml") {
106
+ const targetDir = path.join(this.rawDataDir, datasetId.replace(/:/g, "_"));
107
+ this.store.registerDownload(datasetId, targetDir, "downloading");
108
+ try {
109
+ onProgress?.("Downloading from OpenML...");
110
+ const result = await this.openmlSource.download(datasetId, targetDir);
111
+ const stats = fs.statSync(result.local_path);
112
+ this.completeDownload(datasetId, result.local_path, stats.size);
113
+ onProgress?.("OpenML download complete", 100);
114
+ return result.local_path;
115
+ }
116
+ catch (e) {
117
+ this.failDownload(datasetId, e.message);
118
+ throw e;
119
+ }
120
+ }
121
+ else if (source === "dataworld") {
122
+ const targetDir = path.join(this.rawDataDir, datasetId.replace(/[:\/]/g, "_"));
123
+ this.store.registerDownload(datasetId, targetDir, "downloading");
124
+ try {
125
+ onProgress?.("Downloading from data.world...");
126
+ const result = await this.dataworldSource.download(datasetId, targetDir);
127
+ const stats = fs.statSync(result.local_path);
128
+ this.completeDownload(datasetId, result.local_path, stats.size);
129
+ onProgress?.("data.world download complete", 100);
130
+ return result.local_path;
131
+ }
132
+ catch (e) {
133
+ this.failDownload(datasetId, e.message);
134
+ throw e;
135
+ }
136
+ }
99
137
  throw new Error(`Download logic for ${source} not yet implemented`);
100
138
  }
101
139
  /**
@@ -0,0 +1,89 @@
1
+ import { spawn } from "child_process";
2
+ import path from "path";
3
+ import fs from "fs";
4
+ import os from "os";
5
+ export class DataWorldSource {
6
+ pythonPath = "python";
7
+ scriptPath;
8
+ constructor(buildDir = process.cwd()) {
9
+ const homeDir = os.homedir() || process.env.HOME || process.env.USERPROFILE || buildDir;
10
+ const dataRoot = path.join(homeDir, ".vesper");
11
+ const scriptPath0 = path.resolve(dataRoot, "python", "dataworld_engine.py");
12
+ const scriptPath1 = path.resolve(buildDir, "python", "dataworld_engine.py");
13
+ const scriptPath2 = path.resolve(buildDir, "..", "src", "python", "dataworld_engine.py");
14
+ if (fs.existsSync(scriptPath0)) {
15
+ this.scriptPath = scriptPath0;
16
+ }
17
+ else if (fs.existsSync(scriptPath1)) {
18
+ this.scriptPath = scriptPath1;
19
+ }
20
+ else if (fs.existsSync(scriptPath2)) {
21
+ this.scriptPath = scriptPath2;
22
+ }
23
+ else {
24
+ this.scriptPath = scriptPath0;
25
+ }
26
+ if (process.platform === "win32") {
27
+ const venvPy = path.resolve(buildDir, ".venv", "Scripts", "python.exe");
28
+ if (fs.existsSync(venvPy)) {
29
+ this.pythonPath = venvPy;
30
+ }
31
+ else {
32
+ this.pythonPath = "py";
33
+ }
34
+ }
35
+ else {
36
+ const venvPy = path.resolve(buildDir, ".venv", "bin", "python");
37
+ if (fs.existsSync(venvPy)) {
38
+ this.pythonPath = venvPy;
39
+ }
40
+ }
41
+ }
42
+ async discover(query, limit = 20) {
43
+ const result = await this.run(["discover", query, String(limit)]);
44
+ if (!result.ok) {
45
+ throw new Error(result.error || "data.world discover failed");
46
+ }
47
+ return (result.results || []);
48
+ }
49
+ async download(datasetRef, targetDir) {
50
+ const args = ["download", datasetRef];
51
+ if (targetDir)
52
+ args.push(targetDir);
53
+ const result = await this.run(args);
54
+ if (!result.ok) {
55
+ throw new Error(result.error || "data.world download failed");
56
+ }
57
+ return {
58
+ local_path: result.local_path,
59
+ target_dir: result.target_dir,
60
+ };
61
+ }
62
+ run(args) {
63
+ return new Promise((resolve, reject) => {
64
+ const proc = spawn(this.pythonPath, [this.scriptPath, ...args], {
65
+ env: process.env
66
+ });
67
+ let stdout = "";
68
+ let stderr = "";
69
+ proc.stdout.on("data", (data) => {
70
+ stdout += data.toString();
71
+ });
72
+ proc.stderr.on("data", (data) => {
73
+ stderr += data.toString();
74
+ });
75
+ proc.on("close", (code) => {
76
+ if (code !== 0) {
77
+ return reject(new Error(`data.world engine exited with code ${code}: ${stderr}`));
78
+ }
79
+ try {
80
+ const parsed = JSON.parse(stdout.trim());
81
+ resolve(parsed);
82
+ }
83
+ catch (e) {
84
+ reject(new Error(`Failed to parse data.world engine output: ${stdout}`));
85
+ }
86
+ });
87
+ });
88
+ }
89
+ }
@@ -0,0 +1,87 @@
1
+ import { spawn } from "child_process";
2
+ import path from "path";
3
+ import fs from "fs";
4
+ import os from "os";
5
+ export class OpenMLSource {
6
+ pythonPath = "python";
7
+ scriptPath;
8
+ constructor(buildDir = process.cwd()) {
9
+ const homeDir = os.homedir() || process.env.HOME || process.env.USERPROFILE || buildDir;
10
+ const dataRoot = path.join(homeDir, ".vesper");
11
+ const scriptPath0 = path.resolve(dataRoot, "python", "openml_engine.py");
12
+ const scriptPath1 = path.resolve(buildDir, "python", "openml_engine.py");
13
+ const scriptPath2 = path.resolve(buildDir, "..", "src", "python", "openml_engine.py");
14
+ if (fs.existsSync(scriptPath0)) {
15
+ this.scriptPath = scriptPath0;
16
+ }
17
+ else if (fs.existsSync(scriptPath1)) {
18
+ this.scriptPath = scriptPath1;
19
+ }
20
+ else if (fs.existsSync(scriptPath2)) {
21
+ this.scriptPath = scriptPath2;
22
+ }
23
+ else {
24
+ this.scriptPath = scriptPath0;
25
+ }
26
+ if (process.platform === "win32") {
27
+ const venvPy = path.resolve(buildDir, ".venv", "Scripts", "python.exe");
28
+ if (fs.existsSync(venvPy)) {
29
+ this.pythonPath = venvPy;
30
+ }
31
+ else {
32
+ this.pythonPath = "py";
33
+ }
34
+ }
35
+ else {
36
+ const venvPy = path.resolve(buildDir, ".venv", "bin", "python");
37
+ if (fs.existsSync(venvPy)) {
38
+ this.pythonPath = venvPy;
39
+ }
40
+ }
41
+ }
42
+ async discover(query, limit = 20) {
43
+ const result = await this.run(["discover", query, String(limit)]);
44
+ if (!result.ok) {
45
+ throw new Error(result.error || "OpenML discover failed");
46
+ }
47
+ return (result.results || []);
48
+ }
49
+ async download(datasetRef, targetDir) {
50
+ const args = ["download", datasetRef];
51
+ if (targetDir)
52
+ args.push(targetDir);
53
+ const result = await this.run(args);
54
+ if (!result.ok) {
55
+ throw new Error(result.error || "OpenML download failed");
56
+ }
57
+ return {
58
+ local_path: result.local_path,
59
+ target_dir: result.target_dir,
60
+ };
61
+ }
62
+ run(args) {
63
+ return new Promise((resolve, reject) => {
64
+ const proc = spawn(this.pythonPath, [this.scriptPath, ...args]);
65
+ let stdout = "";
66
+ let stderr = "";
67
+ proc.stdout.on("data", (data) => {
68
+ stdout += data.toString();
69
+ });
70
+ proc.stderr.on("data", (data) => {
71
+ stderr += data.toString();
72
+ });
73
+ proc.on("close", (code) => {
74
+ if (code !== 0) {
75
+ return reject(new Error(`OpenML engine exited with code ${code}: ${stderr}`));
76
+ }
77
+ try {
78
+ const parsed = JSON.parse(stdout.trim());
79
+ resolve(parsed);
80
+ }
81
+ catch (e) {
82
+ reject(new Error(`Failed to parse OpenML engine output: ${stdout}`));
83
+ }
84
+ });
85
+ });
86
+ }
87
+ }
@@ -13,6 +13,7 @@ KEY_ALIASES = {
13
13
  "hf_token": ["HF_TOKEN", "HUGGINGFACE_TOKEN"],
14
14
  "kaggle_username": ["KAGGLE_USERNAME"],
15
15
  "kaggle_key": ["KAGGLE_KEY"],
16
+ "dataworld_token": ["DW_AUTH_TOKEN"],
16
17
  }
17
18
 
18
19
  try:
@@ -207,6 +208,7 @@ def get_all() -> Dict[str, Optional[str]]:
207
208
  "hf_token": get_key("hf_token"),
208
209
  "kaggle_username": get_key("kaggle_username"),
209
210
  "kaggle_key": get_key("kaggle_key"),
211
+ "dataworld_token": get_key("dataworld_token"),
210
212
  }
211
213
 
212
214
 
@@ -0,0 +1,208 @@
1
+ import sys
2
+ import json
3
+ import argparse
4
+ import tempfile
5
+ import os
6
+ import urllib.request
7
+ import urllib.error
8
+ import urllib.parse
9
+ from typing import Dict, Any, List
10
+
11
+ def _get_token() -> str:
12
+ token = os.environ.get("DW_AUTH_TOKEN")
13
+ if not token:
14
+ raise ValueError("DW_AUTH_TOKEN environment variable is required for data.world")
15
+ return token
16
+
17
+ def _dataset_to_dict(ds: Dict[str, Any]) -> Dict[str, Any]:
18
+ owner_field = ds.get("owner", "")
19
+ if isinstance(owner_field, dict):
20
+ owner = owner_field.get("id") or owner_field.get("name") or ""
21
+ else:
22
+ owner = owner_field or ""
23
+
24
+ id_str = ds.get("id", "")
25
+ title = ds.get("title", "")
26
+
27
+ if (not owner or not id_str) and isinstance(ds.get("resourceLink"), str):
28
+ # Expected format includes /<owner>/<dataset-id>
29
+ parts = ds["resourceLink"].strip("/").split("/")
30
+ if len(parts) >= 2:
31
+ owner = owner or parts[-2]
32
+ id_str = id_str or parts[-1]
33
+
34
+ if isinstance(id_str, str) and "/" in id_str and not owner:
35
+ split_ref = id_str.split("/", 1)
36
+ owner = split_ref[0]
37
+ id_str = split_ref[1]
38
+
39
+ if not owner and not id_str:
40
+ owner = "unknown"
41
+ id_str = "unknown"
42
+
43
+ if not title:
44
+ title = f"{owner}/{id_str}"
45
+
46
+ return {
47
+ "id": f"dataworld:{owner}/{id_str}",
48
+ "name": title,
49
+ "source": "dataworld",
50
+ "description": ds.get("description", f"data.world dataset {title}"),
51
+ "author": owner,
52
+ "license": {
53
+ "id": "Unknown",
54
+ "category": "unknown",
55
+ "commercial_use": None,
56
+ "warnings": []
57
+ },
58
+ "tags": ds.get("tags", []) + ["dataworld"],
59
+ "downloads": 0,
60
+ "likes": 0,
61
+ "created_at": ds.get("created", ""),
62
+ "updated_at": ds.get("updated", ""),
63
+ "size_bytes": 0,
64
+ "quality_score": 0.8,
65
+ "domain": "general",
66
+ "is_gated": False,
67
+ "is_nsfw": False,
68
+ "description_length": len(ds.get("description", "")),
69
+ "has_readme": False,
70
+ "download_url": f"https://data.world/{owner}/{id_str}",
71
+ }
72
+
73
+ def discover(query: str, limit: int = 20) -> Dict[str, Any]:
74
+ try:
75
+ token = _get_token()
76
+
77
+ # data.world simple search API
78
+ url = f"https://api.data.world/v0/search/resources?size={limit}"
79
+
80
+ headers = {
81
+ "Authorization": f"Bearer {token}",
82
+ "Content-Type": "application/json",
83
+ "Accept": "application/json"
84
+ }
85
+
86
+ # Search datasets and include community results to improve recall
87
+ body = {
88
+ "query": query,
89
+ "category": ["dataset"],
90
+ "includeCommunityResults": True,
91
+ }
92
+
93
+ req = urllib.request.Request(url, data=json.dumps(body).encode('utf-8'), headers=headers, method="POST")
94
+
95
+ with urllib.request.urlopen(req) as response:
96
+ data = json.loads(response.read().decode('utf-8'))
97
+
98
+ records = data.get("records", [])
99
+
100
+ # Fallback to advanced endpoint if simple search returns nothing
101
+ if not records:
102
+ adv_url = f"https://api.data.world/v0/search?size={limit}"
103
+ adv_body = {
104
+ "query": query,
105
+ "category": ["dataset"],
106
+ }
107
+ adv_req = urllib.request.Request(
108
+ adv_url,
109
+ data=json.dumps(adv_body).encode("utf-8"),
110
+ headers=headers,
111
+ method="POST",
112
+ )
113
+ with urllib.request.urlopen(adv_req) as response:
114
+ adv_data = json.loads(response.read().decode("utf-8"))
115
+ records = adv_data.get("records", [])
116
+
117
+ items = [_dataset_to_dict(r) for r in records]
118
+
119
+ return {"ok": True, "results": items, "count": len(items)}
120
+ except Exception as e:
121
+ return {"ok": False, "error": f"data.world discover failed: {str(e)}"}
122
+
123
+ def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
124
+ try:
125
+ token = _get_token()
126
+
127
+ # dataset_ref is expected to be "dataworld:owner/id"
128
+ if dataset_ref.startswith("dataworld:"):
129
+ ref = dataset_ref.split(":", 1)[1]
130
+ else:
131
+ ref = dataset_ref
132
+
133
+ parts = ref.split("/")
134
+ if len(parts) != 2:
135
+ return {"ok": False, "error": f"Invalid data.world dataset ID format. Expected owner/id, got {ref}"}
136
+
137
+ owner, dataset_id = parts
138
+
139
+ if not target_dir:
140
+ target_dir = tempfile.mkdtemp(prefix="vesper_dataworld_")
141
+
142
+ os.makedirs(target_dir, exist_ok=True)
143
+
144
+ # First, get the dataset metadata to find the files
145
+ url = f"https://api.data.world/v0/datasets/{owner}/{dataset_id}"
146
+ headers = {
147
+ "Authorization": f"Bearer {token}",
148
+ "Accept": "application/json"
149
+ }
150
+
151
+ req = urllib.request.Request(url, headers=headers)
152
+ with urllib.request.urlopen(req) as response:
153
+ dataset_meta = json.loads(response.read().decode('utf-8'))
154
+
155
+ files = dataset_meta.get("files", [])
156
+ if not files:
157
+ return {"ok": False, "error": "No files found in this dataset"}
158
+
159
+ # Find the best file to download (prefer csv, parquet, jsonl)
160
+ best_file = None
161
+ for ext in [".parquet", ".csv", ".jsonl", ".json"]:
162
+ for f in files:
163
+ if f.get("name", "").lower().endswith(ext):
164
+ best_file = f
165
+ break
166
+ if best_file:
167
+ break
168
+
169
+ if not best_file:
170
+ best_file = files[0] # Just take the first one if no preferred format
171
+
172
+ filename = best_file.get("name")
173
+
174
+ # Download the file
175
+ download_url = f"https://api.data.world/v0/file_download/{owner}/{dataset_id}/{urllib.parse.quote(filename)}"
176
+
177
+ file_path = os.path.join(target_dir, filename)
178
+
179
+ download_req = urllib.request.Request(download_url, headers=headers)
180
+ with urllib.request.urlopen(download_req) as response, open(file_path, 'wb') as out_file:
181
+ out_file.write(response.read())
182
+
183
+ return {
184
+ "ok": True,
185
+ "local_path": file_path,
186
+ "target_dir": target_dir
187
+ }
188
+ except Exception as e:
189
+ return {"ok": False, "error": f"data.world download failed: {str(e)}"}
190
+
191
+ def main():
192
+ parser = argparse.ArgumentParser(description="Vesper data.world Engine")
193
+ parser.add_argument("action", choices=["discover", "download"])
194
+ parser.add_argument("arg1", help="Query for discover, Dataset ID for download")
195
+ parser.add_argument("arg2", nargs="?", help="Limit for discover, Target Dir for download")
196
+
197
+ args = parser.parse_args()
198
+
199
+ if args.action == "discover":
200
+ limit = int(args.arg2) if args.arg2 else 20
201
+ result = discover(args.arg1, limit)
202
+ print(json.dumps(result))
203
+ elif args.action == "download":
204
+ result = download(args.arg1, args.arg2)
205
+ print(json.dumps(result))
206
+
207
+ if __name__ == "__main__":
208
+ main()
@@ -0,0 +1,146 @@
1
+ import sys
2
+ import json
3
+ import argparse
4
+ import tempfile
5
+ import os
6
+ from typing import Dict, Any, List
7
+
8
+ try:
9
+ import openml
10
+ except ImportError:
11
+ openml = None
12
+
13
+ def _ensure_openml() -> Dict[str, Any]:
14
+ if openml is None:
15
+ return {"ok": False, "error": "openml package is not installed. Run 'pip install openml'"}
16
+ return {"ok": True}
17
+
18
+ def _dataset_to_dict(ds: Dict[str, Any]) -> Dict[str, Any]:
19
+ # OpenML dataset dict from list_datasets
20
+ did = ds.get("did", "")
21
+ name = ds.get("name", f"dataset_{did}")
22
+ version = ds.get("version", "1")
23
+ status = ds.get("status", "active")
24
+ format = ds.get("format", "unknown")
25
+
26
+ # Map to Vesper DatasetMetadata format
27
+ return {
28
+ "id": f"openml:{did}",
29
+ "name": name,
30
+ "source": "openml",
31
+ "description": f"OpenML Dataset {name} (ID: {did}, Version: {version}, Format: {format}, Status: {status})",
32
+ "author": "OpenML Community",
33
+ "license": "Public",
34
+ "tags": ["openml", format.lower()],
35
+ "downloads": ds.get("NumberOfDownloads", 0),
36
+ "likes": ds.get("NumberOfLikes", 0),
37
+ "created_at": ds.get("upload_date", ""),
38
+ "updated_at": ds.get("upload_date", ""),
39
+ "size_bytes": 0, # Not always available in list
40
+ "quality_score": 0.8, # Default good score for OpenML
41
+ "domain": "machine_learning",
42
+ "is_gated": False,
43
+ "is_nsfw": False,
44
+ "description_length": 100,
45
+ "has_readme": False,
46
+ }
47
+
48
+ def discover(query: str, limit: int = 20) -> Dict[str, Any]:
49
+ check = _ensure_openml()
50
+ if not check.get("ok"):
51
+ return check
52
+
53
+ try:
54
+ # OpenML list_datasets doesn't have a direct text search in the python API easily exposed without downloading all.
55
+ # But we can filter by tag or just get a list and filter locally if needed, or use the REST API directly.
56
+ # Actually, openml.datasets.list_datasets() returns a dict of datasets.
57
+ # We can fetch a larger batch and filter by name/keyword.
58
+
59
+ # Fetching a batch of datasets
60
+ datasets = openml.datasets.list_datasets(output_format='dataframe')
61
+
62
+ if query:
63
+ # Simple case-insensitive search in name
64
+ mask = datasets['name'].str.contains(query, case=False, na=False)
65
+ filtered = datasets[mask]
66
+ else:
67
+ filtered = datasets
68
+
69
+ # Sort by NumberOfDownloads if available, else just take top
70
+ if 'NumberOfDownloads' in filtered.columns:
71
+ filtered = filtered.sort_values('NumberOfDownloads', ascending=False)
72
+
73
+ top_k = filtered.head(limit)
74
+
75
+ # Convert to list of dicts
76
+ records = top_k.to_dict(orient='records')
77
+ items = [_dataset_to_dict(r) for r in records]
78
+
79
+ return {"ok": True, "results": items, "count": len(items)}
80
+ except Exception as e:
81
+ return {"ok": False, "error": f"OpenML discover failed: {str(e)}"}
82
+
83
+ def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
84
+ check = _ensure_openml()
85
+ if not check.get("ok"):
86
+ return check
87
+
88
+ try:
89
+ # dataset_ref is expected to be "openml:ID"
90
+ if dataset_ref.startswith("openml:"):
91
+ did_str = dataset_ref.split(":")[1]
92
+ else:
93
+ did_str = dataset_ref
94
+
95
+ did = int(did_str)
96
+
97
+ if not target_dir:
98
+ target_dir = tempfile.mkdtemp(prefix="vesper_openml_")
99
+
100
+ os.makedirs(target_dir, exist_ok=True)
101
+
102
+ # Get the dataset
103
+ dataset = openml.datasets.get_dataset(did, download_data=True, download_qualities=False, download_features_meta_data=False)
104
+
105
+ # Get the pandas dataframe
106
+ X, y, categorical_indicator, attribute_names = dataset.get_data(
107
+ dataset_format="dataframe"
108
+ )
109
+
110
+ # If there's a target column (y), we might want to join it back if it was separated
111
+ # get_data() can return X and y separately if target is specified, but usually X contains everything if target=None
112
+ # Let's just get everything
113
+ df, _, _, _ = dataset.get_data(target=None, dataset_format="dataframe")
114
+
115
+ # Save to parquet in the target directory
116
+ safe_name = "".join([c if c.isalnum() else "_" for c in dataset.name])
117
+ file_path = os.path.join(target_dir, f"{safe_name}_{did}.parquet")
118
+
119
+ df.to_parquet(file_path, index=False)
120
+
121
+ return {
122
+ "ok": True,
123
+ "local_path": file_path,
124
+ "target_dir": target_dir
125
+ }
126
+ except Exception as e:
127
+ return {"ok": False, "error": f"OpenML download failed: {str(e)}"}
128
+
129
+ def main():
130
+ parser = argparse.ArgumentParser(description="Vesper OpenML Engine")
131
+ parser.add_argument("action", choices=["discover", "download"])
132
+ parser.add_argument("arg1", help="Query for discover, Dataset ID for download")
133
+ parser.add_argument("arg2", nargs="?", help="Limit for discover, Target Dir for download")
134
+
135
+ args = parser.parse_args()
136
+
137
+ if args.action == "discover":
138
+ limit = int(args.arg2) if args.arg2 else 20
139
+ result = discover(args.arg1, limit)
140
+ print(json.dumps(result))
141
+ elif args.action == "download":
142
+ result = download(args.arg1, args.arg2)
143
+ print(json.dumps(result))
144
+
145
+ if __name__ == "__main__":
146
+ main()
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vespermcp/mcp-server",
3
- "version": "1.2.2",
3
+ "version": "1.2.3",
4
4
  "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
5
5
  "type": "module",
6
6
  "main": "build/index.js",
@@ -13,6 +13,7 @@ KEY_ALIASES = {
13
13
  "hf_token": ["HF_TOKEN", "HUGGINGFACE_TOKEN"],
14
14
  "kaggle_username": ["KAGGLE_USERNAME"],
15
15
  "kaggle_key": ["KAGGLE_KEY"],
16
+ "dataworld_token": ["DW_AUTH_TOKEN"],
16
17
  }
17
18
 
18
19
  try:
@@ -207,6 +208,7 @@ def get_all() -> Dict[str, Optional[str]]:
207
208
  "hf_token": get_key("hf_token"),
208
209
  "kaggle_username": get_key("kaggle_username"),
209
210
  "kaggle_key": get_key("kaggle_key"),
211
+ "dataworld_token": get_key("dataworld_token"),
210
212
  }
211
213
 
212
214
 
@@ -0,0 +1,208 @@
1
+ import sys
2
+ import json
3
+ import argparse
4
+ import tempfile
5
+ import os
6
+ import urllib.request
7
+ import urllib.error
8
+ import urllib.parse
9
+ from typing import Dict, Any, List
10
+
11
+ def _get_token() -> str:
12
+ token = os.environ.get("DW_AUTH_TOKEN")
13
+ if not token:
14
+ raise ValueError("DW_AUTH_TOKEN environment variable is required for data.world")
15
+ return token
16
+
17
+ def _dataset_to_dict(ds: Dict[str, Any]) -> Dict[str, Any]:
18
+ owner_field = ds.get("owner", "")
19
+ if isinstance(owner_field, dict):
20
+ owner = owner_field.get("id") or owner_field.get("name") or ""
21
+ else:
22
+ owner = owner_field or ""
23
+
24
+ id_str = ds.get("id", "")
25
+ title = ds.get("title", "")
26
+
27
+ if (not owner or not id_str) and isinstance(ds.get("resourceLink"), str):
28
+ # Expected format includes /<owner>/<dataset-id>
29
+ parts = ds["resourceLink"].strip("/").split("/")
30
+ if len(parts) >= 2:
31
+ owner = owner or parts[-2]
32
+ id_str = id_str or parts[-1]
33
+
34
+ if isinstance(id_str, str) and "/" in id_str and not owner:
35
+ split_ref = id_str.split("/", 1)
36
+ owner = split_ref[0]
37
+ id_str = split_ref[1]
38
+
39
+ if not owner and not id_str:
40
+ owner = "unknown"
41
+ id_str = "unknown"
42
+
43
+ if not title:
44
+ title = f"{owner}/{id_str}"
45
+
46
+ return {
47
+ "id": f"dataworld:{owner}/{id_str}",
48
+ "name": title,
49
+ "source": "dataworld",
50
+ "description": ds.get("description", f"data.world dataset {title}"),
51
+ "author": owner,
52
+ "license": {
53
+ "id": "Unknown",
54
+ "category": "unknown",
55
+ "commercial_use": None,
56
+ "warnings": []
57
+ },
58
+ "tags": ds.get("tags", []) + ["dataworld"],
59
+ "downloads": 0,
60
+ "likes": 0,
61
+ "created_at": ds.get("created", ""),
62
+ "updated_at": ds.get("updated", ""),
63
+ "size_bytes": 0,
64
+ "quality_score": 0.8,
65
+ "domain": "general",
66
+ "is_gated": False,
67
+ "is_nsfw": False,
68
+ "description_length": len(ds.get("description", "")),
69
+ "has_readme": False,
70
+ "download_url": f"https://data.world/{owner}/{id_str}",
71
+ }
72
+
73
def discover(query: str, limit: int = 20) -> Dict[str, Any]:
    """Search data.world for datasets matching `query`.

    Args:
        query: Free-text search string sent to the data.world search API.
        limit: Maximum number of results to request.

    Returns:
        {"ok": True, "results": [...], "count": N} on success, or
        {"ok": False, "error": "..."} on any failure (missing token,
        HTTP error, malformed response).
    """
    try:
        token = _get_token()

        headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
            "Accept": "application/json"
        }

        # data.world simple search API
        url = f"https://api.data.world/v0/search/resources?size={limit}"

        # Search datasets and include community results to improve recall
        body = {
            "query": query,
            "category": ["dataset"],
            "includeCommunityResults": True,
        }

        req = urllib.request.Request(url, data=json.dumps(body).encode('utf-8'), headers=headers, method="POST")

        # FIX: pass an explicit timeout — urlopen blocks indefinitely by
        # default, which would hang the MCP tool on a stalled connection.
        with urllib.request.urlopen(req, timeout=30) as response:
            data = json.loads(response.read().decode('utf-8'))

        records = data.get("records", [])

        # Fallback to advanced endpoint if simple search returns nothing
        if not records:
            adv_url = f"https://api.data.world/v0/search?size={limit}"
            adv_body = {
                "query": query,
                "category": ["dataset"],
            }
            adv_req = urllib.request.Request(
                adv_url,
                data=json.dumps(adv_body).encode("utf-8"),
                headers=headers,
                method="POST",
            )
            with urllib.request.urlopen(adv_req, timeout=30) as response:
                adv_data = json.loads(response.read().decode("utf-8"))
                records = adv_data.get("records", [])

        items = [_dataset_to_dict(r) for r in records]

        return {"ok": True, "results": items, "count": len(items)}
    except Exception as e:
        return {"ok": False, "error": f"data.world discover failed: {str(e)}"}
122
+
123
def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
    """Download the most useful single file from a data.world dataset.

    Args:
        dataset_ref: "dataworld:owner/id" (or bare "owner/id").
        target_dir: Destination directory; a temp dir is created if falsy.

    Returns:
        {"ok": True, "local_path": ..., "target_dir": ...} on success,
        or {"ok": False, "error": "..."} on any failure.
    """
    try:
        token = _get_token()

        # dataset_ref is expected to be "dataworld:owner/id"
        if dataset_ref.startswith("dataworld:"):
            ref = dataset_ref.split(":", 1)[1]
        else:
            ref = dataset_ref

        parts = ref.split("/")
        if len(parts) != 2:
            return {"ok": False, "error": f"Invalid data.world dataset ID format. Expected owner/id, got {ref}"}

        owner, dataset_id = parts

        if not target_dir:
            target_dir = tempfile.mkdtemp(prefix="vesper_dataworld_")

        os.makedirs(target_dir, exist_ok=True)

        # First, get the dataset metadata to find the files
        url = f"https://api.data.world/v0/datasets/{owner}/{dataset_id}"
        headers = {
            "Authorization": f"Bearer {token}",
            "Accept": "application/json"
        }

        req = urllib.request.Request(url, headers=headers)
        # FIX: explicit timeout so a stalled API call cannot hang forever.
        with urllib.request.urlopen(req, timeout=30) as response:
            dataset_meta = json.loads(response.read().decode('utf-8'))

        files = dataset_meta.get("files", [])
        if not files:
            return {"ok": False, "error": "No files found in this dataset"}

        # Find the best file to download (prefer parquet, csv, jsonl, json)
        best_file = None
        for ext in [".parquet", ".csv", ".jsonl", ".json"]:
            for f in files:
                if f.get("name", "").lower().endswith(ext):
                    best_file = f
                    break
            if best_file:
                break

        if not best_file:
            best_file = files[0]  # Just take the first one if no preferred format

        filename = best_file.get("name")
        # FIX: guard against a file entry with no name (would crash quote()).
        if not filename:
            return {"ok": False, "error": "Dataset file entry has no name"}

        # Download the file
        download_url = f"https://api.data.world/v0/file_download/{owner}/{dataset_id}/{urllib.parse.quote(filename)}"

        # FIX: the remote file name is untrusted input; strip any directory
        # components so it cannot escape target_dir (path traversal).
        file_path = os.path.join(target_dir, os.path.basename(filename))

        download_req = urllib.request.Request(download_url, headers=headers)
        # Longer timeout here: the payload itself may be large.
        with urllib.request.urlopen(download_req, timeout=120) as response, open(file_path, 'wb') as out_file:
            out_file.write(response.read())

        return {
            "ok": True,
            "local_path": file_path,
            "target_dir": target_dir
        }
    except Exception as e:
        return {"ok": False, "error": f"data.world download failed: {str(e)}"}
190
+
191
def main():
    """CLI entry point: run `discover` or `download` and print a JSON result."""
    cli = argparse.ArgumentParser(description="Vesper data.world Engine")
    cli.add_argument("action", choices=["discover", "download"])
    cli.add_argument("arg1", help="Query for discover, Dataset ID for download")
    cli.add_argument("arg2", nargs="?", help="Limit for discover, Target Dir for download")
    opts = cli.parse_args()

    if opts.action == "discover":
        # arg2 doubles as the result limit for discover; default to 20.
        max_results = int(opts.arg2) if opts.arg2 else 20
        print(json.dumps(discover(opts.arg1, max_results)))
    elif opts.action == "download":
        # arg2 doubles as the target directory for download (may be None).
        print(json.dumps(download(opts.arg1, opts.arg2)))

if __name__ == "__main__":
    main()
@@ -0,0 +1,146 @@
1
+ import sys
2
+ import json
3
+ import argparse
4
+ import tempfile
5
+ import os
6
+ from typing import Dict, Any, List
7
+
8
+ try:
9
+ import openml
10
+ except ImportError:
11
+ openml = None
12
+
13
def _ensure_openml() -> Dict[str, Any]:
    """Report whether the optional `openml` dependency was importable.

    Returns {"ok": True} when available, otherwise an error payload that
    callers pass straight back to the tool output.
    """
    if openml is not None:
        return {"ok": True}
    return {"ok": False, "error": "openml package is not installed. Run 'pip install openml'"}
17
+
18
+ def _dataset_to_dict(ds: Dict[str, Any]) -> Dict[str, Any]:
19
+ # OpenML dataset dict from list_datasets
20
+ did = ds.get("did", "")
21
+ name = ds.get("name", f"dataset_{did}")
22
+ version = ds.get("version", "1")
23
+ status = ds.get("status", "active")
24
+ format = ds.get("format", "unknown")
25
+
26
+ # Map to Vesper DatasetMetadata format
27
+ return {
28
+ "id": f"openml:{did}",
29
+ "name": name,
30
+ "source": "openml",
31
+ "description": f"OpenML Dataset {name} (ID: {did}, Version: {version}, Format: {format}, Status: {status})",
32
+ "author": "OpenML Community",
33
+ "license": "Public",
34
+ "tags": ["openml", format.lower()],
35
+ "downloads": ds.get("NumberOfDownloads", 0),
36
+ "likes": ds.get("NumberOfLikes", 0),
37
+ "created_at": ds.get("upload_date", ""),
38
+ "updated_at": ds.get("upload_date", ""),
39
+ "size_bytes": 0, # Not always available in list
40
+ "quality_score": 0.8, # Default good score for OpenML
41
+ "domain": "machine_learning",
42
+ "is_gated": False,
43
+ "is_nsfw": False,
44
+ "description_length": 100,
45
+ "has_readme": False,
46
+ }
47
+
48
def discover(query: str, limit: int = 20) -> Dict[str, Any]:
    """Find OpenML datasets whose name matches `query` (case-insensitive).

    The OpenML python client exposes no server-side text search, so the
    full catalogue is fetched as a DataFrame and filtered locally, then
    ranked by download count when that column is present.
    """
    check = _ensure_openml()
    if not check.get("ok"):
        return check

    try:
        # Fetch the complete catalogue and filter client-side.
        catalogue = openml.datasets.list_datasets(output_format='dataframe')

        if query:
            # Simple case-insensitive substring match on the name column.
            name_matches = catalogue['name'].str.contains(query, case=False, na=False)
            candidates = catalogue[name_matches]
        else:
            candidates = catalogue

        # Rank by popularity when the listing includes download counts.
        if 'NumberOfDownloads' in candidates.columns:
            candidates = candidates.sort_values('NumberOfDownloads', ascending=False)

        rows = candidates.head(limit).to_dict(orient='records')
        items = [_dataset_to_dict(row) for row in rows]

        return {"ok": True, "results": items, "count": len(items)}
    except Exception as e:
        return {"ok": False, "error": f"OpenML discover failed: {str(e)}"}
82
+
83
def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
    """Download an OpenML dataset and save it locally as parquet.

    Args:
        dataset_ref: "openml:<id>" or a bare numeric id string.
        target_dir: Destination directory; a temp dir is created if falsy.

    Returns:
        {"ok": True, "local_path": ..., "target_dir": ...} on success,
        or {"ok": False, "error": "..."} on any failure.
    """
    check = _ensure_openml()
    if not check.get("ok"):
        return check

    try:
        # dataset_ref is expected to be "openml:ID"
        if dataset_ref.startswith("openml:"):
            # split(":", 1) for consistency with the other engines, in case
            # the suffix ever contains a colon.
            did_str = dataset_ref.split(":", 1)[1]
        else:
            did_str = dataset_ref

        did = int(did_str)

        if not target_dir:
            target_dir = tempfile.mkdtemp(prefix="vesper_openml_")

        os.makedirs(target_dir, exist_ok=True)

        # Get the dataset
        dataset = openml.datasets.get_dataset(did, download_data=True, download_qualities=False, download_features_meta_data=False)

        # FIX: the original called dataset.get_data() twice, discarding the
        # first result entirely — the full dataset was parsed into memory
        # twice. One call with target=None returns every column (including
        # any default target) in a single DataFrame.
        df, _, _, _ = dataset.get_data(target=None, dataset_format="dataframe")

        # Save to parquet in the target directory, sanitizing the dataset
        # name so it is safe as a file name component.
        safe_name = "".join([c if c.isalnum() else "_" for c in dataset.name])
        file_path = os.path.join(target_dir, f"{safe_name}_{did}.parquet")

        df.to_parquet(file_path, index=False)

        return {
            "ok": True,
            "local_path": file_path,
            "target_dir": target_dir
        }
    except Exception as e:
        return {"ok": False, "error": f"OpenML download failed: {str(e)}"}
128
+
129
def main():
    """CLI entry point: run `discover` or `download` and print a JSON result."""
    cli = argparse.ArgumentParser(description="Vesper OpenML Engine")
    cli.add_argument("action", choices=["discover", "download"])
    cli.add_argument("arg1", help="Query for discover, Dataset ID for download")
    cli.add_argument("arg2", nargs="?", help="Limit for discover, Target Dir for download")
    opts = cli.parse_args()

    if opts.action == "discover":
        # arg2 doubles as the result limit for discover; default to 20.
        max_results = int(opts.arg2) if opts.arg2 else 20
        print(json.dumps(discover(opts.arg1, max_results)))
    elif opts.action == "download":
        # arg2 doubles as the target directory for download (may be None).
        print(json.dumps(download(opts.arg1, opts.arg2)))

if __name__ == "__main__":
    main()