@vespermcp/mcp-server 1.2.2 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +87 -8
- package/build/ingestion/ingestor.js +38 -0
- package/build/metadata/dataworld-source.js +89 -0
- package/build/metadata/openml-source.js +87 -0
- package/build/python/config.py +2 -0
- package/build/python/dataworld_engine.py +208 -0
- package/build/python/kaggle_engine.py +22 -2
- package/build/python/openml_engine.py +146 -0
- package/package.json +1 -1
- package/src/python/config.py +2 -0
- package/src/python/dataworld_engine.py +208 -0
- package/src/python/kaggle_engine.py +22 -2
- package/src/python/openml_engine.py +146 -0
package/build/index.js
CHANGED
|
@@ -12,6 +12,8 @@ import { Embedder } from "./search/embedder.js";
|
|
|
12
12
|
import { SearchEngine } from "./search/engine.js";
|
|
13
13
|
import { HuggingFaceScraper } from "./metadata/scraper.js";
|
|
14
14
|
import { KaggleSource } from "./metadata/kaggle-source.js";
|
|
15
|
+
import { OpenMLSource } from "./metadata/openml-source.js";
|
|
16
|
+
import { DataWorldSource } from "./metadata/dataworld-source.js";
|
|
15
17
|
import { formatSearchResults, formatDatasetInfo, formatJobStatus } from "./tools/formatter.js";
|
|
16
18
|
import { JobManager } from "./jobs/manager.js";
|
|
17
19
|
import { QualityAnalyzer } from "./quality/analyzer.js";
|
|
@@ -228,6 +230,12 @@ function hydrateExternalKeys() {
|
|
|
228
230
|
if (!process.env.KAGGLE_KEY && keys.kaggle_key) {
|
|
229
231
|
process.env.KAGGLE_KEY = String(keys.kaggle_key);
|
|
230
232
|
}
|
|
233
|
+
if (!process.env.DW_AUTH_TOKEN && keys.dataworld_token) {
|
|
234
|
+
process.env.DW_AUTH_TOKEN = String(keys.dataworld_token);
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
function hasDataWorldToken() {
|
|
238
|
+
return !!(process.env.DW_AUTH_TOKEN || secureKeys.getAll().dataworld_token);
|
|
231
239
|
}
|
|
232
240
|
// CRITICAL FIX: Pass __dirname (build directory) to analyzers
|
|
233
241
|
// Python scripts are in build/python/, so analyzers should look relative to build/
|
|
@@ -429,7 +437,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
429
437
|
},
|
|
430
438
|
source: {
|
|
431
439
|
type: "string",
|
|
432
|
-
enum: ["huggingface", "kaggle"],
|
|
440
|
+
enum: ["huggingface", "kaggle", "openml", "dataworld"],
|
|
433
441
|
description: "Data source to discover from.",
|
|
434
442
|
},
|
|
435
443
|
limit: {
|
|
@@ -448,7 +456,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
448
456
|
properties: {
|
|
449
457
|
source: {
|
|
450
458
|
type: "string",
|
|
451
|
-
enum: ["huggingface", "kaggle"],
|
|
459
|
+
enum: ["huggingface", "kaggle", "openml", "dataworld"],
|
|
452
460
|
description: "Dataset source.",
|
|
453
461
|
},
|
|
454
462
|
dataset_id: {
|
|
@@ -477,13 +485,14 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
477
485
|
},
|
|
478
486
|
{
|
|
479
487
|
name: "configure_keys",
|
|
480
|
-
description: "One-time optional key setup for external sources (Kaggle
|
|
488
|
+
description: "One-time optional key setup for external sources (Kaggle, data.world, gated HF). Core tools do not require keys.",
|
|
481
489
|
inputSchema: {
|
|
482
490
|
type: "object",
|
|
483
491
|
properties: {
|
|
484
492
|
hf_token: { type: "string", description: "Optional Hugging Face token for gated/private datasets" },
|
|
485
493
|
kaggle_username: { type: "string", description: "Optional Kaggle username" },
|
|
486
|
-
kaggle_key: { type: "string", description: "Optional Kaggle API key" }
|
|
494
|
+
kaggle_key: { type: "string", description: "Optional Kaggle API key" },
|
|
495
|
+
dataworld_token: { type: "string", description: "Optional data.world API token" }
|
|
487
496
|
},
|
|
488
497
|
},
|
|
489
498
|
},
|
|
@@ -495,7 +504,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
495
504
|
properties: {
|
|
496
505
|
dataset_id: {
|
|
497
506
|
type: "string",
|
|
498
|
-
description: "The unique dataset ID (e.g., 'user/dataset_name' for HuggingFace
|
|
507
|
+
description: "The unique dataset ID (e.g., 'user/dataset_name' for HuggingFace, 'kaggle:username/dataset' for Kaggle, 'openml:1234' for OpenML, or 'dataworld:owner/id' for data.world)",
|
|
499
508
|
},
|
|
500
509
|
},
|
|
501
510
|
required: ["dataset_id"],
|
|
@@ -786,6 +795,20 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
786
795
|
}
|
|
787
796
|
results = await kaggleSource.discover(query, limit);
|
|
788
797
|
}
|
|
798
|
+
else if (source === "openml") {
|
|
799
|
+
const openmlSource = new OpenMLSource();
|
|
800
|
+
results = await openmlSource.discover(query, limit);
|
|
801
|
+
}
|
|
802
|
+
else if (source === "dataworld") {
|
|
803
|
+
if (!hasDataWorldToken()) {
|
|
804
|
+
return {
|
|
805
|
+
content: [{ type: "text", text: "data.world requires API token. Run 'vespermcp config keys' and set dataworld_token." }],
|
|
806
|
+
isError: true,
|
|
807
|
+
};
|
|
808
|
+
}
|
|
809
|
+
const dataworldSource = new DataWorldSource();
|
|
810
|
+
results = await dataworldSource.discover(query, limit);
|
|
811
|
+
}
|
|
789
812
|
else {
|
|
790
813
|
const hf = new HuggingFaceScraper();
|
|
791
814
|
results = await hf.scrape(Math.max(1, limit), true, query);
|
|
@@ -815,6 +838,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
815
838
|
isError: true,
|
|
816
839
|
};
|
|
817
840
|
}
|
|
841
|
+
if (source === "dataworld" && !hasDataWorldToken()) {
|
|
842
|
+
return {
|
|
843
|
+
content: [{ type: "text", text: "data.world requires API token. Run 'vespermcp config keys' and set dataworld_token." }],
|
|
844
|
+
isError: true,
|
|
845
|
+
};
|
|
846
|
+
}
|
|
818
847
|
try {
|
|
819
848
|
const localPath = await dataIngestor.ensureData(datasetId, source, () => undefined);
|
|
820
849
|
return {
|
|
@@ -846,6 +875,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
846
875
|
const hfToken = String(request.params.arguments?.hf_token || "").trim();
|
|
847
876
|
const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
|
|
848
877
|
const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
|
|
878
|
+
const dataworldToken = String(request.params.arguments?.dataworld_token || "").trim();
|
|
849
879
|
const saved = [];
|
|
850
880
|
const methods = [];
|
|
851
881
|
if (hfToken) {
|
|
@@ -875,6 +905,15 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
875
905
|
methods.push(r.method);
|
|
876
906
|
}
|
|
877
907
|
}
|
|
908
|
+
if (dataworldToken) {
|
|
909
|
+
const r = secureKeys.set("dataworld_token", dataworldToken);
|
|
910
|
+
if (r.ok) {
|
|
911
|
+
process.env.DW_AUTH_TOKEN = dataworldToken;
|
|
912
|
+
saved.push("data.world token");
|
|
913
|
+
if (r.method)
|
|
914
|
+
methods.push(r.method);
|
|
915
|
+
}
|
|
916
|
+
}
|
|
878
917
|
if (saved.length === 0) {
|
|
879
918
|
return {
|
|
880
919
|
content: [{ type: "text", text: "No keys provided. Core Vesper tools continue to work without API keys." }]
|
|
@@ -1402,6 +1441,7 @@ async function runConfigCli(args) {
|
|
|
1402
1441
|
const hfToken = (await ask(`Hugging Face token [${current.hf_token ? "saved" : "empty"}]: `)).trim();
|
|
1403
1442
|
const kaggleUsername = (await ask(`Kaggle username [${current.kaggle_username ? "saved" : "empty"}]: `)).trim();
|
|
1404
1443
|
const kaggleKey = (await ask(`Kaggle key [${current.kaggle_key ? "saved" : "empty"}]: `)).trim();
|
|
1444
|
+
const dataworldToken = (await ask(`data.world token [${current.dataworld_token ? "saved" : "empty"}]: `)).trim();
|
|
1405
1445
|
rl.close();
|
|
1406
1446
|
const saved = [];
|
|
1407
1447
|
if (hfToken) {
|
|
@@ -1425,12 +1465,19 @@ async function runConfigCli(args) {
|
|
|
1425
1465
|
saved.push("Kaggle key");
|
|
1426
1466
|
}
|
|
1427
1467
|
}
|
|
1468
|
+
if (dataworldToken) {
|
|
1469
|
+
const res = secureKeys.set("dataworld_token", dataworldToken);
|
|
1470
|
+
if (res.ok) {
|
|
1471
|
+
process.env.DW_AUTH_TOKEN = dataworldToken;
|
|
1472
|
+
saved.push("data.world token");
|
|
1473
|
+
}
|
|
1474
|
+
}
|
|
1428
1475
|
if (saved.length === 0) {
|
|
1429
1476
|
console.log("No new keys saved (all skipped). Core tools continue to work without keys.");
|
|
1430
1477
|
return;
|
|
1431
1478
|
}
|
|
1432
1479
|
console.log(`Key(s) saved securely: ${saved.join(", ")}`);
|
|
1433
|
-
console.log("You can now use Kaggle and gated Hugging Face datasets.");
|
|
1480
|
+
console.log("You can now use Kaggle, data.world, and gated Hugging Face datasets.");
|
|
1434
1481
|
return;
|
|
1435
1482
|
}
|
|
1436
1483
|
// Backward-compatible Kaggle-specific path
|
|
@@ -1479,7 +1526,7 @@ async function runDiscoverCli(args) {
|
|
|
1479
1526
|
}
|
|
1480
1527
|
const query = queryParts.join(" ").trim();
|
|
1481
1528
|
if (!query) {
|
|
1482
|
-
console.error("Usage: vespermcp discover --source kaggle \"credit risk\" --limit 10");
|
|
1529
|
+
console.error("Usage: vespermcp discover --source <huggingface|kaggle|openml|dataworld> \"credit risk\" --limit 10");
|
|
1483
1530
|
process.exit(1);
|
|
1484
1531
|
}
|
|
1485
1532
|
if (source === "kaggle") {
|
|
@@ -1512,6 +1559,34 @@ async function runDiscoverCli(args) {
|
|
|
1512
1559
|
}
|
|
1513
1560
|
return;
|
|
1514
1561
|
}
|
|
1562
|
+
else if (source === "openml") {
|
|
1563
|
+
try {
|
|
1564
|
+
const openmlSource = new OpenMLSource();
|
|
1565
|
+
const results = await openmlSource.discover(query, limit);
|
|
1566
|
+
console.log(formatSearchResults(results));
|
|
1567
|
+
}
|
|
1568
|
+
catch (error) {
|
|
1569
|
+
console.error(`OpenML discover failed: ${error.message || error}`);
|
|
1570
|
+
process.exit(1);
|
|
1571
|
+
}
|
|
1572
|
+
return;
|
|
1573
|
+
}
|
|
1574
|
+
else if (source === "dataworld") {
|
|
1575
|
+
if (!hasDataWorldToken()) {
|
|
1576
|
+
console.error("data.world requires API token. Run 'vespermcp config keys' and set dataworld_token.");
|
|
1577
|
+
process.exit(1);
|
|
1578
|
+
}
|
|
1579
|
+
try {
|
|
1580
|
+
const dataworldSource = new DataWorldSource();
|
|
1581
|
+
const results = await dataworldSource.discover(query, limit);
|
|
1582
|
+
console.log(formatSearchResults(results));
|
|
1583
|
+
}
|
|
1584
|
+
catch (error) {
|
|
1585
|
+
console.error(`data.world discover failed: ${error.message || error}`);
|
|
1586
|
+
process.exit(1);
|
|
1587
|
+
}
|
|
1588
|
+
return;
|
|
1589
|
+
}
|
|
1515
1590
|
const hf = new HuggingFaceScraper();
|
|
1516
1591
|
const results = await hf.scrape(limit, true, query);
|
|
1517
1592
|
console.log(formatSearchResults(results));
|
|
@@ -1530,7 +1605,7 @@ async function runDownloadCli(args) {
|
|
|
1530
1605
|
const source = (nonFlags[1] || "").toLowerCase();
|
|
1531
1606
|
const datasetId = nonFlags[2] || "";
|
|
1532
1607
|
if (!source || !datasetId) {
|
|
1533
|
-
console.error("Usage: vespermcp download kaggle <
|
|
1608
|
+
console.error("Usage: vespermcp download <huggingface|kaggle|openml|dataworld> <dataset-id> [--target-dir C:/path]");
|
|
1534
1609
|
process.exit(1);
|
|
1535
1610
|
}
|
|
1536
1611
|
if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
|
|
@@ -1546,6 +1621,10 @@ async function runDownloadCli(args) {
|
|
|
1546
1621
|
if (!dataIngestor.hasKaggleCredentials())
|
|
1547
1622
|
process.exit(1);
|
|
1548
1623
|
}
|
|
1624
|
+
if (source === "dataworld" && !hasDataWorldToken()) {
|
|
1625
|
+
console.error("data.world requires API token. Run 'vespermcp config keys' and set dataworld_token.");
|
|
1626
|
+
process.exit(1);
|
|
1627
|
+
}
|
|
1549
1628
|
let localPath = "";
|
|
1550
1629
|
try {
|
|
1551
1630
|
if (source === "kaggle" && targetDir) {
|
|
@@ -2,6 +2,8 @@ import path from "path";
|
|
|
2
2
|
import fs from "fs";
|
|
3
3
|
import { HFDownloader } from "./hf-downloader.js";
|
|
4
4
|
import { KaggleSource } from "../metadata/kaggle-source.js";
|
|
5
|
+
import { OpenMLSource } from "../metadata/openml-source.js";
|
|
6
|
+
import { DataWorldSource } from "../metadata/dataworld-source.js";
|
|
5
7
|
import { SecureKeysManager } from "../config/secure-keys.js";
|
|
6
8
|
export class DataIngestor {
|
|
7
9
|
projectRoot;
|
|
@@ -9,6 +11,8 @@ export class DataIngestor {
|
|
|
9
11
|
rawDataDir;
|
|
10
12
|
hfDownloader;
|
|
11
13
|
kaggleSource;
|
|
14
|
+
openmlSource;
|
|
15
|
+
dataworldSource;
|
|
12
16
|
secureKeys;
|
|
13
17
|
constructor(projectRoot, store) {
|
|
14
18
|
this.projectRoot = projectRoot;
|
|
@@ -19,6 +23,8 @@ export class DataIngestor {
|
|
|
19
23
|
}
|
|
20
24
|
this.hfDownloader = new HFDownloader();
|
|
21
25
|
this.kaggleSource = new KaggleSource();
|
|
26
|
+
this.openmlSource = new OpenMLSource();
|
|
27
|
+
this.dataworldSource = new DataWorldSource();
|
|
22
28
|
this.secureKeys = new SecureKeysManager();
|
|
23
29
|
}
|
|
24
30
|
/**
|
|
@@ -96,6 +102,38 @@ export class DataIngestor {
|
|
|
96
102
|
throw e;
|
|
97
103
|
}
|
|
98
104
|
}
|
|
105
|
+
else if (source === "openml") {
|
|
106
|
+
const targetDir = path.join(this.rawDataDir, datasetId.replace(/:/g, "_"));
|
|
107
|
+
this.store.registerDownload(datasetId, targetDir, "downloading");
|
|
108
|
+
try {
|
|
109
|
+
onProgress?.("Downloading from OpenML...");
|
|
110
|
+
const result = await this.openmlSource.download(datasetId, targetDir);
|
|
111
|
+
const stats = fs.statSync(result.local_path);
|
|
112
|
+
this.completeDownload(datasetId, result.local_path, stats.size);
|
|
113
|
+
onProgress?.("OpenML download complete", 100);
|
|
114
|
+
return result.local_path;
|
|
115
|
+
}
|
|
116
|
+
catch (e) {
|
|
117
|
+
this.failDownload(datasetId, e.message);
|
|
118
|
+
throw e;
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
else if (source === "dataworld") {
|
|
122
|
+
const targetDir = path.join(this.rawDataDir, datasetId.replace(/[:\/]/g, "_"));
|
|
123
|
+
this.store.registerDownload(datasetId, targetDir, "downloading");
|
|
124
|
+
try {
|
|
125
|
+
onProgress?.("Downloading from data.world...");
|
|
126
|
+
const result = await this.dataworldSource.download(datasetId, targetDir);
|
|
127
|
+
const stats = fs.statSync(result.local_path);
|
|
128
|
+
this.completeDownload(datasetId, result.local_path, stats.size);
|
|
129
|
+
onProgress?.("data.world download complete", 100);
|
|
130
|
+
return result.local_path;
|
|
131
|
+
}
|
|
132
|
+
catch (e) {
|
|
133
|
+
this.failDownload(datasetId, e.message);
|
|
134
|
+
throw e;
|
|
135
|
+
}
|
|
136
|
+
}
|
|
99
137
|
throw new Error(`Download logic for ${source} not yet implemented`);
|
|
100
138
|
}
|
|
101
139
|
/**
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import { spawn } from "child_process";
|
|
2
|
+
import path from "path";
|
|
3
|
+
import fs from "fs";
|
|
4
|
+
import os from "os";
|
|
5
|
+
export class DataWorldSource {
|
|
6
|
+
pythonPath = "python";
|
|
7
|
+
scriptPath;
|
|
8
|
+
constructor(buildDir = process.cwd()) {
|
|
9
|
+
const homeDir = os.homedir() || process.env.HOME || process.env.USERPROFILE || buildDir;
|
|
10
|
+
const dataRoot = path.join(homeDir, ".vesper");
|
|
11
|
+
const scriptPath0 = path.resolve(dataRoot, "python", "dataworld_engine.py");
|
|
12
|
+
const scriptPath1 = path.resolve(buildDir, "python", "dataworld_engine.py");
|
|
13
|
+
const scriptPath2 = path.resolve(buildDir, "..", "src", "python", "dataworld_engine.py");
|
|
14
|
+
if (fs.existsSync(scriptPath0)) {
|
|
15
|
+
this.scriptPath = scriptPath0;
|
|
16
|
+
}
|
|
17
|
+
else if (fs.existsSync(scriptPath1)) {
|
|
18
|
+
this.scriptPath = scriptPath1;
|
|
19
|
+
}
|
|
20
|
+
else if (fs.existsSync(scriptPath2)) {
|
|
21
|
+
this.scriptPath = scriptPath2;
|
|
22
|
+
}
|
|
23
|
+
else {
|
|
24
|
+
this.scriptPath = scriptPath0;
|
|
25
|
+
}
|
|
26
|
+
if (process.platform === "win32") {
|
|
27
|
+
const venvPy = path.resolve(buildDir, ".venv", "Scripts", "python.exe");
|
|
28
|
+
if (fs.existsSync(venvPy)) {
|
|
29
|
+
this.pythonPath = venvPy;
|
|
30
|
+
}
|
|
31
|
+
else {
|
|
32
|
+
this.pythonPath = "py";
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
else {
|
|
36
|
+
const venvPy = path.resolve(buildDir, ".venv", "bin", "python");
|
|
37
|
+
if (fs.existsSync(venvPy)) {
|
|
38
|
+
this.pythonPath = venvPy;
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
async discover(query, limit = 20) {
|
|
43
|
+
const result = await this.run(["discover", query, String(limit)]);
|
|
44
|
+
if (!result.ok) {
|
|
45
|
+
throw new Error(result.error || "data.world discover failed");
|
|
46
|
+
}
|
|
47
|
+
return (result.results || []);
|
|
48
|
+
}
|
|
49
|
+
async download(datasetRef, targetDir) {
|
|
50
|
+
const args = ["download", datasetRef];
|
|
51
|
+
if (targetDir)
|
|
52
|
+
args.push(targetDir);
|
|
53
|
+
const result = await this.run(args);
|
|
54
|
+
if (!result.ok) {
|
|
55
|
+
throw new Error(result.error || "data.world download failed");
|
|
56
|
+
}
|
|
57
|
+
return {
|
|
58
|
+
local_path: result.local_path,
|
|
59
|
+
target_dir: result.target_dir,
|
|
60
|
+
};
|
|
61
|
+
}
|
|
62
|
+
run(args) {
|
|
63
|
+
return new Promise((resolve, reject) => {
|
|
64
|
+
const proc = spawn(this.pythonPath, [this.scriptPath, ...args], {
|
|
65
|
+
env: process.env
|
|
66
|
+
});
|
|
67
|
+
let stdout = "";
|
|
68
|
+
let stderr = "";
|
|
69
|
+
proc.stdout.on("data", (data) => {
|
|
70
|
+
stdout += data.toString();
|
|
71
|
+
});
|
|
72
|
+
proc.stderr.on("data", (data) => {
|
|
73
|
+
stderr += data.toString();
|
|
74
|
+
});
|
|
75
|
+
proc.on("close", (code) => {
|
|
76
|
+
if (code !== 0) {
|
|
77
|
+
return reject(new Error(`data.world engine exited with code ${code}: ${stderr}`));
|
|
78
|
+
}
|
|
79
|
+
try {
|
|
80
|
+
const parsed = JSON.parse(stdout.trim());
|
|
81
|
+
resolve(parsed);
|
|
82
|
+
}
|
|
83
|
+
catch (e) {
|
|
84
|
+
reject(new Error(`Failed to parse data.world engine output: ${stdout}`));
|
|
85
|
+
}
|
|
86
|
+
});
|
|
87
|
+
});
|
|
88
|
+
}
|
|
89
|
+
}
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import { spawn } from "child_process";
|
|
2
|
+
import path from "path";
|
|
3
|
+
import fs from "fs";
|
|
4
|
+
import os from "os";
|
|
5
|
+
export class OpenMLSource {
|
|
6
|
+
pythonPath = "python";
|
|
7
|
+
scriptPath;
|
|
8
|
+
constructor(buildDir = process.cwd()) {
|
|
9
|
+
const homeDir = os.homedir() || process.env.HOME || process.env.USERPROFILE || buildDir;
|
|
10
|
+
const dataRoot = path.join(homeDir, ".vesper");
|
|
11
|
+
const scriptPath0 = path.resolve(dataRoot, "python", "openml_engine.py");
|
|
12
|
+
const scriptPath1 = path.resolve(buildDir, "python", "openml_engine.py");
|
|
13
|
+
const scriptPath2 = path.resolve(buildDir, "..", "src", "python", "openml_engine.py");
|
|
14
|
+
if (fs.existsSync(scriptPath0)) {
|
|
15
|
+
this.scriptPath = scriptPath0;
|
|
16
|
+
}
|
|
17
|
+
else if (fs.existsSync(scriptPath1)) {
|
|
18
|
+
this.scriptPath = scriptPath1;
|
|
19
|
+
}
|
|
20
|
+
else if (fs.existsSync(scriptPath2)) {
|
|
21
|
+
this.scriptPath = scriptPath2;
|
|
22
|
+
}
|
|
23
|
+
else {
|
|
24
|
+
this.scriptPath = scriptPath0;
|
|
25
|
+
}
|
|
26
|
+
if (process.platform === "win32") {
|
|
27
|
+
const venvPy = path.resolve(buildDir, ".venv", "Scripts", "python.exe");
|
|
28
|
+
if (fs.existsSync(venvPy)) {
|
|
29
|
+
this.pythonPath = venvPy;
|
|
30
|
+
}
|
|
31
|
+
else {
|
|
32
|
+
this.pythonPath = "py";
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
else {
|
|
36
|
+
const venvPy = path.resolve(buildDir, ".venv", "bin", "python");
|
|
37
|
+
if (fs.existsSync(venvPy)) {
|
|
38
|
+
this.pythonPath = venvPy;
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
async discover(query, limit = 20) {
|
|
43
|
+
const result = await this.run(["discover", query, String(limit)]);
|
|
44
|
+
if (!result.ok) {
|
|
45
|
+
throw new Error(result.error || "OpenML discover failed");
|
|
46
|
+
}
|
|
47
|
+
return (result.results || []);
|
|
48
|
+
}
|
|
49
|
+
async download(datasetRef, targetDir) {
|
|
50
|
+
const args = ["download", datasetRef];
|
|
51
|
+
if (targetDir)
|
|
52
|
+
args.push(targetDir);
|
|
53
|
+
const result = await this.run(args);
|
|
54
|
+
if (!result.ok) {
|
|
55
|
+
throw new Error(result.error || "OpenML download failed");
|
|
56
|
+
}
|
|
57
|
+
return {
|
|
58
|
+
local_path: result.local_path,
|
|
59
|
+
target_dir: result.target_dir,
|
|
60
|
+
};
|
|
61
|
+
}
|
|
62
|
+
run(args) {
|
|
63
|
+
return new Promise((resolve, reject) => {
|
|
64
|
+
const proc = spawn(this.pythonPath, [this.scriptPath, ...args]);
|
|
65
|
+
let stdout = "";
|
|
66
|
+
let stderr = "";
|
|
67
|
+
proc.stdout.on("data", (data) => {
|
|
68
|
+
stdout += data.toString();
|
|
69
|
+
});
|
|
70
|
+
proc.stderr.on("data", (data) => {
|
|
71
|
+
stderr += data.toString();
|
|
72
|
+
});
|
|
73
|
+
proc.on("close", (code) => {
|
|
74
|
+
if (code !== 0) {
|
|
75
|
+
return reject(new Error(`OpenML engine exited with code ${code}: ${stderr}`));
|
|
76
|
+
}
|
|
77
|
+
try {
|
|
78
|
+
const parsed = JSON.parse(stdout.trim());
|
|
79
|
+
resolve(parsed);
|
|
80
|
+
}
|
|
81
|
+
catch (e) {
|
|
82
|
+
reject(new Error(`Failed to parse OpenML engine output: ${stdout}`));
|
|
83
|
+
}
|
|
84
|
+
});
|
|
85
|
+
});
|
|
86
|
+
}
|
|
87
|
+
}
|
package/build/python/config.py
CHANGED
|
@@ -13,6 +13,7 @@ KEY_ALIASES = {
|
|
|
13
13
|
"hf_token": ["HF_TOKEN", "HUGGINGFACE_TOKEN"],
|
|
14
14
|
"kaggle_username": ["KAGGLE_USERNAME"],
|
|
15
15
|
"kaggle_key": ["KAGGLE_KEY"],
|
|
16
|
+
"dataworld_token": ["DW_AUTH_TOKEN"],
|
|
16
17
|
}
|
|
17
18
|
|
|
18
19
|
try:
|
|
@@ -207,6 +208,7 @@ def get_all() -> Dict[str, Optional[str]]:
|
|
|
207
208
|
"hf_token": get_key("hf_token"),
|
|
208
209
|
"kaggle_username": get_key("kaggle_username"),
|
|
209
210
|
"kaggle_key": get_key("kaggle_key"),
|
|
211
|
+
"dataworld_token": get_key("dataworld_token"),
|
|
210
212
|
}
|
|
211
213
|
|
|
212
214
|
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import json
|
|
3
|
+
import argparse
|
|
4
|
+
import tempfile
|
|
5
|
+
import os
|
|
6
|
+
import urllib.request
|
|
7
|
+
import urllib.error
|
|
8
|
+
import urllib.parse
|
|
9
|
+
from typing import Dict, Any, List
|
|
10
|
+
|
|
11
|
+
def _get_token() -> str:
|
|
12
|
+
token = os.environ.get("DW_AUTH_TOKEN")
|
|
13
|
+
if not token:
|
|
14
|
+
raise ValueError("DW_AUTH_TOKEN environment variable is required for data.world")
|
|
15
|
+
return token
|
|
16
|
+
|
|
17
|
+
def _dataset_to_dict(ds: Dict[str, Any]) -> Dict[str, Any]:
|
|
18
|
+
owner_field = ds.get("owner", "")
|
|
19
|
+
if isinstance(owner_field, dict):
|
|
20
|
+
owner = owner_field.get("id") or owner_field.get("name") or ""
|
|
21
|
+
else:
|
|
22
|
+
owner = owner_field or ""
|
|
23
|
+
|
|
24
|
+
id_str = ds.get("id", "")
|
|
25
|
+
title = ds.get("title", "")
|
|
26
|
+
|
|
27
|
+
if (not owner or not id_str) and isinstance(ds.get("resourceLink"), str):
|
|
28
|
+
# Expected format includes /<owner>/<dataset-id>
|
|
29
|
+
parts = ds["resourceLink"].strip("/").split("/")
|
|
30
|
+
if len(parts) >= 2:
|
|
31
|
+
owner = owner or parts[-2]
|
|
32
|
+
id_str = id_str or parts[-1]
|
|
33
|
+
|
|
34
|
+
if isinstance(id_str, str) and "/" in id_str and not owner:
|
|
35
|
+
split_ref = id_str.split("/", 1)
|
|
36
|
+
owner = split_ref[0]
|
|
37
|
+
id_str = split_ref[1]
|
|
38
|
+
|
|
39
|
+
if not owner and not id_str:
|
|
40
|
+
owner = "unknown"
|
|
41
|
+
id_str = "unknown"
|
|
42
|
+
|
|
43
|
+
if not title:
|
|
44
|
+
title = f"{owner}/{id_str}"
|
|
45
|
+
|
|
46
|
+
return {
|
|
47
|
+
"id": f"dataworld:{owner}/{id_str}",
|
|
48
|
+
"name": title,
|
|
49
|
+
"source": "dataworld",
|
|
50
|
+
"description": ds.get("description", f"data.world dataset {title}"),
|
|
51
|
+
"author": owner,
|
|
52
|
+
"license": {
|
|
53
|
+
"id": "Unknown",
|
|
54
|
+
"category": "unknown",
|
|
55
|
+
"commercial_use": None,
|
|
56
|
+
"warnings": []
|
|
57
|
+
},
|
|
58
|
+
"tags": ds.get("tags", []) + ["dataworld"],
|
|
59
|
+
"downloads": 0,
|
|
60
|
+
"likes": 0,
|
|
61
|
+
"created_at": ds.get("created", ""),
|
|
62
|
+
"updated_at": ds.get("updated", ""),
|
|
63
|
+
"size_bytes": 0,
|
|
64
|
+
"quality_score": 0.8,
|
|
65
|
+
"domain": "general",
|
|
66
|
+
"is_gated": False,
|
|
67
|
+
"is_nsfw": False,
|
|
68
|
+
"description_length": len(ds.get("description", "")),
|
|
69
|
+
"has_readme": False,
|
|
70
|
+
"download_url": f"https://data.world/{owner}/{id_str}",
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
def discover(query: str, limit: int = 20) -> Dict[str, Any]:
|
|
74
|
+
try:
|
|
75
|
+
token = _get_token()
|
|
76
|
+
|
|
77
|
+
# data.world simple search API
|
|
78
|
+
url = f"https://api.data.world/v0/search/resources?size={limit}"
|
|
79
|
+
|
|
80
|
+
headers = {
|
|
81
|
+
"Authorization": f"Bearer {token}",
|
|
82
|
+
"Content-Type": "application/json",
|
|
83
|
+
"Accept": "application/json"
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
# Search datasets and include community results to improve recall
|
|
87
|
+
body = {
|
|
88
|
+
"query": query,
|
|
89
|
+
"category": ["dataset"],
|
|
90
|
+
"includeCommunityResults": True,
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
req = urllib.request.Request(url, data=json.dumps(body).encode('utf-8'), headers=headers, method="POST")
|
|
94
|
+
|
|
95
|
+
with urllib.request.urlopen(req) as response:
|
|
96
|
+
data = json.loads(response.read().decode('utf-8'))
|
|
97
|
+
|
|
98
|
+
records = data.get("records", [])
|
|
99
|
+
|
|
100
|
+
# Fallback to advanced endpoint if simple search returns nothing
|
|
101
|
+
if not records:
|
|
102
|
+
adv_url = f"https://api.data.world/v0/search?size={limit}"
|
|
103
|
+
adv_body = {
|
|
104
|
+
"query": query,
|
|
105
|
+
"category": ["dataset"],
|
|
106
|
+
}
|
|
107
|
+
adv_req = urllib.request.Request(
|
|
108
|
+
adv_url,
|
|
109
|
+
data=json.dumps(adv_body).encode("utf-8"),
|
|
110
|
+
headers=headers,
|
|
111
|
+
method="POST",
|
|
112
|
+
)
|
|
113
|
+
with urllib.request.urlopen(adv_req) as response:
|
|
114
|
+
adv_data = json.loads(response.read().decode("utf-8"))
|
|
115
|
+
records = adv_data.get("records", [])
|
|
116
|
+
|
|
117
|
+
items = [_dataset_to_dict(r) for r in records]
|
|
118
|
+
|
|
119
|
+
return {"ok": True, "results": items, "count": len(items)}
|
|
120
|
+
except Exception as e:
|
|
121
|
+
return {"ok": False, "error": f"data.world discover failed: {str(e)}"}
|
|
122
|
+
|
|
123
|
+
def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
|
|
124
|
+
try:
|
|
125
|
+
token = _get_token()
|
|
126
|
+
|
|
127
|
+
# dataset_ref is expected to be "dataworld:owner/id"
|
|
128
|
+
if dataset_ref.startswith("dataworld:"):
|
|
129
|
+
ref = dataset_ref.split(":", 1)[1]
|
|
130
|
+
else:
|
|
131
|
+
ref = dataset_ref
|
|
132
|
+
|
|
133
|
+
parts = ref.split("/")
|
|
134
|
+
if len(parts) != 2:
|
|
135
|
+
return {"ok": False, "error": f"Invalid data.world dataset ID format. Expected owner/id, got {ref}"}
|
|
136
|
+
|
|
137
|
+
owner, dataset_id = parts
|
|
138
|
+
|
|
139
|
+
if not target_dir:
|
|
140
|
+
target_dir = tempfile.mkdtemp(prefix="vesper_dataworld_")
|
|
141
|
+
|
|
142
|
+
os.makedirs(target_dir, exist_ok=True)
|
|
143
|
+
|
|
144
|
+
# First, get the dataset metadata to find the files
|
|
145
|
+
url = f"https://api.data.world/v0/datasets/{owner}/{dataset_id}"
|
|
146
|
+
headers = {
|
|
147
|
+
"Authorization": f"Bearer {token}",
|
|
148
|
+
"Accept": "application/json"
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
req = urllib.request.Request(url, headers=headers)
|
|
152
|
+
with urllib.request.urlopen(req) as response:
|
|
153
|
+
dataset_meta = json.loads(response.read().decode('utf-8'))
|
|
154
|
+
|
|
155
|
+
files = dataset_meta.get("files", [])
|
|
156
|
+
if not files:
|
|
157
|
+
return {"ok": False, "error": "No files found in this dataset"}
|
|
158
|
+
|
|
159
|
+
# Find the best file to download (prefer csv, parquet, jsonl)
|
|
160
|
+
best_file = None
|
|
161
|
+
for ext in [".parquet", ".csv", ".jsonl", ".json"]:
|
|
162
|
+
for f in files:
|
|
163
|
+
if f.get("name", "").lower().endswith(ext):
|
|
164
|
+
best_file = f
|
|
165
|
+
break
|
|
166
|
+
if best_file:
|
|
167
|
+
break
|
|
168
|
+
|
|
169
|
+
if not best_file:
|
|
170
|
+
best_file = files[0] # Just take the first one if no preferred format
|
|
171
|
+
|
|
172
|
+
filename = best_file.get("name")
|
|
173
|
+
|
|
174
|
+
# Download the file
|
|
175
|
+
download_url = f"https://api.data.world/v0/file_download/{owner}/{dataset_id}/{urllib.parse.quote(filename)}"
|
|
176
|
+
|
|
177
|
+
file_path = os.path.join(target_dir, filename)
|
|
178
|
+
|
|
179
|
+
download_req = urllib.request.Request(download_url, headers=headers)
|
|
180
|
+
with urllib.request.urlopen(download_req) as response, open(file_path, 'wb') as out_file:
|
|
181
|
+
out_file.write(response.read())
|
|
182
|
+
|
|
183
|
+
return {
|
|
184
|
+
"ok": True,
|
|
185
|
+
"local_path": file_path,
|
|
186
|
+
"target_dir": target_dir
|
|
187
|
+
}
|
|
188
|
+
except Exception as e:
|
|
189
|
+
return {"ok": False, "error": f"data.world download failed: {str(e)}"}
|
|
190
|
+
|
|
191
|
+
def main():
|
|
192
|
+
parser = argparse.ArgumentParser(description="Vesper data.world Engine")
|
|
193
|
+
parser.add_argument("action", choices=["discover", "download"])
|
|
194
|
+
parser.add_argument("arg1", help="Query for discover, Dataset ID for download")
|
|
195
|
+
parser.add_argument("arg2", nargs="?", help="Limit for discover, Target Dir for download")
|
|
196
|
+
|
|
197
|
+
args = parser.parse_args()
|
|
198
|
+
|
|
199
|
+
if args.action == "discover":
|
|
200
|
+
limit = int(args.arg2) if args.arg2 else 20
|
|
201
|
+
result = discover(args.arg1, limit)
|
|
202
|
+
print(json.dumps(result))
|
|
203
|
+
elif args.action == "download":
|
|
204
|
+
result = download(args.arg1, args.arg2)
|
|
205
|
+
print(json.dumps(result))
|
|
206
|
+
|
|
207
|
+
if __name__ == "__main__":
|
|
208
|
+
main()
|
|
@@ -109,8 +109,28 @@ def discover(query: str, limit: int = 20) -> Dict[str, Any]:
|
|
|
109
109
|
|
|
110
110
|
api: KaggleApi = auth["api"]
|
|
111
111
|
try:
|
|
112
|
-
|
|
113
|
-
|
|
112
|
+
desired = max(1, min(limit, 100))
|
|
113
|
+
|
|
114
|
+
try:
|
|
115
|
+
datasets = api.dataset_list(search=query, page_size=desired)
|
|
116
|
+
items = [_dataset_to_dict(ds) for ds in datasets[:limit]]
|
|
117
|
+
return {"ok": True, "results": items, "count": len(items)}
|
|
118
|
+
except TypeError:
|
|
119
|
+
pass
|
|
120
|
+
|
|
121
|
+
collected = []
|
|
122
|
+
page = 1
|
|
123
|
+
while len(collected) < limit:
|
|
124
|
+
page_items = api.dataset_list(search=query, page=page)
|
|
125
|
+
if not page_items:
|
|
126
|
+
break
|
|
127
|
+
|
|
128
|
+
collected.extend(page_items)
|
|
129
|
+
if len(page_items) < 20:
|
|
130
|
+
break
|
|
131
|
+
page += 1
|
|
132
|
+
|
|
133
|
+
items = [_dataset_to_dict(ds) for ds in collected[:limit]]
|
|
114
134
|
return {"ok": True, "results": items, "count": len(items)}
|
|
115
135
|
except Exception as e:
|
|
116
136
|
return {"ok": False, "error": f"Kaggle discover failed: {str(e)}"}
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import json
|
|
3
|
+
import argparse
|
|
4
|
+
import tempfile
|
|
5
|
+
import os
|
|
6
|
+
from typing import Dict, Any, List
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
import openml
|
|
10
|
+
except ImportError:
|
|
11
|
+
openml = None
|
|
12
|
+
|
|
13
|
+
def _ensure_openml() -> Dict[str, Any]:
|
|
14
|
+
if openml is None:
|
|
15
|
+
return {"ok": False, "error": "openml package is not installed. Run 'pip install openml'"}
|
|
16
|
+
return {"ok": True}
|
|
17
|
+
|
|
18
|
+
def _dataset_to_dict(ds: Dict[str, Any]) -> Dict[str, Any]:
|
|
19
|
+
# OpenML dataset dict from list_datasets
|
|
20
|
+
did = ds.get("did", "")
|
|
21
|
+
name = ds.get("name", f"dataset_{did}")
|
|
22
|
+
version = ds.get("version", "1")
|
|
23
|
+
status = ds.get("status", "active")
|
|
24
|
+
format = ds.get("format", "unknown")
|
|
25
|
+
|
|
26
|
+
# Map to Vesper DatasetMetadata format
|
|
27
|
+
return {
|
|
28
|
+
"id": f"openml:{did}",
|
|
29
|
+
"name": name,
|
|
30
|
+
"source": "openml",
|
|
31
|
+
"description": f"OpenML Dataset {name} (ID: {did}, Version: {version}, Format: {format}, Status: {status})",
|
|
32
|
+
"author": "OpenML Community",
|
|
33
|
+
"license": "Public",
|
|
34
|
+
"tags": ["openml", format.lower()],
|
|
35
|
+
"downloads": ds.get("NumberOfDownloads", 0),
|
|
36
|
+
"likes": ds.get("NumberOfLikes", 0),
|
|
37
|
+
"created_at": ds.get("upload_date", ""),
|
|
38
|
+
"updated_at": ds.get("upload_date", ""),
|
|
39
|
+
"size_bytes": 0, # Not always available in list
|
|
40
|
+
"quality_score": 0.8, # Default good score for OpenML
|
|
41
|
+
"domain": "machine_learning",
|
|
42
|
+
"is_gated": False,
|
|
43
|
+
"is_nsfw": False,
|
|
44
|
+
"description_length": 100,
|
|
45
|
+
"has_readme": False,
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
def discover(query: str, limit: int = 20) -> Dict[str, Any]:
    """Search OpenML datasets by name substring.

    The python client exposes no server-side text search, so the full
    dataset listing is pulled once as a DataFrame, filtered locally with a
    case-insensitive name match, ranked by download count when that column
    exists, and the top `limit` rows are mapped to Vesper metadata dicts.
    """
    check = _ensure_openml()
    if not check.get("ok"):
        return check

    try:
        # One bulk listing; all filtering happens on this side.
        frame = openml.datasets.list_datasets(output_format='dataframe')

        if query:
            frame = frame[frame['name'].str.contains(query, case=False, na=False)]

        if 'NumberOfDownloads' in frame.columns:
            frame = frame.sort_values('NumberOfDownloads', ascending=False)

        rows = frame.head(limit).to_dict(orient='records')
        results = [_dataset_to_dict(row) for row in rows]
        return {"ok": True, "results": results, "count": len(results)}
    except Exception as e:
        return {"ok": False, "error": f"OpenML discover failed: {str(e)}"}
|
|
82
|
+
|
|
83
|
+
def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
    """Download an OpenML dataset and materialize it as a parquet file.

    Args:
        dataset_ref: "openml:<id>" or a bare numeric dataset id.
        target_dir: destination directory; a temp directory is created
            when falsy.

    Returns:
        {"ok": True, "local_path": ..., "target_dir": ...} on success,
        {"ok": False, "error": ...} on any failure.
    """
    check = _ensure_openml()
    if not check.get("ok"):
        return check

    try:
        # dataset_ref is expected to be "openml:ID"
        if dataset_ref.startswith("openml:"):
            did_str = dataset_ref.split(":")[1]
        else:
            did_str = dataset_ref

        did = int(did_str)

        if not target_dir:
            target_dir = tempfile.mkdtemp(prefix="vesper_openml_")
        os.makedirs(target_dir, exist_ok=True)

        dataset = openml.datasets.get_dataset(did, download_data=True, download_qualities=False, download_features_meta_data=False)

        # Fetch the full table exactly once; the previous implementation
        # called get_data() twice and discarded the first (expensive) result.
        df, _, _, _ = dataset.get_data(target=None, dataset_format="dataframe")

        # Sanitize the dataset name so it is filesystem-safe.
        safe_name = "".join(c if c.isalnum() else "_" for c in dataset.name)
        file_path = os.path.join(target_dir, f"{safe_name}_{did}.parquet")
        df.to_parquet(file_path, index=False)

        return {
            "ok": True,
            "local_path": file_path,
            "target_dir": target_dir
        }
    except Exception as e:
        return {"ok": False, "error": f"OpenML download failed: {str(e)}"}
|
|
128
|
+
|
|
129
|
+
def main():
    """CLI entry point for the OpenML engine.

    Emits exactly one JSON object on stdout, per the engine protocol.
    """
    parser = argparse.ArgumentParser(description="Vesper OpenML Engine")
    parser.add_argument("action", choices=["discover", "download"])
    parser.add_argument("arg1", help="Query for discover, Dataset ID for download")
    parser.add_argument("arg2", nargs="?", help="Limit for discover, Target Dir for download")

    args = parser.parse_args()

    if args.action == "discover":
        # A malformed limit must surface as JSON on stdout, not a traceback,
        # because the Node-side caller parses stdout as JSON.
        try:
            limit = int(args.arg2) if args.arg2 else 20
        except ValueError:
            print(json.dumps({"ok": False, "error": f"Invalid limit: {args.arg2}"}))
            return
        result = discover(args.arg1, limit)
        print(json.dumps(result))
    elif args.action == "download":
        result = download(args.arg1, args.arg2)
        print(json.dumps(result))

if __name__ == "__main__":
    main()
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vespermcp/mcp-server",
|
|
3
|
-
"version": "1.2.
|
|
3
|
+
"version": "1.2.4",
|
|
4
4
|
"description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "build/index.js",
|
package/src/python/config.py
CHANGED
|
@@ -13,6 +13,7 @@ KEY_ALIASES = {
|
|
|
13
13
|
"hf_token": ["HF_TOKEN", "HUGGINGFACE_TOKEN"],
|
|
14
14
|
"kaggle_username": ["KAGGLE_USERNAME"],
|
|
15
15
|
"kaggle_key": ["KAGGLE_KEY"],
|
|
16
|
+
"dataworld_token": ["DW_AUTH_TOKEN"],
|
|
16
17
|
}
|
|
17
18
|
|
|
18
19
|
try:
|
|
@@ -207,6 +208,7 @@ def get_all() -> Dict[str, Optional[str]]:
|
|
|
207
208
|
"hf_token": get_key("hf_token"),
|
|
208
209
|
"kaggle_username": get_key("kaggle_username"),
|
|
209
210
|
"kaggle_key": get_key("kaggle_key"),
|
|
211
|
+
"dataworld_token": get_key("dataworld_token"),
|
|
210
212
|
}
|
|
211
213
|
|
|
212
214
|
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import json
|
|
3
|
+
import argparse
|
|
4
|
+
import tempfile
|
|
5
|
+
import os
|
|
6
|
+
import urllib.request
|
|
7
|
+
import urllib.error
|
|
8
|
+
import urllib.parse
|
|
9
|
+
from typing import Dict, Any, List
|
|
10
|
+
|
|
11
|
+
def _get_token() -> str:
|
|
12
|
+
token = os.environ.get("DW_AUTH_TOKEN")
|
|
13
|
+
if not token:
|
|
14
|
+
raise ValueError("DW_AUTH_TOKEN environment variable is required for data.world")
|
|
15
|
+
return token
|
|
16
|
+
|
|
17
|
+
def _dataset_to_dict(ds: Dict[str, Any]) -> Dict[str, Any]:
|
|
18
|
+
owner_field = ds.get("owner", "")
|
|
19
|
+
if isinstance(owner_field, dict):
|
|
20
|
+
owner = owner_field.get("id") or owner_field.get("name") or ""
|
|
21
|
+
else:
|
|
22
|
+
owner = owner_field or ""
|
|
23
|
+
|
|
24
|
+
id_str = ds.get("id", "")
|
|
25
|
+
title = ds.get("title", "")
|
|
26
|
+
|
|
27
|
+
if (not owner or not id_str) and isinstance(ds.get("resourceLink"), str):
|
|
28
|
+
# Expected format includes /<owner>/<dataset-id>
|
|
29
|
+
parts = ds["resourceLink"].strip("/").split("/")
|
|
30
|
+
if len(parts) >= 2:
|
|
31
|
+
owner = owner or parts[-2]
|
|
32
|
+
id_str = id_str or parts[-1]
|
|
33
|
+
|
|
34
|
+
if isinstance(id_str, str) and "/" in id_str and not owner:
|
|
35
|
+
split_ref = id_str.split("/", 1)
|
|
36
|
+
owner = split_ref[0]
|
|
37
|
+
id_str = split_ref[1]
|
|
38
|
+
|
|
39
|
+
if not owner and not id_str:
|
|
40
|
+
owner = "unknown"
|
|
41
|
+
id_str = "unknown"
|
|
42
|
+
|
|
43
|
+
if not title:
|
|
44
|
+
title = f"{owner}/{id_str}"
|
|
45
|
+
|
|
46
|
+
return {
|
|
47
|
+
"id": f"dataworld:{owner}/{id_str}",
|
|
48
|
+
"name": title,
|
|
49
|
+
"source": "dataworld",
|
|
50
|
+
"description": ds.get("description", f"data.world dataset {title}"),
|
|
51
|
+
"author": owner,
|
|
52
|
+
"license": {
|
|
53
|
+
"id": "Unknown",
|
|
54
|
+
"category": "unknown",
|
|
55
|
+
"commercial_use": None,
|
|
56
|
+
"warnings": []
|
|
57
|
+
},
|
|
58
|
+
"tags": ds.get("tags", []) + ["dataworld"],
|
|
59
|
+
"downloads": 0,
|
|
60
|
+
"likes": 0,
|
|
61
|
+
"created_at": ds.get("created", ""),
|
|
62
|
+
"updated_at": ds.get("updated", ""),
|
|
63
|
+
"size_bytes": 0,
|
|
64
|
+
"quality_score": 0.8,
|
|
65
|
+
"domain": "general",
|
|
66
|
+
"is_gated": False,
|
|
67
|
+
"is_nsfw": False,
|
|
68
|
+
"description_length": len(ds.get("description", "")),
|
|
69
|
+
"has_readme": False,
|
|
70
|
+
"download_url": f"https://data.world/{owner}/{id_str}",
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
def discover(query: str, limit: int = 20) -> Dict[str, Any]:
    """Search data.world for datasets matching `query`.

    Tries the simple /search/resources endpoint first (including community
    results for better recall) and falls back to the advanced /search
    endpoint when the first returns no records.
    """
    try:
        token = _get_token()
        headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
            "Accept": "application/json"
        }

        def _post(endpoint: str, payload: Dict[str, Any]) -> Any:
            # POST a JSON body and decode the JSON response.
            request = urllib.request.Request(
                endpoint,
                data=json.dumps(payload).encode("utf-8"),
                headers=headers,
                method="POST",
            )
            with urllib.request.urlopen(request) as response:
                return json.loads(response.read().decode("utf-8"))

        # Search datasets and include community results to improve recall
        primary = _post(
            f"https://api.data.world/v0/search/resources?size={limit}",
            {"query": query, "category": ["dataset"], "includeCommunityResults": True},
        )
        records = primary.get("records", [])

        # Fallback to advanced endpoint if simple search returns nothing
        if not records:
            fallback = _post(
                f"https://api.data.world/v0/search?size={limit}",
                {"query": query, "category": ["dataset"]},
            )
            records = fallback.get("records", [])

        items = [_dataset_to_dict(record) for record in records]
        return {"ok": True, "results": items, "count": len(items)}
    except Exception as e:
        return {"ok": False, "error": f"data.world discover failed: {str(e)}"}
|
|
122
|
+
|
|
123
|
+
def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
    """Download the most tabular-looking file from a data.world dataset.

    Args:
        dataset_ref: "dataworld:owner/id" or bare "owner/id".
        target_dir: destination directory; a temp directory is created
            when falsy.

    Returns:
        {"ok": True, "local_path": ..., "target_dir": ...} on success,
        {"ok": False, "error": ...} on failure.
    """
    try:
        token = _get_token()

        # dataset_ref is expected to be "dataworld:owner/id"
        if dataset_ref.startswith("dataworld:"):
            ref = dataset_ref.split(":", 1)[1]
        else:
            ref = dataset_ref

        parts = ref.split("/")
        if len(parts) != 2:
            return {"ok": False, "error": f"Invalid data.world dataset ID format. Expected owner/id, got {ref}"}

        owner, dataset_id = parts

        if not target_dir:
            target_dir = tempfile.mkdtemp(prefix="vesper_dataworld_")
        os.makedirs(target_dir, exist_ok=True)

        headers = {
            "Authorization": f"Bearer {token}",
            "Accept": "application/json"
        }

        # First, get the dataset metadata to find the files
        url = f"https://api.data.world/v0/datasets/{owner}/{dataset_id}"
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req) as response:
            dataset_meta = json.loads(response.read().decode('utf-8'))

        files = dataset_meta.get("files", [])
        if not files:
            return {"ok": False, "error": "No files found in this dataset"}

        # Find the best file to download (prefer parquet, csv, jsonl, json)
        best_file = None
        for ext in [".parquet", ".csv", ".jsonl", ".json"]:
            for f in files:
                if f.get("name", "").lower().endswith(ext):
                    best_file = f
                    break
            if best_file:
                break

        if not best_file:
            best_file = files[0]  # Just take the first one if no preferred format

        filename = best_file.get("name")

        download_url = f"https://api.data.world/v0/file_download/{owner}/{dataset_id}/{urllib.parse.quote(filename)}"
        file_path = os.path.join(target_dir, filename)

        # File names may contain subdirectories (e.g. "data/part.csv");
        # create them or the open() below fails with FileNotFoundError.
        parent = os.path.dirname(file_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        download_req = urllib.request.Request(download_url, headers=headers)
        with urllib.request.urlopen(download_req) as response, open(file_path, 'wb') as out_file:
            # Stream in 1 MiB chunks instead of buffering the whole file
            # in memory (datasets can be large).
            while True:
                chunk = response.read(1024 * 1024)
                if not chunk:
                    break
                out_file.write(chunk)

        return {
            "ok": True,
            "local_path": file_path,
            "target_dir": target_dir
        }
    except Exception as e:
        return {"ok": False, "error": f"data.world download failed: {str(e)}"}
|
|
190
|
+
|
|
191
|
+
def main():
    """CLI entry point for the data.world engine.

    Emits exactly one JSON object on stdout, per the engine protocol.
    """
    parser = argparse.ArgumentParser(description="Vesper data.world Engine")
    parser.add_argument("action", choices=["discover", "download"])
    parser.add_argument("arg1", help="Query for discover, Dataset ID for download")
    parser.add_argument("arg2", nargs="?", help="Limit for discover, Target Dir for download")

    args = parser.parse_args()

    if args.action == "discover":
        # A malformed limit must surface as JSON on stdout, not a traceback,
        # because the Node-side caller parses stdout as JSON.
        try:
            limit = int(args.arg2) if args.arg2 else 20
        except ValueError:
            print(json.dumps({"ok": False, "error": f"Invalid limit: {args.arg2}"}))
            return
        result = discover(args.arg1, limit)
        print(json.dumps(result))
    elif args.action == "download":
        result = download(args.arg1, args.arg2)
        print(json.dumps(result))

if __name__ == "__main__":
    main()
|
|
@@ -109,8 +109,28 @@ def discover(query: str, limit: int = 20) -> Dict[str, Any]:
|
|
|
109
109
|
|
|
110
110
|
api: KaggleApi = auth["api"]
|
|
111
111
|
try:
|
|
112
|
-
|
|
113
|
-
|
|
112
|
+
desired = max(1, min(limit, 100))
|
|
113
|
+
|
|
114
|
+
try:
|
|
115
|
+
datasets = api.dataset_list(search=query, page_size=desired)
|
|
116
|
+
items = [_dataset_to_dict(ds) for ds in datasets[:limit]]
|
|
117
|
+
return {"ok": True, "results": items, "count": len(items)}
|
|
118
|
+
except TypeError:
|
|
119
|
+
pass
|
|
120
|
+
|
|
121
|
+
collected = []
|
|
122
|
+
page = 1
|
|
123
|
+
while len(collected) < limit:
|
|
124
|
+
page_items = api.dataset_list(search=query, page=page)
|
|
125
|
+
if not page_items:
|
|
126
|
+
break
|
|
127
|
+
|
|
128
|
+
collected.extend(page_items)
|
|
129
|
+
if len(page_items) < 20:
|
|
130
|
+
break
|
|
131
|
+
page += 1
|
|
132
|
+
|
|
133
|
+
items = [_dataset_to_dict(ds) for ds in collected[:limit]]
|
|
114
134
|
return {"ok": True, "results": items, "count": len(items)}
|
|
115
135
|
except Exception as e:
|
|
116
136
|
return {"ok": False, "error": f"Kaggle discover failed: {str(e)}"}
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import json
|
|
3
|
+
import argparse
|
|
4
|
+
import tempfile
|
|
5
|
+
import os
|
|
6
|
+
from typing import Dict, Any, List
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
import openml
|
|
10
|
+
except ImportError:
|
|
11
|
+
openml = None
|
|
12
|
+
|
|
13
|
+
def _ensure_openml() -> Dict[str, Any]:
|
|
14
|
+
if openml is None:
|
|
15
|
+
return {"ok": False, "error": "openml package is not installed. Run 'pip install openml'"}
|
|
16
|
+
return {"ok": True}
|
|
17
|
+
|
|
18
|
+
def _dataset_to_dict(ds: Dict[str, Any]) -> Dict[str, Any]:
|
|
19
|
+
# OpenML dataset dict from list_datasets
|
|
20
|
+
did = ds.get("did", "")
|
|
21
|
+
name = ds.get("name", f"dataset_{did}")
|
|
22
|
+
version = ds.get("version", "1")
|
|
23
|
+
status = ds.get("status", "active")
|
|
24
|
+
format = ds.get("format", "unknown")
|
|
25
|
+
|
|
26
|
+
# Map to Vesper DatasetMetadata format
|
|
27
|
+
return {
|
|
28
|
+
"id": f"openml:{did}",
|
|
29
|
+
"name": name,
|
|
30
|
+
"source": "openml",
|
|
31
|
+
"description": f"OpenML Dataset {name} (ID: {did}, Version: {version}, Format: {format}, Status: {status})",
|
|
32
|
+
"author": "OpenML Community",
|
|
33
|
+
"license": "Public",
|
|
34
|
+
"tags": ["openml", format.lower()],
|
|
35
|
+
"downloads": ds.get("NumberOfDownloads", 0),
|
|
36
|
+
"likes": ds.get("NumberOfLikes", 0),
|
|
37
|
+
"created_at": ds.get("upload_date", ""),
|
|
38
|
+
"updated_at": ds.get("upload_date", ""),
|
|
39
|
+
"size_bytes": 0, # Not always available in list
|
|
40
|
+
"quality_score": 0.8, # Default good score for OpenML
|
|
41
|
+
"domain": "machine_learning",
|
|
42
|
+
"is_gated": False,
|
|
43
|
+
"is_nsfw": False,
|
|
44
|
+
"description_length": 100,
|
|
45
|
+
"has_readme": False,
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
def discover(query: str, limit: int = 20) -> Dict[str, Any]:
|
|
49
|
+
check = _ensure_openml()
|
|
50
|
+
if not check.get("ok"):
|
|
51
|
+
return check
|
|
52
|
+
|
|
53
|
+
try:
|
|
54
|
+
# OpenML list_datasets doesn't have a direct text search in the python API easily exposed without downloading all.
|
|
55
|
+
# But we can filter by tag or just get a list and filter locally if needed, or use the REST API directly.
|
|
56
|
+
# Actually, openml.datasets.list_datasets() returns a dict of datasets.
|
|
57
|
+
# We can fetch a larger batch and filter by name/keyword.
|
|
58
|
+
|
|
59
|
+
# Fetching a batch of datasets
|
|
60
|
+
datasets = openml.datasets.list_datasets(output_format='dataframe')
|
|
61
|
+
|
|
62
|
+
if query:
|
|
63
|
+
# Simple case-insensitive search in name
|
|
64
|
+
mask = datasets['name'].str.contains(query, case=False, na=False)
|
|
65
|
+
filtered = datasets[mask]
|
|
66
|
+
else:
|
|
67
|
+
filtered = datasets
|
|
68
|
+
|
|
69
|
+
# Sort by NumberOfDownloads if available, else just take top
|
|
70
|
+
if 'NumberOfDownloads' in filtered.columns:
|
|
71
|
+
filtered = filtered.sort_values('NumberOfDownloads', ascending=False)
|
|
72
|
+
|
|
73
|
+
top_k = filtered.head(limit)
|
|
74
|
+
|
|
75
|
+
# Convert to list of dicts
|
|
76
|
+
records = top_k.to_dict(orient='records')
|
|
77
|
+
items = [_dataset_to_dict(r) for r in records]
|
|
78
|
+
|
|
79
|
+
return {"ok": True, "results": items, "count": len(items)}
|
|
80
|
+
except Exception as e:
|
|
81
|
+
return {"ok": False, "error": f"OpenML discover failed: {str(e)}"}
|
|
82
|
+
|
|
83
|
+
def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
|
|
84
|
+
check = _ensure_openml()
|
|
85
|
+
if not check.get("ok"):
|
|
86
|
+
return check
|
|
87
|
+
|
|
88
|
+
try:
|
|
89
|
+
# dataset_ref is expected to be "openml:ID"
|
|
90
|
+
if dataset_ref.startswith("openml:"):
|
|
91
|
+
did_str = dataset_ref.split(":")[1]
|
|
92
|
+
else:
|
|
93
|
+
did_str = dataset_ref
|
|
94
|
+
|
|
95
|
+
did = int(did_str)
|
|
96
|
+
|
|
97
|
+
if not target_dir:
|
|
98
|
+
target_dir = tempfile.mkdtemp(prefix="vesper_openml_")
|
|
99
|
+
|
|
100
|
+
os.makedirs(target_dir, exist_ok=True)
|
|
101
|
+
|
|
102
|
+
# Get the dataset
|
|
103
|
+
dataset = openml.datasets.get_dataset(did, download_data=True, download_qualities=False, download_features_meta_data=False)
|
|
104
|
+
|
|
105
|
+
# Get the pandas dataframe
|
|
106
|
+
X, y, categorical_indicator, attribute_names = dataset.get_data(
|
|
107
|
+
dataset_format="dataframe"
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
# If there's a target column (y), we might want to join it back if it was separated
|
|
111
|
+
# get_data() can return X and y separately if target is specified, but usually X contains everything if target=None
|
|
112
|
+
# Let's just get everything
|
|
113
|
+
df, _, _, _ = dataset.get_data(target=None, dataset_format="dataframe")
|
|
114
|
+
|
|
115
|
+
# Save to parquet in the target directory
|
|
116
|
+
safe_name = "".join([c if c.isalnum() else "_" for c in dataset.name])
|
|
117
|
+
file_path = os.path.join(target_dir, f"{safe_name}_{did}.parquet")
|
|
118
|
+
|
|
119
|
+
df.to_parquet(file_path, index=False)
|
|
120
|
+
|
|
121
|
+
return {
|
|
122
|
+
"ok": True,
|
|
123
|
+
"local_path": file_path,
|
|
124
|
+
"target_dir": target_dir
|
|
125
|
+
}
|
|
126
|
+
except Exception as e:
|
|
127
|
+
return {"ok": False, "error": f"OpenML download failed: {str(e)}"}
|
|
128
|
+
|
|
129
|
+
def main():
|
|
130
|
+
parser = argparse.ArgumentParser(description="Vesper OpenML Engine")
|
|
131
|
+
parser.add_argument("action", choices=["discover", "download"])
|
|
132
|
+
parser.add_argument("arg1", help="Query for discover, Dataset ID for download")
|
|
133
|
+
parser.add_argument("arg2", nargs="?", help="Limit for discover, Target Dir for download")
|
|
134
|
+
|
|
135
|
+
args = parser.parse_args()
|
|
136
|
+
|
|
137
|
+
if args.action == "discover":
|
|
138
|
+
limit = int(args.arg2) if args.arg2 else 20
|
|
139
|
+
result = discover(args.arg1, limit)
|
|
140
|
+
print(json.dumps(result))
|
|
141
|
+
elif args.action == "download":
|
|
142
|
+
result = download(args.arg1, args.arg2)
|
|
143
|
+
print(json.dumps(result))
|
|
144
|
+
|
|
145
|
+
if __name__ == "__main__":
|
|
146
|
+
main()
|