@vespermcp/mcp-server 1.2.4 → 1.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +229 -21
- package/build/python/asset_downloader_engine.py +73 -0
- package/build/python/vesper/__init__.py +1 -0
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +1 -0
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +415 -0
- package/build/python/vesper/core/download_recipe.py +104 -0
- package/package.json +2 -2
- package/scripts/postinstall.cjs +6 -1
- package/src/python/asset_downloader_engine.py +73 -0
- package/src/python/vesper/__init__.py +1 -0
- package/src/python/vesper/core/__init__.py +1 -0
- package/src/python/vesper/core/asset_downloader.py +415 -0
- package/src/python/vesper/core/download_recipe.py +104 -0
package/build/index.js
CHANGED
|
@@ -131,8 +131,83 @@ function extractRequestedRows(query, requirements) {
|
|
|
131
131
|
return Math.max(...allNums);
|
|
132
132
|
return undefined;
|
|
133
133
|
}
|
|
134
|
+
const verifiedPythonModules = new Set();
|
|
135
|
+
/**
 * Picks the executable name used to launch Python child processes.
 *
 * @returns {string} "py" when process.platform is "win32", otherwise "python".
 */
function getPythonCommand() {
    if (process.platform === "win32") {
        return "py";
    }
    return "python";
}
|
|
138
|
+
/**
 * Spawns the launcher returned by getPythonCommand() with the given arguments
 * and buffers everything it writes to stdout and stderr.
 *
 * The child receives the current environment plus
 * PIP_DISABLE_PIP_VERSION_CHECK=1 and PYTHONUTF8=1.
 *
 * @param {string[]} args - Arguments passed to the Python launcher.
 * @param {number} [timeoutMs=300000] - Milliseconds to wait before the child
 *   is killed and the promise rejects.
 * @returns {Promise<{code: number, stdout: string, stderr: string}>} Resolves
 *   when the child closes, whatever its exit code; a null exit code is
 *   reported as 1. Rejects when the process emits "error" or when the timeout
 *   elapses first.
 */
function runPythonProcess(args, timeoutMs = 300000) {
    const pyCmd = getPythonCommand();
    return new Promise((resolve, reject) => {
        const proc = spawn(pyCmd, args, {
            env: {
                ...process.env,
                PIP_DISABLE_PIP_VERSION_CHECK: "1",
                PYTHONUTF8: "1",
            },
        });
        // Decode at the stream level: the stream's decoder keeps the trailing
        // bytes of an incomplete UTF-8 sequence until the next chunk arrives,
        // whereas calling toString() on each Buffer would turn a character
        // split across two chunks into replacement characters.
        proc.stdout.setEncoding("utf8");
        proc.stderr.setEncoding("utf8");
        let stdout = "";
        let stderr = "";
        proc.stdout.on("data", (chunk) => {
            stdout += chunk;
        });
        proc.stderr.on("data", (chunk) => {
            stderr += chunk;
        });
        const timer = setTimeout(() => {
            try {
                proc.kill();
            }
            catch {
                // Killing is best-effort; the rejection below is what the caller sees.
            }
            reject(new Error(`Python command timed out after ${timeoutMs}ms: ${args.join(" ")}`));
        }, timeoutMs);
        proc.on("close", (code) => {
            clearTimeout(timer);
            // After a timeout the promise is already rejected, so this resolve is a no-op.
            resolve({ code: code ?? 1, stdout, stderr });
        });
        proc.on("error", (error) => {
            clearTimeout(timer);
            reject(error);
        });
    });
}
|
|
171
|
+
/**
 * Asks the Python interpreter whether importlib can locate a module.
 *
 * @param {string} moduleName - Importable module name to probe.
 * @returns {Promise<boolean>} True when the probe process exits with code 0.
 */
async function isPythonModuleImportable(moduleName) {
    const probe = await runPythonProcess([
        "-c",
        `import importlib.util,sys; sys.exit(0 if importlib.util.find_spec(${JSON.stringify(moduleName)}) else 1)`
    ], 20000);
    return probe.code === 0;
}

/**
 * Makes sure every listed Python module can be imported, installing the
 * matching pip packages when it cannot.
 *
 * Modules that have been confirmed once are remembered in
 * verifiedPythonModules and are not probed again. Missing packages are
 * installed with `pip install`; if that fails, the install is retried with
 * `--user`. After a successful install each module is probed again, and only
 * modules that are then importable are remembered.
 *
 * @param {Array<{module: string, packageName: string}>} modulePackagePairs -
 *   Module names to check, each with the pip package that provides it.
 * @returns {Promise<void>}
 * @throws {Error} When both pip attempts fail, or when pip reports success
 *   but a module still cannot be located.
 */
async function ensurePythonModules(modulePackagePairs) {
    const missing = [];
    for (const pair of modulePackagePairs) {
        if (verifiedPythonModules.has(pair.module)) {
            continue;
        }
        if (await isPythonModuleImportable(pair.module)) {
            verifiedPythonModules.add(pair.module);
        }
        else {
            missing.push(pair);
        }
    }
    if (missing.length === 0) {
        return;
    }
    // Several modules may map to the same package; install each package once.
    const packages = [...new Set(missing.map(m => m.packageName))];
    console.error(`[Vesper] Installing missing Python packages: ${packages.join(", ")}`);
    const installArgs = ["-m", "pip", "install", "--disable-pip-version-check", ...packages];
    let install = await runPythonProcess(installArgs, 600000);
    if (install.code !== 0) {
        console.error(`[Vesper] pip install failed (trying --user fallback): ${(install.stderr || "").slice(0, 300)}`);
        const userInstallArgs = ["-m", "pip", "install", "--disable-pip-version-check", "--user", ...packages];
        install = await runPythonProcess(userInstallArgs, 600000);
    }
    if (install.code !== 0) {
        const details = (install.stderr || install.stdout || "Unknown pip install error").trim();
        throw new Error(`Failed to install required Python packages (${packages.join(", ")}). ${details}`);
    }
    // A zero exit code from pip does not prove the module is importable
    // (the package may expose a different module name, or the install
    // location may not be on the interpreter's path), so confirm before caching.
    const stillMissing = [];
    for (const pair of missing) {
        if (await isPythonModuleImportable(pair.module)) {
            verifiedPythonModules.add(pair.module);
        }
        else {
            stillMissing.push(pair.module);
        }
    }
    if (stillMissing.length > 0) {
        throw new Error(`Python packages were installed (${packages.join(", ")}) but these modules still cannot be imported: ${stillMissing.join(", ")}`);
    }
    console.error(`[Vesper] Successfully installed: ${packages.join(", ")}`);
}
|
|
134
209
|
function runPythonJson(scriptPath, args) {
|
|
135
|
-
const pyCmd =
|
|
210
|
+
const pyCmd = getPythonCommand();
|
|
136
211
|
return new Promise((resolve, reject) => {
|
|
137
212
|
const proc = spawn(pyCmd, [scriptPath, ...args]);
|
|
138
213
|
let stdout = "";
|
|
@@ -167,6 +242,25 @@ function syncPythonScripts(appRoot, dataRoot) {
|
|
|
167
242
|
const pythonDest = path.join(dataRoot, "python");
|
|
168
243
|
if (!fs.existsSync(pythonDest))
|
|
169
244
|
fs.mkdirSync(pythonDest, { recursive: true });
|
|
245
|
+
/**
 * Lists every ".py" file found beneath a directory, descending into
 * subdirectories.
 *
 * @param {string} dir - Directory to scan.
 * @returns {string[]} Paths built by joining `dir` with each entry name along
 *   the way. Empty when `dir` does not exist or is not a directory.
 */
const collectPyFiles = (dir) => {
    // existsSync alone is true for a regular file, and readdirSync on a file
    // throws ENOTDIR, so also require the path to be a directory.
    if (!fs.existsSync(dir) || !fs.statSync(dir).isDirectory())
        return [];
    const out = [];
    // Explicit stack instead of recursion; traversal order is not significant.
    const stack = [dir];
    while (stack.length > 0) {
        const cur = stack.pop();
        for (const entry of fs.readdirSync(cur, { withFileTypes: true })) {
            const full = path.join(cur, entry.name);
            if (entry.isDirectory()) {
                stack.push(full);
            }
            else if (entry.isFile() && full.endsWith(".py")) {
                out.push(full);
            }
        }
    }
    return out;
};
|
|
170
264
|
// Sources to check for Python scripts
|
|
171
265
|
const sources = [
|
|
172
266
|
path.join(appRoot, "src", "python"),
|
|
@@ -175,25 +269,21 @@ function syncPythonScripts(appRoot, dataRoot) {
|
|
|
175
269
|
];
|
|
176
270
|
let syncedCount = 0;
|
|
177
271
|
for (const src of sources) {
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
fs.copyFileSync(srcPath, destPath);
|
|
194
|
-
syncedCount++;
|
|
195
|
-
}
|
|
196
|
-
}
|
|
272
|
+
const files = collectPyFiles(src);
|
|
273
|
+
for (const srcPath of files) {
|
|
274
|
+
const rel = path.relative(src, srcPath);
|
|
275
|
+
const destPath = path.join(pythonDest, rel);
|
|
276
|
+
const srcStat = fs.statSync(srcPath);
|
|
277
|
+
let shouldCopy = true;
|
|
278
|
+
if (fs.existsSync(destPath)) {
|
|
279
|
+
const destStat = fs.statSync(destPath);
|
|
280
|
+
if (srcStat.size === destStat.size)
|
|
281
|
+
shouldCopy = false;
|
|
282
|
+
}
|
|
283
|
+
if (shouldCopy) {
|
|
284
|
+
fs.mkdirSync(path.dirname(destPath), { recursive: true });
|
|
285
|
+
fs.copyFileSync(srcPath, destPath);
|
|
286
|
+
syncedCount++;
|
|
197
287
|
}
|
|
198
288
|
}
|
|
199
289
|
}
|
|
@@ -471,6 +561,25 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
471
561
|
required: ["source", "dataset_id"],
|
|
472
562
|
},
|
|
473
563
|
},
|
|
564
|
+
{
|
|
565
|
+
name: "vesper_download_assets",
|
|
566
|
+
description: "Download real image/media assets using smart source-aware recipes (HuggingFace, Kaggle, direct URL).",
|
|
567
|
+
inputSchema: {
|
|
568
|
+
type: "object",
|
|
569
|
+
properties: {
|
|
570
|
+
dataset_id: { type: "string", description: "Unique dataset identifier." },
|
|
571
|
+
source: { type: "string", enum: ["huggingface", "kaggle", "url"], description: "Asset source type." },
|
|
572
|
+
repo_id: { type: "string", description: "Repo ID for HuggingFace (e.g. cifar100)." },
|
|
573
|
+
kaggle_ref: { type: "string", description: "Kaggle dataset ref (owner/dataset)." },
|
|
574
|
+
urls: { type: "array", items: { type: "string" }, description: "Direct asset URLs." },
|
|
575
|
+
output_format: { type: "string", enum: ["webdataset", "imagefolder", "parquet"], description: "Output asset format." },
|
|
576
|
+
max_items: { type: "number", description: "Optional cap on number of assets to fetch." },
|
|
577
|
+
workers: { type: "number", description: "Parallel worker count (default 8)." },
|
|
578
|
+
image_column: { type: "string", description: "Optional explicit image column for HuggingFace datasets." },
|
|
579
|
+
},
|
|
580
|
+
required: ["dataset_id", "source"],
|
|
581
|
+
},
|
|
582
|
+
},
|
|
474
583
|
{
|
|
475
584
|
name: "configure_kaggle",
|
|
476
585
|
description: "Optionally store Kaggle API credentials for Kaggle discover/download. Core Vesper works without this.",
|
|
@@ -571,6 +680,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
571
680
|
properties: {
|
|
572
681
|
query: { type: "string" },
|
|
573
682
|
requirements: { type: "string" },
|
|
683
|
+
download_images: { type: "boolean", description: "When true, enables post-prepare smart asset downloading for image/media datasets." },
|
|
574
684
|
cleaning_options: { type: "object" },
|
|
575
685
|
split_config: { type: "object" },
|
|
576
686
|
},
|
|
@@ -813,6 +923,24 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
813
923
|
const hf = new HuggingFaceScraper();
|
|
814
924
|
results = await hf.scrape(Math.max(1, limit), true, query);
|
|
815
925
|
}
|
|
926
|
+
const recipeScript = path.join(dataRoot, "python", "asset_downloader_engine.py");
|
|
927
|
+
for (const ds of results.slice(0, limit)) {
|
|
928
|
+
const info = {
|
|
929
|
+
dataset_id: ds.id,
|
|
930
|
+
id: ds.id,
|
|
931
|
+
source: ds.source,
|
|
932
|
+
repo_id: ds.id,
|
|
933
|
+
total_images: ds.total_examples || 0,
|
|
934
|
+
image_column: undefined,
|
|
935
|
+
recipes_dir: path.join(dataRoot, "recipes"),
|
|
936
|
+
};
|
|
937
|
+
try {
|
|
938
|
+
await runPythonJson(recipeScript, ["build_recipe", JSON.stringify(info)]);
|
|
939
|
+
}
|
|
940
|
+
catch {
|
|
941
|
+
// best-effort recipe generation; ignore discovery-time recipe failures
|
|
942
|
+
}
|
|
943
|
+
}
|
|
816
944
|
const formattedOutput = formatSearchResults(results.slice(0, limit));
|
|
817
945
|
return {
|
|
818
946
|
content: [{ type: "text", text: formattedOutput }]
|
|
@@ -857,6 +985,85 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
857
985
|
};
|
|
858
986
|
}
|
|
859
987
|
}
|
|
988
|
+
case "vesper_download_assets": {
|
|
989
|
+
hydrateExternalKeys();
|
|
990
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
991
|
+
const source = String(request.params.arguments?.source || "").trim().toLowerCase();
|
|
992
|
+
const repoId = request.params.arguments?.repo_id ? String(request.params.arguments.repo_id) : undefined;
|
|
993
|
+
const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
|
|
994
|
+
const urls = Array.isArray(request.params.arguments?.urls)
|
|
995
|
+
? (request.params.arguments?.urls).map(v => String(v))
|
|
996
|
+
: undefined;
|
|
997
|
+
const outputFormat = String(request.params.arguments?.output_format || "webdataset");
|
|
998
|
+
const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
|
|
999
|
+
const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
|
|
1000
|
+
const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
|
|
1001
|
+
if (!datasetId || !source) {
|
|
1002
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
|
|
1003
|
+
}
|
|
1004
|
+
if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
|
|
1005
|
+
return {
|
|
1006
|
+
content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
|
|
1007
|
+
isError: true,
|
|
1008
|
+
};
|
|
1009
|
+
}
|
|
1010
|
+
const requiredModules = [
|
|
1011
|
+
{ module: "aiohttp", packageName: "aiohttp" },
|
|
1012
|
+
];
|
|
1013
|
+
if (source === "url") {
|
|
1014
|
+
requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
|
|
1015
|
+
}
|
|
1016
|
+
if (source === "huggingface") {
|
|
1017
|
+
requiredModules.push({ module: "datasets", packageName: "datasets" });
|
|
1018
|
+
}
|
|
1019
|
+
if (source === "kaggle") {
|
|
1020
|
+
requiredModules.push({ module: "kaggle", packageName: "kaggle" });
|
|
1021
|
+
}
|
|
1022
|
+
if (outputFormat === "webdataset") {
|
|
1023
|
+
requiredModules.push({ module: "webdataset", packageName: "webdataset" });
|
|
1024
|
+
}
|
|
1025
|
+
try {
|
|
1026
|
+
await ensurePythonModules(requiredModules);
|
|
1027
|
+
}
|
|
1028
|
+
catch (error) {
|
|
1029
|
+
return {
|
|
1030
|
+
content: [{ type: "text", text: `ERROR: Python dependency setup failed: ${error.message}` }],
|
|
1031
|
+
isError: true,
|
|
1032
|
+
};
|
|
1033
|
+
}
|
|
1034
|
+
const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
|
|
1035
|
+
const payload = {
|
|
1036
|
+
dataset_id: datasetId,
|
|
1037
|
+
source,
|
|
1038
|
+
repo_id: repoId,
|
|
1039
|
+
kaggle_ref: kaggleRef,
|
|
1040
|
+
urls,
|
|
1041
|
+
output_format: outputFormat,
|
|
1042
|
+
max_items: maxItems,
|
|
1043
|
+
workers,
|
|
1044
|
+
image_column: imageColumn,
|
|
1045
|
+
output_root: path.join(dataRoot, "data", "assets"),
|
|
1046
|
+
recipes_dir: path.join(dataRoot, "recipes"),
|
|
1047
|
+
};
|
|
1048
|
+
try {
|
|
1049
|
+
const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
|
|
1050
|
+
if (!result?.ok) {
|
|
1051
|
+
return {
|
|
1052
|
+
content: [{ type: "text", text: `ERROR: asset download failed: ${result?.error || "Unknown error"}` }],
|
|
1053
|
+
isError: true,
|
|
1054
|
+
};
|
|
1055
|
+
}
|
|
1056
|
+
return {
|
|
1057
|
+
content: [{ type: "text", text: JSON.stringify(result.result, null, 2) }],
|
|
1058
|
+
};
|
|
1059
|
+
}
|
|
1060
|
+
catch (error) {
|
|
1061
|
+
return {
|
|
1062
|
+
content: [{ type: "text", text: `ERROR: asset downloader execution failed: ${error.message}` }],
|
|
1063
|
+
isError: true,
|
|
1064
|
+
};
|
|
1065
|
+
}
|
|
1066
|
+
}
|
|
860
1067
|
case "configure_kaggle": {
|
|
861
1068
|
const username = String(request.params.arguments?.username || "").trim();
|
|
862
1069
|
const key = String(request.params.arguments?.key || "").trim();
|
|
@@ -1033,7 +1240,8 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1033
1240
|
case "prepare_dataset": {
|
|
1034
1241
|
const query = String(request.params.arguments?.query);
|
|
1035
1242
|
const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
|
|
1036
|
-
const
|
|
1243
|
+
const downloadImages = request.params.arguments?.download_images === true;
|
|
1244
|
+
const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
|
|
1037
1245
|
return {
|
|
1038
1246
|
content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you.` }]
|
|
1039
1247
|
};
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import asyncio
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Dict
|
|
8
|
+
|
|
9
|
+
CURRENT_DIR = Path(__file__).resolve().parent
|
|
10
|
+
if str(CURRENT_DIR) not in sys.path:
|
|
11
|
+
sys.path.insert(0, str(CURRENT_DIR))
|
|
12
|
+
|
|
13
|
+
from vesper.core.asset_downloader import AssetDownloader
|
|
14
|
+
from vesper.core.download_recipe import build_download_recipe, save_recipe, get_download_recipe
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _print(payload: Dict[str, Any]) -> None:
|
|
18
|
+
print(json.dumps(payload, ensure_ascii=False))
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
async def _run_download(args: argparse.Namespace) -> Dict[str, Any]:
|
|
22
|
+
payload = json.loads(args.payload)
|
|
23
|
+
output_root = payload.get("output_root") or str(Path.home() / ".vesper" / "data" / "assets")
|
|
24
|
+
workers = int(payload.get("workers") or 8)
|
|
25
|
+
recipes_dir = payload.get("recipes_dir")
|
|
26
|
+
|
|
27
|
+
downloader = AssetDownloader(output_root=output_root, workers=workers, recipes_dir=recipes_dir)
|
|
28
|
+
|
|
29
|
+
result = await downloader.download_assets(
|
|
30
|
+
dataset_id=str(payload.get("dataset_id")),
|
|
31
|
+
source=payload.get("source"),
|
|
32
|
+
repo_id=payload.get("repo_id"),
|
|
33
|
+
kaggle_ref=payload.get("kaggle_ref"),
|
|
34
|
+
urls=payload.get("urls"),
|
|
35
|
+
output_format=payload.get("output_format", "webdataset"),
|
|
36
|
+
max_items=payload.get("max_items"),
|
|
37
|
+
image_column=payload.get("image_column"),
|
|
38
|
+
)
|
|
39
|
+
return {"ok": True, "result": result}
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def main() -> None:
    """Command-line entry point for the asset downloader engine.

    Expects two positional arguments: an action (``download``,
    ``build_recipe`` or ``get_recipe``) and a JSON payload string. The outcome
    is printed as one JSON object: ``{"ok": True, ...}`` on success, or
    ``{"ok": False, "error": ...}`` when anything inside the action raises.
    """
    parser = argparse.ArgumentParser(description="Vesper Smart Asset Downloader Engine")
    parser.add_argument("action", choices=["download", "build_recipe", "get_recipe"])
    parser.add_argument("payload", help="JSON payload")
    args = parser.parse_args()

    try:
        if args.action == "download":
            response = asyncio.run(_run_download(args))
            _print(response)
            return

        payload = json.loads(args.payload)
        if args.action == "build_recipe":
            recipe = build_download_recipe(payload)
            saved = save_recipe(recipe, payload.get("recipes_dir"))
            _print({"ok": True, "recipe": recipe, "saved_to": saved})
            return

        if args.action == "get_recipe":
            # Reject a missing id instead of looking up the literal string "None".
            dataset_id = payload.get("dataset_id")
            if dataset_id is None or not str(dataset_id).strip():
                raise ValueError("dataset_id is required")
            recipe = get_download_recipe(str(dataset_id), payload.get("recipes_dir"))
            _print({"ok": True, "recipe": recipe})
            return

        # Defensive fallback; argparse's `choices` already rejects other actions.
        _print({"ok": False, "error": f"Unknown action: {args.action}"})
    except Exception as e:
        # Some exceptions have an empty message; report the class name then so
        # the error field is never blank.
        _print({"ok": False, "error": str(e) or type(e).__name__})


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Vesper Python runtime package."""
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Core data engines for Vesper."""
|
|
Binary file
|