vesper-wizard 2.0.8 → 2.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +970 -852
- package/build/metadata/scraper.js +49 -6
- package/build/python/export_engine.py +45 -0
- package/build/python/normalize_engine.py +83 -0
- package/build/search/engine.js +28 -0
- package/package.json +1 -1
- package/src/python/export_engine.py +45 -0
- package/src/python/normalize_engine.py +83 -0
package/build/index.js
CHANGED
@@ -266,6 +266,34 @@ function logError(err, context) {
     fs.appendFileSync(errorLogPath, msg);
     console.error(`[Vesper] Critical error logged to ${errorLogPath}`);
 }
+// --- Request Queue: serialize all MCP tool calls to prevent crashes ---
+class RequestQueue {
+    queue = [];
+    running = false;
+    enqueue(task) {
+        return new Promise((resolve, reject) => {
+            this.queue.push({ resolve, reject, task });
+            this.drain();
+        });
+    }
+    async drain() {
+        if (this.running)
+            return;
+        this.running = true;
+        while (this.queue.length > 0) {
+            const item = this.queue.shift();
+            try {
+                const result = await item.task();
+                item.resolve(result);
+            }
+            catch (err) {
+                item.reject(err);
+            }
+        }
+        this.running = false;
+    }
+}
+const requestQueue = new RequestQueue();
 const SPINNER_FRAMES = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];
 function printLaunchScreen() {
     const screen = `
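The `RequestQueue` added above is a plain promise-chain serializer: `enqueue` defers each task's settlement behind a promise, and `drain` works the FIFO one task at a time, so overlapping MCP tool calls can no longer interleave. A minimal sketch of the guarantee, using the queue logic from the hunk above (the `slow` helper and its timings are hypothetical):

```js
// Sketch: two overlapping enqueue() calls still run strictly one after another.
class RequestQueue {
    queue = [];
    running = false;
    enqueue(task) {
        return new Promise((resolve, reject) => {
            this.queue.push({ resolve, reject, task });
            this.drain();
        });
    }
    async drain() {
        if (this.running) return;          // a drain loop is already working the queue
        this.running = true;
        while (this.queue.length > 0) {
            const item = this.queue.shift();
            try { item.resolve(await item.task()); }
            catch (err) { item.reject(err); }
        }
        this.running = false;
    }
}

const q = new RequestQueue();
const slow = (name, ms) => () =>
    new Promise(res => setTimeout(() => { console.log(name, "done"); res(name); }, ms));

// Both are submitted immediately, but "a" fully completes before "b" starts,
// even though "b" would finish first if they ran concurrently.
q.enqueue(slow("a", 200));
q.enqueue(slow("b", 50)).then(v => console.log("serialized:", v));
```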
@@ -599,6 +627,18 @@ jobManager.on("processJob", async (job, execute) => {
 async function handlePrepareJob(jobId, query, requirements) {
     hydrateExternalKeys();
     const update = (updates) => jobManager.updateJob(jobId, updates);
+    const pipelineSteps = ["search", "validate", "download", "normalize", "quality", "register"];
+    const stepStatus = {};
+    for (const s of pipelineSteps)
+        stepStatus[s] = "pending";
+    const markPipelineStep = (step, status) => {
+        stepStatus[step] = status;
+        const summary = pipelineSteps.map(s => {
+            const st = stepStatus[s];
+            return st === "done" ? `[${s}]` : st === "running" ? `>${s}<` : st === "failed" ? `!${s}!` : st === "skipped" ? `~${s}~` : ` ${s} `;
+        }).join(" → ");
+        console.error(`[Pipeline] ${summary}`);
+    };
     // Ensure core Python packages are available for dataset operations
     try {
         await ensurePythonModules([
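`markPipelineStep` gives each prepare job a one-line progress readout on stderr, re-rendered after every state transition. A standalone sketch of the same rendering logic (step names and marker glyphs come from the hunk; the sample transitions are made up):

```js
// Sketch: render pipeline status the way markPipelineStep does.
const pipelineSteps = ["search", "validate", "download", "normalize", "quality", "register"];
const stepStatus = Object.fromEntries(pipelineSteps.map(s => [s, "pending"]));

function render() {
    return pipelineSteps.map(s => {
        const st = stepStatus[s];
        return st === "done" ? `[${s}]`        // completed
            : st === "running" ? `>${s}<`      // in progress
            : st === "failed" ? `!${s}!`       // hard failure
            : st === "skipped" ? `~${s}~`      // bypassed
            : ` ${s} `;                        // still pending
    }).join(" → ");
}

stepStatus.search = "done";
stepStatus.validate = "done";
stepStatus.download = "running";
console.log(render());
// [search] → [validate] → >download< →  normalize  →  quality  →  register
```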
@@ -646,11 +686,14 @@ async function handlePrepareJob(jobId, query, requirements) {
             progress: 20,
             status_text: `Using explicit dataset id: ${datasetIdForDownload} (${source})`
         });
+        markPipelineStep("search", "skipped");
     }
     else {
+        markPipelineStep("search", "running");
         update({ progress: 10, status_text: "Searching for best dataset matching query..." });
         const results = await searchEngine.search(query, { limit: 10 });
         if (results.length === 0) {
+            markPipelineStep("search", "failed");
             throw new Error("No datasets found matching the query. Try refining your search terms.");
         }
         // Pick the best result that we can actually download (skip sources requiring missing credentials)
@@ -670,8 +713,10 @@ async function handlePrepareJob(jobId, query, requirements) {
             progress: 20,
             status_text: `Matched: ${selectedDataset.name} (${selectedDataset.source})`
         });
+        markPipelineStep("search", "done");
     }
     // Pre-check credentials for sources that require them
+    markPipelineStep("validate", "running");
     if (source === "kaggle") {
         const hasKaggleCreds = !!(process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY && process.env.KAGGLE_USERNAME !== "YOUR_KAGGLE_USERNAME");
         if (!hasKaggleCreds) {
@@ -679,8 +724,11 @@ async function handlePrepareJob(jobId, query, requirements) {
         }
     }
     if (source === "dataworld" && !hasDataWorldToken()) {
+        markPipelineStep("validate", "failed");
         throw new Error("data.world token not set. Use the configure_keys tool or set DW_AUTH_TOKEN environment variable.");
     }
+    markPipelineStep("validate", "done");
+    markPipelineStep("download", "running");
     update({ progress: 30, status_text: `Starting download from ${source}...` });
     // ensureData handles download and returns path to the raw file
     let rawFilePath = await dataIngestor.ensureData(datasetIdForDownload, source, (msg, prog) => {
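The validate step above gates on per-source credentials pulled from environment variables before any download starts. A minimal sketch of that check under the same env-var names (`hasDataWorldToken` is internal to the package, so it is inlined here as an assumption):

```js
// Sketch: per-source credential pre-check, mirroring the gate in the hunk above.
function validateSource(source) {
    if (source === "kaggle") {
        const ok = !!(process.env.KAGGLE_USERNAME &&
                      process.env.KAGGLE_KEY &&
                      process.env.KAGGLE_USERNAME !== "YOUR_KAGGLE_USERNAME");
        if (!ok) return { ok: false, error: "Kaggle credentials not configured" };
    }
    if (source === "dataworld" && !process.env.DW_AUTH_TOKEN) {
        return { ok: false, error: "data.world token not set (DW_AUTH_TOKEN)" };
    }
    return { ok: true }; // sources like huggingface public datasets need no keys
}

console.log(validateSource("kaggle"));
```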
@@ -743,15 +791,50 @@ async function handlePrepareJob(jobId, query, requirements) {
             update({ progress: 69, status_text: `Sample target met: ${currentRows.toLocaleString()} rows` });
         }
     }
+    markPipelineStep("download", "done");
+    // ── Normalize step: convert any raw format → parquet ──
+    markPipelineStep("normalize", "running");
+    const rawExt = path.extname(rawFilePath).toLowerCase();
+    if (rawExt !== ".parquet" && rawExt !== ".pq") {
+        update({ progress: 70, status_text: "Normalizing to parquet..." });
+        const normalizedDir = path.join(dataRoot, "data", "normalized");
+        if (!fs.existsSync(normalizedDir))
+            fs.mkdirSync(normalizedDir, { recursive: true });
+        const safeId = toSafeDatasetPathFragment(datasetIdForDownload);
+        const normalizedPath = path.join(normalizedDir, `${safeId}.parquet`);
+        try {
+            const normScript = path.join(dataRoot, "python", "normalize_engine.py");
+            const normResult = await runPythonJson(normScript, [rawFilePath, normalizedPath]);
+            if (normResult.ok && fs.existsSync(normalizedPath)) {
+                console.error(`[Prepare] Normalized ${rawExt} → parquet (${normResult.rows} rows)`);
+                rawFilePath = normalizedPath;
+                markPipelineStep("normalize", "done");
+            }
+            else {
+                console.error(`[Prepare] Normalize failed: ${normResult.error}, continuing with raw file`);
+                markPipelineStep("normalize", "skipped");
+            }
+        }
+        catch (e) {
+            console.error(`[Prepare] Normalize step failed: ${e?.message || e}, continuing with raw file`);
+            markPipelineStep("normalize", "skipped");
+        }
+    }
+    else {
+        markPipelineStep("normalize", "done");
+    }
     let qualityScore = selectedDataset?.quality_score ?? 70;
-
+    markPipelineStep("quality", "running");
+    update({ progress: 75, status_text: "Analyzing dataset quality..." });
     try {
         const report = await qualityAnalyzer.analyze(rawFilePath);
         qualityScore = report.overall_score;
+        markPipelineStep("quality", "done");
     }
     catch (error) {
         console.error(`[Prepare] Quality analysis failed for ${datasetIdForDownload}: ${error?.message || error}`);
         update({ progress: 78, status_text: "Quality analysis skipped (unsupported schema). Continuing installation..." });
+        markPipelineStep("quality", "skipped");
     }
     if (selectedDataset) {
         metadataStore.saveDataset({
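The normalize step shells out to the new `build/python/normalize_engine.py` (listed in the file summary above) and reads a JSON result with `ok`, `rows`, and `error` fields; on any failure it keeps the raw file and marks the step skipped rather than aborting the job. A sketch of that contract from the Node side, substituting `child_process` for the package's internal `runPythonJson` helper (the `python3` invocation and JSON-on-stdout protocol are assumptions; the `[input, output]` argument order follows the hunk):

```js
// Sketch: invoke normalize_engine.py and fall back to the raw file on failure.
const { execFile } = require("node:child_process");
const { promisify } = require("node:util");
const run = promisify(execFile);

async function normalizeToParquet(rawPath, outPath) {
    try {
        const { stdout } = await run("python3", ["normalize_engine.py", rawPath, outPath]);
        const result = JSON.parse(stdout); // assumed shape: { ok, rows?, error? }
        if (result.ok) {
            console.error(`Normalized → parquet (${result.rows} rows)`);
            return outPath;
        }
        console.error(`Normalize failed: ${result.error}, continuing with raw file`);
    }
    catch (e) {
        console.error(`Normalize step failed: ${e?.message || e}, continuing with raw file`);
    }
    return rawPath; // best-effort: the pipeline continues with the unconverted input
}
```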
@@ -759,15 +842,19 @@ async function handlePrepareJob(jobId, query, requirements) {
             quality_score: qualityScore
         });
     }
+    markPipelineStep("register", "running");
     update({ progress: 85, status_text: "Installing dataset into project..." });
     const installPath = await installService.install(datasetIdForDownload, rawFilePath);
     update({ progress: 100, status_text: "Preparation complete!" });
     // Register prepared dataset in local registry for lookup by export/list tools
     try {
         upsertRegistry(datasetIdForDownload, installPath, "completed");
+        markPipelineStep("register", "done");
+        markStepComplete(datasetIdForDownload, "prepare");
     }
     catch (e) {
         console.error(`[Registry] Failed to write registry for ${datasetIdForDownload}: ${e?.message || e}`);
+        markPipelineStep("register", "failed");
     }
     return installPath;
 }
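Successful installs are recorded through `upsertRegistry` so the export/list tools can later resolve a dataset id to its local path. The registry implementation itself is internal to the package; a sketch of what a JSON-file upsert of that general shape could look like (the file name and record fields below are assumptions, not the package's actual format):

```js
// Sketch: upsert a dataset record into a JSON-file registry (hypothetical layout).
const fs = require("node:fs");
const path = require("node:path");

function upsertRegistry(registryFile, datasetId, localPath, status) {
    let registry = {};
    if (fs.existsSync(registryFile)) {
        registry = JSON.parse(fs.readFileSync(registryFile, "utf8"));
    }
    // One record per dataset id; repeated installs overwrite in place.
    registry[datasetId] = {
        local_path: localPath,
        status,
        updated_at: new Date().toISOString(),
    };
    fs.mkdirSync(path.dirname(registryFile), { recursive: true });
    fs.writeFileSync(registryFile, JSON.stringify(registry, null, 2));
}

upsertRegistry("/tmp/vesper-registry.json", "user/example-dataset", "/tmp/data.parquet", "completed");
```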
@@ -1261,110 +1348,237 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
         ],
     };
 });
-// Call Tool
+// Call Tool — all requests are serialized through a queue to prevent crashes from parallel calls
 server.setRequestHandler(CallToolRequestSchema, async (request) => {
- … (old lines 1266-1304: 39 removed lines shown empty in the source diff)
+    return requestQueue.enqueue(async () => {
+        // --- Pipeline Enforcement ---
+        // Map tool names to pipeline steps
+        const toolToStep = {
+            vesper_search: "search",
+            vesper_download: "download",
+            vesper_analyze: "analyze",
+            vesper_clean: "clean",
+            vesper_split: "split",
+            vesper_export: "export",
+            prepare_dataset: "prepare",
+        };
+        // Extract dataset_id if present and normalize
+        let datasetId = request.params.arguments?.dataset_id || request.params.arguments?.query || "";
+        if (datasetId)
+            datasetId = parseDatasetId(String(datasetId));
+        // Pipeline rules
+        const stepOrder = ["search", "download", "analyze", "clean", "split", "export"];
+        const prereqs = {
+            vesper_download: ["search"],
+            vesper_analyze: ["download"],
+            vesper_clean: ["analyze"],
+            vesper_split: ["clean"],
+            vesper_export: ["split"],
+        };
+        const tool = String(request.params.name);
+        const step = toolToStep[tool];
+        if (step && datasetId) {
+            // Check prerequisites
+            const required = prereqs[tool] || [];
+            for (const req of required) {
+                if (!hasStep(String(datasetId), req)) {
+                    // Auto-run missing step if possible, else error
+                    // For export, auto-run prepare_dataset if split missing
+                    if (tool === "vesper_export" && req === "split") {
+                        // Auto-trigger prepare_dataset (start a background prepare job)
+                        try {
+                            jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false });
+                            // Mark split as complete so export can proceed; export handler will also wait for data if needed.
+                            markStepComplete(String(datasetId), "split");
+                        }
+                        catch (e) {
+                            console.error(`[Pipeline] Failed to auto-trigger prepare for ${datasetId}: ${e?.message || e}`);
+                            return {
+                                content: [{ type: "text", text: `ERROR: Failed to auto-run prepare for ${datasetId}. Please run prepare_dataset first.` }],
+                                isError: true,
+                            };
+                        }
                     }
-
-                    console.error(`[Pipeline] Failed to auto-trigger prepare for ${datasetId}: ${e?.message || e}`);
+                    else {
                         return {
-                            content: [{ type: "text", text: `ERROR:
+                            content: [{ type: "text", text: `ERROR: Cannot run ${tool} before ${req}. Please run ${req} first.` }],
                             isError: true,
                         };
                     }
                 }
-
+            }
+            // Mark this step as complete
+            markStepComplete(String(datasetId), String(step));
+        }
+        switch (request.params.name) {
+            case "unified_dataset_api": {
+                hydrateExternalKeys();
+                const operation = String(request.params.arguments?.operation || "").trim().toLowerCase();
+                const source = String(request.params.arguments?.source || "auto").trim().toLowerCase();
+                const includeUnavailable = request.params.arguments?.include_unavailable === true;
+                const publicOnly = request.params.arguments?.public_only !== false;
+                try {
+                    if (operation === "providers") {
+                        return {
+                            content: [{ type: "text", text: JSON.stringify({ providers: unifiedDatasetGateway.getProviderStatuses(includeUnavailable) }, null, 2) }],
+                        };
+                    }
+                    if (operation === "discover") {
+                        const query = String(request.params.arguments?.query || "").trim();
+                        if (!query) {
+                            throw new McpError(ErrorCode.InvalidParams, "query is required for operation='discover'");
+                        }
+                        const result = await unifiedDatasetGateway.discover({
+                            query,
+                            source,
+                            limit: Number(request.params.arguments?.limit || 10),
+                            publicOnly,
+                        });
+                        return {
+                            content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
+                        };
+                    }
+                    if (operation === "download") {
+                        const datasetId = String(request.params.arguments?.dataset_id || "").trim();
+                        if (!datasetId) {
+                            throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='download'");
+                        }
+                        try {
+                            await ensurePythonModules([{ module: "datasets", packageName: "datasets" }]);
+                        }
+                        catch {
+                            // best effort; non-HF providers do not require this
+                        }
+                        const result = await unifiedDatasetGateway.download({
+                            datasetId,
+                            source,
+                            targetDir: request.params.arguments?.target_dir ? String(request.params.arguments.target_dir) : undefined,
+                        });
+                        try {
+                            upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
+                        }
+                        catch (e) {
+                            console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
+                        }
+                        return {
+                            content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
+                        };
+                    }
+                    if (operation === "info") {
+                        const datasetId = String(request.params.arguments?.dataset_id || "").trim();
+                        if (!datasetId) {
+                            throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='info'");
+                        }
+                        const result = await unifiedDatasetGateway.info({
+                            datasetId,
+                            source,
+                            publicOnly,
+                        });
+                        return {
+                            content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
+                        };
+                    }
+                    throw new McpError(ErrorCode.InvalidParams, `Unsupported operation: ${operation}`);
+                }
+                catch (error) {
                     return {
-                        content: [{ type: "text", text: `ERROR:
+                        content: [{ type: "text", text: `ERROR: unified gateway failed: ${error.message}` }],
                         isError: true,
                     };
                 }
             }
- … (old lines 1321-1327: 7 removed lines shown empty in the source diff)
-            const operation = String(request.params.arguments?.operation || "").trim().toLowerCase();
-            const source = String(request.params.arguments?.source || "auto").trim().toLowerCase();
-            const includeUnavailable = request.params.arguments?.include_unavailable === true;
-            const publicOnly = request.params.arguments?.public_only !== false;
-            try {
-                if (operation === "providers") {
-                    return {
-                        content: [{ type: "text", text: JSON.stringify({ providers: unifiedDatasetGateway.getProviderStatuses(includeUnavailable) }, null, 2) }],
-                    };
+            case "vesper_search": {
+                const query = String(request.params.arguments?.query);
+                const limit = 5;
+                const safeOnly = true; // Enable safe filter by default
+                const enableJIT = request.params.arguments?.enable_jit === true;
+                if (!query) {
+                    throw new McpError(ErrorCode.InvalidParams, "Query is required");
                 }
- … (old lines 1338-1343: 6 removed lines shown empty in the source diff)
+                const results = await searchEngine.search(query, { limit, safeOnly, enableJIT });
+                const formattedOutput = formatSearchResults(results);
+                return {
+                    content: [
+                        {
+                            type: "text",
+                            text: formattedOutput,
+                        },
+                    ],
+                };
+            }
+            case "discover_datasets": {
+                hydrateExternalKeys();
+                const query = String(request.params.arguments?.query || "").trim();
+                const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
+                const limit = Number(request.params.arguments?.limit || 10);
+                if (!query) {
+                    throw new McpError(ErrorCode.InvalidParams, "query is required");
+                }
+                try {
+                    const gatewayResult = await unifiedDatasetGateway.discover({
                         query,
                         source,
-                        limit
-                        publicOnly,
+                        limit,
+                        publicOnly: false,
                     });
+                    const results = gatewayResult.results;
+                    const recipeScript = path.join(dataRoot, "python", "asset_downloader_engine.py");
+                    for (const ds of results.slice(0, limit)) {
+                        const info = {
+                            dataset_id: ds.id,
+                            id: ds.id,
+                            source: ds.source,
+                            repo_id: ds.id,
+                            total_images: ds.total_examples || 0,
+                            image_column: undefined,
+                            recipes_dir: path.join(dataRoot, "recipes"),
+                        };
+                        try {
+                            await runPythonJson(recipeScript, ["build_recipe", JSON.stringify(info)]);
+                        }
+                        catch {
+                            // best-effort recipe generation; ignore discovery-time recipe failures
+                        }
+                    }
+                    const formattedOutput = formatSearchResults(results.slice(0, limit));
+                    const noteBlock = gatewayResult.notes.length > 0
+                        ? `\nGateway notes:\n- ${gatewayResult.notes.join("\n- ")}`
+                        : "";
                     return {
-                        content: [{ type: "text", text:
+                        content: [{ type: "text", text: `${formattedOutput}${noteBlock}` }]
                     };
                 }
- … (old lines 1353-1356: 4 removed lines shown empty in the source diff)
-            }
+                catch (error) {
+                    return {
+                        content: [{ type: "text", text: `ERROR: discover failed: ${error.message}` }],
+                        isError: true,
+                    };
+                }
+            }
+            case "download_dataset": {
+                hydrateExternalKeys();
+                const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
+                const datasetId = String(request.params.arguments?.dataset_id || "").trim();
+                const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments.target_dir) : undefined;
+                if (!datasetId) {
+                    throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
+                }
+                // Pre-install Python datasets library for HuggingFace fallback
+                if (source === "huggingface") {
                     try {
-                        await ensurePythonModules([
+                        await ensurePythonModules([
+                            { module: "datasets", packageName: "datasets" },
+                        ]);
                     }
                     catch {
-                        //
+                        // Continue - direct download may still work
                     }
+                }
+                try {
                     const result = await unifiedDatasetGateway.download({
                         datasetId,
                         source,
-                        targetDir
+                        targetDir,
                     });
                     try {
                         upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
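The pipeline enforcement block above checks a per-dataset step record before dispatching any tool, refusing (or auto-triggering) calls whose prerequisites have not run. A self-contained sketch of that gate, with an in-memory `Map` standing in for the package's internal `hasStep`/`markStepComplete` store:

```js
// Sketch: prerequisite gate for pipeline tools.
// (hasStep/markStepComplete are internal to the package; a Map stands in here.)
const completed = new Map(); // datasetId -> Set of finished steps

const markStepComplete = (id, step) =>
    (completed.get(id) ?? completed.set(id, new Set()).get(id)).add(step);
const hasStep = (id, step) => completed.get(id)?.has(step) ?? false;

// Same ordering rules as the hunk above: each tool requires the previous step.
const prereqs = {
    vesper_download: ["search"],
    vesper_analyze: ["download"],
    vesper_clean: ["analyze"],
    vesper_split: ["clean"],
    vesper_export: ["split"],
};

function gate(tool, datasetId) {
    for (const req of prereqs[tool] || []) {
        if (!hasStep(datasetId, req)) {
            return { ok: false, error: `Cannot run ${tool} before ${req}. Please run ${req} first.` };
        }
    }
    return { ok: true };
}

console.log(gate("vesper_analyze", "d1")); // rejected: download has not run
markStepComplete("d1", "download");
console.log(gate("vesper_analyze", "d1")); // ok
```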
@@ -1372,857 +1586,761 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1372
1586
|
catch (e) {
|
|
1373
1587
|
console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
|
|
1374
1588
|
}
|
|
1589
|
+
const noteBlock = result.notes.length > 0 ? `\nNotes:\n- ${result.notes.join("\n- ")}` : "";
|
|
1375
1590
|
return {
|
|
1376
|
-
content: [{ type: "text", text:
|
|
1591
|
+
content: [{ type: "text", text: `Download complete: ${result.copied_to || result.local_path}${noteBlock}` }]
|
|
1377
1592
|
};
|
|
1378
1593
|
}
|
|
1379
|
-
|
|
1380
|
-
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
1381
|
-
if (!datasetId) {
|
|
1382
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='info'");
|
|
1383
|
-
}
|
|
1384
|
-
const result = await unifiedDatasetGateway.info({
|
|
1385
|
-
datasetId,
|
|
1386
|
-
source,
|
|
1387
|
-
publicOnly,
|
|
1388
|
-
});
|
|
1594
|
+
catch (error) {
|
|
1389
1595
|
return {
|
|
1390
|
-
content: [{ type: "text", text:
|
|
1596
|
+
content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
|
|
1597
|
+
isError: true,
|
|
1391
1598
|
};
|
|
1392
1599
|
}
|
|
1393
|
-
throw new McpError(ErrorCode.InvalidParams, `Unsupported operation: ${operation}`);
|
|
1394
|
-
}
|
|
1395
|
-
catch (error) {
|
|
1396
|
-
return {
|
|
1397
|
-
content: [{ type: "text", text: `ERROR: unified gateway failed: ${error.message}` }],
|
|
1398
|
-
isError: true,
|
|
1399
|
-
};
|
|
1400
1600
|
}
|
|
1401
|
-
|
|
1402
|
-
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
-
|
|
1406
|
-
|
|
1407
|
-
|
|
1408
|
-
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
|
|
1418
|
-
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
|
|
1427
|
-
|
|
1428
|
-
|
|
1429
|
-
try {
|
|
1430
|
-
const gatewayResult = await unifiedDatasetGateway.discover({
|
|
1431
|
-
query,
|
|
1432
|
-
source,
|
|
1433
|
-
limit,
|
|
1434
|
-
publicOnly: false,
|
|
1435
|
-
});
|
|
1436
|
-
const results = gatewayResult.results;
|
|
1437
|
-
const recipeScript = path.join(dataRoot, "python", "asset_downloader_engine.py");
|
|
1438
|
-
for (const ds of results.slice(0, limit)) {
|
|
1439
|
-
const info = {
|
|
1440
|
-
dataset_id: ds.id,
|
|
1441
|
-
id: ds.id,
|
|
1442
|
-
source: ds.source,
|
|
1443
|
-
repo_id: ds.id,
|
|
1444
|
-
total_images: ds.total_examples || 0,
|
|
1445
|
-
image_column: undefined,
|
|
1446
|
-
recipes_dir: path.join(dataRoot, "recipes"),
|
|
1601
|
+
case "vesper_download_assets": {
|
|
1602
|
+
hydrateExternalKeys();
|
|
1603
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
1604
|
+
const source = String(request.params.arguments?.source || "").trim().toLowerCase();
|
|
1605
|
+
// Auto-infer repo_id from dataset_id if not provided (common for HuggingFace)
|
|
1606
|
+
const repoId = request.params.arguments?.repo_id
|
|
1607
|
+
? String(request.params.arguments.repo_id)
|
|
1608
|
+
: (source === "huggingface" && datasetId.includes("/") ? datasetId : undefined);
|
|
1609
|
+
const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
|
|
1610
|
+
const urls = Array.isArray(request.params.arguments?.urls)
|
|
1611
|
+
? (request.params.arguments?.urls).map(v => String(v))
|
|
1612
|
+
: undefined;
|
|
1613
|
+
const outputFormat = String(request.params.arguments?.output_format || "webdataset");
|
|
1614
|
+
const requestedOutputDir = request.params.arguments?.target_dir
|
|
1615
|
+
? String(request.params.arguments.target_dir).trim()
|
|
1616
|
+
: request.params.arguments?.output_dir
|
|
1617
|
+
? String(request.params.arguments.output_dir).trim()
|
|
1618
|
+
: undefined;
|
|
1619
|
+
const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
|
|
1620
|
+
const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
|
|
1621
|
+
const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
|
|
1622
|
+
if (!datasetId || !source) {
|
|
1623
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
|
|
1624
|
+
}
|
|
1625
|
+
if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
|
|
1626
|
+
return {
|
|
1627
|
+
content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
|
|
1628
|
+
isError: true,
|
|
1447
1629
|
};
|
|
1448
|
-
try {
|
|
1449
|
-
await runPythonJson(recipeScript, ["build_recipe", JSON.stringify(info)]);
|
|
1450
|
-
}
|
|
1451
|
-
catch {
|
|
1452
|
-
// best-effort recipe generation; ignore discovery-time recipe failures
|
|
1453
|
-
}
|
|
1454
1630
|
}
|
|
1455
|
-
const
|
|
1456
|
-
|
|
1457
|
-
|
|
1458
|
-
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
}
|
|
1468
|
-
}
|
|
1469
|
-
}
|
|
1470
|
-
case "download_dataset": {
|
|
1471
|
-
hydrateExternalKeys();
|
|
1472
|
-
const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
|
|
1473
|
-
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
1474
|
-
const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments.target_dir) : undefined;
|
|
1475
|
-
if (!datasetId) {
|
|
1476
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
1477
|
-
}
|
|
1478
|
-
// Pre-install Python datasets library for HuggingFace fallback
|
|
1479
|
-
if (source === "huggingface") {
|
|
1631
|
+
const requiredModules = [
|
|
1632
|
+
{ module: "aiohttp", packageName: "aiohttp" },
|
|
1633
|
+
];
|
|
1634
|
+
if (source === "url") {
|
|
1635
|
+
requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
|
|
1636
|
+
}
|
|
1637
|
+
if (source === "huggingface") {
|
|
1638
|
+
requiredModules.push({ module: "datasets", packageName: "datasets" });
|
|
1639
|
+
requiredModules.push({ module: "PIL", packageName: "Pillow" });
|
|
1640
|
+
}
|
|
1641
|
+
if (source === "kaggle") {
|
|
1642
|
+
requiredModules.push({ module: "kaggle", packageName: "kaggle" });
|
|
1643
|
+
}
|
|
1480
1644
|
try {
|
|
1481
|
-
await ensurePythonModules(
|
|
1482
|
-
{ module: "datasets", packageName: "datasets" },
|
|
1483
|
-
]);
|
|
1645
|
+
await ensurePythonModules(requiredModules);
|
|
1484
1646
|
}
|
|
1485
|
-
catch {
|
|
1486
|
-
|
|
1647
|
+
catch (error) {
|
|
1648
|
+
return {
|
|
1649
|
+
content: [{ type: "text", text: `ERROR: Python dependency setup failed: ${error.message}` }],
|
|
1650
|
+
isError: true,
|
|
1651
|
+
};
|
|
1487
1652
|
}
|
|
1488
|
-
|
|
1489
|
-
|
|
1490
|
-
|
|
1491
|
-
datasetId,
|
|
1653
|
+
const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
|
|
1654
|
+
const payload = {
|
|
1655
|
+
dataset_id: datasetId,
|
|
1492
1656
|
source,
|
|
1493
|
-
|
|
1494
|
-
|
|
1657
|
+
repo_id: repoId,
|
|
1658
|
+
kaggle_ref: kaggleRef,
|
|
1659
|
+
urls,
|
|
1660
|
+
output_format: outputFormat,
|
|
1661
|
+
output_dir: requestedOutputDir,
|
|
1662
|
+
max_items: maxItems,
|
|
1663
|
+
workers,
|
|
1664
|
+
image_column: imageColumn,
|
|
1665
|
+
output_root: path.join(dataRoot, "data", "assets"),
|
|
1666
|
+
recipes_dir: path.join(dataRoot, "recipes"),
|
|
1667
|
+
};
|
|
1495
1668
|
try {
|
|
1496
|
-
|
|
1669
|
+
const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
|
|
1670
|
+
if (!result?.ok) {
|
|
1671
|
+
const errMsg = result?.error || "Unknown error";
|
|
1672
|
+
// Enhance error messages for common failures
|
|
1673
|
+
let hint = "";
|
|
1674
|
+
if (errMsg.includes("No image column")) {
|
|
1675
|
+
hint = "\n\nHint: Specify image_column parameter with the column name containing images/URLs.";
|
|
1676
|
+
}
|
|
1677
|
+
else if (errMsg.includes("Authentication") || errMsg.includes("401") || errMsg.includes("403")) {
|
|
1678
|
+
hint = "\n\nHint: Use configure_keys tool to set HF_TOKEN for gated/private datasets.";
|
|
1679
|
+
}
|
|
1680
|
+
return {
|
|
1681
|
+
content: [{ type: "text", text: `ERROR: asset download failed: ${errMsg}${hint}` }],
|
|
1682
|
+
isError: true,
|
|
1683
|
+
};
|
|
1684
|
+
}
|
|
1685
|
+
return {
|
|
1686
|
+
content: [{ type: "text", text: JSON.stringify(result.result, null, 2) }],
|
|
1687
|
+
};
|
|
1497
1688
|
}
|
|
1498
|
-
catch (
|
|
1499
|
-
|
|
1689
|
+
catch (error) {
|
|
1690
|
+
return {
|
|
1691
|
+
content: [{ type: "text", text: `ERROR: asset downloader execution failed: ${error.message}` }],
|
|
1692
|
+
isError: true,
|
|
1693
|
+
};
|
|
1500
1694
|
}
|
|
1501
|
-
const noteBlock = result.notes.length > 0 ? `\nNotes:\n- ${result.notes.join("\n- ")}` : "";
|
|
1502
|
-
return {
|
|
1503
|
-
content: [{ type: "text", text: `Download complete: ${result.copied_to || result.local_path}${noteBlock}` }]
|
|
1504
|
-
};
|
|
1505
|
-
}
|
|
1506
|
-
catch (error) {
|
|
1507
|
-
return {
|
|
1508
|
-
content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
|
|
1509
|
-
isError: true,
|
|
1510
|
-
};
|
|
1511
|
-
}
|
|
1512
|
-
}
|
|
1513
|
-
case "vesper_download_assets": {
|
|
1514
|
-
hydrateExternalKeys();
|
|
1515
|
-
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
1516
|
-
const source = String(request.params.arguments?.source || "").trim().toLowerCase();
|
|
1517
|
-
// Auto-infer repo_id from dataset_id if not provided (common for HuggingFace)
|
|
1518
|
-
const repoId = request.params.arguments?.repo_id
|
|
1519
|
-
? String(request.params.arguments.repo_id)
|
|
1520
|
-
: (source === "huggingface" && datasetId.includes("/") ? datasetId : undefined);
|
|
1521
|
-
const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
|
|
1522
|
-
const urls = Array.isArray(request.params.arguments?.urls)
|
|
1523
|
-
? (request.params.arguments?.urls).map(v => String(v))
|
|
1524
|
-
: undefined;
|
|
1525
|
-
const outputFormat = String(request.params.arguments?.output_format || "webdataset");
|
|
1526
|
-
const requestedOutputDir = request.params.arguments?.target_dir
|
|
1527
|
-
? String(request.params.arguments.target_dir).trim()
|
|
1528
|
-
: request.params.arguments?.output_dir
|
|
1529
|
-
? String(request.params.arguments.output_dir).trim()
|
|
1530
|
-
: undefined;
|
|
1531
|
-
const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
|
|
1532
|
-
const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
|
|
1533
|
-
const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
|
|
1534
|
-
if (!datasetId || !source) {
|
|
1535
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
|
|
1536
|
-
}
|
|
1537
|
-
if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
|
|
1538
|
-
return {
|
|
1539
|
-
content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
|
|
1540
|
-
isError: true,
|
|
1541
|
-
};
|
|
1542
|
-
}
|
|
1543
|
-
const requiredModules = [
|
|
1544
|
-
{ module: "aiohttp", packageName: "aiohttp" },
|
|
1545
|
-
];
|
|
1546
|
-
if (source === "url") {
|
|
1547
|
-
requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
|
|
1548
1695
|
}
|
|
1549
|
-
|
|
1550
|
-
|
|
1551
|
-
|
|
1552
|
-
|
|
1553
|
-
|
|
1554
|
-
|
|
1555
|
-
|
|
1556
|
-
|
|
1557
|
-
|
|
1558
|
-
|
|
1559
|
-
catch (error) {
|
|
1696
|
+
case "configure_kaggle": {
|
|
1697
|
+
const username = String(request.params.arguments?.username || "").trim();
|
|
1698
|
+
const key = String(request.params.arguments?.key || "").trim();
|
|
1699
|
+
if (!username || !key) {
|
|
1700
|
+
throw new McpError(ErrorCode.InvalidParams, "username and key are required");
|
|
1701
|
+
}
|
|
1702
|
+
const r1 = secureKeys.set("kaggle_username", username);
|
|
1703
|
+
const r2 = secureKeys.set("kaggle_key", key);
|
|
1704
|
+
process.env.KAGGLE_USERNAME = username;
|
|
1705
|
+
process.env.KAGGLE_KEY = key;
|
|
1560
1706
|
return {
|
|
1561
|
-
content: [{ type: "text", text: `
|
|
1562
|
-
isError: true,
|
|
1707
|
+
content: [{ type: "text", text: `Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
|
|
1563
1708
|
};
|
|
1564
1709
|
}
|
|
1565
|
-
|
|
1566
|
-
|
|
1567
|
-
|
|
1568
|
-
|
|
1569
|
-
|
|
1570
|
-
|
|
1571
|
-
|
|
1572
|
-
|
|
1573
|
-
|
|
1574
|
-
|
|
1575
|
-
|
|
1576
|
-
|
|
1577
|
-
|
|
1578
|
-
|
|
1579
|
-
};
|
|
1580
|
-
try {
|
|
1581
|
-
const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
|
|
1582
|
-
if (!result?.ok) {
|
|
1583
|
-
const errMsg = result?.error || "Unknown error";
|
|
1584
|
-
// Enhance error messages for common failures
|
|
1585
|
-
let hint = "";
|
|
1586
|
-
if (errMsg.includes("No image column")) {
|
|
1587
|
-
hint = "\n\nHint: Specify image_column parameter with the column name containing images/URLs.";
|
|
1710
|
+
case "configure_keys": {
|
|
1711
|
+
const hfToken = String(request.params.arguments?.hf_token || "").trim();
|
|
1712
|
+
const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
|
|
1713
|
+
const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
|
|
1714
|
+
const dataworldToken = String(request.params.arguments?.dataworld_token || "").trim();
|
|
1715
|
+
const saved = [];
|
|
1716
|
+
const methods = [];
|
|
1717
|
+
if (hfToken) {
|
|
1718
|
+
const r = secureKeys.set("hf_token", hfToken);
|
|
1719
|
+
if (r.ok) {
|
|
1720
|
+
process.env.HF_TOKEN = hfToken;
|
|
1721
|
+
saved.push("HF token");
|
|
1722
|
+
if (r.method)
|
|
1723
|
+
methods.push(r.method);
|
|
1588
1724
|
}
|
|
1589
|
-
|
|
1590
|
-
|
|
1725
|
+
}
|
|
1726
|
+
if (kaggleUsername) {
|
|
1727
|
+
const r = secureKeys.set("kaggle_username", kaggleUsername);
|
|
1728
|
+
if (r.ok) {
|
|
1729
|
+
process.env.KAGGLE_USERNAME = kaggleUsername;
|
|
1730
|
+
saved.push("Kaggle username");
|
|
1731
|
+
if (r.method)
|
|
1732
|
+
methods.push(r.method);
|
|
1733
|
+
}
|
|
1734
|
+
}
|
|
1735
|
+
if (kaggleKey) {
|
|
1736
|
+
const r = secureKeys.set("kaggle_key", kaggleKey);
|
|
1737
|
+
if (r.ok) {
|
|
1738
|
+
process.env.KAGGLE_KEY = kaggleKey;
|
|
1739
|
+
saved.push("Kaggle key");
|
|
1740
|
+
if (r.method)
|
|
1741
|
+
methods.push(r.method);
|
|
1591
1742
|
}
|
|
1743
|
+
}
|
|
1744
|
+
if (dataworldToken) {
|
|
1745
|
+
const r = secureKeys.set("dataworld_token", dataworldToken);
|
|
1746
|
+
if (r.ok) {
|
|
1747
|
+
process.env.DW_AUTH_TOKEN = dataworldToken;
|
|
1748
|
+
saved.push("data.world token");
|
|
1749
|
+
if (r.method)
|
|
1750
|
+
methods.push(r.method);
|
|
1751
|
+
}
|
|
1752
|
+
}
|
|
1753
|
+
if (saved.length === 0) {
|
|
1592
1754
|
return {
|
|
1593
|
-
content: [{ type: "text", text:
|
|
1594
|
-
isError: true,
|
|
1755
|
+
content: [{ type: "text", text: "No keys provided. Core Vesper tools continue to work without API keys." }]
|
|
1595
1756
|
};
|
|
1596
1757
|
}
|
|
1597
1758
|
return {
|
|
1598
|
-
content: [{ type: "text", text:
|
|
1759
|
+
content: [{ type: "text", text: `Key saved securely. Updated: ${saved.join(", ")}.` }]
|
|
1599
1760
|
};
|
|
1600
1761
|
}
|
|
1601
|
-
|
|
1602
|
-
|
|
1603
|
-
|
|
1604
|
-
|
|
1605
|
-
};
|
|
1606
|
-
}
|
|
1607
|
-
}
|
|
1608
|
-
case "configure_kaggle": {
|
|
1609
|
-
const username = String(request.params.arguments?.username || "").trim();
|
|
1610
|
-
const key = String(request.params.arguments?.key || "").trim();
|
|
1611
|
-
if (!username || !key) {
|
|
1612
|
-
throw new McpError(ErrorCode.InvalidParams, "username and key are required");
|
|
1613
|
-
}
|
|
1614
|
-
const r1 = secureKeys.set("kaggle_username", username);
|
|
1615
|
-
const r2 = secureKeys.set("kaggle_key", key);
|
|
1616
|
-
process.env.KAGGLE_USERNAME = username;
|
|
1617
|
-
process.env.KAGGLE_KEY = key;
|
|
1618
|
-
return {
|
|
1619
|
-
content: [{ type: "text", text: `Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
|
|
1620
|
-
};
|
|
1621
|
-
}
|
|
1622
|
-
case "configure_keys": {
|
|
1623
|
-
const hfToken = String(request.params.arguments?.hf_token || "").trim();
|
|
1624
|
-
const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
|
|
1625
|
-
const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
|
|
1626
|
-
const dataworldToken = String(request.params.arguments?.dataworld_token || "").trim();
|
|
1627
|
-
const saved = [];
|
|
1628
|
-
const methods = [];
|
|
1629
|
-
if (hfToken) {
|
|
1630
|
-
const r = secureKeys.set("hf_token", hfToken);
|
|
1631
|
-
if (r.ok) {
|
|
1632
|
-
process.env.HF_TOKEN = hfToken;
|
|
1633
|
-
saved.push("HF token");
|
|
1634
|
-
if (r.method)
|
|
1635
|
-
methods.push(r.method);
|
|
1762
|
+
case "get_dataset_info": {
|
|
1763
|
+
const datasetId = String(request.params.arguments?.dataset_id);
|
|
1764
|
+
if (!datasetId) {
|
|
1765
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
1636
1766
|
}
|
|
1637
|
-
|
|
1638
|
-
|
|
1639
|
-
|
|
1640
|
-
|
|
1641
|
-
|
|
1642
|
-
|
|
1643
|
-
if (r.method)
|
|
1644
|
-
methods.push(r.method);
|
|
1767
|
+
const dataset = metadataStore.getDataset(datasetId);
|
|
1768
|
+
if (!dataset) {
|
|
1769
|
+
return {
|
|
1770
|
+
content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}` }],
|
|
1771
|
+
isError: true,
|
|
1772
|
+
};
|
|
1645
1773
|
}
|
|
1646
|
-
|
|
1647
|
-
|
|
1648
|
-
|
|
1649
|
-
|
|
1650
|
-
|
|
1651
|
-
|
|
1652
|
-
|
|
1653
|
-
|
|
1774
|
+
// Enrich: if total_examples is 0-ish, try the HF datasets-server /size API
|
|
1775
|
+
if ((!dataset.total_examples || dataset.total_examples === 0) && dataset.source === "huggingface") {
|
|
1776
|
+
try {
|
|
1777
|
+
const sizeResp = await fetch(`https://datasets-server.huggingface.co/size?dataset=${encodeURIComponent(dataset.id)}`);
|
|
1778
|
+
if (sizeResp.ok) {
|
|
1779
|
+
const sizeData = await sizeResp.json();
|
|
1780
|
+
const numRows = sizeData?.size?.dataset?.num_rows;
|
|
1781
|
+
if (numRows && numRows > 0) {
|
|
1782
|
+
dataset.total_examples = numRows;
|
|
1783
|
+
// Also backfill splits
|
|
1784
|
+
if (sizeData?.size?.splits && Array.isArray(sizeData.size.splits)) {
|
|
1785
|
+
dataset.splits = sizeData.size.splits.map((s) => ({
|
|
1786
|
+
name: s.split,
|
|
1787
|
+
num_examples: s.num_rows || 0,
|
|
1788
|
+
size_bytes: s.num_bytes_parquet_files || 0,
|
|
1789
|
+
}));
|
|
1790
|
+
dataset.has_train_split = dataset.splits.some((s) => s.name === "train");
|
|
1791
|
+
dataset.has_test_split = dataset.splits.some((s) => s.name === "test");
|
|
1792
|
+
dataset.has_validation_split = dataset.splits.some((s) => s.name === "validation" || s.name === "val");
|
|
1793
|
+
}
|
|
1794
|
+
// Persist enriched metadata
|
|
1795
|
+
metadataStore.saveDataset(dataset);
|
|
1796
|
+
}
|
|
1797
|
+
}
|
|
1798
|
+
}
|
|
1799
|
+
catch {
|
|
1800
|
+
// Enrichment is best-effort; continue with whatever we have
|
|
1801
|
+
}
|
|
1654
1802
|
}
|
|
1655
|
-
|
|
1656
|
-
|
|
1657
|
-
|
|
1658
|
-
|
|
1659
|
-
|
|
1660
|
-
|
|
1661
|
-
|
|
1662
|
-
|
|
1803
|
+
const formattedOutput = formatDatasetInfo(dataset);
|
|
1804
|
+
return { content: [{ type: "text", text: formattedOutput }] };
|
|
1805
|
+
}
|
|
1806
|
+
case "analyze_quality": {
|
|
1807
|
+
const datasetId = String(request.params.arguments?.dataset_id);
|
|
1808
|
+
const safeId = toSafeDatasetPathFragment(datasetId);
|
|
1809
|
+
const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
|
|
1810
|
+
const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
|
|
1811
|
+
let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
|
|
1812
|
+
// Demo Fallback for easy testing
|
|
1813
|
+
if (datasetId === "demo" || !fs.existsSync(filePath)) {
|
|
1814
|
+
const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
|
|
1815
|
+
const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
|
|
1816
|
+
if (fs.existsSync(demoParquetPath)) {
|
|
1817
|
+
filePath = demoParquetPath;
|
|
1818
|
+
}
|
|
1819
|
+
else if (fs.existsSync(demoCsvPath)) {
|
|
1820
|
+
filePath = demoCsvPath;
|
|
1821
|
+
}
|
|
1822
|
+
else if (datasetId !== "demo") {
|
|
1823
|
+
return {
|
|
1824
|
+
content: [{ type: "text", text: `ERROR: Local data file for ${datasetId} not found. Try running prepare_dataset first, or use 'demo' as the dataset_id.` }],
|
|
1825
|
+
isError: true
|
|
1826
|
+
};
|
|
1827
|
+
}
|
|
1663
1828
|
}
|
|
1664
|
-
|
|
1665
|
-
if (saved.length === 0) {
|
|
1829
|
+
const report = await qualityAnalyzer.analyze(filePath);
|
|
1666
1830
|
return {
|
|
1667
|
-
content: [{ type: "text", text:
|
|
1831
|
+
content: [{ type: "text", text: JSON.stringify(report, null, 2) }]
|
|
1668
1832
|
};
|
|
1669
1833
|
}
|
|
1670
|
-
|
|
1671
|
-
|
|
1672
|
-
|
|
1673
|
-
|
|
1674
|
-
|
|
1675
|
-
|
|
1676
|
-
|
|
1677
|
-
|
|
1678
|
-
|
|
1679
|
-
|
|
1680
|
-
|
|
1834
|
+
case "preview_cleaning": {
|
|
1835
|
+
const datasetId = String(request.params.arguments?.dataset_id);
|
|
1836
|
+
const safeId = toSafeDatasetPathFragment(datasetId);
|
|
1837
|
+
const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
|
|
1838
|
+
const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
|
|
1839
|
+
let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
|
|
1840
|
+
if (datasetId === "demo" || !fs.existsSync(filePath)) {
|
|
1841
|
+
const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
|
|
1842
|
+
const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
|
|
1843
|
+
if (fs.existsSync(demoParquetPath)) {
|
|
1844
|
+
filePath = demoParquetPath;
|
|
1845
|
+
}
|
|
1846
|
+
else if (fs.existsSync(demoCsvPath)) {
|
|
1847
|
+
filePath = demoCsvPath;
|
|
1848
|
+
}
|
|
1849
|
+
else {
|
|
1850
|
+
throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}. Please run prepare_dataset first.`);
|
|
1851
|
+
}
|
|
1852
|
+
}
|
|
1853
|
+
const report = await qualityAnalyzer.analyze(filePath);
|
|
1854
|
+
// Phase 1: Target Detection
|
|
1855
|
+
// We use the same TargetDetector instance inside CleaningPlanner now?
|
|
1856
|
+
// Actually, we instantiated it inside CleaningPlanner, so we just need to pass the file path
|
|
1857
|
+
// OR let the planner handle it if we update its signature to accept filePath.
|
|
1858
|
+
// Let's check `CleaningPlanner.generatePlan` signature again.
|
|
1859
|
+
// We updated it to accept `targetInfo`.
|
|
1860
|
+
// So we need to run detection HERE and pass it.
|
|
1861
|
+
// But `TargetDetector` is not exposed in `index.ts` scope yet.
|
|
1862
|
+
// Let's create a global instance or use the one inside planner if exposed (it's private).
|
|
1863
|
+
// Better approach: Instantiate TargetDetector here in index.ts for the tool content.
|
|
1864
|
+
// Quick fix: Instantiate local detector or make global.
|
|
1865
|
+
// I'll make a global `targetDetector` constant in index.ts
|
|
1866
|
+
// But wait, I updated `CleaningPlanner` to instantiate its own detector.
|
|
1867
|
+
// Does `CleaningPlanner` use it? No, I commented out the logic because it needed `filePath`.
|
|
1868
|
+
// RETRY STRATEGY:
|
|
1869
|
+
// 1. Instantiate `targetDetector` in `index.ts`.
|
|
1870
|
+
// 2. Run `detectTarget(filePath)`.
|
|
1871
|
+
// 3. Pass result to `cleaningPlanner.generatePlan(..., targetInfo)`.
|
|
1872
|
+
// I need to add `const targetDetector = new TargetDetector(__dirname);` to imports/init section first.
|
|
1873
|
+
// But since I'm in this tool, I can't look back.
|
|
1874
|
+
// I will assume I can add it, or just do it inside the case for now.
|
|
1875
|
+
// To do it properly, I should have added `targetDetector` to the global scope in previous step.
|
|
1876
|
+
// Let's do that in a separate step if needed.
|
|
1877
|
+
// For now, I'll instantiate it here.
|
|
1878
|
+
const { TargetDetector } = await import("./preparation/target-detector.js");
|
|
1879
|
+
const detector = new TargetDetector(__dirname);
|
|
1880
|
+
const targetResult = await detector.detectTarget(filePath);
|
|
1881
|
+
const targetInfo = targetResult.target_column ? {
|
|
1882
|
+
target: targetResult.target_column,
|
|
1883
|
+
confidence: targetResult.confidence
|
|
1884
|
+
} : undefined;
|
|
1885
|
+
const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
|
|
1886
|
+
let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
|
|
1887
|
+
if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
|
|
1888
|
+
explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
|
|
1889
|
+
explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
|
|
1890
|
+
}
|
|
1891
|
+
explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
|
|
1892
|
+
if (plan.operations.length === 0) {
|
|
1893
|
+
explanation += "No cleaning operations required.";
|
|
1894
|
+
}
|
|
1895
|
+
else {
|
|
1896
|
+
plan.operations.forEach((op, i) => {
|
|
1897
|
+
explanation += `${i + 1}. **${op.type}**: ${op.reason}\n`;
|
|
1898
|
+
});
|
|
1899
|
+
}
|
|
1681
1900
|
return {
|
|
1682
|
-
content: [{ type: "text", text:
|
|
1683
|
-
isError: true,
|
|
1901
|
+
content: [{ type: "text", text: explanation }]
|
|
1684
1902
|
};
|
|
1685
1903
|
}
|
|
1686
|
-
|
|
1687
|
-
|
|
1688
|
-
|
|
1689
|
-
|
|
1690
|
-
|
|
1691
|
-
|
|
1692
|
-
|
|
1693
|
-
|
|
1694
|
-
|
|
1695
|
-
|
|
1696
|
-
|
|
1697
|
-
const
|
|
1698
|
-
const
|
|
1699
|
-
|
|
1700
|
-
|
|
1701
|
-
|
|
1702
|
-
|
|
1703
|
-
|
|
1704
|
-
|
|
1705
|
-
|
|
1904
|
+
case "custom_clean": {
|
|
1905
|
+
const datasetId = String(request.params.arguments?.dataset_id);
|
|
1906
|
+
const ops = request.params.arguments?.operations;
|
|
1907
|
+
if (!datasetId || datasetId === "undefined") {
|
|
1908
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
1909
|
+
}
|
|
1910
|
+
if (!ops || !Array.isArray(ops) || ops.length === 0) {
|
|
1911
|
+
throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
|
|
1912
|
+
}
|
|
1913
|
+
// Pre-check: verify dataset file exists before starting the job
|
|
1914
|
+
const cleanRegEntry = getRegistryEntry(datasetId);
|
|
1915
|
+
const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
|
|
1916
|
+
const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
|
|
1917
|
+
const cleanSafeId = toSafeDatasetPathFragment(datasetId);
|
|
1918
|
+
const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
|
|
1919
|
+
(cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
|
|
1920
|
+
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
|
|
1921
|
+
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
|
|
1922
|
+
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
|
|
1923
|
+
fs.existsSync(datasetId);
|
|
1924
|
+
if (!cleanDataExists) {
|
|
1706
1925
|
return {
|
|
1707
|
-
content: [{ type: "text", text: `
|
|
1708
|
-
isError: true
|
|
1926
|
+
content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
|
|
1927
|
+
isError: true,
|
|
1709
1928
|
};
|
|
1710
1929
|
}
|
|
1930
|
+
const job = jobManager.createJob("clean", 0, { datasetId, ops });
|
|
1931
|
+
return {
|
|
1932
|
+
content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
|
|
1933
|
+
};
|
|
1711
1934
|
}
|
|
1712
|
-
|
|
1713
|
-
|
|
1714
|
-
|
|
1715
|
-
|
|
1716
|
-
|
|
1717
|
-
|
|
1718
|
-
|
|
1719
|
-
const safeId = toSafeDatasetPathFragment(datasetId);
|
|
1720
|
-
const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
|
|
1721
|
-
const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
|
|
1722
|
-
let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
|
|
1723
|
-
if (datasetId === "demo" || !fs.existsSync(filePath)) {
|
|
1724
|
-
const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
|
|
1725
|
-
const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
|
|
1726
|
-
if (fs.existsSync(demoParquetPath)) {
|
|
1727
|
-
filePath = demoParquetPath;
|
|
1728
|
-
}
|
|
1729
|
-
else if (fs.existsSync(demoCsvPath)) {
|
|
1730
|
-
filePath = demoCsvPath;
|
|
1731
|
-
}
|
|
1732
|
-
else {
|
|
1733
|
-
throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}. Please run prepare_dataset first.`);
|
|
1935
|
+
case "prepare_dataset": {
|
|
1936
|
+
hydrateExternalKeys();
|
|
1937
|
+
const query = String(request.params.arguments?.query);
|
|
1938
|
+
const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
|
|
1939
|
+
const downloadImages = request.params.arguments?.download_images === true;
|
|
1940
|
+
if (!query || query === "undefined") {
|
|
1941
|
+
throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
|
|
1734
1942
|
}
|
|
1735
|
-
|
|
1736
|
-
const report = await qualityAnalyzer.analyze(filePath);
|
|
1737
|
-
// Phase 1: Target Detection
|
|
1738
|
-
// We use the same TargetDetector instance inside CleaningPlanner now?
|
|
1739
|
-
// Actually, we instantiated it inside CleaningPlanner, so we just need to pass the file path
|
|
1740
|
-
// OR let the planner handle it if we update its signature to accept filePath.
|
|
1741
|
-
// Let's check `CleaningPlanner.generatePlan` signature again.
|
|
1742
|
-
// We updated it to accept `targetInfo`.
|
|
1743
|
-
// So we need to run detection HERE and pass it.
|
|
1744
|
-
// But `TargetDetector` is not exposed in `index.ts` scope yet.
|
|
1745
|
-
// Let's create a global instance or use the one inside planner if exposed (it's private).
|
|
1746
|
-
// Better approach: Instantiate TargetDetector here in index.ts for the tool content.
|
|
1747
|
-
// Quick fix: Instantiate local detector or make global.
|
|
1748
|
-
// I'll make a global `targetDetector` constant in index.ts
|
|
1749
|
-
// But wait, I updated `CleaningPlanner` to instantiate its own detector.
|
|
1750
|
-
// Does `CleaningPlanner` use it? No, I commented out the logic because it needed `filePath`.
|
|
1751
|
-
// RETRY STRATEGY:
|
|
1752
|
-
// 1. Instantiate `targetDetector` in `index.ts`.
|
|
1753
|
-
// 2. Run `detectTarget(filePath)`.
|
|
1754
|
-
// 3. Pass result to `cleaningPlanner.generatePlan(..., targetInfo)`.
|
|
1755
|
-
// I need to add `const targetDetector = new TargetDetector(__dirname);` to imports/init section first.
|
|
1756
|
-
// But since I'm in this tool, I can't look back.
|
|
1757
|
-
// I will assume I can add it, or just do it inside the case for now.
|
|
1758
|
-
- // To do it properly, I should have added `targetDetector` to the global scope in previous step.
- // Let's do that in a separate step if needed.
- // For now, I'll instantiate it here.
- const { TargetDetector } = await import("./preparation/target-detector.js");
- const detector = new TargetDetector(__dirname);
- const targetResult = await detector.detectTarget(filePath);
- const targetInfo = targetResult.target_column ? {
- target: targetResult.target_column,
- confidence: targetResult.confidence
- } : undefined;
- const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
- let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
- if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
- explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
- explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
- }
- explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
- if (plan.operations.length === 0) {
- explanation += "No cleaning operations required.";
- }
- else {
- plan.operations.forEach((op, i) => {
- explanation += `${i + 1}. **${op.type}**: ${op.reason}\n`;
- });
- }
- return {
- content: [{ type: "text", text: explanation }]
- };
- }
- case "custom_clean": {
- const datasetId = String(request.params.arguments?.dataset_id);
- const ops = request.params.arguments?.operations;
- if (!datasetId || datasetId === "undefined") {
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
- }
- if (!ops || !Array.isArray(ops) || ops.length === 0) {
- throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
- }
- // Pre-check: verify dataset file exists before starting the job
- const cleanRegEntry = getRegistryEntry(datasetId);
- const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
- const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
- const cleanSafeId = toSafeDatasetPathFragment(datasetId);
- const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
- (cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
- fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
- fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
- fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
- fs.existsSync(datasetId);
- if (!cleanDataExists) {
+ const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
  return {
- content: [{ type: "text", text: `
- isError: true,
+ content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
  };
  }
[12 blank lines removed]
- }
- const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
- return {
- content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
- };
- }
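
The removed custom_clean pre-check above probes a chain of locations in order: the registry entry, the download-status record, the raw-data directory for each supported extension, and finally the id itself as a literal path. The new code visible later in this hunk consolidates that into resolveDatasetLocalPath. A minimal standalone sketch of the same fallback chain — the registry/status object shapes here are assumptions inferred from this diff, not the package's actual internals:

```js
// Sketch only: the fallback chain from the removed pre-check, as one helper.
const fs = require("fs");
const path = require("path");

function resolveLocalPathSketch(datasetId, { registryEntry, downloadStatus, dataRoot, toSafeFragment }) {
  const candidates = [
    registryEntry?.local_path,
    registryEntry?.path,
    downloadStatus?.local_path,
    // raw downloads keyed by a filesystem-safe fragment of the dataset id
    ...["parquet", "csv", "feather"].map(ext =>
      path.join(dataRoot, "data", "raw", `${toSafeFragment(datasetId)}.${ext}`)),
    datasetId, // the id itself may already be a literal file path
  ];
  return candidates.find(p => p && fs.existsSync(p));
}
```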
- case "compare_datasets": {
- const datasetIds = request.params.arguments?.dataset_ids;
- const datasets = datasetIds.map(id => metadataStore.getDataset(id)).filter(d => !!d);
- let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
- comparison += "| :--- | " + datasets.map(() => " :---: ").join("|") + " |\n";
- comparison += "| Quality Score | " + datasets.map(d => `${d.quality_score}/100`).join(" | ") + " |\n";
- comparison += "| License | " + datasets.map(d => d.license.category).join(" | ") + " |\n";
- comparison += "| Downloads | " + datasets.map(d => d.downloads).join(" | ") + " |\n";
- comparison += "| Domain | " + datasets.map(d => d.domain).join(" | ") + " |\n";
- return {
- content: [{ type: "text", text: comparison }]
- };
- }
- case "check_job_status": {
- const jobId = String(request.params.arguments?.job_id);
- const job = metadataStore.getJob(jobId);
- if (!job) {
- throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
+ case "compare_datasets": {
+ const datasetIds = request.params.arguments?.dataset_ids;
+ const datasets = datasetIds.map(id => metadataStore.getDataset(id)).filter(d => !!d);
+ let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
+ comparison += "| :--- | " + datasets.map(() => " :---: ").join("|") + " |\n";
+ comparison += "| Quality Score | " + datasets.map(d => `${d.quality_score}/100`).join(" | ") + " |\n";
+ comparison += "| License | " + datasets.map(d => d.license.category).join(" | ") + " |\n";
+ comparison += "| Downloads | " + datasets.map(d => d.downloads).join(" | ") + " |\n";
+ comparison += "| Domain | " + datasets.map(d => d.domain).join(" | ") + " |\n";
+ return {
+ content: [{ type: "text", text: comparison }]
+ };
  }
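
The compare_datasets handler is unchanged in substance; it only moves within the file. For reference, a hypothetical MCP tools/call payload exercising it — the dataset ids are placeholders:

```js
// Hypothetical tools/call payload for compare_datasets (ids are placeholders).
// The handler renders a Markdown table of quality score, license, downloads, and domain.
const compareRequest = {
  method: "tools/call",
  params: {
    name: "compare_datasets",
    arguments: { dataset_ids: ["dataset-a", "dataset-b"] },
  },
};
```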
[6 blank lines removed]
+ case "check_job_status": {
+ const jobId = String(request.params.arguments?.job_id);
+ const job = metadataStore.getJob(jobId);
+ if (!job) {
+ throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
+ }
+ const activeStatuses = new Set(["pending", "queued", "running", "retrying"]);
+ const now = Date.now();
+ const last = jobStatusLastPoll[jobId] || 0;
+ const minPollMs = 3000;
+ if (activeStatuses.has(job.status) && (now - last) < minPollMs) {
+ const waitMs = minPollMs - (now - last);
+ return {
+ content: [{ type: "text", text: `No significant status change yet. Please poll again in ~${Math.ceil(waitMs / 1000)}s.` }]
+ };
+ }
+ jobStatusLastPoll[jobId] = now;
  return {
- content: [{ type: "text", text:
+ content: [{ type: "text", text: formatJobStatus(job) }]
  };
  }
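
The rewritten check_job_status handler rate-limits polling: while a job is in an active state, repeat calls within 3 seconds get a "poll again later" hint instead of a fresh status. The same pattern in isolation, as a minimal sketch — `lastPoll` and `ACTIVE` mirror the handler's jobStatusLastPoll map and activeStatuses set:

```js
// Minimal sketch of the polling throttle used above.
const lastPoll = {};
const MIN_POLL_MS = 3000;
const ACTIVE = new Set(["pending", "queued", "running", "retrying"]);

function shouldReturnCachedHint(jobId, status, now = Date.now()) {
  const elapsed = now - (lastPoll[jobId] || 0);
  if (ACTIVE.has(status) && elapsed < MIN_POLL_MS) {
    // Too soon: tell the caller when to come back instead of recomputing status.
    return { throttled: true, retryInSeconds: Math.ceil((MIN_POLL_MS - elapsed) / 1000) };
  }
  lastPoll[jobId] = now;
  return { throttled: false };
}
```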
[19 blank lines removed]
+ case "export_dataset": {
+ const datasetId = String(request.params.arguments?.dataset_id);
+ const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments?.target_dir) : undefined;
+ const requestedFormat = String(request.params.arguments?.format || "feather");
+ const fastMode = request.params.arguments?.fast === true;
+ const preview = request.params.arguments?.preview === true;
+ const sampleRows = request.params.arguments?.sample_rows ? Number(request.params.arguments.sample_rows) : undefined;
+ const columns = request.params.arguments?.columns;
+ const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
+ // Use Metadata or Registry to find the actual local file
+ let sourcePath = resolveDatasetLocalPath(datasetId);
+ if (!sourcePath) {
+ console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
+ // Start a prepare job for this dataset id (acts like calling prepare_dataset)
+ try {
+ jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false });
+ }
+ catch (e) {
+ console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
+ }
+ // Poll for download status or registry entry until local_path appears or timeout
+ const wait = (ms) => new Promise(res => setTimeout(res, ms));
+ const maxWait = 120_000; // 120s
+ const interval = 2000;
+ let waited = 0;
+ while (waited < maxWait) {
+ const resolved = resolveDatasetLocalPath(datasetId);
+ if (resolved) {
+ sourcePath = resolved;
+ console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
+ break;
+ }
+ await wait(interval);
+ waited += interval;
+ }
+ // If still no sourcePath, return helpful error listing prepared datasets
+ if (!sourcePath) {
+ const entries = readRegistry();
+ const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id || e.id || "unknown"}: ${e.local_path || e.path || "unknown"}`).slice(0, 10).join("\n") + (entries.length > 10 ? "\n...and " + (entries.length - 10) + " more" : "");
+ return {
+ content: [{ type: "text", text: `ERROR: No local data found for ${datasetId} after attempting prepare. Check credentials and try running prepare_dataset manually. Prepared datasets:\n${listText}` }],
+ isError: true
+ };
+ }
+ }
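
export_dataset now self-heals: when no local file exists it kicks off a prepare job, then polls resolveDatasetLocalPath every 2 s for up to 120 s before giving up with a list of datasets that are available. The wait loop generalizes to a small helper; this is an assumption-level rewrite for illustration, not code from the package:

```js
// Generalized form of the wait loop above: poll `probe` every `intervalMs`
// until it returns a truthy value or `timeoutMs` elapses. Sketch only.
async function pollUntil(probe, { intervalMs = 2000, timeoutMs = 120_000 } = {}) {
  const wait = (ms) => new Promise(res => setTimeout(res, ms));
  for (let waited = 0; waited < timeoutMs; waited += intervalMs) {
    const value = await probe();
    if (value) return value;
    await wait(intervalMs);
  }
  return undefined; // caller decides how to report the timeout
}

// Usage mirroring the handler:
// const sourcePath = await pollUntil(() => resolveDatasetLocalPath(datasetId));
```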
+ sourcePath = ensureExportableLocalPath(sourcePath);
  try {
-
+ upsertRegistry(datasetId, sourcePath, "completed");
  }
  catch (e) {
- console.error(`[
- }
- //
[8 blank lines removed]
- console.error(`[Export]
-
+ console.error(`[Registry] Failed to normalize registry path for ${datasetId}: ${e?.message || e}`);
+ }
+ // If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
+ if (!fastMode) {
+ const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
+ const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
+ const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
+ if (!pipelineCompatibleInput) {
+ console.error(`[Export] Skipping pipeline for ${currentExt} input; using direct exporter conversion.`);
+ }
+ else if (currentExt !== pipelineFmt) {
+ console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
+ try {
+ const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
+ if (pipelineResult.final_output_path) {
+ sourcePath = pipelineResult.final_output_path;
+ try {
+ // Update registry to point to pipeline's final output
+ upsertRegistry(datasetId, sourcePath, "completed");
+ }
+ catch (e) {
+ console.error(`[Registry] Failed to update registry for ${datasetId}: ${e?.message || e}`);
+ }
+ }
+ }
+ catch (err) {
+ console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
+ }
  }
- await wait(interval);
- waited += interval;
  }
[4 blank lines removed]
+ else {
+ console.error(`[Export] Fast mode - skipping quality analysis and cleaning`);
+ }
+ // Build export options
+ const exportOpts = {};
+ if (compression)
+ exportOpts.compression = compression;
+ if (preview)
+ exportOpts.preview = true;
+ if (sampleRows)
+ exportOpts.sample_rows = sampleRows;
+ if (columns)
+ exportOpts.columns = columns;
+ try {
+ // Determine output file name
+ const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
+ const ext = extMap[requestedFormat] || ".feather";
+ const safeName = toSafeDatasetPathFragment(datasetId);
+ const outDir = targetDir || path.join(dataRoot, "exports");
+ if (!fs.existsSync(outDir))
+ fs.mkdirSync(outDir, { recursive: true });
+ const outputFile = path.join(outDir, `${safeName}${ext}`);
+ const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
+ // Build rich response
+ let msg = `**Export complete**\n`;
+ msg += `- **File**: ${result.output_path}\n`;
+ msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
+ msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
+ if (result.file_size_mb !== undefined)
+ msg += `- **Size**: ${result.file_size_mb} MB\n`;
+ if (result.elapsed_seconds !== undefined)
+ msg += `- **Time**: ${result.elapsed_seconds}s\n`;
+ if (result.preview_path)
+ msg += `- **Preview**: ${result.preview_path}\n`;
+ msg += `\n`;
+ if (requestedFormat === "feather") {
+ msg += `**Inspect with:**\n`;
+ msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
+ msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
+ }
+ else if (requestedFormat === "parquet") {
+ msg += `**Inspect with:**\n`;
+ msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
+ msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
+ }
+ return { content: [{ type: "text", text: msg }] };
+ }
+ catch (error) {
  return {
- content: [{ type: "text", text: `ERROR:
+ content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
  isError: true
  };
  }
  }
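
A hypothetical tools/call payload exercising the export options above — the argument names are taken from the handler; the id, directory, columns, and compression value are placeholders:

```js
// Hypothetical tools/call payload for export_dataset.
const exportRequest = {
  method: "tools/call",
  params: {
    name: "export_dataset",
    arguments: {
      dataset_id: "some-dataset-id",
      format: "parquet",           // feather | parquet | csv | jsonl | arrow | tfrecord
      fast: false,                 // true skips the quality/cleaning pipeline
      compression: "zstd",         // forwarded to the exporter if set (value is a placeholder)
      preview: true,               // also write a small preview file
      sample_rows: 1000,           // export only a sample
      columns: ["col_a", "col_b"], // column subset
      target_dir: "/tmp/exports",  // defaults to <dataRoot>/exports
    },
  },
};
```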
[9 blank lines removed]
- const
- const
- const
[8 blank lines removed]
- sourcePath = pipelineResult.final_output_path;
- try {
- // Update registry to point to pipeline's final output
- upsertRegistry(datasetId, sourcePath, "completed");
- }
- catch (e) {
- console.error(`[Registry] Failed to update registry for ${datasetId}: ${e?.message || e}`);
- }
- }
+ case "fuse_datasets": {
+ const rawSources = request.params.arguments?.sources;
+ if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
+ throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
+ }
+ const strategy = request.params.arguments?.strategy || "concat";
+ const joinOn = request.params.arguments?.join_on;
+ const how = request.params.arguments?.how || "inner";
+ const dedup = request.params.arguments?.dedup !== false;
+ const runQualityAfter = request.params.arguments?.run_quality_after !== false;
+ const leakageCheck = request.params.arguments?.leakage_check !== false;
+ const outputFormat = request.params.arguments?.output_format || "feather";
+ const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
+ const preview = request.params.arguments?.preview !== false;
+ const resolvedPaths = [];
+ const unresolved = [];
+ for (const src of rawSources) {
+ if (fs.existsSync(src)) {
+ resolvedPaths.push(src);
+ continue;
  }
[2 blank lines removed]
+ const status = metadataStore.getDownloadStatus(src);
+ if (status?.local_path && fs.existsSync(status.local_path)) {
+ resolvedPaths.push(status.local_path);
+ continue;
  }
+ unresolved.push(src);
  }
[8 blank lines removed]
- if (preview)
- exportOpts.preview = true;
- if (sampleRows)
- exportOpts.sample_rows = sampleRows;
- if (columns)
- exportOpts.columns = columns;
- try {
- // Determine output file name
- const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
- const ext = extMap[requestedFormat] || ".feather";
- const safeName = toSafeDatasetPathFragment(datasetId);
- const outDir = targetDir || path.join(dataRoot, "exports");
- if (!fs.existsSync(outDir))
- fs.mkdirSync(outDir, { recursive: true });
- const outputFile = path.join(outDir, `${safeName}${ext}`);
- const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
- // Build rich response
- let msg = `**Export complete**\n`;
- msg += `- **File**: ${result.output_path}\n`;
- msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
- msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
- if (result.file_size_mb !== undefined)
- msg += `- **Size**: ${result.file_size_mb} MB\n`;
- if (result.elapsed_seconds !== undefined)
- msg += `- **Time**: ${result.elapsed_seconds}s\n`;
- if (result.preview_path)
- msg += `- **Preview**: ${result.preview_path}\n`;
- msg += `\n`;
- if (requestedFormat === "feather") {
- msg += `**Inspect with:**\n`;
- msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
- msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
- }
- else if (requestedFormat === "parquet") {
- msg += `**Inspect with:**\n`;
- msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
- msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
- }
- return { content: [{ type: "text", text: msg }] };
- }
- catch (error) {
- return {
- content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
- isError: true
- };
- }
- }
- case "fuse_datasets": {
- const rawSources = request.params.arguments?.sources;
- if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
- throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
- }
- const strategy = request.params.arguments?.strategy || "concat";
- const joinOn = request.params.arguments?.join_on;
- const how = request.params.arguments?.how || "inner";
- const dedup = request.params.arguments?.dedup !== false;
- const runQualityAfter = request.params.arguments?.run_quality_after !== false;
- const leakageCheck = request.params.arguments?.leakage_check !== false;
- const outputFormat = request.params.arguments?.output_format || "feather";
- const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
- const preview = request.params.arguments?.preview !== false;
- const resolvedPaths = [];
- const unresolved = [];
- for (const src of rawSources) {
- if (fs.existsSync(src)) {
- resolvedPaths.push(src);
- continue;
+ if (unresolved.length > 0) {
+ return {
+ content: [{
+ type: "text",
+ text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
+ }],
+ isError: true
+ };
  }
[4 blank lines removed]
+ try {
+ const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
+ const ext = extMap[outputFormat] || ".feather";
+ const outDir = path.join(dataRoot, "fusion");
+ if (!fs.existsSync(outDir))
+ fs.mkdirSync(outDir, { recursive: true });
+ const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
+ const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
+ strategy,
+ join_on: joinOn,
+ how,
+ dedup,
+ run_quality_after: runQualityAfter,
+ leakage_check: leakageCheck,
+ output_format: outputFormat,
+ compression: compression,
+ preview,
+ });
+ const nullDelta = result.stats.null_delta;
+ const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
+ // Register fused dataset under a generated id so users can export it easily
+ const fusedId = `fused_${Date.now()}`;
+ try {
+ upsertRegistry(fusedId, result.output_path, "completed");
+ }
+ catch (e) {
+ console.error(`[Registry] Failed to register fused dataset ${fusedId}: ${e?.message || e}`);
+ }
+ let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
+ msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
+ msg += `- Null change: ${nullText}\n`;
+ msg += `- Output: ${result.output_path}\n`;
+ if (result.preview_path)
+ msg += `- Preview: ${result.preview_path}\n`;
+ if (result.leakage_report) {
+ msg += `- Leakage: ${result.leakage_report.leakage_detected ? "detected" : "none"}`;
+ if (result.leakage_report.leakage_count) {
+ msg += ` (${result.leakage_report.leakage_count})`;
+ }
+ msg += "\n";
+ }
+ msg += `\nNext: run split_dataset/export_dataset on fused output. Registered fused dataset id: ${fusedId}`;
+ return { content: [{ type: "text", text: msg }] };
+ }
+ catch (error) {
+ return {
+ content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
+ isError: true
+ };
  }
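
A hypothetical tools/call payload for fuse_datasets. Note the defaults in the handler above: dedup, run_quality_after, leakage_check, and preview are on unless explicitly set to false, strategy defaults to "concat", and how defaults to "inner". Sources and column names here are placeholders:

```js
// Hypothetical tools/call payload for fuse_datasets. Sources may be prepared
// dataset ids or literal local paths.
const fuseRequest = {
  method: "tools/call",
  params: {
    name: "fuse_datasets",
    arguments: {
      sources: ["dataset-a", "/data/raw/dataset_b.parquet"],
      strategy: "join",         // default "concat"
      join_on: "user_id",       // only meaningful for joins
      how: "left",              // default "inner"
      leakage_check: true,      // already the default; shown for clarity
      output_format: "feather", // feather | parquet | csv | jsonl | arrow
    },
  },
};
```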
- unresolved.push(src);
- }
- if (unresolved.length > 0) {
- return {
- content: [{
- type: "text",
- text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
- }],
- isError: true
- };
  }
-
- const
[2 blank lines removed]
- if (!fs.existsSync(outDir))
- fs.mkdirSync(outDir, { recursive: true });
- const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
- const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
- strategy,
- join_on: joinOn,
- how,
- dedup,
- run_quality_after: runQualityAfter,
- leakage_check: leakageCheck,
- output_format: outputFormat,
- compression: compression,
- preview,
- });
- const nullDelta = result.stats.null_delta;
- const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
- // Register fused dataset under a generated id so users can export it easily
- const fusedId = `fused_${Date.now()}`;
- try {
- upsertRegistry(fusedId, result.output_path, "completed");
+ case "analyze_image_quality": {
+ const inputPath = String(request.params.arguments?.path);
+ if (!fs.existsSync(inputPath)) {
+ throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
  }
[13 blank lines removed]
+ try {
+ const report = await imageAnalyzer.analyze(inputPath);
+ let output = `## Image Quality Report\n\n`;
+ output += `- **Total Images**: ${report.total_images}\n`;
+ output += `- **Corrupted**: ${report.corrupted_count}\n`;
+ output += `- **Average Resolution**: ${Math.round(report.average_width)}x${Math.round(report.average_height)}\n`;
+ output += `- **Blurry Images**: ${report.blurry_count}\n\n`;
+ if (report.individual_results.length > 0) {
+ output += `### Sample Detail (Top 5)\n`;
+ report.individual_results.slice(0, 5).forEach(img => {
+ const statusLabel = img.status === "ok" ? (img.is_blurry ? "[warn]" : "[ok]") : "[error]";
+ output += `${statusLabel} **${img.filename}**: ${img.width}x${img.height} | Mode: ${img.mode} | Blur: ${img.blur_score?.toFixed(1)}\n`;
+ });
  }
-
+ return {
+ content: [{ type: "text", text: output }]
+ };
+ }
+ catch (error) {
+ return {
+ content: [{ type: "text", text: `ERROR: Image analysis failed: ${error.message}` }],
+ isError: true
+ };
  }
- msg += `\nNext: run split_dataset/export_dataset on fused output. Registered fused dataset id: ${fusedId}`;
- return { content: [{ type: "text", text: msg }] };
- }
- catch (error) {
- return {
- content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
- isError: true
- };
- }
- }
- case "analyze_image_quality": {
- const inputPath = String(request.params.arguments?.path);
- if (!fs.existsSync(inputPath)) {
- throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
  }
-
- const
[6 blank lines removed]
- output +=
[3 blank lines removed]
+ case "analyze_media_quality": {
+ const inputPath = String(request.params.arguments?.path);
+ if (!fs.existsSync(inputPath)) {
+ throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
+ }
+ try {
+ const report = await mediaAnalyzer.analyze(inputPath);
+ let output = `## Media Quality Report\n\n`;
+ output += `- **Total Files**: ${report.total_files}\n`;
+ output += `- **OK Files**: ${report.ok_files}\n`;
+ output += `- **Failed Files**: ${report.failed_files}\n`;
+ if ('avg_audio_duration' in report && report.avg_audio_duration) {
+ output += `- **Average Audio Duration**: ${report.avg_audio_duration.toFixed(2)}s\n`;
+ }
+ if ('avg_video_duration' in report && report.avg_video_duration) {
+ output += `- **Average Video Duration**: ${report.avg_video_duration.toFixed(2)}s\n`;
+ output += `- **Average FPS**: ${report.avg_fps?.toFixed(1)}\n`;
+ }
+ output += `\n### Sample Detail (Top 5)\n`;
+ report.details.slice(0, 5).forEach(item => {
+ const statusLabel = item.status === "ok" ? "[ok]" : "[error]";
+ if (item.type === "audio" && 'sample_rate' in item) {
+ output += `${statusLabel} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
+ }
+ else if (item.type === "video" && 'width' in item) {
+ output += `${statusLabel} **${item.filename}**: ${item.width}x${item.height} | ${item.fps?.toFixed(1)}fps | Risk: ${item.corruption_risk}\n`;
+ }
+ else {
+ output += `${statusLabel} **${item.filename}**: ${item.error}\n`;
+ }
  });
+ return {
+ content: [{ type: "text", text: output }]
+ };
+ }
+ catch (error) {
+ return {
+ content: [{ type: "text", text: `ERROR: Media analysis failed: ${error.message}` }],
+ isError: true
+ };
  }
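
Both analyzers take a single path argument pointing at a file or directory and reject paths that do not exist. A hypothetical pair of calls — the paths are placeholders:

```js
// Hypothetical tools/call payloads for the two analyzers added above.
const imageCheck = {
  method: "tools/call",
  params: { name: "analyze_image_quality", arguments: { path: "/data/images" } },
};
const mediaCheck = {
  method: "tools/call",
  params: { name: "analyze_media_quality", arguments: { path: "/data/clips" } },
};
```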
- return {
- content: [{ type: "text", text: output }]
- };
- }
- catch (error) {
- return {
- content: [{ type: "text", text: `ERROR: Image analysis failed: ${error.message}` }],
- isError: true
- };
- }
- }
- case "analyze_media_quality": {
- const inputPath = String(request.params.arguments?.path);
- if (!fs.existsSync(inputPath)) {
- throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
  }
-
- const
[14 blank lines removed]
- if (item.type === "audio" && 'sample_rate' in item) {
- output += `${statusLabel} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
+ case "generate_quality_report": {
+ const datasetId = String(request.params.arguments?.dataset_id);
+ const datasetPath = String(request.params.arguments?.dataset_path);
+ if (!fs.existsSync(datasetPath)) {
+ throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
+ }
+ try {
+ // Optionally load text quality from metadata if available
+ const metadata = await metadataStore.getDataset(datasetId);
+ // TODO: Integrate text quality analysis when available
+ const textQuality = null;
+ const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
+ // Save report to metadata
+ if (metadata) {
+ metadata.unified_quality_report = report;
+ await metadataStore.saveDataset(metadata);
  }
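
generate_quality_report persists its result onto the dataset's metadata record as unified_quality_report, so later calls can reuse it without re-scanning. A sketch of that read-back path, using only the metadataStore and qualityOrchestrator calls visible above; the caching policy itself is an assumption, not something the package states:

```js
// Sketch: reuse a previously saved unified quality report if one exists.
async function getOrGenerateReport(datasetId, datasetPath, metadataStore, qualityOrchestrator) {
  const metadata = await metadataStore.getDataset(datasetId);
  if (metadata?.unified_quality_report) {
    return metadata.unified_quality_report; // cached from a prior run
  }
  return qualityOrchestrator.generateReport(datasetId, datasetPath, null);
}
```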
[2 blank lines removed]
+ let output = `# Unified Quality Report\n\n`;
+ output += `**Dataset**: ${datasetId}\n`;
+ output += `**Modalities**: ${report.modalities.join(", ")}\n`;
+ output += `**Overall Quality Score**: ${report.overall_quality_score}/100\n\n`;
+ if (report.text_quality) {
+ output += `## Text Quality\n`;
+ output += `- Rows: ${report.text_quality.row_count}\n`;
+ output += `- Columns: ${report.text_quality.column_count}\n`;
+ output += `- Missing: ${report.text_quality.missing_percentage.toFixed(1)}%\n`;
+ output += `- Duplicates: ${report.text_quality.duplicate_percentage.toFixed(1)}%\n\n`;
  }
-
- output +=
+ if (report.image_quality) {
+ output += `## Image Quality\n`;
+ output += `- Total Images: ${report.image_quality.total_images}\n`;
+ output += `- Corrupted: ${report.image_quality.corrupted_count}\n`;
+ output += `- Avg Resolution: ${report.image_quality.avg_resolution}\n`;
+ output += `- Blurry: ${report.image_quality.blurry_percentage.toFixed(1)}%\n\n`;
  }
[28 blank lines removed]
- }
- let output = `# Unified Quality Report\n\n`;
- output += `**Dataset**: ${datasetId}\n`;
- output += `**Modalities**: ${report.modalities.join(", ")}\n`;
- output += `**Overall Quality Score**: ${report.overall_quality_score}/100\n\n`;
- if (report.text_quality) {
- output += `## Text Quality\n`;
- output += `- Rows: ${report.text_quality.row_count}\n`;
- output += `- Columns: ${report.text_quality.column_count}\n`;
- output += `- Missing: ${report.text_quality.missing_percentage.toFixed(1)}%\n`;
- output += `- Duplicates: ${report.text_quality.duplicate_percentage.toFixed(1)}%\n\n`;
- }
- if (report.image_quality) {
- output += `## Image Quality\n`;
- output += `- Total Images: ${report.image_quality.total_images}\n`;
- output += `- Corrupted: ${report.image_quality.corrupted_count}\n`;
- output += `- Avg Resolution: ${report.image_quality.avg_resolution}\n`;
- output += `- Blurry: ${report.image_quality.blurry_percentage.toFixed(1)}%\n\n`;
- }
- if (report.audio_quality) {
- output += `## Audio Quality\n`;
- output += `- Total Files: ${report.audio_quality.total_files}\n`;
- output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
- output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
- output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
- }
- if (report.video_quality) {
- output += `## Video Quality\n`;
- output += `- Total Files: ${report.video_quality.total_files}\n`;
- output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
- output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
- output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
- }
- output += `## Recommendations\n`;
- report.recommendations.forEach(rec => {
- output += `- ${rec}\n`;
- });
- return {
- content: [{ type: "text", text: output }]
- };
- }
- catch (error) {
- return {
- content: [{ type: "text", text: `ERROR: Quality report generation failed: ${error.message}` }],
- isError: true
- };
+ if (report.audio_quality) {
+ output += `## Audio Quality\n`;
+ output += `- Total Files: ${report.audio_quality.total_files}\n`;
+ output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
+ output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
+ output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
+ }
+ if (report.video_quality) {
+ output += `## Video Quality\n`;
+ output += `- Total Files: ${report.video_quality.total_files}\n`;
+ output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
+ output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
+ output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
+ }
+ output += `## Recommendations\n`;
+ report.recommendations.forEach(rec => {
+ output += `- ${rec}\n`;
+ });
+ return {
+ content: [{ type: "text", text: output }]
+ };
+ }
+ catch (error) {
+ return {
+ content: [{ type: "text", text: `ERROR: Quality report generation failed: ${error.message}` }],
+ isError: true
+ };
+ }
  }
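
Unlike the other tools, generate_quality_report takes both an id (to link the report to metadata) and an explicit local path (to scan). A hypothetical call — id and path are placeholders:

```js
// Hypothetical tools/call payload for generate_quality_report.
// dataset_path must exist locally; dataset_id ties the report to stored metadata.
const reportRequest = {
  method: "tools/call",
  params: {
    name: "generate_quality_report",
    arguments: { dataset_id: "some-dataset-id", dataset_path: "/data/raw/some_dataset.parquet" },
  },
};
```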
+ default:
+ throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
  }
-
- throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
- }
+ }); // end requestQueue.enqueue
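
The closing `}); // end requestQueue.enqueue` line points at the structural change behind most of this hunk's churn: the entire tool switch is now a task submitted to a request queue, so tool calls execute one at a time instead of concurrently. A sketch of what that wrapping presumably looks like — the identifiers come from this diff, but the handler-registration shape is an assumption, not quoted from the package:

```js
// Sketch of the wrapping implied above: the CallTool switch body becomes a
// task the queue runs serially. Registration shape is assumed, not quoted.
server.setRequestHandler(CallToolRequestSchema, async (request) =>
  requestQueue.enqueue(async () => {
    switch (request.params.name) {
      // ... per-tool cases, e.g. "export_dataset", "fuse_datasets" ...
      default:
        throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
    }
  })
);
```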
  });
  async function main() {
  const args = process.argv.slice(2);