vesper-wizard 2.0.8 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -266,6 +266,34 @@ function logError(err, context) {
266
266
  fs.appendFileSync(errorLogPath, msg);
267
267
  console.error(`[Vesper] Critical error logged to ${errorLogPath}`);
268
268
  }
269
// --- Request Queue: serialize all MCP tool calls to prevent crashes ---
/**
 * First-in, first-out runner that executes queued tasks strictly one at a time.
 *
 * `queue` holds the entries that have not started yet and `running` is true
 * while a drain loop is active. A task added while the queue is idle starts
 * synchronously inside `enqueue`.
 */
class RequestQueue {
    queue = [];
    running = false;

    /**
     * Adds a task to the back of the queue.
     *
     * @param {Function} task - Called with no arguments; may return a value or a promise.
     * @returns {Promise<*>} Settles with the task's own result or failure once its turn has run.
     */
    enqueue(task) {
        const settled = new Promise((resolve, reject) => {
            this.queue.push({ resolve, reject, task });
        });
        // The executor above runs synchronously, so the entry is already queued here.
        this.drain();
        return settled;
    }

    /**
     * Runs queued entries in order until none remain. Returns immediately when
     * another drain loop is already active, which is what keeps tasks serialized.
     *
     * @returns {Promise<void>}
     */
    async drain() {
        if (!this.running) {
            this.running = true;
            while (this.queue.length !== 0) {
                const entry = this.queue.shift();
                try {
                    entry.resolve(await entry.task());
                }
                catch (failure) {
                    // A failing task rejects only its own promise; the loop moves on.
                    entry.reject(failure);
                }
            }
            this.running = false;
        }
    }
}
const requestQueue = new RequestQueue();
269
297
  const SPINNER_FRAMES = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];
270
298
  function printLaunchScreen() {
271
299
  const screen = `
@@ -502,7 +530,45 @@ function syncPythonScripts(appRoot, dataRoot) {
502
530
  }
503
531
  // Sync scripts immediately
504
532
  syncPythonScripts(appRoot, dataRoot);
505
- const metadataStore = new MetadataStore(dbPath);
533
/**
 * Auto-rebuild better-sqlite3 if the native binary doesn't match the current
 * Node version.
 *
 * Runs `npm rebuild better-sqlite3` synchronously from the directory one level
 * above this file (60 second timeout), then removes every `require.cache`
 * entry whose key mentions better-sqlite3 so the rebuilt module is loaded on
 * the next require.
 *
 * @returns {boolean} true when the rebuild command and cache cleanup finished
 *   without throwing; false otherwise (the failure is logged, never thrown).
 */
function tryRebuildSqlite() {
    try {
        const { execSync: runSync } = require("child_process");
        const packageDir = path.resolve(__dirname, "..");
        console.error(`[Vesper] Rebuilding better-sqlite3 for Node ${process.version}...`);
        runSync("npm rebuild better-sqlite3", {
            cwd: packageDir,
            stdio: "pipe",
            timeout: 60000,
        });
        console.error("[Vesper] Rebuild succeeded. Retrying...");
        // Clear require cache so the rebuilt module is loaded
        const sqliteModulePath = /better[-_]sqlite3/;
        Object.keys(require.cache)
            .filter((cachedPath) => sqliteModulePath.test(cachedPath))
            .forEach((cachedPath) => {
                delete require.cache[cachedPath];
            });
        return true;
    }
    catch (rebuildError) {
        const reason = rebuildError?.message || rebuildError;
        console.error("[Vesper] Auto-rebuild failed: " + reason);
        return false;
    }
}
558
// Create the metadata store. When construction fails with ERR_DLOPEN_FAILED,
// attempt one automatic rebuild via tryRebuildSqlite() and retry once.
// Any other failure is rethrown unchanged.
let metadataStore;
try {
    metadataStore = new MetadataStore(dbPath);
}
catch (e) {
    const isAddonLoadFailure = e?.code === "ERR_DLOPEN_FAILED";
    if (isAddonLoadFailure && tryRebuildSqlite()) {
        // Single retry; if this throws as well, the error propagates to the caller.
        metadataStore = new MetadataStore(dbPath);
    }
    else {
        if (isAddonLoadFailure) {
            console.error("[Vesper] FATAL: Cannot load better-sqlite3.");
            console.error("[Vesper] Run: npm rebuild better-sqlite3");
        }
        else {
            // Not an addon load failure: suggesting a rebuild here would
            // misdirect the user, so report the actual cause instead.
            console.error(`[Vesper] FATAL: Failed to initialize metadata store: ${e?.message || e}`);
        }
        throw e;
    }
}
506
572
  const vectorStore = new VectorStore(vectorPath);
507
573
  const embedder = Embedder.getInstance();
508
574
  const searchEngine = new SearchEngine(metadataStore, vectorStore, embedder);
@@ -599,6 +665,18 @@ jobManager.on("processJob", async (job, execute) => {
599
665
  async function handlePrepareJob(jobId, query, requirements) {
600
666
  hydrateExternalKeys();
601
667
  const update = (updates) => jobManager.updateJob(jobId, updates);
668
+ const pipelineSteps = ["search", "validate", "download", "normalize", "quality", "register"];
669
+ const stepStatus = {};
670
+ for (const s of pipelineSteps)
671
+ stepStatus[s] = "pending";
672
+ const markPipelineStep = (step, status) => {
673
+ stepStatus[step] = status;
674
+ const summary = pipelineSteps.map(s => {
675
+ const st = stepStatus[s];
676
+ return st === "done" ? `[${s}]` : st === "running" ? `>${s}<` : st === "failed" ? `!${s}!` : st === "skipped" ? `~${s}~` : ` ${s} `;
677
+ }).join(" → ");
678
+ console.error(`[Pipeline] ${summary}`);
679
+ };
602
680
  // Ensure core Python packages are available for dataset operations
603
681
  try {
604
682
  await ensurePythonModules([
@@ -646,11 +724,14 @@ async function handlePrepareJob(jobId, query, requirements) {
646
724
  progress: 20,
647
725
  status_text: `Using explicit dataset id: ${datasetIdForDownload} (${source})`
648
726
  });
727
+ markPipelineStep("search", "skipped");
649
728
  }
650
729
  else {
730
+ markPipelineStep("search", "running");
651
731
  update({ progress: 10, status_text: "Searching for best dataset matching query..." });
652
732
  const results = await searchEngine.search(query, { limit: 10 });
653
733
  if (results.length === 0) {
734
+ markPipelineStep("search", "failed");
654
735
  throw new Error("No datasets found matching the query. Try refining your search terms.");
655
736
  }
656
737
  // Pick the best result that we can actually download (skip sources requiring missing credentials)
@@ -670,8 +751,10 @@ async function handlePrepareJob(jobId, query, requirements) {
670
751
  progress: 20,
671
752
  status_text: `Matched: ${selectedDataset.name} (${selectedDataset.source})`
672
753
  });
754
+ markPipelineStep("search", "done");
673
755
  }
674
756
  // Pre-check credentials for sources that require them
757
+ markPipelineStep("validate", "running");
675
758
  if (source === "kaggle") {
676
759
  const hasKaggleCreds = !!(process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY && process.env.KAGGLE_USERNAME !== "YOUR_KAGGLE_USERNAME");
677
760
  if (!hasKaggleCreds) {
@@ -679,8 +762,11 @@ async function handlePrepareJob(jobId, query, requirements) {
679
762
  }
680
763
  }
681
764
  if (source === "dataworld" && !hasDataWorldToken()) {
765
+ markPipelineStep("validate", "failed");
682
766
  throw new Error("data.world token not set. Use the configure_keys tool or set DW_AUTH_TOKEN environment variable.");
683
767
  }
768
+ markPipelineStep("validate", "done");
769
+ markPipelineStep("download", "running");
684
770
  update({ progress: 30, status_text: `Starting download from ${source}...` });
685
771
  // ensureData handles download and returns path to the raw file
686
772
  let rawFilePath = await dataIngestor.ensureData(datasetIdForDownload, source, (msg, prog) => {
@@ -743,15 +829,50 @@ async function handlePrepareJob(jobId, query, requirements) {
743
829
  update({ progress: 69, status_text: `Sample target met: ${currentRows.toLocaleString()} rows` });
744
830
  }
745
831
  }
832
+ markPipelineStep("download", "done");
833
+ // ── Normalize step: convert any raw format → parquet ──
834
+ markPipelineStep("normalize", "running");
835
+ const rawExt = path.extname(rawFilePath).toLowerCase();
836
+ if (rawExt !== ".parquet" && rawExt !== ".pq") {
837
+ update({ progress: 70, status_text: "Normalizing to parquet..." });
838
+ const normalizedDir = path.join(dataRoot, "data", "normalized");
839
+ if (!fs.existsSync(normalizedDir))
840
+ fs.mkdirSync(normalizedDir, { recursive: true });
841
+ const safeId = toSafeDatasetPathFragment(datasetIdForDownload);
842
+ const normalizedPath = path.join(normalizedDir, `${safeId}.parquet`);
843
+ try {
844
+ const normScript = path.join(dataRoot, "python", "normalize_engine.py");
845
+ const normResult = await runPythonJson(normScript, [rawFilePath, normalizedPath]);
846
+ if (normResult.ok && fs.existsSync(normalizedPath)) {
847
+ console.error(`[Prepare] Normalized ${rawExt} → parquet (${normResult.rows} rows)`);
848
+ rawFilePath = normalizedPath;
849
+ markPipelineStep("normalize", "done");
850
+ }
851
+ else {
852
+ console.error(`[Prepare] Normalize failed: ${normResult.error}, continuing with raw file`);
853
+ markPipelineStep("normalize", "skipped");
854
+ }
855
+ }
856
+ catch (e) {
857
+ console.error(`[Prepare] Normalize step failed: ${e?.message || e}, continuing with raw file`);
858
+ markPipelineStep("normalize", "skipped");
859
+ }
860
+ }
861
+ else {
862
+ markPipelineStep("normalize", "done");
863
+ }
746
864
  let qualityScore = selectedDataset?.quality_score ?? 70;
747
- update({ progress: 70, status_text: "Analyzing dataset quality..." });
865
+ markPipelineStep("quality", "running");
866
+ update({ progress: 75, status_text: "Analyzing dataset quality..." });
748
867
  try {
749
868
  const report = await qualityAnalyzer.analyze(rawFilePath);
750
869
  qualityScore = report.overall_score;
870
+ markPipelineStep("quality", "done");
751
871
  }
752
872
  catch (error) {
753
873
  console.error(`[Prepare] Quality analysis failed for ${datasetIdForDownload}: ${error?.message || error}`);
754
874
  update({ progress: 78, status_text: "Quality analysis skipped (unsupported schema). Continuing installation..." });
875
+ markPipelineStep("quality", "skipped");
755
876
  }
756
877
  if (selectedDataset) {
757
878
  metadataStore.saveDataset({
@@ -759,15 +880,19 @@ async function handlePrepareJob(jobId, query, requirements) {
759
880
  quality_score: qualityScore
760
881
  });
761
882
  }
883
+ markPipelineStep("register", "running");
762
884
  update({ progress: 85, status_text: "Installing dataset into project..." });
763
885
  const installPath = await installService.install(datasetIdForDownload, rawFilePath);
764
886
  update({ progress: 100, status_text: "Preparation complete!" });
765
887
  // Register prepared dataset in local registry for lookup by export/list tools
766
888
  try {
767
889
  upsertRegistry(datasetIdForDownload, installPath, "completed");
890
+ markPipelineStep("register", "done");
891
+ markStepComplete(datasetIdForDownload, "prepare");
768
892
  }
769
893
  catch (e) {
770
894
  console.error(`[Registry] Failed to write registry for ${datasetIdForDownload}: ${e?.message || e}`);
895
+ markPipelineStep("register", "failed");
771
896
  }
772
897
  return installPath;
773
898
  }
@@ -1261,110 +1386,237 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1261
1386
  ],
1262
1387
  };
1263
1388
  });
1264
- // Call Tool
1389
+ // Call Tool — all requests are serialized through a queue to prevent crashes from parallel calls
1265
1390
  server.setRequestHandler(CallToolRequestSchema, async (request) => {
1266
- // --- Pipeline Enforcement ---
1267
- // Map tool names to pipeline steps
1268
- const toolToStep = {
1269
- vesper_search: "search",
1270
- vesper_download: "download",
1271
- vesper_analyze: "analyze",
1272
- vesper_clean: "clean",
1273
- vesper_split: "split",
1274
- vesper_export: "export",
1275
- prepare_dataset: "prepare",
1276
- };
1277
- // Extract dataset_id if present and normalize
1278
- let datasetId = request.params.arguments?.dataset_id || request.params.arguments?.query || "";
1279
- if (datasetId)
1280
- datasetId = parseDatasetId(String(datasetId));
1281
- // Pipeline rules
1282
- const stepOrder = ["search", "download", "analyze", "clean", "split", "export"];
1283
- const prereqs = {
1284
- vesper_download: ["search"],
1285
- vesper_analyze: ["download"],
1286
- vesper_clean: ["analyze"],
1287
- vesper_split: ["clean"],
1288
- vesper_export: ["split"],
1289
- };
1290
- const tool = String(request.params.name);
1291
- const step = toolToStep[tool];
1292
- if (step && datasetId) {
1293
- // Check prerequisites
1294
- const required = prereqs[tool] || [];
1295
- for (const req of required) {
1296
- if (!hasStep(String(datasetId), req)) {
1297
- // Auto-run missing step if possible, else error
1298
- // For export, auto-run prepare_dataset if split missing
1299
- if (tool === "vesper_export" && req === "split") {
1300
- // Auto-trigger prepare_dataset (start a background prepare job)
1301
- try {
1302
- jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false });
1303
- // Mark split as complete so export can proceed; export handler will also wait for data if needed.
1304
- markStepComplete(String(datasetId), "split");
1391
+ return requestQueue.enqueue(async () => {
1392
+ // --- Pipeline Enforcement ---
1393
+ // Map tool names to pipeline steps
1394
+ const toolToStep = {
1395
+ vesper_search: "search",
1396
+ vesper_download: "download",
1397
+ vesper_analyze: "analyze",
1398
+ vesper_clean: "clean",
1399
+ vesper_split: "split",
1400
+ vesper_export: "export",
1401
+ prepare_dataset: "prepare",
1402
+ };
1403
+ // Extract dataset_id if present and normalize
1404
+ let datasetId = request.params.arguments?.dataset_id || request.params.arguments?.query || "";
1405
+ if (datasetId)
1406
+ datasetId = parseDatasetId(String(datasetId));
1407
+ // Pipeline rules
1408
+ const stepOrder = ["search", "download", "analyze", "clean", "split", "export"];
1409
+ const prereqs = {
1410
+ vesper_download: ["search"],
1411
+ vesper_analyze: ["download"],
1412
+ vesper_clean: ["analyze"],
1413
+ vesper_split: ["clean"],
1414
+ vesper_export: ["split"],
1415
+ };
1416
+ const tool = String(request.params.name);
1417
+ const step = toolToStep[tool];
1418
+ if (step && datasetId) {
1419
+ // Check prerequisites
1420
+ const required = prereqs[tool] || [];
1421
+ for (const req of required) {
1422
+ if (!hasStep(String(datasetId), req)) {
1423
+ // Auto-run missing step if possible, else error
1424
+ // For export, auto-run prepare_dataset if split missing
1425
+ if (tool === "vesper_export" && req === "split") {
1426
+ // Auto-trigger prepare_dataset (start a background prepare job)
1427
+ try {
1428
+ jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false });
1429
+ // Mark split as complete so export can proceed; export handler will also wait for data if needed.
1430
+ markStepComplete(String(datasetId), "split");
1431
+ }
1432
+ catch (e) {
1433
+ console.error(`[Pipeline] Failed to auto-trigger prepare for ${datasetId}: ${e?.message || e}`);
1434
+ return {
1435
+ content: [{ type: "text", text: `ERROR: Failed to auto-run prepare for ${datasetId}. Please run prepare_dataset first.` }],
1436
+ isError: true,
1437
+ };
1438
+ }
1305
1439
  }
1306
- catch (e) {
1307
- console.error(`[Pipeline] Failed to auto-trigger prepare for ${datasetId}: ${e?.message || e}`);
1440
+ else {
1308
1441
  return {
1309
- content: [{ type: "text", text: `ERROR: Failed to auto-run prepare for ${datasetId}. Please run prepare_dataset first.` }],
1442
+ content: [{ type: "text", text: `ERROR: Cannot run ${tool} before ${req}. Please run ${req} first.` }],
1310
1443
  isError: true,
1311
1444
  };
1312
1445
  }
1313
1446
  }
1314
- else {
1447
+ }
1448
+ // Mark this step as complete
1449
+ markStepComplete(String(datasetId), String(step));
1450
+ }
1451
+ switch (request.params.name) {
1452
+ case "unified_dataset_api": {
1453
+ hydrateExternalKeys();
1454
+ const operation = String(request.params.arguments?.operation || "").trim().toLowerCase();
1455
+ const source = String(request.params.arguments?.source || "auto").trim().toLowerCase();
1456
+ const includeUnavailable = request.params.arguments?.include_unavailable === true;
1457
+ const publicOnly = request.params.arguments?.public_only !== false;
1458
+ try {
1459
+ if (operation === "providers") {
1460
+ return {
1461
+ content: [{ type: "text", text: JSON.stringify({ providers: unifiedDatasetGateway.getProviderStatuses(includeUnavailable) }, null, 2) }],
1462
+ };
1463
+ }
1464
+ if (operation === "discover") {
1465
+ const query = String(request.params.arguments?.query || "").trim();
1466
+ if (!query) {
1467
+ throw new McpError(ErrorCode.InvalidParams, "query is required for operation='discover'");
1468
+ }
1469
+ const result = await unifiedDatasetGateway.discover({
1470
+ query,
1471
+ source,
1472
+ limit: Number(request.params.arguments?.limit || 10),
1473
+ publicOnly,
1474
+ });
1475
+ return {
1476
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1477
+ };
1478
+ }
1479
+ if (operation === "download") {
1480
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1481
+ if (!datasetId) {
1482
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='download'");
1483
+ }
1484
+ try {
1485
+ await ensurePythonModules([{ module: "datasets", packageName: "datasets" }]);
1486
+ }
1487
+ catch {
1488
+ // best effort; non-HF providers do not require this
1489
+ }
1490
+ const result = await unifiedDatasetGateway.download({
1491
+ datasetId,
1492
+ source,
1493
+ targetDir: request.params.arguments?.target_dir ? String(request.params.arguments.target_dir) : undefined,
1494
+ });
1495
+ try {
1496
+ upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
1497
+ }
1498
+ catch (e) {
1499
+ console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
1500
+ }
1501
+ return {
1502
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1503
+ };
1504
+ }
1505
+ if (operation === "info") {
1506
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1507
+ if (!datasetId) {
1508
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='info'");
1509
+ }
1510
+ const result = await unifiedDatasetGateway.info({
1511
+ datasetId,
1512
+ source,
1513
+ publicOnly,
1514
+ });
1515
+ return {
1516
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1517
+ };
1518
+ }
1519
+ throw new McpError(ErrorCode.InvalidParams, `Unsupported operation: ${operation}`);
1520
+ }
1521
+ catch (error) {
1315
1522
  return {
1316
- content: [{ type: "text", text: `ERROR: Cannot run ${tool} before ${req}. Please run ${req} first.` }],
1523
+ content: [{ type: "text", text: `ERROR: unified gateway failed: ${error.message}` }],
1317
1524
  isError: true,
1318
1525
  };
1319
1526
  }
1320
1527
  }
1321
- }
1322
- // Mark this step as complete
1323
- markStepComplete(String(datasetId), String(step));
1324
- }
1325
- switch (request.params.name) {
1326
- case "unified_dataset_api": {
1327
- hydrateExternalKeys();
1328
- const operation = String(request.params.arguments?.operation || "").trim().toLowerCase();
1329
- const source = String(request.params.arguments?.source || "auto").trim().toLowerCase();
1330
- const includeUnavailable = request.params.arguments?.include_unavailable === true;
1331
- const publicOnly = request.params.arguments?.public_only !== false;
1332
- try {
1333
- if (operation === "providers") {
1334
- return {
1335
- content: [{ type: "text", text: JSON.stringify({ providers: unifiedDatasetGateway.getProviderStatuses(includeUnavailable) }, null, 2) }],
1336
- };
1528
+ case "vesper_search": {
1529
+ const query = String(request.params.arguments?.query);
1530
+ const limit = 5;
1531
+ const safeOnly = true; // Enable safe filter by default
1532
+ const enableJIT = request.params.arguments?.enable_jit === true;
1533
+ if (!query) {
1534
+ throw new McpError(ErrorCode.InvalidParams, "Query is required");
1337
1535
  }
1338
- if (operation === "discover") {
1339
- const query = String(request.params.arguments?.query || "").trim();
1340
- if (!query) {
1341
- throw new McpError(ErrorCode.InvalidParams, "query is required for operation='discover'");
1342
- }
1343
- const result = await unifiedDatasetGateway.discover({
1536
+ const results = await searchEngine.search(query, { limit, safeOnly, enableJIT });
1537
+ const formattedOutput = formatSearchResults(results);
1538
+ return {
1539
+ content: [
1540
+ {
1541
+ type: "text",
1542
+ text: formattedOutput,
1543
+ },
1544
+ ],
1545
+ };
1546
+ }
1547
+ case "discover_datasets": {
1548
+ hydrateExternalKeys();
1549
+ const query = String(request.params.arguments?.query || "").trim();
1550
+ const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1551
+ const limit = Number(request.params.arguments?.limit || 10);
1552
+ if (!query) {
1553
+ throw new McpError(ErrorCode.InvalidParams, "query is required");
1554
+ }
1555
+ try {
1556
+ const gatewayResult = await unifiedDatasetGateway.discover({
1344
1557
  query,
1345
1558
  source,
1346
- limit: Number(request.params.arguments?.limit || 10),
1347
- publicOnly,
1559
+ limit,
1560
+ publicOnly: false,
1348
1561
  });
1562
+ const results = gatewayResult.results;
1563
+ const recipeScript = path.join(dataRoot, "python", "asset_downloader_engine.py");
1564
+ for (const ds of results.slice(0, limit)) {
1565
+ const info = {
1566
+ dataset_id: ds.id,
1567
+ id: ds.id,
1568
+ source: ds.source,
1569
+ repo_id: ds.id,
1570
+ total_images: ds.total_examples || 0,
1571
+ image_column: undefined,
1572
+ recipes_dir: path.join(dataRoot, "recipes"),
1573
+ };
1574
+ try {
1575
+ await runPythonJson(recipeScript, ["build_recipe", JSON.stringify(info)]);
1576
+ }
1577
+ catch {
1578
+ // best-effort recipe generation; ignore discovery-time recipe failures
1579
+ }
1580
+ }
1581
+ const formattedOutput = formatSearchResults(results.slice(0, limit));
1582
+ const noteBlock = gatewayResult.notes.length > 0
1583
+ ? `\nGateway notes:\n- ${gatewayResult.notes.join("\n- ")}`
1584
+ : "";
1349
1585
  return {
1350
- content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1586
+ content: [{ type: "text", text: `${formattedOutput}${noteBlock}` }]
1351
1587
  };
1352
1588
  }
1353
- if (operation === "download") {
1354
- const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1355
- if (!datasetId) {
1356
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='download'");
1357
- }
1589
+ catch (error) {
1590
+ return {
1591
+ content: [{ type: "text", text: `ERROR: discover failed: ${error.message}` }],
1592
+ isError: true,
1593
+ };
1594
+ }
1595
+ }
1596
+ case "download_dataset": {
1597
+ hydrateExternalKeys();
1598
+ const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1599
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1600
+ const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments.target_dir) : undefined;
1601
+ if (!datasetId) {
1602
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1603
+ }
1604
+ // Pre-install Python datasets library for HuggingFace fallback
1605
+ if (source === "huggingface") {
1358
1606
  try {
1359
- await ensurePythonModules([{ module: "datasets", packageName: "datasets" }]);
1607
+ await ensurePythonModules([
1608
+ { module: "datasets", packageName: "datasets" },
1609
+ ]);
1360
1610
  }
1361
1611
  catch {
1362
- // best effort; non-HF providers do not require this
1612
+ // Continue - direct download may still work
1363
1613
  }
1614
+ }
1615
+ try {
1364
1616
  const result = await unifiedDatasetGateway.download({
1365
1617
  datasetId,
1366
1618
  source,
1367
- targetDir: request.params.arguments?.target_dir ? String(request.params.arguments.target_dir) : undefined,
1619
+ targetDir,
1368
1620
  });
1369
1621
  try {
1370
1622
  upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
@@ -1372,857 +1624,761 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1372
1624
  catch (e) {
1373
1625
  console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
1374
1626
  }
1627
+ const noteBlock = result.notes.length > 0 ? `\nNotes:\n- ${result.notes.join("\n- ")}` : "";
1375
1628
  return {
1376
- content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1629
+ content: [{ type: "text", text: `Download complete: ${result.copied_to || result.local_path}${noteBlock}` }]
1377
1630
  };
1378
1631
  }
1379
- if (operation === "info") {
1380
- const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1381
- if (!datasetId) {
1382
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='info'");
1383
- }
1384
- const result = await unifiedDatasetGateway.info({
1385
- datasetId,
1386
- source,
1387
- publicOnly,
1388
- });
1632
+ catch (error) {
1389
1633
  return {
1390
- content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1634
+ content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
1635
+ isError: true,
1391
1636
  };
1392
1637
  }
1393
- throw new McpError(ErrorCode.InvalidParams, `Unsupported operation: ${operation}`);
1394
- }
1395
- catch (error) {
1396
- return {
1397
- content: [{ type: "text", text: `ERROR: unified gateway failed: ${error.message}` }],
1398
- isError: true,
1399
- };
1400
- }
1401
- }
1402
- case "vesper_search": {
1403
- const query = String(request.params.arguments?.query);
1404
- const limit = 5;
1405
- const safeOnly = true; // Enable safe filter by default
1406
- const enableJIT = request.params.arguments?.enable_jit === true;
1407
- if (!query) {
1408
- throw new McpError(ErrorCode.InvalidParams, "Query is required");
1409
1638
  }
1410
- const results = await searchEngine.search(query, { limit, safeOnly, enableJIT });
1411
- const formattedOutput = formatSearchResults(results);
1412
- return {
1413
- content: [
1414
- {
1415
- type: "text",
1416
- text: formattedOutput,
1417
- },
1418
- ],
1419
- };
1420
- }
1421
- case "discover_datasets": {
1422
- hydrateExternalKeys();
1423
- const query = String(request.params.arguments?.query || "").trim();
1424
- const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1425
- const limit = Number(request.params.arguments?.limit || 10);
1426
- if (!query) {
1427
- throw new McpError(ErrorCode.InvalidParams, "query is required");
1428
- }
1429
- try {
1430
- const gatewayResult = await unifiedDatasetGateway.discover({
1431
- query,
1432
- source,
1433
- limit,
1434
- publicOnly: false,
1435
- });
1436
- const results = gatewayResult.results;
1437
- const recipeScript = path.join(dataRoot, "python", "asset_downloader_engine.py");
1438
- for (const ds of results.slice(0, limit)) {
1439
- const info = {
1440
- dataset_id: ds.id,
1441
- id: ds.id,
1442
- source: ds.source,
1443
- repo_id: ds.id,
1444
- total_images: ds.total_examples || 0,
1445
- image_column: undefined,
1446
- recipes_dir: path.join(dataRoot, "recipes"),
1639
+ case "vesper_download_assets": {
1640
+ hydrateExternalKeys();
1641
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1642
+ const source = String(request.params.arguments?.source || "").trim().toLowerCase();
1643
+ // Auto-infer repo_id from dataset_id if not provided (common for HuggingFace)
1644
+ const repoId = request.params.arguments?.repo_id
1645
+ ? String(request.params.arguments.repo_id)
1646
+ : (source === "huggingface" && datasetId.includes("/") ? datasetId : undefined);
1647
+ const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
1648
+ const urls = Array.isArray(request.params.arguments?.urls)
1649
+ ? (request.params.arguments?.urls).map(v => String(v))
1650
+ : undefined;
1651
+ const outputFormat = String(request.params.arguments?.output_format || "webdataset");
1652
+ const requestedOutputDir = request.params.arguments?.target_dir
1653
+ ? String(request.params.arguments.target_dir).trim()
1654
+ : request.params.arguments?.output_dir
1655
+ ? String(request.params.arguments.output_dir).trim()
1656
+ : undefined;
1657
+ const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
1658
+ const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
1659
+ const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
1660
+ if (!datasetId || !source) {
1661
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
1662
+ }
1663
+ if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
1664
+ return {
1665
+ content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
1666
+ isError: true,
1447
1667
  };
1448
- try {
1449
- await runPythonJson(recipeScript, ["build_recipe", JSON.stringify(info)]);
1450
- }
1451
- catch {
1452
- // best-effort recipe generation; ignore discovery-time recipe failures
1453
- }
1454
1668
  }
1455
- const formattedOutput = formatSearchResults(results.slice(0, limit));
1456
- const noteBlock = gatewayResult.notes.length > 0
1457
- ? `\nGateway notes:\n- ${gatewayResult.notes.join("\n- ")}`
1458
- : "";
1459
- return {
1460
- content: [{ type: "text", text: `${formattedOutput}${noteBlock}` }]
1461
- };
1462
- }
1463
- catch (error) {
1464
- return {
1465
- content: [{ type: "text", text: `ERROR: discover failed: ${error.message}` }],
1466
- isError: true,
1467
- };
1468
- }
1469
- }
1470
- case "download_dataset": {
1471
- hydrateExternalKeys();
1472
- const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1473
- const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1474
- const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments.target_dir) : undefined;
1475
- if (!datasetId) {
1476
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1477
- }
1478
- // Pre-install Python datasets library for HuggingFace fallback
1479
- if (source === "huggingface") {
1669
+ const requiredModules = [
1670
+ { module: "aiohttp", packageName: "aiohttp" },
1671
+ ];
1672
+ if (source === "url") {
1673
+ requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
1674
+ }
1675
+ if (source === "huggingface") {
1676
+ requiredModules.push({ module: "datasets", packageName: "datasets" });
1677
+ requiredModules.push({ module: "PIL", packageName: "Pillow" });
1678
+ }
1679
+ if (source === "kaggle") {
1680
+ requiredModules.push({ module: "kaggle", packageName: "kaggle" });
1681
+ }
1480
1682
  try {
1481
- await ensurePythonModules([
1482
- { module: "datasets", packageName: "datasets" },
1483
- ]);
1683
+ await ensurePythonModules(requiredModules);
1484
1684
  }
1485
- catch {
1486
- // Continue - direct download may still work
1685
+ catch (error) {
1686
+ return {
1687
+ content: [{ type: "text", text: `ERROR: Python dependency setup failed: ${error.message}` }],
1688
+ isError: true,
1689
+ };
1487
1690
  }
1488
- }
1489
- try {
1490
- const result = await unifiedDatasetGateway.download({
1491
- datasetId,
1691
+ const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
1692
+ const payload = {
1693
+ dataset_id: datasetId,
1492
1694
  source,
1493
- targetDir,
1494
- });
1695
+ repo_id: repoId,
1696
+ kaggle_ref: kaggleRef,
1697
+ urls,
1698
+ output_format: outputFormat,
1699
+ output_dir: requestedOutputDir,
1700
+ max_items: maxItems,
1701
+ workers,
1702
+ image_column: imageColumn,
1703
+ output_root: path.join(dataRoot, "data", "assets"),
1704
+ recipes_dir: path.join(dataRoot, "recipes"),
1705
+ };
1495
1706
  try {
1496
- upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
1707
+ const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
1708
+ if (!result?.ok) {
1709
+ const errMsg = result?.error || "Unknown error";
1710
+ // Enhance error messages for common failures
1711
+ let hint = "";
1712
+ if (errMsg.includes("No image column")) {
1713
+ hint = "\n\nHint: Specify image_column parameter with the column name containing images/URLs.";
1714
+ }
1715
+ else if (errMsg.includes("Authentication") || errMsg.includes("401") || errMsg.includes("403")) {
1716
+ hint = "\n\nHint: Use configure_keys tool to set HF_TOKEN for gated/private datasets.";
1717
+ }
1718
+ return {
1719
+ content: [{ type: "text", text: `ERROR: asset download failed: ${errMsg}${hint}` }],
1720
+ isError: true,
1721
+ };
1722
+ }
1723
+ return {
1724
+ content: [{ type: "text", text: JSON.stringify(result.result, null, 2) }],
1725
+ };
1497
1726
  }
1498
- catch (e) {
1499
- console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
1727
+ catch (error) {
1728
+ return {
1729
+ content: [{ type: "text", text: `ERROR: asset downloader execution failed: ${error.message}` }],
1730
+ isError: true,
1731
+ };
1500
1732
  }
1501
- const noteBlock = result.notes.length > 0 ? `\nNotes:\n- ${result.notes.join("\n- ")}` : "";
1502
- return {
1503
- content: [{ type: "text", text: `Download complete: ${result.copied_to || result.local_path}${noteBlock}` }]
1504
- };
1505
- }
1506
- catch (error) {
1507
- return {
1508
- content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
1509
- isError: true,
1510
- };
1511
1733
  }
1512
- }
1513
- case "vesper_download_assets": {
1514
- hydrateExternalKeys();
1515
- const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1516
- const source = String(request.params.arguments?.source || "").trim().toLowerCase();
1517
- // Auto-infer repo_id from dataset_id if not provided (common for HuggingFace)
1518
- const repoId = request.params.arguments?.repo_id
1519
- ? String(request.params.arguments.repo_id)
1520
- : (source === "huggingface" && datasetId.includes("/") ? datasetId : undefined);
1521
- const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
1522
- const urls = Array.isArray(request.params.arguments?.urls)
1523
- ? (request.params.arguments?.urls).map(v => String(v))
1524
- : undefined;
1525
- const outputFormat = String(request.params.arguments?.output_format || "webdataset");
1526
- const requestedOutputDir = request.params.arguments?.target_dir
1527
- ? String(request.params.arguments.target_dir).trim()
1528
- : request.params.arguments?.output_dir
1529
- ? String(request.params.arguments.output_dir).trim()
1530
- : undefined;
1531
- const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
1532
- const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
1533
- const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
1534
- if (!datasetId || !source) {
1535
- throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
1536
- }
1537
- if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
1538
- return {
1539
- content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
1540
- isError: true,
1541
- };
1542
- }
1543
- const requiredModules = [
1544
- { module: "aiohttp", packageName: "aiohttp" },
1545
- ];
1546
- if (source === "url") {
1547
- requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
1548
- }
1549
- if (source === "huggingface") {
1550
- requiredModules.push({ module: "datasets", packageName: "datasets" });
1551
- requiredModules.push({ module: "PIL", packageName: "Pillow" });
1552
- }
1553
- if (source === "kaggle") {
1554
- requiredModules.push({ module: "kaggle", packageName: "kaggle" });
1555
- }
1556
- try {
1557
- await ensurePythonModules(requiredModules);
1558
- }
1559
- catch (error) {
1734
+ case "configure_kaggle": {
1735
+ const username = String(request.params.arguments?.username || "").trim();
1736
+ const key = String(request.params.arguments?.key || "").trim();
1737
+ if (!username || !key) {
1738
+ throw new McpError(ErrorCode.InvalidParams, "username and key are required");
1739
+ }
1740
+ const r1 = secureKeys.set("kaggle_username", username);
1741
+ const r2 = secureKeys.set("kaggle_key", key);
1742
+ process.env.KAGGLE_USERNAME = username;
1743
+ process.env.KAGGLE_KEY = key;
1560
1744
  return {
1561
- content: [{ type: "text", text: `ERROR: Python dependency setup failed: ${error.message}` }],
1562
- isError: true,
1745
+ content: [{ type: "text", text: `Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
1563
1746
  };
1564
1747
  }
1565
- const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
1566
- const payload = {
1567
- dataset_id: datasetId,
1568
- source,
1569
- repo_id: repoId,
1570
- kaggle_ref: kaggleRef,
1571
- urls,
1572
- output_format: outputFormat,
1573
- output_dir: requestedOutputDir,
1574
- max_items: maxItems,
1575
- workers,
1576
- image_column: imageColumn,
1577
- output_root: path.join(dataRoot, "data", "assets"),
1578
- recipes_dir: path.join(dataRoot, "recipes"),
1579
- };
1580
- try {
1581
- const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
1582
- if (!result?.ok) {
1583
- const errMsg = result?.error || "Unknown error";
1584
- // Enhance error messages for common failures
1585
- let hint = "";
1586
- if (errMsg.includes("No image column")) {
1587
- hint = "\n\nHint: Specify image_column parameter with the column name containing images/URLs.";
1748
+ case "configure_keys": {
1749
+ const hfToken = String(request.params.arguments?.hf_token || "").trim();
1750
+ const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
1751
+ const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
1752
+ const dataworldToken = String(request.params.arguments?.dataworld_token || "").trim();
1753
+ const saved = [];
1754
+ const methods = [];
1755
+ if (hfToken) {
1756
+ const r = secureKeys.set("hf_token", hfToken);
1757
+ if (r.ok) {
1758
+ process.env.HF_TOKEN = hfToken;
1759
+ saved.push("HF token");
1760
+ if (r.method)
1761
+ methods.push(r.method);
1588
1762
  }
1589
- else if (errMsg.includes("Authentication") || errMsg.includes("401") || errMsg.includes("403")) {
1590
- hint = "\n\nHint: Use configure_keys tool to set HF_TOKEN for gated/private datasets.";
1763
+ }
1764
+ if (kaggleUsername) {
1765
+ const r = secureKeys.set("kaggle_username", kaggleUsername);
1766
+ if (r.ok) {
1767
+ process.env.KAGGLE_USERNAME = kaggleUsername;
1768
+ saved.push("Kaggle username");
1769
+ if (r.method)
1770
+ methods.push(r.method);
1771
+ }
1772
+ }
1773
+ if (kaggleKey) {
1774
+ const r = secureKeys.set("kaggle_key", kaggleKey);
1775
+ if (r.ok) {
1776
+ process.env.KAGGLE_KEY = kaggleKey;
1777
+ saved.push("Kaggle key");
1778
+ if (r.method)
1779
+ methods.push(r.method);
1591
1780
  }
1781
+ }
1782
+ if (dataworldToken) {
1783
+ const r = secureKeys.set("dataworld_token", dataworldToken);
1784
+ if (r.ok) {
1785
+ process.env.DW_AUTH_TOKEN = dataworldToken;
1786
+ saved.push("data.world token");
1787
+ if (r.method)
1788
+ methods.push(r.method);
1789
+ }
1790
+ }
1791
+ if (saved.length === 0) {
1592
1792
  return {
1593
- content: [{ type: "text", text: `ERROR: asset download failed: ${errMsg}${hint}` }],
1594
- isError: true,
1793
+ content: [{ type: "text", text: "No keys provided. Core Vesper tools continue to work without API keys." }]
1595
1794
  };
1596
1795
  }
1597
1796
  return {
1598
- content: [{ type: "text", text: JSON.stringify(result.result, null, 2) }],
1599
- };
1600
- }
1601
- catch (error) {
1602
- return {
1603
- content: [{ type: "text", text: `ERROR: asset downloader execution failed: ${error.message}` }],
1604
- isError: true,
1797
+ content: [{ type: "text", text: `Key saved securely. Updated: ${saved.join(", ")}.` }]
1605
1798
  };
1606
1799
  }
1607
- }
1608
- case "configure_kaggle": {
1609
- const username = String(request.params.arguments?.username || "").trim();
1610
- const key = String(request.params.arguments?.key || "").trim();
1611
- if (!username || !key) {
1612
- throw new McpError(ErrorCode.InvalidParams, "username and key are required");
1613
- }
1614
- const r1 = secureKeys.set("kaggle_username", username);
1615
- const r2 = secureKeys.set("kaggle_key", key);
1616
- process.env.KAGGLE_USERNAME = username;
1617
- process.env.KAGGLE_KEY = key;
1618
- return {
1619
- content: [{ type: "text", text: `Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
1620
- };
1621
- }
1622
- case "configure_keys": {
1623
- const hfToken = String(request.params.arguments?.hf_token || "").trim();
1624
- const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
1625
- const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
1626
- const dataworldToken = String(request.params.arguments?.dataworld_token || "").trim();
1627
- const saved = [];
1628
- const methods = [];
1629
- if (hfToken) {
1630
- const r = secureKeys.set("hf_token", hfToken);
1631
- if (r.ok) {
1632
- process.env.HF_TOKEN = hfToken;
1633
- saved.push("HF token");
1634
- if (r.method)
1635
- methods.push(r.method);
1800
+ case "get_dataset_info": {
1801
+ const datasetId = String(request.params.arguments?.dataset_id);
1802
+ if (!datasetId) {
1803
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1636
1804
  }
1637
- }
1638
- if (kaggleUsername) {
1639
- const r = secureKeys.set("kaggle_username", kaggleUsername);
1640
- if (r.ok) {
1641
- process.env.KAGGLE_USERNAME = kaggleUsername;
1642
- saved.push("Kaggle username");
1643
- if (r.method)
1644
- methods.push(r.method);
1805
+ const dataset = metadataStore.getDataset(datasetId);
1806
+ if (!dataset) {
1807
+ return {
1808
+ content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}` }],
1809
+ isError: true,
1810
+ };
1645
1811
  }
1646
- }
1647
- if (kaggleKey) {
1648
- const r = secureKeys.set("kaggle_key", kaggleKey);
1649
- if (r.ok) {
1650
- process.env.KAGGLE_KEY = kaggleKey;
1651
- saved.push("Kaggle key");
1652
- if (r.method)
1653
- methods.push(r.method);
1812
+ // Enrich: if total_examples is 0-ish, try the HF datasets-server /size API
1813
+ if ((!dataset.total_examples || dataset.total_examples === 0) && dataset.source === "huggingface") {
1814
+ try {
1815
+ const sizeResp = await fetch(`https://datasets-server.huggingface.co/size?dataset=${encodeURIComponent(dataset.id)}`);
1816
+ if (sizeResp.ok) {
1817
+ const sizeData = await sizeResp.json();
1818
+ const numRows = sizeData?.size?.dataset?.num_rows;
1819
+ if (numRows && numRows > 0) {
1820
+ dataset.total_examples = numRows;
1821
+ // Also backfill splits
1822
+ if (sizeData?.size?.splits && Array.isArray(sizeData.size.splits)) {
1823
+ dataset.splits = sizeData.size.splits.map((s) => ({
1824
+ name: s.split,
1825
+ num_examples: s.num_rows || 0,
1826
+ size_bytes: s.num_bytes_parquet_files || 0,
1827
+ }));
1828
+ dataset.has_train_split = dataset.splits.some((s) => s.name === "train");
1829
+ dataset.has_test_split = dataset.splits.some((s) => s.name === "test");
1830
+ dataset.has_validation_split = dataset.splits.some((s) => s.name === "validation" || s.name === "val");
1831
+ }
1832
+ // Persist enriched metadata
1833
+ metadataStore.saveDataset(dataset);
1834
+ }
1835
+ }
1836
+ }
1837
+ catch {
1838
+ // Enrichment is best-effort; continue with whatever we have
1839
+ }
1654
1840
  }
1655
- }
1656
- if (dataworldToken) {
1657
- const r = secureKeys.set("dataworld_token", dataworldToken);
1658
- if (r.ok) {
1659
- process.env.DW_AUTH_TOKEN = dataworldToken;
1660
- saved.push("data.world token");
1661
- if (r.method)
1662
- methods.push(r.method);
1841
+ const formattedOutput = formatDatasetInfo(dataset);
1842
+ return { content: [{ type: "text", text: formattedOutput }] };
1843
+ }
1844
+ case "analyze_quality": {
1845
+ const datasetId = String(request.params.arguments?.dataset_id);
1846
+ const safeId = toSafeDatasetPathFragment(datasetId);
1847
+ const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
1848
+ const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
1849
+ let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
1850
+ // Demo Fallback for easy testing
1851
+ if (datasetId === "demo" || !fs.existsSync(filePath)) {
1852
+ const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
1853
+ const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
1854
+ if (fs.existsSync(demoParquetPath)) {
1855
+ filePath = demoParquetPath;
1856
+ }
1857
+ else if (fs.existsSync(demoCsvPath)) {
1858
+ filePath = demoCsvPath;
1859
+ }
1860
+ else if (datasetId !== "demo") {
1861
+ return {
1862
+ content: [{ type: "text", text: `ERROR: Local data file for ${datasetId} not found. Try running prepare_dataset first, or use 'demo' as the dataset_id.` }],
1863
+ isError: true
1864
+ };
1865
+ }
1663
1866
  }
1664
- }
1665
- if (saved.length === 0) {
1867
+ const report = await qualityAnalyzer.analyze(filePath);
1666
1868
  return {
1667
- content: [{ type: "text", text: "No keys provided. Core Vesper tools continue to work without API keys." }]
1869
+ content: [{ type: "text", text: JSON.stringify(report, null, 2) }]
1668
1870
  };
1669
1871
  }
1670
- return {
1671
- content: [{ type: "text", text: `Key saved securely. Updated: ${saved.join(", ")}.` }]
1672
- };
1673
- }
1674
- case "get_dataset_info": {
1675
- const datasetId = String(request.params.arguments?.dataset_id);
1676
- if (!datasetId) {
1677
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1678
- }
1679
- const dataset = metadataStore.getDataset(datasetId);
1680
- if (!dataset) {
1872
+ case "preview_cleaning": {
1873
+ const datasetId = String(request.params.arguments?.dataset_id);
1874
+ const safeId = toSafeDatasetPathFragment(datasetId);
1875
+ const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
1876
+ const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
1877
+ let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
1878
+ if (datasetId === "demo" || !fs.existsSync(filePath)) {
1879
+ const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
1880
+ const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
1881
+ if (fs.existsSync(demoParquetPath)) {
1882
+ filePath = demoParquetPath;
1883
+ }
1884
+ else if (fs.existsSync(demoCsvPath)) {
1885
+ filePath = demoCsvPath;
1886
+ }
1887
+ else {
1888
+ throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}. Please run prepare_dataset first.`);
1889
+ }
1890
+ }
1891
+ const report = await qualityAnalyzer.analyze(filePath);
1892
+ // Phase 1: Target Detection
1893
+ // We use the same TargetDetector instance inside CleaningPlanner now?
1894
+ // Actually, we instantiated it inside CleaningPlanner, so we just need to pass the file path
1895
+ // OR let the planner handle it if we update its signature to accept filePath.
1896
+ // Let's check `CleaningPlanner.generatePlan` signature again.
1897
+ // We updated it to accept `targetInfo`.
1898
+ // So we need to run detection HERE and pass it.
1899
+ // But `TargetDetector` is not exposed in `index.ts` scope yet.
1900
+ // Let's create a global instance or use the one inside planner if exposed (it's private).
1901
+ // Better approach: Instantiate TargetDetector here in index.ts for the tool content.
1902
+ // Quick fix: Instantiate local detector or make global.
1903
+ // I'll make a global `targetDetector` constant in index.ts
1904
+ // But wait, I updated `CleaningPlanner` to instantiate its own detector.
1905
+ // Does `CleaningPlanner` use it? No, I commented out the logic because it needed `filePath`.
1906
+ // RETRY STRATEGY:
1907
+ // 1. Instantiate `targetDetector` in `index.ts`.
1908
+ // 2. Run `detectTarget(filePath)`.
1909
+ // 3. Pass result to `cleaningPlanner.generatePlan(..., targetInfo)`.
1910
+ // I need to add `const targetDetector = new TargetDetector(__dirname);` to imports/init section first.
1911
+ // But since I'm in this tool, I can't look back.
1912
+ // I will assume I can add it, or just do it inside the case for now.
1913
+ // To do it properly, I should have added `targetDetector` to the global scope in previous step.
1914
+ // Let's do that in a separate step if needed.
1915
+ // For now, I'll instantiate it here.
1916
+ const { TargetDetector } = await import("./preparation/target-detector.js");
1917
+ const detector = new TargetDetector(__dirname);
1918
+ const targetResult = await detector.detectTarget(filePath);
1919
+ const targetInfo = targetResult.target_column ? {
1920
+ target: targetResult.target_column,
1921
+ confidence: targetResult.confidence
1922
+ } : undefined;
1923
+ const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
1924
+ let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
1925
+ if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
1926
+ explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
1927
+ explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
1928
+ }
1929
+ explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
1930
+ if (plan.operations.length === 0) {
1931
+ explanation += "No cleaning operations required.";
1932
+ }
1933
+ else {
1934
+ plan.operations.forEach((op, i) => {
1935
+ explanation += `${i + 1}. **${op.type}**: ${op.reason}\n`;
1936
+ });
1937
+ }
1681
1938
  return {
1682
- content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}` }],
1683
- isError: true,
1939
+ content: [{ type: "text", text: explanation }]
1684
1940
  };
1685
1941
  }
1686
- const formattedOutput = formatDatasetInfo(dataset);
1687
- return { content: [{ type: "text", text: formattedOutput }] };
1688
- }
1689
- case "analyze_quality": {
1690
- const datasetId = String(request.params.arguments?.dataset_id);
1691
- const safeId = toSafeDatasetPathFragment(datasetId);
1692
- const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
1693
- const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
1694
- let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
1695
- // Demo Fallback for easy testing
1696
- if (datasetId === "demo" || !fs.existsSync(filePath)) {
1697
- const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
1698
- const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
1699
- if (fs.existsSync(demoParquetPath)) {
1700
- filePath = demoParquetPath;
1701
- }
1702
- else if (fs.existsSync(demoCsvPath)) {
1703
- filePath = demoCsvPath;
1704
- }
1705
- else if (datasetId !== "demo") {
1942
+ case "custom_clean": {
1943
+ const datasetId = String(request.params.arguments?.dataset_id);
1944
+ const ops = request.params.arguments?.operations;
1945
+ if (!datasetId || datasetId === "undefined") {
1946
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1947
+ }
1948
+ if (!ops || !Array.isArray(ops) || ops.length === 0) {
1949
+ throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
1950
+ }
1951
+ // Pre-check: verify dataset file exists before starting the job
1952
+ const cleanRegEntry = getRegistryEntry(datasetId);
1953
+ const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
1954
+ const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
1955
+ const cleanSafeId = toSafeDatasetPathFragment(datasetId);
1956
+ const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
1957
+ (cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
1958
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
1959
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
1960
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
1961
+ fs.existsSync(datasetId);
1962
+ if (!cleanDataExists) {
1706
1963
  return {
1707
- content: [{ type: "text", text: `ERROR: Local data file for ${datasetId} not found. Try running prepare_dataset first, or use 'demo' as the dataset_id.` }],
1708
- isError: true
1964
+ content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
1965
+ isError: true,
1709
1966
  };
1710
1967
  }
1968
+ const job = jobManager.createJob("clean", 0, { datasetId, ops });
1969
+ return {
1970
+ content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
1971
+ };
1711
1972
  }
1712
- const report = await qualityAnalyzer.analyze(filePath);
1713
- return {
1714
- content: [{ type: "text", text: JSON.stringify(report, null, 2) }]
1715
- };
1716
- }
1717
- case "preview_cleaning": {
1718
- const datasetId = String(request.params.arguments?.dataset_id);
1719
- const safeId = toSafeDatasetPathFragment(datasetId);
1720
- const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
1721
- const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
1722
- let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
1723
- if (datasetId === "demo" || !fs.existsSync(filePath)) {
1724
- const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
1725
- const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
1726
- if (fs.existsSync(demoParquetPath)) {
1727
- filePath = demoParquetPath;
1728
- }
1729
- else if (fs.existsSync(demoCsvPath)) {
1730
- filePath = demoCsvPath;
1731
- }
1732
- else {
1733
- throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}. Please run prepare_dataset first.`);
1973
+ case "prepare_dataset": {
1974
+ hydrateExternalKeys();
1975
+ const query = String(request.params.arguments?.query);
1976
+ const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
1977
+ const downloadImages = request.params.arguments?.download_images === true;
1978
+ if (!query || query === "undefined") {
1979
+ throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
1734
1980
  }
1735
- }
1736
- const report = await qualityAnalyzer.analyze(filePath);
1737
- // Phase 1: Target Detection
1738
- // We use the same TargetDetector instance inside CleaningPlanner now?
1739
- // Actually, we instantiated it inside CleaningPlanner, so we just need to pass the file path
1740
- // OR let the planner handle it if we update its signature to accept filePath.
1741
- // Let's check `CleaningPlanner.generatePlan` signature again.
1742
- // We updated it to accept `targetInfo`.
1743
- // So we need to run detection HERE and pass it.
1744
- // But `TargetDetector` is not exposed in `index.ts` scope yet.
1745
- // Let's create a global instance or use the one inside planner if exposed (it's private).
1746
- // Better approach: Instantiate TargetDetector here in index.ts for the tool content.
1747
- // Quick fix: Instantiate local detector or make global.
1748
- // I'll make a global `targetDetector` constant in index.ts
1749
- // But wait, I updated `CleaningPlanner` to instantiate its own detector.
1750
- // Does `CleaningPlanner` use it? No, I commented out the logic because it needed `filePath`.
1751
- // RETRY STRATEGY:
1752
- // 1. Instantiate `targetDetector` in `index.ts`.
1753
- // 2. Run `detectTarget(filePath)`.
1754
- // 3. Pass result to `cleaningPlanner.generatePlan(..., targetInfo)`.
1755
- // I need to add `const targetDetector = new TargetDetector(__dirname);` to imports/init section first.
1756
- // But since I'm in this tool, I can't look back.
1757
- // I will assume I can add it, or just do it inside the case for now.
1758
- // To do it properly, I should have added `targetDetector` to the global scope in previous step.
1759
- // Let's do that in a separate step if needed.
1760
- // For now, I'll instantiate it here.
1761
- const { TargetDetector } = await import("./preparation/target-detector.js");
1762
- const detector = new TargetDetector(__dirname);
1763
- const targetResult = await detector.detectTarget(filePath);
1764
- const targetInfo = targetResult.target_column ? {
1765
- target: targetResult.target_column,
1766
- confidence: targetResult.confidence
1767
- } : undefined;
1768
- const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
1769
- let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
1770
- if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
1771
- explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
1772
- explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
1773
- }
1774
- explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
1775
- if (plan.operations.length === 0) {
1776
- explanation += "No cleaning operations required.";
1777
- }
1778
- else {
1779
- plan.operations.forEach((op, i) => {
1780
- explanation += `${i + 1}. **${op.type}**: ${op.reason}\n`;
1781
- });
1782
- }
1783
- return {
1784
- content: [{ type: "text", text: explanation }]
1785
- };
1786
- }
1787
- case "custom_clean": {
1788
- const datasetId = String(request.params.arguments?.dataset_id);
1789
- const ops = request.params.arguments?.operations;
1790
- if (!datasetId || datasetId === "undefined") {
1791
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1792
- }
1793
- if (!ops || !Array.isArray(ops) || ops.length === 0) {
1794
- throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
1795
- }
1796
- // Pre-check: verify dataset file exists before starting the job
1797
- const cleanRegEntry = getRegistryEntry(datasetId);
1798
- const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
1799
- const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
1800
- const cleanSafeId = toSafeDatasetPathFragment(datasetId);
1801
- const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
1802
- (cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
1803
- fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
1804
- fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
1805
- fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
1806
- fs.existsSync(datasetId);
1807
- if (!cleanDataExists) {
1981
+ const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
1808
1982
  return {
1809
- content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
1810
- isError: true,
1983
+ content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
1811
1984
  };
1812
1985
  }
1813
- const job = jobManager.createJob("clean", 0, { datasetId, ops });
1814
- return {
1815
- content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
1816
- };
1817
- }
1818
- case "prepare_dataset": {
1819
- hydrateExternalKeys();
1820
- const query = String(request.params.arguments?.query);
1821
- const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
1822
- const downloadImages = request.params.arguments?.download_images === true;
1823
- if (!query || query === "undefined") {
1824
- throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
1825
- }
1826
- const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
1827
- return {
1828
- content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
1829
- };
1830
- }
1831
- case "compare_datasets": {
1832
- const datasetIds = request.params.arguments?.dataset_ids;
1833
- const datasets = datasetIds.map(id => metadataStore.getDataset(id)).filter(d => !!d);
1834
- let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
1835
- comparison += "| :--- | " + datasets.map(() => " :---: ").join("|") + " |\n";
1836
- comparison += "| Quality Score | " + datasets.map(d => `${d.quality_score}/100`).join(" | ") + " |\n";
1837
- comparison += "| License | " + datasets.map(d => d.license.category).join(" | ") + " |\n";
1838
- comparison += "| Downloads | " + datasets.map(d => d.downloads).join(" | ") + " |\n";
1839
- comparison += "| Domain | " + datasets.map(d => d.domain).join(" | ") + " |\n";
1840
- return {
1841
- content: [{ type: "text", text: comparison }]
1842
- };
1843
- }
1844
- case "check_job_status": {
1845
- const jobId = String(request.params.arguments?.job_id);
1846
- const job = metadataStore.getJob(jobId);
1847
- if (!job) {
1848
- throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
1986
+ case "compare_datasets": {
1987
+ const datasetIds = request.params.arguments?.dataset_ids;
1988
+ const datasets = datasetIds.map(id => metadataStore.getDataset(id)).filter(d => !!d);
1989
+ let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
1990
+ comparison += "| :--- | " + datasets.map(() => " :---: ").join("|") + " |\n";
1991
+ comparison += "| Quality Score | " + datasets.map(d => `${d.quality_score}/100`).join(" | ") + " |\n";
1992
+ comparison += "| License | " + datasets.map(d => d.license.category).join(" | ") + " |\n";
1993
+ comparison += "| Downloads | " + datasets.map(d => d.downloads).join(" | ") + " |\n";
1994
+ comparison += "| Domain | " + datasets.map(d => d.domain).join(" | ") + " |\n";
1995
+ return {
1996
+ content: [{ type: "text", text: comparison }]
1997
+ };
1849
1998
  }
1850
- const activeStatuses = new Set(["pending", "queued", "running", "retrying"]);
1851
- const now = Date.now();
1852
- const last = jobStatusLastPoll[jobId] || 0;
1853
- const minPollMs = 3000;
1854
- if (activeStatuses.has(job.status) && (now - last) < minPollMs) {
1855
- const waitMs = minPollMs - (now - last);
1999
+ case "check_job_status": {
2000
+ const jobId = String(request.params.arguments?.job_id);
2001
+ const job = metadataStore.getJob(jobId);
2002
+ if (!job) {
2003
+ throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
2004
+ }
2005
+ const activeStatuses = new Set(["pending", "queued", "running", "retrying"]);
2006
+ const now = Date.now();
2007
+ const last = jobStatusLastPoll[jobId] || 0;
2008
+ const minPollMs = 3000;
2009
+ if (activeStatuses.has(job.status) && (now - last) < minPollMs) {
2010
+ const waitMs = minPollMs - (now - last);
2011
+ return {
2012
+ content: [{ type: "text", text: `No significant status change yet. Please poll again in ~${Math.ceil(waitMs / 1000)}s.` }]
2013
+ };
2014
+ }
2015
+ jobStatusLastPoll[jobId] = now;
1856
2016
  return {
1857
- content: [{ type: "text", text: `No significant status change yet. Please poll again in ~${Math.ceil(waitMs / 1000)}s.` }]
2017
+ content: [{ type: "text", text: formatJobStatus(job) }]
1858
2018
  };
1859
2019
  }
1860
- jobStatusLastPoll[jobId] = now;
1861
- return {
1862
- content: [{ type: "text", text: formatJobStatus(job) }]
1863
- };
1864
- }
1865
- case "export_dataset": {
1866
- const datasetId = String(request.params.arguments?.dataset_id);
1867
- const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments?.target_dir) : undefined;
1868
- const requestedFormat = String(request.params.arguments?.format || "feather");
1869
- const fastMode = request.params.arguments?.fast === true;
1870
- const preview = request.params.arguments?.preview === true;
1871
- const sampleRows = request.params.arguments?.sample_rows ? Number(request.params.arguments.sample_rows) : undefined;
1872
- const columns = request.params.arguments?.columns;
1873
- const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
1874
- // Use Metadata or Registry to find the actual local file
1875
- let sourcePath = resolveDatasetLocalPath(datasetId);
1876
- if (!sourcePath) {
1877
- console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
1878
- // Start a prepare job for this dataset id (acts like calling prepare_dataset)
2020
+ case "export_dataset": {
2021
+ const datasetId = String(request.params.arguments?.dataset_id);
2022
+ const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments?.target_dir) : undefined;
2023
+ const requestedFormat = String(request.params.arguments?.format || "feather");
2024
+ const fastMode = request.params.arguments?.fast === true;
2025
+ const preview = request.params.arguments?.preview === true;
2026
+ const sampleRows = request.params.arguments?.sample_rows ? Number(request.params.arguments.sample_rows) : undefined;
2027
+ const columns = request.params.arguments?.columns;
2028
+ const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
2029
+ // Use Metadata or Registry to find the actual local file
2030
+ let sourcePath = resolveDatasetLocalPath(datasetId);
2031
+ if (!sourcePath) {
2032
+ console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
2033
+ // Start a prepare job for this dataset id (acts like calling prepare_dataset)
2034
+ try {
2035
+ jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false });
2036
+ }
2037
+ catch (e) {
2038
+ console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
2039
+ }
2040
+ // Poll for download status or registry entry until local_path appears or timeout
2041
+ const wait = (ms) => new Promise(res => setTimeout(res, ms));
2042
+ const maxWait = 120_000; // 120s
2043
+ const interval = 2000;
2044
+ let waited = 0;
2045
+ while (waited < maxWait) {
2046
+ const resolved = resolveDatasetLocalPath(datasetId);
2047
+ if (resolved) {
2048
+ sourcePath = resolved;
2049
+ console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
2050
+ break;
2051
+ }
2052
+ await wait(interval);
2053
+ waited += interval;
2054
+ }
2055
+ // If still no sourcePath, return helpful error listing prepared datasets
2056
+ if (!sourcePath) {
2057
+ const entries = readRegistry();
2058
+ const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id || e.id || "unknown"}: ${e.local_path || e.path || "unknown"}`).slice(0, 10).join("\n") + (entries.length > 10 ? "\n...and " + (entries.length - 10) + " more" : "");
2059
+ return {
2060
+ content: [{ type: "text", text: `ERROR: No local data found for ${datasetId} after attempting prepare. Check credentials and try running prepare_dataset manually. Prepared datasets:\n${listText}` }],
2061
+ isError: true
2062
+ };
2063
+ }
2064
+ }
2065
+ sourcePath = ensureExportableLocalPath(sourcePath);
1879
2066
  try {
1880
- jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false });
2067
+ upsertRegistry(datasetId, sourcePath, "completed");
1881
2068
  }
1882
2069
  catch (e) {
1883
- console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
1884
- }
1885
- // Poll for download status or registry entry until local_path appears or timeout
1886
- const wait = (ms) => new Promise(res => setTimeout(res, ms));
1887
- const maxWait = 120_000; // 120s
1888
- const interval = 2000;
1889
- let waited = 0;
1890
- while (waited < maxWait) {
1891
- const resolved = resolveDatasetLocalPath(datasetId);
1892
- if (resolved) {
1893
- sourcePath = resolved;
1894
- console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
1895
- break;
2070
+ console.error(`[Registry] Failed to normalize registry path for ${datasetId}: ${e?.message || e}`);
2071
+ }
2072
+ // If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
2073
+ if (!fastMode) {
2074
+ const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
2075
+ const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
2076
+ const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
2077
+ if (!pipelineCompatibleInput) {
2078
+ console.error(`[Export] Skipping pipeline for ${currentExt} input; using direct exporter conversion.`);
2079
+ }
2080
+ else if (currentExt !== pipelineFmt) {
2081
+ console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
2082
+ try {
2083
+ const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
2084
+ if (pipelineResult.final_output_path) {
2085
+ sourcePath = pipelineResult.final_output_path;
2086
+ try {
2087
+ // Update registry to point to pipeline's final output
2088
+ upsertRegistry(datasetId, sourcePath, "completed");
2089
+ }
2090
+ catch (e) {
2091
+ console.error(`[Registry] Failed to update registry for ${datasetId}: ${e?.message || e}`);
2092
+ }
2093
+ }
2094
+ }
2095
+ catch (err) {
2096
+ console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
2097
+ }
1896
2098
  }
1897
- await wait(interval);
1898
- waited += interval;
1899
2099
  }
1900
- // If still no sourcePath, return helpful error listing prepared datasets
1901
- if (!sourcePath) {
1902
- const entries = readRegistry();
1903
- const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id || e.id || "unknown"}: ${e.local_path || e.path || "unknown"}`).slice(0, 10).join("\n") + (entries.length > 10 ? "\n...and " + (entries.length - 10) + " more" : "");
2100
+ else {
2101
+ console.error(`[Export] Fast mode - skipping quality analysis and cleaning`);
2102
+ }
2103
+ // Build export options
2104
+ const exportOpts = {};
2105
+ if (compression)
2106
+ exportOpts.compression = compression;
2107
+ if (preview)
2108
+ exportOpts.preview = true;
2109
+ if (sampleRows)
2110
+ exportOpts.sample_rows = sampleRows;
2111
+ if (columns)
2112
+ exportOpts.columns = columns;
2113
+ try {
2114
+ // Determine output file name
2115
+ const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
2116
+ const ext = extMap[requestedFormat] || ".feather";
2117
+ const safeName = toSafeDatasetPathFragment(datasetId);
2118
+ const outDir = targetDir || path.join(dataRoot, "exports");
2119
+ if (!fs.existsSync(outDir))
2120
+ fs.mkdirSync(outDir, { recursive: true });
2121
+ const outputFile = path.join(outDir, `${safeName}${ext}`);
2122
+ const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
2123
+ // Build rich response
2124
+ let msg = `**Export complete**\n`;
2125
+ msg += `- **File**: ${result.output_path}\n`;
2126
+ msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
2127
+ msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
2128
+ if (result.file_size_mb !== undefined)
2129
+ msg += `- **Size**: ${result.file_size_mb} MB\n`;
2130
+ if (result.elapsed_seconds !== undefined)
2131
+ msg += `- **Time**: ${result.elapsed_seconds}s\n`;
2132
+ if (result.preview_path)
2133
+ msg += `- **Preview**: ${result.preview_path}\n`;
2134
+ msg += `\n`;
2135
+ if (requestedFormat === "feather") {
2136
+ msg += `**Inspect with:**\n`;
2137
+ msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
2138
+ msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
2139
+ }
2140
+ else if (requestedFormat === "parquet") {
2141
+ msg += `**Inspect with:**\n`;
2142
+ msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
2143
+ msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
2144
+ }
2145
+ return { content: [{ type: "text", text: msg }] };
2146
+ }
2147
+ catch (error) {
1904
2148
  return {
1905
- content: [{ type: "text", text: `ERROR: No local data found for ${datasetId} after attempting prepare. Check credentials and try running prepare_dataset manually. Prepared datasets:\n${listText}` }],
2149
+ content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
1906
2150
  isError: true
1907
2151
  };
1908
2152
  }
1909
2153
  }
1910
- sourcePath = ensureExportableLocalPath(sourcePath);
1911
- try {
1912
- upsertRegistry(datasetId, sourcePath, "completed");
1913
- }
1914
- catch (e) {
1915
- console.error(`[Registry] Failed to normalize registry path for ${datasetId}: ${e?.message || e}`);
1916
- }
1917
- // If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
1918
- if (!fastMode) {
1919
- const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
1920
- const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
1921
- const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
1922
- if (!pipelineCompatibleInput) {
1923
- console.error(`[Export] Skipping pipeline for ${currentExt} input; using direct exporter conversion.`);
1924
- }
1925
- else if (currentExt !== pipelineFmt) {
1926
- console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
1927
- try {
1928
- const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
1929
- if (pipelineResult.final_output_path) {
1930
- sourcePath = pipelineResult.final_output_path;
1931
- try {
1932
- // Update registry to point to pipeline's final output
1933
- upsertRegistry(datasetId, sourcePath, "completed");
1934
- }
1935
- catch (e) {
1936
- console.error(`[Registry] Failed to update registry for ${datasetId}: ${e?.message || e}`);
1937
- }
1938
- }
2154
+ case "fuse_datasets": {
2155
+ const rawSources = request.params.arguments?.sources;
2156
+ if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
2157
+ throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
2158
+ }
2159
+ const strategy = request.params.arguments?.strategy || "concat";
2160
+ const joinOn = request.params.arguments?.join_on;
2161
+ const how = request.params.arguments?.how || "inner";
2162
+ const dedup = request.params.arguments?.dedup !== false;
2163
+ const runQualityAfter = request.params.arguments?.run_quality_after !== false;
2164
+ const leakageCheck = request.params.arguments?.leakage_check !== false;
2165
+ const outputFormat = request.params.arguments?.output_format || "feather";
2166
+ const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
2167
+ const preview = request.params.arguments?.preview !== false;
2168
+ const resolvedPaths = [];
2169
+ const unresolved = [];
2170
+ for (const src of rawSources) {
2171
+ if (fs.existsSync(src)) {
2172
+ resolvedPaths.push(src);
2173
+ continue;
1939
2174
  }
1940
- catch (err) {
1941
- console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
2175
+ const status = metadataStore.getDownloadStatus(src);
2176
+ if (status?.local_path && fs.existsSync(status.local_path)) {
2177
+ resolvedPaths.push(status.local_path);
2178
+ continue;
1942
2179
  }
2180
+ unresolved.push(src);
1943
2181
  }
1944
- }
1945
- else {
1946
- console.error(`[Export] Fast mode - skipping quality analysis and cleaning`);
1947
- }
1948
- // Build export options
1949
- const exportOpts = {};
1950
- if (compression)
1951
- exportOpts.compression = compression;
1952
- if (preview)
1953
- exportOpts.preview = true;
1954
- if (sampleRows)
1955
- exportOpts.sample_rows = sampleRows;
1956
- if (columns)
1957
- exportOpts.columns = columns;
1958
- try {
1959
- // Determine output file name
1960
- const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
1961
- const ext = extMap[requestedFormat] || ".feather";
1962
- const safeName = toSafeDatasetPathFragment(datasetId);
1963
- const outDir = targetDir || path.join(dataRoot, "exports");
1964
- if (!fs.existsSync(outDir))
1965
- fs.mkdirSync(outDir, { recursive: true });
1966
- const outputFile = path.join(outDir, `${safeName}${ext}`);
1967
- const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
1968
- // Build rich response
1969
- let msg = `**Export complete**\n`;
1970
- msg += `- **File**: ${result.output_path}\n`;
1971
- msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
1972
- msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
1973
- if (result.file_size_mb !== undefined)
1974
- msg += `- **Size**: ${result.file_size_mb} MB\n`;
1975
- if (result.elapsed_seconds !== undefined)
1976
- msg += `- **Time**: ${result.elapsed_seconds}s\n`;
1977
- if (result.preview_path)
1978
- msg += `- **Preview**: ${result.preview_path}\n`;
1979
- msg += `\n`;
1980
- if (requestedFormat === "feather") {
1981
- msg += `**Inspect with:**\n`;
1982
- msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
1983
- msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
1984
- }
1985
- else if (requestedFormat === "parquet") {
1986
- msg += `**Inspect with:**\n`;
1987
- msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
1988
- msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
1989
- }
1990
- return { content: [{ type: "text", text: msg }] };
1991
- }
1992
- catch (error) {
1993
- return {
1994
- content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
1995
- isError: true
1996
- };
1997
- }
1998
- }
1999
- case "fuse_datasets": {
2000
- const rawSources = request.params.arguments?.sources;
2001
- if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
2002
- throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
2003
- }
2004
- const strategy = request.params.arguments?.strategy || "concat";
2005
- const joinOn = request.params.arguments?.join_on;
2006
- const how = request.params.arguments?.how || "inner";
2007
- const dedup = request.params.arguments?.dedup !== false;
2008
- const runQualityAfter = request.params.arguments?.run_quality_after !== false;
2009
- const leakageCheck = request.params.arguments?.leakage_check !== false;
2010
- const outputFormat = request.params.arguments?.output_format || "feather";
2011
- const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
2012
- const preview = request.params.arguments?.preview !== false;
2013
- const resolvedPaths = [];
2014
- const unresolved = [];
2015
- for (const src of rawSources) {
2016
- if (fs.existsSync(src)) {
2017
- resolvedPaths.push(src);
2018
- continue;
2182
+ if (unresolved.length > 0) {
2183
+ return {
2184
+ content: [{
2185
+ type: "text",
2186
+ text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
2187
+ }],
2188
+ isError: true
2189
+ };
2019
2190
  }
2020
- const status = metadataStore.getDownloadStatus(src);
2021
- if (status?.local_path && fs.existsSync(status.local_path)) {
2022
- resolvedPaths.push(status.local_path);
2023
- continue;
2191
+ try {
2192
+ const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
2193
+ const ext = extMap[outputFormat] || ".feather";
2194
+ const outDir = path.join(dataRoot, "fusion");
2195
+ if (!fs.existsSync(outDir))
2196
+ fs.mkdirSync(outDir, { recursive: true });
2197
+ const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
2198
+ const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
2199
+ strategy,
2200
+ join_on: joinOn,
2201
+ how,
2202
+ dedup,
2203
+ run_quality_after: runQualityAfter,
2204
+ leakage_check: leakageCheck,
2205
+ output_format: outputFormat,
2206
+ compression: compression,
2207
+ preview,
2208
+ });
2209
+ const nullDelta = result.stats.null_delta;
2210
+ const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
2211
+ // Register fused dataset under a generated id so users can export it easily
2212
+ const fusedId = `fused_${Date.now()}`;
2213
+ try {
2214
+ upsertRegistry(fusedId, result.output_path, "completed");
2215
+ }
2216
+ catch (e) {
2217
+ console.error(`[Registry] Failed to register fused dataset ${fusedId}: ${e?.message || e}`);
2218
+ }
2219
+ let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
2220
+ msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
2221
+ msg += `- Null change: ${nullText}\n`;
2222
+ msg += `- Output: ${result.output_path}\n`;
2223
+ if (result.preview_path)
2224
+ msg += `- Preview: ${result.preview_path}\n`;
2225
+ if (result.leakage_report) {
2226
+ msg += `- Leakage: ${result.leakage_report.leakage_detected ? "detected" : "none"}`;
2227
+ if (result.leakage_report.leakage_count) {
2228
+ msg += ` (${result.leakage_report.leakage_count})`;
2229
+ }
2230
+ msg += "\n";
2231
+ }
2232
+ msg += `\nNext: run split_dataset/export_dataset on fused output. Registered fused dataset id: ${fusedId}`;
2233
+ return { content: [{ type: "text", text: msg }] };
2234
+ }
2235
+ catch (error) {
2236
+ return {
2237
+ content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
2238
+ isError: true
2239
+ };
2024
2240
  }
2025
- unresolved.push(src);
2026
- }
2027
- if (unresolved.length > 0) {
2028
- return {
2029
- content: [{
2030
- type: "text",
2031
- text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
2032
- }],
2033
- isError: true
2034
- };
2035
2241
  }
2036
- try {
2037
- const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
2038
- const ext = extMap[outputFormat] || ".feather";
2039
- const outDir = path.join(dataRoot, "fusion");
2040
- if (!fs.existsSync(outDir))
2041
- fs.mkdirSync(outDir, { recursive: true });
2042
- const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
2043
- const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
2044
- strategy,
2045
- join_on: joinOn,
2046
- how,
2047
- dedup,
2048
- run_quality_after: runQualityAfter,
2049
- leakage_check: leakageCheck,
2050
- output_format: outputFormat,
2051
- compression: compression,
2052
- preview,
2053
- });
2054
- const nullDelta = result.stats.null_delta;
2055
- const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
2056
- // Register fused dataset under a generated id so users can export it easily
2057
- const fusedId = `fused_${Date.now()}`;
2058
- try {
2059
- upsertRegistry(fusedId, result.output_path, "completed");
2242
+ case "analyze_image_quality": {
2243
+ const inputPath = String(request.params.arguments?.path);
2244
+ if (!fs.existsSync(inputPath)) {
2245
+ throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
2060
2246
  }
2061
- catch (e) {
2062
- console.error(`[Registry] Failed to register fused dataset ${fusedId}: ${e?.message || e}`);
2063
- }
2064
- let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
2065
- msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
2066
- msg += `- Null change: ${nullText}\n`;
2067
- msg += `- Output: ${result.output_path}\n`;
2068
- if (result.preview_path)
2069
- msg += `- Preview: ${result.preview_path}\n`;
2070
- if (result.leakage_report) {
2071
- msg += `- Leakage: ${result.leakage_report.leakage_detected ? "detected" : "none"}`;
2072
- if (result.leakage_report.leakage_count) {
2073
- msg += ` (${result.leakage_report.leakage_count})`;
2247
+ try {
2248
+ const report = await imageAnalyzer.analyze(inputPath);
2249
+ let output = `## Image Quality Report\n\n`;
2250
+ output += `- **Total Images**: ${report.total_images}\n`;
2251
+ output += `- **Corrupted**: ${report.corrupted_count}\n`;
2252
+ output += `- **Average Resolution**: ${Math.round(report.average_width)}x${Math.round(report.average_height)}\n`;
2253
+ output += `- **Blurry Images**: ${report.blurry_count}\n\n`;
2254
+ if (report.individual_results.length > 0) {
2255
+ output += `### Sample Detail (Top 5)\n`;
2256
+ report.individual_results.slice(0, 5).forEach(img => {
2257
+ const statusLabel = img.status === "ok" ? (img.is_blurry ? "[warn]" : "[ok]") : "[error]";
2258
+ output += `${statusLabel} **${img.filename}**: ${img.width}x${img.height} | Mode: ${img.mode} | Blur: ${img.blur_score?.toFixed(1)}\n`;
2259
+ });
2074
2260
  }
2075
- msg += "\n";
2261
+ return {
2262
+ content: [{ type: "text", text: output }]
2263
+ };
2264
+ }
2265
+ catch (error) {
2266
+ return {
2267
+ content: [{ type: "text", text: `ERROR: Image analysis failed: ${error.message}` }],
2268
+ isError: true
2269
+ };
2076
2270
  }
2077
- msg += `\nNext: run split_dataset/export_dataset on fused output. Registered fused dataset id: ${fusedId}`;
2078
- return { content: [{ type: "text", text: msg }] };
2079
- }
2080
- catch (error) {
2081
- return {
2082
- content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
2083
- isError: true
2084
- };
2085
- }
2086
- }
2087
- case "analyze_image_quality": {
2088
- const inputPath = String(request.params.arguments?.path);
2089
- if (!fs.existsSync(inputPath)) {
2090
- throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
2091
2271
  }
2092
- try {
2093
- const report = await imageAnalyzer.analyze(inputPath);
2094
- let output = `## Image Quality Report\n\n`;
2095
- output += `- **Total Images**: ${report.total_images}\n`;
2096
- output += `- **Corrupted**: ${report.corrupted_count}\n`;
2097
- output += `- **Average Resolution**: ${Math.round(report.average_width)}x${Math.round(report.average_height)}\n`;
2098
- output += `- **Blurry Images**: ${report.blurry_count}\n\n`;
2099
- if (report.individual_results.length > 0) {
2100
- output += `### Sample Detail (Top 5)\n`;
2101
- report.individual_results.slice(0, 5).forEach(img => {
2102
- const statusLabel = img.status === "ok" ? (img.is_blurry ? "[warn]" : "[ok]") : "[error]";
2103
- output += `${statusLabel} **${img.filename}**: ${img.width}x${img.height} | Mode: ${img.mode} | Blur: ${img.blur_score?.toFixed(1)}\n`;
2272
+ case "analyze_media_quality": {
2273
+ const inputPath = String(request.params.arguments?.path);
2274
+ if (!fs.existsSync(inputPath)) {
2275
+ throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
2276
+ }
2277
+ try {
2278
+ const report = await mediaAnalyzer.analyze(inputPath);
2279
+ let output = `## Media Quality Report\n\n`;
2280
+ output += `- **Total Files**: ${report.total_files}\n`;
2281
+ output += `- **OK Files**: ${report.ok_files}\n`;
2282
+ output += `- **Failed Files**: ${report.failed_files}\n`;
2283
+ if ('avg_audio_duration' in report && report.avg_audio_duration) {
2284
+ output += `- **Average Audio Duration**: ${report.avg_audio_duration.toFixed(2)}s\n`;
2285
+ }
2286
+ if ('avg_video_duration' in report && report.avg_video_duration) {
2287
+ output += `- **Average Video Duration**: ${report.avg_video_duration.toFixed(2)}s\n`;
2288
+ output += `- **Average FPS**: ${report.avg_fps?.toFixed(1)}\n`;
2289
+ }
2290
+ output += `\n### Sample Detail (Top 5)\n`;
2291
+ report.details.slice(0, 5).forEach(item => {
2292
+ const statusLabel = item.status === "ok" ? "[ok]" : "[error]";
2293
+ if (item.type === "audio" && 'sample_rate' in item) {
2294
+ output += `${statusLabel} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
2295
+ }
2296
+ else if (item.type === "video" && 'width' in item) {
2297
+ output += `${statusLabel} **${item.filename}**: ${item.width}x${item.height} | ${item.fps?.toFixed(1)}fps | Risk: ${item.corruption_risk}\n`;
2298
+ }
2299
+ else {
2300
+ output += `${statusLabel} **${item.filename}**: ${item.error}\n`;
2301
+ }
2104
2302
  });
2303
+ return {
2304
+ content: [{ type: "text", text: output }]
2305
+ };
2306
+ }
2307
+ catch (error) {
2308
+ return {
2309
+ content: [{ type: "text", text: `ERROR: Media analysis failed: ${error.message}` }],
2310
+ isError: true
2311
+ };
2105
2312
  }
2106
- return {
2107
- content: [{ type: "text", text: output }]
2108
- };
2109
- }
2110
- catch (error) {
2111
- return {
2112
- content: [{ type: "text", text: `ERROR: Image analysis failed: ${error.message}` }],
2113
- isError: true
2114
- };
2115
- }
2116
- }
2117
- case "analyze_media_quality": {
2118
- const inputPath = String(request.params.arguments?.path);
2119
- if (!fs.existsSync(inputPath)) {
2120
- throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
2121
2313
  }
2122
- try {
2123
- const report = await mediaAnalyzer.analyze(inputPath);
2124
- let output = `## Media Quality Report\n\n`;
2125
- output += `- **Total Files**: ${report.total_files}\n`;
2126
- output += `- **OK Files**: ${report.ok_files}\n`;
2127
- output += `- **Failed Files**: ${report.failed_files}\n`;
2128
- if ('avg_audio_duration' in report && report.avg_audio_duration) {
2129
- output += `- **Average Audio Duration**: ${report.avg_audio_duration.toFixed(2)}s\n`;
2130
- }
2131
- if ('avg_video_duration' in report && report.avg_video_duration) {
2132
- output += `- **Average Video Duration**: ${report.avg_video_duration.toFixed(2)}s\n`;
2133
- output += `- **Average FPS**: ${report.avg_fps?.toFixed(1)}\n`;
2134
- }
2135
- output += `\n### Sample Detail (Top 5)\n`;
2136
- report.details.slice(0, 5).forEach(item => {
2137
- const statusLabel = item.status === "ok" ? "[ok]" : "[error]";
2138
- if (item.type === "audio" && 'sample_rate' in item) {
2139
- output += `${statusLabel} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
2314
+ case "generate_quality_report": {
2315
+ const datasetId = String(request.params.arguments?.dataset_id);
2316
+ const datasetPath = String(request.params.arguments?.dataset_path);
2317
+ if (!fs.existsSync(datasetPath)) {
2318
+ throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
2319
+ }
2320
+ try {
2321
+ // Optionally load text quality from metadata if available
2322
+ const metadata = await metadataStore.getDataset(datasetId);
2323
+ // TODO: Integrate text quality analysis when available
2324
+ const textQuality = null;
2325
+ const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
2326
+ // Save report to metadata
2327
+ if (metadata) {
2328
+ metadata.unified_quality_report = report;
2329
+ await metadataStore.saveDataset(metadata);
2140
2330
  }
2141
- else if (item.type === "video" && 'width' in item) {
2142
- output += `${statusLabel} **${item.filename}**: ${item.width}x${item.height} | ${item.fps?.toFixed(1)}fps | Risk: ${item.corruption_risk}\n`;
2331
+ let output = `# Unified Quality Report\n\n`;
2332
+ output += `**Dataset**: ${datasetId}\n`;
2333
+ output += `**Modalities**: ${report.modalities.join(", ")}\n`;
2334
+ output += `**Overall Quality Score**: ${report.overall_quality_score}/100\n\n`;
2335
+ if (report.text_quality) {
2336
+ output += `## Text Quality\n`;
2337
+ output += `- Rows: ${report.text_quality.row_count}\n`;
2338
+ output += `- Columns: ${report.text_quality.column_count}\n`;
2339
+ output += `- Missing: ${report.text_quality.missing_percentage.toFixed(1)}%\n`;
2340
+ output += `- Duplicates: ${report.text_quality.duplicate_percentage.toFixed(1)}%\n\n`;
2143
2341
  }
2144
- else {
2145
- output += `${statusLabel} **${item.filename}**: ${item.error}\n`;
2342
+ if (report.image_quality) {
2343
+ output += `## Image Quality\n`;
2344
+ output += `- Total Images: ${report.image_quality.total_images}\n`;
2345
+ output += `- Corrupted: ${report.image_quality.corrupted_count}\n`;
2346
+ output += `- Avg Resolution: ${report.image_quality.avg_resolution}\n`;
2347
+ output += `- Blurry: ${report.image_quality.blurry_percentage.toFixed(1)}%\n\n`;
2146
2348
  }
2147
- });
2148
- return {
2149
- content: [{ type: "text", text: output }]
2150
- };
2151
- }
2152
- catch (error) {
2153
- return {
2154
- content: [{ type: "text", text: `ERROR: Media analysis failed: ${error.message}` }],
2155
- isError: true
2156
- };
2157
- }
2158
- }
2159
- case "generate_quality_report": {
2160
- const datasetId = String(request.params.arguments?.dataset_id);
2161
- const datasetPath = String(request.params.arguments?.dataset_path);
2162
- if (!fs.existsSync(datasetPath)) {
2163
- throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
2164
- }
2165
- try {
2166
- // Optionally load text quality from metadata if available
2167
- const metadata = await metadataStore.getDataset(datasetId);
2168
- // TODO: Integrate text quality analysis when available
2169
- const textQuality = null;
2170
- const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
2171
- // Save report to metadata
2172
- if (metadata) {
2173
- metadata.unified_quality_report = report;
2174
- await metadataStore.saveDataset(metadata);
2175
- }
2176
- let output = `# Unified Quality Report\n\n`;
2177
- output += `**Dataset**: ${datasetId}\n`;
2178
- output += `**Modalities**: ${report.modalities.join(", ")}\n`;
2179
- output += `**Overall Quality Score**: ${report.overall_quality_score}/100\n\n`;
2180
- if (report.text_quality) {
2181
- output += `## Text Quality\n`;
2182
- output += `- Rows: ${report.text_quality.row_count}\n`;
2183
- output += `- Columns: ${report.text_quality.column_count}\n`;
2184
- output += `- Missing: ${report.text_quality.missing_percentage.toFixed(1)}%\n`;
2185
- output += `- Duplicates: ${report.text_quality.duplicate_percentage.toFixed(1)}%\n\n`;
2186
- }
2187
- if (report.image_quality) {
2188
- output += `## Image Quality\n`;
2189
- output += `- Total Images: ${report.image_quality.total_images}\n`;
2190
- output += `- Corrupted: ${report.image_quality.corrupted_count}\n`;
2191
- output += `- Avg Resolution: ${report.image_quality.avg_resolution}\n`;
2192
- output += `- Blurry: ${report.image_quality.blurry_percentage.toFixed(1)}%\n\n`;
2193
- }
2194
- if (report.audio_quality) {
2195
- output += `## Audio Quality\n`;
2196
- output += `- Total Files: ${report.audio_quality.total_files}\n`;
2197
- output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
2198
- output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
2199
- output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
2200
- }
2201
- if (report.video_quality) {
2202
- output += `## Video Quality\n`;
2203
- output += `- Total Files: ${report.video_quality.total_files}\n`;
2204
- output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
2205
- output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
2206
- output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
2207
- }
2208
- output += `## Recommendations\n`;
2209
- report.recommendations.forEach(rec => {
2210
- output += `- ${rec}\n`;
2211
- });
2212
- return {
2213
- content: [{ type: "text", text: output }]
2214
- };
2215
- }
2216
- catch (error) {
2217
- return {
2218
- content: [{ type: "text", text: `ERROR: Quality report generation failed: ${error.message}` }],
2219
- isError: true
2220
- };
2349
+ if (report.audio_quality) {
2350
+ output += `## Audio Quality\n`;
2351
+ output += `- Total Files: ${report.audio_quality.total_files}\n`;
2352
+ output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
2353
+ output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
2354
+ output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
2355
+ }
2356
+ if (report.video_quality) {
2357
+ output += `## Video Quality\n`;
2358
+ output += `- Total Files: ${report.video_quality.total_files}\n`;
2359
+ output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
2360
+ output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
2361
+ output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
2362
+ }
2363
+ output += `## Recommendations\n`;
2364
+ report.recommendations.forEach(rec => {
2365
+ output += `- ${rec}\n`;
2366
+ });
2367
+ return {
2368
+ content: [{ type: "text", text: output }]
2369
+ };
2370
+ }
2371
+ catch (error) {
2372
+ return {
2373
+ content: [{ type: "text", text: `ERROR: Quality report generation failed: ${error.message}` }],
2374
+ isError: true
2375
+ };
2376
+ }
2221
2377
  }
2378
+ default:
2379
+ throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
2222
2380
  }
2223
- default:
2224
- throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
2225
- }
2381
+ }); // end requestQueue.enqueue
2226
2382
  });
2227
2383
  async function main() {
2228
2384
  const args = process.argv.slice(2);