vesper-wizard 2.0.7 → 2.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -266,6 +266,34 @@ function logError(err, context) {
266
266
  fs.appendFileSync(errorLogPath, msg);
267
267
  console.error(`[Vesper] Critical error logged to ${errorLogPath}`);
268
268
  }
269
// --- Request Queue: serialize all MCP tool calls to prevent crashes ---
/**
 * FIFO queue that runs tasks strictly one at a time.
 *
 * Each enqueued task is started only after the previous one has settled.
 * A failing task rejects only its own promise; later tasks still run.
 */
class RequestQueue {
    /** Pending entries in arrival order: `{ resolve, reject, task }`. */
    queue = [];
    /** True while a drain loop is active; prevents a second concurrent loop. */
    running = false;
    /**
     * Schedule `task` to run after every previously enqueued task has settled.
     *
     * @param {() => any} task - Function (sync or async) producing the result.
     * @returns {Promise<any>} Settles with the task's own result or error.
     */
    enqueue(task) {
        return new Promise((resolve, reject) => {
            this.queue.push({ resolve, reject, task });
            // drain() is async: never leave its promise floating. Task errors are
            // routed to the task's own promise inside drain(), so anything that
            // reaches this handler is an unexpected internal failure.
            this.drain().catch((err) => {
                console.error(`[Vesper] Request queue drain failed: ${err?.message || err}`);
            });
        });
    }
    /**
     * Run queued tasks one by one until the queue is empty.
     * Returns immediately when another drain loop is already active.
     *
     * @returns {Promise<void>}
     */
    async drain() {
        if (this.running) {
            return;
        }
        this.running = true;
        try {
            while (this.queue.length > 0) {
                const item = this.queue.shift();
                try {
                    const result = await item.task();
                    item.resolve(result);
                }
                catch (err) {
                    item.reject(err);
                }
            }
        }
        finally {
            // Always release the flag; a stuck `running === true` would block
            // every future enqueue() from ever being processed.
            this.running = false;
        }
    }
}
const requestQueue = new RequestQueue();
269
297
  const SPINNER_FRAMES = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];
270
298
  function printLaunchScreen() {
271
299
  const screen = `
@@ -599,6 +627,18 @@ jobManager.on("processJob", async (job, execute) => {
599
627
  async function handlePrepareJob(jobId, query, requirements) {
600
628
  hydrateExternalKeys();
601
629
  const update = (updates) => jobManager.updateJob(jobId, updates);
630
+ const pipelineSteps = ["search", "validate", "download", "normalize", "quality", "register"];
631
+ const stepStatus = {};
632
+ for (const s of pipelineSteps)
633
+ stepStatus[s] = "pending";
634
+ const markPipelineStep = (step, status) => {
635
+ stepStatus[step] = status;
636
+ const summary = pipelineSteps.map(s => {
637
+ const st = stepStatus[s];
638
+ return st === "done" ? `[${s}]` : st === "running" ? `>${s}<` : st === "failed" ? `!${s}!` : st === "skipped" ? `~${s}~` : ` ${s} `;
639
+ }).join(" → ");
640
+ console.error(`[Pipeline] ${summary}`);
641
+ };
602
642
  // Ensure core Python packages are available for dataset operations
603
643
  try {
604
644
  await ensurePythonModules([
@@ -646,11 +686,14 @@ async function handlePrepareJob(jobId, query, requirements) {
646
686
  progress: 20,
647
687
  status_text: `Using explicit dataset id: ${datasetIdForDownload} (${source})`
648
688
  });
689
+ markPipelineStep("search", "skipped");
649
690
  }
650
691
  else {
692
+ markPipelineStep("search", "running");
651
693
  update({ progress: 10, status_text: "Searching for best dataset matching query..." });
652
694
  const results = await searchEngine.search(query, { limit: 10 });
653
695
  if (results.length === 0) {
696
+ markPipelineStep("search", "failed");
654
697
  throw new Error("No datasets found matching the query. Try refining your search terms.");
655
698
  }
656
699
  // Pick the best result that we can actually download (skip sources requiring missing credentials)
@@ -670,8 +713,10 @@ async function handlePrepareJob(jobId, query, requirements) {
670
713
  progress: 20,
671
714
  status_text: `Matched: ${selectedDataset.name} (${selectedDataset.source})`
672
715
  });
716
+ markPipelineStep("search", "done");
673
717
  }
674
718
  // Pre-check credentials for sources that require them
719
+ markPipelineStep("validate", "running");
675
720
  if (source === "kaggle") {
676
721
  const hasKaggleCreds = !!(process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY && process.env.KAGGLE_USERNAME !== "YOUR_KAGGLE_USERNAME");
677
722
  if (!hasKaggleCreds) {
@@ -679,8 +724,11 @@ async function handlePrepareJob(jobId, query, requirements) {
679
724
  }
680
725
  }
681
726
  if (source === "dataworld" && !hasDataWorldToken()) {
727
+ markPipelineStep("validate", "failed");
682
728
  throw new Error("data.world token not set. Use the configure_keys tool or set DW_AUTH_TOKEN environment variable.");
683
729
  }
730
+ markPipelineStep("validate", "done");
731
+ markPipelineStep("download", "running");
684
732
  update({ progress: 30, status_text: `Starting download from ${source}...` });
685
733
  // ensureData handles download and returns path to the raw file
686
734
  let rawFilePath = await dataIngestor.ensureData(datasetIdForDownload, source, (msg, prog) => {
@@ -743,15 +791,50 @@ async function handlePrepareJob(jobId, query, requirements) {
743
791
  update({ progress: 69, status_text: `Sample target met: ${currentRows.toLocaleString()} rows` });
744
792
  }
745
793
  }
794
+ markPipelineStep("download", "done");
795
+ // ── Normalize step: convert any raw format → parquet ──
796
+ markPipelineStep("normalize", "running");
797
+ const rawExt = path.extname(rawFilePath).toLowerCase();
798
+ if (rawExt !== ".parquet" && rawExt !== ".pq") {
799
+ update({ progress: 70, status_text: "Normalizing to parquet..." });
800
+ const normalizedDir = path.join(dataRoot, "data", "normalized");
801
+ if (!fs.existsSync(normalizedDir))
802
+ fs.mkdirSync(normalizedDir, { recursive: true });
803
+ const safeId = toSafeDatasetPathFragment(datasetIdForDownload);
804
+ const normalizedPath = path.join(normalizedDir, `${safeId}.parquet`);
805
+ try {
806
+ const normScript = path.join(dataRoot, "python", "normalize_engine.py");
807
+ const normResult = await runPythonJson(normScript, [rawFilePath, normalizedPath]);
808
+ if (normResult.ok && fs.existsSync(normalizedPath)) {
809
+ console.error(`[Prepare] Normalized ${rawExt} → parquet (${normResult.rows} rows)`);
810
+ rawFilePath = normalizedPath;
811
+ markPipelineStep("normalize", "done");
812
+ }
813
+ else {
814
+ console.error(`[Prepare] Normalize failed: ${normResult.error}, continuing with raw file`);
815
+ markPipelineStep("normalize", "skipped");
816
+ }
817
+ }
818
+ catch (e) {
819
+ console.error(`[Prepare] Normalize step failed: ${e?.message || e}, continuing with raw file`);
820
+ markPipelineStep("normalize", "skipped");
821
+ }
822
+ }
823
+ else {
824
+ markPipelineStep("normalize", "done");
825
+ }
746
826
  let qualityScore = selectedDataset?.quality_score ?? 70;
747
- update({ progress: 70, status_text: "Analyzing dataset quality..." });
827
+ markPipelineStep("quality", "running");
828
+ update({ progress: 75, status_text: "Analyzing dataset quality..." });
748
829
  try {
749
830
  const report = await qualityAnalyzer.analyze(rawFilePath);
750
831
  qualityScore = report.overall_score;
832
+ markPipelineStep("quality", "done");
751
833
  }
752
834
  catch (error) {
753
835
  console.error(`[Prepare] Quality analysis failed for ${datasetIdForDownload}: ${error?.message || error}`);
754
836
  update({ progress: 78, status_text: "Quality analysis skipped (unsupported schema). Continuing installation..." });
837
+ markPipelineStep("quality", "skipped");
755
838
  }
756
839
  if (selectedDataset) {
757
840
  metadataStore.saveDataset({
@@ -759,15 +842,19 @@ async function handlePrepareJob(jobId, query, requirements) {
759
842
  quality_score: qualityScore
760
843
  });
761
844
  }
845
+ markPipelineStep("register", "running");
762
846
  update({ progress: 85, status_text: "Installing dataset into project..." });
763
847
  const installPath = await installService.install(datasetIdForDownload, rawFilePath);
764
848
  update({ progress: 100, status_text: "Preparation complete!" });
765
849
  // Register prepared dataset in local registry for lookup by export/list tools
766
850
  try {
767
851
  upsertRegistry(datasetIdForDownload, installPath, "completed");
852
+ markPipelineStep("register", "done");
853
+ markStepComplete(datasetIdForDownload, "prepare");
768
854
  }
769
855
  catch (e) {
770
856
  console.error(`[Registry] Failed to write registry for ${datasetIdForDownload}: ${e?.message || e}`);
857
+ markPipelineStep("register", "failed");
771
858
  }
772
859
  return installPath;
773
860
  }
@@ -960,6 +1047,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
960
1047
  kaggle_ref: { type: "string", description: "Kaggle dataset ref (owner/dataset)." },
961
1048
  urls: { type: "array", items: { type: "string" }, description: "Direct asset URLs." },
962
1049
  output_format: { type: "string", enum: ["webdataset", "imagefolder", "parquet"], description: "Output asset format." },
1050
+ target_dir: { type: "string", description: "Optional local directory where downloaded assets should be written. If provided, Vesper writes directly to this directory instead of managed asset storage." },
1051
+ output_dir: { type: "string", description: "Alias for target_dir. When provided, downloaded assets are written directly to this local directory." },
963
1052
  max_items: { type: "number", description: "Optional cap on number of assets to fetch." },
964
1053
  workers: { type: "number", description: "Parallel worker count (default 8)." },
965
1054
  image_column: { type: "string", description: "Explicit image column name. If omitted, auto-detected from HF features, column names, and sample values." },
@@ -1259,110 +1348,237 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1259
1348
  ],
1260
1349
  };
1261
1350
  });
1262
- // Call Tool
1351
+ // Call Tool — all requests are serialized through a queue to prevent crashes from parallel calls
1263
1352
  server.setRequestHandler(CallToolRequestSchema, async (request) => {
1264
- // --- Pipeline Enforcement ---
1265
- // Map tool names to pipeline steps
1266
- const toolToStep = {
1267
- vesper_search: "search",
1268
- vesper_download: "download",
1269
- vesper_analyze: "analyze",
1270
- vesper_clean: "clean",
1271
- vesper_split: "split",
1272
- vesper_export: "export",
1273
- prepare_dataset: "prepare",
1274
- };
1275
- // Extract dataset_id if present and normalize
1276
- let datasetId = request.params.arguments?.dataset_id || request.params.arguments?.query || "";
1277
- if (datasetId)
1278
- datasetId = parseDatasetId(String(datasetId));
1279
- // Pipeline rules
1280
- const stepOrder = ["search", "download", "analyze", "clean", "split", "export"];
1281
- const prereqs = {
1282
- vesper_download: ["search"],
1283
- vesper_analyze: ["download"],
1284
- vesper_clean: ["analyze"],
1285
- vesper_split: ["clean"],
1286
- vesper_export: ["split"],
1287
- };
1288
- const tool = String(request.params.name);
1289
- const step = toolToStep[tool];
1290
- if (step && datasetId) {
1291
- // Check prerequisites
1292
- const required = prereqs[tool] || [];
1293
- for (const req of required) {
1294
- if (!hasStep(String(datasetId), req)) {
1295
- // Auto-run missing step if possible, else error
1296
- // For export, auto-run prepare_dataset if split missing
1297
- if (tool === "vesper_export" && req === "split") {
1298
- // Auto-trigger prepare_dataset (start a background prepare job)
1299
- try {
1300
- jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false });
1301
- // Mark split as complete so export can proceed; export handler will also wait for data if needed.
1302
- markStepComplete(String(datasetId), "split");
1353
+ return requestQueue.enqueue(async () => {
1354
+ // --- Pipeline Enforcement ---
1355
+ // Map tool names to pipeline steps
1356
+ const toolToStep = {
1357
+ vesper_search: "search",
1358
+ vesper_download: "download",
1359
+ vesper_analyze: "analyze",
1360
+ vesper_clean: "clean",
1361
+ vesper_split: "split",
1362
+ vesper_export: "export",
1363
+ prepare_dataset: "prepare",
1364
+ };
1365
+ // Extract dataset_id if present and normalize
1366
+ let datasetId = request.params.arguments?.dataset_id || request.params.arguments?.query || "";
1367
+ if (datasetId)
1368
+ datasetId = parseDatasetId(String(datasetId));
1369
+ // Pipeline rules
1370
+ const stepOrder = ["search", "download", "analyze", "clean", "split", "export"];
1371
+ const prereqs = {
1372
+ vesper_download: ["search"],
1373
+ vesper_analyze: ["download"],
1374
+ vesper_clean: ["analyze"],
1375
+ vesper_split: ["clean"],
1376
+ vesper_export: ["split"],
1377
+ };
1378
+ const tool = String(request.params.name);
1379
+ const step = toolToStep[tool];
1380
+ if (step && datasetId) {
1381
+ // Check prerequisites
1382
+ const required = prereqs[tool] || [];
1383
+ for (const req of required) {
1384
+ if (!hasStep(String(datasetId), req)) {
1385
+ // Auto-run missing step if possible, else error
1386
+ // For export, auto-run prepare_dataset if split missing
1387
+ if (tool === "vesper_export" && req === "split") {
1388
+ // Auto-trigger prepare_dataset (start a background prepare job)
1389
+ try {
1390
+ jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false });
1391
+ // Mark split as complete so export can proceed; export handler will also wait for data if needed.
1392
+ markStepComplete(String(datasetId), "split");
1393
+ }
1394
+ catch (e) {
1395
+ console.error(`[Pipeline] Failed to auto-trigger prepare for ${datasetId}: ${e?.message || e}`);
1396
+ return {
1397
+ content: [{ type: "text", text: `ERROR: Failed to auto-run prepare for ${datasetId}. Please run prepare_dataset first.` }],
1398
+ isError: true,
1399
+ };
1400
+ }
1303
1401
  }
1304
- catch (e) {
1305
- console.error(`[Pipeline] Failed to auto-trigger prepare for ${datasetId}: ${e?.message || e}`);
1402
+ else {
1306
1403
  return {
1307
- content: [{ type: "text", text: `ERROR: Failed to auto-run prepare for ${datasetId}. Please run prepare_dataset first.` }],
1404
+ content: [{ type: "text", text: `ERROR: Cannot run ${tool} before ${req}. Please run ${req} first.` }],
1308
1405
  isError: true,
1309
1406
  };
1310
1407
  }
1311
1408
  }
1312
- else {
1409
+ }
1410
+ // Mark this step as complete
1411
+ markStepComplete(String(datasetId), String(step));
1412
+ }
1413
+ switch (request.params.name) {
1414
+ case "unified_dataset_api": {
1415
+ hydrateExternalKeys();
1416
+ const operation = String(request.params.arguments?.operation || "").trim().toLowerCase();
1417
+ const source = String(request.params.arguments?.source || "auto").trim().toLowerCase();
1418
+ const includeUnavailable = request.params.arguments?.include_unavailable === true;
1419
+ const publicOnly = request.params.arguments?.public_only !== false;
1420
+ try {
1421
+ if (operation === "providers") {
1422
+ return {
1423
+ content: [{ type: "text", text: JSON.stringify({ providers: unifiedDatasetGateway.getProviderStatuses(includeUnavailable) }, null, 2) }],
1424
+ };
1425
+ }
1426
+ if (operation === "discover") {
1427
+ const query = String(request.params.arguments?.query || "").trim();
1428
+ if (!query) {
1429
+ throw new McpError(ErrorCode.InvalidParams, "query is required for operation='discover'");
1430
+ }
1431
+ const result = await unifiedDatasetGateway.discover({
1432
+ query,
1433
+ source,
1434
+ limit: Number(request.params.arguments?.limit || 10),
1435
+ publicOnly,
1436
+ });
1437
+ return {
1438
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1439
+ };
1440
+ }
1441
+ if (operation === "download") {
1442
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1443
+ if (!datasetId) {
1444
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='download'");
1445
+ }
1446
+ try {
1447
+ await ensurePythonModules([{ module: "datasets", packageName: "datasets" }]);
1448
+ }
1449
+ catch {
1450
+ // best effort; non-HF providers do not require this
1451
+ }
1452
+ const result = await unifiedDatasetGateway.download({
1453
+ datasetId,
1454
+ source,
1455
+ targetDir: request.params.arguments?.target_dir ? String(request.params.arguments.target_dir) : undefined,
1456
+ });
1457
+ try {
1458
+ upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
1459
+ }
1460
+ catch (e) {
1461
+ console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
1462
+ }
1463
+ return {
1464
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1465
+ };
1466
+ }
1467
+ if (operation === "info") {
1468
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1469
+ if (!datasetId) {
1470
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='info'");
1471
+ }
1472
+ const result = await unifiedDatasetGateway.info({
1473
+ datasetId,
1474
+ source,
1475
+ publicOnly,
1476
+ });
1477
+ return {
1478
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1479
+ };
1480
+ }
1481
+ throw new McpError(ErrorCode.InvalidParams, `Unsupported operation: ${operation}`);
1482
+ }
1483
+ catch (error) {
1313
1484
  return {
1314
- content: [{ type: "text", text: `ERROR: Cannot run ${tool} before ${req}. Please run ${req} first.` }],
1485
+ content: [{ type: "text", text: `ERROR: unified gateway failed: ${error.message}` }],
1315
1486
  isError: true,
1316
1487
  };
1317
1488
  }
1318
1489
  }
1319
- }
1320
- // Mark this step as complete
1321
- markStepComplete(String(datasetId), String(step));
1322
- }
1323
- switch (request.params.name) {
1324
- case "unified_dataset_api": {
1325
- hydrateExternalKeys();
1326
- const operation = String(request.params.arguments?.operation || "").trim().toLowerCase();
1327
- const source = String(request.params.arguments?.source || "auto").trim().toLowerCase();
1328
- const includeUnavailable = request.params.arguments?.include_unavailable === true;
1329
- const publicOnly = request.params.arguments?.public_only !== false;
1330
- try {
1331
- if (operation === "providers") {
1332
- return {
1333
- content: [{ type: "text", text: JSON.stringify({ providers: unifiedDatasetGateway.getProviderStatuses(includeUnavailable) }, null, 2) }],
1334
- };
1490
+ case "vesper_search": {
1491
+ const query = String(request.params.arguments?.query);
1492
+ const limit = 5;
1493
+ const safeOnly = true; // Enable safe filter by default
1494
+ const enableJIT = request.params.arguments?.enable_jit === true;
1495
+ if (!query) {
1496
+ throw new McpError(ErrorCode.InvalidParams, "Query is required");
1335
1497
  }
1336
- if (operation === "discover") {
1337
- const query = String(request.params.arguments?.query || "").trim();
1338
- if (!query) {
1339
- throw new McpError(ErrorCode.InvalidParams, "query is required for operation='discover'");
1340
- }
1341
- const result = await unifiedDatasetGateway.discover({
1498
+ const results = await searchEngine.search(query, { limit, safeOnly, enableJIT });
1499
+ const formattedOutput = formatSearchResults(results);
1500
+ return {
1501
+ content: [
1502
+ {
1503
+ type: "text",
1504
+ text: formattedOutput,
1505
+ },
1506
+ ],
1507
+ };
1508
+ }
1509
+ case "discover_datasets": {
1510
+ hydrateExternalKeys();
1511
+ const query = String(request.params.arguments?.query || "").trim();
1512
+ const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1513
+ const limit = Number(request.params.arguments?.limit || 10);
1514
+ if (!query) {
1515
+ throw new McpError(ErrorCode.InvalidParams, "query is required");
1516
+ }
1517
+ try {
1518
+ const gatewayResult = await unifiedDatasetGateway.discover({
1342
1519
  query,
1343
1520
  source,
1344
- limit: Number(request.params.arguments?.limit || 10),
1345
- publicOnly,
1521
+ limit,
1522
+ publicOnly: false,
1346
1523
  });
1524
+ const results = gatewayResult.results;
1525
+ const recipeScript = path.join(dataRoot, "python", "asset_downloader_engine.py");
1526
+ for (const ds of results.slice(0, limit)) {
1527
+ const info = {
1528
+ dataset_id: ds.id,
1529
+ id: ds.id,
1530
+ source: ds.source,
1531
+ repo_id: ds.id,
1532
+ total_images: ds.total_examples || 0,
1533
+ image_column: undefined,
1534
+ recipes_dir: path.join(dataRoot, "recipes"),
1535
+ };
1536
+ try {
1537
+ await runPythonJson(recipeScript, ["build_recipe", JSON.stringify(info)]);
1538
+ }
1539
+ catch {
1540
+ // best-effort recipe generation; ignore discovery-time recipe failures
1541
+ }
1542
+ }
1543
+ const formattedOutput = formatSearchResults(results.slice(0, limit));
1544
+ const noteBlock = gatewayResult.notes.length > 0
1545
+ ? `\nGateway notes:\n- ${gatewayResult.notes.join("\n- ")}`
1546
+ : "";
1347
1547
  return {
1348
- content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1548
+ content: [{ type: "text", text: `${formattedOutput}${noteBlock}` }]
1349
1549
  };
1350
1550
  }
1351
- if (operation === "download") {
1352
- const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1353
- if (!datasetId) {
1354
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='download'");
1355
- }
1551
+ catch (error) {
1552
+ return {
1553
+ content: [{ type: "text", text: `ERROR: discover failed: ${error.message}` }],
1554
+ isError: true,
1555
+ };
1556
+ }
1557
+ }
1558
+ case "download_dataset": {
1559
+ hydrateExternalKeys();
1560
+ const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1561
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1562
+ const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments.target_dir) : undefined;
1563
+ if (!datasetId) {
1564
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1565
+ }
1566
+ // Pre-install Python datasets library for HuggingFace fallback
1567
+ if (source === "huggingface") {
1356
1568
  try {
1357
- await ensurePythonModules([{ module: "datasets", packageName: "datasets" }]);
1569
+ await ensurePythonModules([
1570
+ { module: "datasets", packageName: "datasets" },
1571
+ ]);
1358
1572
  }
1359
1573
  catch {
1360
- // best effort; non-HF providers do not require this
1574
+ // Continue - direct download may still work
1361
1575
  }
1576
+ }
1577
+ try {
1362
1578
  const result = await unifiedDatasetGateway.download({
1363
1579
  datasetId,
1364
1580
  source,
1365
- targetDir: request.params.arguments?.target_dir ? String(request.params.arguments.target_dir) : undefined,
1581
+ targetDir,
1366
1582
  });
1367
1583
  try {
1368
1584
  upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
@@ -1370,851 +1586,761 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1370
1586
  catch (e) {
1371
1587
  console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
1372
1588
  }
1589
+ const noteBlock = result.notes.length > 0 ? `\nNotes:\n- ${result.notes.join("\n- ")}` : "";
1373
1590
  return {
1374
- content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1591
+ content: [{ type: "text", text: `Download complete: ${result.copied_to || result.local_path}${noteBlock}` }]
1375
1592
  };
1376
1593
  }
1377
- if (operation === "info") {
1378
- const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1379
- if (!datasetId) {
1380
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='info'");
1381
- }
1382
- const result = await unifiedDatasetGateway.info({
1383
- datasetId,
1384
- source,
1385
- publicOnly,
1386
- });
1594
+ catch (error) {
1387
1595
  return {
1388
- content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1596
+ content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
1597
+ isError: true,
1389
1598
  };
1390
1599
  }
1391
- throw new McpError(ErrorCode.InvalidParams, `Unsupported operation: ${operation}`);
1392
- }
1393
- catch (error) {
1394
- return {
1395
- content: [{ type: "text", text: `ERROR: unified gateway failed: ${error.message}` }],
1396
- isError: true,
1397
- };
1398
1600
  }
1399
- }
1400
- case "vesper_search": {
1401
- const query = String(request.params.arguments?.query);
1402
- const limit = 5;
1403
- const safeOnly = true; // Enable safe filter by default
1404
- const enableJIT = request.params.arguments?.enable_jit === true;
1405
- if (!query) {
1406
- throw new McpError(ErrorCode.InvalidParams, "Query is required");
1407
- }
1408
- const results = await searchEngine.search(query, { limit, safeOnly, enableJIT });
1409
- const formattedOutput = formatSearchResults(results);
1410
- return {
1411
- content: [
1412
- {
1413
- type: "text",
1414
- text: formattedOutput,
1415
- },
1416
- ],
1417
- };
1418
- }
1419
- case "discover_datasets": {
1420
- hydrateExternalKeys();
1421
- const query = String(request.params.arguments?.query || "").trim();
1422
- const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1423
- const limit = Number(request.params.arguments?.limit || 10);
1424
- if (!query) {
1425
- throw new McpError(ErrorCode.InvalidParams, "query is required");
1426
- }
1427
- try {
1428
- const gatewayResult = await unifiedDatasetGateway.discover({
1429
- query,
1430
- source,
1431
- limit,
1432
- publicOnly: false,
1433
- });
1434
- const results = gatewayResult.results;
1435
- const recipeScript = path.join(dataRoot, "python", "asset_downloader_engine.py");
1436
- for (const ds of results.slice(0, limit)) {
1437
- const info = {
1438
- dataset_id: ds.id,
1439
- id: ds.id,
1440
- source: ds.source,
1441
- repo_id: ds.id,
1442
- total_images: ds.total_examples || 0,
1443
- image_column: undefined,
1444
- recipes_dir: path.join(dataRoot, "recipes"),
1601
+ case "vesper_download_assets": {
1602
+ hydrateExternalKeys();
1603
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1604
+ const source = String(request.params.arguments?.source || "").trim().toLowerCase();
1605
+ // Auto-infer repo_id from dataset_id if not provided (common for HuggingFace)
1606
+ const repoId = request.params.arguments?.repo_id
1607
+ ? String(request.params.arguments.repo_id)
1608
+ : (source === "huggingface" && datasetId.includes("/") ? datasetId : undefined);
1609
+ const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
1610
+ const urls = Array.isArray(request.params.arguments?.urls)
1611
+ ? (request.params.arguments?.urls).map(v => String(v))
1612
+ : undefined;
1613
+ const outputFormat = String(request.params.arguments?.output_format || "webdataset");
1614
+ const requestedOutputDir = request.params.arguments?.target_dir
1615
+ ? String(request.params.arguments.target_dir).trim()
1616
+ : request.params.arguments?.output_dir
1617
+ ? String(request.params.arguments.output_dir).trim()
1618
+ : undefined;
1619
+ const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
1620
+ const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
1621
+ const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
1622
+ if (!datasetId || !source) {
1623
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
1624
+ }
1625
+ if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
1626
+ return {
1627
+ content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
1628
+ isError: true,
1445
1629
  };
1446
- try {
1447
- await runPythonJson(recipeScript, ["build_recipe", JSON.stringify(info)]);
1448
- }
1449
- catch {
1450
- // best-effort recipe generation; ignore discovery-time recipe failures
1451
- }
1452
1630
  }
1453
- const formattedOutput = formatSearchResults(results.slice(0, limit));
1454
- const noteBlock = gatewayResult.notes.length > 0
1455
- ? `\nGateway notes:\n- ${gatewayResult.notes.join("\n- ")}`
1456
- : "";
1457
- return {
1458
- content: [{ type: "text", text: `${formattedOutput}${noteBlock}` }]
1459
- };
1460
- }
1461
- catch (error) {
1462
- return {
1463
- content: [{ type: "text", text: `ERROR: discover failed: ${error.message}` }],
1464
- isError: true,
1465
- };
1466
- }
1467
- }
1468
- case "download_dataset": {
1469
- hydrateExternalKeys();
1470
- const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1471
- const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1472
- const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments.target_dir) : undefined;
1473
- if (!datasetId) {
1474
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1475
- }
1476
- // Pre-install Python datasets library for HuggingFace fallback
1477
- if (source === "huggingface") {
1631
+ const requiredModules = [
1632
+ { module: "aiohttp", packageName: "aiohttp" },
1633
+ ];
1634
+ if (source === "url") {
1635
+ requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
1636
+ }
1637
+ if (source === "huggingface") {
1638
+ requiredModules.push({ module: "datasets", packageName: "datasets" });
1639
+ requiredModules.push({ module: "PIL", packageName: "Pillow" });
1640
+ }
1641
+ if (source === "kaggle") {
1642
+ requiredModules.push({ module: "kaggle", packageName: "kaggle" });
1643
+ }
1478
1644
  try {
1479
- await ensurePythonModules([
1480
- { module: "datasets", packageName: "datasets" },
1481
- ]);
1645
+ await ensurePythonModules(requiredModules);
1482
1646
  }
1483
- catch {
1484
- // Continue - direct download may still work
1647
+ catch (error) {
1648
+ return {
1649
+ content: [{ type: "text", text: `ERROR: Python dependency setup failed: ${error.message}` }],
1650
+ isError: true,
1651
+ };
1485
1652
  }
1486
- }
1487
- try {
1488
- const result = await unifiedDatasetGateway.download({
1489
- datasetId,
1653
+ const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
1654
+ const payload = {
1655
+ dataset_id: datasetId,
1490
1656
  source,
1491
- targetDir,
1492
- });
1657
+ repo_id: repoId,
1658
+ kaggle_ref: kaggleRef,
1659
+ urls,
1660
+ output_format: outputFormat,
1661
+ output_dir: requestedOutputDir,
1662
+ max_items: maxItems,
1663
+ workers,
1664
+ image_column: imageColumn,
1665
+ output_root: path.join(dataRoot, "data", "assets"),
1666
+ recipes_dir: path.join(dataRoot, "recipes"),
1667
+ };
1493
1668
  try {
1494
- upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
1669
+ const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
1670
+ if (!result?.ok) {
1671
+ const errMsg = result?.error || "Unknown error";
1672
+ // Enhance error messages for common failures
1673
+ let hint = "";
1674
+ if (errMsg.includes("No image column")) {
1675
+ hint = "\n\nHint: Specify image_column parameter with the column name containing images/URLs.";
1676
+ }
1677
+ else if (errMsg.includes("Authentication") || errMsg.includes("401") || errMsg.includes("403")) {
1678
+ hint = "\n\nHint: Use configure_keys tool to set HF_TOKEN for gated/private datasets.";
1679
+ }
1680
+ return {
1681
+ content: [{ type: "text", text: `ERROR: asset download failed: ${errMsg}${hint}` }],
1682
+ isError: true,
1683
+ };
1684
+ }
1685
+ return {
1686
+ content: [{ type: "text", text: JSON.stringify(result.result, null, 2) }],
1687
+ };
1495
1688
  }
1496
- catch (e) {
1497
- console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
1689
+ catch (error) {
1690
+ return {
1691
+ content: [{ type: "text", text: `ERROR: asset downloader execution failed: ${error.message}` }],
1692
+ isError: true,
1693
+ };
1498
1694
  }
1499
- const noteBlock = result.notes.length > 0 ? `\nNotes:\n- ${result.notes.join("\n- ")}` : "";
1500
- return {
1501
- content: [{ type: "text", text: `Download complete: ${result.copied_to || result.local_path}${noteBlock}` }]
1502
- };
1503
1695
  }
1504
- catch (error) {
1505
- return {
1506
- content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
1507
- isError: true,
1508
- };
1509
- }
1510
- }
1511
- case "vesper_download_assets": {
1512
- hydrateExternalKeys();
1513
- const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1514
- const source = String(request.params.arguments?.source || "").trim().toLowerCase();
1515
- // Auto-infer repo_id from dataset_id if not provided (common for HuggingFace)
1516
- const repoId = request.params.arguments?.repo_id
1517
- ? String(request.params.arguments.repo_id)
1518
- : (source === "huggingface" && datasetId.includes("/") ? datasetId : undefined);
1519
- const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
1520
- const urls = Array.isArray(request.params.arguments?.urls)
1521
- ? (request.params.arguments?.urls).map(v => String(v))
1522
- : undefined;
1523
- const outputFormat = String(request.params.arguments?.output_format || "webdataset");
1524
- const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
1525
- const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
1526
- const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
1527
- if (!datasetId || !source) {
1528
- throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
1529
- }
1530
- if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
1531
- return {
1532
- content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
1533
- isError: true,
1534
- };
1535
- }
1536
- const requiredModules = [
1537
- { module: "aiohttp", packageName: "aiohttp" },
1538
- ];
1539
- if (source === "url") {
1540
- requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
1541
- }
1542
- if (source === "huggingface") {
1543
- requiredModules.push({ module: "datasets", packageName: "datasets" });
1544
- requiredModules.push({ module: "PIL", packageName: "Pillow" });
1545
- }
1546
- if (source === "kaggle") {
1547
- requiredModules.push({ module: "kaggle", packageName: "kaggle" });
1548
- }
1549
- try {
1550
- await ensurePythonModules(requiredModules);
1551
- }
1552
- catch (error) {
1696
+ case "configure_kaggle": {
1697
+ const username = String(request.params.arguments?.username || "").trim();
1698
+ const key = String(request.params.arguments?.key || "").trim();
1699
+ if (!username || !key) {
1700
+ throw new McpError(ErrorCode.InvalidParams, "username and key are required");
1701
+ }
1702
+ const r1 = secureKeys.set("kaggle_username", username);
1703
+ const r2 = secureKeys.set("kaggle_key", key);
1704
+ process.env.KAGGLE_USERNAME = username;
1705
+ process.env.KAGGLE_KEY = key;
1553
1706
  return {
1554
- content: [{ type: "text", text: `ERROR: Python dependency setup failed: ${error.message}` }],
1555
- isError: true,
1707
+ content: [{ type: "text", text: `Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
1556
1708
  };
1557
1709
  }
1558
- const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
1559
- const payload = {
1560
- dataset_id: datasetId,
1561
- source,
1562
- repo_id: repoId,
1563
- kaggle_ref: kaggleRef,
1564
- urls,
1565
- output_format: outputFormat,
1566
- max_items: maxItems,
1567
- workers,
1568
- image_column: imageColumn,
1569
- output_root: path.join(dataRoot, "data", "assets"),
1570
- recipes_dir: path.join(dataRoot, "recipes"),
1571
- };
1572
- try {
1573
- const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
1574
- if (!result?.ok) {
1575
- const errMsg = result?.error || "Unknown error";
1576
- // Enhance error messages for common failures
1577
- let hint = "";
1578
- if (errMsg.includes("No image column")) {
1579
- hint = "\n\nHint: Specify image_column parameter with the column name containing images/URLs.";
1710
+ case "configure_keys": {
1711
+ const hfToken = String(request.params.arguments?.hf_token || "").trim();
1712
+ const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
1713
+ const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
1714
+ const dataworldToken = String(request.params.arguments?.dataworld_token || "").trim();
1715
+ const saved = [];
1716
+ const methods = [];
1717
+ if (hfToken) {
1718
+ const r = secureKeys.set("hf_token", hfToken);
1719
+ if (r.ok) {
1720
+ process.env.HF_TOKEN = hfToken;
1721
+ saved.push("HF token");
1722
+ if (r.method)
1723
+ methods.push(r.method);
1724
+ }
1725
+ }
1726
+ if (kaggleUsername) {
1727
+ const r = secureKeys.set("kaggle_username", kaggleUsername);
1728
+ if (r.ok) {
1729
+ process.env.KAGGLE_USERNAME = kaggleUsername;
1730
+ saved.push("Kaggle username");
1731
+ if (r.method)
1732
+ methods.push(r.method);
1580
1733
  }
1581
- else if (errMsg.includes("Authentication") || errMsg.includes("401") || errMsg.includes("403")) {
1582
- hint = "\n\nHint: Use configure_keys tool to set HF_TOKEN for gated/private datasets.";
1734
+ }
1735
+ if (kaggleKey) {
1736
+ const r = secureKeys.set("kaggle_key", kaggleKey);
1737
+ if (r.ok) {
1738
+ process.env.KAGGLE_KEY = kaggleKey;
1739
+ saved.push("Kaggle key");
1740
+ if (r.method)
1741
+ methods.push(r.method);
1583
1742
  }
1743
+ }
1744
+ if (dataworldToken) {
1745
+ const r = secureKeys.set("dataworld_token", dataworldToken);
1746
+ if (r.ok) {
1747
+ process.env.DW_AUTH_TOKEN = dataworldToken;
1748
+ saved.push("data.world token");
1749
+ if (r.method)
1750
+ methods.push(r.method);
1751
+ }
1752
+ }
1753
+ if (saved.length === 0) {
1584
1754
  return {
1585
- content: [{ type: "text", text: `ERROR: asset download failed: ${errMsg}${hint}` }],
1586
- isError: true,
1755
+ content: [{ type: "text", text: "No keys provided. Core Vesper tools continue to work without API keys." }]
1587
1756
  };
1588
1757
  }
1589
1758
  return {
1590
- content: [{ type: "text", text: JSON.stringify(result.result, null, 2) }],
1591
- };
1592
- }
1593
- catch (error) {
1594
- return {
1595
- content: [{ type: "text", text: `ERROR: asset downloader execution failed: ${error.message}` }],
1596
- isError: true,
1759
+ content: [{ type: "text", text: `Key saved securely. Updated: ${saved.join(", ")}.` }]
1597
1760
  };
1598
1761
  }
1599
- }
1600
- case "configure_kaggle": {
1601
- const username = String(request.params.arguments?.username || "").trim();
1602
- const key = String(request.params.arguments?.key || "").trim();
1603
- if (!username || !key) {
1604
- throw new McpError(ErrorCode.InvalidParams, "username and key are required");
1605
- }
1606
- const r1 = secureKeys.set("kaggle_username", username);
1607
- const r2 = secureKeys.set("kaggle_key", key);
1608
- process.env.KAGGLE_USERNAME = username;
1609
- process.env.KAGGLE_KEY = key;
1610
- return {
1611
- content: [{ type: "text", text: `Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
1612
- };
1613
- }
1614
- case "configure_keys": {
1615
- const hfToken = String(request.params.arguments?.hf_token || "").trim();
1616
- const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
1617
- const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
1618
- const dataworldToken = String(request.params.arguments?.dataworld_token || "").trim();
1619
- const saved = [];
1620
- const methods = [];
1621
- if (hfToken) {
1622
- const r = secureKeys.set("hf_token", hfToken);
1623
- if (r.ok) {
1624
- process.env.HF_TOKEN = hfToken;
1625
- saved.push("HF token");
1626
- if (r.method)
1627
- methods.push(r.method);
1762
+ case "get_dataset_info": {
1763
+ const datasetId = String(request.params.arguments?.dataset_id);
1764
+ if (!datasetId) {
1765
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1628
1766
  }
1629
- }
1630
- if (kaggleUsername) {
1631
- const r = secureKeys.set("kaggle_username", kaggleUsername);
1632
- if (r.ok) {
1633
- process.env.KAGGLE_USERNAME = kaggleUsername;
1634
- saved.push("Kaggle username");
1635
- if (r.method)
1636
- methods.push(r.method);
1767
+ const dataset = metadataStore.getDataset(datasetId);
1768
+ if (!dataset) {
1769
+ return {
1770
+ content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}` }],
1771
+ isError: true,
1772
+ };
1637
1773
  }
1638
- }
1639
- if (kaggleKey) {
1640
- const r = secureKeys.set("kaggle_key", kaggleKey);
1641
- if (r.ok) {
1642
- process.env.KAGGLE_KEY = kaggleKey;
1643
- saved.push("Kaggle key");
1644
- if (r.method)
1645
- methods.push(r.method);
1774
+ // Enrich: if total_examples is 0-ish, try the HF datasets-server /size API
1775
+ if ((!dataset.total_examples || dataset.total_examples === 0) && dataset.source === "huggingface") {
1776
+ try {
1777
+ const sizeResp = await fetch(`https://datasets-server.huggingface.co/size?dataset=${encodeURIComponent(dataset.id)}`);
1778
+ if (sizeResp.ok) {
1779
+ const sizeData = await sizeResp.json();
1780
+ const numRows = sizeData?.size?.dataset?.num_rows;
1781
+ if (numRows && numRows > 0) {
1782
+ dataset.total_examples = numRows;
1783
+ // Also backfill splits
1784
+ if (sizeData?.size?.splits && Array.isArray(sizeData.size.splits)) {
1785
+ dataset.splits = sizeData.size.splits.map((s) => ({
1786
+ name: s.split,
1787
+ num_examples: s.num_rows || 0,
1788
+ size_bytes: s.num_bytes_parquet_files || 0,
1789
+ }));
1790
+ dataset.has_train_split = dataset.splits.some((s) => s.name === "train");
1791
+ dataset.has_test_split = dataset.splits.some((s) => s.name === "test");
1792
+ dataset.has_validation_split = dataset.splits.some((s) => s.name === "validation" || s.name === "val");
1793
+ }
1794
+ // Persist enriched metadata
1795
+ metadataStore.saveDataset(dataset);
1796
+ }
1797
+ }
1798
+ }
1799
+ catch {
1800
+ // Enrichment is best-effort; continue with whatever we have
1801
+ }
1646
1802
  }
1647
- }
1648
- if (dataworldToken) {
1649
- const r = secureKeys.set("dataworld_token", dataworldToken);
1650
- if (r.ok) {
1651
- process.env.DW_AUTH_TOKEN = dataworldToken;
1652
- saved.push("data.world token");
1653
- if (r.method)
1654
- methods.push(r.method);
1803
+ const formattedOutput = formatDatasetInfo(dataset);
1804
+ return { content: [{ type: "text", text: formattedOutput }] };
1805
+ }
1806
+ case "analyze_quality": {
1807
+ const datasetId = String(request.params.arguments?.dataset_id);
1808
+ const safeId = toSafeDatasetPathFragment(datasetId);
1809
+ const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
1810
+ const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
1811
+ let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
1812
+ // Demo Fallback for easy testing
1813
+ if (datasetId === "demo" || !fs.existsSync(filePath)) {
1814
+ const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
1815
+ const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
1816
+ if (fs.existsSync(demoParquetPath)) {
1817
+ filePath = demoParquetPath;
1818
+ }
1819
+ else if (fs.existsSync(demoCsvPath)) {
1820
+ filePath = demoCsvPath;
1821
+ }
1822
+ else if (datasetId !== "demo") {
1823
+ return {
1824
+ content: [{ type: "text", text: `ERROR: Local data file for ${datasetId} not found. Try running prepare_dataset first, or use 'demo' as the dataset_id.` }],
1825
+ isError: true
1826
+ };
1827
+ }
1655
1828
  }
1656
- }
1657
- if (saved.length === 0) {
1829
+ const report = await qualityAnalyzer.analyze(filePath);
1658
1830
  return {
1659
- content: [{ type: "text", text: "No keys provided. Core Vesper tools continue to work without API keys." }]
1831
+ content: [{ type: "text", text: JSON.stringify(report, null, 2) }]
1660
1832
  };
1661
1833
  }
1662
- return {
1663
- content: [{ type: "text", text: `Key saved securely. Updated: ${saved.join(", ")}.` }]
1664
- };
1665
- }
1666
- case "get_dataset_info": {
1667
- const datasetId = String(request.params.arguments?.dataset_id);
1668
- if (!datasetId) {
1669
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1670
- }
1671
- const dataset = metadataStore.getDataset(datasetId);
1672
- if (!dataset) {
1834
+ case "preview_cleaning": {
1835
+ const datasetId = String(request.params.arguments?.dataset_id);
1836
+ const safeId = toSafeDatasetPathFragment(datasetId);
1837
+ const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
1838
+ const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
1839
+ let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
1840
+ if (datasetId === "demo" || !fs.existsSync(filePath)) {
1841
+ const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
1842
+ const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
1843
+ if (fs.existsSync(demoParquetPath)) {
1844
+ filePath = demoParquetPath;
1845
+ }
1846
+ else if (fs.existsSync(demoCsvPath)) {
1847
+ filePath = demoCsvPath;
1848
+ }
1849
+ else {
1850
+ throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}. Please run prepare_dataset first.`);
1851
+ }
1852
+ }
1853
+ const report = await qualityAnalyzer.analyze(filePath);
1854
+ // Phase 1: Target Detection
1855
+ // We use the same TargetDetector instance inside CleaningPlanner now?
1856
+ // Actually, we instantiated it inside CleaningPlanner, so we just need to pass the file path
1857
+ // OR let the planner handle it if we update its signature to accept filePath.
1858
+ // Let's check `CleaningPlanner.generatePlan` signature again.
1859
+ // We updated it to accept `targetInfo`.
1860
+ // So we need to run detection HERE and pass it.
1861
+ // But `TargetDetector` is not exposed in `index.ts` scope yet.
1862
+ // Let's create a global instance or use the one inside planner if exposed (it's private).
1863
+ // Better approach: Instantiate TargetDetector here in index.ts for the tool content.
1864
+ // Quick fix: Instantiate local detector or make global.
1865
+ // I'll make a global `targetDetector` constant in index.ts
1866
+ // But wait, I updated `CleaningPlanner` to instantiate its own detector.
1867
+ // Does `CleaningPlanner` use it? No, I commented out the logic because it needed `filePath`.
1868
+ // RETRY STRATEGY:
1869
+ // 1. Instantiate `targetDetector` in `index.ts`.
1870
+ // 2. Run `detectTarget(filePath)`.
1871
+ // 3. Pass result to `cleaningPlanner.generatePlan(..., targetInfo)`.
1872
+ // I need to add `const targetDetector = new TargetDetector(__dirname);` to imports/init section first.
1873
+ // But since I'm in this tool, I can't look back.
1874
+ // I will assume I can add it, or just do it inside the case for now.
1875
+ // To do it properly, I should have added `targetDetector` to the global scope in previous step.
1876
+ // Let's do that in a separate step if needed.
1877
+ // For now, I'll instantiate it here.
1878
+ const { TargetDetector } = await import("./preparation/target-detector.js");
1879
+ const detector = new TargetDetector(__dirname);
1880
+ const targetResult = await detector.detectTarget(filePath);
1881
+ const targetInfo = targetResult.target_column ? {
1882
+ target: targetResult.target_column,
1883
+ confidence: targetResult.confidence
1884
+ } : undefined;
1885
+ const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
1886
+ let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
1887
+ if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
1888
+ explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
1889
+ explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
1890
+ }
1891
+ explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
1892
+ if (plan.operations.length === 0) {
1893
+ explanation += "No cleaning operations required.";
1894
+ }
1895
+ else {
1896
+ plan.operations.forEach((op, i) => {
1897
+ explanation += `${i + 1}. **${op.type}**: ${op.reason}\n`;
1898
+ });
1899
+ }
1673
1900
  return {
1674
- content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}` }],
1675
- isError: true,
1901
+ content: [{ type: "text", text: explanation }]
1676
1902
  };
1677
1903
  }
1678
- const formattedOutput = formatDatasetInfo(dataset);
1679
- return { content: [{ type: "text", text: formattedOutput }] };
1680
- }
1681
- case "analyze_quality": {
1682
- const datasetId = String(request.params.arguments?.dataset_id);
1683
- const safeId = toSafeDatasetPathFragment(datasetId);
1684
- const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
1685
- const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
1686
- let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
1687
- // Demo Fallback for easy testing
1688
- if (datasetId === "demo" || !fs.existsSync(filePath)) {
1689
- const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
1690
- const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
1691
- if (fs.existsSync(demoParquetPath)) {
1692
- filePath = demoParquetPath;
1693
- }
1694
- else if (fs.existsSync(demoCsvPath)) {
1695
- filePath = demoCsvPath;
1696
- }
1697
- else if (datasetId !== "demo") {
1904
+ case "custom_clean": {
1905
+ const datasetId = String(request.params.arguments?.dataset_id);
1906
+ const ops = request.params.arguments?.operations;
1907
+ if (!datasetId || datasetId === "undefined") {
1908
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1909
+ }
1910
+ if (!ops || !Array.isArray(ops) || ops.length === 0) {
1911
+ throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
1912
+ }
1913
+ // Pre-check: verify dataset file exists before starting the job
1914
+ const cleanRegEntry = getRegistryEntry(datasetId);
1915
+ const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
1916
+ const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
1917
+ const cleanSafeId = toSafeDatasetPathFragment(datasetId);
1918
+ const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
1919
+ (cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
1920
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
1921
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
1922
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
1923
+ fs.existsSync(datasetId);
1924
+ if (!cleanDataExists) {
1698
1925
  return {
1699
- content: [{ type: "text", text: `ERROR: Local data file for ${datasetId} not found. Try running prepare_dataset first, or use 'demo' as the dataset_id.` }],
1700
- isError: true
1926
+ content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
1927
+ isError: true,
1701
1928
  };
1702
1929
  }
1930
+ const job = jobManager.createJob("clean", 0, { datasetId, ops });
1931
+ return {
1932
+ content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
1933
+ };
1703
1934
  }
1704
- const report = await qualityAnalyzer.analyze(filePath);
1705
- return {
1706
- content: [{ type: "text", text: JSON.stringify(report, null, 2) }]
1707
- };
1708
- }
1709
- case "preview_cleaning": {
1710
- const datasetId = String(request.params.arguments?.dataset_id);
1711
- const safeId = toSafeDatasetPathFragment(datasetId);
1712
- const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
1713
- const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
1714
- let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
1715
- if (datasetId === "demo" || !fs.existsSync(filePath)) {
1716
- const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
1717
- const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
1718
- if (fs.existsSync(demoParquetPath)) {
1719
- filePath = demoParquetPath;
1720
- }
1721
- else if (fs.existsSync(demoCsvPath)) {
1722
- filePath = demoCsvPath;
1935
+ case "prepare_dataset": {
1936
+ hydrateExternalKeys();
1937
+ const query = String(request.params.arguments?.query);
1938
+ const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
1939
+ const downloadImages = request.params.arguments?.download_images === true;
1940
+ if (!query || query === "undefined") {
1941
+ throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
1723
1942
  }
1724
- else {
1725
- throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}. Please run prepare_dataset first.`);
1726
- }
1727
- }
1728
- const report = await qualityAnalyzer.analyze(filePath);
1729
- // Phase 1: Target Detection
1730
- // We use the same TargetDetector instance inside CleaningPlanner now?
1731
- // Actually, we instantiated it inside CleaningPlanner, so we just need to pass the file path
1732
- // OR let the planner handle it if we update its signature to accept filePath.
1733
- // Let's check `CleaningPlanner.generatePlan` signature again.
1734
- // We updated it to accept `targetInfo`.
1735
- // So we need to run detection HERE and pass it.
1736
- // But `TargetDetector` is not exposed in `index.ts` scope yet.
1737
- // Let's create a global instance or use the one inside planner if exposed (it's private).
1738
- // Better approach: Instantiate TargetDetector here in index.ts for the tool content.
1739
- // Quick fix: Instantiate local detector or make global.
1740
- // I'll make a global `targetDetector` constant in index.ts
1741
- // But wait, I updated `CleaningPlanner` to instantiate its own detector.
1742
- // Does `CleaningPlanner` use it? No, I commented out the logic because it needed `filePath`.
1743
- // RETRY STRATEGY:
1744
- // 1. Instantiate `targetDetector` in `index.ts`.
1745
- // 2. Run `detectTarget(filePath)`.
1746
- // 3. Pass result to `cleaningPlanner.generatePlan(..., targetInfo)`.
1747
- // I need to add `const targetDetector = new TargetDetector(__dirname);` to imports/init section first.
1748
- // But since I'm in this tool, I can't look back.
1749
- // I will assume I can add it, or just do it inside the case for now.
1750
- // To do it properly, I should have added `targetDetector` to the global scope in previous step.
1751
- // Let's do that in a separate step if needed.
1752
- // For now, I'll instantiate it here.
1753
- const { TargetDetector } = await import("./preparation/target-detector.js");
1754
- const detector = new TargetDetector(__dirname);
1755
- const targetResult = await detector.detectTarget(filePath);
1756
- const targetInfo = targetResult.target_column ? {
1757
- target: targetResult.target_column,
1758
- confidence: targetResult.confidence
1759
- } : undefined;
1760
- const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
1761
- let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
1762
- if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
1763
- explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
1764
- explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
1765
- }
1766
- explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
1767
- if (plan.operations.length === 0) {
1768
- explanation += "No cleaning operations required.";
1769
- }
1770
- else {
1771
- plan.operations.forEach((op, i) => {
1772
- explanation += `${i + 1}. **${op.type}**: ${op.reason}\n`;
1773
- });
1774
- }
1775
- return {
1776
- content: [{ type: "text", text: explanation }]
1777
- };
1778
- }
1779
- case "custom_clean": {
1780
- const datasetId = String(request.params.arguments?.dataset_id);
1781
- const ops = request.params.arguments?.operations;
1782
- if (!datasetId || datasetId === "undefined") {
1783
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1784
- }
1785
- if (!ops || !Array.isArray(ops) || ops.length === 0) {
1786
- throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
1787
- }
1788
- // Pre-check: verify dataset file exists before starting the job
1789
- const cleanRegEntry = getRegistryEntry(datasetId);
1790
- const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
1791
- const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
1792
- const cleanSafeId = toSafeDatasetPathFragment(datasetId);
1793
- const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
1794
- (cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
1795
- fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
1796
- fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
1797
- fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
1798
- fs.existsSync(datasetId);
1799
- if (!cleanDataExists) {
1943
+ const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
1800
1944
  return {
1801
- content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
1802
- isError: true,
1945
+ content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
1803
1946
  };
1804
1947
  }
1805
- const job = jobManager.createJob("clean", 0, { datasetId, ops });
1806
- return {
1807
- content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
1808
- };
1809
- }
1810
- case "prepare_dataset": {
1811
- hydrateExternalKeys();
1812
- const query = String(request.params.arguments?.query);
1813
- const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
1814
- const downloadImages = request.params.arguments?.download_images === true;
1815
- if (!query || query === "undefined") {
1816
- throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
1817
- }
1818
- const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
1819
- return {
1820
- content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
1821
- };
1822
- }
1823
- case "compare_datasets": {
1824
- const datasetIds = request.params.arguments?.dataset_ids;
1825
- const datasets = datasetIds.map(id => metadataStore.getDataset(id)).filter(d => !!d);
1826
- let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
1827
- comparison += "| :--- | " + datasets.map(() => " :---: ").join("|") + " |\n";
1828
- comparison += "| Quality Score | " + datasets.map(d => `${d.quality_score}/100`).join(" | ") + " |\n";
1829
- comparison += "| License | " + datasets.map(d => d.license.category).join(" | ") + " |\n";
1830
- comparison += "| Downloads | " + datasets.map(d => d.downloads).join(" | ") + " |\n";
1831
- comparison += "| Domain | " + datasets.map(d => d.domain).join(" | ") + " |\n";
1832
- return {
1833
- content: [{ type: "text", text: comparison }]
1834
- };
1835
- }
1836
- case "check_job_status": {
1837
- const jobId = String(request.params.arguments?.job_id);
1838
- const job = metadataStore.getJob(jobId);
1839
- if (!job) {
1840
- throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
1948
+ case "compare_datasets": {
1949
+ const datasetIds = request.params.arguments?.dataset_ids;
1950
+ const datasets = datasetIds.map(id => metadataStore.getDataset(id)).filter(d => !!d);
1951
+ let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
1952
+ comparison += "| :--- | " + datasets.map(() => " :---: ").join("|") + " |\n";
1953
+ comparison += "| Quality Score | " + datasets.map(d => `${d.quality_score}/100`).join(" | ") + " |\n";
1954
+ comparison += "| License | " + datasets.map(d => d.license.category).join(" | ") + " |\n";
1955
+ comparison += "| Downloads | " + datasets.map(d => d.downloads).join(" | ") + " |\n";
1956
+ comparison += "| Domain | " + datasets.map(d => d.domain).join(" | ") + " |\n";
1957
+ return {
1958
+ content: [{ type: "text", text: comparison }]
1959
+ };
1841
1960
  }
1842
- const activeStatuses = new Set(["pending", "queued", "running", "retrying"]);
1843
- const now = Date.now();
1844
- const last = jobStatusLastPoll[jobId] || 0;
1845
- const minPollMs = 3000;
1846
- if (activeStatuses.has(job.status) && (now - last) < minPollMs) {
1847
- const waitMs = minPollMs - (now - last);
1961
+ case "check_job_status": {
1962
+ const jobId = String(request.params.arguments?.job_id);
1963
+ const job = metadataStore.getJob(jobId);
1964
+ if (!job) {
1965
+ throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
1966
+ }
1967
+ const activeStatuses = new Set(["pending", "queued", "running", "retrying"]);
1968
+ const now = Date.now();
1969
+ const last = jobStatusLastPoll[jobId] || 0;
1970
+ const minPollMs = 3000;
1971
+ if (activeStatuses.has(job.status) && (now - last) < minPollMs) {
1972
+ const waitMs = minPollMs - (now - last);
1973
+ return {
1974
+ content: [{ type: "text", text: `No significant status change yet. Please poll again in ~${Math.ceil(waitMs / 1000)}s.` }]
1975
+ };
1976
+ }
1977
+ jobStatusLastPoll[jobId] = now;
1848
1978
  return {
1849
- content: [{ type: "text", text: `No significant status change yet. Please poll again in ~${Math.ceil(waitMs / 1000)}s.` }]
1979
+ content: [{ type: "text", text: formatJobStatus(job) }]
1850
1980
  };
1851
1981
  }
1852
- jobStatusLastPoll[jobId] = now;
1853
- return {
1854
- content: [{ type: "text", text: formatJobStatus(job) }]
1855
- };
1856
- }
1857
- case "export_dataset": {
1858
- const datasetId = String(request.params.arguments?.dataset_id);
1859
- const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments?.target_dir) : undefined;
1860
- const requestedFormat = String(request.params.arguments?.format || "feather");
1861
- const fastMode = request.params.arguments?.fast === true;
1862
- const preview = request.params.arguments?.preview === true;
1863
- const sampleRows = request.params.arguments?.sample_rows ? Number(request.params.arguments.sample_rows) : undefined;
1864
- const columns = request.params.arguments?.columns;
1865
- const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
1866
- // Use Metadata or Registry to find the actual local file
1867
- let sourcePath = resolveDatasetLocalPath(datasetId);
1868
- if (!sourcePath) {
1869
- console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
1870
- // Start a prepare job for this dataset id (acts like calling prepare_dataset)
1982
+ case "export_dataset": {
1983
+ const datasetId = String(request.params.arguments?.dataset_id);
1984
+ const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments?.target_dir) : undefined;
1985
+ const requestedFormat = String(request.params.arguments?.format || "feather");
1986
+ const fastMode = request.params.arguments?.fast === true;
1987
+ const preview = request.params.arguments?.preview === true;
1988
+ const sampleRows = request.params.arguments?.sample_rows ? Number(request.params.arguments.sample_rows) : undefined;
1989
+ const columns = request.params.arguments?.columns;
1990
+ const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
1991
+ // Use Metadata or Registry to find the actual local file
1992
+ let sourcePath = resolveDatasetLocalPath(datasetId);
1993
+ if (!sourcePath) {
1994
+ console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
1995
+ // Start a prepare job for this dataset id (acts like calling prepare_dataset)
1996
+ try {
1997
+ jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false });
1998
+ }
1999
+ catch (e) {
2000
+ console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
2001
+ }
2002
+ // Poll for download status or registry entry until local_path appears or timeout
2003
+ const wait = (ms) => new Promise(res => setTimeout(res, ms));
2004
+ const maxWait = 120_000; // 120s
2005
+ const interval = 2000;
2006
+ let waited = 0;
2007
+ while (waited < maxWait) {
2008
+ const resolved = resolveDatasetLocalPath(datasetId);
2009
+ if (resolved) {
2010
+ sourcePath = resolved;
2011
+ console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
2012
+ break;
2013
+ }
2014
+ await wait(interval);
2015
+ waited += interval;
2016
+ }
2017
+ // If still no sourcePath, return helpful error listing prepared datasets
2018
+ if (!sourcePath) {
2019
+ const entries = readRegistry();
2020
+ const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id || e.id || "unknown"}: ${e.local_path || e.path || "unknown"}`).slice(0, 10).join("\n") + (entries.length > 10 ? "\n...and " + (entries.length - 10) + " more" : "");
2021
+ return {
2022
+ content: [{ type: "text", text: `ERROR: No local data found for ${datasetId} after attempting prepare. Check credentials and try running prepare_dataset manually. Prepared datasets:\n${listText}` }],
2023
+ isError: true
2024
+ };
2025
+ }
2026
+ }
2027
+ sourcePath = ensureExportableLocalPath(sourcePath);
1871
2028
  try {
1872
- jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false });
2029
+ upsertRegistry(datasetId, sourcePath, "completed");
1873
2030
  }
1874
2031
  catch (e) {
1875
- console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
1876
- }
1877
- // Poll for download status or registry entry until local_path appears or timeout
1878
- const wait = (ms) => new Promise(res => setTimeout(res, ms));
1879
- const maxWait = 120_000; // 120s
1880
- const interval = 2000;
1881
- let waited = 0;
1882
- while (waited < maxWait) {
1883
- const resolved = resolveDatasetLocalPath(datasetId);
1884
- if (resolved) {
1885
- sourcePath = resolved;
1886
- console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
1887
- break;
2032
+ console.error(`[Registry] Failed to normalize registry path for ${datasetId}: ${e?.message || e}`);
2033
+ }
2034
+ // If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
2035
+ if (!fastMode) {
2036
+ const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
2037
+ const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
2038
+ const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
2039
+ if (!pipelineCompatibleInput) {
2040
+ console.error(`[Export] Skipping pipeline for ${currentExt} input; using direct exporter conversion.`);
2041
+ }
2042
+ else if (currentExt !== pipelineFmt) {
2043
+ console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
2044
+ try {
2045
+ const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
2046
+ if (pipelineResult.final_output_path) {
2047
+ sourcePath = pipelineResult.final_output_path;
2048
+ try {
2049
+ // Update registry to point to pipeline's final output
2050
+ upsertRegistry(datasetId, sourcePath, "completed");
2051
+ }
2052
+ catch (e) {
2053
+ console.error(`[Registry] Failed to update registry for ${datasetId}: ${e?.message || e}`);
2054
+ }
2055
+ }
2056
+ }
2057
+ catch (err) {
2058
+ console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
2059
+ }
1888
2060
  }
1889
- await wait(interval);
1890
- waited += interval;
1891
2061
  }
1892
- // If still no sourcePath, return helpful error listing prepared datasets
1893
- if (!sourcePath) {
1894
- const entries = readRegistry();
1895
- const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id || e.id || "unknown"}: ${e.local_path || e.path || "unknown"}`).slice(0, 10).join("\n") + (entries.length > 10 ? "\n...and " + (entries.length - 10) + " more" : "");
2062
+ else {
2063
+ console.error(`[Export] Fast mode - skipping quality analysis and cleaning`);
2064
+ }
2065
+ // Build export options
2066
+ const exportOpts = {};
2067
+ if (compression)
2068
+ exportOpts.compression = compression;
2069
+ if (preview)
2070
+ exportOpts.preview = true;
2071
+ if (sampleRows)
2072
+ exportOpts.sample_rows = sampleRows;
2073
+ if (columns)
2074
+ exportOpts.columns = columns;
2075
+ try {
2076
+ // Determine output file name
2077
+ const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
2078
+ const ext = extMap[requestedFormat] || ".feather";
2079
+ const safeName = toSafeDatasetPathFragment(datasetId);
2080
+ const outDir = targetDir || path.join(dataRoot, "exports");
2081
+ if (!fs.existsSync(outDir))
2082
+ fs.mkdirSync(outDir, { recursive: true });
2083
+ const outputFile = path.join(outDir, `${safeName}${ext}`);
2084
+ const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
2085
+ // Build rich response
2086
+ let msg = `**Export complete**\n`;
2087
+ msg += `- **File**: ${result.output_path}\n`;
2088
+ msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
2089
+ msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
2090
+ if (result.file_size_mb !== undefined)
2091
+ msg += `- **Size**: ${result.file_size_mb} MB\n`;
2092
+ if (result.elapsed_seconds !== undefined)
2093
+ msg += `- **Time**: ${result.elapsed_seconds}s\n`;
2094
+ if (result.preview_path)
2095
+ msg += `- **Preview**: ${result.preview_path}\n`;
2096
+ msg += `\n`;
2097
+ if (requestedFormat === "feather") {
2098
+ msg += `**Inspect with:**\n`;
2099
+ msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
2100
+ msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
2101
+ }
2102
+ else if (requestedFormat === "parquet") {
2103
+ msg += `**Inspect with:**\n`;
2104
+ msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
2105
+ msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
2106
+ }
2107
+ return { content: [{ type: "text", text: msg }] };
2108
+ }
2109
+ catch (error) {
1896
2110
  return {
1897
- content: [{ type: "text", text: `ERROR: No local data found for ${datasetId} after attempting prepare. Check credentials and try running prepare_dataset manually. Prepared datasets:\n${listText}` }],
2111
+ content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
1898
2112
  isError: true
1899
2113
  };
1900
2114
  }
1901
2115
  }
1902
- sourcePath = ensureExportableLocalPath(sourcePath);
1903
- try {
1904
- upsertRegistry(datasetId, sourcePath, "completed");
1905
- }
1906
- catch (e) {
1907
- console.error(`[Registry] Failed to normalize registry path for ${datasetId}: ${e?.message || e}`);
1908
- }
1909
- // If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
1910
- if (!fastMode) {
1911
- const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
1912
- const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
1913
- const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
1914
- if (!pipelineCompatibleInput) {
1915
- console.error(`[Export] Skipping pipeline for ${currentExt} input; using direct exporter conversion.`);
1916
- }
1917
- else if (currentExt !== pipelineFmt) {
1918
- console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
1919
- try {
1920
- const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
1921
- if (pipelineResult.final_output_path) {
1922
- sourcePath = pipelineResult.final_output_path;
1923
- try {
1924
- // Update registry to point to pipeline's final output
1925
- upsertRegistry(datasetId, sourcePath, "completed");
1926
- }
1927
- catch (e) {
1928
- console.error(`[Registry] Failed to update registry for ${datasetId}: ${e?.message || e}`);
1929
- }
1930
- }
2116
+ case "fuse_datasets": {
2117
+ const rawSources = request.params.arguments?.sources;
2118
+ if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
2119
+ throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
2120
+ }
2121
+ const strategy = request.params.arguments?.strategy || "concat";
2122
+ const joinOn = request.params.arguments?.join_on;
2123
+ const how = request.params.arguments?.how || "inner";
2124
+ const dedup = request.params.arguments?.dedup !== false;
2125
+ const runQualityAfter = request.params.arguments?.run_quality_after !== false;
2126
+ const leakageCheck = request.params.arguments?.leakage_check !== false;
2127
+ const outputFormat = request.params.arguments?.output_format || "feather";
2128
+ const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
2129
+ const preview = request.params.arguments?.preview !== false;
2130
+ const resolvedPaths = [];
2131
+ const unresolved = [];
2132
+ for (const src of rawSources) {
2133
+ if (fs.existsSync(src)) {
2134
+ resolvedPaths.push(src);
2135
+ continue;
1931
2136
  }
1932
- catch (err) {
1933
- console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
2137
+ const status = metadataStore.getDownloadStatus(src);
2138
+ if (status?.local_path && fs.existsSync(status.local_path)) {
2139
+ resolvedPaths.push(status.local_path);
2140
+ continue;
1934
2141
  }
2142
+ unresolved.push(src);
1935
2143
  }
1936
- }
1937
- else {
1938
- console.error(`[Export] Fast mode - skipping quality analysis and cleaning`);
1939
- }
1940
- // Build export options
1941
- const exportOpts = {};
1942
- if (compression)
1943
- exportOpts.compression = compression;
1944
- if (preview)
1945
- exportOpts.preview = true;
1946
- if (sampleRows)
1947
- exportOpts.sample_rows = sampleRows;
1948
- if (columns)
1949
- exportOpts.columns = columns;
1950
- try {
1951
- // Determine output file name
1952
- const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
1953
- const ext = extMap[requestedFormat] || ".feather";
1954
- const safeName = toSafeDatasetPathFragment(datasetId);
1955
- const outDir = targetDir || path.join(dataRoot, "exports");
1956
- if (!fs.existsSync(outDir))
1957
- fs.mkdirSync(outDir, { recursive: true });
1958
- const outputFile = path.join(outDir, `${safeName}${ext}`);
1959
- const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
1960
- // Build rich response
1961
- let msg = `**Export complete**\n`;
1962
- msg += `- **File**: ${result.output_path}\n`;
1963
- msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
1964
- msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
1965
- if (result.file_size_mb !== undefined)
1966
- msg += `- **Size**: ${result.file_size_mb} MB\n`;
1967
- if (result.elapsed_seconds !== undefined)
1968
- msg += `- **Time**: ${result.elapsed_seconds}s\n`;
1969
- if (result.preview_path)
1970
- msg += `- **Preview**: ${result.preview_path}\n`;
1971
- msg += `\n`;
1972
- if (requestedFormat === "feather") {
1973
- msg += `**Inspect with:**\n`;
1974
- msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
1975
- msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
1976
- }
1977
- else if (requestedFormat === "parquet") {
1978
- msg += `**Inspect with:**\n`;
1979
- msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
1980
- msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
1981
- }
1982
- return { content: [{ type: "text", text: msg }] };
1983
- }
1984
- catch (error) {
1985
- return {
1986
- content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
1987
- isError: true
1988
- };
1989
- }
1990
- }
1991
- case "fuse_datasets": {
1992
- const rawSources = request.params.arguments?.sources;
1993
- if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
1994
- throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
1995
- }
1996
- const strategy = request.params.arguments?.strategy || "concat";
1997
- const joinOn = request.params.arguments?.join_on;
1998
- const how = request.params.arguments?.how || "inner";
1999
- const dedup = request.params.arguments?.dedup !== false;
2000
- const runQualityAfter = request.params.arguments?.run_quality_after !== false;
2001
- const leakageCheck = request.params.arguments?.leakage_check !== false;
2002
- const outputFormat = request.params.arguments?.output_format || "feather";
2003
- const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
2004
- const preview = request.params.arguments?.preview !== false;
2005
- const resolvedPaths = [];
2006
- const unresolved = [];
2007
- for (const src of rawSources) {
2008
- if (fs.existsSync(src)) {
2009
- resolvedPaths.push(src);
2010
- continue;
2144
+ if (unresolved.length > 0) {
2145
+ return {
2146
+ content: [{
2147
+ type: "text",
2148
+ text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
2149
+ }],
2150
+ isError: true
2151
+ };
2011
2152
  }
2012
- const status = metadataStore.getDownloadStatus(src);
2013
- if (status?.local_path && fs.existsSync(status.local_path)) {
2014
- resolvedPaths.push(status.local_path);
2015
- continue;
2153
+ try {
2154
+ const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
2155
+ const ext = extMap[outputFormat] || ".feather";
2156
+ const outDir = path.join(dataRoot, "fusion");
2157
+ if (!fs.existsSync(outDir))
2158
+ fs.mkdirSync(outDir, { recursive: true });
2159
+ const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
2160
+ const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
2161
+ strategy,
2162
+ join_on: joinOn,
2163
+ how,
2164
+ dedup,
2165
+ run_quality_after: runQualityAfter,
2166
+ leakage_check: leakageCheck,
2167
+ output_format: outputFormat,
2168
+ compression: compression,
2169
+ preview,
2170
+ });
2171
+ const nullDelta = result.stats.null_delta;
2172
+ const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
2173
+ // Register fused dataset under a generated id so users can export it easily
2174
+ const fusedId = `fused_${Date.now()}`;
2175
+ try {
2176
+ upsertRegistry(fusedId, result.output_path, "completed");
2177
+ }
2178
+ catch (e) {
2179
+ console.error(`[Registry] Failed to register fused dataset ${fusedId}: ${e?.message || e}`);
2180
+ }
2181
+ let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
2182
+ msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
2183
+ msg += `- Null change: ${nullText}\n`;
2184
+ msg += `- Output: ${result.output_path}\n`;
2185
+ if (result.preview_path)
2186
+ msg += `- Preview: ${result.preview_path}\n`;
2187
+ if (result.leakage_report) {
2188
+ msg += `- Leakage: ${result.leakage_report.leakage_detected ? "detected" : "none"}`;
2189
+ if (result.leakage_report.leakage_count) {
2190
+ msg += ` (${result.leakage_report.leakage_count})`;
2191
+ }
2192
+ msg += "\n";
2193
+ }
2194
+ msg += `\nNext: run split_dataset/export_dataset on fused output. Registered fused dataset id: ${fusedId}`;
2195
+ return { content: [{ type: "text", text: msg }] };
2196
+ }
2197
+ catch (error) {
2198
+ return {
2199
+ content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
2200
+ isError: true
2201
+ };
2016
2202
  }
2017
- unresolved.push(src);
2018
- }
2019
- if (unresolved.length > 0) {
2020
- return {
2021
- content: [{
2022
- type: "text",
2023
- text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
2024
- }],
2025
- isError: true
2026
- };
2027
2203
  }
2028
- try {
2029
- const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
2030
- const ext = extMap[outputFormat] || ".feather";
2031
- const outDir = path.join(dataRoot, "fusion");
2032
- if (!fs.existsSync(outDir))
2033
- fs.mkdirSync(outDir, { recursive: true });
2034
- const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
2035
- const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
2036
- strategy,
2037
- join_on: joinOn,
2038
- how,
2039
- dedup,
2040
- run_quality_after: runQualityAfter,
2041
- leakage_check: leakageCheck,
2042
- output_format: outputFormat,
2043
- compression: compression,
2044
- preview,
2045
- });
2046
- const nullDelta = result.stats.null_delta;
2047
- const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
2048
- // Register fused dataset under a generated id so users can export it easily
2049
- const fusedId = `fused_${Date.now()}`;
2050
- try {
2051
- upsertRegistry(fusedId, result.output_path, "completed");
2204
+ case "analyze_image_quality": {
2205
+ const inputPath = String(request.params.arguments?.path);
2206
+ if (!fs.existsSync(inputPath)) {
2207
+ throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
2052
2208
  }
2053
- catch (e) {
2054
- console.error(`[Registry] Failed to register fused dataset ${fusedId}: ${e?.message || e}`);
2055
- }
2056
- let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
2057
- msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
2058
- msg += `- Null change: ${nullText}\n`;
2059
- msg += `- Output: ${result.output_path}\n`;
2060
- if (result.preview_path)
2061
- msg += `- Preview: ${result.preview_path}\n`;
2062
- if (result.leakage_report) {
2063
- msg += `- Leakage: ${result.leakage_report.leakage_detected ? "detected" : "none"}`;
2064
- if (result.leakage_report.leakage_count) {
2065
- msg += ` (${result.leakage_report.leakage_count})`;
2209
+ try {
2210
+ const report = await imageAnalyzer.analyze(inputPath);
2211
+ let output = `## Image Quality Report\n\n`;
2212
+ output += `- **Total Images**: ${report.total_images}\n`;
2213
+ output += `- **Corrupted**: ${report.corrupted_count}\n`;
2214
+ output += `- **Average Resolution**: ${Math.round(report.average_width)}x${Math.round(report.average_height)}\n`;
2215
+ output += `- **Blurry Images**: ${report.blurry_count}\n\n`;
2216
+ if (report.individual_results.length > 0) {
2217
+ output += `### Sample Detail (Top 5)\n`;
2218
+ report.individual_results.slice(0, 5).forEach(img => {
2219
+ const statusLabel = img.status === "ok" ? (img.is_blurry ? "[warn]" : "[ok]") : "[error]";
2220
+ output += `${statusLabel} **${img.filename}**: ${img.width}x${img.height} | Mode: ${img.mode} | Blur: ${img.blur_score?.toFixed(1)}\n`;
2221
+ });
2066
2222
  }
2067
- msg += "\n";
2223
+ return {
2224
+ content: [{ type: "text", text: output }]
2225
+ };
2226
+ }
2227
+ catch (error) {
2228
+ return {
2229
+ content: [{ type: "text", text: `ERROR: Image analysis failed: ${error.message}` }],
2230
+ isError: true
2231
+ };
2068
2232
  }
2069
- msg += `\nNext: run split_dataset/export_dataset on fused output. Registered fused dataset id: ${fusedId}`;
2070
- return { content: [{ type: "text", text: msg }] };
2071
- }
2072
- catch (error) {
2073
- return {
2074
- content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
2075
- isError: true
2076
- };
2077
- }
2078
- }
2079
- case "analyze_image_quality": {
2080
- const inputPath = String(request.params.arguments?.path);
2081
- if (!fs.existsSync(inputPath)) {
2082
- throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
2083
2233
  }
2084
- try {
2085
- const report = await imageAnalyzer.analyze(inputPath);
2086
- let output = `## Image Quality Report\n\n`;
2087
- output += `- **Total Images**: ${report.total_images}\n`;
2088
- output += `- **Corrupted**: ${report.corrupted_count}\n`;
2089
- output += `- **Average Resolution**: ${Math.round(report.average_width)}x${Math.round(report.average_height)}\n`;
2090
- output += `- **Blurry Images**: ${report.blurry_count}\n\n`;
2091
- if (report.individual_results.length > 0) {
2092
- output += `### Sample Detail (Top 5)\n`;
2093
- report.individual_results.slice(0, 5).forEach(img => {
2094
- const statusLabel = img.status === "ok" ? (img.is_blurry ? "[warn]" : "[ok]") : "[error]";
2095
- output += `${statusLabel} **${img.filename}**: ${img.width}x${img.height} | Mode: ${img.mode} | Blur: ${img.blur_score?.toFixed(1)}\n`;
2234
+ case "analyze_media_quality": {
2235
+ const inputPath = String(request.params.arguments?.path);
2236
+ if (!fs.existsSync(inputPath)) {
2237
+ throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
2238
+ }
2239
+ try {
2240
+ const report = await mediaAnalyzer.analyze(inputPath);
2241
+ let output = `## Media Quality Report\n\n`;
2242
+ output += `- **Total Files**: ${report.total_files}\n`;
2243
+ output += `- **OK Files**: ${report.ok_files}\n`;
2244
+ output += `- **Failed Files**: ${report.failed_files}\n`;
2245
+ if ('avg_audio_duration' in report && report.avg_audio_duration) {
2246
+ output += `- **Average Audio Duration**: ${report.avg_audio_duration.toFixed(2)}s\n`;
2247
+ }
2248
+ if ('avg_video_duration' in report && report.avg_video_duration) {
2249
+ output += `- **Average Video Duration**: ${report.avg_video_duration.toFixed(2)}s\n`;
2250
+ output += `- **Average FPS**: ${report.avg_fps?.toFixed(1)}\n`;
2251
+ }
2252
+ output += `\n### Sample Detail (Top 5)\n`;
2253
+ report.details.slice(0, 5).forEach(item => {
2254
+ const statusLabel = item.status === "ok" ? "[ok]" : "[error]";
2255
+ if (item.type === "audio" && 'sample_rate' in item) {
2256
+ output += `${statusLabel} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
2257
+ }
2258
+ else if (item.type === "video" && 'width' in item) {
2259
+ output += `${statusLabel} **${item.filename}**: ${item.width}x${item.height} | ${item.fps?.toFixed(1)}fps | Risk: ${item.corruption_risk}\n`;
2260
+ }
2261
+ else {
2262
+ output += `${statusLabel} **${item.filename}**: ${item.error}\n`;
2263
+ }
2096
2264
  });
2265
+ return {
2266
+ content: [{ type: "text", text: output }]
2267
+ };
2268
+ }
2269
+ catch (error) {
2270
+ return {
2271
+ content: [{ type: "text", text: `ERROR: Media analysis failed: ${error.message}` }],
2272
+ isError: true
2273
+ };
2097
2274
  }
2098
- return {
2099
- content: [{ type: "text", text: output }]
2100
- };
2101
- }
2102
- catch (error) {
2103
- return {
2104
- content: [{ type: "text", text: `ERROR: Image analysis failed: ${error.message}` }],
2105
- isError: true
2106
- };
2107
- }
2108
- }
2109
- case "analyze_media_quality": {
2110
- const inputPath = String(request.params.arguments?.path);
2111
- if (!fs.existsSync(inputPath)) {
2112
- throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
2113
2275
  }
2114
- try {
2115
- const report = await mediaAnalyzer.analyze(inputPath);
2116
- let output = `## Media Quality Report\n\n`;
2117
- output += `- **Total Files**: ${report.total_files}\n`;
2118
- output += `- **OK Files**: ${report.ok_files}\n`;
2119
- output += `- **Failed Files**: ${report.failed_files}\n`;
2120
- if ('avg_audio_duration' in report && report.avg_audio_duration) {
2121
- output += `- **Average Audio Duration**: ${report.avg_audio_duration.toFixed(2)}s\n`;
2122
- }
2123
- if ('avg_video_duration' in report && report.avg_video_duration) {
2124
- output += `- **Average Video Duration**: ${report.avg_video_duration.toFixed(2)}s\n`;
2125
- output += `- **Average FPS**: ${report.avg_fps?.toFixed(1)}\n`;
2126
- }
2127
- output += `\n### Sample Detail (Top 5)\n`;
2128
- report.details.slice(0, 5).forEach(item => {
2129
- const statusLabel = item.status === "ok" ? "[ok]" : "[error]";
2130
- if (item.type === "audio" && 'sample_rate' in item) {
2131
- output += `${statusLabel} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
2276
+ case "generate_quality_report": {
2277
+ const datasetId = String(request.params.arguments?.dataset_id);
2278
+ const datasetPath = String(request.params.arguments?.dataset_path);
2279
+ if (!fs.existsSync(datasetPath)) {
2280
+ throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
2281
+ }
2282
+ try {
2283
+ // Optionally load text quality from metadata if available
2284
+ const metadata = await metadataStore.getDataset(datasetId);
2285
+ // TODO: Integrate text quality analysis when available
2286
+ const textQuality = null;
2287
+ const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
2288
+ // Save report to metadata
2289
+ if (metadata) {
2290
+ metadata.unified_quality_report = report;
2291
+ await metadataStore.saveDataset(metadata);
2132
2292
  }
2133
- else if (item.type === "video" && 'width' in item) {
2134
- output += `${statusLabel} **${item.filename}**: ${item.width}x${item.height} | ${item.fps?.toFixed(1)}fps | Risk: ${item.corruption_risk}\n`;
2293
+ let output = `# Unified Quality Report\n\n`;
2294
+ output += `**Dataset**: ${datasetId}\n`;
2295
+ output += `**Modalities**: ${report.modalities.join(", ")}\n`;
2296
+ output += `**Overall Quality Score**: ${report.overall_quality_score}/100\n\n`;
2297
+ if (report.text_quality) {
2298
+ output += `## Text Quality\n`;
2299
+ output += `- Rows: ${report.text_quality.row_count}\n`;
2300
+ output += `- Columns: ${report.text_quality.column_count}\n`;
2301
+ output += `- Missing: ${report.text_quality.missing_percentage.toFixed(1)}%\n`;
2302
+ output += `- Duplicates: ${report.text_quality.duplicate_percentage.toFixed(1)}%\n\n`;
2135
2303
  }
2136
- else {
2137
- output += `${statusLabel} **${item.filename}**: ${item.error}\n`;
2304
+ if (report.image_quality) {
2305
+ output += `## Image Quality\n`;
2306
+ output += `- Total Images: ${report.image_quality.total_images}\n`;
2307
+ output += `- Corrupted: ${report.image_quality.corrupted_count}\n`;
2308
+ output += `- Avg Resolution: ${report.image_quality.avg_resolution}\n`;
2309
+ output += `- Blurry: ${report.image_quality.blurry_percentage.toFixed(1)}%\n\n`;
2138
2310
  }
2139
- });
2140
- return {
2141
- content: [{ type: "text", text: output }]
2142
- };
2143
- }
2144
- catch (error) {
2145
- return {
2146
- content: [{ type: "text", text: `ERROR: Media analysis failed: ${error.message}` }],
2147
- isError: true
2148
- };
2149
- }
2150
- }
2151
- case "generate_quality_report": {
2152
- const datasetId = String(request.params.arguments?.dataset_id);
2153
- const datasetPath = String(request.params.arguments?.dataset_path);
2154
- if (!fs.existsSync(datasetPath)) {
2155
- throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
2156
- }
2157
- try {
2158
- // Optionally load text quality from metadata if available
2159
- const metadata = await metadataStore.getDataset(datasetId);
2160
- // TODO: Integrate text quality analysis when available
2161
- const textQuality = null;
2162
- const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
2163
- // Save report to metadata
2164
- if (metadata) {
2165
- metadata.unified_quality_report = report;
2166
- await metadataStore.saveDataset(metadata);
2167
- }
2168
- let output = `# Unified Quality Report\n\n`;
2169
- output += `**Dataset**: ${datasetId}\n`;
2170
- output += `**Modalities**: ${report.modalities.join(", ")}\n`;
2171
- output += `**Overall Quality Score**: ${report.overall_quality_score}/100\n\n`;
2172
- if (report.text_quality) {
2173
- output += `## Text Quality\n`;
2174
- output += `- Rows: ${report.text_quality.row_count}\n`;
2175
- output += `- Columns: ${report.text_quality.column_count}\n`;
2176
- output += `- Missing: ${report.text_quality.missing_percentage.toFixed(1)}%\n`;
2177
- output += `- Duplicates: ${report.text_quality.duplicate_percentage.toFixed(1)}%\n\n`;
2178
- }
2179
- if (report.image_quality) {
2180
- output += `## Image Quality\n`;
2181
- output += `- Total Images: ${report.image_quality.total_images}\n`;
2182
- output += `- Corrupted: ${report.image_quality.corrupted_count}\n`;
2183
- output += `- Avg Resolution: ${report.image_quality.avg_resolution}\n`;
2184
- output += `- Blurry: ${report.image_quality.blurry_percentage.toFixed(1)}%\n\n`;
2185
- }
2186
- if (report.audio_quality) {
2187
- output += `## Audio Quality\n`;
2188
- output += `- Total Files: ${report.audio_quality.total_files}\n`;
2189
- output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
2190
- output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
2191
- output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
2192
- }
2193
- if (report.video_quality) {
2194
- output += `## Video Quality\n`;
2195
- output += `- Total Files: ${report.video_quality.total_files}\n`;
2196
- output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
2197
- output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
2198
- output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
2199
- }
2200
- output += `## Recommendations\n`;
2201
- report.recommendations.forEach(rec => {
2202
- output += `- ${rec}\n`;
2203
- });
2204
- return {
2205
- content: [{ type: "text", text: output }]
2206
- };
2207
- }
2208
- catch (error) {
2209
- return {
2210
- content: [{ type: "text", text: `ERROR: Quality report generation failed: ${error.message}` }],
2211
- isError: true
2212
- };
2311
+ if (report.audio_quality) {
2312
+ output += `## Audio Quality\n`;
2313
+ output += `- Total Files: ${report.audio_quality.total_files}\n`;
2314
+ output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
2315
+ output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
2316
+ output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
2317
+ }
2318
+ if (report.video_quality) {
2319
+ output += `## Video Quality\n`;
2320
+ output += `- Total Files: ${report.video_quality.total_files}\n`;
2321
+ output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
2322
+ output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
2323
+ output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
2324
+ }
2325
+ output += `## Recommendations\n`;
2326
+ report.recommendations.forEach(rec => {
2327
+ output += `- ${rec}\n`;
2328
+ });
2329
+ return {
2330
+ content: [{ type: "text", text: output }]
2331
+ };
2332
+ }
2333
+ catch (error) {
2334
+ return {
2335
+ content: [{ type: "text", text: `ERROR: Quality report generation failed: ${error.message}` }],
2336
+ isError: true
2337
+ };
2338
+ }
2213
2339
  }
2340
+ default:
2341
+ throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
2214
2342
  }
2215
- default:
2216
- throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
2217
- }
2343
+ }); // end requestQueue.enqueue
2218
2344
  });
2219
2345
  async function main() {
2220
2346
  const args = process.argv.slice(2);