@vespermcp/mcp-server 1.2.21 → 1.2.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/README.md +49 -0
  2. package/build/cloud/adapters/supabase.js +49 -0
  3. package/build/cloud/storage-manager.js +6 -0
  4. package/build/export/exporter.js +22 -9
  5. package/build/gateway/unified-dataset-gateway.js +410 -0
  6. package/build/index.js +1587 -845
  7. package/build/ingestion/ingestor.js +7 -4
  8. package/build/install/install-service.js +11 -6
  9. package/build/lib/supabase.js +3 -0
  10. package/build/metadata/scraper.js +85 -14
  11. package/build/python/asset_downloader_engine.py +2 -0
  12. package/build/python/convert_engine.py +92 -0
  13. package/build/python/export_engine.py +45 -0
  14. package/build/python/kaggle_engine.py +77 -5
  15. package/build/python/normalize_engine.py +83 -0
  16. package/build/python/vesper/core/asset_downloader.py +5 -1
  17. package/build/search/engine.js +43 -5
  18. package/build/search/jit-orchestrator.js +18 -14
  19. package/build/search/query-intent.js +509 -0
  20. package/build/tools/formatter.js +6 -3
  21. package/build/utils/python-runtime.js +130 -0
  22. package/package.json +7 -5
  23. package/scripts/postinstall.cjs +87 -31
  24. package/scripts/wizard.cjs +601 -0
  25. package/scripts/wizard.js +306 -12
  26. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  27. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  28. package/src/python/asset_downloader_engine.py +2 -0
  29. package/src/python/convert_engine.py +92 -0
  30. package/src/python/export_engine.py +45 -0
  31. package/src/python/kaggle_engine.py +77 -5
  32. package/src/python/normalize_engine.py +83 -0
  33. package/src/python/requirements.txt +12 -0
  34. package/src/python/vesper/core/asset_downloader.py +5 -1
  35. package/wizard.cjs +3 -0
package/build/index.js CHANGED
@@ -1,12 +1,39 @@
1
1
  #!/usr/bin/env node
2
2
  // --- Dataset ID Normalization ---
3
3
  function normalize_dataset_id(dataset_id) {
4
- // Remove kaggle: prefix for storage key
5
- let id = dataset_id.replace(/^kaggle:/, "");
4
+ const trimmed = dataset_id.trim();
5
+ const sourceMatch = trimmed.match(/^(kaggle|huggingface|hf|openml|dataworld):/i);
6
+ let id = trimmed.replace(/^(kaggle|huggingface|hf|openml|dataworld):/i, "");
6
7
  // Replace / and : with _ for filesystem safety
7
- id = id.replace(/[/:]/g, "_");
8
- // Always store and lookup using the same normalized format
9
- return dataset_id.startsWith("kaggle:") ? `kaggle_${id}` : id;
8
+ id = id.replace(/[\\/:]/g, "_");
9
+ if (!sourceMatch) {
10
+ return id;
11
+ }
12
+ const source = sourceMatch[1].toLowerCase() === "hf" ? "huggingface" : sourceMatch[1].toLowerCase();
13
+ return `${source}_${id}`;
14
+ }
15
+ function getDatasetIdAliases(dataset_id) {
16
+ const trimmed = dataset_id.trim();
17
+ const aliases = new Set([trimmed]);
18
+ const sourceMatch = trimmed.match(/^(kaggle|huggingface|hf|openml|dataworld):/i);
19
+ if (sourceMatch) {
20
+ const stripped = trimmed.replace(/^(kaggle|huggingface|hf|openml|dataworld):/i, "");
21
+ aliases.add(stripped);
22
+ if (sourceMatch[1].toLowerCase() === "hf") {
23
+ aliases.add(`huggingface:${stripped}`);
24
+ }
25
+ }
26
+ else {
27
+ aliases.add(`kaggle:${trimmed}`);
28
+ aliases.add(`huggingface:${trimmed}`);
29
+ aliases.add(`hf:${trimmed}`);
30
+ aliases.add(`openml:${trimmed}`);
31
+ aliases.add(`dataworld:${trimmed}`);
32
+ }
33
+ return Array.from(aliases);
34
+ }
35
+ function toSafeDatasetPathFragment(dataset_id) {
36
+ return normalize_dataset_id(dataset_id);
10
37
  }
11
38
  // --- Dataset Registry Helpers ---
12
39
  function getRegistryPath() {
@@ -29,10 +56,11 @@ function writeRegistry(entries) {
29
56
  fs.writeFileSync(registryPath, JSON.stringify(entries, null, 2));
30
57
  }
31
58
  function upsertRegistry(dataset_id, local_path, status) {
32
- const norm_id = normalize_dataset_id(dataset_id);
59
+ const aliases = getDatasetIdAliases(dataset_id).map(normalize_dataset_id);
60
+ const norm_id = aliases[0];
33
61
  console.error(`[Registry] Writing key: ${norm_id}`);
34
62
  const entries = readRegistry();
35
- const idx = entries.findIndex(e => e.dataset_id === norm_id);
63
+ const idx = entries.findIndex(e => aliases.includes(e.dataset_id || e.id));
36
64
  if (idx >= 0) {
37
65
  entries[idx] = { dataset_id: norm_id, local_path, status };
38
66
  }
@@ -42,9 +70,163 @@ function upsertRegistry(dataset_id, local_path, status) {
42
70
  writeRegistry(entries);
43
71
  }
44
72
  function getRegistryEntry(dataset_id) {
45
- const norm_id = normalize_dataset_id(dataset_id);
46
- console.error(`[Registry] Lookup key: ${norm_id}`);
47
- return readRegistry().find(e => (e.dataset_id || e.id) === norm_id);
73
+ const aliases = getDatasetIdAliases(dataset_id).map(normalize_dataset_id);
74
+ console.error(`[Registry] Lookup keys: ${aliases.join(", ")}`);
75
+ return readRegistry().find(e => aliases.includes((e.dataset_id || e.id)));
76
+ }
77
+ const STRUCTURED_FILE_EXTENSIONS = [".parquet", ".csv", ".jsonl", ".json", ".feather", ".arrow", ".tsv", ".txt"];
78
+ const IMAGE_FILE_EXTENSIONS = new Set([".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff", ".tif", ".svg"]);
79
+ function walkFilesRecursive(rootDir) {
80
+ const out = [];
81
+ const stack = [rootDir];
82
+ while (stack.length > 0) {
83
+ const currentDir = stack.pop();
84
+ const entries = fs.readdirSync(currentDir, { withFileTypes: true });
85
+ for (const entry of entries) {
86
+ const fullPath = path.join(currentDir, entry.name);
87
+ if (entry.isDirectory()) {
88
+ stack.push(fullPath);
89
+ }
90
+ else if (entry.isFile()) {
91
+ out.push(fullPath);
92
+ }
93
+ }
94
+ }
95
+ out.sort();
96
+ return out;
97
+ }
98
+ function inferImageManifestRecord(rootDir, fullPath, index) {
99
+ const relativePath = path.relative(rootDir, fullPath).replace(/\\/g, "/");
100
+ const parentDir = path.posix.dirname(relativePath);
101
+ const parts = parentDir.split("/").filter(part => part && part !== ".");
102
+ let split;
103
+ let label;
104
+ if (parts.length > 0) {
105
+ const first = parts[0].toLowerCase();
106
+ if (["train", "test", "val", "valid", "validation"].includes(first)) {
107
+ split = parts[0];
108
+ if (parts.length > 1) {
109
+ label = parts[parts.length - 1];
110
+ }
111
+ }
112
+ else {
113
+ label = parts[parts.length - 1];
114
+ }
115
+ }
116
+ return {
117
+ id: index,
118
+ image_path: path.resolve(fullPath),
119
+ relative_path: relativePath,
120
+ file_name: path.basename(fullPath),
121
+ extension: path.extname(fullPath).toLowerCase().replace(/^\./, ""),
122
+ ...(split ? { split } : {}),
123
+ ...(label ? { label } : {}),
124
+ };
125
+ }
126
+ function createImageManifestFromDirectory(rootDir) {
127
+ const imageFiles = walkFilesRecursive(rootDir).filter(filePath => IMAGE_FILE_EXTENSIONS.has(path.extname(filePath).toLowerCase()));
128
+ if (imageFiles.length === 0) {
129
+ throw new Error(`No image files found under ${rootDir}`);
130
+ }
131
+ const manifestPath = path.join(rootDir, "_vesper_image_manifest.jsonl");
132
+ const lines = imageFiles.map((filePath, index) => JSON.stringify(inferImageManifestRecord(rootDir, filePath, index)));
133
+ fs.writeFileSync(manifestPath, `${lines.join("\n")}\n`, "utf-8");
134
+ return manifestPath;
135
+ }
136
+ function ensureExportableLocalPath(localPath) {
137
+ if (!fs.existsSync(localPath)) {
138
+ throw new Error(`Local path not found: ${localPath}`);
139
+ }
140
+ const stats = fs.statSync(localPath);
141
+ if (stats.isFile()) {
142
+ return localPath;
143
+ }
144
+ const manifestPath = path.join(localPath, "_vesper_image_manifest.jsonl");
145
+ if (fs.existsSync(manifestPath)) {
146
+ return manifestPath;
147
+ }
148
+ const candidates = walkFilesRecursive(localPath);
149
+ for (const ext of STRUCTURED_FILE_EXTENSIONS) {
150
+ const match = candidates.find(candidate => path.extname(candidate).toLowerCase() === ext);
151
+ if (match) {
152
+ return match;
153
+ }
154
+ }
155
+ return createImageManifestFromDirectory(localPath);
156
+ }
157
+ function isPathWithinDirectory(candidatePath, directoryPath) {
158
+ const relativePath = path.relative(path.resolve(directoryPath), path.resolve(candidatePath));
159
+ return relativePath === "" || (!relativePath.startsWith("..") && !path.isAbsolute(relativePath));
160
+ }
161
+ function buildDatasetCandidatePaths(baseDir, safeId) {
162
+ return [
163
+ path.join(baseDir, `${safeId}.parquet`),
164
+ path.join(baseDir, `${safeId}.csv`),
165
+ path.join(baseDir, `${safeId}.jsonl`),
166
+ path.join(baseDir, `${safeId}.json`),
167
+ path.join(baseDir, `${safeId}.feather`),
168
+ path.join(baseDir, `${safeId}.arrow`),
169
+ path.join(baseDir, safeId),
170
+ ];
171
+ }
172
+ function shouldTrackExportPath(localPath) {
173
+ return isPathWithinDirectory(localPath, dataRoot);
174
+ }
175
+ function isDirectLocalDatasetReference(datasetIdOrPath) {
176
+ return fs.existsSync(datasetIdOrPath);
177
+ }
178
+ function getExportFileStem(datasetIdOrPath) {
179
+ if (isDirectLocalDatasetReference(datasetIdOrPath)) {
180
+ const resolvedPath = path.resolve(datasetIdOrPath);
181
+ const stats = fs.statSync(resolvedPath);
182
+ const baseName = stats.isDirectory()
183
+ ? path.basename(resolvedPath)
184
+ : path.parse(resolvedPath).name;
185
+ return toSafeDatasetPathFragment(baseName);
186
+ }
187
+ return toSafeDatasetPathFragment(datasetIdOrPath);
188
+ }
189
+ function ensureLocalPipelineSource(sourcePath, datasetId, targetDir) {
190
+ const resolvedTargetDir = path.resolve(targetDir);
191
+ const resolvedSourcePath = path.resolve(sourcePath);
192
+ if (path.dirname(resolvedSourcePath) === resolvedTargetDir) {
193
+ return resolvedSourcePath;
194
+ }
195
+ if (!fs.existsSync(resolvedTargetDir)) {
196
+ fs.mkdirSync(resolvedTargetDir, { recursive: true });
197
+ }
198
+ const stagedPath = path.join(resolvedTargetDir, `${toSafeDatasetPathFragment(datasetId)}${path.extname(resolvedSourcePath)}`);
199
+ if (resolvedSourcePath !== stagedPath) {
200
+ fs.copyFileSync(resolvedSourcePath, stagedPath);
201
+ }
202
+ return stagedPath;
203
+ }
204
+ function resolveDatasetLocalPath(datasetIdOrPath, preferredDirs = []) {
205
+ if (fs.existsSync(datasetIdOrPath)) {
206
+ return ensureExportableLocalPath(datasetIdOrPath);
207
+ }
208
+ const safeId = toSafeDatasetPathFragment(datasetIdOrPath);
209
+ const uniquePreferredDirs = Array.from(new Set(preferredDirs
210
+ .filter((dir) => typeof dir === "string" && dir.trim().length > 0)
211
+ .map(dir => path.resolve(dir))));
212
+ for (const preferredDir of uniquePreferredDirs) {
213
+ const localMatch = buildDatasetCandidatePaths(preferredDir, safeId).find(candidate => fs.existsSync(candidate));
214
+ if (localMatch) {
215
+ return ensureExportableLocalPath(localMatch);
216
+ }
217
+ }
218
+ const downloadStatus = metadataStore.getDownloadStatus(datasetIdOrPath);
219
+ if (downloadStatus?.local_path && fs.existsSync(downloadStatus.local_path)) {
220
+ return ensureExportableLocalPath(downloadStatus.local_path);
221
+ }
222
+ const reg = getRegistryEntry(datasetIdOrPath);
223
+ const regPath = reg?.local_path || reg?.path;
224
+ if (regPath && fs.existsSync(regPath)) {
225
+ return ensureExportableLocalPath(regPath);
226
+ }
227
+ const rawCandidates = buildDatasetCandidatePaths(path.join(dataRoot, "data", "raw"), safeId);
228
+ const match = rawCandidates.find(candidate => fs.existsSync(candidate));
229
+ return match ? ensureExportableLocalPath(match) : undefined;
48
230
  }
49
231
  // --- Pipeline State Tracker ---
50
232
  // Tracks completed steps per session/job/dataset
@@ -88,6 +270,7 @@ import { HuggingFaceScraper } from "./metadata/scraper.js";
88
270
  import { KaggleSource } from "./metadata/kaggle-source.js";
89
271
  import { OpenMLSource } from "./metadata/openml-source.js";
90
272
  import { DataWorldSource } from "./metadata/dataworld-source.js";
273
+ import { UnifiedDatasetGateway } from "./gateway/unified-dataset-gateway.js";
91
274
  import { formatSearchResults, formatDatasetInfo, formatJobStatus } from "./tools/formatter.js";
92
275
  import { JobManager } from "./jobs/manager.js";
93
276
  import { QualityAnalyzer } from "./quality/analyzer.js";
@@ -131,6 +314,34 @@ function logError(err, context) {
131
314
  fs.appendFileSync(errorLogPath, msg);
132
315
  console.error(`[Vesper] Critical error logged to ${errorLogPath}`);
133
316
  }
317
+ // --- Request Queue: serialize all MCP tool calls to prevent crashes ---
318
+ class RequestQueue {
319
+ queue = [];
320
+ running = false;
321
+ enqueue(task) {
322
+ return new Promise((resolve, reject) => {
323
+ this.queue.push({ resolve, reject, task });
324
+ this.drain();
325
+ });
326
+ }
327
+ async drain() {
328
+ if (this.running)
329
+ return;
330
+ this.running = true;
331
+ while (this.queue.length > 0) {
332
+ const item = this.queue.shift();
333
+ try {
334
+ const result = await item.task();
335
+ item.resolve(result);
336
+ }
337
+ catch (err) {
338
+ item.reject(err);
339
+ }
340
+ }
341
+ this.running = false;
342
+ }
343
+ }
344
+ const requestQueue = new RequestQueue();
134
345
  const SPINNER_FRAMES = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];
135
346
  function printLaunchScreen() {
136
347
  const screen = `
@@ -198,6 +409,21 @@ function extractRequestedRows(query, requirements) {
198
409
  if (Number.isFinite(n) && n > 0)
199
410
  return n;
200
411
  }
412
+ const commaNumbers = [...text.matchAll(/\b\d{1,3}(?:,\d{3})+\b/g)]
413
+ .map(m => Number(m[0].replace(/,/g, "")))
414
+ .filter(n => Number.isFinite(n) && n > 0);
415
+ if (commaNumbers.length > 0)
416
+ return Math.max(...commaNumbers);
417
+ const humanSized = [...text.matchAll(/\b(\d+(?:\.\d+)?)\s*([kmb])\b/gi)]
418
+ .map(m => {
419
+ const base = Number(m[1]);
420
+ const suffix = m[2].toLowerCase();
421
+ const multiplier = suffix === "k" ? 1_000 : suffix === "m" ? 1_000_000 : 1_000_000_000;
422
+ return Math.round(base * multiplier);
423
+ })
424
+ .filter(n => Number.isFinite(n) && n > 0);
425
+ if (humanSized.length > 0)
426
+ return Math.max(...humanSized);
201
427
  const allNums = [...text.matchAll(/\b\d{4,9}\b/g)]
202
428
  .map(m => Number(m[0]))
203
429
  .filter(n => Number.isFinite(n) && n > 0);
@@ -367,7 +593,45 @@ function syncPythonScripts(appRoot, dataRoot) {
367
593
  }
368
594
  // Sync scripts immediately
369
595
  syncPythonScripts(appRoot, dataRoot);
370
- const metadataStore = new MetadataStore(dbPath);
596
+ // Auto-rebuild better-sqlite3 if native binary doesn't match current Node version
597
+ function tryRebuildSqlite() {
598
+ try {
599
+ const { execSync } = require("child_process");
600
+ const pkgRoot = path.resolve(__dirname, "..");
601
+ console.error("[Vesper] Rebuilding better-sqlite3 for Node " + process.version + "...");
602
+ execSync("npm rebuild better-sqlite3", {
603
+ stdio: "pipe",
604
+ timeout: 60000,
605
+ cwd: pkgRoot,
606
+ });
607
+ console.error("[Vesper] Rebuild succeeded. Retrying...");
608
+ // Clear require cache so the rebuilt module is loaded
609
+ for (const key of Object.keys(require.cache)) {
610
+ if (key.includes("better-sqlite3") || key.includes("better_sqlite3")) {
611
+ delete require.cache[key];
612
+ }
613
+ }
614
+ return true;
615
+ }
616
+ catch (e) {
617
+ console.error("[Vesper] Auto-rebuild failed: " + (e?.message || e));
618
+ return false;
619
+ }
620
+ }
621
+ let metadataStore;
622
+ try {
623
+ metadataStore = new MetadataStore(dbPath);
624
+ }
625
+ catch (e) {
626
+ if (e?.code === "ERR_DLOPEN_FAILED" && tryRebuildSqlite()) {
627
+ metadataStore = new MetadataStore(dbPath);
628
+ }
629
+ else {
630
+ console.error("[Vesper] FATAL: Cannot load better-sqlite3.");
631
+ console.error("[Vesper] Run: npm rebuild better-sqlite3");
632
+ throw e;
633
+ }
634
+ }
371
635
  const vectorStore = new VectorStore(vectorPath);
372
636
  const embedder = Embedder.getInstance();
373
637
  const searchEngine = new SearchEngine(metadataStore, vectorStore, embedder);
@@ -382,6 +646,8 @@ const dataSplitter = new DataSplitter(__dirname);
382
646
  const dataExporter = new DataExporter(__dirname);
383
647
  const fusionEngine = new DataFusionEngine(__dirname);
384
648
  const kaggleSource = new KaggleSource(__dirname);
649
+ const openmlSource = new OpenMLSource(__dirname);
650
+ const dataworldSource = new DataWorldSource(__dirname);
385
651
  const secureKeys = new SecureKeysManager(__dirname);
386
652
  function hydrateExternalKeys() {
387
653
  const keys = secureKeys.getAll();
@@ -401,6 +667,15 @@ function hydrateExternalKeys() {
401
667
  function hasDataWorldToken() {
402
668
  return !!(process.env.DW_AUTH_TOKEN || secureKeys.getAll().dataworld_token);
403
669
  }
670
+ const unifiedDatasetGateway = new UnifiedDatasetGateway({
671
+ metadataStore,
672
+ dataIngestor,
673
+ dataRoot,
674
+ kaggleSource,
675
+ openmlSource,
676
+ dataworldSource,
677
+ hasDataWorldToken,
678
+ });
404
679
  // CRITICAL FIX: Pass __dirname (build directory) to analyzers
405
680
  // Python scripts are in build/python/, so analyzers should look relative to build/
406
681
  // NOT relative to project root (appRoot)
@@ -432,7 +707,7 @@ jobManager.on("processJob", async (job, execute) => {
432
707
  console.error(`[Vesper] Starting prepareDatasetTask for job ${job.id}...`);
433
708
  const metadata = job.metadata ? JSON.parse(job.metadata) : {};
434
709
  switch (job.type) {
435
- case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements);
710
+ case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements, metadata.outputDir);
436
711
  case "clean": return await handleCleanJob(job.id, metadata.datasetId, metadata.ops);
437
712
  default: throw new Error(`Unhandled job type: ${job.type}`);
438
713
  }
@@ -450,9 +725,21 @@ jobManager.on("processJob", async (job, execute) => {
450
725
  /**
451
726
  * Logic for preparing a dataset (Search + Ingest + Process)
452
727
  */
453
- async function handlePrepareJob(jobId, query, requirements) {
728
+ async function handlePrepareJob(jobId, query, requirements, outputDir) {
454
729
  hydrateExternalKeys();
455
730
  const update = (updates) => jobManager.updateJob(jobId, updates);
731
+ const pipelineSteps = ["search", "validate", "download", "normalize", "quality", "register"];
732
+ const stepStatus = {};
733
+ for (const s of pipelineSteps)
734
+ stepStatus[s] = "pending";
735
+ const markPipelineStep = (step, status) => {
736
+ stepStatus[step] = status;
737
+ const summary = pipelineSteps.map(s => {
738
+ const st = stepStatus[s];
739
+ return st === "done" ? `[${s}]` : st === "running" ? `>${s}<` : st === "failed" ? `!${s}!` : st === "skipped" ? `~${s}~` : ` ${s} `;
740
+ }).join(" → ");
741
+ console.error(`[Pipeline] ${summary}`);
742
+ };
456
743
  // Ensure core Python packages are available for dataset operations
457
744
  try {
458
745
  await ensurePythonModules([
@@ -465,6 +752,7 @@ async function handlePrepareJob(jobId, query, requirements) {
465
752
  // Continue anyway - direct file downloads may still work without datasets lib
466
753
  }
467
754
  const requestedRows = extractRequestedRows(query, requirements);
755
+ const searchQuery = requirements ? `${query} ${requirements}` : query;
468
756
  let selectedDataset;
469
757
  let datasetIdForDownload = "";
470
758
  let source;
@@ -500,11 +788,14 @@ async function handlePrepareJob(jobId, query, requirements) {
500
788
  progress: 20,
501
789
  status_text: `Using explicit dataset id: ${datasetIdForDownload} (${source})`
502
790
  });
791
+ markPipelineStep("search", "skipped");
503
792
  }
504
793
  else {
794
+ markPipelineStep("search", "running");
505
795
  update({ progress: 10, status_text: "Searching for best dataset matching query..." });
506
- const results = await searchEngine.search(query, { limit: 10 });
796
+ const results = await searchEngine.search(searchQuery, { limit: 10 });
507
797
  if (results.length === 0) {
798
+ markPipelineStep("search", "failed");
508
799
  throw new Error("No datasets found matching the query. Try refining your search terms.");
509
800
  }
510
801
  // Pick the best result that we can actually download (skip sources requiring missing credentials)
@@ -524,8 +815,10 @@ async function handlePrepareJob(jobId, query, requirements) {
524
815
  progress: 20,
525
816
  status_text: `Matched: ${selectedDataset.name} (${selectedDataset.source})`
526
817
  });
818
+ markPipelineStep("search", "done");
527
819
  }
528
820
  // Pre-check credentials for sources that require them
821
+ markPipelineStep("validate", "running");
529
822
  if (source === "kaggle") {
530
823
  const hasKaggleCreds = !!(process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY && process.env.KAGGLE_USERNAME !== "YOUR_KAGGLE_USERNAME");
531
824
  if (!hasKaggleCreds) {
@@ -533,8 +826,11 @@ async function handlePrepareJob(jobId, query, requirements) {
533
826
  }
534
827
  }
535
828
  if (source === "dataworld" && !hasDataWorldToken()) {
829
+ markPipelineStep("validate", "failed");
536
830
  throw new Error("data.world token not set. Use the configure_keys tool or set DW_AUTH_TOKEN environment variable.");
537
831
  }
832
+ markPipelineStep("validate", "done");
833
+ markPipelineStep("download", "running");
538
834
  update({ progress: 30, status_text: `Starting download from ${source}...` });
539
835
  // ensureData handles download and returns path to the raw file
540
836
  let rawFilePath = await dataIngestor.ensureData(datasetIdForDownload, source, (msg, prog) => {
@@ -545,7 +841,7 @@ async function handlePrepareJob(jobId, query, requirements) {
545
841
  let currentRows = await countRows(rawFilePath);
546
842
  if (currentRows < requestedRows) {
547
843
  update({ progress: 64, status_text: `Only ${currentRows.toLocaleString()} rows found. Fetching more matching datasets...` });
548
- const additional = await searchEngine.search(query, { limit: 8 });
844
+ const additional = await searchEngine.search(searchQuery, { limit: 8 });
549
845
  const sourceFiles = [rawFilePath];
550
846
  let totalRows = currentRows;
551
847
  for (const ds of additional) {
@@ -597,15 +893,50 @@ async function handlePrepareJob(jobId, query, requirements) {
597
893
  update({ progress: 69, status_text: `Sample target met: ${currentRows.toLocaleString()} rows` });
598
894
  }
599
895
  }
896
+ markPipelineStep("download", "done");
897
+ // ── Normalize step: convert any raw format → parquet ──
898
+ markPipelineStep("normalize", "running");
899
+ const rawExt = path.extname(rawFilePath).toLowerCase();
900
+ if (rawExt !== ".parquet" && rawExt !== ".pq") {
901
+ update({ progress: 70, status_text: "Normalizing to parquet..." });
902
+ const normalizedDir = path.join(dataRoot, "data", "normalized");
903
+ if (!fs.existsSync(normalizedDir))
904
+ fs.mkdirSync(normalizedDir, { recursive: true });
905
+ const safeId = toSafeDatasetPathFragment(datasetIdForDownload);
906
+ const normalizedPath = path.join(normalizedDir, `${safeId}.parquet`);
907
+ try {
908
+ const normScript = path.join(dataRoot, "python", "normalize_engine.py");
909
+ const normResult = await runPythonJson(normScript, [rawFilePath, normalizedPath]);
910
+ if (normResult.ok && fs.existsSync(normalizedPath)) {
911
+ console.error(`[Prepare] Normalized ${rawExt} → parquet (${normResult.rows} rows)`);
912
+ rawFilePath = normalizedPath;
913
+ markPipelineStep("normalize", "done");
914
+ }
915
+ else {
916
+ console.error(`[Prepare] Normalize failed: ${normResult.error}, continuing with raw file`);
917
+ markPipelineStep("normalize", "skipped");
918
+ }
919
+ }
920
+ catch (e) {
921
+ console.error(`[Prepare] Normalize step failed: ${e?.message || e}, continuing with raw file`);
922
+ markPipelineStep("normalize", "skipped");
923
+ }
924
+ }
925
+ else {
926
+ markPipelineStep("normalize", "done");
927
+ }
600
928
  let qualityScore = selectedDataset?.quality_score ?? 70;
601
- update({ progress: 70, status_text: "Analyzing dataset quality..." });
929
+ markPipelineStep("quality", "running");
930
+ update({ progress: 75, status_text: "Analyzing dataset quality..." });
602
931
  try {
603
932
  const report = await qualityAnalyzer.analyze(rawFilePath);
604
933
  qualityScore = report.overall_score;
934
+ markPipelineStep("quality", "done");
605
935
  }
606
936
  catch (error) {
607
937
  console.error(`[Prepare] Quality analysis failed for ${datasetIdForDownload}: ${error?.message || error}`);
608
938
  update({ progress: 78, status_text: "Quality analysis skipped (unsupported schema). Continuing installation..." });
939
+ markPipelineStep("quality", "skipped");
609
940
  }
610
941
  if (selectedDataset) {
611
942
  metadataStore.saveDataset({
@@ -613,15 +944,62 @@ async function handlePrepareJob(jobId, query, requirements) {
613
944
  quality_score: qualityScore
614
945
  });
615
946
  }
947
+ else {
948
+ // Even for explicit dataset refs, save minimal metadata so get_dataset_info can find it
949
+ try {
950
+ const existingMeta = metadataStore.getDataset(datasetIdForDownload);
951
+ if (!existingMeta) {
952
+ metadataStore.saveDataset({
953
+ id: datasetIdForDownload,
954
+ source: source,
955
+ name: datasetIdForDownload.split("/").pop() || datasetIdForDownload,
956
+ description: `Dataset prepared from ${source}:${datasetIdForDownload}`,
957
+ quality_warnings: [],
958
+ downloads: 0,
959
+ likes: 0,
960
+ stars: 0,
961
+ tags: [],
962
+ last_updated: new Date().toISOString(),
963
+ task: "unknown",
964
+ domain: "unknown",
965
+ languages: [],
966
+ splits: [],
967
+ license: { id: "unknown", category: "unknown", usage_restrictions: [], warnings: [] },
968
+ quality_score: qualityScore,
969
+ download_url: source === "huggingface" ? `https://huggingface.co/datasets/${datasetIdForDownload}` : "",
970
+ total_examples: 0,
971
+ is_structured: false,
972
+ has_target_column: false,
973
+ is_safe_source: true,
974
+ has_personal_data: false,
975
+ is_paywalled: false,
976
+ is_scraped_web_data: false,
977
+ uses_https: true,
978
+ has_train_split: false,
979
+ has_test_split: false,
980
+ has_validation_split: false,
981
+ description_length: 0,
982
+ has_readme: false,
983
+ });
984
+ }
985
+ }
986
+ catch (e) {
987
+ console.error(`[Prepare] Failed to save minimal metadata for ${datasetIdForDownload}: ${e?.message || e}`);
988
+ }
989
+ }
990
+ markPipelineStep("register", "running");
616
991
  update({ progress: 85, status_text: "Installing dataset into project..." });
617
- const installPath = await installService.install(datasetIdForDownload, rawFilePath);
992
+ const installPath = await installService.install(datasetIdForDownload, rawFilePath, outputDir);
618
993
  update({ progress: 100, status_text: "Preparation complete!" });
619
994
  // Register prepared dataset in local registry for lookup by export/list tools
620
995
  try {
621
996
  upsertRegistry(datasetIdForDownload, installPath, "completed");
997
+ markPipelineStep("register", "done");
998
+ markStepComplete(datasetIdForDownload, "prepare");
622
999
  }
623
1000
  catch (e) {
624
1001
  console.error(`[Registry] Failed to write registry for ${datasetIdForDownload}: ${e?.message || e}`);
1002
+ markPipelineStep("register", "failed");
625
1003
  }
626
1004
  return installPath;
627
1005
  }
@@ -647,7 +1025,7 @@ async function handleCleanJob(jobId, datasetId, ops) {
647
1025
  }
648
1026
  // 3. Check standard raw data paths
649
1027
  if (!filePath) {
650
- const safeId = datasetId.replace(/\//g, "_");
1028
+ const safeId = toSafeDatasetPathFragment(datasetId);
651
1029
  const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
652
1030
  const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
653
1031
  const featherPath = path.join(dataRoot, "data", "raw", `${safeId}.feather`);
@@ -712,9 +1090,57 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
712
1090
  required: ["query"],
713
1091
  },
714
1092
  },
1093
+ {
1094
+ name: "unified_dataset_api",
1095
+ description: "Single facade over multiple external dataset providers. Supports provider discovery, dataset search, dataset download, and dataset info through one MCP tool using public access and server-managed credentials when available.",
1096
+ inputSchema: {
1097
+ type: "object",
1098
+ properties: {
1099
+ operation: {
1100
+ type: "string",
1101
+ enum: ["providers", "discover", "download", "info"],
1102
+ description: "Gateway operation to execute.",
1103
+ },
1104
+ source: {
1105
+ type: "string",
1106
+ enum: ["auto", "huggingface", "kaggle", "openml", "dataworld", "s3", "bigquery"],
1107
+ description: "Optional provider selector. Use 'auto' to let Vesper choose a compatible backend.",
1108
+ },
1109
+ query: {
1110
+ type: "string",
1111
+ description: "Dataset discovery query. Required for operation='discover'.",
1112
+ },
1113
+ dataset_id: {
1114
+ type: "string",
1115
+ description: "Dataset identifier or object reference. Required for operation='download' and operation='info'. Supports prefixed ids like 'huggingface:user/dataset' and public S3 URIs like 's3://bucket/key'.",
1116
+ },
1117
+ limit: {
1118
+ type: "number",
1119
+ description: "Max results for operation='discover' (default: 10).",
1120
+ },
1121
+ target_dir: {
1122
+ type: "string",
1123
+ description: "Optional output directory for operation='download'. Defaults to the current working directory when omitted.",
1124
+ },
1125
+ output_dir: {
1126
+ type: "string",
1127
+ description: "Alias for target_dir. Defaults to the current working directory when omitted.",
1128
+ },
1129
+ public_only: {
1130
+ type: "boolean",
1131
+ description: "When true, discover/info stay on public providers only unless a specific source is requested.",
1132
+ },
1133
+ include_unavailable: {
1134
+ type: "boolean",
1135
+ description: "When true, operation='providers' includes connectors that are scaffolded but not currently configured.",
1136
+ },
1137
+ },
1138
+ required: ["operation"],
1139
+ },
1140
+ },
715
1141
  {
716
1142
  name: "discover_datasets",
717
- description: "Discover datasets from a specific source. Kaggle is optional and requires user-provided API key.",
1143
+ description: "Discover datasets from a specific source. Public providers work keylessly; Kaggle and data.world can also be exposed through server-managed credentials.",
718
1144
  inputSchema: {
719
1145
  type: "object",
720
1146
  properties: {
@@ -737,7 +1163,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
737
1163
  },
738
1164
  {
739
1165
  name: "download_dataset",
740
- description: "Download a dataset by source and ID/slug into local Vesper storage. Defaults to HuggingFace. Kaggle and data.world require API keys (use configure_keys first).",
1166
+ description: "Download a dataset by source and ID/slug into a local directory. Defaults to HuggingFace. Public providers work keylessly, while Kaggle and data.world can use server-managed credentials when configured.",
741
1167
  inputSchema: {
742
1168
  type: "object",
743
1169
  properties: {
@@ -752,7 +1178,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
752
1178
  },
753
1179
  target_dir: {
754
1180
  type: "string",
755
- description: "Optional target directory for downloaded files.",
1181
+ description: "Optional target directory for downloaded files. Defaults to the current working directory when omitted.",
1182
+ },
1183
+ output_dir: {
1184
+ type: "string",
1185
+ description: "Alias for target_dir. Defaults to the current working directory when omitted.",
756
1186
  }
757
1187
  },
758
1188
  required: ["dataset_id"],
@@ -770,6 +1200,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
770
1200
  kaggle_ref: { type: "string", description: "Kaggle dataset ref (owner/dataset)." },
771
1201
  urls: { type: "array", items: { type: "string" }, description: "Direct asset URLs." },
772
1202
  output_format: { type: "string", enum: ["webdataset", "imagefolder", "parquet"], description: "Output asset format." },
1203
+ target_dir: { type: "string", description: "Optional local directory where downloaded assets should be written. If provided, Vesper writes directly to this directory instead of managed asset storage." },
1204
+ output_dir: { type: "string", description: "Alias for target_dir. When provided, downloaded assets are written directly to this local directory." },
773
1205
  max_items: { type: "number", description: "Optional cap on number of assets to fetch." },
774
1206
  workers: { type: "number", description: "Parallel worker count (default 8)." },
775
1207
  image_column: { type: "string", description: "Explicit image column name. If omitted, auto-detected from HF features, column names, and sample values." },
@@ -877,6 +1309,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
877
1309
  properties: {
878
1310
  query: { type: "string" },
879
1311
  requirements: { type: "string" },
1312
+ target_dir: { type: "string", description: "Optional local directory for the prepared dataset. Defaults to the current working directory when omitted." },
1313
+ output_dir: { type: "string", description: "Alias for target_dir. Defaults to the current working directory when omitted." },
880
1314
  download_images: { type: "boolean", description: "When true, enables post-prepare smart asset downloading for image/media datasets." },
881
1315
  cleaning_options: { type: "object" },
882
1316
  split_config: { type: "object" },
@@ -921,7 +1355,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
921
1355
  },
922
1356
  target_dir: {
923
1357
  type: "string",
924
- description: "Optional custom local directory for export (e.g., './naruto-quotes').",
1358
+ description: "Optional custom local directory for export. Defaults to the current working directory when omitted.",
1359
+ },
1360
+ output_dir: {
1361
+ type: "string",
1362
+ description: "Alias for target_dir. Defaults to the current working directory when omitted.",
925
1363
  },
926
1364
  format: {
927
1365
  type: "string",
@@ -962,6 +1400,25 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
962
1400
  properties: {},
963
1401
  },
964
1402
  },
1403
+ {
1404
+ name: "vesper_convert_format",
1405
+ description: "Convert a dataset file between formats (CSV, Parquet, JSON, JSONL). Auto-detects input format from extension. Saves output in the same directory with the new extension and registers it in the Vesper registry.",
1406
+ inputSchema: {
1407
+ type: "object",
1408
+ properties: {
1409
+ file_path: {
1410
+ type: "string",
1411
+ description: "Absolute path to the input dataset file.",
1412
+ },
1413
+ target_format: {
1414
+ type: "string",
1415
+ enum: ["csv", "parquet", "json", "jsonl"],
1416
+ description: "The desired output format.",
1417
+ },
1418
+ },
1419
+ required: ["file_path", "target_format"],
1420
+ },
1421
+ },
965
1422
  {
966
1423
  name: "fuse_datasets",
967
1424
  description: "Fuse/combine multiple datasets via concat or join. Optionally runs quality & leakage checks afterward.",
@@ -1069,925 +1526,1112 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1069
1526
  ],
1070
1527
  };
1071
1528
  });
1072
- // Call Tool
1529
+ // Call Tool — all requests are serialized through a queue to prevent crashes from parallel calls
1073
1530
  server.setRequestHandler(CallToolRequestSchema, async (request) => {
1074
- // --- Pipeline Enforcement ---
1075
- // Map tool names to pipeline steps
1076
- const toolToStep = {
1077
- vesper_search: "search",
1078
- vesper_download: "download",
1079
- vesper_analyze: "analyze",
1080
- vesper_clean: "clean",
1081
- vesper_split: "split",
1082
- vesper_export: "export",
1083
- prepare_dataset: "prepare",
1084
- };
1085
- // Extract dataset_id if present and normalize
1086
- let datasetId = request.params.arguments?.dataset_id || request.params.arguments?.query || "";
1087
- if (datasetId)
1088
- datasetId = parseDatasetId(String(datasetId));
1089
- // Pipeline rules
1090
- const stepOrder = ["search", "download", "analyze", "clean", "split", "export"];
1091
- const prereqs = {
1092
- vesper_download: ["search"],
1093
- vesper_analyze: ["download"],
1094
- vesper_clean: ["analyze"],
1095
- vesper_split: ["clean"],
1096
- vesper_export: ["split"],
1097
- };
1098
- const tool = String(request.params.name);
1099
- const step = toolToStep[tool];
1100
- if (step && datasetId) {
1101
- // Check prerequisites
1102
- const required = prereqs[tool] || [];
1103
- for (const req of required) {
1104
- if (!hasStep(String(datasetId), req)) {
1105
- // Auto-run missing step if possible, else error
1106
- // For export, auto-run prepare_dataset if split missing
1107
- if (tool === "vesper_export" && req === "split") {
1108
- // Auto-trigger prepare_dataset (start a background prepare job)
1109
- try {
1110
- jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false });
1111
- // Mark split as complete so export can proceed; export handler will also wait for data if needed.
1112
- markStepComplete(String(datasetId), "split");
1531
+ return requestQueue.enqueue(async () => {
1532
+ // --- Pipeline Enforcement ---
1533
+ // Map tool names to pipeline steps
1534
+ const toolToStep = {
1535
+ vesper_search: "search",
1536
+ vesper_download: "download",
1537
+ vesper_analyze: "analyze",
1538
+ vesper_clean: "clean",
1539
+ vesper_split: "split",
1540
+ vesper_export: "export",
1541
+ prepare_dataset: "prepare",
1542
+ };
1543
+ // Extract dataset_id if present and normalize
1544
+ let datasetId = request.params.arguments?.dataset_id || request.params.arguments?.query || "";
1545
+ if (datasetId)
1546
+ datasetId = parseDatasetId(String(datasetId));
1547
+ // Pipeline rules
1548
+ const stepOrder = ["search", "download", "analyze", "clean", "split", "export"];
1549
+ const prereqs = {
1550
+ vesper_download: ["search"],
1551
+ vesper_analyze: ["download"],
1552
+ vesper_clean: ["analyze"],
1553
+ vesper_split: ["clean"],
1554
+ vesper_export: ["split"],
1555
+ };
1556
+ const tool = String(request.params.name);
1557
+ const step = toolToStep[tool];
1558
+ if (step && datasetId) {
1559
+ // Check prerequisites
1560
+ const required = prereqs[tool] || [];
1561
+ for (const req of required) {
1562
+ if (!hasStep(String(datasetId), req)) {
1563
+ // Auto-run missing step if possible, else error
1564
+ // For export, auto-run prepare_dataset if split missing
1565
+ if (tool === "vesper_export" && req === "split") {
1566
+ // Auto-trigger prepare_dataset (start a background prepare job)
1567
+ try {
1568
+ jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false, outputDir: process.cwd() });
1569
+ // Mark split as complete so export can proceed; export handler will also wait for data if needed.
1570
+ markStepComplete(String(datasetId), "split");
1571
+ }
1572
+ catch (e) {
1573
+ console.error(`[Pipeline] Failed to auto-trigger prepare for ${datasetId}: ${e?.message || e}`);
1574
+ return {
1575
+ content: [{ type: "text", text: `ERROR: Failed to auto-run prepare for ${datasetId}. Please run prepare_dataset first.` }],
1576
+ isError: true,
1577
+ };
1578
+ }
1113
1579
  }
1114
- catch (e) {
1115
- console.error(`[Pipeline] Failed to auto-trigger prepare for ${datasetId}: ${e?.message || e}`);
1580
+ else {
1116
1581
  return {
1117
- content: [{ type: "text", text: `ERROR: Failed to auto-run prepare for ${datasetId}. Please run prepare_dataset first.` }],
1582
+ content: [{ type: "text", text: `ERROR: Cannot run ${tool} before ${req}. Please run ${req} first.` }],
1118
1583
  isError: true,
1119
1584
  };
1120
1585
  }
1121
1586
  }
1122
- else {
1123
- return {
1124
- content: [{ type: "text", text: `ERROR: Cannot run ${tool} before ${req}. Please run ${req} first.` }],
1125
- isError: true,
1126
- };
1127
- }
1128
- }
1129
- }
1130
- // Mark this step as complete
1131
- markStepComplete(String(datasetId), String(step));
1132
- }
1133
- switch (request.params.name) {
1134
- case "vesper_search": {
1135
- const query = String(request.params.arguments?.query);
1136
- const limit = 5;
1137
- const safeOnly = true; // Enable safe filter by default
1138
- const enableJIT = request.params.arguments?.enable_jit === true;
1139
- if (!query) {
1140
- throw new McpError(ErrorCode.InvalidParams, "Query is required");
1141
1587
  }
1142
- const results = await searchEngine.search(query, { limit, safeOnly, enableJIT });
1143
- const formattedOutput = formatSearchResults(results);
1144
- return {
1145
- content: [
1146
- {
1147
- type: "text",
1148
- text: formattedOutput,
1149
- },
1150
- ],
1151
- };
1588
+ // Mark this step as complete
1589
+ markStepComplete(String(datasetId), String(step));
1152
1590
  }
1153
- case "discover_datasets": {
1154
- hydrateExternalKeys();
1155
- const query = String(request.params.arguments?.query || "").trim();
1156
- const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1157
- const limit = Number(request.params.arguments?.limit || 10);
1158
- if (!query) {
1159
- throw new McpError(ErrorCode.InvalidParams, "query is required");
1160
- }
1161
- try {
1162
- let results = [];
1163
- if (source === "kaggle") {
1164
- if (!dataIngestor.hasKaggleCredentials()) {
1591
+ switch (request.params.name) {
1592
+ case "unified_dataset_api": {
1593
+ hydrateExternalKeys();
1594
+ const operation = String(request.params.arguments?.operation || "").trim().toLowerCase();
1595
+ const source = String(request.params.arguments?.source || "auto").trim().toLowerCase();
1596
+ const includeUnavailable = request.params.arguments?.include_unavailable === true;
1597
+ const publicOnly = request.params.arguments?.public_only !== false;
1598
+ try {
1599
+ if (operation === "providers") {
1165
1600
  return {
1166
- content: [{ type: "text", text: `Kaggle requires API credentials. Use the configure_keys tool to set kaggle_username and kaggle_key, or try source='huggingface' which works without credentials.` }],
1167
- isError: true,
1601
+ content: [{ type: "text", text: JSON.stringify({ providers: unifiedDatasetGateway.getProviderStatuses(includeUnavailable) }, null, 2) }],
1168
1602
  };
1169
1603
  }
1170
- results = await kaggleSource.discover(query, limit);
1171
- }
1172
- else if (source === "openml") {
1173
- const openmlSource = new OpenMLSource();
1174
- results = await openmlSource.discover(query, limit);
1175
- }
1176
- else if (source === "dataworld") {
1177
- if (!hasDataWorldToken()) {
1604
+ if (operation === "discover") {
1605
+ const query = String(request.params.arguments?.query || "").trim();
1606
+ if (!query) {
1607
+ throw new McpError(ErrorCode.InvalidParams, "query is required for operation='discover'");
1608
+ }
1609
+ const result = await unifiedDatasetGateway.discover({
1610
+ query,
1611
+ source,
1612
+ limit: Number(request.params.arguments?.limit || 10),
1613
+ publicOnly,
1614
+ });
1178
1615
  return {
1179
- content: [{ type: "text", text: "data.world requires API token. Run 'vespermcp config keys' and set dataworld_token." }],
1180
- isError: true,
1616
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1181
1617
  };
1182
1618
  }
1183
- const dataworldSource = new DataWorldSource();
1184
- results = await dataworldSource.discover(query, limit);
1185
- }
1186
- else {
1187
- const hf = new HuggingFaceScraper();
1188
- results = await hf.scrape(Math.max(1, limit), true, query);
1189
- }
1190
- const recipeScript = path.join(dataRoot, "python", "asset_downloader_engine.py");
1191
- for (const ds of results.slice(0, limit)) {
1192
- const info = {
1193
- dataset_id: ds.id,
1194
- id: ds.id,
1195
- source: ds.source,
1196
- repo_id: ds.id,
1197
- total_images: ds.total_examples || 0,
1198
- image_column: undefined,
1199
- recipes_dir: path.join(dataRoot, "recipes"),
1200
- };
1201
- try {
1202
- await runPythonJson(recipeScript, ["build_recipe", JSON.stringify(info)]);
1619
+ if (operation === "download") {
1620
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1621
+ if (!datasetId) {
1622
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='download'");
1623
+ }
1624
+ const requestedTargetDir = request.params.arguments?.target_dir
1625
+ ? String(request.params.arguments.target_dir).trim()
1626
+ : request.params.arguments?.output_dir
1627
+ ? String(request.params.arguments.output_dir).trim()
1628
+ : "";
1629
+ const targetDir = requestedTargetDir || process.cwd();
1630
+ try {
1631
+ await ensurePythonModules([{ module: "datasets", packageName: "datasets" }]);
1632
+ }
1633
+ catch {
1634
+ // best effort; non-HF providers do not require this
1635
+ }
1636
+ const result = await unifiedDatasetGateway.download({
1637
+ datasetId,
1638
+ source,
1639
+ targetDir,
1640
+ });
1641
+ try {
1642
+ upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
1643
+ }
1644
+ catch (e) {
1645
+ console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
1646
+ }
1647
+ return {
1648
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1649
+ };
1203
1650
  }
1204
- catch {
1205
- // best-effort recipe generation; ignore discovery-time recipe failures
1651
+ if (operation === "info") {
1652
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1653
+ if (!datasetId) {
1654
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='info'");
1655
+ }
1656
+ const result = await unifiedDatasetGateway.info({
1657
+ datasetId,
1658
+ source,
1659
+ publicOnly,
1660
+ });
1661
+ return {
1662
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1663
+ };
1206
1664
  }
1665
+ throw new McpError(ErrorCode.InvalidParams, `Unsupported operation: ${operation}`);
1666
+ }
1667
+ catch (error) {
1668
+ return {
1669
+ content: [{ type: "text", text: `ERROR: unified gateway failed: ${error.message}` }],
1670
+ isError: true,
1671
+ };
1207
1672
  }
1208
- const formattedOutput = formatSearchResults(results.slice(0, limit));
1209
- return {
1210
- content: [{ type: "text", text: formattedOutput }]
1211
- };
1212
- }
1213
- catch (error) {
1214
- return {
1215
- content: [{ type: "text", text: `ERROR: discover failed: ${error.message}` }],
1216
- isError: true,
1217
- };
1218
- }
1219
- }
1220
- case "download_dataset": {
1221
- hydrateExternalKeys();
1222
- const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1223
- const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1224
- if (!datasetId) {
1225
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1226
- }
1227
- if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
1228
- return {
1229
- content: [{ type: "text", text: `Kaggle requires API credentials. Use the configure_keys tool to set kaggle_username and kaggle_key, or switch to source='huggingface' which works without credentials.` }],
1230
- isError: true,
1231
- };
1232
1673
  }
1233
- if (source === "dataworld" && !hasDataWorldToken()) {
1674
+ case "vesper_search": {
1675
+ const query = String(request.params.arguments?.query);
1676
+ const limit = 5;
1677
+ const safeOnly = true; // Enable safe filter by default
1678
+ const enableJIT = request.params.arguments?.enable_jit === true;
1679
+ if (!query) {
1680
+ throw new McpError(ErrorCode.InvalidParams, "Query is required");
1681
+ }
1682
+ const results = await searchEngine.search(query, { limit, safeOnly, enableJIT });
1683
+ const formattedOutput = formatSearchResults(results);
1234
1684
  return {
1235
- content: [{ type: "text", text: "data.world requires API token. Use the configure_keys tool to set dataworld_token, or switch to source='huggingface' which works without credentials." }],
1236
- isError: true,
1685
+ content: [
1686
+ {
1687
+ type: "text",
1688
+ text: formattedOutput,
1689
+ },
1690
+ ],
1237
1691
  };
1238
1692
  }
1239
- // Pre-install Python datasets library for HuggingFace fallback
1240
- if (source === "huggingface") {
1693
+ case "discover_datasets": {
1694
+ hydrateExternalKeys();
1695
+ const query = String(request.params.arguments?.query || "").trim();
1696
+ const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1697
+ const limit = Number(request.params.arguments?.limit || 10);
1698
+ if (!query) {
1699
+ throw new McpError(ErrorCode.InvalidParams, "query is required");
1700
+ }
1241
1701
  try {
1242
- await ensurePythonModules([
1243
- { module: "datasets", packageName: "datasets" },
1244
- ]);
1702
+ const gatewayResult = await unifiedDatasetGateway.discover({
1703
+ query,
1704
+ source,
1705
+ limit,
1706
+ publicOnly: false,
1707
+ });
1708
+ const results = gatewayResult.results;
1709
+ const recipeScript = path.join(dataRoot, "python", "asset_downloader_engine.py");
1710
+ for (const ds of results.slice(0, limit)) {
1711
+ const info = {
1712
+ dataset_id: ds.id,
1713
+ id: ds.id,
1714
+ source: ds.source,
1715
+ repo_id: ds.id,
1716
+ total_images: ds.total_examples || 0,
1717
+ image_column: undefined,
1718
+ recipes_dir: path.join(dataRoot, "recipes"),
1719
+ };
1720
+ try {
1721
+ await runPythonJson(recipeScript, ["build_recipe", JSON.stringify(info)]);
1722
+ }
1723
+ catch {
1724
+ // best-effort recipe generation; ignore discovery-time recipe failures
1725
+ }
1726
+ }
1727
+ const formattedOutput = formatSearchResults(results.slice(0, limit));
1728
+ const noteBlock = gatewayResult.notes.length > 0
1729
+ ? `\nGateway notes:\n- ${gatewayResult.notes.join("\n- ")}`
1730
+ : "";
1731
+ return {
1732
+ content: [{ type: "text", text: `${formattedOutput}${noteBlock}` }]
1733
+ };
1245
1734
  }
1246
- catch {
1247
- // Continue - direct download may still work
1735
+ catch (error) {
1736
+ return {
1737
+ content: [{ type: "text", text: `ERROR: discover failed: ${error.message}` }],
1738
+ isError: true,
1739
+ };
1248
1740
  }
1249
1741
  }
1250
- try {
1251
- const localPath = await dataIngestor.ensureData(datasetId, source, () => undefined);
1252
- try {
1253
- upsertRegistry(datasetId, localPath, "completed");
1742
+ case "download_dataset": {
1743
+ hydrateExternalKeys();
1744
+ const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1745
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1746
+ const requestedTargetDir = request.params.arguments?.target_dir
1747
+ ? String(request.params.arguments.target_dir).trim()
1748
+ : request.params.arguments?.output_dir
1749
+ ? String(request.params.arguments.output_dir).trim()
1750
+ : "";
1751
+ const targetDir = requestedTargetDir || process.cwd();
1752
+ if (!datasetId) {
1753
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1254
1754
  }
1255
- catch (e) {
1256
- console.error(`[Registry] Failed to write registry for ${datasetId}: ${e?.message || e}`);
1755
+ // Pre-install Python datasets library for HuggingFace fallback
1756
+ if (source === "huggingface") {
1757
+ try {
1758
+ await ensurePythonModules([
1759
+ { module: "datasets", packageName: "datasets" },
1760
+ ]);
1761
+ }
1762
+ catch {
1763
+ // Continue - direct download may still work
1764
+ }
1257
1765
  }
1258
- return {
1259
- content: [{ type: "text", text: `Download complete: ${localPath}` }]
1260
- };
1261
- }
1262
- catch (error) {
1263
- return {
1264
- content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
1265
- isError: true,
1266
- };
1267
- }
1268
- }
1269
- case "vesper_download_assets": {
1270
- hydrateExternalKeys();
1271
- const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1272
- const source = String(request.params.arguments?.source || "").trim().toLowerCase();
1273
- // Auto-infer repo_id from dataset_id if not provided (common for HuggingFace)
1274
- const repoId = request.params.arguments?.repo_id
1275
- ? String(request.params.arguments.repo_id)
1276
- : (source === "huggingface" && datasetId.includes("/") ? datasetId : undefined);
1277
- const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
1278
- const urls = Array.isArray(request.params.arguments?.urls)
1279
- ? (request.params.arguments?.urls).map(v => String(v))
1280
- : undefined;
1281
- const outputFormat = String(request.params.arguments?.output_format || "webdataset");
1282
- const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
1283
- const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
1284
- const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
1285
- if (!datasetId || !source) {
1286
- throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
1287
- }
1288
- if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
1289
- return {
1290
- content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
1291
- isError: true,
1292
- };
1293
- }
1294
- const requiredModules = [
1295
- { module: "aiohttp", packageName: "aiohttp" },
1296
- ];
1297
- if (source === "url") {
1298
- requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
1299
- }
1300
- if (source === "huggingface") {
1301
- requiredModules.push({ module: "datasets", packageName: "datasets" });
1302
- requiredModules.push({ module: "PIL", packageName: "Pillow" });
1303
- }
1304
- if (source === "kaggle") {
1305
- requiredModules.push({ module: "kaggle", packageName: "kaggle" });
1306
- }
1307
- try {
1308
- await ensurePythonModules(requiredModules);
1309
- }
1310
- catch (error) {
1311
- return {
1312
- content: [{ type: "text", text: `ERROR: Python dependency setup failed: ${error.message}` }],
1313
- isError: true,
1314
- };
1315
- }
1316
- const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
1317
- const payload = {
1318
- dataset_id: datasetId,
1319
- source,
1320
- repo_id: repoId,
1321
- kaggle_ref: kaggleRef,
1322
- urls,
1323
- output_format: outputFormat,
1324
- max_items: maxItems,
1325
- workers,
1326
- image_column: imageColumn,
1327
- output_root: path.join(dataRoot, "data", "assets"),
1328
- recipes_dir: path.join(dataRoot, "recipes"),
1329
- };
1330
- try {
1331
- const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
1332
- if (!result?.ok) {
1333
- const errMsg = result?.error || "Unknown error";
1334
- // Enhance error messages for common failures
1335
- let hint = "";
1336
- if (errMsg.includes("No image column")) {
1337
- hint = "\n\nHint: Specify image_column parameter with the column name containing images/URLs.";
1766
+ try {
1767
+ const result = await unifiedDatasetGateway.download({
1768
+ datasetId,
1769
+ source,
1770
+ targetDir,
1771
+ });
1772
+ try {
1773
+ upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
1338
1774
  }
1339
- else if (errMsg.includes("Authentication") || errMsg.includes("401") || errMsg.includes("403")) {
1340
- hint = "\n\nHint: Use configure_keys tool to set HF_TOKEN for gated/private datasets.";
1775
+ catch (e) {
1776
+ console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
1341
1777
  }
1778
+ const noteBlock = result.notes.length > 0 ? `\nNotes:\n- ${result.notes.join("\n- ")}` : "";
1779
+ return {
1780
+ content: [{ type: "text", text: `Download complete: ${result.copied_to || result.local_path}${noteBlock}` }]
1781
+ };
1782
+ }
1783
+ catch (error) {
1342
1784
  return {
1343
- content: [{ type: "text", text: `ERROR: asset download failed: ${errMsg}${hint}` }],
1785
+ content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
1344
1786
  isError: true,
1345
1787
  };
1346
1788
  }
1347
- return {
1348
- content: [{ type: "text", text: JSON.stringify(result.result, null, 2) }],
1349
- };
1350
- }
1351
- catch (error) {
1352
- return {
1353
- content: [{ type: "text", text: `ERROR: asset downloader execution failed: ${error.message}` }],
1354
- isError: true,
1355
- };
1356
- }
1357
- }
1358
- case "configure_kaggle": {
1359
- const username = String(request.params.arguments?.username || "").trim();
1360
- const key = String(request.params.arguments?.key || "").trim();
1361
- if (!username || !key) {
1362
- throw new McpError(ErrorCode.InvalidParams, "username and key are required");
1363
1789
  }
1364
- const r1 = secureKeys.set("kaggle_username", username);
1365
- const r2 = secureKeys.set("kaggle_key", key);
1366
- process.env.KAGGLE_USERNAME = username;
1367
- process.env.KAGGLE_KEY = key;
1368
- return {
1369
- content: [{ type: "text", text: `Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
1370
- };
1371
- }
1372
- case "configure_keys": {
1373
- const hfToken = String(request.params.arguments?.hf_token || "").trim();
1374
- const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
1375
- const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
1376
- const dataworldToken = String(request.params.arguments?.dataworld_token || "").trim();
1377
- const saved = [];
1378
- const methods = [];
1379
- if (hfToken) {
1380
- const r = secureKeys.set("hf_token", hfToken);
1381
- if (r.ok) {
1382
- process.env.HF_TOKEN = hfToken;
1383
- saved.push("HF token");
1384
- if (r.method)
1385
- methods.push(r.method);
1790
+ case "vesper_download_assets": {
1791
+ hydrateExternalKeys();
1792
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1793
+ const source = String(request.params.arguments?.source || "").trim().toLowerCase();
1794
+ // Auto-infer repo_id from dataset_id if not provided (common for HuggingFace)
1795
+ const repoId = request.params.arguments?.repo_id
1796
+ ? String(request.params.arguments.repo_id)
1797
+ : (source === "huggingface" && datasetId.includes("/") ? datasetId : undefined);
1798
+ const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
1799
+ const urls = Array.isArray(request.params.arguments?.urls)
1800
+ ? (request.params.arguments?.urls).map(v => String(v))
1801
+ : undefined;
1802
+ const outputFormat = String(request.params.arguments?.output_format || "webdataset");
1803
+ const requestedOutputDir = request.params.arguments?.target_dir
1804
+ ? String(request.params.arguments.target_dir).trim()
1805
+ : request.params.arguments?.output_dir
1806
+ ? String(request.params.arguments.output_dir).trim()
1807
+ : undefined;
1808
+ const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
1809
+ const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
1810
+ const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
1811
+ if (!datasetId || !source) {
1812
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
1386
1813
  }
1387
- }
1388
- if (kaggleUsername) {
1389
- const r = secureKeys.set("kaggle_username", kaggleUsername);
1390
- if (r.ok) {
1391
- process.env.KAGGLE_USERNAME = kaggleUsername;
1392
- saved.push("Kaggle username");
1393
- if (r.method)
1394
- methods.push(r.method);
1814
+ if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
1815
+ return {
1816
+ content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
1817
+ isError: true,
1818
+ };
1395
1819
  }
1396
- }
1397
- if (kaggleKey) {
1398
- const r = secureKeys.set("kaggle_key", kaggleKey);
1399
- if (r.ok) {
1400
- process.env.KAGGLE_KEY = kaggleKey;
1401
- saved.push("Kaggle key");
1402
- if (r.method)
1403
- methods.push(r.method);
1820
+ const requiredModules = [
1821
+ { module: "aiohttp", packageName: "aiohttp" },
1822
+ ];
1823
+ if (source === "url") {
1824
+ requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
1404
1825
  }
1405
- }
1406
- if (dataworldToken) {
1407
- const r = secureKeys.set("dataworld_token", dataworldToken);
1408
- if (r.ok) {
1409
- process.env.DW_AUTH_TOKEN = dataworldToken;
1410
- saved.push("data.world token");
1411
- if (r.method)
1412
- methods.push(r.method);
1826
+ if (source === "huggingface") {
1827
+ requiredModules.push({ module: "datasets", packageName: "datasets" });
1828
+ requiredModules.push({ module: "PIL", packageName: "Pillow" });
1413
1829
  }
1414
- }
1415
- if (saved.length === 0) {
1416
- return {
1417
- content: [{ type: "text", text: "No keys provided. Core Vesper tools continue to work without API keys." }]
1830
+ if (source === "kaggle") {
1831
+ requiredModules.push({ module: "kaggle", packageName: "kaggle" });
1832
+ }
1833
+ try {
1834
+ await ensurePythonModules(requiredModules);
1835
+ }
1836
+ catch (error) {
1837
+ return {
1838
+ content: [{ type: "text", text: `ERROR: Python dependency setup failed: ${error.message}` }],
1839
+ isError: true,
1840
+ };
1841
+ }
1842
+ const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
1843
+ const payload = {
1844
+ dataset_id: datasetId,
1845
+ source,
1846
+ repo_id: repoId,
1847
+ kaggle_ref: kaggleRef,
1848
+ urls,
1849
+ output_format: outputFormat,
1850
+ output_dir: requestedOutputDir,
1851
+ max_items: maxItems,
1852
+ workers,
1853
+ image_column: imageColumn,
1854
+ output_root: requestedOutputDir || process.cwd(),
1855
+ recipes_dir: path.join(dataRoot, "recipes"),
1418
1856
  };
1419
- }
1420
- return {
1421
- content: [{ type: "text", text: `Key saved securely. Updated: ${saved.join(", ")}.` }]
1422
- };
1423
- }
1424
- case "get_dataset_info": {
1425
- const datasetId = String(request.params.arguments?.dataset_id);
1426
- if (!datasetId) {
1427
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1428
- }
1429
- const dataset = metadataStore.getDataset(datasetId);
1430
- if (!dataset) {
1431
- return {
1432
- content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}` }],
1433
- isError: true,
1434
- };
1435
- }
1436
- const formattedOutput = formatDatasetInfo(dataset);
1437
- return { content: [{ type: "text", text: formattedOutput }] };
1438
- }
1439
- case "analyze_quality": {
1440
- const datasetId = String(request.params.arguments?.dataset_id);
1441
- const safeId = datasetId.replace(/\//g, "_");
1442
- const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
1443
- const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
1444
- let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
1445
- // Demo Fallback for easy testing
1446
- if (datasetId === "demo" || !fs.existsSync(filePath)) {
1447
- const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
1448
- const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
1449
- if (fs.existsSync(demoParquetPath)) {
1450
- filePath = demoParquetPath;
1451
- }
1452
- else if (fs.existsSync(demoCsvPath)) {
1453
- filePath = demoCsvPath;
1454
- }
1455
- else if (datasetId !== "demo") {
1857
+ try {
1858
+ const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
1859
+ if (!result?.ok) {
1860
+ const errMsg = result?.error || "Unknown error";
1861
+ // Enhance error messages for common failures
1862
+ let hint = "";
1863
+ if (errMsg.includes("No image column")) {
1864
+ hint = "\n\nHint: Specify image_column parameter with the column name containing images/URLs.";
1865
+ }
1866
+ else if (errMsg.includes("Authentication") || errMsg.includes("401") || errMsg.includes("403")) {
1867
+ hint = "\n\nHint: Use configure_keys tool to set HF_TOKEN for gated/private datasets.";
1868
+ }
1869
+ return {
1870
+ content: [{ type: "text", text: `ERROR: asset download failed: ${errMsg}${hint}` }],
1871
+ isError: true,
1872
+ };
1873
+ }
1456
1874
  return {
1457
- content: [{ type: "text", text: `ERROR: Local data file for ${datasetId} not found. Try running prepare_dataset first, or use 'demo' as the dataset_id.` }],
1458
- isError: true
1875
+ content: [{ type: "text", text: JSON.stringify(result.result, null, 2) }],
1459
1876
  };
1460
1877
  }
1461
- }
1462
- const report = await qualityAnalyzer.analyze(filePath);
1463
- return {
1464
- content: [{ type: "text", text: JSON.stringify(report, null, 2) }]
1465
- };
1466
- }
1467
- case "preview_cleaning": {
1468
- const datasetId = String(request.params.arguments?.dataset_id);
1469
- const safeId = datasetId.replace(/\//g, "_");
1470
- const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
1471
- const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
1472
- let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
1473
- if (datasetId === "demo" || !fs.existsSync(filePath)) {
1474
- const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
1475
- const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
1476
- if (fs.existsSync(demoParquetPath)) {
1477
- filePath = demoParquetPath;
1478
- }
1479
- else if (fs.existsSync(demoCsvPath)) {
1480
- filePath = demoCsvPath;
1481
- }
1482
- else {
1483
- throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}. Please run prepare_dataset first.`);
1878
+ catch (error) {
1879
+ return {
1880
+ content: [{ type: "text", text: `ERROR: asset downloader execution failed: ${error.message}` }],
1881
+ isError: true,
1882
+ };
1484
1883
  }
1485
1884
  }
1486
- const report = await qualityAnalyzer.analyze(filePath);
1487
- // Phase 1: Target Detection
1488
- // We use the same TargetDetector instance inside CleaningPlanner now?
1489
- // Actually, we instantiated it inside CleaningPlanner, so we just need to pass the file path
1490
- // OR let the planner handle it if we update its signature to accept filePath.
1491
- // Let's check `CleaningPlanner.generatePlan` signature again.
1492
- // We updated it to accept `targetInfo`.
1493
- // So we need to run detection HERE and pass it.
1494
- // But `TargetDetector` is not exposed in `index.ts` scope yet.
1495
- // Let's create a global instance or use the one inside planner if exposed (it's private).
1496
- // Better approach: Instantiate TargetDetector here in index.ts for the tool content.
1497
- // Quick fix: Instantiate local detector or make global.
1498
- // I'll make a global `targetDetector` constant in index.ts
1499
- // But wait, I updated `CleaningPlanner` to instantiate its own detector.
1500
- // Does `CleaningPlanner` use it? No, I commented out the logic because it needed `filePath`.
1501
- // RETRY STRATEGY:
1502
- // 1. Instantiate `targetDetector` in `index.ts`.
1503
- // 2. Run `detectTarget(filePath)`.
1504
- // 3. Pass result to `cleaningPlanner.generatePlan(..., targetInfo)`.
1505
- // I need to add `const targetDetector = new TargetDetector(__dirname);` to imports/init section first.
1506
- // But since I'm in this tool, I can't look back.
1507
- // I will assume I can add it, or just do it inside the case for now.
1508
- // To do it properly, I should have added `targetDetector` to the global scope in previous step.
1509
- // Let's do that in a separate step if needed.
1510
- // For now, I'll instantiate it here.
1511
- const { TargetDetector } = await import("./preparation/target-detector.js");
1512
- const detector = new TargetDetector(__dirname);
1513
- const targetResult = await detector.detectTarget(filePath);
1514
- const targetInfo = targetResult.target_column ? {
1515
- target: targetResult.target_column,
1516
- confidence: targetResult.confidence
1517
- } : undefined;
1518
- const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
1519
- let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
1520
- if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
1521
- explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
1522
- explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
1523
- }
1524
- explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
1525
- if (plan.operations.length === 0) {
1526
- explanation += "No cleaning operations required.";
1527
- }
1528
- else {
1529
- plan.operations.forEach((op, i) => {
1530
- explanation += `${i + 1}. **${op.type}**: ${op.reason}\n`;
1531
- });
1532
- }
1533
- return {
1534
- content: [{ type: "text", text: explanation }]
1535
- };
1536
- }
1537
- case "custom_clean": {
1538
- const datasetId = String(request.params.arguments?.dataset_id);
1539
- const ops = request.params.arguments?.operations;
1540
- if (!datasetId || datasetId === "undefined") {
1541
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1542
- }
1543
- if (!ops || !Array.isArray(ops) || ops.length === 0) {
1544
- throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
1545
- }
1546
- // Pre-check: verify dataset file exists before starting the job
1547
- const cleanRegEntry = getRegistryEntry(datasetId);
1548
- const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
1549
- const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
1550
- const cleanSafeId = datasetId.replace(/\//g, "_");
1551
- const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
1552
- (cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
1553
- fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
1554
- fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
1555
- fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
1556
- fs.existsSync(datasetId);
1557
- if (!cleanDataExists) {
1558
- return {
1559
- content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
1560
- isError: true,
1561
- };
1562
- }
1563
- const job = jobManager.createJob("clean", 0, { datasetId, ops });
1564
- return {
1565
- content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
1566
- };
1567
- }
1568
- case "prepare_dataset": {
1569
- hydrateExternalKeys();
1570
- const query = String(request.params.arguments?.query);
1571
- const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
1572
- const downloadImages = request.params.arguments?.download_images === true;
1573
- if (!query || query === "undefined") {
1574
- throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
1575
- }
1576
- const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
1577
- return {
1578
- content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
1579
- };
1580
- }
1581
- case "compare_datasets": {
1582
- const datasetIds = request.params.arguments?.dataset_ids;
1583
- const datasets = datasetIds.map(id => metadataStore.getDataset(id)).filter(d => !!d);
1584
- let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
1585
- comparison += "| :--- | " + datasets.map(() => " :---: ").join("|") + " |\n";
1586
- comparison += "| Quality Score | " + datasets.map(d => `${d.quality_score}/100`).join(" | ") + " |\n";
1587
- comparison += "| License | " + datasets.map(d => d.license.category).join(" | ") + " |\n";
1588
- comparison += "| Downloads | " + datasets.map(d => d.downloads).join(" | ") + " |\n";
1589
- comparison += "| Domain | " + datasets.map(d => d.domain).join(" | ") + " |\n";
1590
- return {
1591
- content: [{ type: "text", text: comparison }]
1592
- };
1593
- }
1594
- case "check_job_status": {
1595
- const jobId = String(request.params.arguments?.job_id);
1596
- const job = metadataStore.getJob(jobId);
1597
- if (!job) {
1598
- throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
1599
- }
1600
- const activeStatuses = new Set(["pending", "queued", "running", "retrying"]);
1601
- const now = Date.now();
1602
- const last = jobStatusLastPoll[jobId] || 0;
1603
- const minPollMs = 3000;
1604
- if (activeStatuses.has(job.status) && (now - last) < minPollMs) {
1605
- const waitMs = minPollMs - (now - last);
1885
+ case "configure_kaggle": {
1886
+ const username = String(request.params.arguments?.username || "").trim();
1887
+ const key = String(request.params.arguments?.key || "").trim();
1888
+ if (!username || !key) {
1889
+ throw new McpError(ErrorCode.InvalidParams, "username and key are required");
1890
+ }
1891
+ const r1 = secureKeys.set("kaggle_username", username);
1892
+ const r2 = secureKeys.set("kaggle_key", key);
1893
+ process.env.KAGGLE_USERNAME = username;
1894
+ process.env.KAGGLE_KEY = key;
1606
1895
  return {
1607
- content: [{ type: "text", text: `No significant status change yet. Please poll again in ~${Math.ceil(waitMs / 1000)}s.` }]
1896
+ content: [{ type: "text", text: `Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
1608
1897
  };
1609
1898
  }
1610
- jobStatusLastPoll[jobId] = now;
1611
- return {
1612
- content: [{ type: "text", text: formatJobStatus(job) }]
1613
- };
1614
- }
1615
- case "export_dataset": {
1616
- const datasetId = String(request.params.arguments?.dataset_id);
1617
- const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments?.target_dir) : undefined;
1618
- const requestedFormat = String(request.params.arguments?.format || "feather");
1619
- const fastMode = request.params.arguments?.fast === true;
1620
- const preview = request.params.arguments?.preview === true;
1621
- const sampleRows = request.params.arguments?.sample_rows ? Number(request.params.arguments.sample_rows) : undefined;
1622
- const columns = request.params.arguments?.columns;
1623
- const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
1624
- const dataset = metadataStore.getDataset(datasetId);
1625
- if (!dataset) {
1626
- throw new McpError(ErrorCode.InvalidParams, `Dataset not found: ${datasetId}`);
1627
- }
1628
- // Use Metadata or Registry to find the actual local file
1629
- let sourcePath = undefined;
1630
- const downloadStatus = metadataStore.getDownloadStatus(datasetId);
1631
- if (downloadStatus && fs.existsSync(downloadStatus.local_path)) {
1632
- sourcePath = downloadStatus.local_path;
1633
- }
1634
- else {
1635
- // Fallback to local registry
1636
- const reg = getRegistryEntry(datasetId);
1637
- if (reg && fs.existsSync(reg.local_path)) {
1638
- sourcePath = reg.local_path;
1899
+ case "configure_keys": {
1900
+ const hfToken = String(request.params.arguments?.hf_token || "").trim();
1901
+ const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
1902
+ const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
1903
+ const dataworldToken = String(request.params.arguments?.dataworld_token || "").trim();
1904
+ const saved = [];
1905
+ const methods = [];
1906
+ if (hfToken) {
1907
+ const r = secureKeys.set("hf_token", hfToken);
1908
+ if (r.ok) {
1909
+ process.env.HF_TOKEN = hfToken;
1910
+ saved.push("HF token");
1911
+ if (r.method)
1912
+ methods.push(r.method);
1913
+ }
1639
1914
  }
1640
- }
1641
- if (!sourcePath) {
1642
- console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
1643
- // Start a prepare job for this dataset id (acts like calling prepare_dataset)
1644
- try {
1645
- jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false });
1915
+ if (kaggleUsername) {
1916
+ const r = secureKeys.set("kaggle_username", kaggleUsername);
1917
+ if (r.ok) {
1918
+ process.env.KAGGLE_USERNAME = kaggleUsername;
1919
+ saved.push("Kaggle username");
1920
+ if (r.method)
1921
+ methods.push(r.method);
1922
+ }
1646
1923
  }
1647
- catch (e) {
1648
- console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
1649
- }
1650
- // Poll for download status or registry entry until local_path appears or timeout
1651
- const wait = (ms) => new Promise(res => setTimeout(res, ms));
1652
- const maxWait = 120_000; // 120s
1653
- const interval = 2000;
1654
- let waited = 0;
1655
- while (waited < maxWait) {
1656
- const ds = metadataStore.getDownloadStatus(datasetId);
1657
- if (ds && ds.local_path && fs.existsSync(ds.local_path)) {
1658
- sourcePath = ds.local_path;
1659
- console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
1660
- break;
1924
+ if (kaggleKey) {
1925
+ const r = secureKeys.set("kaggle_key", kaggleKey);
1926
+ if (r.ok) {
1927
+ process.env.KAGGLE_KEY = kaggleKey;
1928
+ saved.push("Kaggle key");
1929
+ if (r.method)
1930
+ methods.push(r.method);
1661
1931
  }
1662
- const reg = getRegistryEntry(datasetId);
1663
- const regPath = reg?.local_path || reg?.path;
1664
- if (regPath && fs.existsSync(regPath)) {
1665
- sourcePath = regPath;
1666
- console.error(`[Export] Local data found in registry for ${datasetId}: ${sourcePath}`);
1667
- break;
1932
+ }
1933
+ if (dataworldToken) {
1934
+ const r = secureKeys.set("dataworld_token", dataworldToken);
1935
+ if (r.ok) {
1936
+ process.env.DW_AUTH_TOKEN = dataworldToken;
1937
+ saved.push("data.world token");
1938
+ if (r.method)
1939
+ methods.push(r.method);
1668
1940
  }
1669
- await wait(interval);
1670
- waited += interval;
1671
1941
  }
1672
- // If still no sourcePath, return helpful error listing prepared datasets
1673
- if (!sourcePath) {
1674
- const entries = readRegistry();
1675
- const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id || e.id || "unknown"}: ${e.local_path || e.path || "unknown"}`).slice(0, 10).join("\n") + (entries.length > 10 ? "\n...and " + (entries.length - 10) + " more" : "");
1942
+ if (saved.length === 0) {
1676
1943
  return {
1677
- content: [{ type: "text", text: `ERROR: No local data found for ${datasetId} after attempting prepare. Check credentials and try running prepare_dataset manually. Prepared datasets:\n${listText}` }],
1678
- isError: true
1944
+ content: [{ type: "text", text: "No keys provided. Core Vesper tools continue to work without API keys." }]
1679
1945
  };
1680
1946
  }
1947
+ return {
1948
+ content: [{ type: "text", text: `Key saved securely. Updated: ${saved.join(", ")}.` }]
1949
+ };
1681
1950
  }
1682
- // If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
1683
- if (!fastMode) {
1684
- const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
1685
- const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
1686
- const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
1687
- if (!pipelineCompatibleInput) {
1688
- console.error(`[Export] Skipping pipeline for ${currentExt} input; using direct exporter conversion.`);
1689
- }
1690
- else if (currentExt !== pipelineFmt) {
1691
- console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
1951
+ case "get_dataset_info": {
1952
+ const datasetId = String(request.params.arguments?.dataset_id);
1953
+ if (!datasetId) {
1954
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1955
+ }
1956
+ const dataset = metadataStore.getDataset(datasetId);
1957
+ if (!dataset) {
1958
+ // Fallback: check the registry for local path info
1959
+ const regEntry = getRegistryEntry(datasetId);
1960
+ const regPath = regEntry?.local_path || regEntry?.path;
1961
+ if (regEntry) {
1962
+ const exists = regPath && fs.existsSync(regPath);
1963
+ return {
1964
+ content: [{ type: "text", text: `**${datasetId}** (from registry)\n- Local path: ${regPath || "unknown"}\n- Status: ${regEntry.status || "unknown"}${exists ? "" : " (file missing)"}\n\nNote: Full metadata not available in metadata store. Use prepare_dataset to get full details.` }],
1965
+ };
1966
+ }
1967
+ return {
1968
+ content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}. Use vesper_list_datasets to see available datasets, or prepare_dataset to add new ones.` }],
1969
+ isError: true,
1970
+ };
1971
+ }
1972
+ // Enrich: if total_examples is 0-ish, try the HF datasets-server /size API
1973
+ if ((!dataset.total_examples || dataset.total_examples === 0) && dataset.source === "huggingface") {
1692
1974
  try {
1693
- const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
1694
- if (pipelineResult.final_output_path) {
1695
- sourcePath = pipelineResult.final_output_path;
1696
- try {
1697
- // Update registry to point to pipeline's final output
1698
- upsertRegistry(datasetId, sourcePath, "completed");
1699
- }
1700
- catch (e) {
1701
- console.error(`[Registry] Failed to update registry for ${datasetId}: ${e?.message || e}`);
1975
+ const sizeResp = await fetch(`https://datasets-server.huggingface.co/size?dataset=${encodeURIComponent(dataset.id)}`);
1976
+ if (sizeResp.ok) {
1977
+ const sizeData = await sizeResp.json();
1978
+ const numRows = sizeData?.size?.dataset?.num_rows;
1979
+ if (numRows && numRows > 0) {
1980
+ dataset.total_examples = numRows;
1981
+ // Also backfill splits
1982
+ if (sizeData?.size?.splits && Array.isArray(sizeData.size.splits)) {
1983
+ dataset.splits = sizeData.size.splits.map((s) => ({
1984
+ name: s.split,
1985
+ num_examples: s.num_rows || 0,
1986
+ size_bytes: s.num_bytes_parquet_files || 0,
1987
+ }));
1988
+ dataset.has_train_split = dataset.splits.some((s) => s.name === "train");
1989
+ dataset.has_test_split = dataset.splits.some((s) => s.name === "test");
1990
+ dataset.has_validation_split = dataset.splits.some((s) => s.name === "validation" || s.name === "val");
1991
+ }
1992
+ // Persist enriched metadata
1993
+ metadataStore.saveDataset(dataset);
1702
1994
  }
1703
1995
  }
1704
1996
  }
1705
- catch (err) {
1706
- console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
1997
+ catch {
1998
+ // Enrichment is best-effort; continue with whatever we have
1707
1999
  }
1708
2000
  }
1709
- }
1710
- else {
1711
- console.error(`[Export] Fast mode - skipping quality analysis and cleaning`);
1712
- }
1713
- // Build export options
1714
- const exportOpts = {};
1715
- if (compression)
1716
- exportOpts.compression = compression;
1717
- if (preview)
1718
- exportOpts.preview = true;
1719
- if (sampleRows)
1720
- exportOpts.sample_rows = sampleRows;
1721
- if (columns)
1722
- exportOpts.columns = columns;
1723
- try {
1724
- // Determine output file name
1725
- const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
1726
- const ext = extMap[requestedFormat] || ".feather";
1727
- const safeName = datasetId.replace(/\//g, "_");
1728
- const outDir = targetDir || path.join(dataRoot, "exports");
1729
- if (!fs.existsSync(outDir))
1730
- fs.mkdirSync(outDir, { recursive: true });
1731
- const outputFile = path.join(outDir, `${safeName}${ext}`);
1732
- const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
1733
- // Build rich response
1734
- let msg = `**Export complete**\n`;
1735
- msg += `- **File**: ${result.output_path}\n`;
1736
- msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
1737
- msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
1738
- if (result.file_size_mb !== undefined)
1739
- msg += `- **Size**: ${result.file_size_mb} MB\n`;
1740
- if (result.elapsed_seconds !== undefined)
1741
- msg += `- **Time**: ${result.elapsed_seconds}s\n`;
1742
- if (result.preview_path)
1743
- msg += `- **Preview**: ${result.preview_path}\n`;
1744
- msg += `\n`;
1745
- if (requestedFormat === "feather") {
1746
- msg += `**Inspect with:**\n`;
1747
- msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
1748
- msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
1749
- }
1750
- else if (requestedFormat === "parquet") {
1751
- msg += `**Inspect with:**\n`;
1752
- msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
1753
- msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
1754
- }
1755
- return { content: [{ type: "text", text: msg }] };
1756
- }
1757
- catch (error) {
2001
+ const formattedOutput = formatDatasetInfo(dataset);
2002
+ return { content: [{ type: "text", text: formattedOutput }] };
2003
+ }
2004
+ case "analyze_quality": {
2005
+ const datasetId = String(request.params.arguments?.dataset_id);
2006
+ const safeId = toSafeDatasetPathFragment(datasetId);
2007
+ const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
2008
+ const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
2009
+ let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
2010
+ // Demo Fallback for easy testing
2011
+ if (datasetId === "demo" || !fs.existsSync(filePath)) {
2012
+ const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
2013
+ const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
2014
+ if (fs.existsSync(demoParquetPath)) {
2015
+ filePath = demoParquetPath;
2016
+ }
2017
+ else if (fs.existsSync(demoCsvPath)) {
2018
+ filePath = demoCsvPath;
2019
+ }
2020
+ else if (datasetId !== "demo") {
2021
+ return {
2022
+ content: [{ type: "text", text: `ERROR: Local data file for ${datasetId} not found. Try running prepare_dataset first, or use 'demo' as the dataset_id.` }],
2023
+ isError: true
2024
+ };
2025
+ }
2026
+ }
2027
+ const report = await qualityAnalyzer.analyze(filePath);
1758
2028
  return {
1759
- content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
1760
- isError: true
2029
+ content: [{ type: "text", text: JSON.stringify(report, null, 2) }]
1761
2030
  };
1762
2031
  }
1763
- }
1764
- case "fuse_datasets": {
1765
- const rawSources = request.params.arguments?.sources;
1766
- if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
1767
- throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
1768
- }
1769
- const strategy = request.params.arguments?.strategy || "concat";
1770
- const joinOn = request.params.arguments?.join_on;
1771
- const how = request.params.arguments?.how || "inner";
1772
- const dedup = request.params.arguments?.dedup !== false;
1773
- const runQualityAfter = request.params.arguments?.run_quality_after !== false;
1774
- const leakageCheck = request.params.arguments?.leakage_check !== false;
1775
- const outputFormat = request.params.arguments?.output_format || "feather";
1776
- const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
1777
- const preview = request.params.arguments?.preview !== false;
1778
- const resolvedPaths = [];
1779
- const unresolved = [];
1780
- for (const src of rawSources) {
1781
- if (fs.existsSync(src)) {
1782
- resolvedPaths.push(src);
1783
- continue;
2032
+ case "preview_cleaning": {
2033
+ const datasetId = String(request.params.arguments?.dataset_id);
2034
+ const safeId = toSafeDatasetPathFragment(datasetId);
2035
+ const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
2036
+ const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
2037
+ let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
2038
+ if (datasetId === "demo" || !fs.existsSync(filePath)) {
2039
+ const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
2040
+ const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
2041
+ if (fs.existsSync(demoParquetPath)) {
2042
+ filePath = demoParquetPath;
2043
+ }
2044
+ else if (fs.existsSync(demoCsvPath)) {
2045
+ filePath = demoCsvPath;
2046
+ }
2047
+ else {
2048
+ throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}. Please run prepare_dataset first.`);
2049
+ }
1784
2050
  }
1785
- const status = metadataStore.getDownloadStatus(src);
1786
- if (status?.local_path && fs.existsSync(status.local_path)) {
1787
- resolvedPaths.push(status.local_path);
1788
- continue;
2051
+ const report = await qualityAnalyzer.analyze(filePath);
2052
+ // Phase 1: Target Detection
2053
+ // We use the same TargetDetector instance inside CleaningPlanner now?
2054
+ // Actually, we instantiated it inside CleaningPlanner, so we just need to pass the file path
2055
+ // OR let the planner handle it if we update its signature to accept filePath.
2056
+ // Let's check `CleaningPlanner.generatePlan` signature again.
2057
+ // We updated it to accept `targetInfo`.
2058
+ // So we need to run detection HERE and pass it.
2059
+ // But `TargetDetector` is not exposed in `index.ts` scope yet.
2060
+ // Let's create a global instance or use the one inside planner if exposed (it's private).
2061
+ // Better approach: Instantiate TargetDetector here in index.ts for the tool content.
2062
+ // Quick fix: Instantiate local detector or make global.
2063
+ // I'll make a global `targetDetector` constant in index.ts
2064
+ // But wait, I updated `CleaningPlanner` to instantiate its own detector.
2065
+ // Does `CleaningPlanner` use it? No, I commented out the logic because it needed `filePath`.
2066
+ // RETRY STRATEGY:
2067
+ // 1. Instantiate `targetDetector` in `index.ts`.
2068
+ // 2. Run `detectTarget(filePath)`.
2069
+ // 3. Pass result to `cleaningPlanner.generatePlan(..., targetInfo)`.
2070
+ // I need to add `const targetDetector = new TargetDetector(__dirname);` to imports/init section first.
2071
+ // But since I'm in this tool, I can't look back.
2072
+ // I will assume I can add it, or just do it inside the case for now.
2073
+ // To do it properly, I should have added `targetDetector` to the global scope in previous step.
2074
+ // Let's do that in a separate step if needed.
2075
+ // For now, I'll instantiate it here.
2076
+ const { TargetDetector } = await import("./preparation/target-detector.js");
2077
+ const detector = new TargetDetector(__dirname);
2078
+ const targetResult = await detector.detectTarget(filePath);
2079
+ const targetInfo = targetResult.target_column ? {
2080
+ target: targetResult.target_column,
2081
+ confidence: targetResult.confidence
2082
+ } : undefined;
2083
+ const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
2084
+ let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
2085
+ if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
2086
+ explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
2087
+ explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
2088
+ }
2089
+ explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
2090
+ if (plan.operations.length === 0) {
2091
+ explanation += "No cleaning operations required.";
2092
+ }
2093
+ else {
2094
+ plan.operations.forEach((op, i) => {
2095
+ explanation += `${i + 1}. **${op.type}**: ${op.reason}\n`;
2096
+ });
1789
2097
  }
1790
- unresolved.push(src);
1791
- }
1792
- if (unresolved.length > 0) {
1793
2098
  return {
1794
- content: [{
1795
- type: "text",
1796
- text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
1797
- }],
1798
- isError: true
2099
+ content: [{ type: "text", text: explanation }]
1799
2100
  };
1800
2101
  }
1801
- try {
1802
- const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
1803
- const ext = extMap[outputFormat] || ".feather";
1804
- const outDir = path.join(dataRoot, "fusion");
1805
- if (!fs.existsSync(outDir))
1806
- fs.mkdirSync(outDir, { recursive: true });
1807
- const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
1808
- const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
1809
- strategy,
1810
- join_on: joinOn,
1811
- how,
1812
- dedup,
1813
- run_quality_after: runQualityAfter,
1814
- leakage_check: leakageCheck,
1815
- output_format: outputFormat,
1816
- compression: compression,
1817
- preview,
1818
- });
1819
- const nullDelta = result.stats.null_delta;
1820
- const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
1821
- // Register fused dataset under a generated id so users can export it easily
1822
- const fusedId = `fused_${Date.now()}`;
1823
- try {
1824
- upsertRegistry(fusedId, result.output_path, "completed");
2102
+ case "custom_clean": {
2103
+ const datasetId = String(request.params.arguments?.dataset_id);
2104
+ const ops = request.params.arguments?.operations;
2105
+ if (!datasetId || datasetId === "undefined") {
2106
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1825
2107
  }
1826
- catch (e) {
1827
- console.error(`[Registry] Failed to register fused dataset ${fusedId}: ${e?.message || e}`);
1828
- }
1829
- let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
1830
- msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
1831
- msg += `- Null change: ${nullText}\n`;
1832
- msg += `- Output: ${result.output_path}\n`;
1833
- if (result.preview_path)
1834
- msg += `- Preview: ${result.preview_path}\n`;
1835
- if (result.leakage_report) {
1836
- msg += `- Leakage: ${result.leakage_report.leakage_detected ? "detected" : "none"}`;
1837
- if (result.leakage_report.leakage_count) {
1838
- msg += ` (${result.leakage_report.leakage_count})`;
1839
- }
1840
- msg += "\n";
2108
+ if (!ops || !Array.isArray(ops) || ops.length === 0) {
2109
+ throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
1841
2110
  }
1842
- msg += `\nNext: run split_dataset/export_dataset on fused output. Registered fused dataset id: ${fusedId}`;
1843
- return { content: [{ type: "text", text: msg }] };
1844
- }
1845
- catch (error) {
2111
+ // Pre-check: verify dataset file exists before starting the job
2112
+ const cleanRegEntry = getRegistryEntry(datasetId);
2113
+ const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
2114
+ const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
2115
+ const cleanSafeId = toSafeDatasetPathFragment(datasetId);
2116
+ const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
2117
+ (cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
2118
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
2119
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
2120
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
2121
+ fs.existsSync(datasetId);
2122
+ if (!cleanDataExists) {
2123
+ return {
2124
+ content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
2125
+ isError: true,
2126
+ };
2127
+ }
2128
+ const job = jobManager.createJob("clean", 0, { datasetId, ops });
1846
2129
  return {
1847
- content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
1848
- isError: true
2130
+ content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
1849
2131
  };
1850
2132
  }
1851
- }
1852
- case "analyze_image_quality": {
1853
- const inputPath = String(request.params.arguments?.path);
1854
- if (!fs.existsSync(inputPath)) {
1855
- throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
1856
- }
1857
- try {
1858
- const report = await imageAnalyzer.analyze(inputPath);
1859
- let output = `## Image Quality Report\n\n`;
1860
- output += `- **Total Images**: ${report.total_images}\n`;
1861
- output += `- **Corrupted**: ${report.corrupted_count}\n`;
1862
- output += `- **Average Resolution**: ${Math.round(report.average_width)}x${Math.round(report.average_height)}\n`;
1863
- output += `- **Blurry Images**: ${report.blurry_count}\n\n`;
1864
- if (report.individual_results.length > 0) {
1865
- output += `### Sample Detail (Top 5)\n`;
1866
- report.individual_results.slice(0, 5).forEach(img => {
1867
- const statusLabel = img.status === "ok" ? (img.is_blurry ? "[warn]" : "[ok]") : "[error]";
1868
- output += `${statusLabel} **${img.filename}**: ${img.width}x${img.height} | Mode: ${img.mode} | Blur: ${img.blur_score?.toFixed(1)}\n`;
1869
- });
2133
+ case "prepare_dataset": {
2134
+ hydrateExternalKeys();
2135
+ const query = String(request.params.arguments?.query);
2136
+ const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
2137
+ const downloadImages = request.params.arguments?.download_images === true;
2138
+ const requestedOutputDir = request.params.arguments?.target_dir
2139
+ ? String(request.params.arguments.target_dir).trim()
2140
+ : request.params.arguments?.output_dir
2141
+ ? String(request.params.arguments.output_dir).trim()
2142
+ : "";
2143
+ const outputDir = requestedOutputDir || process.cwd();
2144
+ if (!query || query === "undefined") {
2145
+ throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
1870
2146
  }
2147
+ const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages, outputDir });
1871
2148
  return {
1872
- content: [{ type: "text", text: output }]
2149
+ content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
1873
2150
  };
1874
2151
  }
1875
- catch (error) {
2152
+ case "compare_datasets": {
2153
+ const datasetIds = request.params.arguments?.dataset_ids;
2154
+ const datasets = datasetIds.map(id => metadataStore.getDataset(id)).filter(d => !!d);
2155
+ let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
2156
+ comparison += "| :--- | " + datasets.map(() => " :---: ").join("|") + " |\n";
2157
+ comparison += "| Quality Score | " + datasets.map(d => `${d.quality_score}/100`).join(" | ") + " |\n";
2158
+ comparison += "| License | " + datasets.map(d => d.license.category).join(" | ") + " |\n";
2159
+ comparison += "| Downloads | " + datasets.map(d => d.downloads).join(" | ") + " |\n";
2160
+ comparison += "| Domain | " + datasets.map(d => d.domain).join(" | ") + " |\n";
1876
2161
  return {
1877
- content: [{ type: "text", text: `ERROR: Image analysis failed: ${error.message}` }],
1878
- isError: true
2162
+ content: [{ type: "text", text: comparison }]
1879
2163
  };
1880
2164
  }
1881
- }
1882
- case "analyze_media_quality": {
1883
- const inputPath = String(request.params.arguments?.path);
1884
- if (!fs.existsSync(inputPath)) {
1885
- throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
2165
+ case "check_job_status": {
2166
+ const jobId = String(request.params.arguments?.job_id);
2167
+ const job = metadataStore.getJob(jobId);
2168
+ if (!job) {
2169
+ throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
2170
+ }
2171
+ const activeStatuses = new Set(["pending", "queued", "running", "retrying"]);
2172
+ const now = Date.now();
2173
+ const last = jobStatusLastPoll[jobId] || 0;
2174
+ const minPollMs = 3000;
2175
+ if (activeStatuses.has(job.status) && (now - last) < minPollMs) {
2176
+ const waitMs = minPollMs - (now - last);
2177
+ return {
2178
+ content: [{ type: "text", text: `No significant status change yet. Please poll again in ~${Math.ceil(waitMs / 1000)}s.` }]
2179
+ };
2180
+ }
2181
+ jobStatusLastPoll[jobId] = now;
2182
+ return {
2183
+ content: [{ type: "text", text: formatJobStatus(job) }]
2184
+ };
1886
2185
  }
1887
- try {
1888
- const report = await mediaAnalyzer.analyze(inputPath);
1889
- let output = `## Media Quality Report\n\n`;
1890
- output += `- **Total Files**: ${report.total_files}\n`;
1891
- output += `- **OK Files**: ${report.ok_files}\n`;
1892
- output += `- **Failed Files**: ${report.failed_files}\n`;
1893
- if ('avg_audio_duration' in report && report.avg_audio_duration) {
1894
- output += `- **Average Audio Duration**: ${report.avg_audio_duration.toFixed(2)}s\n`;
1895
- }
1896
- if ('avg_video_duration' in report && report.avg_video_duration) {
1897
- output += `- **Average Video Duration**: ${report.avg_video_duration.toFixed(2)}s\n`;
1898
- output += `- **Average FPS**: ${report.avg_fps?.toFixed(1)}\n`;
1899
- }
1900
- output += `\n### Sample Detail (Top 5)\n`;
1901
- report.details.slice(0, 5).forEach(item => {
1902
- const statusLabel = item.status === "ok" ? "[ok]" : "[error]";
1903
- if (item.type === "audio" && 'sample_rate' in item) {
1904
- output += `${statusLabel} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
2186
+ case "export_dataset": {
2187
+ const datasetId = String(request.params.arguments?.dataset_id);
2188
+ const isDirectLocalInput = isDirectLocalDatasetReference(datasetId);
2189
+ const requestedTargetDir = request.params.arguments?.target_dir
2190
+ ? String(request.params.arguments?.target_dir).trim()
2191
+ : request.params.arguments?.output_dir
2192
+ ? String(request.params.arguments?.output_dir).trim()
2193
+ : "";
2194
+ const targetDir = path.resolve(requestedTargetDir || process.cwd());
2195
+ const requestedFormat = String(request.params.arguments?.format || "feather");
2196
+ const fastMode = request.params.arguments?.fast === true;
2197
+ const preview = request.params.arguments?.preview === true;
2198
+ const sampleRows = request.params.arguments?.sample_rows ? Number(request.params.arguments.sample_rows) : undefined;
2199
+ const columns = request.params.arguments?.columns;
2200
+ const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
2201
+ // Use Metadata or Registry to find the actual local file
2202
+ const preferredLookupDirs = [targetDir, process.cwd()];
2203
+ let sourcePath = resolveDatasetLocalPath(datasetId, preferredLookupDirs);
2204
+ if (!sourcePath) {
2205
+ console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
2206
+ // Start a prepare job for this dataset id (acts like calling prepare_dataset)
2207
+ try {
2208
+ jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false, outputDir: process.cwd() });
1905
2209
  }
1906
- else if (item.type === "video" && 'width' in item) {
1907
- output += `${statusLabel} **${item.filename}**: ${item.width}x${item.height} | ${item.fps?.toFixed(1)}fps | Risk: ${item.corruption_risk}\n`;
2210
+ catch (e) {
2211
+ console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
1908
2212
  }
1909
- else {
1910
- output += `${statusLabel} **${item.filename}**: ${item.error}\n`;
2213
+ // Poll for download status or registry entry until local_path appears or timeout
2214
+ const wait = (ms) => new Promise(res => setTimeout(res, ms));
2215
+ const maxWait = 120_000; // 120s
2216
+ const interval = 2000;
2217
+ let waited = 0;
2218
+ while (waited < maxWait) {
2219
+ const resolved = resolveDatasetLocalPath(datasetId, preferredLookupDirs);
2220
+ if (resolved) {
2221
+ sourcePath = resolved;
2222
+ console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
2223
+ break;
2224
+ }
2225
+ await wait(interval);
2226
+ waited += interval;
2227
+ }
2228
+ // If still no sourcePath, return helpful error listing prepared datasets
2229
+ if (!sourcePath) {
2230
+ const entries = readRegistry();
2231
+ const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id || e.id || "unknown"}: ${e.local_path || e.path || "unknown"}`).slice(0, 10).join("\n") + (entries.length > 10 ? "\n...and " + (entries.length - 10) + " more" : "");
2232
+ return {
2233
+ content: [{ type: "text", text: `ERROR: No local data found for ${datasetId} after attempting prepare. Check credentials and try running prepare_dataset manually. Prepared datasets:\n${listText}` }],
2234
+ isError: true
2235
+ };
2236
+ }
2237
+ }
2238
+ sourcePath = ensureExportableLocalPath(sourcePath);
2239
+ try {
2240
+ if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
2241
+ upsertRegistry(datasetId, sourcePath, "completed");
1911
2242
  }
2243
+ }
2244
+ catch (e) {
2245
+ console.error(`[Registry] Failed to normalize registry path for ${datasetId}: ${e?.message || e}`);
2246
+ }
2247
+ // If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
2248
+ if (!fastMode) {
2249
+ const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
2250
+ const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
2251
+ const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
2252
+ if (!pipelineCompatibleInput) {
2253
+ console.error(`[Export] Skipping pipeline for ${currentExt} input; using direct exporter conversion.`);
2254
+ }
2255
+ else if (currentExt !== pipelineFmt) {
2256
+ console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
2257
+ try {
2258
+ sourcePath = ensureLocalPipelineSource(sourcePath, datasetId, targetDir);
2259
+ const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
2260
+ if (pipelineResult.final_output_path) {
2261
+ sourcePath = pipelineResult.final_output_path;
2262
+ try {
2263
+ // Update registry to point to pipeline's final output
2264
+ if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
2265
+ upsertRegistry(datasetId, sourcePath, "completed");
2266
+ }
2267
+ }
2268
+ catch (e) {
2269
+ console.error(`[Registry] Failed to update registry for ${datasetId}: ${e?.message || e}`);
2270
+ }
2271
+ }
2272
+ }
2273
+ catch (err) {
2274
+ console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
2275
+ }
2276
+ }
2277
+ }
2278
+ else {
2279
+ console.error(`[Export] Fast mode - skipping quality analysis and cleaning`);
2280
+ }
2281
+ // Build export options
2282
+ const exportOpts = {};
2283
+ if (compression)
2284
+ exportOpts.compression = compression;
2285
+ if (preview)
2286
+ exportOpts.preview = true;
2287
+ if (sampleRows)
2288
+ exportOpts.sample_rows = sampleRows;
2289
+ if (columns)
2290
+ exportOpts.columns = columns;
2291
+ try {
2292
+ // Determine output file name
2293
+ const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
2294
+ const ext = extMap[requestedFormat] || ".feather";
2295
+ const safeName = getExportFileStem(datasetId);
2296
+ const outDir = targetDir;
2297
+ if (!fs.existsSync(outDir))
2298
+ fs.mkdirSync(outDir, { recursive: true });
2299
+ const outputFile = path.join(outDir, `${safeName}${ext}`);
2300
+ const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
2301
+ // Build rich response
2302
+ let msg = `**Export complete**\n`;
2303
+ msg += `- **File**: ${result.output_path}\n`;
2304
+ msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
2305
+ msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
2306
+ if (result.file_size_mb !== undefined)
2307
+ msg += `- **Size**: ${result.file_size_mb} MB\n`;
2308
+ if (result.elapsed_seconds !== undefined)
2309
+ msg += `- **Time**: ${result.elapsed_seconds}s\n`;
2310
+ if (result.preview_path)
2311
+ msg += `- **Preview**: ${result.preview_path}\n`;
2312
+ msg += `\n`;
2313
+ if (requestedFormat === "feather") {
2314
+ msg += `**Inspect with:**\n`;
2315
+ msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
2316
+ msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
2317
+ }
2318
+ else if (requestedFormat === "parquet") {
2319
+ msg += `**Inspect with:**\n`;
2320
+ msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
2321
+ msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
2322
+ }
2323
+ return { content: [{ type: "text", text: msg }] };
2324
+ }
2325
+ catch (error) {
2326
+ return {
2327
+ content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
2328
+ isError: true
2329
+ };
2330
+ }
2331
+ }
2332
+ case "vesper_list_datasets": {
2333
+ const entries = readRegistry();
2334
+ if (entries.length === 0) {
2335
+ return {
2336
+ content: [{ type: "text", text: "No prepared datasets found in the Vesper registry. Use prepare_dataset or download_dataset to add datasets." }]
2337
+ };
2338
+ }
2339
+ const lines = entries.map((e, i) => {
2340
+ const id = e.dataset_id || e.id || "unknown";
2341
+ const localPath = e.local_path || e.path || "unknown";
2342
+ const exists = typeof localPath === "string" && localPath !== "unknown" && fs.existsSync(localPath);
2343
+ return `${i + 1}. **${id}**\n Path: ${localPath}\n Status: ${e.status || "unknown"}${exists ? "" : " (file missing)"}`;
1912
2344
  });
1913
2345
  return {
1914
- content: [{ type: "text", text: output }]
2346
+ content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
1915
2347
  };
1916
2348
  }
1917
- catch (error) {
1918
- return {
1919
- content: [{ type: "text", text: `ERROR: Media analysis failed: ${error.message}` }],
1920
- isError: true
1921
- };
2349
+ case "vesper_convert_format": {
2350
+ const filePath = String(request.params.arguments?.file_path || "").trim();
2351
+ const targetFormat = String(request.params.arguments?.target_format || "").trim().toLowerCase();
2352
+ if (!filePath) {
2353
+ throw new McpError(ErrorCode.InvalidParams, "file_path is required");
2354
+ }
2355
+ if (!["csv", "parquet", "json", "jsonl"].includes(targetFormat)) {
2356
+ throw new McpError(ErrorCode.InvalidParams, "target_format must be one of: csv, parquet, json, jsonl");
2357
+ }
2358
+ if (!fs.existsSync(filePath)) {
2359
+ return {
2360
+ content: [{ type: "text", text: `ERROR: File not found: ${filePath}` }],
2361
+ isError: true,
2362
+ };
2363
+ }
2364
+ const inputExt = path.extname(filePath).toLowerCase();
2365
+ const extMap = { csv: ".csv", parquet: ".parquet", json: ".json", jsonl: ".jsonl" };
2366
+ const outputExt = extMap[targetFormat];
2367
+ if (inputExt === outputExt) {
2368
+ return {
2369
+ content: [{ type: "text", text: `File is already in ${targetFormat} format: ${filePath}` }],
2370
+ };
2371
+ }
2372
+ const outputPath = filePath.replace(/\.[^.]+$/, outputExt);
2373
+ try {
2374
+ await ensurePythonModules([{ module: "polars", packageName: "polars" }]);
2375
+ const convertScript = path.join(dataRoot, "python", "convert_engine.py");
2376
+ const result = await runPythonJson(convertScript, [filePath, outputPath]);
2377
+ if (!result.ok) {
2378
+ return {
2379
+ content: [{ type: "text", text: `ERROR: Conversion failed: ${result.error}` }],
2380
+ isError: true,
2381
+ };
2382
+ }
2383
+ // Register converted file in the registry
2384
+ const datasetId = path.basename(outputPath, outputExt);
2385
+ try {
2386
+ upsertRegistry(datasetId, outputPath, "completed");
2387
+ }
2388
+ catch (e) {
2389
+ console.error(`[Convert] Registry write failed: ${e?.message || e}`);
2390
+ }
2391
+ let msg = `**Conversion complete**\n`;
2392
+ msg += `- **Input**: ${filePath} (${inputExt.slice(1)})\n`;
2393
+ msg += `- **Output**: ${result.output_path} (${targetFormat})\n`;
2394
+ msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
2395
+ if (result.size_mb !== undefined)
2396
+ msg += `- **Size**: ${result.size_mb} MB\n`;
2397
+ return { content: [{ type: "text", text: msg }] };
2398
+ }
2399
+ catch (error) {
2400
+ return {
2401
+ content: [{ type: "text", text: `ERROR: Conversion failed: ${error.message}` }],
2402
+ isError: true,
2403
+ };
2404
+ }
1922
2405
  }
1923
- }
1924
- case "generate_quality_report": {
1925
- const datasetId = String(request.params.arguments?.dataset_id);
1926
- const datasetPath = String(request.params.arguments?.dataset_path);
1927
- if (!fs.existsSync(datasetPath)) {
1928
- throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
2406
+ case "fuse_datasets": {
2407
+ const rawSources = request.params.arguments?.sources;
2408
+ if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
2409
+ throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
2410
+ }
2411
+ const strategy = request.params.arguments?.strategy || "concat";
2412
+ const joinOn = request.params.arguments?.join_on;
2413
+ const how = request.params.arguments?.how || "inner";
2414
+ const dedup = request.params.arguments?.dedup !== false;
2415
+ const runQualityAfter = request.params.arguments?.run_quality_after !== false;
2416
+ const leakageCheck = request.params.arguments?.leakage_check !== false;
2417
+ const outputFormat = request.params.arguments?.output_format || "feather";
2418
+ const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
2419
+ const preview = request.params.arguments?.preview !== false;
2420
+ const resolvedPaths = [];
2421
+ const unresolved = [];
2422
+ for (const src of rawSources) {
2423
+ if (fs.existsSync(src)) {
2424
+ resolvedPaths.push(src);
2425
+ continue;
2426
+ }
2427
+ const status = metadataStore.getDownloadStatus(src);
2428
+ if (status?.local_path && fs.existsSync(status.local_path)) {
2429
+ resolvedPaths.push(status.local_path);
2430
+ continue;
2431
+ }
2432
+ unresolved.push(src);
2433
+ }
2434
+ if (unresolved.length > 0) {
2435
+ return {
2436
+ content: [{
2437
+ type: "text",
2438
+ text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
2439
+ }],
2440
+ isError: true
2441
+ };
2442
+ }
2443
+ try {
2444
+ const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
2445
+ const ext = extMap[outputFormat] || ".feather";
2446
+ const outDir = process.cwd();
2447
+ if (!fs.existsSync(outDir))
2448
+ fs.mkdirSync(outDir, { recursive: true });
2449
+ const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
2450
+ console.error(`[Fusion] Resolved output directory: ${outDir}`);
2451
+ const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
2452
+ strategy,
2453
+ join_on: joinOn,
2454
+ how,
2455
+ dedup,
2456
+ run_quality_after: runQualityAfter,
2457
+ leakage_check: leakageCheck,
2458
+ output_format: outputFormat,
2459
+ compression: compression,
2460
+ preview,
2461
+ });
2462
+ const nullDelta = result.stats.null_delta;
2463
+ const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
2464
+ // Register fused dataset under a generated id so users can export it easily
2465
+ const fusedId = `fused_${Date.now()}`;
2466
+ try {
2467
+ upsertRegistry(fusedId, result.output_path, "completed");
2468
+ }
2469
+ catch (e) {
2470
+ console.error(`[Registry] Failed to register fused dataset ${fusedId}: ${e?.message || e}`);
2471
+ }
2472
+ let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
2473
+ msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
2474
+ msg += `- Null change: ${nullText}\n`;
2475
+ msg += `- Output: ${result.output_path}\n`;
2476
+ if (result.preview_path)
2477
+ msg += `- Preview: ${result.preview_path}\n`;
2478
+ if (result.leakage_report) {
2479
+ msg += `- Leakage: ${result.leakage_report.leakage_detected ? "detected" : "none"}`;
2480
+ if (result.leakage_report.leakage_count) {
2481
+ msg += ` (${result.leakage_report.leakage_count})`;
2482
+ }
2483
+ msg += "\n";
2484
+ }
2485
+ msg += `\nNext: run split_dataset/export_dataset on fused output. Registered fused dataset id: ${fusedId}`;
2486
+ return { content: [{ type: "text", text: msg }] };
2487
+ }
2488
+ catch (error) {
2489
+ return {
2490
+ content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
2491
+ isError: true
2492
+ };
2493
+ }
1929
2494
  }
1930
- try {
1931
- // Optionally load text quality from metadata if available
1932
- const metadata = await metadataStore.getDataset(datasetId);
1933
- // TODO: Integrate text quality analysis when available
1934
- const textQuality = null;
1935
- const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
1936
- // Save report to metadata
1937
- if (metadata) {
1938
- metadata.unified_quality_report = report;
1939
- await metadataStore.saveDataset(metadata);
1940
- }
1941
- let output = `# Unified Quality Report\n\n`;
1942
- output += `**Dataset**: ${datasetId}\n`;
1943
- output += `**Modalities**: ${report.modalities.join(", ")}\n`;
1944
- output += `**Overall Quality Score**: ${report.overall_quality_score}/100\n\n`;
1945
- if (report.text_quality) {
1946
- output += `## Text Quality\n`;
1947
- output += `- Rows: ${report.text_quality.row_count}\n`;
1948
- output += `- Columns: ${report.text_quality.column_count}\n`;
1949
- output += `- Missing: ${report.text_quality.missing_percentage.toFixed(1)}%\n`;
1950
- output += `- Duplicates: ${report.text_quality.duplicate_percentage.toFixed(1)}%\n\n`;
1951
- }
1952
- if (report.image_quality) {
1953
- output += `## Image Quality\n`;
1954
- output += `- Total Images: ${report.image_quality.total_images}\n`;
1955
- output += `- Corrupted: ${report.image_quality.corrupted_count}\n`;
1956
- output += `- Avg Resolution: ${report.image_quality.avg_resolution}\n`;
1957
- output += `- Blurry: ${report.image_quality.blurry_percentage.toFixed(1)}%\n\n`;
1958
- }
1959
- if (report.audio_quality) {
1960
- output += `## Audio Quality\n`;
1961
- output += `- Total Files: ${report.audio_quality.total_files}\n`;
1962
- output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
1963
- output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
1964
- output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
1965
- }
1966
- if (report.video_quality) {
1967
- output += `## Video Quality\n`;
1968
- output += `- Total Files: ${report.video_quality.total_files}\n`;
1969
- output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
1970
- output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
1971
- output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
1972
- }
1973
- output += `## Recommendations\n`;
1974
- report.recommendations.forEach(rec => {
1975
- output += `- ${rec}\n`;
1976
- });
1977
- return {
1978
- content: [{ type: "text", text: output }]
1979
- };
2495
+ case "analyze_image_quality": {
2496
+ const inputPath = String(request.params.arguments?.path);
2497
+ if (!fs.existsSync(inputPath)) {
2498
+ throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
2499
+ }
2500
+ try {
2501
+ const report = await imageAnalyzer.analyze(inputPath);
2502
+ let output = `## Image Quality Report\n\n`;
2503
+ output += `- **Total Images**: ${report.total_images}\n`;
2504
+ output += `- **Corrupted**: ${report.corrupted_count}\n`;
2505
+ output += `- **Average Resolution**: ${Math.round(report.average_width)}x${Math.round(report.average_height)}\n`;
2506
+ output += `- **Blurry Images**: ${report.blurry_count}\n\n`;
2507
+ if (report.individual_results.length > 0) {
2508
+ output += `### Sample Detail (Top 5)\n`;
2509
+ report.individual_results.slice(0, 5).forEach(img => {
2510
+ const statusLabel = img.status === "ok" ? (img.is_blurry ? "[warn]" : "[ok]") : "[error]";
2511
+ output += `${statusLabel} **${img.filename}**: ${img.width}x${img.height} | Mode: ${img.mode} | Blur: ${img.blur_score?.toFixed(1)}\n`;
2512
+ });
2513
+ }
2514
+ return {
2515
+ content: [{ type: "text", text: output }]
2516
+ };
2517
+ }
2518
+ catch (error) {
2519
+ return {
2520
+ content: [{ type: "text", text: `ERROR: Image analysis failed: ${error.message}` }],
2521
+ isError: true
2522
+ };
2523
+ }
1980
2524
  }
1981
- catch (error) {
1982
- return {
1983
- content: [{ type: "text", text: `ERROR: Quality report generation failed: ${error.message}` }],
1984
- isError: true
1985
- };
2525
+ case "analyze_media_quality": {
2526
+ const inputPath = String(request.params.arguments?.path);
2527
+ if (!fs.existsSync(inputPath)) {
2528
+ throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
2529
+ }
2530
+ try {
2531
+ const report = await mediaAnalyzer.analyze(inputPath);
2532
+ let output = `## Media Quality Report\n\n`;
2533
+ output += `- **Total Files**: ${report.total_files}\n`;
2534
+ output += `- **OK Files**: ${report.ok_files}\n`;
2535
+ output += `- **Failed Files**: ${report.failed_files}\n`;
2536
+ if ('avg_audio_duration' in report && report.avg_audio_duration) {
2537
+ output += `- **Average Audio Duration**: ${report.avg_audio_duration.toFixed(2)}s\n`;
2538
+ }
2539
+ if ('avg_video_duration' in report && report.avg_video_duration) {
2540
+ output += `- **Average Video Duration**: ${report.avg_video_duration.toFixed(2)}s\n`;
2541
+ output += `- **Average FPS**: ${report.avg_fps?.toFixed(1)}\n`;
2542
+ }
2543
+ output += `\n### Sample Detail (Top 5)\n`;
2544
+ report.details.slice(0, 5).forEach(item => {
2545
+ const statusLabel = item.status === "ok" ? "[ok]" : "[error]";
2546
+ if (item.type === "audio" && 'sample_rate' in item) {
2547
+ output += `${statusLabel} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
2548
+ }
2549
+ else if (item.type === "video" && 'width' in item) {
2550
+ output += `${statusLabel} **${item.filename}**: ${item.width}x${item.height} | ${item.fps?.toFixed(1)}fps | Risk: ${item.corruption_risk}\n`;
2551
+ }
2552
+ else {
2553
+ output += `${statusLabel} **${item.filename}**: ${item.error}\n`;
2554
+ }
2555
+ });
2556
+ return {
2557
+ content: [{ type: "text", text: output }]
2558
+ };
2559
+ }
2560
+ catch (error) {
2561
+ return {
2562
+ content: [{ type: "text", text: `ERROR: Media analysis failed: ${error.message}` }],
2563
+ isError: true
2564
+ };
2565
+ }
2566
+ }
2567
+ case "generate_quality_report": {
2568
+ const datasetId = String(request.params.arguments?.dataset_id);
2569
+ const datasetPath = String(request.params.arguments?.dataset_path);
2570
+ if (!fs.existsSync(datasetPath)) {
2571
+ throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
2572
+ }
2573
+ try {
2574
+ // Optionally load text quality from metadata if available
2575
+ const metadata = await metadataStore.getDataset(datasetId);
2576
+ // TODO: Integrate text quality analysis when available
2577
+ const textQuality = null;
2578
+ const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
2579
+ // Save report to metadata
2580
+ if (metadata) {
2581
+ metadata.unified_quality_report = report;
2582
+ await metadataStore.saveDataset(metadata);
2583
+ }
2584
+ let output = `# Unified Quality Report\n\n`;
2585
+ output += `**Dataset**: ${datasetId}\n`;
2586
+ output += `**Modalities**: ${report.modalities.join(", ")}\n`;
2587
+ output += `**Overall Quality Score**: ${report.overall_quality_score}/100\n\n`;
2588
+ if (report.text_quality) {
2589
+ output += `## Text Quality\n`;
2590
+ output += `- Rows: ${report.text_quality.row_count}\n`;
2591
+ output += `- Columns: ${report.text_quality.column_count}\n`;
2592
+ output += `- Missing: ${report.text_quality.missing_percentage.toFixed(1)}%\n`;
2593
+ output += `- Duplicates: ${report.text_quality.duplicate_percentage.toFixed(1)}%\n\n`;
2594
+ }
2595
+ if (report.image_quality) {
2596
+ output += `## Image Quality\n`;
2597
+ output += `- Total Images: ${report.image_quality.total_images}\n`;
2598
+ output += `- Corrupted: ${report.image_quality.corrupted_count}\n`;
2599
+ output += `- Avg Resolution: ${report.image_quality.avg_resolution}\n`;
2600
+ output += `- Blurry: ${report.image_quality.blurry_percentage.toFixed(1)}%\n\n`;
2601
+ }
2602
+ if (report.audio_quality) {
2603
+ output += `## Audio Quality\n`;
2604
+ output += `- Total Files: ${report.audio_quality.total_files}\n`;
2605
+ output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
2606
+ output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
2607
+ output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
2608
+ }
2609
+ if (report.video_quality) {
2610
+ output += `## Video Quality\n`;
2611
+ output += `- Total Files: ${report.video_quality.total_files}\n`;
2612
+ output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
2613
+ output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
2614
+ output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
2615
+ }
2616
+ output += `## Recommendations\n`;
2617
+ report.recommendations.forEach(rec => {
2618
+ output += `- ${rec}\n`;
2619
+ });
2620
+ return {
2621
+ content: [{ type: "text", text: output }]
2622
+ };
2623
+ }
2624
+ catch (error) {
2625
+ return {
2626
+ content: [{ type: "text", text: `ERROR: Quality report generation failed: ${error.message}` }],
2627
+ isError: true
2628
+ };
2629
+ }
1986
2630
  }
2631
+ default:
2632
+ throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
1987
2633
  }
1988
- default:
1989
- throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
1990
- }
2634
+ }); // end requestQueue.enqueue
1991
2635
  });
1992
2636
  async function main() {
1993
2637
  const args = process.argv.slice(2);
@@ -1995,6 +2639,7 @@ async function main() {
1995
2639
  const isFuse = args.includes("fuse");
1996
2640
  const isDiscover = args.includes("discover");
1997
2641
  const isDownload = args.includes("download");
2642
+ const isExport = args.includes("export");
1998
2643
  const isConfig = args.includes("config") || args.includes("configure");
1999
2644
  const isSetup = args.includes("--setup") || args.includes("setup");
2000
2645
  const isSilent = args.includes("--silent");
@@ -2017,6 +2662,10 @@ async function main() {
2017
2662
  await runDownloadCli(args);
2018
2663
  return;
2019
2664
  }
2665
+ if (isExport) {
2666
+ await runExportCli(args);
2667
+ return;
2668
+ }
2020
2669
  // If run in explicit setup mode, show setup wizard (do not auto-run on server startup)
2021
2670
  if (isSetup) {
2022
2671
  await runSetupWizard(isSilent);
@@ -2289,6 +2938,99 @@ async function runDownloadCli(args) {
2289
2938
  }
2290
2939
  console.log(`Download complete: ${localPath}`);
2291
2940
  }
2941
/**
 * CLI handler for `vespermcp export <dataset-id|local-path> [...flags]`.
 *
 * Resolves the dataset to a local file, optionally runs the normalize
 * pipeline to convert between csv/parquet (unless --fast), then hands the
 * file to `dataExporter.export` in the requested output format. Progress
 * goes to stderr; the final summary goes to stdout. Exits the process with
 * code 1 on missing arguments or unresolvable input.
 *
 * @param {string[]} args - Raw CLI argv slice (includes the `export` word).
 * @returns {Promise<void>}
 */
async function runExportCli(args) {
    // Fetch the token following a `--flag`, or undefined when absent/last.
    const readOption = (flag) => {
        const pos = args.findIndex((token) => token === flag);
        return pos >= 0 && pos + 1 < args.length ? args[pos + 1] : undefined;
    };
    // Flags that consume the next token as their value; their values must
    // not be mistaken for positional arguments.
    const valuedFlags = ["--target-dir", "--format", "--compression", "--sample-rows", "--columns"];
    const positionals = args.filter((token, i) => {
        if (token.startsWith("--")) {
            return false;
        }
        const prior = i > 0 ? args[i - 1] : "";
        return !valuedFlags.includes(prior);
    });
    // positionals[0] is the `export` subcommand itself.
    const datasetId = positionals[1] || "";
    if (!datasetId) {
        console.error("Usage: vespermcp export <dataset-id|local-path> [--format parquet|feather|csv|jsonl|arrow] [--target-dir C:/path] [--compression snappy] [--fast] [--preview] [--sample-rows N] [--columns col1,col2]");
        process.exit(1);
    }
    const requestedFormat = readOption("--format") || "parquet";
    const targetDir = readOption("--target-dir");
    const compression = readOption("--compression");
    const sampleRows = readOption("--sample-rows");
    const columns = readOption("--columns");
    const fastMode = args.includes("--fast");
    const preview = args.includes("--preview");
    const isDirectLocalInput = isDirectLocalDatasetReference(datasetId);
    const resolvedTargetDir = path.resolve(targetDir || process.cwd());
    // Locate the dataset on disk, preferring the target dir, then cwd.
    let sourcePath = resolveDatasetLocalPath(datasetId, [resolvedTargetDir, process.cwd()]);
    if (!sourcePath) {
        console.error(`Export failed: no local data found for ${datasetId}. Run download or prepare first, or pass a direct local path.`);
        process.exit(1);
    }
    sourcePath = ensureExportableLocalPath(sourcePath);
    // Best-effort: record the resolved path in the registry; never fatal.
    try {
        if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
            upsertRegistry(datasetId, sourcePath, "completed");
        }
    }
    catch (e) {
        console.error(`[Registry] Failed to normalize registry path for ${datasetId}: ${e?.message || e}`);
    }
    if (!fastMode) {
        // The normalize pipeline only understands csv/parquet on both ends;
        // run it only when the input format differs from the pipeline target.
        const inputExt = path.extname(sourcePath).slice(1).toLowerCase();
        const pipelineTarget = requestedFormat === "csv" || requestedFormat === "parquet" ? requestedFormat : "parquet";
        const inputIsTabular = inputExt === "csv" || inputExt === "parquet";
        if (inputIsTabular && inputExt !== pipelineTarget) {
            try {
                sourcePath = ensureLocalPipelineSource(sourcePath, datasetId, resolvedTargetDir);
                const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineTarget);
                if (pipelineResult.final_output_path) {
                    // Pipeline produced a converted file; export that instead.
                    sourcePath = pipelineResult.final_output_path;
                    if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
                        upsertRegistry(datasetId, sourcePath, "completed");
                    }
                }
            }
            catch (err) {
                // A pipeline failure is non-fatal: fall back to the raw file.
                console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
            }
        }
    }
    // Assemble only the options the user actually supplied.
    const exportOpts = {};
    if (compression) {
        exportOpts.compression = compression;
    }
    if (preview) {
        exportOpts.preview = true;
    }
    if (sampleRows) {
        exportOpts.sample_rows = Number(sampleRows);
    }
    if (columns) {
        exportOpts.columns = columns.split(",").map((col) => col.trim()).filter(Boolean);
    }
    // Map the requested format to a file extension; unknown formats fall
    // back to parquet's extension (the exporter decides actual handling).
    const extByFormat = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
    const outputExt = extByFormat[requestedFormat] || ".parquet";
    const safeName = getExportFileStem(datasetId);
    const outDir = resolvedTargetDir;
    if (!fs.existsSync(outDir)) {
        fs.mkdirSync(outDir, { recursive: true });
    }
    const outputFile = path.join(outDir, `${safeName}${outputExt}`);
    console.error(`[Export] Resolved output directory: ${outDir}`);
    console.error(`[Export] Output file: ${outputFile}`);
    const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
    // Human-readable summary on stdout.
    console.log(`Export complete: ${result.output_path}`);
    console.log(`Format: ${result.format}${result.compression ? ` (${result.compression})` : ""}`);
    if (result.rows !== undefined) {
        console.log(`Rows: ${result.rows.toLocaleString()}`);
    }
    if (result.columns !== undefined) {
        console.log(`Columns: ${result.columns}`);
    }
    if (result.file_size_mb !== undefined) {
        console.log(`Size: ${result.file_size_mb} MB`);
    }
    if (result.preview_path) {
        console.log(`Preview: ${result.preview_path}`);
    }
}
2292
3034
  async function runFuseCli(args) {
2293
3035
  const getArgValue = (name) => {
2294
3036
  const idx = args.findIndex(a => a === name);