@vespermcp/mcp-server 1.2.21 → 1.2.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/README.md +49 -0
  2. package/build/cache/service.js +7 -0
  3. package/build/cloud/adapters/supabase.js +49 -0
  4. package/build/cloud/storage-manager.js +6 -0
  5. package/build/export/exporter.js +22 -9
  6. package/build/gateway/unified-dataset-gateway.js +441 -0
  7. package/build/index.js +1815 -839
  8. package/build/ingestion/ingestor.js +7 -4
  9. package/build/install/install-service.js +11 -6
  10. package/build/lib/supabase.js +3 -0
  11. package/build/metadata/arxiv-source.js +229 -0
  12. package/build/metadata/circuit-breaker.js +62 -0
  13. package/build/metadata/github-source.js +203 -0
  14. package/build/metadata/hackernews-source.js +123 -0
  15. package/build/metadata/quality.js +27 -0
  16. package/build/metadata/scraper.js +85 -14
  17. package/build/metadata/semantic-scholar-source.js +138 -0
  18. package/build/python/asset_downloader_engine.py +2 -0
  19. package/build/python/convert_engine.py +92 -0
  20. package/build/python/export_engine.py +45 -0
  21. package/build/python/kaggle_engine.py +77 -5
  22. package/build/python/normalize_engine.py +83 -0
  23. package/build/python/vesper/core/asset_downloader.py +5 -1
  24. package/build/scripts/test-phase1-webcore-quality.js +104 -0
  25. package/build/search/engine.js +45 -6
  26. package/build/search/jit-orchestrator.js +18 -14
  27. package/build/search/query-intent.js +509 -0
  28. package/build/tools/formatter.js +6 -3
  29. package/build/utils/python-runtime.js +130 -0
  30. package/build/web/extract-web.js +297 -0
  31. package/build/web/fusion-engine.js +457 -0
  32. package/build/web/types.js +1 -0
  33. package/build/web/web-core.js +242 -0
  34. package/package.json +12 -5
  35. package/scripts/postinstall.cjs +87 -31
  36. package/scripts/wizard.cjs +652 -0
  37. package/scripts/wizard.js +338 -12
  38. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  39. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  40. package/src/python/asset_downloader_engine.py +2 -0
  41. package/src/python/convert_engine.py +92 -0
  42. package/src/python/export_engine.py +45 -0
  43. package/src/python/kaggle_engine.py +77 -5
  44. package/src/python/normalize_engine.py +83 -0
  45. package/src/python/requirements.txt +12 -0
  46. package/src/python/vesper/core/asset_downloader.py +5 -1
  47. package/wizard.cjs +3 -0
package/build/index.js CHANGED
@@ -1,12 +1,39 @@
1
1
  #!/usr/bin/env node
2
2
  // --- Dataset ID Normalization ---
3
3
  function normalize_dataset_id(dataset_id) {
4
- // Remove kaggle: prefix for storage key
5
- let id = dataset_id.replace(/^kaggle:/, "");
4
+ const trimmed = dataset_id.trim();
5
+ const sourceMatch = trimmed.match(/^(kaggle|huggingface|hf|openml|dataworld):/i);
6
+ let id = trimmed.replace(/^(kaggle|huggingface|hf|openml|dataworld):/i, "");
6
7
  // Replace / and : with _ for filesystem safety
7
- id = id.replace(/[/:]/g, "_");
8
- // Always store and lookup using the same normalized format
9
- return dataset_id.startsWith("kaggle:") ? `kaggle_${id}` : id;
8
+ id = id.replace(/[\\/:]/g, "_");
9
+ if (!sourceMatch) {
10
+ return id;
11
+ }
12
+ const source = sourceMatch[1].toLowerCase() === "hf" ? "huggingface" : sourceMatch[1].toLowerCase();
13
+ return `${source}_${id}`;
14
+ }
15
+ function getDatasetIdAliases(dataset_id) {
16
+ const trimmed = dataset_id.trim();
17
+ const aliases = new Set([trimmed]);
18
+ const sourceMatch = trimmed.match(/^(kaggle|huggingface|hf|openml|dataworld):/i);
19
+ if (sourceMatch) {
20
+ const stripped = trimmed.replace(/^(kaggle|huggingface|hf|openml|dataworld):/i, "");
21
+ aliases.add(stripped);
22
+ if (sourceMatch[1].toLowerCase() === "hf") {
23
+ aliases.add(`huggingface:${stripped}`);
24
+ }
25
+ }
26
+ else {
27
+ aliases.add(`kaggle:${trimmed}`);
28
+ aliases.add(`huggingface:${trimmed}`);
29
+ aliases.add(`hf:${trimmed}`);
30
+ aliases.add(`openml:${trimmed}`);
31
+ aliases.add(`dataworld:${trimmed}`);
32
+ }
33
+ return Array.from(aliases);
34
+ }
35
+ function toSafeDatasetPathFragment(dataset_id) {
36
+ return normalize_dataset_id(dataset_id);
10
37
  }
11
38
  // --- Dataset Registry Helpers ---
12
39
  function getRegistryPath() {
@@ -29,10 +56,11 @@ function writeRegistry(entries) {
29
56
  fs.writeFileSync(registryPath, JSON.stringify(entries, null, 2));
30
57
  }
31
58
  function upsertRegistry(dataset_id, local_path, status) {
32
- const norm_id = normalize_dataset_id(dataset_id);
59
+ const aliases = getDatasetIdAliases(dataset_id).map(normalize_dataset_id);
60
+ const norm_id = aliases[0];
33
61
  console.error(`[Registry] Writing key: ${norm_id}`);
34
62
  const entries = readRegistry();
35
- const idx = entries.findIndex(e => e.dataset_id === norm_id);
63
+ const idx = entries.findIndex(e => aliases.includes(e.dataset_id || e.id));
36
64
  if (idx >= 0) {
37
65
  entries[idx] = { dataset_id: norm_id, local_path, status };
38
66
  }
@@ -42,9 +70,163 @@ function upsertRegistry(dataset_id, local_path, status) {
42
70
  writeRegistry(entries);
43
71
  }
44
72
  function getRegistryEntry(dataset_id) {
45
- const norm_id = normalize_dataset_id(dataset_id);
46
- console.error(`[Registry] Lookup key: ${norm_id}`);
47
- return readRegistry().find(e => (e.dataset_id || e.id) === norm_id);
73
+ const aliases = getDatasetIdAliases(dataset_id).map(normalize_dataset_id);
74
+ console.error(`[Registry] Lookup keys: ${aliases.join(", ")}`);
75
+ return readRegistry().find(e => aliases.includes((e.dataset_id || e.id)));
76
+ }
77
+ const STRUCTURED_FILE_EXTENSIONS = [".parquet", ".csv", ".jsonl", ".json", ".feather", ".arrow", ".tsv", ".txt"];
78
+ const IMAGE_FILE_EXTENSIONS = new Set([".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff", ".tif", ".svg"]);
79
+ function walkFilesRecursive(rootDir) {
80
+ const out = [];
81
+ const stack = [rootDir];
82
+ while (stack.length > 0) {
83
+ const currentDir = stack.pop();
84
+ const entries = fs.readdirSync(currentDir, { withFileTypes: true });
85
+ for (const entry of entries) {
86
+ const fullPath = path.join(currentDir, entry.name);
87
+ if (entry.isDirectory()) {
88
+ stack.push(fullPath);
89
+ }
90
+ else if (entry.isFile()) {
91
+ out.push(fullPath);
92
+ }
93
+ }
94
+ }
95
+ out.sort();
96
+ return out;
97
+ }
98
+ function inferImageManifestRecord(rootDir, fullPath, index) {
99
+ const relativePath = path.relative(rootDir, fullPath).replace(/\\/g, "/");
100
+ const parentDir = path.posix.dirname(relativePath);
101
+ const parts = parentDir.split("/").filter(part => part && part !== ".");
102
+ let split;
103
+ let label;
104
+ if (parts.length > 0) {
105
+ const first = parts[0].toLowerCase();
106
+ if (["train", "test", "val", "valid", "validation"].includes(first)) {
107
+ split = parts[0];
108
+ if (parts.length > 1) {
109
+ label = parts[parts.length - 1];
110
+ }
111
+ }
112
+ else {
113
+ label = parts[parts.length - 1];
114
+ }
115
+ }
116
+ return {
117
+ id: index,
118
+ image_path: path.resolve(fullPath),
119
+ relative_path: relativePath,
120
+ file_name: path.basename(fullPath),
121
+ extension: path.extname(fullPath).toLowerCase().replace(/^\./, ""),
122
+ ...(split ? { split } : {}),
123
+ ...(label ? { label } : {}),
124
+ };
125
+ }
126
+ function createImageManifestFromDirectory(rootDir) {
127
+ const imageFiles = walkFilesRecursive(rootDir).filter(filePath => IMAGE_FILE_EXTENSIONS.has(path.extname(filePath).toLowerCase()));
128
+ if (imageFiles.length === 0) {
129
+ throw new Error(`No image files found under ${rootDir}`);
130
+ }
131
+ const manifestPath = path.join(rootDir, "_vesper_image_manifest.jsonl");
132
+ const lines = imageFiles.map((filePath, index) => JSON.stringify(inferImageManifestRecord(rootDir, filePath, index)));
133
+ fs.writeFileSync(manifestPath, `${lines.join("\n")}\n`, "utf-8");
134
+ return manifestPath;
135
+ }
136
+ function ensureExportableLocalPath(localPath) {
137
+ if (!fs.existsSync(localPath)) {
138
+ throw new Error(`Local path not found: ${localPath}`);
139
+ }
140
+ const stats = fs.statSync(localPath);
141
+ if (stats.isFile()) {
142
+ return localPath;
143
+ }
144
+ const manifestPath = path.join(localPath, "_vesper_image_manifest.jsonl");
145
+ if (fs.existsSync(manifestPath)) {
146
+ return manifestPath;
147
+ }
148
+ const candidates = walkFilesRecursive(localPath);
149
+ for (const ext of STRUCTURED_FILE_EXTENSIONS) {
150
+ const match = candidates.find(candidate => path.extname(candidate).toLowerCase() === ext);
151
+ if (match) {
152
+ return match;
153
+ }
154
+ }
155
+ return createImageManifestFromDirectory(localPath);
156
+ }
157
+ function isPathWithinDirectory(candidatePath, directoryPath) {
158
+ const relativePath = path.relative(path.resolve(directoryPath), path.resolve(candidatePath));
159
+ return relativePath === "" || (!relativePath.startsWith("..") && !path.isAbsolute(relativePath));
160
+ }
161
+ function buildDatasetCandidatePaths(baseDir, safeId) {
162
+ return [
163
+ path.join(baseDir, `${safeId}.parquet`),
164
+ path.join(baseDir, `${safeId}.csv`),
165
+ path.join(baseDir, `${safeId}.jsonl`),
166
+ path.join(baseDir, `${safeId}.json`),
167
+ path.join(baseDir, `${safeId}.feather`),
168
+ path.join(baseDir, `${safeId}.arrow`),
169
+ path.join(baseDir, safeId),
170
+ ];
171
+ }
172
+ function shouldTrackExportPath(localPath) {
173
+ return isPathWithinDirectory(localPath, dataRoot);
174
+ }
175
+ function isDirectLocalDatasetReference(datasetIdOrPath) {
176
+ return fs.existsSync(datasetIdOrPath);
177
+ }
178
+ function getExportFileStem(datasetIdOrPath) {
179
+ if (isDirectLocalDatasetReference(datasetIdOrPath)) {
180
+ const resolvedPath = path.resolve(datasetIdOrPath);
181
+ const stats = fs.statSync(resolvedPath);
182
+ const baseName = stats.isDirectory()
183
+ ? path.basename(resolvedPath)
184
+ : path.parse(resolvedPath).name;
185
+ return toSafeDatasetPathFragment(baseName);
186
+ }
187
+ return toSafeDatasetPathFragment(datasetIdOrPath);
188
+ }
189
+ function ensureLocalPipelineSource(sourcePath, datasetId, targetDir) {
190
+ const resolvedTargetDir = path.resolve(targetDir);
191
+ const resolvedSourcePath = path.resolve(sourcePath);
192
+ if (path.dirname(resolvedSourcePath) === resolvedTargetDir) {
193
+ return resolvedSourcePath;
194
+ }
195
+ if (!fs.existsSync(resolvedTargetDir)) {
196
+ fs.mkdirSync(resolvedTargetDir, { recursive: true });
197
+ }
198
+ const stagedPath = path.join(resolvedTargetDir, `${toSafeDatasetPathFragment(datasetId)}${path.extname(resolvedSourcePath)}`);
199
+ if (resolvedSourcePath !== stagedPath) {
200
+ fs.copyFileSync(resolvedSourcePath, stagedPath);
201
+ }
202
+ return stagedPath;
203
+ }
204
+ function resolveDatasetLocalPath(datasetIdOrPath, preferredDirs = []) {
205
+ if (fs.existsSync(datasetIdOrPath)) {
206
+ return ensureExportableLocalPath(datasetIdOrPath);
207
+ }
208
+ const safeId = toSafeDatasetPathFragment(datasetIdOrPath);
209
+ const uniquePreferredDirs = Array.from(new Set(preferredDirs
210
+ .filter((dir) => typeof dir === "string" && dir.trim().length > 0)
211
+ .map(dir => path.resolve(dir))));
212
+ for (const preferredDir of uniquePreferredDirs) {
213
+ const localMatch = buildDatasetCandidatePaths(preferredDir, safeId).find(candidate => fs.existsSync(candidate));
214
+ if (localMatch) {
215
+ return ensureExportableLocalPath(localMatch);
216
+ }
217
+ }
218
+ const downloadStatus = metadataStore.getDownloadStatus(datasetIdOrPath);
219
+ if (downloadStatus?.local_path && fs.existsSync(downloadStatus.local_path)) {
220
+ return ensureExportableLocalPath(downloadStatus.local_path);
221
+ }
222
+ const reg = getRegistryEntry(datasetIdOrPath);
223
+ const regPath = reg?.local_path || reg?.path;
224
+ if (regPath && fs.existsSync(regPath)) {
225
+ return ensureExportableLocalPath(regPath);
226
+ }
227
+ const rawCandidates = buildDatasetCandidatePaths(path.join(dataRoot, "data", "raw"), safeId);
228
+ const match = rawCandidates.find(candidate => fs.existsSync(candidate));
229
+ return match ? ensureExportableLocalPath(match) : undefined;
48
230
  }
49
231
  // --- Pipeline State Tracker ---
50
232
  // Tracks completed steps per session/job/dataset
@@ -66,7 +248,7 @@ export function hasStep(datasetId, step) {
66
248
  // --- Dataset ID Auto-Detection ---
67
249
  export function parseDatasetId(id) {
68
250
  const trimmed = id.trim();
69
- if (/^(kaggle:|hf:|huggingface:|openml:|dataworld:|http|https):/i.test(trimmed))
251
+ if (/^(kaggle:|hf:|huggingface:|openml:|dataworld:|arxiv:|github:|http|https):/i.test(trimmed))
70
252
  return trimmed;
71
253
  if (trimmed.includes("/") && !trimmed.includes(":"))
72
254
  return `kaggle:${trimmed}`;
@@ -88,6 +270,14 @@ import { HuggingFaceScraper } from "./metadata/scraper.js";
88
270
  import { KaggleSource } from "./metadata/kaggle-source.js";
89
271
  import { OpenMLSource } from "./metadata/openml-source.js";
90
272
  import { DataWorldSource } from "./metadata/dataworld-source.js";
273
+ import { ArxivSource } from "./metadata/arxiv-source.js";
274
+ import { GithubSource } from "./metadata/github-source.js";
275
+ import { UnifiedDatasetGateway } from "./gateway/unified-dataset-gateway.js";
276
+ import { WebCoreEngine } from "./web/web-core.js";
277
+ import { WebFusionEngine } from "./web/fusion-engine.js";
278
+ import { WebExtractorEngine } from "./web/extract-web.js";
279
+ import { SemanticScholarSource } from "./metadata/semantic-scholar-source.js";
280
+ import { HackerNewsSource } from "./metadata/hackernews-source.js";
91
281
  import { formatSearchResults, formatDatasetInfo, formatJobStatus } from "./tools/formatter.js";
92
282
  import { JobManager } from "./jobs/manager.js";
93
283
  import { QualityAnalyzer } from "./quality/analyzer.js";
@@ -131,6 +321,34 @@ function logError(err, context) {
131
321
  fs.appendFileSync(errorLogPath, msg);
132
322
  console.error(`[Vesper] Critical error logged to ${errorLogPath}`);
133
323
  }
324
+ // --- Request Queue: serialize all MCP tool calls to prevent crashes ---
325
+ class RequestQueue {
326
+ queue = [];
327
+ running = false;
328
+ enqueue(task) {
329
+ return new Promise((resolve, reject) => {
330
+ this.queue.push({ resolve, reject, task });
331
+ this.drain();
332
+ });
333
+ }
334
+ async drain() {
335
+ if (this.running)
336
+ return;
337
+ this.running = true;
338
+ while (this.queue.length > 0) {
339
+ const item = this.queue.shift();
340
+ try {
341
+ const result = await item.task();
342
+ item.resolve(result);
343
+ }
344
+ catch (err) {
345
+ item.reject(err);
346
+ }
347
+ }
348
+ this.running = false;
349
+ }
350
+ }
351
+ const requestQueue = new RequestQueue();
134
352
  const SPINNER_FRAMES = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];
135
353
  function printLaunchScreen() {
136
354
  const screen = `
@@ -198,6 +416,21 @@ function extractRequestedRows(query, requirements) {
198
416
  if (Number.isFinite(n) && n > 0)
199
417
  return n;
200
418
  }
419
+ const commaNumbers = [...text.matchAll(/\b\d{1,3}(?:,\d{3})+\b/g)]
420
+ .map(m => Number(m[0].replace(/,/g, "")))
421
+ .filter(n => Number.isFinite(n) && n > 0);
422
+ if (commaNumbers.length > 0)
423
+ return Math.max(...commaNumbers);
424
+ const humanSized = [...text.matchAll(/\b(\d+(?:\.\d+)?)\s*([kmb])\b/gi)]
425
+ .map(m => {
426
+ const base = Number(m[1]);
427
+ const suffix = m[2].toLowerCase();
428
+ const multiplier = suffix === "k" ? 1_000 : suffix === "m" ? 1_000_000 : 1_000_000_000;
429
+ return Math.round(base * multiplier);
430
+ })
431
+ .filter(n => Number.isFinite(n) && n > 0);
432
+ if (humanSized.length > 0)
433
+ return Math.max(...humanSized);
201
434
  const allNums = [...text.matchAll(/\b\d{4,9}\b/g)]
202
435
  .map(m => Number(m[0]))
203
436
  .filter(n => Number.isFinite(n) && n > 0);
@@ -367,7 +600,45 @@ function syncPythonScripts(appRoot, dataRoot) {
367
600
  }
368
601
  // Sync scripts immediately
369
602
  syncPythonScripts(appRoot, dataRoot);
370
- const metadataStore = new MetadataStore(dbPath);
603
+ // Auto-rebuild better-sqlite3 if native binary doesn't match current Node version
604
+ function tryRebuildSqlite() {
605
+ try {
606
+ const { execSync } = require("child_process");
607
+ const pkgRoot = path.resolve(__dirname, "..");
608
+ console.error("[Vesper] Rebuilding better-sqlite3 for Node " + process.version + "...");
609
+ execSync("npm rebuild better-sqlite3", {
610
+ stdio: "pipe",
611
+ timeout: 60000,
612
+ cwd: pkgRoot,
613
+ });
614
+ console.error("[Vesper] Rebuild succeeded. Retrying...");
615
+ // Clear require cache so the rebuilt module is loaded
616
+ for (const key of Object.keys(require.cache)) {
617
+ if (key.includes("better-sqlite3") || key.includes("better_sqlite3")) {
618
+ delete require.cache[key];
619
+ }
620
+ }
621
+ return true;
622
+ }
623
+ catch (e) {
624
+ console.error("[Vesper] Auto-rebuild failed: " + (e?.message || e));
625
+ return false;
626
+ }
627
+ }
628
+ let metadataStore;
629
+ try {
630
+ metadataStore = new MetadataStore(dbPath);
631
+ }
632
+ catch (e) {
633
+ if (e?.code === "ERR_DLOPEN_FAILED" && tryRebuildSqlite()) {
634
+ metadataStore = new MetadataStore(dbPath);
635
+ }
636
+ else {
637
+ console.error("[Vesper] FATAL: Cannot load better-sqlite3.");
638
+ console.error("[Vesper] Run: npm rebuild better-sqlite3");
639
+ throw e;
640
+ }
641
+ }
371
642
  const vectorStore = new VectorStore(vectorPath);
372
643
  const embedder = Embedder.getInstance();
373
644
  const searchEngine = new SearchEngine(metadataStore, vectorStore, embedder);
@@ -382,7 +653,16 @@ const dataSplitter = new DataSplitter(__dirname);
382
653
  const dataExporter = new DataExporter(__dirname);
383
654
  const fusionEngine = new DataFusionEngine(__dirname);
384
655
  const kaggleSource = new KaggleSource(__dirname);
656
+ const openmlSource = new OpenMLSource(__dirname);
657
+ const dataworldSource = new DataWorldSource(__dirname);
658
+ const arxivSource = new ArxivSource(cacheService);
659
+ const githubSource = new GithubSource(cacheService);
385
660
  const secureKeys = new SecureKeysManager(__dirname);
661
+ const semanticScholarSource = new SemanticScholarSource(cacheService);
662
+ const hackerNewsSource = new HackerNewsSource(cacheService);
663
+ const webCoreEngine = new WebCoreEngine({ arxivSource, githubSource, semanticScholarSource, hackerNewsSource });
664
+ const webFusionEngine = new WebFusionEngine({ webCoreEngine, embedder, cache: cacheService });
665
+ const webExtractorEngine = new WebExtractorEngine(cacheService);
386
666
  function hydrateExternalKeys() {
387
667
  const keys = secureKeys.getAll();
388
668
  if (!process.env.HF_TOKEN && !process.env.HUGGINGFACE_TOKEN && keys.hf_token) {
@@ -401,6 +681,17 @@ function hydrateExternalKeys() {
401
681
  function hasDataWorldToken() {
402
682
  return !!(process.env.DW_AUTH_TOKEN || secureKeys.getAll().dataworld_token);
403
683
  }
684
+ const unifiedDatasetGateway = new UnifiedDatasetGateway({
685
+ metadataStore,
686
+ dataIngestor,
687
+ dataRoot,
688
+ kaggleSource,
689
+ openmlSource,
690
+ dataworldSource,
691
+ arxivSource,
692
+ githubSource,
693
+ hasDataWorldToken,
694
+ });
404
695
  // CRITICAL FIX: Pass __dirname (build directory) to analyzers
405
696
  // Python scripts are in build/python/, so analyzers should look relative to build/
406
697
  // NOT relative to project root (appRoot)
@@ -432,7 +723,7 @@ jobManager.on("processJob", async (job, execute) => {
432
723
  console.error(`[Vesper] Starting prepareDatasetTask for job ${job.id}...`);
433
724
  const metadata = job.metadata ? JSON.parse(job.metadata) : {};
434
725
  switch (job.type) {
435
- case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements);
726
+ case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements, metadata.outputDir);
436
727
  case "clean": return await handleCleanJob(job.id, metadata.datasetId, metadata.ops);
437
728
  default: throw new Error(`Unhandled job type: ${job.type}`);
438
729
  }
@@ -450,9 +741,21 @@ jobManager.on("processJob", async (job, execute) => {
450
741
  /**
451
742
  * Logic for preparing a dataset (Search + Ingest + Process)
452
743
  */
453
- async function handlePrepareJob(jobId, query, requirements) {
744
+ async function handlePrepareJob(jobId, query, requirements, outputDir) {
454
745
  hydrateExternalKeys();
455
746
  const update = (updates) => jobManager.updateJob(jobId, updates);
747
+ const pipelineSteps = ["search", "validate", "download", "normalize", "quality", "register"];
748
+ const stepStatus = {};
749
+ for (const s of pipelineSteps)
750
+ stepStatus[s] = "pending";
751
+ const markPipelineStep = (step, status) => {
752
+ stepStatus[step] = status;
753
+ const summary = pipelineSteps.map(s => {
754
+ const st = stepStatus[s];
755
+ return st === "done" ? `[${s}]` : st === "running" ? `>${s}<` : st === "failed" ? `!${s}!` : st === "skipped" ? `~${s}~` : ` ${s} `;
756
+ }).join(" → ");
757
+ console.error(`[Pipeline] ${summary}`);
758
+ };
456
759
  // Ensure core Python packages are available for dataset operations
457
760
  try {
458
761
  await ensurePythonModules([
@@ -465,11 +768,12 @@ async function handlePrepareJob(jobId, query, requirements) {
465
768
  // Continue anyway - direct file downloads may still work without datasets lib
466
769
  }
467
770
  const requestedRows = extractRequestedRows(query, requirements);
771
+ const searchQuery = requirements ? `${query} ${requirements}` : query;
468
772
  let selectedDataset;
469
773
  let datasetIdForDownload = "";
470
774
  let source;
471
775
  const parsedQuery = parseDatasetId(query);
472
- const isExplicitDatasetRef = /^(kaggle:|hf:|huggingface:|openml:|dataworld:)/i.test(parsedQuery) || (query.includes("/") && !query.includes(" "));
776
+ const isExplicitDatasetRef = /^(kaggle:|hf:|huggingface:|openml:|dataworld:|arxiv:|github:)/i.test(parsedQuery) || (query.includes("/") && !query.includes(" "));
473
777
  if (isExplicitDatasetRef) {
474
778
  let explicitId = parsedQuery;
475
779
  if (/^hf:/i.test(explicitId)) {
@@ -491,6 +795,12 @@ async function handlePrepareJob(jobId, query, requirements) {
491
795
  source = "dataworld";
492
796
  datasetIdForDownload = explicitId.replace(/^dataworld:/i, "");
493
797
  }
798
+ else if (/^arxiv:/i.test(explicitId)) {
799
+ throw new Error("prepare_dataset does not support direct arXiv downloads yet. Use unified_dataset_api with operation='discover' or 'info' for arXiv.");
800
+ }
801
+ else if (/^github:/i.test(explicitId)) {
802
+ throw new Error("prepare_dataset does not support direct GitHub downloads yet. Use unified_dataset_api with operation='discover' or 'info' for GitHub.");
803
+ }
494
804
  else {
495
805
  // Default to HuggingFace for ambiguous refs (user/dataset without prefix)
496
806
  source = "huggingface";
@@ -500,11 +810,14 @@ async function handlePrepareJob(jobId, query, requirements) {
500
810
  progress: 20,
501
811
  status_text: `Using explicit dataset id: ${datasetIdForDownload} (${source})`
502
812
  });
813
+ markPipelineStep("search", "skipped");
503
814
  }
504
815
  else {
816
+ markPipelineStep("search", "running");
505
817
  update({ progress: 10, status_text: "Searching for best dataset matching query..." });
506
- const results = await searchEngine.search(query, { limit: 10 });
818
+ const results = await searchEngine.search(searchQuery, { limit: 10 });
507
819
  if (results.length === 0) {
820
+ markPipelineStep("search", "failed");
508
821
  throw new Error("No datasets found matching the query. Try refining your search terms.");
509
822
  }
510
823
  // Pick the best result that we can actually download (skip sources requiring missing credentials)
@@ -512,20 +825,32 @@ async function handlePrepareJob(jobId, query, requirements) {
512
825
  const hasDwToken = hasDataWorldToken();
513
826
  selectedDataset = results.find(r => {
514
827
  const s = (r.source || "").toLowerCase();
828
+ if (s === "arxiv")
829
+ return false; // Phase 1: discover/info only, no direct download yet
830
+ if (s === "github")
831
+ return false; // Phase 1: discover/info only, no direct download yet
515
832
  if (s === "kaggle" && !hasKaggleCreds)
516
833
  return false;
517
834
  if (s === "dataworld" && !hasDwToken)
518
835
  return false;
519
836
  return true;
520
837
  }) || results[0]; // Fallback to first if all require credentials
838
+ if ((selectedDataset.source || "").toLowerCase() === "arxiv") {
839
+ throw new Error("Matched an arXiv paper, but prepare_dataset currently supports downloadable dataset providers only.");
840
+ }
841
+ if ((selectedDataset.source || "").toLowerCase() === "github") {
842
+ throw new Error("Matched a GitHub repo, but prepare_dataset currently supports downloadable dataset providers only.");
843
+ }
521
844
  datasetIdForDownload = selectedDataset.id;
522
845
  source = selectedDataset.source;
523
846
  update({
524
847
  progress: 20,
525
848
  status_text: `Matched: ${selectedDataset.name} (${selectedDataset.source})`
526
849
  });
850
+ markPipelineStep("search", "done");
527
851
  }
528
852
  // Pre-check credentials for sources that require them
853
+ markPipelineStep("validate", "running");
529
854
  if (source === "kaggle") {
530
855
  const hasKaggleCreds = !!(process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY && process.env.KAGGLE_USERNAME !== "YOUR_KAGGLE_USERNAME");
531
856
  if (!hasKaggleCreds) {
@@ -533,8 +858,11 @@ async function handlePrepareJob(jobId, query, requirements) {
533
858
  }
534
859
  }
535
860
  if (source === "dataworld" && !hasDataWorldToken()) {
861
+ markPipelineStep("validate", "failed");
536
862
  throw new Error("data.world token not set. Use the configure_keys tool or set DW_AUTH_TOKEN environment variable.");
537
863
  }
864
+ markPipelineStep("validate", "done");
865
+ markPipelineStep("download", "running");
538
866
  update({ progress: 30, status_text: `Starting download from ${source}...` });
539
867
  // ensureData handles download and returns path to the raw file
540
868
  let rawFilePath = await dataIngestor.ensureData(datasetIdForDownload, source, (msg, prog) => {
@@ -545,7 +873,7 @@ async function handlePrepareJob(jobId, query, requirements) {
545
873
  let currentRows = await countRows(rawFilePath);
546
874
  if (currentRows < requestedRows) {
547
875
  update({ progress: 64, status_text: `Only ${currentRows.toLocaleString()} rows found. Fetching more matching datasets...` });
548
- const additional = await searchEngine.search(query, { limit: 8 });
876
+ const additional = await searchEngine.search(searchQuery, { limit: 8 });
549
877
  const sourceFiles = [rawFilePath];
550
878
  let totalRows = currentRows;
551
879
  for (const ds of additional) {
@@ -597,15 +925,50 @@ async function handlePrepareJob(jobId, query, requirements) {
597
925
  update({ progress: 69, status_text: `Sample target met: ${currentRows.toLocaleString()} rows` });
598
926
  }
599
927
  }
928
+ markPipelineStep("download", "done");
929
+ // ── Normalize step: convert any raw format → parquet ──
930
+ markPipelineStep("normalize", "running");
931
+ const rawExt = path.extname(rawFilePath).toLowerCase();
932
+ if (rawExt !== ".parquet" && rawExt !== ".pq") {
933
+ update({ progress: 70, status_text: "Normalizing to parquet..." });
934
+ const normalizedDir = path.join(dataRoot, "data", "normalized");
935
+ if (!fs.existsSync(normalizedDir))
936
+ fs.mkdirSync(normalizedDir, { recursive: true });
937
+ const safeId = toSafeDatasetPathFragment(datasetIdForDownload);
938
+ const normalizedPath = path.join(normalizedDir, `${safeId}.parquet`);
939
+ try {
940
+ const normScript = path.join(dataRoot, "python", "normalize_engine.py");
941
+ const normResult = await runPythonJson(normScript, [rawFilePath, normalizedPath]);
942
+ if (normResult.ok && fs.existsSync(normalizedPath)) {
943
+ console.error(`[Prepare] Normalized ${rawExt} → parquet (${normResult.rows} rows)`);
944
+ rawFilePath = normalizedPath;
945
+ markPipelineStep("normalize", "done");
946
+ }
947
+ else {
948
+ console.error(`[Prepare] Normalize failed: ${normResult.error}, continuing with raw file`);
949
+ markPipelineStep("normalize", "skipped");
950
+ }
951
+ }
952
+ catch (e) {
953
+ console.error(`[Prepare] Normalize step failed: ${e?.message || e}, continuing with raw file`);
954
+ markPipelineStep("normalize", "skipped");
955
+ }
956
+ }
957
+ else {
958
+ markPipelineStep("normalize", "done");
959
+ }
600
960
  let qualityScore = selectedDataset?.quality_score ?? 70;
601
- update({ progress: 70, status_text: "Analyzing dataset quality..." });
961
+ markPipelineStep("quality", "running");
962
+ update({ progress: 75, status_text: "Analyzing dataset quality..." });
602
963
  try {
603
964
  const report = await qualityAnalyzer.analyze(rawFilePath);
604
965
  qualityScore = report.overall_score;
966
+ markPipelineStep("quality", "done");
605
967
  }
606
968
  catch (error) {
607
969
  console.error(`[Prepare] Quality analysis failed for ${datasetIdForDownload}: ${error?.message || error}`);
608
970
  update({ progress: 78, status_text: "Quality analysis skipped (unsupported schema). Continuing installation..." });
971
+ markPipelineStep("quality", "skipped");
609
972
  }
610
973
  if (selectedDataset) {
611
974
  metadataStore.saveDataset({
@@ -613,15 +976,62 @@ async function handlePrepareJob(jobId, query, requirements) {
613
976
  quality_score: qualityScore
614
977
  });
615
978
  }
979
+ else {
980
+ // Even for explicit dataset refs, save minimal metadata so get_dataset_info can find it
981
+ try {
982
+ const existingMeta = metadataStore.getDataset(datasetIdForDownload);
983
+ if (!existingMeta) {
984
+ metadataStore.saveDataset({
985
+ id: datasetIdForDownload,
986
+ source: source,
987
+ name: datasetIdForDownload.split("/").pop() || datasetIdForDownload,
988
+ description: `Dataset prepared from ${source}:${datasetIdForDownload}`,
989
+ quality_warnings: [],
990
+ downloads: 0,
991
+ likes: 0,
992
+ stars: 0,
993
+ tags: [],
994
+ last_updated: new Date().toISOString(),
995
+ task: "unknown",
996
+ domain: "unknown",
997
+ languages: [],
998
+ splits: [],
999
+ license: { id: "unknown", category: "unknown", usage_restrictions: [], warnings: [] },
1000
+ quality_score: qualityScore,
1001
+ download_url: source === "huggingface" ? `https://huggingface.co/datasets/${datasetIdForDownload}` : "",
1002
+ total_examples: 0,
1003
+ is_structured: false,
1004
+ has_target_column: false,
1005
+ is_safe_source: true,
1006
+ has_personal_data: false,
1007
+ is_paywalled: false,
1008
+ is_scraped_web_data: false,
1009
+ uses_https: true,
1010
+ has_train_split: false,
1011
+ has_test_split: false,
1012
+ has_validation_split: false,
1013
+ description_length: 0,
1014
+ has_readme: false,
1015
+ });
1016
+ }
1017
+ }
1018
+ catch (e) {
1019
+ console.error(`[Prepare] Failed to save minimal metadata for ${datasetIdForDownload}: ${e?.message || e}`);
1020
+ }
1021
+ }
1022
+ markPipelineStep("register", "running");
616
1023
  update({ progress: 85, status_text: "Installing dataset into project..." });
617
- const installPath = await installService.install(datasetIdForDownload, rawFilePath);
1024
+ const installPath = await installService.install(datasetIdForDownload, rawFilePath, outputDir);
618
1025
  update({ progress: 100, status_text: "Preparation complete!" });
619
1026
  // Register prepared dataset in local registry for lookup by export/list tools
620
1027
  try {
621
1028
  upsertRegistry(datasetIdForDownload, installPath, "completed");
1029
+ markPipelineStep("register", "done");
1030
+ markStepComplete(datasetIdForDownload, "prepare");
622
1031
  }
623
1032
  catch (e) {
624
1033
  console.error(`[Registry] Failed to write registry for ${datasetIdForDownload}: ${e?.message || e}`);
1034
+ markPipelineStep("register", "failed");
625
1035
  }
626
1036
  return installPath;
627
1037
  }
@@ -647,7 +1057,7 @@ async function handleCleanJob(jobId, datasetId, ops) {
647
1057
  }
648
1058
  // 3. Check standard raw data paths
649
1059
  if (!filePath) {
650
- const safeId = datasetId.replace(/\//g, "_");
1060
+ const safeId = toSafeDatasetPathFragment(datasetId);
651
1061
  const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
652
1062
  const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
653
1063
  const featherPath = path.join(dataRoot, "data", "raw", `${safeId}.feather`);
@@ -712,9 +1122,146 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
712
1122
  required: ["query"],
713
1123
  },
714
1124
  },
1125
+ {
1126
+ name: "unified_dataset_api",
1127
+ description: "Single facade over multiple external dataset providers. Supports provider discovery, dataset search, dataset download, and dataset info through one MCP tool using public access and server-managed credentials when available.",
1128
+ inputSchema: {
1129
+ type: "object",
1130
+ properties: {
1131
+ operation: {
1132
+ type: "string",
1133
+ enum: ["providers", "discover", "download", "info"],
1134
+ description: "Gateway operation to execute.",
1135
+ },
1136
+ source: {
1137
+ type: "string",
1138
+ enum: ["auto", "huggingface", "kaggle", "openml", "dataworld", "arxiv", "github", "s3", "bigquery"],
1139
+ description: "Optional provider selector. Use 'auto' to let Vesper choose a compatible backend.",
1140
+ },
1141
+ query: {
1142
+ type: "string",
1143
+ description: "Dataset discovery query. Required for operation='discover'.",
1144
+ },
1145
+ dataset_id: {
1146
+ type: "string",
1147
+ description: "Dataset identifier or object reference. Required for operation='download' and operation='info'. Supports prefixed ids like 'huggingface:user/dataset' and public S3 URIs like 's3://bucket/key'.",
1148
+ },
1149
+ limit: {
1150
+ type: "number",
1151
+ description: "Max results for operation='discover' (default: 10).",
1152
+ },
1153
+ target_dir: {
1154
+ type: "string",
1155
+ description: "Optional output directory for operation='download'. Defaults to the current working directory when omitted.",
1156
+ },
1157
+ output_dir: {
1158
+ type: "string",
1159
+ description: "Alias for target_dir. Defaults to the current working directory when omitted.",
1160
+ },
1161
+ public_only: {
1162
+ type: "boolean",
1163
+ description: "When true, discover/info stay on public providers only unless a specific source is requested.",
1164
+ },
1165
+ include_unavailable: {
1166
+ type: "boolean",
1167
+ description: "When true, operation='providers' includes connectors that are scaffolded but not currently configured.",
1168
+ },
1169
+ },
1170
+ required: ["operation"],
1171
+ },
1172
+ },
1173
+ {
1174
+ name: "vesper_web_find",
1175
+ description: "Phase 1 Web Core: search web-native sources (ArXiv, GitHub) and return structured, validated documents using a unified schema (source_type, source_url, content, metadata_json, quality_score, collected_at, content_type).",
1176
+ inputSchema: {
1177
+ type: "object",
1178
+ properties: {
1179
+ query: { type: "string", description: "Natural language query, e.g. 'agentic RAG evaluation'" },
1180
+ sources: {
1181
+ type: "array",
1182
+ items: { type: "string", enum: ["arxiv", "github", "semantic_scholar", "hackernews"] },
1183
+ description: "Optional subset of sources. Defaults to ['arxiv','github'] when omitted.",
1184
+ },
1185
+ limit: { type: "number", description: "Max documents to return (default 10, max 50)." },
1186
+ arxiv_full_text: { type: "boolean", description: "When true, fetch and parse ArXiv PDFs and return full text as document content (slower)." },
1187
+ github_include_readme: { type: "boolean", description: "When true, fetch and include GitHub README.md text as document content (slower)." },
1188
+ },
1189
+ required: ["query"],
1190
+ },
1191
+ },
1192
+ {
1193
+ name: "vesper.fuse",
1194
+ description: "Phase 2 Data Fusion: fuse results from multiple web-native sources into one unified, deduplicated corpus (provenance via source_chain).",
1195
+ inputSchema: {
1196
+ type: "object",
1197
+ properties: {
1198
+ sources: {
1199
+ type: "array",
1200
+ description: "Web sources to collect from, each with its own query.",
1201
+ items: {
1202
+ type: "object",
1203
+ properties: {
1204
+ type: { type: "string", enum: ["arxiv", "github", "semantic_scholar", "hackernews", "s3"] },
1205
+ query: { type: "string", description: "Query for this source." },
1206
+ max_results: { type: "number", description: "Max results for this source (optional)." },
1207
+ min_stars: { type: "number", description: "Optional popularity filter (GitHub) based on stars/proxy fields." },
1208
+ bucket: { type: "string", description: "S3 bucket (for type='s3')." },
1209
+ path: { type: "string", description: "S3 prefix/path (for type='s3')." },
1210
+ region: { type: "string", description: "AWS region (for type='s3')." },
1211
+ credentials: {
1212
+ type: "object",
1213
+ description: "Pass-through AWS credentials (optional; not persisted).",
1214
+ properties: {
1215
+ accessKeyId: { type: "string" },
1216
+ secretAccessKey: { type: "string" },
1217
+ sessionToken: { type: "string" },
1218
+ roleArn: { type: "string" },
1219
+ }
1220
+ },
1221
+ },
1222
+ required: ["type", "query"],
1223
+ },
1224
+ },
1225
+ merge_strategy: {
1226
+ type: "string",
1227
+ enum: ["union", "dedup"],
1228
+ description: "How to merge collected documents.",
1229
+ },
1230
+ deduplication: {
1231
+ type: "string",
1232
+ enum: ["semantic", "exact", "none"],
1233
+ description: "How to deduplicate across sources.",
1234
+ },
1235
+ },
1236
+ required: ["sources"],
1237
+ },
1238
+ },
1239
+ {
1240
+ name: "vesper.extract_web",
1241
+ description: "Phase 3 Structured Web extraction. Whitelist-only domains, deterministic extraction (tables/lists/infobox), schema validation, and cache fallback on live extraction failure.",
1242
+ inputSchema: {
1243
+ type: "object",
1244
+ properties: {
1245
+ url: { type: "string", description: "Target URL from approved whitelist domains." },
1246
+ mode: { type: "string", enum: ["auto", "table", "list", "infobox"], description: "Extraction mode (default auto)." },
1247
+ strict_schema: { type: "boolean", description: "When true (default), enforce domain-specific required fields." },
1248
+ schema: {
1249
+ type: "object",
1250
+ properties: {
1251
+ required_fields: {
1252
+ type: "array",
1253
+ items: { type: "string" },
1254
+ description: "Optional required top-level fields in extracted data payload."
1255
+ }
1256
+ }
1257
+ }
1258
+ },
1259
+ required: ["url"],
1260
+ },
1261
+ },
715
1262
  {
716
1263
  name: "discover_datasets",
717
- description: "Discover datasets from a specific source. Kaggle is optional and requires user-provided API key.",
1264
+ description: "Discover datasets from a specific source. Public providers work keylessly; Kaggle and data.world can also be exposed through server-managed credentials.",
718
1265
  inputSchema: {
719
1266
  type: "object",
720
1267
  properties: {
@@ -724,7 +1271,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
724
1271
  },
725
1272
  source: {
726
1273
  type: "string",
727
- enum: ["huggingface", "kaggle", "openml", "dataworld"],
1274
+ enum: ["huggingface", "kaggle", "openml", "dataworld", "arxiv", "github"],
728
1275
  description: "Data source to discover from.",
729
1276
  },
730
1277
  limit: {
@@ -737,7 +1284,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
737
1284
  },
738
1285
  {
739
1286
  name: "download_dataset",
740
- description: "Download a dataset by source and ID/slug into local Vesper storage. Defaults to HuggingFace. Kaggle and data.world require API keys (use configure_keys first).",
1287
+ description: "Download a dataset by source and ID/slug into a local directory. Defaults to HuggingFace. Public providers work keylessly, while Kaggle and data.world can use server-managed credentials when configured.",
741
1288
  inputSchema: {
742
1289
  type: "object",
743
1290
  properties: {
@@ -752,7 +1299,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
752
1299
  },
753
1300
  target_dir: {
754
1301
  type: "string",
755
- description: "Optional target directory for downloaded files.",
1302
+ description: "Optional target directory for downloaded files. Defaults to the current working directory when omitted.",
1303
+ },
1304
+ output_dir: {
1305
+ type: "string",
1306
+ description: "Alias for target_dir. Defaults to the current working directory when omitted.",
756
1307
  }
757
1308
  },
758
1309
  required: ["dataset_id"],
@@ -770,6 +1321,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
770
1321
  kaggle_ref: { type: "string", description: "Kaggle dataset ref (owner/dataset)." },
771
1322
  urls: { type: "array", items: { type: "string" }, description: "Direct asset URLs." },
772
1323
  output_format: { type: "string", enum: ["webdataset", "imagefolder", "parquet"], description: "Output asset format." },
1324
+ target_dir: { type: "string", description: "Optional local directory where downloaded assets should be written. If provided, Vesper writes directly to this directory instead of managed asset storage." },
1325
+ output_dir: { type: "string", description: "Alias for target_dir. When provided, downloaded assets are written directly to this local directory." },
773
1326
  max_items: { type: "number", description: "Optional cap on number of assets to fetch." },
774
1327
  workers: { type: "number", description: "Parallel worker count (default 8)." },
775
1328
  image_column: { type: "string", description: "Explicit image column name. If omitted, auto-detected from HF features, column names, and sample values." },
@@ -877,6 +1430,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
877
1430
  properties: {
878
1431
  query: { type: "string" },
879
1432
  requirements: { type: "string" },
1433
+ target_dir: { type: "string", description: "Optional local directory for the prepared dataset. Defaults to the current working directory when omitted." },
1434
+ output_dir: { type: "string", description: "Alias for target_dir. Defaults to the current working directory when omitted." },
880
1435
  download_images: { type: "boolean", description: "When true, enables post-prepare smart asset downloading for image/media datasets." },
881
1436
  cleaning_options: { type: "object" },
882
1437
  split_config: { type: "object" },
@@ -921,7 +1476,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
921
1476
  },
922
1477
  target_dir: {
923
1478
  type: "string",
924
- description: "Optional custom local directory for export (e.g., './naruto-quotes').",
1479
+ description: "Optional custom local directory for export. Defaults to the current working directory when omitted.",
1480
+ },
1481
+ output_dir: {
1482
+ type: "string",
1483
+ description: "Alias for target_dir. Defaults to the current working directory when omitted.",
925
1484
  },
926
1485
  format: {
927
1486
  type: "string",
@@ -962,6 +1521,25 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
962
1521
  properties: {},
963
1522
  },
964
1523
  },
1524
+ {
1525
+ name: "vesper_convert_format",
1526
+ description: "Convert a dataset file between formats (CSV, Parquet, JSON, JSONL). Auto-detects input format from extension. Saves output in the same directory with the new extension and registers it in the Vesper registry.",
1527
+ inputSchema: {
1528
+ type: "object",
1529
+ properties: {
1530
+ file_path: {
1531
+ type: "string",
1532
+ description: "Absolute path to the input dataset file.",
1533
+ },
1534
+ target_format: {
1535
+ type: "string",
1536
+ enum: ["csv", "parquet", "json", "jsonl"],
1537
+ description: "The desired output format.",
1538
+ },
1539
+ },
1540
+ required: ["file_path", "target_format"],
1541
+ },
1542
+ },
965
1543
  {
966
1544
  name: "fuse_datasets",
967
1545
  description: "Fuse/combine multiple datasets via concat or join. Optionally runs quality & leakage checks afterward.",
@@ -1069,925 +1647,1225 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1069
1647
  ],
1070
1648
  };
1071
1649
  });
1072
- // Call Tool
1650
+ // Call Tool — all requests are serialized through a queue to prevent crashes from parallel calls
1073
1651
  server.setRequestHandler(CallToolRequestSchema, async (request) => {
1074
- // --- Pipeline Enforcement ---
1075
- // Map tool names to pipeline steps
1076
- const toolToStep = {
1077
- vesper_search: "search",
1078
- vesper_download: "download",
1079
- vesper_analyze: "analyze",
1080
- vesper_clean: "clean",
1081
- vesper_split: "split",
1082
- vesper_export: "export",
1083
- prepare_dataset: "prepare",
1084
- };
1085
- // Extract dataset_id if present and normalize
1086
- let datasetId = request.params.arguments?.dataset_id || request.params.arguments?.query || "";
1087
- if (datasetId)
1088
- datasetId = parseDatasetId(String(datasetId));
1089
- // Pipeline rules
1090
- const stepOrder = ["search", "download", "analyze", "clean", "split", "export"];
1091
- const prereqs = {
1092
- vesper_download: ["search"],
1093
- vesper_analyze: ["download"],
1094
- vesper_clean: ["analyze"],
1095
- vesper_split: ["clean"],
1096
- vesper_export: ["split"],
1097
- };
1098
- const tool = String(request.params.name);
1099
- const step = toolToStep[tool];
1100
- if (step && datasetId) {
1101
- // Check prerequisites
1102
- const required = prereqs[tool] || [];
1103
- for (const req of required) {
1104
- if (!hasStep(String(datasetId), req)) {
1105
- // Auto-run missing step if possible, else error
1106
- // For export, auto-run prepare_dataset if split missing
1107
- if (tool === "vesper_export" && req === "split") {
1108
- // Auto-trigger prepare_dataset (start a background prepare job)
1109
- try {
1110
- jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false });
1111
- // Mark split as complete so export can proceed; export handler will also wait for data if needed.
1112
- markStepComplete(String(datasetId), "split");
1652
+ return requestQueue.enqueue(async () => {
1653
+ // --- Pipeline Enforcement ---
1654
+ // Map tool names to pipeline steps
1655
+ const toolToStep = {
1656
+ vesper_search: "search",
1657
+ vesper_download: "download",
1658
+ vesper_analyze: "analyze",
1659
+ vesper_clean: "clean",
1660
+ vesper_split: "split",
1661
+ vesper_export: "export",
1662
+ prepare_dataset: "prepare",
1663
+ };
1664
+ // Extract dataset_id if present and normalize
1665
+ let datasetId = request.params.arguments?.dataset_id || request.params.arguments?.query || "";
1666
+ if (datasetId)
1667
+ datasetId = parseDatasetId(String(datasetId));
1668
+ // Pipeline rules
1669
+ const stepOrder = ["search", "download", "analyze", "clean", "split", "export"];
1670
+ const prereqs = {
1671
+ vesper_download: ["search"],
1672
+ vesper_analyze: ["download"],
1673
+ vesper_clean: ["analyze"],
1674
+ vesper_split: ["clean"],
1675
+ vesper_export: ["split"],
1676
+ };
1677
+ const tool = String(request.params.name);
1678
+ const step = toolToStep[tool];
1679
+ if (step && datasetId) {
1680
+ // Check prerequisites
1681
+ const required = prereqs[tool] || [];
1682
+ for (const req of required) {
1683
+ if (!hasStep(String(datasetId), req)) {
1684
+ // Auto-run missing step if possible, else error
1685
+ // For export, auto-run prepare_dataset if split missing
1686
+ if (tool === "vesper_export" && req === "split") {
1687
+ // Auto-trigger prepare_dataset (start a background prepare job)
1688
+ try {
1689
+ jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false, outputDir: process.cwd() });
1690
+ // Mark split as complete so export can proceed; export handler will also wait for data if needed.
1691
+ markStepComplete(String(datasetId), "split");
1692
+ }
1693
+ catch (e) {
1694
+ console.error(`[Pipeline] Failed to auto-trigger prepare for ${datasetId}: ${e?.message || e}`);
1695
+ return {
1696
+ content: [{ type: "text", text: `ERROR: Failed to auto-run prepare for ${datasetId}. Please run prepare_dataset first.` }],
1697
+ isError: true,
1698
+ };
1699
+ }
1113
1700
  }
1114
- catch (e) {
1115
- console.error(`[Pipeline] Failed to auto-trigger prepare for ${datasetId}: ${e?.message || e}`);
1701
+ else {
1116
1702
  return {
1117
- content: [{ type: "text", text: `ERROR: Failed to auto-run prepare for ${datasetId}. Please run prepare_dataset first.` }],
1703
+ content: [{ type: "text", text: `ERROR: Cannot run ${tool} before ${req}. Please run ${req} first.` }],
1118
1704
  isError: true,
1119
1705
  };
1120
1706
  }
1121
1707
  }
1122
- else {
1708
+ }
1709
+ // Mark this step as complete
1710
+ markStepComplete(String(datasetId), String(step));
1711
+ }
1712
+ switch (request.params.name) {
1713
+ case "vesper_web_find": {
1714
+ hydrateExternalKeys();
1715
+ const query = String(request.params.arguments?.query || "").trim();
1716
+ const limit = Number(request.params.arguments?.limit || 10);
1717
+ const sources = Array.isArray(request.params.arguments?.sources)
1718
+ ? (request.params.arguments?.sources).map(s => String(s).trim().toLowerCase()).filter(Boolean)
1719
+ : undefined;
1720
+ try {
1721
+ const result = await webCoreEngine.find({
1722
+ query,
1723
+ sources: sources,
1724
+ limit,
1725
+ arxiv_full_text: request.params.arguments?.arxiv_full_text === true,
1726
+ github_include_readme: request.params.arguments?.github_include_readme === true,
1727
+ });
1123
1728
  return {
1124
- content: [{ type: "text", text: `ERROR: Cannot run ${tool} before ${req}. Please run ${req} first.` }],
1729
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1730
+ };
1731
+ }
1732
+ catch (error) {
1733
+ return {
1734
+ content: [{ type: "text", text: `ERROR: web_find failed: ${error.message}` }],
1125
1735
  isError: true,
1126
1736
  };
1127
1737
  }
1128
1738
  }
1129
- }
1130
- // Mark this step as complete
1131
- markStepComplete(String(datasetId), String(step));
1132
- }
1133
- switch (request.params.name) {
1134
- case "vesper_search": {
1135
- const query = String(request.params.arguments?.query);
1136
- const limit = 5;
1137
- const safeOnly = true; // Enable safe filter by default
1138
- const enableJIT = request.params.arguments?.enable_jit === true;
1139
- if (!query) {
1140
- throw new McpError(ErrorCode.InvalidParams, "Query is required");
1739
+ case "vesper.fuse": {
1740
+ hydrateExternalKeys();
1741
+ const sources = Array.isArray(request.params.arguments?.sources)
1742
+ ? request.params.arguments?.sources
1743
+ : undefined;
1744
+ if (!sources || !Array.isArray(sources)) {
1745
+ return {
1746
+ content: [{ type: "text", text: "ERROR: vesper.fuse requires 'sources' array." }],
1747
+ isError: true,
1748
+ };
1749
+ }
1750
+ try {
1751
+ const mergeStrategyRaw = request.params.arguments?.merge_strategy
1752
+ ? String(request.params.arguments?.merge_strategy).toLowerCase()
1753
+ : undefined;
1754
+ const dedupRaw = request.params.arguments?.deduplication
1755
+ ? String(request.params.arguments?.deduplication).toLowerCase()
1756
+ : undefined;
1757
+ const merge_strategy = mergeStrategyRaw && ["union", "dedup"].includes(mergeStrategyRaw)
1758
+ ? mergeStrategyRaw
1759
+ : undefined;
1760
+ const deduplication = dedupRaw && ["semantic", "exact", "none"].includes(dedupRaw)
1761
+ ? dedupRaw
1762
+ : undefined;
1763
+ const result = await webFusionEngine.fuse({
1764
+ sources: sources.map((s) => ({
1765
+ type: String(s?.type || "").trim().toLowerCase(),
1766
+ query: String(s?.query || "").trim(),
1767
+ max_results: s?.max_results !== undefined ? Number(s.max_results) : undefined,
1768
+ min_stars: s?.min_stars !== undefined ? Number(s.min_stars) : undefined,
1769
+ bucket: s?.bucket !== undefined ? String(s.bucket) : undefined,
1770
+ path: s?.path !== undefined ? String(s.path) : undefined,
1771
+ region: s?.region !== undefined ? String(s.region) : undefined,
1772
+ credentials: s?.credentials ? {
1773
+ accessKeyId: s.credentials.accessKeyId !== undefined ? String(s.credentials.accessKeyId) : undefined,
1774
+ secretAccessKey: s.credentials.secretAccessKey !== undefined ? String(s.credentials.secretAccessKey) : undefined,
1775
+ sessionToken: s.credentials.sessionToken !== undefined ? String(s.credentials.sessionToken) : undefined,
1776
+ roleArn: s.credentials.roleArn !== undefined ? String(s.credentials.roleArn) : undefined,
1777
+ } : undefined,
1778
+ })),
1779
+ merge_strategy,
1780
+ deduplication,
1781
+ });
1782
+ return {
1783
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1784
+ };
1785
+ }
1786
+ catch (error) {
1787
+ return {
1788
+ content: [{ type: "text", text: `ERROR: vesper.fuse failed: ${error.message}` }],
1789
+ isError: true,
1790
+ };
1791
+ }
1141
1792
  }
1142
- const results = await searchEngine.search(query, { limit, safeOnly, enableJIT });
1143
- const formattedOutput = formatSearchResults(results);
1144
- return {
1145
- content: [
1146
- {
1147
- type: "text",
1148
- text: formattedOutput,
1149
- },
1150
- ],
1151
- };
1152
- }
1153
- case "discover_datasets": {
1154
- hydrateExternalKeys();
1155
- const query = String(request.params.arguments?.query || "").trim();
1156
- const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1157
- const limit = Number(request.params.arguments?.limit || 10);
1158
- if (!query) {
1159
- throw new McpError(ErrorCode.InvalidParams, "query is required");
1793
+ case "vesper.extract_web": {
1794
+ hydrateExternalKeys();
1795
+ const url = String(request.params.arguments?.url || "").trim();
1796
+ const mode = request.params.arguments?.mode
1797
+ ? String(request.params.arguments?.mode).trim().toLowerCase()
1798
+ : "auto";
1799
+ const schema = request.params.arguments?.schema && typeof request.params.arguments.schema === "object"
1800
+ ? request.params.arguments.schema
1801
+ : undefined;
1802
+ if (!url) {
1803
+ return {
1804
+ content: [{ type: "text", text: "ERROR: vesper.extract_web requires 'url'." }],
1805
+ isError: true,
1806
+ };
1807
+ }
1808
+ try {
1809
+ const out = await webExtractorEngine.extract({
1810
+ url,
1811
+ mode: mode,
1812
+ strict_schema: request.params.arguments?.strict_schema !== false,
1813
+ schema: schema,
1814
+ });
1815
+ return {
1816
+ content: [{ type: "text", text: JSON.stringify(out, null, 2) }],
1817
+ };
1818
+ }
1819
+ catch (error) {
1820
+ return {
1821
+ content: [{ type: "text", text: `ERROR: vesper.extract_web failed: ${error.message}` }],
1822
+ isError: true,
1823
+ };
1824
+ }
1160
1825
  }
1161
- try {
1162
- let results = [];
1163
- if (source === "kaggle") {
1164
- if (!dataIngestor.hasKaggleCredentials()) {
1826
+ case "unified_dataset_api": {
1827
+ hydrateExternalKeys();
1828
+ const operation = String(request.params.arguments?.operation || "").trim().toLowerCase();
1829
+ const source = String(request.params.arguments?.source || "auto").trim().toLowerCase();
1830
+ const includeUnavailable = request.params.arguments?.include_unavailable === true;
1831
+ const publicOnly = request.params.arguments?.public_only !== false;
1832
+ try {
1833
+ if (operation === "providers") {
1165
1834
  return {
1166
- content: [{ type: "text", text: `Kaggle requires API credentials. Use the configure_keys tool to set kaggle_username and kaggle_key, or try source='huggingface' which works without credentials.` }],
1167
- isError: true,
1835
+ content: [{ type: "text", text: JSON.stringify({ providers: unifiedDatasetGateway.getProviderStatuses(includeUnavailable) }, null, 2) }],
1168
1836
  };
1169
1837
  }
1170
- results = await kaggleSource.discover(query, limit);
1171
- }
1172
- else if (source === "openml") {
1173
- const openmlSource = new OpenMLSource();
1174
- results = await openmlSource.discover(query, limit);
1175
- }
1176
- else if (source === "dataworld") {
1177
- if (!hasDataWorldToken()) {
1838
+ if (operation === "discover") {
1839
+ const query = String(request.params.arguments?.query || "").trim();
1840
+ if (!query) {
1841
+ throw new McpError(ErrorCode.InvalidParams, "query is required for operation='discover'");
1842
+ }
1843
+ const result = await unifiedDatasetGateway.discover({
1844
+ query,
1845
+ source,
1846
+ limit: Number(request.params.arguments?.limit || 10),
1847
+ publicOnly,
1848
+ });
1178
1849
  return {
1179
- content: [{ type: "text", text: "data.world requires API token. Run 'vespermcp config keys' and set dataworld_token." }],
1180
- isError: true,
1850
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1181
1851
  };
1182
1852
  }
1183
- const dataworldSource = new DataWorldSource();
1184
- results = await dataworldSource.discover(query, limit);
1185
- }
1186
- else {
1187
- const hf = new HuggingFaceScraper();
1188
- results = await hf.scrape(Math.max(1, limit), true, query);
1189
- }
1190
- const recipeScript = path.join(dataRoot, "python", "asset_downloader_engine.py");
1191
- for (const ds of results.slice(0, limit)) {
1192
- const info = {
1193
- dataset_id: ds.id,
1194
- id: ds.id,
1195
- source: ds.source,
1196
- repo_id: ds.id,
1197
- total_images: ds.total_examples || 0,
1198
- image_column: undefined,
1199
- recipes_dir: path.join(dataRoot, "recipes"),
1200
- };
1201
- try {
1202
- await runPythonJson(recipeScript, ["build_recipe", JSON.stringify(info)]);
1853
+ if (operation === "download") {
1854
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1855
+ if (!datasetId) {
1856
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='download'");
1857
+ }
1858
+ const requestedTargetDir = request.params.arguments?.target_dir
1859
+ ? String(request.params.arguments.target_dir).trim()
1860
+ : request.params.arguments?.output_dir
1861
+ ? String(request.params.arguments.output_dir).trim()
1862
+ : "";
1863
+ const targetDir = requestedTargetDir || process.cwd();
1864
+ try {
1865
+ await ensurePythonModules([{ module: "datasets", packageName: "datasets" }]);
1866
+ }
1867
+ catch {
1868
+ // best effort; non-HF providers do not require this
1869
+ }
1870
+ const result = await unifiedDatasetGateway.download({
1871
+ datasetId,
1872
+ source,
1873
+ targetDir,
1874
+ });
1875
+ try {
1876
+ upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
1877
+ }
1878
+ catch (e) {
1879
+ console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
1880
+ }
1881
+ return {
1882
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1883
+ };
1203
1884
  }
1204
- catch {
1205
- // best-effort recipe generation; ignore discovery-time recipe failures
1885
+ if (operation === "info") {
1886
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1887
+ if (!datasetId) {
1888
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='info'");
1889
+ }
1890
+ const result = await unifiedDatasetGateway.info({
1891
+ datasetId,
1892
+ source,
1893
+ publicOnly,
1894
+ });
1895
+ return {
1896
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1897
+ };
1206
1898
  }
1899
+ throw new McpError(ErrorCode.InvalidParams, `Unsupported operation: ${operation}`);
1900
+ }
1901
+ catch (error) {
1902
+ return {
1903
+ content: [{ type: "text", text: `ERROR: unified gateway failed: ${error.message}` }],
1904
+ isError: true,
1905
+ };
1207
1906
  }
1208
- const formattedOutput = formatSearchResults(results.slice(0, limit));
1209
- return {
1210
- content: [{ type: "text", text: formattedOutput }]
1211
- };
1212
- }
1213
- catch (error) {
1214
- return {
1215
- content: [{ type: "text", text: `ERROR: discover failed: ${error.message}` }],
1216
- isError: true,
1217
- };
1218
- }
1219
- }
1220
- case "download_dataset": {
1221
- hydrateExternalKeys();
1222
- const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1223
- const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1224
- if (!datasetId) {
1225
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1226
- }
1227
- if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
1228
- return {
1229
- content: [{ type: "text", text: `Kaggle requires API credentials. Use the configure_keys tool to set kaggle_username and kaggle_key, or switch to source='huggingface' which works without credentials.` }],
1230
- isError: true,
1231
- };
1232
1907
  }
1233
- if (source === "dataworld" && !hasDataWorldToken()) {
1908
+ case "vesper_search": {
1909
+ const query = String(request.params.arguments?.query);
1910
+ const limit = 5;
1911
+ const safeOnly = true; // Enable safe filter by default
1912
+ const enableJIT = request.params.arguments?.enable_jit === true;
1913
+ if (!query) {
1914
+ throw new McpError(ErrorCode.InvalidParams, "Query is required");
1915
+ }
1916
+ const results = await searchEngine.search(query, { limit, safeOnly, enableJIT });
1917
+ const formattedOutput = formatSearchResults(results);
1234
1918
  return {
1235
- content: [{ type: "text", text: "data.world requires API token. Use the configure_keys tool to set dataworld_token, or switch to source='huggingface' which works without credentials." }],
1236
- isError: true,
1919
+ content: [
1920
+ {
1921
+ type: "text",
1922
+ text: formattedOutput,
1923
+ },
1924
+ ],
1237
1925
  };
1238
1926
  }
1239
- // Pre-install Python datasets library for HuggingFace fallback
1240
- if (source === "huggingface") {
1927
+ case "discover_datasets": {
1928
+ hydrateExternalKeys();
1929
+ const query = String(request.params.arguments?.query || "").trim();
1930
+ const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1931
+ const limit = Number(request.params.arguments?.limit || 10);
1932
+ if (!query) {
1933
+ throw new McpError(ErrorCode.InvalidParams, "query is required");
1934
+ }
1241
1935
  try {
1242
- await ensurePythonModules([
1243
- { module: "datasets", packageName: "datasets" },
1244
- ]);
1936
+ const gatewayResult = await unifiedDatasetGateway.discover({
1937
+ query,
1938
+ source,
1939
+ limit,
1940
+ publicOnly: false,
1941
+ });
1942
+ const results = gatewayResult.results;
1943
+ const recipeScript = path.join(dataRoot, "python", "asset_downloader_engine.py");
1944
+ for (const ds of results.slice(0, limit)) {
1945
+ const info = {
1946
+ dataset_id: ds.id,
1947
+ id: ds.id,
1948
+ source: ds.source,
1949
+ repo_id: ds.id,
1950
+ total_images: ds.total_examples || 0,
1951
+ image_column: undefined,
1952
+ recipes_dir: path.join(dataRoot, "recipes"),
1953
+ };
1954
+ try {
1955
+ await runPythonJson(recipeScript, ["build_recipe", JSON.stringify(info)]);
1956
+ }
1957
+ catch {
1958
+ // best-effort recipe generation; ignore discovery-time recipe failures
1959
+ }
1960
+ }
1961
+ const formattedOutput = formatSearchResults(results.slice(0, limit));
1962
+ const noteBlock = gatewayResult.notes.length > 0
1963
+ ? `\nGateway notes:\n- ${gatewayResult.notes.join("\n- ")}`
1964
+ : "";
1965
+ return {
1966
+ content: [{ type: "text", text: `${formattedOutput}${noteBlock}` }]
1967
+ };
1245
1968
  }
1246
- catch {
1247
- // Continue - direct download may still work
1969
+ catch (error) {
1970
+ return {
1971
+ content: [{ type: "text", text: `ERROR: discover failed: ${error.message}` }],
1972
+ isError: true,
1973
+ };
1248
1974
  }
1249
1975
  }
1250
- try {
1251
- const localPath = await dataIngestor.ensureData(datasetId, source, () => undefined);
1976
+ case "download_dataset": {
1977
+ hydrateExternalKeys();
1978
+ const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1979
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1980
+ const requestedTargetDir = request.params.arguments?.target_dir
1981
+ ? String(request.params.arguments.target_dir).trim()
1982
+ : request.params.arguments?.output_dir
1983
+ ? String(request.params.arguments.output_dir).trim()
1984
+ : "";
1985
+ const targetDir = requestedTargetDir || process.cwd();
1986
+ if (!datasetId) {
1987
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1988
+ }
1989
+ // Pre-install Python datasets library for HuggingFace fallback
1990
+ if (source === "huggingface") {
1991
+ try {
1992
+ await ensurePythonModules([
1993
+ { module: "datasets", packageName: "datasets" },
1994
+ ]);
1995
+ }
1996
+ catch {
1997
+ // Continue - direct download may still work
1998
+ }
1999
+ }
1252
2000
  try {
1253
- upsertRegistry(datasetId, localPath, "completed");
2001
+ const result = await unifiedDatasetGateway.download({
2002
+ datasetId,
2003
+ source,
2004
+ targetDir,
2005
+ });
2006
+ try {
2007
+ upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
2008
+ }
2009
+ catch (e) {
2010
+ console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
2011
+ }
2012
+ const noteBlock = result.notes.length > 0 ? `\nNotes:\n- ${result.notes.join("\n- ")}` : "";
2013
+ return {
2014
+ content: [{ type: "text", text: `Download complete: ${result.copied_to || result.local_path}${noteBlock}` }]
2015
+ };
1254
2016
  }
1255
- catch (e) {
1256
- console.error(`[Registry] Failed to write registry for ${datasetId}: ${e?.message || e}`);
2017
+ catch (error) {
2018
+ return {
2019
+ content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
2020
+ isError: true,
2021
+ };
1257
2022
  }
1258
- return {
1259
- content: [{ type: "text", text: `Download complete: ${localPath}` }]
1260
- };
1261
2023
  }
1262
- catch (error) {
1263
- return {
1264
- content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
1265
- isError: true,
2024
+ case "vesper_download_assets": {
2025
+ hydrateExternalKeys();
2026
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
2027
+ const source = String(request.params.arguments?.source || "").trim().toLowerCase();
2028
+ // Auto-infer repo_id from dataset_id if not provided (common for HuggingFace)
2029
+ const repoId = request.params.arguments?.repo_id
2030
+ ? String(request.params.arguments.repo_id)
2031
+ : (source === "huggingface" && datasetId.includes("/") ? datasetId : undefined);
2032
+ const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
2033
+ const urls = Array.isArray(request.params.arguments?.urls)
2034
+ ? (request.params.arguments?.urls).map(v => String(v))
2035
+ : undefined;
2036
+ const outputFormat = String(request.params.arguments?.output_format || "webdataset");
2037
+ const requestedOutputDir = request.params.arguments?.target_dir
2038
+ ? String(request.params.arguments.target_dir).trim()
2039
+ : request.params.arguments?.output_dir
2040
+ ? String(request.params.arguments.output_dir).trim()
2041
+ : undefined;
2042
+ const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
2043
+ const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
2044
+ const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
2045
+ if (!datasetId || !source) {
2046
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
2047
+ }
2048
+ if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
2049
+ return {
2050
+ content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
2051
+ isError: true,
2052
+ };
2053
+ }
2054
+ const requiredModules = [
2055
+ { module: "aiohttp", packageName: "aiohttp" },
2056
+ ];
2057
+ if (source === "url") {
2058
+ requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
2059
+ }
2060
+ if (source === "huggingface") {
2061
+ requiredModules.push({ module: "datasets", packageName: "datasets" });
2062
+ requiredModules.push({ module: "PIL", packageName: "Pillow" });
2063
+ }
2064
+ if (source === "kaggle") {
2065
+ requiredModules.push({ module: "kaggle", packageName: "kaggle" });
2066
+ }
2067
+ try {
2068
+ await ensurePythonModules(requiredModules);
2069
+ }
2070
+ catch (error) {
2071
+ return {
2072
+ content: [{ type: "text", text: `ERROR: Python dependency setup failed: ${error.message}` }],
2073
+ isError: true,
2074
+ };
2075
+ }
2076
+ const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
2077
+ const payload = {
2078
+ dataset_id: datasetId,
2079
+ source,
2080
+ repo_id: repoId,
2081
+ kaggle_ref: kaggleRef,
2082
+ urls,
2083
+ output_format: outputFormat,
2084
+ output_dir: requestedOutputDir,
2085
+ max_items: maxItems,
2086
+ workers,
2087
+ image_column: imageColumn,
2088
+ output_root: requestedOutputDir || process.cwd(),
2089
+ recipes_dir: path.join(dataRoot, "recipes"),
1266
2090
  };
2091
+ try {
2092
+ const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
2093
+ if (!result?.ok) {
2094
+ const errMsg = result?.error || "Unknown error";
2095
+ // Enhance error messages for common failures
2096
+ let hint = "";
2097
+ if (errMsg.includes("No image column")) {
2098
+ hint = "\n\nHint: Specify image_column parameter with the column name containing images/URLs.";
2099
+ }
2100
+ else if (errMsg.includes("Authentication") || errMsg.includes("401") || errMsg.includes("403")) {
2101
+ hint = "\n\nHint: Use configure_keys tool to set HF_TOKEN for gated/private datasets.";
2102
+ }
2103
+ return {
2104
+ content: [{ type: "text", text: `ERROR: asset download failed: ${errMsg}${hint}` }],
2105
+ isError: true,
2106
+ };
2107
+ }
2108
+ return {
2109
+ content: [{ type: "text", text: JSON.stringify(result.result, null, 2) }],
2110
+ };
2111
+ }
2112
+ catch (error) {
2113
+ return {
2114
+ content: [{ type: "text", text: `ERROR: asset downloader execution failed: ${error.message}` }],
2115
+ isError: true,
2116
+ };
2117
+ }
1267
2118
  }
1268
- }
1269
- case "vesper_download_assets": {
1270
- hydrateExternalKeys();
1271
- const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1272
- const source = String(request.params.arguments?.source || "").trim().toLowerCase();
1273
- // Auto-infer repo_id from dataset_id if not provided (common for HuggingFace)
1274
- const repoId = request.params.arguments?.repo_id
1275
- ? String(request.params.arguments.repo_id)
1276
- : (source === "huggingface" && datasetId.includes("/") ? datasetId : undefined);
1277
- const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
1278
- const urls = Array.isArray(request.params.arguments?.urls)
1279
- ? (request.params.arguments?.urls).map(v => String(v))
1280
- : undefined;
1281
- const outputFormat = String(request.params.arguments?.output_format || "webdataset");
1282
- const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
1283
- const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
1284
- const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
1285
- if (!datasetId || !source) {
1286
- throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
1287
- }
1288
- if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
2119
+ case "configure_kaggle": {
2120
+ const username = String(request.params.arguments?.username || "").trim();
2121
+ const key = String(request.params.arguments?.key || "").trim();
2122
+ if (!username || !key) {
2123
+ throw new McpError(ErrorCode.InvalidParams, "username and key are required");
2124
+ }
2125
+ const r1 = secureKeys.set("kaggle_username", username);
2126
+ const r2 = secureKeys.set("kaggle_key", key);
2127
+ process.env.KAGGLE_USERNAME = username;
2128
+ process.env.KAGGLE_KEY = key;
1289
2129
  return {
1290
- content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
1291
- isError: true,
2130
+ content: [{ type: "text", text: `Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
1292
2131
  };
1293
2132
  }
1294
- const requiredModules = [
1295
- { module: "aiohttp", packageName: "aiohttp" },
1296
- ];
1297
- if (source === "url") {
1298
- requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
1299
- }
1300
- if (source === "huggingface") {
1301
- requiredModules.push({ module: "datasets", packageName: "datasets" });
1302
- requiredModules.push({ module: "PIL", packageName: "Pillow" });
1303
- }
1304
- if (source === "kaggle") {
1305
- requiredModules.push({ module: "kaggle", packageName: "kaggle" });
1306
- }
1307
- try {
1308
- await ensurePythonModules(requiredModules);
1309
- }
1310
- catch (error) {
2133
+ case "configure_keys": {
2134
+ const hfToken = String(request.params.arguments?.hf_token || "").trim();
2135
+ const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
2136
+ const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
2137
+ const dataworldToken = String(request.params.arguments?.dataworld_token || "").trim();
2138
+ const saved = [];
2139
+ const methods = [];
2140
+ if (hfToken) {
2141
+ const r = secureKeys.set("hf_token", hfToken);
2142
+ if (r.ok) {
2143
+ process.env.HF_TOKEN = hfToken;
2144
+ saved.push("HF token");
2145
+ if (r.method)
2146
+ methods.push(r.method);
2147
+ }
2148
+ }
2149
+ if (kaggleUsername) {
2150
+ const r = secureKeys.set("kaggle_username", kaggleUsername);
2151
+ if (r.ok) {
2152
+ process.env.KAGGLE_USERNAME = kaggleUsername;
2153
+ saved.push("Kaggle username");
2154
+ if (r.method)
2155
+ methods.push(r.method);
2156
+ }
2157
+ }
2158
+ if (kaggleKey) {
2159
+ const r = secureKeys.set("kaggle_key", kaggleKey);
2160
+ if (r.ok) {
2161
+ process.env.KAGGLE_KEY = kaggleKey;
2162
+ saved.push("Kaggle key");
2163
+ if (r.method)
2164
+ methods.push(r.method);
2165
+ }
2166
+ }
2167
+ if (dataworldToken) {
2168
+ const r = secureKeys.set("dataworld_token", dataworldToken);
2169
+ if (r.ok) {
2170
+ process.env.DW_AUTH_TOKEN = dataworldToken;
2171
+ saved.push("data.world token");
2172
+ if (r.method)
2173
+ methods.push(r.method);
2174
+ }
2175
+ }
2176
+ if (saved.length === 0) {
2177
+ return {
2178
+ content: [{ type: "text", text: "No keys provided. Core Vesper tools continue to work without API keys." }]
2179
+ };
2180
+ }
1311
2181
  return {
1312
- content: [{ type: "text", text: `ERROR: Python dependency setup failed: ${error.message}` }],
1313
- isError: true,
2182
+ content: [{ type: "text", text: `Key saved securely. Updated: ${saved.join(", ")}.` }]
1314
2183
  };
1315
2184
  }
1316
- const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
1317
- const payload = {
1318
- dataset_id: datasetId,
1319
- source,
1320
- repo_id: repoId,
1321
- kaggle_ref: kaggleRef,
1322
- urls,
1323
- output_format: outputFormat,
1324
- max_items: maxItems,
1325
- workers,
1326
- image_column: imageColumn,
1327
- output_root: path.join(dataRoot, "data", "assets"),
1328
- recipes_dir: path.join(dataRoot, "recipes"),
1329
- };
1330
- try {
1331
- const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
1332
- if (!result?.ok) {
1333
- const errMsg = result?.error || "Unknown error";
1334
- // Enhance error messages for common failures
1335
- let hint = "";
1336
- if (errMsg.includes("No image column")) {
1337
- hint = "\n\nHint: Specify image_column parameter with the column name containing images/URLs.";
1338
- }
1339
- else if (errMsg.includes("Authentication") || errMsg.includes("401") || errMsg.includes("403")) {
1340
- hint = "\n\nHint: Use configure_keys tool to set HF_TOKEN for gated/private datasets.";
2185
+ case "get_dataset_info": {
2186
+ const datasetId = String(request.params.arguments?.dataset_id);
2187
+ if (!datasetId) {
2188
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
2189
+ }
2190
+ const dataset = metadataStore.getDataset(datasetId);
2191
+ if (!dataset) {
2192
+ // Fallback: check the registry for local path info
2193
+ const regEntry = getRegistryEntry(datasetId);
2194
+ const regPath = regEntry?.local_path || regEntry?.path;
2195
+ if (regEntry) {
2196
+ const exists = regPath && fs.existsSync(regPath);
2197
+ return {
2198
+ content: [{ type: "text", text: `**${datasetId}** (from registry)\n- Local path: ${regPath || "unknown"}\n- Status: ${regEntry.status || "unknown"}${exists ? "" : " (file missing)"}\n\nNote: Full metadata not available in metadata store. Use prepare_dataset to get full details.` }],
2199
+ };
1341
2200
  }
1342
2201
  return {
1343
- content: [{ type: "text", text: `ERROR: asset download failed: ${errMsg}${hint}` }],
2202
+ content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}. Use vesper_list_datasets to see available datasets, or prepare_dataset to add new ones.` }],
1344
2203
  isError: true,
1345
2204
  };
1346
2205
  }
2206
+ // Enrich: if total_examples is 0-ish, try the HF datasets-server /size API
2207
+ if ((!dataset.total_examples || dataset.total_examples === 0) && dataset.source === "huggingface") {
2208
+ try {
2209
+ const sizeResp = await fetch(`https://datasets-server.huggingface.co/size?dataset=${encodeURIComponent(dataset.id)}`);
2210
+ if (sizeResp.ok) {
2211
+ const sizeData = await sizeResp.json();
2212
+ const numRows = sizeData?.size?.dataset?.num_rows;
2213
+ if (numRows && numRows > 0) {
2214
+ dataset.total_examples = numRows;
2215
+ // Also backfill splits
2216
+ if (sizeData?.size?.splits && Array.isArray(sizeData.size.splits)) {
2217
+ dataset.splits = sizeData.size.splits.map((s) => ({
2218
+ name: s.split,
2219
+ num_examples: s.num_rows || 0,
2220
+ size_bytes: s.num_bytes_parquet_files || 0,
2221
+ }));
2222
+ dataset.has_train_split = dataset.splits.some((s) => s.name === "train");
2223
+ dataset.has_test_split = dataset.splits.some((s) => s.name === "test");
2224
+ dataset.has_validation_split = dataset.splits.some((s) => s.name === "validation" || s.name === "val");
2225
+ }
2226
+ // Persist enriched metadata
2227
+ metadataStore.saveDataset(dataset);
2228
+ }
2229
+ }
2230
+ }
2231
+ catch {
2232
+ // Enrichment is best-effort; continue with whatever we have
2233
+ }
2234
+ }
2235
+ const formattedOutput = formatDatasetInfo(dataset);
2236
+ return { content: [{ type: "text", text: formattedOutput }] };
2237
+ }
2238
+ case "analyze_quality": {
2239
+ const datasetId = String(request.params.arguments?.dataset_id);
2240
+ const safeId = toSafeDatasetPathFragment(datasetId);
2241
+ const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
2242
+ const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
2243
+ let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
2244
+ // Demo Fallback for easy testing
2245
+ if (datasetId === "demo" || !fs.existsSync(filePath)) {
2246
+ const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
2247
+ const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
2248
+ if (fs.existsSync(demoParquetPath)) {
2249
+ filePath = demoParquetPath;
2250
+ }
2251
+ else if (fs.existsSync(demoCsvPath)) {
2252
+ filePath = demoCsvPath;
2253
+ }
2254
+ else if (datasetId !== "demo") {
2255
+ return {
2256
+ content: [{ type: "text", text: `ERROR: Local data file for ${datasetId} not found. Try running prepare_dataset first, or use 'demo' as the dataset_id.` }],
2257
+ isError: true
2258
+ };
2259
+ }
2260
+ }
2261
+ const report = await qualityAnalyzer.analyze(filePath);
1347
2262
  return {
1348
- content: [{ type: "text", text: JSON.stringify(result.result, null, 2) }],
2263
+ content: [{ type: "text", text: JSON.stringify(report, null, 2) }]
1349
2264
  };
1350
2265
  }
1351
- catch (error) {
2266
+ case "preview_cleaning": {
2267
+ const datasetId = String(request.params.arguments?.dataset_id);
2268
+ const safeId = toSafeDatasetPathFragment(datasetId);
2269
+ const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
2270
+ const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
2271
+ let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
2272
+ if (datasetId === "demo" || !fs.existsSync(filePath)) {
2273
+ const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
2274
+ const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
2275
+ if (fs.existsSync(demoParquetPath)) {
2276
+ filePath = demoParquetPath;
2277
+ }
2278
+ else if (fs.existsSync(demoCsvPath)) {
2279
+ filePath = demoCsvPath;
2280
+ }
2281
+ else {
2282
+ throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}. Please run prepare_dataset first.`);
2283
+ }
2284
+ }
2285
+ const report = await qualityAnalyzer.analyze(filePath);
2286
+ // Phase 1: Target Detection
2287
+ // We use the same TargetDetector instance inside CleaningPlanner now?
2288
+ // Actually, we instantiated it inside CleaningPlanner, so we just need to pass the file path
2289
+ // OR let the planner handle it if we update its signature to accept filePath.
2290
+ // Let's check `CleaningPlanner.generatePlan` signature again.
2291
+ // We updated it to accept `targetInfo`.
2292
+ // So we need to run detection HERE and pass it.
2293
+ // But `TargetDetector` is not exposed in `index.ts` scope yet.
2294
+ // Let's create a global instance or use the one inside planner if exposed (it's private).
2295
+ // Better approach: Instantiate TargetDetector here in index.ts for the tool content.
2296
+ // Quick fix: Instantiate local detector or make global.
2297
+ // I'll make a global `targetDetector` constant in index.ts
2298
+ // But wait, I updated `CleaningPlanner` to instantiate its own detector.
2299
+ // Does `CleaningPlanner` use it? No, I commented out the logic because it needed `filePath`.
2300
+ // RETRY STRATEGY:
2301
+ // 1. Instantiate `targetDetector` in `index.ts`.
2302
+ // 2. Run `detectTarget(filePath)`.
2303
+ // 3. Pass result to `cleaningPlanner.generatePlan(..., targetInfo)`.
2304
+ // I need to add `const targetDetector = new TargetDetector(__dirname);` to imports/init section first.
2305
+ // But since I'm in this tool, I can't look back.
2306
+ // I will assume I can add it, or just do it inside the case for now.
2307
+ // To do it properly, I should have added `targetDetector` to the global scope in previous step.
2308
+ // Let's do that in a separate step if needed.
2309
+ // For now, I'll instantiate it here.
2310
+ const { TargetDetector } = await import("./preparation/target-detector.js");
2311
+ const detector = new TargetDetector(__dirname);
2312
+ const targetResult = await detector.detectTarget(filePath);
2313
+ const targetInfo = targetResult.target_column ? {
2314
+ target: targetResult.target_column,
2315
+ confidence: targetResult.confidence
2316
+ } : undefined;
2317
+ const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
2318
+ let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
2319
+ if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
2320
+ explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
2321
+ explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
2322
+ }
2323
+ explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
2324
+ if (plan.operations.length === 0) {
2325
+ explanation += "No cleaning operations required.";
2326
+ }
2327
+ else {
2328
+ plan.operations.forEach((op, i) => {
2329
+ explanation += `${i + 1}. **${op.type}**: ${op.reason}\n`;
2330
+ });
2331
+ }
1352
2332
  return {
1353
- content: [{ type: "text", text: `ERROR: asset downloader execution failed: ${error.message}` }],
1354
- isError: true,
2333
+ content: [{ type: "text", text: explanation }]
1355
2334
  };
1356
2335
  }
1357
- }
1358
- case "configure_kaggle": {
1359
- const username = String(request.params.arguments?.username || "").trim();
1360
- const key = String(request.params.arguments?.key || "").trim();
1361
- if (!username || !key) {
1362
- throw new McpError(ErrorCode.InvalidParams, "username and key are required");
1363
- }
1364
- const r1 = secureKeys.set("kaggle_username", username);
1365
- const r2 = secureKeys.set("kaggle_key", key);
1366
- process.env.KAGGLE_USERNAME = username;
1367
- process.env.KAGGLE_KEY = key;
1368
- return {
1369
- content: [{ type: "text", text: `Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
1370
- };
1371
- }
1372
- case "configure_keys": {
1373
- const hfToken = String(request.params.arguments?.hf_token || "").trim();
1374
- const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
1375
- const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
1376
- const dataworldToken = String(request.params.arguments?.dataworld_token || "").trim();
1377
- const saved = [];
1378
- const methods = [];
1379
- if (hfToken) {
1380
- const r = secureKeys.set("hf_token", hfToken);
1381
- if (r.ok) {
1382
- process.env.HF_TOKEN = hfToken;
1383
- saved.push("HF token");
1384
- if (r.method)
1385
- methods.push(r.method);
2336
+ case "custom_clean": {
2337
+ const datasetId = String(request.params.arguments?.dataset_id);
2338
+ const ops = request.params.arguments?.operations;
2339
+ if (!datasetId || datasetId === "undefined") {
2340
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1386
2341
  }
1387
- }
1388
- if (kaggleUsername) {
1389
- const r = secureKeys.set("kaggle_username", kaggleUsername);
1390
- if (r.ok) {
1391
- process.env.KAGGLE_USERNAME = kaggleUsername;
1392
- saved.push("Kaggle username");
1393
- if (r.method)
1394
- methods.push(r.method);
2342
+ if (!ops || !Array.isArray(ops) || ops.length === 0) {
2343
+ throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
1395
2344
  }
1396
- }
1397
- if (kaggleKey) {
1398
- const r = secureKeys.set("kaggle_key", kaggleKey);
1399
- if (r.ok) {
1400
- process.env.KAGGLE_KEY = kaggleKey;
1401
- saved.push("Kaggle key");
1402
- if (r.method)
1403
- methods.push(r.method);
2345
+ // Pre-check: verify dataset file exists before starting the job
2346
+ const cleanRegEntry = getRegistryEntry(datasetId);
2347
+ const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
2348
+ const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
2349
+ const cleanSafeId = toSafeDatasetPathFragment(datasetId);
2350
+ const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
2351
+ (cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
2352
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
2353
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
2354
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
2355
+ fs.existsSync(datasetId);
2356
+ if (!cleanDataExists) {
2357
+ return {
2358
+ content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
2359
+ isError: true,
2360
+ };
1404
2361
  }
2362
+ const job = jobManager.createJob("clean", 0, { datasetId, ops });
2363
+ return {
2364
+ content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
2365
+ };
1405
2366
  }
1406
- if (dataworldToken) {
1407
- const r = secureKeys.set("dataworld_token", dataworldToken);
1408
- if (r.ok) {
1409
- process.env.DW_AUTH_TOKEN = dataworldToken;
1410
- saved.push("data.world token");
1411
- if (r.method)
1412
- methods.push(r.method);
2367
+ case "prepare_dataset": {
2368
+ hydrateExternalKeys();
2369
+ const query = String(request.params.arguments?.query);
2370
+ const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
2371
+ const downloadImages = request.params.arguments?.download_images === true;
2372
+ const requestedOutputDir = request.params.arguments?.target_dir
2373
+ ? String(request.params.arguments.target_dir).trim()
2374
+ : request.params.arguments?.output_dir
2375
+ ? String(request.params.arguments.output_dir).trim()
2376
+ : "";
2377
+ const outputDir = requestedOutputDir || process.cwd();
2378
+ if (!query || query === "undefined") {
2379
+ throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
1413
2380
  }
1414
- }
1415
- if (saved.length === 0) {
2381
+ const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages, outputDir });
1416
2382
  return {
1417
- content: [{ type: "text", text: "No keys provided. Core Vesper tools continue to work without API keys." }]
2383
+ content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
1418
2384
  };
1419
2385
  }
1420
- return {
1421
- content: [{ type: "text", text: `Key saved securely. Updated: ${saved.join(", ")}.` }]
1422
- };
1423
- }
1424
- case "get_dataset_info": {
1425
- const datasetId = String(request.params.arguments?.dataset_id);
1426
- if (!datasetId) {
1427
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1428
- }
1429
- const dataset = metadataStore.getDataset(datasetId);
1430
- if (!dataset) {
2386
+ case "compare_datasets": {
2387
+ const datasetIds = request.params.arguments?.dataset_ids;
2388
+ const datasets = datasetIds.map(id => metadataStore.getDataset(id)).filter(d => !!d);
2389
+ let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
2390
+ comparison += "| :--- | " + datasets.map(() => " :---: ").join("|") + " |\n";
2391
+ comparison += "| Quality Score | " + datasets.map(d => `${d.quality_score}/100`).join(" | ") + " |\n";
2392
+ comparison += "| License | " + datasets.map(d => d.license.category).join(" | ") + " |\n";
2393
+ comparison += "| Downloads | " + datasets.map(d => d.downloads).join(" | ") + " |\n";
2394
+ comparison += "| Domain | " + datasets.map(d => d.domain).join(" | ") + " |\n";
1431
2395
  return {
1432
- content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}` }],
1433
- isError: true,
2396
+ content: [{ type: "text", text: comparison }]
1434
2397
  };
1435
2398
  }
1436
- const formattedOutput = formatDatasetInfo(dataset);
1437
- return { content: [{ type: "text", text: formattedOutput }] };
1438
- }
1439
- case "analyze_quality": {
1440
- const datasetId = String(request.params.arguments?.dataset_id);
1441
- const safeId = datasetId.replace(/\//g, "_");
1442
- const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
1443
- const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
1444
- let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
1445
- // Demo Fallback for easy testing
1446
- if (datasetId === "demo" || !fs.existsSync(filePath)) {
1447
- const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
1448
- const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
1449
- if (fs.existsSync(demoParquetPath)) {
1450
- filePath = demoParquetPath;
1451
- }
1452
- else if (fs.existsSync(demoCsvPath)) {
1453
- filePath = demoCsvPath;
1454
- }
1455
- else if (datasetId !== "demo") {
2399
+ case "check_job_status": {
2400
+ const jobId = String(request.params.arguments?.job_id);
2401
+ const job = metadataStore.getJob(jobId);
2402
+ if (!job) {
2403
+ throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
2404
+ }
2405
+ const activeStatuses = new Set(["pending", "queued", "running", "retrying"]);
2406
+ const now = Date.now();
2407
+ const last = jobStatusLastPoll[jobId] || 0;
2408
+ const minPollMs = 3000;
2409
+ if (activeStatuses.has(job.status) && (now - last) < minPollMs) {
2410
+ const waitMs = minPollMs - (now - last);
1456
2411
  return {
1457
- content: [{ type: "text", text: `ERROR: Local data file for ${datasetId} not found. Try running prepare_dataset first, or use 'demo' as the dataset_id.` }],
1458
- isError: true
2412
+ content: [{ type: "text", text: `No significant status change yet. Please poll again in ~${Math.ceil(waitMs / 1000)}s.` }]
1459
2413
  };
1460
2414
  }
2415
+ jobStatusLastPoll[jobId] = now;
2416
+ return {
2417
+ content: [{ type: "text", text: formatJobStatus(job) }]
2418
+ };
1461
2419
  }
1462
- const report = await qualityAnalyzer.analyze(filePath);
1463
- return {
1464
- content: [{ type: "text", text: JSON.stringify(report, null, 2) }]
1465
- };
1466
- }
1467
- case "preview_cleaning": {
1468
- const datasetId = String(request.params.arguments?.dataset_id);
1469
- const safeId = datasetId.replace(/\//g, "_");
1470
- const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
1471
- const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
1472
- let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
1473
- if (datasetId === "demo" || !fs.existsSync(filePath)) {
1474
- const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
1475
- const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
1476
- if (fs.existsSync(demoParquetPath)) {
1477
- filePath = demoParquetPath;
1478
- }
1479
- else if (fs.existsSync(demoCsvPath)) {
1480
- filePath = demoCsvPath;
2420
+ case "export_dataset": {
2421
+ const datasetId = String(request.params.arguments?.dataset_id);
2422
+ const isDirectLocalInput = isDirectLocalDatasetReference(datasetId);
2423
+ const requestedTargetDir = request.params.arguments?.target_dir
2424
+ ? String(request.params.arguments?.target_dir).trim()
2425
+ : request.params.arguments?.output_dir
2426
+ ? String(request.params.arguments?.output_dir).trim()
2427
+ : "";
2428
+ const targetDir = path.resolve(requestedTargetDir || process.cwd());
2429
+ const requestedFormat = String(request.params.arguments?.format || "feather");
2430
+ const fastMode = request.params.arguments?.fast === true;
2431
+ const preview = request.params.arguments?.preview === true;
2432
+ const sampleRows = request.params.arguments?.sample_rows ? Number(request.params.arguments.sample_rows) : undefined;
2433
+ const columns = request.params.arguments?.columns;
2434
+ const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
2435
+ // Use Metadata or Registry to find the actual local file
2436
+ const preferredLookupDirs = [targetDir, process.cwd()];
2437
+ let sourcePath = resolveDatasetLocalPath(datasetId, preferredLookupDirs);
2438
+ if (!sourcePath) {
2439
+ console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
2440
+ // Start a prepare job for this dataset id (acts like calling prepare_dataset)
2441
+ try {
2442
+ jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false, outputDir: process.cwd() });
2443
+ }
2444
+ catch (e) {
2445
+ console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
2446
+ }
2447
+ // Poll for download status or registry entry until local_path appears or timeout
2448
+ const wait = (ms) => new Promise(res => setTimeout(res, ms));
2449
+ const maxWait = 120_000; // 120s
2450
+ const interval = 2000;
2451
+ let waited = 0;
2452
+ while (waited < maxWait) {
2453
+ const resolved = resolveDatasetLocalPath(datasetId, preferredLookupDirs);
2454
+ if (resolved) {
2455
+ sourcePath = resolved;
2456
+ console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
2457
+ break;
2458
+ }
2459
+ await wait(interval);
2460
+ waited += interval;
2461
+ }
2462
+ // If still no sourcePath, return helpful error listing prepared datasets
2463
+ if (!sourcePath) {
2464
+ const entries = readRegistry();
2465
+ const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id || e.id || "unknown"}: ${e.local_path || e.path || "unknown"}`).slice(0, 10).join("\n") + (entries.length > 10 ? "\n...and " + (entries.length - 10) + " more" : "");
2466
+ return {
2467
+ content: [{ type: "text", text: `ERROR: No local data found for ${datasetId} after attempting prepare. Check credentials and try running prepare_dataset manually. Prepared datasets:\n${listText}` }],
2468
+ isError: true
2469
+ };
2470
+ }
2471
+ }
2472
+ sourcePath = ensureExportableLocalPath(sourcePath);
2473
+ try {
2474
+ if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
2475
+ upsertRegistry(datasetId, sourcePath, "completed");
2476
+ }
2477
+ }
2478
+ catch (e) {
2479
+ console.error(`[Registry] Failed to normalize registry path for ${datasetId}: ${e?.message || e}`);
2480
+ }
2481
+ // If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
2482
+ if (!fastMode) {
2483
+ const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
2484
+ const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
2485
+ const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
2486
+ if (!pipelineCompatibleInput) {
2487
+ console.error(`[Export] Skipping pipeline for ${currentExt} input; using direct exporter conversion.`);
2488
+ }
2489
+ else if (currentExt !== pipelineFmt) {
2490
+ console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
2491
+ try {
2492
+ sourcePath = ensureLocalPipelineSource(sourcePath, datasetId, targetDir);
2493
+ const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
2494
+ if (pipelineResult.final_output_path) {
2495
+ sourcePath = pipelineResult.final_output_path;
2496
+ try {
2497
+ // Update registry to point to pipeline's final output
2498
+ if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
2499
+ upsertRegistry(datasetId, sourcePath, "completed");
2500
+ }
2501
+ }
2502
+ catch (e) {
2503
+ console.error(`[Registry] Failed to update registry for ${datasetId}: ${e?.message || e}`);
2504
+ }
2505
+ }
2506
+ }
2507
+ catch (err) {
2508
+ console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
2509
+ }
2510
+ }
1481
2511
  }
1482
2512
  else {
1483
- throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}. Please run prepare_dataset first.`);
2513
+ console.error(`[Export] Fast mode - skipping quality analysis and cleaning`);
2514
+ }
2515
+ // Build export options
2516
+ const exportOpts = {};
2517
+ if (compression)
2518
+ exportOpts.compression = compression;
2519
+ if (preview)
2520
+ exportOpts.preview = true;
2521
+ if (sampleRows)
2522
+ exportOpts.sample_rows = sampleRows;
2523
+ if (columns)
2524
+ exportOpts.columns = columns;
2525
+ try {
2526
+ // Determine output file name
2527
+ const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
2528
+ const ext = extMap[requestedFormat] || ".feather";
2529
+ const safeName = getExportFileStem(datasetId);
2530
+ const outDir = targetDir;
2531
+ if (!fs.existsSync(outDir))
2532
+ fs.mkdirSync(outDir, { recursive: true });
2533
+ const outputFile = path.join(outDir, `${safeName}${ext}`);
2534
+ const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
2535
+ // Build rich response
2536
+ let msg = `**Export complete**\n`;
2537
+ msg += `- **File**: ${result.output_path}\n`;
2538
+ msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
2539
+ msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
2540
+ if (result.file_size_mb !== undefined)
2541
+ msg += `- **Size**: ${result.file_size_mb} MB\n`;
2542
+ if (result.elapsed_seconds !== undefined)
2543
+ msg += `- **Time**: ${result.elapsed_seconds}s\n`;
2544
+ if (result.preview_path)
2545
+ msg += `- **Preview**: ${result.preview_path}\n`;
2546
+ msg += `\n`;
2547
+ if (requestedFormat === "feather") {
2548
+ msg += `**Inspect with:**\n`;
2549
+ msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
2550
+ msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
2551
+ }
2552
+ else if (requestedFormat === "parquet") {
2553
+ msg += `**Inspect with:**\n`;
2554
+ msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
2555
+ msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
2556
+ }
2557
+ return { content: [{ type: "text", text: msg }] };
2558
+ }
2559
+ catch (error) {
2560
+ return {
2561
+ content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
2562
+ isError: true
2563
+ };
1484
2564
  }
1485
2565
  }
1486
- const report = await qualityAnalyzer.analyze(filePath);
1487
- // Phase 1: Target Detection
1488
- // We use the same TargetDetector instance inside CleaningPlanner now?
1489
- // Actually, we instantiated it inside CleaningPlanner, so we just need to pass the file path
1490
- // OR let the planner handle it if we update its signature to accept filePath.
1491
- // Let's check `CleaningPlanner.generatePlan` signature again.
1492
- // We updated it to accept `targetInfo`.
1493
- // So we need to run detection HERE and pass it.
1494
- // But `TargetDetector` is not exposed in `index.ts` scope yet.
1495
- // Let's create a global instance or use the one inside planner if exposed (it's private).
1496
- // Better approach: Instantiate TargetDetector here in index.ts for the tool content.
1497
- // Quick fix: Instantiate local detector or make global.
1498
- // I'll make a global `targetDetector` constant in index.ts
1499
- // But wait, I updated `CleaningPlanner` to instantiate its own detector.
1500
- // Does `CleaningPlanner` use it? No, I commented out the logic because it needed `filePath`.
1501
- // RETRY STRATEGY:
1502
- // 1. Instantiate `targetDetector` in `index.ts`.
1503
- // 2. Run `detectTarget(filePath)`.
1504
- // 3. Pass result to `cleaningPlanner.generatePlan(..., targetInfo)`.
1505
- // I need to add `const targetDetector = new TargetDetector(__dirname);` to imports/init section first.
1506
- // But since I'm in this tool, I can't look back.
1507
- // I will assume I can add it, or just do it inside the case for now.
1508
- // To do it properly, I should have added `targetDetector` to the global scope in previous step.
1509
- // Let's do that in a separate step if needed.
1510
- // For now, I'll instantiate it here.
1511
- const { TargetDetector } = await import("./preparation/target-detector.js");
1512
- const detector = new TargetDetector(__dirname);
1513
- const targetResult = await detector.detectTarget(filePath);
1514
- const targetInfo = targetResult.target_column ? {
1515
- target: targetResult.target_column,
1516
- confidence: targetResult.confidence
1517
- } : undefined;
1518
- const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
1519
- let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
1520
- if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
1521
- explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
1522
- explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
1523
- }
1524
- explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
1525
- if (plan.operations.length === 0) {
1526
- explanation += "No cleaning operations required.";
1527
- }
1528
- else {
1529
- plan.operations.forEach((op, i) => {
1530
- explanation += `${i + 1}. **${op.type}**: ${op.reason}\n`;
2566
+ case "vesper_list_datasets": {
2567
+ const entries = readRegistry();
2568
+ if (entries.length === 0) {
2569
+ return {
2570
+ content: [{ type: "text", text: "No prepared datasets found in the Vesper registry. Use prepare_dataset or download_dataset to add datasets." }]
2571
+ };
2572
+ }
2573
+ const lines = entries.map((e, i) => {
2574
+ const id = e.dataset_id || e.id || "unknown";
2575
+ const localPath = e.local_path || e.path || "unknown";
2576
+ const exists = typeof localPath === "string" && localPath !== "unknown" && fs.existsSync(localPath);
2577
+ return `${i + 1}. **${id}**\n Path: ${localPath}\n Status: ${e.status || "unknown"}${exists ? "" : " (file missing)"}`;
1531
2578
  });
1532
- }
1533
- return {
1534
- content: [{ type: "text", text: explanation }]
1535
- };
1536
- }
1537
- case "custom_clean": {
1538
- const datasetId = String(request.params.arguments?.dataset_id);
1539
- const ops = request.params.arguments?.operations;
1540
- if (!datasetId || datasetId === "undefined") {
1541
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1542
- }
1543
- if (!ops || !Array.isArray(ops) || ops.length === 0) {
1544
- throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
1545
- }
1546
- // Pre-check: verify dataset file exists before starting the job
1547
- const cleanRegEntry = getRegistryEntry(datasetId);
1548
- const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
1549
- const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
1550
- const cleanSafeId = datasetId.replace(/\//g, "_");
1551
- const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
1552
- (cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
1553
- fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
1554
- fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
1555
- fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
1556
- fs.existsSync(datasetId);
1557
- if (!cleanDataExists) {
1558
- return {
1559
- content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
1560
- isError: true,
1561
- };
1562
- }
1563
- const job = jobManager.createJob("clean", 0, { datasetId, ops });
1564
- return {
1565
- content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
1566
- };
1567
- }
1568
- case "prepare_dataset": {
1569
- hydrateExternalKeys();
1570
- const query = String(request.params.arguments?.query);
1571
- const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
1572
- const downloadImages = request.params.arguments?.download_images === true;
1573
- if (!query || query === "undefined") {
1574
- throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
1575
- }
1576
- const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
1577
- return {
1578
- content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
1579
- };
1580
- }
1581
- case "compare_datasets": {
1582
- const datasetIds = request.params.arguments?.dataset_ids;
1583
- const datasets = datasetIds.map(id => metadataStore.getDataset(id)).filter(d => !!d);
1584
- let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
1585
- comparison += "| :--- | " + datasets.map(() => " :---: ").join("|") + " |\n";
1586
- comparison += "| Quality Score | " + datasets.map(d => `${d.quality_score}/100`).join(" | ") + " |\n";
1587
- comparison += "| License | " + datasets.map(d => d.license.category).join(" | ") + " |\n";
1588
- comparison += "| Downloads | " + datasets.map(d => d.downloads).join(" | ") + " |\n";
1589
- comparison += "| Domain | " + datasets.map(d => d.domain).join(" | ") + " |\n";
1590
- return {
1591
- content: [{ type: "text", text: comparison }]
1592
- };
1593
- }
1594
- case "check_job_status": {
1595
- const jobId = String(request.params.arguments?.job_id);
1596
- const job = metadataStore.getJob(jobId);
1597
- if (!job) {
1598
- throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
1599
- }
1600
- const activeStatuses = new Set(["pending", "queued", "running", "retrying"]);
1601
- const now = Date.now();
1602
- const last = jobStatusLastPoll[jobId] || 0;
1603
- const minPollMs = 3000;
1604
- if (activeStatuses.has(job.status) && (now - last) < minPollMs) {
1605
- const waitMs = minPollMs - (now - last);
1606
2579
  return {
1607
- content: [{ type: "text", text: `No significant status change yet. Please poll again in ~${Math.ceil(waitMs / 1000)}s.` }]
2580
+ content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
1608
2581
  };
1609
2582
  }
1610
- jobStatusLastPoll[jobId] = now;
1611
- return {
1612
- content: [{ type: "text", text: formatJobStatus(job) }]
1613
- };
1614
- }
1615
- case "export_dataset": {
1616
- const datasetId = String(request.params.arguments?.dataset_id);
1617
- const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments?.target_dir) : undefined;
1618
- const requestedFormat = String(request.params.arguments?.format || "feather");
1619
- const fastMode = request.params.arguments?.fast === true;
1620
- const preview = request.params.arguments?.preview === true;
1621
- const sampleRows = request.params.arguments?.sample_rows ? Number(request.params.arguments.sample_rows) : undefined;
1622
- const columns = request.params.arguments?.columns;
1623
- const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
1624
- const dataset = metadataStore.getDataset(datasetId);
1625
- if (!dataset) {
1626
- throw new McpError(ErrorCode.InvalidParams, `Dataset not found: ${datasetId}`);
1627
- }
1628
- // Use Metadata or Registry to find the actual local file
1629
- let sourcePath = undefined;
1630
- const downloadStatus = metadataStore.getDownloadStatus(datasetId);
1631
- if (downloadStatus && fs.existsSync(downloadStatus.local_path)) {
1632
- sourcePath = downloadStatus.local_path;
1633
- }
1634
- else {
1635
- // Fallback to local registry
1636
- const reg = getRegistryEntry(datasetId);
1637
- if (reg && fs.existsSync(reg.local_path)) {
1638
- sourcePath = reg.local_path;
2583
+ case "vesper_convert_format": {
2584
+ const filePath = String(request.params.arguments?.file_path || "").trim();
2585
+ const targetFormat = String(request.params.arguments?.target_format || "").trim().toLowerCase();
2586
+ if (!filePath) {
2587
+ throw new McpError(ErrorCode.InvalidParams, "file_path is required");
1639
2588
  }
1640
- }
1641
- if (!sourcePath) {
1642
- console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
1643
- // Start a prepare job for this dataset id (acts like calling prepare_dataset)
2589
+ if (!["csv", "parquet", "json", "jsonl"].includes(targetFormat)) {
2590
+ throw new McpError(ErrorCode.InvalidParams, "target_format must be one of: csv, parquet, json, jsonl");
2591
+ }
2592
+ if (!fs.existsSync(filePath)) {
2593
+ return {
2594
+ content: [{ type: "text", text: `ERROR: File not found: ${filePath}` }],
2595
+ isError: true,
2596
+ };
2597
+ }
2598
+ const inputExt = path.extname(filePath).toLowerCase();
2599
+ const extMap = { csv: ".csv", parquet: ".parquet", json: ".json", jsonl: ".jsonl" };
2600
+ const outputExt = extMap[targetFormat];
2601
+ if (inputExt === outputExt) {
2602
+ return {
2603
+ content: [{ type: "text", text: `File is already in ${targetFormat} format: ${filePath}` }],
2604
+ };
2605
+ }
2606
+ const outputPath = filePath.replace(/\.[^.]+$/, outputExt);
1644
2607
  try {
1645
- jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false });
2608
+ await ensurePythonModules([{ module: "polars", packageName: "polars" }]);
2609
+ const convertScript = path.join(dataRoot, "python", "convert_engine.py");
2610
+ const result = await runPythonJson(convertScript, [filePath, outputPath]);
2611
+ if (!result.ok) {
2612
+ return {
2613
+ content: [{ type: "text", text: `ERROR: Conversion failed: ${result.error}` }],
2614
+ isError: true,
2615
+ };
2616
+ }
2617
+ // Register converted file in the registry
2618
+ const datasetId = path.basename(outputPath, outputExt);
2619
+ try {
2620
+ upsertRegistry(datasetId, outputPath, "completed");
2621
+ }
2622
+ catch (e) {
2623
+ console.error(`[Convert] Registry write failed: ${e?.message || e}`);
2624
+ }
2625
+ let msg = `**Conversion complete**\n`;
2626
+ msg += `- **Input**: ${filePath} (${inputExt.slice(1)})\n`;
2627
+ msg += `- **Output**: ${result.output_path} (${targetFormat})\n`;
2628
+ msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
2629
+ if (result.size_mb !== undefined)
2630
+ msg += `- **Size**: ${result.size_mb} MB\n`;
2631
+ return { content: [{ type: "text", text: msg }] };
1646
2632
  }
1647
- catch (e) {
1648
- console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
1649
- }
1650
- // Poll for download status or registry entry until local_path appears or timeout
1651
- const wait = (ms) => new Promise(res => setTimeout(res, ms));
1652
- const maxWait = 120_000; // 120s
1653
- const interval = 2000;
1654
- let waited = 0;
1655
- while (waited < maxWait) {
1656
- const ds = metadataStore.getDownloadStatus(datasetId);
1657
- if (ds && ds.local_path && fs.existsSync(ds.local_path)) {
1658
- sourcePath = ds.local_path;
1659
- console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
1660
- break;
2633
+ catch (error) {
2634
+ return {
2635
+ content: [{ type: "text", text: `ERROR: Conversion failed: ${error.message}` }],
2636
+ isError: true,
2637
+ };
2638
+ }
2639
+ }
2640
+ case "fuse_datasets": {
2641
+ const rawSources = request.params.arguments?.sources;
2642
+ if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
2643
+ throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
2644
+ }
2645
+ const strategy = request.params.arguments?.strategy || "concat";
2646
+ const joinOn = request.params.arguments?.join_on;
2647
+ const how = request.params.arguments?.how || "inner";
2648
+ const dedup = request.params.arguments?.dedup !== false;
2649
+ const runQualityAfter = request.params.arguments?.run_quality_after !== false;
2650
+ const leakageCheck = request.params.arguments?.leakage_check !== false;
2651
+ const outputFormat = request.params.arguments?.output_format || "feather";
2652
+ const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
2653
+ const preview = request.params.arguments?.preview !== false;
2654
+ const resolvedPaths = [];
2655
+ const unresolved = [];
2656
+ for (const src of rawSources) {
2657
+ if (fs.existsSync(src)) {
2658
+ resolvedPaths.push(src);
2659
+ continue;
1661
2660
  }
1662
- const reg = getRegistryEntry(datasetId);
1663
- const regPath = reg?.local_path || reg?.path;
1664
- if (regPath && fs.existsSync(regPath)) {
1665
- sourcePath = regPath;
1666
- console.error(`[Export] Local data found in registry for ${datasetId}: ${sourcePath}`);
1667
- break;
2661
+ const status = metadataStore.getDownloadStatus(src);
2662
+ if (status?.local_path && fs.existsSync(status.local_path)) {
2663
+ resolvedPaths.push(status.local_path);
2664
+ continue;
1668
2665
  }
1669
- await wait(interval);
1670
- waited += interval;
2666
+ unresolved.push(src);
1671
2667
  }
1672
- // If still no sourcePath, return helpful error listing prepared datasets
1673
- if (!sourcePath) {
1674
- const entries = readRegistry();
1675
- const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id || e.id || "unknown"}: ${e.local_path || e.path || "unknown"}`).slice(0, 10).join("\n") + (entries.length > 10 ? "\n...and " + (entries.length - 10) + " more" : "");
2668
+ if (unresolved.length > 0) {
1676
2669
  return {
1677
- content: [{ type: "text", text: `ERROR: No local data found for ${datasetId} after attempting prepare. Check credentials and try running prepare_dataset manually. Prepared datasets:\n${listText}` }],
2670
+ content: [{
2671
+ type: "text",
2672
+ text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
2673
+ }],
1678
2674
  isError: true
1679
2675
  };
1680
2676
  }
1681
- }
1682
- // If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
1683
- if (!fastMode) {
1684
- const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
1685
- const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
1686
- const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
1687
- if (!pipelineCompatibleInput) {
1688
- console.error(`[Export] Skipping pipeline for ${currentExt} input; using direct exporter conversion.`);
1689
- }
1690
- else if (currentExt !== pipelineFmt) {
1691
- console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
2677
+ try {
2678
+ const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
2679
+ const ext = extMap[outputFormat] || ".feather";
2680
+ const outDir = process.cwd();
2681
+ if (!fs.existsSync(outDir))
2682
+ fs.mkdirSync(outDir, { recursive: true });
2683
+ const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
2684
+ console.error(`[Fusion] Resolved output directory: ${outDir}`);
2685
+ const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
2686
+ strategy,
2687
+ join_on: joinOn,
2688
+ how,
2689
+ dedup,
2690
+ run_quality_after: runQualityAfter,
2691
+ leakage_check: leakageCheck,
2692
+ output_format: outputFormat,
2693
+ compression: compression,
2694
+ preview,
2695
+ });
2696
+ const nullDelta = result.stats.null_delta;
2697
+ const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
2698
+ // Register fused dataset under a generated id so users can export it easily
2699
+ const fusedId = `fused_${Date.now()}`;
1692
2700
  try {
1693
- const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
1694
- if (pipelineResult.final_output_path) {
1695
- sourcePath = pipelineResult.final_output_path;
1696
- try {
1697
- // Update registry to point to pipeline's final output
1698
- upsertRegistry(datasetId, sourcePath, "completed");
1699
- }
1700
- catch (e) {
1701
- console.error(`[Registry] Failed to update registry for ${datasetId}: ${e?.message || e}`);
1702
- }
1703
- }
2701
+ upsertRegistry(fusedId, result.output_path, "completed");
1704
2702
  }
1705
- catch (err) {
1706
- console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
2703
+ catch (e) {
2704
+ console.error(`[Registry] Failed to register fused dataset ${fusedId}: ${e?.message || e}`);
1707
2705
  }
2706
+ let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
2707
+ msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
2708
+ msg += `- Null change: ${nullText}\n`;
2709
+ msg += `- Output: ${result.output_path}\n`;
2710
+ if (result.preview_path)
2711
+ msg += `- Preview: ${result.preview_path}\n`;
2712
+ if (result.leakage_report) {
2713
+ msg += `- Leakage: ${result.leakage_report.leakage_detected ? "detected" : "none"}`;
2714
+ if (result.leakage_report.leakage_count) {
2715
+ msg += ` (${result.leakage_report.leakage_count})`;
2716
+ }
2717
+ msg += "\n";
2718
+ }
2719
+ msg += `\nNext: run split_dataset/export_dataset on fused output. Registered fused dataset id: ${fusedId}`;
2720
+ return { content: [{ type: "text", text: msg }] };
1708
2721
  }
1709
- }
1710
- else {
1711
- console.error(`[Export] Fast mode - skipping quality analysis and cleaning`);
1712
- }
1713
- // Build export options
1714
- const exportOpts = {};
1715
- if (compression)
1716
- exportOpts.compression = compression;
1717
- if (preview)
1718
- exportOpts.preview = true;
1719
- if (sampleRows)
1720
- exportOpts.sample_rows = sampleRows;
1721
- if (columns)
1722
- exportOpts.columns = columns;
1723
- try {
1724
- // Determine output file name
1725
- const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
1726
- const ext = extMap[requestedFormat] || ".feather";
1727
- const safeName = datasetId.replace(/\//g, "_");
1728
- const outDir = targetDir || path.join(dataRoot, "exports");
1729
- if (!fs.existsSync(outDir))
1730
- fs.mkdirSync(outDir, { recursive: true });
1731
- const outputFile = path.join(outDir, `${safeName}${ext}`);
1732
- const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
1733
- // Build rich response
1734
- let msg = `**Export complete**\n`;
1735
- msg += `- **File**: ${result.output_path}\n`;
1736
- msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
1737
- msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
1738
- if (result.file_size_mb !== undefined)
1739
- msg += `- **Size**: ${result.file_size_mb} MB\n`;
1740
- if (result.elapsed_seconds !== undefined)
1741
- msg += `- **Time**: ${result.elapsed_seconds}s\n`;
1742
- if (result.preview_path)
1743
- msg += `- **Preview**: ${result.preview_path}\n`;
1744
- msg += `\n`;
1745
- if (requestedFormat === "feather") {
1746
- msg += `**Inspect with:**\n`;
1747
- msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
1748
- msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
1749
- }
1750
- else if (requestedFormat === "parquet") {
1751
- msg += `**Inspect with:**\n`;
1752
- msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
1753
- msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
1754
- }
1755
- return { content: [{ type: "text", text: msg }] };
1756
- }
1757
- catch (error) {
1758
- return {
1759
- content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
1760
- isError: true
1761
- };
1762
- }
1763
- }
1764
- case "fuse_datasets": {
1765
- const rawSources = request.params.arguments?.sources;
1766
- if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
1767
- throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
1768
- }
1769
- const strategy = request.params.arguments?.strategy || "concat";
1770
- const joinOn = request.params.arguments?.join_on;
1771
- const how = request.params.arguments?.how || "inner";
1772
- const dedup = request.params.arguments?.dedup !== false;
1773
- const runQualityAfter = request.params.arguments?.run_quality_after !== false;
1774
- const leakageCheck = request.params.arguments?.leakage_check !== false;
1775
- const outputFormat = request.params.arguments?.output_format || "feather";
1776
- const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
1777
- const preview = request.params.arguments?.preview !== false;
1778
- const resolvedPaths = [];
1779
- const unresolved = [];
1780
- for (const src of rawSources) {
1781
- if (fs.existsSync(src)) {
1782
- resolvedPaths.push(src);
1783
- continue;
1784
- }
1785
- const status = metadataStore.getDownloadStatus(src);
1786
- if (status?.local_path && fs.existsSync(status.local_path)) {
1787
- resolvedPaths.push(status.local_path);
1788
- continue;
2722
+ catch (error) {
2723
+ return {
2724
+ content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
2725
+ isError: true
2726
+ };
1789
2727
  }
1790
- unresolved.push(src);
1791
- }
1792
- if (unresolved.length > 0) {
1793
- return {
1794
- content: [{
1795
- type: "text",
1796
- text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
1797
- }],
1798
- isError: true
1799
- };
1800
2728
  }
1801
- try {
1802
- const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
1803
- const ext = extMap[outputFormat] || ".feather";
1804
- const outDir = path.join(dataRoot, "fusion");
1805
- if (!fs.existsSync(outDir))
1806
- fs.mkdirSync(outDir, { recursive: true });
1807
- const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
1808
- const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
1809
- strategy,
1810
- join_on: joinOn,
1811
- how,
1812
- dedup,
1813
- run_quality_after: runQualityAfter,
1814
- leakage_check: leakageCheck,
1815
- output_format: outputFormat,
1816
- compression: compression,
1817
- preview,
1818
- });
1819
- const nullDelta = result.stats.null_delta;
1820
- const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
1821
- // Register fused dataset under a generated id so users can export it easily
1822
- const fusedId = `fused_${Date.now()}`;
1823
- try {
1824
- upsertRegistry(fusedId, result.output_path, "completed");
2729
+ case "analyze_image_quality": {
2730
+ const inputPath = String(request.params.arguments?.path);
2731
+ if (!fs.existsSync(inputPath)) {
2732
+ throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
1825
2733
  }
1826
- catch (e) {
1827
- console.error(`[Registry] Failed to register fused dataset ${fusedId}: ${e?.message || e}`);
1828
- }
1829
- let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
1830
- msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
1831
- msg += `- Null change: ${nullText}\n`;
1832
- msg += `- Output: ${result.output_path}\n`;
1833
- if (result.preview_path)
1834
- msg += `- Preview: ${result.preview_path}\n`;
1835
- if (result.leakage_report) {
1836
- msg += `- Leakage: ${result.leakage_report.leakage_detected ? "detected" : "none"}`;
1837
- if (result.leakage_report.leakage_count) {
1838
- msg += ` (${result.leakage_report.leakage_count})`;
2734
+ try {
2735
+ const report = await imageAnalyzer.analyze(inputPath);
2736
+ let output = `## Image Quality Report\n\n`;
2737
+ output += `- **Total Images**: ${report.total_images}\n`;
2738
+ output += `- **Corrupted**: ${report.corrupted_count}\n`;
2739
+ output += `- **Average Resolution**: ${Math.round(report.average_width)}x${Math.round(report.average_height)}\n`;
2740
+ output += `- **Blurry Images**: ${report.blurry_count}\n\n`;
2741
+ if (report.individual_results.length > 0) {
2742
+ output += `### Sample Detail (Top 5)\n`;
2743
+ report.individual_results.slice(0, 5).forEach(img => {
2744
+ const statusLabel = img.status === "ok" ? (img.is_blurry ? "[warn]" : "[ok]") : "[error]";
2745
+ output += `${statusLabel} **${img.filename}**: ${img.width}x${img.height} | Mode: ${img.mode} | Blur: ${img.blur_score?.toFixed(1)}\n`;
2746
+ });
1839
2747
  }
1840
- msg += "\n";
2748
+ return {
2749
+ content: [{ type: "text", text: output }]
2750
+ };
2751
+ }
2752
+ catch (error) {
2753
+ return {
2754
+ content: [{ type: "text", text: `ERROR: Image analysis failed: ${error.message}` }],
2755
+ isError: true
2756
+ };
1841
2757
  }
1842
- msg += `\nNext: run split_dataset/export_dataset on fused output. Registered fused dataset id: ${fusedId}`;
1843
- return { content: [{ type: "text", text: msg }] };
1844
- }
1845
- catch (error) {
1846
- return {
1847
- content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
1848
- isError: true
1849
- };
1850
- }
1851
- }
1852
- case "analyze_image_quality": {
1853
- const inputPath = String(request.params.arguments?.path);
1854
- if (!fs.existsSync(inputPath)) {
1855
- throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
1856
2758
  }
1857
- try {
1858
- const report = await imageAnalyzer.analyze(inputPath);
1859
- let output = `## Image Quality Report\n\n`;
1860
- output += `- **Total Images**: ${report.total_images}\n`;
1861
- output += `- **Corrupted**: ${report.corrupted_count}\n`;
1862
- output += `- **Average Resolution**: ${Math.round(report.average_width)}x${Math.round(report.average_height)}\n`;
1863
- output += `- **Blurry Images**: ${report.blurry_count}\n\n`;
1864
- if (report.individual_results.length > 0) {
1865
- output += `### Sample Detail (Top 5)\n`;
1866
- report.individual_results.slice(0, 5).forEach(img => {
1867
- const statusLabel = img.status === "ok" ? (img.is_blurry ? "[warn]" : "[ok]") : "[error]";
1868
- output += `${statusLabel} **${img.filename}**: ${img.width}x${img.height} | Mode: ${img.mode} | Blur: ${img.blur_score?.toFixed(1)}\n`;
2759
+ case "analyze_media_quality": {
2760
+ const inputPath = String(request.params.arguments?.path);
2761
+ if (!fs.existsSync(inputPath)) {
2762
+ throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
2763
+ }
2764
+ try {
2765
+ const report = await mediaAnalyzer.analyze(inputPath);
2766
+ let output = `## Media Quality Report\n\n`;
2767
+ output += `- **Total Files**: ${report.total_files}\n`;
2768
+ output += `- **OK Files**: ${report.ok_files}\n`;
2769
+ output += `- **Failed Files**: ${report.failed_files}\n`;
2770
+ if ('avg_audio_duration' in report && report.avg_audio_duration) {
2771
+ output += `- **Average Audio Duration**: ${report.avg_audio_duration.toFixed(2)}s\n`;
2772
+ }
2773
+ if ('avg_video_duration' in report && report.avg_video_duration) {
2774
+ output += `- **Average Video Duration**: ${report.avg_video_duration.toFixed(2)}s\n`;
2775
+ output += `- **Average FPS**: ${report.avg_fps?.toFixed(1)}\n`;
2776
+ }
2777
+ output += `\n### Sample Detail (Top 5)\n`;
2778
+ report.details.slice(0, 5).forEach(item => {
2779
+ const statusLabel = item.status === "ok" ? "[ok]" : "[error]";
2780
+ if (item.type === "audio" && 'sample_rate' in item) {
2781
+ output += `${statusLabel} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
2782
+ }
2783
+ else if (item.type === "video" && 'width' in item) {
2784
+ output += `${statusLabel} **${item.filename}**: ${item.width}x${item.height} | ${item.fps?.toFixed(1)}fps | Risk: ${item.corruption_risk}\n`;
2785
+ }
2786
+ else {
2787
+ output += `${statusLabel} **${item.filename}**: ${item.error}\n`;
2788
+ }
1869
2789
  });
2790
+ return {
2791
+ content: [{ type: "text", text: output }]
2792
+ };
2793
+ }
2794
+ catch (error) {
2795
+ return {
2796
+ content: [{ type: "text", text: `ERROR: Media analysis failed: ${error.message}` }],
2797
+ isError: true
2798
+ };
1870
2799
  }
1871
- return {
1872
- content: [{ type: "text", text: output }]
1873
- };
1874
- }
1875
- catch (error) {
1876
- return {
1877
- content: [{ type: "text", text: `ERROR: Image analysis failed: ${error.message}` }],
1878
- isError: true
1879
- };
1880
- }
1881
- }
1882
- case "analyze_media_quality": {
1883
- const inputPath = String(request.params.arguments?.path);
1884
- if (!fs.existsSync(inputPath)) {
1885
- throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
1886
2800
  }
1887
- try {
1888
- const report = await mediaAnalyzer.analyze(inputPath);
1889
- let output = `## Media Quality Report\n\n`;
1890
- output += `- **Total Files**: ${report.total_files}\n`;
1891
- output += `- **OK Files**: ${report.ok_files}\n`;
1892
- output += `- **Failed Files**: ${report.failed_files}\n`;
1893
- if ('avg_audio_duration' in report && report.avg_audio_duration) {
1894
- output += `- **Average Audio Duration**: ${report.avg_audio_duration.toFixed(2)}s\n`;
1895
- }
1896
- if ('avg_video_duration' in report && report.avg_video_duration) {
1897
- output += `- **Average Video Duration**: ${report.avg_video_duration.toFixed(2)}s\n`;
1898
- output += `- **Average FPS**: ${report.avg_fps?.toFixed(1)}\n`;
1899
- }
1900
- output += `\n### Sample Detail (Top 5)\n`;
1901
- report.details.slice(0, 5).forEach(item => {
1902
- const statusLabel = item.status === "ok" ? "[ok]" : "[error]";
1903
- if (item.type === "audio" && 'sample_rate' in item) {
1904
- output += `${statusLabel} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
2801
+ case "generate_quality_report": {
2802
+ const datasetId = String(request.params.arguments?.dataset_id);
2803
+ const datasetPath = String(request.params.arguments?.dataset_path);
2804
+ if (!fs.existsSync(datasetPath)) {
2805
+ throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
2806
+ }
2807
+ try {
2808
+ // Optionally load text quality from metadata if available
2809
+ const metadata = await metadataStore.getDataset(datasetId);
2810
+ // TODO: Integrate text quality analysis when available
2811
+ const textQuality = null;
2812
+ const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
2813
+ // Save report to metadata
2814
+ if (metadata) {
2815
+ metadata.unified_quality_report = report;
2816
+ await metadataStore.saveDataset(metadata);
1905
2817
  }
1906
- else if (item.type === "video" && 'width' in item) {
1907
- output += `${statusLabel} **${item.filename}**: ${item.width}x${item.height} | ${item.fps?.toFixed(1)}fps | Risk: ${item.corruption_risk}\n`;
2818
+ let output = `# Unified Quality Report\n\n`;
2819
+ output += `**Dataset**: ${datasetId}\n`;
2820
+ output += `**Modalities**: ${report.modalities.join(", ")}\n`;
2821
+ output += `**Overall Quality Score**: ${report.overall_quality_score}/100\n\n`;
2822
+ if (report.text_quality) {
2823
+ output += `## Text Quality\n`;
2824
+ output += `- Rows: ${report.text_quality.row_count}\n`;
2825
+ output += `- Columns: ${report.text_quality.column_count}\n`;
2826
+ output += `- Missing: ${report.text_quality.missing_percentage.toFixed(1)}%\n`;
2827
+ output += `- Duplicates: ${report.text_quality.duplicate_percentage.toFixed(1)}%\n\n`;
1908
2828
  }
1909
- else {
1910
- output += `${statusLabel} **${item.filename}**: ${item.error}\n`;
2829
+ if (report.image_quality) {
2830
+ output += `## Image Quality\n`;
2831
+ output += `- Total Images: ${report.image_quality.total_images}\n`;
2832
+ output += `- Corrupted: ${report.image_quality.corrupted_count}\n`;
2833
+ output += `- Avg Resolution: ${report.image_quality.avg_resolution}\n`;
2834
+ output += `- Blurry: ${report.image_quality.blurry_percentage.toFixed(1)}%\n\n`;
1911
2835
  }
1912
- });
1913
- return {
1914
- content: [{ type: "text", text: output }]
1915
- };
1916
- }
1917
- catch (error) {
1918
- return {
1919
- content: [{ type: "text", text: `ERROR: Media analysis failed: ${error.message}` }],
1920
- isError: true
1921
- };
1922
- }
1923
- }
1924
- case "generate_quality_report": {
1925
- const datasetId = String(request.params.arguments?.dataset_id);
1926
- const datasetPath = String(request.params.arguments?.dataset_path);
1927
- if (!fs.existsSync(datasetPath)) {
1928
- throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
1929
- }
1930
- try {
1931
- // Optionally load text quality from metadata if available
1932
- const metadata = await metadataStore.getDataset(datasetId);
1933
- // TODO: Integrate text quality analysis when available
1934
- const textQuality = null;
1935
- const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
1936
- // Save report to metadata
1937
- if (metadata) {
1938
- metadata.unified_quality_report = report;
1939
- await metadataStore.saveDataset(metadata);
1940
- }
1941
- let output = `# Unified Quality Report\n\n`;
1942
- output += `**Dataset**: ${datasetId}\n`;
1943
- output += `**Modalities**: ${report.modalities.join(", ")}\n`;
1944
- output += `**Overall Quality Score**: ${report.overall_quality_score}/100\n\n`;
1945
- if (report.text_quality) {
1946
- output += `## Text Quality\n`;
1947
- output += `- Rows: ${report.text_quality.row_count}\n`;
1948
- output += `- Columns: ${report.text_quality.column_count}\n`;
1949
- output += `- Missing: ${report.text_quality.missing_percentage.toFixed(1)}%\n`;
1950
- output += `- Duplicates: ${report.text_quality.duplicate_percentage.toFixed(1)}%\n\n`;
1951
- }
1952
- if (report.image_quality) {
1953
- output += `## Image Quality\n`;
1954
- output += `- Total Images: ${report.image_quality.total_images}\n`;
1955
- output += `- Corrupted: ${report.image_quality.corrupted_count}\n`;
1956
- output += `- Avg Resolution: ${report.image_quality.avg_resolution}\n`;
1957
- output += `- Blurry: ${report.image_quality.blurry_percentage.toFixed(1)}%\n\n`;
1958
- }
1959
- if (report.audio_quality) {
1960
- output += `## Audio Quality\n`;
1961
- output += `- Total Files: ${report.audio_quality.total_files}\n`;
1962
- output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
1963
- output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
1964
- output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
1965
- }
1966
- if (report.video_quality) {
1967
- output += `## Video Quality\n`;
1968
- output += `- Total Files: ${report.video_quality.total_files}\n`;
1969
- output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
1970
- output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
1971
- output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
1972
- }
1973
- output += `## Recommendations\n`;
1974
- report.recommendations.forEach(rec => {
1975
- output += `- ${rec}\n`;
1976
- });
1977
- return {
1978
- content: [{ type: "text", text: output }]
1979
- };
1980
- }
1981
- catch (error) {
1982
- return {
1983
- content: [{ type: "text", text: `ERROR: Quality report generation failed: ${error.message}` }],
1984
- isError: true
1985
- };
2836
+ if (report.audio_quality) {
2837
+ output += `## Audio Quality\n`;
2838
+ output += `- Total Files: ${report.audio_quality.total_files}\n`;
2839
+ output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
2840
+ output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
2841
+ output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
2842
+ }
2843
+ if (report.video_quality) {
2844
+ output += `## Video Quality\n`;
2845
+ output += `- Total Files: ${report.video_quality.total_files}\n`;
2846
+ output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
2847
+ output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
2848
+ output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
2849
+ }
2850
+ output += `## Recommendations\n`;
2851
+ report.recommendations.forEach(rec => {
2852
+ output += `- ${rec}\n`;
2853
+ });
2854
+ return {
2855
+ content: [{ type: "text", text: output }]
2856
+ };
2857
+ }
2858
+ catch (error) {
2859
+ return {
2860
+ content: [{ type: "text", text: `ERROR: Quality report generation failed: ${error.message}` }],
2861
+ isError: true
2862
+ };
2863
+ }
1986
2864
  }
2865
+ default:
2866
+ throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
1987
2867
  }
1988
- default:
1989
- throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
1990
- }
2868
+ }); // end requestQueue.enqueue
1991
2869
  });
1992
2870
  async function main() {
1993
2871
  const args = process.argv.slice(2);
@@ -1995,6 +2873,7 @@ async function main() {
1995
2873
  const isFuse = args.includes("fuse");
1996
2874
  const isDiscover = args.includes("discover");
1997
2875
  const isDownload = args.includes("download");
2876
+ const isExport = args.includes("export");
1998
2877
  const isConfig = args.includes("config") || args.includes("configure");
1999
2878
  const isSetup = args.includes("--setup") || args.includes("setup");
2000
2879
  const isSilent = args.includes("--silent");
@@ -2017,6 +2896,10 @@ async function main() {
2017
2896
  await runDownloadCli(args);
2018
2897
  return;
2019
2898
  }
2899
+ if (isExport) {
2900
+ await runExportCli(args);
2901
+ return;
2902
+ }
2020
2903
  // If run in explicit setup mode, show setup wizard (do not auto-run on server startup)
2021
2904
  if (isSetup) {
2022
2905
  await runSetupWizard(isSilent);
@@ -2289,6 +3172,99 @@ async function runDownloadCli(args) {
2289
3172
  }
2290
3173
  console.log(`Download complete: ${localPath}`);
2291
3174
  }
3175
/**
 * CLI handler for `vespermcp export <dataset-id|local-path> [options]`.
 *
 * Resolves the dataset reference to a local file, optionally runs the
 * conversion pipeline (skipped with --fast), then delegates to
 * `dataExporter.export` and prints a summary to stdout. Status/diagnostic
 * messages go to stderr so stdout stays machine-readable.
 *
 * Exits the process with code 1 when no dataset ref is given or when no
 * local data can be found for it.
 *
 * Supported flags: --format, --target-dir, --compression, --sample-rows,
 * --columns, --fast, --preview.
 *
 * @param {string[]} args - Raw CLI args (process.argv.slice(2)); includes the
 *   "export" command word followed by the dataset id or local path.
 */
async function runExportCli(args) {
    // Read the value following a `--flag`. Returns undefined when the flag is
    // absent, is the last token, or the following token is itself a flag
    // (fix: previously `--format --preview` consumed "--preview" as the
    // format value).
    const getArgValue = (name) => {
        const idx = args.findIndex(a => a === name);
        if (idx >= 0 && idx + 1 < args.length && !args[idx + 1].startsWith("--")) {
            return args[idx + 1];
        }
        return undefined;
    };
    // Positional args: drop flags and the values consumed by value-taking flags.
    const VALUE_FLAGS = ["--target-dir", "--format", "--compression", "--sample-rows", "--columns"];
    const nonFlags = args.filter((arg, index) => {
        if (arg.startsWith("--"))
            return false;
        const previous = index > 0 ? args[index - 1] : "";
        return !VALUE_FLAGS.includes(previous);
    });
    // nonFlags[0] is the "export" command word itself.
    const datasetId = nonFlags[1] || "";
    if (!datasetId) {
        console.error("Usage: vespermcp export <dataset-id|local-path> [--format parquet|feather|csv|jsonl|arrow] [--target-dir C:/path] [--compression snappy] [--fast] [--preview] [--sample-rows N] [--columns col1,col2]");
        process.exit(1);
    }
    const requestedFormat = getArgValue("--format") || "parquet";
    const targetDir = getArgValue("--target-dir");
    const compression = getArgValue("--compression");
    const sampleRows = getArgValue("--sample-rows");
    const columns = getArgValue("--columns");
    const fastMode = args.includes("--fast");
    const preview = args.includes("--preview");
    const isDirectLocalInput = isDirectLocalDatasetReference(datasetId);
    const resolvedTargetDir = path.resolve(targetDir || process.cwd());
    let sourcePath = resolveDatasetLocalPath(datasetId, [resolvedTargetDir, process.cwd()]);
    if (!sourcePath) {
        console.error(`Export failed: no local data found for ${datasetId}. Run download or prepare first, or pass a direct local path.`);
        process.exit(1);
    }
    sourcePath = ensureExportableLocalPath(sourcePath);
    // Best-effort registry bookkeeping; never block the export on it.
    try {
        if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
            upsertRegistry(datasetId, sourcePath, "completed");
        }
    }
    catch (e) {
        console.error(`[Registry] Failed to normalize registry path for ${datasetId}: ${e?.message || e}`);
    }
    if (!fastMode) {
        // The conversion pipeline only understands csv/parquet; run it when the
        // on-disk format differs from the (pipeline-compatible) requested one.
        const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
        const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
        const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
        if (pipelineCompatibleInput && currentExt !== pipelineFmt) {
            try {
                sourcePath = ensureLocalPipelineSource(sourcePath, datasetId, resolvedTargetDir);
                const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
                if (pipelineResult.final_output_path) {
                    sourcePath = pipelineResult.final_output_path;
                    if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
                        upsertRegistry(datasetId, sourcePath, "completed");
                    }
                }
            }
            catch (err) {
                // Pipeline failure is non-fatal: export the raw file instead.
                console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
            }
        }
    }
    const exportOpts = {};
    if (compression)
        exportOpts.compression = compression;
    if (preview)
        exportOpts.preview = true;
    if (sampleRows) {
        const parsedSampleRows = Number(sampleRows);
        if (Number.isInteger(parsedSampleRows) && parsedSampleRows > 0) {
            exportOpts.sample_rows = parsedSampleRows;
        }
        else {
            // fix: previously a non-numeric --sample-rows value produced NaN
            // and was passed straight through to the exporter.
            console.error(`[Export] Ignoring invalid --sample-rows value: ${sampleRows}`);
        }
    }
    if (columns)
        exportOpts.columns = columns.split(",").map(col => col.trim()).filter(Boolean);
    const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
    const ext = extMap[requestedFormat];
    if (!ext) {
        // The raw format string is still forwarded to the exporter below, but
        // surface the mismatch instead of silently naming the file .parquet.
        console.error(`[Export] Unknown format "${requestedFormat}"; defaulting output extension to .parquet`);
    }
    const safeName = getExportFileStem(datasetId);
    const outDir = resolvedTargetDir;
    if (!fs.existsSync(outDir))
        fs.mkdirSync(outDir, { recursive: true });
    const outputFile = path.join(outDir, `${safeName}${ext || ".parquet"}`);
    console.error(`[Export] Resolved output directory: ${outDir}`);
    console.error(`[Export] Output file: ${outputFile}`);
    const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
    console.log(`Export complete: ${result.output_path}`);
    console.log(`Format: ${result.format}${result.compression ? ` (${result.compression})` : ""}`);
    if (result.rows !== undefined)
        console.log(`Rows: ${result.rows.toLocaleString()}`);
    if (result.columns !== undefined)
        console.log(`Columns: ${result.columns}`);
    if (result.file_size_mb !== undefined)
        console.log(`Size: ${result.file_size_mb} MB`);
    if (result.preview_path)
        console.log(`Preview: ${result.preview_path}`);
}
2292
3268
  async function runFuseCli(args) {
2293
3269
  const getArgValue = (name) => {
2294
3270
  const idx = args.findIndex(a => a === name);